The Q6600
Benchmarks Game

k-nucleotide Go #7 program

source code

/* The Computer Language Benchmarks Game 
 * https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
 *
 * contributed by Mark van Weert
 * based on Go#6 C++#2
 */

package main

import (
    "bufio"
    "fmt"
    "io"
    "math"
    "os"
    "runtime"
    "sort"
    "strings"
    "sync"
)

var toNum = strings.NewReplacer("A", string(0), "C", string(1), "T", string(2), "G", string(3), )
var toChar = []byte{'A', 'C', 'T', 'G'}

func main() {
//    start := time.Now()
    dna := input()

    fmt.Println(WriteFrequencies(dna, 1))
    fmt.Println(WriteFrequencies(dna, 2))
    fmt.Println(WriteCount(dna, "GGT"))
    fmt.Println(WriteCount(dna, "GGTA"))
    fmt.Println(WriteCount(dna, "GGTATT"))
    fmt.Println(WriteCount(dna, "GGTATTTTAATT"))
    fmt.Println(WriteCount(dna, "GGTATTTTAATTTATAGT"))
//    fmt.Println(time.Since(start))
}

// input returns the transformed data
func input() []byte {
    data := readStdin()

    // A = 0, C = 1, T = 2, G = 3
    for idx := range data {
        data[idx] = data[idx] >> 1 & 3
    }
    return data
}

// readStdin returns the data marked with >THREE from stdin
func readStdin() []byte {
    // Create a reader for stdin
    reader := bufio.NewReaderSize(os.Stdin, 1<<20)
    var lineCount int
    for {
        // Keep reading until we encounter the start marker >THREE
        line, err := reader.ReadSlice('\n')
        if err != nil {
            panic(err)
        }

        if len(line) == 0 {
            continue
        }

        lineCount++

        if line[0] == '>' && strings.HasPrefix(string(line), ">THREE") {
            data := make([]byte, 0, lineCount*61)
            for {
                // Read from the three section
                line, err := reader.ReadSlice('\n')
                if err != nil && err != io.EOF {
                    panic(err)
                } else if err == io.EOF || len(line) == 0 {
                    // End of input reached
                    return data
                }
                // Append the read data to the data slice
                data = append(data, line[:len(line)-1]...)
            }
        }
    }
}

// WriteFrequencies returns the frequencies of the nucleotides with the given length
func WriteFrequencies(data []byte, size int) string {
    // Get counts
    counts := startCount32(data, size)

    // Store the map keys and values in a struct to sort them
    type kv struct {
        Key   uint32
        Value int
    }
    var sortedCounts []kv
    for k, v := range counts {
        sortedCounts = append(sortedCounts, kv{k, *v})
    }

    sort.Slice(sortedCounts, func(i, j int) bool {
        return sortedCounts[i].Value > sortedCounts[j].Value
    })

    sum := float32(len(data) - size + 1)
    var result string
    for _, count := range sortedCounts {
        result += fmt.Sprintf("%v %.3f\n", decompress32(count.Key, size), 100.0*float32(count.Value)/sum)
    }

    return result
}

// WriteCount returns the number of given nucleotides in the data
func WriteCount(data []byte, nucleotide string) string {
    size := len(nucleotide)

    if len(nucleotide) <= 16 {
        // Get counts nucleotide key can fit in an uint32
        counts := startCount32(data, size)

        key := compress32([]byte(toNum.Replace(nucleotide)))

        count := counts[key]
        if count == nil {
            count = new(int)
        }

        return fmt.Sprintf("%v\t%v", *count, nucleotide)

    } else {
        // Get counts for longer nucleotides
        counts := startCount64(data, size)
        key := compress64([]byte(toNum.Replace(nucleotide)))
        count := counts[key]
        if count == nil {
            count = new(int)
        }

        return fmt.Sprintf("%v\t%v", *count, nucleotide)
    }
}

func startCount32(data []byte, size int) map[uint32]*int {
    maps := make([]map[uint32]*int, runtime.NumCPU())

    wg := sync.WaitGroup{}

    // Create a map for each goroutine we will spawn
    for i := 0; i < len(maps); i++ {
        maps[i] = make(map[uint32]*int)
        wg.Add(1)
        go calc32(data, maps[i], i, size, &wg)
    }

    wg.Wait()

    // Add counts from all goroutines to the first map
    for i := 1; i < len(maps); i++ {
        for i2, val := range maps[i] {
            if val == nil {
                continue
            }
            if maps[0][i2] == nil {
                maps[0][i2] = new(int)
            }
            *maps[0][i2] += *val
        }
    }

    return maps[0]
}

func startCount64(data []byte, size int) map[uint64]*int {
    maps := make([]map[uint64]*int, runtime.NumCPU())

    wg := sync.WaitGroup{}

    // Create a map for each goroutine we will spawn
    for i := 0; i < len(maps); i++ {
        maps[i] = make(map[uint64]*int)
        wg.Add(1)
        go calc64(data, maps[i], i, size, &wg)
    }

    wg.Wait()

    // Add counts from all goroutines to the first map
    for i := 1; i < len(maps); i++ {
        for i2, val := range maps[i] {
            if val == nil {
                continue
            }
            if maps[0][i2] == nil {
                maps[0][i2] = new(int)
            }
            *maps[0][i2] += *val
        }
    }

    return maps[0]
}

func calc32(data []byte, result map[uint32]*int, begin int, size int, wg *sync.WaitGroup) {
    var key uint32
    goroutineCount := uint(runtime.NumCPU())

    // Init key
    for i := 0; i < size; i++ {
        key <<= 2
        key |= uint32(data[i+begin])
    }

    // Create a map to do the counts in. 
    // We don't use the map we are passed but instead return a copy of this

    p := new(int)
    *p++
    result[key] = p

    nsize := uint(size)
    if goroutineCount < nsize {
        nsize = goroutineCount
    }

    mask := ^uint32(math.MaxUint32 << uint(2*size))

    start := uint(begin) + goroutineCount
    end := uint(len(data) + 1 - size)
    for idx := start; idx < end; idx += goroutineCount {
        // Update the key with 1 byte at a time
        for i := uint(0); i < nsize; i++ {
            key <<= 2
            key |= uint32(data[idx+i])
        }

        // Mask out excess information
        key &= mask

        // Get pointer to the count for this key from the map
        // For a low number of different keys using a map with 
        // pointers is faster than just using a value map and do: m[key]++
        p, ok := result[key]
        if !ok {
            p = new(int)
            result[key] = p
        }
        *p++
    }

    // Signal done
    wg.Done()
}

func calc64(data []byte, result map[uint64]*int, begin int, size int, wg *sync.WaitGroup) {
    var key uint64
    goroutineCount := uint(runtime.NumCPU())

    // Init key
    for i := 0; i < size; i++ {
        key <<= 2
        key |= uint64(data[i+begin])
    }

    // Create a map to do the counts in. We don't use the map 
    // we are passed but instead return a copy of this

    p := new(int)
    *p++
    result[key] = p

    nsize := uint(size)
    if goroutineCount < nsize {
        nsize = goroutineCount
    }

    mask := ^uint64(math.MaxUint64 << uint(2*size))

    start := uint(begin) + goroutineCount
    end := uint(len(data) + 1 - size)
    for idx := start; idx < end; idx += goroutineCount {
        // Update the key with 1 byte at a time
        if uint(len(data)) < idx+nsize {
            continue
        }
        for i := uint(0); i < nsize; i++ {
            key <<= 2
            key |= uint64(data[idx+i])
        }

        // Mask out excess information
        key &= mask

        // Get pointer to the count for this key from the map
        // For a low number of different keys using a map with 
        // pointers is faster than just using a value map and do: m[key]++
        p, ok := result[key]
        if !ok {
            p = new(int)
            result[key] = p
        }
        *p++
    }

    // Signal done
    wg.Done()

}

func compress64(sequence []byte) uint64 {
    var num uint64
    for _, char := range sequence {
        num = num<<2 | uint64(char)
    }
    return num
}

func compress32(sequence []byte) uint32 {
    var num uint32
    for _, char := range sequence {
        num = num<<2 | uint32(char)
    }
    return num
}

func decompress32(num uint32, length int) string {
    var sequence = make([]byte, length)
    for i := 0; i < length; i++ {
        sequence[length-i-1] = toChar[byte(num&3)]
        num = num >> 2
    }
    return string(sequence)
}
    

notes, command-line, and program output

NOTES:
64-bit Ubuntu quad core
go version go1.14 linux/amd64


Wed, 06 May 2020 01:28:18 GMT

MAKE:
/opt/src/go1.14.linux-amd64/go/bin/go build -o knucleotide.go-7.go_run

5.81s to complete and log all make actions

COMMAND LINE:
./knucleotide.go-7.go_run 0 < knucleotide-input25000000.txt

PROGRAM OUTPUT:
A 30.295
T 30.151
C 19.800
G 19.754

AA 9.177
TA 9.132
AT 9.131
TT 9.091
CA 6.002
AC 6.001
AG 5.987
GA 5.984
CT 5.971
TC 5.971
GT 5.957
TG 5.956
CC 3.917
GC 3.911
CG 3.909
GG 3.902

1471758	GGT
446535	GGTA
47336	GGTATT
893	GGTATTTTAATT
893	GGTATTTTAATTTATAGT