The Computer Language
24.11 Benchmarks Game

k-nucleotide Go #4 program

source code

/* The Computer Language Benchmarks Game 
 * https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
 *
 * contributed by Bert Gijsbers.
 * based on work by Dirk Moerenhout, Jason Alan Palmer and the Go Authors.
 * further optimized by Mark van Weert
 */

package main

import (
    "bufio"
    "bytes"
    "fmt"
    "os"
    "runtime"
    "sort"
    "strings"
)

var toNum = strings.NewReplacer(
    "A", string(0),
    "C", string(1),
    "G", string(3),
    "T", string(2),
)

var toChar = strings.NewReplacer(
    string(0), "A",
    string(1), "C",
    string(3), "G",
    string(2), "T",
)

var empty string

var jobs = []struct {
    length   int
    sequence string
    result   chan string
}{
    {1, empty, make(chan string, 1)},
    {2, empty, make(chan string, 1)},
    {0, "GGT", make(chan string, 1)},
    {0, "GGTA", make(chan string, 1)},
    {0, "GGTATT", make(chan string, 1)},
    {0, "GGTATTTTAATT", make(chan string, 1)},
    {0, "GGTATTTTAATTTATAGT", make(chan string, 1)},
}

func main() {
    dna := input()
    scheduler(dna)
    for i := range jobs {
        fmt.Println(<-jobs[i].result)
    }
}

func scheduler(dna []byte) {
    command := make(chan int, len(jobs))
    for i := runtime.NumCPU(); i > 0; i-- {
        go worker(dna, command)
    }

    for i := range jobs {
        // longest job first, shortest job last
        command <- len(jobs) - 1 - i
    }
    close(command)
}

func worker(dna []byte, command <-chan int) {
    for k := range command {
        if jobs[k].length > 0 {
            jobs[k].result <- frequencyReport(dna, jobs[k].length)
        } else {
            jobs[k].result <- sequenceReport(dna, jobs[k].sequence)
        }
    }
}

func input() (data []byte) {
    data = readThird()
    for i := 0; i < len(data); i++ {
        data[i] = data[i] >> 1 & 3
    }
    return
}

func readThird() (data []byte) {
    in, lineCount := findThree()
    data = make([]byte, 0, lineCount*61)
    for {
        line, err := in.ReadSlice('\n')
        if err != nil && (line == nil || len(line) == 0) || line[0] == '>' {
            break
        }
        last := len(line) - 1
        if line[last] == '\n' {
            line = line[:last]
        }
        data = append(data, line...)
    }
    return
}

func findThree() (in *bufio.Reader, lineCount int) {
    in = bufio.NewReaderSize(os.Stdin, 1<<20)
    for {
        line, err := in.ReadSlice('\n')
        if err != nil {
            panic("read error")
        }
        lineCount++
        if line[0] == '>' && strings.HasPrefix(string(line), ">THREE") {
            break
        }
    }
    return
}

type sequence struct {
    nucs  string
    count uint32
}

type sequenceSlice []sequence

func (ss sequenceSlice) Len() int {
    return len(ss)
}

func (ss sequenceSlice) Swap(i, j int) {
    ss[i], ss[j] = ss[j], ss[i]
}

func (ss sequenceSlice) Less(i, j int) bool {
    if ss[i].count == ss[j].count {
        return ss[i].nucs > ss[j].nucs
    }
    return ss[i].count > ss[j].count
}

func frequencyReport(dna []byte, length int) string {
    var sortedSeqs sequenceSlice
    counts := count32(dna, length)
    for num, pointer := range counts {
        sortedSeqs = append(
            sortedSeqs,
            sequence{toChar.Replace(string(decompress32(num, length))), pointer},
        )
    }
    sort.Sort(sortedSeqs)

    var buf bytes.Buffer
    for _, sequence := range sortedSeqs {
        buf.WriteString(fmt.Sprintf(
            "%v %.3f\n", sequence.nucs,
            100.0*float32(sequence.count)/float32(len(dna)-length+1)),
        )
    }
    return buf.String()
}

func sequenceReport(dna []byte, sequence string) string {
    var pointer uint32
    if len(sequence) <= 16 {
        counts := count32(dna, len(sequence))
        pointer = counts[compress32([]byte(toNum.Replace(sequence)))]
    } else {
        counts := count64(dna, len(sequence))
        pointer = counts[compress64([]byte(toNum.Replace(sequence)))]
    }
    var sequenceCount uint32
    sequenceCount = pointer
    return fmt.Sprintf("%v\t%v", sequenceCount, sequence)
}

func count32(dna []byte, length int) map[uint32]uint32 {
    counts := make(map[uint32]uint32)
    key := compress32(dna[:length-1])
    mask := uint32(1)<<uint(length<<1) - 1
    for index := uint(length) - 1; index < uint(len(dna)); index++ { // Use an uint as index to allow the compiler to remove a bounds check on the dna slice
        key = key<<2&mask | uint32(dna[index])
        counts[key]++ // No need to check if key is already present in map since Go returns the zero value of the map value if the key is not present
    }
    return counts
}

func count64(dna []byte, length int) map[uint64]uint32 {
    counts := make(map[uint64]uint32)
    key := compress64(dna[:length-1])
    mask := uint64(1)<<uint(length<<1) - 1
    for index := uint(length) - 1; index < uint(len(dna)); index++ { // Use an uint as index to allow the compiler to remove a bounds check on the dna slice
        key = key<<2&mask | uint64(dna[index])
        counts[key]++ // No need to check if key is already present in map since Go returns the zero value of the map value if the key is not present
    }
    return counts
}

func compress64(sequence []byte) uint64 {
    var num uint64
    for _, char := range sequence {
        num = num<<2 | uint64(char)
    }
    return num
}

func compress32(sequence []byte) uint32 {
    var num uint32
    for _, char := range sequence {
        num = num<<2 | uint32(char)
    }
    return num
}

func decompress32(num uint32, length int) (sequence []byte) {
    sequence = make([]byte, length)
    for i := 0; i < length; i++ {
        sequence[length-i-1] = byte(num & 3)
        num = num >> 2
    }
    return
}
    

notes, command-line, and program output

NOTES:
64-bit Ubuntu quad core
go version go1.23.1 linux/amd64
GOAMD64=v2


 Fri, 06 Sep 2024 00:10:20 GMT

MAKE:
/opt/src/go1.23.1/go/bin/go build -o knucleotide.go-4.go_run knucleotide.go-4.go

5.95s to complete and log all make actions

COMMAND LINE:
 ./knucleotide.go-4.go_run 0 < knucleotide-input25000000.txt

PROGRAM OUTPUT:
A 30.295
T 30.151
C 19.800
G 19.754

AA 9.177
TA 9.132
AT 9.131
TT 9.091
CA 6.002
AC 6.001
AG 5.987
GA 5.984
CT 5.971
TC 5.971
GT 5.957
TG 5.956
CC 3.917
GC 3.911
CG 3.909
GG 3.902

1471758	GGT
446535	GGTA
47336	GGTATT
893	GGTATTTTAATT
893	GGTATTTTAATTTATAGT