The Computer Language
24.12 Benchmarks Game

k-nucleotide Julia #7 program

source code

# The Computer Language Benchmarks Game
# https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
#
# Contributed by Adam Beckmeyer

import Printf: @printf

const LINESIZE = 61
const SIZEHINT = 4 * 4096
const FRAMES = (1, 2, 3, 4, 6, 12, 18)
const COUNTSFOR = codeunits.(("ggt", "ggta", "ggtatt", "ggtattttaatt", "ggtattttaatttatagt"))

struct Seq{T<:Unsigned}
    x::T
end
Seq(T) = Seq(0 % T)

# 10 works marginally better for this hashtable than previously used 7
Base.hash(s::Seq, h::UInt)::UInt = s.x ⊻ s.x >> 10

make_mask(frame) = (UInt64(1) << 2frame) - 1

# Gives a = 0x00, c = 0x01, g = 0x03, t = 0x02
encode(c) = (c >> 1) & 0x03

# Return `c` such that `c == decode(encode(c))`.
function decode(b::UInt8)
    for c in ('a' % UInt8, 'c' % UInt8, 'g' % UInt8)
        b == encode(c) && return c
    end
    't' % UInt8
end

# Push encoding for `c` to last two bits of `s`.
push(s::Seq{T}, c, mask) where T = Seq((mask & ((s.x << 2) | encode(c))) % T)

function to_string(s::Seq, n)
    out = Vector{UInt8}(undef, n)
    for i in eachindex(out)
        @inbounds out[i] = decode((s.x & 0x03) % UInt8)
        s = Seq(s.x >> 2)
    end
    String(reverse!(out))
end

function count_frame!(d::Dict{Seq{T}}, seq, frame) where T
    mask = make_mask(frame)
    v = Seq(zero(T))
    # Fill the Seq up with `frame` pairs of bits
    for i=1:(frame-1)
        v = @inbounds push(v, seq[i], mask)
    end
    # Slide Seq character-by-character over the remainder of sequence
    for i=frame:length(seq)
        v = @inbounds push(v, seq[i], mask)
        # This does two hashtable lookups currently. Doing better
        # isn't possible until julia gets something like
        # https://github.com/JuliaLang/julia/pull/33758
        d[v] = get(d, v, 0) + 1
    end
end

struct SeqFreq
    seq::String
    freq::Int32
    total::Int32
end

# A sequence of characters that occurs `freq` times out of `total`.
SeqFreq(seq::Seq, freq, frame, total) = SeqFreq(to_string(seq, frame), freq, total)

# Defined so that arrays of SeqFreq can be sort!ed
Base.isless(a::SeqFreq, b::SeqFreq) =
    a.freq == b.freq ? isless(a.seq, b.seq) : isless(a.freq, b.freq)

function Base.show(io::IO, x::SeqFreq)
    write(io, uppercase(String(x.seq)), ' ')
    @printf(io, "%2.3f", 100x.freq / x.total)
end

function write_freqs(io, d, frame, total)
    seqfreqs = [SeqFreq(seq, d[seq], frame, total) for seq in keys(d)]
    sort!(seqfreqs; rev=true)
    for seqf in seqfreqs
        println(io, seqf)
    end
    write(io, '\n')
end

# Lifted directly from knucleotide-julia-6
function get_third_seq(io)
    count = 0
    buffer_size = SIZEHINT
    buffer = Vector{UInt8}(undef, buffer_size)
    empty!(buffer)
    linebuffer = Vector{UInt8}(undef, LINESIZE)
    while !eof(io)
        if count === 3
            resize!(linebuffer, LINESIZE)
            new_length = length(buffer) + LINESIZE
            if new_length > buffer_size
                buffer_size = nextpow(2, nextpow(2, new_length))
                sizehint!(buffer, buffer_size)
            end#if
            nb = readbytes!(io, linebuffer)
            resize!(linebuffer, nb - 1)
            append!(buffer, linebuffer)
        else
            pos = position(io)
            nb = readbytes!(io, linebuffer)
            @inbounds count += first(linebuffer) === '>' % UInt8
            if last(linebuffer) !== '\n' % UInt8
                @inbounds seek(io, pos + findnext(isnewline, linebuffer, 1))
            end#if
        end#if
    end#while
    buffer
end#function

isnewline(c::UInt8)::Bool = c === '\n' % UInt8

function main(io, out)
    seq = get_third_seq(io)

    freqs = ((Dict{Seq{UInt32},Int32}() for _=1:6)..., Dict{Seq{UInt64},Int32}())
    Threads.@threads for i=1:7
        @inbounds count_frame!(freqs[i], seq, FRAMES[i])
    end

    @inbounds write_freqs(out, freqs[1], FRAMES[1], length(seq) - 1)
    @inbounds write_freqs(out, freqs[2], FRAMES[2], length(seq) - 2)

    mask = make_mask(18)
    for (i, v) in enumerate(COUNTSFOR)
        d = @inbounds freqs[i+2]
        frame = @inbounds FRAMES[i+2]
        # This is ugly but less ugly than having an entirely separate
        # loop for a single UInt64 Dict
        k = frame == 18 ? Seq(UInt64) : Seq(UInt32)
        for c in v
            k = push(k, c, mask)
        end
        write(out, string(get(d, k, 0)), '\t', uppercase(String(v)), '\n')
    end

    freqs
end

isinteractive() || main(stdin, stdout)
    

notes, command-line, and program output

NOTES:
64-bit Ubuntu quad core
julia version 1.11.1


 Tue, 29 Oct 2024 21:55:55 GMT

MAKE:
printenv JULIA_NUM_THREADS
4

0.12s to complete and log all make actions

COMMAND LINE:
 /opt/src/julia-1.11.1/bin/julia -O3 --cpu-target=ivybridge --math-mode=ieee  -- knucleotide.julia-7.julia 0 < knucleotide-input25000000.txt

PROGRAM OUTPUT:
A 30.295
T 30.151
C 19.800
G 19.754

AA 9.177
TA 9.132
AT 9.131
TT 9.091
CA 6.002
AC 6.001
AG 5.987
GA 5.984
CT 5.971
TC 5.971
GT 5.957
TG 5.956
CC 3.917
GC 3.911
CG 3.909
GG 3.902

1471758	GGT
446535	GGTA
47336	GGTATT
893	GGTATTTTAATT
893	GGTATTTTAATTTATAGT