The Computer Language
Benchmarks Game

k-nucleotide Julia #5 program

source code

# The Computer Language Benchmarks Game
# https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
#
# Contributed by Adam Beckmeyer

import Printf: @printf

const LINESIZE = 61
const SIZEHINT = 4 * 4096
const COUNTSFOR = zip(
    (3, 4, 5, 6, 7),
    collect.(codeunits.(("ggt", "ggta", "ggtatt", "ggtattttaatt",
                         "ggtattttaatttatagt")))
)
const PERMS1 = collect.(codeunits.((
    "a", "c", "g", "t"
)))
const PERMS2 = collect.(codeunits.((
    "aa", "ac", "ag", "at",
    "ca", "cc", "cg", "ct",
    "ga", "gc", "gg", "gt",
    "ta", "tc", "tg", "tt",
)))

const BYTE_TO_BITS = Vector{UInt8}(undef, 256)
@inbounds BYTE_TO_BITS['a' % UInt8] = 0x00
@inbounds BYTE_TO_BITS['c' % UInt8] = 0x01
@inbounds BYTE_TO_BITS['g' % UInt8] = 0x02
@inbounds BYTE_TO_BITS['t' % UInt8] = 0x03

# * Functions to do the calculation of number of occurrences
# -----

Base.@propagate_inbounds function hashvec(
    v::AbstractVector{UInt8}, ::Type{T}, indstart, indend
) where T
    result = zero(T)
    for ind in indstart:indend
        result = result<<2 | T(BYTE_TO_BITS[v[ind]])
    end#for
    result
end#function

Base.@propagate_inbounds hashvec(v, T) =
    hashvec(v, T, firstindex(v), lastindex(v))

Base.@propagate_inbounds nexthash(b::T, c::UInt8, mask::T) where T =
    mask & b<<2 | BYTE_TO_BITS[c]

function get_mask(n, ::Type{T}) where T<:Unsigned
    out = zero(T)
    for _ in 1:n
        out = out<<2 | 0x03
    end#for
    out
end#function

function count_frame!(frame, seq, dict::AbstractDict{K,V}) where {K,V}
    @specialize
    subseq = hashvec(seq, K, 1, frame)
    dict[subseq] = get(dict, subseq, zero(V)) + one(V)
    mask = get_mask(frame, K)
    
    @inbounds for i in frame + 1:length(seq)
        subseq = nexthash(subseq, seq[i], mask)
        dict[subseq] = get(dict, subseq, zero(V)) + one(V)
    end#for
end#function

# * Get input in the proper format
# -----

function get_third_seq(io)
    count = 0
    buffer_size = SIZEHINT
    buffer = Vector{UInt8}(undef, buffer_size)
    empty!(buffer)
    linebuffer = Vector{UInt8}(undef, LINESIZE)
    while !eof(io)
        if count === 3
            resize!(linebuffer, LINESIZE)
            new_length = length(buffer) + LINESIZE
            if new_length > buffer_size
                buffer_size = nextpow(2, nextpow(2, new_length))
                sizehint!(buffer, buffer_size)
            end#if
            nb = readbytes!(io, linebuffer)
            resize!(linebuffer, nb - 1)
            append!(buffer, linebuffer)
        else
            pos = position(io)
            nb = readbytes!(io, linebuffer)
            @inbounds count += first(linebuffer) === '>' % UInt8
            if last(linebuffer) !== '\n' % UInt8
                @inbounds seek(io, pos + findnext(isnewline, linebuffer, 1))
            end#if
        end#if
    end#while
    buffer
end#function

isnewline(c::UInt8)::Bool = c === '\n' % UInt8

# * Calculate and format statistics and output
# -----

function write_freq(io, d, perms)
    v = [i => d[hashvec(i, UInt8)] for i in perms]
    sort!(v; rev=true)
    total = sum(last, v)
    for (subseq, freq) in v
        write(io, uppercase(String(subseq)), ' ')
        @printf(io, "%2.3f\n", 100freq / total)
    end#for
    write(io, '\n')
end#function

function write_occurrences(io, d::AbstractDict{K}, subseq) where K
    n = get(d, hashvec(subseq, K), 0)
    print(io, n)
    write(io, '\t', uppercase(String(subseq)), '\n')
end#function

# Type piracy so things are printed in right order
function Base.isless(a::Pair{<:Vector,<:Integer}, b::Pair{<:Vector,<:Integer})
    if a.second === b.second
        isless(a.first, b.first)
    else
        isless(a.second, b.second)
    end#if
end#function

# * Tie everything together
# -----

function main(ioin, ioout)
    seq = get_third_seq(ioin)
    frames = [1, 2, 3, 4, 6, 12, 18]
    # All frames can fit into UInt32 except last one
    freqs = (Dict{UInt32,Int32}(), Dict{UInt32,Int32}(), Dict{UInt32,Int32}(),
             Dict{UInt32,Int32}(), Dict{UInt32,Int32}(), Dict{UInt32,Int32}(),
             Dict{UInt64,Int32}())
    # reverse so iterations that take longest start first
    @inbounds Threads.@threads for i in reverse(1:length(frames))
        count_frame!(frames[i], seq, freqs[i])
    end#for
    # output
    write_freq(ioout, freqs[1], PERMS1)
    write_freq(ioout, freqs[2], PERMS2)
    for (i, subseq) in COUNTSFOR
        @inbounds write_occurrences(ioout, freqs[i], subseq)
    end#for
    freqs
end#function

main(stdin, stdout)
    

notes, command-line, and program output

NOTES:
64-bit Ubuntu quad core
julia version 1.2.0
export JULIA_NUM_THREADS=4


Tue, 20 Aug 2019 16:41:23 GMT

COMMAND LINE:
/opt/src/julia-1.2.0/bin/julia -O3  -- knucleotide.julia-5.julia 0 < knucleotide-input25000000.txt

PROGRAM OUTPUT:
A 30.295
T 30.151
C 19.800
G 19.754

AA 9.177
TA 9.132
AT 9.131
TT 9.091
CA 6.002
AC 6.001
AG 5.987
GA 5.984
CT 5.971
TC 5.971
GT 5.957
TG 5.956
CC 3.917
GC 3.911
CG 3.909
GG 3.902

1471758	GGT
446535	GGTA
47336	GGTATT
893	GGTATTTTAATT
893	GGTATTTTAATTTATAGT