# k-nucleotide Julia #4 program

## source code

```# The Computer Language Benchmarks Game
# https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
#

import Printf: @printf

const LINESIZE = 61
const SIZEHINT = 4 * 4096
const COUNTSFOR = zip(
(3, 4, 5, 6, 7),
collect.(codeunits.(("ggt", "ggta", "ggtatt", "ggtattttaatt",
"ggtattttaatttatagt")))
)
const PERMS1 = collect.(codeunits.((
"a", "c", "g", "t"
)))
const PERMS2 = collect.(codeunits.((
"aa", "ac", "ag", "at",
"ca", "cc", "cg", "ct",
"ga", "gc", "gg", "gt",
"ta", "tc", "tg", "tt",
)))

const BYTE_TO_BITS = Vector{UInt8}(undef, 256)
@inbounds BYTE_TO_BITS['a' % UInt8] = 0x00
@inbounds BYTE_TO_BITS['c' % UInt8] = 0x01
@inbounds BYTE_TO_BITS['g' % UInt8] = 0x02
@inbounds BYTE_TO_BITS['t' % UInt8] = 0x03

# * Functions to do the calculation of number of occurrences
# -----

Base.@propagate_inbounds function hashvec(
v::AbstractVector{UInt8}, ::Type{T}, indstart, indend
) where T
result = zero(T)
for ind in indstart:indend
result = result<<2 | T(BYTE_TO_BITS[v[ind]])
end#for
result
end#function

Base.@propagate_inbounds hashvec(v, T) =
hashvec(v, T, firstindex(v), lastindex(v))

Base.@propagate_inbounds nexthash(b::T, c::UInt8, mask::T) where T =

out = zero(T)
for _ in 1:n
out = out<<2 | 0x03
end#for
out
end#function

function count_frame!(frame, seq, dict::AbstractDict{K,V}) where {K,V}
@specialize
subseq = hashvec(seq, K, 1, frame)
dict[subseq] = get(dict, subseq, zero(V)) + one(V)

@inbounds for i in frame + 1:length(seq)
dict[subseq] = get(dict, subseq, zero(V)) + one(V)
end#for
end#function

# * Get input in the proper format
# -----

function get_third_seq(io)
count = 0
buffer_size = SIZEHINT
buffer = Vector{UInt8}(undef, buffer_size)
empty!(buffer)
linebuffer = Vector{UInt8}(undef, LINESIZE)
while !eof(io)
if count === 3
resize!(linebuffer, LINESIZE)
new_length = length(buffer) + LINESIZE
if new_length > buffer_size
buffer_size = nextpow(2, nextpow(2, new_length))
sizehint!(buffer, buffer_size)
end#if
resize!(linebuffer, nb - 1)
append!(buffer, linebuffer)
else
pos = position(io)
@inbounds count += first(linebuffer) === '>' % UInt8
if last(linebuffer) !== '\n' % UInt8
@inbounds seek(io, pos + findnext(isnewline, linebuffer, 1))
end#if
end#if
end#while
buffer
end#function

isnewline(c::UInt8)::Bool = c === '\n' % UInt8

# * Calculate and format statistics and output
# -----

function write_freq(io, d, perms)
v = [i => d[hashvec(i, UInt8)] for i in perms]
sort!(v; rev=true)
total = sum(last, v)
for (subseq, freq) in v
write(io, uppercase(String(subseq)), ' ')
@printf(io, "%2.3f\n", 100freq / total)
end#for
write(io, '\n')
end#function

function write_occurrences(io, d::AbstractDict{K}, subseq) where K
n = get(d, hashvec(subseq, K), 0)
print(io, n)
write(io, '\t', uppercase(String(subseq)), '\n')
end#function

# Type piracy so things are printed in right order
function Base.isless(a::Pair{<:Vector,<:Integer}, b::Pair{<:Vector,<:Integer})
if a.second === b.second
isless(a.first, b.first)
else
isless(a.second, b.second)
end#if
end#function

# * Tie everything together
# -----

function main(ioin, ioout)
seq = get_third_seq(ioin)
frames = [1, 2, 3, 4, 6, 12, 18]
# UInt8 for frames 1-4, UInt16 for 6, UInt32 for 12, UInt64 for 18
freqs = (Dict{UInt8,Int32}(), Dict{UInt8,Int32}(), Dict{UInt8,Int32}(),
Dict{UInt8,Int32}(), Dict{UInt16,Int32}(), Dict{UInt32,Int32}(),
Dict{UInt64,Int32}())
# reverse so iterations that take longest start first
count_frame!(frames[i], seq, freqs[i])
end#for
# output
write_freq(ioout, freqs[1], PERMS1)
write_freq(ioout, freqs[2], PERMS2)
for (i, subseq) in COUNTSFOR
@inbounds write_occurrences(ioout, freqs[i], subseq)
end#for
freqs
end#function

main(stdin, stdout)
```

## notes, command-line, and program output

```NOTES:
julia version 1.3.0

Tue, 26 Nov 2019 23:20:43 GMT

COMMAND LINE:
/opt/src/julia-1.3.0/bin/julia -O3  -- knucleotide.julia-4.julia 0 < knucleotide-input25000000.txt

PROGRAM OUTPUT:
A 30.295
T 30.151
C 19.800
G 19.754

AA 9.177
TA 9.132
AT 9.131
TT 9.091
CA 6.002
AC 6.001
AG 5.987
GA 5.984
CT 5.971
TC 5.971
GT 5.957
TG 5.956
CC 3.917
GC 3.911
CG 3.909
GG 3.902

1471758	GGT
446535	GGTA
47336	GGTATT
893	GGTATTTTAATT
893	GGTATTTTAATTTATAGT
```