source code
# The Computer Language Benchmarks Game
# https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
#
# Contributed by Adam Beckmeyer
import Printf: @printf
const LINESIZE = 61
const SIZEHINT = 4 * 4096
const FRAMES = (1, 2, 3, 4, 6, 12, 18)
const COUNTSFOR = codeunits.(("ggt", "ggta", "ggtatt", "ggtattttaatt", "ggtattttaatttatagt"))
struct Seq{T<:Unsigned}
x::T
end
Seq(T) = Seq(0 % T)
# 10 works marginally better for this hashtable than previously used 7
Base.hash(s::Seq, h::UInt) = s.x ⊻ s.x >> 10
make_mask(frame) = (UInt64(1) << 2frame) - 1
# Gives a = 0x00, c = 0x01, g = 0x03, t = 0x02
encode(c) = (c >> 1) & 0x03
# Return `c` such that `c == decode(encode(c))`.
function decode(b::UInt8)
for c in ('a' % UInt8, 'c' % UInt8, 'g' % UInt8)
b == encode(c) && return c
end
't' % UInt8
end
# Push encoding for `c` to last two bits of `s`.
push(s::Seq{T}, c, mask) where T = Seq((mask & ((s.x << 2) | encode(c))) % T)
function to_string(s::Seq, n)
out = Vector{UInt8}(undef, n)
for i in eachindex(out)
@inbounds out[i] = decode((s.x & 0x03) % UInt8)
s = Seq(s.x >> 2)
end
String(reverse!(out))
end
function count_frame!(d::Dict{Seq{T}}, seq, frame) where T
mask = make_mask(frame)
v = Seq(zero(T))
# Fill the Seq up with `frame` pairs of bits
for i=1:(frame-1)
v = @inbounds push(v, seq[i], mask)
end
# Slide Seq character-by-character over the remainder of sequence
for i=frame:length(seq)
v = @inbounds push(v, seq[i], mask)
# This does two hashtable lookups currently. Doing better
# isn't possible until julia gets something like
# https://github.com/JuliaLang/julia/pull/33758
d[v] = get(d, v, 0) + 1
end
end
struct SeqFreq
seq::String
freq::Int32
total::Int32
end
# A sequence of characters that occurs `freq` times out of `total`.
SeqFreq(seq::Seq, freq, frame, total) = SeqFreq(to_string(seq, frame), freq, total)
# Defined so that arrays of SeqFreq can be sort!ed
Base.isless(a::SeqFreq, b::SeqFreq) =
a.freq == b.freq ? isless(a.seq, b.seq) : isless(a.freq, b.freq)
function Base.show(io::IO, x::SeqFreq)
write(io, uppercase(String(x.seq)), ' ')
@printf(io, "%2.3f", 100x.freq / x.total)
end
function write_freqs(io, d, frame, total)
seqfreqs = [SeqFreq(seq, d[seq], frame, total) for seq in keys(d)]
sort!(seqfreqs; rev=true)
for seqf in seqfreqs
println(io, seqf)
end
write(io, '\n')
end
# Lifted directly from knucleotide-julia-6
function get_third_seq(io)
count = 0
buffer_size = SIZEHINT
buffer = Vector{UInt8}(undef, buffer_size)
empty!(buffer)
linebuffer = Vector{UInt8}(undef, LINESIZE)
while !eof(io)
if count === 3
resize!(linebuffer, LINESIZE)
new_length = length(buffer) + LINESIZE
if new_length > buffer_size
buffer_size = nextpow(2, nextpow(2, new_length))
sizehint!(buffer, buffer_size)
end#if
nb = readbytes!(io, linebuffer)
resize!(linebuffer, nb - 1)
append!(buffer, linebuffer)
else
pos = position(io)
nb = readbytes!(io, linebuffer)
@inbounds count += first(linebuffer) === '>' % UInt8
if last(linebuffer) !== '\n' % UInt8
@inbounds seek(io, pos + findnext(isnewline, linebuffer, 1))
end#if
end#if
end#while
buffer
end#function
isnewline(c::UInt8)::Bool = c === '\n' % UInt8
function main(io, out)
seq = get_third_seq(io)
freqs = ((Dict{Seq{UInt32},Int32}() for _=1:6)..., Dict{Seq{UInt64},Int32}())
Threads.@threads for i=1:7
@inbounds count_frame!(freqs[i], seq, FRAMES[i])
end
@inbounds write_freqs(out, freqs[1], FRAMES[1], length(seq) - 1)
@inbounds write_freqs(out, freqs[2], FRAMES[2], length(seq) - 2)
mask = make_mask(18)
for (i, v) in enumerate(COUNTSFOR)
d = @inbounds freqs[i+2]
frame = @inbounds FRAMES[i+2]
# This is ugly but less ugly than having an entirely separate
# loop for a single UInt64 Dict
k = frame == 18 ? Seq(UInt64) : Seq(UInt32)
for c in v
k = push(k, c, mask)
end
write(out, string(d[k]), '\t', uppercase(String(v)), '\n')
end
freqs
end
isinteractive() || main(stdin, stdout)