# k-nucleotide Julia #2 program

## source code

```julia
# The Computer Language Benchmarks Game
# https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
#
# contributed by Jean-pierre Both

using Printf

import Base.isless

# k-mer lengths to count; the driver keeps one frequency table per entry, in this order.
# Declared `const` so that functions reading this global can be compiled with a known type.
const knuclength = [1, 2, 3, 4, 6, 12, 18]

"""
    KNuc(name, count)

A nucleotide sequence `name` together with its occurrence `count`.

`name` is stored as a concrete `String` (instead of the abstract `AbstractString`) so that
field access is type-stable; any `AbstractString` given to the constructor is converted.
"""
mutable struct KNuc
    name::String
    count::Int
end

"""
    Kmer(val)

Mutable holder for a single `Int64` value `val`.
"""
mutable struct Kmer
    val::Int64
end

# Sort order for KNuc: descending by count (largest count first)

"""
    isless(x::KNuc, y::KNuc)

Ordering used when sorting `KNuc` values: `x` sorts before `y` when `x` has the larger
`count`; when the counts are equal, the entry with the greater `name` sorts first.
"""
function isless(x::KNuc, y::KNuc)
    # Different counts decide the order on their own; names only break ties.
    x.count != y.count && return x.count > y.count
    return x.name > y.name
end

"""
    print_knucs(a::AbstractVector{KNuc})

Print one line per entry of `a`, in the order given: the entry's `name`, a space, and its
`count` as a percentage of the summed counts of all entries (three decimals). A blank line
is printed after the last entry.
"""
function print_knucs(a::AbstractVector{KNuc})
    # Named `total` so the local does not shadow `Base.sum`.
    total = mapreduce(kn -> kn.count, +, a; init = 0)
    for kn in a
        @printf("%s %.3f\n", kn.name, 100.0 * kn.count / total)
    end
    println()
    return nothing
end

"""
    dumpDict(counts::Dict{String, Int})

Print the relative frequency of every key of `counts` via `print_knucs`, sorted with the
`isless(::KNuc, ::KNuc)` ordering (highest count first).
"""
function dumpDict(counts::Dict{String, Int})
    # Typed comprehension: the result is a Vector{KNuc} even when `counts` is empty.
    kn = KNuc[KNuc(name, n) for (name, n) in counts]
    sort!(kn, lt = isless)
    print_knucs(kn)
    return nothing
end

"""
    count_knuc_by_dict(shift::Int, data::String, dict::Dict{String, Int}, klength)

Count the `klength`-character substrings of `data` into `dict` and return `dict`.

# Arguments
- `shift`: number of leading characters of `data` that were carried over from the preceding
  batch (`0` for the first batch). Only substrings ending after that prefix are counted;
  the others were presumably counted together with the preceding batch.
- `data`: the text to scan. It is indexed by position, so it is assumed to contain only
  single-byte (ASCII) characters.
- `dict`: frequency table, updated in place.
- `klength`: length of the substrings to count.
"""
function count_knuc_by_dict(shift::Int, data::String, dict::Dict{String, Int}, klength)
    # End position of the first substring to count. `max` equals the original rule
    # (`klength` when shift == 0, otherwise `shift + 1`) whenever shift >= klength - 1, and
    # keeps the start index >= 1 when a shorter prefix was carried over.
    first_i = max(klength, shift + 1)
    # Plain loop: every iteration mutates the same dict, so iterations are not independent
    # and must not be annotated with `@simd`.
    for i in first_i:lastindex(data)
        key = data[(i - klength + 1):i]
        dict[key] = get(dict, key, 0) + 1
    end
    return dict
end

"""
    count_knuc_by_dict_blocks(shift, data, counts, knuclength)

Update one frequency table per substring length: for every index `j` of `knuclength`, the
`knuclength[j]`-character substrings of `data` are counted into `counts[j]` by
`count_knuc_by_dict`, which also receives `shift` unchanged. Returns `counts`.
"""
function count_knuc_by_dict_blocks(
    shift::Int, data::String, counts::Vector{Dict{String, Int}}, knuclength
)
    for j in eachindex(knuclength)
        counts[j] = count_knuc_by_dict(shift, data, counts[j], knuclength[j])
    end
    return counts
end

# The shift passed on must be the length of the largest key minus 1, i.e. 17 in this case.

"""
    perf_k_nucleotide()

Run the k-nucleotide count on standard input and print the report to standard output.

1. Input lines are skipped up to and including the first line starting with `>THREE `.
2. The remaining lines are read in batches, upper-cased and joined; for every length in
   `knuclength` the substrings of each batch are counted into a separate table. The end of
   each batch is prepended to the next one so substrings spanning two batches are counted.
3. The frequency tables for the first two lengths are printed, followed by the counts of
   five fixed sequences.

All working state is local to the function.
"""
function perf_k_nucleotide()
    # One frequency table per entry of `knuclength`.
    counts = Dict{String, Int}[Dict{String, Int}() for _ in knuclength]
    io = stdin

    # Skip everything up to and including the header of the third sequence. Testing `eof`
    # ends the loop when the header is absent instead of reading empty lines forever.
    three = ">THREE "
    while !eof(io)
        startswith(readline(io), three) && break
    end

    # Number of characters carried from one batch into the next: longest key minus one.
    overlap = maximum(knuclength) - 1
    batchsize = 100000
    lines = Vector{String}(undef, batchsize)
    tail = ""
    while !eof(io)
        nread = 0
        while nread < batchsize && !eof(io)
            nread += 1
            lines[nread] = uppercase(readline(io))
        end
        chunk = tail * join(view(lines, 1:nread))
        # The carried-over prefix length is the shift: 0 for the first batch.
        count_knuc_by_dict_blocks(ncodeunits(tail), chunk, counts, knuclength)
        # Clamp the start so input shorter than `overlap` does not raise a BoundsError.
        tail = chunk[max(1, end - overlap + 1):end]
    end

    dumpDict(counts[1])
    dumpDict(counts[2])

    # Table index => sequence whose count is reported.
    targets = (
        3 => "GGT",
        4 => "GGTA",
        5 => "GGTATT",
        6 => "GGTATTTTAATT",
        7 => "GGTATTTTAATTTATAGT",
    )
    for (j, seq) in targets
        @printf("%d\t%s\n", get(counts[j], seq, 0), seq)
    end
    return nothing
end

# Script entry point: run the benchmark as soon as the file is executed.
perf_k_nucleotide()
```

## notes, command-line, and program output

```
NOTES:
64-bit Ubuntu quad core
julia version 1.3.0

Tue, 26 Nov 2019 23:23:30 GMT

COMMAND LINE:
/opt/src/julia-1.3.0/bin/julia -O3  -- knucleotide.julia-2.julia 0 < knucleotide-input25000000.txt

PROGRAM OUTPUT:
A 30.295
T 30.151
C 19.800
G 19.754

AA 9.177
TA 9.132
AT 9.131
TT 9.091
CA 6.002
AC 6.001
AG 5.987
GA 5.984
CT 5.971
TC 5.971
GT 5.957
TG 5.956
CC 3.917
GC 3.911
CG 3.909
GG 3.902

1471758	GGT
446535	GGTA
47336	GGTATT
893	GGTATTTTAATT
893	GGTATTTTAATTTATAGT
```