k-nucleotide F# .NET #3 program
source code
(* The Computer Language Benchmarks Game
* https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
*
* Contributed by Don Syme
* Based on C# contribution by Isaac Gouy, Antti Lankila, A.Nahr, The Anh Tran
* Uses one byte array rather than strings.
*)
open System
open System.IO
open System.Collections.Generic
open System.Text
open System.Threading
let input = Console.In
let toBytes (s: string) = s.ToCharArray() |> Array.map byte
let toString (s: byte []) = String(s |> Array.map char)
let prefixes = [| "ggt"; "ggta"; "ggtatt"; "ggtattttaatt"; "ggtattttaatttatagt" |]
let prefixBytes =
[| for p in prefixes -> toBytes p |]
let prefixLengths =
[| for p in prefixBytes -> p.Length |]
let prefixOffsets = Array.scan (+) 0 prefixLengths
let dnaStart = prefixOffsets.[prefixLengths.Length]
let dna =
seq { while true do yield input.ReadLine() }
|> Seq.takeWhile (fun x -> x <> null)
|> Seq.skipWhile (fun x -> not (x.StartsWith(">THREE")))
|> Seq.skip 1
|> String.concat ""
|> toBytes
|> Array.append (Array.concat prefixBytes)
let keyHash (k, n) =
let mutable h = 0
for i in 0..n - 1 do
h <- h * 31 + int dna.[k + i]
h
let keyText (k, n) = toString(dna.[k..k + n - 1]).ToUpper()
type KeyData =
{ mutable count: int
key: int * int }
let generateFrequencies(frameSize) =
let freq = new Dictionary<int, KeyData>()
let mutable v = Unchecked.defaultof<KeyData>
for i in dnaStart..dna.Length - frameSize do
let h = keyHash (i, frameSize)
if freq.TryGetValue(h, &v) then v.count <- v.count + 1
else freq.[h] <- { count = 1; key = (i, frameSize) }
freq
let writeCount(n) =
let freq = generateFrequencies(prefixLengths.[n])
let hash = keyHash(prefixOffsets.[n], prefixLengths.[n])
let count =
if freq.ContainsKey(hash) then freq.[hash].count
else 0
String.Format("{0}\t{1}", count, prefixes.[n].ToUpper())
type Pair = KeyValuePair<int, KeyData>
let writeFrequencies(nucleotideLength) =
let freq = generateFrequencies(nucleotideLength)
let items = new ResizeArray<Pair>(freq)
items.Sort(fun (kv1: Pair) (kv2: Pair) ->
let c = kv2.Value.count - kv1.Value.count
if c = 0 then keyText(kv1.Value.key).CompareTo(keyText(kv2.Value.key))
else c)
let buf = new StringBuilder()
let sum = dna.Length - dnaStart - nucleotideLength + 1
for element in items do
let percent = double element.Value.count * 100.0 / double sum
buf.AppendFormat("{0} {1:f3}\n", keyText(element.Value.key), percent) |> ignore
buf.ToString()
Async.Parallel [ yield async { return writeFrequencies(1) }
yield async { return writeFrequencies(2) }
for i in 0..prefixes.Length - 1 do
yield async { return writeCount(i) } ]
|> Async.RunSynchronously
|> Array.iter Console.Out.WriteLine
notes, command-line, and program output
NOTES:
64-bit Ubuntu quad core
.NET SDK 9.0.100
Host Version: 9.0.0
Commit: 9d5a6a9aa4
<OutputType>Exe
<TargetFramework>net9.0
<ImplicitUsings>enable
<Nullable>enable
<AllowUnsafeBlocks>true
<ServerGarbageCollection>true
<ConcurrentGarbageCollection>true
<PublishAot>false
Fri, 15 Nov 2024 01:59:34 GMT
MAKE:
cp knucleotide.fsharpcore-3.fsharpcore Program.fs
cp Include/fsharpcore/program.fsproj .
mkdir obj
cp Include/fsharpcore/project.assets.json ./obj
/opt/src/dotnet-sdk-9.0.100/dotnet build -c Release --use-current-runtime
Determining projects to restore...
Restored /home/dunham/all-benchmarksgame/benchmarksgame_i53330/knucleotide/tmp/program.fsproj (in 825 ms).
/home/dunham/all-benchmarksgame/benchmarksgame_i53330/knucleotide/tmp/Program.fs(32,37): warning FS3261: Nullness warning: The types 'string' and 'string | null' do not have compatible nullability. [/home/dunham/all-benchmarksgame/benchmarksgame_i53330/knucleotide/tmp/program.fsproj]
/home/dunham/all-benchmarksgame/benchmarksgame_i53330/knucleotide/tmp/Program.fs(34,8): warning FS3261: Nullness warning: The types 'string' and 'string | null' do not have equivalent nullability. [/home/dunham/all-benchmarksgame/benchmarksgame_i53330/knucleotide/tmp/program.fsproj]
program -> /home/dunham/all-benchmarksgame/benchmarksgame_i53330/knucleotide/tmp/bin/Release/net9.0/linux-x64/program.dll
Build succeeded.
/home/dunham/all-benchmarksgame/benchmarksgame_i53330/knucleotide/tmp/Program.fs(32,37): warning FS3261: Nullness warning: The types 'string' and 'string | null' do not have compatible nullability. [/home/dunham/all-benchmarksgame/benchmarksgame_i53330/knucleotide/tmp/program.fsproj]
/home/dunham/all-benchmarksgame/benchmarksgame_i53330/knucleotide/tmp/Program.fs(34,8): warning FS3261: Nullness warning: The types 'string' and 'string | null' do not have equivalent nullability. [/home/dunham/all-benchmarksgame/benchmarksgame_i53330/knucleotide/tmp/program.fsproj]
2 Warning(s)
0 Error(s)
Time Elapsed 00:00:10.44
12.56s to complete and log all make actions
COMMAND LINE:
./bin/Release/net9.0/linux-x64/program 0 < knucleotide-input25000000.txt
PROGRAM OUTPUT:
A 30.295
T 30.151
C 19.800
G 19.754
AA 9.177
TA 9.132
AT 9.131
TT 9.091
CA 6.002
AC 6.001
AG 5.987
GA 5.984
CT 5.971
TC 5.971
GT 5.957
TG 5.956
CC 3.917
GC 3.911
CG 3.909
GG 3.902
1471758 GGT
446535 GGTA
47336 GGTATT
893 GGTATTTTAATT
893 GGTATTTTAATTTATAGT