The Computer Language
24.11 Benchmarks Game

k-nucleotide F# .NET #6 program

source code

// The Computer Language Benchmarks Game
// https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
//
// ported from C# version by Anthony Lloyd
// Use DictionarySlim by Anthony Lloyd

open System
open Microsoft.Collections.Extensions

[<Literal>]
let BLOCK_SIZE = 8388608 // 1024 * 1024 * 8

[<EntryPoint>]
let main _ =
  let threeStart,threeBlocks,threeEnd =
    let input = Console.OpenStandardInput()
    let mutable threeEnd = 0
    let read buffer =
        let rec read offset count =
            let bytesRead = input.Read(buffer, offset, count)
            if bytesRead=count then offset+count
            elif bytesRead=0 then offset
            else read (offset+bytesRead) (count-bytesRead)
        threeEnd <- read 0 BLOCK_SIZE

    let rec findHeader matchIndex buffer =
        let toFind = ">THREE"B
        let find i matchIndex =
            let rec find i matchIndex =
                if matchIndex=0 then
                    let i = Array.IndexOf(buffer, toFind.[0], i)
                    if -1=i then -1,0
                    else find (i+1) 1
                else
                    let fl = toFind.Length
                    let rec tryMatch i matchIndex =
                        if i>=BLOCK_SIZE || matchIndex>=fl then i,matchIndex
                        else
                            if buffer.[i]=toFind.[matchIndex] then
                                tryMatch (i+1) (matchIndex+1)
                            else
                                find i 0
                    let i,matchIndex = tryMatch i matchIndex
                    if matchIndex=fl then i,matchIndex else -1,matchIndex
            find i matchIndex
        read buffer
        let i,matchIndex = find 0 matchIndex
        if -1<>i then i,buffer
        else findHeader matchIndex buffer

    let rec findSequence i buffer =
        let i = Array.IndexOf(buffer, '\n'B, i)
        if i <> -1 then buffer,i+1
        else
            read buffer
            findSequence 0 buffer

    let buffer,threeStart = Array.zeroCreate BLOCK_SIZE
                            |> findHeader 0 ||> findSequence

    let threeBlocks =
        if threeEnd<>BLOCK_SIZE then // Needs to be at least 2 blocks
            for i = threeEnd to BLOCK_SIZE-1 do
                buffer.[i] <- 255uy
            threeEnd <- 0
            [[||];buffer]
        else
            let rec findEnd i buffer threeBlocks =
                let i = Array.IndexOf(buffer, '>'B, i)
                if i <> -1 then
                    threeEnd <- i
                    buffer::threeBlocks
                else
                    let threeBlocks = buffer::threeBlocks
                    let buffer = Array.zeroCreate BLOCK_SIZE
                    read buffer
                    if threeEnd<>BLOCK_SIZE then buffer::threeBlocks
                    else findEnd 0 buffer threeBlocks
            let threeBlocks = findEnd threeStart buffer []
            if threeStart+18>BLOCK_SIZE then // Key needs to be in first block
                let block0 = threeBlocks.[0]
                let block1 = threeBlocks.[1]
                Buffer.BlockCopy(block0, threeStart, block0, threeStart-18,
                    BLOCK_SIZE-threeStart)
                Buffer.BlockCopy(block1, 0, block0, BLOCK_SIZE-18, 18)
                for i = 0 to 17 do block1.[i] <- 255uy
            threeBlocks

    threeStart, List.rev threeBlocks |> List.toArray, threeEnd

  let toChar = [|'A'; 'C'; 'G'; 'T'|]
  let toNum = Array.zeroCreate 256
  toNum.[int 'c'B] <- 1uy; toNum.[int 'C'B] <- 1uy
  toNum.[int 'g'B] <- 2uy; toNum.[int 'G'B] <- 2uy
  toNum.[int 't'B] <- 3uy; toNum.[int 'T'B] <- 3uy
  toNum.[int '\n'B] <- 255uy; toNum.[int '>'B] <- 255uy; toNum.[255] <- 255uy

  Array.Parallel.iter (fun bs ->
    for i = 0 to Array.length bs-1 do
        bs.[i] <- toNum.[int bs.[i]]
  ) threeBlocks

  let count l mask (summary:_->string) = async {
      let mutable rollingKey = 0
      let firstBlock = threeBlocks.[0]
      let rec startKey l start =
          if l>0 then
             rollingKey <- rollingKey <<< 2 ||| int firstBlock.[start]
             startKey (l-1) (start+1)
      startKey l threeStart
      let dict = DictionarySlim<int,int> 256
      let inline check a lo hi =
        for i = lo to hi do
          let nb = Array.get a i
          if nb<>255uy then
              rollingKey <- rollingKey &&& mask <<< 2 ||| int nb
              let x = &dict.GetOrAddValueRef(rollingKey)
              x <- x + 1

      check firstBlock (threeStart+l) (BLOCK_SIZE-1)
      
      for i = 1 to threeBlocks.Length-2 do
          check threeBlocks.[i] 0 (BLOCK_SIZE-1)
          
      let lastBlock = threeBlocks.[threeBlocks.Length-1]
      check lastBlock 0 (threeEnd-1)
      return summary dict
    }

  let writeFrequencies fragmentLength (freq:DictionarySlim<int,int>) =
    let percent = 100.0 / (freq |> Seq.sumBy (fun i -> i.Value) |> float)
    freq |> Seq.sortByDescending (fun kv -> kv.Value)
    |> Seq.collect (fun kv ->
        let keyChars = Array.zeroCreate fragmentLength
        let mutable key = kv.Key
        for i in keyChars.Length-1..-1..0 do
            keyChars.[i] <- toChar.[int key &&& 0x3]
            key <- key >>> 2
        [String(keyChars);" ";(float kv.Value * percent).ToString("F3");"\n"]
      )
    |> String.Concat

  let writeCount (fragment:string) (dict:DictionarySlim<int,int>) =
    let mutable key = 0
    for i = 0 to fragment.Length-1 do
        key <- key <<< 2 ||| int toNum.[int fragment.[i]]
    let _,v = dict.TryGetValue key
    String.Concat(string v, "\t", fragment)

  let countEnding l mask b =
    let mutable rollingKey = 0L
    let firstBlock = threeBlocks.[0]
    let rec startKey l start =
          if l>0 then
             rollingKey <- rollingKey <<< 2 ||| int64 firstBlock.[start]
             startKey (l-1) (start+1)
    startKey l threeStart
    let dict = DictionarySlim<int64,int> 256
    let inline check a lo hi =
        for i = lo to hi do
          let nb = Array.get a i
          if nb=b then
            rollingKey <- rollingKey &&& mask <<< 2 ||| int64 nb
            let x = &dict.GetOrAddValueRef(rollingKey)
            x <- x + 1
          elif nb<>255uy then
            rollingKey <- rollingKey &&& mask <<< 2 ||| int64 nb

    check firstBlock (threeStart+l) (BLOCK_SIZE-1)

    for i = 1 to threeBlocks.Length-2 do
        check threeBlocks.[i] 0 (BLOCK_SIZE-1)

    let lastBlock = threeBlocks.[threeBlocks.Length-1]
    check lastBlock 0 (threeEnd-1)

    dict

  let count64 l mask (summary:_->string) = async {
      let! dicts =
        Seq.init 4 (fun i -> async { return byte i |> countEnding l mask })
        |> Async.Parallel
      let d = DictionarySlim<int64,int>(dicts |> Array.sumBy (fun i -> i.Count))
      dicts |> Array.iter (fun di ->
        di |> Seq.iter (fun kv ->
          let x = &d.GetOrAddValueRef kv.Key
          x <- kv.Value
        )
      )
      return summary d
    }

  let writeCount64 (fragment:string) (dict:DictionarySlim<_,_>) =
    let mutable key = 0L
    for i = 0 to fragment.Length-1 do
        key <- key <<< 2 ||| int64 toNum.[int fragment.[i]]
    let _,v = dict.TryGetValue key
    String.Concat(string v, "\t", fragment)

  Async.Parallel [
    count64 18 0x7FFFFFFFFL (writeCount64 "GGTATTTTAATTTATAGT")
    count 12 0x7FFFFF (writeCount "GGTATTTTAATT")
    count 6 0x3FF (writeCount "GGTATT")
    count 4 0x3F (writeCount "GGTA")
    count 3 0xF (writeCount "GGT")
    count 2 0x3 (writeFrequencies 2)
    count 1 0 (writeFrequencies 1)
  ]
  |> Async.RunSynchronously
  |> Array.rev
  |> Array.iter stdout.WriteLine

  exit 0
    

notes, command-line, and program output

NOTES:
64-bit Ubuntu quad core
.NET SDK 9.0.100
Host Version: 9.0.0
Commit: 9d5a6a9aa4

<OutputType>Exe
<TargetFramework>net9.0
<ImplicitUsings>enable
<Nullable>enable
<AllowUnsafeBlocks>true
<ServerGarbageCollection>true
<ConcurrentGarbageCollection>true
<PublishAot>false


 Fri, 15 Nov 2024 02:06:16 GMT

MAKE:
cp knucleotide.fsharpcore-6.fsharpcore Program.fs
cp Include/fsharpcore/program.fsproj .
mkdir obj
cp Include/fsharpcore/project.assets.json ./obj
/opt/src/dotnet-sdk-9.0.100/dotnet build -c Release --use-current-runtime  	
  Determining projects to restore...
/home/dunham/all-benchmarksgame/benchmarksgame_i53330/knucleotide/tmp/program.fsproj : warning NU1900: Error occurred while getting package vulnerability data: Unable to load the service index for source https://api.nuget.org/v3/index.json.
  Restored /home/dunham/all-benchmarksgame/benchmarksgame_i53330/knucleotide/tmp/program.fsproj (in 6.23 sec).
/home/dunham/all-benchmarksgame/benchmarksgame_i53330/knucleotide/tmp/program.fsproj : warning NU1900: Error occurred while getting package vulnerability data: Unable to load the service index for source https://api.nuget.org/v3/index.json.
  program -> /home/dunham/all-benchmarksgame/benchmarksgame_i53330/knucleotide/tmp/bin/Release/net9.0/linux-x64/program.dll

Build succeeded.

/home/dunham/all-benchmarksgame/benchmarksgame_i53330/knucleotide/tmp/program.fsproj : warning NU1900: Error occurred while getting package vulnerability data: Unable to load the service index for source https://api.nuget.org/v3/index.json.
/home/dunham/all-benchmarksgame/benchmarksgame_i53330/knucleotide/tmp/program.fsproj : warning NU1900: Error occurred while getting package vulnerability data: Unable to load the service index for source https://api.nuget.org/v3/index.json.
    2 Warning(s)
    0 Error(s)

Time Elapsed 00:00:15.66

17.82s to complete and log all make actions

COMMAND LINE:
 ./bin/Release/net9.0/linux-x64/program 0 < knucleotide-input25000000.txt

PROGRAM OUTPUT:
A 30.295
T 30.151
C 19.800
G 19.754

AA 9.177
TA 9.132
AT 9.131
TT 9.091
CA 6.002
AC 6.001
AG 5.987
GA 5.984
CT 5.971
TC 5.971
GT 5.957
TG 5.956
CC 3.917
GC 3.911
CG 3.909
GG 3.902

1471758	GGT
446535	GGTA
47336	GGTATT
893	GGTATTTTAATT
893	GGTATTTTAATTTATAGT