The Computer Language
24.09 Benchmarks Game

k-nucleotide C# .NET program

source code

// The Computer Language Benchmarks Game
// https://benchmarksgame-team.pages.debian.net/benchmarksgame/

// submitted by Josh Goldfoot
// Modified to reduce memory and do more in parallel by Anthony Lloyd
// Added dictionary incrementor by Anthony Lloyd
// Use DictionarySlim by Anthony Lloyd

using System;
using System.IO;
using System.Text;
using System.Linq;
using System.Collections.Generic;
using System.Threading.Tasks;
using System.Runtime.CompilerServices;
using Microsoft.Collections.Extensions;

public static class KNucleotide
{
    const int BLOCK_SIZE = 1024 * 1024 * 8;
    public static List<byte[]> threeBlocks = new List<byte[]>();
    public static int threeStart, threeEnd;
    static readonly byte[] tonum = new byte[256];
    static readonly char[] tochar = new char[] { 'A', 'C', 'G', 'T' };

    static int Read(Stream stream, byte[] buffer, int offset, int count)
    {
        var bytesRead = stream.Read(buffer, offset, count);
        return bytesRead == count ? offset + count
             : bytesRead == 0 ? offset
             : Read(stream, buffer, offset + bytesRead, count - bytesRead);
    }

    static int Find(byte[] buffer, byte[] toFind, int i, ref int matchIndex)
    {
        if (matchIndex == 0)
        {
            i = Array.IndexOf(buffer, toFind[0], i);
            if (i == -1) return -1;
            matchIndex = 1;
            return Find(buffer, toFind, i + 1, ref matchIndex);
        }
        else
        {
            int bl = buffer.Length, fl = toFind.Length;
            while (i < bl && matchIndex < fl)
            {
                if (buffer[i++] != toFind[matchIndex++])
                {
                    matchIndex = 0;
                    return Find(buffer, toFind, i, ref matchIndex);
                }
            }
            return matchIndex == fl ? i : -1;
        }
    }

    public static void LoadThreeData()
    {
        var stream = Console.OpenStandardInput();

        // find three sequence
        int matchIndex = 0;
        var toFind = new[] { (byte)'>', (byte)'T', (byte)'H', (byte)'R', (byte)'E', (byte)'E' };
        var buffer = new byte[BLOCK_SIZE];
        do
        {
            threeEnd = Read(stream, buffer, 0, BLOCK_SIZE);
            threeStart = Find(buffer, toFind, 0, ref matchIndex);
        } while (threeStart == -1);

        // Skip to end of line
        matchIndex = 0;
        toFind = new[] { (byte)'\n' };
        threeStart = Find(buffer, toFind, threeStart, ref matchIndex);
        while (threeStart == -1)
        {
            threeEnd = Read(stream, buffer, 0, BLOCK_SIZE);
            threeStart = Find(buffer, toFind, 0, ref matchIndex);
        }
        threeBlocks.Add(buffer);

        if (threeEnd != BLOCK_SIZE) // Needs to be at least 2 blocks
        {
            var bytes = threeBlocks[0];
            for (int i = threeEnd; i < bytes.Length; i++)
                bytes[i] = 255;
            threeEnd = 0;
            threeBlocks.Add(Array.Empty<byte>());
            return;
        }

        // find next seq or end of input
        matchIndex = 0;
        toFind = new[] { (byte)'>' };
        threeEnd = Find(buffer, toFind, threeStart, ref matchIndex);
        while (threeEnd == -1)
        {
            buffer = new byte[BLOCK_SIZE];
            var bytesRead = Read(stream, buffer, 0, BLOCK_SIZE);
            threeEnd = bytesRead == BLOCK_SIZE ? Find(buffer, toFind, 0, ref matchIndex)
                        : bytesRead;
            threeBlocks.Add(buffer);
        }

        if (threeStart + 18 > BLOCK_SIZE) // Key needs to be in the first block
        {
            byte[] block0 = threeBlocks[0], block1 = threeBlocks[1];
            Buffer.BlockCopy(block0, threeStart, block0, threeStart - 18, BLOCK_SIZE - threeStart);
            Buffer.BlockCopy(block1, 0, block0, BLOCK_SIZE - 18, 18);
            for (int i = 0; i < 18; i++) block1[i] = 255;
        }
    }

    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    private static void Check(DictionarySlim<long, int> dict, ref long rollingKey, byte nb, long mask)
    {
        if (nb == 255) return;
        rollingKey = ((rollingKey & mask) << 2) | nb;
        dict.GetOrAddValueRef(rollingKey)++;
    }

    static Task<string> Count(int l, long mask, Func<DictionarySlim<long, int>, string> summary)
    {
        return Task.Run(() =>
        {
            long rollingKey = 0;
            var firstBlock = threeBlocks[0];
            var start = threeStart;
            while (--l > 0) rollingKey = (rollingKey << 2) | firstBlock[start++];
            var dict = new DictionarySlim<long, int>(1024);
            
            for (int i = start; i < firstBlock.Length; i++)
                Check(dict, ref rollingKey, firstBlock[i], mask);

            int lastBlockId = threeBlocks.Count - 1;
            for (int bl = 1; bl < lastBlockId; bl++)
            {
                var bytes = threeBlocks[bl];
                for (int i = 0; i < bytes.Length; i++)
                    Check(dict, ref rollingKey, bytes[i], mask);
            }

            var lastBlock = threeBlocks[lastBlockId];
            for (int i = 0; i < threeEnd; i++)
                Check(dict, ref rollingKey, lastBlock[i], mask);
            
            return summary(dict);
        });
    }

    static string WriteFrequencies(DictionarySlim<long, int> freq, int fragmentLength)
    {
        var sb = new StringBuilder();
        double percent = 100.0 / freq.Select(x => x.Value).Sum();
        foreach (var kv in freq.OrderByDescending(i => i.Value))
        {
            var keyChars = new char[fragmentLength];
            var key = kv.Key;
            for (int i = keyChars.Length - 1; i >= 0; --i)
            {
                keyChars[i] = tochar[key & 0x3];
                key >>= 2;
            }
            sb.Append(keyChars);
            sb.Append(" ");
            sb.AppendLine((kv.Value * percent).ToString("F3"));
        }
        return sb.ToString();
    }

    static string WriteCount(DictionarySlim<long, int> dictionary, string fragment)
    {
        long key = 0;
        for (int i = 0; i < fragment.Length; ++i)
            key = (key << 2) | tonum[fragment[i]];
        dictionary.TryGetValue(key, out var v);
        return string.Concat(v.ToString(), "\t", fragment);
    }

    public static void Main(string[] args)
    {
        tonum['c'] = 1; tonum['C'] = 1;
        tonum['g'] = 2; tonum['G'] = 2;
        tonum['t'] = 3; tonum['T'] = 3;
        tonum['\n'] = 255; tonum['>'] = 255; tonum[255] = 255;

        LoadThreeData();

        Parallel.ForEach(threeBlocks, bytes =>
        {
            for (int i = 0; i < bytes.Length; i++)
                bytes[i] = tonum[bytes[i]];
        });

        var task18 = Count(18, 34359738367, d => WriteCount(d, "GGTATTTTAATTTATAGT"));
        var task12 = Count(12, 8388607, d => WriteCount(d, "GGTATTTTAATT"));
        var task6 = Count(6, 0b1111111111, d => WriteCount(d, "GGTATT"));
        var task1 = Count(1, 0, d => WriteFrequencies(d, 1));
        var task2 = Count(2, 0b11, d => WriteFrequencies(d, 2));
        var task3 = Count(3, 0b1111, d => WriteCount(d, "GGT"));
        var task4 = Count(4, 0b111111, d => WriteCount(d, "GGTA"));

        Console.Out.WriteLineAsync(task1.Result);
        Console.Out.WriteLineAsync(task2.Result);
        Console.Out.WriteLineAsync(task3.Result);
        Console.Out.WriteLineAsync(task4.Result);
        Console.Out.WriteLineAsync(task6.Result);
        Console.Out.WriteLineAsync(task12.Result);
        Console.WriteLine(task18.Result);
    }
}
    

notes, command-line, and program output

NOTES:
64-bit Ubuntu quad core
.NET SDK 8.0.301
Host Version: 8.0.6
Commit: 3b8b000a0e

<OutputType>Exe
<TargetFramework>net8.0
<ImplicitUsings>enable
<Nullable>enable
<AllowUnsafeBlocks>true
<ServerGarbageCollection>true
<ConcurrentGarbageCollection>true
<PublishAot>false
<OptimizationPreference>Speed
<IlcInstructionSet>native


 Wed, 29 May 2024 21:21:53 GMT

MAKE:
cp knucleotide.csharpcore Program.cs
cp Include/csharpcore/program.csproj .
mkdir obj
cp Include/csharpcore/project.assets.json ./obj
~/dotnet/dotnet build -c Release --use-current-runtime  	
  Determining projects to restore...
  Restored /home/dunham/all-benchmarksgame/benchmarksgame_i53330/knucleotide/tmp/program.csproj (in 839 ms).
  program -> /home/dunham/all-benchmarksgame/benchmarksgame_i53330/knucleotide/tmp/bin/Release/net8.0/linux-x64/program.dll

Build succeeded.
    0 Warning(s)
    0 Error(s)

Time Elapsed 00:00:05.13

7.31s to complete and log all make actions

COMMAND LINE:
 ./bin/Release/net8.0/linux-x64/program 0 < knucleotide-input25000000.txt

PROGRAM OUTPUT:
A 30.295
T 30.151
C 19.800
G 19.754

AA 9.177
TA 9.132
AT 9.131
TT 9.091
CA 6.002
AC 6.001
AG 5.987
GA 5.984
CT 5.971
TC 5.971
GT 5.957
TG 5.956
CC 3.917
GC 3.911
CG 3.909
GG 3.902

1471758	GGT
446535	GGTA
47336	GGTATT
893	GGTATTTTAATT
893	GGTATTTTAATTTATAGT