The Computer Language
24.04 Benchmarks Game

k-nucleotide C# .NET program

source code

// The Computer Language Benchmarks Game
// https://benchmarksgame-team.pages.debian.net/benchmarksgame/

// submitted by Josh Goldfoot
// Modified to reduce memory and do more in parallel by Anthony Lloyd
// Added dictionary incrementor by Anthony Lloyd
// Use DictionarySlim by Anthony Lloyd

using System;
using System.IO;
using System.Text;
using System.Linq;
using System.Collections.Generic;
using System.Threading.Tasks;
using System.Runtime.CompilerServices;
using Microsoft.Collections.Extensions;

public static class KNucleotide
{
    const int BLOCK_SIZE = 1024 * 1024 * 8;
    public static List<byte[]> threeBlocks = new List<byte[]>();
    public static int threeStart, threeEnd;
    static readonly byte[] tonum = new byte[256];
    static readonly char[] tochar = new char[] { 'A', 'C', 'G', 'T' };

    static int Read(Stream stream, byte[] buffer, int offset, int count)
    {
        var bytesRead = stream.Read(buffer, offset, count);
        return bytesRead == count ? offset + count
             : bytesRead == 0 ? offset
             : Read(stream, buffer, offset + bytesRead, count - bytesRead);
    }

    static int Find(byte[] buffer, byte[] toFind, int i, ref int matchIndex)
    {
        if (matchIndex == 0)
        {
            i = Array.IndexOf(buffer, toFind[0], i);
            if (i == -1) return -1;
            matchIndex = 1;
            return Find(buffer, toFind, i + 1, ref matchIndex);
        }
        else
        {
            int bl = buffer.Length, fl = toFind.Length;
            while (i < bl && matchIndex < fl)
            {
                if (buffer[i++] != toFind[matchIndex++])
                {
                    matchIndex = 0;
                    return Find(buffer, toFind, i, ref matchIndex);
                }
            }
            return matchIndex == fl ? i : -1;
        }
    }

    public static void LoadThreeData()
    {
        var stream = Console.OpenStandardInput();

        // find three sequence
        int matchIndex = 0;
        var toFind = new[] { (byte)'>', (byte)'T', (byte)'H', (byte)'R', (byte)'E', (byte)'E' };
        var buffer = new byte[BLOCK_SIZE];
        do
        {
            threeEnd = Read(stream, buffer, 0, BLOCK_SIZE);
            threeStart = Find(buffer, toFind, 0, ref matchIndex);
        } while (threeStart == -1);

        // Skip to end of line
        matchIndex = 0;
        toFind = new[] { (byte)'\n' };
        threeStart = Find(buffer, toFind, threeStart, ref matchIndex);
        while (threeStart == -1)
        {
            threeEnd = Read(stream, buffer, 0, BLOCK_SIZE);
            threeStart = Find(buffer, toFind, 0, ref matchIndex);
        }
        threeBlocks.Add(buffer);

        if (threeEnd != BLOCK_SIZE) // Needs to be at least 2 blocks
        {
            var bytes = threeBlocks[0];
            for (int i = threeEnd; i < bytes.Length; i++)
                bytes[i] = 255;
            threeEnd = 0;
            threeBlocks.Add(Array.Empty<byte>());
            return;
        }

        // find next seq or end of input
        matchIndex = 0;
        toFind = new[] { (byte)'>' };
        threeEnd = Find(buffer, toFind, threeStart, ref matchIndex);
        while (threeEnd == -1)
        {
            buffer = new byte[BLOCK_SIZE];
            var bytesRead = Read(stream, buffer, 0, BLOCK_SIZE);
            threeEnd = bytesRead == BLOCK_SIZE ? Find(buffer, toFind, 0, ref matchIndex)
                        : bytesRead;
            threeBlocks.Add(buffer);
        }

        if (threeStart + 18 > BLOCK_SIZE) // Key needs to be in the first block
        {
            byte[] block0 = threeBlocks[0], block1 = threeBlocks[1];
            Buffer.BlockCopy(block0, threeStart, block0, threeStart - 18, BLOCK_SIZE - threeStart);
            Buffer.BlockCopy(block1, 0, block0, BLOCK_SIZE - 18, 18);
            for (int i = 0; i < 18; i++) block1[i] = 255;
        }
    }

    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    private static void Check(DictionarySlim<long, int> dict, ref long rollingKey, byte nb, long mask)
    {
        if (nb == 255) return;
        rollingKey = ((rollingKey & mask) << 2) | nb;
        dict.GetOrAddValueRef(rollingKey)++;
    }

    static Task<string> Count(int l, long mask, Func<DictionarySlim<long, int>, string> summary)
    {
        return Task.Run(() =>
        {
            long rollingKey = 0;
            var firstBlock = threeBlocks[0];
            var start = threeStart;
            while (--l > 0) rollingKey = (rollingKey << 2) | firstBlock[start++];
            var dict = new DictionarySlim<long, int>(1024);
            
            for (int i = start; i < firstBlock.Length; i++)
                Check(dict, ref rollingKey, firstBlock[i], mask);

            int lastBlockId = threeBlocks.Count - 1;
            for (int bl = 1; bl < lastBlockId; bl++)
            {
                var bytes = threeBlocks[bl];
                for (int i = 0; i < bytes.Length; i++)
                    Check(dict, ref rollingKey, bytes[i], mask);
            }

            var lastBlock = threeBlocks[lastBlockId];
            for (int i = 0; i < threeEnd; i++)
                Check(dict, ref rollingKey, lastBlock[i], mask);
            
            return summary(dict);
        });
    }

    static string WriteFrequencies(DictionarySlim<long, int> freq, int fragmentLength)
    {
        var sb = new StringBuilder();
        double percent = 100.0 / freq.Select(x => x.Value).Sum();
        foreach (var kv in freq.OrderByDescending(i => i.Value))
        {
            var keyChars = new char[fragmentLength];
            var key = kv.Key;
            for (int i = keyChars.Length - 1; i >= 0; --i)
            {
                keyChars[i] = tochar[key & 0x3];
                key >>= 2;
            }
            sb.Append(keyChars);
            sb.Append(" ");
            sb.AppendLine((kv.Value * percent).ToString("F3"));
        }
        return sb.ToString();
    }

    static string WriteCount(DictionarySlim<long, int> dictionary, string fragment)
    {
        long key = 0;
        for (int i = 0; i < fragment.Length; ++i)
            key = (key << 2) | tonum[fragment[i]];
        dictionary.TryGetValue(key, out var v);
        return string.Concat(v.ToString(), "\t", fragment);
    }

    public static void Main(string[] args)
    {
        tonum['c'] = 1; tonum['C'] = 1;
        tonum['g'] = 2; tonum['G'] = 2;
        tonum['t'] = 3; tonum['T'] = 3;
        tonum['\n'] = 255; tonum['>'] = 255; tonum[255] = 255;

        LoadThreeData();

        Parallel.ForEach(threeBlocks, bytes =>
        {
            for (int i = 0; i < bytes.Length; i++)
                bytes[i] = tonum[bytes[i]];
        });

        var task18 = Count(18, 34359738367, d => WriteCount(d, "GGTATTTTAATTTATAGT"));
        var task12 = Count(12, 8388607, d => WriteCount(d, "GGTATTTTAATT"));
        var task6 = Count(6, 0b1111111111, d => WriteCount(d, "GGTATT"));
        var task1 = Count(1, 0, d => WriteFrequencies(d, 1));
        var task2 = Count(2, 0b11, d => WriteFrequencies(d, 2));
        var task3 = Count(3, 0b1111, d => WriteCount(d, "GGT"));
        var task4 = Count(4, 0b111111, d => WriteCount(d, "GGTA"));

        Console.Out.WriteLineAsync(task1.Result);
        Console.Out.WriteLineAsync(task2.Result);
        Console.Out.WriteLineAsync(task3.Result);
        Console.Out.WriteLineAsync(task4.Result);
        Console.Out.WriteLineAsync(task6.Result);
        Console.Out.WriteLineAsync(task12.Result);
        Console.WriteLine(task18.Result);
    }
}
    

notes, command-line, and program output

NOTES:
64-bit Ubuntu quad core
.NET SDK 8.0.200
Host Version: 8.0.2
Commit: 1381d5ebd2
<ServerGarbageCollection>true



 Wed, 06 Mar 2024 22:09:04 GMT

MAKE:
cp knucleotide.csharpcore Program.cs
cp Include/csharpcore/tmp.csproj .
mkdir obj
cp Include/csharpcore/project.assets.json ./obj
/usr/bin/dotnet build -c Release --no-restore --no-self-contained -r linux-x64  
MSBuild version 17.9.4+90725d08d for .NET
/home/dunham/all-benchmarksgame/benchmarksgame_i53330/knucleotide/tmp/Program.cs(16,17): error CS0234: The type or namespace name 'Collections' does not exist in the namespace 'Microsoft' (are you missing an assembly reference?) [/home/dunham/all-benchmarksgame/benchmarksgame_i53330/knucleotide/tmp/tmp.csproj]
/home/dunham/all-benchmarksgame/benchmarksgame_i53330/knucleotide/tmp/Program.cs(116,31): error CS0246: The type or namespace name 'DictionarySlim<,>' could not be found (are you missing a using directive or an assembly reference?) [/home/dunham/all-benchmarksgame/benchmarksgame_i53330/knucleotide/tmp/tmp.csproj]
/home/dunham/all-benchmarksgame/benchmarksgame_i53330/knucleotide/tmp/Program.cs(123,54): error CS0246: The type or namespace name 'DictionarySlim<,>' could not be found (are you missing a using directive or an assembly reference?) [/home/dunham/all-benchmarksgame/benchmarksgame_i53330/knucleotide/tmp/tmp.csproj]
/home/dunham/all-benchmarksgame/benchmarksgame_i53330/knucleotide/tmp/Program.cs(152,36): error CS0246: The type or namespace name 'DictionarySlim<,>' could not be found (are you missing a using directive or an assembly reference?) [/home/dunham/all-benchmarksgame/benchmarksgame_i53330/knucleotide/tmp/tmp.csproj]
/home/dunham/all-benchmarksgame/benchmarksgame_i53330/knucleotide/tmp/Program.cs(172,30): error CS0246: The type or namespace name 'DictionarySlim<,>' could not be found (are you missing a using directive or an assembly reference?) [/home/dunham/all-benchmarksgame/benchmarksgame_i53330/knucleotide/tmp/tmp.csproj]

Build FAILED.

/home/dunham/all-benchmarksgame/benchmarksgame_i53330/knucleotide/tmp/Program.cs(16,17): error CS0234: The type or namespace name 'Collections' does not exist in the namespace 'Microsoft' (are you missing an assembly reference?) [/home/dunham/all-benchmarksgame/benchmarksgame_i53330/knucleotide/tmp/tmp.csproj]
/home/dunham/all-benchmarksgame/benchmarksgame_i53330/knucleotide/tmp/Program.cs(116,31): error CS0246: The type or namespace name 'DictionarySlim<,>' could not be found (are you missing a using directive or an assembly reference?) [/home/dunham/all-benchmarksgame/benchmarksgame_i53330/knucleotide/tmp/tmp.csproj]
/home/dunham/all-benchmarksgame/benchmarksgame_i53330/knucleotide/tmp/Program.cs(123,54): error CS0246: The type or namespace name 'DictionarySlim<,>' could not be found (are you missing a using directive or an assembly reference?) [/home/dunham/all-benchmarksgame/benchmarksgame_i53330/knucleotide/tmp/tmp.csproj]
/home/dunham/all-benchmarksgame/benchmarksgame_i53330/knucleotide/tmp/Program.cs(152,36): error CS0246: The type or namespace name 'DictionarySlim<,>' could not be found (are you missing a using directive or an assembly reference?) [/home/dunham/all-benchmarksgame/benchmarksgame_i53330/knucleotide/tmp/tmp.csproj]
/home/dunham/all-benchmarksgame/benchmarksgame_i53330/knucleotide/tmp/Program.cs(172,30): error CS0246: The type or namespace name 'DictionarySlim<,>' could not be found (are you missing a using directive or an assembly reference?) [/home/dunham/all-benchmarksgame/benchmarksgame_i53330/knucleotide/tmp/tmp.csproj]
    0 Warning(s)
    5 Error(s)

Time Elapsed 00:00:02.61
make: [/home/dunham/all-benchmarksgame/2000-benchmarksgame/nanobench/makefiles/u64q.programs.Makefile:160: knucleotide.csharpcore_run] Error 1 (ignored)

4.67s to complete and log all make actions

COMMAND LINE:
 ./bin/Release/net8.0/linux-x64/tmp 0 < knucleotide-input250000.txt

MAKE ERROR