The Computer Language
24.04 Benchmarks Game

regex-redux C# .NET #4 program

source code

/* The Computer Language Benchmarks Game
 * https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
 * 
 * contributed by  Josh Goldfoot
*/

using System;
using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;
using System.Text.RegularExpressions;

/// <summary>
/// regex-redux benchmark program: reads FASTA text from standard input, counts
/// the matches of each pattern supplied by <c>Variants.variantsCopy()</c>, runs
/// a fixed chain of regex substitutions, and prints the per-pattern counts
/// followed by three lengths.
/// </summary>
class regexredux
{
    /// <summary>
    /// Program entry point. Writes one "pattern count" line per variant, then a
    /// blank line, then the raw input length, the length after stripping FASTA
    /// description lines and new-lines, and the length after the substitutions.
    /// </summary>
    /// <param name="args">Command-line arguments; not used.</param>
    static void Main(string[] args)
    {
        // read FASTA sequence
        string sequence = Console.In.ReadToEnd();
        int initialLength = sequence.Length;

        // remove FASTA sequence descriptions and new-lines
        sequence = Regex.Replace(sequence, ">.*\n|\n", "");
        int codeLength = sequence.Length;

        // Each substitution consumes the previous one's output, so the chain runs
        // as a single background task while the variants are being counted.
        Task<int> substitution = Task.Run(() => SubstitutedLength(sequence));

        // Count every variant over the WHOLE sequence, one task per variant.
        // Splitting the sequence into per-core chunks would drop any match that
        // straddles a chunk boundary and make the counts depend on the core count.
        string[] variants = Variants.variantsCopy();
        Task<int>[] counts = variants
            .Select(pattern => Task.Run(() => CountMatches(sequence, pattern)))
            .ToArray();

        // Reading Result blocks until the task finishes; results are printed in
        // variant order regardless of which task completes first.
        for (int i = 0; i < variants.Length; i++)
        {
            Console.WriteLine("{0} {1}", variants[i], counts[i].Result);
        }

        Console.WriteLine("\n{0}\n{1}\n{2}",
           initialLength, codeLength, substitution.Result);
    }

    /// <summary>
    /// Applies the benchmark's five regex replacements in order and returns the
    /// length of the resulting string.
    /// </summary>
    /// <param name="sequence">The cleaned sequence to transform.</param>
    /// <returns>Length of the sequence after all replacements.</returns>
    private static int SubstitutedLength(string sequence)
    {
        // regex substitution
        string newseq = Regex.Replace(sequence, "tHa[Nt]", "<4>");
        newseq = Regex.Replace(newseq, "aND|caN|Ha[DS]|WaS", "<3>");
        newseq = Regex.Replace(newseq, "a[NSt]|BY", "<2>");
        newseq = Regex.Replace(newseq, "<[^>]*>", "|");
        newseq = Regex.Replace(newseq, "\\|[^|][^|]*\\|", "-");
        return newseq.Length;
    }

    /// <summary>
    /// Counts the successive matches of <paramref name="pattern"/> in
    /// <paramref name="sequence"/> by walking <c>Match.NextMatch()</c>.
    /// </summary>
    /// <param name="sequence">Text to search.</param>
    /// <param name="pattern">Regular expression pattern to count.</param>
    /// <returns>Number of matches found.</returns>
    private static int CountMatches(string sequence, string pattern)
    {
        // regex match
        int count = 0;
        for (Match m = Regex.Match(sequence, pattern); m.Success; m = m.NextMatch())
        {
            count++;
        }
        return count;
    }
}

/// <summary>
/// Holds the nine regex patterns counted by the regex-redux program.
/// </summary>
public class Variants
{
    // Master list of patterns; never handed out directly, only cloned.
    private static readonly string[] s_patterns =
    {
        "agggtaaa|tttaccct",
        "[cgt]gggtaaa|tttaccc[acg]",
        "a[act]ggtaaa|tttacc[agt]t",
        "ag[act]gtaaa|tttac[agt]ct",
        "agg[act]taaa|ttta[agt]cct",
        "aggg[acg]aaa|ttt[cgt]ccct",
        "agggt[cgt]aa|tt[acg]accct",
        "agggta[cgt]a|t[acg]taccct",
        "agggtaa[cgt]|[acg]ttaccct",
    };

    /// <summary>
    /// Returns a newly allocated array containing the nine patterns, so a caller
    /// may modify its copy without affecting later calls.
    /// </summary>
    /// <returns>A fresh array of the nine pattern strings, in fixed order.</returns>
    public static string[] variantsCopy()
    {
        return (string[])s_patterns.Clone();
    }
}
    

notes, command-line, and program output

NOTES:
64-bit Ubuntu quad core
.NET SDK 8.0.200
Host Version: 8.0.2
Commit: 1381d5ebd2
<ServerGarbageCollection>true



 Mon, 01 Apr 2024 21:52:09 GMT

MAKE:
cp regexredux.csharpcore-4.csharpcore Program.cs
cp Include/csharpcore/tmp.csproj .
mkdir obj
cp Include/csharpcore/project.assets.json ./obj
/usr/bin/dotnet build -c Release --no-restore --no-self-contained -r linux-x64  
MSBuild version 17.9.6+a4ecab324 for .NET
/home/dunham/all-benchmarksgame/benchmarksgame_i53330/regexredux/tmp/Program.cs(13,7): warning CS8981: The type name 'regexredux' only contains lower-cased ascii characters. Such names may become reserved for the language. [/home/dunham/all-benchmarksgame/benchmarksgame_i53330/regexredux/tmp/tmp.csproj]
  tmp -> /home/dunham/all-benchmarksgame/benchmarksgame_i53330/regexredux/tmp/bin/Release/net8.0/linux-x64/tmp.dll

Build succeeded.

/home/dunham/all-benchmarksgame/benchmarksgame_i53330/regexredux/tmp/Program.cs(13,7): warning CS8981: The type name 'regexredux' only contains lower-cased ascii characters. Such names may become reserved for the language. [/home/dunham/all-benchmarksgame/benchmarksgame_i53330/regexredux/tmp/tmp.csproj]
    1 Warning(s)
    0 Error(s)

Time Elapsed 00:00:03.48

5.40s to complete and log all make actions

COMMAND LINE:
 ./bin/Release/net8.0/linux-x64/tmp 0 < regexredux-input5000000.txt

PROGRAM OUTPUT:
agggtaaa|tttaccct 356
[cgt]gggtaaa|tttaccc[acg] 1250
a[act]ggtaaa|tttacc[agt]t 4252
ag[act]gtaaa|tttac[agt]ct 2894
agg[act]taaa|ttta[agt]cct 5435
aggg[acg]aaa|ttt[cgt]ccct 1537
agggt[cgt]aa|tt[acg]accct 1431
agggta[cgt]a|t[acg]taccct 1608
agggtaa[cgt]|[acg]ttaccct 2178

50833411
50000000
27388361