source code
/* The Computer Language Benchmarks Game
* https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
*
* contributed by Josh Goldfoot
*/
using System;
using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;
using System.Text.RegularExpressions;
/// <summary>
/// regex-redux benchmark: reads a FASTA file from standard input, strips the
/// description lines and new-lines, counts the matches of each variant pattern
/// and reports the sequence length after a fixed series of substitutions.
/// </summary>
class regexredux
{
    /// <summary>
    /// Program entry point. Writes one "pattern count" line per variant, then a
    /// blank line followed by the raw input length, the cleaned sequence length
    /// and the length after all substitutions.
    /// </summary>
    /// <param name="args">Command-line arguments; not used.</param>
    static void Main(string[] args)
    {
        // read FASTA sequence
        string rawInput = Console.In.ReadToEnd();
        int initialLength = rawInput.Length;

        // remove FASTA sequence descriptions and new-lines
        string sequence = Regex.Replace(rawInput, ">.*\n|\n", "");
        int codeLength = sequence.Length;

        // The substitutions are independent of the counting below, so they run
        // on a separate task while the variants are being counted.
        Task<int> substitution = Task.Run(() =>
        {
            // regex substitution
            string newseq = Regex.Replace(sequence, "tHa[Nt]", "<4>");
            newseq = Regex.Replace(newseq, "aND|caN|Ha[DS]|WaS", "<3>");
            newseq = Regex.Replace(newseq, "a[NSt]|BY", "<2>");
            newseq = Regex.Replace(newseq, "<[^>]*>", "|");
            newseq = Regex.Replace(newseq, "\\|[^|][^|]*\\|", "-");
            return newseq.Length;
        });

        // Count each variant over the WHOLE sequence, one variant per parallel
        // work item. Splitting the sequence itself into independent pieces would
        // lose every match that straddles a split point, making the totals
        // depend on the number of processors.
        string[] variants = Variants.variantsCopy();
        int[] counts = new int[variants.Length];
        Parallel.For(0, variants.Length, i =>
        {
            // Each iteration writes only its own slot, so no locking is needed.
            counts[i] = CountMatches(sequence, variants[i]);
        });

        for (int i = 0; i < variants.Length; i++)
        {
            Console.WriteLine("{0} {1}", variants[i], counts[i]);
        }

        Console.WriteLine("\n{0}\n{1}\n{2}",
            initialLength, codeLength, substitution.Result);
    }

    /// <summary>
    /// Counts the successive (non-overlapping) matches of a pattern in a string.
    /// </summary>
    /// <param name="input">Text to search.</param>
    /// <param name="pattern">Regular expression to look for.</param>
    /// <returns>Number of matches found by repeated <c>NextMatch</c> calls.</returns>
    private static int CountMatches(string input, string pattern)
    {
        int count = 0;
        for (Match m = Regex.Match(input, pattern); m.Success; m = m.NextMatch())
        {
            count++;
        }
        return count;
    }
}
/// <summary>
/// Holds the nine variant patterns that the benchmark counts. Each entry is a
/// regular expression made of two alternatives separated by <c>|</c>.
/// </summary>
public class Variants
{
    // Master list. It is never handed out directly, so a caller that modifies
    // the array it received cannot affect what later callers get.
    private static readonly string[] s_patterns =
    {
        "agggtaaa|tttaccct",
        "[cgt]gggtaaa|tttaccc[acg]",
        "a[act]ggtaaa|tttacc[agt]t",
        "ag[act]gtaaa|tttac[agt]ct",
        "agg[act]taaa|ttta[agt]cct",
        "aggg[acg]aaa|ttt[cgt]ccct",
        "agggt[cgt]aa|tt[acg]accct",
        "agggta[cgt]a|t[acg]taccct",
        "agggtaa[cgt]|[acg]ttaccct",
    };

    /// <summary>
    /// Returns a newly allocated array containing the nine variant patterns in
    /// their fixed order.
    /// </summary>
    /// <returns>A fresh array on every call.</returns>
    public static string[] variantsCopy()
    {
        return (string[])s_patterns.Clone();
    }
}
notes, command-line, and program output
NOTES:
64-bit Ubuntu quad core
.NET SDK 8.0.301
Host Version: 8.0.6
Commit: 3b8b000a0e
<OutputType>Exe
<TargetFramework>net8.0
<ImplicitUsings>enable
<Nullable>enable
<AllowUnsafeBlocks>true
<ServerGarbageCollection>true
<ConcurrentGarbageCollection>true
<PublishAot>false
<OptimizationPreference>Speed
<IlcInstructionSet>native
Wed, 29 May 2024 21:45:12 GMT
MAKE:
cp regexredux.csharpcore-4.csharpcore Program.cs
cp Include/csharpcore/program.csproj .
mkdir obj
cp Include/csharpcore/project.assets.json ./obj
~/dotnet/dotnet build -c Release --use-current-runtime
Determining projects to restore...
Restored /home/dunham/all-benchmarksgame/benchmarksgame_i53330/regexredux/tmp/program.csproj (in 755 ms).
/home/dunham/all-benchmarksgame/benchmarksgame_i53330/regexredux/tmp/Program.cs(13,7): warning CS8981: The type name 'regexredux' only contains lower-cased ascii characters. Such names may become reserved for the language. [/home/dunham/all-benchmarksgame/benchmarksgame_i53330/regexredux/tmp/program.csproj]
program -> /home/dunham/all-benchmarksgame/benchmarksgame_i53330/regexredux/tmp/bin/Release/net8.0/linux-x64/program.dll
Build succeeded.
/home/dunham/all-benchmarksgame/benchmarksgame_i53330/regexredux/tmp/Program.cs(13,7): warning CS8981: The type name 'regexredux' only contains lower-cased ascii characters. Such names may become reserved for the language. [/home/dunham/all-benchmarksgame/benchmarksgame_i53330/regexredux/tmp/program.csproj]
1 Warning(s)
0 Error(s)
Time Elapsed 00:00:05.22
6.93s to complete and log all make actions
COMMAND LINE:
./bin/Release/net8.0/linux-x64/program 0 < regexredux-input5000000.txt
PROGRAM OUTPUT:
agggtaaa|tttaccct 356
[cgt]gggtaaa|tttaccc[acg] 1250
a[act]ggtaaa|tttacc[agt]t 4252
ag[act]gtaaa|tttac[agt]ct 2894
agg[act]taaa|ttta[agt]cct 5435
aggg[acg]aaa|ttt[cgt]ccct 1537
agggt[cgt]aa|tt[acg]accct 1431
agggta[cgt]a|t[acg]taccct 1608
agggtaa[cgt]|[acg]ttaccct 2178
50833411
50000000
27388361