source code
/* The Computer Language Benchmarks Game
* https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
*
* regex-dna program contributed by Isaac Gouy
*/
using System;
using System.Text.RegularExpressions;
class regexredux
{
    /// <summary>
    /// Reads a FASTA document from standard input, strips description lines and
    /// line breaks, prints how often each 8-mer variant pattern matches, applies
    /// a fixed series of regex substitutions, and finally prints three lengths:
    /// the raw input, the stripped sequence, and the sequence after substitution.
    /// </summary>
    /// <param name="args">Command-line arguments; not used.</param>
    static void Main(string[] args)
    {
        // Pull the complete input into memory in one go.
        string rawInput = Console.In.ReadToEnd();
        int rawLength = rawInput.Length;

        // Drop every ">..." description line (including its line feed) and every
        // remaining line feed, leaving only the sequence characters.
        string dna = new Regex(">.*\n|\n", RegexOptions.Compiled).Replace(rawInput, "");
        int strippedLength = dna.Length;

        // Patterns to count; each is printed verbatim next to its match count.
        string[] patterns =
        {
            "agggtaaa|tttaccct",
            "[cgt]gggtaaa|tttaccc[acg]",
            "a[act]ggtaaa|tttacc[agt]t",
            "ag[act]gtaaa|tttac[agt]ct",
            "agg[act]taaa|ttta[agt]cct",
            "aggg[acg]aaa|ttt[cgt]ccct",
            "agggt[cgt]aa|tt[acg]accct",
            "agggta[cgt]a|t[acg]taccct",
            "agggtaa[cgt]|[acg]ttaccct",
        };

        foreach (string pattern in patterns)
        {
            int hits = CountMatches(new Regex(pattern, RegexOptions.Compiled), dna);
            Console.WriteLine("{0} {1}", pattern, hits);
        }

        // Substitution rules, applied strictly in this order: each rule operates
        // on the text produced by the previous one.
        IUB[] rules =
        {
            new IUB("tHa[Nt]", "<4>"),
            new IUB("aND|caN|Ha[DS]|WaS", "<3>"),
            new IUB("a[NSt]|BY", "<2>"),
            new IUB("<[^>]*>", "|"),
            new IUB("\\|[^|][^|]*\\|", "-"),
        };

        foreach (IUB rule in rules)
        {
            Regex rewriter = new Regex(rule.code, RegexOptions.Compiled);
            dna = rewriter.Replace(dna, rule.alternatives);
        }

        // Blank line, then the three lengths, one per line.
        Console.WriteLine("\n{0}\n{1}\n{2}",
            rawLength, strippedLength, dna.Length);
    }

    /// <summary>
    /// Counts the successive matches of <paramref name="pattern"/> in
    /// <paramref name="text"/> by walking the Match/NextMatch chain.
    /// </summary>
    /// <param name="pattern">Regex to search with.</param>
    /// <param name="text">Text to search in.</param>
    /// <returns>Number of matches found.</returns>
    static int CountMatches(Regex pattern, string text)
    {
        int total = 0;
        Match current = pattern.Match(text);
        while (current.Success)
        {
            total++;
            current = current.NextMatch();
        }
        return total;
    }

    /// <summary>
    /// One substitution rule: a regex pattern and the text that replaces each match.
    /// </summary>
    struct IUB
    {
        /// <summary>Regex pattern to search for.</summary>
        public string code;

        /// <summary>Replacement text for every match of <see cref="code"/>.</summary>
        public string alternatives;

        /// <summary>Creates a rule from a pattern and its replacement text.</summary>
        /// <param name="code">Regex pattern to search for.</param>
        /// <param name="alternatives">Replacement text for each match.</param>
        public IUB(string code, string alternatives)
        {
            this.code = code;
            this.alternatives = alternatives;
        }
    }
}
notes, command-line, and program output
NOTES:
64-bit Ubuntu quad core
.NET SDK 8.0.301
Host Version: 8.0.6
Commit: 3b8b000a0e
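<OutputType>Exe</OutputType>
<TargetFramework>net8.0</TargetFramework>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
<AllowUnsafeBlocks>true</AllowUnsafeBlocks>
<ServerGarbageCollection>true</ServerGarbageCollection>
<ConcurrentGarbageCollection>true</ConcurrentGarbageCollection>
<PublishAot>false</PublishAot>
<OptimizationPreference>Speed</OptimizationPreference>
<IlcInstructionSet>native</IlcInstructionSet>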
Wed, 29 May 2024 21:44:32 GMT
MAKE:
cp regexredux.csharpcore Program.cs
cp Include/csharpcore/program.csproj .
mkdir obj
cp Include/csharpcore/project.assets.json ./obj
~/dotnet/dotnet build -c Release --use-current-runtime
Determining projects to restore...
Restored /home/dunham/all-benchmarksgame/benchmarksgame_i53330/regexredux/tmp/program.csproj (in 926 ms).
/home/dunham/all-benchmarksgame/benchmarksgame_i53330/regexredux/tmp/Program.cs(10,7): warning CS8981: The type name 'regexredux' only contains lower-cased ascii characters. Such names may become reserved for the language. [/home/dunham/all-benchmarksgame/benchmarksgame_i53330/regexredux/tmp/program.csproj]
program -> /home/dunham/all-benchmarksgame/benchmarksgame_i53330/regexredux/tmp/bin/Release/net8.0/linux-x64/program.dll
Build succeeded.
/home/dunham/all-benchmarksgame/benchmarksgame_i53330/regexredux/tmp/Program.cs(10,7): warning CS8981: The type name 'regexredux' only contains lower-cased ascii characters. Such names may become reserved for the language. [/home/dunham/all-benchmarksgame/benchmarksgame_i53330/regexredux/tmp/program.csproj]
1 Warning(s)
0 Error(s)
Time Elapsed 00:00:05.30
7.04s to complete and log all make actions
COMMAND LINE:
./bin/Release/net8.0/linux-x64/program 0 < regexredux-input5000000.txt
PROGRAM OUTPUT:
agggtaaa|tttaccct 356
[cgt]gggtaaa|tttaccc[acg] 1250
a[act]ggtaaa|tttacc[agt]t 4252
ag[act]gtaaa|tttac[agt]ct 2894
agg[act]taaa|ttta[agt]cct 5435
aggg[acg]aaa|ttt[cgt]ccct 1537
agggt[cgt]aa|tt[acg]accct 1431
agggta[cgt]a|t[acg]taccct 1608
agggtaa[cgt]|[acg]ttaccct 2178
50833411
50000000
27388361