source code
/* The Computer Language Benchmarks Game
https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
regex-dna program contributed by Alexey Zolotov
without openmp, without assert
converted from regex-dna program
*/
#include <re2.h>
#include <iostream>
#include <stdio.h>
using namespace re2;
using namespace std;
#define BUFSIZE 1024
int main(void)
{
string str, out;
int len1, len2;
int read_size;
char *buf;
string pattern1[] = {
"agggtaaa|tttaccct",
"[cgt]gggtaaa|tttaccc[acg]",
"a[act]ggtaaa|tttacc[agt]t",
"ag[act]gtaaa|tttac[agt]ct",
"agg[act]taaa|ttta[agt]cct",
"aggg[acg]aaa|ttt[cgt]ccct",
"agggt[cgt]aa|tt[acg]accct",
"agggta[cgt]a|t[acg]taccct",
"agggtaa[cgt]|[acg]ttaccct"
};
string pattern2[] = {
"tHa[Nt]", "<4>", "aND|caN|Ha[DS]|WaS", "<3>", "a[NSt]|BY", "<2>",
"<[^>]*>", "|", "\\|[^|][^|]*\\|", "-"
};
fseek(stdin, 0, SEEK_END);
read_size = ftell(stdin);
buf = new char[read_size];
rewind(stdin);
read_size = fread(buf, 1, read_size, stdin);
str.append(buf, read_size);
delete [] buf;
len1 = str.length();
RE2::GlobalReplace(&str, ">.*\n|\n", "");
len2 = str.length();
out = str;
{
for (int i = 0; i < (int)(sizeof(pattern1) / sizeof(string)); i++) {
int count = 0;
RE2 pat(pattern1[i]);
StringPiece piece = str;
while (RE2::FindAndConsume(&piece, pat)) {
count++;
}
cout << pattern1[i] << " " << count << endl;
}
for (int i = 0; i < (int)(sizeof(pattern2) / sizeof(string)); i += 2) {
RE2::GlobalReplace(&out, pattern2[i], pattern2[i + 1]);
}
}
cout << endl;
cout << len1 << endl;
cout << len2 << endl;
cout << out.length() << endl;
}
notes, command-line, and program output
NOTES:
64-bit Ubuntu quad core
g++ (Ubuntu 9.3.0-10ubuntu2) 9.3.0
Mon, 04 May 2020 19:49:10 GMT
MAKE:
/usr/bin/g++ -c -pipe -O3 -fomit-frame-pointer -march=core2 -std=c++14 -I/usr/include/re2 regexredux.c++ -o regexredux.c++.o && \
/usr/bin/g++ regexredux.c++.o -o regexredux.gpp_run /usr/lib/x86_64-linux-gnu/libre2.a -lpthread
rm regexredux.c++
6.27s to complete and log all make actions
COMMAND LINE:
./regexredux.gpp_run 0 < regexredux-input5000000.txt
PROGRAM OUTPUT:
agggtaaa|tttaccct 356
[cgt]gggtaaa|tttaccc[acg] 1250
a[act]ggtaaa|tttacc[agt]t 4252
ag[act]gtaaa|tttac[agt]ct 2894
agg[act]taaa|ttta[agt]cct 5435
aggg[acg]aaa|ttt[cgt]ccct 1537
agggt[cgt]aa|tt[acg]accct 1431
agggta[cgt]a|t[acg]taccct 1608
agggtaa[cgt]|[acg]ttaccct 2178
50833411
50000000
27388361