source code
/* The Computer Language Benchmarks Game
https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
regex-dna program contributed by Alexey Zolotov
modified by Vaclav Zeman
converted from regex-dna program
*/
#include <boost/regex.hpp>
#include <cassert>
#include <iostream>
#include <cstdio>
// Pull the standard-library names that main() uses unqualified (string, cout)
// into scope.
using namespace std;

// Size constant for an I/O buffer, in bytes (purpose inferred from the name;
// confirm against its users).
std::size_t const BUFSIZE = 1024;

// Syntax option handed to the boost::regex constructors in main(): Perl syntax.
boost::regex::flag_type const re_flags = boost::regex::perl;
int main(void)
{
string str, out;
int len1, len2;
int read_size;
char *buf;
char const * pattern1[] = {
"agggtaaa|tttaccct",
"[cgt]gggtaaa|tttaccc[acg]",
"a[act]ggtaaa|tttacc[agt]t",
"ag[act]gtaaa|tttac[agt]ct",
"agg[act]taaa|ttta[agt]cct",
"aggg[acg]aaa|ttt[cgt]ccct",
"agggt[cgt]aa|tt[acg]accct",
"agggta[cgt]a|t[acg]taccct",
"agggtaa[cgt]|[acg]ttaccct"
};
const int pattern1_count = (int)(sizeof(pattern1) / sizeof(pattern1[0]));
string const pattern2[] = {
"tHa[Nt]", "<4>", "aND|caN|Ha[DS]|WaS", "<3>", "a[NSt]|BY", "<2>",
"<[^>]*>", "|", "\\|[^|][^|]*\\|", "-"
};
fseek(stdin, 0, SEEK_END);
read_size = ftell(stdin);
assert(read_size > 0);
str.resize (read_size);
rewind(stdin);
read_size = fread(&str[0], 1, read_size, stdin);
assert(read_size);
len1 = str.length();
boost::regex re1 (">[^\\n]+\\n|[\\n]", re_flags);
boost::regex_replace (str, re1, "").swap (str);
len2 = str.length();
out = str;
int counts[pattern1_count] = { 0 };
#pragma omp parallel sections
{
#pragma omp section
#pragma omp parallel for
for (int i = 0; i < pattern1_count; i++)
{
boost::regex pat(pattern1[i], re_flags);
boost::smatch m;
std::string::const_iterator start = str.begin (), end = str.end ();
while (boost::regex_search (start, end, m, pat))
{
++counts[i];
start += m.position () + m.length ();
}
}
#pragma omp section
for (int i = 0; i < (int)(sizeof(pattern2) / sizeof(string)); i += 2)
{
boost::regex re (pattern2[i], re_flags);
boost::regex_replace (out, re, pattern2[i + 1]).swap (out);
}
}
for (int i = 0; i != pattern1_count; ++i)
cout << pattern1[i] << " " << counts[i] << "\n";
cout << "\n";
cout << len1 << "\n";
cout << len2 << "\n";
cout << out.length() << endl;
}
notes, command-line, and program output
NOTES:
64-bit Ubuntu quad core
g++ (Ubuntu 9.3.0-10ubuntu2) 9.3.0
Mon, 04 May 2020 19:44:42 GMT
MAKE:
/usr/bin/g++ -c -pipe -O3 -fomit-frame-pointer -march=core2 -fopenmp regexredux.gpp-3.c++ -o regexredux.gpp-3.c++.o && \
/usr/bin/g++ regexredux.gpp-3.c++.o -o regexredux.gpp-3.gpp_run -fopenmp -lboost_regex
rm regexredux.gpp-3.c++
12.80s to complete and log all make actions
COMMAND LINE:
./regexredux.gpp-3.gpp_run 0 < regexredux-input5000000.txt
PROGRAM OUTPUT:
agggtaaa|tttaccct 356
[cgt]gggtaaa|tttaccc[acg] 1250
a[act]ggtaaa|tttacc[agt]t 4252
ag[act]gtaaa|tttac[agt]ct 2894
agg[act]taaa|ttta[agt]cct 5435
aggg[acg]aaa|ttt[cgt]ccct 1537
agggt[cgt]aa|tt[acg]accct 1431
agggta[cgt]a|t[acg]taccct 1608
agggtaa[cgt]|[acg]ttaccct 2178
50833411
50000000
27388361