The Computer Language
24.09 Benchmarks Game

regex-redux C++ g++ #5 program

source code

/* The Computer Language Benchmarks Game
   https://salsa.debian.org/benchmarksgame-team/benchmarksgame/

   contributed by idzeta
*/


#define BOOST_DISABLE_THREADS 1
#include <cassert>
#include <future>
#include <iostream>
#include <iterator>
#include <string>
#include <vector>

#include <boost/xpressive/xpressive.hpp>
#include <re2/re2.h>

using namespace re2;
using namespace boost::xpressive;
using namespace std;
namespace rc = regex_constants;

int main()
{
    const string pattern1[] = {
        "agggtaaa|tttaccct",
        "[cgt]gggtaaa|tttaccc[acg]",
        "a[act]ggtaaa|tttacc[agt]t",
        "ag[act]gtaaa|tttac[agt]ct",
        "agg[act]taaa|ttta[agt]cct",
        "aggg[acg]aaa|ttt[cgt]ccct",
        "agggt[cgt]aa|tt[acg]accct",
        "agggta[cgt]a|t[acg]taccct",
        "agggtaa[cgt]|[acg]ttaccct"
    };

    const string pattern2[][2] = {
        "tHa[Nt]", "<4>",
        "aND|caN|Ha[DS]|WaS", "<3>",
        "a[NSt]|BY", "<2>",
        "<[^>]*>", "|",
        "\\|[^|][^|]*\\|", "-"
    };

    cout.sync_with_stdio(false);

    cin.seekg(0, ios_base::end);
    size_t read_size = cin.tellg();
    assert(read_size > 0);
    cin.seekg(0, ios_base::beg);

    string str(read_size, '\0');
    cin.read(&str[0], read_size);
    size_t len1 = cin.gcount();
    assert(len1);
    if (len1 < read_size) {
        str.resize(len1);
    }

    str = regex_replace(str, sregex::compile(">[^\n]*\n|\n"s, rc::optimize), "");
    size_t len2 = str.length();

    auto handle = async(launch::async, [&, out{str}]() mutable {
        for (auto *pattern : pattern2) {
            out = regex_replace(out, sregex::compile(pattern[0], rc::optimize), pattern[1]);
        }
        return out.length();
    });

    vector<future<int>> tasks;
    for (auto &&pattern : pattern1) {
        auto f = [&, count{0}, piece{StringPiece{str}}]() mutable {
            RE2 pat{pattern};
            while (RE2::FindAndConsume(&piece, pat)) {
                ++count;
            }
            return count;
        };
        tasks.push_back(async(launch::async, f));
    }

    for (size_t i = 0; i < tasks.size(); ++i) {
        cout << pattern1[i] << " ";
        cout << tasks[i].get() << endl;
    }

    cout << "\n" << len1 << "\n" << len2 << "\n";
    cout << handle.get() << endl;
}
    

notes, command-line, and program output

NOTES:
64-bit Ubuntu quad core
Ubuntu 13.2.0-23ubuntu4


 Tue, 04 Jun 2024 02:36:44 GMT

MAKE:
/usr/bin/g++ -c -pipe -O3 -fomit-frame-pointer -march=ivybridge  -std=c++14 -I/usr/include/re2 regexredux.gpp-5.c++ -o regexredux.gpp-5.c++.o &&  \
        /usr/bin/g++ regexredux.gpp-5.c++.o -o regexredux.gpp-5.gpp_run /usr/lib/x86_64-linux-gnu/libre2.a -lpthread 
rm regexredux.gpp-5.c++

19.68s to complete and log all make actions

COMMAND LINE:
 ./regexredux.gpp-5.gpp_run 0 < regexredux-input5000000.txt

PROGRAM OUTPUT:
agggtaaa|tttaccct 356
[cgt]gggtaaa|tttaccc[acg] 1250
a[act]ggtaaa|tttacc[agt]t 4252
ag[act]gtaaa|tttac[agt]ct 2894
agg[act]taaa|ttta[agt]cct 5435
aggg[acg]aaa|ttt[cgt]ccct 1537
agggt[cgt]aa|tt[acg]accct 1431
agggta[cgt]a|t[acg]taccct 1608
agggtaa[cgt]|[acg]ttaccct 2178

50833411
50000000
27388361