The Computer Language
24.11 Benchmarks Game

regex-redux C++ g++ #4 program

source code

/* The Computer Language Benchmarks Game
   https://salsa.debian.org/benchmarksgame-team/benchmarksgame/

   contributed by Filip Sajdak
*/

#include <pcre.h>

#include <array>
#include <iostream>
#include <memory>
#include <string>
#include <string_view>

// Builds a default-constructed T, calls its reserve(size) member and
// returns the result by value.
template <typename T>
auto reserve(size_t size) {
   T prepared;
   prepared.reserve(size);
   return prepared;
}

// Reads everything that remains on `in` and returns it as one string.
//
// initial_size: capacity reserved for the result before reading starts.
// buffer_size:  size of the fixed buffer filled by each read() call.
//
// Needs <array> to be included directly: through the other standard headers
// std::array is only forward-declared, which makes the buffer definition
// below ill-formed ("invalid use of incomplete type").
template <auto initial_size = 16384, auto buffer_size = initial_size>
auto load(std::istream& in) {
   auto str = reserve<std::string>(initial_size);
   auto buffer = std::array<char, buffer_size>();

   while (in) {
      in.read(buffer.data(), buffer.size());
      // The final read is usually short and puts the stream into a failed
      // state, but gcount() still tells how many characters were stored.
      str.append(buffer.data(), static_cast<std::string::size_type>(in.gcount()));
   }

   return str;
}

// Wraps `ptr` in a std::unique_ptr that releases it through `deleter`.
//
// `Deleter` is deduced from a forwarding reference and used unchanged as the
// unique_ptr's deleter type, so passing an lvalue deleter produces a
// unique_ptr holding a reference to that deleter rather than a copy.
template<typename T, typename Deleter>
auto make_unique_with_deleter(T* ptr, Deleter&& deleter)
{
    using owner_type = std::unique_ptr<T, Deleter>;
    return owner_type(ptr, std::forward<Deleter>(deleter));
}

   auto make_regex(const std::string_view pattern) {
      char const* error;
      int offset;
      return make_unique_with_deleter(
         pcre_compile(pattern.data(), 0, &error, &offset, NULL), pcre_free);
   }
   
   // Runs pcre_study() on `regex` with PCRE_STUDY_JIT_COMPILE and hands the
   // returned study data to a unique_ptr that releases it with
   // pcre_free_study. The error message reported by pcre_study() is ignored.
   auto make_aid(const pcre* regex) {
      char const* study_error;
      auto* const study_data = pcre_study(regex, PCRE_STUDY_JIT_COMPILE, &study_error);
      return make_unique_with_deleter(study_data, pcre_free_study);
   }

// Appends to `output` a copy of `source` in which each match of `pattern`
// has been substituted by `replacement`.
//
// pattern:     regular expression, compiled and studied on every call.
// replacement: literal text written in place of each match.
// source:      text to scan; it is not modified.
// output:      receives the result; existing contents are kept and extended.
// stack:       JIT stack handed through to pcre_jit_exec().
//
// Scanning resumes at the end offset of the previous match, so a pattern
// that can match the empty string would never advance.
static void replace(const std::string_view pattern, const std::string_view replacement,
       const std::string_view source, std::string& output, pcre_jit_stack* const stack){

   const auto compiled = make_regex(pattern);
   const auto studied  = make_aid(compiled.get());

   const char* const text = source.data();
   int cursor = 0;   // offset in `source` where the next search starts
   int span[3];      // span[0] / span[1]: start / end offset of the current match

   while (pcre_jit_exec(compiled.get(), studied.get(), text, source.size(),
         cursor, 0, span, 3, stack) >= 0) {
      // Copy the unmatched text in front of the match, then the substitute.
      output.append(text + cursor, text + span[0]);
      output.append(replacement.data(), replacement.size());
      cursor = span[1];
   }

   // Whatever follows the last match is copied unchanged.
   output.append(text + cursor, text + source.size());
}


// Program entry point.
//
// Reads all of standard input, removes every match of ">.*\n|\n" from it to
// form `sequences`, then (inside one OpenMP parallel region) counts the
// matches of each count_Info pattern in `sequences` while one thread applies
// the replace_Info substitutions in order to a copy of `sequences`.
// Prints one "<pattern> <count>" line per count_Info entry, a blank line, and
// finally the sizes of the raw input, of `sequences`, and of the text left
// after all substitutions.
int main(void){
   std::ios::sync_with_stdio(false);
   
    // count_Info: patterns whose matches in `sequences` are counted.
    // replace_Info: {pattern, replacement} pairs applied one after another.
    char const * const count_Info[]={
        "agggtaaa|tttaccct",
        "[cgt]gggtaaa|tttaccc[acg]",
        "a[act]ggtaaa|tttacc[agt]t",
        "ag[act]gtaaa|tttac[agt]ct",
        "agg[act]taaa|ttta[agt]cct",
        "aggg[acg]aaa|ttt[cgt]ccct",
        "agggt[cgt]aa|tt[acg]accct",
        "agggta[cgt]a|t[acg]taccct",
        "agggtaa[cgt]|[acg]ttaccct"
      }, * const replace_Info[][2]={
        {"tHa[Nt]", "<4>"},
        {"aND|caN|Ha[DS]|WaS", "<3>"},
        {"a[NSt]|BY", "<2>"},
        {"<[^>]*>", "|"},
        {"\\|[^|][^|]*\\|", "-"}
      };

   // Input with the ">.*\n" and "\n" matches removed; filled in below.
   auto sequences = reserve<std::string>(16384);
    // Length of the text after the last substitution; written by one thread
    // inside the parallel region and read after the region has ended.
    size_t postreplace_Size = 0;

   std::string input = load(std::cin);

    #pragma omp parallel
    {
      // Declared inside the parallel region, so every thread allocates and
      // later frees a JIT stack of its own.
      auto stack = make_unique_with_deleter(
         pcre_jit_stack_alloc(16384, 16384), pcre_jit_stack_free);

        // One thread builds `sequences`. This `single` has no `nowait`, so
        // the other threads wait at its end and `sequences` is complete
        // before anything below reads it.
        #pragma omp single
        {
            replace(">.*\\n|\\n", "", input, sequences, stack.get());
        }

        // One thread runs the substitution chain; `nowait` lets the other
        // threads go straight on to the counting loop.
        #pragma omp single nowait
        {
            // Work on a copy so that `sequences` stays unchanged for the
            // counting loop that may be running at the same time.
            auto prereplace_String = sequences;
            auto postreplace_String = reserve<std::string>(sequences.capacity());
         
         for ( const auto& [pattern, replacement] : replace_Info ) {
            postreplace_String.clear();
            replace(pattern, replacement, 
               prereplace_String, postreplace_String, stack.get());
            // The output of this step becomes the input of the next one.
            std::swap(prereplace_String, postreplace_String);
            }

            // After the final swap the result is in prereplace_String.
            postreplace_Size = prereplace_String.size();
        }

        // Iterations are handed out dynamically; the `ordered` clause allows
        // the ordered region below to emit results in index order.
        #pragma omp for schedule(dynamic) ordered
        for(auto i=0u; i < std::size(count_Info); i++){

         auto regex = make_regex(count_Info[i]);
         auto aid = make_aid(regex.get());
         
         // Count the matches; each search starts where the previous match
         // ended (match[1]).
         int count = 0;
         for(int pos = 0, match[3]; pcre_jit_exec(regex.get(), aid.get(),
               sequences.data(), sequences.size(), pos, 0, match, 3,
               stack.get()) >= 0; pos=match[1]) {
            ++count;
            }

            // Printed in loop-index order regardless of which iteration
            // finished first.
            #pragma omp ordered
            printf("%s %d\n", count_Info[i], count);
        }
    }

    // Blank line, then: raw input size, `sequences` size, size after all
    // substitutions.
    printf("\n%zu\n%zu\n%zu\n", input.size(), sequences.size(), postreplace_Size);
    return 0;
}
    

notes, command-line, and program output

NOTES:
64-bit Ubuntu quad core
g++ (Ubuntu 14.2.0-4ubuntu2) 14.2.0


 Tue, 22 Oct 2024 21:13:39 GMT

MAKE:
/usr/bin/g++ -c -pipe -O3 -fomit-frame-pointer -march=ivybridge  -std=c++17 -fopenmp -flto regexredux.gpp-4.c++ -o regexredux.gpp-4.c++.o &&  \
        /usr/bin/g++ regexredux.gpp-4.c++.o -o regexredux.gpp-4.gpp_run -fopenmp -flto -lpcre 
regexredux.gpp-4.c++: In instantiation of ‘auto load(std::istream&) [with auto initial_size = 16384; auto buffer_size = 16384; std::istream = std::basic_istream<char>]’:
regexredux.gpp-4.c++:94:28:   required from here
   94 |    std::string input = load(std::cin);
      |                        ~~~~^~~~~~~~~~
regexredux.gpp-4.c++:23:26: error: invalid use of incomplete type ‘struct std::array<char, 16384>’
   23 |       auto buffer = std::array<char, buffer_size>();
      |                          ^~~~~~~~~~~~~~~~~~~~~~~~~~
In file included from /usr/include/c++/14/bits/stl_algobase.h:64,
                 from /usr/include/c++/14/string:51,
                 from /usr/include/c++/14/bits/locale_classes.h:40,
                 from /usr/include/c++/14/bits/ios_base.h:41,
                 from /usr/include/c++/14/ios:44,
                 from /usr/include/c++/14/ostream:40,
                 from /usr/include/c++/14/iostream:41,
                 from regexredux.gpp-4.c++:8:
/usr/include/c++/14/bits/stl_pair.h:99:12: note: declaration of ‘struct std::array<char, 16384>’
   99 |     struct array;
      |            ^~~~~
make: [/home/dunham/all-benchmarksgame/2000-benchmarksgame/nanobench/makefiles/u64q.programs.Makefile:54: regexredux.gpp-4.gpp_run] Error 1 (ignored)
rm regexredux.gpp-4.c++

2.45s to complete and log all make actions

COMMAND LINE:
 ./regexredux.gpp-4.gpp_run 0 < regexredux-input50000.txt

MAKE ERROR