The Computer Language
24.11 Benchmarks Game

regex-redux C clang #3 program

source code

/* The Computer Language Benchmarks Game
* https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
**
** regex-dna program contributed by Mike Pall
** converted from regex-dna program by Jeremy Zerfas
** omp parallel loop by Josh Goldfoot
**
*/

#define __USE_STRING_INLINES
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <pcre.h>
#include <omp.h>


typedef struct fbuf {
    char *buf;
    size_t size, len;
} fbuf_t;

static void fb_init(fbuf_t *b)
{
    b->buf = NULL;
    b->len = b->size = 0;
}

static char *fb_need(fbuf_t *b, size_t need)
{
    need += b->len;
    if (need > b->size) {
        if (b->size == 0) b->size = need;
        else while (need > b->size) b->size += b->size;
        if (!(b->buf = realloc(b->buf, b->size))) exit(1);
    }
    return b->buf + b->len;
}

#define FB_MINREAD    (3<<16)

/* Read all of a stdio stream into dst buffer. */
static size_t fb_readall(fbuf_t *dst, FILE *fp)
{
    char *dp;
    int n;
    for (dp = fb_need(dst, FB_MINREAD);
        (n = fread(dp, 1, dst->size - dst->len, fp)) > 0;
        dp = fb_need(dst, FB_MINREAD)) dst->len += n;
    if (ferror(fp)) exit(1);
    return dst->len;
}

/* Substitute pattern p with replacement r, copying from src to dst buffer. */
static size_t fb_subst(fbuf_t *dst, fbuf_t *src, const char *p, const char *r)
{
    pcre *re;
    pcre_extra *re_ex;
    const char *re_e;
    char *dp;
    int re_eo, m[3], pos, rlen, clen;
    if (!(re = pcre_compile(p, 0, &re_e, &re_eo, NULL))) exit(1);
    re_ex = pcre_study(re, PCRE_STUDY_JIT_COMPILE, &re_e);
    for (dst->len = 0, rlen = strlen(r), pos = 0;
        pcre_exec(re, re_ex, src->buf, src->len, pos, 0, m, 3) >= 0;
        pos = m[1]) {
        clen = m[0] - pos;
        dp = fb_need(dst, clen + rlen);
        dst->len += clen + rlen;
        memcpy(dp, src->buf + pos, clen);
        memcpy(dp + clen, r, rlen);
    }
    clen = src->len - pos;
    dp = fb_need(dst, clen);
    dst->len += clen;
    memcpy(dp, src->buf + pos, clen);
    return dst->len;
}

/* Count all matches with pattern p in src buffer. */
static int fb_countmatches(fbuf_t *src, const char *p)
{
    pcre *re;
    pcre_extra *re_ex;
    const char *re_e;
    int re_eo, m[3], pos, count;
    if (!(re = pcre_compile(p, 0, &re_e, &re_eo, NULL))) exit(1);
    re_ex = pcre_study(re, PCRE_STUDY_JIT_COMPILE, &re_e);
    for (count = 0, pos = 0;
        pcre_exec(re, re_ex, src->buf, src->len, pos, 0, m, 3) >= 0;
        pos = m[1]) count++;
    return count;
}

static const char *variants[] = {
    "agggtaaa|tttaccct",         "[cgt]gggtaaa|tttaccc[acg]",
    "a[act]ggtaaa|tttacc[agt]t", "ag[act]gtaaa|tttac[agt]ct",
    "agg[act]taaa|ttta[agt]cct", "aggg[acg]aaa|ttt[cgt]ccct",
    "agggt[cgt]aa|tt[acg]accct", "agggta[cgt]a|t[acg]taccct",
    "agggtaa[cgt]|[acg]ttaccct", NULL
};

static const char *subst[] = {
    // regex-redux search patterns and replacement text.
    "tHa[Nt]", "<4>",
    "aND|caN|Ha[DS]|WaS", "<3>",
    "a[NSt]|BY", "<2>",
    "<[^>]*>", "|",
    "\\|[^|][^|]*\\|", "-",
    NULL
};

int main(int argc, char **argv)
{
#ifdef _OPENMP
    int tnum = omp_get_num_procs();
    if (tnum>4) tnum = 4;
    omp_set_num_threads(tnum);
#endif

    fbuf_t seq[2];
    const char **pp;
    size_t ilen, clen, slen;
    int flip;
    fb_init(&seq[0]);
    fb_init(&seq[1]);
    ilen = fb_readall(&seq[0], stdin);
    clen = fb_subst(&seq[1], &seq[0], ">.*|\n", "");
    char *results[9];
#pragma omp parallel
#pragma omp for
    for (int i = 0; i < 9; i++) {
        results[i] = (char*)malloc(40);
        sprintf(results[i], "%s %d\n", variants[i], fb_countmatches(&seq[1], variants[i]));
    }
    for (int i = 0; i < 9; i++) {
        printf("%s", results[i]);
        free(results[i]);
    }

    for (slen = 0, flip = 1, pp = subst; *pp; pp += 2, flip = 1 - flip)
        slen = fb_subst(&seq[1 - flip], &seq[flip], *pp, pp[1]);

    printf("\n%zu\n%zu\n%zu\n", ilen, clen, slen);
    return 0;
}
    

notes, command-line, and program output

NOTES:
64-bit Ubuntu quad core
Ubuntu clang version 18.1.8 (11)


 Wed, 23 Oct 2024 01:39:03 GMT

MAKE:
/usr/bin/clang -pipe -Wall -O3 -fomit-frame-pointer -march=ivybridge -fopenmp regexredux.clang-3.c -o regexredux.clang-3.clang_run -lpcre
rm regexredux.clang-3.c

5.85s to complete and log all make actions

COMMAND LINE:
 ./regexredux.clang-3.clang_run 0 < regexredux-input5000000.txt

PROGRAM OUTPUT:
agggtaaa|tttaccct 356
[cgt]gggtaaa|tttaccc[acg] 1250
a[act]ggtaaa|tttacc[agt]t 4252
ag[act]gtaaa|tttac[agt]ct 2894
agg[act]taaa|ttta[agt]cct 5435
aggg[acg]aaa|ttt[cgt]ccct 1537
agggt[cgt]aa|tt[acg]accct 1431
agggta[cgt]a|t[acg]taccct 1608
agggtaa[cgt]|[acg]ttaccct 2178

50833411
50000000
27388361