The Computer Language
22.05 Benchmarks Game

regex-redux Julia #3 program

source code

# The Computer Language Benchmarks Game
# https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
#
# contributed by Daniel Jones
# fixed by David Campbell
# modified by Jarrett Revels, Alex Arslan, Yichao Yu
# made (slightly) multi-threaded by Adam Beckmeyer

# Use pcre_jit_match in place of the pcre_match call that Base makes.
# The gain is small (maybe 2%).
# NOTE: this intentionally replaces the `exec` method that lives in Base.PCRE.
const PCRE = Base.PCRE
function PCRE.exec(re, subject, offset, options, match_data)
    rc = ccall(
        (:pcre2_jit_match_8, PCRE.PCRE_LIB),
        Cint,
        (Ptr{Cvoid}, Ptr{UInt8}, Csize_t, Csize_t, Cuint, Ptr{Cvoid}, Ptr{Cvoid}),
        re, subject, sizeof(subject), offset, options, match_data,
        PCRE.get_local_match_context()
    )
    # Any non-negative return code counts as a match; every negative code is
    # reported to the caller as `false`.
    return rc >= 0
end

# Patterns whose matches are counted in the cleaned sequence. They are
# processed in this order, and each pattern's source text is printed next to
# its match count.
const variants = (
    r"agggtaaa|tttaccct",
    r"[cgt]gggtaaa|tttaccc[acg]",
    r"a[act]ggtaaa|tttacc[agt]t",
    r"ag[act]gtaaa|tttac[agt]ct",
    r"agg[act]taaa|ttta[agt]cct",
    r"aggg[acg]aaa|ttt[cgt]ccct",
    r"agggt[cgt]aa|tt[acg]accct",
    r"agggta[cgt]a|t[acg]taccct",
    r"agggtaa[cgt]|[acg]ttaccct",
)

# Ordered (pattern, replacement) rules. They are applied one after another,
# each to the result of the previous one, so the order matters: the last two
# rules operate on the "<n>" markers and "|" characters produced earlier.
const subs = (
    # magic patterns replaced by numbered markers
    (r"tHa[Nt]", "<4>"),
    (r"aND|caN|Ha[DS]|WaS", "<3>"),
    (r"a[NSt]|BY", "<2>"),
    # every "<...>" marker becomes a single "|"
    (r"<[^>]*>", "|"),
    # a non-empty run without "|" enclosed by two "|" becomes "-"
    (r"\|[^|][^|]*\|", "-"),
)

"""
    perf_regex_dna(io, out)

Read all of `io` as a `String`, remove every `>`-to-end-of-line span and every
newline, then write to `out`:

1. one line per entry of `variants`: the pattern text, a space, and the number
   of matches of that pattern in the cleaned sequence;
2. a blank line;
3. the `lastindex` of the raw input, of the cleaned sequence, and of the
   cleaned sequence after all `subs` replacements were applied in order
   (one value per line).

The variant counting runs in a task spawned with `Threads.@spawn` while the
calling task performs the substitutions; the summary lines are written only
after the counting task has finished. Returns the value of the final `write`
call.
"""
function perf_regex_dna(io, out)
    # The raw text and the cleaned sequence live in two single-assignment
    # variables. The spawned task below captures `seq`; a captured variable
    # that is assigned more than once is boxed by Julia, which would make it
    # untyped both inside the task and in the rest of this function.
    raw = read(io, String)
    # lastindex is faster than length and correct with ascii input. 4% speedup.
    l1 = lastindex(raw)

    seq = replace(raw, r">.*\n|\n" => "")
    l2 = lastindex(seq)

    # Count the matches of every variant concurrently with the replacements
    # below. A single task handles all variants, so the lines are printed in
    # the order of `variants`.
    w = Threads.@spawn for v in variants
        c = 0
        for _ in eachmatch(v, seq)
            c += 1
        end
        println(out, v.pattern, ' ', c)
    end

    # Apply the substitution rules sequentially, each on the previous result.
    repseq = seq
    for (pattern, replacement) in subs
        repseq = replace(repseq, pattern => replacement)
    end

    # All variant lines must be written before the summary.
    wait(w)
    return write(
        out, '\n', string(l1), '\n', string(l2), '\n', string(lastindex(repseq)), '\n'
    )
end

# Script entry point: outside an interactive session, run the benchmark from
# standard input to standard output.
if !isinteractive()
    perf_regex_dna(stdin, stdout)
end
    

notes, command-line, and program output

NOTES:
64-bit Ubuntu quad core
julia version 1.7.2


Wed, 04 May 2022 22:05:37 GMT

MAKE:
printenv JULIA_NUM_THREADS
4

0.09s to complete and log all make actions

COMMAND LINE:
/opt/src/julia-1.7.2/bin/julia -O3 --cpu-target=ivybridge --math-mode=ieee  -- regexredux.julia-3.julia 0 < regexredux-input5000000.txt

PROGRAM OUTPUT:
agggtaaa|tttaccct 356
[cgt]gggtaaa|tttaccc[acg] 1250
a[act]ggtaaa|tttacc[agt]t 4252
ag[act]gtaaa|tttac[agt]ct 2894
agg[act]taaa|ttta[agt]cct 5435
aggg[acg]aaa|ttt[cgt]ccct 1537
agggt[cgt]aa|tt[acg]accct 1431
agggta[cgt]a|t[acg]taccct 1608
agggtaa[cgt]|[acg]ttaccct 2178

50833411
50000000
27388361