The Computer Language
Benchmarks Game

regex-redux Julia #5 program

source code

# The Computer Language Benchmarks Game
# https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
#
# contributed by Daniel Jones
# fixed by David Campbell
# modified by Jarret Revels, Alex Arslan, Yichao Yu
# made multi-threaded by Adam Beckmeyer

# Julia's replace function doesn't internally use pcre2_substitute
# which is more efficient than the string-building Julia currently
# uses. This gives about a 10% speedup compared to using Base.replace.
function rereplace!(s, slen, pat::Pair{Regex,String}, dest)
    re, rep = pat
    dest_len = Ref{Csize_t}(sizeof(dest))
    Base.compile(re)
    ccall((:pcre2_substitute_8, Base.PCRE.PCRE_LIB), Cint,
          (Ptr{Cvoid}, Ptr{UInt8}, Csize_t, Csize_t, Cuint, Ptr{Cvoid}, Ptr{Cvoid},
           Ptr{UInt8}, Csize_t, Ptr{UInt8}, Ref{Csize_t}), re.regex, s, slen,
          0, re.match_options | Base.PCRE.SUBSTITUTE_GLOBAL,
          C_NULL, C_NULL, rep, sizeof(rep), dest, dest_len)
    dest_len[]
end

const variants = (
      r"agggtaaa|tttaccct",
      r"[cgt]gggtaaa|tttaccc[acg]",
      r"a[act]ggtaaa|tttacc[agt]t",
      r"ag[act]gtaaa|tttac[agt]ct",
      r"agg[act]taaa|ttta[agt]cct",
      r"aggg[acg]aaa|ttt[cgt]ccct",
      r"agggt[cgt]aa|tt[acg]accct",
      r"agggta[cgt]a|t[acg]taccct",
      r"agggtaa[cgt]|[acg]ttaccct"
)

const subs = (
    r"tHa[Nt]"            => "<4>",
    r"aND|caN|Ha[DS]|WaS" => "<3>",
    r"a[NSt]|BY"          => "<2>",
    r"<[^>]*>"            => "|",
    r"\|[^|][^|]*\|"      => "-"
)

function main(io, out)
    # Read in the file as a string and get length
    seq = read(io, String)
    l1 = sizeof(seq)

    # Remove descriptions and linefeed characters while copying seq to seq2
    seq2 = Vector{UInt8}(undef, l1)
    l2 = rereplace!(seq, sizeof(seq), r">.*\n|\n" => "", seq2)
    resize!(seq2, l2)

    # Turning seq2 back into a string will cause it to be truncated to
    # zero, so we first need to make the working copies that will be
    # needed for the replace task. These must be slightly larger than
    # seq2 since some replacements can cause the string to grow. These
    # are placed in a Ref container to prevent the compiler from
    # boxing them inside the closure spawned to run in a thread (since
    # the variables don't have to be re-bound this way).
    rseq1 = Ref(similar(seq2, trunc(Int, 1.1l2)))
    copyto!(rseq1[], seq2)
    rseq2 = Ref(similar(rseq1[]))

    # First we start the task to find the length of the string with
    # all replacements since it takes the longest.
    l3 = Ref(l2 % Csize_t)
    replace_task = Threads.@spawn for sub in subs
        # This loop moves back and forth, first doing a replacement
        # while copying rseq1 to rseq2 and then another replacement
        # while copying rseq2 back to rseq1.
        l3[] = rereplace!(rseq1[], l3[], sub, rseq2[])
        rseq1[], rseq2[] = rseq2[], rseq1[]
    end

    # Then we count the occurrences of each variant within
    # seq. Although this could be done multi-threaded with a counter
    # per thread, there's no benefit to doing so since replace_task
    # takes longer than all variant counts in serial.
    seq = String(seq2)
    for v in variants
        println(out, v.pattern, ' ', count(v, seq))
    end

    println(out, '\n', l1)
    println(out, l2)
    # Wait for the replacing task to complete before printing sequence lengths.
    wait(replace_task)
    println(out, l3[])
end

isinteractive() || main(stdin, stdout)
    

notes, command-line, and program output

NOTES:
64-bit Ubuntu quad core
julia version 1.5.0


Sun, 25 Oct 2020 04:17:26 GMT

MAKE:
printenv JULIA_NUM_THREADS
4
printenv JULIA_LLVM_ARGS


0.06s to complete and log all make actions

COMMAND LINE:
/opt/src/julia-1.5.0/bin/julia -O3 --cpu-target=ivybridge  -- regexredux.julia-5.julia 0 < regexredux-input5000000.txt

PROGRAM OUTPUT:
agggtaaa|tttaccct 356
[cgt]gggtaaa|tttaccc[acg] 1250
a[act]ggtaaa|tttacc[agt]t 4252
ag[act]gtaaa|tttac[agt]ct 2894
agg[act]taaa|ttta[agt]cct 5435
aggg[acg]aaa|ttt[cgt]ccct 1537
agggt[cgt]aa|tt[acg]accct 1431
agggta[cgt]a|t[acg]taccct 1608
agggtaa[cgt]|[acg]ttaccct 2178

50833411
50000000
27388361