The Computer Language Benchmarks Game 24.11

regex-redux Rust #3 program

source code

// The Computer Language Benchmarks Game
// https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
//
// contributed by Francois Green

extern crate rayon;
extern crate regex;

use rayon::prelude::*;
use std::collections::HashMap;
use std::io::{self, Read};
use std::thread;

// Compiles the given pattern into a byte-oriented regex.
// Panics if the pattern is not a valid regular expression.
macro_rules! regex {
    ($pattern:expr) => {
        ::regex::bytes::Regex::new($pattern).unwrap()
    };
}

fn main() {
    let mut input = Vec::with_capacity(51 * (1 << 20));

    io::stdin().read_to_end(&mut input).unwrap();

    let sequence = regex!(">[^\n]*\n|\n").replace_all(&input, &b""[..]).into_owned();

    let sequence_c = sequence.clone();

    let result = thread::spawn(move|| {
        vec![
            ("tHa[Nt]", &b"<4>"[..]),
            ("aND|caN|Ha[DS]|WaS", &b"<3>"[..]),
            ("a[NSt]|BY", &b"<2>"[..]),
            ("<[^>]*>", &b"|"[..]),
            ("\\|[^|][^|]*\\|", &b"-"[..]),
        ].iter()
         .fold(sequence_c, |mut buffer, &(pattern, replacement)| {
             regex!(pattern).replace_all(&mut buffer, replacement).into_owned()
         })
    });

    let variants = vec![
        "agggtaaa|tttaccct",
        "[cgt]gggtaaa|tttaccc[acg]",
        "a[act]ggtaaa|tttacc[agt]t",
        "ag[act]gtaaa|tttac[agt]ct",
        "agg[act]taaa|ttta[agt]cct",
        "aggg[acg]aaa|ttt[cgt]ccct",
        "agggt[cgt]aa|tt[acg]accct",
        "agggta[cgt]a|t[acg]taccct",
        "agggtaa[cgt]|[acg]ttaccct",
    ];
    
    let results: HashMap<&str, usize> = variants.par_iter()
        .map(|v| (&**v, regex!(v).find_iter(&sequence).count()))
        .collect();

    for v in variants.iter() {
        println!("{} {}", v, results.get::<str>(v).unwrap());
    }

    println!("\n{}\n{}\n{:?}", input.len(), sequence.len(), result.join().unwrap().len());
}
    

notes, command-line, and program output

NOTES:
64-bit Ubuntu quad core
rustc 1.83.0 (90b35a623 2024-11-26)
LLVM version: 19.1.1


 Sat, 14 Dec 2024 20:24:08 GMT

MAKE:
/opt/src/rust-1.83.0/bin/rustc -C opt-level=3 -C target-cpu=ivybridge -C codegen-units=1 -L /opt/src/rust-libs --extern libc=/opt/src/rust-libs/liblibc-f6fe57662dfc7b71.rlib regexredux.rs -o regexredux.rust-3.rust_run

13.48s to complete and log all make actions

COMMAND LINE:
 ./regexredux.rust-3.rust_run 0 < regexredux-input5000000.txt

PROGRAM OUTPUT:
agggtaaa|tttaccct 356
[cgt]gggtaaa|tttaccc[acg] 1250
a[act]ggtaaa|tttacc[agt]t 4252
ag[act]gtaaa|tttac[agt]ct 2894
agg[act]taaa|ttta[agt]cct 5435
aggg[acg]aaa|ttt[cgt]ccct 1537
agggt[cgt]aa|tt[acg]accct 1431
agggta[cgt]a|t[acg]taccct 1608
agggtaa[cgt]|[acg]ttaccct 2178

50833411
50000000
27388361