source code
// The Computer Language Benchmarks Game
// https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
//
// contributed by Francois Green
extern crate rayon;
extern crate regex;
use rayon::prelude::*;
use std::collections::HashMap;
use std::io::{self, Read};
use std::thread;
/// Builds a `regex::bytes::Regex` from the given pattern expression.
///
/// The compilation result is unwrapped, so an invalid pattern panics at the
/// point where the macro is used.
macro_rules! regex {
    ($pattern:expr) => {
        ::regex::bytes::Regex::new($pattern).unwrap()
    };
}
fn main() {
let mut input = Vec::with_capacity(51 * (1 << 20));
io::stdin().read_to_end(&mut input).unwrap();
let sequence = regex!(">[^\n]*\n|\n").replace_all(&input, &b""[..]).into_owned();
let sequence_c = sequence.clone();
let result = thread::spawn(move|| {
vec![
("tHa[Nt]", &b"<4>"[..]),
("aND|caN|Ha[DS]|WaS", &b"<3>"[..]),
("a[NSt]|BY", &b"<2>"[..]),
("<[^>]*>", &b"|"[..]),
("\\|[^|][^|]*\\|", &b"-"[..]),
].iter()
.fold(sequence_c, |mut buffer, &(pattern, replacement)| {
regex!(pattern).replace_all(&mut buffer, replacement).into_owned()
})
});
let variants = vec![
"agggtaaa|tttaccct",
"[cgt]gggtaaa|tttaccc[acg]",
"a[act]ggtaaa|tttacc[agt]t",
"ag[act]gtaaa|tttac[agt]ct",
"agg[act]taaa|ttta[agt]cct",
"aggg[acg]aaa|ttt[cgt]ccct",
"agggt[cgt]aa|tt[acg]accct",
"agggta[cgt]a|t[acg]taccct",
"agggtaa[cgt]|[acg]ttaccct",
];
let results: HashMap<&str, usize> = variants.par_iter()
.map(|v| (&**v, regex!(v).find_iter(&sequence).count()))
.collect();
for v in variants.iter() {
println!("{} {}", v, results.get::<str>(v).unwrap());
}
println!("\n{}\n{}\n{:?}", input.len(), sequence.len(), result.join().unwrap().len());
}
notes, command-line, and program output
NOTES:
64-bit Ubuntu quad core
rustc 1.44.0 (49cae5576 2020-06-01)
LLVM version: 9.0
Fri, 05 Jun 2020 20:54:42 GMT
MAKE:
/opt/src/rust-1.44.0/bin/rustc -C opt-level=3 -C target-cpu=core2 -C lto -C codegen-units=1 -L /opt/src/rust-libs --extern crossbeam_utils=/opt/src/rust-libs/libcrossbeam_utils-5d7d3a493e735d64.rlib regexredux.rs -o regexredux.rust-3.rust_run
37.56s to complete and log all make actions
COMMAND LINE:
./regexredux.rust-3.rust_run 0 < regexredux-input5000000.txt
PROGRAM OUTPUT:
agggtaaa|tttaccct 356
[cgt]gggtaaa|tttaccc[acg] 1250
a[act]ggtaaa|tttacc[agt]t 4252
ag[act]gtaaa|tttac[agt]ct 2894
agg[act]taaa|ttta[agt]cct 5435
aggg[acg]aaa|ttt[cgt]ccct 1537
agggt[cgt]aa|tt[acg]accct 1431
agggta[cgt]a|t[acg]taccct 1608
agggtaa[cgt]|[acg]ttaccct 2178
50833411
50000000
27388361