source code
// The Computer Language Benchmarks Game
// https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
//
// contributed by Tom Kaitchuck
// contributed by Andre Bogus
extern crate crossbeam_utils;
extern crate regex;
use crossbeam_utils::thread::scope;
use regex::bytes::Regex;
use std::{
borrow::Cow,
io::{self, Read},
};
/// Compiles `s` into a byte-oriented [`Regex`].
///
/// # Panics
///
/// Panics if `s` is not a valid regular expression.
fn regex(s: &str) -> Regex {
    let compiled = Regex::new(s);
    compiled.unwrap()
}
/// Counts how often each pattern in the variant table matches `sequence`.
///
/// Returns a report with one line per pattern, formatted as
/// `<pattern> <match count>` and terminated by a newline, in table order.
fn count_reverse_complements(sequence: &[u8]) -> String {
    // Each entry is an alternation of a motif (possibly with one position
    // widened to a character class) and its reverse complement.
    static VARIANTS: &[&str] = &[
        "agggtaaa|tttaccct",
        "[cgt]gggtaaa|tttaccc[acg]",
        "a[act]ggtaaa|tttacc[agt]t",
        "ag[act]gtaaa|tttac[agt]ct",
        "agg[act]taaa|ttta[agt]cct",
        "aggg[acg]aaa|ttt[cgt]ccct",
        "agggt[cgt]aa|tt[acg]accct",
        "agggta[cgt]a|t[acg]taccct",
        "agggtaa[cgt]|[acg]ttaccct",
    ];

    let mut report = String::new();
    for &pattern in VARIANTS {
        // Every pattern is compiled exactly once, right before it is used.
        let hits = regex(pattern).find_iter(sequence).count();
        report.push_str(&format!("{} {}\n", pattern, hits));
    }
    report
}
/// Applies the substitution table to `sequence`, one rule after another, and
/// returns the length in bytes of the final result.
///
/// Each rule replaces every match of its pattern in the output of the
/// previous rule, so the order of the table matters.
fn find_replaced_sequence_length(sequence: &[u8]) -> usize {
    // (pattern, replacement) pairs, applied from top to bottom.
    static SUBSTS: &[(&str, &[u8])] = &[
        ("tHa[Nt]", b"<4>"),
        ("aND|caN|Ha[DS]|WaS", b"<3>"),
        ("a[NSt]|BY", b"<2>"),
        ("<[^>]*>", b"|"),
        ("\\|[^|][^|]*\\|", b"-"),
    ];

    // Start from a borrow of the input so nothing is copied before the first
    // rule runs; every later step owns the buffer produced by the rule.
    let rewritten = SUBSTS.iter().fold(
        Cow::Borrowed(sequence),
        |current, &(pattern, replacement)| {
            let next = regex(pattern)
                .replace_all(&current, replacement)
                .into_owned();
            Cow::Owned(next)
        },
    );
    rewritten.len()
}
/// Reads the whole of standard input, strips header lines and newlines, and
/// prints the pattern counts followed by three lengths: the raw input, the
/// stripped sequence, and the sequence after all substitutions.
fn main() {
    // Pre-size the buffer (51 MiB) to avoid repeated growth while reading.
    let mut raw = Vec::with_capacity(51 * (1 << 20));
    io::stdin().read_to_end(&mut raw).unwrap();

    // Remove every line that starts with '>' and every newline character.
    let stripper = regex(">[^\n]*\n|\n");
    let sequence = stripper.replace_all(&raw, &b""[..]).into_owned();

    scope(|s| {
        // The substitution pass runs on a spawned thread while the pattern
        // counting runs on this one.
        let worker = s.spawn(|_| find_replaced_sequence_length(&sequence));
        let counts = count_reverse_complements(&sequence);
        let replaced_len = worker.join().unwrap();
        println!(
            "{}\n{}\n{}\n{}",
            counts,
            raw.len(),
            sequence.len(),
            replaced_len
        );
    })
    .unwrap();
}
notes, command-line, and program output
NOTES:
64-bit Ubuntu quad core
rustc 1.80.1 (3f5fd8dd4 2024-08-06)
LLVM version: 18.1.7
Thu, 05 Sep 2024 22:52:30 GMT
MAKE:
/opt/src/rust-1.80.1/bin/rustc -C opt-level=3 -C target-cpu=ivybridge -C codegen-units=1 -L /opt/src/rust-libs --extern crossbeam_utils=/opt/src/rust-libs/libcrossbeam_utils-5fe1a42cb8431226.rlib regexredux.rs -o regexredux.rust-5.rust_run
9.86s to complete and log all make actions
COMMAND LINE:
./regexredux.rust-5.rust_run 0 < regexredux-input5000000.txt
PROGRAM OUTPUT:
agggtaaa|tttaccct 356
[cgt]gggtaaa|tttaccc[acg] 1250
a[act]ggtaaa|tttacc[agt]t 4252
ag[act]gtaaa|tttac[agt]ct 2894
agg[act]taaa|ttta[agt]cct 5435
aggg[acg]aaa|ttt[cgt]ccct 1537
agggt[cgt]aa|tt[acg]accct 1431
agggta[cgt]a|t[acg]taccct 1608
agggtaa[cgt]|[acg]ttaccct 2178
50833411
50000000
27388361