The Computer Language
Benchmarks Game

regex-redux Rust #6 program

source code

// The Computer Language Benchmarks Game
// https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
//
// contributed by Tom Kaitchuck

extern crate rayon;
extern crate regex;

use std::io::{self, Read};
use rayon::prelude::*;
use std::mem;

/// A compiled byte-oriented regular expression bundled with the pattern text
/// it was built from, so the pattern can be printed next to its results.
struct Regex {
    /// The pattern source text, exactly as passed to `Regex::new`.
    string: &'static str,
    /// The matcher compiled from `string`; operates on `&[u8]` input.
    regex: ::regex::bytes::Regex,
}

impl Regex {
    /// Compiles `string` as a byte-oriented regular expression and keeps the
    /// pattern text alongside the compiled matcher.
    ///
    /// # Panics
    ///
    /// Panics if `string` is not a valid pattern.
    fn new(string: &'static str) -> Regex {
        let regex = ::regex::bytes::Regex::new(string).unwrap();
        Regex { string, regex }
    }

    /// Appends a copy of `text` to `out` in which every match of this pattern
    /// has been replaced by `rep`.
    ///
    /// `out` is only appended to; it is the caller's job to clear it first if
    /// a fresh result is wanted.
    fn replace_all<'t>(&self, text: &'t [u8], rep: &[u8], out: &mut Vec<u8>) {
        // Walk the matches, threading through the offset up to which `text`
        // has already been copied; each step emits the gap before the match
        // followed by the replacement.
        let tail_start = self.regex.find_iter(text).fold(0, |copied_up_to, found| {
            out.extend_from_slice(&text[copied_up_to..found.start()]);
            out.extend_from_slice(rep);
            found.end()
        });
        // Whatever follows the last match (or all of `text` if there was no
        // match) is copied through unchanged.
        out.extend_from_slice(&text[tail_start..]);
    }
}

fn count_reverse_complements(sequence : &Vec<u8>) -> Vec<String> {
    // Search for occurrences of the following patterns:
    let variants = vec![
            Regex::new("agggtaaa|tttaccct"),
            Regex::new("[cgt]gggtaaa|tttaccc[acg]"),
            Regex::new("a[act]ggtaaa|tttacc[agt]t"),
            Regex::new("ag[act]gtaaa|tttac[agt]ct"),
            Regex::new("agg[act]taaa|ttta[agt]cct"),
            Regex::new("aggg[acg]aaa|ttt[cgt]ccct"),
            Regex::new("agggt[cgt]aa|tt[acg]accct"),
            Regex::new("agggta[cgt]a|t[acg]taccct"),
            Regex::new("agggtaa[cgt]|[acg]ttaccct"),
    ];
    variants.par_iter().map(|ref variant|
        format!("{} {}",
                variant.string,
                variant.regex.find_iter(sequence).count())).collect()
}

/// Applies the fixed list of substitutions to `sequence`, one after another,
/// and returns the length in bytes of the final result.
///
/// `scratch_buff` is used purely as reusable storage: the two buffers are
/// ping-ponged so that each pass reads from one and writes into the other.
/// Any bytes already in `scratch_buff` are discarded, so the caller does not
/// need to clear it beforehand.
fn find_replaced_sequence_length(sequence: Vec<u8>, scratch_buff: Vec<u8>) -> usize {
    // Replace the following patterns, one at a time:
    let substs = vec![
        (Regex::new("tHa[Nt]"), &b"<4>"[..]),
        (Regex::new("aND|caN|Ha[DS]|WaS"), &b"<3>"[..]),
        (Regex::new("a[NSt]|BY"), &b"<2>"[..]),
        (Regex::new("<[^>]*>"), &b"|"[..]),
        (Regex::new("\\|[^|][^|]*\\|"), &b"-"[..]),
    ];

    let mut current = sequence;
    let mut next = scratch_buff;
    // Perform the replacements in sequence:
    for (re, replacement) in substs {
        // `replace_all` only appends, so empty the destination first. Doing it
        // here (rather than after the swap) also covers a caller-supplied
        // scratch buffer that still holds stale data.
        next.clear();
        re.replace_all(&current, replacement, &mut next);
        // The freshly written buffer becomes the input of the next pass.
        mem::swap(&mut current, &mut next);
    }
    current.len()
}

/// Reads the whole input from stdin, strips header lines and newlines, then
/// runs the substitution pass and the variant counting concurrently and
/// prints the counts followed by the three lengths (raw input, cleaned
/// sequence, sequence after substitutions).
fn main() {
    // Slurp stdin into a buffer pre-sized to 51 MiB.
    let mut raw = Vec::with_capacity(51 * (1 << 20));
    io::stdin().read_to_end(&mut raw).unwrap();
    let raw_len = raw.len();

    // Drop every `>`-prefixed header line and every remaining newline.
    let mut sequence: Vec<u8> = Vec::with_capacity(raw_len);
    Regex::new(">[^\n]*\n|\n")
        .replace_all(&raw, &b""[..], &mut sequence);
    let cleaned_len = sequence.len();

    // The raw input is no longer needed; empty it and hand its allocation to
    // the substitution pass as scratch space.
    raw.clear();
    let (replaced_len, count_lines) = rayon::join(
        || find_replaced_sequence_length(sequence.clone(), raw),
        || count_reverse_complements(&sequence),
    );

    for line in count_lines {
        println!("{}", line);
    }
    println!("\n{}\n{}\n{:?}", raw_len, cleaned_len, replaced_len);
}
    
    

notes, command-line, and program output

NOTES:
64-bit Ubuntu quad core
rustc 1.45.0 (5c1f21c3b 2020-07-13)
LLVM version: 9.0


Fri, 17 Jul 2020 23:58:16 GMT

MAKE:
/opt/src/rust-1.45.0/bin/rustc -C opt-level=3 -C target-cpu=ivybridge -C codegen-units=1 -L /opt/src/rust-libs regexredux.rs -o regexredux.rust-6.rust_run

8.14s to complete and log all make actions

COMMAND LINE:
./regexredux.rust-6.rust_run 0 < regexredux-input5000000.txt

PROGRAM OUTPUT:
agggtaaa|tttaccct 356
[cgt]gggtaaa|tttaccc[acg] 1250
a[act]ggtaaa|tttacc[agt]t 4252
ag[act]gtaaa|tttac[agt]ct 2894
agg[act]taaa|ttta[agt]cct 5435
aggg[acg]aaa|ttt[cgt]ccct 1537
agggt[cgt]aa|tt[acg]accct 1431
agggta[cgt]a|t[acg]taccct 1608
agggtaa[cgt]|[acg]ttaccct 2178

50833411
50000000
27388361