The Computer Language
Benchmarks Game

k-nucleotide Rust #8 program

source code

// The Computer Language Benchmarks Game
// https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
//
// contributed by Tom Kaitchuck

// Based on k-nucleotide Rust #7
// Switched to used Hashbrown and removed custom hash code.
// Removed rayon and use threads directly
// Copied the read_input function from k-nucleotide Rust #4

extern crate hashbrown;

use std::io::BufRead;
use std::sync::Arc;
use hashbrown::HashMap;
use std::thread;

type Map = HashMap<Code, u32>;

#[derive(Hash, PartialEq, PartialOrd, Ord, Eq, Clone, Copy)]
struct Code(u64);
impl Code {
    fn push(&mut self, c: u8, mask: u64) {
        self.0 <<= 2;
        self.0 |= c as u64;
        self.0 &= mask;
    }
    fn from_str(s: &str) -> Code {
        let mask = Code::make_mask(s.len());
        let mut res = Code(0);
        for c in s.as_bytes() {
            res.push(Code::encode_byte(*c), mask);
        }
        res
    }
    fn to_string(&self, frame: usize) -> String {
        let mut res = vec![];
        let mut code = self.0;
        for _ in 0..frame {
            let c = match code as u8 & 0b11 {
                c if c == Code::encode_byte(b'A') => b'A',
                c if c == Code::encode_byte(b'T') => b'T',
                c if c == Code::encode_byte(b'G') => b'G',
                c if c == Code::encode_byte(b'C') => b'C',
                _ => unreachable!(),
            };
            res.push(c);
            code >>= 2;
        }
        res.reverse();
        String::from_utf8(res).unwrap()
    }
    fn make_mask(frame: usize) -> u64 {
        (1u64 << (2 * frame)) - 1
    }
    #[inline(always)]
    fn encode_byte(c: u8) -> u8 {
        (c & 0b110) >> 1
    }
}

struct Iter<'a> {
    iter: std::slice::Iter<'a, u8>,
    code: Code,
    mask: u64,
}
impl<'a> Iter<'a> {
    fn new(input: &[u8], frame: usize) -> Iter {
        let mut iter = input.iter();
        let mut code = Code(0);
        let mask = Code::make_mask(frame);
        for c in iter.by_ref().take(frame - 1) {
            code.push(*c, mask);
        }
        Iter {
            iter: iter,
            code: code,
            mask: mask,
        }
    }
}
impl<'a> Iterator for Iter<'a> {
    type Item = Code;
    fn next(&mut self) -> Option<Self::Item> {
        self.iter.next().map(|&c| {
            self.code.push(c, self.mask);
            self.code
        })
    }
}

fn gen_freq(input: &[u8], frame: usize) -> Map {
    let mut freq = Map::default();
    for code in Iter::new(input, frame) {
        *freq.entry(code).or_insert(0) += 1;
    }
    freq
}

#[derive(Clone, Copy)]
struct Freq(usize);
#[derive(Clone, Copy)]
struct Occ(&'static str);

impl Freq {
    fn print(&self, freq: &Map) {
        let mut v: Vec<_> = freq.iter()
                                .map(|(&code, &count)| (count, code))
                                .collect();
        v.sort();
        let total = v.iter().map(|&(count, _)| count).sum::<u32>() as f32;
        for &(count, key) in v.iter().rev() {
            println!("{} {:.3}", 
            key.to_string(self.0), (count as f32 * 100.) / total);
        }
        println!("");
    }
}
impl Occ {
    fn print(&self, freq: &Map) {
        let count = if freq.contains_key(&Code::from_str(self.0)) {
            freq[&Code::from_str(self.0)]
        } else { 
            0 
        };
        println!("{}\t{}", count, self.0);                                
    }
}

fn read_input() -> Vec<u8> {
    let stdin = std::io::stdin();
    let mut r = stdin.lock();
    let key = b">THREE";
    let mut res = Vec::with_capacity(65536);
    let mut line = Vec::with_capacity(64);

    loop {
        match r.read_until(b'\n', &mut line) {
            Ok(b) if b > 0 => if line.starts_with(key) { break },
            _ => break,
        }
        line.clear();
    }

    loop {
        line.clear();
        match r.read_until(b'\n', &mut line) {
            Ok(b) if b > 0 => 
                res.extend(line[..line.len()-1].iter()
                   .cloned().map(Code::encode_byte)),
            _ => break,
        }
    }
    res
}

fn main() {
    let occs = vec![
        Occ("GGTATTTTAATTTATAGT"),
        Occ("GGTATTTTAATT"),
        Occ("GGTATT"),
        Occ("GGTA"),
        Occ("GGT"),
    ];
    let input = Arc::new(read_input());

    // In reverse to spawn big tasks first
    let results : Vec<_> = occs.into_iter().map(|item| {
        let input = input.clone();
        thread::spawn(move || (item, gen_freq(&input, item.0.len())) )
    }).collect();

    Freq(1).print(&gen_freq(&input, 1));
    Freq(2).print(&gen_freq(&input, 2));

    for t in results.into_iter().rev() {
        let (item, freq) = t.join().unwrap();
        item.print(&freq);
    }
}

    

notes, command-line, and program output

NOTES:
64-bit Ubuntu quad core
rustc 1.37.0 (eae3437df 2019-08-13)
LLVM version 8.0


Thu, 15 Aug 2019 17:44:48 GMT

MAKE:
/opt/src/rust-1.37.0/bin/rustc -C opt-level=3 -C target-cpu=core2 -C lto -C codegen-units=1 -L /opt/src/rust-libs --extern futures=/opt/src/rust-libs/libfutures-caaaf3872a2ec828.rlib --extern hashbrown=/opt/src/rust-libs/libhashbrown-24e9d47f22903e82.rlib knucleotide.rs -o knucleotide.rust-8.rust_run
error[E0463]: can't find crate for `proc_macro_hack` which `hashbrown` depends on
  --> knucleotide.rs:11:1
   |
11 | extern crate hashbrown;
   | ^^^^^^^^^^^^^^^^^^^^^^^ can't find crate

error: aborting due to previous error

For more information about this error, try `rustc --explain E0463`.
make: [/home/dunham/8000-benchmarksgame/nanobench/makefiles/u64q.programs.Makefile:638: knucleotide.rust-8.rust_run] Error 1 (ignored)

7.01s to complete and log all make actions

COMMAND LINE:
./knucleotide.rust-8.rust_run 0 < knucleotide-input250000.txt

MAKE ERROR