The Computer Language
Benchmarks Game

k-nucleotide Rust #7 program

source code

// The Computer Language Benchmarks Game
// https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
//
// contributed by Alexander

extern crate futures;
extern crate tokio_threadpool;
extern crate itertools;
extern crate num;
extern crate hashbrown;

use futures::{Future, lazy};
use self::tokio_threadpool::ThreadPool;
use itertools::Itertools;
use num::{FromPrimitive, ToPrimitive};
use std::cmp::Ordering;
use std::sync::Arc;

type Map<T> = hashbrown::HashMap<T, u32>;

trait ShlXorMsk<T> {
    fn sh(a: T, x: u8, m: T) -> T;
    fn mask(len: usize) -> T;
}

impl ShlXorMsk<u8> for u8 {
    fn sh(a: u8, x: u8, m: u8) -> u8 {
        m & (a << 2) | x
    }
    fn mask(len: usize) -> u8 {
        ((1u16 << 2 * len) - 1) as u8
    }
}

impl ShlXorMsk<u16> for u16 {
    fn sh(a: u16, x: u8, m: u16) -> u16 {
        m & (a << 2) | (x as u16)
    }
    fn mask(len: usize) -> u16 {
        ((1u32 << 2 * len) - 1) as u16
    }
}

impl ShlXorMsk<u32> for u32 {
    fn sh(a: u32, x: u8, m: u32) -> u32 {
        m & (a << 2) | (x as u32)
    }
    fn mask(len: usize) -> u32 {
        ((1u64 << 2 * len) - 1) as u32
    }
}

impl ShlXorMsk<u64> for u64 {
    fn sh(a: u64, x: u8, m: u64) -> u64 {
        m & (a << 2) | (x as u64)
    }
    fn mask(len: usize) -> u64 {
        (1u64 << 2 * len) - 1
    }
}

fn match_key(k: u8) -> char {
    match k {
        0b00 => 'A',
        0b01 => 'C',
        0b10 => 'T',
        0b11 => 'G',
        _ => '_',
    }
}

fn print_stat(h: Map<u8>, seq_len: usize) {
    let total = h.values().sum::<u32>();

    h.into_iter()
        .sorted_by(|&(ref a, x), &(ref b, y)| {
            let ord1 = Ord::cmp(&y, &x);
            if ord1 == Ordering::Equal {
                Ord::cmp(&b, &a)
            } else {
                ord1
            }
        })
        .for_each(|(k, v)| {
            if seq_len == 1 {
                println!("{} {:.3}", match_key(k), (100 * v) as f32 / total as f32);
            } else {
                println!(
                    "{}{} {:.3}",
                    match_key(k >> 2),
                    match_key(0b11 & k),
                    (100 * v) as f32 / total as f32
                );
            };
        });
    println!();
}

fn print<
    T: FromPrimitive + ToPrimitive + Default + std::hash::Hash + std::cmp::Eq + ShlXorMsk<T> + Copy,
>(
    h: Map<T>,
    seq: &str,
) {
    let mask = T::from_u64((1u64 << (2 * seq.len() as u32)) - 1).unwrap();
    let k = seq.to_ascii_lowercase()
        .as_bytes()
        .iter()
        .map(|x| 0b11u8 & x >> 1)
        .fold(T::default(), |acc, x| T::sh(acc, x, mask));
    println!("{}\t{}", h.get(&k).unwrap_or(&0), seq);
}

fn freq<
    T: FromPrimitive + ToPrimitive + Default + std::hash::Hash + std::cmp::Eq + ShlXorMsk<T> + Copy,
>(
    s_vec: &[u8],
    len: usize,
) -> Map<T> {
    let mut h = Map::default();
    let mask = T::mask(len);
    let mut it = s_vec.iter();
    let mut a = it.by_ref()
        .take(len - 1)
        .fold(T::default(), |acc, &x| T::sh(acc, x, mask));
    for &x in it {
        a = T::sh(a, x, mask);
        *h.entry(a).or_insert(0) += 1;
    }
    h
}

fn get_seq<R: std::io::BufRead>(mut r: R, key: &[u8]) -> Vec<u8> {
    let mut res = Vec::with_capacity(65536);
    let mut line = Vec::with_capacity(64);

    loop {
        match r.read_until(b'\n', &mut line) {
            Ok(b) if b > 0 => if line.starts_with(key) { break },
            _ => break,
        }
        line.clear();
    }

    loop {
        line.clear();
        match r.read_until(b'\n', &mut line) {
            Ok(b) if b > 0 => res.extend(line[..line.len()-1].iter().map(|&x| 0b11 & x >> 1)),
            _ => break,
        }
    }

    res
}

pub fn calc<R: std::io::BufRead>(r: R) {
    let s_vec = get_seq(r, b">THREE");

    let pool = ThreadPool::new();

    let arc_vec = Arc::new(s_vec);
    let s1 = arc_vec.clone();
    let s2 = arc_vec.clone();
    let s3 = arc_vec.clone();
    let s4 = arc_vec.clone();
    let s5 = arc_vec.clone();
    let s6 = arc_vec.clone();
    let s7 = arc_vec.clone();
    let f7 = pool.spawn_handle(lazy(move || Ok::<_, ()>(freq(&s1, 18))));
    let f6 = pool.spawn_handle(lazy(move || Ok::<_, ()>(freq(&s2, 12))));
    let f5 = pool.spawn_handle(lazy(move || Ok::<_, ()>(freq(&s3, 6))));
    let f4 = pool.spawn_handle(lazy(move || Ok::<_, ()>(freq(&s4, 4))));
    let f3 = pool.spawn_handle(lazy(move || Ok::<_, ()>(freq(&s5, 3))));
    let f2 = pool.spawn_handle(lazy(move || Ok::<_, ()>(freq(&s6, 2))));
    let f1 = pool.spawn_handle(lazy(move || Ok::<_, ()>(freq(&s7, 1))));
    print_stat(f1.wait().unwrap(), 1);
    print_stat(f2.wait().unwrap(), 2);
    print::<u8>(f3.wait().unwrap(), "GGT");
    print::<u8>(f4.wait().unwrap(), "GGTA");
    print::<u16>(f5.wait().unwrap(), "GGTATT");
    print::<u32>(f6.wait().unwrap(), "GGTATTTTAATT");
    print::<u64>(f7.wait().unwrap(), "GGTATTTTAATTTATAGT");
}

fn main() {
    let stdin = std::io::stdin();
    calc(stdin.lock());
}
    

notes, command-line, and program output

NOTES:
64-bit Ubuntu quad core
rustc 1.39.0 (4560ea788 2019-11-04)
LLVM version: 9.0


Thu, 07 Nov 2019 19:59:58 GMT

MAKE:
/opt/src/rust-1.39.0/bin/rustc -C opt-level=3 -C target-cpu=core2 -C lto -C codegen-units=1 -L /opt/src/rust-libs --extern futures=/opt/src/rust-libs/libfutures-d6d90b26a1db9345.rlib --extern hashbrown=/opt/src/rust-libs/libhashbrown-bdfa72d8eb1135d5.rlib knucleotide.rs -o knucleotide.rust-7.rust_run
error[E0463]: can't find crate for `proc_macro_hack` which `hashbrown` depends on
  --> knucleotide.rs:10:1
   |
10 | extern crate hashbrown;
   | ^^^^^^^^^^^^^^^^^^^^^^^ can't find crate

error: aborting due to previous error

For more information about this error, try `rustc --explain E0463`.
make: [/home/dunham/8000-benchmarksgame/nanobench/makefiles/u64q.programs.Makefile:650: knucleotide.rust-7.rust_run] Error 1 (ignored)

5.85s to complete and log all make actions

COMMAND LINE:
./knucleotide.rust-7.rust_run 0 < knucleotide-input250000.txt

MAKE ERROR