The Computer Language
Benchmarks Game

k-nucleotide OCaml #3 program

source code

(* The Computer Language Benchmarks Game
 * https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
 *
 * contributed by Troestler Christophe
 * modified by Mauricio Fernandez
 * optimized by Fabrice Le Fessant
 * modified by Roman Kashitsyn: use Bytes instead of String
 *)

let tab = Array.make 256 0
let _ =
  tab.(Char.code 'A') <- 0;
  tab.(Char.code 'a') <- 0;
  tab.(Char.code 'T') <- 1;
  tab.(Char.code 't') <- 1;
  tab.(Char.code 'C') <- 2;
  tab.(Char.code 'c') <- 2;
  tab.(Char.code 'g') <- 3;
  tab.(Char.code 'G') <- 3

let uppercase line =
  let len = Bytes.length line in
  for i = 0 to len - 1 do
    let c = Bytes.get line i in
    Bytes.set line i (Char.unsafe_chr tab.(Char.code c))
  done

(* Extract DNA sequence "THREE" from stdin *)
let dna =
  let is_not_three s = String.length s < 6 || String.sub s 0 6 <> ">THREE" in
  while is_not_three(input_line stdin) do () done;
  let buf = Buffer.create 130_000_000 in
  (* Skip possible comment *)
  (try
     while true do
       let line = Bytes.unsafe_of_string (input_line stdin) in
       if Bytes.get line 0 <> ';' then begin
	 uppercase line;
	 Buffer.add_bytes buf line;
	 raise Exit
       end
     done with _ -> ());
  (* Read the DNA sequence *)
  (try while true do
       let line = Bytes.unsafe_of_string (input_line stdin) in
       if Bytes.get line 0 = '>' then raise End_of_file;
       uppercase line;
       Buffer.add_bytes buf line
     done with End_of_file -> ());
  Buffer.contents buf


module K15 = struct
  type t = int
  let equal k1 k2 = k1 = k2
  let hash n = n
end

module K16 = struct
  type t = int * int
  let equal (a1,a2) (b1,b2) = a1 = b1 && a2 = b2
  let hash (a1, _) = a1
end

type entry = {
  mutable count : int;
}

let threshold15 =
  match Sys.word_size with
    32 -> 15
  | 64 -> 31
  | _ -> assert false
let threshold16 = threshold15 + 1

let c = 0x40000-1
module H15 = Hashtbl.Make(K15)
module H16 = Hashtbl.Make(K16)
let h15 = H15.create c
let h16 = H16.create c

let rec pack_word n k h =
  let b = Char.code dna.[n] in
  let h = h * 4 + b in
  if k > 1 then
    pack_word (n+1) (k-1) h
  else h

let pack15 k n =
  pack_word n k 0

let pack16 k n =
  let h1 = pack_word n threshold15 0 in
  let h2 = pack_word (n+ threshold15) (k- threshold15) 0 in
  (h1, h2)

let rec pack_word_in dna n k h =
  let b = dna.[n] in
  let b = tab.(Char.code b) in
  let h = h * 4 + b in
  if k > 1 then
    pack_word_in dna (n+1) (k-1) h
  else h

let pack_key15 seq =
  let k = String.length seq in
  pack_word_in seq 0 k 0

let pack_key16 seq =
  let k = String.length seq in
  let h1 = pack_word_in seq 0 threshold15 0 in
  let h2 = pack_word_in seq threshold15 (k- threshold15) 0 in
  (h1, h2)

let char = [| 'A'; 'T'; 'C'; 'G' |]

let rec unpack h s pos k =
  let pos = pos - 1 in
  Bytes.set s pos char.(h land 3);
  if k > 1 then
    unpack (h lsr 2) s pos (k-1)

let unpack15 k h1 =
  let s = Bytes.create k in
  unpack h1 s k k;
  Bytes.unsafe_to_string s

let unpack16 k (h1, h2) =
  let s = Bytes.create k in
  unpack h1 s threshold15 threshold15;
  unpack h2 s k (k- threshold15);
  Bytes.unsafe_to_string s

let count15 k =
  for i = 0 to String.length dna - k - 1 do
    let packed = pack15 k i in
    try
      let key = H15.find h15 packed in
      key.count <- key.count + 1
    with Not_found ->
      H15.add h15 packed { count = 1 }
  done

let count16 k =
  for i = 0 to String.length dna - k - 1 do
    let packed = pack16 k i in
    try
      let key = H16.find h16 packed in
      key.count <- key.count + 1
    with Not_found ->
      H16.add h16 packed { count = 1 }
  done

let count k =
  if k < threshold16 then count15 k else count16 k

let compare_freq ((k1:string),(f1:float)) (k2, f2) =
  if f1 > f2 then -1 else if f1 < f2 then 1 else String.compare k1 k2

let write_frequencies15 k =
  count15 k;
  let tot = float(H15.fold (fun _ n t -> n.count + t) h15 0) in
  let frq =
    H15.fold (fun h n l ->
	(unpack15 k h, 100. *. float n.count /. tot) :: l) h15 [] in
  let frq = List.sort compare_freq frq in
  String.concat ""
    (List.map (fun (k,f) -> Printf.sprintf "%s %.3f\n" k f) frq)

let write_frequencies16 k =
  count16 k;
  let tot = float(H16.fold (fun _ n t -> n.count + t) h16 0) in
  let frq =
    H16.fold (fun h n l ->
	(unpack16 k h, 100. *. float n.count /. tot) :: l) h16 [] in
  let frq = List.sort compare_freq frq in
  String.concat ""
    (List.map (fun (k,f) -> Printf.sprintf "%s %.3f\n" k f) frq)

let write_count15 k seq =
  count15 k;
  Printf.sprintf "%d\t%s" (try (H15.find h15 (pack_key15 seq)).count with Not_found -> 0) seq

let write_count16 k seq =
  count16 k;
  Printf.sprintf "%d\t%s" (try (H16.find h16 (pack_key16 seq)).count with Not_found -> 0) seq

let write_frequencies k =
  if k < threshold16 then write_frequencies15 k
  else write_frequencies16 k

let write_count seq =
  let k = String.length seq in
  if k < threshold16 then write_count15 k seq
  else write_count16 k seq

type t = Size of int | Dna of string

let invoke (f : t -> string) x : unit -> string =
  let input, output = Unix.pipe() in
  match Unix.fork() with
  | -1 -> Unix.close input; Unix.close output; (let v = f x in fun () -> v)
  | 0 ->
    Unix.close input;
    let output = Unix.out_channel_of_descr output in
    Marshal.to_channel output (f x) [];
    close_out output;
    exit 0
  | pid ->
    Unix.close output;
    let input = Unix.in_channel_of_descr input in fun () ->
      let v = Marshal.from_channel input in
      ignore (Unix.waitpid [] pid);
      close_in input;
      v

let parallelize f l =
  let list = List.map (invoke f) (List.rev l) in
  List.iter (fun g -> print_endline (g ())) (List.rev list)

let () =
  parallelize
    (fun i ->
       match i with
	 Size i ->
         write_frequencies i
       | Dna k ->
         write_count k
    ) [Size 1;
       Size 2;
       Dna "GGT";
       Dna "GGTA";
       Dna "GGTATT";
       Dna "GGTATTTTAATT";
       Dna "GGTATTTTAATTTATAGT"]
    

notes, command-line, and program output

NOTES:
64-bit Ubuntu quad core
The OCaml native-code compiler, version 4.09.0


Thu, 24 Oct 2019 18:19:16 GMT

MAKE:
mv knucleotide.ocaml-3.ocaml knucleotide.ocaml-3.ml
/opt/src/ocaml-4.09.0/bin/ocamlopt -noassert -unsafe -fPIC -nodynlink -inline 100 -O3 unix.cmxa knucleotide.ocaml-3.ml -o knucleotide.ocaml-3.ocaml_run
rm knucleotide.ocaml-3.ml

2.85s to complete and log all make actions

COMMAND LINE:
./knucleotide.ocaml-3.ocaml_run 0 < knucleotide-input25000000.txt

PROGRAM OUTPUT:
A 30.295
T 30.151
C 19.800
G 19.754

AA 9.177
TA 9.132
AT 9.131
TT 9.091
CA 6.002
AC 6.001
AG 5.987
GA 5.984
CT 5.971
TC 5.971
GT 5.957
TG 5.956
CC 3.917
GC 3.911
CG 3.909
GG 3.902

1471758	GGT
446535	GGTA
47336	GGTATT
893	GGTATTTTAATT
893	GGTATTTTAATTTATAGT