The Computer Language
24.12 Benchmarks Game

k-nucleotide OCaml program

source code

(* 
 * The Computer Language Benchmarks Game
 * https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
 *
 * contributed by Troestler Christophe
 * modified by Mauricio Fernandez
 * fixed by glacialthinker 
 *
 *)

module S = struct
  type t = string

  let size = 0x40000

  let equal (s1:string) s2 = (s1 = s2)

  let hash s =
    let h = ref 0 in
    for i = 0 to String.length s - 1 do h := !h * 5 + Char.code s.[i] done;
    !h
end

module H = Hashtbl.Make(S)

(* [counts k dna] fills and return the hashtable [count] of
   k-nucleotide keys and count values for a particular reading-frame
   of length [k] of the string [dna].  Keys point to mutable values
   for speed (to avoid looking twice the same key to reinsert the
   value). *)
let count = H.create S.size
let counts k dna =
  H.clear count;
  let key = Bytes.create k in
    for i = 0 to String.length dna - k do
      String.unsafe_blit dna i key 0 k;
      try incr(H.find count (Bytes.unsafe_to_string key)) with Not_found ->
        H.add count (Bytes.to_string key) (ref 1)
    done;
    count

(* [write_frequencies k dna] writes the frequencies for a
   reading-frame of length [k] sorted by descending frequency and then
   ascending k-nucleotide key. *)
let compare_freq ((k1:string),(f1:float)) (k2, f2) =
  if f1 > f2 then -1 else if f1 < f2 then 1 else compare k1 k2

let write_frequencies k dna =
  let cnt = counts k dna in
  let tot = float(H.fold (fun _ n t -> !n + t) cnt 0) in
  let frq = H.fold (fun k n l -> (k, 100. *. float !n /. tot) :: l) cnt [] in
  let frq = List.sort compare_freq frq in
  List.iter (fun (k,f) -> Printf.printf "%s %.3f\n" k f) frq;
  print_string "\n"

let write_count seq dna =
  let cnt = counts (String.length seq) dna in
  Printf.printf "%d\t%s\n" (try !(H.find cnt seq) with Not_found -> 0) seq

(* Extract DNA sequence "THREE" from stdin *)
let dna_three =
  let is_not_three s = try String.sub s 0 6 <> ">THREE" with _ -> true in
  while is_not_three(input_line stdin) do () done;
  let buf = Buffer.create 1000 in
  (* Skip possible comment *)
  (try while true do
     let line = input_line stdin in
     if line.[0] <> ';' then
       (Buffer.add_string buf (String.uppercase_ascii line); raise Exit)
   done with _ -> ());
  (* Read the DNA sequence *)
  (try while true do
       let line = input_line stdin in
       if line.[0] = '>' then raise End_of_file;
       Buffer.add_string buf (String.uppercase_ascii line)
   done with End_of_file -> ());
  Buffer.contents buf

let () = Gc.set { (Gc.get()) with Gc.minor_heap_size = 1024 * 2048 }

let () =
  List.iter (fun i -> write_frequencies i dna_three) [1; 2];
  List.iter (fun k -> write_count k dna_three)
    ["GGT"; "GGTA"; "GGTATT"; "GGTATTTTAATT"; "GGTATTTTAATTTATAGT"]
    

notes, command-line, and program output

NOTES:
64-bit Ubuntu quad core
OCaml native-code
5.4.0+dev0-2024-08-25


 Thu, 05 Sep 2024 17:46:10 GMT

MAKE:
mv knucleotide.ocaml knucleotide.ml
~/.opam/5.1.1/bin/ocamlopt -noassert -unsafe -fPIC -nodynlink -inline 100 -O3 -I +unix unix.cmxa -ccopt -march=ivybridge knucleotide.ml -o knucleotide.ocaml_run
rm knucleotide.ml

2.12s to complete and log all make actions

COMMAND LINE:
 ./knucleotide.ocaml_run 0 < knucleotide-input25000000.txt

PROGRAM OUTPUT:
A 30.295
T 30.151
C 19.800
G 19.754

AA 9.177
TA 9.132
AT 9.131
TT 9.091
CA 6.002
AC 6.001
AG 5.987
GA 5.984
CT 5.971
TC 5.971
GT 5.957
TG 5.956
CC 3.917
GC 3.911
CG 3.909
GG 3.902

1471758	GGT
446535	GGTA
47336	GGTATT
893	GGTATTTTAATT
893	GGTATTTTAATTTATAGT