The Computer Language
24.09 Benchmarks Game

regex-redux OCaml #3 program

source code

(* The Computer Language Benchmarks Game
 * https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
 *
 * regex-dna program contributed by Christophe TROESTLER
 * updated by Roman Kashitsyn: use Bytes instead of String
 * updated by Gaëtan Dubreil: use the Re library and parallelize processing
 *)

open Printf

(* The nine alternation patterns whose match counts are reported,
   listed in the order in which they are printed. *)
let variants =
  [ "agggtaaa|tttaccct";
    "[cgt]gggtaaa|tttaccc[acg]";
    "a[act]ggtaaa|tttacc[agt]t";
    "ag[act]gtaaa|tttac[agt]ct";
    "agg[act]taaa|ttta[agt]cct";
    "aggg[acg]aaa|ttt[cgt]ccct";
    "agggt[cgt]aa|tt[acg]accct";
    "agggta[cgt]a|t[acg]taccct";
    "agggtaa[cgt]|[acg]ttaccct" ]

(* Ordered (pattern, replacement) pairs.  Order matters: the last two
   patterns match the "<...>" and "|" text that the earlier replacements
   introduce. *)
let subst =
  [ ("tHa[Nt]", "<4>");
    ("aND|caN|Ha[DS]|WaS", "<3>");
    ("a[NSt]|BY", "<2>");
    ("<[^>]*>", "|");
    ("\\|[^|][^|]*\\|", "-") ]

(* Read all of a redirected FASTA format file from stdin.
   [file_data] is the complete input and [file_length] its size in bytes. *)
let file_data, file_length =
  let chunk_size = 0xFFF in
  let acc = Buffer.create 0xFFFF in
  let chunk = Bytes.create chunk_size in
  (* [input] returns 0 once end of file is reached, which stops the
     recursion.  [Buffer.add_subbytes] copies straight out of the reused
     chunk, so no [Bytes.unsafe_to_string] cast is needed. *)
  let rec slurp () =
    match input stdin chunk 0 chunk_size with
    | 0 -> ()
    | got ->
      Buffer.add_subbytes acc chunk 0 got;
      slurp ()
  in
  slurp ();
  (Buffer.contents acc, Buffer.length acc)

(* Remove FASTA sequence descriptions and all linefeed characters.
   The replacement text is passed with its [~by] label so the call no
   longer triggers Warning 6 (labels-omitted). *)
let dna = Re.replace_string (Re.Pcre.regexp ">.*\n|\n") ~by:"" file_data

(* Length of the sequence once descriptions and linefeeds are stripped. *)
let code_length = String.length dna

(* Count matches of [re] in [s].
   [re] is a pattern string compiled here with [Re.Pcre.regexp].  Each
   search resumes at the end of the previous match, so the matches that
   are counted never overlap. *)
let count re s =
  let compiled = Re.Pcre.regexp re in
  (* [Re.exec] raises [Not_found] when nothing matches from [pos] on;
     that ends the scan and yields the running total. *)
  let rec scan pos found =
    match Re.exec ~pos compiled s with
    | groups -> scan (Re.Group.stop groups 0) (found + 1)
    | exception Not_found -> found
  in
  scan 0 0

(* Entry point: the work is split across two processes with [Unix.fork].
   - The child counts the matches of every pattern in [variants] and
     prints one "pattern count" line per pattern.
   - The parent applies the [subst] rewrites in order, each one to the
     result of the previous one, then waits for a child to terminate
     before printing the three lengths: raw input, cleaned sequence,
     rewritten sequence. *)
let () =
  if Unix.fork () = 0 then
    List.iter (fun re -> printf "%s %i\n" re (count re dna)) variants
  else begin
    let rewritten =
      List.fold_left
        (fun acc (re, by) ->
           (* Passing [~by] explicitly avoids Warning 6 (labels-omitted). *)
           Re.replace_string (Re.Pcre.regexp re) ~by acc)
        dna subst
    in
    (* The child's pid and exit status are deliberately discarded; the
       annotation keeps [ignore] from hiding a partial application. *)
    ignore (Unix.wait () : int * Unix.process_status);
    printf "\n%i\n%i\n%i\n" file_length code_length (String.length rewritten)
  end
    

notes, command-line, and program output

NOTES:
64-bit Ubuntu quad core
OCaml native-code
5.4.0+dev0-2024-08-25


 Thu, 05 Sep 2024 18:10:57 GMT

MAKE:
mv regexredux.ocaml-3.ocaml regexredux.ocaml-3.ml
~/.opam/5.1.1/bin/ocamlopt -noassert -unsafe -fPIC -nodynlink -inline 100 -O3 -I /home/dunham/.opam/5.1.1/lib/re unix.cmxa re.cmxa -ccopt -march=ivybridge regexredux.ocaml-3.ml -o regexredux.ocaml-3.ocaml_run
File "regexredux.ocaml-3.ml", line 32, characters 10-27:
32 | let dna = Re.replace_string (Re.Pcre.regexp ">.*\n|\n") "" file_data
               ^^^^^^^^^^^^^^^^^
Warning 6 [labels-omitted]: label by was omitted in the application of this function.

File "_none_", line 1:
Alert ocaml_deprecated_auto_include: 
OCaml's lib directory layout changed in 5.0. The unix subdirectory has been
automatically added to the search path, but you should add -I +unix to the
command-line to silence this alert (e.g. by adding unix to the list of
libraries in your dune file, or adding use_unix to your _tags file for
ocamlbuild, or using -package unix for ocamlfind).

File "regexredux.ocaml-3.ml", line 56, characters 13-30:
56 |         b := Re.replace_string (Re.Pcre.regexp re) s !b) subst;
                  ^^^^^^^^^^^^^^^^^
Warning 6 [labels-omitted]: label by was omitted in the application of this function.

File "_none_", line 1:
Alert ocaml_deprecated_auto_include: 
OCaml's lib directory layout changed in 5.0. The unix subdirectory has been
automatically added to the search path, but you should add -I +unix to the
command-line to silence this alert (e.g. by adding unix to the list of
libraries in your dune file, or adding use_unix to your _tags file for
ocamlbuild, or using -package unix for ocamlfind).
rm regexredux.ocaml-3.ml

2.23s to complete and log all make actions

COMMAND LINE:
 ./regexredux.ocaml-3.ocaml_run 0 < regexredux-input5000000.txt

PROGRAM OUTPUT:
agggtaaa|tttaccct 356
[cgt]gggtaaa|tttaccc[acg] 1250
a[act]ggtaaa|tttacc[agt]t 4252
ag[act]gtaaa|tttac[agt]ct 2894
agg[act]taaa|ttta[agt]cct 5435
aggg[acg]aaa|ttt[cgt]ccct 1537
agggt[cgt]aa|tt[acg]accct 1431
agggta[cgt]a|t[acg]taccct 1608
agggtaa[cgt]|[acg]ttaccct 2178

50833411
50000000
27388361