regex-redux OCaml #2 program
source code
(* The Computer Language Benchmarks Game
* https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
*
* regex-dna program contributed by Christophe TROESTLER
* converted from regex-dna program
*
* updated by Roman Kashitsyn: use Bytes instead of String
*)
open Printf
(* The nine patterns whose matches are counted, in the order they are
   reported.  Each entry is written in Str syntax, where alternation is
   spelled backslash-bar, and consists of two alternatives. *)
let variants =
  [ "agggtaaa\\|tttaccct";
    "[cgt]gggtaaa\\|tttaccc[acg]";
    "a[act]ggtaaa\\|tttacc[agt]t";
    "ag[act]gtaaa\\|tttac[agt]ct";
    "agg[act]taaa\\|ttta[agt]cct";
    "aggg[acg]aaa\\|ttt[cgt]ccct";
    "agggt[cgt]aa\\|tt[acg]accct";
    "agggta[cgt]a\\|t[acg]taccct";
    "agggtaa[cgt]\\|[acg]ttaccct" ]
(* Str spells alternation as backslash-bar, but the report prints patterns
   with a plain bar.  [re_bs] matches one literal backslash and
   [to_string pattern] returns [pattern] with every backslash deleted. *)
let re_bs = Str.regexp_string "\\"
let to_string pattern = pattern |> Str.global_replace re_bs ""
(* Rewrite rules as (Str pattern, replacement text) pairs.  They are listed
   in the order in which they are meant to be applied: the last two rules
   operate on the angle-bracket and bar markers produced by earlier ones. *)
let subst =
  [ ("tHa[Nt]", "<4>");
    ("aND\\|caN\\|Ha[DS]\\|WaS", "<3>");
    ("a[NSt]\\|BY", "<2>");
    ("<[^>]*>", "|");
    ("|[^|][^|]*|", "-") ]
(* Read all of a redirected FASTA format file from stdin.

   [file_data] is the complete input as a string and [file_length] is its
   length in bytes.  The input is pulled in fixed-size chunks until [input]
   reports end of file by returning 0. *)
let file_data, file_length =
  let chunk_size = 0xFFF in
  let b = Buffer.create 0xFFFF in
  let s = Bytes.create chunk_size in
  let rec slurp () =
    let r = input stdin s 0 chunk_size in
    if r > 0 then begin
      (* Copy straight from the byte chunk: no need to view the mutable
         chunk as a string through Bytes.unsafe_to_string. *)
      Buffer.add_subbytes b s 0 r;
      slurp ()
    end
  in
  slurp ();
  (Buffer.contents b, Buffer.length b)
(* Remove FASTA sequence descriptions and all linefeed characters.

   The pattern has two alternatives: a whole line that begins with the
   greater-than sign, or a single newline.  Every match is replaced by the
   empty string.  [code_length] is the length of what remains. *)
let dna =
  let junk = Str.regexp "^>.*$\\|\n" in
  Str.global_replace junk "" file_data
let code_length = String.length dna
(* Count matches of [re].

   [count re s] compiles [re] as a case-insensitive Str pattern and returns
   the number of positions in [s] at which a match starts.  After each hit
   the search resumes one character past the start of that hit, so matches
   that overlap are each counted.

   The start position is never allowed to exceed the length of [s]:
   Str.search_forward rejects such a position with Invalid_argument, which
   would otherwise happen after a match found at the very end of [s] (any
   pattern that can match the empty string does this). *)
let count re s =
  let re = Str.regexp_case_fold re in
  let len = String.length s in
  let rec loop pos n =
    if pos > len then n
    else
      (* The exception case is outside the handler scope of the recursive
         call, so the loop stays tail-recursive. *)
      match Str.search_forward re s pos with
      | found -> loop (found + 1) (n + 1)
      | exception Not_found -> n
  in
  loop 0 0
(* Program entry point.
   1. For every pattern in [variants], print the pattern (without its
      backslashes) followed by its match count in [dna].
   2. Apply the [subst] rules to [dna] one after another, each rule working
      on the output of the previous one.
   3. Print a blank line, then the raw input length, the cleaned sequence
      length and the length after all substitutions, one per line. *)
let () =
  let report pattern =
    printf "%s %i\n" (to_string pattern) (count pattern dna)
  in
  List.iter report variants;
  let rewritten =
    List.fold_left
      (fun acc (pattern, replacement) ->
        Str.global_replace (Str.regexp pattern) replacement acc)
      dna subst
  in
  printf "\n%i\n%i\n%i\n" file_length code_length (String.length rewritten)
notes, command-line, and program output
NOTES:
64-bit Ubuntu quad core
OCaml native-code
5.4.0+dev0-2024-08-25
Thu, 05 Sep 2024 18:12:25 GMT
MAKE:
mv regexredux.ocaml-2.ocaml regexredux.ocaml-2.ml
~/.opam/5.1.1/bin/ocamlopt -noassert -unsafe -fPIC -nodynlink -inline 100 -O3 -I +unix -I +str unix.cmxa str.cmxa -ccopt -march=ivybridge regexredux.ocaml-2.ml -o regexredux.ocaml-2.ocaml_run
rm regexredux.ocaml-2.ml
2.18s to complete and log all make actions
COMMAND LINE:
./regexredux.ocaml-2.ocaml_run 0 < regexredux-input5000000.txt
PROGRAM OUTPUT:
agggtaaa|tttaccct 356
[cgt]gggtaaa|tttaccc[acg] 1250
a[act]ggtaaa|tttacc[agt]t 4252
ag[act]gtaaa|tttac[agt]ct 2894
agg[act]taaa|ttta[agt]cct 5435
aggg[acg]aaa|ttt[cgt]ccct 1537
agggt[cgt]aa|tt[acg]accct 1431
agggta[cgt]a|t[acg]taccct 1608
agggtaa[cgt]|[acg]ttaccct 2178
50833411
50000000
27388361