regex-redux Racket program
source code
#lang racket/base
;;; The Computer Language Benchmarks Game
;;; https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
;;; based on a version by Anthony Borla
;;; regex-dna program contributed by Matthew Flatt
;;; converted from regex-dna program
(require racket/port)
;; -------------------------------
;; Byte-string patterns whose matches are counted in the input.  Each
;; entry is an alternation of two eight-letter patterns; all entries
;; after the first contain a bracketed character class.
(define VARIANTS
  (list #"agggtaaa|tttaccct"
        #"[cgt]gggtaaa|tttaccc[acg]"
        #"a[act]ggtaaa|tttacc[agt]t"
        #"ag[act]gtaaa|tttac[agt]ct"
        #"agg[act]taaa|ttta[agt]cct"
        #"aggg[acg]aaa|ttt[cgt]ccct"
        #"agggt[cgt]aa|tt[acg]accct"
        #"agggta[cgt]a|t[acg]taccct"
        #"agggtaa[cgt]|[acg]ttaccct"))
;; Ordered substitution table.  Each entry is a two-element list:
;; a byte-string pattern followed by the byte string that replaces
;; every match of it.  Later entries operate on the output of earlier
;; ones, so the order is significant.
(define IUBS
  (list (list #"tHa[Nt]" #"<4>")
        (list #"aND|caN|Ha[DS]|WaS" #"<3>")
        (list #"a[NSt]|BY" #"<2>")
        (list #"<[^>]*>" #"|")
        (list #"\\|[^|][^|]*\\|" #"-")))
;; -------------------------------
;; Compile the byte-string pattern `s` into a byte regexp that matches
;; case-insensitively, by enclosing the whole pattern in a `(?i:...)`
;; group before compiling it.
(define (ci-byte-regexp s)
  (let ([wrapped (bytes-append #"(?i:" s #")")])
    (byte-regexp wrapped)))
;; -------------------------------
;; Count the non-overlapping matches of `rx` in `str`, scanning left to
;; right from position `offset`, and return that count added to `cnt`.
;;   str    - the input to search; passed to `regexp-match-positions`
;;   rx     - the pattern to look for
;;   offset - position in `str` at which the scan starts
;;   cnt    - running total carried into the scan (the caller in this
;;            file passes 0)
;; Each search resumes where the previous match ended.  A match that
;; consumes nothing would make the scan resume at the same position
;; forever, so after an empty match the scan steps one position
;; forward, and it stops once that step would pass the end of the input.
(define (match-count str rx offset cnt)
  (let ([m (regexp-match-positions rx str offset)])
    (cond
      [(not m) cnt]
      ;; Ordinary (non-empty) match: resume right after it.
      [(< (caar m) (cdar m))
       (match-count str rx (cdar m) (add1 cnt))]
      [else
       ;; Empty match: count it once, then force progress by one position.
       (let ([next (add1 (cdar m))]
             [limit (cond
                      [(bytes? str) (bytes-length str)]
                      [(string? str) (string-length str)]
                      [(path? str) (bytes-length (path->bytes str))]
                      ;; Length unknown for other input kinds; no extra bound.
                      [else +inf.0])])
         (if (> next limit)
             (add1 cnt)
             (match-count str rx next (add1 cnt))))])))
;; -------------------------------
;; Load sequence and record its length
(let* ([raw (port->bytes)]
       ;; Delete every span running from a ">" through the next newline,
       ;; and every remaining newline.
       [stripped (regexp-replace* #rx#"(?:>.*?\n)|\n" raw #"")])
  ;; One output line per variant: the pattern text followed by the
  ;; number of case-insensitive matches found in the stripped input.
  (for-each
   (lambda (pattern)
     (let ([hits (match-count stripped (ci-byte-regexp pattern) 0 0)])
       (printf "~a ~a\n" pattern hits)))
   VARIANTS)
  ;; Apply the IUBS substitutions in order, each one working on the
  ;; result of the previous one; when none are left, print the three
  ;; lengths: raw input, stripped input, and fully substituted input.
  (let rewrite ([pending IUBS] [current stripped])
    (if (pair? pending)
        (let ([from (car (car pending))]
              [to (cadr (car pending))])
          (rewrite (cdr pending)
                   (regexp-replace* (byte-regexp from) current to)))
        (printf "\n~a\n~a\n~a\n"
                (bytes-length raw)
                (bytes-length stripped)
                (bytes-length current)))))
notes, command-line, and program output
NOTES:
64-bit Ubuntu quad core
Welcome to Racket v7.7.
Wed, 06 May 2020 18:00:01 GMT
MAKE:
/opt/src/racket-7.7/bin/raco make regexredux.racket
4.11s to complete and log all make actions
COMMAND LINE:
/opt/src/racket-7.7/bin/racket regexredux.racket 0 < regexredux-input5000000.txt
PROGRAM OUTPUT:
agggtaaa|tttaccct 356
[cgt]gggtaaa|tttaccc[acg] 1250
a[act]ggtaaa|tttacc[agt]t 4252
ag[act]gtaaa|tttac[agt]ct 2894
agg[act]taaa|ttta[agt]cct 5435
aggg[acg]aaa|ttt[cgt]ccct 1537
agggt[cgt]aa|tt[acg]accct 1431
agggta[cgt]a|t[acg]taccct 1608
agggtaa[cgt]|[acg]ttaccct 2178
50833411
50000000
27388361