The Computer Language
24.12 Benchmarks Game

k-nucleotide Racket #4 program

source code

#lang racket/base
(require racket/fixnum)
(require racket/generator)
(require racket/sequence)
;;;
;;; The Computer Language Benchmarks Game
;;; https://salsa.debian.org/benchmarksgame-team/benchmarksgame/

;;; contributed by Matthew Flatt, modified by
;;; modified by James Bergstra

;;; Notes on the implementation: the strategy is to map the DNA letters to the
;;; bytes 0, 1, 2, 3, and then create a hash function that is simply the
;;; concatenation of these two-byte codes. This is handy because the slow part
;;; of this test is building the hash table, and this hash function means that
;;; we can take advantage of overlapping DNA sub-sequences to get a
;;; constant-time hash function (that does not depend on the sequence length).
;;;
;;; The bottleneck in this code seems to be Racket's hash table. The time to
;;; create the last hash table (for the len-18 string) seems to be about half
;;; the runtime of the whole program.

;; Map A->0, C->1, G->2 T->3 (and lowercase too)
(define dna->num
  (let ([tbl (make-bytes 256 255)])
    (for ([ch (in-list (bytes->list #"ACGTacgt"))]
          [ii (in-list '(0 1 2 3 0 1 2 3))])
      (bytes-set! tbl ch ii))
    (lambda (ch) (bytes-ref tbl ch))))

;;; map a hash key back to a string (needed for printing)
(define (unhash key len)
  (let ([rval (make-string len)])
    (sequence-fold
      (lambda (key pos)
        (string-set! rval pos (string-ref "ACGT" (bitwise-and key 3)))
        (arithmetic-shift key -2))
      key
      (in-range len))
    rval))

;;; Ideally this would serve all-counts, but my attempt to do that
;;; was horribly slow.
(define (hashes keylen dna as-codes)
  (generator ()
    (let ([key 0] [ishift (* 2 keylen)] [thresh (sub1 keylen)])
      (for
        ([bb (in-bytes dna)]
         [ii (in-range (bytes-length dna))])
        (set! key (arithmetic-shift (+ key (arithmetic-shift (if as-codes bb (dna->num bb) ) ishift)) -2))
        (if (>= ii thresh) (yield key) #f))
      )))

(define (all-counts keylen dna)
  (let ([table (make-hasheq)]
        [key 0]
        [ishift (* 2 keylen)]
        )
    (for
      ([bb (in-bytes dna)]
       [ii (in-range (bytes-length dna))])
      (set! key (arithmetic-shift (+ key (arithmetic-shift bb ishift)) -2))
      (if (>= ii (- keylen 1)) (hash-set! table key (add1 (hash-ref table key 0))) #f)
      )
    table))

(define (readbuf in foo)
  (let ([s (open-output-bytes)])
    ;; Skip to ">THREE ..."
    (regexp-match #rx#"(?m:^>THREE.*$)" in)
    ;; Copy everything but newlines
    (for ([l (in-bytes-lines in)])
      (write-bytes l s))
    ;; Replace letters with numbers 0, 1, 2, 3
    (let ([actg (get-output-bytes s)])
      (for ([ii (in-range (bytes-length actg))])
           (bytes-set! actg ii (foo (bytes-ref actg ii))))
      actg)))

(define (write-freqs table len)
  (let* ([content (hash-map table cons)]
         [total (exact->inexact (apply + (map cdr content)))])
    (for ([a (sort content > #:key cdr)])
      (printf "~a ~a\n" 
              (unhash (car a) len)
              (real->decimal-string (* 100 (/ (cdr a) total)) 3)))))

(define (write-one-freq table key)
  (let ([cnt (hash-ref table ((hashes (bytes-length key) key #f)) 0)])
    (printf "~a\t~a\n" cnt key)))

(define dna (readbuf (current-input-port) dna->num))

(write-freqs (all-counts 1 dna) 1)
(newline)

(write-freqs (all-counts 2 dna) 2)
(newline)

;; Specific sequences:
(for ([seq '(#"GGT" #"GGTA" #"GGTATT" #"GGTATTTTAATT" #"GGTATTTTAATTTATAGT")]) 
  (write-one-freq (all-counts (bytes-length seq) dna) seq))
    

notes, command-line, and program output

NOTES:
64-bit Ubuntu quad core
Racket v8.14 [cs].


 Fri, 06 Sep 2024 18:34:44 GMT

MAKE:
make: *** No rule to make target 'knucleotide.racket-4.racket_run'.  Stop.

0.06s to complete and log all make actions

COMMAND LINE:
 /opt/src/./racket-8.14/bin/racket knucleotide.racket-4.racket 0 < knucleotide-input25000000.txt

PROGRAM OUTPUT:
A 30.295
T 30.151
C 19.800
G 19.754

AA 9.177
TA 9.132
AT 9.131
TT 9.091
CA 6.002
AC 6.001
AG 5.987
GA 5.984
CT 5.971
TC 5.971
GT 5.957
TG 5.956
CC 3.917
GC 3.911
CG 3.909
GG 3.902

1471758	GGT
446535	GGTA
47336	GGTATT
893	GGTATTTTAATT
893	GGTATTTTAATTTATAGT