The Computer Language
24.04 Benchmarks Game

k-nucleotide Haskell GHC #2 program

source code

-- The Computer Language Benchmarks Game
-- https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
--
-- Contributed by cahu ette


module Main where

import Data.Bits
import Data.List
import Data.Word
import Data.Hashable
import Data.Traversable
import Text.Printf

import Data.STRef
import Control.Monad
import Control.Monad.ST
import Control.Parallel.Strategies


import qualified Data.Map.Strict           as M
import qualified Data.HashTable.Class      as HC
import qualified Data.HashTable.ST.Basic   as H
import qualified Data.ByteString.Char8     as B


type HashTable s k v = H.HashTable s k v


{- By using only 2 bits to encode keys, it's important to use a different table
 - for different key sizes. Otherwise, if we encode 'A' as 0x00, "AT" and
 - "AAT" would map to the same bucket in the table.
 -
 - We could use 3 bits per letter to avoid this problem if needed.
 -}
bitsForChar :: Char -> Word64
bitsForChar 'a' = 0
bitsForChar 'A' = 0
bitsForChar 'c' = 1
bitsForChar 'C' = 1
bitsForChar 'g' = 2
bitsForChar 'G' = 2
bitsForChar 't' = 3
bitsForChar 'T' = 3
bitsForChar _   = error "Ay, Caramba!"


charForBits :: Word64 -> Char
charForBits 0 = 'A'
charForBits 1 = 'C'
charForBits 2 = 'G'
charForBits 3 = 'T'
charForBits _ = error "Ay, Caramba!"


packKey :: B.ByteString -> Word64
packKey = go zeroBits
  where
    go k bs = case B.uncons bs of
        Nothing      -> k
        Just (c, cs) -> go (unsafeShiftL k 2 .|. bitsForChar c) cs
{-# INLINE packKey #-}

unpackKey :: Int -> Word64 -> B.ByteString
unpackKey = go []
  where
    go s 0 _ = B.pack s
    go s l i = go (charForBits (i .&. 3) : s) (l-1) (unsafeShiftR i 2)
{-# INLINE unpackKey #-}


updateTable :: (Eq k, Hashable k)
            => HashTable s k (STRef s Int)
            -> (Int -> Int)
            -> k
            -> ST s ()
updateTable t f k = do
    mv <- H.lookup t k
    case mv of
        Nothing -> newSTRef 1 >>= H.insert t k
        Just v  -> modifySTRef' v f
{-# INLINE updateTable #-}


getVal :: (Eq k, Hashable k)
       => HashTable s k (STRef s Int)
       -> k
       -> ST s Int
getVal t k = do
    mv <- H.lookup t k
    case mv of Nothing -> return 0
               Just sv -> readSTRef sv
--{-# INLINE getVal #-}


tableToList :: HashTable s k (STRef s a) -> ST s [(k, a)]
tableToList t = do
    pairs <- HC.toList t
    forM pairs $ \(k, v) -> do
        a <- readSTRef v
        return (k, a)


countOccurrences :: Int -> Int -> B.ByteString -> ST s (HashTable s Word64 (STRef s Int))
countOccurrences jumpSize frameSize input = do
    t <- H.new

    let go bs = unless (B.length bs < frameSize) $ do
            let k = takeFrame bs
            updateTable t (+1) (packKey k)
            go (nextFrame bs)

    go input
    return t

  where
    takeFrame = B.take frameSize
    nextFrame = B.drop jumpSize


extractSequence :: String -> B.ByteString -> B.ByteString
extractSequence s = findSeq
  where
    prefix = B.pack ('>' : s)
    skipSeq =
          B.dropWhile (/= '>')
        . B.drop 1
    takeSeq =
          B.filter    (/= '\n')
        . B.takeWhile (/=  '>') -- extract until next header
        . B.dropWhile (/= '\n') -- skip header
    findSeq str
        | prefix `B.isPrefixOf` str  =  takeSeq str
        | otherwise                  =  findSeq (skipSeq str)



main :: IO ()
main = do
    s <- extractSequence "THREE" <$> B.getContents

    let keys    = ["GGT","GGTA","GGTATT","GGTATTTTAATT","GGTATTTTAATTTATAGT"]
    let threads = [0 .. 63]

    let threadWorkOcc key tid = runST $ do
            t <- countOccurrences (length threads) (B.length key) (B.drop tid s)
            getVal t (packKey key)

    let calcOcc key = sum $ runEval $
            mapM (rpar . threadWorkOcc (B.pack key)) threads

    let threadWorkFreq len tid = runST $ do
            t  <- countOccurrences (length threads) len (B.drop tid s)
            vs <- tableToList t
            return $ map (\(k, v) -> (B.unpack (unpackKey len k), freq v)) vs
          where
            freq v = 100 * fromIntegral v / fromIntegral (B.length s - len + 1)

    let calcFreq len =
            let l = concat $ runEval $ mapM (rpar . threadWorkFreq len) threads
                m = foldr (uncurry $ M.insertWith (+)) M.empty l
            in
                M.toList m

    let resultsOcc = map (\k -> (k, calcOcc k)) keys

    printFreq (calcFreq 1)
    putStrLn ""
    printFreq (calcFreq 2)
    putStrLn ""
    forM_ resultsOcc $ \(k, r) -> printf "%d\t%s\n" r k

  where

    sortFreq = sortBy
        (\ (k :: String, v :: Double) (k', v') ->
            (compare v' v) `mappend` (compare k k'))

    printFreq :: [(String, Double)] -> IO ()
    printFreq l = forM_ (sortFreq l) $ uncurry (printf "%s %.3f\n")
        
    

notes, command-line, and program output

NOTES:
64-bit Ubuntu quad core
The Glorious Glasgow Haskell
Compilation System,
version 9.4.8
LLVM version 17.0.2


 Fri, 08 Mar 2024 00:10:44 GMT

MAKE:
mv knucleotide.ghc-2.ghc knucleotide.ghc-2.hs
~/.ghcup/bin/ghc --make -optlo-passes='module(default<O2>)' -O2 -fno-llvm-tbaa -XBangPatterns -threaded -rtsopts -funbox-strict-fields -XScopedTypeVariables -package hashable -package unordered-containers -package  pvar -package ghc-compact knucleotide.ghc-2.hs -o knucleotide.ghc-2.ghc_run
Loaded package environment from /home/dunham/.ghc/x86_64-linux-9.4.8/environments/default
[1 of 2] Compiling Main             ( knucleotide.ghc-2.hs, knucleotide.ghc-2.o )
[2 of 2] Linking knucleotide.ghc-2.ghc_run
rm knucleotide.ghc-2.hs

6.95s to complete and log all make actions

COMMAND LINE:
 ./knucleotide.ghc-2.ghc_run +RTS -N4 -K2048M -RTS 0 < knucleotide-input25000000.txt

PROGRAM OUTPUT:
A 30.295
T 30.151
C 19.800
G 19.754

AA 9.177
TA 9.132
AT 9.131
TT 9.091
CA 6.002
AC 6.001
AG 5.987
GA 5.984
CT 5.971
TC 5.971
GT 5.957
TG 5.956
CC 3.917
GC 3.911
CG 3.909
GG 3.902

1471758	GGT
446535	GGTA
47336	GGTATT
893	GGTATTTTAATT
893	GGTATTTTAATTTATAGT