source code
-- The Computer Language Benchmarks Game
-- https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
--
-- contributed by Jaro Reinders
--
-- adapted from the C implementation that was:
-- contributed by Jeremy Zerfas
-- modified by Zoltan Herczeg
import Foreign.C.Types
import qualified Data.Vector.Storable as V
import qualified Data.Vector.Storable.Mutable as M
import Foreign.Ptr
import Foreign.Marshal.Alloc
import Data.Word
import Control.Monad
import Foreign.Storable
import System.IO
import Control.Concurrent
import Data.Foldable
import Data.Char
-- first some foreign imports (Haskell does not have a proper pcre2 binding!)
foreign import capi "pcre2.h value PCRE2_JIT_COMPLETE"
c_PCRE2_JIT_COMPLETE :: CUInt
foreign import ccall "pcre2.h pcre2_get_ovector_pointer_8"
c_pcre2_get_ovector_pointer :: Ptr MatchData -> IO (Ptr CSize)
foreign import ccall "pcre2.h pcre2_compile_8"
c_pcre2_compile :: Ptr CChar -> CSize -> CInt -> Ptr CInt -> Ptr CSize
-> Ptr () -> IO (Ptr Code)
foreign import ccall "pcre2.h pcre2_jit_compile_8"
c_pcre2_jit_compile :: Ptr Code -> CUInt -> IO ()
-- This one is marked unsafe for extra performance.
-- See https://wiki.haskell.org/Performance/FFI
foreign import ccall unsafe "pcre2.h pcre2_jit_match_8"
c_pcre2_jit_match :: Ptr Code -> Ptr CChar -> CSize -> CSize -> CUInt
-> Ptr MatchData -> Ptr MatchContext -> IO CInt
foreign import ccall "pcre2.h pcre2_code_free_8"
c_pcre2_code_free :: Ptr Code -> IO ()
foreign import ccall "pcre2.h pcre2_match_context_create_8"
c_pcre2_match_context_create :: Ptr GeneralContext -> IO (Ptr MatchContext)
foreign import ccall "pcre2.h pcre2_jit_stack_create_8"
c_pcre2_jit_stack_create :: CSize -> CSize -> Ptr () -> IO (Ptr JitStack)
foreign import ccall "pcre2.h pcre2_jit_stack_assign_8"
c_pcre2_jit_stack_assign :: Ptr MatchContext -> Ptr () -> Ptr JitStack
-> IO ()
foreign import ccall "pcre2.h pcre2_match_data_create_8"
c_pcre2_match_data_create :: CUInt -> Ptr GeneralContext
-> IO (Ptr MatchData)
foreign import ccall "pcre2.h pcre2_match_context_free_8"
c_pcre2_match_context_free :: Ptr MatchContext -> IO ()
foreign import ccall "pcre2.h pcre2_jit_stack_free_8"
c_pcre2_jit_stack_free :: Ptr JitStack -> IO ()
foreign import ccall "pcre2.h pcre2_match_data_free_8"
c_pcre2_match_data_free :: Ptr MatchData -> IO ()
data MatchData
data MatchContext
data Code
data GeneralContext
data JitStack
data GrowString = GrowString
{-# UNPACK #-} !(M.IOVector Word8)
{-# UNPACK #-} !Int
-- Freeze and trim
freezeGrowString :: GrowString -> IO (V.Vector Word8)
freezeGrowString (GrowString dat siz) = V.unsafeFreeze (M.slice 0 siz dat)
-- Function for searching a srcString for a pattern, replacing it with some
-- specified replacement, and storing the result in dstString.
--
-- dstString might be reallocated so this function returns a new GrowString.
-- For optimal performance you should not use the old GrowString anymore.
replace :: V.Vector Word8 -> V.Vector Word8 -> V.Vector Word8
-> GrowString -> Ptr MatchContext -> Ptr MatchData -> IO GrowString
replace !pattern !replacement !srcString !dstString !mcontext !mdata =
alloca $ \errorCode -> alloca $ \errorOffset -> do
match <- c_pcre2_get_ovector_pointer mdata
-- Compile and study pattern.
regex <- V.unsafeWith pattern $ \patternPtr -> c_pcre2_compile
(castPtr patternPtr) (fromIntegral (V.length pattern)) 0 errorCode
errorOffset nullPtr
c_pcre2_jit_compile regex c_PCRE2_JIT_COMPLETE
-- Find each match of the pattern in srcString and append the characters
-- preceding each match and the replacement text to dstString.
let
go !pos !dstString = do
!x <- V.unsafeWith srcString $ \srcStringPtr ->
c_pcre2_jit_match regex (castPtr srcStringPtr)
(fromIntegral srcStringLen) (fromIntegral pos) 0 mdata mcontext
if (x >= 0) then do
!match0 <- fromIntegral <$> peekElemOff match 0
-- Allocate more memory for dstString if there is not enough space
-- for the characters preceding the match and the replacement text.
let
growLoop str@(GrowString !dat !siz)
| siz + match0 - pos + replacementSize > M.length dat = do
!dat' <- M.grow dat (M.length dat) :: IO (M.IOVector Word8)
growLoop (GrowString dat' siz)
| otherwise = return str
(GrowString dat siz) <- growLoop dstString
-- Append the characters preceding the match and the replacement text
-- to dstString and update the size of dstString.
let
!siz' = siz + match0 - pos
!siz'' = siz' + replacementSize
V.copy (M.slice siz (match0 - pos) dat)
(V.slice pos (match0 - pos) srcString)
V.copy (M.slice siz' replacementSize dat) replacement
-- Find the new pos to continue after the current match.
!pos' <- fromIntegral <$> peekElemOff match 1
go pos' (GrowString dat siz'')
else return (pos, dstString)
(!pos, !dstString') <- go 0 dstString
c_pcre2_code_free regex
-- Allocate more memory for dstString if there is not enough space for the
-- characters following the last match (or the entire srcString if there
-- was no match).
let
growLoop str@(GrowString !dat !siz)
| siz + srcStringLen - pos > M.length dat = do
dat' <- M.grow dat (M.length dat) :: IO (M.IOVector Word8)
growLoop (GrowString dat' siz)
| otherwise = return str
(GrowString dat siz) <- growLoop dstString'
-- Append the characters following the last match (or the entire srcString
-- if there was no match) to dstString and update the size of dstString.
V.copy (M.slice siz (srcStringLen - pos) dat)
(V.slice pos (srcStringLen - pos) srcString)
return (GrowString dat (siz + srcStringLen - pos))
where
srcStringLen = V.length srcString
replacementSize = V.length replacement
main :: IO ()
main = do
let
f = V.fromList . map (fromIntegral . ord)
countInfo = map f
[ "agggtaaa|tttaccct"
, "[cgt]gggtaaa|tttaccc[acg]"
, "a[act]ggtaaa|tttacc[agt]t"
, "ag[act]gtaaa|tttac[agt]ct"
, "agg[act]taaa|ttta[agt]cct"
, "aggg[acg]aaa|ttt[cgt]ccct"
, "agggt[cgt]aa|tt[acg]accct"
, "agggta[cgt]a|t[acg]taccct"
, "agggtaa[cgt]|[acg]ttaccct"
]
replaceInfo = map (\(a,b) -> (f a, f b))
[ ("tHa[Nt]", "<4>")
, ("aND|caN|Ha[DS]|WaS", "<3>")
, ("a[NSt]|BY", "<2>")
, ("<[^>]*>", "|")
, ("\\|[^|][^|]*\\|", "-")
]
input <- (\x -> GrowString x 0) <$> M.new 16384
sequences <- (\x -> GrowString x 0) <$> M.new 16384
-- Read in input from stdin until we reach the end or encounter an error.
let
readLoop (GrowString !dat !siz) = do
bytesRead <- M.unsafeWith (M.slice siz (M.length dat - siz) dat)
$ \inputPtr -> hGetBuf stdin inputPtr (M.length dat - siz)
if bytesRead > 0 then do
-- update the size of input to reflect the newly read input and if
-- we've reached the full capacity of the input string then also double
-- its size.
dat' <- if (siz + bytesRead == M.length dat)
then M.grow dat (M.length dat) :: IO (M.IOVector Word8)
else return dat
readLoop (GrowString dat' (siz + bytesRead))
else return (GrowString dat siz)
input' <- freezeGrowString =<< readLoop input
let !inputSiz = V.length input'
let
threadInit = do
mcontext <- c_pcre2_match_context_create nullPtr
stack <- c_pcre2_jit_stack_create 16384 16384 nullPtr
c_pcre2_jit_stack_assign mcontext nullPtr stack
mdata <- c_pcre2_match_data_create 16 nullPtr
return (mcontext, stack, mdata)
(mcontext, stack, mdata) <- threadInit
-- Find all sequence descriptions and new lines in input, replace them with
-- empty strings, and store the result in the sequences string.
sequences'@(GrowString seqDat seqSiz) <-
replace (f ">.*\\n|\\n") (f "") input' sequences mcontext mdata
-- Work on performing all the replacements serially.
replaceVar <- newEmptyMVar
-- Fork this thread explicitely to capability 0 to discourage the scheduler
-- from interrupting this thread.
forkOn 0 $ do
-- We'll use two strings when doing all the replacements, searching for
-- patterns in prereplaceString and using postreplaceString to store the
-- string after the replacements have been made. After each iteration these
-- two then get swapped. Start out with both strings having the same
-- capacity as the sequences string and also copy the sequences string into
-- prereplaceString for the initial iteration.
prereplaceString <- (\x -> GrowString x seqSiz) <$> M.clone seqDat
postreplaceString <- (\x -> GrowString x 0) <$> M.new (M.length seqDat)
-- Iterate through all the replacement patterns and their replacements in
-- replaceInfo
let
cons (pre@(GrowString dat _),post) (a,b) = do
dat' <- freezeGrowString pre
post' <- replace a b dat' post mcontext mdata
let pre' = (GrowString dat 0)
-- Swap pre and post in the next iteration.
return (post', pre')
-- If any replacements were made, they'll be in the fst element of the
-- tuple instead of the second because of the swap done at the end of each
-- iteration.
(GrowString _ !siz, _) <- foldlM cons
(prereplaceString, postreplaceString) replaceInfo
c_pcre2_match_context_free mcontext
c_pcre2_jit_stack_free stack
c_pcre2_match_data_free mdata
putMVar replaceVar siz
-- Iterate through all the count patterns in countInfo and perform the
-- counting for each one on a different thread if available.
first <- newMVar ()
rest <- replicateM (length countInfo) newEmptyMVar
for_ (zip3 countInfo (first : rest) rest) $ \(pattern, prev, next) ->
forkIO $ do
(mcontext, stack, mdata) <- threadInit
match <- c_pcre2_get_ovector_pointer mdata
-- Compile and study pattern.
regex <- alloca $ \errorCode -> alloca $ \errorOffset ->
V.unsafeWith pattern $ \patternPtr -> c_pcre2_compile
(castPtr patternPtr) (fromIntegral (V.length pattern)) 0 errorCode
errorOffset nullPtr
c_pcre2_jit_compile regex c_PCRE2_JIT_COMPLETE
-- Find each match of the pattern in the sequences string and increment
-- count for each match.
let
go !count !pos = do
x <- M.unsafeWith dat $ \datPtr -> c_pcre2_jit_match regex
(castPtr datPtr) (fromIntegral siz) pos 0 mdata mcontext
if x >= 0 then do
-- Find the new pos to continue searching after the current match.
pos' <- peekElemOff match 1
go (count + 1) pos'
else return count
where
(GrowString dat siz) = sequences'
count <- go 0 0
c_pcre2_code_free regex
-- Print the count for each pattern in the correct order.
takeMVar prev
V.unsafeWith pattern $ \patternPtr ->
hPutBuf stdout patternPtr (V.length pattern)
putStr " "
print count
putMVar next ()
c_pcre2_match_context_free mcontext
c_pcre2_jit_stack_free stack
c_pcre2_match_data_free mdata
siz <- takeMVar (replaceVar)
takeMVar (last rest)
-- Print the size of the original input, the size of the input without the
-- sequence descriptions & new lines, and the size after having made all the
-- replacements.
putStrLn ""
print inputSiz
print seqSiz
print siz
notes, command-line, and program output
NOTES:
64-bit Ubuntu quad core
The Glorious Glasgow Haskell
Compilation System,
version 9.10.1
LLVM version 18.1.3
Thu, 01 Aug 2024 19:43:57 GMT
MAKE:
mv regexredux.ghc-3.ghc regexredux.ghc-3.hs
~/.ghcup/bin/ghc --make -fllvm -O2 -XBangPatterns -threaded -rtsopts -XForeignFunctionInterface -XCApiFFI -lpcre2-8 -optc "-DPCRE2_CODE_UNIT_WIDTH=8" regexredux.ghc-3.hs -o regexredux.ghc-3.ghc_run
Loaded package environment from /home/dunham/.ghc/x86_64-linux-9.10.1/environments/default
[1 of 2] Compiling Main ( regexredux.ghc-3.hs, regexredux.ghc-3.o )
[2 of 2] Linking regexredux.ghc-3.ghc_run
rm regexredux.ghc-3.hs
20.59s to complete and log all make actions
COMMAND LINE:
./regexredux.ghc-3.ghc_run +RTS -N4 -H250M -RTS 0 < regexredux-input5000000.txt
PROGRAM OUTPUT:
agggtaaa|tttaccct 356
[cgt]gggtaaa|tttaccc[acg] 1250
a[act]ggtaaa|tttacc[agt]t 4252
ag[act]gtaaa|tttac[agt]ct 2894
agg[act]taaa|ttta[agt]cct 5435
aggg[acg]aaa|ttt[cgt]ccct 1537
agggt[cgt]aa|tt[acg]accct 1431
agggta[cgt]a|t[acg]taccct 1608
agggtaa[cgt]|[acg]ttaccct 2178
50833411
50000000
27388361