source code
# The Computer Language Benchmarks Game
# https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
#
# regex-dna program contributed by Dominique Wahli
# 2to3
# mp by Ahmad Syukri
# modified by Justin Peel
# converted from regex-dna program
from sys import stdin
from re import sub, findall
from multiprocessing import Pool
def init(arg: str) -> None:
    """Pool initializer: store the sequence in this process's globals.

    main() passes this function as the Pool ``initializer`` with the
    cleaned sequence as its argument, so the text is handed to each worker
    once instead of with every task.

    Args:
        arg: The sequence text that var_find() will later search.
    """
    global seq
    seq = arg
def var_find(f: str) -> int:
    """Count the non-overlapping matches of pattern *f* in the global ``seq``.

    Reads the module-level ``seq`` that init() sets, so init() must have
    run in the current process before this is called.

    Args:
        f: A regular-expression pattern.

    Returns:
        The number of matches ``re.findall`` reports for *f* in ``seq``.
    """
    return len(findall(f, seq))
def main():
    """Run the regex-redux benchmark on the text read from stdin.

    Steps:
      1. Read all of stdin and record its length.
      2. Remove every ``>``-prefixed header line and every newline, then
         record the cleaned length.
      3. Count the matches of each variant pattern in parallel and print
         ``pattern count`` lines in the order the patterns are listed.
      4. Apply the substitutions one after another and print a blank line
         followed by the three lengths (input, cleaned, substituted).
    """
    seq = stdin.read()
    ilen = len(seq)

    # Drop ">..." header lines (including their newline) and all remaining
    # newlines so that matches can span what used to be line breaks.
    seq = sub('>.*\n|\n', '', seq)
    clen = len(seq)

    variants = (
        'agggtaaa|tttaccct',
        '[cgt]gggtaaa|tttaccc[acg]',
        'a[act]ggtaaa|tttacc[agt]t',
        'ag[act]gtaaa|tttac[agt]ct',
        'agg[act]taaa|ttta[agt]cct',
        'aggg[acg]aaa|ttt[cgt]ccct',
        'agggt[cgt]aa|tt[acg]accct',
        'agggta[cgt]a|t[acg]taccct',
        'agggtaa[cgt]|[acg]ttaccct',
    )

    # The sequence is handed to each worker once through the initializer;
    # only the short pattern strings travel with the individual tasks.
    # The with-block shuts the worker processes down once every count has
    # been consumed, instead of leaving the pool open for the rest of the run.
    with Pool(initializer=init, initargs=(seq,)) as pool:
        # imap yields results in input order, so zip pairs each pattern
        # with its own count.
        for pattern, count in zip(variants, pool.imap(var_find, variants)):
            print(pattern, count)

    # Order matters: each substitution runs on the output of the previous
    # one (the last two patterns match text produced by the first three).
    substitutions = (
        ('tHa[Nt]', '<4>'),
        ('aND|caN|Ha[DS]|WaS', '<3>'),
        ('a[NSt]|BY', '<2>'),
        ('<[^>]*>', '|'),
        (r'\|[^|][^|]*\|', '-'),
    )
    for pattern, replacement in substitutions:
        seq = sub(pattern, replacement, seq)

    print()
    print(ilen)
    print(clen)
    print(len(seq))
# Run the benchmark only when executed as a script. The guard also keeps
# main() from running again if a multiprocessing worker imports this module.
if __name__ == "__main__":
    main()
notes, command-line, and program output
NOTES:
64-bit Ubuntu quad core
Python 3.8.0
Mon, 11 May 2020 02:49:14 GMT
MAKE:
mv regexredux.python3 regexredux.py
mypy .
Success: no issues found in 1 source file
5.93s to complete and log all make actions
COMMAND LINE:
/opt/src/Python-3.8.0/bin/python3 -OO regexredux.py 0 < regexredux-input5000000.txt
PROGRAM OUTPUT:
agggtaaa|tttaccct 356
[cgt]gggtaaa|tttaccc[acg] 1250
a[act]ggtaaa|tttacc[agt]t 4252
ag[act]gtaaa|tttac[agt]ct 2894
agg[act]taaa|ttta[agt]cct 5435
aggg[acg]aaa|ttt[cgt]ccct 1537
agggt[cgt]aa|tt[acg]accct 1431
agggta[cgt]a|t[acg]taccct 1608
agggtaa[cgt]|[acg]ttaccct 2178

50833411
50000000
27388361