source code
# The Computer Language Benchmarks Game
# https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
#
# Rewrite for regex-redux by Aaron Tavistock
# array not dictionary by Isaac Gouy
# Runs the regex-redux benchmark over one input stream.
#
# All work happens in the constructor, in this order: read the whole input,
# delete every '>' line and every newline, count the occurrences of each
# MATCHERS pattern, then apply the FINAL_TRANSFORM substitutions. #to_s
# renders the resulting report.
class RegexRedux
  # Patterns whose occurrences are counted; results are reported in this order.
  MATCHERS = [
    /agggtaaa|tttaccct/,
    /[cgt]gggtaaa|tttaccc[acg]/,
    /a[act]ggtaaa|tttacc[agt]t/,
    /ag[act]gtaaa|tttac[agt]ct/,
    /agg[act]taaa|ttta[agt]cct/,
    /aggg[acg]aaa|ttt[cgt]ccct/,
    /agggt[cgt]aa|tt[acg]accct/,
    /agggta[cgt]a|t[acg]taccct/,
    /agggtaa[cgt]|[acg]ttaccct/
  ].freeze

  # [pattern, replacement] pairs applied one after another to the sequence.
  # ruby 1.8.7: to iterate in-order use array not dictionary
  FINAL_TRANSFORM = [
    [/tHa[Nt]/, '<4>'],
    [/aND|caN|Ha[DS]|WaS/, '<3>'],
    [/a[NSt]|BY/, '<2>'],
    [/<[^>]*>/, '|'],
    [/\|[^|][^|]*\|/, '-']
  ].freeze

  # Reads all of +io+ and runs the three benchmark phases immediately.
  #
  # io - any object whose #read (called without arguments) returns the
  #      complete input as a String, e.g. STDIN or a StringIO.
  def initialize(io)
    # Slurp in one call instead of building an array of lines and re-joining.
    @seq = io.read
    @original_size = @seq.size
    @clean_size = remove_breaks!
    # Counting must happen before final_transform! rewrites @seq in place.
    @match_results = match_results
    @final_size = final_transform!
  end

  # Returns the report: one "<pattern source> <count>" line per matcher, an
  # empty line, then the original, cleaned and transformed sizes.
  def to_s
    format(
      "%s\n\n%d\n%d\n%d",
      @match_results.join("\n"),
      @original_size,
      @clean_size,
      @final_size
    )
  end

  # Counts the non-overlapping matches of +regex+ in the sequence and returns
  # the String "<regex source> <count>".
  def pattern_count(regex)
    count = 0
    # Block form of scan: count matches without collecting them in an array.
    @seq.scan(regex) { count += 1 }
    "#{regex.source} #{count}"
  end

  # Same result as the in-process count, but computed in a forked child
  # process and sent back to the parent through a pipe.
  #
  # Raises RuntimeError when the child exits unsuccessfully, instead of
  # quietly returning an empty result line.
  def forked_pattern_count(regex)
    reader, writer = IO.pipe
    pid = Process.fork do
      reader.close
      writer.write(original_pattern_count(regex))
    end
    # The parent must drop its write end, otherwise reader.read never sees EOF.
    writer.close
    # Drain the pipe before waiting so the child cannot block on a full pipe.
    results = reader.read
    # Reap the child; without this every fork is left behind as a zombie.
    _, status = Process.wait2(pid)
    raise "pattern count child failed for #{regex.source}" unless status.success?

    results
  ensure
    # Release both pipe ends on every path, including fork/read failures.
    [reader, writer].each { |pipe_end| pipe_end.close if pipe_end && !pipe_end.closed? }
  end

  # Route pattern_count through the forking variant only where fork can be
  # used: never on JRuby, and only if the platform implements fork at all.
  if RUBY_PLATFORM != 'java' && Process.respond_to?(:fork)
    alias_method :original_pattern_count, :pattern_count
    alias_method :pattern_count, :forked_pattern_count
  end

  # Deletes, in place, every '>' through the end of its line and every
  # newline. Returns the size of the sequence afterwards.
  def remove_breaks!
    @seq.gsub!(/>.*\n|\n/, '')
    @seq.size
  end

  # Counts every MATCHERS pattern, one thread per pattern, and returns the
  # result lines in MATCHERS order.
  def match_results
    # The first map starts all threads; Thread#value then joins each one and
    # hands back its block's result (re-raising if the thread raised).
    MATCHERS.map { |matcher| Thread.new { pattern_count(matcher) } }.map(&:value)
  end

  # Applies the FINAL_TRANSFORM substitutions in order, in place, and returns
  # the size of the sequence afterwards.
  def final_transform!
    FINAL_TRANSFORM.each { |pattern, replacement| @seq.gsub!(pattern, replacement) }
    @seq.size
  end
end
# Entry point: process everything on standard input and print the report.
$stdout.puts(RegexRedux.new(STDIN).to_s)
notes, command-line, and program output
NOTES:
64-bit Ubuntu quad core
ruby 3.4.0dev (2024-12-25 master f450108330) +YJIT +PRISM [x86_64-linux]
Thu, 26 Dec 2024 03:45:40 GMT
COMMAND LINE:
/opt/src/ruby-3.4.0/bin/ruby --yjit -W0 regexredux.ruby-3.ruby 0 < regexredux-input5000000.txt
PROGRAM OUTPUT:
agggtaaa|tttaccct 356
[cgt]gggtaaa|tttaccc[acg] 1250
a[act]ggtaaa|tttacc[agt]t 4252
ag[act]gtaaa|tttac[agt]ct 2894
agg[act]taaa|ttta[agt]cct 5435
aggg[acg]aaa|ttt[cgt]ccct 1537
agggt[cgt]aa|tt[acg]accct 1431
agggta[cgt]a|t[acg]taccct 1608
agggtaa[cgt]|[acg]ttaccct 2178
50833411
50000000
27388361