The Computer Language
Benchmarks Game

regex-redux Ruby #3 program

source code

# The Computer Language Benchmarks Game
# https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
#
# Rewrite for regex-redux by Aaron Tavistock

# Reads a sequence file, counts occurrences of a fixed set of patterns in it,
# applies a fixed chain of substitutions, and reports the counts together with
# the sequence length at three stages (raw, cleaned, transformed).
#
# All work happens in the constructor; #to_s only formats the stored results.
class RegexRedux

  # Patterns whose occurrences in the cleaned sequence are counted.
  # Results are reported in this order.
  MATCHERS = [
    /agggtaaa|tttaccct/,
    /[cgt]gggtaaa|tttaccc[acg]/,
    /a[act]ggtaaa|tttacc[agt]t/,
    /ag[act]gtaaa|tttac[agt]ct/,
    /agg[act]taaa|ttta[agt]cct/,
    /aggg[acg]aaa|ttt[cgt]ccct/,
    /agggt[cgt]aa|tt[acg]accct/,
    /agggta[cgt]a|t[acg]taccct/,
    /agggtaa[cgt]|[acg]ttaccct/
  ].freeze

  # Pattern => replacement pairs applied to the sequence one after another.
  # Hash insertion order is the application order, and later patterns operate
  # on the output of earlier ones.
  FINAL_TRANSFORM = {
    /tHa[Nt]/ => '<4>',
    /aND|caN|Ha[DS]|WaS/ => '<3>',
    /a[NSt]|BY/ => '<2>',
    /<[^>]*>/ => '|',
    /\|[^|][^|]*\|/ => '-'
  }.freeze

  # Reads everything from +io+ and runs the whole pipeline.
  #
  # @param io [#read] input source, e.g. STDIN or a StringIO
  def initialize(io)
    @seq = io.read
    @original_size = @seq.size
    @clean_size = remove_breaks!
    # Counting must happen before final_transform! rewrites @seq in place.
    @match_results = match_results
    @final_size = final_transform!
  end

  # @return [String] one "<pattern> <count>" line per matcher, a blank line,
  #   then the original, cleaned and transformed sizes on separate lines
  def to_s
    "%s\n\n%d\n%d\n%d" % [
      @match_results.join("\n"),
      @original_size,
      @clean_size,
      @final_size
    ]
  end

  # Counts non-overlapping matches of +regex+ in the current sequence.
  #
  # @param regex [Regexp]
  # @return [String] "<regex source> <count>"
  def pattern_count(regex)
    count = 0
    # Block form of scan avoids building an array of every match.
    @seq.scan(regex) { count += 1 }
    "#{regex.source} #{count}"
  end

  # Same contract as the in-process pattern_count, but the scan runs in a
  # forked child process and the result line is sent back through a pipe.
  # Only usable once pattern_count has been aliased to original_pattern_count
  # (see the alias block below).
  #
  # @param regex [Regexp]
  # @return [String] "<regex source> <count>"
  def forked_pattern_count(regex)
    reader, writer = IO.pipe
    pid = Process.fork do
      reader.close
      writer.write(original_pattern_count(regex))
      writer.close
    end

    # The parent must drop its copy of the write end, otherwise read below
    # would never see end-of-file.
    writer.close
    begin
      reader.read
    ensure
      reader.close
      # Reap the child so it does not linger as a zombie process.
      Process.wait(pid)
    end
  end

  # Route pattern_count through a child process where forking is possible, so
  # the scans started by match_results are not limited to one interpreter.
  # Skipped on JRuby and TruffleRuby, and wherever Process.fork is not
  # implemented on the platform.
  if RUBY_PLATFORM != 'java' && RUBY_ENGINE != 'truffleruby' && Process.respond_to?(:fork)
    alias_method :original_pattern_count, :pattern_count
    alias_method :pattern_count, :forked_pattern_count
  end

  # Strips every ">..." header line and every newline from the sequence,
  # modifying it in place.
  #
  # @return [Integer] size of the sequence after cleaning
  def remove_breaks!
    @seq.gsub!(/>.*\n|\n/, '')
    @seq.size
  end

  # Runs pattern_count for every matcher, one thread per matcher.
  #
  # @return [Array<String>] result lines in MATCHERS order
  def match_results
    workers = MATCHERS.map do |matcher|
      Thread.new { pattern_count(matcher) }
    end
    # Thread#value joins the thread and returns its block's result.
    workers.map(&:value)
  end

  # Applies every FINAL_TRANSFORM substitution to the sequence in place.
  #
  # @return [Integer] size of the sequence after all substitutions
  def final_transform!
    # gsub! returns nil when nothing matched, so its result is ignored.
    FINAL_TRANSFORM.each { |pattern, replacement| @seq.gsub!(pattern, replacement) }
    @seq.size
  end

end

# Process everything on standard input and print the formatted report.
report = RegexRedux.new(STDIN).to_s
$stdout.puts(report)

    
    

notes, command-line, and program output

NOTES:
64-bit Ubuntu quad core
ruby 2.6.0preview3 (2018-11-06 trunk 65578) [x86_64-linux]


Thu, 08 Nov 2018 04:36:28 GMT

COMMAND LINE:
/opt/src/ruby-2.6.0-preview3/bin/ruby --jit -W0 regexredux.yarv-3.yarv 0 < regexredux-input5000000.txt

PROGRAM OUTPUT:
agggtaaa|tttaccct 356
[cgt]gggtaaa|tttaccc[acg] 1250
a[act]ggtaaa|tttacc[agt]t 4252
ag[act]gtaaa|tttac[agt]ct 2894
agg[act]taaa|ttta[agt]cct 5435
aggg[acg]aaa|ttt[cgt]ccct 1537
agggt[cgt]aa|tt[acg]accct 1431
agggta[cgt]a|t[acg]taccct 1608
agggtaa[cgt]|[acg]ttaccct 2178

50833411
50000000
27388361