The Computer Language
24.12 Benchmarks Game

regex-redux Perl #5 program

source code

# The Computer Language Benchmarks Game
# https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
#
# contributed by Eelko de Vos, 2021
#
# This version splits up the first match counts into forked processes,
# then continues to do the "hard" search-replaces and when that ends
# starts to look for the answers of the childs to the first matches.
#
# It's all file-based, no pipes used here. That could be a small
# optimization but it won't make the program significantly faster.


use strict;

# Hold my answers, please
my @length;

# Take in the entire file
my $file_in=do { local $/; <STDIN> };
$length[0]=length($file_in)-1;


$file_in=~s/\>.*?\n|\n//gs;
$length[1]=length($file_in);


my @match_these = (   
    'agggtaaa|tttaccct', 
    '[cgt]gggtaaa|tttaccc[acg]', 
    'a[act]ggtaaa|tttacc[agt]t', 
    'ag[act]gtaaa|tttac[agt]ct', 
    'agg[act]taaa|ttta[agt]cct', 
    'aggg[acg]aaa|ttt[cgt]ccct', 
    'agggt[cgt]aa|tt[acg]accct', 
    'agggta[cgt]a|t[acg]taccct', 
    'agggtaa[cgt]|[acg]ttaccct');



# create all answers, in forked children
my $cntr=0;
foreach my $search (@match_these) {
    # remove all counter files if they are still present
    unlink "$cntr.dat";

    if (!fork()) {
	my @matches=($file_in=~/$search/gs);
	my $count=scalar(@matches);

	open W,">$cntr.dat";
	print W "$search\t$count\n";
	close W;
	
	exit(0);
    }
    $cntr++;
}

# meanwhile, start processing the last answers
my %search_replace = (
    'tHa[Nt]' => '<4>',
    'aND|caN|Ha[DS]|WaS' => '<3>',
    'a[NSt]|BY' => '<2>',
    '<[^>]*>' => '|',
    '\\|[^\|][^\|]*\\|' => '-' );

# We need this exact order so we need to explicitely again set these
# keys here:
my @search_replace = (
    'tHa[Nt]',
    'aND|caN|Ha[DS]|WaS',
    'a[NSt]|BY',
    '<[^>]*>',
    '\\|[^\|][^\|]*\\|');

# Now do the costly search-replaces
foreach my $key (@search_replace) {
    $file_in=~s/$key/$search_replace{$key}/gs;
}

# All done!
# Now wait for and print results of the childs using a busy wait
for (my $cntr=0; $cntr<scalar(@match_these); $cntr++) {
    while (!-f "$cntr.dat") {
    }
    my $str="";
    while ($str eq "") {
	open F,"<$cntr.dat";
	$str = join("",<F>);
	close F;
    }
    print $str;
}
$length[2]=length($file_in);


# print the results...
print "\n".$length[0]."\n";
print $length[1]."\n";
print $length[2]."\n";
    

notes, command-line, and program output

NOTES:
64-bit Ubuntu quad core
This is perl 5, version 40
subversion 0 (v5.40.0)
x86_64-linux-thread-multi


 Mon, 10 Jun 2024 19:27:25 GMT

COMMAND LINE:
 /opt/src/perl-5.40.0/bin/perl regexredux.perl-5.perl 0 < regexredux-input50000.txt

UNEXPECTED OUTPUT 

1,9c1,9
< agggtaaa|tttaccct	3
< [cgt]gggtaaa|tttaccc[acg]	12
< a[act]ggtaaa|tttacc[agt]t	43
< ag[act]gtaaa|tttac[agt]ct	27
< agg[act]taaa|ttta[agt]cct	58
< aggg[acg]aaa|ttt[cgt]ccct	16
< agggt[cgt]aa|tt[acg]accct	15
< agggta[cgt]a|t[acg]taccct	18
< agggtaa[cgt]|[acg]ttaccct	20
---
> agggtaaa|tttaccct 3
> [cgt]gggtaaa|tttaccc[acg] 12
> a[act]ggtaaa|tttacc[agt]t 43
> ag[act]gtaaa|tttac[agt]ct 27
> agg[act]taaa|ttta[agt]cct 58
> aggg[acg]aaa|ttt[cgt]ccct 16
> agggt[cgt]aa|tt[acg]accct 15
> agggta[cgt]a|t[acg]taccct 18
> agggtaa[cgt]|[acg]ttaccct 20
11c11
< 508410
---
> 508411

PROGRAM OUTPUT:
agggtaaa|tttaccct	3
[cgt]gggtaaa|tttaccc[acg]	12
a[act]ggtaaa|tttacc[agt]t	43
ag[act]gtaaa|tttac[agt]ct	27
agg[act]taaa|ttta[agt]cct	58
aggg[acg]aaa|ttt[cgt]ccct	16
agggt[cgt]aa|tt[acg]accct	15
agggta[cgt]a|t[acg]taccct	18
agggtaa[cgt]|[acg]ttaccct	20

508410
500000
273927