The Q6600
Benchmarks Game

regex-redux Perl #3 program

source code

# The Computer Language Benchmarks Game
# https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
#
# contributed by A. Sinan Unur
#
# This version uses both fork and threads. The
# child does the matching work and parallelizes
# it using threads while the parent does the
# substitutions.
#
# Mixing threads and forks may not be good for
# your health, but here it is.

use threads;

use constant ITEMS_PER_THREAD => 3;

run();

sub count_matches {
    my $seq = shift;
    my $keys = shift;
    my $patterns = shift;

    my %results;
    for my $i (0 .. $#$patterns) {
        $results{ $keys->[$i] } = () = $seq =~ /$patterns->[$i]/g;
    }
    return %results;
}

sub run {

    my @variants = qw/
       agggtaaa|tttaccct
       [cgt]gggtaaa|tttaccc[acg]
       a[act]ggtaaa|tttacc[agt]t
       ag[act]gtaaa|tttac[agt]ct
       agg[act]taaa|ttta[agt]cct
       aggg[acg]aaa|ttt[cgt]ccct
       agggt[cgt]aa|tt[acg]accct
       agggta[cgt]a|t[acg]taccct
       agggtaa[cgt]|[acg]ttaccct
    /;

    my @variants_re = map qr/$_/iaa, @variants;

    my @iub = map { my $x = $_; sub { $_[0] =~ s/$x->[0]/$x->[1]/g }} (
        [ qr{ tHa [Nt] }x,                 '<4>' ],
        [ qr{ aND | caN | Ha[DS] | WaS }x, '<3>' ],
        [ qr{ a [NSt] | BY }x,             '<2>' ],
        [ qr{ < [^>]* > }x,                '|'   ],
        [ qr{ \| [^|] [^|]* \| }x,         '-'   ],
    );

    my $seq = do { local $/; <STDIN> };

    my $input_length = length( $seq );

    $seq =~ s/>.*\n|\n//g;

    my $cleaned_length = length( $seq );

    pipe(my $reader, my $writer)
        or die "Failed to set up pipe: $!";

    my $pid = fork;

    if (!$pid) {
        # we are in the child
        die "Failed to fork: $!" unless defined $pid;
        close $reader or die "Failed to close parent's reader in child: $!";

        my @threads = map threads->create(
            { context => 'list' },
            \&count_matches,
            $seq,
            [    @variants[$_ .. ($_ + (ITEMS_PER_THREAD - 1))] ],
            [ @variants_re[$_ .. ($_ + (ITEMS_PER_THREAD - 1))] ],
        ), grep !($_ % ITEMS_PER_THREAD), 0 .. $#variants ;

        my %results = map $_->join, @threads;

        print $writer "$_ $results{$_}\n" for @variants;

        close $writer or die "Failed to close child's writer in child: $!";
        exit( 0 );
    }
    else {
        # we are in the parent
        close $writer or die "Failed to close child's writer in parent: $!";

        # do our own work
        $_->($seq) for @iub;

        # print match counts from child
        print while <$reader>;

        close $reader or die "Failed to close parent's reader in parent: $!";

        waitpid($pid, 0);

        print "$_\n" for '', $input_length, $cleaned_length, length( $seq );
    }
}

    

notes, command-line, and program output

NOTES:
64-bit Ubuntu quad core
This is perl 5, version 30, subversion 0 (v5.30.0)
built for x86_64-linux-thread-multi


Sun, 10 May 2020 22:50:08 GMT

COMMAND LINE:
/opt/src/perl-5.30.0/bin/perl regexredux.perl-3.perl 0 < regexredux-input5000000.txt

PROGRAM OUTPUT:
agggtaaa|tttaccct 356
[cgt]gggtaaa|tttaccc[acg] 1250
a[act]ggtaaa|tttacc[agt]t 4252
ag[act]gtaaa|tttac[agt]ct 2894
agg[act]taaa|ttta[agt]cct 5435
aggg[acg]aaa|ttt[cgt]ccct 1537
agggt[cgt]aa|tt[acg]accct 1431
agggta[cgt]a|t[acg]taccct 1608
agggtaa[cgt]|[acg]ttaccct 2178

50833411
50000000
27388361