The Computer Language
Benchmarks Game

regex-redux Go #5 program

source code

/* The Computer Language Benchmarks Game
 * https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
 *
 * regex-dna program contributed by The Go Authors.
 * modified by Tylor Arndt.
 * modified by Chandra Sekar S to use optimized PCRE binding.
 * modified by Matt Dellandrea.
 * modified by Pavel Griaznov to use PCRE JIT compilation.
 */

package main

import (
    "fmt"
    "io/ioutil"
    "os"
    "runtime"

    "github.com/GRbit/go-pcre"
)

// substitution pairs a regular-expression pattern with the literal text
// that replaces each of its matches.
type substitution struct {
    pattern, replacement string
}

// countMatches returns how many non-overlapping matches of the PCRE pattern
// pat occur in b, scanning from left to right.
//
// The pattern is JIT-compiled on every call. MustCompileJIT is a Must-style
// helper, so an invalid pattern is expected to panic rather than yield an
// error value.
//
// After each hit the search restarts on the remainder of b, i.e. the regex
// engine sees the tail as a fresh subject.
func countMatches(pat string, b []byte) int {
    m := pcre.MustCompileJIT(pat, 0, pcre.STUDY_JIT_COMPILE).Matcher(b, 0)
    n := 0

    for ok := m.Matches; ok; ok = m.Match(b, 0) {
        n++

        loc := m.Index()
        next := loc[1]

        // A zero-length match does not consume any input. Without forcing
        // progress the same empty match would be reported forever, so step
        // one byte past it, or stop when it sits at the end of the input.
        if loc[0] == loc[1] {
            if next >= len(b) {
                break
            }
            next++
        }

        b = b[next:]
    }

    return n
}

// main reads a FASTA-style document from standard input, strips its header
// lines and newlines, and then prints:
//
//   - one line per variant pattern with the number of matches found,
//   - a blank line,
//   - the original input length, the stripped length, and the length after
//     all substitutions have been applied in order.
//
// Each variant is counted in its own goroutine; the substitutions run
// sequentially in one further goroutine. If standard input cannot be read,
// a message is written to standard error and the process exits with status 2.
func main() {
    variants := []string{
        "agggtaaa|tttaccct",
        "[cgt]gggtaaa|tttaccc[acg]",
        "a[act]ggtaaa|tttacc[agt]t",
        "ag[act]gtaaa|tttac[agt]ct",
        "agg[act]taaa|ttta[agt]cct",
        "aggg[acg]aaa|ttt[cgt]ccct",
        "agggt[cgt]aa|tt[acg]accct",
        "agggta[cgt]a|t[acg]taccct",
        "agggtaa[cgt]|[acg]ttaccct",
    }

    substs := []substitution{
        {"tHa[Nt]", "<4>"},
        {"aND|caN|Ha[DS]|WaS", "<3>"},
        {"a[NSt]|BY", "<2>"},
        {"<[^>]*>", "|"},
        {"\\|[^|][^|]*\\|", "-"},
    }

    runtime.GOMAXPROCS(runtime.NumCPU())

    data, err := ioutil.ReadAll(os.Stdin)
    if err != nil {
        fmt.Fprintf(os.Stderr, "can't read input: %s\n", err)
        os.Exit(2)
    }

    rawLen := len(data)

    // Drop every newline, together with the rest of any line that the
    // pattern sees starting at '>' (the sequence description lines).
    stripper := pcre.MustCompileJIT("(>[^\n]*)?\n", 0, pcre.STUDY_JIT_COMPILE)
    data = stripper.ReplaceAll(data, []byte{}, 0)
    strippedLen := len(data)

    // One unbuffered channel per variant keeps the results in input order
    // even though the counting goroutines finish in arbitrary order.
    counts := make([]chan int, len(variants))
    for i, v := range variants {
        out := make(chan int)
        counts[i] = out

        go func(ch chan int, s string) {
            ch <- countMatches(s, data)
        }(out, v)
    }

    finalLen := make(chan int)

    // The substitutions depend on each other's output, so they are applied
    // one after another on a private slice header.
    go func(buf []byte) {
        for _, sub := range substs {
            re := pcre.MustCompileJIT(sub.pattern, 0, pcre.STUDY_JIT_COMPILE)
            buf = re.ReplaceAll(buf, []byte(sub.replacement), 0)
        }
        finalLen <- len(buf)
    }(data)

    for i, v := range variants {
        fmt.Printf("%s %d\n", v, <-counts[i])
    }

    fmt.Printf("\n%d\n%d\n%d\n", rawLen, strippedLen, <-finalLen)
}
    

notes, command-line, and program output

NOTES:
64-bit Ubuntu quad core
go version go1.13 linux/amd64


Mon, 25 Nov 2019 00:42:43 GMT

MAKE:
/opt/src/go1.13.linux-amd64/go/bin/go build -o regexredux.go-5.go_run

9.20s to complete and log all make actions

COMMAND LINE:
./regexredux.go-5.go_run 0 < regexredux-input5000000.txt

PROGRAM OUTPUT:
agggtaaa|tttaccct 356
[cgt]gggtaaa|tttaccc[acg] 1250
a[act]ggtaaa|tttacc[agt]t 4252
ag[act]gtaaa|tttac[agt]ct 2894
agg[act]taaa|ttta[agt]cct 5435
aggg[acg]aaa|ttt[cgt]ccct 1537
agggt[cgt]aa|tt[acg]accct 1431
agggta[cgt]a|t[acg]taccct 1608
agggtaa[cgt]|[acg]ttaccct 2178

50833411
50000000
27388361