source code
package main
/* The Computer Language Benchmarks Game
https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
Contributed by Dean Becker
*/
import (
"io/ioutil"
"os"
"fmt"
"regexp"
)
// subst pairs a compiled pattern with the bytes substituted for each of its
// matches.
type subst struct {
	// re is the compiled pattern to search for.
	re *regexp.Regexp
	// replacementString is written in place of every match of re.
	replacementString []byte
}
// variant couples a compiled pattern with the result recorded for it.
type variant struct {
	// re is the compiled pattern to run against the input.
	re *regexp.Regexp
	// result stores the outcome of running re (zero until it is set).
	result int
}
var (
variants []*variant
substitutions []*subst
bytes []byte
originalLen int
cleanedLen int
cleanRE *subst
)
// main drives the whole run: it reads the input and compiles the patterns
// concurrently, cleans the input with cleanRE, counts every variant and runs
// the substitution chain in parallel, and finally prints the per-variant
// counts followed by the original, cleaned and substituted lengths.
func main() {
	doneCh := make(chan int)

	// Initialize concurrently; each routine sends once on doneCh when it has
	// finished, so two receives wait for both.
	go loadFile(doneCh)
	go initRegexes(doneCh)
	<-doneCh
	<-doneCh

	// Clean the input.
	bytes = cleanRE.re.ReplaceAllLiteral(bytes, cleanRE.replacementString)
	cleanedLen = len(bytes)

	// Since the substitution chain takes longest, start it first.
	finalLen := make(chan int)
	go func() {
		// ReplaceAll returns a newly built slice and never writes to its
		// source, so the shared input can be read here directly while the
		// variant routines scan it; no defensive copy is needed.
		bb := bytes
		for _, sub := range substitutions {
			bb = sub.re.ReplaceAll(bb, sub.replacementString)
		}
		finalLen <- len(bb)
	}()

	// Variant routines.
	for _, v := range variants {
		go countVariants(doneCh, v)
	}
	// Await all variant results so they can be printed in declaration order.
	for range variants {
		<-doneCh
	}
	for _, v := range variants {
		fmt.Printf("%s %d\n", v.re.String(), v.result)
	}

	// Receiving from finalLen blocks until the substitution chain is done.
	fmt.Printf("\n%d\n%d\n%d\n", originalLen, cleanedLen, <-finalLen)
}
// loadFile reads all of stdin into the package-level bytes, records its length
// in originalLen and then sends once on doneCh. If reading fails it reports
// the error on stderr and terminates the process with exit status 2.
func loadFile(doneCh chan int) {
	input, readErr := ioutil.ReadAll(os.Stdin)
	if readErr != nil {
		fmt.Fprintf(os.Stderr, "can't read input: %s\n", readErr)
		os.Exit(2)
	}

	// Publish the data and its length before signalling completion.
	bytes, originalLen = input, len(input)
	doneCh <- 1
}
// countVariants finds all successive matches of v.re in the package-level
// bytes, stores how many there are in v.result and then sends once on doneCh.
func countVariants(doneCh chan int, v *variant) {
	matches := v.re.FindAll(bytes, -1) // -1 means no limit on the number of matches
	v.result = len(matches)
	doneCh <- 1
}
// initRegexes compiles every pattern the program uses, stores them in the
// package-level variants, substitutions and cleanRE, and then sends once on
// doneCh to signal completion.
//
// regexp.MustCompile panics on an invalid pattern; every pattern here is a
// fixed literal.
func initRegexes(doneCh chan int) {
	// Patterns whose matches are counted; result starts at zero for each.
	variants = []*variant{
		{re: regexp.MustCompile("agggtaaa|tttaccct")},
		{re: regexp.MustCompile("[cgt]gggtaaa|tttaccc[acg]")},
		{re: regexp.MustCompile("a[act]ggtaaa|tttacc[agt]t")},
		{re: regexp.MustCompile("ag[act]gtaaa|tttac[agt]ct")},
		{re: regexp.MustCompile("agg[act]taaa|ttta[agt]cct")},
		{re: regexp.MustCompile("aggg[acg]aaa|ttt[cgt]ccct")},
		{re: regexp.MustCompile("agggt[cgt]aa|tt[acg]accct")},
		{re: regexp.MustCompile("agggta[cgt]a|t[acg]taccct")},
		{re: regexp.MustCompile("agggtaa[cgt]|[acg]ttaccct")},
	}

	// Pattern/replacement pairs; later entries match text produced by
	// earlier ones (e.g. "<4>" is matched by "<[^>]*>"), so order matters.
	substitutions = []*subst{
		{regexp.MustCompile("tHa[Nt]"), []byte("<4>")},
		{regexp.MustCompile("aND|caN|Ha[DS]|WaS"), []byte("<3>")},
		{regexp.MustCompile("a[NSt]|BY"), []byte("<2>")},
		{regexp.MustCompile("<[^>]*>"), []byte("|")},
		{regexp.MustCompile("\\|[^|][^|]*\\|"), []byte("-")},
	}

	// Removes every newline, together with a preceding header that runs from
	// a '>' to that newline. The header body uses '*' rather than '+' so that
	// a header consisting of a lone '>' is removed too, instead of leaving
	// the '>' behind in the sequence data.
	cleanRE = &subst{regexp.MustCompile("(>[^\n]*)?\n"), []byte("")}

	doneCh <- 1
}
notes, command-line, and program output
NOTES:
64-bit Ubuntu quad core
go version go1.14 linux/amd64
Wed, 06 May 2020 02:39:24 GMT
MAKE:
/opt/src/go1.14.linux-amd64/go/bin/go build -o regexredux.go-3.go_run
5.95s to complete and log all make actions
COMMAND LINE:
./regexredux.go-3.go_run 0 < regexredux-input5000000.txt
PROGRAM OUTPUT:
agggtaaa|tttaccct 356
[cgt]gggtaaa|tttaccc[acg] 1250
a[act]ggtaaa|tttacc[agt]t 4252
ag[act]gtaaa|tttac[agt]ct 2894
agg[act]taaa|ttta[agt]cct 5435
aggg[acg]aaa|ttt[cgt]ccct 1537
agggt[cgt]aa|tt[acg]accct 1431
agggta[cgt]a|t[acg]taccct 1608
agggtaa[cgt]|[acg]ttaccct 2178
50833411
50000000
27388361