source code
/*
The Computer Language Benchmarks Game
https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
contributed by Francois Green
*/
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.Map;
import java.util.concurrent.CompletableFuture;
import java.util.regex.Pattern;
import java.util.stream.*;
/**
 * Regex-redux program for the Computer Language Benchmarks Game.
 *
 * <p>Reads all of standard input as US-ASCII text, removes every span that runs
 * from a {@code >} to the end of its line together with all remaining newline
 * characters, and then:
 * <ul>
 *   <li>counts the matches of each of nine variant patterns in the cleaned
 *       sequence, and</li>
 *   <li>applies five substitutions, in order, to the cleaned sequence.</li>
 * </ul>
 *
 * <p>Output: one {@code "<pattern> <count>"} line per variant (in declaration
 * order), a blank line, and then the lengths of the raw input, of the cleaned
 * sequence, and of the sequence after all substitutions.
 */
public class regexredux {

    /** Matches a {@code >} through the end of its line, or a bare newline. */
    private static final Pattern HEADERS_AND_NEWLINES = Pattern.compile(">.*\n|\n");

    /**
     * Regex-to-replacement rules. Order matters: each rule is applied to the
     * result of the previous one (later rules match text produced by earlier ones).
     */
    private static final List<Map.Entry<String, String>> SUBSTITUTIONS = List.of(
        Map.entry("tHa[Nt]", "<4>"),
        Map.entry("aND|caN|Ha[DS]|WaS", "<3>"),
        Map.entry("a[NSt]|BY", "<2>"),
        Map.entry("<[^>]*>", "|"),
        Map.entry("\\|[^|][^|]*\\|", "-")
    );

    /** Patterns whose match counts are reported; the text doubles as the output label. */
    private static final List<String> VARIANTS = List.of(
        "agggtaaa|tttaccct",
        "[cgt]gggtaaa|tttaccc[acg]",
        "a[act]ggtaaa|tttacc[agt]t",
        "ag[act]gtaaa|tttac[agt]ct",
        "agg[act]taaa|ttta[agt]cct",
        "aggg[acg]aaa|ttt[cgt]ccct",
        "agggt[cgt]aa|tt[acg]accct",
        "agggta[cgt]a|t[acg]taccct",
        "agggtaa[cgt]|[acg]ttaccct"
    );

    /**
     * Runs the benchmark against standard input and prints the report to
     * standard output.
     *
     * @param args ignored
     * @throws IOException if standard input cannot be read
     */
    public static void main(String[] args) throws IOException {
        final var input = new String(System.in.readAllBytes(), StandardCharsets.US_ASCII);
        final var sequence = HEADERS_AND_NEWLINES.matcher(input).replaceAll("");

        // Start the substitution chain asynchronously so it can overlap with
        // the variant counting below; its result is only needed for the last line.
        final var replacements =
            CompletableFuture.supplyAsync(() -> applySubstitutions(sequence));

        // The variants are counted in parallel. VARIANTS is an ordered list and
        // collecting to a list keeps encounter order, so the report lines come
        // out in declaration order regardless of which count finishes first.
        final List<String> countLines = VARIANTS.parallelStream()
            .map(variant -> variant + " " + countMatches(variant, sequence))
            .collect(Collectors.toList());
        countLines.forEach(System.out::println);

        System.out.println();
        System.out.println(input.length());
        System.out.println(sequence.length());
        System.out.println(replacements.join().length());
    }

    /**
     * Applies every rule of {@link #SUBSTITUTIONS} in list order, feeding each
     * rule the output of the previous one.
     *
     * <p>Written as a plain loop on purpose: the chain is inherently sequential,
     * and a three-argument {@code Stream.reduce} would need a combiner that is
     * compatible with the accumulator, which does not exist for this operation.
     *
     * @param sequence the cleaned sequence to rewrite
     * @return the sequence after all substitutions
     */
    private static String applySubstitutions(String sequence) {
        String buffer = sequence;
        for (Map.Entry<String, String> rule : SUBSTITUTIONS) {
            buffer = Pattern.compile(rule.getKey())
                .matcher(buffer)
                .replaceAll(rule.getValue());
        }
        return buffer;
    }

    /**
     * Counts the non-overlapping matches of {@code regex} in {@code sequence}.
     *
     * @param regex    the pattern to search for
     * @param sequence the text to search
     * @return the number of matches found
     */
    private static long countMatches(String regex, String sequence) {
        return Pattern.compile(regex).matcher(sequence).results().count();
    }
}
notes, command-line, and program output
NOTES:
64-bit Ubuntu quad core
java 23 2024-09-17
Java HotSpot(TM) 64-Bit Server VM
(build 23+37-2369,
mixed mode, sharing)
Fri, 20 Sep 2024 05:41:50 GMT
MAKE:
mv regexredux.java-6.java regexredux.java
/opt/src/jdk-23/bin/javac -d . -cp . -sourcepath Include/java regexredux.java
2.08s to complete and log all make actions
COMMAND LINE:
/opt/src/jdk-23/bin/java -cp . regexredux 0 < regexredux-input5000000.txt
PROGRAM OUTPUT:
agggtaaa|tttaccct 356
[cgt]gggtaaa|tttaccc[acg] 1250
a[act]ggtaaa|tttacc[agt]t 4252
ag[act]gtaaa|tttac[agt]ct 2894
agg[act]taaa|ttta[agt]cct 5435
aggg[acg]aaa|ttt[cgt]ccct 1537
agggt[cgt]aa|tt[acg]accct 1431
agggta[cgt]a|t[acg]taccct 1608
agggtaa[cgt]|[acg]ttaccct 2178
50833411
50000000
27388361