The Computer Language
24.11 Benchmarks Game

regex-redux Java #2 program

source code

/*
   The Computer Language Benchmarks Game
   https://salsa.debian.org/benchmarksgame-team/benchmarksgame/

   contributed by Piotr Tarsa
*/

import jextract_pcre2.pcre2_h;

import java.lang.foreign.Arena;
import java.lang.foreign.MemorySegment;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Map.Entry;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.function.Consumer;
import java.util.function.Function;

import static java.lang.foreign.ValueLayout.*;

/**
 * regex-redux benchmark program backed by PCRE2 through jextract-generated
 * bindings ({@code pcre2_h}) and the Foreign Function &amp; Memory API.
 *
 * <p>The program reads all of standard input, strips FASTA header lines and
 * newlines, counts matches of nine pattern variants in parallel, applies a
 * chain of five substitutions, and prints the counts followed by the three
 * lengths (raw input, cleaned sequence, sequence after substitutions).
 *
 * <p>NOTE(review): compiled patterns and match data blocks are never released
 * back to PCRE2; they live until the process exits.
 */
public class regexredux {
    private static final ExecutorService EXECUTOR_SERVICE =
            Executors.newFixedThreadPool(Runtime.getRuntime().availableProcessors());

    private static final Arena GLOBAL_ARENA = Arena.global();

    /**
     * Program entry point.
     *
     * @param args ignored
     * @throws Exception if reading standard input fails or a worker task fails
     */
    public static void main(String[] args) throws Exception {
        try {
            final byte[] rawInput = System.in.readAllBytes();
            final int initialLength = rawInput.length;

            // The cleaned sequence is shared with the worker threads, so it
            // is allocated from the global arena rather than a confined one.
            final var sequence = GLOBAL_ARENA.allocate(initialLength);
            final int sequenceLength = withArena(arena -> {
                var rawInputBuffer = arena.allocateFrom(JAVA_BYTE, rawInput);
                var compiledPattern = compilePattern(">.*\\n|\\n");
                return substitute(compiledPattern, rawInputBuffer, initialLength,
                        pcre2_h.NULL(), sequence, initialLength, "");
            });

            // Start the substitution chain first so it overlaps with counting.
            Callable<Integer> magicTask =
                    () -> applyIubSubstitutions(sequence, sequenceLength);
            var magicRegExpsCount = EXECUTOR_SERVICE.submit(magicTask);

            var variants = Arrays.asList("agggtaaa|tttaccct",
                    "[cgt]gggtaaa|tttaccc[acg]",
                    "a[act]ggtaaa|tttacc[agt]t",
                    "ag[act]gtaaa|tttac[agt]ct",
                    "agg[act]taaa|ttta[agt]cct",
                    "aggg[acg]aaa|ttt[cgt]ccct",
                    "agggt[cgt]aa|tt[acg]accct",
                    "agggta[cgt]a|t[acg]taccct",
                    "agggtaa[cgt]|[acg]ttaccct");

            var tasks = variants.stream()
                    .map(variant -> (Callable<String>) () ->
                            countMatches(variant, sequence, sequenceLength))
                    .toList();

            for (var result : EXECUTOR_SERVICE.invokeAll(tasks)) {
                System.out.println(result.get());
            }

            System.out.println();
            System.out.println(initialLength);
            System.out.println(sequenceLength);
            System.out.println(magicRegExpsCount.get());
        } finally {
            // Always release the pool: its threads are non-daemon, so a
            // failure above would otherwise keep the JVM from exiting.
            EXECUTOR_SERVICE.shutdown();
        }
    }

    /**
     * Applies the five substitutions, in insertion order, to a private copy
     * of the sequence and returns the length of the final result.
     *
     * @param sequence       native buffer holding the cleaned sequence
     * @param sequenceLength number of valid bytes in {@code sequence}
     * @return length in bytes after the last substitution
     * @throws ArithmeticException if 1.5 times the sequence length does not
     *                             fit in an {@code int}
     */
    private static int applyIubSubstitutions(
            MemorySegment sequence, int sequenceLength) {
        final Map<String, String> iub = new LinkedHashMap<>();
        iub.put("tHa[Nt]", "<4>");
        iub.put("aND|caN|Ha[DS]|WaS", "<3>");
        iub.put("a[NSt]|BY", "<2>");
        iub.put("<[^>]*>", "|");
        iub.put("\\|[^|][^|]*\\|", "-");

        return withArena(arena -> {
            var currentLength = sequenceLength;
            // Computed in long so that a large input fails loudly instead of
            // silently wrapping around to a bogus buffer size.
            var bufLength = Math.toIntExact(sequenceLength * 3L / 2);
            var buf1 = arena.allocate(bufLength);
            var buf2 = arena.allocate(bufLength);
            MemorySegment.copy(sequence, 0, buf1, 0, sequenceLength);
            var flip = false;

            // Ping-pong between the two buffers: the output of one
            // substitution is the input of the next.
            for (Entry<String, String> entry : iub.entrySet()) {
                var compiledPattern = compilePattern(entry.getKey());
                currentLength = substitute(compiledPattern,
                        flip ? buf2 : buf1, currentLength,
                        pcre2_h.NULL(),
                        flip ? buf1 : buf2, bufLength,
                        entry.getValue());
                flip = !flip;
            }
            return currentLength;
        });
    }

    /**
     * Counts the matches of one pattern variant in the sequence.
     *
     * @param variant        pattern to count
     * @param sequence       native buffer holding the cleaned sequence
     * @param sequenceLength number of valid bytes in {@code sequence}
     * @return the output line: the pattern, a space, and the match count
     */
    private static String countMatches(
            String variant, MemorySegment sequence, int sequenceLength) {
        var compiledPattern = compilePattern(variant);
        var oVectorSize = 100;
        var matchData = pcre2_h
                .pcre2_match_data_create_8(oVectorSize, pcre2_h.NULL());
        // The ovector holds pairs of 8-byte offsets, i.e. 16 bytes per pair.
        var oVectorPtr = pcre2_h
                .pcre2_get_ovector_pointer_8(matchData)
                .reinterpret(16 * oVectorSize);
        long count = 0;
        long startOffset = 0;
        int result;
        while ((result = pcre2_h.pcre2_jit_match_8(compiledPattern,
                sequence, sequenceLength,
                startOffset, 0,
                matchData, pcre2_h.NULL())) > 0) {
            count++;
            // ovector[1] is the end of the whole match; resume from there.
            startOffset = oVectorPtr.getAtIndex(JAVA_LONG, 1);
        }
        if (result != pcre2_h.PCRE2_ERROR_NOMATCH()) {
            showPcre2ErrorIfAny("jit match", result);
        }
        return variant + " " + count;
    }

    /**
     * Globally replaces every match of a compiled pattern with
     * {@code replacement}, writing the result into {@code outputBuffer}.
     *
     * @param compiledPattern    pattern returned by {@link #compilePattern}
     * @param inputBuffer        native buffer holding the subject bytes
     * @param inputLength        number of subject bytes
     * @param matchContext       PCRE2 match context, or the NULL segment
     * @param outputBuffer       native buffer receiving the result
     * @param outputBufferLength capacity of {@code outputBuffer} in bytes
     * @param replacement        ASCII replacement text
     * @return length of the result in bytes, or 0 if PCRE2 reported an error
     *         (the error is printed)
     */
    private static int substitute(
            MemorySegment compiledPattern,
            MemorySegment inputBuffer, int inputLength,
            MemorySegment matchContext,
            MemorySegment outputBuffer, int outputBufferLength,
            String replacement) {
        return withArena(arena -> {
            var replacementBytes =
                    replacement.getBytes(StandardCharsets.US_ASCII);
            var replacementBuffer = arena.allocateFrom(
                    JAVA_BYTE, replacementBytes);
            // In/out parameter: capacity on entry, result length on return.
            var outputLengthHolder = arena.allocate(JAVA_LONG);
            outputLengthHolder.setAtIndex(JAVA_LONG, 0, outputBufferLength);
            var options = pcre2_h.PCRE2_SUBSTITUTE_GLOBAL() |
                    pcre2_h.PCRE2_NO_UTF_CHECK();
            var substitutionResult = pcre2_h.pcre2_substitute_8(
                    compiledPattern,
                    inputBuffer, inputLength,
                    0, options, pcre2_h.NULL(),
                    matchContext,
                    replacementBuffer, replacementBytes.length,
                    outputBuffer, outputLengthHolder);
            showPcre2ErrorIfAny("substitutionResult", substitutionResult);
            if (substitutionResult < 0) {
                return 0;
            }
            // Read back with the same 64-bit layout that was written; a
            // 32-bit read at index 0 is only correct on little-endian hosts.
            return Math.toIntExact(
                    outputLengthHolder.getAtIndex(JAVA_LONG, 0));
        });
    }

    /**
     * Compiles and JIT-compiles an ASCII pattern.
     *
     * <p>A JIT compilation failure is printed but the compiled pattern is
     * still returned.
     *
     * @param pattern pattern source text
     * @return native handle of the compiled pattern
     * @throws IllegalStateException if PCRE2 rejects the pattern (the PCRE2
     *                               error is printed first)
     */
    private static MemorySegment compilePattern(String pattern) {
        return withArena(arena -> {
            var patternBytes = pattern.getBytes(StandardCharsets.US_ASCII);
            var patternLength = patternBytes.length;
            var bufPattern = arena.allocateFrom(JAVA_BYTE, patternBytes);
            // pcre2_compile reports the error code through an int* and the
            // error offset through a PCRE2_SIZE* (64-bit here).
            var bufErrorCode = arena.allocate(JAVA_INT);
            var bufErrorOffset = arena.allocate(JAVA_LONG);
            var compiledPattern = pcre2_h.pcre2_compile_8(
                    bufPattern, patternLength, 0,
                    bufErrorCode, bufErrorOffset, pcre2_h.NULL());
            if (compiledPattern.equals(pcre2_h.NULL())) {
                showPcre2Error("pcre2_compile_8 failed at offset " +
                                bufErrorOffset.getAtIndex(JAVA_LONG, 0),
                        bufErrorCode.getAtIndex(JAVA_INT, 0));
                // Never hand a NULL pattern to the JIT compiler or matcher.
                throw new IllegalStateException(
                        "pcre2_compile_8 failed for pattern: " + pattern);
            }
            var jitCompileResult = pcre2_h.pcre2_jit_compile_8(
                    compiledPattern, pcre2_h.PCRE2_JIT_COMPLETE());
            showPcre2ErrorIfAny("pcre2_jit_compile_8", jitCompileResult);
            return compiledPattern;
        });
    }

    /**
     * Prints a PCRE2 error if {@code resultOrErrorCode} is negative.
     *
     * @param description       label identifying the failed operation
     * @param resultOrErrorCode return value of a PCRE2 call
     */
    private static void showPcre2ErrorIfAny(
            String description, int resultOrErrorCode) {
        if (resultOrErrorCode < 0) {
            showPcre2Error(description, resultOrErrorCode);
        }
    }

    /**
     * Prints a stack trace to standard output whose message contains the
     * description, the error code and the PCRE2 error text.
     *
     * @param description label identifying the failed operation
     * @param errorCode   PCRE2 error code to translate
     */
    private static void showPcre2Error(String description, int errorCode) {
        withArena(arena -> {
            var bufSize = 1000;
            var buf = arena.allocate(bufSize);
            var errorMsgLength = pcre2_h.pcre2_get_error_message_8(
                    errorCode, buf, bufSize);
            if (errorMsgLength >= 0) {
                var errorMsgBytes = new byte[errorMsgLength];
                buf.asByteBuffer().get(errorMsgBytes);
                var errorMsg = new String(errorMsgBytes, 0, errorMsgLength,
                        StandardCharsets.US_ASCII);
                new Exception(description + " " + errorCode +
                        " = " + errorMsg).printStackTrace(System.out);
            } else {
                new Exception(description +
                        " Error during getting error message: " +
                        errorMsgLength).printStackTrace(System.out);
            }
        });
    }

    /**
     * Runs {@code body} with a confined arena that is closed afterwards.
     *
     * @param body action receiving the arena
     */
    private static void withArena(Consumer<Arena> body) {
        try (var arena = Arena.ofConfined()) {
            body.accept(arena);
        }
    }

    /**
     * Runs {@code body} with a confined arena that is closed afterwards and
     * returns the value produced by {@code body}.
     *
     * @param body function receiving the arena
     * @param <T>  result type
     * @return the value returned by {@code body}
     */
    private static <T> T withArena(Function<Arena, T> body) {
        try (var arena = Arena.ofConfined()) {
            return body.apply(arena);
        }
    }
}
    

notes, command-line, and program output

NOTES:
64-bit Ubuntu quad core
java 23 2024-09-17
Java HotSpot(TM) 64-Bit Server VM
(build 23+37-2369, 
mixed mode, sharing)


 Sun, 10 Nov 2024 20:35:34 GMT

MAKE:
mv regexredux.java-2.java regexredux.java
/opt/src/jdk-23/bin/javac -d . -cp . -sourcepath Include/java regexredux.java

4.95s to complete and log all make actions

COMMAND LINE:
 /opt/src/jdk-23/bin/java  -cp . regexredux 0 < regexredux-input5000000.txt

PROGRAM OUTPUT:
agggtaaa|tttaccct 356
[cgt]gggtaaa|tttaccc[acg] 1250
a[act]ggtaaa|tttacc[agt]t 4252
ag[act]gtaaa|tttac[agt]ct 2894
agg[act]taaa|ttta[agt]cct 5435
aggg[acg]aaa|ttt[cgt]ccct 1537
agggt[cgt]aa|tt[acg]accct 1431
agggta[cgt]a|t[acg]taccct 1608
agggtaa[cgt]|[acg]ttaccct 2178

50833411
50000000
27388361

WARNING: A restricted method in java.lang.foreign.SymbolLookup has been called
WARNING: java.lang.foreign.SymbolLookup::libraryLookup has been called by jextract_pcre2.pcre2_h_1 in an unnamed module
WARNING: Use --enable-native-access=ALL-UNNAMED to avoid a warning for callers in this module
WARNING: Restricted methods will be blocked in a future release unless native access is enabled