The Computer Language
24.11 Benchmarks Game

regex-redux Intel Fortran program

source code

! The Computer Language Benchmarks Game
! https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
!
! Contributed by Tyler Funnell
! translation of GCC #5 by Jeremy Zerfas & Zoltan Herczeg
! compilation: ifort -O3 -xHost -ipo -qopenmp regexredux.f90


module pcre2_mod
    use iso_c_binding
    implicit none

    type var_str
        character(len=:), allocatable :: data
        integer :: size
    end type

    integer(c_int), parameter :: PCRE2_JIT_COMPLETE = int(z'00000001')

    interface
        ! void pcre2_code_free(pcre2_code *code)
        subroutine pcre2_code_free(code) bind(c, name='pcre2_code_free_8')
            import
            type(c_ptr), intent(in), value :: code
        end subroutine

        ! pcre2_code *pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE length,
        !                           uint32_t options, int *errorcode,
        !                           PCRE2_SIZE *erroroffset,
        !                           pcre2_compile_context *ccontext)
        function pcre2_compile &
                (pattern, length, options, errorcode, erroroffset, ccontext) &
                result(code) bind(c, name='pcre2_compile_8')
            import
            type(c_ptr) :: code
            character(kind=c_char), intent(in) :: pattern(*)
            integer(c_int), intent(in), value :: length, options
            integer(c_int), intent(out) :: errorcode
            integer(c_size_t), intent(out) :: erroroffset
            type(c_ptr), intent(in), value :: ccontext
        end function

        ! PCRE2_SIZE *pcre2_get_ovector_pointer(pcre2_match_data *match_data)
        function pcre2_get_ovector_pointer(match_data) result(ret) &
                bind(c, name='pcre2_get_ovector_pointer_8')
            import
            type(c_ptr) :: ret
            type(c_ptr), intent(in), value :: match_data
        end function

        ! int pcre2_jit_compile(pcre2_code *code, uint32_t options)
        function pcre2_jit_compile(code, options) &
                result(errorcode) bind(c, name='pcre2_jit_compile_8')
            import
            integer :: errorcode
            type(c_ptr), intent(in), value :: code
            integer(c_int), intent(in), value :: options
        end function

        ! int pcre2_jit_match(const pcre2_code *code, PCRE2_SPTR subject,
        !                     PCRE2_SIZE length, PCRE2_SIZE startoffset,
        !                     uint32_t options, pcre2_match_data *match_data,
        !                     pcre2_match_context *mcontext)
        function pcre2_jit_match (code, subject, length, startoffset, options, &
                match_data, mcontext) &
                result(ret) bind(c, name='pcre2_jit_match_8')
            import
            integer :: ret
            type(c_ptr), intent(in), value :: code, match_data, mcontext
            character(len=1, kind=c_char), intent(in) :: subject(*)
            integer(c_int), intent(in), value :: length, startoffset, options
        end function

        ! void pcre2_jit_stack_assign(pcre2_match_context *mcontext,
        !   pcre2_jit_callback callback_function, void *callback_data)
        subroutine pcre2_jit_stack_assign &
                (mcontext, callback_function, callback_data) &
                bind(c, name='pcre2_jit_stack_assign_8')
            import
            type(c_ptr), intent(in), value :: mcontext, callback_function, &
                callback_data
        end subroutine

        ! pcre2_jit_stack *pcre2_jit_stack_create(
        !   PCRE2_SIZE startsize, PCRE2_SIZE maxsize,
        !   pcre2_general_context *gcontext)
        function pcre2_jit_stack_create(startsize, maxsize, gcontext) &
                result(stack) bind(c, name='pcre2_jit_stack_create_8')
            import
            type(c_ptr) :: stack
            integer(c_size_t), intent(in), value :: startsize, maxsize
            type(c_ptr), intent(in), value :: gcontext
        end function

        ! void pcre2_jit_stack_free(pcre2_jit_stack *jit_stack)
        subroutine pcre2_jit_stack_free(jit_stack) &
                bind(c, name='pcre2_jit_stack_free_8')
            import
            type(c_ptr), intent(in), value :: jit_stack
        end subroutine

        ! pcre2_match_context *pcre2_match_context_create(
        !   pcre2_general_context *gcontext)
        function pcre2_match_context_create(gcontext) &
                result(mcontext) bind(c, name='pcre2_match_context_create_8')
            import
            type(c_ptr) :: mcontext
            type(c_ptr), intent(in), value :: gcontext
        end function

        ! void pcre2_match_context_free(pcre2_match_context *mcontext)
        subroutine pcre2_match_context_free(mcontext) &
                bind(c, name='pcre2_match_context_free_8')
            import
            type(c_ptr), intent(in), value :: mcontext
        end subroutine

        ! pcre2_match_data *pcre2_match_data_create(
        !   uint32_t ovecsize, pcre2_general_context *gcontext)
        function pcre2_match_data_create(ovecsize, gcontext) &
                result(mdata) bind(c, name='pcre2_match_data_create_8')
            import
            type(c_ptr) :: mdata
            integer(c_size_t), intent(in), value :: ovecsize
            type(c_ptr), intent(in), value :: gcontext
        end function

        ! void pcre2_match_data_free(pcre2_match_data *match_data)
        subroutine pcre2_match_data_free(match_data) &
                bind(c, name='pcre2_match_data_free_8')
            import
            type(c_ptr), intent(in), value :: match_data
        end subroutine
    end interface

    contains
        subroutine replace(pattern, replacement, src_string, dst_string, &
                mcontext, mdata)
            character(len=*), intent(in) :: pattern, replacement
            type(var_str), intent(in) :: src_string
            type(var_str), intent(inout) :: dst_string
            type(c_ptr), intent(in) :: mcontext, mdata

            character(len=:), allocatable :: temp

            type(c_ptr) :: regex, c_match
            integer(c_size_t), pointer :: match(:)
            integer(c_int) :: errorcode, pos
            integer(c_size_t) :: erroroffset


            c_match = pcre2_get_ovector_pointer(mdata)
            call c_f_pointer(c_match, match, [2])

            regex = pcre2_compile(pattern, len(pattern), 0_c_int, errorcode, &
                erroroffset, c_null_ptr)
            errorcode = pcre2_jit_compile(regex, PCRE2_JIT_COMPLETE)

            pos = 0

            ! Find each match of the pattern in src_string and append the
            ! characters preceding each match and the replacement text to
            ! dst_string.
            do
                errorcode = pcre2_jit_match(regex, src_string%data, &
                    src_string%size, pos, 0, mdata, mcontext)

                if (errorcode < 0) exit

                ! Allocate more memory for dst_string if there is not enough
                ! space for the characters preceding the match and the
                ! replacement text.
                do
                    if (dst_string%size + match(1) - pos + len(replacement) <= &
                        len(dst_string%data)) exit

                    allocate (character(len(dst_string%data) * 2) :: temp)
                    temp(1:dst_string%size) = dst_string%data(:dst_string%size)
                    call move_alloc(temp, dst_string%data)
                end do

                ! Append the characters preceding the match and the replacement
                ! text to dst_string and update the size of dst_string.
                dst_string%data( &
                    (dst_string%size + 1):(dst_string%size + match(1) - pos) &
                ) = src_string%data((pos + 1):match(1))
                dst_string%size = dst_string%size + match(1) - pos

                dst_string%data( &
                    (dst_string%size + 1):(dst_string%size + len(replacement)) &
                ) = replacement
                dst_string%size = dst_string%size + len(replacement)

                ! Update pos to continue searching after the current match.
                pos = match(2)
            end do

        call pcre2_code_free(regex)

        ! Allocate more memory for dst_string if there is not enough space for
        ! the characters following the last match (or the entire src_string if
        ! there was no match).
        do
            if ( &
                dst_string%size + src_string%size - pos <= &
                len(dst_string%data) & 
            ) exit

            allocate (character(len(dst_string%data) * 2) :: temp)
            temp(1:dst_string%size) = dst_string%data(:dst_string%size)
            call move_alloc(temp, dst_string%data)
        end do

        ! Append the characters following the last match (or the entire
        ! src_string if there was no match) to dst_string and update the size
        ! of dst_string.
        dst_string%data( &
            (dst_string%size + 1):(dst_string%size + src_string%size - pos) &
        ) = src_string%data((pos + 1):src_string%size)
        dst_string%size = dst_string%size + src_string%size - pos

        end subroutine
end module pcre2_mod

program regexredux
    use iso_fortran_env
    use iso_c_binding
    use pcre2_mod
    implicit none

    type const_str
        character(len=:), allocatable :: data
    end type

    type(const_str), dimension(:), allocatable :: count_info
    type(const_str), dimension(:, :), allocatable :: replace_info

    type(var_str) :: input, temp, sequences, prereplace_string, &
        postreplace_string
    integer :: input_size, info_count, i

    type(c_ptr) :: mdata, mcontext, stack, regex, c_match
    integer(c_size_t), pointer :: match(:)
    integer(c_size_t) :: erroroffset
    integer(c_int) :: errorcode, pos


    count_info = [ &
        const_str('agggtaaa|tttaccct'), &
        const_str('[cgt]gggtaaa|tttaccc[acg]'), &
        const_str('a[act]ggtaaa|tttacc[agt]t'), &
        const_str('ag[act]gtaaa|tttac[agt]ct'), &
        const_str('agg[act]taaa|ttta[agt]cct'), &
        const_str('aggg[acg]aaa|ttt[cgt]ccct'), &
        const_str('agggt[cgt]aa|tt[acg]accct'), &
        const_str('agggta[cgt]a|t[acg]taccct'), &
        const_str('agggtaa[cgt]|[acg]ttaccct') &
    ]
    replace_info = reshape([ &
        const_str('tHa[Nt]'), const_str('<4>'), &
        const_str('aND|caN|Ha[DS]|WaS'), const_str('<3>'), &
        const_str('a[NSt]|BY'), const_str('<2>'), &
        const_str('<[^>]*>'), const_str('|'), &
        const_str('\|[^|][^|]*\|'), const_str('-') &
    ], [2, 5])

    open(unit=input_unit, action="read", form="unformatted", access="stream")
    inquire(unit=input_unit, size=input_size)
    allocate(character(len=input_size) :: input%data)
    read(input_unit) input%data
    close(input_unit)

    input%size = input_size

    !$omp parallel private(mcontext, stack, mdata)
    mcontext = pcre2_match_context_create(c_null_ptr)
    stack = pcre2_jit_stack_create(16384_c_size_t, 16384_c_size_t, c_null_ptr)
    call pcre2_jit_stack_assign(mcontext, c_null_ptr, stack)
    mdata = pcre2_match_data_create(16_c_size_t, c_null_ptr)

    !$omp single
    allocate (character(len=16384) :: sequences%data)
    call replace('>.*\n|\n', '', input, sequences, mcontext, mdata)
    deallocate(input%data)
    !$omp end single

    ! We'll use two strings when doing all the replacements, searching
    ! for patterns in prereplace_string and using postreplace_string to
    ! store the string after the replacements have been made. After
    ! each iteration these two then get swapped. Start out with both
    ! strings having the same capacity as the sequences string and also
    ! copy the sequences string into prereplace_string for the initial
    ! iteration.
    !$omp single
    allocate(character(len=len(sequences%data)) :: prereplace_string%data)
    allocate(character(len=len(sequences%data)) :: postreplace_string%data)
    prereplace_string%data(:) = sequences%data(:)
    prereplace_string%size = sequences%size
    postreplace_string%size = 0

    ! Iterate through all the replacement patterns and their
    ! replacements in replace_info.
    do i = 1, size(replace_info, 2)
        call replace( &
            replace_info(1, i)%data, replace_info(2, i)%data, &
            prereplace_string, postreplace_string, mcontext, mdata)

        call move_alloc(prereplace_string%data, temp%data)
        call move_alloc(postreplace_string%data, prereplace_string%data)
        prereplace_string%size = postreplace_string%size
        call move_alloc(temp%data, postreplace_string%data)
        postreplace_string%size = 0
    end do

    deallocate(prereplace_string%data)
    deallocate(postreplace_string%data)
    !$omp end single nowait

    ! Iterate through all the count patterns in count_info[] and perform
    ! the counting for each one on a different thread if available.
    !$omp do schedule(dynamic) ordered &
    !$omp private(errorcode, erroroffset, info_count, pos, regex, c_match, match)
    do i = 1, size(count_info)
        errorcode = 0
        info_count = 0
        pos = 0

        c_match = pcre2_get_ovector_pointer(mdata)
        call c_f_pointer(c_match, match, [2])

        ! Compile and study pattern.
        regex = pcre2_compile( &
            count_info(i)%data, len(count_info(i)%data), &
            0_c_int, errorcode, erroroffset, c_null_ptr)
        errorcode = pcre2_jit_compile(regex, PCRE2_JIT_COMPLETE)

        ! Find each match of the pattern in the sequences string and
        ! increment count for each match.
        do
            errorcode = pcre2_jit_match( &
                regex, sequences%data, sequences%size, pos, 0, mdata, mcontext)

            if (errorcode < 0) exit

            info_count = info_count + 1
            pos = match(2)
        end do

        call pcre2_code_free(regex)

        ! Print the count for each pattern in the correct order.
        !$omp ordered
        print '(A, " ", I0)', count_info(i)%data, info_count
        !$omp end ordered
    end do
    !$omp end do

    call pcre2_match_context_free(mcontext)
    call pcre2_jit_stack_free(stack)
    call pcre2_match_data_free(mdata)
    !$omp end parallel

    deallocate(sequences%data)

    ! Print the size of the original input, the size of the input without the
    ! sequence descriptions & new lines, and the size after having made all the
    ! replacements.
    print '(/ I0 / I0 / I0)', input_size, sequences%size, prereplace_string%size

end program regexredux
    

notes, command-line, and program output

NOTES:
64-bit Ubuntu quad core
ifx (IFX)
2024.1.0 20240308


 Mon, 03 Jun 2024 20:49:52 GMT

MAKE:
mv regexredux.ifx regexredux.f90
~/intel/oneapi/compiler/latest/bin/ifx -O3 -march=ivybridge -ipo -qopenmp -lpcre2-8 regexredux.f90 -o regexredux.ifx_run
rm regexredux.f90

4.76s to complete and log all make actions

COMMAND LINE:
 ./regexredux.ifx_run 0 < regexredux-input5000000.txt

PROGRAM OUTPUT:
agggtaaa|tttaccct 356
[cgt]gggtaaa|tttaccc[acg] 1250
a[act]ggtaaa|tttacc[agt]t 4252
ag[act]gtaaa|tttac[agt]ct 2894
agg[act]taaa|ttta[agt]cct 5435
aggg[acg]aaa|ttt[cgt]ccct 1537
agggt[cgt]aa|tt[acg]accct 1431
agggta[cgt]a|t[acg]taccct 1608
agggtaa[cgt]|[acg]ttaccct 2178

50833411
50000000
27388361