The Computer Language
23.03 Benchmarks Game

regex-redux Python 3 #2 program

source code

# The Computer Language Benchmarks Game
# https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
#
# Contributed by Jeremy Zerfas
#
# WARNING: I normally do my programming in other languages. This may not be an
# optimal Python program. Please contribute a better program if you can make a
# better program.

from ctypes import c_char, c_char_p, c_int, c_size_t, c_uint32, \
                   byref, CDLL, create_string_buffer, POINTER
from ctypes.util import find_library
from multiprocessing import Pipe, Process
from multiprocessing.connection import wait
from multiprocessing.sharedctypes import RawArray
from os import cpu_count
from sys import stdin


# We'll be using PCRE2 for our regular expression needs instead of using
# Python's built in regular expression engine because it is significantly
# faster.
PCRE2=CDLL(find_library("pcre2-8"))

# By default, Python assumes that C functions will return int's but that is not
# correct for the following functions which return pointers instead so we tell
# it the correct types here.
PCRE2.pcre2_compile_8.restype=POINTER(c_char)
PCRE2.pcre2_match_data_create_8.restype=POINTER(c_char)
PCRE2.pcre2_get_ovector_pointer_8.restype=POINTER(c_size_t*2)


# Function for searching a src_String for a pattern, replacing it with some
# specified replacement, and storing the result in dst_String which is also
# returned along with the updated dst_String_Length.
def replace(pattern, replacement, src_String, src_String_Length,
  dst_String, dst_String_Length):

	# Compile the pattern and also enable JIT compilation to make matching
	# faster.
	regex=PCRE2.pcre2_compile_8(pattern, c_size_t(len(pattern)), c_uint32(0),
	  byref(c_int()), byref(c_size_t()), None)
	PCRE2.pcre2_jit_compile_8(regex, c_uint32(0x1))

	# Convert dst_String_Length from an int to c_size_t so that the
	# pcre2_substitute_8() function will be able to access it properly.
	dst_String_Length=c_size_t(dst_String_Length)

	# Have PCRE2 do most of the replacement work.
	PCRE2.pcre2_substitute_8(regex, src_String, c_size_t(src_String_Length),
	  c_size_t(0), c_uint32(0x100), None, None,
	  replacement, c_size_t(len(replacement)),
	  dst_String, byref(dst_String_Length))

	PCRE2.pcre2_code_free_8(regex)

	return dst_String, dst_String_Length.value


# This function is used as the main function for the worker subprocesses. The
# worker subprocesses communicate with the manager process via worker_Pipe. The
# worker subprocesses receive tasks from the manager process, complete the
# appropriate task, and then send back a result to the manager process (except
# when the manager process tells the worker subprocess to exit). Tasks are
# performed on the sequences string which is shared between all the processes
# using shared memory.
def process_Task(worker_Pipe, sequences, sequences_Length):

	# When a worker subprocess first starts up, it sends None to the manager
	# process as an indicator that the worker subprocess is ready to process
	# tasks.
	worker_Pipe.send(None)

	# Start an infinte loop to process received tasks accordingly until a
	# request to exit is received.
	while True:

		# Wait for a request.
		task=worker_Pipe.recv()

		# If there is no task to do, then just exit the loop and let this
		# subprocess exit.
		if task is None:
			break

		# If the task contains a tuple and the first element is an int, then
		# this is a task requesting that a match_Count be performed for
		# pattern_To_Count and that this is for the element located at
		# index_For_Pattern_To_Count in count_Info[].
		elif type(task[0]) is int:
			index_For_Pattern_To_Count, pattern_To_Count=task

			match_Data=PCRE2.pcre2_match_data_create_8(c_uint32(1), None)
			match=PCRE2.pcre2_get_ovector_pointer_8(match_Data)

			match.contents[1]=0
			match_Count=0

			# Compile the pattern and also enable JIT compilation to make
			# matching faster.
			regex=PCRE2.pcre2_compile_8(pattern_To_Count,
			  c_size_t(len(pattern_To_Count)), c_uint32(0), byref(c_int()),
			  byref(c_size_t()), None)
			PCRE2.pcre2_jit_compile_8(regex, c_uint32(0x1))

			# Count how many matches there are for pattern in sequences.
			while PCRE2.pcre2_jit_match_8(regex,
			  sequences, c_size_t(sequences_Length),
			  c_size_t(match.contents[1]), c_uint32(0), match_Data, None)>=0:
				match_Count+=1

			PCRE2.pcre2_match_data_free_8(match_Data)
			PCRE2.pcre2_code_free_8(regex)

			# Send the result back to the manager process.
			worker_Pipe.send((index_For_Pattern_To_Count, match_Count))

		# If we got here, then this is a task requesting that replacements be
		# made to the sequences string using all the (pattern, replacement)
		# tuples in the task.
		else:

			# We'll use two strings when doing all the replacements, searching
			# for patterns in prereplace_String and using postreplace_String to
			# store the string after the replacements have been made. After
			# each iteration these two then get swapped. Make both strings 10%
			# larger than the sequences string in order to accomodate some of
			# the replacements causing sequences to grow and also copy the
			# sequences string into prereplace_String for the initial iteration.
			string_Capacities=int(sequences_Length*1.1)
			prereplace_String=create_string_buffer(sequences.raw,
			  string_Capacities)
			prereplace_String_Length=sequences_Length
			postreplace_String=create_string_buffer(string_Capacities)
			postreplace_String_Length=string_Capacities

			# Iterate through all the (pattern, replacement) tuples in the task.
			for pattern, replacement in task:
				postreplace_String, postreplace_String_Length=replace(
				  pattern, replacement,
				  prereplace_String, prereplace_String_Length,
				  postreplace_String, postreplace_String_Length)

				# Swap prereplace_String and postreplace_String in preparation
				# for the next iteration.
				prereplace_String, postreplace_String= \
				  postreplace_String, prereplace_String
				prereplace_String_Length=postreplace_String_Length
				postreplace_String_Length=string_Capacities

			# If any replacements were made, they'll be in prereplace_String
			# instead of postreplace_String because of the swap done after each
			# iteration. Send its length back to the manager process.
			worker_Pipe.send(prereplace_String_Length)


if __name__ == '__main__':
	# Read in input from stdin and also get the input_Length.
	input=stdin.buffer.read()
	input_Length=len(input)


	# Set up some shared memory for the sequences string so that it can be
	# shared between all the processes and make it the same length as the
	# input_Length.
	sequences=RawArray(c_char, input_Length)
	sequences_Length=input_Length

	# Find all sequence descriptions and new lines in input, replace them
	# with empty strings, and store the result in the sequences string.
	sequences, sequences_Length=replace(b">.*\\n|\\n", b"", input, input_Length,
	  sequences, sequences_Length)

	# We'll be using the sequences string instead of the input string from now
	# on so delete our reference to it since this can often free up the memory
	# that was used by it.
	del input


	# Start a worker subprocess on each processor that is available to us and
	# send each worker subprocess the sequences string & a worker_Pipe to use
	# for communicating with the manager process.
	manager_Pipes=[]
	for i in range(cpu_count() or 1):
		manager_Pipe, worker_Pipe=Pipe()
		manager_Pipes.append(manager_Pipe)
		Process(target=process_Task,
		  args=(worker_Pipe, sequences, sequences_Length)).start()


	# Wait for the first worker subproces to send us a None object that
	# indicates it's ready to start processing tasks and then have it start
	# working on performing all the replacements serially.
	manager_Pipes[0].recv()
	manager_Pipes[0].send((
	    (b"tHa[Nt]", b"<4>"),
	    (b"aND|caN|Ha[DS]|WaS", b"<3>"),
	    (b"a[NSt]|BY", b"<2>"),
	    (b"<[^>]*>", b"|"),
	    (b"\\|[^|][^|]*\\|", b"-")
	  ))


	count_Info=[
	    b"agggtaaa|tttaccct",
	    b"[cgt]gggtaaa|tttaccc[acg]",
	    b"a[act]ggtaaa|tttacc[agt]t",
	    b"ag[act]gtaaa|tttac[agt]ct",
	    b"agg[act]taaa|ttta[agt]cct",
	    b"aggg[acg]aaa|ttt[cgt]ccct",
	    b"agggt[cgt]aa|tt[acg]accct",
	    b"agggta[cgt]a|t[acg]taccct",
	    b"agggtaa[cgt]|[acg]ttaccct"
	  ]

	# Now the manger process needs to start managing all the worker subprocesses
	# by waiting for them to become ready to process tasks, handling any results
	# that they send back, sending them any remaining counting tasks, and then
	# finally telling them when it's OK for them to exit (when there are no more
	# tasks to process).
	index_For_Next_Count=0
	while manager_Pipes:

		# Wait for any one of the manager_Pipes to receive something.
		for manager_Pipe in wait(manager_Pipes):
			result=manager_Pipe.recv()

			# If the result is an int, then it's the postreplace_Length that
			# resulted after applying all the replacments that were specified
			# above.
			if type(result) is int:
				postreplace_Length=result

			# If the result is a tuple, then it's the results from one of the
			# counting tasks for the patterns in count_Info[]. The first element
			# is the index of the pattern that the result is for and the second
			# element is the number of matches for it. Add the number of matches
			# to count_Info[].
			elif type(result) is tuple:
				count_Info[result[0]]=[count_Info[result[0]], result[1]]


			# Send the worker subprocess the index_For_Next_Count and pattern to
			# work on if we haven't reached the end of count_Info[] yet.
			if index_For_Next_Count<len(count_Info):
				manager_Pipe.send((index_For_Next_Count,
				  count_Info[index_For_Next_Count]))
				index_For_Next_Count+=1

			# If we have reached the end of count_Info[] then there are no more
			# tasks to start working on so just send the worker subprocess None
			# to indicate it can exit and also stop keeping track of the
			# manger_Pipe for it.
			else:
				manager_Pipe.send(None)
				manager_Pipes.remove(manager_Pipe)


	# Print the match_Count for each pattern in count_Info[].
	for pattern, match_Count in count_Info:
		print(pattern.decode(), match_Count)

	# Print the size of the original input, the size of the input without the
	# sequence descriptions & new lines, and the size after having made all the
	# replacements.
	print()
	print(input_Length)
	print(sequences_Length)
	print(postreplace_Length)
    

notes, command-line, and program output

NOTES:
64-bit Ubuntu quad core
Python 3.11.1


Tue, 24 Jan 2023 20:00:53 GMT

MAKE:
mv regexredux.python3-2.python3 regexredux.python3-2.py
pyright .
No configuration file found.
No pyproject.toml file found.
stubPath /home/dunham/all-benchmarksgame/benchmarksgame_i53330/regexredux/tmp/typings is not a valid directory.
Assuming Python platform Linux
Searching for source files
Found 1 source file
pyright 1.1.288
/home/dunham/all-benchmarksgame/benchmarksgame_i53330/regexredux/tmp/regexredux.python3-2.py
  /home/dunham/all-benchmarksgame/benchmarksgame_i53330/regexredux/tmp/regexredux.python3-2.py:226:24 - error: Cannot access member "recv" for type "int"
    Member "recv" is unknown (reportGeneralTypeIssues)
  /home/dunham/all-benchmarksgame/benchmarksgame_i53330/regexredux/tmp/regexredux.python3-2.py:226:11 - error: Expected 1 more positional argument (reportGeneralTypeIssues)
  /home/dunham/all-benchmarksgame/benchmarksgame_i53330/regexredux/tmp/regexredux.python3-2.py:246:18 - error: Cannot access member "send" for type "int"
    Member "send" is unknown (reportGeneralTypeIssues)
  /home/dunham/all-benchmarksgame/benchmarksgame_i53330/regexredux/tmp/regexredux.python3-2.py:255:18 - error: Cannot access member "send" for type "int"
    Member "send" is unknown (reportGeneralTypeIssues)
  /home/dunham/all-benchmarksgame/benchmarksgame_i53330/regexredux/tmp/regexredux.python3-2.py:255:23 - error: Argument of type "None" cannot be assigned to parameter "__data" of type "ReadableBuffer" in function "send"
    Type "None" cannot be assigned to type "ReadableBuffer"
      Type "None" cannot be assigned to type "ReadOnlyBuffer"
      Type "None" cannot be assigned to type "bytearray"
      Type "None" cannot be assigned to type "memoryview"
      Type "None" cannot be assigned to type "array[Any]"
      Type "None" cannot be assigned to type "mmap"
      Type "None" cannot be assigned to type "_CData"
      Type "None" cannot be assigned to type "PickleBuffer" (reportGeneralTypeIssues)
  /home/dunham/all-benchmarksgame/benchmarksgame_i53330/regexredux/tmp/regexredux.python3-2.py:261:17 - error: Cannot access member "decode" for type "int"
    Member "decode" is unknown (reportGeneralTypeIssues)
  /home/dunham/all-benchmarksgame/benchmarksgame_i53330/regexredux/tmp/regexredux.python3-2.py:269:8 - error: "postreplace_Length" is possibly unbound (reportUnboundVariable)
7 errors, 0 warnings, 0 informations 
Completed in 2.121sec
make: [/home/dunham/all-benchmarksgame/2000-benchmarksgame/nanobench/makefiles/u64q.programs.Makefile:388: regexredux.python3-2.python3_run] Error 1 (ignored)

3.86s to complete and log all make actions

COMMAND LINE:
/opt/src/Python-3.11.1/bin/python3 -OO regexredux.python3-2.py 0 < regexredux-input5000000.txt

PROGRAM OUTPUT:
agggtaaa|tttaccct 356
[cgt]gggtaaa|tttaccc[acg] 1250
a[act]ggtaaa|tttacc[agt]t 4252
ag[act]gtaaa|tttac[agt]ct 2894
agg[act]taaa|ttta[agt]cct 5435
aggg[acg]aaa|ttt[cgt]ccct 1537
agggt[cgt]aa|tt[acg]accct 1431
agggta[cgt]a|t[acg]taccct 1608
agggtaa[cgt]|[acg]ttaccct 2178

50833411
50000000
27388361