The Computer Language
24.12 Benchmarks Game

regex-redux Free Pascal program

source code

Program regexredux;

(*
  The Computer Language Benchmarks Game
  https://salsa.debian.org/benchmarksgame-team/benchmarksgame/

  contributed by Vitaly Trifonov
  adapted for 'redux' by Peter Blackman
  fixes for fpc 3.0 by Michal Stransky
*)

{$mode objfpc}
uses sysutils;


(******************************    pcre wrap   *****************************)

const
  libpcre = 'pcre';
  PCRE_STUDY_JIT_COMPILE = $00001;


type
  pcre = Pointer;
  pcre_extra = Pointer;
  PPChar = ^PChar;


function pcre_compile( const pattern: PChar;
                       options: Integer;
                       const errptr: PPChar;
                       erroffset: PInteger;
                       const tableptr: PChar ): pcre; cdecl; external libpcre;

function pcre_exec( const code: pcre;
                    const extra: pcre_extra;
                    const subject: PChar;
                    length, startoffset, options: Integer;
                    ovector: PInteger;
                    ovecsize: Integer ): Integer; cdecl; external libpcre;

function pcre_study( const external_re: pcre;
                     options: integer;
                     errorptr: PPChar ): pcre_extra; cdecl; external libpcre;

(***************************************************************************)

const
    patt: array[1..5] of pChar = (
        'tHa[Nt]',
        'aND|caN|Ha[DS]|WaS',
        'a[NSt]|BY',
        '<[^>]*>',
        '\|[^|][^|]*\|');

    repl: array[1..5] of pChar = ('<4>', '<3>', '<2>', '|', '-');


var
  patterns: array[1..9] of PChar =
    (
      'agggtaaa|tttaccct',
      '[cgt]gggtaaa|tttaccc[acg]',
      'a[act]ggtaaa|tttacc[agt]t',
      'ag[act]gtaaa|tttac[agt]ct',
      'agg[act]taaa|ttta[agt]cct',
      'aggg[acg]aaa|ttt[cgt]ccct',
      'agggt[cgt]aa|tt[acg]accct',
      'agggta[cgt]a|t[acg]taccct',
      'agggtaa[cgt]|[acg]ttaccct'
    );


(* Count match with pattern of regexp in seq buffer. *)
function count( const pattern, seq: PChar; len: Integer ): Longint;
var
  cre: pcre;
  cre_ex: pcre_extra;
  err: PChar;
  ofs: Integer;
  ind: Longint = 0;
  m: array[0..2] of Integer;
begin
  cre := pcre_compile(pattern, 0, @err, @ofs, nil);
  cre_ex := pcre_study(cre, PCRE_STUDY_JIT_COMPILE, @err);
  m[1] := 0;

  while pcre_exec(cre,   cre_ex, seq, len,   m[1], 0, m, 3) >= 0 do
    ind += 1;

  count := ind
end;

function split_count ( const ipattern, seq: PChar; len: Integer ): Longint;
var
  split: PChar;
  vcount: Longint;
  pattern : Pchar;
begin
  pattern := StrNew(ipattern);
  split := strscan(pattern, '|');
  Byte(split^) := 0;

  vcount := count(pattern, seq, len);
  vcount += count(@split[1], seq, len);

  StrDispose(pattern);
  split_count := vcount
end;

(* Substitute pattern of regexp with repl, return new length. *)
function subst( const pattern, repl: PChar; var seq: PChar; len: Integer ): Longint;
var
  cre: pcre;
  cre_ex: pcre_extra;
  err: PChar;
  ofs: Integer;
  size_repl, size, bsize, pos: Longint;
  m: array[0..2] of Integer;
  newSeq, otmpseq: PChar;
begin
  cre := pcre_compile(pattern, 0, @err, @ofs, nil);
  cre_ex := pcre_study(cre, PCRE_STUDY_JIT_COMPILE, @err);
  size_repl := strlen(repl);
  m[1] := 0; size := 0;

(* Calculate required size for malloc. *)
  while pcre_exec(cre,   cre_ex, seq, len,   m[1], 0, m, 3) >= 0 do
    size += size_repl - m[1] + m[0];
  size += len;

  GetMem(newSeq, SizeOf(Char)*(size+1));

(* Do substitute. *)
  m[1] := 0; pos := 0;
  otmpseq := newSeq;


  if size_repl <> 0 then
    while pcre_exec(cre,   cre_ex, seq, len,   m[1], 0, m, 3) >= 0 do
    begin
      bsize := m[0] - pos;
      strlcopy(otmpseq, @seq[pos], bsize);

      otmpseq := @otmpseq[bsize];
      pos := m[1];

      otmpseq := strecopy(otmpseq, repl);
    end
  else
    while pcre_exec(cre,   cre_ex, seq, len,   m[1], 0, m, 3) >= 0 do
    begin
      bsize := m[0] - pos;
      strlcopy(otmpseq, @seq[pos], bsize);

      otmpseq := @otmpseq[bsize];
      pos := m[1];
    end;

  strcopy(otmpseq, @seq[pos]);

  FreeMem(seq);
  seq := newSeq;

  subst := size
end;


var
  i, seqLen : Longint;
  readLen   : Longint = 0;
  maxSeqLen : Longint = 6000000;
  seq       : PChar;

begin
  GetMem(seq, SizeOf(Char)*(maxSeqLen+1));

(* Read FASTA format file from stdin and count length. *)
  while not eof do
  begin
    if readLen = maxSeqLen then
    begin
      maxSeqLen += 3000000;
      seq := ReAllocMem(seq, SizeOf(Char)*(maxSeqLen+1));
    end;
    read(seq[readLen]);
    readLen += 1
  end;
  Byte(seq[readLen]) := 0; //end read data


(* Remove FASTA sequence descriptions and all linefeed characters.  *)
  seqLen := subst('>.*|\n', '', seq, readLen);


(* Count all matches of patterns[i] in  seq buffer. *)
  for i := 1 to length (patterns) do 
    writeln(patterns[i], ' ', split_count(patterns[i], seq, seqLen));

  writeln;
  writeln(readLen);
  writeln(seqLen);

(* All IUB substitutes. *)
  for i := 1 to length(patt) do
    seqLen := subst(patt[i], repl[i], seq, seqLen);

  writeln(seqLen);

  FreeMem(seq);
end.
    

notes, command-line, and program output

NOTES:
64-bit Ubuntu quad core
Free Pascal Compiler
version 3.2.2 [2021/05/16]


 Wed, 22 May 2024 21:34:11 GMT

MAKE:
mv regexredux.fpascal regexredux.pas
/opt/src/fpc-3.2.2/bin/fpc -FuInclude/fpascal -XXs -O3 -Ci- -Cr- -g- -CpCOREAVX -CfAVX -Tlinux  -oFPASCAL_RUN regexredux.pas
Free Pascal Compiler version 3.2.2 [2021/05/16] for x86_64
Copyright (c) 1993-2021 by Florian Klaempfl and others
Target OS: Linux for x86-64
Compiling regexredux.pas
Linking FPASCAL_RUN
211 lines compiled, 1.3 sec
mv FPASCAL_RUN regexredux.fpascal_run
rm regexredux.pas

1.58s to complete and log all make actions

COMMAND LINE:
 ./regexredux.fpascal_run 0 < regexredux-input5000000.txt

PROGRAM OUTPUT:
agggtaaa|tttaccct 356
[cgt]gggtaaa|tttaccc[acg] 1250
a[act]ggtaaa|tttacc[agt]t 4252
ag[act]gtaaa|tttac[agt]ct 2894
agg[act]taaa|ttta[agt]cct 5435
aggg[acg]aaa|ttt[cgt]ccct 1537
agggt[cgt]aa|tt[acg]accct 1431
agggta[cgt]a|t[acg]taccct 1608
agggtaa[cgt]|[acg]ttaccct 2178

50833411
50000000
27388361