The Computer Language
24.12 Benchmarks Game

regex-redux Ada 2012 GNAT #6 program

source code

--
-- The Computer Language Benchmarks Game
-- https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
--
-- contributed by Francois Fabien (juillet 2012)
--
-- regex-dna benchmark using PCRE
-- with concurrent processing (multi-core).
--
-- compile with:
--   gnatmake -O3 -gnatn -gnatp Regexredux.adb
--
pragma Restrictions (No_Abort_Statements);
pragma Restrictions (Max_Select_Alternatives => 0);
pragma Restrictions (Max_Asynchronous_Select_Nesting => 0);
pragma Suppress (All_Checks);

with Text_IO;           use Text_IO;
with Block_Input;       use Block_Input;
with Pcre;              use Pcre;
with DNA;               use DNA;
with System.Task_Info;  use System.Task_Info;
with Ada.Strings.Fixed; use Ada.Strings.Fixed;
with Ada.Strings.Maps;  use Ada.Strings;

procedure Regexredux is

   package Int_IO is new Integer_IO (Integer);

   Max_Size  : constant := 51_000_000; -- max input size
   Input_Str : String_Access := new String (1 .. Max_Size);

   DNA_Code : String_Access;

   Initial_Length, Code_Length, Seq_Length : Natural := 0;

   type Count_Nbrs is range 0 .. 2 ** 15 - 1;
   for Count_Nbrs'Size use 16;

   type Counts_Arrays is array (Variant_Index) of Count_Nbrs;
   Counts : Counts_Arrays;

   Core_Nbr : constant Positive := Number_Of_Processors;

   -- remove headers and ends of line from input file.
   procedure Strip_Input
     (Source, Dest : in String_Access;
      Last_Src     : in Natural;
      Last_Dest    : out Natural)
   is

      EOL        : constant Maps.Character_Set := Maps.To_Set (ASCII.LF);
      End_Of_Cmt : Natural;
      Test_Char  : Character;
      Next, Pos  : Positive                    := 1;
   begin
      while Next <= Last_Src loop
         Test_Char := Source.all (Next);
         case Test_Char is
            when ASCII.LF =>
               Next := Next + 1;
            when '>' =>
               End_Of_Cmt := Index (Source.all (Next .. Next + 100), EOL);
               Next       := End_Of_Cmt + 1;
            when others =>
               Dest.all (Pos) := Test_Char;
               Pos            := Pos + 1;
               Next           := Next + 1;
         end case;
      end loop;
      Last_Dest := Pos - 1;
   end Strip_Input;

   procedure Count_Variant (Var : in Variant_Index; Count : out Count_Nbrs) is
      Regexp       : Pcre_Type;
      Regexp_Extra : Pcre_Extra_type;
      Retcode      : Integer;
      Position     : Natural    := 0;
      L_Count      : Count_Nbrs := 0;
      m0, m1       : Integer;
   begin
      Compile
        (Pattern       => Variant_Labels (Var).all,
         Options       => 0,
         Matcher       => Regexp,
         Matcher_Extra => Regexp_Extra);

      loop
         Match
           (Regexp,
            Regexp_Extra,
            DNA_Code.all (1)'Address,
            Code_Length,
            Position,
            0,
            m0,
            m1,
            Retcode);
         exit when Retcode < 0;
         L_Count  := L_Count + 1;
         Position := m1;
      end loop;
      Free (Regexp);
      Free (Regexp_Extra);
      Count := L_Count;

   end Count_Variant;

   procedure Replace_Variants
     (Source    : in String_Access;
      Last_Src  : in Natural;
      Last_Dest : out Natural)
   is

      Dest_Size    : constant Natural := Integer (Float (Last_Src) * 1.35);
      Dest         : String_Access    := new String (1 .. Dest_Size);
      Regexp       : Pcre_Type;
      Regexp_Extra : Pcre_Extra_type;
      Retcode      : Integer;
      Position     : Integer;
      Char_To_Copy : Integer;
      Start, Stop  : Integer          := 1;
      m0, m1       : Integer;
      IUB          : Mut;
   begin

      Compile
        (Pattern       => "[BDHKMNRSVWY]",
         Options       => 0,
         Matcher       => Regexp,
         Matcher_Extra => Regexp_Extra);

      Position := 0;
      loop
         Match
           (Regexp,
            Regexp_Extra,
            Source.all (1)'Address,
            Last_Src,
            Position,
            0,
            m0,
            m1,
            Retcode);
         exit when Retcode < 0;
         Char_To_Copy         := m0 - Position;
         Stop                 := Start + Char_To_Copy - 1;
         Dest (Start .. Stop) := Source (Position + 1 .. m0);
         -- insert IUB into destination string
         IUB                          := to_Mut (Source.all (m1));
         Start                        := Stop + Iub_Table (IUB).Len + 1;
         Dest (Stop + 1 .. Start - 1) := Iub_Table (IUB).Alt.all;
         Position                     := m1;

      end loop;
      -- copy remaining part of the source
      Char_To_Copy         := Last_Src - Position;
      Stop                 := Start + Char_To_Copy - 1;
      Dest (Start .. Stop) := Source (Position + 1 .. Last_Src);
      Free (Regexp);
      Free (Regexp_Extra);
      Free (Dest);

      Last_Dest := Stop;
   end Replace_Variants;

   procedure Parallel_Job (Cores : in Positive) is

      -- synchronize the variant countings and replacing.
      protected Dispatcher is
         entry Give (Variant : out Natural);
         entry Take (Variant : in Natural; Count : in Count_Nbrs);
         entry Endrep;
         entry Report (Result : out Counts_Arrays);
      private
         Done               : Boolean := False;
         Counted, Replaced  : Boolean := False;
         Cur_Job, Jobs_Done : Natural := 0;
         Var_Counts         : Counts_Arrays;
      end Dispatcher;

      protected body Dispatcher is
         -- assign a job to a worker
         entry Give (Variant : out Natural) when True is
         begin
            if Cur_Job = Var_Size then
               -- when all jobs are given send termination signal to workers
               Variant := 0;
            else
               Cur_Job := Cur_Job + 1;
               Variant := Cur_Job;
            end if;
         end Give;
         -- retrieve the result from a worker task
         entry Take (Variant : in Natural; Count : in Count_Nbrs) when True is
         begin
            Var_Counts (Variant_Index (Variant))  := Count;
            Jobs_Done                             := Jobs_Done + 1;
            if Jobs_Done = Var_Size then -- can transfer the result
               Counted := True;
               Done    := Replaced;-- shortcut for Counted and Replaced
            end if;
         end Take;
         -- signalling end of replacement
         entry Endrep when True is
         begin
            Replaced := True;
            Done     := Counted; -- shortcut for Counted and Replaced
         end Endrep;
         -- when all jobs are completed, give way to the main
         entry Report (Result : out Counts_Arrays) when Done is
         begin
            Result := Var_Counts;
         end Report;

      end Dispatcher;

      -- Each Worker compute a single variant count in parallel
      task type Workers;

      task body Workers is
         Var     : Natural;
         L_Count : Count_Nbrs;
      begin
         Busy : loop
            Dispatcher.Give (Var);
            exit Busy when Var = 0;
            Count_Variant (Variant_Index (Var), L_Count);
            Dispatcher.Take (Var, L_Count);
         end loop Busy;
      end Workers;

      Workshop : array (1 .. Cores) of Workers;

      task Replacer;

      task body Replacer is
      begin
         Replace_Variants (DNA_Code, Code_Length, Seq_Length);
         Dispatcher.Endrep;
      end Replacer;

   begin
      -- wait for the jobs to be completed
      Dispatcher.Report (Counts);
      delay 0.01; -- leave time for tasks termination.
   end Parallel_Job;

begin
   Open_Stdin;
   Read (Input_Str.all, Initial_Length);
   Close_Stdin;

   DNA_Code := new String (1 .. Initial_Length);

   Strip_Input (Input_Str, DNA_Code, Initial_Length, Code_Length);
   Free (Input_Str);

   if Core_Nbr > 1 then
      Parallel_Job (Core_Nbr);
   else
      for V in Variant_Index'Range loop
         Count_Variant (V, Counts (V));
      end loop;
      Replace_Variants (DNA_Code, Code_Length, Seq_Length);
   end if;

   for V in Variant_Index'Range loop
      Put_Line (Variant_Labels (V).all & Count_Nbrs'Image (Counts (V)));
   end loop;

   New_Line;
   Int_IO.Put (Initial_Length, Width => 6);
   New_Line;
   Int_IO.Put (Code_Length, Width => 6);
   New_Line;
   Int_IO.Put (Seq_Length, Width => 6);
   New_Line;

   Free (DNA_Code);
end Regexredux;
------------------------------------------------------------------------------
-- Constants
------------------------------------------------------------------------------
with Unchecked_Conversion;
with Unchecked_Deallocation;

package DNA is

   type String_Access is access all String;
   procedure Free is new Unchecked_Deallocation (String, String_Access);


   Var_Size : constant := 9;
   type Variant_Index is range 1 .. Var_Size;
   for Variant_Index'Size use 8;

   Variant_Labels : constant array (Variant_Index) of String_Access :=
     (new String'("agggtaaa|tttaccct"),
      new String'("[cgt]gggtaaa|tttaccc[acg]"),
      new String'("a[act]ggtaaa|tttacc[agt]t"),
      new String'("ag[act]gtaaa|tttac[agt]ct"),
      new String'("agg[act]taaa|ttta[agt]cct"),
      new String'("aggg[acg]aaa|ttt[cgt]ccct"),
      new String'("agggt[cgt]aa|tt[acg]accct"),
      new String'("agggta[cgt]a|t[acg]taccct"),
      new String'("agggtaa[cgt]|[acg]ttaccct"));

   type Mut is ('B','D','H','K','M','N','R','S','V','W','Y');
   for Mut'Size use Character'Size;

   for Mut use -- map to character values
     ('B' => 66,
      'D' => 68,
      'H' => 72,
      'K' => 75,
      'M' => 77,
      'N' => 78,
      'R' => 82,
      'S' => 83,
      'V' => 86,
      'W' => 87,
      'Y' => 89);

   function to_Mut is new Unchecked_Conversion (
      Source => Character,
      Target => Mut);

   type Iub_Rec is record
      Alt : String_Access;
      Len : Positive;
   end record;

   Iub_Table : constant array (Mut) of Iub_Rec :=
     ((new String'("(c|g|t)"), 7),
      (new String'("(a|g|t)"), 7),
      (new String'("(a|c|t)"), 7),
      (new String'("(g|t)"), 5),
      (new String'("(a|c)"), 5),
      (new String'("(a|c|g|t)"), 9),
      (new String'("(a|g)"), 5),
      (new String'("(c|g)"), 5),
      (new String'("(a|c|g)"), 7),
      (new String'("(a|t)"), 5),
      (new String'("(c|t)"), 5));
end DNA;
-----------------------------------------------------------------------
--  interface to library PCRE : regular expression
-----------------------------------------------------------------------
with System; use System;

package Pcre is

   Pcre_Error : exception;

   type Pcre_Type is private;
   type Pcre_Extra_type is private;

   Null_Pcre       : constant Pcre_Type;
   Null_Pcre_Extra : constant Pcre_Extra_type;

   procedure Compile
     (Pattern       : in String;
      Options       : in Integer;
      Matcher       : out Pcre_Type;
      Matcher_Extra : out Pcre_Extra_type);

   procedure Match
     (Matcher             : in Pcre_Type;
      Matcher_Extra       : in Pcre_Extra_type;
      Subject             : System.Address;
      -- Address of the first element of a string;
      Length, Startoffset : in Integer;
      Options             : in Integer;
      Match_0, Match_1    : out Integer;
      Result              : out Integer);

   procedure Free (M : Pcre_Type);

   procedure Free (M : Pcre_Extra_type);

private

   type Pcre_Type is new System.Address;
   type Pcre_Extra_type is new System.Address;

   Null_Pcre       : constant Pcre_Type       := Pcre_Type (Null_Address);
   Null_Pcre_Extra : constant Pcre_Extra_type :=
      Pcre_Extra_type (Null_Address);

end Pcre;
------------------------------------------------------------------------------
with Interfaces.C.Strings;     use Interfaces.C.Strings;
with Interfaces.C;             use Interfaces.C;
with Ada.Unchecked_Conversion;

package body Pcre is

   pragma Linker_Options ("-lpcre");

   use Interfaces;

   function To_chars_ptr is new Ada.Unchecked_Conversion (
      Address,
      chars_ptr);

   function Pcre_Compile
     (pattern   : chars_ptr;
      options   : Integer;
      errptr    : access chars_ptr;
      erroffset : access Integer;
      tableptr  : chars_ptr)
      return      Pcre_Type;
   pragma Import (C, Pcre_Compile, "pcre_compile");

   function Pcre_Study
     (code    : Pcre_Type;
      options : Integer;
      errptr  : access chars_ptr)
      return    Pcre_Extra_type;
   pragma Import (C, Pcre_Study, "pcre_study");

   function Pcre_Exec
     (code        : Pcre_Type;
      extra       : Pcre_Extra_type;
      subject     : chars_ptr;
      length      : Integer;
      startoffset : Integer;
      options     : Integer;
      ovector     : System.Address;
      ovecsize    : C.int)
      return        Integer;
   pragma Import (C, Pcre_Exec, "pcre_exec");

   procedure Compile
     (Pattern       : in String;
      Options       : in Integer;
      Matcher       : out Pcre_Type;
      Matcher_Extra : out Pcre_Extra_type)
   is
      Regexp       : Pcre_Type;
      Regexp_Extra : Pcre_Extra_type;
      Error_Ptr    : aliased chars_ptr;
      Error_Offset : aliased Integer;
      Pat          : chars_ptr := New_String (Pattern);
   begin
      Regexp :=
         Pcre_Compile
           (Pat,
            Options,
            Error_Ptr'Access,
            Error_Offset'Access,
            Null_Ptr);
      Free (Pat);

      if Regexp = Null_Pcre then
         raise Pcre_Error;
      end if;
      Matcher      := Regexp;
      Regexp_Extra := Pcre_Study (Regexp, 0, Error_Ptr'Access);
      if Regexp_Extra = Null_Pcre_Extra then
         raise Pcre_Error;
      end if;
      Matcher_Extra := Regexp_Extra;
   end Compile;

   procedure Match
     (Matcher             : in Pcre_Type;
      Matcher_Extra       : in Pcre_Extra_type;
      Subject             : System.Address;
      -- Address of the first element of a string;
      Length, Startoffset : in Integer;
      Options             : in Integer;
      Match_0, Match_1    : out Integer;
      Result              : out Integer)
   is
      Vecsize : constant := 3; -- top-level matching

      m : array (0 .. Vecsize - 1) of C.int;
      pragma Convention (C, m);
      pragma Volatile (m); -- used by the C library

      Start  : constant chars_ptr :=
         To_chars_ptr (Subject);
   begin

      Result  :=
         Pcre_Exec
           (Matcher,
            Matcher_Extra,
            Start,
            Length,
            Startoffset,
            Options,
            m (0)'Address,
            C.int (Vecsize));
      Match_0 := Integer (m (0));
      Match_1 := Integer (m (1));

   end Match;

   type Access_Free is access procedure (Item : System.Address);
   Pcre_Free : Access_Free;
   pragma Import (C, Pcre_Free, "pcre_free");

   procedure Free (M : Pcre_Type) is
   begin
      Pcre_Free (System.Address (M));
   end Free;

   procedure Free (M : Pcre_Extra_type) is
   begin
      Pcre_Free (System.Address (M));
   end Free;

end Pcre;
-------------------------------------------------------------------------------
package Block_Input is

   procedure Read (Item : in out String; Last : out Natural);

   procedure Open_Stdin;

   procedure Close_Stdin;

   pragma Inline (Read);

end Block_Input;
------------------------------------------------------------------------------
with Ada.Streams.Stream_IO;
with Unchecked_Conversion;

package body Block_Input is

   use Ada.Streams;

   Stdin : Stream_IO.File_Type;

   procedure Read (Item : in out String; Last : out Natural) is

      Block_Size : constant := 16 * 1024;

      subtype Index is Stream_Element_Offset range
         Stream_Element_Offset (1) .. Stream_Element_Offset (Block_Size);
      subtype XString is String (1 .. Block_Size);
      subtype XBytes is Stream_Element_Array (Index);
      function To_String is new Unchecked_Conversion (
         Source => XBytes,
         Target => XString);

      One_Block : XBytes;
      Str_Block : XString;
      Final     : Stream_Element_Offset;
      Start     : Natural := Item'First;
      Stop      : Natural;
   begin
      while not Stream_IO.End_Of_File (Stdin) loop
         Stream_IO.Read (Stdin, One_Block, Final);
         Str_Block            := To_String (One_Block);
         Stop                 := Start + Natural (Final) - 1;
         Item (Start .. Stop) := Str_Block (1 .. Natural (Final));
         Start                := Stop + 1;
      end loop;
      Last := Stop;
   end Read;

   procedure Open_Stdin is
   begin
      Stream_IO.Open
        (File => Stdin,
         Mode => Stream_IO.In_File,
         Name => "/dev/stdin");
   end Open_Stdin;

   procedure Close_Stdin is
   begin
      Stream_IO.Close (Stdin);
   end Close_Stdin;

end Block_Input;
    

notes, command-line, and program output

NOTES:
64-bit Ubuntu quad core
GNATMAKE 13.3.0


 Tue, 22 Oct 2024 21:34:12 GMT

MAKE:
gnatchop -r -w regexredux.gnat-6.gnat
splitting regexredux.gnat-6.gnat into:
   regexredux.adb
   dna.ads
   pcre.ads
   pcre.adb
   block_input.ads
   block_input.adb
gnatmake -O3 -fomit-frame-pointer -march=ivybridge -gnatn2 -f regexredux.adb -o regexredux.gnat-6.gnat_run 
x86_64-linux-gnu-gcc-13 -c -O3 -fomit-frame-pointer -march=ivybridge -gnatn2 regexredux.adb
x86_64-linux-gnu-gcc-13 -c -O3 -fomit-frame-pointer -march=ivybridge -gnatn2 block_input.adb
x86_64-linux-gnu-gcc-13 -c -O3 -fomit-frame-pointer -march=ivybridge -gnatn2 dna.ads
x86_64-linux-gnu-gcc-13 -c -O3 -fomit-frame-pointer -march=ivybridge -gnatn2 pcre.adb
x86_64-linux-gnu-gnatbind-13 -x regexredux.ali
x86_64-linux-gnu-gnatlink-13 regexredux.ali -O3 -fomit-frame-pointer -march=ivybridge -o regexredux.gnat-6.gnat_run

4.58s to complete and log all make actions

COMMAND LINE:
 ./regexredux.gnat-6.gnat_run 0 < regexredux-input50000.txt

UNEXPECTED OUTPUT 

13c13
< 668262
---
> 273927

PROGRAM OUTPUT:
agggtaaa|tttaccct 3
[cgt]gggtaaa|tttaccc[acg] 12
a[act]ggtaaa|tttacc[agt]t 43
ag[act]gtaaa|tttac[agt]ct 27
agg[act]taaa|ttta[agt]cct 58
aggg[acg]aaa|ttt[cgt]ccct 16
agggt[cgt]aa|tt[acg]accct 15
agggta[cgt]a|t[acg]taccct 18
agggtaa[cgt]|[acg]ttaccct 20

508411
500000
668262