k-nucleotide C clang program

source code

// The Computer Language Benchmarks Game
// https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
//
// Contributed by Jeremy Zerfas

// This controls the maximum length for each set of oligonucleotide frequencies
// and each oligonucleotide count output by this program.
#define MAXIMUM_OUTPUT_LENGTH 4096

#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>

#include <khash.h>

// Define a custom hash function to use instead of khash's default hash
// function. This custom hash function uses a simpler bit shift and XOR which
// results in several percent faster performance compared to when khash's
// default hash function is used.
#define CUSTOM_HASH_FUNCTION(key) ((key) ^ (key)>>7)

KHASH_INIT(oligonucleotide, uint64_t, uint32_t, 1, CUSTOM_HASH_FUNCTION
  , kh_int64_hash_equal)

// intptr_t should be the native integer type on most sane systems.
typedef intptr_t intnative_t;

typedef struct {
	uint64_t	key;
	uint32_t	value;
} element;


// Macro to convert a nucleotide character to a code. Note that upper and lower
// case ASCII letters only differ in the fifth bit from the right and we only
// need the three least significant bits to differentiate the letters 'A', 'C',
// 'G', and 'T'. Spaces in this array/string will never be used as long as
// characters other than 'A', 'C', 'G', and 'T' aren't used.
#define code_For_Nucleotide(nucleotide) (" \0 \1\3  \2"[nucleotide & 0x7])


// And one more macro to convert the codes back to nucleotide characters.
#define nucleotide_For_Code(code) ("ACGT"[code & 0x3])


// Function to use when sorting elements with qsort() later. Elements with
// larger values will come first and in cases of identical values then elements
// with smaller keys will come first.
static int element_Compare(const element * const left_Element
  , const element * const right_Element){

	// Sort based on element values.
	if(left_Element->value < right_Element->value) return 1;
	if(left_Element->value > right_Element->value) return -1;

	// If we got here then both items have the same value so then sort based on
	// key.
	return left_Element->key > right_Element->key ? 1 : -1;
}


// Generate frequencies for all oligonucleotides in polynucleotide that are of
// desired_Length_For_Oligonucleotides and then save it to output.
static void generate_Frequencies_For_Desired_Length_Oligonucleotides(
  const char * const polynucleotide, const intnative_t polynucleotide_Length
  , const intnative_t desired_Length_For_Oligonucleotides, char * const output){

	khash_t(oligonucleotide) * hash_Table=kh_init(oligonucleotide);

	uint64_t key=0;
	const uint64_t mask=((uint64_t)1<<2*desired_Length_For_Oligonucleotides)-1;

	// For the first several nucleotides we only need to append them to key in
	// preparation for the insertion of complete oligonucleotides to hash_Table.
	for(intnative_t i=0; i<desired_Length_For_Oligonucleotides-1; i++)
		key=(key<<2 & mask) | polynucleotide[i];

	// Add all the complete oligonucleotides of
	// desired_Length_For_Oligonucleotides to hash_Table and update the count
	// for each oligonucleotide.
	for(intnative_t i=desired_Length_For_Oligonucleotides-1
	  ; i<polynucleotide_Length; i++){

		key=(key<<2 & mask) | polynucleotide[i];

		int element_Was_Unused;
		const khiter_t k=kh_put(oligonucleotide, hash_Table, key
		  , &element_Was_Unused);

		// If the element_Was_Unused, then initialize the count to 1, otherwise
		// increment the count.
		if(element_Was_Unused)
			kh_value(hash_Table, k)=1;
		else
			kh_value(hash_Table, k)++;
	}

	// Create an array of elements from hash_Table.
	intnative_t elements_Array_Size=kh_size(hash_Table), i=0;
	element * elements_Array=malloc(elements_Array_Size*sizeof(element));
	uint32_t value;
	kh_foreach(hash_Table, key, value
	  , elements_Array[i++]=((element){key, value}));

	kh_destroy(oligonucleotide, hash_Table);

	// Sort elements_Array.
	qsort(elements_Array, elements_Array_Size, sizeof(element)
	  , (int (*)(const void *, const void *)) element_Compare);

	// Print the frequencies for each oligonucleotide.
	for(intnative_t output_Position=0, i=0; i<elements_Array_Size; i++){

		// Convert the key for the oligonucleotide to a string.
		char oligonucleotide[desired_Length_For_Oligonucleotides+1];
		for(intnative_t j=desired_Length_For_Oligonucleotides-1; j>-1; j--){
			oligonucleotide[j]=nucleotide_For_Code(elements_Array[i].key);
			elements_Array[i].key>>=2;
		}
		oligonucleotide[desired_Length_For_Oligonucleotides]='\0';

		// Output the frequency for oligonucleotide to output.
		output_Position+=snprintf(output+output_Position
		  , MAXIMUM_OUTPUT_LENGTH-output_Position, "%s %.3f\n", oligonucleotide
		  , 100.0f*elements_Array[i].value
		  /(polynucleotide_Length-desired_Length_For_Oligonucleotides+1));
	}

	free(elements_Array);
}


// Generate a count for the number of times oligonucleotide appears in
// polynucleotide and then save it to output.
static void generate_Count_For_Oligonucleotide(
  const char * const polynucleotide, const intnative_t polynucleotide_Length
  , const char * const oligonucleotide, char * const output){
	const intnative_t oligonucleotide_Length=strlen(oligonucleotide);

	khash_t(oligonucleotide) * const hash_Table=kh_init(oligonucleotide);

	uint64_t key=0;
	const uint64_t mask=((uint64_t)1<<2*oligonucleotide_Length)-1;

	// For the first several nucleotides we only need to append them to key in
	// preparation for the insertion of complete oligonucleotides to hash_Table.
	for(intnative_t i=0; i<oligonucleotide_Length-1; i++)
		key=(key<<2 & mask) | polynucleotide[i];

	// Add all the complete oligonucleotides of oligonucleotide_Length to
	// hash_Table and update the count for each oligonucleotide.
	for(intnative_t i=oligonucleotide_Length-1; i<polynucleotide_Length; i++){

		key=(key<<2 & mask) | polynucleotide[i];

		int element_Was_Unused;
		const khiter_t k=kh_put(oligonucleotide, hash_Table, key
		  , &element_Was_Unused);

		// If the element_Was_Unused, then initialize the count to 1, otherwise
		// increment the count.
		if(element_Was_Unused)
			kh_value(hash_Table, k)=1;
		else
			kh_value(hash_Table, k)++;
	}

	// Generate the key for oligonucleotide.
	key=0;
	for(intnative_t i=0; i<oligonucleotide_Length; i++)
		key=(key<<2) | code_For_Nucleotide(oligonucleotide[i]);

	// Output the count for oligonucleotide to output.
	khiter_t k=kh_get(oligonucleotide, hash_Table, key);
	uintmax_t count=k==kh_end(hash_Table) ? 0 : kh_value(hash_Table, k);
	snprintf(output, MAXIMUM_OUTPUT_LENGTH, "%ju\t%s", count, oligonucleotide);

	kh_destroy(oligonucleotide, hash_Table);
}


int main(){
	char buffer[4096];

	// Find the start of the third polynucleotide.
	while(fgets(buffer, sizeof(buffer), stdin) && memcmp(">THREE", buffer
	  , sizeof(">THREE")-1));

	// Start with 1 MB of storage for reading in the polynucleotide and grow
	// geometrically.
	intnative_t polynucleotide_Capacity=1048576;
	intnative_t polynucleotide_Length=0;
	char * polynucleotide=malloc(polynucleotide_Capacity);

	// Start reading and encoding the third polynucleotide.
	while(fgets(buffer, sizeof(buffer), stdin) && buffer[0]!='>'){
		for(intnative_t i=0; buffer[i]!='\0'; i++)
			if(buffer[i]!='\n')
				polynucleotide[polynucleotide_Length++]
				  =code_For_Nucleotide(buffer[i]);

		// Make sure we still have enough memory allocated for any potential
		// nucleotides in the next line.
		if(polynucleotide_Capacity-polynucleotide_Length<sizeof(buffer))
			polynucleotide=realloc(polynucleotide, polynucleotide_Capacity*=2);
	}

	// Free up any leftover memory.
	polynucleotide=realloc(polynucleotide, polynucleotide_Length);

	char output_Buffer[7][MAXIMUM_OUTPUT_LENGTH];

	// Do the following functions in parallel.
	#pragma omp parallel sections
	{
		#pragma omp section
		generate_Count_For_Oligonucleotide(polynucleotide
		  , polynucleotide_Length, "GGTATTTTAATTTATAGT", output_Buffer[6]);
		#pragma omp section
		generate_Count_For_Oligonucleotide(polynucleotide
		  , polynucleotide_Length, "GGTATTTTAATT", output_Buffer[5]);
		#pragma omp section
		generate_Count_For_Oligonucleotide(polynucleotide
		  , polynucleotide_Length, "GGTATT", output_Buffer[4]);
		#pragma omp section
		generate_Count_For_Oligonucleotide(polynucleotide
		  , polynucleotide_Length, "GGTA", output_Buffer[3]);
		#pragma omp section
		generate_Count_For_Oligonucleotide(polynucleotide
		  , polynucleotide_Length, "GGT", output_Buffer[2]);

		#pragma omp section
		generate_Frequencies_For_Desired_Length_Oligonucleotides(polynucleotide
		  , polynucleotide_Length, 2, output_Buffer[1]);
		#pragma omp section
		generate_Frequencies_For_Desired_Length_Oligonucleotides(polynucleotide
		  , polynucleotide_Length, 1, output_Buffer[0]);
	}

	// Output the results to stdout.
	for(intnative_t i=0; i<7; printf("%s\n", output_Buffer[i++]));

	free(polynucleotide);

	return 0;
}

notes, command-line, and program output

NOTES:
64-bit Ubuntu quad core
Ubuntu clang version 19.1.1 (1ubuntu1)


 Sun, 23 Feb 2025 00:44:48 GMT

MAKE:
/usr/bin/clang -pipe -Wall -O3 -fomit-frame-pointer -march=ivybridge -fopenmp -IInclude knucleotide.c -o knucleotide.clang_run 
rm knucleotide.c

5.85 seconds to complete and log all make actions

COMMAND LINE:
 ./knucleotide.clang_run 0 < knucleotide-input25000000.txt

PROGRAM OUTPUT:
A 30.295
T 30.151
C 19.800
G 19.754

AA 9.177
TA 9.132
AT 9.131
TT 9.091
CA 6.002
AC 6.001
AG 5.987
GA 5.984
CT 5.971
TC 5.971
GT 5.957
TG 5.956
CC 3.917
GC 3.911
CG 3.909
GG 3.902

1471758	GGT
446535	GGTA
47336	GGTATT
893	GGTATTTTAATT
893	GGTATTTTAATTTATAGT