The Computer Language
Benchmarks Game

k-nucleotide C clang program

source code

// The Computer Language Benchmarks Game
// https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
//
// Contributed by Jeremy Zerfas

// This controls the maximum length for each set of oligonucleotide frequencies
// and each oligonucleotide count output by this program.
#define MAXIMUM_OUTPUT_LENGTH 4096

#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>

#include <khash.h>

// Define a custom hash function to use instead of khash's default hash
// function. This custom hash function uses a simpler bit shift and XOR which
// results in several percent faster performance compared to when khash's
// default hash function is used.
#define CUSTOM_HASH_FUNCTION(key) ((key) ^ (key)>>7)

KHASH_INIT(oligonucleotide, uint64_t, uint32_t, 1, CUSTOM_HASH_FUNCTION
  , kh_int64_hash_equal)

// intptr_t should be the native integer type on most sane systems.
typedef intptr_t intnative_t;

typedef struct {
	uint64_t	key;
	uint32_t	value;
} element;


// Macro to convert a nucleotide character to a code. Note that upper and lower
// case ASCII letters only differ in the fifth bit from the right and we only
// need the three least significant bits to differentiate the letters 'A', 'C',
// 'G', and 'T'. Spaces in this array/string will never be used as long as
// characters other than 'A', 'C', 'G', and 'T' aren't used.
#define code_For_Nucleotide(nucleotide) (" \0 \1\3  \2"[nucleotide & 0x7])


// And one more macro to convert the codes back to nucleotide characters.
#define nucleotide_For_Code(code) ("ACGT"[code & 0x3])


// Function to use when sorting elements with qsort() later. Elements with
// larger values will come first and in cases of identical values then elements
// with smaller keys will come first.
static int element_Compare(const element * const left_Element
  , const element * const right_Element){

	// Sort based on element values.
	if(left_Element->value < right_Element->value) return 1;
	if(left_Element->value > right_Element->value) return -1;

	// If we got here then both items have the same value so then sort based on
	// key.
	return left_Element->key > right_Element->key ? 1 : -1;
}


// Generate frequencies for all oligonucleotides in polynucleotide that are of
// desired_Length_For_Oligonucleotides and then save it to output.
static void generate_Frequencies_For_Desired_Length_Oligonucleotides(
  const char * const polynucleotide, const intnative_t polynucleotide_Length
  , const intnative_t desired_Length_For_Oligonucleotides, char * const output){

	khash_t(oligonucleotide) * hash_Table=kh_init(oligonucleotide);

	uint64_t key=0;
	const uint64_t mask=((uint64_t)1<<2*desired_Length_For_Oligonucleotides)-1;

	// For the first several nucleotides we only need to append them to key in
	// preparation for the insertion of complete oligonucleotides to hash_Table.
	for(intnative_t i=0; i<desired_Length_For_Oligonucleotides-1; i++)
		key=(key<<2 & mask) | polynucleotide[i];

	// Add all the complete oligonucleotides of
	// desired_Length_For_Oligonucleotides to hash_Table and update the count
	// for each oligonucleotide.
	for(intnative_t i=desired_Length_For_Oligonucleotides-1
	  ; i<polynucleotide_Length; i++){

		key=(key<<2 & mask) | polynucleotide[i];

		int element_Was_Unused;
		const khiter_t k=kh_put(oligonucleotide, hash_Table, key
		  , &element_Was_Unused);

		// If the element_Was_Unused, then initialize the count to 1, otherwise
		// increment the count.
		if(element_Was_Unused)
			kh_value(hash_Table, k)=1;
		else
			kh_value(hash_Table, k)++;
	}

	// Create an array of elements from hash_Table.
	intnative_t elements_Array_Size=kh_size(hash_Table), i=0;
	element * elements_Array=malloc(elements_Array_Size*sizeof(element));
	uint32_t value;
	kh_foreach(hash_Table, key, value
	  , elements_Array[i++]=((element){key, value}));

	kh_destroy(oligonucleotide, hash_Table);

	// Sort elements_Array.
	qsort(elements_Array, elements_Array_Size, sizeof(element)
	  , (int (*)(const void *, const void *)) element_Compare);

	// Print the frequencies for each oligonucleotide.
	for(intnative_t output_Position=0, i=0; i<elements_Array_Size; i++){

		// Convert the key for the oligonucleotide to a string.
		char oligonucleotide[desired_Length_For_Oligonucleotides+1];
		for(intnative_t j=desired_Length_For_Oligonucleotides-1; j>-1; j--){
			oligonucleotide[j]=nucleotide_For_Code(elements_Array[i].key);
			elements_Array[i].key>>=2;
		}
		oligonucleotide[desired_Length_For_Oligonucleotides]='\0';

		// Output the frequency for oligonucleotide to output.
		output_Position+=snprintf(output+output_Position
		  , MAXIMUM_OUTPUT_LENGTH-output_Position, "%s %.3f\n", oligonucleotide
		  , 100.0f*elements_Array[i].value
		  /(polynucleotide_Length-desired_Length_For_Oligonucleotides+1));
	}

	free(elements_Array);
}


// Generate a count for the number of times oligonucleotide appears in
// polynucleotide and then save it to output.
static void generate_Count_For_Oligonucleotide(
  const char * const polynucleotide, const intnative_t polynucleotide_Length
  , const char * const oligonucleotide, char * const output){
	const intnative_t oligonucleotide_Length=strlen(oligonucleotide);

	khash_t(oligonucleotide) * const hash_Table=kh_init(oligonucleotide);

	uint64_t key=0;
	const uint64_t mask=((uint64_t)1<<2*oligonucleotide_Length)-1;

	// For the first several nucleotides we only need to append them to key in
	// preparation for the insertion of complete oligonucleotides to hash_Table.
	for(intnative_t i=0; i<oligonucleotide_Length-1; i++)
		key=(key<<2 & mask) | polynucleotide[i];

	// Add all the complete oligonucleotides of oligonucleotide_Length to
	// hash_Table and update the count for each oligonucleotide.
	for(intnative_t i=oligonucleotide_Length-1; i<polynucleotide_Length; i++){

		key=(key<<2 & mask) | polynucleotide[i];

		int element_Was_Unused;
		const khiter_t k=kh_put(oligonucleotide, hash_Table, key
		  , &element_Was_Unused);

		// If the element_Was_Unused, then initialize the count to 1, otherwise
		// increment the count.
		if(element_Was_Unused)
			kh_value(hash_Table, k)=1;
		else
			kh_value(hash_Table, k)++;
	}

	// Generate the key for oligonucleotide.
	key=0;
	for(intnative_t i=0; i<oligonucleotide_Length; i++)
		key=(key<<2) | code_For_Nucleotide(oligonucleotide[i]);

	// Output the count for oligonucleotide to output.
	khiter_t k=kh_get(oligonucleotide, hash_Table, key);
	uintmax_t count=k==kh_end(hash_Table) ? 0 : kh_value(hash_Table, k);
	snprintf(output, MAXIMUM_OUTPUT_LENGTH, "%ju\t%s", count, oligonucleotide);

	kh_destroy(oligonucleotide, hash_Table);
}


int main(){
	char buffer[4096];

	// Find the start of the third polynucleotide.
	while(fgets(buffer, sizeof(buffer), stdin) && memcmp(">THREE", buffer
	  , sizeof(">THREE")-1));

	// Start with 1 MB of storage for reading in the polynucleotide and grow
	// geometrically.
	intnative_t polynucleotide_Capacity=1048576;
	intnative_t polynucleotide_Length=0;
	char * polynucleotide=malloc(polynucleotide_Capacity);

	// Start reading and encoding the third polynucleotide.
	while(fgets(buffer, sizeof(buffer), stdin) && buffer[0]!='>'){
		for(intnative_t i=0; buffer[i]!='\0'; i++)
			if(buffer[i]!='\n')
				polynucleotide[polynucleotide_Length++]
				  =code_For_Nucleotide(buffer[i]);

		// Make sure we still have enough memory allocated for any potential
		// nucleotides in the next line.
		if(polynucleotide_Capacity-polynucleotide_Length<sizeof(buffer))
			polynucleotide=realloc(polynucleotide, polynucleotide_Capacity*=2);
	}

	// Free up any leftover memory.
	polynucleotide=realloc(polynucleotide, polynucleotide_Length);

	char output_Buffer[7][MAXIMUM_OUTPUT_LENGTH];

	// Do the following functions in parallel.
	#pragma omp parallel sections
	{
		#pragma omp section
		generate_Count_For_Oligonucleotide(polynucleotide
		  , polynucleotide_Length, "GGTATTTTAATTTATAGT", output_Buffer[6]);
		#pragma omp section
		generate_Count_For_Oligonucleotide(polynucleotide
		  , polynucleotide_Length, "GGTATTTTAATT", output_Buffer[5]);
		#pragma omp section
		generate_Count_For_Oligonucleotide(polynucleotide
		  , polynucleotide_Length, "GGTATT", output_Buffer[4]);
		#pragma omp section
		generate_Count_For_Oligonucleotide(polynucleotide
		  , polynucleotide_Length, "GGTA", output_Buffer[3]);
		#pragma omp section
		generate_Count_For_Oligonucleotide(polynucleotide
		  , polynucleotide_Length, "GGT", output_Buffer[2]);

		#pragma omp section
		generate_Frequencies_For_Desired_Length_Oligonucleotides(polynucleotide
		  , polynucleotide_Length, 2, output_Buffer[1]);
		#pragma omp section
		generate_Frequencies_For_Desired_Length_Oligonucleotides(polynucleotide
		  , polynucleotide_Length, 1, output_Buffer[0]);
	}

	// Output the results to stdout.
	for(intnative_t i=0; i<7; printf("%s\n", output_Buffer[i++]));

	free(polynucleotide);

	return 0;
}
    

notes, command-line, and program output

NOTES:
64-bit Ubuntu quad core
Ubuntu clang version 12.0.0-1ubuntu


Sat, 05 Jun 2021 03:51:05 GMT

MAKE:
/usr/bin/clang -pipe -Wall -O3 -fomit-frame-pointer -march=ivybridge -fopenmp -IInclude knucleotide.c -o knucleotide.clang_run 
/usr/bin/ld: cannot find -lomp
clang: error: linker command failed with exit code 1 (use -v to see invocation)
make: [/home/dunham/all-benchmarksgame/2000-benchmarksgame/nanobench/makefiles/u64q.programs.Makefile:102: knucleotide.clang_run] Error 1 (ignored)
rm knucleotide.c

4.12s to complete and log all make actions

COMMAND LINE:
./knucleotide.clang_run 0 < knucleotide-input250000.txt

MAKE ERROR