The Computer Language
24.12 Benchmarks Game

spectral-norm C gcc #2 program

source code

/* The Computer Language Benchmarks Game
 * https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
 *
 * Contributed by Bogdan Sharkov 
 * Optimized memory access (AVX intrinsics), compile with -march=native -fopenmp for best results 
 *
 * Contributed by Mr Ledrug 
 * Algorithm lifted from Intel Fortran #2 code by Steve Decker et al.
*/

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <malloc.h>
#include <x86intrin.h>

double dot(double * v, double * u, int n) {
   int i;
   double sum = 0;
   for (i = 0; i < n; i++)
      sum += *v++ * *u++;
   return sum;
}

void mult_Av(double * v, double * out, const int n) {
__m256i ONE = _mm256_set1_epi32(1);   
__m256i EIGHT = _mm256_set1_epi32(8); 
   unsigned int i, j;
   #pragma omp parallel for schedule(static)
   for (i = 1; i <= n; i++) {
	   
	   j = i-1;
	   int k = n+j;
	   __m256i m = _mm256_set1_epi32(i);
	   __m256d mysum =_mm256_setzero_pd();
	   __m256i x = _mm256_setr_epi32(j, j + 1, j + 2, j + 3, j + 4, j + 5, j + 6, j + 7);	   
		
		for (double* pv = v; j < k; j+=8, pv+=4) {
			__m256i y = _mm256_add_epi32(x, ONE);
			__m256i a = _mm256_mullo_epi32(x, y);
			x = _mm256_add_epi32(x, EIGHT);
			a = _mm256_srli_epi32(a, 1);
			a = _mm256_add_epi32(a, m);			
			
			 __m128i a1 = _mm256_castsi256_si128(a);
			 __m256d  mul = _mm256_cvtepi32_pd(a1);
			 __m256d  src = _mm256_load_pd(pv);
			 pv+=4;
			 __m256d  res = _mm256_div_pd(src, mul);	
			 mysum = _mm256_add_pd(mysum, res);			

			   a1 = _mm256_extracti128_si256(a, 1);
			   mul = _mm256_cvtepi32_pd(a1);
			   src = _mm256_load_pd(pv);
			 
			 res = _mm256_div_pd(src, mul);	
			 mysum = _mm256_add_pd(mysum, res);							
		}  
  
	  
  	  out[i-1] = ((double*)&mysum)[0] + ((double*)&mysum)[1] + ((double*)&mysum)[2] + ((double*)&mysum)[3];
   }
	for(i=n;i<n+7;i++) out[i] =0.0;
}

void mult_Atv(double * v, double * out,  const int n) {
	__m256i ONE = _mm256_set1_epi32(1);   
	__m256i EIGHT = _mm256_set1_epi32(8);
   int i;
   
#pragma omp parallel for schedule(static)
   for (i = 0; i < n; i++) {
	   double* pv = v;
	  __m256d mysum =_mm256_setzero_pd();
	   __m256i x = _mm256_setr_epi32(i, i + 1, i + 2, i + 3, i + 4, i + 5, i + 6, i + 7);
	  __m256i aj = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
	  for (int j=0; j < n;j+=8, pv+=4){		  
			__m256i y = _mm256_add_epi32(x, ONE);
			__m256i a = _mm256_mullo_epi32(x, y);
			x = _mm256_add_epi32(x, EIGHT);
			a = _mm256_srli_epi32(a, 1);
			a = _mm256_add_epi32(a, aj);
			aj = _mm256_add_epi32(aj, EIGHT);
			
			 __m128i a1 = _mm256_castsi256_si128(a);
			 __m256d  mul = _mm256_cvtepi32_pd(a1);
			 __m256d  src = _mm256_load_pd(pv);
			 pv+=4;
			 __m256d  res = _mm256_div_pd(src, mul);	
			 mysum = _mm256_add_pd(mysum, res);			

			 a1 = _mm256_extracti128_si256(a, 1);
			 mul = _mm256_cvtepi32_pd(a1);
			 src = _mm256_load_pd(pv);
			 
			 res = _mm256_div_pd(src, mul);	
			 mysum = _mm256_add_pd(mysum, res);
	  }		  
	  out[i] = ((double*)&mysum)[0] + ((double*)&mysum)[1] + ((double*)&mysum)[2] + ((double*)&mysum)[3];
   }
   
    for(i=n;i<n+7;i++) out[i]=0.0;
}

double *tmp;
void mult_AtAv(double *v, double *out, const int n) {
   mult_Av(v, tmp, n);
   mult_Atv(tmp, out, n);
}

int main(int argc, char**argv) {
   int n = 2000;
   if (argc >= 2) n = atoi(argv[1]);

   double *u, *v;
   u = _mm_malloc((n+7) * sizeof(double),sizeof(__m256d));
   v = _mm_malloc((n+7) * sizeof(double),sizeof(__m256d));
   tmp = _mm_malloc((n+7) * sizeof(double),sizeof(__m256d));
   
   int i;
   for (i = 0; i < n; i++) u[i] = 1;
   for(i=n;i<n+7;i++) u[i]=0.0;
	
   for (i = 0; i < 10; i++) {
      mult_AtAv(u, v, n);
      mult_AtAv(v, u, n);
   }

   printf("%.9f\n", sqrt(dot(u,v, n) / dot(v,v,n)));

	_mm_free(u);
	_mm_free(v);
	_mm_free(tmp);


   return 0; 
}
    
    

notes, command-line, and program output

NOTES:
64-bit Ubuntu quad core
gcc (Ubuntu 14.2.0-4ubuntu2) 14.2.0


 Tue, 22 Oct 2024 19:25:45 GMT

MAKE:
/usr/bin/gcc -pipe -Wall -O3 -fomit-frame-pointer -march=ivybridge -fopenmp spectralnorm.gcc-2.c -o spectralnorm.gcc-2.gcc_run -lm
In file included from /usr/lib/gcc/x86_64-linux-gnu/14/include/immintrin.h:53,
                 from /usr/lib/gcc/x86_64-linux-gnu/14/include/x86intrin.h:32,
                 from spectralnorm.gcc-2.c:15:
/usr/lib/gcc/x86_64-linux-gnu/14/include/avx2intrin.h: In function ‘mult_Av._omp_fn.0’:
/usr/lib/gcc/x86_64-linux-gnu/14/include/avx2intrin.h:1096:1: error: inlining failed in call to ‘always_inline’ ‘_mm256_extracti128_si256’: target specific option mismatch
 1096 | _mm256_extracti128_si256 (__m256i __X, const int __M)
      | ^~~~~~~~~~~~~~~~~~~~~~~~
spectralnorm.gcc-2.c:52:33: note: called from here
   52 |                            a1 = _mm256_extracti128_si256(a, 1);
      |                                 ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/14/include/avx2intrin.h:119:1: error: inlining failed in call to ‘always_inline’ ‘_mm256_add_epi32’: target specific option mismatch
  119 | _mm256_add_epi32 (__m256i __A, __m256i __B)
      | ^~~~~~~~~~~~~~~~
spectralnorm.gcc-2.c:43:29: note: called from here
   43 |                         a = _mm256_add_epi32(a, m);
      |                             ^~~~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/14/include/avx2intrin.h:773:1: error: inlining failed in call to ‘always_inline’ ‘_mm256_srli_epi32’: target specific option mismatch
  773 | _mm256_srli_epi32 (__m256i __A, int __B)
      | ^~~~~~~~~~~~~~~~~
spectralnorm.gcc-2.c:42:29: note: called from here
   42 |                         a = _mm256_srli_epi32(a, 1);
      |                             ^~~~~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/14/include/avx2intrin.h:119:1: error: inlining failed in call to ‘always_inline’ ‘_mm256_add_epi32’: target specific option mismatch
  119 | _mm256_add_epi32 (__m256i __A, __m256i __B)
      | ^~~~~~~~~~~~~~~~
spectralnorm.gcc-2.c:41:29: note: called from here
   41 |                         x = _mm256_add_epi32(x, EIGHT);
      |                             ^~~~~~~~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/14/include/avx2intrin.h:560:1: error: inlining failed in call to ‘always_inline’ ‘_mm256_mullo_epi32’: target specific option mismatch
  560 | _mm256_mullo_epi32 (__m256i __A, __m256i __B)
      | ^~~~~~~~~~~~~~~~~~
spectralnorm.gcc-2.c:40:37: note: called from here
   40 |                         __m256i a = _mm256_mullo_epi32(x, y);
      |                                     ^~~~~~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/14/include/avx2intrin.h:119:1: error: inlining failed in call to ‘always_inline’ ‘_mm256_add_epi32’: target specific option mismatch
  119 | _mm256_add_epi32 (__m256i __A, __m256i __B)
      | ^~~~~~~~~~~~~~~~
spectralnorm.gcc-2.c:39:37: note: called from here
   39 |                         __m256i y = _mm256_add_epi32(x, ONE);
      |                                     ^~~~~~~~~~~~~~~~~~~~~~~~
/usr/lib/gcc/x86_64-linux-gnu/14/include/avx2intrin.h:1096:1: error: inlining failed in call to ‘always_inline’ ‘_mm256_extracti128_si256’: target specific option mismatch
 1096 | _mm256_extracti128_si256 (__m256i __X, const int __M)
      | ^~~~~~~~~~~~~~~~~~~~~~~~
spectralnorm.gcc-2.c:52:33: note: called from here
   52 |                            a1 = _mm256_extracti128_si256(a, 1);
      |                                 ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
make: [/home/dunham/all-benchmarksgame/2000-benchmarksgame/nanobench/makefiles/u64q.programs.Makefile:43: spectralnorm.gcc-2.gcc_run] Error 1 (ignored)
rm spectralnorm.gcc-2.c

2.42s to complete and log all make actions

COMMAND LINE:
 ./spectralnorm.gcc-2.gcc_run 500

MAKE ERROR