# spectral-norm C++ g++ #8 program

## source code

```// The Computer Language Benchmarks Game
// https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
//
// Original C contributed by Sebastien Loisel
// Conversion to C++ by Jon Harrop
// OpenMP parallelize by The Anh Tran
// Add SSE by The Anh Tran

// Fastest with this flag: -Os
// g++ -pipe -Os -fomit-frame-pointer -march=native -fopenmp -mfpmath=sse -msse2 ./spec.c++ -o ./spec.run

#include <cmath>
#include <cstdlib>
#include <cstdio>
#include <sched.h>
#include <omp.h>

// define SIMD data type. 2 doubles are packed in 1 XMM register
typedef double v2dt __attribute__((vector_size(16)));
v2dt const v1 = {1.0, 1.0};

struct Param
{
union
{
double* u;      // source
v2dt*   xmm_u;
};

union
{
double* tmp;   // temporary
v2dt*   xmm_tmp;
};

union
{
double* v;         // destination
v2dt*   xmm_v;
};

int    length;         // source/desti vec's length
int    half_length;

int    r_begin;      // working range of each thread
int    r_end;

double   vBv;
double   vv;
};

// Return:   1.0 / (i + j) * (i + j +1) / 2 + i + 1;
double
eval_A(int i, int j)
{
int d = (((i+j) * (i+j+1)) >> 1) + i+1;

return 1.0 / d;
}

// Return: 2 doubles in xmm register [double1, double2]
//      double1 = 1.0 / (i + j) * (i + j +1) / 2 + i + 1;
//      double2 = 1.0 / (i+1 + j) * (i+1 + j +1) / 2 + i+1 + 1;
//   Or:
//      double2 = 1.0 / (i + j+1) * (i + j+1 +1) / 2 + i + 1;
template<bool inc_i>
v2dt
eval_A_xmm(int i, int j)
{
if (inc_i)
i <<= 1;
else
j <<= 1;

int d1 = (((i+j) * (i+j+1)) >> 1) + i+1;
int d2 = (((i+1 +j) * (i+1 +j+1)) >> 1) + i +1;

if (inc_i)
d2 += 1;

v2dt r = {d1, d2};
return v1 / r;
}

double
{
double const* val = reinterpret_cast<double const*>(&x);
return val[0] + val[1];
}

void
eval_A_times_u (Param const &p)
{
for (int i = p.r_begin, ie = p.r_end; i < ie; ++i)
{
v2dt sum = {0, 0};

// xmm = 2 doubles => index [0..length/2)
int j = 0;
for (; j < p.half_length; ++j)
sum += eval_A_xmm<false>(i, j) * p.xmm_u[j];

// If source vector is odd size. This should be called <= 1 time
for (j = j*2; j < p.length; ++j)
p.tmp[i] += eval_A(i, j) * p.u[j];
}
}

void
eval_At_times_u(Param const &p)
{
for (int i = p.r_begin, ie = p.r_end; i < ie; ++i)
{
v2dt sum = {0, 0};

int j = 0;
for (; j < p.half_length; ++j)
sum += eval_A_xmm<true>(j, i) * p.xmm_tmp[j];

for (j = j*2; j < p.length; ++j)
p.v[i] += eval_A(j, i) * p.tmp[j];
}
}

// Each thread modifies its portion in destination vector
// -> barrier needed to sync access
void
eval_AtA_times_u(Param const &p)
{
eval_A_times_u( p );
#pragma omp barrier

eval_At_times_u( p );
#pragma omp barrier
}

void
final_sum(Param& p)
{
v2dt sum_vBv   = {0,0};
v2dt sum_vv      = {0,0};

int i = p.r_begin /2;
int ie = p.r_end /2;

for (; i < ie; ++i)
{
sum_vv   += p.xmm_v[i] * p.xmm_v[i];
sum_vBv   += p.xmm_u[i] * p.xmm_v[i];
}

for (i = i*2; i < p.r_end; ++i)
{
p.vBv   += p.u[i] * p.v[i];
p.vv   += p.v[i] * p.v[i];
}
}

void
fill_10(Param const& p)
{
int i = p.r_begin /2;
int ie = p.r_end /2;

for (; i < ie; ++i)
p.xmm_u[i] = v1;

for (i = i*2; i < p.r_end; ++i)
p.u[i] = 1.0;
}

// Search for appropriate number of threads to spawn
int
{
cpu_set_t cs;
CPU_ZERO(&cs);
sched_getaffinity(0, sizeof(cs), &cs);

int count = 0;
for (int i = 0; i < CPU_SETSIZE; ++i)
{
if (CPU_ISSET(i, &cs))
++count;
}
return count;
}

double
spectral_game(int N)
{
// Align L2 cache line
__attribute__((aligned(64))) double u[N];
__attribute__((aligned(64))) double tmp[N];
__attribute__((aligned(64))) double v[N];

double vBv   = 0.0;
double vv   = 0.0;

{
// this block will be executed by NUM_THREADS
// variable declared in this block is private for each thread
int chunk      = N / threadcount;

Param my_param;

my_param.u             = u;
my_param.tmp         = tmp;
my_param.v             = v;

my_param.length         = N;
my_param.half_length   = N /2;

// calculate each thread's working range [r1 .. r2) => static schedule
my_param.r_end      = (threadid < (threadcount -1)) ? (my_param.r_begin + chunk) : N;

fill_10(my_param);
#pragma omp barrier

// Evaluating
for (int ite = 0; ite < 10; ++ite)
{
my_param.u = u;   // source is u
my_param.v = v;   // desti is v
eval_AtA_times_u(my_param);

my_param.u = v; // source is v
my_param.v = u; // desti is u
eval_AtA_times_u(my_param);
}

my_param.u = u;
my_param.v = v;
final_sum(my_param);

#pragma omp critical
{
vBv   += my_param.vBv;
vv   += my_param.vv;
}
} // parallel region

return sqrt( vBv/vv );
}

int
main(int argc, char *argv[])
{
int N = ((argc >= 2) ? atoi(argv[1]) : 2000);

printf("%.9f\n", spectral_game(N));
return 0;
}

```

## notes, command-line, and program output

```NOTES:
Ubuntu 13.2.0-23ubuntu4

Tue, 04 Jun 2024 02:45:31 GMT

MAKE:
/usr/bin/g++ -c -pipe -O3 -fomit-frame-pointer -march=ivybridge  -fopenmp -O0 spectralnorm.gpp-8.c++ -o spectralnorm.gpp-8.c++.o &&  \
/usr/bin/g++ spectralnorm.gpp-8.c++.o -o spectralnorm.gpp-8.gpp_run -fopenmp
spectralnorm.gpp-8.c++: In instantiation of ‘v2dt eval_A_xmm(int, int) [with bool inc_i = false; v2dt = __vector(2) double]’:
spectralnorm.gpp-8.c++:108:34:   required from here
spectralnorm.gpp-8.c++:85:14: warning: narrowing conversion of ‘d1’ from ‘int’ to ‘double’ [-Wnarrowing]
85 |    v2dt r = {d1, d2};
|              ^~
spectralnorm.gpp-8.c++:85:18: warning: narrowing conversion of ‘d2’ from ‘int’ to ‘double’ [-Wnarrowing]
85 |    v2dt r = {d1, d2};
|                  ^~
spectralnorm.gpp-8.c++: In instantiation of ‘v2dt eval_A_xmm(int, int) [with bool inc_i = true; v2dt = __vector(2) double]’:
spectralnorm.gpp-8.c++:127:33:   required from here
spectralnorm.gpp-8.c++:85:14: warning: narrowing conversion of ‘d1’ from ‘int’ to ‘double’ [-Wnarrowing]
85 |    v2dt r = {d1, d2};
|              ^~
spectralnorm.gpp-8.c++:85:18: warning: narrowing conversion of ‘d2’ from ‘int’ to ‘double’ [-Wnarrowing]
85 |    v2dt r = {d1, d2};
|                  ^~
rm spectralnorm.gpp-8.c++

3.33s to complete and log all make actions

COMMAND LINE:
./spectralnorm.gpp-8.gpp_run 5500

PROGRAM OUTPUT:
1.274224153
```