The Computer Language
24.11 Benchmarks Game

n-body Intel C #9 program

source code

/* The Computer Language Benchmarks Game
 * https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
 *
 * contributed by Miles
 */

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <x86intrin.h>

#define N 5
#define PI 3.141592653589793
#define SOLAR_MASS (4 * PI * PI)
#define DAYS_PER_YEAR 365.24
#define PAIRS (N*(N-1)/2)

// utilize vrsqrtps to compute an approximation of 1/sqrt(x) with float,
// cast back to double and refine using a variation of
// Goldschmidt’s algorithm to get < 1e-9 error
static inline __m256d _mm256_rsqrt_pd(__m256d s) {
    __m128 q = _mm256_cvtpd_ps(s);
    q = _mm_rsqrt_ps(q);
    __m256d x = _mm256_cvtps_pd(q);
    __m256d y = s * x * x;
    __m256d a = _mm256_mul_pd(y, _mm256_set1_pd(0.375));
    a = _mm256_mul_pd(a, y);
    __m256d b = _mm256_mul_pd(y, _mm256_set1_pd(1.25));
    b = _mm256_sub_pd(b, _mm256_set1_pd(1.875));
    y = _mm256_sub_pd(a, b);
    x = _mm256_mul_pd(x, y);
    return x;
}

// compute rsqrt of distance between each pair of bodies
static inline void kernel(__m256d *r, double *w, __m256d *p) {
    for (int i = 1, k = 0; i < N; i++)
        for (int j = 0; j < i; j++, k++)
            r[k] = _mm256_sub_pd(p[i], p[j]);
    
    for (int k = 0; k < PAIRS; k += 4) {
        __m256d x0 = _mm256_mul_pd(r[k  ], r[k  ]);
        __m256d x1 = _mm256_mul_pd(r[k+1], r[k+1]);
        __m256d x2 = _mm256_mul_pd(r[k+2], r[k+2]);
        __m256d x3 = _mm256_mul_pd(r[k+3], r[k+3]);

        __m256d t0 = _mm256_hadd_pd(x0, x1);
        __m256d t1 = _mm256_hadd_pd(x2, x3);
        __m256d y0 = _mm256_permute2f128_pd(t0, t1, 0x21);
        __m256d y1 = _mm256_blend_pd(t0, t1, 0b1100);

        __m256d z = _mm256_add_pd(y0, y1);
        z = _mm256_rsqrt_pd(z);
        _mm256_store_pd(w+k, z);
    }
}

static double energy(double *m, __m256d *p, __m256d *v) {
    double e = 0.0;

    __m256d r[PAIRS+3];
    double w[PAIRS+3] __attribute__((aligned(sizeof(__m256d))));

    r[N] = _mm256_set1_pd(0.0);
    r[N+1] = _mm256_set1_pd(0.0);
    r[N+2] = _mm256_set1_pd(0.0);

    for (int k = 0; k < N; k++)
        r[k] = _mm256_mul_pd(v[k], v[k]);

    for (int k = 0; k < N; k += 4) {
        __m256d t0 = _mm256_hadd_pd(r[k  ], r[k+1]);
        __m256d t1 = _mm256_hadd_pd(r[k+2], r[k+3]);
        __m256d y0 = _mm256_permute2f128_pd(t0, t1, 0x21);
        __m256d y1 = _mm256_blend_pd(t0, t1, 0b1100);

        __m256d z = _mm256_add_pd(y0, y1);
        _mm256_store_pd(w+k, z);
    }

    for (int k = 0; k < N; k++)
        e += 0.5 * m[k] * w[k];

    r[PAIRS] = _mm256_set1_pd(1.0);
    r[PAIRS+1] = _mm256_set1_pd(1.0);
    r[PAIRS+2] = _mm256_set1_pd(1.0);

    kernel(r, w, p);

    for (int i = 1, k = 0; i < N; i++)
        for (int j = 0; j < i; j++, k++)
            e -= m[i] * m[j] * w[k];

    return e;
}

static void advance(int n, double dt, double *m, __m256d *p, __m256d *v) {
    __m256d r[PAIRS+3];
    double w[PAIRS+3] __attribute__((aligned(sizeof(__m256d))));

    r[PAIRS] = _mm256_set1_pd(1.0);
    r[PAIRS+1] = _mm256_set1_pd(1.0);
    r[PAIRS+2] = _mm256_set1_pd(1.0);

    __m256d rt = _mm256_set1_pd(dt);

    __m256d rm[N];
    for (int i = 0; i < N; i++)
        rm[i] = _mm256_set1_pd(m[i]);

    for (int s = 0; s < n; s++) {
        kernel(r, w, p);

        for (int k = 0; k < PAIRS; k += 4) {
            __m256d x = _mm256_load_pd(w+k);
            __m256d y = _mm256_mul_pd(x, x);
            __m256d z = _mm256_mul_pd(x, rt);
            x = _mm256_mul_pd(y, z);
            _mm256_store_pd(w+k, x);
        }

        for (int i = 1, k = 0; i < N; i++)
            for (int j = 0; j < i; j++, k++) {
                __m256d t = _mm256_set1_pd(w[k]);
                t = _mm256_mul_pd(r[k], t);
                __m256d x = _mm256_mul_pd(t, rm[j]);
                __m256d y = _mm256_mul_pd(t, rm[i]);

                v[i] = _mm256_sub_pd(v[i], x);
                v[j] = _mm256_add_pd(v[j], y);
            }

        for (int i = 0; i < N; i++) {
            __m256d t = _mm256_mul_pd(v[i], rt);
            p[i] = _mm256_add_pd(p[i], t);
        }
    }
}

int main(int argc, char *argv[]) {
    int n = atoi(argv[1]);

    double m[N];
    __m256d p[N], v[N];

    // sun
    m[0] = SOLAR_MASS;
    p[0] = _mm256_set1_pd(0.0);
    v[0] = _mm256_set1_pd(0.0);

    // jupiter
    m[1] = 9.54791938424326609e-04 * SOLAR_MASS;
    p[1] = _mm256_setr_pd(0.0,
         4.84143144246472090e+00,
        -1.16032004402742839e+00,
        -1.03622044471123109e-01);
    v[1] = _mm256_setr_pd(0.0,
         1.66007664274403694e-03 * DAYS_PER_YEAR,
         7.69901118419740425e-03 * DAYS_PER_YEAR,
        -6.90460016972063023e-05 * DAYS_PER_YEAR);

    // saturn
    m[2] = 2.85885980666130812e-04 * SOLAR_MASS;
    p[2] = _mm256_setr_pd(0.0,
         8.34336671824457987e+00,
         4.12479856412430479e+00,
        -4.03523417114321381e-01);
    v[2] = _mm256_setr_pd(0.0,
        -2.76742510726862411e-03 * DAYS_PER_YEAR,
         4.99852801234917238e-03 * DAYS_PER_YEAR,
         2.30417297573763929e-05 * DAYS_PER_YEAR);

    // uranus
    m[3] = 4.36624404335156298e-05 * SOLAR_MASS;
    p[3] = _mm256_setr_pd(0.0,
         1.28943695621391310e+01,
        -1.51111514016986312e+01,
        -2.23307578892655734e-01);
    v[3] = _mm256_setr_pd(0.0,
         2.96460137564761618e-03 * DAYS_PER_YEAR,
         2.37847173959480950e-03 * DAYS_PER_YEAR,
        -2.96589568540237556e-05 * DAYS_PER_YEAR);

    // neptune
    m[4] = 5.15138902046611451e-05 * SOLAR_MASS;
    p[4] = _mm256_setr_pd(0.0,
         1.53796971148509165e+01,
        -2.59193146099879641e+01,
         1.79258772950371181e-01);
    v[4] = _mm256_setr_pd(0.0,
         2.68067772490389322e-03 * DAYS_PER_YEAR,
         1.62824170038242295e-03 * DAYS_PER_YEAR,
        -9.51592254519715870e-05 * DAYS_PER_YEAR);

    // offset momentum
    __m256d o = _mm256_set1_pd(0.0);
    for (int i = 0; i < N; i++) {
        __m256d t = _mm256_mul_pd(_mm256_set1_pd(m[i]), v[i]);
        o = _mm256_add_pd(o, t);
    }
    v[0] = _mm256_mul_pd(o, _mm256_set1_pd(-1.0 / SOLAR_MASS));

    printf("%.9f\n", energy(m, p, v));
    advance(n, 0.01, m, p, v);
    printf("%.9f\n", energy(m, p, v));

    return 0;
}
    

notes, command-line, and program output

NOTES:
64-bit Ubuntu quad core
Intel(R) oneAPI DPC++/C++ Compiler
2024.1.0.20240308


 Thu, 06 Jun 2024 22:13:10 GMT

MAKE:
~/intel/oneapi/compiler/latest/bin/icx -pipe -Wall -O3 -fomit-frame-pointer -march=ivybridge  nbody.icx-9.c -o nbody.icx-9.icx_run 
rm nbody.icx-9.c

5.66s to complete and log all make actions

COMMAND LINE:
 ./nbody.icx-9.icx_run 50000000

PROGRAM OUTPUT:
-0.169075164
-0.169059907