n-body C# naot #4 program

source code

/**
  * The Computer Language Benchmarks Game
  * https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
  *
  * Contributed by Derek Ziemba
  * 
  * This is nearly a direct port of Miles's "n-body C gcc #9 program"
  * The actual direct port was about 10% slower. 
  * I was able to speed it up primarily by just eliminating variables (C# doesn't handle lots of variables too well)
  */

using System;
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

using static System.Runtime.CompilerServices.MethodImplOptions;

using V256d = System.Runtime.Intrinsics.Vector256<double>;
using V128d = System.Runtime.Intrinsics.Vector128<float>;

public static unsafe class NBody_CPP2 {
  [SkipLocalsInit][MethodImpl(AggressiveOptimization | AggressiveInlining)] private static V256d _mm256_set1_pd(double x) => Vector256.Create(x);
  [SkipLocalsInit][MethodImpl(AggressiveOptimization | AggressiveInlining)] private static V256d _mm256_setr_pd(double x, double y, double z, double w) => Vector256.Create(x, y, z, w);
  [SkipLocalsInit][MethodImpl(AggressiveOptimization | AggressiveInlining)] private static V256d _mm256_load_pd(double* address) => Avx.LoadAlignedVector256(address);
  [SkipLocalsInit][MethodImpl(AggressiveOptimization | AggressiveInlining)] private static void _mm256_store_pd(double* address, V256d source) => Avx.StoreAligned(address, source);
  [SkipLocalsInit][MethodImpl(AggressiveOptimization | AggressiveInlining)] private static V128d _mm_rsqrt_ps(V128d x) => Sse.ReciprocalSqrt(x);
  [SkipLocalsInit][MethodImpl(AggressiveOptimization | AggressiveInlining)] private static V256d _mm256_cvtps_pd(V128d x) => Avx.ConvertToVector256Double(x);
  [SkipLocalsInit][MethodImpl(AggressiveOptimization | AggressiveInlining)] private static V128d _mm256_cvtpd_ps(V256d x) => Avx.ConvertToVector128Single(x);
  [SkipLocalsInit][MethodImpl(AggressiveOptimization | AggressiveInlining)] private static V256d _mm256_mul_pd(V256d x, V256d y) => Avx.Multiply(x, y);
  [SkipLocalsInit][MethodImpl(AggressiveOptimization | AggressiveInlining)] private static V256d _mm256_sub_pd(V256d x, V256d y) => Avx.Subtract(x, y);
  [SkipLocalsInit][MethodImpl(AggressiveOptimization | AggressiveInlining)] private static V256d _mm256_add_pd(V256d x, V256d y) => Avx.Add(x, y);
  [SkipLocalsInit][MethodImpl(AggressiveOptimization | AggressiveInlining)] private static V256d _mm256_hadd_pd(V256d x, V256d y) => Avx.HorizontalAdd(x, y);
  [SkipLocalsInit][MethodImpl(AggressiveOptimization | AggressiveInlining)] private static V256d _mm256_blend_pd(V256d x, V256d y, byte control) => Avx.Blend(x, y, control);
  [SkipLocalsInit][MethodImpl(AggressiveOptimization | AggressiveInlining)] private static V256d _mm256_permute2f128_pd(V256d x, V256d y, byte control) => Avx.Permute2x128(x, y, control);


  // compute rsqrt of distance between each pair of bodies
  [SkipLocalsInit]
  [MethodImpl(AggressiveOptimization | AggressiveInlining)]
  private static void kernel(V256d* r, double* w, V256d* p) {
    for(int i = 1, k = 0; i < 5; i++) {
      for(int j = 0; j < i; j++, k++) {
        r[k] = _mm256_sub_pd(p[i], p[j]);
      }
    }

    V256d c0375 = _mm256_set1_pd(0.375); 
    V256d c1250 = _mm256_set1_pd(1.25); 
    V256d c1875 = _mm256_set1_pd(1.875); 
    for(int k = 0; k < 10; k += 4) {
      V256d x0 = r[k], x1 = r[k+1], x2 = r[k+2], x3 = r[k+3];
      V256d t0 = _mm256_hadd_pd(_mm256_mul_pd(x0, x0), _mm256_mul_pd(x1, x1));
      V256d t1 = _mm256_hadd_pd(_mm256_mul_pd(x2, x2), _mm256_mul_pd(x3, x3));
      V256d s = _mm256_add_pd(_mm256_permute2f128_pd(t0, t1, 0x21), _mm256_blend_pd(t0, t1, 0b1100));
      V256d x = _mm256_cvtps_pd(_mm_rsqrt_ps(_mm256_cvtpd_ps(s)));
      V256d y = _mm256_mul_pd(s, _mm256_mul_pd(x, x));
      _mm256_store_pd(w + k, _mm256_mul_pd(x, _mm256_sub_pd(_mm256_mul_pd(_mm256_mul_pd(y, c0375), y), _mm256_sub_pd(_mm256_mul_pd(y, c1250), c1875))));
    }
  }



  static void advance(int n, double dt, V256d* m, V256d* p, V256d* v) {
    V256d* mem = stackalloc V256d[16];
    // Align the memory (C# doesn't have a built in way AFAIK) to prevent fault when calling Avx.LoadAlignedVector256 or Avx.StoreAligned
    V256d* r = (V256d*)((((UInt64)mem)+31UL)&~31UL);
    double* w = (double*)(r + 12);

    r[10] = _mm256_set1_pd(1.0);
    r[11] = _mm256_set1_pd(1.0);

    V256d rt = _mm256_set1_pd(dt);

    for(int s = 0; s < n; s++) {
      kernel(r, w, p);

      for(int k = 0; k < 10; k += 4) {
        V256d x = _mm256_load_pd(w + k);
        _mm256_store_pd(w + k, _mm256_mul_pd(_mm256_mul_pd(x, x), _mm256_mul_pd(x, rt)));
      }

      for(int i = 1, k = 0; i < 5; i++)
        for(int j = 0; j < i; j++, k++) {
          V256d t = _mm256_mul_pd(r[k], _mm256_set1_pd(w[k]));
          v[i] = _mm256_sub_pd(v[i], _mm256_mul_pd(t, m[j]));
          v[j] = _mm256_add_pd(v[j], _mm256_mul_pd(t, m[i]));
        }

      for(int i = 0; i < 5; i++) {
        p[i] = _mm256_add_pd(p[i], _mm256_mul_pd(v[i], rt));
      }
    }
  }


  static double energy(V256d* m, V256d* p, V256d* v) {
    double e = 0.0;
    V256d* mem = stackalloc V256d[16];
    // Align the memory (C# doesn't have a built in way AFAIK) to prevent fault when calling Avx.LoadAlignedVector256 or Avx.StoreAligned
    V256d* r = (V256d*)((((UInt64)mem)+31UL)&~31UL); 
    double* w = (double*)(r + 12);


    r[5] = _mm256_set1_pd(0.0);
    r[6] = _mm256_set1_pd(0.0);

    for(int k = 0; k < 5; k++)
      r[k] = Avx.Multiply(v[k], v[k]);

    for(int k = 0; k < 5; k += 4) {
      V256d t0 = _mm256_hadd_pd(r[k], r[k + 1]);
      V256d t1 = _mm256_hadd_pd(r[k + 2], r[k + 3]);
      _mm256_store_pd(w + k, _mm256_add_pd(_mm256_permute2f128_pd(t0, t1, 0x21), _mm256_blend_pd(t0, t1, 0b1100)));
    }

    for(int k = 0; k < 5; k++)
      e += 0.5 * m[k].GetElement(1) * w[k];

    r[10] = _mm256_set1_pd(0.0);
    r[11] = _mm256_set1_pd(0.0);

    kernel(r, w, p);

    for(int i = 1, k = 0; i < 5; i++)
      for(int j = 0; j < i; j++, k++)
        e -= m[i].GetElement(1) * m[j].GetElement(1) * w[k];

    return e;
  }



  public static void Main(string[] args) {
    int iterations = args.Length > 0 ? Int32.Parse(args[0]) : 10000;
    if (iterations <= 0) { return; }

    const double PI = 3.141592653589793;
    const double SOLAR_MASS = (4 * PI * PI);
    const double DAYS_PER_YEAR = 365.24;

    V256d* mem = stackalloc V256d[16];
    // Align the memory (C# doesn't have a built in way AFAIK) to prevent fault when calling Avx.LoadAlignedVector256 or Avx.StoreAligned
    V256d* p = (V256d*)((((UInt64)mem)+31UL)&~31UL); 

    // positions
    p[0] = V256d.Zero;
    p[1] = _mm256_setr_pd(0.0, 4.84143144246472090e+00, -1.16032004402742839e+00, -1.03622044471123109e-01);
    p[2] = _mm256_setr_pd(0.0, 8.34336671824457987e+00, 4.12479856412430479e+00, -4.03523417114321381e-01);
    p[3] = _mm256_setr_pd(0.0, 1.28943695621391310e+01, -1.51111514016986312e+01, -2.23307578892655734e-01);
    p[4] = _mm256_setr_pd(0.0, 1.53796971148509165e+01, -2.59193146099879641e+01, 1.79258772950371181e-01);

    // velocities
    V256d* v = p+5; 
    //v[0] = _mm256_set1_pd(-1.0);
    v[1] = _mm256_setr_pd(0.0, 1.66007664274403694e-03 * DAYS_PER_YEAR, 7.69901118419740425e-03 * DAYS_PER_YEAR, -6.90460016972063023e-05 * DAYS_PER_YEAR);
    v[2] = _mm256_setr_pd(0.0, -2.76742510726862411e-03 * DAYS_PER_YEAR, 4.99852801234917238e-03 * DAYS_PER_YEAR, 2.30417297573763929e-05 * DAYS_PER_YEAR);
    v[3] = _mm256_setr_pd(0.0,  2.96460137564761618e-03 * DAYS_PER_YEAR, 2.37847173959480950e-03 * DAYS_PER_YEAR, -2.96589568540237556e-05 * DAYS_PER_YEAR);
    v[4] = _mm256_setr_pd(0.0, 2.68067772490389322e-03 * DAYS_PER_YEAR,  1.62824170038242295e-03 * DAYS_PER_YEAR,  -9.51592254519715870e-05 * DAYS_PER_YEAR);

    // masses
    V256d* m = p+10; 
    m[0] = _mm256_set1_pd(SOLAR_MASS);
    m[1] = _mm256_set1_pd(9.54791938424326609e-04 * SOLAR_MASS);
    m[2] = _mm256_set1_pd(2.85885980666130812e-04 * SOLAR_MASS);
    m[3] = _mm256_set1_pd(4.36624404335156298e-05 * SOLAR_MASS);
    m[4] = _mm256_set1_pd(5.15138902046611451e-05 * SOLAR_MASS);

    // Offset Momentmum
    v[0] = Avx.Divide(Avx.Add(Avx.Add(Avx.Add(Avx.Multiply(v[1], m[1]),
                                              Avx.Multiply(v[2], m[2])),
                                      Avx.Multiply(v[3], m[3])),
                              Avx.Multiply(v[4], m[4])),
                      Avx.Multiply(_mm256_set1_pd(-1.0), m[0]));


    Console.WriteLine(energy(m, p, v).ToString("F9"));
    advance(iterations, 0.01, m, p, v);
    Console.WriteLine(energy(m, p, v).ToString("F9"));
  }


} //END Class

notes, command-line, and program output

NOTES:
64-bit Ubuntu quad core
.NET SDK 9.0.100
Host Version: 9.0.0
Commit: 9d5a6a9aa4

<OutputType>Exe
<TargetFramework>net9.0
<ImplicitUsings>enable
<Nullable>enable
<AllowUnsafeBlocks>true
<ServerGarbageCollection>true
<ConcurrentGarbageCollection>true
<PublishAot>true
<OptimizationPreference>Speed
<IlcInstructionSet>native


 Thu, 06 Feb 2025 02:58:02 GMT

MAKE:
cp nbody.csharpaot-4.csharpaot Program.cs
cp Include/csharpaot/program.csproj .
/opt/src/dotnet-sdk-9.0.100/dotnet publish -r linux-x64 -c Release
  Determining projects to restore...
  Restored /home/dunham/all-benchmarksgame/benchmarksgame_i53330/nbody/tmp/program.csproj (in 940 ms).
/home/dunham/all-benchmarksgame/benchmarksgame_i53330/nbody/tmp/Program.cs(34,165): warning CA1857: The argument should be a constant for optimal performance (https://learn.microsoft.com/dotnet/fundamentals/code-analysis/quality-rules/ca1857) [/home/dunham/all-benchmarksgame/benchmarksgame_i53330/nbody/tmp/program.csproj]
/home/dunham/all-benchmarksgame/benchmarksgame_i53330/nbody/tmp/Program.cs(35,179): warning CA1857: The argument should be a constant for optimal performance (https://learn.microsoft.com/dotnet/fundamentals/code-analysis/quality-rules/ca1857) [/home/dunham/all-benchmarksgame/benchmarksgame_i53330/nbody/tmp/program.csproj]
  program -> /home/dunham/all-benchmarksgame/benchmarksgame_i53330/nbody/tmp/bin/Release/net9.0/linux-x64/program.dll
  Generating native code
  program -> /home/dunham/all-benchmarksgame/benchmarksgame_i53330/nbody/tmp/bin/Release/net9.0/linux-x64/publish/

24.03s to complete and log all make actions

COMMAND LINE:
 ./bin/Release/net9.0/linux-x64/native/program 50000000

PROGRAM OUTPUT:
-0.169075164
-0.169059907