source code
// The Computer Language Benchmarks Game
// https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
//
// Contributed by Kevin Miller
//
// ver 2: added a couple of optimizations
// - Reduced number of times a vector of 8 was checked to see if
// they had all escaped, similar to Dominic Letz's C #5 entry.
// - Processed 64 pixels at a time if width was a multiple of 64,
// thereby reducing writes to the bitmap.
//
// compile with following gcc flags
// -pipe -Wall -O3 -ffast-math -fno-finite-math-only -march=native -mfpmath=sse -msse3 -fopenmp
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <emmintrin.h>
long numDigits(long n)
{
long len = 0;
while(n)
{
n=n/10;
len++;
}
return len;
}
inline long vec_nle(__m128d *v, double f)
{
return (v[0][0] <= f ||
v[0][1] <= f ||
v[1][0] <= f ||
v[1][1] <= f ||
v[2][0] <= f ||
v[2][1] <= f ||
v[3][0] <= f ||
v[3][1] <= f) ? 0 : -1;
}
inline void clrPixels_nle(__m128d *v, double f, unsigned long * pix8)
{
if(!(v[0][0] <= f)) *pix8 &= 0x7f;
if(!(v[0][1] <= f)) *pix8 &= 0xbf;
if(!(v[1][0] <= f)) *pix8 &= 0xdf;
if(!(v[1][1] <= f)) *pix8 &= 0xef;
if(!(v[2][0] <= f)) *pix8 &= 0xf7;
if(!(v[2][1] <= f)) *pix8 &= 0xfb;
if(!(v[3][0] <= f)) *pix8 &= 0xfd;
if(!(v[3][1] <= f)) *pix8 &= 0xfe;
}
inline void calcSum(__m128d *r, __m128d *i, __m128d *sum, __m128d *init_r, __m128d init_i)
{
for(long pair=0; pair<4; pair++)
{
__m128d r2 = r[pair] * r[pair];
__m128d i2 = i[pair] * i[pair];
__m128d ri = r[pair] * i[pair];
sum[pair] = r2 + i2;
r[pair]=r2 - i2 + init_r[pair];
i[pair]=ri + ri + init_i;
}
}
inline unsigned long mand8(__m128d *init_r, __m128d init_i)
{
__m128d r[4], i[4], sum[4];
for(long pair=0; pair<4; pair++)
{
r[pair]=init_r[pair];
i[pair]=init_i;
}
unsigned long pix8 = 0xff;
for (long j = 0; j < 6; j++)
{
for(long k=0; k<8; k++)
calcSum(r, i, sum, init_r, init_i);
if (vec_nle(sum, 4.0))
{
pix8 = 0x00;
break;
}
}
if (pix8)
{
calcSum(r, i, sum, init_r, init_i);
calcSum(r, i, sum, init_r, init_i);
clrPixels_nle(sum, 4.0, &pix8);
}
return pix8;
}
unsigned long mand64(__m128d *init_r, __m128d init_i)
{
unsigned long pix64 = 0;
for(long byte=0; byte<8; byte++)
{
unsigned long pix8 = mand8(init_r, init_i);
pix64 = (pix64 >> 8) | (pix8 << 56);
init_r += 4;
}
return pix64;
}
int main(int argc, char ** argv)
{
// get width/height from arguments
long wid_ht = 16000;
if (argc >= 2)
{
wid_ht = atoi(argv[1]);
}
wid_ht = (wid_ht+7) & ~7;
// allocate memory for header and pixels
long headerLength = numDigits(wid_ht)*2+5;
long pad = ((headerLength + 7) & ~7) - headerLength; // pad aligns pixels on 8
long dataLength = headerLength + wid_ht*(wid_ht>>3);
unsigned char * const buffer = malloc(pad + dataLength);
unsigned char * const header = buffer + pad;
unsigned char * const pixels = header + headerLength;
// generate the bitmap header
sprintf((char *)header, "P4\n%ld %ld\n", wid_ht, wid_ht);
// calculate initial values, store in r0, i0
__m128d r0[wid_ht/2];
double i0[wid_ht];
for(long xy=0; xy<wid_ht; xy+=2)
{
r0[xy>>1] = 2.0 / wid_ht * (__m128d){xy, xy+1} - 1.5;
i0[xy] = 2.0 / wid_ht * xy - 1.0;
i0[xy+1] = 2.0 / wid_ht * (xy+1) - 1.0;
}
// generate the bitmap
long use8 = wid_ht%64;
if (use8)
{
// process 8 pixels (one byte) at a time
#pragma omp parallel for schedule(guided)
for(long y=0; y<wid_ht; y++)
{
__m128d init_i = (__m128d){i0[y], i0[y]};
long rowstart = y*wid_ht/8;
for(long x=0; x<wid_ht; x+=8)
{
pixels[rowstart + x/8] = mand8(r0+x/2, init_i);
}
}
}
else
{
// process 64 pixels (8 bytes) at a time
#pragma omp parallel for schedule(guided)
for(long y=0; y<wid_ht; y++)
{
__m128d init_i = (__m128d){i0[y], i0[y]};
long rowstart = y*wid_ht/64;
for(long x=0; x<wid_ht; x+=64)
{
((unsigned long *)pixels)[rowstart + x/64] = mand64(r0+x/2, init_i);
}
}
}
// write the data
long ret = ret = write(STDOUT_FILENO, header, dataLength);
free(buffer);
return 0;
}
notes, command-line, and program output
NOTES:
64-bit Ubuntu quad core
gcc (Ubuntu 14.2.0-4ubuntu2) 14.2.0
Tue, 22 Oct 2024 19:08:56 GMT
MAKE:
/usr/bin/gcc -pipe -Wall -O3 -fomit-frame-pointer -march=ivybridge -mno-fma -fno-finite-math-only -fopenmp mandelbrot.gcc-6.c -o mandelbrot.gcc-6.gcc_run
rm mandelbrot.gcc-6.c
2.81s to complete and log all make actions
COMMAND LINE:
./mandelbrot.gcc-6.gcc_run 16000
(BINARY) PROGRAM OUTPUT NOT SHOWN