This is the C++ DLL:
#include "PPC.h"
#include <thread>
static const float G = 0.0000001F;
const int count = 4096;
__declspec(align(64)) float pointsx[count];
__declspec(align(64)) float pointsy[count];
void SetData(float* x, float* y){
memcpy(pointsx, x, count * sizeof(float));
memcpy(pointsy, y, count * sizeof(float));
}
void Compute(float* points, float* velx, float* vely, long pcount, float aspect, float zoom) {
#pragma omp parallel for
for (auto i = 0; i < count; ++i) {
auto forcex = 0.0F;
auto forcey = 0.0F;
for (auto j = 0; j < count; ++j) {
if(j == i)continue;
const auto distx = pointsx[i] - pointsx[j];
const auto disty = pointsy[i] - pointsy[j];
//if(px != px) continue; //most efficient way to avoid a NaN failure
const auto force = G / (distx * distx + disty * disty);
forcex += distx * force;
forcey += disty * force;
}
pointsx[i] += velx[i] -= forcex;
pointsy[i] += vely[i] -= forcey;
if (zoom != 1) {
points[i * 2] = pointsx[i] * zoom / aspect;
points[i * 2 + 1] = pointsy[i] * zoom;
} else {
points[i * 2] = pointsx[i] / aspect;
points[i * 2 + 1] = pointsy[i];
}
/*points[i * 2] = pointsx[i];
points[i * 2 + 1] = pointsy[i];*/
}
}
This is the relevant part of the C# OpenTK GameWindow:
private void PhysicsLoop(){
while(true){
if(stop){
for(var i = 0; i < pcount; ++i) { velx[i] = vely[i] = 0F; }
}
if(reset){
reset = false;
var r = new Random();
for(var i = 0; i < Startcount; ++i){
do{
pointsx[i] = (float)(r.NextDouble()*2.0F - 1.0F);
pointsy[i] = (float)(r.NextDouble()*2.0F - 1.0F);
} while(pointsx[i]*pointsx[i] + pointsy[i]*pointsy[i] > 1.0F);
velx[i] = vely[i] = 0.0F;
}
NativeMethods.SetData(pointsx, pointsy);
pcount = Startcount;
buffersize = (IntPtr)(pcount*8);
}
are.WaitOne();
NativeMethods.Compute(points0, velx, vely, pcount, aspect, zoom);
var pointstemp = points0;
points0 = points1;
points1 = pointstemp;
are1.Set();
}
}
protected override void OnRenderFrame(FrameEventArgs e){
GL.Clear(ClearBufferMask.ColorBufferBit);
GL.EnableVertexAttribArray(0);
GL.BindBuffer(BufferTarget.ArrayBuffer, vbo);
mre1.Wait();
are1.WaitOne();
GL.BufferData(BufferTarget.ArrayBuffer, buffersize, points1, BufferUsageHint.StaticDraw);
are.Set();
GL.VertexAttribPointer(0, 2, VertexAttribPointerType.Float, false, 0, 0);
GL.DrawArrays(PrimitiveType.Points, 0, pcount);
GL.DisableVertexAttribArray(0);
SwapBuffers();
}
These are the array declarations:
private const int Startcount = 4096;
private readonly float[] pointsx = new float[Startcount];
private readonly float[] pointsy = new float[Startcount];
private float[] points0 = new float[Startcount*2];
private float[] points1 = new float[Startcount*2];
private readonly float[] velx = new float[Startcount];
private readonly float[] vely = new float[Startcount];
Edit 0: It seems that adding 3 zeros to G increases the accuracy of the simulation but I'm at a loss as to why its different without interleaved coordinates. Edit 1: I somehow achieved an 8.3x performance increase with AVX over scalar with the new code above!