Alright, so I had some time to investigate the DirectX Math library and run some tests, and I have some questions.
So here are the results of my tests:
//============ DEBUG MODE Times ==========
//Using normal math operations
Norm TIME: 4.275861ms
//Using DirectX Math where items were loaded from XMFLOAT3/XMFLOAT4X4 and then stored to XMFLOAT3
DirectX Math XMFLOAT TIME: 4.965582ms
//Using DirectX Math where XMVector/XMMatrix were used directly
DirectX Math RAW SIMD TIME: 2.183706ms
//Using custom solution where __m128 was directly used
New RAW SIMD Solution TIME: 1.502607ms
//Original attempt, used loaded data from a Vector3/Matrix44 and stored the result back into a Vector3
Original SIMD Solution TIME: 5.034964ms
Code used in case anyone is interested
#include <iostream>
#include <pmmintrin.h>
#include <Windows.h>
#include <string>
#include <DirectXMath.h>
/// Plain three-component float vector used by the scalar (non-SIMD) tests.
class Vector3
{
public:
    /// Creates the zero vector (0, 0, 0).
    Vector3() : x(0.0f), y(0.0f), z(0.0f) {}

    // No destructor is declared on purpose: the type owns no resources, and
    // leaving the destructor implicit keeps Vector3 trivially destructible.

    float x;
    float y;
    float z;
};
/// 4x4 float matrix stored as a flat array of 16 elements.
class Matrix4
{
public:
    /// Creates the identity matrix.
    Matrix4()
    {
        for (int i = 0; i < 16; ++i)
        {
            // The diagonal of a 4x4 matrix sits at flat indices 0, 5, 10, 15.
            data[i] = (i % 5 == 0) ? 1.0f : 0.0f;
        }
    }

    // No destructor is declared: the matrix owns no resources, so the
    // implicit (trivial) one is sufficient.

    float data[16];

    /// Copies 16 floats from `b` into this matrix, element for element.
    /// @param b Pointer to at least 16 readable floats; must not be null.
    ///          Taken as pointer-to-const so read-only arrays can be passed.
    void set(const float* b)
    {
        for (int i = 0; i < 16; ++i)
        {
            data[i] = b[i];
        }
    }
};
/// Vector held directly in a single __m128 register value.
/// Lane 0 = x, lane 1 = y, lane 2 = z, lane 3 = w.
class SIMDVector3
{
public:
    /// Creates a vector with all four lanes set to zero (w is 0 here).
    SIMDVector3() : data(_mm_setzero_ps()) {}

    /// Wraps an existing register value without modifying it.
    SIMDVector3(__m128 data) : data(data) {}

    /// Creates (x, y, z) with the w lane set to 1.0f.
    /// _mm_set_ps takes its arguments from the highest lane to the lowest,
    /// which is why the components are listed in reverse.
    SIMDVector3(float x, float y, float z) : data(_mm_set_ps(1.0f, z, y, x)) {}

    // No destructor is declared: the wrapper owns no resources, so the
    // implicit (trivial) one is sufficient.

    __m128 data;
};
/// 4x4 float matrix held as four __m128 values.
/// data[i] holds source elements [4*i .. 4*i+3], with element 4*i in lane 0.
class SIMDMatrix4
{
public:
    /// Creates the identity matrix.
    /// _mm_setr_ps lists lanes from lowest to highest, so data[i] gets its
    /// 1.0f in lane i. (Using _mm_set_ps with the same argument order would
    /// place the ones on the anti-diagonal, because _mm_set_ps lists lanes
    /// from highest to lowest.)
    SIMDMatrix4()
    {
        data[0] = _mm_setr_ps(1.0f, 0.0f, 0.0f, 0.0f);
        data[1] = _mm_setr_ps(0.0f, 1.0f, 0.0f, 0.0f);
        data[2] = _mm_setr_ps(0.0f, 0.0f, 1.0f, 0.0f);
        data[3] = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
    }

    /// Loads 16 consecutive floats, four per register, preserving order.
    /// @param b Pointer to at least 16 readable floats; must not be null.
    ///          No particular alignment is required (unaligned loads are used).
    SIMDMatrix4(const float* b)
    {
        for (int i = 0; i < 4; ++i)
        {
            data[i] = _mm_loadu_ps(b + 4 * i);
        }
    }

    // No destructor is declared: the matrix owns no resources, so the
    // implicit (trivial) one is sufficient.

    __m128 data[4];
};
/// Scalar reference transform: multiplies the matrix by the point (b.x, b.y, b.z, 1)
/// and returns the first three components of the product.
/// The matrix is indexed so that the element for output row `row` and input
/// component `col` lives at data[col * 4 + row].
Vector3 normMul(const Matrix4 &m, const Vector3 &b)
{
    const float *e = m.data;
    float comps[3];
    for (int row = 0; row < 3; ++row)
    {
        // Same left-to-right sum as writing the three rows out by hand.
        comps[row] = e[row] * b.x + e[row + 4] * b.y + e[row + 8] * b.z + e[row + 12] * 1.0f;
    }

    Vector3 out;
    out.x = comps[0];
    out.y = comps[1];
    out.z = comps[2];
    return out;
}
/// First SIMD attempt: builds SSE registers from the scalar Matrix4/Vector3 on
/// every call, multiplies, and reduces each product with two horizontal adds.
/// Returns the x, y and z of matrix * (b.x, b.y, b.z, 1).
Vector3 origSIMDMul(const Matrix4 &m, const Vector3 &b)
{
    const float *e = m.data;

    // Point in homogeneous form; _mm_set_ps lists lanes highest first.
    const __m128 point = _mm_set_ps(1.0f, b.z, b.y, b.x);

    float sums[3];
    for (int row = 0; row < 3; ++row)
    {
        // Gather the strided elements for this output row (column-order matrix).
        const __m128 matRow = _mm_set_ps(e[row + 12], e[row + 8], e[row + 4], e[row]);
        __m128 acc = _mm_mul_ps(matRow, point);

        // Two horizontal adds leave the sum of all four lanes in lane 0.
        acc = _mm_hadd_ps(acc, acc);
        acc = _mm_hadd_ps(acc, acc);
        sums[row] = _mm_cvtss_f32(acc);
    }

    Vector3 out;
    out.x = sums[0];
    out.y = sums[1];
    out.z = sums[2];
    return out;
}
/// Transforms `b` by `m` entirely in SSE registers and writes the result to `r`.
/// Computes m.data[0] * b.x + m.data[1] * b.y + m.data[2] * b.z + m.data[3];
/// only lanes 0..2 of `b` are read, and m.data[3] is added unscaled.
void simdMul(const SIMDMatrix4 &m, const SIMDVector3 &b, SIMDVector3 &r)
{
    const __m128 vec = b.data;

    // Broadcast each component of the vector across all four lanes.
    const __m128 xxxx = _mm_shuffle_ps(vec, vec, _MM_SHUFFLE(0, 0, 0, 0));
    const __m128 yyyy = _mm_shuffle_ps(vec, vec, _MM_SHUFFLE(1, 1, 1, 1));
    const __m128 zzzz = _mm_shuffle_ps(vec, vec, _MM_SHUFFLE(2, 2, 2, 2));

    const __m128 xTerm = _mm_mul_ps(m.data[0], xxxx);
    const __m128 yTerm = _mm_mul_ps(m.data[1], yyyy);
    const __m128 zTerm = _mm_mul_ps(m.data[2], zzzz);

    // Accumulate from the innermost pair outwards: x + (y + (z + data[3])).
    const __m128 zPlusLast = _mm_add_ps(zTerm, m.data[3]);
    const __m128 yPlusRest = _mm_add_ps(yTerm, zPlusLast);
    r.data = _mm_add_ps(xTerm, yPlusRest);
}
int main()
{
LARGE_INTEGER startTime;
LARGE_INTEGER endTime;
LARGE_INTEGER frq;
QueryPerformanceFrequency(&frq);
DirectX::XMFLOAT3 xmResult;
DirectX::XMFLOAT3 xmVec3(2.0f, 5.0f, 10.0f);
DirectX::XMFLOAT4X4 xmMat44(1.0f, 0.0f, 0.0f, 0.0f,
0.0f, 1.0f, 0.0f, 0.0f,
0.0f, 0.0f, 1.0f, 0.0f,
0.0f, 0.0f, 0.0f, 1.0f);
DirectX::XMVECTOR rawVec;
DirectX::XMMATRIX rawMat;
QueryPerformanceCounter(&startTime);
for (int i = 0; i < 10000; ++i)
{
rawMat = DirectX::XMLoadFloat4x4(&xmMat44);
for (int j = 0; j < 4; ++j)
{
rawVec = DirectX::XMLoadFloat3(&xmVec3);
DirectX::XMStoreFloat3(&xmResult, DirectX::XMVector3Transform(rawVec, rawMat));
}
}
QueryPerformanceCounter(&endTime);
std::cout << "DirectX Math XMFLOAT TIME: " + std::to_string((double)((endTime.QuadPart - startTime.QuadPart) * 1000) / (double)frq.QuadPart) + "ms" << std::endl;
DirectX::XMVECTOR xmSimdResult;
DirectX::XMVECTOR xmSimdVec = DirectX::XMVectorSet(2.0f, 5.0f, 10.0f, 1.0f);
DirectX::XMMATRIX xmSimdMat = DirectX::XMMatrixIdentity();
QueryPerformanceCounter(&startTime);
for (int i = 0; i < 10000; ++i)
{
for (int j = 0; j < 4; ++j)
{
xmSimdResult = DirectX::XMVector3Transform(xmSimdVec, xmSimdMat);
}
}
QueryPerformanceCounter(&endTime);
std::cout << "DirectX Math RAW SIMD TIME: " + std::to_string((double)((endTime.QuadPart - startTime.QuadPart) * 1000) / (double)frq.QuadPart) + "ms" << std::endl;
SIMDVector3 smRes;
float data[16];
data[0] = 1.0f;
data[1] = 5.0f;
data[2] = 9.0f;
data[3] = 13.0f;
data[4] = 2.0f;
data[5] = 6.0f;
data[6] = 10.0f;
data[7] = 14.0f;
data[8] = 3.0f;
data[9] = 7.0f;
data[10] = 11.0f;
data[11] = 15.0f;
data[12] = 4.0f;
data[13] = 8.0f;
data[14] = 12.0f;
data[15] = 16.0f;
SIMDMatrix4 smMat(data);
SIMDVector3 smVec(2.0f, 5.0f, 10.0f);
QueryPerformanceCounter(&startTime);
for (int i = 0; i < 10000; ++i)
{
for (int j = 0; j < 4; ++j)
{
simdMul(smMat, smVec, smRes);
}
}
QueryPerformanceCounter(&endTime);
std::cout << "New RAW SIMD Solution TIME: " + std::to_string((double)((endTime.QuadPart - startTime.QuadPart) * 1000) / (double)frq.QuadPart) + "ms" << std::endl;
Vector3 vecRes;
Matrix4 mat1;
Vector3 v1;
v1.x = 2.0f;
v1.y = 5.0f;
v1.z = 10.0f;
QueryPerformanceCounter(&startTime);
for (int i = 0; i < 10000; ++i)
{
for (int j = 0; j < 4; ++j)
{
vecRes = origSIMDMul(mat1, v1);
}
}
QueryPerformanceCounter(&endTime);
std::cout << "Original SIMD Solution TIME: " + std::to_string((double)((endTime.QuadPart - startTime.QuadPart) * 1000) / (double)frq.QuadPart) + "ms" << std::endl;
Matrix4 mat2;
Vector3 v2;
v2.x = 2.0f;
v2.y = 5.0f;
v2.z = 10.0f;
QueryPerformanceCounter(&startTime);
for (int i = 0; i < 10000; ++i)
{
for (int j = 0; j < 4; ++j)
{
vecRes = normMul(mat2, v2);
}
}
QueryPerformanceCounter(&endTime);
std::cout << "Norm TIME: " + std::to_string((double)((endTime.QuadPart - startTime.QuadPart) * 1000) / (double)frq.QuadPart) + "ms" << std::endl;
std::cout << "Complete" << std::endl;
}
On 12/11/2017 at 6:25 AM, Hodgman said:
This function spends more time converting between non-simd arranged data to simd-arranged data, and back again, than it does actually doing any calculations.
Looking at the test times, @Hodgman is 100% right. Loading data into the SIMD registers and then getting it back out completely outweighs the benefit of the fast SIMD calculations. This can also be seen in the DirectX Math test where I use the XMFLOAT3 / XMFLOAT4X4 types, as these need to be loaded/stored. I have a question about this further down.
On 12/11/2017 at 3:36 AM, Infinisearch said:
Using SIMD correctly is more than just using intrinsics
SIMD operations are insanely fast. When running in release mode the timing on the RAW SIMD tests can't even register (0ms). I can bump the loop up to simulate over 100 million vector transformations against a matrix and it still comes as 0ms on the timer. So you can really do some serious work if you directly use the SIMD __m128 type and do not load/unload things often
Now this brings me back to my questions about DirectX Math and how to use the library. According to the MSDN DirectXMath guide, the XMVECTOR and XMMATRIX types are the workhorses of the DirectXMath Library. That makes total sense, but then they go on to say:
Quote
Allocations from the heap, however, are more complicated. As such, you need to be careful whenever you use either XMVECTOR or XMMATRIX as a member of a class or structure to be allocated from the heap. On Windows x64, all heap allocations are 16-byte aligned, but for Windows x86, they are only 8-byte aligned. There are options for allocating structures from the heap with 16-byte alignment (see Properly Align Allocations). For C++ programs, you can use operator new/delete/new[]/delete[] overloads (either globally or class-specific) to enforce optimal alignment if desired.
Which I understand, but I guess I'm not really sure what is expected in the overloaded new/delete/new[]/delete[]. I just know that doing:
// Example type for the question: a class that stores XMVECTOR members directly
// and is then allocated from the heap.
// NOTE(review): no access specifier is given, so the constructor and destructor
// are private; the `new Sprite` that follows presumably assumes they are public.
class Sprite
{
Sprite(){}
~Sprite(){}
// XMVECTOR members held by value. The MSDN text quoted above says x86 heap
// allocations are only 8-byte aligned, which is the concern with members like these.
XMVECTOR position;
XMVECTOR texCoords;
XMVECTOR color;
};
Sprite* mySprite = new Sprite;
Is going to mess up the alignment and make SIMD operations take a performance hit
Then they go on to say
Quote
However, often it is easier and more compact to avoid using XMVECTOR or XMMATRIX directly in a class or structure. Instead, make use of the XMFLOAT3, XMFLOAT4, XMFLOAT4X3, XMFLOAT4X4, and so on, as members of your structure. Further, you can use the Vector Loading and Vector Storage functions to move the data efficiently into XMVECTOR or XMMATRIX local variables, perform computations, and store the results. There are also streaming functions (XMVector3TransformStream, XMVector4TransformStream, and so on) that efficiently operate directly on arrays of these data types
And that's where I get thrown off
Am I normally supposed to be using the XMFLOAT[n] / XMFLOAT[n]X[m] types?
Based on the above statement it sounds like I should, but that does not make sense to me if I want to take advantage of SIMD operations, since having to load/unload data causes a major performance hit, often making the timings worse than using normal math operations.
Also, I noticed during my tests (and this may be my fault) that I seem to have to transpose the matrix before multiplying it by the vector to get the correct result when using DirectXMath. Is this normal?
//Multiplying the matrix by vec (vec as a column vector) should give the result vector 46, 118, 190, 262
//But this only happens if I transpose the matrix first
//If I DO NOT transpose the matrix first I get the result vector 130, 148, 166, 184, which is wrong?
//NOTE(review): 130, 148, 166, 184 equals 2*row0 + 5*row1 + 10*row2 + row3 of the untransposed matrix,
//i.e. vec treated as a row vector multiplied on the left of mat - presumably the convention
//XMVector3Transform uses; confirm against the DirectXMath documentation
DirectX::XMVECTOR vec = DirectX::XMVectorSet(2.0f, 5.0f, 10.0f, 1.0f);
DirectX::XMMATRIX mat = {
1.0f, 2.0f, 3.0f, 4.0f,
5.0f, 6.0f, 7.0f, 8.0f,
9.0f, 10.0f, 11.0f, 12.0f,
13.0f, 14.0f, 15.0f, 16.0f,
};
//Transpose so the transform below yields the expected 46, 118, 190, 262
mat = DirectX::XMMatrixTranspose(mat);
DirectX::XMVECTOR r = DirectX::XMVector3Transform(vec, mat);