My "first-order correction to LERP" was meant to modify the straight line in LERP to get closer to the arc along the unit sphere. I don't know if the approximation is good enough to be visually acceptable in a game.
Here's a much better approximation, which still only uses multiplication and addition, so it should be very fast.
float approx_a(float d, float t) {
float const x1 = 0.555713f;
float const x2 = 0.281959f;
float const x3 = -0.207537f;
float const x4 = 0.143091f;
float const x5 = -0.0925688f;
return (1.0f - t) * (1.0 + (1.0f - d * x1 - t * x2 - d*d*x3 - t*t*x4 - d*t*x5) * t * (1.0 - d));
}
void approximate_slerp(float const qa[4], float const qb[4], float t, float ret[4]) {
float d = qa[0] * qb[0] + qa[1] * qb[1] + qa[2] * qb[2] + qa[3] * qb[3];
float a = approx_a(std::abs(d), t);
float b = copysign(approx_a(std::abs(d), 1.0f - t), d);
ret[0] = qa[0] * a + qb[0] * b;
ret[1] = qa[1] * a + qb[1] * b;
ret[2] = qa[2] * a + qb[2] * b;
ret[3] = qa[3] * a + qb[3] * b;
// The quaternion computed above might be good enough. Its main flaw might be that it doesn't have length exactly 1.
// If this is important, you can use the lines below, which will make the length much closer to 1.
float l2_norm = ret[0]*ret[0] + ret[1]*ret[1] + ret[2]*ret[2] + ret[3]*ret[3];
float inv_length = 1.5 - 0.5 * l2_norm;
ret[0] *= inv_length;
ret[1] *= inv_length;
ret[2] *= inv_length;
ret[3] *= inv_length;
}
Rearranging things a bit to make the compiler's life easier, you can get this:
void approximate_slerp2(float const qa[4], float const qb[4], float t, float ret[4]) {
float const x1 = 0.555713f;
float const x2 = 0.281959f;
float const x3 = -0.207537f;
float const x4 = 0.143091f;
float const x5 = -0.0925688f;
float d = qa[0] * qb[0] + qa[1] * qb[1] + qa[2] * qb[2] + qa[3] * qb[3];
float ad = std::abs(d);
// T<n> are quantities that only depend on t, so they can be calculated once for many bones
float T1 = t * (1.0f - t);
float T2 = 1.0f - t * (x2 + t*x4);
float T3 = 1.0f - (1.0f-t) * (x2 +(1.0f-t)*x4);
// C<n> are quantities that are shared in the computation of a and b. I'm not sure if the compiler needs help here, but it shouldn't hurt to organize things like this.
float C1 = T1 * (1.0f - ad);
float C2 = ad * (x1 + ad * x3);
float C3 = ad * x5;
float a = 1.0f - t + C1 * (T2 - C2 - t * C3);
float b = copysign(t + C1 * (T3 - C2 - (1.0f-t) * C3), d);
ret[0] = qa[0] * a + qb[0] * b;
ret[1] = qa[1] * a + qb[1] * b;
ret[2] = qa[2] * a + qb[2] * b;
ret[3] = qa[3] * a + qb[3] * b;
float final_l2 = ret[0]*ret[0] + ret[1]*ret[1] + ret[2]*ret[2] + ret[3]*ret[3];
float inv_length = 1.5 - 0.5*final_l2;
ret[0] *= inv_length;
ret[1] *= inv_length;
ret[2] *= inv_length;
ret[3] *= inv_length;
}