I have written a compute shader that calculates a quaternion Julia set. The strange thing is that it performs best when I declare a 1x1 work group: layout(local_size_x = 1, local_size_y = 1) in;
I tried bumping the work-group size up to 32x32, but performance got worse. This happens on both AMD and Intel hardware. Is there any rhyme or reason behind this? Also, is there such a thing as inline functions in GLSL?
The shader as a whole looks like this:
#version 430 core
// Work-group size: one invocation per group. main() maps
// gl_GlobalInvocationID.xy directly to a pixel, so the dispatch needs one
// work group per pixel to be processed.
layout(local_size_x = 1, local_size_y = 1) in;
// Destination image (binding 0, single 32-bit float channel); main() writes
// the result of iterate() into its first component.
layout(binding = 0, r32f) writeonly uniform image2D output_image;
// Source image (binding 1, four 32-bit float channels); each texel is loaded
// by main() and used as the starting value z for iterate().
layout(binding = 1, rgba32f) readonly uniform image2D input_image;
// Constant that iter_func() multiplies with qsin(z) via qmul().
uniform vec4 c;
// Upper bound on how many times iterate() applies iter_func() per pixel.
uniform int max_iterations;
// Escape radius: iterate() stops once dot(z, z) >= threshold * threshold.
uniform float threshold;
// functions go here
// Applies one step of the iteration: returns qadd(qsin(z), qmul(c, qsin(z))).
//
// z       current value of the iterated vec4.
// returns the next value of z.
//
// The previous version evaluated qsin() twice on identical copies of z and
// routed every intermediate through zero-initialised temporaries and qcopy().
// qsin(z) is now computed once and reused, which halves the qsin() calls in
// the innermost loop.
// NOTE(review): this relies on qsin() being a pure function of its argument
// and on qcopy() being a plain value copy; both are defined outside this
// block, so confirm against their definitions.
vec4 iter_func(vec4 z)
{
    vec4 sin_z = qsin(z);
    return qadd(sin_z, qmul(c, sin_z));
}
// Repeatedly applies iter_func() to z, at most max_iterations times, stopping
// early as soon as the squared length of z reaches threshold * threshold.
//
// z       starting value.
// returns the length of z after the last applied step, or the length of the
//         starting value if no step was applied.
float iterate(vec4 z)
{
    // Compare squared lengths so no sqrt is needed inside the loop.
    float escape_sq = threshold * threshold;
    float mag_sq = dot(z, z);

    int step = 0;
    while (step < max_iterations)
    {
        z = iter_func(z);
        mag_sq = dot(z, z);

        if (mag_sq >= escape_sq)
        {
            break;
        }

        ++step;
    }

    return sqrt(mag_sq);
}
// Entry point: one invocation handles the pixel addressed by
// gl_GlobalInvocationID.xy. It loads the starting value from input_image,
// runs iterate() on it, and stores the resulting magnitude in the first
// component of the matching output_image texel (remaining components are 0).
void main()
{
    ivec2 texel = ivec2(gl_GlobalInvocationID.xy);
    vec4 start = imageLoad(input_image, texel);
    float escape_len = iterate(start);
    imageStore(output_image, texel, vec4(escape_len, 0.0, 0.0, 0.0));
}