I would like to use dynamic branching to skip unnecessary instructions. Please consider two functions:
// Returns -1.0 when abs(s) > 1.0; otherwise returns acos(s).
// Early-return form: the out-of-range case returns from inside the `if`,
// and the remaining work follows it with no `else`.
float computeFirst(float s)
{
// [branch] requests a dynamic branch on this condition.
[branch] if(abs(s) > 1.0)
return -1.0;
// a bunch of instructions
return acos(s); // acos just for example
}
// Returns -1.0 when abs(s) > 1.0; otherwise returns acos(s).
// Explicit if/else form: the remaining work is placed inside the `else`
// block rather than after the `if`.
float computeSecond(float s)
{
// [branch] requests a dynamic branch on this condition.
[branch] if(abs(s) > 1.0)
{
return -1.0;
}
else
{
// a bunch of instructions
return acos(s); // acos just for example
}
}
Are these functions equivalent? Both use a dynamic branch, but do they work the same way, and are the unnecessary instructions actually skipped (when all the pixels in a warp follow the same branch)?
Using Shader Playground, I found that these two functions compile differently:
// computeFirst
// NOTE(review): this is the compiler's listing only; what the GPU executes
// after driver compilation is not visible here.
ps_5_0
dcl_globalFlags refactoringAllowed
dcl_input_ps linear v0.x
dcl_output o0.xyzw
dcl_temps 2
// r0.x = (1.0 < |v0.x|), i.e. the abs(s) > 1.0 test (presumably s is v0.x).
lt r0.x, l(1.000000), |v0.x|
// The conditional region contains only the load of the -1.0 constant.
if_nz r0.x
mov r0.y, l(-1.000000)
endif
// Everything from here down to the movc lies outside if/endif, so in this
// listing it is not guarded by the branch. This sequence appears to be the
// expansion of acos(s) from the HLSL; its result ends up in r0.z.
add r0.z, -|v0.x|, l(1.000000)
sqrt r0.z, r0.z
mad r0.w, |v0.x|, l(-0.018729), l(0.074261)
mad r0.w, r0.w, |v0.x|, l(-0.212114)
mad r0.w, r0.w, |v0.x|, l(1.570729)
mul r1.x, r0.z, r0.w
mad r1.x, r1.x, l(-2.000000), l(3.141593)
lt r1.y, v0.x, -v0.x
and r1.x, r1.y, r1.x
mad r0.z, r0.w, r0.z, r1.x
// Conditional select on r0.x: r0.y (-1.0) when the test passed, else r0.z.
movc o0.x, r0.x, r0.y, r0.z
mov o0.yzw, l(0,0,0,0)
ret
// Approximately 17 instruction slots used
// computeSecond
// NOTE(review): this is the compiler's listing only; what the GPU executes
// after driver compilation is not visible here.
ps_5_0
dcl_globalFlags refactoringAllowed
dcl_input_ps linear v0.x
dcl_output o0.xyzw
dcl_temps 2
// r0.x = (1.0 < |v0.x|), i.e. the abs(s) > 1.0 test (presumably s is v0.x).
lt r0.x, l(1.000000), |v0.x|
if_nz r0.x
// Taken side: r0.x is reused to hold the result, -1.0.
mov r0.x, l(-1.000000)
else
// Not-taken side: this sequence (apparently the expansion of acos(s) from the
// HLSL) sits entirely inside the else region and writes its result to r0.x.
add r0.y, -|v0.x|, l(1.000000)
sqrt r0.y, r0.y
mad r0.z, |v0.x|, l(-0.018729), l(0.074261)
mad r0.z, r0.z, |v0.x|, l(-0.212114)
mad r0.z, r0.z, |v0.x|, l(1.570729)
mul r0.w, r0.y, r0.z
mad r0.w, r0.w, l(-2.000000), l(3.141593)
lt r1.x, v0.x, -v0.x
and r0.w, r0.w, r1.x
mad r0.x, r0.z, r0.y, r0.w
endif
// Both sides leave their result in r0.x, so a plain mov suffices here.
mov o0.x, r0.x
mov o0.yzw, l(0,0,0,0)
ret
// Approximately 18 instruction slots used
In computeFirst, the dynamic branch looks useless: it never seems to allow the unnecessary instructions to be skipped. Am I misunderstanding something, or are these two compiled versions actually equivalent?