I would like to use dynamic branching to skip unnecessary instructions. Please consider two functions:
float computeFirst(float s)
[branch] if(abs(s) > 1.0)
return -1.0;
// a bunch of instructions
return acos(s); // acos just for example
float computeSecond(float s)
[branch] if(abs(s) > 1.0)
return -1.0;
// a bunch of instructions
return acos(s); // acos just for example
Are these functions equivalent? Both have the dynamic branch, but do they work the same way, and are the unnecessary instructions actually skipped (when all the pixels in a warp follow the same branch)?
Using Shader Playground, I found that these two functions compile differently:
// computeFirst
dcl_globalFlags refactoringAllowed
dcl_input_ps linear v0.x
dcl_output o0.xyzw
dcl_temps 2
lt r0.x, l(1.000000), |v0.x|
if_nz r0.x
mov r0.y, l(-1.000000)
add r0.z, -|v0.x|, l(1.000000)
sqrt r0.z, r0.z
mad r0.w, |v0.x|, l(-0.018729), l(0.074261)
mad r0.w, r0.w, |v0.x|, l(-0.212114)
mad r0.w, r0.w, |v0.x|, l(1.570729)
mul r1.x, r0.z, r0.w
mad r1.x, r1.x, l(-2.000000), l(3.141593)
lt r1.y, v0.x, -v0.x
and r1.x, r1.y, r1.x
mad r0.z, r0.w, r0.z, r1.x
movc o0.x, r0.x, r0.y, r0.z
mov o0.yzw, l(0,0,0,0)
// Approximately 17 instruction slots used
// computeSecond
dcl_globalFlags refactoringAllowed
dcl_input_ps linear v0.x
dcl_output o0.xyzw
dcl_temps 2
lt r0.x, l(1.000000), |v0.x|
if_nz r0.x
mov r0.x, l(-1.000000)
add r0.y, -|v0.x|, l(1.000000)
sqrt r0.y, r0.y
mad r0.z, |v0.x|, l(-0.018729), l(0.074261)
mad r0.z, r0.z, |v0.x|, l(-0.212114)
mad r0.z, r0.z, |v0.x|, l(1.570729)
mul r0.w, r0.y, r0.z
mad r0.w, r0.w, l(-2.000000), l(3.141593)
lt r1.x, v0.x, -v0.x
and r0.w, r0.w, r1.x
mad r0.x, r0.z, r0.y, r0.w
mov o0.x, r0.x
mov o0.yzw, l(0,0,0,0)
// Approximately 18 instruction slots used
In computeFirst, the dynamic branch looks useless: it never seems to allow the unnecessary instructions to be skipped. Am I misunderstanding something, and are these two compiled versions in fact equivalent?