Hi all,
I am currently writing a Gaussian blur pixel shader.
The shader code is as follows:
// Texture sampled by main() as the input of the blur.
sampler2D Texture0;
// main() steps its sample coordinates by 1 / TexSize.x and 1 / TexSize.y
// (presumably the texture dimensions in pixels -- confirm against the caller).
float2 TexSize;
// Pass selector: main() steps along x when this is 0 and along y otherwise.
float ScanPass;
// Upper bound on the number of weights the shader can receive.
static const int MAX_RADIUS = 200;
// Number of float4 constants needed to hold MAX_RADIUS floats (rounded up).
static const int MAX_FLOAT4 = (MAX_RADIUS - 1) / 4 + 1;
// Blur weights, packed four per float4 and read in order by main(),
// starting with the weight for the centre pixel. main() stops starting new
// float4 groups once it has seen a weight <= 0.
float4 Template[MAX_FLOAT4];
// Pixel shader entry point: performs one 1-D pass of the blur.
// Starting at texCoord, it walks outwards in both directions along one axis
// (x when ScanPass == 0, y otherwise), weighting each pair of samples with
// successive entries of Template, and returns the accumulated colour.
float4 main(float2 texCoord : TEXCOORD0) : COLOR
{
// coord.xy is the sample position that moves in the negative direction,
// coord.zw the one that moves in the positive direction; both start at texCoord.
float4 coord = texCoord.xyxy;
// Coordinate step between consecutive samples on each axis.
float2 scale = { 1 / TexSize.x, 1 / TexSize.y };
float4 color = 0;
// Cleared once a non-positive weight is seen; gates the remaining float4 groups.
bool run = true;
for (int i = 0; i < MAX_FLOAT4; ++i)
{
float4 t = Template[i];
// On the first step coord.xy and coord.zw are the same position, so the
// centre sample is added twice below; halving its weight compensates.
if (i == 0)
t[0] /= 2;
if (run)
{
for (int j = 0; j < 4; ++j)
{
// NOTE(review): this only clears the flag. The two samples below are still
// accumulated for this j and for the remaining components of t; `run` is
// only tested again at the next iteration of the outer loop.
if (t[j] <= 0)
run = false;
// The same weight is applied to both sample positions.
color += tex2D(Texture0, coord.xy) * t[j];
color += tex2D(Texture0, coord.zw) * t[j];
if (ScanPass == 0)
{
// ScanPass == 0: step both positions apart along x.
coord.x -= scale.x;
coord.z += scale.x;
}
else
{
// Any other ScanPass value: step both positions apart along y.
coord.y -= scale.y;
coord.w += scale.y;
}
// Keep both sample positions inside the [0, 1] coordinate range.
coord = clamp(coord, float4(0, 0, 0, 0), float4(1, 1, 1, 1));
}
}
}
return color;
}
The array Template will contain the Gaussian blur factors.
For example, if the standard deviation is 1.0, there are 3 + 1 + 3 pixels affected (3 * sigma + 1 factors, radius = 3). I put the factors calculated for x = 0, 1, 2, 3 into the array. (To reduce the number of registers used, the factors for x = -1, -2, -3 are not passed, because the kernel is symmetric.)
The problem is that when I increase MAX_RADIUS, the shader compiler generates an error:
error X4505: maximum temp register index exceeded
I took a look at the generated asm code and found that the array indexing takes a lot of instructions. They are:
add r6, r5.y, c52
add r7, r5.y, c59
add r8, r5.y, c60
add r9, r5.y, c53
add r10, r5.y, c54
add r11, r5.y, c55
add r12, r5.y, c56
add r13, r5.y, c57
add r14, r5.y, c58
add r15, r5.y, c61
add r16, r5.y, c62
add r17, r5.y, c63
add r5.zw, r5.y, c64.xyxy
abs r6, r6
abs r7, r7
abs r8, r8
abs r9, r9
abs r10, r10
abs r11, r11
abs r12, r12
abs r13, r13
abs r14, r14.yzwx
abs r15, r15
abs r16, r16
abs r17, r17
abs r18.xy, r5.zwzw
mov r19, -r15
mov r20, -r16
mov r21, -r17
mov r5.zw, -r6.xyxz
mov r22.xy, -r7.xzzw
mov r22.zw, -r8.xyxz
mov r23.xy, -r9.xzzw
mov r23.zw, -r10.xyxz
mov r24.xy, -r11.xzzw
mov r24.zw, -r12.xyxz
mov r25.xy, -r13.xzzw
mov r18.z, r14.w
mov r25.zw, -r18.xyzx
mov r26.xy, -r6.ywzw
mov r26.zw, -r7.xyyw
mov r27.xy, -r8.ywzw
mov r27.zw, -r9.xyyw
mov r28.xy, -r10.ywzw
mov r28.zw, -r11.xyyw
mov r29.xy, -r12.ywzw
mov r29.zw, -r13.xyyw
mov r14.w, r18.y
mov r30, -r14
mov r6.xz, -r6
add r5.zw, r5, r6.xyxz
cmp r5.zw, r5, c65.x, c65.y
mov r6.xz, -r7
add r6.xz, r6, r22.xyyw
cmp r6.xz, r6, c65.x, c65.y
mov r7.xz, -r8
add r7.xz, r7, r22.zyww
cmp r7.xz, r7, c65.x, c65.y
mov r8.xz, -r9
add r8.xz, r8, r23.xyyw
cmp r8.xz, r8, c65.x, c65.y
mov r9.xz, -r10
add r9.xz, r9, r23.zyww
cmp r9.xz, r9, c65.x, c65.y
mov r10.xz, -r11
add r10.xz, r10, r24.xyyw
cmp r10.xz, r10, c65.x, c65.y
mov r11.xz, -r12
add r11.xz, r11, r24.zyww
cmp r11.xz, r11, c65.x, c65.y
mov r12.xz, -r13
add r12.xz, r12, r25.xyyw
cmp r12.xz, r12, c65.x, c65.y
mov r13.xz, -r18.zyxw
add r13.xz, r13, r25.zyww
cmp r13.xz, r13, c65.x, c65.y
mov r6.yw, -r6
add r6.yw, r6, r26.xxzy
cmp r6.yw, r6, c65.x, c65.y
mov r7.yw, -r7
add r7.yw, r7, r26.xzzw
cmp r7.yw, r7, c65.x, c65.y
mov r8.yw, -r8
add r8.yw, r8, r27.xxzy
......
In fact, I have already optimized the HLSL code above. In previous versions, I stored the Template array as "float Template[MAX_RADIUS]" and indexed Template directly everywhere it was used. In that case, compilation failed even when MAX_RADIUS was 53 or 54.
Some articles say that there is no "memory" in HLSL, only registers, and that this is why array indexing becomes so complex. Is that right?
Can I use registers directly in HLSL? Something like:
if (i < 4)
__asm mov r0, c0
else if (i < 8)
__asm mov r0, c1
...
I think that would be much shorter than what the shader compiler generated.
Above all, my questions are:
How can I pass such large arrays into HLSL (to support a large-radius Gaussian blur)?
Or, how can I optimize the array indexing?
Thank you.