Hello guys,
I am working on implementing deferred texturing tecnique.
I have screen-space material ID texture from render G-Buffer pass for which I would like to calculate screen space rectangles encompasing material IDs used for the same mesh type. By mesh type I refer to one pipeline state object permutation used for G-Buffer shading pass. Those screen space rectangles are later used to shade G-Buffer based on mesh type as described in article Deferred+ from GPU Zen.
My compute shader pass for calculating encompasing rectangles does not produce expected results.
I did some debugging with PIX and I can see that PIX for some reason does not show g_MaterialIDTexture and g_MeshTypePerMaterialIDBuffer in the list of binded resources.
When I step with debugger through the shader code, reading g_MaterialIDTexture and MeshTypePerMaterialIDBuffer is skipped. You can see the shader below.
groupshared uint2 g_ScreenMinPoints[NUM_MESH_TYPES];
groupshared uint2 g_ScreenMaxPoints[NUM_MESH_TYPES];
#define NUM_THREADS_PER_GROUP (NUM_THREADS_X * NUM_THREADS_Y)
cbuffer AppDataBuffer : register(b0)
{
AppData g_AppData;
}
RWStructuredBuffer<uint2> g_ShadingRectangleMinPointBuffer : register(u0);
RWStructuredBuffer<uint2> g_ShadingRectangleMaxPointBuffer : register(u1);
Texture2D<uint> g_MaterialIDTexture : register(t0);
Buffer<uint> g_MeshTypePerMaterialIDBuffer : register(t1);
[numthreads(NUM_THREADS_X, NUM_THREADS_Y, 1)]
void Main(uint3 globalThreadId : SV_DispatchThreadID, uint localThreadIndex : SV_GroupIndex)
{
for (uint index = localThreadIndex; index < NUM_MESH_TYPES; index += NUM_THREADS_PER_GROUP)
{
g_ScreenMinPoints[index] = uint2(0xffffffff, 0xffffffff);
g_ScreenMaxPoints[index] = uint2(0, 0);
}
GroupMemoryBarrierWithGroupSync();
if ((globalThreadId.x < g_AppData.screenSize.x) && (globalThreadId.y < g_AppData.screenSize.y))
{
uint materialID = g_MaterialIDTexture[globalThreadId.xy];
uint meshType = g_MeshTypePerMaterialIDBuffer[materialID];
InterlockedMin(g_ScreenMinPoints[meshType].x, globalThreadId.x);
InterlockedMin(g_ScreenMinPoints[meshType].y, globalThreadId.y);
InterlockedMax(g_ScreenMaxPoints[meshType].x, globalThreadId.x);
InterlockedMax(g_ScreenMaxPoints[meshType].y, globalThreadId.y);
}
GroupMemoryBarrierWithGroupSync();
for (uint index = localThreadIndex; index < NUM_MESH_TYPES; index += NUM_THREADS_PER_GROUP)
{
InterlockedMin(g_ShadingRectangleMinPointBuffer[index].x, g_ScreenMinPoints[index].x);
InterlockedMin(g_ShadingRectangleMinPointBuffer[index].y, g_ScreenMinPoints[index].y);
InterlockedMax(g_ShadingRectangleMaxPointBuffer[index].x, g_ScreenMaxPoints[index].x);
InterlockedMax(g_ShadingRectangleMaxPointBuffer[index].y, g_ScreenMaxPoints[index].y);
}
}
I checked DXBC output and it does not include them either.
//
// Generated by Microsoft (R) HLSL Shader Compiler 10.1
//
//
// Buffer Definitions:
//
// cbuffer AppDataBuffer
// {
//
// struct AppData
// {
//
// float4x4 viewMatrix; // Offset: 0
// float4x4 viewInvMatrix; // Offset: 64
// float4x4 projMatrix; // Offset: 128
// float4x4 projInvMatrix; // Offset: 192
// float4x4 viewProjMatrix; // Offset: 256
// float4x4 viewProjInvMatrix; // Offset: 320
// float4x4 prevViewProjMatrix; // Offset: 384
// float4x4 prevViewProjInvMatrix;// Offset: 448
// float4x4 notUsed1; // Offset: 512
// float4 cameraWorldSpacePos; // Offset: 576
// float4 cameraWorldFrustumPlanes[6];// Offset: 592
// float cameraNearPlane; // Offset: 688
// float cameraFarPlane; // Offset: 692
// float2 notUsed2; // Offset: 696
// uint2 screenSize; // Offset: 704
// float2 rcpScreenSize; // Offset: 712
// uint2 screenHalfSize; // Offset: 720
// float2 rcpScreenHalfSize; // Offset: 728
// uint2 screenQuarterSize; // Offset: 736
// float2 rcpScreenQuarterSize; // Offset: 744
// float4 sunWorldSpaceDir; // Offset: 752
// float4 sunLightColor; // Offset: 768
// float4 notUsed3[15]; // Offset: 784
//
// } g_AppData; // Offset: 0 Size: 1024
//
// }
//
// Resource bind info for g_ShadingRectangleMinPointBuffer
// {
//
// uint2 $Element; // Offset: 0 Size: 8
//
// }
//
// Resource bind info for g_ShadingRectangleMaxPointBuffer
// {
//
// uint2 $Element; // Offset: 0 Size: 8
//
// }
//
//
// Resource Bindings:
//
// Name Type Format Dim HLSL Bind Count
// ------------------------------ ---------- ------- ----------- -------------- ------
// g_ShadingRectangleMinPointBuffer UAV struct r/w u0 1
// g_ShadingRectangleMaxPointBuffer UAV struct r/w u1 1
// AppDataBuffer cbuffer NA NA cb0 1
//
//
//
// Input signature:
//
// Name Index Mask Register SysValue Format Used
// -------------------- ----- ------ -------- -------- ------- ------
// no Input
//
// Output signature:
//
// Name Index Mask Register SysValue Format Used
// -------------------- ----- ------ -------- -------- ------- ------
// no Output
0x00000000: cs_5_0
0x00000008: dcl_globalFlags refactoringAllowed | skipOptimization
0x0000000C: dcl_constantbuffer CB0[45], immediateIndexed
0x0000001C: dcl_uav_structured u0, 8
0x0000002C: dcl_uav_structured u1, 8
0x0000003C: dcl_input vThreadIDInGroupFlattened
0x00000044: dcl_input vThreadID.xy
0x0000004C: dcl_temps 2
0x00000054: dcl_tgsm_structured g0, 8, 1
0x00000068: dcl_tgsm_structured g1, 8, 1
0x0000007C: dcl_thread_group 16, 16, 1
//
// Initial variable locations:
// vThreadID.x <- globalThreadId.x; vThreadID.y <- globalThreadId.y; vThreadID.z <- globalThreadId.z;
// vThreadIDInGroupFlattened.x <- localThreadIndex
//
#line 22 "D:\GitHub\RenderSDK\Samples\Bin\DynamicGI\Shaders\CalcShadingRectanglesCS.hlsl"
0 0x0000008C: mov r0.x, vThreadIDInGroupFlattened.x
1 0x0000009C: mov r0.y, r0.x
2 0x000000B0: loop
3 0x000000B4: mov r0.z, l(1)
4 0x000000C8: ult r0.z, r0.y, r0.z
5 0x000000E4: breakc_z r0.z
#line 24
6 0x000000F0: store_structured g0.x, l(0), l(0), l(-1)
7 0x00000114: store_structured g0.x, l(0), l(4), l(-1)
#line 25
8 0x00000138: mov r0.zw, l(0,0,0,0)
9 0x00000158: store_structured g1.x, l(0), l(0), r0.z
10 0x0000017C: store_structured g1.x, l(0), l(4), r0.w
#line 26
11 0x000001A0: mov r0.z, l(256)
12 0x000001B4: iadd r0.y, r0.z, r0.y
13 0x000001D0: endloop
#line 27
14 0x000001D4: sync_g_t
#line 29
15 0x000001D8: ult r0.x, vThreadID.x, cb0[44].x
16 0x000001F4: ult r0.y, vThreadID.y, cb0[44].y
17 0x00000210: and r0.x, r0.y, r0.x
18 0x0000022C: if_nz r0.x
#line 34
19 0x00000238: atomic_umin g0, l(0, 0, 0, 0), vThreadID.x
#line 35
20 0x0000025C: atomic_umin g0, l(0, 4, 0, 0), vThreadID.y
#line 37
21 0x00000280: atomic_umax g1, l(0, 0, 0, 0), vThreadID.x
#line 38
22 0x000002A4: atomic_umax g1, l(0, 4, 0, 0), vThreadID.y
#line 39
23 0x000002C8: endif
#line 40
24 0x000002CC: sync_g_t
#line 42
25 0x000002D0: mov r0.x, vThreadIDInGroupFlattened.x // r0.x <- index
26 0x000002E0: mov r1.x, r0.x // r1.x <- index
27 0x000002F4: loop
28 0x000002F8: mov r0.y, l(1)
29 0x0000030C: ult r0.y, r1.x, r0.y
30 0x00000328: breakc_z r0.y
#line 44
31 0x00000334: ld_structured r0.y, l(0), l(0), g0.xxxx
32 0x00000358: mov r1.y, l(0)
33 0x0000036C: atomic_umin u0, r1.xyxx, r0.y
#line 45
34 0x00000388: ld_structured r0.y, l(0), l(4), g0.xxxx
35 0x000003AC: mov r1.z, l(4)
36 0x000003C0: atomic_umin u0, r1.xzxx, r0.y
#line 47
37 0x000003DC: ld_structured r0.y, l(0), l(0), g1.xxxx
38 0x00000400: atomic_umax u1, r1.xyxx, r0.y
#line 48
39 0x0000041C: ld_structured r0.y, l(0), l(4), g1.xxxx
40 0x00000440: atomic_umax u1, r1.xzxx, r0.y
#line 49
41 0x0000045C: mov r0.y, l(256)
42 0x00000470: iadd r1.x, r0.y, r1.x
43 0x0000048C: endloop
#line 50
44 0x00000490: ret
// Approximately 45 instruction slots used
Looks like compiler optimizes them away but I do not understand why. Any ideas? :-)
Thanks,