Hi,
I have a stream compaction algorithm based on a few compute shaders using a prefix sum. First, a number of scan passes, then a number of add passes and then the compaction.
It seems that something goes wrong during the add passes, like the buffer barriers aren't working between them… Am I missing something? Do I need a different type of barrier?
Scan Shader:
#version 450
#extension GL_KHR_shader_subgroup_arithmetic : enable
layout(std430, set = 0, binding = 0) buffer Input
{
uvec4 dataInput[];
};
layout(std430, set = 0, binding = 1) buffer Output
{
uvec4 dataOutput[];
};
layout (local_size_x = 256) in;
const int sumSubGroupSize = 64;
shared uint sdata[sumSubGroupSize];
void main()
{
uint numInstances = dataInput[0].z;
uint sum = 0;
if (gl_GlobalInvocationID.x < numInstances)
{
sum = dataInput[gl_GlobalInvocationID.x].x;
}
sum = subgroupInclusiveAdd(sum);
if (gl_SubgroupInvocationID == gl_SubgroupSize - 1)
{
sdata[gl_SubgroupID] = sum;
}
memoryBarrierShared();
barrier();
if (gl_SubgroupID == 0)
{
uint warpSum = gl_SubgroupInvocationID < gl_NumSubgroups ? sdata[gl_SubgroupInvocationID] : 0;
warpSum = subgroupInclusiveAdd(warpSum);
sdata[gl_SubgroupInvocationID] = warpSum;
}
memoryBarrierShared();
barrier();
uint blockSum = 0;
if (gl_SubgroupID > 0)
{
blockSum = sdata[gl_SubgroupID - 1];
}
sum += blockSum;
if (gl_GlobalInvocationID.x < numInstances)
{
dataInput[gl_GlobalInvocationID.x].x = sum;
}
else
{
dataInput[gl_GlobalInvocationID.x].x = 0;
}
if (gl_LocalInvocationID.x == gl_WorkGroupSize.x - 1)
{
dataOutput[gl_WorkGroupID.x].x = sum;
}
if (gl_GlobalInvocationID.x == 0)
{
dataOutput[0].z = (numInstances + 255) / 256;
}
}
Add Shader:
#version 450
#extension GL_ARB_separate_shader_objects : enable
#extension GL_GOOGLE_include_directive : enable
//layout (local_size_x_id = 1) in;
layout (local_size_x = 256) in;
layout(std430, set = 0, binding = 0) buffer Input
{
uvec4 InputData[];
};
layout(std430, set = 0, binding = 1) buffer Output
{
uvec4 OutputData[];
};
shared uint sum;
void main()
{
uint numInstances = OutputData[0].z;
if (gl_WorkGroupID.x > 0 && gl_GlobalInvocationID.x < numInstances)
{
sum = 0;
if (gl_LocalInvocationID.x == 0)
{
sum = InputData[gl_WorkGroupID.x - 1].x;
}
memoryBarrierShared();
barrier();
OutputData[gl_GlobalInvocationID.x].x += sum;
}
}
C++ Side:
{
renderCmd->CmdBeginDebugMarker("Stream Compaction");
{
renderCmd->CmdBeginDebugMarker("Scan");
{
Renderer::BufferBarrier bbarriers[] = {
{ mInstanceVisibilityBuffer, Renderer::ResourceState::UNORDERED_ACCESS },
{ mInstanceVisibilityPartialSumsBuffer[0], Renderer::ResourceState::UNORDERED_ACCESS },
};
renderCmd->CmdResourceBarrier(2, bbarriers, 0, nullptr);
renderCmd->CmdBindPipeline(mVisibilityScanPipeline);
renderCmd->CmdBindDescriptorSet(0, mVisibilityScanDescriptorSetTexture);
//renderCmd->CmdBindDescriptorSet(frameIndex, mVisibilityScanDescriptorSetUniforms);
const uint32_t* threadGroupSizes = mVisibilityScanShader->mReflection.mStageReflections[0].mNumThreadsPerGroup;
uint32_t totalInstance = mInstancedMeshData.mTotalNumInstances;
renderCmd->CmdDispatch((totalInstance + threadGroupSizes[0] - 1) / threadGroupSizes[0], 1, 1);
for (uint32_t pass = 1; pass < mInstanceVisibilityPartialSumsBuffer.size(); pass++)
{
Renderer::BufferBarrier bbarriers[] = {
{ mInstanceVisibilityPartialSumsBuffer[pass - 1], Renderer::ResourceState::UNORDERED_ACCESS },
{ mInstanceVisibilityPartialSumsBuffer[pass], Renderer::ResourceState::UNORDERED_ACCESS },
};
renderCmd->CmdResourceBarrier(2, bbarriers, 0, nullptr);
renderCmd->CmdBindPipeline(mVisibilityScanPipeline);
renderCmd->CmdBindDescriptorSet(pass, mVisibilityScanDescriptorSetTexture);
//renderCmd->CmdBindDescriptorSet(frameIndex, mVisibilityScanDescriptorSetUniforms);
const uint32_t* threadGroupSizes = mVisibilityScanShader->mReflection.mStageReflections[0].mNumThreadsPerGroup;
totalInstance = (totalInstance + 255) / 256;
renderCmd->CmdDispatch((totalInstance + threadGroupSizes[0] - 1) / threadGroupSizes[0], 1, 1);
}
}
renderCmd->CmdEndDebugMarker();
renderCmd->CmdBeginDebugMarker("Add");
{
for (uint32_t pass = (uint32_t)mInstanceVisibilityPartialSumsBuffer.size() - 1; pass > 0 ; pass--)
{
Renderer::BufferBarrier bbarriers[] = {
{ mInstanceVisibilityPartialSumsBuffer[pass - 1], Renderer::ResourceState::UNORDERED_ACCESS },
{ mInstanceVisibilityPartialSumsBuffer[pass], Renderer::ResourceState::UNORDERED_ACCESS },
};
renderCmd->CmdResourceBarrier(2, bbarriers, 0, nullptr);
renderCmd->CmdBindPipeline(mVisibilityAddPipeline);
renderCmd->CmdBindDescriptorSet(pass, mVisibilityAddDescriptorSetTexture);
//renderCmd->CmdBindDescriptorSet(frameIndex, mVisibilityAddDescriptorSetUniforms);
const uint32_t* threadGroupSizes = mVisibilityAddShader->mReflection.mStageReflections[0].mNumThreadsPerGroup;
uint32_t divide = (256 << (8 << (pass - 2)));
uint32_t totalInstance = (mInstancedMeshData.mTotalNumInstances + divide - 1) / divide;
renderCmd->CmdDispatch((totalInstance + threadGroupSizes[0] - 1) / threadGroupSizes[0], 1, 1);
}
Renderer::BufferBarrier bbarriers[] = {
{ mInstanceVisibilityBuffer, Renderer::ResourceState::UNORDERED_ACCESS },
{ mInstanceVisibilityPartialSumsBuffer[0], Renderer::ResourceState::UNORDERED_ACCESS },
};
renderCmd->CmdResourceBarrier(2, bbarriers, 0, nullptr);
renderCmd->CmdBindPipeline(mVisibilityAddPipeline);
renderCmd->CmdBindDescriptorSet(0, mVisibilityAddDescriptorSetTexture);
//renderCmd->CmdBindDescriptorSet(frameIndex, mVisibilityAddDescriptorSetUniforms);
const uint32_t* threadGroupSizes = mVisibilityAddShader->mReflection.mStageReflections[0].mNumThreadsPerGroup;
uint32_t totalInstance = mInstancedMeshData.mTotalNumInstances;
renderCmd->CmdDispatch((totalInstance + threadGroupSizes[0] - 1) / threadGroupSizes[0], 1, 1);
}
renderCmd->CmdEndDebugMarker();
renderCmd->CmdBeginDebugMarker("Compact");
{
Renderer::BufferBarrier bbarriers[] = {
{ mInstanceVisibilityBuffer, Renderer::ResourceState::UNORDERED_ACCESS },
{ mDrawInstancesIndirectCommandBuffer, Renderer::ResourceState::UNORDERED_ACCESS },
{ mInstanceDataBuffer, Renderer::ResourceState::UNORDERED_ACCESS },
};
renderCmd->CmdResourceBarrier(3, bbarriers, 0, nullptr);
renderCmd->CmdBindPipeline(mVisibilityCompactPipeline);
renderCmd->CmdBindDescriptorSet(0, mVisibilityCompactDescriptorSetTexture);
renderCmd->CmdBindDescriptorSet(frameIndex, mVisibilityCompactDescriptorSetUniforms);
const uint32_t* threadGroupSizes = mVisibilityCompactShader->mReflection.mStageReflections[0].mNumThreadsPerGroup;
uint32_t totalInstance = mInstancedMeshData.mTotalNumInstances;
renderCmd->CmdDispatch((totalInstance + threadGroupSizes[0] - 1) / threadGroupSizes[0], 1, 1);
}
renderCmd->CmdEndDebugMarker();
}
renderCmd->CmdEndDebugMarker();
}
The number of threads is stored and written out in the [0].z component of some of the buffers for the next pass to use.