Hey guys,

In my application I use a compute shader to process data quickly. I dispatch the compute shader once for each instance of my model. So, for example, if I have 30 instances, I dispatch the compute shader 30 times.

// Issues one DispatchCompute call per model instance; every call passes the
// full meshlet count and the same selectedMeshlet container.
// NOTE(review): the remaining argument(s) of the call are cut off in this excerpt.
for(int i = 0; i < engineModLoader.instanceNumber; i++)
   engineRenderer.DispatchCompute(phoenixMesh.totalMeshlets.size(), selectedMeshlet,

I use the result of the compute shader to fill a global index buffer that is used to draw the instances. So all dispatched compute shaders have to finish before the DrawFrame() call, which renders the instances. Unfortunately, the result returned by the compute shader is wrong. I don't know whether it is a synchronization problem or whether I am missing something else. DispatchCompute() is the following:

/// Records and submits one compute command buffer, blocks until its fence
/// signals, then reads the current slot's SSBO back through a temporary
/// staging buffer.
///
/// @param numberOfElements  Forwarded to RecordComputeBuffer and used to size
///                          the read-back (numberOfElements * sizeof(Phoenix::DataToCompute)).
/// @param selectedMeshlet   Not referenced in the visible body; presumably the
///                          destination of the memcpy below, whose first
///                          argument is missing from this excerpt — TODO confirm.
/// @param instancePos       Stored into engineTransform.ubo.instancePos before recording.
/// @throws std::runtime_error if vkQueueSubmit does not return VK_SUCCESS.
void Renderer::DispatchCompute(int numberOfElements, std::vector<Phoenix::DataToCompute>& selectedMeshlet, 
    const glm::vec3& instancePos)
        VkSubmitInfo computeSubmitInfo{};
        computeSubmitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
        // NOTE(review): this only writes the CPU-side struct; no upload of the UBO
        // to a GPU buffer is visible in this excerpt — confirm it happens before the submit.
        engineTransform.ubo.instancePos = instancePos;
        // Reset this slot's fence and command buffer so they can be reused for a fresh recording.
        vkResetFences(engineDevice.logicalDevice, 1, &computeInFlightFences[currentComputeFrame]);
        vkResetCommandBuffer(computeCommandBuffers[currentComputeFrame], 0);

        RecordComputeBuffer(numberOfElements, computeCommandBuffers[currentComputeFrame]);

        computeSubmitInfo.commandBufferCount = 1;
        computeSubmitInfo.pCommandBuffers = &computeCommandBuffers[currentComputeFrame];

        // The fence passed here is the one waited on immediately below.
        if (vkQueueSubmit(engineDevice.computeQueue, 1, &computeSubmitInfo, computeInFlightFences[currentComputeFrame]) 
        != VK_SUCCESS) 
            throw std::runtime_error("failed to submit compute command buffer!");
        // Blocks the calling thread (UINT64_MAX timeout) until the submission's fence signals.
        // NOTE(review): the VkResult of this wait is ignored.
        vkWaitForFences(engineDevice.logicalDevice, 1, &computeInFlightFences[currentComputeFrame], VK_TRUE, UINT64_MAX);

        // Size of the read-back: one Phoenix::DataToCompute per element.
        VkDeviceSize bufferSize = sizeof(Phoenix::DataToCompute) * numberOfElements;

        VkBuffer stagingBuffer;
        VkDeviceMemory stagingBufferMemory;
        // NOTE(review): only a usage flag is passed here; the memory properties of the
        // staging buffer are not visible — it must be host-visible for vkMapMemory below.
        CreateBuffer(bufferSize, VK_BUFFER_USAGE_TRANSFER_DST_BIT,
        stagingBuffer, stagingBufferMemory);
        // Presumably CopyBuffer(src, dst, size): copies the current slot's SSBO into the staging buffer.
        CopyBuffer(SSBOBuffers[currentComputeFrame], stagingBuffer, bufferSize);

        void* bufferData = nullptr;
        // NOTE(review): the VkResult of vkMapMemory is ignored.
        vkMapMemory(engineDevice.logicalDevice, stagingBufferMemory, 0, bufferSize, 0, &bufferData);
        // NOTE(review): the destination argument is missing in this excerpt
        // (presumably selectedMeshlet.data()) — TODO confirm.
        memcpy(, bufferData, bufferSize);
        vkUnmapMemory(engineDevice.logicalDevice, stagingBufferMemory);

        // Advance to the next slot: the following call uses a different command
        // buffer, fence and SSBO than this one.
        currentComputeFrame = (currentComputeFrame + 1) % MAX_FRAMES_IN_FLIGHT;

        // Release the temporary staging resources created above.
        vkDestroyBuffer(engineDevice.logicalDevice, stagingBuffer, nullptr);
        vkFreeMemory(engineDevice.logicalDevice, stagingBufferMemory, nullptr);
    /// Records the compute pass into commandBuffer: binds the compute pipeline
    /// and the descriptor set of the current slot, then issues one vkCmdDispatch.
    ///
    /// @param numberOfElements  Element count; divided by 32 to obtain the workgroup count in x.
    /// @param commandBuffer     Command buffer to record into.
    /// @throws std::runtime_error if vkBeginCommandBuffer or vkEndCommandBuffer
    ///         does not return VK_SUCCESS.
    void Renderer::RecordComputeBuffer(int numberOfElements, VkCommandBuffer commandBuffer)
        // NOTE(review): beginInfo.sType is not set in this excerpt — confirm it is
        // assigned VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO in the real code.
        VkCommandBufferBeginInfo beginInfo{};

        if (vkBeginCommandBuffer(commandBuffer, &beginInfo) != VK_SUCCESS) 
            throw std::runtime_error("failed to begin recording command buffer!");

        // NOTE(review): ssboSize is not used anywhere in the visible body.
        VkDeviceSize ssboSize = sizeof(Phoenix::DataToCompute) * numberOfElements;

        vkCmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, enginePipeline.computePipeline);
        // Binds the descriptor set that belongs to the current slot (set index 0, no dynamic offsets).
        vkCmdBindDescriptorSets(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, enginePipeline.computePipelineLayout, 0, 1, 
        &descriptorSets[currentComputeFrame], 0, 0);

        // NOTE(review): integer division truncates. With the shader's local_size_x = 32,
        // a count that is not a multiple of 32 leaves the trailing elements without an
        // invocation (e.g. 325 / 32 == 10 groups == 320 invocations, so elements
        // 320..324 are never processed). Rounding up is only safe if the shader's
        // bounds check matches the real element count.
        vkCmdDispatch(commandBuffer, numberOfElements / 32, 1, 1);

        if (vkEndCommandBuffer(commandBuffer) != VK_SUCCESS) 
            throw std::runtime_error("failed to record command buffer!");

As you can see, I use vkWaitForFences to wait for the submitted commands to finish. The compute shader is the following:

#version 450

// Element type of the dataIn[] storage buffer; main() reads and writes one
// record per invocation.
// NOTE(review): the buffer block that holds this struct is declared std140,
// where a vec3 member is aligned to 16 bytes and the array stride is rounded
// up to a multiple of 16. The host-side Phoenix::DataToCompute must use the
// same offsets — its layout is not visible here, confirm.
struct DataToCompute
   int meshletID;
   float error;              // passed to ComputeScreenSpaceError as groupError
   float parentError;        // same, for the parent evaluation in main()
   vec3 boundCenter;         // passed to ComputeScreenSpaceError as centerBound
   vec3 parentBoundCenter;   // same, for the parent evaluation in main()
   float errorThreshold;     // main() compares the computed errors against this
   bool selected;            // output flag: main() only ever assigns true to it
   int width;                // presumably the viewport width in pixels — TODO confirm
   float hFov;               // fed directly to tan(), so presumably radians — TODO confirm
   int lod; 

// Uniform block at binding 0. main() multiplies view * model to build the
// model-view matrix and forwards instancePos to ComputeScreenSpaceError;
// proj is not referenced by the shader code visible in this excerpt.
layout (binding = 0) uniform ParameterUBO {
    mat4 model;
    mat4 view;
    mat4 proj;
    vec3 instancePos;
} ubo;

// Storage buffer at binding 3 holding the DataToCompute array that main()
// indexes with gl_GlobalInvocationID.x.
// NOTE(review): std140 on a storage block is legal but pads vec3 members to
// 16 bytes; confirm the host struct matches (std430 is the usual alternative).
layout(std140, binding = 3) buffer MeshletDataSSBO {
   DataToCompute dataIn[ ];

// 32 invocations per workgroup along x.
layout (local_size_x = 32, local_size_y = 1, local_size_z = 1) in;

// main() compares each record's lod against MAX_LOD_NUMBER - 1.
const int MAX_LOD_NUMBER = 5;

// Returns (width / 2 * cot(hFov / 2) * r) / sqrt(d2 - r*r), where
//   d2 = squared length of (centerBound + instancePos) transformed by modelView,
//   r  = length of modelView applied to the direction (groupError, 0, 0).
// Note that instancePos is added before the model-view transform is applied.
// NOTE(review): hFov is presumably in radians since it goes straight into tan() — confirm.
float ComputeScreenSpaceError(vec3 centerBound, float groupError, int width, 
float hFov, vec3 instancePos, mat4 modelView)
   centerBound += instancePos;
   vec4 viewCenter = vec4(centerBound, 1.0f);
   //I transform the center in view-space
   viewCenter = modelView * viewCenter;
   centerBound.x = viewCenter.x;
   centerBound.y = viewCenter.y;
   centerBound.z = viewCenter.z;
   // w = 0 makes this a direction, so only the linear part of modelView scales groupError.
   float radius = length(vec3(modelView * vec4(groupError, 0, 0, 0 )));

   const float cotHalfFov = 1.0f / tan(hFov / 2.0f);
   const float d2 = dot(centerBound, centerBound);
   const float r = radius;
   // NOTE(review): the argument is negative when r*r > d2 and zero when they are
   // equal; GLSL leaves sqrt of a negative value undefined and the division
   // below then has no meaningful result.
   const float div = sqrt(d2 - r*r);
   float screenSpaceError = (width / 2.0f * cotHalfFov * r) / div;
   return screenSpaceError;

// Entry point: each invocation evaluates the dataIn record at its global x
// index and may set that record's selected flag to true.
// NOTE(review): selected is never written false in the visible code, so a flag
// set by an earlier dispatch on the same buffer stays set unless the host clears it.
void main() 
   uint index = gl_GlobalInvocationID.x;

   // NOTE(review): the statement guarded by this bounds check is missing from
   // this excerpt; the reply quoted later in the thread shows it as "return;".
   // The limit 325 is hard-coded rather than derived from the buffer length.
   if(index >= 325)
   mat4 modelView = ubo.view * ubo.model;
   float currentError = ComputeScreenSpaceError(dataIn[index].boundCenter, dataIn[index].error, dataIn[index].width,
   dataIn[index].hFov, ubo.instancePos, modelView);

   // NOTE(review): braces (and presumably an else before the parentError
   // computation) were lost in this excerpt, so the exact nesting of the
   // branches below cannot be determined from here.
   if(dataIn[index].lod >= MAX_LOD_NUMBER - 1)
      if(currentError <= dataIn[index].errorThreshold)
         dataIn[index].selected = true;
      float parentError = ComputeScreenSpaceError(dataIn[index].parentBoundCenter, dataIn[index].parentError,
      dataIn[index].width, dataIn[index].hFov, ubo.instancePos, modelView);
      // Selects the record when its own error is within the threshold but its parent's is not.
      if(currentError <= dataIn[index].errorThreshold && parentError > dataIn[index].errorThreshold)
         dataIn[index].selected = true;


Where am I going wrong?

giuseppe7 said:
I dispatch a Compute Shader for each instance of my model. So for example, I have 30 instances, I dispatch a Compute Shader 30 times.

It's not a mistake, but I think your approach is very inefficient.
It looks like you launch only one workgroup, then wait until it is done, then launch the next?
You should launch them all at the same time with only one dispatch.
Or are there hierarchy dependencies? Even then, you would have large workloads after you have processed a few top levels.

Currently i assume the GPU is underutilized.

I have a lot of such per-hierarchy-level workloads. For synchronization I only use memory barriers on the GPU, ensuring one level is processed before the next starts work. I make a command buffer with one indirect dispatch per level, then upload the command buffer just once and run it every frame. The goal is to calculate, e.g., a LOD cut of a surfel hierarchy per frame, which feels very similar to what Nanite does. There is no need for GPU / CPU synchronization. Parent nodes generate work for the child level using indirect dispatch. Processing a tree with approx. 20 levels and 200k surfels takes some nanoseconds iirc, certainly much less than a millisecond on PS4 class HW.
(I found it interesting that in my case the ‘lazy update’ optimization was no win, meaning to take the previous lodcut and calculating per node if lod should go up or down. This is a win only for binary trees, but with a larger branching factor like 8 (but 4 on average) in my case it was just as fast to traverse the whole tree.)

That said, i would not spend too much time on finding the bug. You may want to do the whole thing just differently at some point anyway.

However, i see one fishy thing:

giuseppe7 said:
uint index = gl_GlobalInvocationID.x; if(index >= 325) return;

giuseppe7 said:
layout (local_size_x = 32, local_size_y = 1, local_size_z = 1) in;

Your workgroup is only 32 threads wide, so the local thread index won't go up to 325.
Maybe that's related to the bug.

