12 minutes ago, Ryan_001 said:
It's good to know that theory and practice align, at least for this :) Nice work. I'm curious, what sort of barrier parameters are you using?
BufferMemoryBarriers, here's code.
I left the comments in to illustrate how the poor and vague specs leave us with trial and error - or would you have guessed that you need to set VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT for an indirect compute dispatch? ![:) :)](https://uploads.gamedev.net/emoticons/medium.smile.webp)
(Of course I could remove it here, as I'm only writing some prefix sum results and no dispatch count, but offset and size become interesting now...)
// Records one pipeline barrier covering `numBarriers` buffers so that writes made by
// earlier compute shader invocations are visible to later compute shader reads and to
// the indirect-argument fetch of later indirect dispatches.
//
// commandBuffer : command buffer the barrier is recorded into.
// bufferList    : array of `numBarriers` indices into the file-level `buffers` array.
// numBarriers   : number of entries in `bufferList`; must be in [0, 16].
void MemoryBarriers (VkCommandBuffer commandBuffer, int *bufferList, const int numBarriers)
{
	int const maxBarriers = 16;
	assert (numBarriers >= 0 && numBarriers <= maxBarriers);

	VkBufferMemoryBarrier bufferMemoryBarriers[maxBarriers] = {};
	//VkMemoryBarrier memoryBarriers[maxBarriers] = {};

	for (int i=0; i<numBarriers; i++)
	{
		// Each barrier must be addressed by index; without `[i]` this does not compile.
		VkBufferMemoryBarrier &barrier = bufferMemoryBarriers[i];

		barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;

		// Write -> read dependency. Alternatives that were tried during experimentation:
		//   read -> write:
		//     srcAccessMask = VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_INDIRECT_COMMAND_READ_BIT;
		//     dstAccessMask = VK_ACCESS_MEMORY_WRITE_BIT | VK_ACCESS_SHADER_WRITE_BIT;
		//   everything -> everything:
		//     srcAccessMask = dstAccessMask = VK_ACCESS_MEMORY_WRITE_BIT | VK_ACCESS_SHADER_WRITE_BIT | VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_INDIRECT_COMMAND_READ_BIT;
		barrier.srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT | VK_ACCESS_SHADER_WRITE_BIT;
		barrier.dstAccessMask = VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_INDIRECT_COMMAND_READ_BIT;

		// No queue family ownership transfer.
		barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
		barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;

		// The barrier always covers the whole buffer.
		barrier.buffer = buffers[bufferList[i]].deviceBuffer;
		barrier.offset = 0;
		barrier.size = VK_WHOLE_SIZE;

		//memoryBarriers[i].sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER;
		//memoryBarriers[i].srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT;// | VK_ACCESS_SHADER_WRITE_BIT;
		//memoryBarriers[i].dstAccessMask = VK_ACCESS_MEMORY_READ_BIT;// | VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_INDIRECT_COMMAND_READ_BIT;
	}

	// VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT is included in the destination stages because
	// the barrier's destination access mask contains VK_ACCESS_INDIRECT_COMMAND_READ_BIT,
	// which is the access used to fetch indirect dispatch arguments.
	vkCmdPipelineBarrier(
		commandBuffer,
		VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
		VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT | VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT,
		0,//VkDependencyFlags
		0, nullptr,//numBarriers, memoryBarriers,//
		numBarriers, bufferMemoryBarriers,
		0, nullptr);
}
// Records the selected test tasks into `commandBuffer`, from vkBeginCommandBuffer to
// vkEndCommandBuffer.
//
// commandBuffer   : command buffer to record into.
// taskFlags       : bit mask; bit tTEST0 / tTEST1 / tTEST2 enables the matching task.
// profilerStartID : when >= 0 (and USE_GPU_PROFILER is defined) a profiler range with this
//                   id is started right after recording begins.
// profilerStopID  : when >= 0 (and USE_GPU_PROFILER is defined) a profiler range with this
//                   id is stopped right before recording ends.
// profilePerTask  : when true (and PROFILE_TASKS is defined) each task is wrapped in its
//                   own profiler range.
// use_barriers    : when true, a buffer memory barrier is recorded after every dispatch.
//
// Each task binds its descriptor set and pipeline, then records TASK_COUNT_n indirect
// dispatches whose arguments are read from consecutive VkDispatchIndirectCommand slots
// of buffers[bDISPATCH], starting at slot 0, 200 and 400 respectively.
void Record (VkCommandBuffer commandBuffer, const uint32_t taskFlags,
	int profilerStartID, int profilerStopID, bool profilePerTask = true, bool use_barriers = true)
{
	VkCommandBufferBeginInfo beginInfo = {};
	beginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
	beginInfo.flags = 0;//VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
	vkBeginCommandBuffer(commandBuffer, &beginInfo);

#ifdef USE_GPU_PROFILER
	if (profilerStartID>=0)
	{
		profiler.Start (profilerStartID, commandBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT);
	}
#endif

	// ---- task TEST0: dispatch slots [0, TASK_COUNT_0) ----
	if ((taskFlags & (1<<tTEST0)) != 0)
	{
		vkCmdBindDescriptorSets(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipelineLayouts[tTEST0], 0, 1, &descriptorSets[tTEST0], 0, nullptr);
		vkCmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipelines[taskToPipeline[tTEST0]]);

#ifdef PROFILE_TASKS
		if (profilePerTask)
		{
			profiler.Start (TS_TEST0, commandBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT);
		}
#endif

		const int firstSlot = 0;
		int syncedBuffers[] = {bTEST0};
		for (int dispatch=0; dispatch<TASK_COUNT_0; ++dispatch)
		{
			vkCmdDispatchIndirect(commandBuffer, buffers[bDISPATCH].deviceBuffer, sizeof(VkDispatchIndirectCommand) * (firstSlot + dispatch) );
			if (use_barriers)
			{
				MemoryBarriers (commandBuffer, syncedBuffers, 1);
			}
		}

#ifdef PROFILE_TASKS
		if (profilePerTask)
		{
			profiler.Stop (TS_TEST0, commandBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT);
		}
#endif
	}

	// ---- task TEST1: dispatch slots [200, 200 + TASK_COUNT_1) ----
	if ((taskFlags & (1<<tTEST1)) != 0)
	{
		vkCmdBindDescriptorSets(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipelineLayouts[tTEST1], 0, 1, &descriptorSets[tTEST1], 0, nullptr);
		vkCmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipelines[taskToPipeline[tTEST1]]);

#ifdef PROFILE_TASKS
		if (profilePerTask)
		{
			profiler.Start (TS_TEST1, commandBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT);
		}
#endif

		const int firstSlot = 200;
		int syncedBuffers[] = {bTEST1};
		for (int dispatch=0; dispatch<TASK_COUNT_1; ++dispatch)
		{
			vkCmdDispatchIndirect(commandBuffer, buffers[bDISPATCH].deviceBuffer, sizeof(VkDispatchIndirectCommand) * (firstSlot + dispatch) );
			if (use_barriers)
			{
				MemoryBarriers (commandBuffer, syncedBuffers, 1);
			}
		}

#ifdef PROFILE_TASKS
		if (profilePerTask)
		{
			profiler.Stop (TS_TEST1, commandBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT);
		}
#endif
	}

	// ---- task TEST2: dispatch slots [400, 400 + TASK_COUNT_2) ----
	if ((taskFlags & (1<<tTEST2)) != 0)
	{
		vkCmdBindDescriptorSets(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipelineLayouts[tTEST2], 0, 1, &descriptorSets[tTEST2], 0, nullptr);
		vkCmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipelines[taskToPipeline[tTEST2]]);

#ifdef PROFILE_TASKS
		if (profilePerTask)
		{
			profiler.Start (TS_TEST2, commandBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT);
		}
#endif

		const int firstSlot = 400;
		int syncedBuffers[] = {bTEST2};
		for (int dispatch=0; dispatch<TASK_COUNT_2; ++dispatch)
		{
			vkCmdDispatchIndirect(commandBuffer, buffers[bDISPATCH].deviceBuffer, sizeof(VkDispatchIndirectCommand) * (firstSlot + dispatch) );
			if (use_barriers)
			{
				MemoryBarriers (commandBuffer, syncedBuffers, 1);
			}
		}

#ifdef PROFILE_TASKS
		if (profilePerTask)
		{
			profiler.Stop (TS_TEST2, commandBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT);
		}
#endif
	}

#ifdef USE_GPU_PROFILER
	if (profilerStopID>=0)
	{
		profiler.Stop (profilerStopID, commandBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT);
	}
#endif

	vkEndCommandBuffer(commandBuffer);
}