Advertisement

Writes from compute shader not visible

Started by April 25, 2018 03:06 PM
4 comments, last by turanszkij 6 years, 9 months ago

Hi, I am having problems with all of my compute shaders in Vulkan. They are not writing to resources, even though there are no problems reported by the debug layer, every descriptor seems to be correctly bound in the graphics debugger, and the shaders definitely take time to execute. I understand that this is probably a bug in my implementation, which is a bit complex because it tries to emulate a DX11-style rendering API, but maybe I'm missing something trivial in my logic here? Currently I am doing the following:

  • Set descriptors, such as VK_DESCRIPTOR_TYPE_STORAGE_BUFFER for a read-write structured buffer (which is a non-formatted buffer)
  • Bind descriptor table / validate correctness by debug layer
  • Dispatch on graphics/compute queue, the same one that is feeding graphics rendering commands. 
  • Insert memory barrier with both stagemasks as VK_PIPELINE_STAGE_ALL_COMMANDS_BIT and srcAccessMask VK_ACCESS_SHADER_WRITE_BIT to dstAccessMask VK_ACCESS_SHADER_READ_BIT
  • Also insert buffer memory barrier just for the storage buffer I wanted to write

My application behaves as if the buffers are empty, and the Nsight debugger also shows empty buffers (it seems like everything is initialized to 0). I also tried the most trivial shader, writing a value of 1 to the first element of a uint buffer. Am I missing something trivial here? What could be another way to debug this further?

 

8 minutes ago, turanszkij said:

Insert memory barrier

Maybe you need to use buffer memory barriers instead. I was confused about this (the confusion is still visible in the comments :) )

 


		// Recorded only when the tREBUILD_RESTORE bit is set in taskFlags.
		if (taskFlags & (1<<tREBUILD_RESTORE))
		{
			// Bind this task's descriptor set (first set 0, one set, no dynamic offsets)
			// and its compute pipeline.
			vkCmdBindDescriptorSets(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipelineLayouts[tREBUILD_RESTORE], 0, 1, &descriptorSets[tREBUILD_RESTORE], 0, nullptr);

			vkCmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipelines[taskToPipeline[tREBUILD_RESTORE]]);
	#ifdef PROFILE_TASKS
			profiler.Start (TS_BRESTORE, commandBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT);
	#endif
			// Indirect dispatch: the workgroup counts are read from the bDISPATCH buffer at
			// the byte offset of entry OFFSET_DISPATCH_REBUILD_RESTORE, where each entry is
			// one VkDispatchIndirectCommand.
			vkCmdDispatchIndirect(commandBuffer, buffers[bDISPATCH].deviceBuffer, sizeof(VkDispatchIndirectCommand) * OFFSET_DISPATCH_REBUILD_RESTORE);
	#ifdef PROFILE_TASKS
			profiler.Stop (TS_BRESTORE, commandBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT);
	#endif
			// Issue buffer barriers for the listed buffers (presumably the ones this
			// dispatch writes -- confirm against the shader). The count is derived from
			// the array so it cannot drift out of sync when the list is edited.
			int barrierBuffers[] = {bS_POS, bS_V4, bS_I4, bS_SI, bS_CONE};
			const int numBarrierBuffers = static_cast<int>(sizeof(barrierBuffers) / sizeof(barrierBuffers[0]));
			MemoryBarriers (commandBuffer, barrierBuffers, numBarrierBuffers);
		}



	// Records a single vkCmdPipelineBarrier carrying one whole-buffer
	// VkBufferMemoryBarrier for each of the first numBarriers entries of bufferList.
	// Each entry is used as an index into `buffers`.
	//
	//   source:      compute-shader stage, memory/shader writes
	//   destination: compute-shader + draw-indirect stages,
	//                memory/shader/indirect-command reads
	//
	// No queue family ownership transfer is requested. numBarriers must not exceed
	// maxBarriers (16); this is only checked by an assert.
	//
	// Variants that were tried before and are no longer active: swapped masks
	// (reads as source, writes as destination), read|write masks on both sides,
	// global VkMemoryBarrier entries instead of per-buffer barriers, and
	// VK_PIPELINE_STAGE_ALL_COMMANDS_BIT for both stage masks.
	void MemoryBarriers (VkCommandBuffer commandBuffer, int *bufferList, const int numBarriers)
	{
		int const maxBarriers = 16;
		assert (numBarriers <= maxBarriers);

		// Everything except the buffer handle is identical for all barriers,
		// so fill it in once and copy it into each slot.
		VkBufferMemoryBarrier sharedSettings = {};
		sharedSettings.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
		sharedSettings.srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT | VK_ACCESS_SHADER_WRITE_BIT;
		sharedSettings.dstAccessMask = VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_INDIRECT_COMMAND_READ_BIT;
		sharedSettings.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
		sharedSettings.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
		sharedSettings.offset = 0;
		sharedSettings.size = VK_WHOLE_SIZE;

		// Unused trailing slots stay zero-initialized and are never passed on.
		VkBufferMemoryBarrier bufferMemoryBarriers[maxBarriers] = {};
		for (int slot = 0; slot < numBarriers; ++slot)
		{
			bufferMemoryBarriers[slot] = sharedSettings;
			bufferMemoryBarriers[slot].buffer = buffers[bufferList[slot]].deviceBuffer;
		}

		vkCmdPipelineBarrier(
			commandBuffer,
			VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,                                       // srcStageMask
			VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT | VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT, // dstStageMask
			0,                                                                          // dependencyFlags
			0, nullptr,                                                                 // no global memory barriers
			numBarriers, bufferMemoryBarriers,                                          // buffer memory barriers
			0, nullptr);                                                                // no image memory barriers
	}
		

 

Advertisement
12 minutes ago, JoeJ said:

Maybe you need to use buffer memory barriers instead. I was confused about this (the confusion is still visible in the comments )

 



		// Recorded only when the tREBUILD_RESTORE bit is set in taskFlags.
		if (taskFlags & (1<<tREBUILD_RESTORE))
		{
			// Bind this task's descriptor set (first set 0, one set, no dynamic offsets)
			// and its compute pipeline.
			vkCmdBindDescriptorSets(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipelineLayouts[tREBUILD_RESTORE], 0, 1, &descriptorSets[tREBUILD_RESTORE], 0, nullptr);

			vkCmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipelines[taskToPipeline[tREBUILD_RESTORE]]);
	#ifdef PROFILE_TASKS
			profiler.Start (TS_BRESTORE, commandBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT);
	#endif
			// Indirect dispatch: the workgroup counts are read from the bDISPATCH buffer at
			// the byte offset of entry OFFSET_DISPATCH_REBUILD_RESTORE, where each entry is
			// one VkDispatchIndirectCommand.
			vkCmdDispatchIndirect(commandBuffer, buffers[bDISPATCH].deviceBuffer, sizeof(VkDispatchIndirectCommand) * OFFSET_DISPATCH_REBUILD_RESTORE);
	#ifdef PROFILE_TASKS
			profiler.Stop (TS_BRESTORE, commandBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT);
	#endif
			// Issue buffer barriers for the listed buffers (presumably the ones this
			// dispatch writes -- confirm against the shader). The count is derived from
			// the array so it cannot drift out of sync when the list is edited.
			int barrierBuffers[] = {bS_POS, bS_V4, bS_I4, bS_SI, bS_CONE};
			const int numBarrierBuffers = static_cast<int>(sizeof(barrierBuffers) / sizeof(barrierBuffers[0]));
			MemoryBarriers (commandBuffer, barrierBuffers, numBarrierBuffers);
		}



	// Records a single vkCmdPipelineBarrier carrying one whole-buffer
	// VkBufferMemoryBarrier for each of the first numBarriers entries of bufferList.
	// Each entry is used as an index into `buffers`.
	//
	//   source:      compute-shader stage, memory/shader writes
	//   destination: compute-shader + draw-indirect stages,
	//                memory/shader/indirect-command reads
	//
	// No queue family ownership transfer is requested. numBarriers must not exceed
	// maxBarriers (16); this is only checked by an assert.
	//
	// Variants that were tried before and are no longer active: swapped masks
	// (reads as source, writes as destination), read|write masks on both sides,
	// global VkMemoryBarrier entries instead of per-buffer barriers, and
	// VK_PIPELINE_STAGE_ALL_COMMANDS_BIT for both stage masks.
	void MemoryBarriers (VkCommandBuffer commandBuffer, int *bufferList, const int numBarriers)
	{
		int const maxBarriers = 16;
		assert (numBarriers <= maxBarriers);

		// Everything except the buffer handle is identical for all barriers,
		// so fill it in once and copy it into each slot.
		VkBufferMemoryBarrier sharedSettings = {};
		sharedSettings.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
		sharedSettings.srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT | VK_ACCESS_SHADER_WRITE_BIT;
		sharedSettings.dstAccessMask = VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_INDIRECT_COMMAND_READ_BIT;
		sharedSettings.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
		sharedSettings.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
		sharedSettings.offset = 0;
		sharedSettings.size = VK_WHOLE_SIZE;

		// Unused trailing slots stay zero-initialized and are never passed on.
		VkBufferMemoryBarrier bufferMemoryBarriers[maxBarriers] = {};
		for (int slot = 0; slot < numBarriers; ++slot)
		{
			bufferMemoryBarriers[slot] = sharedSettings;
			bufferMemoryBarriers[slot].buffer = buffers[bufferList[slot]].deviceBuffer;
		}

		vkCmdPipelineBarrier(
			commandBuffer,
			VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,                                       // srcStageMask
			VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT | VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT, // dstStageMask
			0,                                                                          // dependencyFlags
			0, nullptr,                                                                 // no global memory barriers
			numBarriers, bufferMemoryBarriers,                                          // buffer memory barriers
			0, nullptr);                                                                // no image memory barriers
	}
		

 

Thanks, but as I said, I am issuing both memory barriers and buffer memory barriers. Also double checked and compared the code with mine and can't see any big differences with vulkan calls...

23 minutes ago, turanszkij said:

but as I said, I am issuing both memory barriers and buffer memory barriers.

missed this. I don't use plain memory barriers at all - maybe removing them helps. 

I guess you already use GLSL for initial tests, no HLSL?

You could look at this for a messy template of using simple graphics, imgui and compute: https://github.com/JoeJGit/Vulkan-Async-Compute-Test

I guess it throws tons of validation errors meanwhile and contains bugs, but maybe it helps to spot something you've forgotten.

For future reference: This was a weird one. The problem got magically fixed when I reduced the number of constant buffer descriptors bound from 15 to 12, since my GPU (GTX 1050) supports a maximum of only 12. This was strange because:

  • The earlier validation layer I was using didn't complain at all.
  • I am not even using 12 constant buffers, just 2-3 usually. But the descriptor table was filled with "null" descriptors to conform to DX11 style API slot layout.
  • The later validation layer version started to complain, GPU crashes started to occur in random places.

So now my Vulkan layer uses 12 CBV slots, which is not a big deal... :)

Relevant topic: 

 

This topic is closed to new replies.

Advertisement