Advertisement

SSAA and ResolveSubresource

Started by October 29, 2017 11:36 AM
15 comments, last by matt77hias 7 years, 3 months ago

A game can be made of thousands of shaders, so having even a hundred precompiled downsample shaders is not a big deal. You can always have the most typical cases optimised and rely on the generic one for exotic scenarios.

Also, once you start to get a large difference between the src and dst resolutions, that is where you may want a better filter than a simple box average anyway.

1 hour ago, galop1n said:

That way, you can parallelize the tfetch and the tone mapping instructions across more thread groups.

Should one ideally use a group of (2,2,1) threads for MSAA/SSAA x4 or just use something larger (multiple of 32/64)?

🧙

Advertisement

So basically this becomes for MSAA something like this:


#else  // MSAA_X && MSAA_Y

// Per-thread record exchanged between the threads of a group through group
// shared memory.
struct Data {
	// Saturated, tone-mapped radiance of one sub-sample.
	float4 ldr;
	// Normal of one sub-sample, as read from the input normal texture.
	float3 normal;
	// Depth of one sub-sample, as read from the input depth texture.
	float  depth;
};

// One record per thread of the group, indexed by SV_GroupThreadID.xy.
groupshared Data data[GROUP_SIZE][GROUP_SIZE];

// Resolves one MSAA_X x MSAA_Y tile of sub-samples per output texel.
//
// Every thread fetches one sub-sample of the texel it maps to, tone maps the
// radiance and stores radiance, normal and depth in group shared memory. After
// the group-wide barrier, only the thread handling sub-sample 0 of each tile
// continues: it averages the tone-mapped radiance, sums and normalizes the
// normals, reduces the depth with min/max and writes the three results.
//
// thread_id:       dispatch thread id; thread_id.xy / (MSAA_X, MSAA_Y) selects
//                  the texel relative to g_viewport_top_left.
// group_thread_id: thread id within the group; indexes the group shared
//                  memory and selects the sub-sample.
//
// NOTE(review): assumes GROUP_SIZE is a multiple of both MSAA_X and MSAA_Y so
// that a tile never straddles two thread groups - confirm against the
// dispatch code.
[numthreads(GROUP_SIZE, GROUP_SIZE, 1)]
void CS(uint3 thread_id : SV_DispatchThreadID, 
	uint3 group_thread_id : SV_GroupThreadID) {
	
	const uint2 location = g_viewport_top_left 
		                   + thread_id.xy / uint2(MSAA_X, MSAA_Y);
	
	// Position of this thread inside its MSAA_X x MSAA_Y tile.
	const uint2 sample_index2 = group_thread_id.xy % uint2(MSAA_X, MSAA_Y);
	// Row-major linearization, so that every thread of a tile fetches a 
	// distinct sub-sample index in [0, MSAA_X * MSAA_Y). A plain sum of the
	// two coordinates would map different threads onto the same sub-sample.
	const uint  sample_index  = sample_index2.x + sample_index2.y * MSAA_X;

	// Collect and store the data in the group shared memory.
	const float4 hdr 
		= g_input_image_texture.sample[sample_index][location];
	data[group_thread_id.x][group_thread_id.y].ldr
		= saturate(TONE_MAP_COMPONENT(hdr));
	data[group_thread_id.x][group_thread_id.y].normal
		= g_input_normal_texture.sample[sample_index][location];
	data[group_thread_id.x][group_thread_id.y].depth  
		= g_input_depth_texture.sample[sample_index][location];

	// All threads reach the barrier before any of them terminates.
	GroupMemoryBarrierWithGroupSync();
	
	// Only the first thread of each tile performs the resolve.
	if (0 != sample_index) {
		return;
	}

	// Resolve the (multi-sampled) radiance, normal and depth.
	float4 ldr_sum    = 0.0f;
	float3 normal_sum = 0.0f;
	// The depth starts at the identity of the min/max reduction below.
#ifdef DISSABLE_INVERTED_Z_BUFFER
	float depth       = 1.0f;
#else  // DISSABLE_INVERTED_Z_BUFFER
	float depth       = 0.0f;
#endif // DISSABLE_INVERTED_Z_BUFFER
	
	// The tile starts at this thread's own group-local position; the loop
	// counters (not the absolute positions) are bounded by the tile size.
	[unroll]
	for (uint i = 0; i < MSAA_X; ++i) {
		const uint x = group_thread_id.x + i;

		[unroll]
		for (uint j = 0; j < MSAA_Y; ++j) {
			const uint y = group_thread_id.y + j;

			ldr_sum    += data[x][y].ldr;
			normal_sum += data[x][y].normal;

#ifdef DISSABLE_INVERTED_Z_BUFFER
			depth = min(depth, data[x][y].depth);
#else  // DISSABLE_INVERTED_Z_BUFFER
			depth = max(depth, data[x][y].depth);
#endif // DISSABLE_INVERTED_Z_BUFFER
		}
	}

	static const float inv_nb_samples = 1.0f / (MSAA_X * MSAA_Y);

	// Store the resolved radiance.
	g_output_image_texture[location]  = INVERSE_TONE_MAP_COMPONENT(ldr_sum * inv_nb_samples);
	// Store the resolved normal.
	g_output_normal_texture[location] = normalize(normal_sum);
	// Store the resolved depth.
	g_output_depth_texture[location]  = depth;
}

#endif // MSAA_X && MSAA_Y

and for SSAA:


#else  // SSAA_X && SSAA_Y

// Per-thread record exchanged between the threads of a group through group
// shared memory.
struct Data {
    // Saturated, tone-mapped radiance of one super-sample.
    float4 ldr;
    // Normal of one super-sample, as read from the input normal texture.
    float3 normal;
    // Depth of one super-sample, as read from the input depth texture.
    float  depth;
};

// One record per thread of the group, indexed by SV_GroupThreadID.xy.
groupshared Data data[GROUP_SIZE][GROUP_SIZE];

// Resolves one SSAA_X x SSAA_Y tile of super-samples per output texel.
//
// Every thread fetches one input texel, tone maps the radiance and stores
// radiance, normal and depth in group shared memory. After the group-wide
// barrier, only the first thread of each tile continues: it averages the
// tone-mapped radiance, sums and normalizes the normals, reduces the depth
// with min/max and writes the three results to the output location.
//
// thread_id:       dispatch thread id; addresses the input texel and, divided
//                  by (SSAA_X, SSAA_Y), the output texel.
// group_thread_id: thread id within the group; indexes the group shared
//                  memory.
//
// NOTE(review): assumes GROUP_SIZE is a multiple of both SSAA_X and SSAA_Y so
// that a tile never straddles two thread groups - confirm against the
// dispatch code.
[numthreads(GROUP_SIZE, GROUP_SIZE, 1)]
void CS(uint3 thread_id : SV_DispatchThreadID, 
    uint3 group_thread_id : SV_GroupThreadID) {
    
    const uint2 input_location  = g_viewport_top_left * uint2(SSAA_X, SSAA_Y)
                                  + thread_id.xy;
    const uint2 output_location = g_viewport_top_left 
                                  + thread_id.xy / uint2(SSAA_X, SSAA_Y);

    // Collect and store the data in the group shared memory.
    const float4 hdr = g_input_image_texture[input_location];
    data[group_thread_id.x][group_thread_id.y].ldr 
        = saturate(TONE_MAP_COMPONENT(hdr));
    data[group_thread_id.x][group_thread_id.y].normal
        = g_input_normal_texture[input_location];
    data[group_thread_id.x][group_thread_id.y].depth  
        = g_input_depth_texture[input_location];

    // All threads reach the barrier before any of them terminates.
    GroupMemoryBarrierWithGroupSync();

    // Position of this thread inside its SSAA_X x SSAA_Y tile. Only the
    // thread at the tile origin performs the resolve.
    const uint2 sample_index2 = group_thread_id.xy % uint2(SSAA_X, SSAA_Y);
    if (any(0 != sample_index2)) {
        return;
    }

    // Resolve the (super-sampled) radiance, normal and depth.
    float4 ldr_sum    = 0.0f;
    float3 normal_sum = 0.0f;
    // The depth starts at the identity of the min/max reduction below.
#ifdef DISSABLE_INVERTED_Z_BUFFER
    float depth       = 1.0f;
#else  // DISSABLE_INVERTED_Z_BUFFER
    float depth       = 0.0f;
#endif // DISSABLE_INVERTED_Z_BUFFER
    
    // The tile starts at this thread's own group-local position; the loop
    // counters (not the absolute positions) are bounded by the tile size.
    [unroll]
    for (uint i = 0; i < SSAA_X; ++i) {
        const uint x = group_thread_id.x + i;

        [unroll]
        for (uint j = 0; j < SSAA_Y; ++j) {
            const uint y = group_thread_id.y + j;

            ldr_sum    += data[x][y].ldr;
            normal_sum += data[x][y].normal;

#ifdef DISSABLE_INVERTED_Z_BUFFER
            depth = min(depth, data[x][y].depth);
#else  // DISSABLE_INVERTED_Z_BUFFER
            depth = max(depth, data[x][y].depth);
#endif // DISSABLE_INVERTED_Z_BUFFER
        }
    }

    static const float inv_nb_samples = 1.0f / (SSAA_X * SSAA_Y);

    // Store the resolved radiance.
    g_output_image_texture[output_location]  = INVERSE_TONE_MAP_COMPONENT(ldr_sum * inv_nb_samples);
    // Store the resolved normal.
    g_output_normal_texture[output_location] = normalize(normal_sum);
    // Store the resolved depth.
    g_output_depth_texture[output_location]  = depth;
}

#endif // SSAA_X && SSAA_Y

 

🧙

On 10/29/2017 at 6:31 PM, galop1n said:

a good example is tonemapping

Should one also add the eye adaptation? Or is that only for the very final pass, which writes to the back buffer?

🧙

I tried the dedicated and non-dedicated approaches and did a quick compare:


#if defined(SSAA) && defined(GROUP_SIZE)

// Per-thread record exchanged between the threads of a group through group
// shared memory.
struct Data {
    // Radiance of one super-sample after ToneMap_Max3 (called with the 
    // per-sample weight).
    float4 ldr;
    // Normal of one super-sample, as read from the input normal texture.
    float3 normal;
    // Depth of one super-sample, as read from the input depth texture.
    float  depth;
};

// One record per thread of the group, indexed by SV_GroupIndex.
groupshared Data data[SSAA * SSAA * GROUP_SIZE * GROUP_SIZE];

// Dedicated SSAA resolve for a compile-time SSAA factor.
//
// The x dimension of the group enumerates the SSAA * SSAA super-samples of one
// output texel, the y and z dimensions enumerate the output texels. Every
// thread loads one super-sample into group shared memory; the thread with
// x == 0 then reduces the SSAA * SSAA consecutive records of its texel.
//
// thread_id:       dispatch thread id; thread_id.yz addresses the output
//                  texel relative to g_viewport_top_left.
// group_thread_id: thread id within the group; group_thread_id.x is the
//                  super-sample number inside the tile.
// group_index:     flattened thread index within the group; indexes the
//                  group shared memory.
[numthreads((SSAA * SSAA), GROUP_SIZE, GROUP_SIZE)]
void CS(uint3 thread_id : SV_DispatchThreadID,
    uint3 group_thread_id : SV_GroupThreadID,
    uint  group_index : SV_GroupIndex) {

    static const float sample_weight = 1.0f / (SSAA * SSAA);

    // Destination texel and the source texel of this thread's super-sample.
    const uint  sub_sample   = group_thread_id.x;
    const uint2 tile_offset  = uint2(sub_sample % SSAA, sub_sample / SSAA);
    const uint2 dst_location = g_viewport_top_left + thread_id.yz;
    const uint2 src_location = dst_location * SSAA + tile_offset;

    // Out-of-bounds texture reads return zeros. The super-samples of a tile
    // are either all available or all unavailable, so the average stays
    // correct without an explicit bounds check on the loads.

    // Publish this thread's super-sample to the group.
    Data entry;
    entry.ldr    = ToneMap_Max3(g_input_image_texture[src_location], 
                                sample_weight);
    entry.normal = g_input_normal_texture[src_location];
    entry.depth  = g_input_depth_texture[src_location];
    data[group_index] = entry;

    // Wait until every thread of the group has published its record.
    GroupMemoryBarrierWithGroupSync();

    // Only the first thread of an in-bounds tile performs the resolve.
    if (0 != sub_sample || any(dst_location >= g_display_resolution)) {
        return;
    }

    float4 radiance_acc = 0.0f;
    float3 normal_acc   = 0.0f;
    // The depth starts at the identity of the min/max reduction below.
#ifdef DISSABLE_INVERTED_Z_BUFFER
    float resolved_depth = 1.0f;
#else  // DISSABLE_INVERTED_Z_BUFFER
    float resolved_depth = 0.0f;
#endif // DISSABLE_INVERTED_Z_BUFFER

    // The records of a tile are stored consecutively, starting at the record
    // of this thread.
    [unroll]
    for (uint k = 0; k < (SSAA * SSAA); ++k) {
        const Data record = data[group_index + k];

        radiance_acc += record.ldr;
        normal_acc   += record.normal;

#ifdef DISSABLE_INVERTED_Z_BUFFER
        resolved_depth = min(resolved_depth, record.depth);
#else  // DISSABLE_INVERTED_Z_BUFFER
        resolved_depth = max(resolved_depth, record.depth);
#endif // DISSABLE_INVERTED_Z_BUFFER
    }

    // Write the resolved radiance, normal and depth.
    g_output_image_texture[dst_location]  = InverseToneMap_Max3(radiance_acc);
    g_output_normal_texture[dst_location] = normalize(normal_acc);
    g_output_depth_texture[dst_location]  = resolved_depth;
}

#else  // SSAA && GROUP_SIZE

#ifndef GROUP_SIZE
#define GROUP_SIZE GROUP_SIZE_DEFAULT
#endif

// Generic SSAA resolve: one thread per output texel.
//
// The super-sampling factor is derived at run time from the dimensions of the
// input and output radiance textures. Each thread walks over all input texels
// of its tile, accumulates the weighted tone-mapped radiance and the normals,
// reduces the depth with min/max and writes the three results.
//
// thread_id: dispatch thread id; thread_id.xy addresses the output texel
//            relative to g_viewport_top_left.
[numthreads(GROUP_SIZE, GROUP_SIZE, 1)]
void CS(uint3 thread_id : SV_DispatchThreadID) {

    const uint2 dst_location = g_viewport_top_left + thread_id.xy;
    if (any(dst_location >= g_display_resolution)) {
        return;
    }

    // Query the texture sizes to derive the tile size per output texel.
    uint2 src_dim;
    g_input_image_texture.GetDimensions(src_dim.x, src_dim.y);
    uint2 dst_dim;
    g_output_image_texture.GetDimensions(dst_dim.x, dst_dim.y);

    const uint2 tile_dim      = src_dim / dst_dim;
    const float sample_weight = 1.0f / (tile_dim.x * tile_dim.y);
    const uint2 tile_origin   = dst_location * tile_dim;

    float4 radiance_acc = 0.0f;
    float3 normal_acc   = 0.0f;
    // The depth starts at the identity of the min/max reduction below.
#ifdef DISSABLE_INVERTED_Z_BUFFER
    float resolved_depth = 1.0f;
#else  // DISSABLE_INVERTED_Z_BUFFER
    float resolved_depth = 0.0f;
#endif // DISSABLE_INVERTED_Z_BUFFER

    // Accumulate every super-sample of the tile.
    for (uint dx = 0; dx < tile_dim.x; ++dx) {
        for (uint dy = 0; dy < tile_dim.y; ++dy) {

            const uint2 src_location = tile_origin + uint2(dx, dy);

            radiance_acc += ToneMap_Max3(g_input_image_texture[src_location],
                                         sample_weight);
            normal_acc   += g_input_normal_texture[src_location];

#ifdef DISSABLE_INVERTED_Z_BUFFER
            resolved_depth = min(resolved_depth, 
                                 g_input_depth_texture[src_location]);
#else  // DISSABLE_INVERTED_Z_BUFFER
            resolved_depth = max(resolved_depth, 
                                 g_input_depth_texture[src_location]);
#endif // DISSABLE_INVERTED_Z_BUFFER
        }
    }

    // Write the resolved radiance, normal and depth.
    g_output_image_texture[dst_location]  = InverseToneMap_Max3(radiance_acc);
    g_output_normal_texture[dst_location] = normalize(normal_acc);
    g_output_depth_texture[dst_location]  = resolved_depth;
}

#endif // SSAA && GROUP_SIZE

SSAA 2x ~655 FPS vs ~577 FPS

SSAA 3x ~265 FPS vs ~200 FPS

SSAA 4x ~127 FPS vs ~75 FPS

But contrary to what you would expect, the fastest one is the non-dedicated one...

SSAA 2x [numthreads(4, 16, 16)] = 1024 (multiple of 64) [MAXIMUM]

SSAA 3x [numthreads(9, 8, 8)] = 576 (multiple of 64)

SSAA 4x [numthreads(16, 8, 8)] = 1024 (multiple of 64) [MAXIMUM]

SSAA All [numthreads(16, 16, 1)] = 256 (multiple of 64)

Since SSAA 2x already results in some difference, should I lower the number of threads per group, or is this really due to using the Z dimension?

🧙

On 11/4/2017 at 7:39 PM, matt77hias said:

SSAA 2x ~655 FPS vs ~577 FPS

SSAA 3x ~265 FPS vs ~200 FPS

SSAA 4x ~127 FPS vs ~75 FPS

But contrary to what you would expect, the fastest one is the non-dedicated one...

SSAA 2x [numthreads(4, 16, 16)] = 1024 (multiple of 64) [MAXIMUM]

SSAA 3x [numthreads(9, 8, 8)] = 576 (multiple of 64)

SSAA 4x [numthreads(16, 8, 8)] = 1024 (multiple of 64) [MAXIMUM]

SSAA All [numthreads(16, 16, 1)] = 256 (multiple of 64)

Since SSAA 2x already results in some difference, should I lower the number of threads per group, or is this really due to using the Z dimension?

Is it actually advisable to be close to the maximum of 1024? Or is this only feasible for the best GPUs?

What about a sync? Is a 256 threads sync less expensive than a 1024 threads sync?

🧙

This topic is closed to new replies.

Advertisement