I've written a small software rasterizer using OpenCL and would like to optimize and parallelize it further. Currently I scan the whole screen and check, for every pixel, whether each triangle overlaps it.
I would like to parallelize the loop and make it more efficient. For example, my idea is to process only the pixels inside each triangle's bounding box. Is that the right approach?

__kernel void sendImageToPBO(__global uchar4* dst_buffer, __global float* vbo, int vbosize,
__global int* ibo, int ibosize)
{
size_t blockIdx = get_group_id(0);
size_t blockIdy = get_group_id(1);
size_t blockDimX = get_local_size(0);
size_t blockDimY = get_local_size(1);
size_t threadIdX = get_local_id(0);
size_t threadIdY = get_local_id(1);
float3 c0 = { 1, 0, 0 };
float3 c1 = { 0, 1, 0 };
float3 c2 = { 0, 0, 1 };
int x = get_global_id(0);
int y= get_global_id(1);
int imageWidth = 800;
int imageHeight = 800;
if (x < vbosize && y < vbosize)
{
for (int i = 0; i < vbosize; i += 9)
{
float3 v1 = (float3)(vbo[i], vbo[i + 1], vbo[i + 2]);
float3 v0 = (float3)(vbo[i + 3], vbo[i + 4], vbo[i + 5]);
float3 v2 = (float3)(vbo[i + 6], vbo[i + 7], vbo[i + 8]);
float xmin = fmin(v0.x, fmin(v1.x, v2.x));
float ymin = fmin(v0.y, fmin(v1.y, v2.y));
float xmax = fmax(v0.x, fmin(v1.x, v2.x));
float ymax = fmax(v0.y, fmin(v1.y, v2.y));
// be careful xmin/xmax/ymin/ymax can be negative. Don't cast to unsigned int
unsigned int x0 = max(0, (int)(floor(xmin)));
unsigned int x1 = min((int)(imageWidth)-1, (int)(floor(xmax)));
unsigned int y0 = max(0, (int)(floor(ymin)));
unsigned int y1 = min((int)(imageHeight)-1, (int)(floor(ymax)));
float3 p = { x + 0.5f, y + 0.5f, 0 };
float w0 = edgeFunction(v1, v2, p);
float w1 = edgeFunction(v2, v0, p);
float w2 = edgeFunction(v0, v1, p);
if (w0 >= 0 && w1 >= 0 && w2 >= 0) {
float area = edgeFunction(v0, v1, v2);
float r = w0 * c0.x + w1 * c1.x + w2 * c2.x;
float g = w0 * c0.y + w1 * c1.y + w2 * c2.y;
float b = w0 * c0.z + w1 * c1.z + w2 * c2.z;
w0 /= area;
w1 /= area;
w2 /= area;
float z = 1 / (w0 * v0.z + w1 * v1.z + w2 * v2.z);
r *= z, g *= z, b *= z;
dst_buffer[y * get_global_size(0) + x] = (uchar4)(r * 255, g * 255, b * 255, 255);
}
}
}