|
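/*
 * function: kernel_wavelet_denoise
 *     wavelet filter for denoise usage
 * src:             input image data (only read by the kernel)
 * approxOut:       receives the low-pass approximation while layer < decomLevels
 * details:         float buffer holding the thresholded detail coefficients
 * dest:            output image data
 * inputYOffset:    offset, in uint elements, added to src
 * outputYOffset:   offset, in uint elements, added to dest
 * inputUVOffset:   offset, in image rows, of the second plane inside src
 * outputUVOffset:  offset, in image rows, of the second plane inside dest
 * layer:           current decomposition layer (clamped to 1..decomLevels)
 * decomLevels:     total number of decomposition layers
 * hardThresh:      noise threshold, scaled per layer by threshConst
 * softThresh:      gain applied to detail coefficients below the threshold
 */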
|
|
/*
 * Per-layer scale factors for the noise threshold: kernel_wavelet_denoise
 * multiplies hardThresh by threshConst[layer - 1]. The table has five
 * entries, so only decomposition layers 1..5 have a factor.
 */
__constant float threshConst[5] = { 50.430166f, 20.376415f, 10.184031f, 6.640919f, 3.367972f };
|
|
/*
 * kernel_wavelet_denoise
 *
 * Runs one decomposition layer of the wavelet denoise filter. Every work-item
 * handles 16 consecutive bytes of one row (global size 0 is width / 16,
 * global size 1 is the number of rows):
 *
 *   1. smooth the input with a 3x3 { 1 2 1 / 2 4 2 / 1 2 1 } / 16 kernel whose
 *      taps are xScaler bytes / yScaler rows apart (the distance grows with
 *      the layer),
 *   2. take detail = input - smoothed and shrink it around the threshold
 *      hardThresh * threshConst[layer - 1],
 *   3. store (layer 1) or accumulate (later layers) the detail in `details`,
 *   4. while layer < decomLevels write the smoothed data to approxOut,
 *      otherwise write details + smoothed data to dest.
 *
 * At layer 1 the plane that is not being filtered is copied from src to dest.
 *
 * src             input data, read as bytes; inputYOffset (in uints) is added
 * approxOut       output for the smoothed data of non-final layers
 * details         float detail coefficients, one per input byte
 * dest            output data; outputYOffset (in uints) is added
 * inputUVOffset   row offset of the second plane in src
 * outputUVOffset  row offset of the second plane in dest
 * layer           current layer, clamped to [1, decomLevels]
 * decomLevels     number of layers
 * hardThresh      threshold multiplier
 * softThresh      gain for coefficients inside (-threshold, threshold)
 *
 * Build with WAVELET_DENOISE_UV != 0 for the chroma variant, 0 for luma.
 */
__kernel void kernel_wavelet_denoise(__global uint *src, __global uint *approxOut, __global float *details, __global uint *dest,
                                     int inputYOffset, int outputYOffset, uint inputUVOffset, uint outputUVOffset,
                                     int layer, int decomLevels, float hardThresh, float softThresh)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    const int imageWidth = (int)get_global_size(0) * 16;
    const int imageHeight = (int)get_global_size(1);
    const int uintsPerRow = imageWidth / 4;

    layer = (layer > 1) ? layer : 1;
    layer = (layer < decomLevels) ? layer : decomLevels;

    src += inputYOffset;
    dest += outputYOffset;

    /* Tap distance of the 3x3 kernel: bytes horizontally, rows vertically.
     * The UV build uses twice the horizontal distance (presumably because
     * U and V samples are interleaved -- confirm against the host code). */
#if WAVELET_DENOISE_UV
    int xScaler = pown(2.0f, layer);
    int yScaler = pown(2.0f, (layer - 1));
#else
    int xScaler = pown(2.0f, (layer - 1));
    int yScaler = xScaler;
#endif

    /* Border handling: collapse the taps onto the centre when a neighbour
     * would fall outside the row / plane. The first two tests are the
     * historical ones (kept so interior results do not change); the last two
     * are byte-exact and catch the cases the historical test misses, where
     * the 16-byte span shifted by xScaler crosses into an adjacent row. */
    const int firstByte = x * 16;
    if ((x == 0) || (x > imageWidth / 16 - xScaler) ||
        (firstByte < xScaler) || (firstByte + 16 + xScaler > imageWidth)) {
        xScaler = 0;
    }
    /* ">=" (not ">"): row y + yScaler must be at most imageHeight - 1. */
    if ((y < yScaler) || (y >= imageHeight - yScaler)) {
        yScaler = 0;
    }

#if WAVELET_DENOISE_UV
    /* Odd layers read the second plane of src, even layers read from the
     * start of src. */
    int srcOffset = (layer % 2) ? (inputUVOffset * uintsPerRow) : 0;
    __global uchar *src_p = (__global uchar *)(src + srcOffset);
#else
    __global uchar *src_p = (__global uchar *)(src);
#endif

    const int pixel_index = x * 16 + y * imageWidth;   /* byte / float index */
    const int group_index = x * 4 + y * uintsPerRow;   /* uint index         */

    const int rowStep = yScaler * imageWidth;
    const int up = pixel_index - rowStep;
    const int down = pixel_index + rowStep;

    /*
     * Neighbourhood and weights:
     *     { a, b, c }      { 1, 2, 1 }
     *     { d, e, f }      { 2, 4, 2 }
     *     { g, h, i }      { 1, 2, 1 }
     * vload16 on a uchar pointer only needs byte alignment, so the shifted
     * spans can be fetched directly.
     */
    const ushort16 a = convert_ushort16(vload16(0, src_p + (up - xScaler)));
    const ushort16 b = convert_ushort16(vload16(0, src_p + up));
    const ushort16 c = convert_ushort16(vload16(0, src_p + (up + xScaler)));
    const ushort16 d = convert_ushort16(vload16(0, src_p + (pixel_index - xScaler)));
    const ushort16 e = convert_ushort16(vload16(0, src_p + pixel_index));
    const ushort16 f = convert_ushort16(vload16(0, src_p + (pixel_index + xScaler)));
    const ushort16 g = convert_ushort16(vload16(0, src_p + (down - xScaler)));
    const ushort16 h = convert_ushort16(vload16(0, src_p + down));
    const ushort16 i = convert_ushort16(vload16(0, src_p + (down + xScaler)));

    /* Weighted sum is at most 16 * 255 = 4080, so ushort cannot overflow. */
    const ushort16 sum = (a + c + g + i) + (ushort16)2 * (b + d + f + h) + (ushort16)4 * e;

    /* Rounded division by 16. */
    const uchar16 approxPix = convert_uchar16((sum + (ushort16)8) >> (ushort16)4);
    const uint4 approx = as_uint4(approxPix);

    /* Subtract in float: the difference spans [-255, 255] and would wrap
     * (changing sign) if it were computed in 8-bit signed arithmetic. */
    const float16 rawDetail = convert_float16(e) - convert_float16(approxPix);

    /* Clamp the table index so an out-of-range layer cannot read outside
     * threshConst. */
    const int threshCount = (int)(sizeof(threshConst) / sizeof(threshConst[0]));
    const float thold = hardThresh * threshConst[clamp(layer - 1, 0, threshCount - 1)];

    /* Shrinkage: outside the threshold move the value towards zero by
     * thold * (1 - softThresh); inside, scale by softThresh. Every lane is
     * classified on the raw value so that a lane shifted by the outer rule is
     * not scaled a second time by the inner rule. Lanes exactly at +/-thold
     * are left unchanged. */
    const float shrink = thold - thold * softThresh;
    float16 detail = rawDetail;
    detail = (rawDetail < -thold) ? rawDetail + shrink : detail;
    detail = (rawDetail > thold) ? rawDetail - shrink : detail;
    detail = (rawDetail > -thold && rawDetail < thold) ? rawDetail * softThresh : detail;

    __global float *details_p = details + pixel_index;
    float16 accumulated;

    if (layer == 1) {
        /* First layer initialises the detail buffer. */
        accumulated = detail;

#if WAVELET_DENOISE_UV
        /* copy Y: two rows per work-item row */
        const int luma_index0 = x * 4 + (2 * y) * uintsPerRow;
        const int luma_index1 = x * 4 + (2 * y + 1) * uintsPerRow;

        vstore4(vload4(0, src + luma_index0), 0, dest + luma_index0);
        vstore4(vload4(0, src + luma_index1), 0, dest + luma_index1);
#else
        /* copy UV: one row for every second work-item row */
        if (y % 2 == 0) {
            const int chroma_index = x * 4 + (y / 2) * uintsPerRow;
            const uint4 chroma = vload4(0, src + chroma_index + inputUVOffset * uintsPerRow);

            vstore4(chroma, 0, dest + chroma_index + outputUVOffset * uintsPerRow);
        }
#endif
    } else {
        /* Later layers add their detail to what is already stored. */
        accumulated = vload16(0, details_p) + detail;
    }
    vstore16(accumulated, 0, details_p);

    if (layer < decomLevels) {
        /* Hand the smoothed data to the next layer. */
#if WAVELET_DENOISE_UV
        const int approxOffset = (layer % 2) ? 0 : (inputUVOffset * uintsPerRow);
        vstore4(approx, 0, approxOut + group_index + approxOffset);
#else
        vstore4(approx, 0, approxOut + group_index);
#endif
    } else {
        /* Reconstruction: accumulated details + final approximation.
         * Saturate so a sum outside [0, 255] clamps instead of producing an
         * implementation-defined byte. */
        const uint4 restored = as_uint4(convert_uchar16_sat(accumulated + convert_float16(approxPix)));
#if WAVELET_DENOISE_UV
        vstore4(restored, 0, dest + group_index + outputUVOffset * uintsPerRow);
#else
        vstore4(restored, 0, dest + group_index);
#endif
    }
}
|