/*
|
* function: kernel_3d_denoise
|
* 3D Noise Reduction
|
* gain: The parameter determines the filtering strength for the reference block
|
* threshold: Noise variances of observed image
|
* restoredPrev: The previous restored image, image2d_t as read only
|
* output: restored image, image2d_t as write only
|
* input: observed image, image2d_t as read only
|
* inputPrev1: reference image, image2d_t as read only
|
* inputPrev2: reference image, image2d_t as read only
|
*/
|
|
#ifndef REFERENCE_FRAME_COUNT
|
#define REFERENCE_FRAME_COUNT 2
|
#endif
|
|
#ifndef ENABLE_IIR_FILERING
|
#define ENABLE_IIR_FILERING 1
|
#endif
|
|
#define ENABLE_GRADIENT 1
|
|
#ifndef WORKGROUP_WIDTH
|
#define WORKGROUP_WIDTH 2
|
#endif
|
|
#ifndef WORKGROUP_HEIGHT
|
#define WORKGROUP_HEIGHT 32
|
#endif
|
|
#define REF_BLOCK_X_OFFSET 1
|
#define REF_BLOCK_Y_OFFSET 4
|
|
#define REF_BLOCK_WIDTH (WORKGROUP_WIDTH + 2 * REF_BLOCK_X_OFFSET)
|
#define REF_BLOCK_HEIGHT (WORKGROUP_HEIGHT + 2 * REF_BLOCK_Y_OFFSET)
|
|
inline int2 subgroup_pos(const int sg_id, const int sg_lid)
|
{
|
int2 pos;
|
pos.x = mad24(2, sg_id % 2, sg_lid % 2);
|
pos.y = mad24(4, sg_id / 2, sg_lid / 2);
|
|
return pos;
|
}
|
|
inline void average_slice(float8 ref,
|
float8 observe,
|
float8* restore,
|
float2* sum_weight,
|
float gain,
|
float threshold,
|
uint sg_id,
|
uint sg_lid)
|
{
|
float8 grad = 0.0f;
|
float8 gradient = 0.0f;
|
float8 dist = 0.0f;
|
float8 distance = 0.0f;
|
float weight = 0.0f;
|
|
#if ENABLE_GRADIENT
|
// calculate & cumulate gradient
|
if (sg_lid % 2 == 0) {
|
grad = intel_sub_group_shuffle(ref, 4);
|
} else {
|
grad = intel_sub_group_shuffle(ref, 5);
|
}
|
gradient = (float8)(grad.s1, grad.s1, grad.s1, grad.s1, grad.s5, grad.s5, grad.s5, grad.s5);
|
|
// normalize gradient "1/(4*255.0f) = 0.00098039f"
|
grad = fabs(gradient - ref) * 0.00098039f;
|
//grad = mad(-2, gradient, (ref + grad)) * 0.0004902f;
|
|
grad.s0 = (grad.s0 + grad.s1 + grad.s2 + grad.s3);
|
grad.s4 = (grad.s4 + grad.s5 + grad.s6 + grad.s7);
|
#endif
|
// calculate & normalize distance "1/255.0f = 0.00392157f"
|
dist = (observe - ref) * 0.00392157f;
|
dist = dist * dist;
|
|
float8 dist_shuffle[8];
|
dist_shuffle[0] = (intel_sub_group_shuffle(dist, 0));
|
dist_shuffle[1] = (intel_sub_group_shuffle(dist, 1));
|
dist_shuffle[2] = (intel_sub_group_shuffle(dist, 2));
|
dist_shuffle[3] = (intel_sub_group_shuffle(dist, 3));
|
dist_shuffle[4] = (intel_sub_group_shuffle(dist, 4));
|
dist_shuffle[5] = (intel_sub_group_shuffle(dist, 5));
|
dist_shuffle[6] = (intel_sub_group_shuffle(dist, 6));
|
dist_shuffle[7] = (intel_sub_group_shuffle(dist, 7));
|
|
if (sg_lid % 2 == 0) {
|
distance = dist_shuffle[0];
|
distance += dist_shuffle[2];
|
distance += dist_shuffle[4];
|
distance += dist_shuffle[6];
|
}
|
else {
|
distance = dist_shuffle[1];
|
distance += dist_shuffle[3];
|
distance += dist_shuffle[5];
|
distance += dist_shuffle[7];
|
}
|
|
// cumulate distance
|
dist.s0 = (distance.s0 + distance.s1 + distance.s2 + distance.s3);
|
dist.s4 = (distance.s4 + distance.s5 + distance.s6 + distance.s7);
|
gain = (grad.s0 < threshold) ? gain : 2.0f * gain;
|
weight = native_exp(-gain * dist.s0);
|
(*restore).lo = mad(weight, ref.lo, (*restore).lo);
|
(*sum_weight).lo = (*sum_weight).lo + weight;
|
|
gain = (grad.s4 < threshold) ? gain : 2.0f * gain;
|
weight = native_exp(-gain * dist.s4);
|
(*restore).hi = mad(weight, ref.hi, (*restore).hi);
|
(*sum_weight).hi = (*sum_weight).hi + weight;
|
}
|
|
inline void weighted_average (__read_only image2d_t input,
|
__local uchar8* ref_cache,
|
bool load_observe,
|
float8* observe,
|
float8* restore,
|
float2* sum_weight,
|
float gain,
|
float threshold,
|
uint sg_id,
|
uint sg_lid)
|
{
|
sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
|
|
int local_id_x = get_local_id(0);
|
int local_id_y = get_local_id(1);
|
const int group_id_x = get_group_id(0);
|
const int group_id_y = get_group_id(1);
|
|
int start_x = mad24(group_id_x, WORKGROUP_WIDTH, -REF_BLOCK_X_OFFSET);
|
int start_y = mad24(group_id_y, WORKGROUP_HEIGHT, -REF_BLOCK_Y_OFFSET);
|
|
int i = local_id_x + local_id_y * WORKGROUP_WIDTH;
|
for ( int j = i; j < (REF_BLOCK_HEIGHT * REF_BLOCK_WIDTH);
|
j += (WORKGROUP_HEIGHT * WORKGROUP_WIDTH) ) {
|
int corrd_x = start_x + (j % REF_BLOCK_WIDTH);
|
int corrd_y = start_y + (j / REF_BLOCK_WIDTH);
|
|
ref_cache[j] = as_uchar8( convert_ushort4(read_imageui(input,
|
sampler,
|
(int2)(corrd_x, corrd_y))));
|
}
|
barrier(CLK_LOCAL_MEM_FENCE);
|
|
#if WORKGROUP_WIDTH == 4
|
int2 pos = subgroup_pos(sg_id, sg_lid);
|
local_id_x = pos.x;
|
local_id_y = pos.y;
|
#endif
|
|
if (load_observe) {
|
(*observe) = convert_float8(
|
ref_cache[mad24(local_id_y + REF_BLOCK_Y_OFFSET,
|
REF_BLOCK_WIDTH,
|
local_id_x + REF_BLOCK_X_OFFSET)]);
|
(*restore) = (*observe);
|
(*sum_weight) = 1.0f;
|
}
|
|
float8 ref[2] = {0.0f, 0.0f};
|
__local uchar4* p_ref = (__local uchar4*)(ref_cache);
|
|
// top-left
|
ref[0] = convert_float8(*(__local uchar8*)(p_ref + mad24(local_id_y,
|
2 * REF_BLOCK_WIDTH,
|
mad24(2, local_id_x, 1))));
|
average_slice(ref[0], *observe, restore, sum_weight, gain, threshold, sg_id, sg_lid);
|
|
// top-right
|
ref[1] = convert_float8(*(__local uchar8*)(p_ref + mad24(local_id_y,
|
2 * REF_BLOCK_WIDTH,
|
mad24(2, local_id_x, 3))));
|
average_slice(ref[1], *observe, restore, sum_weight, gain, threshold, sg_id, sg_lid);
|
|
// top-mid
|
average_slice((float8)(ref[0].hi, ref[1].lo), *observe, restore, sum_weight, gain, threshold, sg_id, sg_lid);
|
|
// mid-left
|
ref[0] = convert_float8(*(__local uchar8*)(p_ref + mad24((local_id_y + 4),
|
2 * REF_BLOCK_WIDTH,
|
mad24(2, local_id_x, 1))));
|
average_slice(ref[0], *observe, restore, sum_weight, gain, threshold, sg_id, sg_lid);
|
|
// mid-right
|
ref[1] = convert_float8(*(__local uchar8*)(p_ref + mad24((local_id_y + 4),
|
2 * REF_BLOCK_WIDTH,
|
mad24(2, local_id_x, 3))));
|
average_slice(ref[1], *observe, restore, sum_weight, gain, threshold, sg_id, sg_lid);
|
|
// mid-mid
|
if (!load_observe) {
|
average_slice((float8)(ref[0].hi, ref[1].lo), *observe, restore, sum_weight, gain, threshold, sg_id, sg_lid);
|
}
|
|
// bottom-left
|
ref[0] = convert_float8(*(__local uchar8*)(p_ref + mad24((local_id_y + 8),
|
2 * REF_BLOCK_WIDTH,
|
mad24(2, local_id_x, 1))));
|
average_slice(ref[0], *observe, restore, sum_weight, gain, threshold, sg_id, sg_lid);
|
|
// bottom-right
|
ref[1] = convert_float8(*(__local uchar8*)(p_ref + mad24((local_id_y + 8),
|
2 * REF_BLOCK_WIDTH,
|
mad24(2, local_id_x, 3))));
|
average_slice(ref[1], *observe, restore, sum_weight, gain, threshold, sg_id, sg_lid);
|
|
// bottom-mid
|
average_slice((float8)(ref[0].hi, ref[1].lo), *observe, restore, sum_weight, gain, threshold, sg_id, sg_lid);
|
}
|
|
__kernel void kernel_3d_denoise ( float gain,
|
float threshold,
|
__read_only image2d_t restoredPrev,
|
__write_only image2d_t output,
|
__read_only image2d_t input,
|
__read_only image2d_t inputPrev1,
|
__read_only image2d_t inputPrev2)
|
{
|
float8 restore = 0.0f;
|
float8 observe = 0.0f;
|
float2 sum_weight = 0.0f;
|
|
const int sg_id = get_sub_group_id();
|
const int sg_lid = (get_local_id(1) * WORKGROUP_WIDTH + get_local_id(0)) % 8;
|
|
__local uchar8 ref_cache[REF_BLOCK_HEIGHT * REF_BLOCK_WIDTH];
|
|
weighted_average (input, ref_cache, true, &observe, &restore, &sum_weight, gain, threshold, sg_id, sg_lid);
|
|
#if ENABLE_IIR_FILERING
|
weighted_average (restoredPrev, ref_cache, false, &observe, &restore, &sum_weight, gain, threshold, sg_id, sg_lid);
|
#else
|
#if REFERENCE_FRAME_COUNT > 1
|
weighted_average (inputPrev1, ref_cache, false, &observe, &restore, &sum_weight, gain, threshold, sg_id, sg_lid);
|
#endif
|
|
#if REFERENCE_FRAME_COUNT > 2
|
weighted_average (inputPrev2, ref_cache, false, &observe, &restore, &sum_weight, gain, threshold, sg_id, sg_lid);
|
#endif
|
#endif
|
|
restore.lo = restore.lo / sum_weight.lo;
|
restore.hi = restore.hi / sum_weight.hi;
|
|
int local_id_x = get_local_id(0);
|
int local_id_y = get_local_id(1);
|
const int group_id_x = get_group_id(0);
|
const int group_id_y = get_group_id(1);
|
|
#if WORKGROUP_WIDTH == 4
|
int2 pos = subgroup_pos(sg_id, sg_lid);
|
local_id_x = pos.x;
|
local_id_y = pos.y;
|
#endif
|
|
int coor_x = mad24(group_id_x, WORKGROUP_WIDTH, local_id_x);
|
int coor_y = mad24(group_id_y, WORKGROUP_HEIGHT, local_id_y);
|
|
write_imageui(output,
|
(int2)(coor_x, coor_y),
|
convert_uint4(as_ushort4(convert_uchar8(restore))));
|
}
|