/*
|
* function: kernel_min_filter
|
* input: image2d_t as read only
|
* output: image2d_t as write only
|
*
|
* data_type CL_UNSIGNED_INT16
|
* channel_order CL_RGBA
|
*/
|
|
//#define VERTICAL_MIN_KERNEL 1
|
#define PATCH_RADIUS 8
|
|
// offset X should be PATCH_RADIUS and aligned by 8
|
// offset Y should be PATCH_RADIUS aligned
|
|
#if VERTICAL_MIN_KERNEL // vertical
|
#define OFFSET_X 0
|
#define OFFSET_Y PATCH_RADIUS
|
#define GROUP_X 128
|
#define GROUP_Y 8
|
#define LINES_OF_WI 2
|
|
#else //horizontal
|
// offset X should be PATCH_RADIUS and aligned with 8
|
#define OFFSET_X 8
|
#define OFFSET_Y 0
|
#define GROUP_X 128
|
#define GROUP_Y 4
|
#define LINES_OF_WI 1
|
#endif
|
|
#define DOT_X_SIZE (GROUP_X + OFFSET_X * 2)
|
#define DOT_Y_SIZE (GROUP_Y + OFFSET_Y * 2)
|
|
//__constant const int slm_x_size = DOT_X_SIZE / 8;
|
//__constant const int slm_y_size = DOT_Y_SIZE;
|
#define slm_x_size (DOT_X_SIZE / 8)
|
#define slm_y_size DOT_Y_SIZE
|
__constant int uchar8_offset = OFFSET_X / 8;
|
|
void load_to_slm (__read_only image2d_t input, __local uchar8 *slm, int group_start_x, int group_start_y)
|
{
|
sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
|
int local_x = get_local_id (0);
|
int local_y = get_local_id (1);
|
int local_index = local_y * get_local_size (0) + local_x;
|
|
int group_offset_x = group_start_x - uchar8_offset;
|
int group_offset_y = group_start_y - OFFSET_Y;
|
|
for (; local_index < slm_x_size * slm_y_size; local_index += get_local_size(0) * get_local_size(1)) {
|
int slm_x = local_index % slm_x_size;
|
int slm_y = local_index / slm_x_size;
|
int pos_x = group_offset_x + slm_x;
|
int pos_y = group_offset_y + slm_y;
|
uchar8 data = as_uchar8(convert_ushort4(read_imageui(input, sampler, (int2)(pos_x, pos_y))));
|
slm[local_index] = data;
|
}
|
|
}
|
|
void finish_vertical_min (
|
__local uchar8 *data_center, __write_only image2d_t output,
|
int group_start_x, int group_start_y, int local_x, int local_y)
|
{
|
int pos_x, pos_y;
|
uchar8 min_val = data_center[0];
|
int v;
|
|
// process 2 line with each uchar8 pixels by each work-item
|
#pragma unroll
|
for (v = 1; v < OFFSET_Y; ++v) {
|
min_val = min (min_val, data_center[slm_x_size * v]);
|
min_val = min (min_val, data_center[-slm_x_size * v]);
|
}
|
min_val = min (min_val, data_center[slm_x_size * OFFSET_Y]);
|
|
uchar8 min_val_1 = min (min_val, data_center[-slm_x_size * OFFSET_Y]);
|
uchar8 min_val_2 = min (min_val, data_center[slm_x_size * (OFFSET_Y + 1)]);
|
|
pos_x = group_start_x + local_x;
|
pos_y = group_start_y + local_y;
|
|
write_imageui(output, (int2)(pos_x, pos_y), convert_uint4(as_ushort4(min_val_1)));
|
write_imageui(output, (int2)(pos_x, pos_y + 1), convert_uint4(as_ushort4(min_val_2)));
|
}
|
|
|
void finish_horizontal_min (
|
__local uchar8 *data_center, __write_only image2d_t output,
|
int group_start_x, int group_start_y, int local_x, int local_y)
|
{
|
|
uchar8 value = data_center[0];
|
uchar8 v_left = ((__local uchar8 *)data_center)[-1];
|
uchar8 v_right = ((__local uchar8 *)data_center)[1];
|
/*
|
* Order 1st uchar4
|
* 1st 4 values's common min, value.lo
|
* - - - 3 4 5 6 7 X X X X 4 5 6 7 0 - - - - - - -
|
* 2nd 4 values's common min, value.hi
|
* - - - - - - - 7 0 1 2 3 X X X X 0 1 2 3 4 - - -
|
* 1st and 2nd 4 value's shared common
|
* - - - - - - - 7 0 1 2 3 4 5 6 7 0 - - - - - - -
|
*/
|
|
uchar4 tmp4;
|
uchar2 tmp2;
|
uchar tmp1_left, tmp1_right;
|
|
uchar shared_common;
|
uchar first_common_min, second_common_min;
|
uchar8 out_data;
|
|
tmp4 = min (value.lo, value.hi);
|
tmp2 = min (tmp4.s01, tmp4.s23);
|
shared_common = min (tmp2.s0, tmp2.s1);
|
shared_common = min (shared_common, v_left.s7);
|
shared_common = min (shared_common, v_right.s0);
|
|
tmp2 = min (v_left.s34, v_left.s56);
|
first_common_min = min (tmp2.s0, tmp2.s1);
|
first_common_min = min (first_common_min, shared_common);
|
|
tmp2 = min (v_right.s12, v_right.s34);
|
second_common_min = min (tmp2.s0, tmp2.s1);
|
second_common_min = min (second_common_min, shared_common);
|
|
//final first 4 values
|
tmp1_left = min (v_left.s1, v_left.s2);
|
tmp1_right = min (v_right.s1, v_right.s2);
|
out_data.s0 = min (tmp1_left, v_left.s0);
|
out_data.s0 = min (out_data.s0, first_common_min);
|
|
out_data.s1 = min (tmp1_left, first_common_min);
|
out_data.s1 = min (out_data.s1, v_right.s1);
|
|
out_data.s2 = min (v_left.s2, first_common_min);
|
out_data.s2 = min (out_data.s2, tmp1_right);
|
|
out_data.s3 = min (first_common_min, tmp1_right);
|
out_data.s3 = min (out_data.s3, v_right.s3);
|
|
//second 4 values
|
tmp1_left = min (v_left.s5, v_left.s6);
|
tmp1_right = min (v_right.s5, v_right.s6);
|
out_data.s4 = min (tmp1_left, v_left.s4);
|
out_data.s4 = min (out_data.s4, second_common_min);
|
|
out_data.s5 = min (tmp1_left, second_common_min);
|
out_data.s5 = min (out_data.s5, v_right.s5);
|
|
out_data.s6 = min (v_left.s6, second_common_min);
|
out_data.s6 = min (out_data.s6, tmp1_right);
|
|
out_data.s7 = min (second_common_min, tmp1_right);
|
out_data.s7 = min (out_data.s7, v_right.s7);
|
|
int pos_x = group_start_x + local_x;
|
int pos_y = group_start_y + local_y;
|
|
write_imageui(output, (int2)(pos_x, pos_y), convert_uint4(as_ushort4(out_data)));
|
}
|
|
__kernel void kernel_min_filter (
|
__read_only image2d_t input,
|
__write_only image2d_t output)
|
{
|
int group_start_x = get_group_id (0) * (GROUP_X / 8);
|
int group_start_y = get_group_id (1) * GROUP_Y;
|
|
__local uchar8 slm_cache[slm_x_size * slm_y_size];
|
|
//load to slm
|
load_to_slm (input, slm_cache, group_start_x, group_start_y);
|
barrier (CLK_LOCAL_MEM_FENCE);
|
|
int local_x = get_local_id (0) ;
|
int local_y = get_local_id (1) * LINES_OF_WI;
|
int slm_x = local_x + uchar8_offset;
|
int slm_y = local_y + OFFSET_Y;
|
int slm_index = slm_x + slm_y * slm_x_size;
|
__local uchar8 *data_center = slm_cache + slm_index;
|
|
#if VERTICAL_MIN_KERNEL
|
finish_vertical_min (data_center, output, group_start_x, group_start_y, local_x, local_y);
|
#else
|
finish_horizontal_min (data_center, output, group_start_x, group_start_y, local_x, local_y);
|
#endif
|
}
|