diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_3x3_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_3x3_kernel.cl index 739c403ea76..21219038553 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_3x3_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_3x3_kernel.cl @@ -269,285 +269,3 @@ __kernel void conv2d_3x3(__private const int global_size_dim0, WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output); } - -// support batch > 1 -__kernel void conv2d_3x3_multi_batch(__private const int item_ch, - __private const int item_w, - __private const int item_h, - __read_only image2d_t input_image, - __read_only image2d_t filter_image, - __read_only image2d_t bias, - __write_only image2d_t output_image, - __private const int stride, - __private const int pad, - __private const int dilation, - __private const int batch, - __private const int in_ch, - __private const int in_w, - __private const int in_h, - __private const int out_w, - __private const int out_h, - __read_only image2d_t prelu_alpha) { - // item_id - const int item_ch_id = get_global_id(0); - const int item_w_id = get_global_id(1); - const int item_h_id = get_global_id(2); - if (item_ch_id >= item_ch || item_w_id >= item_w || item_h_id >= item_h) { - return; - } - - // out_width_id_per_blk - int out_batch_id = item_h_id / out_h; - int out_w_base_id = mul24(item_ch_id, out_w); - int out_w_id0 = item_w_id; - int out_w_id1 = out_w_id0 + item_w; - int out_w_id2 = out_w_id1 + item_w; - int out_w_id3 = out_w_id2 + item_w; - int out_w_id4 = out_w_id3 + item_w; - - // in_width_id_per_blk and in_height_id_per_batch - int in_h_id = mad24((item_h_id % out_h), stride, (-pad)); - int in_w_id0 = mad24(item_w_id, stride, (-pad)); - int in_w_id1 = mad24(item_w, stride, in_w_id0); - int in_w_id2 = mad24(item_w, stride, in_w_id1); - int in_w_id3 = mad24(item_w, stride, in_w_id2); - int in_w_id4 = mad24(item_w, stride, in_w_id3); - -#ifdef BIASE_CH - - CL_DTYPE4 output[5]; - output[0] = - READ_IMG_TYPE(CL_DTYPE_CHAR, bias, SAMPLER, (int2)(item_ch_id, 0)); - output[1] = output[0]; - output[2] = output[0]; - output[3] = output[0]; - output[4] = output[0]; - -#elif defined(BIASE_ELE) - - CL_DTYPE4 output[5]; - output[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, - bias, - SAMPLER, - (int2)(out_w_base_id + out_w_id0, item_h_id)); - if (out_w_id1 < out_w) { - output[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, - bias, - SAMPLER, - (int2)(out_w_base_id + out_w_id1, item_h_id)); - } - if (out_w_id2 < out_w) { - output[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, - bias, - SAMPLER, - (int2)(out_w_base_id + out_w_id2, item_h_id)); - } - if (out_w_id3 < out_w) { - output[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, - bias, - SAMPLER, - (int2)(out_w_base_id + out_w_id3, item_h_id)); - } - if (out_w_id4 < out_w) { - output[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, - bias, - SAMPLER, - (int2)(out_w_base_id + out_w_id4, item_h_id)); - } -#else - CL_DTYPE4 output[5] = {0.0f}; -#endif - - CL_DTYPE4 filter[4] = {0.0f}; - CL_DTYPE4 input[5] = {0.0f}; - - for (int ch = 0; ch < ((in_ch + 3) >> 2); ch++) { - int ch_surplus = ((ch + 1) << 2) - in_ch > 0 ? ((ch + 1) << 2) - in_ch : 0; - - const int in_w_base_id = mul24(ch, in_w); - - int filter_w_val = ch << 2; - int filter_h_val = mul24(item_ch_id, 9); - - for (int h = 0; h < 3; h++) { - int in_h_val = select(mad24(out_batch_id, in_h, (in_h_id + h)), - -1, - (mad24(out_batch_id, in_h, (in_h_id + h)) < - mul24(out_batch_id, in_h) || - mad24(out_batch_id, in_h, (in_h_id + h)) >= - mul24((out_batch_id + 1), in_h))); - - for (int w = 0; w < 3; w++) { - int in_w_val0 = select(in_w_base_id + in_w_id0 + w, - -1, - (in_w_id0 + w < 0 | in_w_id0 + w >= in_w)); - int in_w_val1 = select(in_w_base_id + in_w_id1 + w, - -1, - (in_w_id1 + w < 0 | in_w_id1 + w >= in_w)); - int in_w_val2 = select(in_w_base_id + in_w_id2 + w, - -1, - (in_w_id2 + w < 0 | in_w_id2 + w >= in_w)); - int in_w_val3 = select(in_w_base_id + in_w_id3 + w, - -1, - (in_w_id3 + w < 0 | in_w_id3 + w >= in_w)); - int in_w_val4 = select(in_w_base_id + in_w_id4 + w, - -1, - (in_w_id4 + w < 0 | in_w_id4 + w >= in_w)); - - filter[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, - filter_image, - SAMPLER, - (int2)(filter_w_val, filter_h_val)); - filter[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, - filter_image, - SAMPLER, - (int2)(filter_w_val + 1, filter_h_val)); - filter[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, - filter_image, - SAMPLER, - (int2)(filter_w_val + 2, filter_h_val)); - filter[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, - filter_image, - SAMPLER, - (int2)(filter_w_val + 3, filter_h_val++)); - - input[0] = READ_IMG_TYPE( - CL_DTYPE_CHAR, input_image, SAMPLER, (int2)(in_w_val0, in_h_val)); - input[1] = READ_IMG_TYPE( - CL_DTYPE_CHAR, input_image, SAMPLER, (int2)(in_w_val1, in_h_val)); - input[2] = READ_IMG_TYPE( - CL_DTYPE_CHAR, input_image, SAMPLER, (int2)(in_w_val2, in_h_val)); - input[3] = READ_IMG_TYPE( - CL_DTYPE_CHAR, input_image, SAMPLER, (int2)(in_w_val3, in_h_val)); - input[4] = READ_IMG_TYPE( - CL_DTYPE_CHAR, input_image, SAMPLER, (int2)(in_w_val4, in_h_val)); - - output[0] = mad(input[0].x, filter[0], output[0]); - output[1] = mad(input[1].x, filter[0], output[1]); - output[2] = mad(input[2].x, filter[0], output[2]); - output[3] = mad(input[3].x, filter[0], output[3]); - output[4] = mad(input[4].x, filter[0], output[4]); - - if (ch_surplus < 3) { - output[0] = mad(input[0].y, filter[1], output[0]); - output[1] = mad(input[1].y, filter[1], output[1]); - output[2] = mad(input[2].y, filter[1], output[2]); - output[3] = mad(input[3].y, filter[1], output[3]); - output[4] = mad(input[4].y, filter[1], output[4]); - } - if (ch_surplus < 2) { - output[0] = mad(input[0].z, filter[2], output[0]); - output[1] = mad(input[1].z, filter[2], output[1]); - output[2] = mad(input[2].z, filter[2], output[2]); - output[3] = mad(input[3].z, filter[2], output[3]); - output[4] = mad(input[4].z, filter[2], output[4]); - } - if (ch_surplus < 1) { - output[0] = mad(input[0].w, filter[3], output[0]); - output[1] = mad(input[1].w, filter[3], output[1]); - output[2] = mad(input[2].w, filter[3], output[2]); - output[3] = mad(input[3].w, filter[3], output[3]); - output[4] = mad(input[4].w, filter[3], output[4]); - } - } - } - } - - CL_DTYPE4 alpha[5]; -#ifdef PRELU_CH //{ - alpha[0] = - READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(item_ch_id, 0)); - alpha[1] = alpha[0]; - alpha[2] = alpha[0]; - alpha[3] = alpha[0]; - alpha[4] = alpha[0]; -//} -#elif defined(PRELU_ELE) //{ - alpha[0] = - READ_IMG_TYPE(CL_DTYPE_CHAR, - prelu_alpha, - SAMPLER, - (int2)(out_w_base_id + out_w_id0, item_h_id % out_h)); - if (out_w_id1 < out_w) { - alpha[1] = - READ_IMG_TYPE(CL_DTYPE_CHAR, - prelu_alpha, - SAMPLER, - (int2)(out_w_base_id + out_w_id1, item_h_id % out_h)); - } - if (out_w_id2 < out_w) { - alpha[2] = - READ_IMG_TYPE(CL_DTYPE_CHAR, - prelu_alpha, - SAMPLER, - (int2)(out_w_base_id + out_w_id2, item_h_id % out_h)); - } - if (out_w_id3 < out_w) { - alpha[3] = - READ_IMG_TYPE(CL_DTYPE_CHAR, - prelu_alpha, - SAMPLER, - (int2)(out_w_base_id + out_w_id3, item_h_id % out_h)); - } - if (out_w_id4 < out_w) { - alpha[4] = - READ_IMG_TYPE(CL_DTYPE_CHAR, - prelu_alpha, - SAMPLER, - (int2)(out_w_base_id + out_w_id4, item_h_id % out_h)); - } -//} -#elif defined(PRELU_ALL) //{ - alpha[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0)); - alpha[0].y = alpha[0].x; - alpha[0].z = alpha[0].x; - alpha[0].w = alpha[0].x; - alpha[1] = alpha[0]; - alpha[2] = alpha[0]; - alpha[3] = alpha[0]; - alpha[4] = alpha[0]; -//} -#endif - output[0] = activation_type4(output[0], alpha[0]); - output[1] = activation_type4(output[1], alpha[1]); - output[2] = activation_type4(output[2], alpha[2]); - output[3] = activation_type4(output[3], alpha[3]); - output[4] = activation_type4(output[4], alpha[4]); - -#ifdef SCALE_ACTIVATION - output[0] = fuse_scale(output[0], 1.f, 0.f, 0.f); - output[1] = fuse_scale(output[1], 1.f, 0.f, 0.f); - output[2] = fuse_scale(output[2], 1.f, 0.f, 0.f); - output[3] = fuse_scale(output[3], 1.f, 0.f, 0.f); - output[4] = fuse_scale(output[4], 1.f, 0.f, 0.f); -#endif - - WRITE_IMG_TYPE(CL_DTYPE_CHAR, - output_image, - (int2)(out_w_base_id + out_w_id0, item_h_id), - output[0]); - if (out_w_id1 < out_w) { - WRITE_IMG_TYPE(CL_DTYPE_CHAR, - output_image, - (int2)(out_w_base_id + out_w_id1, item_h_id), - output[1]); - } - if (out_w_id2 < out_w) { - WRITE_IMG_TYPE(CL_DTYPE_CHAR, - output_image, - (int2)(out_w_base_id + out_w_id2, item_h_id), - output[2]); - } - if (out_w_id3 < out_w) { - WRITE_IMG_TYPE(CL_DTYPE_CHAR, - output_image, - (int2)(out_w_base_id + out_w_id3, item_h_id), - output[3]); - } - if (out_w_id4 < out_w) { - WRITE_IMG_TYPE(CL_DTYPE_CHAR, - output_image, - (int2)(out_w_base_id + out_w_id4, item_h_id), - output[4]); - } -} \ No newline at end of file diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_5x5_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_5x5_kernel.cl deleted file mode 100644 index 938fbcf676e..00000000000 --- a/lite/backends/opencl/cl_kernel/image/conv2d_5x5_kernel.cl +++ /dev/null @@ -1,202 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include - -__kernel void conv2d_5x5(__private const int global_size_dim0, - __private const int global_size_dim1, - __private const int global_size_dim2, - __read_only image2d_t input_image, - __read_only image2d_t filter_image, - __read_only image2d_t bias, -#ifdef BATCH_NORM - __read_only image2d_t new_scale, - __read_only image2d_t new_biase, -#endif - __write_only image2d_t output_image, - __private const int stride, - __private const int offset, - __private const int input_c, - __private const int dilation, - __private const int input_width, /* of one block */ - __private const int input_height, /* of one block */ - __private const int output_width, - __private const int output_height, - __read_only image2d_t prelu_alpha) { - - const int out_c = get_global_id(0); - const int out_w = get_global_id(1); - const int out_nh = get_global_id(2); - if (out_c >= global_size_dim0 || out_w >= global_size_dim1 || - out_nh >= global_size_dim2) { - return; - } - - int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh); - - if (out_c >= global_size_dim0 || out_w >= global_size_dim1 || - out_nh >= global_size_dim2) { - return; - } - - const int batch_index = out_nh / output_height; - const int out_nh_in_one_batch = out_nh % output_height; - - const int filter_n0 = 4 * out_c + 0; - const int filter_n1 = 4 * out_c + 1; - const int filter_n2 = 4 * out_c + 2; - const int filter_n3 = 4 * out_c + 3; - - int2 stride_xy; - stride_xy.x = stride; - stride_xy.y = stride; - - int2 ouput_pos_in_one_block; - ouput_pos_in_one_block.x = out_w; - ouput_pos_in_one_block.y = out_nh_in_one_batch; - - int2 in_pos_in_one_block; - in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset; - in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset; - -#ifdef BIASE_CH - CL_DTYPE4 output = - READ_IMG_TYPE(CL_DTYPE_CHAR, bias, SAMPLER, (int2)(out_c, 0)); -#elif defined(BIASE_ELE) - CL_DTYPE4 output = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, SAMPLER, output_pos); -#else - CL_DTYPE4 output = 0.0f; -#endif - - CL_DTYPE4 input; - CL_DTYPE4 filter[4]; - int2 filter_pos0; - int2 filter_pos1; - int2 filter_pos2; - int2 filter_pos3; - for (int i = 0; i < input_c; ++i) { - int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, - in_pos_in_one_block.y + batch_index * input_height); - for (int j = 0; j < 5; j++) { - for (int k = 0; k < 5; k++) { - input = SELECT( - READ_IMG_TYPE(CL_DTYPE_CHAR, - input_image, - SAMPLER, - (int2)(pos_in.x + (j - 2) * dilation, - pos_in.y + (k - 2) * dilation)), - (CL_DTYPE4)(0.0f), - in_pos_in_one_block.x + (j - 2) * dilation < 0 || - in_pos_in_one_block.y + (k - 2) * dilation < 0 || - in_pos_in_one_block.x + (j - 2) * dilation >= input_width || - in_pos_in_one_block.y + (k - 2) * dilation >= input_height); - int filter_h = k; - int filter_w = j; - int filter_c = i; - - filter_pos0.x = filter_c * 5 + filter_w; - filter_pos0.y = filter_n0 * 5 + filter_h; - - filter_pos1.x = filter_c * 5 + filter_w; - filter_pos1.y = filter_n1 * 5 + filter_h; - - filter_pos2.x = filter_c * 5 + filter_w; - filter_pos2.y = filter_n2 * 5 + filter_h; - - filter_pos3.x = filter_c * 5 + filter_w; - filter_pos3.y = filter_n3 * 5 + filter_h; - - filter[0] = - READ_IMG_TYPE(CL_DTYPE_CHAR, filter_image, SAMPLER, filter_pos0); - filter[1] = - READ_IMG_TYPE(CL_DTYPE_CHAR, filter_image, SAMPLER, filter_pos1); - filter[2] = - READ_IMG_TYPE(CL_DTYPE_CHAR, filter_image, SAMPLER, filter_pos2); - filter[3] = - READ_IMG_TYPE(CL_DTYPE_CHAR, filter_image, SAMPLER, filter_pos3); - - output.x += dot(input, filter[0]); - output.y += dot(input, filter[1]); - output.z += dot(input, filter[2]); - output.w += dot(input, filter[3]); - // - // if (output_pos.x == 0 && output_pos.y == 5) { - // printf("i,j,k ={ %d, %d , %d }\n", i,j,k); - // printf("in={ %f , %f , %f , %f } \n", - // convert_float(input.x), - // convert_float(input.y), - // convert_float(input.z), - // convert_float(input.w)); - // printf("filter0={ %f , %f , %f , %f } \n", - // convert_float(filter[0].x), - // convert_float(filter[0].y), - // convert_float(filter[0].z), - // convert_float(filter[0].w)); - // printf("filter1={ %f , %f , %f , %f } \n", - // convert_float(filter[1].x), - // convert_float(filter[1].y), - // convert_float(filter[1].z), - // convert_float(filter[1].w)); - // printf("filter2={ %f , %f , %f , %f } \n", - // convert_float(filter[2].x), - // convert_float(filter[2].y), - // convert_float(filter[2].z), - // convert_float(filter[2].w)); - // printf("filter3={ %f , %f , %f , %f } \n", - // convert_float(filter[3].x), - // convert_float(filter[3].y), - // convert_float(filter[3].z), - // convert_float(filter[3].w)); - // printf("output={ %f , %f , %f , %f } \n", - // convert_float(output.x), - // convert_float(output.y), - // convert_float(output.z), - // convert_float(output.w)); - // } - } - } - } - -#ifdef BATCH_NORM - output = output * READ_IMG_TYPE( - CL_DTYPE_CHAR, new_scale, SAMPLER, (int2)(out_c, 0)) + - READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, SAMPLER, (int2)(out_c, 0)); -#endif - - CL_DTYPE4 alpha0; -#ifdef PRELU_CH //{ - alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(out_c, 0)); -//} -#elif defined(PRELU_ELE) //{ - alpha0 = READ_IMG_TYPE( - CL_DTYPE_CHAR, - prelu_alpha, - SAMPLER, - (int2)(out_c * global_size_dim1 + out_w, out_nh % output_height)); -//} -#elif defined(PRELU_ALL) //{ - alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0)); - alpha0.y = alpha0.x; - alpha0.z = alpha0.x; - alpha0.w = alpha0.x; -//} -#endif - output = activation_type4(output, alpha0); - -#ifdef SCALE_ACTIVATION - output = fuse_scale(output, 1.f, 0.f, 0.f); -#endif - - WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output); -} diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_5x5_opt_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_5x5_opt_kernel.cl deleted file mode 100644 index b785ef48af4..00000000000 --- a/lite/backends/opencl/cl_kernel/image/conv2d_5x5_opt_kernel.cl +++ /dev/null @@ -1,635 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include - -// opt version of conv5x5 -__kernel void conv2d_5x5_opt(__private const int item_ch, - __private const int item_w, - __private const int item_h, - __read_only image2d_t input_image, - __read_only image2d_t filter_image, - __read_only image2d_t bias, - __write_only image2d_t output_image, - __private const int stride, - __private const int pad, - __private const int dilation, - __private const int batch, - __private const int in_ch, - __private const int in_w, - __private const int in_h, - __private const int out_w, - __private const int out_h, - __read_only image2d_t prelu_alpha) { - // filter - const int filter_w = 5; - const int filter_h = 5; - - // item_id - const int item_ch_id = get_global_id(0); - const int item_w_id = get_global_id(1); - const int item_h_id = get_global_id(2); - if (item_ch_id >= item_ch || item_w_id >= item_w || item_h_id >= item_h) { - return; - } - - // out_width_id_per_blk and out_batch_id - int out_w_base_id = item_ch_id * out_w; - int out_w_id0 = item_w_id; - int out_w_id1 = out_w_id0 + item_w; - int out_w_id2 = out_w_id1 + item_w; - int out_w_id3 = out_w_id2 + item_w; - int out_w_id4 = out_w_id3 + item_w; - - // in_width_id_per_blk and in_height_id_per_batch - int in_h_id = (item_h_id % out_h) * stride - pad; - int in_w_id0 = item_w_id * stride - pad; - int in_w_id1 = in_w_id0 + item_w * stride; - int in_w_id2 = in_w_id1 + item_w * stride; - int in_w_id3 = in_w_id2 + item_w * stride; - int in_w_id4 = in_w_id3 + item_w * stride; - -#ifdef BIASE_CH - - CL_DTYPE4 output[5]; - output[0] = - READ_IMG_TYPE(CL_DTYPE_CHAR, bias, SAMPLER, (int2)(item_ch_id, 0)); - output[1] = output[0]; - output[2] = output[0]; - output[3] = output[0]; - output[4] = output[0]; - -#elif defined(BIASE_ELE) - - CL_DTYPE4 output[5]; - output[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, - bias, - SAMPLER, - (int2)(out_w_base_id + out_w_id0, item_h_id)); - if (out_w_id1 < out_w) { - output[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, - bias, - SAMPLER, - (int2)(out_w_base_id + out_w_id1, item_h_id)); - } - if (out_w_id2 < out_w) { - output[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, - bias, - SAMPLER, - (int2)(out_w_base_id + out_w_id2, item_h_id)); - } - if (out_w_id3 < out_w) { - output[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, - bias, - SAMPLER, - (int2)(out_w_base_id + out_w_id3, item_h_id)); - } - if (out_w_id4 < out_w) { - output[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, - bias, - SAMPLER, - (int2)(out_w_base_id + out_w_id4, item_h_id)); - } -#else - CL_DTYPE4 output[5] = {0.0f}; -#endif - - CL_DTYPE4 filter[4] = {0.0f}; - CL_DTYPE4 filter_trans[4] = {0.0f}; - CL_DTYPE4 input[5] = {0.0f}; - - int filter_h_val0 = item_ch_id * 4 * filter_h; - int filter_h_val1 = filter_h_val0 + filter_h; - int filter_h_val2 = filter_h_val1 + filter_h; - int filter_h_val3 = filter_h_val2 + filter_h; - - for (int ch = 0; ch < (in_ch + 3) / 4; ch++) { - int ch_surplus = (ch + 1) * 4 - in_ch > 0 ? (ch + 1) * 4 - in_ch : 0; - - const int in_w_base_id = mul24(ch, in_w); - - int filter_w_val = ch * filter_w; - - for (int h = 0; h < filter_h; h++) { - int in_h_val = - select(in_h_id + h, -1, (in_h_id + h < 0 || in_h_id + h >= in_h)); - - for (int w = 0; w < filter_w; w++) { - int in_w_val0 = select(in_w_base_id + in_w_id0 + w, - -1, - (in_w_id0 + w < 0 || in_w_id0 + w >= in_w)); - int in_w_val1 = select(in_w_base_id + in_w_id1 + w, - -1, - (in_w_id1 + w < 0 || in_w_id1 + w >= in_w)); - int in_w_val2 = select(in_w_base_id + in_w_id2 + w, - -1, - (in_w_id2 + w < 0 || in_w_id2 + w >= in_w)); - int in_w_val3 = select(in_w_base_id + in_w_id3 + w, - -1, - (in_w_id3 + w < 0 || in_w_id3 + w >= in_w)); - int in_w_val4 = select(in_w_base_id + in_w_id4 + w, - -1, - (in_w_id4 + w < 0 || in_w_id4 + w >= in_w)); - - filter[0] = - READ_IMG_TYPE(CL_DTYPE_CHAR, - filter_image, - SAMPLER, - (int2)(filter_w_val + w, - filter_h_val0 + h)); // in_ch:0-3,out_ch:0 - filter[1] = - READ_IMG_TYPE(CL_DTYPE_CHAR, - filter_image, - SAMPLER, - (int2)(filter_w_val + w, - filter_h_val1 + h)); // in_ch:0-3,out_ch:1 - filter[2] = - READ_IMG_TYPE(CL_DTYPE_CHAR, - filter_image, - SAMPLER, - (int2)(filter_w_val + w, - filter_h_val2 + h)); // in_ch:0-3,out_ch:2 - filter[3] = - READ_IMG_TYPE(CL_DTYPE_CHAR, - filter_image, - SAMPLER, - (int2)(filter_w_val + w, - filter_h_val3 + h)); // in_ch:0-3,out_ch:3 - - filter_trans[0] = (CL_DTYPE4)(filter[0].x, - filter[1].x, - filter[2].x, - filter[3].x); // in_ch:0,out_ch:0-3 - filter_trans[1] = (CL_DTYPE4)(filter[0].y, - filter[1].y, - filter[2].y, - filter[3].y); // in_ch:1,out_ch:0-3 - filter_trans[2] = (CL_DTYPE4)(filter[0].z, - filter[1].z, - filter[2].z, - filter[3].z); // in_ch:2,out_ch:0-3 - filter_trans[3] = (CL_DTYPE4)(filter[0].w, - filter[1].w, - filter[2].w, - filter[3].w); // in_ch:3,out_ch:0-3 - - input[0] = READ_IMG_TYPE( - CL_DTYPE_CHAR, input_image, SAMPLER, (int2)(in_w_val0, in_h_val)); - input[1] = READ_IMG_TYPE( - CL_DTYPE_CHAR, input_image, SAMPLER, (int2)(in_w_val1, in_h_val)); - input[2] = READ_IMG_TYPE( - CL_DTYPE_CHAR, input_image, SAMPLER, (int2)(in_w_val2, in_h_val)); - input[3] = READ_IMG_TYPE( - CL_DTYPE_CHAR, input_image, SAMPLER, (int2)(in_w_val3, in_h_val)); - input[4] = READ_IMG_TYPE( - CL_DTYPE_CHAR, input_image, SAMPLER, (int2)(in_w_val4, in_h_val)); - - output[0] = mad(input[0].x, filter_trans[0], output[0]); - output[1] = mad(input[1].x, filter_trans[0], output[1]); - output[2] = mad(input[2].x, filter_trans[0], output[2]); - output[3] = mad(input[3].x, filter_trans[0], output[3]); - output[4] = mad(input[4].x, filter_trans[0], output[4]); - - if (ch_surplus < 3) { - output[0] = mad(input[0].y, filter_trans[1], output[0]); - output[1] = mad(input[1].y, filter_trans[1], output[1]); - output[2] = mad(input[2].y, filter_trans[1], output[2]); - output[3] = mad(input[3].y, filter_trans[1], output[3]); - output[4] = mad(input[4].y, filter_trans[1], output[4]); - } - if (ch_surplus < 2) { - output[0] = mad(input[0].z, filter_trans[2], output[0]); - output[1] = mad(input[1].z, filter_trans[2], output[1]); - output[2] = mad(input[2].z, filter_trans[2], output[2]); - output[3] = mad(input[3].z, filter_trans[2], output[3]); - output[4] = mad(input[4].z, filter_trans[2], output[4]); - } - if (ch_surplus < 1) { - output[0] = mad(input[0].w, filter_trans[3], output[0]); - output[1] = mad(input[1].w, filter_trans[3], output[1]); - output[2] = mad(input[2].w, filter_trans[3], output[2]); - output[3] = mad(input[3].w, filter_trans[3], output[3]); - output[4] = mad(input[4].w, filter_trans[3], output[4]); - } - } - } - } - - CL_DTYPE4 alpha[5]; -#ifdef PRELU_CH //{ - alpha[0] = - READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(item_ch_id, 0)); - alpha[1] = alpha[0]; - alpha[2] = alpha[0]; - alpha[3] = alpha[0]; - alpha[4] = alpha[0]; -//} -#elif defined(PRELU_ELE) //{ - alpha[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, - prelu_alpha, - SAMPLER, - (int2)(out_w_base_id + out_w_id0, item_h_id)); - if (out_w_id1 < out_w) { - alpha[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, - prelu_alpha, - SAMPLER, - (int2)(out_w_base_id + out_w_id1, item_h_id)); - } - if (out_w_id2 < out_w) { - alpha[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, - prelu_alpha, - SAMPLER, - (int2)(out_w_base_id + out_w_id2, item_h_id)); - } - if (out_w_id3 < out_w) { - alpha[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, - prelu_alpha, - SAMPLER, - (int2)(out_w_base_id + out_w_id3, item_h_id)); - } - if (out_w_id4 < out_w) { - alpha[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, - prelu_alpha, - SAMPLER, - (int2)(out_w_base_id + out_w_id4, item_h_id)); - } -//} -#elif defined(PRELU_ALL) //{ - alpha[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0)); - alpha[0].y = alpha[0].x; - alpha[0].z = alpha[0].x; - alpha[0].w = alpha[0].x; - alpha[1] = alpha[0]; - alpha[2] = alpha[0]; - alpha[3] = alpha[0]; - alpha[4] = alpha[0]; -//} -#endif - output[0] = activation_type4(output[0], alpha[0]); - output[1] = activation_type4(output[1], alpha[1]); - output[2] = activation_type4(output[2], alpha[2]); - output[3] = activation_type4(output[3], alpha[3]); - output[4] = activation_type4(output[4], alpha[4]); - -#ifdef SCALE_ACTIVATION - output[0] = fuse_scale(output[0], 1.f, 0.f, 0.f); - output[1] = fuse_scale(output[1], 1.f, 0.f, 0.f); - output[2] = fuse_scale(output[2], 1.f, 0.f, 0.f); - output[3] = fuse_scale(output[3], 1.f, 0.f, 0.f); - output[4] = fuse_scale(output[4], 1.f, 0.f, 0.f); -#endif - - WRITE_IMG_TYPE(CL_DTYPE_CHAR, - output_image, - (int2)(out_w_base_id + out_w_id0, item_h_id), - output[0]); - if (out_w_id1 < out_w) { - WRITE_IMG_TYPE(CL_DTYPE_CHAR, - output_image, - (int2)(out_w_base_id + out_w_id1, item_h_id), - output[1]); - } - if (out_w_id2 < out_w) { - WRITE_IMG_TYPE(CL_DTYPE_CHAR, - output_image, - (int2)(out_w_base_id + out_w_id2, item_h_id), - output[2]); - } - if (out_w_id3 < out_w) { - WRITE_IMG_TYPE(CL_DTYPE_CHAR, - output_image, - (int2)(out_w_base_id + out_w_id3, item_h_id), - output[3]); - } - if (out_w_id4 < out_w) { - WRITE_IMG_TYPE(CL_DTYPE_CHAR, - output_image, - (int2)(out_w_base_id + out_w_id4, item_h_id), - output[4]); - } -} -// support batch > 1 -__kernel void conv2d_5x5_multi_batch(__private const int item_ch, - __private const int item_w, - __private const int item_h, - __read_only image2d_t input_image, - __read_only image2d_t filter_image, - __read_only image2d_t bias, - __write_only image2d_t output_image, - __private const int stride, - __private const int pad, - __private const int dilation, - __private const int batch, - __private const int in_ch, - __private const int in_w, - __private const int in_h, - __private const int out_w, - __private const int out_h, - __read_only image2d_t prelu_alpha) { - // filter - const int filter_w = 5; - const int filter_h = 5; - - // item_id - const int item_ch_id = get_global_id(0); - const int item_w_id = get_global_id(1); - const int item_h_id = get_global_id(2); - if (item_ch_id >= item_ch || item_w_id >= item_w || item_h_id >= item_h) { - return; - } - - // out_width_id_per_blk and out_batch_id - int out_batch_id = item_h_id / out_h; - int out_w_base_id = item_ch_id * out_w; - int out_w_id0 = item_w_id; - int out_w_id1 = out_w_id0 + item_w; - int out_w_id2 = out_w_id1 + item_w; - int out_w_id3 = out_w_id2 + item_w; - int out_w_id4 = out_w_id3 + item_w; - - // in_width_id_per_blk and in_height_id_per_batch - int in_h_id = (item_h_id % out_h) * stride - pad; - int in_w_id0 = item_w_id * stride - pad; - int in_w_id1 = in_w_id0 + item_w * stride; - int in_w_id2 = in_w_id1 + item_w * stride; - int in_w_id3 = in_w_id2 + item_w * stride; - int in_w_id4 = in_w_id3 + item_w * stride; - -#ifdef BIASE_CH - - CL_DTYPE4 output[5]; - output[0] = - READ_IMG_TYPE(CL_DTYPE_CHAR, bias, SAMPLER, (int2)(item_ch_id, 0)); - output[1] = output[0]; - output[2] = output[0]; - output[3] = output[0]; - output[4] = output[0]; - -#elif defined(BIASE_ELE) - - CL_DTYPE4 output[5]; - output[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, - bias, - SAMPLER, - (int2)(out_w_base_id + out_w_id0, item_h_id)); - if (out_w_id1 < out_w) { - output[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, - bias, - SAMPLER, - (int2)(out_w_base_id + out_w_id1, item_h_id)); - } - if (out_w_id2 < out_w) { - output[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, - bias, - SAMPLER, - (int2)(out_w_base_id + out_w_id2, item_h_id)); - } - if (out_w_id3 < out_w) { - output[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, - bias, - SAMPLER, - (int2)(out_w_base_id + out_w_id3, item_h_id)); - } - if (out_w_id4 < out_w) { - output[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, - bias, - SAMPLER, - (int2)(out_w_base_id + out_w_id4, item_h_id)); - } -#else - CL_DTYPE4 output[5] = {0.0f}; -#endif - - CL_DTYPE4 filter[4] = {0.0f}; - CL_DTYPE4 filter_trans[4] = {0.0f}; - CL_DTYPE4 input[5] = {0.0f}; - - int filter_h_val0 = item_ch_id * 4 * filter_h; - int filter_h_val1 = filter_h_val0 + filter_h; - int filter_h_val2 = filter_h_val1 + filter_h; - int filter_h_val3 = filter_h_val2 + filter_h; - - for (int ch = 0; ch < (in_ch + 3) / 4; ch++) { - int ch_surplus = (ch + 1) * 4 - in_ch > 0 ? (ch + 1) * 4 - in_ch : 0; - - const int in_w_base_id = mul24(ch, in_w); - - int filter_w_val = ch * filter_w; - - for (int h = 0; h < filter_h; h++) { - int in_h_val = select( - out_batch_id * in_h + in_h_id + h, - -1, - (out_batch_id * in_h + in_h_id + h < out_batch_id * in_h || - out_batch_id * in_h + in_h_id + h >= (out_batch_id + 1) * in_h)); - - for (int w = 0; w < filter_w; w++) { - int in_w_val0 = select(in_w_base_id + in_w_id0 + w, - -1, - (in_w_id0 + w < 0 || in_w_id0 + w >= in_w)); - int in_w_val1 = select(in_w_base_id + in_w_id1 + w, - -1, - (in_w_id1 + w < 0 || in_w_id1 + w >= in_w)); - int in_w_val2 = select(in_w_base_id + in_w_id2 + w, - -1, - (in_w_id2 + w < 0 || in_w_id2 + w >= in_w)); - int in_w_val3 = select(in_w_base_id + in_w_id3 + w, - -1, - (in_w_id3 + w < 0 || in_w_id3 + w >= in_w)); - int in_w_val4 = select(in_w_base_id + in_w_id4 + w, - -1, - (in_w_id4 + w < 0 || in_w_id4 + w >= in_w)); - - filter[0] = - READ_IMG_TYPE(CL_DTYPE_CHAR, - filter_image, - SAMPLER, - (int2)(filter_w_val + w, - filter_h_val0 + h)); // in_ch:0-3,out_ch:0 - filter[1] = - READ_IMG_TYPE(CL_DTYPE_CHAR, - filter_image, - SAMPLER, - (int2)(filter_w_val + w, - filter_h_val1 + h)); // in_ch:0-3,out_ch:1 - filter[2] = - READ_IMG_TYPE(CL_DTYPE_CHAR, - filter_image, - SAMPLER, - (int2)(filter_w_val + w, - filter_h_val2 + h)); // in_ch:0-3,out_ch:2 - filter[3] = - READ_IMG_TYPE(CL_DTYPE_CHAR, - filter_image, - SAMPLER, - (int2)(filter_w_val + w, - filter_h_val3 + h)); // in_ch:0-3,out_ch:3 - - filter_trans[0] = (CL_DTYPE4)(filter[0].x, - filter[1].x, - filter[2].x, - filter[3].x); // in_ch:0,out_ch:0-3 - filter_trans[1] = (CL_DTYPE4)(filter[0].y, - filter[1].y, - filter[2].y, - filter[3].y); // in_ch:1,out_ch:0-3 - filter_trans[2] = (CL_DTYPE4)(filter[0].z, - filter[1].z, - filter[2].z, - filter[3].z); // in_ch:2,out_ch:0-3 - filter_trans[3] = (CL_DTYPE4)(filter[0].w, - filter[1].w, - filter[2].w, - filter[3].w); // in_ch:3,out_ch:0-3 - - input[0] = READ_IMG_TYPE( - CL_DTYPE_CHAR, input_image, SAMPLER, (int2)(in_w_val0, in_h_val)); - input[1] = READ_IMG_TYPE( - CL_DTYPE_CHAR, input_image, SAMPLER, (int2)(in_w_val1, in_h_val)); - input[2] = READ_IMG_TYPE( - CL_DTYPE_CHAR, input_image, SAMPLER, (int2)(in_w_val2, in_h_val)); - input[3] = READ_IMG_TYPE( - CL_DTYPE_CHAR, input_image, SAMPLER, (int2)(in_w_val3, in_h_val)); - input[4] = READ_IMG_TYPE( - CL_DTYPE_CHAR, input_image, SAMPLER, (int2)(in_w_val4, in_h_val)); - - output[0] = mad(input[0].x, filter_trans[0], output[0]); - output[1] = mad(input[1].x, filter_trans[0], output[1]); - output[2] = mad(input[2].x, filter_trans[0], output[2]); - output[3] = mad(input[3].x, filter_trans[0], output[3]); - output[4] = mad(input[4].x, filter_trans[0], output[4]); - - if (ch_surplus < 3) { - output[0] = mad(input[0].y, filter_trans[1], output[0]); - output[1] = mad(input[1].y, filter_trans[1], output[1]); - output[2] = mad(input[2].y, filter_trans[1], output[2]); - output[3] = mad(input[3].y, filter_trans[1], output[3]); - output[4] = mad(input[4].y, filter_trans[1], output[4]); - } - if (ch_surplus < 2) { - output[0] = mad(input[0].z, filter_trans[2], output[0]); - output[1] = mad(input[1].z, filter_trans[2], output[1]); - output[2] = mad(input[2].z, filter_trans[2], output[2]); - output[3] = mad(input[3].z, filter_trans[2], output[3]); - output[4] = mad(input[4].z, filter_trans[2], output[4]); - } - if (ch_surplus < 1) { - output[0] = mad(input[0].w, filter_trans[3], output[0]); - output[1] = mad(input[1].w, filter_trans[3], output[1]); - output[2] = mad(input[2].w, filter_trans[3], output[2]); - output[3] = mad(input[3].w, filter_trans[3], output[3]); - output[4] = mad(input[4].w, filter_trans[3], output[4]); - } - } - } - } - - CL_DTYPE4 alpha[5]; -#ifdef PRELU_CH //{ - alpha[0] = - READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(item_ch_id, 0)); - alpha[1] = alpha[0]; - alpha[2] = alpha[0]; - alpha[3] = alpha[0]; - alpha[4] = alpha[0]; -//} -#elif defined(PRELU_ELE) //{ - alpha[0] = - READ_IMG_TYPE(CL_DTYPE_CHAR, - prelu_alpha, - SAMPLER, - (int2)(out_w_base_id + out_w_id0, item_h_id % out_h)); - if (out_w_id1 < out_w) { - alpha[1] = - READ_IMG_TYPE(CL_DTYPE_CHAR, - prelu_alpha, - SAMPLER, - (int2)(out_w_base_id + out_w_id1, item_h_id % out_h)); - } - if (out_w_id2 < out_w) { - alpha[2] = - READ_IMG_TYPE(CL_DTYPE_CHAR, - prelu_alpha, - SAMPLER, - (int2)(out_w_base_id + out_w_id2, item_h_id % out_h)); - } - if (out_w_id3 < out_w) { - alpha[3] = - READ_IMG_TYPE(CL_DTYPE_CHAR, - prelu_alpha, - SAMPLER, - (int2)(out_w_base_id + out_w_id3, item_h_id % out_h)); - } - if (out_w_id4 < out_w) { - alpha[4] = - READ_IMG_TYPE(CL_DTYPE_CHAR, - prelu_alpha, - SAMPLER, - (int2)(out_w_base_id + out_w_id4, item_h_id % out_h)); - } -//} -#elif defined(PRELU_ALL) //{ - alpha[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0)); - alpha[0].y = alpha[0].x; - alpha[0].z = alpha[0].x; - alpha[0].w = alpha[0].x; - alpha[1] = alpha[0]; - alpha[2] = alpha[0]; - alpha[3] = alpha[0]; - alpha[4] = alpha[0]; -//} -#endif - output[0] = activation_type4(output[0], alpha[0]); - output[1] = activation_type4(output[1], alpha[1]); - output[2] = activation_type4(output[2], alpha[2]); - output[3] = activation_type4(output[3], alpha[3]); - output[4] = activation_type4(output[4], alpha[4]); - -#ifdef SCALE_ACTIVATION - output[0] = fuse_scale(output[0], 1.f, 0.f, 0.f); - output[1] = fuse_scale(output[1], 1.f, 0.f, 0.f); - output[2] = fuse_scale(output[2], 1.f, 0.f, 0.f); - output[3] = fuse_scale(output[3], 1.f, 0.f, 0.f); - output[4] = fuse_scale(output[4], 1.f, 0.f, 0.f); -#endif - - WRITE_IMG_TYPE(CL_DTYPE_CHAR, - output_image, - (int2)(out_w_base_id + out_w_id0, item_h_id), - output[0]); - if (out_w_id1 < out_w) { - WRITE_IMG_TYPE(CL_DTYPE_CHAR, - output_image, - (int2)(out_w_base_id + out_w_id1, item_h_id), - output[1]); - } - if (out_w_id2 < out_w) { - WRITE_IMG_TYPE(CL_DTYPE_CHAR, - output_image, - (int2)(out_w_base_id + out_w_id2, item_h_id), - output[2]); - } - if (out_w_id3 < out_w) { - WRITE_IMG_TYPE(CL_DTYPE_CHAR, - output_image, - (int2)(out_w_base_id + out_w_id3, item_h_id), - output[3]); - } - if (out_w_id4 < out_w) { - WRITE_IMG_TYPE(CL_DTYPE_CHAR, - output_image, - (int2)(out_w_base_id + out_w_id4, item_h_id), - output[4]); - } -} diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_7x7_opt_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_7x7_opt_kernel.cl deleted file mode 100644 index 3a0837c5ec8..00000000000 --- a/lite/backends/opencl/cl_kernel/image/conv2d_7x7_opt_kernel.cl +++ /dev/null @@ -1,662 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include - -// opt version of con7x7 -__kernel void conv2d_7x7_opt(__private const int item_ch, - __private const int item_w, - __private const int item_h, - __read_only image2d_t input_image, - __read_only image2d_t filter_image, - __read_only image2d_t bias, - __write_only image2d_t output_image, - __private const int stride, - __private const int pad, - __private const int dilation, - __private const int batch, - __private const int in_ch, - __private const int in_w, - __private const int in_h, - __private const int out_w, - __private const int out_h, - __read_only image2d_t prelu_alpha) { - // filter - const int filter_w = 7; - const int filter_h = 7; - - // item_id - const int item_ch_id = get_global_id(0); - const int item_w_id = get_global_id(1); - const int item_h_id = get_global_id(2); - if (item_ch_id >= item_ch || item_w_id >= item_w || item_h_id >= item_h) { - return; - } - - // out_width_id_per_blk and out_batch_id - int out_w_base_id = item_ch_id * out_w; - int out_w_id0 = item_w_id; - int out_w_id1 = out_w_id0 + item_w; - int out_w_id2 = out_w_id1 + item_w; - int out_w_id3 = out_w_id2 + item_w; - int out_w_id4 = out_w_id3 + item_w; - - // in_width_id_per_blk and in_height_id_per_batch - int in_h_id = (item_h_id % out_h) * stride - pad; - int in_w_id0 = item_w_id * stride - pad; - int in_w_id1 = in_w_id0 + item_w * stride; - int in_w_id2 = in_w_id1 + item_w * stride; - int in_w_id3 = in_w_id2 + item_w * stride; - int in_w_id4 = in_w_id3 + item_w * stride; - -#ifdef BIASE_CH - CL_DTYPE4 output[5]; - output[0] = - READ_IMG_TYPE(CL_DTYPE_CHAR, bias, SAMPLER, (int2)(item_ch_id, 0)); - output[1] = output[0]; - output[2] = output[0]; - output[3] = output[0]; - output[4] = output[0]; -#else - CL_DTYPE4 output[5] = {0.0f}; -#endif - - CL_DTYPE4 filter[4] = {0.0f}; - CL_DTYPE4 input[5] = {0.0f}; - - int filter_h_val0 = item_ch_id * 4 * filter_h; - int filter_h_val1 = filter_h_val0 + filter_h; - int filter_h_val2 = filter_h_val1 + filter_h; - int filter_h_val3 = filter_h_val2 + filter_h; - - for (int ch = 0; ch < (in_ch + 3) / 4; ch++) { - int ch_surplus = (ch + 1) * 4 - in_ch > 0 ? (ch + 1) * 4 - in_ch : 0; - - const int in_w_base_id = mul24(ch, in_w); - - int filter_w_val = ch << 2; - int filter_h_val = mul24(item_ch_id, 49); - for (int h = 0; h < filter_h; h++) { - int in_h_val = - select(in_h_id + h, -1, (in_h_id + h < 0 || in_h_id + h >= in_h)); - - for (int w = 0; w < filter_w; w++) { - int in_w_val0 = select(in_w_base_id + in_w_id0 + w, - -1, - (in_w_id0 + w < 0 || in_w_id0 + w >= in_w)); - int in_w_val1 = select(in_w_base_id + in_w_id1 + w, - -1, - (in_w_id1 + w < 0 || in_w_id1 + w >= in_w)); - int in_w_val2 = select(in_w_base_id + in_w_id2 + w, - -1, - (in_w_id2 + w < 0 || in_w_id2 + w >= in_w)); - int in_w_val3 = select(in_w_base_id + in_w_id3 + w, - -1, - (in_w_id3 + w < 0 || in_w_id3 + w >= in_w)); - int in_w_val4 = select(in_w_base_id + in_w_id4 + w, - -1, - (in_w_id4 + w < 0 || in_w_id4 + w >= in_w)); - - filter[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, - filter_image, - SAMPLER, - (int2)(filter_w_val, filter_h_val)); - filter[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, - filter_image, - SAMPLER, - (int2)(filter_w_val + 1, filter_h_val)); - filter[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, - filter_image, - SAMPLER, - (int2)(filter_w_val + 2, filter_h_val)); - filter[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, - filter_image, - SAMPLER, - (int2)(filter_w_val + 3, filter_h_val++)); - - input[0] = READ_IMG_TYPE( - CL_DTYPE_CHAR, input_image, SAMPLER, (int2)(in_w_val0, in_h_val)); - input[1] = READ_IMG_TYPE( - CL_DTYPE_CHAR, input_image, SAMPLER, (int2)(in_w_val1, in_h_val)); - input[2] = READ_IMG_TYPE( - CL_DTYPE_CHAR, input_image, SAMPLER, (int2)(in_w_val2, in_h_val)); - input[3] = READ_IMG_TYPE( - CL_DTYPE_CHAR, input_image, SAMPLER, (int2)(in_w_val3, in_h_val)); - input[4] = READ_IMG_TYPE( - CL_DTYPE_CHAR, input_image, SAMPLER, (int2)(in_w_val4, in_h_val)); - - output[0] = mad(input[0].x, filter[0], output[0]); - output[1] = mad(input[1].x, filter[0], output[1]); - output[2] = mad(input[2].x, filter[0], output[2]); - output[3] = mad(input[3].x, filter[0], output[3]); - output[4] = mad(input[4].x, filter[0], output[4]); - - if (ch_surplus < 3) { - output[0] = mad(input[0].y, filter[1], output[0]); - output[1] = mad(input[1].y, filter[1], output[1]); - output[2] = mad(input[2].y, filter[1], output[2]); - output[3] = mad(input[3].y, filter[1], output[3]); - output[4] = mad(input[4].y, filter[1], output[4]); - } - if (ch_surplus < 2) { - output[0] = mad(input[0].z, filter[2], output[0]); - output[1] = mad(input[1].z, filter[2], output[1]); - output[2] = mad(input[2].z, filter[2], output[2]); - output[3] = mad(input[3].z, filter[2], output[3]); - output[4] = mad(input[4].z, filter[2], output[4]); - } - if (ch_surplus < 1) { - output[0] = mad(input[0].w, filter[3], output[0]); - output[1] = mad(input[1].w, filter[3], output[1]); - output[2] = mad(input[2].w, filter[3], output[2]); - output[3] = mad(input[3].w, filter[3], output[3]); - output[4] = mad(input[4].w, filter[3], output[4]); - } - } - } - } - - CL_DTYPE4 alpha[5]; -#ifdef PRELU_CH //{ - alpha[0] = - READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(item_ch_id, 0)); - alpha[1] = alpha[0]; - alpha[2] = alpha[0]; - alpha[3] = alpha[0]; - alpha[4] = alpha[0]; -//} -#elif defined(PRELU_ELE) //{ - alpha[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, - prelu_alpha, - SAMPLER, - (int2)(out_w_base_id + out_w_id0, item_h_id)); - if (out_w_id1 < out_w) { - alpha[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, - prelu_alpha, - SAMPLER, - (int2)(out_w_base_id + out_w_id1, item_h_id)); - } - if (out_w_id2 < out_w) { - alpha[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, - prelu_alpha, - SAMPLER, - (int2)(out_w_base_id + out_w_id2, item_h_id)); - } - if (out_w_id3 < out_w) { - alpha[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, - prelu_alpha, - SAMPLER, - (int2)(out_w_base_id + out_w_id3, item_h_id)); - } - if (out_w_id4 < out_w) { - alpha[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, - prelu_alpha, - SAMPLER, - (int2)(out_w_base_id + out_w_id4, item_h_id)); - } -//} -#elif defined(PRELU_ALL) //{ - alpha[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0)); - alpha[0].y = alpha[0].x; - alpha[0].z = alpha[0].x; - alpha[0].w = alpha[0].x; - alpha[1] = alpha[0]; - alpha[2] = alpha[0]; - alpha[3] = alpha[0]; - alpha[4] = alpha[0]; -//} -#endif - output[0] = activation_type4(output[0], alpha[0]); - output[1] = activation_type4(output[1], alpha[1]); - output[2] = activation_type4(output[2], alpha[2]); - output[3] = activation_type4(output[3], alpha[3]); - output[4] = activation_type4(output[4], alpha[4]); - -#ifdef SCALE_ACTIVATION - output[0] = fuse_scale(output[0], 1.f, 0.f, 0.f); - output[1] = fuse_scale(output[1], 1.f, 0.f, 0.f); - output[2] = fuse_scale(output[2], 1.f, 0.f, 0.f); - output[3] = fuse_scale(output[3], 1.f, 0.f, 0.f); - output[4] = fuse_scale(output[4], 1.f, 0.f, 0.f); -#endif - - WRITE_IMG_TYPE(CL_DTYPE_CHAR, - output_image, - (int2)(out_w_base_id + out_w_id0, item_h_id), - output[0]); - if (out_w_id1 < out_w) { - WRITE_IMG_TYPE(CL_DTYPE_CHAR, - output_image, - (int2)(out_w_base_id + out_w_id1, item_h_id), - output[1]); - } - if (out_w_id2 < out_w) { - WRITE_IMG_TYPE(CL_DTYPE_CHAR, - output_image, - (int2)(out_w_base_id + out_w_id2, item_h_id), - output[2]); - } - if (out_w_id3 < out_w) { - WRITE_IMG_TYPE(CL_DTYPE_CHAR, - output_image, - (int2)(out_w_base_id + out_w_id3, item_h_id), - output[3]); - } - if (out_w_id4 < out_w) { - WRITE_IMG_TYPE(CL_DTYPE_CHAR, - output_image, - (int2)(out_w_base_id + out_w_id4, item_h_id), - output[4]); - } -} -// support batch > 1 -__kernel void conv2d_7x7_multi_batch(__private const int item_ch, - __private const int item_w, - __private const int item_h, - __read_only image2d_t input_image, - __read_only image2d_t filter_image, - __read_only image2d_t bias, - __write_only image2d_t output_image, - __private const int stride, - __private const int pad, - __private const int dilation, - __private const int batch, - __private const int in_ch, - __private const int in_w, - __private const int in_h, - __private const int out_w, - __private const int out_h, - __read_only image2d_t prelu_alpha) { - // filter - const int filter_w = 7; - const int filter_h = 7; - - // item_id - const int item_ch_id = get_global_id(0); - const int item_w_id = get_global_id(1); - const int item_h_id = get_global_id(2); - if (item_ch_id >= item_ch || item_w_id >= item_w || item_h_id >= item_h) { - return; - } - - // out_width_id_per_blk and out_batch_id - int out_batch_id = item_h_id / out_h; - - int out_w_base_id = item_ch_id * out_w; - int out_w_id0 = item_w_id; - int out_w_id1 = out_w_id0 + item_w; - int out_w_id2 = out_w_id1 + item_w; - int out_w_id3 = out_w_id2 + item_w; - int out_w_id4 = out_w_id3 + item_w; - - // in_width_id_per_blk and in_height_id_per_batch - int in_h_id = (item_h_id % out_h) * stride - pad; - int in_w_id0 = item_w_id * stride - pad; - int in_w_id1 = in_w_id0 + item_w * stride; - int in_w_id2 = in_w_id1 + item_w * stride; - int in_w_id3 = in_w_id2 + item_w * stride; - int in_w_id4 = in_w_id3 + item_w * stride; - -#ifdef BIASE_CH - - CL_DTYPE4 output[5]; - output[0] = - READ_IMG_TYPE(CL_DTYPE_CHAR, bias, SAMPLER, (int2)(item_ch_id, 0)); - output[1] = output[0]; - output[2] = output[0]; - output[3] = output[0]; - output[4] = output[0]; -#else - CL_DTYPE4 output[5] = {0.0f}; -#endif - - CL_DTYPE4 filter[4] = {0.0f}; - CL_DTYPE4 filter_trans[4] = {0.0f}; - CL_DTYPE4 input[5] = {0.0f}; - - int filter_h_val0 = item_ch_id * 4 * filter_h; - int filter_h_val1 = filter_h_val0 + filter_h; - int filter_h_val2 = filter_h_val1 + filter_h; - int filter_h_val3 = filter_h_val2 + filter_h; - - for (int ch = 0; ch < (in_ch + 3) / 4; ch++) { - int ch_surplus = (ch + 1) * 4 - in_ch > 0 ? (ch + 1) * 4 - in_ch : 0; - - const int in_w_base_id = mul24(ch, in_w); - - int filter_w_val = ch << 2; - int filter_h_val = mul24(item_ch_id, 49); - - for (int h = 0; h < filter_h; h++) { - int in_h_val = select( - out_batch_id * in_h + in_h_id + h, - -1, - (out_batch_id * in_h + in_h_id + h < out_batch_id * in_h || - out_batch_id * in_h + in_h_id + h >= (out_batch_id + 1) * in_h)); - - for (int w = 0; w < filter_w; w++) { - int in_w_val0 = select(in_w_base_id + in_w_id0 + w, - -1, - (in_w_id0 + w < 0 || in_w_id0 + w >= in_w)); - int in_w_val1 = select(in_w_base_id + in_w_id1 + w, - -1, - (in_w_id1 + w < 0 || in_w_id1 + w >= in_w)); - int in_w_val2 = select(in_w_base_id + in_w_id2 + w, - -1, - (in_w_id2 + w < 0 || in_w_id2 + w >= in_w)); - int in_w_val3 = select(in_w_base_id + in_w_id3 + w, - -1, - (in_w_id3 + w < 0 || in_w_id3 + w >= in_w)); - int in_w_val4 = select(in_w_base_id + in_w_id4 + w, - -1, - (in_w_id4 + w < 0 || in_w_id4 + w >= in_w)); - - filter[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, - filter_image, - SAMPLER, - (int2)(filter_w_val, filter_h_val)); - filter[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, - filter_image, - SAMPLER, - (int2)(filter_w_val + 1, filter_h_val)); - filter[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, - filter_image, - SAMPLER, - (int2)(filter_w_val + 2, filter_h_val)); - filter[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, - filter_image, - SAMPLER, - (int2)(filter_w_val + 3, filter_h_val++)); - - input[0] = READ_IMG_TYPE( - CL_DTYPE_CHAR, input_image, SAMPLER, (int2)(in_w_val0, in_h_val)); - input[1] = READ_IMG_TYPE( - CL_DTYPE_CHAR, input_image, SAMPLER, (int2)(in_w_val1, in_h_val)); - input[2] = READ_IMG_TYPE( - CL_DTYPE_CHAR, input_image, SAMPLER, (int2)(in_w_val2, in_h_val)); - input[3] = READ_IMG_TYPE( - CL_DTYPE_CHAR, input_image, SAMPLER, (int2)(in_w_val3, in_h_val)); - input[4] = READ_IMG_TYPE( - CL_DTYPE_CHAR, input_image, SAMPLER, (int2)(in_w_val4, in_h_val)); - - output[0] = mad(input[0].x, filter[0], output[0]); - output[1] = mad(input[1].x, filter[0], output[1]); - output[2] = mad(input[2].x, filter[0], output[2]); - output[3] = mad(input[3].x, filter[0], output[3]); - output[4] = mad(input[4].x, filter[0], output[4]); - - if (ch_surplus < 3) { - output[0] = mad(input[0].y, filter[1], output[0]); - output[1] = mad(input[1].y, filter[1], output[1]); - output[2] = mad(input[2].y, filter[1], output[2]); - output[3] = mad(input[3].y, filter[1], output[3]); - output[4] = mad(input[4].y, filter[1], output[4]); - } - if (ch_surplus < 2) { - output[0] = mad(input[0].z, filter[2], output[0]); - output[1] = mad(input[1].z, filter[2], output[1]); - output[2] = mad(input[2].z, filter[2], output[2]); - output[3] = mad(input[3].z, filter[2], output[3]); - output[4] = mad(input[4].z, filter[2], output[4]); - } - if (ch_surplus < 1) { - output[0] = mad(input[0].w, filter[3], output[0]); - output[1] = mad(input[1].w, filter[3], output[1]); - output[2] = mad(input[2].w, filter[3], output[2]); - output[3] = mad(input[3].w, filter[3], output[3]); - output[4] = mad(input[4].w, filter[3], output[4]); - } - } - } - } - - CL_DTYPE4 alpha[5]; -#ifdef PRELU_CH //{ - alpha[0] = - READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(item_ch_id, 0)); - alpha[1] = alpha[0]; - alpha[2] = alpha[0]; - alpha[3] = alpha[0]; - alpha[4] = alpha[0]; -//} -#elif defined(PRELU_ELE) //{ - alpha[0] = - READ_IMG_TYPE(CL_DTYPE_CHAR, - prelu_alpha, - SAMPLER, - (int2)(out_w_base_id + out_w_id0, item_h_id % out_h)); - if (out_w_id1 < out_w) { - alpha[1] = - READ_IMG_TYPE(CL_DTYPE_CHAR, - prelu_alpha, - SAMPLER, - (int2)(out_w_base_id + out_w_id1, item_h_id % out_h)); - } - if (out_w_id2 < out_w) { - alpha[2] = - READ_IMG_TYPE(CL_DTYPE_CHAR, - prelu_alpha, - SAMPLER, - (int2)(out_w_base_id + out_w_id2, item_h_id % out_h)); - } - if (out_w_id3 < out_w) { - alpha[3] = - READ_IMG_TYPE(CL_DTYPE_CHAR, - prelu_alpha, - SAMPLER, - (int2)(out_w_base_id + out_w_id3, item_h_id % out_h)); - } - if (out_w_id4 < out_w) { - alpha[4] = - READ_IMG_TYPE(CL_DTYPE_CHAR, - prelu_alpha, - SAMPLER, - (int2)(out_w_base_id + out_w_id4, item_h_id % out_h)); - } -//} -#elif defined(PRELU_ALL) //{ - alpha[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0)); - alpha[0].y = alpha[0].x; - alpha[0].z = alpha[0].x; - alpha[0].w = alpha[0].x; - alpha[1] = alpha[0]; - alpha[2] = alpha[0]; - alpha[3] = alpha[0]; - alpha[4] = alpha[0]; -//} -#endif - output[0] = activation_type4(output[0], alpha[0]); - output[1] = activation_type4(output[1], alpha[1]); - output[2] = activation_type4(output[2], alpha[2]); - output[3] = activation_type4(output[3], alpha[3]); - output[4] = activation_type4(output[4], alpha[4]); - -#ifdef SCALE_ACTIVATION - output[0] = fuse_scale(output[0], 1.f, 0.f, 0.f); - output[1] = fuse_scale(output[1], 1.f, 0.f, 0.f); - output[2] = fuse_scale(output[2], 1.f, 0.f, 0.f); - output[3] = fuse_scale(output[3], 1.f, 0.f, 0.f); - output[4] = fuse_scale(output[4], 1.f, 0.f, 0.f); -#endif - - WRITE_IMG_TYPE(CL_DTYPE_CHAR, - output_image, - (int2)(out_w_base_id + out_w_id0, item_h_id), - output[0]); - if (out_w_id1 < out_w) { - WRITE_IMG_TYPE(CL_DTYPE_CHAR, - output_image, - (int2)(out_w_base_id + out_w_id1, item_h_id), - output[1]); - } - if (out_w_id2 < out_w) { - WRITE_IMG_TYPE(CL_DTYPE_CHAR, - output_image, - (int2)(out_w_base_id + out_w_id2, item_h_id), - output[2]); - } - if (out_w_id3 < out_w) { - WRITE_IMG_TYPE(CL_DTYPE_CHAR, - output_image, - (int2)(out_w_base_id + out_w_id3, item_h_id), - output[3]); - } - if (out_w_id4 < out_w) { - WRITE_IMG_TYPE(CL_DTYPE_CHAR, - output_image, - (int2)(out_w_base_id + out_w_id4, item_h_id), - output[4]); - } -} - -__kernel void conv2d_7x7_opt_mali(__private const int item_ch, - __private const int item_w, - __private const int item_h, - __read_only image2d_t input_image, - __global CL_DTYPE4 *filter_buf, - __global CL_DTYPE4 *bias, - __write_only image2d_t output_image, - __private const int stride, - __private const int pad, - __private const int dilation, - __private const int batch, - __private const int in_ch, - __private const int in_w, - __private const int in_h, - __private const int out_w, - __private const int out_h, - __read_only image2d_t prelu_alpha) { - // filter - const int filter_w = 7; - const int filter_h = 7; - - // item_id - const int item_ch_id = get_global_id(0); - const int item_w_id = 2 * get_global_id(1); - const int item_h_id = get_global_id(2); - if (get_global_id(0) >= item_ch || get_global_id(1) >= item_w || - get_global_id(2) >= item_h) { - return; - } - - // out_width_id_per_blk and out_batch_id - int out_w_base_id = item_ch_id * out_w; - int out_w_id0 = item_w_id; - int out_w_id1 = out_w_id0 + 1; - - // in_width_id_per_blk and in_height_id_per_batch - int in_h_id = (item_h_id % out_h) * stride - pad; - int in_w_id0 = item_w_id * stride - pad; - int in_w_id1 = (item_w_id + 1) * stride - pad; - ; - -#ifdef BIASE_CH - CL_DTYPE4 output[2]; - output[0] = (bias + item_ch_id)[0]; - output[1] = output[0]; -#else - CL_DTYPE4 output[2] = {0.0f}; -#endif - - CL_DTYPE4 filter[2] = {0.0f}; - CL_DTYPE4 input[2] = {0.0f}; - - for (int ch = 0; ch < (in_ch + 3) / 4; ch++) { - const int in_w_base_id = mul24(ch, in_w); - - int filter_w_val = ch << 2; - int filter_h_val = mul24(item_ch_id, 49); - __global CL_DTYPE4 *filter_ptr = - filter_buf + ((in_ch + 3) >> 2) * 4 * filter_h_val + filter_w_val; - - for (int h = 0; h < filter_h; h++) { - int in_h_val = - select(in_h_id + h, -1, (in_h_id + h < 0 || in_h_id + h >= in_h)); - - for (int w = 0; w < filter_w; w++) { - int in_w_val0 = select(in_w_base_id + in_w_id0 + w, - -1, - (in_w_id0 + w < 0 || in_w_id0 + w >= in_w)); - int in_w_val1 = select(in_w_base_id + in_w_id1 + w, - -1, - (in_w_id1 + w < 0 || in_w_id1 + w >= in_w)); - - input[0] = READ_IMG_TYPE( - CL_DTYPE_CHAR, input_image, SAMPLER, (int2)(in_w_val0, in_h_val)); - input[1] = READ_IMG_TYPE( - CL_DTYPE_CHAR, input_image, SAMPLER, (int2)(in_w_val1, in_h_val)); - - output[0] = mad(input[0].x, filter_ptr[0], output[0]); - output[1] = mad(input[1].x, filter_ptr[0], output[1]); - - output[0] = mad(input[0].y, filter_ptr[1], output[0]); - output[1] = mad(input[1].y, filter_ptr[1], output[1]); - - output[0] = mad(input[0].z, filter_ptr[2], output[0]); - output[1] = mad(input[1].z, filter_ptr[2], output[1]); - - output[0] = mad(input[0].w, filter_ptr[3], output[0]); - output[1] = mad(input[1].w, filter_ptr[3], output[1]); - - filter_ptr += ((in_ch + 3) >> 2) * 4; - } - } - } - - CL_DTYPE4 alpha[2]; -#ifdef PRELU_CH //{ - alpha[0] = - READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(item_ch_id, 0)); - alpha[1] = alpha[0]; -//} -#elif defined(PRELU_ELE) //{ - alpha[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, - prelu_alpha, - SAMPLER, - (int2)(out_w_base_id + out_w_id0, item_h_id)); - if (out_w_id1 < out_w) { - alpha[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, - prelu_alpha, - SAMPLER, - (int2)(out_w_base_id + out_w_id1, item_h_id)); - } -//} -#elif defined(PRELU_ALL) //{ - alpha[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0)); - alpha[0].y = alpha[0].x; - alpha[0].z = alpha[0].x; - alpha[0].w = alpha[0].x; - alpha[1] = alpha[0]; -//} -#endif - output[0] = activation_type4(output[0], alpha[0]); - output[1] = activation_type4(output[1], alpha[1]); - -#ifdef SCALE_ACTIVATION - output[0] = fuse_scale(output[0], 1.f, 0.f, 0.f); - output[1] = fuse_scale(output[1], 1.f, 0.f, 0.f); - -#endif - - WRITE_IMG_TYPE(CL_DTYPE_CHAR, - output_image, - (int2)(out_w_base_id + out_w_id0, item_h_id), - output[0]); - if (out_w_id1 < out_w) { - WRITE_IMG_TYPE(CL_DTYPE_CHAR, - output_image, - (int2)(out_w_base_id + out_w_id1, item_h_id), - output[1]); - } -} \ No newline at end of file diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_7x7_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_nxn_kernel.cl similarity index 68% rename from lite/backends/opencl/cl_kernel/image/conv2d_7x7_kernel.cl rename to lite/backends/opencl/cl_kernel/image/conv2d_nxn_kernel.cl index 6f1d5c11cc4..20abd151351 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_7x7_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_nxn_kernel.cl @@ -1,11 +1,27 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + #include -__kernel void conv2d_7x7(__private const int global_size_dim0, +__kernel void conv2d_nxn(__private const int global_size_dim0, __private const int global_size_dim1, __private const int global_size_dim2, __read_only image2d_t input_image, __read_only image2d_t filter_image, __read_only image2d_t bias, + __private const int filter_w, + __private const int filter_h, #ifdef BATCH_NORM __read_only image2d_t new_scale, __read_only image2d_t new_biase, @@ -56,6 +72,8 @@ __kernel void conv2d_7x7(__private const int global_size_dim0, in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset; in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset; + const int filter_w_half = filter_w >> 1; + #ifdef BIASE_CH CL_DTYPE4 output = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, SAMPLER, (int2)(out_c, 0)); @@ -74,34 +92,36 @@ __kernel void conv2d_7x7(__private const int global_size_dim0, for (int i = 0; i < input_c; ++i) { int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y + batch_index * input_height); - for (int j = 0; j < 7; j++) { - for (int k = 0; k < 7; k++) { + for (int j = 0; j < filter_h; j++) { + for (int k = 0; k < filter_w; k++) { input = SELECT( READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, - (int2)(pos_in.x + (j - 3) * dilation, - pos_in.y + (k - 3) * dilation)), + (int2)(pos_in.x + (j - filter_w_half) * dilation, + pos_in.y + (k - filter_w_half) * dilation)), (CL_DTYPE4)(0.0f), - in_pos_in_one_block.x + (j - 3) * dilation < 0 || - in_pos_in_one_block.y + (k - 3) * dilation < 0 || - in_pos_in_one_block.x + (j - 3) * dilation >= input_width || - in_pos_in_one_block.y + (k - 3) * dilation >= input_height); - int filter_h = k; - int filter_w = j; - int filter_c = i; - - filter_pos0.x = filter_c * 7 + filter_w; - filter_pos0.y = filter_n0 * 7 + filter_h; - - filter_pos1.x = filter_c * 7 + filter_w; - filter_pos1.y = filter_n1 * 7 + filter_h; - - filter_pos2.x = filter_c * 7 + filter_w; - filter_pos2.y = filter_n2 * 7 + filter_h; - - filter_pos3.x = filter_c * 7 + filter_w; - filter_pos3.y = filter_n3 * 7 + filter_h; + in_pos_in_one_block.x + (j - filter_w_half) * dilation < 0 || + in_pos_in_one_block.y + (k - filter_w_half) * dilation < 0 || + in_pos_in_one_block.x + (j - filter_w_half) * dilation >= + input_width || + in_pos_in_one_block.y + (k - filter_w_half) * dilation >= + input_height); + int filter_h_id = k; + int filter_w_id = j; + int filter_c_id = i; + + filter_pos0.x = filter_c_id * filter_w + filter_w_id; + filter_pos0.y = filter_n0 * filter_h + filter_h_id; + + filter_pos1.x = filter_c_id * filter_w + filter_w_id; + filter_pos1.y = filter_n1 * filter_h + filter_h_id; + + filter_pos2.x = filter_c_id * filter_w + filter_w_id; + filter_pos2.y = filter_n2 * filter_h + filter_h_id; + + filter_pos3.x = filter_c_id * filter_w + filter_w_id; + filter_pos3.y = filter_n3 * filter_h + filter_h_id; filter[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter_image, SAMPLER, filter_pos0); diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_3x3_default_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_opt_adreno_kernel.cl similarity index 52% rename from lite/backends/opencl/cl_kernel/image/conv2d_3x3_default_kernel.cl rename to lite/backends/opencl/cl_kernel/image/conv2d_opt_adreno_kernel.cl index 88e3ff34fd8..b3713888ac4 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_3x3_default_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_opt_adreno_kernel.cl @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,23 +13,26 @@ See the License for the specific language governing permissions and limitations under the License. */ #include -__kernel void conv2d_3x3_opt(__private const int item_ch, - __private const int item_w, - __private const int item_h, - __read_only image2d_t input_image, - __read_only image2d_t filter_image, - __read_only image2d_t bias, - __write_only image2d_t output_image, - __private const int stride, - __private const int pad, - __private const int dilation, - __private const int batch, - __private const int in_ch, - __private const int in_w, - __private const int in_h, - __private const int out_w, - __private const int out_h, - __read_only image2d_t prelu_alpha) { + +__kernel void conv2d_nxn_opt_adreno(__private const int item_ch, + __private const int item_w, + __private const int item_h, + __read_only image2d_t input_image, + __read_only image2d_t filter_image, + __read_only image2d_t bias, + __private const int filter_w, + __private const int filter_h, + __write_only image2d_t output_image, + __private const int stride, + __private const int pad, + __private const int dilation, + __private const int batch, + __private const int in_ch, + __private const int in_w, + __private const int in_h, + __private const int out_w, + __private const int out_h, + __read_only image2d_t prelu_alpha) { // item_id const int item_ch_id = get_global_id(0); const int item_w_id = get_global_id(1); @@ -56,6 +59,8 @@ __kernel void conv2d_3x3_opt(__private const int item_ch, int in_w_id3 = mad24(item_w, stride, in_w_id2); int in_w_id4 = mad24(item_w, stride, in_w_id3); + int filter_hw = filter_w * filter_h; + #ifdef BIASE_CH CL_DTYPE4 output[5]; output[0] = @@ -64,6 +69,36 @@ __kernel void conv2d_3x3_opt(__private const int item_ch, output[2] = output[0]; output[3] = output[0]; output[4] = output[0]; +#elif defined(BIASE_ELE) + CL_DTYPE4 output[5]; + output[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + SAMPLER, + (int2)(out_w_base_id + out_w_id0, item_nh_id)); + if (out_w_id1 < out_w) { + output[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + SAMPLER, + (int2)(out_w_base_id + out_w_id1, item_nh_id)); + } + if (out_w_id2 < out_w) { + output[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + SAMPLER, + (int2)(out_w_base_id + out_w_id2, item_nh_id)); + } + if (out_w_id3 < out_w) { + output[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + SAMPLER, + (int2)(out_w_base_id + out_w_id3, item_nh_id)); + } + if (out_w_id4 < out_w) { + output[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + SAMPLER, + (int2)(out_w_base_id + out_w_id4, item_nh_id)); + } #else CL_DTYPE4 output[5] = {0.0f}; #endif @@ -77,13 +112,13 @@ __kernel void conv2d_3x3_opt(__private const int item_ch, const int in_w_base_id = mul24(ch, in_w); int filter_w_val = ch << 2; - int filter_h_val = mul24(item_ch_id, 9); + int filter_h_val = mul24(item_ch_id, filter_hw); - for (int h = 0; h < 3; h++) { + for (int h = 0; h < filter_h; h++) { int in_h_val = select( n * in_h + in_h_id + h, -1, (in_h_id + h < 0 | in_h_id + h >= in_h)); - for (int w = 0; w < 3; w++) { + for (int w = 0; w < filter_w; w++) { int in_w_val0 = select(in_w_base_id + in_w_id0 + w, -1, (in_w_id0 + w < 0 | in_w_id0 + w >= in_w)); @@ -159,45 +194,48 @@ __kernel void conv2d_3x3_opt(__private const int item_ch, } } CL_DTYPE4 alpha[5]; -#ifdef PRELU_CH //{ +#ifdef PRELU_CH alpha[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(item_ch_id, 0)); alpha[1] = alpha[0]; alpha[2] = alpha[0]; alpha[3] = alpha[0]; alpha[4] = alpha[0]; -//} -#elif defined(PRELU_ELE) //{ - alpha[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, - prelu_alpha, - SAMPLER, - (int2)(out_w_base_id + out_w_id0, item_nh_id)); +#elif defined(PRELU_ELE) + alpha[0] = + READ_IMG_TYPE(CL_DTYPE_CHAR, + prelu_alpha, + SAMPLER, + (int2)(out_w_base_id + out_w_id0, item_nh_id % out_h)); if (out_w_id1 < out_w) { - alpha[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, - prelu_alpha, - SAMPLER, - (int2)(out_w_base_id + out_w_id1, item_nh_id)); + alpha[1] = + READ_IMG_TYPE(CL_DTYPE_CHAR, + prelu_alpha, + SAMPLER, + (int2)(out_w_base_id + out_w_id1, item_nh_id % out_h)); } if (out_w_id2 < out_w) { - alpha[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, - prelu_alpha, - SAMPLER, - (int2)(out_w_base_id + out_w_id2, item_nh_id)); + alpha[2] = + READ_IMG_TYPE(CL_DTYPE_CHAR, + prelu_alpha, + SAMPLER, + (int2)(out_w_base_id + out_w_id2, item_nh_id % out_h)); } if (out_w_id3 < out_w) { - alpha[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, - prelu_alpha, - SAMPLER, - (int2)(out_w_base_id + out_w_id3, item_nh_id)); + alpha[3] = + READ_IMG_TYPE(CL_DTYPE_CHAR, + prelu_alpha, + SAMPLER, + (int2)(out_w_base_id + out_w_id3, item_nh_id % out_h)); } if (out_w_id4 < out_w) { - alpha[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, - prelu_alpha, - SAMPLER, - (int2)(out_w_base_id + out_w_id4, item_nh_id)); + alpha[4] = + READ_IMG_TYPE(CL_DTYPE_CHAR, + prelu_alpha, + SAMPLER, + (int2)(out_w_base_id + out_w_id4, item_nh_id % out_h)); } -//} -#elif defined(PRELU_ALL) //{ +#elif defined(PRELU_ALL) alpha[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0)); alpha[0].y = alpha[0].x; alpha[0].z = alpha[0].x; @@ -206,7 +244,6 @@ __kernel void conv2d_3x3_opt(__private const int item_ch, alpha[2] = alpha[0]; alpha[3] = alpha[0]; alpha[4] = alpha[0]; -//} #endif output[0] = activation_type4(output[0], alpha[0]); output[1] = activation_type4(output[1], alpha[1]); @@ -251,138 +288,3 @@ __kernel void conv2d_3x3_opt(__private const int item_ch, output[4]); } } - -__kernel void conv2d_3x3_opt_mali(__private const int item_ch, - __private const int item_w, - __private const int item_h, - __read_only image2d_t input_image, - __global CL_DTYPE4 *filter_buf, - __global CL_DTYPE4 *bias_buf, - __write_only image2d_t output_image, - __private const int stride, - __private const int pad, - __private const int dilation, - __private const int batch, - __private const int in_ch, - __private const int in_w, - __private const int in_h, - __private const int out_w, - __private const int out_h, - __read_only image2d_t prelu_alpha) { - // item_id - const int item_ch_id = get_global_id(0); - const int item_w_id = 2 * get_global_id(1); - const int item_nh_id = get_global_id(2); - if (get_global_id(0) >= item_ch || get_global_id(1) >= item_w || - get_global_id(2) >= item_h) { - return; - } - - // out_width_id_per_blk - int out_w_base_id = mul24(item_ch_id, out_w); - int out_w_id0 = item_w_id; - int out_w_id1 = out_w_id0 + 1; - - // in_width_id_per_blk and in_height_id_per_batch - int n = item_nh_id / out_h; - int in_h_id = mad24((item_nh_id % out_h), stride, (-pad)); - int in_w_id0 = mad24(item_w_id, stride, (-pad)); - int in_w_id1 = mad24(item_w_id + 1, stride, (-pad)); - -#ifdef BIASE_CH - CL_DTYPE4 output[2]; - output[0] = (bias_buf + item_ch_id)[0]; - output[1] = output[0]; -#else - CL_DTYPE4 output[2] = {0.0f}; -#endif - - CL_DTYPE4 filter[2] = {0.0f}; - CL_DTYPE4 input[2] = {0.0f}; - - for (int ch = 0; ch < ((in_ch + 3) >> 2); ch++) { - const int in_w_base_id = mul24(ch, in_w); - - int filter_w_val = ch << 2; - int filter_h_val = mul24(item_ch_id, 9); - __global CL_DTYPE4 *filter_ptr = - filter_buf + ((in_ch + 3) >> 2) * 4 * filter_h_val + filter_w_val; - - for (int h = 0; h < 3; h++) { - int in_h_val = select( - n * in_h + in_h_id + h, -1, (in_h_id + h < 0 | in_h_id + h >= in_h)); - - for (int w = 0; w < 3; w++) { - int in_w_val0 = select(in_w_base_id + in_w_id0 + w, - -1, - (in_w_id0 + w < 0 | in_w_id0 + w >= in_w)); - int in_w_val1 = select(in_w_base_id + in_w_id1 + w, - -1, - (in_w_id1 + w < 0 | in_w_id1 + w >= in_w)); - - input[0] = READ_IMG_TYPE( - CL_DTYPE_CHAR, input_image, SAMPLER, (int2)(in_w_val0, in_h_val)); - input[1] = READ_IMG_TYPE( - CL_DTYPE_CHAR, input_image, SAMPLER, (int2)(in_w_val1, in_h_val)); - - output[0] = mad(input[0].x, filter_ptr[0], output[0]); - output[1] = mad(input[1].x, filter_ptr[0], output[1]); - - output[0] = mad(input[0].y, filter_ptr[1], output[0]); - output[1] = mad(input[1].y, filter_ptr[1], output[1]); - - output[0] = mad(input[0].z, filter_ptr[2], output[0]); - output[1] = mad(input[1].z, filter_ptr[2], output[1]); - - output[0] = mad(input[0].w, filter_ptr[3], output[0]); - output[1] = mad(input[1].w, filter_ptr[3], output[1]); - - filter_ptr += ((in_ch + 3) >> 2) * 4; - } - } - } - CL_DTYPE4 alpha[2]; -#ifdef PRELU_CH //{ - alpha[0] = - READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(item_ch_id, 0)); - alpha[1] = alpha[0]; -//} -#elif defined(PRELU_ELE) //{ - alpha[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, - prelu_alpha, - SAMPLER, - (int2)(out_w_base_id + out_w_id0, item_nh_id)); - if (out_w_id1 < out_w) { - alpha[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, - prelu_alpha, - SAMPLER, - (int2)(out_w_base_id + out_w_id1, item_nh_id)); - } -//} -#elif defined(PRELU_ALL) //{ - alpha[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0)); - alpha[0].y = alpha[0].x; - alpha[0].z = alpha[0].x; - alpha[0].w = alpha[0].x; - alpha[1] = alpha[0]; -//} -#endif - output[0] = activation_type4(output[0], alpha[0]); - output[1] = activation_type4(output[1], alpha[1]); - -#ifdef SCALE_ACTIVATION - output[0] = fuse_scale(output[0], 1.f, 0.f, 0.f); - output[1] = fuse_scale(output[1], 1.f, 0.f, 0.f); -#endif - - WRITE_IMG_TYPE(CL_DTYPE_CHAR, - output_image, - (int2)(out_w_base_id + out_w_id0, item_nh_id), - output[0]); - if (out_w_id1 < out_w) { - WRITE_IMG_TYPE(CL_DTYPE_CHAR, - output_image, - (int2)(out_w_base_id + out_w_id1, item_nh_id), - output[1]); - } -} diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_opt_mali_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_opt_mali_kernel.cl new file mode 100644 index 00000000000..9a2ff548e72 --- /dev/null +++ b/lite/backends/opencl/cl_kernel/image/conv2d_opt_mali_kernel.cl @@ -0,0 +1,149 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +__kernel void conv2d_nxn_opt_mali(__private const int item_ch, + __private const int item_w, + __private const int item_h, + __read_only image2d_t input_image, + __global CL_DTYPE4 *filter_buf, + __global CL_DTYPE4 *bias_buf, + __private const int filter_w, + __private const int filter_h, + __write_only image2d_t output_image, + __private const int stride, + __private const int pad, + __private const int dilation, + __private const int batch, + __private const int in_ch, + __private const int in_w, + __private const int in_h, + __private const int out_w, + __private const int out_h, + __read_only image2d_t prelu_alpha) { + // item_id + const int item_ch_id = get_global_id(0); + const int item_w_id = get_global_id(1) << 1; + const int item_nh_id = get_global_id(2); + if (get_global_id(0) >= item_ch || get_global_id(1) >= item_w || + get_global_id(2) >= item_h) { + return; + } + + // out_width_id_per_blk + int out_w_base_id = mul24(item_ch_id, out_w); + int out_w_id0 = item_w_id; + int out_w_id1 = out_w_id0 + 1; + int filter_w_h = filter_w * filter_h; + + // in_width_id_per_blk and in_height_id_per_batch + int n = item_nh_id / out_h; + int in_h_id = mad24((item_nh_id % out_h), stride, (-pad)); + int in_w_id0 = mad24(item_w_id, stride, (-pad)); + int in_w_id1 = mad24(item_w_id + 1, stride, (-pad)); + +#ifdef BIASE_CH + CL_DTYPE4 output[2]; + output[0] = (bias_buf + item_ch_id)[0]; + output[1] = output[0]; +#else + CL_DTYPE4 output[2] = {0.0f}; +#endif + + CL_DTYPE4 input[2] = {0.0f}; + + for (int ch = 0; ch < ((in_ch + 3) >> 2); ch++) { + const int in_w_base_id = mul24(ch, in_w); + + int filter_w_val = ch << 2; + int filter_h_val = mul24(item_ch_id, filter_w_h); + __global CL_DTYPE4 *filter_ptr = + filter_buf + ((in_ch + 3) >> 2) * 4 * filter_h_val + filter_w_val; + + for (int h = 0; h < filter_h; h++) { + int in_h_val = select( + n * in_h + in_h_id + h, -1, (in_h_id + h < 0 | in_h_id + h >= in_h)); + + for (int w = 0; w < filter_w; w++) { + int in_w_val0 = select(in_w_base_id + in_w_id0 + w, + -1, + (in_w_id0 + w < 0 | in_w_id0 + w >= in_w)); + int in_w_val1 = select(in_w_base_id + in_w_id1 + w, + -1, + (in_w_id1 + w < 0 | in_w_id1 + w >= in_w)); + + input[0] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, SAMPLER, (int2)(in_w_val0, in_h_val)); + input[1] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, SAMPLER, (int2)(in_w_val1, in_h_val)); + + output[0] = mad(input[0].x, filter_ptr[0], output[0]); + output[1] = mad(input[1].x, filter_ptr[0], output[1]); + + output[0] = mad(input[0].y, filter_ptr[1], output[0]); + output[1] = mad(input[1].y, filter_ptr[1], output[1]); + + output[0] = mad(input[0].z, filter_ptr[2], output[0]); + output[1] = mad(input[1].z, filter_ptr[2], output[1]); + + output[0] = mad(input[0].w, filter_ptr[3], output[0]); + output[1] = mad(input[1].w, filter_ptr[3], output[1]); + + filter_ptr += ((in_ch + 3) >> 2) * 4; + } + } + } + CL_DTYPE4 alpha[2]; +#ifdef PRELU_CH + alpha[0] = + READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(item_ch_id, 0)); + alpha[1] = alpha[0]; +#elif defined(PRELU_ELE) + alpha[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, + prelu_alpha, + SAMPLER, + (int2)(out_w_base_id + out_w_id0, item_nh_id)); + if (out_w_id1 < out_w) { + alpha[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, + prelu_alpha, + SAMPLER, + (int2)(out_w_base_id + out_w_id1, item_nh_id)); + } +#elif defined(PRELU_ALL) + alpha[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0)); + alpha[0].y = alpha[0].x; + alpha[0].z = alpha[0].x; + alpha[0].w = alpha[0].x; + alpha[1] = alpha[0]; +#endif + output[0] = activation_type4(output[0], alpha[0]); + output[1] = activation_type4(output[1], alpha[1]); + +#ifdef SCALE_ACTIVATION + output[0] = fuse_scale(output[0], 1.f, 0.f, 0.f); + output[1] = fuse_scale(output[1], 1.f, 0.f, 0.f); +#endif + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id0, item_nh_id), + output[0]); + if (out_w_id1 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id1, item_nh_id), + output[1]); + } +} diff --git a/lite/kernels/opencl/conv_image_compute.cc b/lite/kernels/opencl/conv_image_compute.cc index 25fc6d05348..cd4cb6cb6b7 100644 --- a/lite/kernels/opencl/conv_image_compute.cc +++ b/lite/kernels/opencl/conv_image_compute.cc @@ -409,18 +409,14 @@ void ConvImageCompute::PrepareForRun() { } } else if (groups_ == 1) { if (is_mali_ && input_tensor_n_ == 1) { - kernel_func_names_.push_back("conv2d_3x3_opt_mali"); + kernel_func_names_.push_back("conv2d_nxn_opt_mali"); + kernel_func_paths_.push_back("image/conv2d_opt_mali_kernel.cl"); bias_buffer_flag = true; } else { - kernel_func_names_.push_back( - input_tensor_n_ > 1 ? "conv2d_3x3_multi_batch" : "conv2d_3x3_opt"); + kernel_func_names_.push_back("conv2d_nxn_opt_adreno"); + kernel_func_paths_.push_back("image/conv2d_opt_adreno_kernel.cl"); } - if (kernel_func_names_.back() != "conv2d_3x3_multi_batch") { - kernel_func_paths_.push_back("image/conv2d_3x3_default_kernel.cl"); - } else { - kernel_func_paths_.push_back("image/conv2d_3x3_kernel.cl"); - } - impl_ = &ConvImageCompute::Conv2d3x3opt; + impl_ = &ConvImageCompute::Conv2dnxnopt; CLImageConverterNBlock converter; const DDim& filter_image_dims = @@ -466,14 +462,13 @@ void ConvImageCompute::PrepareForRun() { filter_image_h_, filter_image_data); } - } else if (filter_tensor_h_ == 5 && filter_tensor_w_ == 5 && pad_equal && + } else if (filter_tensor_h_ == filter_tensor_w_ && pad_equal && stride_equal && dilation_equal && dilation_h_ == 1 && groups_ == 1) { -#define CONV_5x5_OPT -#ifndef CONV_5x5_OPT - // conv2d_5x5 - kernel_func_names_.push_back("conv2d_5x5"); - kernel_func_paths_.push_back("image/conv2d_5x5_kernel.cl"); +#define CONV_OPT +#ifndef CONV_OPT + kernel_func_names_.push_back("conv2d_nxn"); + kernel_func_paths_.push_back("image/conv2d_nxn_kernel.cl"); CLImageConverterFolder converter; const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims); @@ -486,60 +481,16 @@ void ConvImageCompute::PrepareForRun() { MUTABLE_DATA_GPU( filter_gpu_image_, filter_image_w_, filter_image_h_, filter_image_data); - impl_ = &ConvImageCompute::Conv2d5x5; + impl_ = &ConvImageCompute::Conv2dnxn; #else - // conv2d_5x5_opt - - kernel_func_names_.push_back(input_tensor_n_ > 1 ? "conv2d_5x5_multi_batch" - : "conv2d_5x5_opt"); - kernel_func_paths_.push_back("image/conv2d_5x5_opt_kernel.cl"); - - CLImageConverterFolder converter; - const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims); - filter_image_h_ = filter_image_dims[1]; - filter_image_w_ = filter_image_dims[0]; - tensor_hold_filter_image_->Resize({1, filter_image_w_, filter_image_h_, 4}); - - auto* filter_image_data = MUTABLE_DATA_CPU(tensor_hold_filter_image_); - - converter.NCHWToImage(filter_cpu, filter_image_data, filter_dims); - MUTABLE_DATA_GPU( - filter_gpu_image_, filter_image_w_, filter_image_h_, filter_image_data); - - impl_ = &ConvImageCompute::Conv2d5x5opt; -#endif -#undef CONV_5x5_OPT - } else if (filter_tensor_h_ == 7 && filter_tensor_w_ == 7 && pad_equal && - stride_equal && dilation_equal && dilation_h_ == 1 && - groups_ == 1) { -#define CONV_7x7_OPT -#ifndef CONV_7x7_OPT - // conv2d_7x7 - kernel_func_names_.push_back("conv2d_7x7"); - kernel_func_paths_.push_back("image/conv2d_7x7_kernel.cl"); - - CLImageConverterFolder converter; - const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims); - filter_image_h_ = filter_image_dims[1]; - filter_image_w_ = filter_image_dims[0]; - tensor_hold_filter_image_->Resize({1, filter_image_w_, filter_image_h_, 4}); - - auto* filter_image_data = MUTABLE_DATA_CPU(tensor_hold_filter_image_); - converter.NCHWToImage(filter_cpu, filter_image_data, filter_dims); - MUTABLE_DATA_GPU( - filter_gpu_image_, filter_image_w_, filter_image_h_, filter_image_data); - impl_ = &ConvImageCompute::Conv2d7x7; - -#else - // conv2d_7x7 if (is_mali_ && input_tensor_n_ == 1) { - kernel_func_names_.push_back("conv2d_7x7_opt_mali"); + kernel_func_names_.push_back("conv2d_nxn_opt_mali"); + kernel_func_paths_.push_back("image/conv2d_opt_mali_kernel.cl"); bias_buffer_flag = true; } else { - kernel_func_names_.push_back( - input_tensor_n_ > 1 ? "conv2d_7x7_multi_batch" : "conv2d_7x7_opt"); + kernel_func_names_.push_back("conv2d_nxn_opt_adreno"); + kernel_func_paths_.push_back("image/conv2d_opt_adreno_kernel.cl"); } - kernel_func_paths_.push_back("image/conv2d_7x7_opt_kernel.cl"); CLImageConverterNBlock converter; const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims); @@ -564,9 +515,9 @@ void ConvImageCompute::PrepareForRun() { filter_image_data); } - impl_ = &ConvImageCompute::Conv2d7x7opt; + impl_ = &ConvImageCompute::Conv2dnxnopt; #endif -#undef CONV_7x7_OPT +#undef CONV_OPT } else if (groups_ == 1) { // conv2d_common kernel_func_names_.push_back("conv2d_common"); @@ -1607,22 +1558,7 @@ void ConvImageCompute::SetGlobalWorkSize() { static_cast(output_channel_blocks * round_up_ouptut_width), static_cast(batch_round_h), 1}; - } else if (kernel_func_names_[0] == "conv2d_3x3_multi_batch" || - kernel_func_names_[0] == "conv2d_3x3_opt") { - int w_blk_size = 5; - int w_blk = (default_w_blk_ + w_blk_size - 1) / w_blk_size; - - int h_blk_size = 1; - int h_blk = (default_nh_blk_ + h_blk_size - 1) / h_blk_size; - - c_blk_ = default_c_blk_; - w_blk_ = w_blk; - nh_blk_ = h_blk; - - global_work_size_ = cl::NDRange{static_cast(c_blk_), - static_cast(w_blk_), - static_cast(nh_blk_)}; - } else if (kernel_func_names_[0] == "conv2d_3x3_opt_mali") { + } else if (kernel_func_names_[0] == "conv2d_nxn_opt_mali") { int w_blk_size = 2; int w_blk = (default_w_blk_ + w_blk_size - 1) / w_blk_size; @@ -1636,41 +1572,13 @@ void ConvImageCompute::SetGlobalWorkSize() { global_work_size_ = cl::NDRange{static_cast(c_blk_), static_cast(w_blk_), static_cast(nh_blk_)}; - } else if (kernel_func_names_[0] == "conv2d_5x5_multi_batch" || - kernel_func_names_[0] == "conv2d_5x5_opt") { + } else if (kernel_func_names_[0] == "conv2d_nxn_opt_adreno") { int w_blk_size = 5; int w_blk = (default_w_blk_ + w_blk_size - 1) / w_blk_size; int h_blk_size = 1; int h_blk = (default_nh_blk_ + h_blk_size - 1) / h_blk_size; - c_blk_ = default_c_blk_; - w_blk_ = w_blk; - nh_blk_ = h_blk; - global_work_size_ = cl::NDRange{static_cast(c_blk_), - static_cast(w_blk_), - static_cast(nh_blk_)}; - } else if (kernel_func_names_[0] == "conv2d_7x7_multi_batch" || - kernel_func_names_[0] == "conv2d_7x7_opt") { - int w_blk_size = 5; - int w_blk = (default_w_blk_ + w_blk_size - 1) / w_blk_size; - - int h_blk_size = 1; - int h_blk = (default_nh_blk_ + h_blk_size - 1) / h_blk_size; - - c_blk_ = default_c_blk_; - w_blk_ = w_blk; - nh_blk_ = h_blk; - global_work_size_ = cl::NDRange{static_cast(c_blk_), - static_cast(w_blk_), - static_cast(nh_blk_)}; - } else if (kernel_func_names_[0] == "conv2d_7x7_opt_mali") { - int w_blk_size = 2; - int w_blk = (default_w_blk_ + w_blk_size - 1) / w_blk_size; - - int h_blk_size = 1; - int h_blk = (default_nh_blk_ + h_blk_size - 1) / h_blk_size; - c_blk_ = default_c_blk_; w_blk_ = w_blk; nh_blk_ = h_blk; @@ -1933,252 +1841,95 @@ void ConvImageCompute::Conv2d1x1opt() { } } -void ConvImageCompute::Conv2d3x3() { - use_lws_ = false; - status_ = kernel_.setArg(0, c_blk_); - CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(1, w_blk_); - CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(2, nh_blk_); - CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(3, *input_image_p_); - CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(4, *filter_image_p_); +void ConvImageCompute::Conv2dnxnopt() { + int arg_idx = 0; + status_ = kernel_.setArg(arg_idx++, c_blk_); CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(5, *bias_image_p_); + status_ = kernel_.setArg(arg_idx++, w_blk_); CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(6, *output_image_p_); + status_ = kernel_.setArg(arg_idx++, nh_blk_); CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(7, stride_h_); - CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(8, offset_); - CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(9, input_c_block_); - CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(10, dilation_h_); - CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(11, input_tensor_w_); - CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(12, input_tensor_h_); - CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(13, output_tensor_w_); - CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(14, output_tensor_h_); - CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(15, output_tensor_c_); - CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(16, filter_tensor_c_); - CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(17, filter_tensor_w_); - CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(18, filter_tensor_h_); - CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(19, groups_); - CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(20, input_tensor_c_); - CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(21, *alpha_image_p_); - CL_CHECK_FATAL(status_); -} - -void ConvImageCompute::Conv2d3x3opt() { - status_ = kernel_.setArg(0, default_c_blk_); - CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(1, w_blk_); - CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(2, nh_blk_); - CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(3, *input_image_p_); + status_ = kernel_.setArg(arg_idx++, *input_image_p_); CL_CHECK_FATAL(status_); if (is_mali_ && input_tensor_n_ == 1) { auto* filter_buffer_p_ = GET_BUFFER_GPU(w_gpu_t_); auto* bias_buffer_p_ = GET_BUFFER_GPU(bias_gpu_t_); - status_ = kernel_.setArg(4, *filter_buffer_p_); + status_ = kernel_.setArg(arg_idx++, *filter_buffer_p_); CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(5, *bias_buffer_p_); + status_ = kernel_.setArg(arg_idx++, *bias_buffer_p_); CL_CHECK_FATAL(status_); } else { - status_ = kernel_.setArg(4, *filter_image_p_); + status_ = kernel_.setArg(arg_idx++, *filter_image_p_); CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(5, *bias_image_p_); + status_ = kernel_.setArg(arg_idx++, *bias_image_p_); CL_CHECK_FATAL(status_); } - status_ = kernel_.setArg(6, *output_image_p_); + status_ = kernel_.setArg(arg_idx++, filter_tensor_w_); CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(7, stride_h_); + status_ = kernel_.setArg(arg_idx++, filter_tensor_h_); CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(8, pad_left_); + status_ = kernel_.setArg(arg_idx++, *output_image_p_); CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(9, dilation_h_); + status_ = kernel_.setArg(arg_idx++, stride_h_); CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(10, input_tensor_n_); + status_ = kernel_.setArg(arg_idx++, pad_left_); CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(11, input_tensor_c_); + status_ = kernel_.setArg(arg_idx++, dilation_h_); CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(12, input_tensor_w_); + status_ = kernel_.setArg(arg_idx++, input_tensor_n_); CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(13, input_tensor_h_); - CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(14, output_tensor_w_); - CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(15, output_tensor_h_); - CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(16, *alpha_image_p_); - CL_CHECK_FATAL(status_); -} - -void ConvImageCompute::Conv2d5x5() { - use_lws_ = false; - status_ = kernel_.setArg(0, c_blk_); - CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(1, w_blk_); - CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(2, nh_blk_); - CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(3, *input_image_p_); - CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(4, *filter_image_p_); + status_ = kernel_.setArg(arg_idx++, input_tensor_c_); CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(5, *bias_image_p_); + status_ = kernel_.setArg(arg_idx++, input_tensor_w_); CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(6, *output_image_p_); + status_ = kernel_.setArg(arg_idx++, input_tensor_h_); CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(7, stride_h_); + status_ = kernel_.setArg(arg_idx++, output_tensor_w_); CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(8, offset_); + status_ = kernel_.setArg(arg_idx++, output_tensor_h_); CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(9, input_c_block_); - CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(10, dilation_h_); - CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(11, input_tensor_w_); - CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(12, input_tensor_h_); - CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(13, output_tensor_w_); - CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(14, output_tensor_h_); - CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(15, *alpha_image_p_); + status_ = kernel_.setArg(arg_idx++, *alpha_image_p_); CL_CHECK_FATAL(status_); } -void ConvImageCompute::Conv2d5x5opt() { - status_ = kernel_.setArg(0, c_blk_); - CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(1, w_blk_); - CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(2, nh_blk_); - CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(3, *input_image_p_); - CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(4, *filter_image_p_); - CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(5, *bias_image_p_); - CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(6, *output_image_p_); - CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(7, stride_h_); - CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(8, pad_left_); - CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(9, dilation_h_); - CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(10, input_tensor_n_); - CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(11, input_tensor_c_); - CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(12, input_tensor_w_); - CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(13, input_tensor_h_); - CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(14, output_tensor_w_); - CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(15, output_tensor_h_); - CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(16, *alpha_image_p_); - CL_CHECK_FATAL(status_); -} - -void ConvImageCompute::Conv2d7x7() { +void ConvImageCompute::Conv2dnxn() { use_lws_ = false; - status_ = kernel_.setArg(0, c_blk_); - CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(1, w_blk_); - CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(2, nh_blk_); - CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(3, *input_image_p_); - CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(4, *filter_image_p_); - CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(5, *bias_image_p_); - CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(6, *output_image_p_); - CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(7, stride_h_); - CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(8, offset_); - CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(9, input_c_block_); + int arg_idx = 0; + status_ = kernel_.setArg(arg_idx++, c_blk_); CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(9, dilation_h_); + status_ = kernel_.setArg(arg_idx++, w_blk_); CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(10, input_tensor_w_); + status_ = kernel_.setArg(arg_idx++, nh_blk_); CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(11, input_tensor_h_); + status_ = kernel_.setArg(arg_idx++, *input_image_p_); CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(12, output_tensor_w_); + status_ = kernel_.setArg(arg_idx++, *filter_image_p_); CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(13, output_tensor_h_); + status_ = kernel_.setArg(arg_idx++, *bias_image_p_); CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(14, *alpha_image_p_); + status_ = kernel_.setArg(arg_idx++, filter_tensor_w_); CL_CHECK_FATAL(status_); -} - -void ConvImageCompute::Conv2d7x7opt() { - status_ = kernel_.setArg(0, c_blk_); + status_ = kernel_.setArg(arg_idx++, filter_tensor_h_); CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(1, w_blk_); + status_ = kernel_.setArg(arg_idx++, *output_image_p_); CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(2, nh_blk_); + status_ = kernel_.setArg(arg_idx++, stride_h_); CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(3, *input_image_p_); + status_ = kernel_.setArg(arg_idx++, offset_); CL_CHECK_FATAL(status_); - if (is_mali_ && input_tensor_n_ == 1) { - auto* filter_buffer_p_ = GET_BUFFER_GPU(w_gpu_t_); - auto* bias_buffer_p_ = GET_BUFFER_GPU(bias_gpu_t_); - status_ = kernel_.setArg(4, *filter_buffer_p_); - CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(5, *bias_buffer_p_); - CL_CHECK_FATAL(status_); - } else { - status_ = kernel_.setArg(4, *filter_image_p_); - CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(5, *bias_image_p_); - CL_CHECK_FATAL(status_); - } - status_ = kernel_.setArg(6, *output_image_p_); - CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(7, stride_h_); - CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(8, pad_left_); - CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(9, dilation_h_); + status_ = kernel_.setArg(arg_idx++, input_c_block_); CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(10, input_tensor_n_); + status_ = kernel_.setArg(arg_idx++, dilation_h_); CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(11, input_tensor_c_); + status_ = kernel_.setArg(arg_idx++, input_tensor_w_); CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(12, input_tensor_w_); + status_ = kernel_.setArg(arg_idx++, input_tensor_h_); CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(13, input_tensor_h_); + status_ = kernel_.setArg(arg_idx++, output_tensor_w_); CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(14, output_tensor_w_); + status_ = kernel_.setArg(arg_idx++, output_tensor_h_); CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(15, output_tensor_h_); - CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(16, *alpha_image_p_); + status_ = kernel_.setArg(arg_idx++, *alpha_image_p_); CL_CHECK_FATAL(status_); } diff --git a/lite/kernels/opencl/conv_image_compute.h b/lite/kernels/opencl/conv_image_compute.h index 887e459682c..9f674f53d15 100644 --- a/lite/kernels/opencl/conv_image_compute.h +++ b/lite/kernels/opencl/conv_image_compute.h @@ -63,11 +63,8 @@ class ConvImageCompute : public KernelLite