[cherry-pick][OpenCL] Add shuffle channel kernel #4782

Merged

68 changes: 68 additions & 0 deletions lite/backends/opencl/cl_kernel/image/shuffle_channel_kernel.cl
@@ -0,0 +1,68 @@
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include <cl_common.h>

__kernel void shuffle_channel(__read_only image2d_t input,
__write_only image2d_t output,
__private const int group,
__private const int group_size,
__private const int channels,
__private const int out_W) {
const int w_idx = get_global_id(0);
const int c4_idx = get_global_id(1);
const int nh_idx = get_global_id(2);
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
int2 output_pos;
output_pos.x = c4_idx * out_W + w_idx;
output_pos.y = nh_idx;
CL_DTYPE4 output_data;
for (int i = 0; i < 4; i++) {
int outc_idx = (c4_idx << 2) + i;
if (outc_idx >= channels) {
break;
}
int inc_idx = outc_idx % group * group_size + outc_idx / group;
int inc4_idx = inc_idx >> 2;
int2 input_pos;
input_pos.x = inc4_idx * out_W + w_idx;
input_pos.y = nh_idx;
CL_DTYPE4 input_data;
input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, input_pos);
CL_DTYPE value;
int sub_idx = inc_idx % 4;
if (sub_idx == 0) {
value = input_data.x;
} else if (sub_idx == 1) {
value = input_data.y;
} else if (sub_idx == 2) {
value = input_data.z;
} else if (sub_idx == 3) {
value = input_data.w;
}
if (i == 0) {
output_data.x = value;
} else if (i == 1) {
output_data.y = value;
} else if (i == 2) {
output_data.z = value;
} else if (i == 3) {
output_data.w = value;
}
}
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, output_data);
}
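
The index arithmetic above is the standard channel-shuffle permutation: output channel c reads input channel (c % group) * group_size + c / group, with group_size = channels / group, i.e. a transpose of the (group, group_size) channel matrix. A minimal host-side sketch of the same mapping (illustrative only; the channels/group values are hypothetical):

// Prints the channel-shuffle index mapping used by the kernel above.
#include <cstdio>

int main() {
  const int channels = 6;
  const int group = 3;
  const int group_size = channels / group;  // 2
  for (int outc = 0; outc < channels; ++outc) {
    int inc = outc % group * group_size + outc / group;
    std::printf("output channel %d <- input channel %d\n", outc, inc);
  }
  // Resulting input-channel order: 0, 2, 4, 1, 3, 5 -- one channel taken
  // from each of the 3 groups in turn.
  return 0;
}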

56 changes: 55 additions & 1 deletion lite/core/arena/framework.cc
@@ -100,11 +100,32 @@ void TestCase::PrepareInputsForInstruction() {
auto* inst_tensor = inst_scope_->FindMutableTensor(var);
CHECK(!base_tensor->dims().empty())
<< "The dims of input tensor is empty yet";
#ifdef LITE_WITH_OPENCL
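// Convert the reference (host) float tensor into a half-precision OpenCL
// image so the instruction kernel can consume it.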
input_cpu_tensor.Resize(base_tensor->dims());
float* input_cpu_data = input_cpu_tensor.mutable_data<float>();
memcpy(input_cpu_data,
base_tensor->raw_data(),
base_tensor->numel() * sizeof(float));
const DDim& input_image_dims =
converter.InitImageDimInfoWith(base_tensor->dims());
input_image_cpu_tensor.Resize(
{1, input_image_dims[0], input_image_dims[1], 4});
uint16_t* input_image_cpu_data =
input_image_cpu_tensor.mutable_data<uint16_t>();
converter.NCHWToImage(
input_cpu_data, input_image_cpu_data, base_tensor->dims());
inst_tensor->mutable_data<half_t, cl::Image2D>(
input_image_dims[0], input_image_dims[1], input_image_cpu_data);
#else
TargetCopy(type->target(),
inst_tensor->mutable_data(type->target(),
base_tensor->memory_size()),
base_tensor->raw_data(),
base_tensor->memory_size());
#endif
} else if (type->IsTensorList() &&
!TargetCompatibleTo(*Type::GetTensorListTy(TARGET(kHost)),
*type)) {
@@ -150,15 +171,41 @@ bool TestCase::CheckTensorPrecision(const Tensor* inst_tensor,
inst_data = static_cast<const T*>(inst_tensor->raw_data());
break;
#ifdef LITE_WITH_XPU
case TARGET(kXPU):
case TARGET(kXPU): {
CopySync<TARGET(kXPU)>(inst_host_tensor.mutable_data<T>(),
inst_tensor->raw_data(),
sizeof(T) * inst_tensor->dims().production(),
IoDirection::DtoH);
inst_data = inst_host_tensor.data<T>();
break;
}
#endif
#ifdef LITE_WITH_OPENCL
case TARGET(kOpenCL): {
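// Wait for the queue to drain, then copy the fp16 output image back to the
// host and unpack it to float NCHW for comparison.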
CLRuntime::Global()->command_queue().finish();
const DDim& out_image_shape =
converter.InitImageDimInfoWith(inst_tensor->dims());
auto out_image_width = out_image_shape[0];
auto out_image_height = out_image_shape[1];
half_t* out_image_data = new half_t[out_image_shape.production() * 4];
auto* out_image = inst_tensor->data<half_t, cl::Image2D>();
TargetWrapperCL::ImgcpySync(out_image_data,
out_image,
out_image_width,
out_image_height,
0,
0,
IoDirection::DtoH);

converter.ImageToNCHW(out_image_data,
inst_host_tensor.mutable_data<float>(),
out_image_shape,
inst_tensor->dims());
delete[] out_image_data;
inst_data = inst_host_tensor.data<T>();
break;
}
#endif
default:
// Before compare, need to copy data from `target` device to host.
LOG(FATAL) << "Not supported";
@@ -186,17 +233,24 @@ bool TestCase::CheckPrecision(const Tensor* inst_tensor,
if (precision_type == PRECISION(kAny)) {
precision_type_t = base_tensor->precision();
}
#ifdef LITE_WITH_OPENCL
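// OpenCL image kernels produce fp16 results, so compare using the base
// (host) tensor's precision rather than the registered arg precision.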
precision_type_t = base_tensor->precision();
#endif
CHECK(precision_type_t == base_tensor->precision())
<< "arg precision type and base tensor precision type are not matched! "
"arg precision type is: "
<< PrecisionToStr(precision_type) << ", base tensor precision type is: "
<< PrecisionToStr(base_tensor->precision());
#ifndef LITE_WITH_OPENCL
CHECK(inst_tensor->precision() == base_tensor->precision())
<< "real tensor precision type and base tensor precision type are not "
"matched! real tensor precision type is: "
<< PrecisionToStr(inst_tensor->precision())
<< ", base tensor precision type is: "
<< PrecisionToStr(base_tensor->precision());
#endif
switch (precision_type_t) {
case PRECISION(kFloat):
return CheckTensorPrecision<float>(inst_tensor, base_tensor, abs_error);
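For OpenCL targets the test harness above round-trips data through half-precision images: the reference float NCHW input is packed into an fp16 image before the kernel runs, and the kernel's output image is copied back and unpacked to float NCHW before comparison. A rough sketch of that round trip using the converter API shown in the diff (the include path and the free-standing helper are assumptions for illustration):

// Sketch of the float NCHW -> fp16 image -> float NCHW round trip used above.
#include <cstdint>
#include <vector>
#include "lite/backends/opencl/cl_image_converter.h"

void RoundTrip(std::vector<float> nchw, const paddle::lite::DDim& tensor_dims) {
  paddle::lite::CLImageConverterDefault converter;
  // For the default converter the image is roughly (ceil(C/4) * W) x (N * H).
  const paddle::lite::DDim image_dims = converter.InitImageDimInfoWith(tensor_dims);
  std::vector<uint16_t> image(image_dims.production() * 4);  // 4 fp16 values per texel
  converter.NCHWToImage(nchw.data(), image.data(), tensor_dims);
  // ... upload `image` to a cl::Image2D, run the kernel, read the result back ...
  std::vector<float> result(tensor_dims.production());
  converter.ImageToNCHW(image.data(), result.data(), image_dims, tensor_dims);
}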
6 changes: 6 additions & 0 deletions lite/core/arena/framework.h
Original file line number Diff line number Diff line change
@@ -159,6 +159,12 @@ class TestCase {
public:
const Instruction& instruction() { return *instruction_; }

#ifdef LITE_WITH_OPENCL
CLImageConverterDefault converter;
lite::Tensor input_image_cpu_tensor;
lite::Tensor input_cpu_tensor;
#endif

private:
std::unique_ptr<KernelContext> ctx_;
void CreateInstruction();
1 change: 1 addition & 0 deletions lite/kernels/host/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -29,6 +29,7 @@ add_kernel(pixel_shuffle_compute_host Host extra SRCS pixel_shuffle_compute.cc D
add_kernel(one_hot_compute_host Host extra SRCS one_hot_compute.cc DEPS ${lite_kernel_deps})
add_kernel(uniform_random_compute_host Host extra SRCS uniform_random_compute.cc DEPS ${lite_kernel_deps})
add_kernel(sequence_unpad_compute_host Host extra SRCS sequence_unpad_compute.cc DEPS ${lite_kernel_deps} sequence_padding)
add_kernel(shuffle_channel_compute_host Host extra SRCS shuffle_channel_compute.cc DEPS ${lite_kernel_deps})

if(LITE_BUILD_EXTRA AND LITE_WITH_x86)
lite_cc_test(test_where_index_compute_host SRCS where_index_compute.cc DEPS where_index_compute_host)
77 changes: 77 additions & 0 deletions lite/kernels/host/shuffle_channel_compute.cc
@@ -0,0 +1,77 @@
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "lite/kernels/host/shuffle_channel_compute.h"

namespace paddle {
namespace lite {
namespace kernels {
namespace host {
void shuffle_kernel(
float* output, const float* input, int group_row, int group_col, int len) {
for (int i = 0; i < group_row; ++i) {
for (int j = 0; j < group_col; ++j) {
const float* p_i = input + (i * group_col + j) * len;
float* p_o = output + (j * group_row + i) * len;
memcpy(p_o, p_i, len * sizeof(float));
}
}
}

void shuffle_channel(const float* inputs,
float* outputs,
int group,
int num,
int channel,
int height,
int width) {
int fea_size = channel * height * width;
int spatial_size = height * width;
int group_row = group;
int group_col = channel / group;
for (int i = 0; i < num; ++i) {
shuffle_kernel(outputs + i * fea_size,
inputs + i * fea_size,
group_row,
group_col,
spatial_size);
}
}
void ShuffleChannelCompute::Run() {
auto& param = Param<operators::ShuffleChannelParam>();
const float* x_data = param.X->data<float>();
float* output_data = param.Out->mutable_data<float>();
DDim x_dims = param.X->dims();
int group = param.group;
int num = param.X->dims()[0];
int channel = param.X->dims()[1];
int height = param.X->dims()[2];
int width = param.X->dims()[3];
shuffle_channel(x_data, output_data, group, num, channel, height, width);
}

} // namespace host
} // namespace kernels
} // namespace lite
} // namespace paddle

REGISTER_LITE_KERNEL(shuffle_channel,
kHost,
kFloat,
kNCHW,
paddle::lite::kernels::host::ShuffleChannelCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kHost))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
.Finalize();
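
As a quick sanity check of the host reference above (illustrative, not part of the PR): with channel = 4 and group = 2 the channels are reordered from [0, 1, 2, 3] to [0, 2, 1, 3].

// Illustrative check of the host shuffle_channel reference defined above.
#include <cstdio>

namespace paddle { namespace lite { namespace kernels { namespace host {
// Declaration matching the definition in shuffle_channel_compute.cc.
void shuffle_channel(const float* inputs, float* outputs, int group,
                     int num, int channel, int height, int width);
}}}}  // namespace paddle::lite::kernels::host

int main() {
  // N=1, C=4, H=1, W=2; each channel holds two copies of its own index.
  const float input[8] = {0, 0, 1, 1, 2, 2, 3, 3};
  float output[8] = {0};
  paddle::lite::kernels::host::shuffle_channel(
      input, output, /*group=*/2, /*num=*/1, /*channel=*/4, /*height=*/1, /*width=*/2);
  for (int i = 0; i < 8; ++i) std::printf("%.0f ", output[i]);  // 0 0 2 2 1 1 3 3
  return 0;
}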
37 changes: 37 additions & 0 deletions lite/kernels/host/shuffle_channel_compute.h
@@ -0,0 +1,37 @@
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"

namespace paddle {
namespace lite {
namespace kernels {
namespace host {

class ShuffleChannelCompute
: public KernelLite<TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny)> {
public:
using param_t = operators::ShuffleChannelParam;

void Run() override;

virtual ~ShuffleChannelCompute() = default;
};

} // namespace host
} // namespace kernels
} // namespace lite
} // namespace paddle
2 changes: 1 addition & 1 deletion lite/kernels/opencl/CMakeLists.txt
@@ -37,7 +37,7 @@ add_kernel(pad2d_opencl_image OPENCL basic SRCS pad2d_image_compute.cc DEPS ${cl
add_kernel(box_coder_opencl_image OPENCL basic SRCS box_coder_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(pixel_shuffle_opencl_image OPENCL basic SRCS pixel_shuffle_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(expand_opencl_image OPENCL basic SRCS expand_image_compute.cc DEPS ${cl_kernel_deps})

add_kernel(shuffle_channel_opencl_image OPENCL basic SRCS shuffle_channel_image_compute.cc DEPS ${cl_kernel_deps})
# extra
# wait to add ...
