[cherry-pick][OpenCL] Add shuffle channel kernel #4782

Merged

68 changes: 68 additions & 0 deletions lite/backends/opencl/cl_kernel/image/shuffle_channel_kernel.cl
@@ -0,0 +1,68 @@
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include <cl_common.h>

__kernel void shuffle_channel(__read_only image2d_t input,
__write_only image2d_t output,
__private const int group,
__private const int group_size,
__private const int channels,
__private const int out_W) {
const int w_idx = get_global_id(0);
const int c4_idx = get_global_id(1);
const int nh_idx = get_global_id(2);
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
int2 output_pos;
output_pos.x = c4_idx * out_W + w_idx;
output_pos.y = nh_idx;
CL_DTYPE4 output_data;
for (int i = 0; i < 4; i++) {
int outc_idx = (c4_idx << 2) + i;
if (outc_idx >= channels) {
break;
}
int inc_idx = outc_idx % group * group_size + outc_idx / group;
int inc4_idx = inc_idx >> 2;
int2 input_pos;
input_pos.x = inc4_idx * out_W + w_idx;
input_pos.y = nh_idx;
CL_DTYPE4 input_data;
input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, input_pos);
CL_DTYPE value;
int sub_idx = inc_idx % 4;
if (sub_idx == 0) {
value = input_data.x;
} else if (sub_idx == 1) {
value = input_data.y;
} else if (sub_idx == 2) {
value = input_data.z;
} else if (sub_idx == 3) {
value = input_data.w;
}
if (i == 0) {
output_data.x = value;
} else if (i == 1) {
output_data.y = value;
} else if (i == 2) {
output_data.z = value;
} else if (i == 3) {
output_data.w = value;
}
}
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, output_data);
}
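
The index arithmetic above is the standard channel-shuffle permutation: output channel c reads input channel (c % group) * group_size + c / group, with group_size = channels / group, i.e. a transpose of the (group, group_size) channel matrix. A minimal host-side sketch of the same mapping (illustrative only; the channels/group values are hypothetical):

// Prints the channel-shuffle index mapping used by the kernel above.
#include <cstdio>

int main() {
  const int channels = 6;
  const int group = 3;
  const int group_size = channels / group;  // 2
  for (int outc = 0; outc < channels; ++outc) {
    int inc = outc % group * group_size + outc / group;
    std::printf("output channel %d <- input channel %d\n", outc, inc);
  }
  // Resulting input-channel order: 0, 2, 4, 1, 3, 5 -- one channel taken
  // from each of the 3 groups in turn.
  return 0;
}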

56 changes: 55 additions & 1 deletion lite/core/arena/framework.cc
@@ -100,11 +100,32 @@ void TestCase::PrepareInputsForInstruction() {
auto* inst_tensor = inst_scope_->FindMutableTensor(var);
CHECK(!base_tensor->dims().empty())
<< "The dims of input tensor is empty yet";
#ifdef LITE_WITH_OPENCL
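// Convert the reference (host) float tensor into a half-precision OpenCL
// image so the instruction kernel can consume it.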
input_cpu_tensor.Resize(base_tensor->dims());
float* input_cpu_data = input_cpu_tensor.mutable_data<float>();
memcpy(input_cpu_data,
base_tensor->raw_data(),
base_tensor->numel() * sizeof(float));
const DDim& input_image_dims =
converter.InitImageDimInfoWith(base_tensor->dims());
input_image_cpu_tensor.Resize(
{1, input_image_dims[0], input_image_dims[1], 4});
uint16_t* input_image_cpu_data =
input_image_cpu_tensor.mutable_data<uint16_t>();
converter.NCHWToImage(
input_cpu_data, input_image_cpu_data, base_tensor->dims());
inst_tensor->mutable_data<half_t, cl::Image2D>(
input_image_dims[0], input_image_dims[1], input_image_cpu_data);
#else
TargetCopy(type->target(),
inst_tensor->mutable_data(type->target(),
base_tensor->memory_size()),
base_tensor->raw_data(),
base_tensor->memory_size());
#endif
} else if (type->IsTensorList() &&
!TargetCompatibleTo(*Type::GetTensorListTy(TARGET(kHost)),
*type)) {
@@ -150,15 +171,41 @@ bool TestCase::CheckTensorPrecision(const Tensor* inst_tensor,
inst_data = static_cast<const T*>(inst_tensor->raw_data());
break;
#ifdef LITE_WITH_XPU
case TARGET(kXPU):
case TARGET(kXPU): {
CopySync<TARGET(kXPU)>(inst_host_tensor.mutable_data<T>(),
inst_tensor->raw_data(),
sizeof(T) * inst_tensor->dims().production(),
IoDirection::DtoH);
inst_data = inst_host_tensor.data<T>();
break;
}
#endif
#ifdef LITE_WITH_OPENCL
case TARGET(kOpenCL): {
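// Wait for the queue to drain, then copy the fp16 output image back to the
// host and unpack it to float NCHW for comparison.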
CLRuntime::Global()->command_queue().finish();
const DDim& out_image_shape =
converter.InitImageDimInfoWith(inst_tensor->dims());
auto out_image_width = out_image_shape[0];
auto out_image_height = out_image_shape[1];
half_t* out_image_data = new half_t[out_image_shape.production() * 4];
auto* out_image = inst_tensor->data<half_t, cl::Image2D>();
TargetWrapperCL::ImgcpySync(out_image_data,
out_image,
out_image_width,
out_image_height,
0,
0,
IoDirection::DtoH);

converter.ImageToNCHW(out_image_data,
inst_host_tensor.mutable_data<float>(),
out_image_shape,
inst_tensor->dims());
delete[] out_image_data;
inst_data = inst_host_tensor.data<T>();
break;
}
#endif
default:
// Before compare, need to copy data from `target` device to host.
LOG(FATAL) << "Not supported";
@@ -186,17 +233,24 @@ bool TestCase::CheckPrecision(const Tensor* inst_tensor,
if (precision_type == PRECISION(kAny)) {
precision_type_t = base_tensor->precision();
}
#ifdef LITE_WITH_OPENCL
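// OpenCL image kernels produce fp16 results, so compare using the base
// (host) tensor's precision rather than the registered arg precision.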
precision_type_t = base_tensor->precision();
#endif
CHECK(precision_type_t == base_tensor->precision())
<< "arg precision type and base tensor precision type are not matched! "
"arg precision type is: "
<< PrecisionToStr(precision_type) << ", base tensor precision type is: "
<< PrecisionToStr(base_tensor->precision());
#ifndef LITE_WITH_OPENCL
CHECK(inst_tensor->precision() == base_tensor->precision())
<< "real tensor precision type and base tensor precision type are not "
"matched! real tensor precision type is: "
<< PrecisionToStr(inst_tensor->precision())
<< ", base tensor precision type is: "
<< PrecisionToStr(base_tensor->precision());
#endif
switch (precision_type_t) {
case PRECISION(kFloat):
return CheckTensorPrecision<float>(inst_tensor, base_tensor, abs_error);
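For OpenCL targets the test harness above round-trips data through half-precision images: the reference float NCHW input is packed into an fp16 image before the kernel runs, and the kernel's output image is copied back and unpacked to float NCHW before comparison. A rough sketch of that round trip using the converter API shown in the diff (the include path and the free-standing helper are assumptions for illustration):

// Sketch of the float NCHW -> fp16 image -> float NCHW round trip used above.
#include <cstdint>
#include <vector>
#include "lite/backends/opencl/cl_image_converter.h"

void RoundTrip(std::vector<float> nchw, const paddle::lite::DDim& tensor_dims) {
  paddle::lite::CLImageConverterDefault converter;
  // For the default converter the image is roughly (ceil(C/4) * W) x (N * H).
  const paddle::lite::DDim image_dims = converter.InitImageDimInfoWith(tensor_dims);
  std::vector<uint16_t> image(image_dims.production() * 4);  // 4 fp16 values per texel
  converter.NCHWToImage(nchw.data(), image.data(), tensor_dims);
  // ... upload `image` to a cl::Image2D, run the kernel, read the result back ...
  std::vector<float> result(tensor_dims.production());
  converter.ImageToNCHW(image.data(), result.data(), image_dims, tensor_dims);
}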
6 changes: 6 additions & 0 deletions lite/core/arena/framework.h
Original file line number Diff line number Diff line change
@@ -159,6 +159,12 @@ class TestCase {
public:
const Instruction& instruction() { return *instruction_; }

#ifdef LITE_WITH_OPENCL
CLImageConverterDefault converter;
lite::Tensor input_image_cpu_tensor;
lite::Tensor input_cpu_tensor;
#endif

private:
std::unique_ptr<KernelContext> ctx_;
void CreateInstruction();
1 change: 1 addition & 0 deletions lite/kernels/host/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -29,6 +29,7 @@ add_kernel(pixel_shuffle_compute_host Host extra SRCS pixel_shuffle_compute.cc D
add_kernel(one_hot_compute_host Host extra SRCS one_hot_compute.cc DEPS ${lite_kernel_deps})
add_kernel(uniform_random_compute_host Host extra SRCS uniform_random_compute.cc DEPS ${lite_kernel_deps})
add_kernel(sequence_unpad_compute_host Host extra SRCS sequence_unpad_compute.cc DEPS ${lite_kernel_deps} sequence_padding)
add_kernel(shuffle_channel_compute_host Host extra SRCS shuffle_channel_compute.cc DEPS ${lite_kernel_deps})

if(LITE_BUILD_EXTRA AND LITE_WITH_x86)
lite_cc_test(test_where_index_compute_host SRCS where_index_compute.cc DEPS where_index_compute_host)
77 changes: 77 additions & 0 deletions lite/kernels/host/shuffle_channel_compute.cc
@@ -0,0 +1,77 @@
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "lite/kernels/host/shuffle_channel_compute.h"

namespace paddle {
namespace lite {
namespace kernels {
namespace host {
void shuffle_kernel(
float* output, const float* input, int group_row, int group_col, int len) {
for (int i = 0; i < group_row; ++i) {
for (int j = 0; j < group_col; ++j) {
const float* p_i = input + (i * group_col + j) * len;
float* p_o = output + (j * group_row + i) * len;
memcpy(p_o, p_i, len * sizeof(float));
}
}
}

void shuffle_channel(const float* inputs,
float* outputs,
int group,
int num,
int channel,
int height,
int width) {
int fea_size = channel * height * width;
int spatial_size = height * width;
int group_row = group;
int group_col = channel / group;
for (int i = 0; i < num; ++i) {
shuffle_kernel(outputs + i * fea_size,
inputs + i * fea_size,
group_row,
group_col,
spatial_size);
}
}
void ShuffleChannelCompute::Run() {
auto& param = Param<operators::ShuffleChannelParam>();
const float* x_data = param.X->data<float>();
float* output_data = param.Out->mutable_data<float>();
DDim x_dims = param.X->dims();
int group = param.group;
int num = param.X->dims()[0];
int channel = param.X->dims()[1];
int height = param.X->dims()[2];
int width = param.X->dims()[3];
shuffle_channel(x_data, output_data, group, num, channel, height, width);
}

} // namespace host
} // namespace kernels
} // namespace lite
} // namespace paddle

REGISTER_LITE_KERNEL(shuffle_channel,
kHost,
kFloat,
kNCHW,
paddle::lite::kernels::host::ShuffleChannelCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kHost))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
.Finalize();
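
As a quick sanity check of the host reference above (illustrative, not part of the PR): with channel = 4 and group = 2 the channels are reordered from [0, 1, 2, 3] to [0, 2, 1, 3].

// Illustrative check of the host shuffle_channel reference defined above.
#include <cstdio>

namespace paddle { namespace lite { namespace kernels { namespace host {
// Declaration matching the definition in shuffle_channel_compute.cc.
void shuffle_channel(const float* inputs, float* outputs, int group,
                     int num, int channel, int height, int width);
}}}}  // namespace paddle::lite::kernels::host

int main() {
  // N=1, C=4, H=1, W=2; each channel holds two copies of its own index.
  const float input[8] = {0, 0, 1, 1, 2, 2, 3, 3};
  float output[8] = {0};
  paddle::lite::kernels::host::shuffle_channel(
      input, output, /*group=*/2, /*num=*/1, /*channel=*/4, /*height=*/1, /*width=*/2);
  for (int i = 0; i < 8; ++i) std::printf("%.0f ", output[i]);  // 0 0 2 2 1 1 3 3
  return 0;
}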
37 changes: 37 additions & 0 deletions lite/kernels/host/shuffle_channel_compute.h
@@ -0,0 +1,37 @@
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"

namespace paddle {
namespace lite {
namespace kernels {
namespace host {

class ShuffleChannelCompute
: public KernelLite<TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny)> {
public:
using param_t = operators::ShuffleChannelParam;

void Run() override;

virtual ~ShuffleChannelCompute() = default;
};

} // namespace host
} // namespace kernels
} // namespace lite
} // namespace paddle
2 changes: 1 addition & 1 deletion lite/kernels/opencl/CMakeLists.txt
@@ -37,7 +37,7 @@ add_kernel(pad2d_opencl_image OPENCL basic SRCS pad2d_image_compute.cc DEPS ${cl
add_kernel(box_coder_opencl_image OPENCL basic SRCS box_coder_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(pixel_shuffle_opencl_image OPENCL basic SRCS pixel_shuffle_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(expand_opencl_image OPENCL basic SRCS expand_image_compute.cc DEPS ${cl_kernel_deps})

add_kernel(shuffle_channel_opencl_image OPENCL basic SRCS shuffle_channel_image_compute.cc DEPS ${cl_kernel_deps})
# extra
# wait to add ...
