[cuDNN] Add cuDNN grouped convolutions support
Signed-off-by: Wei Pan <weip@nvidia.com>
wpan11nv committed Apr 15, 2020
1 parent e8138f7 commit abb7564
Showing 8 changed files with 162 additions and 81 deletions.
36 changes: 25 additions & 11 deletions python/tvm/contrib/cudnn.py
@@ -182,7 +182,8 @@ def conv_output_shape(tensor_format,
x_shape,
w_shape,
data_dtype,
conv_dtype):
conv_dtype,
groups=1):
"""Get output shape of 2D or 3D convolution
Parameters
@@ -205,6 +206,8 @@ def conv_output_shape(tensor_format,
data type
conv_dtype: str
convolution type
groups: int
number of groups
Returns
-------
@@ -228,7 +231,8 @@ def conv_output_shape(tensor_format,
_get_np_int32_array_handle(wshape),
_get_np_int32_array_handle(oshape),
data_dtype,
conv_dtype)
conv_dtype,
groups)
return list(oshape)


@@ -240,7 +244,8 @@ def conv_find_algo(tensor_format,
w_shape,
y_shape,
data_dtype,
conv_dtype):
conv_dtype,
groups=1):
"""Choose the best algo for the given input.
Parameters
@@ -265,6 +270,8 @@ def conv_find_algo(tensor_format,
data type
conv_dtype: str
convolution type
groups: int
number of groups
Returns
-------
@@ -287,7 +294,8 @@ def conv_find_algo(tensor_format,
_get_np_int32_array_handle(wshape),
_get_np_int32_array_handle(yshape),
data_dtype,
conv_dtype)
conv_dtype,
groups)


def conv_forward(x,
@@ -298,7 +306,8 @@ def conv_forward(x,
conv_mode,
tensor_format,
algo,
conv_dtype):
conv_dtype,
groups=1):
"""Create an extern op that compute 2D or 3D convolution with CuDNN
Parameters
@@ -325,6 +334,8 @@ def conv_forward(x,
if algo == -1, the best algo will be chosen by CUDNN
conv_dtype: str
convolution type
groups: int
the number of groups
Returns
-------
@@ -335,8 +346,7 @@ def conv_forward(x,
assert dims in (4, 5)

conv_dtype = x.dtype if conv_dtype is None else conv_dtype
pad, stride, dilation, _, _ = \
_prepare_global_func_params(dims - 2, pad, stride, dilation)
pad, stride, dilation, _, _ = _prepare_global_func_params(dims - 2, pad, stride, dilation)

oshape = conv_output_shape(tensor_format,
pad,
@@ -345,7 +355,8 @@
list(x.shape),
list(w.shape),
x.dtype,
conv_dtype)
conv_dtype,
groups)
if algo == -1:
# For now if we try to call `cudnnFindConvolutionForwardAlgorithm` when
# using INT8 data type, CuDNN will crash.
@@ -361,7 +372,8 @@
list(w.shape),
oshape,
x.dtype,
conv_dtype)
conv_dtype,
groups)

if dims == 4:
return te.extern(
@@ -380,7 +392,8 @@
ins[0],
ins[1],
outs[0],
conv_dtype), name="y")
conv_dtype,
groups), name="y")

return te.extern(
oshape, [x, w],
@@ -401,7 +414,8 @@ def conv_forward(x,
ins[0],
ins[1],
outs[0],
conv_dtype), name="y")
conv_dtype,
groups), name="y")

def softmax(x, axis=-1):
"""Compute softmax using CuDNN
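For reference, a minimal sketch (not part of this commit) of how the extended Python binding might be called for a grouped convolution. The keyword names follow the hunks above, conv_mode=1 and tensor_format=0 are the usual cuDNN codes for cross-correlation and NCHW, and actually running it assumes a TVM build with cuDNN enabled:

import tvm
from tvm import te
from tvm.contrib import cudnn

# depthwise-style grouped convolution: 32 input channels split into 32 groups
x = te.placeholder((1, 32, 56, 56), name="x", dtype="float32")   # NCHW input
w = te.placeholder((32, 1, 3, 3), name="w", dtype="float32")     # (out_c, in_c // groups, kh, kw)

y = cudnn.conv_forward(x, w,
                       pad=[1, 1], stride=[1, 1], dilation=[1, 1],
                       conv_mode=1,       # cross-correlation
                       tensor_format=0,   # NCHW
                       algo=-1,           # let conv_find_algo pick the algorithm
                       conv_dtype="float32",
                       groups=32)
s = te.create_schedule(y.op)   # can then be built with tvm.build for a CUDA target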
12 changes: 12 additions & 0 deletions python/tvm/relay/op/strategy/cuda.py
@@ -91,6 +91,9 @@ def schedule_lrn_cuda(attrs, outs, target):
@conv2d_strategy.register(["cuda", "gpu"])
def conv2d_strategy_cuda(attrs, inputs, out_type, target):
"""conv2d cuda strategy"""

strategy = _op.OpStrategy()
data, kernel = inputs
stride_h, stride_w = attrs.get_int_tuple("strides")
@@ -196,6 +199,15 @@ def conv2d_strategy_cuda(attrs, inputs, out_type, target):
name="group_conv2d_NCHWc_int8.cuda")
else:
raise RuntimeError("Unsupported group_conv2d layout {}".format(layout))
# add cudnn implementation
if target.target_name == "cuda" and "cudnn" in target.libs:
if layout in ["NCHW", "NHWC"] and padding[0] == padding[2] and \
padding[1] == padding[3]:
strategy.add_implementation(
wrap_compute_conv2d(topi.cuda.conv2d_cudnn, True, has_groups=True),
wrap_topi_schedule(topi.cuda.schedule_conv2d_cudnn),
name="conv2d_cudnn.cuda",
plevel=15)
return strategy

@conv2d_winograd_without_weight_transfrom_strategy.register(["cuda", "gpu"])
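The implementation added here is registered at plevel 15, so it takes priority over the default TOPI conv2d implementations whenever the target lists cudnn and the padding is symmetric. A hypothetical Relay-level way to exercise it (API names are from roughly contemporary TVM, not from this commit):

import tvm
from tvm import relay

data = relay.var("data", shape=(1, 32, 56, 56), dtype="float32")
weight = relay.var("weight", shape=(32, 1, 3, 3), dtype="float32")
conv = relay.nn.conv2d(data, weight, kernel_size=(3, 3), padding=(1, 1),
                       groups=32, channels=32)
mod = tvm.IRModule.from_expr(relay.Function([data, weight], conv))

# "cudnn" in target.libs is what enables the branch added above
target = "cuda -libs=cudnn"
with tvm.transform.PassContext(opt_level=3):
    lib = relay.build(mod, target=target)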
37 changes: 25 additions & 12 deletions src/runtime/contrib/cudnn/conv_forward.cc
@@ -35,6 +35,7 @@ void ConvolutionForward(
int format,
int algo,
int dims,
int groups,
const int pad[],
const int stride[],
const int dilation[],
@@ -62,8 +63,10 @@

// Note: For 2D tensor, using ND setters causes CUDNN_STATUS_NOT_SUPPORTED error
// in following cudnnGetConvolutionForwardWorkspaceSize() when data type is fp16, int

CUDNN_CALL(cudnnSetConvolutionGroupCount(entry_ptr->conv_entry.conv_desc, groups));
if (dims == 2) {
// Set Desc
CUDNN_CALL(cudnnSetConvolution2dDescriptor(entry_ptr->conv_entry.conv_desc,
pad[0],
pad[1],
@@ -183,6 +186,7 @@
void OutputShape(
int format,
int dims,
int groups,
const int pad[],
const int stride[],
const int dilation[],
@@ -202,6 +206,7 @@
int full_dims = dims + 2;

// conv desc
CUDNN_CALL(cudnnSetConvolutionGroupCount(entry_ptr->conv_entry.conv_desc, groups));
CUDNN_CALL(cudnnSetConvolutionNdDescriptor(entry_ptr->conv_entry.conv_desc,
dims,
pad,
@@ -240,6 +245,7 @@ void OutputShape(
// Set Input
std::vector<int> tensor_stride(full_dims);
GetCudnnStride(full_dims, x_dim, tensor_stride.data());

CUDNN_CALL(cudnnSetTensorNdDescriptor(entry_ptr->conv_entry.input_desc,
data_type,
full_dims,
@@ -264,6 +270,7 @@
void FindAlgo(
int format,
int dims,
int groups,
const int pad[],
const int stride[],
const int dilation[],
@@ -284,6 +291,7 @@
int full_dims = dims + 2;

// conv desc
CUDNN_CALL(cudnnSetConvolutionGroupCount(entry_ptr->conv_entry.conv_desc, groups));
CUDNN_CALL(cudnnSetConvolutionNdDescriptor(entry_ptr->conv_entry.conv_desc,
dims,
pad,
@@ -360,16 +368,18 @@ TVM_REGISTER_GLOBAL("tvm.contrib.cudnn.conv2d.forward")
int algo = args[2];
int pad_v[2], stride_v[2], dilation_v[2];
for (int i = 0; i < 2; i++) {
pad_v[i] = args[3 + i];
stride_v[i] = args[5 + i];
dilation_v[i] = args[7 + i];
}
DLTensor* x = args[9];
DLTensor* w = args[10];
DLTensor* y = args[11];
std::string conv_dtype = args[12];
int groups = args[13];

ConvolutionForward(mode, format, algo, 2, pad_v, stride_v, dilation_v, x, w, y, conv_dtype);
ConvolutionForward(mode, format, algo, 2, groups, pad_v, stride_v,
dilation_v, x, w, y, conv_dtype);
});


@@ -380,17 +390,18 @@ TVM_REGISTER_GLOBAL("tvm.contrib.cudnn.conv3d.forward")
int algo = args[2];
int pad_v[3], stride_v[3], dilation_v[3];
for (int i = 0; i < 3; i++) {
pad_v[i] = args[3 + i];
stride_v[i] = args[6 + i];
dilation_v[i] = args[9 + i];
}
DLTensor *x = args[12];
DLTensor *w = args[13];
DLTensor *y = args[14];
std::string conv_dtype = args[15];
int groups = args[16];

ConvolutionForward(mode, format, algo, 3, pad_v, stride_v, dilation_v, x, w, y,
conv_dtype);
ConvolutionForward(mode, format, algo, 3, groups, pad_v, stride_v,
dilation_v, x, w, y, conv_dtype);
});


@@ -406,8 +417,9 @@ TVM_REGISTER_GLOBAL("tvm.contrib.cudnn.conv.output_shape")
void* out_shape = args[7];
std::string data_dtype = args[8];
std::string conv_dtype = args[9];
int groups = args[10];

OutputShape(format, dims, pad, stride, dilation, x_dim,
OutputShape(format, dims, groups, pad, stride, dilation, x_dim,
w_dim, out_shape, data_dtype, conv_dtype);
});

@@ -424,8 +436,9 @@ TVM_REGISTER_GLOBAL("tvm.contrib.cudnn.conv.find_algo")
int* y_dim = static_cast<int*>(static_cast<void*>(args[7]));
std::string data_dtype = args[8];
std::string conv_dtype = args[9];
int groups = args[10];

FindAlgo(format, dims, pad, stride, dilation, x_dim,
FindAlgo(format, dims, groups, pad, stride, dilation, x_dim,
w_dim, y_dim, data_dtype, conv_dtype, ret);
});

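On the C++ side each entry point now takes groups right after dims and calls cudnnSetConvolutionGroupCount before the convolution descriptor is configured, and the registered packed functions read groups as one extra trailing argument. A small sketch of exercising the output-shape path from Python (keyword names assumed from the cudnn.py hunks above; needs a cuDNN-enabled build):

from tvm.contrib import cudnn

# grouped 3x3 convolution over a (1, 32, 56, 56) NCHW input with 32 groups
oshape = cudnn.conv_output_shape(tensor_format=0,          # NCHW
                                 pad=[1, 1],
                                 stride=[1, 1],
                                 dilation=[1, 1],
                                 x_shape=[1, 32, 56, 56],
                                 w_shape=[32, 1, 3, 3],
                                 data_dtype="float32",
                                 conv_dtype="float32",
                                 groups=32)
print(oshape)   # expected: [1, 32, 56, 56]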
1 change: 0 additions & 1 deletion src/runtime/contrib/cudnn/cudnn_utils.h
@@ -78,7 +78,6 @@ struct ConvEntry {
runtime::DeviceAPI *cuda_api;
void *workspace{nullptr};
size_t workspace_size{0};
int group_count {0};
ConvEntry();
~ConvEntry();
void UpdateWorkspace(const size_t wsize);
(Diffs for the remaining four changed files are not shown.)