[Runtime][Contrib] Support cudnn softmax #5214

Merged · 6 commits · Apr 6, 2020

24 changes: 24 additions & 0 deletions python/tvm/contrib/cudnn.py
@@ -402,3 +402,27 @@ def conv_forward(x,
            ins[1],
            outs[0],
            conv_dtype), name="y")


def softmax(x, axis=-1):
    """Compute softmax using cuDNN

    Parameters
    ----------
    x : tvm.te.Tensor
        The input tensor

    axis : int
        The axis along which the softmax is computed

    Returns
    -------
    ret : tvm.te.Tensor
        The result tensor
    """
    return te.extern(
        x.shape, [x],
        lambda ins, outs: tvm.tir.call_packed(
            "tvm.contrib.cudnn.softmax.forward",
            ins[0],
            outs[0],
            axis), name="y")
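
For reference, a minimal sketch of driving this wrapper end to end, assuming a TVM build with both CUDA and cuDNN enabled (it mirrors the `verify_softmax` test added later in this PR):

```python
import numpy as np
import tvm
from tvm import te
from tvm.contrib import cudnn

# Offload softmax to cuDNN through the te.extern call above.
A = te.placeholder((32, 10), dtype="float32", name="A")
B = cudnn.softmax(A, axis=-1)
s = te.create_schedule([B.op])
f = tvm.build(s, [A, B], "cuda", target_host="llvm", name="softmax")

ctx = tvm.gpu(0)
a = tvm.nd.array(np.random.uniform(size=(32, 10)).astype("float32"), ctx)
b = tvm.nd.array(np.zeros((32, 10), dtype="float32"), ctx)
f(a, b)  # b now holds the row-wise softmax of a, computed by cudnnSoftmaxForward
```
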
4 changes: 2 additions & 2 deletions python/tvm/relay/op/nn/_nn.py
@@ -34,12 +34,12 @@


# softmax
-reg.register_schedule("nn.softmax", strategy.schedule_softmax)
+reg.register_strategy("nn.softmax", strategy.softmax_strategy)
reg.register_pattern("nn.softmax", OpPattern.OPAQUE)


# log_softmax
-reg.register_schedule("nn.log_softmax", strategy.schedule_softmax)
+reg.register_schedule("nn.log_softmax", strategy.schedule_log_softmax)
reg.register_pattern("nn.log_softmax", OpPattern.OPAQUE)

5 changes: 5 additions & 0 deletions python/tvm/relay/op/op_attrs.py
@@ -59,6 +59,11 @@ class DenseAttrs(Attrs):
"""Attributes for nn.dense"""


@tvm._ffi.register_object("relay.attrs.SoftmaxAttrs")
class SoftmaxAttrs(Attrs):
"""Attributes for nn.softmax"""


@tvm._ffi.register_object("relay.attrs.FIFOBufferAttrs")
class FIFOBufferAttrs(Attrs):
"""Attributes for nn.fifo_buffer"""
Expand Down
22 changes: 19 additions & 3 deletions python/tvm/relay/op/strategy/cuda.py
@@ -58,9 +58,25 @@ def schedule_adaptive_pool_cuda(attrs, outs, target):
    with target:
        return topi.cuda.schedule_adaptive_pool(outs)

-@schedule_softmax.register(["cuda", "gpu"])
-def schedule_softmax_cuda(attrs, outs, target):
-    """schedule softmax for cuda"""
+@softmax_strategy.register(["cuda", "gpu"])
+def softmax_strategy_cuda(attrs, inputs, out_type, target):
+    """softmax cuda strategy"""
+    strategy = _op.OpStrategy()
+    strategy.add_implementation(
+        wrap_compute_softmax(topi.nn.softmax),
+        wrap_topi_schedule(topi.cuda.schedule_softmax),
+        name="softmax.cuda")
+    if target.target_name == "cuda" and "cudnn" in target.libs:
+        strategy.add_implementation(
+            wrap_compute_softmax(topi.cuda.softmax_cudnn),
+            wrap_topi_schedule(topi.cuda.schedule_softmax_cudnn),
+            name="softmax.cudnn",
+            plevel=15)
+    return strategy
+
+@schedule_log_softmax.register(["cuda", "gpu"])
+def schedule_log_softmax_cuda(attrs, outs, target):
+    """schedule log_softmax for cuda"""
    with target:
        return topi.cuda.schedule_softmax(outs)

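For context on how this strategy gets picked, a minimal sketch at the Relay level (API names as of this PR's TVM version; the target string is the standard way to populate `target.libs`):

```python
import tvm
from tvm import relay

x = relay.var("x", shape=(8, 1024), dtype="float32")
mod = tvm.IRModule.from_expr(relay.Function([x], relay.nn.softmax(x, axis=-1)))

# A plain "cuda" target selects softmax.cuda. Adding -libs=cudnn puts "cudnn"
# into target.libs, so the strategy above also registers softmax.cudnn at
# plevel=15, which outranks the default implementation priority of 10.
lib = relay.build(mod, target="cuda -libs=cudnn", target_host="llvm")
```
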
22 changes: 20 additions & 2 deletions python/tvm/relay/op/strategy/generic.py
@@ -107,9 +107,27 @@ def schedule_adaptive_pool(attrs, outs, target):
        return topi.generic.schedule_adaptive_pool(outs)

# softmax
+def wrap_compute_softmax(topi_compute):
+    """Wrap softmax topi compute"""
+    def _compute_softmax(attrs, inputs, out_type):
+        axis = attrs.get_int("axis")
+        return [topi_compute(inputs[0], axis)]
+    return _compute_softmax
+
+@override_native_generic_func("softmax_strategy")
+def softmax_strategy(attrs, inputs, out_type, target):
+    """softmax generic strategy"""
+    strategy = _op.OpStrategy()
+    strategy.add_implementation(
+        wrap_compute_softmax(topi.nn.softmax),
+        wrap_topi_schedule(topi.generic.schedule_softmax),
+        name="softmax.generic")
+    return strategy
+
+# log_softmax
@generic_func
-def schedule_softmax(attrs, outs, target):
-    """Schedule softmax"""
+def schedule_log_softmax(attrs, outs, target):
+    """Schedule log_softmax op"""
    with target:
        return topi.generic.schedule_softmax(outs)

16 changes: 13 additions & 3 deletions python/tvm/relay/op/strategy/hls.py
@@ -50,9 +50,19 @@ def schedule_adaptive_pool_hls(attrs, outs, target):
    with target:
        return topi.hls.schedule_adaptive_pool(outs)

-@schedule_softmax.register("hls")
-def schedule_softmax_hls(attrs, outs, target):
-    """schedule softmax for hls"""
+@softmax_strategy.register("hls")
+def softmax_strategy_hls(attrs, inputs, out_type, target):
+    """softmax hls strategy"""
+    strategy = _op.OpStrategy()
+    strategy.add_implementation(
+        wrap_compute_softmax(topi.nn.softmax),
+        wrap_topi_schedule(topi.hls.schedule_softmax),
+        name="softmax.hls")
+    return strategy
+
+@schedule_log_softmax.register("hls")
+def schedule_log_softmax_hls(attrs, outs, target):
+    """schedule log_softmax for hls"""
    with target:
        return topi.hls.schedule_softmax(outs)

16 changes: 13 additions & 3 deletions python/tvm/relay/op/strategy/opengl.py
@@ -44,9 +44,19 @@ def schedule_adaptive_pool_opengl(attrs, outs, target):
    with target:
        return topi.opengl.schedule_adaptive_pool(outs)

-@schedule_softmax.register("opengl")
-def schedule_softmax_opengl(attrs, outs, target):
-    """schedule softmax for opengl"""
+@softmax_strategy.register("opengl")
+def softmax_strategy_opengl(attrs, inputs, out_type, target):
+    """softmax opengl strategy"""
+    strategy = _op.OpStrategy()
+    strategy.add_implementation(
+        wrap_compute_softmax(topi.nn.softmax),
+        wrap_topi_schedule(topi.opengl.schedule_softmax),
+        name="softmax.opengl")
+    return strategy
+
+@schedule_log_softmax.register("opengl")
+def schedule_log_softmax_opengl(attrs, outs, target):
+    """schedule log_softmax for opengl"""
    with target:
        return topi.opengl.schedule_softmax(outs)

16 changes: 13 additions & 3 deletions python/tvm/relay/op/strategy/x86.py
@@ -55,9 +55,19 @@ def schedule_adaptive_pool_cpu(attrs, outs, target):
    with target:
        return topi.x86.schedule_adaptive_pool(outs)

-@schedule_softmax.register("cpu")
-def schedule_softmax_cpu(attrs, outs, target):
-    """schedule softmax for x86"""
+@softmax_strategy.register("cpu")
+def softmax_strategy_cpu(attrs, inputs, out_type, target):
+    """softmax x86 strategy"""
+    strategy = _op.OpStrategy()
+    strategy.add_implementation(
+        wrap_compute_softmax(topi.nn.softmax),
+        wrap_topi_schedule(topi.x86.schedule_softmax),
+        name="softmax.x86")
+    return strategy
+
+@schedule_log_softmax.register("cpu")
+def schedule_log_softmax_cpu(attrs, outs, target):
+    """schedule log_softmax op for x86"""
    with target:
        return topi.x86.schedule_softmax(outs)

9 changes: 1 addition & 8 deletions src/relay/op/nn/nn.cc
@@ -347,14 +347,7 @@ RELAY_REGISTER_OP("nn.softmax")
.set_num_inputs(1)
.add_argument("data", "Tensor", "The input tensor.")
.set_support_level(1)
-.add_type_rel("Identity", IdentityRel)
-.set_attr<FTVMCompute>("FTVMCompute", [](const Attrs& attrs,
-                                         const Array<te::Tensor>& inputs,
-                                         const Type& out_type) {
-  const auto* param = attrs.as<SoftmaxAttrs>();
-  CHECK(param != nullptr);
-  return Array<te::Tensor>{ topi::nn::softmax(inputs[0], param->axis) };
-});
+.add_type_rel("Identity", IdentityRel);


// relay.nn.log_softmax

10 changes: 10 additions & 0 deletions src/runtime/contrib/cudnn/cudnn_utils.cc
@@ -140,5 +140,15 @@ void ConvEntry::CleanWorkspace() {
  workspace_size = 0;
}

// SoftmaxEntry

SoftmaxEntry::SoftmaxEntry() {
  CUDNN_CALL(cudnnCreateTensorDescriptor(&shape_desc));
}

SoftmaxEntry::~SoftmaxEntry() {
  CUDNN_CALL(cudnnDestroyTensorDescriptor(shape_desc));
}

} // namespace contrib
} // namespace tvm
8 changes: 8 additions & 0 deletions src/runtime/contrib/cudnn/cudnn_utils.h
@@ -85,12 +85,20 @@ struct ConvEntry {
  void CleanWorkspace();
};  // ConvThreadEntry

struct SoftmaxEntry {
  cudnnSoftmaxMode_t mode;
  cudnnDataType_t data_type;
  cudnnTensorDescriptor_t shape_desc;
  SoftmaxEntry();
  ~SoftmaxEntry();
};  // SoftmaxEntry

struct CuDNNThreadEntry {
  CuDNNThreadEntry();
  ~CuDNNThreadEntry();
  cudnnHandle_t handle{nullptr};
  ConvEntry conv_entry;
  SoftmaxEntry softmax_entry;
  runtime::DeviceAPI *cuda_api{nullptr};
  static CuDNNThreadEntry* ThreadLocal();
};  // CuDNNThreadEntry
94 changes: 94 additions & 0 deletions src/runtime/contrib/cudnn/softmax.cc
@@ -0,0 +1,94 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

/*!
 * \file src/runtime/contrib/cudnn/softmax.cc
 * \brief Use external cudnn softmax function
 */
#include <tvm/runtime/registry.h>
#include <tvm/runtime/device_api.h>
#include "cudnn_utils.h"

namespace tvm {
namespace contrib {

using namespace runtime;

TVM_REGISTER_GLOBAL("tvm.contrib.cudnn.softmax.forward")
.set_body([](TVMArgs args, TVMRetValue *ret) {
  DLTensor* x = args[0];
  DLTensor* y = args[1];
  int axis = args[2];
  int ndim = x->ndim;
  int64_t* shape = x->shape;
  if (axis < 0) axis += ndim;
  CHECK(axis >= 0 && axis < ndim);

  CuDNNThreadEntry* entry_ptr = CuDNNThreadEntry::ThreadLocal();
  entry_ptr->softmax_entry.data_type = CuDNNDataType::DLTypeToCuDNNType(x->dtype);

  // Set mode and shape descriptor
  if (axis == ndim - 1) {
    int64_t N = 1;
[Review comment (Member)] I'm confused why we need int64_t here but later cast to int.

[Reply (Member Author)] It's because DLTensor defines the shape in int64_t type. There'll be a cast anyway.

    for (int i = 0; i < ndim - 1; ++i) {
      N *= shape[i];
    }
    entry_ptr->softmax_entry.mode = CUDNN_SOFTMAX_MODE_INSTANCE;
    CUDNN_CALL(cudnnSetTensor4dDescriptor(entry_ptr->softmax_entry.shape_desc,
                                          CUDNN_TENSOR_NCHW,
                                          entry_ptr->softmax_entry.data_type,
                                          static_cast<int>(N),
                                          static_cast<int>(shape[ndim - 1]),
                                          1,
                                          1));
  } else {
    int64_t pre_axis_dim = 1;
    int64_t post_axis_dim = 1;
    for (int i = 0; i < ndim; ++i) {
      if (i < axis) {
        pre_axis_dim *= shape[i];
      } else if (i > axis) {
        post_axis_dim *= shape[i];
      }
    }
    entry_ptr->softmax_entry.mode = CUDNN_SOFTMAX_MODE_CHANNEL;
    CUDNN_CALL(cudnnSetTensor4dDescriptor(entry_ptr->softmax_entry.shape_desc,
                                          CUDNN_TENSOR_NCHW,
                                          entry_ptr->softmax_entry.data_type,
                                          static_cast<int>(pre_axis_dim),
                                          static_cast<int>(shape[axis]),
                                          static_cast<int>(post_axis_dim),
                                          1));
  }

  auto alpha = CuDNNDataType::GetConst<1>(entry_ptr->softmax_entry.data_type);
  auto beta = CuDNNDataType::GetConst<0>(entry_ptr->softmax_entry.data_type);
  CUDNN_CALL(cudnnSoftmaxForward(entry_ptr->handle,
                                 CUDNN_SOFTMAX_ACCURATE,
                                 entry_ptr->softmax_entry.mode,
                                 alpha,
                                 entry_ptr->softmax_entry.shape_desc,
                                 x->data,
                                 beta,
                                 entry_ptr->softmax_entry.shape_desc,
                                 y->data));
});

} // namespace contrib
} // namespace tvm
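
To summarize the mapping implemented above: cuDNN's softmax operates on 4-D NCHW tensors, so the packed function folds an arbitrary `(shape, axis)` pair into a 4-D descriptor plus a mode. A Python sketch of the same logic (`cudnn_softmax_descriptor` is a hypothetical helper for illustration, not part of the PR):

```python
def cudnn_softmax_descriptor(shape, axis):
    """Fold (shape, axis) into the (N, C, H, W) + mode pair used above."""
    ndim = len(shape)
    axis = axis + ndim if axis < 0 else axis
    if axis == ndim - 1:
        # Last-axis softmax: flatten leading dims into N and reduce over C,
        # using MODE_INSTANCE (one softmax per "image" of C*1*1 values).
        n = 1
        for dim in shape[:-1]:
            n *= dim
        return (n, shape[-1], 1, 1), "CUDNN_SOFTMAX_MODE_INSTANCE"
    # Interior axis: put it in C, split the rest into N and H, and use
    # MODE_CHANNEL (one softmax across C per (n, h, w) position).
    pre = 1
    for dim in shape[:axis]:
        pre *= dim
    post = 1
    for dim in shape[axis + 1:]:
        post *= dim
    return (pre, shape[axis], post, 1), "CUDNN_SOFTMAX_MODE_CHANNEL"

# Example: the 4-D test below uses axis=1 on (1, 16, 256, 256), giving
# cudnn_softmax_descriptor((1, 16, 256, 256), 1)
#   == ((1, 16, 65536, 1), "CUDNN_SOFTMAX_MODE_CHANNEL")
```
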
46 changes: 46 additions & 0 deletions tests/python/contrib/test_cudnn.py
@@ -158,6 +158,52 @@ def verify():
def test_conv3d():
    verify_conv3d("float32", "float32", tensor_format=0)


def verify_softmax(shape, axis, dtype="float32"):
    A = te.placeholder(shape, dtype=dtype, name='A')
    B = cudnn.softmax(A, axis)
    s = te.create_schedule([B.op])

    ctx = tvm.gpu(0)
    a_np = np.random.uniform(size=shape).astype(dtype)
    b_np = topi.testing.softmax_python(a_np)
    a = tvm.nd.array(a_np, ctx)
    b = tvm.nd.array(b_np, ctx)
    f = tvm.build(s, [A, B], "cuda", target_host="llvm", name="softmax")
    f(a, b)
    tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-3)

def verify_softmax_4d(shape, dtype="float32"):
    A = te.placeholder(shape, dtype=dtype, name='A')
    B = cudnn.softmax(A, axis=1)
    s = te.create_schedule([B.op])

    ctx = tvm.gpu(0)
    n, c, h, w = shape
    a_np = np.random.uniform(size=shape).astype(dtype)
    b_np = topi.testing.softmax_python(a_np.transpose(0, 2, 3, 1).reshape(h*w, c))
    b_np = b_np.reshape(n, h, w, c).transpose(0, 3, 1, 2)
    a = tvm.nd.array(a_np, ctx)
    b = tvm.nd.array(b_np, ctx)
    f = tvm.build(s, [A, B], "cuda", target_host="llvm", name="softmax")
    f(a, b)
    tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-3)

def test_softmax():
    if not tvm.runtime.enabled("cuda"):
        print("skip because cuda is not enabled...")
        return
    if not tvm.get_global_func("tvm.contrib.cudnn.conv.output_shape", True):
        print("skip because cudnn is not enabled...")
        return

    verify_softmax((32, 10), -1)
    verify_softmax((3, 4), -1)
    verify_softmax((1, 5), -1, "float64")
    verify_softmax_4d((1, 16, 256, 256))
    verify_softmax_4d((1, 16, 256, 256), "float64")

if __name__ == "__main__":
    test_conv2d()
    test_conv3d()
    test_softmax()

2 changes: 1 addition & 1 deletion topi/python/topi/cuda/__init__.py
@@ -32,7 +32,7 @@
from .deformable_conv2d import *
from .conv3d import *
from .reduction import schedule_reduce
-from .softmax import schedule_softmax
+from .softmax import *
from .injective import schedule_injective, schedule_elemwise, schedule_broadcast
from .dense import *
from .pooling import *