diff --git a/python/tvm/topi/cuda/scan.py b/python/tvm/topi/cuda/scan.py index 3240ebcd515c..25367bb7b04c 100644 --- a/python/tvm/topi/cuda/scan.py +++ b/python/tvm/topi/cuda/scan.py @@ -104,7 +104,7 @@ def exclusive_scan_ir(data, output, reduction=None, binop=tvm.tir.generic.add, i # The following algorithm performs parallel exclusive scan # Up Sweep of exclusive scan lim = tvm.tir.generic.cast( - tvm.tir.ceil(tvm.tir.log2(tvm.tir.generic.cast(scan_axis_size, "float64"))), "int64" + tvm.tir.ceil(tvm.tir.log2(tvm.tir.generic.cast(scan_axis_size, "float32"))), "int64" ) with ib.for_range(0, lim, dtype="int64") as l2_width: width = 2 << l2_width diff --git a/python/tvm/topi/cuda/sort.py b/python/tvm/topi/cuda/sort.py index 5ebd3060a6bb..5e6108737cd6 100644 --- a/python/tvm/topi/cuda/sort.py +++ b/python/tvm/topi/cuda/sort.py @@ -239,7 +239,7 @@ def compare(a, b): # Sort the lower levels of the merge using odd-even sort, it's fast for small inputs lower_lim = tvm.tir.generic.cast( - tvm.tir.ceil(tvm.tir.log2(tvm.tir.generic.cast(block_size, "float64"))), "int64" + tvm.tir.ceil(tvm.tir.log2(tvm.tir.generic.cast(block_size, "float32"))), "int64" ) _odd_even_sort( @@ -255,7 +255,7 @@ def compare(a, b): ) upper_lim = tvm.tir.generic.cast( - tvm.tir.ceil(tvm.tir.log2(tvm.tir.generic.cast(size, "float64"))), "int64" + tvm.tir.ceil(tvm.tir.log2(tvm.tir.generic.cast(size, "float32"))), "int64" ) def get_merge_begin(source, base_idx, aCount, bCount, aStart, bStart, diag, step_count):