Skip to content

Commit

Permalink
Add more documentation
Browse files Browse the repository at this point in the history
  • Loading branch information
aktech authored and mergify[bot] committed May 10, 2021
1 parent d7f4b1f commit 4297ba5
Show file tree
Hide file tree
Showing 3 changed files with 17 additions and 2 deletions.
2 changes: 1 addition & 1 deletion sgkit/distance/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ def pairwise_distance(
Omit to let dask heuristically decide a good default. A default can
also be set globally with the split_every key in dask.config.
device
The architecture to run the calculation on, either of cpu or gpu
The architecture to run the calculation on, either "cpu" or "gpu"
Returns
-------
Expand Down
15 changes: 15 additions & 0 deletions sgkit/distance/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,18 @@ def call_metric_kernel(
out = np.zeros((f.shape[0], g.shape[0], N_MAP_PARAM[metric]), dtype=f.dtype)
d_out = cuda.to_device(out)

# https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features-and-technical-specifications__technical-specifications-per-compute-capability
# These apply to compute capability 2.0 and higher and all GPUs NVIDIA has
# shipped in the past 10+ years have compute capability > 3.0.
# One way to get the compute capability programmatically is via:
# from numba import cuda
# cuda.get_current_device().compute_capability

# In the future, when the average GPU supports a larger number of
# threads per block, we can increase this to that value, parameterise
# it from the pairwise function, or query the maximum possible value
# for a given compute capability.

threads_per_block = (32, 32)
blocks_per_grid = (
math.ceil(out.shape[0] / threads_per_block[0]),
Expand All @@ -211,6 +223,9 @@ def _correlation(
) -> None: # pragma: no cover.
# Note: assigning variable and only saving the final value in the
# array made this significantly faster.

# Aggressively giving every variable an explicit type
# makes this function more performant by a factor of ~2-3x.
v0 = types.float32(0)
v1 = types.float32(0)
v2 = types.float32(0)
Expand Down
2 changes: 1 addition & 1 deletion sgkit/tests/test_distance.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

def detect_cuda_driver() -> bool:
try:
return bool(len(cuda.list_devices()))
return len(cuda.list_devices()) > 0
except cuda.CudaSupportError:
return False

Expand Down

0 comments on commit 4297ba5

Please sign in to comment.