Commit
added integration test to help identify which functions affect GPU and fixed API interface
gerwin3 committed Aug 10, 2023
1 parent 19a8d67 commit ad56eaa
Showing 12 changed files with 253 additions and 49 deletions.
4 changes: 2 additions & 2 deletions crates/async-cuda-npp/src/copy_constant_border.rs
@@ -98,7 +98,7 @@ mod tests {
&input,
&mut output,
&ConstantBorder::black(10, 20),
- &Stream::null(),
+ &Stream::null().await,
)
.await
.unwrap();
@@ -113,7 +113,7 @@ mod tests {
&input,
&mut output,
&ConstantBorder::black(10, 20),
- &Stream::null(),
+ &Stream::null().await,
)
.await
.unwrap();
8 changes: 4 additions & 4 deletions crates/async-cuda-npp/src/ffi/context.rs
@@ -35,8 +35,8 @@ impl Context {
let mut raw = std::ptr::null_mut();
let raw_ptr = std::ptr::addr_of_mut!(raw);
// SAFETY:
- // * No need to execute this on the runtime since this call just initializes the
- //   `NppStreamContext` and is stateless for all intents and purposes.
+ // * Must call this function on runtime since `nppGetStreamContext` needs the correct thread
+ //   locals to determine current device and other context settings.
// * We can store a reference to the stream in `NppStreamContext` as long as we make sure
// `NppStreamContext` cannot outlive the stream, which we can guarantee because we take
// ownership of the stream.
@@ -76,8 +76,8 @@ impl Context {
let raw_ptr = std::ptr::addr_of_mut!(raw);
let stream_ptr = stream.inner().as_internal().as_ptr();
// SAFETY:
- // * No need to execute this on the runtime since this call just initializes the
- //   `NppStreamContext` and is stateless for all intents and purposes.
+ // * Must call this function on runtime since `nppGetStreamContext` needs the correct
+ //   thread locals to determine current device and other context settings.
// * We can store a reference to the stream in `NppStreamContext` as long as we make
// sure `NppStreamContext` cannot outlive the stream, which we can guarantee because
// we take ownership of the stream.
10 changes: 5 additions & 5 deletions crates/async-cuda-npp/src/remap.rs
@@ -128,7 +128,7 @@ mod tests {
let map_x = DeviceBuffer2D::<f32>::new(100, 100, 1).await;
let map_y = DeviceBuffer2D::<f32>::new(100, 100, 1).await;
let mut output = DeviceBuffer2D::<u8>::new(100, 100, 3).await;
- remap(&input, &mut output, &map_x, &map_y, &Stream::null())
+ remap(&input, &mut output, &map_x, &map_y, &Stream::null().await)
.await
.unwrap();
}
@@ -140,7 +140,7 @@ mod tests {
let map_x = DeviceBuffer2D::<f32>::new(100, 100, 1).await;
let map_y = DeviceBuffer2D::<f32>::new(100, 100, 1).await;
let mut output = DeviceBuffer2D::<u8>::new(100, 100, 2).await;
- remap(&input, &mut output, &map_x, &map_y, &Stream::null())
+ remap(&input, &mut output, &map_x, &map_y, &Stream::null().await)
.await
.unwrap();
}
@@ -152,7 +152,7 @@ mod tests {
let map_x = DeviceBuffer2D::<f32>::new(100, 100, 2).await;
let map_y = DeviceBuffer2D::<f32>::new(100, 100, 3).await;
let mut output = DeviceBuffer2D::<u8>::new(100, 100, 3).await;
- remap(&input, &mut output, &map_x, &map_y, &Stream::null())
+ remap(&input, &mut output, &map_x, &map_y, &Stream::null().await)
.await
.unwrap();
}
@@ -164,7 +164,7 @@ mod tests {
let map_x = DeviceBuffer2D::<f32>::new(120, 100, 1).await;
let map_y = DeviceBuffer2D::<f32>::new(120, 100, 1).await;
let mut output = DeviceBuffer2D::<u8>::new(100, 100, 3).await;
- remap(&input, &mut output, &map_x, &map_y, &Stream::null())
+ remap(&input, &mut output, &map_x, &map_y, &Stream::null().await)
.await
.unwrap();
}
@@ -176,7 +176,7 @@ mod tests {
let map_x = DeviceBuffer2D::<f32>::new(100, 120, 1).await;
let map_y = DeviceBuffer2D::<f32>::new(100, 120, 1).await;
let mut output = DeviceBuffer2D::<u8>::new(100, 100, 3).await;
- remap(&input, &mut output, &map_x, &map_y, &Stream::null())
+ remap(&input, &mut output, &map_x, &map_y, &Stream::null().await)
.await
.unwrap();
}
4 changes: 2 additions & 2 deletions crates/async-cuda-npp/src/resize.rs
@@ -156,7 +156,7 @@ mod tests {
Region::Full,
&mut output,
Region::Full,
- &Stream::null(),
+ &Stream::null().await,
)
.await
.unwrap();
@@ -172,7 +172,7 @@ mod tests {
Region::Full,
&mut output,
Region::Full,
- &Stream::null(),
+ &Stream::null().await,
)
.await
.unwrap();
2 changes: 1 addition & 1 deletion crates/async-cuda-npp/src/resize_batch.rs
@@ -268,7 +268,7 @@ mod tests {
&mut inputs_and_outputs_ref,
Region::Full,
Region::Full,
- &Stream::null(),
+ &Stream::null().await,
)
.await
.unwrap();
7 changes: 4 additions & 3 deletions crates/async-cuda-npp/src/stream.rs
@@ -27,9 +27,10 @@ impl Stream {
///
/// This type is a wrapper around the actual CUDA stream type: [`async_cuda_core::Stream`].
#[inline]
- pub fn null() -> Self {
+ pub async fn null() -> Self {
+     let context = Future::new(Context::from_null_stream).await;
Self {
-     context: Arc::new(Context::from_null_stream()),
+     context: Arc::new(context),
}
}

@@ -84,7 +85,7 @@ mod tests {

#[tokio::test]
async fn test_null() {
- let stream = Stream::null();
+ let stream = Stream::null().await;
assert!(!stream.to_context().as_ptr().is_null());
// SAFETY: This works because we know that the first field of the underlying
// `NppStreamContext` struct used internally is `hStream`, which should refer to the wrapped
@@ -0,0 +1,57 @@
use async_cuda_core::ffi::device::Device;
use async_cuda_core::stream::Stream;

use async_cuda_npp::ffi::context::Context;

/// This integration test helps determine which FFI functions affect the GPU state or thread-local
/// state.
///
/// This information is important to determine which functions need to be executed on the runtime
/// thread, and which functions can be executed directly by the caller (and don't need to be async).
///
/// We only test functions where it is not immediately apparent whether the function has
/// side-effects. Wrappers for NPP operations are not tested, since it is evident that they affect
/// the GPU state.
///
/// # Find GPU side-effects
///
/// Run this integration test under the Nsight profiler with the following command:
///
/// ```bash
/// nsys profile --output /tmp/side_effects_trace --force-overwrite true cargo test --release --test async_cuda_npp_functions_side_effects_test
/// ```
///
/// Use the `nsys-ui` utility to inspect the report produced in `/tmp/side_effects_trace.qdstrm` and
/// determine for each function call whether one or more CUDA API functions were invoked, and
/// whether the GPU was affected in any way. Function calls are separated by device synchronization
/// markers in the trace.
///
/// # Find thread-local side-effects
///
/// These need to be inferred from documentation or usage (or an educated guess).
///
/// # Results
///
/// | Function | Side-effect: GPU | Side-effect: thread-local |
/// | ----------------------------- | ---------------- | ------------------------- |
/// | `Context::from_null_stream` | ❌ | ✅ |
/// | `Context::from_stream` | ❌ | ✅ |
#[tokio::test]
async fn test_side_effects() {
    // The first block contains setup that we are not interested in measuring...
let stream = Stream::new().await.unwrap();

// A sequence of CUDA calls that is easy to find in the trace.
Device::synchronize().unwrap();
let _mem_info_1 = Device::memory_info().unwrap();
let _mem_info_2 = Device::memory_info().unwrap();
let _mem_info_3 = Device::memory_info().unwrap();
let _mem_info_4 = Device::memory_info().unwrap();
Device::synchronize().unwrap();

let _context_null = Context::from_null_stream();
Device::synchronize().unwrap();

let _context_new = Context::from_stream(stream);
Device::synchronize().unwrap();
}
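
The table above translates directly into the dispatch rule applied throughout this commit: calls with thread-local side-effects are executed on the runtime thread, while side-effect-free calls run synchronously on the caller's thread. Below is a minimal sketch of that pattern, mirroring the `Future::new(...).await` usage in `stream.rs` above; the import path for `Future` is an assumption, since the diff does not show it.

```rust
use async_cuda_npp::ffi::context::Context;

// Assumed path: the diff only shows `Future::new(...)` in use, not its import.
use async_cuda_core::runtime::Future;

/// `Context::from_null_stream` has a thread-local side-effect (see the table
/// above), so it is dispatched to the runtime thread rather than being called
/// directly on the caller's thread.
async fn null_context_on_runtime() -> Context {
    Future::new(Context::from_null_stream).await
}
```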
12 changes: 6 additions & 6 deletions crates/async-tensorrt/src/builder.rs
@@ -37,8 +37,8 @@ impl Builder {
///
/// [TensorRT documentation](https://docs.nvidia.com/deeplearning/tensorrt/api/c_api/classnvinfer1_1_1_i_builder.html#a68a8b59fbf86e42762b7087e6ffe6fb4)
#[inline(always)]
- pub async fn add_optimization_profile(&mut self) -> Result<()> {
-     Future::new(|| self.inner.add_optimization_profile()).await
+ pub fn add_optimization_profile(&mut self) -> Result<()> {
+     self.inner.add_optimization_profile()
}

/// Create a new optimization profile.
@@ -55,8 +55,8 @@ impl Builder {
///
/// [TensorRT documentation](https://docs.nvidia.com/deeplearning/tensorrt/api/c_api/classnvinfer1_1_1_i_builder.html#a68a8b59fbf86e42762b7087e6ffe6fb4)
#[inline(always)]
- pub async fn with_optimization_profile(mut self) -> Result<Self> {
-     self.add_optimization_profile().await?;
+ pub fn with_optimization_profile(mut self) -> Result<Self> {
+     self.add_optimization_profile()?;
Ok(self)
}

@@ -80,11 +80,11 @@ impl Builder {
///
/// * `flags` - Flags for specifying network properties.
#[inline(always)]
- pub async fn network_definition(
+ pub fn network_definition(
&mut self,
flags: NetworkDefinitionCreationFlags,
) -> NetworkDefinition {
-     Future::new(|| self.inner.network_definition(flags)).await
+     self.inner.network_definition(flags)
}

/// Builds and serializes a network for the provided [`crate::ffi::network::NetworkDefinition`]
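
Because `add_optimization_profile`, `with_optimization_profile`, and `network_definition` have no GPU or thread-local side-effects, this commit turns them into plain synchronous methods; only `Builder::new` still hops to the runtime thread. A hedged usage sketch follows, with crate and type paths assumed from the `$crate::` references in `tests/utils.rs` below.

```rust
use async_tensorrt::{Builder, NetworkDefinitionCreationFlags};

async fn build_network_definition() {
    // Constructing the builder is still async and runs on the runtime thread.
    let mut builder = Builder::new()
        .await
        .with_optimization_profile()
        .unwrap();
    // Synchronous after this commit: no `.await` on `network_definition`.
    let _network =
        builder.network_definition(NetworkDefinitionCreationFlags::ExplicitBatchSize);
}
```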
37 changes: 19 additions & 18 deletions crates/async-tensorrt/src/engine.rs
@@ -29,16 +29,17 @@ impl Engine {
/// # Return value
///
/// A [`HostBuffer`] that contains the serialized engine.
- pub async fn serialize(&self) -> Result<HostBuffer> {
-     Future::new(move || self.inner.serialize()).await
+ #[inline(always)]
+ pub fn serialize(&self) -> Result<HostBuffer> {
+     self.inner.serialize()
}

/// Get the number of IO tensors.
///
/// [TensorRT documentation](https://docs.nvidia.com/deeplearning/tensorrt/api/c_api/classnvinfer1_1_1_i_cuda_engine.html#af2018924cbea2fa84808040e60c58405)
#[inline(always)]
- pub async fn num_io_tensors(&self) -> usize {
-     Future::new(|| self.inner.num_io_tensors()).await
+ pub fn num_io_tensors(&self) -> usize {
+     self.inner.num_io_tensors()
}

/// Retrieve the name of an IO tensor.
@@ -49,8 +50,8 @@ impl Engine {
///
/// * `io_tensor_index` - IO tensor index.
#[inline(always)]
- pub async fn io_tensor_name(&self, io_tensor_index: usize) -> String {
-     Future::new(|| self.inner.io_tensor_name(io_tensor_index)).await
+ pub fn io_tensor_name(&self, io_tensor_index: usize) -> String {
+     self.inner.io_tensor_name(io_tensor_index)
}

/// Get the shape of a tensor.
Expand All @@ -61,8 +62,8 @@ impl Engine {
///
/// * `tensor_name` - Tensor name.
#[inline(always)]
- pub async fn tensor_shape(&self, tensor_name: &str) -> Vec<usize> {
-     Future::new(|| self.inner.tensor_shape(tensor_name)).await
+ pub fn tensor_shape(&self, tensor_name: &str) -> Vec<usize> {
+     self.inner.tensor_shape(tensor_name)
}

/// Get the IO mode of a tensor.
Expand All @@ -73,8 +74,8 @@ impl Engine {
///
/// * `tensor_name` - Tensor name.
#[inline(always)]
- pub async fn tensor_io_mode(&self, tensor_name: &str) -> TensorIoMode {
-     Future::new(|| self.inner.tensor_io_mode(tensor_name)).await
+ pub fn tensor_io_mode(&self, tensor_name: &str) -> TensorIoMode {
+     self.inner.tensor_io_mode(tensor_name)
}
}

@@ -192,7 +193,7 @@ mod tests {
#[tokio::test]
async fn test_engine_serialize() {
let engine = simple_engine!();
- let serialized_engine = engine.serialize().await.unwrap();
+ let serialized_engine = engine.serialize().unwrap();
let serialized_engine_bytes = serialized_engine.as_bytes();
assert!(serialized_engine_bytes.len() > 1800);
assert!(serialized_engine_bytes.len() < 2500);
@@ -205,13 +206,13 @@ mod tests {
#[tokio::test]
async fn test_engine_tensor_info() {
let engine = simple_engine!();
- assert_eq!(engine.num_io_tensors().await, 2);
- assert_eq!(engine.io_tensor_name(0).await, "X");
- assert_eq!(engine.io_tensor_name(1).await, "Y");
- assert_eq!(engine.tensor_io_mode("X").await, TensorIoMode::Input);
- assert_eq!(engine.tensor_io_mode("Y").await, TensorIoMode::Output);
- assert_eq!(engine.tensor_shape("X").await, &[1, 2]);
- assert_eq!(engine.tensor_shape("Y").await, &[2, 3]);
+ assert_eq!(engine.num_io_tensors(), 2);
+ assert_eq!(engine.io_tensor_name(0), "X");
+ assert_eq!(engine.io_tensor_name(1), "Y");
+ assert_eq!(engine.tensor_io_mode("X"), TensorIoMode::Input);
+ assert_eq!(engine.tensor_io_mode("Y"), TensorIoMode::Output);
+ assert_eq!(engine.tensor_shape("X"), &[1, 2]);
+ assert_eq!(engine.tensor_shape("Y"), &[2, 3]);
}

#[tokio::test]
6 changes: 2 additions & 4 deletions crates/async-tensorrt/src/ffi/parser.rs
@@ -54,7 +54,7 @@ impl Parser {
/// # Arguments
///
/// * `path` - Path to file to parse.
- pub fn parse_from_file(&mut self, path: &impl AsRef<std::path::Path>) -> Result<()> {
+ fn parse_from_file(&mut self, path: &impl AsRef<std::path::Path>) -> Result<()> {
let internal = self.as_mut_ptr();
let path_ffi = std::ffi::CString::new(path.as_ref().as_os_str().to_str().unwrap()).unwrap();
let path_ptr = path_ffi.as_ptr();
@@ -135,9 +135,7 @@ mod tests {
async fn test_parser_parses_onnx_file() {
let simple_onnx_file = simple_onnx_file!();
let mut builder = Builder::new().await;
- let network = builder
-     .network_definition(NetworkDefinitionCreationFlags::ExplicitBatchSize)
-     .await;
+ let network = builder.network_definition(NetworkDefinitionCreationFlags::ExplicitBatchSize);
assert!(
Parser::parse_network_definition_from_file(network, &simple_onnx_file.path()).is_ok()
);
6 changes: 2 additions & 4 deletions crates/async-tensorrt/src/tests/utils.rs
@@ -4,11 +4,9 @@ macro_rules! simple_network {
let mut builder = $crate::Builder::new()
.await
.with_optimization_profile()
- .await
.unwrap();
- let network = builder
-     .network_definition($crate::NetworkDefinitionCreationFlags::ExplicitBatchSize)
-     .await;
+ let network =
+     builder.network_definition($crate::NetworkDefinitionCreationFlags::ExplicitBatchSize);
let network =
$crate::Parser::parse_network_definition_from_file(network, &simple_onnx_file.path())
.unwrap();
