Commit
added integration test to help identify which functions affect GPU and fixed API interface
gerwin3 committed Aug 10, 2023
1 parent 19a8d67 commit ad56eaa
Showing 12 changed files with 253 additions and 49 deletions.
4 changes: 2 additions & 2 deletions crates/async-cuda-npp/src/copy_constant_border.rs
@@ -98,7 +98,7 @@ mod tests {
&input,
&mut output,
&ConstantBorder::black(10, 20),
- &Stream::null(),
+ &Stream::null().await,
)
.await
.unwrap();
@@ -113,7 +113,7 @@ mod tests {
&input,
&mut output,
&ConstantBorder::black(10, 20),
- &Stream::null(),
+ &Stream::null().await,
)
.await
.unwrap();
8 changes: 4 additions & 4 deletions crates/async-cuda-npp/src/ffi/context.rs
@@ -35,8 +35,8 @@ impl Context {
let mut raw = std::ptr::null_mut();
let raw_ptr = std::ptr::addr_of_mut!(raw);
// SAFETY:
- // * No need to execute this on the runtime since this call just initializes the
- //   `NppStreamContext` and is stateless for all intents and purposes.
+ // * Must call this function on runtime since `nppGetStreamContext` needs the correct thread
+ //   locals to determine current device and other context settings.
// * We can store a reference to the stream in `NppStreamContext` as long as we make sure
// `NppStreamContext` cannot outlive the stream, which we can guarantee because we take
// ownership of the stream.
@@ -76,8 +76,8 @@ impl Context {
let raw_ptr = std::ptr::addr_of_mut!(raw);
let stream_ptr = stream.inner().as_internal().as_ptr();
// SAFETY:
- // * No need to execute this on the runtime since this call just initializes the
- //   `NppStreamContext` and is stateless for all intents and purposes.
+ // * Must call this function on runtime since `nppGetStreamContext` needs the correct
+ //   thread locals to determine current device and other context settings.
// * We can store a reference to the stream in `NppStreamContext` as long as we make
// sure `NppStreamContext` cannot outlive the stream, which we can guarantee because
// we take ownership of the stream.
10 changes: 5 additions & 5 deletions crates/async-cuda-npp/src/remap.rs
@@ -128,7 +128,7 @@ mod tests {
let map_x = DeviceBuffer2D::<f32>::new(100, 100, 1).await;
let map_y = DeviceBuffer2D::<f32>::new(100, 100, 1).await;
let mut output = DeviceBuffer2D::<u8>::new(100, 100, 3).await;
- remap(&input, &mut output, &map_x, &map_y, &Stream::null())
+ remap(&input, &mut output, &map_x, &map_y, &Stream::null().await)
.await
.unwrap();
}
@@ -140,7 +140,7 @@ mod tests {
let map_x = DeviceBuffer2D::<f32>::new(100, 100, 1).await;
let map_y = DeviceBuffer2D::<f32>::new(100, 100, 1).await;
let mut output = DeviceBuffer2D::<u8>::new(100, 100, 2).await;
- remap(&input, &mut output, &map_x, &map_y, &Stream::null())
+ remap(&input, &mut output, &map_x, &map_y, &Stream::null().await)
.await
.unwrap();
}
@@ -152,7 +152,7 @@ mod tests {
let map_x = DeviceBuffer2D::<f32>::new(100, 100, 2).await;
let map_y = DeviceBuffer2D::<f32>::new(100, 100, 3).await;
let mut output = DeviceBuffer2D::<u8>::new(100, 100, 3).await;
- remap(&input, &mut output, &map_x, &map_y, &Stream::null())
+ remap(&input, &mut output, &map_x, &map_y, &Stream::null().await)
.await
.unwrap();
}
@@ -164,7 +164,7 @@ mod tests {
let map_x = DeviceBuffer2D::<f32>::new(120, 100, 1).await;
let map_y = DeviceBuffer2D::<f32>::new(120, 100, 1).await;
let mut output = DeviceBuffer2D::<u8>::new(100, 100, 3).await;
- remap(&input, &mut output, &map_x, &map_y, &Stream::null())
+ remap(&input, &mut output, &map_x, &map_y, &Stream::null().await)
.await
.unwrap();
}
@@ -176,7 +176,7 @@ mod tests {
let map_x = DeviceBuffer2D::<f32>::new(100, 120, 1).await;
let map_y = DeviceBuffer2D::<f32>::new(100, 120, 1).await;
let mut output = DeviceBuffer2D::<u8>::new(100, 100, 3).await;
- remap(&input, &mut output, &map_x, &map_y, &Stream::null())
+ remap(&input, &mut output, &map_x, &map_y, &Stream::null().await)
.await
.unwrap();
}
4 changes: 2 additions & 2 deletions crates/async-cuda-npp/src/resize.rs
@@ -156,7 +156,7 @@ mod tests {
Region::Full,
&mut output,
Region::Full,
- &Stream::null(),
+ &Stream::null().await,
)
.await
.unwrap();
@@ -172,7 +172,7 @@ mod tests {
Region::Full,
&mut output,
Region::Full,
- &Stream::null(),
+ &Stream::null().await,
)
.await
.unwrap();
2 changes: 1 addition & 1 deletion crates/async-cuda-npp/src/resize_batch.rs
@@ -268,7 +268,7 @@ mod tests {
&mut inputs_and_outputs_ref,
Region::Full,
Region::Full,
- &Stream::null(),
+ &Stream::null().await,
)
.await
.unwrap();
7 changes: 4 additions & 3 deletions crates/async-cuda-npp/src/stream.rs
@@ -27,9 +27,10 @@ impl Stream {
///
/// This type is a wrapper around the actual CUDA stream type: [`async_cuda_core::Stream`].
#[inline]
- pub fn null() -> Self {
+ pub async fn null() -> Self {
+     let context = Future::new(Context::from_null_stream).await;
Self {
-     context: Arc::new(Context::from_null_stream()),
+     context: Arc::new(context),
}
}

@@ -84,7 +85,7 @@ mod tests {

#[tokio::test]
async fn test_null() {
- let stream = Stream::null();
+ let stream = Stream::null().await;
assert!(!stream.to_context().as_ptr().is_null());
// SAFETY: This works because we know that the first field of the underlying
// `NppStreamContext` struct used internally is `hStream`, which should refer to the wrapped
@@ -0,0 +1,57 @@
use async_cuda_core::ffi::device::Device;
use async_cuda_core::stream::Stream;

use async_cuda_npp::ffi::context::Context;

/// This integration test helps determine which FFI functions affect the GPU state or thread-local
/// state.
///
/// This information is important to determine which functions need to be executed on the runtime
/// thread, and which functions can be executed directly by the caller (and don't need to be async).
///
/// We only test functions where it is not immediately apparent whether the function has
/// side-effects. Wrappers for NPP operations are not tested, since it is evident that they affect
/// the GPU state.
///
/// # Find GPU side-effects
///
/// Run this integration test under the Nsight profiler with the following command:
///
/// ```bash
/// nsys profile --output /tmp/side_effects_trace --force-overwrite true cargo test --release --test async_cuda_npp_functions_side_effects_test
/// ```
///
/// Use the `nsys-ui` utility to inspect the report produced in `/tmp/side_effects_trace.qdstrm` and
/// determine for each function call whether one or more CUDA API functions were invoked, and
/// whether the GPU was affected in any way. Function calls are separated by device synchronization
/// markers in the trace.
///
/// # Find thread-local side-effects
///
/// These need to be inferred from documentation or usage (or an educated guess).
///
/// # Results
///
/// | Function | Side-effect: GPU | Side-effect: thread-local |
/// | ----------------------------- | ---------------- | ------------------------- |
/// | `Context::from_null_stream` | ❌ | ✅ |
/// | `Context::from_stream` | ❌ | ✅ |
#[tokio::test]
async fn test_side_effects() {
    // The first block contains setup that we are not interested in measuring...
let stream = Stream::new().await.unwrap();

// A sequence of CUDA calls that is easy to find in the trace.
Device::synchronize().unwrap();
let _mem_info_1 = Device::memory_info().unwrap();
let _mem_info_2 = Device::memory_info().unwrap();
let _mem_info_3 = Device::memory_info().unwrap();
let _mem_info_4 = Device::memory_info().unwrap();
Device::synchronize().unwrap();

let _context_null = Context::from_null_stream();
Device::synchronize().unwrap();

let _context_new = Context::from_stream(stream);
Device::synchronize().unwrap();
}
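
The table above translates directly into the dispatch rule applied throughout this commit: calls with thread-local side-effects are executed on the runtime thread, while side-effect-free calls run synchronously on the caller's thread. Below is a minimal sketch of that pattern, mirroring the `Future::new(...).await` usage in `stream.rs` above; the import path for `Future` is an assumption, since the diff does not show it.

```rust
use async_cuda_npp::ffi::context::Context;

// Assumed path: the diff only shows `Future::new(...)` in use, not its import.
use async_cuda_core::runtime::Future;

/// `Context::from_null_stream` has a thread-local side-effect (see the table
/// above), so it is dispatched to the runtime thread rather than being called
/// directly on the caller's thread.
async fn null_context_on_runtime() -> Context {
    Future::new(Context::from_null_stream).await
}
```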
12 changes: 6 additions & 6 deletions crates/async-tensorrt/src/builder.rs
@@ -37,8 +37,8 @@ impl Builder {
///
/// [TensorRT documentation](https://docs.nvidia.com/deeplearning/tensorrt/api/c_api/classnvinfer1_1_1_i_builder.html#a68a8b59fbf86e42762b7087e6ffe6fb4)
#[inline(always)]
- pub async fn add_optimization_profile(&mut self) -> Result<()> {
-     Future::new(|| self.inner.add_optimization_profile()).await
+ pub fn add_optimization_profile(&mut self) -> Result<()> {
+     self.inner.add_optimization_profile()
}

/// Create a new optimization profile.
@@ -55,8 +55,8 @@ impl Builder {
///
/// [TensorRT documentation](https://docs.nvidia.com/deeplearning/tensorrt/api/c_api/classnvinfer1_1_1_i_builder.html#a68a8b59fbf86e42762b7087e6ffe6fb4)
#[inline(always)]
- pub async fn with_optimization_profile(mut self) -> Result<Self> {
-     self.add_optimization_profile().await?;
+ pub fn with_optimization_profile(mut self) -> Result<Self> {
+     self.add_optimization_profile()?;
Ok(self)
}

@@ -80,11 +80,11 @@ impl Builder {
///
/// * `flags` - Flags for specifying network properties.
#[inline(always)]
- pub async fn network_definition(
+ pub fn network_definition(
&mut self,
flags: NetworkDefinitionCreationFlags,
) -> NetworkDefinition {
-     Future::new(|| self.inner.network_definition(flags)).await
+     self.inner.network_definition(flags)
}

/// Builds and serializes a network for the provided [`crate::ffi::network::NetworkDefinition`]
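
Because `add_optimization_profile`, `with_optimization_profile`, and `network_definition` have no GPU or thread-local side-effects, this commit turns them into plain synchronous methods; only `Builder::new` still hops to the runtime thread. A hedged usage sketch follows, with crate and type paths assumed from the `$crate::` references in `tests/utils.rs` below.

```rust
use async_tensorrt::{Builder, NetworkDefinitionCreationFlags};

async fn build_network_definition() {
    // Constructing the builder is still async and runs on the runtime thread.
    let mut builder = Builder::new()
        .await
        .with_optimization_profile()
        .unwrap();
    // Synchronous after this commit: no `.await` on `network_definition`.
    let _network =
        builder.network_definition(NetworkDefinitionCreationFlags::ExplicitBatchSize);
}
```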
37 changes: 19 additions & 18 deletions crates/async-tensorrt/src/engine.rs
@@ -29,16 +29,17 @@ impl Engine {
/// # Return value
///
/// A [`HostBuffer`] that contains the serialized engine.
- pub async fn serialize(&self) -> Result<HostBuffer> {
-     Future::new(move || self.inner.serialize()).await
+ #[inline(always)]
+ pub fn serialize(&self) -> Result<HostBuffer> {
+     self.inner.serialize()
}

/// Get the number of IO tensors.
///
/// [TensorRT documentation](https://docs.nvidia.com/deeplearning/tensorrt/api/c_api/classnvinfer1_1_1_i_cuda_engine.html#af2018924cbea2fa84808040e60c58405)
#[inline(always)]
- pub async fn num_io_tensors(&self) -> usize {
-     Future::new(|| self.inner.num_io_tensors()).await
+ pub fn num_io_tensors(&self) -> usize {
+     self.inner.num_io_tensors()
}

/// Retrieve the name of an IO tensor.
@@ -49,8 +50,8 @@ impl Engine {
///
/// * `io_tensor_index` - IO tensor index.
#[inline(always)]
- pub async fn io_tensor_name(&self, io_tensor_index: usize) -> String {
-     Future::new(|| self.inner.io_tensor_name(io_tensor_index)).await
+ pub fn io_tensor_name(&self, io_tensor_index: usize) -> String {
+     self.inner.io_tensor_name(io_tensor_index)
}

/// Get the shape of a tensor.
Expand All @@ -61,8 +62,8 @@ impl Engine {
///
/// * `tensor_name` - Tensor name.
#[inline(always)]
- pub async fn tensor_shape(&self, tensor_name: &str) -> Vec<usize> {
-     Future::new(|| self.inner.tensor_shape(tensor_name)).await
+ pub fn tensor_shape(&self, tensor_name: &str) -> Vec<usize> {
+     self.inner.tensor_shape(tensor_name)
}

/// Get the IO mode of a tensor.
Expand All @@ -73,8 +74,8 @@ impl Engine {
///
/// * `tensor_name` - Tensor name.
#[inline(always)]
- pub async fn tensor_io_mode(&self, tensor_name: &str) -> TensorIoMode {
-     Future::new(|| self.inner.tensor_io_mode(tensor_name)).await
+ pub fn tensor_io_mode(&self, tensor_name: &str) -> TensorIoMode {
+     self.inner.tensor_io_mode(tensor_name)
}
}

@@ -192,7 +193,7 @@ mod tests {
#[tokio::test]
async fn test_engine_serialize() {
let engine = simple_engine!();
- let serialized_engine = engine.serialize().await.unwrap();
+ let serialized_engine = engine.serialize().unwrap();
let serialized_engine_bytes = serialized_engine.as_bytes();
assert!(serialized_engine_bytes.len() > 1800);
assert!(serialized_engine_bytes.len() < 2500);
@@ -205,13 +206,13 @@ mod tests {
#[tokio::test]
async fn test_engine_tensor_info() {
let engine = simple_engine!();
- assert_eq!(engine.num_io_tensors().await, 2);
- assert_eq!(engine.io_tensor_name(0).await, "X");
- assert_eq!(engine.io_tensor_name(1).await, "Y");
- assert_eq!(engine.tensor_io_mode("X").await, TensorIoMode::Input);
- assert_eq!(engine.tensor_io_mode("Y").await, TensorIoMode::Output);
- assert_eq!(engine.tensor_shape("X").await, &[1, 2]);
- assert_eq!(engine.tensor_shape("Y").await, &[2, 3]);
+ assert_eq!(engine.num_io_tensors(), 2);
+ assert_eq!(engine.io_tensor_name(0), "X");
+ assert_eq!(engine.io_tensor_name(1), "Y");
+ assert_eq!(engine.tensor_io_mode("X"), TensorIoMode::Input);
+ assert_eq!(engine.tensor_io_mode("Y"), TensorIoMode::Output);
+ assert_eq!(engine.tensor_shape("X"), &[1, 2]);
+ assert_eq!(engine.tensor_shape("Y"), &[2, 3]);
}

#[tokio::test]
6 changes: 2 additions & 4 deletions crates/async-tensorrt/src/ffi/parser.rs
@@ -54,7 +54,7 @@ impl Parser {
/// # Arguments
///
/// * `path` - Path to file to parse.
- pub fn parse_from_file(&mut self, path: &impl AsRef<std::path::Path>) -> Result<()> {
+ fn parse_from_file(&mut self, path: &impl AsRef<std::path::Path>) -> Result<()> {
let internal = self.as_mut_ptr();
let path_ffi = std::ffi::CString::new(path.as_ref().as_os_str().to_str().unwrap()).unwrap();
let path_ptr = path_ffi.as_ptr();
@@ -135,9 +135,7 @@ mod tests {
async fn test_parser_parses_onnx_file() {
let simple_onnx_file = simple_onnx_file!();
let mut builder = Builder::new().await;
- let network = builder
-     .network_definition(NetworkDefinitionCreationFlags::ExplicitBatchSize)
-     .await;
+ let network = builder.network_definition(NetworkDefinitionCreationFlags::ExplicitBatchSize);
assert!(
Parser::parse_network_definition_from_file(network, &simple_onnx_file.path()).is_ok()
);
6 changes: 2 additions & 4 deletions crates/async-tensorrt/src/tests/utils.rs
@@ -4,11 +4,9 @@ macro_rules! simple_network {
let mut builder = $crate::Builder::new()
.await
.with_optimization_profile()
- .await
.unwrap();
- let network = builder
-     .network_definition($crate::NetworkDefinitionCreationFlags::ExplicitBatchSize)
-     .await;
+ let network =
+     builder.network_definition($crate::NetworkDefinitionCreationFlags::ExplicitBatchSize);
let network =
$crate::Parser::parse_network_definition_from_file(network, &simple_onnx_file.path())
.unwrap();
