diff --git a/CMakeLists.txt b/CMakeLists.txt
index cf09bf8..d9fb4b4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,20 +1,26 @@
 cmake_minimum_required(VERSION 3.17)
 
-project(cudnn_frontend VERSION 1.1.2)
+project(cudnn_frontend VERSION 1.2.0)
 
+option(CUDNN_FRONTEND_SKIP_NLOHMANN_JSON "Defines whether FE should not include nlohmann/json.hpp." OFF)
 option(CUDNN_FRONTEND_BUILD_SAMPLES "Defines if samples are built or not." ON)
-option(CUDNN_FRONTEND_BUILD_UNIT_TESTS "Defines if unittests are built or not." OFF)
+option(CUDNN_FRONTEND_BUILD_UNIT_TESTS "Defines if unittests are built or not." ON)
 
 if(MSVC OR MSYS OR MINGW)
     option(CUDNN_FRONTEND_BUILD_PYTHON_BINDINGS "Defines if python bindings are built or not." OFF)
     add_compile_options(/W4 /WX)
 else()
-    option(CUDNN_FRONTEND_BUILD_PYTHON_BINDINGS "Defines if python bindings are built or not." ON)
+    option(CUDNN_FRONTEND_BUILD_PYTHON_BINDINGS "Defines if python bindings are built or not." OFF)
     add_compile_options(-Wall -Wextra -Wpedantic -Werror -Wno-error=attributes -Wno-attributes -Wno-error=unused-function -Wno-unused-function)
 endif()
 
 add_library(cudnn_frontend INTERFACE)
 
+target_compile_definitions(
+    cudnn_frontend INTERFACE
+    $<$<BOOL:${CUDNN_FRONTEND_SKIP_NLOHMANN_JSON}>:CUDNN_FRONTEND_SKIP_NLOHMANN_JSON>
+)
+
 target_include_directories(
     cudnn_frontend INTERFACE
     $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/include>
@@ -50,5 +56,16 @@ if (CUDNN_FRONTEND_BUILD_UNIT_TESTS)
 endif()
 
 if (CUDNN_FRONTEND_BUILD_PYTHON_BINDINGS)
-    add_subdirectory(python_bindings)
+    add_subdirectory(python)
 endif()
+
+# Introduce variables:
+# * CMAKE_INSTALL_LIBDIR
+# * CMAKE_INSTALL_BINDIR
+# * CMAKE_INSTALL_INCLUDEDIR
+include(GNUInstallDirs)
+
+install(
+    DIRECTORY ${PROJECT_SOURCE_DIR}/include/
+    DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
+)
\ No newline at end of file
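As context for the CMakeLists.txt changes above: a minimal sketch of how a downstream CMake project might consume the new install rules and the `CUDNN_FRONTEND_SKIP_NLOHMANN_JSON` option. The checkout path and every target name other than `cudnn_frontend` are assumptions, not part of this diff.

```
# Sketch: consume cudnn_frontend as a header-only INTERFACE target.
cmake_minimum_required(VERSION 3.17)
project(fe_consumer LANGUAGES CXX)

# Setting the option before add_subdirectory() makes the INTERFACE compile
# definition CUDNN_FRONTEND_SKIP_NLOHMANN_JSON propagate to consumers.
set(CUDNN_FRONTEND_SKIP_NLOHMANN_JSON ON CACHE BOOL "" FORCE)
set(CUDNN_FRONTEND_BUILD_SAMPLES OFF CACHE BOOL "" FORCE)
set(CUDNN_FRONTEND_BUILD_UNIT_TESTS OFF CACHE BOOL "" FORCE)

add_subdirectory(cudnn-frontend)  # assumed checkout location

add_executable(consumer main.cpp)
target_link_libraries(consumer PRIVATE cudnn_frontend)
```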
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644
index 0000000..5ccb14a
--- /dev/null
+++ b/CONTRIBUTING.md
@@ -0,0 +1,55 @@
+# Contributing to cudnn-frontend
+
+If you are interested in contributing to cudnn-frontend, your contributions will fall
+into three categories:
+1. You want to report a bug, feature request, or documentation issue
+    - File an [issue](https://github.com/NVIDIA/cudnn-frontend/issues)
+      describing what you encountered or what you want to see changed.
+    - The cudnn team will evaluate the issues and triage them, scheduling
+      them for a release. If you believe the issue needs priority attention,
+      comment on the issue to notify the team.
+2. You want to propose a new feature and implement it
+    - Post about your intended feature, and we shall discuss the design and
+      implementation.
+    - Once we agree that the plan looks good, go ahead and implement it, using
+      the [code contributions](#code-contributions) guide below.
+3. You want to implement a feature or bug-fix for an outstanding issue
+    - Follow the [code contributions](#code-contributions) guide below.
+    - If you need more context on a particular issue, please ask and we shall
+      provide.
+
+## Code contributions
+
+### Your first issue
+
+1. Read the project's [README.md](https://github.com/NVIDIA/cudnn-frontend/blob/main/README.md)
+   to learn how to set up the development environment.
+2. Comment on the issue saying you are going to work on it and what changes you are going to make.
+3. Code! Make sure to update unit tests!
+4. When done, [create your pull request](https://github.com/NVIDIA/cudnn-frontend/compare).
+5. Wait for other developers to review your code and update the code as needed.
+6. Once reviewed and approved, a cudnn-frontend developer will merge your pull request.
+7. At this time, we are accepting only small fixes and changes. Once merged to main, your change will be part of an untagged version; a release tag will be assigned by the cudnn team along with a future frontend release.
+
+Remember, if you are unsure about anything, don't hesitate to comment on issues and ask for clarifications!
+
+## Code Formatting
+
+Consistent code formatting is important in the cudnn-frontend project to ensure
+readability and maintainability, and thus to simplify collaboration.
+
+### Branches and Versions
+
+The cudnn-frontend repository has one main branch. Please submit a PR to this branch. We will update this doc as the policy changes.
+
+### Branch naming
+
+Branches used to create PRs should have a name of the form `<name>-issue-<issue number>`,
+which conforms to the following conventions:
+
+- Name:
+    - A name to convey what is being worked on
+    - Please use dashes or underscores between words as opposed to spaces.
+
+## Attribution
+Portions of this contribution guide were adapted from [https://github.com/rapidsai/cuml/blob/branch-24.04/CONTRIBUTING.md](https://github.com/rapidsai/cuml/blob/branch-24.04/CONTRIBUTING.md)
diff --git a/README.FE.1.0.md b/README.FE.1.0.md
index 196c5b1..90058ab 100644
--- a/README.FE.1.0.md
+++ b/README.FE.1.0.md
@@ -152,17 +152,8 @@ cudnn_frontend::graph::Graph& cudnn_frontend::graph::Plans::deselect_workspace_g
 ### Autotune
 
 Autotuning provides a way to execute different execution plans for a given graph and measure their relative performance under run time conditions.
-This generally helps validate and improve upon the results provided by the heuristics.
+This generally helps validate and improve upon the results provided by the heuristics. Please refer to the [samples](samples/cpp/autotuning.cpp) for usage.
 
-The current API to perform the autotuning on the filtered plans:
-```
-cudnn_frontend::error_t
-cudnn_frontend::graph::Graph::autotune(cudnnHandle_t handle,
-                                       std::unordered_map<std::shared_ptr<graph::Tensor_attributes>, void *> variants,
-                                       void *workspace,
-                                       void *user_impl = nullptr);
-
-```
 
 ### Execute
 Executing the graph requires device pointers to all input and output tensors and a user-allocated device workspace pointer.
@@ -220,4 +211,4 @@ Python samples are jupyter notebooks with step by step guide on using FE v1 API.
 
 ## Operations
 
-Please look at docs/operations for APIs of different operation types.
\ No newline at end of file
+Please look at [docs/operations](docs/operations/) for APIs of different operation types.
\ No newline at end of file
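To make the `Execute` section above concrete, here is a minimal sketch of the v1.0 execution call. The tensor handles (`X`, `W`, `Y`), the device pointers, and `handle` are placeholders, and error checking is elided.

```
// Sketch: execute a finalized FE v1 graph.
// `graph` is a built cudnn_frontend::graph::Graph, `handle` a cudnnHandle_t,
// and X/W/Y are the std::shared_ptr<Tensor_attributes> handles returned
// while building the graph (placeholder names).
std::unordered_map<std::shared_ptr<cudnn_frontend::graph::Tensor_attributes>, void*>
    variant_pack = {{X, x_dev_ptr}, {W, w_dev_ptr}, {Y, y_dev_ptr}};

void* workspace = nullptr;
cudaMalloc(&workspace, graph.get_workspace_size());  // user-allocated workspace

auto status = graph.execute(handle, variant_pack, workspace);
cudaFree(workspace);
```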
diff --git a/README.md b/README.md
index 573e1d3..eddcd03 100644
--- a/README.md
+++ b/README.md
@@ -38,6 +38,8 @@ To run the python samples, additionally, you will need the following python pack
 
 ### Python API
 
+
+#### Source installation:
 Install FE python API by running:
 ```
 pip install git+https://github.com/NVIDIA/cudnn-frontend.git
 ```
 
 The above command picks cuda and cudnn from the default system paths.
 
 To provide a custom CUDA installation path, use environment variable: `CUDAToolkit_ROOT`.
 To provide a custom CUDNN installation path, use environment variable: `CUDNN_PATH`.
 
+#### pip wheel installation
+
+Download the pip wheel corresponding to your Python installation.
+
+```
+pip install nvidia_cudnn_frontend-1.2.0-*.whl
+```
+
+#### Checking the installation
 To test whether installation is successful, run:
 ```
 pytest tests/python_fe
 ```
@@ -107,11 +117,11 @@ Calling `cudnn_frontend::getStream() = stream_name` can be used to assign the ou
 
 For further debugging, please turn on the cudnn backend logs described here https://docs.nvidia.com/deeplearning/cudnn/developer-guide/index.html#api-logging
 
 ## Documentation
-- See README.FE.1.0.md for v1.0 API documentation.
-- See README.FE.0.x.md for v0.x API documentation.
+- See [README.FE.1.0.md](README.FE.1.0.md) for v1.0 API documentation.
+- See [README.FE.0.x.md](README.FE.0.x.md) for v0.x API documentation.
 
 ## Contributing:
-No external contribution to this repository is accepted. Please create an issue in github for bugs or feature requests.
+Please refer to our [contribution guide](CONTRIBUTING.md).
 
 ## Feedback
 Support, resources, and information about cuDNN can be found online at https://developer.nvidia.com/cudnn.
diff --git a/docs/operations/Attention.md b/docs/operations/Attention.md
index 78f9f61..e7e2a0a 100644
--- a/docs/operations/Attention.md
+++ b/docs/operations/Attention.md
@@ -1,7 +1,11 @@
 ## Table of Contents
 1. [Scaled Dot Product Attention](#scaled-dot-product-attention)
 2. [Scaled Dot Product Attention Backward](#scaled-dot-product-attention-backward)
-3. [Miscellaneous](#miscellaneous)
+3. Appendices
+   - [Tensor Layouts](#appendix-a)
+   - [Workspace limits and Performance](#appendix-b)
+   - [RNG dump](#appendix-c)
+4. [Miscellaneous](#miscellaneous)
 
 ### Scaled Dot Product Attention
 
@@ -250,13 +254,65 @@ Returns:
     dV (cudnn_tensor): The value gradient tensor of scaled dot-product attention.
 ```
 
-### Miscellaneous
-- FE provides shadow enums which help avoid users to workaround having different enums for different cudnn versions.
-- The cudnn backend enums are changed as follows:
-    - `cudnnBackend<enum_name>` -> `cudnn_frontend::<enum_name>`
-    - `cudnn<enum_name>` -> `cudnn_frontend::<enum_name>`
-- To dump the dropout mask generated by the Philox RNG dropout implementation for debugging purposes, users can use the `rng_dump` option. This option requires users to pass a tensor of dimensions $(B, H_{q}, S_{q}, S_{kv})$
-- Scaled Dot Product Attention Backward improves performance by using an optional dP workspace tensor. This tensor's memory consumption increases quadratically with the sequence length. The following describes the behavior of the `CUDNN_FRONTEND_ATTN_DP_WORKSPACE_LIMIT` environment variable, which allows the user to change the GPU memory limit for this workspace tensor:
+
+#### Appendix A
+Tensor layouts:
+Layout support for Q, K, V, O and their corresponding gradient tensors. The cuDNN API expresses the layout of tensors based on strides.
+
+For example, let Q have dimensions = [5, 7, 4, 3] and strides = [84, 12, 3, 1].
+An element at index [i, j, k, l] can then be accessed at position Q_ptr + i * 84 + j * 12 + k * 3 + l * 1.
+
+Notice how the strides are multiplied by the indices to get the positions of all elements.
+Below we will go through the standard usage of the attention tensors and how they can be expressed in cuDNN.
+
+    1. Q, K, V are different matrices with strided layout
+    This is the basic case where the user can specify dims and strides for each of Q, K and V, and it works as in the example given above.
+    The only limitation is that the stride corresponding to the hidden dimension per head (d, the last dim in Q) needs to be 1.
+
+    2. Q, K, V are interleaved
+    This is a special case of (1) and can be described in a strided layout as well.
+    For example, Q, K and V can be a single matrix of dims (batch (b), number_of_heads (h), sequence_length (s), 3, hidden_dim_per_head (d)).
+    Strides of Q can then be defined as [h * s * 3 * d, s * 3 * d, 3 * d, 1].
+    Notice how the 3 is multiplied into the strides corresponding to b, h and s because of the interleaving.
+
+    3. There are some special cases when not all tokens are valid and Q, K, V can be in special layouts
+    Let the Q tensor have two sequences (i.e. batch = 2, number_of_heads = 1) with max_seq_len = 8 and actual_seq_len = [2, 3].
+    Consider the two token sequences "aa" and "bbb".
+    - Fully padded layout
+
+      aa000000
+      bbb00000
+      Dims = [b=2, h=1, s=8, d=64]
+      Strides = [512, 512, 64, 1]
+
+      cuDNN is told the actual sequence lengths through seq_len_q and seq_len_kv and cuts the computation off at those values. Please also enable use_padding_mask for this case. cuDNN reads the data based on the strides.
+
+    - Fully packed layout
+      aabbb000
+      00000000
+      Dims = [b=2, h=1, s=8, d=64]
+      Strides = [512, 512, 64, 1]
+
+      The strides remain the same but are now incorrect, as the second batch actually begins at element 64 * 2 = 128. Therefore, there is an API called "ragged_offset": a tensor of size b+1 telling where each batch begins, whose last element marks where the last batch ends.
+      Users can set it via `.set_ragged_offset()` (a sketch follows this file's diff).
+      For this example, ragged_offset = [0, 128, 320].
+      Actual sequence lengths still have to be provided, along with the padding mask.
+
+    - Valid tokens in a batch are packed together
+      aa00bbb0
+      00000000
+
+      The user just needs to update the ragged offset to [0, 256, 448].
+
+    - Valid tokens are not packed together
+      a0abbb00
+      bb000000
+
+      A ragged offset is insufficient to represent this. This case is NOT supported.
+
+#### Appendix B
+Workspace limits and performance:
+Scaled Dot Product Attention Backward improves performance by using an optional dP workspace tensor. This tensor's memory consumption increases quadratically with the sequence length. The following describes the behavior of the `CUDNN_FRONTEND_ATTN_DP_WORKSPACE_LIMIT` environment variable, which allows the user to change the GPU memory limit for this workspace tensor:
 - `CUDNN_FRONTEND_ATTN_DP_WORKSPACE_LIMIT = unset`
   The optimization will utilize workspace memory until reaching the default limit of 256MB.
 - `CUDNN_FRONTEND_ATTN_DP_WORKSPACE_LIMIT = -1`
@@ -265,3 +321,12 @@ Returns:
   Workspace optimization is always disabled, avoiding the additional memory usage.
 - `CUDNN_FRONTEND_ATTN_DP_WORKSPACE_LIMIT = n`
   Allows workspace optimization up to a user-defined limit of n bytes, accommodating systems with varying GPU memory capacities.
+
+#### Appendix C
+To dump the dropout mask generated by the Philox RNG dropout implementation for debugging purposes, users can use the `rng_dump` option. This option requires users to pass a tensor of dimensions $(B, H_{q}, S_{q}, S_{kv})$.
+
+### Miscellaneous
+- FE provides shadow enums, which help users avoid having to work around different enums across cudnn versions.
+- The cudnn backend enums are changed as follows:
+    - `cudnnBackend<enum_name>` -> `cudnn_frontend::<enum_name>`
+    - `cudnn<enum_name>` -> `cudnn_frontend::<enum_name>`
\ No newline at end of file
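Tying Appendix A together, a sketch of building the fully packed Q tensor with a ragged offset through the v1 graph API. The dims and strides mirror the packed example above (b=2, h=1, s=8, d=64); `graph` and all names are placeholders, and the surrounding SDPA setup is omitted.

```
// Sketch: fully packed layout from Appendix A.
// ragged_offset has b+1 = 3 elements, [0, 128, 320], marking where each
// batch begins and where the last one ends (in elements).
auto ragged_offset = graph.tensor(cudnn_frontend::graph::Tensor_attributes()
                                      .set_name("q_ragged_offset")
                                      .set_dim({3, 1, 1, 1})
                                      .set_stride({1, 1, 1, 1})
                                      .set_data_type(cudnn_frontend::DataType_t::INT32));

auto Q = graph.tensor(cudnn_frontend::graph::Tensor_attributes()
                          .set_name("Q")
                          .set_dim({2, 1, 8, 64})
                          .set_stride({512, 512, 64, 1})
                          .set_ragged_offset(ragged_offset));

// The actual sequence lengths (seq_len_q / seq_len_kv) and the padding mask
// still have to be supplied on the SDPA attributes, as noted above.
```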
diff --git a/include/cudnn_backend_base.h b/include/cudnn_backend_base.h
index 3de1577..43e203b 100644
--- a/include/cudnn_backend_base.h
+++ b/include/cudnn_backend_base.h
@@ -45,12 +45,14 @@ class OpaqueBackendPointer {
      * OpaqueBackendPointer constructor.
      * Calls the cudnnBackendCreateDescriptor. Allocates memory according to the type.
      */
-    OpaqueBackendPointer(cudnnBackendDescriptorType_t type) { status = cudnnBackendCreateDescriptor(type, &m_desc); }
+    OpaqueBackendPointer(cudnnBackendDescriptorType_t type) {
+        status = cudnn_frontend::create_descriptor(type, &m_desc);
+    }
 
     /**
      * OpaqueBackendPointer destructor.
      * Calls the cudnnBackendDestroyDescriptor. Frees memory allocated in the constructor.
      */
-    ~OpaqueBackendPointer() { cudnnBackendDestroyDescriptor(m_desc); };
+    ~OpaqueBackendPointer() { cudnn_frontend::destroy_descriptor(m_desc); };
 
     /**
      * Accessor.
      * Returns the const reference to raw underlying descriptor.
@@ -160,4 +162,5 @@ class BackendDescriptor {
     mutable cudnnStatus_t status = CUDNN_STATUS_SUCCESS;  //!< Error code if any being set
     mutable std::string err_msg;                          //!< Error message if any being set
 };
+
 }  // namespace cudnn_frontend
diff --git a/include/cudnn_frontend.h b/include/cudnn_frontend.h
index a76c101..81e684c 100644
--- a/include/cudnn_frontend.h
+++ b/include/cudnn_frontend.h
@@ -124,8 +124,8 @@
 #include "cudnn_frontend/utils/serialize.h"
 
 #define CUDNN_FRONTEND_MAJOR_VERSION 1
-#define CUDNN_FRONTEND_MINOR_VERSION 1
-#define CUDNN_FRONTEND_PATCH_VERSION 2
+#define CUDNN_FRONTEND_MINOR_VERSION 2
+#define CUDNN_FRONTEND_PATCH_VERSION 0
 #define CUDNN_FRONTEND_VERSION \
     ((CUDNN_FRONTEND_MAJOR_VERSION * 10000) + (CUDNN_FRONTEND_MINOR_VERSION * 100) + CUDNN_FRONTEND_PATCH_VERSION)
 
diff --git a/include/cudnn_frontend/cudnn_interface.h b/include/cudnn_frontend/cudnn_interface.h
index 6fe8bcf..4a203aa 100644
--- a/include/cudnn_frontend/cudnn_interface.h
+++ b/include/cudnn_frontend/cudnn_interface.h
@@ -42,30 +42,25 @@ class ICudnn {
     // TODO: Always returns OK. Can the status and error message be accessed from tensor descriptor?
     error_t
     create_cudnn_tensor(std::shared_ptr<graph::Tensor_attributes> const& props,
-                        uid_t& uid,
-                        std::unordered_map<uid_t, std::shared_ptr<cudnn_frontend::Tensor>>& tensors,
-                        std::unordered_set<uid_t> const& invalid_uids) const {
-        // Check whether tensor already created
-        // Make sure no other tensor somehow already has claimed uid.
+                        std::unordered_map<uid_t, std::shared_ptr<cudnn_frontend::Tensor>>& tensors) const {
+        // TODO: uid check has to be moved to validate stage.
+        RETURN_CUDNN_FRONTEND_ERROR_IF(props->has_uid() == false,
+                                       error_code_t::ATTRIBUTE_NOT_SET,
+                                       "Tensor named '" + props->get_name() + "' has no uid assigned.");
 
-        auto tensor_uid = props->has_uid() ? props->get_uid() : uid;
+        // Check whether tensor already created
+        auto tensor_uid = props->get_uid();
         if (tensors.find(tensor_uid) != tensors.end()) {
-            getLogger() << "[cudnn_frontend] INFO: Shared Tensor" << uid << " already created." << std::endl;
+            getLogger() << "[cudnn_frontend] INFO: Backend Tensor named '" << props->get_name() << "' with UID "
+                        << tensor_uid << " already created."
<< std::endl; return {error_code_t::OK, ""}; } - if (props->has_uid() == false) { - props->set_uid(uid); - do { - uid++; - } while (invalid_uids.find(uid) != invalid_uids.end()); - } - auto&& tensor_builder = cudnn_frontend::TensorBuilder(); tensor_builder.setDim(props->get_dim().size(), props->get_dim().data()) .setStrides(props->get_stride().size(), props->get_stride().data()) - .setId(props->get_uid()) + .setId(tensor_uid) .setAlignment(16) .setDataType(props->get_data_type()) .setVirtual(props->get_is_virtual()) @@ -73,7 +68,7 @@ class ICudnn { .setReorderType(props->get_reordering_type()); if (auto ragged_offset_props = props->get_ragged_offset()) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(ragged_offset_props, uid, tensors, invalid_uids)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(ragged_offset_props, tensors)); tensor_builder.setRaggedOffset(tensors.at(ragged_offset_props->get_uid())); } @@ -83,13 +78,13 @@ class ICudnn { auto tensor = tensor_builder.build(); RETURN_CUDNN_FRONTEND_ERROR_IF( tensor.get_status() != CUDNN_STATUS_SUCCESS, error_code_t::CUDNN_BACKEND_API_FAILED, tensor.get_error()); - tensors.emplace(props->get_uid(), std::make_shared(std::move(tensor))); + tensors.emplace(tensor_uid, std::make_shared(std::move(tensor))); #else // build() can throw // wrap in try catch try { auto tensor = tensor_builder.build(); - tensors.emplace(props->get_uid(), std::make_shared(std::move(tensor))); + tensors.emplace(tensor_uid, std::make_shared(std::move(tensor))); } catch (cudnn_frontend::cudnnException& e) { RETURN_CUDNN_FRONTEND_ERROR_IF( e.getCudnnStatus() != CUDNN_STATUS_SUCCESS, error_code_t::CUDNN_BACKEND_API_FAILED, e.what()); diff --git a/include/cudnn_frontend/graph_helpers.h b/include/cudnn_frontend/graph_helpers.h index b23e5f5..34a437f 100644 --- a/include/cudnn_frontend/graph_helpers.h +++ b/include/cudnn_frontend/graph_helpers.h @@ -98,7 +98,7 @@ typedef struct [[nodiscard]] error_object { do { \ if (auto cudnn_retval = x; cudnn_retval != CUDNN_STATUS_SUCCESS) { \ std::stringstream error_msg; \ - error_msg << #x << " failed with " << cudnnGetErrorString(cudnn_retval); \ + error_msg << #x << " failed with " << cudnn_frontend::get_error_string(cudnn_retval); \ getLogger() << "[cudnn_frontend] ERROR: " << error_msg.str() << " at " << __FILE__ << ":" << __LINE__ \ << std::endl; \ return {error_code_t::CUDNN_BACKEND_API_FAILED, error_msg.str()}; \ @@ -110,7 +110,7 @@ typedef struct [[nodiscard]] error_object { do { \ if (auto cuda_retval = x; cuda_retval != cudaSuccess) { \ std::stringstream error_msg; \ - error_msg << #x << " failed with " << cudaGetErrorString(cuda_retval); \ + error_msg << #x << " failed with " << cuda_get_error_string(cuda_retval); \ getLogger() << "[cudnn_frontend] ERROR: " << error_msg.str() << " at " << __FILE__ << ":" << __LINE__ \ << std::endl; \ return {error_code_t::CUDA_API_FAILED, error_msg.str()}; \ diff --git a/include/cudnn_frontend/graph_interface.h b/include/cudnn_frontend/graph_interface.h index 13f40d1..ee2a152 100644 --- a/include/cudnn_frontend/graph_interface.h +++ b/include/cudnn_frontend/graph_interface.h @@ -54,7 +54,7 @@ class Graph : public INode { } error_t - expand_and_infer_properties() override final { + expand_and_infer_properties_node() override final { return {error_code_t::OK, ""}; } @@ -63,6 +63,31 @@ class Graph : public INode { return {error_code_t::OK, ""}; } + virtual error_t + pass_by_value_tensors_(std::unordered_map &pass_by_values) const override final { + for (auto [uid, value] : 
deserialized_pass_by_value) { + pass_by_values.emplace(uid, value); + } + return {error_code_t::OK, ""}; + } + + virtual error_t + collect_pre_assigned_uids_([[maybe_unused]] std::unordered_set &pre_assigned_uids) const override final { + return {error_code_t::OK, ""}; + } + + virtual error_t + create_cudnn_tensors_([[maybe_unused]] std::unordered_map> + &tensors) const override final { + return {error_code_t::OK, ""}; + } + + virtual error_t + set_uids_([[maybe_unused]] int64_t &potential_uid, + [[maybe_unused]] std::unordered_set const &pre_assigned_uids) const override final { + return {error_code_t::OK, ""}; + } + public: Graph() : INode(detail::Context{}) {} diff --git a/include/cudnn_frontend/graph_properties.h b/include/cudnn_frontend/graph_properties.h index cf32343..843cf97 100644 --- a/include/cudnn_frontend/graph_properties.h +++ b/include/cudnn_frontend/graph_properties.h @@ -17,17 +17,32 @@ namespace graph { // simple structure to hold all properties of a tensor. // Each property has a getter setter. class Tensor_attributes { + public: + using uid_t = int64_t; + + // There are two usecases of pass by value tensors: + // 1. Fused scalar constants + // 2. Scalar passed during execution + // In approach 1, users provide a value to embed into the graph. + // In approach 2, users set is_pass_by_value boolean and then pass a pointer to scalar value with execute() API. + // A closed set of types that are allowed to be passed by value. + using pass_by_values_t = std::variant; + + private: template friend class Attributes; std::string name; - DataType_t data_type = DataType_t::NOT_SET; - std::vector dim = {}; - std::vector stride = {}; - bool is_virtual = false; - bool is_pass_by_value = false; + DataType_t data_type = DataType_t::NOT_SET; + std::vector dim = {}; + std::vector stride = {}; + bool is_virtual = false; + + std::optional pass_by_value = std::nullopt; + bool is_pass_by_value = false; + TensorReordering_t reordering_type = TensorReordering_t::NONE; - int64_t uid = 0; + uid_t uid = 0; bool uid_assigned = false; std::shared_ptr ragged_offset; @@ -46,6 +61,11 @@ class Tensor_attributes { error_code_t::ATTRIBUTE_NOT_SET, "Tensor '" + name + "' can't be both virutal and pass_by_value at the same time."); + RETURN_CUDNN_FRONTEND_ERROR_IF( + pass_by_value.has_value() & (!is_pass_by_value), + error_code_t::ATTRIBUTE_NOT_SET, + "Tensor '" + name + "' can't be a fused scalar and not a pass_by_value tensor at the same time."); + return {error_code_t::OK, ""}; } @@ -68,6 +88,7 @@ class Tensor_attributes { dim, stride, is_virtual, + pass_by_value, is_pass_by_value, reordering_type, uid, @@ -75,6 +96,34 @@ class Tensor_attributes { Tensor_attributes() = default; + Tensor_attributes(float const& scalar) { + pass_by_value = scalar; + is_pass_by_value = true; + dim = stride = {1}; + data_type = DataType_t::FLOAT; + } + + Tensor_attributes(half const& scalar) { + pass_by_value = scalar; + is_pass_by_value = true; + dim = stride = {1}; + data_type = DataType_t::HALF; + } + + Tensor_attributes(nv_bfloat16 const& scalar) { + pass_by_value = scalar; + is_pass_by_value = true; + dim = stride = {1}; + data_type = DataType_t::BFLOAT16; + } + + Tensor_attributes(int32_t const& scalar) { + pass_by_value = scalar; + is_pass_by_value = true; + dim = stride = {1}; + data_type = DataType_t::INT32; + } + std::string get_name() const { return name; @@ -140,6 +189,11 @@ class Tensor_attributes { return set_is_virtual(!value); } + std::optional + get_pass_by_value() const { + return pass_by_value; + } + bool 
get_is_pass_by_value() const { return is_pass_by_value; @@ -162,12 +216,12 @@ class Tensor_attributes { return *this; } - int64_t + uid_t get_uid() const { return uid; } - int64_t + uid_t has_uid() const { return uid_assigned; } @@ -180,7 +234,7 @@ class Tensor_attributes { } auto - set_uid(int64_t value) -> Tensor_attributes& { + set_uid(uid_t value) -> Tensor_attributes& { uid = value; uid_assigned = true; return *this; @@ -247,6 +301,21 @@ class Attributes { return non_virtual_uids; } + public: + error_t + fill_pass_by_value(std::unordered_map& + tensor_to_pass_by_value) const { + auto derived = static_cast(this); + for (auto& [name, tensor] : derived->inputs) { + (void)name; + if (tensor && tensor->get_pass_by_value().has_value()) { + tensor_to_pass_by_value.emplace(tensor->get_uid(), tensor->get_pass_by_value().value()); + } + } + + return {error_code_t::OK, ""}; + } + void fill_from_context(detail::Context const& context) { auto derived = static_cast(this); @@ -275,9 +344,27 @@ class Attributes { if (compute_data_type == DataType_t::NOT_SET) { set_compute_data_type(context.get_compute_data_type()); } + + // Handle shape and stride inferencing for fused scalars. + // Pick number of dimensions from anyone of non-fused-scalar input tensors + // In case, all tensors are fused scalars, just keep them 1D. + int64_t number_of_dims = 1; + for (auto [name, tensor] : derived->inputs) { + (void)name; + if (tensor && (tensor->get_pass_by_value().has_value() == false)) { + number_of_dims = tensor->get_dim().size(); + break; + } + } + for (auto [name, tensor] : derived->inputs) { + (void)name; + if (tensor && tensor->get_pass_by_value().has_value()) { + tensor->set_dim(std::vector(number_of_dims, 1)); + tensor->set_stride(std::vector(number_of_dims, 1)); + } + } } - public: std::string name; DataType_t compute_data_type = DataType_t::NOT_SET; @@ -366,6 +453,49 @@ class Attributes { return {error_code_t::OK, ""}; } + + error_t + set_uids(int64_t& potential_uid, std::unordered_set const& pre_assigned_uids) const { + auto derived = static_cast(this); + + auto get_next_potential_uid = [&]() -> void { + do { + ++potential_uid; + } while (pre_assigned_uids.find(potential_uid) != pre_assigned_uids.end()); + }; + + std::function)> assign_uid_to_tensor = + [&](std::shared_ptr tensor) { + if (!tensor) return; + if (tensor->has_uid() == false) { + get_next_potential_uid(); + tensor->set_uid(potential_uid); + } + if (auto ragged_offset = tensor->get_ragged_offset()) { + assign_uid_to_tensor(ragged_offset); + } + }; + + for (auto [name, tensor] : derived->inputs) { + (void)name; + assign_uid_to_tensor(tensor); + } + + for (auto [name, tensor] : derived->outputs) { + (void)name; + assign_uid_to_tensor(tensor); + } + + // Handle special case of BN where peer_stats is also an input + if constexpr (std::is_same_v || + std::is_same_v) { + for (auto& tensor : derived->peer_stats) { + assign_uid_to_tensor(tensor); + } + } + + return {error_code_t::OK, ""}; + } }; class BN_finalize_attributes : public Attributes { @@ -718,7 +848,7 @@ class Layernorm_backward_attributes : public Attributes> inputs; enum class output_names { DX, DSCALE, DBIAS }; std::map> outputs; diff --git a/include/cudnn_frontend/node/batchnorm.h b/include/cudnn_frontend/node/batchnorm.h index c5c3a50..bc9bc05 100644 --- a/include/cudnn_frontend/node/batchnorm.h +++ b/include/cudnn_frontend/node/batchnorm.h @@ -9,12 +9,12 @@ namespace cudnn_frontend { namespace graph { -class BatchNormNode : public INode { +class BatchNormNode : public NodeCRTP { 
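    // NodeCRTP<BatchNormNode> is presumed (from node_interface.h) to centralize
    // the boilerplate this diff deletes below: collect_pre_assigned_uids() and
    // the create_cudnn_tensors() loops over attributes.inputs / attributes.outputs
    // now live in one shared implementation that reaches the derived node's
    // `attributes` member, instead of being hand-written in every node type.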
public: Batchnorm_attributes attributes; BatchNormNode(Batchnorm_attributes&& attributes_, detail::Context const& context) - : INode(context), attributes(std::move(attributes_)) {} + : NodeCRTP(context), attributes(std::move(attributes_)) {} Type getType() override final { @@ -22,12 +22,7 @@ class BatchNormNode : public INode { } error_t - collect_pre_assigned_uids(std::unordered_set& pre_assigned_uids) const override final { - return attributes.get_prefilled_uids(pre_assigned_uids); - } - - error_t - expand_and_infer_properties() override final { + expand_and_infer_properties_node() override final { getLogger() << "[cudnn_frontend] INFO: Inferencing properties for batchnorm node " << attributes.name << "..." << std::endl; @@ -111,35 +106,6 @@ class BatchNormNode : public INode { return {error_code_t::OK, ""}; } - error_t - create_cudnn_tensors(int64_t& uid, - std::unordered_map>& tensors, - std::unordered_set const& invalid_uids) const override final { - getLogger() << "[cudnn_frontend] INFO: " - << "Building BatchNormNode tensors " << attributes.name << "..." << std::endl; - - for (auto const& [name, tensor] : attributes.inputs) { - (void)name; - if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); - } - } - for (auto const& [name, tensor] : attributes.outputs) { - (void)name; - if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); - } - } - - // Special case in BN where peer stats is also an input but is not present in inputs map - for (auto const& tensor : attributes.peer_stats) { - if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); - } - } - return {error_code_t::OK, ""}; - } - error_t create_cudnn_operations( std::unordered_set& uids_involved_in_operations, diff --git a/include/cudnn_frontend/node/batchnorm_inference.h b/include/cudnn_frontend/node/batchnorm_inference.h index 3ab531d..9892aca 100644 --- a/include/cudnn_frontend/node/batchnorm_inference.h +++ b/include/cudnn_frontend/node/batchnorm_inference.h @@ -9,12 +9,12 @@ namespace cudnn_frontend { namespace graph { -class BatchnormInferenceNode : public INode { +class BatchnormInferenceNode : public NodeCRTP { public: Batchnorm_inference_attributes attributes; BatchnormInferenceNode(Batchnorm_inference_attributes&& attributes_, detail::Context const& context) - : INode(context), attributes(std::move(attributes_)) {} + : NodeCRTP(context), attributes(std::move(attributes_)) {} Type getType() override final { @@ -22,7 +22,7 @@ class BatchnormInferenceNode : public INode { } error_t - expand_and_infer_properties() override final { + expand_and_infer_properties_node() override final { getLogger() << "[cudnn_frontend] INFO: Inferencing properties for batchnorm inference node " << attributes.name << "..." << std::endl; @@ -74,33 +74,6 @@ class BatchnormInferenceNode : public INode { return {error_code_t::OK, ""}; } - error_t - collect_pre_assigned_uids(std::unordered_set& pre_assigned_uids) const override final { - return attributes.get_prefilled_uids(pre_assigned_uids); - } - - error_t - create_cudnn_tensors(int64_t& uid, - std::unordered_map>& tensors, - std::unordered_set const& invalid_uids) const override final { - getLogger() << "[cudnn_frontend] INFO: " - << "Building BatchnormInferenceNode tensors " << attributes.name << "..." 
<< std::endl; - - for (auto const& [name, tensor] : attributes.inputs) { - (void)name; - if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); - } - } - for (auto const& [name, tensor] : attributes.outputs) { - (void)name; - if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); - } - } - return {error_code_t::OK, ""}; - } - error_t create_cudnn_operations( std::unordered_set& uids_involved_in_operations, diff --git a/include/cudnn_frontend/node/bn_finalize.h b/include/cudnn_frontend/node/bn_finalize.h index 1226d08..1078b35 100644 --- a/include/cudnn_frontend/node/bn_finalize.h +++ b/include/cudnn_frontend/node/bn_finalize.h @@ -10,23 +10,18 @@ namespace cudnn_frontend { namespace graph { -class BatchNormFinalizeNode : public INode { +class BatchNormFinalizeNode : public NodeCRTP { + public: BN_finalize_attributes attributes; - public: BatchNormFinalizeNode(BN_finalize_attributes&& attributes_, detail::Context const& context) - : INode(context), attributes(std::move(attributes_)) {} + : NodeCRTP(context), attributes(std::move(attributes_)) {} Type getType() override final { return Type::BN_FINALIZE; } - error_t - collect_pre_assigned_uids(std::unordered_set& pre_assigned_uids) const override final { - return attributes.get_prefilled_uids(pre_assigned_uids); - } - error_t pre_validate_node() const override final { CHECK_CUDNN_FRONTEND_ERROR(attributes.validate_inputs()); @@ -35,7 +30,7 @@ class BatchNormFinalizeNode : public INode { } error_t - expand_and_infer_properties() override final { + expand_and_infer_properties_node() override final { getLogger() << "[cudnn_frontend] INFO: Inferencing properties for batchnorm finalize node " << attributes.name << "..." << std::endl; @@ -78,29 +73,6 @@ class BatchNormFinalizeNode : public INode { return {error_code_t::OK, ""}; } - error_t - create_cudnn_tensors(int64_t& uid, - std::unordered_map>& tensors, - std::unordered_set const& invalid_uids) const override final { - getLogger() << "[cudnn_frontend] INFO: " - << "Building BatchNormFinalizeNode tensors " << attributes.name << "..." 
<< std::endl; - - for (auto const& [name, tensor] : attributes.inputs) { - (void)name; - if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); - } - } - for (auto const& [name, tensor] : attributes.outputs) { - (void)name; - if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); - } - } - - return {error_code_t::OK, ""}; - } - error_t create_cudnn_operations( std::unordered_set& uids_involved_in_operations, diff --git a/include/cudnn_frontend/node/conv_dgrad.h b/include/cudnn_frontend/node/conv_dgrad.h index a597fa9..c8d2d41 100644 --- a/include/cudnn_frontend/node/conv_dgrad.h +++ b/include/cudnn_frontend/node/conv_dgrad.h @@ -9,12 +9,12 @@ namespace cudnn_frontend::graph { -class DgradNode : public INode { +class DgradNode : public NodeCRTP { + public: Conv_dgrad_attributes attributes; - public: DgradNode(Conv_dgrad_attributes&& attributes_, detail::Context const& context) - : INode(context), attributes(std::move(attributes_)) {} + : NodeCRTP(context), attributes(std::move(attributes_)) {} Type getType() override final { @@ -45,7 +45,7 @@ class DgradNode : public INode { } error_t - expand_and_infer_properties() override final { + expand_and_infer_properties_node() override final { getLogger() << "[cudnn_frontend] INFO: Inferrencing properties for dgrad node " << attributes.name << "..." << std::endl; @@ -81,33 +81,6 @@ class DgradNode : public INode { return {error_code_t::OK, ""}; } - error_t - collect_pre_assigned_uids(std::unordered_set& pre_assigned_uids) const override final { - return attributes.get_prefilled_uids(pre_assigned_uids); - } - - error_t - create_cudnn_tensors(int64_t& uid, - std::unordered_map>& tensors, - std::unordered_set const& invalid_uids) const override final { - getLogger() << "[cudnn_frontend] INFO: " - << "Building DgradNode tensors " << attributes.name << "..." 
<< std::endl; - - for (auto const& [name, tensor] : attributes.inputs) { - (void)name; - if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); - } - } - for (auto const& [name, tensor] : attributes.outputs) { - (void)name; - if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); - } - } - return {error_code_t::OK, ""}; - } - error_t create_cudnn_operations( std::unordered_set& uids_involved_in_operations, diff --git a/include/cudnn_frontend/node/conv_fprop.h b/include/cudnn_frontend/node/conv_fprop.h index 35dcc23..80ea05b 100644 --- a/include/cudnn_frontend/node/conv_fprop.h +++ b/include/cudnn_frontend/node/conv_fprop.h @@ -9,23 +9,18 @@ namespace cudnn_frontend::graph { -class ConvolutionNode : public INode { +class ConvolutionNode : public NodeCRTP { public: Conv_fprop_attributes attributes; ConvolutionNode(Conv_fprop_attributes&& attributes_, detail::Context const& context) - : INode(context), attributes(std::move(attributes_)) {} + : NodeCRTP(context), attributes(std::move(attributes_)) {} Type getType() override final { return Type::CONVOLUTION; } - error_t - collect_pre_assigned_uids(std::unordered_set& pre_assigned_uids) const override final { - return attributes.get_prefilled_uids(pre_assigned_uids); - } - error_t pre_validate_node() const override final { getLogger() << "[cudnn_frontend] INFO: " @@ -50,7 +45,7 @@ class ConvolutionNode : public INode { } error_t - expand_and_infer_properties() override final { + expand_and_infer_properties_node() override final { getLogger() << "[cudnn_frontend] INFO: Inferrencing properties for conv node " << attributes.name << "..." << std::endl; @@ -103,29 +98,6 @@ class ConvolutionNode : public INode { return {error_code_t::OK, ""}; } - error_t - create_cudnn_tensors(int64_t& uid, - std::unordered_map>& tensors, - std::unordered_set const& invalid_uids) const override final { - getLogger() << "[cudnn_frontend] INFO: " - << "Building ConvolutionNode tensors " << attributes.name << "..." << std::endl; - - for (auto const& [name, tensor] : attributes.inputs) { - (void)name; - if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); - } - } - for (auto const& [name, tensor] : attributes.outputs) { - (void)name; - if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); - } - } - - return {error_code_t::OK, ""}; - } - error_t create_cudnn_operations( std::unordered_set& uids_involved_in_operations, diff --git a/include/cudnn_frontend/node/conv_wgrad.h b/include/cudnn_frontend/node/conv_wgrad.h index 575be4b..c1433c1 100644 --- a/include/cudnn_frontend/node/conv_wgrad.h +++ b/include/cudnn_frontend/node/conv_wgrad.h @@ -9,12 +9,12 @@ namespace cudnn_frontend::graph { -class WgradNode : public INode { +class WgradNode : public NodeCRTP { + public: Conv_wgrad_attributes attributes; - public: WgradNode(Conv_wgrad_attributes&& attributes_, detail::Context const& context) - : INode(context), attributes(std::move(attributes_)) {} + : NodeCRTP(context), attributes(std::move(attributes_)) {} Type getType() override final { @@ -45,7 +45,7 @@ class WgradNode : public INode { } error_t - expand_and_infer_properties() override final { + expand_and_infer_properties_node() override final { getLogger() << "[cudnn_frontend] INFO: Inferrencing properties for conv node " << attributes.name << "." 
<< std::endl; @@ -81,33 +81,6 @@ class WgradNode : public INode { return {error_code_t::OK, ""}; } - error_t - collect_pre_assigned_uids(std::unordered_set& pre_assigned_uids) const override final { - return attributes.get_prefilled_uids(pre_assigned_uids); - } - - error_t - create_cudnn_tensors(int64_t& uid, - std::unordered_map>& tensors, - std::unordered_set const& invalid_uids) const override final { - getLogger() << "[cudnn_frontend] INFO: " - << "Building WgradNode tensors " << attributes.name << "..." << std::endl; - - for (auto const& [name, tensor] : attributes.inputs) { - (void)name; - if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); - } - } - for (auto const& [name, tensor] : attributes.outputs) { - (void)name; - if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); - } - } - return {error_code_t::OK, ""}; - } - error_t create_cudnn_operations( std::unordered_set& uids_involved_in_operations, diff --git a/include/cudnn_frontend/node/dbn.h b/include/cudnn_frontend/node/dbn.h index b2df03d..828f039 100644 --- a/include/cudnn_frontend/node/dbn.h +++ b/include/cudnn_frontend/node/dbn.h @@ -10,23 +10,18 @@ namespace cudnn_frontend { namespace graph { -class DBNNode : public INode { +class DBNNode : public NodeCRTP { public: Batchnorm_backward_attributes attributes; DBNNode(Batchnorm_backward_attributes&& attributes_, detail::Context const& context) - : INode(context), attributes(std::move(attributes_)) {} + : NodeCRTP(context), attributes(std::move(attributes_)) {} Type getType() override final { return Type::DBN; } - error_t - collect_pre_assigned_uids(std::unordered_set& pre_assigned_uids) const override final { - return attributes.get_prefilled_uids(pre_assigned_uids); - } - error_t pre_validate_node() const override final { getLogger() << "[cudnn_frontend] INFO: " @@ -37,7 +32,7 @@ class DBNNode : public INode { } error_t - expand_and_infer_properties() override final { + expand_and_infer_properties_node() override final { getLogger() << "[cudnn_frontend] INFO: Inferencing properties for DBN node " << attributes.name << "..." << std::endl; @@ -91,36 +86,6 @@ class DBNNode : public INode { return {error_code_t::OK, ""}; } - error_t - create_cudnn_tensors(int64_t& uid, - std::unordered_map>& tensors, - std::unordered_set const& invalid_uids) const override final { - getLogger() << "[cudnn_frontend] INFO: " - << "Building DBNNode tensors " << attributes.name << "..." 
<< std::endl; - - for (auto const& [name, tensor] : attributes.inputs) { - (void)name; - if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); - } - } - for (auto const& [name, tensor] : attributes.outputs) { - (void)name; - if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); - } - } - - // Special case in BN where peer stats is also an input but is not present in inputs map - for (auto const& tensor : attributes.peer_stats) { - if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); - } - } - - return {error_code_t::OK, ""}; - } - error_t create_cudnn_operations( std::unordered_set& uids_involved_in_operations, diff --git a/include/cudnn_frontend/node/dbn_weight.h b/include/cudnn_frontend/node/dbn_weight.h index dda9dae..7d54dcd 100644 --- a/include/cudnn_frontend/node/dbn_weight.h +++ b/include/cudnn_frontend/node/dbn_weight.h @@ -10,12 +10,12 @@ namespace cudnn_frontend { namespace graph { -class DBNWeightNode : public INode { +class DBNWeightNode : public NodeCRTP { + public: DBN_weight_attributes attributes; - public: DBNWeightNode(DBN_weight_attributes&& attributes_, detail::Context const& context) - : INode(context), attributes(std::move(attributes_)) {} + : NodeCRTP(context), attributes(std::move(attributes_)) {} Type getType() override final { @@ -30,7 +30,7 @@ class DBNWeightNode : public INode { } error_t - expand_and_infer_properties() override final { + expand_and_infer_properties_node() override final { getLogger() << "[cudnn_frontend] INFO: Inferencing properties for batchnorm finalize node " << attributes.name << "..." << std::endl; @@ -79,11 +79,6 @@ class DBNWeightNode : public INode { return {error_code_t::OK, ""}; } - error_t - collect_pre_assigned_uids(std::unordered_set& pre_assigned_uids) const override final { - return attributes.get_prefilled_uids(pre_assigned_uids); - } - error_t post_validate_node() const override final { // Validate outputs @@ -93,29 +88,6 @@ class DBNWeightNode : public INode { return {error_code_t::OK, ""}; } - error_t - create_cudnn_tensors(int64_t& uid, - std::unordered_map>& tensors, - std::unordered_set const& invalid_uids) const override final { - getLogger() << "[cudnn_frontend] INFO: " - << "Building DBNWeightNode tensors " << attributes.name << "..." 
<< std::endl; - - for (auto const& [name, tensor] : attributes.inputs) { - (void)name; - if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); - } - } - for (auto const& [name, tensor] : attributes.outputs) { - (void)name; - if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); - } - } - - return {error_code_t::OK, ""}; - } - error_t create_cudnn_operations( std::unordered_set& uids_involved_in_operations, diff --git a/include/cudnn_frontend/node/dln.h b/include/cudnn_frontend/node/dln.h index 9d4ebbb..a49734b 100644 --- a/include/cudnn_frontend/node/dln.h +++ b/include/cudnn_frontend/node/dln.h @@ -10,15 +10,12 @@ namespace cudnn_frontend { namespace graph { -class DLNNode : public INode { - // Keep epsilon for pre-8906 - std::shared_ptr epsilon; - +class DLNNode : public NodeCRTP { public: Layernorm_backward_attributes attributes; DLNNode(Layernorm_backward_attributes&& attributes_, detail::Context const& context) - : INode(context), attributes(std::move(attributes_)) {} + : NodeCRTP(context), attributes(std::move(attributes_)) {} Type getType() override final { @@ -36,10 +33,16 @@ class DLNNode : public INode { } error_t - expand_and_infer_properties() override final { + expand_and_infer_properties_node() override final { getLogger() << "[cudnn_frontend] INFO: Inferencing properties for DLN node " << attributes.name << "..." << std::endl; + // WAR as epsilon was required in previous versions + if (cudnn_frontend::get_backend_version() < 8906) { + attributes.inputs[Layernorm_backward_attributes::input_names::EPSILON] = + std::make_shared(0.0f); + } + attributes.fill_from_context(context); // TODO: Only inferencing from X works today. @@ -96,22 +99,9 @@ class DLNNode : public INode { infer_scale_bias_tensors(attributes.outputs[Layernorm_backward_attributes::output_names::DSCALE]); infer_scale_bias_tensors(attributes.outputs[Layernorm_backward_attributes::output_names::DBIAS]); - if (cudnnGetVersion() < 8906) { - epsilon = std::make_shared(); - epsilon->set_is_pass_by_value(true) - .set_dim({1, 1, 1, 1}) - .set_stride({1, 1, 1, 1}) - .set_data_type(DataType_t::FLOAT); - } - return {error_code_t::OK, ""}; } - error_t - collect_pre_assigned_uids(std::unordered_set& pre_assigned_uids) const override final { - return attributes.get_prefilled_uids(pre_assigned_uids); - } - error_t post_validate_node() const override final { // Validate outputs @@ -121,33 +111,6 @@ class DLNNode : public INode { return {error_code_t::OK, ""}; } - error_t - create_cudnn_tensors(int64_t& uid, - std::unordered_map>& tensors, - std::unordered_set const& invalid_uids) const override final { - getLogger() << "[cudnn_frontend] INFO: " - << "Building DLNNode tensors " << attributes.name << "..." 
<< std::endl; - - for (auto const& [name, tensor] : attributes.inputs) { - (void)name; - if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); - } - } - for (auto const& [name, tensor] : attributes.outputs) { - (void)name; - if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); - } - } - - if (epsilon) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(epsilon, uid, tensors, invalid_uids)); - } - - return {error_code_t::OK, ""}; - } - error_t create_cudnn_operations( std::unordered_set& uids_involved_in_operations, @@ -184,9 +147,9 @@ class DLNNode : public INode { CUDNN_FE_VALIDATE_AND_ASSIGN_OUTPUT_TENSOR(DX, Layernorm_backward_attributes::output_names::DX); DLN_op_builder.setdxDesc(*(tensors.at(DX->second->get_uid()))); - if (epsilon) { - DLN_op_builder.setEpsilonTensor(*(tensors.at(epsilon->get_uid()))); - uids_involved_in_operations.insert(epsilon->get_uid()); + if (cudnn_frontend::get_backend_version() < 8906) { + CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(EPSILON, Layernorm_backward_attributes::input_names::EPSILON); + DLN_op_builder.setEpsilonTensor(*(tensors.at(EPSILON->second->get_uid()))); } #ifdef NV_CUDNN_DISABLE_EXCEPTION @@ -219,15 +182,6 @@ class DLNNode : public INode { j = attributes; j.update(R"( {"tag": "LAYER_NORM_BPROP"})"_json); } - - error_t - pass_by_value_tensors_(std::unordered_map& tensor_to_pass_by_value) const override final { - if (epsilon) { - // can pass in any dummy value - tensor_to_pass_by_value.emplace(epsilon->get_uid(), 0.0f); - } - return {error_code_t::OK, ""}; - } }; } // namespace graph diff --git a/include/cudnn_frontend/node/genstats.h b/include/cudnn_frontend/node/genstats.h index 2703dec..4159f2e 100644 --- a/include/cudnn_frontend/node/genstats.h +++ b/include/cudnn_frontend/node/genstats.h @@ -9,23 +9,18 @@ namespace cudnn_frontend { namespace graph { -class GenstatsNode : public INode { +class GenstatsNode : public NodeCRTP { + public: Genstats_attributes attributes; - public: GenstatsNode(Genstats_attributes&& attributes_, detail::Context const& context) - : INode(context), attributes(std::move(attributes_)) {} + : NodeCRTP(context), attributes(std::move(attributes_)) {} Type getType() override final { return Type::GENSTATS; } - error_t - collect_pre_assigned_uids(std::unordered_set& pre_assigned_uids) const override final { - return attributes.get_prefilled_uids(pre_assigned_uids); - } - error_t pre_validate_node() const override final { CHECK_CUDNN_FRONTEND_ERROR(attributes.validate_inputs()); @@ -34,7 +29,7 @@ class GenstatsNode : public INode { } error_t - expand_and_infer_properties() override final { + expand_and_infer_properties_node() override final { attributes.fill_from_context(context); // Only inferrencing from X works today. @@ -84,28 +79,6 @@ class GenstatsNode : public INode { return {error_code_t::OK, ""}; } - error_t - create_cudnn_tensors(int64_t& uid, - std::unordered_map>& tensors, - std::unordered_set const& invalid_uids) const override final { - getLogger() << "[cudnn_frontend] INFO: " - << "Building GenstatsNode tensors " << attributes.name << "..." 
<< std::endl; - - for (auto const& [name, tensor] : attributes.inputs) { - (void)name; - if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); - } - } - for (auto const& [name, tensor] : attributes.outputs) { - (void)name; - if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); - } - } - return {error_code_t::OK, ""}; - } - error_t create_cudnn_operations( std::unordered_set& uids_involved_in_operations, diff --git a/include/cudnn_frontend/node/instancenorm.h b/include/cudnn_frontend/node/instancenorm.h index c8f1b07..87f487a 100644 --- a/include/cudnn_frontend/node/instancenorm.h +++ b/include/cudnn_frontend/node/instancenorm.h @@ -9,12 +9,12 @@ namespace cudnn_frontend { namespace graph { -class InstanceNormNode : public INode { +class InstanceNormNode : public NodeCRTP { public: Instancenorm_attributes attributes; InstanceNormNode(Instancenorm_attributes&& attributes_, detail::Context const& context) - : INode(context), attributes(std::move(attributes_)) {} + : NodeCRTP(context), attributes(std::move(attributes_)) {} Type getType() override final { @@ -22,7 +22,7 @@ class InstanceNormNode : public INode { } error_t - expand_and_infer_properties() override final { + expand_and_infer_properties_node() override final { getLogger() << "[cudnn_frontend] INFO: Inferencing properties for instancenorm node " << attributes.name << "..." << std::endl; @@ -102,27 +102,6 @@ class InstanceNormNode : public INode { return {error_code_t::OK, ""}; } - error_t - create_cudnn_tensors(int64_t& uid, - std::unordered_map>& tensors, - std::unordered_set const& invalid_uids) const override final { - getLogger() << "[cudnn_frontend] INFO: " - << "Building InstanceNormNode tensors " << attributes.name << "..." << std::endl; - - for (auto const& [name, tensor] : attributes.inputs) { - (void)name; - if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); - } - } - for (auto const& [name, tensor] : attributes.outputs) { - (void)name; - if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); - } - } - return {error_code_t::OK, ""}; - } error_t create_cudnn_operations( std::unordered_set& uids_involved_in_operations, @@ -188,19 +167,14 @@ class InstanceNormNode : public INode { j = attributes; j.update(R"( {"tag": "INSTANCE_NORM"})"_json); } - - error_t - collect_pre_assigned_uids(std::unordered_set& pre_assigned_uids) const override final { - return attributes.get_prefilled_uids(pre_assigned_uids); - } }; -class DINNode : public INode { +class DINNode : public NodeCRTP { public: Instancenorm_backward_attributes attributes; DINNode(Instancenorm_backward_attributes&& attributes_, detail::Context const& context) - : INode(context), attributes(std::move(attributes_)) {} + : NodeCRTP(context), attributes(std::move(attributes_)) {} Type getType() override final { @@ -218,7 +192,7 @@ class DINNode : public INode { } error_t - expand_and_infer_properties() override final { + expand_and_infer_properties_node() override final { getLogger() << "[cudnn_frontend] INFO: Inferencing properties for DIN node " << attributes.name << "..." 
<< std::endl; @@ -296,33 +270,6 @@ class DINNode : public INode { return {error_code_t::OK, ""}; } - error_t - collect_pre_assigned_uids(std::unordered_set& pre_assigned_uids) const override final { - return attributes.get_prefilled_uids(pre_assigned_uids); - } - - error_t - create_cudnn_tensors(int64_t& uid, - std::unordered_map>& tensors, - std::unordered_set const& invalid_uids) const override final { - getLogger() << "[cudnn_frontend] INFO: " - << "Building DINode tensors " << attributes.name << "..." << std::endl; - - for (auto const& [name, tensor] : attributes.inputs) { - (void)name; - if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); - } - } - for (auto const& [name, tensor] : attributes.outputs) { - (void)name; - if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); - } - } - return {error_code_t::OK, ""}; - } - error_t create_cudnn_operations( std::unordered_set& uids_involved_in_operations, diff --git a/include/cudnn_frontend/node/layernorm.h b/include/cudnn_frontend/node/layernorm.h index 27e1ac7..136df63 100644 --- a/include/cudnn_frontend/node/layernorm.h +++ b/include/cudnn_frontend/node/layernorm.h @@ -9,12 +9,12 @@ namespace cudnn_frontend { namespace graph { -class LayerNormNode : public INode { +class LayerNormNode : public NodeCRTP { public: Layernorm_attributes attributes; LayerNormNode(Layernorm_attributes&& attributes_, detail::Context const& context) - : INode(context), attributes(std::move(attributes_)) {} + : NodeCRTP(context), attributes(std::move(attributes_)) {} Type getType() override final { @@ -22,12 +22,7 @@ class LayerNormNode : public INode { } error_t - collect_pre_assigned_uids(std::unordered_set& pre_assigned_uids) const override final { - return attributes.get_prefilled_uids(pre_assigned_uids); - } - - error_t - expand_and_infer_properties() override final { + expand_and_infer_properties_node() override final { getLogger() << "[cudnn_frontend] INFO: Inferencing properties for layernorm node " << attributes.name << "..." << std::endl; @@ -151,27 +146,6 @@ class LayerNormNode : public INode { return {error_code_t::OK, ""}; } - error_t - create_cudnn_tensors(int64_t& uid, - std::unordered_map>& tensors, - std::unordered_set const& invalid_uids) const override final { - getLogger() << "[cudnn_frontend] INFO: " - << "Building LayerNormNode tensors " << attributes.name << "..." 
<< std::endl; - - for (auto const& [name, tensor] : attributes.inputs) { - (void)name; - if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); - } - } - for (auto const& [name, tensor] : attributes.outputs) { - (void)name; - if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); - } - } - return {error_code_t::OK, ""}; - } error_t create_cudnn_operations( std::unordered_set& uids_involved_in_operations, diff --git a/include/cudnn_frontend/node/matmul.h b/include/cudnn_frontend/node/matmul.h index aa9fa95..8702975 100644 --- a/include/cudnn_frontend/node/matmul.h +++ b/include/cudnn_frontend/node/matmul.h @@ -9,23 +9,18 @@ namespace cudnn_frontend::graph { -class MatmulNode : public INode { +class MatmulNode : public NodeCRTP { + public: Matmul_attributes attributes; - public: MatmulNode(Matmul_attributes&& attributes_, detail::Context const& context) - : INode(context), attributes(std::move(attributes_)) {} + : NodeCRTP(context), attributes(std::move(attributes_)) {} Type getType() override final { return Type::MATMUL; } - error_t - collect_pre_assigned_uids(std::unordered_set& pre_assigned_uids) const override final { - return attributes.get_prefilled_uids(pre_assigned_uids); - } - error_t pre_validate_node() const override final { getLogger() << "[cudnn_frontend] INFO: " @@ -41,7 +36,7 @@ class MatmulNode : public INode { } error_t - expand_and_infer_properties() override final { + expand_and_infer_properties_node() override final { getLogger() << "[cudnn_frontend] INFO: Inferrencing properties for matmul node " << attributes.name << "..." << std::endl; @@ -90,29 +85,6 @@ class MatmulNode : public INode { return {error_code_t::OK, ""}; } - error_t - create_cudnn_tensors(int64_t& uid, - std::unordered_map>& tensors, - std::unordered_set const& invalid_uids) const override final { - getLogger() << "[cudnn_frontend] INFO: " - << "Building MatmulNode tensors " << attributes.name << "..." 
<< std::endl; - - for (auto const& [name, tensor] : attributes.inputs) { - (void)name; - if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); - } - } - for (auto const& [name, tensor] : attributes.outputs) { - (void)name; - if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); - } - } - - return {error_code_t::OK, ""}; - } - error_t create_cudnn_operations( std::unordered_set& uids_involved_in_operations, diff --git a/include/cudnn_frontend/node/pointwise.h b/include/cudnn_frontend/node/pointwise.h index 861d98f..a28829d 100644 --- a/include/cudnn_frontend/node/pointwise.h +++ b/include/cudnn_frontend/node/pointwise.h @@ -9,23 +9,18 @@ namespace cudnn_frontend::graph { -class PointwiseNode : public INode { +class PointwiseNode : public NodeCRTP { public: Pointwise_attributes attributes; PointwiseNode(Pointwise_attributes&& attributes_, detail::Context const& context) - : INode(context), attributes(std::move(attributes_)) {} + : NodeCRTP(context), attributes(std::move(attributes_)) {} Type getType() override final { return Type::POINTWISE; } - error_t - collect_pre_assigned_uids(std::unordered_set& pre_assigned_uids) const override final { - return attributes.get_prefilled_uids(pre_assigned_uids); - } - error_t pre_validate_node() const override final { getLogger() << "[cudnn_frontend] INFO: " @@ -53,7 +48,7 @@ class PointwiseNode : public INode { } error_t - expand_and_infer_properties() override final { + expand_and_infer_properties_node() override final { getLogger() << "[cudnn_frontend] INFO: Inferrencing properties for pointwise node " << attributes.name << "..." << std::endl; @@ -85,29 +80,6 @@ class PointwiseNode : public INode { return {error_code_t::OK, ""}; } - error_t - create_cudnn_tensors(int64_t& uid, - std::unordered_map>& tensors, - std::unordered_set const& invalid_uids) const override final { - getLogger() << "[cudnn_frontend] INFO: " - << "Building PointwiseNode " << attributes.name << " tensors X:" << std::endl; - - for (auto const& [name, tensor] : attributes.inputs) { - (void)name; - if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); - } - } - for (auto const& [name, tensor] : attributes.outputs) { - (void)name; - if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); - } - } - - return {error_code_t::OK, ""}; - } - error_t create_cudnn_operations( std::unordered_set& uids_involved_in_operations, diff --git a/include/cudnn_frontend/node/reduction.h b/include/cudnn_frontend/node/reduction.h index 45bd3b1..b2cc947 100644 --- a/include/cudnn_frontend/node/reduction.h +++ b/include/cudnn_frontend/node/reduction.h @@ -8,12 +8,12 @@ namespace cudnn_frontend::graph { -class ReductionNode : public INode { +class ReductionNode : public NodeCRTP { + public: Reduction_attributes attributes; - public: ReductionNode(Reduction_attributes&& attributes_, detail::Context const& context) - : INode(context), attributes(std::move(attributes_)) {} + : NodeCRTP(context), attributes(std::move(attributes_)) {} Type getType() override final { @@ -34,7 +34,7 @@ class ReductionNode : public INode { } error_t - expand_and_infer_properties() override final { + expand_and_infer_properties_node() override final { getLogger() << "[cudnn_frontend] INFO: Inferrencing properties for reduction node " << attributes.name << "..." 
<< std::endl; @@ -69,33 +69,6 @@ class ReductionNode : public INode { return {error_code_t::OK, ""}; } - error_t - collect_pre_assigned_uids(std::unordered_set& pre_assigned_uids) const override final { - return attributes.get_prefilled_uids(pre_assigned_uids); - } - - error_t - create_cudnn_tensors(int64_t& uid, - std::unordered_map>& tensors, - std::unordered_set const& invalid_uids) const override final { - getLogger() << "[cudnn_frontend] INFO: " - << "Building ReductionNode tensors " << attributes.name << "..." << std::endl; - - for (auto const& [name, tensor] : attributes.inputs) { - (void)name; - if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); - } - } - for (auto const& [name, tensor] : attributes.outputs) { - (void)name; - if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); - } - } - return {error_code_t::OK, ""}; - } - error_t create_cudnn_operations( std::unordered_set& uids_involved_in_operations, diff --git a/include/cudnn_frontend/node/reshape.h b/include/cudnn_frontend/node/reshape.h index f9d9686..563b930 100644 --- a/include/cudnn_frontend/node/reshape.h +++ b/include/cudnn_frontend/node/reshape.h @@ -7,23 +7,18 @@ namespace cudnn_frontend::graph { -class ReshapeNode : public INode { +class ReshapeNode : public NodeCRTP { + public: Reshape_attributes attributes; - public: ReshapeNode(Reshape_attributes&& attributes_, detail::Context const& context) - : INode(context), attributes(std::move(attributes_)) {} + : NodeCRTP(context), attributes(std::move(attributes_)) {} Type getType() override final { return Type::RESHAPE; } - error_t - collect_pre_assigned_uids(std::unordered_set& pre_assigned_uids) const override final { - return attributes.get_prefilled_uids(pre_assigned_uids); - } - error_t pre_validate_node() const override final { getLogger() << "[cudnn_frontend] INFO: " @@ -43,7 +38,7 @@ class ReshapeNode : public INode { } error_t - expand_and_infer_properties() override final { + expand_and_infer_properties_node() override final { getLogger() << "[cudnn_frontend] INFO: Inferrencing properties for reshape node " << attributes.name << "..." << std::endl; @@ -86,28 +81,6 @@ class ReshapeNode : public INode { return {error_code_t::OK, ""}; } - error_t - create_cudnn_tensors(int64_t& uid, - std::unordered_map>& tensors, - std::unordered_set const& invalid_uids) const override final { - getLogger() << "[cudnn_frontend] INFO: " - << "Building Reshape tensors " << attributes.name << "..." 
<< std::endl; - - for (auto const& [name, tensor] : attributes.inputs) { - (void)name; - if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); - } - } - for (auto const& [name, tensor] : attributes.outputs) { - (void)name; - if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); - } - } - return {error_code_t::OK, ""}; - } - error_t create_cudnn_operations( std::unordered_set& uids_involved_in_operations, diff --git a/include/cudnn_frontend/node/rmsnorm.h b/include/cudnn_frontend/node/rmsnorm.h index 23cc23a..e2072f1 100644 --- a/include/cudnn_frontend/node/rmsnorm.h +++ b/include/cudnn_frontend/node/rmsnorm.h @@ -9,12 +9,12 @@ namespace cudnn_frontend { namespace graph { -class RMSNormNode : public INode { +class RMSNormNode : public NodeCRTP { public: Rmsnorm_attributes attributes; RMSNormNode(Rmsnorm_attributes&& attributes_, detail::Context const& context) - : INode(context), attributes(std::move(attributes_)) {} + : NodeCRTP(context), attributes(std::move(attributes_)) {} Type getType() override final { @@ -22,7 +22,7 @@ class RMSNormNode : public INode { } error_t - expand_and_infer_properties() override final { + expand_and_infer_properties_node() override final { getLogger() << "[cudnn_frontend] INFO: Inferencing properties for rmsnorm node " << attributes.name << "..." << std::endl; @@ -87,28 +87,6 @@ class RMSNormNode : public INode { return {error_code_t::OK, ""}; } - error_t - create_cudnn_tensors(int64_t& uid, - std::unordered_map>& tensors, - std::unordered_set const& invalid_uids) const override final { - getLogger() << "[cudnn_frontend] INFO: " - << "Building RMSNormNode tensors " << attributes.name << "..." << std::endl; - - for (auto const& [name, tensor] : attributes.inputs) { - (void)name; - if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); - } - } - for (auto const& [name, tensor] : attributes.outputs) { - (void)name; - if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); - } - } - return {error_code_t::OK, ""}; - } - error_t create_cudnn_operations( std::unordered_set& uids_involved_in_operations, @@ -173,19 +151,14 @@ class RMSNormNode : public INode { j = attributes; j.update(R"( {"tag": "RMS_NORM"})"_json); } - - error_t - collect_pre_assigned_uids(std::unordered_set& pre_assigned_uids) const override final { - return attributes.get_prefilled_uids(pre_assigned_uids); - } }; -class DRMSNormNode : public INode { +class DRMSNormNode : public NodeCRTP { public: Rmsnorm_backward_attributes attributes; DRMSNormNode(Rmsnorm_backward_attributes&& attributes_, detail::Context const& context) - : INode(context), attributes(std::move(attributes_)) {} + : NodeCRTP(context), attributes(std::move(attributes_)) {} Type getType() override final { @@ -207,12 +180,7 @@ class DRMSNormNode : public INode { } error_t - collect_pre_assigned_uids(std::unordered_set& pre_assigned_uids) const override final { - return attributes.get_prefilled_uids(pre_assigned_uids); - } - - error_t - expand_and_infer_properties() override final { + expand_and_infer_properties_node() override final { getLogger() << "[cudnn_frontend] INFO: Inferencing properties for DRMSNorm node " << attributes.name << "..." 
<< std::endl; @@ -286,29 +254,6 @@ class DRMSNormNode : public INode { return {error_code_t::OK, ""}; } - error_t - create_cudnn_tensors(int64_t& uid, - std::unordered_map>& tensors, - std::unordered_set const& invalid_uids) const override final { - getLogger() << "[cudnn_frontend] INFO: " - << "Building DRMSNormNode tensors " << attributes.name << "..." << std::endl; - - for (auto const& [name, tensor] : attributes.inputs) { - (void)name; - if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); - } - } - for (auto const& [name, tensor] : attributes.outputs) { - (void)name; - if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); - } - } - - return {error_code_t::OK, ""}; - } - error_t create_cudnn_operations( std::unordered_set& uids_involved_in_operations, diff --git a/include/cudnn_frontend/node/rng.h b/include/cudnn_frontend/node/rng.h index 9293925..da94285 100644 --- a/include/cudnn_frontend/node/rng.h +++ b/include/cudnn_frontend/node/rng.h @@ -8,12 +8,12 @@ namespace cudnn_frontend::graph { -class RngNode : public INode { +class RngNode : public NodeCRTP { + public: Rng_attributes attributes; - public: RngNode(Rng_attributes&& attributes_, detail::Context const& context) - : INode(context), attributes(std::move(attributes_)) {} + : NodeCRTP(context), attributes(std::move(attributes_)) {} Type getType() override final { @@ -36,34 +36,7 @@ class RngNode : public INode { } error_t - collect_pre_assigned_uids(std::unordered_set& pre_assigned_uids) const override final { - return attributes.get_prefilled_uids(pre_assigned_uids); - } - - error_t - create_cudnn_tensors(int64_t& uid, - std::unordered_map>& tensors, - std::unordered_set const& invalid_uids) const override final { - getLogger() << "[cudnn_frontend] INFO: " - << "Building RngNode tensors " << attributes.name << "..." << std::endl; - - for (auto const& [name, tensor] : attributes.inputs) { - (void)name; - if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); - } - } - for (auto const& [name, tensor] : attributes.outputs) { - (void)name; - if (tensor) { - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, uid, tensors, invalid_uids)); - } - } - return {error_code_t::OK, ""}; - } - - error_t - expand_and_infer_properties() override final { + expand_and_infer_properties_node() override final { getLogger() << "[cudnn_frontend] INFO: Inferrencing properties for rng node " << attributes.name << "..." 
<< std::endl; diff --git a/include/cudnn_frontend/node/scaled_dot_product_attention.h b/include/cudnn_frontend/node/scaled_dot_product_attention.h index 46f7341..c49c034 100644 --- a/include/cudnn_frontend/node/scaled_dot_product_attention.h +++ b/include/cudnn_frontend/node/scaled_dot_product_attention.h @@ -13,14 +13,14 @@ namespace cudnn_frontend::graph { -class ScaledDotProductAttentionNode : public INode { +class ScaledDotProductAttentionNode : public NodeCRTP { + public: std::shared_ptr negative_inf; - public: Scaled_dot_product_attention_attributes options; ScaledDotProductAttentionNode(Scaled_dot_product_attention_attributes&& options_, detail::Context const& context) - : INode(context), options(std::move(options_)) {} + : NodeCRTP(context), options(std::move(options_)) {} Type getType() override final { @@ -54,7 +54,7 @@ class ScaledDotProductAttentionNode : public INode { } error_t - expand_and_infer_properties() override final { + expand_and_infer_properties_node() override final { getLogger() << "[cudnn_frontend] INFO: Inferrencing properties for Scaled_dot_product_attention node " << options.name << "..." << std::endl; diff --git a/include/cudnn_frontend/node/scaled_dot_product_flash_attention.h b/include/cudnn_frontend/node/scaled_dot_product_flash_attention.h index 240eda3..a8ef75e 100644 --- a/include/cudnn_frontend/node/scaled_dot_product_flash_attention.h +++ b/include/cudnn_frontend/node/scaled_dot_product_flash_attention.h @@ -13,35 +13,24 @@ namespace cudnn_frontend::graph { -class SDPANode : public INode { +class SDPANode : public NodeCRTP { using input_names = SDPA_attributes::input_names; using output_names = SDPA_attributes::output_names; std::shared_ptr rng_output; - std::shared_ptr dropout_scale; - std::shared_ptr negative_inf_causal; - // scalar seq_kv only needs to be passed in case there in no padding mask and seq_kv is not multiple of 64. - // Also future versions of cudnn will not need it, hence tensor is pre-fixed with WAR. 
- std::shared_ptr WAR_scalar_max_seq_kv; - std::shared_ptr negative_inf_padding; std::shared_ptr alibi_slopes; public: SDPA_attributes attributes; SDPANode(SDPA_attributes&& attributes_, detail::Context const& context) - : INode(context), attributes(std::move(attributes_)) {} + : NodeCRTP(context), attributes(std::move(attributes_)) {} Type getType() override final { return Type::COMPOSITE; } - error_t - collect_pre_assigned_uids(std::unordered_set& pre_assigned_uids) const override final { - return attributes.get_prefilled_uids(pre_assigned_uids); - } - error_t pre_validate_node() const override final { getLogger() << "[cudnn_frontend] INFO: " @@ -119,8 +108,8 @@ class SDPANode : public INode { auto const& v_dim = attributes.inputs.at(input_names::V)->get_dim(); auto s_kv = v_dim[2]; - if ((s_kv % 64 != 0) && (!(attributes.padding_mask)) && (cudnnGetVersion() < 90000)) { - RETURN_CUDNN_FRONTEND_ERROR_IF((cudnnGetVersion() <= 8905), + if ((s_kv % 64 != 0) && (!(attributes.padding_mask)) && (cudnn_frontend::get_backend_version() < 90000)) { + RETURN_CUDNN_FRONTEND_ERROR_IF((cudnn_frontend::get_backend_version() <= 8905), error_code_t::GRAPH_NOT_SUPPORTED, "s_kv not a multiple of 64 required cudnn version atleast 8.9.5"); auto const& dropout_mask = attributes.inputs.find(input_names::Dropout_mask); @@ -133,7 +122,7 @@ class SDPANode : public INode { "s_kv not a multiple of 64 with dropout enabled is not supported with cudnn version below 9.0.0"); } - if (((s_kv % 64 != 0) || (d_qk % 64 != 0)) && (cudnnGetVersion() <= 8905)) { + if (((s_kv % 64 != 0) || (d_qk % 64 != 0)) && (cudnn_frontend::get_backend_version() <= 8905)) { RETURN_CUDNN_FRONTEND_ERROR_IF( true, error_code_t::GRAPH_NOT_SUPPORTED, @@ -171,7 +160,7 @@ class SDPANode : public INode { "Intermediate tensor data type needs to be set as internal tensors require it."); if (((s_q % 64 != 0) || (s_kv % 64 != 0)) && (attributes.padding_mask || has_dropout_mask) && - (cudnnGetVersion() < 90000)) { + (cudnn_frontend::get_backend_version() < 90000)) { RETURN_CUDNN_FRONTEND_ERROR_IF(true, error_code_t::GRAPH_NOT_SUPPORTED, "s_q/s_kv not a multiple of 64 with padding/dropout mask is not supported " @@ -183,7 +172,7 @@ class SDPANode : public INode { } error_t - expand_and_infer_properties() override final { + expand_and_infer_properties_node() override final { getLogger() << "[cudnn_frontend] INFO: Inferrencing properties for Scaled_dot_product_flash_attention node " << attributes.name << "..." 
<< std::endl; @@ -239,12 +228,8 @@ class SDPANode : public INode { // Optional scale if (attributes.attn_scale_value.has_value()) { - attributes.inputs[input_names::Attn_scale] = std::make_shared(); - attributes.inputs[input_names::Attn_scale] - ->set_dim({1, 1, 1, 1}) - .set_stride({1, 1, 1, 1}) - .set_data_type(DataType_t::FLOAT) - .set_is_pass_by_value(true); + attributes.inputs[input_names::Attn_scale] = + std::make_shared(attributes.attn_scale_value.value()); } if (attributes.inputs[input_names::Attn_scale]) { Pointwise_attributes scale_attributes; @@ -345,12 +330,7 @@ class SDPANode : public INode { logical_and_output->set_data_type(DataType_t::BOOLEAN); // Lower attributes to binary select attributes - negative_inf_padding = std::make_shared(); - negative_inf_padding->set_dim({1, 1, 1, 1}) - .set_stride({1, 1, 1, 1}) - .set_is_pass_by_value(true) - // Hard code data type float as FE itself will place FLOAT_MIN in variant pack later - .set_data_type(DataType_t::FLOAT); + auto negative_inf_padding = std::make_shared(std::numeric_limits::lowest()); auto binary_select_attributes = Pointwise_attributes().set_name("binary_select").set_mode(PointwiseMode_t::BINARY_SELECT); @@ -360,17 +340,13 @@ class SDPANode : public INode { } // 2. (bug in cudnn backend) no padding with max_seq_len%64!=0 - if ((s_kv % 64 != 0) && (!(attributes.padding_mask)) && (cudnnGetVersion() < 90000)) { + if ((s_kv % 64 != 0) && (!(attributes.padding_mask)) && (cudnn_frontend::get_backend_version() < 90000)) { auto col_index_attributes = Pointwise_attributes().set_name("gen_col_index").set_mode(PointwiseMode_t::GEN_INDEX).set_axis(3); auto col_index_output = pointwise(last_output, col_index_attributes); - - WAR_scalar_max_seq_kv = std::make_shared(); - WAR_scalar_max_seq_kv->set_dim({1, 1, 1, 1}) - .set_stride({1, 1, 1, 1}) - .set_is_pass_by_value(true) - // Hard code data type int32 as FE itself will place FLOAT_MIN in variant pack later - .set_data_type(DataType_t::INT32); + // scalar seq_kv only needs to be passed in case there in no padding mask and seq_kv is not multiple of 64. + // Also future versions of cudnn will not need it, hence tensor is pre-fixed with WAR. 
+ auto WAR_scalar_max_seq_kv = std::make_shared(static_cast(s_kv)); auto col_less_seq_kv_attributes = Pointwise_attributes().set_name("col_less_seq_kv").set_mode(PointwiseMode_t::CMP_LT); @@ -378,13 +354,7 @@ class SDPANode : public INode { pointwise(col_index_output, WAR_scalar_max_seq_kv, col_less_seq_kv_attributes); // Lower attributes to binary select attributes - negative_inf_padding = std::make_shared(); - negative_inf_padding->set_dim({1, 1, 1, 1}) - .set_stride({1, 1, 1, 1}) - .set_is_pass_by_value(true) - // Hard code data type float as FE itself will place FLOAT_MIN in variant pack later - .set_data_type(DataType_t::FLOAT); - + auto negative_inf_padding = std::make_shared(std::numeric_limits::lowest()); auto binary_select_attributes = Pointwise_attributes().set_name("binary_select").set_mode(PointwiseMode_t::BINARY_SELECT); auto padding_mask_output = @@ -410,12 +380,7 @@ class SDPANode : public INode { row_greater_than_col_output->set_data_type(DataType_t::BOOLEAN); // Lower attributes to binary select attributes - negative_inf_causal = std::make_shared(); - negative_inf_causal->set_dim({1, 1, 1, 1}) - .set_stride({1, 1, 1, 1}) - .set_is_pass_by_value(true) - // Hard code data type float as FE itself will place FLOAT_MIN in variant pack later - .set_data_type(DataType_t::FLOAT); + auto negative_inf_causal = std::make_shared(std::numeric_limits::lowest()); auto binary_select_attributes = Pointwise_attributes().set_name("binary_select").set_mode(PointwiseMode_t::BINARY_SELECT); @@ -446,7 +411,7 @@ class SDPANode : public INode { if (attributes.dropout_probability.has_value()) { dropout_present = true; // Special case: Skip dropout when 0.0 probability. Only do for 8.9.3 and up as rng isn't optional earlier. - if (cudnnGetVersion() > 8902 && attributes.dropout_probability.value() == 0.0) { + if (cudnn_frontend::get_backend_version() > 8902 && attributes.dropout_probability.value() == 0.0) { dropout_present = false; } } else if (attributes.inputs[input_names::Dropout_mask]) { @@ -480,16 +445,15 @@ class SDPANode : public INode { auto const& dropout_mask_output = pointwise(last_output, rng_output, mask_attributes); last_output = dropout_mask_output; - dropout_scale = std::make_shared(); - dropout_scale->set_dim({1, 1, 1, 1}) - .set_stride({1, 1, 1, 1}) - .set_is_pass_by_value(true) -// Hard code data type input type as FE itself will place value in variant pack later -#if CUDNN_VERSION < 8903 - .set_data_type(attributes.inputs[input_names::Q]->get_data_type()); -#else - .set_data_type(DataType_t::FLOAT); -#endif + std::shared_ptr dropout_scale = nullptr; + + if (get_backend_version() < 8903) { + half dropout_scale_value = __float2half(1.0f / (1.0f - attributes.dropout_probability.value())); + dropout_scale = std::make_shared(dropout_scale_value); + } else { + float dropout_scale_value = (1.0f / (1.0f - attributes.dropout_probability.value())); + dropout_scale = std::make_shared(dropout_scale_value); + } auto dropout_scale_attributes = Pointwise_attributes().set_name("dropout_scale").set_mode(PointwiseMode_t::MUL); @@ -557,41 +521,6 @@ class SDPANode : public INode { return {error_code_t::OK, ""}; } - virtual error_t - pass_by_value_tensors_(std::unordered_map& tensor_to_pass_by_value) const override final { - if (attributes.dropout_probability.has_value() && attributes.dropout_probability.value() != 0.0) { -#if CUDNN_VERSION < 8903 - half dropout_scale_value = __float2half(1.0f / (1.0f - attributes.dropout_probability.value())); -#else - float dropout_scale_value = (1.0f / (1.0f - 
attributes.dropout_probability.value())); -#endif - tensor_to_pass_by_value.emplace(dropout_scale->get_uid(), dropout_scale_value); - } - - if (negative_inf_padding) { - float negative_inf_value = std::numeric_limits::lowest(); - tensor_to_pass_by_value.emplace(negative_inf_padding->get_uid(), negative_inf_value); - } - - if (WAR_scalar_max_seq_kv) { - auto const& v_dim = attributes.inputs.at(input_names::V)->get_dim(); - int32_t s_kv = static_cast(v_dim[2]); - tensor_to_pass_by_value.emplace(WAR_scalar_max_seq_kv->get_uid(), s_kv); - } - - if (negative_inf_causal) { - float negative_inf_value = std::numeric_limits::lowest(); - tensor_to_pass_by_value.emplace(negative_inf_causal->get_uid(), negative_inf_value); - } - - if (attributes.attn_scale_value.has_value()) { - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(Attn_scale, input_names::Attn_scale); - tensor_to_pass_by_value.emplace(Attn_scale->second->get_uid(), attributes.attn_scale_value.value()); - } - - return {error_code_t::OK, ""}; - } - virtual void serialize(json& j) const override final { j = attributes; @@ -599,16 +528,11 @@ class SDPANode : public INode { } }; -class SDPABackwardNode : public INode { +class SDPABackwardNode : public NodeCRTP { using input_names = SDPA_backward_attributes::input_names; using output_names = SDPA_backward_attributes::output_names; private: - // non-virtual node cpu tensors - std::shared_ptr one_tensor; - std::shared_ptr negative_inf_padding; - std::shared_ptr negative_inf_causal; - // non-virtual node gpu tensors std::shared_ptr dQ_accum; int64_t dQ_accum_size = 0; @@ -621,7 +545,7 @@ class SDPABackwardNode : public INode { SDPA_backward_attributes attributes; SDPABackwardNode(SDPA_backward_attributes&& attributes_, detail::Context const& context) - : INode(context), attributes(std::move(attributes_)) {} + : NodeCRTP(context), attributes(std::move(attributes_)) {} Type getType() override final { @@ -682,7 +606,7 @@ class SDPABackwardNode : public INode { int64_t d_v = attributes.inputs.at(input_names::V)->get_dim()[3]; RETURN_CUDNN_FRONTEND_ERROR_IF( - (s_q < 64) && cudnnGetVersion() < 90000, + (s_q < 64) && cudnn_frontend::get_backend_version() < 90000, error_code_t::GRAPH_NOT_SUPPORTED, "Sequence length must be greater than or equal to 64 for cudnn version prior to v9.0.0"); @@ -743,7 +667,7 @@ class SDPABackwardNode : public INode { "Intermediate tensor data type needs to be set as internal tensors require it."); if (((s_q % 64 != 0) || (s_kv % 64 != 0)) && (attributes.padding_mask || has_dropout_mask) && - (cudnnGetVersion() < 90000)) { + (cudnn_frontend::get_backend_version() < 90000)) { RETURN_CUDNN_FRONTEND_ERROR_IF(true, error_code_t::GRAPH_NOT_SUPPORTED, "s_q/s_kv not a multiple of 64 with padding/dropout mask is not supported " @@ -764,12 +688,7 @@ class SDPABackwardNode : public INode { } error_t - collect_pre_assigned_uids(std::unordered_set& pre_assigned_uids) const override final { - return attributes.get_prefilled_uids(pre_assigned_uids); - } - - error_t - expand_and_infer_properties() override final { + expand_and_infer_properties_node() override final { getLogger() << "[cudnn_frontend] INFO: Inferrencing properties for SDPABackwardNode " << attributes.name << "..." 
<< std::endl; @@ -820,16 +739,11 @@ class SDPABackwardNode : public INode { // --------------Initialize and create tensors before creating nodes-------------------- // one_tensor is needed for non-dropout graphs // one_tensor is passed by the node - one_tensor = std::make_shared(); - one_tensor->set_is_virtual(false).set_is_pass_by_value(true); - one_tensor->set_dim({1, 1, 1, 1}).set_stride({1, 1, 1, 1}); - one_tensor->set_data_type(DataType_t::FLOAT); + auto one_tensor = std::make_shared(1.0f); if (attributes.attn_scale_value.has_value()) { - attributes.inputs[input_names::Attn_scale] = std::make_shared(); - attributes.inputs[input_names::Attn_scale]->set_is_virtual(false).set_is_pass_by_value(true); - attributes.inputs[input_names::Attn_scale]->set_dim({1, 1, 1, 1}).set_stride({1, 1, 1, 1}); - attributes.inputs[input_names::Attn_scale]->set_data_type(DataType_t::FLOAT); + attributes.inputs[input_names::Attn_scale] = + std::make_shared(attributes.attn_scale_value.value()); } // alibi_slopes is passed by the node @@ -841,35 +755,17 @@ class SDPABackwardNode : public INode { alibi_slopes_size = h_q * sizeof(float); } - // negative_inf_padding is passed by the node - if (attributes.padding_mask) { - negative_inf_padding = std::make_shared(); - negative_inf_padding->set_is_virtual(false).set_is_pass_by_value(true); - negative_inf_padding->set_dim({1, 1, 1, 1}).set_stride({1, 1, 1, 1}); - negative_inf_padding->set_data_type(DataType_t::FLOAT); - } - - // negative_inf_causal is passed by the node - if (attributes.causal_mask) { - negative_inf_causal = std::make_shared(); - negative_inf_causal->set_is_virtual(false).set_is_pass_by_value(true); - negative_inf_causal->set_dim({1, 1, 1, 1}).set_stride({1, 1, 1, 1}); - negative_inf_causal->set_data_type(DataType_t::FLOAT); - } - // if dropout_prob is used, then the node passes scale and scale inverse // if dropout_mask is used, then the user passes scale and scale_inverse bool is_dropout_prob = (attributes.dropout_probability.has_value()); bool is_dropout_mask = (attributes.inputs[input_names::Dropout_mask] != nullptr); if (is_dropout_prob) { - attributes.inputs[input_names::Dropout_scale] = std::make_shared(); - attributes.inputs[input_names::Dropout_scale]->set_is_virtual(false).set_is_pass_by_value(true); - attributes.inputs[input_names::Dropout_scale]->set_dim({1, 1, 1, 1}).set_stride({1, 1, 1, 1}); - attributes.inputs[input_names::Dropout_scale]->set_data_type(DataType_t::FLOAT); - attributes.inputs[input_names::Dropout_scale_inv] = std::make_shared(); - attributes.inputs[input_names::Dropout_scale_inv]->set_is_virtual(false).set_is_pass_by_value(true); - attributes.inputs[input_names::Dropout_scale_inv]->set_dim({1, 1, 1, 1}).set_stride({1, 1, 1, 1}); - attributes.inputs[input_names::Dropout_scale_inv]->set_data_type(DataType_t::FLOAT); + float dropout_scale_value = 1.0f / (1.0f - attributes.dropout_probability.value()); + float dropout_scale_inv_value = (1.0f - attributes.dropout_probability.value()); + + attributes.inputs[input_names::Dropout_scale] = std::make_shared(dropout_scale_value); + attributes.inputs[input_names::Dropout_scale_inv] = + std::make_shared(dropout_scale_inv_value); } // ---------------------input tensor workarounds--------------------------- @@ -886,8 +782,9 @@ class SDPABackwardNode : public INode { bool use_workspace_opt = false; struct cudaDeviceProp prop; - CHECK_CUDA_ERROR(cudaGetDeviceProperties(&prop, 0)); - if ((cudnnGetVersion() >= 8905 && prop.major >= 9) || (cudnnGetVersion() >= 9000)) { + 
CHECK_CUDA_ERROR(cuda_get_device_properties(&prop, 0)); + if ((cudnn_frontend::get_backend_version() >= 8905 && prop.major >= 9) || + (cudnn_frontend::get_backend_version() >= 9000)) { // default upper limit for workspace 256MB int64_t max_dp_workspace_bytes = 256 * 1024 * 1024; @@ -1082,6 +979,7 @@ class SDPABackwardNode : public INode { .set_mode(PointwiseMode_t::LOGICAL_AND) .set_compute_data_type(DataType_t::BOOLEAN)); padding_mask_output->set_data_type(DataType_t::BOOLEAN); + auto negative_inf_padding = std::make_shared(std::numeric_limits::lowest()); last_output = pointwise(last_output, @@ -1115,6 +1013,7 @@ class SDPABackwardNode : public INode { .set_mode(PointwiseMode_t::CMP_GE) .set_compute_data_type(DataType_t::BOOLEAN)); causal_mask_output->set_data_type(DataType_t::BOOLEAN); + auto negative_inf_causal = std::make_shared(std::numeric_limits::lowest()); last_output = pointwise(last_output, @@ -1129,7 +1028,7 @@ class SDPABackwardNode : public INode { Pointwise_attributes().set_name("sub_s_m").set_mode(PointwiseMode_t::SUB)); // WAR for bug 4475073 by explicitly putting the padding value again after the stats have been loaded - if (attributes.padding_mask && cudnnGetVersion() >= 90000) { + if (attributes.padding_mask && cudnn_frontend::get_backend_version() >= 90000) { auto row_idx_output = pointwise(last_output, Pointwise_attributes() .set_name("gen_row_idx_2nd_padding") @@ -1169,6 +1068,7 @@ class SDPABackwardNode : public INode { .set_mode(PointwiseMode_t::LOGICAL_AND) .set_compute_data_type(DataType_t::BOOLEAN)); padding_mask_output->set_data_type(DataType_t::BOOLEAN); + auto negative_inf_padding = std::make_shared(std::numeric_limits::lowest()); last_output = pointwise( last_output, @@ -1341,7 +1241,7 @@ class SDPABackwardNode : public INode { // non-virtual softmax_sum is required for below cuDNN 8.9.5 // non-virtual softmax_sum is passed by the node - if (cudnnGetVersion() < 8905) { + if (cudnn_frontend::get_backend_version() < 8905) { softmax_sum->set_is_virtual(false); softmax_sum->set_dim({b, h_q, s_q, 1}); softmax_sum->set_data_type(DataType_t::FLOAT); @@ -1387,43 +1287,6 @@ class SDPABackwardNode : public INode { return {error_code_t::OK, ""}; } - error_t - pass_by_value_tensors_(std::unordered_map& tensor_to_pass_by_value) const override final { - using input_names = SDPA_backward_attributes::input_names; - - if (one_tensor) { - tensor_to_pass_by_value.emplace(one_tensor->get_uid(), 1.0f); - } - - if (attributes.attn_scale_value.has_value()) { - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(Attn_scale, input_names::Attn_scale); - tensor_to_pass_by_value.emplace(Attn_scale->second->get_uid(), attributes.attn_scale_value.value()); - } - - if (attributes.padding_mask) { - float negative_inf_value = std::numeric_limits::lowest(); - tensor_to_pass_by_value.emplace(negative_inf_padding->get_uid(), negative_inf_value); - } - - if (attributes.causal_mask) { - float negative_inf_value = std::numeric_limits::lowest(); - tensor_to_pass_by_value.emplace(negative_inf_causal->get_uid(), negative_inf_value); - } - - if (attributes.dropout_probability.has_value()) { - float dropout_scale_value = 1.0f / (1.0f - attributes.dropout_probability.value()); - float dropout_scale_inv_value = (1.0f - attributes.dropout_probability.value()); - - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(Dropout_scale, input_names::Dropout_scale); - tensor_to_pass_by_value.emplace(Dropout_scale->second->get_uid(), dropout_scale_value); - - CUDNN_FE_VALIDATE_AND_ASSIGN_INPUT_TENSOR(Dropout_scale_inv, 
input_names::Dropout_scale_inv); - tensor_to_pass_by_value.emplace(Dropout_scale_inv->second->get_uid(), dropout_scale_inv_value); - } - - return {error_code_t::OK, ""}; - } - virtual void serialize(json& j) const override final { j = attributes; diff --git a/include/cudnn_frontend/node/softmax.h b/include/cudnn_frontend/node/softmax.h index dbd4963..32857c5 100644 --- a/include/cudnn_frontend/node/softmax.h +++ b/include/cudnn_frontend/node/softmax.h @@ -11,12 +11,12 @@ namespace cudnn_frontend::graph { -class SoftmaxNode : public INode { +class SoftmaxNode : public NodeCRTP { public: Softmax_attributes attributes; SoftmaxNode(Softmax_attributes&& attributes_, detail::Context const& context) - : INode(context), attributes(std::move(attributes_)) {} + : NodeCRTP(context), attributes(std::move(attributes_)) {} Type getType() override final { @@ -41,7 +41,7 @@ class SoftmaxNode : public INode { } error_t - expand_and_infer_properties() override final { + expand_and_infer_properties_node() override final { getLogger() << "[cudnn_frontend] INFO: Inferrencing properties for Softmax node " << attributes.name << "." << std::endl; @@ -123,11 +123,6 @@ class SoftmaxNode : public INode { return {error_code_t::OK, ""}; } - error_t - collect_pre_assigned_uids(std::unordered_set& pre_assigned_uids) const override final { - return attributes.get_prefilled_uids(pre_assigned_uids); - } - virtual void serialize(json& j) const override final { j = attributes; diff --git a/include/cudnn_frontend/node_interface.h b/include/cudnn_frontend/node_interface.h index 5ec2dfb..68b8980 100644 --- a/include/cudnn_frontend/node_interface.h +++ b/include/cudnn_frontend/node_interface.h @@ -7,6 +7,7 @@ #include #include +#include #include "../cudnn_frontend_Tensor.h" #include "../cudnn_frontend_Operation.h" @@ -22,6 +23,8 @@ namespace cudnn_frontend { namespace graph { +class BatchNormNode; +class DBNNode; class MatmulNode; class PointwiseNode; class ReductionNode; @@ -33,13 +36,11 @@ class SoftmaxNode; class INode : public ICudnn { public: // A closed set of types that are allowed to be passed by value today - using pass_by_values_t = std::variant; + using pass_by_values_t = Tensor_attributes::pass_by_values_t; detail::Context context; private: - std::unordered_map deserialized_pass_by_value; - std::unordered_map>> deserialized_workspace_modifications; int64_t fe_workspace_size = 0; std::shared_ptr @@ -53,7 +54,7 @@ class INode : public ICudnn { pre_validate_node() const = 0; virtual error_t - expand_and_infer_properties() = 0; + expand_and_infer_properties_node() = 0; virtual error_t post_validate_node() const = 0; @@ -98,12 +99,17 @@ class INode : public ICudnn { } virtual error_t - pass_by_value_tensors_(std::unordered_map& pass_by_values) const { - for (auto [uid, value] : deserialized_pass_by_value) { - pass_by_values.emplace(uid, value); - } - return {error_code_t::OK, ""}; - } + pass_by_value_tensors_(std::unordered_map& pass_by_values) const = 0; + + virtual error_t + collect_pre_assigned_uids_(std::unordered_set& pre_assigned_uids) const = 0; + + virtual error_t + set_uids_(int64_t& potential_uid, std::unordered_set const& pre_assigned_uids) const = 0; + + virtual error_t + create_cudnn_tensors_( + std::unordered_map>& uid_to_backend_tensors) const = 0; error_t run_auxiliary_kernels( @@ -111,21 +117,21 @@ class INode : public ICudnn { void* fe_workspace, std::unordered_map>>& workspace_modifications) const { cudaStream_t stream; - CHECK_CUDNN_ERROR(cudnnGetStream(handle, &stream)); + 
CHECK_CUDNN_ERROR(cudnn_frontend::get_stream(handle, &stream)); char* workspace = static_cast(fe_workspace); for (auto [uid, data] : workspace_modifications) { (void)uid; if (std::get<0>(data) == 0) { auto& vec_data = std::get<2>(data); - CHECK_CUDA_ERROR(cudaMemcpyAsync(workspace + std::get<1>(data), - vec_data.data(), - vec_data.size() * sizeof(float), - cudaMemcpyHostToDevice, - stream)); + CHECK_CUDA_ERROR(cuda_mem_cpy_async(workspace + std::get<1>(data), + vec_data.data(), + vec_data.size() * sizeof(float), + cudaMemcpyHostToDevice, + stream)); } else if (std::get<0>(data) == 1) { int64_t memset_size = (int64_t)std::get<2>(data)[0]; - CHECK_CUDA_ERROR(cudaMemsetAsync(workspace + std::get<1>(data), 0, memset_size, stream)); + CHECK_CUDA_ERROR(cuda_mem_set_async(workspace + std::get<1>(data), 0, memset_size, stream)); } } return {error_code_t::OK, ""}; @@ -182,12 +188,12 @@ class INode : public ICudnn { for (auto& [uid, value] : tensor_to_pass_by_value) { if (half* half_value_ptr = std::get_if(&value)) { tensor_to_pointer_map.emplace(uid, half_value_ptr); + } else if (nv_bfloat16* nv_bfloat16_value_ptr = std::get_if(&value)) { + tensor_to_pointer_map.emplace(uid, nv_bfloat16_value_ptr); } else if (int32_t* int32_t_value_ptr = std::get_if(&value)) { tensor_to_pointer_map.emplace(uid, int32_t_value_ptr); } else if (float* float_value_ptr = std::get_if(&value)) { tensor_to_pointer_map.emplace(uid, float_value_ptr); - } else if (void** void_value_ptr = std::get_if(&value)) { - tensor_to_pointer_map.emplace(uid, *void_value_ptr); } else { RETURN_CUDNN_FRONTEND_ERROR_IF( true, error_code_t::INVALID_VARIANT_PACK, "Unexpected type for pass by value tensor."); @@ -197,6 +203,9 @@ class INode : public ICudnn { } protected: + std::unordered_map deserialized_pass_by_value; + std::unordered_map>> deserialized_workspace_modifications; + // Type of each node. Nodes can either be a composite (value COMPOSITE) or // one of the other primitive types. Primitives types are nothing but // cudnn operations. 
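The pass-by-value plumbing above narrows `pass_by_values_t` to the alias `Tensor_attributes::pass_by_values_t` and unpacks each variant into a raw host pointer through an explicit `std::get_if` chain over `half`, `nv_bfloat16`, `int32_t`, and `float`. Below is a minimal sketch of the same dispatch, assuming only that alias and those four alternatives; the helper `to_pointer_map` and its signature are illustrative, not part of the patch:

```cpp
#include <cstdint>
#include <unordered_map>
#include <variant>

#include <cuda_bf16.h>  // nv_bfloat16
#include <cuda_fp16.h>  // half

// The closed set of scalar types that may be passed by value, per the patch.
using pass_by_values_t = std::variant<int32_t, half, nv_bfloat16, float>;

// Illustrative helper (not part of the patch): expand uid -> scalar entries
// into uid -> host-pointer entries for the backend variant pack. The variant
// map must outlive the returned map, since the pointers alias its storage.
inline std::unordered_map<int64_t, void*>
to_pointer_map(std::unordered_map<int64_t, pass_by_values_t>& scalars) {
    std::unordered_map<int64_t, void*> pointers;
    for (auto& [uid, value] : scalars) {
        // std::visit hands us a typed reference to whichever alternative is
        // active; its address is what the variant pack ultimately consumes.
        std::visit([&pointers, uid = uid](auto& v) { pointers.emplace(uid, &v); }, value);
    }
    return pointers;
}
```

The explicit `get_if` chain in the patch does the same work one alternative at a time, which also gives it a natural place to report `INVALID_VARIANT_PACK` when an unexpected type shows up.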
@@ -302,13 +311,35 @@ class INode : public ICudnn {
         sub_nodes.emplace_back(std::make_unique(std::move(attributes), context));
     }
 
+    error_t
+    pre_validate_and_expand_node() {
+        // pre validate to catch errors early
+        // Otherwise code readability decreases in expand_and_infer
+        CHECK_CUDNN_FRONTEND_ERROR(pre_validate_node());
+        CHECK_CUDNN_FRONTEND_ERROR(expand_and_infer_properties_node());
+        for (auto const& sub_node : sub_nodes) {
+            CHECK_CUDNN_FRONTEND_ERROR(sub_node->pre_validate_and_expand_node());
+        }
+        return {error_code_t::OK, ""};
+    }
+
+    error_t
+    post_validate() const {
+        // Validate self
+        CHECK_CUDNN_FRONTEND_ERROR(post_validate_node());
+        for (auto const& sub_node : sub_nodes) {
+            CHECK_CUDNN_FRONTEND_ERROR(sub_node->post_validate());
+        }
+        return {error_code_t::OK, ""};
+    }
+
     // Creates cudnn tensors for each node (and its sub nodes)
-    virtual error_t
-    create_cudnn_tensors(int64_t& uid,
-                         std::unordered_map>& uid_to_backend_tensors,
-                         std::unordered_set const& invalid_uids) const {
+    error_t
+    create_cudnn_tensors(
+        std::unordered_map>& uid_to_backend_tensors) const {
+        CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensors_(uid_to_backend_tensors));
         for (auto const& sub_node : sub_nodes) {
-            CHECK_CUDNN_FRONTEND_ERROR(sub_node->create_cudnn_tensors(uid, uid_to_backend_tensors, invalid_uids));
+            CHECK_CUDNN_FRONTEND_ERROR(sub_node->create_cudnn_tensors(uid_to_backend_tensors));
         }
         return {error_code_t::OK, ""};
     }
@@ -327,10 +358,22 @@ class INode : public ICudnn {
         return {error_code_t::OK, ""};
     }
 
-    virtual error_t
+    error_t
     collect_pre_assigned_uids(std::unordered_set& pre_assigned_uids) const {
+        // Collect uids from the current node
+        CHECK_CUDNN_FRONTEND_ERROR(collect_pre_assigned_uids_(pre_assigned_uids));
+        for (auto const& sub_node : sub_nodes) {
+            CHECK_CUDNN_FRONTEND_ERROR(sub_node->collect_pre_assigned_uids(pre_assigned_uids));
+        }
+        return {error_code_t::OK, ""};
+    }
+
+    error_t
+    set_uids(int64_t& potential_uid, std::unordered_set const& pre_assigned_uids) const {
+        // Assign uids to the current node
+        CHECK_CUDNN_FRONTEND_ERROR(set_uids_(potential_uid, pre_assigned_uids));
         for (auto const& sub_node : sub_nodes) {
-            auto x = sub_node->collect_pre_assigned_uids(pre_assigned_uids);
+            CHECK_CUDNN_FRONTEND_ERROR(sub_node->set_uids(potential_uid, pre_assigned_uids));
         }
         return {error_code_t::OK, ""};
     }
@@ -364,38 +407,28 @@ class INode : public ICudnn {
         Rng_attributes);
 
     error_t
     validate() {
-        // validate self
-        CHECK_CUDNN_FRONTEND_ERROR(pre_validate_node());
-
-        // infer_properties self
-        CHECK_CUDNN_FRONTEND_ERROR(expand_and_infer_properties());
+        CHECK_CUDNN_FRONTEND_ERROR(pre_validate_and_expand_node());
 
-        // validate sub nodes
-        for (auto const& sub_node : sub_nodes) {
-            CHECK_CUDNN_FRONTEND_ERROR(sub_node->validate());
-        }
+        // assign uids as part of validation
+        // This helps catch whether the user has assigned duplicate uids to tensors
+        // Each time a backend tensor is created, the uid is incremented by 1, ensuring uniqueness.
+        std::unordered_set pre_assigned_uids;
+        CHECK_CUDNN_FRONTEND_ERROR(collect_pre_assigned_uids(pre_assigned_uids));
 
-        // validate self
-        CHECK_CUDNN_FRONTEND_ERROR(post_validate_node());
+        Tensor_attributes::uid_t start_uid = 1;
+        CHECK_CUDNN_FRONTEND_ERROR(set_uids(start_uid, pre_assigned_uids));
+
+        // validate the full tree again
+        CHECK_CUDNN_FRONTEND_ERROR(post_validate());
 
         return {error_code_t::OK, ""};
     }
 
     error_t
     build_operation_graph(cudnnHandle_t handle) {
-        // Starting uid for operation graph.
- // Each time a backend tensor is created, uid will be incremented by 1, ensuring uniqueness. - // TODO: Maybe just use uid_to_tensors size as uid each time? - int64_t uid = 1; - - std::unordered_set pre_assigned_uids; - CHECK_CUDNN_FRONTEND_ERROR(collect_pre_assigned_uids(pre_assigned_uids)); - while (pre_assigned_uids.find(uid) != pre_assigned_uids.end()) { - uid++; - } - // Lower each sub node to cudnn backend. - CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensors(uid, uid_to_tensors, pre_assigned_uids)); + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensors(uid_to_tensors)); // INode needs to keep track of all uids that an operation graph uses. // This is because cudnn backend will not accept extra tensors in variant pack. @@ -591,30 +624,7 @@ class INode : public ICudnn { index++; } - std::unordered_map integer_pass_by_values; - std::unordered_map half_pass_by_values; - std::unordered_map float_pass_by_values; - - auto pass_by_value_tensors = j["pass_by_values"]; - for (auto i = 0u; i < pass_by_value_tensors.size(); i++) { - if (i == 0) { - integer_pass_by_values = pass_by_value_tensors[i].get>(); - } else if (i == 1) { - half_pass_by_values = pass_by_value_tensors[i].get>(); - } else if (i == 2) { - float_pass_by_values = pass_by_value_tensors[i].get>(); - } - } - - for (auto const& [uid, value] : integer_pass_by_values) { - deserialized_pass_by_value.emplace(uid, value); - } - for (auto const& [uid, value] : half_pass_by_values) { - deserialized_pass_by_value.emplace(uid, __float2half(value)); - } - for (auto const& [uid, value] : float_pass_by_values) { - deserialized_pass_by_value.emplace(uid, value); - } + deserialized_pass_by_value = j["pass_by_values"]; deserialized_workspace_modifications = j["workspace_modifications"]; @@ -642,30 +652,11 @@ class INode : public ICudnn { std::unordered_map tensor_to_pass_by_value; CHECK_CUDNN_FRONTEND_ERROR(gather_pass_by_value_tensors_(tensor_to_pass_by_value)); - - j["pass_by_values"]; - std::unordered_map integer_pass_by_values; - std::unordered_map half_pass_by_values; - std::unordered_map float_pass_by_values; - // std::unordered_map void_ptr_pass_by_values; - for (auto const& [uid, pass_by_value] : tensor_to_pass_by_value) { - if (pass_by_value.index() == 0) { - integer_pass_by_values.emplace(uid, std::get<0>(pass_by_value)); - } else if (pass_by_value.index() == 1) { - half_pass_by_values.emplace(uid, __half2float(std::get<1>(pass_by_value))); - } else if (pass_by_value.index() == 2) { - float_pass_by_values.emplace(uid, std::get<2>(pass_by_value)); - } - } - // json j = half_pass_by_values; - j["pass_by_values"].push_back(integer_pass_by_values); - j["pass_by_values"].push_back(half_pass_by_values); - j["pass_by_values"].push_back(float_pass_by_values); + j["pass_by_values"] = tensor_to_pass_by_value; std::unordered_map>> workspace_modifications; int64_t workspace_offset = 0; CHECK_CUDNN_FRONTEND_ERROR(gather_workspace_modifications(workspace_modifications, workspace_offset)); - j["workspace_modifications"] = workspace_modifications; j["fe_workspace_size"] = get_fe_workspace_size(); @@ -695,6 +686,74 @@ to_json(json& j, const INode& p) { p.serialize(j); } +template +class NodeCRTP : public INode { + DerivedT& + self() { + return *static_cast(this); + } + DerivedT const& + self() const { + return *static_cast(this); + } + + error_t + pass_by_value_tensors_( + std::unordered_map& tensor_to_pass_by_value) const override final { + CHECK_CUDNN_FRONTEND_ERROR(self().attributes.fill_pass_by_value(tensor_to_pass_by_value)); + + return 
{error_code_t::OK, ""}; + } + + error_t + collect_pre_assigned_uids_(std::unordered_set& pre_assigned_uids) const override final { + CHECK_CUDNN_FRONTEND_ERROR(self().attributes.get_prefilled_uids(pre_assigned_uids)); + + return {error_code_t::OK, ""}; + } + + error_t + set_uids_(int64_t& potential_uid, std::unordered_set const& pre_assigned_uids) const override final { + CHECK_CUDNN_FRONTEND_ERROR(self().attributes.set_uids(potential_uid, pre_assigned_uids)); + + return {error_code_t::OK, ""}; + } + + error_t + create_cudnn_tensors_( + std::unordered_map>& tensors) const override final { + getLogger() << "[cudnn_frontend] INFO: Creating cudnn tensors for node named '" << self().attributes.name + << "':" << std::endl; + for (auto const& [name, tensor] : self().attributes.inputs) { + (void)name; + if (tensor) { + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, tensors)); + } + } + for (auto const& [name, tensor] : self().attributes.outputs) { + (void)name; + if (tensor) { + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, tensors)); + } + } + + // Handle special case of BN where peer_stats is also an input + if constexpr (std::is_same_v || std::is_same_v) { + // Special case in BN where peer stats is also an input but is not present in inputs map + for (auto const& tensor : self().attributes.peer_stats) { + if (tensor) { + CHECK_CUDNN_FRONTEND_ERROR(create_cudnn_tensor(tensor, tensors)); + } + } + } + + return {error_code_t::OK, ""}; + } + + protected: + using INode::INode; +}; + #define CUDNN_FE_VALIDATE_TENSOR_(port, map_) \ { \ auto t = map_.find(port); \ diff --git a/include/cudnn_frontend/plans.h b/include/cudnn_frontend/plans.h index e9d66b9..cf8da82 100644 --- a/include/cudnn_frontend/plans.h +++ b/include/cudnn_frontend/plans.h @@ -53,7 +53,7 @@ execute(cudnnHandle_t handle, } #endif - auto status = cudnnBackendExecute(handle, plan->get_raw_desc(), raw_variant_pack); + auto status = cudnn_frontend::execute(handle, plan->get_raw_desc(), raw_variant_pack); if (status != CUDNN_STATUS_SUCCESS) { std::string message = "[cudnn_frontend] ERROR: Graph execution failed."; return {error_code_t::GRAPH_EXECUTION_FAILED, message}; @@ -121,12 +121,12 @@ query_heuristics(std::vector> const& operatio int64_t elem_count = 0; ManagedOpaqueDescriptor extractedEngine = make_shared_backend_pointer(CUDNN_BACKEND_ENGINE_DESCRIPTOR); cudnnBackendDescriptor_t extractedEngine_ = extractedEngine->get_backend_descriptor(); - auto status = cudnnBackendGetAttribute(engine_config->get_backend_descriptor(), - CUDNN_ATTR_ENGINECFG_ENGINE, - CUDNN_TYPE_BACKEND_DESCRIPTOR, - 1, - &elem_count, - &extractedEngine_); + auto status = cudnn_frontend::get_attribute(engine_config->get_backend_descriptor(), + CUDNN_ATTR_ENGINECFG_ENGINE, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &elem_count, + &extractedEngine_); if (status == CUDNN_STATUS_SUCCESS) { good_configs.push_back(engine_config); } @@ -264,53 +264,53 @@ class Execution_plan_list { ManagedOpaqueDescriptor extractedEngine = make_shared_backend_pointer(CUDNN_BACKEND_ENGINE_DESCRIPTOR); cudnnBackendDescriptor_t extractedEngine_ = extractedEngine->get_backend_descriptor(); - auto status = cudnnBackendGetAttribute(engine_config->get_backend_descriptor(), - CUDNN_ATTR_ENGINECFG_ENGINE, - CUDNN_TYPE_BACKEND_DESCRIPTOR, - 1, - &elem_count, - &extractedEngine_); + auto status = cudnn_frontend::get_attribute(engine_config->get_backend_descriptor(), + CUDNN_ATTR_ENGINECFG_ENGINE, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &elem_count, + &extractedEngine_); 
            RETURN_CUDNN_FRONTEND_ERROR_IF((status != CUDNN_STATUS_SUCCESS),
                                           error_code_t::HEURISTIC_QUERY_FAILED,
                                           "Heuristic query Engine failed.");
 
-            status = cudnnBackendGetAttribute(extractedEngine_,
-                                              CUDNN_ATTR_ENGINE_NUMERICAL_NOTE,
-                                              CUDNN_TYPE_NUMERICAL_NOTE,
-                                              CUDNN_NUMERICAL_NOTE_TYPE_COUNT,
-                                              &elem_count,
-                                              nullptr);
+            status = cudnn_frontend::get_attribute(extractedEngine_,
+                                                   CUDNN_ATTR_ENGINE_NUMERICAL_NOTE,
+                                                   CUDNN_TYPE_NUMERICAL_NOTE,
+                                                   CUDNN_NUMERICAL_NOTE_TYPE_COUNT,
+                                                   &elem_count,
+                                                   nullptr);
             RETURN_CUDNN_FRONTEND_ERROR_IF((status != CUDNN_STATUS_SUCCESS),
                                            error_code_t::HEURISTIC_QUERY_FAILED,
                                            "Heuristic query Numerical Note failed");
             numerics.resize(static_cast(elem_count));
-            status = cudnnBackendGetAttribute(extractedEngine_,
-                                              CUDNN_ATTR_ENGINE_NUMERICAL_NOTE,
-                                              CUDNN_TYPE_NUMERICAL_NOTE,
-                                              CUDNN_NUMERICAL_NOTE_TYPE_COUNT,
-                                              &elem_count,
-                                              numerics.data());
+            status = cudnn_frontend::get_attribute(extractedEngine_,
+                                                   CUDNN_ATTR_ENGINE_NUMERICAL_NOTE,
+                                                   CUDNN_TYPE_NUMERICAL_NOTE,
+                                                   CUDNN_NUMERICAL_NOTE_TYPE_COUNT,
+                                                   &elem_count,
+                                                   numerics.data());
             RETURN_CUDNN_FRONTEND_ERROR_IF((status != CUDNN_STATUS_SUCCESS),
                                            error_code_t::HEURISTIC_QUERY_FAILED,
                                            "Heuristic query Numerical Note failed");
 
-            status = cudnnBackendGetAttribute(extractedEngine_,
-                                              CUDNN_ATTR_ENGINE_BEHAVIOR_NOTE,
-                                              CUDNN_TYPE_BEHAVIOR_NOTE,
-                                              CUDNN_BEHAVIOR_NOTE_TYPE_COUNT,
-                                              &elem_count,
-                                              nullptr);
+            status = cudnn_frontend::get_attribute(extractedEngine_,
+                                                   CUDNN_ATTR_ENGINE_BEHAVIOR_NOTE,
+                                                   CUDNN_TYPE_BEHAVIOR_NOTE,
+                                                   CUDNN_BEHAVIOR_NOTE_TYPE_COUNT,
+                                                   &elem_count,
+                                                   nullptr);
             RETURN_CUDNN_FRONTEND_ERROR_IF((status != CUDNN_STATUS_SUCCESS),
                                            error_code_t::HEURISTIC_QUERY_FAILED,
                                            "Heuristic query Behavior Note failed");
             behavior.resize(static_cast(elem_count));
-            status = cudnnBackendGetAttribute(extractedEngine_,
-                                              CUDNN_ATTR_ENGINE_BEHAVIOR_NOTE,
-                                              CUDNN_TYPE_BEHAVIOR_NOTE,
-                                              CUDNN_BEHAVIOR_NOTE_TYPE_COUNT,
-                                              &elem_count,
-                                              behavior.data());
+            status = cudnn_frontend::get_attribute(extractedEngine_,
+                                                   CUDNN_ATTR_ENGINE_BEHAVIOR_NOTE,
+                                                   CUDNN_TYPE_BEHAVIOR_NOTE,
+                                                   CUDNN_BEHAVIOR_NOTE_TYPE_COUNT,
+                                                   &elem_count,
+                                                   behavior.data());
             RETURN_CUDNN_FRONTEND_ERROR_IF((status != CUDNN_STATUS_SUCCESS),
                                            error_code_t::HEURISTIC_QUERY_FAILED,
                                            "Heuristic query Behavior Note failed");
@@ -396,11 +396,14 @@
             // Filter out execution plans with workspace greater than what's available from the user
             if (execution_plans[i]->getWorkspaceSize() > max_workspace_allowed) {
                 filtered_indices[i] = true;
+                execution_plans[i] = nullptr;
                 getLogger() << "[cudnn_frontend] INFO: Deselecting execution plan at position " << i << std::endl;
                 continue;
             }
 
             candidate = static_cast(i);
+            getLogger() << "[cudnn_frontend] INFO: Candidate set as " << i << std::endl;
+            return {error_code_t::OK, ""};
         }
     }
@@ -429,6 +432,7 @@
                                        "Chosen plan index has been deselected.");
 
         if (execution_plans[index] != nullptr && execution_plans[index]->getWorkspaceSize() <= max_workspace_allowed) {
+            candidate = index;
             return {error_code_t::OK, ""};
         };
@@ -477,12 +481,16 @@
             if (fe_status.is_good()) {
                 if (execution_plans[i]->getWorkspaceSize() > max_workspace_allowed) {
+                    getLogger() << "[cudnn_frontend] INFO: skipping plan due to workspace violation. Requires "
+                                << execution_plans[i]->getWorkspaceSize() << std::endl;
                     filtered_indices[i] = true;
+                    execution_plans[i] = nullptr;
                     continue;
                 }
                 // Only set the candidate the first time, as the order of iteration is from highest to lowest priority
                 if (candidate == -1) {
                     candidate = static_cast(i);
+                    getLogger() << "[cudnn_frontend] INFO: Candidate set as " << i << std::endl;
                 }
 
                 // Return from this function as the first successfully built plan is found.
@@ -530,12 +538,12 @@ class Execution_plan_list {
         const float threshold = 0.95f;
         uint64_t successful_plan_count = 0;
 
         cudaEvent_t start, stop;
-        cudaEventCreate(&start);
-        cudaEventCreate(&stop);
-        cudaDeviceSynchronize();
+        cuda_event_create(&start);
+        cuda_event_create(&stop);
+        cuda_device_synchronize();
 
         cudaStream_t stream = nullptr;
-        cudnnGetStream(handle, &stream);
+        cudnn_frontend::get_stream(handle, &stream);
 
         for (auto plan : execution_plans) {
             float time_ms = 0.0f;
@@ -545,16 +553,16 @@
             // Warm-up run
             CHECK_CUDNN_FRONTEND_ERROR(detail::execute(handle, plan.get(), ptrs, uids, workspace_ptr));
             successful_plan_count++;
-            cudaDeviceSynchronize();
+            cuda_device_synchronize();
 
             for (int i = 0; i < maxIterCount; i++) {
-                cudaEventRecord(start, stream);
+                cuda_event_record(start, stream);
 
                 auto status = detail::execute(handle, plan.get(), ptrs, uids, workspace_ptr);
 
-                cudaEventRecord(stop, stream);
-                cudaEventSynchronize(stop);
-                cudaEventElapsedTime(&time_ms, start, stop);
+                cuda_event_record(stop, stream);
+                cuda_event_synchronize(stop);
+                cuda_event_elapsed_time(&time_ms, start, stop);
 
                 final_time_ms = std::min(min_time_ms, time_ms);
                 if (time_ms / min_time_ms < threshold) {
@@ -575,8 +583,8 @@
             execution_plans.push_back(sorted_plan);
         }
 
-        cudaEventDestroy(start);
-        cudaEventDestroy(stop);
+        cuda_event_destroy(start);
+        cuda_event_destroy(stop);
 
         getLogger() << "Autotuned " << successful_plan_count << " plans."
<< std::endl; return {error_code_t::OK, ""}; diff --git a/include/thirdparty/nlohmann/LICENSE.MIT b/include/cudnn_frontend/thirdparty/nlohmann/LICENSE.MIT similarity index 100% rename from include/thirdparty/nlohmann/LICENSE.MIT rename to include/cudnn_frontend/thirdparty/nlohmann/LICENSE.MIT diff --git a/include/thirdparty/nlohmann/json.hpp b/include/cudnn_frontend/thirdparty/nlohmann/json.hpp similarity index 100% rename from include/thirdparty/nlohmann/json.hpp rename to include/cudnn_frontend/thirdparty/nlohmann/json.hpp diff --git a/include/cudnn_frontend_ConvDesc.h b/include/cudnn_frontend_ConvDesc.h index 4887dd3..75f1534 100644 --- a/include/cudnn_frontend_ConvDesc.h +++ b/include/cudnn_frontend_ConvDesc.h @@ -274,11 +274,11 @@ class ConvDescBuilder_v8 { "CUDNN_BACKEND_CONVOLUTION_DESCRIPTOR: SetAttribute CUDNN_ATTR_CONVOLUTION_COMP_TYPE Failed"); return std::move(m_convDesc); } - status = cudnnBackendSetAttribute(m_convDesc.pointer->get_backend_descriptor(), - CUDNN_ATTR_CONVOLUTION_COMP_TYPE, - CUDNN_TYPE_DATA_TYPE, - 1, - &cudnn_data_type); + status = cudnn_frontend::set_attribute(m_convDesc.pointer->get_backend_descriptor(), + CUDNN_ATTR_CONVOLUTION_COMP_TYPE, + CUDNN_TYPE_DATA_TYPE, + 1, + &cudnn_data_type); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_convDesc, @@ -287,11 +287,11 @@ class ConvDescBuilder_v8 { return std::move(m_convDesc); } - status = cudnnBackendSetAttribute(m_convDesc.pointer->get_backend_descriptor(), - CUDNN_ATTR_CONVOLUTION_CONV_MODE, - CUDNN_TYPE_CONVOLUTION_MODE, - 1, - &m_convDesc.mode); + status = cudnn_frontend::set_attribute(m_convDesc.pointer->get_backend_descriptor(), + CUDNN_ATTR_CONVOLUTION_CONV_MODE, + CUDNN_TYPE_CONVOLUTION_MODE, + 1, + &m_convDesc.mode); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_convDesc, @@ -300,11 +300,11 @@ class ConvDescBuilder_v8 { return std::move(m_convDesc); } - status = cudnnBackendSetAttribute(m_convDesc.pointer->get_backend_descriptor(), - CUDNN_ATTR_CONVOLUTION_SPATIAL_DIMS, - CUDNN_TYPE_INT64, - 1, - &m_convDesc.nDims); + status = cudnn_frontend::set_attribute(m_convDesc.pointer->get_backend_descriptor(), + CUDNN_ATTR_CONVOLUTION_SPATIAL_DIMS, + CUDNN_TYPE_INT64, + 1, + &m_convDesc.nDims); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_convDesc, @@ -313,11 +313,11 @@ class ConvDescBuilder_v8 { return std::move(m_convDesc); } - status = cudnnBackendSetAttribute(m_convDesc.pointer->get_backend_descriptor(), - CUDNN_ATTR_CONVOLUTION_PRE_PADDINGS, - CUDNN_TYPE_INT64, - m_convDesc.nDims, - m_convDesc.padLower); + status = cudnn_frontend::set_attribute(m_convDesc.pointer->get_backend_descriptor(), + CUDNN_ATTR_CONVOLUTION_PRE_PADDINGS, + CUDNN_TYPE_INT64, + m_convDesc.nDims, + m_convDesc.padLower); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_convDesc, @@ -326,11 +326,11 @@ class ConvDescBuilder_v8 { return std::move(m_convDesc); } - status = cudnnBackendSetAttribute(m_convDesc.pointer->get_backend_descriptor(), - CUDNN_ATTR_CONVOLUTION_POST_PADDINGS, - CUDNN_TYPE_INT64, - m_convDesc.nDims, - m_convDesc.padUpper); + status = cudnn_frontend::set_attribute(m_convDesc.pointer->get_backend_descriptor(), + CUDNN_ATTR_CONVOLUTION_POST_PADDINGS, + CUDNN_TYPE_INT64, + m_convDesc.nDims, + m_convDesc.padUpper); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_convDesc, @@ -339,11 +339,11 @@ class ConvDescBuilder_v8 { return std::move(m_convDesc); } - status = 
cudnnBackendSetAttribute(m_convDesc.pointer->get_backend_descriptor(), - CUDNN_ATTR_CONVOLUTION_DILATIONS, - CUDNN_TYPE_INT64, - m_convDesc.nDims, - m_convDesc.dilation); + status = cudnn_frontend::set_attribute(m_convDesc.pointer->get_backend_descriptor(), + CUDNN_ATTR_CONVOLUTION_DILATIONS, + CUDNN_TYPE_INT64, + m_convDesc.nDims, + m_convDesc.dilation); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_convDesc, @@ -352,11 +352,11 @@ class ConvDescBuilder_v8 { return std::move(m_convDesc); } - status = cudnnBackendSetAttribute(m_convDesc.pointer->get_backend_descriptor(), - CUDNN_ATTR_CONVOLUTION_FILTER_STRIDES, - CUDNN_TYPE_INT64, - m_convDesc.nDims, - m_convDesc.stride); + status = cudnn_frontend::set_attribute(m_convDesc.pointer->get_backend_descriptor(), + CUDNN_ATTR_CONVOLUTION_FILTER_STRIDES, + CUDNN_TYPE_INT64, + m_convDesc.nDims, + m_convDesc.stride); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_convDesc, @@ -366,7 +366,7 @@ class ConvDescBuilder_v8 { } // Finalizing the descriptor - status = cudnnBackendFinalize(m_convDesc.pointer->get_backend_descriptor()); + status = cudnn_frontend::finalize(m_convDesc.pointer->get_backend_descriptor()); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_convDesc, status, "CUDNN_BACKEND_CONVOLUTION_DESCRIPTOR: cudnnFinalize Failed"); diff --git a/include/cudnn_frontend_Engine.h b/include/cudnn_frontend_Engine.h index 49c7da1..2501115 100644 --- a/include/cudnn_frontend_Engine.h +++ b/include/cudnn_frontend_Engine.h @@ -122,15 +122,15 @@ class Engine_v8 : public BackendDescriptor { auto bKnob = bKnobs[i]->get_backend_descriptor(); cudnnBackendKnobType_t type; int64_t maxValue, minValue, stride, elemCount; - status = - cudnnBackendGetAttribute(bKnob, CUDNN_ATTR_KNOB_INFO_TYPE, CUDNN_TYPE_KNOB_TYPE, 1, &elemCount, &type); + status = cudnn_frontend::get_attribute( + bKnob, CUDNN_ATTR_KNOB_INFO_TYPE, CUDNN_TYPE_KNOB_TYPE, 1, &elemCount, &type); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception(this, status, "CUDNN_BACKEND_ENGINE_DESCRIPTOR: CUDNN_BACKEND_KNOB_INFO_DESCRIPTOR " "GetAttribute CUDNN_ATTR_KNOB_INFO_TYPE failed"); } - status = cudnnBackendGetAttribute( + status = cudnn_frontend::get_attribute( bKnob, CUDNN_ATTR_KNOB_INFO_MAXIMUM_VALUE, CUDNN_TYPE_INT64, 1, &elemCount, &maxValue); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception(this, @@ -138,7 +138,7 @@ class Engine_v8 : public BackendDescriptor { "CUDNN_BACKEND_ENGINE_DESCRIPTOR: CUDNN_BACKEND_KNOB_INFO_DESCRIPTOR " "GetAttribute CUDNN_ATTR_KNOB_INFO_MAXIMUM_VALUE Failed"); } - status = cudnnBackendGetAttribute( + status = cudnn_frontend::get_attribute( bKnob, CUDNN_ATTR_KNOB_INFO_MINIMUM_VALUE, CUDNN_TYPE_INT64, 1, &elemCount, &minValue); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception(this, @@ -146,8 +146,8 @@ class Engine_v8 : public BackendDescriptor { "CUDNN_BACKEND_ENGINE_DESCRIPTOR: CUDNN_BACKEND_KNOB_INFO_DESCRIPTOR " "GetAttribute CUDNN_ATTR_KNOB_INFO_MINIMUM_VALUE Failed"); } - status = - cudnnBackendGetAttribute(bKnob, CUDNN_ATTR_KNOB_INFO_STRIDE, CUDNN_TYPE_INT64, 1, &elemCount, &stride); + status = cudnn_frontend::get_attribute( + bKnob, CUDNN_ATTR_KNOB_INFO_STRIDE, CUDNN_TYPE_INT64, 1, &elemCount, &stride); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception(this, status, @@ -262,11 +262,11 @@ class EngineBuilder_v8 { return std::move(m_engine); } - status = cudnnBackendSetAttribute(m_engine.pointer->get_backend_descriptor(), - 
CUDNN_ATTR_ENGINE_OPERATION_GRAPH, - CUDNN_TYPE_BACKEND_DESCRIPTOR, - 1, - &(m_engine.opGraph->get_backend_descriptor())); + status = cudnn_frontend::set_attribute(m_engine.pointer->get_backend_descriptor(), + CUDNN_ATTR_ENGINE_OPERATION_GRAPH, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &(m_engine.opGraph->get_backend_descriptor())); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_engine, @@ -275,11 +275,11 @@ class EngineBuilder_v8 { return std::move(m_engine); } - status = cudnnBackendSetAttribute(m_engine.pointer->get_backend_descriptor(), - CUDNN_ATTR_ENGINE_GLOBAL_INDEX, - CUDNN_TYPE_INT64, - 1, - &m_engine.idx); + status = cudnn_frontend::set_attribute(m_engine.pointer->get_backend_descriptor(), + CUDNN_ATTR_ENGINE_GLOBAL_INDEX, + CUDNN_TYPE_INT64, + 1, + &m_engine.idx); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_engine, @@ -289,7 +289,7 @@ class EngineBuilder_v8 { } // Finalizing the descriptor - status = cudnnBackendFinalize(m_engine.pointer->get_backend_descriptor()); + status = cudnn_frontend::finalize(m_engine.pointer->get_backend_descriptor()); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception(&m_engine, status, "CUDNN_BACKEND_ENGINE_DESCRIPTOR: cudnnFinalize Failed"); return std::move(m_engine); @@ -311,12 +311,12 @@ class EngineBuilder_v8 { for (std::uint32_t i = 0; i < m_engine.bKnobs.size(); i++) { bKnobs_[i] = m_engine.bKnobs[i]->get_backend_descriptor(); } - status = cudnnBackendGetAttribute(m_engine.pointer->get_backend_descriptor(), - CUDNN_ATTR_ENGINE_KNOB_INFO, - CUDNN_TYPE_BACKEND_DESCRIPTOR, - CUDNN_KNOB_TYPE_COUNTS, - &m_engine.numKnobs, - bKnobs_.data()); + status = cudnn_frontend::get_attribute(m_engine.pointer->get_backend_descriptor(), + CUDNN_ATTR_ENGINE_KNOB_INFO, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + CUDNN_KNOB_TYPE_COUNTS, + &m_engine.numKnobs, + bKnobs_.data()); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_engine, diff --git a/include/cudnn_frontend_EngineConfig.h b/include/cudnn_frontend_EngineConfig.h index 42efb8a..b8c3616 100644 --- a/include/cudnn_frontend_EngineConfig.h +++ b/include/cudnn_frontend_EngineConfig.h @@ -118,11 +118,11 @@ class EngineConfigBuilder_v8 { cudnnStatus_t status; cudnnBackendKnobType_t type = knobs[i].getKnobType(); int64_t value = knobs[i].getChoice(); - status = cudnnBackendSetAttribute(m_engine_config.bChoices[i]->get_backend_descriptor(), - CUDNN_ATTR_KNOB_CHOICE_KNOB_TYPE, - CUDNN_TYPE_KNOB_TYPE, - 1, - &type); + status = cudnn_frontend::set_attribute(m_engine_config.bChoices[i]->get_backend_descriptor(), + CUDNN_ATTR_KNOB_CHOICE_KNOB_TYPE, + CUDNN_TYPE_KNOB_TYPE, + 1, + &type); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception(&m_engine_config, status, @@ -130,11 +130,11 @@ class EngineConfigBuilder_v8 { "CUDNN_BACKEND_KNOB_CHOICE_DESCRIPTOR SetAttribute " "CUDNN_ATTR_KNOB_CHOICE_KNOB_TYPE Failed"); } - status = cudnnBackendSetAttribute(m_engine_config.bChoices[i]->get_backend_descriptor(), - CUDNN_ATTR_KNOB_CHOICE_KNOB_VALUE, - CUDNN_TYPE_INT64, - 1, - &value); + status = cudnn_frontend::set_attribute(m_engine_config.bChoices[i]->get_backend_descriptor(), + CUDNN_ATTR_KNOB_CHOICE_KNOB_VALUE, + CUDNN_TYPE_INT64, + 1, + &value); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception(&m_engine_config, status, @@ -142,7 +142,7 @@ class EngineConfigBuilder_v8 { "CUDNN_BACKEND_KNOB_CHOICE_DESCRIPTOR SetAttribute " "CUDNN_ATTR_KNOB_CHOICE_KNOB_VALUE Failed"); } - status = 
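//
// Usage sketch for the knob-choice hunk above: a knob-choice descriptor
// carries one (knob type, value) pair, each set with a single call through
// the shim, then finalized. The helper name is illustrative:
//
static cudnnStatus_t
set_knob_choice(cudnnBackendDescriptor_t choice, cudnnBackendKnobType_t knob, int64_t value) {
    cudnnStatus_t status =
        cudnn_frontend::set_attribute(choice, CUDNN_ATTR_KNOB_CHOICE_KNOB_TYPE, CUDNN_TYPE_KNOB_TYPE, 1, &knob);
    if (status != CUDNN_STATUS_SUCCESS) return status;
    status = cudnn_frontend::set_attribute(choice, CUDNN_ATTR_KNOB_CHOICE_KNOB_VALUE, CUDNN_TYPE_INT64, 1, &value);
    if (status != CUDNN_STATUS_SUCCESS) return status;
    return cudnn_frontend::finalize(choice);
}
//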
cudnnBackendFinalize(m_engine_config.bChoices[i]->get_backend_descriptor()); + status = cudnn_frontend::finalize(m_engine_config.bChoices[i]->get_backend_descriptor()); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_engine_config, @@ -180,11 +180,11 @@ class EngineConfigBuilder_v8 { return std::move(m_engine_config); } - status = cudnnBackendSetAttribute(m_engine_config.pointer->get_backend_descriptor(), - CUDNN_ATTR_ENGINECFG_ENGINE, - CUDNN_TYPE_BACKEND_DESCRIPTOR, - 1, - &(m_engine_config.engine->get_backend_descriptor())); + status = cudnn_frontend::set_attribute(m_engine_config.pointer->get_backend_descriptor(), + CUDNN_ATTR_ENGINECFG_ENGINE, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &(m_engine_config.engine->get_backend_descriptor())); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_engine_config, @@ -198,11 +198,11 @@ class EngineConfigBuilder_v8 { for (auto i = 0; i < m_engine_config.numKnobs; i++) { bChoices_[i] = m_engine_config.bChoices[i]->get_backend_descriptor(); } - status = cudnnBackendSetAttribute(m_engine_config.pointer->get_backend_descriptor(), - CUDNN_ATTR_ENGINECFG_KNOB_CHOICES, - CUDNN_TYPE_BACKEND_DESCRIPTOR, - m_engine_config.numKnobs, - bChoices_.data()); + status = cudnn_frontend::set_attribute(m_engine_config.pointer->get_backend_descriptor(), + CUDNN_ATTR_ENGINECFG_KNOB_CHOICES, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + m_engine_config.numKnobs, + bChoices_.data()); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_engine_config, @@ -213,7 +213,7 @@ class EngineConfigBuilder_v8 { } // Finalizing the descriptor - status = cudnnBackendFinalize(m_engine_config.pointer->get_backend_descriptor()); + status = cudnn_frontend::finalize(m_engine_config.pointer->get_backend_descriptor()); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_engine_config, status, "CUDNN_BACKEND_ENGINECFG_DESCRIPTOR: cudnnFinalize Failed"); diff --git a/include/cudnn_frontend_EngineFallbackList.h b/include/cudnn_frontend_EngineFallbackList.h index 21b277a..519cf65 100644 --- a/include/cudnn_frontend_EngineFallbackList.h +++ b/include/cudnn_frontend_EngineFallbackList.h @@ -29,9 +29,9 @@ namespace cudnn_frontend { [[maybe_unused]] auto static get_fallback_engine_list(DescriptorType_t mode, const std::string &opGraphTag) -> std::vector { - auto major_version = cudnnGetVersion() / 1000; + auto major_version = cudnn_frontend::get_backend_version() / 1000; - auto minor_version = (cudnnGetVersion() / 100) % 10; + auto minor_version = (cudnn_frontend::get_backend_version() / 100) % 10; if (major_version >= 8) { if (minor_version <= 2) { /// Here we are using the term "bias" in the operationGraph as a proxy for @@ -157,33 +157,12 @@ class EngineFallbackListBuilder_v8 { "CUDNN_ATTR_ENGINEHEUR_OPERATION_GRAPH field for heuristic"); return std::move(m_fallback_list); }; -#if (CUDNN_VERSION >= 8400) auto fallback_heuristics = EngineHeuristicsBuilder_v8() .setHeurMode(CUDNN_HEUR_MODE_FALLBACK) .setOperationGraph(m_fallback_list.opGraph, m_fallback_list.opGraphTag) .build(); auto count = fallback_heuristics.getEngineConfigCount(); m_fallback_list.m_engine_configs = fallback_heuristics.getEngineConfig(count); -#else - auto fallback_engine_list = get_fallback_engine_list(m_fallback_list.mode, m_fallback_list.opGraphTag); - for (std::uint32_t i = 0; i < fallback_engine_list.size(); i++) { -#ifndef NV_CUDNN_DISABLE_EXCEPTION - try { -#endif - auto engine = cudnn_frontend::EngineBuilder_v8() - 
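//
// Worked example for the get_backend_version() arithmetic above. cudnn 8.x
// encodes its version as major * 1000 + minor * 100 + patch, so 8904 splits
// into major 8, minor 9 (the helper name is ours, for illustration):
//
static void
split_backend_version_8x(int64_t version, int64_t &major_version, int64_t &minor_version) {
    major_version = version / 1000;        // 8904 / 1000       == 8
    minor_version = (version / 100) % 10;  // (8904 / 100) % 10 == 9
}
//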
.setGlobalEngineIdx(fallback_engine_list[i]) - .setOperationGraph(m_fallback_list.opGraph) - .build(); - auto engine_config = cudnn_frontend::EngineConfigBuilder_v8().setEngine(engine).build(); - m_fallback_list.m_engine_configs.emplace_back(engine_config.get_desc()); -#ifndef NV_CUDNN_DISABLE_EXCEPTION - } catch (cudnn_frontend::cudnnException &e) { - CUDNN_FRONTEND_UNUSED(e); - continue; - } -#endif - } -#endif getLogger() << "[cudnn_frontend] " << m_fallback_list << std::endl; return std::move(m_fallback_list); } diff --git a/include/cudnn_frontend_Errata.h b/include/cudnn_frontend_Errata.h index 3472eb7..131832b 100644 --- a/include/cudnn_frontend_Errata.h +++ b/include/cudnn_frontend_Errata.h @@ -71,7 +71,7 @@ check_shape(cudnnBackendDescriptor_t &op, cudnnBackendDescriptor_t tensor_ = tensor->get_backend_descriptor(); int64_t count = 0; cudnnStatus_t status = - cudnnBackendGetAttribute(op, tensor_attr, CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &count, &tensor_); + cudnn_frontend::get_attribute(op, tensor_attr, CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &count, &tensor_); if (status != CUDNN_STATUS_SUCCESS) { #ifndef NV_CUDNN_DISABLE_EXCEPTION throw cudnnException(std::string("Error getting attribute. cudnn_status: " + to_string(status)).c_str(), @@ -81,7 +81,7 @@ check_shape(cudnnBackendDescriptor_t &op, // Get tensor dims std::array tensor_dims; - status = cudnnBackendGetAttribute( + status = cudnn_frontend::get_attribute( tensor_, CUDNN_ATTR_TENSOR_DIMENSIONS, CUDNN_TYPE_INT64, 5, &count, tensor_dims.data()); if (status != CUDNN_STATUS_SUCCESS) { #ifndef NV_CUDNN_DISABLE_EXCEPTION diff --git a/include/cudnn_frontend_ExecutionPlan.h b/include/cudnn_frontend_ExecutionPlan.h index 8af879f..7ed985b 100644 --- a/include/cudnn_frontend_ExecutionPlan.h +++ b/include/cudnn_frontend_ExecutionPlan.h @@ -76,13 +76,11 @@ class ExecutionPlan_v8 : public BackendDescriptor { for (auto note : numeric_notes_vec) { ss << cudnn_frontend::to_string(note) << ","; } -#if (CUDNN_VERSION >= 8200) ss << "] behavior_notes:" << "["; for (auto note : behavior_notes_vec) { ss << cudnn_frontend::to_string(note) << ","; } -#endif ss << "] workSpaceSize: " << workSpaceSize; return ss.str(); } @@ -112,7 +110,6 @@ class ExecutionPlan_v8 : public BackendDescriptor { return numeric_notes; } -#if (CUDNN_VERSION >= 8200) std::array const & getBehaviorNotes() const { return behavior_notes; @@ -121,20 +118,18 @@ class ExecutionPlan_v8 : public BackendDescriptor { getAllBehaviorNotes() const { return behavior_notes_vec; } -#endif std::string getJsonRepresentation() const { -#if (CUDNN_VERSION >= 8400) auto status = CUDNN_STATUS_SUCCESS; int64_t serializationSize; std::vector serialization_buf; - status = cudnnBackendGetAttribute(pointer->get_backend_descriptor(), - CUDNN_ATTR_EXECUTION_PLAN_JSON_REPRESENTATION, - CUDNN_TYPE_CHAR, - 0, - &serializationSize, - nullptr); + status = cudnn_frontend::get_attribute(pointer->get_backend_descriptor(), + CUDNN_ATTR_EXECUTION_PLAN_JSON_REPRESENTATION, + CUDNN_TYPE_CHAR, + 0, + &serializationSize, + nullptr); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception(this, status, @@ -142,12 +137,12 @@ class ExecutionPlan_v8 : public BackendDescriptor { "CUDNN_ATTR_EXECUTION_PLAN_JSON_REPRESENTATION Failed"); } serialization_buf.resize(static_cast(serializationSize)); - status = cudnnBackendGetAttribute(pointer->get_backend_descriptor(), - CUDNN_ATTR_EXECUTION_PLAN_JSON_REPRESENTATION, - CUDNN_TYPE_CHAR, - serializationSize, - &serializationSize, - serialization_buf.data()); + status = 
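//
// With the pre-8.4 engine-by-index loop deleted above, fallback configs now
// always come from the heuristics descriptor. Condensed sketch of the
// retained path (two-argument setOperationGraph overload as used above;
// user code would substitute its own graph and tag):
//
//   auto fallback = cudnn_frontend::EngineHeuristicsBuilder_v8()
//                       .setHeurMode(CUDNN_HEUR_MODE_FALLBACK)
//                       .setOperationGraph(opGraph, opGraphTag)
//                       .build();
//   auto configs = fallback.getEngineConfig(fallback.getEngineConfigCount());
//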
cudnn_frontend::get_attribute(pointer->get_backend_descriptor(), + CUDNN_ATTR_EXECUTION_PLAN_JSON_REPRESENTATION, + CUDNN_TYPE_CHAR, + serializationSize, + &serializationSize, + serialization_buf.data()); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception(this, status, @@ -156,15 +151,6 @@ class ExecutionPlan_v8 : public BackendDescriptor { } std::string json_string(serialization_buf.begin(), serialization_buf.end()); return json_string; -#else - auto status = CUDNN_STATUS_NOT_SUPPORTED; - set_error_and_throw_exception(this, - status, - "CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR: GetAttribute " - "CUDNN_ATTR_EXECUTION_PLAN_JSON_REPRESENTATION Failed"); - std::string json_string{""}; - return json_string; -#endif } ExecutionPlan_v8(ExecutionPlan_v8 const &) = default; @@ -177,19 +163,19 @@ class ExecutionPlan_v8 : public BackendDescriptor { auto status = CUDNN_STATUS_SUCCESS; int64_t elem_count = 0; cudnnBackendDescriptor_t extractedEngine_ = extractedEngine->get_backend_descriptor(); - status = cudnnBackendGetAttribute(extractedEngine_, - CUDNN_ATTR_ENGINE_NUMERICAL_NOTE, - CUDNN_TYPE_NUMERICAL_NOTE, - CUDNN_NUMERICAL_NOTE_TYPE_COUNT, - &elem_count, - nullptr); + status = cudnn_frontend::get_attribute(extractedEngine_, + CUDNN_ATTR_ENGINE_NUMERICAL_NOTE, + CUDNN_TYPE_NUMERICAL_NOTE, + CUDNN_NUMERICAL_NOTE_TYPE_COUNT, + &elem_count, + nullptr); numeric_notes_vec.resize(static_cast(elem_count)); - status = cudnnBackendGetAttribute(extractedEngine_, - CUDNN_ATTR_ENGINE_NUMERICAL_NOTE, - CUDNN_TYPE_NUMERICAL_NOTE, - CUDNN_NUMERICAL_NOTE_TYPE_COUNT, - &elem_count, - numeric_notes_vec.data()); + status = cudnn_frontend::get_attribute(extractedEngine_, + CUDNN_ATTR_ENGINE_NUMERICAL_NOTE, + CUDNN_TYPE_NUMERICAL_NOTE, + CUDNN_NUMERICAL_NOTE_TYPE_COUNT, + &elem_count, + numeric_notes_vec.data()); ptrdiff_t end = static_cast(std::min(elem_count, static_cast(CUDNN_NUMERICAL_NOTE_TYPE_COUNT))); std::copy(numeric_notes_vec.begin(), numeric_notes_vec.begin() + end, numeric_notes.begin()); @@ -203,20 +189,19 @@ class ExecutionPlan_v8 : public BackendDescriptor { "CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR: GetAttribute " "CUDNN_ATTR_ENGINE_NUMERICAL_NOTE Failed"); } -#if (CUDNN_VERSION >= 8200) - status = cudnnBackendGetAttribute(extractedEngine_, - CUDNN_ATTR_ENGINE_BEHAVIOR_NOTE, - CUDNN_TYPE_BEHAVIOR_NOTE, - CUDNN_BEHAVIOR_NOTE_TYPE_COUNT, - &elem_count, - nullptr); + status = cudnn_frontend::get_attribute(extractedEngine_, + CUDNN_ATTR_ENGINE_BEHAVIOR_NOTE, + CUDNN_TYPE_BEHAVIOR_NOTE, + CUDNN_BEHAVIOR_NOTE_TYPE_COUNT, + &elem_count, + nullptr); behavior_notes_vec.resize(static_cast(elem_count)); - status = cudnnBackendGetAttribute(extractedEngine_, - CUDNN_ATTR_ENGINE_BEHAVIOR_NOTE, - CUDNN_TYPE_BEHAVIOR_NOTE, - CUDNN_BEHAVIOR_NOTE_TYPE_COUNT, - &elem_count, - behavior_notes_vec.data()); + status = cudnn_frontend::get_attribute(extractedEngine_, + CUDNN_ATTR_ENGINE_BEHAVIOR_NOTE, + CUDNN_TYPE_BEHAVIOR_NOTE, + CUDNN_BEHAVIOR_NOTE_TYPE_COUNT, + &elem_count, + behavior_notes_vec.data()); end = static_cast(std::min(elem_count, static_cast(CUDNN_BEHAVIOR_NOTE_TYPE_COUNT))); std::copy(behavior_notes_vec.begin(), behavior_notes_vec.begin() + end, behavior_notes.begin()); if (static_cast(elem_count) < behavior_notes.size()) @@ -229,7 +214,6 @@ class ExecutionPlan_v8 : public BackendDescriptor { "CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR: GetAttribute " "CUDNN_ATTR_ENGINE_BEHAVIOR_NOTE Failed"); } -#endif } void @@ -255,7 +239,7 @@ class ExecutionPlan_v8 : public BackendDescriptor { 
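//
// The JSON and engine-note hunks above all use the same two-call protocol for
// variable-length attributes: query the element count with a null buffer,
// resize, then fetch. Condensed, self-contained sketch (assumes <string> and
// <vector> are available; the helper name is ours):
//
static cudnnStatus_t
get_plan_json(cudnnBackendDescriptor_t plan, std::string &json_out) {
    int64_t size = 0;
    // Call 1: requested count 0, null buffer, backend reports the true size.
    cudnnStatus_t status = cudnn_frontend::get_attribute(
        plan, CUDNN_ATTR_EXECUTION_PLAN_JSON_REPRESENTATION, CUDNN_TYPE_CHAR, 0, &size, nullptr);
    if (status != CUDNN_STATUS_SUCCESS) return status;
    std::vector<char> buffer(static_cast<size_t>(size));
    // Call 2: fetch exactly `size` bytes into the resized buffer.
    status = cudnn_frontend::get_attribute(
        plan, CUDNN_ATTR_EXECUTION_PLAN_JSON_REPRESENTATION, CUDNN_TYPE_CHAR, size, &size, buffer.data());
    if (status != CUDNN_STATUS_SUCCESS) return status;
    json_out.assign(buffer.begin(), buffer.end());
    return CUDNN_STATUS_SUCCESS;
}
//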
extractedKnobs_[i] = extractedKnobs[i]->get_backend_descriptor(); } - status = cudnnBackendGetAttribute( + status = cudnn_frontend::get_attribute( extractedEngine_, CUDNN_ATTR_ENGINE_GLOBAL_INDEX, CUDNN_TYPE_INT64, 1, &elemCount, &engineId); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception(this, @@ -265,12 +249,12 @@ class ExecutionPlan_v8 : public BackendDescriptor { } tag << "eng" << engineId; - status = cudnnBackendGetAttribute(engine_config->get_backend_descriptor(), - CUDNN_ATTR_ENGINECFG_KNOB_CHOICES, - CUDNN_TYPE_BACKEND_DESCRIPTOR, - CUDNN_KNOB_TYPE_COUNTS, - &numKnobs, - &(extractedKnobs_[0])); + status = cudnn_frontend::get_attribute(engine_config->get_backend_descriptor(), + CUDNN_ATTR_ENGINECFG_KNOB_CHOICES, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + CUDNN_KNOB_TYPE_COUNTS, + &numKnobs, + &(extractedKnobs_[0])); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception(this, status, @@ -287,7 +271,7 @@ class ExecutionPlan_v8 : public BackendDescriptor { const cudnnBackendDescriptor_t &knob = extractedKnobs_[idx]; cudnnBackendKnobType_t type = CUDNN_KNOB_TYPE_COUNTS; int64_t choice = -2; - status = cudnnBackendGetAttribute( + status = cudnn_frontend::get_attribute( knob, CUDNN_ATTR_KNOB_CHOICE_KNOB_TYPE, CUDNN_TYPE_KNOB_TYPE, 1, nullptr, &type); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception(this, @@ -295,7 +279,7 @@ class ExecutionPlan_v8 : public BackendDescriptor { "computeTag CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR: GetAttribute " "CUDNN_ATTR_KNOB_CHOICE_KNOB_TYPE Failed"); } - status = cudnnBackendGetAttribute( + status = cudnn_frontend::get_attribute( knob, CUDNN_ATTR_KNOB_CHOICE_KNOB_VALUE, CUDNN_TYPE_INT64, 1, nullptr, &choice); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception(this, @@ -310,12 +294,12 @@ class ExecutionPlan_v8 : public BackendDescriptor { void computeWorkSpaceSize() { - auto status = cudnnBackendGetAttribute(pointer->get_backend_descriptor(), - CUDNN_ATTR_EXECUTION_PLAN_WORKSPACE_SIZE, - CUDNN_TYPE_INT64, - 1, - nullptr, - &workSpaceSize); + auto status = cudnn_frontend::get_attribute(pointer->get_backend_descriptor(), + CUDNN_ATTR_EXECUTION_PLAN_WORKSPACE_SIZE, + CUDNN_TYPE_INT64, + 1, + nullptr, + &workSpaceSize); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception(this, status, @@ -336,10 +320,8 @@ class ExecutionPlan_v8 : public BackendDescriptor { std::int64_t workSpaceSize = 0; std::array numeric_notes; std::vector numeric_notes_vec; -#if (CUDNN_VERSION >= 8200) std::array behavior_notes; std::vector behavior_notes_vec; -#endif float execution_time_ms = 0.0f; }; @@ -411,11 +393,11 @@ class ExecutionPlanBuilder_v8 { return std::move(m_execution_plan); } - status = cudnnBackendSetAttribute(m_execution_plan.pointer->get_backend_descriptor(), - CUDNN_ATTR_EXECUTION_PLAN_ENGINE_CONFIG, - CUDNN_TYPE_BACKEND_DESCRIPTOR, - 1, - &(m_execution_plan.engine_config->get_backend_descriptor())); + status = cudnn_frontend::set_attribute(m_execution_plan.pointer->get_backend_descriptor(), + CUDNN_ATTR_EXECUTION_PLAN_ENGINE_CONFIG, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &(m_execution_plan.engine_config->get_backend_descriptor())); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_execution_plan, @@ -423,11 +405,11 @@ class ExecutionPlanBuilder_v8 { "CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR: SetAttribute CUDNN_ATTR_EXECUTION_PLAN_ENGINE_CONFIG Failed"); return std::move(m_execution_plan); } - status = 
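//
// Typical consumption of the workspace size computed above, as a sketch
// (getWorkspaceSize is the frontend accessor over this cached value; the
// cudaMalloc call needs <cuda_runtime.h>):
//
//   int64_t workspace_size = plan.getWorkspaceSize();
//   void *workspace = nullptr;
//   if (workspace_size > 0) {
//       cudaMalloc(&workspace, static_cast<size_t>(workspace_size));
//   }
//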
cudnnBackendSetAttribute(m_execution_plan.pointer->get_backend_descriptor(), - CUDNN_ATTR_EXECUTION_PLAN_HANDLE, - CUDNN_TYPE_HANDLE, - 1, - &m_execution_plan.handle); + status = cudnn_frontend::set_attribute(m_execution_plan.pointer->get_backend_descriptor(), + CUDNN_ATTR_EXECUTION_PLAN_HANDLE, + CUDNN_TYPE_HANDLE, + 1, + &m_execution_plan.handle); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_execution_plan, @@ -436,7 +418,7 @@ class ExecutionPlanBuilder_v8 { return std::move(m_execution_plan); } // Finalizing the descriptor - status = cudnnBackendFinalize(m_execution_plan.pointer->get_backend_descriptor()); + status = cudnn_frontend::finalize(m_execution_plan.pointer->get_backend_descriptor()); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_execution_plan, status, "CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR: cudnnFinalize Descriptor Failed"); @@ -454,12 +436,12 @@ class ExecutionPlanBuilder_v8 { } cudnnBackendDescriptor_t extractedEngine_ = extractedEngine->get_backend_descriptor(); int64_t elemCount = 0; - status = cudnnBackendGetAttribute(m_execution_plan.engine_config->get_backend_descriptor(), - CUDNN_ATTR_ENGINECFG_ENGINE, - CUDNN_TYPE_BACKEND_DESCRIPTOR, - 1, - &elemCount, - &extractedEngine_); + status = cudnn_frontend::get_attribute(m_execution_plan.engine_config->get_backend_descriptor(), + CUDNN_ATTR_ENGINECFG_ENGINE, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &elemCount, + &extractedEngine_); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception(&m_execution_plan, status, @@ -479,7 +461,6 @@ class ExecutionPlanBuilder_v8 { ExecutionPlan_v8 && loadFromJson(const std::string &json_plan) { CUDNN_FRONTEND_UNUSED(json_plan); -#if (CUDNN_VERSION >= 8400) auto status = CUDNN_STATUS_SUCCESS; if (m_execution_plan.handle == nullptr) { @@ -500,11 +481,11 @@ class ExecutionPlanBuilder_v8 { std::vector serialization_buf; serialization_buf.assign(json_plan.begin(), json_plan.end()); - status = cudnnBackendSetAttribute(m_execution_plan.pointer->get_backend_descriptor(), - CUDNN_ATTR_EXECUTION_PLAN_JSON_REPRESENTATION, - CUDNN_TYPE_CHAR, - serialization_buf.size(), - serialization_buf.data()); + status = cudnn_frontend::set_attribute(m_execution_plan.pointer->get_backend_descriptor(), + CUDNN_ATTR_EXECUTION_PLAN_JSON_REPRESENTATION, + CUDNN_TYPE_CHAR, + serialization_buf.size(), + serialization_buf.data()); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception(&m_execution_plan, status, @@ -513,11 +494,11 @@ class ExecutionPlanBuilder_v8 { return std::move(m_execution_plan); } - status = cudnnBackendSetAttribute(m_execution_plan.pointer->get_backend_descriptor(), - CUDNN_ATTR_EXECUTION_PLAN_HANDLE, - CUDNN_TYPE_HANDLE, - 1, - &m_execution_plan.handle); + status = cudnn_frontend::set_attribute(m_execution_plan.pointer->get_backend_descriptor(), + CUDNN_ATTR_EXECUTION_PLAN_HANDLE, + CUDNN_TYPE_HANDLE, + 1, + &m_execution_plan.handle); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_execution_plan, @@ -526,7 +507,7 @@ class ExecutionPlanBuilder_v8 { return std::move(m_execution_plan); } - status = cudnnBackendFinalize(m_execution_plan.pointer->get_backend_descriptor()); + status = cudnn_frontend::finalize(m_execution_plan.pointer->get_backend_descriptor()); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_execution_plan, status, "CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR: cudnnFinalize Descriptor Failed"); @@ -545,12 +526,12 @@ class ExecutionPlanBuilder_v8 { 
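//
// Caller-side sketch of the build path above (handle and engine_config are
// assumed to exist; the setter names follow this header, though the exact
// overloads may differ):
//
//   auto plan = cudnn_frontend::ExecutionPlanBuilder_v8()
//                   .setHandle(handle)
//                   .setEngineConfig(engine_config)
//                   .build();
//   if (plan.get_status() != CUDNN_STATUS_SUCCESS) {
//       // plan.get_error() carries the message set by the builder.
//   }
//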
cudnnBackendDescriptor_t engCfgDesc = m_execution_plan.engine_config->get_backend_descriptor(); int64_t elemCount = 0; - status = cudnnBackendGetAttribute(m_execution_plan.pointer->get_backend_descriptor(), - CUDNN_ATTR_EXECUTION_PLAN_ENGINE_CONFIG, - CUDNN_TYPE_BACKEND_DESCRIPTOR, - 1, - &elemCount, - &engCfgDesc); + status = cudnn_frontend::get_attribute(m_execution_plan.pointer->get_backend_descriptor(), + CUDNN_ATTR_EXECUTION_PLAN_ENGINE_CONFIG, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &elemCount, + &engCfgDesc); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception(&m_execution_plan, @@ -571,12 +552,12 @@ class ExecutionPlanBuilder_v8 { cudnnBackendDescriptor_t extractedEngine_ = extractedEngine->get_backend_descriptor(); - status = cudnnBackendGetAttribute(m_execution_plan.engine_config->get_backend_descriptor(), - CUDNN_ATTR_ENGINECFG_ENGINE, - CUDNN_TYPE_BACKEND_DESCRIPTOR, - 1, - &elemCount, - &extractedEngine_); + status = cudnn_frontend::get_attribute(m_execution_plan.engine_config->get_backend_descriptor(), + CUDNN_ATTR_ENGINECFG_ENGINE, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &elemCount, + &extractedEngine_); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception(&m_execution_plan, @@ -592,14 +573,6 @@ class ExecutionPlanBuilder_v8 { getLogger() << "[cudnn_frontend] " << m_execution_plan << std::endl; return std::move(m_execution_plan); -#else - auto status = CUDNN_STATUS_NOT_SUPPORTED; - set_error_and_throw_exception(&m_execution_plan, - status, - "CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR: Build " - "From Json Failed"); - return std::move(m_execution_plan); -#endif } explicit ExecutionPlanBuilder_v8() = default; diff --git a/include/cudnn_frontend_Filters.h b/include/cudnn_frontend_Filters.h index c3faa11..781582f 100644 --- a/include/cudnn_frontend_Filters.h +++ b/include/cudnn_frontend_Filters.h @@ -46,18 +46,18 @@ hasNumericalNote(cudnnBackendDescriptor_t engine_config) { ManagedOpaqueDescriptor engine = make_shared_backend_pointer(CUDNN_BACKEND_ENGINE_DESCRIPTOR); cudnnBackendDescriptor_t engine_ = engine->get_backend_descriptor(); int64_t engine_count = -1; - status = cudnnBackendGetAttribute( + status = cudnn_frontend::get_attribute( engine_config, CUDNN_ATTR_ENGINECFG_ENGINE, CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &engine_count, &engine_); if (status == CUDNN_STATUS_SUCCESS) { cudnnBackendNumericalNote_t notes[CUDNN_NUMERICAL_NOTE_TYPE_COUNT]; std::fill_n(notes, CUDNN_NUMERICAL_NOTE_TYPE_COUNT, CUDNN_NUMERICAL_NOTE_TYPE_COUNT); int64_t elem_count = 0; - cudnnBackendGetAttribute(engine->get_backend_descriptor(), - CUDNN_ATTR_ENGINE_NUMERICAL_NOTE, - CUDNN_TYPE_NUMERICAL_NOTE, - CUDNN_NUMERICAL_NOTE_TYPE_COUNT, - &elem_count, - notes); + cudnn_frontend::get_attribute(engine->get_backend_descriptor(), + CUDNN_ATTR_ENGINE_NUMERICAL_NOTE, + CUDNN_TYPE_NUMERICAL_NOTE, + CUDNN_NUMERICAL_NOTE_TYPE_COUNT, + &elem_count, + notes); if (std::any_of( notes, notes + elem_count, [](cudnnBackendNumericalNote_t note) { return note == NUMERIC_NOTE; })) { hasNumerics = true; @@ -66,7 +66,6 @@ hasNumericalNote(cudnnBackendDescriptor_t engine_config) { return hasNumerics; } -#if (CUDNN_VERSION >= 8200) template bool hasBehaviorNote(cudnnBackendDescriptor_t engine_config) { @@ -75,18 +74,18 @@ hasBehaviorNote(cudnnBackendDescriptor_t engine_config) { ManagedOpaqueDescriptor engine = make_shared_backend_pointer(CUDNN_BACKEND_ENGINE_DESCRIPTOR); cudnnBackendDescriptor_t engine_ = engine->get_backend_descriptor(); int64_t engine_count = -1; - status = 
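//
// Round-trip sketch pairing getJsonRepresentation with the loadFromJson path
// above: serialize a working plan once, then rebuild it later (or in another
// process) against the same cudnn version:
//
//   std::string cached = plan.getJsonRepresentation();
//   auto restored = cudnn_frontend::ExecutionPlanBuilder_v8()
//                       .setHandle(handle)
//                       .loadFromJson(cached);
//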
cudnnBackendGetAttribute( + status = cudnn_frontend::get_attribute( engine_config, CUDNN_ATTR_ENGINECFG_ENGINE, CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &engine_count, &engine_); if (status == CUDNN_STATUS_SUCCESS) { cudnnBackendBehaviorNote_t notes[CUDNN_BEHAVIOR_NOTE_TYPE_COUNT]; std::fill_n(notes, CUDNN_BEHAVIOR_NOTE_TYPE_COUNT, CUDNN_BEHAVIOR_NOTE_TYPE_COUNT); int64_t elem_count = 0; - cudnnBackendGetAttribute(engine->get_backend_descriptor(), - CUDNN_ATTR_ENGINE_BEHAVIOR_NOTE, - CUDNN_TYPE_BEHAVIOR_NOTE, - CUDNN_BEHAVIOR_NOTE_TYPE_COUNT, - &elem_count, - notes); + cudnn_frontend::get_attribute(engine->get_backend_descriptor(), + CUDNN_ATTR_ENGINE_BEHAVIOR_NOTE, + CUDNN_TYPE_BEHAVIOR_NOTE, + CUDNN_BEHAVIOR_NOTE_TYPE_COUNT, + &elem_count, + notes); if (std::any_of( notes, notes + elem_count, [](cudnnBackendBehaviorNote_t note) { return note == BEHAVIOR_NOTE; })) { hasBehavior = true; @@ -94,5 +93,4 @@ hasBehaviorNote(cudnnBackendDescriptor_t engine_config) { } return hasBehavior; } -#endif } // namespace cudnn_frontend diff --git a/include/cudnn_frontend_Heuristics.h b/include/cudnn_frontend_Heuristics.h index fe47d31..4c8d046 100644 --- a/include/cudnn_frontend_Heuristics.h +++ b/include/cudnn_frontend_Heuristics.h @@ -27,9 +27,6 @@ #include "cudnn_frontend_OperationGraph.h" #include "cudnn_frontend_EngineConfig.h" -#if (CUDNN_VERSION < 8400) -#include "cudnn_frontend_EngineFallbackList.h" -#endif #include "cudnn_frontend_utils.h" #include "cudnn_frontend_Filters.h" @@ -90,12 +87,13 @@ class EngineHeuristics_v8 : public BackendDescriptor { heuristic_results_.emplace_back(m_heuristic_results[i]->get_backend_descriptor()); } int64_t result = -1; - status = cudnnBackendGetAttribute(pointer->get_backend_descriptor(), - CUDNN_ATTR_ENGINEHEUR_RESULTS, - CUDNN_TYPE_BACKEND_DESCRIPTOR, - count, - &result, - heuristic_results_.data()); + status = cudnn_frontend::get_attribute(pointer->get_backend_descriptor(), + CUDNN_ATTR_ENGINEHEUR_RESULTS, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + count, + &result, + heuristic_results_.data()); + if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( this, status, "CUDNN_BACKEND_ENGINEHEUR_DESCRIPTOR: GetAttribute CUDNN_ATTR_ENGINEHEUR_RESULTS Failed"); @@ -109,12 +107,12 @@ class EngineHeuristics_v8 : public BackendDescriptor { getEngineConfigCount(void) const -> int64_t { cudnnStatus_t status; int64_t count = -1; - status = cudnnBackendGetAttribute(pointer->get_backend_descriptor(), - CUDNN_ATTR_ENGINEHEUR_RESULTS, - CUDNN_TYPE_BACKEND_DESCRIPTOR, - 0, - &count, - nullptr); + status = cudnn_frontend::get_attribute(pointer->get_backend_descriptor(), + CUDNN_ATTR_ENGINEHEUR_RESULTS, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 0, + &count, + nullptr); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( this, @@ -200,11 +198,11 @@ class EngineHeuristicsBuilder_v8 { return std::move(m_heuristics); }; - status = cudnnBackendSetAttribute(m_heuristics.pointer->get_backend_descriptor(), - CUDNN_ATTR_ENGINEHEUR_OPERATION_GRAPH, - CUDNN_TYPE_BACKEND_DESCRIPTOR, - 1, - &(m_heuristics.opGraph->get_backend_descriptor())); + status = cudnn_frontend::set_attribute(m_heuristics.pointer->get_backend_descriptor(), + CUDNN_ATTR_ENGINEHEUR_OPERATION_GRAPH, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &(m_heuristics.opGraph->get_backend_descriptor())); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_heuristics, @@ -212,11 +210,11 @@ class EngineHeuristicsBuilder_v8 { "CUDNN_BACKEND_ENGINEHEUR_DESCRIPTOR: SetAttribute 
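//
// Usage sketch for hasBehaviorNote above: a predicate that flags engine
// configs whose engine requires runtime compilation, suitable for the
// frontend's config filtering (the note enum is part of the public cudnn
// backend API):
//
static bool
requires_runtime_compilation(cudnnBackendDescriptor_t engine_config) {
    return cudnn_frontend::hasBehaviorNote<CUDNN_BEHAVIOR_NOTE_RUNTIME_COMPILATION>(engine_config);
}
//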
CUDNN_ATTR_ENGINEHEUR_OPERATION_GRAPH Failed"); return std::move(m_heuristics); }; - status = cudnnBackendSetAttribute(m_heuristics.pointer->get_backend_descriptor(), - CUDNN_ATTR_ENGINEHEUR_MODE, - CUDNN_TYPE_HEUR_MODE, - 1, - &m_heuristics.mode); + status = cudnn_frontend::set_attribute(m_heuristics.pointer->get_backend_descriptor(), + CUDNN_ATTR_ENGINEHEUR_MODE, + CUDNN_TYPE_HEUR_MODE, + 1, + &m_heuristics.mode); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_heuristics, @@ -226,11 +224,16 @@ class EngineHeuristicsBuilder_v8 { }; #if (CUDNN_VERSION >= 8905) if (m_heuristics.target_sm_count >= 0) { - status = cudnnBackendSetAttribute(m_heuristics.pointer->get_backend_descriptor(), - CUDNN_ATTR_ENGINE_SM_COUNT_TARGET, - CUDNN_TYPE_INT32, - 1, - &m_heuristics.target_sm_count); + NV_CUDNN_FE_DYNAMIC_CHECK_BACKEND_DESCRIPTOR( + 8905, + m_heuristics, + "CUDNN_BACKEND_ENGINEHEUR_DESCRIPTOR: SetAttribute CUDNN_ATTR_ENGINE_SM_COUNT_TARGET requires cudnn " + "version 8.9.5"); + status = cudnn_frontend::set_attribute(m_heuristics.pointer->get_backend_descriptor(), + CUDNN_ATTR_ENGINE_SM_COUNT_TARGET, + CUDNN_TYPE_INT32, + 1, + &m_heuristics.target_sm_count); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_heuristics, @@ -241,18 +244,17 @@ class EngineHeuristicsBuilder_v8 { } #endif -#if (CUDNN_VERSION >= 8401) if (m_heuristics.mode == CUDNN_HEUR_MODE_B) { EngineHeuristics_v8::get_heur_b_mutex().lock(); } -#endif + // Finalizing the descriptor - status = cudnnBackendFinalize(m_heuristics.pointer->get_backend_descriptor()); -#if (CUDNN_VERSION >= 8401) + status = cudnn_frontend::finalize(m_heuristics.pointer->get_backend_descriptor()); + if (m_heuristics.mode == CUDNN_HEUR_MODE_B) { EngineHeuristics_v8::get_heur_b_mutex().unlock(); } -#endif + if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_heuristics, status, "CUDNN_BACKEND_ENGINEHEUR_DESCRIPTOR: cudnn Finalize failed"); @@ -356,12 +358,7 @@ get_heuristics_list(std::vector const &modes, for (auto &mode : modes) { if (mode.find("heuristics_instant") != std::string::npos || mode.find("heuristics_mode_a") != std::string::npos) { - auto heur_mode = -#if (CUDNN_VERSION >= 8300) - CUDNN_HEUR_MODE_A; -#else - CUDNN_HEUR_MODE_INSTANT; -#endif + auto heur_mode = CUDNN_HEUR_MODE_A; NV_CUDNN_FE_TRY(); auto status_l = get_heuristics_list_impl(heur_mode, opGraph, filter_fn, filtered_configs); NV_CUDNN_SET_STATUS_BREAK_OR_CONTINUE(status_l, true); @@ -369,34 +366,11 @@ get_heuristics_list(std::vector const &modes, } else if (mode.find("heuristics_fallback") != std::string::npos) { NV_CUDNN_FE_TRY(); -#if (CUDNN_VERSION >= 8300) auto status_l = get_heuristics_list_impl(CUDNN_HEUR_MODE_FALLBACK, opGraph, filter_fn, filtered_configs); NV_CUDNN_SET_STATUS_BREAK_OR_CONTINUE(status_l, true); -#else - DescriptorType_t op_type = DescriptorType_t::OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR; - std::string tag_ = opGraph.getTag(); - if (tag_.find("ConvFwd") != std::string::npos) { - op_type = DescriptorType_t::OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR; - } else if (tag_.find("ConvBwdFilter") != std::string::npos) { - op_type = DescriptorType_t::OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR; - } - auto heuristics = - cudnn_frontend::EngineFallbackListBuilder_v8().setOperationGraph(opGraph).setOperation(op_type).build(); - NV_CUDNN_SET_STATUS_BREAK_OR_CONTINUE(heuristics.get_status(), false); - auto &fallback_list = heuristics.getFallbackList(); - 
NV_CUDNN_SET_STATUS_BREAK_OR_CONTINUE(heuristics.get_status(), false); - getLogger() << "Fallback List has " << fallback_list.size() << " configurations " << std::endl; - cudnn_frontend::filter(fallback_list, filtered_configs, filter_fn); - NV_CUDNN_SET_STATUS_BREAK_OR_CONTINUE(heuristics.get_status(), true); -#endif NV_CUDNN_FE_CATCH(NV_CUDNN_SET_STATUS_BREAK_OR_CONTINUE(e.getCudnnStatus(), true)); } else if (mode.find("heuristics_mode_b") != std::string::npos) { - auto heur_mode = -#if (CUDNN_VERSION >= 8300) - CUDNN_HEUR_MODE_B; -#else - CUDNN_HEUR_MODE_INSTANT; -#endif + auto heur_mode = CUDNN_HEUR_MODE_B; NV_CUDNN_FE_TRY(); auto status_l = get_heuristics_list_impl(heur_mode, opGraph, filter_fn, filtered_configs); diff --git a/include/cudnn_frontend_Logging.h b/include/cudnn_frontend_Logging.h index 1af9e77..083847d 100644 --- a/include/cudnn_frontend_Logging.h +++ b/include/cudnn_frontend_Logging.h @@ -26,7 +26,6 @@ #include #include -#include "cudnn_backend_base.h" namespace cudnn_frontend { static const char * diff --git a/include/cudnn_frontend_MatMulDesc.h b/include/cudnn_frontend_MatMulDesc.h index e6511f3..a2b802c 100644 --- a/include/cudnn_frontend_MatMulDesc.h +++ b/include/cudnn_frontend_MatMulDesc.h @@ -126,11 +126,11 @@ class MatMulDescBuilder_v8 { "CUDNN_BACKEND_MATMUL_DESCRIPTOR: SetAttribute CUDNN_ATTR_MATMUL_COMP_TYPE Failed"); return std::move(m_matMulDesc); } - status = cudnnBackendSetAttribute(m_matMulDesc.pointer->get_backend_descriptor(), - CUDNN_ATTR_MATMUL_COMP_TYPE, - CUDNN_TYPE_DATA_TYPE, - 1, - &cudnn_data_type); + status = cudnn_frontend::set_attribute(m_matMulDesc.pointer->get_backend_descriptor(), + CUDNN_ATTR_MATMUL_COMP_TYPE, + CUDNN_TYPE_DATA_TYPE, + 1, + &cudnn_data_type); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_matMulDesc, @@ -141,12 +141,16 @@ class MatMulDescBuilder_v8 { #if (CUDNN_VERSION >= 8900) // Setting padding value if matmul desc is padded + NV_CUDNN_FE_DYNAMIC_CHECK_BACKEND_DESCRIPTOR( + 8900, + m_matMulDesc, + "CUDNN_BACKEND_MATMUL_DESCRIPTOR: SetAttribute CUDNN_ATTR_MATMUL_PADDING_VALUE requires cudnn 8.9.0"); if (m_matMulDesc.isPadded) { - status = cudnnBackendSetAttribute(m_matMulDesc.pointer->get_backend_descriptor(), - CUDNN_ATTR_MATMUL_PADDING_VALUE, - CUDNN_TYPE_DOUBLE, - 1, - &m_matMulDesc.paddingValue); + status = cudnn_frontend::set_attribute(m_matMulDesc.pointer->get_backend_descriptor(), + CUDNN_ATTR_MATMUL_PADDING_VALUE, + CUDNN_TYPE_DOUBLE, + 1, + &m_matMulDesc.paddingValue); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_matMulDesc, @@ -158,7 +162,7 @@ class MatMulDescBuilder_v8 { #endif // Finalizing the descriptor - status = cudnnBackendFinalize(m_matMulDesc.pointer->get_backend_descriptor()); + status = cudnn_frontend::finalize(m_matMulDesc.pointer->get_backend_descriptor()); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_matMulDesc, status, "CUDNN_BACKEND_MATMUL_DESCRIPTOR: cudnnFinalize Failed"); diff --git a/include/cudnn_frontend_Operation.h b/include/cudnn_frontend_Operation.h index 18e8434..bc264b0 100644 --- a/include/cudnn_frontend_Operation.h +++ b/include/cudnn_frontend_Operation.h @@ -243,20 +243,12 @@ class OperationBuilder_v8 { Operation_v8 && build_reduction_op() { m_operation.operationTag = "Reduction"; - auto status = CUDNN_STATUS_SUCCESS; - if ((cudnnGetVersion() / 100) == 81) { // workaround for cudnn 8.1 - status = cudnnBackendSetAttribute(m_operation.pointer->get_backend_descriptor(), - CUDNN_ATTR_REDUCTION_OPERATOR, - 
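//
// Caller-side sketch for the simplified mode strings above (signature as in
// the frontend's public get_heuristics_list; under the frontend's filter
// convention a true return drops a config, so the lambda below keeps all):
//
//   cudnn_frontend::EngineConfigList filtered_configs;
//   auto statuses = cudnn_frontend::get_heuristics_list<2>(
//       {"heuristics_mode_a", "heuristics_fallback"},
//       opGraph,
//       [](cudnnBackendDescriptor_t) { return false; },
//       filtered_configs);
//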
CUDNN_TYPE_BACKEND_DESCRIPTOR, - 1, - &(m_operation.reductiondesc->get_backend_descriptor())); - } else { - status = cudnnBackendSetAttribute(m_operation.pointer->get_backend_descriptor(), - CUDNN_ATTR_OPERATION_REDUCTION_DESC, - CUDNN_TYPE_BACKEND_DESCRIPTOR, - 1, - &(m_operation.reductiondesc->get_backend_descriptor())); - } + auto status = cudnn_frontend::set_attribute(m_operation.pointer->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_REDUCTION_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &(m_operation.reductiondesc->get_backend_descriptor())); + if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_operation, @@ -264,11 +256,11 @@ class OperationBuilder_v8 { "CUDNN_BACKEND_OPERATION: SetAttribute CUDNN_ATTR_OPERATION_REDUCTION_DESC Failed"); return std::move(m_operation); } - status = cudnnBackendSetAttribute(m_operation.pointer->get_backend_descriptor(), - CUDNN_ATTR_OPERATION_REDUCTION_XDESC, - CUDNN_TYPE_BACKEND_DESCRIPTOR, - 1, - &(m_operation.xdesc->get_backend_descriptor())); + status = cudnn_frontend::set_attribute(m_operation.pointer->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_REDUCTION_XDESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &(m_operation.xdesc->get_backend_descriptor())); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_operation, @@ -276,11 +268,11 @@ class OperationBuilder_v8 { "CUDNN_BACKEND_OPERATION: SetAttribute CUDNN_ATTR_OPERATION_REDUCTION_XDESC Failed"); return std::move(m_operation); } - status = cudnnBackendSetAttribute(m_operation.pointer->get_backend_descriptor(), - CUDNN_ATTR_OPERATION_REDUCTION_YDESC, - CUDNN_TYPE_BACKEND_DESCRIPTOR, - 1, - &(m_operation.ydesc->get_backend_descriptor())); + status = cudnn_frontend::set_attribute(m_operation.pointer->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_REDUCTION_YDESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &(m_operation.ydesc->get_backend_descriptor())); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_operation, @@ -288,7 +280,7 @@ class OperationBuilder_v8 { "CUDNN_BACKEND_OPERATION: SetAttribute CUDNN_ATTR_OPERATION_REDUCTION_YDESC Failed"); return std::move(m_operation); } - status = cudnnBackendFinalize(m_operation.pointer->get_backend_descriptor()); + status = cudnn_frontend::finalize(m_operation.pointer->get_backend_descriptor()); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception(&m_operation, status, "CUDNN_BACKEND_OPERATION: cudnnFinalize Failed"); return std::move(m_operation); @@ -300,43 +292,45 @@ class OperationBuilder_v8 { build_matmul_op() { m_operation.operationTag = "Matmul"; auto status = CUDNN_STATUS_SUCCESS; - status = cudnnBackendSetAttribute(m_operation.pointer->get_backend_descriptor(), - CUDNN_ATTR_OPERATION_MATMUL_ADESC, - CUDNN_TYPE_BACKEND_DESCRIPTOR, - 1, - &(m_operation.amatdesc->get_backend_descriptor())); + status = cudnn_frontend::set_attribute(m_operation.pointer->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_MATMUL_ADESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &(m_operation.amatdesc->get_backend_descriptor())); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_operation, status, "CUDNN_BACKEND_OPERATION: SetAttribute CUDNN_ATTR_OPERATION_MATMUL_ADESC Failed"); return std::move(m_operation); } - status = cudnnBackendSetAttribute(m_operation.pointer->get_backend_descriptor(), - CUDNN_ATTR_OPERATION_MATMUL_BDESC, - CUDNN_TYPE_BACKEND_DESCRIPTOR, - 1, - &(m_operation.bmatdesc->get_backend_descriptor())); + status = 
cudnn_frontend::set_attribute(m_operation.pointer->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_MATMUL_BDESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &(m_operation.bmatdesc->get_backend_descriptor())); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_operation, status, "CUDNN_BACKEND_OPERATION: SetAttribute CUDNN_ATTR_OPERATION_MATMUL_BDESC Failed"); return std::move(m_operation); } - status = cudnnBackendSetAttribute(m_operation.pointer->get_backend_descriptor(), - CUDNN_ATTR_OPERATION_MATMUL_CDESC, - CUDNN_TYPE_BACKEND_DESCRIPTOR, - 1, - &(m_operation.cmatdesc->get_backend_descriptor())); + status = cudnn_frontend::set_attribute(m_operation.pointer->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_MATMUL_CDESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &(m_operation.cmatdesc->get_backend_descriptor())); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_operation, status, "CUDNN_BACKEND_OPERATION: SetAttribute CUDNN_ATTR_OPERATION_MATMUL_CDESC Failed"); return std::move(m_operation); } #if (CUDNN_VERSION >= 8700) + NV_CUDNN_FE_DYNAMIC_CHECK_BACKEND_DESCRIPTOR( + 8700, m_operation, "CUDNN_BACKEND_OPERATION: M,N,K override Requires cudnn 8.7.0 and above"); if (m_operation.moverridedesc != nullptr) { - status = cudnnBackendSetAttribute(m_operation.pointer->get_backend_descriptor(), - CUDNN_ATTR_OPERATION_MATMUL_GEMM_M_OVERRIDE_DESC, - CUDNN_TYPE_BACKEND_DESCRIPTOR, - 1, - &(m_operation.moverridedesc->get_backend_descriptor())); + status = cudnn_frontend::set_attribute(m_operation.pointer->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_MATMUL_GEMM_M_OVERRIDE_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &(m_operation.moverridedesc->get_backend_descriptor())); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_operation, @@ -346,11 +340,11 @@ class OperationBuilder_v8 { } } if (m_operation.noverridedesc != nullptr) { - status = cudnnBackendSetAttribute(m_operation.pointer->get_backend_descriptor(), - CUDNN_ATTR_OPERATION_MATMUL_GEMM_N_OVERRIDE_DESC, - CUDNN_TYPE_BACKEND_DESCRIPTOR, - 1, - &(m_operation.noverridedesc->get_backend_descriptor())); + status = cudnn_frontend::set_attribute(m_operation.pointer->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_MATMUL_GEMM_N_OVERRIDE_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &(m_operation.noverridedesc->get_backend_descriptor())); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_operation, @@ -360,11 +354,11 @@ class OperationBuilder_v8 { } } if (m_operation.koverridedesc != nullptr) { - status = cudnnBackendSetAttribute(m_operation.pointer->get_backend_descriptor(), - CUDNN_ATTR_OPERATION_MATMUL_GEMM_K_OVERRIDE_DESC, - CUDNN_TYPE_BACKEND_DESCRIPTOR, - 1, - &(m_operation.koverridedesc->get_backend_descriptor())); + status = cudnn_frontend::set_attribute(m_operation.pointer->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_MATMUL_GEMM_K_OVERRIDE_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &(m_operation.koverridedesc->get_backend_descriptor())); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_operation, @@ -374,17 +368,17 @@ class OperationBuilder_v8 { } } #endif - status = cudnnBackendSetAttribute(m_operation.pointer->get_backend_descriptor(), - CUDNN_ATTR_OPERATION_MATMUL_DESC, - CUDNN_TYPE_BACKEND_DESCRIPTOR, - 1, - &(m_operation.matmuldesc->get_backend_descriptor())); + status = cudnn_frontend::set_attribute(m_operation.pointer->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_MATMUL_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + 
&(m_operation.matmuldesc->get_backend_descriptor())); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_operation, status, "CUDNN_BACKEND_OPERATION: SetAttribute CUDNN_ATTR_OPERATION_MATMUL_DESC Failed"); return std::move(m_operation); } - status = cudnnBackendFinalize(m_operation.pointer->get_backend_descriptor()); + status = cudnn_frontend::finalize(m_operation.pointer->get_backend_descriptor()); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception(&m_operation, status, "CUDNN_BACKEND_OPERATION: cudnnFinalize Failed"); return std::move(m_operation); @@ -399,11 +393,11 @@ class OperationBuilder_v8 { json j = m_operation.pointwise_mode; m_operation.operationTag = j; - status = cudnnBackendSetAttribute(m_operation.pointer->get_backend_descriptor(), - CUDNN_ATTR_OPERATION_POINTWISE_PW_DESCRIPTOR, - CUDNN_TYPE_BACKEND_DESCRIPTOR, - 1, - &(m_operation.pwdesc->get_backend_descriptor())); + status = cudnn_frontend::set_attribute(m_operation.pointer->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_POINTWISE_PW_DESCRIPTOR, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &(m_operation.pwdesc->get_backend_descriptor())); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_operation, @@ -412,11 +406,11 @@ class OperationBuilder_v8 { return std::move(m_operation); } - status = cudnnBackendSetAttribute(m_operation.pointer->get_backend_descriptor(), - CUDNN_ATTR_OPERATION_POINTWISE_XDESC, - CUDNN_TYPE_BACKEND_DESCRIPTOR, - 1, - &(m_operation.xdesc->get_backend_descriptor())); + status = cudnn_frontend::set_attribute(m_operation.pointer->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_POINTWISE_XDESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &(m_operation.xdesc->get_backend_descriptor())); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_operation, @@ -426,11 +420,11 @@ class OperationBuilder_v8 { } if (!m_operation.is_pointwise_activation_bwd_op) { - status = cudnnBackendSetAttribute(m_operation.pointer->get_backend_descriptor(), - CUDNN_ATTR_OPERATION_POINTWISE_YDESC, - CUDNN_TYPE_BACKEND_DESCRIPTOR, - 1, - &(m_operation.ydesc->get_backend_descriptor())); + status = cudnn_frontend::set_attribute(m_operation.pointer->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_POINTWISE_YDESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &(m_operation.ydesc->get_backend_descriptor())); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_operation, @@ -439,11 +433,11 @@ class OperationBuilder_v8 { return std::move(m_operation); } } else { - status = cudnnBackendSetAttribute(m_operation.pointer->get_backend_descriptor(), - CUDNN_ATTR_OPERATION_POINTWISE_DYDESC, - CUDNN_TYPE_BACKEND_DESCRIPTOR, - 1, - &(m_operation.dydesc->get_backend_descriptor())); + status = cudnn_frontend::set_attribute(m_operation.pointer->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_POINTWISE_DYDESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &(m_operation.dydesc->get_backend_descriptor())); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_operation, @@ -452,11 +446,11 @@ class OperationBuilder_v8 { return std::move(m_operation); } - status = cudnnBackendSetAttribute(m_operation.pointer->get_backend_descriptor(), - CUDNN_ATTR_OPERATION_POINTWISE_DXDESC, - CUDNN_TYPE_BACKEND_DESCRIPTOR, - 1, - &(m_operation.dxdesc->get_backend_descriptor())); + status = cudnn_frontend::set_attribute(m_operation.pointer->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_POINTWISE_DXDESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + 
&(m_operation.dxdesc->get_backend_descriptor())); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_operation, @@ -470,11 +464,11 @@ class OperationBuilder_v8 { : static_cast(&m_operation.alpha_d)); void *alpha2 = (m_operation.alphabetaType == CUDNN_TYPE_FLOAT ? static_cast(&m_operation.alpha2_s) : static_cast(&m_operation.alpha2_d)); - status = cudnnBackendSetAttribute(m_operation.pointer->get_backend_descriptor(), - CUDNN_ATTR_OPERATION_POINTWISE_ALPHA1, - m_operation.alphabetaType, - 1, - alpha); + status = cudnn_frontend::set_attribute(m_operation.pointer->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_POINTWISE_ALPHA1, + m_operation.alphabetaType, + 1, + alpha); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_operation, @@ -482,11 +476,11 @@ class OperationBuilder_v8 { "CUDNN_BACKEND_OPERATION: SetAttribute CUDNN_ATTR_OPERATION_POINTWISE_ALPHA1 Failed"); return std::move(m_operation); } - status = cudnnBackendSetAttribute(m_operation.pointer->get_backend_descriptor(), - CUDNN_ATTR_OPERATION_POINTWISE_ALPHA2, - m_operation.alphabetaType, - 1, - alpha2); + status = cudnn_frontend::set_attribute(m_operation.pointer->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_POINTWISE_ALPHA2, + m_operation.alphabetaType, + 1, + alpha2); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_operation, @@ -496,11 +490,11 @@ class OperationBuilder_v8 { } if (m_operation.pointwise_port_count >= 3 && !m_operation.is_pointwise_activation_bwd_op) { - status = cudnnBackendSetAttribute(m_operation.pointer->get_backend_descriptor(), - CUDNN_ATTR_OPERATION_POINTWISE_BDESC, - CUDNN_TYPE_BACKEND_DESCRIPTOR, - 1, - &(m_operation.bdesc->get_backend_descriptor())); + status = cudnn_frontend::set_attribute(m_operation.pointer->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_POINTWISE_BDESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &(m_operation.bdesc->get_backend_descriptor())); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_operation, @@ -510,13 +504,12 @@ class OperationBuilder_v8 { } } -#if (CUDNN_VERSION >= 8400) if (m_operation.pointwise_port_count == 4) { - status = cudnnBackendSetAttribute(m_operation.pointer->get_backend_descriptor(), - CUDNN_ATTR_OPERATION_POINTWISE_TDESC, - CUDNN_TYPE_BACKEND_DESCRIPTOR, - 1, - &(m_operation.tdesc->get_backend_descriptor())); + status = cudnn_frontend::set_attribute(m_operation.pointer->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_POINTWISE_TDESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &(m_operation.tdesc->get_backend_descriptor())); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_operation, @@ -525,9 +518,8 @@ class OperationBuilder_v8 { return std::move(m_operation); } } -#endif - status = cudnnBackendFinalize(m_operation.pointer->get_backend_descriptor()); + status = cudnn_frontend::finalize(m_operation.pointer->get_backend_descriptor()); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception(&m_operation, status, "CUDNN_BACKEND_OPERATION: cudnnFinalize Failed"); return std::move(m_operation); @@ -542,11 +534,11 @@ class OperationBuilder_v8 { auto status = CUDNN_STATUS_SUCCESS; auto dxdesc_ = m_operation.dxdesc != nullptr ? 
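//
// The alpha/alpha2 plumbing above uses one recurring pattern: each scalar is
// stored as both float and double, and alphabetaType picks which storage is
// handed to set_attribute. Isolated sketch of that selection (the helper
// name is ours, for illustration):
//
static const void *
pick_scalar(cudnnBackendAttributeType_t alphabeta_type, const float &value_s, const double &value_d) {
    // CUDNN_TYPE_FLOAT selects the float storage, anything else the double.
    return alphabeta_type == CUDNN_TYPE_FLOAT ? static_cast<const void *>(&value_s)
                                              : static_cast<const void *>(&value_d);
}
//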
m_operation.dxdesc : m_operation.xdesc; - status = cudnnBackendSetAttribute(m_operation.pointer->get_backend_descriptor(), - CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_DX, - CUDNN_TYPE_BACKEND_DESCRIPTOR, - 1, - &(dxdesc_->get_backend_descriptor())); + status = cudnn_frontend::set_attribute(m_operation.pointer->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_DX, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &(dxdesc_->get_backend_descriptor())); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_operation, @@ -555,11 +547,11 @@ class OperationBuilder_v8 { return std::move(m_operation); } - status = cudnnBackendSetAttribute(m_operation.pointer->get_backend_descriptor(), - CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_W, - CUDNN_TYPE_BACKEND_DESCRIPTOR, - 1, - &(m_operation.wdesc->get_backend_descriptor())); + status = cudnn_frontend::set_attribute(m_operation.pointer->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_W, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &(m_operation.wdesc->get_backend_descriptor())); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_operation, @@ -569,11 +561,11 @@ class OperationBuilder_v8 { } auto dydesc_ = m_operation.dydesc != nullptr ? m_operation.dydesc : m_operation.ydesc; - status = cudnnBackendSetAttribute(m_operation.pointer->get_backend_descriptor(), - CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_DY, - CUDNN_TYPE_BACKEND_DESCRIPTOR, - 1, - &(dydesc_->get_backend_descriptor())); + status = cudnn_frontend::set_attribute(m_operation.pointer->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_DY, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &(dydesc_->get_backend_descriptor())); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_operation, @@ -582,11 +574,11 @@ class OperationBuilder_v8 { return std::move(m_operation); } - status = cudnnBackendSetAttribute(m_operation.pointer->get_backend_descriptor(), - CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_CONV_DESC, - CUDNN_TYPE_BACKEND_DESCRIPTOR, - 1, - &(m_operation.cdesc->get_backend_descriptor())); + status = cudnn_frontend::set_attribute(m_operation.pointer->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_CONV_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &(m_operation.cdesc->get_backend_descriptor())); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_operation, @@ -599,11 +591,11 @@ class OperationBuilder_v8 { : static_cast(&m_operation.alpha_d)); void *beta = (m_operation.alphabetaType == CUDNN_TYPE_FLOAT ? 
static_cast(&m_operation.beta_s) : static_cast(&m_operation.beta_d)); - status = cudnnBackendSetAttribute(m_operation.pointer->get_backend_descriptor(), - CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_ALPHA, - m_operation.alphabetaType, - 1, - alpha); + status = cudnn_frontend::set_attribute(m_operation.pointer->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_ALPHA, + m_operation.alphabetaType, + 1, + alpha); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_operation, @@ -611,11 +603,11 @@ class OperationBuilder_v8 { "CUDNN_BACKEND_OPERATION: SetAttribute CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_ALPHA Failed"); return std::move(m_operation); } - status = cudnnBackendSetAttribute(m_operation.pointer->get_backend_descriptor(), - CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_BETA, - m_operation.alphabetaType, - 1, - beta); + status = cudnn_frontend::set_attribute(m_operation.pointer->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_BETA, + m_operation.alphabetaType, + 1, + beta); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_operation, @@ -624,7 +616,7 @@ class OperationBuilder_v8 { return std::move(m_operation); } - status = cudnnBackendFinalize(m_operation.pointer->get_backend_descriptor()); + status = cudnn_frontend::finalize(m_operation.pointer->get_backend_descriptor()); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception(&m_operation, status, "CUDNN_BACKEND_OPERATION: cudnnFinalize Failed"); return std::move(m_operation); @@ -645,7 +637,7 @@ class OperationBuilder_v8 { void const *ptr, cudnnBackendAttributeType_t type = CUDNN_TYPE_BACKEND_DESCRIPTOR, int64_t cnt = 1) { - status = cudnnBackendSetAttribute(operation.pointer->get_backend_descriptor(), attr, type, cnt, ptr); + status = cudnn_frontend::set_attribute(operation.pointer->get_backend_descriptor(), attr, type, cnt, ptr); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception(&operation, status, fail_msg); } @@ -829,7 +821,7 @@ class OperationBuilder_v8 { } } - status = cudnnBackendFinalize(m_operation.pointer->get_backend_descriptor()); + status = cudnn_frontend::finalize(m_operation.pointer->get_backend_descriptor()); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception(&m_operation, status, "CUDNN_BACKEND_OPERATION: cudnnFinalize Failed"); return std::move(m_operation); @@ -842,11 +834,11 @@ class OperationBuilder_v8 { m_operation.operationTag = "GenStats"; auto status = CUDNN_STATUS_SUCCESS; - status = cudnnBackendSetAttribute(m_operation.pointer->get_backend_descriptor(), - CUDNN_ATTR_OPERATION_GENSTATS_XDESC, - CUDNN_TYPE_BACKEND_DESCRIPTOR, - 1, - &(m_operation.xdesc->get_backend_descriptor())); + status = cudnn_frontend::set_attribute(m_operation.pointer->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_GENSTATS_XDESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &(m_operation.xdesc->get_backend_descriptor())); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_operation, @@ -855,11 +847,11 @@ class OperationBuilder_v8 { return std::move(m_operation); } - status = cudnnBackendSetAttribute(m_operation.pointer->get_backend_descriptor(), - CUDNN_ATTR_OPERATION_GENSTATS_SUMDESC, - CUDNN_TYPE_BACKEND_DESCRIPTOR, - 1, - &(m_operation.sumdesc->get_backend_descriptor())); + status = cudnn_frontend::set_attribute(m_operation.pointer->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_GENSTATS_SUMDESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &(m_operation.sumdesc->get_backend_descriptor())); 
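//
// The @@ -645 hunk above updates the status-capturing helper that the larger
// builders use so every attribute set stays a one-liner. Its shape, as
// implied by the surrounding diff (Operation_v8 internals elided):
//
//   auto set_attribute = [&status](Operation_v8 &operation,
//                                  cudnnBackendAttributeName_t attr,
//                                  char const *fail_msg,
//                                  void const *ptr,
//                                  cudnnBackendAttributeType_t type = CUDNN_TYPE_BACKEND_DESCRIPTOR,
//                                  int64_t cnt = 1) {
//       status = cudnn_frontend::set_attribute(operation.pointer->get_backend_descriptor(), attr, type, cnt, ptr);
//       if (status != CUDNN_STATUS_SUCCESS) {
//           set_error_and_throw_exception(&operation, status, fail_msg);
//       }
//   };
//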
if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_operation, @@ -868,11 +860,11 @@ class OperationBuilder_v8 { return std::move(m_operation); } - status = cudnnBackendSetAttribute(m_operation.pointer->get_backend_descriptor(), - CUDNN_ATTR_OPERATION_GENSTATS_SQSUMDESC, - CUDNN_TYPE_BACKEND_DESCRIPTOR, - 1, - &(m_operation.sqsumdesc->get_backend_descriptor())); + status = cudnn_frontend::set_attribute(m_operation.pointer->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_GENSTATS_SQSUMDESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &(m_operation.sqsumdesc->get_backend_descriptor())); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_operation, @@ -881,11 +873,11 @@ class OperationBuilder_v8 { return std::move(m_operation); } - status = cudnnBackendSetAttribute(m_operation.pointer->get_backend_descriptor(), - CUDNN_ATTR_OPERATION_GENSTATS_MODE, - CUDNN_TYPE_GENSTATS_MODE, - 1, - &(m_operation.genstats_mode)); + status = cudnn_frontend::set_attribute(m_operation.pointer->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_GENSTATS_MODE, + CUDNN_TYPE_GENSTATS_MODE, + 1, + &(m_operation.genstats_mode)); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_operation, @@ -894,11 +886,11 @@ class OperationBuilder_v8 { return std::move(m_operation); } - status = cudnnBackendSetAttribute(m_operation.pointer->get_backend_descriptor(), - CUDNN_ATTR_OPERATION_GENSTATS_MATH_PREC, - CUDNN_TYPE_DATA_TYPE, - 1, - &(m_operation.compute_type)); + status = cudnn_frontend::set_attribute(m_operation.pointer->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_GENSTATS_MATH_PREC, + CUDNN_TYPE_DATA_TYPE, + 1, + &(m_operation.compute_type)); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_operation, @@ -907,7 +899,7 @@ class OperationBuilder_v8 { return std::move(m_operation); } - status = cudnnBackendFinalize(m_operation.pointer->get_backend_descriptor()); + status = cudnn_frontend::finalize(m_operation.pointer->get_backend_descriptor()); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception(&m_operation, status, "CUDNN_BACKEND_OPERATION: cudnnFinalize Failed"); return std::move(m_operation); @@ -922,11 +914,11 @@ class OperationBuilder_v8 { auto status = CUDNN_STATUS_SUCCESS; - status = cudnnBackendSetAttribute(m_operation.pointer->get_backend_descriptor(), - CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_X, - CUDNN_TYPE_BACKEND_DESCRIPTOR, - 1, - &(m_operation.xdesc->get_backend_descriptor())); + status = cudnn_frontend::set_attribute(m_operation.pointer->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_X, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &(m_operation.xdesc->get_backend_descriptor())); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_operation, @@ -936,11 +928,11 @@ class OperationBuilder_v8 { } auto dwdesc_ = m_operation.dwdesc != nullptr ? 
m_operation.dwdesc : m_operation.wdesc; - status = cudnnBackendSetAttribute(m_operation.pointer->get_backend_descriptor(), - CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_DW, - CUDNN_TYPE_BACKEND_DESCRIPTOR, - 1, - &(dwdesc_->get_backend_descriptor())); + status = cudnn_frontend::set_attribute(m_operation.pointer->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_DW, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &(dwdesc_->get_backend_descriptor())); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_operation, @@ -950,11 +942,11 @@ class OperationBuilder_v8 { } auto dydesc_ = m_operation.dydesc != nullptr ? m_operation.dydesc : m_operation.ydesc; - status = cudnnBackendSetAttribute(m_operation.pointer->get_backend_descriptor(), - CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_DY, - CUDNN_TYPE_BACKEND_DESCRIPTOR, - 1, - &(dydesc_->get_backend_descriptor())); + status = cudnn_frontend::set_attribute(m_operation.pointer->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_DY, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &(dydesc_->get_backend_descriptor())); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_operation, @@ -963,11 +955,11 @@ class OperationBuilder_v8 { return std::move(m_operation); } - status = cudnnBackendSetAttribute(m_operation.pointer->get_backend_descriptor(), - CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_CONV_DESC, - CUDNN_TYPE_BACKEND_DESCRIPTOR, - 1, - &(m_operation.cdesc->get_backend_descriptor())); + status = cudnn_frontend::set_attribute(m_operation.pointer->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_CONV_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &(m_operation.cdesc->get_backend_descriptor())); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception(&m_operation, status, @@ -979,11 +971,11 @@ class OperationBuilder_v8 { : static_cast<void *>(&m_operation.alpha_d)); void *beta = (m_operation.alphabetaType == CUDNN_TYPE_FLOAT ?
static_cast<void *>(&m_operation.beta_s) : static_cast<void *>(&m_operation.beta_d)); - status = cudnnBackendSetAttribute(m_operation.pointer->get_backend_descriptor(), - CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_ALPHA, - m_operation.alphabetaType, - 1, - alpha); + status = cudnn_frontend::set_attribute(m_operation.pointer->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_ALPHA, + m_operation.alphabetaType, + 1, + alpha); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_operation, @@ -991,11 +983,11 @@ class OperationBuilder_v8 { "CUDNN_BACKEND_OPERATION: SetAttribute CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_ALPHA Failed"); return std::move(m_operation); } - status = cudnnBackendSetAttribute(m_operation.pointer->get_backend_descriptor(), - CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_BETA, - m_operation.alphabetaType, - 1, - beta); + status = cudnn_frontend::set_attribute(m_operation.pointer->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_BETA, + m_operation.alphabetaType, + 1, + beta); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_operation, @@ -1004,7 +996,7 @@ class OperationBuilder_v8 { return std::move(m_operation); } - status = cudnnBackendFinalize(m_operation.pointer->get_backend_descriptor()); + status = cudnn_frontend::finalize(m_operation.pointer->get_backend_descriptor()); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception(&m_operation, status, "CUDNN_BACKEND_OPERATION: cudnnFinalize Failed"); return std::move(m_operation); @@ -1016,7 +1008,6 @@ class OperationBuilder_v8 { Operation_v8 && build_norm_forward() { -#if (CUDNN_VERSION >= 8500) m_operation.operationTag = "Norm_Fwd"; auto status = CUDNN_STATUS_SUCCESS; @@ -1026,7 +1017,7 @@ class OperationBuilder_v8 { void const *ptr, cudnnBackendAttributeType_t type = CUDNN_TYPE_BACKEND_DESCRIPTOR, int64_t cnt = 1) { - status = cudnnBackendSetAttribute(operation.pointer->get_backend_descriptor(), attr, type, cnt, ptr); + status = cudnn_frontend::set_attribute(operation.pointer->get_backend_descriptor(), attr, type, cnt, ptr); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception(&operation, status, fail_msg); } @@ -1041,11 +1032,11 @@ class OperationBuilder_v8 { "CUDNN_BACKEND_OPERATION: SetAttribute CUDNN_ATTR_OPERATION_NORM_FWD_MODE Failed"); return std::move(m_operation); } - status = cudnnBackendSetAttribute(m_operation.pointer->get_backend_descriptor(), - CUDNN_ATTR_OPERATION_NORM_FWD_MODE, - CUDNN_TYPE_NORM_MODE, - 1, - &cudnn_norm_mode); + status = cudnn_frontend::set_attribute(m_operation.pointer->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_NORM_FWD_MODE, + CUDNN_TYPE_NORM_MODE, + 1, + &cudnn_norm_mode); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_operation, @@ -1063,11 +1054,11 @@ class OperationBuilder_v8 { "CUDNN_BACKEND_OPERATION: SetAttribute CUDNN_ATTR_OPERATION_NORM_FWD_PHASE Failed"); return std::move(m_operation); } - status = cudnnBackendSetAttribute(m_operation.pointer->get_backend_descriptor(), - CUDNN_ATTR_OPERATION_NORM_FWD_PHASE, - CUDNN_TYPE_NORM_FWD_PHASE, - 1, - &cudnn_norm_fwd_phase); + status = cudnn_frontend::set_attribute(m_operation.pointer->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_NORM_FWD_PHASE, + CUDNN_TYPE_NORM_FWD_PHASE, + 1, + &cudnn_norm_fwd_phase); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_operation, @@ -1193,24 +1184,17 @@ class OperationBuilder_v8 { return std::move(m_operation); } - status =
cudnnBackendFinalize(m_operation.pointer->get_backend_descriptor()); + status = cudnn_frontend::finalize(m_operation.pointer->get_backend_descriptor()); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception(&m_operation, status, "CUDNN_BACKEND_OPERATION: cudnnFinalize Failed"); return std::move(m_operation); } -#else - set_error_and_throw_exception( - &m_operation, - CUDNN_STATUS_NOT_SUPPORTED, - "CUDNN_BACKEND_OPERATION: Nomalization Forward operation Not supported in this version"); -#endif return std::move(m_operation); } Operation_v8 && build_norm_backward() { -#if (CUDNN_VERSION >= 8500) m_operation.operationTag = "Norm_Bwd"; auto status = CUDNN_STATUS_SUCCESS; @@ -1220,7 +1204,7 @@ class OperationBuilder_v8 { void const *ptr, cudnnBackendAttributeType_t type = CUDNN_TYPE_BACKEND_DESCRIPTOR, int64_t cnt = 1) { - status = cudnnBackendSetAttribute(operation.pointer->get_backend_descriptor(), attr, type, cnt, ptr); + status = cudnn_frontend::set_attribute(operation.pointer->get_backend_descriptor(), attr, type, cnt, ptr); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception(&operation, status, fail_msg); } @@ -1326,31 +1310,24 @@ class OperationBuilder_v8 { return std::move(m_operation); } - status = cudnnBackendFinalize(m_operation.pointer->get_backend_descriptor()); + status = cudnn_frontend::finalize(m_operation.pointer->get_backend_descriptor()); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception(&m_operation, status, "CUDNN_BACKEND_OPERATION: cudnnFinalize Failed"); return std::move(m_operation); } -#else - set_error_and_throw_exception( - &m_operation, - CUDNN_STATUS_NOT_SUPPORTED, - "CUDNN_BACKEND_OPERATION: Nomalization Backward operation Not supported in this version"); -#endif return std::move(m_operation); } Operation_v8 && build_resample_fwd_operation() { -#if (CUDNN_VERSION >= 8500) m_operation.operationTag = "Resample_fwd"; auto status = CUDNN_STATUS_SUCCESS; - status = cudnnBackendSetAttribute(m_operation.pointer->get_backend_descriptor(), - CUDNN_ATTR_OPERATION_RESAMPLE_FWD_XDESC, - CUDNN_TYPE_BACKEND_DESCRIPTOR, - 1, - &(m_operation.xdesc->get_backend_descriptor())); + status = cudnn_frontend::set_attribute(m_operation.pointer->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_RESAMPLE_FWD_XDESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &(m_operation.xdesc->get_backend_descriptor())); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_operation, @@ -1358,11 +1335,11 @@ class OperationBuilder_v8 { "CUDNN_BACKEND_OPERATION: SetAttribute CUDNN_ATTR_OPERATION_RESAMPLE_FWD_XDESC Failed"); return std::move(m_operation); } - status = cudnnBackendSetAttribute(m_operation.pointer->get_backend_descriptor(), - CUDNN_ATTR_OPERATION_RESAMPLE_FWD_YDESC, - CUDNN_TYPE_BACKEND_DESCRIPTOR, - 1, - &(m_operation.ydesc->get_backend_descriptor())); + status = cudnn_frontend::set_attribute(m_operation.pointer->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_RESAMPLE_FWD_YDESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &(m_operation.ydesc->get_backend_descriptor())); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_operation, @@ -1370,11 +1347,11 @@ class OperationBuilder_v8 { "CUDNN_BACKEND_OPERATION: SetAttribute CUDNN_ATTR_OPERATION_RESAMPLE_FWD_YDESC Failed"); return std::move(m_operation); } - status = cudnnBackendSetAttribute(m_operation.pointer->get_backend_descriptor(), - CUDNN_ATTR_OPERATION_RESAMPLE_FWD_ALPHA, - CUDNN_TYPE_DOUBLE, - 1, - &(m_operation.alpha_d)); + status = 
cudnn_frontend::set_attribute(m_operation.pointer->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_RESAMPLE_FWD_ALPHA, + CUDNN_TYPE_DOUBLE, + 1, + &(m_operation.alpha_d)); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_operation, @@ -1382,11 +1359,11 @@ class OperationBuilder_v8 { "CUDNN_BACKEND_OPERATION: SetAttribute CUDNN_ATTR_OPERATION_RESAMPLE_FWD_ALPHA Failed"); return std::move(m_operation); } - status = cudnnBackendSetAttribute(m_operation.pointer->get_backend_descriptor(), - CUDNN_ATTR_OPERATION_RESAMPLE_FWD_BETA, - CUDNN_TYPE_DOUBLE, - 1, - &(m_operation.beta_d)); + status = cudnn_frontend::set_attribute(m_operation.pointer->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_RESAMPLE_FWD_BETA, + CUDNN_TYPE_DOUBLE, + 1, + &(m_operation.beta_d)); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_operation, @@ -1394,11 +1371,11 @@ class OperationBuilder_v8 { "CUDNN_BACKEND_OPERATION: SetAttribute CUDNN_ATTR_OPERATION_RESAMPLE_FWD_BETA Failed"); return std::move(m_operation); } - status = cudnnBackendSetAttribute(m_operation.pointer->get_backend_descriptor(), - CUDNN_ATTR_OPERATION_RESAMPLE_FWD_DESC, - CUDNN_TYPE_BACKEND_DESCRIPTOR, - 1, - &(m_operation.resampledesc->get_backend_descriptor())); + status = cudnn_frontend::set_attribute(m_operation.pointer->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_RESAMPLE_FWD_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &(m_operation.resampledesc->get_backend_descriptor())); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_operation, @@ -1409,11 +1386,11 @@ class OperationBuilder_v8 { // Maxpooling forward if (m_operation.idxdesc != nullptr) { - status = cudnnBackendSetAttribute(m_operation.pointer->get_backend_descriptor(), - CUDNN_ATTR_OPERATION_RESAMPLE_FWD_IDXDESC, - CUDNN_TYPE_BACKEND_DESCRIPTOR, - 1, - &(m_operation.idxdesc->get_backend_descriptor())); + status = cudnn_frontend::set_attribute(m_operation.pointer->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_RESAMPLE_FWD_IDXDESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &(m_operation.idxdesc->get_backend_descriptor())); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_operation, @@ -1423,29 +1400,26 @@ class OperationBuilder_v8 { } } - status = cudnnBackendFinalize(m_operation.pointer->get_backend_descriptor()); + status = cudnn_frontend::finalize(m_operation.pointer->get_backend_descriptor()); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception(&m_operation, status, "CUDNN_BACKEND_OPERATION: cudnnFinalize Failed"); return std::move(m_operation); } -#else - set_error_and_throw_exception(&m_operation, - CUDNN_STATUS_NOT_SUPPORTED, - "CUDNN_BACKEND_OPERATION: Resample operation Not supported in this version"); -#endif return std::move(m_operation); } Operation_v8 && build_resample_bwd_operation() { #if (CUDNN_VERSION >= 8600) + NV_CUDNN_FE_DYNAMIC_CHECK_BACKEND_DESCRIPTOR( + 8600, m_operation, "CUDNN_BACKEND_OPERATION: Resample_bwd requires cudnn 8.6.0"); m_operation.operationTag = "Resample_bwd"; auto status = CUDNN_STATUS_SUCCESS; - status = cudnnBackendSetAttribute(m_operation.pointer->get_backend_descriptor(), - CUDNN_ATTR_OPERATION_RESAMPLE_BWD_DXDESC, - CUDNN_TYPE_BACKEND_DESCRIPTOR, - 1, - &(m_operation.dxdesc->get_backend_descriptor())); + status = cudnn_frontend::set_attribute(m_operation.pointer->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_RESAMPLE_BWD_DXDESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &(m_operation.dxdesc->get_backend_descriptor())); if (status != 
CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_operation, @@ -1455,11 +1429,15 @@ class OperationBuilder_v8 { } #if (CUDNN_VERSION >= 8700) if (m_operation.xdesc != nullptr) { - status = cudnnBackendSetAttribute(m_operation.pointer->get_backend_descriptor(), - CUDNN_ATTR_OPERATION_RESAMPLE_BWD_XDESC, - CUDNN_TYPE_BACKEND_DESCRIPTOR, - 1, - &(m_operation.xdesc->get_backend_descriptor())); + NV_CUDNN_FE_DYNAMIC_CHECK_BACKEND_DESCRIPTOR( + 8700, + m_operation, + "CUDNN_BACKEND_OPERATION: SetAttribute CUDNN_ATTR_OPERATION_RESAMPLE_BWD_XDESC requires cudnn 8.7.0"); + status = cudnn_frontend::set_attribute(m_operation.pointer->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_RESAMPLE_BWD_XDESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &(m_operation.xdesc->get_backend_descriptor())); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_operation, @@ -1469,25 +1447,29 @@ class OperationBuilder_v8 { } } if (m_operation.ydesc != nullptr) { - status = cudnnBackendSetAttribute(m_operation.pointer->get_backend_descriptor(), - CUDNN_ATTR_OPERATION_RESAMPLE_BWD_YDESC, - CUDNN_TYPE_BACKEND_DESCRIPTOR, - 1, - &(m_operation.ydesc->get_backend_descriptor())); + NV_CUDNN_FE_DYNAMIC_CHECK_BACKEND_DESCRIPTOR( + 8700, + m_operation, + "CUDNN_BACKEND_OPERATION: SetAttribute CUDNN_ATTR_OPERATION_RESAMPLE_BWD_YDESC requires cudnn 8.7.0"); + status = cudnn_frontend::set_attribute(m_operation.pointer->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_RESAMPLE_BWD_YDESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &(m_operation.ydesc->get_backend_descriptor())); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_operation, status, - "CUDNN_BACKEND_OPERATION: SetAttribute CUDNN_ATTR_OPERATION_RESAMPLE_BWD_DXDESC Failed"); + "CUDNN_BACKEND_OPERATION: SetAttribute CUDNN_ATTR_OPERATION_RESAMPLE_BWD_YDESC Failed"); return std::move(m_operation); } } #endif - status = cudnnBackendSetAttribute(m_operation.pointer->get_backend_descriptor(), - CUDNN_ATTR_OPERATION_RESAMPLE_BWD_DYDESC, - CUDNN_TYPE_BACKEND_DESCRIPTOR, - 1, - &(m_operation.dydesc->get_backend_descriptor())); + status = cudnn_frontend::set_attribute(m_operation.pointer->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_RESAMPLE_BWD_DYDESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &(m_operation.dydesc->get_backend_descriptor())); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_operation, @@ -1495,11 +1477,11 @@ class OperationBuilder_v8 { "CUDNN_BACKEND_OPERATION: SetAttribute CUDNN_ATTR_OPERATION_RESAMPLE_BWD_DYDESC Failed"); return std::move(m_operation); } - status = cudnnBackendSetAttribute(m_operation.pointer->get_backend_descriptor(), - CUDNN_ATTR_OPERATION_RESAMPLE_BWD_ALPHA, - CUDNN_TYPE_DOUBLE, - 1, - &(m_operation.alpha_d)); + status = cudnn_frontend::set_attribute(m_operation.pointer->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_RESAMPLE_BWD_ALPHA, + CUDNN_TYPE_DOUBLE, + 1, + &(m_operation.alpha_d)); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_operation, @@ -1507,11 +1489,11 @@ class OperationBuilder_v8 { "CUDNN_BACKEND_OPERATION: SetAttribute CUDNN_ATTR_OPERATION_RESAMPLE_BWD_ALPHA Failed"); return std::move(m_operation); } - status = cudnnBackendSetAttribute(m_operation.pointer->get_backend_descriptor(), - CUDNN_ATTR_OPERATION_RESAMPLE_BWD_BETA, - CUDNN_TYPE_DOUBLE, - 1, - &(m_operation.beta_d)); + status = cudnn_frontend::set_attribute(m_operation.pointer->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_RESAMPLE_BWD_BETA, + CUDNN_TYPE_DOUBLE, + 1, + 
&(m_operation.beta_d)); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_operation, @@ -1519,11 +1501,11 @@ class OperationBuilder_v8 { "CUDNN_BACKEND_OPERATION: SetAttribute CUDNN_ATTR_OPERATION_RESAMPLE_BWD_BETA Failed"); return std::move(m_operation); } - status = cudnnBackendSetAttribute(m_operation.pointer->get_backend_descriptor(), - CUDNN_ATTR_OPERATION_RESAMPLE_BWD_DESC, - CUDNN_TYPE_BACKEND_DESCRIPTOR, - 1, - &(m_operation.resampledesc->get_backend_descriptor())); + status = cudnn_frontend::set_attribute(m_operation.pointer->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_RESAMPLE_BWD_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &(m_operation.resampledesc->get_backend_descriptor())); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_operation, @@ -1534,11 +1516,11 @@ class OperationBuilder_v8 { // Maxpooling backward if (m_operation.idxdesc != nullptr) { - status = cudnnBackendSetAttribute(m_operation.pointer->get_backend_descriptor(), - CUDNN_ATTR_OPERATION_RESAMPLE_BWD_IDXDESC, - CUDNN_TYPE_BACKEND_DESCRIPTOR, - 1, - &(m_operation.idxdesc->get_backend_descriptor())); + status = cudnn_frontend::set_attribute(m_operation.pointer->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_RESAMPLE_BWD_IDXDESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &(m_operation.idxdesc->get_backend_descriptor())); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_operation, @@ -1548,7 +1530,7 @@ class OperationBuilder_v8 { } } - status = cudnnBackendFinalize(m_operation.pointer->get_backend_descriptor()); + status = cudnn_frontend::finalize(m_operation.pointer->get_backend_descriptor()); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception(&m_operation, status, "CUDNN_BACKEND_OPERATION: cudnnFinalize Failed"); return std::move(m_operation); @@ -1564,13 +1546,15 @@ class OperationBuilder_v8 { Operation_v8 && build_rng_operation() { #if (CUDNN_VERSION >= 8700) + NV_CUDNN_FE_DYNAMIC_CHECK_BACKEND_DESCRIPTOR( + 8700, m_operation, "CUDNN_BACKEND_OPERATION: build_rng_operation requires cudnn 8.7.0"); m_operation.operationTag = "Rng"; auto status = CUDNN_STATUS_SUCCESS; - status = cudnnBackendSetAttribute(m_operation.pointer->get_backend_descriptor(), - CUDNN_ATTR_OPERATION_RNG_YDESC, - CUDNN_TYPE_BACKEND_DESCRIPTOR, - 1, - &(m_operation.ydesc->get_backend_descriptor())); + status = cudnn_frontend::set_attribute(m_operation.pointer->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_RNG_YDESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &(m_operation.ydesc->get_backend_descriptor())); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_operation, status, "CUDNN_BACKEND_OPERATION: SetAttribute CUDNN_ATTR_OPERATION_RNG_YDESC Failed"); @@ -1581,11 +1565,15 @@ class OperationBuilder_v8 { // seed can be a tensor or an int64 // if tensor is defined we give it precedence if (m_operation.seeddesc) { - status = cudnnBackendSetAttribute(m_operation.pointer->get_backend_descriptor(), - CUDNN_ATTR_OPERATION_RNG_SEED, - CUDNN_TYPE_BACKEND_DESCRIPTOR, - 1, - &(m_operation.seeddesc->get_backend_descriptor())); + NV_CUDNN_FE_DYNAMIC_CHECK_BACKEND_DESCRIPTOR( + 8800, + m_operation, + "CUDNN_BACKEND_OPERATION: SetAttribute CUDNN_ATTR_OPERATION_RNG_SEED requires cudnn 8.8.0"); + status = cudnn_frontend::set_attribute(m_operation.pointer->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_RNG_SEED, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &(m_operation.seeddesc->get_backend_descriptor())); if (status != CUDNN_STATUS_SUCCESS) { 
set_error_and_throw_exception( &m_operation, status, "CUDNN_BACKEND_OPERATION: SetAttribute CUDNN_ATTR_OPERATION_RNG_SEED Failed"); @@ -1594,22 +1582,22 @@ class OperationBuilder_v8 { } else #endif { - status = cudnnBackendSetAttribute(m_operation.pointer->get_backend_descriptor(), - CUDNN_ATTR_OPERATION_RNG_SEED, - CUDNN_TYPE_INT64, - 1, - &(m_operation.seed)); + status = cudnn_frontend::set_attribute(m_operation.pointer->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_RNG_SEED, + CUDNN_TYPE_INT64, + 1, + &(m_operation.seed)); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_operation, status, "CUDNN_BACKEND_OPERATION: SetAttribute CUDNN_ATTR_OPERATION_RNG_SEED Failed"); return std::move(m_operation); } } - status = cudnnBackendSetAttribute(m_operation.pointer->get_backend_descriptor(), - CUDNN_ATTR_OPERATION_RNG_DESC, - CUDNN_TYPE_BACKEND_DESCRIPTOR, - 1, - &(m_operation.rngdesc->get_backend_descriptor())); + status = cudnn_frontend::set_attribute(m_operation.pointer->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_RNG_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &(m_operation.rngdesc->get_backend_descriptor())); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_operation, status, "CUDNN_BACKEND_OPERATION: SetAttribute CUDNN_ATTR_OPERATION_RNG_DESC Failed"); @@ -1618,11 +1606,15 @@ class OperationBuilder_v8 { #if (CUDNN_VERSION >= 8800) if (m_operation.offsetdesc) { - status = cudnnBackendSetAttribute(m_operation.pointer->get_backend_descriptor(), - CUDNN_ATTR_OPERATION_RNG_OFFSET_DESC, - CUDNN_TYPE_BACKEND_DESCRIPTOR, - 1, - &(m_operation.offsetdesc->get_backend_descriptor())); + NV_CUDNN_FE_DYNAMIC_CHECK_BACKEND_DESCRIPTOR( + 8800, + m_operation, + "CUDNN_BACKEND_OPERATION: SetAttribute CUDNN_ATTR_OPERATION_RNG_OFFSET_DESC requires cudnn 8.8.0"); + status = cudnn_frontend::set_attribute(m_operation.pointer->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_RNG_OFFSET_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &(m_operation.offsetdesc->get_backend_descriptor())); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_operation, @@ -1633,7 +1625,7 @@ class OperationBuilder_v8 { } #endif - status = cudnnBackendFinalize(m_operation.pointer->get_backend_descriptor()); + status = cudnn_frontend::finalize(m_operation.pointer->get_backend_descriptor()); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception(&m_operation, status, "CUDNN_BACKEND_OPERATION: cudnnFinalize Failed"); return std::move(m_operation); @@ -1649,13 +1641,15 @@ class OperationBuilder_v8 { Operation_v8 && build_reshape_operation() { #if (CUDNN_VERSION >= 8700) + NV_CUDNN_FE_DYNAMIC_CHECK_BACKEND_DESCRIPTOR( + 8700, m_operation, "CUDNN_BACKEND_OPERATION: build_reshape_operation requires cudnn 8.7.0"); m_operation.operationTag = "Reshape"; auto status = CUDNN_STATUS_SUCCESS; - status = cudnnBackendSetAttribute(m_operation.pointer->get_backend_descriptor(), - CUDNN_ATTR_OPERATION_RESHAPE_XDESC, - CUDNN_TYPE_BACKEND_DESCRIPTOR, - 1, - &(m_operation.xdesc->get_backend_descriptor())); + status = cudnn_frontend::set_attribute(m_operation.pointer->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_RESHAPE_XDESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &(m_operation.xdesc->get_backend_descriptor())); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_operation, @@ -1663,11 +1657,11 @@ class OperationBuilder_v8 { "CUDNN_BACKEND_OPERATION: SetAttribute CUDNN_ATTR_OPERATION_RESHAPE_XDESC Failed"); return std::move(m_operation); } - status 
= cudnnBackendSetAttribute(m_operation.pointer->get_backend_descriptor(), - CUDNN_ATTR_OPERATION_RESHAPE_YDESC, - CUDNN_TYPE_BACKEND_DESCRIPTOR, - 1, - &(m_operation.ydesc->get_backend_descriptor())); + status = cudnn_frontend::set_attribute(m_operation.pointer->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_RESHAPE_YDESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &(m_operation.ydesc->get_backend_descriptor())); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_operation, @@ -1675,7 +1669,7 @@ class OperationBuilder_v8 { "CUDNN_BACKEND_OPERATION: SetAttribute CUDNN_ATTR_OPERATION_RESHAPE_YDESC Failed"); return std::move(m_operation); } - status = cudnnBackendFinalize(m_operation.pointer->get_backend_descriptor()); + status = cudnn_frontend::finalize(m_operation.pointer->get_backend_descriptor()); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception(&m_operation, status, "CUDNN_BACKEND_OPERATION: cudnnFinalize Failed"); return std::move(m_operation); @@ -1690,15 +1684,14 @@ class OperationBuilder_v8 { Operation_v8 && build_bn_bwd_weight_op() { -#if (CUDNN_VERSION >= 8400) m_operation.operationTag = "Dgrad_Drelu_BN_Bwd"; auto status = CUDNN_STATUS_SUCCESS; - status = cudnnBackendSetAttribute(m_operation.pointer->get_backend_descriptor(), - CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_MATH_PREC, - CUDNN_TYPE_DATA_TYPE, - 1, - &(m_operation.compute_type)); + status = cudnn_frontend::set_attribute(m_operation.pointer->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_MATH_PREC, + CUDNN_TYPE_DATA_TYPE, + 1, + &(m_operation.compute_type)); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_operation, @@ -1713,7 +1706,7 @@ class OperationBuilder_v8 { void const *ptr, cudnnBackendAttributeType_t type = CUDNN_TYPE_BACKEND_DESCRIPTOR, int64_t cnt = 1) { - status = cudnnBackendSetAttribute(operation.pointer->get_backend_descriptor(), attr, type, cnt, ptr); + status = cudnn_frontend::set_attribute(operation.pointer->get_backend_descriptor(), attr, type, cnt, ptr); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception(&operation, status, fail_msg); } @@ -1814,17 +1807,11 @@ class OperationBuilder_v8 { if (status != CUDNN_STATUS_SUCCESS) { return std::move(m_operation); } - status = cudnnBackendFinalize(m_operation.pointer->get_backend_descriptor()); + status = cudnn_frontend::finalize(m_operation.pointer->get_backend_descriptor()); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception(&m_operation, status, "CUDNN_BACKEND_OPERATION: cudnnFinalize Failed"); return std::move(m_operation); } -#else - set_error_and_throw_exception( - &m_operation, - CUDNN_STATUS_NOT_SUPPORTED, - "CUDNN_BACKEND_OPERATION: Nomalization Backward operation Not supported in this version"); -#endif return std::move(m_operation); } @@ -1834,11 +1821,11 @@ class OperationBuilder_v8 { auto status = CUDNN_STATUS_SUCCESS; - status = cudnnBackendSetAttribute(m_operation.pointer->get_backend_descriptor(), - CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_X, - CUDNN_TYPE_BACKEND_DESCRIPTOR, - 1, - &(m_operation.xdesc->get_backend_descriptor())); + status = cudnn_frontend::set_attribute(m_operation.pointer->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_X, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &(m_operation.xdesc->get_backend_descriptor())); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_operation, @@ -1846,11 +1833,11 @@ class OperationBuilder_v8 { "CUDNN_BACKEND_OPERATION: SetAttribute 
CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_X Failed"); return std::move(m_operation); } - status = cudnnBackendSetAttribute(m_operation.pointer->get_backend_descriptor(), - CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_W, - CUDNN_TYPE_BACKEND_DESCRIPTOR, - 1, - &(m_operation.wdesc->get_backend_descriptor())); + status = cudnn_frontend::set_attribute(m_operation.pointer->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_W, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &(m_operation.wdesc->get_backend_descriptor())); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_operation, @@ -1858,11 +1845,11 @@ class OperationBuilder_v8 { "CUDNN_BACKEND_OPERATION: SetAttribute CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_W Failed"); return std::move(m_operation); } - status = cudnnBackendSetAttribute(m_operation.pointer->get_backend_descriptor(), - CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_Y, - CUDNN_TYPE_BACKEND_DESCRIPTOR, - 1, - &(m_operation.ydesc->get_backend_descriptor())); + status = cudnn_frontend::set_attribute(m_operation.pointer->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_Y, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &(m_operation.ydesc->get_backend_descriptor())); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_operation, @@ -1870,11 +1857,11 @@ class OperationBuilder_v8 { "CUDNN_BACKEND_OPERATION: SetAttribute CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_Y Failed"); return std::move(m_operation); } - status = cudnnBackendSetAttribute(m_operation.pointer->get_backend_descriptor(), - CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_CONV_DESC, - CUDNN_TYPE_BACKEND_DESCRIPTOR, - 1, - &(m_operation.cdesc->get_backend_descriptor())); + status = cudnn_frontend::set_attribute(m_operation.pointer->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_CONV_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + 1, + &(m_operation.cdesc->get_backend_descriptor())); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_operation, @@ -1886,11 +1873,11 @@ class OperationBuilder_v8 { : static_cast(&m_operation.alpha_d)); void *beta = (m_operation.alphabetaType == CUDNN_TYPE_FLOAT ? 
static_cast(&m_operation.beta_s) : static_cast(&m_operation.beta_d)); - status = cudnnBackendSetAttribute(m_operation.pointer->get_backend_descriptor(), - CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_ALPHA, - m_operation.alphabetaType, - 1, - alpha); + status = cudnn_frontend::set_attribute(m_operation.pointer->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_ALPHA, + m_operation.alphabetaType, + 1, + alpha); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_operation, @@ -1898,11 +1885,11 @@ class OperationBuilder_v8 { "CUDNN_BACKEND_OPERATION: SetAttribute CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_ALPHA Failed"); return std::move(m_operation); } - status = cudnnBackendSetAttribute(m_operation.pointer->get_backend_descriptor(), - CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_BETA, - m_operation.alphabetaType, - 1, - beta); + status = cudnn_frontend::set_attribute(m_operation.pointer->get_backend_descriptor(), + CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_BETA, + m_operation.alphabetaType, + 1, + beta); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_operation, @@ -1910,7 +1897,7 @@ class OperationBuilder_v8 { "CUDNN_BACKEND_OPERATION: SetAttribute CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_BETA Failed"); return std::move(m_operation); } - status = cudnnBackendFinalize(m_operation.pointer->get_backend_descriptor()); + status = cudnn_frontend::finalize(m_operation.pointer->get_backend_descriptor()); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception(&m_operation, status, "CUDNN_BACKEND_OPERATION: cudnnFinalize Failed"); return std::move(m_operation); @@ -2017,33 +2004,32 @@ class OperationBuilder_v8 { return CUDNN_STATUS_BAD_PARAM; } -#if (CUDNN_VERSION == 8500) - std::array x_dimensions; - int64_t dim_count; - status = cudnnBackendGetAttribute(m_operation.xdesc->get_backend_descriptor(), - CUDNN_ATTR_TENSOR_DIMENSIONS, - CUDNN_TYPE_INT64, - x_dimensions.size(), - &dim_count, - x_dimensions.data()); - if (status != CUDNN_STATUS_SUCCESS) { - msg = "CUDNN_BACKEND_OPERATION: CUDNN_BACKEND_TENSOR has invalid CUDNN_ATTR_TENSOR_DIMENSIONS"; - return status; - } + if (get_backend_version() == 8500) { + std::array x_dimensions; + int64_t dim_count; + status = cudnn_frontend::get_attribute(m_operation.xdesc->get_backend_descriptor(), + CUDNN_ATTR_TENSOR_DIMENSIONS, + CUDNN_TYPE_INT64, + x_dimensions.size(), + &dim_count, + x_dimensions.data()); + if (status != CUDNN_STATUS_SUCCESS) { + msg = "CUDNN_BACKEND_OPERATION: CUDNN_BACKEND_TENSOR has invalid CUDNN_ATTR_TENSOR_DIMENSIONS"; + return status; + } - int64_t N = x_dimensions[0]; - int64_t C = x_dimensions[1]; + int64_t N = x_dimensions[0]; + int64_t C = x_dimensions[1]; - if ((N != 1) || ((C % 8) != 0)) { - msg = "CUDNN_BACKEND_OPERATION: CUDNN_BACKEND_TENSOR has bad CUDNN_ATTR_TENSOR_DIMENSIONS"; - return CUDNN_STATUS_BAD_PARAM; + if ((N != 1) || ((C % 8) != 0)) { + msg = "CUDNN_BACKEND_OPERATION: CUDNN_BACKEND_TENSOR has bad CUDNN_ATTR_TENSOR_DIMENSIONS"; + return CUDNN_STATUS_BAD_PARAM; + } } -#endif return status; } -#if (CUDNN_VERSION >= 8500) cudnnStatus_t validate_resample_op(Message_t &msg) { if (m_operation.op_mode == DescriptorType_t::OPERATION_RESAMPLE_FWD_DESCRIPTOR) { @@ -2055,7 +2041,6 @@ class OperationBuilder_v8 { msg = "CUDNN_BACKEND_OPERATION: Check and Set the CUDNN_ATTR_OPERATION_RESAMPLE.*YDESC"; return CUDNN_STATUS_BAD_PARAM; } -#if (CUDNN_VERSION >= 8600) } else if (m_operation.op_mode == DescriptorType_t::OPERATION_RESAMPLE_BWD_DESCRIPTOR) { if 
(m_operation.dxdesc == nullptr) { msg = "CUDNN_BACKEND_OPERATION: Check and Set the CUDNN_ATTR_OPERATION_RESAMPLE.*DXDESC"; @@ -2065,7 +2050,6 @@ class OperationBuilder_v8 { msg = "CUDNN_BACKEND_OPERATION: Check and Set the CUDNN_ATTR_OPERATION_RESAMPLE.*DYDESC"; return CUDNN_STATUS_BAD_PARAM; } -#endif } if (m_operation.resampledesc == nullptr) { @@ -2075,7 +2059,6 @@ class OperationBuilder_v8 { return CUDNN_STATUS_SUCCESS; } -#endif cudnnStatus_t validate_rng_op(Message_t &msg) { @@ -2306,7 +2289,6 @@ class OperationBuilder_v8 { auto settDesc(Tensor_v8 const &tensor) -> OperationBuilder_v8 & { -#if (CUDNN_VERSION >= 8400) if (is_pointwise_op == false) { set_error_and_throw_exception( &m_operation, @@ -2314,12 +2296,6 @@ class OperationBuilder_v8 { "CUDNN_BACKEND_OPERATION_*_DESCRIPTOR: Non Pointwise operation does not need tTensor"); } m_operation.tdesc = tensor.get_desc(); -#else - CUDNN_FRONTEND_UNUSED(tensor); - set_error_and_throw_exception(&m_operation, - CUDNN_STATUS_NOT_SUPPORTED, - "CUDNN_BACKEND_OPERATION_*_DESCRIPTOR: tTensor Not supported in this version"); -#endif return *this; } @@ -2452,7 +2428,6 @@ class OperationBuilder_v8 { return *this; } -#if (CUDNN_VERSION >= 8500) // To be deprecated. Please use setNormalizationMode(cudnn_frontend::NormMode_t mode) instead. auto setNormalizationMode(cudnnBackendNormMode_t mode) -> OperationBuilder_v8 & { @@ -2466,7 +2441,6 @@ class OperationBuilder_v8 { detail::convert_from_cudnn_type(mode, m_operation.norm_fwd_phase); return *this; } -#endif auto setBNFinalizeMode(cudnnBnFinalizeStatsMode_t mode) -> OperationBuilder_v8 & { @@ -2707,7 +2681,6 @@ class OperationBuilder_v8 { m_operation.is_pointwise_math_op = ((m_operation.pointwise_mode == PointwiseMode_t::ADD) || (m_operation.pointwise_mode == PointwiseMode_t::MUL) || -#if (CUDNN_VERSION >= 8300) (m_operation.pointwise_mode == PointwiseMode_t::DIV) || (m_operation.pointwise_mode == PointwiseMode_t::SUB) || (m_operation.pointwise_mode == PointwiseMode_t::ADD_SQUARE) || @@ -2731,23 +2704,15 @@ class OperationBuilder_v8 { (m_operation.pointwise_mode == PointwiseMode_t::ABS) || (m_operation.pointwise_mode == PointwiseMode_t::CEIL) || (m_operation.pointwise_mode == PointwiseMode_t::FLOOR) || -#endif -#if (CUDNN_VERSION >= 8400) (m_operation.pointwise_mode == PointwiseMode_t::GEN_INDEX) || (m_operation.pointwise_mode == PointwiseMode_t::BINARY_SELECT) || -#endif -#if (CUDNN_VERSION >= 8500) (m_operation.pointwise_mode == PointwiseMode_t::ERF) || -#endif -#if (CUDNN_VERSION >= 8900) (m_operation.pointwise_mode == PointwiseMode_t::RECIPROCAL) || -#endif (m_operation.pointwise_mode == PointwiseMode_t::MIN) || (m_operation.pointwise_mode == PointwiseMode_t::MAX) || (m_operation.pointwise_mode == PointwiseMode_t::SQRT)); -#if (CUDNN_VERSION >= 8500) + m_operation.is_pointwise_identity_op = (m_operation.pointwise_mode == PointwiseMode_t::IDENTITY); -#endif m_operation.is_pointwise_activation_fwd_op = ((m_operation.pointwise_mode == PointwiseMode_t::RELU_FWD) || @@ -2755,13 +2720,9 @@ class OperationBuilder_v8 { (m_operation.pointwise_mode == PointwiseMode_t::SIGMOID_FWD) || (m_operation.pointwise_mode == PointwiseMode_t::ELU_FWD) || (m_operation.pointwise_mode == PointwiseMode_t::GELU_FWD) || -#if (CUDNN_VERSION >= 8500) (m_operation.pointwise_mode == PointwiseMode_t::GELU_APPROX_TANH_FWD) || -#endif (m_operation.pointwise_mode == PointwiseMode_t::SOFTPLUS_FWD) || -#if (CUDNN_VERSION >= 8300) (m_operation.pointwise_mode == PointwiseMode_t::EXP) || -#endif (m_operation.pointwise_mode == 
PointwiseMode_t::SWISH_FWD)); m_operation.is_pointwise_activation_bwd_op = @@ -2770,9 +2731,7 @@ class OperationBuilder_v8 { (m_operation.pointwise_mode == PointwiseMode_t::SIGMOID_BWD) || (m_operation.pointwise_mode == PointwiseMode_t::ELU_BWD) || (m_operation.pointwise_mode == PointwiseMode_t::GELU_BWD) || -#if (CUDNN_VERSION >= 8500) (m_operation.pointwise_mode == PointwiseMode_t::GELU_APPROX_TANH_BWD) || -#endif (m_operation.pointwise_mode == PointwiseMode_t::SOFTPLUS_BWD) || (m_operation.pointwise_mode == PointwiseMode_t::SWISH_BWD)); @@ -2893,12 +2852,10 @@ class OperationBuilder_v8 { status_ = CUDNN_STATUS_SUCCESS; } else if (is_bn_bwd_weight) { status_ = validate_bn_bwd_weight_op(msg); -#if (CUDNN_VERSION >= 8500) } else if (is_resample_fwd_op) { status_ = validate_resample_op(msg); } else if (is_resample_bwd_op) { status_ = validate_resample_op(msg); -#endif } else if (is_rng_op) { status_ = validate_rng_op(msg); } else if (is_norm_forward_op || is_norm_backward_op) { @@ -2948,28 +2905,20 @@ class OperationBuilder_v8 { return build_genstats_op(); } else if (m_operation.op_mode == DescriptorType_t::OPERATION_BN_FINALIZE_STATISTICS_DESCRIPTOR) { return build_bn_finalize_op(); -#if (CUDNN_VERSION >= 8400) } else if (m_operation.op_mode == DescriptorType_t::OPERATION_BN_BWD_WEIGHTS_DESCRIPTOR) { return build_bn_bwd_weight_op(); -#endif -#if (CUDNN_VERSION >= 8500) } else if (m_operation.op_mode == DescriptorType_t::OPERATION_RESAMPLE_FWD_DESCRIPTOR) { return build_resample_fwd_operation(); } else if (m_operation.op_mode == DescriptorType_t::OPERATION_NORM_FORWARD_DESCRIPTOR) { return build_norm_forward(); } else if (m_operation.op_mode == DescriptorType_t::OPERATION_NORM_BACKWARD_DESCRIPTOR) { return build_norm_backward(); -#endif -#if (CUDNN_VERSION >= 8600) } else if (m_operation.op_mode == DescriptorType_t::OPERATION_RESAMPLE_BWD_DESCRIPTOR) { return build_resample_bwd_operation(); -#endif -#if (CUDNN_VERSION >= 8700) } else if (m_operation.op_mode == DescriptorType_t::OPERATION_RNG_DESCRIPTOR) { return build_rng_operation(); } else if (m_operation.op_mode == DescriptorType_t::OPERATION_RESHAPE_DESCRIPTOR) { return build_reshape_operation(); -#endif } else { set_error_and_throw_exception( &m_operation, status, "CUDNN_BACKEND_OPERATION: unimplemented operation in frontend"); diff --git a/include/cudnn_frontend_OperationGraph.h b/include/cudnn_frontend_OperationGraph.h index 72a26f8..a6f5ef9 100644 --- a/include/cudnn_frontend_OperationGraph.h +++ b/include/cudnn_frontend_OperationGraph.h @@ -74,12 +74,12 @@ class OperationGraph_v8 : public BackendDescriptor { auto getEngineCount(void) const -> int64_t { int64_t global_count = -1; - auto status = cudnnBackendGetAttribute(pointer->get_backend_descriptor(), - CUDNN_ATTR_OPERATIONGRAPH_ENGINE_GLOBAL_COUNT, - CUDNN_TYPE_INT64, - 1, - nullptr, - &global_count); + auto status = cudnn_frontend::get_attribute(pointer->get_backend_descriptor(), + CUDNN_ATTR_OPERATIONGRAPH_ENGINE_GLOBAL_COUNT, + CUDNN_TYPE_INT64, + 1, + nullptr, + &global_count); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception(this, status, @@ -214,11 +214,11 @@ class OperationGraphBuilder_v8 { ops_raw[i] = m_operationGraph.ops[i]->get_backend_descriptor(); } - status = cudnnBackendSetAttribute(m_operationGraph.pointer->get_backend_descriptor(), - CUDNN_ATTR_OPERATIONGRAPH_OPS, - CUDNN_TYPE_BACKEND_DESCRIPTOR, - m_operationGraph.numOps, - ops_raw.data()); + status = cudnn_frontend::set_attribute(m_operationGraph.pointer->get_backend_descriptor(), + 
CUDNN_ATTR_OPERATIONGRAPH_OPS, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + m_operationGraph.numOps, + ops_raw.data()); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_operationGraph, @@ -226,11 +226,11 @@ class OperationGraphBuilder_v8 { "CUDNN_BACKEND_OPERATIONGRAPH_DESCRIPTOR: SetAttribute CUDNN_ATTR_OPERATIONGRAPH_OPS Failed"); return std::move(m_operationGraph); } - status = cudnnBackendSetAttribute(m_operationGraph.pointer->get_backend_descriptor(), - CUDNN_ATTR_OPERATIONGRAPH_HANDLE, - CUDNN_TYPE_HANDLE, - 1, - &m_operationGraph.handle); + status = cudnn_frontend::set_attribute(m_operationGraph.pointer->get_backend_descriptor(), + CUDNN_ATTR_OPERATIONGRAPH_HANDLE, + CUDNN_TYPE_HANDLE, + 1, + &m_operationGraph.handle); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_operationGraph, @@ -240,7 +240,7 @@ class OperationGraphBuilder_v8 { } // Finalizing the descriptor - status = cudnnBackendFinalize(m_operationGraph.pointer->get_backend_descriptor()); + status = cudnn_frontend::finalize(m_operationGraph.pointer->get_backend_descriptor()); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_operationGraph, status, "CUDNN_BACKEND_OPERATIONGRAPH_DESCRIPTOR: cudnnFinalize Failed"); diff --git a/include/cudnn_frontend_PointWiseDesc.h b/include/cudnn_frontend_PointWiseDesc.h index cde5019..4c6039e 100644 --- a/include/cudnn_frontend_PointWiseDesc.h +++ b/include/cudnn_frontend_PointWiseDesc.h @@ -92,9 +92,7 @@ class PointWiseDesc_v8 : public BackendDescriptor { double elu_alpha = 1.0; double softplus_beta = 1.0; double swish_beta = 1.0; -#if (CUDNN_VERSION >= 8400) - int64_t axis = -1; -#endif + int64_t axis = -1; }; //// @@ -191,9 +189,7 @@ class PointWiseDescBuilder_v8 { auto setAxis(int64_t axis_) -> PointWiseDescBuilder_v8 & { CUDNN_FRONTEND_UNUSED(axis_); -#if (CUDNN_VERSION >= 8400) m_pointWiseDesc.axis = axis_; -#endif return *this; } @@ -219,11 +215,11 @@ class PointWiseDescBuilder_v8 { "CUDNN_BACKEND_POINTWISE_DESCRIPTOR: SetAttribute CUDNN_TYPE_POINTWISE_MODE Failed"); return std::move(m_pointWiseDesc); } - status = cudnnBackendSetAttribute(m_pointWiseDesc.pointer->get_backend_descriptor(), - CUDNN_ATTR_POINTWISE_MODE, - CUDNN_TYPE_POINTWISE_MODE, - 1, - &cudnn_pointwise_mode); + status = cudnn_frontend::set_attribute(m_pointWiseDesc.pointer->get_backend_descriptor(), + CUDNN_ATTR_POINTWISE_MODE, + CUDNN_TYPE_POINTWISE_MODE, + 1, + &cudnn_pointwise_mode); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_pointWiseDesc, @@ -241,11 +237,11 @@ class PointWiseDescBuilder_v8 { "CUDNN_BACKEND_POINTWISE_DESCRIPTOR: SetAttribute CUDNN_ATTR_POINTWISE_MATH_PREC Failed"); return std::move(m_pointWiseDesc); } - status = cudnnBackendSetAttribute(m_pointWiseDesc.pointer->get_backend_descriptor(), - CUDNN_ATTR_POINTWISE_MATH_PREC, - CUDNN_TYPE_DATA_TYPE, - 1, - &cudnn_data_type); + status = cudnn_frontend::set_attribute(m_pointWiseDesc.pointer->get_backend_descriptor(), + CUDNN_ATTR_POINTWISE_MATH_PREC, + CUDNN_TYPE_DATA_TYPE, + 1, + &cudnn_data_type); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_pointWiseDesc, @@ -255,11 +251,11 @@ class PointWiseDescBuilder_v8 { } if (m_pointWiseDesc.mode == PointwiseMode_t::RELU_FWD || m_pointWiseDesc.mode == PointwiseMode_t::RELU_BWD) { - status = cudnnBackendSetAttribute(m_pointWiseDesc.pointer->get_backend_descriptor(), - CUDNN_ATTR_POINTWISE_NAN_PROPAGATION, - CUDNN_TYPE_NAN_PROPOGATION, - 1, - &m_pointWiseDesc.nan_propagation); + status = 
cudnn_frontend::set_attribute(m_pointWiseDesc.pointer->get_backend_descriptor(), + CUDNN_ATTR_POINTWISE_NAN_PROPAGATION, + CUDNN_TYPE_NAN_PROPOGATION, + 1, + &m_pointWiseDesc.nan_propagation); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_pointWiseDesc, @@ -268,11 +264,11 @@ class PointWiseDescBuilder_v8 { return std::move(m_pointWiseDesc); } - status = cudnnBackendSetAttribute(m_pointWiseDesc.pointer->get_backend_descriptor(), - CUDNN_ATTR_POINTWISE_RELU_LOWER_CLIP, - CUDNN_TYPE_DOUBLE, - 1, - &m_pointWiseDesc.lower_clip); + status = cudnn_frontend::set_attribute(m_pointWiseDesc.pointer->get_backend_descriptor(), + CUDNN_ATTR_POINTWISE_RELU_LOWER_CLIP, + CUDNN_TYPE_DOUBLE, + 1, + &m_pointWiseDesc.lower_clip); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_pointWiseDesc, @@ -284,18 +280,18 @@ class PointWiseDescBuilder_v8 { if (m_pointWiseDesc.compute_type == DataType_t::FLOAT) { double clamped_upper_clip = std::min(m_pointWiseDesc.upper_clip, std::numeric_limits::max()); - status = cudnnBackendSetAttribute(m_pointWiseDesc.pointer->get_backend_descriptor(), - CUDNN_ATTR_POINTWISE_RELU_UPPER_CLIP, - CUDNN_TYPE_DOUBLE, - 1, - &clamped_upper_clip); + status = cudnn_frontend::set_attribute(m_pointWiseDesc.pointer->get_backend_descriptor(), + CUDNN_ATTR_POINTWISE_RELU_UPPER_CLIP, + CUDNN_TYPE_DOUBLE, + 1, + &clamped_upper_clip); } else { - status = cudnnBackendSetAttribute(m_pointWiseDesc.pointer->get_backend_descriptor(), - CUDNN_ATTR_POINTWISE_RELU_UPPER_CLIP, - CUDNN_TYPE_DOUBLE, - 1, - &m_pointWiseDesc.upper_clip); + status = cudnn_frontend::set_attribute(m_pointWiseDesc.pointer->get_backend_descriptor(), + CUDNN_ATTR_POINTWISE_RELU_UPPER_CLIP, + CUDNN_TYPE_DOUBLE, + 1, + &m_pointWiseDesc.upper_clip); } if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( @@ -305,11 +301,11 @@ class PointWiseDescBuilder_v8 { return std::move(m_pointWiseDesc); } - status = cudnnBackendSetAttribute(m_pointWiseDesc.pointer->get_backend_descriptor(), - CUDNN_ATTR_POINTWISE_RELU_LOWER_CLIP_SLOPE, - CUDNN_TYPE_DOUBLE, - 1, - &m_pointWiseDesc.lower_clip_slope); + status = cudnn_frontend::set_attribute(m_pointWiseDesc.pointer->get_backend_descriptor(), + CUDNN_ATTR_POINTWISE_RELU_LOWER_CLIP_SLOPE, + CUDNN_TYPE_DOUBLE, + 1, + &m_pointWiseDesc.lower_clip_slope); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception(&m_pointWiseDesc, status, @@ -319,11 +315,11 @@ class PointWiseDescBuilder_v8 { } } else if (m_pointWiseDesc.mode == PointwiseMode_t::ELU_FWD || m_pointWiseDesc.mode == PointwiseMode_t::ELU_BWD) { - status = cudnnBackendSetAttribute(m_pointWiseDesc.pointer->get_backend_descriptor(), - CUDNN_ATTR_POINTWISE_ELU_ALPHA, - CUDNN_TYPE_DOUBLE, - 1, - &m_pointWiseDesc.elu_alpha); + status = cudnn_frontend::set_attribute(m_pointWiseDesc.pointer->get_backend_descriptor(), + CUDNN_ATTR_POINTWISE_ELU_ALPHA, + CUDNN_TYPE_DOUBLE, + 1, + &m_pointWiseDesc.elu_alpha); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_pointWiseDesc, @@ -333,11 +329,11 @@ class PointWiseDescBuilder_v8 { } } else if (m_pointWiseDesc.mode == PointwiseMode_t::SOFTPLUS_FWD || m_pointWiseDesc.mode == PointwiseMode_t::SOFTPLUS_BWD) { - status = cudnnBackendSetAttribute(m_pointWiseDesc.pointer->get_backend_descriptor(), - CUDNN_ATTR_POINTWISE_SOFTPLUS_BETA, - CUDNN_TYPE_DOUBLE, - 1, - &m_pointWiseDesc.softplus_beta); + status = cudnn_frontend::set_attribute(m_pointWiseDesc.pointer->get_backend_descriptor(), + CUDNN_ATTR_POINTWISE_SOFTPLUS_BETA, + 
CUDNN_TYPE_DOUBLE, + 1, + &m_pointWiseDesc.softplus_beta); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_pointWiseDesc, @@ -347,11 +343,11 @@ class PointWiseDescBuilder_v8 { } } else if (m_pointWiseDesc.mode == PointwiseMode_t::SWISH_FWD || m_pointWiseDesc.mode == PointwiseMode_t::SWISH_BWD) { - status = cudnnBackendSetAttribute(m_pointWiseDesc.pointer->get_backend_descriptor(), - CUDNN_ATTR_POINTWISE_SWISH_BETA, - CUDNN_TYPE_DOUBLE, - 1, - &m_pointWiseDesc.swish_beta); + status = cudnn_frontend::set_attribute(m_pointWiseDesc.pointer->get_backend_descriptor(), + CUDNN_ATTR_POINTWISE_SWISH_BETA, + CUDNN_TYPE_DOUBLE, + 1, + &m_pointWiseDesc.swish_beta); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_pointWiseDesc, @@ -359,14 +355,12 @@ class PointWiseDescBuilder_v8 { "CUDNN_BACKEND_POINTWISE_DESCRIPTOR: SetAttribute CUDNN_ATTR_POINTWISE_SWISH_BETA, Failed"); return std::move(m_pointWiseDesc); } - } -#if (CUDNN_VERSION >= 8400) - else if (m_pointWiseDesc.mode == PointwiseMode_t::GEN_INDEX) { - status = cudnnBackendSetAttribute(m_pointWiseDesc.pointer->get_backend_descriptor(), - CUDNN_ATTR_POINTWISE_AXIS, - CUDNN_TYPE_INT64, - 1, - &m_pointWiseDesc.axis); + } else if (m_pointWiseDesc.mode == PointwiseMode_t::GEN_INDEX) { + status = cudnn_frontend::set_attribute(m_pointWiseDesc.pointer->get_backend_descriptor(), + CUDNN_ATTR_POINTWISE_AXIS, + CUDNN_TYPE_INT64, + 1, + &m_pointWiseDesc.axis); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_pointWiseDesc, @@ -375,10 +369,9 @@ class PointWiseDescBuilder_v8 { return std::move(m_pointWiseDesc); } } -#endif // Finalizing the descriptor - status = cudnnBackendFinalize(m_pointWiseDesc.pointer->get_backend_descriptor()); + status = cudnn_frontend::finalize(m_pointWiseDesc.pointer->get_backend_descriptor()); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_pointWiseDesc, status, "CUDNN_BACKEND_POINTWISE_DESCRIPTOR: cudnnFinalize Failed"); diff --git a/include/cudnn_frontend_ReductionDesc.h b/include/cudnn_frontend_ReductionDesc.h index b4429ed..3328702 100644 --- a/include/cudnn_frontend_ReductionDesc.h +++ b/include/cudnn_frontend_ReductionDesc.h @@ -130,11 +130,11 @@ class ReductionDescBuilder_v8 { "CUDNN_BACKEND_REDUCTION_DESCRIPTOR: SetAttribute CUDNN_ATTR_REDUCTION_COMP_TYPE Failed"); return std::move(m_reductionDesc); } - status = cudnnBackendSetAttribute(m_reductionDesc.pointer->get_backend_descriptor(), - CUDNN_ATTR_REDUCTION_COMP_TYPE, - CUDNN_TYPE_DATA_TYPE, - 1, - &cudnn_data_type); + status = cudnn_frontend::set_attribute(m_reductionDesc.pointer->get_backend_descriptor(), + CUDNN_ATTR_REDUCTION_COMP_TYPE, + CUDNN_TYPE_DATA_TYPE, + 1, + &cudnn_data_type); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_reductionDesc, @@ -152,11 +152,11 @@ class ReductionDescBuilder_v8 { "CUDNN_BACKEND_REDUCTION_DESCRIPTOR: SetAttribute CUDNN_ATTR_REDUCTION_OPERATOR Failed"); return std::move(m_reductionDesc); } - status = cudnnBackendSetAttribute(m_reductionDesc.pointer->get_backend_descriptor(), - CUDNN_ATTR_REDUCTION_OPERATOR, - CUDNN_TYPE_REDUCTION_OPERATOR_TYPE, - 1, - &cudnn_reduction_mode); + status = cudnn_frontend::set_attribute(m_reductionDesc.pointer->get_backend_descriptor(), + CUDNN_ATTR_REDUCTION_OPERATOR, + CUDNN_TYPE_REDUCTION_OPERATOR_TYPE, + 1, + &cudnn_reduction_mode); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_reductionDesc, @@ -166,7 +166,7 @@ class ReductionDescBuilder_v8 { } // Finalizing 
the descriptor - status = cudnnBackendFinalize(m_reductionDesc.pointer->get_backend_descriptor()); + status = cudnn_frontend::finalize(m_reductionDesc.pointer->get_backend_descriptor()); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_reductionDesc, status, "CUDNN_BACKEND_REDUCTION_DESCRIPTOR: cudnnFinalize Failed"); diff --git a/include/cudnn_frontend_Reorder_Tensor.h b/include/cudnn_frontend_Reorder_Tensor.h index d0bad99..a5a8c8d 100644 --- a/include/cudnn_frontend_Reorder_Tensor.h +++ b/include/cudnn_frontend_Reorder_Tensor.h @@ -54,7 +54,7 @@ cudnnReorderFilterAndBiasInt8x32(cudnnHandle_t handle, cudnnFilterDescriptor_t filterDesc = nullptr; - cudnn_status = cudnnCreateFilterDescriptor(&filterDesc); + cudnn_status = create_filter_desc_v7(&filterDesc); if (cudnn_status != CUDNN_STATUS_SUCCESS) { return cudnn_status; } @@ -81,8 +81,8 @@ cudnnReorderFilterAndBiasInt8x32(cudnnHandle_t handle, filter_dims_[4] = static_cast<int>((non_shape_dims == 2) ? filter_dims[4] : filter_dims[5]); // w } - cudnn_status = cudnnSetFilterNdDescriptor( - filterDesc, CUDNN_DATA_INT8x32, CUDNN_TENSOR_NCHW_VECT_C, conv_dims + 2, filter_dims_); + cudnn_status = + set_ndfilter_desc_v7(filterDesc, CUDNN_DATA_INT8x32, CUDNN_TENSOR_NCHW_VECT_C, conv_dims + 2, filter_dims_); if (cudnn_status != CUDNN_STATUS_SUCCESS) { return cudnn_status; @@ -90,16 +90,16 @@ cudnnReorderFilterAndBiasInt8x32(cudnnHandle_t handle, int reorderBias = (dev_bias_ptr != nullptr); - cudnn_status = cudnnReorderFilterAndBias(handle, - filterDesc, - CUDNN_DEFAULT_REORDER, - dev_filter_ptr, - reordered_filter_ptr, - reorderBias, - dev_bias_ptr, - reordered_bias_ptr); + cudnn_status = reorder_filter_bias(handle, + filterDesc, + CUDNN_DEFAULT_REORDER, + dev_filter_ptr, + reordered_filter_ptr, + reorderBias, + dev_bias_ptr, + reordered_bias_ptr); - cudnnDestroyFilterDescriptor(filterDesc); + destroy_filter(filterDesc); return cudnn_status; } } // namespace cudnn_frontend diff --git a/include/cudnn_frontend_Resample.h b/include/cudnn_frontend_Resample.h index cefa683..bdcc9e4 100644 --- a/include/cudnn_frontend_Resample.h +++ b/include/cudnn_frontend_Resample.h @@ -47,7 +47,6 @@ class ResampleDesc_v8 : public BackendDescriptor { std::string describe() const override { std::stringstream ss; -#if (CUDNN_VERSION >= 8500) char sep = ','; ss << "CUDNN_BACKEND_RESAMPLE_DESCRIPTOR: " << "Compute Type: " << json{computeType} << ", Resample Mode: " << json{resample_mode} @@ -73,7 +72,6 @@ class ResampleDesc_v8 : public BackendDescriptor { ss << '(' << stride[i].numerator << sep << stride[i].denominator << ')' << sep; } ss << "]"; -#endif return ss.str(); } @@ -113,8 +111,6 @@ class ResampleDesc_v8 : public BackendDescriptor { return padding_mode; } -#if (CUDNN_VERSION >= 8500) - cudnnFraction_t const * getSpatialStride() const { return stride; } @@ -134,7 +130,6 @@ class ResampleDesc_v8 : public BackendDescriptor { getWindowDim() const { return windowDim; } -#endif /** @} */ private: @@ -151,13 +146,11 @@ class ResampleDesc_v8 : public BackendDescriptor { int64_t spatialDim = 0; -#if (CUDNN_VERSION >= 8500) // Shape attributes - cudnnFraction_t windowDim[CUDNN_DIM_MAX] = {{0, 1}, { 0, 1 }}; - cudnnFraction_t prePadding[CUDNN_DIM_MAX] = {{0, 1}, { 0, 1 }}; - cudnnFraction_t postPadding[CUDNN_DIM_MAX] = {{0, 1}, { 0, 1 }}; - cudnnFraction_t stride[CUDNN_DIM_MAX] = {{0, 1}, { 0, 1 }}; -#endif + cudnnFraction_t windowDim[CUDNN_DIM_MAX] = {{0, 1}, {0, 1}}; + cudnnFraction_t prePadding[CUDNN_DIM_MAX] = {{0, 1}, {0, 1}}; + cudnnFraction_t
postPadding[CUDNN_DIM_MAX] = {{0, 1}, {0, 1}}; + cudnnFraction_t stride[CUDNN_DIM_MAX] = {{0, 1}, {0, 1}}; }; /// @@ -189,7 +182,6 @@ class ResampleDescBuilder_v8 { return *this; } -#if CUDNN_VERSION >= 8500 //! (Overloaded) Set post padding for the Resample Operation with cudnnFraction_t auto setPostPadding(int64_t count, cudnnFraction_t const *arr) -> ResampleDescBuilder_v8 & { @@ -237,7 +229,6 @@ class ResampleDescBuilder_v8 { detail::convert_from_cudnn_type(padding_mode, m_resampleDesc.padding_mode); return *this; } -#endif //! Set padding mode for the Resample Operation auto @@ -256,73 +247,45 @@ class ResampleDescBuilder_v8 { //! (Overloaded) Set post padding for the Resample Operation with int64_t auto setPostPadding(int64_t count, int64_t const *arr) -> ResampleDescBuilder_v8 & { -#if CUDNN_VERSION < 8500 - CUDNN_FRONTEND_UNUSED(count); - CUDNN_FRONTEND_UNUSED(arr); - set_error_and_throw_exception( - &m_resampleDesc, CUDNN_STATUS_NOT_SUPPORTED, "CUDNN_BACKEND_RESAMPLE_DESCRIPTOR setPostPadding failed"); -#else // TODO: check the provided array count against the stored spatial dimension count. for (int i = 0; i < count; i++) { m_resampleDesc.postPadding[i].numerator = arr[i]; m_resampleDesc.postPadding[i].denominator = 1; } -#endif return *this; } //! (Overloaded) Set pre padding for the Resample Operation with int64_t auto setPrePadding(int64_t count, int64_t const *arr) -> ResampleDescBuilder_v8 & { -#if CUDNN_VERSION < 8500 - CUDNN_FRONTEND_UNUSED(count); - CUDNN_FRONTEND_UNUSED(arr); - set_error_and_throw_exception( - &m_resampleDesc, CUDNN_STATUS_NOT_SUPPORTED, "CUDNN_BACKEND_RESAMPLE_DESCRIPTOR setPrePadding failed"); -#else // TODO: check the provided array count against the stored spatial dimension count. for (int i = 0; i < count; i++) { m_resampleDesc.prePadding[i].numerator = arr[i]; m_resampleDesc.prePadding[i].denominator = 1; } -#endif return *this; } //! (Overloaded) Set stride for the Resample Operation with int64_t auto setSpatialStride(int64_t count, int64_t const *arr) -> ResampleDescBuilder_v8 & { -#if CUDNN_VERSION < 8500 - CUDNN_FRONTEND_UNUSED(count); - CUDNN_FRONTEND_UNUSED(arr); - set_error_and_throw_exception( - &m_resampleDesc, CUDNN_STATUS_NOT_SUPPORTED, "CUDNN_BACKEND_RESAMPLE_DESCRIPTOR setSpatialStride failed"); -#else // TODO: check the provided array count against the stored spatial dimension count. for (int i = 0; i < count; i++) { m_resampleDesc.stride[i].numerator = arr[i]; m_resampleDesc.stride[i].denominator = 1; } -#endif return *this; } //! (Overloaded) Set window dim for the Resample Operation with int64_t auto setSpatialDim(int64_t count, int64_t const *arr) -> ResampleDescBuilder_v8 & { -#if CUDNN_VERSION < 8500 - CUDNN_FRONTEND_UNUSED(count); - CUDNN_FRONTEND_UNUSED(arr); - set_error_and_throw_exception( - &m_resampleDesc, CUDNN_STATUS_NOT_SUPPORTED, "CUDNN_BACKEND_RESAMPLE_DESCRIPTOR setSpatialDim failed"); -#else // TODO: check the provided array count against the stored spatial dimension count. m_resampleDesc.spatialDim = count; for (int i = 0; i < count; i++) { m_resampleDesc.windowDim[i].numerator = arr[i]; m_resampleDesc.windowDim[i].denominator = 1; } -#endif return *this; } @@ -332,7 +295,6 @@ class ResampleDescBuilder_v8 { //! Throws the appropriate error message ResampleDesc_v8 && build() { -#if (CUDNN_VERSION >= 8500) // Sanity check if non-default fields have been set correctly. 
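// A usage sketch for the int64_t setter overloads above, now compiled
// unconditionally; the sanity check that follows in build() validates exactly
// the fields set here. Values are illustrative (a 2-D resample); the
// resample-mode and compute-type setters sit in a part of this header outside
// these hunks, and get_status() is assumed to be the usual BackendDescriptor
// status accessor:
int64_t window[]  = {2, 2};
int64_t strides[] = {2, 2};
int64_t padding[] = {0, 0};
auto pool_desc = cudnn_frontend::ResampleDescBuilder_v8()
                     .setSpatialDim(2, window)
                     .setSpatialStride(2, strides)
                     .setPrePadding(2, padding)
                     .setPostPadding(2, padding)
                     .build();
if (pool_desc.get_status() != CUDNN_STATUS_SUCCESS) {
    // With exceptions disabled, the failure is reported here instead of thrown.
}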
if (m_resampleDesc.spatialDim < 0) { set_error_and_throw_exception(&m_resampleDesc, @@ -359,11 +321,11 @@ class ResampleDescBuilder_v8 { "CUDNN_BACKEND_RESAMPLE_DESCRIPTOR: SetAttribute CUDNN_ATTR_RESAMPLE_MODE Failed"); return std::move(m_resampleDesc); } - status = cudnnBackendSetAttribute(m_resampleDesc.pointer->get_backend_descriptor(), - CUDNN_ATTR_RESAMPLE_MODE, - CUDNN_TYPE_RESAMPLE_MODE, - 1, - &cudnn_resample_mode); + status = cudnn_frontend::set_attribute(m_resampleDesc.pointer->get_backend_descriptor(), + CUDNN_ATTR_RESAMPLE_MODE, + CUDNN_TYPE_RESAMPLE_MODE, + 1, + &cudnn_resample_mode); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_resampleDesc, @@ -381,11 +343,11 @@ class ResampleDescBuilder_v8 { "CUDNN_BACKEND_RESAMPLE_DESCRIPTOR: SetAttribute CUDNN_ATTR_RESAMPLE_COMP_TYPE Failed"); return std::move(m_resampleDesc); } - status = cudnnBackendSetAttribute(m_resampleDesc.pointer->get_backend_descriptor(), - CUDNN_ATTR_RESAMPLE_COMP_TYPE, - CUDNN_TYPE_DATA_TYPE, - 1, - &cudnn_data_type); + status = cudnn_frontend::set_attribute(m_resampleDesc.pointer->get_backend_descriptor(), + CUDNN_ATTR_RESAMPLE_COMP_TYPE, + CUDNN_TYPE_DATA_TYPE, + 1, + &cudnn_data_type); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_resampleDesc, @@ -394,11 +356,11 @@ class ResampleDescBuilder_v8 { return std::move(m_resampleDesc); } - status = cudnnBackendSetAttribute(m_resampleDesc.pointer->get_backend_descriptor(), - CUDNN_ATTR_RESAMPLE_NAN_PROPAGATION, - CUDNN_TYPE_NAN_PROPOGATION, - 1, - &(m_resampleDesc.nanOpt)); + status = cudnn_frontend::set_attribute(m_resampleDesc.pointer->get_backend_descriptor(), + CUDNN_ATTR_RESAMPLE_NAN_PROPAGATION, + CUDNN_TYPE_NAN_PROPOGATION, + 1, + &(m_resampleDesc.nanOpt)); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_resampleDesc, @@ -416,11 +378,11 @@ class ResampleDescBuilder_v8 { "CUDNN_BACKEND_RESAMPLE_DESCRIPTOR: SetAttribute CUDNN_ATTR_RESAMPLE_PADDING_MODE Failed"); return std::move(m_resampleDesc); } - status = cudnnBackendSetAttribute(m_resampleDesc.pointer->get_backend_descriptor(), - CUDNN_ATTR_RESAMPLE_PADDING_MODE, - CUDNN_TYPE_PADDING_MODE, - 1, - &cudnn_padding_mode); + status = cudnn_frontend::set_attribute(m_resampleDesc.pointer->get_backend_descriptor(), + CUDNN_ATTR_RESAMPLE_PADDING_MODE, + CUDNN_TYPE_PADDING_MODE, + 1, + &cudnn_padding_mode); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_resampleDesc, @@ -429,11 +391,11 @@ class ResampleDescBuilder_v8 { return std::move(m_resampleDesc); } - status = cudnnBackendSetAttribute(m_resampleDesc.pointer->get_backend_descriptor(), - CUDNN_ATTR_RESAMPLE_SPATIAL_DIMS, - CUDNN_TYPE_INT64, - 1, - &(m_resampleDesc.spatialDim)); + status = cudnn_frontend::set_attribute(m_resampleDesc.pointer->get_backend_descriptor(), + CUDNN_ATTR_RESAMPLE_SPATIAL_DIMS, + CUDNN_TYPE_INT64, + 1, + &(m_resampleDesc.spatialDim)); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_resampleDesc, @@ -442,11 +404,11 @@ class ResampleDescBuilder_v8 { return std::move(m_resampleDesc); } - status = cudnnBackendSetAttribute(m_resampleDesc.pointer->get_backend_descriptor(), - CUDNN_ATTR_RESAMPLE_WINDOW_DIMS, - CUDNN_TYPE_FRACTION, - m_resampleDesc.spatialDim, - m_resampleDesc.windowDim); + status = cudnn_frontend::set_attribute(m_resampleDesc.pointer->get_backend_descriptor(), + CUDNN_ATTR_RESAMPLE_WINDOW_DIMS, + CUDNN_TYPE_FRACTION, + m_resampleDesc.spatialDim, + m_resampleDesc.windowDim); if (status != 
CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_resampleDesc, @@ -455,11 +417,11 @@ class ResampleDescBuilder_v8 { return std::move(m_resampleDesc); } - status = cudnnBackendSetAttribute(m_resampleDesc.pointer->get_backend_descriptor(), - CUDNN_ATTR_RESAMPLE_PRE_PADDINGS, - CUDNN_TYPE_FRACTION, - m_resampleDesc.spatialDim, - m_resampleDesc.prePadding); + status = cudnn_frontend::set_attribute(m_resampleDesc.pointer->get_backend_descriptor(), + CUDNN_ATTR_RESAMPLE_PRE_PADDINGS, + CUDNN_TYPE_FRACTION, + m_resampleDesc.spatialDim, + m_resampleDesc.prePadding); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_resampleDesc, @@ -468,11 +430,11 @@ class ResampleDescBuilder_v8 { return std::move(m_resampleDesc); } - status = cudnnBackendSetAttribute(m_resampleDesc.pointer->get_backend_descriptor(), - CUDNN_ATTR_RESAMPLE_POST_PADDINGS, - CUDNN_TYPE_FRACTION, - m_resampleDesc.spatialDim, - m_resampleDesc.postPadding); + status = cudnn_frontend::set_attribute(m_resampleDesc.pointer->get_backend_descriptor(), + CUDNN_ATTR_RESAMPLE_POST_PADDINGS, + CUDNN_TYPE_FRACTION, + m_resampleDesc.spatialDim, + m_resampleDesc.postPadding); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_resampleDesc, @@ -481,11 +443,11 @@ class ResampleDescBuilder_v8 { return std::move(m_resampleDesc); } - status = cudnnBackendSetAttribute(m_resampleDesc.pointer->get_backend_descriptor(), - CUDNN_ATTR_RESAMPLE_STRIDES, - CUDNN_TYPE_FRACTION, - m_resampleDesc.spatialDim, - m_resampleDesc.stride); + status = cudnn_frontend::set_attribute(m_resampleDesc.pointer->get_backend_descriptor(), + CUDNN_ATTR_RESAMPLE_STRIDES, + CUDNN_TYPE_FRACTION, + m_resampleDesc.spatialDim, + m_resampleDesc.stride); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_resampleDesc, @@ -495,7 +457,7 @@ class ResampleDescBuilder_v8 { } // Finalizing the descriptor - status = cudnnBackendFinalize(m_resampleDesc.pointer->get_backend_descriptor()); + status = cudnn_frontend::finalize(m_resampleDesc.pointer->get_backend_descriptor()); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_resampleDesc, status, "CUDNN_BACKEND_RESAMPLE_DESCRIPTOR: cudnnFinalize Failed"); @@ -503,11 +465,6 @@ class ResampleDescBuilder_v8 { } getLogger() << "[cudnn_frontend] " << m_resampleDesc << std::endl; return std::move(m_resampleDesc); -#else - set_error_and_throw_exception( - &m_resampleDesc, CUDNN_STATUS_NOT_SUPPORTED, "CUDNN_RESAMPLE_DESCRIPTOR: Not supported in this version"); - return std::move(m_resampleDesc); -#endif } explicit ResampleDescBuilder_v8() = default; diff --git a/include/cudnn_frontend_Rng.h b/include/cudnn_frontend_Rng.h index dc194cf..452a0de 100644 --- a/include/cudnn_frontend_Rng.h +++ b/include/cudnn_frontend_Rng.h @@ -203,6 +203,9 @@ class RngDescBuilder_v8 { build() { #if (CUDNN_VERSION >= 8700) // Create a descriptor. Memory allocation happens here. 
+ NV_CUDNN_FE_DYNAMIC_CHECK_BACKEND_DESCRIPTOR( + 8700, m_RngDesc, "CUDNN_BACKEND_RNG_DESCRIPTOR: Requires cudnn 8.7.0"); + auto status = m_RngDesc.initialize_managed_backend_pointer(CUDNN_BACKEND_RNG_DESCRIPTOR); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception(&m_RngDesc, status, "CUDNN_BACKEND_RNG_DESCRIPTOR: cudnnCreate Failed"); @@ -218,22 +221,22 @@ class RngDescBuilder_v8 { return std::move(m_RngDesc); } - status = cudnnBackendSetAttribute(m_RngDesc.pointer->get_backend_descriptor(), - CUDNN_ATTR_RNG_DISTRIBUTION, - CUDNN_TYPE_RNG_DISTRIBUTION, - 1, - &cudnn_rng_distribution); + status = cudnn_frontend::set_attribute(m_RngDesc.pointer->get_backend_descriptor(), + CUDNN_ATTR_RNG_DISTRIBUTION, + CUDNN_TYPE_RNG_DISTRIBUTION, + 1, + &cudnn_rng_distribution); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_RngDesc, status, "CUDNN_BACKEND_RNG_DESCRIPTOR: SetAttribute CUDNN_ATTR_RNG_DISTRIBUTION Failed"); return std::move(m_RngDesc); } - status = cudnnBackendSetAttribute(m_RngDesc.pointer->get_backend_descriptor(), - CUDNN_ATTR_RNG_NORMAL_DIST_MEAN, - CUDNN_TYPE_DOUBLE, - 1, - &(m_RngDesc.normal_dist_mean)); + status = cudnn_frontend::set_attribute(m_RngDesc.pointer->get_backend_descriptor(), + CUDNN_ATTR_RNG_NORMAL_DIST_MEAN, + CUDNN_TYPE_DOUBLE, + 1, + &(m_RngDesc.normal_dist_mean)); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_RngDesc, @@ -242,11 +245,11 @@ class RngDescBuilder_v8 { return std::move(m_RngDesc); } - status = cudnnBackendSetAttribute(m_RngDesc.pointer->get_backend_descriptor(), - CUDNN_ATTR_RNG_NORMAL_DIST_STANDARD_DEVIATION, - CUDNN_TYPE_DOUBLE, - 1, - &(m_RngDesc.normal_dist_std_dev)); + status = cudnn_frontend::set_attribute(m_RngDesc.pointer->get_backend_descriptor(), + CUDNN_ATTR_RNG_NORMAL_DIST_STANDARD_DEVIATION, + CUDNN_TYPE_DOUBLE, + 1, + &(m_RngDesc.normal_dist_std_dev)); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_RngDesc, @@ -255,11 +258,11 @@ class RngDescBuilder_v8 { return std::move(m_RngDesc); } - status = cudnnBackendSetAttribute(m_RngDesc.pointer->get_backend_descriptor(), - CUDNN_ATTR_RNG_UNIFORM_DIST_MAXIMUM, - CUDNN_TYPE_DOUBLE, - 1, - &(m_RngDesc.uniform_dist_max)); + status = cudnn_frontend::set_attribute(m_RngDesc.pointer->get_backend_descriptor(), + CUDNN_ATTR_RNG_UNIFORM_DIST_MAXIMUM, + CUDNN_TYPE_DOUBLE, + 1, + &(m_RngDesc.uniform_dist_max)); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_RngDesc, @@ -268,11 +271,11 @@ class RngDescBuilder_v8 { return std::move(m_RngDesc); } - status = cudnnBackendSetAttribute(m_RngDesc.pointer->get_backend_descriptor(), - CUDNN_ATTR_RNG_UNIFORM_DIST_MINIMUM, - CUDNN_TYPE_DOUBLE, - 1, - &(m_RngDesc.uniform_dist_min)); + status = cudnn_frontend::set_attribute(m_RngDesc.pointer->get_backend_descriptor(), + CUDNN_ATTR_RNG_UNIFORM_DIST_MINIMUM, + CUDNN_TYPE_DOUBLE, + 1, + &(m_RngDesc.uniform_dist_min)); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_RngDesc, @@ -281,11 +284,11 @@ class RngDescBuilder_v8 { return std::move(m_RngDesc); } - status = cudnnBackendSetAttribute(m_RngDesc.pointer->get_backend_descriptor(), - CUDNN_ATTR_RNG_BERNOULLI_DIST_PROBABILITY, - CUDNN_TYPE_DOUBLE, - 1, - &(m_RngDesc.bernoulli_dist_probability)); + status = cudnn_frontend::set_attribute(m_RngDesc.pointer->get_backend_descriptor(), + CUDNN_ATTR_RNG_BERNOULLI_DIST_PROBABILITY, + CUDNN_TYPE_DOUBLE, + 1, + &(m_RngDesc.bernoulli_dist_probability)); if (status != CUDNN_STATUS_SUCCESS) { 
set_error_and_throw_exception( &m_RngDesc, @@ -295,7 +298,7 @@ class RngDescBuilder_v8 { } // Finalizing the descriptor - status = cudnnBackendFinalize(m_RngDesc.pointer->get_backend_descriptor()); + status = cudnn_frontend::finalize(m_RngDesc.pointer->get_backend_descriptor()); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception(&m_RngDesc, status, "CUDNN_BACKEND_RNG_DESCRIPTOR: cudnnFinalize Failed"); return std::move(m_RngDesc); diff --git a/include/cudnn_frontend_Tensor.h b/include/cudnn_frontend_Tensor.h index d1563ec..1a0283c 100644 --- a/include/cudnn_frontend_Tensor.h +++ b/include/cudnn_frontend_Tensor.h @@ -75,9 +75,7 @@ class Tensor_v8 : public BackendDescriptor { } ss << " ]"; ss << " isVirtual: " << isVirtual << " isByValue: " << isByValue << " Alignment: " << alignment; -#if (CUDNN_VERSION >= 8300) ss << " reorder_type: " << json{reorder_type}; -#endif return ss.str(); } @@ -242,14 +240,12 @@ class TensorBuilder_v8 { return *this; } -#if (CUDNN_VERSION >= 8300) // To be deprecated. Please use setReorderType(cudnn_frontend::cudnnBackendTensorReordering_t). auto setReorderType(cudnnBackendTensorReordering_t reordering_type) -> TensorBuilder_v8 & { detail::convert_from_cudnn_type(reordering_type, m_tensor.reorder_type); return *this; } -#endif /** @} */ @@ -340,48 +336,48 @@ class TensorBuilder_v8 { &m_tensor, status, "CUDNN_BACKEND_TENSOR_DESCRIPTOR: SetAttribute CUDNN_ATTR_TENSOR_DATA_TYPE Failed"); return std::move(m_tensor); } - status = cudnnBackendSetAttribute(m_tensor.pointer->get_backend_descriptor(), - CUDNN_ATTR_TENSOR_DATA_TYPE, - CUDNN_TYPE_DATA_TYPE, - 1, - &cudnn_data_type); + status = cudnn_frontend::set_attribute(m_tensor.pointer->get_backend_descriptor(), + CUDNN_ATTR_TENSOR_DATA_TYPE, + CUDNN_TYPE_DATA_TYPE, + 1, + &cudnn_data_type); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_tensor, status, "CUDNN_BACKEND_TENSOR_DESCRIPTOR: SetAttribute CUDNN_ATTR_TENSOR_DATA_TYPE Failed"); return std::move(m_tensor); } - status = cudnnBackendSetAttribute(m_tensor.pointer->get_backend_descriptor(), - CUDNN_ATTR_TENSOR_DIMENSIONS, - CUDNN_TYPE_INT64, - m_tensor.nDims, - m_tensor.btensor_dimA); + status = cudnn_frontend::set_attribute(m_tensor.pointer->get_backend_descriptor(), + CUDNN_ATTR_TENSOR_DIMENSIONS, + CUDNN_TYPE_INT64, + m_tensor.nDims, + m_tensor.btensor_dimA); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_tensor, status, "CUDNN_BACKEND_TENSOR_DESCRIPTOR: SetAttribute CUDNN_ATTR_TENSOR_DIMENSIONS Failed"); return std::move(m_tensor); } - status = cudnnBackendSetAttribute(m_tensor.pointer->get_backend_descriptor(), - CUDNN_ATTR_TENSOR_STRIDES, - CUDNN_TYPE_INT64, - m_tensor.nDims, - m_tensor.btensor_strA); + status = cudnn_frontend::set_attribute(m_tensor.pointer->get_backend_descriptor(), + CUDNN_ATTR_TENSOR_STRIDES, + CUDNN_TYPE_INT64, + m_tensor.nDims, + m_tensor.btensor_strA); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_tensor, status, "CUDNN_BACKEND_TENSOR_DESCRIPTOR: SetAttribute CUDNN_ATTR_TENSOR_STRIDES Failed"); return std::move(m_tensor); } - status = cudnnBackendSetAttribute( + status = cudnn_frontend::set_attribute( m_tensor.pointer->get_backend_descriptor(), CUDNN_ATTR_TENSOR_UNIQUE_ID, CUDNN_TYPE_INT64, 1, &m_tensor.id); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_tensor, status, "CUDNN_BACKEND_TENSOR_DESCRIPTOR: SetAttribute CUDNN_ATTR_TENSOR_UNIQUE_ID Failed"); return std::move(m_tensor); } - status = 
cudnnBackendSetAttribute(m_tensor.pointer->get_backend_descriptor(), - CUDNN_ATTR_TENSOR_BYTE_ALIGNMENT, - CUDNN_TYPE_INT64, - 1, - &m_tensor.alignment); + status = cudnn_frontend::set_attribute(m_tensor.pointer->get_backend_descriptor(), + CUDNN_ATTR_TENSOR_BYTE_ALIGNMENT, + CUDNN_TYPE_INT64, + 1, + &m_tensor.alignment); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_tensor, @@ -390,11 +386,11 @@ class TensorBuilder_v8 { return std::move(m_tensor); } if (m_tensor.isVirtual) { - status = cudnnBackendSetAttribute(m_tensor.pointer->get_backend_descriptor(), - CUDNN_ATTR_TENSOR_IS_VIRTUAL, - CUDNN_TYPE_BOOLEAN, - 1, - &m_tensor.isVirtual); + status = cudnn_frontend::set_attribute(m_tensor.pointer->get_backend_descriptor(), + CUDNN_ATTR_TENSOR_IS_VIRTUAL, + CUDNN_TYPE_BOOLEAN, + 1, + &m_tensor.isVirtual); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_tensor, @@ -404,11 +400,11 @@ class TensorBuilder_v8 { } } if (m_tensor.isByValue) { - status = cudnnBackendSetAttribute(m_tensor.pointer->get_backend_descriptor(), - CUDNN_ATTR_TENSOR_IS_BY_VALUE, - CUDNN_TYPE_BOOLEAN, - 1, - &m_tensor.isByValue); + status = cudnn_frontend::set_attribute(m_tensor.pointer->get_backend_descriptor(), + CUDNN_ATTR_TENSOR_IS_BY_VALUE, + CUDNN_TYPE_BOOLEAN, + 1, + &m_tensor.isByValue); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_tensor, @@ -419,11 +415,11 @@ } if (m_tensor.vectorCount > 1) { - status = cudnnBackendSetAttribute(m_tensor.pointer->get_backend_descriptor(), - CUDNN_ATTR_TENSOR_VECTOR_COUNT, - CUDNN_TYPE_INT64, - 1, - &m_tensor.vectorCount); + status = cudnn_frontend::set_attribute(m_tensor.pointer->get_backend_descriptor(), + CUDNN_ATTR_TENSOR_VECTOR_COUNT, + CUDNN_TYPE_INT64, + 1, + &m_tensor.vectorCount); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_tensor, @@ -433,11 +429,11 @@ } } if (m_tensor.vectorDimension >= 0) { - status = cudnnBackendSetAttribute(m_tensor.pointer->get_backend_descriptor(), - CUDNN_ATTR_TENSOR_VECTORIZED_DIMENSION, - CUDNN_TYPE_INT64, - 1, - &m_tensor.vectorDimension); + status = cudnn_frontend::set_attribute(m_tensor.pointer->get_backend_descriptor(), + CUDNN_ATTR_TENSOR_VECTORIZED_DIMENSION, + CUDNN_TYPE_INT64, + 1, + &m_tensor.vectorDimension); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_tensor, @@ -449,14 +445,18 @@ // Set ragged offset descriptor #if (CUDNN_VERSION >= 8900) + NV_CUDNN_FE_DYNAMIC_CHECK_BACKEND_DESCRIPTOR(8900, + m_tensor, + "CUDNN_BACKEND_TENSOR_DESCRIPTOR: SetAttribute " + "CUDNN_ATTR_TENSOR_RAGGED_OFFSET_DESC requires cudnn version 8.9"); if (m_tensor.raggedOffset != nullptr) { std::vector<cudnnBackendDescriptor_t> backendRaggedOffset; backendRaggedOffset.push_back(m_tensor.raggedOffset.get()->pointer->get_backend_descriptor()); - status = cudnnBackendSetAttribute(m_tensor.pointer->get_backend_descriptor(), - CUDNN_ATTR_TENSOR_RAGGED_OFFSET_DESC, - CUDNN_TYPE_BACKEND_DESCRIPTOR, - static_cast<int64_t>(backendRaggedOffset.size()), - backendRaggedOffset.data()); + status = cudnn_frontend::set_attribute(m_tensor.pointer->get_backend_descriptor(), + CUDNN_ATTR_TENSOR_RAGGED_OFFSET_DESC, + CUDNN_TYPE_BACKEND_DESCRIPTOR, + static_cast<int64_t>(backendRaggedOffset.size()), + backendRaggedOffset.data()); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_tensor, @@ -468,7 +468,6 @@ class TensorBuilder_v8 { #endif // Set the reorder_type -#if (CUDNN_VERSION >= 8300) if
(m_tensor.reorder_type != cudnn_frontend::TensorReordering_t::NONE) { cudnnBackendTensorReordering_t cudnn_reordering_type; status = detail::convert_to_cudnn_type(m_tensor.reorder_type, cudnn_reordering_type); @@ -479,11 +478,11 @@ class TensorBuilder_v8 { "CUDNN_BACKEND_TENSOR_DESCRIPTOR: SetAttribute CUDNN_ATTR_TENSOR_REORDERING_MODE Failed"); return std::move(m_tensor); } - status = cudnnBackendSetAttribute(m_tensor.pointer->get_backend_descriptor(), - CUDNN_ATTR_TENSOR_REORDERING_MODE, - CUDNN_TYPE_TENSOR_REORDERING_MODE, - 1, - &m_tensor.reorder_type); + status = cudnn_frontend::set_attribute(m_tensor.pointer->get_backend_descriptor(), + CUDNN_ATTR_TENSOR_REORDERING_MODE, + CUDNN_TYPE_TENSOR_REORDERING_MODE, + 1, + &m_tensor.reorder_type); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_tensor, @@ -492,9 +491,8 @@ class TensorBuilder_v8 { return std::move(m_tensor); } } -#endif // Finalizing the descriptor - status = cudnnBackendFinalize(m_tensor.pointer->get_backend_descriptor()); + status = cudnn_frontend::finalize(m_tensor.pointer->get_backend_descriptor()); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception(&m_tensor, status, "CUDNN_BACKEND_TENSOR_DESCRIPTOR cudnnFinalize failed"); return std::move(m_tensor); diff --git a/include/cudnn_frontend_VariantPack.h b/include/cudnn_frontend_VariantPack.h index 969aed0..4f7662f 100644 --- a/include/cudnn_frontend_VariantPack.h +++ b/include/cudnn_frontend_VariantPack.h @@ -136,11 +136,11 @@ class VariantPackBuilder_v8 { return std::move(m_variant_pack); } - status = cudnnBackendSetAttribute(m_variant_pack.pointer->get_backend_descriptor(), - CUDNN_ATTR_VARIANT_PACK_DATA_POINTERS, - CUDNN_TYPE_VOID_PTR, - m_variant_pack.num_ptrs, - m_variant_pack.data_pointers.data()); + status = cudnn_frontend::set_attribute(m_variant_pack.pointer->get_backend_descriptor(), + CUDNN_ATTR_VARIANT_PACK_DATA_POINTERS, + CUDNN_TYPE_VOID_PTR, + m_variant_pack.num_ptrs, + m_variant_pack.data_pointers.data()); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_variant_pack, @@ -149,11 +149,11 @@ class VariantPackBuilder_v8 { return std::move(m_variant_pack); } - status = cudnnBackendSetAttribute(m_variant_pack.pointer->get_backend_descriptor(), - CUDNN_ATTR_VARIANT_PACK_UNIQUE_IDS, - CUDNN_TYPE_INT64, - m_variant_pack.num_ptrs, - m_variant_pack.uid.data()); + status = cudnn_frontend::set_attribute(m_variant_pack.pointer->get_backend_descriptor(), + CUDNN_ATTR_VARIANT_PACK_UNIQUE_IDS, + CUDNN_TYPE_INT64, + m_variant_pack.num_ptrs, + m_variant_pack.uid.data()); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_variant_pack, @@ -162,11 +162,11 @@ class VariantPackBuilder_v8 { return std::move(m_variant_pack); } - status = cudnnBackendSetAttribute(m_variant_pack.pointer->get_backend_descriptor(), - CUDNN_ATTR_VARIANT_PACK_WORKSPACE, - CUDNN_TYPE_VOID_PTR, - 1, - &m_variant_pack.workspace); + status = cudnn_frontend::set_attribute(m_variant_pack.pointer->get_backend_descriptor(), + CUDNN_ATTR_VARIANT_PACK_WORKSPACE, + CUDNN_TYPE_VOID_PTR, + 1, + &m_variant_pack.workspace); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( &m_variant_pack, @@ -176,7 +176,7 @@ class VariantPackBuilder_v8 { } // Finalizing the descriptor - status = cudnnBackendFinalize(m_variant_pack.pointer->get_backend_descriptor()); + status = cudnn_frontend::finalize(m_variant_pack.pointer->get_backend_descriptor()); if (status != CUDNN_STATUS_SUCCESS) { set_error_and_throw_exception( 
&m_variant_pack, status, "CUDNN_BACKEND_VARIANT_PACK_DESCRIPTOR: cudnnFinalize Failed");
diff --git a/include/cudnn_frontend_find_plan.h b/include/cudnn_frontend_find_plan.h index e8442ed..54e7740 100644 --- a/include/cudnn_frontend_find_plan.h +++ b/include/cudnn_frontend_find_plan.h @@ -53,12 +53,12 @@ time_sorted_plan(cudnnHandle_t handle, const float threshhold = 0.95f; uint64_t successful_plan_count = 0; cudaEvent_t start, stop; - cudaEventCreate(&start); - cudaEventCreate(&stop); - cudaDeviceSynchronize(); + cuda_event_create(&start); + cuda_event_create(&stop); + cuda_device_synchronize(); cudaStream_t stream = nullptr; - cudnnGetStream(handle, &stream); + cudnn_frontend::get_stream(handle, &stream); for (auto &plan : plans) { float time_ms = 0.0f; @@ -66,24 +66,24 @@ time_sorted_plan(cudnnHandle_t handle, float min_time_ms = std::numeric_limits<float>::max(); // Warm-up run - auto warmup_status = cudnnBackendExecute(handle, plan.get_raw_desc(), variantPack.get_raw_desc()); + auto warmup_status = cudnn_frontend::execute(handle, plan.get_raw_desc(), variantPack.get_raw_desc()); if (warmup_status != CUDNN_STATUS_SUCCESS) { getLogger() << "[cudnn_frontend] Plan " << plan.getTag() << " failed with " << to_string(warmup_status) << std::endl; continue; } successful_plan_count++; - cudaDeviceSynchronize(); + cuda_device_synchronize(); float time_run_ms[3] = {0.0f, 0.0f, 0.0f}; for (int i = 0; i < maxIterCount; i++) { - cudaEventRecord(start, stream); + cuda_event_record(start, stream); - cudnnBackendExecute(handle, plan.get_raw_desc(), variantPack.get_raw_desc()); + cudnn_frontend::execute(handle, plan.get_raw_desc(), variantPack.get_raw_desc()); - cudaEventRecord(stop, stream); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&time_ms, start, stop); + cuda_event_record(stop, stream); + cuda_event_synchronize(stop); + cuda_event_elapsed_time(&time_ms, start, stop); if constexpr (samplingTechnique == CudnnFindSamplingTechnique::CUDNN_FIND_SAMPLE_TILL_STABLE) { final_time_ms = std::min(min_time_ms, time_ms); @@ -117,8 +117,8 @@ time_sorted_plan(cudnnHandle_t handle, time_sorted_plans.emplace_back(std::move(plan)); } - cudaEventDestroy(start); - cudaEventDestroy(stop); + cuda_event_destroy(start); + cuda_event_destroy(stop); getLogger() << "[cudnn_frontend] Auto-tuning returns " << time_sorted_plans.size() << " plans." << std::endl;
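The timing loop above follows the standard CUDA event pattern: record an event before and after the work on the same stream, synchronize on the stop event, then read the elapsed time. A minimal standalone sketch of that pattern, independent of the frontend types (`launch_work` here is a placeholder for `cudnn_frontend::execute`):

```cpp
#include <cuda_runtime.h>

// Times one launch of `launch_work` on `stream`, in milliseconds.
float time_once(cudaStream_t stream, void (*launch_work)(cudaStream_t)) {
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start, stream);  // marks the point just before the work
    launch_work(stream);             // stands in for cudnn_frontend::execute(...)
    cudaEventRecord(stop, stream);   // marks the point just after the work

    cudaEventSynchronize(stop);      // wait until 'stop' has actually occurred
    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    return ms;
}
```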
diff --git a/include/cudnn_frontend_shim.h b/include/cudnn_frontend_shim.h new file mode 100644 index 0000000..4ff57fc --- /dev/null +++ b/include/cudnn_frontend_shim.h @@ -0,0 +1,307 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#pragma once + +#include <cudnn.h> +#include <cuda_runtime.h> +#if defined NV_CUDNN_FRONTEND_USE_DYNAMIC_LOADING +#include <dlfcn.h> +#include <mutex> +#endif + +namespace cudnn_frontend { + +#if defined NV_CUDNN_FRONTEND_USE_DYNAMIC_LOADING +inline void * +get_symbol(const char *function_name) { + static std::mutex cudnn_fe_lib_mutex; + std::lock_guard<std::mutex> lock(cudnn_fe_lib_mutex); + char *c = NULL; + c = dlerror(); + static void *dl_handle = dlopen("libcudnn.so", RTLD_NOW); + c = dlerror(); + (void)c; + if (dl_handle == nullptr) { + // Fall back to the major-version soname + dl_handle = dlopen("libcudnn.so.9", RTLD_NOW); + if (dl_handle == nullptr) { + dl_handle = dlopen("libcudnn.so.8", RTLD_NOW); + if (dl_handle == nullptr) { + std::string error_msg = std::string("Unable to dlopen libcudnn.so.[8/9]") + std::string(c); + throw std::runtime_error(error_msg.c_str()); + } + } + } + + void *ret = dlsym(dl_handle, function_name); + return ret; +} + +inline void * +get_cuda_symbol(const char *function_name) { + static std::mutex cuda_fe_lib_mutex; + std::lock_guard<std::mutex> lock(cuda_fe_lib_mutex); + char *c = NULL; + c = dlerror(); + static void *dl_handle = dlopen("libcudart.so", RTLD_NOW); + c = dlerror(); + (void)c; + if (dl_handle == nullptr) { + std::string error_msg = std::string("Unable to dlopen libcudart.so") + std::string(c); + throw std::runtime_error(error_msg.c_str()); + } + + void *ret = dlsym(dl_handle, function_name); + return ret; +} +#endif + +#if defined NV_CUDNN_FRONTEND_USE_DYNAMIC_LOADING +#define NV_CUDNN_FE_DYNAMIC_CHECK_BACKEND_DESCRIPTOR(MINIMUM_VERSION, DESCRIPTOR, MESSAGE) \ + if (MINIMUM_VERSION > get_backend_version()) { \ + set_error_and_throw_exception(&DESCRIPTOR, CUDNN_STATUS_INVALID_VALUE, MESSAGE); \ + return std::move(DESCRIPTOR); \ + } +#else +#define NV_CUDNN_FE_DYNAMIC_CHECK_BACKEND_DESCRIPTOR(MINIMUM_VERSION, DESCRIPTOR, MESSAGE) +#endif + +#if defined NV_CUDNN_FRONTEND_USE_DYNAMIC_LOADING +#define NV_CUDNN_FE_DYNAMIC_CHECK_CUDNN_BACKEND_VERSION(MINIMUM_VERSION, STATUS) \ + if (MINIMUM_VERSION > get_backend_version()) { \ + return STATUS; \ + } +#else +#define NV_CUDNN_FE_DYNAMIC_CHECK_CUDNN_BACKEND_VERSION(MINIMUM_VERSION, STATUS) +#endif + +#if defined NV_CUDNN_FRONTEND_USE_DYNAMIC_LOADING +#define NV_FE_CALL_TO_BACKEND(function_name, backend_symbol, ...) \ + static void *fptr = get_symbol(#backend_symbol); \ + if (fptr == nullptr) { \ + throw std::runtime_error("Unable to find symbol " #backend_symbol); \ + } \ + return reinterpret_cast<decltype(function_name) *>(fptr)(__VA_ARGS__); +#else +#define NV_FE_CALL_TO_BACKEND(function_name, backend_symbol, ...) return backend_symbol(__VA_ARGS__); +#endif + +#if defined NV_CUDNN_FRONTEND_USE_DYNAMIC_LOADING +#define NV_FE_CALL_TO_CUDA(function_name, cuda_symbol, ...) \ + static void *fptr = get_cuda_symbol(#cuda_symbol); \ + if (fptr == nullptr) { \ + throw std::runtime_error("Unable to find symbol " #cuda_symbol); \ + } \ + return reinterpret_cast<decltype(function_name) *>(fptr)(__VA_ARGS__); +#else +#define NV_FE_CALL_TO_CUDA(function_name, cuda_symbol, ...) return cuda_symbol(__VA_ARGS__); +#endif + +inline cudaError_t +cuda_event_create(cudaEvent_t *event) { + NV_FE_CALL_TO_CUDA(cuda_event_create, cudaEventCreate, event); +} + +inline cudaError_t +cuda_event_destroy(cudaEvent_t event) { + NV_FE_CALL_TO_CUDA(cuda_event_destroy, cudaEventDestroy, event); +} + +inline cudaError_t +cuda_event_record(cudaEvent_t event, cudaStream_t stream) { + NV_FE_CALL_TO_CUDA(cuda_event_record, cudaEventRecord, event, stream); +} + +inline cudaError_t +cuda_event_synchronize(cudaEvent_t event) { + NV_FE_CALL_TO_CUDA(cuda_event_synchronize, cudaEventSynchronize, event); +} + +inline cudaError_t +cuda_event_elapsed_time(float *ms, cudaEvent_t start, cudaEvent_t end) { + NV_FE_CALL_TO_CUDA(cuda_event_elapsed_time, cudaEventElapsedTime, ms, start, end); +} + +inline cudaError_t +cuda_mem_cpy_async(void *dst, const void *src, size_t count, cudaMemcpyKind kind, cudaStream_t stream) { + NV_FE_CALL_TO_CUDA(cuda_mem_cpy_async, cudaMemcpyAsync, dst, src, count, kind, stream); +} + +inline cudaError_t +cuda_mem_set_async(void *devPtr, int value, size_t count, cudaStream_t stream) { + NV_FE_CALL_TO_CUDA(cuda_mem_set_async, cudaMemsetAsync, devPtr, value, count, stream); +} + +inline cudaError_t +cuda_get_device_properties(cudaDeviceProp *prop, int device) { + NV_FE_CALL_TO_CUDA(cuda_get_device_properties, cudaGetDeviceProperties, prop, device); +} + +inline const char * +cuda_get_error_string(cudaError_t error) { + NV_FE_CALL_TO_CUDA(cuda_get_error_string, cudaGetErrorString, error); +} + +inline cudaError_t +cuda_device_synchronize() { +#if defined NV_CUDNN_FRONTEND_USE_DYNAMIC_LOADING + static void *fptr = get_cuda_symbol("cudaDeviceSynchronize"); + if (fptr == nullptr) { + throw std::runtime_error("Unable to find symbol cudaDeviceSynchronize"); + } + return reinterpret_cast<decltype(cuda_device_synchronize) *>(fptr)(); +#else + return cudaDeviceSynchronize(); +#endif +} + +inline cudnnStatus_t +create_handle(cudnnHandle_t *handle) { + NV_FE_CALL_TO_BACKEND(create_handle, cudnnCreate, handle); +} + +inline cudnnStatus_t +destroy_handle(cudnnHandle_t handle) { + NV_FE_CALL_TO_BACKEND(destroy_handle, cudnnDestroy, handle); +} + +inline size_t +get_backend_version(void) { +#if defined NV_CUDNN_FRONTEND_USE_DYNAMIC_LOADING + static void *fptr = get_symbol("cudnnGetVersion"); + if (fptr == nullptr) { + throw std::runtime_error("Unable to find symbol cudnnGetVersion"); + } + return reinterpret_cast<decltype(get_backend_version) *>(fptr)(); +#else + return cudnnGetVersion(); +#endif +} + +inline cudnnStatus_t +create_descriptor(cudnnBackendDescriptorType_t descriptorType, cudnnBackendDescriptor_t *descriptor) { + NV_FE_CALL_TO_BACKEND(create_descriptor, cudnnBackendCreateDescriptor, descriptorType, descriptor); +} + +inline cudnnStatus_t +destroy_descriptor(cudnnBackendDescriptor_t descriptor) { + NV_FE_CALL_TO_BACKEND(destroy_descriptor, cudnnBackendDestroyDescriptor, descriptor); +} + +inline cudnnStatus_t +set_attribute(cudnnBackendDescriptor_t descriptor, + cudnnBackendAttributeName_t attributeName, + cudnnBackendAttributeType_t attributeType, + int64_t elementCount, + const void *arrayOfElements) { + NV_FE_CALL_TO_BACKEND(set_attribute, + cudnnBackendSetAttribute, + descriptor, + attributeName, + attributeType, + elementCount, + arrayOfElements); +} + +inline cudnnStatus_t +get_attribute(cudnnBackendDescriptor_t const descriptor, + cudnnBackendAttributeName_t attributeName, + cudnnBackendAttributeType_t attributeType, + int64_t requestedElementCount, + int64_t *elementCount, + void *arrayOfElements) { + NV_FE_CALL_TO_BACKEND(get_attribute, + cudnnBackendGetAttribute, + descriptor, + attributeName, + attributeType, + requestedElementCount, + elementCount, + arrayOfElements) +} + +inline cudnnStatus_t +finalize(cudnnBackendDescriptor_t descriptor) { + NV_FE_CALL_TO_BACKEND(finalize, cudnnBackendFinalize, descriptor); +} + +inline cudnnStatus_t +execute(cudnnHandle_t handle, cudnnBackendDescriptor_t executionPlan, cudnnBackendDescriptor_t variantPack) { + NV_FE_CALL_TO_BACKEND(execute, cudnnBackendExecute, handle, executionPlan, variantPack); +} + +inline const char * +get_error_string(cudnnStatus_t status) { + NV_FE_CALL_TO_BACKEND(get_error_string, cudnnGetErrorString, status); +} + +inline cudnnStatus_t +set_stream(cudnnHandle_t handle, cudaStream_t stream) { + NV_FE_CALL_TO_BACKEND(set_stream, cudnnSetStream, handle, stream); +} + +inline cudnnStatus_t +get_stream(cudnnHandle_t handle, cudaStream_t *stream) { + NV_FE_CALL_TO_BACKEND(get_stream, cudnnGetStream, handle, stream); +} + +inline cudnnStatus_t +create_filter_desc_v7(cudnnFilterDescriptor_t *filter) { + NV_FE_CALL_TO_BACKEND(create_filter_desc_v7, cudnnCreateFilterDescriptor, filter); +} + +inline cudnnStatus_t +set_ndfilter_desc_v7(cudnnFilterDescriptor_t filter, + cudnnDataType_t type, + cudnnTensorFormat_t format, + int x, + const int filterDimA[]) { + NV_FE_CALL_TO_BACKEND(set_ndfilter_desc_v7, cudnnSetFilterNdDescriptor, filter, type, format, x, filterDimA); +} + +inline cudnnStatus_t +reorder_filter_bias(cudnnHandle_t handle, + const cudnnFilterDescriptor_t filterDesc, + cudnnReorderType_t reorderType, + const void *filterData, + void *reorderedFilterData, + int reorderBias, + const void *biasData, + void *reorderedBiasData) { + NV_FE_CALL_TO_BACKEND(reorder_filter_bias, + cudnnReorderFilterAndBias, + handle, + filterDesc, + reorderType, + filterData, + reorderedFilterData, + reorderBias, + biasData, + reorderedBiasData); +} + +inline cudnnStatus_t +destroy_filter(cudnnFilterDescriptor_t filter) { + NV_FE_CALL_TO_BACKEND(destroy_filter, cudnnDestroyFilterDescriptor, filter); +} +} // namespace cudnn_frontend \ No newline at end of file
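The wrappers above make caller code identical whether cudnn is linked at build time or dlopen()ed at run time. A minimal caller-side sketch (the `example_shim_usage` function is illustrative only, not part of the header):

```cpp
#include <cudnn.h>
#include "cudnn_frontend_shim.h"

void example_shim_usage() {
    // Resolved statically, or via dlsym on first use when
    // NV_CUDNN_FRONTEND_USE_DYNAMIC_LOADING is defined.
    size_t version = cudnn_frontend::get_backend_version();
    (void)version;

    cudnnHandle_t handle = nullptr;
    if (cudnn_frontend::create_handle(&handle) == CUDNN_STATUS_SUCCESS) {
        // ... build descriptors and execute plans against `handle` ...
        cudnn_frontend::destroy_handle(handle);
    }
}
```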
diff --git a/include/cudnn_frontend_utils.h b/include/cudnn_frontend_utils.h index 5454a71..f22d95a 100644 --- a/include/cudnn_frontend_utils.h +++ b/include/cudnn_frontend_utils.h @@ -24,11 +24,69 @@ #include #include #include +#include <variant> #include -#include "thirdparty/nlohmann/json.hpp" +#include <cuda_fp16.h> +#include <cuda_bf16.h> + +#ifndef CUDNN_FRONTEND_SKIP_NLOHMANN_JSON +#include "cudnn_frontend/thirdparty/nlohmann/json.hpp" +#endif + using json = nlohmann::json; +template <> +struct nlohmann::adl_serializer<half> { + static void + to_json(json& j, const half& opt) { + // No precision loss when converting to float + j = __half2float(opt); + } + + static void + from_json(const json& j, half& opt) { + opt = __float2half(j.get<float>()); + } +}; + +template <> +struct nlohmann::adl_serializer<nv_bfloat16> { + static void + to_json(json& j, const nv_bfloat16& opt) { + // No precision loss when converting to float + j = __bfloat162float(opt); + } + + static void + from_json(const json& j, nv_bfloat16& opt) { + opt = __float2bfloat16(j.get<float>()); + } +}; + +template <typename T, typename... Ts> +void +convert_from_json_to_variant(const nlohmann::json& j, std::variant<Ts...>& data) { + try { + data = j.get<T>(); + } catch (...) { + // get will throw an error if incorrect type + } +} + +template <typename... Ts> +struct nlohmann::adl_serializer<std::variant<Ts...>> { + static void + to_json(nlohmann::json& j, const std::variant<Ts...>& data) { + std::visit([&j](const auto& v) { j = v; }, data); + } + + static void + from_json(const nlohmann::json& j, std::variant<Ts...>& data) { + (convert_from_json_to_variant<Ts>(j, data), ...); + } +}; + // Specialization of nlohmann::adl_serializer for std::optional template <typename T> struct nlohmann::adl_serializer<std::optional<T>> { @@ -69,6 +127,7 @@ struct nlohmann::adl_serializer<std::optional<T>> { } }; +#include "cudnn_frontend_shim.h" #include "cudnn_backend_base.h" #include "cudnn_frontend_Logging.h" @@ -109,7 +168,7 @@ AllowAll(cudnnBackendDescriptor_t engine_config) { static inline std::string to_string(cudnnStatus_t const status) { - return cudnnGetErrorString(status); + return cudnn_frontend::get_error_string(status); } #ifndef NV_CUDNN_DISABLE_EXCEPTION @@ -127,18 +186,15 @@ set_error_and_throw_exception(BackendDescriptor const* desc, cudnnStatus_t statu #endif } -#if (CUDNN_VERSION >= 8200) static inline std::string to_string(cudnnBackendBehaviorNote_t note) { switch (note) { case CUDNN_BEHAVIOR_NOTE_RUNTIME_COMPILATION: return std::string("CUDNN_BEHAVIOR_NOTE_RUNTIME_COMPILATION"); -#if (CUDNN_VERSION >= 8300) case CUDNN_BEHAVIOR_NOTE_REQUIRES_FILTER_INT8x32_REORDER: return std::string("CUDNN_BEHAVIOR_NOTE_REQUIRES_FILTER_INT8x32_REORDER"); case CUDNN_BEHAVIOR_NOTE_REQUIRES_BIAS_INT8x32_REORDER: return std::string("CUDNN_BEHAVIOR_NOTE_REQUIRES_BIAS_INT8x32_REORDER"); -#endif case CUDNN_BEHAVIOR_NOTE_TYPE_COUNT: return std::string("CUDNN_BEHAVIOR_NOTE_TYPE_COUNT"); #ifndef NO_DEFAULT_IN_SWITCH default: @@ -148,7 +204,6 @@ to_string(cudnnBackendBehaviorNote_t note) { } return std::string("INVALID_BEHAVIOR_NOTE"); } -#endif static inline std::string to_string(cudnnBackendNumericalNote_t note) { @@ -165,14 +220,12 @@ return std::string("CUDNN_NUMERICAL_NOTE_NONDETERMINISTIC"); case CUDNN_NUMERICAL_NOTE_WINOGRAD: return std::string("CUDNN_NUMERICAL_NOTE_WINOGRAD"); -#if (CUDNN_VERSION >= 8300) case CUDNN_NUMERICAL_NOTE_WINOGRAD_TILE_4x4: return std::string("CUDNN_NUMERICAL_NOTE_WINOGRAD_TILE_4x4"); case CUDNN_NUMERICAL_NOTE_WINOGRAD_TILE_6x6: return std::string("CUDNN_NUMERICAL_NOTE_WINOGRAD_TILE_6x6"); case CUDNN_NUMERICAL_NOTE_WINOGRAD_TILE_13x13: return std::string("CUDNN_NUMERICAL_NOTE_WINOGRAD_TILE_13x13"); -#endif case CUDNN_NUMERICAL_NOTE_TYPE_COUNT: return std::string("CUDNN_NUMERICAL_NOTE_TYPE_COUNT"); #ifndef NO_DEFAULT_IN_SWITCH @@ -827,14 +880,11 @@ convert_to_cudnn_type(cudnn_frontend::DataType_t const mode, cudnnDataType_t& cu cudnn_mode = CUDNN_DATA_INT64; return cudnnStatus_t::CUDNN_STATUS_SUCCESS; case DataType_t::BOOLEAN: -#if (CUDNN_VERSION >= 8300) cudnn_mode = CUDNN_DATA_BOOLEAN; return cudnnStatus_t::CUDNN_STATUS_SUCCESS; -#else - return cudnnStatus_t::CUDNN_STATUS_INVALID_VALUE; -#endif case DataType_t::FP8_E4M3: #if (CUDNN_VERSION >= 8600) + NV_CUDNN_FE_DYNAMIC_CHECK_CUDNN_BACKEND_VERSION(8600, cudnnStatus_t::CUDNN_STATUS_INVALID_VALUE); cudnn_mode = CUDNN_DATA_FP8_E4M3; return cudnnStatus_t::CUDNN_STATUS_SUCCESS; #else @@ -842,6 +892,7 @@ convert_to_cudnn_type(cudnn_frontend::DataType_t const mode, cudnnDataType_t& cu #endif case DataType_t::FP8_E5M2: #if (CUDNN_VERSION >= 8600) + NV_CUDNN_FE_DYNAMIC_CHECK_CUDNN_BACKEND_VERSION(8600, cudnnStatus_t::CUDNN_STATUS_INVALID_VALUE); cudnn_mode = CUDNN_DATA_FP8_E5M2; return cudnnStatus_t::CUDNN_STATUS_SUCCESS; #else @@ -849,6 +900,7 @@
convert_to_cudnn_type(cudnn_frontend::DataType_t const mode, cudnnDataType_t& cu #endif case DataType_t::FAST_FLOAT_FOR_FP8: #if (CUDNN_VERSION >= 8700) + NV_CUDNN_FE_DYNAMIC_CHECK_CUDNN_BACKEND_VERSION(8700, cudnnStatus_t::CUDNN_STATUS_INVALID_VALUE); cudnn_mode = CUDNN_DATA_FAST_FLOAT_FOR_FP8; return cudnnStatus_t::CUDNN_STATUS_SUCCESS; #else @@ -962,222 +1014,99 @@ convert_to_cudnn_type(cudnn_frontend::PointwiseMode_t const mode, cudnnPointwise case PointwiseMode_t::SWISH_BWD: cudnn_mode = CUDNN_POINTWISE_SWISH_BWD; return cudnnStatus_t::CUDNN_STATUS_SUCCESS; - case PointwiseMode_t::DIV: -#if (CUDNN_VERSION >= 8300) cudnn_mode = CUDNN_POINTWISE_DIV; return cudnnStatus_t::CUDNN_STATUS_SUCCESS; -#else - return cudnnStatus_t::CUDNN_STATUS_INVALID_VALUE; -#endif case PointwiseMode_t::ADD_SQUARE: -#if (CUDNN_VERSION >= 8300) cudnn_mode = CUDNN_POINTWISE_ADD_SQUARE; return cudnnStatus_t::CUDNN_STATUS_SUCCESS; -#else - return cudnnStatus_t::CUDNN_STATUS_INVALID_VALUE; -#endif case PointwiseMode_t::EXP: -#if (CUDNN_VERSION >= 8300) cudnn_mode = CUDNN_POINTWISE_EXP; return cudnnStatus_t::CUDNN_STATUS_SUCCESS; -#else - return cudnnStatus_t::CUDNN_STATUS_INVALID_VALUE; -#endif case PointwiseMode_t::SUB: -#if (CUDNN_VERSION >= 8300) cudnn_mode = CUDNN_POINTWISE_SUB; return cudnnStatus_t::CUDNN_STATUS_SUCCESS; -#else - return cudnnStatus_t::CUDNN_STATUS_INVALID_VALUE; -#endif case PointwiseMode_t::CMP_EQ: -#if (CUDNN_VERSION >= 8300) cudnn_mode = CUDNN_POINTWISE_CMP_EQ; return cudnnStatus_t::CUDNN_STATUS_SUCCESS; -#else - return cudnnStatus_t::CUDNN_STATUS_INVALID_VALUE; -#endif case PointwiseMode_t::CMP_NEQ: -#if (CUDNN_VERSION >= 8300) cudnn_mode = CUDNN_POINTWISE_CMP_NEQ; return cudnnStatus_t::CUDNN_STATUS_SUCCESS; -#else - return cudnnStatus_t::CUDNN_STATUS_INVALID_VALUE; -#endif case PointwiseMode_t::CMP_GT: -#if (CUDNN_VERSION >= 8300) cudnn_mode = CUDNN_POINTWISE_CMP_GT; return cudnnStatus_t::CUDNN_STATUS_SUCCESS; -#else - return cudnnStatus_t::CUDNN_STATUS_INVALID_VALUE; -#endif case PointwiseMode_t::CMP_GE: -#if (CUDNN_VERSION >= 8300) cudnn_mode = CUDNN_POINTWISE_CMP_GE; return cudnnStatus_t::CUDNN_STATUS_SUCCESS; -#else - return cudnnStatus_t::CUDNN_STATUS_INVALID_VALUE; -#endif case PointwiseMode_t::CMP_LT: -#if (CUDNN_VERSION >= 8300) cudnn_mode = CUDNN_POINTWISE_CMP_LT; return cudnnStatus_t::CUDNN_STATUS_SUCCESS; -#else - return cudnnStatus_t::CUDNN_STATUS_INVALID_VALUE; -#endif case PointwiseMode_t::CMP_LE: -#if (CUDNN_VERSION >= 8300) cudnn_mode = CUDNN_POINTWISE_CMP_LE; return cudnnStatus_t::CUDNN_STATUS_SUCCESS; -#else - return cudnnStatus_t::CUDNN_STATUS_INVALID_VALUE; -#endif case PointwiseMode_t::LOGICAL_AND: -#if (CUDNN_VERSION >= 8300) cudnn_mode = CUDNN_POINTWISE_LOGICAL_AND; return cudnnStatus_t::CUDNN_STATUS_SUCCESS; -#else - return cudnnStatus_t::CUDNN_STATUS_INVALID_VALUE; -#endif case PointwiseMode_t::LOGICAL_OR: -#if (CUDNN_VERSION >= 8300) cudnn_mode = CUDNN_POINTWISE_LOGICAL_OR; return cudnnStatus_t::CUDNN_STATUS_SUCCESS; -#else - return cudnnStatus_t::CUDNN_STATUS_INVALID_VALUE; -#endif case PointwiseMode_t::LOGICAL_NOT: -#if (CUDNN_VERSION >= 8300) cudnn_mode = CUDNN_POINTWISE_LOGICAL_NOT; return cudnnStatus_t::CUDNN_STATUS_SUCCESS; -#else - return cudnnStatus_t::CUDNN_STATUS_INVALID_VALUE; -#endif case PointwiseMode_t::LOG: -#if (CUDNN_VERSION >= 8300) cudnn_mode = CUDNN_POINTWISE_LOG; return cudnnStatus_t::CUDNN_STATUS_SUCCESS; -#else - return cudnnStatus_t::CUDNN_STATUS_INVALID_VALUE; -#endif case PointwiseMode_t::NEG: -#if (CUDNN_VERSION >= 8300) cudnn_mode = 
CUDNN_POINTWISE_NEG; return cudnnStatus_t::CUDNN_STATUS_SUCCESS; -#else - return cudnnStatus_t::CUDNN_STATUS_INVALID_VALUE; -#endif case PointwiseMode_t::MOD: -#if (CUDNN_VERSION >= 8300) cudnn_mode = CUDNN_POINTWISE_MOD; return cudnnStatus_t::CUDNN_STATUS_SUCCESS; -#else - return cudnnStatus_t::CUDNN_STATUS_INVALID_VALUE; -#endif case PointwiseMode_t::POW: -#if (CUDNN_VERSION >= 8300) cudnn_mode = CUDNN_POINTWISE_POW; return cudnnStatus_t::CUDNN_STATUS_SUCCESS; -#else - return cudnnStatus_t::CUDNN_STATUS_INVALID_VALUE; -#endif case PointwiseMode_t::ABS: -#if (CUDNN_VERSION >= 8300) cudnn_mode = CUDNN_POINTWISE_ABS; return cudnnStatus_t::CUDNN_STATUS_SUCCESS; -#else - return cudnnStatus_t::CUDNN_STATUS_INVALID_VALUE; -#endif case PointwiseMode_t::CEIL: -#if (CUDNN_VERSION >= 8300) cudnn_mode = CUDNN_POINTWISE_CEIL; return cudnnStatus_t::CUDNN_STATUS_SUCCESS; -#else - return cudnnStatus_t::CUDNN_STATUS_INVALID_VALUE; -#endif case PointwiseMode_t::COS: -#if (CUDNN_VERSION >= 8300) cudnn_mode = CUDNN_POINTWISE_COS; return cudnnStatus_t::CUDNN_STATUS_SUCCESS; -#else - return cudnnStatus_t::CUDNN_STATUS_INVALID_VALUE; -#endif case PointwiseMode_t::FLOOR: -#if (CUDNN_VERSION >= 8300) cudnn_mode = CUDNN_POINTWISE_FLOOR; return cudnnStatus_t::CUDNN_STATUS_SUCCESS; -#else - return cudnnStatus_t::CUDNN_STATUS_INVALID_VALUE; -#endif case PointwiseMode_t::RSQRT: -#if (CUDNN_VERSION >= 8300) cudnn_mode = CUDNN_POINTWISE_RSQRT; return cudnnStatus_t::CUDNN_STATUS_SUCCESS; -#else - return cudnnStatus_t::CUDNN_STATUS_INVALID_VALUE; -#endif case PointwiseMode_t::SIN: -#if (CUDNN_VERSION >= 8300) cudnn_mode = CUDNN_POINTWISE_SIN; return cudnnStatus_t::CUDNN_STATUS_SUCCESS; -#else - return cudnnStatus_t::CUDNN_STATUS_INVALID_VALUE; -#endif case PointwiseMode_t::TAN: -#if (CUDNN_VERSION >= 8300) cudnn_mode = CUDNN_POINTWISE_TAN; return cudnnStatus_t::CUDNN_STATUS_SUCCESS; -#else - return cudnnStatus_t::CUDNN_STATUS_INVALID_VALUE; -#endif - case PointwiseMode_t::GEN_INDEX: -#if (CUDNN_VERSION >= 8400) cudnn_mode = CUDNN_POINTWISE_GEN_INDEX; return cudnnStatus_t::CUDNN_STATUS_SUCCESS; -#else - return cudnnStatus_t::CUDNN_STATUS_INVALID_VALUE; -#endif case PointwiseMode_t::BINARY_SELECT: -#if (CUDNN_VERSION >= 8400) cudnn_mode = CUDNN_POINTWISE_BINARY_SELECT; return cudnnStatus_t::CUDNN_STATUS_SUCCESS; -#else - return cudnnStatus_t::CUDNN_STATUS_INVALID_VALUE; -#endif - case PointwiseMode_t::ERF: -#if (CUDNN_VERSION >= 8500) cudnn_mode = CUDNN_POINTWISE_ERF; return cudnnStatus_t::CUDNN_STATUS_SUCCESS; -#else - return cudnnStatus_t::CUDNN_STATUS_INVALID_VALUE; -#endif case PointwiseMode_t::IDENTITY: -#if (CUDNN_VERSION >= 8500) cudnn_mode = CUDNN_POINTWISE_IDENTITY; return cudnnStatus_t::CUDNN_STATUS_SUCCESS; -#else - return cudnnStatus_t::CUDNN_STATUS_INVALID_VALUE; -#endif case PointwiseMode_t::GELU_APPROX_TANH_BWD: -#if (CUDNN_VERSION >= 8500) cudnn_mode = CUDNN_POINTWISE_GELU_APPROX_TANH_BWD; return cudnnStatus_t::CUDNN_STATUS_SUCCESS; -#else - return cudnnStatus_t::CUDNN_STATUS_INVALID_VALUE; -#endif case PointwiseMode_t::GELU_APPROX_TANH_FWD: -#if (CUDNN_VERSION >= 8500) cudnn_mode = CUDNN_POINTWISE_GELU_APPROX_TANH_FWD; return cudnnStatus_t::CUDNN_STATUS_SUCCESS; -#else - return cudnnStatus_t::CUDNN_STATUS_INVALID_VALUE; -#endif - case PointwiseMode_t::RECIPROCAL: #if (CUDNN_VERSION >= 8900) + NV_CUDNN_FE_DYNAMIC_CHECK_CUDNN_BACKEND_VERSION(8900, cudnnStatus_t::CUDNN_STATUS_INVALID_VALUE); cudnn_mode = CUDNN_POINTWISE_RECIPROCAL; return cudnnStatus_t::CUDNN_STATUS_SUCCESS; #else @@ -1314,72 +1243,38 @@ 
convert_to_cudnn_type(cudnn_frontend::DescriptorType_t const mode, cudnnBackendD case DescriptorType_t::OPERATION_REDUCTION_DESCRIPTOR: cudnn_mode = CUDNN_BACKEND_OPERATION_REDUCTION_DESCRIPTOR; return cudnnStatus_t::CUDNN_STATUS_SUCCESS; - case DescriptorType_t::OPERATION_BN_BWD_WEIGHTS_DESCRIPTOR: -#if (CUDNN_VERSION >= 8400) cudnn_mode = CUDNN_BACKEND_OPERATION_BN_BWD_WEIGHTS_DESCRIPTOR; return cudnnStatus_t::CUDNN_STATUS_SUCCESS; -#else - return cudnnStatus_t::CUDNN_STATUS_INVALID_VALUE; -#endif case DescriptorType_t::RESAMPLE_DESCRIPTOR: -#if (CUDNN_VERSION >= 8500) cudnn_mode = CUDNN_BACKEND_RESAMPLE_DESCRIPTOR; return cudnnStatus_t::CUDNN_STATUS_SUCCESS; -#else - return cudnnStatus_t::CUDNN_STATUS_INVALID_VALUE; -#endif - case DescriptorType_t::OPERATION_RESAMPLE_FWD_DESCRIPTOR: -#if (CUDNN_VERSION >= 8500) cudnn_mode = CUDNN_BACKEND_OPERATION_RESAMPLE_FWD_DESCRIPTOR; return cudnnStatus_t::CUDNN_STATUS_SUCCESS; -#else - return cudnnStatus_t::CUDNN_STATUS_INVALID_VALUE; -#endif - case DescriptorType_t::OPERATION_RESAMPLE_BWD_DESCRIPTOR: #if (CUDNN_VERSION >= 8600) + NV_CUDNN_FE_DYNAMIC_CHECK_CUDNN_BACKEND_VERSION(8600, cudnnStatus_t::CUDNN_STATUS_INVALID_VALUE); cudnn_mode = CUDNN_BACKEND_OPERATION_RESAMPLE_BWD_DESCRIPTOR; return cudnnStatus_t::CUDNN_STATUS_SUCCESS; #else return cudnnStatus_t::CUDNN_STATUS_INVALID_VALUE; #endif - case DescriptorType_t::OPERATION_CONCAT_DESCRIPTOR: -#if (CUDNN_VERSION >= 8500) cudnn_mode = CUDNN_BACKEND_OPERATION_CONCAT_DESCRIPTOR; return cudnnStatus_t::CUDNN_STATUS_SUCCESS; -#else - return cudnnStatus_t::CUDNN_STATUS_INVALID_VALUE; -#endif - case DescriptorType_t::OPERATION_SIGNAL_DESCRIPTOR: -#if (CUDNN_VERSION >= 8500) cudnn_mode = CUDNN_BACKEND_OPERATION_SIGNAL_DESCRIPTOR; return cudnnStatus_t::CUDNN_STATUS_SUCCESS; -#else - return cudnnStatus_t::CUDNN_STATUS_INVALID_VALUE; -#endif - case DescriptorType_t::OPERATION_NORM_FORWARD_DESCRIPTOR: -#if (CUDNN_VERSION >= 8500) cudnn_mode = CUDNN_BACKEND_OPERATION_NORM_FORWARD_DESCRIPTOR; return cudnnStatus_t::CUDNN_STATUS_SUCCESS; -#else - return cudnnStatus_t::CUDNN_STATUS_INVALID_VALUE; -#endif - case DescriptorType_t::OPERATION_NORM_BACKWARD_DESCRIPTOR: -#if (CUDNN_VERSION >= 8500) cudnn_mode = CUDNN_BACKEND_OPERATION_NORM_BACKWARD_DESCRIPTOR; return cudnnStatus_t::CUDNN_STATUS_SUCCESS; -#else - return cudnnStatus_t::CUDNN_STATUS_INVALID_VALUE; -#endif - case DescriptorType_t::OPERATION_RESHAPE_DESCRIPTOR: #if (CUDNN_VERSION >= 8700) + NV_CUDNN_FE_DYNAMIC_CHECK_CUDNN_BACKEND_VERSION(8700, cudnnStatus_t::CUDNN_STATUS_INVALID_VALUE); cudnn_mode = CUDNN_BACKEND_OPERATION_RESHAPE_DESCRIPTOR; return cudnnStatus_t::CUDNN_STATUS_SUCCESS; #else @@ -1388,6 +1283,7 @@ convert_to_cudnn_type(cudnn_frontend::DescriptorType_t const mode, cudnnBackendD case DescriptorType_t::RNG_DESCRIPTOR: #if (CUDNN_VERSION >= 8700) + NV_CUDNN_FE_DYNAMIC_CHECK_CUDNN_BACKEND_VERSION(8700, cudnnStatus_t::CUDNN_STATUS_INVALID_VALUE); cudnn_mode = CUDNN_BACKEND_RNG_DESCRIPTOR; return cudnnStatus_t::CUDNN_STATUS_SUCCESS; #else @@ -1396,6 +1292,7 @@ convert_to_cudnn_type(cudnn_frontend::DescriptorType_t const mode, cudnnBackendD case DescriptorType_t::OPERATION_RNG_DESCRIPTOR: #if (CUDNN_VERSION >= 8700) + NV_CUDNN_FE_DYNAMIC_CHECK_CUDNN_BACKEND_VERSION(8700, cudnnStatus_t::CUDNN_STATUS_INVALID_VALUE); cudnn_mode = CUDNN_BACKEND_OPERATION_RNG_DESCRIPTOR; return cudnnStatus_t::CUDNN_STATUS_SUCCESS; #else @@ -1410,7 +1307,6 @@ convert_to_cudnn_type(cudnn_frontend::DescriptorType_t const mode, cudnnBackendD return 
cudnnStatus_t::CUDNN_STATUS_INVALID_VALUE; } -#if (CUDNN_VERSION >= 8500) static inline cudnnStatus_t convert_to_cudnn_type(cudnn_frontend::ResampleMode_t const mode, cudnnResampleMode_t& cudnn_mode) { switch (mode) { @@ -1466,7 +1362,6 @@ convert_to_cudnn_type(cudnn_frontend::PaddingMode_t const mode, cudnnPaddingMode static inline cudnnStatus_t convert_to_cudnn_type(cudnn_frontend::NormMode_t const mode, cudnnBackendNormMode_t& cudnn_mode) { switch (mode) { -#if (CUDNN_VERSION >= 8500) case NormMode_t::LAYER_NORM: cudnn_mode = CUDNN_LAYER_NORM; return cudnnStatus_t::CUDNN_STATUS_SUCCESS; @@ -1479,10 +1374,10 @@ convert_to_cudnn_type(cudnn_frontend::NormMode_t const mode, cudnnBackendNormMod case NormMode_t::GROUP_NORM: cudnn_mode = CUDNN_GROUP_NORM; return cudnnStatus_t::CUDNN_STATUS_SUCCESS; -#endif #if (CUDNN_VERSION >= 8906) case NormMode_t::RMS_NORM: + NV_CUDNN_FE_DYNAMIC_CHECK_CUDNN_BACKEND_VERSION(8906, cudnnStatus_t::CUDNN_STATUS_INVALID_VALUE); cudnn_mode = CUDNN_RMS_NORM; return cudnnStatus_t::CUDNN_STATUS_SUCCESS; #endif @@ -1498,14 +1393,12 @@ convert_to_cudnn_type(cudnn_frontend::NormMode_t const mode, cudnnBackendNormMod static inline cudnnStatus_t convert_to_cudnn_type(cudnn_frontend::NormFwdPhase_t const mode, cudnnBackendNormFwdPhase_t& cudnn_mode) { switch (mode) { -#if (CUDNN_VERSION >= 8500) case NormFwdPhase_t::INFERENCE: cudnn_mode = CUDNN_NORM_FWD_INFERENCE; return cudnnStatus_t::CUDNN_STATUS_SUCCESS; case NormFwdPhase_t::TRAINING: cudnn_mode = CUDNN_NORM_FWD_TRAINING; return cudnnStatus_t::CUDNN_STATUS_SUCCESS; -#endif #ifndef NO_DEFAULT_IN_SWITCH default: @@ -1574,7 +1467,6 @@ static inline void convert_from_cudnn_type(cudnnBackendNormMode_t const cudnn_mode, cudnn_frontend::NormMode_t& mode) { mode = NormMode_t::NOT_SET; switch (cudnn_mode) { -#if (CUDNN_VERSION >= 8500) case CUDNN_LAYER_NORM: mode = NormMode_t::LAYER_NORM; break; @@ -1587,7 +1479,6 @@ convert_from_cudnn_type(cudnnBackendNormMode_t const cudnn_mode, cudnn_frontend: case CUDNN_GROUP_NORM: mode = NormMode_t::GROUP_NORM; break; -#endif #if (CUDNN_VERSION >= 8906) case CUDNN_RMS_NORM: @@ -1607,14 +1498,12 @@ static inline void convert_from_cudnn_type(cudnnBackendNormFwdPhase_t const cudnn_mode, cudnn_frontend::NormFwdPhase_t& mode) { mode = NormFwdPhase_t::NOT_SET; switch (cudnn_mode) { -#if (CUDNN_VERSION >= 8500) case CUDNN_NORM_FWD_INFERENCE: mode = NormFwdPhase_t::INFERENCE; break; case CUDNN_NORM_FWD_TRAINING: mode = NormFwdPhase_t::TRAINING; break; -#endif #ifndef NO_DEFAULT_IN_SWITCH default: @@ -1623,9 +1512,6 @@ convert_from_cudnn_type(cudnnBackendNormFwdPhase_t const cudnn_mode, cudnn_front } } -#endif - -#if (CUDNN_VERSION >= 8300) static inline cudnnStatus_t convert_to_cudnn_type(cudnn_frontend::TensorReordering_t const mode, cudnnBackendTensorReordering_t& cudnn_mode) { switch (mode) { @@ -1637,6 +1523,17 @@ convert_to_cudnn_type(cudnn_frontend::TensorReordering_t const mode, cudnnBacken return cudnnStatus_t::CUDNN_STATUS_SUCCESS; case cudnn_frontend::TensorReordering_t::F16x16: #if CUDNN_VERSION >= 8800 +#if defined NV_CUDNN_FRONTEND_USE_DYNAMIC_LOADING + if (get_backend_version() >= 8800) { + cudnn_mode = CUDNN_TENSOR_REORDERING_F16x16; + return cudnnStatus_t::CUDNN_STATUS_SUCCESS; + } else if (get_backend_version() >= 8700) { + cudnn_mode = CUDNN_TENSOR_REORDERING_NONE; + return cudnnStatus_t::CUDNN_STATUS_SUCCESS; + } else { + return cudnnStatus_t::CUDNN_STATUS_INVALID_VALUE; + } +#endif cudnn_mode = CUDNN_TENSOR_REORDERING_F16x16; return cudnnStatus_t::CUDNN_STATUS_SUCCESS; #elif 
CUDNN_VERSION >= 8700 @@ -1673,8 +1570,6 @@ convert_from_cudnn_type(cudnnBackendTensorReordering_t const cudnn_mode, cudnn_f } } -#endif - // To be deprecated. Only exists as OperationBuilder_v8(::cudnnBackendDescriptorType_t mode) requires it. static inline cudnn_frontend::DescriptorType_t convert_from_cudnn_type(cudnnBackendDescriptorType_t const cudnn_mode) { @@ -1725,13 +1620,8 @@ convert_from_cudnn_type(cudnnBackendDescriptorType_t const cudnn_mode) { return DescriptorType_t::REDUCTION_DESCRIPTOR; case CUDNN_BACKEND_OPERATION_REDUCTION_DESCRIPTOR: return DescriptorType_t::OPERATION_REDUCTION_DESCRIPTOR; - -#if (CUDNN_VERSION >= 8400) case CUDNN_BACKEND_OPERATION_BN_BWD_WEIGHTS_DESCRIPTOR: return DescriptorType_t::OPERATION_BN_BWD_WEIGHTS_DESCRIPTOR; -#endif - -#if (CUDNN_VERSION >= 8500) case CUDNN_BACKEND_RESAMPLE_DESCRIPTOR: return DescriptorType_t::RESAMPLE_DESCRIPTOR; case CUDNN_BACKEND_OPERATION_RESAMPLE_FWD_DESCRIPTOR: @@ -1744,13 +1634,10 @@ convert_from_cudnn_type(cudnnBackendDescriptorType_t const cudnn_mode) { return DescriptorType_t::OPERATION_NORM_FORWARD_DESCRIPTOR; case CUDNN_BACKEND_OPERATION_NORM_BACKWARD_DESCRIPTOR: return DescriptorType_t::OPERATION_NORM_BACKWARD_DESCRIPTOR; -#endif - #if (CUDNN_VERSION >= 8600) case CUDNN_BACKEND_OPERATION_RESAMPLE_BWD_DESCRIPTOR: return DescriptorType_t::OPERATION_RESAMPLE_BWD_DESCRIPTOR; #endif - #if (CUDNN_VERSION >= 8700) case CUDNN_BACKEND_OPERATION_RESHAPE_DESCRIPTOR: return DescriptorType_t::OPERATION_RESHAPE_DESCRIPTOR; @@ -1811,8 +1698,6 @@ convert_from_cudnn_type(cudnnPointwiseMode_t const cudnn_mode) { return PointwiseMode_t::SOFTPLUS_BWD; case CUDNN_POINTWISE_SWISH_BWD: return PointwiseMode_t::SWISH_BWD; - -#if (CUDNN_VERSION >= 8300) case CUDNN_POINTWISE_DIV: return PointwiseMode_t::DIV; case CUDNN_POINTWISE_ADD_SQUARE: @@ -1861,16 +1746,10 @@ convert_from_cudnn_type(cudnnPointwiseMode_t const cudnn_mode) { return PointwiseMode_t::SIN; case CUDNN_POINTWISE_TAN: return PointwiseMode_t::TAN; -#endif - -#if (CUDNN_VERSION >= 8400) case CUDNN_POINTWISE_GEN_INDEX: return PointwiseMode_t::GEN_INDEX; case CUDNN_POINTWISE_BINARY_SELECT: return PointwiseMode_t::BINARY_SELECT; -#endif - -#if (CUDNN_VERSION >= 8500) case CUDNN_POINTWISE_ERF: return PointwiseMode_t::ERF; case CUDNN_POINTWISE_IDENTITY: @@ -1879,8 +1758,6 @@ convert_from_cudnn_type(cudnnPointwiseMode_t const cudnn_mode) { return PointwiseMode_t::GELU_APPROX_TANH_BWD; case CUDNN_POINTWISE_GELU_APPROX_TANH_FWD: return PointwiseMode_t::GELU_APPROX_TANH_FWD; -#endif - #if (CUDNN_VERSION >= 8900) case CUDNN_POINTWISE_RECIPROCAL: return PointwiseMode_t::RECIPROCAL; @@ -1920,10 +1797,8 @@ convert_from_cudnn_type(cudnnDataType_t const cudnn_mode) { return DataType_t::BFLOAT16; case CUDNN_DATA_INT64: return DataType_t::INT64; -#if (CUDNN_VERSION >= 8300) case CUDNN_DATA_BOOLEAN: return DataType_t::BOOLEAN; -#endif #if (CUDNN_VERSION >= 8600) case CUDNN_DATA_FP8_E4M3: return DataType_t::FP8_E4M3; @@ -1975,6 +1850,8 @@ convert_from_cudnn_type(cudnnReduceTensorOp_t const cudnn_mode) { #if (CUDNN_VERSION >= 8700) static inline cudnnStatus_t convert_to_cudnn_type(cudnn_frontend::RngDistribution_t const mode, cudnnRngDistribution_t& cudnn_mode) { + NV_CUDNN_FE_DYNAMIC_CHECK_CUDNN_BACKEND_VERSION(8700, cudnnStatus_t::CUDNN_STATUS_INVALID_VALUE); + switch (mode) { case RngDistribution_t::BERNOULLI: cudnn_mode = CUDNN_RNG_DISTRIBUTION_BERNOULLI; diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..edd73b2 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,25 @@ 
+[build-system] +requires = ["setuptools>=64", "cmake>=3.17", "ninja", "pybind11[global]"] +build-backend = "setuptools.build_meta" + +[project] +name = "nvidia-cudnn-frontend" +dynamic = ["version"] +description = "CUDNN FrontEnd python library" +readme = "README.md" +requires-python = ">=3.7" +license = {file = "LICENSE.txt"} +classifiers = [ + "Programming Language :: Python :: 3", +] + +[tool.setuptools] +packages = ["cudnn"] +package-dir = {"" = "python"} + +[project.urls] +"Homepage" = "https://github.com/nvidia/cudnn-frontend" +"Bug Tracker" = "https://github.com/nvidia/cudnn-frontend/issues" + +[tool.setuptools.dynamic] +version = {attr = "cudnn.__version__"} \ No newline at end of file
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt new file mode 100644 index 0000000..2229e89 --- /dev/null +++ b/python/CMakeLists.txt @@ -0,0 +1,82 @@ +cmake_minimum_required(VERSION 3.18) + +Include(FetchContent) + +# Fetch and build dlpack +set(CMAKE_POLICY_DEFAULT_CMP0077 NEW) +set(BUILD_MOCK OFF) +FetchContent_Declare( + dlpack + GIT_REPOSITORY https://github.com/dmlc/dlpack + GIT_TAG v0.8 +) +FetchContent_MakeAvailable(dlpack) + +find_package(Python 3.8 COMPONENTS Interpreter Development.Module REQUIRED) + +option(CUDNN_FRONTEND_FETCH_PYBINDS_IN_CMAKE "Whether cmake build system should fetch pybinds." ON) +if(CUDNN_FRONTEND_FETCH_PYBINDS_IN_CMAKE) + FetchContent_Declare( + pybind11 + GIT_REPOSITORY https://github.com/pybind/pybind11 + GIT_TAG v2.11.1 + ) + FetchContent_MakeAvailable(pybind11) +else() + find_package(pybind11 CONFIG REQUIRED) +endif() +# Add a library using FindPython's tooling (pybind11 also provides a helper like +# this) +python_add_library( + _compiled_module + + MODULE + pycudnn.cpp + properties.cpp + + pygraph/pygraph.cpp + pygraph/norm.cpp + pygraph/sdpa.cpp + pygraph/pointwise.cpp + + WITH_SOABI +) +target_link_libraries(_compiled_module PRIVATE pybind11::headers) + +target_compile_features(_compiled_module PRIVATE cxx_std_17) + +target_include_directories( + _compiled_module + PRIVATE $ +) + +target_compile_definitions(_compiled_module PRIVATE NV_CUDNN_FRONTEND_USE_DYNAMIC_LOADING) + +target_link_libraries( + _compiled_module + + PRIVATE dlpack +) + +set_target_properties( + _compiled_module + + PROPERTIES + LINK_FLAGS "-Wl,--no-as-needed" + LINK_FLAGS "-Wl,--enable-new-dtags" + LINK_FLAGS "-Wl,-rpath,'$ORIGIN',-rpath,'$ORIGIN/../lib',-rpath,'$ORIGIN/../nvidia/cudnn/lib'" + LINK_WHAT_YOU_USE TRUE +) + +# Using python bindings directly with the cmake build system is not supported. +# Temporarily use the parameter below. +option(CUDNN_FRONTEND_KEEP_PYBINDS_IN_BINARY_DIR "Whether pybinds should be kept inside build/cudnn directory." ON)
+
+def _library_device_pointer(input_tensor):
+    # either pass in pointers directly
+    if type(input_tensor) is int:
+        return input_tensor
+    # directly extract data pointer for torch tensors
+    elif _is_torch_tensor(input_tensor):
+        return input_tensor.data_ptr()
+    # fall back to dlpack support by library
+    else:
+        return _compiled_module._get_data_ptr(input_tensor)
+
+def _execute(
+    self,
+    tensor_to_device_buffer,
+    workspace,
+    handle = None
+):
+    """
+    Execute a cudnn graph.
+
+    Args:
+        tensor_to_device_buffer (dict(cudnn_tensor, Union[torch.Tensor, int, __dlpack__])): Mapping of graph tensors (or their uids) to device buffers.
+        workspace (Union[torch.Tensor, int, __dlpack__]): Workspace buffer of at least get_workspace_size() bytes.
+        handle: cudnn_handle created with cudnn.create_handle()
+    Returns:
+        None
+    """
+    uid_to_tensor_pointer = {
+        x if type(x) is int else x.get_uid() : _library_device_pointer(pointer)
+        for x, pointer in tensor_to_device_buffer.items() if x is not None
+    }
+
+    workspace_pointer = _library_device_pointer(workspace)
+    self._execute(uid_to_tensor_pointer, workspace_pointer, handle)
+
+def _execute_plan_at_index(
+    self,
+    tensor_to_device_buffer,
+    workspace,
+    index,
+    handle = None
+):
+    """
+    Execute a cudnn graph using the plan at the given index.
+
+    Args:
+        tensor_to_device_buffer (dict(cudnn_tensor, Union[torch.Tensor, int, __dlpack__])): Mapping of graph tensors (or their uids) to device buffers.
+        workspace (Union[torch.Tensor, int, __dlpack__]): Workspace buffer of at least get_workspace_size() bytes.
+        index(int): Location of execution plan to use.
+        handle: cudnn_handle created with cudnn.create_handle()
+    Returns:
+        None
+    """
+    uid_to_tensor_pointer = {
+        x if type(x) is int else x.get_uid() : _library_device_pointer(pointer)
+        for x, pointer in tensor_to_device_buffer.items() if x is not None
+    }
+
+    workspace_pointer = _library_device_pointer(workspace)
+    self._execute_plan_at_index(uid_to_tensor_pointer, workspace_pointer, index, handle)
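+
+# Execution sketch (illustrative): keys may be cudnn tensors or integer uids,
+# values may be torch tensors, raw device pointers (int), or any __dlpack__
+# object, e.g.
+#
+#     workspace = torch.empty(g.get_workspace_size(), device="cuda", dtype=torch.uint8)
+#     g.execute({a: a_gpu, c: c_gpu}, workspace)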
+
+pygraph.execute = _execute
+pygraph.execute_plan_at_index = _execute_plan_at_index
\ No newline at end of file
diff --git a/python/cudnn/datatypes.py b/python/cudnn/datatypes.py
new file mode 100644
index 0000000..c8a9452
--- /dev/null
+++ b/python/cudnn/datatypes.py
@@ -0,0 +1,68 @@
+from ._compiled_module import data_type as cudnn_data_type
+
+torch_available = None
+_torch_to_cudnn_data_type_dict = None
+
+def is_torch_available():
+    global torch_available, _torch_to_cudnn_data_type_dict
+    # this condition ensures that datatype mapping is only created once
+    if torch_available is None:
+        try:
+            import torch
+
+            torch_available = True
+            _torch_to_cudnn_data_type_dict = {
+                torch.half: cudnn_data_type.HALF,
+                torch.float16: cudnn_data_type.HALF,
+                torch.bfloat16: cudnn_data_type.BFLOAT16,
+                torch.float: cudnn_data_type.FLOAT,
+                torch.float32: cudnn_data_type.FLOAT,
+                torch.double: cudnn_data_type.DOUBLE,
+                torch.float64: cudnn_data_type.DOUBLE,
+                torch.int8: cudnn_data_type.INT8,
+                torch.int32: cudnn_data_type.INT32,
+                torch.int64: cudnn_data_type.INT64,
+                torch.uint8: cudnn_data_type.UINT8,
+                torch.bool: cudnn_data_type.BOOLEAN,
+            }
+
+            def possibly_add_type(torch_type_name, cudnn_type):
+                # Only try adding the type if the version of torch being used supports it
+                if hasattr(torch, torch_type_name):
+                    torch_type = getattr(torch, torch_type_name)
+                    _torch_to_cudnn_data_type_dict[torch_type] = cudnn_type
+
+            possibly_add_type("float8_e4m3fn", cudnn_data_type.FP8_E4M3)
+            possibly_add_type("float8_e5m2", cudnn_data_type.FP8_E5M2)
+
+        except ImportError:
+            torch_available = False
+            _torch_to_cudnn_data_type_dict = {}
+    return torch_available
+
+# Returns None in case mapping is not available
+def _torch_to_cudnn_data_type(torch_data_type) -> cudnn_data_type:
+    if is_torch_available():
+        return _torch_to_cudnn_data_type_dict.get(torch_data_type, None)
+    else:
+        return None
+
+def _library_type(input_type):
+    if type(input_type) is cudnn_data_type:
+        return input_type
+
+    for cvt_fn in [
+        _torch_to_cudnn_data_type,
+        # Add more DL libraries to support here
+    ]:
+        out = cvt_fn(input_type)
+        if out is not None:
+            return out
+
+    raise Exception(f"No available conversion from type {input_type} to a library type.")
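+
+# For example (sketch): _library_type(torch.float16) resolves through the
+# table above to cudnn HALF, while an already-converted cudnn enum is
+# returned unchanged; unmapped types raise.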
+
+def _is_torch_tensor(input_tensor) -> bool:
+    if is_torch_available():
+        import torch
+        return isinstance(input_tensor, torch.Tensor)
+    return False
\ No newline at end of file
diff --git a/python_bindings/properties.cpp b/python/properties.cpp
similarity index 88%
rename from python_bindings/properties.cpp
rename to python/properties.cpp
index 5fd700b..60d97a2 100644
--- a/python_bindings/properties.cpp
+++ b/python/properties.cpp
@@ -18,30 +18,33 @@ throw_if(bool const cond, cudnn_frontend::error_code_t const error_code, std::st
 
 class HandleManagement {
    public:
-    static void*
+    static std::intptr_t
     create_handle() {
         cudnnHandle_t handle;
-        cudnnCreate(&handle);
-        return (void*)handle;
+        cudnn_frontend::create_handle(&handle);
+        return reinterpret_cast<std::intptr_t>(handle);
     }
 
     static void
-    destroy_handle(void* handle) {
-        auto status = cudnnDestroy((cudnnHandle_t)handle);
+    destroy_handle(std::intptr_t handle) {
+        auto status = cudnn_frontend::destroy_handle((cudnnHandle_t)handle);
         throw_if(
             status != CUDNN_STATUS_SUCCESS, cudnn_frontend::error_code_t::HANDLE_ERROR, "cudnnHandle Destroy failed");
     }
 
     static void
-    set_stream(void* handle, void* stream) {
-        auto status = cudnnSetStream((cudnnHandle_t)handle, (cudaStream_t)stream);
+    set_stream(std::intptr_t handle, std::intptr_t stream) {
+        auto status = cudnn_frontend::set_stream((cudnnHandle_t)handle, (cudaStream_t)stream);
         throw_if(status != CUDNN_STATUS_SUCCESS, cudnn_frontend::error_code_t::HANDLE_ERROR, "cudnnSetStream failed");
     }
 
-    static void
-    get_stream(void* handle, void* streamId) {
-        auto status = cudnnGetStream((cudnnHandle_t)handle, (cudaStream_t*)streamId);
+    static std::intptr_t
+    get_stream(std::intptr_t handle) {
+        cudaStream_t streamId = nullptr;
+        auto status = cudnn_frontend::get_stream((cudnnHandle_t)handle, &streamId);
         throw_if(status != CUDNN_STATUS_SUCCESS, cudnn_frontend::error_code_t::HANDLE_ERROR, "cudnnGetStream failed");
+
+        return reinterpret_cast<std::intptr_t>(streamId);
+    }
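+
+    // Handles and streams cross the Python boundary as plain integers
+    // (std::intptr_t): Python can store them and pass them back to
+    // set_stream()/destroy_handle() or to pygraph(handle=...).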
 };
 
@@ -71,7 +74,7 @@ init_properties(py::module_& m) {
         .def("get_name", &cudnn_frontend::graph::Tensor_attributes::get_name)
         .def("set_name", &cudnn_frontend::graph::Tensor_attributes::set_name)
         .def("get_data_type", &cudnn_frontend::graph::Tensor_attributes::get_data_type)
-        .def("set_data_type", &cudnn_frontend::graph::Tensor_attributes::set_data_type)
+        .def("_set_data_type", &cudnn_frontend::graph::Tensor_attributes::set_data_type)
         .def("get_dim", &cudnn_frontend::graph::Tensor_attributes::get_dim)
         .def("set_dim", &cudnn_frontend::graph::Tensor_attributes::set_dim)
         .def("get_stride", &cudnn_frontend::graph::Tensor_attributes::get_stride)
@@ -100,11 +103,7 @@ init_properties(py::module_& m) {
     m.def("create_handle", &HandleManagement::create_handle);
     m.def("destroy_handle", &HandleManagement::destroy_handle);
     m.def("get_stream", &HandleManagement::get_stream);
-    m.def(
-        "set_stream",
-        [](void* handle, int64_t stream) { return HandleManagement::set_stream(handle, (void*)stream); },
-        py::arg("handle"),
-        py::arg("stream"));
+    m.def("set_stream", &HandleManagement::set_stream, py::arg("handle"), py::arg("stream"));
 
     py::enum_<cudnn_frontend::NormFwdPhase_t>(m, "norm_forward_phase")
         .value("INFERENCE", cudnn_frontend::NormFwdPhase_t::INFERENCE)
diff --git a/python_bindings/pycudnn.cpp b/python/pycudnn.cpp
similarity index 95%
rename from python_bindings/pycudnn.cpp
rename to python/pycudnn.cpp
index e73850a..b837247 100644
--- a/python_bindings/pycudnn.cpp
+++ b/python/pycudnn.cpp
@@ -58,8 +58,8 @@ init_pygraph_submodule(py::module_ &);
 void
 init_properties(py::module_ &);
 
-PYBIND11_MODULE(cudnn, m) {
-    m.def("backend_version", &cudnnGetVersion);
+PYBIND11_MODULE(_compiled_module, m) {
+    m.def("backend_version", &cudnn_frontend::get_backend_version);
 
     init_properties(m);
     init_pygraph_submodule(m);
diff --git a/python_bindings/pygraph/norm.cpp b/python/pygraph/norm.cpp
similarity index 100%
rename from python_bindings/pygraph/norm.cpp
rename to python/pygraph/norm.cpp
diff --git a/python_bindings/pygraph/pointwise.cpp b/python/pygraph/pointwise.cpp
similarity index 100%
rename from python_bindings/pygraph/pointwise.cpp
rename to python/pygraph/pointwise.cpp
diff --git a/python_bindings/pygraph/pygraph.cpp b/python/pygraph/pygraph.cpp
similarity index 88%
rename from python_bindings/pygraph/pygraph.cpp
rename to python/pygraph/pygraph.cpp
index 2c516b7..0a88b66 100644
--- a/python_bindings/pygraph/pygraph.cpp
+++ b/python/pygraph/pygraph.cpp
@@ -77,30 +77,6 @@ convert_to_cudnn_data_type(const DLDataType& dtype) {
     return cudnn_frontend::DataType_t::NOT_SET;
 }
 
-char*
-extract_data_pointer(py::object const& obj) {
-    throw_if(!py::hasattr(obj, "__dlpack__"),
-             cudnn_frontend::error_code_t::INVALID_VARIANT_PACK,
-             "Object does not have the __dlpack__() method");
-
-    py::capsule capsule = obj.attr("__dlpack__")();
-    throw_if(capsule.is_none(),
-             cudnn_frontend::error_code_t::INVALID_VARIANT_PACK,
-             "Failed to retrieve the DLPack capsule.");
-
-    DLManagedTensor* managed =
-        static_cast<DLManagedTensor*>(PyCapsule_GetPointer(capsule.ptr(), CUDNN_FRONTEND_DLPACK_CAPSULE_NAME));
-    throw_if(managed == nullptr, cudnn_frontend::error_code_t::INVALID_VARIANT_PACK, "Invalid DLPack capsule.");
-
-    DLDeviceType device_type = managed->dl_tensor.device.device_type;
-    throw_if(
-        device_type != kDLCPU && device_type != kDLCUDAHost && device_type != kDLCUDA && device_type != kDLCUDAManaged,
-        cudnn_frontend::error_code_t::INVALID_VARIANT_PACK,
-        "Invalid device type.");
-
-    return (char*)managed->dl_tensor.data + managed->dl_tensor.byte_offset;
-}
-
 std::shared_ptr<cudnn_frontend::graph::Tensor_attributes>
 PyGraph::tensor(std::vector<int64_t> const& dim,
                 std::vector<int64_t> const& stride,
@@ -126,6 +102,32 @@ PyGraph::tensor_like(std::shared_ptr c
     return graph.tensor_like(tensor, name);
 }
 
+static std::intptr_t
+extract_data_pointer(py::object const& obj) {
+    throw_if(!py::hasattr(obj, "__dlpack__"),
+             cudnn_frontend::error_code_t::INVALID_VARIANT_PACK,
+             "Object does not have the __dlpack__() method");
+
+    py::capsule capsule = obj.attr("__dlpack__")();
+    throw_if(capsule.is_none(),
+             cudnn_frontend::error_code_t::INVALID_VARIANT_PACK,
+             "Failed to retrieve the DLPack capsule.");
+
+    DLManagedTensor* managed =
+        static_cast<DLManagedTensor*>(PyCapsule_GetPointer(capsule.ptr(), CUDNN_FRONTEND_DLPACK_CAPSULE_NAME));
+    throw_if(managed == nullptr, cudnn_frontend::error_code_t::INVALID_VARIANT_PACK, "Invalid DLPack capsule.");
+
+    DLDeviceType device_type = managed->dl_tensor.device.device_type;
+    throw_if(
+        device_type != kDLCPU && device_type != kDLCUDAHost && device_type != kDLCUDA && device_type != kDLCUDAManaged,
+        cudnn_frontend::error_code_t::INVALID_VARIANT_PACK,
+        "Invalid device type.");
+
+    void* p = (char*)managed->dl_tensor.data + managed->dl_tensor.byte_offset;
+    auto result = reinterpret_cast<std::intptr_t>(p);
+    return result;
+}
+
 std::shared_ptr<cudnn_frontend::graph::Tensor_attributes>
 PyGraph::tensor_like(py::object const& pyobj) {
     throw_if(!py::hasattr(pyobj, "__dlpack__"),
@@ -298,6 +300,12 @@ PyGraph::build_plans(BuildPlanPolicy_t const policy) {
     throw_if(status.is_bad(), status.get_code(), status.get_message());
 }
 
+void
+PyGraph::build_plan_at_index(int64_t const index) {
+    auto status = graph.build_plan_at_index(handle, index);
+    throw_if(status.is_bad(), status.get_code(), status.get_message());
+}
+
 void
 PyGraph::build(std::vector<cudnn_frontend::HeurMode_t> const& modes) {
     validate();
@@ -318,38 +326,54 @@ PyGraph::get_workspace_size() {
     return graph.get_workspace_size();
 }
 
+std::vector<uint8_t>
+PyGraph::serialize() const {
+    std::vector<uint8_t> data;
+    auto status = graph.serialize(data);
+    throw_if(status.is_bad(), status.get_code(), status.get_message());
+    return data;
+}
+
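+// Note: a deserialized graph is used directly for execution in the
+// serialization sample (deserialize() followed by execute()), with no
+// re-run of the validate/build steps expected in between.
+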
 void
-PyGraph::execute(std::unordered_map<std::shared_ptr<cudnn_frontend::graph::Tensor_attributes>, py::object> var_pack,
-                 py::object workspace) {
-    std::unordered_map<std::shared_ptr<cudnn_frontend::graph::Tensor_attributes>, void*> var_pack_;
-    for (auto const& [tensor, pyobject] : var_pack) {
-        // Its alright for the user to pass in None objects as key
-        // FE will just ignore them
-        if (tensor) {
-            var_pack_.emplace(tensor, extract_data_pointer(pyobject));
-        }
+PyGraph::deserialize(std::vector<uint8_t> const& data) {
+    auto status = graph.deserialize(handle, data);
+    throw_if(status.is_bad(), status.get_code(), status.get_message());
+}
+
+void
+PyGraph::execute(std::unordered_map<int64_t, std::intptr_t> var_pack,
+                 std::intptr_t workspace,
+                 std::optional<std::intptr_t> exec_handle) {
+    std::unordered_map<int64_t, void*> var_pack_;
+    for (auto const& [uid, device_pointer] : var_pack) {
+        var_pack_.emplace(uid, (void*)device_pointer);
     }
-    void* workspace_ptr = extract_data_pointer(workspace);
+    auto workspace_ptr = (void*)workspace;
 
-    // TODO: Probably concatenate in a macro?
-    auto status = graph.execute(handle, var_pack_, workspace_ptr);
+    cudnnHandle_t handle_ = exec_handle.has_value() ? static_cast<cudnnHandle_t>((void*)(exec_handle.value())) : handle;
+
+    auto status = graph.execute(handle_, var_pack_, workspace_ptr);
     throw_if(status.is_bad(), status.get_code(), status.get_message());
 
     return;
 }
 
 void
-PyGraph::execute(std::unordered_map<int64_t, py::object> var_pack, py::object workspace) {
+PyGraph::execute_plan_at_index(std::unordered_map<int64_t, std::intptr_t> var_pack,
+                               std::intptr_t workspace,
+                               int64_t index,
+                               std::optional<std::intptr_t> exec_handle) {
     std::unordered_map<int64_t, void*> var_pack_;
-    for (auto const& [uid, pyobject] : var_pack) {
-        var_pack_.emplace(uid, extract_data_pointer(pyobject));
+    for (auto const& [uid, device_pointer] : var_pack) {
+        var_pack_.emplace(uid, (void*)device_pointer);
     }
-    void* workspace_ptr = extract_data_pointer(workspace);
+    auto workspace_ptr = (void*)workspace;
+
+    cudnnHandle_t handle_ = exec_handle.has_value() ? static_cast<cudnnHandle_t>((void*)(exec_handle.value())) : handle;
 
-    // TODO: Probably concatenate in a macro?
-    auto status = graph.execute(handle, var_pack_, workspace_ptr);
+    auto status = graph.execute_plan_at_index(handle_, var_pack_, workspace_ptr, index);
     throw_if(status.is_bad(), status.get_code(), status.get_message());
 
     return;
@@ -368,19 +392,19 @@ init_pygraph_submodule(py::module_& m) {
                       cudnn_frontend::DataType_t,
                       cudnn_frontend::DataType_t,
                       cudnn_frontend::DataType_t,
-                      void*>(),
+                      std::optional<std::intptr_t>>(),
             py::arg_v("name", "test_graph"),
            py::arg_v("io_data_type", cudnn_frontend::DataType_t::NOT_SET),
            py::arg_v("intermediate_data_type", cudnn_frontend::DataType_t::NOT_SET),
            py::arg_v("compute_data_type", cudnn_frontend::DataType_t::NOT_SET),
-            py::arg_v("handle", nullptr))
+            py::arg_v("handle", std::nullopt))
        .def("tensor_like",
             py::overload_cast<std::shared_ptr<cudnn_frontend::graph::Tensor_attributes> const&, std::string const&>(
                 &PyGraph::tensor_like),
             py::arg("input"),
             py::arg_v("name", ""))
        .def("tensor_like", py::overload_cast<py::object const&>(&PyGraph::tensor_like))
-        .def("tensor",
+        .def("_make_tensor",
             &PyGraph::tensor,
             py::arg{"dim"},
             py::arg{"stride"},
@@ -388,22 +412,7 @@ init_pygraph_submodule(py::module_& m) {
             py::arg_v{"is_virtual", false},
             py::arg_v{"is_pass_by_value", false},
             py::arg_v{"ragged_offset", nullptr},
-             py::arg_v("name", ""),
-             R"pbdoc(
-                Create a tensor.
-
-                Args:
-                    dim (List[int]): The dimensions of the tensor.
-                    stride (List[int]): The strides of the tensor.
-                    data_type (cudnn.data_type): The data type of the tensor. Default is cudnn.data_type.NOT_SET.
-                    is_virtual (bool): Flag indicating if the tensor is virtual. Default is False.
-                    is_pass_by_value (bool): Flag indicating if the tensor is passed by value. Default is False.
-                    ragged_offset (cudnn_tensor): The ragged offset tensor. Default is nullptr.
-                    name (Optional[str]): The name of the tensor.
-
-                Returns:
-                    cudnn_tensor: The created tensor.
-            )pbdoc")
+             py::arg_v("name", ""))
        .def("genstats",
             &PyGraph::genstats,
             py::arg("input"),
@@ -591,15 +600,34 @@ init_pygraph_submodule(py::module_& m) {
        .def("build_plans",
             &PyGraph::build_plans,
             py::arg("policy") = cudnn_frontend::BuildPlanPolicy_t::HEURISTICS_CHOICE)
+        .def("build_plan_at_index",
+             &PyGraph::build_plan_at_index,
+             py::arg("index"),
+             R"pbdoc(
+                Build a plan at the given index.
+                Args:
+                    index (int): The index of the plan to build.
+            )pbdoc")
        .def("build", &PyGraph::build)
+        .def("get_execution_plan_count",
+             &PyGraph::get_execution_plan_count,
+             R"pbdoc(
+                Get the number of execution plan candidates.
+            )pbdoc")
        .def("get_workspace_size", &PyGraph::get_workspace_size)
-        .def(
-            "execute",
-            static_cast<void (PyGraph::*)(std::unordered_map<std::shared_ptr<cudnn_frontend::graph::Tensor_attributes>, py::object>, py::object)>(
-                &PyGraph::execute))
-        .def("execute",
-             static_cast<void (PyGraph::*)(std::unordered_map<int64_t, py::object>, py::object)>(&PyGraph::execute))
+        .def("get_workspace_size_plan_at_index",
+             &PyGraph::get_workspace_size_plan_at_index,
+             py::arg("index"),
+             R"pbdoc(
+                Get the workspace size for the plan at the given index.
+                Args:
+                    index (int): The index of the plan to get workspace from.
+                    If the graph is not built at the index, this will return 0.
+            )pbdoc")
+        .def("_execute", &PyGraph::execute)
+        .def("serialize", &PyGraph::serialize)
+        .def("deserialize", &PyGraph::deserialize)
+        .def("_execute_plan_at_index", &PyGraph::execute_plan_at_index)
        .def("__repr__", [](PyGraph const& pygraph) {
            std::stringstream ss;
            json j = pygraph.graph;
@@ -607,6 +635,8 @@ init_pygraph_submodule(py::module_& m) {
            return ss.str();
        });
 
+    m.def("_get_data_ptr", &extract_data_pointer);
+
    init_pygraph_norm_submodule(pygraph_);
    init_pygraph_sdpa_submodule(pygraph_);
    init_pygraph_pointwise_submodule(pygraph_);
diff --git a/python_bindings/pygraph/pygraph.h b/python/pygraph/pygraph.h
similarity index 94%
rename from python_bindings/pygraph/pygraph.h
rename to python/pygraph/pygraph.h
index 3d2dd86..7ac0a22 100644
--- a/python_bindings/pygraph/pygraph.h
+++ b/python/pygraph/pygraph.h
@@ -41,27 +41,29 @@ class PyGraph {
     // descriptors.
     cudnn_frontend::graph::Graph graph;
     cudnnHandle_t handle;
-    bool is_handle_owner;
+    bool is_handle_owner = false;
 
     PyGraph(std::string const&,
             cudnn_frontend::DataType_t io_data_type,
             cudnn_frontend::DataType_t intermediate_data_type,
             cudnn_frontend::DataType_t compute_data_type,
-            void* handle_ = nullptr)
-        : graph(), handle((cudnnHandle_t)handle_), is_handle_owner(false) {
+            std::optional<std::intptr_t> handle_) {
         graph.set_compute_data_type(compute_data_type)
            .set_intermediate_data_type(intermediate_data_type)
            .set_io_data_type(io_data_type);
 
-        if (handle_ == nullptr) {
-            cudnnCreate(&handle);
+        if(handle_.has_value()) {
+            handle = static_cast<cudnnHandle_t>((void*)(handle_.value()));
+        }
+        else {
+            cudnn_frontend::create_handle(&handle);
             is_handle_owner = true;
        }
     }
 
     ~PyGraph() {
         if (is_handle_owner) {
-            cudnnDestroy(handle);
+            cudnn_frontend::destroy_handle(handle);
        }
     }
 
@@ -297,6 +299,9 @@ class PyGraph {
     void
     build_plans(BuildPlanPolicy_t const);
 
+    void
+    build_plan_at_index(int64_t const index);
+
     void
     check_support();
 
@@ -307,11 +312,10 @@ class PyGraph {
     get_workspace_size();
 
     void
-    execute(std::unordered_map<std::shared_ptr<cudnn_frontend::graph::Tensor_attributes>, py::object> var_pack,
-            py::object workspace);
+    execute(std::unordered_map<int64_t, std::intptr_t> var_pack, int64_t workspace, std::optional<std::intptr_t>);
 
     void
-    execute(std::unordered_map<int64_t, py::object> var_pack, py::object workspace);
+    execute_plan_at_index(std::unordered_map<int64_t, std::intptr_t> var_pack, int64_t workspace, int64_t index, std::optional<std::intptr_t>);
 
     void
     deselect_numeric_notes(std::vector<cudnn_frontend::NumericalNote_t> const& notes) {
@@ -330,6 +334,22 @@ class PyGraph {
         graph.deselect_workspace_greater_than(workspace);
         return;
     }
+
+    std::vector<uint8_t>
+    serialize() const;
+
+    void
+    deserialize(std::vector<uint8_t> const& data);
+
+    int64_t
+    get_execution_plan_count() const {
+        return graph.get_execution_plan_count();
+    }
+
+    int64_t
+    get_workspace_size_plan_at_index(int64_t index) const {
+        return graph.get_workspace_size_plan_at_index(index);
+    }
 };
 
 }  // namespace cudnn_frontend::python_bindings
\ No newline at end of file
diff --git a/python_bindings/pygraph/sdpa.cpp b/python/pygraph/sdpa.cpp
similarity index 100%
rename from python_bindings/pygraph/sdpa.cpp
rename to python/pygraph/sdpa.cpp
diff --git a/python_bindings/CMakeLists.txt b/python_bindings/CMakeLists.txt
deleted file mode 100644
index 57a39bf..0000000
--- a/python_bindings/CMakeLists.txt
+++ /dev/null
@@ -1,44 +0,0 @@
-cmake_minimum_required(VERSION 3.18)
-
-Include(FetchContent)
-
-# Fetch and build dlpack
-FetchContent_Declare(
-  dlpack
-  GIT_REPOSITORY https://github.com/dmlc/dlpack
-  GIT_TAG v0.8
-)
-FetchContent_MakeAvailable(dlpack)
-
-FetchContent_Declare(
-  pybind11
-  GIT_REPOSITORY https://github.com/pybind/pybind11
-  GIT_TAG v2.10.4
-)
-FetchContent_MakeAvailable(pybind11)
-
-pybind11_add_module(
-    cudnn
-
-    pycudnn.cpp
-    properties.cpp
-
-    pygraph/pygraph.cpp
-    pygraph/norm.cpp
-    pygraph/sdpa.cpp
-    pygraph/pointwise.cpp
-)
-
-target_link_libraries(
-    cudnn
-
-    PRIVATE cudnn_frontend
-    PRIVATE dlpack
-)
-
-set_target_properties(
-    cudnn
-    PROPERTIES LINK_FLAGS "-Wl,--no-as-needed"
-               LINK_WHAT_YOU_USE TRUE
-               RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib64
-)
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..a733f62
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,6 @@
+jupyter
+numpy
+pybind11[global]
+pytest
+pytest-xdist
+torch
\ No newline at end of file
diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt
index 95fbaa7..d623eca 100644
--- a/samples/CMakeLists.txt
+++ b/samples/CMakeLists.txt
@@ -1,14 +1,18 @@
 cmake_minimum_required(VERSION 3.18)
 
-Include(FetchContent)
+find_package(Catch2 QUIET)
 
-# Fetch and build catch2
-FetchContent_Declare(
-    Catch2
-    GIT_REPOSITORY https://github.com/catchorg/Catch2.git
-    GIT_TAG v3.3.2
-)
-FetchContent_MakeAvailable(Catch2)
+if(NOT Catch2_FOUND)
+    Include(FetchContent)
+
+    # Fetch and build catch2
+    FetchContent_Declare(
+        Catch2
+        GIT_REPOSITORY https://github.com/catchorg/Catch2.git
+        GIT_TAG v3.3.2
+    )
+    FetchContent_MakeAvailable(Catch2)
+endif()
 
 add_executable(
     samples
@@ -22,6 +26,7 @@ add_executable(
     cpp/rmsnorm.cpp
     cpp/wgrads.cpp
     cpp/serialization.cpp
+    cpp/autotuning.cpp
     cpp/pointwise.cpp
 
     legacy_samples/conv_sample.cpp
@@ -82,4 +87,4 @@ set_source_files_properties(
     legacy_samples/resnet_test_list.cpp
     legacy_samples/resnet_sample.cpp
     PROPERTIES INCLUDE_DIRECTORIES "${PROJECT_SOURCE_DIR}/samples/legacy_samples/resnet_block/include"
-)
\ No newline at end of file
+)
diff --git a/samples/README.md b/samples/README.md
index 71deae8..6105d49 100644
--- a/samples/README.md
+++ b/samples/README.md
@@ -2,6 +2,17 @@
 ## Python Interface Samples
 Samples leveraging FE's Python interface are located in [samples/python](/samples/python/).
+* [00_basic_gemm](/samples/python/00_basic_gemm.ipynb)
+  Walks through pycudnn installation and then defining, building, and executing a GEMM graph.
+
+* [01_epilogue](/samples/python/01_epilogue.ipynb)
+  Shows how to easily fuse elementwise functions to a GEMM graph.
+
+* [02_caching](/samples/python/02_caching.ipynb)
+  Shows how to cache already built cudnn graphs for faster execution in the future.
+
+* [03_flash_attention](/samples/python/03_flash_attention.ipynb)
+  Shows how to run causal self attention with dropout in forward and backward pass.
 
 ## C++ Interface Samples
 Samples leveraging FE's C++ interface are located in [samples/cpp](/samples/cpp/).
diff --git a/samples/cpp/autotuning.cpp b/samples/cpp/autotuning.cpp
new file mode 100644
index 0000000..32a91f1
--- /dev/null
+++ b/samples/cpp/autotuning.cpp
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include <catch2/catch_test_macros.hpp>
+#include "../utils/helpers.h"
+
+#include <cudnn_frontend.h>
+
+TEST_CASE("Matmul autotuning", "[matmul][graph][autotuning]") {
+    if (is_arch_supported_by_cudnn() == false) {
+        SKIP("Architecture is not supported by current cudnn version");
+    }
+    namespace fe = cudnn_frontend;
+
+    // matmul problem size
+    int64_t const b = 16;
+    int64_t const m = 32;
+    int64_t const n = 64;
+    int64_t const k = 128;
+
+    // Initialize input tensors
+    Surface<half> A_gpu(b * m * k, false);
+    Surface<half> B_gpu(b * k * n, false);
+    Surface<half> C_gpu(b * m * n, false);
+
+    int64_t a_uid = 0, b_uid = 1, c_uid = 2;
+
+    cudnnHandle_t handle;
+    checkCudnnErr(cudnnCreate(&handle));
+
+    auto create_graph = [&]() -> fe::graph::Graph {
+        // Make cudnn graph
+        fe::graph::Graph graph{};
+
+        // Create the two non-virtual input tensors A and B.
+        // These are read from global memory.
+        auto A_attributes = fe::graph::Tensor_attributes()
+                                .set_name("A")
+                                .set_dim({b, m, k})
+                                .set_stride({m * k, k, 1})
+                                .set_uid(a_uid)
+                                .set_data_type(fe::DataType_t::BFLOAT16);
+        auto A = graph.tensor(A_attributes);
+        auto B_attributes = fe::graph::Tensor_attributes()
+                                .set_name("B")
+                                .set_dim({b, k, n})
+                                .set_stride({k * n, n, 1})
+                                .set_uid(b_uid)
+                                .set_data_type(fe::DataType_t::BFLOAT16);
+        auto B = graph.tensor(B_attributes);
+
+        auto matmul_attributes =
+            fe::graph::Matmul_attributes().set_name("GEMM").set_compute_data_type(fe::DataType_t::FLOAT);
+        auto C = graph.matmul(A, B, matmul_attributes);
+        C->set_output(true).set_uid(c_uid).set_data_type(fe::DataType_t::BFLOAT16);
+
+        REQUIRE(graph.validate().is_good());
+
+        REQUIRE(graph.build_operation_graph(handle).is_good());
+
+        REQUIRE(graph.create_execution_plans({fe::HeurMode_t::A}).is_good());
+
+        REQUIRE(graph.check_support(handle).is_good());
+
+        return graph;
+    };
+
+    auto graph = create_graph();
+
+    graph.deselect_workspace_greater_than(0);
+    auto plan_count = graph.get_execution_plan_count();
+    std::cout << "Graph has " << plan_count << " plan candidates." << std::endl;
+
+    REQUIRE(graph.build_plans(handle, fe::BuildPlanPolicy_t::ALL).is_good());
+
+    std::unordered_map<int64_t, void*> variant_pack = {
+        {a_uid, A_gpu.devPtr}, {b_uid, B_gpu.devPtr}, {c_uid, C_gpu.devPtr}};
+
+    auto autotune = [&]() -> int64_t {
+        const int iter_count = 10;
+        cudaEvent_t start, stop;
+        cudaEventCreate(&start);
+        cudaEventCreate(&stop);
+        cudaDeviceSynchronize();
+
+        cudaStream_t stream = nullptr;
+        cudnnGetStream(handle, &stream);
+
+        std::vector<float> execution_times;
+        execution_times.resize(plan_count, 10.0f);  // Some arbitrary high time
+
+        int64_t workspace_size = 0;
+        for (auto i = 0; i < plan_count; i++) {
+            workspace_size = std::max(workspace_size, graph.get_workspace_size_plan_at_index(i));
+        }
+
+        Surface<int8_t> workspace(workspace_size, false);
+
+        for (auto i = 0; i < plan_count; i++) {
+            float time_ms = 0.0f;
+
+            auto warmup_status = graph.execute_plan_at_index(handle, variant_pack, workspace.devPtr, i);
+
+            if (warmup_status.is_bad()) {
+                std::cout << "Plan at index " << i << " failed execution " << warmup_status.get_message() << std::endl;
+                continue;
+            }
+            cudaDeviceSynchronize();
+
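+            // Timing sketch: bracket iter_count launches with CUDA events on
+            // the handle's stream; the elapsed time divided by iter_count is
+            // an average per-launch GPU time, with warmup costs already paid
+            // by the launch above.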
+            cudaEventRecord(start, stream);
+            for (int iter = 0; iter < iter_count; iter++) {
+                auto status = graph.execute_plan_at_index(handle, variant_pack, workspace.devPtr, i);
+                (void)status;
+            }
+            cudaEventRecord(stop, stream);
+            cudaEventSynchronize(stop);
+            cudaEventElapsedTime(&time_ms, start, stop);
+
+            std::cout << "Plan at index " << i << " took " << time_ms / iter_count << " ms." << std::endl;
+            execution_times[i] = time_ms / iter_count;
+        }
+
+        return std::distance(std::begin(execution_times),
+                             std::min_element(std::begin(execution_times), std::end(execution_times)));
+    };
+    // Run cudnn graph
+
+    auto candidate_index = autotune();
+
+    std::cout << "Successful candidate is at index " << candidate_index << std::endl;
+
+    REQUIRE(graph.build_plan_at_index(handle, candidate_index).is_good());
+
+    Surface<int8_t> workspace(graph.get_workspace_size_plan_at_index(candidate_index), false);
+
+    REQUIRE(graph.execute(handle, variant_pack, workspace.devPtr).is_good());
+    checkCudnnErr(cudnnDestroy(handle));
+}
\ No newline at end of file
diff --git a/samples/cpp/convolutions.cpp b/samples/cpp/convolutions.cpp
index 51fc581..98d8b8b 100644
--- a/samples/cpp/convolutions.cpp
+++ b/samples/cpp/convolutions.cpp
@@ -48,10 +48,8 @@ TEST_CASE("Convolution fprop", "[conv][graph][caching]") {
                               .set_dim({k, c, r, s})
                               .set_stride({c * r * s, 1, c * s, c}));
 
-    auto conv_options = fe::graph::Conv_fprop_attributes()
-                            .set_padding({0, 0})
-                            .set_stride({1, 1})
-                            .set_dilation({1, 1});
+    auto conv_options =
+        fe::graph::Conv_fprop_attributes().set_padding({0, 0}).set_stride({1, 1}).set_dilation({1, 1});
     auto Y = graph->conv_fprop(X, W, conv_options);
 
     Y->set_output(true);
@@ -368,3 +366,108 @@ TEST_CASE("Conv with Int8 datatypes", "[conv][graph][caching]") {
     REQUIRE(graph->execute(handle, variant_pack, workspace.devPtr).is_good());
     cudnnDestroy(handle);
 }
+
+TEST_CASE("Convolution fp8 precision", "[conv][graph]") {
+    if (cudnnGetCudartVersion() < 12000) {
+        SKIP("Test requires cuda toolkit 12.0 or above");
+    }
+    if (cudnnGetVersion() < 8600) {
+        SKIP("TEST REQUIRES minimum cudnn version 8.6.0");
+    }
+    if (check_device_arch_newer_than("hopper") == false) {
+        SKIP("TEST REQUIRES device hopper arch or newer");
+    }
+
+    namespace fe = cudnn_frontend;
+    // conv problem size
+    int64_t n = 16, c = 128, h = 64, w = 64, k = 256, r = 1, s = 1;
+
+    // Initialize input tensors with int8_t as proxy for fp8
+    auto graph = std::make_shared<fe::graph::Graph>();
+    graph->set_io_data_type(fe::DataType_t::HALF)
+        .set_intermediate_data_type(fe::DataType_t::FLOAT)
+        .set_compute_data_type(fe::DataType_t::FLOAT);
+
+    auto X = graph->tensor(fe::graph::Tensor_attributes()
+                               .set_name("image")
+                               .set_dim({n, c, h, w})
+                               .set_stride({c * h * w, 1, c * w, c})
+                               .set_data_type(fe::DataType_t::FP8_E4M3));
+
+    auto W = graph->tensor(fe::graph::Tensor_attributes()
+                               .set_name("filter")
+                               .set_dim({k, c, r, s})
+                               .set_stride({c * r * s, 1, c * s, c})
+                               .set_data_type(fe::DataType_t::FP8_E4M3));
+
+    auto conv_options = fe::graph::Conv_fprop_attributes().set_padding({0, 0}).set_stride({1, 1}).set_dilation({1, 1});
+    auto conv_output_fp8 = graph->conv_fprop(X, W, conv_options);
+
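+    // FP8 pattern used below: FP8 inputs are multiplied by per-tensor descale
+    // factors to recover FP32-range values, the result is re-scaled into FP8
+    // for output, and the amax of the pre-scale values is emitted (typically
+    // feeding the next iteration's scale update).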
+    auto descale_x = graph->tensor(fe::graph::Tensor_attributes()
+                                       .set_name("descale_x")
+                                       .set_dim({1, 1, 1, 1})
+                                       .set_stride({1, 1, 1, 1})
+                                       .set_data_type(fe::DataType_t::FLOAT));
+
+    auto descale_w = graph->tensor(fe::graph::Tensor_attributes()
+                                       .set_name("descale_w")
+                                       .set_dim({1, 1, 1, 1})
+                                       .set_stride({1, 1, 1, 1})
+                                       .set_data_type(fe::DataType_t::FLOAT));
+
+    auto scale_y = graph->tensor(fe::graph::Tensor_attributes()
+                                     .set_name("scale_y")
+                                     .set_dim({1, 1, 1, 1})
+                                     .set_stride({1, 1, 1, 1})
+                                     .set_data_type(fe::DataType_t::FLOAT));
+
+    auto scale_options = fe::graph::Pointwise_attributes().set_mode(fe::PointwiseMode_t::MUL);
+    auto after_descale_x = graph->pointwise(conv_output_fp8, descale_x, scale_options);
+    auto after_descale_w = graph->pointwise(after_descale_x, descale_w, scale_options);
+    auto Y = graph->pointwise(after_descale_w, scale_y, scale_options);
+
+    Y->set_output(true).set_data_type(fe::DataType_t::FP8_E4M3);
+
+    auto amax = graph->reduction(after_descale_w,
+                                 fe::graph::Reduction_attributes()
+                                     .set_mode(fe::ReductionMode_t::AMAX)
+                                     .set_compute_data_type(fe::DataType_t::FLOAT));
+
+    amax->set_output(true).set_data_type(fe::DataType_t::FLOAT).set_dim({1, 1, 1, 1});
+
+    REQUIRE(graph->validate().is_good());
+
+    cudnnHandle_t handle;
+    checkCudnnErr(cudnnCreate(&handle));
+
+    REQUIRE(graph->build_operation_graph(handle).is_good());
+    REQUIRE(graph->create_execution_plans({fe::HeurMode_t::A}).is_good());
+
+    REQUIRE(graph->check_support(handle).is_good());
+
+    REQUIRE(graph->build_plans(handle, fe::BuildPlanPolicy_t::HEURISTICS_CHOICE).is_good());
+
+    // Use int8_t as proxy for fp8
+    Surface<int8_t> X_gpu(n * c * h * w, false);
+    Surface<int8_t> W_gpu(k * c * r * s, false);
+    Surface<int8_t> Y_gpu(n * k * h * w, false);
+
+    Surface<float> X_descale_gpu(1, false);
+    Surface<float> W_descale_gpu(1, false);
+    Surface<float> Y_scale_gpu(1, false);
+    Surface<float> amax_gpu(1, false);
+
+    Surface<int8_t> workspace(graph->get_workspace_size(), false);
+    std::unordered_map<std::shared_ptr<fe::graph::Tensor_attributes>, void*> variant_pack = {
+        {X, X_gpu.devPtr},
+        {W, W_gpu.devPtr},
+        {Y, Y_gpu.devPtr},
+        {descale_x, X_descale_gpu.devPtr},
+        {descale_w, W_descale_gpu.devPtr},
+        {scale_y, Y_scale_gpu.devPtr},
+        {amax, amax_gpu.devPtr}};
+
+    std::cout << graph->print() << std::endl;
+    REQUIRE(graph->execute(handle, variant_pack, workspace.devPtr).is_good());
+    checkCudnnErr(cudnnDestroy(handle));
+}
\ No newline at end of file
diff --git a/samples/cpp/matmuls.cpp b/samples/cpp/matmuls.cpp
index 3c2cdf7..dde5ab9 100644
--- a/samples/cpp/matmuls.cpp
+++ b/samples/cpp/matmuls.cpp
@@ -525,6 +525,15 @@ TEST_CASE("Bias + Matmul", "[matmul][graph]") {
 TEST_CASE("Matmul SBR Graph", "[matmul][graph]") {
     namespace fe = cudnn_frontend;
 
+    if (cudnnGetVersion() < 8600) {
+        SKIP("Test requires cuDNN version 8.6.0 or above");
+        return;
+    }
+
+    if (cudnnGetCudartVersion() < 12000) {
+        SKIP("Test requires cuda toolkit 12.0 or above");
+    }
+
     auto b = 4;
     auto m = 16;
     auto k = 64;
diff --git a/samples/cpp/pointwise.cpp b/samples/cpp/pointwise.cpp
index 8137bf5..7ecbac1 100644
--- a/samples/cpp/pointwise.cpp
+++ b/samples/cpp/pointwise.cpp
@@ -55,3 +55,95 @@ TEST_CASE("Reduction", "[reduction]") {
     REQUIRE(graph.execute(handle, variant_pack, workspace.devPtr).is_good());
     checkCudnnErr(cudnnDestroy(handle));
 }
+
+TEST_CASE("Fused scalar", "[scalar][graph]") {
+    namespace fe = cudnn_frontend;
+
+    constexpr int n = 4;
+
+    fe::graph::Graph graph{};
+    auto A = graph.tensor(fe::graph::Tensor_attributes()
+                              .set_dim({n, n, n})
+                              .set_stride({n * n, n, 1})
+                              .set_data_type(fe::DataType_t::HALF));
+    auto C = graph.pointwise(A,
+                             graph.tensor(5.0f),
+                             fe::graph::Pointwise_attributes()
+                                 .set_mode(fe::PointwiseMode_t::ADD)
+                                 .set_compute_data_type(fe::DataType_t::FLOAT));
+    C->set_output(true).set_data_type(fe::DataType_t::HALF);
+
+    REQUIRE(graph.validate().is_good());
+    cudnnHandle_t handle;
+    checkCudnnErr(cudnnCreate(&handle));
+    REQUIRE(graph.build_operation_graph(handle).is_good());
+    REQUIRE(graph.create_execution_plans({fe::HeurMode_t::A}).is_good());
+    REQUIRE(graph.build_plans(handle, fe::BuildPlanPolicy_t::HEURISTICS_CHOICE).is_good());
+
+    Surface<half> C_gpu(n * n * n, false);
+    Surface<half> A_gpu(n * n * n, false);
+
+    std::unordered_map<std::shared_ptr<fe::graph::Tensor_attributes>, void*> variant_pack = {{A, A_gpu.devPtr},
+                                                                                             {C, C_gpu.devPtr}};
+    Surface<int8_t> workspace(graph.get_workspace_size(), false);
+
+    REQUIRE(graph.execute(handle, variant_pack, workspace.devPtr).is_good());
+
+    checkCudnnErr(cudnnDestroy(handle));
+}
+
+TEST_CASE("Fused Amax Reduction and type conversion", "[reduction]") {
+    namespace fe = cudnn_frontend;
+    constexpr int n = 64;
+
+    if (cudnnGetVersion() < 8600) {
+        SKIP("TEST REQUIRES minimum cudnn version 8.6.0");
+    }
+
+    if (check_device_arch_newer_than("hopper") == false) {
+        SKIP("TEST REQUIRES device hopper arch or newer");
+    }
+
+    fe::graph::Graph graph{};
+
+    auto A = graph.tensor(fe::graph::Tensor_attributes()
+                              .set_dim({n, n, n, n})
+                              .set_stride({n * n * n, 1, n * n, n})
+                              .set_data_type(fe::DataType_t::FLOAT));
+
+    auto scale = graph.tensor(fe::graph::Tensor_attributes()
+                                  .set_dim({1, 1, 1, 1})
+                                  .set_stride({1, 1, 1, 1})
+                                  .set_data_type(fe::DataType_t::FLOAT));
+
+    auto amax = graph.reduction(A,
+                                fe::graph::Reduction_attributes()
+                                    .set_mode(fe::ReductionMode_t::AMAX)
+                                    .set_compute_data_type(fe::DataType_t::FLOAT));
+
+    amax->set_output(true).set_data_type(fe::DataType_t::FLOAT).set_dim({1, 1, 1, 1});
+
+    auto scale_options = fe::graph::Pointwise_attributes()
+                             .set_mode(fe::PointwiseMode_t::MUL)
+                             .set_compute_data_type(fe::DataType_t::FLOAT);
+    auto C = graph.pointwise(A, scale, scale_options);
+    C->set_output(true).set_data_type(fe::DataType_t::FP8_E4M3);
+
+    REQUIRE(graph.validate().is_good());
+    cudnnHandle_t handle;
+    checkCudnnErr(cudnnCreate(&handle));
+    REQUIRE(graph.build_operation_graph(handle).is_good());
+    REQUIRE(graph.create_execution_plans({fe::HeurMode_t::A}).is_good());
+    REQUIRE(graph.build_plans(handle, fe::BuildPlanPolicy_t::HEURISTICS_CHOICE).is_good());
+
+    Surface<float> A_gpu(n * n * n * n, false);
+    Surface<float> scale_gpu(1, false);
+    Surface<float> amax_gpu(1, false);
+    Surface<int8_t> C_gpu(n * n * n * n, false);  // Substitute for fp8
+
+    std::unordered_map<std::shared_ptr<fe::graph::Tensor_attributes>, void*> variant_pack = {
+        {A, A_gpu.devPtr}, {scale, scale_gpu.devPtr}, {amax, amax_gpu.devPtr}, {C, C_gpu.devPtr}};
+    Surface<int8_t> workspace(graph.get_workspace_size(), false);
+    REQUIRE(graph.execute(handle, variant_pack, workspace.devPtr).is_good());
+    checkCudnnErr(cudnnDestroy(handle));
+}
\ No newline at end of file
diff --git a/samples/python/01_matmul_bias.ipynb b/samples/python/01_matmul_bias.ipynb
new file mode 100644
index 0000000..5bd93e1
--- /dev/null
+++ b/samples/python/01_matmul_bias.ipynb
@@ -0,0 +1,223 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Matrix multiplication operation with fused bias using cudnn FE\n",
+    "This notebook shows how a matmul operation with fused bias can be done using cudnn."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NVIDIA/cudnn-frontend/blob/main/samples/python/01_matmul_bias.ipynb)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Prerequisites for running on Colab\n",
+    "This notebook requires an NVIDIA GPU H100 or newer. If `nvidia-smi` fails, go to Runtime -> Change runtime type -> Hardware accelerator and confirm a GPU is selected."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "vscode": {
+     "languageId": "shellscript"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "!#nvidia-smi"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "If running on Colab, you will need to install the cudnn python interface."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "vscode": {
+     "languageId": "shellscript"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "!# export CUDA_VERSION=\"12.3\"\n",
+    "!# pip install nvidia-cudnn-cu12\n",
+    "!# conda install -y -c nvidia cuda-nvcc=\"${CUDA_VERSION}\" cuda-libraries-dev=\"${CUDA_VERSION}\"\n",
+    "!# CUDNN_PATH=`pip show nvidia-cudnn-cu12 | grep Location | cut -d\":\" -f2 | xargs`/nvidia/cudnn pip install git+https://github.com/NVIDIA/cudnn-frontend.git\n",
+    "!# pip3 install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu121"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### General Setup\n",
+    "We are going to call the cudnn through torch in this example. In general any dlpack tensor should work.\n",
+    "cudnn handle is a per device handle used to initialize cudnn context.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "StopExecution",
+     "evalue": "",
+     "output_type": "error",
+     "traceback": []
+    }
+   ],
+   "source": [
+    "import cudnn\n",
+    "import torch\n",
+    "import sys\n",
+    "\n",
+    "handle = cudnn.create_handle()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Create input tensors and calculate reference"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "batch, m, n, k = 16, 128, 128, 512\n",
+    "\n",
+    "input_type = torch.float16\n",
+    "\n",
+    "# input tensors\n",
+    "a = torch.randn(batch, m, k, dtype=input_type, device='cuda')\n",
+    "b = torch.randn(batch, k, n, dtype=input_type, device='cuda')\n",
+    "B = torch.randn(1, m, n, dtype=torch.float16, device='cuda')\n",
+    "\n",
+    "# reference output\n",
+    "c_ref = torch.matmul(a, b) + B\n",
+    "\n",
+    "# place holder for cudnn output\n",
+    "c = torch.randn_like(c_ref, device='cuda')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Create cudnn graph and tensors"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "graph = cudnn.pygraph(intermediate_data_type = cudnn.data_type.FLOAT, compute_data_type = cudnn.data_type.FLOAT)\n",
+    "\n",
+    "a_cudnn_tensor = graph.tensor_like(a)\n",
+    "b_cudnn_tensor = graph.tensor_like(b)\n",
+    "bias_cudnn_tensor = graph.tensor_like(B)\n",
+    "\n",
+    "c_intermediate = graph.matmul(name = \"matmul\", A = a_cudnn_tensor, B = b_cudnn_tensor)\n",
+    "\n",
+    "c_cudnn_tensor = graph.bias(name = \"bias\", input = c_intermediate, bias = bias_cudnn_tensor)\n",
+    " \n",
+    "c_cudnn_tensor.set_name(\"c\").set_output(True).set_data_type(cudnn.data_type.HALF)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Build the graph"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
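+    "# Build pipeline used throughout these samples:\n",
+    "#   validate               -> check the graph is well formed\n",
+    "#   build_operation_graph  -> lower to a cudnn backend operation graph\n",
+    "#   create_execution_plans -> query heuristics for plan candidates\n",
+    "#   check_support          -> confirm a candidate can run on this GPU\n",
+    "#   build_plans            -> finalize the execution plan\n",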
code" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "variant_pack = {\n", + " a_cudnn_tensor: a,\n", + " b_cudnn_tensor: b,\n", + " c_cudnn_tensor: c,\n", + " bias_cudnn_tensor: B,\n", + "}\n", + "\n", + "workspace = torch.empty(graph.get_workspace_size(), device=\"cuda\", dtype=torch.uint8)\n", + "graph.execute(variant_pack, workspace)\n", + "torch.cuda.synchronize()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "torch.testing.assert_close(c, c_ref, rtol = 5e-3, atol = 5e-3)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "build_thunder", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/samples/python/02_sdpa_graph_serialization.ipynb b/samples/python/02_sdpa_graph_serialization.ipynb new file mode 100644 index 0000000..4e87453 --- /dev/null +++ b/samples/python/02_sdpa_graph_serialization.ipynb @@ -0,0 +1,286 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# SDPA operation using cudnn FE and serialization\n", + "This notebook shows how a sdpa operation can be done using cudnn and how to serialize and deserialize the graph." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/https://github.com/NVIDIA/cudnn-frontend/tree/main/samples/python/convolutions/00_basic_convolutions.ipynb)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If running on Colab, you will need to install the cudnn python interface." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "vscode": { + "languageId": "shellscript" + } + }, + "outputs": [], + "source": [ + "!# export CUDA_VERSION=\"12.3\"\n", + "!# pip install nvidia-cudnn-cu12\n", + "!# conda install -y -c nvidia cuda-nvcc=\"${CUDA_VERSION}\" cuda-libraries-dev=\"${CUDA_VERSION}\"\n", + "!# CUDNN_PATH=`pip show nvidia-cudnn-cu12 | grep Location | cut -d\":\" -f2 | xargs`/nvidia/cudnn pip install git+https://github.com/NVIDIA/cudnn-frontend.git\n", + "!# pip3 install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu121" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### General Setup\n", + "We are going to call the cudnn through torch in this example. 
In general any dlpack tensor should work.\n", + "cudnn handle is a per device handle used to initialize cudnn context.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import cudnn\n", + "import torch\n", + "from enum import Enum\n", + "\n", + "handle = cudnn.create_handle()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Problem definition" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "b = 2 # batch size\n", + "\n", + "s_q = 1024 # query sequence length\n", + "s_kv = 1024 # key+value sequence length\n", + "\n", + "h = 6 # Query heads\n", + "\n", + "d = 64 # query+key embedding dimension per head\n", + "\n", + "shape_q = (b, h, s_q, d)\n", + "shape_k = (b, h, s_kv, d)\n", + "shape_v = (b, h, s_kv, d)\n", + "shape_o = (b, h, s_q, d)\n", + "\n", + "stride_q = (s_q * h * d, d, h * d, 1)\n", + "stride_k = (s_kv * h * d, d, h * d, 1)\n", + "stride_v = (s_kv * h * d, d, h * d, 1)\n", + "stride_o = (s_q * h * d, d, h * d, 1)\n", + "\n", + "attn_scale = 0.125\n", + "\n", + "q_gpu = torch.randn(b * h * s_q * d, dtype=torch.bfloat16, device=\"cuda\").as_strided(shape_q, stride_q)\n", + "k_gpu = torch.randn(b * h * s_kv * d, dtype=torch.bfloat16, device=\"cuda\").as_strided(shape_k, stride_k)\n", + "v_gpu = torch.randn(b * h * s_kv * d, dtype=torch.bfloat16, device=\"cuda\").as_strided(shape_v, stride_v)\n", + "o_gpu = torch.empty(b * h * s_q * d, dtype=torch.bfloat16, device=\"cuda\").as_strided(shape_o, stride_o)\n", + "stats_gpu = torch.empty(b, h, s_q, 1, dtype=torch.float32, device=\"cuda\")\n", + "\n", + "class UIDs(Enum):\n", + " Q_UID = 0\n", + " K_UID = 1\n", + " V_UID = 2\n", + " O_UID = 3\n", + " STATS_UID = 4" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Graph build helper\n", + "This will called by check_support and serialize function to build the sdpa graph" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "def build_and_validate_graph_helper():\n", + " graph = cudnn.pygraph(\n", + " io_data_type=cudnn.data_type.HALF,\n", + " intermediate_data_type=cudnn.data_type.FLOAT,\n", + " compute_data_type=cudnn.data_type.FLOAT,\n", + " handle = handle)\n", + " \n", + " q = graph.tensor_like(q_gpu)\n", + " k = graph.tensor_like(k_gpu)\n", + " v = graph.tensor_like(v_gpu)\n", + " \n", + " o, stats = graph.sdpa(name=\"sdpa\",\n", + " q=q, k=k, v=v,\n", + " is_inference=False,\n", + " attn_scale=attn_scale,\n", + " use_causal_mask=True)\n", + " \n", + " o.set_output(True).set_dim(shape_o).set_stride(stride_o)\n", + " stats.set_output(True).set_data_type(cudnn.data_type.FLOAT)\n", + " \n", + " q.set_uid(UIDs.Q_UID.value)\n", + " k.set_uid(UIDs.K_UID.value)\n", + " v.set_uid(UIDs.V_UID.value)\n", + " o.set_uid(UIDs.O_UID.value)\n", + " stats.set_uid(UIDs.STATS_UID.value)\n", + " \n", + " graph.validate()\n", + " \n", + " return graph" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Check support " + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "def check_support():\n", + " \n", + " graph = build_and_validate_graph_helper()\n", + " \n", + " graph.build_operation_graph()\n", + " \n", + " graph.create_execution_plans([cudnn.heur_mode.A, cudnn.heur_mode.FALLBACK])\n", + "\n", + " graph.check_support()" + ] + }, + { + "cell_type": "markdown", + 
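+    "# serialize() returns a bytes blob once plans are built; it can be cached\n",
+    "# (e.g. to disk) and restored later with deserialize(), skipping the build\n",
+    "# steps. The blob is plan-specific, so it is generally tied to the cudnn\n",
+    "# version and GPU it was created with.\n",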
"metadata": {}, + "source": [ + "#### Serialization function" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "def serialize():\n", + " graph = build_and_validate_graph_helper()\n", + " \n", + " graph.build_operation_graph()\n", + " \n", + " graph.create_execution_plans([cudnn.heur_mode.A, cudnn.heur_mode.FALLBACK])\n", + "\n", + " graph.check_support()\n", + " \n", + " graph.build_plans()\n", + " \n", + " return graph.serialize()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### De-serialization function" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "def deserialize(payload):\n", + " \n", + " graph = cudnn.pygraph()\n", + " \n", + " graph.deserialize(payload)\n", + " \n", + " return graph" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### running the execution plan" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "check_support()\n", + "\n", + "data = serialize()\n", + "\n", + "deserialized_graph = deserialize(data)\n", + "\n", + "workspace = torch.empty(deserialized_graph.get_workspace_size(), device=\"cuda\", dtype=torch.uint8)\n", + "\n", + "variant_pack = {\n", + " UIDs.Q_UID.value: q_gpu,\n", + " UIDs.K_UID.value: k_gpu,\n", + " UIDs.V_UID.value: v_gpu,\n", + " UIDs.O_UID.value: o_gpu,\n", + " UIDs.STATS_UID.value: stats_gpu,\n", + "}\n", + "\n", + "deserialized_graph.execute(variant_pack, workspace)\n", + "\n", + "torch.cuda.synchronize()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "build_thunder", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/samples/python/03_mixed_precision_matmul.ipynb b/samples/python/03_mixed_precision_matmul.ipynb new file mode 100644 index 0000000..e246b45 --- /dev/null +++ b/samples/python/03_mixed_precision_matmul.ipynb @@ -0,0 +1,234 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Mixed precision matrix multiplication operation using cudnn FE\n", + "This notebook shows how a mixed precision matmul operation can be done using cudnn." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/https://github.com/NVIDIA/cudnn-frontend/tree/main/samples/python/convolutions/00_basic_convolutions.ipynb)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prerequisites for running on Colab\n", + "This notebook requires an NVIDIA GPU H100 or newer. If `nvidia-smi` fails, go to Runtime -> Change runtime type -> Hardware accelerator and confirm a GPU is selected." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "shellscript" + } + }, + "outputs": [], + "source": [ + "!#nvidia-smi" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If running on Colab, you will need to install the cudnn python interface." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "shellscript" + } + }, + "outputs": [], + "source": [ + "!# export CUDA_VERSION=\"12.3\"\n", + "!# pip install nvidia-cudnn-cu12\n", + "!# conda install -y -c nvidia cuda-nvcc=\"${CUDA_VERSION}\" cuda-libraries-dev=\"${CUDA_VERSION}\"\n", + "!# CUDNN_PATH=`pip show nvidia-cudnn-cu12 | grep Location | cut -d\":\" -f2 | xargs`/nvidia/cudnn pip install git+https://github.com/NVIDIA/cudnn-frontend.git\n", + "!# pip3 install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu121" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### General Setup\n", + "We are going to call the cudnn through torch in this example. In general any dlpack tensor should work.\n", + "cudnn handle is a per device handle used to initialize cudnn context.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import cudnn\n", + "import torch\n", + "import sys\n", + "\n", + "handle = cudnn.create_handle()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Create input tensors and calculate reference" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "batch, m, n, k = 16, 128, 128, 512\n", + "\n", + "# input data types can be different\n", + "input_type_a = torch.int8\n", + "input_type_b = torch.bfloat16\n", + "output_type = torch.bfloat16\n", + "\n", + "# direct input data type for the matmul operation\n", + "mma_data_type = torch.bfloat16\n", + "\n", + "# input tensors\n", + "if input_type_a != torch.int8:\n", + " a = 2 * torch.randn(batch, m, k, dtype=input_type_a, device='cuda') - 0.5\n", + "else:\n", + " a = torch.randint(4, (batch, m, k), dtype=input_type_a, device='cuda') - 1\n", + "\n", + "if input_type_b != torch.int8:\n", + " b_row_major = 3 * torch.randn(batch, k, n, dtype=input_type_b, device='cuda') - 1.25\n", + "else:\n", + " b_row_major = torch.randint(3, (batch, k, n), dtype=input_type_b, device='cuda').contiguous() - 2\n", + "b = torch.as_strided(b_row_major, (batch, k, n), (n * k, 1, n))\n", + "\n", + "# reference output\n", + "c_ref = torch.matmul(a.to(mma_data_type), b.to(mma_data_type)).to(output_type)\n", + "\n", + "# place holder for cudnn output\n", + "c = torch.randn_like(c_ref, device='cuda')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Create cudnn graph and tensors" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "graph = cudnn.pygraph()\n", + "\n", + "a_cudnn_tensor = graph.tensor_like(a)\n", + "b_cudnn_tensor = graph.tensor_like(b)\n", + "\n", + "# cudnn will do the following conversion path: input_data_type -> compute_data_type -> output_data_type\n", + "# compute_data_type can be int32 as well\n", + "a_cudnn_tensor_casted = graph.identity(input = a_cudnn_tensor, compute_data_type=cudnn.data_type.FLOAT)\n", + "a_cudnn_tensor_casted.set_data_type(mma_data_type)\n", + "\n", + "# here we omit the code casting tensor b to the mma_data_type\n", + "# since both of them are in bf16 data type in this example\n", + "# user can also cast tensor b if it has a different input_type from the mma_data_type\n", + "\n", + "# compute_data_type should be set to int32 if the mma_data_type is int8\n", + "c_cudnn_tensor = graph.matmul(name = \"matmul\", A = a_cudnn_tensor_casted, B = b_cudnn_tensor, 
compute_data_type = cudnn.data_type.FLOAT)\n", + "c_cudnn_tensor.set_name(\"c\").set_output(True).set_data_type(output_type)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Build the graph" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "graph.validate()\n", + "graph.build_operation_graph()\n", + "graph.create_execution_plans([cudnn.heur_mode.A, cudnn.heur_mode.FALLBACK])\n", + "graph.check_support()\n", + "graph.build_plans()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Execute the code" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "variant_pack = {\n", + " a_cudnn_tensor: a,\n", + " b_cudnn_tensor: b,\n", + " c_cudnn_tensor: c,\n", + "}\n", + "\n", + "workspace = torch.empty(graph.get_workspace_size(), device=\"cuda\", dtype=torch.uint8)\n", + "graph.execute(variant_pack, workspace)\n", + "torch.cuda.synchronize()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "torch.testing.assert_close(c, c_ref, rtol = 5e-3, atol = 5e-3)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "build_thunder", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/samples/python/50_scaled_dot_product_attention.ipynb b/samples/python/50_scaled_dot_product_attention.ipynb new file mode 100644 index 0000000..6bed79a --- /dev/null +++ b/samples/python/50_scaled_dot_product_attention.ipynb @@ -0,0 +1,258 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Scaled Dot Product Attention (SDPA) in cuDNN Frontend\n", + "\n", + "This notebook is an example for the scaled dot product attention operator in cuDNN frontend. This operation computes scaled dot product attention as\n", + "\n", + "$\\text{Attention}(Q, K, V) = \\text{softmax}\\left(\\frac{QK^T}{\\sqrt{d}}\\right)V$\n", + "\n", + "using the FlashAttention-2 algorithm described in the paper [FlashAttention-2: Faster Attention with Better Parallelism and Work Partitioning](https://arxiv.org/abs/2307.08691). It is applicable for both training and inference phases, with an option to generate a stats tensor to be used for backwards training computation.\n", + "\n", + "The full documentation can be found in: [docs/operations/Attention.md#scaled-dot-product-attention](https://github.com/NVIDIA/cudnn-frontend/blob/main/docs/operations/Attention.md#scaled-dot-product-attention)\n", + "\n", + "The python test code for the full set of features can be found in: [test/python_fe/test.mhas.py](https://github.com/NVIDIA/cudnn-frontend/blob/main/test/python_fe/test_mhas.py)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/https://github.com/NVIDIA/cudnn-frontend/tree/main/samples/python/convolutions/00_basic_convolutions.ipynb)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Prerequisites and Setup\n", + "This notebook requires an NVIDIA GPU A100 or newer. 
If running on Colab, go to Runtime → Change runtime type → Hardware accelerator and select a GPU."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "vscode": {
+     "languageId": "shellscript"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "!# nvidia-smi"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "vscode": {
+     "languageId": "shellscript"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "!# export CUDA_VERSION=\"12.3\"\n",
+    "!# pip install nvidia-cudnn-cu12\n",
+    "!# conda install -y -c nvidia cuda-nvcc=\"${CUDA_VERSION}\" cuda-libraries-dev=\"${CUDA_VERSION}\"\n",
+    "!# CUDNN_PATH=`pip show nvidia-cudnn-cu12 | grep Location | cut -d\":\" -f2 | xargs`/nvidia/cudnn pip install git+https://github.com/NVIDIA/cudnn-frontend.git\n",
+    "!# pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu121"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import cudnn\n",
+    "import torch\n",
+    "import math\n",
+    "\n",
+    "torch.manual_seed(42)\n",
+    "handle = cudnn.create_handle()\n",
+    "\n",
+    "assert torch.cuda.is_available()\n",
+    "assert torch.cuda.get_device_capability()[0] >= 8, \"SDPA operation is only supported on SM80 architecture (Ampere) or above\"\n",
+    "assert cudnn.backend_version() >= 8903, \"SDPA operation is only supported with cuDNN version 8.9.3 or above\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Problem sizes\n",
+    "\n",
+    "For this example, we will use the problem size from the original GPT-2 paper, where:\n",
+    " - maximum sequence length = 1024\n",
+    " - hidden dim = number of heads $\times$ embedding dimension per head = 12 $\times$ 64 = 768"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "b = 4     # batch size\n",
+    "h = 12    # query number of heads\n",
+    "s = 1024  # maximum sequence length\n",
+    "d = 64    # embedding dimension per head\n",
+    "\n",
+    "attn_scale = 1.0 / math.sqrt(d)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Create the query, key, value, and output GPU tensors using PyTorch. Any DLPack-compatible tensor may be used instead."
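+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "As an illustrative aside before allocating, the helper below (a hypothetical name introduced here, not part of the cuDNN API) shows how the BSHD strides used in the next cell can be derived from the physical (batch, seq, head, dim) layout:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# hypothetical helper, for illustration only: strides that view a\n",
+    "# non-interleaved BSHD buffer through a BHSD logical shape\n",
+    "def bshd_strides(b, h, s, d):\n",
+    "    # element (B, H, S, D) lives at offset B*(s*h*d) + S*(h*d) + H*d + D\n",
+    "    return (s * h * d, d, h * d, 1)\n",
+    "\n",
+    "print(bshd_strides(b, h, s, d))"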
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# The tensors will have non-interleaved\n", + "# BSHD (batch, sequence_length, num_head, dims_per_head) physical tensor layout\n", + "# BHSD (batch, num_head, sequence_length, dims_per_head) logical tensor layout\n", + "dims = (b, h, s, d)\n", + "strides = (s * h * d, d, h * d, 1)\n", + "\n", + "q_gpu = torch.randn(b * s * h * d).half().cuda().as_strided(dims, strides)\n", + "k_gpu = torch.randn(b * s * h * d).half().cuda().as_strided(dims, strides)\n", + "v_gpu = torch.randn(b * s * h * d).half().cuda().as_strided(dims, strides)\n", + "o_gpu = torch.empty(b * s * h * d).half().cuda().as_strided(dims, strides)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create the graph" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "graph = cudnn.pygraph(\n", + " io_data_type=cudnn.data_type.HALF,\n", + " intermediate_data_type=cudnn.data_type.FLOAT,\n", + " compute_data_type=cudnn.data_type.FLOAT,\n", + ")\n", + "\n", + "q = graph.tensor_like(q_gpu)\n", + "k = graph.tensor_like(k_gpu)\n", + "v = graph.tensor_like(v_gpu)\n", + "\n", + "# the second return for the stats tensor is used for training only.\n", + "# causal mask is enabled\n", + "o, _ = graph.sdpa(\n", + " name=\"sdpa\",\n", + " q=q, k=k, v=v,\n", + " is_inference=True,\n", + " attn_scale=attn_scale,\n", + " use_causal_mask=True,\n", + ")\n", + "\n", + "o.set_output(True).set_dim(dims).set_stride(strides)\n", + "pass" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Build the graph" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "graph.validate()\n", + "graph.build_operation_graph()\n", + "graph.create_execution_plans([cudnn.heur_mode.A, cudnn.heur_mode.FALLBACK])\n", + "graph.check_support()\n", + "graph.build_plans()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Execute the graph" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "variant_pack = {\n", + " q: q_gpu,\n", + " k: k_gpu,\n", + " v: v_gpu,\n", + " o: o_gpu,\n", + "}\n", + "\n", + "workspace = torch.empty(graph.get_workspace_size(), device=\"cuda\", dtype=torch.uint8)\n", + "graph.execute(variant_pack, workspace)\n", + "torch.cuda.synchronize()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Test cuDNN's output against PyTorch's and check correctness" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "q_ref = q_gpu.detach().float().requires_grad_()\n", + "k_ref = k_gpu.detach().float().requires_grad_()\n", + "v_ref = v_gpu.detach().float().requires_grad_()\n", + "\n", + "o_ref = torch.nn.functional.scaled_dot_product_attention(q_ref, k_ref, v_ref, is_causal=True, scale=attn_scale)\n", + "torch.testing.assert_close(o_ref, o_gpu.float(), atol=5e-3, rtol=3e-3)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git 
a/samples/python/51_scaled_dot_product_attention_backward.ipynb b/samples/python/51_scaled_dot_product_attention_backward.ipynb
new file mode 100644
index 0000000..90cc6ee
--- /dev/null
+++ b/samples/python/51_scaled_dot_product_attention_backward.ipynb
@@ -0,0 +1,366 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Scaled Dot Product Attention (SDPA) Backward in cuDNN Frontend\n",
+    "\n",
+    "This operation computes the gradient tensors for scaled dot product attention using the FlashAttention-2 algorithm described in the paper [FlashAttention-2: Faster Attention with Better Parallelism and Work Partitioning](https://arxiv.org/abs/2307.08691). The user is required to pass the stats tensor from the forward operation to the backward operation as input.\n",
+    "\n",
+    "The full documentation can be found in: [docs/operations/Attention.md#scaled-dot-product-attention-backward](https://github.com/NVIDIA/cudnn-frontend/blob/main/docs/operations/Attention.md#scaled-dot-product-attention-backward)\n",
+    "\n",
+    "The python test code for the full set of features can be found in: [test/python_fe/test_mhas.py](https://github.com/NVIDIA/cudnn-frontend/blob/main/test/python_fe/test_mhas.py)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NVIDIA/cudnn-frontend/blob/main/samples/python/51_scaled_dot_product_attention_backward.ipynb)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Prerequisites and Setup\n",
+    "This notebook requires an NVIDIA GPU A100 or newer. If running on Colab, go to Runtime → Change runtime type → Hardware accelerator and select a GPU."
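+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Optionally run the illustrative check below (it assumes torch and cudnn are already installed; the sample re-asserts these requirements later):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# optional sanity check: confirm a capable GPU and cuDNN backend are visible\n",
+    "import torch\n",
+    "import cudnn\n",
+    "print(torch.cuda.get_device_capability(), cudnn.backend_version())"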
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 39,
+   "metadata": {
+    "vscode": {
+     "languageId": "shellscript"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "!# nvidia-smi"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 40,
+   "metadata": {
+    "vscode": {
+     "languageId": "shellscript"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "!# export CUDA_VERSION=\"12.3\"\n",
+    "!# pip install nvidia-cudnn-cu12\n",
+    "!# conda install -y -c nvidia cuda-nvcc=\"${CUDA_VERSION}\" cuda-libraries-dev=\"${CUDA_VERSION}\"\n",
+    "!# CUDNN_PATH=`pip show nvidia-cudnn-cu12 | grep Location | cut -d\":\" -f2 | xargs`/nvidia/cudnn pip install git+https://github.com/NVIDIA/cudnn-frontend.git\n",
+    "!# pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu121"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 41,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import cudnn\n",
+    "import torch\n",
+    "import math\n",
+    "\n",
+    "torch.manual_seed(42)\n",
+    "handle = cudnn.create_handle()\n",
+    "\n",
+    "assert torch.cuda.is_available()\n",
+    "assert torch.cuda.get_device_capability()[0] >= 8, \"SDPA operation is only supported on SM80 architecture (Ampere) or above\"\n",
+    "assert cudnn.backend_version() >= 8903, \"SDPA operation is only supported with cuDNN version 8.9.3 or above\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Problem sizes\n",
+    "\n",
+    "For this example, we will use the problem size from the original GPT-2 paper, where:\n",
+    " - maximum sequence length = 1024\n",
+    " - hidden dim = number of heads $\times$ embedding dimension per head = 12 $\times$ 64 = 768"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 42,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "b = 4     # batch size\n",
+    "h = 12    # query number of heads\n",
+    "s = 1024  # maximum sequence length\n",
+    "d = 64    # embedding dimension per head\n",
+    "\n",
+    "attn_scale = 1.0 / math.sqrt(d)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Create the query, key, value, and output GPU tensors using PyTorch.\n",
+    "\n",
+    "**For the backward computation, we also need to pass the stats tensor from the forward graph to the backward graph.**\n",
+    "\n",
+    "The stats tensor should have dims $(B, H, S, 1)$ and float32 data type."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 43,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# The tensors will have non-interleaved\n",
+    "# BSHD (batch, sequence_length, num_head, dims_per_head) physical tensor layout\n",
+    "# BHSD (batch, num_head, sequence_length, dims_per_head) logical tensor layout\n",
+    "dims = (b, h, s, d)\n",
+    "strides = (s * h * d, d, h * d, 1)\n",
+    "\n",
+    "q_gpu = torch.randn(b * s * h * d).half().cuda().as_strided(dims, strides)\n",
+    "k_gpu = torch.randn(b * s * h * d).half().cuda().as_strided(dims, strides)\n",
+    "v_gpu = torch.randn(b * s * h * d).half().cuda().as_strided(dims, strides)\n",
+    "o_gpu = torch.empty(b * s * h * d).half().cuda().as_strided(dims, strides)\n",
+    "stats_gpu = torch.empty(b, h, s, 1).float().cuda()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Also create the query, key, value, and output gradient tensors for the backward computation."
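+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Before that, a small illustrative sanity check on the buffers just created (assumptions: stats holds one float32 value per query row, and torch.empty_like preserves a dense strided layout):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# stats: one float32 statistic per (batch, head, query position)\n",
+    "assert stats_gpu.shape == (b, h, s, 1) and stats_gpu.dtype == torch.float32\n",
+    "# empty_like preserves the dense strided layout, so the gradient buffers will match q/k/v/o\n",
+    "assert torch.empty_like(q_gpu).stride() == q_gpu.stride()"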
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 44,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# note: torch 'like' preserves the strided layout\n",
+    "dQ_gpu = torch.empty_like(q_gpu)\n",
+    "dK_gpu = torch.empty_like(k_gpu)\n",
+    "dV_gpu = torch.empty_like(v_gpu)\n",
+    "dO_gpu = torch.randn_like(o_gpu)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Create and build the forward graph"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 45,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "graph_forward = cudnn.pygraph(\n",
+    "    io_data_type=cudnn.data_type.HALF,\n",
+    "    intermediate_data_type=cudnn.data_type.FLOAT,\n",
+    "    compute_data_type=cudnn.data_type.FLOAT,\n",
+    ")\n",
+    "\n",
+    "q_forward = graph_forward.tensor_like(q_gpu)\n",
+    "k_forward = graph_forward.tensor_like(k_gpu)\n",
+    "v_forward = graph_forward.tensor_like(v_gpu)\n",
+    "\n",
+    "# training mode is enabled with is_inference=False\n",
+    "# causal mask is enabled\n",
+    "o_forward, stats_forward = graph_forward.sdpa(\n",
+    "    name=\"sdpa\",\n",
+    "    q=q_forward, k=k_forward, v=v_forward,\n",
+    "    is_inference=False,\n",
+    "    attn_scale=attn_scale,\n",
+    "    use_causal_mask=True,\n",
+    ")\n",
+    "\n",
+    "o_forward.set_output(True).set_dim(o_gpu.size()).set_stride(o_gpu.stride())\n",
+    "stats_forward.set_output(True).set_dim(stats_gpu.size()).set_stride(stats_gpu.stride())\n",
+    "stats_forward.set_data_type(cudnn.data_type.FLOAT)\n",
+    "\n",
+    "graph_forward.validate()\n",
+    "graph_forward.build_operation_graph()\n",
+    "graph_forward.create_execution_plans([cudnn.heur_mode.A, cudnn.heur_mode.FALLBACK])\n",
+    "graph_forward.check_support()\n",
+    "graph_forward.build_plans()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Create and build the backward graph"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 46,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "graph_backward = cudnn.pygraph(\n",
+    "    io_data_type=cudnn.data_type.HALF,\n",
+    "    intermediate_data_type=cudnn.data_type.FLOAT,\n",
+    "    compute_data_type=cudnn.data_type.FLOAT,\n",
+    ")\n",
+    "\n",
+    "q_backward = graph_backward.tensor_like(q_gpu)\n",
+    "k_backward = graph_backward.tensor_like(k_gpu)\n",
+    "v_backward = graph_backward.tensor_like(v_gpu)\n",
+    "o_backward = graph_backward.tensor_like(o_gpu)\n",
+    "dO_backward = graph_backward.tensor_like(dO_gpu)\n",
+    "stats_backward = graph_backward.tensor_like(stats_gpu)\n",
+    "\n",
+    "dQ_backward, dK_backward, dV_backward = graph_backward.sdpa_backward(\n",
+    "    name=\"sdpa_backward\",\n",
+    "    q=q_backward, k=k_backward, v=v_backward,\n",
+    "    o=o_backward, dO=dO_backward, stats=stats_backward,\n",
+    "    attn_scale=attn_scale,\n",
+    "    use_causal_mask=True,\n",
+    ")\n",
+    "\n",
+    "dQ_backward.set_output(True).set_dim(dQ_gpu.size()).set_stride(dQ_gpu.stride())\n",
+    "dK_backward.set_output(True).set_dim(dK_gpu.size()).set_stride(dK_gpu.stride())\n",
+    "dV_backward.set_output(True).set_dim(dV_gpu.size()).set_stride(dV_gpu.stride())\n",
+    "\n",
+    "graph_backward.validate()\n",
+    "graph_backward.build_operation_graph()\n",
+    "graph_backward.create_execution_plans([cudnn.heur_mode.A, cudnn.heur_mode.FALLBACK])\n",
+    "graph_backward.check_support()\n",
+    "graph_backward.build_plans()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Allocate the workspace required for execution. We take the maximum of the two sizes, since the forward and backward graphs execute sequentially and can share one buffer."
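+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "For reference, each graph's own requirement can be inspected first (illustrative; the sizes vary with GPU and cuDNN version):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# the two graphs may need different amounts of scratch memory\n",
+    "print(\"forward workspace bytes :\", graph_forward.get_workspace_size())\n",
+    "print(\"backward workspace bytes:\", graph_backward.get_workspace_size())"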
+ ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [], + "source": [ + "workspace_size = max(\n", + " graph_forward.get_workspace_size(),\n", + " graph_backward.get_workspace_size(),\n", + ")\n", + "workspace = torch.empty(workspace_size, device=\"cuda\", dtype=torch.uint8)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Execute forward graph" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [], + "source": [ + "variant_pack_forward = {\n", + " q_forward: q_gpu,\n", + " k_forward: k_gpu,\n", + " v_forward: v_gpu,\n", + " o_forward: o_gpu,\n", + " stats_forward: stats_gpu,\n", + "}\n", + "\n", + "graph_forward.execute(variant_pack_forward, workspace)\n", + "torch.cuda.synchronize()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Execute backward graph" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [], + "source": [ + "variant_pack_backward = {\n", + " q_backward: q_gpu,\n", + " k_backward: k_gpu,\n", + " v_backward: v_gpu,\n", + " o_backward: o_gpu,\n", + " dO_backward: dO_gpu,\n", + " stats_backward: stats_gpu,\n", + " dQ_backward: dQ_gpu,\n", + " dK_backward: dK_gpu,\n", + " dV_backward: dV_gpu,\n", + "}\n", + "\n", + "graph_backward.execute(variant_pack_backward, workspace)\n", + "torch.cuda.synchronize()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Test cuDNN's output against PyTorch's and check correctness" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [], + "source": [ + "q_ref = q_gpu.detach().float().requires_grad_()\n", + "k_ref = k_gpu.detach().float().requires_grad_()\n", + "v_ref = v_gpu.detach().float().requires_grad_()\n", + "dO_ref = dO_gpu.detach().float()\n", + "\n", + "o_ref = torch.nn.functional.scaled_dot_product_attention(q_ref, k_ref, v_ref, is_causal=True, scale=attn_scale)\n", + "torch.testing.assert_close(o_ref, o_gpu.float(), atol=5e-3, rtol=3e-3)\n", + "\n", + "dQ_ref, dK_ref, dV_ref = torch.autograd.grad(outputs=[o_ref], inputs=[q_ref, k_ref, v_ref], grad_outputs=[dO_ref])\n", + "torch.testing.assert_close(dQ_ref, dQ_gpu.float(), atol=5e-3, rtol=3e-3)\n", + "torch.testing.assert_close(dK_ref, dK_gpu.float(), atol=5e-3, rtol=3e-3)\n", + "torch.testing.assert_close(dV_ref, dV_gpu.float(), atol=5e-3, rtol=3e-3)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/setup.py b/setup.py index 77b759f..b5f51c6 100644 --- a/setup.py +++ b/setup.py @@ -1,5 +1,4 @@ import os -import re import subprocess import sys from pathlib import Path @@ -30,11 +29,16 @@ def build_extension(self, ext: CMakeExtension) -> None: # Set Python_EXECUTABLE instead if you use PYBIND11_FINDPYTHON cmake_args = [ - f"-DCMAKE_LIBRARY_OUTPUT_DIRECTORY={extdir}{os.sep}", - f"-DPYTHON_EXECUTABLE={sys.executable}", + f"-DPython_EXECUTABLE={sys.executable}", f"-DCMAKE_BUILD_TYPE={cfg}", # not used on MSVC, but no harm + f"-DCUDNN_FRONTEND_BUILD_PYTHON_BINDINGS=ON", + # There's no need to build cpp samples and tests with python f"-DCUDNN_FRONTEND_BUILD_SAMPLES=OFF", 
f"-DCUDNN_FRONTEND_BUILD_UNIT_TESTS=OFF", + # All these are handled by pip + f"-DCMAKE_LIBRARY_OUTPUT_DIRECTORY={extdir}{os.sep}", + f"-DCUDNN_FRONTEND_KEEP_PYBINDS_IN_BINARY_DIR=OFF", + f"-DCUDNN_FRONTEND_FETCH_PYBINDS_IN_CMAKE=OFF" ] if "CUDA_PATH" in os.environ: @@ -84,18 +88,7 @@ def build_extension(self, ext: CMakeExtension) -> None: ) -# The information here can also be placed in setup.cfg - better separation of -# logic and declaration, and simpler if you include description/version in a file. setup( - name="cudnn", - version="1.1.2", - author="", - author_email="", - description="cudnn_frontend python package", - long_description="", - ext_modules=[CMakeExtension("cudnn")], + ext_modules=[CMakeExtension("cudnn/_compiled_module")], cmdclass={"build_ext": CMakeBuild}, - zip_safe=False, - extras_require={"test": ["pytest>=6.0"]}, - python_requires=">=3.7", ) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt new file mode 100644 index 0000000..e155038 --- /dev/null +++ b/test/CMakeLists.txt @@ -0,0 +1,3 @@ +cmake_minimum_required(VERSION 3.18) + +add_subdirectory(unit_tests) \ No newline at end of file diff --git a/test/python_fe/conftest.py b/test/python_fe/conftest.py new file mode 100644 index 0000000..5259816 --- /dev/null +++ b/test/python_fe/conftest.py @@ -0,0 +1,11 @@ +import pytest + +def pytest_addoption(parser): + parser.addoption("--mha_b", default=None, help="[test_mhas.py] batch dimension") + parser.addoption("--mha_s_q", default=None, help="[test_mhas.py] query sequence length") + parser.addoption("--mha_s_kv", default=None, help="[test_mhas.py] key/value sequence length") + parser.addoption("--mha_d_qk", default=None, help="[test_mhas.py] query/key embedding dimension per head") + parser.addoption("--mha_d_v", default=None, help="[test_mhas.py] value embedding dimension per head") + parser.addoption("--mha_h_q", default=None, help="[test_mhas.py] query number of heads") + parser.addoption("--mha_h_k", default=None, help="[test_mhas.py] key number of heads") + parser.addoption("--mha_h_v", default=None, help="[test_mhas.py] value number of heads") diff --git a/test/python_fe/test_apply_rope.py b/test/python_fe/test_apply_rope.py index 786137d..448af07 100644 --- a/test/python_fe/test_apply_rope.py +++ b/test/python_fe/test_apply_rope.py @@ -1,13 +1,7 @@ import cudnn import torch -def convert_to_cudnn_type(torch_type): - if torch_type == torch.float16: - return cudnn.data_type.HALF - elif torch_type == torch.float32: - return cudnn.data_type.FLOAT - else: - raise ValueError("Unsupported tensor data type.") +from test_utils import torch_fork_set_rng def build_rope_cache( seq_len: int, @@ -48,7 +42,9 @@ def fn(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor: q_roped = fn(q[..., : rope_n_elem], cos, sin) return torch.cat((q_roped, q[..., rope_n_elem :]), dim=-1) -def apply_rope(): +@torch_fork_set_rng(seed=0) +def test_apply_rope(): + B, nh, T, hs = 8, 32, 4096, 128 rope_n_elem = int(0.25 * hs) @@ -90,10 +86,10 @@ def apply_rope(): x1_sin2 = graph.mul(a = x1, b = sin2) Y1 = graph.sub(a = x1_cos1, b = x2_sin1) - Y1.set_output(True).set_data_type(convert_to_cudnn_type(torch.float16)) + Y1.set_output(True).set_data_type(torch.float16) Y2 = graph.add(a = x2_cos2, b = x1_sin2) - Y2.set_output(True).set_data_type(convert_to_cudnn_type(torch.float16)) + Y2.set_output(True).set_data_type(torch.float16) graph.validate() graph.build_operation_graph() diff --git a/test/python_fe/test_batchnorm.py b/test/python_fe/test_batchnorm.py index dcf532a..0373332 
100644 --- a/test/python_fe/test_batchnorm.py +++ b/test/python_fe/test_batchnorm.py @@ -2,24 +2,16 @@ import pytest import torch -def convert_to_cudnn_type(torch_type): - if torch_type == torch.float16: - return cudnn.data_type.HALF - elif torch_type == torch.float32: - return cudnn.data_type.FLOAT - elif torch_type == torch.bool: - return cudnn.data_type.BOOLEAN - elif torch_type == torch.uint8: - return cudnn.data_type.UINT8 - else: - raise ValueError("Unsupported tensor data type.") +from test_utils import torch_fork_set_rng class SGBN(torch.nn.Module): def forward(self, input, running_mean, running_var, weight, bias, eps, momentum): return torch.nn.functional.batch_norm(input, running_mean, running_var, weight=weight, bias=bias, training=True, momentum=momentum, eps=eps) @pytest.mark.skipif(cudnn.backend_version() < 8800, reason="BN with mask output not supported below cudnn 8.8") +@torch_fork_set_rng(seed=0) def test_bn_relu_with_mask(): + N, C, H, W = 4, 16, 56, 56 x_gpu = torch.randn(N, C, H, W, requires_grad=False, device="cuda", dtype=torch.float16).to(memory_format=torch.channels_last) scale_gpu = torch.randn(1, C, 1, 1, requires_grad=False, device="cuda", dtype=torch.float32) @@ -35,14 +27,14 @@ def test_bn_relu_with_mask(): # Cudnn code graph = cudnn.pygraph(io_data_type = cudnn.data_type.FLOAT, intermediate_data_type = cudnn.data_type.FLOAT, compute_data_type = cudnn.data_type.FLOAT) - X = graph.tensor(name = "X", dim = x_gpu.size(), stride = x_gpu.stride(), data_type = convert_to_cudnn_type(x_gpu.dtype)) + X = graph.tensor(name = "X", dim = x_gpu.size(), stride = x_gpu.stride(), data_type = x_gpu.dtype) scale = graph.tensor(name = "scale", dim = scale_gpu.size(), stride = scale_gpu.stride()) bias = graph.tensor(name = "bias", dim = bias_gpu.size(), stride = bias_gpu.stride()) in_running_mean = graph.tensor(name = "in_running_mean", dim = running_mean_gpu.size(), stride = running_mean_gpu.stride()) in_running_var = graph.tensor(name = "in_running_var", dim = running_var_gpu.size(), stride = running_var_gpu.stride()) epsilon = graph.tensor(name = "epsilon", dim = epsilon_cpu.size(), stride = epsilon_cpu.stride(), is_pass_by_value = True) momentum = graph.tensor(name = "momentum", dim = momentum_cpu.size(), stride = momentum_cpu.stride(), is_pass_by_value = True) - comparison = graph.tensor(name = "zeros", dim = x_gpu.size(), stride = x_gpu.stride(), data_type = convert_to_cudnn_type(x_gpu.dtype)) + comparison = graph.tensor(name = "zeros", dim = x_gpu.size(), stride = x_gpu.stride(), data_type = x_gpu.dtype) (Y_before_relu, saved_mean, saved_inv_var, out_running_mean, out_running_var) = graph.batchnorm(name = "BN" , input = X @@ -81,9 +73,7 @@ def test_bn_relu_with_mask(): saved_mean_actual = torch.zeros_like(scale_gpu) saved_inv_var_actual = torch.zeros_like(scale_gpu) Y_actual = torch.zeros_like(Y_expected) - # pytorch does not allow bool tensors in dlpack - assert C%8 == 0 - mask_actual_uint8 = torch.empty(N, C // 8, H, W, requires_grad=False, device="cuda", dtype=torch.uint8).to(memory_format=torch.channels_last) + mask_actual = torch.empty(N, C, H, W, requires_grad=False, device="cuda", dtype=torch.bool).to(memory_format=torch.channels_last) zeros = torch.zeros_like(Y_expected) @@ -103,7 +93,7 @@ def test_bn_relu_with_mask(): , saved_inv_var : saved_inv_var_actual , Y : Y_actual , comparison: zeros - , mask : mask_actual_uint8 + , mask : mask_actual }, workspace) # Compare @@ -111,9 +101,12 @@ def test_bn_relu_with_mask(): torch.testing.assert_close(Y_expected, Y_actual, 
atol=1e-3, rtol=1e-3) torch.testing.assert_close(mean_expected, saved_mean_actual, atol=1e-3, rtol=1e-3) torch.testing.assert_close(inv_var_expected, saved_inv_var_actual, atol=1e-3, rtol=1e-3) + # torch.testing.assert_close(mask_expected, mask_actual) @pytest.mark.skipif(cudnn.backend_version() < 8900, reason="DBN fusions not supported below cudnn 8.9") +@torch_fork_set_rng(seed=0) def test_drelu_dadd_dbn(): + # Tensors N, C, H, W = 4, 16, 56, 56 @@ -122,19 +115,18 @@ def test_drelu_dadd_dbn(): mean_gpu = torch.randn(1, C, 1, 1, requires_grad=False, device="cuda", dtype=torch.float32) inv_variance_gpu = torch.randn(1, C, 1, 1, requires_grad=False, device="cuda", dtype=torch.float32) dy_gpu = torch.randn(N, C, H, W, requires_grad=False, device="cuda", dtype=torch.float16).to(memory_format=torch.channels_last) + x_mask_gpu = torch.randint(0, 2, [N, C, H, W], requires_grad=False, device="cuda", dtype=torch.bool).to(memory_format=torch.channels_last) # Cudnn code graph = cudnn.pygraph(io_data_type = cudnn.data_type.HALF, intermediate_data_type = cudnn.data_type.FLOAT, compute_data_type = cudnn.data_type.FLOAT) - # Bool type is not supported by dlpack - x_mask_gpu = torch.randint(0, 255, [N, int(C / 8), H, W], requires_grad=False, device="cuda", dtype=torch.uint8).to(memory_format=torch.channels_last) - X_mask = graph.tensor(name = "X_mask", dim = [N, C, H, W], stride = x_gpu.stride(), data_type = cudnn.data_type.BOOLEAN) - - X = graph.tensor(name = "X", dim = x_gpu.size(), stride = x_gpu.stride(), data_type = convert_to_cudnn_type(x_gpu.dtype)) - DY = graph.tensor(name = "DY", dim = dy_gpu.size(), stride = dy_gpu.stride(), data_type = convert_to_cudnn_type(dy_gpu.dtype)) - scale = graph.tensor(name = "scale", dim = scale_gpu.size(), stride = scale_gpu.stride(), data_type = convert_to_cudnn_type(scale_gpu.dtype)) - mean = graph.tensor(name = "mean", dim = mean_gpu.size(), stride = mean_gpu.stride(), data_type = convert_to_cudnn_type(mean_gpu.dtype)) - inv_variance = graph.tensor(name = "inv_variance", dim = inv_variance_gpu.size(), stride = inv_variance_gpu.stride(), data_type = convert_to_cudnn_type(inv_variance_gpu.dtype)) + + X = graph.tensor(name = "X", dim = x_gpu.size(), stride = x_gpu.stride(), data_type = x_gpu.dtype) + DY = graph.tensor(name = "DY", dim = dy_gpu.size(), stride = dy_gpu.stride(), data_type = dy_gpu.dtype) + scale = graph.tensor(name = "scale", dim = scale_gpu.size(), stride = scale_gpu.stride(), data_type = scale_gpu.dtype) + mean = graph.tensor(name = "mean", dim = mean_gpu.size(), stride = mean_gpu.stride(), data_type = mean_gpu.dtype) + inv_variance = graph.tensor(name = "inv_variance", dim = inv_variance_gpu.size(), stride = inv_variance_gpu.stride(), data_type = inv_variance_gpu.dtype) + X_mask = graph.tensor(name = "X_mask", dim = x_mask_gpu.size(), stride = x_mask_gpu.stride(), data_type = x_mask_gpu.dtype) DX_drelu = graph.scale(name = "drelu" , input = DY @@ -185,7 +177,9 @@ def test_drelu_dadd_dbn(): graph.execute(device_buffers, workspace) @pytest.mark.skipif(cudnn.backend_version() < 8904, reason="BN_infer-Drelu-DBN not supported below cudnn 8.9.4") +@torch_fork_set_rng(seed=0) def test_bn_infer_drelu_dbn(): + # Tensors N, C, H, W = 4, 16, 56, 56 @@ -200,12 +194,12 @@ def test_bn_infer_drelu_dbn(): graph = cudnn.pygraph(io_data_type = cudnn.data_type.HALF, intermediate_data_type = cudnn.data_type.FLOAT, compute_data_type = cudnn.data_type.FLOAT) # Bool type is not supported by dlpack - BN_X = graph.tensor(name = "BN_X", dim = bn_x_gpu.size(), stride = 
bn_x_gpu.stride(), data_type = convert_to_cudnn_type(bn_x_gpu.dtype)) - DY = graph.tensor(name = "DY", dim = dy_gpu.size(), stride = dy_gpu.stride(), data_type = convert_to_cudnn_type(dy_gpu.dtype)) - scale = graph.tensor(name = "scale", dim = scale_gpu.size(), stride = scale_gpu.stride(), data_type = convert_to_cudnn_type(scale_gpu.dtype)) - bias = graph.tensor(name = "bias", dim = bias_gpu.size(), stride = bias_gpu.stride(), data_type = convert_to_cudnn_type(bias_gpu.dtype)) - mean = graph.tensor(name = "mean", dim = mean_gpu.size(), stride = mean_gpu.stride(), data_type = convert_to_cudnn_type(mean_gpu.dtype)) - inv_variance = graph.tensor(name = "inv_variance", dim = inv_variance_gpu.size(), stride = inv_variance_gpu.stride(), data_type = convert_to_cudnn_type(inv_variance_gpu.dtype)) + BN_X = graph.tensor(name = "BN_X", dim = bn_x_gpu.size(), stride = bn_x_gpu.stride(), data_type = bn_x_gpu.dtype) + DY = graph.tensor(name = "DY", dim = dy_gpu.size(), stride = dy_gpu.stride(), data_type = dy_gpu.dtype) + scale = graph.tensor(name = "scale", dim = scale_gpu.size(), stride = scale_gpu.stride(), data_type = scale_gpu.dtype) + bias = graph.tensor(name = "bias", dim = bias_gpu.size(), stride = bias_gpu.stride(), data_type = bias_gpu.dtype) + mean = graph.tensor(name = "mean", dim = mean_gpu.size(), stride = mean_gpu.stride(), data_type = mean_gpu.dtype) + inv_variance = graph.tensor(name = "inv_variance", dim = inv_variance_gpu.size(), stride = inv_variance_gpu.stride(), data_type = inv_variance_gpu.dtype) BN_Y = graph.batchnorm_inference(input = BN_X, mean = mean, inv_variance = inv_variance, scale = scale, bias = bias) diff --git a/test/python_fe/test_conv_bias.py b/test/python_fe/test_conv_bias.py index 98a1c92..400480e 100644 --- a/test/python_fe/test_conv_bias.py +++ b/test/python_fe/test_conv_bias.py @@ -2,13 +2,7 @@ import pytest import torch -def convert_to_cudnn_type(torch_type): - if torch_type == torch.float16: - return cudnn.data_type.HALF - elif torch_type == torch.float32: - return cudnn.data_type.FLOAT - else: - raise ValueError("Unsupported tensor data type.") +from test_utils import torch_fork_set_rng class CSBR(torch.nn.Module): def forward(self, x, w, b = None, padding = [1,1], stride = [1,1], dilation = [1,1]): @@ -17,8 +11,8 @@ def forward(self, x, w, b = None, padding = [1,1], stride = [1,1], dilation = [1 conv_output = torch.nn.functional.conv2d(x, w, bias = b, padding=padding, stride=stride, dilation=dilation) return torch.nn.functional.relu(conv_output) +@torch_fork_set_rng(seed=0) def test_conv_bias_relu(): - torch.manual_seed(0) # Reference code X_gpu = torch.randn(4, 16, 56, 56, requires_grad=False, device="cuda", dtype=torch.float16).to(memory_format=torch.channels_last) @@ -36,9 +30,9 @@ def test_conv_bias_relu(): graph = cudnn.pygraph(io_data_type = cudnn.data_type.HALF, intermediate_data_type = cudnn.data_type.FLOAT, compute_data_type = cudnn.data_type.FLOAT, handle = handle) - X = graph.tensor(name = "X", dim = X_gpu.size(), stride = X_gpu.stride(), data_type = convert_to_cudnn_type(X_gpu.dtype)) - W = graph.tensor(name = "W", dim = W_gpu.size(), stride = W_gpu.stride(), data_type = convert_to_cudnn_type(W_gpu.dtype)) - B = graph.tensor(name = "B", dim = B_gpu.size(), stride = B_gpu.stride(), data_type = convert_to_cudnn_type(B_gpu.dtype)) + X = graph.tensor(name = "X", dim = X_gpu.size(), stride = X_gpu.stride(), data_type = X_gpu.dtype) + W = graph.tensor(name = "W", dim = W_gpu.size(), stride = W_gpu.stride(), data_type = W_gpu.dtype) + B = 
graph.tensor(name = "B", dim = B_gpu.size(), stride = B_gpu.stride(), data_type = B_gpu.dtype) conv_output = graph.conv_fprop(image = X, weight = W, pre_padding = padding, post_padding = padding, stride = stride, dilation = dilation) @@ -61,8 +55,10 @@ def test_conv_bias_relu(): torch.testing.assert_close(Y_expected, Y_actual, atol=0.05, rtol=1e-2) cudnn.destroy_handle(handle) - + +@torch_fork_set_rng(seed=0) def test_conv_relu(): + # Reference code X_gpu = torch.randn(20, 40, 30, 40, requires_grad=False, device="cuda", dtype=torch.float16).to(memory_format=torch.channels_last) W_gpu = torch.randn(54, 40, 3, 4, requires_grad=False, device="cuda", dtype=torch.float16).to(memory_format=torch.channels_last) @@ -75,8 +71,8 @@ def test_conv_relu(): # Cudnn code graph = cudnn.pygraph(io_data_type = cudnn.data_type.HALF, intermediate_data_type = cudnn.data_type.FLOAT, compute_data_type = cudnn.data_type.FLOAT) - X = graph.tensor(name = "X", dim = X_gpu.size(), stride = X_gpu.stride(), data_type = convert_to_cudnn_type(X_gpu.dtype)) - W = graph.tensor(name = "W", dim = W_gpu.size(), stride = W_gpu.stride(), data_type = convert_to_cudnn_type(W_gpu.dtype)) + X = graph.tensor(name = "X", dim = X_gpu.size(), stride = X_gpu.stride(), data_type = X_gpu.dtype) + W = graph.tensor(name = "W", dim = W_gpu.size(), stride = W_gpu.stride(), data_type = W_gpu.dtype) conv_output = graph.conv_fprop(image = X, weight = W, padding = padding, stride = stride, dilation = dilation) @@ -92,12 +88,14 @@ def test_conv_relu(): workspace = torch.empty(graph.get_workspace_size(), device="cuda", dtype=torch.uint8) Y_actual = torch.zeros_like(Y_expected) - graph.execute({X: X_gpu, W: W_gpu, Y: Y_actual}, workspace) - + handle = cudnn.create_handle() + graph.execute({X: X_gpu, W: W_gpu, Y: Y_actual}, workspace, handle = handle) # Compare torch.testing.assert_close(Y_expected, Y_actual, atol=1e-3, rtol=1e-3) +@torch_fork_set_rng(seed=0) def test_conv3d_bias_leaky_relu(): + N, C, D, H, W = 4, 16, 52, 54, 56 K, R, S, T = 32, 3, 3, 3 padding = [0,1,2] @@ -115,9 +113,9 @@ def test_conv3d_bias_leaky_relu(): graph = cudnn.pygraph(io_data_type = cudnn.data_type.HALF, intermediate_data_type = cudnn.data_type.FLOAT, compute_data_type = cudnn.data_type.FLOAT) - X = graph.tensor(name = "X", dim = X_gpu.size(), stride = X_gpu.stride(), data_type = convert_to_cudnn_type(X_gpu.dtype)) - Weight = graph.tensor(name = "W", dim = W_gpu.size(), stride = W_gpu.stride(), data_type = convert_to_cudnn_type(W_gpu.dtype)) - B = graph.tensor(name = "B", dim = B_gpu.size(), stride = B_gpu.stride(), data_type = convert_to_cudnn_type(B_gpu.dtype)) + X = graph.tensor(name = "X", dim = X_gpu.size(), stride = X_gpu.stride(), data_type = X_gpu.dtype) + Weight = graph.tensor(name = "W", dim = W_gpu.size(), stride = W_gpu.stride(), data_type = W_gpu.dtype) + B = graph.tensor(name = "B", dim = B_gpu.size(), stride = B_gpu.stride(), data_type = B_gpu.dtype) conv_output = graph.conv_fprop(image = X, weight = Weight, padding = padding, stride = stride, dilation = dilation) @@ -139,7 +137,9 @@ def test_conv3d_bias_leaky_relu(): torch.testing.assert_close(Y_expected, Y_actual, atol=1e-2, rtol=1e-2) +@torch_fork_set_rng(seed=0) def test_leaky_relu_backward(): + N, C, H, W = 4, 16, 56, 56 negative_slope = 0.01 @@ -154,8 +154,8 @@ def dleaky_relu(grad: torch.Tensor, mask: torch.Tensor, negative_slope: float): graph = cudnn.pygraph(io_data_type = cudnn.data_type.HALF, intermediate_data_type = cudnn.data_type.FLOAT, compute_data_type = cudnn.data_type.FLOAT) - loss = 
graph.tensor(name = "loss", dim = loss_gpu.size(), stride = loss_gpu.stride(), data_type = convert_to_cudnn_type(loss_gpu.dtype)) - input = graph.tensor(name = "input", dim = input_gpu.size(), stride = input_gpu.stride(), data_type = convert_to_cudnn_type(input_gpu.dtype)) + loss = graph.tensor(name = "loss", dim = loss_gpu.size(), stride = loss_gpu.stride(), data_type = loss_gpu.dtype) + input = graph.tensor(name = "input", dim = input_gpu.size(), stride = input_gpu.stride(), data_type = input_gpu.dtype) Y = graph.leaky_relu_backward(loss = loss, input = input, negative_slope = negative_slope) Y.set_output(True) @@ -175,7 +175,9 @@ def dleaky_relu(grad: torch.Tensor, mask: torch.Tensor, negative_slope: float): @pytest.mark.skipif(cudnn.backend_version() < 8600, reason="requires cudnn 8.6.0 or higher") +@torch_fork_set_rng(seed=0) def test_conv_int8(): + N, C, H, W = 1, 64, 32, 32 K, R, S = 4, 3, 3 padding = [1,1] @@ -219,8 +221,8 @@ def test_conv_int8(): torch.testing.assert_close(Y_expected, Y_actual, atol=1e-2, rtol=1e-2) if __name__ == "__main__": - test_conv_int8() + # test_conv_int8() test_conv_relu() - test_conv_bias_relu() - test_conv3d_bias_leaky_relu() - test_leaky_relu_backward() + # test_conv_bias_relu() + # test_conv3d_bias_leaky_relu() + # test_leaky_relu_backward() diff --git a/test/python_fe/test_conv_genstats.py b/test/python_fe/test_conv_genstats.py index eee9e02..c92c4f5 100644 --- a/test/python_fe/test_conv_genstats.py +++ b/test/python_fe/test_conv_genstats.py @@ -2,13 +2,7 @@ import pytest import torch -def convert_to_cudnn_type(torch_type): - if torch_type == torch.float16: - return cudnn.data_type.HALF - elif torch_type == torch.float32: - return cudnn.data_type.FLOAT - else: - raise ValueError("Unsupported tensor data type.") +from test_utils import torch_fork_set_rng class Conv_Genstats(torch.nn.Module): def forward(self, scale, bias, x, w, padding = [1,1], stride = [1,1], dilation = [1,1]): @@ -28,8 +22,8 @@ def forward(self, scale, bias, x, w, padding = [1,1], stride = [1,1], dilation = dilation = [1,1] @pytest.mark.skipif(cudnn.backend_version() < 8800, reason="requires cudnn 8.8 or higher") +@torch_fork_set_rng(seed=0) def test_conv_genstats(): - print("Running conv genstats") # Reference X_gpu = torch.randn(n, c, 32, 32, requires_grad=False, device="cuda", dtype=torch.float16).to(memory_format=torch.channels_last) @@ -41,11 +35,11 @@ def test_conv_genstats(): # Cudnn code graph = cudnn.pygraph(io_data_type = cudnn.data_type.HALF, intermediate_data_type = cudnn.data_type.HALF, compute_data_type = cudnn.data_type.FLOAT) - X = graph.tensor(name = "X", dim = X_gpu.size(), stride = X_gpu.stride(), data_type = convert_to_cudnn_type(X_gpu.dtype)) - W = graph.tensor(name = "W", dim = W_gpu.size(), stride = W_gpu.stride(), data_type = convert_to_cudnn_type(W_gpu.dtype)) + X = graph.tensor(name = "X", dim = X_gpu.size(), stride = X_gpu.stride(), data_type = X_gpu.dtype) + W = graph.tensor(name = "W", dim = W_gpu.size(), stride = W_gpu.stride(), data_type = W_gpu.dtype) - S = graph.tensor(name = "S", dim = scale.size(), stride = scale.stride(), data_type = convert_to_cudnn_type(scale.dtype)) - B = graph.tensor(name = "B", dim = bias.size(), stride = bias.stride(), data_type = convert_to_cudnn_type(bias.dtype)) + S = graph.tensor(name = "S", dim = scale.size(), stride = scale.stride(), data_type = scale.dtype) + B = graph.tensor(name = "B", dim = bias.size(), stride = bias.stride(), data_type = bias.dtype) S_OUT = graph.scale(name = "scale", input = X, scale = S) B_OUT = 
graph.bias(name = "bias", input = S_OUT, bias = B) @@ -67,17 +61,14 @@ def test_conv_genstats(): sq_sum_dev = torch.zeros_like(sq_sum_expected) Y_actual = torch.zeros_like(Y_expected) + # Below tests capability to run with just device pointers workspace = torch.empty(graph.get_workspace_size(), device="cuda", dtype=torch.uint8) - - print("Executing Kernel") - - graph.execute({X: X_gpu, W: W_gpu, Y: Y_actual, SUM : sum_dev, SQ_SUM : sq_sum_dev, S : scale, B : bias}, workspace) + graph.execute({X: X_gpu.data_ptr(), W: W_gpu, Y: Y_actual.data_ptr(), SUM : sum_dev, SQ_SUM : sq_sum_dev, S : scale, B : bias}, workspace.data_ptr()) # Compare torch.testing.assert_close(sum_expected, sum_dev, atol=0.5, rtol=1e-2) torch.testing.assert_close(sq_sum_expected, sq_sum_dev, atol=1e-3, rtol=1e-3) torch.testing.assert_close(Y_expected, Y_actual, atol=1e-3, rtol=1e-3) - print("Done.") if __name__ == "__main__": test_conv_genstats() \ No newline at end of file diff --git a/test/python_fe/test_conv_reduction.py b/test/python_fe/test_conv_reduction.py index bf942d5..92518a7 100644 --- a/test/python_fe/test_conv_reduction.py +++ b/test/python_fe/test_conv_reduction.py @@ -2,16 +2,11 @@ import pytest import torch -def convert_to_cudnn_type(torch_type): - if torch_type == torch.float16: - return cudnn.data_type.HALF - elif torch_type == torch.float32: - return cudnn.data_type.FLOAT - else: - raise ValueError("Unsupported tensor data type.") - +from test_utils import torch_fork_set_rng +@torch_fork_set_rng(seed=0) def test_reduction(): + # Define tensor dimensions N, K, C, H, W = 4, 32, 16, 64, 64 R, S = 3, 3 @@ -27,8 +22,8 @@ def test_reduction(): # Cudnn code graph = cudnn.pygraph(io_data_type = cudnn.data_type.HALF, intermediate_data_type = cudnn.data_type.FLOAT, compute_data_type = cudnn.data_type.FLOAT) - X = graph.tensor(name = "X", dim = X_gpu.size(), stride = X_gpu.stride(), data_type = convert_to_cudnn_type(X_gpu.dtype)) - Weight = graph.tensor(name = "W", dim = W_gpu.size(), stride = W_gpu.stride(), data_type = convert_to_cudnn_type(W_gpu.dtype)) + X = graph.tensor(name = "X", dim = X_gpu.size(), stride = X_gpu.stride(), data_type = X_gpu.dtype) + Weight = graph.tensor(name = "W", dim = W_gpu.size(), stride = W_gpu.stride(), data_type = W_gpu.dtype) Y0 = graph.conv_fprop(image = X, weight = Weight, padding = padding, stride = stride, dilation = dilation) diff --git a/test/python_fe/test_instancenorm.py b/test/python_fe/test_instancenorm.py index f852c87..7029f54 100644 --- a/test/python_fe/test_instancenorm.py +++ b/test/python_fe/test_instancenorm.py @@ -3,20 +3,7 @@ import torch import itertools -def convert_to_cudnn_type(torch_type): - if torch_type == torch.float16: - return cudnn.data_type.HALF - elif torch_type == torch.bfloat16: - return cudnn.data_type.BFLOAT16 - elif torch_type == torch.float32: - return cudnn.data_type.FLOAT - elif torch_type == torch.bool: - return cudnn.data_type.BOOLEAN - elif torch_type == torch.uint8: - return cudnn.data_type.UINT8 - else: - raise ValueError("Unsupported tensor data type.") - +from test_utils import torch_fork_set_rng input_type_options = [torch.bfloat16, torch.float16] @@ -27,8 +14,8 @@ def param_extract(request): return request.param @pytest.mark.skipif(cudnn.backend_version() < 8905, reason="IN not supported below cudnn 8.9.5") +@torch_fork_set_rng(seed=0) def test_in(param_extract): - torch.manual_seed(0) input_type, = param_extract print(input_type) @@ -68,9 +55,9 @@ def test_in(param_extract): bias = bias, epsilon = epsilon) - 
Y.set_output(True).set_data_type(convert_to_cudnn_type(x_gpu.dtype)) - mean.set_output(True).set_data_type(convert_to_cudnn_type(mean_expected.dtype)) - inv_var.set_output(True).set_data_type(convert_to_cudnn_type(inv_var_expected.dtype)) + Y.set_output(True).set_data_type(x_gpu.dtype) + mean.set_output(True).set_data_type(mean_expected.dtype) + inv_var.set_output(True).set_data_type(inv_var_expected.dtype) graph.validate() graph.build_operation_graph() @@ -132,9 +119,9 @@ def test_in(param_extract): mean = mean_bwd, inv_variance = inv_var_bwd) - DX.set_output(True).set_data_type(convert_to_cudnn_type(x_gpu.dtype)) - Dscale.set_output(True).set_data_type(convert_to_cudnn_type(scale_gpu.dtype)) - Dbias.set_output(True).set_data_type(convert_to_cudnn_type(bias_gpu.dtype)) + DX.set_output(True).set_data_type(x_gpu.dtype) + Dscale.set_output(True).set_data_type(scale_gpu.dtype) + Dbias.set_output(True).set_data_type(bias_gpu.dtype) bwd_graph.validate() bwd_graph.build_operation_graph() diff --git a/test/python_fe/test_layernorm.py b/test/python_fe/test_layernorm.py index e6d4887..9adcd3b 100644 --- a/test/python_fe/test_layernorm.py +++ b/test/python_fe/test_layernorm.py @@ -3,20 +3,7 @@ import torch import itertools -def convert_to_cudnn_type(torch_type): - if torch_type == torch.float16: - return cudnn.data_type.HALF - elif torch_type == torch.bfloat16: - return cudnn.data_type.BFLOAT16 - elif torch_type == torch.float32: - return cudnn.data_type.FLOAT - elif torch_type == torch.bool: - return cudnn.data_type.BOOLEAN - elif torch_type == torch.uint8: - return cudnn.data_type.UINT8 - else: - raise ValueError("Unsupported tensor data type.") - +from test_utils import torch_fork_set_rng embedding_dim_options = [768, 1024, 1280, 1600] input_type_options = [torch.bfloat16, torch.float16] @@ -28,8 +15,8 @@ def param_extract(request): return request.param @pytest.mark.skipif(cudnn.backend_version() < 8905, reason="LN not supported below cudnn 8.9.5") +@torch_fork_set_rng(seed=0) def test_layernorm(param_extract): - torch.manual_seed(0) embedding_dim, input_type = param_extract @@ -55,10 +42,10 @@ def test_layernorm(param_extract): graph = cudnn.pygraph(intermediate_data_type = cudnn.data_type.FLOAT, compute_data_type = cudnn.data_type.FLOAT) - X = graph.tensor(name = "X", dim = x_gpu.size(), stride = x_gpu.stride(), data_type = convert_to_cudnn_type(x_gpu.dtype)) - scale = graph.tensor(name = "scale", dim = scale_gpu.size(), stride = scale_gpu.stride(), data_type = convert_to_cudnn_type(scale_gpu.dtype)) - bias = graph.tensor(name = "bias", dim = bias_gpu.size(), stride = bias_gpu.stride(), data_type = convert_to_cudnn_type(bias_gpu.dtype)) - epsilon = graph.tensor(name = "epsilon", dim = epsilon_cpu.size(), stride = epsilon_cpu.stride(), is_pass_by_value = True, data_type = convert_to_cudnn_type(epsilon_cpu.dtype)) + X = graph.tensor(name = "X", dim = x_gpu.size(), stride = x_gpu.stride(), data_type = x_gpu.dtype) + scale = graph.tensor(name = "scale", dim = scale_gpu.size(), stride = scale_gpu.stride(), data_type = scale_gpu.dtype) + bias = graph.tensor(name = "bias", dim = bias_gpu.size(), stride = bias_gpu.stride(), data_type = bias_gpu.dtype) + epsilon = graph.tensor(name = "epsilon", dim = epsilon_cpu.size(), stride = epsilon_cpu.stride(), is_pass_by_value = True, data_type = epsilon_cpu.dtype) Y, mean, inv_var = graph.layernorm(name = "LN", norm_forward_phase = cudnn.norm_forward_phase.TRAINING, @@ -67,9 +54,9 @@ def test_layernorm(param_extract): bias = bias, epsilon = epsilon) - 
Y.set_output(True).set_data_type(convert_to_cudnn_type(x_gpu.dtype)) - mean.set_output(True).set_data_type(convert_to_cudnn_type(mean_expected.dtype)) - inv_var.set_output(True).set_data_type(convert_to_cudnn_type(inv_var_expected.dtype)) + Y.set_output(True).set_data_type(x_gpu.dtype) + mean.set_output(True).set_data_type(mean_expected.dtype) + inv_var.set_output(True).set_data_type(inv_var_expected.dtype) graph.validate() graph.build_operation_graph() @@ -110,7 +97,7 @@ def test_layernorm(param_extract): bwd_graph = cudnn.pygraph(intermediate_data_type = cudnn.data_type.FLOAT, compute_data_type = cudnn.data_type.FLOAT) - DY = bwd_graph.tensor(name = "DY", dim = x_gpu.size(), stride = x_gpu.stride(), data_type = convert_to_cudnn_type(x_gpu.dtype)) + DY = bwd_graph.tensor(name = "DY", dim = x_gpu.size(), stride = x_gpu.stride(), data_type = x_gpu.dtype) X_bwd = bwd_graph.tensor_like(X, name = 'X') scale_bwd = bwd_graph.tensor_like(scale, name = 'scale') mean_bwd = bwd_graph.tensor_like(mean, name = 'mean') @@ -123,9 +110,9 @@ def test_layernorm(param_extract): mean = mean_bwd, inv_variance = inv_var_bwd) - DX.set_output(True).set_data_type(convert_to_cudnn_type(x_gpu.dtype)) - Dscale.set_output(True).set_data_type(convert_to_cudnn_type(x_gpu.dtype)) - Dbias.set_output(True).set_data_type(convert_to_cudnn_type(x_gpu.dtype)) + DX.set_output(True).set_data_type(x_gpu.dtype) + Dscale.set_output(True).set_data_type(x_gpu.dtype) + Dbias.set_output(True).set_data_type(x_gpu.dtype) bwd_graph.validate() bwd_graph.build_operation_graph() diff --git a/test/python_fe/test_matmul_bias_relu.py b/test/python_fe/test_matmul_bias_relu.py index 745d014..c6bce03 100644 --- a/test/python_fe/test_matmul_bias_relu.py +++ b/test/python_fe/test_matmul_bias_relu.py @@ -3,6 +3,8 @@ import pytest import torch +from test_utils import torch_fork_set_rng + def convert_to_cudnn_type(torch_type): if torch_type == torch.float16: return cudnn.data_type.HALF @@ -23,54 +25,11 @@ def get_cc(): (major, minor) = torch.cuda.get_device_capability() return major*10 + minor -def compare_tensors(expected, actual, name, rtol=2e-2, atol=2e-2, fudge=1e-9): - assert expected.shape == actual.shape - - expected = expected.float().cuda().flatten() - actual = actual.float().cuda().flatten() - - n_elem = torch.numel(expected) - - mae = (expected - actual).abs().mean().item() - perr = ((expected - actual).abs().sum() / expected.abs().sum()).item() - snr = (expected**2).mean().sqrt() / ((expected - actual) ** 2).mean().sqrt() - snr_db = (10 * torch.log10(snr)).item() - - absolute_error = (expected - actual).abs() - relative_error = absolute_error / torch.where(expected.abs() < fudge, fudge, expected.abs()) - - abs_error_indices = absolute_error > atol - rel_error_indices = relative_error > rtol - n_abs_errors = torch.sum(abs_error_indices) - n_rel_errors = torch.sum(rel_error_indices) - error_indices = torch.logical_and(abs_error_indices, rel_error_indices) - n_errors = torch.sum(error_indices) - - n_nans = torch.isnan(actual).sum() - n_zeros = n_elem - torch.count_nonzero(actual) - - if n_errors != 0: - print(f"========== Comparison for {name} ==========") - print(f"Absolute Tolerance = {atol}") - print(f"Relative Tolerance = {rtol}") - print(f"Number of elements = {n_elem}") - print(f"Number of absolute errors = {n_abs_errors} ({n_abs_errors * 100 / n_elem:.2f}%)") - print(f"Number of relative errors = {n_rel_errors} ({n_rel_errors * 100 / n_elem:.2f}%)") - print(f"Number of errors (absolute and relative) = {n_errors} ({(n_errors * 
100)/n_elem:.2f}%)")
-        print(f"Maximum absolute error = {absolute_error.max():.4f}")
-        print(f"Maximum relative error = {relative_error.max():.4f}")
-        print(f"Mean average error = {mae:.4f}")
-        print(f"Perr error = {perr:.4f} = 1/{(1/perr) if perr != 0 else float('inf'):.2f}")
-        print(f"Signal to noise ratio = {snr.item():.2f} = {snr_db:.2f}dB")
-        print(f"Number of Nans = {n_nans} ({n_nans * 100 / n_elem:.2f}%)")
-        print(f"Number of Zeros = {n_zeros} ({n_zeros * 100 / n_elem:.2f}%)")
-        print("===================================\n")
-
-    return n_errors
-
 @pytest.mark.skipif(cudnn.backend_version() < 8906, reason="requires cudnn 8.9.6 or higher")
 @pytest.mark.skipif(torch.cuda.get_device_capability()[0] < 9, reason="requires Hopper or newer arch")
+@torch_fork_set_rng(seed=0)
 def test_int8_bf16_matmul():
+
     # matmul problem size
     B, M, N, K = 16, 32, 64, 128
 
@@ -115,6 +74,7 @@ def test_int8_bf16_matmul():
 @pytest.mark.parametrize("A_data_type", A_data_type_options)
 @pytest.mark.parametrize("B_data_type", B_data_type_options)
 @pytest.mark.parametrize("MMA_data_type", MMA_data_type_options)
+@torch_fork_set_rng(seed=0)
 def test_mixed_precision_matmul(A_data_type, B_data_type, MMA_data_type):
 
     # matmul problem size
@@ -142,7 +102,7 @@ def test_mixed_precision_matmul(A_data_type, B_data_type, MMA_data_type):
     B = graph.tensor_like(B_gpu)
 
     # Cast the input tensors to required mma precision
-    A_casted = graph.identity(input = A, compute_data_type=convert_to_cudnn_type(MMA_data_type))
+    A_casted = graph.identity(input = A, compute_data_type=cudnn.data_type.FLOAT)
     A_casted.set_data_type(convert_to_cudnn_type(MMA_data_type))
 
     # Casting input tensor B is only supported from cudnn v9
@@ -153,9 +113,12 @@ def test_mixed_precision_matmul(A_data_type, B_data_type, MMA_data_type):
         # Do not create a cast node
         B_casted = B
     else:
-        B_casted = graph.identity(input = B, compute_data_type=convert_to_cudnn_type(MMA_data_type))
+        # Cast the input tensors to required mma precision
+        B_casted = graph.identity(input = B, compute_data_type=cudnn.data_type.FLOAT)
         B_casted.set_data_type(convert_to_cudnn_type(MMA_data_type))
 
+    # CAUTION: hardcodes fp32, as these tests don't cover inputs that are cast to ints.
+    # If your use case does cast inputs to int8, use int32 as the compute type here. 
C = graph.matmul(name = "matmul", A = A_casted, B = B_casted, compute_data_type=cudnn.data_type.FLOAT) C.set_output(True).set_data_type(convert_to_cudnn_type(MMA_data_type)) @@ -170,7 +133,7 @@ def test_mixed_precision_matmul(A_data_type, B_data_type, MMA_data_type): graph.execute({A: A_gpu, B: B_gpu, C: C_actual}, workspace) # compare'em - compare_tensors(C_expected, C_actual, "output", atol=1e-4, rtol=1e-4) + torch.testing.assert_close(C_expected, C_actual, atol=1e-4, rtol=1e-4) problem_size_options = [(1, 128, 768) , (16, 512, 1600) @@ -183,7 +146,9 @@ def test_mixed_precision_matmul(A_data_type, B_data_type, MMA_data_type): def param_extract(request): return request.param +@torch_fork_set_rng(seed=0) def test_matmul_bias_relu(param_extract): + problem_size_options, input_type = param_extract b, s, e = problem_size_options diff --git a/test/python_fe/test_mhas.py b/test/python_fe/test_mhas.py index 5e6f7a6..b25a86f 100644 --- a/test/python_fe/test_mhas.py +++ b/test/python_fe/test_mhas.py @@ -3,10 +3,25 @@ import torch import math -import itertools import random import os +from test_utils import torch_fork_set_rng + +input_type_options = [torch.float16, torch.bfloat16] +layout_options = ["non_interleaved", "bs3hd", "sbh3d"] +head_group_options = ["multi_head", "group_query", "multi_query"] +bias_options = [False, True] +alibi_mask_options = [False, True] +padding_mask_options = [False, True] +causal_mask_options = [False, True] +dropout_options = [False, True] +ragged_options = [False, True] +is_infer_options = [False, True] + +@pytest.fixture(scope="session") +def arg_params(request): + return request.config.option def convert_to_cudnn_type(torch_type): if torch_type == torch.float16: @@ -23,52 +38,6 @@ def convert_to_cudnn_type(torch_type): raise ValueError("Unsupported tensor data type.") -def compare_tensors(expected, actual, name, rtol=2e-2, atol=2e-2, fudge=1e-9): - assert expected.shape == actual.shape - - expected = expected.float().cuda().flatten() - actual = actual.float().cuda().flatten() - - n_elem = torch.numel(expected) - - mae = (expected - actual).abs().mean().item() - perr = ((expected - actual).abs().sum() / expected.abs().sum()).item() - snr = (expected**2).mean().sqrt() / ((expected - actual) ** 2).mean().sqrt() - snr_db = (10 * torch.log10(snr)).item() - - absolute_error = (expected - actual).abs() - relative_error = absolute_error / torch.where(expected.abs() < fudge, fudge, expected.abs()) - - abs_error_indices = absolute_error > atol - rel_error_indices = relative_error > rtol - n_abs_errors = torch.sum(abs_error_indices) - n_rel_errors = torch.sum(rel_error_indices) - error_indices = torch.logical_and(abs_error_indices, rel_error_indices) - n_errors = torch.sum(error_indices) - - n_nans = torch.isnan(actual).sum() - n_zeros = n_elem - torch.count_nonzero(actual) - - if n_errors + n_nans != 0: - print(f"========== Comparison for {name} ==========") - print(f"Absolute Tolerance = {atol}") - print(f"Relative Tolerance = {rtol}") - print(f"Number of elements = {n_elem}") - print(f"Number of absolute errors = {n_abs_errors} ({n_abs_errors * 100 / n_elem:.2f}%)") - print(f"Number of relative errors = {n_rel_errors} ({n_rel_errors * 100 / n_elem:.2f}%)") - print(f"Number of errors (absolute and relative) = {n_errors} ({(n_errors * 100)/n_elem:.2f}%)") - print(f"Maximum absolute error = {absolute_error.max():.4f}") - print(f"Maximum relative error = {relative_error.max():.4f}") - print(f"Mean average error = {mae:.4f}") - print(f"Perr error = {perr:.4f} = 1/{(1/perr) if 
perr != 0 else float('inf'):.2f}") - print(f"Signal to noise ratio = {snr.item():.2f} = {snr_db:.2f}dB") - print(f"Number of Nans = {n_nans} ({n_nans * 100 / n_elem:.2f}%)") - print(f"Number of Zeros = {n_zeros} ({n_zeros * 100 / n_elem:.2f}%)") - print("===================================\n") - - return n_errors + n_nans - - def compute_ref( q, k, @@ -192,52 +161,6 @@ def compute_ref( return o -input_type_options = [torch.float16, torch.bfloat16] -layout_options = ["non_interleaved", "bs3hd", "sbh3d"] -head_group_options = ["multi_head", "group_query", "multi_query"] -bias_options = [False, True] -alibi_mask_options = [False, True] -padding_mask_options = [False, True] -causal_mask_options = [False, True] -dropout_options = [False, True] -ragged_options = [False, True] -is_infer_options = [False, True] - -all_options_forward = [ - elem - for elem in itertools.product( - *[ - input_type_options, - layout_options, - head_group_options, - bias_options, - alibi_mask_options, - padding_mask_options, - causal_mask_options, - dropout_options, - ragged_options, - is_infer_options, - ] - ) -] - -all_options_backward = [ - elem - for elem in itertools.product( - *[ - input_type_options, - layout_options, - head_group_options, - bias_options, - alibi_mask_options, - padding_mask_options, - causal_mask_options, - dropout_options, - ragged_options, - ] - ) -] - def generate_layout(layout, head_group, shape_q, shape_k, shape_v, shape_o): b, h_q, s_q, d_qk = shape_q @@ -350,21 +273,17 @@ def convert_ragged_to_uniform(ragged_tensor, ragged_offset): return uniform_tensor -@pytest.fixture(params=all_options_forward) -def param_extract_forward(request): - return request.param - - -@pytest.mark.parametrize("input_type", input_type_options) -@pytest.mark.parametrize("layout", layout_options) +@pytest.mark.parametrize("is_infer", is_infer_options, ids=lambda p: f"infer{int(p)}") +@pytest.mark.parametrize("is_ragged", ragged_options, ids=lambda p: f"ragged{int(p)}") +@pytest.mark.parametrize("is_dropout", dropout_options, ids=lambda p: f"dropout{int(p)}") +@pytest.mark.parametrize("is_causal", causal_mask_options, ids=lambda p: f"causal{int(p)}") +@pytest.mark.parametrize("is_padding", padding_mask_options, ids=lambda p: f"padding{int(p)}") +@pytest.mark.parametrize("is_alibi", alibi_mask_options, ids=lambda p: f"alibi{int(p)}") +@pytest.mark.parametrize("is_bias", bias_options, ids=lambda p: f"bias{int(p)}") @pytest.mark.parametrize("head_group", head_group_options) -@pytest.mark.parametrize("is_bias", bias_options) -@pytest.mark.parametrize("is_alibi", alibi_mask_options) -@pytest.mark.parametrize("is_padding", padding_mask_options) -@pytest.mark.parametrize("is_causal", causal_mask_options) -@pytest.mark.parametrize("is_dropout", dropout_options) -@pytest.mark.parametrize("is_ragged", ragged_options) -@pytest.mark.parametrize("is_infer", is_infer_options) +@pytest.mark.parametrize("layout", layout_options) +@pytest.mark.parametrize("input_type", input_type_options, ids=lambda p: str(p)) +@torch_fork_set_rng(seed=0) def test_sdpa(input_type, layout, head_group, @@ -374,7 +293,9 @@ def test_sdpa(input_type, is_causal, is_dropout, is_ragged, - is_infer): + is_infer, + arg_params): + if cudnn.backend_version() < 8903: pytest.skip("SDPA fprop requires cudnn 8.9.3 or higher") @@ -402,6 +323,7 @@ def test_sdpa(input_type, if is_ragged and not is_padding: pytest.skip("Ragged tensor is only tested with packed variable length tensors") + # -------------------------- default randomized parameter testing 
+    # -------------------------- default randomized parameter testing ------------------------
     # batch size
     b = 2
     # query sequence length
@@ -426,6 +348,16 @@ def test_sdpa(input_type,
     else:
         assert False, "Head group must be either MHA, GQA, or MQA"
 
+    # -------------------------- override test parameters if args are provided ----------------
+    b = int(arg_params.mha_b) if arg_params.mha_b != None else b
+    s_q = int(arg_params.mha_s_q) if arg_params.mha_s_q != None else s_q
+    s_kv = int(arg_params.mha_s_kv) if arg_params.mha_s_kv != None else s_kv
+    d_qk = int(arg_params.mha_d_qk) if arg_params.mha_d_qk != None else d_qk
+    d_v = int(arg_params.mha_d_v) if arg_params.mha_d_v != None else d_v
+    h_q = int(arg_params.mha_h_q) if arg_params.mha_h_q != None else h_q
+    h_k = int(arg_params.mha_h_k) if arg_params.mha_h_k != None else h_k
+    h_v = int(arg_params.mha_h_v) if arg_params.mha_h_v != None else h_v
+
     if d_qk != d_v and cudnn.backend_version() < 8906:
         pytest.skip("d_qk != d_v is only supported on 8.9.6 onwards.")
@@ -442,11 +374,10 @@ def test_sdpa(input_type,
     if (d_qk % 64 != 0) and cudnn.backend_version() < 8906:
         pytest.skip("d not a multiple of 64 is not supported below 8.9.6")
 
-    # TODO file bug
     if d_qk != d_v and is_ragged:
         pytest.skip("d_qk != d_v is not supported with ragged offset")
 
-    print(f"{s_q=} {s_kv=} {d_qk=} {d_v=} {h_q=} {h_k=} {h_v=}")
+    print(f"{b=} {s_q=} {s_kv=} {d_qk=} {d_v=} {h_q=} {h_k=} {h_v=}")
 
     attn_scale = 0.125
     dropout_prob = 0.1 if is_dropout else 0.0
@@ -481,7 +412,7 @@ def test_sdpa(input_type,
     seed_gpu = torch.full((1, 1, 1, 1), 123456, dtype=torch.int64, device="cuda")
     offset_gpu = torch.full((1, 1, 1, 1), 789, dtype=torch.int64, device="cuda")
 
-    rng_dump_gpu = torch.empty((b, h_q, s_q, s_kv), dtype=torch.float32, device="cuda") if is_dropout else None
+    rng_dump_gpu = torch.zeros((b, h_q, s_q, s_kv), dtype=torch.float32, device="cuda") if is_dropout else None
 
     q_ragged_offset_gpu = (compute_exclusive_prefix_sum(seq_len_q_gpu) * h_q * d_qk).int() if is_ragged else None
     k_ragged_offset_gpu = (compute_exclusive_prefix_sum(seq_len_kv_gpu) * h_k * d_qk).int() if is_ragged else None
@@ -628,20 +559,21 @@ def test_sdpa(input_type,
             stats_ref[i, :, m:, :] = 0
             stats_gpu[i, :, m:, :] = 0
 
-    assert compare_tensors(o_ref, o_gpu, "O") == 0
+    torch.testing.assert_close(o_ref, o_gpu, check_dtype=False, atol=2e-2, rtol=2e-2)
     if is_infer == False:
-        assert compare_tensors(stats_ref, stats_gpu, "stats") == 0
+        torch.testing.assert_close(stats_ref, stats_gpu, atol=2e-2, rtol=2e-2)
 
-@pytest.mark.parametrize("input_type", input_type_options)
-@pytest.mark.parametrize("layout", layout_options)
+@pytest.mark.parametrize("is_ragged", ragged_options, ids=lambda p: f"ragged{int(p)}")
+@pytest.mark.parametrize("is_dropout", dropout_options, ids=lambda p: f"dropout{int(p)}")
+@pytest.mark.parametrize("is_causal", causal_mask_options, ids=lambda p: f"causal{int(p)}")
+@pytest.mark.parametrize("is_padding", padding_mask_options, ids=lambda p: f"padding{int(p)}")
+@pytest.mark.parametrize("is_alibi", alibi_mask_options, ids=lambda p: f"alibi{int(p)}")
+@pytest.mark.parametrize("is_bias", bias_options, ids=lambda p: f"bias{int(p)}")
 @pytest.mark.parametrize("head_group", head_group_options)
-@pytest.mark.parametrize("is_bias", bias_options)
-@pytest.mark.parametrize("is_alibi", alibi_mask_options)
-@pytest.mark.parametrize("is_padding", padding_mask_options)
-@pytest.mark.parametrize("is_causal", causal_mask_options)
-@pytest.mark.parametrize("is_dropout", dropout_options)
-@pytest.mark.parametrize("is_ragged", ragged_options)
+@pytest.mark.parametrize("layout", layout_options)
+@pytest.mark.parametrize("input_type", input_type_options, ids=lambda p: str(p))
+@torch_fork_set_rng(seed=0)
 def test_sdpa_backward(input_type,
                        layout,
                        head_group,
@@ -650,7 +582,9 @@ def test_sdpa_backward(input_type,
                        is_padding,
                        is_causal,
                        is_dropout,
-                       is_ragged):
+                       is_ragged,
+                       arg_params):
+
     if cudnn.backend_version() < 8903:
         pytest.skip("SDPA bprop requires cudnn 8.9.3 or higher")
@@ -696,6 +630,7 @@ def test_sdpa_backward(input_type,
     # test both dP workspace optimization by lowering dP workspace limit to 8MB
     os.environ["CUDNN_FRONTEND_ATTN_DP_WORKSPACE_LIMIT"] = str(8 * 1024 * 1024)
 
+    # -------------------------- default randomized parameter testing ------------------------
     # batch size
     b = 2
     # query sequence length
@@ -739,11 +674,20 @@ def test_sdpa_backward(input_type,
     if (d_qk % 64 != 0) and cudnn.backend_version() < 8906:
         pytest.skip("d not a multiple of 64 is not supported below 8.9.6")
 
-    # TODO file bug
    if d_qk != d_v and is_ragged:
         pytest.skip("d_qk != d_v is not supported with ragged offset")
 
-    print(f"{s_q=} {s_kv=} {d_qk=} {d_v=} {h_q=} {h_k=} {h_v=}")
+    # -------------------------- override test parameters if args are provided ----------------
+    b = int(arg_params.mha_b) if arg_params.mha_b != None else b
+    s_q = int(arg_params.mha_s_q) if arg_params.mha_s_q != None else s_q
+    s_kv = int(arg_params.mha_s_kv) if arg_params.mha_s_kv != None else s_kv
+    d_qk = int(arg_params.mha_d_qk) if arg_params.mha_d_qk != None else d_qk
+    d_v = int(arg_params.mha_d_v) if arg_params.mha_d_v != None else d_v
+    h_q = int(arg_params.mha_h_q) if arg_params.mha_h_q != None else h_q
+    h_k = int(arg_params.mha_h_k) if arg_params.mha_h_k != None else h_k
+    h_v = int(arg_params.mha_h_v) if arg_params.mha_h_v != None else h_v
+
+    print(f"{b=} {s_q=} {s_kv=} {d_qk=} {d_v=} {h_q=} {h_k=} {h_v=}")
 
     attn_scale = 0.125
     dropout_prob = 0.1 if is_dropout else 0.0
@@ -786,7 +730,7 @@ def test_sdpa_backward(input_type,
     seed_gpu = torch.full((1, 1, 1, 1), 123456, dtype=torch.int64, device="cuda")
     offset_gpu = torch.full((1, 1, 1, 1), 789, dtype=torch.int64, device="cuda")
 
-    rng_dump_gpu = torch.empty((b, h_q, s_q, s_kv), dtype=torch.float32, device="cuda") if is_dropout else None
+    rng_dump_gpu = torch.zeros((b, h_q, s_q, s_kv), dtype=torch.float32, device="cuda") if is_dropout else None
 
     q_ragged_offset_gpu = (compute_exclusive_prefix_sum(seq_len_q_gpu) * h_q * d_qk).int() if is_ragged else None
     k_ragged_offset_gpu = (compute_exclusive_prefix_sum(seq_len_kv_gpu) * h_k * d_qk).int() if is_ragged else None
@@ -1057,33 +1001,42 @@ def test_sdpa_backward(input_type,
             dBias_ref[i, :, m:, :] = 0
             dBias_ref[i, :, :, n:] = 0
 
-    assert compare_tensors(dQ_ref, dQ_gpu, "dQ") == 0
-    assert compare_tensors(dK_ref, dK_gpu, "dK", atol=2e-2 if input_type != torch.bfloat16 else 4e-2) == 0
-    assert compare_tensors(dV_ref, dV_gpu, "dV") == 0
+    torch.testing.assert_close(dQ_ref, dQ_gpu, check_dtype=False, atol=2e-2, rtol=2e-2)
+    torch.testing.assert_close(dK_ref, dK_gpu, check_dtype=False, atol=2e-2 if input_type != torch.bfloat16 else 4e-2, rtol=2e-2)
+    torch.testing.assert_close(dV_ref, dV_gpu, check_dtype=False, atol=2e-2 if input_type != torch.bfloat16 else 4e-2, rtol=2e-2)
     if is_bias:
-        assert compare_tensors(dBias_ref, dBias_gpu, "dBias") == 0
+        torch.testing.assert_close(dBias_ref, dBias_gpu, check_dtype=False, atol=2e-2, rtol=2e-2)
 
 if __name__ == "__main__":
+    # example usage
+    # ================== forward ==================
+    """
+    pytest \
+    test/python_fe/test_mhas.py::test_sdpa[torch.float16-non_interleaved-group_query-bias0-alibi0-padding0-causal0-dropout0-ragged0-infer0] \
+    -s \
+    --mha_b 3 \
+    --mha_s_q 256 \
+    --mha_s_kv 128 \
+    --mha_d_qk 48 \
+    --mha_d_v 32 \
+    --mha_h_q 12 \
+    --mha_h_k 3 \
+    --mha_h_v 4
+    """
+    # ================== backward ==================
     """
-    option_forward = (input_type, layout, head_group, is_bias, is_alibi, is_padding, is_causal, is_dropout, is_ragged, is_infer)
-    option_backward = (input_type, layout, head_group, is_bias, is_alibi, is_padding, is_causal, is_dropout, is_ragged)
-    test_sdpa(torch.float16, "bs3hd", "multi_head", False, False, False, False, False, False, False)
-    test_sdpa_backward(torch.float16, "bs3hd", "multi_head", False, False, False, False, False, False)
+    pytest \
+    test/python_fe/test_mhas.py::test_sdpa_backward[torch.float16-non_interleaved-group_query-bias0-alibi0-padding0-causal0-dropout0-ragged0] \
+    -s \
+    --mha_b 3 \
+    --mha_s_q 256 \
+    --mha_s_kv 128 \
+    --mha_d_qk 48 \
+    --mha_d_v 32 \
+    --mha_h_q 12 \
+    --mha_h_k 3 \
+    --mha_h_v 4
    """
-    print("==========running forward tests==========")
-    for option in all_options_forward:
-        try:
-            print(f"Running {option}")
-            test_sdpa(*option)
-        except pytest.skip.Exception as e:
-            print(f"Skipped {option}\n{e}")
-
-    print("==========running backward tests==========")
-    for option in all_options_backward:
-        try:
-            print(f"Running {option}")
-            test_sdpa_backward(*option)
-        except pytest.skip.Exception as e:
-            print(f"Skipped {option}\n{e}")
+    pytest.main([__file__])
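The `--mha_b`, `--mha_s_q`, etc. overrides consumed above through the session-scoped `arg_params` fixture (`request.config.option`) must be registered as pytest command-line options. The corresponding `conftest.py` is not part of this excerpt; a minimal sketch of what it presumably contains, assuming these exact option names:

```python
# conftest.py (hypothetical sketch; the real file is not shown in this diff)
def pytest_addoption(parser):
    # Default of None keeps the randomized in-test defaults above.
    for opt in ("mha_b", "mha_s_q", "mha_s_kv", "mha_d_qk",
                "mha_d_v", "mha_h_q", "mha_h_k", "mha_h_v"):
        parser.addoption(f"--{opt}", default=None, help=f"override {opt} in the MHA tests")
```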
diff --git a/test/python_fe/test_rmsnorm.py b/test/python_fe/test_rmsnorm.py
index e999e0d..41cfad4 100644
--- a/test/python_fe/test_rmsnorm.py
+++ b/test/python_fe/test_rmsnorm.py
@@ -5,19 +5,7 @@
 import torch.nn as nn
 
-def convert_to_cudnn_type(torch_type):
-    if torch_type == torch.float16:
-        return cudnn.data_type.HALF
-    elif torch_type == torch.bfloat16:
-        return cudnn.data_type.BFLOAT16
-    elif torch_type == torch.float32:
-        return cudnn.data_type.FLOAT
-    elif torch_type == torch.bool:
-        return cudnn.data_type.BOOLEAN
-    elif torch_type == torch.uint8:
-        return cudnn.data_type.UINT8
-    else:
-        raise ValueError("Unsupported tensor data type.")
+from test_utils import torch_fork_set_rng
 
 class RMSNorm(torch.nn.Module):
     """Root Mean Square Layer Normalization.
@@ -52,10 +40,9 @@ def param_extract(request):
     return request.param
 
 @pytest.mark.skipif(cudnn.backend_version() < 8906, reason="RmsNorm not supported below cudnn 8.9.6")
+@torch_fork_set_rng(seed=0)
 def test_rmsnorm(param_extract):
-    # TODO(@barretw): ensure output is deterministic and reproducible
-    torch.manual_seed(0)
-
+
     embedding_dim, input_type, has_bias = param_extract
 
     batch_size, seq_size = 16, 128
@@ -89,8 +76,8 @@ def test_rmsnorm(param_extract):
                           bias = bias,
                           epsilon = epsilon)
 
-    Y.set_output(True).set_data_type(convert_to_cudnn_type(x_gpu.dtype))
-    inv_var.set_output(True).set_data_type(convert_to_cudnn_type(inv_var_expected.dtype))
+    Y.set_output(True).set_data_type(x_gpu.dtype)
+    inv_var.set_output(True).set_data_type(inv_var_expected.dtype)
 
     graph.validate()
     graph.build_operation_graph()
@@ -143,10 +130,10 @@ def test_rmsnorm(param_extract):
                        inv_variance = inv_var_bwd,
                        has_dbias = has_bias)
 
-    DX.set_output(True).set_data_type(convert_to_cudnn_type(x_gpu.dtype))
-    Dscale.set_output(True).set_data_type(convert_to_cudnn_type(x_gpu.dtype))
+    DX.set_output(True).set_data_type(x_gpu.dtype)
+    Dscale.set_output(True).set_data_type(x_gpu.dtype)
     if has_bias:
-        Dbias.set_output(True).set_data_type(convert_to_cudnn_type(x_gpu.dtype))
+        Dbias.set_output(True).set_data_type(x_gpu.dtype)
     else:
         assert Dbias is None
diff --git a/test/python_fe/test_utils.py b/test/python_fe/test_utils.py
new file mode 100644
index 0000000..0598035
--- /dev/null
+++ b/test/python_fe/test_utils.py
@@ -0,0 +1,14 @@
+import torch
+import functools
+
+# decorator function to fork the RNG and set the seed for each test
+def torch_fork_set_rng(seed=None):
+    def decorator_(func):
+        @functools.wraps(func)
+        def wrapper_(*args, **kwargs):
+            with torch.random.fork_rng():
+                if seed is not None:
+                    torch.manual_seed(seed)
+                return func(*args, **kwargs)
+        return wrapper_
+    return decorator_
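`torch_fork_set_rng` runs each test inside `torch.random.fork_rng()`, so seeding is deterministic per test and the global RNG state is restored afterwards. A small usage sketch (illustrative only, not part of the diff):

```python
import torch
from test_utils import torch_fork_set_rng

@torch_fork_set_rng(seed=0)
def draw():
    # Always starts from seed 0 thanks to the forked, seeded RNG.
    return torch.rand(3)

state_before = torch.random.get_rng_state()
assert torch.equal(draw(), draw())                              # reproducible
assert torch.equal(state_before, torch.random.get_rng_state())  # no state leak
```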
diff --git a/test/python_fe/test_wgrads.py b/test/python_fe/test_wgrads.py
index dfd9f45..f475b75 100644
--- a/test/python_fe/test_wgrads.py
+++ b/test/python_fe/test_wgrads.py
@@ -2,13 +2,7 @@
 import pytest
 import torch
 
-def convert_to_cudnn_type(torch_type):
-    if torch_type == torch.float16:
-        return cudnn.data_type.HALF
-    elif torch_type == torch.float32:
-        return cudnn.data_type.FLOAT
-    else:
-        raise ValueError("Unsupported tensor data type.")
+from test_utils import torch_fork_set_rng
 
 def is_ampere_arch():
     (major, minor) = torch.cuda.get_device_capability()
@@ -28,6 +22,7 @@ def is_hopper_arch():
 dilation = [1,1]
 
 @pytest.mark.skipif(cudnn.backend_version() < 8800, reason="requires cudnn 8.8 or higher")
+@torch_fork_set_rng(seed=0)
 def test_scale_bias_relu_wgrad():
 
     if not is_ampere_arch() and not is_hopper_arch():
@@ -42,10 +37,10 @@ def test_scale_bias_relu_wgrad():
     graph = cudnn.pygraph(io_data_type = cudnn.data_type.HALF,
                           intermediate_data_type = cudnn.data_type.FLOAT,
                           compute_data_type = cudnn.data_type.FLOAT)
-    X = graph.tensor(name = "X", dim = X_gpu.size(), stride = X_gpu.stride(), data_type = convert_to_cudnn_type(X_gpu.dtype))
-    DY = graph.tensor(name = "DY", dim = DY_gpu.size(), stride = DY_gpu.stride(), data_type = convert_to_cudnn_type(DY_gpu.dtype))
-    B = graph.tensor(name = "B", dim = bias.size(), stride = bias.stride(), data_type = convert_to_cudnn_type(bias.dtype))
-    S = graph.tensor(name = "S", dim = scale.size(), stride = scale.stride(), data_type = convert_to_cudnn_type(scale.dtype))
+    X = graph.tensor(name = "X", dim = X_gpu.size(), stride = X_gpu.stride(), data_type = X_gpu.dtype)
+    DY = graph.tensor(name = "DY", dim = DY_gpu.size(), stride = DY_gpu.stride(), data_type = DY_gpu.dtype)
+    B = graph.tensor(name = "B", dim = bias.size(), stride = bias.stride(), data_type = bias.dtype)
+    S = graph.tensor(name = "S", dim = scale.size(), stride = scale.stride(), data_type = scale.dtype)
 
     scale_output = graph.scale(name = "scale", input = X, scale = S)
     bias_output = graph.bias(name = "bias", input = scale_output, bias = B)
diff --git a/test/unit_tests/CMakeLists.txt b/test/unit_tests/CMakeLists.txt
new file mode 100644
index 0000000..272a92f
--- /dev/null
+++ b/test/unit_tests/CMakeLists.txt
@@ -0,0 +1,57 @@
+cmake_minimum_required(VERSION 3.18)
+
+find_package(Catch2 QUIET)
+
+if(NOT Catch2_FOUND)
+    Include(FetchContent)
+
+    # Fetch and build catch2
+    FetchContent_Declare(
+      Catch2
+      GIT_REPOSITORY https://github.com/catchorg/Catch2.git
+      GIT_TAG v3.3.2
+    )
+    FetchContent_MakeAvailable(Catch2)
+endif()
+
+add_executable(
+    unit_tests
+
+    serialize.cpp
+    validate.cpp
+)
+
+if (MSVC)
+    target_compile_options(
+        unit_tests PRIVATE
+        /W4 /WX # warning level 4 and all warnings as errors
+        /wd4100 # allow unused parameters
+        /wd4458 # local hides class member (currently a problem for all inline setters)
+        /wd4505 # unreferenced function with internal linkage has been removed
+        /wd4101 /wd4189 # unreferenced local
+        /bigobj
+    )
+else()
+    target_compile_options(
+        unit_tests PRIVATE
+        -Wall
+        -Wextra
+        -Werror
+        -Wno-unused-function
+    )
+endif()
+
+target_link_libraries(
+    unit_tests
+    cudnn_frontend
+    Catch2::Catch2WithMain
+)
+
+# cuDNN dlopens its libraries
+# Add all libraries in the link line as NEEDED
+set_target_properties(
+    unit_tests
+    PROPERTIES
+    LINK_WHAT_YOU_USE TRUE
+    RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin
+)
diff --git a/test/unit_tests/serialize.cpp b/test/unit_tests/serialize.cpp
new file mode 100644
index 0000000..6b1e515
--- /dev/null
+++ b/test/unit_tests/serialize.cpp
@@ -0,0 +1,354 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include <catch2/catch_all.hpp>
+
+#include <cudnn_frontend.h>
+
+TEST_CASE("Tensor attributes", "[tensor][serialize]") {
+    namespace fe = cudnn_frontend;
+
+    auto tensor_attributes = fe::graph::Tensor_attributes()
+                                 .set_name("image")
+                                 .set_dim({4, 32, 16, 16})
+                                 .set_stride({32 * 16 * 16, 1, 32 * 16, 32})
+                                 .set_is_virtual(true)
+                                 .set_is_pass_by_value(true)
+                                 .set_uid(12312)
+                                 .set_reordering_type(fe::TensorReordering_t::F16x16)
+                                 .set_data_type(fe::DataType_t::HALF);
+
+    json j = tensor_attributes;
+    auto tensor_attributes_deserialized = j;
+
+    REQUIRE(tensor_attributes_deserialized == tensor_attributes);
+}
+
+TEST_CASE("Conv fprop attributes", "[conv_fprop][serialize]") {
+    namespace fe = cudnn_frontend;
+
+    auto x = std::make_shared<fe::graph::Tensor_attributes>();
+    x->set_name("image")
+        .set_dim({4, 32, 16, 16})
+        .set_stride({32 * 16 * 16, 1, 32 * 16, 32})
+        .set_is_virtual(true)
+        .set_is_pass_by_value(true)
+        .set_uid(12312)
+        .set_reordering_type(fe::TensorReordering_t::F16x16)
+        .set_data_type(fe::DataType_t::HALF);
+
+    auto conv_fprop_attributes = fe::graph::Conv_fprop_attributes()
+                                     .set_name("conv_fprop")
+                                     .set_padding({1, 1})
+                                     .set_stride({1, 1})
+                                     .set_dilation({1, 1})
+                                     .set_compute_data_type(fe::DataType_t::FLOAT);
+
+    json j = conv_fprop_attributes;
+    auto conv_fprop_attributes_deserialized = j;
+
+    REQUIRE(conv_fprop_attributes_deserialized == conv_fprop_attributes);
+}
+
+TEST_CASE("Graph key", "[serialize]") {
+    namespace fe = cudnn_frontend;
+
+    fe::graph::Graph graph;
+    graph.set_io_data_type(fe::DataType_t::HALF)
+        .set_intermediate_data_type(fe::DataType_t::FLOAT)
+        .set_compute_data_type(fe::DataType_t::FLOAT);
+
+    auto X = graph.tensor(
+        fe::graph::Tensor_attributes().set_name("image").set_dim({4, 16, 64}).set_stride({16 * 64, 1, 16}));
+    auto Y = graph.tensor(
+        fe::graph::Tensor_attributes().set_name("filter").set_dim({4, 64, 32}).set_stride({32 * 64, 1, 64}));
+
+    fe::graph::Matmul_attributes matmul;
+    auto Z = graph.matmul(X, Y, matmul);
+
+    auto scale_options = fe::graph::Pointwise_attributes().set_mode(fe::PointwiseMode_t::MUL);
+    auto S = graph.tensor(
+        fe::graph::Tensor_attributes().set_name("scale").set_dim({4, 16, 32}).set_stride({16 * 32, 32, 1}));
+    auto scale_output = graph.pointwise(Z, S, scale_options);
+
+    auto bias_options = fe::graph::Pointwise_attributes().set_mode(fe::PointwiseMode_t::ADD);
+    auto B =
+        graph.tensor(fe::graph::Tensor_attributes().set_name("bias").set_dim({4, 16, 32}).set_stride({16 * 32, 32, 1}));
+    auto bias_output = graph.pointwise(scale_output, B, bias_options);
+
+    auto relu_options = fe::graph::Pointwise_attributes().set_mode(fe::PointwiseMode_t::RELU_FWD);
+    auto O = graph.pointwise(bias_output, relu_options);
+    O->set_output(true);
+
+    cudnnHandle_t handle;
+    cudnnCreate(&handle);
+
+    REQUIRE(graph.validate().is_good());
+
+    REQUIRE(graph.build_operation_graph(handle).is_good());
+    auto key = graph.key();
+
+    REQUIRE(graph.create_execution_plans({fe::HeurMode_t::A}).is_good());
+    REQUIRE(key == graph.key());
+
+    REQUIRE(graph.check_support(handle).is_good());
+    REQUIRE(key == graph.key());
+
+    REQUIRE(graph.build_plans(handle).is_good());
+    REQUIRE(key == graph.key());
+}
+
+TEST_CASE("conv graph serialization", "[graph][serialize]") {
+    namespace fe = cudnn_frontend;
+
+    fe::graph::Graph graph;
+
+    auto x = graph.tensor(fe::graph::Tensor_attributes());
+    x->set_name("image")
+        .set_dim({4, 32, 16, 16})
+        .set_stride({32 * 16 * 16, 1, 32 * 16, 32})
+        .set_is_virtual(false)
+        .set_is_pass_by_value(false)
+        .set_reordering_type(fe::TensorReordering_t::NONE)
+        .set_data_type(fe::DataType_t::HALF);
+
+    auto w = graph.tensor(fe::graph::Tensor_attributes());
+    w->set_name("weight")
+        .set_dim({64, 32, 3, 3})
+        .set_stride({32 * 3 * 3, 1, 32 * 3, 32})
+        .set_is_virtual(false)
+        .set_is_pass_by_value(false)
+        .set_reordering_type(fe::TensorReordering_t::NONE)
+        .set_data_type(fe::DataType_t::HALF);
+
+    auto conv_fprop_attributes = fe::graph::Conv_fprop_attributes()
+                                     .set_name("conv_fprop")
+                                     .set_padding({1, 1})
+                                     .set_stride({1, 1})
+                                     .set_dilation({1, 1})
+                                     .set_compute_data_type(fe::DataType_t::FLOAT);
+
+    auto y = graph.conv_fprop(x, w, conv_fprop_attributes);
+
+    auto b = graph.tensor(fe::graph::Tensor_attributes());
+    b->set_name("bias")
+        .set_dim({1, 32, 1, 1})
+        .set_stride({32, 1, 32, 32})
+        .set_is_virtual(false)
+        .set_is_pass_by_value(false)
+        .set_reordering_type(fe::TensorReordering_t::NONE)
+        .set_data_type(fe::DataType_t::HALF);
+
+    auto pointwise_attributes = fe::graph::Pointwise_attributes()
+                                    .set_name("bias")
+                                    .set_mode(fe::PointwiseMode_t::ADD)
+                                    .set_compute_data_type(fe::DataType_t::FLOAT);
+
+    auto o = graph.pointwise(y, b, pointwise_attributes);
+
+    auto reduction_attributes = fe::graph::Reduction_attributes()
+                                    .set_name("reduction")
+                                    .set_mode(fe::ReductionMode_t::ADD)
+                                    .set_compute_data_type(fe::DataType_t::FLOAT);
+    auto r = graph.reduction(o, reduction_attributes);
+
+    r->set_output(true).set_data_type(fe::DataType_t::HALF);
+
+    REQUIRE(graph.validate().is_good());
+
+    json j = graph;
+    fe::graph::Graph graph_deserialized;
+    REQUIRE(graph_deserialized.deserialize(j).is_good());
+    json j2 = graph_deserialized;
+
+    REQUIRE(j == j2);
+}
+
+TEST_CASE("sdpa graph serialization", "[graph][serialize]") {
+    namespace fe = cudnn_frontend;
+
+    fe::graph::Graph graph;
+    graph.set_io_data_type(fe::DataType_t::HALF)
+        .set_intermediate_data_type(fe::DataType_t::FLOAT)
+        .set_compute_data_type(fe::DataType_t::FLOAT);
+
+    int64_t b = 3;        // batch size
+    int64_t h = 4;        // number of heads
+    int64_t s_q = 1024;   // q tensor is padded to this seq length
+    int64_t s_kv = 1024;  // k and v tensor is padded to this seq length
+    int64_t d = 128;      // hidden dim per head
+
+    auto Q = graph.tensor(fe::graph::Tensor_attributes()
+                              .set_name("Q")
+                              .set_dim({b, h, s_q, d})
+                              .set_stride({3 * h * d, 3 * d, 3 * b * h * d, 1}));
+    auto K = graph.tensor(fe::graph::Tensor_attributes()
+                              .set_name("K")
+                              .set_dim({b, h, s_kv, d})
+                              .set_stride({3 * h * d, 3 * d, 3 * b * h * d, 1}));
+    auto V = graph.tensor(fe::graph::Tensor_attributes()
+                              .set_name("V")
+                              .set_dim({b, h, s_kv, d})
+                              .set_stride({3 * h * d, 3 * d, 3 * b * h * d, 1}));
+
+    auto attn_scale = graph.tensor(fe::graph::Tensor_attributes()
+                                       .set_name("attn_scale")
+                                       .set_dim({1, 1, 1, 1})
+                                       .set_stride({1, 1, 1, 1})
+                                       .set_is_pass_by_value(true)
+                                       .set_data_type(fe::DataType_t::FLOAT));
+
+    auto seed = graph.tensor(fe::graph::Tensor_attributes()
+                                 .set_name("Seed")
+                                 .set_dim({1, 1, 1, 1})
+                                 .set_stride({1, 1, 1, 1})
+                                 .set_data_type(fe::DataType_t::INT32));
+    auto offset = graph.tensor(fe::graph::Tensor_attributes()
+                                   .set_name("Offset")
+                                   .set_dim({1, 1, 1, 1})
+                                   .set_stride({1, 1, 1, 1})
+                                   .set_data_type(fe::DataType_t::INT32));
+
+    auto bias = graph.tensor(fe::graph::Tensor_attributes()
+                                 .set_name("bias")
+                                 .set_dim({b, 1, s_q, s_kv})
+                                 .set_stride({s_q * s_kv, s_q * s_kv, s_kv, 1}));
+
+    auto seq_q = graph.tensor(fe::graph::Tensor_attributes()
+                                  .set_name("seq_q")
+                                  .set_dim({b, 1, 1, 1})
+                                  .set_stride({1, 1, 1, 1})
+                                  .set_data_type(fe::DataType_t::INT32));
+    auto seq_kv = graph.tensor(fe::graph::Tensor_attributes()
+                                   .set_name("seq_kv")
+                                   .set_dim({b, 1, 1, 1})
+                                   .set_stride({1, 1, 1, 1})
+                                   .set_data_type(fe::DataType_t::INT32));
+
+    auto sdpa_options = fe::graph::SDPA_attributes()
+                            .set_name("flash_attention")
+                            .set_is_inference(false)
+                            .set_attn_scale(attn_scale)
+                            .set_alibi_mask(true)
+                            .set_causal_mask(false)
+                            .set_dropout(0.1f, seed, offset)
+                            .set_bias(bias)
+                            .set_padding_mask(true)
+                            .set_seq_len_q(seq_q)
+                            .set_seq_len_kv(seq_kv);
+
+    auto [O, stats] = graph.sdpa(Q, K, V, sdpa_options);
+
+    O->set_output(true).set_dim({b, h, s_q, d}).set_stride({h * d, d, b * h * d, 1});
+    stats->set_output(true).set_data_type(fe::DataType_t::FLOAT);
+
+    REQUIRE(graph.validate().is_good());
+
+    json j = graph;
+    fe::graph::Graph graph_deserialized;
+    REQUIRE(graph_deserialized.deserialize(j).is_good());
+    json j2 = graph_deserialized;
+
+    REQUIRE(j == j2);
+}
+
+TEST_CASE("sdpa backward graph serialization", "[graph][serialize]") {
+    namespace fe = cudnn_frontend;
+
+    int64_t b = 3;        // batch size
+    int64_t h = 4;        // number of heads
+    int64_t s_q = 1024;   // q tensor is padded to this seq length
+    int64_t s_kv = 1024;  // k and v tensor is padded to this seq length
+    int64_t d = 128;      // hidden dim per head
+
+    fe::graph::Graph graph;
+    graph.set_io_data_type(fe::DataType_t::HALF)
+        .set_intermediate_data_type(fe::DataType_t::FLOAT)
+        .set_compute_data_type(fe::DataType_t::FLOAT);
+
+    std::shared_ptr<fe::graph::Tensor_attributes> bias, dropout_seed, dropout_offset;
+
+    auto q = graph.tensor(
+        fe::graph::Tensor_attributes().set_name("Q").set_dim({b, h, s_q, d}).set_stride({h * s_q * d, s_q * d, d, 1}));
+    auto k = graph.tensor(fe::graph::Tensor_attributes()
+                              .set_name("K")
+                              .set_dim({b, h, s_kv, d})
+                              .set_stride({h * s_kv * d, s_kv * d, d, 1}));
+    auto v = graph.tensor(fe::graph::Tensor_attributes()
+                              .set_name("V")
+                              .set_dim({b, h, s_kv, d})
+                              .set_stride({h * s_kv * d, s_kv * d, d, 1}));
+    auto o = graph.tensor(
+        fe::graph::Tensor_attributes().set_name("O").set_dim({b, h, s_q, d}).set_stride({h * s_q * d, s_q * d, d, 1}));
+    auto dO = graph.tensor(
+        fe::graph::Tensor_attributes().set_name("dO").set_dim({b, h, s_q, d}).set_stride({h * s_q * d, s_q * d, d, 1}));
+    auto stats = graph.tensor(fe::graph::Tensor_attributes()
+                                  .set_name("stats")
+                                  .set_dim({b, h, s_q, 1})
+                                  .set_stride({h * s_q, s_q, 1, 1})
+                                  .set_data_type(fe::DataType_t::FLOAT));
+
+    auto attn_scale = graph.tensor(fe::graph::Tensor_attributes()
+                                       .set_name("attn_scale")
+                                       .set_dim({1, 1, 1, 1})
+                                       .set_stride({1, 1, 1, 1})
+                                       .set_is_pass_by_value(true)
+                                       .set_data_type(fe::DataType_t::FLOAT));
+
+    bias = graph.tensor(fe::graph::Tensor_attributes()
+                            .set_name("bias")
+                            .set_dim({b, 1, s_q, s_kv})
+                            .set_stride({s_q * s_kv, s_q * s_kv, s_kv, 1}));
+
+    dropout_seed = graph.tensor(fe::graph::Tensor_attributes()
+                                    .set_name("Seed")
+                                    .set_dim({1, 1, 1, 1})
+                                    .set_stride({1, 1, 1, 1})
+                                    .set_data_type(fe::DataType_t::INT32));
+    dropout_offset = graph.tensor(fe::graph::Tensor_attributes()
+                                      .set_name("Offset")
+                                      .set_dim({1, 1, 1, 1})
+                                      .set_stride({1, 1, 1, 1})
+                                      .set_data_type(fe::DataType_t::INT32));
+
+    auto sdpa_backward_options = fe::graph::SDPA_backward_attributes()
+                                     .set_name("flash_attention_backward")
+                                     .set_causal_mask(true)
+                                     .set_attn_scale(attn_scale)
+                                     .set_bias(bias)
+                                     .set_dropout(0.1f, dropout_seed, dropout_offset);
+
+    auto [dQ, dK, dV] = graph.sdpa_backward(q, k, v, o, dO, stats, sdpa_backward_options);
+
+    dQ->set_output(true).set_dim({b, h, s_q, d}).set_stride({h * s_q * d, s_q * d, d, 1});
+    dK->set_output(true).set_dim({b, h, s_kv, d}).set_stride({h * s_kv * d, s_kv * d, d, 1});
+    dV->set_output(true).set_dim({b, h, s_kv, d}).set_stride({h * s_kv * d, s_kv * d, d, 1});
+
+    REQUIRE(graph.validate().is_good());
+
+    json j = graph;
+    fe::graph::Graph graph_deserialized;
+    REQUIRE(graph_deserialized.deserialize(j).is_good());
+    json j2 = graph_deserialized;
+
+    REQUIRE(j == j2);
+}
\ No newline at end of file
diff --git a/test/unit_tests/validate.cpp b/test/unit_tests/validate.cpp
new file mode 100644
index 0000000..050ae66
--- /dev/null
+++ b/test/unit_tests/validate.cpp
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+#include <catch2/catch_all.hpp>
+
+#include <cudnn_frontend.h>
+
+#include <string>
+
+TEST_CASE("Validate conv node", "[conv][validate]") {
+    namespace fe = cudnn_frontend;
+    fe::graph::Graph graph;
+
+    graph.set_io_data_type(fe::DataType_t::HALF)
+        .set_intermediate_data_type(fe::DataType_t::FLOAT)
+        .set_compute_data_type(fe::DataType_t::FLOAT);
+
+    auto X = graph.tensor(fe::graph::Tensor_attributes().set_name("image").set_stride({32 * 16 * 16, 1, 32 * 16, 32}));
+    auto W = graph.tensor(fe::graph::Tensor_attributes()
+                              .set_name("filter")
+                              .set_dim({64, 32, 3, 3})
+                              .set_stride({32 * 3 * 3, 1, 32 * 3, 32}));
+
+    auto conv_options = fe::graph::Conv_fprop_attributes().set_padding({1, 1}).set_stride({1, 1}).set_dilation({1, 1});
+    auto Y = graph.conv_fprop(X, W, conv_options);
+    Y->set_output(true);
+
+    auto status = graph.validate();
+
+    // Check that the error is "attribute not set" (X is missing its dim)
+    REQUIRE(status.get_code() == fe::error_code_t::ATTRIBUTE_NOT_SET);
+
+    // Check that the error message contains the name of the offending tensor
+    REQUIRE(status.get_message().find(X->get_name()) != std::string::npos);
+}
+
+TEST_CASE("Move", "[move]") {
+    namespace fe = cudnn_frontend;
+    auto validate = [](fe::graph::Graph graph) {
+        REQUIRE(graph.validate().is_good());
+    };
+    auto construct = []() {
+        fe::graph::Graph graph;
+        REQUIRE(graph.validate().is_good());
+        return graph;
+    };
+    fe::graph::Graph graph = construct();
+    REQUIRE(graph.validate().is_good());
+    validate(std::move(graph));
+}
\ No newline at end of file