diff --git a/.clang-tidy b/.clang-tidy new file mode 100644 index 00000000..c2c3ca3a --- /dev/null +++ b/.clang-tidy @@ -0,0 +1,30 @@ +--- +Checks: "-*,\ +misc-*,\ +-misc-incorrect-roundings,\ +-misc-macro-parentheses,\ +-misc-misplaced-widening-cast,\ +-misc-static-assert,\ +-misc-no-recursion,\ +-misc-non-private-member-variables-in-classes,\ +modernize-*,\ +-modernize-deprecated-headers,\ +-modernize-raw-string-literal,\ +-modernize-return-braced-init-list,\ +-modernize-use-transparent-functors,\ +-modernize-use-trailing-return-type,\ +-modernize-avoid-c-arrays,\ +-modernize-use-auto,\ +-modernize-concat-nested-namespaces,\ +-modernize-use-nodiscard,\ +performance-*,\ +readability-*,\ +-readability-function-size,\ +-readability-identifier-naming,\ +" +CheckOptions: + - key: readability-braces-around-statements.ShortStatementLines + value: '2' + - key: readability-implicit-bool-conversion.AllowPointerConditions + value: '1' +... diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 00000000..90e05c40 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,11 @@ +# To get started with Dependabot version updates, you'll need to specify which +# package ecosystems to update and where the package manifests are located. +# Please see the documentation for all configuration options: +# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates + +version: 2 +updates: + - package-ecosystem: "github-actions" # See documentation for possible values + directory: "/" # Location of package manifests + schedule: + interval: "weekly" diff --git a/.github/workflows/continuous_integration.yml b/.github/workflows/continuous_integration.yml new file mode 100644 index 00000000..c19ecf92 --- /dev/null +++ b/.github/workflows/continuous_integration.yml @@ -0,0 +1,330 @@ +name: Continuous Integration + +on: + workflow_dispatch: + push: + branches: [ "amd-staging" ] + paths-ignore: + - '.github/workflows/pull_*.yml' + - '.github/workflows/linting.yml' + - '.github/workflows/markdown_lint.yml' + - '*.md' + pull_request: + branches: [ "amd-staging" ] + paths-ignore: + - '.github/workflows/pull_*.yml' + - '.github/workflows/linting.yml' + - '.github/workflows/markdown_lint.yml' + - '*.md' + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +env: + # TODO(jrmadsen): replace LD_RUNPATH_FLAG, GPU_LIST, etc. 
with internal handling in cmake + ROCM_PATH: "/opt/rocm" + CMAKE_PREFIX_PATH: "/opt/rocm" + LD_RUNPATH_FLAG: " -Wl,--enable-new-dtags -Wl,--rpath,/opt/rocm/lib" + GPU_LIST: "gfx900 gfx906 gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030 gfx1100 gfx1101 gfx1102" + +jobs: + get_latest_mainline_build_number: + runs-on: mi200 + + outputs: + LATEST_BUILD_NUMBER: ${{ steps.get_build_number.outputs.LATEST_BUILD_NUMBER }} + + steps: + - id: get_build_number + run: echo "LATEST_BUILD_NUMBER=$(wget -qO- 'http://rocm-ci.amd.com/job/compute-rocm-dkms-no-npi-hipclang/lastSuccessfulBuild/buildNumber')" >> $GITHUB_OUTPUT + + Mi200-Ubuntu22-Doc-Packages: + # See: https://docs.github.com/en/free-pro-team@latest/actions/learn-github-actions/managing-complex-workflows#using-a-build-matrix + strategy: + fail-fast: true + max-parallel: 4 + matrix: + include: + - os: 'ubuntu-22.04' + runner: 'renderD129' + device: '/renderD129' + build-type: 'Release' + ci-flags: '--coverage' + name-tag: '-codecov' + - os: 'ubuntu-22.04' + runner: 'renderD130' + device: '/renderD130' + build-type: 'RelWithDebInfo' + ci-flags: '' + name-tag: '' + + runs-on: ${{ matrix.runner }} + + # define this for containers + env: + GIT_DISCOVERY_ACROSS_FILESYSTEM: 1 + + container: + image: compute-artifactory.amd.com:5000/rocm-plus-docker/compute-rocm-dkms-no-npi-hipclang:${{ needs.get_latest_mainline_build_number.outputs.LATEST_BUILD_NUMBER }}-${{ matrix.os }}-stg1 + options: --memory=128g --cpus=32 --ipc=host --device=/dev/kfd --device=/dev/dri${{ matrix.device }} --group-add video --cap-add=SYS_PTRACE --cap-add CAP_SYS_PTRACE --cap-add CAP_SYS_ADMIN --security-opt seccomp=unconfined + + if: ${{ always() }} + needs: get_latest_mainline_build_number + + steps: + - uses: actions/checkout@v3 + + - name: List Files + shell: bash + run: | + which-realpath() { echo "$1 resolves to $(realpath $(which $1))"; } + for i in python python3 git cmake ctest; do which-realpath $i; done + ls -la + + - name: Install Python requirements + shell: bash + run: pip3 install -r requirements.txt + + - name: Configure, Build, and Test + timeout-minutes: 30 + shell: bash + run: + python3 ./script/run-ci.py -B build + --name ${{ github.repository }}-${{ github.ref_name }}-mi200-${{ matrix.os }}${{ matrix.name-tag }} + --build-jobs 12 + --site mi200 + --gpu-targets ${{ env.GPU_LIST }} + ${{ matrix.ci-flags }} + -- + -DCMAKE_BUILD_TYPE=${{ matrix.build-type }} + -DCMAKE_INSTALL_PREFIX="${{ env.ROCM_PATH }}" + -DCMAKE_MODULE_PATH="${{ env.ROCM_PATH }}/hip/cmake;${{ env.ROCM_PATH }}/lib/cmake" + -DCMAKE_SHARED_LINKER_FLAGS="${{ env.LD_RUNPATH_FLAG }}" + -DCMAKE_INSTALL_RPATH=${{ env.ROCM_PATH }} + -DCMAKE_INSTALL_RPATH_USE_LINK_PATH=OFF + -DCPACK_PACKAGING_INSTALL_PREFIX=${{ env.ROCM_PATH }} + -DCPACK_OBJCOPY_EXECUTABLE="${{ env.ROCM_PATH }}/llvm/bin/llvm-objcopy" + -DCPACK_READELF_EXECUTABLE="${{ env.ROCM_PATH }}/llvm/bin/llvm-readelf" + -DCPACK_STRIP_EXECUTABLE="${{ env.ROCM_PATH }}/llvm/bin/llvm-strip" + -DCPACK_OBJDUMP_EXECUTABLE="${{ env.ROCM_PATH }}/llvm/bin/llvm-objdump" + -DCPACK_GENERATOR='DEB;RPM;TGZ' + -DPython3_EXECUTABLE=$(which python3) + + - name: Install + timeout-minutes: 10 + run: + cmake --build build --target install --parallel 8 + + - name: Build Docs + timeout-minutes: 10 + run: + cmake --build build --target doc --parallel 8 + + - name: Build Packaging + timeout-minutes: 10 + run: + cmake --build build --target package --parallel 8 + + - name: Archive production artifacts + uses: actions/upload-artifact@v3 + with: + name: artifacts + path: | + 
${{github.workspace}}/build/*.deb + ${{github.workspace}}/build/*.rpm + ${{github.workspace}}/build/*.tgz + + Building-Testing: + # See: https://docs.github.com/en/free-pro-team@latest/actions/learn-github-actions/managing-complex-workflows#using-a-build-matrix + + strategy: + fail-fast: true + matrix: + runner: ['vega20', 'mi100', 'navi21', 'navi32'] + device: [''] + os: ['ubuntu-22.04'] + build-type: ['RelWithDebInfo'] + ci-flags: [''] + name-tag: [''] + extra-options: [''] + include: + - os: 'rhel-8.x' + runner: 'renderD131' + device: '/renderD131' + build-type: 'RelWithDebInfo' + ci-flags: '' + name-tag: '' + extra-options: '--memory=128g --cpus=32' + - os: 'rhel-9.x' + runner: 'renderD129' + device: '/renderD129' + build-type: 'RelWithDebInfo' + ci-flags: '' + name-tag: '' + extra-options: '--memory=128g --cpus=32' + - os: 'sles' + runner: 'renderD130' + device: '/renderD130' + build-type: 'RelWithDebInfo' + ci-flags: '' + name-tag: '' + extra-options: '--memory=128g --cpus=32' + - os: 'ubuntu-20.04' + runner: 'renderD131' + device: '/renderD131' + build-type: 'RelWithDebInfo' + ci-flags: '' + name-tag: '' + extra-options: '--memory=128g --cpus=32' + + runs-on: ${{ matrix.runner }} + + # define this for containers + env: + GIT_DISCOVERY_ACROSS_FILESYSTEM: 1 + + container: + image: compute-artifactory.amd.com:5000/rocm-plus-docker/compute-rocm-dkms-no-npi-hipclang:${{ needs.get_latest_mainline_build_number.outputs.LATEST_BUILD_NUMBER }}-${{ matrix.os }}-stg1 + options: ${{ matrix.extra-options }} --ipc=host --device=/dev/kfd --device=/dev/dri${{ matrix.device }} --group-add video --cap-add=SYS_PTRACE --cap-add CAP_SYS_PTRACE --cap-add CAP_SYS_ADMIN --security-opt seccomp=unconfined + + if: ${{ always() }} + needs: get_latest_mainline_build_number + + steps: + - uses: actions/checkout@v3 + + - name: List Files + shell: bash + run: | + which-realpath() { echo "$1 resolves to $(realpath $(which $1))"; } + for i in python python3 git cmake ctest; do which-realpath $i; done + ls -la + + - name: Install Python requirements + shell: bash + run: pip3 install -r requirements.txt + + - name: Configure, Build, and Test + if: ${{ matrix.runner != 'navi32' }} + timeout-minutes: 30 + shell: bash + run: + python3 ./script/run-ci.py -B build + --name ${{ github.repository }}-${{ github.ref_name }}-${{ matrix.runner }}-${{ matrix.os }}${{ matrix.name-tag }} + --build-jobs 12 + --site ${{ matrix.runner }} + --gpu-targets ${{ env.GPU_LIST }} + ${{ matrix.ci-flags }} + -- + -DCMAKE_BUILD_TYPE=${{ matrix.build-type }} + -DCMAKE_INSTALL_PREFIX="${{ env.ROCM_PATH }}" + -DCMAKE_MODULE_PATH="${{ env.ROCM_PATH }}/hip/cmake;${{ env.ROCM_PATH }}/lib/cmake" + -DCMAKE_SHARED_LINKER_FLAGS="${{ env.LD_RUNPATH_FLAG }}" + -DCMAKE_INSTALL_RPATH=${{ env.ROCM_PATH }} + -DCMAKE_INSTALL_RPATH_USE_LINK_PATH=OFF + -DCPACK_PACKAGING_INSTALL_PREFIX=${{ env.ROCM_PATH }} + -DCPACK_OBJCOPY_EXECUTABLE="${{ env.ROCM_PATH }}/llvm/bin/llvm-objcopy" + -DCPACK_READELF_EXECUTABLE="${{ env.ROCM_PATH }}/llvm/bin/llvm-readelf" + -DCPACK_STRIP_EXECUTABLE="${{ env.ROCM_PATH }}/llvm/bin/llvm-strip" + -DCPACK_OBJDUMP_EXECUTABLE="${{ env.ROCM_PATH }}/llvm/bin/llvm-objdump" + -DCPACK_GENERATOR='DEB;RPM;TGZ' + -DPython3_EXECUTABLE=$(which python3) + + - name: Configure, Build, and Test + if: ${{ matrix.runner == 'navi32' }} + timeout-minutes: 30 + shell: bash + run: + python3 ./script/run-ci.py -B build + --name ${{ github.repository }}-${{ github.ref_name }}-${{ matrix.runner }}${{ matrix.name-tag }} + --build-jobs 12 + --site ${{ 
matrix.runner }} + --gpu-targets ${{ env.GPU_LIST }} + ${{ matrix.ci-flags }} + -- + -DCMAKE_BUILD_TYPE=${{ matrix.build-type }} + -DCMAKE_INSTALL_PREFIX="${{ env.ROCM_PATH }}" + -DCMAKE_MODULE_PATH="${{ env.ROCM_PATH }}/hip/cmake;${{ env.ROCM_PATH }}/lib/cmake" + -DCMAKE_SHARED_LINKER_FLAGS="${{ env.LD_RUNPATH_FLAG }}" + -DCMAKE_INSTALL_RPATH=${{ env.ROCM_PATH }} + -DCMAKE_INSTALL_RPATH_USE_LINK_PATH=OFF + -DCPACK_PACKAGING_INSTALL_PREFIX=${{ env.ROCM_PATH }} + -DCPACK_OBJCOPY_EXECUTABLE="${{ env.ROCM_PATH }}/llvm/bin/llvm-objcopy" + -DCPACK_READELF_EXECUTABLE="${{ env.ROCM_PATH }}/llvm/bin/llvm-readelf" + -DCPACK_STRIP_EXECUTABLE="${{ env.ROCM_PATH }}/llvm/bin/llvm-strip" + -DCPACK_OBJDUMP_EXECUTABLE="${{ env.ROCM_PATH }}/llvm/bin/llvm-objdump" + -DCPACK_GENERATOR='DEB;RPM;TGZ' + -DPython3_EXECUTABLE=$(which python3) + -- + -LE v1 + + sanitizers: + # See: https://docs.github.com/en/free-pro-team@latest/actions/learn-github-actions/managing-complex-workflows#using-a-build-matrix + + strategy: + fail-fast: false + matrix: + include: + - os: 'ubuntu-22.04' + runner: 'vega20' + build-type: 'RelWithDebInfo' + ci-flags: '' + sanitizer: 'ThreadSanitizer' + - os: 'ubuntu-22.04' + runner: 'navi32' + build-type: 'RelWithDebInfo' + ci-flags: '' + sanitizer: 'LeakSanitizer' + - os: 'ubuntu-22.04' + runner: 'mi100' + build-type: 'RelWithDebInfo' + ci-flags: '' + sanitizer: 'AddressSanitizer' + + runs-on: ${{ matrix.runner }} + + # define this for containers + env: + GIT_DISCOVERY_ACROSS_FILESYSTEM: 1 + + container: + image: compute-artifactory.amd.com:5000/rocm-plus-docker/compute-rocm-dkms-no-npi-hipclang:${{ needs.get_latest_mainline_build_number.outputs.LATEST_BUILD_NUMBER }}-${{ matrix.os }}-stg1 + options: --privileged --ipc=host --device=/dev/kfd --device=/dev/dri --group-add video --cap-add=SYS_PTRACE --cap-add CAP_SYS_PTRACE --cap-add CAP_SYS_ADMIN --security-opt seccomp=unconfined + + if: ${{ always() }} + needs: get_latest_mainline_build_number + + steps: + - uses: actions/checkout@v3 + + - name: List Files + shell: bash + run: | + which-realpath() { echo "$1 resolves to $(realpath $(which $1))"; } + for i in python python3 git cmake ctest; do which-realpath $i; done + ls -la + + - name: Install Python requirements + shell: bash + run: pip3 install -r requirements.txt + + - name: Configure, Build, and Test + timeout-minutes: 30 + shell: bash + run: + python3 ./script/run-ci.py -B build + --name ${{ github.repository }}-${{ github.ref_name }}-mi100-${{ matrix.sanitizer }} + --build-jobs 12 + --site mi100 + --gpu-targets ${{ env.GPU_LIST }} + --memcheck=${{ matrix.sanitizer }} + ${{ matrix.ci-flags }} + -- + -DCMAKE_BUILD_TYPE=${{ matrix.build-type }} + -DCMAKE_INSTALL_PREFIX="${{ env.ROCM_PATH }}" + -DCMAKE_MODULE_PATH="${{ env.ROCM_PATH }}/hip/cmake;${{ env.ROCM_PATH }}/lib/cmake" + -DCMAKE_SHARED_LINKER_FLAGS="${{ env.LD_RUNPATH_FLAG }}" + -DCMAKE_INSTALL_RPATH=${{ env.ROCM_PATH }} + -DCMAKE_INSTALL_RPATH_USE_LINK_PATH=OFF + -DPython3_EXECUTABLE=$(which python3) diff --git a/.github/workflows/docker_cleanup.yml b/.github/workflows/docker_cleanup.yml new file mode 100644 index 00000000..9caa4adf --- /dev/null +++ b/.github/workflows/docker_cleanup.yml @@ -0,0 +1,27 @@ +name: Dockers Cleanup + +on: + # allow triggering manually + workflow_dispatch: + # run on weekly schedule + schedule: + - cron: "0 0 * * 6" + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + cleanup-dockers: + + strategy: + fail-fast: false + matrix: + runner: 
['vega20', 'mi200', 'mi100', 'navi21', 'navi31']
+
+    runs-on: ${{ matrix.runner }}
+
+    steps:
+      - name: prune-dockers
+        run: |
+          docker system prune -f -a --volumes
diff --git a/.github/workflows/formatting.yml b/.github/workflows/formatting.yml
index fbed48e0..9af70090 100644
--- a/.github/workflows/formatting.yml
+++ b/.github/workflows/formatting.yml
@@ -1,12 +1,14 @@
 name: Formatting
-run-name: formatting
 
 on:
+  workflow_dispatch:
   pull_request:
     branches: [ amd-staging ]
     paths-ignore:
       - '.github/workflows/pull_*.yml'
+      - '.github/workflows/linting.yml'
+      - '.github/workflows/markdown_lint.yml'
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.ref }}
@@ -48,10 +50,10 @@ jobs:
         if: failure()
         uses: peter-evans/create-pull-request@v5
         with:
-          commit-message: "run cmake formatting (cmake-format)"
+          commit-message: "cmake formatting (cmake-format)"
           branch: ${{ steps.extract_branch.outputs.branch }}-cmake-format
           delete-branch: true
-          title: "Apply cmake-format to ${{ steps.extract_branch.outputs.branch }}"
+          title: "Format cmake code (via cmake-format) on ${{ steps.extract_branch.outputs.branch }}"
           base: ${{ steps.extract_branch.outputs.branch }}
 
   source:
@@ -90,8 +92,55 @@ jobs:
         if: failure()
         uses: peter-evans/create-pull-request@v5
         with:
-          commit-message: "run formatting (clang-format v11)"
+          commit-message: "source formatting (clang-format v11)"
           branch: ${{ steps.extract_branch.outputs.branch }}-clang-format
           delete-branch: true
-          title: "Apply clang-format (v11) to ${{ steps.extract_branch.outputs.branch }}"
+          title: "Format source code (via clang-format v11) on ${{ steps.extract_branch.outputs.branch }}"
+          base: ${{ steps.extract_branch.outputs.branch }}
+
+  python:
+    runs-on: ubuntu-22.04
+    strategy:
+      matrix:
+        python-version: ['3.10']
+
+    steps:
+      - uses: actions/checkout@v3
+
+      - name: Extract branch name
+        shell: bash
+        run: |
+          echo "branch=${GITHUB_HEAD_REF:-${GITHUB_REF#refs/heads/}}" >> $GITHUB_OUTPUT
+        id: extract_branch
+
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          python -m pip install black
+
+      - name: black format
+        run: |
+          black .
+          if [ $(git diff | wc -l) -ne 0 ]; then
+            echo -e "\nError! Python code not formatted. Run black...\n"
+            echo -e "\nFiles:\n"
+            git diff --name-only
+            echo -e "\nFull diff:\n"
+            git diff
+            exit 1
+          fi
+
+      - name: Create pull request
+        if: failure()
+        uses: peter-evans/create-pull-request@v5
+        with:
+          commit-message: "python formatting (black)"
+          branch: ${{ steps.extract_branch.outputs.branch }}-python-format
+          delete-branch: true
+          title: "Format python code (via black) on ${{ steps.extract_branch.outputs.branch }}"
           base: ${{ steps.extract_branch.outputs.branch }}
diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml
new file mode 100644
index 00000000..361088e9
--- /dev/null
+++ b/.github/workflows/linting.yml
@@ -0,0 +1,103 @@
+name: Linting
+
+on:
+  workflow_dispatch:
+  push:
+    branches: [ "amd-staging" ]
+    paths-ignore:
+      - '.github/workflows/pull_*.yml'
+      - '*.md'
+  pull_request:
+    branches: [ "amd-staging" ]
+    paths-ignore:
+      - '.github/workflows/pull_*.yml'
+      - '*.md'
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+env:
+  # TODO(jrmadsen): replace LD_RUNPATH_FLAG, GPU_LIST, etc.
with internal handling in cmake + GIT_DISCOVERY_ACROSS_FILESYSTEM: 1 + ROCM_PATH: "/opt/rocm" + CMAKE_PREFIX_PATH: "/opt/rocm" + LD_RUNPATH_FLAG: " -Wl,--enable-new-dtags -Wl,--rpath,/opt/rocm/lib" + GPU_LIST: "gfx900 gfx906 gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030 gfx1100 gfx1101 gfx1102" + +jobs: + get_latest_mainline_build_number: + runs-on: mi200 + + outputs: + LATEST_BUILD_NUMBER: ${{ steps.get_build_number.outputs.LATEST_BUILD_NUMBER }} + + steps: + - id: get_build_number + run: echo "LATEST_BUILD_NUMBER=$(wget -qO- 'http://rocm-ci.amd.com/job/compute-rocm-dkms-no-npi-hipclang/lastSuccessfulBuild/buildNumber')" >> $GITHUB_OUTPUT + + linting: + strategy: + fail-fast: false + matrix: + include: + - build-type: 'Debug' + linter: 'clang-tidy' + runner: 'mi100' + os: 'ubuntu-22.04' + - build-type: 'Release' + linter: 'clang-tidy' + runner: 'vega20' + os: 'ubuntu-22.04' + + runs-on: ${{ matrix.runner }} + + needs: get_latest_mainline_build_number + + container: + image: compute-artifactory.amd.com:5000/rocm-plus-docker/compute-rocm-dkms-no-npi-hipclang:${{ needs.get_latest_mainline_build_number.outputs.LATEST_BUILD_NUMBER }}-${{ matrix.os }}-stg1 + options: --ipc=host --device=/dev/kfd --device=/dev/dri --group-add video --cap-add=SYS_PTRACE --cap-add CAP_SYS_PTRACE --cap-add CAP_SYS_ADMIN --security-opt seccomp=unconfined + + steps: + - uses: actions/checkout@v3 + + - name: List Files + shell: bash + run: | + which-realpath() { echo "$1 resolves to $(realpath $(which $1))"; } + for i in python python3 git cmake ctest; do which-realpath $i; done + ls -la + + - name: Update container + run: | + apt-get update + apt-get install -y clang-tidy-11 g++-12 + update-alternatives --install /usr/bin/clang-tidy clang-tidy /usr/bin/clang-tidy-11 10 + update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 10 --slave /usr/bin/g++ g++ /usr/bin/g++-11 + update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 20 --slave /usr/bin/g++ g++ /usr/bin/g++-12 + + - name: Install Python requirements + shell: bash + run: | + python3 -m pip install -r requirements.txt + + - name: Configure, Build, and Test + timeout-minutes: 30 + shell: bash + run: + python3 ./script/run-ci.py -B build + --name ${{ github.repository }}-${{ github.ref_name }}-${{ matrix.runner }}-${{ matrix.linter }}-${{ matrix.build-type }} + --build-jobs 12 + --site ${{ matrix.runner }} + --gpu-targets ${{ env.GPU_LIST }} + --linter ${{ matrix.linter }} + -- + -DCMAKE_BUILD_TYPE=${{ matrix.build-type }} + -DCMAKE_INSTALL_PREFIX="${{ env.ROCM_PATH }}" + -DCMAKE_MODULE_PATH="${{ env.ROCM_PATH }}/hip/cmake;${{ env.ROCM_PATH }}/lib/cmake" + -DCMAKE_SHARED_LINKER_FLAGS="${{ env.LD_RUNPATH_FLAG }}" + -DCMAKE_INSTALL_RPATH=${{ env.ROCM_PATH }} + -DCMAKE_INSTALL_RPATH_USE_LINK_PATH=OFF + -DPython3_EXECUTABLE=$(which python3) + -- + -VV diff --git a/.github/workflows/markdown_lint.yml b/.github/workflows/markdown_lint.yml new file mode 100644 index 00000000..2523de0d --- /dev/null +++ b/.github/workflows/markdown_lint.yml @@ -0,0 +1,21 @@ +name: Markdown Lint + +on: + workflow_dispatch: + pull_request: + branches: [ "amd-staging" ] + paths: + - '*.md' + +jobs: + check-readme: + runs-on: ubuntu-latest + steps: + - name: Check out code + uses: actions/checkout@v3 + + - name: Lint Markdown files + uses: avto-dev/markdown-lint@v1 + with: + config: './.markdown-lint-config.yml' + args: './README.md' diff --git a/.markdown-lint-config.yml b/.markdown-lint-config.yml new file mode 100644 index 00000000..3161169c --- /dev/null +++ 
b/.markdown-lint-config.yml @@ -0,0 +1,141 @@ +default: false # includes/excludes all rules by default + +# Heading levels should only increment by one level at a time +MD001: true + +# Heading style +MD003: true + +# Unordered list style +MD004: true + +# Inconsistent indentation for list items at the same level +MD005: true + +# Consider starting bulleted lists at the beginning of the line +MD006: true + +# Unordered list indentation +MD007: true + +# Trailing spaces +MD009: true + +# Hard tabs +MD010: true + +# Reversed link syntax +MD011: true + +# Multiple consecutive blank lines +MD012: true + +# Line length +MD013: false + +# Dollar signs used before commands without showing output +MD014: false + +# No space after hash on atx style heading +MD018: true + +# Multiple spaces after hash on atx style heading +MD019: true + +# No space inside hashes on closed atx style heading +MD020: true + +# Multiple spaces inside hashes on closed atx style heading +MD021: true + +# Headings should be surrounded by blank lines +MD022: true + +# Headings must start at the beginning of the line +MD023: true + +# Multiple headings with the same content +MD024: + allow_different_nesting: true + +# Multiple top level headings in the same document +MD025: true + +# Trailing punctuation in heading +MD026: true + +# Multiple spaces after blockquote symbol +MD027: true + +# Blank line inside blockquote +MD028: false + +# Ordered list item prefix +MD029: + style: 'one' + +# Spaces after list markers +MD030: true + +# Fenced code blocks should be surrounded by blank lines +MD031: true + +# Lists should be surrounded by blank lines +MD032: true + +# Inline HTML +MD033: true + +# Bare URL used +MD034: true + +# Horizontal rule style +MD035: + style: '***' + +# Emphasis used instead of a heading +MD036: true + +# Spaces inside emphasis markers +MD037: true + +# Spaces inside code span elements +MD038: true + +# Spaces inside link text +MD039: true + +# Fenced code blocks should have a language specified +MD040: true + +# First line in file should be a top level heading +MD041: true + +# No empty links +MD042: true + +# Required heading structure +MD043: false + +# Proper names should have the correct capitalization +MD044: false + +# Images should have alternate text (alt text) +MD045: false + +# Code block style +MD046: + style: 'fenced' + +# Files should end with a single newline character +MD047: true + +# Code fence style +MD048: + style: 'backtick' + +# Custom rules: +CHANGELOG-RULE-001: true +CHANGELOG-RULE-002: true +CHANGELOG-RULE-003: true +CHANGELOG-RULE-004: true \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index 4ebf937c..f5284cca 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -20,7 +20,21 @@ # SOFTWARE. 
# ############################################################################## -cmake_minimum_required(VERSION 3.18.0) +cmake_minimum_required(VERSION 3.18.0 FATAL_ERROR) + +set(CMAKE_C_FLAGS_COVERAGE_INIT + "-Og -g3 -fno-omit-frame-pointer -fprofile-abs-path -fprofile-arcs -ftest-coverage --coverage" + CACHE STRING "C flags for code coverage builds") +set(CMAKE_CXX_FLAGS_COVERAGE_INIT + "-Og -g3 -fno-omit-frame-pointer -fprofile-abs-path -fprofile-arcs -ftest-coverage --coverage" + CACHE STRING "C++ flags for code coverage builds") + +set(CMAKE_C_FLAGS_COVERAGE + "${CMAKE_C_FLAGS_COVERAGE_INIT}" + CACHE STRING "C flags for code coverage builds") +set(CMAKE_CXX_FLAGS_COVERAGE + "${CMAKE_CXX_FLAGS_COVERAGE_INIT}" + CACHE STRING "C++ flags for code coverage builds") # Build is not supported on Windows plaform if(WIN32) @@ -49,18 +63,24 @@ endif() set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) -add_compile_options(-Wall) - set(THREADS_PREFER_PTHREAD_FLAG ON) +set(CMAKE_BUILD_RPATH + "${PROJECT_BINARY_DIR}:${PROJECT_BINARY_DIR}/${CMAKE_INSTALL_LIBDIR}") +set(CMAKE_BUILD_RPATH_USE_ORIGIN ON) +set(CMAKE_SKIP_BUILD_RPATH OFF) # Adding default path cmake modules -list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake_modules") +list(INSERT CMAKE_MODULE_PATH 0 "${CMAKE_CURRENT_SOURCE_DIR}/cmake_modules") + # Set build environment -include(utils) -include(env) +include(rocprofiler_options) +include(rocprofiler_utils) +include(rocprofiler_env) +include(rocprofiler_formatting) +include(rocprofiler_linting) # Setup the package version. -get_version("1.0.0") +rocprofiler_get_version("1.0.0") message("-- LIB-VERSION: ${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_PATCH}") set(BUILD_VERSION_MAJOR ${VERSION_MAJOR}) @@ -162,23 +182,33 @@ if(USE_PROF_API EQUAL 1) endif() endif() +enable_testing() + +# Temporarily for CI to work +set(ROCPROFILER_BUILD_TESTS ON) +set(ROCPROFILER_BUILD_CI ON) + +if(ROCPROFILER_BUILD_CI) + include(CTest) +endif() + # Build libraries add_subdirectory(src) -if(${LIBRARY_TYPE} STREQUAL SHARED) +# Build Plugins +add_subdirectory(plugin) + +if(ROCPROFILER_BUILD_SAMPLES) # Build samples add_subdirectory(samples) +endif() +if(ROCPROFILER_BUILD_TESTS) # Build tests + add_subdirectory(test) add_subdirectory(tests-v2) endif() -# Build Plugins -add_subdirectory(plugin) - -# Build tests -add_subdirectory(${TEST_DIR} ${PROJECT_BINARY_DIR}/test) - # Installation and packaging set(DEST_NAME ${ROCPROFILER_NAME}) if(DEFINED CMAKE_INSTALL_PREFIX) @@ -241,27 +271,12 @@ install( DESTINATION ${CMAKE_INSTALL_LIBDIR}/${ROCPROFILER_NAME} COMPONENT runtime) -# librocprof-tool.so -install( - FILES ${PROJECT_BINARY_DIR}/test/librocprof-tool.so - DESTINATION ${CMAKE_INSTALL_LIBDIR}/${ROCPROFILER_NAME} - COMPONENT runtime) - -install( - FILES ${PROJECT_BINARY_DIR}/test/librocprof-tool.so - DESTINATION ${CMAKE_INSTALL_LIBDIR}/${ROCPROFILER_NAME} - COMPONENT asan) - -install( - FILES ${PROJECT_BINARY_DIR}/test/rocprof-ctrl - DESTINATION ${CMAKE_INSTALL_LIBDIR}/${ROCPROFILER_NAME} - PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ - WORLD_EXECUTE - COMPONENT runtime) - -# File reorg Backward compatibility -option(FILE_REORG_BACKWARD_COMPATIBILITY - "Enable File Reorg with backward compatibility" OFF) +# File reorg backward compatibility for non ASAN packaging +if(NOT ENABLE_ASAN_PACKAGING) + # File reorg Backward compatibility + option(FILE_REORG_BACKWARD_COMPATIBILITY + "Enable File Reorg with backward compatibility" ON) +endif() 
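The `CMAKE_<LANG>_FLAGS_COVERAGE` variables introduced at the top of this CMakeLists.txt change only take effect when the matching build type is selected, since CMake applies `CMAKE_<LANG>_FLAGS_<CONFIG>` for the active configuration. A minimal sketch of how a coverage build might be driven locally (the build directory name and job count are illustrative, not part of this change):

```bash
# Configure with the custom Coverage build type so that
# CMAKE_C_FLAGS_COVERAGE / CMAKE_CXX_FLAGS_COVERAGE are applied,
# then build and run the tests to emit gcov data (.gcno/.gcda files).
cmake -B build -DCMAKE_BUILD_TYPE=Coverage .
cmake --build build --parallel 8
cd build && ctest
```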
 if(FILE_REORG_BACKWARD_COMPATIBILITY)
     # To enable/disable #error in wrapper header files
diff --git a/README.md b/README.md
index af9c0962..75fd98b2 100644
--- a/README.md
+++ b/README.md
@@ -83,19 +83,24 @@ export ROCPROFILER_TRACE=1
 ## Supported AMD GPU Architectures (V1)
 
 The following AMD GPU architectures are supported with ROCprofiler V1:
-
+
 - gfx8 (Fiji/Ellesmere)
 - gfx900 (AMD Vega 10)
 - gfx906 (AMD Vega 7nm also referred to as AMD Vega 20)
 - gfx908 (AMD Instinct™ MI100 accelerator)
 - gfx90a (AMD Instinct™ MI200)
 
+***
+Note: ROCProfiler V1 tool usage documentation is available [here](doc/rocprof_tool.md).
+***
+
 ## ROCProfiler V2
 
-ROCProfilerV2 is a newly developed design for AMD’s tooling infrastructure that provides a hardware specific low level performance analysis interface for profiling of GPU compute applications. The first API library version for ROCProfiler v2 is 9.0.0
-### Note: ROCProfilerV2 is currently considered a beta version and is subject to change in future releases
+***
+Note: ROCProfilerV2 is currently considered a beta version and is subject to change in future releases
+***
 
 ### ROCProfilerV2 Modules
@@ -288,14 +293,6 @@ Usage:
   rocprofv2 --plugin perfetto --hsa-trace -d output_dir # -d is optional, but can be used to define the directory output for output results
   ```
 
-  Both the output directory and filenames allow for simple environment variable substitution via a special syntax %q{var} -> $var, e.g.:
-
-  ```bash
-  export var="FOO"
-  rocprofv2 --plugin perfetto -o file_%q{var}_name
-  # Generates file names: file_FOO_name[...].pftrace
-  ```
-
 - CTF plugin: Outputs the data in ctf format(a binary trace format). CTF binary output can be viewed using TraceCompass or babeltrace.
 
   Usage:
@@ -313,7 +310,7 @@ Tool used to collect fine-grained hardware metrics. Provides ISA-level instructi
   rocprofv2 -i input.txt --plugin att --mode network
   ```
 
-  - app_assembly_file:
+  - app_assembly_file: On ROCm 6.0, ATT enables automatic capture of the ISA during kernel execution, and does not require recompiling. It is recommended to leave this at "auto".
 
   - app_relative_path
     Path for the running application
@@ -356,7 +353,7 @@ Tool used to collect fine-grained hardware metrics. Provides ISA-level instructi
   - att: TARGET_CU=1 //or some other CU [0,15] - WGP for Navi [0,8]
   - SE_MASK=0x1 // bitmask of shader engines. The fewer, the easier on the hardware. Default enables 1 out of 4 shader engines.
   - SIMD_MASK=0xF // GFX9: bitmask of SIMDs. Navi: SIMD Index [0-3].
-  - DISPATCH=ID,RN // collect trace only for the given dispatch_ID and MPI rank RN. RN is optional and ignored for single processes. Multiple line with varying combinations of RN and ID can be added.
+  - DISPATCH=ID,RN // collect trace only for the given dispatch_ID and MPI rank RN. RN is ignored for single processes. Multiple lines with varying combinations of RN and ID can be added.
   - KERNEL=kernname // Profile only kernels containing the string kernname (c++ mangled name). Multiple lines can be added.
   - PERFCOUNTERS_COL_PERIOD=0x3 // Multiplier period for counter collection [0~31]. 0=fastest (usually once every 16 cycles). GFX9 only. Counters will be shown in a graph over time in the browser UI.
   - PERFCOUNTER=counter_name // Add a SQ counter to be collected with ATT; period defined by PERFCOUNTERS_COL_PERIOD. GFX9 only.
@@ -434,7 +431,105 @@ A device profiling session allows the user to profile the GPU device for counter
 
 ### Session Support
 
-A session is a unique identifier for a profiling/tracing/pc-sampling task. A ROCProfilerV2 Session has enough information about what needs to be collected or traced and it allows the user to start/stop profiling/tracing whenever required. More details on the API can be found in the API specification documentation that can be installed using rocprofiler-doc package. Samples also can be found for how to use the API in samples directory.
+  A session is a unique identifier for a profiling/tracing/pc-sampling task. A ROCProfilerV2 Session has enough information about what needs to be collected or traced, and it allows the user to start/stop profiling/tracing whenever required. More details on the API can be found in the API specification documentation, which can be installed using the rocprofiler-doc package. Samples showing how to use the API can also be found in the samples directory.
+
+- #### (ATT) Advanced Thread Trace
+
+  Tool used to collect fine-grained hardware metrics. Provides ISA-level instruction hotspot analysis via hardware tracing.
+
+  ```bash
+  # ATT(Advanced Thread Trace) needs some preparation before running.
+
+  # 1. Make sure to generate the assembly file for the application by executing the following before compiling your HIP application.
+  # This can be achieved globally with the following environment variable:
+  export HIPCC_COMPILE_FLAGS_APPEND="--save-temps -g"
+  # Similarly, the --save-temps -g flags can be added per file for better ISA generation control.
+
+  # 2. Install plugin package
+  # see Plugin Support section for installation
+
+  # 3. Run the following to view the trace
+  # Att-specific options must come right after the assembly file
+  rocprofv2 -i input.txt --plugin att --mode network
+  ```
+
+  ```bash
+  # Example for vectoradd on navi31.
+  # Special attention to gfx1100.s==navi31 in the ISA file name.
+  # Use gfx1030 for navi21, gfx90a for MI200 and gfx940 for MI300
+  hipcc -g --save-temps vectoradd_hip.cpp -o vectoradd_hip.exe
+  rocprofv2 -i input.txt --plugin att vectoradd_hip-hip-amdgcn-amd-amdhsa-gfx1100.s --mode network ./vectoradd_hip.exe
+  # Then open the browser at http://localhost:8000
+  # The ISA can also be obtained from llvm/roc objdump, however, annotations will be different
+  ```
+
+  For MPI or very long applications, we recommend running only the collection first, and later running the parser on the already-collected data:
+
+  ```bash
+  # Run only collection: The assembly file is not used. Use mpirun [...] rocprofv2 [...] if needed.
+  rocprofv2 -i input.txt --plugin att none ./vectoradd_hip.exe
+  # Remove the binary/application: Only runs the parser.
+  rocprofv2 -i input.txt --plugin att vectoradd_hip-hip-amdgcn-amd-amdhsa-gfx1100.s --mode network
+  ```
+
+- ##### app_assembly_file_relative_path
+
+  AMDGCN ISA file with .s extension generated in the 1st step
+
+- ##### app_relative_path
+
+  Path for the running application
+
+- ##### ATT plugin optional parameters
+
+  - --depth [n]: How many waves per slot to parse (maximum).
+  - --mpi [proc]: Parse with this many MPI processes, for greater analysis speed. Does not change results. Requires mpi4py.
+  - --att_kernel "filename": Kernel filename to use (instead of ATT asking which one to use).
+  - --trace_file "files": glob (wildcards allowed) of trace files to parse. Requires quotes for use with wildcards.
+  - --mode [network, file, off (default)]
+
+- ##### network
+
+  Opens the server with the browser UI.
+  att needs 2 ports available (e.g. 8000, 18000). There is an option (default: --ports "8000,18000") to change these.
+  If rocprofv2 is running on a different machine, use port forwarding "ssh -L 8000:localhost:8000 " so the browser can be used locally. For Docker, use --network=host --ipc=host -p8000:8000 -p18000:18000
+
+- ##### file
+
+  Dumps the analyzed json files to disk for viewing at a later time. Run python3 httpserver.py from within the generated ui/ folder to view the trace, similarly to network mode. The folder can be copied to another machine, and will run without ROCm.
+
+- ##### off
+
+  Runs trace collection but not analysis, so it can be analyzed at a later time. Run rocprofv2 ATT [network, file] with the same parameters, removing the application binary, to analyze previously generated traces. We recommend not setting the mode when collecting for MPI applications.
+
+- ##### input.txt
+
+  Required. Used to select specific compute units and other trace parameters.
+  For first-time users, we recommend compiling and running vectorAdd with
+
+  ```bash
+  att: TARGET_CU=1
+  SE_MASK=0x1
+  SIMD_MASK=0x3
+  ```
+
+  and histogram with
+
+  ```bash
+  att: TARGET_CU=0
+  SE_MASK=0xFF
+  SIMD_MASK=0xF // 0xF for GFX9, SIMD_MASK=0 for Navi
+  ```
+
+  Possible contents:
+  - att: TARGET_CU=1 //or some other CU [0,15] - WGP for Navi [0,8]
+  - SE_MASK=0x1 // bitmask of shader engines. The fewer, the easier on the hardware. Default enables 1 out of 4 shader engines.
+  - SIMD_MASK=0xF // GFX9: bitmask of SIMDs. Navi: SIMD Index [0-3].
+  - DISPATCH=ID,RN // collect trace only for the given dispatch_ID and MPI rank RN. RN is optional and ignored for single processes. Multiple lines with varying combinations of RN and ID can be added.
+  - KERNEL=kernname // Profile only kernels containing the string kernname (c++ mangled name). Multiple lines can be added.
+  - PERFCOUNTERS_COL_PERIOD=0x3 // Multiplier period for counter collection [0~31]. 0=fastest (usually once every 16 cycles). GFX9 only. Counters will be shown in a graph over time in the browser UI.
+  - PERFCOUNTER=counter_name // Add a SQ counter to be collected with ATT; period defined by PERFCOUNTERS_COL_PERIOD. GFX9 only.
+  - BUFFER_SIZE=[size] // Sets size of the ATT buffer collection, per dispatch, in megabytes (shared among all shader engines).
 
 ## Tests
 
@@ -476,6 +571,12 @@ rocprofiler-tests-9.0.0-local.x86_64.rpm
 rocprofv2 -t
 ```
 
+OR
+
+```bash
+ctest
+```
+
 ### Guidelines for adding new tests
 
 - Prefer to enhance an existing test as opposed to writing a new one. Tests have overhead to start and many small tests spend precious test time on startup and initialization issues.
@@ -561,7 +662,7 @@ samples can be run as independent executables once installed
 
 ## Support
 
-Please report in the Github Issues
+Please report issues via GitHub Issues.
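Following up on the `ctest` alternative shown under Tests above: once the build tree is configured with `enable_testing()`/`include(CTest)` (as this change now does), the suite can also be run selectively. A small sketch; the name pattern is illustrative, while the `v1` label is the one this PR's navi32 CI job excludes with `-LE v1`:

```bash
cd build
# Run only the tests whose names match a regex, with verbose output
ctest -R rocprofv2 -VV
# Exclude tests carrying the v1 label, as the CI does on some runners
ctest -LE v1
```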
## Limitations diff --git a/bin/att_to_out.py b/bin/att_to_out.py index 6be60a50..84b45759 100755 --- a/bin/att_to_out.py +++ b/bin/att_to_out.py @@ -22,19 +22,23 @@ import numpy as np import sys -BYTE_MAP = [str(k) for k in range(10)] + ['a', 'b', 'c', 'd', 'e', 'f'] + +BYTE_MAP = [str(k) for k in range(10)] + ["a", "b", "c", "d", "e", "f"] + def map8(c): - return BYTE_MAP[(c//16)%16]+BYTE_MAP[c%16] + return BYTE_MAP[(c // 16) % 16] + BYTE_MAP[c % 16] + def map16(c): - return map8(c>>8)+map8(c) + return map8(c >> 8) + map8(c) + in_filename = sys.argv[1] -out_filename = in_filename.split('.att')[0]+'.out' +out_filename = in_filename.split(".att")[0] + ".out" in_bytes = np.fromfile(in_filename, dtype=np.uint16) -out_bytes = [map16(c)+'\n' for c in in_bytes] +out_bytes = [map16(c) + "\n" for c in in_bytes] -with open(out_filename, 'w') as f: +with open(out_filename, "w") as f: [f.write(b) for b in out_bytes] diff --git a/bin/dform.py b/bin/dform.py index a417b7ad..97dbc621 100644 --- a/bin/dform.py +++ b/bin/dform.py @@ -23,51 +23,85 @@ import os from sqlitedb import SQLiteDB + def gen_message(outfile): - if outfile != '': - print("File '" + outfile + "' is generating") - -def post_process_data(db, table_name, outfile = ''): -# db.add_data_column('A', 'DispDurNs', 'INTEGER', 'BeginNs - DispatchNs') -# db.add_data_column('A', 'ComplDurNs', 'INTEGER', 'CompleteNs - EndNs') -# db.add_data_column('A', 'TotalDurNs', 'INTEGER', 'CompleteNs - DispatchNs') -# db.add_data_column(table_name, 'TimeNs', 'INTEGER', 'BeginNs - %d' % start_ns) - db.add_data_column(table_name, 'DurationNs', 'INTEGER', 'EndNs - BeginNs') - if outfile != '': db.dump_csv(table_name, outfile) - gen_message(outfile) + if outfile != "": + print("File '" + outfile + "' is generating") + + +def post_process_data(db, table_name, outfile=""): + # db.add_data_column('A', 'DispDurNs', 'INTEGER', 'BeginNs - DispatchNs') + # db.add_data_column('A', 'ComplDurNs', 'INTEGER', 'CompleteNs - EndNs') + # db.add_data_column('A', 'TotalDurNs', 'INTEGER', 'CompleteNs - DispatchNs') + # db.add_data_column(table_name, 'TimeNs', 'INTEGER', 'BeginNs - %d' % start_ns) + db.add_data_column(table_name, "DurationNs", "INTEGER", "EndNs - BeginNs") + if outfile != "": + db.dump_csv(table_name, outfile) + gen_message(outfile) + def gen_data_bins(db, outfile): - db.execute('create view C as select Name, Calls, TotalDurationNs, TotalDurationNs/Calls as AverageNs, TotalDurationNs*100.0/(select sum(TotalDurationNs) from %s) as Percentage from %s order by TotalDurationNs desc;' % ('B', 'B')); - db.dump_csv('C', outfile) - db.execute('DROP VIEW C') + db.execute( + "create view C as select Name, Calls, TotalDurationNs, TotalDurationNs/Calls as AverageNs, TotalDurationNs*100.0/(select sum(TotalDurationNs) from %s) as Percentage from %s order by TotalDurationNs desc;" + % ("B", "B") + ) + db.dump_csv("C", outfile) + db.execute("DROP VIEW C") + def gen_table_bins(db, table, outfile, name_var, dur_ns_var): - db.execute('create view B as select (%s) as Name, count(%s) as Calls, sum(%s) as TotalDurationNs from %s group by %s' % (name_var, name_var, dur_ns_var, table, name_var)) - gen_data_bins(db, outfile) - db.execute('DROP VIEW B') - gen_message(outfile) + db.execute( + "create view B as select (%s) as Name, count(%s) as Calls, sum(%s) as TotalDurationNs from %s group by %s" + % (name_var, name_var, dur_ns_var, table, name_var) + ) + gen_data_bins(db, outfile) + db.execute("DROP VIEW B") + gen_message(outfile) + def gen_api_json_trace(db, table, start_ns, outfile): - 
db.execute('create view B as select "Index", Name as name, __section as pid, __lane as tid, ((BeginNs - %d)/1000) as ts, (DurationNs/1000) as dur from %s;' % (start_ns, table)); - db.dump_json('B', table, outfile) - db.execute('DROP VIEW B') - gen_message(outfile) + db.execute( + 'create view B as select "Index", Name as name, __section as pid, __lane as tid, ((BeginNs - %d)/1000) as ts, (DurationNs/1000) as dur from %s;' + % (start_ns, table) + ) + db.dump_json("B", table, outfile) + db.execute("DROP VIEW B") + gen_message(outfile) + def gen_ext_json_trace(db, table, start_ns, outfile): - db.execute('create view B as select Name as name, __section as pid, __lane as tid, ((BeginNs - %d)/1000) as ts, ((EndNs - BeginNs)/1000) as dur from %s;' % (start_ns, table)); - db.dump_json('B', table, outfile) - db.execute('DROP VIEW B') - gen_message(outfile) + db.execute( + "create view B as select Name as name, __section as pid, __lane as tid, ((BeginNs - %d)/1000) as ts, ((EndNs - BeginNs)/1000) as dur from %s;" + % (start_ns, table) + ) + db.dump_json("B", table, outfile) + db.execute("DROP VIEW B") + gen_message(outfile) + def gen_ops_json_trace(db, table, base_pid, start_ns, outfile): - db.execute('create view B as select "Index", "%s" as name, ("dev-id" + %d) as pid, __lane as tid, ((BeginNs - %d)/1000) as ts, (DurationNs/1000) as dur from %s;' % ('roctx-range' if 'ROCP_RENAME_KERNEL' in os.environ else 'Name',base_pid, start_ns, table)); - db.dump_json('B', table, outfile) - db.execute('DROP VIEW B') - gen_message(outfile) + db.execute( + 'create view B as select "Index", "%s" as name, ("dev-id" + %d) as pid, __lane as tid, ((BeginNs - %d)/1000) as ts, (DurationNs/1000) as dur from %s;' + % ( + "roctx-range" if "ROCP_RENAME_KERNEL" in os.environ else "Name", + base_pid, + start_ns, + table, + ) + ) + db.dump_json("B", table, outfile) + db.execute("DROP VIEW B") + gen_message(outfile) + def gen_kernel_json_trace(db, table, base_pid, start_ns, outfile): - db.execute('create view B as select "Index", KernelName as name, ("gpu-id" + %d) as pid, tid, ((BeginNs - %d)/1000) as ts, (DurationNs/1000) as dur from %s;' % (base_pid, start_ns, table)); - db.dump_json('B', table, outfile) - db.execute('DROP VIEW B') - gen_message(outfile) + db.execute( + 'create view B as select "Index", KernelName as name, ("gpu-id" + %d) as pid, tid, ((BeginNs - %d)/1000) as ts, (DurationNs/1000) as dur from %s;' + % (base_pid, start_ns, table) + ) + db.dump_json("B", table, outfile) + db.execute("DROP VIEW B") + gen_message(outfile) + + ############################################################################################## diff --git a/bin/mem_manager.py b/bin/mem_manager.py index 4d654ad3..4f51ba50 100755 --- a/bin/mem_manager.py +++ b/bin/mem_manager.py @@ -23,368 +23,464 @@ import sys, os, re from sqlitedb import SQLiteDB -pinned = ['hipMallocHost', 'hipHostMalloc', 'hipHostAlloc'] -ondevice = ['hipMalloc', 'hipMallocPitch', 'hipMallocArray', 'hipMalloc3DArray', 'hsa_amd_memory_pool_allocate'] +pinned = ["hipMallocHost", "hipHostMalloc", "hipHostAlloc"] +ondevice = [ + "hipMalloc", + "hipMallocPitch", + "hipMallocArray", + "hipMalloc3DArray", + "hsa_amd_memory_pool_allocate", +] mm_table_descr = [ - ['BeginNs', 'EndNs', 'pid', 'tid', 'Name', 'Direction', 'SrcType', 'DstType', 'Size', 'Async'], - {'BeginNs':'INTEGER', 'EndNs':'INTEGER', 'pid':'INTEGER', 'tid':'INTEGER', 'Name':'TEXT', 'Direction':'TEXT', 'SrcType':'TEXT', 'DstType':'TEXT', 'Size':'INTEGER', 'Async':'TEXT'} + [ + "BeginNs", + "EndNs", + "pid", + 
"tid", + "Name", + "Direction", + "SrcType", + "DstType", + "Size", + "Async", + ], + { + "BeginNs": "INTEGER", + "EndNs": "INTEGER", + "pid": "INTEGER", + "tid": "INTEGER", + "Name": "TEXT", + "Direction": "TEXT", + "SrcType": "TEXT", + "DstType": "TEXT", + "Size": "INTEGER", + "Async": "TEXT", + }, ] + def fatal(msg): - sys.stderr.write(sys.argv[0] + ": " + msg + "\n"); - sys.exit(1) + sys.stderr.write(sys.argv[0] + ": " + msg + "\n") + sys.exit(1) + + +DELIM = "," -DELIM = ',' # Mem copy manager class class MemManager: - - def __init__(self, db, indir): - self.db = db - self.allocations = {} - self.hsa_agent_types = {} - self.memcopies = {} - self.filename = '' - self.fd = 0 - self.parse_hsa_handles(indir + '/' + 'hsa_handles.txt'); - - def __del__(self): - if self.fd != 0: self.fd.close() - - # Parsing the mapping of HSA agent and memory pool handles - def parse_hsa_handles(self, infile): - if os.path.exists(infile): - inp = open(infile, 'r') - cpu_agent_ptrn = re.compile(r'(0x[0-9a-fA-F]+) agent cpu') - gpu_agent_ptrn = re.compile(r'(0x[0-9a-fA-F]+) agent gpu') - for line in inp.readlines(): - m_cpu = cpu_agent_ptrn.match(line) - if m_cpu: - self.hsa_agent_types[str(int(m_cpu.group(1),16))] = 0 # "cpu" - m_gpu = gpu_agent_ptrn.match(line) - if m_gpu: - self.hsa_agent_types[str(int(m_gpu.group(1),16))] = 1 # "gpu" - inp.close() - - # register alloc and memcpy API calls - # ['BeginNs', 'EndNs', 'pid', 'tid', 'Name', 'args', 'Index', 'Data'], - def register_api(self, rec_vals): - res = '' - record_name = rec_vals[4] # 'Name' - record_args = rec_vals[5] # 'args' - malloc_ptrn = re.compile(r'hip.*Malloc|hsa_amd_memory_pool_allocate') - mcopy_ptrn = re.compile(r'hipMemcpy|hsa_amd_memory_async_copy') - - if malloc_ptrn.match(record_name): - self.add_allocation(record_name, record_args) - elif mcopy_ptrn.match(record_name): - res = self.add_memcpy(rec_vals) - - return res - - - # register memcpy asynchronous copy - # ['BeginNs', 'EndNs', 'Name', 'pid', 'tid', 'Index', ... - def register_copy(self, rec_vals): - data = '' - event = rec_vals[2] # 'Name' - procid = rec_vals[3] # 'pid' - recordid = rec_vals[5] # 'Index' - size_ptrn = re.compile(DELIM + 'Size=(\d+)' + DELIM) - # query syncronous memcopy API record - key = (recordid, procid, 0) - if key in self.memcopies: - data = self.memcopies[key] - - # query asyncronous memcopy API record - key = (recordid, procid, 1) - if key in self.memcopies: - if data != '': fatal('register_copy: corrupted record sync/async') - async_copy_start_time = rec_vals[0] - async_copy_end_time = rec_vals[1] - - tid = rec_vals[4] - copy_line_header = str(async_copy_start_time) + DELIM + str(async_copy_end_time) + DELIM + str(procid) + DELIM + str(tid) - copy_line_footer = 'Async=' + str(1) - data = copy_line_header + self.memcopies[key] + copy_line_footer - self.memcopies[key] = data - - return data - - # register memcpy asynchronous activity - # rec_vals: ['BeginNs', 'EndNs', 'dev-id', 'queue-id', 'Name', 'pid', 'tid', 'Index', 'Data', ... 
- def register_activity(self, rec_vals): - data = '' - procid = rec_vals[5] # 'pid' - recordid = rec_vals[7] # 'Index' - - # query syncronous memcopy API record - key = (recordid, procid, 0) - if key in self.memcopies: - data = self.memcopies[key] - - # query asyncronous memcopy API record - key = (recordid, procid, 1) - if key in self.memcopies: - if data != '': fatal('register_activity: corrupted record sync/async') - - async_copy_start_time = rec_vals[0] - async_copy_end_time = rec_vals[1] - - tid = rec_vals[6] - copy_line_header = str(async_copy_start_time) + DELIM + str(async_copy_end_time) + DELIM + str(procid) + DELIM + str(tid) - copy_line_footer = 'Async=' + str(1) - data = copy_line_header + self.memcopies[key] + copy_line_footer - self.memcopies[key] = data - - return data - - # add allocation to map - def add_allocation(self, event, args): - choice = 0 - if event == "hipMallocPitch": - malloc_args_ptrn = re.compile(r'\(ptr\((.*)\) width\((.*)\) height\((.*)\)\)') - choice = 1 - elif event == "hipMallocArray": - malloc_args_ptrn = re.compile(r'\(array\((.*)\) width\((.*)\) height\((.*)\)\)') - choice = 1 - elif event == "hipMalloc3DArray": - malloc_args_ptrn = re.compile(r'\(array\((.*)\) width\((.*)\) height\((.*)\) depth\((.*)\)\)') - choice = 2 - elif event == "hsa_amd_memory_pool_allocate": - #({handle=25291264}, 40, 0, 0x7ffc4c7bf1b0) - malloc_args_ptrn = re.compile(r'\({handle=\d+}, (\d+), \d+, (0x[0-9a-fA-F]+)\)') - choice = 4 - else: - #(ptr(0x7f3407000000) size(800000000) flags(0)) - malloc_args_ptrn = re.compile(r'\(ptr\((.*)\) size\((.*)\) .*\)') - choice = 3 - m = malloc_args_ptrn.match(args) - if m: - if choice == 4: - ptr = int(m.group(2), 16) - size = int(m.group(1)) - elif choice == 3: - ptr = int(m.group(1), 16) - size = int(m.group(2)) - elif choice == 1: - ptr = int(m.group(1), 16) - size = int(m.group(2)) * int(m.group(3)) - else: - ptr = int(m.group(1), 16) - size = int(m.group(2)) * int(m.group(3)) * int(m.group(4)) - self.allocations[ptr] = (size, event) - - #get type of ptr - def get_ptr_type(self, ptr): - addr = int(ptr, 16) - addr_type = 'unknown' - found = 0 - for base, (size, event) in self.allocations.items(): - if addr >= base and addr < base + size: - found = 1 - break - if not found: - addr_type = 'pageable' - elif event in pinned: - addr_type = 'pinned' - elif event in ondevice: - addr_type = 'device' - elif ptr in self.hsa_agent_types: - if self.hsa_agent_types[ptr] == 0: - addr_type = 'pinned' - elif self.hsa_agent_types[ptr] == 1: - addr_type = 'device' - else: - fatal('internal error: ptr(' + ptr + ') cannot be identified') - else: - fatal('internal error: ptr(' + ptr + ') cannot be identified') - return addr_type - - # add memcpy to map - def add_memcpy(self, recvals): - recordid = recvals[6] #same as corrid - event = recvals[4] - start_time = recvals[0] # sync time stamp - end_time = recvals[1] # sync time stamp - args = recvals[5] - procid = int(recvals[2]) # used to query async entries - pid = recvals[2] - tid = recvals[3] - - # hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind) - hip_memcpy_ptrn = re.compile(r'\(\s*dst\((.*)\) src\((.*)\) sizeBytes\((\d+)\).*\)') - # hipMemcpy2D(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width, - # size_t height, hipMemcpyKind kind); - hip_memcpy_ptrn2 = re.compile(r'\(\s*dst\((.*)\) .* src\((.*)\) .* width\((\d+)\) height\((\d+)\).*\)') - # hipMemcpyToArray(hipArray* dst, size_t wOffset, size_t hOffset, const void* src, - # size_t count, hipMemcpyKind 
kind); - hip_memcpy_ptrn3 = re.compile(r'\(\s*dst\((.*)\) .* src\((.*)\) count\((\d+)\).*\)') - # hipMemcpyToSymbol(const void* symbolName, const void* src, size_t sizeBytes, - # size_t offset = 0, hipMemcpyKind kind) - hip_memcpy_ptrn4 = re.compile(r'\(\s*symbol\((.*)\) src\((.*)\) sizeBytes\((\d+)\).*\)') - # memcopy with kind argument - hip_memcpy_ptrn_kind = re.compile(r'.* kind\((\d+)\)\s*.*') - #hsa_amd_memory_async_copy(void* dst, hsa_agent_t dst_agent, const void* src, - # hsa_agent_t src_agent, size_t size, - # uint32_t num_dep_signals, - # const hsa_signal_t* dep_signals, - # hsa_signal_t completion_signal); - # "(0x7f8ab6600000, 27064880, 0x7f8b16000000, 27059968, 800000000, 0, 0, 140240759809536) = 0" - # hsa_memcpy_ptrn_prev used to support format transition and will be cleaned up later. - hsa_memcpy_ptrn_prev = re.compile(r'\((0x[0-9a-fA-F]+), (\d+), (0x[0-9a-fA-F]+), (\d+), (\d+), .*\) = \d') - # "(0x7fd83bc00000, {handle=16124864}, 0x7fd89b600000, {handle=16119808}, 800000000, 0, 0, {handle=140573877724672}) = 0" - hsa_memcpy_ptrn = re.compile(r'\((0x[0-9a-fA-F]+), {handle=(\d+)}, (0x[0-9a-fA-F]+), {handle=(\d+)}, (\d+), .*\) = \d') - # "(0x7f9125cfe7b0, 0x7f9125cfe784, 0x7f9125cfe790, 0x7f9125cfe784, 0x7f9125cfe778, {handle=94324038652880}, 1, 0, 0, {handle=140261380710784}) = 0" - # dst, dst_offset, src, src_offset, range, copy_agent, dir, num_dep_signals, dep_signals, completion_signal - hsa_memcpy_ptrn2 = re.compile(r'\((0x[0-9a-fA-F]+), 0x[0-9a-fA-F]+, (0x[0-9a-fA-F]+), 0x[0-9a-fA-F]+, 0x[0-9a-fA-F]+, {z=(\d+), y=(\d+), x=(\d+)}, {handle=(\d+)}, .*\) = \d') - # aysnc memcopy - async_event_ptrn = re.compile(r'Async|async') - m_basic_hip = hip_memcpy_ptrn.match(args) - m_basic_hsa3 = hip_memcpy_ptrn4.match(args) - m_basic_hsa_prev = hsa_memcpy_ptrn_prev.match(args) - m_basic_hsa = hsa_memcpy_ptrn.match(args) - m_basic_hsa2 = hsa_memcpy_ptrn2.match(args) - is_hip = True if not (m_basic_hsa_prev or m_basic_hsa or m_basic_hsa2) else False - m_2d = hip_memcpy_ptrn2.match(args) - m_array = hip_memcpy_ptrn3.match(args) - is_async = 1 if async_event_ptrn.search(event) else 0 - copy_line = '' - size = 0 - dstptr_type = 'unknown' - srcptr_type = 'unknown' - direction = 'unknown' - kind_switcher = { - '0': "HtoH", - '1': "HtoD", - '2': "DtoH", - '3': "DtoD", - '4': "auto", - } - - condition_matched = False - if m_basic_hip: - dstptr = m_basic_hip.group(1) - dstptr_type = self.get_ptr_type(dstptr) - srcptr = m_basic_hip.group(2) - srcptr_type = self.get_ptr_type(srcptr) - size = int(m_basic_hip.group(3)) - condition_matched = True - - if m_basic_hsa_prev: - dstptr = m_basic_hsa_prev.group(1) - dst_agent_ptr = m_basic_hsa_prev.group(2) - dstptr_type = self.get_ptr_type(dst_agent_ptr) - srcptr = m_basic_hsa_prev.group(3) - src_agent_ptr = m_basic_hsa_prev.group(4) - srcptr_type = self.get_ptr_type(src_agent_ptr) - size = int(m_basic_hsa_prev.group(5)) - condition_matched = True - - if m_basic_hsa: - dstptr = m_basic_hsa.group(1) - dst_agent_ptr = m_basic_hsa.group(2) - dstptr_type = self.get_ptr_type(dst_agent_ptr) - srcptr = m_basic_hsa.group(3) - src_agent_ptr = m_basic_hsa.group(4) - srcptr_type = self.get_ptr_type(src_agent_ptr) - size = int(m_basic_hsa.group(5)) - condition_matched = True - - if m_basic_hsa2: - dstptr = m_basic_hsa2.group(1) - dst_agent_ptr = m_basic_hsa2.group(6) - dstptr_type = self.get_ptr_type(dst_agent_ptr) - srcptr = m_basic_hsa2.group(2) - src_agent_ptr = m_basic_hsa2.group(6) - srcptr_type = self.get_ptr_type(src_agent_ptr) - z = 
int(m_basic_hsa2.group(3)) - y = int(m_basic_hsa2.group(4)) - x = int(m_basic_hsa2.group(5)) - size = x*y*z - condition_matched = True - - if m_basic_hsa3: - dstptr = m_basic_hsa3.group(1) - dstptr_type = self.get_ptr_type(dstptr) - srcptr = m_basic_hsa3.group(2) - srcptr_type = self.get_ptr_type(srcptr) - size = int(m_basic_hsa3.group(3)) - condition_matched = True - - if m_array: - dstptr = m_array.group(1) - dstptr_type = self.get_ptr_type(dstptr) - srcptr = m_array.group(2) - srcptr_type = self.get_ptr_type(srcptr) - size = m_array.group(3) - condition_matched = True - - if m_2d: - dstptr = m_2d.group(1) - dstptr_type = self.get_ptr_type(dstptr) - srcptr = m_2d.group(2) - srcptr_type = self.get_ptr_type(srcptr) - size = int(m_2d.group(3))*int(m_2d.group(4)) - condition_matched = True - - if not condition_matched: fatal('Memcpy args \"' + args + '\" cannot be identified') - - if not is_async: - start_time = recvals[0] # sync time stamp - end_time = recvals[1] # sync time stamp - duration = (int(end_time) - int(start_time)) - - - evt_switcher = { - 'hipMemcpyDtoD': "DtoD", - 'hipMemcpyDtoDAsync': "DtoD", - 'hipMemcpyDtoH': "DtoH", - 'hipMemcpyDtoHAsync': "DtoH", - 'hipMemcpyHtoD': "HtoD", - 'hipMemcpyHtoDAsync': "HtoD", - } - - if is_hip: - m = hip_memcpy_ptrn_kind.match(args) - if m: - direction = kind_switcher.get(m.group(1), "unknown") - else: - direction = evt_switcher.get(event, "unknown") - else: - if dst_agent_ptr in self.hsa_agent_types and src_agent_ptr in self.hsa_agent_types: - if self.hsa_agent_types[src_agent_ptr] == 1: direction = 'D' - elif self.hsa_agent_types[src_agent_ptr] == 0: direction = 'H' - if direction != 'unknown': direction += 'to' - if self.hsa_agent_types[dst_agent_ptr] == 1: direction += 'D' - elif self.hsa_agent_types[dst_agent_ptr] == 0: direction += 'H' - - copy_line_header = '' - copy_line_footer = '' - copy_line_header = str(start_time) + DELIM + str(end_time) + DELIM + str(pid) + DELIM + str(tid) - copy_line_footer = 'Async=' + str(is_async) - - copy_line = copy_line_header + DELIM + event + DELIM + 'Direction=' + direction + DELIM + 'SrcType=' + srcptr_type + DELIM + 'DstType=' + dstptr_type + DELIM + "Size=" + str(size) + DELIM + copy_line_footer - - self.memcopies[(recordid, procid, is_async)] = copy_line - return copy_line; - - def dump_data(self, table_name, file_name): - # To create memcopy info table in DB - print("File '" + file_name + "' is generating") - table_handle = self.db.add_table(table_name, mm_table_descr) - - fld_ptrn = re.compile(r'(.*)=(.*)') - for (key, record) in self.memcopies.items(): - rec_vals_array = [] - for rec in record.split(DELIM): - fld_ptrnm = fld_ptrn.match(rec) - if fld_ptrnm: - rec_vals_array.append(fld_ptrnm.group(2)) + def __init__(self, db, indir): + self.db = db + self.allocations = {} + self.hsa_agent_types = {} + self.memcopies = {} + self.filename = "" + self.fd = 0 + self.parse_hsa_handles(indir + "/" + "hsa_handles.txt") + + def __del__(self): + if self.fd != 0: + self.fd.close() + + # Parsing the mapping of HSA agent and memory pool handles + def parse_hsa_handles(self, infile): + if os.path.exists(infile): + inp = open(infile, "r") + cpu_agent_ptrn = re.compile(r"(0x[0-9a-fA-F]+) agent cpu") + gpu_agent_ptrn = re.compile(r"(0x[0-9a-fA-F]+) agent gpu") + for line in inp.readlines(): + m_cpu = cpu_agent_ptrn.match(line) + if m_cpu: + self.hsa_agent_types[str(int(m_cpu.group(1), 16))] = 0 # "cpu" + m_gpu = gpu_agent_ptrn.match(line) + if m_gpu: + self.hsa_agent_types[str(int(m_gpu.group(1), 16))] = 1 # 
"gpu" + inp.close() + + # register alloc and memcpy API calls + # ['BeginNs', 'EndNs', 'pid', 'tid', 'Name', 'args', 'Index', 'Data'], + def register_api(self, rec_vals): + res = "" + record_name = rec_vals[4] # 'Name' + record_args = rec_vals[5] # 'args' + malloc_ptrn = re.compile(r"hip.*Malloc|hsa_amd_memory_pool_allocate") + mcopy_ptrn = re.compile(r"hipMemcpy|hsa_amd_memory_async_copy") + + if malloc_ptrn.match(record_name): + self.add_allocation(record_name, record_args) + elif mcopy_ptrn.match(record_name): + res = self.add_memcpy(rec_vals) + + return res + + # register memcpy asynchronous copy + # ['BeginNs', 'EndNs', 'Name', 'pid', 'tid', 'Index', ... + def register_copy(self, rec_vals): + data = "" + event = rec_vals[2] # 'Name' + procid = rec_vals[3] # 'pid' + recordid = rec_vals[5] # 'Index' + size_ptrn = re.compile(DELIM + "Size=(\d+)" + DELIM) + # query syncronous memcopy API record + key = (recordid, procid, 0) + if key in self.memcopies: + data = self.memcopies[key] + + # query asyncronous memcopy API record + key = (recordid, procid, 1) + if key in self.memcopies: + if data != "": + fatal("register_copy: corrupted record sync/async") + async_copy_start_time = rec_vals[0] + async_copy_end_time = rec_vals[1] + + tid = rec_vals[4] + copy_line_header = ( + str(async_copy_start_time) + + DELIM + + str(async_copy_end_time) + + DELIM + + str(procid) + + DELIM + + str(tid) + ) + copy_line_footer = "Async=" + str(1) + data = copy_line_header + self.memcopies[key] + copy_line_footer + self.memcopies[key] = data + + return data + + # register memcpy asynchronous activity + # rec_vals: ['BeginNs', 'EndNs', 'dev-id', 'queue-id', 'Name', 'pid', 'tid', 'Index', 'Data', ... + def register_activity(self, rec_vals): + data = "" + procid = rec_vals[5] # 'pid' + recordid = rec_vals[7] # 'Index' + + # query syncronous memcopy API record + key = (recordid, procid, 0) + if key in self.memcopies: + data = self.memcopies[key] + + # query asyncronous memcopy API record + key = (recordid, procid, 1) + if key in self.memcopies: + if data != "": + fatal("register_activity: corrupted record sync/async") + + async_copy_start_time = rec_vals[0] + async_copy_end_time = rec_vals[1] + + tid = rec_vals[6] + copy_line_header = ( + str(async_copy_start_time) + + DELIM + + str(async_copy_end_time) + + DELIM + + str(procid) + + DELIM + + str(tid) + ) + copy_line_footer = "Async=" + str(1) + data = copy_line_header + self.memcopies[key] + copy_line_footer + self.memcopies[key] = data + + return data + + # add allocation to map + def add_allocation(self, event, args): + choice = 0 + if event == "hipMallocPitch": + malloc_args_ptrn = re.compile(r"\(ptr\((.*)\) width\((.*)\) height\((.*)\)\)") + choice = 1 + elif event == "hipMallocArray": + malloc_args_ptrn = re.compile( + r"\(array\((.*)\) width\((.*)\) height\((.*)\)\)" + ) + choice = 1 + elif event == "hipMalloc3DArray": + malloc_args_ptrn = re.compile( + r"\(array\((.*)\) width\((.*)\) height\((.*)\) depth\((.*)\)\)" + ) + choice = 2 + elif event == "hsa_amd_memory_pool_allocate": + # ({handle=25291264}, 40, 0, 0x7ffc4c7bf1b0) + malloc_args_ptrn = re.compile( + r"\({handle=\d+}, (\d+), \d+, (0x[0-9a-fA-F]+)\)" + ) + choice = 4 else: - rec_vals_array.append(rec) - self.db.insert_entry(table_handle, rec_vals_array) - - # To dump the memcopy info table as CSV - self.db.dump_csv(table_name, file_name) + # (ptr(0x7f3407000000) size(800000000) flags(0)) + malloc_args_ptrn = re.compile(r"\(ptr\((.*)\) size\((.*)\) .*\)") + choice = 3 + m = malloc_args_ptrn.match(args) 
+ if m: + if choice == 4: + ptr = int(m.group(2), 16) + size = int(m.group(1)) + elif choice == 3: + ptr = int(m.group(1), 16) + size = int(m.group(2)) + elif choice == 1: + ptr = int(m.group(1), 16) + size = int(m.group(2)) * int(m.group(3)) + else: + ptr = int(m.group(1), 16) + size = int(m.group(2)) * int(m.group(3)) * int(m.group(4)) + self.allocations[ptr] = (size, event) + + # get type of ptr + def get_ptr_type(self, ptr): + addr = int(ptr, 16) + addr_type = "unknown" + found = 0 + for base, (size, event) in self.allocations.items(): + if addr >= base and addr < base + size: + found = 1 + break + if not found: + addr_type = "pageable" + elif event in pinned: + addr_type = "pinned" + elif event in ondevice: + addr_type = "device" + elif ptr in self.hsa_agent_types: + if self.hsa_agent_types[ptr] == 0: + addr_type = "pinned" + elif self.hsa_agent_types[ptr] == 1: + addr_type = "device" + else: + fatal("internal error: ptr(" + ptr + ") cannot be identified") + else: + fatal("internal error: ptr(" + ptr + ") cannot be identified") + return addr_type + + # add memcpy to map + def add_memcpy(self, recvals): + recordid = recvals[6] # same as corrid + event = recvals[4] + start_time = recvals[0] # sync time stamp + end_time = recvals[1] # sync time stamp + args = recvals[5] + procid = int(recvals[2]) # used to query async entries + pid = recvals[2] + tid = recvals[3] + + # hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind) + hip_memcpy_ptrn = re.compile( + r"\(\s*dst\((.*)\) src\((.*)\) sizeBytes\((\d+)\).*\)" + ) + # hipMemcpy2D(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width, + # size_t height, hipMemcpyKind kind); + hip_memcpy_ptrn2 = re.compile( + r"\(\s*dst\((.*)\) .* src\((.*)\) .* width\((\d+)\) height\((\d+)\).*\)" + ) + # hipMemcpyToArray(hipArray* dst, size_t wOffset, size_t hOffset, const void* src, + # size_t count, hipMemcpyKind kind); + hip_memcpy_ptrn3 = re.compile( + r"\(\s*dst\((.*)\) .* src\((.*)\) count\((\d+)\).*\)" + ) + # hipMemcpyToSymbol(const void* symbolName, const void* src, size_t sizeBytes, + # size_t offset = 0, hipMemcpyKind kind) + hip_memcpy_ptrn4 = re.compile( + r"\(\s*symbol\((.*)\) src\((.*)\) sizeBytes\((\d+)\).*\)" + ) + # memcopy with kind argument + hip_memcpy_ptrn_kind = re.compile(r".* kind\((\d+)\)\s*.*") + # hsa_amd_memory_async_copy(void* dst, hsa_agent_t dst_agent, const void* src, + # hsa_agent_t src_agent, size_t size, + # uint32_t num_dep_signals, + # const hsa_signal_t* dep_signals, + # hsa_signal_t completion_signal); + # "(0x7f8ab6600000, 27064880, 0x7f8b16000000, 27059968, 800000000, 0, 0, 140240759809536) = 0" + # hsa_memcpy_ptrn_prev used to support format transition and will be cleaned up later. 
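+        # (the older trace format printed agent handles as plain integers,
+        # while the current one wraps them as {handle=N}; both formats are
+        # matched below during the transition)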
+        hsa_memcpy_ptrn_prev = re.compile(
+            r"\((0x[0-9a-fA-F]+), (\d+), (0x[0-9a-fA-F]+), (\d+), (\d+), .*\) = \d"
+        )
+        # "(0x7fd83bc00000, {handle=16124864}, 0x7fd89b600000, {handle=16119808}, 800000000, 0, 0, {handle=140573877724672}) = 0"
+        hsa_memcpy_ptrn = re.compile(
+            r"\((0x[0-9a-fA-F]+), {handle=(\d+)}, (0x[0-9a-fA-F]+), {handle=(\d+)}, (\d+), .*\) = \d"
+        )
+        # "(0x7f9125cfe7b0, 0x7f9125cfe784, 0x7f9125cfe790, 0x7f9125cfe784, 0x7f9125cfe778, {handle=94324038652880}, 1, 0, 0, {handle=140261380710784}) = 0"
+        # dst, dst_offset, src, src_offset, range, copy_agent, dir, num_dep_signals, dep_signals, completion_signal
+        hsa_memcpy_ptrn2 = re.compile(
+            r"\((0x[0-9a-fA-F]+), 0x[0-9a-fA-F]+, (0x[0-9a-fA-F]+), 0x[0-9a-fA-F]+, 0x[0-9a-fA-F]+, {z=(\d+), y=(\d+), x=(\d+)}, {handle=(\d+)}, .*\) = \d"
+        )
+        # async memcopy
+        async_event_ptrn = re.compile(r"Async|async")
+        m_basic_hip = hip_memcpy_ptrn.match(args)
+        m_basic_hsa3 = hip_memcpy_ptrn4.match(args)
+        m_basic_hsa_prev = hsa_memcpy_ptrn_prev.match(args)
+        m_basic_hsa = hsa_memcpy_ptrn.match(args)
+        m_basic_hsa2 = hsa_memcpy_ptrn2.match(args)
+        is_hip = True if not (m_basic_hsa_prev or m_basic_hsa or m_basic_hsa2) else False
+        m_2d = hip_memcpy_ptrn2.match(args)
+        m_array = hip_memcpy_ptrn3.match(args)
+        is_async = 1 if async_event_ptrn.search(event) else 0
+        copy_line = ""
+        size = 0
+        dstptr_type = "unknown"
+        srcptr_type = "unknown"
+        direction = "unknown"
+        kind_switcher = {
+            "0": "HtoH",
+            "1": "HtoD",
+            "2": "DtoH",
+            "3": "DtoD",
+            "4": "auto",
+        }
+
+        condition_matched = False
+        if m_basic_hip:
+            dstptr = m_basic_hip.group(1)
+            dstptr_type = self.get_ptr_type(dstptr)
+            srcptr = m_basic_hip.group(2)
+            srcptr_type = self.get_ptr_type(srcptr)
+            size = int(m_basic_hip.group(3))
+            condition_matched = True
+
+        if m_basic_hsa_prev:
+            dstptr = m_basic_hsa_prev.group(1)
+            dst_agent_ptr = m_basic_hsa_prev.group(2)
+            dstptr_type = self.get_ptr_type(dst_agent_ptr)
+            srcptr = m_basic_hsa_prev.group(3)
+            src_agent_ptr = m_basic_hsa_prev.group(4)
+            srcptr_type = self.get_ptr_type(src_agent_ptr)
+            size = int(m_basic_hsa_prev.group(5))
+            condition_matched = True
+
+        if m_basic_hsa:
+            dstptr = m_basic_hsa.group(1)
+            dst_agent_ptr = m_basic_hsa.group(2)
+            dstptr_type = self.get_ptr_type(dst_agent_ptr)
+            srcptr = m_basic_hsa.group(3)
+            src_agent_ptr = m_basic_hsa.group(4)
+            srcptr_type = self.get_ptr_type(src_agent_ptr)
+            size = int(m_basic_hsa.group(5))
+            condition_matched = True
+
+        if m_basic_hsa2:
+            dstptr = m_basic_hsa2.group(1)
+            dst_agent_ptr = m_basic_hsa2.group(6)
+            dstptr_type = self.get_ptr_type(dst_agent_ptr)
+            srcptr = m_basic_hsa2.group(2)
+            src_agent_ptr = m_basic_hsa2.group(6)
+            srcptr_type = self.get_ptr_type(src_agent_ptr)
+            z = int(m_basic_hsa2.group(3))
+            y = int(m_basic_hsa2.group(4))
+            x = int(m_basic_hsa2.group(5))
+            size = x * y * z
+            condition_matched = True
+
+        if m_basic_hsa3:
+            dstptr = m_basic_hsa3.group(1)
+            dstptr_type = self.get_ptr_type(dstptr)
+            srcptr = m_basic_hsa3.group(2)
+            srcptr_type = self.get_ptr_type(srcptr)
+            size = int(m_basic_hsa3.group(3))
+            condition_matched = True
+
+        if m_array:
+            dstptr = m_array.group(1)
+            dstptr_type = self.get_ptr_type(dstptr)
+            srcptr = m_array.group(2)
+            srcptr_type = self.get_ptr_type(srcptr)
+            size = m_array.group(3)
+            condition_matched = True
+
+        if m_2d:
+            dstptr = m_2d.group(1)
+            dstptr_type = self.get_ptr_type(dstptr)
+            srcptr = m_2d.group(2)
+            srcptr_type = self.get_ptr_type(srcptr)
+            size = int(m_2d.group(3)) * int(m_2d.group(4))
+            condition_matched = True
+
+        if not
condition_matched: + fatal('Memcpy args "' + args + '" cannot be identified') + + if not is_async: + start_time = recvals[0] # sync time stamp + end_time = recvals[1] # sync time stamp + duration = int(end_time) - int(start_time) + + evt_switcher = { + "hipMemcpyDtoD": "DtoD", + "hipMemcpyDtoDAsync": "DtoD", + "hipMemcpyDtoH": "DtoH", + "hipMemcpyDtoHAsync": "DtoH", + "hipMemcpyHtoD": "HtoD", + "hipMemcpyHtoDAsync": "HtoD", + } + + if is_hip: + m = hip_memcpy_ptrn_kind.match(args) + if m: + direction = kind_switcher.get(m.group(1), "unknown") + else: + direction = evt_switcher.get(event, "unknown") + else: + if ( + dst_agent_ptr in self.hsa_agent_types + and src_agent_ptr in self.hsa_agent_types + ): + if self.hsa_agent_types[src_agent_ptr] == 1: + direction = "D" + elif self.hsa_agent_types[src_agent_ptr] == 0: + direction = "H" + if direction != "unknown": + direction += "to" + if self.hsa_agent_types[dst_agent_ptr] == 1: + direction += "D" + elif self.hsa_agent_types[dst_agent_ptr] == 0: + direction += "H" + + copy_line_header = "" + copy_line_footer = "" + copy_line_header = ( + str(start_time) + DELIM + str(end_time) + DELIM + str(pid) + DELIM + str(tid) + ) + copy_line_footer = "Async=" + str(is_async) + + copy_line = ( + copy_line_header + + DELIM + + event + + DELIM + + "Direction=" + + direction + + DELIM + + "SrcType=" + + srcptr_type + + DELIM + + "DstType=" + + dstptr_type + + DELIM + + "Size=" + + str(size) + + DELIM + + copy_line_footer + ) + + self.memcopies[(recordid, procid, is_async)] = copy_line + return copy_line + + def dump_data(self, table_name, file_name): + # To create memcopy info table in DB + print("File '" + file_name + "' is generating") + table_handle = self.db.add_table(table_name, mm_table_descr) + + fld_ptrn = re.compile(r"(.*)=(.*)") + for key, record in self.memcopies.items(): + rec_vals_array = [] + for rec in record.split(DELIM): + fld_ptrnm = fld_ptrn.match(rec) + if fld_ptrnm: + rec_vals_array.append(fld_ptrnm.group(2)) + else: + rec_vals_array.append(rec) + self.db.insert_entry(table_handle, rec_vals_array) + + # To dump the memcopy info table as CSV + self.db.dump_csv(table_name, file_name) diff --git a/bin/rocprofv2 b/bin/rocprofv2 index cf55435c..a2a86f66 100755 --- a/bin/rocprofv2 +++ b/bin/rocprofv2 @@ -1,8 +1,13 @@ -#!/bin/bash +#!/bin/bash -e + set -eo pipefail -CURRENT_DIR="$(dirname -- "$0")" -ROCPROFV2_DIR=$(dirname -- $(realpath ${BASH_SOURCE[0]})) -ROCM_DIR=$(dirname -- "$ROCPROFV2_DIR") + +# LD_PRELOAD on script will not get propagated +if [ -n "${ROCP_PRELOAD}" ]; then LD_PRELOAD="${ROCP_PRELOAD}"; fi + +CURRENT_DIR="$( dirname -- "$0"; )"; +ROCPROFV2_DIR=$(dirname -- $(realpath ${BASH_SOURCE[0]})); +ROCM_DIR=$( dirname -- "$ROCPROFV2_DIR"; ) PLUGIN_LIST=("ctf" "perfetto" "file" "att") RUN_FROM_BUILD=0 if [[ $ROCPROFV2_DIR == *"/build"* ]]; then diff --git a/bin/rpl_run.sh b/bin/rpl_run.sh index c1a3daa4..0f5b43c3 100755 --- a/bin/rpl_run.sh +++ b/bin/rpl_run.sh @@ -262,6 +262,7 @@ run() { MY_LD_PRELOAD="" if [ "$ROCTX_TRACE" = 1 ] ; then API_TRACE=${API_TRACE}":roctx" + MY_LD_PRELOAD="$TTLIB_PATH/libroctx64.so" fi if [ "$HIP_TRACE" = 1 ] ; then API_TRACE=${API_TRACE}":hip" @@ -273,18 +274,18 @@ run() { if [ "$HSA_TRACE" = 1 ] ; then export ROCTRACER_DOMAIN=$API_TRACE":hsa" MY_HSA_TOOLS_LIB="$MY_HSA_TOOLS_LIB $ROCM_LIB_PATH/libroctracer64.so.4" - MY_LD_PRELOAD="$TTLIB_PATH/libroctracer_tool.so" + MY_LD_PRELOAD="$MY_LD_PRELOAD:$TTLIB_PATH/libroctracer_tool.so" elif [ -n "$API_TRACE" ] ; then export ROCTRACER_DOMAIN=$API_TRACE 
OUTPUT_LIST="$ROCP_OUTPUT_DIR/" MY_HSA_TOOLS_LIB="$ROCM_LIB_PATH/libroctracer64.so.4" - MY_LD_PRELOAD="$TTLIB_PATH/libroctracer_tool.so" + MY_LD_PRELOAD="$MY_LD_PRELOAD:$TTLIB_PATH/libroctracer_tool.so" fi if [ "$ROCP_STATS_OPT" = 1 ] ; then if [ "$ROCTRACER_DOMAIN" = ":hip" ] ; then MY_HSA_TOOLS_LIB="$ROCM_LIB_PATH/libroctracer64.so.4" - MY_LD_PRELOAD="$TTLIB_PATH/libhip_stats.so" + MY_LD_PRELOAD="$MY_LD_PRELOAD:$TTLIB_PATH/libhip_stats.so" else error_message="ROCP_STATS_OPT is only available with --hip-trace option" echo $error_message diff --git a/bin/sqlitedb.py b/bin/sqlitedb.py index d1584e54..7dee934f 100644 --- a/bin/sqlitedb.py +++ b/bin/sqlitedb.py @@ -24,269 +24,326 @@ from functools import reduce from txt2params import gen_params + # SQLite Database class class SQLiteDB: - def __init__(self, file_name): - self.connection = sqlite3.connect(file_name) - self.tables = {} - self.section_index = 0 - - def __del__(self): - self.connection.close() - - # add DB table - def add_table(self, name, descr, extra = ()): - (field_list, field_dict) = descr - if name in self.tables: raise Exception('table is already added: "' + name + '"') - - # create DB table - table_descr = [] - for field in field_list: table_descr.append('"%s" %s' % (field, field_dict[field])) - for item in extra: table_descr.append('"%s" %s' % (item[0], item[1])) - stm = 'CREATE TABLE ' + name + ' (%s)' % ', '.join(table_descr) - cursor = self.connection.cursor() - cursor.execute(stm) - self.connection.commit() - - # register table - fields_str = ','.join(map(lambda x: '"' + x + '"', field_list)) - templ_str = ','.join('?' * len(field_list)) - stm = 'INSERT INTO ' + name + '(' + fields_str + ') VALUES(' + templ_str + ');' - self.tables[name] = stm - - return (cursor, stm); - - # add columns to table - def add_columns(self, name, columns): - cursor = self.connection.cursor() - for item in columns: - stm = 'ALTER TABLE ' + name + ' ADD COLUMN "%s" %s' % (item[0], item[1]) - cursor.execute(stm) - self.connection.commit() - - # add columns with expression - def add_data_column(self, table_name, data_label, data_type, data_expr): - cursor = self.connection.cursor() - cursor.execute('ALTER TABLE %s ADD COLUMN "%s" %s' % (table_name, data_label, data_type)) - cursor.execute('UPDATE %s SET %s = (%s);' % (table_name, data_label, data_expr)) - - def change_rec_name(self, table_name, rec_id, rec_name): - self.connection.execute('UPDATE ' + table_name + ' SET Name = ? WHERE "Index" = ?', (rec_name, rec_id)) - def change_rec_tid(self, table_name, rec_id, tid): - self.connection.execute('UPDATE ' + table_name + ' SET tid = ? 
WHERE "Index" = ?', (tid, rec_id)) - def change_rec_fld(self, table_name, fld_expr, rec_pat): - self.connection.execute('UPDATE ' + table_name + ' SET ' + fld_expr + ' WHERE ' + rec_pat) - def table_get_record(self, table_name, rec_pat): - cursor = self.connection.execute('SELECT * FROM ' + table_name + ' WHERE ' + rec_pat) - raws = cursor.fetchall() - if len(raws) != 1: raise Exception('Record (' + rec_pat + ') is not unique, table "' + table_name + '"') - return list(raws[0]) - - # populate DB table entry - def insert_entry(self, table, val_list): - (cursor, stm) = table - cursor.execute(stm, val_list) - - # populate DB table entry - def commit_entry(self, table, val_list): - self.insert_entry(table, val_list) - self.connection.commit() - - # populate DB table data - def insert_table(self, table, reader): - for val_list in reader: - if not val_list[-1]: val_list.pop() - self.insert_entry(table, val_list) - self.connection.commit() - - # return table fields list - def _get_fields(self, table_name): - cursor = self.connection.execute('SELECT * FROM ' + table_name) - return list(map(lambda x: '"%s"' % (x[0]), cursor.description)) - - # return table raws list - def _get_raws(self, table_name): - cursor = self.connection.execute('SELECT * FROM ' + table_name) - return cursor.fetchall() - def _get_raws_indexed(self, table_name): - cursor = self.connection.execute('SELECT * FROM ' + table_name + ' order by "Index" asc;') - return cursor.fetchall() - def _get_raw_by_id(self, table_name, rec_id): - cursor = self.connection.execute('SELECT * FROM ' + table_name + ' WHERE "Index"=?', (rec_id,)) - raws = cursor.fetchall() - if len(raws) != 1: raise Exception('Index is not unique, table "' + table_name + '"') - return list(raws[0]) - - def table_get_raws(self, table_name): - return self._get_raws(table_name) - - # dump CSV table - def dump_csv(self, table_name, file_name): - if not re.search(r'\.csv$', file_name): - raise Exception('wrong output file type: "' + file_name + '"' ) - - fields = self._get_fields(table_name) - with open(file_name, mode='w') as fd: - fd.write(','.join(fields) + '\n') - for raw in self._get_raws(table_name): - tmp = list(raw) - for idx in range(len(tmp)): - if type(tmp[idx]) == str: - if(not(tmp[idx][0] == tmp[idx][-1] == '"')): tmp[idx] = '"'+tmp[idx]+'"' - raw = tuple(tmp) - fd.write(reduce(lambda a, b: str(a) + ',' + str(b), raw) + '\n') - - # dump JSON trace - def open_json(self, file_name): - if not re.search(r'\.json$', file_name): - raise Exception('wrong output file type: "' + file_name + '"' ) - with open(file_name, mode='w') as fd: - fd.write('{ "traceEvents":[{}\n'); - - def close_json(self, file_name): - if not re.search(r'\.json$', file_name): - raise Exception('wrong output file type: "' + file_name + '"' ) - with open(file_name, mode='a') as fd: - fd.write('}') - - def label_json(self, pid, label, file_name): - if not re.search(r'\.json$', file_name): - raise Exception('wrong output file type: "' + file_name + '"' ) - with open(file_name, mode='a') as fd: - fd.write(',{"args":{"name":"%s"},"ph":"M","pid":%s,"name":"process_name","sort_index":%d}\n' %(label, pid, self.section_index)) - self.section_index += 1 - - def flow_json(self, base_id, from_pid, from_us_list, to_pid, to_us_dict, corr_id_list, file_name): - if not re.search(r'\.json$', file_name): - raise Exception('wrong output file type: "' + file_name + '"' ) - with open(file_name, mode='a') as fd: - dep_id = base_id - for ind in range(len(from_us_list)): - corr_id = corr_id_list[ind] if 
(len(corr_id_list) != 0) else ind - if corr_id in to_us_dict: - (from_ts, stream_id, tid) = from_us_list[ind] - to_ts = to_us_dict[corr_id] - if from_ts > to_ts: from_ts = to_ts - fd.write(',{"ts":%d,"ph":"s","cat":"DataFlow","id":%d,"pid":%d,"tid":%d,"name":"dep"}\n' % (from_ts, dep_id, from_pid, tid)) - fd.write(',{"ts":%d,"ph":"t","cat":"DataFlow","id":%d,"pid":%d,"tid":%d,"name":"dep"}\n' % (to_ts, dep_id, to_pid, stream_id)) - dep_id += 1 - - def metadata_json(self, jsonfile, sysinfo_file): - params = gen_params(sysinfo_file); - with open(jsonfile, mode='a') as fd: - cnt = 0 - fd.write('],\n') - fd.write('"otherData": {\n') - for nkey in sorted(params.keys()): - key = nkey[1] - cnt = cnt + 1 - if cnt == len(params): - fd.write(' "' + key + '": "' + params[nkey] + '"\n') - else: - fd.write(' "' + key + '": "' + params[nkey] + '",\n') - fd.write(' }\n') - - def dump_json(self, table_name, data_name, file_name): - if not re.search(r'\.json$', file_name): - raise Exception('wrong output file type: "' + file_name + '"' ) - - sub_ptrn = re.compile(r'(^"|"$)') - name_ptrn = re.compile(r'(name|Name)') - - table_fields = self._get_fields(table_name) - table_raws = self._get_raws(table_name) - data_fields = self._get_fields(data_name) - data_raws = self._get_raws(data_name) - - with open(file_name, mode='a') as fd: - table_raws_len = len(table_raws) - for raw_index in range(table_raws_len): - if (raw_index == table_raws_len - 1) or (raw_index % 1000 == 0): - sys.stdout.write( \ - "\rdump json " + str(raw_index) + ":" + str(len(table_raws)) + " "*100 \ - ) - - vals_list = [] - values = list(table_raws[raw_index]) - for value_index in range(len(values)): - label = table_fields[value_index] - value = values[value_index] - if name_ptrn.search(label): value = sub_ptrn.sub(r'', value) - if label != '"Index"': - if label == '"dur"' and value == 0: - vals_list.append('%s:"%s"' % (label, "1")) - else: - vals_list.append('%s:"%s"' % (label, value)) - - args_list = [] - data = list(data_raws[raw_index]) - for value_index in range(len(data)): - label = data_fields[value_index] - value = data[value_index] - if label[:3] == '"__': continue - if name_ptrn.search(label): value = sub_ptrn.sub(r'', value) - if label != '"Index"' and label != '"roctx-range"': args_list.append('%s:"%s"' % (label, value)) - - fd.write(',{"ph":"%s",%s,\n "args":{\n %s\n }\n}\n' % ('X', ','.join(vals_list), ',\n '.join(args_list))) - - sys.stdout.write('\n') - - # execute query on DB - def execute(self, cmd): - cursor = self.connection.cursor() - cursor.execute(cmd) - - # commit DB - def commit(self): - self.connection.commit() - - # close DB - def close(self): - self.connection.commit() - self.connection.close() - - # access DB - def get_raws(self, table_name): - cur = self.connection.cursor() - cur.execute("SELECT * FROM %s" % table_name) - return cur.fetchall() - - # return CSV descriptor - # list of fields and dictionaly for the fields types - def _get_csv_descr(self, table_name, fd): - reader = csv.DictReader(fd) - field_names = reader.fieldnames - if not field_names[-1]: field_names.pop() - field_types = {} - - for entry in reader: - fields_left = [f for f in field_names if f not in field_types.keys()] - # all fields processed - if not fields_left: break - - for field in fields_left: - data = entry[field] - # need data for the field to be processed - if len(data) == 0: continue - - if data.isdigit(): - field_types[field] = "INTEGER" - else: - field_types[field] = "TEXT" - - if len(fields_left) > 0: raise Exception('types not 
found for fields: ', fields_left) - return (field_names, field_types) - - # add CSV table - def add_csv_table(self, table_name, file_name, extra = ()): - with open(file_name, mode='r') as fd: - # get CSV table descriptor - descr = self._get_csv_descr(table_name, fd) - # reader to populate the table - fd.seek(0) - reader = csv.reader(fd) - reader.next() - table = self.add_table(table_name, descr, extra) - self.insert_table(table, reader) + def __init__(self, file_name): + self.connection = sqlite3.connect(file_name) + self.tables = {} + self.section_index = 0 + + def __del__(self): + self.connection.close() + + # add DB table + def add_table(self, name, descr, extra=()): + (field_list, field_dict) = descr + if name in self.tables: + raise Exception('table is already added: "' + name + '"') + + # create DB table + table_descr = [] + for field in field_list: + table_descr.append('"%s" %s' % (field, field_dict[field])) + for item in extra: + table_descr.append('"%s" %s' % (item[0], item[1])) + stm = "CREATE TABLE " + name + " (%s)" % ", ".join(table_descr) + cursor = self.connection.cursor() + cursor.execute(stm) + self.connection.commit() + + # register table + fields_str = ",".join(map(lambda x: '"' + x + '"', field_list)) + templ_str = ",".join("?" * len(field_list)) + stm = "INSERT INTO " + name + "(" + fields_str + ") VALUES(" + templ_str + ");" + self.tables[name] = stm + + return (cursor, stm) + + # add columns to table + def add_columns(self, name, columns): + cursor = self.connection.cursor() + for item in columns: + stm = "ALTER TABLE " + name + ' ADD COLUMN "%s" %s' % (item[0], item[1]) + cursor.execute(stm) + self.connection.commit() + + # add columns with expression + def add_data_column(self, table_name, data_label, data_type, data_expr): + cursor = self.connection.cursor() + cursor.execute( + 'ALTER TABLE %s ADD COLUMN "%s" %s' % (table_name, data_label, data_type) + ) + cursor.execute("UPDATE %s SET %s = (%s);" % (table_name, data_label, data_expr)) + + def change_rec_name(self, table_name, rec_id, rec_name): + self.connection.execute( + "UPDATE " + table_name + ' SET Name = ? WHERE "Index" = ?', (rec_name, rec_id) + ) + + def change_rec_tid(self, table_name, rec_id, tid): + self.connection.execute( + "UPDATE " + table_name + ' SET tid = ? 
WHERE "Index" = ?', (tid, rec_id) + ) + + def change_rec_fld(self, table_name, fld_expr, rec_pat): + self.connection.execute( + "UPDATE " + table_name + " SET " + fld_expr + " WHERE " + rec_pat + ) + + def table_get_record(self, table_name, rec_pat): + cursor = self.connection.execute( + "SELECT * FROM " + table_name + " WHERE " + rec_pat + ) + raws = cursor.fetchall() + if len(raws) != 1: + raise Exception( + "Record (" + rec_pat + ') is not unique, table "' + table_name + '"' + ) + return list(raws[0]) + + # populate DB table entry + def insert_entry(self, table, val_list): + (cursor, stm) = table + cursor.execute(stm, val_list) + + # populate DB table entry + def commit_entry(self, table, val_list): + self.insert_entry(table, val_list) + self.connection.commit() + + # populate DB table data + def insert_table(self, table, reader): + for val_list in reader: + if not val_list[-1]: + val_list.pop() + self.insert_entry(table, val_list) + self.connection.commit() + + # return table fields list + def _get_fields(self, table_name): + cursor = self.connection.execute("SELECT * FROM " + table_name) + return list(map(lambda x: '"%s"' % (x[0]), cursor.description)) + + # return table raws list + def _get_raws(self, table_name): + cursor = self.connection.execute("SELECT * FROM " + table_name) + return cursor.fetchall() + + def _get_raws_indexed(self, table_name): + cursor = self.connection.execute( + "SELECT * FROM " + table_name + ' order by "Index" asc;' + ) + return cursor.fetchall() + + def _get_raw_by_id(self, table_name, rec_id): + cursor = self.connection.execute( + "SELECT * FROM " + table_name + ' WHERE "Index"=?', (rec_id,) + ) + raws = cursor.fetchall() + if len(raws) != 1: + raise Exception('Index is not unique, table "' + table_name + '"') + return list(raws[0]) + + def table_get_raws(self, table_name): + return self._get_raws(table_name) + + # dump CSV table + def dump_csv(self, table_name, file_name): + if not re.search(r"\.csv$", file_name): + raise Exception('wrong output file type: "' + file_name + '"') + + fields = self._get_fields(table_name) + with open(file_name, mode="w") as fd: + fd.write(",".join(fields) + "\n") + for raw in self._get_raws(table_name): + tmp = list(raw) + for idx in range(len(tmp)): + if type(tmp[idx]) == str: + if not (tmp[idx][0] == tmp[idx][-1] == '"'): + tmp[idx] = '"' + tmp[idx] + '"' + raw = tuple(tmp) + fd.write(reduce(lambda a, b: str(a) + "," + str(b), raw) + "\n") + + # dump JSON trace + def open_json(self, file_name): + if not re.search(r"\.json$", file_name): + raise Exception('wrong output file type: "' + file_name + '"') + with open(file_name, mode="w") as fd: + fd.write('{ "traceEvents":[{}\n') + + def close_json(self, file_name): + if not re.search(r"\.json$", file_name): + raise Exception('wrong output file type: "' + file_name + '"') + with open(file_name, mode="a") as fd: + fd.write("}") + + def label_json(self, pid, label, file_name): + if not re.search(r"\.json$", file_name): + raise Exception('wrong output file type: "' + file_name + '"') + with open(file_name, mode="a") as fd: + fd.write( + ',{"args":{"name":"%s"},"ph":"M","pid":%s,"name":"process_name","sort_index":%d}\n' + % (label, pid, self.section_index) + ) + self.section_index += 1 + + def flow_json( + self, base_id, from_pid, from_us_list, to_pid, to_us_dict, corr_id_list, file_name + ): + if not re.search(r"\.json$", file_name): + raise Exception('wrong output file type: "' + file_name + '"') + with open(file_name, mode="a") as fd: + dep_id = base_id + for ind in 
range(len(from_us_list)):
+                corr_id = corr_id_list[ind] if (len(corr_id_list) != 0) else ind
+                if corr_id in to_us_dict:
+                    (from_ts, stream_id, tid) = from_us_list[ind]
+                    to_ts = to_us_dict[corr_id]
+                    if from_ts > to_ts:
+                        from_ts = to_ts
+                    fd.write(
+                        ',{"ts":%d,"ph":"s","cat":"DataFlow","id":%d,"pid":%d,"tid":%d,"name":"dep"}\n'
+                        % (from_ts, dep_id, from_pid, tid)
+                    )
+                    fd.write(
+                        ',{"ts":%d,"ph":"t","cat":"DataFlow","id":%d,"pid":%d,"tid":%d,"name":"dep"}\n'
+                        % (to_ts, dep_id, to_pid, stream_id)
+                    )
+                    dep_id += 1
+
+    def metadata_json(self, jsonfile, sysinfo_file):
+        params = gen_params(sysinfo_file)
+        with open(jsonfile, mode="a") as fd:
+            cnt = 0
+            fd.write("],\n")
+            fd.write('"otherData": {\n')
+            for nkey in sorted(params.keys()):
+                key = nkey[1]
+                cnt = cnt + 1
+                if cnt == len(params):
+                    fd.write(' "' + key + '": "' + params[nkey] + '"\n')
+                else:
+                    fd.write(' "' + key + '": "' + params[nkey] + '",\n')
+            fd.write(" }\n")
+
+    def dump_json(self, table_name, data_name, file_name):
+        if not re.search(r"\.json$", file_name):
+            raise Exception('wrong output file type: "' + file_name + '"')
+
+        sub_ptrn = re.compile(r'(^"|"$)')
+        name_ptrn = re.compile(r"(name|Name)")
+
+        table_fields = self._get_fields(table_name)
+        table_raws = self._get_raws(table_name)
+        data_fields = self._get_fields(data_name)
+        data_raws = self._get_raws(data_name)
+
+        with open(file_name, mode="a") as fd:
+            table_raws_len = len(table_raws)
+            for raw_index in range(table_raws_len):
+                if (raw_index == table_raws_len - 1) or (raw_index % 1000 == 0):
+                    sys.stdout.write(
+                        "\rdump json "
+                        + str(raw_index)
+                        + ":"
+                        + str(len(table_raws))
+                        + " " * 100
+                    )
+
+                vals_list = []
+                values = list(table_raws[raw_index])
+                for value_index in range(len(values)):
+                    label = table_fields[value_index]
+                    value = values[value_index]
+                    if name_ptrn.search(label):
+                        value = sub_ptrn.sub(r"", value)
+                    if label != '"Index"':
+                        if label == '"dur"' and value == 0:
+                            vals_list.append('%s:"%s"' % (label, "1"))
+                        else:
+                            vals_list.append('%s:"%s"' % (label, value))
+
+                args_list = []
+                data = list(data_raws[raw_index])
+                for value_index in range(len(data)):
+                    label = data_fields[value_index]
+                    value = data[value_index]
+                    if label[:3] == '"__':
+                        continue
+                    if name_ptrn.search(label):
+                        value = sub_ptrn.sub(r"", value)
+                    if label != '"Index"' and label != '"roctx-range"':
+                        args_list.append('%s:"%s"' % (label, value))
+
+                fd.write(
+                    ',{"ph":"%s",%s,\n "args":{\n %s\n }\n}\n'
+                    % ("X", ",".join(vals_list), ",\n ".join(args_list))
+                )
+
+            sys.stdout.write("\n")
+
+    # execute query on DB
+    def execute(self, cmd):
+        cursor = self.connection.cursor()
+        cursor.execute(cmd)
+
+    # commit DB
+    def commit(self):
+        self.connection.commit()
+
+    # close DB
+    def close(self):
+        self.connection.commit()
+        self.connection.close()
+
+    # access DB
+    def get_raws(self, table_name):
+        cur = self.connection.cursor()
+        cur.execute("SELECT * FROM %s" % table_name)
+        return cur.fetchall()
+
+    # return CSV descriptor
+    # list of fields and dictionary for the field types
+    def _get_csv_descr(self, table_name, fd):
+        reader = csv.DictReader(fd)
+        field_names = reader.fieldnames
+        if not field_names[-1]:
+            field_names.pop()
+        field_types = {}
+
+        for entry in reader:
+            fields_left = [f for f in field_names if f not in field_types.keys()]
+            # all fields processed
+            if not fields_left:
+                break
+
+            for field in fields_left:
+                data = entry[field]
+                # need data for the field to be processed
+                if len(data) == 0:
+                    continue
+
+                if data.isdigit():
+                    field_types[field] =
"INTEGER" + else: + field_types[field] = "TEXT" + + if len(fields_left) > 0: + raise Exception("types not found for fields: ", fields_left) + return (field_names, field_types) + + # add CSV table + def add_csv_table(self, table_name, file_name, extra=()): + with open(file_name, mode="r") as fd: + # get CSV table descriptor + descr = self._get_csv_descr(table_name, fd) + # reader to populate the table + fd.seek(0) + reader = csv.reader(fd) + reader.next() + table = self.add_table(table_name, descr, extra) + self.insert_table(table, reader) + ############################################################################################## diff --git a/bin/tblextr.py b/bin/tblextr.py index c549fc09..507e1aeb 100755 --- a/bin/tblextr.py +++ b/bin/tblextr.py @@ -50,317 +50,418 @@ # stream ID map stream_counter = 0 stream_id_map = {} + + def get_stream_index(stream_id): - global stream_counter - stream_ind = 0 - if stream_id.lower() != 'nil': - if not stream_id in stream_id_map: - stream_counter += 1 - stream_ind = stream_counter - stream_id_map[stream_id] = stream_ind - else: - stream_ind = stream_id_map[stream_id] - return stream_ind + global stream_counter + stream_ind = 0 + if stream_id.lower() != "nil": + if not stream_id in stream_id_map: + stream_counter += 1 + stream_ind = stream_counter + stream_id_map[stream_id] = stream_ind + else: + stream_ind = stream_id_map[stream_id] + return stream_ind + # patching activity records -def activity_record_patching(db, ops_table_name, kernel_found, kernel_name, stream_found, stream_ind, select_expr): - if kernel_found != 0: - db.change_rec_fld(ops_table_name, 'Name = "' + kernel_name + '"', select_expr) - if stream_found != 0: - db.change_rec_fld(ops_table_name, 'tid = ' + str(stream_ind), select_expr) +def activity_record_patching( + db, ops_table_name, kernel_found, kernel_name, stream_found, stream_ind, select_expr +): + if kernel_found != 0: + db.change_rec_fld(ops_table_name, 'Name = "' + kernel_name + '"', select_expr) + if stream_found != 0: + db.change_rec_fld(ops_table_name, "tid = " + str(stream_ind), select_expr) + # global vars -table_descr = [ - ['Index', 'KernelName'], - {'Index': 'INTEGER', 'KernelName': 'TEXT'} -] +table_descr = [["Index", "KernelName"], {"Index": "INTEGER", "KernelName": "TEXT"}] var_list = table_descr[0] var_table = {} ############################################################# + def fatal(msg): - sys.stderr.write(sys.argv[0] + ": " + msg + "\n"); - sys.exit(1) + sys.stderr.write(sys.argv[0] + ": " + msg + "\n") + sys.exit(1) + dbglog_count = 0 + + def dbglog(msg): - global dbglog_count - dbglog_count += 1 - sys.stderr.write(sys.argv[0] + ": " + msg + "\n"); - fatal("error") + global dbglog_count + dbglog_count += 1 + sys.stderr.write(sys.argv[0] + ": " + msg + "\n") + fatal("error") + + ############################################################# # Dumping sysinfo sysinfo_begin = 1 + + def metadata_gen(sysinfo_file, sysinfo_cmd): - global sysinfo_begin - if not re.search(r'\.txt$', sysinfo_file): - raise Exception('wrong output file type: "' + sysinfo_file + '"' ) - if sysinfo_begin == 1: - sysinfo_begin = 0 - with open(sysinfo_file, mode='w') as fd: fd.write('') - with open(sysinfo_file, mode='a') as fd: fd.write('CMD: ' + sysinfo_cmd + '\n') - status = subprocess.call(sysinfo_cmd + ' >> ' + sysinfo_file, - stderr=subprocess.STDOUT, - shell=True) - if status != 0: - raise Exception('Could not run command: "' + sysinfo_cmd + '"') + global sysinfo_begin + if not re.search(r"\.txt$", sysinfo_file): + raise 
Exception('wrong output file type: "' + sysinfo_file + '"') + if sysinfo_begin == 1: + sysinfo_begin = 0 + with open(sysinfo_file, mode="w") as fd: + fd.write("") + with open(sysinfo_file, mode="a") as fd: + fd.write("CMD: " + sysinfo_cmd + "\n") + status = subprocess.call( + sysinfo_cmd + " >> " + sysinfo_file, stderr=subprocess.STDOUT, shell=True + ) + if status != 0: + raise Exception('Could not run command: "' + sysinfo_cmd + '"') + # parse results method def parse_res(infile): - global max_gpu_id - if not os.path.isfile(infile): return - inp = open(infile, 'r') - - beg_pattern = re.compile("^dispatch\[(\d*)\], (.*) kernel-name\(\"([^\"]*)\"\)") - prop_pattern = re.compile("([\w-]+)\((\w+)\)"); - ts_pattern = re.compile(", time\((\d*),(\d*),(\d*),(\d*)\)") - # var pattern below matches a variable name and a variable value from a one - # line text in the format of for example "WRITE_SIZE (0.2500000000)" or - # "GRBM_GUI_ACTIVE (27867)" or "TA_TA_BUSY[0]" - var_pattern = re.compile("^\s*([a-zA-Z0-9_]+(?:\[\d+\])?)\s+\((\d+(?:\.\d+)?)\)") - pid_pattern = re.compile("pid\((\d*)\)") - - dispatch_number = 0 - var_table_pid = 0 - for line in inp.readlines(): - record = line[:-1] - - m = pid_pattern.search(record) - if m and not os.getenv('ROCP_MERGE_PIDS'): var_table_pid = int(m.group(1)) - - m = var_pattern.match(record) - if m: - if not (var_table_pid, dispatch_number) in var_table: fatal("Error: dispatch number not found '" + str(dispatch_number) + "'") - var = m.group(1) - val = m.group(2) - var_table[(var_table_pid, dispatch_number)][var] = val - if not var in var_list: var_list.append(var) - - m = beg_pattern.match(record) - if m: - dispatch_number = m.group(1) - if not (var_table_pid, dispatch_number) in var_table: - var_table[(var_table_pid, dispatch_number)] = { - 'Index': dispatch_number, - 'KernelName': "\"" + m.group(3) + "\"" - } - - gpu_id = 0 - queue_id = 0 - disp_pid = 0 - disp_tid = 0 - - kernel_properties = m.group(2) - for prop in kernel_properties.split(', '): - m = prop_pattern.match(prop) - if m: + global max_gpu_id + if not os.path.isfile(infile): + return + inp = open(infile, "r") + + beg_pattern = re.compile('^dispatch\[(\d*)\], (.*) kernel-name\("([^"]*)"\)') + prop_pattern = re.compile("([\w-]+)\((\w+)\)") + ts_pattern = re.compile(", time\((\d*),(\d*),(\d*),(\d*)\)") + # var pattern below matches a variable name and a variable value from a one + # line text in the format of for example "WRITE_SIZE (0.2500000000)" or + # "GRBM_GUI_ACTIVE (27867)" or "TA_TA_BUSY[0]" + var_pattern = re.compile("^\s*([a-zA-Z0-9_]+(?:\[\d+\])?)\s+\((\d+(?:\.\d+)?)\)") + pid_pattern = re.compile("pid\((\d*)\)") + + dispatch_number = 0 + var_table_pid = 0 + for line in inp.readlines(): + record = line[:-1] + + m = pid_pattern.search(record) + if m and not os.getenv("ROCP_MERGE_PIDS"): + var_table_pid = int(m.group(1)) + + m = var_pattern.match(record) + if m: + if not (var_table_pid, dispatch_number) in var_table: + fatal("Error: dispatch number not found '" + str(dispatch_number) + "'") var = m.group(1) val = m.group(2) var_table[(var_table_pid, dispatch_number)][var] = val - if not var in var_list: var_list.append(var); - if var == 'gpu-id': - gpu_id = int(val) - if (gpu_id > max_gpu_id): max_gpu_id = gpu_id - if var == 'queue-id': queue_id = int(val) - if var == 'pid': disp_pid = int(val) - if var == 'tid': disp_tid = int(val) - else: fatal('wrong kernel property "' + prop + '" in "'+ kernel_properties + '"') - m = ts_pattern.search(record) + if not var in var_list: + 
var_list.append(var)
+
+        m = beg_pattern.match(record)
+        if m:
+            dispatch_number = m.group(1)
+            if not (var_table_pid, dispatch_number) in var_table:
+                var_table[(var_table_pid, dispatch_number)] = {
+                    "Index": dispatch_number,
+                    "KernelName": '"' + m.group(3) + '"',
+                }
+
+            gpu_id = 0
+            queue_id = 0
+            disp_pid = 0
+            disp_tid = 0
+
+            kernel_properties = m.group(2)
+            for prop in kernel_properties.split(", "):
+                m = prop_pattern.match(prop)
+                if m:
+                    var = m.group(1)
+                    val = m.group(2)
+                    var_table[(var_table_pid, dispatch_number)][var] = val
+                    if not var in var_list:
+                        var_list.append(var)
+                    if var == "gpu-id":
+                        gpu_id = int(val)
+                        if gpu_id > max_gpu_id:
+                            max_gpu_id = gpu_id
+                    if var == "queue-id":
+                        queue_id = int(val)
+                    if var == "pid":
+                        disp_pid = int(val)
+                    if var == "tid":
+                        disp_tid = int(val)
+                else:
+                    fatal(
+                        'wrong kernel property "'
+                        + prop
+                        + '" in "'
+                        + kernel_properties
+                        + '"'
+                    )
+            m = ts_pattern.search(record)
+            if m:
+                var_table[(var_table_pid, dispatch_number)]["DispatchNs"] = m.group(1)
+                var_table[(var_table_pid, dispatch_number)]["BeginNs"] = m.group(2)
+                var_table[(var_table_pid, dispatch_number)]["EndNs"] = m.group(3)
+                var_table[(var_table_pid, dispatch_number)]["CompleteNs"] = m.group(4)
+
+                ## filling dependencies
+                from_ns = int(m.group(1))
+                to_ns = int(m.group(2))
+                from_us = int((from_ns - START_NS) / 1000)
+                to_us = int((to_ns - START_NS) / 1000)
+
+                kern_dep_list.append((from_ns, disp_pid, disp_tid))
+
+                gpu_pid = GPU_BASE_PID + int(gpu_id)
+                if not disp_pid in dep_dict:
+                    dep_dict[disp_pid] = {}
+                dep_proc = dep_dict[disp_pid]
+                if not gpu_pid in dep_proc:
+                    dep_proc[gpu_pid] = {
+                        "pid": HSA_PID,
+                        "from": [],
+                        "to": {},
+                        "id": [],
+                    }
+                dep_str = dep_proc[gpu_pid]
+                to_id = len(dep_str["from"])
+                dep_str["from"].append((from_us, disp_tid, disp_tid))
+                dep_str["to"][to_id] = to_us
+                ##
+
+    inp.close()
+
+
#############################################################
+
# Comparator to sort a dictionary of tuples. This comparator will convert
# the second element of tuple to an int and return the new tuple. Then
# the dictionary can use the default comparison i.e. sort by first element,
# then sort by second element.
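# For example, the keys ("0", "2") and ("0", "10") then sort numerically on
# the second element (2 before 10) instead of lexically ("10" before "2").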
-def tuple_comparator(tupleElem) : +def tuple_comparator(tupleElem): return tupleElem[0], int(tupleElem[1]) + # merge results table def merge_table(): - global var_list - keys = sorted(var_table.keys(), key=tuple_comparator) - - fields = set(var_table[keys[0]]) - if 'DispatchNs' in fields: - var_list.append('DispatchNs') - var_list.append('BeginNs') - var_list.append('EndNs') - var_list.append('CompleteNs') - var_list = [x for x in var_list if x in fields] + global var_list + keys = sorted(var_table.keys(), key=tuple_comparator) + + fields = set(var_table[keys[0]]) + if "DispatchNs" in fields: + var_list.append("DispatchNs") + var_list.append("BeginNs") + var_list.append("EndNs") + var_list.append("CompleteNs") + var_list = [x for x in var_list if x in fields] + + ############################################################# + # dump CSV results def dump_csv(file_name): - global var_list - keys = sorted(var_table.keys(), key=tuple_comparator) + global var_list + keys = sorted(var_table.keys(), key=tuple_comparator) + + with open(file_name, mode="w") as fd: + fd.write(",".join(var_list) + "\n") + for pid, ind in keys: + entry = var_table[(pid, ind)] + dispatch_number = entry["Index"] + if ind != dispatch_number: + fatal("Dispatch #" + ind + " index mismatch (" + dispatch_number + ")\n") + val_list = [entry[var] for var in var_list] + fd.write(",".join(val_list) + "\n") + + print("File '" + file_name + "' is generating") - with open(file_name, mode='w') as fd: - fd.write(','.join(var_list) + '\n'); - for pid, ind in keys: - entry = var_table[(pid, ind)] - dispatch_number = entry['Index'] - if ind != dispatch_number: fatal("Dispatch #" + ind + " index mismatch (" + dispatch_number + ")\n") - val_list = [entry[var] for var in var_list] - fd.write(','.join(val_list) + '\n'); - print("File '" + file_name + "' is generating") ############################################################# + # fill kernels DB def fill_kernel_db(table_name, db): - global var_list - keys = sorted(var_table.keys(), key=tuple_comparator) + global var_list + keys = sorted(var_table.keys(), key=tuple_comparator) + + for var in set(var_list).difference(set(table_descr[1])): + table_descr[1][var] = "INTEGER" + table_descr[0] = var_list + + table_handle = db.add_table(table_name, table_descr) - for var in set(var_list).difference(set(table_descr[1])): - table_descr[1][var] = 'INTEGER' - table_descr[0] = var_list; + for pid, ind in keys: + entry = var_table[(pid, ind)] + dispatch_number = entry["Index"] + if ind != dispatch_number: + fatal("Dispatch #" + ind + " index mismatch (" + dispatch_number + ")\n") + val_list = [entry[var] for var in var_list] + db.insert_entry(table_handle, val_list) - table_handle = db.add_table(table_name, table_descr) - for pid, ind in keys: - entry = var_table[(pid, ind)] - dispatch_number = entry['Index'] - if ind != dispatch_number: fatal("Dispatch #" + ind + " index mismatch (" + dispatch_number + ")\n") - val_list = [entry[var] for var in var_list] - db.insert_entry(table_handle, val_list) ############################################################# # Fill Ext DB ext_table_descr = [ - ['BeginNs', 'EndNs', 'pid', 'tid', 'Name', 'Index', '__section', '__lane'], - {'BeginNs':'INTEGER', 'EndNs':'INTEGER', 'pid':'INTEGER', 'tid':'INTEGER', 'Name':'TEXT', 'Index':'INTEGER', '__section':'INTEGER', '__lane':'INTEGER'} + ["BeginNs", "EndNs", "pid", "tid", "Name", "Index", "__section", "__lane"], + { + "BeginNs": "INTEGER", + "EndNs": "INTEGER", + "pid": "INTEGER", + "tid": "INTEGER", + "Name": 
"TEXT", + "Index": "INTEGER", + "__section": "INTEGER", + "__lane": "INTEGER", + }, ] + + def fill_ext_db(table_name, db, indir, trace_name, api_pid): - global range_data + global range_data - file_name = indir + '/' + trace_name + '_trace.txt' - # tms pid:tid cid:rid:'.....' - ptrn_val = re.compile(r'(\d+) (\d+):(\d+) (\d+):(\d+):"(.*)"$') + file_name = indir + "/" + trace_name + "_trace.txt" + # tms pid:tid cid:rid:'.....' + ptrn_val = re.compile(r'(\d+) (\d+):(\d+) (\d+):(\d+):"(.*)"$') - range_data = {} - range_stack = {} - range_map = {} + range_data = {} + range_stack = {} + range_map = {} - if not os.path.isfile(file_name): return 0 + if not os.path.isfile(file_name): + return 0 + + record_id = 0 + table_handle = db.add_table(table_name, ext_table_descr) + with open(file_name, mode="r") as fd: + for line in fd.readlines(): + record = line[:-1] + m = ptrn_val.match(record) + if m: + tms = int(m.group(1)) + pid = m.group(2) + tid = int(m.group(3)) + cid = int(m.group(4)) + rid = int(m.group(5)) + msg = m.group(6) + + rec_vals = [] + if not tid in range_data: + range_data[tid] = {} + + if cid != 2: + rec_vals.append(tms) + rec_vals.append(tms + 1) + rec_vals.append(pid) + rec_vals.append(tid) + rec_vals.append(msg) + rec_vals.append(record_id) + rec_vals.append(api_pid) # __section + rec_vals.append(tid) # __lane + + if cid == 1: + if not pid in range_stack: + range_stack[pid] = {} + pid_stack = range_stack[pid] + if not tid in pid_stack: + pid_stack[tid] = [] + rec_stack = pid_stack[tid] + rec_stack.append(rec_vals) + continue + + if cid == 2: + if not pid in range_stack: + fatal("ROCTX range begin not found, pid(" + pid + ")") + pid_stack = range_stack[pid] + if not tid in pid_stack: + fatal("ROCTX range begin not found, tid(" + tid + ")") + rec_stack = pid_stack[tid] + rec_vals = rec_stack.pop() + rec_vals[1] = tms + # record the range's start/stop timestamps, its parent (ranges can be nested), and its message. 
+ range_start = rec_vals[0] + range_stop = tms + range_parent = rec_stack[-1][0] if len(rec_stack) != 0 else 0 + range_msg = rec_vals[4] + range_data[tid][range_start] = (range_stop, range_parent, range_msg) + + # range start + if cid == 3: + range_map[rid] = (tms, msg) + continue + + # range stop + if cid == 4: + if rid in range_map: + (tms, msg) = range_map[ + rid + ] # querying start timestamp if rid exists + del range_map[rid] + else: + fatal("range id(" + str(rid) + ") is not found") + rec_vals[0] = tms # begin timestamp + rec_vals[4] = msg # range message + rec_vals[7] = 0 # 0 lane for ranges + + db.insert_entry(table_handle, rec_vals) + record_id += 1 + + return 1 - record_id = 0 - table_handle = db.add_table(table_name, ext_table_descr) - with open(file_name, mode='r') as fd: - for line in fd.readlines(): - record = line[:-1] - m = ptrn_val.match(record) - if m: - tms = int(m.group(1)) - pid = m.group(2) - tid = int(m.group(3)) - cid = int(m.group(4)) - rid = int(m.group(5)) - msg = m.group(6) - rec_vals = [] - if not tid in range_data: range_data[tid] = {} - - if cid != 2: - rec_vals.append(tms) - rec_vals.append(tms + 1) - rec_vals.append(pid) - rec_vals.append(tid) - rec_vals.append(msg) - rec_vals.append(record_id) - rec_vals.append(api_pid) # __section - rec_vals.append(tid) # __lane - - if cid == 1: - if not pid in range_stack: range_stack[pid] = {} - pid_stack = range_stack[pid] - if not tid in pid_stack: pid_stack[tid] = [] - rec_stack = pid_stack[tid] - rec_stack.append(rec_vals) - continue - - if cid == 2: - if not pid in range_stack: fatal("ROCTX range begin not found, pid(" + pid + ")"); - pid_stack = range_stack[pid] - if not tid in pid_stack: fatal("ROCTX range begin not found, tid(" + tid + ")"); - rec_stack = pid_stack[tid] - rec_vals = rec_stack.pop() - rec_vals[1] = tms - # record the range's start/stop timestamps, its parent (ranges can be nested), and its message. 
- range_start = rec_vals[0] - range_stop = tms - range_parent = rec_stack[-1][0] if len(rec_stack) != 0 else 0 - range_msg = rec_vals[4] - range_data[tid][range_start] = (range_stop, range_parent, range_msg) - - # range start - if cid == 3: - range_map[rid] = (tms, msg) - continue - - # range stop - if cid == 4: - if rid in range_map: - (tms, msg) = range_map[rid] # querying start timestamp if rid exists - del range_map[rid] - else: fatal("range id(" + str(rid) + ") is not found") - rec_vals[0] = tms # begin timestamp - rec_vals[4] = msg # range message - rec_vals[7] = 0 # 0 lane for ranges - - db.insert_entry(table_handle, rec_vals) - record_id += 1 - - return 1 ############################################################# # arguments manipulation routines def get_field(args, field): - ptrn1_field = re.compile(r'^.* ' + field + '\('); - ptrn2_field = re.compile(r'\) .*$'); - ptrn3_field = re.compile(r'\)\)$'); - (field_name, n) = ptrn1_field.subn('', args, count=1); - if n != 0: - (field_name, n) = ptrn2_field.subn('', field_name, count=1) - if n == 0: - (field_name, n) = ptrn3_field.subn('', field_name, count=1) - return (field_name, n) + ptrn1_field = re.compile(r"^.* " + field + "\(") + ptrn2_field = re.compile(r"\) .*$") + ptrn3_field = re.compile(r"\)\)$") + (field_name, n) = ptrn1_field.subn("", args, count=1) + if n != 0: + (field_name, n) = ptrn2_field.subn("", field_name, count=1) + if n == 0: + (field_name, n) = ptrn3_field.subn("", field_name, count=1) + return (field_name, n) + def set_field(args, field, val): - return re.subn(field + '\(\w+\)([ \)])', field + '(' + str(val) + ')\\1', args, count=1) + return re.subn( + field + "\(\w+\)([ \)])", field + "(" + str(val) + ")\\1", args, count=1 + ) + hsa_patch_data = {} ops_patch_data = {} # Fill API DB api_table_descr = [ - ['BeginNs', 'EndNs', 'pid', 'tid', 'Name', 'args', 'Index', 'Data', '__section', '__lane'], - {'BeginNs':'INTEGER', 'EndNs':'INTEGER', 'pid':'INTEGER', 'tid':'INTEGER', 'Name':'TEXT', 'args':'TEXT', 'Index':'INTEGER', 'Data':'TEXT', '__section':'INTEGER', '__lane':'INTEGER'} + [ + "BeginNs", + "EndNs", + "pid", + "tid", + "Name", + "args", + "Index", + "Data", + "__section", + "__lane", + ], + { + "BeginNs": "INTEGER", + "EndNs": "INTEGER", + "pid": "INTEGER", + "tid": "INTEGER", + "Name": "TEXT", + "args": "TEXT", + "Index": "INTEGER", + "Data": "TEXT", + "__section": "INTEGER", + "__lane": "INTEGER", + }, ] + + # Filling API records DB table # table_name - created DB table name # db - DB handle @@ -370,53 +471,64 @@ def set_field(args, field, val): # dep_pid - PID of dependet domain # dep_list - list of dependet dospatch events # dep_filtr - registered dependencies by record ID -def fill_api_db(table_name, db, indir, api_name, api_pid, dep_pid, dep_list, dep_filtr, expl_id): - global hsa_activity_found - global memory_manager - - range_start_times = {} - copy_csv = '' - - ptrn_val = re.compile(r'(\d+):(\d+) (\d+):(\d+) ([^\(]+)(\(.*)$') - hip_mcopy_ptrn = re.compile(r'hipMemcpy|hipMemset') - hip_wait_event_ptrn = re.compile(r'WaitEvent') - hip_sync_event_ptrn = re.compile(r'hipStreamSynchronize') - hip_sync_dev_event_ptrn = re.compile(r'hipDeviceSynchronize') - hip_graph_ptrn = re.compile(r'hipGraphLaunch') - wait_event_ptrn = re.compile(r'WaitEvent|hipStreamSynchronize|hipDeviceSynchronize') - hip_stream_wait_write_ptrn = re.compile(r'hipStreamWaitValue64|hipStreamWriteValue64|hipStreamWaitValue32|hipStreamWriteValue32') - prop_pattern = re.compile("([\w-]+)\((\w+)\)"); - beg_pattern = 
re.compile("^dispatch\[(\d*)\], (.*) kernel-name\(\"([^\"]*)\"\)") - hip_strm_cr_event_ptrn = re.compile(r'hipStreamCreate') - hsa_mcopy_ptrn = re.compile(r'hsa_amd_memory_async_copy') - ptrn_fixformat = re.compile(r'(\d+:\d+ \d+:\d+ \w+)\(\s*(.*)\)$') - ptrn_fixkernel = re.compile(r'\s+kernel=(.*)$') - ptrn_multi_kernel = re.compile(r'(.*):(\d+)$') - ptrn_corr_id = re.compile(r'\ :(\d*)$') - - file_name = indir + '/' + api_name + '_api_trace.txt' - if not os.path.isfile(file_name): return 0 - - hsa_copy_file_name = indir + '/' + 'async_copy_trace.txt' - hsa_copy_file_name_present = 1 if os.path.isfile(file_name) else 0 - hsa_copy_deps = 1 if (api_pid == HSA_PID and hsa_copy_file_name_present == 1) else 0 - print("hsa_copy_deps: " + str(hsa_copy_deps)) - - # parsing an input trace file and creating a DB table - record_id_dict = {} - table_handle = db.add_table(table_name, api_table_descr) - with open(file_name, mode='r') as fd: - file_lines = fd.readlines() - total_lines = len(file_lines) - line_index = 0 - for line in file_lines: - if (line_index == total_lines - 1) or (line_index % 100 == 0): - sys.stdout.write( \ - "\rscan " + api_name + " API data " + str(line_index) + ":" + str(total_lines) + " "*100 \ - ) - line_index += 1 - - record = line[:-1] +def fill_api_db( + table_name, db, indir, api_name, api_pid, dep_pid, dep_list, dep_filtr, expl_id +): + global hsa_activity_found + global memory_manager + + range_start_times = {} + copy_csv = "" + + ptrn_val = re.compile(r"(\d+):(\d+) (\d+):(\d+) ([^\(]+)(\(.*)$") + hip_mcopy_ptrn = re.compile(r"hipMemcpy|hipMemset") + hip_wait_event_ptrn = re.compile(r"WaitEvent") + hip_sync_event_ptrn = re.compile(r"hipStreamSynchronize") + hip_sync_dev_event_ptrn = re.compile(r"hipDeviceSynchronize") + hip_graph_ptrn = re.compile(r"hipGraphLaunch") + wait_event_ptrn = re.compile(r"WaitEvent|hipStreamSynchronize|hipDeviceSynchronize") + hip_stream_wait_write_ptrn = re.compile( + r"hipStreamWaitValue64|hipStreamWriteValue64|hipStreamWaitValue32|hipStreamWriteValue32" + ) + prop_pattern = re.compile("([\w-]+)\((\w+)\)") + beg_pattern = re.compile('^dispatch\[(\d*)\], (.*) kernel-name\("([^"]*)"\)') + hip_strm_cr_event_ptrn = re.compile(r"hipStreamCreate") + hsa_mcopy_ptrn = re.compile(r"hsa_amd_memory_async_copy") + ptrn_fixformat = re.compile(r"(\d+:\d+ \d+:\d+ \w+)\(\s*(.*)\)$") + ptrn_fixkernel = re.compile(r"\s+kernel=(.*)$") + ptrn_multi_kernel = re.compile(r"(.*):(\d+)$") + ptrn_corr_id = re.compile(r"\ :(\d*)$") + + file_name = indir + "/" + api_name + "_api_trace.txt" + if not os.path.isfile(file_name): + return 0 + + hsa_copy_file_name = indir + "/" + "async_copy_trace.txt" + hsa_copy_file_name_present = 1 if os.path.isfile(file_name) else 0 + hsa_copy_deps = 1 if (api_pid == HSA_PID and hsa_copy_file_name_present == 1) else 0 + print("hsa_copy_deps: " + str(hsa_copy_deps)) + + # parsing an input trace file and creating a DB table + record_id_dict = {} + table_handle = db.add_table(table_name, api_table_descr) + with open(file_name, mode="r") as fd: + file_lines = fd.readlines() + total_lines = len(file_lines) + line_index = 0 + for line in file_lines: + if (line_index == total_lines - 1) or (line_index % 100 == 0): + sys.stdout.write( + "\rscan " + + api_name + + " API data " + + str(line_index) + + ":" + + str(total_lines) + + " " * 100 + ) + line_index += 1 + + record = line[:-1] corr_id = 0 m = ptrn_corr_id.search(record) @@ -494,447 +606,690 @@ def fill_api_db(table_name, db, indir, api_name, api_pid, dep_pid, dep_list, dep for prop in 
kernel_properties.split(', '):
        m = prop_pattern.match(prop)
        if m:
-          val = m.group(2)
-          var = m.group(1)
-          if var == 'gpu-id':
-            gpu_id = int(val)
+                corr_id = int(m.group(1))
+                record = ptrn_corr_id.sub("", record)
+
+            kernel_arg = ""
+            m = ptrn_fixkernel.search(record)
+            if m:
+                kernel_arg = "kernel(" + m.group(1) + ") "
+                record = ptrn_fixkernel.sub("", record)
+
+            mfixformat = ptrn_fixformat.match(record)
+            if mfixformat:  # replace '=' in args with parentheses
+                reformated_args = (
+                    kernel_arg
+                    + mfixformat.group(2)
+                    .replace("=", "(")
+                    .replace(",", ")")
+                    .replace("\\", "\\\\")
+                    .replace('"', '\\"')
+                    + ")"
+                )
+                record = mfixformat.group(1) + "( " + reformated_args + ")"
+
+            m = ptrn_val.match(record)
+            if not m:
+                fatal(api_name + " bad record: '" + record + "'")
+            else:
+                rec_vals = []
+                rec_len = len(api_table_descr[0]) - 3
+                for ind in range(1, rec_len):
+                    rec_vals.append(m.group(ind))
+                proc_id = int(rec_vals[2])
+                thread_id = int(rec_vals[3])
+                record_name = rec_vals[4]
+                record_args = rec_vals[5]
+
+                # incrementing per-process record id/correlation id
+                if not proc_id in record_id_dict:
+                    record_id_dict[proc_id] = 0
+                record_id_dict[proc_id] += 1
+                record_id = record_id_dict[proc_id]
+
+                # setting correlation id to record id if correlation id is not defined
+                if corr_id == 0:
+                    corr_id = record_id
+
+                rec_vals.append(corr_id)
+                # extracting/converting stream id
+                (stream_id, stream_found) = get_field(record_args, "stream")
+                if stream_found:
+                    stream_id = get_stream_index(stream_id)
+                    (rec_vals[5], found) = set_field(record_args, "stream", stream_id)
+                    if found == 0:
+                        fatal(
+                            'set_field() failed for "stream", args: "' + record_args + '"'
+                        )
+                else:
+                    (stream_id, stream_found) = get_field(record_args, "hStream")
+                    if stream_found:
+                        stream_id = get_stream_index(stream_id)
+                        (rec_vals[5], found) = set_field(
+                            record_args, "hStream", stream_id
+                        )
+                        if found == 0:
+                            fatal(
+                                'set_field() failed for "stream", args: "'
+                                + record_args
+                                + '"'
+                            )
+                    else:
+                        stream_id = 0
+
+                if hip_strm_cr_event_ptrn.match(record_name):
+                    hip_streams.append(stream_id)
+
+                if hip_sync_event_ptrn.match(record_name):
+                    if (proc_id, stream_id) in last_hip_api_map:
+                        (last_hip_api_corr_id, last_hip_api_from_pid) = last_hip_api_map[
+                            (proc_id, stream_id)
+                        ][-1]
+                        sync_api_beg_us = int((int(rec_vals[0]) - START_NS) / 1000)
+                        if HIP_PID not in dep_dict[proc_id]:
+                            dep_dict[proc_id][HIP_PID] = {
+                                "pid": last_hip_api_from_pid,
+                                "from": [],
+                                "to": {},
+                                "id": [],
+                            }
+                        dep_dict[proc_id][HIP_PID]["from"].append(
+                            (-1, stream_id, thread_id)
+                        )
+                        dep_dict[proc_id][HIP_PID]["id"].append(last_hip_api_corr_id)
+                        dep_dict[proc_id][HIP_PID]["to"][
+                            last_hip_api_corr_id
+                        ] = sync_api_beg_us
+                        from_ids[(last_hip_api_corr_id, proc_id)] = (
+                            len(dep_dict[proc_id][HIP_PID]["from"]) - 1
+                        )
+
+            m = beg_pattern.match(record)
+            gpu_id = 0
+            if m:
+                kernel_properties = m.group(2)
+                for prop in kernel_properties.split(", "):
+                    m = prop_pattern.match(prop)
+                    if m:
+                        val = m.group(2)
+                        var = m.group(1)
+                        if var == "gpu-id":
+                            gpu_id = int(val)
+
+            if hsa_mcopy_ptrn.match(record_name) or hip_mcopy_ptrn.match(record_name):
+                ops_section_id = COPY_PID
+            else:
+                ops_section_id = GPU_BASE_PID + int(gpu_id)
+
+            if (proc_id, stream_id) not in last_hip_api_map:
+                last_hip_api_map[(proc_id, stream_id)] = []
+            last_hip_api_map[(proc_id, stream_id)].append((corr_id, ops_section_id))
+
+            # asynchronous operation API found
+            op_found = 0
+            mcopy_found = 0
+
+            # extract kernel name string
+            (kernel_str, kernel_found) =
get_field(record_args, "kernel") + if kernel_found == 0: + kernel_str = "" + else: + op_found = 1 + + if hip_mcopy_ptrn.match(record_name): + mcopy_found = 1 + op_found = 1 + + # HIP Graph API + if hip_graph_ptrn.search(record_name): + op_found = 1 + + # HIP WaitEvent API + if wait_event_ptrn.search(record_name): + op_found = 1 + + if hip_stream_wait_write_ptrn.search(record_name): + op_found = 1 + + # HSA memcopy API + if hsa_mcopy_ptrn.match(record_name): + mcopy_found = 1 + op_found = 1 + + stream_id = thread_id + hsa_patch_data[(corr_id, proc_id)] = thread_id + + if op_found: + roctx_msg = "" + + if not thread_id in range_start_times: + range_start_times[thread_id] = ( + sorted(range_data[thread_id].keys()) + if thread_id in range_data + else [] + ) + start_times = range_start_times[thread_id] + + index = bisect.bisect_right(start_times, int(rec_vals[0])) + if index > 0: + # We found the range that is closest to this operation. Iterate the + # range stack this range is part of until we find a range that entirely + # contains the operation. + range_start = start_times[index - 1] + while range_start != 0: + (range_end, range_start, msg) = range_data[thread_id][ + range_start + ] + if int(rec_vals[1]) < range_end: + # This range contains the operation. + roctx_msg = msg + break + + ops_patch_data[(corr_id, proc_id)] = ( + thread_id, + stream_id, + kernel_str, + roctx_msg, + ) + + if op_found: + op_found = 0 + beg_ns = int(rec_vals[0]) + end_ns = int(rec_vals[1]) + dur_us = int((end_ns - beg_ns) / 1000) + from_us = int((beg_ns - START_NS) / 1000) + dur_us / 2 + if api_pid == HIP_PID or hsa_copy_deps == 1: + if not proc_id in dep_dict: + dep_dict[proc_id] = {} + dep_proc = dep_dict[proc_id] + if not dep_pid in dep_proc: + if api_pid == "HIP_PID": + dep_proc[dep_pid] = {"pid": api_pid, "from": [], "id": []} + else: + dep_proc[dep_pid] = { + "pid": api_pid, + "from": [], + "id": [], + "to": {}, + } + dep_str = dep_proc[dep_pid] + dep_str["from"].append((from_us, stream_id, thread_id)) + if expl_id: + dep_str["id"].append(corr_id) + + # memcopy registering + api_data = ( + memory_manager.register_api(rec_vals) if mcopy_data_enabled else "" + ) + rec_vals.append(api_data) + + # setting section and lane + rec_vals.append(api_pid) # __section + rec_vals.append(thread_id) # __lane + + # inserting an API record to DB + db.insert_entry(table_handle, rec_vals) + + # inserting of dispatch events correlated to the dependent dispatches + for from_ns, proc_id, thread_id in dep_list: + if not proc_id in record_id_dict: + record_id_dict[proc_id] = 0 + record_id_dict[proc_id] += 1 + corr_id = record_id_dict[proc_id] + db.insert_entry( + table_handle, + [ + from_ns, + from_ns, + proc_id, + thread_id, + "hsa_dispatch", + "", + corr_id, + "", + api_pid, + thread_id, + ], + ) + + # generating memcopy CSV + if copy_csv != "": + file_name = os.environ["PWD"] + "/results_mcopy.csv" + with open(file_name, mode="w") as fd: + print("File '" + file_name + "' is generating") + fd.write(copy_csv) + + return 1 - if hsa_mcopy_ptrn.match(record_name) or hip_mcopy_ptrn.match(record_name): - ops_section_id = COPY_PID - else: - ops_section_id = GPU_BASE_PID + int(gpu_id) - - if (proc_id,stream_id) not in last_hip_api_map: - last_hip_api_map[(proc_id,stream_id)] = [] - last_hip_api_map[(proc_id, stream_id)].append((corr_id, ops_section_id)) - - # asyncronous opeartion API found - op_found = 0 - mcopy_found = 0 - - # extract kernel name string - (kernel_str, kernel_found) = get_field(record_args, 'kernel') - if kernel_found == 
0: kernel_str = '' - else: op_found = 1 - - if hip_mcopy_ptrn.match(record_name): - mcopy_found = 1 - op_found = 1 - - # HIP Graph API - if hip_graph_ptrn.search(record_name): - op_found = 1 - - # HIP WaitEvent API - if wait_event_ptrn.search(record_name): - op_found = 1 - - if hip_stream_wait_write_ptrn.search(record_name): - op_found = 1 - - # HSA memcopy API - if hsa_mcopy_ptrn.match(record_name): - mcopy_found = 1 - op_found = 1 - - stream_id = thread_id - hsa_patch_data[(corr_id, proc_id)] = thread_id - - if op_found: - roctx_msg = '' - - if not thread_id in range_start_times: - range_start_times[thread_id] = sorted(range_data[thread_id].keys()) if thread_id in range_data else [] - start_times = range_start_times[thread_id] - - index = bisect.bisect_right(start_times,int(rec_vals[0])) - if index > 0: - # We found the range that is closest to this operation. Iterate the - # range stack this range is part of until we find a range that entirely - # contains the operation. - range_start = start_times[index - 1] - while range_start != 0: - (range_end, range_start, msg) = range_data[thread_id][range_start] - if int(rec_vals[1]) < range_end: - # This range contains the operation. - roctx_msg = msg - break - - ops_patch_data[(corr_id, proc_id)] = (thread_id, stream_id, kernel_str, roctx_msg) - - if op_found: - op_found = 0 - beg_ns = int(rec_vals[0]) - end_ns = int(rec_vals[1]) - dur_us = int((end_ns - beg_ns) / 1000) - from_us = int((beg_ns - START_NS) / 1000) + dur_us/2 - if api_pid == HIP_PID or hsa_copy_deps == 1: - if not proc_id in dep_dict: dep_dict[proc_id] = {} - dep_proc = dep_dict[proc_id] - if not dep_pid in dep_proc: - if api_pid == 'HIP_PID': dep_proc[dep_pid] = { 'pid': api_pid, 'from': [], 'id': [] } - else: dep_proc[dep_pid] = { 'pid': api_pid, 'from': [], 'id': [], 'to': {} } - dep_str = dep_proc[dep_pid] - dep_str['from'].append((from_us, stream_id, thread_id)) - if expl_id: dep_str['id'].append(corr_id) - - # memcopy registering - api_data = memory_manager.register_api(rec_vals) if mcopy_data_enabled else '' - rec_vals.append(api_data) - - # setting section and lane - rec_vals.append(api_pid) # __section - rec_vals.append(thread_id) # __lane - - # inserting an API record to DB - db.insert_entry(table_handle, rec_vals) - - # inserting of dispatch events correlated to the dependent dispatches - for (from_ns, proc_id, thread_id) in dep_list: - if not proc_id in record_id_dict: record_id_dict[proc_id] = 0 - record_id_dict[proc_id] += 1 - corr_id = record_id_dict[proc_id] - db.insert_entry(table_handle, [from_ns, from_ns, proc_id, thread_id, 'hsa_dispatch', '', corr_id, '', api_pid, thread_id]) - - # generating memcopy CSV - if copy_csv != '': - file_name = os.environ['PWD'] + '/results_mcopy.csv' - with open(file_name, mode='w') as fd: - print("File '" + file_name + "' is generating") - fd.write(copy_csv) - - return 1 ############################################################# # fill COPY DB copy_table_descr = [ - ['BeginNs', 'EndNs', 'Name', 'pid', 'tid', 'Index', 'Data', '__section', '__lane'], - {'Index':'INTEGER', 'Name':'TEXT', 'args':'TEXT', 'BeginNs':'INTEGER', 'EndNs':'INTEGER', 'pid':'INTEGER', 'tid':'INTEGER', 'Data':'TEXT', '__section':'INTEGER', '__lane':'INTEGER'} + ["BeginNs", "EndNs", "Name", "pid", "tid", "Index", "Data", "__section", "__lane"], + { + "Index": "INTEGER", + "Name": "TEXT", + "args": "TEXT", + "BeginNs": "INTEGER", + "EndNs": "INTEGER", + "pid": "INTEGER", + "tid": "INTEGER", + "Data": "TEXT", + "__section": "INTEGER", + "__lane": "INTEGER", + 
},
 ]
 
 
-def fill_copy_db(table_name, db, indir):
-  sect_id = COPY_PID
-  file_name = indir + '/' + 'async_copy_trace.txt'
-  ptrn_val = re.compile(r'^(\d+):(\d+) (async-copy):(\d+):(\d+)$')
-
-  if not os.path.isfile(file_name): return 0
-
-  table_handle = db.add_table(table_name, copy_table_descr)
-  with open(file_name, mode='r') as fd:
-    for line in fd.readlines():
-      record = line[:-1]
-      m = ptrn_val.match(record)
-      if not m: fatal("bad async-copy entry '" + record + "'")
-      else:
-        rec_vals = []
-        for ind in range(1,4): rec_vals.append(m.group(ind))
-        corr_id = int(m.group(4))
-        proc_id = int(m.group(5))
-
-        # querying tid value
-        if (corr_id, proc_id) in hsa_patch_data:
-          thread_id = hsa_patch_data[(corr_id, proc_id)]
-        else:
-          thread_id = -1
-
-        # completing record
-        rec_vals.append(proc_id) # tid
-        rec_vals.append(thread_id) # tid
-        rec_vals.append(corr_id) # Index
-        # registering memcopy information
-        activity_data = memory_manager.register_copy(rec_vals) if mcopy_data_enabled else ''
-        rec_vals.append(activity_data)
-        # appending straem ID and section ID
-        rec_vals.append(COPY_PID) # __section
-        rec_vals.append(thread_id) # __lane
-
-        # inserting DB activity entry
-        db.insert_entry(table_handle, rec_vals)
-
-        # filling dependencies
-        to_ns = int(rec_vals[0])
-        to_us = int((to_ns - START_NS) / 1000)
+def fill_copy_db(table_name, db, indir):
+    sect_id = COPY_PID
+    file_name = indir + "/" + "async_copy_trace.txt"
+    ptrn_val = re.compile(r"^(\d+):(\d+) (async-copy):(\d+):(\d+)$")
+
+    if not os.path.isfile(file_name):
+        return 0
+
+    table_handle = db.add_table(table_name, copy_table_descr)
+    with open(file_name, mode="r") as fd:
+        for line in fd.readlines():
+            record = line[:-1]
+            m = ptrn_val.match(record)
+            if not m:
+                fatal("bad async-copy entry '" + record + "'")
+            else:
+                rec_vals = []
+                for ind in range(1, 4):
+                    rec_vals.append(m.group(ind))
+                corr_id = int(m.group(4))
+                proc_id = int(m.group(5))
+
+                # querying tid value
+                if (corr_id, proc_id) in hsa_patch_data:
+                    thread_id = hsa_patch_data[(corr_id, proc_id)]
+                else:
+                    thread_id = -1
+
+                # completing record
+                rec_vals.append(proc_id)  # pid
+                rec_vals.append(thread_id)  # tid
+                rec_vals.append(corr_id)  # Index
+
+                # registering memcopy information
+                activity_data = (
+                    memory_manager.register_copy(rec_vals) if mcopy_data_enabled else ""
+                )
+                rec_vals.append(activity_data)
+
+                # appending stream ID and section ID
+                rec_vals.append(COPY_PID)  # __section
+                rec_vals.append(thread_id)  # __lane
+
+                # inserting DB activity entry
+                db.insert_entry(table_handle, rec_vals)
+
+                # filling dependencies
+                to_ns = int(rec_vals[0])
+                to_us = int((to_ns - START_NS) / 1000)
+
+                if thread_id != -1:
+                    # if not proc_id in dep_dict: dep_dict[proc_id] = {}
+                    dep_proc = dep_dict[proc_id]
+                    # if not pid in dep_proc: dep_proc[pid] = { 'pid': HSA_PID, 'from': [], 'to': {}, 'id': [] }
+                    dep_str = dep_proc[sect_id]
+                    dep_str["to"][corr_id] = to_us
+                    dep_str["id"].append(corr_id)
+
+    return 1
 
-        if thread_id != -1:
-          #if not proc_id in dep_dict: dep_dict[proc_id] = {}
-          dep_proc = dep_dict[proc_id]
-          #if not pid in dep_proc: dep_proc[pid] = { 'pid': HSA_PID, 'from': [], 'to': {}, 'id': [] }
-          dep_str = dep_proc[sect_id]
-          dep_str['to'][corr_id] = to_us
-          dep_str['id'].append(corr_id)
-  return 1
 
 #############################################################
 # fill HCC ops DB
 ops_table_descr = [
-  ['BeginNs', 'EndNs', 'dev-id', 'queue-id', 'Name', 'pid', 'tid', 'roctx-range', 'stream-id', 'Index', 'Data', '__section', '__lane'],
-  {'Index':'INTEGER', 'Name':'TEXT', 
'args':'TEXT', 'BeginNs':'INTEGER', 'EndNs':'INTEGER', 'dev-id':'INTEGER', 'queue-id':'INTEGER', 'pid':'INTEGER', 'tid':'INTEGER', 'roctx-range':'TEXT', 'Data':'TEXT', 'stream-id':'INTEGER', '__section':'INTEGER', '__lane':'INTEGER'} + [ + "BeginNs", + "EndNs", + "dev-id", + "queue-id", + "Name", + "pid", + "tid", + "roctx-range", + "stream-id", + "Index", + "Data", + "__section", + "__lane", + ], + { + "Index": "INTEGER", + "Name": "TEXT", + "args": "TEXT", + "BeginNs": "INTEGER", + "EndNs": "INTEGER", + "dev-id": "INTEGER", + "queue-id": "INTEGER", + "pid": "INTEGER", + "tid": "INTEGER", + "roctx-range": "TEXT", + "Data": "TEXT", + "stream-id": "INTEGER", + "__section": "INTEGER", + "__lane": "INTEGER", + }, ] -def fill_ops_db(kernel_table_name, mcopy_table_name, db, indir): - global max_gpu_id - file_name = indir + '/' + 'hcc_ops_trace.txt' - ptrn_val = re.compile(r'(\d+):(\d+) (\d+):(\d+) (.*)$') - ptrn_id = re.compile(r'^([^:]+):(\d+):(\d+)$') - ptrn_mcopy = re.compile(r'(Memcpy|Copy|Fill)') - ptrn_barrier = re.compile(r'Marker') - - if not os.path.isfile(file_name): return {} - - filtr = {} - - kernel_table_handle = db.add_table(kernel_table_name, ops_table_descr) - mcopy_table_handle = db.add_table(mcopy_table_name, ops_table_descr) - with open(file_name, mode='r') as fd: - file_lines = fd.readlines() - total_lines = len(file_lines) - line_index = 0 - for line in file_lines: - if (line_index == total_lines - 1) or (line_index % 100 == 0): - sys.stdout.write( \ - "\rscan ops data " + str(line_index) + ":" + str(total_lines) + " "*100 \ - ) - line_index += 1 - record = line[:-1] - m = ptrn_val.match(record) - if m: - # parsing trace record - rec_vals = [] - for ind in range(1,6): rec_vals.append(m.group(ind)) - label = rec_vals[4] # record name - m = ptrn_id.match(label) - if not m: fatal("bad hcc ops entry '" + record + "'") - name = m.group(1) - corr_id = int(m.group(2)) - proc_id = int(m.group(3)) - - # checking name for memcopy pattern - is_barrier = 0 - if ptrn_mcopy.search(name): - rec_table_name = mcopy_table_name - table_handle = mcopy_table_handle - sect_id = COPY_PID; - else: - rec_table_name = kernel_table_name - table_handle = kernel_table_handle - - gpu_id = int(rec_vals[2]); - if (gpu_id > max_gpu_id): max_gpu_id = gpu_id - sect_id = GPU_BASE_PID + int(gpu_id) - - if ptrn_barrier.search(name): - name = '""' - is_barrier = 1 - - thread_id = 0 - stream_id = 0 - roctx_range = '' - if (corr_id, proc_id) in ops_patch_data: - (thread_id, stream_id, name_patch, roctx_range) = ops_patch_data[(corr_id, proc_id)] - if name_patch != '': name = name_patch - if roctx_range == '': roctx_range = name - else: - if is_barrier: continue - else: - if "ROCP_CTRL_RATE" in os.environ: continue - else: fatal("hcc ops data not found: '" + record + "', " + str(corr_id) + ", " + str(proc_id)) - - # activity record - rec_vals[4] = name # Name - rec_vals.append(proc_id) # pid - rec_vals.append(thread_id) # tid - rec_vals.append(roctx_range) # roctx-range - rec_vals.append(stream_id) # StreamId - rec_vals.append(corr_id) # Index - - # registering memcopy information - activity_data = memory_manager.register_activity(rec_vals) if mcopy_data_enabled else '' - rec_vals.append(activity_data) - - # activity record data for stream ID and sction ID - rec_vals.append(sect_id) # __section - rec_vals.append(stream_id) # __lane - - # inserting DB activity entry - db.insert_entry(table_handle, rec_vals) - - # registering a dependency filtr - filtr[(corr_id, proc_id)] = rec_table_name - - # filling a 
dependencies
-        to_ns = int(rec_vals[0])
-        to_us = int((to_ns - START_NS) / 1000)
-
-        end_ns = int(rec_vals[1])
-        dur_us = int((end_ns - to_ns) / 1000)
-
-        if (corr_id, proc_id) in from_ids:
-          depid = from_ids[(corr_id, proc_id)]
-          from_val = dep_dict[proc_id][HIP_PID]['from'][depid]
-          print("from_val" + str(from_val))
-          from_val_new = (to_us + dur_us, from_val[1], from_val[2])
-          dep_dict[proc_id][HIP_PID]['from'][depid] = from_val_new
-
-        if not proc_id in dep_dict: dep_dict[proc_id] = {}
-        dep_proc = dep_dict[proc_id]
-        if not sect_id in dep_proc: dep_proc[sect_id] = { 'bsp': OPS_PID, 'to': {} }
-        dep_str = dep_proc[sect_id]
-        dep_str['to'][corr_id] = to_us
-      else:
-        fatal("hcc ops bad record: '" + record + "'")
+def fill_ops_db(kernel_table_name, mcopy_table_name, db, indir):
+    global max_gpu_id
+    file_name = indir + "/" + "hcc_ops_trace.txt"
+    ptrn_val = re.compile(r"(\d+):(\d+) (\d+):(\d+) (.*)$")
+    ptrn_id = re.compile(r"^([^:]+):(\d+):(\d+)$")
+    ptrn_mcopy = re.compile(r"(Memcpy|Copy|Fill)")
+    ptrn_barrier = re.compile(r"Marker")
+
+    if not os.path.isfile(file_name):
+        return {}
+
+    filtr = {}
+
+    kernel_table_handle = db.add_table(kernel_table_name, ops_table_descr)
+    mcopy_table_handle = db.add_table(mcopy_table_name, ops_table_descr)
+    with open(file_name, mode="r") as fd:
+        file_lines = fd.readlines()
+        total_lines = len(file_lines)
+        line_index = 0
+        for line in file_lines:
+            if (line_index == total_lines - 1) or (line_index % 100 == 0):
+                sys.stdout.write(
+                    "\rscan ops data "
+                    + str(line_index)
+                    + ":"
+                    + str(total_lines)
+                    + " " * 100
+                )
+            line_index += 1
+
+            record = line[:-1]
+            m = ptrn_val.match(record)
+            if m:
+                # parsing trace record
+                rec_vals = []
+                for ind in range(1, 6):
+                    rec_vals.append(m.group(ind))
+                label = rec_vals[4]  # record name
+                m = ptrn_id.match(label)
+                if not m:
+                    fatal("bad hcc ops entry '" + record + "'")
+                name = m.group(1)
+                corr_id = int(m.group(2))
+                proc_id = int(m.group(3))
+
+                # checking name for memcopy pattern
+                is_barrier = 0
+                if ptrn_mcopy.search(name):
+                    rec_table_name = mcopy_table_name
+                    table_handle = mcopy_table_handle
+                    sect_id = COPY_PID
+                else:
+                    rec_table_name = kernel_table_name
+                    table_handle = kernel_table_handle
+
+                    gpu_id = int(rec_vals[2])
+                    if gpu_id > max_gpu_id:
+                        max_gpu_id = gpu_id
+                    sect_id = GPU_BASE_PID + int(gpu_id)
+
+                if ptrn_barrier.search(name):
+                    name = '""'
+                    is_barrier = 1
+
+                thread_id = 0
+                stream_id = 0
+                roctx_range = ""
+                if (corr_id, proc_id) in ops_patch_data:
+                    (thread_id, stream_id, name_patch, roctx_range) = ops_patch_data[
+                        (corr_id, proc_id)
+                    ]
+                    if name_patch != "":
+                        name = name_patch
+                    if roctx_range == "":
+                        roctx_range = name
+                else:
+                    if is_barrier:
+                        continue
+                    else:
+                        if "ROCP_CTRL_RATE" in os.environ:
+                            continue
+                        else:
+                            fatal(
+                                "hcc ops data not found: '"
+                                + record
+                                + "', "
+                                + str(corr_id)
+                                + ", "
+                                + str(proc_id)
+                            )
+
+                # activity record
+                rec_vals[4] = name  # Name
+                rec_vals.append(proc_id)  # pid
+                rec_vals.append(thread_id)  # tid
+                rec_vals.append(roctx_range)  # roctx-range
+                rec_vals.append(stream_id)  # StreamId
+                rec_vals.append(corr_id)  # Index
+
+                # registering memcopy information
+                activity_data = (
+                    memory_manager.register_activity(rec_vals)
+                    if mcopy_data_enabled
+                    else ""
+                )
+                rec_vals.append(activity_data)
+
+                # activity record data for stream ID and section ID
+                rec_vals.append(sect_id)  # __section
+                rec_vals.append(stream_id)  # __lane
+
+                # inserting DB activity entry
+                db.insert_entry(table_handle, rec_vals)
+
+                # registering a dependency filter
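+                # For illustration (hypothetical values, not from a real trace):
+                # a kernel dispatch with corr_id 7 in process 1234 that was routed
+                # to the kernel table would record filtr[(7, 1234)] = "OPS", so
+                # later dependency passes know which table the activity landed in.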
+                filtr[(corr_id, proc_id)] = rec_table_name
+
+                # filling dependencies
+                to_ns = int(rec_vals[0])
+                to_us = int((to_ns - START_NS) / 1000)
+
+                end_ns = int(rec_vals[1])
+                dur_us = int((end_ns - to_ns) / 1000)
+
+                if (corr_id, proc_id) in from_ids:
+                    depid = from_ids[(corr_id, proc_id)]
+                    from_val = dep_dict[proc_id][HIP_PID]["from"][depid]
+                    print("from_val " + str(from_val))
+                    from_val_new = (to_us + dur_us, from_val[1], from_val[2])
+                    dep_dict[proc_id][HIP_PID]["from"][depid] = from_val_new
+
+                if not proc_id in dep_dict:
+                    dep_dict[proc_id] = {}
+                dep_proc = dep_dict[proc_id]
+                if not sect_id in dep_proc:
+                    dep_proc[sect_id] = {"bsp": OPS_PID, "to": {}}
+                dep_str = dep_proc[sect_id]
+                dep_str["to"][corr_id] = to_us
+
+            else:
+                fatal("hcc ops bad record: '" + record + "'")
+
+    return filtr
+
 
-    return filtr
 #############################################################
 # main
 
-if (len(sys.argv) < 2): fatal("Usage: " + sys.argv[0] + " ")
+if len(sys.argv) < 2:
+    fatal("Usage: " + sys.argv[0] + " ")
 
 outfile = sys.argv[1]
 infiles = sys.argv[2:]
 
-indir = re.sub(r'\/[^\/]*$', r'', infiles[0])
-inext = re.sub(r'\s+$', r'', infiles[0])
-inext = re.sub(r'^.*(\.[^\.]+)$', r'\1', inext)
-
-dbfile = ''
-csvfile = ''
-
-if 'ROCP_JSON_REBASE' in os.environ and os.environ['ROCP_JSON_REBASE'] == 0:
-  begin_ts_file = indir + '/begin_ts_file.txt'
-  if os.path.isfile(begin_ts_file):
-    with open(begin_ts_file, mode='r') as fd:
-      ind = 0
-      for line in fd.readlines():
-        val = int(line)
-        if ind == 0 or val < START_NS: START_NS = val
-        ind += 1
-    print('START timestamp found (' + str(START_NS) + 'ns)')
-
-if re.search(r'\.csv$', outfile):
-  csvfile = outfile
-elif re.search(r'\.db$', outfile):
-  dbfile = outfile
-  csvfile = re.sub(r'\.db$', '.csv', outfile)
+indir = re.sub(r"\/[^\/]*$", r"", infiles[0])
+inext = re.sub(r"\s+$", r"", infiles[0])
+inext = re.sub(r"^.*(\.[^\.]+)$", r"\1", inext)
+
+dbfile = ""
+csvfile = ""
+
+if "ROCP_JSON_REBASE" in os.environ and os.environ["ROCP_JSON_REBASE"] == 0:
+    begin_ts_file = indir + "/begin_ts_file.txt"
+    if os.path.isfile(begin_ts_file):
+        with open(begin_ts_file, mode="r") as fd:
+            ind = 0
+            for line in fd.readlines():
+                val = int(line)
+                if ind == 0 or val < START_NS:
+                    START_NS = val
+                ind += 1
+        print("START timestamp found (" + str(START_NS) + "ns)")
+
+if re.search(r"\.csv$", outfile):
+    csvfile = outfile
+elif re.search(r"\.db$", outfile):
+    dbfile = outfile
+    csvfile = re.sub(r"\.db$", ".csv", outfile)
 else:
-  fatal("Bad output file '" + outfile + "'")
+    fatal("Bad output file '" + outfile + "'")
 
-if inext == '.txt':
-  for f in infiles: parse_res(f)
-  if len(var_table) != 0: merge_table()
+if inext == ".txt":
+    for f in infiles:
+        parse_res(f)
+    if len(var_table) != 0:
+        merge_table()
 
-if dbfile == '':
-  dump_csv(csvfile)
+if dbfile == "":
+    dump_csv(csvfile)
 else:
-  statfile = re.sub(r'\.csv$', '.stats.csv', csvfile)
-  jsonfile = re.sub(r'\.csv$', '.json', csvfile)
-
-  hsa_statfile = re.sub(r'\.stats\.csv$', r'.hsa_stats.csv', statfile)
-  hip_statfile = re.sub(r'\.stats\.csv$', r'.hip_stats.csv', statfile)
-  ops_statfile = statfile
-  copy_statfile = re.sub(r'\.stats\.csv$', r'.copy_stats.csv', statfile)
-  memcopy_info_file = re.sub(r'\.stats\.csv$', r'.memcopy_info.csv', statfile)
-  sysinfo_file = re.sub(r'\.stats\.csv$', r'.sysinfo.txt', statfile)
-  metadata_gen(sysinfo_file, "@ROCMINFO_EXEC@")
-
-  with open(dbfile, mode='w') as fd: fd.truncate()
-  db = SQLiteDB(dbfile)
-  memory_manager = MemManager(db, indir)
-
-  ext_trace_found = fill_ext_db('rocTX', db, 
indir, 'roctx', EXT_PID) - - hsa_trace_found = fill_api_db('HSA', db, indir, 'hsa', HSA_PID, COPY_PID, kern_dep_list, {}, 0) - hsa_activity_found = fill_copy_db('COPY', db, indir) - - hip_trace_found = fill_api_db('HIP', db, indir, 'hip', HIP_PID, OPS_PID, [], {}, 1) - ops_filtr = fill_ops_db('OPS', 'COPY', db, indir) - - fill_kernel_db('KERN', db) - - any_trace_found = ext_trace_found | hsa_trace_found | hip_trace_found - copy_trace_found = 0 - if hsa_activity_found or len(ops_filtr): copy_trace_found = 1 - - if any_trace_found: - db.open_json(jsonfile) - - if ext_trace_found: - db.label_json(EXT_PID, "Markers and Ranges", jsonfile) - - if hip_trace_found: - db.label_json(HIP_PID, "CPU HIP API", jsonfile) - - if hsa_trace_found: - db.label_json(HSA_PID, "CPU HSA API", jsonfile) - - db.label_json(COPY_PID, "COPY", jsonfile) - - if any_trace_found and max_gpu_id >= 0: - for ind in range(0, int(max_gpu_id) + 1): - db.label_json(int(ind) + int(GPU_BASE_PID), "GPU" + str(ind), jsonfile) - - if ext_trace_found: - dform.gen_ext_json_trace(db, 'rocTX', START_NS, jsonfile) - - if len(var_table) != 0: - dform.post_process_data(db, 'KERN', csvfile) - dform.gen_table_bins(db, 'KERN', statfile, 'KernelName', 'DurationNs') - if hsa_trace_found and 'BeginNs' in var_list: - dform.gen_kernel_json_trace(db, 'KERN', GPU_BASE_PID, START_NS, jsonfile) - - if hsa_trace_found: - dform.post_process_data(db, 'HSA') - dform.gen_table_bins(db, 'HSA', hsa_statfile, 'Name', 'DurationNs') - dform.gen_api_json_trace(db, 'HSA', START_NS, jsonfile) - - if copy_trace_found: - dform.post_process_data(db, 'COPY') - dform.gen_table_bins(db, 'COPY', copy_statfile, 'Name', 'DurationNs') - dform.gen_api_json_trace(db, 'COPY', START_NS, jsonfile) - - if hip_trace_found: - dform.post_process_data(db, 'HIP') - dform.gen_table_bins(db, 'HIP', hip_statfile, 'Name', 'DurationNs') - dform.gen_api_json_trace(db, 'HIP', START_NS, jsonfile) - - if ops_filtr: - dform.post_process_data(db, 'OPS') - dform.gen_table_bins(db, 'OPS', ops_statfile, 'Name', 'DurationNs') - dform.gen_ops_json_trace(db, 'OPS', GPU_BASE_PID, START_NS, jsonfile) - - if any_trace_found: - dep_id = 0 - for (proc_id, dep_proc) in dep_dict.items(): - for (to_pid, dep_str) in dep_proc.items(): - if 'bsp' in dep_str: - bspid = dep_str['bsp'] - base_str = dep_proc[bspid] - for v in ('pid', 'from', 'id'): - dep_str[v] = base_str[v] - base_str['inv'] = 1 - - for (to_pid, dep_str) in dep_proc.items(): - if 'inv' in dep_str: continue - if not 'to' in dep_str: continue - - from_pid = dep_str['pid'] - from_us_list = dep_str['from'] - to_us_dict = dep_str['to'] - corr_id_list = dep_str['id'] - - db.flow_json(dep_id, from_pid, from_us_list, to_pid, to_us_dict, corr_id_list, jsonfile) - dep_id += len(from_us_list) - - if any_trace_found: - db.metadata_json(jsonfile, sysinfo_file) - db.close_json(jsonfile) - - if mcopy_data_enabled: - memory_manager.dump_data('MM', memcopy_info_file) - - db.close() + statfile = re.sub(r"\.csv$", ".stats.csv", csvfile) + jsonfile = re.sub(r"\.csv$", ".json", csvfile) + + hsa_statfile = re.sub(r"\.stats\.csv$", r".hsa_stats.csv", statfile) + hip_statfile = re.sub(r"\.stats\.csv$", r".hip_stats.csv", statfile) + ops_statfile = statfile + copy_statfile = re.sub(r"\.stats\.csv$", r".copy_stats.csv", statfile) + memcopy_info_file = re.sub(r"\.stats\.csv$", r".memcopy_info.csv", statfile) + sysinfo_file = re.sub(r"\.stats\.csv$", r".sysinfo.txt", statfile) + metadata_gen(sysinfo_file, "@ROCMINFO_EXEC@") + + with open(dbfile, mode="w") as fd: + 
fd.truncate() + db = SQLiteDB(dbfile) + memory_manager = MemManager(db, indir) + + ext_trace_found = fill_ext_db("rocTX", db, indir, "roctx", EXT_PID) + + hsa_trace_found = fill_api_db( + "HSA", db, indir, "hsa", HSA_PID, COPY_PID, kern_dep_list, {}, 0 + ) + hsa_activity_found = fill_copy_db("COPY", db, indir) + + hip_trace_found = fill_api_db("HIP", db, indir, "hip", HIP_PID, OPS_PID, [], {}, 1) + ops_filtr = fill_ops_db("OPS", "COPY", db, indir) + + fill_kernel_db("KERN", db) + + any_trace_found = ext_trace_found | hsa_trace_found | hip_trace_found + copy_trace_found = 0 + if hsa_activity_found or len(ops_filtr): + copy_trace_found = 1 + + if any_trace_found: + db.open_json(jsonfile) + + if ext_trace_found: + db.label_json(EXT_PID, "Markers and Ranges", jsonfile) + + if hip_trace_found: + db.label_json(HIP_PID, "CPU HIP API", jsonfile) + + if hsa_trace_found: + db.label_json(HSA_PID, "CPU HSA API", jsonfile) + + db.label_json(COPY_PID, "COPY", jsonfile) + + if any_trace_found and max_gpu_id >= 0: + for ind in range(0, int(max_gpu_id) + 1): + db.label_json(int(ind) + int(GPU_BASE_PID), "GPU" + str(ind), jsonfile) + + if ext_trace_found: + dform.gen_ext_json_trace(db, "rocTX", START_NS, jsonfile) + + if len(var_table) != 0: + dform.post_process_data(db, "KERN", csvfile) + dform.gen_table_bins(db, "KERN", statfile, "KernelName", "DurationNs") + if hsa_trace_found and "BeginNs" in var_list: + dform.gen_kernel_json_trace(db, "KERN", GPU_BASE_PID, START_NS, jsonfile) + + if hsa_trace_found: + dform.post_process_data(db, "HSA") + dform.gen_table_bins(db, "HSA", hsa_statfile, "Name", "DurationNs") + dform.gen_api_json_trace(db, "HSA", START_NS, jsonfile) + + if copy_trace_found: + dform.post_process_data(db, "COPY") + dform.gen_table_bins(db, "COPY", copy_statfile, "Name", "DurationNs") + dform.gen_api_json_trace(db, "COPY", START_NS, jsonfile) + + if hip_trace_found: + dform.post_process_data(db, "HIP") + dform.gen_table_bins(db, "HIP", hip_statfile, "Name", "DurationNs") + dform.gen_api_json_trace(db, "HIP", START_NS, jsonfile) + + if ops_filtr: + dform.post_process_data(db, "OPS") + dform.gen_table_bins(db, "OPS", ops_statfile, "Name", "DurationNs") + dform.gen_ops_json_trace(db, "OPS", GPU_BASE_PID, START_NS, jsonfile) + + if any_trace_found: + dep_id = 0 + for proc_id, dep_proc in dep_dict.items(): + for to_pid, dep_str in dep_proc.items(): + if "bsp" in dep_str: + bspid = dep_str["bsp"] + base_str = dep_proc[bspid] + for v in ("pid", "from", "id"): + dep_str[v] = base_str[v] + base_str["inv"] = 1 + + for to_pid, dep_str in dep_proc.items(): + if "inv" in dep_str: + continue + if not "to" in dep_str: + continue + + from_pid = dep_str["pid"] + from_us_list = dep_str["from"] + to_us_dict = dep_str["to"] + corr_id_list = dep_str["id"] + + db.flow_json( + dep_id, + from_pid, + from_us_list, + to_pid, + to_us_dict, + corr_id_list, + jsonfile, + ) + dep_id += len(from_us_list) + + if any_trace_found: + db.metadata_json(jsonfile, sysinfo_file) + db.close_json(jsonfile) + + if mcopy_data_enabled: + memory_manager.dump_data("MM", memcopy_info_file) + + db.close() sys.exit(0) ############################################################# - diff --git a/bin/txt2params.py b/bin/txt2params.py index 7944029f..4be34266 100644 --- a/bin/txt2params.py +++ b/bin/txt2params.py @@ -22,89 +22,92 @@ import os, sys, re + # gen_params() takes a text file like the output of rocminfo cmd and parses it into a map {key,value} # where key is the param and value is the value of this param # for example: Threadmodel : 
"posix" # it also processes encompasing sections to generate a full param name such as (section names separated by '_'): # "Agent2_PoolInfo_ISAInfo_ISA1_WorkgroupMaxSizeperDimension_x": "1024(0x400)", def gen_params(txtfile): - fields = {} - counter = 0 - parent_field = '' - nbr_indent = 0 - nbr_indent_prev = 0 - check_for_dims = False - with open(txtfile) as fp: - for line in fp: - me = re.match(r'\*\*\* Done \*\*\*',line) #Marks the end of cmd - if me: - parent_field = '' - nbr_indent = 0 - nbr_indent_prev = 0 - check_for_dims = False - continue - mv = re.match(r'HCC clang version\s+(.*)',line) # outlier: only line with a version number and no ':', special case - if mv: - key = 'HCCclangversion' - val = mv.group(1) - counter = counter + 1 - fields[(counter,key)] = val - continue - # Variable 'check_for_dims' is True for text like this: - # Workgroup Max Size per Dimension: - # x 1024(0x400) - # y 1024(0x400) - # z 1024(0x400) - if check_for_dims == True: - mc = re.match(r'\s*([x|y|z])\s+(.*)',line) - if mc: - key_sav = mc.group(1) - if parent_field != '': - key = parent_field + '.' + mc.group(1) - else: - key = mc.group(1) - val = re.sub(r"\s+", "", mc.group(2)) - counter = counter + 1 - fields[(counter,key)] = val - if key_sav == 'z': - check_for_dims = False - nbr_indent_prev = nbr_indent - mi = re.search(r'^(\s+)\w+.*', line) - md = re.search(':', line) - if mi: - nbr_indent = int(len(mi.group(1)) / 2) #indentation cnt - else: - if not md: - tmp = re.sub(r"\s+", "", line) - if tmp.isalnum(): - parent_field = tmp + fields = {} + counter = 0 + parent_field = "" + nbr_indent = 0 + nbr_indent_prev = 0 + check_for_dims = False + with open(txtfile) as fp: + for line in fp: + me = re.match(r"\*\*\* Done \*\*\*", line) # Marks the end of cmd + if me: + parent_field = "" + nbr_indent = 0 + nbr_indent_prev = 0 + check_for_dims = False + continue + mv = re.match( + r"HCC clang version\s+(.*)", line + ) # outlier: only line with a version number and no ':', special case + if mv: + key = "HCCclangversion" + val = mv.group(1) + counter = counter + 1 + fields[(counter, key)] = val + continue + # Variable 'check_for_dims' is True for text like this: + # Workgroup Max Size per Dimension: + # x 1024(0x400) + # y 1024(0x400) + # z 1024(0x400) + if check_for_dims == True: + mc = re.match(r"\s*([x|y|z])\s+(.*)", line) + if mc: + key_sav = mc.group(1) + if parent_field != "": + key = parent_field + "." + mc.group(1) + else: + key = mc.group(1) + val = re.sub(r"\s+", "", mc.group(2)) + counter = counter + 1 + fields[(counter, key)] = val + if key_sav == "z": + check_for_dims = False + nbr_indent_prev = nbr_indent + mi = re.search(r"^(\s+)\w+.*", line) + md = re.search(":", line) + if mi: + nbr_indent = int(len(mi.group(1)) / 2) # indentation cnt + else: + if not md: + tmp = re.sub(r"\s+", "", line) + if tmp.isalnum(): + parent_field = tmp - if nbr_indent < nbr_indent_prev: - go_back_parent = (nbr_indent_prev - nbr_indent) - for i in range(go_back_parent): #decrease as many levels up as needed - pos = parent_field.rfind('.') - if pos != -1: - parent_field = parent_field[:pos] - # Process lines such as : - # Segment: GLOBAL; FLAGS: KERNARG, FINE GRAINED - # Size: 131897644(0x7dc992c) KB - for lin in line.split(';'): - lin = re.sub(r"\s+", "", lin) - m = re.match(r'(.*):(.*)', lin) - if m: - key, val = m.group(1), m.group(2) - if parent_field != '': - key = parent_field + '.' 
+ key - if val == '': - mk = re.match(r'.*Dimension',key) - if mk: # expect x,y,z on next 3 lines - check_for_dims = True - parent_field = key - else: - counter = counter + 1 - fields[(counter,key)] = val - else: - if nbr_indent != nbr_indent_prev and not check_for_dims : - parent_field = parent_field + '.' + lin.replace(':','') + if nbr_indent < nbr_indent_prev: + go_back_parent = nbr_indent_prev - nbr_indent + for i in range(go_back_parent): # decrease as many levels up as needed + pos = parent_field.rfind(".") + if pos != -1: + parent_field = parent_field[:pos] + # Process lines such as : + # Segment: GLOBAL; FLAGS: KERNARG, FINE GRAINED + # Size: 131897644(0x7dc992c) KB + for lin in line.split(";"): + lin = re.sub(r"\s+", "", lin) + m = re.match(r"(.*):(.*)", lin) + if m: + key, val = m.group(1), m.group(2) + if parent_field != "": + key = parent_field + "." + key + if val == "": + mk = re.match(r".*Dimension", key) + if mk: # expect x,y,z on next 3 lines + check_for_dims = True + parent_field = key + else: + counter = counter + 1 + fields[(counter, key)] = val + else: + if nbr_indent != nbr_indent_prev and not check_for_dims: + parent_field = parent_field + "." + lin.replace(":", "") - return fields + return fields diff --git a/build.sh b/build.sh index 33ce9e5f..d1a0078a 100755 --- a/build.sh +++ b/build.sh @@ -50,7 +50,11 @@ while [ 1 ] ; do elif [[ "$1" = "-cb" || "$1" = "--clean-build" ]] ; then TO_CLEAN=yes shift - elif [[ "$1" = "-"* || "$1" = "--"* ]] ; then + elif [ "$1" = "--" ] ; then + shift + EXTRA_BUILD_ARGS=$@ + break + elif [[ "$1" = "-"* ]] ; then echo -e "Wrong option \"$1\", Please use the following options:\n" usage exit 1 @@ -73,14 +77,14 @@ if [ -z "$RUN_TEST" ] ; then RUN_TEST=no; fi if [ -z "$ASAN" ] ; then ASAN=False; fi if [ -z "$GPU_LIST" ] ; then GPU_LIST="gfx900 gfx906 gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030 gfx1100 gfx1101 gfx1102"; fi - ROCPROFILER_ROOT=$(cd $ROCPROFILER_ROOT && echo $PWD) if [ "$TO_CLEAN" = "yes" ] ; then rm -rf $BUILD_DIR; fi -mkdir -p $BUILD_DIR -pushd $BUILD_DIR -cmake \ +cmake -B ${BUILD_DIR} ${ROCPROFILER_ROOT} \ + -DROCPROFILER_BUILD_CI=1 \ + -DROCPROFILER_BUILD_TESTS=1 \ + -DROCPROFILER_BUILD_SAMPLES=1 \ -DCMAKE_EXPORT_COMPILE_COMMANDS=TRUE \ -DCMAKE_BUILD_TYPE=${BUILD_TYPE:-'RelWithDebInfo'} \ -DCMAKE_MODULE_PATH="${ROCM_PATH}/hip/cmake;${ROCM_PATH}/lib/cmake" \ @@ -96,19 +100,9 @@ cmake \ -DCPACK_READELF_EXECUTABLE="${PACKAGE_ROOT}/llvm/bin/llvm-readelf" \ -DCPACK_STRIP_EXECUTABLE="${PACKAGE_ROOT}/llvm/bin/llvm-strip" \ -DCPACK_OBJDUMP_EXECUTABLE="${PACKAGE_ROOT}/llvm/bin/llvm-objdump" \ - -DHIP_ROOT_DIR=${ROCM_PATH} \ - $ROCPROFILER_ROOT - -popd - -MAKE_OPTS="-j -C $ROCPROFILER_ROOT/$BUILD_DIR" + ${EXTRA_BUILD_ARGS} -cmake --build "$BUILD_DIR" -- $MAKE_OPTS -cmake --build "$BUILD_DIR" -- $MAKE_OPTS mytest -if [ "$RUN_TEST" = "no" ] ; then - cmake --build "$BUILD_DIR" -- $MAKE_OPTS tests samples doc package -else - cmake --build "$BUILD_DIR" -- $MAKE_OPTS tests -fi +cmake --build "$BUILD_DIR" --target all --parallel $(nproc) +cmake --build "$BUILD_DIR" --target package --parallel $(nproc) exit 0 diff --git a/cmake_modules/Templates/gtest_discover_tests_properties.cmake.in b/cmake_modules/Templates/gtest_discover_tests_properties.cmake.in new file mode 100644 index 00000000..14a751d9 --- /dev/null +++ b/cmake_modules/Templates/gtest_discover_tests_properties.cmake.in @@ -0,0 +1,11 @@ +cmake_minimum_required(VERSION 3.18.0 FATAL_ERROR) + +if(NOT @GTEST_DISCOVER_TESTS_TARGET@_TESTS) + message(FATAL_ERROR 
"@GTEST_DISCOVER_TESTS_TARGET@_TESTS is not defined") +endif() + +foreach(_TEST ${@GTEST_DISCOVER_TESTS_TARGET@_TESTS}) + set_tests_properties( + ${_TEST} PROPERTIES LABELS "@GTEST_DISCOVER_TESTS_LABELS@" ENVIRONMENT + "@GTEST_DISCOVER_TESTS_ENVIRONMENT@") +endforeach() diff --git a/cmake_modules/env.cmake b/cmake_modules/rocprofiler_env.cmake similarity index 78% rename from cmake_modules/env.cmake rename to cmake_modules/rocprofiler_env.cmake index 58412775..7b7c4727 100644 --- a/cmake_modules/env.cmake +++ b/cmake_modules/rocprofiler_env.cmake @@ -20,29 +20,25 @@ # THE SOFTWARE. ################################################################################ -# Linux Compiler options -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fms-extensions") +add_library(rocprofiler-build-flags INTERFACE) +add_library(rocprofiler::build-flags ALIAS rocprofiler-build-flags) -add_definitions(-DNEW_TRACE_API=1) - -# CLANG options -if("$ENV{CXX}" STREQUAL "/usr/bin/clang++") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ferror-limit=1000000") -endif() +target_compile_options( + rocprofiler-build-flags + INTERFACE $<$:-W -Wall -Wextra -Wno-unused-parameter> + $<$:-fms-extensions> + $<$:$<$:-ferror-limit=1000000>> + ) +target_compile_definitions(rocprofiler-build-flags INTERFACE NEW_TRACE_API=1) # Enable debug trace -if(DEFINED ENV{CMAKE_DEBUG_TRACE}) - add_definitions(-DDEBUG_TRACE=1) -endif() - -# Enable AQL-profile new API -if(NOT DEFINED ENV{CMAKE_CURR_API}) - add_definitions(-DAQLPROF_NEW_API=1) +if(ROCPROFILER_DEBUG_TRACE) + target_compile_definitions(rocprofiler-build-flags INTERFACE DEBUG_TRACE=1) endif() # Enable direct loading of AQL-profile HSA extension -if(DEFINED ENV{CMAKE_LD_AQLPROFILE}) - add_definitions(-DROCP_LD_AQLPROFILE=1) +if(ROCPROFILER_LD_AQLPROFILE) + target_compile_definitions(rocprofiler-build-flags INTERFACE ROCP_LD_AQLPROFILE=1) endif() # Find hsa-runtime @@ -85,10 +81,8 @@ if("${ROCM_ROOT_DIR}" STREQUAL "") endif() find_library( - FIND_AQL_PROFILE_LIB "libhsa-amd-aqlprofile64.so" + HSA_AMD_AQLPROFILE_LIBRARY + NAMES hsa-amd-aqlprofile64 HINTS ${CMAKE_PREFIX_PATH} PATHS ${ROCM_ROOT_DIR} PATH_SUFFIXES lib REQUIRED) -if(NOT FIND_AQL_PROFILE_LIB) - message("AQL_PROFILE not installed. 
Please install AQL_PROFILE")
-endif()
diff --git a/cmake_modules/rocprofiler_formatting.cmake b/cmake_modules/rocprofiler_formatting.cmake
new file mode 100644
index 00000000..35192a39
--- /dev/null
+++ b/cmake_modules/rocprofiler_formatting.cmake
@@ -0,0 +1,103 @@
+# ------------------------------------------------------------------------------#
+#
+# creates the following targets to format code:
+#   - format
+#   - format-source
+#   - format-cmake
+#   - format-python
+#   - format-rocprofiler-source
+#   - format-rocprofiler-cmake
+#   - format-rocprofiler-python
+#
+# ------------------------------------------------------------------------------#
+
+include_guard(GLOBAL)
+
+find_program(ROCPROFILER_CLANG_FORMAT_EXE NAMES clang-format-11 clang-format-mp-11)
+find_program(ROCPROFILER_CMAKE_FORMAT_EXE NAMES cmake-format)
+find_program(ROCPROFILER_BLACK_FORMAT_EXE NAMES black)
+
+if(ROCPROFILER_CLANG_FORMAT_EXE
+   OR ROCPROFILER_BLACK_FORMAT_EXE
+   OR ROCPROFILER_CMAKE_FORMAT_EXE)
+    add_custom_target(format-rocprofiler)
+
+    if(NOT TARGET format)
+        add_custom_target(format)
+    endif()
+
+    foreach(_TYPE source python cmake)
+        if(NOT TARGET format-${_TYPE})
+            add_custom_target(format-${_TYPE})
+        endif()
+    endforeach()
+
+    set(rocp_sources)
+    set(rocp_headers)
+    set(rocp_cmake_files)
+    set(rocp_python_files)
+    foreach(_DIR include src plugin samples test tests-v2 script cmake_modules)
+        foreach(_TYPE headers sources cmake_files python_files)
+            set(${_TYPE})
+        endforeach()
+        file(GLOB_RECURSE headers ${PROJECT_SOURCE_DIR}/${_DIR}/*.h)
+        file(GLOB_RECURSE sources ${PROJECT_SOURCE_DIR}/${_DIR}/*.cpp)
+        file(GLOB_RECURSE cmake_files ${PROJECT_SOURCE_DIR}/${_DIR}/*CMakeLists.txt
+             ${PROJECT_SOURCE_DIR}/${_DIR}/*.cmake)
+        file(GLOB_RECURSE python_files ${PROJECT_SOURCE_DIR}/${_DIR}/*.py)
+        foreach(_TYPE headers sources cmake_files python_files)
+            list(APPEND rocp_${_TYPE} ${${_TYPE}})
+        endforeach()
+    endforeach()
+
+    if(ROCPROFILER_CLANG_FORMAT_EXE)
+        add_custom_target(
+            format-rocprofiler-source
+            ${ROCPROFILER_CLANG_FORMAT_EXE} -i ${rocp_sources} ${rocp_headers}
+            COMMENT
+                "[rocprofiler] Running source formatter ${ROCPROFILER_CLANG_FORMAT_EXE}..."
+            )
+    endif()
+
+    if(ROCPROFILER_BLACK_FORMAT_EXE)
+        add_custom_target(
+            format-rocprofiler-python
+            ${ROCPROFILER_BLACK_FORMAT_EXE} -q ${rocp_python_files}
+            COMMENT
+                "[rocprofiler] Running Python formatter ${ROCPROFILER_BLACK_FORMAT_EXE}..."
+            )
+        if(NOT TARGET format-python)
+            add_custom_target(format-python)
+        endif()
+    endif()
+
+    if(ROCPROFILER_CMAKE_FORMAT_EXE)
+        add_custom_target(
+            format-rocprofiler-cmake
+            ${ROCPROFILER_CMAKE_FORMAT_EXE} -i ${rocp_cmake_files}
+            COMMENT
+                "[rocprofiler] Running CMake formatter ${ROCPROFILER_CMAKE_FORMAT_EXE}..."
+            )
+        if(NOT TARGET format-cmake)
+            add_custom_target(format-cmake)
+        endif()
+    endif()
+
+    foreach(_TYPE source python cmake)
+        if(TARGET format-rocprofiler-${_TYPE})
+            add_dependencies(format-rocprofiler format-rocprofiler-${_TYPE})
+            add_dependencies(format-${_TYPE} format-rocprofiler-${_TYPE})
+        endif()
+    endforeach()
+
+    foreach(_TYPE source python)
+        if(TARGET format-rocprofiler-${_TYPE})
+            add_dependencies(format format-rocprofiler-${_TYPE})
+        endif()
+    endforeach()
+else()
+    message(
+        STATUS
+            "no formatting tools (clang-format-11/black/cmake-format) could be found; formatting build targets are not available."
+        )
+endif()
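The aggregate targets registered above are driven through the build tool. A minimal usage sketch (assuming a build tree already configured in `build/`; target names as defined in this module):

    # run the source and Python formatters hooked into the umbrella `format` target
    cmake --build build --target format
    # or restrict to one class of files, e.g. the Python sources gathered by the globs above
    cmake --build build --target format-python
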
diff --git a/cmake_modules/rocprofiler_linting.cmake b/cmake_modules/rocprofiler_linting.cmake
new file mode 100644
index 00000000..3e00aff9
--- /dev/null
+++ b/cmake_modules/rocprofiler_linting.cmake
@@ -0,0 +1,30 @@
+include_guard(DIRECTORY)
+
+# ----------------------------------------------------------------------------------------#
+#
+# Clang Tidy
+#
+# ----------------------------------------------------------------------------------------#
+
+if(ROCPROFILER_ENABLE_CLANG_TIDY)
+    find_program(ROCPROFILER_CLANG_TIDY_COMMAND NAMES clang-tidy)
+
+    if(NOT ROCPROFILER_CLANG_TIDY_COMMAND)
+        message(
+            WARNING "ROCPROFILER_ENABLE_CLANG_TIDY is ON but clang-tidy was not found!")
+        set(ROCPROFILER_ENABLE_CLANG_TIDY OFF)
+    else()
+        set(CMAKE_CXX_CLANG_TIDY ${ROCPROFILER_CLANG_TIDY_COMMAND}
+            -header-filter=${PROJECT_SOURCE_DIR}/.*)
+
+        # Create a preprocessor definition that depends on .clang-tidy content so the
+        # compile command will change when .clang-tidy changes. This ensures that a
+        # subsequent build re-runs clang-tidy on all sources even if they do not otherwise
+        # need to be recompiled. Nothing actually uses this definition. We add it to
+        # targets on which we run clang-tidy just to get the build dependency on the
+        # .clang-tidy file.
+        file(SHA1 ${PROJECT_SOURCE_DIR}/.clang-tidy clang_tidy_sha1)
+        set(CLANG_TIDY_DEFINITIONS "CLANG_TIDY_SHA1=${clang_tidy_sha1}")
+        unset(clang_tidy_sha1)
+    endif()
+endif()
diff --git a/cmake_modules/rocprofiler_options.cmake b/cmake_modules/rocprofiler_options.cmake
new file mode 100644
index 00000000..41135b5a
--- /dev/null
+++ b/cmake_modules/rocprofiler_options.cmake
@@ -0,0 +1,139 @@
+if("${PROJECT_SOURCE_DIR}" STREQUAL "${PROJECT_BINARY_DIR}")
+    message(STATUS "")
+    message(STATUS "rocprofiler does not support in-source builds.")
+    message(STATUS "Delete CMakeCache.txt and CMakeFiles in ${PROJECT_SOURCE_DIR}")
+    message(STATUS "and run cmake with `-B <build-directory>`")
+    message(STATUS "")
+    message(FATAL_ERROR "In-source build detected.")
+endif()
+
+option(ROCPROFILER_BUILD_TESTS "Enable building the tests" OFF)
+option(ROCPROFILER_BUILD_SAMPLES "Enable building the code samples" OFF)
+
+# CLI and FILE plugins are always built
+foreach(_PLUGIN "ATT" "CTF" "PERFETTO")
+    option(ROCPROFILER_BUILD_PLUGIN_${_PLUGIN} "Enable building the ${_PLUGIN} plugin" ON)
+endforeach()
+
+option(ROCPROFILER_DEBUG_TRACE "Enable debug tracing" OFF)
+mark_as_advanced(ROCPROFILER_DEBUG_TRACE)
+
+option(ROCPROFILER_LD_AQLPROFILE "Enable direct loading of AQL-profile HSA extension" OFF)
+mark_as_advanced(ROCPROFILER_LD_AQLPROFILE)
+
+option(ROCPROFILER_BUILD_CI "Enable continuous integration additions" OFF)
+mark_as_advanced(ROCPROFILER_BUILD_CI)
+
+option(ROCPROFILER_ENABLE_CLANG_TIDY "Enable clang-tidy checks" OFF)
+mark_as_advanced(ROCPROFILER_ENABLE_CLANG_TIDY)
+
+set(ROCPROFILER_BUILD_TYPES "Release" "RelWithDebInfo" "Debug" "MinSizeRel" "Coverage")
+
+# export compile commands in the project
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+
+if(NOT CMAKE_BUILD_TYPE)
+    set(CMAKE_BUILD_TYPE
+        "Release"
+        CACHE STRING "Build type" FORCE)
+endif()
+
+if(NOT CMAKE_BUILD_TYPE IN_LIST ROCPROFILER_BUILD_TYPES)
+    message(
+        FATAL_ERROR
+            "Unsupported build type '${CMAKE_BUILD_TYPE}'. 
Options: ${ROCPROFILER_BUILD_TYPES}" + ) +endif() + +if(ROCPROFILER_BUILD_CI) + foreach(_BUILD_TYPE ${ROCPROFILER_BUILD_TYPES}) + string(TOUPPER "${_BUILD_TYPE}" _BUILD_TYPE) + + # remove NDEBUG preprocessor def so that asserts are triggered + string(REGEX REPLACE ".DNDEBUG" "" CMAKE_C_FLAGS_${_BUILD_TYPE} + "${CMAKE_C_FLAGS_${_BUILD_TYPE}}") + string(REGEX REPLACE ".DNDEBUG" "" CMAKE_CXX_FLAGS_${_BUILD_TYPE} + "${CMAKE_CXX_FLAGS_${_BUILD_TYPE}}") + endforeach() +endif() + +if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME) + set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "${ROCPROFILER_BUILD_TYPES}") +endif() + +set(ROCPROFILER_MEMCHECK + "" + CACHE STRING "Memory checker type") +mark_as_advanced(ROCPROFILER_MEMCHECK) + +# ASAN is defined by testing team on Jenkins +if(ASAN) + set(ROCPROFILER_MEMCHECK + "AddressSanitizer" + CACHE STRING "Memory checker type (forced by ASAN defined)" FORCE) +endif() + +set(ROCPROFILER_MEMCHECK_TYPES "ThreadSanitizer" "AddressSanitizer" "LeakSanitizer" + "MemorySanitizer" "UndefinedBehaviorSanitizer") + +if(ROCPROFILER_MEMCHECK AND NOT ROCPROFILER_MEMCHECK IN_LIST ROCPROFILER_MEMCHECK_TYPES) + message( + FATAL_ERROR + "Unsupported memcheck type '${ROCPROFILER_MEMCHECK}'. Options: ${ROCPROFILER_MEMCHECK_TYPES}" + ) +endif() + +set_property(CACHE ROCPROFILER_MEMCHECK PROPERTY STRINGS "${ROCPROFILER_MEMCHECK_TYPES}") + +add_library(rocprofiler-memcheck INTERFACE) +add_library(rocprofiler::memcheck ALIAS rocprofiler-memcheck) + +function(rocprofiler_add_memcheck_flags _TYPE) + target_compile_options( + rocprofiler-memcheck INTERFACE $) + target_link_options(rocprofiler-memcheck INTERFACE + $) +endfunction() + +function(rocprofiler_set_memcheck_env _TYPE _LIB_BASE) + set(_LIBS ${_LIB_BASE}) + foreach(_N 6 5 4 3 2 1 0) + list( + APPEND _LIBS + ${CMAKE_SHARED_LIBRARY_PREFIX}${_LIB_BASE}${CMAKE_SHARED_LIBRARY_SUFFIX}.${_N} + ) + endforeach() + foreach(_LIB ${_LIBS}) + if(NOT ${_TYPE}_LIBRARY) + find_library(${_TYPE}_LIBRARY NAMES ${_LIB} ${ARGN}) + endif() + endforeach() + + target_link_libraries(rocprofiler-memcheck INTERFACE ${_LIB_BASE}) + + if(${_TYPE}_LIBRARY) + set(ROCPROFILER_MEMCHECK_PRELOAD_ENV + "LD_PRELOAD=${${_TYPE}_LIBRARY};LD_LIBRARY_PATH=${PROJECT_BINARY_DIR}/lib:$ENV{LD_LIBRARY_PATH}" + CACHE INTERNAL "LD_PRELOAD env variable for tests" FORCE) + endif() +endfunction() + +# always unset so that it doesn't preload if memcheck disabled +unset(ROCPROFILER_MEMCHECK_PRELOAD_ENV CACHE) + +if(ROCPROFILER_MEMCHECK STREQUAL "AddressSanitizer") + rocprofiler_add_memcheck_flags("address") + rocprofiler_set_memcheck_env("${ROCPROFILER_MEMCHECK}" "asan") +elseif(ROCPROFILER_MEMCHECK STREQUAL "LeakSanitizer") + rocprofiler_add_memcheck_flags("leak") + rocprofiler_set_memcheck_env("${ROCPROFILER_MEMCHECK}" "lsan") +elseif(ROCPROFILER_MEMCHECK STREQUAL "MemorySanitizer") + rocprofiler_add_memcheck_flags("memory") +elseif(ROCPROFILER_MEMCHECK STREQUAL "ThreadSanitizer") + rocprofiler_add_memcheck_flags("thread") + rocprofiler_set_memcheck_env("${ROCPROFILER_MEMCHECK}" "tsan") +elseif(ROCPROFILER_MEMCHECK STREQUAL "UndefinedBehaviorSanitizer") + rocprofiler_add_memcheck_flags("undefined") + rocprofiler_set_memcheck_env("${ROCPROFILER_MEMCHECK}" "ubsan") +endif() diff --git a/cmake_modules/utils.cmake b/cmake_modules/rocprofiler_utils.cmake similarity index 94% rename from cmake_modules/utils.cmake rename to cmake_modules/rocprofiler_utils.cmake index f1f85656..9b0607f4 100644 --- a/cmake_modules/utils.cmake +++ b/cmake_modules/rocprofiler_utils.cmake @@ -22,7 +22,7 @@ 
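 # For illustration (hypothetical call): rocprofiler_parse_version("1.2.3-45-gabc")
 # would leave 1, 2 and 3 in the major, minor and patch variables described below.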
# Parses the VERSION_STRING variable and places the first, second and third number values # in the major, minor and patch variables. -function(parse_version VERSION_STRING) +function(rocprofiler_parse_version VERSION_STRING) string(FIND ${VERSION_STRING} "-" STRING_INDEX) @@ -72,9 +72,9 @@ endfunction() # Gets the current version of the repository using versioning tags and git describe. # Passes back a packaging version string and a library version string. -function(get_version DEFAULT_VERSION_STRING) +function(rocprofiler_get_version DEFAULT_VERSION_STRING) - parse_version(${DEFAULT_VERSION_STRING}) + rocprofiler_parse_version(${DEFAULT_VERSION_STRING}) find_program(GIT NAMES git) @@ -89,7 +89,7 @@ function(get_version DEFAULT_VERSION_STRING) if(${RESULT} EQUAL 0) - parse_version(${GIT_TAG_STRING}) + rocprofiler_parse_version(${GIT_TAG_STRING}) endif() diff --git a/plugin/CMakeLists.txt b/plugin/CMakeLists.txt index 03201d40..8dad8387 100644 --- a/plugin/CMakeLists.txt +++ b/plugin/CMakeLists.txt @@ -20,8 +20,22 @@ # IN THE SOFTWARE. ################################################################################ -add_subdirectory(file) -add_subdirectory(perfetto) -add_subdirectory(ctf) -add_subdirectory(att) +if(ROCPROFILER_BUILD_CODECOV) + set(CMAKE_BUILD_TYPE "Coverage") +endif() + +# these two "native" plugins are always built add_subdirectory(cli) +add_subdirectory(file) + +if(ROCPROFILER_BUILD_PLUGIN_PERFETTO) + add_subdirectory(perfetto) +endif() + +if(ROCPROFILER_BUILD_PLUGIN_CTF) + add_subdirectory(ctf) +endif() + +if(ROCPROFILER_BUILD_PLUGIN_ATT) + add_subdirectory(att) +endif() diff --git a/plugin/att/att.py b/plugin/att/att.py index cf1778f9..7c9251e7 100755 --- a/plugin/att/att.py +++ b/plugin/att/att.py @@ -1,5 +1,6 @@ #!/usr/bin/env python3 import sys + if sys.version_info[0] < 3: raise Exception("Must be using Python 3") @@ -18,23 +19,33 @@ try: from mpi4py import MPI + MPI_IMPORTED = True except: MPI_IMPORTED = False + class PerfEvent(ctypes.Structure): _fields_ = [ - ('time', c_uint64), - ('event0', c_uint16), - ('event1', c_uint16), - ('event2', c_uint16), - ('event3', c_uint16), - ('cu', c_uint8), - ('bank', c_uint8), + ("time", c_uint64), + ("event0", c_uint16), + ("event1", c_uint16), + ("event2", c_uint16), + ("event3", c_uint16), + ("cu", c_uint8), + ("bank", c_uint8), ] + def toTuple(self): - return (int(self.time), int(self.event0), int(self.event1), - int(self.event2), int(self.event3), int(self.cu), int(self.bank)) + return ( + int(self.time), + int(self.event0), + int(self.event1), + int(self.event2), + int(self.event3), + int(self.cu), + int(self.bank), + ) class CodeWrapped(ctypes.Structure): @@ -64,44 +75,41 @@ class ReturnAssemblyInfo(ctypes.Structure): class Wave(ctypes.Structure): _fields_ = [ - ('simd', ctypes.c_uint64), - ('wave_id', ctypes.c_uint64), - ('begin_time', ctypes.c_uint64), # Begin and end cycle - ('end_time', ctypes.c_uint64), - + ("simd", ctypes.c_uint64), + ("wave_id", ctypes.c_uint64), + ("begin_time", ctypes.c_uint64), # Begin and end cycle + ("end_time", ctypes.c_uint64), # total VMEM/FLAT/LDS/SMEM instructions issued # total issued memory instructions - ('num_mem_instrs', ctypes.c_uint64), + ("num_mem_instrs", ctypes.c_uint64), # total issued instructions (compute + memory) - ('num_issued_instrs', ctypes.c_uint64), - ('num_valu_instrs', ctypes.c_uint64), - ('num_valu_stalls', ctypes.c_uint64), + ("num_issued_instrs", ctypes.c_uint64), + ("num_valu_instrs", ctypes.c_uint64), + ("num_valu_stalls", ctypes.c_uint64), # VMEM Pipeline: 
instrs and stalls - ('num_vmem_instrs', ctypes.c_uint64), - ('num_vmem_stalls', ctypes.c_uint64), + ("num_vmem_instrs", ctypes.c_uint64), + ("num_vmem_stalls", ctypes.c_uint64), # FLAT instrs and stalls - ('num_flat_instrs', ctypes.c_uint64), - ('num_flat_stalls', ctypes.c_uint64), - + ("num_flat_instrs", ctypes.c_uint64), + ("num_flat_stalls", ctypes.c_uint64), # LDS instr and stalls - ('num_lds_instrs', ctypes.c_uint64), - ('num_lds_stalls', ctypes.c_uint64), - + ("num_lds_instrs", ctypes.c_uint64), + ("num_lds_stalls", ctypes.c_uint64), # SCA instrs stalls - ('num_salu_instrs', ctypes.c_uint64), - ('num_smem_instrs', ctypes.c_uint64), - ('num_salu_stalls', ctypes.c_uint64), - ('num_smem_stalls', ctypes.c_uint64), - + ("num_salu_instrs", ctypes.c_uint64), + ("num_smem_instrs", ctypes.c_uint64), + ("num_salu_stalls", ctypes.c_uint64), + ("num_smem_stalls", ctypes.c_uint64), # Branch - ('num_branch_instrs', ctypes.c_uint64), - ('num_branch_taken_instrs', ctypes.c_uint64), - ('num_branch_stalls', ctypes.c_uint64), + ("num_branch_instrs", ctypes.c_uint64), + ("num_branch_taken_instrs", ctypes.c_uint64), + ("num_branch_stalls", ctypes.c_uint64), + ("timeline_array", POINTER(ctypes.c_int64)), + ("instructions_array", POINTER(ctypes.c_int64)), + ("timeline_size", ctypes.c_uint64), + ("instructions_size", ctypes.c_uint64), + ] - ('timeline_array', POINTER(ctypes.c_int64)), - ('instructions_array', POINTER(ctypes.c_int64)), - ('timeline_size', ctypes.c_uint64), - ('instructions_size', ctypes.c_uint64)] class PythonWave: def __init__(self, source_wave): @@ -110,20 +118,26 @@ def __init__(self, source_wave): self.timeline_array = None self.instructions_array = None + # Flags : # IS_NAVI = 0x1 class ReturnInfo(ctypes.Structure): - _fields_ = [('num_waves', ctypes.c_uint64), - ('wavedata', POINTER(Wave)), - ('num_events', ctypes.c_uint64), - ('perfevents', POINTER(PerfEvent)), - ('occupancy', POINTER(ctypes.c_uint64)), - ('num_occupancy', ctypes.c_uint64), - ('flags', ctypes.c_uint64)] - -rocprofv2_att_lib = os.getenv('ROCPROFV2_ATT_LIB_PATH') + _fields_ = [ + ("num_waves", ctypes.c_uint64), + ("wavedata", POINTER(Wave)), + ("num_events", ctypes.c_uint64), + ("perfevents", POINTER(PerfEvent)), + ("occupancy", POINTER(ctypes.c_uint64)), + ("num_occupancy", ctypes.c_uint64), + ("flags", ctypes.c_uint64), + ] + + +rocprofv2_att_lib = os.getenv("ROCPROFV2_ATT_LIB_PATH") if rocprofv2_att_lib is None: - print("ATT Lib path not set. Use export ROCPROFV2_ATT_LIB_PATH=/path/to/librocprofv2_att.so") + print( + "ATT Lib path not set. 
Use export ROCPROFV2_ATT_LIB_PATH=/path/to/librocprofv2_att.so" + ) quit() path_to_parser = os.path.abspath(rocprofv2_att_lib) SO = CDLL(path_to_parser) @@ -133,18 +147,19 @@ class ReturnInfo(ctypes.Structure): SO.wrapped_parse_binary.argtypes = [ctypes.c_char_p, ctypes.c_char_p] SO.wrapped_parse_binary.restype = ReturnAssemblyInfo + def parse_binary(filename, kernel=None): - if kernel is None or kernel == '': + if kernel is None or kernel == "": kernel = ctypes.c_char_p(0) - print('Parsing all kernels') + print("Parsing all kernels") else: - with open(glob.glob(kernel)[0], 'r') as file: + with open(glob.glob(kernel)[0], "r") as file: kernel = file.readlines() - print('Parsing kernel:', kernel[0].split(': ')[0]) - kernel = kernel[0].split(': ')[1].split('.kd')[0] - kernel = str(kernel).encode('utf-8') + print("Parsing kernel:", kernel[0].split(": ")[0]) + kernel = kernel[0].split(": ")[1].split(".kd")[0] + kernel = str(kernel).encode("utf-8") filename = os.path.abspath(str(filename)) - info = SO.wrapped_parse_binary(str(filename).encode('utf-8'), kernel) + info = SO.wrapped_parse_binary(str(filename).encode("utf-8"), kernel) code = [] for k in range(info.code_len): @@ -169,23 +184,32 @@ def parse_binary(filename, kernel=None): def getWaves_binary(name, shader_engine_data_dict, target_cu, depth): filename = os.path.abspath(str(name)) - info = SO.AnalyseBinary(filename.encode('utf-8'), target_cu, False) + info = SO.AnalyseBinary(filename.encode("utf-8"), target_cu, False) waves = [info.wavedata[k] for k in range(info.num_waves)] events = [deepcopy(info.perfevents[k]) for k in range(info.num_events)] occupancy = [int(info.occupancy[k]) for k in range(int(info.num_occupancy))] - flags = 'navi' if (info.flags & 0x1) else 'vega' + flags = "navi" if (info.flags & 0x1) else "vega" wave_slot_count = [[0 for k in range(20)] for j in range(4)] waves_python = [] for wave in waves: - if wave_slot_count[wave.simd][wave.wave_id] >= depth or wave.instructions_size == 0: + if ( + wave_slot_count[wave.simd][wave.wave_id] >= depth + or wave.instructions_size == 0 + ): continue wave_slot_count[wave.simd][wave.wave_id] += 1 pwave = PythonWave(wave) - pwave.timeline = [(wave.timeline_array[2*k], wave.timeline_array[2*k+1]) for k in range(wave.timeline_size)] - pwave.instructions = [tuple([wave.instructions_array[4*k+m] for m in range(4)]) for k in range(wave.instructions_size)] - waves_python.append( pwave ) + pwave.timeline = [ + (wave.timeline_array[2 * k], wave.timeline_array[2 * k + 1]) + for k in range(wave.timeline_size) + ] + pwave.instructions = [ + tuple([wave.instructions_array[4 * k + m] for m in range(4)]) + for k in range(wave.instructions_size) + ] + waves_python.append(pwave) shader_engine_data_dict[name] = (waves_python, events, occupancy, flags) @@ -233,31 +257,31 @@ def persist(trace_file, SIMD): instructions.append(wave.instructions) df = { - 'name': [trace for _ in range(len(begin_time))], - 'id': [i for i in range(len(begin_time))], - 'simd': simds, - 'wave_slot': waves, - 'begin_time': begin_time, - 'end_time': end_time, - 'mem_ins': mem_ins, - 'issued_ins': issued_ins, - 'valu_ins': valu_ins, - 'valu_stalls': valu_stalls, - 'vmem_ins': vmem_ins, - 'vmem_stalls': vmem_stalls, - 'flat_ins': flat_ins, - 'flat_stalls': flat_stalls, - 'lds_ins': lds_ins, - 'lds_stalls': lds_stalls, - 'salu_ins': salu_ins, - 'salu_stalls': salu_stalls, - 'smem_ins': smem_ins, - 'smem_stalls': smem_stalls, - 'br_ins': br_ins, - 'br_taken_ins': br_taken_ins, - 'br_stalls': br_stalls, - 'timeline': timeline, - 
'instructions': instructions, + "name": [trace for _ in range(len(begin_time))], + "id": [i for i in range(len(begin_time))], + "simd": simds, + "wave_slot": waves, + "begin_time": begin_time, + "end_time": end_time, + "mem_ins": mem_ins, + "issued_ins": issued_ins, + "valu_ins": valu_ins, + "valu_stalls": valu_stalls, + "vmem_ins": vmem_ins, + "vmem_stalls": vmem_stalls, + "flat_ins": flat_ins, + "flat_stalls": flat_stalls, + "lds_ins": lds_ins, + "lds_stalls": lds_stalls, + "salu_ins": salu_ins, + "salu_stalls": salu_stalls, + "smem_ins": smem_ins, + "smem_stalls": smem_stalls, + "br_ins": br_ins, + "br_taken_ins": br_taken_ins, + "br_stalls": br_stalls, + "timeline": timeline, + "instructions": instructions, } return df @@ -271,68 +295,85 @@ def mem_max(array): mem_dict[inst[0]][0] = max(mem_dict[inst[0]][0], inst[1]) except: mem_dict[inst[0]] = inst[1:] - assert(mem_dict[inst[0]][1] == inst[2]) + assert mem_dict[inst[0]][1] == inst[2] return mem_dict + def lgk(count): - return 'lgkmcnt({0})'.format(count) + return "lgkmcnt({0})".format(count) + + def vmc(count): - return 'vmcnt({0})'.format(count) + return "vmcnt({0})".format(count) + + def both_cnt(count): - return lgk(count)+' '+vmc(count) + return lgk(count) + " " + vmc(count) + def insert_waitcnt(flight_count, assembly_code): flight_count = mem_max(flight_count) for key in sorted(flight_count): line_n = key - issue_amount, waitcnt_amount, = flight_count[key] - if 'vmcnt' in assembly_code[line_n] and 'lgkmcnt' in assembly_code[line_n]: + ( + issue_amount, + waitcnt_amount, + ) = flight_count[key] + if "vmcnt" in assembly_code[line_n] and "lgkmcnt" in assembly_code[line_n]: counter_type = both_cnt - elif 'vmcnt' in assembly_code[line_n]: + elif "vmcnt" in assembly_code[line_n]: counter_type = vmc - elif 'lgkmcnt' in assembly_code[line_n]: + elif "lgkmcnt" in assembly_code[line_n]: counter_type = lgk else: - print('Error: Line mismatch') + print("Error: Line mismatch") exit(-1) - for count in range(waitcnt_amount+1, issue_amount): - print('Inserted line: '+str(line_n)) - as_index = line_n - count/(issue_amount+1) - assembly_code[as_index] = \ - '\ts_waitcnt {0}\t\t; Timing analysis.'.format(counter_type(count)) - as_index += 0.5/(issue_amount+1) - assembly_code[as_index] = '\ts_nop 0\t\t\t\t\t\t; Counters: '+str(issue_amount) + for count in range(waitcnt_amount + 1, issue_amount): + print("Inserted line: " + str(line_n)) + as_index = line_n - count / (issue_amount + 1) + assembly_code[as_index] = "\ts_waitcnt {0}\t\t; Timing analysis.".format( + counter_type(count) + ) + as_index += 0.5 / (issue_amount + 1) + assembly_code[as_index] = "\ts_nop 0\t\t\t\t\t\t; Counters: " + str( + issue_amount + ) return assembly_code def apply_min_event(min_event_time, OCCUPANCY, EVENTS, DBFILES, TIMELINES): for n, occ in enumerate(OCCUPANCY): - OCCUPANCY[n] = [max(min(int((u>>16)-min_event_time)<<16,2**42),0) | (u&0xFFFFF) for u in occ] + OCCUPANCY[n] = [ + max(min(int((u >> 16) - min_event_time) << 16, 2**42), 0) | (u & 0xFFFFF) + for u in occ + ] for perf in EVENTS: for p in perf: p.time -= min_event_time for df in DBFILES: - for T in range(len(df['timeline'])): - timeline = df['timeline'][T] + for T in range(len(df["timeline"])): + timeline = df["timeline"][T] time_acc = 0 - tuples3 = [(0,df['begin_time'][T]-min_event_time)]+[(int(t[0]),int(t[1])) for t in timeline] + tuples3 = [(0, df["begin_time"][T] - min_event_time)] + [ + (int(t[0]), int(t[1])) for t in timeline + ] for state in tuples3: - if state[1] > 1E8: - print('Warning: Time limit 
reached for ',state[0], state[1]) + if state[1] > 1e8: + print("Warning: Time limit reached for ", state[0], state[1]) break - if time_acc+state[1] > TIMELINES[state[0]].size: - TIMELINES[state[0]] = np.hstack([ - TIMELINES[state[0]], - np.zeros_like(TIMELINES[state[0]]) - ]) - TIMELINES[state[0]][time_acc:time_acc+state[1]] += 1 + if time_acc + state[1] > TIMELINES[state[0]].size: + TIMELINES[state[0]] = np.hstack( + [TIMELINES[state[0]], np.zeros_like(TIMELINES[state[0]])] + ) + TIMELINES[state[0]][time_acc : time_acc + state[1]] += 1 time_acc += state[1] + if __name__ == "__main__": comm = None mpi_root = True @@ -344,25 +385,41 @@ def apply_min_event(min_event_time, OCCUPANCY, EVENTS, DBFILES, TIMELINES): else: mpi_root = comm.Get_rank() == 0 except: - print('Could not load MPI') + print("Could not load MPI") comm = None - pathenv = os.getenv('OUTPUT_PATH') + pathenv = os.getenv("OUTPUT_PATH") if pathenv is None: pathenv = "." parser = argparse.ArgumentParser() - parser.add_argument("assembly_code", help="Path to the assembly code. Must be the first parameter.") - parser.add_argument("--depth", help="Maximum number of parsed waves per slot", default=100, type=int) - parser.add_argument("--trace_file", help="Filter for trace files", default=None, type=str) - parser.add_argument("--att_kernel", help="Kernel file", - type=str, default=pathenv+'/*_kernel.txt') + parser.add_argument( + "assembly_code", help="Path to the assembly code. Must be the first parameter." + ) + parser.add_argument( + "--depth", help="Maximum number of parsed waves per slot", default=100, type=int + ) + parser.add_argument( + "--trace_file", help="Filter for trace files", default=None, type=str + ) + parser.add_argument( + "--att_kernel", help="Kernel file", type=str, default=pathenv + "/*_kernel.txt" + ) parser.add_argument("--ports", help="Server and websocket ports, default: 8000,18000") - parser.add_argument("--genasm", - help="Generate post-processed asm file at this path", type=str, default="") - parser.add_argument("--mode", help='''ATT analysis modes:\n + parser.add_argument( + "--genasm", + help="Generate post-processed asm file at this path", + type=str, + default="", + ) + parser.add_argument( + "--mode", + help="""ATT analysis modes:\n off: Only run ATT collection, disable analysis.\n file: dump json files to disk.\n - network: Open att server over the network.''', type=str, default="off") + network: Open att server over the network.""", + type=str, + default="off", + ) args = parser.parse_args() CSV_MODE = False @@ -370,38 +427,38 @@ def apply_min_event(min_event_time, OCCUPANCY, EVENTS, DBFILES, TIMELINES): CSV_MODE = True elif args.mode.lower() == 'file': args.dumpfiles = True - elif args.mode.lower() == 'network': + elif args.mode.lower() == "network": args.dumpfiles = False else: - print('Skipping analysis.') + print("Skipping analysis.") quit() - with open(os.getenv("COUNTERS_PATH"), 'r') as f: - lines = [l.split('//')[0] for l in f.readlines()] + with open(os.getenv("COUNTERS_PATH"), "r") as f: + lines = [l.split("//")[0] for l in f.readlines()] EVENT_NAMES = [] - clean = lambda x: x.split('=')[1].split(' ')[0].split('\n')[0] + clean = lambda x: x.split("=")[1].split(" ")[0].split("\n")[0] for line in lines: - if 'PERFCOUNTER_ID=' in line: - EVENT_NAMES += ['id: '+clean(line)] - elif 'att: TARGET_CU' in line: + if "PERFCOUNTER_ID=" in line: + EVENT_NAMES += ["id: " + clean(line)] + elif "att: TARGET_CU" in line: args.target_cu = int(clean(line)) for line in lines: - if 'PERFCOUNTER=' in line: - 
EVENT_NAMES += [clean(line).split('SQ_')[1].lower()] + if "PERFCOUNTER=" in line: + EVENT_NAMES += [clean(line).split("SQ_")[1].lower()] if args.target_cu is None: args.target_cu = 1 att_kernel = glob.glob(args.att_kernel) if len(att_kernel) == 0: - print('Could not find att output kernel:', args.att_kernel) + print("Could not find att output kernel:", args.att_kernel) exit(1) elif len(att_kernel) > 1: if mpi_root: - print('Found multiple kernel matching given filters:') + print("Found multiple kernel matching given filters:") for n, k in enumerate(att_kernel): - print('\t', n, '->', k) + print("\t", n, "->", k) bValid = False while bValid == False: @@ -411,7 +468,7 @@ def apply_min_event(min_event_time, OCCUPANCY, EVENTS, DBFILES, TIMELINES): except KeyboardInterrupt: exit(0) except: - print('Invalid option.') + print("Invalid option.") if comm is not None: args.att_kernel = comm.bcast(args.att_kernel, root=0) else: @@ -429,13 +486,13 @@ def apply_min_event(min_event_time, OCCUPANCY, EVENTS, DBFILES, TIMELINES): # Trace Parsing if args.trace_file is None: - filenames = glob.glob(args.att_kernel.split('_kernel.txt')[0]+'_*.att') + filenames = glob.glob(args.att_kernel.split("_kernel.txt")[0] + "_*.att") else: filenames = glob.glob(args.trace_file) - assert(len(filenames) > 0) + assert len(filenames) > 0 if comm is not None: - filenames = filenames[comm.Get_rank()::comm.Get_size()] + filenames = filenames[comm.Get_rank() :: comm.Get_size()] code = jumps = None if mpi_root: @@ -443,7 +500,7 @@ def apply_min_event(min_event_time, OCCUPANCY, EVENTS, DBFILES, TIMELINES): code, jumps = parse_binary(args.assembly_code, None if bIsAuto else args.att_kernel) DBFILES = [] - TIMELINES = [np.zeros(int(1E4),dtype=np.int16) for k in range(5)] + TIMELINES = [np.zeros(int(1e4), dtype=np.int16) for k in range(5)] EVENTS = [] OCCUPANCY = [] GFXV = [] @@ -479,16 +536,16 @@ def apply_min_event(min_event_time, OCCUPANCY, EVENTS, DBFILES, TIMELINES): gc.collect() min_event_time = 2**62 for df in DBFILES: - if len(df['begin_time']) > 0: - min_event_time = min(min_event_time, np.min(df['begin_time'])) + if len(df["begin_time"]) > 0: + min_event_time = min(min_event_time, np.min(df["begin_time"])) for perf in EVENTS: for p in perf: min_event_time = min(min_event_time, p.time) for occ in OCCUPANCY: - min_event_time = min(min_event_time, np.min(np.array(occ)>>16)) + min_event_time = min(min_event_time, np.min(np.array(occ) >> 16)) gc.collect() - min_event_time = max(0, min_event_time-32) + min_event_time = max(0, min_event_time - 32) if comm is not None: min_event_time = comm.reduce(min_event_time, op=MPI.MIN) min_event_time = comm.bcast(min_event_time, root=0) @@ -513,14 +570,17 @@ def apply_min_event(min_event_time, OCCUPANCY, EVENTS, DBFILES, TIMELINES): OCCUPANCY = [e for elem in OCCUPANCY for e in elem] gathered_filenames = [e for elem in gathered_filenames for e in elem] gfxv = [e for elem in GFXV for e in elem][0] - + TIMELINES_GATHER = TIMELINES - TIMELINES = [np.zeros((np.max([len(tm[k]) for tm in TIMELINES])), np.int16) for k in range(5)] + TIMELINES = [ + np.zeros((np.max([len(tm[k]) for tm in TIMELINES])), np.int16) + for k in range(5) + ] for gather in TIMELINES_GATHER: for t, m in zip(TIMELINES, gather): - t[:len(m)] += m - del(TIMELINES_GATHER) - else: # free up memory + t[: len(m)] += m + del TIMELINES_GATHER + else: # free up memory TIMELINES = [] OCCUPANCY = [] EVENTS = [] @@ -542,17 +602,49 @@ def apply_min_event(min_event_time, OCCUPANCY, EVENTS, DBFILES, TIMELINES): gc.collect() print("Min 
time:", min_event_time) - drawinfo = {'TIMELINES':TIMELINES, 'EVENTS':EVENTS, 'EVENT_NAMES':EVENT_NAMES, 'OCCUPANCY': OCCUPANCY, 'ShaderNames': gathered_filenames} + drawinfo = { + "TIMELINES": TIMELINES, + "EVENTS": EVENTS, + "EVENT_NAMES": EVENT_NAMES, + "OCCUPANCY": OCCUPANCY, + "ShaderNames": gathered_filenames, + } if args.genasm and len(args.genasm) > 0: - flight_count = view_trace(args, code, DBFILES, analysed_filenames, True, OCCUPANCY, args.dumpfiles, min_event_time, gfxv, drawinfo, comm, mpi_root) - with open(args.assembly_code, 'r') as file: + flight_count = view_trace( + args, + code, + DBFILES, + analysed_filenames, + True, + OCCUPANCY, + args.dumpfiles, + min_event_time, + gfxv, + drawinfo, + comm, + mpi_root, + ) + with open(args.assembly_code, "r") as file: lines = file.readlines() - assembly_code = {l+1.0: lines[l][:-1] for l in range(len(lines))} + assembly_code = {l + 1.0: lines[l][:-1] for l in range(len(lines))} assembly_code = insert_waitcnt(flight_count, assembly_code) - with open(args.genasm, 'w') as file: + with open(args.genasm, "w") as file: keys = sorted(assembly_code.keys()) for k in keys: - file.write(assembly_code[k]+'\n') + file.write(assembly_code[k] + "\n") else: - view_trace(args, code, DBFILES, analysed_filenames, False, OCCUPANCY, args.dumpfiles, min_event_time, gfxv, drawinfo, comm, mpi_root) + view_trace( + args, + code, + DBFILES, + analysed_filenames, + False, + OCCUPANCY, + args.dumpfiles, + min_event_time, + gfxv, + drawinfo, + comm, + mpi_root, + ) diff --git a/plugin/att/drawing.py b/plugin/att/drawing.py index 63176260..b6a5e62a 100644 --- a/plugin/att/drawing.py +++ b/plugin/att/drawing.py @@ -1,5 +1,6 @@ #!/usr/bin/env python3 import sys + if sys.version_info[0] < 3: raise Exception("Must be using Python 3") @@ -9,7 +10,8 @@ from copy import deepcopy import json -COUNTERS_MAX_CAPTURES = 1<<12 +COUNTERS_MAX_CAPTURES = 1 << 12 + class Readable: def __init__(self, jsonstring): @@ -17,19 +19,20 @@ def __init__(self, jsonstring): self.seek = 0 def read(self, length=0): - if length<=0: + if length <= 0: return self.jsonstr else: if self.seek >= len(self): self.seek = 0 return None - response = self.jsonstr[self.seek:self.seek+length] + response = self.jsonstr[self.seek : self.seek + length] self.seek += length - return bytes(response, 'utf-8') + return bytes(response, "utf-8") def __len__(self): return len(self.jsonstr) + class FileBytesIO: def __init__(self, iobytes): self.iobytes = deepcopy(iobytes) @@ -39,72 +42,103 @@ def __len__(self): return self.iobytes.getbuffer().nbytes def read(self, length=0): - if length<=0: + if length <= 0: return bytes(self.iobytes.getbuffer()) else: if self.seek >= self.iobytes.getbuffer().nbytes: self.seek = 0 return None - response = self.iobytes.getbuffer()[self.seek:self.seek+length] + response = self.iobytes.getbuffer()[self.seek : self.seek + length] self.seek += length return bytes(response) + def get_delta_time(events): try: - CUS = [[e.time for e in events if e.cu==k and e.bank==0] for k in range(16)] + CUS = [[e.time for e in events if e.cu == k and e.bank == 0] for k in range(16)] CUS = [np.asarray(c).astype(np.int64) for c in CUS if len(c) > 2] - return np.min([np.min(abs(c[1:]-c[:-1])) for c in CUS]) + return np.min([np.min(abs(c[1:] - c[:-1])) for c in CUS]) except: return 1 + def draw_wave_metrics(selections, normalize, TIMELINES, EVENTS, EVENT_NAMES): - plt.figure(figsize=(15,4)) + plt.figure(figsize=(15, 4)) delta_step = 8 - quad_delta_time = max(delta_step,int(0.5+np.min([get_delta_time(events) for 
events in EVENTS]))) - maxtime = np.max([np.max([e.time for e in events]) for events in EVENTS])/quad_delta_time+1 - - if maxtime*delta_step >= COUNTERS_MAX_CAPTURES: + quad_delta_time = max( + delta_step, int(0.5 + np.min([get_delta_time(events) for events in EVENTS])) + ) + maxtime = ( + np.max([np.max([e.time for e in events]) for events in EVENTS]) / quad_delta_time + + 1 + ) + + if maxtime * delta_step >= COUNTERS_MAX_CAPTURES: delta_step = 1 while maxtime >= COUNTERS_MAX_CAPTURES: quad_delta_time *= 2 maxtime /= 2 - maxtime = int(min(maxtime*delta_step, COUNTERS_MAX_CAPTURES)) + maxtime = int(min(maxtime * delta_step, COUNTERS_MAX_CAPTURES)) event_timeline = np.zeros((16, maxtime), dtype=np.int32) - print('Delta:', quad_delta_time) - print('Max_cycles:', maxtime*quad_delta_time*4//delta_step) + print("Delta:", quad_delta_time) + print("Max_cycles:", maxtime * quad_delta_time * 4 // delta_step) - cycles = 4*quad_delta_time//delta_step*np.arange(maxtime) - kernel = len(EVENTS)*quad_delta_time + cycles = 4 * quad_delta_time // delta_step * np.arange(maxtime) + kernel = len(EVENTS) * quad_delta_time for events in EVENTS: - for e in range(len(events)-1): - bk = events[e].bank*4 - start = events[e].time // (quad_delta_time//delta_step) - end = start+delta_step - event_timeline[bk:bk+4, start:end] += np.asarray(events[e].toTuple()[1:5])[:, None] + for e in range(len(events) - 1): + bk = events[e].bank * 4 + start = events[e].time // (quad_delta_time // delta_step) + end = start + delta_step + event_timeline[bk : bk + 4, start:end] += np.asarray( + events[e].toTuple()[1:5] + )[:, None] start = events[-1].time - event_timeline[bk:bk+4, start:start+delta_step] += \ - np.asarray(events[-1].toTuple()[1:5])[:, None] + event_timeline[bk : bk + 4, start : start + delta_step] += np.asarray( + events[-1].toTuple()[1:5] + )[:, None] - event_timeline = [np.convolve(e, [kernel for k in range(3)])[1:-1] for e in event_timeline] - #event_timeline = [e/kernel for e in event_timeline] + event_timeline = [ + np.convolve(e, [kernel for k in range(3)])[1:-1] for e in event_timeline + ] + # event_timeline = [e/kernel for e in event_timeline] if normalize: - event_timeline = [100*e/max(e.max(), 1E-5) for e in event_timeline] - - colors = ['blue', 'green', 'gray', 'red', 'orange', 'cyan', 'black', 'darkviolet', - 'yellow', 'darkred', 'pink', 'lime', 'gold', 'tan', 'aqua', 'olive'] - [plt.plot(cycles, e, '-', label=n, color=c) - for e, n, c, sel in zip(event_timeline, EVENT_NAMES, colors, selections) if sel] + event_timeline = [100 * e / max(e.max(), 1e-5) for e in event_timeline] + + colors = [ + "blue", + "green", + "gray", + "red", + "orange", + "cyan", + "black", + "darkviolet", + "yellow", + "darkred", + "pink", + "lime", + "gold", + "tan", + "aqua", + "olive", + ] + [ + plt.plot(cycles, e, "-", label=n, color=c) + for e, n, c, sel in zip(event_timeline, EVENT_NAMES, colors, selections) + if sel + ] plt.legend() if normalize: - plt.ylabel('As % of maximum') + plt.ylabel("As % of maximum") else: - plt.ylabel('Value') - plt.xlabel('Cycle') + plt.ylabel("Value") + plt.xlabel("Cycle") plt.subplots_adjust(left=0.04, right=1, top=1, bottom=0.1) figure_bytes = BytesIO() @@ -114,39 +148,56 @@ def draw_wave_metrics(selections, normalize, TIMELINES, EVENTS, EVENT_NAMES): def draw_wave_states(selections, normalize, TIMELINES): plot_indices = [1, 2, 3, 4] - STATES = [['Empty', 'Idle', 'Exec', 'Wait', 'Stall'][k] for k in plot_indices] - colors = [['gray', 'orange', 'green', 'red', 'blue'][k] for k in plot_indices] + 
STATES = [["Empty", "Idle", "Exec", "Wait", "Stall"][k] for k in plot_indices] + colors = [["gray", "orange", "green", "red", "blue"][k] for k in plot_indices] - plt.figure(figsize=(15,4)) + plt.figure(figsize=(15, 4)) maxtime = max([np.max((TIMELINES[k]!=0)*np.arange(0,TIMELINES[k].size)) for k in plot_indices]) maxtime = max(maxtime, 1) timelines = [deepcopy(TIMELINES[k][:maxtime]) for k in plot_indices] - timelines = [np.pad(t, [0, maxtime-t.size]) for t in timelines] + timelines = [np.pad(t, [0, maxtime - t.size]) for t in timelines] if normalize: - timelines = np.array(timelines) / np.maximum(np.sum(timelines,0)*1E-2,1E-7) - - trim = max(maxtime//5000,1) - cycles = np.arange(0, timelines[0].size//trim, 1)*trim - timelines = [time[:trim*(time.size//trim)].reshape((-1, trim)).mean(-1) if len(time) > 0 else cycles*0 for time in timelines] + timelines = np.array(timelines) / np.maximum(np.sum(timelines, 0) * 1e-2, 1e-7) + + trim = max(maxtime // 5000, 1) + cycles = np.arange(0, timelines[0].size // trim, 1) * trim + timelines = [ + time[: trim * (time.size // trim)].reshape((-1, trim)).mean(-1) + if len(time) > 0 + else cycles * 0 + for time in timelines + ] kernsize = 21 - kernel = np.asarray([np.exp(-abs(10*k/kernsize)) for k in range(-kernsize//2,kernsize//2+1)]) + kernel = np.asarray( + [ + np.exp(-abs(10 * k / kernsize)) + for k in range(-kernsize // 2, kernsize // 2 + 1) + ] + ) kernel /= np.sum(kernel) - timelines = [np.convolve(time, kernel)[kernsize//2:-kernsize//2] for time in timelines if len(time) > 0] + timelines = [ + np.convolve(time, kernel)[kernsize // 2 : -kernsize // 2] + for time in timelines + if len(time) > 0 + ] - [plt.plot(cycles, t, label='State '+s, linewidth=1.1, color=c) - for t, s, c, sel in zip(timelines, STATES, colors, selections) if sel] + [ + plt.plot(cycles, t, label="State " + s, linewidth=1.1, color=c) + for t, s, c, sel in zip(timelines, STATES, colors, selections) + if sel + ] plt.legend() if normalize: - plt.ylabel('Waves state %') + plt.ylabel("Waves state %") else: - plt.ylabel('Waves state total') - plt.xlabel('Cycle') + plt.ylabel("Waves state total") + plt.xlabel("Cycle") plt.ylim(-1) - plt.xlim(-maxtime//200, maxtime+maxtime//200+1) + plt.xlim(-maxtime // 200, maxtime + maxtime // 200 + 1) plt.subplots_adjust(left=0.04, right=1, top=1, bottom=0.1) figure_bytes = BytesIO() plt.savefig(figure_bytes, dpi=150) @@ -154,7 +205,7 @@ def draw_wave_states(selections, normalize, TIMELINES): def draw_occupancy(selections, normalize, OCCUPANCY, shadernames): - plt.figure(figsize=(15,4)) + plt.figure(figsize=(15, 4)) names = [] if len(OCCUPANCY) == 1: # If single SE, do occupancy per CU/WGP OCCUPANCY = [[u for u in OCCUPANCY[0] if u&0xFF==k] for k in range(16)] @@ -166,7 +217,7 @@ def draw_occupancy(selections, normalize, OCCUPANCY, shadernames): for name, occ in zip(shadernames, OCCUPANCY): occ_values = [0] occ_times = [0] - occ = [(int(u>>16), (u>>8)&0xFF, u&0xFF) for u in occ] + occ = [(int(u >> 16), (u >> 8) & 0xFF, u & 0xFF) for u in occ] current_occ = [0 for k in range(16)] for time, value, cu in occ: @@ -180,30 +231,30 @@ def draw_occupancy(selections, normalize, OCCUPANCY, shadernames): NUM_DOTS = 1500 maxtime = np.max(occ_times) - delta = max(1, maxtime//NUM_DOTS) - chart = np.zeros((maxtime//delta+1), dtype=np.float32) + delta = max(1, maxtime // NUM_DOTS) + chart = np.zeros((maxtime // delta + 1), dtype=np.float32) norm_fact = np.zeros_like(chart) for i, t in enumerate(occ_times[:-1]): - b = t//delta - e = max(b+1,occ_times[i+1]//delta) + b = t 
// delta + e = max(b + 1, occ_times[i + 1] // delta) chart[b:e] += occ_values[i] norm_fact[b:e] += 1 - chart /= np.maximum(norm_fact,1) + chart /= np.maximum(norm_fact, 1) if normalize: - chart /= max(chart.max(),1E-6) + chart /= max(chart.max(), 1e-6) - plt.plot(np.arange(chart.size)*delta, chart, label=name, linewidth=1.1) + plt.plot(np.arange(chart.size) * delta, chart, label=name, linewidth=1.1) plt.legend() if normalize: - plt.ylabel('Occupancy %') + plt.ylabel("Occupancy %") else: - plt.ylabel('Occupancy total') - plt.xlabel('Cycle') + plt.ylabel("Occupancy total") + plt.xlabel("Cycle") plt.ylim(-1) - plt.xlim(-maxtime//200, maxtime+maxtime//200+delta+1) + plt.xlim(-maxtime // 200, maxtime + maxtime // 200 + delta + 1) plt.subplots_adjust(left=0.04, right=1, top=1, bottom=0.1) figure_bytes = BytesIO() plt.savefig(figure_bytes, dpi=150) @@ -211,22 +262,26 @@ def draw_occupancy(selections, normalize, OCCUPANCY, shadernames): def GeneratePIC(drawinfo, selections=[True for k in range(16)], normalize=False): - EVENTS = drawinfo['EVENTS'] + EVENTS = drawinfo["EVENTS"] response = {} figures = {} - states, figure = draw_occupancy(selections, normalize, drawinfo['OCCUPANCY'], drawinfo['ShaderNames']) - response['occupancy.png'] = states - figures['occupancy.png'] = figure + states, figure = draw_occupancy( + selections, normalize, drawinfo["OCCUPANCY"], drawinfo["ShaderNames"] + ) + response["occupancy.png"] = states + figures["occupancy.png"] = figure - states, figure = draw_wave_states(selections, normalize, drawinfo['TIMELINES']) - response['timeline.png'] = states - figures['timeline.png'] = figure + states, figure = draw_wave_states(selections, normalize, drawinfo["TIMELINES"]) + response["timeline.png"] = states + figures["timeline.png"] = figure if len(EVENTS) > 0 and np.sum([len(e) for e in EVENTS]) > 32: - EVENT_NAMES, figure = draw_wave_metrics(selections, normalize, drawinfo['TIMELINES'], EVENTS, drawinfo['EVENT_NAMES']) - response['counters.png'] = EVENT_NAMES - figures['counters.png'] = figure + EVENT_NAMES, figure = draw_wave_metrics( + selections, normalize, drawinfo["TIMELINES"], EVENTS, drawinfo["EVENT_NAMES"] + ) + response["counters.png"] = EVENT_NAMES + figures["counters.png"] = figure return Readable(response), figures diff --git a/plugin/att/stitch.py b/plugin/att/stitch.py index e1ada4e6..0cd03bd5 100644 --- a/plugin/att/stitch.py +++ b/plugin/att/stitch.py @@ -1,5 +1,6 @@ #!/usr/bin/env python3 import sys + if sys.version_info[0] < 3: raise Exception("Must be using Python 3") @@ -54,33 +55,35 @@ # Keeps track of register states for hipcc-generated assembly class RegisterWatchList: def __init__(self, labels): - self.registers = {'v'+str(k): [[] for m in range(64)] for k in range(64)} + self.registers = {"v" + str(k): [[] for m in range(64)] for k in range(64)} for k in range(64): - self.registers['s'+str(k)] = [] + self.registers["s" + str(k)] = [] self.labels = labels def try_translate(self, tok): - if tok[0] in ['s']: + if tok[0] in ["s"]: return self.registers[self.range(tok)[0]] - elif '@' in tok: - return self.labels[tok.split('@')[0]]+1 + elif "@" in tok: + return self.labels[tok.split("@")[0]] + 1 def range(self, r): - reg = r.split(':') + reg = r.split(":") if len(reg) == 1: return reg else: - r0 = reg[0].split('[') - return [r0[0]+str(k) for k in range(int(r0[1]), int(reg[1][:-1])+1)] + r0 = reg[0].split("[") + return [r0[0] + str(k) for k in range(int(r0[1]), int(reg[1][:-1]) + 1)] def tokenize(self, line): - return [u for u in [t.split(',')[0].strip() for t 
in line.split(' ')] if len(u) > 0] + return [ + u for u in [t.split(",")[0].strip() for t in line.split(" ")] if len(u) > 0 + ] def getpc(self, line, next_line): - #print('Get pc:', line) + # print('Get pc:', line) try: - dst = line.split(' ')[1].strip() - label_dest = next_line.split(', ')[-1].split('@')[0] + dst = line.split(" ")[1].strip() + label_dest = next_line.split(", ")[-1].split("@")[0] for reg in self.range(dst): self.registers[reg].append(deepcopy(self.labels[label_dest])) except: @@ -94,7 +97,7 @@ def swappc(self, line, line_num, inst_num): popped = self.registers[self.range(src)[0]][-1] self.registers[self.range(src)[0]] = self.registers[self.range(src)[0]][:-1] - self.registers[self.range(dst)[0]].append(line_num+1) + self.registers[self.range(dst)[0]].append(line_num + 1) return popped except: return 0 @@ -111,12 +114,12 @@ def setpc(self, line, inst_num): def scratch(self, line): try: tokens = self.tokenize(line) - if '_load' in tokens[0]: + if "_load" in tokens[0]: dst = tokens[1] - src = tokens[3]+tokens[4] + src = tokens[3] + tokens[4] else: src = tokens[2] - dst = tokens[3]+tokens[4] + dst = tokens[3] + tokens[4] self.registers[dst] = self.registers[src] except: pass @@ -124,19 +127,27 @@ def scratch(self, line): def move(self, line): try: tokens = self.tokenize(line) - if tokens[2][0] in ['s', 'd'] and tokens[1][0] in ['s', 'd']: - self.registers[self.range(tokens[1])[0]] = deepcopy(self.registers[self.range(tokens[2])[0]]) + if tokens[2][0] in ["s", "d"] and tokens[1][0] in ["s", "d"]: + self.registers[self.range(tokens[1])[0]] = deepcopy( + self.registers[self.range(tokens[2])[0]] + ) except: pass def updatelane(self, line): tokens = self.tokenize(line) try: - if 'v_readlane' in tokens[0]: - self.registers[tokens[1]].append(self.registers[tokens[2]][int(tokens[3])][-1]) - self.registers[tokens[2]][int(tokens[3])] = self.registers[tokens[2]][int(tokens[3])][:-1] - elif 'v_writelane' in tokens[0]: - self.registers[tokens[1]][int(tokens[3])].append(self.registers[tokens[2]][-1]) + if "v_readlane" in tokens[0]: + self.registers[tokens[1]].append( + self.registers[tokens[2]][int(tokens[3])][-1] + ) + self.registers[tokens[2]][int(tokens[3])] = self.registers[tokens[2]][ + int(tokens[3]) + ][:-1] + elif "v_writelane" in tokens[0]: + self.registers[tokens[1]][int(tokens[3])].append( + self.registers[tokens[2]][-1] + ) self.registers[tokens[2]] = self.registers[tokens[2]][-STACK_SIZE_LIMIT:] except Exception as e: pass @@ -179,7 +190,8 @@ def updatelane(self, line): # Matches tokens in reverse order def try_match_swapped(insts, code, i, line): - return insts[i+1][1] == code[line][1] and insts[i][1] == code[line+1][1] + return insts[i + 1][1] == code[line][1] and insts[i][1] == code[line + 1][1] + FORK_NAMES = 1 # A successful parsed instruction @@ -197,7 +209,7 @@ def __init__(self): self.data = None self.name = FORK_NAMES FORK_NAMES += 1 - #print('Created new fork: ', self.name) + # print('Created new fork: ', self.name) # Try to match sequence "insts" with the branch "fork", starting at position "i" def move_down_fork(fork, insts, i): #(fork : Fork, insts : list, i : int): @@ -217,6 +229,7 @@ def move_down_fork(fork, insts, i): #(fork : Fork, insts : list, i : int): return True, i + FORK_TREE = Fork() # Check if there exists a previous wave with the same sequence of instructions executed @@ -227,7 +240,7 @@ def fromDict(insts): while i < N: tillEnd, final_pos = move_down_fork(cur_fork, insts, i) if tillEnd: - #print('Reached end') + # print('Reached end') return True, 
cur_fork i += final_pos @@ -250,7 +263,7 @@ def fromDict(insts): last_inst.forks.append(cur_fork) return False, cur_fork - print('Warning: Reached end of loop!') + print("Warning: Reached end of loop!") return False, cur_fork @@ -279,8 +292,8 @@ def stitch(insts, raw_code, jumps, gfxv, bIsAuto): SMEM_INST = [] # scalar memory - VLMEM_INST = [] # vector memory load - VSMEM_INST = [] # vector memory store + VLMEM_INST = [] # vector memory load + VSMEM_INST = [] # vector memory store FLAT_INST = [] NUM_SMEM = 0 NUM_VLMEM = 0 @@ -302,20 +315,20 @@ def stitch(insts, raw_code, jumps, gfxv, bIsAuto): line = len(code) print('Begin at:', line, c) c = list(c) - c[0] = c[0].split(';')[0].split('//')[0].strip() + c[0] = c[0].split(";")[0].split("//")[0].strip() if c[1] != 100: code.append(c) - elif ':' in c[0]: - labels[c[0].split(':')[0]] = len(code) - jump_map.append(len(code)-1) + elif ":" in c[0]: + labels[c[0].split(":")[0]] = len(code) + jump_map.append(len(code) - 1) reverse_map = [] for k, v in enumerate(jump_map): if v >= len(reverse_map): reverse_map.append(k) - jumps = {jump_map[j]+1: j for j in jumps} + jumps = {jump_map[j] + 1: j for j in jumps} # Checks if we have guaranteed ordering in memory operations smem_ordering = 0 @@ -340,7 +353,7 @@ def stitch(insts, raw_code, jumps, gfxv, bIsAuto): as_line = code[line] matched = True - next = line+1 + next = line + 1 if not bIsAuto: if '_mov_' in as_line[0]: @@ -375,25 +388,25 @@ def stitch(insts, raw_code, jumps, gfxv, bIsAuto): pcsequence.append(insts[i][2]) elif inst[1] == as_line[1]: if line in jumps: - loopCount[jumps[line]-1] += 1 + loopCount[jumps[line] - 1] += 1 num_inflight = NUM_FLAT + NUM_SMEM + NUM_VLMEM + NUM_VSMEM if inst[1] == SMEM or inst[1] == LDS: smem_ordering = 1 if inst[1] == SMEM else smem_ordering - SMEM_INST.append([reverse_map[line], num_inflight]) + SMEM_INST.append([reverse_map[line], num_inflight]) NUM_SMEM += 1 - elif inst[1] == VMEM or (inst[1] == FLAT and 'global_' in as_line[0]): + elif inst[1] == VMEM or (inst[1] == FLAT and "global_" in as_line[0]): inc_ordering = False - if 'flat_' in as_line[0]: + if "flat_" in as_line[0]: inc_ordering = True - if not bGFX9 and 'store' in as_line[0]: - VSMEM_INST.append([reverse_map[line], num_inflight]) + if not bGFX9 and "store" in as_line[0]: + VSMEM_INST.append([reverse_map[line], num_inflight]) NUM_VSMEM += 1 if inc_ordering: vsmem_ordering = 1 else: - VLMEM_INST.append([reverse_map[line], num_inflight]) + VLMEM_INST.append([reverse_map[line], num_inflight]) NUM_VLMEM += 1 if inc_ordering: vlmem_ordering = 1 @@ -401,44 +414,48 @@ def stitch(insts, raw_code, jumps, gfxv, bIsAuto): smem_ordering = 1 vlmem_ordering = 1 vsmem_ordering = 1 - FLAT_INST.append([reverse_map[line], num_inflight]) + FLAT_INST.append([reverse_map[line], num_inflight]) NUM_FLAT += 1 - elif inst[1] == IMMED and 's_waitcnt' in as_line[0]: - if 'lgkmcnt' in as_line[0]: - wait_N = int(as_line[0].split('lgkmcnt(')[1].split(')')[0]) + elif inst[1] == IMMED and "s_waitcnt" in as_line[0]: + if "lgkmcnt" in as_line[0]: + wait_N = int(as_line[0].split("lgkmcnt(")[1].split(")")[0]) flight_count.append([as_line[5], num_inflight, wait_N]) if wait_N == 0: smem_ordering = 0 if smem_ordering == 0: - offset = len(SMEM_INST)-wait_N - mem_unroll.append( [reverse_map[line], SMEM_INST[:offset]+FLAT_INST] ) + offset = len(SMEM_INST) - wait_N + mem_unroll.append( + [reverse_map[line], SMEM_INST[:offset] + FLAT_INST] + ) SMEM_INST = SMEM_INST[offset:] NUM_SMEM = len(SMEM_INST) FLAT_INST = [] NUM_FLAT = 0 else: - NUM_SMEM = 
min(max(wait_N-NUM_FLAT, 0), NUM_SMEM) - NUM_FLAT = min(max(wait_N-NUM_SMEM, 0), NUM_FLAT) + NUM_SMEM = min(max(wait_N - NUM_FLAT, 0), NUM_SMEM) + NUM_FLAT = min(max(wait_N - NUM_SMEM, 0), NUM_FLAT) num_inflight = NUM_FLAT + NUM_SMEM + NUM_VLMEM + NUM_VSMEM - if 'vmcnt' in as_line[0]: - wait_N = int(as_line[0].split('vmcnt(')[1].split(')')[0]) + if "vmcnt" in as_line[0]: + wait_N = int(as_line[0].split("vmcnt(")[1].split(")")[0]) flight_count.append([as_line[5], num_inflight, wait_N]) if wait_N == 0: vlmem_ordering = 0 if vlmem_ordering == 0: - offset = len(VLMEM_INST)-wait_N - mem_unroll.append( [reverse_map[line], VLMEM_INST[:offset]+FLAT_INST] ) + offset = len(VLMEM_INST) - wait_N + mem_unroll.append( + [reverse_map[line], VLMEM_INST[:offset] + FLAT_INST] + ) VLMEM_INST = VLMEM_INST[offset:] NUM_VLMEM = len(VLMEM_INST) FLAT_INST = [] NUM_FLAT = 0 else: - NUM_VLMEM = min(max(wait_N-NUM_FLAT, 0), NUM_VLMEM) - NUM_FLAT = min(max(wait_N-NUM_VLMEM, 0), NUM_FLAT) + NUM_VLMEM = min(max(wait_N - NUM_FLAT, 0), NUM_VLMEM) + NUM_FLAT = min(max(wait_N - NUM_VLMEM, 0), NUM_FLAT) num_inflight = NUM_FLAT + NUM_SMEM + NUM_VLMEM + NUM_VSMEM - if 'vscnt' in as_line[0] or (bGFX9 and 'vmcnt' in as_line[0]): + if "vscnt" in as_line[0] or (bGFX9 and "vmcnt" in as_line[0]): try: wait_N = int(as_line[0].split('vscnt(')[1].split(')')[0]) except: @@ -450,35 +467,37 @@ def stitch(insts, raw_code, jumps, gfxv, bIsAuto): if wait_N == 0: vsmem_ordering = 0 if vsmem_ordering == 0: - offset = len(VSMEM_INST)-wait_N - mem_unroll.append( [reverse_map[line], VSMEM_INST[:offset]+FLAT_INST] ) + offset = len(VSMEM_INST) - wait_N + mem_unroll.append( + [reverse_map[line], VSMEM_INST[:offset] + FLAT_INST] + ) VSMEM_INST = VSMEM_INST[offset:] NUM_VSMEM = len(VSMEM_INST) FLAT_INST = [] NUM_FLAT = 0 else: - NUM_VSMEM = min(max(wait_N-NUM_FLAT, 0), NUM_VSMEM) - NUM_FLAT = min(max(wait_N-NUM_VSMEM, 0), NUM_FLAT) + NUM_VSMEM = min(max(wait_N - NUM_FLAT, 0), NUM_VSMEM) + NUM_FLAT = min(max(wait_N - NUM_VSMEM, 0), NUM_FLAT) num_inflight = NUM_FLAT + NUM_SMEM + NUM_VLMEM + NUM_VSMEM elif inst[1] == JUMP and as_line[1] == BRANCH: next = jump_map[as_line[2]] if next is None or next == 0: - print('Jump to unknown location!', as_line) + print("Jump to unknown location!", as_line) break elif inst[1] == NEXT and as_line[1] == BRANCH: next = line + 1 else: matched = False next = line + 1 - if i+1 < N and line+1 < len(code): + if i + 1 < N and line + 1 < len(code): if try_match_swapped(insts, code, i, line): temp = insts[i] - insts[i] = insts[i+1] - insts[i+1] = temp + insts[i] = insts[i + 1] + insts[i + 1] = temp next = line - elif 's_waitcnt ' in as_line[0] or '_load_' in as_line[0]: - if skipped_immed > 0 and 's_waitcnt ' in as_line[0]: + elif "s_waitcnt " in as_line[0] or "_load_" in as_line[0]: + if skipped_immed > 0 and "s_waitcnt " in as_line[0]: matched = True skipped_immed -= 1 elif 'scratch_' not in as_line[0]: @@ -508,8 +527,10 @@ def stitch(insts, raw_code, jumps, gfxv, bIsAuto): pass else: while line < len(code): - if 's_endpgm' in code[line]: - mem_unroll.append( [reverse_map[line], SMEM_INST+VLMEM_INST+VSMEM_INST+FLAT_INST] ) + if "s_endpgm" in code[line]: + mem_unroll.append( + [reverse_map[line], SMEM_INST + VLMEM_INST + VSMEM_INST + FLAT_INST] + ) break line += 1 diff --git a/plugin/att/trace_view.py b/plugin/att/trace_view.py index 0539b33d..806a90bb 100755 --- a/plugin/att/trace_view.py +++ b/plugin/att/trace_view.py @@ -1,5 +1,6 @@ #!/usr/bin/env python3 import sys + if sys.version_info[0] < 3: raise Exception("Must be 
using Python 3") @@ -23,6 +24,7 @@ JSON_GLOBAL_DICTIONARY = {} + def get_ip(): s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) s.settimeout(0) @@ -31,51 +33,64 @@ def get_ip(): IPAddr = socket.gethostbyname(hostname) s.connect(({IPAddr}, 1)) except Exception: - IPAddr = '127.0.0.1' + IPAddr = "127.0.0.1" finally: return IPAddr IPAddr = get_ip() PORT, WebSocketPort = 8000, 18000 -SP = '\u00A0' +SP = "\u00A0" def get_top_n(code): TOP_N = 10 top_n = sorted(deepcopy(code), key=lambda x: x[-1], reverse=True)[:TOP_N] - return [(line_num, hitc, 0, run_time) for _, _, _, _, line_num, _, hitc, run_time in top_n] + return [ + (line_num, hitc, 0, run_time) for _, _, _, _, line_num, _, hitc, run_time in top_n + ] def wave_info(df, id): dic = { - 'Issue': df['issued_ins'][id], - 'Valu': df['valu_ins'][id], 'Valu_stall': df['valu_stalls'][id], - 'Salu': df['salu_ins'][id], 'Salu_stall': df['salu_stalls'][id], - 'Vmem': df['vmem_ins'][id], 'Vmem_stall': df['vmem_stalls'][id], - 'Smem': df['smem_ins'][id], 'Smem_stall': df['smem_stalls'][id], - 'Flat': df['flat_ins'][id], 'Flat_stall': df['flat_stalls'][id], - 'Lds': df['lds_ins'][id], 'Lds_stall': df['lds_stalls'][id], - 'Br': df['br_ins'][id], 'Br_stall': df['br_stalls'][id], + "Issue": df["issued_ins"][id], + "Valu": df["valu_ins"][id], + "Valu_stall": df["valu_stalls"][id], + "Salu": df["salu_ins"][id], + "Salu_stall": df["salu_stalls"][id], + "Vmem": df["vmem_ins"][id], + "Vmem_stall": df["vmem_stalls"][id], + "Smem": df["smem_ins"][id], + "Smem_stall": df["smem_stalls"][id], + "Flat": df["flat_ins"][id], + "Flat_stall": df["flat_stalls"][id], + "Lds": df["lds_ins"][id], + "Lds_stall": df["lds_stalls"][id], + "Br": df["br_ins"][id], + "Br_stall": df["br_stalls"][id], } - dic['Issue_stall'] = int(np.sum([dic[key] for key in dic.keys() if '_STALL' in key])) + dic["Issue_stall"] = int(np.sum([dic[key] for key in dic.keys() if "_STALL" in key])) return dic def extract_data(df, se_number): - if len(df['id']) == 0 or len(df['instructions']) == 0 or len(df['timeline']) == 0: + if len(df["id"]) == 0 or len(df["instructions"]) == 0 or len(df["timeline"]) == 0: return None wave_filenames = [] flight_count = [] - wave_slot_count = [{df['wave_slot'][wave_id]: 0 for wave_id in df['id']} for k in range(4)] - - print('Number of waves:', len(df['id'])) + wave_slot_count = [ + {df["wave_slot"][wave_id]: 0 for wave_id in df["id"]} for k in range(4) + ] + + print("Number of waves:", len(df["id"])) allwaves_maxline = 0 - for wave_id in df['id']: - stitched, loopCount, mem_unroll, count, maxline, num_insts = df['instructions'][wave_id] - timeline = df['timeline'][wave_id] + for wave_id in df["id"]: + stitched, loopCount, mem_unroll, count, maxline, num_insts = df["instructions"][ + wave_id + ] + timeline = df["timeline"][wave_id] if len(stitched) == 0 or len(timeline) == 0 or len(stitched) != num_insts: continue @@ -84,18 +99,18 @@ def extract_data(df, se_number): flight_count.append(count) wave_entry = { - "id": int(df['id'][wave_id]), - "simd": int(df['simd'][wave_id]), - "slot": int(df['wave_slot'][wave_id]), - "begin": int(df['begin_time'][wave_id]), - "end": int(df['end_time'][wave_id]), + "id": int(df["id"][wave_id]), + "simd": int(df["simd"][wave_id]), + "slot": int(df["wave_slot"][wave_id]), + "begin": int(df["begin_time"][wave_id]), + "end": int(df["end_time"][wave_id]), "info": wave_info(df, wave_id), "instructions": stitched, "timeline": timeline, - "waitcnt": mem_unroll + "waitcnt": mem_unroll, } data_obj = { - "name": 'SE'.format(se_number), + "name": 
"SE".format(se_number), "duration": sum(dur for (_, dur) in timeline), "wave": wave_entry, "loop_count": loopCount, @@ -103,26 +118,36 @@ def extract_data(df, se_number): "num_stitched": len(stitched), "num_insts": num_insts, "websocket_port": WebSocketPort, - "generation_time": time.ctime() + "generation_time": time.ctime(), } - simd_id = df['simd'][wave_id] - slot_id = df['wave_slot'][wave_id] + simd_id = df["simd"][wave_id] + slot_id = df["wave_slot"][wave_id] slot_count = wave_slot_count[simd_id][slot_id] wave_slot_count[simd_id][slot_id] += 1 - OUT = 'se'+str(se_number)+'_sm'+str(simd_id)+'_sl'+str(slot_id)+'_wv'+str(slot_count)+'.json' + OUT = ( + "se" + + str(se_number) + + "_sm" + + str(simd_id) + + "_sl" + + str(slot_id) + + "_wv" + + str(slot_count) + + ".json" + ) JSON_GLOBAL_DICTIONARY[OUT] = Readable(data_obj) - wave_filenames.append((OUT, df['begin_time'][wave_id], df['end_time'][wave_id])) + wave_filenames.append((OUT, df["begin_time"][wave_id], df["end_time"][wave_id])) data_obj = { - "name": 'SE'.format(se_number), + "name": "SE".format(se_number), "websocket_port": WebSocketPort, - "generation_time": time.ctime() + "generation_time": time.ctime(), } se_filename = None if len(wave_filenames) > 0: - se_filename = 'se'+str(se_number)+'_info.json' + se_filename = "se" + str(se_number) + "_info.json" JSON_GLOBAL_DICTIONARY[se_filename] = Readable(data_obj) return flight_count, wave_filenames, se_filename, allwaves_maxline @@ -139,36 +164,43 @@ def send_my_headers(self): self.send_header("Expires", "0") def do_GET(self): - if '.png?' in self.path and self.path.split('/')[-1] not in JSON_GLOBAL_DICTIONARY.keys(): - selections = [int(s)!=0 for s in self.path.split('.png?')[-1]] - counters_json, imagebytes = GeneratePIC(self.drawinfo, selections[1:], selections[0]) - JSON_GLOBAL_DICTIONARY['graph_options.json'] = counters_json - JSON_GLOBAL_DICTIONARY[self.path.split('/')[-1]] = imagebytes[self.path.split('/')[-1].split('?')[0]] - - if '.json' in self.path or '.png' in self.path: + if ( + ".png?" 
in self.path + and self.path.split("/")[-1] not in JSON_GLOBAL_DICTIONARY.keys() + ): + selections = [int(s) != 0 for s in self.path.split(".png?")[-1]] + counters_json, imagebytes = GeneratePIC( + self.drawinfo, selections[1:], selections[0] + ) + JSON_GLOBAL_DICTIONARY["graph_options.json"] = counters_json + JSON_GLOBAL_DICTIONARY[self.path.split("/")[-1]] = imagebytes[ + self.path.split("/")[-1].split("?")[0] + ] + + if ".json" in self.path or ".png" in self.path: try: - response_file = JSON_GLOBAL_DICTIONARY[self.path.split('/')[-1]] + response_file = JSON_GLOBAL_DICTIONARY[self.path.split("/")[-1]] except: - print('Invalid json request:', self.path) + print("Invalid json request:", self.path) print(JSON_GLOBAL_DICTIONARY.keys()) self.send_error(HTTPStatus.NOT_FOUND, "File not found") return self.send_response(HTTPStatus.OK) self.send_header("Content-Length", str(len(response_file))) - if '.b' in self.path: - self.send_header("Content-type", 'application/octet-stream') + if ".b" in self.path: + self.send_header("Content-type", "application/octet-stream") response_file = BytesIO(response_file) - elif 'timeline.png' in self.path: - self.send_header("Content-type", 'image/png') + elif "timeline.png" in self.path: + self.send_header("Content-type", "image/png") else: - self.send_header("Content-type", 'application/json') + self.send_header("Content-type", "application/json") self.send_header("Last-Modified", self.date_time_string(time.time())) self.end_headers() self.copyfile(response_file, self.wfile) - elif self.path in ['/', '/styles.css', '/index.html', '/logo.svg']: + elif self.path in ["/", "/styles.css", "/index.html", "/logo.svg"]: http.server.SimpleHTTPRequestHandler.do_GET(self) else: - print('Invalid request:', self.path) + print("Invalid request:", self.path) self.send_error(HTTPStatus.NOT_FOUND, "File not found") @@ -181,8 +213,8 @@ def server_bind(self): def run_server(drawinfo): Handler = NoCacheHTTPRequestHandler Handler.drawinfo = drawinfo - os.chdir(os.path.join(os.path.dirname(os.path.abspath(__file__)),'ui/')) - #os.chdir('ui/') + os.chdir(os.path.join(os.path.dirname(os.path.abspath(__file__)), "ui/")) + # os.chdir('ui/') try: with RocTCPServer((IPAddr, PORT), Handler) as httpd: httpd.serve_forever() @@ -191,25 +223,32 @@ def run_server(drawinfo): def fix_space(line): - line = line.replace(' ', SP) - line = line.replace('\t', SP*4) + line = line.replace(" ", SP) + line = line.replace("\t", SP * 4) return line def WebSocketserver(websocket, path): data = websocket.recv() - cpp, ln, _ = data.split(':') + cpp, ln, _ = data.split(":") ln = int(ln) - HL, EMP = 'highlight', '' + HL, EMP = "highlight", "" content = None print("loading...") try: - f = open(cpp, 'r', errors='replace') - content = ''.join('
<li class="'+(HL if ln==i else EMP)+'">'+str(i).ljust(5)+fix_space(l)+'</li>' - for i, l in enumerate(f.readlines(), 1)) + f = open(cpp, "r", errors="replace") + content = "".join( + '<li class="' + + (HL if ln == i else EMP) + + '">' + + str(i).ljust(5) + + fix_space(l) + + "
  • " + for i, l in enumerate(f.readlines(), 1) + ) except FileNotFoundError: - content = cpp + ' not found!' + content = cpp + " not found!" websocket.send(content) @@ -223,12 +262,14 @@ def run_websocket(): def assign_ports(ports): - ps = [int(port) for port in ports.split(',')] + ps = [int(port) for port in ports.split(",")] if ps[0] <= 5000 or ps[1] <= 5000: - print('Need to have port values > 5000') + print("Need to have port values > 5000") sys.exit(1) elif ps[0] == ps[1]: - print('Can not use the same port for both web server and websocket server: '+ps[0]) + print( + "Can not use the same port for both web server and websocket server: " + ps[0] + ) sys.exit(1) global IPAddr, PORT, WebSocketPort PORT, WebSocketPort = ps[0], ps[1] @@ -236,35 +277,54 @@ def assign_ports(ports): def call_picture_callback(return_dict, drawinfo): response, imagebytes = GeneratePIC(drawinfo) - return_dict['graph_options.json'] = response + return_dict["graph_options.json"] = response for k, v in imagebytes.items(): return_dict[k] = v - for n, m in enumerate(drawinfo['TIMELINES']): - return_dict['wstates'+str(n)+'.json'] = Readable({"data": [int(n) for n in list(np.asarray(m))]}) - for n, e in enumerate(drawinfo['EVENTS']): - return_dict['se'+str(n)+'_perfcounter.json'] = Readable({"data": [v.toTuple() for v in e]}) - - -def view_trace(args, code, dbnames, att_filenames, bReturnLoc, OCCUPANCY, bDumpOnly, se_time_begin, gfxv, drawinfo, MPI_COMM, mpi_root): + for n, m in enumerate(drawinfo["TIMELINES"]): + return_dict["wstates" + str(n) + ".json"] = Readable( + {"data": [int(n) for n in list(np.asarray(m))]} + ) + for n, e in enumerate(drawinfo["EVENTS"]): + return_dict["se" + str(n) + "_perfcounter.json"] = Readable( + {"data": [v.toTuple() for v in e]} + ) + + +def view_trace( + args, + code, + dbnames, + att_filenames, + bReturnLoc, + OCCUPANCY, + bDumpOnly, + se_time_begin, + gfxv, + drawinfo, + MPI_COMM, + mpi_root, +): global JSON_GLOBAL_DICTIONARY pic_thread = None if mpi_root: manager = Manager() return_dict = manager.dict() - JSON_GLOBAL_DICTIONARY['occupancy.json'] = Readable({str(k): OCCUPANCY[k] for k in range(len(OCCUPANCY))}) + JSON_GLOBAL_DICTIONARY["occupancy.json"] = Readable( + {str(k): OCCUPANCY[k] for k in range(len(OCCUPANCY))} + ) pic_thread = Process(target=call_picture_callback, args=(return_dict, drawinfo)) pic_thread.start() att_filenames = [Path(f).name for f in att_filenames] - se_numbers = [int(a.split('_se')[1].split('.att')[0]) for a in att_filenames] + se_numbers = [int(a.split("_se")[1].split(".att")[0]) for a in att_filenames] flight_count = [] simd_wave_filenames = {} se_filenames = [] allse_maxline = 0 for se_number, dbname in zip(se_numbers, dbnames): - if len(dbname['id']) == 0: + if len(dbname["id"]) == 0: continue count, wv_filenames, se_filename, maxline = extract_data(dbname, se_number) @@ -282,12 +342,15 @@ def view_trace(args, code, dbnames, att_filenames, bReturnLoc, OCCUPANCY, bDumpO JSON_GLOBAL_DICTIONARY['code.json'] = Readable({"code": code_sel, "top_n": get_top_n(code_sel)}) for key in simd_wave_filenames.keys(): - wv_array = [[ - int(s[0].split('_sm')[1].split('_sl')[0]), - int(s[0].split('_sl')[1].split('_wv')[0]), - int(s[0].split('_wv')[1].split('.')[0]), - s - ] for s in simd_wave_filenames[key]] + wv_array = [ + [ + int(s[0].split("_sm")[1].split("_sl")[0]), + int(s[0].split("_sl")[1].split("_wv")[0]), + int(s[0].split("_wv")[1].split(".")[0]), + s, + ] + for s in simd_wave_filenames[key] + ] wv_dict = {} for wv in wv_array: @@ -309,13 +372,19 @@ def 
view_trace(args, code, dbnames, att_filenames, bReturnLoc, OCCUPANCY, bDumpO simd_wave_filenames = MPI_COMM.gather(simd_wave_filenames, root=0) if mpi_root: se_filenames = [e for elem in se_filenames for e in elem] - simd_wave_filenames = {k:v for smf in simd_wave_filenames for k,v in smf.items()} + simd_wave_filenames = { + k: v for smf in simd_wave_filenames for k, v in smf.items() + } if mpi_root: - JSON_GLOBAL_DICTIONARY['filenames.json'] = Readable({"wave_filenames": simd_wave_filenames, - "se_filenames": se_filenames, - "global_begin_time": int(se_time_begin), - "gfxv": gfxv}) + JSON_GLOBAL_DICTIONARY["filenames.json"] = Readable( + { + "wave_filenames": simd_wave_filenames, + "se_filenames": se_filenames, + "global_begin_time": int(se_time_begin), + "gfxv": gfxv, + } + ) if pic_thread is not None: pic_thread.join() @@ -330,14 +399,19 @@ def view_trace(args, code, dbnames, att_filenames, bReturnLoc, OCCUPANCY, bDumpO JSON_GLOBAL_DICTIONARY = MPI_COMM.gather(JSON_GLOBAL_DICTIONARY, root=0) if not mpi_root: quit() - JSON_GLOBAL_DICTIONARY = {k:v for smf in JSON_GLOBAL_DICTIONARY for k,v in smf.items()} + JSON_GLOBAL_DICTIONARY = { + k: v for smf in JSON_GLOBAL_DICTIONARY for k, v in smf.items() + } - JSON_GLOBAL_DICTIONARY['live.json'] = Readable({'live': 1}) + JSON_GLOBAL_DICTIONARY["live.json"] = Readable({"live": 1}) if args.ports: assign_ports(args.ports) - print('serving at ports: {0},{1}'.format(PORT, WebSocketPort)) + print("serving at ports: {0},{1}".format(PORT, WebSocketPort)) try: - PROCS = [Process(target=run_server, args=[drawinfo]), Process(target=run_websocket)] + PROCS = [ + Process(target=run_server, args=[drawinfo]), + Process(target=run_websocket), + ] for p in PROCS: p.start() for p in PROCS: @@ -345,10 +419,14 @@ def view_trace(args, code, dbnames, att_filenames, bReturnLoc, OCCUPANCY, bDumpO except KeyboardInterrupt: print("Exitting.") else: - os.makedirs('ui/', exist_ok=True) + os.makedirs("ui/", exist_ok=True) if mpi_root: - JSON_GLOBAL_DICTIONARY['live.json'] = Readable({'live': 0}) - os.system('cp ' + os.path.join(os.path.abspath(os.path.dirname(__file__)),'ui') + '/* ui/' ) + JSON_GLOBAL_DICTIONARY["live.json"] = Readable({"live": 0}) + os.system( + "cp " + + os.path.join(os.path.abspath(os.path.dirname(__file__)), "ui") + + "/* ui/" + ) for k, v in JSON_GLOBAL_DICTIONARY.items(): - with open(os.path.join('ui',k), 'w' if '.json' in k else 'wb') as f: + with open(os.path.join("ui", k), "w" if ".json" in k else "wb") as f: f.write(v.read()) diff --git a/plugin/att/ui/httpserver.py b/plugin/att/ui/httpserver.py index 8e75b7be..ce3e1b35 100644 --- a/plugin/att/ui/httpserver.py +++ b/plugin/att/ui/httpserver.py @@ -1,5 +1,6 @@ #!/usr/bin/env python3 import sys + if sys.version_info[0] < 3: raise Exception("Must be using Python 3") @@ -9,6 +10,7 @@ import os import sys + class NoCacheHTTPRequestHandler(http.server.SimpleHTTPRequestHandler): def end_headers(self): self.send_my_headers() @@ -20,25 +22,28 @@ def send_my_headers(self): self.send_header("Expires", "0") def do_GET(self): - if '.png?' in self.path: - self.path = self.path.split('.png?')[0]+'.png' + if ".png?" 
in self.path: + self.path = self.path.split(".png?")[0] + ".png" http.server.SimpleHTTPRequestHandler.do_GET(self) + class RocTCPServer(socketserver.TCPServer): def server_bind(self): self.socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) self.socket.bind(self.server_address) + def run_server(): Handler = NoCacheHTTPRequestHandler - os.chdir(os.path.join(os.path.dirname(os.path.abspath(__file__)),'.')) + os.chdir(os.path.join(os.path.dirname(os.path.abspath(__file__)), ".")) try: with RocTCPServer((IPAddr, PORT), Handler) as httpd: httpd.serve_forever() except KeyboardInterrupt: pass + def get_ip(): s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) s.settimeout(0) @@ -47,16 +52,17 @@ def get_ip(): IPAddr = socket.gethostbyname(hostname) s.connect(({IPAddr}, 1)) except Exception: - IPAddr = '127.0.0.1' + IPAddr = "127.0.0.1" finally: return IPAddr + IPAddr = get_ip() PORT = 8000 if len(sys.argv) > 1: PORT = int(sys.argv[1]) -print('serving at port: {0}'.format(PORT)) +print("serving at port: {0}".format(PORT)) try: run_server() diff --git a/plugin/ctf/CMakeLists.txt b/plugin/ctf/CMakeLists.txt index c523e1e2..2ed5c287 100644 --- a/plugin/ctf/CMakeLists.txt +++ b/plugin/ctf/CMakeLists.txt @@ -42,11 +42,9 @@ set(METADATA_STREAM_FILE_DIR "${CMAKE_INSTALL_DATADIR}/${PROJECT_NAME}/plugin/ct target_compile_definitions( ctf_plugin PUBLIC AMD_INTERNAL_BUILD - PRIVATE - HIP_PROF_HIP_API_STRING=1 - __HIP_PLATFORM_AMD__=1 - CTF_PLUGIN_METADATA_FILE_PATH="${CMAKE_INSTALL_PREFIX}/${METADATA_STREAM_FILE_DIR}/metadata" - ) + PRIVATE HIP_PROF_HIP_API_STRING=1 __HIP_PLATFORM_AMD__=1 + CTF_PLUGIN_METADATA_FILE_PATH="${METADATA_STREAM_FILE_DIR}/metadata" + CTF_PLUGIN_INSTALL_PREFIX="${CMAKE_INSTALL_PREFIX}") target_include_directories( ctf_plugin PRIVATE "${PROJECT_SOURCE_DIR}" "${CMAKE_BINARY_DIR}/src/api" "${CMAKE_CURRENT_BINARY_DIR}") diff --git a/plugin/ctf/ctf.cpp b/plugin/ctf/ctf.cpp index 5a90296e..97265f38 100644 --- a/plugin/ctf/ctf.cpp +++ b/plugin/ctf/ctf.cpp @@ -18,10 +18,18 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include #include #include #include +#include +#include +#include +#include +#include +#include #include +#include #include "rocprofiler.h" #include "rocprofiler_plugin.h" @@ -49,17 +57,53 @@ ROCPROFILER_EXPORT int rocprofiler_plugin_initialize(const uint32_t rocprofiler_ return -1; } - const char* output_dir = []() -> const char* { + auto output_dir = []() -> std::string { if (const char* output_dir_internal = getenv("OUTPUT_PATH"); output_dir_internal != nullptr) { return output_dir_internal; } return "./"; }(); + auto output_file = []() -> std::string { + auto _v = getenv("OUTPUT_FILE"); + return (_v) ? _v : "trace-{PID}"; + }(); + + auto _replace = [&output_dir, &output_file](const char* _key, auto _value) { + using value_type = std::remove_cv_t>>; + auto _value_str = std::to_string(_value); + + const auto _re = std::regex{_key, std::regex_constants::icase}; + output_dir = std::regex_replace(output_dir, _re, _value_str); + output_file = std::regex_replace(output_file, _re, _value_str); + }; + + _replace("\\{PID\\}", getpid()); + _replace("\\$ENV\\{PID\\}", getpid()); + _replace("\\{PPID\\}", getppid()); + _replace("\\$ENV\\{PPID\\}", getppid()); + // Create the plugin instance. 
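Taken together, the lambdas above make the trace location templated: OUTPUT_FILE defaults to "trace-{PID}", and the {PID}/{PPID} (or $ENV{PID}/$ENV{PPID}) placeholders are expanded with std::regex, case-insensitively, before the plugin opens its output. A rough Python sketch of that substitution, for illustration only; the helper name is hypothetical, and the $ENV spellings are checked first here so the plain {PID} pattern cannot eat them:

import os
import re

def expand_placeholders(template):
    # Hypothetical mirror of the C++ _replace lambda above; substitutions
    # are case-insensitive, matching std::regex_constants::icase.
    for pattern, value in [
        (r"\$ENV\{PID\}", os.getpid()),
        (r"\$ENV\{PPID\}", os.getppid()),
        (r"\{PID\}", os.getpid()),
        (r"\{PPID\}", os.getppid()),
    ]:
        template = re.sub(pattern, str(value), template, flags=re.IGNORECASE)
    return template

print(expand_placeholders("trace-{PID}"))  # e.g. trace-12345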
+ auto* this_plugin_handle = dlopen("libctf_plugin.so", RTLD_LAZY | RTLD_NOLOAD); + auto* librocprofiler_handle = dlopen("librocprofiler64.so", RTLD_LAZY | RTLD_NOLOAD); + auto metadata_path = std::string{CTF_PLUGIN_METADATA_FILE_PATH}; + struct link_map* _link_map = nullptr; + if (this_plugin_handle && dlinfo(this_plugin_handle, RTLD_DI_LINKMAP, &_link_map) == 0) { + metadata_path = fs::path{_link_map->l_name}.parent_path() / fs::path{"../.."} / + CTF_PLUGIN_METADATA_FILE_PATH; + } else if (librocprofiler_handle && + dlinfo(librocprofiler_handle, RTLD_DI_LINKMAP, &_link_map) == 0) { + metadata_path = + fs::path{_link_map->l_name}.parent_path() / ".." / CTF_PLUGIN_METADATA_FILE_PATH; + } + + if (!fs::exists(metadata_path)) { + metadata_path = fs::path{CTF_PLUGIN_INSTALL_PREFIX} / CTF_PLUGIN_METADATA_FILE_PATH; + } + try { - the_plugin = new rocm_ctf::Plugin{256 * 1024, fs::path{output_dir} / "trace", - CTF_PLUGIN_METADATA_FILE_PATH}; + the_plugin = new rocm_ctf::Plugin{256 * 1024, fs::path{output_dir} / output_file, + fs::absolute(metadata_path)}; } catch (const std::exception& exc) { std::cerr << "rocprofiler_plugin_initialize(): " << exc.what() << std::endl; return -1; diff --git a/plugin/ctf/gen_api_files.py b/plugin/ctf/gen_api_files.py index 58743bcb..c6e37468 100644 --- a/plugin/ctf/gen_api_files.py +++ b/plugin/ctf/gen_api_files.py @@ -33,12 +33,12 @@ class _NumericFt: # Returns the C++ expression to cast the expression `expr` to the C # type of this field type. def cast(self, expr): - return f'static_cast<{self.c_type}>({expr})' + return f"static_cast<{self.c_type}>({expr})" # Integer field type (abstract). class _IntFt(_NumericFt): - def __init__(self, size, pref_disp_base='dec'): + def __init__(self, size, pref_disp_base="dec"): self._size = size self._pref_disp_base = pref_disp_base @@ -56,8 +56,8 @@ def pref_disp_base(self): @property def barectf_yaml(self): return { - 'size': self._size, - 'preferred-display-base': self._pref_disp_base, + "size": self._size, + "preferred-display-base": self._pref_disp_base, } @@ -67,13 +67,13 @@ class _SIntFt(_IntFt): @property def barectf_yaml(self): ret = super().barectf_yaml - ret['class'] = 'sint' + ret["class"] = "sint" return ret # Equivalent C type @property def c_type(self): - return f'std::int{self._size}_t' + return f"std::int{self._size}_t" # Unsigned integer field type. @@ -82,24 +82,24 @@ class _UIntFt(_IntFt): @property def barectf_yaml(self): ret = super().barectf_yaml - ret['class'] = 'uint' + ret["class"] = "uint" return ret # Equivalent C type. @property def c_type(self): - return f'std::uint{self._size}_t' + return f"std::uint{self._size}_t" # Pointer field type. class _PointerFt(_UIntFt): def __init__(self): - super().__init__(64, 'hex') + super().__init__(64, "hex") # Returns the C++ expression to cast the expression `expr` to the C # type of this field type. def cast(self, expr): - return f'static_cast<{self.c_type}>(reinterpret_cast({expr}))' + return f"static_cast<{self.c_type}>(reinterpret_cast({expr}))" # Enumeration field type (abstract). 
@@ -122,7 +122,7 @@ def barectf_yaml(self): for name, val in self._mappings.items(): mappings[name] = [val] - ret['mappings'] = mappings + ret["mappings"] = mappings return ret @@ -132,7 +132,7 @@ class _UEnumFt(_EnumFt, _UIntFt): @property def barectf_yaml(self): ret = super().barectf_yaml - ret['class'] = 'uenum' + ret["class"] = "uenum" return ret @@ -142,7 +142,7 @@ class _SEnumFt(_EnumFt, _UIntFt): @property def barectf_yaml(self): ret = super().barectf_yaml - ret['class'] = 'senum' + ret["class"] = "senum" return ret @@ -152,7 +152,7 @@ class _OptStrFt: @property def barectf_yaml(self): return { - 'class': 'str', + "class": "str", } @@ -175,18 +175,18 @@ def size(self): @property def barectf_yaml(self): return { - 'class': 'real', - 'size': self._size, + "class": "real", + "size": self._size, } # Equivalent C type. @property def c_type(self): if self._size == 32: - return 'float' + return "float" else: assert self._size == 64 - return 'double' + return "double" # Event record type. @@ -210,16 +210,16 @@ def members(self): class _BeginErt(_Ert): # Name of event record type depending on the API prefix. def name(self, api_prefix): - suffix = '_begin' if api_prefix == 'hsa' else 'Begin' - return f'{self._api_func_name}{suffix}' + suffix = "_begin" if api_prefix == "hsa" else "Begin" + return f"{self._api_func_name}{suffix}" # End event record type. class _EndErt(_Ert): # Name of event record type depending on the API prefix. def name(self, api_prefix): - suffix = '_end' if api_prefix == 'hsa' else 'End' - return f'{self._api_func_name}{suffix}' + suffix = "_end" if api_prefix == "hsa" else "End" + return f"{self._api_func_name}{suffix}" # Event record type member. @@ -251,20 +251,20 @@ def ft(self): # This is an unconditional assertion. def _make_sure(cond, error_msg): if not cond: - print(f'Error: {error_msg}', file=sys.stderr) + print(f"Error: {error_msg}", file=sys.stderr) sys.exit(1) def _enumerator_effective_val(enum_val): # Try the value, but this value may be a string (an # enumerator/definition). - val = enum_val.get('value') + val = enum_val.get("value") if type(val) is int: return val # Try the raw value. - val = enum_val.get('raw_value') + val = enum_val.get("raw_value") if val is not None: if type(val) is int: @@ -277,58 +277,61 @@ def _enumerator_effective_val(enum_val): except: pass - _make_sure(False, - f'Cannot get the integral value of enumerator `{enum_val["name"]}`') + _make_sure(False, f'Cannot get the integral value of enumerator `{enum_val["name"]}`') # Returns the equivalent field type of the C type `c_type`. def _number_ft_from_c_type(cpp_header, c_type): # Check for known enumeration. - m = re.match(r'(?:enum\s+)?(\w+)', c_type) + m = re.match(r"(?:enum\s+)?(\w+)", c_type) if m: size = 32 for enum_info in cpp_header.enums: - if m.group(1) == enum_info.get('name'): + if m.group(1) == enum_info.get("name"): # Fill enumeration field type mappings. mappings = { - str(v['name']): _enumerator_effective_val(v) - for v in enum_info['values'] + str(v["name"]): _enumerator_effective_val(v) + for v in enum_info["values"] } if len(mappings) == 0: return _SIntFt(64) - if max(mappings.values()) >= 2**31 or min(mappings.values()) < -2**31: + if max(mappings.values()) >= 2**31 or min(mappings.values()) < -( + 2**31 + ): size = 64 - _make_sure(len(mappings) > 0, f'Enumeration `{enum_info["name"]}` is empty') + _make_sure( + len(mappings) > 0, f'Enumeration `{enum_info["name"]}` is empty' + ) # Create corresponding enumeration field type. 
return _SEnumFt(size, mappings) # Find corresponding basic field type. - is_unsigned = 'unsigned' in c_type + is_unsigned = "unsigned" in c_type - if 'long' in c_type: + if "long" in c_type: if is_unsigned: return _UIntFt(64) else: return _SIntFt(64) - elif 'short' in c_type: + elif "short" in c_type: if is_unsigned: return _UIntFt(16) else: return _SIntFt(16) - elif 'char' in c_type: + elif "char" in c_type: if is_unsigned: return _UIntFt(8) else: return _SIntFt(8) - elif 'float' in c_type: + elif "float" in c_type: return _FloatFt(32) - elif 'double' in c_type: + elif "double" in c_type: return _FloatFt(64) else: # Assume `int` (often an unresolved C enumeration). @@ -340,23 +343,23 @@ def _number_ft_from_c_type(cpp_header, c_type): # Returns whether or not a property has a pointer type. def _prop_is_pointer(prop, c_type): - if prop['pointer'] or prop['function_pointer']: + if prop["pointer"] or prop["function_pointer"]: return True - if prop['array'] and 'array_size' in prop: + if prop["array"] and "array_size" in prop: return True - if prop['unresolved']: + if prop["unresolved"]: # HSA API function pointers. - if prop['name'] in ('callback', 'handler'): + if prop["name"] in ("callback", "handler"): return True # HIP API function pointers. - if c_type.endswith('Fn_t'): + if c_type.endswith("Fn_t"): return True # Check the C type itself. - if '*' in c_type or '*' in prop.get('raw_type', ''): + if "*" in c_type or "*" in prop.get("raw_type", ""): return True return False @@ -369,24 +372,24 @@ def _get_ert_members_for_struct(cpp_header, struct, access, member_names): members = [] member_names = member_names.copy() member_names.append(None) - props = struct['properties']['public'] + props = struct["properties"]["public"] for index, prop in enumerate(props): # Property name. - name = prop['name'] + name = prop["name"] # Member names, access, and C type. member_names[-1] = str(name) - this_access = f'{access}.{name}' - c_type = prop['type'] - aliases = prop['aliases'] + this_access = f"{access}.{name}" + c_type = prop["type"] + aliases = prop["aliases"] # Skip no type. - if c_type == '': + if c_type == "": continue # Skip unnamed or union. - if name == '' or 'union' in name or re.match(r'\bunion\b', c_type): + if name == "" or "union" in name or re.match(r"\bunion\b", c_type): continue # Check for known C type alias. @@ -399,8 +402,7 @@ def _get_ert_members_for_struct(cpp_header, struct, access, member_names): c_type = c_type_alias # Check for C string. - if re.match(r'^((const\s+char)|(char\s+const)|char)\s*\*$', - c_type.strip()): + if re.match(r"^((const\s+char)|(char\s+const)|char)\s*\*$", c_type.strip()): members.append(_ErtMember(this_access, member_names, _OptStrFt())) continue @@ -417,13 +419,17 @@ def _get_ert_members_for_struct(cpp_header, struct, access, member_names): sub_struct = cpp_header.classes.get(aliases[0]) if sub_struct is not None: - members += _get_ert_members_for_struct(cpp_header, sub_struct, - this_access, member_names) + members += _get_ert_members_for_struct( + cpp_header, sub_struct, this_access, member_names + ) continue # Use a basic field type. 
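# Editorial aside, not part of the patch: a sketch of the C-type mapping
# implemented above. _number_ft_from_c_type() yields, for example,
#   "unsigned long" -> _UIntFt(64)    "short" -> _SIntFt(16)
#   "unsigned char" -> _UIntFt(8)     "float" -> _FloatFt(32)
# and anything unrecognized falls through to the signed `int` default.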
- members.append(_ErtMember(this_access, member_names, - _number_ft_from_c_type(cpp_header, c_type))) + members.append( + _ErtMember( + this_access, member_names, _number_ft_from_c_type(cpp_header, c_type) + ) + ) return members @@ -439,40 +445,48 @@ def _erts_from_cb_data_struct(api_prefix, cpp_header, retval_info, struct): if retval_info is not None: args_nested_cls_index = 1 retval_members = {} - nested_classes = struct['nested_classes'] - _make_sure(len(nested_classes) >= 1, - f"Return value union doesn't exist in `{struct['name']}`") + nested_classes = struct["nested_classes"] + _make_sure( + len(nested_classes) >= 1, + f"Return value union doesn't exist in `{struct['name']}`", + ) retval_union = nested_classes[0] - for prop in retval_union['properties']['public']: - name = str(prop['name']) - member = _ErtMember(f'GetApiData().{name}', ['retval'], - _number_ft_from_c_type(cpp_header, prop['type'])) - retval_members[prop['name']] = member + for prop in retval_union["properties"]["public"]: + name = str(prop["name"]) + member = _ErtMember( + f"GetApiData().{name}", + ["retval"], + _number_ft_from_c_type(cpp_header, prop["type"]), + ) + retval_members[prop["name"]] = member # Make sure we have everything we need. for api_func_name, retval_name in retval_info.items(): if retval_name is not None: - _make_sure(retval_name in retval_members, - f"Return value union member `{retval_name}` doesn't exist (function {api_func_name}())") + _make_sure( + retval_name in retval_members, + f"Return value union member `{retval_name}` doesn't exist (function {api_func_name}())", + ) # Create beginning/end event record type objects. begin_erts = [] end_erts = [] - nested_classes = struct['nested_classes'][args_nested_cls_index]['nested_classes'] - props = struct['nested_classes'][args_nested_cls_index]['properties']['public'] - _make_sure(len(nested_classes) == len(props), - f'Mismatch between nested structure and member count in `{struct["name"]}`') + nested_classes = struct["nested_classes"][args_nested_cls_index]["nested_classes"] + props = struct["nested_classes"][args_nested_cls_index]["properties"]["public"] + _make_sure( + len(nested_classes) == len(props), + f'Mismatch between nested structure and member count in `{struct["name"]}`', + ) for index, prop in enumerate(props): # API function name is the name of the member. - api_func_name = str(prop['name']) + api_func_name = str(prop["name"]) # Get the parameters. - members = _get_ert_members_for_struct(cpp_header, - nested_classes[index], - f'GetApiData().args.{api_func_name}', - []) + members = _get_ert_members_for_struct( + cpp_header, nested_classes[index], f"GetApiData().args.{api_func_name}", [] + ) # Append new beginning event record type object. begin_erts.append(_BeginErt(api_func_name, members)) @@ -499,7 +513,7 @@ def _erts_from_cb_data_struct(api_prefix, cpp_header, retval_info, struct): # This only applies to the HSA API: for other APIs, this function # returns `None`. def _get_retval_info(path): - if 'hsa' not in os.path.basename(path): + if "hsa" not in os.path.basename(path): return retval_info = {} @@ -508,7 +522,7 @@ def _get_retval_info(path): with open(path) as f: for line in f: if 'out << ")' in line and cur_api_func_name is not None: - m = re.search(r'api_data.(\w+_retval)', line) + m = re.search(r"api_data.(\w+_retval)", line) retval_info[cur_api_func_name] = m.group(1) if m else None else: m = re.search(r'out << "(hsa_\w+)\(";', line) @@ -525,7 +539,7 @@ def _yaml_dst_from_erts(api_prefix, erts): # Base. 
yaml_erts = {} yaml_dst = { - 'event-record-types': yaml_erts, + "event-record-types": yaml_erts, } # Create one event record type per API function. @@ -533,9 +547,9 @@ def _yaml_dst_from_erts(api_prefix, erts): # Base. yaml_members = [] yaml_ert = { - 'payload-field-type': { - 'class': 'struct', - 'members': yaml_members, + "payload-field-type": { + "class": "struct", + "members": yaml_members, }, } @@ -543,11 +557,14 @@ def _yaml_dst_from_erts(api_prefix, erts): for member in ert.members: # barectf doesn't support nested CTF structures, so join # individual member names with `__` to flatten. - yaml_members.append({ - '_' + '__'.join(member.member_names): { - 'field-type': member.ft.barectf_yaml, - }, - }) + yaml_members.append( + { + "_" + + "__".join(member.member_names): { + "field-type": member.ft.barectf_yaml, + }, + } + ) # Add event record type. yaml_erts[ert.name(api_prefix)] = yaml_ert @@ -560,23 +577,23 @@ def _yaml_dst_from_erts(api_prefix, erts): # tracing function depending on the API function operation ID. def _cpp_switch_statement_from_erts(api_prefix, erts): lines = [] - lines.append('switch (GetOp()) {') + lines.append("switch (GetOp()) {") for ert in erts: - lines.append(f' case {api_prefix.upper()}_API_ID_{ert.api_func_name}:') - lines.append(f' barectf_{api_prefix}_api_trace_{ert.name(api_prefix)}(') - lines.append(f' &barectf_ctx,') - lines.append(f' GetThreadId(),') - lines.append(f' GetQueueId(),') - lines.append(f' GetAgentId(),') - lines.append(f' GetCorrelationId(),') + lines.append(f" case {api_prefix.upper()}_API_ID_{ert.api_func_name}:") + lines.append(f" barectf_{api_prefix}_api_trace_{ert.name(api_prefix)}(") + lines.append(f" &barectf_ctx,") + lines.append(f" GetThreadId(),") + lines.append(f" GetQueueId(),") + lines.append(f" GetAgentId(),") + lines.append(f" GetCorrelationId(),") - if api_prefix == 'hip': - lines.append(f' GetKernelName().c_str(),') + if api_prefix == "hip": + lines.append(f" GetKernelName().c_str(),") if len(ert.members) == 0: # Remove last comma. - lines[-1] = lines[-1].replace(',', '') + lines[-1] = lines[-1].replace(",", "") for index, member in enumerate(ert.members): if type(member.ft) is _OptStrFt: @@ -584,17 +601,17 @@ def _cpp_switch_statement_from_erts(api_prefix, erts): # an empty string. lines.append(f' {member.access} ? {member.access} : ""') elif type(member.ft) is _StrFt: - lines.append(f' {member.access}') + lines.append(f" {member.access}") else: - lines.append(f' {member.ft.cast(member.access)}') + lines.append(f" {member.ft.cast(member.access)}") if index + 1 < len(ert.members): - lines[-1] += ',' + lines[-1] += "," - lines.append(' );') - lines.append(' break;') + lines.append(" );") + lines.append(" break;") - lines.append('}') + lines.append("}") return lines @@ -612,29 +629,28 @@ def _process_file(api_prefix, path): # Find callback data structure. for struct_name, struct in cpp_header.classes.items(): - if re.match(r'^' + api_prefix + r'_api_data\w+$', struct_name): + if re.match(r"^" + api_prefix + r"_api_data\w+$", struct_name): # Process callback data structure. - begin_erts, end_erts = _erts_from_cb_data_struct(api_prefix, - cpp_header, - retval_info, - struct) + begin_erts, end_erts = _erts_from_cb_data_struct( + api_prefix, cpp_header, retval_info, struct + ) # Write barectf YAML file. - with open(f'{api_prefix}_erts.yaml', 'w') as f: + with open(f"{api_prefix}_erts.yaml", "w") as f: f.write(_yaml_dst_from_erts(api_prefix, begin_erts + end_erts)) # Write C++ code (beginning event record). 
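# Editorial aside, not part of the patch: for an argument-free HSA function
# such as hsa_init, _cpp_switch_statement_from_erts() above emits roughly:
#
#   switch (GetOp()) {
#     case HSA_API_ID_hsa_init:
#       barectf_hsa_api_trace_hsa_init_begin(
#         &barectf_ctx,
#         GetThreadId(),
#         GetQueueId(),
#         GetAgentId(),
#         GetCorrelationId()
#       );
#       break;
#   }
#
# The trailing comma on the last argument is stripped when the call has no
# payload members, as the code above shows.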
-        with open(f'{api_prefix}_begin.cpp.i', 'w') as f:
-            f.write('\n'.join(_cpp_switch_statement_from_erts(api_prefix,
-                                                              begin_erts)))
+        with open(f"{api_prefix}_begin.cpp.i", "w") as f:
+            f.write(
+                "\n".join(_cpp_switch_statement_from_erts(api_prefix, begin_erts))
+            )

             # Write C++ code (end event record).
-        with open(f'{api_prefix}_end.cpp.i', 'w') as f:
-            f.write('\n'.join(_cpp_switch_statement_from_erts(api_prefix,
-                                                              end_erts)))
+        with open(f"{api_prefix}_end.cpp.i", "w") as f:
+            f.write("\n".join(_cpp_switch_statement_from_erts(api_prefix, end_erts)))


-if __name__ == '__main__':
+if __name__ == "__main__":
     # Disable `CppHeaderParser` printing to standard output.
     CppHeaderParser.CppHeaderParser.print_warnings = 0
     CppHeaderParser.CppHeaderParser.print_errors = 0
diff --git a/plugin/ctf/gen_env_yaml.py b/plugin/ctf/gen_env_yaml.py
index 009f3689..4cf2222a 100644
--- a/plugin/ctf/gen_env_yaml.py
+++ b/plugin/ctf/gen_env_yaml.py
@@ -24,10 +24,14 @@
 import yaml

-if __name__ == '__main__':
-    with open('env.yaml', 'w') as f:
-        f.write(yaml.dump({
-            'environment': {
-                'rocprofiler_version': sys.argv[1],
-            }
-        }))
+if __name__ == "__main__":
+    with open("env.yaml", "w") as f:
+        f.write(
+            yaml.dump(
+                {
+                    "environment": {
+                        "rocprofiler_version": sys.argv[1],
+                    }
+                }
+            )
+        )
diff --git a/plugin/perfetto/CMakeLists.txt b/plugin/perfetto/CMakeLists.txt
index 3db11c13..9c5ea030 100644
--- a/plugin/perfetto/CMakeLists.txt
+++ b/plugin/perfetto/CMakeLists.txt
@@ -1,7 +1,8 @@
 file(GLOB ROCPROFILER_UTIL_SRC_FILES ${PROJECT_SOURCE_DIR}/src/utils/helper.cpp)

-add_library(perfetto_plugin ${LIBRARY_TYPE} ${ROCPROFILER_UTIL_SRC_FILES} perfetto.cpp
-            perfetto_sdk/sdk/perfetto.cc)
+add_subdirectory(perfetto_sdk)
+
+add_library(perfetto_plugin ${LIBRARY_TYPE} ${ROCPROFILER_UTIL_SRC_FILES} perfetto.cpp)

 set_target_properties(
     perfetto_plugin
@@ -13,16 +14,14 @@ set_target_properties(
 target_compile_definitions(perfetto_plugin PRIVATE HIP_PROF_HIP_API_STRING=1
                                                    __HIP_PLATFORM_AMD__=1)

-target_include_directories(
-    perfetto_plugin PRIVATE ${PROJECT_SOURCE_DIR}
-                            ${PROJECT_SOURCE_DIR}/plugin/perfetto/perfetto_sdk/sdk)
+target_include_directories(perfetto_plugin PRIVATE ${PROJECT_SOURCE_DIR})

 target_link_options(
     perfetto_plugin PRIVATE
     -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exportmap -Wl,--no-undefined)

-target_link_libraries(perfetto_plugin PRIVATE rocprofiler-v2 Threads::Threads stdc++fs
-                                              amd_comgr)
+target_link_libraries(perfetto_plugin PRIVATE rocprofiler-v2 rocprofiler::perfetto-sdk
+                                              Threads::Threads stdc++fs amd_comgr)

 install(TARGETS perfetto_plugin LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}/${PROJECT_NAME}
         COMPONENT plugins)
diff --git a/plugin/perfetto/perfetto_sdk/CMakeLists.txt b/plugin/perfetto/perfetto_sdk/CMakeLists.txt
new file mode 100644
index 00000000..385d0820
--- /dev/null
+++ b/plugin/perfetto/perfetto_sdk/CMakeLists.txt
@@ -0,0 +1,8 @@
+set(CMAKE_CXX_CLANG_TIDY)
+set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+
+add_library(rocprofiler-perfetto-sdk STATIC sdk/perfetto.h sdk/perfetto.cc)
+add_library(rocprofiler::perfetto-sdk ALIAS rocprofiler-perfetto-sdk)
+
+target_include_directories(rocprofiler-perfetto-sdk
+                           PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/sdk>)
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 00000000..585d9d4c
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,25 @@
+
+[tool.black]
+line-length = 90
+target-version = ['py36', 'py37', 'py38', 'py39', 'py310']
+include = '\.py$'
+exclude = '''
+(
+    /(
+        \.eggs
+        | \.git
+        | \.github
+        | \.tox
+        | \.venv
+        | \.misc
+        | \.vscode
+        | \.cache
+        | \.pytest_cache
+        | dist
+        | external
+        | build
+        | build-release
+        | build-rocprofiler
+    )/
+)
+'''
diff --git a/requirements.txt b/requirements.txt
index 6cf1c143..69a7c594 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,13 +1,10 @@
-barectf==3.1.1
-bcrypt==3.2.0
-CppHeaderParser==2.7.4
-lxml==4.9.2
-matplotlib==3.7.1
-pandas==2.0.2
-plotly==5.15.0
-ply==3.11
-protobuf==3.20.3
-pycparser==2.21
-pyparsing==3.0.9
-websocket-client==1.5.2
-websockets==11.0.3
+barectf
+bcrypt
+CppHeaderParser
+lxml
+matplotlib
+pandas
+protobuf
+pycparser
+pyparsing
+websockets
\ No newline at end of file
diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt
index 6b280148..2eaadd49 100644
--- a/samples/CMakeLists.txt
+++ b/samples/CMakeLists.txt
@@ -23,6 +23,9 @@ set(CMAKE_EXECUTABLE_RPATH_LINK_HIP_FLAG ${CMAKE_SHARED_LIBRARY_RPATH_LINK_CXX_F
 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${ROCM_PATH}/lib/cmake/hip")
 set(CMAKE_HIP_ARCHITECTURES OFF)

+if(DEFINED ROCM_PATH)
+    set(HIP_ROOT_DIR "${ROCM_PATH}/bin")
+endif()
 find_package(HIP REQUIRED MODULE)

 find_package(
@@ -36,7 +39,7 @@ find_package(LibElf REQUIRED)
 find_package(LibDw REQUIRED)

 # Add a custom targets to build and run all the tests
-add_custom_target(samples)
+add_custom_target(samples ALL)
 add_dependencies(samples rocprofiler-v2)
 add_custom_target(
     run-samples
@@ -46,6 +49,32 @@ add_custom_target(
 file(GLOB ROCPROFILER_UTIL_SRC_FILES ${PROJECT_SOURCE_DIR}/src/utils/helper.cpp)

 # ########################################################################################
+function(rocprofiler_sample_add_test _TARGET _ARGS)
+    if(TARGET ${_TARGET})
+        if(NOT TEST ${_TARGET})
+            add_test(
+                NAME ${_TARGET}
+                COMMAND $<TARGET_FILE:${_TARGET}> ${_ARGS}
+                WORKING_DIRECTORY ${PROJECT_BINARY_DIR})
+        endif()
+        set_tests_properties(
+            ${_TARGET}
+            PROPERTIES
+                LABELS
+                "samples"
+                ENVIRONMENT
+                "ROCPROFILER_METRICS_PATH=${PROJECT_BINARY_DIR}/libexec/rocprofiler/counters/derived_counters.xml;${ROCPROFILER_MEMCHECK_PRELOAD_ENV}"
+                RUN_SERIAL
+                TRUE
+                ${ARGN})
+    endif()
+endfunction()
+
+function(rocprofiler_sample_add_executable _TARGET)
+    hip_add_executable(${_TARGET} ${ARGN})
+    rocprofiler_sample_add_test(${_TARGET} "")
+endfunction()
+
 # ########################################################################################
 # ########################################################################################
 # Samples Build & Run Script
@@ -59,7 +88,7 @@ file(GLOB ROCPROFILER_UTIL_SRC_FILES ${PROJECT_SOURCE_DIR}/src/utils/helper.cpp)
 # Build Kernel No Replay Sample
 set_source_files_properties(profiler/kernel_profiling_no_replay_sample.cpp
                             PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1)
-hip_add_executable(
+rocprofiler_sample_add_executable(
     profiler_kernel_no_replay profiler/kernel_profiling_no_replay_sample.cpp
     ${ROCPROFILER_UTIL_SRC_FILES})
 target_include_directories(
@@ -75,8 +104,9 @@ install(TARGETS profiler_kernel_no_replay
 # Build Device Profiling Sample
 set_source_files_properties(profiler/device_profiling_sample.cpp
                             PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1)
-hip_add_executable(profiler_device_profiling profiler/device_profiling_sample.cpp
-                   ${ROCPROFILER_UTIL_SRC_FILES})
+rocprofiler_sample_add_executable(
+    profiler_device_profiling profiler/device_profiling_sample.cpp
+    ${ROCPROFILER_UTIL_SRC_FILES})
 target_include_directories(
     profiler_device_profiling PRIVATE ${PROJECT_SOURCE_DIR}
                                       ${CMAKE_CURRENT_SOURCE_DIR}/common)
@@ -86,12 +116,14 @@ add_dependencies(samples profiler_device_profiling)
 install(TARGETS profiler_device_profiling
         RUNTIME DESTINATION
${CMAKE_INSTALL_DATAROOTDIR}/${PROJECT_NAME}/samples COMPONENT samples) +set_tests_properties(profiler_device_profiling PROPERTIES DISABLED TRUE) # Build Counters Sampling example set_source_files_properties(counters_sampler/pcie_counters_example.cpp PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) -hip_add_executable(pcie_counters_sampler counters_sampler/pcie_counters_example.cpp - ${ROCPROFILER_UTIL_SRC_FILES}) +rocprofiler_sample_add_executable( + pcie_counters_sampler counters_sampler/pcie_counters_example.cpp + ${ROCPROFILER_UTIL_SRC_FILES}) target_include_directories( pcie_counters_sampler PRIVATE ${PROJECT_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/common) @@ -105,7 +137,7 @@ install(TARGETS pcie_counters_sampler # Build XGMI Counters Sampling example set_source_files_properties(counters_sampler/xgmi_counters_sampler_example.cpp PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) -hip_add_executable( +rocprofiler_sample_add_executable( xgmi_counters_sampler counters_sampler/xgmi_counters_sampler_example.cpp ${ROCPROFILER_UTIL_SRC_FILES}) target_include_directories( @@ -117,6 +149,7 @@ add_dependencies(samples xgmi_counters_sampler) install(TARGETS xgmi_counters_sampler RUNTIME DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/${PROJECT_NAME}/samples COMPONENT samples) +set_tests_properties(xgmi_counters_sampler PROPERTIES DISABLED TRUE) # ######################################################################################## @@ -126,7 +159,8 @@ install(TARGETS xgmi_counters_sampler # Build HIP/HSA Trace Sample set_source_files_properties(tracer/sample.cpp PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) -hip_add_executable(tracer_hip_hsa tracer/sample.cpp ${ROCPROFILER_UTIL_SRC_FILES}) +rocprofiler_sample_add_executable(tracer_hip_hsa tracer/sample.cpp + ${ROCPROFILER_UTIL_SRC_FILES}) target_include_directories(tracer_hip_hsa PRIVATE ${PROJECT_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/common) target_link_libraries(tracer_hip_hsa PRIVATE rocprofiler-v2 amd_comgr) @@ -139,8 +173,8 @@ install(TARGETS tracer_hip_hsa # Build HIP/HSA Trace with async output api trace data Sample set_source_files_properties(tracer/sample_async.cpp PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) -hip_add_executable(tracer_hip_hsa_async tracer/sample_async.cpp - ${ROCPROFILER_UTIL_SRC_FILES}) +rocprofiler_sample_add_executable(tracer_hip_hsa_async tracer/sample_async.cpp + ${ROCPROFILER_UTIL_SRC_FILES}) target_include_directories( tracer_hip_hsa_async PRIVATE ${PROJECT_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/common) target_link_libraries(tracer_hip_hsa_async PRIVATE rocprofiler-v2 amd_comgr) @@ -162,6 +196,7 @@ hip_add_executable( pc_sampling_code_printing ${PC_SAMPLING_CODE_PRINTING_FILES} HIPCC_OPTIONS -std=c++17 # Include debugging symbols and source for the contextual disassembly -gdwarf-4) +rocprofiler_sample_add_test(pc_sampling_code_printing "-d;0;-n;100000000;10;43532") check_c_source_compiles( " diff --git a/samples/profiler/device_profiling_sample.cpp b/samples/profiler/device_profiling_sample.cpp index c6ad3f85..98d1495e 100644 --- a/samples/profiler/device_profiling_sample.cpp +++ b/samples/profiler/device_profiling_sample.cpp @@ -24,7 +24,7 @@ int main(int argc, char** argv) { int gpu_agent = 0; int cpu_agent = 0; CHECK_ROCPROFILER(rocprofiler_device_profiling_session_create( - &counters[0], counters.size(), &dp_session_id, gpu_agent, cpu_agent)); + &counters[0], counters.size(), &dp_session_id, cpu_agent, gpu_agent)); printf("session start \n"); // start GPU device profiling diff --git a/script/address-sanitizer-suppr.txt 
b/script/address-sanitizer-suppr.txt
new file mode 100644
index 00000000..e69de29b
diff --git a/script/gen_ostream_ops.py b/script/gen_ostream_ops.py
index acd0fdf0..9084d73f 100755
--- a/script/gen_ostream_ops.py
+++ b/script/gen_ostream_ops.py
@@ -27,118 +27,164 @@
 import argparse
 import string

-LICENSE = \
-'/*\n' + \
-'Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved.\n' + \
-'\n' + \
-'Permission is hereby granted, free of charge, to any person obtaining a copy\n' + \
-'of this software and associated documentation files (the "Software"), to deal\n' + \
-'in the Software without restriction, including without limitation the rights\n' + \
-'to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n' + \
-'copies of the Software, and to permit persons to whom the Software is\n' + \
-'furnished to do so, subject to the following conditions:\n' + \
-'\n' + \
-'The above copyright notice and this permission notice shall be included in\n' + \
-'all copies or substantial portions of the Software.\n' + \
-'\n' + \
-'THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n' + \
-'IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n' + \
-'FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n' + \
-'AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n' + \
-'LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n' + \
-'OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN\n' + \
-'THE SOFTWARE.\n' + \
-'*/\n'
-
-
-header_basic = \
-'namespace detail {\n' + \
-'template <typename T>\n' + \
-' inline static std::ostream& operator<<(std::ostream& out, const T& v) {\n' + \
-' using std::operator<<;\n' + \
-' static bool recursion = false;\n' + \
-' if (recursion == false) { recursion = true; out << v; recursion = false; }\n' + \
-' return out;\n }\n' + \
-'\n' + \
-' inline static std::ostream &operator<<(std::ostream &out, const unsigned char &v) {\n' + \
-' out << (unsigned int)v;\n' + \
-' return out;\n }\n' + \
-'\n' + \
-' inline static std::ostream &operator<<(std::ostream &out, const char &v) {\n' + \
-' out << (unsigned char)v;\n' + \
-' return out;\n }\n'
+LICENSE = (
+    "/*\n"
+    + "Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved.\n"
+    + "\n"
+    + "Permission is hereby granted, free of charge, to any person obtaining a copy\n"
+    + 'of this software and associated documentation files (the "Software"), to deal\n'
+    + "in the Software without restriction, including without limitation the rights\n"
+    + "to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n"
+    + "copies of the Software, and to permit persons to whom the Software is\n"
+    + "furnished to do so, subject to the following conditions:\n"
+    + "\n"
+    + "The above copyright notice and this permission notice shall be included in\n"
+    + "all copies or substantial portions of the Software.\n"
+    + "\n"
+    + 'THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n'
+    + "IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n"
+    + "FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n"
+    + "AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n"
+    + "LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n"
+    + "OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN\n"
+    + "THE SOFTWARE.\n"
+    + "*/\n"
+)
+
+
+header_basic = (
+    "namespace detail {\n"
+    + "template <typename T>\n"
+    + " inline static std::ostream& operator<<(std::ostream& out, const T& v) {\n"
+    + " using std::operator<<;\n"
+    + " static bool recursion = false;\n"
+    + " if (recursion == false) { recursion = true; out << v; recursion = false; }\n"
+    + " return out;\n }\n"
+    + "\n"
+    + " inline static std::ostream &operator<<(std::ostream &out, const unsigned char &v) {\n"
+    + " out << (unsigned int)v;\n"
+    + " return out;\n }\n"
+    + "\n"
+    + " inline static std::ostream &operator<<(std::ostream &out, const char &v) {\n"
+    + " out << (unsigned char)v;\n"
+    + " return out;\n }\n"
+)

 structs_analyzed = {}
-global_ops = ''
-global_str = ''
+global_ops = ""
+global_str = ""
 output_filename_h = None
 apiname = ""

+
 # process_struct traverses recursively all structs to extract all fields
 def process_struct(file_handle, cppHeader_struct, cppHeader, parent_hier_name, apiname):
-# file_handle: handle for output file {api_name}_ostream_ops.h to be generated
-# cppHeader_struct: cppHeader struct being processed
-# cppHeader: cppHeader object created by CppHeaderParser.CppHeader(...)
-# parent_hier_name: parent hierarchical name used for nested structs/enums
-# apiname: for example hip.
+    # file_handle: handle for output file {api_name}_ostream_ops.h to be generated
+    # cppHeader_struct: cppHeader struct being processed
+    # cppHeader: cppHeader object created by CppHeaderParser.CppHeader(...)
+    # parent_hier_name: parent hierarchical name used for nested structs/enums
+    # apiname: for example hip.
     global global_str
-    if cppHeader_struct == 'max_align_t': #function pointers not working in cppheaderparser
+    if (
+        cppHeader_struct == "max_align_t"
+    ):  # function pointers not working in cppheaderparser
         return
     if cppHeader_struct not in cppHeader.classes:
         return
     if cppHeader_struct in structs_analyzed:
         return
     structs_analyzed[cppHeader_struct] = 1
-    for l in reversed(range(len(cppHeader.classes[cppHeader_struct]["properties"]["public"]))):
-        key = 'name'
+    for l in reversed(
+        range(len(cppHeader.classes[cppHeader_struct]["properties"]["public"]))
+    ):
+        key = "name"
         name = ""
         if key in cppHeader.classes[cppHeader_struct]["properties"]["public"][l]:
-            if parent_hier_name != '':
-                name = parent_hier_name + '.' + cppHeader.classes[cppHeader_struct]["properties"]["public"][l][key]
-            else:
-                name = cppHeader.classes[cppHeader_struct]["properties"]["public"][l][key]
-            if name == '':
-                continue
-            key2 = 'type'
+            if parent_hier_name != "":
+                name = (
+                    parent_hier_name
+                    + "."
+ + cppHeader.classes[cppHeader_struct]["properties"]["public"][l][key] + ) + else: + name = cppHeader.classes[cppHeader_struct]["properties"]["public"][l][key] + if name == "": + continue + key2 = "type" mtype = "" if key2 in cppHeader.classes[cppHeader_struct]["properties"]["public"][l]: mtype = cppHeader.classes[cppHeader_struct]["properties"]["public"][l][key2] - if mtype == '': - continue - key3 = 'array_size' + if mtype == "": + continue + key3 = "array_size" array_size = "" if key3 in cppHeader.classes[cppHeader_struct]["properties"]["public"][l]: - array_size = cppHeader.classes[cppHeader_struct]["properties"]["public"][l][key3] - key4 = 'property_of_class' + array_size = cppHeader.classes[cppHeader_struct]["properties"]["public"][l][ + key3 + ] + key4 = "property_of_class" prop = "" - if key4 in cppHeader.classes[cppHeader_struct]["properties"]["public"][l]: + if key4 in cppHeader.classes[cppHeader_struct]["properties"]["public"][l]: prop = cppHeader.classes[cppHeader_struct]["properties"]["public"][l][key4] - str = '' + str = "" if "union" not in mtype: indent = "" - str += " if (std::string(\"" + cppHeader_struct + "::" + name + "\").find(" + apiname.upper() + "_structs_regex" + ") != std::string::npos) {\n" + str += ( + ' if (std::string("' + + cppHeader_struct + + "::" + + name + + '").find(' + + apiname.upper() + + "_structs_regex" + + ") != std::string::npos) {\n" + ) indent = " " - str += indent + " roctracer::" + apiname.lower() + "_support::detail::operator<<(out, \"" + name + "=\");\n" - str += indent + " roctracer::" + apiname.lower() + "_support::detail::operator<<(out, v." + name + ");\n" - str += indent + " roctracer::" + apiname.lower() + "_support::detail::operator<<(out, \", \");\n" + str += ( + indent + + " roctracer::" + + apiname.lower() + + '_support::detail::operator<<(out, "' + + name + + '=");\n' + ) + str += ( + indent + + " roctracer::" + + apiname.lower() + + "_support::detail::operator<<(out, v." 
+ + name + + ");\n" + ) + str += ( + indent + + " roctracer::" + + apiname.lower() + + '_support::detail::operator<<(out, ", ");\n' + ) str += " }\n" if "void" not in mtype: global_str += str else: - if prop != '': - next_cppHeader_struct = prop + "::" - process_struct(file_handle, next_cppHeader_struct, cppHeader, name, apiname) - next_cppHeader_struct = prop + "::" + mtype + " " - process_struct(file_handle, next_cppHeader_struct, cppHeader, name, apiname) + if prop != "": + next_cppHeader_struct = prop + "::" + process_struct( + file_handle, next_cppHeader_struct, cppHeader, name, apiname + ) + next_cppHeader_struct = prop + "::" + mtype + " " + process_struct( + file_handle, next_cppHeader_struct, cppHeader, name, apiname + ) next_cppHeader_struct = cppHeader_struct + "::" process_struct(file_handle, next_cppHeader_struct, cppHeader, name, apiname) + # Parses API header file and generates ostream ops files ostream_ops.h def gen_cppheader(infilepath, outfilepath, rank): -# infilepath: API Header file to be parsed -# outfilepath: Output file where ostream operators are written + # infilepath: API Header file to be parsed + # outfilepath: Output file where ostream operators are written global global_ops global output_filename_h global apiname @@ -149,94 +195,142 @@ def gen_cppheader(infilepath, outfilepath, rank): print(e) sys.exit(1) if rank == 0 or rank == 2: - mpath = os.path.dirname(outfilepath) - if mpath == "": - mpath = os.getcwd() - apiname = outfilepath.replace(mpath + "/","") - output_filename_h = open(outfilepath,"w+") - apiname = apiname.replace("_ostream_ops.h","") - apiname = apiname.upper() - output_filename_h.write("// automatically generated\n") - output_filename_h.write(LICENSE + '\n') - header_s = \ - '#ifndef INC_' + apiname + '_OSTREAM_OPS_H_\n' + \ - '#define INC_' + apiname + '_OSTREAM_OPS_H_\n' + \ - '\n' + \ - '#include "src/core/session/tracer/src/roctracer.h"\n' + \ - '\n' + \ - '#ifdef __cplusplus\n' + \ - '#include \n' + \ - '#include \n' - - output_filename_h.write(header_s) - output_filename_h.write('\n') - output_filename_h.write('namespace roctracer {\n') - output_filename_h.write('namespace ' + apiname.lower() + '_support {\n') - output_filename_h.write('static int ' + apiname.upper() + '_depth_max = 1;\n') - output_filename_h.write('static int ' + apiname.upper() + '_depth_max_cnt = 0;\n') - output_filename_h.write('static std::string ' + apiname.upper() + '_structs_regex = \"\";\n') - output_filename_h.write('// begin ostream ops for '+ apiname + ' \n') - output_filename_h.write("// basic ostream ops\n") - output_filename_h.write(header_basic) - output_filename_h.write("// End of basic ostream ops\n\n") + mpath = os.path.dirname(outfilepath) + if mpath == "": + mpath = os.getcwd() + apiname = outfilepath.replace(mpath + "/", "") + output_filename_h = open(outfilepath, "w+") + apiname = apiname.replace("_ostream_ops.h", "") + apiname = apiname.upper() + output_filename_h.write("// automatically generated\n") + output_filename_h.write(LICENSE + "\n") + header_s = ( + "#ifndef INC_" + + apiname + + "_OSTREAM_OPS_H_\n" + + "#define INC_" + + apiname + + "_OSTREAM_OPS_H_\n" + + "\n" + + '#include "src/core/session/tracer/src/roctracer.h"\n' + + "\n" + + "#ifdef __cplusplus\n" + + "#include \n" + + "#include \n" + ) + + output_filename_h.write(header_s) + output_filename_h.write("\n") + output_filename_h.write("namespace roctracer {\n") + output_filename_h.write("namespace " + apiname.lower() + "_support {\n") + output_filename_h.write("static int " + 
apiname.upper() + "_depth_max = 1;\n") + output_filename_h.write("static int " + apiname.upper() + "_depth_max_cnt = 0;\n") + output_filename_h.write( + "static std::string " + apiname.upper() + '_structs_regex = "";\n' + ) + output_filename_h.write("// begin ostream ops for " + apiname + " \n") + output_filename_h.write("// basic ostream ops\n") + output_filename_h.write(header_basic) + output_filename_h.write("// End of basic ostream ops\n\n") for c in cppHeader.classes: - if c[-2] == ':' and c[-1] == ':': continue #ostream operator cannot be overloaded for anonymous struct therefore it is skipped + if c[-2] == ":" and c[-1] == ":": + continue # ostream operator cannot be overloaded for anonymous struct therefore it is skipped if "union" in c: continue if c in structs_analyzed: continue - if c == 'max_align_t' or c == '__fsid_t': # Skipping as it is defined in multiple domains - continue + if ( + c == "max_align_t" or c == "__fsid_t" + ): # Skipping as it is defined in multiple domains + continue if len(cppHeader.classes[c]["properties"]["public"]) != 0: - output_filename_h.write("inline static std::ostream& operator<<(std::ostream& out, const " + c + "& v)\n") - output_filename_h.write("{\n") - output_filename_h.write(" std::operator<<(out, '{');\n") - output_filename_h.write(" " + apiname.upper() + "_depth_max_cnt++;\n") - output_filename_h.write(" if (" + apiname.upper() + "_depth_max == -1 || " + apiname.upper() + "_depth_max_cnt <= " + apiname.upper() + "_depth_max" + ") {\n" ) - process_struct(output_filename_h, c, cppHeader, "", apiname) - global_str = "\n".join(global_str.split("\n")[0:-3]) - if global_str != '': global_str += "\n }\n" - output_filename_h.write(global_str) - output_filename_h.write(" };\n") - output_filename_h.write(" " + apiname.upper() + "_depth_max_cnt--;\n") - output_filename_h.write(" std::operator<<(out, '}');\n") - output_filename_h.write(" return out;\n") - output_filename_h.write("}\n") - global_str = '' - global_ops += "inline static std::ostream& operator<<(std::ostream& out, const " + c + "& v)\n" + "{\n" + " roctracer::" + apiname.lower() + "_support::detail::operator<<(out, v);\n" + " return out;\n" + "}\n\n" + output_filename_h.write( + "inline static std::ostream& operator<<(std::ostream& out, const " + + c + + "& v)\n" + ) + output_filename_h.write("{\n") + output_filename_h.write(" std::operator<<(out, '{');\n") + output_filename_h.write(" " + apiname.upper() + "_depth_max_cnt++;\n") + output_filename_h.write( + " if (" + + apiname.upper() + + "_depth_max == -1 || " + + apiname.upper() + + "_depth_max_cnt <= " + + apiname.upper() + + "_depth_max" + + ") {\n" + ) + process_struct(output_filename_h, c, cppHeader, "", apiname) + global_str = "\n".join(global_str.split("\n")[0:-3]) + if global_str != "": + global_str += "\n }\n" + output_filename_h.write(global_str) + output_filename_h.write(" };\n") + output_filename_h.write(" " + apiname.upper() + "_depth_max_cnt--;\n") + output_filename_h.write(" std::operator<<(out, '}');\n") + output_filename_h.write(" return out;\n") + output_filename_h.write("}\n") + global_str = "" + global_ops += ( + "inline static std::ostream& operator<<(std::ostream& out, const " + + c + + "& v)\n" + + "{\n" + + " roctracer::" + + apiname.lower() + + "_support::detail::operator<<(out, v);\n" + + " return out;\n" + + "}\n\n" + ) if rank == 1 or rank == 2: - footer = '// end ostream ops for '+ apiname + ' \n' - footer += '};};};\n\n' - output_filename_h.write(footer) - output_filename_h.write(global_ops) - footer = '#endif 
//__cplusplus\n' + \ - '#endif // INC_' + apiname + '_OSTREAM_OPS_H_\n' + \ - ' \n' - output_filename_h.write(footer) - output_filename_h.write('#include ') - output_filename_h.close() - print('File ' + outfilepath + ' generated') + footer = "// end ostream ops for " + apiname + " \n" + footer += "};};};\n\n" + output_filename_h.write(footer) + output_filename_h.write(global_ops) + footer = ( + "#endif //__cplusplus\n" + + "#endif // INC_" + + apiname + + "_OSTREAM_OPS_H_\n" + + " \n" + ) + output_filename_h.write(footer) + output_filename_h.write("#include ") + output_filename_h.close() + print("File " + outfilepath + " generated") return -parser = argparse.ArgumentParser(description='genOstreamOps.py: generates ostream operators for all typedefs in provided input file.') -requiredNamed = parser.add_argument_group('Required arguments') -requiredNamed.add_argument('-in', metavar='fileList', help='Comma separated list of header files to be parsed', required=True) -requiredNamed.add_argument('-out', metavar='file', help='Output file with ostream operators', required=True) + +parser = argparse.ArgumentParser( + description="genOstreamOps.py: generates ostream operators for all typedefs in provided input file." +) +requiredNamed = parser.add_argument_group("Required arguments") +requiredNamed.add_argument( + "-in", + metavar="fileList", + help="Comma separated list of header files to be parsed", + required=True, +) +requiredNamed.add_argument( + "-out", metavar="file", help="Output file with ostream operators", required=True +) args = vars(parser.parse_args()) -if __name__ == '__main__': - flist = args['in'].split(',') - if len(flist) == 1: - gen_cppheader(flist[0], args['out'],2) - else: - for i in range(len(flist)): - if i == 0: - gen_cppheader(flist[i], args['out'],0) - elif i == len(flist)-1: - gen_cppheader(flist[i], args['out'],1) - else: - gen_cppheader(flist[i], args['out'],-1) +if __name__ == "__main__": + flist = args["in"].split(",") + if len(flist) == 1: + gen_cppheader(flist[0], args["out"], 2) + else: + for i in range(len(flist)): + if i == 0: + gen_cppheader(flist[i], args["out"], 0) + elif i == len(flist) - 1: + gen_cppheader(flist[i], args["out"], 1) + else: + gen_cppheader(flist[i], args["out"], -1) diff --git a/script/hsaap.py b/script/hsaap.py index 784a6432..153a5e00 100755 --- a/script/hsaap.py +++ b/script/hsaap.py @@ -25,507 +25,610 @@ from __future__ import print_function import os, sys, re -H_OUT='hsa_prof_str.h' -CPP_OUT='hsa_prof_str.inline.h' -API_TABLES_H = 'hsa_api_trace.h' +H_OUT = "hsa_prof_str.h" +CPP_OUT = "hsa_prof_str.inline.h" +API_TABLES_H = "hsa_api_trace.h" API_HEADERS_H = ( - ('CoreApi', 'hsa.h'), - ('AmdExt', 'hsa_ext_amd.h'), - ('ImageExt', 'hsa_ext_image.h'), - ('AmdExt', API_TABLES_H), + ("CoreApi", "hsa.h"), + ("AmdExt", "hsa_ext_amd.h"), + ("ImageExt", "hsa_ext_image.h"), + ("AmdExt", API_TABLES_H), +) + +LICENSE = ( + "/* Copyright (c) 2018-2022 Advanced Micro Devices, Inc.\n" + + "\n" + + " Permission is hereby granted, free of charge, to any person obtaining a copy\n" + + ' of this software and associated documentation files (the "Software"), to deal\n' + + " in the Software without restriction, including without limitation the rights\n" + + " to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n" + + " copies of the Software, and to permit persons to whom the Software is\n" + + " furnished to do so, subject to the following conditions:\n" + + "\n" + + " The above copyright notice and this permission notice shall be 
included in\n" + + " all copies or substantial portions of the Software.\n" + + "\n" + + ' THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n' + + " IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n" + + " FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n" + + " AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n" + + " LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n" + + " OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN\n" + + " THE SOFTWARE. */\n" ) -LICENSE = \ -'/* Copyright (c) 2018-2022 Advanced Micro Devices, Inc.\n' + \ -'\n' + \ -' Permission is hereby granted, free of charge, to any person obtaining a copy\n' + \ -' of this software and associated documentation files (the "Software"), to deal\n' + \ -' in the Software without restriction, including without limitation the rights\n' + \ -' to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n' + \ -' copies of the Software, and to permit persons to whom the Software is\n' + \ -' furnished to do so, subject to the following conditions:\n' + \ -'\n' + \ -' The above copyright notice and this permission notice shall be included in\n' + \ -' all copies or substantial portions of the Software.\n' + \ -'\n' + \ -' THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n' + \ -' IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n' + \ -' FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n' + \ -' AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n' + \ -' LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n' + \ -' OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN\n' + \ -' THE SOFTWARE. 
*/\n' ############################################################# # Error handler def fatal(module, msg): - print (module + ' Error: "' + msg + '"', file = sys.stderr) - sys.exit(1) + print(module + ' Error: "' + msg + '"', file=sys.stderr) + sys.exit(1) + # Get next text block def NextBlock(pos, record): - if len(record) == 0: return pos - - space_pattern = re.compile(r'(\s+)') - word_pattern = re.compile(r'([\w\*]+)') - if record[pos] != '(': - m = space_pattern.match(record, pos) - if not m: - m = word_pattern.match(record, pos) - if m: - return pos + len(m.group(1)) + if len(record) == 0: + return pos + + space_pattern = re.compile(r"(\s+)") + word_pattern = re.compile(r"([\w\*]+)") + if record[pos] != "(": + m = space_pattern.match(record, pos) + if not m: + m = word_pattern.match(record, pos) + if m: + return pos + len(m.group(1)) + else: + fatal("NextBlock", "bad record '" + record + "' pos(" + str(pos) + ")") else: - fatal('NextBlock', "bad record '" + record + "' pos(" + str(pos) + ")") - else: - count = 0 - for index in range(pos, len(record)): - if record[index] == '(': - count = count + 1 - elif record[index] == ')': - count = count - 1 - if count == 0: - index = index + 1 - break - if count != 0: - fatal('NextBlock', "count is not zero (" + str(count) + ")") - if record[index - 1] != ')': - fatal('NextBlock', "last char is not ')' '" + record[index - 1] + "'") - return index + count = 0 + for index in range(pos, len(record)): + if record[index] == "(": + count = count + 1 + elif record[index] == ")": + count = count - 1 + if count == 0: + index = index + 1 + break + if count != 0: + fatal("NextBlock", "count is not zero (" + str(count) + ")") + if record[index - 1] != ")": + fatal("NextBlock", "last char is not ')' '" + record[index - 1] + "'") + return index + ############################################################# # API table parser class class API_TableParser: - def fatal(self, msg): - fatal('API_TableParser', msg) - - def __init__(self, header, name): - self.name = name - - if not os.path.isfile(header): - self.fatal("file '" + header + "' not found") - - self.inp = open(header, 'r') - - self.beg_pattern = re.compile('^\s*struct\s+' + name + 'Table\s*{\s*$') - self.end_pattern = re.compile('^\s*};\s*$') - self.array = [] - self.parse() - - # normalizing a line - def norm_line(self, line): - return re.sub(r'^\s+', r' ', line[:-1]) - - # check for start record - def is_start(self, record): - return self.beg_pattern.match(record) - - # check for end record - def is_end(self, record): - return self.end_pattern.match(record) - - # check for declaration entry record - def is_entry(self, record): - return re.match(r'^\s*decltype\(([^\)]*)\)', record) - - # parse method - def parse(self): - active = 0 - for line in self.inp.readlines(): - record = self.norm_line(line) - if self.is_start(record): active = 1 - if active != 0: - if self.is_end(record): return - m = self.is_entry(record) - if m: - self.array.append(m.group(1)) + def fatal(self, msg): + fatal("API_TableParser", msg) + + def __init__(self, header, name): + self.name = name + + if not os.path.isfile(header): + self.fatal("file '" + header + "' not found") + + self.inp = open(header, "r") + + self.beg_pattern = re.compile("^\s*struct\s+" + name + "Table\s*{\s*$") + self.end_pattern = re.compile("^\s*};\s*$") + self.array = [] + self.parse() + + # normalizing a line + def norm_line(self, line): + return re.sub(r"^\s+", r" ", line[:-1]) + + # check for start record + def is_start(self, record): + return 
self.beg_pattern.match(record) + + # check for end record + def is_end(self, record): + return self.end_pattern.match(record) + + # check for declaration entry record + def is_entry(self, record): + return re.match(r"^\s*decltype\(([^\)]*)\)", record) + + # parse method + def parse(self): + active = 0 + for line in self.inp.readlines(): + record = self.norm_line(line) + if self.is_start(record): + active = 1 + if active != 0: + if self.is_end(record): + return + m = self.is_entry(record) + if m: + self.array.append(m.group(1)) + ############################################################# # API declaration parser class class API_DeclParser: - def fatal(self, msg): - fatal('API_DeclParser', msg) - - def __init__(self, header, array, data): - if not os.path.isfile(header): - self.fatal("file '" + header + "' not found") - - self.inp = open(header, 'r') - - self.end_pattern = re.compile('\);\s*$') - self.data = data - for call in array: - if call in data: - self.fatal(call + ' is already found') - self.parse(call) - - # api record filter - def api_filter(self, record): - record = re.sub(r'\sHSA_API\s', r' ', record) - record = re.sub(r'\sHSA_DEPRECATED\s', r' ', record) - return record - - # check for start record - def is_start(self, call, record): - return re.search('\s' + call + '\s*\(', record) - - # check for API method record - def is_api(self, call, record): - record = self.api_filter(record) - return re.match('\s+\S+\s+' + call + '\s*\(', record) - - # check for end record - def is_end(self, record): - return self.end_pattern.search(record) - - # parse method args - def get_args(self, record): - struct = {'ret': '', 'args': '', 'astr': {}, 'alst': [], 'tlst': []} - record = re.sub(r'^\s+', r'', record) - record = re.sub(r'\s*(\*+)\s*', r'\1 ', record) - rind = NextBlock(0, record) - struct['ret'] = record[0:rind] - pos = record.find('(') - end = NextBlock(pos, record); - args = record[pos:end] - args = re.sub(r'^\(\s*', r'', args) - args = re.sub(r'\s*\)$', r'', args) - args = re.sub(r'\s*,\s*', r',', args) - struct['args'] = re.sub(r',', r', ', args) - if len(args) == 0: return struct - - pos = 0 - args = args + ',' - while pos < len(args): - ind1 = NextBlock(pos, args) # type - ind2 = NextBlock(ind1, args) # space - if args[ind2] != '(': - while ind2 < len(args): - end = NextBlock(ind2, args) - if args[end] == ',': break - else: ind2 = end - name = args[ind2:end] - else: - ind3 = NextBlock(ind2, args) # field - m = re.match(r'\(\s*\*\s*(\S+)\s*\)', args[ind2:ind3]) - if not m: - self.fatal("bad block3 '" + args + "' : '" + args[ind2:ind3] + "'") - name = m.group(1) - end = NextBlock(ind3, args) # the rest - item = args[pos:end] - struct['astr'][name] = item - struct['alst'].append(name) - struct['tlst'].append(item) - if args[end] != ',': - self.fatal("no comma '" + args + "'") - pos = end + 1 - - return struct - - # parse given api - def parse(self, call): - record = '' - active = 0 - found = 0 - api_name = '' - prev_line = '' - - self.inp.seek(0) - for line in self.inp.readlines(): - record += ' ' + line[:-1] - record = re.sub(r'^\s*', r' ', record) - - if active == 0: - if self.is_start(call, record): - active = 1 - m = self.is_api(call, record) - if not m: - record = ' ' + prev_line + ' ' + record - m = self.is_api(call, record) - if not m: - self.fatal("bad api '" + line + "'") - - if active == 1: - if self.is_end(record): - self.data[call] = self.get_args(record) - active = 0 - found = 0 - - if active == 0: record = '' - prev_line = line + def fatal(self, msg): + 
fatal("API_DeclParser", msg) + + def __init__(self, header, array, data): + if not os.path.isfile(header): + self.fatal("file '" + header + "' not found") + + self.inp = open(header, "r") + + self.end_pattern = re.compile("\);\s*$") + self.data = data + for call in array: + if call in data: + self.fatal(call + " is already found") + self.parse(call) + + # api record filter + def api_filter(self, record): + record = re.sub(r"\sHSA_API\s", r" ", record) + record = re.sub(r"\sHSA_DEPRECATED\s", r" ", record) + return record + + # check for start record + def is_start(self, call, record): + return re.search("\s" + call + "\s*\(", record) + + # check for API method record + def is_api(self, call, record): + record = self.api_filter(record) + return re.match("\s+\S+\s+" + call + "\s*\(", record) + + # check for end record + def is_end(self, record): + return self.end_pattern.search(record) + + # parse method args + def get_args(self, record): + struct = {"ret": "", "args": "", "astr": {}, "alst": [], "tlst": []} + record = re.sub(r"^\s+", r"", record) + record = re.sub(r"\s*(\*+)\s*", r"\1 ", record) + rind = NextBlock(0, record) + struct["ret"] = record[0:rind] + pos = record.find("(") + end = NextBlock(pos, record) + args = record[pos:end] + args = re.sub(r"^\(\s*", r"", args) + args = re.sub(r"\s*\)$", r"", args) + args = re.sub(r"\s*,\s*", r",", args) + struct["args"] = re.sub(r",", r", ", args) + if len(args) == 0: + return struct + + pos = 0 + args = args + "," + while pos < len(args): + ind1 = NextBlock(pos, args) # type + ind2 = NextBlock(ind1, args) # space + if args[ind2] != "(": + while ind2 < len(args): + end = NextBlock(ind2, args) + if args[end] == ",": + break + else: + ind2 = end + name = args[ind2:end] + else: + ind3 = NextBlock(ind2, args) # field + m = re.match(r"\(\s*\*\s*(\S+)\s*\)", args[ind2:ind3]) + if not m: + self.fatal("bad block3 '" + args + "' : '" + args[ind2:ind3] + "'") + name = m.group(1) + end = NextBlock(ind3, args) # the rest + item = args[pos:end] + struct["astr"][name] = item + struct["alst"].append(name) + struct["tlst"].append(item) + if args[end] != ",": + self.fatal("no comma '" + args + "'") + pos = end + 1 + + return struct + + # parse given api + def parse(self, call): + record = "" + active = 0 + found = 0 + api_name = "" + prev_line = "" + + self.inp.seek(0) + for line in self.inp.readlines(): + record += " " + line[:-1] + record = re.sub(r"^\s*", r" ", record) + + if active == 0: + if self.is_start(call, record): + active = 1 + m = self.is_api(call, record) + if not m: + record = " " + prev_line + " " + record + m = self.is_api(call, record) + if not m: + self.fatal("bad api '" + line + "'") + + if active == 1: + if self.is_end(record): + self.data[call] = self.get_args(record) + active = 0 + found = 0 + + if active == 0: + record = "" + prev_line = line + ############################################################# # API description parser class class API_DescrParser: - def fatal(self, msg): - fatal('API_DescrParser', msg) - - def __init__(self, out_h_file, hsa_dir, api_table_h, api_headers, license): - out_macro = re.sub(r'[\/\.]', r'_', out_h_file.upper()) + '_' - - self.h_content = '' - self.cpp_content = '' - self.api_names = [] - self.api_calls = {} - self.api_rettypes = set() - self.api_id = {} - - api_data = {} - api_list = [] - ns_calls = [] - - for i in range(0, len(api_headers)): - (name, header) = api_headers[i] - - if i < len(api_headers) - 1: - api = API_TableParser(hsa_dir + api_table_h, name) - api_list = api.array - 
self.api_names.append(name) - self.api_calls[name] = api_list - else: - api_list = ns_calls - ns_calls = [] + def fatal(self, msg): + fatal("API_DescrParser", msg) - for call in api_list: - if call in api_data: - self.fatal("call '" + call + "' is already found") + def __init__(self, out_h_file, hsa_dir, api_table_h, api_headers, license): + out_macro = re.sub(r"[\/\.]", r"_", out_h_file.upper()) + "_" - API_DeclParser(hsa_dir + header, api_list, api_data) + self.h_content = "" + self.cpp_content = "" + self.api_names = [] + self.api_calls = {} + self.api_rettypes = set() + self.api_id = {} - for call in api_list: - if not call in api_data: - # Not-supported functions - ns_calls.append(call) + api_data = {} + api_list = [] + ns_calls = [] + + for i in range(0, len(api_headers)): + (name, header) = api_headers[i] + + if i < len(api_headers) - 1: + api = API_TableParser(hsa_dir + api_table_h, name) + api_list = api.array + self.api_names.append(name) + self.api_calls[name] = api_list + else: + api_list = ns_calls + ns_calls = [] + + for call in api_list: + if call in api_data: + self.fatal("call '" + call + "' is already found") + + API_DeclParser(hsa_dir + header, api_list, api_data) + + for call in api_list: + if not call in api_data: + # Not-supported functions + ns_calls.append(call) + else: + # API ID map + self.api_id[call] = "HSA_API_ID_" + call + # Return types + self.api_rettypes.add(api_data[call]["ret"]) + + self.api_rettypes.discard("void") + self.api_data = api_data + self.ns_calls = ns_calls + + self.h_content += ( + "/* Generated by " + os.path.basename(__file__) + " */\n" + license + "\n\n" + ) + + self.h_content += "/* HSA API tracing primitives\n" + for name, header in api_headers: + self.h_content += ( + " '" + + name + + "', header '" + + header + + "', " + + str(len(self.api_calls[name])) + + " funcs\n" + ) + for call in self.ns_calls: + self.h_content += " " + call + " was not parsed\n" + self.h_content += " */\n" + self.h_content += "\n" + self.h_content += "#ifndef " + out_macro + "\n" + self.h_content += "#define " + out_macro + "\n" + + self.h_content += self.add_section("API ID enumeration", " ", self.gen_id_enum) + + self.h_content += "/* Declarations of APIs intended for use only by tools. 
*/\n" + self.h_content += "typedef void (*hsa_amd_queue_intercept_packet_writer)(const void*, uint64_t);\n" + self.h_content += "typedef void (*hsa_amd_queue_intercept_handler)(const void*, uint64_t, uint64_t, void*,\n" + self.h_content += " hsa_amd_queue_intercept_packet_writer);\n" + self.h_content += "typedef void (*hsa_amd_runtime_queue_notifier)(const hsa_queue_t*, hsa_agent_t, void*);\n" + + self.h_content += self.add_section( + "API arg structure", " ", self.gen_arg_struct + ) + self.h_content += self.add_section( + "API output stream", " ", self.gen_out_stream + ) + self.h_content += "#endif /* " + out_macro + " */\n" + + self.cpp_content += ( + "/* Generated by " + os.path.basename(__file__) + " */\n" + license + "\n\n" + ) + + self.cpp_content += "#include \n" + self.cpp_content += "#include \n" + self.cpp_content += "namespace roctracer::hsa_support::detail {\n" + + self.cpp_content += "static CoreApiTable CoreApi_saved_before_cb;\n" + self.cpp_content += "static AmdExtTable AmdExt_saved_before_cb;\n" + self.cpp_content += "static ImageExtTable ImageExt_saved_before_cb;\n\n" + + self.cpp_content += self.add_section( + "API callback functions", "", self.gen_callbacks + ) + self.cpp_content += self.add_section( + "API intercepting code", "", self.gen_intercept + ) + self.cpp_content += self.add_section( + "API get_name function", " ", self.gen_get_name + ) + self.cpp_content += self.add_section( + "API get_code function", " ", self.gen_get_code + ) + self.cpp_content += "\n};\n" + + # add code section + def add_section(self, title, gap, fun): + content = "" + n = 0 + content += "\n/* section: " + title + " */\n\n" + content += fun(-1, "-", "-", {}) + for index in range(len(self.api_names)): + last = index == len(self.api_names) - 1 + name = self.api_names[index] + if n != 0: + if gap == "": + content += fun(n, name, "-", {}) + content += "\n" + content += gap + "/* block: " + name + " API */\n" + for call in self.api_calls[name]: + content += fun(n, name, call, self.api_data[call]) + n += 1 + content += fun(n, "-", "-", {}) + return content + + # generate API ID enumeration + def gen_id_enum(self, n, name, call, data): + content = "" + if n == -1: + content += "enum hsa_api_id_t {\n" + return content + if call != "-": + content += " " + self.api_id[call] + " = " + str(n) + ",\n" else: - # API ID map - self.api_id[call] = 'HSA_API_ID_' + call - # Return types - self.api_rettypes.add(api_data[call]['ret']) - - self.api_rettypes.discard('void') - self.api_data = api_data - self.ns_calls = ns_calls - - self.h_content += "/* Generated by " + os.path.basename(__file__) + " */\n" + license + "\n\n" - - self.h_content += "/* HSA API tracing primitives\n" - for (name, header) in api_headers: - self.h_content += " '" + name + "', header '" + header + "', " + str(len(self.api_calls[name])) + ' funcs\n' - for call in self.ns_calls: - self.h_content += ' ' + call + ' was not parsed\n' - self.h_content += " */\n" - self.h_content += '\n' - self.h_content += '#ifndef ' + out_macro + '\n' - self.h_content += '#define ' + out_macro + '\n' - - self.h_content += self.add_section('API ID enumeration', ' ', self.gen_id_enum) - - self.h_content += '/* Declarations of APIs intended for use only by tools. 
*/\n' - self.h_content += 'typedef void (*hsa_amd_queue_intercept_packet_writer)(const void*, uint64_t);\n' - self.h_content += 'typedef void (*hsa_amd_queue_intercept_handler)(const void*, uint64_t, uint64_t, void*,\n' - self.h_content += ' hsa_amd_queue_intercept_packet_writer);\n' - self.h_content += 'typedef void (*hsa_amd_runtime_queue_notifier)(const hsa_queue_t*, hsa_agent_t, void*);\n' - - self.h_content += self.add_section('API arg structure', ' ', self.gen_arg_struct) - self.h_content += self.add_section('API output stream', ' ', self.gen_out_stream) - self.h_content += '#endif /* ' + out_macro + ' */\n' - - self.cpp_content += "/* Generated by " + os.path.basename(__file__) + " */\n" + license + "\n\n" - - self.cpp_content += '#include \n' - self.cpp_content += '#include \n' - self.cpp_content += 'namespace roctracer::hsa_support::detail {\n' - - self.cpp_content += 'static CoreApiTable CoreApi_saved_before_cb;\n' - self.cpp_content += 'static AmdExtTable AmdExt_saved_before_cb;\n' - self.cpp_content += 'static ImageExtTable ImageExt_saved_before_cb;\n\n' - - self.cpp_content += self.add_section('API callback functions', '', self.gen_callbacks) - self.cpp_content += self.add_section('API intercepting code', '', self.gen_intercept) - self.cpp_content += self.add_section('API get_name function', ' ', self.gen_get_name) - self.cpp_content += self.add_section('API get_code function', ' ', self.gen_get_code) - self.cpp_content += '\n};\n' - - # add code section - def add_section(self, title, gap, fun): - content = '' - n = 0 - content += '\n/* section: ' + title + ' */\n\n' - content += fun(-1, '-', '-', {}) - for index in range(len(self.api_names)): - last = (index == len(self.api_names) - 1) - name = self.api_names[index] - if n != 0: - if gap == '': content += fun(n, name, '-', {}) - content += '\n' - content += gap + '/* block: ' + name + ' API */\n' - for call in self.api_calls[name]: - content += fun(n, name, call, self.api_data[call]) - n += 1 - content += fun(n, '-', '-', {}) - return content - - # generate API ID enumeration - def gen_id_enum(self, n, name, call, data): - content = '' - if n == -1: - content += 'enum hsa_api_id_t {\n' - return content - if call != '-': - content += ' ' + self.api_id[call] + ' = ' + str(n) + ',\n' - else: - content += '\n' - content += ' HSA_API_ID_DISPATCH = ' + str(n) + ',\n' - content += ' HSA_API_ID_NUMBER = ' + str(n + 1) + ',\n' - content += '};\n' - return content - - # generate API args structure - def gen_arg_struct(self, n, name, call, struct): - content = '' - if n == -1: - content += 'typedef struct hsa_api_data_s {\n' - content += ' uint64_t correlation_id;\n' - content += ' uint32_t phase;\n' - content += ' union {\n' - for ret_type in self.api_rettypes: - content += ' ' + ret_type + ' ' + ret_type + '_retval;\n' - content += ' };\n' - content += ' union {\n' - return content - if call != '-': - content += ' struct {\n' - for (var, item) in struct['astr'].items(): - content += ' ' + item + ';\n' - if call == "hsa_amd_memory_async_copy_rect" and item == "const hsa_dim3_t* range": - content += ' hsa_dim3_t range__val;\n' - content += ' } ' + call + ';\n' - else: - content += ' } args;\n' - content += ' uint64_t *phase_data;\n' - content += '} hsa_api_data_t;\n' - return content - - # generate API callbacks - def gen_callbacks(self, n, name, call, struct): - content = '' - if n == -1: - content += '/* section: Static declarations */\n' - content += '\n' - if call != '-': - call_id = self.api_id[call]; - ret_type = struct['ret'] - 
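For orientation, every API call handled by this emitter (old and new variants alike) is wrapped in the same shape; the sketch below is reassembled from the fragments emitted here, with hsa_iterate_agents standing in for an arbitrary CoreApi entry. It is illustrative only, not verbatim generator output:

# Approximate shape of one generated wrapper (illustrative reconstruction):
#
#   static hsa_status_t hsa_iterate_agents_callback(<args>) {
#     hsa_trace_data_t trace_data;
#     bool enabled{false};
#     if (auto function = report_activity.load(std::memory_order_relaxed); function &&
#         (enabled = function(ACTIVITY_DOMAIN_HSA_API,
#                             HSA_API_ID_hsa_iterate_agents, &trace_data) == 0)) {
#       if (trace_data.phase_enter != nullptr) {
#         /* capture each argument into trace_data.api_data.args.<call>... */
#         trace_data.phase_enter(HSA_API_ID_hsa_iterate_agents, &trace_data);
#       }
#     }
#     trace_data.api_data.hsa_status_t_retval =
#         CoreApi_saved_before_cb.hsa_iterate_agents_fn(<args>);
#     if (enabled && trace_data.phase_exit != nullptr)
#       trace_data.phase_exit(HSA_API_ID_hsa_iterate_agents, &trace_data);
#     return trace_data.api_data.hsa_status_t_retval;
#   }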
content += 'static ' + ret_type + ' ' + call + '_callback(' + struct['args'] + ') {\n' - - content += ' hsa_trace_data_t trace_data;\n' - content += ' bool enabled{false};\n' - content += '\n' - content += ' if (auto function = report_activity.load(std::memory_order_relaxed); function &&\n' - content += ' (enabled =\n' - content += ' function(ACTIVITY_DOMAIN_HSA_API, ' + call_id + ', &trace_data) == 0)) {\n' - content += ' if (trace_data.phase_enter != nullptr) {\n' - - for var in struct['alst']: - item = struct['astr'][var]; - if re.search(r'char\* ', item): - # FIXME: we should not strdup the char* arguments here, as the callback will not outlive the scope of this function. Instead, we - # should generate a helper function to capture the content of the arguments similar to hipApiArgsInit for HIP. We also need a - # helper to free the memory that is allocated to capture the content. - content += ' trace_data.api_data.args.' + call + '.' + var + ' = ' + '(' + var + ' != NULL) ? strdup(' + var + ')' + ' : NULL;\n' + content += "\n" + content += " HSA_API_ID_DISPATCH = " + str(n) + ",\n" + content += " HSA_API_ID_NUMBER = " + str(n + 1) + ",\n" + content += "};\n" + return content + + # generate API args structure + def gen_arg_struct(self, n, name, call, struct): + content = "" + if n == -1: + content += "typedef struct hsa_api_data_s {\n" + content += " uint64_t correlation_id;\n" + content += " uint32_t phase;\n" + content += " union {\n" + for ret_type in self.api_rettypes: + content += " " + ret_type + " " + ret_type + "_retval;\n" + content += " };\n" + content += " union {\n" + return content + if call != "-": + content += " struct {\n" + for var, item in struct["astr"].items(): + content += " " + item + ";\n" + if ( + call == "hsa_amd_memory_async_copy_rect" + and item == "const hsa_dim3_t* range" + ): + content += " hsa_dim3_t range__val;\n" + content += " } " + call + ";\n" else: - content += ' trace_data.api_data.args.' + call + '.' + var + ' = ' + var + ';\n' - if call == 'hsa_amd_memory_async_copy_rect' and var == 'range': - content += ' trace_data.api_data.args.' + call + '.' + var + '__val = ' + '*(' + var + ');\n' - - content += ' trace_data.phase_enter(' + call_id + ', &trace_data);\n' - content += ' }\n' - content += ' }\n' - content += '\n' - - if ret_type != 'void': - content += ' trace_data.api_data.' + ret_type + '_retval = ' - content += ' ' + name + '_saved_before_cb.' + call + '_fn(' + ', '.join(struct['alst']) + ');\n' - - content += '\n' - content += ' if (enabled && trace_data.phase_exit != nullptr)\n' - content += ' trace_data.phase_exit(' + call_id + ', &trace_data);\n' - - if ret_type != 'void': - content += ' return trace_data.api_data.' 
+ ret_type + '_retval;\n' - content += '}\n' - - return content - - # generate API intercepting code - def gen_intercept(self, n, name, call, struct): - content = '' - if n > 0 and call == '-': - content += '};\n' - if n == 0 or (call == '-' and name != '-'): - content += 'static void Install' + name + 'Wrappers(' + name + 'Table* table) {\n' - content += ' ' + name + '_saved_before_cb = *table;\n' - if call != '-': - if call != 'hsa_shut_down': - content += ' table->' + call + '_fn = ' + call + '_callback;\n' - else: - content += ' { void* p = (void*)' + call + '_callback; (void)p; }\n' - return content - - # generate API name function - def gen_get_name(self, n, name, call, struct): - content = '' - if n == -1: - content += 'static const char* GetApiName(uint32_t id) {\n' - content += ' switch (id) {\n' - return content - if call != '-': - content += ' case ' + self.api_id[call] + ': return "' + call + '";\n' - else: - content += ' }\n' - content += ' return "unknown";\n' - content += '}\n' - return content - - # generate API code function - def gen_get_code(self, n, name, call, struct): - content = '' - if n == -1: - content += 'static uint32_t GetApiCode(const char* str) {\n' - return content - if call != '-': - content += ' if (strcmp("' + call + '", str) == 0) return ' + self.api_id[call] + ';\n' - else: - content += ' return HSA_API_ID_NUMBER;\n' - content += '}\n' - return content - - # generate stream operator - def gen_out_stream(self, n, name, call, struct): - content = '' - if n == -1: - content += '#ifdef __cplusplus\n' - content += '#include "hsa_ostream_ops.h"\n' - content += 'typedef std::pair hsa_api_data_pair_t;\n' - content += 'inline std::ostream& operator<< (std::ostream& out, const hsa_api_data_pair_t& data_pair) {\n' - content += ' const uint32_t cid = data_pair.first;\n' - content += ' const hsa_api_data_t& api_data = data_pair.second;\n' - content += ' switch(cid) {\n' - return content - if call != '-': - content += ' case ' + self.api_id[call] + ': {\n' - content += ' out << "' + call + '(";\n' - arg_list = struct['alst'] - if len(arg_list) != 0: - for ind in range(len(arg_list)): - arg_var = arg_list[ind] - arg_val = 'api_data.args.' + call + '.' 
+ arg_var - if re.search(r'char\* ', struct['astr'][arg_var]): - content += ' out << "0x" << std::hex << (uint64_t)' + arg_val - else: - content += ' out << ' + arg_val - if call == "hsa_amd_memory_async_copy_rect" and arg_var == "range": - content += ' << ", ";\n' - content += ' out << ' + arg_val + '__val' - ''' + content += " } args;\n" + content += " uint64_t *phase_data;\n" + content += "} hsa_api_data_t;\n" + return content + + # generate API callbacks + def gen_callbacks(self, n, name, call, struct): + content = "" + if n == -1: + content += "/* section: Static declarations */\n" + content += "\n" + if call != "-": + call_id = self.api_id[call] + ret_type = struct["ret"] + content += ( + "static " + + ret_type + + " " + + call + + "_callback(" + + struct["args"] + + ") {\n" + ) + + content += " hsa_trace_data_t trace_data;\n" + content += " bool enabled{false};\n" + content += "\n" + content += " if (auto function = report_activity.load(std::memory_order_relaxed); function &&\n" + content += " (enabled =\n" + content += ( + " function(ACTIVITY_DOMAIN_HSA_API, " + + call_id + + ", &trace_data) == 0)) {\n" + ) + content += " if (trace_data.phase_enter != nullptr) {\n" + + for var in struct["alst"]: + item = struct["astr"][var] + if re.search(r"char\* ", item): + # FIXME: we should not strdup the char* arguments here, as the callback will not outlive the scope of this function. Instead, we + # should generate a helper function to capture the content of the arguments similar to hipApiArgsInit for HIP. We also need a + # helper to free the memory that is allocated to capture the content. + content += ( + " trace_data.api_data.args." + + call + + "." + + var + + " = " + + "(" + + var + + " != NULL) ? strdup(" + + var + + ")" + + " : NULL;\n" + ) + else: + content += ( + " trace_data.api_data.args." + + call + + "." + + var + + " = " + + var + + ";\n" + ) + if call == "hsa_amd_memory_async_copy_rect" and var == "range": + content += ( + " trace_data.api_data.args." + + call + + "." + + var + + "__val = " + + "*(" + + var + + ");\n" + ) + + content += " trace_data.phase_enter(" + call_id + ", &trace_data);\n" + content += " }\n" + content += " }\n" + content += "\n" + + if ret_type != "void": + content += " trace_data.api_data." + ret_type + "_retval = " + content += ( + " " + + name + + "_saved_before_cb." + + call + + "_fn(" + + ", ".join(struct["alst"]) + + ");\n" + ) + + content += "\n" + content += " if (enabled && trace_data.phase_exit != nullptr)\n" + content += " trace_data.phase_exit(" + call_id + ", &trace_data);\n" + + if ret_type != "void": + content += " return trace_data.api_data." 
+ ret_type + "_retval;\n" + content += "}\n" + + return content + + # generate API intercepting code + def gen_intercept(self, n, name, call, struct): + content = "" + if n > 0 and call == "-": + content += "};\n" + if n == 0 or (call == "-" and name != "-"): + content += ( + "static void Install" + name + "Wrappers(" + name + "Table* table) {\n" + ) + content += " " + name + "_saved_before_cb = *table;\n" + if call != "-": + if call != "hsa_shut_down": + content += " table->" + call + "_fn = " + call + "_callback;\n" + else: + content += " { void* p = (void*)" + call + "_callback; (void)p; }\n" + return content + + # generate API name function + def gen_get_name(self, n, name, call, struct): + content = "" + if n == -1: + content += "static const char* GetApiName(uint32_t id) {\n" + content += " switch (id) {\n" + return content + if call != "-": + content += " case " + self.api_id[call] + ': return "' + call + '";\n' + else: + content += " }\n" + content += ' return "unknown";\n' + content += "}\n" + return content + + # generate API code function + def gen_get_code(self, n, name, call, struct): + content = "" + if n == -1: + content += "static uint32_t GetApiCode(const char* str) {\n" + return content + if call != "-": + content += ( + ' if (strcmp("' + + call + + '", str) == 0) return ' + + self.api_id[call] + + ";\n" + ) + else: + content += " return HSA_API_ID_NUMBER;\n" + content += "}\n" + return content + + # generate stream operator + def gen_out_stream(self, n, name, call, struct): + content = "" + if n == -1: + content += "#ifdef __cplusplus\n" + content += '#include "hsa_ostream_ops.h"\n' + content += ( + "typedef std::pair hsa_api_data_pair_t;\n" + ) + content += "inline std::ostream& operator<< (std::ostream& out, const hsa_api_data_pair_t& data_pair) {\n" + content += " const uint32_t cid = data_pair.first;\n" + content += " const hsa_api_data_t& api_data = data_pair.second;\n" + content += " switch(cid) {\n" + return content + if call != "-": + content += " case " + self.api_id[call] + ": {\n" + content += ' out << "' + call + '(";\n' + arg_list = struct["alst"] + if len(arg_list) != 0: + for ind in range(len(arg_list)): + arg_var = arg_list[ind] + arg_val = "api_data.args." + call + "." + arg_var + if re.search(r"char\* ", struct["astr"][arg_var]): + content += ' out << "0x" << std::hex << (uint64_t)' + arg_val + else: + content += " out << " + arg_val + if ( + call == "hsa_amd_memory_async_copy_rect" + and arg_var == "range" + ): + content += ' << ", ";\n' + content += " out << " + arg_val + "__val" + """ arg_item = struct['tlst'][ind] if re.search(r'\(\* ', arg_item): arg_pref = '' elif re.search(r'void\* ', arg_item): arg_pref = '' @@ -536,46 +639,53 @@ def gen_out_stream(self, n, name, call, struct): content += ' if (' + arg_val + ') out << ' + arg_pref + '(' + arg_val + '); else out << ' + arg_val else: content += ' out << ' + arg_val - ''' - if ind < len(arg_list) - 1: content += ' << ", ";\n' - else: content += ';\n' - if struct['ret'] != 'void': - content += ' out << ") = " << api_data.' 
+ struct['ret'] + '_retval;\n' - else: - content += ' out << ") = void";\n' - content += ' break;\n' - content += ' }\n' - else: - content += ' default:\n' - content += ' out << "ERROR: unknown API";\n' - content += ' abort();\n' - content += ' }\n' - content += ' return out;\n' - content += '}\n' - content += '#endif\n' - return content + """ + if ind < len(arg_list) - 1: + content += ' << ", ";\n' + else: + content += ";\n" + if struct["ret"] != "void": + content += ( + ' out << ") = " << api_data.' + struct["ret"] + "_retval;\n" + ) + else: + content += ' out << ") = void";\n' + content += " break;\n" + content += " }\n" + else: + content += " default:\n" + content += ' out << "ERROR: unknown API";\n' + content += " abort();\n" + content += " }\n" + content += " return out;\n" + content += "}\n" + content += "#endif\n" + return content + ############################################################# # main # Usage if len(sys.argv) != 3: - print ("Usage:", sys.argv[0], " ", file=sys.stderr) - sys.exit(1) + print( + "Usage:", sys.argv[0], " ", file=sys.stderr + ) + sys.exit(1) else: - PREFIX = sys.argv[1] + '/' - HSA_DIR = sys.argv[2] + '/' + PREFIX = sys.argv[1] + "/" + HSA_DIR = sys.argv[2] + "/" descr = API_DescrParser(H_OUT, HSA_DIR, API_TABLES_H, API_HEADERS_H, LICENSE) out_file = PREFIX + H_OUT -print ('Generating "' + out_file + '"') -f = open(out_file, 'w') +print('Generating "' + out_file + '"') +f = open(out_file, "w") f.write(descr.h_content[:-1]) f.close() out_file = PREFIX + CPP_OUT -print ('Generating "' + out_file + '"') -f = open(out_file, 'w') +print('Generating "' + out_file + '"') +f = open(out_file, "w") f.write(descr.cpp_content[:-1]) f.close() ############################################################# diff --git a/script/leak-sanitizer-suppr.txt b/script/leak-sanitizer-suppr.txt new file mode 100644 index 00000000..8aad4454 --- /dev/null +++ b/script/leak-sanitizer-suppr.txt @@ -0,0 +1,8 @@ +# +# LeakSanitizer suppressions file for rocprofiler project. 
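+# Syntax: "leak:<pattern>"; a leak report is suppressed when <pattern> matches
+# a module, source file, or function name in the reported allocation stack.
+# The entries below cover known leaks in surrounding ROCm components rather
+# than in rocprofiler itself.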
+#
+
+leak:amd_comgr
+leak:hsa-runtime
+leak:amdhip
+leak:python
diff --git a/script/run-ci.py b/script/run-ci.py
new file mode 100755
index 00000000..51243891
--- /dev/null
+++ b/script/run-ci.py
@@ -0,0 +1,454 @@
+#!/usr/bin/env python3
+
+
+import os
+import re
+import sys
+import glob
+import socket
+import shutil
+import argparse
+import multiprocessing
+
+# these constants are used to define CTEST_PROJECT_NAME
+# and the default value for CTEST_SUBMIT_URL
+_PROJECT_NAME = "rocprofiler"
+_BASE_URL = "10.194.116.31/cdash"
+
+
+def which(cmd, require):
+    v = shutil.which(cmd)
+    if require and v is None:
+        raise RuntimeError(f"{cmd} not found")
+    return v if v is not None else ""
+
+
+def generate_custom(args, cmake_args, ctest_args):
+    if not os.path.exists(args.binary_dir):
+        os.makedirs(args.binary_dir)
+
+    if args.memcheck is not None:
+        if args.coverage:
+            raise ValueError(
+                f"Enabling --memcheck={args.memcheck} and --coverage not supported"
+            )
+        cmake_args += [f"-DROCPROFILER_MEMCHECK={args.memcheck}"]
+
+    NAME = args.name
+    SITE = args.site
+    BUILD_JOBS = args.build_jobs
+    SUBMIT_URL = args.submit_url
+    SOURCE_DIR = os.path.realpath(args.source_dir)
+    BINARY_DIR = os.path.realpath(args.binary_dir)
+    CMAKE_ARGS = " ".join(cmake_args)
+    CTEST_ARGS = " ".join(ctest_args)
+
+    GIT_CMD = which("git", require=True)
+    GCOV_CMD = which("gcov", require=False)
+    CMAKE_CMD = which("cmake", require=True)
+    # CTEST_CMD = which("ctest", require=True)
+
+    NAME = re.sub(r"(.*)-([0-9]+)/merge", "PR_\\2_\\1", NAME)
+
+    DEFAULT_CMAKE_ARGS = " ".join(
+        [f"-DROCPROFILER_BUILD_{x}=ON" for x in ["CI", "TESTS", "SAMPLES"]]
+    )
+
+    GPU_TARGETS = ";".join(args.gpu_targets)
+    MEMCHECK_TYPE = "" if args.memcheck is None else args.memcheck
+
+    MEMCHECK_SANITIZER_OPTIONS = ""
+    MEMCHECK_SUPPRESSION_FILE = ""
+
+    if MEMCHECK_TYPE == "AddressSanitizer":
+        MEMCHECK_SANITIZER_OPTIONS = "detect_leaks=0 use_sigaltstack=0"
+        MEMCHECK_SUPPRESSION_FILE = f"{SOURCE_DIR}/script/address-sanitizer-suppr.txt"
+    elif MEMCHECK_TYPE == "LeakSanitizer":
+        MEMCHECK_SUPPRESSION_FILE = f"{SOURCE_DIR}/script/leak-sanitizer-suppr.txt"
+    elif MEMCHECK_TYPE == "ThreadSanitizer":
+        external_symbolizer_path = ""
+        for version in range(8, 20):
+            _symbolizer = shutil.which(f"llvm-symbolizer-{version}")
+            if _symbolizer:
+                external_symbolizer_path = f"external_symbolizer_path={_symbolizer}"
+        os.environ["TSAN_OPTIONS"] = " ".join(
+            [
+                "history_size=5",
+                "second_deadlock_stack=1",
+                f"suppressions={SOURCE_DIR}/script/thread-sanitizer-suppr.txt",
+                external_symbolizer_path,
+                os.environ.get("TSAN_OPTIONS", ""),
+            ]
+        )
+
+    return f"""
+    set(CTEST_PROJECT_NAME "{_PROJECT_NAME}")
+    set(CTEST_NIGHTLY_START_TIME "05:00:00 UTC")
+
+    set(CTEST_DROP_METHOD "http")
+    set(CTEST_DROP_SITE_CDASH TRUE)
+    set(CTEST_SUBMIT_URL "http://{SUBMIT_URL}")
+
+    set(CTEST_UPDATE_TYPE git)
+    set(CTEST_UPDATE_VERSION_ONLY TRUE)
+    set(CTEST_GIT_COMMAND {GIT_CMD})
+    set(CTEST_GIT_INIT_SUBMODULES FALSE)
+
+    set(CTEST_OUTPUT_ON_FAILURE TRUE)
+    set(CTEST_USE_LAUNCHERS TRUE)
+    set(CMAKE_CTEST_ARGUMENTS --output-on-failure {CTEST_ARGS})
+
+    set(CTEST_CUSTOM_MAXIMUM_NUMBER_OF_ERRORS "100")
+    set(CTEST_CUSTOM_MAXIMUM_NUMBER_OF_WARNINGS "100")
+    set(CTEST_CUSTOM_MAXIMUM_PASSED_TEST_OUTPUT_SIZE "51200")
+    set(CTEST_CUSTOM_COVERAGE_EXCLUDE "/usr/.*;/opt/.*;.*external/.*;.*samples/.*;.*test/.*;.*tests-v2/.*;.*perfetto/perfetto_sdk/.*;.*ctf/barectf.*")
+
+    set(CTEST_MEMORYCHECK_TYPE "{MEMCHECK_TYPE}")
+    set(CTEST_MEMORYCHECK_SUPPRESSIONS_FILE "{MEMCHECK_SUPPRESSION_FILE}")
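+    # These variables drive ctest_memcheck(): MEMORYCHECK_TYPE above selects the
+    # sanitizer mode, and CTest is expected to fold the suppressions file plus the
+    # sanitizer options below into the matching *SAN_OPTIONS environment at test time.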
+    set(CTEST_MEMORYCHECK_SANITIZER_OPTIONS "{MEMCHECK_SANITIZER_OPTIONS}")
+
+    set(CTEST_SITE "{SITE}")
+    set(CTEST_BUILD_NAME "{NAME}")
+
+    set(CTEST_SOURCE_DIRECTORY {SOURCE_DIR})
+    set(CTEST_BINARY_DIRECTORY {BINARY_DIR})
+
+    set(CTEST_CONFIGURE_COMMAND "{CMAKE_CMD} -B {BINARY_DIR} {SOURCE_DIR} {DEFAULT_CMAKE_ARGS} -DGPU_TARGETS={GPU_TARGETS} {CMAKE_ARGS}")
+    set(CTEST_BUILD_COMMAND "{CMAKE_CMD} --build {BINARY_DIR} --target all --parallel {BUILD_JOBS}")
+    set(CTEST_COVERAGE_COMMAND {GCOV_CMD})
+    """
+
+
+def generate_dashboard_script(args):
+    CODECOV = 1 if args.coverage else 0
+    DASHBOARD_MODE = args.mode
+    SOURCE_DIR = os.path.realpath(args.source_dir)
+    BINARY_DIR = os.path.realpath(args.binary_dir)
+    MEMCHECK = 1 if args.memcheck is not None else 0
+    SUBMIT = 0 if args.disable_cdash else 1
+    ARGN = "${ARGN}"
+
+    if args.memcheck == "ThreadSanitizer":
+        MEMCHECK = 0
+
+    _script = f"""
+    macro(dashboard_submit)
+        if("{SUBMIT}" GREATER 0)
+            ctest_submit({ARGN})
+        endif()
+    endmacro()
+    """
+
+    _script += """
+
+    include("${CMAKE_CURRENT_LIST_DIR}/CTestCustom.cmake")
+
+    macro(handle_error _message _ret)
+        if(NOT ${${_ret}} EQUAL 0)
+            dashboard_submit(PARTS Done RETURN_VALUE _submit_ret)
+            message(FATAL_ERROR "${_message} failed: ${${_ret}}")
+        endif()
+    endmacro()
+    """
+
+    _script += f"""
+    ctest_start({DASHBOARD_MODE})
+    ctest_update(SOURCE "{SOURCE_DIR}" RETURN_VALUE _update_ret
+                 CAPTURE_CMAKE_ERROR _update_err)
+    ctest_configure(BUILD "{BINARY_DIR}" RETURN_VALUE _configure_ret)
+    dashboard_submit(PARTS Start Update Configure RETURN_VALUE _submit_ret)
+
+    if(NOT _update_err EQUAL 0)
+        message(WARNING "ctest_update failed")
+    endif()
+
+    handle_error("Configure" _configure_ret)
+
+    ctest_build(BUILD "{BINARY_DIR}" RETURN_VALUE _build_ret)
+    dashboard_submit(PARTS Build RETURN_VALUE _submit_ret)
+
+    handle_error("Build" _build_ret)
+
+    if("{MEMCHECK}" GREATER 0)
+        ctest_memcheck(BUILD "{BINARY_DIR}" RETURN_VALUE _test_ret)
+        dashboard_submit(PARTS Test RETURN_VALUE _submit_ret)
+    else()
+        ctest_test(BUILD "{BINARY_DIR}" RETURN_VALUE _test_ret)
+        dashboard_submit(PARTS Test RETURN_VALUE _submit_ret)
+    endif()
+
+    if("{CODECOV}" GREATER 0)
+        ctest_coverage(
+            BUILD "{BINARY_DIR}"
+            RETURN_VALUE _coverage_ret
+            CAPTURE_CMAKE_ERROR _coverage_err)
+        dashboard_submit(PARTS Coverage RETURN_VALUE _submit_ret)
+    endif()
+
+    handle_error("Testing" _test_ret)
+
+    dashboard_submit(PARTS Done RETURN_VALUE _submit_ret)
+    """
+    return _script
+
+
+def parse_cdash_args(args):
+    BUILD_JOBS = multiprocessing.cpu_count()
+    DASHBOARD_MODE = "Continuous"
+    DASHBOARD_STAGES = [
+        "Start",
+        "Update",
+        "Configure",
+        "Build",
+        "Test",
+        "MemCheck",
+        "Coverage",
+        "Submit",
+    ]
+    SOURCE_DIR = os.getcwd()
+    BINARY_DIR = os.path.join(SOURCE_DIR, "build")
+    SITE = socket.gethostname()
+    SUBMIT_URL = f"{_BASE_URL}/submit.php?project={_PROJECT_NAME}"
+
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "-n", "--name", help="Job name", default=None, type=str, required=True
+    )
+    parser.add_argument("-s", "--site", help="Site name", default=SITE, type=str)
+    parser.add_argument(
+        "-q", "--quiet", help="Disable printing logs", action="store_true"
+    )
+    parser.add_argument(
+        "-c", "--coverage", help="Enable code coverage", action="store_true"
+    )
+    parser.add_argument(
+        "-j",
+        "--build-jobs",
+        help="Number of build tasks",
+        default=BUILD_JOBS,
+        type=int,
+    )
+    parser.add_argument(
+        "-B",
+        "--binary-dir",
+        help="Build directory",
+        default=BINARY_DIR,
+        type=str,
+    )
+    parser.add_argument(
"-S", + "--source-dir", + help="Source directory", + default=SOURCE_DIR, + type=str, + ) + parser.add_argument( + "-F", + "--clean", + help="Remove existing build directory", + action="store_true", + ) + parser.add_argument( + "-M", + "--mode", + help="Dashboard mode", + default=DASHBOARD_MODE, + choices=("Continuous", "Nightly", "Experimental"), + type=str, + ) + parser.add_argument( + "-T", + "--stages", + help="Dashboard stages", + nargs="+", + default=DASHBOARD_STAGES, + choices=DASHBOARD_STAGES, + type=str, + ) + parser.add_argument( + "--submit-url", + help="CDash submission site", + default=SUBMIT_URL, + type=str, + ) + parser.add_argument( + "--repeat-until-pass", + help=" for --repeat until-pass:", + default=None, + type=int, + ) + parser.add_argument( + "--repeat-until-fail", + help=" for --repeat until-fail:", + default=None, + type=int, + ) + parser.add_argument( + "--repeat-after-timeout", + help=" for --repeat after-timeout:", + default=None, + type=int, + ) + parser.add_argument( + "--disable-cdash", + help="Disable submitting results to CDash dashboard", + action="store_true", + ) + parser.add_argument( + "--gpu-targets", + help="GPU build architectures", + default="gfx900 gfx906 gfx908 gfx90a gfx1030 gfx1100".split(), + type=str, + nargs="+", + ) + parser.add_argument( + "--memcheck", + help="Run dynamic analysis tool", + default=None, + type=str, + choices=( + "ThreadSanitizer", + "AddressSanitizer", + "LeakSanitizer", + "MemorySanitizer", + "UndefinedBehaviorSanitizer", + ), + ) + parser.add_argument( + "--linter", + help="Enable linting tool", + default=None, + type=str, + choices=("clang-tidy",), + ) + + return parser.parse_args(args) + + +def parse_args(args=None): + if args is None: + args = sys.argv[1:] + + index = 0 + input_args = [] + ctest_args = [] + cmake_args = [] + data = [input_args, cmake_args, ctest_args] + cmd = os.path.basename(sys.argv[0]) + + for itr in args: + if itr == "--": + index += 1 + if index > 2: + raise RuntimeError( + f"Usage: {cmd} -- -- " + ) + else: + data[index].append(itr) + + cdash_args = parse_cdash_args(input_args) + + if cdash_args.coverage: + cmake_args += ["-DROCPROFILER_BUILD_CODECOV=ON"] + + if cdash_args.linter == "clang-tidy": + cmake_args += ["-DROCPROFILER_ENABLE_CLANG_TIDY=ON"] + + def get_repeat_val(_param): + _value = getattr(cdash_args, f"repeat_{_param}".replace("-", "_")) + return [f"{_param}:{_value}"] if _value is not None and _value > 1 else [] + + repeat_args = ( + get_repeat_val("until-pass") + + get_repeat_val("until-fail") + + get_repeat_val("after-timeout") + ) + ctest_args += ["--repeat"] + repeat_args if len(repeat_args) > 0 else [] + + return [cdash_args, cmake_args, ctest_args] + + +def run(*args, **kwargs): + import subprocess + + return subprocess.run(*args, **kwargs) + + +if __name__ == "__main__": + args, cmake_args, ctest_args = parse_args() + + if args.clean and os.path.exists(args.binary_dir): + if args.source_dir == args.binary_dir: + raise RuntimeError( + f"cannot clean binary directory == source directory ({args.source_dir})" + ) + + shutil.rmtree(args.binary_dir) + + if not os.path.exists(args.binary_dir): + os.makedirs(args.binary_dir) + + from textwrap import dedent + + _config = dedent(generate_custom(args, cmake_args, ctest_args)) + _script = dedent(generate_dashboard_script(args)) + + if not args.quiet: + sys.stderr.write(f"##### CTestCustom.cmake #####\n\n{_config}\n\n") + sys.stderr.write(f"##### dashboard.cmake #####\n\n{_script}\n\n") + + with open(os.path.join(args.binary_dir, 
"CTestCustom.cmake"), "w") as f: + f.write(f"{_config}\n") + + with open(os.path.join(args.binary_dir, "dashboard.cmake"), "w") as f: + f.write(f"{_script}\n") + + CTEST_CMD = which("ctest", require=True) + + dashboard_args = ["-D"] + for itr in args.stages: + dashboard_args.append(f"{args.mode}{itr}") + + try: + if not args.quiet and len(ctest_args) == 0: + ctest_args = ["--output-on-failure", "-V"] + + run( + [CTEST_CMD] + + dashboard_args + + [ + "-S", + os.path.join(args.binary_dir, "dashboard.cmake"), + ] + + ctest_args, + check=True, + ) + finally: + if "-VV" not in ctest_args and not args.quiet: + for file in glob.glob( + os.path.join(args.binary_dir, "Testing/Temporary/**"), + recursive=True, + ): + if not os.path.isfile(file): + continue + if ( + re.match( + r"Last(Start|Update|Configure|Build|Test).*\.log$", + os.path.basename(file), + ) + is None + ): + continue + + print(f"\n\n\n###### Reading {file}... ######\n\n\n") + with open(file, "r") as inpf: + fdata = inpf.read() + if "LastTest" not in file and "Coverage" not in file: + print(fdata) + oname = os.path.basename(file) + if oname.endswith(".log"): + oname += ".log" + with open(os.path.join(args.binary_dir, oname), "w") as outf: + print(f"\n\n###### Writing {oname}... ######\n\n") + outf.write(fdata) diff --git a/script/thread-sanitizer-suppr.txt b/script/thread-sanitizer-suppr.txt new file mode 100644 index 00000000..32d5847d --- /dev/null +++ b/script/thread-sanitizer-suppr.txt @@ -0,0 +1,9 @@ +# +# ThreadSanitizer suppressions file for rocprofiler project. +# + +# leaked thread +thread:libhsa-runtime64.so + +# unlock of an unlocked mutex (or by a wrong thread) +mutex:librocm_smi64.so diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 8ab325d3..c97ad943 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -20,5 +20,9 @@ # THE SOFTWARE. 
################################################################################ +if(ROCPROFILER_BUILD_CODECOV) + set(CMAKE_BUILD_TYPE "Coverage") +endif() + add_subdirectory(api) add_subdirectory(tools) diff --git a/src/api/CMakeLists.txt b/src/api/CMakeLists.txt index f994227e..c184210a 100644 --- a/src/api/CMakeLists.txt +++ b/src/api/CMakeLists.txt @@ -242,8 +242,9 @@ target_include_directories( ${ROCPROFILER_TARGET} PUBLIC $ PRIVATE ${LIB_DIR} ${ROOT_DIR} ${PROJECT_SOURCE_DIR}/include/rocprofiler) -target_link_libraries(${ROCPROFILER_TARGET} PRIVATE ${AQLPROFILE_LIB} - hsa-runtime64::hsa-runtime64 c stdc++) +target_link_libraries( + ${ROCPROFILER_TARGET} PRIVATE ${AQLPROFILE_LIB} hsa-runtime64::hsa-runtime64 c stdc++ + dl rocprofiler::build-flags rocprofiler::memcheck) get_target_property(ROCPROFILER_LIBRARY_V1_NAME ${ROCPROFILER_TARGET} NAME) get_target_property(ROCPROFILER_LIBRARY_V1_VERSION ${ROCPROFILER_TARGET} VERSION) @@ -313,47 +314,26 @@ target_include_directories( $ PRIVATE ${LIB_DIR} ${ROOT_DIR} ${CMAKE_CURRENT_BINARY_DIR} ${PROJECT_SOURCE_DIR} ${PROJECT_SOURCE_DIR}/tools) -if(ASAN) - target_compile_options(rocprofiler-v2 PRIVATE -fsanitize=address) - target_link_options( - rocprofiler-v2 PRIVATE -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/exportmap - -Wl,--no-undefined,-fsanitize=address) - target_link_libraries( - rocprofiler-v2 - PRIVATE ${AQLPROFILE_LIB} - hsa-runtime64::hsa-runtime64 - Threads::Threads - atomic - numa - asan - dl - c - stdc++ - stdc++fs - amd_comgr - dw - elf - ${PCIACCESS_LIBRARIES}) -else() - target_link_options( - rocprofiler-v2 PRIVATE -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/exportmap - -Wl,--no-undefined) - target_link_libraries( - rocprofiler-v2 - PRIVATE ${AQLPROFILE_LIB} - hsa-runtime64::hsa-runtime64 - Threads::Threads - atomic - numa - dl - c - stdc++ - stdc++fs - amd_comgr - dw - elf - ${PCIACCESS_LIBRARIES}) -endif() +target_link_libraries(rocprofiler-v2 PRIVATE rocprofiler::build-flags) +target_link_options( + rocprofiler-v2 PRIVATE -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/exportmap + -Wl,--no-undefined) +target_link_libraries( + rocprofiler-v2 + PRIVATE ${AQLPROFILE_LIB} + hsa-runtime64::hsa-runtime64 + Threads::Threads + atomic + numa + dl + c + stdc++ + stdc++fs + amd_comgr + dw + elf + ${PCIACCESS_LIBRARIES} + rocprofiler::memcheck) get_target_property(ROCPROFILER_LIBRARY_V2_NAME rocprofiler-v2 OUTPUT_NAME) get_target_property(ROCPROFILER_LIBRARY_V2_VERSION rocprofiler-v2 VERSION) @@ -372,7 +352,12 @@ add_custom_command( COMMAND ${CMAKE_COMMAND} -E create_symlink lib${ROCPROFILER_LIBRARY_V2_NAME}.so.${ROCPROFILER_LIBRARY_V2_SOVERSION} - ${CMAKE_BINARY_DIR}/lib/lib${ROCPROFILER_LIBRARY_V2_NAME}v2.so) + ${CMAKE_BINARY_DIR}/lib/lib${ROCPROFILER_LIBRARY_V2_NAME}v2.so + # Temporarily up till Jenkins side is fixed + COMMAND + ${CMAKE_COMMAND} -E create_symlink + lib/lib${ROCPROFILER_LIBRARY_V1_NAME}.so + ${CMAKE_BINARY_DIR}/lib${ROCPROFILER_LIBRARY_V1_NAME}.so) # Add custom target to trigger the create_symlink command add_custom_target(create_rocprofiler_lib DEPENDS rocprofiler-v2 ${ROCPROFILER_TARGET}) diff --git a/src/core/counters/basic/xml_parser_basic.py b/src/core/counters/basic/xml_parser_basic.py index 9ada504d..1478815d 100644 --- a/src/core/counters/basic/xml_parser_basic.py +++ b/src/core/counters/basic/xml_parser_basic.py @@ -6,228 +6,334 @@ from lxml import etree import sys -CPP_OUT='basic_counter.cpp' +CPP_OUT = "basic_counter.cpp" -if (__name__ == "__main__"): - cpp_content = '' - cpp_content += '/* 
Copyright (c) 2022 Advanced Micro Devices, Inc.\n' - cpp_content += '\n' - cpp_content += ' Permission is hereby granted, free of charge, to any person obtaining a copy\n' - cpp_content += ' of this software and associated documentation files (the \"Software\"), to deal\n' - cpp_content += ' in the Software without restriction, including without limitation the rights\n' - cpp_content += ' to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n' - cpp_content += ' copies of the Software, and to permit persons to whom the Software is\n' - cpp_content += ' furnished to do so, subject to the following conditions:\n' - cpp_content += '\n' - cpp_content += ' The above copyright notice and this permission notice shall be included in\n' - cpp_content += ' all copies or substantial portions of the Software.\n' - cpp_content += '\n' - cpp_content += ' THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n' - cpp_content += ' IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n' - cpp_content += ' FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n' - cpp_content += ' AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n' - cpp_content += ' LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n' - cpp_content += ' OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN\n' - cpp_content += ' THE SOFTWARE. */\n' - cpp_content += '\n' - cpp_content += '#include \n' - cpp_content += '#include "src/utils/helper.h"\n' - cpp_content += '\n' - cpp_content += '#include \"src/core/counters/basic/basic_counter.h\"\n' - cpp_content += '#include \"src/core/hardware/hsa_info.h\"\n' - cpp_content += '\n' - cpp_content += '#define ASSERTM(exp, msg) assert(((void)msg, exp))\n' - cpp_content += '\n' - cpp_content += '#pragma GCC diagnostic push\n' - cpp_content += '#pragma GCC diagnostic ignored \"-Wmaybe-uninitialized\"\n' - cpp_content += 'namespace Counter {\n' - cpp_content += '\n' - cpp_content += 'BasicCounter::BasicCounter(uint64_t event_id, std::string block_id,\n' - cpp_content += ' std::string name, std::string description,\n' - cpp_content += ' std::string gpu_name)\n' - cpp_content += ' : Counter(name, description, gpu_name),\n' - cpp_content += ' event_id_(event_id),\n' - cpp_content += ' block_id_(block_id) {\n' - cpp_content += ' AddCounterToCounterMap();\n' - cpp_content += '}\n' - cpp_content += '\n' - cpp_content += 'BasicCounter::~BasicCounter() {}\n' - cpp_content += '\n' - cpp_content += 'uint64_t BasicCounter::GetBasicCounterID() {\n' - cpp_content += ' return GetCounterID();\n' - cpp_content += '}\n' - cpp_content += '\n' - cpp_content += 'uint64_t BasicCounter::GetEventId() { return event_id_; }\n' - cpp_content += 'std::string BasicCounter::GetBlockId() { return block_id_; }\n' - cpp_content += 'std::string BasicCounter::GetName() { return Counter::GetName(); }\n' - cpp_content += '\n' - cpp_content += 'bool BasicCounter::GetValue(uint64_t* value, int64_t instance_id = -1) {\n' - cpp_content += ' Agent::CounterHardwareInfo* agent_info =\n' - cpp_content += ' reinterpret_cast(counter_hw_info);\n' - cpp_content += ' if ((agent_info->getNumInstances() > 1 && instance_id == -1) ||\n' - cpp_content += ' instance_id < -1 || instance_id >= agent_info->getNumInstances())\n' - cpp_content += ' return false;\n' - cpp_content += ' if (instance_id == -1) *value = instances_values_[0];\n' - cpp_content += ' *value = instances_values_[instance_id];\n' - 
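# FIXME: in the emitted GetValue(uint64_t*, int64_t) above, the instance_id == -1
# branch assigns instances_values_[0] and is then unconditionally overwritten by
# instances_values_[instance_id] (i.e. index -1); unlike the single-argument
# overload below, it most likely needs an early return (or an else) after the
# instance_id == -1 case.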
cpp_content += ' return true;\n' - cpp_content += '}\n' - cpp_content += '\n' - cpp_content += 'uint64_t BasicCounter::GetValue(int64_t instance_id) {\n' - cpp_content += ' Agent::CounterHardwareInfo* agent_info =\n' - cpp_content += ' reinterpret_cast(counter_hw_info);\n' - cpp_content += ' if ((agent_info->getNumInstances() > 1 && instance_id == -1) ||\n' - cpp_content += ' instance_id < -1 || instance_id >= agent_info->getNumInstances())\n' - cpp_content += ' throw(std::string(\"Error: Wrong number of instances (\") +\n' - cpp_content += ' std::to_string(agent_info->getNumInstances()) +\n' - cpp_content += ' \") OR Instance ID is less than 0 \");\n' - cpp_content += ' if (instance_id == -1) return instances_values_[0];\n' - cpp_content += ' return instances_values_[instance_id];\n' - cpp_content += '}\n' - cpp_content += '\n' - cpp_content += 'uint64_t BasicCounter::avr(int64_t instances_count) {\n' - cpp_content += ' Agent::CounterHardwareInfo* agent_info =\n' - cpp_content += ' reinterpret_cast(counter_hw_info);\n' - cpp_content += ' if (agent_info->getNumInstances() > instances_count)\n' - cpp_content += ' throw(std::string(\"Error: Number of instances (\") +\n' - cpp_content += ' std::to_string(agent_info->getNumInstances()) +\n' - cpp_content += ' \") is greater than the given instance count(\" +\n' - cpp_content += ' std::to_string(instances_count) + \")\");\n' - cpp_content += ' uint64_t result = 0;\n' - cpp_content += ' int64_t instance_id;\n' - cpp_content += ' for (instance_id = 0; instance_id < instances_count; instance_id++) {\n' - cpp_content += ' uint64_t value;\n' - cpp_content += ' if (GetValue(&value, instance_id)) result += value;\n' - cpp_content += ' }\n' - cpp_content += ' return result / instances_count;\n' - cpp_content += '}\n' - cpp_content += 'uint64_t BasicCounter::max(int64_t instances_count) {\n' - cpp_content += ' uint64_t result = 0;\n' - cpp_content += ' int64_t instance_id;\n' - cpp_content += ' for (instance_id = 0; instance_id < instances_count; instance_id++) {\n' - cpp_content += ' uint64_t value;\n' - cpp_content += ' if (GetValue(&value, instance_id) && result < value) result = value;\n' - cpp_content += ' }\n' - cpp_content += ' return result;\n' - cpp_content += '}\n' - cpp_content += 'uint64_t BasicCounter::min(int64_t instances_count) {\n' - cpp_content += ' int64_t instance_id;\n' - cpp_content += ' uint64_t result = 0;\n' - cpp_content += ' for (instance_id = 0; instance_id < instances_count; instance_id++) {\n' - cpp_content += ' uint64_t value;\n' - cpp_content += ' if (GetValue(&value, instance_id) && result > value) result = value;\n' - cpp_content += ' }\n' - cpp_content += ' return result;\n' - cpp_content += '}\n' - cpp_content += 'uint64_t BasicCounter::sum(int64_t instances_count) {\n' - cpp_content += ' int64_t instance_id;\n' - cpp_content += ' uint64_t result = 0;\n' - cpp_content += ' for (instance_id = 0; instance_id < instances_count; instance_id++) {\n' - cpp_content += ' uint64_t value;\n' - cpp_content += ' if (GetValue(&value, instance_id)) result += value;\n' - cpp_content += ' }\n' - cpp_content += ' return result;\n' - cpp_content += '}\n' - cpp_content += '\n' - cpp_content += 'uint64_t operator+(BasicCounter counter, const uint64_t number) {\n' - cpp_content += ' [[maybe_unused]] uint64_t value = 0;\n' - cpp_content += ' ASSERTM(counter.GetValue(&value), \"Error: Counter has no value!\");\n' - cpp_content += ' return number + value;\n' - cpp_content += '}\n' - cpp_content += 'uint64_t operator-(BasicCounter counter, 
const uint64_t number) {\n' - cpp_content += ' [[maybe_unused]] uint64_t value = 0;\n' - cpp_content += ' ASSERTM(counter.GetValue(&value), \"Error: Counter has no value!\");\n' - cpp_content += ' return number - value;\n' - cpp_content += '}\n' - cpp_content += 'uint64_t operator*(BasicCounter counter, const uint64_t number) {\n' - cpp_content += ' [[maybe_unused]] uint64_t value = 0;\n' - cpp_content += ' ASSERTM(counter.GetValue(&value), \"Error: Counter has no value!\");\n' - cpp_content += ' return number * value;\n' - cpp_content += '}\n' - cpp_content += 'uint64_t operator/(BasicCounter counter, const uint64_t number) {\n' - cpp_content += ' [[maybe_unused]] uint64_t value = 0;\n' - cpp_content += ' ASSERTM(counter.GetValue(&value), \"Error: Counter has no value!\");\n' - cpp_content += ' return number / value;\n' - cpp_content += '}\n' - cpp_content += 'uint64_t operator^(BasicCounter counter, const uint64_t number) {\n' - cpp_content += ' [[maybe_unused]] uint64_t value = 0;\n' - cpp_content += ' ASSERTM(counter.GetValue(&value), \"Error: Counter has no value!\");\n' - cpp_content += ' return number ^ value;\n' - cpp_content += '}\n' - cpp_content += '\n' - cpp_content += 'uint64_t operator+(BasicCounter counter1, BasicCounter counter2) {\n' - cpp_content += ' [[maybe_unused]] uint64_t value1 = 0;\n' - cpp_content += ' ASSERTM(counter1.GetValue(&value1), \"Error: Counter has no value!\");\n' - cpp_content += ' [[maybe_unused]] uint64_t value2 = 0;\n' - cpp_content += ' ASSERTM(counter2.GetValue(&value2), \"Error: Counter has no value!\");\n' - cpp_content += ' return value1 + value2;\n' - cpp_content += '}\n' - cpp_content += 'uint64_t operator-(BasicCounter counter1, BasicCounter counter2) {\n' - cpp_content += ' [[maybe_unused]] uint64_t value1 = 0;\n' - cpp_content += ' ASSERTM(counter1.GetValue(&value1), \"Error: Counter has no value!\");\n' - cpp_content += ' [[maybe_unused]] uint64_t value2 = 0;\n' - cpp_content += ' ASSERTM(counter2.GetValue(&value2), \"Error: Counter has no value!\");\n' - cpp_content += ' return value1 - value2;\n' - cpp_content += '}\n' - cpp_content += 'uint64_t operator*(BasicCounter counter1, BasicCounter counter2) {\n' - cpp_content += ' [[maybe_unused]] uint64_t value1 = 0;\n' - cpp_content += ' ASSERTM(counter1.GetValue(&value1), \"Error: Counter has no value!\");\n' - cpp_content += ' [[maybe_unused]] uint64_t value2 = 0;\n' - cpp_content += ' ASSERTM(counter2.GetValue(&value2), \"Error: Counter has no value!\");\n' - cpp_content += ' return value1 * value2;\n' - cpp_content += '}\n' - cpp_content += 'uint64_t operator/(BasicCounter counter1, BasicCounter counter2) {\n' - cpp_content += ' [[maybe_unused]] uint64_t value1 = 0;\n' - cpp_content += ' ASSERTM(counter1.GetValue( & value1), \"Error: Counter has no value!\");\n' - cpp_content += ' [[maybe_unused]] uint64_t value2 = 0;\n' - cpp_content += ' ASSERTM(counter2.GetValue( & value2), \"Error: Counter has no value!\");\n' - cpp_content += ' return value1 / value2;\n' - cpp_content += '}\n' - cpp_content += 'uint64_t operator^(BasicCounter counter1, BasicCounter counter2) {\n' - cpp_content += ' [[maybe_unused]] uint64_t value1 = 0;\n' - cpp_content += ' ASSERTM(counter1.GetValue(&value1), \"Error: Counter has no value!\");\n' - cpp_content += ' [[maybe_unused]] uint64_t value2 = 0;\n' - cpp_content += ' ASSERTM(counter2.GetValue(&value2), \"Error: Counter has no value!\");\n' - cpp_content += ' return value1 ^ value2;\n' - cpp_content += '}\n' - cpp_content += '\n' - cpp_content += 'static 
std::map basic_counters;\n' - cpp_content += '\n' - cpp_content += 'BasicCounter* GetGeneratedBasicCounter(uint64_t id) {\n' - cpp_content += ' return &basic_counters.at(id);\n' - cpp_content += '}\n' - cpp_content += '\n' - cpp_content += 'void ClearBasicCounters() {\n' - cpp_content += ' basic_counters.clear();\n' - cpp_content += '}\n' - cpp_content += '\n' - cpp_content += '/**\n' - cpp_content += ' * @brief Basic Counters\n' - cpp_content += ' *\n' - cpp_content += ' * @{\n' - cpp_content += ' */\n' - cpp_content += 'uint64_t GetBasicCounter(const char* name, const char* gpu_name) {\n' - cpp_content += ' std::string gpu;\n' - parser=etree.XMLParser(recover=True, encoding='utf-8') - xml_file=ET.parse(sys.argv[1] + '/gfx_metrics.xml', parser=parser) - root=xml_file.getroot() - for gpu in root: - cpp_content += "\n\t/**\n\t * @brief Basic " + gpu.tag + " counters\n\t *\n\t * @{\n\t */\n" - cpp_content += "\tgpu = \"" + gpu.tag + "\";\n\n" - cpp_content += "\tif (strncmp(gpu_name, gpu.c_str(), gpu.length())==0) {\n" - for child in gpu: - cpp_content += "\t/**\n\t * Basic Counter: " + child.attrib['name'] + "\n\t *\n\t * " + child.attrib['descr'] + "\n\t */\n\tif (strcmp(name, \"" + child.attrib['name'] + "\")==0) {\n\t\tbasic_counters.emplace(" + child.attrib['event'] + ", BasicCounter{" + child.attrib['event'] + ", \"" + child.attrib['block'] + "\", \"" + child.attrib['name'] + "\", \"" + child.attrib['descr'] + "\", \"" + gpu.tag + "\"});\n\t\treturn " + child.attrib['event'] + ";\n\t}\n" - cpp_content += "\t}\n\n\t/**\n\t * @}\n\t */\n" - cpp_content += ' throw(\"Couldn\'t find the required Counter name for the mentioned GPU!\");\n' - cpp_content += ' return 0;\n' - cpp_content += '}\n' - cpp_content += '/**\n' - cpp_content += ' * @}\n' - cpp_content += ' */\n' - cpp_content += '\n' - cpp_content += '} // namespace Counter\n' - cpp_content += '\n' - cpp_content += '#pragma GCC diagnostic pop\n' - print ('Generating "' + sys.argv[2] + '"') - f = open(sys.argv[2], 'w') - f.write(cpp_content[:-1]) - f.close() +if __name__ == "__main__": + cpp_content = "" + cpp_content += "/* Copyright (c) 2022 Advanced Micro Devices, Inc.\n" + cpp_content += "\n" + cpp_content += ( + " Permission is hereby granted, free of charge, to any person obtaining a copy\n" + ) + cpp_content += ( + ' of this software and associated documentation files (the "Software"), to deal\n' + ) + cpp_content += ( + " in the Software without restriction, including without limitation the rights\n" + ) + cpp_content += ( + " to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n" + ) + cpp_content += ( + " copies of the Software, and to permit persons to whom the Software is\n" + ) + cpp_content += " furnished to do so, subject to the following conditions:\n" + cpp_content += "\n" + cpp_content += ( + " The above copyright notice and this permission notice shall be included in\n" + ) + cpp_content += " all copies or substantial portions of the Software.\n" + cpp_content += "\n" + cpp_content += ( + ' THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n' + ) + cpp_content += ( + " IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n" + ) + cpp_content += ( + " FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE\n" + ) + cpp_content += ( + " AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n" + ) + cpp_content += ( + " LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n" + ) + cpp_content += ( + " OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN\n" + ) + cpp_content += " THE SOFTWARE. */\n" + cpp_content += "\n" + cpp_content += "#include \n" + cpp_content += '#include "src/utils/helper.h"\n' + cpp_content += "\n" + cpp_content += '#include "src/core/counters/basic/basic_counter.h"\n' + cpp_content += '#include "src/core/hardware/hsa_info.h"\n' + cpp_content += "\n" + cpp_content += "#define ASSERTM(exp, msg) assert(((void)msg, exp))\n" + cpp_content += "\n" + cpp_content += "#pragma GCC diagnostic push\n" + cpp_content += '#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"\n' + cpp_content += "namespace Counter {\n" + cpp_content += "\n" + cpp_content += "BasicCounter::BasicCounter(uint64_t event_id, std::string block_id,\n" + cpp_content += ( + " std::string name, std::string description,\n" + ) + cpp_content += " std::string gpu_name)\n" + cpp_content += " : Counter(name, description, gpu_name),\n" + cpp_content += " event_id_(event_id),\n" + cpp_content += " block_id_(block_id) {\n" + cpp_content += " AddCounterToCounterMap();\n" + cpp_content += "}\n" + cpp_content += "\n" + cpp_content += "BasicCounter::~BasicCounter() {}\n" + cpp_content += "\n" + cpp_content += "uint64_t BasicCounter::GetBasicCounterID() {\n" + cpp_content += " return GetCounterID();\n" + cpp_content += "}\n" + cpp_content += "\n" + cpp_content += "uint64_t BasicCounter::GetEventId() { return event_id_; }\n" + cpp_content += "std::string BasicCounter::GetBlockId() { return block_id_; }\n" + cpp_content += "std::string BasicCounter::GetName() { return Counter::GetName(); }\n" + cpp_content += "\n" + cpp_content += ( + "bool BasicCounter::GetValue(uint64_t* value, int64_t instance_id = -1) {\n" + ) + cpp_content += " Agent::CounterHardwareInfo* agent_info =\n" + cpp_content += ( + " reinterpret_cast(counter_hw_info);\n" + ) + cpp_content += " if ((agent_info->getNumInstances() > 1 && instance_id == -1) ||\n" + cpp_content += ( + " instance_id < -1 || instance_id >= agent_info->getNumInstances())\n" + ) + cpp_content += " return false;\n" + cpp_content += " if (instance_id == -1) *value = instances_values_[0];\n" + cpp_content += " *value = instances_values_[instance_id];\n" + cpp_content += " return true;\n" + cpp_content += "}\n" + cpp_content += "\n" + cpp_content += "uint64_t BasicCounter::GetValue(int64_t instance_id) {\n" + cpp_content += " Agent::CounterHardwareInfo* agent_info =\n" + cpp_content += ( + " reinterpret_cast(counter_hw_info);\n" + ) + cpp_content += " if ((agent_info->getNumInstances() > 1 && instance_id == -1) ||\n" + cpp_content += ( + " instance_id < -1 || instance_id >= agent_info->getNumInstances())\n" + ) + cpp_content += ' throw(std::string("Error: Wrong number of instances (") +\n' + cpp_content += " std::to_string(agent_info->getNumInstances()) +\n" + cpp_content += ' ") OR Instance ID is less than 0 ");\n' + cpp_content += " if (instance_id == -1) return instances_values_[0];\n" + cpp_content += " return instances_values_[instance_id];\n" + cpp_content += "}\n" + cpp_content += "\n" + cpp_content += "uint64_t BasicCounter::avr(int64_t instances_count) {\n" + cpp_content += " Agent::CounterHardwareInfo* agent_info =\n" + cpp_content += ( + " reinterpret_cast(counter_hw_info);\n" + ) + cpp_content 
+= " if (agent_info->getNumInstances() > instances_count)\n" + cpp_content += ' throw(std::string("Error: Number of instances (") +\n' + cpp_content += " std::to_string(agent_info->getNumInstances()) +\n" + cpp_content += ' ") is greater than the given instance count(" +\n' + cpp_content += ' std::to_string(instances_count) + ")");\n' + cpp_content += " uint64_t result = 0;\n" + cpp_content += " int64_t instance_id;\n" + cpp_content += ( + " for (instance_id = 0; instance_id < instances_count; instance_id++) {\n" + ) + cpp_content += " uint64_t value;\n" + cpp_content += " if (GetValue(&value, instance_id)) result += value;\n" + cpp_content += " }\n" + cpp_content += " return result / instances_count;\n" + cpp_content += "}\n" + cpp_content += "uint64_t BasicCounter::max(int64_t instances_count) {\n" + cpp_content += " uint64_t result = 0;\n" + cpp_content += " int64_t instance_id;\n" + cpp_content += ( + " for (instance_id = 0; instance_id < instances_count; instance_id++) {\n" + ) + cpp_content += " uint64_t value;\n" + cpp_content += ( + " if (GetValue(&value, instance_id) && result < value) result = value;\n" + ) + cpp_content += " }\n" + cpp_content += " return result;\n" + cpp_content += "}\n" + cpp_content += "uint64_t BasicCounter::min(int64_t instances_count) {\n" + cpp_content += " int64_t instance_id;\n" + cpp_content += " uint64_t result = 0;\n" + cpp_content += ( + " for (instance_id = 0; instance_id < instances_count; instance_id++) {\n" + ) + cpp_content += " uint64_t value;\n" + cpp_content += ( + " if (GetValue(&value, instance_id) && result > value) result = value;\n" + ) + cpp_content += " }\n" + cpp_content += " return result;\n" + cpp_content += "}\n" + cpp_content += "uint64_t BasicCounter::sum(int64_t instances_count) {\n" + cpp_content += " int64_t instance_id;\n" + cpp_content += " uint64_t result = 0;\n" + cpp_content += ( + " for (instance_id = 0; instance_id < instances_count; instance_id++) {\n" + ) + cpp_content += " uint64_t value;\n" + cpp_content += " if (GetValue(&value, instance_id)) result += value;\n" + cpp_content += " }\n" + cpp_content += " return result;\n" + cpp_content += "}\n" + cpp_content += "\n" + cpp_content += "uint64_t operator+(BasicCounter counter, const uint64_t number) {\n" + cpp_content += " [[maybe_unused]] uint64_t value = 0;\n" + cpp_content += ( + ' ASSERTM(counter.GetValue(&value), "Error: Counter has no value!");\n' + ) + cpp_content += " return number + value;\n" + cpp_content += "}\n" + cpp_content += "uint64_t operator-(BasicCounter counter, const uint64_t number) {\n" + cpp_content += " [[maybe_unused]] uint64_t value = 0;\n" + cpp_content += ( + ' ASSERTM(counter.GetValue(&value), "Error: Counter has no value!");\n' + ) + cpp_content += " return number - value;\n" + cpp_content += "}\n" + cpp_content += "uint64_t operator*(BasicCounter counter, const uint64_t number) {\n" + cpp_content += " [[maybe_unused]] uint64_t value = 0;\n" + cpp_content += ( + ' ASSERTM(counter.GetValue(&value), "Error: Counter has no value!");\n' + ) + cpp_content += " return number * value;\n" + cpp_content += "}\n" + cpp_content += "uint64_t operator/(BasicCounter counter, const uint64_t number) {\n" + cpp_content += " [[maybe_unused]] uint64_t value = 0;\n" + cpp_content += ( + ' ASSERTM(counter.GetValue(&value), "Error: Counter has no value!");\n' + ) + cpp_content += " return number / value;\n" + cpp_content += "}\n" + cpp_content += "uint64_t operator^(BasicCounter counter, const uint64_t number) {\n" + cpp_content += " [[maybe_unused]] 
uint64_t value = 0;\n" + cpp_content += ( + ' ASSERTM(counter.GetValue(&value), "Error: Counter has no value!");\n' + ) + cpp_content += " return number ^ value;\n" + cpp_content += "}\n" + cpp_content += "\n" + cpp_content += "uint64_t operator+(BasicCounter counter1, BasicCounter counter2) {\n" + cpp_content += " [[maybe_unused]] uint64_t value1 = 0;\n" + cpp_content += ( + ' ASSERTM(counter1.GetValue(&value1), "Error: Counter has no value!");\n' + ) + cpp_content += " [[maybe_unused]] uint64_t value2 = 0;\n" + cpp_content += ( + ' ASSERTM(counter2.GetValue(&value2), "Error: Counter has no value!");\n' + ) + cpp_content += " return value1 + value2;\n" + cpp_content += "}\n" + cpp_content += "uint64_t operator-(BasicCounter counter1, BasicCounter counter2) {\n" + cpp_content += " [[maybe_unused]] uint64_t value1 = 0;\n" + cpp_content += ( + ' ASSERTM(counter1.GetValue(&value1), "Error: Counter has no value!");\n' + ) + cpp_content += " [[maybe_unused]] uint64_t value2 = 0;\n" + cpp_content += ( + ' ASSERTM(counter2.GetValue(&value2), "Error: Counter has no value!");\n' + ) + cpp_content += " return value1 - value2;\n" + cpp_content += "}\n" + cpp_content += "uint64_t operator*(BasicCounter counter1, BasicCounter counter2) {\n" + cpp_content += " [[maybe_unused]] uint64_t value1 = 0;\n" + cpp_content += ( + ' ASSERTM(counter1.GetValue(&value1), "Error: Counter has no value!");\n' + ) + cpp_content += " [[maybe_unused]] uint64_t value2 = 0;\n" + cpp_content += ( + ' ASSERTM(counter2.GetValue(&value2), "Error: Counter has no value!");\n' + ) + cpp_content += " return value1 * value2;\n" + cpp_content += "}\n" + cpp_content += "uint64_t operator/(BasicCounter counter1, BasicCounter counter2) {\n" + cpp_content += " [[maybe_unused]] uint64_t value1 = 0;\n" + cpp_content += ( + ' ASSERTM(counter1.GetValue( & value1), "Error: Counter has no value!");\n' + ) + cpp_content += " [[maybe_unused]] uint64_t value2 = 0;\n" + cpp_content += ( + ' ASSERTM(counter2.GetValue( & value2), "Error: Counter has no value!");\n' + ) + cpp_content += " return value1 / value2;\n" + cpp_content += "}\n" + cpp_content += "uint64_t operator^(BasicCounter counter1, BasicCounter counter2) {\n" + cpp_content += " [[maybe_unused]] uint64_t value1 = 0;\n" + cpp_content += ( + ' ASSERTM(counter1.GetValue(&value1), "Error: Counter has no value!");\n' + ) + cpp_content += " [[maybe_unused]] uint64_t value2 = 0;\n" + cpp_content += ( + ' ASSERTM(counter2.GetValue(&value2), "Error: Counter has no value!");\n' + ) + cpp_content += " return value1 ^ value2;\n" + cpp_content += "}\n" + cpp_content += "\n" + cpp_content += "static std::map basic_counters;\n" + cpp_content += "\n" + cpp_content += "BasicCounter* GetGeneratedBasicCounter(uint64_t id) {\n" + cpp_content += " return &basic_counters.at(id);\n" + cpp_content += "}\n" + cpp_content += "\n" + cpp_content += "void ClearBasicCounters() {\n" + cpp_content += " basic_counters.clear();\n" + cpp_content += "}\n" + cpp_content += "\n" + cpp_content += "/**\n" + cpp_content += " * @brief Basic Counters\n" + cpp_content += " *\n" + cpp_content += " * @{\n" + cpp_content += " */\n" + cpp_content += "uint64_t GetBasicCounter(const char* name, const char* gpu_name) {\n" + cpp_content += " std::string gpu;\n" + parser = etree.XMLParser(recover=True, encoding="utf-8") + xml_file = ET.parse(sys.argv[1] + "/gfx_metrics.xml", parser=parser) + root = xml_file.getroot() + for gpu in root: + cpp_content += ( + "\n\t/**\n\t * @brief Basic " + gpu.tag + " counters\n\t *\n\t * @{\n\t */\n" + ) + 
cpp_content += '\tgpu = "' + gpu.tag + '";\n\n' + cpp_content += "\tif (strncmp(gpu_name, gpu.c_str(), gpu.length())==0) {\n" + for child in gpu: + cpp_content += ( + "\t/**\n\t * Basic Counter: " + + child.attrib["name"] + + "\n\t *\n\t * " + + child.attrib["descr"] + + '\n\t */\n\tif (strcmp(name, "' + + child.attrib["name"] + + '")==0) {\n\t\tbasic_counters.emplace(' + + child.attrib["event"] + + ", BasicCounter{" + + child.attrib["event"] + + ', "' + + child.attrib["block"] + + '", "' + + child.attrib["name"] + + '", "' + + child.attrib["descr"] + + '", "' + + gpu.tag + + '"});\n\t\treturn ' + + child.attrib["event"] + + ";\n\t}\n" + ) + cpp_content += "\t}\n\n\t/**\n\t * @}\n\t */\n" + cpp_content += ( + ' throw("Couldn\'t find the required Counter name for the mentioned GPU!");\n' + ) + cpp_content += " return 0;\n" + cpp_content += "}\n" + cpp_content += "/**\n" + cpp_content += " * @}\n" + cpp_content += " */\n" + cpp_content += "\n" + cpp_content += "} // namespace Counter\n" + cpp_content += "\n" + cpp_content += "#pragma GCC diagnostic pop\n" + print('Generating "' + sys.argv[2] + '"') + f = open(sys.argv[2], "w") + f.write(cpp_content[:-1]) + f.close() diff --git a/src/core/counters/derived/xml_parser_derived.py b/src/core/counters/derived/xml_parser_derived.py index 07f5fe5b..be9e0283 100644 --- a/src/core/counters/derived/xml_parser_derived.py +++ b/src/core/counters/derived/xml_parser_derived.py @@ -7,82 +7,125 @@ import ast import sys -ops = {'Div': '/', 'Mult': '*', 'Add': '+', 'Sub': '-'} -calls = {'avr', 'max', 'min', 'sum'} +ops = {"Div": "/", "Mult": "*", "Add": "+", "Sub": "-"} +calls = {"avr", "max", "min", "sum"} + def parse_expr(gpu_tag, data): - global exprs_counters - global exprs_counters_init - global expr_print - global counter_count - global counters_dictionary - expr_queue = deque() - for line in data.split('\n'): - if 'Constant' in line: - number = line.split('(')[1].split(')')[0] - expr_queue.append('(uint64_t)' + number) - if 'Name' in line: - name = line.split('\'')[1] - if name in calls: - expr_queue.append(name) - else: - if not name in exprs_counters: - exprs_counters += "getGeneratedBasicCounter(" + name + "_id), " - exprs_counters_init += "\n\t\tuint64_t " + name + "_id = getBasicCounter(\"" + name + "\", \"" + gpu_tag + "\");" - counters_dictionary[name] = counter_count - counter_count+=1 - expr_queue.append("counter.getBasicCounterFromDerived(" + str(counters_dictionary[name]) + ")") - op = line.split('(')[0] - if op in ops: - expr_queue.append(ops[op]) - expr_print += "\n\t\t\t\treturn " - i = 0 - for element in expr_queue: - if element in calls: - i = 1 - call = element - elif i == 1: - expr_print += element + "." 
+ call + "(" - call = "" - i = 2 - elif i == 2: - expr_print += element + ")" - i = 0 - else: - expr_print += element - if "counter.getBasicCounterFromDerived" == element[0:34]: - expr_print += "->getValue()" + global exprs_counters + global exprs_counters_init + global expr_print + global counter_count + global counters_dictionary + expr_queue = deque() + for line in data.split("\n"): + if "Constant" in line: + number = line.split("(")[1].split(")")[0] + expr_queue.append("(uint64_t)" + number) + if "Name" in line: + name = line.split("'")[1] + if name in calls: + expr_queue.append(name) + else: + if not name in exprs_counters: + exprs_counters += "getGeneratedBasicCounter(" + name + "_id), " + exprs_counters_init += ( + "\n\t\tuint64_t " + + name + + '_id = getBasicCounter("' + + name + + '", "' + + gpu_tag + + '");' + ) + counters_dictionary[name] = counter_count + counter_count += 1 + expr_queue.append( + "counter.getBasicCounterFromDerived(" + + str(counters_dictionary[name]) + + ")" + ) + op = line.split("(")[0] + if op in ops: + expr_queue.append(ops[op]) + expr_print += "\n\t\t\t\treturn " + i = 0 + for element in expr_queue: + if element in calls: + i = 1 + call = element + elif i == 1: + expr_print += element + "." + call + "(" + call = "" + i = 2 + elif i == 2: + expr_print += element + ")" + i = 0 + else: + expr_print += element + if "counter.getBasicCounterFromDerived" == element[0:34]: + expr_print += "->getValue()" -if (__name__ == "__main__"): - global exprs_counters - global exprs_counters_init - global expr_print - global counter_count - parser = etree.XMLParser(recover=True, encoding='utf-8') - xml_file = ET.parse(sys.argv[1] + '/metrics.xml', parser=parser) - root = xml_file.getroot() - print( - "uint64_t getDerivedCounter(const char* name, const char* gpu_name) {") - for gpu in root: - print("\n\t/**\n\t * @brief Derived " + gpu.tag + " counters\n\t *\n\t * @{\n\t */") - print("\tif (strcmp(gpu_name, \"" + gpu.tag + "\")==0) {") - for child in gpu: - exprs_counters = "" - exprs_counters_init = "" - expr_print = "" - counter_count = 0 - counters_dictionary = {} - parse_expr(gpu.tag.split("_")[0], ast.dump(ast.parse( - child.attrib['expr'], mode='eval'), annotate_fields=False, include_attributes=False, indent=0)) - print("\t/**\n\t * Derived Counter: " + child.attrib['name'] + "\n\t *\n\t * " + child.attrib['descr'] + "\n\t */\n\tif (strcmp(name, \"" + - child.attrib['name'] + "\")==0) {" + exprs_counters_init + "\n\t\tDerivedCounter counter = DerivedCounter(\"" + child.attrib['name'] + - "\", \"" + child.attrib['descr'] + "\", \"" + gpu.tag.split("_")[0] + "\");") - exprs_counter_count = 0 - for expr_counter in exprs_counters[0:-2].split(", "): - print("\n\t\tcounter.addBasicCounter(" + str(exprs_counter_count) + ", " + expr_counter + ");") - exprs_counter_count += 1 - # print("\n\t\tcounter.evaluate_metric = [counter]() {" + expr_print + ";\n\t\t\t};") - print("\n\t\tderived_counters.emplace(counter.getMetricId(), counter);\n\t\treturn counter.getMetricId();\n\t}") - print("\t}\n\n\t/**\n\t * @}\n\t */") - print("\n\treturn 0;\n}\n") +if __name__ == "__main__": + global exprs_counters + global exprs_counters_init + global expr_print + global counter_count + parser = etree.XMLParser(recover=True, encoding="utf-8") + xml_file = ET.parse(sys.argv[1] + "/metrics.xml", parser=parser) + root = xml_file.getroot() + print("uint64_t getDerivedCounter(const char* name, const char* gpu_name) {") + for gpu in root: + print( + "\n\t/**\n\t * @brief Derived " + gpu.tag + " 
counters\n\t *\n\t * @{\n\t */" + ) + print('\tif (strcmp(gpu_name, "' + gpu.tag + '")==0) {') + for child in gpu: + exprs_counters = "" + exprs_counters_init = "" + expr_print = "" + counter_count = 0 + counters_dictionary = {} + parse_expr( + gpu.tag.split("_")[0], + ast.dump( + ast.parse(child.attrib["expr"], mode="eval"), + annotate_fields=False, + include_attributes=False, + indent=0, + ), + ) + print( + "\t/**\n\t * Derived Counter: " + + child.attrib["name"] + + "\n\t *\n\t * " + + child.attrib["descr"] + + '\n\t */\n\tif (strcmp(name, "' + + child.attrib["name"] + + '")==0) {' + + exprs_counters_init + + '\n\t\tDerivedCounter counter = DerivedCounter("' + + child.attrib["name"] + + '", "' + + child.attrib["descr"] + + '", "' + + gpu.tag.split("_")[0] + + '");' + ) + exprs_counter_count = 0 + for expr_counter in exprs_counters[0:-2].split(", "): + print( + "\n\t\tcounter.addBasicCounter(" + + str(exprs_counter_count) + + ", " + + expr_counter + + ");" + ) + exprs_counter_count += 1 + # print("\n\t\tcounter.evaluate_metric = [counter]() {" + expr_print + ";\n\t\t\t};") + print( + "\n\t\tderived_counters.emplace(counter.getMetricId(), counter);\n\t\treturn counter.getMetricId();\n\t}" + ) + print("\t}\n\n\t/**\n\t * @}\n\t */") + print("\n\treturn 0;\n}\n") diff --git a/src/core/counters/metrics/eval_metrics.cpp b/src/core/counters/metrics/eval_metrics.cpp index d8aebcfa..1a1abdd7 100644 --- a/src/core/counters/metrics/eval_metrics.cpp +++ b/src/core/counters/metrics/eval_metrics.cpp @@ -46,7 +46,7 @@ hsa_status_t pmcCallback(hsa_ven_amd_aqlprofile_info_type_t info_type, if (info_type == HSA_VEN_AMD_AQLPROFILE_INFO_PMC_DATA) { if (IsEventMatch(info_data->pmc_data.event, (*data_it)->event)) { uint32_t xcc_index = floor(passed_data->index / passed_data->single_xcc_buff_size); - (*data_it)->xcc_vals[xcc_index] += + (*data_it)->xcc_vals.at(xcc_index) += info_data->pmc_data.result; // stores event result from each xcc separately (*data_it)->val_double += info_data->pmc_data.result; // stores accumulated event result from all xccs diff --git a/src/core/hsa/hsa_support.cpp b/src/core/hsa/hsa_support.cpp index 62c20abd..ab676543 100644 --- a/src/core/hsa/hsa_support.cpp +++ b/src/core/hsa/hsa_support.cpp @@ -41,6 +41,7 @@ #include #include #include +#include #include "core/hardware/hsa_info.h" #include "src/core/session/tracer/src/correlation_id.h" @@ -68,15 +69,16 @@ hsa_status_t hsa_executable_iteration_callback(hsa_executable_t executable, hsa_ symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &name_length); // TODO(aelwazir): to be removed if the HSA fixed the issue of corrupted // names overflowing the length given + name_length = std::min(name_length, PATH_MAX); if (name_length > 1) { if (!(*static_cast(args))) { - char name[name_length + 1]; + auto name = std::vector(name_length + 1, '\0'); uint64_t kernel_object; hsasupport_singleton.GetCoreApiTable().hsa_executable_symbol_get_info_fn( - symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME, name); + symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME, name.data()); hsasupport_singleton.GetCoreApiTable().hsa_executable_symbol_get_info_fn( symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel_object); - std::string kernel_name = std::string(name).substr(0, name_length); + auto kernel_name = std::string{name.data()}.substr(0, name_length); rocprofiler::AddKernelName(kernel_object, kernel_name); } else { uint64_t kernel_object; @@ -751,8 +753,9 @@ void HSASupport_Singleton::SetHSALoaderApi() { const Agent::DeviceInfo& HSAAgentInfo::GetDeviceInfo() const { - if 
(type_ == HSA_DEVICE_TYPE_GPU) + if (type_ == HSA_DEVICE_TYPE_GPU) { return device_info_; + } assert("Attempting to read deviceInfo for a CPU agent"); } diff --git a/src/core/hsa/packets/packets_generator.cpp b/src/core/hsa/packets/packets_generator.cpp index 20cb11e1..85712153 100644 --- a/src/core/hsa/packets/packets_generator.cpp +++ b/src/core/hsa/packets/packets_generator.cpp @@ -82,7 +82,8 @@ static hsa_status_t FindGlobalPool(hsa_amd_memory_pool_t pool, void* data, bool if (nullptr == data) { return HSA_STATUS_ERROR_INVALID_ARGUMENT; } - rocprofiler::HSASupport_Singleton& hsasupport_singleton = rocprofiler::HSASupport_Singleton::GetInstance(); + rocprofiler::HSASupport_Singleton& hsasupport_singleton = + rocprofiler::HSASupport_Singleton::GetInstance(); err = hsasupport_singleton.GetAmdExtTable().hsa_amd_memory_pool_get_info_fn( pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment); ASSERTM(err != HSA_STATUS_ERROR, "hsa_amd_memory_pool_get_info"); @@ -115,10 +116,10 @@ hsa_status_t FindKernArgPool(hsa_amd_memory_pool_t pool, void* data) { } void InitializePools(hsa_agent_t cpu_agent, rocprofiler::HSAAgentInfo* agent_info) { - rocprofiler::HSASupport_Singleton& hsasupport_singleton = rocprofiler::HSASupport_Singleton::GetInstance(); - hsa_status_t status = - hsasupport_singleton.GetAmdExtTable().hsa_amd_agent_iterate_memory_pools_fn( - cpu_agent, FindStandardPool, &(agent_info->cpu_pool_)); + rocprofiler::HSASupport_Singleton& hsasupport_singleton = + rocprofiler::HSASupport_Singleton::GetInstance(); + hsa_status_t status = hsasupport_singleton.GetAmdExtTable().hsa_amd_agent_iterate_memory_pools_fn( + cpu_agent, FindStandardPool, &(agent_info->cpu_pool_)); CHECK_HSA_STATUS("Error: Command Buffer Pool is not initialized", status); status = hsasupport_singleton.GetAmdExtTable().hsa_amd_agent_iterate_memory_pools_fn( @@ -127,9 +128,10 @@ void InitializePools(hsa_agent_t cpu_agent, rocprofiler::HSAAgentInfo* agent_inf } void InitializeGPUPool(hsa_agent_t gpu_agent, rocprofiler::HSAAgentInfo* agent_info) { - rocprofiler::HSASupport_Singleton& hsasupport_singleton = rocprofiler::HSASupport_Singleton::GetInstance(); - hsa_status_t status = - hsasupport_singleton.GetAmdExtTable().hsa_amd_agent_iterate_memory_pools_fn(gpu_agent, FindStandardPool, &(agent_info->gpu_pool_)); + rocprofiler::HSASupport_Singleton& hsasupport_singleton = + rocprofiler::HSASupport_Singleton::GetInstance(); + hsa_status_t status = hsasupport_singleton.GetAmdExtTable().hsa_amd_agent_iterate_memory_pools_fn( + gpu_agent, FindStandardPool, &(agent_info->gpu_pool_)); CHECK_HSA_STATUS("hsa_amd_agent_iterate_memory_pools(gpu_pool)", status); } @@ -146,7 +148,8 @@ std::map metricsDict; void CheckPacketReqiurements() { - rocprofiler::HSASupport_Singleton& hsasupport_singleton = rocprofiler::HSASupport_Singleton::GetInstance(); + rocprofiler::HSASupport_Singleton& hsasupport_singleton = + rocprofiler::HSASupport_Singleton::GetInstance(); for (auto& gpu_agent : hsasupport_singleton.gpu_agents) { // get the instance of MetricsDict rocprofiler::HSAAgentInfo& agentInfo = hsasupport_singleton.GetHSAAgentInfo(gpu_agent.handle); @@ -162,8 +165,10 @@ InitializeAqlPackets(hsa_agent_t cpu_agent, hsa_agent_t gpu_agent, std::vector& counter_names, rocprofiler_session_id_t session_id, bool is_spm) { hsa_status_t status = HSA_STATUS_SUCCESS; - rocprofiler::ROCProfiler_Singleton& rocprofiler_singleton = rocprofiler::ROCProfiler_Singleton::GetInstance(); - rocprofiler::HSASupport_Singleton& hsasupport_singleton = 
rocprofiler::HSASupport_Singleton::GetInstance();
+  rocprofiler::ROCProfiler_Singleton& rocprofiler_singleton =
+      rocprofiler::ROCProfiler_Singleton::GetInstance();
+  rocprofiler::HSASupport_Singleton& hsasupport_singleton =
+      rocprofiler::HSASupport_Singleton::GetInstance();
   if (!counters_added.load(std::memory_order_acquire)) {
     for (auto& name : counter_names) {
       if (rocprofiler_singleton.HasActiveSession()) {
@@ -337,7 +342,7 @@
           << "Error: Command buffer given size is " << size << std::endl;
       abort();
     }
-    status =hsasupport_singleton.GetAmdExtTable().hsa_amd_memory_pool_allocate_fn(
+    status = hsasupport_singleton.GetAmdExtTable().hsa_amd_memory_pool_allocate_fn(
         agentInfo.cpu_pool_, size, 0, reinterpret_cast<void**>(&(profile->command_buffer.ptr)));
     if (status != HSA_STATUS_SUCCESS) {
       profile->command_buffer.ptr = malloc(size);
@@ -351,7 +356,7 @@
       }
     } else {
       // Both the CPU and GPU can access the memory
-      status =hsasupport_singleton.GetAmdExtTable().hsa_amd_agents_allow_access_fn(
+      status = hsasupport_singleton.GetAmdExtTable().hsa_amd_agents_allow_access_fn(
           ag_list_count, ag_list, NULL, profile->command_buffer.ptr);
       CHECK_HSA_STATUS("Error: Allowing access to Command Buffer", status);
     }
@@ -364,8 +369,9 @@
           << "Error: Output buffer given size is " << size << std::endl;
       abort();
     }
-    status =hsasupport_singleton.GetAmdExtTable().hsa_amd_memory_pool_allocate_fn(
-        agentInfo.kernarg_pool_, size, 0, reinterpret_cast<void**>(&profile->output_buffer.ptr));
+    status = hsasupport_singleton.GetAmdExtTable().hsa_amd_memory_pool_allocate_fn(
+        agentInfo.kernarg_pool_, size, 0,
+        reinterpret_cast<void**>(&profile->output_buffer.ptr));
     if (status != HSA_STATUS_SUCCESS) {
       profile->output_buffer.ptr = malloc(size);
       /*numa_alloc_onnode(
@@ -378,7 +384,7 @@
         abort();
       }
     } else {
-      status =hsasupport_singleton.GetAmdExtTable().hsa_amd_agents_allow_access_fn(
+      status = hsasupport_singleton.GetAmdExtTable().hsa_amd_agents_allow_access_fn(
           ag_list_count, ag_list, NULL, profile->output_buffer.ptr);
       CHECK_HSA_STATUS("Error: GPU Agent can't have output buffer access", status);
       memset(profile->output_buffer.ptr, 0x0, profile->output_buffer.size);
@@ -426,34 +432,33 @@ hsa_ven_amd_aqlprofile_profile_t* InitializeDeviceProfilingAqlPackets(
   // Preparing and getting the size of the command and output buffers
   status = hsa_ven_amd_aqlprofile_start(profile, NULL);
-  rocprofiler::HSASupport_Singleton& hsasupport_singleton = rocprofiler::HSASupport_Singleton::GetInstance();
+  rocprofiler::HSASupport_Singleton& hsasupport_singleton =
+      rocprofiler::HSASupport_Singleton::GetInstance();
   rocprofiler::HSAAgentInfo& agentInfo = hsasupport_singleton.GetHSAAgentInfo(gpu_agent.handle);
   size_t ag_list_count = 1;
   hsa_agent_t ag_list[ag_list_count];
   ag_list[0] = gpu_agent;
   // Allocating Command Buffer
-  //FixMe: Command buffer and output buffers are allocated repetatively.
+  // FixMe: Command buffer and output buffers are allocated repetitively.
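// A minimal sketch of the page-rounding idiom these allocators rely on,
// (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK, assuming a 4 KiB page; kPageSize
// and kPageMask below are illustrative stand-ins, not the project's actual
// MEM_PAGE_MASK definition.
#include <cstddef>

constexpr std::size_t kPageSize = 0x1000;         // assumed page size
constexpr std::size_t kPageMask = kPageSize - 1;  // low bits to clear

constexpr std::size_t RoundUpToPage(std::size_t n) {
  // Add the mask, then clear the low bits: 1 -> 4096, 4096 -> 4096, 4097 -> 8192.
  return (n + kPageMask) & ~kPageMask;
}

static_assert(RoundUpToPage(1) == 0x1000, "rounds up to one page");
static_assert(RoundUpToPage(0x1000) == 0x1000, "aligned sizes are unchanged");
static_assert(RoundUpToPage(0x1001) == 0x2000, "spills into the next page");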
status = HSA_STATUS_ERROR; size_t size = profile->command_buffer.size; profile->command_buffer.ptr = nullptr; if (size <= 0) return nullptr; size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; - status =hsasupport_singleton.GetAmdExtTable().hsa_amd_memory_pool_allocate_fn( + status = hsasupport_singleton.GetAmdExtTable().hsa_amd_memory_pool_allocate_fn( agentInfo.cpu_pool_, size, 0, reinterpret_cast(&(profile->command_buffer.ptr))); // Both the CPU and GPU can access the memory if (status == HSA_STATUS_SUCCESS) { - status =hsasupport_singleton.GetAmdExtTable().hsa_amd_agents_allow_access_fn( + status = hsasupport_singleton.GetAmdExtTable().hsa_amd_agents_allow_access_fn( ag_list_count, ag_list, NULL, profile->command_buffer.ptr); CHECK_HSA_STATUS("Error: GPU Agent can't have command buffer access", status); } else { hsa_agent_t near_cpu_node = agentInfo.GetNearCpuAgent(); uint32_t near_cpu_node_id = 0; - hsasupport_singleton.GetCoreApiTable().hsa_agent_get_info_fn(near_cpu_node, - HSA_AGENT_INFO_NODE, &near_cpu_node_id); - profile->command_buffer.ptr = numa_alloc_onnode( - profile->command_buffer.size, - near_cpu_node_id); + hsasupport_singleton.GetCoreApiTable().hsa_agent_get_info_fn(near_cpu_node, HSA_AGENT_INFO_NODE, + &near_cpu_node_id); + profile->command_buffer.ptr = numa_alloc_onnode(profile->command_buffer.size, near_cpu_node_id); if (profile->command_buffer.ptr != nullptr) { status = HSA_STATUS_SUCCESS; } else { @@ -466,12 +471,12 @@ hsa_ven_amd_aqlprofile_profile_t* InitializeDeviceProfilingAqlPackets( size = profile->output_buffer.size; profile->output_buffer.ptr = nullptr; size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; - status =hsasupport_singleton.GetAmdExtTable().hsa_amd_memory_pool_allocate_fn( + status = hsasupport_singleton.GetAmdExtTable().hsa_amd_memory_pool_allocate_fn( agentInfo.gpu_pool_, size, 0, reinterpret_cast(&(profile->output_buffer.ptr))); CHECK_HSA_STATUS("Error: Can't Allocate Output Buffer", status); // Both the CPU and GPU can access the kernel arguments if (status == HSA_STATUS_SUCCESS) { - status =hsasupport_singleton.GetAmdExtTable().hsa_amd_agents_allow_access_fn( + status = hsasupport_singleton.GetAmdExtTable().hsa_amd_agents_allow_access_fn( ag_list_count, ag_list, NULL, profile->output_buffer.ptr); CHECK_HSA_STATUS("Error: Can't allow access on the Output Buffer for the GPU", status); memset(profile->output_buffer.ptr, 0x0, profile->output_buffer.size); @@ -501,8 +506,9 @@ uint8_t* AllocateSysMemory(hsa_agent_t gpu_agent, size_t size, hsa_amd_memory_po hsa_status_t status = HSA_STATUS_ERROR; uint8_t* buffer = NULL; size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; - rocprofiler::HSASupport_Singleton& hsasupport_singleton = rocprofiler::HSASupport_Singleton::GetInstance(); - status =hsasupport_singleton.GetAmdExtTable().hsa_amd_memory_pool_allocate_fn( + rocprofiler::HSASupport_Singleton& hsasupport_singleton = + rocprofiler::HSASupport_Singleton::GetInstance(); + status = hsasupport_singleton.GetAmdExtTable().hsa_amd_memory_pool_allocate_fn( *cpu_pool, size, 0, reinterpret_cast(&buffer)); // Both the CPU and GPU can access the memory if (status == HSA_STATUS_SUCCESS) { @@ -516,16 +522,20 @@ uint8_t* AllocateSysMemory(hsa_agent_t gpu_agent, size_t size, hsa_amd_memory_po // Allocate memory for use by a kernel of specified size uint8_t* AllocateLocalMemory(size_t size, hsa_amd_memory_pool_t* gpu_pool) { hsa_status_t status = HSA_STATUS_ERROR; - rocprofiler::HSASupport_Singleton& hsasupport_singleton = rocprofiler::HSASupport_Singleton::GetInstance(); + 
rocprofiler::HSASupport_Singleton& hsasupport_singleton = + rocprofiler::HSASupport_Singleton::GetInstance(); uint8_t* buffer = NULL; size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; - status = hsasupport_singleton.GetAmdExtTable().hsa_amd_memory_pool_allocate_fn(*gpu_pool, size, 0, reinterpret_cast(&buffer)); + status = hsasupport_singleton.GetAmdExtTable().hsa_amd_memory_pool_allocate_fn( + *gpu_pool, size, 0, reinterpret_cast(&buffer)); uint8_t* ptr = (status == HSA_STATUS_SUCCESS) ? buffer : NULL; return ptr; } -hsa_status_t Allocate(hsa_agent_t gpu_agent, hsa_ven_amd_aqlprofile_profile_t* profile, size_t att_buffer_size) { - rocprofiler::HSAAgentInfo& agentInfo = rocprofiler::HSASupport_Singleton::GetInstance().GetHSAAgentInfo(gpu_agent.handle); +hsa_status_t Allocate(hsa_agent_t gpu_agent, hsa_ven_amd_aqlprofile_profile_t* profile, + size_t att_buffer_size) { + rocprofiler::HSAAgentInfo& agentInfo = + rocprofiler::HSASupport_Singleton::GetInstance().GetHSAAgentInfo(gpu_agent.handle); profile->command_buffer.ptr = AllocateSysMemory(gpu_agent, profile->command_buffer.size, &agentInfo.cpu_pool_); profile->output_buffer.size = att_buffer_size; @@ -538,11 +548,14 @@ hsa_status_t Allocate(hsa_agent_t gpu_agent, hsa_ven_amd_aqlprofile_profile_t* p bool AllocateMemoryPools(hsa_agent_t cpu_agent, hsa_agent_t gpu_agent, hsa_amd_memory_pool_t* cpu_pool, hsa_amd_memory_pool_t* gpu_pool) { - rocprofiler::HSASupport_Singleton& hsasupport_singleton = rocprofiler::HSASupport_Singleton::GetInstance(); - hsa_status_t status = hsasupport_singleton.GetAmdExtTable().hsa_amd_agent_iterate_memory_pools_fn(cpu_agent, FindStandardPool, cpu_pool); + rocprofiler::HSASupport_Singleton& hsasupport_singleton = + rocprofiler::HSASupport_Singleton::GetInstance(); + hsa_status_t status = hsasupport_singleton.GetAmdExtTable().hsa_amd_agent_iterate_memory_pools_fn( + cpu_agent, FindStandardPool, cpu_pool); CHECK_HSA_STATUS("hsa_amd_agent_iterate_memory_pools(cpu_pool)", status); - status = hsasupport_singleton.GetAmdExtTable().hsa_amd_agent_iterate_memory_pools_fn(gpu_agent, FindStandardPool, gpu_pool); + status = hsasupport_singleton.GetAmdExtTable().hsa_amd_agent_iterate_memory_pools_fn( + gpu_agent, FindStandardPool, gpu_pool); CHECK_HSA_STATUS("hsa_amd_agent_iterate_memory_pools(gpu_pool)", status); return true; diff --git a/src/core/hsa/queues/queue.cpp b/src/core/hsa/queues/queue.cpp index 4b8c2aa0..56fc00cf 100644 --- a/src/core/hsa/queues/queue.cpp +++ b/src/core/hsa/queues/queue.cpp @@ -114,7 +114,6 @@ std::string GetKernelNameUsingDispatchID(uint64_t given_id) { } - struct kernel_descriptor_t { uint8_t reserved0[16]; int64_t kernel_code_entry_byte_offset; @@ -126,7 +125,7 @@ struct kernel_descriptor_t { uint8_t reserved2[6]; }; // AMD Compute Program Resource Register Three. 
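// For context on the typedef change below: clang-tidy's modernize-use-using
// rewrites C-style typedefs as alias-declarations. The two spellings are
// equivalent for a plain integer alias, but only the alias form composes with
// templates. A small sketch; rsrc_word32_t and WordBuffer are hypothetical
// names used for illustration only.
#include <cstdint>
#include <vector>

using rsrc_word32_t = std::uint32_t;  // was: typedef uint32_t rsrc_word32_t;

template <typename T>
using WordBuffer = std::vector<T>;  // alias template; not expressible as a typedef

static WordBuffer<rsrc_word32_t> rsrc_words(4, 0U);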
-typedef uint32_t amd_compute_pgm_rsrc_three32_t; +using amd_compute_pgm_rsrc_three32_t = uint32_t; enum amd_compute_gfx9_pgm_rsrc_three_t { AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_THREE_ACCUM_OFFSET, 0, 5), AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_THREE_TG_SPLIT, 16, 1) @@ -158,17 +157,20 @@ enum amd_kernel_code_property_t { static const kernel_descriptor_t* GetKernelCode(uint64_t kernel_object) { const kernel_descriptor_t* kernel_code = NULL; - rocprofiler::HSASupport_Singleton& hsasupport_singleton = rocprofiler::HSASupport_Singleton::GetInstance(); - hsa_status_t status = hsasupport_singleton.GetHSALoaderApi().hsa_ven_amd_loader_query_host_address( - reinterpret_cast(kernel_object), reinterpret_cast(&kernel_code)); + rocprofiler::HSASupport_Singleton& hsasupport_singleton = + rocprofiler::HSASupport_Singleton::GetInstance(); + hsa_status_t status = + hsasupport_singleton.GetHSALoaderApi().hsa_ven_amd_loader_query_host_address( + reinterpret_cast(kernel_object), + reinterpret_cast(&kernel_code)); if (HSA_STATUS_SUCCESS != status) { kernel_code = reinterpret_cast(kernel_object); } return kernel_code; } -static uint32_t arch_vgpr_count(const std::string_view& name, const kernel_descriptor_t& kernel_code) { - +static uint32_t arch_vgpr_count(const std::string_view& name, + const kernel_descriptor_t& kernel_code) { std::string info_name(name.data(), name.size()); if (strcmp(name.data(), "gfx90a") == 0 || strncmp(name.data(), "gfx94", 5) == 0) return (AMD_HSA_BITS_GET(kernel_code.compute_pgm_rsrc3, @@ -184,8 +186,8 @@ static uint32_t arch_vgpr_count(const std::string_view& name, const kernel_descr ? 8 : 4); } -static uint32_t accum_vgpr_count(const std::string_view& name, const kernel_descriptor_t& kernel_code) { - +static uint32_t accum_vgpr_count(const std::string_view& name, + const kernel_descriptor_t& kernel_code) { std::string info_name(name.data(), name.size()); if (strcmp(info_name.c_str(), "gfx908") == 0) return arch_vgpr_count(name, kernel_code); if (strcmp(info_name.c_str(), "gfx90a") == 0 || strncmp(info_name.c_str(), "gfx94", 5) == 0) @@ -204,19 +206,19 @@ static uint32_t sgpr_count(const std::string_view& name, const kernel_descriptor // TODO(srnagara): Recheck the extraction of gfxip from gpu name const char* name_data = name.data(); const size_t gfxip_label_len = std::min(name.size() - 2, size_t{63}); - if (gfxip_label_len > 0 && strlen(name_data) >= gfxip_label_len) { - char gfxip[gfxip_label_len]; + if (gfxip_label_len > 0 && strnlen(name_data, gfxip_label_len + 1) >= gfxip_label_len) { + char gfxip[gfxip_label_len + 1]; memcpy(gfxip, name_data, gfxip_label_len); + gfxip[gfxip_label_len] = '\0'; // TODO(srnagara): Check if it is hardcoded - if (std::atoi(&gfxip[3]) >= 10) return 128; + if (std::stoi(&gfxip[3]) >= 10) return 128; return (AMD_HSA_BITS_GET(kernel_code.compute_pgm_rsrc1, AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WAVEFRONT_SGPR_COUNT) / 2 + 1) * 16; - } else { - return 0; } + return 0; } rocprofiler_kernel_properties_t set_kernel_properties(hsa_kernel_dispatch_packet_t packet, @@ -233,9 +235,11 @@ rocprofiler_kernel_properties_t set_kernel_properties(hsa_kernel_dispatch_packet kernel_properties_ptr.workgroup_size = (uint32_t)workgroup_size; kernel_properties_ptr.lds_size = packet.group_segment_size; kernel_properties_ptr.scratch_size = packet.private_segment_size; - HSAAgentInfo agent_info = HSASupport_Singleton::GetInstance().GetHSAAgentInfo(agent.handle); - kernel_properties_ptr.arch_vgpr_count = 
arch_vgpr_count(agent_info.GetDeviceInfo().getName(), *kernel_code); - kernel_properties_ptr.accum_vgpr_count = accum_vgpr_count(agent_info.GetDeviceInfo().getName(), *kernel_code); + HSAAgentInfo agent_info = HSASupport_Singleton::GetInstance().GetHSAAgentInfo(agent.handle); + kernel_properties_ptr.arch_vgpr_count = + arch_vgpr_count(agent_info.GetDeviceInfo().getName(), *kernel_code); + kernel_properties_ptr.accum_vgpr_count = + accum_vgpr_count(agent_info.GetDeviceInfo().getName(), *kernel_code); kernel_properties_ptr.sgpr_count = sgpr_count(agent_info.GetDeviceInfo().getName(), *kernel_code); kernel_properties_ptr.wave_size = AMD_HSA_BITS_GET(kernel_code->kernel_code_properties, @@ -249,7 +253,7 @@ rocprofiler_kernel_properties_t set_kernel_properties(hsa_kernel_dispatch_packet namespace queue { - hsa_status_t pmcCallback(hsa_ven_amd_aqlprofile_info_type_t info_type, +hsa_status_t pmcCallback(hsa_ven_amd_aqlprofile_info_type_t info_type, hsa_ven_amd_aqlprofile_info_data_t* info_data, void* data) { hsa_status_t status = HSA_STATUS_SUCCESS; pmc_callback_data_t* passed_data = reinterpret_cast(data); @@ -302,7 +306,8 @@ void AddRecordCounters(rocprofiler_record_profiler_t* record, const pending_sign rocprofiler_record_counter_value_t{value}}); } record->counters = counters; - rocprofiler::Session* session = rocprofiler::ROCProfiler_Singleton::GetInstance().GetSession(pending->session_id); + rocprofiler::Session* session = + rocprofiler::ROCProfiler_Singleton::GetInstance().GetSession(pending->session_id); void* initial_handle = const_cast(record->counters); if (session->FindBuffer(pending->buffer_id)) { Memory::GenericBuffer* buffer = session->GetBuffer(pending->buffer_id); @@ -365,10 +370,11 @@ void AddAttRecord(rocprofiler_record_att_tracer_t* record, hsa_agent_t gpu_agent bool AsyncSignalHandler(hsa_signal_value_t signal_value, void* data) { auto queue_info_session = static_cast(data); - rocprofiler::ROCProfiler_Singleton& rocprofiler_singleton = rocprofiler::ROCProfiler_Singleton::GetInstance(); - rocprofiler::HSASupport_Singleton& hsasupport_singleton = rocprofiler::HSASupport_Singleton::GetInstance(); - if (!queue_info_session || - !rocprofiler_singleton.GetSession(queue_info_session->session_id) || + rocprofiler::ROCProfiler_Singleton& rocprofiler_singleton = + rocprofiler::ROCProfiler_Singleton::GetInstance(); + rocprofiler::HSASupport_Singleton& hsasupport_singleton = + rocprofiler::HSASupport_Singleton::GetInstance(); + if (!queue_info_session || !rocprofiler_singleton.GetSession(queue_info_session->session_id) || !rocprofiler_singleton.GetSession(queue_info_session->session_id)->GetProfiler()) return true; rocprofiler::Session* session = rocprofiler_singleton.GetSession(queue_info_session->session_id); @@ -381,9 +387,10 @@ bool AsyncSignalHandler(hsa_signal_value_t signal_value, void* data) { for (auto it = pending_signals.begin(); it != pending_signals.end(); it = pending_signals.erase(it)) { auto& pending = *it; - if (hsasupport_singleton.GetCoreApiTable().hsa_signal_load_relaxed_fn(pending->new_signal)) return true; + if (hsasupport_singleton.GetCoreApiTable().hsa_signal_load_relaxed_fn(pending->new_signal)) + return true; hsa_amd_profiling_dispatch_time_t time; - hsasupport_singleton.GetAmdExtTable().hsa_amd_profiling_get_dispatch_time_fn( + hsasupport_singleton.GetAmdExtTable().hsa_amd_profiling_get_dispatch_time_fn( queue_info_session->agent, pending->original_signal, &time); uint32_t record_count = 1; bool is_individual_xcc_mode = false; @@ -429,7 +436,7 @@ bool 
AsyncSignalHandler(hsa_signal_value_t signal_value, void* data) { pending->context->metrics_list, time.end - time.start); AddRecordCounters(&record, pending); - }else { + } else { if (session->FindBuffer(pending->buffer_id)) { Memory::GenericBuffer* buffer = session->GetBuffer(pending->buffer_id); buffer->AddRecord(record); @@ -440,12 +447,12 @@ bool AsyncSignalHandler(hsa_signal_value_t signal_value, void* data) { // TODO(aelwazir): we need a better way of distributing events and free them // if (pending->profile->output_buffer.ptr) // numa_free(pending->profile->output_buffer.ptr, pending->profile->output_buffer.size); - hsa_status_t status =hsasupport_singleton.GetAmdExtTable().hsa_amd_memory_pool_free_fn( + hsa_status_t status = hsasupport_singleton.GetAmdExtTable().hsa_amd_memory_pool_free_fn( (pending->profile->output_buffer.ptr)); CHECK_HSA_STATUS("Error: Couldn't free output buffer memory", status); // if (pending->profile->command_buffer.ptr) // numa_free(pending->profile->command_buffer.ptr, pending->profile->command_buffer.size); - status =hsasupport_singleton.GetAmdExtTable().hsa_amd_memory_pool_free_fn( + status = hsasupport_singleton.GetAmdExtTable().hsa_amd_memory_pool_free_fn( (pending->profile->command_buffer.ptr)); CHECK_HSA_STATUS("Error: Couldn't free command buffer memory", status); delete pending->profile; @@ -455,9 +462,10 @@ bool AsyncSignalHandler(hsa_signal_value_t signal_value, void* data) { delete pending->context; } if (pending->new_signal.handle) - hsasupport_singleton.GetCoreApiTable().hsa_signal_destroy_fn(pending->new_signal); + hsasupport_singleton.GetCoreApiTable().hsa_signal_destroy_fn(pending->new_signal); if (queue_info_session->interrupt_signal.handle) - hsasupport_singleton.GetCoreApiTable().hsa_signal_destroy_fn(queue_info_session->interrupt_signal); + hsasupport_singleton.GetCoreApiTable().hsa_signal_destroy_fn( + queue_info_session->interrupt_signal); } } delete queue_info_session; @@ -466,12 +474,12 @@ bool AsyncSignalHandler(hsa_signal_value_t signal_value, void* data) { } bool AsyncSignalHandlerATT(hsa_signal_value_t /* signal */, void* data) { - auto queue_info_session = static_cast(data); - rocprofiler::ROCProfiler_Singleton& rocprofiler_singleton = rocprofiler::ROCProfiler_Singleton::GetInstance(); - rocprofiler::HSASupport_Singleton& hsasupport_singleton = rocprofiler::HSASupport_Singleton::GetInstance(); - if (!queue_info_session || - !rocprofiler_singleton.GetSession(queue_info_session->session_id) || + rocprofiler::ROCProfiler_Singleton& rocprofiler_singleton = + rocprofiler::ROCProfiler_Singleton::GetInstance(); + rocprofiler::HSASupport_Singleton& hsasupport_singleton = + rocprofiler::HSASupport_Singleton::GetInstance(); + if (!queue_info_session || !rocprofiler_singleton.GetSession(queue_info_session->session_id) || !rocprofiler_singleton.GetSession(queue_info_session->session_id)->GetAttTracer()) return true; rocprofiler::Session* session = rocprofiler_singleton.GetSession(queue_info_session->session_id); @@ -487,7 +495,8 @@ bool AsyncSignalHandlerATT(hsa_signal_value_t /* signal */, void* data) { it = pending_signals.erase(it)) { auto& pending = *it; std::lock_guard lock(session->GetSessionLock()); - if (hsasupport_singleton.GetCoreApiTable().hsa_signal_load_relaxed_fn(pending.new_signal)) return true; + if (hsasupport_singleton.GetCoreApiTable().hsa_signal_load_relaxed_fn(pending.new_signal)) + return true; rocprofiler_record_att_tracer_t record{}; record.kernel_id = rocprofiler_kernel_id_t{pending.kernel_descriptor}; record.gpu_id 
= rocprofiler_agent_id_t{(uint64_t)queue_info_session->gpu_index}; @@ -535,7 +544,7 @@ bool AsyncSignalHandlerATT(hsa_signal_value_t /* signal */, void* data) { void CreateBarrierPacket(const hsa_signal_t& packet_completion_signal, std::vector* transformed_packets) { - hsa_barrier_and_packet_t barrier{0}; + hsa_barrier_and_packet_t barrier{}; barrier.header = HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE; barrier.dep_signal[0] = packet_completion_signal; void* barrier_ptr = &barrier; @@ -549,20 +558,23 @@ void AddVendorSpecificPacket(const Packet::packet_t* packet, } void SignalAsyncHandler(const hsa_signal_t& signal, void* data) { - hsa_status_t status = HSASupport_Singleton::GetInstance().GetAmdExtTable().hsa_amd_signal_async_handler_fn( - signal, HSA_SIGNAL_CONDITION_EQ, 0, AsyncSignalHandler, data); + hsa_status_t status = + HSASupport_Singleton::GetInstance().GetAmdExtTable().hsa_amd_signal_async_handler_fn( + signal, HSA_SIGNAL_CONDITION_EQ, 0, AsyncSignalHandler, data); CHECK_HSA_STATUS("Error: hsa_amd_signal_async_handler failed", status); } void signalAsyncHandlerATT(const hsa_signal_t& signal, void* data) { - hsa_status_t status = HSASupport_Singleton::GetInstance().GetAmdExtTable().hsa_amd_signal_async_handler_fn( - signal, HSA_SIGNAL_CONDITION_EQ, 0, AsyncSignalHandlerATT, data); + hsa_status_t status = + HSASupport_Singleton::GetInstance().GetAmdExtTable().hsa_amd_signal_async_handler_fn( + signal, HSA_SIGNAL_CONDITION_EQ, 0, AsyncSignalHandlerATT, data); CHECK_HSA_STATUS("Error: hsa_amd_signal_async_handler for ATT failed", status); } void CreateSignal(uint32_t attribute, hsa_signal_t* signal) { hsa_status_t status = - HSASupport_Singleton::GetInstance().GetAmdExtTable().hsa_amd_signal_create_fn(1, 0, nullptr, attribute, signal); + HSASupport_Singleton::GetInstance().GetAmdExtTable().hsa_amd_signal_create_fn( + 1, 0, nullptr, attribute, signal); CHECK_HSA_STATUS("Error: hsa_amd_signal_create failed", status); } @@ -604,15 +616,16 @@ void ResetSessionID(rocprofiler_session_id_t id) { session_id = id; } void CheckNeededProfileConfigs() { rocprofiler_session_id_t internal_session_id; - // Getting Session ID - rocprofiler::ROCProfiler_Singleton& rocprofiler_singleton = rocprofiler::ROCProfiler_Singleton::GetInstance(); + // Getting Session ID + rocprofiler::ROCProfiler_Singleton& rocprofiler_singleton = + rocprofiler::ROCProfiler_Singleton::GetInstance(); internal_session_id = rocprofiler_singleton.GetCurrentSessionId(); if (session_id.handle == 0 || internal_session_id.handle != session_id.handle) { session_id = internal_session_id; // Getting Counters count from the Session - if (session_id.handle > 0 ) { + if (session_id.handle > 0) { session = rocprofiler_singleton.GetSession(session_id); if (session && session->FindFilterWithKind(ROCPROFILER_COUNTERS_COLLECTION)) { rocprofiler_filter_id_t filter_id = @@ -658,7 +671,8 @@ std::pair, bool> GetAllowedProfilesList(const void* packets, i std::vector can_profile_packet; bool b_can_profile_anypacket = false; can_profile_packet.reserve(pkt_count); - rocprofiler::HSASupport_Singleton& hsasupport_singleton = rocprofiler::HSASupport_Singleton::GetInstance(); + rocprofiler::HSASupport_Singleton& hsasupport_singleton = + rocprofiler::HSASupport_Singleton::GetInstance(); std::lock_guard lock(hsasupport_singleton.ksymbol_map_lock); assert(hsasupport_singleton.ksymbols); @@ -702,13 +716,9 @@ std::pair, bool> GetAllowedProfilesList(const void* packets, i return {can_profile_packet, b_can_profile_anypacket}; } -std::pair 
-ProcessATTParams( - Packet::packet_t& start_packet, - Packet::packet_t& stop_packet, - Queue& queue_info, - rocprofiler::HSAAgentInfo& agentInfo -) { +std::pair ProcessATTParams( + Packet::packet_t& start_packet, Packet::packet_t& stop_packet, Queue& queue_info, + rocprofiler::HSAAgentInfo& agentInfo) { std::vector att_params; int num_att_counters = 0; uint32_t att_buffer_size = DEFAULT_ATT_BUFFER_SIZE; @@ -762,8 +772,9 @@ ProcessATTParams( for (; num_att_counters < 16; num_att_counters++) att_params.push_back(zero_perf); } // Get the PM4 Packets using packets_generator - return {Packet::GenerateATTPackets(queue_info.GetCPUAgent(), queue_info.GetGPUAgent(), - att_params, &start_packet, &stop_packet, att_buffer_size), capture_mode}; + return {Packet::GenerateATTPackets(queue_info.GetCPUAgent(), queue_info.GetGPUAgent(), att_params, + &start_packet, &stop_packet, att_buffer_size), + capture_mode}; } /** @@ -773,9 +784,8 @@ ProcessATTParams( * pointer to the packet. This packet is written into the queue by this * interceptor by invoking the writer function. */ -void Queue::WriteInterceptor(const void* packets, uint64_t pkt_count, uint64_t user_pkt_index, void* data, - hsa_amd_queue_intercept_packet_writer writer) { - +void Queue::WriteInterceptor(const void* packets, uint64_t pkt_count, uint64_t user_pkt_index, + void* data, hsa_amd_queue_intercept_packet_writer writer) { static const char* env_MAX_ATT_PROFILES = getenv("ROCPROFILER_MAX_ATT_PROFILES"); static int MAX_ATT_PROFILES = env_MAX_ATT_PROFILES ? atoi(env_MAX_ATT_PROFILES) : 1; @@ -871,7 +881,7 @@ void Queue::WriteInterceptor(const void* packets, uint64_t pkt_count, uint64_t u // Make a copy of the original packet, adding its signal to a barrier // packet and create a new signal for it to get timestamps if (original_packet.completion_signal.handle) { - hsa_barrier_and_packet_t barrier{0}; + hsa_barrier_and_packet_t barrier{}; barrier.header = HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE; Packet::packet_t* __attribute__((__may_alias__)) pkt = (reinterpret_cast(&barrier)); @@ -897,26 +907,26 @@ void Queue::WriteInterceptor(const void* packets, uint64_t pkt_count, uint64_t u // Added Interrupt Signal with barrier and provided handler for it CreateBarrierPacket(interrupt_signal, &transformed_packets); } else { - hsa_barrier_and_packet_t barrier{0}; + hsa_barrier_and_packet_t barrier{}; barrier.header = HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE; barrier.completion_signal = interrupt_signal; Packet::packet_t* __attribute__((__may_alias__)) pkt = (reinterpret_cast(&barrier)); transformed_packets.emplace_back(*pkt); } - rocprofiler::HSAAgentInfo& agentInfo = - rocprofiler::HSASupport_Singleton::GetInstance().GetHSAAgentInfo(queue_info.GetGPUAgent().handle); + rocprofiler::HSAAgentInfo& agentInfo = + rocprofiler::HSASupport_Singleton::GetInstance().GetHSAAgentInfo( + queue_info.GetGPUAgent().handle); // Creating Async Handler to be called every time the interrupt signal is // marked complete - SignalAsyncHandler( - interrupt_signal, - new queue_info_session_t{queue_info.GetGPUAgent(), session_id_snapshot, queue_info.GetQueueID(), - writer_id, interrupt_signal, agentInfo.GetDeviceInfo().getGPUId(), - agentInfo.GetDeviceInfo().getXccCount()}); + SignalAsyncHandler(interrupt_signal, + new queue_info_session_t{ + queue_info.GetGPUAgent(), session_id_snapshot, queue_info.GetQueueID(), + writer_id, interrupt_signal, agentInfo.GetDeviceInfo().getGPUId(), + agentInfo.GetDeviceInfo().getXccCount()}); 
ACTIVE_INTERRUPT_SIGNAL_COUNT.fetch_add(1, std::memory_order_relaxed); // profile_id++; // } while (replay_mode_count > 0 && profile_id < replay_mode_count); // Profiles loop end - } /* Write the transformed packets to the hardware queue. */ @@ -927,7 +937,9 @@ void Queue::WriteInterceptor(const void* packets, uint64_t pkt_count, uint64_t u // Getting Queue Data and Information auto& queue_info = *static_cast(data); std::lock_guard lk(queue_info.qw_mutex); - rocprofiler::HSAAgentInfo& agentInfo = rocprofiler::HSASupport_Singleton::GetInstance().GetHSAAgentInfo(queue_info.GetGPUAgent().handle); + rocprofiler::HSAAgentInfo& agentInfo = + rocprofiler::HSASupport_Singleton::GetInstance().GetHSAAgentInfo( + queue_info.GetGPUAgent().handle); bool can_profile_anypacket = false; std::vector can_profile_packet; @@ -947,11 +959,8 @@ void Queue::WriteInterceptor(const void* packets, uint64_t pkt_count, uint64_t u rocprofiler_codeobj_capture_mode_t capture_mode = ROCPROFILER_CAPTURE_SYMBOLS_ONLY; if (att_parameters_data.size() > 0) { - std::tie(profile, capture_mode) = ProcessATTParams(start_packet, - stop_packet, - queue_info, - agentInfo - ); + std::tie(profile, capture_mode) = + ProcessATTParams(start_packet, stop_packet, queue_info, agentInfo); } // Searching across all the packets given during this write @@ -974,7 +983,7 @@ void Queue::WriteInterceptor(const void* packets, uint64_t pkt_count, uint64_t u KernelInterceptCount += 1; writer_id = WRITER_ID.fetch_add(1, std::memory_order_release); - if (att_parameters_data.size() > 0 && profile) { + if (!att_parameters_data.empty() && profile) { // Adding start packet and its barrier with a dummy signal hsa_signal_t dummy_signal{}; dummy_signal.handle = 0; @@ -1001,14 +1010,14 @@ void Queue::WriteInterceptor(const void* packets, uint64_t pkt_count, uint64_t u kernel_properties, (uint32_t)syscall(__NR_gettid), user_pkt_index); uint64_t off = dispatch_packet.kernel_object + - GetKernelCode(dispatch_packet.kernel_object)->kernel_code_entry_byte_offset; + GetKernelCode(dispatch_packet.kernel_object)->kernel_code_entry_byte_offset; codeobj_record::make_capture(rocprofiler_record_id_t{record_id}, capture_mode, off); codeobj_record::start_capture(rocprofiler_record_id_t{record_id}); codeobj_record::stop_capture(rocprofiler_record_id_t{record_id}); // Make a copy of the original packet, adding its signal to a barrier packet - if (original_packet.completion_signal.handle) { - hsa_barrier_and_packet_t barrier{0}; + if (original_packet.completion_signal.handle != 0U) { + hsa_barrier_and_packet_t barrier{}; barrier.header = HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE; Packet::packet_t* __attribute__((__may_alias__)) pkt = (reinterpret_cast(&barrier)); @@ -1028,7 +1037,7 @@ void Queue::WriteInterceptor(const void* packets, uint64_t pkt_count, uint64_t u // Added Interrupt Signal with barrier and provided handler for it CreateBarrierPacket(interrupt_signal, &transformed_packets); } else { - hsa_barrier_and_packet_t barrier{0}; + hsa_barrier_and_packet_t barrier{}; barrier.header = HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE; barrier.completion_signal = interrupt_signal; Packet::packet_t* __attribute__((__may_alias__)) pkt = @@ -1050,13 +1059,12 @@ void Queue::WriteInterceptor(const void* packets, uint64_t pkt_count, uint64_t u /* Write the original packets to the hardware queue if no profiling session * is active */ writer(packets, pkt_count); - } } Queue::Queue(const hsa_agent_t cpu_agent, const hsa_agent_t gpu_agent, hsa_queue_t* queue) - : 
cpu_agent_(cpu_agent), gpu_agent_(gpu_agent), intercept_queue_(queue) { } + : cpu_agent_(cpu_agent), gpu_agent_(gpu_agent), intercept_queue_(queue) {} Queue::~Queue() { while (ACTIVE_INTERRUPT_SIGNAL_COUNT.load(std::memory_order_acquire) > 0) { @@ -1071,8 +1079,7 @@ hsa_agent_t Queue::GetCPUAgent() { return cpu_agent_; } uint64_t Queue::GetQueueID() { return intercept_queue_->id; } -void CheckPacketReqiurements() { - Packet::CheckPacketReqiurements();} +void CheckPacketReqiurements() { Packet::CheckPacketReqiurements(); } } // namespace queue } // namespace rocprofiler diff --git a/src/core/profile.h b/src/core/profile.h index d91e265c..8c9c7561 100644 --- a/src/core/profile.h +++ b/src/core/profile.h @@ -198,7 +198,6 @@ class Profile { status = api->hsa_ven_amd_aqlprofile_stop(&profile_, &stop); if (status != HSA_STATUS_SUCCESS) AQL_EXC_RAISING(status, "aqlprofile_stop"); hsa_status_t rd_status = HSA_STATUS_ERROR; -#ifdef AQLPROF_NEW_API if (profile_.type == HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_PMC) { rd_status = api->hsa_ven_amd_aqlprofile_read(&profile_, &read); if (is_concurrent) { // concurrent: one more read @@ -208,7 +207,6 @@ class Profile { } #if 0 // Read API returns error if disabled if (rd_status != HSA_STATUS_SUCCESS) AQL_EXC_RAISING(status, "aqlprofile_read"); -#endif #endif // Set completion signal of start diff --git a/src/core/session/tracer/src/roctracer.cpp b/src/core/session/tracer/src/roctracer.cpp index 363108de..fd19bdbf 100644 --- a/src/core/session/tracer/src/roctracer.cpp +++ b/src/core/session/tracer/src/roctracer.cpp @@ -833,9 +833,7 @@ static std::string getKernelNameMultiKernelMultiDevice(hipLaunchParams* launchPa return name_str.str(); } -template struct Overloaded : Ts... { - using Ts::operator()...; -}; +template struct Overloaded : Ts... { using Ts::operator()...; }; template Overloaded(Ts...) 
-> Overloaded; std::optional GetHipKernelName(uint32_t cid, hip_api_data_t* data) { diff --git a/src/tools/CMakeLists.txt b/src/tools/CMakeLists.txt index 4cfa1e89..e17664c5 100644 --- a/src/tools/CMakeLists.txt +++ b/src/tools/CMakeLists.txt @@ -28,38 +28,17 @@ target_compile_definitions( PUBLIC AMD_INTERNAL_BUILD PRIVATE HIP_PROF_HIP_API_STRING=1 __HIP_PLATFORM_AMD__=1) -if(ASAN) - target_compile_options(rocprofiler_tool PRIVATE -fsanitize=address) - target_link_libraries( - rocprofiler_tool - rocprofiler-v2 - hsa-runtime64::hsa-runtime64 - Threads::Threads - atomic - asan - dl - rt - stdc++fs - amd_comgr) - target_link_options( - rocprofiler_tool PRIVATE - -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/exportmap - -Wl,--no-undefined,-fsanitize=address) -else() - target_link_libraries( - rocprofiler_tool - rocprofiler-v2 - hsa-runtime64::hsa-runtime64 - Threads::Threads - atomic - dl - rt - stdc++fs - amd_comgr) - target_link_options( - rocprofiler_tool PRIVATE - -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/exportmap -Wl,--no-undefined) -endif() +target_link_libraries(rocprofiler_tool + PRIVATE $) + +target_link_libraries( + rocprofiler_tool + PUBLIC rocprofiler-v2 hsa-runtime64::hsa-runtime64 Threads::Threads atomic dl rt + stdc++fs amd_comgr + PRIVATE rocprofiler::memcheck) +target_link_options( + rocprofiler_tool PRIVATE -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/exportmap + -Wl,--no-undefined) install(TARGETS rocprofiler_tool LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}/rocprofiler COMPONENT runtime) diff --git a/src/tools/rocsys/CMakeLists.txt b/src/tools/rocsys/CMakeLists.txt index 3b809211..d6d10c7a 100644 --- a/src/tools/rocsys/CMakeLists.txt +++ b/src/tools/rocsys/CMakeLists.txt @@ -9,10 +9,7 @@ file(GLOB ROCPROFILER_ROCSYS_SRC_FILES ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp) # Compiling/Installing ROCProfiler API add_executable(rocprofiler_rocsys_fe ${ROCPROFILER_ROCSYS_SRC_FILES}) -set_target_properties( - rocprofiler_rocsys_fe - PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_OUTPUT_DIRECTORY} OUTPUT_NAME - "rocsys") +set_target_properties(rocprofiler_rocsys_fe PROPERTIES OUTPUT_NAME "rocsys") target_include_directories( rocprofiler_rocsys_fe PRIVATE ${PROJECT_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR} diff --git a/src/util/hsa_rsrc_factory.cpp b/src/util/hsa_rsrc_factory.cpp index cbc63b7f..156d5252 100644 --- a/src/util/hsa_rsrc_factory.cpp +++ b/src/util/hsa_rsrc_factory.cpp @@ -53,8 +53,8 @@ namespace util { static const char* cpp_demangle(const char* symname) { size_t size = 0; int status; - const char* ret = abi::__cxa_demangle(symname, NULL, &size, &status); - return (ret != 0) ? ret : strdup(symname); + const char* ret = abi::__cxa_demangle(symname, nullptr, &size, &status); + return (ret != nullptr) ? 
ret : strdup(symname); } // Callback function to get available in the system agents @@ -62,7 +62,7 @@ hsa_status_t HsaRsrcFactory::GetHsaAgentsCallback(hsa_agent_t agent, void* data) hsa_status_t status = HSA_STATUS_ERROR; HsaRsrcFactory* hsa_rsrc = reinterpret_cast(data); const AgentInfo* agent_info = hsa_rsrc->AddAgentInfo(agent); - if (agent_info != NULL) status = HSA_STATUS_SUCCESS; + if (agent_info != nullptr) status = HSA_STATUS_SUCCESS; return status; } @@ -123,10 +123,10 @@ hsa_status_t FindKernArgPool(hsa_amd_memory_pool_t pool, void* data) { HsaRsrcFactory::HsaRsrcFactory(bool initialize_hsa) : initialize_hsa_(initialize_hsa) { hsa_status_t status; - cpu_pool_ = NULL; - kern_arg_pool_ = NULL; + cpu_pool_ = nullptr; + kern_arg_pool_ = nullptr; - InitHsaApiTable(NULL); + InitHsaApiTable(nullptr); // Initialize the Hsa Runtime if (initialize_hsa_) { @@ -137,11 +137,12 @@ HsaRsrcFactory::HsaRsrcFactory(bool initialize_hsa) : initialize_hsa_(initialize // Discover the set of Gpu devices available on the platform status = hsa_api_.hsa_iterate_agents(GetHsaAgentsCallback, this); CHECK_STATUS("Error Calling hsa_iterate_agents", status); - if (cpu_pool_ == NULL) CHECK_STATUS("CPU memory pool is not found", HSA_STATUS_ERROR); - if (kern_arg_pool_ == NULL) CHECK_STATUS("Kern-arg memory pool is not found", HSA_STATUS_ERROR); + if (cpu_pool_ == nullptr) CHECK_STATUS("CPU memory pool is not found", HSA_STATUS_ERROR); + if (kern_arg_pool_ == nullptr) + CHECK_STATUS("Kern-arg memory pool is not found", HSA_STATUS_ERROR); // Get AqlProfile API table - aqlprofile_api_ = {0}; + aqlprofile_api_ = {}; #ifdef ROCP_LD_AQLPROFILE status = LoadAqlProfileLib(&aqlprofile_api_); #else @@ -152,7 +153,7 @@ HsaRsrcFactory::HsaRsrcFactory(bool initialize_hsa) : initialize_hsa_(initialize CHECK_STATUS("aqlprofile API table load failed", status); // Get Loader API table - loader_api_ = {0}; + loader_api_ = {}; status = hsa_api_.hsa_system_get_major_extension_table(HSA_EXTENSION_AMD_LOADER, 1, sizeof(loader_api_), &loader_api_); CHECK_STATUS("loader API table query failed", status); @@ -160,7 +161,7 @@ HsaRsrcFactory::HsaRsrcFactory(bool initialize_hsa) : initialize_hsa_(initialize // Instantiate HSA timer timer_ = new HsaTimer(&hsa_api_); CHECK_STATUS("HSA timer allocation failed", - (timer_ == NULL) ? HSA_STATUS_ERROR : HSA_STATUS_SUCCESS); + (timer_ == nullptr) ? 
HSA_STATUS_ERROR : HSA_STATUS_SUCCESS); // Time correlation const uint32_t corr_iters = 1000; @@ -179,8 +180,8 @@ HsaRsrcFactory::HsaRsrcFactory(bool initialize_hsa) : initialize_hsa_(initialize // Destructor of the class HsaRsrcFactory::~HsaRsrcFactory() { delete timer_; - for (auto p : cpu_list_) delete p; - for (auto p : gpu_list_) delete p; + for (const auto* p : cpu_list_) delete p; + for (const auto* p : gpu_list_) delete p; if (initialize_hsa_) { hsa_status_t status = hsa_api_.hsa_shut_down(); CHECK_STATUS("Error in hsa_shut_down", status); @@ -190,8 +191,8 @@ HsaRsrcFactory::~HsaRsrcFactory() { void HsaRsrcFactory::InitHsaApiTable(HsaApiTable* table) { std::lock_guard lck(mutex_); - if (hsa_api_.hsa_init == NULL) { - if (table != NULL) { + if (hsa_api_.hsa_init == nullptr) { + if (table != nullptr) { hsa_api_.hsa_init = table->core_->hsa_init_fn; hsa_api_.hsa_shut_down = table->core_->hsa_shut_down_fn; hsa_api_.hsa_agent_get_info = table->core_->hsa_agent_get_info_fn; @@ -289,7 +290,7 @@ void HsaRsrcFactory::InitHsaApiTable(HsaApiTable* table) { hsa_status_t HsaRsrcFactory::LoadAqlProfileLib(aqlprofile_pfn_t* api) { void* handle = dlopen(kAqlProfileLib, RTLD_NOW); - if (handle == NULL) { + if (handle == nullptr) { fprintf(stderr, "Loading '%s' failed, %s\n", kAqlProfileLib, dlerror()); return HSA_STATUS_ERROR; } @@ -305,10 +306,8 @@ hsa_status_t HsaRsrcFactory::LoadAqlProfileLib(aqlprofile_pfn_t* api) { (decltype(::hsa_ven_amd_aqlprofile_start)*)dlsym(handle, "hsa_ven_amd_aqlprofile_start"); api->hsa_ven_amd_aqlprofile_stop = (decltype(::hsa_ven_amd_aqlprofile_stop)*)dlsym(handle, "hsa_ven_amd_aqlprofile_stop"); -#ifdef AQLPROF_NEW_API api->hsa_ven_amd_aqlprofile_read = (decltype(::hsa_ven_amd_aqlprofile_read)*)dlsym(handle, "hsa_ven_amd_aqlprofile_read"); -#endif api->hsa_ven_amd_aqlprofile_legacy_get_pm4 = (decltype(::hsa_ven_amd_aqlprofile_legacy_get_pm4)*)dlsym( handle, "hsa_ven_amd_aqlprofile_legacy_get_pm4"); @@ -325,7 +324,7 @@ hsa_status_t HsaRsrcFactory::LoadAqlProfileLib(aqlprofile_pfn_t* api) { const AgentInfo* HsaRsrcFactory::AddAgentInfo(const hsa_agent_t agent) { // Determine if device is a Gpu agent hsa_status_t status; - AgentInfo* agent_info = NULL; + AgentInfo* agent_info = nullptr; hsa_device_type_t type; status = hsa_api_.hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &type); @@ -339,10 +338,11 @@ const AgentInfo* HsaRsrcFactory::AddAgentInfo(const hsa_agent_t agent) { status = hsa_api_.hsa_amd_agent_iterate_memory_pools(agent, FindStandardPool, &agent_info->cpu_pool); - if ((status == HSA_STATUS_INFO_BREAK) && (cpu_pool_ == NULL)) cpu_pool_ = &agent_info->cpu_pool; + if ((status == HSA_STATUS_INFO_BREAK) && (cpu_pool_ == nullptr)) + cpu_pool_ = &agent_info->cpu_pool; status = hsa_api_.hsa_amd_agent_iterate_memory_pools(agent, FindKernArgPool, &agent_info->kern_arg_pool); - if ((status == HSA_STATUS_INFO_BREAK) && (kern_arg_pool_ == NULL)) + if ((status == HSA_STATUS_INFO_BREAK) && (kern_arg_pool_ == nullptr)) kern_arg_pool_ = &agent_info->kern_arg_pool; agent_info->gpu_pool = {}; @@ -362,7 +362,7 @@ const AgentInfo* HsaRsrcFactory::AddAgentInfo(const hsa_agent_t agent) { hsa_api_.hsa_agent_get_info(agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &agent_info->max_wave_size); hsa_api_.hsa_agent_get_info(agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &agent_info->max_queue_size); hsa_api_.hsa_agent_get_info(agent, HSA_AGENT_INFO_PROFILE, &agent_info->profile); - agent_info->is_apu = (agent_info->profile == HSA_PROFILE_FULL) ? 
true : false; + agent_info->is_apu = agent_info->profile == HSA_PROFILE_FULL; hsa_api_.hsa_agent_get_info( agent, static_cast(HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT), &agent_info->cu_num); @@ -407,7 +407,7 @@ const AgentInfo* HsaRsrcFactory::AddAgentInfo(const hsa_agent_t agent) { // Return systen agent info const AgentInfo* HsaRsrcFactory::GetAgentInfo(const hsa_agent_t agent) { - const AgentInfo* agent_info = NULL; + const AgentInfo* agent_info = nullptr; auto it = agent_map_.find(agent.handle); if (it != agent_map_.end()) { agent_info = it->second; @@ -482,8 +482,8 @@ bool HsaRsrcFactory::GetCpuAgentInfo(uint32_t idx, const AgentInfo** agent_info) bool HsaRsrcFactory::CreateQueue(const AgentInfo* agent_info, uint32_t num_pkts, hsa_queue_t** queue) { hsa_status_t status; - status = hsa_api_.hsa_queue_create(agent_info->dev_id, num_pkts, HSA_QUEUE_TYPE_MULTI, NULL, NULL, - UINT32_MAX, UINT32_MAX, queue); + status = hsa_api_.hsa_queue_create(agent_info->dev_id, num_pkts, HSA_QUEUE_TYPE_MULTI, nullptr, + nullptr, UINT32_MAX, UINT32_MAX, queue); return (status == HSA_STATUS_SUCCESS); } @@ -493,7 +493,7 @@ bool HsaRsrcFactory::CreateQueue(const AgentInfo* agent_info, uint32_t num_pkts, // @return bool true if successful, false otherwise bool HsaRsrcFactory::CreateSignal(uint32_t value, hsa_signal_t* signal) { hsa_status_t status; - status = hsa_api_.hsa_signal_create(value, 0, NULL, signal); + status = hsa_api_.hsa_signal_create(value, 0, nullptr, signal); return (status == HSA_STATUS_SUCCESS); } @@ -504,11 +504,11 @@ bool HsaRsrcFactory::CreateSignal(uint32_t value, hsa_signal_t* signal) { // @return uint8_t* Pointer to buffer, null if allocation fails. uint8_t* HsaRsrcFactory::AllocateLocalMemory(const AgentInfo* agent_info, size_t size) { hsa_status_t status = HSA_STATUS_ERROR; - uint8_t* buffer = NULL; + uint8_t* buffer = nullptr; size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; status = hsa_api_.hsa_amd_memory_pool_allocate(agent_info->gpu_pool, size, 0, reinterpret_cast(&buffer)); - uint8_t* ptr = (status == HSA_STATUS_SUCCESS) ? buffer : NULL; + uint8_t* ptr = (status == HSA_STATUS_SUCCESS) ? buffer : nullptr; return ptr; } @@ -519,7 +519,7 @@ uint8_t* HsaRsrcFactory::AllocateLocalMemory(const AgentInfo* agent_info, size_t // @return uint8_t* Pointer to buffer, null if allocation fails. uint8_t* HsaRsrcFactory::AllocateKernArgMemory(const AgentInfo* agent_info, size_t size) { hsa_status_t status = HSA_STATUS_ERROR; - uint8_t* buffer = NULL; + uint8_t* buffer = nullptr; if (!cpu_agents_.empty()) { size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; status = hsa_api_.hsa_amd_memory_pool_allocate(*kern_arg_pool_, size, 0, @@ -527,10 +527,10 @@ uint8_t* HsaRsrcFactory::AllocateKernArgMemory(const AgentInfo* agent_info, size // Both the CPU and GPU can access the kernel arguments if (status == HSA_STATUS_SUCCESS) { hsa_agent_t ag_list[1] = {agent_info->dev_id}; - status = hsa_api_.hsa_amd_agents_allow_access(1, ag_list, NULL, buffer); + status = hsa_api_.hsa_amd_agents_allow_access(1, ag_list, nullptr, buffer); } } - uint8_t* ptr = (status == HSA_STATUS_SUCCESS) ? buffer : NULL; + uint8_t* ptr = (status == HSA_STATUS_SUCCESS) ? buffer : nullptr; return ptr; } @@ -540,7 +540,7 @@ uint8_t* HsaRsrcFactory::AllocateKernArgMemory(const AgentInfo* agent_info, size // @return uint8_t* Pointer to buffer, null if allocation fails. 
uint8_t* HsaRsrcFactory::AllocateSysMemory(const AgentInfo* agent_info, size_t size) { hsa_status_t status = HSA_STATUS_ERROR; - uint8_t* buffer = NULL; + uint8_t* buffer = nullptr; size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; if (!cpu_agents_.empty()) { status = hsa_api_.hsa_amd_memory_pool_allocate(*cpu_pool_, size, 0, @@ -548,10 +548,10 @@ uint8_t* HsaRsrcFactory::AllocateSysMemory(const AgentInfo* agent_info, size_t s // Both the CPU and GPU can access the memory if (status == HSA_STATUS_SUCCESS) { hsa_agent_t ag_list[1] = {agent_info->dev_id}; - status = hsa_api_.hsa_amd_agents_allow_access(1, ag_list, NULL, buffer); + status = hsa_api_.hsa_amd_agents_allow_access(1, ag_list, nullptr, buffer); } } - uint8_t* ptr = (status == HSA_STATUS_SUCCESS) ? buffer : NULL; + uint8_t* ptr = (status == HSA_STATUS_SUCCESS) ? buffer : nullptr; return ptr; } @@ -562,8 +562,8 @@ uint8_t* HsaRsrcFactory::AllocateSysMemory(const AgentInfo* agent_info, size_t s uint8_t* HsaRsrcFactory::AllocateCmdMemory(const AgentInfo* agent_info, size_t size) { size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; uint8_t* ptr = (agent_info->is_apu && CMD_MEMORY_MMAP) - ? reinterpret_cast( - mmap(NULL, size, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_SHARED | MAP_ANONYMOUS, 0, 0)) + ? reinterpret_cast(mmap(nullptr, size, PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_SHARED | MAP_ANONYMOUS, 0, 0)) : AllocateSysMemory(agent_info, size); return ptr; } @@ -573,7 +573,7 @@ hsa_signal_value_t HsaRsrcFactory::SignalWait(const hsa_signal_t& signal, const hsa_signal_value_t& signal_value) const { const hsa_signal_value_t exp_value = signal_value - 1; hsa_signal_value_t ret_value = signal_value; - while (1) { + while (true) { ret_value = hsa_api_.hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_LT, signal_value, timeout_, HSA_WAIT_STATE_BLOCKED); if (ret_value == exp_value) break; @@ -599,9 +599,10 @@ bool HsaRsrcFactory::Memcpy(const hsa_agent_t& agent, void* dst, const void* src hsa_status_t status = HSA_STATUS_ERROR; if (!cpu_agents_.empty()) { hsa_signal_t s = {}; - status = hsa_api_.hsa_signal_create(1, 0, NULL, &s); + status = hsa_api_.hsa_signal_create(1, 0, nullptr, &s); CHECK_STATUS("hsa_signal_create()", status); - status = hsa_api_.hsa_amd_memory_async_copy(dst, cpu_agents_[0], src, agent, size, 0, NULL, s); + status = + hsa_api_.hsa_amd_memory_async_copy(dst, cpu_agents_[0], src, agent, size, 0, nullptr, s); CHECK_STATUS("hsa_amd_memory_async_copy()", status); SignalWait(s, 1); status = hsa_api_.hsa_signal_destroy(s); @@ -654,12 +655,12 @@ bool HsaRsrcFactory::LoadAndFinalize(const AgentInfo* agent_info, const char* br // Create executable. status = hsa_api_.hsa_executable_create_alt( - HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, NULL, executable); + HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, nullptr, executable); CHECK_STATUS("Error in creating executable object", status); // Load code object. status = hsa_api_.hsa_executable_load_agent_code_object(*executable, agent_info->dev_id, - code_obj_rdr, NULL, NULL); + code_obj_rdr, nullptr, nullptr); CHECK_STATUS("Error in loading executable object", status); // Freeze executable. @@ -668,8 +669,8 @@ bool HsaRsrcFactory::LoadAndFinalize(const AgentInfo* agent_info, const char* br // Get symbol handle. 
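The Memcpy() shown above is the canonical HSA copy-then-wait idiom: create a one-shot signal, issue hsa_amd_memory_async_copy with it as the completion signal, and block until it drops to zero. A stripped-down sketch under the same assumptions (error handling elided; the dst/src/agent names are illustrative):

#include <cstddef>
#include <cstdint>
#include <hsa/hsa.h>
#include <hsa/hsa_ext_amd.h>

bool CopyWithSignal(void* dst, hsa_agent_t dst_agent, const void* src, hsa_agent_t src_agent,
                    size_t size) {
  hsa_signal_t done{};
  if (hsa_signal_create(1, 0, nullptr, &done) != HSA_STATUS_SUCCESS) return false;
  // The runtime decrements the completion signal to 0 when the copy finishes.
  hsa_status_t status =
      hsa_amd_memory_async_copy(dst, dst_agent, src, src_agent, size, 0, nullptr, done);
  if (status == HSA_STATUS_SUCCESS) {
    while (hsa_signal_wait_scacquire(done, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX,
                                     HSA_WAIT_STATE_BLOCKED) != 0) {
    }
  }
  hsa_signal_destroy(done);
  return status == HSA_STATUS_SUCCESS;
}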
hsa_executable_symbol_t kernelSymbol; - status = hsa_api_.hsa_executable_get_symbol(*executable, NULL, kernel_name, agent_info->dev_id, 0, - &kernelSymbol); + status = hsa_api_.hsa_executable_get_symbol(*executable, nullptr, kernel_name, agent_info->dev_id, + 0, &kernelSymbol); CHECK_STATUS("Error in looking up kernel symbol", status); close(file_handle); @@ -799,7 +800,7 @@ hsa_status_t HsaRsrcFactory::executable_symbols_cb(hsa_executable_t exec, hsa_api_.hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME, symname); CHECK_STATUS("Error in getting kernel name", status); symname[len] = 0; - if (data == NULL) { + if (data == nullptr) { const char* name = cpp_demangle(symname); auto ret = symbols_map_->insert({addr, name}); if (ret.second == false) { @@ -816,16 +817,16 @@ hsa_status_t HsaRsrcFactory::executable_symbols_cb(hsa_executable_t exec, hsa_status_t HsaRsrcFactory::hsa_executable_freeze_interceptor(hsa_executable_t executable, const char* options) { std::lock_guard lck(mutex_); - if (symbols_map_ == NULL) symbols_map_ = new symbols_map_t; + if (symbols_map_ == nullptr) symbols_map_ = new symbols_map_t; hsa_status_t status = - hsa_api_.hsa_executable_iterate_symbols(executable, executable_symbols_cb, NULL); + hsa_api_.hsa_executable_iterate_symbols(executable, executable_symbols_cb, nullptr); CHECK_STATUS("Error in iterating executable symbols", status); return hsa_api_.hsa_executable_freeze(executable, options); } hsa_status_t HsaRsrcFactory::hsa_executable_destroy_interceptor(hsa_executable_t executable) { std::lock_guard lck(mutex_); - if (symbols_map_ != NULL) { + if (symbols_map_ != nullptr) { hsa_status_t status = hsa_api_.hsa_executable_iterate_symbols(executable, executable_symbols_cb, (void*)1); CHECK_STATUS("Error in iterating executable symbols", status); @@ -838,8 +839,8 @@ HsaRsrcFactory::mutex_t HsaRsrcFactory::mutex_; HsaRsrcFactory::timestamp_t HsaRsrcFactory::timeout_ns_ = HsaTimer::TIMESTAMP_MAX; hsa_pfn_t HsaRsrcFactory::hsa_api_{}; bool HsaRsrcFactory::executable_tracking_on_ = false; -HsaRsrcFactory::symbols_map_t* HsaRsrcFactory::symbols_map_ = NULL; -void* HsaRsrcFactory::to_dump_code_obj_ = NULL; +HsaRsrcFactory::symbols_map_t* HsaRsrcFactory::symbols_map_ = nullptr; +void* HsaRsrcFactory::to_dump_code_obj_ = nullptr; } // namespace util } // namespace rocprofiler diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 8473a421..46efbd5b 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -19,14 +19,29 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. 
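The freeze/destroy interceptors defined just above follow a plain decorator pattern: harvest the executable's kernel symbols, then forward to the real runtime entry point saved from the API table. A bare-bones sketch of the shape (real_freeze and the capture step are illustrative placeholders, not the tool's actual plumbing):

#include <hsa/hsa.h>

using hsa_executable_freeze_fn_t = hsa_status_t (*)(hsa_executable_t, const char*);

// Saved from the HsaApiTable before it is patched (illustrative).
static hsa_executable_freeze_fn_t real_freeze = nullptr;

static hsa_status_t freeze_interceptor(hsa_executable_t executable, const char* options) {
  // ... record symbols here, while the executable can still be iterated ...
  return real_freeze(executable, options);  // then delegate to the runtime
}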
################################################################################ -cmake_minimum_required(VERSION 3.16.0) +cmake_minimum_required(VERSION 3.18.0 FATAL_ERROR) set(EXE_NAME "rocprof-ctrl") +# Temporary until the independent build of tests that exists on the Jenkins side is removed if(NOT DEFINED TEST_DIR) set(TEST_DIR ${CMAKE_CURRENT_SOURCE_DIR}) project(${EXE_NAME} C CXX) # Set build environment - include(env) + list(INSERT CMAKE_MODULE_PATH 0 "${CMAKE_CURRENT_SOURCE_DIR}/../cmake_modules") + include(rocprofiler_options) + include(rocprofiler_utils) + include(rocprofiler_env) + include(rocprofiler_formatting) + include(rocprofiler_linting) + find_package(Threads REQUIRED) + find_package( + hsa-runtime64 REQUIRED CONFIG + HINTS ${CMAKE_INSTALL_PREFIX} + PATHS ${ROCM_PATH}) + find_package( + HIP REQUIRED CONFIG + HINTS ${CMAKE_INSTALL_PREFIX} + PATHS ${ROCM_PATH}) endif() set(THREADS_PREFER_PTHREAD_FLAG ON) @@ -136,8 +151,12 @@ foreach(target_id ${GPU_LIST}) generate_hsaco(${target_id} ${TEST_DIR}/${DUMMY_NAME}/${DUMMY_NAME}.cl ${target_id}_DummyKernel.hsaco) endforeach(target_id) -add_custom_target(test DEPENDS ${HSACO_TARGET_LIST}) -add_custom_target(mytest DEPENDS ${HSACO_TARGET_LIST}) +# +# NOTE (jomadsen): cannot create a target named "test" because it is a reserved target name +# -- cmake provides the "test" target to run tests +# +# add_custom_target(test DEPENDS ${HSACO_TARGET_LIST}) +add_custom_target(mytest ALL DEPENDS ${HSACO_TARGET_LIST}) add_custom_command( TARGET mytest POST_BUILD DEPENDS mytest @@ -183,9 +202,11 @@ target_include_directories(${EXE_NAME} PRIVATE ${TEST_DIR} ${ROOT_DIR} ${PROJECT_SOURCE_DIR}/include) target_link_libraries(${EXE_NAME} hsa-runtime64::hsa-runtime64 hsakmt::hsakmt Threads::Threads stdc++fs dl) -execute_process(COMMAND sh -xc "cp ${TEST_DIR}/run.sh ${PROJECT_BINARY_DIR}") -execute_process(COMMAND sh -xc "cp ${TEST_DIR}/tool/*.xml ${PROJECT_BINARY_DIR}") -execute_process(COMMAND sh -xc "mkdir -p ${PROJECT_BINARY_DIR}/RESULTS") + +file(GLOB XML_FILES "${TEST_DIR}/tool/*.xml") +configure_file(${TEST_DIR}/run.sh ${PROJECT_BINARY_DIR}/run.sh COPYONLY) +execute_process(COMMAND ${CMAKE_COMMAND} -E copy ${XML_FILES} ${PROJECT_BINARY_DIR}/) + # TODO(aelwazir): Should be replaced by the current location in the main CMakeLists.txt install( TARGETS ${EXE_NAME} @@ -219,14 +240,14 @@ install( DESTINATION ${CMAKE_INSTALL_LIBDIR}/${ROCPROFILER_NAME} COMPONENT asan) -# Build memory test bench -add_custom_target( - mbench - COMMAND sh -xc "cp -r ${TEST_DIR}/memory_validation ${PROJECT_BINARY_DIR}/test/." 
- COMMAND make -C "${PROJECT_BINARY_DIR}/test/memory_validation") +add_test(NAME v1-tests COMMAND ${PROJECT_BINARY_DIR}/run.sh) +set_tests_properties( + v1-tests PROPERTIES LABELS "v1" ENVIRONMENT "${ROCPROFILER_MEMCHECK_PRELOAD_ENV}" + RUN_SERIAL TRUE) # Copy OCL test -execute_process(COMMAND sh -xc "cp -r ${TEST_DIR}/ocl ${PROJECT_BINARY_DIR}/test/.") +execute_process(COMMAND ${CMAKE_COMMAND} -E copy_directory ${TEST_DIR}/ocl + ${PROJECT_BINARY_DIR}/test/ocl) install( DIRECTORY ${TEST_DIR}/ocl DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/${PROJECT_NAME}/tests-v1/test diff --git a/test/app/test.cpp b/test/app/test.cpp index e54bec82..f102d8fb 100644 --- a/test/app/test.cpp +++ b/test/app/test.cpp @@ -50,8 +50,8 @@ void thread_fun(const int kiter, const int diter, const uint32_t agents_number) for (int i = 0; i < kiter; ++i) { for (uint32_t n = 0; n < agents_number; ++n) { - // RunKernel(0, NULL, agent_info[n], queue[n], diter); - RunKernel(0, NULL, agent_info[n], queue[n], diter); + RunKernel(0, NULL, agent_info[n], queue[n], diter); + // RunKernel(0, NULL, agent_info[n], queue[n], diter); } } diff --git a/test/util/hsa_rsrc_factory.cpp b/test/util/hsa_rsrc_factory.cpp index fddf77e2..0a44d182 100644 --- a/test/util/hsa_rsrc_factory.cpp +++ b/test/util/hsa_rsrc_factory.cpp @@ -299,10 +299,8 @@ hsa_status_t HsaRsrcFactory::LoadAqlProfileLib(aqlprofile_pfn_t* api) { (decltype(::hsa_ven_amd_aqlprofile_start)*)dlsym(handle, "hsa_ven_amd_aqlprofile_start"); api->hsa_ven_amd_aqlprofile_stop = (decltype(::hsa_ven_amd_aqlprofile_stop)*)dlsym(handle, "hsa_ven_amd_aqlprofile_stop"); -#ifdef AQLPROF_NEW_API api->hsa_ven_amd_aqlprofile_read = (decltype(::hsa_ven_amd_aqlprofile_read)*)dlsym(handle, "hsa_ven_amd_aqlprofile_read"); -#endif api->hsa_ven_amd_aqlprofile_legacy_get_pm4 = (decltype(::hsa_ven_amd_aqlprofile_legacy_get_pm4)*)dlsym( handle, "hsa_ven_amd_aqlprofile_legacy_get_pm4"); diff --git a/tests-v2/featuretests/gtests_main.cpp b/tests-v2/featuretests/gtests_main.cpp index a861d36b..1d20fe2b 100644 --- a/tests-v2/featuretests/gtests_main.cpp +++ b/tests-v2/featuretests/gtests_main.cpp @@ -1,30 +1,25 @@ #include <gtest/gtest.h> +#include <hsa/hsa.h> + #include "src/core/hardware/hsa_info.h" int main(int argc, char** argv) { - testing::InitGoogleTest(&argc, argv); testing::FLAGS_gtest_death_test_style = "threadsafe"; - // Add line below to disable any problematic test - hsa_init(); - testing::GTEST_FLAG(filter) = - "-OpenMPTest.*:ProfilerSPMTest.*:ProfilerMQTest.*:ProfilerMPTest.*:MPITest.*"; // Disable ATT test fir gfx10 GPUs until its supported - // iterate for gpu's - hsa_iterate_agents( - [](hsa_agent_t agent, void*) { - char gpu_name[64]; - hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, gpu_name); - std::string gfx_name = gpu_name; - if (gfx_name.find("gfx10") != std::string::npos) { - testing::GTEST_FLAG(filter) = - "-ATTCollection.*:OpenMPTest.*:ProfilerSPMTest*:ProfilerMQTest.*:*ProfilerMPTest.*:" - "MPITest.*"; - } - return HSA_STATUS_SUCCESS; - }, - nullptr); -// Append filter above to disable any problematic test - int res = RUN_ALL_TESTS(); - hsa_shut_down(); - return res; + // scan the raw command-line arguments before InitGoogleTest() consumes them, and + // skip HSA initialization when the run only lists tests or prints help + bool skipInit = false; + for (int i = 0; i < argc; i++) { + if (std::string_view("--gtest_list_tests").compare(argv[i]) == 0 || + std::string_view("-h").compare(argv[i]) == 0 || + std::string_view("--help").compare(argv[i]) == 0) { + skipInit = true; + break; + } + } + if (!skipInit) hsa_init(); +
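The argument scan guarding hsa_init() above is just a linear flag lookup; expressed as a helper it reads as follows (has_flag is an illustrative name, not part of the test suite):

#include <string_view>

static bool has_flag(int argc, char** argv, std::string_view flag) {
  for (int i = 0; i < argc; ++i)
    if (flag == argv[i]) return true;  // string_view compares by content, not pointer
  return false;
}

// bool skipInit = has_flag(argc, argv, "--gtest_list_tests") ||
//                 has_flag(argc, argv, "-h") || has_flag(argc, argv, "--help");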
+ testing::InitGoogleTest(&argc, argv); + // hsa_shut_down(); // waiting for the hsa_shut_down bug to be fixed + // append a --gtest_filter above to disable any problematic test + return RUN_ALL_TESTS(); } diff --git a/tests-v2/featuretests/profiler/CMakeLists.txt b/tests-v2/featuretests/profiler/CMakeLists.txt index 125aaac4..4296ccf3 100644 --- a/tests-v2/featuretests/profiler/CMakeLists.txt +++ b/tests-v2/featuretests/profiler/CMakeLists.txt @@ -28,6 +28,9 @@ set(CMAKE_EXECUTABLE_RPATH_LINK_HIP_FLAG ${CMAKE_SHARED_LIBRARY_RPATH_LINK_CXX_F set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${ROCM_PATH}/lib/cmake/hip") set(CMAKE_HIP_ARCHITECTURES OFF) +if(NOT DEFINED HIP_ROOT_DIR) + set(HIP_ROOT_DIR "${CMAKE_INSTALL_PREFIX}") +endif() find_package(HIP REQUIRED MODULE) # Enable CLANG-TIDY for static analysis search for clang-tidy @@ -36,6 +39,28 @@ if(CLANG_TIDY_EXE) set(CMAKE_CXX_CLANG_TIDY ${CLANG_TIDY_EXE}; -format-style='file'; -header-filter=${CMAKE_CURRENT_SOURCE_DIR};) endif() + +# ######################################################################################## + +function(rocprofiler_featuretests_profiler_add_test _TARGET) + if(TARGET ${_TARGET}) + if(NOT TEST ${_TARGET}) + add_test( + NAME ${_TARGET} + COMMAND $<TARGET_FILE:${_TARGET}> + WORKING_DIRECTORY ${PROJECT_BINARY_DIR}) + endif() + set_tests_properties( + ${_TARGET} PROPERTIES LABELS "featuretests;profiler" ENVIRONMENT + "${ROCPROFILER_MEMCHECK_PRELOAD_ENV}" ${ARGN}) + endif() +endfunction() + +function(rocprofiler_featuretests_profiler_add_executable _TARGET) + hip_add_executable(${_TARGET} ${ARGN}) + rocprofiler_featuretests_profiler_add_test(${_TARGET}) +endfunction() + # ######################################################################################## # App Based FeatureTests # ######################################################################################## @@ -53,7 +78,7 @@ endforeach() # Compile Applications hip_helloworld set_source_files_properties(apps/hello_world_hip.cpp PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) -hip_add_executable(hip_helloworld apps/hello_world_hip.cpp) +rocprofiler_featuretests_profiler_add_executable(hip_helloworld apps/hello_world_hip.cpp) set_target_properties( hip_helloworld PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/tests-v2/featuretests/profiler/apps") @@ -68,7 +93,7 @@ install( # hip_vectoradd set_source_files_properties(apps/vector_add_hip.cpp PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) -hip_add_executable(hip_vectoradd apps/vector_add_hip.cpp) +rocprofiler_featuretests_profiler_add_executable(hip_vectoradd apps/vector_add_hip.cpp) set_target_properties( hip_vectoradd PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/tests-v2/featuretests/profiler/apps") @@ -81,12 +106,22 @@ install( COMPONENT tests) # mpi_vectoradd +add_library(rocprofiler-tests-mpi INTERFACE) +add_library(rocprofiler::tests-mpi ALIAS rocprofiler-tests-mpi) + find_package(MPI) if(MPI_CXX_FOUND) - include_directories(SYSTEM ${MPI_INCLUDE_PATH}) + set(USE_MPI 1) + target_compile_definitions(rocprofiler-tests-mpi INTERFACE USE_MPI=1) + target_link_libraries(rocprofiler-tests-mpi INTERFACE stdc++fs ${MPI_C_LIBRARIES}) + target_include_directories(rocprofiler-tests-mpi INTERFACE ${MPI_INCLUDE_PATH}) +endif() + +if(USE_MPI) set_source_files_properties(apps/vector_add_mpi.cpp PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) - hip_add_executable(mpi_vectoradd apps/vector_add_mpi.cpp) + rocprofiler_featuretests_profiler_add_executable(mpi_vectoradd + apps/vector_add_mpi.cpp) set_target_properties( mpi_vectoradd PROPERTIES 
RUNTIME_OUTPUT_DIRECTORY @@ -98,24 +133,50 @@ if(MPI_CXX_FOUND) DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/${PROJECT_NAME}/tests/featuretests/profiler/apps COMPONENT tests) - target_link_libraries(mpi_vectoradd ${MPI_C_LIBRARIES} stdc++fs) + target_link_libraries(mpi_vectoradd PRIVATE rocprofiler::tests-mpi) +endif() + +# openmp_helloworld +add_library(rocprofiler-tests-openmp INTERFACE) +add_library(rocprofiler::tests-openmp ALIAS rocprofiler-tests-openmp) + +find_package(OpenMP) +if(OpenMP_CXX_FOUND) + set(USE_OpenMP 1) + target_compile_definitions(rocprofiler-tests-openmp INTERFACE USE_OpenMP=1) + target_link_libraries(rocprofiler-tests-openmp INTERFACE OpenMP::OpenMP_CXX + hip::device) +elseif() + message(WARNING "OpenMP not found") + unset(USE_OpenMP) endif() -# openmp_helloworld find_package(hip REQUIRED) find_package(OpenMP) if(OpenMP_CXX_FOUND) # -# Source files. set_source_files_properties(gtests/apps/openmp/hello_world.cpp PROPERTIES -# HIP_SOURCE_PROPERTY_FORMAT 1) hip_add_executable(openmp_helloworld -# gtests/apps/openmp/hello_world.cpp) set_target_properties(openmp_helloworld PROPERTIES -# RUNTIME_OUTPUT_DIRECTORY -# "${PROJECT_BINARY_DIR}/tests-v2/featuretests/profiler/gtests/apps") +# if(USE_OpenMP) +# set_source_files_properties(apps/hello_world_omp.cpp +# PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) +# rocprofiler_featuretests_profiler_add_executable(openmp_helloworld +# apps/hello_world_omp.cpp) +# set_target_properties( +# openmp_helloworld +# PROPERTIES RUNTIME_OUTPUT_DIRECTORY +# "${PROJECT_BINARY_DIR}/tests-v2/featuretests/profiler/apps") + +# target_link_options(openmp_helloworld PRIVATE "-Wl,--build-id=md5") +# target_link_libraries(openmp_helloworld PRIVATE rocprofiler::tests-openmp) -# # Link Libraries - HIP Device and OpenMP. target_compile_options(openmp_helloworld -# PRIVATE ${OpenMP_CXX_FLAGS}) target_link_libraries(openmp_helloworld PRIVATE hip::device -# ${OpenMP_CXX_FLAGS}) endif() +# install( +# TARGETS openmp_helloworld +# RUNTIME +# DESTINATION +# ${CMAKE_INSTALL_DATAROOTDIR}/${PROJECT_NAME}/tests/featuretests/profiler/apps +# COMPONENT tests) +# endif() # hsa-mem_async_copy -- Not Enabled for Now set_source_files_properties(apps/async_mem_copy.cpp PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) -hip_add_executable(hsa_async_mem_copy apps/async_mem_copy.cpp) +rocprofiler_featuretests_profiler_add_executable(hsa_async_mem_copy + apps/async_mem_copy.cpp) set_target_properties( hsa_async_mem_copy PROPERTIES RUNTIME_OUTPUT_DIRECTORY @@ -147,8 +208,8 @@ file(GLOB GTEST_MAIN_SRC_FILE ${GTEST_MAIN_DIR}/gtests_main.cpp) set_source_files_properties(apps/multithreaded_testapp.cpp PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) -hip_add_executable(multithreaded_testapp apps/multithreaded_testapp.cpp - ../utils/test_utils.cpp) +rocprofiler_featuretests_profiler_add_executable( + multithreaded_testapp apps/multithreaded_testapp.cpp ../utils/test_utils.cpp) target_include_directories( multithreaded_testapp PRIVATE ${PROJECT_SOURCE_DIR} @@ -226,18 +287,20 @@ install( ${CMAKE_INSTALL_DATAROOTDIR}/${PROJECT_NAME}/tests/featuretests/profiler/apps COMPONENT tests) -# add_executable(profiler_multiqueue_test discretetests/binary/multiqueue_test.cpp -# utils/csv_parser.cpp utils/test_utils.cpp) -# target_include_directories(profiler_multiqueue_test PRIVATE ${PROJECT_SOURCE_DIR} -# ${PROJECT_SOURCE_DIR}/tests-v2/featuretests/profiler) -# target_link_libraries(profiler_multiqueue_test PRIVATE hsa-runtime64::hsa-runtime64 -# Threads::Threads dl stdc++fs amd_comgr) add_dependencies(tests 
profiler_multiqueue_test) +set(runFeatureTests_SOURCES + profiler_gtest.cpp apps/hip_kernels.cpp ${GTEST_MAIN_SRC_FILE} ${CORE_HSA_SRC_FILES} + ${CORE_HW_SRC_FILES} ${CORE_UTILS_SRC_FILES} ${TEST_UTILS_SRC_FILES}) set_source_files_properties(apps/hip_kernels.cpp PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) -hip_add_executable( - runFeatureTests profiler_gtest.cpp apps/hip_kernels.cpp ${GTEST_MAIN_SRC_FILE} - ${CORE_HSA_SRC_FILES} ${CORE_HW_SRC_FILES} ${CORE_UTILS_SRC_FILES} - ${TEST_UTILS_SRC_FILES}) +hip_add_executable(runFeatureTests ${runFeatureTests_SOURCES}) + +# link MPI and OpenMP to runFeatureTests for visibility +if(USE_MPI) + target_compile_definitions(runFeatureTests PRIVATE USE_MPI=1) +endif() +if(USE_OpenMP) + target_compile_definitions(runFeatureTests PRIVATE USE_OpenMP=1) +endif() target_include_directories( runFeatureTests @@ -253,7 +316,40 @@ target_link_options(runFeatureTests PRIVATE "-Wl,--build-id=md5") install(TARGETS runFeatureTests RUNTIME DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/${PROJECT_NAME}/tests COMPONENT tests) -add_test(AllTests runFeatureTests) + +# add_test(AllTests runFeatureTests) +include(GoogleTest) + +set(GTEST_DISCOVER_TESTS_TARGET runFeatureTests) +set(GTEST_DISCOVER_TESTS_LABELS "v2" "featuretests") +set(GTEST_DISCOVER_TESTS_ENVIRONMENT ${ROCPROFILER_MEMCHECK_PRELOAD_ENV}) +configure_file( + ${PROJECT_SOURCE_DIR}/cmake_modules/Templates/gtest_discover_tests_properties.cmake.in + ${CMAKE_CURRENT_BINARY_DIR}/runFeatureTests_TestProperties.cmake @ONLY) + +# we cannot set LD_PRELOAD during test discovery, but test discovery displays disabled tests +# in CDash, so the gtest_add_tests branch below is the work-around +# the gtest_discover_tests branch is commented out temporarily until it is fixed on the Jenkins dockers +# if(NOT ROCPROFILER_MEMCHECK MATCHES "(Thread|Address)Sanitizer") +# gtest_discover_tests(runFeatureTests) + +# set_property( +# DIRECTORY ${CMAKE_CURRENT_LIST_DIR} +# APPEND +# PROPERTY TEST_INCLUDE_FILES +# ${CMAKE_CURRENT_BINARY_DIR}/runFeatureTests_TestProperties.cmake) +# else() + gtest_add_tests( + TARGET runFeatureTests + SOURCES "${runFeatureTests_SOURCES}" + TEST_LIST runFeatureTests_TESTS) + include(${CMAKE_CURRENT_BINARY_DIR}/runFeatureTests_TestProperties.cmake) +# endif() + +# for the *_FilePlugin tests
if(NOT EXISTS "${PROJECT_BINARY_DIR}/test-output") + file(MAKE_DIRECTORY "${PROJECT_BINARY_DIR}/test-output") +endif() # Copy scripts, input files to samples folder configure_file(${CMAKE_CURRENT_SOURCE_DIR}/apps/goldentraces/basic_metrics.txt @@ -268,3 +364,21 @@ install( DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/${PROJECT_NAME}/tests/featuretests/profiler/apps/goldentraces COMPONENT tests) + +find_package( + Python3 + COMPONENTS Interpreter + REQUIRED) + + execute_process( + COMMAND ${Python3_EXECUTABLE} -c "import pandas" + RESULT_VARIABLE PANDAS_HEADER_PARSER + OUTPUT_QUIET) + if(NOT ${PANDAS_HEADER_PARSER} EQUAL 0) + message( + "The \"pandas\" Python3 package is not installed. 
\ + Please install it using the following command: \"${Python3_EXECUTABLE} -m pip install pandas\".") + else() + # cmake based tests + include(${CMAKE_CURRENT_LIST_DIR}/counter_validation_tests.cmake) + endif() diff --git a/tests-v2/featuretests/profiler/apps/goldentraces/mpi_vectoradd_golden_traces.txt b/tests-v2/featuretests/profiler/apps/goldentraces/mpi_vectoradd_golden_traces.txt index a8c4edda..59107d0a 100755 --- a/tests-v2/featuretests/profiler/apps/goldentraces/mpi_vectoradd_golden_traces.txt +++ b/tests-v2/featuretests/profiler/apps/goldentraces/mpi_vectoradd_golden_traces.txt @@ -4,14 +4,7 @@ Enabling Counter Collection ROCProfilerV2: Collecting the following counters: - GRBM_COUNT Enabling Counter Collection -ROCProfilerV2: Collecting the following counters: -- GRBM_COUNT -Enabling Counter Collection -device count and rank is8: 2 -Rank Id: 0 | Device Id : 0 | Num Devices: 8 -device count and rank is8: 2 -Rank Id: 1 | Device Id : 1 | Num Devices: 8 -Max error: 0.000000 +device count and rank is1: 1 +Rank Id: 0 | Device Id : 0 | Num Devices: 1 Max error: 0.000000 -Dispatch_ID(1), GPU_ID(5), Queue_ID(1), Queue_Index(0), Process_ID(2185441), Thread_ID(2185441), Grid_Size(1048576), Workgroup_Size(256), LDS(0), Scratch_Size(0), Arch_VGPR(12), Accumulative_VGPR(4), SGPR(32), Wave_Size(64), Kernel_Name("add"), Begin_Timestamp(139857691152944), End_Timestamp(139857835223272), Correlation_ID(0), GRBM_COUNT(499551.000000) -Dispatch_ID(1), GPU_ID(4), Queue_ID(1), Queue_Index(0), Process_ID(2185436), Thread_ID(2185436), Grid_Size(1048576), Workgroup_Size(256), LDS(0), Scratch_Size(0), Arch_VGPR(12), Accumulative_VGPR(4), SGPR(32), Wave_Size(64), Kernel_Name("add"), Begin_Timestamp(140429257347632), End_Timestamp(140429483317480), Correlation_ID(0), GRBM_COUNT(499406.000000) +Dispatch_ID(1), GPU_ID(1), Queue_ID(1), Queue_Index(0), Process_ID(6293), Thread_ID(6293), Grid_Size(1048576), Workgroup_Size(256), LDS(0), Scratch_Size(0), Arch_VGPR(12), Accumulative_VGPR(0), SGPR(32), Wave_Size(64), Kernel_Name("add"), Begin_Timestamp(140016470724832), End_Timestamp(5), Correlation_ID(0), GRBM_COUNT(1108537.000000) diff --git a/tests-v2/featuretests/profiler/apps/goldentraces/openmp_helloworld_golden_traces.txt b/tests-v2/featuretests/profiler/apps/goldentraces/openmp_helloworld_golden_traces.txt index e0007548..5333b219 100755 --- a/tests-v2/featuretests/profiler/apps/goldentraces/openmp_helloworld_golden_traces.txt +++ b/tests-v2/featuretests/profiler/apps/goldentraces/openmp_helloworld_golden_traces.txt @@ -1,4 +1,5 @@ ROCProfilerV2: Collecting the following counters: - GRBM_COUNT Enabling Counter Collection -Dispatch_ID(1), GPU_ID(4), Queue_ID(1), Queue_Index(1), Process_ID(2186189), Thread_ID(2186189), Grid_Size(10), Workgroup_Size(10), LDS(0), Scratch_Size(0), Arch_VGPR(8), Accumulative_VGPR(0), SGPR(16), Wave_Size(64), Kernel_Name("helloworld"), Begin_Timestamp(140284033765472), End_Timestamp(140288419293408), Correlation_ID(0), GRBM_COUNT(13839.000000) +PASSED! 
+Dispatch_ID(1), GPU_ID(1), Queue_ID(1), Queue_Index(0), Process_ID(11822), Thread_ID(11822), Grid_Size(1), Workgroup_Size(1), LDS(0), Scratch_Size(0), Arch_VGPR(4), Accumulative_VGPR(0), SGPR(16), Wave_Size(64), Kernel_Name("hip_helloworld"), Begin_Timestamp(140470675179888), End_Timestamp(140470675179776), Correlation_ID(0), GRBM_COUNT(22315.000000) \ No newline at end of file diff --git a/tests-v2/featuretests/profiler/counter_validation_tests.cmake b/tests-v2/featuretests/profiler/counter_validation_tests.cmake new file mode 100644 index 00000000..ff319780 --- /dev/null +++ b/tests-v2/featuretests/profiler/counter_validation_tests.cmake @@ -0,0 +1,32 @@ +# counter validation test - GRBM_COUNT +add_test( + NAME grbm_count_helloworld_test + COMMAND + ${PROJECT_BINARY_DIR}/rocprofv2 -i + ${PROJECT_BINARY_DIR}/tests-v2/featuretests/profiler/apps/input.txt -d + ${PROJECT_BINARY_DIR}/out-grbm_count -o grbm + tests-v2/featuretests/profiler/apps/hip_helloworld + WORKING_DIRECTORY "${PROJECT_BINARY_DIR}") + +set_tests_properties( + grbm_count_helloworld_test PROPERTIES LABELS "v2;rocprofv2" ENVIRONMENT + "${ROCPROFILER_MEMCHECK_PRELOAD_ENV}") + +add_test( + NAME grbm_count_helloworld_test_validation + COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/grbm_validate.py + "out-grbm_count/pmc_1/results_grbm.csv" + WORKING_DIRECTORY "${PROJECT_BINARY_DIR}") + +set_tests_properties( + grbm_count_helloworld_test_validation + PROPERTIES DEPENDS + grbm_count_helloworld_test + LABELS + "v2;validation" + PASS_REGULAR_EXPRESSION + "Test Passed" + FAIL_REGULAR_EXPRESSION + "Test Failed" + SKIP_REGULAR_EXPRESSION + "Skipped") diff --git a/tests-v2/featuretests/profiler/grbm_validate.py b/tests-v2/featuretests/profiler/grbm_validate.py new file mode 100644 index 00000000..05fdd4b2 --- /dev/null +++ b/tests-v2/featuretests/profiler/grbm_validate.py @@ -0,0 +1,27 @@ +import pandas as pd +import sys + + +def validate_grbm_count(filename): + df = pd.read_csv(filename) + + grbm_count = df.loc[0, "GRBM_COUNT"] + + # Validate the data + if grbm_count >= 0: + print("Test Passed: grbm count is valid.") + return 0 + else: + print("Test Failed: grbm count is not valid.") + return 1 + + +if __name__ == "__main__": + files = sys.argv[1:] + if not files: + raise RuntimeError("no input files provided") + for filename in files: + ec = validate_grbm_count(filename) + if ec != 0: + sys.stderr.write(f"{filename} did not pass validation\n") + sys.exit(ec) diff --git a/tests-v2/featuretests/profiler/profiler_gtest.cpp b/tests-v2/featuretests/profiler/profiler_gtest.cpp index b0848b20..8461e0cf 100644 --- a/tests-v2/featuretests/profiler/profiler_gtest.cpp +++ b/tests-v2/featuretests/profiler/profiler_gtest.cpp @@ -99,6 +99,10 @@ void ApplicationParser::SetApplicationEnv(const char* app_name) { setenv("COUNTERS_PATH", counter_path.str().c_str(), true); std::stringstream hsa_tools_lib_path; + auto _existing_ld_preload = getenv("LD_PRELOAD"); + if (_existing_ld_preload && strnlen(_existing_ld_preload, 1) > 0) + hsa_tools_lib_path << _existing_ld_preload << ":"; + hsa_tools_lib_path << app_path << lib_path; setenv("LD_PRELOAD", hsa_tools_lib_path.str().c_str(), true); @@ -247,9 +251,10 @@ TEST_F(HelloWorldTest, WhenRunningProfilerWithAppThenKernelNamessMatchWithGolden std::vector current_kernel_info; GetKernelInfoForRunningApplication(&current_kernel_info); - ASSERT_TRUE(current_kernel_info.size()); - ASSERT_TRUE(golden_kernel_info.size()); - EXPECT_EQ(golden_kernel_info[0].kernel_name, current_kernel_info[0].kernel_name); + 
ASSERT_EQ(golden_kernel_info.size(), current_kernel_info.size()); + for (size_t i = 0; i < current_kernel_info.size(); ++i) { + EXPECT_EQ(golden_kernel_info[i].kernel_name, current_kernel_info[i].kernel_name) << "i=" << i; + } } // Test:3 Compares order of kernel-names in golden output against current @@ -380,143 +385,106 @@ TEST_F(HSATest, WhenRunningProfilerWithAppThenKernelNumbersMatchWithGoldenOutput * ############ OpenMP Tests ################ * ################################################### */ - -class OpenMPTest : public ProfilerTest { - protected: - std::vector golden_kernel_info; - void SetUp() { - ProfilerTest::SetUp("openmp_helloworld"); - GetKernelInfoForGoldenOutput("openmp_helloworld", kGoldenOutputOpenMP, &golden_kernel_info); - } -}; - -// Test:1 Compares total num of kernel-names in golden output against current -// profiler output -TEST_F(OpenMPTest, WhenRunningProfilerWithAppThenKernelNumbersMatchWithGoldenOutput) { - std::vector current_kernel_info; - - GetKernelInfoForRunningApplication(¤t_kernel_info); - ASSERT_TRUE(current_kernel_info.size()); - - EXPECT_EQ(golden_kernel_info.size(), current_kernel_info.size()); -} - -// Test:2 Compares order of kernel-names in golden output against current -// profiler output -TEST_F(OpenMPTest, WhenRunningProfilerWithAppThenKernelNamessMatchWithGoldenOutput) { - std::vector current_kernel_info; - - GetKernelInfoForRunningApplication(¤t_kernel_info); - ASSERT_TRUE(current_kernel_info.size()); - - EXPECT_EQ(golden_kernel_info[0].kernel_name, current_kernel_info[0].kernel_name); - EXPECT_EQ(golden_kernel_info[1].kernel_name, current_kernel_info[1].kernel_name); -} - -// Test:3 Compares order of kernel-names in golden output against current -// profiler output -TEST_F(OpenMPTest, WhenRunningProfilerWithAppThenKernelDurationShouldBePositive) { - // kernel info in current profiler run - std::vector current_kernel_info; - - GetKernelInfoForRunningApplication(¤t_kernel_info); - ASSERT_TRUE(current_kernel_info.size()); - - EXPECT_GT(current_kernel_info.size(), 0); -} - -// Test:4 Compares end-time is greater than start-time in current -// profiler output -TEST_F(OpenMPTest, WhenRunningProfilerWithAppThenEndTimeIsGreaterThenStartTime) { - // kernel info in current profiler run - std::vector current_kernel_info; - - GetKernelInfoForRunningApplication(¤t_kernel_info); - ASSERT_TRUE(current_kernel_info.size()); - - for (auto& itr : current_kernel_info) { - if (!(itr.end_time).empty()) { - EXPECT_GT(itr.end_time, itr.begin_time); - } - } -} - +// #ifdef USE_OpenMP +// class OpenMPTest : public ProfilerTest { +// protected: +// std::vector golden_kernel_info; +// void SetUp() { +// ProfilerTest::SetUp("openmp_helloworld"); +// GetKernelInfoForGoldenOutput("openmp_helloworld", kGoldenOutputOpenMP, &golden_kernel_info); +// } +// }; + +// // Test:1 Compares total num of kernel-names in golden output against current +// // profiler output +// TEST_F(OpenMPTest, WhenRunningProfilerWithAppThenKernelNumbersMatchWithGoldenOutput) { +// std::vector current_kernel_info; + +// GetKernelInfoForRunningApplication(¤t_kernel_info); +// ASSERT_TRUE(current_kernel_info.size()); + +// EXPECT_EQ(golden_kernel_info.size(), current_kernel_info.size()); +// } + +// // Test:2 Compares order of kernel-names in golden output against current +// // profiler output +// TEST_F(OpenMPTest, WhenRunningProfilerWithAppThenKernelNamesMatchWithGoldenOutput) { +// std::vector current_kernel_info; + +// GetKernelInfoForRunningApplication(¤t_kernel_info); +// 
ASSERT_TRUE(current_kernel_info.size()); + +// EXPECT_EQ(golden_kernel_info[0].kernel_name, current_kernel_info[0].kernel_name); +// } + +// // Test:3 Compares order of kernel-names in golden output against current +// // profiler output +// TEST_F(OpenMPTest, WhenRunningProfilerWithAppThenKernelDurationShouldBePositive) { +// // kernel info in current profiler run +// std::vector current_kernel_info; + +// GetKernelInfoForRunningApplication(¤t_kernel_info); +// ASSERT_TRUE(current_kernel_info.size()); + +// EXPECT_GT(current_kernel_info.size(), 0); +// } + +// // Test:4 Compares end-time is greater than start-time in current +// // profiler output +// TEST_F(OpenMPTest, WhenRunningProfilerWithAppThenEndTimeIsGreaterThenStartTime) { +// // kernel info in current profiler run +// std::vector current_kernel_info; + +// GetKernelInfoForRunningApplication(¤t_kernel_info); +// ASSERT_TRUE(current_kernel_info.size()); + +// for (auto& itr : current_kernel_info) { +// if (!(itr.end_time).empty()) { +// EXPECT_GT(itr.end_time, itr.begin_time); +// } +// } +// } +// #endif /* * ################################################### * ############ MPI Tests ################ * ################################################### */ - +#ifdef USE_MPI class MPITest : public ProfilerTest { protected: void ProcessMPIApplication(const char* app_name); void ExecuteAndParseApplication(std::stringstream& ss); + std::vector golden_kernel_info; void SetUp() { /*To supress No protocol found prints*/ setenv("HWLOC_COMPONENTS", "-gl", 1); - - // run as standalone test ProfilerTest::SetUp("mpi_vectoradd"); - - // run mpirun script - // ProcessMPIApplication("mpi_run.sh"); + GetKernelInfoForGoldenOutput("mpi_vectoradd", kGoldenOutputMpi, &golden_kernel_info); } - - /*virtual void TearDown() override { - unsetenv("HWLOC_COMPONENTS"); - unsetenv("LD_PRELOAD"); - ProfilerTest::TearDown(); - }*/ }; -void MPITest::ProcessMPIApplication(const char* app_name) { - std::string app_path = GetRunningPath(running_path); - std::string lib_path = app_path; - - std::stringstream hsa_tools_lib_path; - - hsa_tools_lib_path << app_path << "librocprofiler_tool.so"; - setenv("LD_PRELOAD", hsa_tools_lib_path.str().c_str(), true); - - std::stringstream os; - os << app_path << "tests/featuretests/profiler/apps/" << app_name; - ExecuteAndParseApplication(os); -} - -void MPITest::ExecuteAndParseApplication(std::stringstream& ss) { - FILE* handle = popen(ss.str().c_str(), "r"); - ASSERT_NE(handle, nullptr); - char* ln{NULL}; - std::string temp{""}; - size_t len{0}; - - while (getline(&ln, &len, handle) != -1) { - temp = temp + std::string(ln); - } +// Test:1 if kernel-name exists in current profiler output +TEST_F(MPITest, WhenRunningProfilerWithAppThenKernelNumbersOutputGenerated) { + std::vector current_kernel_info; - free(ln); - size_t pos{0}; - std::string delimiter{"\n"}; - while ((pos = temp.find(delimiter)) != std::string::npos) { - output_lines.push_back(temp.substr(0, pos)); - temp.erase(0, pos + delimiter.length()); - } + GetKernelInfoForRunningApplication(¤t_kernel_info); + ASSERT_TRUE(current_kernel_info.size()); - pclose(handle); + EXPECT_GT(current_kernel_info.size(), 0); } -// Test:1 Compares total num of kernel-names in golden output against current -// profiler output -TEST_F(MPITest, WhenRunningProfilerWithAppThenKernelNumbersMatchWithGoldenOutput) { +// Test:1 if kernel-name matches with golden output +TEST_F(MPITest, WhenRunningProfilerWithAppThenKernelNameMatchWithGoldenOutput) { std::vector current_kernel_info; 
GetKernelInfoForRunningApplication(¤t_kernel_info); ASSERT_TRUE(current_kernel_info.size()); - EXPECT_GT(current_kernel_info.size(), 0); + EXPECT_EQ(golden_kernel_info[0].kernel_name, current_kernel_info[0].kernel_name); } - +#endif /* * ################################################### * ############ HSA Load Unload Tests ################ @@ -586,8 +554,8 @@ TEST_F(LoadUnloadTest, WhenLoadingSecondTimeThenToolLoadsUnloadsSuccessfully) { class ATTCollection : public ::testing::Test { public: - virtual void SetUp() { bCollected = false; }; - virtual void TearDown(){}; + void SetUp() override { bCollected = false; }; + void TearDown() override{}; static bool bCollected; static void FlushCallback(const rocprofiler_record_header_t* record, @@ -625,11 +593,42 @@ class ATTCollection : public ::testing::Test { bool ATTCollection::bCollected = false; TEST_F(ATTCollection, WhenRunningATTItCollectsTraceDataWithOldAPI) { + // iterate for gpu's + struct agent_info { + bool skip = false; + std::vector agents = {}; + + auto as_string() const { + auto _ss = std::stringstream{}; + for (const auto& itr : agents) _ss << ", " << itr; + auto _v = _ss.str(); + if (_v.length() > 2) return _v.substr(2); + return _v; + } + }; + + auto _info = agent_info{}; + hsa_iterate_agents( + [](hsa_agent_t agent, void* _arg) { + agent_info* _info_v = static_cast(_arg); + EXPECT_NE(_info_v, nullptr); + char gpu_name[64] = {'\0'}; + hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, gpu_name); + _info_v->agents.emplace_back(std::string{gpu_name}); + if (std::regex_search(_info_v->agents.back(), std::regex{"^gfx1[0-1][0-9][0-9]"})) { + _info_v->skip = true; + } + return HSA_STATUS_SUCCESS; + }, + static_cast(&_info)); + + if (_info.skip) GTEST_SKIP(); + int result = ROCPROFILER_STATUS_ERROR; // inititalize ROCProfiler result = rocprofiler_initialize(); - EXPECT_EQ(ROCPROFILER_STATUS_SUCCESS, result); + EXPECT_EQ(ROCPROFILER_STATUS_SUCCESS, result) << "agents: " << _info.as_string(); // Att trace collection parameters rocprofiler_session_id_t session_id; @@ -642,12 +641,12 @@ TEST_F(ATTCollection, WhenRunningATTItCollectsTraceDataWithOldAPI) { // create a session result = rocprofiler_create_session(ROCPROFILER_NONE_REPLAY_MODE, &session_id); - EXPECT_EQ(ROCPROFILER_STATUS_SUCCESS, result); + EXPECT_EQ(ROCPROFILER_STATUS_SUCCESS, result) << "agents: " << _info.as_string(); // create a buffer to hold att trace records for each kernel launch rocprofiler_buffer_id_t buffer_id; result = rocprofiler_create_buffer(session_id, FlushCallback, 0x9999, &buffer_id); - EXPECT_EQ(ROCPROFILER_STATUS_SUCCESS, result); + EXPECT_EQ(ROCPROFILER_STATUS_SUCCESS, result) << "agents: " << _info.as_string(); // create a filter for collecting att traces rocprofiler_filter_id_t filter_id; @@ -655,65 +654,93 @@ TEST_F(ATTCollection, WhenRunningATTItCollectsTraceDataWithOldAPI) { result = rocprofiler_create_filter(session_id, ROCPROFILER_ATT_TRACE_COLLECTION, rocprofiler_filter_data_t{.att_parameters = ¶meters[0]}, parameters.size(), &filter_id, property); - EXPECT_EQ(ROCPROFILER_STATUS_SUCCESS, result); + EXPECT_EQ(ROCPROFILER_STATUS_SUCCESS, result) << "agents: " << _info.as_string(); // set buffer for the filter result = rocprofiler_set_filter_buffer(session_id, filter_id, buffer_id); - EXPECT_EQ(ROCPROFILER_STATUS_SUCCESS, result); + EXPECT_EQ(ROCPROFILER_STATUS_SUCCESS, result) << "agents: " << _info.as_string(); // activating att tracing session result = rocprofiler_start_session(session_id); - EXPECT_EQ(ROCPROFILER_STATUS_SUCCESS, result); + 
EXPECT_EQ(ROCPROFILER_STATUS_SUCCESS, result) << "agents: " << _info.as_string(); // Launch a kernel LaunchVectorAddKernel(); - EXPECT_EQ(ROCPROFILER_STATUS_SUCCESS, result); + EXPECT_EQ(ROCPROFILER_STATUS_SUCCESS, result) << "agents: " << _info.as_string(); // deactivate att tracing session result = rocprofiler_terminate_session(session_id); - EXPECT_EQ(ROCPROFILER_STATUS_SUCCESS, result); + EXPECT_EQ(ROCPROFILER_STATUS_SUCCESS, result) << "agents: " << _info.as_string(); // dump att tracing data result = rocprofiler_flush_data(session_id, buffer_id); - EXPECT_EQ(ROCPROFILER_STATUS_SUCCESS, result); + EXPECT_EQ(ROCPROFILER_STATUS_SUCCESS, result) << "agents: " << _info.as_string(); // destroy session result = rocprofiler_destroy_session(session_id); - EXPECT_EQ(ROCPROFILER_STATUS_SUCCESS, result); + EXPECT_EQ(ROCPROFILER_STATUS_SUCCESS, result) << "agents: " << _info.as_string(); // finalize att tracing by destroying rocprofiler object result = rocprofiler_finalize(); - EXPECT_EQ(ROCPROFILER_STATUS_SUCCESS, result); + EXPECT_EQ(ROCPROFILER_STATUS_SUCCESS, result) << "agents: " << _info.as_string(); // check if we got data from any shader engine - EXPECT_EQ(bCollected, true); + EXPECT_EQ(bCollected, true) << "agents: " << _info.as_string(); } // New API TEST_F(ATTCollection, WhenRunningATTItCollectsTraceDataWithNewAPI) { - int result = ROCPROFILER_STATUS_ERROR; + // iterate for gpu's + struct agent_info { + bool skip = false; + std::vector agents = {}; + + auto as_string() const { + auto _ss = std::stringstream{}; + for (const auto& itr : agents) _ss << ", " << itr; + auto _v = _ss.str(); + if (_v.length() > 2) return _v.substr(2); + return _v; + } + }; + + auto _info = agent_info{}; + hsa_iterate_agents( + [](hsa_agent_t agent, void* _arg) { + agent_info* _info_v = static_cast(_arg); + EXPECT_NE(_info_v, nullptr); + char gpu_name[64] = {'\0'}; + hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, gpu_name); + _info_v->agents.emplace_back(std::string{gpu_name}); + if (std::regex_search(_info_v->agents.back(), std::regex{"^gfx1[0-1][0-9][0-9]"})) { + _info_v->skip = true; + } + return HSA_STATUS_SUCCESS; + }, + static_cast(&_info)); + + if (_info.skip) GTEST_SKIP(); + int result = ROCPROFILER_STATUS_ERROR; // inititalize ROCProfiler result = rocprofiler_initialize(); EXPECT_EQ(ROCPROFILER_STATUS_SUCCESS, result); - // Att trace collection parameters rocprofiler_session_id_t session_id; std::vector parameters; parameters.emplace_back(rocprofiler_att_parameter_t{ROCPROFILER_ATT_COMPUTE_UNIT, 0}); parameters.emplace_back(rocprofiler_att_parameter_t{ROCPROFILER_ATT_SE_MASK, 0xF}); - parameters.emplace_back(rocprofiler_att_parameter_t{ROCPROFILER_ATT_SIMD_SELECT, 0x3}); // Replace below tests once aqlprofile passes - parameters.emplace_back(rocprofiler_att_parameter_t{ROCPROFILER_ATT_BUFFER_SIZE, 0x1000000}); // Replace below tests once aqlprofile passes - + parameters.emplace_back(rocprofiler_att_parameter_t{ + ROCPROFILER_ATT_SIMD_SELECT, 0x3}); // Replace below tests once aqlprofile passes + parameters.emplace_back(rocprofiler_att_parameter_t{ + ROCPROFILER_ATT_BUFFER_SIZE, 0x1000000}); // Replace below tests once aqlprofile passes // create a session result = rocprofiler_create_session(ROCPROFILER_NONE_REPLAY_MODE, &session_id); EXPECT_EQ(ROCPROFILER_STATUS_SUCCESS, result); - // create a buffer to hold att trace records for each kernel launch rocprofiler_buffer_id_t buffer_id; result = rocprofiler_create_buffer(session_id, FlushCallback, 0x9999, &buffer_id); EXPECT_EQ(ROCPROFILER_STATUS_SUCCESS, 
result); - // create a filter for collecting att traces rocprofiler_filter_id_t filter_id; rocprofiler_filter_property_t property = {}; @@ -721,35 +748,27 @@ TEST_F(ATTCollection, WhenRunningATTItCollectsTraceDataWithNewAPI) { rocprofiler_filter_data_t{.att_parameters = ¶meters[0]}, parameters.size(), &filter_id, property); EXPECT_EQ(ROCPROFILER_STATUS_SUCCESS, result); - // set buffer for the filter result = rocprofiler_set_filter_buffer(session_id, filter_id, buffer_id); EXPECT_EQ(ROCPROFILER_STATUS_SUCCESS, result); - // activating att tracing session result = rocprofiler_start_session(session_id); EXPECT_EQ(ROCPROFILER_STATUS_SUCCESS, result); - // Launch a kernel LaunchVectorAddKernel(); EXPECT_EQ(ROCPROFILER_STATUS_SUCCESS, result); - // deactivate att tracing session result = rocprofiler_terminate_session(session_id); EXPECT_EQ(ROCPROFILER_STATUS_SUCCESS, result); - // dump att tracing data result = rocprofiler_flush_data(session_id, buffer_id); EXPECT_EQ(ROCPROFILER_STATUS_SUCCESS, result); - // destroy session result = rocprofiler_destroy_session(session_id); EXPECT_EQ(ROCPROFILER_STATUS_SUCCESS, result); - // finalize att tracing by destroying rocprofiler object result = rocprofiler_finalize(); EXPECT_EQ(ROCPROFILER_STATUS_SUCCESS, result); - // check if we got data from any shader engine EXPECT_EQ(bCollected, true); } @@ -767,6 +786,7 @@ class ProfilerAPITest : public ::testing::Test { std::stringstream gfx_path; gfx_path << app_path << metrics_path; setenv("ROCPROFILER_METRICS_PATH", gfx_path.str().c_str(), true); + setenv("ROCPROFILER_MAX_ATT_PROFILES", "2", 1); } // function to check profiler API status static void CheckApi(rocprofiler_status_t status) { @@ -932,96 +952,6 @@ TEST_F(DerivedMetricsReuseTest, WhenRunningRepeatedBaseMetricsAPIsWorkFine) { CheckApi(rocprofiler_finalize()); } -/* - * ################################################### - * ############ SPM Tests ################ - * ################################################### - */ - -class ProfilerSPMTest : public ::testing::Test { - // function to check spm tracing API status - protected: - // function to check profiler API status - static void CheckApi(rocprofiler_status_t status) { - ASSERT_EQ(status, ROCPROFILER_STATUS_SUCCESS); - }; - - static void FlushCallback(const rocprofiler_record_header_t* record, - const rocprofiler_record_header_t* end_record, - rocprofiler_session_id_t session_id, - rocprofiler_buffer_id_t buffer_id) { - while (record < end_record) { - if (!record) - break; - else if (record->kind == ROCPROFILER_SPM_RECORD) { - const rocprofiler_record_spm_t* spm_record = - reinterpret_cast(record); - int se_num = 4; - // iterate over each shader engine - for (int i = 0; i < se_num; i++) { - printf("\n\n-------------- shader_engine %d --------------\n\n", i); - rocprofiler_record_se_spm_data_t se_spm = spm_record->shader_engine_data[i]; - for (int i = 0; i < 32; i++) { - printf("%04x\n", se_spm.counters_data[i].value); - } - } - } - CheckApi(rocprofiler_next_record(record, &record, session_id, buffer_id)); - } - } -}; - -TEST_F(ProfilerSPMTest, WhenRunningSPMItCollectsSPMData) { - // initialize rocprofiler - hsa_init(); - CheckApi(rocprofiler_initialize()); - - // spm trace collection parameters - rocprofiler_session_id_t session_id; - rocprofiler_spm_parameter_t spm_parameters; - const char* counter_name = "SQ_WAVES"; - spm_parameters.counters_names = &counter_name; - spm_parameters.counters_count = 1; - spm_parameters.gpu_agent_id = NULL; - // spm_parameters.cpu_agent_id = NULL; - 
spm_parameters.sampling_rate = 10000; - // create a session - CheckApi(rocprofiler_create_session(ROCPROFILER_NONE_REPLAY_MODE, &session_id)); - - // create a buffer to hold spm trace records for each kernel launch - rocprofiler_buffer_id_t buffer_id; - CheckApi(rocprofiler_create_buffer(session_id, FlushCallback, 0x99999999, &buffer_id)); - - // create a filter for collecting spm traces - rocprofiler_filter_id_t filter_id; - rocprofiler_filter_property_t property = {}; - CheckApi(rocprofiler_create_filter(session_id, ROCPROFILER_SPM_COLLECTION, - rocprofiler_filter_data_t{.spm_parameters = &spm_parameters}, - 1, &filter_id, property)); - - // set buffer for the filter - CheckApi(rocprofiler_set_filter_buffer(session_id, filter_id, buffer_id)); - - // activating spm tracing session - CheckApi(rocprofiler_start_session(session_id)); - - // Launch a kernel - LaunchVectorAddKernel(); - - // deactivate spm tracing session - // dump spm tracing data - // - CheckApi(rocprofiler_terminate_session(session_id)); - // CheckApi(rocprofiler_flush_data(session_id, buffer_id)); - - // destroy session - CheckApi(rocprofiler_destroy_session(session_id)); - - // finalize spm tracing by destroying rocprofiler object - CheckApi(rocprofiler_finalize()); - hsa_shut_down(); -} - /* * ################################################### * ############ Multi Thread Binary Tests ############ @@ -1222,13 +1152,13 @@ TEST(ProfilerMPTest, WhenRunningMultiProcessTestItPasses) { */ class CodeobjTest : public ::testing::Test { -public: - virtual void SetUp(const char* app_name) {}; + public: + virtual void SetUp(const char* app_name){}; virtual void TearDown(){}; static void FlushCallback(const rocprofiler_record_header_t* record, const rocprofiler_record_header_t* end_record, rocprofiler_session_id_t session_id, - rocprofiler_buffer_id_t buffer_id) {}; + rocprofiler_buffer_id_t buffer_id){}; void SetupRocprofiler() { int result = ROCPROFILER_STATUS_ERROR; @@ -1279,7 +1209,7 @@ TEST_F(CodeobjTest, WhenRunningProfilerWithCodeobjCapture) { EXPECT_GE(capture.count, 1); bool bCaptured_itself = false; - for (int i=0; i<(int)capture.count; i++) { + for (int i = 0; i < (int)capture.count; i++) { const char* path = capture.symbols[i].filepath; if (!path) continue; std::string fpath(path); @@ -1334,7 +1264,7 @@ TEST_F(CodeobjTest, WhenRunningProfilerWithMultipleCaptureAndCopy) { EXPECT_GE(capture.count, 1); - for (int i=0; i<(int)capture.count; i++) { + for (int i = 0; i < (int)capture.count; i++) { EXPECT_NE(capture.symbols[i].base_address, 0); EXPECT_NE(capture.symbols[i].clock_start.value, 0); EXPECT_NE(capture.symbols[i].data, nullptr); @@ -1445,7 +1375,8 @@ class VectorAddPerfettoMPITest : public PerfettoPluginTest { protected: virtual void SetUp() { setenv("MPI_RANK", "7", true); - RunApplication("hip_vectoradd", " -d /tmp/tests-v2/perfetto/ -o test_%q{MPI_RANK}_ --plugin perfetto"); + RunApplication("hip_vectoradd", + " -d /tmp/tests-v2/perfetto/ -o test_%q{MPI_RANK}_ --plugin perfetto"); } virtual void TearDown() { std::experimental::filesystem::remove_all("/tmp/tests-v2/perfetto/"); @@ -1459,7 +1390,8 @@ TEST_F(VectorAddPerfettoMPITest, WhenRunningProfilerWithPerfettoTest) { } bool CTFPluginTest::hasMetadataInDir(const char* directory) { - for (const auto& entry : std::experimental::filesystem::directory_iterator(directory)) + auto path = std::experimental::filesystem::directory_iterator(directory)->path(); + for (const auto& entry : std::experimental::filesystem::directory_iterator(path)) if 
(std::string(entry.path().filename()) == "metadata") return true; return false; } @@ -1471,7 +1403,7 @@ class VectorAddCTFTest : public CTFPluginTest { std::experimental::filesystem::remove_all("/tmp/tests-v2/"); unsetenv("MPI_RANK"); } - bool hasFile() { return hasMetadataInDir("/tmp/tests-v2/ctf/trace/"); } + bool hasFile() { return hasMetadataInDir("/tmp/tests-v2/ctf/"); } }; TEST_F(VectorAddCTFTest, WhenRunningProfilerWithCTFTest) { EXPECT_EQ(hasFile(), true); } @@ -1486,7 +1418,7 @@ class VectorAddCTFMPITest : public CTFPluginTest { std::experimental::filesystem::remove_all("/tmp/tests-v2/"); unsetenv("MPI_RANK"); } - bool hasFile() { return hasMetadataInDir("/tmp/tests-v2/ctf_7/trace/"); } + bool hasFile() { return hasMetadataInDir("/tmp/tests-v2/ctf_7/"); } }; TEST_F(VectorAddCTFMPITest, WhenRunningProfilerWithCTFTest) { EXPECT_EQ(hasFile(), true); } diff --git a/tests-v2/featuretests/tracer/CMakeLists.txt b/tests-v2/featuretests/tracer/CMakeLists.txt index 12b10640..af7d9602 100644 --- a/tests-v2/featuretests/tracer/CMakeLists.txt +++ b/tests-v2/featuretests/tracer/CMakeLists.txt @@ -6,8 +6,32 @@ set(CMAKE_EXECUTABLE_RPATH_LINK_HIP_FLAG ${CMAKE_SHARED_LIBRARY_RPATH_LINK_CXX_F set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${ROCM_PATH}/lib/cmake/hip") set(CMAKE_HIP_ARCHITECTURES OFF) +if(DEFINED ROCM_PATH) + set(HIP_ROOT_DIR "${ROCM_PATH}/bin") +endif() find_package(HIP REQUIRED MODULE) +# ######################################################################################## +function(rocprofiler_featuretests_tracer_add_test _TARGET) + if(TARGET ${_TARGET}) + if(NOT TEST ${_TARGET}) + add_test( + NAME ${_TARGET} + COMMAND $<TARGET_FILE:${_TARGET}> + WORKING_DIRECTORY ${PROJECT_BINARY_DIR}) + endif() + + set_tests_properties( + ${_TARGET} PROPERTIES LABELS "featuretests;tracer" ENVIRONMENT + "${ROCPROFILER_MEMCHECK_PRELOAD_ENV}" ${ARGN}) + endif() +endfunction() + +function(rocprofiler_featuretests_tracer_add_executable _TARGET) + hip_add_executable(${_TARGET} ${ARGN}) + rocprofiler_featuretests_tracer_add_test(${_TARGET}) +endfunction() + # Setup testing enable_testing() find_package(GTest REQUIRED) @@ -29,7 +53,7 @@ file(GLOB GTEST_MAIN_SRC_FILE ${GTEST_MAIN_DIR}/*.cpp) # Compile Applications hip_helloworld set_source_files_properties(apps/hello_world.cpp PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) -hip_add_executable(tracer_hip_helloworld apps/hello_world.cpp) +rocprofiler_featuretests_tracer_add_executable(tracer_hip_helloworld apps/hello_world.cpp) set_target_properties( tracer_hip_helloworld PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/tests-v2/featuretests/tracer/apps") @@ -45,7 +69,7 @@ install( # hsa-mem_async_copy and async_copy_on_engine set_source_files_properties(apps/copy_on_engine.cpp PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) -hip_add_executable(copy_on_engine apps/copy_on_engine.cpp) +rocprofiler_featuretests_tracer_add_executable(copy_on_engine apps/copy_on_engine.cpp) set_target_properties( copy_on_engine PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/tests-v2/featuretests/tracer/apps") @@ -60,24 +84,27 @@ target_link_libraries(copy_on_engine hsa-runtime64::hsa-runtime64 Threads::Threa stdc++fs) # Compile MatrixTranspose App with ROCTX -find_library(ROCTX_LIBRARY NAMES roctx64 HINTS ${ROCM_PATH}/lib) +find_library( + ROCTX_LIBRARY + NAMES roctx64 + HINTS ${ROCM_PATH}/lib) if(ROCTX_LIBRARY) - set_source_files_properties(apps/MatrixTranspose.cpp PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) - hip_add_executable(tracer_matrix_transpose apps/MatrixTranspose.cpp) - set_target_properties( - tracer_matrix_transpose - PROPERTIES 
RUNTIME_OUTPUT_DIRECTORY - "${PROJECT_BINARY_DIR}/tests-v2/featuretests/tracer/apps") - target_link_options(tracer_matrix_transpose PRIVATE "-Wl,--build-id=md5") - target_include_directories( - tracer_matrix_transpose PRIVATE ${ROCM_PATH}) - target_link_libraries(tracer_matrix_transpose ${ROCTX_LIBRARY}) - install( - TARGETS tracer_matrix_transpose - RUNTIME - DESTINATION - ${CMAKE_INSTALL_DATAROOTDIR}/${PROJECT_NAME}/tests/featuretests/tracer/apps - COMPONENT tests) + set_source_files_properties(apps/MatrixTranspose.cpp + PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) + hip_add_executable(tracer_matrix_transpose apps/MatrixTranspose.cpp) + set_target_properties( + tracer_matrix_transpose + PROPERTIES RUNTIME_OUTPUT_DIRECTORY + "${PROJECT_BINARY_DIR}/tests-v2/featuretests/tracer/apps") + target_link_options(tracer_matrix_transpose PRIVATE "-Wl,--build-id=md5") + target_include_directories(tracer_matrix_transpose PRIVATE ${ROCM_PATH}) + target_link_libraries(tracer_matrix_transpose ${ROCTX_LIBRARY}) + install( + TARGETS tracer_matrix_transpose + RUNTIME + DESTINATION + ${CMAKE_INSTALL_DATAROOTDIR}/${PROJECT_NAME}/tests/featuretests/tracer/apps + COMPONENT tests) endif() # Add test cpp file @@ -102,3 +129,11 @@ install( DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/${PROJECT_NAME}/tests/featuretests/tracer/apps/goldentraces COMPONENT tests) + +find_package( + Python3 + COMPONENTS Interpreter + REQUIRED) + +# cmake based tests +include(${CMAKE_CURRENT_LIST_DIR}/hiptrace_validation_tests.cmake) diff --git a/tests-v2/featuretests/tracer/apps/MatrixTranspose.cpp b/tests-v2/featuretests/tracer/apps/MatrixTranspose.cpp index d6452395..81b2585c 100755 --- a/tests-v2/featuretests/tracer/apps/MatrixTranspose.cpp +++ b/tests-v2/featuretests/tracer/apps/MatrixTranspose.cpp @@ -39,52 +39,52 @@ THE SOFTWARE. 
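The reformatted MatrixTranspose.cpp below exercises both ROCTX annotation styles the tracer has to pick up: push/pop ranges, which nest, and start/stop ranges addressed by an id, which may overlap arbitrarily. A minimal sketch of the two (assuming the roctx.h header that ships with roctracer; the traced work itself is elided):

#include <roctx.h>

void annotated_work() {
  roctxRangePush("outer");                            // nested range: LIFO pop
  roctx_range_id_t id = roctxRangeStartA("overlap");  // id-based range: stop in any order
  roctxMark("point of interest");                     // instantaneous event
  roctxRangeStop(id);
  roctxRangePop();
}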
// Device (Kernel) function, it must be void __global__ void matrixTranspose(float* out, float* in, const int width) { - int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; - int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; + int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; - out[y * width + x] = in[x * width + y]; + out[y * width + x] = in[x * width + y]; } // CPU implementation of matrix transpose void matrixTransposeCPUReference(float* output, float* input, const unsigned int width) { - for (unsigned int j = 0; j < width; j++) { - for (unsigned int i = 0; i < width; i++) { - output[i * width + j] = input[j * width + i]; - } + for (unsigned int j = 0; j < width; j++) { + for (unsigned int i = 0; i < width; i++) { + output[i * width + j] = input[j * width + i]; } + } } int main() { - float* Matrix; - float* TransposeMatrix; - float* cpuTransposeMatrix; + float* Matrix; + float* TransposeMatrix; + float* cpuTransposeMatrix; - float* gpuMatrix; - float* gpuTransposeMatrix; + float* gpuMatrix; + float* gpuTransposeMatrix; - hipDeviceProp_t devProp; - hipGetDeviceProperties(&devProp, 0); + hipDeviceProp_t devProp; + hipGetDeviceProperties(&devProp, 0); - std::cout << "Device name " << devProp.name << std::endl; + std::cout << "Device name " << devProp.name << std::endl; - int i; - int errors; + int i; + int errors; - Matrix = (float*)malloc(NUM * sizeof(float)); - TransposeMatrix = (float*)malloc(NUM * sizeof(float)); - cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float)); + Matrix = (float*)malloc(NUM * sizeof(float)); + TransposeMatrix = (float*)malloc(NUM * sizeof(float)); + cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float)); - // initialize the input data - for (i = 0; i < NUM; i++) { - Matrix[i] = (float)i * 10.0f; - } + // initialize the input data + for (i = 0; i < NUM; i++) { + Matrix[i] = (float)i * 10.0f; + } - // allocate the memory on the device side - hipMalloc((void**)&gpuMatrix, NUM * sizeof(float)); - hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float)); + // allocate the memory on the device side + hipMalloc((void**)&gpuMatrix, NUM * sizeof(float)); + hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float)); - uint32_t iterations = 10; - while (iterations-- > 0) { + uint32_t iterations = 10; + while (iterations-- > 0) { std::cout << "## Iteration (" << iterations << ") #################" << std::endl; // Memory transfer from host to device @@ -96,11 +96,11 @@ int main() { roctx_range_id_t roctx_id = roctxRangeStartA("roctx_range with id"); // Lauching kernel from host - hipLaunchKernelGGL(matrixTranspose, dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y), - dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, gpuTransposeMatrix, - gpuMatrix, WIDTH); + hipLaunchKernelGGL( + matrixTranspose, dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y), + dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, gpuTransposeMatrix, gpuMatrix, WIDTH); - roctxRangeStop(roctx_id); + roctxRangeStop(roctx_id); roctxMark("ROCTX-MARK: after hipLaunchKernel"); // Memory transfer from device to host @@ -108,8 +108,8 @@ int main() { hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM * sizeof(float), hipMemcpyDeviceToHost); - roctxRangePop(); // for "hipMemcpy" - roctxRangePop(); // for "hipLaunchKernel" + roctxRangePop(); // for "hipMemcpy" + roctxRangePop(); // for "hipLaunchKernel" // CPU MatrixTranspose computation matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH); 
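The verification hunk that follows compares the GPU and CPU transposes element-wise against a small epsilon rather than with exact equality, since the device round-trip need not be bit-identical. The same check in isolation (CountMismatches is an illustrative helper name):

#include <cmath>
#include <cstddef>

int CountMismatches(const float* gpu, const float* cpu, std::size_t n, double eps = 1.0e-6) {
  int errors = 0;
  for (std::size_t i = 0; i < n; ++i)
    if (std::abs(static_cast<double>(gpu[i]) - static_cast<double>(cpu[i])) > eps) ++errors;
  return errors;
}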
@@ -118,26 +118,25 @@ int main() {
    errors = 0;
    double eps = 1.0E-6;
    for (i = 0; i < NUM; i++) {
-        if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps) {
-            errors++;
-        }
+      if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps) {
+        errors++;
+      }
    }

    if (errors != 0) {
-        printf("FAILED: %d errors\n", errors);
+      printf("FAILED: %d errors\n", errors);
    } else {
-        printf("PASSED!\n");
-    }
-
+      printf("PASSED!\n");
    }
+  }

-    // free the resources on device side
-    hipFree(gpuMatrix);
-    hipFree(gpuTransposeMatrix);
+  // free the resources on device side
+  hipFree(gpuMatrix);
+  hipFree(gpuTransposeMatrix);

-    // free the resources on host side
-    free(Matrix);
-    free(TransposeMatrix);
-    free(cpuTransposeMatrix);
+  // free the resources on host side
+  free(Matrix);
+  free(TransposeMatrix);
+  free(cpuTransposeMatrix);

-    return errors;
+  return errors;
 }
diff --git a/tests-v2/featuretests/tracer/apps/copy_on_engine.cpp b/tests-v2/featuretests/tracer/apps/copy_on_engine.cpp
index 7ce76b72..185a060e 100644
--- a/tests-v2/featuretests/tracer/apps/copy_on_engine.cpp
+++ b/tests-v2/featuretests/tracer/apps/copy_on_engine.cpp
@@ -195,8 +195,8 @@ static hsa_status_t AsyncCpyTest(async_mem_cpy_agent* dst, async_mem_cpy_agent*
   // Initialize the system and destination buffers with a value so we can later
   // validate it has been overwritten
   void* sysPtr = args->cpu.ptr;
-
-  *reinterpret_cast<uint32_t*>(src->ptr) = val;
+  err = hsa_amd_memory_fill(src->ptr, val, sz / sizeof(uint32_t));
+  RET_IF_HSA_ERR(err);

   // Make sure the target and destination agents have access to the buffer.
   hsa_agent_t ag_list[3] = {dst->dev, src->dev, args->cpu.dev};
@@ -231,14 +231,14 @@ static hsa_status_t AsyncCpyTest(async_mem_cpy_agent* dst, async_mem_cpy_agent*
   }

   // Check that the contents of the buffer are what is expected.
-  if (*reinterpret_cast<uint32_t*>(dst->ptr) != *reinterpret_cast<uint32_t*>(src->ptr)) {
-    fprintf(stderr,
-            "Expected 0x%x but got 0x%x in buffer when copying from %lu to %lu and CPU device is "
-            "%lu.\n",
-            *reinterpret_cast<uint32_t*>(src->ptr), *reinterpret_cast<uint32_t*>(dst->ptr),
-            src->dev.handle, dst->dev.handle, args->cpu.dev.handle);
-    return HSA_STATUS_ERROR;
+  for (uint32_t i = 0; i < sz / sizeof(uint32_t); ++i) {
+    if (reinterpret_cast<uint32_t*>(sysPtr)[i] != val) {
+      fprintf(stderr, "Expected 0x%x but got 0x%x in buffer at index %d.\n", val,
+              reinterpret_cast<uint32_t*>(sysPtr)[i], i);
+      return HSA_STATUS_ERROR;
+    }
   }
+
   return HSA_STATUS_SUCCESS;
 }
diff --git a/tests-v2/featuretests/tracer/hip_trace_validate.py b/tests-v2/featuretests/tracer/hip_trace_validate.py
new file mode 100644
index 00000000..a3880989
--- /dev/null
+++ b/tests-v2/featuretests/tracer/hip_trace_validate.py
@@ -0,0 +1,30 @@
+import pandas as pd
+import sys
+
+
+def validate_hip_trace(filename):
+    df = pd.read_csv(filename)
+
+    # note: only the first record is inspected; a start timestamp strictly
+    # before its end timestamp marks the trace as valid
+    start_time = df.loc[0, "Start_Timestamp"]
+    end_time = df.loc[0, "End_Timestamp"]
+
+    # Validate the data
+    if start_time < end_time:
+        print("Test Passed: Time stamps are valid.")
+        return 0
+    else:
+        print("Test Failed: Time stamps are not valid.")
+        return 1
+
+
+if __name__ == "__main__":
+    files = sys.argv[1:]
+    if not files:
+        raise RuntimeError("no input files provided")
+    for filename in files:
+        ec = validate_hip_trace(filename)
+        if ec != 0:
+            sys.stderr.write(f"{filename} did not pass validation\n")
+            sys.exit(ec)
diff --git a/tests-v2/featuretests/tracer/hiptrace_validation_tests.cmake b/tests-v2/featuretests/tracer/hiptrace_validation_tests.cmake
new file mode 100644
index 00000000..c690756e
--- /dev/null
+++ b/tests-v2/featuretests/tracer/hiptrace_validation_tests.cmake
@@ -0,0 +1,29 @@
+# hip-trace validation test - Timestamp
+add_test(
+    NAME hiptrace_helloworld_test
+    COMMAND ${PROJECT_BINARY_DIR}/rocprofv2 --hip-api -d ${PROJECT_BINARY_DIR}/out-trace
+            -o out tests-v2/featuretests/profiler/apps/hip_helloworld
+    WORKING_DIRECTORY "${PROJECT_BINARY_DIR}")
+
+set_tests_properties(
+    hiptrace_helloworld_test PROPERTIES LABELS "v2;rocprofv2" ENVIRONMENT
+                                        "${ROCPROFILER_MEMCHECK_PRELOAD_ENV}")
+
+add_test(
+    NAME hiptrace_helloworld_test_validation
+    COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/hip_trace_validate.py
+            "out-trace/hip_api_trace_out.csv"
+    WORKING_DIRECTORY "${PROJECT_BINARY_DIR}")
+
+set_tests_properties(
+    hiptrace_helloworld_test_validation
+    PROPERTIES DEPENDS
+               hiptrace_helloworld_test
+               LABELS
+               "v2;validation"
+               PASS_REGULAR_EXPRESSION
+               "Test Passed"
+               FAIL_REGULAR_EXPRESSION
+               "Test Failed"
+               SKIP_REGULAR_EXPRESSION
+               "Skipped")
diff --git a/tests-v2/featuretests/tracer/tracer_gtest.cpp b/tests-v2/featuretests/tracer/tracer_gtest.cpp
index 9d5d7a64..11bfcd74 100644
--- a/tests-v2/featuretests/tracer/tracer_gtest.cpp
+++ b/tests-v2/featuretests/tracer/tracer_gtest.cpp
@@ -69,6 +69,11 @@ void ApplicationParser::SetApplicationEnv(const char* app_name, const char* trac
   setenv("LD_LIBRARY_PATH", ld_library_path.str().c_str(), true);

   std::stringstream hsa_tools_lib_path;
+  // keep any LD_PRELOAD entries set by the caller ahead of the tool library
+  auto _existing_ld_preload = getenv("LD_PRELOAD");
+  if (_existing_ld_preload && strnlen(_existing_ld_preload, 1) > 0)
+    hsa_tools_lib_path << _existing_ld_preload << ":";
+
   hsa_tools_lib_path << app_path << lib_path;
   setenv("LD_PRELOAD", hsa_tools_lib_path.str().c_str(), true);

@@ -273,7 +277,7 @@ class AsyncCopyTest : public Tracertest {

 // Test:1 Compares total num of kernel-names in golden output against current
 // tracer output
-TEST_F(AsyncCopyTest, DISABLED_WhenRunningTracerWithAppThenAsyncCopyOutputIsgenerated) {
+TEST_F(AsyncCopyTest, WhenRunningTracerWithAppThenAsyncCopyOutputIsGenerated) {
   // kernel info in current profler run
   std::vector current_kernel_info;
@@ -282,7 +286,7 @@ TEST_F(AsyncCopyTest, DISABLED_WhenRunningTracerWithAppThenAsyncCopyOutputIsgene
 }

 // Test:2 Matches coelation Ids
-TEST_F(AsyncCopyTest, DISABLED_WhenRunningTracerWithAppThenAsyncCorelationCountIsCorrect) {
+TEST_F(AsyncCopyTest, WhenRunningTracerWithAppThenAsyncCorrelationCountIsCorrect) {
   // kernel info in current profler run
   std::vector current_kernel_info;
@@ -387,4 +391,4 @@ TEST_F(ROCTXTest, WhenRunningTracerWithAppThenROCTxOutputIsgenerated) {
   EXPECT_EQ(roctx_output.size(), i)
       << "Current Output number of records is greater than golden output number of records"
       << std::endl;
-}
\ No newline at end of file
+}
diff --git a/tests-v2/featuretests/utils/csv_parser.h b/tests-v2/featuretests/utils/csv_parser.h
index d060bbee..085a6d62 100644
--- a/tests-v2/featuretests/utils/csv_parser.h
+++ b/tests-v2/featuretests/utils/csv_parser.h
@@ -26,6 +26,7 @@ THE SOFTWARE.

 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
diff --git a/tests-v2/microbenchmarks/CMakeLists.txt b/tests-v2/microbenchmarks/CMakeLists.txt
index ec8e425b..d66cb4c6 100644
--- a/tests-v2/microbenchmarks/CMakeLists.txt
+++ b/tests-v2/microbenchmarks/CMakeLists.txt
@@ -6,6 +6,9 @@ set(CMAKE_EXECUTABLE_RPATH_LINK_HIP_FLAG ${CMAKE_SHARED_LIBRARY_RPATH_LINK_CXX_F
 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${ROCM_PATH}/lib/cmake/hip")
 set(CMAKE_HIP_ARCHITECTURES OFF)

+if(DEFINED ROCM_PATH)
+    set(HIP_ROOT_DIR "${ROCM_PATH}/bin")
+endif()
 find_package(HIP REQUIRED MODULE)

 set(TEST_DIR ${PROJECT_SOURCE_DIR}/tests-v2/microbenchmarks)
@@ -13,6 +16,14 @@ file(GLOB TEST_SRC_FILE ${TEST_DIR}/*.cpp)
 set_source_files_properties(${TEST_SRC_FILE} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1)
 hip_add_executable(pcie_bw_test ${TEST_SRC_FILE})

+add_test(
+    NAME pcie_bw_test
+    COMMAND $<TARGET_FILE:pcie_bw_test>
+    WORKING_DIRECTORY "${PROJECT_BINARY_DIR}")
+set_tests_properties(
+    pcie_bw_test
+    PROPERTIES LABELS "v2;benchmarks" ENVIRONMENT "${ROCPROFILER_MEMCHECK_PRELOAD_ENV}"
+               SKIP_REGULAR_EXPRESSION "SIGBUS error. Aborting test" DISABLED TRUE)
 target_link_libraries(pcie_bw_test PRIVATE rocm_smi64)
 target_link_options(pcie_bw_test PRIVATE "-Wl,--build-id=md5")
diff --git a/tests-v2/microbenchmarks/pcie_bw_test.cpp b/tests-v2/microbenchmarks/pcie_bw_test.cpp
index 5b5c9d7c..35f93d76 100644
--- a/tests-v2/microbenchmarks/pcie_bw_test.cpp
+++ b/tests-v2/microbenchmarks/pcie_bw_test.cpp
@@ -18,11 +18,17 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
 */

+// make sure assert works
+#ifdef NDEBUG
+#undef NDEBUG
+#endif
+
+#include <cassert>
 #include 
 #include 
 #include 

+#include <array>
 #include 
 #include 
 #include 
@@ -31,6 +37,7 @@
 #include 
 #include 
 #include 
+#include <csignal>
 #include "rocm_smi/rocm_smi.h"

@@ -71,22 +78,22 @@
 #define HANDLE_ERROR CHK_ERR_ASRT(ret);
 #define HIP_ASSERT(x) (assert((x) == hipSuccess))

-#define SEND_DATA() \
-  HIP_ASSERT(hipMemcpyAsync(dst, src, SIZE * sizeof(int), hipMemcpyDefault, stream));
-
-static float burn_hip(int dev, int* dst, int* src, size_t SIZE,
-                      std::atomic<bool>* transfer_started) {
+static float burn_hip(int dev, int* dst, int* src, size_t sz, std::atomic<bool>* transfer_started) {
   hipSetDevice(dev);
   hipStream_t stream;
   hipStreamCreate(&stream);
-  hipEvent_t events[3];
+  auto events = std::array<hipEvent_t, 3>{};
+
+  auto send_data = [dst, src, sz, stream]() {
+    HIP_ASSERT(hipMemcpyAsync(dst, src, sz * sizeof(int), hipMemcpyDefault, stream));
+  };

-  for (int i = 0; i < 3; i++) {
-    hipEventCreate(events + i);
-    SEND_DATA();
-    hipEventRecord(events[i], stream);
+  for (auto& event : events) {
+    hipEventCreate(&event);
+    send_data();
+    hipEventRecord(event, stream);
   }
-  SEND_DATA();
+  send_data();

   hipEventSynchronize(events[0]);
   transfer_started->store(true);
@@ -95,23 +102,30 @@
   while (elapsed < 1500.0f) {  // Transfer data for 1.5 seconds = 1500 ms
     float out;
-    hipEventSynchronize(events[(counter + 1) % 3]);
-    hipEventElapsedTime(&out, events[counter % 3], events[(counter + 1) % 3]);
+    hipEventSynchronize(events[(counter + 1) % events.size()]);
+    hipEventElapsedTime(&out, events[counter % events.size()],
+                        events[(counter + 1) % events.size()]);
     elapsed += out;
-    hipEventRecord(events[counter % 3], stream);
-    SEND_DATA();
+    hipEventRecord(events[counter % events.size()], stream);
+    send_data();
     counter += 1;
   }
   hipStreamSynchronize(stream);
-  for (int i = 0; i < 3; i++) hipEventDestroy(events[i]);
+  for (auto& event : events) hipEventDestroy(event);
   hipStreamDestroy(stream);
-  return float(SIZE * sizeof(int) * counter) / elapsed / 1E6;
+  return float(sz * sizeof(int) * counter) / elapsed / 1E6;
 }

+namespace {
+void signal_handler(int _sig);
+void activate_signal_handler();
+}  // namespace
+
 int main() {
+  activate_signal_handler();
   const size_t SIZE = 3 << 28;
   rsmi_status_t ret;
   uint16_t dev_id;
@@ -132,9 +146,10 @@ int main() {
     int* d_ptr;
     HIP_ASSERT(hipMalloc((void**)&d_ptr, SIZE * sizeof(int)));

-    std::cout << ">>> Device " << dev << std::endl;
+    std::cout << ">>> Device " << dev << std::flush;
     ret = rsmi_dev_id_get(dev, &dev_id);
     HANDLE_ERROR;
+    std::cout << " (rsmi device id: " << dev_id << ")" << std::endl;

     rsmi_pcie_bandwidth_t bandwidth;
     ret = rsmi_dev_pci_bandwidth_get(dev, &bandwidth);
@@ -147,7 +162,9 @@ int main() {
     std::cout << "Current: "
               << bandwidth.transfer_rate.frequency[bandwidth.transfer_rate.current] << '\n';

-    uint64_t sent = 0, received = 0, max_pkt_sz = 0;
+    uint64_t sent = 0;
+    uint64_t received = 0;
+    uint64_t max_pkt_sz = 0;
     std::atomic<bool> transfer_started;
     transfer_started.store(false);
     auto thread =
@@ -184,4 +201,23 @@ int main() {
   delete[] h_ptr;
   ret = rsmi_shut_down();
   return 0;
-}
\ No newline at end of file
+}
+
+namespace {
+// activate a signal handler to catch a SIGBUS on navi32 and
+// emit a message that we can use to skip the test in CTest
+void activate_signal_handler() {
+  struct sigaction _action = {};
+  sigemptyset(&_action.sa_mask);
+  _action.sa_flags = SA_RESTART;
+  _action.sa_handler = signal_handler;
+  sigaction(SIGBUS, &_action, nullptr);
+}
+
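+// note: writing to std::cerr from a signal handler is not strictly
+// async-signal-safe; this is a best-effort diagnostic, and quick_exit()
+// then bypasses static destructors and atexit handlers instead of
+// resuming the crashed transfer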
+void signal_handler(int _sig) {
+  if (_sig == SIGBUS) {
+    std::cerr << "SIGBUS error. Aborting test" << std::endl;
+  }
+  ::quick_exit(_sig);
+}
+}  // namespace
diff --git a/tests-v2/run_tests.sh b/tests-v2/run_tests.sh
index 1a1977e7..d86804c8 100755
--- a/tests-v2/run_tests.sh
+++ b/tests-v2/run_tests.sh
@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/bin/bash -e

 CURRENT_DIR="$( dirname -- "$0"; )";

@@ -15,4 +15,4 @@ echo -e "running feature tests for rocprofiler"
 eval ${CURRENT_DIR}/tests-v2/featuretests/profiler/runFeatureTests

 echo -e "Running Tracer Tests"
-eval ${CURRENT_DIR}/tests-v2/featuretests/tracer/runTracerFeatureTests
\ No newline at end of file
+eval ${CURRENT_DIR}/tests-v2/featuretests/tracer/runTracerFeatureTests
diff --git a/tests-v2/unittests/core/CMakeLists.txt b/tests-v2/unittests/core/CMakeLists.txt
index c85a8338..c255e124 100644
--- a/tests-v2/unittests/core/CMakeLists.txt
+++ b/tests-v2/unittests/core/CMakeLists.txt
@@ -20,6 +20,30 @@
 # SOFTWARE.
 # ##############################################################################
+# ##############################################################################
+# Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+# ##############################################################################
+
+# Setup unit testing env
+
 find_library(PCIACCESS_LIBRARIES pciaccess REQUIRED)

 enable_testing()
@@ -55,14 +79,14 @@
 file(GLOB ROCPROFILER_TRACER_SRC_FILES
      ${PROJECT_SOURCE_DIR}/src/core/session/tracer/*.cpp)
 file(GLOB ROCPROFILER_ROCTRACER_SRC_FILES
      ${PROJECT_SOURCE_DIR}/src/core/session/tracer/src/*.cpp)
-    file(GLOB ROCPROFILER_ATT_SRC_FILES ${PROJECT_SOURCE_DIR}/src/core/session/att/*.cpp)
-    file(GLOB ROCPROFILER_SRC_CLASS_FILES
-         ${CMAKE_CURRENT_SOURCE_DIR}/rocprofiler_singleton.cpp)
-    file(GLOB ROCPROFILER_ISA_SRC_FILES ${PROJECT_SOURCE_DIR}/src/core/isa_capture/*.cpp)
-    file(GLOB ROCPROFILER_SPM_SRC_FILES ${PROJECT_SOURCE_DIR}/src/core/session/spm/spm.cpp)
-    file(GLOB ROCPROFILER_SRC_API_FILES ${PROJECT_SOURCE_DIR}/src/api/*.cpp)
-    set(ROCPROFILER_SRC_FILES ${ROCPROFILER_SRC_API_FILES} ${ROCPROFILER_ATT_SRC_FILES}
-        ${ROCPROFILER_ISA_SRC_FILES} ${ROCPROFILER_SRC_PROFILER_FILES} ${ROCPROFILER_ATT_SRC_FILES})
+file(GLOB ROCPROFILER_ATT_SRC_FILES ${PROJECT_SOURCE_DIR}/src/core/session/att/*.cpp)
+file(GLOB ROCPROFILER_SRC_CLASS_FILES
+     ${CMAKE_CURRENT_SOURCE_DIR}/rocprofiler_singleton.cpp)
+file(GLOB ROCPROFILER_ISA_SRC_FILES ${PROJECT_SOURCE_DIR}/src/core/isa_capture/*.cpp)
+file(GLOB ROCPROFILER_SPM_SRC_FILES ${PROJECT_SOURCE_DIR}/src/core/session/spm/spm.cpp)
+file(GLOB ROCPROFILER_SRC_API_FILES ${PROJECT_SOURCE_DIR}/src/api/*.cpp)
+set(ROCPROFILER_SRC_FILES ${ROCPROFILER_SRC_API_FILES} ${ROCPROFILER_ATT_SRC_FILES}
+    ${ROCPROFILER_ISA_SRC_FILES} ${ROCPROFILER_SRC_PROFILER_FILES} ${ROCPROFILER_ATT_SRC_FILES})

 set(CORE_HSA_DIR ${PROJECT_SOURCE_DIR}/src/core/hsa)
 file(GLOB CORE_HSA_SRC_FILES ${CORE_HSA_DIR}/*.cpp)
@@ -81,10 +105,14 @@ file(GLOB CORE_COUNTERS_PARENT_SRC_FILES ${PROJECT_SOURCE_DIR}/src/core/counters
 file(GLOB CORE_COUNTERS_METRICS_SRC_FILES
      ${PROJECT_SOURCE_DIR}/src/core/counters/metrics/*.cpp)
 file(GLOB CORE_COUNTERS_MMIO_SRC_FILES ${PROJECT_SOURCE_DIR}/src/core/counters/mmio/*.cpp)

-set(GTEST_MAIN_DIR ${PROJECT_SOURCE_DIR}/tests-v2/unittests/core)
-file(GLOB GTEST_MAIN_SRC_FILE ${GTEST_MAIN_DIR}/gtests_main.cpp)
-add_executable(
-    runCoreUnitTests
+file(GLOB HSASingleton_TEST_SRC_FILES ${CMAKE_CURRENT_SOURCE_DIR}/HSASingleton/*.cpp)
+file(GLOB ROCProfiler_Singleton_TEST_SRC_FILES
+     ${CMAKE_CURRENT_SOURCE_DIR}/ROCProfiler_Singleton/*.cpp)
+file(GLOB GTEST_SRC_FILES ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp)
+
+set(runCoreUnitTests_SOURCES
+    ${GTEST_SRC_FILES}
+    ${HSASingleton_TEST_SRC_FILES}
+    ${ROCProfiler_Singleton_TEST_SRC_FILES}
     ${CORE_MEMORY_SRC_FILES}
     ${CORE_SESSION_SRC_FILES}
     ${CORE_FILTER_SRC_FILES}
@@ -103,31 +131,56 @@ add_executable(
     ${CORE_COUNTERS_METRICS_SRC_FILES}
     ${CORE_COUNTERS_MMIO_SRC_FILES}
     ${CORE_COUNTERS_PARENT_SRC_FILES}
-    ${CORE_PC_SAMPLING_FILES}
-    ${GTEST_MAIN_SRC_FILE}
-    ${CMAKE_CURRENT_SOURCE_DIR}/ROCProfiler_Singleton/ROCProfiler_Singleton_unittests.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/HSASingleton/HSASingleton_unittests.cpp
-    )
+    ${CORE_PC_SAMPLING_FILES})
+
+add_executable(runCoreUnitTests ${runCoreUnitTests_SOURCES})

 target_include_directories(
-    runCoreUnitTests
-    PRIVATE ${PROJECT_SOURCE_DIR} ${PROJECT_SOURCE_DIR}/src ${PROJECT_SOURCE_DIR}/inc
+    runCoreUnitTests
+    PRIVATE ${PROJECT_SOURCE_DIR} ${PROJECT_SOURCE_DIR}/src ${PROJECT_SOURCE_DIR}/inc
            ${CMAKE_CURRENT_SOURCE_DIR} ${PROJECT_BINARY_DIR} ${PROJECT_BINARY_DIR}/rocprofiler)

 target_compile_definitions(
-    runCoreUnitTests
-    PUBLIC AMD_INTERNAL_BUILD
-    PRIVATE PROF_API_IMPL HIP_PROF_HIP_API_STRING=1 __HIP_PLATFORM_AMD__=1)
-
+    runCoreUnitTests
+    PUBLIC AMD_INTERNAL_BUILD
+    PRIVATE PROF_API_IMPL HIP_PROF_HIP_API_STRING=1 __HIP_PLATFORM_AMD__=1)
 target_link_libraries(
-    runCoreUnitTests PRIVATE rocprofiler_tool test_hsatool_library ${AQLPROFILE_LIB} hsa-runtime64::hsa-runtime64
+    runCoreUnitTests PRIVATE rocprofiler_tool test_hsatool_library ${AQLPROFILE_LIB} hsa-runtime64::hsa-runtime64
    GTest::gtest GTest::gtest_main stdc++fs ${PCIACCESS_LIBRARIES})
-
 add_dependencies(tests runCoreUnitTests)

 install(TARGETS runCoreUnitTests RUNTIME
         DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/${PROJECT_NAME}/tests COMPONENT tests)
-add_test(AllTests runCoreUnitTests)
\ No newline at end of file
+
+# add_test(AllTests runCoreUnitTests)
+include(GoogleTest)
+
+set(GTEST_DISCOVER_TESTS_TARGET runCoreUnitTests)
+set(GTEST_DISCOVER_TESTS_LABELS "v2" "unittests")
+set(GTEST_DISCOVER_TESTS_ENVIRONMENT ${ROCPROFILER_MEMCHECK_PRELOAD_ENV})
+configure_file(
+    ${PROJECT_SOURCE_DIR}/cmake_modules/Templates/gtest_discover_tests_properties.cmake.in
+    ${CMAKE_CURRENT_BINARY_DIR}/runUnitTests_TestProperties.cmake @ONLY)
+
+if(NOT ROCPROFILER_MEMCHECK MATCHES "(Thread|Address)Sanitizer")
+    gtest_discover_tests(runCoreUnitTests)
+    set_property(
+        DIRECTORY ${CMAKE_CURRENT_LIST_DIR}
+        APPEND
+        PROPERTY TEST_INCLUDE_FILES
+                 ${CMAKE_CURRENT_BINARY_DIR}/runUnitTests_TestProperties.cmake)
+else()
+    gtest_add_tests(
+        TARGET runCoreUnitTests
+        SOURCES "${runCoreUnitTests_SOURCES}"
+        TEST_LIST runUnitTests_TESTS)
+    include(${CMAKE_CURRENT_BINARY_DIR}/runUnitTests_TestProperties.cmake)
+endif()
+
+# for the *_FilePlugin tests
+if(NOT EXISTS "${PROJECT_BINARY_DIR}/test-output")
+    file(MAKE_DIRECTORY "${PROJECT_BINARY_DIR}/test-output")
+endif()
\ No newline at end of file
diff --git a/tests-v2/unittests/profiler/CMakeLists.txt b/tests-v2/unittests/profiler/CMakeLists.txt
index 393be7f1..e84d1a78 100644
--- a/tests-v2/unittests/profiler/CMakeLists.txt
+++ b/tests-v2/unittests/profiler/CMakeLists.txt
@@ -1,3 +1,25 @@
+# ##############################################################################
+# Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+# ##############################################################################
+
 # Setup unit testing env

 find_library(PCIACCESS_LIBRARIES pciaccess REQUIRED)
@@ -63,8 +85,8 @@ file(GLOB CORE_COUNTERS_PARENT_SRC_FILES ${PROJECT_SOURCE_DIR}/src/core/counters
 file(GLOB CORE_COUNTERS_METRICS_SRC_FILES
      ${PROJECT_SOURCE_DIR}/src/core/counters/metrics/*.cpp)
 file(GLOB CORE_COUNTERS_MMIO_SRC_FILES ${PROJECT_SOURCE_DIR}/src/core/counters/mmio/*.cpp)

-add_executable(
-    runUnitTests
+
+set(runUnitTests_SOURCES
     ${CMAKE_CURRENT_SOURCE_DIR}/profiler_gtest.cpp
     ${CORE_MEMORY_SRC_FILES}
     ${CORE_SESSION_SRC_FILES}
@@ -84,27 +106,56 @@ add_executable(
     ${CORE_COUNTERS_METRICS_SRC_FILES}
     ${CORE_COUNTERS_MMIO_SRC_FILES}
     ${CORE_COUNTERS_PARENT_SRC_FILES}
-    ${CORE_PC_SAMPLING_FILES}
-    ${GTEST_MAIN_SRC_FILE}
-)
+    ${CORE_PC_SAMPLING_FILES})
+
+add_executable(runUnitTests ${runUnitTests_SOURCES})

 target_include_directories(
-    runUnitTests
-    PRIVATE ${PROJECT_SOURCE_DIR} ${PROJECT_SOURCE_DIR}/src ${PROJECT_SOURCE_DIR}/inc
+    runUnitTests
+    PRIVATE ${PROJECT_SOURCE_DIR} ${PROJECT_SOURCE_DIR}/src ${PROJECT_SOURCE_DIR}/inc
            ${CMAKE_CURRENT_SOURCE_DIR} ${PROJECT_BINARY_DIR} ${PROJECT_BINARY_DIR}/rocprofiler)

 target_compile_definitions(
-    runUnitTests
-    PUBLIC AMD_INTERNAL_BUILD
-    PRIVATE PROF_API_IMPL HIP_PROF_HIP_API_STRING=1 __HIP_PLATFORM_AMD__=1)
+    runUnitTests
+    PUBLIC AMD_INTERNAL_BUILD
+    PRIVATE PROF_API_IMPL HIP_PROF_HIP_API_STRING=1 __HIP_PLATFORM_AMD__=1)

 target_link_libraries(
-    runUnitTests PRIVATE rocprofiler_tool ${AQLPROFILE_LIB} hsa-runtime64::hsa-runtime64
-                         GTest::gtest GTest::gtest_main stdc++fs ${PCIACCESS_LIBRARIES} ${GDB} dw elf c dl)
+    runUnitTests PRIVATE rocprofiler_tool ${AQLPROFILE_LIB} hsa-runtime64::hsa-runtime64
+                         GTest::gtest GTest::gtest_main stdc++fs ${PCIACCESS_LIBRARIES} ${GDB} dw elf c dl)

 add_dependencies(tests runUnitTests)

 install(TARGETS runUnitTests RUNTIME
         DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/${PROJECT_NAME}/tests COMPONENT tests)
-add_test(AllTests runUnitTests)
\ No newline at end of file
+
+# add_test(AllTests runUnitTests)
+include(GoogleTest)
+
+set(GTEST_DISCOVER_TESTS_TARGET runUnitTests)
+set(GTEST_DISCOVER_TESTS_LABELS "v2" "unittests")
+set(GTEST_DISCOVER_TESTS_ENVIRONMENT ${ROCPROFILER_MEMCHECK_PRELOAD_ENV})
+configure_file(
+    ${PROJECT_SOURCE_DIR}/cmake_modules/Templates/gtest_discover_tests_properties.cmake.in
+    ${CMAKE_CURRENT_BINARY_DIR}/runUnitTests_TestProperties.cmake @ONLY)
+
+if(NOT ROCPROFILER_MEMCHECK MATCHES "(Thread|Address)Sanitizer")
+    gtest_discover_tests(runUnitTests)
+    set_property(
+        DIRECTORY ${CMAKE_CURRENT_LIST_DIR}
+        APPEND
+        PROPERTY TEST_INCLUDE_FILES
+                 ${CMAKE_CURRENT_BINARY_DIR}/runUnitTests_TestProperties.cmake)
+else()
+    gtest_add_tests(
+        TARGET runUnitTests
+        SOURCES "${runUnitTests_SOURCES}"
+        TEST_LIST runUnitTests_TESTS)
+    include(${CMAKE_CURRENT_BINARY_DIR}/runUnitTests_TestProperties.cmake)
+endif()
+
+# for the *_FilePlugin tests
+if(NOT EXISTS "${PROJECT_BINARY_DIR}/test-output")
+    file(MAKE_DIRECTORY "${PROJECT_BINARY_DIR}/test-output")
+endif()
\ No newline at end of file
diff --git a/tests-v2/unittests/profiler/profiler_gtest.cpp b/tests-v2/unittests/profiler/profiler_gtest.cpp
index 6145cd04..9827d560 100644
--- a/tests-v2/unittests/profiler/profiler_gtest.cpp
+++ b/tests-v2/unittests/profiler/profiler_gtest.cpp
@@ -23,6 +23,7 @@
 #include 
 #include 
 #include 
+#include 
 #include "api/rocprofiler_singleton.h"
 #include "core/memory/generic_buffer.h"

@@ -36,12 +37,9 @@
  * ###############################################
  */
-
 void buffer_callback_fun(const rocprofiler_record_header_t* begin, const rocprofiler_record_header_t* end,
-                         rocprofiler_session_id_t session_id, rocprofiler_buffer_id_t buffer_id) {
-  std::cout << "buffer callback" << std::endl;
-}
+                         rocprofiler_session_id_t session_id, rocprofiler_buffer_id_t buffer_id) {}

 /*
  * ###############################################
@@ -51,7 +49,7 @@ void buffer_callback_fun(const rocprofiler_record_header_t* begin,

 // A lot have changed in the class, since this test was written
 // Need to rewrite all the test cases again.
-TEST(WhenAddingARecordToBuffer, DISABLED_RecordGetsAddedSuccefully) {
+TEST(WhenAddingARecordToBuffer, RecordGetsAddedSuccessfully) {
   Memory::GenericBuffer* buffer = new Memory::GenericBuffer(
       rocprofiler_session_id_t{0}, rocprofiler_buffer_id_t{0}, 0x8000, buffer_callback_fun);

@@ -337,4 +335,4 @@ TEST(WhenTrucatingKokkossKernelNames, KernelNameGetsTruncatedProperly) {
   std::string trunkated_name = rocprofiler::truncate_name(long_kernel_name);

   EXPECT_EQ("hip_parallel_launch_local_memory", trunkated_name);
-}
\ No newline at end of file
+}