diff --git a/.clang-tidy b/.clang-tidy new file mode 100644 index 00000000..c2c3ca3a --- /dev/null +++ b/.clang-tidy @@ -0,0 +1,30 @@ +--- +Checks: "-*,\ +misc-*,\ +-misc-incorrect-roundings,\ +-misc-macro-parentheses,\ +-misc-misplaced-widening-cast,\ +-misc-static-assert,\ +-misc-no-recursion,\ +-misc-non-private-member-variables-in-classes,\ +modernize-*,\ +-modernize-deprecated-headers,\ +-modernize-raw-string-literal,\ +-modernize-return-braced-init-list,\ +-modernize-use-transparent-functors,\ +-modernize-use-trailing-return-type,\ +-modernize-avoid-c-arrays,\ +-modernize-use-auto,\ +-modernize-concat-nested-namespaces,\ +-modernize-use-nodiscard,\ +performance-*,\ +readability-*,\ +-readability-function-size,\ +-readability-identifier-naming,\ +" +CheckOptions: + - key: readability-braces-around-statements.ShortStatementLines + value: '2' + - key: readability-implicit-bool-conversion.AllowPointerConditions + value: '1' +... diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 00000000..90e05c40 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,11 @@ +# To get started with Dependabot version updates, you'll need to specify which +# package ecosystems to update and where the package manifests are located. +# Please see the documentation for all configuration options: +# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates + +version: 2 +updates: + - package-ecosystem: "github-actions" # See documentation for possible values + directory: "/" # Location of package manifests + schedule: + interval: "weekly" diff --git a/.github/workflows/continuous_integration.yml b/.github/workflows/continuous_integration.yml new file mode 100644 index 00000000..c19ecf92 --- /dev/null +++ b/.github/workflows/continuous_integration.yml @@ -0,0 +1,330 @@ +name: Continuous Integration + +on: + workflow_dispatch: + push: + branches: [ "amd-staging" ] + paths-ignore: + - '.github/workflows/pull_*.yml' + - '.github/workflows/linting.yml' + - '.github/workflows/markdown_lint.yml' + - '*.md' + pull_request: + branches: [ "amd-staging" ] + paths-ignore: + - '.github/workflows/pull_*.yml' + - '.github/workflows/linting.yml' + - '.github/workflows/markdown_lint.yml' + - '*.md' + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +env: + # TODO(jrmadsen): replace LD_RUNPATH_FLAG, GPU_LIST, etc. 
with internal handling in cmake + ROCM_PATH: "/opt/rocm" + CMAKE_PREFIX_PATH: "/opt/rocm" + LD_RUNPATH_FLAG: " -Wl,--enable-new-dtags -Wl,--rpath,/opt/rocm/lib" + GPU_LIST: "gfx900 gfx906 gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030 gfx1100 gfx1101 gfx1102" + +jobs: + get_latest_mainline_build_number: + runs-on: mi200 + + outputs: + LATEST_BUILD_NUMBER: ${{ steps.get_build_number.outputs.LATEST_BUILD_NUMBER }} + + steps: + - id: get_build_number + run: echo "LATEST_BUILD_NUMBER=$(wget -qO- 'http://rocm-ci.amd.com/job/compute-rocm-dkms-no-npi-hipclang/lastSuccessfulBuild/buildNumber')" >> $GITHUB_OUTPUT + + Mi200-Ubuntu22-Doc-Packages: + # See: https://docs.github.com/en/free-pro-team@latest/actions/learn-github-actions/managing-complex-workflows#using-a-build-matrix + strategy: + fail-fast: true + max-parallel: 4 + matrix: + include: + - os: 'ubuntu-22.04' + runner: 'renderD129' + device: '/renderD129' + build-type: 'Release' + ci-flags: '--coverage' + name-tag: '-codecov' + - os: 'ubuntu-22.04' + runner: 'renderD130' + device: '/renderD130' + build-type: 'RelWithDebInfo' + ci-flags: '' + name-tag: '' + + runs-on: ${{ matrix.runner }} + + # define this for containers + env: + GIT_DISCOVERY_ACROSS_FILESYSTEM: 1 + + container: + image: compute-artifactory.amd.com:5000/rocm-plus-docker/compute-rocm-dkms-no-npi-hipclang:${{ needs.get_latest_mainline_build_number.outputs.LATEST_BUILD_NUMBER }}-${{ matrix.os }}-stg1 + options: --memory=128g --cpus=32 --ipc=host --device=/dev/kfd --device=/dev/dri${{ matrix.device }} --group-add video --cap-add=SYS_PTRACE --cap-add CAP_SYS_PTRACE --cap-add CAP_SYS_ADMIN --security-opt seccomp=unconfined + + if: ${{ always() }} + needs: get_latest_mainline_build_number + + steps: + - uses: actions/checkout@v3 + + - name: List Files + shell: bash + run: | + which-realpath() { echo "$1 resolves to $(realpath $(which $1))"; } + for i in python python3 git cmake ctest; do which-realpath $i; done + ls -la + + - name: Install Python requirements + shell: bash + run: pip3 install -r requirements.txt + + - name: Configure, Build, and Test + timeout-minutes: 30 + shell: bash + run: + python3 ./script/run-ci.py -B build + --name ${{ github.repository }}-${{ github.ref_name }}-mi200-${{ matrix.os }}${{ matrix.name-tag }} + --build-jobs 12 + --site mi200 + --gpu-targets ${{ env.GPU_LIST }} + ${{ matrix.ci-flags }} + -- + -DCMAKE_BUILD_TYPE=${{ matrix.build-type }} + -DCMAKE_INSTALL_PREFIX="${{ env.ROCM_PATH }}" + -DCMAKE_MODULE_PATH="${{ env.ROCM_PATH }}/hip/cmake;${{ env.ROCM_PATH }}/lib/cmake" + -DCMAKE_SHARED_LINKER_FLAGS="${{ env.LD_RUNPATH_FLAG }}" + -DCMAKE_INSTALL_RPATH=${{ env.ROCM_PATH }} + -DCMAKE_INSTALL_RPATH_USE_LINK_PATH=OFF + -DCPACK_PACKAGING_INSTALL_PREFIX=${{ env.ROCM_PATH }} + -DCPACK_OBJCOPY_EXECUTABLE="${{ env.ROCM_PATH }}/llvm/bin/llvm-objcopy" + -DCPACK_READELF_EXECUTABLE="${{ env.ROCM_PATH }}/llvm/bin/llvm-readelf" + -DCPACK_STRIP_EXECUTABLE="${{ env.ROCM_PATH }}/llvm/bin/llvm-strip" + -DCPACK_OBJDUMP_EXECUTABLE="${{ env.ROCM_PATH }}/llvm/bin/llvm-objdump" + -DCPACK_GENERATOR='DEB;RPM;TGZ' + -DPython3_EXECUTABLE=$(which python3) + + - name: Install + timeout-minutes: 10 + run: + cmake --build build --target install --parallel 8 + + - name: Build Docs + timeout-minutes: 10 + run: + cmake --build build --target doc --parallel 8 + + - name: Build Packaging + timeout-minutes: 10 + run: + cmake --build build --target package --parallel 8 + + - name: Archive production artifacts + uses: actions/upload-artifact@v3 + with: + name: artifacts + path: | + 
${{github.workspace}}/build/*.deb + ${{github.workspace}}/build/*.rpm + ${{github.workspace}}/build/*.tgz + + Building-Testing: + # See: https://docs.github.com/en/free-pro-team@latest/actions/learn-github-actions/managing-complex-workflows#using-a-build-matrix + + strategy: + fail-fast: true + matrix: + runner: ['vega20', 'mi100', 'navi21', 'navi32'] + device: [''] + os: ['ubuntu-22.04'] + build-type: ['RelWithDebInfo'] + ci-flags: [''] + name-tag: [''] + extra-options: [''] + include: + - os: 'rhel-8.x' + runner: 'renderD131' + device: '/renderD131' + build-type: 'RelWithDebInfo' + ci-flags: '' + name-tag: '' + extra-options: '--memory=128g --cpus=32' + - os: 'rhel-9.x' + runner: 'renderD129' + device: '/renderD129' + build-type: 'RelWithDebInfo' + ci-flags: '' + name-tag: '' + extra-options: '--memory=128g --cpus=32' + - os: 'sles' + runner: 'renderD130' + device: '/renderD130' + build-type: 'RelWithDebInfo' + ci-flags: '' + name-tag: '' + extra-options: '--memory=128g --cpus=32' + - os: 'ubuntu-20.04' + runner: 'renderD131' + device: '/renderD131' + build-type: 'RelWithDebInfo' + ci-flags: '' + name-tag: '' + extra-options: '--memory=128g --cpus=32' + + runs-on: ${{ matrix.runner }} + + # define this for containers + env: + GIT_DISCOVERY_ACROSS_FILESYSTEM: 1 + + container: + image: compute-artifactory.amd.com:5000/rocm-plus-docker/compute-rocm-dkms-no-npi-hipclang:${{ needs.get_latest_mainline_build_number.outputs.LATEST_BUILD_NUMBER }}-${{ matrix.os }}-stg1 + options: ${{ matrix.extra-options }} --ipc=host --device=/dev/kfd --device=/dev/dri${{ matrix.device }} --group-add video --cap-add=SYS_PTRACE --cap-add CAP_SYS_PTRACE --cap-add CAP_SYS_ADMIN --security-opt seccomp=unconfined + + if: ${{ always() }} + needs: get_latest_mainline_build_number + + steps: + - uses: actions/checkout@v3 + + - name: List Files + shell: bash + run: | + which-realpath() { echo "$1 resolves to $(realpath $(which $1))"; } + for i in python python3 git cmake ctest; do which-realpath $i; done + ls -la + + - name: Install Python requirements + shell: bash + run: pip3 install -r requirements.txt + + - name: Configure, Build, and Test + if: ${{ matrix.runner != 'navi32' }} + timeout-minutes: 30 + shell: bash + run: + python3 ./script/run-ci.py -B build + --name ${{ github.repository }}-${{ github.ref_name }}-${{ matrix.runner }}-${{ matrix.os }}${{ matrix.name-tag }} + --build-jobs 12 + --site ${{ matrix.runner }} + --gpu-targets ${{ env.GPU_LIST }} + ${{ matrix.ci-flags }} + -- + -DCMAKE_BUILD_TYPE=${{ matrix.build-type }} + -DCMAKE_INSTALL_PREFIX="${{ env.ROCM_PATH }}" + -DCMAKE_MODULE_PATH="${{ env.ROCM_PATH }}/hip/cmake;${{ env.ROCM_PATH }}/lib/cmake" + -DCMAKE_SHARED_LINKER_FLAGS="${{ env.LD_RUNPATH_FLAG }}" + -DCMAKE_INSTALL_RPATH=${{ env.ROCM_PATH }} + -DCMAKE_INSTALL_RPATH_USE_LINK_PATH=OFF + -DCPACK_PACKAGING_INSTALL_PREFIX=${{ env.ROCM_PATH }} + -DCPACK_OBJCOPY_EXECUTABLE="${{ env.ROCM_PATH }}/llvm/bin/llvm-objcopy" + -DCPACK_READELF_EXECUTABLE="${{ env.ROCM_PATH }}/llvm/bin/llvm-readelf" + -DCPACK_STRIP_EXECUTABLE="${{ env.ROCM_PATH }}/llvm/bin/llvm-strip" + -DCPACK_OBJDUMP_EXECUTABLE="${{ env.ROCM_PATH }}/llvm/bin/llvm-objdump" + -DCPACK_GENERATOR='DEB;RPM;TGZ' + -DPython3_EXECUTABLE=$(which python3) + + - name: Configure, Build, and Test + if: ${{ matrix.runner == 'navi32' }} + timeout-minutes: 30 + shell: bash + run: + python3 ./script/run-ci.py -B build + --name ${{ github.repository }}-${{ github.ref_name }}-${{ matrix.runner }}${{ matrix.name-tag }} + --build-jobs 12 + --site ${{ 
matrix.runner }} + --gpu-targets ${{ env.GPU_LIST }} + ${{ matrix.ci-flags }} + -- + -DCMAKE_BUILD_TYPE=${{ matrix.build-type }} + -DCMAKE_INSTALL_PREFIX="${{ env.ROCM_PATH }}" + -DCMAKE_MODULE_PATH="${{ env.ROCM_PATH }}/hip/cmake;${{ env.ROCM_PATH }}/lib/cmake" + -DCMAKE_SHARED_LINKER_FLAGS="${{ env.LD_RUNPATH_FLAG }}" + -DCMAKE_INSTALL_RPATH=${{ env.ROCM_PATH }} + -DCMAKE_INSTALL_RPATH_USE_LINK_PATH=OFF + -DCPACK_PACKAGING_INSTALL_PREFIX=${{ env.ROCM_PATH }} + -DCPACK_OBJCOPY_EXECUTABLE="${{ env.ROCM_PATH }}/llvm/bin/llvm-objcopy" + -DCPACK_READELF_EXECUTABLE="${{ env.ROCM_PATH }}/llvm/bin/llvm-readelf" + -DCPACK_STRIP_EXECUTABLE="${{ env.ROCM_PATH }}/llvm/bin/llvm-strip" + -DCPACK_OBJDUMP_EXECUTABLE="${{ env.ROCM_PATH }}/llvm/bin/llvm-objdump" + -DCPACK_GENERATOR='DEB;RPM;TGZ' + -DPython3_EXECUTABLE=$(which python3) + -- + -LE v1 + + sanitizers: + # See: https://docs.github.com/en/free-pro-team@latest/actions/learn-github-actions/managing-complex-workflows#using-a-build-matrix + + strategy: + fail-fast: false + matrix: + include: + - os: 'ubuntu-22.04' + runner: 'vega20' + build-type: 'RelWithDebInfo' + ci-flags: '' + sanitizer: 'ThreadSanitizer' + - os: 'ubuntu-22.04' + runner: 'navi32' + build-type: 'RelWithDebInfo' + ci-flags: '' + sanitizer: 'LeakSanitizer' + - os: 'ubuntu-22.04' + runner: 'mi100' + build-type: 'RelWithDebInfo' + ci-flags: '' + sanitizer: 'AddressSanitizer' + + runs-on: ${{ matrix.runner }} + + # define this for containers + env: + GIT_DISCOVERY_ACROSS_FILESYSTEM: 1 + + container: + image: compute-artifactory.amd.com:5000/rocm-plus-docker/compute-rocm-dkms-no-npi-hipclang:${{ needs.get_latest_mainline_build_number.outputs.LATEST_BUILD_NUMBER }}-${{ matrix.os }}-stg1 + options: --privileged --ipc=host --device=/dev/kfd --device=/dev/dri --group-add video --cap-add=SYS_PTRACE --cap-add CAP_SYS_PTRACE --cap-add CAP_SYS_ADMIN --security-opt seccomp=unconfined + + if: ${{ always() }} + needs: get_latest_mainline_build_number + + steps: + - uses: actions/checkout@v3 + + - name: List Files + shell: bash + run: | + which-realpath() { echo "$1 resolves to $(realpath $(which $1))"; } + for i in python python3 git cmake ctest; do which-realpath $i; done + ls -la + + - name: Install Python requirements + shell: bash + run: pip3 install -r requirements.txt + + - name: Configure, Build, and Test + timeout-minutes: 30 + shell: bash + run: + python3 ./script/run-ci.py -B build + --name ${{ github.repository }}-${{ github.ref_name }}-mi100-${{ matrix.sanitizer }} + --build-jobs 12 + --site mi100 + --gpu-targets ${{ env.GPU_LIST }} + --memcheck=${{ matrix.sanitizer }} + ${{ matrix.ci-flags }} + -- + -DCMAKE_BUILD_TYPE=${{ matrix.build-type }} + -DCMAKE_INSTALL_PREFIX="${{ env.ROCM_PATH }}" + -DCMAKE_MODULE_PATH="${{ env.ROCM_PATH }}/hip/cmake;${{ env.ROCM_PATH }}/lib/cmake" + -DCMAKE_SHARED_LINKER_FLAGS="${{ env.LD_RUNPATH_FLAG }}" + -DCMAKE_INSTALL_RPATH=${{ env.ROCM_PATH }} + -DCMAKE_INSTALL_RPATH_USE_LINK_PATH=OFF + -DPython3_EXECUTABLE=$(which python3) diff --git a/.github/workflows/docker_cleanup.yml b/.github/workflows/docker_cleanup.yml new file mode 100644 index 00000000..9caa4adf --- /dev/null +++ b/.github/workflows/docker_cleanup.yml @@ -0,0 +1,27 @@ +name: Dockers Cleanup + +on: + # allow triggering manually + workflow_dispatch: + # run on weekly schedule + schedule: + - cron: "0 0 * * 6" + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + cleanup-dockers: + + strategy: + fail-fast: false + matrix: + runner: 
['vega20', 'mi200', 'mi100', 'navi21', 'navi31']
+
+    runs-on: ${{ matrix.runner }}
+
+    steps:
+      - name: prune-dockers
+        run: |
+          docker system prune -f -a --volumes
diff --git a/.github/workflows/formatting.yml b/.github/workflows/formatting.yml
index fbed48e0..9af70090 100644
--- a/.github/workflows/formatting.yml
+++ b/.github/workflows/formatting.yml
@@ -1,12 +1,14 @@
 name: Formatting
-run-name: formatting
 
 on:
+  workflow_dispatch:
   pull_request:
     branches: [ amd-staging ]
     paths-ignore:
       - '.github/workflows/pull_*.yml'
+      - '.github/workflows/linting.yml'
+      - '.github/workflows/markdown_lint.yml'
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.ref }}
@@ -48,10 +50,10 @@ jobs:
         if: failure()
         uses: peter-evans/create-pull-request@v5
         with:
-          commit-message: "run cmake formatting (cmake-format)"
+          commit-message: "cmake formatting (cmake-format)"
           branch: ${{ steps.extract_branch.outputs.branch }}-cmake-format
           delete-branch: true
-          title: "Apply cmake-format to ${{ steps.extract_branch.outputs.branch }}"
+          title: "Format cmake code (via cmake-format) on ${{ steps.extract_branch.outputs.branch }}"
           base: ${{ steps.extract_branch.outputs.branch }}
 
   source:
@@ -90,8 +92,55 @@ jobs:
         if: failure()
         uses: peter-evans/create-pull-request@v5
         with:
-          commit-message: "run formatting (clang-format v11)"
+          commit-message: "source formatting (clang-format v11)"
           branch: ${{ steps.extract_branch.outputs.branch }}-clang-format
           delete-branch: true
-          title: "Apply clang-format (v11) to ${{ steps.extract_branch.outputs.branch }}"
+          title: "Format source code (via clang-format v11) on ${{ steps.extract_branch.outputs.branch }}"
+          base: ${{ steps.extract_branch.outputs.branch }}
+
+  python:
+    runs-on: ubuntu-22.04
+    strategy:
+      matrix:
+        python-version: ['3.10']
+
+    steps:
+      - uses: actions/checkout@v3
+
+      - name: Extract branch name
+        shell: bash
+        run: |
+          echo "branch=${GITHUB_HEAD_REF:-${GITHUB_REF#refs/heads/}}" >> $GITHUB_OUTPUT
+        id: extract_branch
+
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          python -m pip install black
+
+      - name: black format
+        run: |
+          black .
+          if [ $(git diff | wc -l) -ne 0 ]; then
+            echo -e "\nError! Python code not formatted. Run black...\n"
+            echo -e "\nFiles:\n"
+            git diff --name-only
+            echo -e "\nFull diff:\n"
+            git diff
+            exit 1
+          fi
+
+      - name: Create pull request
+        if: failure()
+        uses: peter-evans/create-pull-request@v5
+        with:
+          commit-message: "python formatting (black)"
+          branch: ${{ steps.extract_branch.outputs.branch }}-python-format
+          delete-branch: true
+          title: "Format python code (via black) on ${{ steps.extract_branch.outputs.branch }}"
           base: ${{ steps.extract_branch.outputs.branch }}
diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml
new file mode 100644
index 00000000..361088e9
--- /dev/null
+++ b/.github/workflows/linting.yml
@@ -0,0 +1,103 @@
+name: Linting
+
+on:
+  workflow_dispatch:
+  push:
+    branches: [ "amd-staging" ]
+    paths-ignore:
+      - '.github/workflows/pull_*.yml'
+      - '*.md'
+  pull_request:
+    branches: [ "amd-staging" ]
+    paths-ignore:
+      - '.github/workflows/pull_*.yml'
+      - '*.md'
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+env:
+  # TODO(jrmadsen): replace LD_RUNPATH_FLAG, GPU_LIST, etc.
with internal handling in cmake + GIT_DISCOVERY_ACROSS_FILESYSTEM: 1 + ROCM_PATH: "/opt/rocm" + CMAKE_PREFIX_PATH: "/opt/rocm" + LD_RUNPATH_FLAG: " -Wl,--enable-new-dtags -Wl,--rpath,/opt/rocm/lib" + GPU_LIST: "gfx900 gfx906 gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030 gfx1100 gfx1101 gfx1102" + +jobs: + get_latest_mainline_build_number: + runs-on: mi200 + + outputs: + LATEST_BUILD_NUMBER: ${{ steps.get_build_number.outputs.LATEST_BUILD_NUMBER }} + + steps: + - id: get_build_number + run: echo "LATEST_BUILD_NUMBER=$(wget -qO- 'http://rocm-ci.amd.com/job/compute-rocm-dkms-no-npi-hipclang/lastSuccessfulBuild/buildNumber')" >> $GITHUB_OUTPUT + + linting: + strategy: + fail-fast: false + matrix: + include: + - build-type: 'Debug' + linter: 'clang-tidy' + runner: 'mi100' + os: 'ubuntu-22.04' + - build-type: 'Release' + linter: 'clang-tidy' + runner: 'vega20' + os: 'ubuntu-22.04' + + runs-on: ${{ matrix.runner }} + + needs: get_latest_mainline_build_number + + container: + image: compute-artifactory.amd.com:5000/rocm-plus-docker/compute-rocm-dkms-no-npi-hipclang:${{ needs.get_latest_mainline_build_number.outputs.LATEST_BUILD_NUMBER }}-${{ matrix.os }}-stg1 + options: --ipc=host --device=/dev/kfd --device=/dev/dri --group-add video --cap-add=SYS_PTRACE --cap-add CAP_SYS_PTRACE --cap-add CAP_SYS_ADMIN --security-opt seccomp=unconfined + + steps: + - uses: actions/checkout@v3 + + - name: List Files + shell: bash + run: | + which-realpath() { echo "$1 resolves to $(realpath $(which $1))"; } + for i in python python3 git cmake ctest; do which-realpath $i; done + ls -la + + - name: Update container + run: | + apt-get update + apt-get install -y clang-tidy-11 g++-12 + update-alternatives --install /usr/bin/clang-tidy clang-tidy /usr/bin/clang-tidy-11 10 + update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 10 --slave /usr/bin/g++ g++ /usr/bin/g++-11 + update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 20 --slave /usr/bin/g++ g++ /usr/bin/g++-12 + + - name: Install Python requirements + shell: bash + run: | + python3 -m pip install -r requirements.txt + + - name: Configure, Build, and Test + timeout-minutes: 30 + shell: bash + run: + python3 ./script/run-ci.py -B build + --name ${{ github.repository }}-${{ github.ref_name }}-${{ matrix.runner }}-${{ matrix.linter }}-${{ matrix.build-type }} + --build-jobs 12 + --site ${{ matrix.runner }} + --gpu-targets ${{ env.GPU_LIST }} + --linter ${{ matrix.linter }} + -- + -DCMAKE_BUILD_TYPE=${{ matrix.build-type }} + -DCMAKE_INSTALL_PREFIX="${{ env.ROCM_PATH }}" + -DCMAKE_MODULE_PATH="${{ env.ROCM_PATH }}/hip/cmake;${{ env.ROCM_PATH }}/lib/cmake" + -DCMAKE_SHARED_LINKER_FLAGS="${{ env.LD_RUNPATH_FLAG }}" + -DCMAKE_INSTALL_RPATH=${{ env.ROCM_PATH }} + -DCMAKE_INSTALL_RPATH_USE_LINK_PATH=OFF + -DPython3_EXECUTABLE=$(which python3) + -- + -VV diff --git a/.github/workflows/markdown_lint.yml b/.github/workflows/markdown_lint.yml new file mode 100644 index 00000000..2523de0d --- /dev/null +++ b/.github/workflows/markdown_lint.yml @@ -0,0 +1,21 @@ +name: Markdown Lint + +on: + workflow_dispatch: + pull_request: + branches: [ "amd-staging" ] + paths: + - '*.md' + +jobs: + check-readme: + runs-on: ubuntu-latest + steps: + - name: Check out code + uses: actions/checkout@v3 + + - name: Lint Markdown files + uses: avto-dev/markdown-lint@v1 + with: + config: './.markdown-lint-config.yml' + args: './README.md' diff --git a/.markdown-lint-config.yml b/.markdown-lint-config.yml new file mode 100644 index 00000000..3161169c --- /dev/null +++ 
b/.markdown-lint-config.yml @@ -0,0 +1,141 @@ +default: false # includes/excludes all rules by default + +# Heading levels should only increment by one level at a time +MD001: true + +# Heading style +MD003: true + +# Unordered list style +MD004: true + +# Inconsistent indentation for list items at the same level +MD005: true + +# Consider starting bulleted lists at the beginning of the line +MD006: true + +# Unordered list indentation +MD007: true + +# Trailing spaces +MD009: true + +# Hard tabs +MD010: true + +# Reversed link syntax +MD011: true + +# Multiple consecutive blank lines +MD012: true + +# Line length +MD013: false + +# Dollar signs used before commands without showing output +MD014: false + +# No space after hash on atx style heading +MD018: true + +# Multiple spaces after hash on atx style heading +MD019: true + +# No space inside hashes on closed atx style heading +MD020: true + +# Multiple spaces inside hashes on closed atx style heading +MD021: true + +# Headings should be surrounded by blank lines +MD022: true + +# Headings must start at the beginning of the line +MD023: true + +# Multiple headings with the same content +MD024: + allow_different_nesting: true + +# Multiple top level headings in the same document +MD025: true + +# Trailing punctuation in heading +MD026: true + +# Multiple spaces after blockquote symbol +MD027: true + +# Blank line inside blockquote +MD028: false + +# Ordered list item prefix +MD029: + style: 'one' + +# Spaces after list markers +MD030: true + +# Fenced code blocks should be surrounded by blank lines +MD031: true + +# Lists should be surrounded by blank lines +MD032: true + +# Inline HTML +MD033: true + +# Bare URL used +MD034: true + +# Horizontal rule style +MD035: + style: '***' + +# Emphasis used instead of a heading +MD036: true + +# Spaces inside emphasis markers +MD037: true + +# Spaces inside code span elements +MD038: true + +# Spaces inside link text +MD039: true + +# Fenced code blocks should have a language specified +MD040: true + +# First line in file should be a top level heading +MD041: true + +# No empty links +MD042: true + +# Required heading structure +MD043: false + +# Proper names should have the correct capitalization +MD044: false + +# Images should have alternate text (alt text) +MD045: false + +# Code block style +MD046: + style: 'fenced' + +# Files should end with a single newline character +MD047: true + +# Code fence style +MD048: + style: 'backtick' + +# Custom rules: +CHANGELOG-RULE-001: true +CHANGELOG-RULE-002: true +CHANGELOG-RULE-003: true +CHANGELOG-RULE-004: true \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index 4ebf937c..f5284cca 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -20,7 +20,21 @@ # SOFTWARE. 
# ############################################################################## -cmake_minimum_required(VERSION 3.18.0) +cmake_minimum_required(VERSION 3.18.0 FATAL_ERROR) + +set(CMAKE_C_FLAGS_COVERAGE_INIT + "-Og -g3 -fno-omit-frame-pointer -fprofile-abs-path -fprofile-arcs -ftest-coverage --coverage" + CACHE STRING "C flags for code coverage builds") +set(CMAKE_CXX_FLAGS_COVERAGE_INIT + "-Og -g3 -fno-omit-frame-pointer -fprofile-abs-path -fprofile-arcs -ftest-coverage --coverage" + CACHE STRING "C++ flags for code coverage builds") + +set(CMAKE_C_FLAGS_COVERAGE + "${CMAKE_C_FLAGS_COVERAGE_INIT}" + CACHE STRING "C flags for code coverage builds") +set(CMAKE_CXX_FLAGS_COVERAGE + "${CMAKE_CXX_FLAGS_COVERAGE_INIT}" + CACHE STRING "C++ flags for code coverage builds") # Build is not supported on Windows plaform if(WIN32) @@ -49,18 +63,24 @@ endif() set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) -add_compile_options(-Wall) - set(THREADS_PREFER_PTHREAD_FLAG ON) +set(CMAKE_BUILD_RPATH + "${PROJECT_BINARY_DIR}:${PROJECT_BINARY_DIR}/${CMAKE_INSTALL_LIBDIR}") +set(CMAKE_BUILD_RPATH_USE_ORIGIN ON) +set(CMAKE_SKIP_BUILD_RPATH OFF) # Adding default path cmake modules -list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake_modules") +list(INSERT CMAKE_MODULE_PATH 0 "${CMAKE_CURRENT_SOURCE_DIR}/cmake_modules") + # Set build environment -include(utils) -include(env) +include(rocprofiler_options) +include(rocprofiler_utils) +include(rocprofiler_env) +include(rocprofiler_formatting) +include(rocprofiler_linting) # Setup the package version. -get_version("1.0.0") +rocprofiler_get_version("1.0.0") message("-- LIB-VERSION: ${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_PATCH}") set(BUILD_VERSION_MAJOR ${VERSION_MAJOR}) @@ -162,23 +182,33 @@ if(USE_PROF_API EQUAL 1) endif() endif() +enable_testing() + +# Temporarily for CI to work +set(ROCPROFILER_BUILD_TESTS ON) +set(ROCPROFILER_BUILD_CI ON) + +if(ROCPROFILER_BUILD_CI) + include(CTest) +endif() + # Build libraries add_subdirectory(src) -if(${LIBRARY_TYPE} STREQUAL SHARED) +# Build Plugins +add_subdirectory(plugin) + +if(ROCPROFILER_BUILD_SAMPLES) # Build samples add_subdirectory(samples) +endif() +if(ROCPROFILER_BUILD_TESTS) # Build tests + add_subdirectory(test) add_subdirectory(tests-v2) endif() -# Build Plugins -add_subdirectory(plugin) - -# Build tests -add_subdirectory(${TEST_DIR} ${PROJECT_BINARY_DIR}/test) - # Installation and packaging set(DEST_NAME ${ROCPROFILER_NAME}) if(DEFINED CMAKE_INSTALL_PREFIX) @@ -241,27 +271,12 @@ install( DESTINATION ${CMAKE_INSTALL_LIBDIR}/${ROCPROFILER_NAME} COMPONENT runtime) -# librocprof-tool.so -install( - FILES ${PROJECT_BINARY_DIR}/test/librocprof-tool.so - DESTINATION ${CMAKE_INSTALL_LIBDIR}/${ROCPROFILER_NAME} - COMPONENT runtime) - -install( - FILES ${PROJECT_BINARY_DIR}/test/librocprof-tool.so - DESTINATION ${CMAKE_INSTALL_LIBDIR}/${ROCPROFILER_NAME} - COMPONENT asan) - -install( - FILES ${PROJECT_BINARY_DIR}/test/rocprof-ctrl - DESTINATION ${CMAKE_INSTALL_LIBDIR}/${ROCPROFILER_NAME} - PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ - WORLD_EXECUTE - COMPONENT runtime) - -# File reorg Backward compatibility -option(FILE_REORG_BACKWARD_COMPATIBILITY - "Enable File Reorg with backward compatibility" OFF) +# File reorg backward compatibility for non ASAN packaging +if(NOT ENABLE_ASAN_PACKAGING) + # File reorg Backward compatibility + option(FILE_REORG_BACKWARD_COMPATIBILITY + "Enable File Reorg with backward compatibility" ON) +endif() 
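The `CMAKE_<LANG>_FLAGS_COVERAGE` variables introduced at the top of this CMakeLists.txt change only take effect when the matching build type is selected, since CMake applies `CMAKE_<LANG>_FLAGS_<CONFIG>` for the active configuration. A minimal sketch of how a coverage build might be driven locally (the build directory name and job count are illustrative, not part of this change):

```bash
# Configure with the custom Coverage build type so that
# CMAKE_C_FLAGS_COVERAGE / CMAKE_CXX_FLAGS_COVERAGE are applied,
# then build and run the tests to emit gcov data (.gcno/.gcda files).
cmake -B build -DCMAKE_BUILD_TYPE=Coverage .
cmake --build build --parallel 8
cd build && ctest
```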
 if(FILE_REORG_BACKWARD_COMPATIBILITY)
     # To enable/disable #error in wrapper header files
diff --git a/README.md b/README.md
index af9c0962..75fd98b2 100644
--- a/README.md
+++ b/README.md
@@ -83,19 +83,24 @@ export ROCPROFILER_TRACE=1
 ## Supported AMD GPU Architectures (V1)
 
 The following AMD GPU architectures are supported with ROCprofiler V1:
-
+
 - gfx8 (Fiji/Ellesmere)
 - gfx900 (AMD Vega 10)
 - gfx906 (AMD Vega 7nm also referred to as AMD Vega 20)
 - gfx908 (AMD Instinct™ MI100 accelerator)
 - gfx90a (AMD Instinct™ MI200)
 
+***
+Note: ROCProfiler V1 tool usage documentation is available [here](doc/rocprof_tool.md).
+***
+
 ## ROCProfiler V2
 
-ROCProfilerV2 is a newly developed design for AMD’s tooling infrastructure that provides a hardware specific low level performance analysis interface for profiling of GPU compute applications. The first API library version for ROCProfiler v2 is 9.0.0
-### Note: ROCProfilerV2 is currently considered a beta version and is subject to change in future releases
+***
+Note: ROCProfilerV2 is currently considered a beta version and is subject to change in future releases
+***
 
 ### ROCProfilerV2 Modules
@@ -288,14 +293,6 @@ Usage:
   rocprofv2 --plugin perfetto --hsa-trace -d output_dir # -d is optional, but can be used to define the directory output for output results
   ```
 
-  Both the output directory and filenames allow for simple environment variable substitution via a special syntax %q{var} -> $var, e.g.:
-
-  ```bash
-  export var="FOO"
-  rocprofv2 --plugin perfetto -o file_%q{var}_name
-  # Generates file names: file_FOO_name[...].pftrace
-  ```
-
 - CTF plugin: Outputs the data in ctf format(a binary trace format). CTF binary output can be viewed using TraceCompass or babeltrace.
 
   Usage:
@@ -313,7 +310,7 @@ Tool used to collect fine-grained hardware metrics. Provides ISA-level instructi
   rocprofv2 -i input.txt --plugin att --mode network
   ```
 
-  - app_assembly_file:
+  - app_assembly_file: On ROCm 6.0, ATT enables automatic capture of the ISA during kernel execution, and does not require recompiling. It is recommended to leave this at "auto".
 
   - app_relative_path
     Path for the running application
@@ -356,7 +353,7 @@ Tool used to collect fine-grained hardware metrics. Provides ISA-level instructi
   - att: TARGET_CU=1 //or some other CU [0,15] - WGP for Navi [0,8]
   - SE_MASK=0x1 // bitmask of shader engines. The fewer, the easier on the hardware. Default enables 1 out of 4 shader engines.
   - SIMD_MASK=0xF // GFX9: bitmask of SIMDs. Navi: SIMD Index [0-3].
-  - DISPATCH=ID,RN // collect trace only for the given dispatch_ID and MPI rank RN. RN is optional and ignored for single processes. Multiple line with varying combinations of RN and ID can be added.
+  - DISPATCH=ID,RN // collect trace only for the given dispatch_ID and MPI rank RN. RN is ignored for single processes. Multiple lines with varying combinations of RN and ID can be added.
   - KERNEL=kernname // Profile only kernels containing the string kernname (c++ mangled name). Multiple lines can be added.
   - PERFCOUNTERS_COL_PERIOD=0x3 // Multiplier period for counter collection [0~31]. 0=fastest (usually once every 16 cycles). GFX9 only. Counters will be shown in a graph over time in the browser UI.
   - PERFCOUNTER=counter_name // Add a SQ counter to be collected with ATT; period defined by PERFCOUNTERS_COL_PERIOD. GFX9 only.
@@ -434,7 +431,105 @@ A device profiling session allows the user to profile the GPU device for counter
 
 ### Session Support
 
-A session is a unique identifier for a profiling/tracing/pc-sampling task. A ROCProfilerV2 Session has enough information about what needs to be collected or traced and it allows the user to start/stop profiling/tracing whenever required. More details on the API can be found in the API specification documentation that can be installed using rocprofiler-doc package. Samples also can be found for how to use the API in samples directory.
+  A session is a unique identifier for a profiling/tracing/pc-sampling task. A ROCProfilerV2 Session has enough information about what needs to be collected or traced, and it allows the user to start/stop profiling/tracing whenever required. More details on the API can be found in the API specification documentation, which can be installed using the rocprofiler-doc package. Samples showing how to use the API can also be found in the samples directory.
+
+- #### (ATT) Advanced Thread Trace
+
+  Tool used to collect fine-grained hardware metrics. Provides ISA-level instruction hotspot analysis via hardware tracing.
+
+  ```bash
+  # ATT(Advanced Thread Trace) needs some preparation before running.
+
+  # 1. Make sure to generate the assembly file for the application by executing the following before compiling your HIP application.
+  # This can be achieved globally with the following environment variable:
+  export HIPCC_COMPILE_FLAGS_APPEND="--save-temps -g"
+  # Similarly, the --save-temps -g flags can be added per file for better ISA generation control.
+
+  # 2. Install plugin package
+  # see Plugin Support section for installation
+
+  # 3. Run the following to view the trace
+  # Att-specific options must come right after the assembly file
+  rocprofv2 -i input.txt --plugin att --mode network
+  ```
+
+  ```bash
+  # Example for vectoradd on navi31.
+  # Special attention to gfx1100.s==navi31 in the ISA file name.
+  # Use gfx1030 for navi21, gfx90a for MI200 and gfx940 for MI300
+  hipcc -g --save-temps vectoradd_hip.cpp -o vectoradd_hip.exe
+  rocprofv2 -i input.txt --plugin att vectoradd_hip-hip-amdgcn-amd-amdhsa-gfx1100.s --mode network ./vectoradd_hip.exe
+  # Then open the browser at http://localhost:8000
+  # The ISA can also be obtained from llvm/roc objdump, however, annotations will be different
+  ```
+
+  For MPI or very long applications, we recommend running only the collection first, and later running the parser on the already-collected data:
+
+  ```bash
+  # Run only collection: The assembly file is not used. Use mpirun [...] rocprofv2 [...] if needed.
+  rocprofv2 -i input.txt --plugin att none ./vectoradd_hip.exe
+  # Remove the binary/application: Only runs the parser.
+  rocprofv2 -i input.txt --plugin att vectoradd_hip-hip-amdgcn-amd-amdhsa-gfx1100.s --mode network
+  ```
+
+- ##### app_assembly_file_relative_path
+
+  AMDGCN ISA file with .s extension generated in the 1st step
+
+- ##### app_relative_path
+
+  Path for the running application
+
+- ##### ATT plugin optional parameters
+
+  - --depth [n]: How many waves per slot to parse (maximum).
+  - --mpi [proc]: Parse with this many MPI processes, for greater analysis speed. Does not change results. Requires mpi4py.
+  - --att_kernel "filename": Kernel filename to use (instead of ATT asking which one to use).
+  - --trace_file "files": glob (wildcards allowed) of trace files to parse. Requires quotes for use with wildcards.
+  - --mode [network, file, off (default)]
+
+- ##### network
+
+  Opens the server with the browser UI.
+  att needs 2 ports available (e.g. 8000, 18000). There is an option (default: --ports "8000,18000") to change these.
+  If rocprofv2 is running on a different machine, use port forwarding "ssh -L 8000:localhost:8000 " so the browser can be used locally. For Docker, use --network=host --ipc=host -p8000:8000 -p18000:18000
+
+- ##### file
+
+  Dumps the analyzed json files to disk for viewing at a later time. Run python3 httpserver.py from within the generated ui/ folder to view the trace, similarly to network mode. The folder can be copied to another machine, and will run without ROCm.
+
+- ##### off
+
+  Runs trace collection but not analysis, so it can be analyzed at a later time. Run rocprofv2 ATT [network, file] with the same parameters, removing the application binary, to analyze previously generated traces. We recommend not setting the mode when collecting for MPI applications.
+
+- ##### input.txt
+
+  Required. Used to select specific compute units and other trace parameters.
+  For first-time users, we recommend compiling and running vectorAdd with
+
+  ```bash
+  att: TARGET_CU=1
+  SE_MASK=0x1
+  SIMD_MASK=0x3
+  ```
+
+  and histogram with
+
+  ```bash
+  att: TARGET_CU=0
+  SE_MASK=0xFF
+  SIMD_MASK=0xF // 0xF for GFX9, SIMD_MASK=0 for Navi
+  ```
+
+  Possible contents:
+  - att: TARGET_CU=1 //or some other CU [0,15] - WGP for Navi [0,8]
+  - SE_MASK=0x1 // bitmask of shader engines. The fewer, the easier on the hardware. Default enables 1 out of 4 shader engines.
+  - SIMD_MASK=0xF // GFX9: bitmask of SIMDs. Navi: SIMD Index [0-3].
+  - DISPATCH=ID,RN // collect trace only for the given dispatch_ID and MPI rank RN. RN is optional and ignored for single processes. Multiple lines with varying combinations of RN and ID can be added.
+  - KERNEL=kernname // Profile only kernels containing the string kernname (c++ mangled name). Multiple lines can be added.
+  - PERFCOUNTERS_COL_PERIOD=0x3 // Multiplier period for counter collection [0~31]. 0=fastest (usually once every 16 cycles). GFX9 only. Counters will be shown in a graph over time in the browser UI.
+  - PERFCOUNTER=counter_name // Add a SQ counter to be collected with ATT; period defined by PERFCOUNTERS_COL_PERIOD. GFX9 only.
+  - BUFFER_SIZE=[size] // Sets size of the ATT buffer collection, per dispatch, in megabytes (shared among all shader engines).
 
 ## Tests
 
@@ -476,6 +571,12 @@ rocprofiler-tests-9.0.0-local.x86_64.rpm
 rocprofv2 -t
 ```
 
+OR
+
+```bash
+ctest
+```
+
 ### Guidelines for adding new tests
 
 - Prefer to enhance an existing test as opposed to writing a new one. Tests have overhead to start and many small tests spend precious test time on startup and initialization issues.
@@ -561,7 +662,7 @@ samples can be run as independent executables once installed
 
 ## Support
 
-Please report in the Github Issues
+Please report issues via GitHub Issues.
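Following up on the `ctest` alternative shown under Tests above: once the build tree is configured with `enable_testing()`/`include(CTest)` (as this change now does), the suite can also be run selectively. A small sketch; the name pattern is illustrative, while the `v1` label is the one this PR's navi32 CI job excludes with `-LE v1`:

```bash
cd build
# Run only the tests whose names match a regex, with verbose output
ctest -R rocprofv2 -VV
# Exclude tests carrying the v1 label, as the CI does on some runners
ctest -LE v1
```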
## Limitations diff --git a/bin/att_to_out.py b/bin/att_to_out.py index 6be60a50..84b45759 100755 --- a/bin/att_to_out.py +++ b/bin/att_to_out.py @@ -22,19 +22,23 @@ import numpy as np import sys -BYTE_MAP = [str(k) for k in range(10)] + ['a', 'b', 'c', 'd', 'e', 'f'] + +BYTE_MAP = [str(k) for k in range(10)] + ["a", "b", "c", "d", "e", "f"] + def map8(c): - return BYTE_MAP[(c//16)%16]+BYTE_MAP[c%16] + return BYTE_MAP[(c // 16) % 16] + BYTE_MAP[c % 16] + def map16(c): - return map8(c>>8)+map8(c) + return map8(c >> 8) + map8(c) + in_filename = sys.argv[1] -out_filename = in_filename.split('.att')[0]+'.out' +out_filename = in_filename.split(".att")[0] + ".out" in_bytes = np.fromfile(in_filename, dtype=np.uint16) -out_bytes = [map16(c)+'\n' for c in in_bytes] +out_bytes = [map16(c) + "\n" for c in in_bytes] -with open(out_filename, 'w') as f: +with open(out_filename, "w") as f: [f.write(b) for b in out_bytes] diff --git a/bin/dform.py b/bin/dform.py index a417b7ad..97dbc621 100644 --- a/bin/dform.py +++ b/bin/dform.py @@ -23,51 +23,85 @@ import os from sqlitedb import SQLiteDB + def gen_message(outfile): - if outfile != '': - print("File '" + outfile + "' is generating") - -def post_process_data(db, table_name, outfile = ''): -# db.add_data_column('A', 'DispDurNs', 'INTEGER', 'BeginNs - DispatchNs') -# db.add_data_column('A', 'ComplDurNs', 'INTEGER', 'CompleteNs - EndNs') -# db.add_data_column('A', 'TotalDurNs', 'INTEGER', 'CompleteNs - DispatchNs') -# db.add_data_column(table_name, 'TimeNs', 'INTEGER', 'BeginNs - %d' % start_ns) - db.add_data_column(table_name, 'DurationNs', 'INTEGER', 'EndNs - BeginNs') - if outfile != '': db.dump_csv(table_name, outfile) - gen_message(outfile) + if outfile != "": + print("File '" + outfile + "' is generating") + + +def post_process_data(db, table_name, outfile=""): + # db.add_data_column('A', 'DispDurNs', 'INTEGER', 'BeginNs - DispatchNs') + # db.add_data_column('A', 'ComplDurNs', 'INTEGER', 'CompleteNs - EndNs') + # db.add_data_column('A', 'TotalDurNs', 'INTEGER', 'CompleteNs - DispatchNs') + # db.add_data_column(table_name, 'TimeNs', 'INTEGER', 'BeginNs - %d' % start_ns) + db.add_data_column(table_name, "DurationNs", "INTEGER", "EndNs - BeginNs") + if outfile != "": + db.dump_csv(table_name, outfile) + gen_message(outfile) + def gen_data_bins(db, outfile): - db.execute('create view C as select Name, Calls, TotalDurationNs, TotalDurationNs/Calls as AverageNs, TotalDurationNs*100.0/(select sum(TotalDurationNs) from %s) as Percentage from %s order by TotalDurationNs desc;' % ('B', 'B')); - db.dump_csv('C', outfile) - db.execute('DROP VIEW C') + db.execute( + "create view C as select Name, Calls, TotalDurationNs, TotalDurationNs/Calls as AverageNs, TotalDurationNs*100.0/(select sum(TotalDurationNs) from %s) as Percentage from %s order by TotalDurationNs desc;" + % ("B", "B") + ) + db.dump_csv("C", outfile) + db.execute("DROP VIEW C") + def gen_table_bins(db, table, outfile, name_var, dur_ns_var): - db.execute('create view B as select (%s) as Name, count(%s) as Calls, sum(%s) as TotalDurationNs from %s group by %s' % (name_var, name_var, dur_ns_var, table, name_var)) - gen_data_bins(db, outfile) - db.execute('DROP VIEW B') - gen_message(outfile) + db.execute( + "create view B as select (%s) as Name, count(%s) as Calls, sum(%s) as TotalDurationNs from %s group by %s" + % (name_var, name_var, dur_ns_var, table, name_var) + ) + gen_data_bins(db, outfile) + db.execute("DROP VIEW B") + gen_message(outfile) + def gen_api_json_trace(db, table, start_ns, outfile): - 
db.execute('create view B as select "Index", Name as name, __section as pid, __lane as tid, ((BeginNs - %d)/1000) as ts, (DurationNs/1000) as dur from %s;' % (start_ns, table)); - db.dump_json('B', table, outfile) - db.execute('DROP VIEW B') - gen_message(outfile) + db.execute( + 'create view B as select "Index", Name as name, __section as pid, __lane as tid, ((BeginNs - %d)/1000) as ts, (DurationNs/1000) as dur from %s;' + % (start_ns, table) + ) + db.dump_json("B", table, outfile) + db.execute("DROP VIEW B") + gen_message(outfile) + def gen_ext_json_trace(db, table, start_ns, outfile): - db.execute('create view B as select Name as name, __section as pid, __lane as tid, ((BeginNs - %d)/1000) as ts, ((EndNs - BeginNs)/1000) as dur from %s;' % (start_ns, table)); - db.dump_json('B', table, outfile) - db.execute('DROP VIEW B') - gen_message(outfile) + db.execute( + "create view B as select Name as name, __section as pid, __lane as tid, ((BeginNs - %d)/1000) as ts, ((EndNs - BeginNs)/1000) as dur from %s;" + % (start_ns, table) + ) + db.dump_json("B", table, outfile) + db.execute("DROP VIEW B") + gen_message(outfile) + def gen_ops_json_trace(db, table, base_pid, start_ns, outfile): - db.execute('create view B as select "Index", "%s" as name, ("dev-id" + %d) as pid, __lane as tid, ((BeginNs - %d)/1000) as ts, (DurationNs/1000) as dur from %s;' % ('roctx-range' if 'ROCP_RENAME_KERNEL' in os.environ else 'Name',base_pid, start_ns, table)); - db.dump_json('B', table, outfile) - db.execute('DROP VIEW B') - gen_message(outfile) + db.execute( + 'create view B as select "Index", "%s" as name, ("dev-id" + %d) as pid, __lane as tid, ((BeginNs - %d)/1000) as ts, (DurationNs/1000) as dur from %s;' + % ( + "roctx-range" if "ROCP_RENAME_KERNEL" in os.environ else "Name", + base_pid, + start_ns, + table, + ) + ) + db.dump_json("B", table, outfile) + db.execute("DROP VIEW B") + gen_message(outfile) + def gen_kernel_json_trace(db, table, base_pid, start_ns, outfile): - db.execute('create view B as select "Index", KernelName as name, ("gpu-id" + %d) as pid, tid, ((BeginNs - %d)/1000) as ts, (DurationNs/1000) as dur from %s;' % (base_pid, start_ns, table)); - db.dump_json('B', table, outfile) - db.execute('DROP VIEW B') - gen_message(outfile) + db.execute( + 'create view B as select "Index", KernelName as name, ("gpu-id" + %d) as pid, tid, ((BeginNs - %d)/1000) as ts, (DurationNs/1000) as dur from %s;' + % (base_pid, start_ns, table) + ) + db.dump_json("B", table, outfile) + db.execute("DROP VIEW B") + gen_message(outfile) + + ############################################################################################## diff --git a/bin/mem_manager.py b/bin/mem_manager.py index 4d654ad3..4f51ba50 100755 --- a/bin/mem_manager.py +++ b/bin/mem_manager.py @@ -23,368 +23,464 @@ import sys, os, re from sqlitedb import SQLiteDB -pinned = ['hipMallocHost', 'hipHostMalloc', 'hipHostAlloc'] -ondevice = ['hipMalloc', 'hipMallocPitch', 'hipMallocArray', 'hipMalloc3DArray', 'hsa_amd_memory_pool_allocate'] +pinned = ["hipMallocHost", "hipHostMalloc", "hipHostAlloc"] +ondevice = [ + "hipMalloc", + "hipMallocPitch", + "hipMallocArray", + "hipMalloc3DArray", + "hsa_amd_memory_pool_allocate", +] mm_table_descr = [ - ['BeginNs', 'EndNs', 'pid', 'tid', 'Name', 'Direction', 'SrcType', 'DstType', 'Size', 'Async'], - {'BeginNs':'INTEGER', 'EndNs':'INTEGER', 'pid':'INTEGER', 'tid':'INTEGER', 'Name':'TEXT', 'Direction':'TEXT', 'SrcType':'TEXT', 'DstType':'TEXT', 'Size':'INTEGER', 'Async':'TEXT'} + [ + "BeginNs", + "EndNs", + "pid", + 
"tid", + "Name", + "Direction", + "SrcType", + "DstType", + "Size", + "Async", + ], + { + "BeginNs": "INTEGER", + "EndNs": "INTEGER", + "pid": "INTEGER", + "tid": "INTEGER", + "Name": "TEXT", + "Direction": "TEXT", + "SrcType": "TEXT", + "DstType": "TEXT", + "Size": "INTEGER", + "Async": "TEXT", + }, ] + def fatal(msg): - sys.stderr.write(sys.argv[0] + ": " + msg + "\n"); - sys.exit(1) + sys.stderr.write(sys.argv[0] + ": " + msg + "\n") + sys.exit(1) + + +DELIM = "," -DELIM = ',' # Mem copy manager class class MemManager: - - def __init__(self, db, indir): - self.db = db - self.allocations = {} - self.hsa_agent_types = {} - self.memcopies = {} - self.filename = '' - self.fd = 0 - self.parse_hsa_handles(indir + '/' + 'hsa_handles.txt'); - - def __del__(self): - if self.fd != 0: self.fd.close() - - # Parsing the mapping of HSA agent and memory pool handles - def parse_hsa_handles(self, infile): - if os.path.exists(infile): - inp = open(infile, 'r') - cpu_agent_ptrn = re.compile(r'(0x[0-9a-fA-F]+) agent cpu') - gpu_agent_ptrn = re.compile(r'(0x[0-9a-fA-F]+) agent gpu') - for line in inp.readlines(): - m_cpu = cpu_agent_ptrn.match(line) - if m_cpu: - self.hsa_agent_types[str(int(m_cpu.group(1),16))] = 0 # "cpu" - m_gpu = gpu_agent_ptrn.match(line) - if m_gpu: - self.hsa_agent_types[str(int(m_gpu.group(1),16))] = 1 # "gpu" - inp.close() - - # register alloc and memcpy API calls - # ['BeginNs', 'EndNs', 'pid', 'tid', 'Name', 'args', 'Index', 'Data'], - def register_api(self, rec_vals): - res = '' - record_name = rec_vals[4] # 'Name' - record_args = rec_vals[5] # 'args' - malloc_ptrn = re.compile(r'hip.*Malloc|hsa_amd_memory_pool_allocate') - mcopy_ptrn = re.compile(r'hipMemcpy|hsa_amd_memory_async_copy') - - if malloc_ptrn.match(record_name): - self.add_allocation(record_name, record_args) - elif mcopy_ptrn.match(record_name): - res = self.add_memcpy(rec_vals) - - return res - - - # register memcpy asynchronous copy - # ['BeginNs', 'EndNs', 'Name', 'pid', 'tid', 'Index', ... - def register_copy(self, rec_vals): - data = '' - event = rec_vals[2] # 'Name' - procid = rec_vals[3] # 'pid' - recordid = rec_vals[5] # 'Index' - size_ptrn = re.compile(DELIM + 'Size=(\d+)' + DELIM) - # query syncronous memcopy API record - key = (recordid, procid, 0) - if key in self.memcopies: - data = self.memcopies[key] - - # query asyncronous memcopy API record - key = (recordid, procid, 1) - if key in self.memcopies: - if data != '': fatal('register_copy: corrupted record sync/async') - async_copy_start_time = rec_vals[0] - async_copy_end_time = rec_vals[1] - - tid = rec_vals[4] - copy_line_header = str(async_copy_start_time) + DELIM + str(async_copy_end_time) + DELIM + str(procid) + DELIM + str(tid) - copy_line_footer = 'Async=' + str(1) - data = copy_line_header + self.memcopies[key] + copy_line_footer - self.memcopies[key] = data - - return data - - # register memcpy asynchronous activity - # rec_vals: ['BeginNs', 'EndNs', 'dev-id', 'queue-id', 'Name', 'pid', 'tid', 'Index', 'Data', ... 
- def register_activity(self, rec_vals): - data = '' - procid = rec_vals[5] # 'pid' - recordid = rec_vals[7] # 'Index' - - # query syncronous memcopy API record - key = (recordid, procid, 0) - if key in self.memcopies: - data = self.memcopies[key] - - # query asyncronous memcopy API record - key = (recordid, procid, 1) - if key in self.memcopies: - if data != '': fatal('register_activity: corrupted record sync/async') - - async_copy_start_time = rec_vals[0] - async_copy_end_time = rec_vals[1] - - tid = rec_vals[6] - copy_line_header = str(async_copy_start_time) + DELIM + str(async_copy_end_time) + DELIM + str(procid) + DELIM + str(tid) - copy_line_footer = 'Async=' + str(1) - data = copy_line_header + self.memcopies[key] + copy_line_footer - self.memcopies[key] = data - - return data - - # add allocation to map - def add_allocation(self, event, args): - choice = 0 - if event == "hipMallocPitch": - malloc_args_ptrn = re.compile(r'\(ptr\((.*)\) width\((.*)\) height\((.*)\)\)') - choice = 1 - elif event == "hipMallocArray": - malloc_args_ptrn = re.compile(r'\(array\((.*)\) width\((.*)\) height\((.*)\)\)') - choice = 1 - elif event == "hipMalloc3DArray": - malloc_args_ptrn = re.compile(r'\(array\((.*)\) width\((.*)\) height\((.*)\) depth\((.*)\)\)') - choice = 2 - elif event == "hsa_amd_memory_pool_allocate": - #({handle=25291264}, 40, 0, 0x7ffc4c7bf1b0) - malloc_args_ptrn = re.compile(r'\({handle=\d+}, (\d+), \d+, (0x[0-9a-fA-F]+)\)') - choice = 4 - else: - #(ptr(0x7f3407000000) size(800000000) flags(0)) - malloc_args_ptrn = re.compile(r'\(ptr\((.*)\) size\((.*)\) .*\)') - choice = 3 - m = malloc_args_ptrn.match(args) - if m: - if choice == 4: - ptr = int(m.group(2), 16) - size = int(m.group(1)) - elif choice == 3: - ptr = int(m.group(1), 16) - size = int(m.group(2)) - elif choice == 1: - ptr = int(m.group(1), 16) - size = int(m.group(2)) * int(m.group(3)) - else: - ptr = int(m.group(1), 16) - size = int(m.group(2)) * int(m.group(3)) * int(m.group(4)) - self.allocations[ptr] = (size, event) - - #get type of ptr - def get_ptr_type(self, ptr): - addr = int(ptr, 16) - addr_type = 'unknown' - found = 0 - for base, (size, event) in self.allocations.items(): - if addr >= base and addr < base + size: - found = 1 - break - if not found: - addr_type = 'pageable' - elif event in pinned: - addr_type = 'pinned' - elif event in ondevice: - addr_type = 'device' - elif ptr in self.hsa_agent_types: - if self.hsa_agent_types[ptr] == 0: - addr_type = 'pinned' - elif self.hsa_agent_types[ptr] == 1: - addr_type = 'device' - else: - fatal('internal error: ptr(' + ptr + ') cannot be identified') - else: - fatal('internal error: ptr(' + ptr + ') cannot be identified') - return addr_type - - # add memcpy to map - def add_memcpy(self, recvals): - recordid = recvals[6] #same as corrid - event = recvals[4] - start_time = recvals[0] # sync time stamp - end_time = recvals[1] # sync time stamp - args = recvals[5] - procid = int(recvals[2]) # used to query async entries - pid = recvals[2] - tid = recvals[3] - - # hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind) - hip_memcpy_ptrn = re.compile(r'\(\s*dst\((.*)\) src\((.*)\) sizeBytes\((\d+)\).*\)') - # hipMemcpy2D(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width, - # size_t height, hipMemcpyKind kind); - hip_memcpy_ptrn2 = re.compile(r'\(\s*dst\((.*)\) .* src\((.*)\) .* width\((\d+)\) height\((\d+)\).*\)') - # hipMemcpyToArray(hipArray* dst, size_t wOffset, size_t hOffset, const void* src, - # size_t count, hipMemcpyKind 
kind); - hip_memcpy_ptrn3 = re.compile(r'\(\s*dst\((.*)\) .* src\((.*)\) count\((\d+)\).*\)') - # hipMemcpyToSymbol(const void* symbolName, const void* src, size_t sizeBytes, - # size_t offset = 0, hipMemcpyKind kind) - hip_memcpy_ptrn4 = re.compile(r'\(\s*symbol\((.*)\) src\((.*)\) sizeBytes\((\d+)\).*\)') - # memcopy with kind argument - hip_memcpy_ptrn_kind = re.compile(r'.* kind\((\d+)\)\s*.*') - #hsa_amd_memory_async_copy(void* dst, hsa_agent_t dst_agent, const void* src, - # hsa_agent_t src_agent, size_t size, - # uint32_t num_dep_signals, - # const hsa_signal_t* dep_signals, - # hsa_signal_t completion_signal); - # "(0x7f8ab6600000, 27064880, 0x7f8b16000000, 27059968, 800000000, 0, 0, 140240759809536) = 0" - # hsa_memcpy_ptrn_prev used to support format transition and will be cleaned up later. - hsa_memcpy_ptrn_prev = re.compile(r'\((0x[0-9a-fA-F]+), (\d+), (0x[0-9a-fA-F]+), (\d+), (\d+), .*\) = \d') - # "(0x7fd83bc00000, {handle=16124864}, 0x7fd89b600000, {handle=16119808}, 800000000, 0, 0, {handle=140573877724672}) = 0" - hsa_memcpy_ptrn = re.compile(r'\((0x[0-9a-fA-F]+), {handle=(\d+)}, (0x[0-9a-fA-F]+), {handle=(\d+)}, (\d+), .*\) = \d') - # "(0x7f9125cfe7b0, 0x7f9125cfe784, 0x7f9125cfe790, 0x7f9125cfe784, 0x7f9125cfe778, {handle=94324038652880}, 1, 0, 0, {handle=140261380710784}) = 0" - # dst, dst_offset, src, src_offset, range, copy_agent, dir, num_dep_signals, dep_signals, completion_signal - hsa_memcpy_ptrn2 = re.compile(r'\((0x[0-9a-fA-F]+), 0x[0-9a-fA-F]+, (0x[0-9a-fA-F]+), 0x[0-9a-fA-F]+, 0x[0-9a-fA-F]+, {z=(\d+), y=(\d+), x=(\d+)}, {handle=(\d+)}, .*\) = \d') - # aysnc memcopy - async_event_ptrn = re.compile(r'Async|async') - m_basic_hip = hip_memcpy_ptrn.match(args) - m_basic_hsa3 = hip_memcpy_ptrn4.match(args) - m_basic_hsa_prev = hsa_memcpy_ptrn_prev.match(args) - m_basic_hsa = hsa_memcpy_ptrn.match(args) - m_basic_hsa2 = hsa_memcpy_ptrn2.match(args) - is_hip = True if not (m_basic_hsa_prev or m_basic_hsa or m_basic_hsa2) else False - m_2d = hip_memcpy_ptrn2.match(args) - m_array = hip_memcpy_ptrn3.match(args) - is_async = 1 if async_event_ptrn.search(event) else 0 - copy_line = '' - size = 0 - dstptr_type = 'unknown' - srcptr_type = 'unknown' - direction = 'unknown' - kind_switcher = { - '0': "HtoH", - '1': "HtoD", - '2': "DtoH", - '3': "DtoD", - '4': "auto", - } - - condition_matched = False - if m_basic_hip: - dstptr = m_basic_hip.group(1) - dstptr_type = self.get_ptr_type(dstptr) - srcptr = m_basic_hip.group(2) - srcptr_type = self.get_ptr_type(srcptr) - size = int(m_basic_hip.group(3)) - condition_matched = True - - if m_basic_hsa_prev: - dstptr = m_basic_hsa_prev.group(1) - dst_agent_ptr = m_basic_hsa_prev.group(2) - dstptr_type = self.get_ptr_type(dst_agent_ptr) - srcptr = m_basic_hsa_prev.group(3) - src_agent_ptr = m_basic_hsa_prev.group(4) - srcptr_type = self.get_ptr_type(src_agent_ptr) - size = int(m_basic_hsa_prev.group(5)) - condition_matched = True - - if m_basic_hsa: - dstptr = m_basic_hsa.group(1) - dst_agent_ptr = m_basic_hsa.group(2) - dstptr_type = self.get_ptr_type(dst_agent_ptr) - srcptr = m_basic_hsa.group(3) - src_agent_ptr = m_basic_hsa.group(4) - srcptr_type = self.get_ptr_type(src_agent_ptr) - size = int(m_basic_hsa.group(5)) - condition_matched = True - - if m_basic_hsa2: - dstptr = m_basic_hsa2.group(1) - dst_agent_ptr = m_basic_hsa2.group(6) - dstptr_type = self.get_ptr_type(dst_agent_ptr) - srcptr = m_basic_hsa2.group(2) - src_agent_ptr = m_basic_hsa2.group(6) - srcptr_type = self.get_ptr_type(src_agent_ptr) - z = 
int(m_basic_hsa2.group(3)) - y = int(m_basic_hsa2.group(4)) - x = int(m_basic_hsa2.group(5)) - size = x*y*z - condition_matched = True - - if m_basic_hsa3: - dstptr = m_basic_hsa3.group(1) - dstptr_type = self.get_ptr_type(dstptr) - srcptr = m_basic_hsa3.group(2) - srcptr_type = self.get_ptr_type(srcptr) - size = int(m_basic_hsa3.group(3)) - condition_matched = True - - if m_array: - dstptr = m_array.group(1) - dstptr_type = self.get_ptr_type(dstptr) - srcptr = m_array.group(2) - srcptr_type = self.get_ptr_type(srcptr) - size = m_array.group(3) - condition_matched = True - - if m_2d: - dstptr = m_2d.group(1) - dstptr_type = self.get_ptr_type(dstptr) - srcptr = m_2d.group(2) - srcptr_type = self.get_ptr_type(srcptr) - size = int(m_2d.group(3))*int(m_2d.group(4)) - condition_matched = True - - if not condition_matched: fatal('Memcpy args \"' + args + '\" cannot be identified') - - if not is_async: - start_time = recvals[0] # sync time stamp - end_time = recvals[1] # sync time stamp - duration = (int(end_time) - int(start_time)) - - - evt_switcher = { - 'hipMemcpyDtoD': "DtoD", - 'hipMemcpyDtoDAsync': "DtoD", - 'hipMemcpyDtoH': "DtoH", - 'hipMemcpyDtoHAsync': "DtoH", - 'hipMemcpyHtoD': "HtoD", - 'hipMemcpyHtoDAsync': "HtoD", - } - - if is_hip: - m = hip_memcpy_ptrn_kind.match(args) - if m: - direction = kind_switcher.get(m.group(1), "unknown") - else: - direction = evt_switcher.get(event, "unknown") - else: - if dst_agent_ptr in self.hsa_agent_types and src_agent_ptr in self.hsa_agent_types: - if self.hsa_agent_types[src_agent_ptr] == 1: direction = 'D' - elif self.hsa_agent_types[src_agent_ptr] == 0: direction = 'H' - if direction != 'unknown': direction += 'to' - if self.hsa_agent_types[dst_agent_ptr] == 1: direction += 'D' - elif self.hsa_agent_types[dst_agent_ptr] == 0: direction += 'H' - - copy_line_header = '' - copy_line_footer = '' - copy_line_header = str(start_time) + DELIM + str(end_time) + DELIM + str(pid) + DELIM + str(tid) - copy_line_footer = 'Async=' + str(is_async) - - copy_line = copy_line_header + DELIM + event + DELIM + 'Direction=' + direction + DELIM + 'SrcType=' + srcptr_type + DELIM + 'DstType=' + dstptr_type + DELIM + "Size=" + str(size) + DELIM + copy_line_footer - - self.memcopies[(recordid, procid, is_async)] = copy_line - return copy_line; - - def dump_data(self, table_name, file_name): - # To create memcopy info table in DB - print("File '" + file_name + "' is generating") - table_handle = self.db.add_table(table_name, mm_table_descr) - - fld_ptrn = re.compile(r'(.*)=(.*)') - for (key, record) in self.memcopies.items(): - rec_vals_array = [] - for rec in record.split(DELIM): - fld_ptrnm = fld_ptrn.match(rec) - if fld_ptrnm: - rec_vals_array.append(fld_ptrnm.group(2)) + def __init__(self, db, indir): + self.db = db + self.allocations = {} + self.hsa_agent_types = {} + self.memcopies = {} + self.filename = "" + self.fd = 0 + self.parse_hsa_handles(indir + "/" + "hsa_handles.txt") + + def __del__(self): + if self.fd != 0: + self.fd.close() + + # Parsing the mapping of HSA agent and memory pool handles + def parse_hsa_handles(self, infile): + if os.path.exists(infile): + inp = open(infile, "r") + cpu_agent_ptrn = re.compile(r"(0x[0-9a-fA-F]+) agent cpu") + gpu_agent_ptrn = re.compile(r"(0x[0-9a-fA-F]+) agent gpu") + for line in inp.readlines(): + m_cpu = cpu_agent_ptrn.match(line) + if m_cpu: + self.hsa_agent_types[str(int(m_cpu.group(1), 16))] = 0 # "cpu" + m_gpu = gpu_agent_ptrn.match(line) + if m_gpu: + self.hsa_agent_types[str(int(m_gpu.group(1), 16))] = 1 # 
"gpu" + inp.close() + + # register alloc and memcpy API calls + # ['BeginNs', 'EndNs', 'pid', 'tid', 'Name', 'args', 'Index', 'Data'], + def register_api(self, rec_vals): + res = "" + record_name = rec_vals[4] # 'Name' + record_args = rec_vals[5] # 'args' + malloc_ptrn = re.compile(r"hip.*Malloc|hsa_amd_memory_pool_allocate") + mcopy_ptrn = re.compile(r"hipMemcpy|hsa_amd_memory_async_copy") + + if malloc_ptrn.match(record_name): + self.add_allocation(record_name, record_args) + elif mcopy_ptrn.match(record_name): + res = self.add_memcpy(rec_vals) + + return res + + # register memcpy asynchronous copy + # ['BeginNs', 'EndNs', 'Name', 'pid', 'tid', 'Index', ... + def register_copy(self, rec_vals): + data = "" + event = rec_vals[2] # 'Name' + procid = rec_vals[3] # 'pid' + recordid = rec_vals[5] # 'Index' + size_ptrn = re.compile(DELIM + "Size=(\d+)" + DELIM) + # query syncronous memcopy API record + key = (recordid, procid, 0) + if key in self.memcopies: + data = self.memcopies[key] + + # query asyncronous memcopy API record + key = (recordid, procid, 1) + if key in self.memcopies: + if data != "": + fatal("register_copy: corrupted record sync/async") + async_copy_start_time = rec_vals[0] + async_copy_end_time = rec_vals[1] + + tid = rec_vals[4] + copy_line_header = ( + str(async_copy_start_time) + + DELIM + + str(async_copy_end_time) + + DELIM + + str(procid) + + DELIM + + str(tid) + ) + copy_line_footer = "Async=" + str(1) + data = copy_line_header + self.memcopies[key] + copy_line_footer + self.memcopies[key] = data + + return data + + # register memcpy asynchronous activity + # rec_vals: ['BeginNs', 'EndNs', 'dev-id', 'queue-id', 'Name', 'pid', 'tid', 'Index', 'Data', ... + def register_activity(self, rec_vals): + data = "" + procid = rec_vals[5] # 'pid' + recordid = rec_vals[7] # 'Index' + + # query syncronous memcopy API record + key = (recordid, procid, 0) + if key in self.memcopies: + data = self.memcopies[key] + + # query asyncronous memcopy API record + key = (recordid, procid, 1) + if key in self.memcopies: + if data != "": + fatal("register_activity: corrupted record sync/async") + + async_copy_start_time = rec_vals[0] + async_copy_end_time = rec_vals[1] + + tid = rec_vals[6] + copy_line_header = ( + str(async_copy_start_time) + + DELIM + + str(async_copy_end_time) + + DELIM + + str(procid) + + DELIM + + str(tid) + ) + copy_line_footer = "Async=" + str(1) + data = copy_line_header + self.memcopies[key] + copy_line_footer + self.memcopies[key] = data + + return data + + # add allocation to map + def add_allocation(self, event, args): + choice = 0 + if event == "hipMallocPitch": + malloc_args_ptrn = re.compile(r"\(ptr\((.*)\) width\((.*)\) height\((.*)\)\)") + choice = 1 + elif event == "hipMallocArray": + malloc_args_ptrn = re.compile( + r"\(array\((.*)\) width\((.*)\) height\((.*)\)\)" + ) + choice = 1 + elif event == "hipMalloc3DArray": + malloc_args_ptrn = re.compile( + r"\(array\((.*)\) width\((.*)\) height\((.*)\) depth\((.*)\)\)" + ) + choice = 2 + elif event == "hsa_amd_memory_pool_allocate": + # ({handle=25291264}, 40, 0, 0x7ffc4c7bf1b0) + malloc_args_ptrn = re.compile( + r"\({handle=\d+}, (\d+), \d+, (0x[0-9a-fA-F]+)\)" + ) + choice = 4 else: - rec_vals_array.append(rec) - self.db.insert_entry(table_handle, rec_vals_array) - - # To dump the memcopy info table as CSV - self.db.dump_csv(table_name, file_name) + # (ptr(0x7f3407000000) size(800000000) flags(0)) + malloc_args_ptrn = re.compile(r"\(ptr\((.*)\) size\((.*)\) .*\)") + choice = 3 + m = malloc_args_ptrn.match(args) 
+ if m: + if choice == 4: + ptr = int(m.group(2), 16) + size = int(m.group(1)) + elif choice == 3: + ptr = int(m.group(1), 16) + size = int(m.group(2)) + elif choice == 1: + ptr = int(m.group(1), 16) + size = int(m.group(2)) * int(m.group(3)) + else: + ptr = int(m.group(1), 16) + size = int(m.group(2)) * int(m.group(3)) * int(m.group(4)) + self.allocations[ptr] = (size, event) + + # get type of ptr + def get_ptr_type(self, ptr): + addr = int(ptr, 16) + addr_type = "unknown" + found = 0 + for base, (size, event) in self.allocations.items(): + if addr >= base and addr < base + size: + found = 1 + break + if not found: + addr_type = "pageable" + elif event in pinned: + addr_type = "pinned" + elif event in ondevice: + addr_type = "device" + elif ptr in self.hsa_agent_types: + if self.hsa_agent_types[ptr] == 0: + addr_type = "pinned" + elif self.hsa_agent_types[ptr] == 1: + addr_type = "device" + else: + fatal("internal error: ptr(" + ptr + ") cannot be identified") + else: + fatal("internal error: ptr(" + ptr + ") cannot be identified") + return addr_type + + # add memcpy to map + def add_memcpy(self, recvals): + recordid = recvals[6] # same as corrid + event = recvals[4] + start_time = recvals[0] # sync time stamp + end_time = recvals[1] # sync time stamp + args = recvals[5] + procid = int(recvals[2]) # used to query async entries + pid = recvals[2] + tid = recvals[3] + + # hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind) + hip_memcpy_ptrn = re.compile( + r"\(\s*dst\((.*)\) src\((.*)\) sizeBytes\((\d+)\).*\)" + ) + # hipMemcpy2D(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width, + # size_t height, hipMemcpyKind kind); + hip_memcpy_ptrn2 = re.compile( + r"\(\s*dst\((.*)\) .* src\((.*)\) .* width\((\d+)\) height\((\d+)\).*\)" + ) + # hipMemcpyToArray(hipArray* dst, size_t wOffset, size_t hOffset, const void* src, + # size_t count, hipMemcpyKind kind); + hip_memcpy_ptrn3 = re.compile( + r"\(\s*dst\((.*)\) .* src\((.*)\) count\((\d+)\).*\)" + ) + # hipMemcpyToSymbol(const void* symbolName, const void* src, size_t sizeBytes, + # size_t offset = 0, hipMemcpyKind kind) + hip_memcpy_ptrn4 = re.compile( + r"\(\s*symbol\((.*)\) src\((.*)\) sizeBytes\((\d+)\).*\)" + ) + # memcopy with kind argument + hip_memcpy_ptrn_kind = re.compile(r".* kind\((\d+)\)\s*.*") + # hsa_amd_memory_async_copy(void* dst, hsa_agent_t dst_agent, const void* src, + # hsa_agent_t src_agent, size_t size, + # uint32_t num_dep_signals, + # const hsa_signal_t* dep_signals, + # hsa_signal_t completion_signal); + # "(0x7f8ab6600000, 27064880, 0x7f8b16000000, 27059968, 800000000, 0, 0, 140240759809536) = 0" + # hsa_memcpy_ptrn_prev used to support format transition and will be cleaned up later. 
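+        # (the older trace format printed agent handles as plain integers,
+        # while the current one wraps them as {handle=N}; both formats are
+        # matched below during the transition)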
+        hsa_memcpy_ptrn_prev = re.compile(
+            r"\((0x[0-9a-fA-F]+), (\d+), (0x[0-9a-fA-F]+), (\d+), (\d+), .*\) = \d"
+        )
+        # "(0x7fd83bc00000, {handle=16124864}, 0x7fd89b600000, {handle=16119808}, 800000000, 0, 0, {handle=140573877724672}) = 0"
+        hsa_memcpy_ptrn = re.compile(
+            r"\((0x[0-9a-fA-F]+), {handle=(\d+)}, (0x[0-9a-fA-F]+), {handle=(\d+)}, (\d+), .*\) = \d"
+        )
+        # "(0x7f9125cfe7b0, 0x7f9125cfe784, 0x7f9125cfe790, 0x7f9125cfe784, 0x7f9125cfe778, {handle=94324038652880}, 1, 0, 0, {handle=140261380710784}) = 0"
+        # dst, dst_offset, src, src_offset, range, copy_agent, dir, num_dep_signals, dep_signals, completion_signal
+        hsa_memcpy_ptrn2 = re.compile(
+            r"\((0x[0-9a-fA-F]+), 0x[0-9a-fA-F]+, (0x[0-9a-fA-F]+), 0x[0-9a-fA-F]+, 0x[0-9a-fA-F]+, {z=(\d+), y=(\d+), x=(\d+)}, {handle=(\d+)}, .*\) = \d"
+        )
+        # async memcopy
+        async_event_ptrn = re.compile(r"Async|async")
+        m_basic_hip = hip_memcpy_ptrn.match(args)
+        m_basic_hsa3 = hip_memcpy_ptrn4.match(args)
+        m_basic_hsa_prev = hsa_memcpy_ptrn_prev.match(args)
+        m_basic_hsa = hsa_memcpy_ptrn.match(args)
+        m_basic_hsa2 = hsa_memcpy_ptrn2.match(args)
+        is_hip = True if not (m_basic_hsa_prev or m_basic_hsa or m_basic_hsa2) else False
+        m_2d = hip_memcpy_ptrn2.match(args)
+        m_array = hip_memcpy_ptrn3.match(args)
+        is_async = 1 if async_event_ptrn.search(event) else 0
+        copy_line = ""
+        size = 0
+        dstptr_type = "unknown"
+        srcptr_type = "unknown"
+        direction = "unknown"
+        kind_switcher = {
+            "0": "HtoH",
+            "1": "HtoD",
+            "2": "DtoH",
+            "3": "DtoD",
+            "4": "auto",
+        }
+
+        condition_matched = False
+        if m_basic_hip:
+            dstptr = m_basic_hip.group(1)
+            dstptr_type = self.get_ptr_type(dstptr)
+            srcptr = m_basic_hip.group(2)
+            srcptr_type = self.get_ptr_type(srcptr)
+            size = int(m_basic_hip.group(3))
+            condition_matched = True
+
+        if m_basic_hsa_prev:
+            dstptr = m_basic_hsa_prev.group(1)
+            dst_agent_ptr = m_basic_hsa_prev.group(2)
+            dstptr_type = self.get_ptr_type(dst_agent_ptr)
+            srcptr = m_basic_hsa_prev.group(3)
+            src_agent_ptr = m_basic_hsa_prev.group(4)
+            srcptr_type = self.get_ptr_type(src_agent_ptr)
+            size = int(m_basic_hsa_prev.group(5))
+            condition_matched = True
+
+        if m_basic_hsa:
+            dstptr = m_basic_hsa.group(1)
+            dst_agent_ptr = m_basic_hsa.group(2)
+            dstptr_type = self.get_ptr_type(dst_agent_ptr)
+            srcptr = m_basic_hsa.group(3)
+            src_agent_ptr = m_basic_hsa.group(4)
+            srcptr_type = self.get_ptr_type(src_agent_ptr)
+            size = int(m_basic_hsa.group(5))
+            condition_matched = True
+
+        if m_basic_hsa2:
+            dstptr = m_basic_hsa2.group(1)
+            dst_agent_ptr = m_basic_hsa2.group(6)
+            dstptr_type = self.get_ptr_type(dst_agent_ptr)
+            srcptr = m_basic_hsa2.group(2)
+            src_agent_ptr = m_basic_hsa2.group(6)
+            srcptr_type = self.get_ptr_type(src_agent_ptr)
+            z = int(m_basic_hsa2.group(3))
+            y = int(m_basic_hsa2.group(4))
+            x = int(m_basic_hsa2.group(5))
+            size = x * y * z
+            condition_matched = True
+
+        if m_basic_hsa3:
+            dstptr = m_basic_hsa3.group(1)
+            dstptr_type = self.get_ptr_type(dstptr)
+            srcptr = m_basic_hsa3.group(2)
+            srcptr_type = self.get_ptr_type(srcptr)
+            size = int(m_basic_hsa3.group(3))
+            condition_matched = True
+
+        if m_array:
+            dstptr = m_array.group(1)
+            dstptr_type = self.get_ptr_type(dstptr)
+            srcptr = m_array.group(2)
+            srcptr_type = self.get_ptr_type(srcptr)
+            size = m_array.group(3)
+            condition_matched = True
+
+        if m_2d:
+            dstptr = m_2d.group(1)
+            dstptr_type = self.get_ptr_type(dstptr)
+            srcptr = m_2d.group(2)
+            srcptr_type = self.get_ptr_type(srcptr)
+            size = int(m_2d.group(3)) * int(m_2d.group(4))
+            condition_matched = True
+
+        if not
condition_matched: + fatal('Memcpy args "' + args + '" cannot be identified') + + if not is_async: + start_time = recvals[0] # sync time stamp + end_time = recvals[1] # sync time stamp + duration = int(end_time) - int(start_time) + + evt_switcher = { + "hipMemcpyDtoD": "DtoD", + "hipMemcpyDtoDAsync": "DtoD", + "hipMemcpyDtoH": "DtoH", + "hipMemcpyDtoHAsync": "DtoH", + "hipMemcpyHtoD": "HtoD", + "hipMemcpyHtoDAsync": "HtoD", + } + + if is_hip: + m = hip_memcpy_ptrn_kind.match(args) + if m: + direction = kind_switcher.get(m.group(1), "unknown") + else: + direction = evt_switcher.get(event, "unknown") + else: + if ( + dst_agent_ptr in self.hsa_agent_types + and src_agent_ptr in self.hsa_agent_types + ): + if self.hsa_agent_types[src_agent_ptr] == 1: + direction = "D" + elif self.hsa_agent_types[src_agent_ptr] == 0: + direction = "H" + if direction != "unknown": + direction += "to" + if self.hsa_agent_types[dst_agent_ptr] == 1: + direction += "D" + elif self.hsa_agent_types[dst_agent_ptr] == 0: + direction += "H" + + copy_line_header = "" + copy_line_footer = "" + copy_line_header = ( + str(start_time) + DELIM + str(end_time) + DELIM + str(pid) + DELIM + str(tid) + ) + copy_line_footer = "Async=" + str(is_async) + + copy_line = ( + copy_line_header + + DELIM + + event + + DELIM + + "Direction=" + + direction + + DELIM + + "SrcType=" + + srcptr_type + + DELIM + + "DstType=" + + dstptr_type + + DELIM + + "Size=" + + str(size) + + DELIM + + copy_line_footer + ) + + self.memcopies[(recordid, procid, is_async)] = copy_line + return copy_line + + def dump_data(self, table_name, file_name): + # To create memcopy info table in DB + print("File '" + file_name + "' is generating") + table_handle = self.db.add_table(table_name, mm_table_descr) + + fld_ptrn = re.compile(r"(.*)=(.*)") + for key, record in self.memcopies.items(): + rec_vals_array = [] + for rec in record.split(DELIM): + fld_ptrnm = fld_ptrn.match(rec) + if fld_ptrnm: + rec_vals_array.append(fld_ptrnm.group(2)) + else: + rec_vals_array.append(rec) + self.db.insert_entry(table_handle, rec_vals_array) + + # To dump the memcopy info table as CSV + self.db.dump_csv(table_name, file_name) diff --git a/bin/rocprofv2 b/bin/rocprofv2 index cf55435c..a2a86f66 100755 --- a/bin/rocprofv2 +++ b/bin/rocprofv2 @@ -1,8 +1,13 @@ -#!/bin/bash +#!/bin/bash -e + set -eo pipefail -CURRENT_DIR="$(dirname -- "$0")" -ROCPROFV2_DIR=$(dirname -- $(realpath ${BASH_SOURCE[0]})) -ROCM_DIR=$(dirname -- "$ROCPROFV2_DIR") + +# LD_PRELOAD on script will not get propagated +if [ -n "${ROCP_PRELOAD}" ]; then LD_PRELOAD="${ROCP_PRELOAD}"; fi + +CURRENT_DIR="$( dirname -- "$0"; )"; +ROCPROFV2_DIR=$(dirname -- $(realpath ${BASH_SOURCE[0]})); +ROCM_DIR=$( dirname -- "$ROCPROFV2_DIR"; ) PLUGIN_LIST=("ctf" "perfetto" "file" "att") RUN_FROM_BUILD=0 if [[ $ROCPROFV2_DIR == *"/build"* ]]; then diff --git a/bin/rpl_run.sh b/bin/rpl_run.sh index c1a3daa4..0f5b43c3 100755 --- a/bin/rpl_run.sh +++ b/bin/rpl_run.sh @@ -262,6 +262,7 @@ run() { MY_LD_PRELOAD="" if [ "$ROCTX_TRACE" = 1 ] ; then API_TRACE=${API_TRACE}":roctx" + MY_LD_PRELOAD="$TTLIB_PATH/libroctx64.so" fi if [ "$HIP_TRACE" = 1 ] ; then API_TRACE=${API_TRACE}":hip" @@ -273,18 +274,18 @@ run() { if [ "$HSA_TRACE" = 1 ] ; then export ROCTRACER_DOMAIN=$API_TRACE":hsa" MY_HSA_TOOLS_LIB="$MY_HSA_TOOLS_LIB $ROCM_LIB_PATH/libroctracer64.so.4" - MY_LD_PRELOAD="$TTLIB_PATH/libroctracer_tool.so" + MY_LD_PRELOAD="$MY_LD_PRELOAD:$TTLIB_PATH/libroctracer_tool.so" elif [ -n "$API_TRACE" ] ; then export ROCTRACER_DOMAIN=$API_TRACE 
OUTPUT_LIST="$ROCP_OUTPUT_DIR/" MY_HSA_TOOLS_LIB="$ROCM_LIB_PATH/libroctracer64.so.4" - MY_LD_PRELOAD="$TTLIB_PATH/libroctracer_tool.so" + MY_LD_PRELOAD="$MY_LD_PRELOAD:$TTLIB_PATH/libroctracer_tool.so" fi if [ "$ROCP_STATS_OPT" = 1 ] ; then if [ "$ROCTRACER_DOMAIN" = ":hip" ] ; then MY_HSA_TOOLS_LIB="$ROCM_LIB_PATH/libroctracer64.so.4" - MY_LD_PRELOAD="$TTLIB_PATH/libhip_stats.so" + MY_LD_PRELOAD="$MY_LD_PRELOAD:$TTLIB_PATH/libhip_stats.so" else error_message="ROCP_STATS_OPT is only available with --hip-trace option" echo $error_message diff --git a/bin/sqlitedb.py b/bin/sqlitedb.py index d1584e54..7dee934f 100644 --- a/bin/sqlitedb.py +++ b/bin/sqlitedb.py @@ -24,269 +24,326 @@ from functools import reduce from txt2params import gen_params + # SQLite Database class class SQLiteDB: - def __init__(self, file_name): - self.connection = sqlite3.connect(file_name) - self.tables = {} - self.section_index = 0 - - def __del__(self): - self.connection.close() - - # add DB table - def add_table(self, name, descr, extra = ()): - (field_list, field_dict) = descr - if name in self.tables: raise Exception('table is already added: "' + name + '"') - - # create DB table - table_descr = [] - for field in field_list: table_descr.append('"%s" %s' % (field, field_dict[field])) - for item in extra: table_descr.append('"%s" %s' % (item[0], item[1])) - stm = 'CREATE TABLE ' + name + ' (%s)' % ', '.join(table_descr) - cursor = self.connection.cursor() - cursor.execute(stm) - self.connection.commit() - - # register table - fields_str = ','.join(map(lambda x: '"' + x + '"', field_list)) - templ_str = ','.join('?' * len(field_list)) - stm = 'INSERT INTO ' + name + '(' + fields_str + ') VALUES(' + templ_str + ');' - self.tables[name] = stm - - return (cursor, stm); - - # add columns to table - def add_columns(self, name, columns): - cursor = self.connection.cursor() - for item in columns: - stm = 'ALTER TABLE ' + name + ' ADD COLUMN "%s" %s' % (item[0], item[1]) - cursor.execute(stm) - self.connection.commit() - - # add columns with expression - def add_data_column(self, table_name, data_label, data_type, data_expr): - cursor = self.connection.cursor() - cursor.execute('ALTER TABLE %s ADD COLUMN "%s" %s' % (table_name, data_label, data_type)) - cursor.execute('UPDATE %s SET %s = (%s);' % (table_name, data_label, data_expr)) - - def change_rec_name(self, table_name, rec_id, rec_name): - self.connection.execute('UPDATE ' + table_name + ' SET Name = ? WHERE "Index" = ?', (rec_name, rec_id)) - def change_rec_tid(self, table_name, rec_id, tid): - self.connection.execute('UPDATE ' + table_name + ' SET tid = ? 
WHERE "Index" = ?', (tid, rec_id)) - def change_rec_fld(self, table_name, fld_expr, rec_pat): - self.connection.execute('UPDATE ' + table_name + ' SET ' + fld_expr + ' WHERE ' + rec_pat) - def table_get_record(self, table_name, rec_pat): - cursor = self.connection.execute('SELECT * FROM ' + table_name + ' WHERE ' + rec_pat) - raws = cursor.fetchall() - if len(raws) != 1: raise Exception('Record (' + rec_pat + ') is not unique, table "' + table_name + '"') - return list(raws[0]) - - # populate DB table entry - def insert_entry(self, table, val_list): - (cursor, stm) = table - cursor.execute(stm, val_list) - - # populate DB table entry - def commit_entry(self, table, val_list): - self.insert_entry(table, val_list) - self.connection.commit() - - # populate DB table data - def insert_table(self, table, reader): - for val_list in reader: - if not val_list[-1]: val_list.pop() - self.insert_entry(table, val_list) - self.connection.commit() - - # return table fields list - def _get_fields(self, table_name): - cursor = self.connection.execute('SELECT * FROM ' + table_name) - return list(map(lambda x: '"%s"' % (x[0]), cursor.description)) - - # return table raws list - def _get_raws(self, table_name): - cursor = self.connection.execute('SELECT * FROM ' + table_name) - return cursor.fetchall() - def _get_raws_indexed(self, table_name): - cursor = self.connection.execute('SELECT * FROM ' + table_name + ' order by "Index" asc;') - return cursor.fetchall() - def _get_raw_by_id(self, table_name, rec_id): - cursor = self.connection.execute('SELECT * FROM ' + table_name + ' WHERE "Index"=?', (rec_id,)) - raws = cursor.fetchall() - if len(raws) != 1: raise Exception('Index is not unique, table "' + table_name + '"') - return list(raws[0]) - - def table_get_raws(self, table_name): - return self._get_raws(table_name) - - # dump CSV table - def dump_csv(self, table_name, file_name): - if not re.search(r'\.csv$', file_name): - raise Exception('wrong output file type: "' + file_name + '"' ) - - fields = self._get_fields(table_name) - with open(file_name, mode='w') as fd: - fd.write(','.join(fields) + '\n') - for raw in self._get_raws(table_name): - tmp = list(raw) - for idx in range(len(tmp)): - if type(tmp[idx]) == str: - if(not(tmp[idx][0] == tmp[idx][-1] == '"')): tmp[idx] = '"'+tmp[idx]+'"' - raw = tuple(tmp) - fd.write(reduce(lambda a, b: str(a) + ',' + str(b), raw) + '\n') - - # dump JSON trace - def open_json(self, file_name): - if not re.search(r'\.json$', file_name): - raise Exception('wrong output file type: "' + file_name + '"' ) - with open(file_name, mode='w') as fd: - fd.write('{ "traceEvents":[{}\n'); - - def close_json(self, file_name): - if not re.search(r'\.json$', file_name): - raise Exception('wrong output file type: "' + file_name + '"' ) - with open(file_name, mode='a') as fd: - fd.write('}') - - def label_json(self, pid, label, file_name): - if not re.search(r'\.json$', file_name): - raise Exception('wrong output file type: "' + file_name + '"' ) - with open(file_name, mode='a') as fd: - fd.write(',{"args":{"name":"%s"},"ph":"M","pid":%s,"name":"process_name","sort_index":%d}\n' %(label, pid, self.section_index)) - self.section_index += 1 - - def flow_json(self, base_id, from_pid, from_us_list, to_pid, to_us_dict, corr_id_list, file_name): - if not re.search(r'\.json$', file_name): - raise Exception('wrong output file type: "' + file_name + '"' ) - with open(file_name, mode='a') as fd: - dep_id = base_id - for ind in range(len(from_us_list)): - corr_id = corr_id_list[ind] if 
(len(corr_id_list) != 0) else ind - if corr_id in to_us_dict: - (from_ts, stream_id, tid) = from_us_list[ind] - to_ts = to_us_dict[corr_id] - if from_ts > to_ts: from_ts = to_ts - fd.write(',{"ts":%d,"ph":"s","cat":"DataFlow","id":%d,"pid":%d,"tid":%d,"name":"dep"}\n' % (from_ts, dep_id, from_pid, tid)) - fd.write(',{"ts":%d,"ph":"t","cat":"DataFlow","id":%d,"pid":%d,"tid":%d,"name":"dep"}\n' % (to_ts, dep_id, to_pid, stream_id)) - dep_id += 1 - - def metadata_json(self, jsonfile, sysinfo_file): - params = gen_params(sysinfo_file); - with open(jsonfile, mode='a') as fd: - cnt = 0 - fd.write('],\n') - fd.write('"otherData": {\n') - for nkey in sorted(params.keys()): - key = nkey[1] - cnt = cnt + 1 - if cnt == len(params): - fd.write(' "' + key + '": "' + params[nkey] + '"\n') - else: - fd.write(' "' + key + '": "' + params[nkey] + '",\n') - fd.write(' }\n') - - def dump_json(self, table_name, data_name, file_name): - if not re.search(r'\.json$', file_name): - raise Exception('wrong output file type: "' + file_name + '"' ) - - sub_ptrn = re.compile(r'(^"|"$)') - name_ptrn = re.compile(r'(name|Name)') - - table_fields = self._get_fields(table_name) - table_raws = self._get_raws(table_name) - data_fields = self._get_fields(data_name) - data_raws = self._get_raws(data_name) - - with open(file_name, mode='a') as fd: - table_raws_len = len(table_raws) - for raw_index in range(table_raws_len): - if (raw_index == table_raws_len - 1) or (raw_index % 1000 == 0): - sys.stdout.write( \ - "\rdump json " + str(raw_index) + ":" + str(len(table_raws)) + " "*100 \ - ) - - vals_list = [] - values = list(table_raws[raw_index]) - for value_index in range(len(values)): - label = table_fields[value_index] - value = values[value_index] - if name_ptrn.search(label): value = sub_ptrn.sub(r'', value) - if label != '"Index"': - if label == '"dur"' and value == 0: - vals_list.append('%s:"%s"' % (label, "1")) - else: - vals_list.append('%s:"%s"' % (label, value)) - - args_list = [] - data = list(data_raws[raw_index]) - for value_index in range(len(data)): - label = data_fields[value_index] - value = data[value_index] - if label[:3] == '"__': continue - if name_ptrn.search(label): value = sub_ptrn.sub(r'', value) - if label != '"Index"' and label != '"roctx-range"': args_list.append('%s:"%s"' % (label, value)) - - fd.write(',{"ph":"%s",%s,\n "args":{\n %s\n }\n}\n' % ('X', ','.join(vals_list), ',\n '.join(args_list))) - - sys.stdout.write('\n') - - # execute query on DB - def execute(self, cmd): - cursor = self.connection.cursor() - cursor.execute(cmd) - - # commit DB - def commit(self): - self.connection.commit() - - # close DB - def close(self): - self.connection.commit() - self.connection.close() - - # access DB - def get_raws(self, table_name): - cur = self.connection.cursor() - cur.execute("SELECT * FROM %s" % table_name) - return cur.fetchall() - - # return CSV descriptor - # list of fields and dictionaly for the fields types - def _get_csv_descr(self, table_name, fd): - reader = csv.DictReader(fd) - field_names = reader.fieldnames - if not field_names[-1]: field_names.pop() - field_types = {} - - for entry in reader: - fields_left = [f for f in field_names if f not in field_types.keys()] - # all fields processed - if not fields_left: break - - for field in fields_left: - data = entry[field] - # need data for the field to be processed - if len(data) == 0: continue - - if data.isdigit(): - field_types[field] = "INTEGER" - else: - field_types[field] = "TEXT" - - if len(fields_left) > 0: raise Exception('types not 
found for fields: ', fields_left) - return (field_names, field_types) - - # add CSV table - def add_csv_table(self, table_name, file_name, extra = ()): - with open(file_name, mode='r') as fd: - # get CSV table descriptor - descr = self._get_csv_descr(table_name, fd) - # reader to populate the table - fd.seek(0) - reader = csv.reader(fd) - reader.next() - table = self.add_table(table_name, descr, extra) - self.insert_table(table, reader) + def __init__(self, file_name): + self.connection = sqlite3.connect(file_name) + self.tables = {} + self.section_index = 0 + + def __del__(self): + self.connection.close() + + # add DB table + def add_table(self, name, descr, extra=()): + (field_list, field_dict) = descr + if name in self.tables: + raise Exception('table is already added: "' + name + '"') + + # create DB table + table_descr = [] + for field in field_list: + table_descr.append('"%s" %s' % (field, field_dict[field])) + for item in extra: + table_descr.append('"%s" %s' % (item[0], item[1])) + stm = "CREATE TABLE " + name + " (%s)" % ", ".join(table_descr) + cursor = self.connection.cursor() + cursor.execute(stm) + self.connection.commit() + + # register table + fields_str = ",".join(map(lambda x: '"' + x + '"', field_list)) + templ_str = ",".join("?" * len(field_list)) + stm = "INSERT INTO " + name + "(" + fields_str + ") VALUES(" + templ_str + ");" + self.tables[name] = stm + + return (cursor, stm) + + # add columns to table + def add_columns(self, name, columns): + cursor = self.connection.cursor() + for item in columns: + stm = "ALTER TABLE " + name + ' ADD COLUMN "%s" %s' % (item[0], item[1]) + cursor.execute(stm) + self.connection.commit() + + # add columns with expression + def add_data_column(self, table_name, data_label, data_type, data_expr): + cursor = self.connection.cursor() + cursor.execute( + 'ALTER TABLE %s ADD COLUMN "%s" %s' % (table_name, data_label, data_type) + ) + cursor.execute("UPDATE %s SET %s = (%s);" % (table_name, data_label, data_expr)) + + def change_rec_name(self, table_name, rec_id, rec_name): + self.connection.execute( + "UPDATE " + table_name + ' SET Name = ? WHERE "Index" = ?', (rec_name, rec_id) + ) + + def change_rec_tid(self, table_name, rec_id, tid): + self.connection.execute( + "UPDATE " + table_name + ' SET tid = ? 
WHERE "Index" = ?', (tid, rec_id) + ) + + def change_rec_fld(self, table_name, fld_expr, rec_pat): + self.connection.execute( + "UPDATE " + table_name + " SET " + fld_expr + " WHERE " + rec_pat + ) + + def table_get_record(self, table_name, rec_pat): + cursor = self.connection.execute( + "SELECT * FROM " + table_name + " WHERE " + rec_pat + ) + raws = cursor.fetchall() + if len(raws) != 1: + raise Exception( + "Record (" + rec_pat + ') is not unique, table "' + table_name + '"' + ) + return list(raws[0]) + + # populate DB table entry + def insert_entry(self, table, val_list): + (cursor, stm) = table + cursor.execute(stm, val_list) + + # populate DB table entry + def commit_entry(self, table, val_list): + self.insert_entry(table, val_list) + self.connection.commit() + + # populate DB table data + def insert_table(self, table, reader): + for val_list in reader: + if not val_list[-1]: + val_list.pop() + self.insert_entry(table, val_list) + self.connection.commit() + + # return table fields list + def _get_fields(self, table_name): + cursor = self.connection.execute("SELECT * FROM " + table_name) + return list(map(lambda x: '"%s"' % (x[0]), cursor.description)) + + # return table raws list + def _get_raws(self, table_name): + cursor = self.connection.execute("SELECT * FROM " + table_name) + return cursor.fetchall() + + def _get_raws_indexed(self, table_name): + cursor = self.connection.execute( + "SELECT * FROM " + table_name + ' order by "Index" asc;' + ) + return cursor.fetchall() + + def _get_raw_by_id(self, table_name, rec_id): + cursor = self.connection.execute( + "SELECT * FROM " + table_name + ' WHERE "Index"=?', (rec_id,) + ) + raws = cursor.fetchall() + if len(raws) != 1: + raise Exception('Index is not unique, table "' + table_name + '"') + return list(raws[0]) + + def table_get_raws(self, table_name): + return self._get_raws(table_name) + + # dump CSV table + def dump_csv(self, table_name, file_name): + if not re.search(r"\.csv$", file_name): + raise Exception('wrong output file type: "' + file_name + '"') + + fields = self._get_fields(table_name) + with open(file_name, mode="w") as fd: + fd.write(",".join(fields) + "\n") + for raw in self._get_raws(table_name): + tmp = list(raw) + for idx in range(len(tmp)): + if type(tmp[idx]) == str: + if not (tmp[idx][0] == tmp[idx][-1] == '"'): + tmp[idx] = '"' + tmp[idx] + '"' + raw = tuple(tmp) + fd.write(reduce(lambda a, b: str(a) + "," + str(b), raw) + "\n") + + # dump JSON trace + def open_json(self, file_name): + if not re.search(r"\.json$", file_name): + raise Exception('wrong output file type: "' + file_name + '"') + with open(file_name, mode="w") as fd: + fd.write('{ "traceEvents":[{}\n') + + def close_json(self, file_name): + if not re.search(r"\.json$", file_name): + raise Exception('wrong output file type: "' + file_name + '"') + with open(file_name, mode="a") as fd: + fd.write("}") + + def label_json(self, pid, label, file_name): + if not re.search(r"\.json$", file_name): + raise Exception('wrong output file type: "' + file_name + '"') + with open(file_name, mode="a") as fd: + fd.write( + ',{"args":{"name":"%s"},"ph":"M","pid":%s,"name":"process_name","sort_index":%d}\n' + % (label, pid, self.section_index) + ) + self.section_index += 1 + + def flow_json( + self, base_id, from_pid, from_us_list, to_pid, to_us_dict, corr_id_list, file_name + ): + if not re.search(r"\.json$", file_name): + raise Exception('wrong output file type: "' + file_name + '"') + with open(file_name, mode="a") as fd: + dep_id = base_id + for ind in 
range(len(from_us_list)):
+                corr_id = corr_id_list[ind] if (len(corr_id_list) != 0) else ind
+                if corr_id in to_us_dict:
+                    (from_ts, stream_id, tid) = from_us_list[ind]
+                    to_ts = to_us_dict[corr_id]
+                    if from_ts > to_ts:
+                        from_ts = to_ts
+                    fd.write(
+                        ',{"ts":%d,"ph":"s","cat":"DataFlow","id":%d,"pid":%d,"tid":%d,"name":"dep"}\n'
+                        % (from_ts, dep_id, from_pid, tid)
+                    )
+                    fd.write(
+                        ',{"ts":%d,"ph":"t","cat":"DataFlow","id":%d,"pid":%d,"tid":%d,"name":"dep"}\n'
+                        % (to_ts, dep_id, to_pid, stream_id)
+                    )
+                    dep_id += 1
+
+    def metadata_json(self, jsonfile, sysinfo_file):
+        params = gen_params(sysinfo_file)
+        with open(jsonfile, mode="a") as fd:
+            cnt = 0
+            fd.write("],\n")
+            fd.write('"otherData": {\n')
+            for nkey in sorted(params.keys()):
+                key = nkey[1]
+                cnt = cnt + 1
+                if cnt == len(params):
+                    fd.write(' "' + key + '": "' + params[nkey] + '"\n')
+                else:
+                    fd.write(' "' + key + '": "' + params[nkey] + '",\n')
+            fd.write(" }\n")
+
+    def dump_json(self, table_name, data_name, file_name):
+        if not re.search(r"\.json$", file_name):
+            raise Exception('wrong output file type: "' + file_name + '"')
+
+        sub_ptrn = re.compile(r'(^"|"$)')
+        name_ptrn = re.compile(r"(name|Name)")
+
+        table_fields = self._get_fields(table_name)
+        table_raws = self._get_raws(table_name)
+        data_fields = self._get_fields(data_name)
+        data_raws = self._get_raws(data_name)
+
+        with open(file_name, mode="a") as fd:
+            table_raws_len = len(table_raws)
+            for raw_index in range(table_raws_len):
+                if (raw_index == table_raws_len - 1) or (raw_index % 1000 == 0):
+                    sys.stdout.write(
+                        "\rdump json "
+                        + str(raw_index)
+                        + ":"
+                        + str(len(table_raws))
+                        + " " * 100
+                    )
+
+                vals_list = []
+                values = list(table_raws[raw_index])
+                for value_index in range(len(values)):
+                    label = table_fields[value_index]
+                    value = values[value_index]
+                    if name_ptrn.search(label):
+                        value = sub_ptrn.sub(r"", value)
+                    if label != '"Index"':
+                        if label == '"dur"' and value == 0:
+                            vals_list.append('%s:"%s"' % (label, "1"))
+                        else:
+                            vals_list.append('%s:"%s"' % (label, value))
+
+                args_list = []
+                data = list(data_raws[raw_index])
+                for value_index in range(len(data)):
+                    label = data_fields[value_index]
+                    value = data[value_index]
+                    if label[:3] == '"__':
+                        continue
+                    if name_ptrn.search(label):
+                        value = sub_ptrn.sub(r"", value)
+                    if label != '"Index"' and label != '"roctx-range"':
+                        args_list.append('%s:"%s"' % (label, value))
+
+                fd.write(
+                    ',{"ph":"%s",%s,\n "args":{\n %s\n }\n}\n'
+                    % ("X", ",".join(vals_list), ",\n ".join(args_list))
+                )
+
+            sys.stdout.write("\n")
+
+    # execute query on DB
+    def execute(self, cmd):
+        cursor = self.connection.cursor()
+        cursor.execute(cmd)
+
+    # commit DB
+    def commit(self):
+        self.connection.commit()
+
+    # close DB
+    def close(self):
+        self.connection.commit()
+        self.connection.close()
+
+    # access DB
+    def get_raws(self, table_name):
+        cur = self.connection.cursor()
+        cur.execute("SELECT * FROM %s" % table_name)
+        return cur.fetchall()
+
+    # return CSV descriptor
+    # list of fields and dictionary for the field types
+    def _get_csv_descr(self, table_name, fd):
+        reader = csv.DictReader(fd)
+        field_names = reader.fieldnames
+        if not field_names[-1]:
+            field_names.pop()
+        field_types = {}
+
+        for entry in reader:
+            fields_left = [f for f in field_names if f not in field_types.keys()]
+            # all fields processed
+            if not fields_left:
+                break
+
+            for field in fields_left:
+                data = entry[field]
+                # need data for the field to be processed
+                if len(data) == 0:
+                    continue
+
+                if data.isdigit():
+                    field_types[field] =
"INTEGER" + else: + field_types[field] = "TEXT" + + if len(fields_left) > 0: + raise Exception("types not found for fields: ", fields_left) + return (field_names, field_types) + + # add CSV table + def add_csv_table(self, table_name, file_name, extra=()): + with open(file_name, mode="r") as fd: + # get CSV table descriptor + descr = self._get_csv_descr(table_name, fd) + # reader to populate the table + fd.seek(0) + reader = csv.reader(fd) + reader.next() + table = self.add_table(table_name, descr, extra) + self.insert_table(table, reader) + ############################################################################################## diff --git a/bin/tblextr.py b/bin/tblextr.py index c549fc09..507e1aeb 100755 --- a/bin/tblextr.py +++ b/bin/tblextr.py @@ -50,317 +50,418 @@ # stream ID map stream_counter = 0 stream_id_map = {} + + def get_stream_index(stream_id): - global stream_counter - stream_ind = 0 - if stream_id.lower() != 'nil': - if not stream_id in stream_id_map: - stream_counter += 1 - stream_ind = stream_counter - stream_id_map[stream_id] = stream_ind - else: - stream_ind = stream_id_map[stream_id] - return stream_ind + global stream_counter + stream_ind = 0 + if stream_id.lower() != "nil": + if not stream_id in stream_id_map: + stream_counter += 1 + stream_ind = stream_counter + stream_id_map[stream_id] = stream_ind + else: + stream_ind = stream_id_map[stream_id] + return stream_ind + # patching activity records -def activity_record_patching(db, ops_table_name, kernel_found, kernel_name, stream_found, stream_ind, select_expr): - if kernel_found != 0: - db.change_rec_fld(ops_table_name, 'Name = "' + kernel_name + '"', select_expr) - if stream_found != 0: - db.change_rec_fld(ops_table_name, 'tid = ' + str(stream_ind), select_expr) +def activity_record_patching( + db, ops_table_name, kernel_found, kernel_name, stream_found, stream_ind, select_expr +): + if kernel_found != 0: + db.change_rec_fld(ops_table_name, 'Name = "' + kernel_name + '"', select_expr) + if stream_found != 0: + db.change_rec_fld(ops_table_name, "tid = " + str(stream_ind), select_expr) + # global vars -table_descr = [ - ['Index', 'KernelName'], - {'Index': 'INTEGER', 'KernelName': 'TEXT'} -] +table_descr = [["Index", "KernelName"], {"Index": "INTEGER", "KernelName": "TEXT"}] var_list = table_descr[0] var_table = {} ############################################################# + def fatal(msg): - sys.stderr.write(sys.argv[0] + ": " + msg + "\n"); - sys.exit(1) + sys.stderr.write(sys.argv[0] + ": " + msg + "\n") + sys.exit(1) + dbglog_count = 0 + + def dbglog(msg): - global dbglog_count - dbglog_count += 1 - sys.stderr.write(sys.argv[0] + ": " + msg + "\n"); - fatal("error") + global dbglog_count + dbglog_count += 1 + sys.stderr.write(sys.argv[0] + ": " + msg + "\n") + fatal("error") + + ############################################################# # Dumping sysinfo sysinfo_begin = 1 + + def metadata_gen(sysinfo_file, sysinfo_cmd): - global sysinfo_begin - if not re.search(r'\.txt$', sysinfo_file): - raise Exception('wrong output file type: "' + sysinfo_file + '"' ) - if sysinfo_begin == 1: - sysinfo_begin = 0 - with open(sysinfo_file, mode='w') as fd: fd.write('') - with open(sysinfo_file, mode='a') as fd: fd.write('CMD: ' + sysinfo_cmd + '\n') - status = subprocess.call(sysinfo_cmd + ' >> ' + sysinfo_file, - stderr=subprocess.STDOUT, - shell=True) - if status != 0: - raise Exception('Could not run command: "' + sysinfo_cmd + '"') + global sysinfo_begin + if not re.search(r"\.txt$", sysinfo_file): + raise 
Exception('wrong output file type: "' + sysinfo_file + '"') + if sysinfo_begin == 1: + sysinfo_begin = 0 + with open(sysinfo_file, mode="w") as fd: + fd.write("") + with open(sysinfo_file, mode="a") as fd: + fd.write("CMD: " + sysinfo_cmd + "\n") + status = subprocess.call( + sysinfo_cmd + " >> " + sysinfo_file, stderr=subprocess.STDOUT, shell=True + ) + if status != 0: + raise Exception('Could not run command: "' + sysinfo_cmd + '"') + # parse results method def parse_res(infile): - global max_gpu_id - if not os.path.isfile(infile): return - inp = open(infile, 'r') - - beg_pattern = re.compile("^dispatch\[(\d*)\], (.*) kernel-name\(\"([^\"]*)\"\)") - prop_pattern = re.compile("([\w-]+)\((\w+)\)"); - ts_pattern = re.compile(", time\((\d*),(\d*),(\d*),(\d*)\)") - # var pattern below matches a variable name and a variable value from a one - # line text in the format of for example "WRITE_SIZE (0.2500000000)" or - # "GRBM_GUI_ACTIVE (27867)" or "TA_TA_BUSY[0]" - var_pattern = re.compile("^\s*([a-zA-Z0-9_]+(?:\[\d+\])?)\s+\((\d+(?:\.\d+)?)\)") - pid_pattern = re.compile("pid\((\d*)\)") - - dispatch_number = 0 - var_table_pid = 0 - for line in inp.readlines(): - record = line[:-1] - - m = pid_pattern.search(record) - if m and not os.getenv('ROCP_MERGE_PIDS'): var_table_pid = int(m.group(1)) - - m = var_pattern.match(record) - if m: - if not (var_table_pid, dispatch_number) in var_table: fatal("Error: dispatch number not found '" + str(dispatch_number) + "'") - var = m.group(1) - val = m.group(2) - var_table[(var_table_pid, dispatch_number)][var] = val - if not var in var_list: var_list.append(var) - - m = beg_pattern.match(record) - if m: - dispatch_number = m.group(1) - if not (var_table_pid, dispatch_number) in var_table: - var_table[(var_table_pid, dispatch_number)] = { - 'Index': dispatch_number, - 'KernelName': "\"" + m.group(3) + "\"" - } - - gpu_id = 0 - queue_id = 0 - disp_pid = 0 - disp_tid = 0 - - kernel_properties = m.group(2) - for prop in kernel_properties.split(', '): - m = prop_pattern.match(prop) - if m: + global max_gpu_id + if not os.path.isfile(infile): + return + inp = open(infile, "r") + + beg_pattern = re.compile('^dispatch\[(\d*)\], (.*) kernel-name\("([^"]*)"\)') + prop_pattern = re.compile("([\w-]+)\((\w+)\)") + ts_pattern = re.compile(", time\((\d*),(\d*),(\d*),(\d*)\)") + # var pattern below matches a variable name and a variable value from a one + # line text in the format of for example "WRITE_SIZE (0.2500000000)" or + # "GRBM_GUI_ACTIVE (27867)" or "TA_TA_BUSY[0]" + var_pattern = re.compile("^\s*([a-zA-Z0-9_]+(?:\[\d+\])?)\s+\((\d+(?:\.\d+)?)\)") + pid_pattern = re.compile("pid\((\d*)\)") + + dispatch_number = 0 + var_table_pid = 0 + for line in inp.readlines(): + record = line[:-1] + + m = pid_pattern.search(record) + if m and not os.getenv("ROCP_MERGE_PIDS"): + var_table_pid = int(m.group(1)) + + m = var_pattern.match(record) + if m: + if not (var_table_pid, dispatch_number) in var_table: + fatal("Error: dispatch number not found '" + str(dispatch_number) + "'") var = m.group(1) val = m.group(2) var_table[(var_table_pid, dispatch_number)][var] = val - if not var in var_list: var_list.append(var); - if var == 'gpu-id': - gpu_id = int(val) - if (gpu_id > max_gpu_id): max_gpu_id = gpu_id - if var == 'queue-id': queue_id = int(val) - if var == 'pid': disp_pid = int(val) - if var == 'tid': disp_tid = int(val) - else: fatal('wrong kernel property "' + prop + '" in "'+ kernel_properties + '"') - m = ts_pattern.search(record) + if not var in var_list: + 
var_list.append(var)
+
+        m = beg_pattern.match(record)
+        if m:
+            dispatch_number = m.group(1)
+            if not (var_table_pid, dispatch_number) in var_table:
+                var_table[(var_table_pid, dispatch_number)] = {
+                    "Index": dispatch_number,
+                    "KernelName": '"' + m.group(3) + '"',
+                }
+
+            gpu_id = 0
+            queue_id = 0
+            disp_pid = 0
+            disp_tid = 0
+
+            kernel_properties = m.group(2)
+            for prop in kernel_properties.split(", "):
+                m = prop_pattern.match(prop)
+                if m:
+                    var = m.group(1)
+                    val = m.group(2)
+                    var_table[(var_table_pid, dispatch_number)][var] = val
+                    if not var in var_list:
+                        var_list.append(var)
+                    if var == "gpu-id":
+                        gpu_id = int(val)
+                        if gpu_id > max_gpu_id:
+                            max_gpu_id = gpu_id
+                    if var == "queue-id":
+                        queue_id = int(val)
+                    if var == "pid":
+                        disp_pid = int(val)
+                    if var == "tid":
+                        disp_tid = int(val)
+                else:
+                    fatal(
+                        'wrong kernel property "'
+                        + prop
+                        + '" in "'
+                        + kernel_properties
+                        + '"'
+                    )
+            m = ts_pattern.search(record)
+            if m:
+                var_table[(var_table_pid, dispatch_number)]["DispatchNs"] = m.group(1)
+                var_table[(var_table_pid, dispatch_number)]["BeginNs"] = m.group(2)
+                var_table[(var_table_pid, dispatch_number)]["EndNs"] = m.group(3)
+                var_table[(var_table_pid, dispatch_number)]["CompleteNs"] = m.group(4)
+
+                ## filling dependencies
+                from_ns = int(m.group(1))
+                to_ns = int(m.group(2))
+                from_us = int((from_ns - START_NS) / 1000)
+                to_us = int((to_ns - START_NS) / 1000)
+
+                kern_dep_list.append((from_ns, disp_pid, disp_tid))
+
+                gpu_pid = GPU_BASE_PID + int(gpu_id)
+                if not disp_pid in dep_dict:
+                    dep_dict[disp_pid] = {}
+                dep_proc = dep_dict[disp_pid]
+                if not gpu_pid in dep_proc:
+                    dep_proc[gpu_pid] = {
+                        "pid": HSA_PID,
+                        "from": [],
+                        "to": {},
+                        "id": [],
+                    }
+                dep_str = dep_proc[gpu_pid]
+                to_id = len(dep_str["from"])
+                dep_str["from"].append((from_us, disp_tid, disp_tid))
+                dep_str["to"][to_id] = to_us
+                ##
+
+    inp.close()
+
+
#############################################################
+
# Comparator to sort a dictionary of tuples. This comparator will convert
# the second element of tuple to an int and return the new tuple. Then
# the dictionary can use the default comparison i.e. sort by first element,
# then sort by second element.
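# For example, the keys ("0", "2") and ("0", "10") then sort numerically on
# the second element (2 before 10) instead of lexically ("10" before "2").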
-def tuple_comparator(tupleElem) : +def tuple_comparator(tupleElem): return tupleElem[0], int(tupleElem[1]) + # merge results table def merge_table(): - global var_list - keys = sorted(var_table.keys(), key=tuple_comparator) - - fields = set(var_table[keys[0]]) - if 'DispatchNs' in fields: - var_list.append('DispatchNs') - var_list.append('BeginNs') - var_list.append('EndNs') - var_list.append('CompleteNs') - var_list = [x for x in var_list if x in fields] + global var_list + keys = sorted(var_table.keys(), key=tuple_comparator) + + fields = set(var_table[keys[0]]) + if "DispatchNs" in fields: + var_list.append("DispatchNs") + var_list.append("BeginNs") + var_list.append("EndNs") + var_list.append("CompleteNs") + var_list = [x for x in var_list if x in fields] + + ############################################################# + # dump CSV results def dump_csv(file_name): - global var_list - keys = sorted(var_table.keys(), key=tuple_comparator) + global var_list + keys = sorted(var_table.keys(), key=tuple_comparator) + + with open(file_name, mode="w") as fd: + fd.write(",".join(var_list) + "\n") + for pid, ind in keys: + entry = var_table[(pid, ind)] + dispatch_number = entry["Index"] + if ind != dispatch_number: + fatal("Dispatch #" + ind + " index mismatch (" + dispatch_number + ")\n") + val_list = [entry[var] for var in var_list] + fd.write(",".join(val_list) + "\n") + + print("File '" + file_name + "' is generating") - with open(file_name, mode='w') as fd: - fd.write(','.join(var_list) + '\n'); - for pid, ind in keys: - entry = var_table[(pid, ind)] - dispatch_number = entry['Index'] - if ind != dispatch_number: fatal("Dispatch #" + ind + " index mismatch (" + dispatch_number + ")\n") - val_list = [entry[var] for var in var_list] - fd.write(','.join(val_list) + '\n'); - print("File '" + file_name + "' is generating") ############################################################# + # fill kernels DB def fill_kernel_db(table_name, db): - global var_list - keys = sorted(var_table.keys(), key=tuple_comparator) + global var_list + keys = sorted(var_table.keys(), key=tuple_comparator) + + for var in set(var_list).difference(set(table_descr[1])): + table_descr[1][var] = "INTEGER" + table_descr[0] = var_list + + table_handle = db.add_table(table_name, table_descr) - for var in set(var_list).difference(set(table_descr[1])): - table_descr[1][var] = 'INTEGER' - table_descr[0] = var_list; + for pid, ind in keys: + entry = var_table[(pid, ind)] + dispatch_number = entry["Index"] + if ind != dispatch_number: + fatal("Dispatch #" + ind + " index mismatch (" + dispatch_number + ")\n") + val_list = [entry[var] for var in var_list] + db.insert_entry(table_handle, val_list) - table_handle = db.add_table(table_name, table_descr) - for pid, ind in keys: - entry = var_table[(pid, ind)] - dispatch_number = entry['Index'] - if ind != dispatch_number: fatal("Dispatch #" + ind + " index mismatch (" + dispatch_number + ")\n") - val_list = [entry[var] for var in var_list] - db.insert_entry(table_handle, val_list) ############################################################# # Fill Ext DB ext_table_descr = [ - ['BeginNs', 'EndNs', 'pid', 'tid', 'Name', 'Index', '__section', '__lane'], - {'BeginNs':'INTEGER', 'EndNs':'INTEGER', 'pid':'INTEGER', 'tid':'INTEGER', 'Name':'TEXT', 'Index':'INTEGER', '__section':'INTEGER', '__lane':'INTEGER'} + ["BeginNs", "EndNs", "pid", "tid", "Name", "Index", "__section", "__lane"], + { + "BeginNs": "INTEGER", + "EndNs": "INTEGER", + "pid": "INTEGER", + "tid": "INTEGER", + "Name": 
"TEXT", + "Index": "INTEGER", + "__section": "INTEGER", + "__lane": "INTEGER", + }, ] + + def fill_ext_db(table_name, db, indir, trace_name, api_pid): - global range_data + global range_data - file_name = indir + '/' + trace_name + '_trace.txt' - # tms pid:tid cid:rid:'.....' - ptrn_val = re.compile(r'(\d+) (\d+):(\d+) (\d+):(\d+):"(.*)"$') + file_name = indir + "/" + trace_name + "_trace.txt" + # tms pid:tid cid:rid:'.....' + ptrn_val = re.compile(r'(\d+) (\d+):(\d+) (\d+):(\d+):"(.*)"$') - range_data = {} - range_stack = {} - range_map = {} + range_data = {} + range_stack = {} + range_map = {} - if not os.path.isfile(file_name): return 0 + if not os.path.isfile(file_name): + return 0 + + record_id = 0 + table_handle = db.add_table(table_name, ext_table_descr) + with open(file_name, mode="r") as fd: + for line in fd.readlines(): + record = line[:-1] + m = ptrn_val.match(record) + if m: + tms = int(m.group(1)) + pid = m.group(2) + tid = int(m.group(3)) + cid = int(m.group(4)) + rid = int(m.group(5)) + msg = m.group(6) + + rec_vals = [] + if not tid in range_data: + range_data[tid] = {} + + if cid != 2: + rec_vals.append(tms) + rec_vals.append(tms + 1) + rec_vals.append(pid) + rec_vals.append(tid) + rec_vals.append(msg) + rec_vals.append(record_id) + rec_vals.append(api_pid) # __section + rec_vals.append(tid) # __lane + + if cid == 1: + if not pid in range_stack: + range_stack[pid] = {} + pid_stack = range_stack[pid] + if not tid in pid_stack: + pid_stack[tid] = [] + rec_stack = pid_stack[tid] + rec_stack.append(rec_vals) + continue + + if cid == 2: + if not pid in range_stack: + fatal("ROCTX range begin not found, pid(" + pid + ")") + pid_stack = range_stack[pid] + if not tid in pid_stack: + fatal("ROCTX range begin not found, tid(" + tid + ")") + rec_stack = pid_stack[tid] + rec_vals = rec_stack.pop() + rec_vals[1] = tms + # record the range's start/stop timestamps, its parent (ranges can be nested), and its message. 
+ range_start = rec_vals[0] + range_stop = tms + range_parent = rec_stack[-1][0] if len(rec_stack) != 0 else 0 + range_msg = rec_vals[4] + range_data[tid][range_start] = (range_stop, range_parent, range_msg) + + # range start + if cid == 3: + range_map[rid] = (tms, msg) + continue + + # range stop + if cid == 4: + if rid in range_map: + (tms, msg) = range_map[ + rid + ] # querying start timestamp if rid exists + del range_map[rid] + else: + fatal("range id(" + str(rid) + ") is not found") + rec_vals[0] = tms # begin timestamp + rec_vals[4] = msg # range message + rec_vals[7] = 0 # 0 lane for ranges + + db.insert_entry(table_handle, rec_vals) + record_id += 1 + + return 1 - record_id = 0 - table_handle = db.add_table(table_name, ext_table_descr) - with open(file_name, mode='r') as fd: - for line in fd.readlines(): - record = line[:-1] - m = ptrn_val.match(record) - if m: - tms = int(m.group(1)) - pid = m.group(2) - tid = int(m.group(3)) - cid = int(m.group(4)) - rid = int(m.group(5)) - msg = m.group(6) - rec_vals = [] - if not tid in range_data: range_data[tid] = {} - - if cid != 2: - rec_vals.append(tms) - rec_vals.append(tms + 1) - rec_vals.append(pid) - rec_vals.append(tid) - rec_vals.append(msg) - rec_vals.append(record_id) - rec_vals.append(api_pid) # __section - rec_vals.append(tid) # __lane - - if cid == 1: - if not pid in range_stack: range_stack[pid] = {} - pid_stack = range_stack[pid] - if not tid in pid_stack: pid_stack[tid] = [] - rec_stack = pid_stack[tid] - rec_stack.append(rec_vals) - continue - - if cid == 2: - if not pid in range_stack: fatal("ROCTX range begin not found, pid(" + pid + ")"); - pid_stack = range_stack[pid] - if not tid in pid_stack: fatal("ROCTX range begin not found, tid(" + tid + ")"); - rec_stack = pid_stack[tid] - rec_vals = rec_stack.pop() - rec_vals[1] = tms - # record the range's start/stop timestamps, its parent (ranges can be nested), and its message. 
- range_start = rec_vals[0] - range_stop = tms - range_parent = rec_stack[-1][0] if len(rec_stack) != 0 else 0 - range_msg = rec_vals[4] - range_data[tid][range_start] = (range_stop, range_parent, range_msg) - - # range start - if cid == 3: - range_map[rid] = (tms, msg) - continue - - # range stop - if cid == 4: - if rid in range_map: - (tms, msg) = range_map[rid] # querying start timestamp if rid exists - del range_map[rid] - else: fatal("range id(" + str(rid) + ") is not found") - rec_vals[0] = tms # begin timestamp - rec_vals[4] = msg # range message - rec_vals[7] = 0 # 0 lane for ranges - - db.insert_entry(table_handle, rec_vals) - record_id += 1 - - return 1 ############################################################# # arguments manipulation routines def get_field(args, field): - ptrn1_field = re.compile(r'^.* ' + field + '\('); - ptrn2_field = re.compile(r'\) .*$'); - ptrn3_field = re.compile(r'\)\)$'); - (field_name, n) = ptrn1_field.subn('', args, count=1); - if n != 0: - (field_name, n) = ptrn2_field.subn('', field_name, count=1) - if n == 0: - (field_name, n) = ptrn3_field.subn('', field_name, count=1) - return (field_name, n) + ptrn1_field = re.compile(r"^.* " + field + "\(") + ptrn2_field = re.compile(r"\) .*$") + ptrn3_field = re.compile(r"\)\)$") + (field_name, n) = ptrn1_field.subn("", args, count=1) + if n != 0: + (field_name, n) = ptrn2_field.subn("", field_name, count=1) + if n == 0: + (field_name, n) = ptrn3_field.subn("", field_name, count=1) + return (field_name, n) + def set_field(args, field, val): - return re.subn(field + '\(\w+\)([ \)])', field + '(' + str(val) + ')\\1', args, count=1) + return re.subn( + field + "\(\w+\)([ \)])", field + "(" + str(val) + ")\\1", args, count=1 + ) + hsa_patch_data = {} ops_patch_data = {} # Fill API DB api_table_descr = [ - ['BeginNs', 'EndNs', 'pid', 'tid', 'Name', 'args', 'Index', 'Data', '__section', '__lane'], - {'BeginNs':'INTEGER', 'EndNs':'INTEGER', 'pid':'INTEGER', 'tid':'INTEGER', 'Name':'TEXT', 'args':'TEXT', 'Index':'INTEGER', 'Data':'TEXT', '__section':'INTEGER', '__lane':'INTEGER'} + [ + "BeginNs", + "EndNs", + "pid", + "tid", + "Name", + "args", + "Index", + "Data", + "__section", + "__lane", + ], + { + "BeginNs": "INTEGER", + "EndNs": "INTEGER", + "pid": "INTEGER", + "tid": "INTEGER", + "Name": "TEXT", + "args": "TEXT", + "Index": "INTEGER", + "Data": "TEXT", + "__section": "INTEGER", + "__lane": "INTEGER", + }, ] + + # Filling API records DB table # table_name - created DB table name # db - DB handle @@ -370,53 +471,64 @@ def set_field(args, field, val): # dep_pid - PID of dependet domain # dep_list - list of dependet dospatch events # dep_filtr - registered dependencies by record ID -def fill_api_db(table_name, db, indir, api_name, api_pid, dep_pid, dep_list, dep_filtr, expl_id): - global hsa_activity_found - global memory_manager - - range_start_times = {} - copy_csv = '' - - ptrn_val = re.compile(r'(\d+):(\d+) (\d+):(\d+) ([^\(]+)(\(.*)$') - hip_mcopy_ptrn = re.compile(r'hipMemcpy|hipMemset') - hip_wait_event_ptrn = re.compile(r'WaitEvent') - hip_sync_event_ptrn = re.compile(r'hipStreamSynchronize') - hip_sync_dev_event_ptrn = re.compile(r'hipDeviceSynchronize') - hip_graph_ptrn = re.compile(r'hipGraphLaunch') - wait_event_ptrn = re.compile(r'WaitEvent|hipStreamSynchronize|hipDeviceSynchronize') - hip_stream_wait_write_ptrn = re.compile(r'hipStreamWaitValue64|hipStreamWriteValue64|hipStreamWaitValue32|hipStreamWriteValue32') - prop_pattern = re.compile("([\w-]+)\((\w+)\)"); - beg_pattern = 
re.compile("^dispatch\[(\d*)\], (.*) kernel-name\(\"([^\"]*)\"\)") - hip_strm_cr_event_ptrn = re.compile(r'hipStreamCreate') - hsa_mcopy_ptrn = re.compile(r'hsa_amd_memory_async_copy') - ptrn_fixformat = re.compile(r'(\d+:\d+ \d+:\d+ \w+)\(\s*(.*)\)$') - ptrn_fixkernel = re.compile(r'\s+kernel=(.*)$') - ptrn_multi_kernel = re.compile(r'(.*):(\d+)$') - ptrn_corr_id = re.compile(r'\ :(\d*)$') - - file_name = indir + '/' + api_name + '_api_trace.txt' - if not os.path.isfile(file_name): return 0 - - hsa_copy_file_name = indir + '/' + 'async_copy_trace.txt' - hsa_copy_file_name_present = 1 if os.path.isfile(file_name) else 0 - hsa_copy_deps = 1 if (api_pid == HSA_PID and hsa_copy_file_name_present == 1) else 0 - print("hsa_copy_deps: " + str(hsa_copy_deps)) - - # parsing an input trace file and creating a DB table - record_id_dict = {} - table_handle = db.add_table(table_name, api_table_descr) - with open(file_name, mode='r') as fd: - file_lines = fd.readlines() - total_lines = len(file_lines) - line_index = 0 - for line in file_lines: - if (line_index == total_lines - 1) or (line_index % 100 == 0): - sys.stdout.write( \ - "\rscan " + api_name + " API data " + str(line_index) + ":" + str(total_lines) + " "*100 \ - ) - line_index += 1 - - record = line[:-1] +def fill_api_db( + table_name, db, indir, api_name, api_pid, dep_pid, dep_list, dep_filtr, expl_id +): + global hsa_activity_found + global memory_manager + + range_start_times = {} + copy_csv = "" + + ptrn_val = re.compile(r"(\d+):(\d+) (\d+):(\d+) ([^\(]+)(\(.*)$") + hip_mcopy_ptrn = re.compile(r"hipMemcpy|hipMemset") + hip_wait_event_ptrn = re.compile(r"WaitEvent") + hip_sync_event_ptrn = re.compile(r"hipStreamSynchronize") + hip_sync_dev_event_ptrn = re.compile(r"hipDeviceSynchronize") + hip_graph_ptrn = re.compile(r"hipGraphLaunch") + wait_event_ptrn = re.compile(r"WaitEvent|hipStreamSynchronize|hipDeviceSynchronize") + hip_stream_wait_write_ptrn = re.compile( + r"hipStreamWaitValue64|hipStreamWriteValue64|hipStreamWaitValue32|hipStreamWriteValue32" + ) + prop_pattern = re.compile("([\w-]+)\((\w+)\)") + beg_pattern = re.compile('^dispatch\[(\d*)\], (.*) kernel-name\("([^"]*)"\)') + hip_strm_cr_event_ptrn = re.compile(r"hipStreamCreate") + hsa_mcopy_ptrn = re.compile(r"hsa_amd_memory_async_copy") + ptrn_fixformat = re.compile(r"(\d+:\d+ \d+:\d+ \w+)\(\s*(.*)\)$") + ptrn_fixkernel = re.compile(r"\s+kernel=(.*)$") + ptrn_multi_kernel = re.compile(r"(.*):(\d+)$") + ptrn_corr_id = re.compile(r"\ :(\d*)$") + + file_name = indir + "/" + api_name + "_api_trace.txt" + if not os.path.isfile(file_name): + return 0 + + hsa_copy_file_name = indir + "/" + "async_copy_trace.txt" + hsa_copy_file_name_present = 1 if os.path.isfile(file_name) else 0 + hsa_copy_deps = 1 if (api_pid == HSA_PID and hsa_copy_file_name_present == 1) else 0 + print("hsa_copy_deps: " + str(hsa_copy_deps)) + + # parsing an input trace file and creating a DB table + record_id_dict = {} + table_handle = db.add_table(table_name, api_table_descr) + with open(file_name, mode="r") as fd: + file_lines = fd.readlines() + total_lines = len(file_lines) + line_index = 0 + for line in file_lines: + if (line_index == total_lines - 1) or (line_index % 100 == 0): + sys.stdout.write( + "\rscan " + + api_name + + " API data " + + str(line_index) + + ":" + + str(total_lines) + + " " * 100 + ) + line_index += 1 + + record = line[:-1] corr_id = 0 m = ptrn_corr_id.search(record) @@ -494,447 +606,690 @@ def fill_api_db(table_name, db, indir, api_name, api_pid, dep_pid, dep_list, dep for prop in 
kernel_properties.split(', '):
        m = prop_pattern.match(prop)
        if m:
-          val = m.group(2)
-          var = m.group(1)
-          if var == 'gpu-id':
-            gpu_id = int(val)
+                corr_id = int(m.group(1))
+                record = ptrn_corr_id.sub("", record)
+
+            kernel_arg = ""
+            m = ptrn_fixkernel.search(record)
+            if m:
+                kernel_arg = "kernel(" + m.group(1) + ") "
+                record = ptrn_fixkernel.sub("", record)
+
+            mfixformat = ptrn_fixformat.match(record)
+            if mfixformat:  # replace '=' in args with parentheses
+                reformated_args = (
+                    kernel_arg
+                    + mfixformat.group(2)
+                    .replace("=", "(")
+                    .replace(",", ")")
+                    .replace("\\", "\\\\")
+                    .replace('"', '\\"')
+                    + ")"
+                )
+                record = mfixformat.group(1) + "( " + reformated_args + ")"
+
+            m = ptrn_val.match(record)
+            if not m:
+                fatal(api_name + " bad record: '" + record + "'")
+            else:
+                rec_vals = []
+                rec_len = len(api_table_descr[0]) - 3
+                for ind in range(1, rec_len):
+                    rec_vals.append(m.group(ind))
+                proc_id = int(rec_vals[2])
+                thread_id = int(rec_vals[3])
+                record_name = rec_vals[4]
+                record_args = rec_vals[5]
+
+                # incrementing per-process record id/correlation id
+                if not proc_id in record_id_dict:
+                    record_id_dict[proc_id] = 0
+                record_id_dict[proc_id] += 1
+                record_id = record_id_dict[proc_id]
+
+                # setting correlation id to record id if correlation id is not defined
+                if corr_id == 0:
+                    corr_id = record_id
+
+                rec_vals.append(corr_id)
+                # extracting/converting stream id
+                (stream_id, stream_found) = get_field(record_args, "stream")
+                if stream_found:
+                    stream_id = get_stream_index(stream_id)
+                    (rec_vals[5], found) = set_field(record_args, "stream", stream_id)
+                    if found == 0:
+                        fatal(
+                            'set_field() failed for "stream", args: "' + record_args + '"'
+                        )
+                else:
+                    (stream_id, stream_found) = get_field(record_args, "hStream")
+                    if stream_found:
+                        stream_id = get_stream_index(stream_id)
+                        (rec_vals[5], found) = set_field(
+                            record_args, "hStream", stream_id
+                        )
+                        if found == 0:
+                            fatal(
+                                'set_field() failed for "stream", args: "'
+                                + record_args
+                                + '"'
+                            )
+                    else:
+                        stream_id = 0
+
+                if hip_strm_cr_event_ptrn.match(record_name):
+                    hip_streams.append(stream_id)
+
+                if hip_sync_event_ptrn.match(record_name):
+                    if (proc_id, stream_id) in last_hip_api_map:
+                        (last_hip_api_corr_id, last_hip_api_from_pid) = last_hip_api_map[
+                            (proc_id, stream_id)
+                        ][-1]
+                        sync_api_beg_us = int((int(rec_vals[0]) - START_NS) / 1000)
+                        if HIP_PID not in dep_dict[proc_id]:
+                            dep_dict[proc_id][HIP_PID] = {
+                                "pid": last_hip_api_from_pid,
+                                "from": [],
+                                "to": {},
+                                "id": [],
+                            }
+                        dep_dict[proc_id][HIP_PID]["from"].append(
+                            (-1, stream_id, thread_id)
+                        )
+                        dep_dict[proc_id][HIP_PID]["id"].append(last_hip_api_corr_id)
+                        dep_dict[proc_id][HIP_PID]["to"][
+                            last_hip_api_corr_id
+                        ] = sync_api_beg_us
+                        from_ids[(last_hip_api_corr_id, proc_id)] = (
+                            len(dep_dict[proc_id][HIP_PID]["from"]) - 1
+                        )
+
+            m = beg_pattern.match(record)
+            gpu_id = 0
+            if m:
+                kernel_properties = m.group(2)
+                for prop in kernel_properties.split(", "):
+                    m = prop_pattern.match(prop)
+                    if m:
+                        val = m.group(2)
+                        var = m.group(1)
+                        if var == "gpu-id":
+                            gpu_id = int(val)
+
+            if hsa_mcopy_ptrn.match(record_name) or hip_mcopy_ptrn.match(record_name):
+                ops_section_id = COPY_PID
+            else:
+                ops_section_id = GPU_BASE_PID + int(gpu_id)
+
+            if (proc_id, stream_id) not in last_hip_api_map:
+                last_hip_api_map[(proc_id, stream_id)] = []
+            last_hip_api_map[(proc_id, stream_id)].append((corr_id, ops_section_id))
+
+            # asynchronous operation API found
+            op_found = 0
+            mcopy_found = 0
+
+            # extract kernel name string
+            (kernel_str, kernel_found) =
get_field(record_args, "kernel") + if kernel_found == 0: + kernel_str = "" + else: + op_found = 1 + + if hip_mcopy_ptrn.match(record_name): + mcopy_found = 1 + op_found = 1 + + # HIP Graph API + if hip_graph_ptrn.search(record_name): + op_found = 1 + + # HIP WaitEvent API + if wait_event_ptrn.search(record_name): + op_found = 1 + + if hip_stream_wait_write_ptrn.search(record_name): + op_found = 1 + + # HSA memcopy API + if hsa_mcopy_ptrn.match(record_name): + mcopy_found = 1 + op_found = 1 + + stream_id = thread_id + hsa_patch_data[(corr_id, proc_id)] = thread_id + + if op_found: + roctx_msg = "" + + if not thread_id in range_start_times: + range_start_times[thread_id] = ( + sorted(range_data[thread_id].keys()) + if thread_id in range_data + else [] + ) + start_times = range_start_times[thread_id] + + index = bisect.bisect_right(start_times, int(rec_vals[0])) + if index > 0: + # We found the range that is closest to this operation. Iterate the + # range stack this range is part of until we find a range that entirely + # contains the operation. + range_start = start_times[index - 1] + while range_start != 0: + (range_end, range_start, msg) = range_data[thread_id][ + range_start + ] + if int(rec_vals[1]) < range_end: + # This range contains the operation. + roctx_msg = msg + break + + ops_patch_data[(corr_id, proc_id)] = ( + thread_id, + stream_id, + kernel_str, + roctx_msg, + ) + + if op_found: + op_found = 0 + beg_ns = int(rec_vals[0]) + end_ns = int(rec_vals[1]) + dur_us = int((end_ns - beg_ns) / 1000) + from_us = int((beg_ns - START_NS) / 1000) + dur_us / 2 + if api_pid == HIP_PID or hsa_copy_deps == 1: + if not proc_id in dep_dict: + dep_dict[proc_id] = {} + dep_proc = dep_dict[proc_id] + if not dep_pid in dep_proc: + if api_pid == "HIP_PID": + dep_proc[dep_pid] = {"pid": api_pid, "from": [], "id": []} + else: + dep_proc[dep_pid] = { + "pid": api_pid, + "from": [], + "id": [], + "to": {}, + } + dep_str = dep_proc[dep_pid] + dep_str["from"].append((from_us, stream_id, thread_id)) + if expl_id: + dep_str["id"].append(corr_id) + + # memcopy registering + api_data = ( + memory_manager.register_api(rec_vals) if mcopy_data_enabled else "" + ) + rec_vals.append(api_data) + + # setting section and lane + rec_vals.append(api_pid) # __section + rec_vals.append(thread_id) # __lane + + # inserting an API record to DB + db.insert_entry(table_handle, rec_vals) + + # inserting of dispatch events correlated to the dependent dispatches + for from_ns, proc_id, thread_id in dep_list: + if not proc_id in record_id_dict: + record_id_dict[proc_id] = 0 + record_id_dict[proc_id] += 1 + corr_id = record_id_dict[proc_id] + db.insert_entry( + table_handle, + [ + from_ns, + from_ns, + proc_id, + thread_id, + "hsa_dispatch", + "", + corr_id, + "", + api_pid, + thread_id, + ], + ) + + # generating memcopy CSV + if copy_csv != "": + file_name = os.environ["PWD"] + "/results_mcopy.csv" + with open(file_name, mode="w") as fd: + print("File '" + file_name + "' is generating") + fd.write(copy_csv) + + return 1 - if hsa_mcopy_ptrn.match(record_name) or hip_mcopy_ptrn.match(record_name): - ops_section_id = COPY_PID - else: - ops_section_id = GPU_BASE_PID + int(gpu_id) - - if (proc_id,stream_id) not in last_hip_api_map: - last_hip_api_map[(proc_id,stream_id)] = [] - last_hip_api_map[(proc_id, stream_id)].append((corr_id, ops_section_id)) - - # asyncronous opeartion API found - op_found = 0 - mcopy_found = 0 - - # extract kernel name string - (kernel_str, kernel_found) = get_field(record_args, 'kernel') - if kernel_found == 
0: kernel_str = '' - else: op_found = 1 - - if hip_mcopy_ptrn.match(record_name): - mcopy_found = 1 - op_found = 1 - - # HIP Graph API - if hip_graph_ptrn.search(record_name): - op_found = 1 - - # HIP WaitEvent API - if wait_event_ptrn.search(record_name): - op_found = 1 - - if hip_stream_wait_write_ptrn.search(record_name): - op_found = 1 - - # HSA memcopy API - if hsa_mcopy_ptrn.match(record_name): - mcopy_found = 1 - op_found = 1 - - stream_id = thread_id - hsa_patch_data[(corr_id, proc_id)] = thread_id - - if op_found: - roctx_msg = '' - - if not thread_id in range_start_times: - range_start_times[thread_id] = sorted(range_data[thread_id].keys()) if thread_id in range_data else [] - start_times = range_start_times[thread_id] - - index = bisect.bisect_right(start_times,int(rec_vals[0])) - if index > 0: - # We found the range that is closest to this operation. Iterate the - # range stack this range is part of until we find a range that entirely - # contains the operation. - range_start = start_times[index - 1] - while range_start != 0: - (range_end, range_start, msg) = range_data[thread_id][range_start] - if int(rec_vals[1]) < range_end: - # This range contains the operation. - roctx_msg = msg - break - - ops_patch_data[(corr_id, proc_id)] = (thread_id, stream_id, kernel_str, roctx_msg) - - if op_found: - op_found = 0 - beg_ns = int(rec_vals[0]) - end_ns = int(rec_vals[1]) - dur_us = int((end_ns - beg_ns) / 1000) - from_us = int((beg_ns - START_NS) / 1000) + dur_us/2 - if api_pid == HIP_PID or hsa_copy_deps == 1: - if not proc_id in dep_dict: dep_dict[proc_id] = {} - dep_proc = dep_dict[proc_id] - if not dep_pid in dep_proc: - if api_pid == 'HIP_PID': dep_proc[dep_pid] = { 'pid': api_pid, 'from': [], 'id': [] } - else: dep_proc[dep_pid] = { 'pid': api_pid, 'from': [], 'id': [], 'to': {} } - dep_str = dep_proc[dep_pid] - dep_str['from'].append((from_us, stream_id, thread_id)) - if expl_id: dep_str['id'].append(corr_id) - - # memcopy registering - api_data = memory_manager.register_api(rec_vals) if mcopy_data_enabled else '' - rec_vals.append(api_data) - - # setting section and lane - rec_vals.append(api_pid) # __section - rec_vals.append(thread_id) # __lane - - # inserting an API record to DB - db.insert_entry(table_handle, rec_vals) - - # inserting of dispatch events correlated to the dependent dispatches - for (from_ns, proc_id, thread_id) in dep_list: - if not proc_id in record_id_dict: record_id_dict[proc_id] = 0 - record_id_dict[proc_id] += 1 - corr_id = record_id_dict[proc_id] - db.insert_entry(table_handle, [from_ns, from_ns, proc_id, thread_id, 'hsa_dispatch', '', corr_id, '', api_pid, thread_id]) - - # generating memcopy CSV - if copy_csv != '': - file_name = os.environ['PWD'] + '/results_mcopy.csv' - with open(file_name, mode='w') as fd: - print("File '" + file_name + "' is generating") - fd.write(copy_csv) - - return 1 ############################################################# # fill COPY DB copy_table_descr = [ - ['BeginNs', 'EndNs', 'Name', 'pid', 'tid', 'Index', 'Data', '__section', '__lane'], - {'Index':'INTEGER', 'Name':'TEXT', 'args':'TEXT', 'BeginNs':'INTEGER', 'EndNs':'INTEGER', 'pid':'INTEGER', 'tid':'INTEGER', 'Data':'TEXT', '__section':'INTEGER', '__lane':'INTEGER'} + ["BeginNs", "EndNs", "Name", "pid", "tid", "Index", "Data", "__section", "__lane"], + { + "Index": "INTEGER", + "Name": "TEXT", + "args": "TEXT", + "BeginNs": "INTEGER", + "EndNs": "INTEGER", + "pid": "INTEGER", + "tid": "INTEGER", + "Data": "TEXT", + "__section": "INTEGER", + "__lane": "INTEGER", + 
},
 ]
 
 
-def fill_copy_db(table_name, db, indir):
-  sect_id = COPY_PID
-  file_name = indir + '/' + 'async_copy_trace.txt'
-  ptrn_val = re.compile(r'^(\d+):(\d+) (async-copy):(\d+):(\d+)$')
-
-  if not os.path.isfile(file_name): return 0
-
-  table_handle = db.add_table(table_name, copy_table_descr)
-  with open(file_name, mode='r') as fd:
-    for line in fd.readlines():
-      record = line[:-1]
-      m = ptrn_val.match(record)
-      if not m: fatal("bad async-copy entry '" + record + "'")
-      else:
-        rec_vals = []
-        for ind in range(1,4): rec_vals.append(m.group(ind))
-        corr_id = int(m.group(4))
-        proc_id = int(m.group(5))
-
-        # querying tid value
-        if (corr_id, proc_id) in hsa_patch_data:
-          thread_id = hsa_patch_data[(corr_id, proc_id)]
-        else:
-          thread_id = -1
-
-        # completing record
-        rec_vals.append(proc_id) # tid
-        rec_vals.append(thread_id) # tid
-        rec_vals.append(corr_id) # Index
-        # registering memcopy information
-        activity_data = memory_manager.register_copy(rec_vals) if mcopy_data_enabled else ''
-        rec_vals.append(activity_data)
-        # appending straem ID and section ID
-        rec_vals.append(COPY_PID) # __section
-        rec_vals.append(thread_id) # __lane
-
-        # inserting DB activity entry
-        db.insert_entry(table_handle, rec_vals)
-
-        # filling dependencies
-        to_ns = int(rec_vals[0])
-        to_us = int((to_ns - START_NS) / 1000)
+def fill_copy_db(table_name, db, indir):
+    sect_id = COPY_PID
+    file_name = indir + "/" + "async_copy_trace.txt"
+    ptrn_val = re.compile(r"^(\d+):(\d+) (async-copy):(\d+):(\d+)$")
+
+    if not os.path.isfile(file_name):
+        return 0
+
+    table_handle = db.add_table(table_name, copy_table_descr)
+    with open(file_name, mode="r") as fd:
+        for line in fd.readlines():
+            record = line[:-1]
+            m = ptrn_val.match(record)
+            if not m:
+                fatal("bad async-copy entry '" + record + "'")
+            else:
+                rec_vals = []
+                for ind in range(1, 4):
+                    rec_vals.append(m.group(ind))
+                corr_id = int(m.group(4))
+                proc_id = int(m.group(5))
+
+                # querying tid value
+                if (corr_id, proc_id) in hsa_patch_data:
+                    thread_id = hsa_patch_data[(corr_id, proc_id)]
+                else:
+                    thread_id = -1
+
+                # completing record
+                rec_vals.append(proc_id)  # pid
+                rec_vals.append(thread_id)  # tid
+                rec_vals.append(corr_id)  # Index
+
+                # registering memcopy information
+                activity_data = (
+                    memory_manager.register_copy(rec_vals) if mcopy_data_enabled else ""
+                )
+                rec_vals.append(activity_data)
+
+                # appending stream ID and section ID
+                rec_vals.append(COPY_PID)  # __section
+                rec_vals.append(thread_id)  # __lane
+
+                # inserting DB activity entry
+                db.insert_entry(table_handle, rec_vals)
+
+                # filling dependencies
+                to_ns = int(rec_vals[0])
+                to_us = int((to_ns - START_NS) / 1000)
+
+                if thread_id != -1:
+                    # if not proc_id in dep_dict: dep_dict[proc_id] = {}
+                    dep_proc = dep_dict[proc_id]
+                    # if not pid in dep_proc: dep_proc[pid] = { 'pid': HSA_PID, 'from': [], 'to': {}, 'id': [] }
+                    dep_str = dep_proc[sect_id]
+                    dep_str["to"][corr_id] = to_us
+                    dep_str["id"].append(corr_id)
+
+    return 1
 
-        if thread_id != -1:
-          #if not proc_id in dep_dict: dep_dict[proc_id] = {}
-          dep_proc = dep_dict[proc_id]
-          #if not pid in dep_proc: dep_proc[pid] = { 'pid': HSA_PID, 'from': [], 'to': {}, 'id': [] }
-          dep_str = dep_proc[sect_id]
-          dep_str['to'][corr_id] = to_us
-          dep_str['id'].append(corr_id)
-  return 1
 
 #############################################################
 # fill HCC ops DB
 ops_table_descr = [
-  ['BeginNs', 'EndNs', 'dev-id', 'queue-id', 'Name', 'pid', 'tid', 'roctx-range', 'stream-id', 'Index', 'Data', '__section', '__lane'],
-  {'Index':'INTEGER', 'Name':'TEXT', 
'args':'TEXT', 'BeginNs':'INTEGER', 'EndNs':'INTEGER', 'dev-id':'INTEGER', 'queue-id':'INTEGER', 'pid':'INTEGER', 'tid':'INTEGER', 'roctx-range':'TEXT', 'Data':'TEXT', 'stream-id':'INTEGER', '__section':'INTEGER', '__lane':'INTEGER'} + [ + "BeginNs", + "EndNs", + "dev-id", + "queue-id", + "Name", + "pid", + "tid", + "roctx-range", + "stream-id", + "Index", + "Data", + "__section", + "__lane", + ], + { + "Index": "INTEGER", + "Name": "TEXT", + "args": "TEXT", + "BeginNs": "INTEGER", + "EndNs": "INTEGER", + "dev-id": "INTEGER", + "queue-id": "INTEGER", + "pid": "INTEGER", + "tid": "INTEGER", + "roctx-range": "TEXT", + "Data": "TEXT", + "stream-id": "INTEGER", + "__section": "INTEGER", + "__lane": "INTEGER", + }, ] -def fill_ops_db(kernel_table_name, mcopy_table_name, db, indir): - global max_gpu_id - file_name = indir + '/' + 'hcc_ops_trace.txt' - ptrn_val = re.compile(r'(\d+):(\d+) (\d+):(\d+) (.*)$') - ptrn_id = re.compile(r'^([^:]+):(\d+):(\d+)$') - ptrn_mcopy = re.compile(r'(Memcpy|Copy|Fill)') - ptrn_barrier = re.compile(r'Marker') - - if not os.path.isfile(file_name): return {} - - filtr = {} - - kernel_table_handle = db.add_table(kernel_table_name, ops_table_descr) - mcopy_table_handle = db.add_table(mcopy_table_name, ops_table_descr) - with open(file_name, mode='r') as fd: - file_lines = fd.readlines() - total_lines = len(file_lines) - line_index = 0 - for line in file_lines: - if (line_index == total_lines - 1) or (line_index % 100 == 0): - sys.stdout.write( \ - "\rscan ops data " + str(line_index) + ":" + str(total_lines) + " "*100 \ - ) - line_index += 1 - record = line[:-1] - m = ptrn_val.match(record) - if m: - # parsing trace record - rec_vals = [] - for ind in range(1,6): rec_vals.append(m.group(ind)) - label = rec_vals[4] # record name - m = ptrn_id.match(label) - if not m: fatal("bad hcc ops entry '" + record + "'") - name = m.group(1) - corr_id = int(m.group(2)) - proc_id = int(m.group(3)) - - # checking name for memcopy pattern - is_barrier = 0 - if ptrn_mcopy.search(name): - rec_table_name = mcopy_table_name - table_handle = mcopy_table_handle - sect_id = COPY_PID; - else: - rec_table_name = kernel_table_name - table_handle = kernel_table_handle - - gpu_id = int(rec_vals[2]); - if (gpu_id > max_gpu_id): max_gpu_id = gpu_id - sect_id = GPU_BASE_PID + int(gpu_id) - - if ptrn_barrier.search(name): - name = '""' - is_barrier = 1 - - thread_id = 0 - stream_id = 0 - roctx_range = '' - if (corr_id, proc_id) in ops_patch_data: - (thread_id, stream_id, name_patch, roctx_range) = ops_patch_data[(corr_id, proc_id)] - if name_patch != '': name = name_patch - if roctx_range == '': roctx_range = name - else: - if is_barrier: continue - else: - if "ROCP_CTRL_RATE" in os.environ: continue - else: fatal("hcc ops data not found: '" + record + "', " + str(corr_id) + ", " + str(proc_id)) - - # activity record - rec_vals[4] = name # Name - rec_vals.append(proc_id) # pid - rec_vals.append(thread_id) # tid - rec_vals.append(roctx_range) # roctx-range - rec_vals.append(stream_id) # StreamId - rec_vals.append(corr_id) # Index - - # registering memcopy information - activity_data = memory_manager.register_activity(rec_vals) if mcopy_data_enabled else '' - rec_vals.append(activity_data) - - # activity record data for stream ID and sction ID - rec_vals.append(sect_id) # __section - rec_vals.append(stream_id) # __lane - - # inserting DB activity entry - db.insert_entry(table_handle, rec_vals) - - # registering a dependency filtr - filtr[(corr_id, proc_id)] = rec_table_name - - # filling a 
dependencies
-        to_ns = int(rec_vals[0])
-        to_us = int((to_ns - START_NS) / 1000)
-
-        end_ns = int(rec_vals[1])
-        dur_us = int((end_ns - to_ns) / 1000)
-
-        if (corr_id, proc_id) in from_ids:
-          depid = from_ids[(corr_id, proc_id)]
-          from_val = dep_dict[proc_id][HIP_PID]['from'][depid]
-          print("from_val" + str(from_val))
-          from_val_new = (to_us + dur_us, from_val[1], from_val[2])
-          dep_dict[proc_id][HIP_PID]['from'][depid] = from_val_new
-
-        if not proc_id in dep_dict: dep_dict[proc_id] = {}
-        dep_proc = dep_dict[proc_id]
-        if not sect_id in dep_proc: dep_proc[sect_id] = { 'bsp': OPS_PID, 'to': {} }
-        dep_str = dep_proc[sect_id]
-        dep_str['to'][corr_id] = to_us
-      else:
-        fatal("hcc ops bad record: '" + record + "'")
+def fill_ops_db(kernel_table_name, mcopy_table_name, db, indir):
+    global max_gpu_id
+    file_name = indir + "/" + "hcc_ops_trace.txt"
+    ptrn_val = re.compile(r"(\d+):(\d+) (\d+):(\d+) (.*)$")
+    ptrn_id = re.compile(r"^([^:]+):(\d+):(\d+)$")
+    ptrn_mcopy = re.compile(r"(Memcpy|Copy|Fill)")
+    ptrn_barrier = re.compile(r"Marker")
+
+    if not os.path.isfile(file_name):
+        return {}
+
+    filtr = {}
+
+    kernel_table_handle = db.add_table(kernel_table_name, ops_table_descr)
+    mcopy_table_handle = db.add_table(mcopy_table_name, ops_table_descr)
+    with open(file_name, mode="r") as fd:
+        file_lines = fd.readlines()
+        total_lines = len(file_lines)
+        line_index = 0
+        for line in file_lines:
+            if (line_index == total_lines - 1) or (line_index % 100 == 0):
+                sys.stdout.write(
+                    "\rscan ops data "
+                    + str(line_index)
+                    + ":"
+                    + str(total_lines)
+                    + " " * 100
+                )
+            line_index += 1
+
+            record = line[:-1]
+            m = ptrn_val.match(record)
+            if m:
+                # parsing trace record
+                rec_vals = []
+                for ind in range(1, 6):
+                    rec_vals.append(m.group(ind))
+                label = rec_vals[4]  # record name
+                m = ptrn_id.match(label)
+                if not m:
+                    fatal("bad hcc ops entry '" + record + "'")
+                name = m.group(1)
+                corr_id = int(m.group(2))
+                proc_id = int(m.group(3))
+
+                # checking name for memcopy pattern
+                is_barrier = 0
+                if ptrn_mcopy.search(name):
+                    rec_table_name = mcopy_table_name
+                    table_handle = mcopy_table_handle
+                    sect_id = COPY_PID
+                else:
+                    rec_table_name = kernel_table_name
+                    table_handle = kernel_table_handle
+
+                    gpu_id = int(rec_vals[2])
+                    if gpu_id > max_gpu_id:
+                        max_gpu_id = gpu_id
+                    sect_id = GPU_BASE_PID + int(gpu_id)
+
+                if ptrn_barrier.search(name):
+                    name = '""'
+                    is_barrier = 1
+
+                thread_id = 0
+                stream_id = 0
+                roctx_range = ""
+                if (corr_id, proc_id) in ops_patch_data:
+                    (thread_id, stream_id, name_patch, roctx_range) = ops_patch_data[
+                        (corr_id, proc_id)
+                    ]
+                    if name_patch != "":
+                        name = name_patch
+                    if roctx_range == "":
+                        roctx_range = name
+                else:
+                    if is_barrier:
+                        continue
+                    else:
+                        if "ROCP_CTRL_RATE" in os.environ:
+                            continue
+                        else:
+                            fatal(
+                                "hcc ops data not found: '"
+                                + record
+                                + "', "
+                                + str(corr_id)
+                                + ", "
+                                + str(proc_id)
+                            )
+
+                # activity record
+                rec_vals[4] = name  # Name
+                rec_vals.append(proc_id)  # pid
+                rec_vals.append(thread_id)  # tid
+                rec_vals.append(roctx_range)  # roctx-range
+                rec_vals.append(stream_id)  # StreamId
+                rec_vals.append(corr_id)  # Index
+
+                # registering memcopy information
+                activity_data = (
+                    memory_manager.register_activity(rec_vals)
+                    if mcopy_data_enabled
+                    else ""
+                )
+                rec_vals.append(activity_data)
+
+                # activity record data for stream ID and section ID
+                rec_vals.append(sect_id)  # __section
+                rec_vals.append(stream_id)  # __lane
+
+                # inserting DB activity entry
+                db.insert_entry(table_handle, rec_vals)
+
+                # registering a dependency filter
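+                # For illustration (hypothetical values, not from a real trace):
+                # a kernel dispatch with corr_id 7 in process 1234 that was routed
+                # to the kernel table would record filtr[(7, 1234)] = "OPS", so
+                # later dependency passes know which table the activity landed in.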
+                filtr[(corr_id, proc_id)] = rec_table_name
+
+                # filling dependencies
+                to_ns = int(rec_vals[0])
+                to_us = int((to_ns - START_NS) / 1000)
+
+                end_ns = int(rec_vals[1])
+                dur_us = int((end_ns - to_ns) / 1000)
+
+                if (corr_id, proc_id) in from_ids:
+                    depid = from_ids[(corr_id, proc_id)]
+                    from_val = dep_dict[proc_id][HIP_PID]["from"][depid]
+                    print("from_val " + str(from_val))
+                    from_val_new = (to_us + dur_us, from_val[1], from_val[2])
+                    dep_dict[proc_id][HIP_PID]["from"][depid] = from_val_new
+
+                if not proc_id in dep_dict:
+                    dep_dict[proc_id] = {}
+                dep_proc = dep_dict[proc_id]
+                if not sect_id in dep_proc:
+                    dep_proc[sect_id] = {"bsp": OPS_PID, "to": {}}
+                dep_str = dep_proc[sect_id]
+                dep_str["to"][corr_id] = to_us
+
+            else:
+                fatal("hcc ops bad record: '" + record + "'")
+
+    return filtr
+
 
-    return filtr
 #############################################################
 # main
 
-if (len(sys.argv) < 2): fatal("Usage: " + sys.argv[0] + " ")
+if len(sys.argv) < 2:
+    fatal("Usage: " + sys.argv[0] + " ")
 
 outfile = sys.argv[1]
 infiles = sys.argv[2:]
 
-indir = re.sub(r'\/[^\/]*$', r'', infiles[0])
-inext = re.sub(r'\s+$', r'', infiles[0])
-inext = re.sub(r'^.*(\.[^\.]+)$', r'\1', inext)
-
-dbfile = ''
-csvfile = ''
-
-if 'ROCP_JSON_REBASE' in os.environ and os.environ['ROCP_JSON_REBASE'] == 0:
-  begin_ts_file = indir + '/begin_ts_file.txt'
-  if os.path.isfile(begin_ts_file):
-    with open(begin_ts_file, mode='r') as fd:
-      ind = 0
-      for line in fd.readlines():
-        val = int(line)
-        if ind == 0 or val < START_NS: START_NS = val
-        ind += 1
-    print('START timestamp found (' + str(START_NS) + 'ns)')
-
-if re.search(r'\.csv$', outfile):
-  csvfile = outfile
-elif re.search(r'\.db$', outfile):
-  dbfile = outfile
-  csvfile = re.sub(r'\.db$', '.csv', outfile)
+indir = re.sub(r"\/[^\/]*$", r"", infiles[0])
+inext = re.sub(r"\s+$", r"", infiles[0])
+inext = re.sub(r"^.*(\.[^\.]+)$", r"\1", inext)
+
+dbfile = ""
+csvfile = ""
+
+if "ROCP_JSON_REBASE" in os.environ and os.environ["ROCP_JSON_REBASE"] == 0:
+    begin_ts_file = indir + "/begin_ts_file.txt"
+    if os.path.isfile(begin_ts_file):
+        with open(begin_ts_file, mode="r") as fd:
+            ind = 0
+            for line in fd.readlines():
+                val = int(line)
+                if ind == 0 or val < START_NS:
+                    START_NS = val
+                ind += 1
+        print("START timestamp found (" + str(START_NS) + "ns)")
+
+if re.search(r"\.csv$", outfile):
+    csvfile = outfile
+elif re.search(r"\.db$", outfile):
+    dbfile = outfile
+    csvfile = re.sub(r"\.db$", ".csv", outfile)
 else:
-  fatal("Bad output file '" + outfile + "'")
+    fatal("Bad output file '" + outfile + "'")
 
-if inext == '.txt':
-  for f in infiles: parse_res(f)
-  if len(var_table) != 0: merge_table()
+if inext == ".txt":
+    for f in infiles:
+        parse_res(f)
+    if len(var_table) != 0:
+        merge_table()
 
-if dbfile == '':
-  dump_csv(csvfile)
+if dbfile == "":
+    dump_csv(csvfile)
 else:
-  statfile = re.sub(r'\.csv$', '.stats.csv', csvfile)
-  jsonfile = re.sub(r'\.csv$', '.json', csvfile)
-
-  hsa_statfile = re.sub(r'\.stats\.csv$', r'.hsa_stats.csv', statfile)
-  hip_statfile = re.sub(r'\.stats\.csv$', r'.hip_stats.csv', statfile)
-  ops_statfile = statfile
-  copy_statfile = re.sub(r'\.stats\.csv$', r'.copy_stats.csv', statfile)
-  memcopy_info_file = re.sub(r'\.stats\.csv$', r'.memcopy_info.csv', statfile)
-  sysinfo_file = re.sub(r'\.stats\.csv$', r'.sysinfo.txt', statfile)
-  metadata_gen(sysinfo_file, "@ROCMINFO_EXEC@")
-
-  with open(dbfile, mode='w') as fd: fd.truncate()
-  db = SQLiteDB(dbfile)
-  memory_manager = MemManager(db, indir)
-
-  ext_trace_found = fill_ext_db('rocTX', db, 
indir, 'roctx', EXT_PID) - - hsa_trace_found = fill_api_db('HSA', db, indir, 'hsa', HSA_PID, COPY_PID, kern_dep_list, {}, 0) - hsa_activity_found = fill_copy_db('COPY', db, indir) - - hip_trace_found = fill_api_db('HIP', db, indir, 'hip', HIP_PID, OPS_PID, [], {}, 1) - ops_filtr = fill_ops_db('OPS', 'COPY', db, indir) - - fill_kernel_db('KERN', db) - - any_trace_found = ext_trace_found | hsa_trace_found | hip_trace_found - copy_trace_found = 0 - if hsa_activity_found or len(ops_filtr): copy_trace_found = 1 - - if any_trace_found: - db.open_json(jsonfile) - - if ext_trace_found: - db.label_json(EXT_PID, "Markers and Ranges", jsonfile) - - if hip_trace_found: - db.label_json(HIP_PID, "CPU HIP API", jsonfile) - - if hsa_trace_found: - db.label_json(HSA_PID, "CPU HSA API", jsonfile) - - db.label_json(COPY_PID, "COPY", jsonfile) - - if any_trace_found and max_gpu_id >= 0: - for ind in range(0, int(max_gpu_id) + 1): - db.label_json(int(ind) + int(GPU_BASE_PID), "GPU" + str(ind), jsonfile) - - if ext_trace_found: - dform.gen_ext_json_trace(db, 'rocTX', START_NS, jsonfile) - - if len(var_table) != 0: - dform.post_process_data(db, 'KERN', csvfile) - dform.gen_table_bins(db, 'KERN', statfile, 'KernelName', 'DurationNs') - if hsa_trace_found and 'BeginNs' in var_list: - dform.gen_kernel_json_trace(db, 'KERN', GPU_BASE_PID, START_NS, jsonfile) - - if hsa_trace_found: - dform.post_process_data(db, 'HSA') - dform.gen_table_bins(db, 'HSA', hsa_statfile, 'Name', 'DurationNs') - dform.gen_api_json_trace(db, 'HSA', START_NS, jsonfile) - - if copy_trace_found: - dform.post_process_data(db, 'COPY') - dform.gen_table_bins(db, 'COPY', copy_statfile, 'Name', 'DurationNs') - dform.gen_api_json_trace(db, 'COPY', START_NS, jsonfile) - - if hip_trace_found: - dform.post_process_data(db, 'HIP') - dform.gen_table_bins(db, 'HIP', hip_statfile, 'Name', 'DurationNs') - dform.gen_api_json_trace(db, 'HIP', START_NS, jsonfile) - - if ops_filtr: - dform.post_process_data(db, 'OPS') - dform.gen_table_bins(db, 'OPS', ops_statfile, 'Name', 'DurationNs') - dform.gen_ops_json_trace(db, 'OPS', GPU_BASE_PID, START_NS, jsonfile) - - if any_trace_found: - dep_id = 0 - for (proc_id, dep_proc) in dep_dict.items(): - for (to_pid, dep_str) in dep_proc.items(): - if 'bsp' in dep_str: - bspid = dep_str['bsp'] - base_str = dep_proc[bspid] - for v in ('pid', 'from', 'id'): - dep_str[v] = base_str[v] - base_str['inv'] = 1 - - for (to_pid, dep_str) in dep_proc.items(): - if 'inv' in dep_str: continue - if not 'to' in dep_str: continue - - from_pid = dep_str['pid'] - from_us_list = dep_str['from'] - to_us_dict = dep_str['to'] - corr_id_list = dep_str['id'] - - db.flow_json(dep_id, from_pid, from_us_list, to_pid, to_us_dict, corr_id_list, jsonfile) - dep_id += len(from_us_list) - - if any_trace_found: - db.metadata_json(jsonfile, sysinfo_file) - db.close_json(jsonfile) - - if mcopy_data_enabled: - memory_manager.dump_data('MM', memcopy_info_file) - - db.close() + statfile = re.sub(r"\.csv$", ".stats.csv", csvfile) + jsonfile = re.sub(r"\.csv$", ".json", csvfile) + + hsa_statfile = re.sub(r"\.stats\.csv$", r".hsa_stats.csv", statfile) + hip_statfile = re.sub(r"\.stats\.csv$", r".hip_stats.csv", statfile) + ops_statfile = statfile + copy_statfile = re.sub(r"\.stats\.csv$", r".copy_stats.csv", statfile) + memcopy_info_file = re.sub(r"\.stats\.csv$", r".memcopy_info.csv", statfile) + sysinfo_file = re.sub(r"\.stats\.csv$", r".sysinfo.txt", statfile) + metadata_gen(sysinfo_file, "@ROCMINFO_EXEC@") + + with open(dbfile, mode="w") as fd: + 
fd.truncate() + db = SQLiteDB(dbfile) + memory_manager = MemManager(db, indir) + + ext_trace_found = fill_ext_db("rocTX", db, indir, "roctx", EXT_PID) + + hsa_trace_found = fill_api_db( + "HSA", db, indir, "hsa", HSA_PID, COPY_PID, kern_dep_list, {}, 0 + ) + hsa_activity_found = fill_copy_db("COPY", db, indir) + + hip_trace_found = fill_api_db("HIP", db, indir, "hip", HIP_PID, OPS_PID, [], {}, 1) + ops_filtr = fill_ops_db("OPS", "COPY", db, indir) + + fill_kernel_db("KERN", db) + + any_trace_found = ext_trace_found | hsa_trace_found | hip_trace_found + copy_trace_found = 0 + if hsa_activity_found or len(ops_filtr): + copy_trace_found = 1 + + if any_trace_found: + db.open_json(jsonfile) + + if ext_trace_found: + db.label_json(EXT_PID, "Markers and Ranges", jsonfile) + + if hip_trace_found: + db.label_json(HIP_PID, "CPU HIP API", jsonfile) + + if hsa_trace_found: + db.label_json(HSA_PID, "CPU HSA API", jsonfile) + + db.label_json(COPY_PID, "COPY", jsonfile) + + if any_trace_found and max_gpu_id >= 0: + for ind in range(0, int(max_gpu_id) + 1): + db.label_json(int(ind) + int(GPU_BASE_PID), "GPU" + str(ind), jsonfile) + + if ext_trace_found: + dform.gen_ext_json_trace(db, "rocTX", START_NS, jsonfile) + + if len(var_table) != 0: + dform.post_process_data(db, "KERN", csvfile) + dform.gen_table_bins(db, "KERN", statfile, "KernelName", "DurationNs") + if hsa_trace_found and "BeginNs" in var_list: + dform.gen_kernel_json_trace(db, "KERN", GPU_BASE_PID, START_NS, jsonfile) + + if hsa_trace_found: + dform.post_process_data(db, "HSA") + dform.gen_table_bins(db, "HSA", hsa_statfile, "Name", "DurationNs") + dform.gen_api_json_trace(db, "HSA", START_NS, jsonfile) + + if copy_trace_found: + dform.post_process_data(db, "COPY") + dform.gen_table_bins(db, "COPY", copy_statfile, "Name", "DurationNs") + dform.gen_api_json_trace(db, "COPY", START_NS, jsonfile) + + if hip_trace_found: + dform.post_process_data(db, "HIP") + dform.gen_table_bins(db, "HIP", hip_statfile, "Name", "DurationNs") + dform.gen_api_json_trace(db, "HIP", START_NS, jsonfile) + + if ops_filtr: + dform.post_process_data(db, "OPS") + dform.gen_table_bins(db, "OPS", ops_statfile, "Name", "DurationNs") + dform.gen_ops_json_trace(db, "OPS", GPU_BASE_PID, START_NS, jsonfile) + + if any_trace_found: + dep_id = 0 + for proc_id, dep_proc in dep_dict.items(): + for to_pid, dep_str in dep_proc.items(): + if "bsp" in dep_str: + bspid = dep_str["bsp"] + base_str = dep_proc[bspid] + for v in ("pid", "from", "id"): + dep_str[v] = base_str[v] + base_str["inv"] = 1 + + for to_pid, dep_str in dep_proc.items(): + if "inv" in dep_str: + continue + if not "to" in dep_str: + continue + + from_pid = dep_str["pid"] + from_us_list = dep_str["from"] + to_us_dict = dep_str["to"] + corr_id_list = dep_str["id"] + + db.flow_json( + dep_id, + from_pid, + from_us_list, + to_pid, + to_us_dict, + corr_id_list, + jsonfile, + ) + dep_id += len(from_us_list) + + if any_trace_found: + db.metadata_json(jsonfile, sysinfo_file) + db.close_json(jsonfile) + + if mcopy_data_enabled: + memory_manager.dump_data("MM", memcopy_info_file) + + db.close() sys.exit(0) ############################################################# - diff --git a/bin/txt2params.py b/bin/txt2params.py index 7944029f..4be34266 100644 --- a/bin/txt2params.py +++ b/bin/txt2params.py @@ -22,89 +22,92 @@ import os, sys, re + # gen_params() takes a text file like the output of rocminfo cmd and parses it into a map {key,value} # where key is the param and value is the value of this param # for example: Threadmodel : 
"posix" # it also processes encompasing sections to generate a full param name such as (section names separated by '_'): # "Agent2_PoolInfo_ISAInfo_ISA1_WorkgroupMaxSizeperDimension_x": "1024(0x400)", def gen_params(txtfile): - fields = {} - counter = 0 - parent_field = '' - nbr_indent = 0 - nbr_indent_prev = 0 - check_for_dims = False - with open(txtfile) as fp: - for line in fp: - me = re.match(r'\*\*\* Done \*\*\*',line) #Marks the end of cmd - if me: - parent_field = '' - nbr_indent = 0 - nbr_indent_prev = 0 - check_for_dims = False - continue - mv = re.match(r'HCC clang version\s+(.*)',line) # outlier: only line with a version number and no ':', special case - if mv: - key = 'HCCclangversion' - val = mv.group(1) - counter = counter + 1 - fields[(counter,key)] = val - continue - # Variable 'check_for_dims' is True for text like this: - # Workgroup Max Size per Dimension: - # x 1024(0x400) - # y 1024(0x400) - # z 1024(0x400) - if check_for_dims == True: - mc = re.match(r'\s*([x|y|z])\s+(.*)',line) - if mc: - key_sav = mc.group(1) - if parent_field != '': - key = parent_field + '.' + mc.group(1) - else: - key = mc.group(1) - val = re.sub(r"\s+", "", mc.group(2)) - counter = counter + 1 - fields[(counter,key)] = val - if key_sav == 'z': - check_for_dims = False - nbr_indent_prev = nbr_indent - mi = re.search(r'^(\s+)\w+.*', line) - md = re.search(':', line) - if mi: - nbr_indent = int(len(mi.group(1)) / 2) #indentation cnt - else: - if not md: - tmp = re.sub(r"\s+", "", line) - if tmp.isalnum(): - parent_field = tmp + fields = {} + counter = 0 + parent_field = "" + nbr_indent = 0 + nbr_indent_prev = 0 + check_for_dims = False + with open(txtfile) as fp: + for line in fp: + me = re.match(r"\*\*\* Done \*\*\*", line) # Marks the end of cmd + if me: + parent_field = "" + nbr_indent = 0 + nbr_indent_prev = 0 + check_for_dims = False + continue + mv = re.match( + r"HCC clang version\s+(.*)", line + ) # outlier: only line with a version number and no ':', special case + if mv: + key = "HCCclangversion" + val = mv.group(1) + counter = counter + 1 + fields[(counter, key)] = val + continue + # Variable 'check_for_dims' is True for text like this: + # Workgroup Max Size per Dimension: + # x 1024(0x400) + # y 1024(0x400) + # z 1024(0x400) + if check_for_dims == True: + mc = re.match(r"\s*([x|y|z])\s+(.*)", line) + if mc: + key_sav = mc.group(1) + if parent_field != "": + key = parent_field + "." + mc.group(1) + else: + key = mc.group(1) + val = re.sub(r"\s+", "", mc.group(2)) + counter = counter + 1 + fields[(counter, key)] = val + if key_sav == "z": + check_for_dims = False + nbr_indent_prev = nbr_indent + mi = re.search(r"^(\s+)\w+.*", line) + md = re.search(":", line) + if mi: + nbr_indent = int(len(mi.group(1)) / 2) # indentation cnt + else: + if not md: + tmp = re.sub(r"\s+", "", line) + if tmp.isalnum(): + parent_field = tmp - if nbr_indent < nbr_indent_prev: - go_back_parent = (nbr_indent_prev - nbr_indent) - for i in range(go_back_parent): #decrease as many levels up as needed - pos = parent_field.rfind('.') - if pos != -1: - parent_field = parent_field[:pos] - # Process lines such as : - # Segment: GLOBAL; FLAGS: KERNARG, FINE GRAINED - # Size: 131897644(0x7dc992c) KB - for lin in line.split(';'): - lin = re.sub(r"\s+", "", lin) - m = re.match(r'(.*):(.*)', lin) - if m: - key, val = m.group(1), m.group(2) - if parent_field != '': - key = parent_field + '.' 
+ key - if val == '': - mk = re.match(r'.*Dimension',key) - if mk: # expect x,y,z on next 3 lines - check_for_dims = True - parent_field = key - else: - counter = counter + 1 - fields[(counter,key)] = val - else: - if nbr_indent != nbr_indent_prev and not check_for_dims : - parent_field = parent_field + '.' + lin.replace(':','') + if nbr_indent < nbr_indent_prev: + go_back_parent = nbr_indent_prev - nbr_indent + for i in range(go_back_parent): # decrease as many levels up as needed + pos = parent_field.rfind(".") + if pos != -1: + parent_field = parent_field[:pos] + # Process lines such as : + # Segment: GLOBAL; FLAGS: KERNARG, FINE GRAINED + # Size: 131897644(0x7dc992c) KB + for lin in line.split(";"): + lin = re.sub(r"\s+", "", lin) + m = re.match(r"(.*):(.*)", lin) + if m: + key, val = m.group(1), m.group(2) + if parent_field != "": + key = parent_field + "." + key + if val == "": + mk = re.match(r".*Dimension", key) + if mk: # expect x,y,z on next 3 lines + check_for_dims = True + parent_field = key + else: + counter = counter + 1 + fields[(counter, key)] = val + else: + if nbr_indent != nbr_indent_prev and not check_for_dims: + parent_field = parent_field + "." + lin.replace(":", "") - return fields + return fields diff --git a/build.sh b/build.sh index 33ce9e5f..d1a0078a 100755 --- a/build.sh +++ b/build.sh @@ -50,7 +50,11 @@ while [ 1 ] ; do elif [[ "$1" = "-cb" || "$1" = "--clean-build" ]] ; then TO_CLEAN=yes shift - elif [[ "$1" = "-"* || "$1" = "--"* ]] ; then + elif [ "$1" = "--" ] ; then + shift + EXTRA_BUILD_ARGS=$@ + break + elif [[ "$1" = "-"* ]] ; then echo -e "Wrong option \"$1\", Please use the following options:\n" usage exit 1 @@ -73,14 +77,14 @@ if [ -z "$RUN_TEST" ] ; then RUN_TEST=no; fi if [ -z "$ASAN" ] ; then ASAN=False; fi if [ -z "$GPU_LIST" ] ; then GPU_LIST="gfx900 gfx906 gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030 gfx1100 gfx1101 gfx1102"; fi - ROCPROFILER_ROOT=$(cd $ROCPROFILER_ROOT && echo $PWD) if [ "$TO_CLEAN" = "yes" ] ; then rm -rf $BUILD_DIR; fi -mkdir -p $BUILD_DIR -pushd $BUILD_DIR -cmake \ +cmake -B ${BUILD_DIR} ${ROCPROFILER_ROOT} \ + -DROCPROFILER_BUILD_CI=1 \ + -DROCPROFILER_BUILD_TESTS=1 \ + -DROCPROFILER_BUILD_SAMPLES=1 \ -DCMAKE_EXPORT_COMPILE_COMMANDS=TRUE \ -DCMAKE_BUILD_TYPE=${BUILD_TYPE:-'RelWithDebInfo'} \ -DCMAKE_MODULE_PATH="${ROCM_PATH}/hip/cmake;${ROCM_PATH}/lib/cmake" \ @@ -96,19 +100,9 @@ cmake \ -DCPACK_READELF_EXECUTABLE="${PACKAGE_ROOT}/llvm/bin/llvm-readelf" \ -DCPACK_STRIP_EXECUTABLE="${PACKAGE_ROOT}/llvm/bin/llvm-strip" \ -DCPACK_OBJDUMP_EXECUTABLE="${PACKAGE_ROOT}/llvm/bin/llvm-objdump" \ - -DHIP_ROOT_DIR=${ROCM_PATH} \ - $ROCPROFILER_ROOT - -popd - -MAKE_OPTS="-j -C $ROCPROFILER_ROOT/$BUILD_DIR" + ${EXTRA_BUILD_ARGS} -cmake --build "$BUILD_DIR" -- $MAKE_OPTS -cmake --build "$BUILD_DIR" -- $MAKE_OPTS mytest -if [ "$RUN_TEST" = "no" ] ; then - cmake --build "$BUILD_DIR" -- $MAKE_OPTS tests samples doc package -else - cmake --build "$BUILD_DIR" -- $MAKE_OPTS tests -fi +cmake --build "$BUILD_DIR" --target all --parallel $(nproc) +cmake --build "$BUILD_DIR" --target package --parallel $(nproc) exit 0 diff --git a/cmake_modules/Templates/gtest_discover_tests_properties.cmake.in b/cmake_modules/Templates/gtest_discover_tests_properties.cmake.in new file mode 100644 index 00000000..14a751d9 --- /dev/null +++ b/cmake_modules/Templates/gtest_discover_tests_properties.cmake.in @@ -0,0 +1,11 @@ +cmake_minimum_required(VERSION 3.18.0 FATAL_ERROR) + +if(NOT @GTEST_DISCOVER_TESTS_TARGET@_TESTS) + message(FATAL_ERROR 
"@GTEST_DISCOVER_TESTS_TARGET@_TESTS is not defined") +endif() + +foreach(_TEST ${@GTEST_DISCOVER_TESTS_TARGET@_TESTS}) + set_tests_properties( + ${_TEST} PROPERTIES LABELS "@GTEST_DISCOVER_TESTS_LABELS@" ENVIRONMENT + "@GTEST_DISCOVER_TESTS_ENVIRONMENT@") +endforeach() diff --git a/cmake_modules/env.cmake b/cmake_modules/rocprofiler_env.cmake similarity index 78% rename from cmake_modules/env.cmake rename to cmake_modules/rocprofiler_env.cmake index 58412775..7b7c4727 100644 --- a/cmake_modules/env.cmake +++ b/cmake_modules/rocprofiler_env.cmake @@ -20,29 +20,25 @@ # THE SOFTWARE. ################################################################################ -# Linux Compiler options -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fms-extensions") +add_library(rocprofiler-build-flags INTERFACE) +add_library(rocprofiler::build-flags ALIAS rocprofiler-build-flags) -add_definitions(-DNEW_TRACE_API=1) - -# CLANG options -if("$ENV{CXX}" STREQUAL "/usr/bin/clang++") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ferror-limit=1000000") -endif() +target_compile_options( + rocprofiler-build-flags + INTERFACE $<$:-W -Wall -Wextra -Wno-unused-parameter> + $<$:-fms-extensions> + $<$:$<$:-ferror-limit=1000000>> + ) +target_compile_definitions(rocprofiler-build-flags INTERFACE NEW_TRACE_API=1) # Enable debug trace -if(DEFINED ENV{CMAKE_DEBUG_TRACE}) - add_definitions(-DDEBUG_TRACE=1) -endif() - -# Enable AQL-profile new API -if(NOT DEFINED ENV{CMAKE_CURR_API}) - add_definitions(-DAQLPROF_NEW_API=1) +if(ROCPROFILER_DEBUG_TRACE) + target_compile_definitions(rocprofiler-build-flags INTERFACE DEBUG_TRACE=1) endif() # Enable direct loading of AQL-profile HSA extension -if(DEFINED ENV{CMAKE_LD_AQLPROFILE}) - add_definitions(-DROCP_LD_AQLPROFILE=1) +if(ROCPROFILER_LD_AQLPROFILE) + target_compile_definitions(rocprofiler-build-flags INTERFACE ROCP_LD_AQLPROFILE=1) endif() # Find hsa-runtime @@ -85,10 +81,8 @@ if("${ROCM_ROOT_DIR}" STREQUAL "") endif() find_library( - FIND_AQL_PROFILE_LIB "libhsa-amd-aqlprofile64.so" + HSA_AMD_AQLPROFILE_LIBRARY + NAMES hsa-amd-aqlprofile64 HINTS ${CMAKE_PREFIX_PATH} PATHS ${ROCM_ROOT_DIR} PATH_SUFFIXES lib REQUIRED) -if(NOT FIND_AQL_PROFILE_LIB) - message("AQL_PROFILE not installed. 
Please install AQL_PROFILE")
-endif()
diff --git a/cmake_modules/rocprofiler_formatting.cmake b/cmake_modules/rocprofiler_formatting.cmake
new file mode 100644
index 00000000..35192a39
--- /dev/null
+++ b/cmake_modules/rocprofiler_formatting.cmake
@@ -0,0 +1,103 @@
+# ------------------------------------------------------------------------------#
+#
+# creates the following targets to format code:
+#   - format
+#   - format-source
+#   - format-cmake
+#   - format-python
+#   - format-rocprofiler-source
+#   - format-rocprofiler-cmake
+#   - format-rocprofiler-python
+#
+# ------------------------------------------------------------------------------#
+
+include_guard(GLOBAL)
+
+find_program(ROCPROFILER_CLANG_FORMAT_EXE NAMES clang-format-11 clang-format-mp-11)
+find_program(ROCPROFILER_CMAKE_FORMAT_EXE NAMES cmake-format)
+find_program(ROCPROFILER_BLACK_FORMAT_EXE NAMES black)
+
+if(ROCPROFILER_CLANG_FORMAT_EXE
+   OR ROCPROFILER_BLACK_FORMAT_EXE
+   OR ROCPROFILER_CMAKE_FORMAT_EXE)
+    add_custom_target(format-rocprofiler)
+
+    if(NOT TARGET format)
+        add_custom_target(format)
+    endif()
+
+    foreach(_TYPE source python cmake)
+        if(NOT TARGET format-${_TYPE})
+            add_custom_target(format-${_TYPE})
+        endif()
+    endforeach()
+
+    set(rocp_sources)
+    set(rocp_headers)
+    set(rocp_cmake_files)
+    set(rocp_python_files)
+    foreach(_DIR include src plugin samples test tests-v2 script cmake_modules)
+        foreach(_TYPE headers sources cmake_files python_files)
+            set(${_TYPE})
+        endforeach()
+        file(GLOB_RECURSE headers ${PROJECT_SOURCE_DIR}/${_DIR}/*.h)
+        file(GLOB_RECURSE sources ${PROJECT_SOURCE_DIR}/${_DIR}/*.cpp)
+        file(GLOB_RECURSE cmake_files ${PROJECT_SOURCE_DIR}/${_DIR}/*CMakeLists.txt
+             ${PROJECT_SOURCE_DIR}/${_DIR}/*.cmake)
+        file(GLOB_RECURSE python_files ${PROJECT_SOURCE_DIR}/${_DIR}/*.py)
+        foreach(_TYPE headers sources cmake_files python_files)
+            list(APPEND rocp_${_TYPE} ${${_TYPE}})
+        endforeach()
+    endforeach()
+
+    if(ROCPROFILER_CLANG_FORMAT_EXE)
+        add_custom_target(
+            format-rocprofiler-source
+            ${ROCPROFILER_CLANG_FORMAT_EXE} -i ${rocp_sources} ${rocp_headers}
+            COMMENT
+                "[rocprofiler] Running source formatter ${ROCPROFILER_CLANG_FORMAT_EXE}..."
+            )
+    endif()
+
+    if(ROCPROFILER_BLACK_FORMAT_EXE)
+        add_custom_target(
+            format-rocprofiler-python
+            ${ROCPROFILER_BLACK_FORMAT_EXE} -q ${rocp_python_files}
+            COMMENT
+                "[rocprofiler] Running Python formatter ${ROCPROFILER_BLACK_FORMAT_EXE}..."
+            )
+        if(NOT TARGET format-python)
+            add_custom_target(format-python)
+        endif()
+    endif()
+
+    if(ROCPROFILER_CMAKE_FORMAT_EXE)
+        add_custom_target(
+            format-rocprofiler-cmake
+            ${ROCPROFILER_CMAKE_FORMAT_EXE} -i ${rocp_cmake_files}
+            COMMENT
+                "[rocprofiler] Running CMake formatter ${ROCPROFILER_CMAKE_FORMAT_EXE}..."
+            )
+        if(NOT TARGET format-cmake)
+            add_custom_target(format-cmake)
+        endif()
+    endif()
+
+    foreach(_TYPE source python cmake)
+        if(TARGET format-rocprofiler-${_TYPE})
+            add_dependencies(format-rocprofiler format-rocprofiler-${_TYPE})
+            add_dependencies(format-${_TYPE} format-rocprofiler-${_TYPE})
+        endif()
+    endforeach()
+
+    foreach(_TYPE source python)
+        if(TARGET format-rocprofiler-${_TYPE})
+            add_dependencies(format format-rocprofiler-${_TYPE})
+        endif()
+    endforeach()
+else()
+    message(
+        STATUS
+            "no formatting tools (clang-format-11/black/cmake-format) could be found; formatting build targets are not available."
+        )
+endif()
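The aggregate targets registered above are driven through the build tool. A minimal usage sketch (assuming a build tree already configured in `build/`; target names as defined in this module):

    # run the source and Python formatters hooked into the umbrella `format` target
    cmake --build build --target format
    # or restrict to one class of files, e.g. the Python sources gathered by the globs above
    cmake --build build --target format-python
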
diff --git a/cmake_modules/rocprofiler_linting.cmake b/cmake_modules/rocprofiler_linting.cmake
new file mode 100644
index 00000000..3e00aff9
--- /dev/null
+++ b/cmake_modules/rocprofiler_linting.cmake
@@ -0,0 +1,30 @@
+include_guard(DIRECTORY)
+
+# ----------------------------------------------------------------------------------------#
+#
+# Clang Tidy
+#
+# ----------------------------------------------------------------------------------------#
+
+if(ROCPROFILER_ENABLE_CLANG_TIDY)
+    find_program(ROCPROFILER_CLANG_TIDY_COMMAND NAMES clang-tidy)
+
+    if(NOT ROCPROFILER_CLANG_TIDY_COMMAND)
+        message(
+            WARNING "ROCPROFILER_ENABLE_CLANG_TIDY is ON but clang-tidy was not found!")
+        set(ROCPROFILER_ENABLE_CLANG_TIDY OFF)
+    else()
+        set(CMAKE_CXX_CLANG_TIDY ${ROCPROFILER_CLANG_TIDY_COMMAND}
+            -header-filter=${PROJECT_SOURCE_DIR}/.*)
+
+        # Create a preprocessor definition that depends on .clang-tidy content so the
+        # compile command will change when .clang-tidy changes. This ensures that a
+        # subsequent build re-runs clang-tidy on all sources even if they do not otherwise
+        # need to be recompiled. Nothing actually uses this definition. We add it to
+        # targets on which we run clang-tidy just to get the build dependency on the
+        # .clang-tidy file.
+        file(SHA1 ${PROJECT_SOURCE_DIR}/.clang-tidy clang_tidy_sha1)
+        set(CLANG_TIDY_DEFINITIONS "CLANG_TIDY_SHA1=${clang_tidy_sha1}")
+        unset(clang_tidy_sha1)
+    endif()
+endif()
diff --git a/cmake_modules/rocprofiler_options.cmake b/cmake_modules/rocprofiler_options.cmake
new file mode 100644
index 00000000..41135b5a
--- /dev/null
+++ b/cmake_modules/rocprofiler_options.cmake
@@ -0,0 +1,139 @@
+if("${PROJECT_SOURCE_DIR}" STREQUAL "${PROJECT_BINARY_DIR}")
+    message(STATUS "")
+    message(STATUS "rocprofiler does not support in-source builds.")
+    message(STATUS "Delete CMakeCache.txt and CMakeFiles in ${PROJECT_SOURCE_DIR}")
+    message(STATUS "and run cmake with `-B <build-directory>`")
+    message(STATUS "")
+    message(FATAL_ERROR "In-source build detected.")
+endif()
+
+option(ROCPROFILER_BUILD_TESTS "Enable building the tests" OFF)
+option(ROCPROFILER_BUILD_SAMPLES "Enable building the code samples" OFF)
+
+# CLI and FILE plugins are always built
+foreach(_PLUGIN "ATT" "CTF" "PERFETTO")
+    option(ROCPROFILER_BUILD_PLUGIN_${_PLUGIN} "Enable building the ${_PLUGIN} plugin" ON)
+endforeach()
+
+option(ROCPROFILER_DEBUG_TRACE "Enable debug tracing" OFF)
+mark_as_advanced(ROCPROFILER_DEBUG_TRACE)
+
+option(ROCPROFILER_LD_AQLPROFILE "Enable direct loading of AQL-profile HSA extension" OFF)
+mark_as_advanced(ROCPROFILER_LD_AQLPROFILE)
+
+option(ROCPROFILER_BUILD_CI "Enable continuous integration additions" OFF)
+mark_as_advanced(ROCPROFILER_BUILD_CI)
+
+option(ROCPROFILER_ENABLE_CLANG_TIDY "Enable clang-tidy checks" OFF)
+mark_as_advanced(ROCPROFILER_ENABLE_CLANG_TIDY)
+
+set(ROCPROFILER_BUILD_TYPES "Release" "RelWithDebInfo" "Debug" "MinSizeRel" "Coverage")
+
+# export compile commands in the project
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+
+if(NOT CMAKE_BUILD_TYPE)
+    set(CMAKE_BUILD_TYPE
+        "Release"
+        CACHE STRING "Build type" FORCE)
+endif()
+
+if(NOT CMAKE_BUILD_TYPE IN_LIST ROCPROFILER_BUILD_TYPES)
+    message(
+        FATAL_ERROR
+            "Unsupported build type '${CMAKE_BUILD_TYPE}'. 
Options: ${ROCPROFILER_BUILD_TYPES}" + ) +endif() + +if(ROCPROFILER_BUILD_CI) + foreach(_BUILD_TYPE ${ROCPROFILER_BUILD_TYPES}) + string(TOUPPER "${_BUILD_TYPE}" _BUILD_TYPE) + + # remove NDEBUG preprocessor def so that asserts are triggered + string(REGEX REPLACE ".DNDEBUG" "" CMAKE_C_FLAGS_${_BUILD_TYPE} + "${CMAKE_C_FLAGS_${_BUILD_TYPE}}") + string(REGEX REPLACE ".DNDEBUG" "" CMAKE_CXX_FLAGS_${_BUILD_TYPE} + "${CMAKE_CXX_FLAGS_${_BUILD_TYPE}}") + endforeach() +endif() + +if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME) + set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "${ROCPROFILER_BUILD_TYPES}") +endif() + +set(ROCPROFILER_MEMCHECK + "" + CACHE STRING "Memory checker type") +mark_as_advanced(ROCPROFILER_MEMCHECK) + +# ASAN is defined by testing team on Jenkins +if(ASAN) + set(ROCPROFILER_MEMCHECK + "AddressSanitizer" + CACHE STRING "Memory checker type (forced by ASAN defined)" FORCE) +endif() + +set(ROCPROFILER_MEMCHECK_TYPES "ThreadSanitizer" "AddressSanitizer" "LeakSanitizer" + "MemorySanitizer" "UndefinedBehaviorSanitizer") + +if(ROCPROFILER_MEMCHECK AND NOT ROCPROFILER_MEMCHECK IN_LIST ROCPROFILER_MEMCHECK_TYPES) + message( + FATAL_ERROR + "Unsupported memcheck type '${ROCPROFILER_MEMCHECK}'. Options: ${ROCPROFILER_MEMCHECK_TYPES}" + ) +endif() + +set_property(CACHE ROCPROFILER_MEMCHECK PROPERTY STRINGS "${ROCPROFILER_MEMCHECK_TYPES}") + +add_library(rocprofiler-memcheck INTERFACE) +add_library(rocprofiler::memcheck ALIAS rocprofiler-memcheck) + +function(rocprofiler_add_memcheck_flags _TYPE) + target_compile_options( + rocprofiler-memcheck INTERFACE $) + target_link_options(rocprofiler-memcheck INTERFACE + $) +endfunction() + +function(rocprofiler_set_memcheck_env _TYPE _LIB_BASE) + set(_LIBS ${_LIB_BASE}) + foreach(_N 6 5 4 3 2 1 0) + list( + APPEND _LIBS + ${CMAKE_SHARED_LIBRARY_PREFIX}${_LIB_BASE}${CMAKE_SHARED_LIBRARY_SUFFIX}.${_N} + ) + endforeach() + foreach(_LIB ${_LIBS}) + if(NOT ${_TYPE}_LIBRARY) + find_library(${_TYPE}_LIBRARY NAMES ${_LIB} ${ARGN}) + endif() + endforeach() + + target_link_libraries(rocprofiler-memcheck INTERFACE ${_LIB_BASE}) + + if(${_TYPE}_LIBRARY) + set(ROCPROFILER_MEMCHECK_PRELOAD_ENV + "LD_PRELOAD=${${_TYPE}_LIBRARY};LD_LIBRARY_PATH=${PROJECT_BINARY_DIR}/lib:$ENV{LD_LIBRARY_PATH}" + CACHE INTERNAL "LD_PRELOAD env variable for tests" FORCE) + endif() +endfunction() + +# always unset so that it doesn't preload if memcheck disabled +unset(ROCPROFILER_MEMCHECK_PRELOAD_ENV CACHE) + +if(ROCPROFILER_MEMCHECK STREQUAL "AddressSanitizer") + rocprofiler_add_memcheck_flags("address") + rocprofiler_set_memcheck_env("${ROCPROFILER_MEMCHECK}" "asan") +elseif(ROCPROFILER_MEMCHECK STREQUAL "LeakSanitizer") + rocprofiler_add_memcheck_flags("leak") + rocprofiler_set_memcheck_env("${ROCPROFILER_MEMCHECK}" "lsan") +elseif(ROCPROFILER_MEMCHECK STREQUAL "MemorySanitizer") + rocprofiler_add_memcheck_flags("memory") +elseif(ROCPROFILER_MEMCHECK STREQUAL "ThreadSanitizer") + rocprofiler_add_memcheck_flags("thread") + rocprofiler_set_memcheck_env("${ROCPROFILER_MEMCHECK}" "tsan") +elseif(ROCPROFILER_MEMCHECK STREQUAL "UndefinedBehaviorSanitizer") + rocprofiler_add_memcheck_flags("undefined") + rocprofiler_set_memcheck_env("${ROCPROFILER_MEMCHECK}" "ubsan") +endif() diff --git a/cmake_modules/utils.cmake b/cmake_modules/rocprofiler_utils.cmake similarity index 94% rename from cmake_modules/utils.cmake rename to cmake_modules/rocprofiler_utils.cmake index f1f85656..9b0607f4 100644 --- a/cmake_modules/utils.cmake +++ b/cmake_modules/rocprofiler_utils.cmake @@ -22,7 +22,7 @@ 
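 # For illustration (hypothetical call): rocprofiler_parse_version("1.2.3-45-gabc")
 # would leave 1, 2 and 3 in the major, minor and patch variables described below.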
# Parses the VERSION_STRING variable and places the first, second and third number values # in the major, minor and patch variables. -function(parse_version VERSION_STRING) +function(rocprofiler_parse_version VERSION_STRING) string(FIND ${VERSION_STRING} "-" STRING_INDEX) @@ -72,9 +72,9 @@ endfunction() # Gets the current version of the repository using versioning tags and git describe. # Passes back a packaging version string and a library version string. -function(get_version DEFAULT_VERSION_STRING) +function(rocprofiler_get_version DEFAULT_VERSION_STRING) - parse_version(${DEFAULT_VERSION_STRING}) + rocprofiler_parse_version(${DEFAULT_VERSION_STRING}) find_program(GIT NAMES git) @@ -89,7 +89,7 @@ function(get_version DEFAULT_VERSION_STRING) if(${RESULT} EQUAL 0) - parse_version(${GIT_TAG_STRING}) + rocprofiler_parse_version(${GIT_TAG_STRING}) endif() diff --git a/plugin/CMakeLists.txt b/plugin/CMakeLists.txt index 03201d40..8dad8387 100644 --- a/plugin/CMakeLists.txt +++ b/plugin/CMakeLists.txt @@ -20,8 +20,22 @@ # IN THE SOFTWARE. ################################################################################ -add_subdirectory(file) -add_subdirectory(perfetto) -add_subdirectory(ctf) -add_subdirectory(att) +if(ROCPROFILER_BUILD_CODECOV) + set(CMAKE_BUILD_TYPE "Coverage") +endif() + +# these two "native" plugins are always built add_subdirectory(cli) +add_subdirectory(file) + +if(ROCPROFILER_BUILD_PLUGIN_PERFETTO) + add_subdirectory(perfetto) +endif() + +if(ROCPROFILER_BUILD_PLUGIN_CTF) + add_subdirectory(ctf) +endif() + +if(ROCPROFILER_BUILD_PLUGIN_ATT) + add_subdirectory(att) +endif() diff --git a/plugin/att/att.py b/plugin/att/att.py index cf1778f9..7c9251e7 100755 --- a/plugin/att/att.py +++ b/plugin/att/att.py @@ -1,5 +1,6 @@ #!/usr/bin/env python3 import sys + if sys.version_info[0] < 3: raise Exception("Must be using Python 3") @@ -18,23 +19,33 @@ try: from mpi4py import MPI + MPI_IMPORTED = True except: MPI_IMPORTED = False + class PerfEvent(ctypes.Structure): _fields_ = [ - ('time', c_uint64), - ('event0', c_uint16), - ('event1', c_uint16), - ('event2', c_uint16), - ('event3', c_uint16), - ('cu', c_uint8), - ('bank', c_uint8), + ("time", c_uint64), + ("event0", c_uint16), + ("event1", c_uint16), + ("event2", c_uint16), + ("event3", c_uint16), + ("cu", c_uint8), + ("bank", c_uint8), ] + def toTuple(self): - return (int(self.time), int(self.event0), int(self.event1), - int(self.event2), int(self.event3), int(self.cu), int(self.bank)) + return ( + int(self.time), + int(self.event0), + int(self.event1), + int(self.event2), + int(self.event3), + int(self.cu), + int(self.bank), + ) class CodeWrapped(ctypes.Structure): @@ -64,44 +75,41 @@ class ReturnAssemblyInfo(ctypes.Structure): class Wave(ctypes.Structure): _fields_ = [ - ('simd', ctypes.c_uint64), - ('wave_id', ctypes.c_uint64), - ('begin_time', ctypes.c_uint64), # Begin and end cycle - ('end_time', ctypes.c_uint64), - + ("simd", ctypes.c_uint64), + ("wave_id", ctypes.c_uint64), + ("begin_time", ctypes.c_uint64), # Begin and end cycle + ("end_time", ctypes.c_uint64), # total VMEM/FLAT/LDS/SMEM instructions issued # total issued memory instructions - ('num_mem_instrs', ctypes.c_uint64), + ("num_mem_instrs", ctypes.c_uint64), # total issued instructions (compute + memory) - ('num_issued_instrs', ctypes.c_uint64), - ('num_valu_instrs', ctypes.c_uint64), - ('num_valu_stalls', ctypes.c_uint64), + ("num_issued_instrs", ctypes.c_uint64), + ("num_valu_instrs", ctypes.c_uint64), + ("num_valu_stalls", ctypes.c_uint64), # VMEM Pipeline: 
instrs and stalls - ('num_vmem_instrs', ctypes.c_uint64), - ('num_vmem_stalls', ctypes.c_uint64), + ("num_vmem_instrs", ctypes.c_uint64), + ("num_vmem_stalls", ctypes.c_uint64), # FLAT instrs and stalls - ('num_flat_instrs', ctypes.c_uint64), - ('num_flat_stalls', ctypes.c_uint64), - + ("num_flat_instrs", ctypes.c_uint64), + ("num_flat_stalls", ctypes.c_uint64), # LDS instr and stalls - ('num_lds_instrs', ctypes.c_uint64), - ('num_lds_stalls', ctypes.c_uint64), - + ("num_lds_instrs", ctypes.c_uint64), + ("num_lds_stalls", ctypes.c_uint64), # SCA instrs stalls - ('num_salu_instrs', ctypes.c_uint64), - ('num_smem_instrs', ctypes.c_uint64), - ('num_salu_stalls', ctypes.c_uint64), - ('num_smem_stalls', ctypes.c_uint64), - + ("num_salu_instrs", ctypes.c_uint64), + ("num_smem_instrs", ctypes.c_uint64), + ("num_salu_stalls", ctypes.c_uint64), + ("num_smem_stalls", ctypes.c_uint64), # Branch - ('num_branch_instrs', ctypes.c_uint64), - ('num_branch_taken_instrs', ctypes.c_uint64), - ('num_branch_stalls', ctypes.c_uint64), + ("num_branch_instrs", ctypes.c_uint64), + ("num_branch_taken_instrs", ctypes.c_uint64), + ("num_branch_stalls", ctypes.c_uint64), + ("timeline_array", POINTER(ctypes.c_int64)), + ("instructions_array", POINTER(ctypes.c_int64)), + ("timeline_size", ctypes.c_uint64), + ("instructions_size", ctypes.c_uint64), + ] - ('timeline_array', POINTER(ctypes.c_int64)), - ('instructions_array', POINTER(ctypes.c_int64)), - ('timeline_size', ctypes.c_uint64), - ('instructions_size', ctypes.c_uint64)] class PythonWave: def __init__(self, source_wave): @@ -110,20 +118,26 @@ def __init__(self, source_wave): self.timeline_array = None self.instructions_array = None + # Flags : # IS_NAVI = 0x1 class ReturnInfo(ctypes.Structure): - _fields_ = [('num_waves', ctypes.c_uint64), - ('wavedata', POINTER(Wave)), - ('num_events', ctypes.c_uint64), - ('perfevents', POINTER(PerfEvent)), - ('occupancy', POINTER(ctypes.c_uint64)), - ('num_occupancy', ctypes.c_uint64), - ('flags', ctypes.c_uint64)] - -rocprofv2_att_lib = os.getenv('ROCPROFV2_ATT_LIB_PATH') + _fields_ = [ + ("num_waves", ctypes.c_uint64), + ("wavedata", POINTER(Wave)), + ("num_events", ctypes.c_uint64), + ("perfevents", POINTER(PerfEvent)), + ("occupancy", POINTER(ctypes.c_uint64)), + ("num_occupancy", ctypes.c_uint64), + ("flags", ctypes.c_uint64), + ] + + +rocprofv2_att_lib = os.getenv("ROCPROFV2_ATT_LIB_PATH") if rocprofv2_att_lib is None: - print("ATT Lib path not set. Use export ROCPROFV2_ATT_LIB_PATH=/path/to/librocprofv2_att.so") + print( + "ATT Lib path not set. 
Use export ROCPROFV2_ATT_LIB_PATH=/path/to/librocprofv2_att.so" + ) quit() path_to_parser = os.path.abspath(rocprofv2_att_lib) SO = CDLL(path_to_parser) @@ -133,18 +147,19 @@ class ReturnInfo(ctypes.Structure): SO.wrapped_parse_binary.argtypes = [ctypes.c_char_p, ctypes.c_char_p] SO.wrapped_parse_binary.restype = ReturnAssemblyInfo + def parse_binary(filename, kernel=None): - if kernel is None or kernel == '': + if kernel is None or kernel == "": kernel = ctypes.c_char_p(0) - print('Parsing all kernels') + print("Parsing all kernels") else: - with open(glob.glob(kernel)[0], 'r') as file: + with open(glob.glob(kernel)[0], "r") as file: kernel = file.readlines() - print('Parsing kernel:', kernel[0].split(': ')[0]) - kernel = kernel[0].split(': ')[1].split('.kd')[0] - kernel = str(kernel).encode('utf-8') + print("Parsing kernel:", kernel[0].split(": ")[0]) + kernel = kernel[0].split(": ")[1].split(".kd")[0] + kernel = str(kernel).encode("utf-8") filename = os.path.abspath(str(filename)) - info = SO.wrapped_parse_binary(str(filename).encode('utf-8'), kernel) + info = SO.wrapped_parse_binary(str(filename).encode("utf-8"), kernel) code = [] for k in range(info.code_len): @@ -169,23 +184,32 @@ def parse_binary(filename, kernel=None): def getWaves_binary(name, shader_engine_data_dict, target_cu, depth): filename = os.path.abspath(str(name)) - info = SO.AnalyseBinary(filename.encode('utf-8'), target_cu, False) + info = SO.AnalyseBinary(filename.encode("utf-8"), target_cu, False) waves = [info.wavedata[k] for k in range(info.num_waves)] events = [deepcopy(info.perfevents[k]) for k in range(info.num_events)] occupancy = [int(info.occupancy[k]) for k in range(int(info.num_occupancy))] - flags = 'navi' if (info.flags & 0x1) else 'vega' + flags = "navi" if (info.flags & 0x1) else "vega" wave_slot_count = [[0 for k in range(20)] for j in range(4)] waves_python = [] for wave in waves: - if wave_slot_count[wave.simd][wave.wave_id] >= depth or wave.instructions_size == 0: + if ( + wave_slot_count[wave.simd][wave.wave_id] >= depth + or wave.instructions_size == 0 + ): continue wave_slot_count[wave.simd][wave.wave_id] += 1 pwave = PythonWave(wave) - pwave.timeline = [(wave.timeline_array[2*k], wave.timeline_array[2*k+1]) for k in range(wave.timeline_size)] - pwave.instructions = [tuple([wave.instructions_array[4*k+m] for m in range(4)]) for k in range(wave.instructions_size)] - waves_python.append( pwave ) + pwave.timeline = [ + (wave.timeline_array[2 * k], wave.timeline_array[2 * k + 1]) + for k in range(wave.timeline_size) + ] + pwave.instructions = [ + tuple([wave.instructions_array[4 * k + m] for m in range(4)]) + for k in range(wave.instructions_size) + ] + waves_python.append(pwave) shader_engine_data_dict[name] = (waves_python, events, occupancy, flags) @@ -233,31 +257,31 @@ def persist(trace_file, SIMD): instructions.append(wave.instructions) df = { - 'name': [trace for _ in range(len(begin_time))], - 'id': [i for i in range(len(begin_time))], - 'simd': simds, - 'wave_slot': waves, - 'begin_time': begin_time, - 'end_time': end_time, - 'mem_ins': mem_ins, - 'issued_ins': issued_ins, - 'valu_ins': valu_ins, - 'valu_stalls': valu_stalls, - 'vmem_ins': vmem_ins, - 'vmem_stalls': vmem_stalls, - 'flat_ins': flat_ins, - 'flat_stalls': flat_stalls, - 'lds_ins': lds_ins, - 'lds_stalls': lds_stalls, - 'salu_ins': salu_ins, - 'salu_stalls': salu_stalls, - 'smem_ins': smem_ins, - 'smem_stalls': smem_stalls, - 'br_ins': br_ins, - 'br_taken_ins': br_taken_ins, - 'br_stalls': br_stalls, - 'timeline': timeline, - 
'instructions': instructions, + "name": [trace for _ in range(len(begin_time))], + "id": [i for i in range(len(begin_time))], + "simd": simds, + "wave_slot": waves, + "begin_time": begin_time, + "end_time": end_time, + "mem_ins": mem_ins, + "issued_ins": issued_ins, + "valu_ins": valu_ins, + "valu_stalls": valu_stalls, + "vmem_ins": vmem_ins, + "vmem_stalls": vmem_stalls, + "flat_ins": flat_ins, + "flat_stalls": flat_stalls, + "lds_ins": lds_ins, + "lds_stalls": lds_stalls, + "salu_ins": salu_ins, + "salu_stalls": salu_stalls, + "smem_ins": smem_ins, + "smem_stalls": smem_stalls, + "br_ins": br_ins, + "br_taken_ins": br_taken_ins, + "br_stalls": br_stalls, + "timeline": timeline, + "instructions": instructions, } return df @@ -271,68 +295,85 @@ def mem_max(array): mem_dict[inst[0]][0] = max(mem_dict[inst[0]][0], inst[1]) except: mem_dict[inst[0]] = inst[1:] - assert(mem_dict[inst[0]][1] == inst[2]) + assert mem_dict[inst[0]][1] == inst[2] return mem_dict + def lgk(count): - return 'lgkmcnt({0})'.format(count) + return "lgkmcnt({0})".format(count) + + def vmc(count): - return 'vmcnt({0})'.format(count) + return "vmcnt({0})".format(count) + + def both_cnt(count): - return lgk(count)+' '+vmc(count) + return lgk(count) + " " + vmc(count) + def insert_waitcnt(flight_count, assembly_code): flight_count = mem_max(flight_count) for key in sorted(flight_count): line_n = key - issue_amount, waitcnt_amount, = flight_count[key] - if 'vmcnt' in assembly_code[line_n] and 'lgkmcnt' in assembly_code[line_n]: + ( + issue_amount, + waitcnt_amount, + ) = flight_count[key] + if "vmcnt" in assembly_code[line_n] and "lgkmcnt" in assembly_code[line_n]: counter_type = both_cnt - elif 'vmcnt' in assembly_code[line_n]: + elif "vmcnt" in assembly_code[line_n]: counter_type = vmc - elif 'lgkmcnt' in assembly_code[line_n]: + elif "lgkmcnt" in assembly_code[line_n]: counter_type = lgk else: - print('Error: Line mismatch') + print("Error: Line mismatch") exit(-1) - for count in range(waitcnt_amount+1, issue_amount): - print('Inserted line: '+str(line_n)) - as_index = line_n - count/(issue_amount+1) - assembly_code[as_index] = \ - '\ts_waitcnt {0}\t\t; Timing analysis.'.format(counter_type(count)) - as_index += 0.5/(issue_amount+1) - assembly_code[as_index] = '\ts_nop 0\t\t\t\t\t\t; Counters: '+str(issue_amount) + for count in range(waitcnt_amount + 1, issue_amount): + print("Inserted line: " + str(line_n)) + as_index = line_n - count / (issue_amount + 1) + assembly_code[as_index] = "\ts_waitcnt {0}\t\t; Timing analysis.".format( + counter_type(count) + ) + as_index += 0.5 / (issue_amount + 1) + assembly_code[as_index] = "\ts_nop 0\t\t\t\t\t\t; Counters: " + str( + issue_amount + ) return assembly_code def apply_min_event(min_event_time, OCCUPANCY, EVENTS, DBFILES, TIMELINES): for n, occ in enumerate(OCCUPANCY): - OCCUPANCY[n] = [max(min(int((u>>16)-min_event_time)<<16,2**42),0) | (u&0xFFFFF) for u in occ] + OCCUPANCY[n] = [ + max(min(int((u >> 16) - min_event_time) << 16, 2**42), 0) | (u & 0xFFFFF) + for u in occ + ] for perf in EVENTS: for p in perf: p.time -= min_event_time for df in DBFILES: - for T in range(len(df['timeline'])): - timeline = df['timeline'][T] + for T in range(len(df["timeline"])): + timeline = df["timeline"][T] time_acc = 0 - tuples3 = [(0,df['begin_time'][T]-min_event_time)]+[(int(t[0]),int(t[1])) for t in timeline] + tuples3 = [(0, df["begin_time"][T] - min_event_time)] + [ + (int(t[0]), int(t[1])) for t in timeline + ] for state in tuples3: - if state[1] > 1E8: - print('Warning: Time limit 
reached for ',state[0], state[1]) + if state[1] > 1e8: + print("Warning: Time limit reached for ", state[0], state[1]) break - if time_acc+state[1] > TIMELINES[state[0]].size: - TIMELINES[state[0]] = np.hstack([ - TIMELINES[state[0]], - np.zeros_like(TIMELINES[state[0]]) - ]) - TIMELINES[state[0]][time_acc:time_acc+state[1]] += 1 + if time_acc + state[1] > TIMELINES[state[0]].size: + TIMELINES[state[0]] = np.hstack( + [TIMELINES[state[0]], np.zeros_like(TIMELINES[state[0]])] + ) + TIMELINES[state[0]][time_acc : time_acc + state[1]] += 1 time_acc += state[1] + if __name__ == "__main__": comm = None mpi_root = True @@ -344,25 +385,41 @@ def apply_min_event(min_event_time, OCCUPANCY, EVENTS, DBFILES, TIMELINES): else: mpi_root = comm.Get_rank() == 0 except: - print('Could not load MPI') + print("Could not load MPI") comm = None - pathenv = os.getenv('OUTPUT_PATH') + pathenv = os.getenv("OUTPUT_PATH") if pathenv is None: pathenv = "." parser = argparse.ArgumentParser() - parser.add_argument("assembly_code", help="Path to the assembly code. Must be the first parameter.") - parser.add_argument("--depth", help="Maximum number of parsed waves per slot", default=100, type=int) - parser.add_argument("--trace_file", help="Filter for trace files", default=None, type=str) - parser.add_argument("--att_kernel", help="Kernel file", - type=str, default=pathenv+'/*_kernel.txt') + parser.add_argument( + "assembly_code", help="Path to the assembly code. Must be the first parameter." + ) + parser.add_argument( + "--depth", help="Maximum number of parsed waves per slot", default=100, type=int + ) + parser.add_argument( + "--trace_file", help="Filter for trace files", default=None, type=str + ) + parser.add_argument( + "--att_kernel", help="Kernel file", type=str, default=pathenv + "/*_kernel.txt" + ) parser.add_argument("--ports", help="Server and websocket ports, default: 8000,18000") - parser.add_argument("--genasm", - help="Generate post-processed asm file at this path", type=str, default="") - parser.add_argument("--mode", help='''ATT analysis modes:\n + parser.add_argument( + "--genasm", + help="Generate post-processed asm file at this path", + type=str, + default="", + ) + parser.add_argument( + "--mode", + help="""ATT analysis modes:\n off: Only run ATT collection, disable analysis.\n file: dump json files to disk.\n - network: Open att server over the network.''', type=str, default="off") + network: Open att server over the network.""", + type=str, + default="off", + ) args = parser.parse_args() CSV_MODE = False @@ -370,38 +427,38 @@ def apply_min_event(min_event_time, OCCUPANCY, EVENTS, DBFILES, TIMELINES): CSV_MODE = True elif args.mode.lower() == 'file': args.dumpfiles = True - elif args.mode.lower() == 'network': + elif args.mode.lower() == "network": args.dumpfiles = False else: - print('Skipping analysis.') + print("Skipping analysis.") quit() - with open(os.getenv("COUNTERS_PATH"), 'r') as f: - lines = [l.split('//')[0] for l in f.readlines()] + with open(os.getenv("COUNTERS_PATH"), "r") as f: + lines = [l.split("//")[0] for l in f.readlines()] EVENT_NAMES = [] - clean = lambda x: x.split('=')[1].split(' ')[0].split('\n')[0] + clean = lambda x: x.split("=")[1].split(" ")[0].split("\n")[0] for line in lines: - if 'PERFCOUNTER_ID=' in line: - EVENT_NAMES += ['id: '+clean(line)] - elif 'att: TARGET_CU' in line: + if "PERFCOUNTER_ID=" in line: + EVENT_NAMES += ["id: " + clean(line)] + elif "att: TARGET_CU" in line: args.target_cu = int(clean(line)) for line in lines: - if 'PERFCOUNTER=' in line: - 
EVENT_NAMES += [clean(line).split('SQ_')[1].lower()] + if "PERFCOUNTER=" in line: + EVENT_NAMES += [clean(line).split("SQ_")[1].lower()] if args.target_cu is None: args.target_cu = 1 att_kernel = glob.glob(args.att_kernel) if len(att_kernel) == 0: - print('Could not find att output kernel:', args.att_kernel) + print("Could not find att output kernel:", args.att_kernel) exit(1) elif len(att_kernel) > 1: if mpi_root: - print('Found multiple kernel matching given filters:') + print("Found multiple kernel matching given filters:") for n, k in enumerate(att_kernel): - print('\t', n, '->', k) + print("\t", n, "->", k) bValid = False while bValid == False: @@ -411,7 +468,7 @@ def apply_min_event(min_event_time, OCCUPANCY, EVENTS, DBFILES, TIMELINES): except KeyboardInterrupt: exit(0) except: - print('Invalid option.') + print("Invalid option.") if comm is not None: args.att_kernel = comm.bcast(args.att_kernel, root=0) else: @@ -429,13 +486,13 @@ def apply_min_event(min_event_time, OCCUPANCY, EVENTS, DBFILES, TIMELINES): # Trace Parsing if args.trace_file is None: - filenames = glob.glob(args.att_kernel.split('_kernel.txt')[0]+'_*.att') + filenames = glob.glob(args.att_kernel.split("_kernel.txt")[0] + "_*.att") else: filenames = glob.glob(args.trace_file) - assert(len(filenames) > 0) + assert len(filenames) > 0 if comm is not None: - filenames = filenames[comm.Get_rank()::comm.Get_size()] + filenames = filenames[comm.Get_rank() :: comm.Get_size()] code = jumps = None if mpi_root: @@ -443,7 +500,7 @@ def apply_min_event(min_event_time, OCCUPANCY, EVENTS, DBFILES, TIMELINES): code, jumps = parse_binary(args.assembly_code, None if bIsAuto else args.att_kernel) DBFILES = [] - TIMELINES = [np.zeros(int(1E4),dtype=np.int16) for k in range(5)] + TIMELINES = [np.zeros(int(1e4), dtype=np.int16) for k in range(5)] EVENTS = [] OCCUPANCY = [] GFXV = [] @@ -479,16 +536,16 @@ def apply_min_event(min_event_time, OCCUPANCY, EVENTS, DBFILES, TIMELINES): gc.collect() min_event_time = 2**62 for df in DBFILES: - if len(df['begin_time']) > 0: - min_event_time = min(min_event_time, np.min(df['begin_time'])) + if len(df["begin_time"]) > 0: + min_event_time = min(min_event_time, np.min(df["begin_time"])) for perf in EVENTS: for p in perf: min_event_time = min(min_event_time, p.time) for occ in OCCUPANCY: - min_event_time = min(min_event_time, np.min(np.array(occ)>>16)) + min_event_time = min(min_event_time, np.min(np.array(occ) >> 16)) gc.collect() - min_event_time = max(0, min_event_time-32) + min_event_time = max(0, min_event_time - 32) if comm is not None: min_event_time = comm.reduce(min_event_time, op=MPI.MIN) min_event_time = comm.bcast(min_event_time, root=0) @@ -513,14 +570,17 @@ def apply_min_event(min_event_time, OCCUPANCY, EVENTS, DBFILES, TIMELINES): OCCUPANCY = [e for elem in OCCUPANCY for e in elem] gathered_filenames = [e for elem in gathered_filenames for e in elem] gfxv = [e for elem in GFXV for e in elem][0] - + TIMELINES_GATHER = TIMELINES - TIMELINES = [np.zeros((np.max([len(tm[k]) for tm in TIMELINES])), np.int16) for k in range(5)] + TIMELINES = [ + np.zeros((np.max([len(tm[k]) for tm in TIMELINES])), np.int16) + for k in range(5) + ] for gather in TIMELINES_GATHER: for t, m in zip(TIMELINES, gather): - t[:len(m)] += m - del(TIMELINES_GATHER) - else: # free up memory + t[: len(m)] += m + del TIMELINES_GATHER + else: # free up memory TIMELINES = [] OCCUPANCY = [] EVENTS = [] @@ -542,17 +602,49 @@ def apply_min_event(min_event_time, OCCUPANCY, EVENTS, DBFILES, TIMELINES): gc.collect() print("Min 
time:", min_event_time) - drawinfo = {'TIMELINES':TIMELINES, 'EVENTS':EVENTS, 'EVENT_NAMES':EVENT_NAMES, 'OCCUPANCY': OCCUPANCY, 'ShaderNames': gathered_filenames} + drawinfo = { + "TIMELINES": TIMELINES, + "EVENTS": EVENTS, + "EVENT_NAMES": EVENT_NAMES, + "OCCUPANCY": OCCUPANCY, + "ShaderNames": gathered_filenames, + } if args.genasm and len(args.genasm) > 0: - flight_count = view_trace(args, code, DBFILES, analysed_filenames, True, OCCUPANCY, args.dumpfiles, min_event_time, gfxv, drawinfo, comm, mpi_root) - with open(args.assembly_code, 'r') as file: + flight_count = view_trace( + args, + code, + DBFILES, + analysed_filenames, + True, + OCCUPANCY, + args.dumpfiles, + min_event_time, + gfxv, + drawinfo, + comm, + mpi_root, + ) + with open(args.assembly_code, "r") as file: lines = file.readlines() - assembly_code = {l+1.0: lines[l][:-1] for l in range(len(lines))} + assembly_code = {l + 1.0: lines[l][:-1] for l in range(len(lines))} assembly_code = insert_waitcnt(flight_count, assembly_code) - with open(args.genasm, 'w') as file: + with open(args.genasm, "w") as file: keys = sorted(assembly_code.keys()) for k in keys: - file.write(assembly_code[k]+'\n') + file.write(assembly_code[k] + "\n") else: - view_trace(args, code, DBFILES, analysed_filenames, False, OCCUPANCY, args.dumpfiles, min_event_time, gfxv, drawinfo, comm, mpi_root) + view_trace( + args, + code, + DBFILES, + analysed_filenames, + False, + OCCUPANCY, + args.dumpfiles, + min_event_time, + gfxv, + drawinfo, + comm, + mpi_root, + ) diff --git a/plugin/att/drawing.py b/plugin/att/drawing.py index 63176260..b6a5e62a 100644 --- a/plugin/att/drawing.py +++ b/plugin/att/drawing.py @@ -1,5 +1,6 @@ #!/usr/bin/env python3 import sys + if sys.version_info[0] < 3: raise Exception("Must be using Python 3") @@ -9,7 +10,8 @@ from copy import deepcopy import json -COUNTERS_MAX_CAPTURES = 1<<12 +COUNTERS_MAX_CAPTURES = 1 << 12 + class Readable: def __init__(self, jsonstring): @@ -17,19 +19,20 @@ def __init__(self, jsonstring): self.seek = 0 def read(self, length=0): - if length<=0: + if length <= 0: return self.jsonstr else: if self.seek >= len(self): self.seek = 0 return None - response = self.jsonstr[self.seek:self.seek+length] + response = self.jsonstr[self.seek : self.seek + length] self.seek += length - return bytes(response, 'utf-8') + return bytes(response, "utf-8") def __len__(self): return len(self.jsonstr) + class FileBytesIO: def __init__(self, iobytes): self.iobytes = deepcopy(iobytes) @@ -39,72 +42,103 @@ def __len__(self): return self.iobytes.getbuffer().nbytes def read(self, length=0): - if length<=0: + if length <= 0: return bytes(self.iobytes.getbuffer()) else: if self.seek >= self.iobytes.getbuffer().nbytes: self.seek = 0 return None - response = self.iobytes.getbuffer()[self.seek:self.seek+length] + response = self.iobytes.getbuffer()[self.seek : self.seek + length] self.seek += length return bytes(response) + def get_delta_time(events): try: - CUS = [[e.time for e in events if e.cu==k and e.bank==0] for k in range(16)] + CUS = [[e.time for e in events if e.cu == k and e.bank == 0] for k in range(16)] CUS = [np.asarray(c).astype(np.int64) for c in CUS if len(c) > 2] - return np.min([np.min(abs(c[1:]-c[:-1])) for c in CUS]) + return np.min([np.min(abs(c[1:] - c[:-1])) for c in CUS]) except: return 1 + def draw_wave_metrics(selections, normalize, TIMELINES, EVENTS, EVENT_NAMES): - plt.figure(figsize=(15,4)) + plt.figure(figsize=(15, 4)) delta_step = 8 - quad_delta_time = max(delta_step,int(0.5+np.min([get_delta_time(events) for 
events in EVENTS]))) - maxtime = np.max([np.max([e.time for e in events]) for events in EVENTS])/quad_delta_time+1 - - if maxtime*delta_step >= COUNTERS_MAX_CAPTURES: + quad_delta_time = max( + delta_step, int(0.5 + np.min([get_delta_time(events) for events in EVENTS])) + ) + maxtime = ( + np.max([np.max([e.time for e in events]) for events in EVENTS]) / quad_delta_time + + 1 + ) + + if maxtime * delta_step >= COUNTERS_MAX_CAPTURES: delta_step = 1 while maxtime >= COUNTERS_MAX_CAPTURES: quad_delta_time *= 2 maxtime /= 2 - maxtime = int(min(maxtime*delta_step, COUNTERS_MAX_CAPTURES)) + maxtime = int(min(maxtime * delta_step, COUNTERS_MAX_CAPTURES)) event_timeline = np.zeros((16, maxtime), dtype=np.int32) - print('Delta:', quad_delta_time) - print('Max_cycles:', maxtime*quad_delta_time*4//delta_step) + print("Delta:", quad_delta_time) + print("Max_cycles:", maxtime * quad_delta_time * 4 // delta_step) - cycles = 4*quad_delta_time//delta_step*np.arange(maxtime) - kernel = len(EVENTS)*quad_delta_time + cycles = 4 * quad_delta_time // delta_step * np.arange(maxtime) + kernel = len(EVENTS) * quad_delta_time for events in EVENTS: - for e in range(len(events)-1): - bk = events[e].bank*4 - start = events[e].time // (quad_delta_time//delta_step) - end = start+delta_step - event_timeline[bk:bk+4, start:end] += np.asarray(events[e].toTuple()[1:5])[:, None] + for e in range(len(events) - 1): + bk = events[e].bank * 4 + start = events[e].time // (quad_delta_time // delta_step) + end = start + delta_step + event_timeline[bk : bk + 4, start:end] += np.asarray( + events[e].toTuple()[1:5] + )[:, None] start = events[-1].time - event_timeline[bk:bk+4, start:start+delta_step] += \ - np.asarray(events[-1].toTuple()[1:5])[:, None] + event_timeline[bk : bk + 4, start : start + delta_step] += np.asarray( + events[-1].toTuple()[1:5] + )[:, None] - event_timeline = [np.convolve(e, [kernel for k in range(3)])[1:-1] for e in event_timeline] - #event_timeline = [e/kernel for e in event_timeline] + event_timeline = [ + np.convolve(e, [kernel for k in range(3)])[1:-1] for e in event_timeline + ] + # event_timeline = [e/kernel for e in event_timeline] if normalize: - event_timeline = [100*e/max(e.max(), 1E-5) for e in event_timeline] - - colors = ['blue', 'green', 'gray', 'red', 'orange', 'cyan', 'black', 'darkviolet', - 'yellow', 'darkred', 'pink', 'lime', 'gold', 'tan', 'aqua', 'olive'] - [plt.plot(cycles, e, '-', label=n, color=c) - for e, n, c, sel in zip(event_timeline, EVENT_NAMES, colors, selections) if sel] + event_timeline = [100 * e / max(e.max(), 1e-5) for e in event_timeline] + + colors = [ + "blue", + "green", + "gray", + "red", + "orange", + "cyan", + "black", + "darkviolet", + "yellow", + "darkred", + "pink", + "lime", + "gold", + "tan", + "aqua", + "olive", + ] + [ + plt.plot(cycles, e, "-", label=n, color=c) + for e, n, c, sel in zip(event_timeline, EVENT_NAMES, colors, selections) + if sel + ] plt.legend() if normalize: - plt.ylabel('As % of maximum') + plt.ylabel("As % of maximum") else: - plt.ylabel('Value') - plt.xlabel('Cycle') + plt.ylabel("Value") + plt.xlabel("Cycle") plt.subplots_adjust(left=0.04, right=1, top=1, bottom=0.1) figure_bytes = BytesIO() @@ -114,39 +148,56 @@ def draw_wave_metrics(selections, normalize, TIMELINES, EVENTS, EVENT_NAMES): def draw_wave_states(selections, normalize, TIMELINES): plot_indices = [1, 2, 3, 4] - STATES = [['Empty', 'Idle', 'Exec', 'Wait', 'Stall'][k] for k in plot_indices] - colors = [['gray', 'orange', 'green', 'red', 'blue'][k] for k in plot_indices] + 
STATES = [["Empty", "Idle", "Exec", "Wait", "Stall"][k] for k in plot_indices] + colors = [["gray", "orange", "green", "red", "blue"][k] for k in plot_indices] - plt.figure(figsize=(15,4)) + plt.figure(figsize=(15, 4)) maxtime = max([np.max((TIMELINES[k]!=0)*np.arange(0,TIMELINES[k].size)) for k in plot_indices]) maxtime = max(maxtime, 1) timelines = [deepcopy(TIMELINES[k][:maxtime]) for k in plot_indices] - timelines = [np.pad(t, [0, maxtime-t.size]) for t in timelines] + timelines = [np.pad(t, [0, maxtime - t.size]) for t in timelines] if normalize: - timelines = np.array(timelines) / np.maximum(np.sum(timelines,0)*1E-2,1E-7) - - trim = max(maxtime//5000,1) - cycles = np.arange(0, timelines[0].size//trim, 1)*trim - timelines = [time[:trim*(time.size//trim)].reshape((-1, trim)).mean(-1) if len(time) > 0 else cycles*0 for time in timelines] + timelines = np.array(timelines) / np.maximum(np.sum(timelines, 0) * 1e-2, 1e-7) + + trim = max(maxtime // 5000, 1) + cycles = np.arange(0, timelines[0].size // trim, 1) * trim + timelines = [ + time[: trim * (time.size // trim)].reshape((-1, trim)).mean(-1) + if len(time) > 0 + else cycles * 0 + for time in timelines + ] kernsize = 21 - kernel = np.asarray([np.exp(-abs(10*k/kernsize)) for k in range(-kernsize//2,kernsize//2+1)]) + kernel = np.asarray( + [ + np.exp(-abs(10 * k / kernsize)) + for k in range(-kernsize // 2, kernsize // 2 + 1) + ] + ) kernel /= np.sum(kernel) - timelines = [np.convolve(time, kernel)[kernsize//2:-kernsize//2] for time in timelines if len(time) > 0] + timelines = [ + np.convolve(time, kernel)[kernsize // 2 : -kernsize // 2] + for time in timelines + if len(time) > 0 + ] - [plt.plot(cycles, t, label='State '+s, linewidth=1.1, color=c) - for t, s, c, sel in zip(timelines, STATES, colors, selections) if sel] + [ + plt.plot(cycles, t, label="State " + s, linewidth=1.1, color=c) + for t, s, c, sel in zip(timelines, STATES, colors, selections) + if sel + ] plt.legend() if normalize: - plt.ylabel('Waves state %') + plt.ylabel("Waves state %") else: - plt.ylabel('Waves state total') - plt.xlabel('Cycle') + plt.ylabel("Waves state total") + plt.xlabel("Cycle") plt.ylim(-1) - plt.xlim(-maxtime//200, maxtime+maxtime//200+1) + plt.xlim(-maxtime // 200, maxtime + maxtime // 200 + 1) plt.subplots_adjust(left=0.04, right=1, top=1, bottom=0.1) figure_bytes = BytesIO() plt.savefig(figure_bytes, dpi=150) @@ -154,7 +205,7 @@ def draw_wave_states(selections, normalize, TIMELINES): def draw_occupancy(selections, normalize, OCCUPANCY, shadernames): - plt.figure(figsize=(15,4)) + plt.figure(figsize=(15, 4)) names = [] if len(OCCUPANCY) == 1: # If single SE, do occupancy per CU/WGP OCCUPANCY = [[u for u in OCCUPANCY[0] if u&0xFF==k] for k in range(16)] @@ -166,7 +217,7 @@ def draw_occupancy(selections, normalize, OCCUPANCY, shadernames): for name, occ in zip(shadernames, OCCUPANCY): occ_values = [0] occ_times = [0] - occ = [(int(u>>16), (u>>8)&0xFF, u&0xFF) for u in occ] + occ = [(int(u >> 16), (u >> 8) & 0xFF, u & 0xFF) for u in occ] current_occ = [0 for k in range(16)] for time, value, cu in occ: @@ -180,30 +231,30 @@ def draw_occupancy(selections, normalize, OCCUPANCY, shadernames): NUM_DOTS = 1500 maxtime = np.max(occ_times) - delta = max(1, maxtime//NUM_DOTS) - chart = np.zeros((maxtime//delta+1), dtype=np.float32) + delta = max(1, maxtime // NUM_DOTS) + chart = np.zeros((maxtime // delta + 1), dtype=np.float32) norm_fact = np.zeros_like(chart) for i, t in enumerate(occ_times[:-1]): - b = t//delta - e = max(b+1,occ_times[i+1]//delta) + b = t 
// delta + e = max(b + 1, occ_times[i + 1] // delta) chart[b:e] += occ_values[i] norm_fact[b:e] += 1 - chart /= np.maximum(norm_fact,1) + chart /= np.maximum(norm_fact, 1) if normalize: - chart /= max(chart.max(),1E-6) + chart /= max(chart.max(), 1e-6) - plt.plot(np.arange(chart.size)*delta, chart, label=name, linewidth=1.1) + plt.plot(np.arange(chart.size) * delta, chart, label=name, linewidth=1.1) plt.legend() if normalize: - plt.ylabel('Occupancy %') + plt.ylabel("Occupancy %") else: - plt.ylabel('Occupancy total') - plt.xlabel('Cycle') + plt.ylabel("Occupancy total") + plt.xlabel("Cycle") plt.ylim(-1) - plt.xlim(-maxtime//200, maxtime+maxtime//200+delta+1) + plt.xlim(-maxtime // 200, maxtime + maxtime // 200 + delta + 1) plt.subplots_adjust(left=0.04, right=1, top=1, bottom=0.1) figure_bytes = BytesIO() plt.savefig(figure_bytes, dpi=150) @@ -211,22 +262,26 @@ def draw_occupancy(selections, normalize, OCCUPANCY, shadernames): def GeneratePIC(drawinfo, selections=[True for k in range(16)], normalize=False): - EVENTS = drawinfo['EVENTS'] + EVENTS = drawinfo["EVENTS"] response = {} figures = {} - states, figure = draw_occupancy(selections, normalize, drawinfo['OCCUPANCY'], drawinfo['ShaderNames']) - response['occupancy.png'] = states - figures['occupancy.png'] = figure + states, figure = draw_occupancy( + selections, normalize, drawinfo["OCCUPANCY"], drawinfo["ShaderNames"] + ) + response["occupancy.png"] = states + figures["occupancy.png"] = figure - states, figure = draw_wave_states(selections, normalize, drawinfo['TIMELINES']) - response['timeline.png'] = states - figures['timeline.png'] = figure + states, figure = draw_wave_states(selections, normalize, drawinfo["TIMELINES"]) + response["timeline.png"] = states + figures["timeline.png"] = figure if len(EVENTS) > 0 and np.sum([len(e) for e in EVENTS]) > 32: - EVENT_NAMES, figure = draw_wave_metrics(selections, normalize, drawinfo['TIMELINES'], EVENTS, drawinfo['EVENT_NAMES']) - response['counters.png'] = EVENT_NAMES - figures['counters.png'] = figure + EVENT_NAMES, figure = draw_wave_metrics( + selections, normalize, drawinfo["TIMELINES"], EVENTS, drawinfo["EVENT_NAMES"] + ) + response["counters.png"] = EVENT_NAMES + figures["counters.png"] = figure return Readable(response), figures diff --git a/plugin/att/stitch.py b/plugin/att/stitch.py index e1ada4e6..0cd03bd5 100644 --- a/plugin/att/stitch.py +++ b/plugin/att/stitch.py @@ -1,5 +1,6 @@ #!/usr/bin/env python3 import sys + if sys.version_info[0] < 3: raise Exception("Must be using Python 3") @@ -54,33 +55,35 @@ # Keeps track of register states for hipcc-generated assembly class RegisterWatchList: def __init__(self, labels): - self.registers = {'v'+str(k): [[] for m in range(64)] for k in range(64)} + self.registers = {"v" + str(k): [[] for m in range(64)] for k in range(64)} for k in range(64): - self.registers['s'+str(k)] = [] + self.registers["s" + str(k)] = [] self.labels = labels def try_translate(self, tok): - if tok[0] in ['s']: + if tok[0] in ["s"]: return self.registers[self.range(tok)[0]] - elif '@' in tok: - return self.labels[tok.split('@')[0]]+1 + elif "@" in tok: + return self.labels[tok.split("@")[0]] + 1 def range(self, r): - reg = r.split(':') + reg = r.split(":") if len(reg) == 1: return reg else: - r0 = reg[0].split('[') - return [r0[0]+str(k) for k in range(int(r0[1]), int(reg[1][:-1])+1)] + r0 = reg[0].split("[") + return [r0[0] + str(k) for k in range(int(r0[1]), int(reg[1][:-1]) + 1)] def tokenize(self, line): - return [u for u in [t.split(',')[0].strip() for t 
in line.split(' ')] if len(u) > 0] + return [ + u for u in [t.split(",")[0].strip() for t in line.split(" ")] if len(u) > 0 + ] def getpc(self, line, next_line): - #print('Get pc:', line) + # print('Get pc:', line) try: - dst = line.split(' ')[1].strip() - label_dest = next_line.split(', ')[-1].split('@')[0] + dst = line.split(" ")[1].strip() + label_dest = next_line.split(", ")[-1].split("@")[0] for reg in self.range(dst): self.registers[reg].append(deepcopy(self.labels[label_dest])) except: @@ -94,7 +97,7 @@ def swappc(self, line, line_num, inst_num): popped = self.registers[self.range(src)[0]][-1] self.registers[self.range(src)[0]] = self.registers[self.range(src)[0]][:-1] - self.registers[self.range(dst)[0]].append(line_num+1) + self.registers[self.range(dst)[0]].append(line_num + 1) return popped except: return 0 @@ -111,12 +114,12 @@ def setpc(self, line, inst_num): def scratch(self, line): try: tokens = self.tokenize(line) - if '_load' in tokens[0]: + if "_load" in tokens[0]: dst = tokens[1] - src = tokens[3]+tokens[4] + src = tokens[3] + tokens[4] else: src = tokens[2] - dst = tokens[3]+tokens[4] + dst = tokens[3] + tokens[4] self.registers[dst] = self.registers[src] except: pass @@ -124,19 +127,27 @@ def scratch(self, line): def move(self, line): try: tokens = self.tokenize(line) - if tokens[2][0] in ['s', 'd'] and tokens[1][0] in ['s', 'd']: - self.registers[self.range(tokens[1])[0]] = deepcopy(self.registers[self.range(tokens[2])[0]]) + if tokens[2][0] in ["s", "d"] and tokens[1][0] in ["s", "d"]: + self.registers[self.range(tokens[1])[0]] = deepcopy( + self.registers[self.range(tokens[2])[0]] + ) except: pass def updatelane(self, line): tokens = self.tokenize(line) try: - if 'v_readlane' in tokens[0]: - self.registers[tokens[1]].append(self.registers[tokens[2]][int(tokens[3])][-1]) - self.registers[tokens[2]][int(tokens[3])] = self.registers[tokens[2]][int(tokens[3])][:-1] - elif 'v_writelane' in tokens[0]: - self.registers[tokens[1]][int(tokens[3])].append(self.registers[tokens[2]][-1]) + if "v_readlane" in tokens[0]: + self.registers[tokens[1]].append( + self.registers[tokens[2]][int(tokens[3])][-1] + ) + self.registers[tokens[2]][int(tokens[3])] = self.registers[tokens[2]][ + int(tokens[3]) + ][:-1] + elif "v_writelane" in tokens[0]: + self.registers[tokens[1]][int(tokens[3])].append( + self.registers[tokens[2]][-1] + ) self.registers[tokens[2]] = self.registers[tokens[2]][-STACK_SIZE_LIMIT:] except Exception as e: pass @@ -179,7 +190,8 @@ def updatelane(self, line): # Matches tokens in reverse order def try_match_swapped(insts, code, i, line): - return insts[i+1][1] == code[line][1] and insts[i][1] == code[line+1][1] + return insts[i + 1][1] == code[line][1] and insts[i][1] == code[line + 1][1] + FORK_NAMES = 1 # A successful parsed instruction @@ -197,7 +209,7 @@ def __init__(self): self.data = None self.name = FORK_NAMES FORK_NAMES += 1 - #print('Created new fork: ', self.name) + # print('Created new fork: ', self.name) # Try to match sequence "insts" with the branch "fork", starting at position "i" def move_down_fork(fork, insts, i): #(fork : Fork, insts : list, i : int): @@ -217,6 +229,7 @@ def move_down_fork(fork, insts, i): #(fork : Fork, insts : list, i : int): return True, i + FORK_TREE = Fork() # Check if there exists a previous wave with the same sequence of instructions executed @@ -227,7 +240,7 @@ def fromDict(insts): while i < N: tillEnd, final_pos = move_down_fork(cur_fork, insts, i) if tillEnd: - #print('Reached end') + # print('Reached end') return True, 
cur_fork i += final_pos @@ -250,7 +263,7 @@ def fromDict(insts): last_inst.forks.append(cur_fork) return False, cur_fork - print('Warning: Reached end of loop!') + print("Warning: Reached end of loop!") return False, cur_fork @@ -279,8 +292,8 @@ def stitch(insts, raw_code, jumps, gfxv, bIsAuto): SMEM_INST = [] # scalar memory - VLMEM_INST = [] # vector memory load - VSMEM_INST = [] # vector memory store + VLMEM_INST = [] # vector memory load + VSMEM_INST = [] # vector memory store FLAT_INST = [] NUM_SMEM = 0 NUM_VLMEM = 0 @@ -302,20 +315,20 @@ def stitch(insts, raw_code, jumps, gfxv, bIsAuto): line = len(code) print('Begin at:', line, c) c = list(c) - c[0] = c[0].split(';')[0].split('//')[0].strip() + c[0] = c[0].split(";")[0].split("//")[0].strip() if c[1] != 100: code.append(c) - elif ':' in c[0]: - labels[c[0].split(':')[0]] = len(code) - jump_map.append(len(code)-1) + elif ":" in c[0]: + labels[c[0].split(":")[0]] = len(code) + jump_map.append(len(code) - 1) reverse_map = [] for k, v in enumerate(jump_map): if v >= len(reverse_map): reverse_map.append(k) - jumps = {jump_map[j]+1: j for j in jumps} + jumps = {jump_map[j] + 1: j for j in jumps} # Checks if we have guaranteed ordering in memory operations smem_ordering = 0 @@ -340,7 +353,7 @@ def stitch(insts, raw_code, jumps, gfxv, bIsAuto): as_line = code[line] matched = True - next = line+1 + next = line + 1 if not bIsAuto: if '_mov_' in as_line[0]: @@ -375,25 +388,25 @@ def stitch(insts, raw_code, jumps, gfxv, bIsAuto): pcsequence.append(insts[i][2]) elif inst[1] == as_line[1]: if line in jumps: - loopCount[jumps[line]-1] += 1 + loopCount[jumps[line] - 1] += 1 num_inflight = NUM_FLAT + NUM_SMEM + NUM_VLMEM + NUM_VSMEM if inst[1] == SMEM or inst[1] == LDS: smem_ordering = 1 if inst[1] == SMEM else smem_ordering - SMEM_INST.append([reverse_map[line], num_inflight]) + SMEM_INST.append([reverse_map[line], num_inflight]) NUM_SMEM += 1 - elif inst[1] == VMEM or (inst[1] == FLAT and 'global_' in as_line[0]): + elif inst[1] == VMEM or (inst[1] == FLAT and "global_" in as_line[0]): inc_ordering = False - if 'flat_' in as_line[0]: + if "flat_" in as_line[0]: inc_ordering = True - if not bGFX9 and 'store' in as_line[0]: - VSMEM_INST.append([reverse_map[line], num_inflight]) + if not bGFX9 and "store" in as_line[0]: + VSMEM_INST.append([reverse_map[line], num_inflight]) NUM_VSMEM += 1 if inc_ordering: vsmem_ordering = 1 else: - VLMEM_INST.append([reverse_map[line], num_inflight]) + VLMEM_INST.append([reverse_map[line], num_inflight]) NUM_VLMEM += 1 if inc_ordering: vlmem_ordering = 1 @@ -401,44 +414,48 @@ def stitch(insts, raw_code, jumps, gfxv, bIsAuto): smem_ordering = 1 vlmem_ordering = 1 vsmem_ordering = 1 - FLAT_INST.append([reverse_map[line], num_inflight]) + FLAT_INST.append([reverse_map[line], num_inflight]) NUM_FLAT += 1 - elif inst[1] == IMMED and 's_waitcnt' in as_line[0]: - if 'lgkmcnt' in as_line[0]: - wait_N = int(as_line[0].split('lgkmcnt(')[1].split(')')[0]) + elif inst[1] == IMMED and "s_waitcnt" in as_line[0]: + if "lgkmcnt" in as_line[0]: + wait_N = int(as_line[0].split("lgkmcnt(")[1].split(")")[0]) flight_count.append([as_line[5], num_inflight, wait_N]) if wait_N == 0: smem_ordering = 0 if smem_ordering == 0: - offset = len(SMEM_INST)-wait_N - mem_unroll.append( [reverse_map[line], SMEM_INST[:offset]+FLAT_INST] ) + offset = len(SMEM_INST) - wait_N + mem_unroll.append( + [reverse_map[line], SMEM_INST[:offset] + FLAT_INST] + ) SMEM_INST = SMEM_INST[offset:] NUM_SMEM = len(SMEM_INST) FLAT_INST = [] NUM_FLAT = 0 else: - NUM_SMEM = 
min(max(wait_N-NUM_FLAT, 0), NUM_SMEM) - NUM_FLAT = min(max(wait_N-NUM_SMEM, 0), NUM_FLAT) + NUM_SMEM = min(max(wait_N - NUM_FLAT, 0), NUM_SMEM) + NUM_FLAT = min(max(wait_N - NUM_SMEM, 0), NUM_FLAT) num_inflight = NUM_FLAT + NUM_SMEM + NUM_VLMEM + NUM_VSMEM - if 'vmcnt' in as_line[0]: - wait_N = int(as_line[0].split('vmcnt(')[1].split(')')[0]) + if "vmcnt" in as_line[0]: + wait_N = int(as_line[0].split("vmcnt(")[1].split(")")[0]) flight_count.append([as_line[5], num_inflight, wait_N]) if wait_N == 0: vlmem_ordering = 0 if vlmem_ordering == 0: - offset = len(VLMEM_INST)-wait_N - mem_unroll.append( [reverse_map[line], VLMEM_INST[:offset]+FLAT_INST] ) + offset = len(VLMEM_INST) - wait_N + mem_unroll.append( + [reverse_map[line], VLMEM_INST[:offset] + FLAT_INST] + ) VLMEM_INST = VLMEM_INST[offset:] NUM_VLMEM = len(VLMEM_INST) FLAT_INST = [] NUM_FLAT = 0 else: - NUM_VLMEM = min(max(wait_N-NUM_FLAT, 0), NUM_VLMEM) - NUM_FLAT = min(max(wait_N-NUM_VLMEM, 0), NUM_FLAT) + NUM_VLMEM = min(max(wait_N - NUM_FLAT, 0), NUM_VLMEM) + NUM_FLAT = min(max(wait_N - NUM_VLMEM, 0), NUM_FLAT) num_inflight = NUM_FLAT + NUM_SMEM + NUM_VLMEM + NUM_VSMEM - if 'vscnt' in as_line[0] or (bGFX9 and 'vmcnt' in as_line[0]): + if "vscnt" in as_line[0] or (bGFX9 and "vmcnt" in as_line[0]): try: wait_N = int(as_line[0].split('vscnt(')[1].split(')')[0]) except: @@ -450,35 +467,37 @@ def stitch(insts, raw_code, jumps, gfxv, bIsAuto): if wait_N == 0: vsmem_ordering = 0 if vsmem_ordering == 0: - offset = len(VSMEM_INST)-wait_N - mem_unroll.append( [reverse_map[line], VSMEM_INST[:offset]+FLAT_INST] ) + offset = len(VSMEM_INST) - wait_N + mem_unroll.append( + [reverse_map[line], VSMEM_INST[:offset] + FLAT_INST] + ) VSMEM_INST = VSMEM_INST[offset:] NUM_VSMEM = len(VSMEM_INST) FLAT_INST = [] NUM_FLAT = 0 else: - NUM_VSMEM = min(max(wait_N-NUM_FLAT, 0), NUM_VSMEM) - NUM_FLAT = min(max(wait_N-NUM_VSMEM, 0), NUM_FLAT) + NUM_VSMEM = min(max(wait_N - NUM_FLAT, 0), NUM_VSMEM) + NUM_FLAT = min(max(wait_N - NUM_VSMEM, 0), NUM_FLAT) num_inflight = NUM_FLAT + NUM_SMEM + NUM_VLMEM + NUM_VSMEM elif inst[1] == JUMP and as_line[1] == BRANCH: next = jump_map[as_line[2]] if next is None or next == 0: - print('Jump to unknown location!', as_line) + print("Jump to unknown location!", as_line) break elif inst[1] == NEXT and as_line[1] == BRANCH: next = line + 1 else: matched = False next = line + 1 - if i+1 < N and line+1 < len(code): + if i + 1 < N and line + 1 < len(code): if try_match_swapped(insts, code, i, line): temp = insts[i] - insts[i] = insts[i+1] - insts[i+1] = temp + insts[i] = insts[i + 1] + insts[i + 1] = temp next = line - elif 's_waitcnt ' in as_line[0] or '_load_' in as_line[0]: - if skipped_immed > 0 and 's_waitcnt ' in as_line[0]: + elif "s_waitcnt " in as_line[0] or "_load_" in as_line[0]: + if skipped_immed > 0 and "s_waitcnt " in as_line[0]: matched = True skipped_immed -= 1 elif 'scratch_' not in as_line[0]: @@ -508,8 +527,10 @@ def stitch(insts, raw_code, jumps, gfxv, bIsAuto): pass else: while line < len(code): - if 's_endpgm' in code[line]: - mem_unroll.append( [reverse_map[line], SMEM_INST+VLMEM_INST+VSMEM_INST+FLAT_INST] ) + if "s_endpgm" in code[line]: + mem_unroll.append( + [reverse_map[line], SMEM_INST + VLMEM_INST + VSMEM_INST + FLAT_INST] + ) break line += 1 diff --git a/plugin/att/trace_view.py b/plugin/att/trace_view.py index 0539b33d..806a90bb 100755 --- a/plugin/att/trace_view.py +++ b/plugin/att/trace_view.py @@ -1,5 +1,6 @@ #!/usr/bin/env python3 import sys + if sys.version_info[0] < 3: raise Exception("Must be 
using Python 3") @@ -23,6 +24,7 @@ JSON_GLOBAL_DICTIONARY = {} + def get_ip(): s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) s.settimeout(0) @@ -31,51 +33,64 @@ def get_ip(): IPAddr = socket.gethostbyname(hostname) s.connect(({IPAddr}, 1)) except Exception: - IPAddr = '127.0.0.1' + IPAddr = "127.0.0.1" finally: return IPAddr IPAddr = get_ip() PORT, WebSocketPort = 8000, 18000 -SP = '\u00A0' +SP = "\u00A0" def get_top_n(code): TOP_N = 10 top_n = sorted(deepcopy(code), key=lambda x: x[-1], reverse=True)[:TOP_N] - return [(line_num, hitc, 0, run_time) for _, _, _, _, line_num, _, hitc, run_time in top_n] + return [ + (line_num, hitc, 0, run_time) for _, _, _, _, line_num, _, hitc, run_time in top_n + ] def wave_info(df, id): dic = { - 'Issue': df['issued_ins'][id], - 'Valu': df['valu_ins'][id], 'Valu_stall': df['valu_stalls'][id], - 'Salu': df['salu_ins'][id], 'Salu_stall': df['salu_stalls'][id], - 'Vmem': df['vmem_ins'][id], 'Vmem_stall': df['vmem_stalls'][id], - 'Smem': df['smem_ins'][id], 'Smem_stall': df['smem_stalls'][id], - 'Flat': df['flat_ins'][id], 'Flat_stall': df['flat_stalls'][id], - 'Lds': df['lds_ins'][id], 'Lds_stall': df['lds_stalls'][id], - 'Br': df['br_ins'][id], 'Br_stall': df['br_stalls'][id], + "Issue": df["issued_ins"][id], + "Valu": df["valu_ins"][id], + "Valu_stall": df["valu_stalls"][id], + "Salu": df["salu_ins"][id], + "Salu_stall": df["salu_stalls"][id], + "Vmem": df["vmem_ins"][id], + "Vmem_stall": df["vmem_stalls"][id], + "Smem": df["smem_ins"][id], + "Smem_stall": df["smem_stalls"][id], + "Flat": df["flat_ins"][id], + "Flat_stall": df["flat_stalls"][id], + "Lds": df["lds_ins"][id], + "Lds_stall": df["lds_stalls"][id], + "Br": df["br_ins"][id], + "Br_stall": df["br_stalls"][id], } - dic['Issue_stall'] = int(np.sum([dic[key] for key in dic.keys() if '_STALL' in key])) + dic["Issue_stall"] = int(np.sum([dic[key] for key in dic.keys() if "_STALL" in key])) return dic def extract_data(df, se_number): - if len(df['id']) == 0 or len(df['instructions']) == 0 or len(df['timeline']) == 0: + if len(df["id"]) == 0 or len(df["instructions"]) == 0 or len(df["timeline"]) == 0: return None wave_filenames = [] flight_count = [] - wave_slot_count = [{df['wave_slot'][wave_id]: 0 for wave_id in df['id']} for k in range(4)] - - print('Number of waves:', len(df['id'])) + wave_slot_count = [ + {df["wave_slot"][wave_id]: 0 for wave_id in df["id"]} for k in range(4) + ] + + print("Number of waves:", len(df["id"])) allwaves_maxline = 0 - for wave_id in df['id']: - stitched, loopCount, mem_unroll, count, maxline, num_insts = df['instructions'][wave_id] - timeline = df['timeline'][wave_id] + for wave_id in df["id"]: + stitched, loopCount, mem_unroll, count, maxline, num_insts = df["instructions"][ + wave_id + ] + timeline = df["timeline"][wave_id] if len(stitched) == 0 or len(timeline) == 0 or len(stitched) != num_insts: continue @@ -84,18 +99,18 @@ def extract_data(df, se_number): flight_count.append(count) wave_entry = { - "id": int(df['id'][wave_id]), - "simd": int(df['simd'][wave_id]), - "slot": int(df['wave_slot'][wave_id]), - "begin": int(df['begin_time'][wave_id]), - "end": int(df['end_time'][wave_id]), + "id": int(df["id"][wave_id]), + "simd": int(df["simd"][wave_id]), + "slot": int(df["wave_slot"][wave_id]), + "begin": int(df["begin_time"][wave_id]), + "end": int(df["end_time"][wave_id]), "info": wave_info(df, wave_id), "instructions": stitched, "timeline": timeline, - "waitcnt": mem_unroll + "waitcnt": mem_unroll, } data_obj = { - "name": 'SE'.format(se_number), + "name": 
"SE".format(se_number), "duration": sum(dur for (_, dur) in timeline), "wave": wave_entry, "loop_count": loopCount, @@ -103,26 +118,36 @@ def extract_data(df, se_number): "num_stitched": len(stitched), "num_insts": num_insts, "websocket_port": WebSocketPort, - "generation_time": time.ctime() + "generation_time": time.ctime(), } - simd_id = df['simd'][wave_id] - slot_id = df['wave_slot'][wave_id] + simd_id = df["simd"][wave_id] + slot_id = df["wave_slot"][wave_id] slot_count = wave_slot_count[simd_id][slot_id] wave_slot_count[simd_id][slot_id] += 1 - OUT = 'se'+str(se_number)+'_sm'+str(simd_id)+'_sl'+str(slot_id)+'_wv'+str(slot_count)+'.json' + OUT = ( + "se" + + str(se_number) + + "_sm" + + str(simd_id) + + "_sl" + + str(slot_id) + + "_wv" + + str(slot_count) + + ".json" + ) JSON_GLOBAL_DICTIONARY[OUT] = Readable(data_obj) - wave_filenames.append((OUT, df['begin_time'][wave_id], df['end_time'][wave_id])) + wave_filenames.append((OUT, df["begin_time"][wave_id], df["end_time"][wave_id])) data_obj = { - "name": 'SE'.format(se_number), + "name": "SE".format(se_number), "websocket_port": WebSocketPort, - "generation_time": time.ctime() + "generation_time": time.ctime(), } se_filename = None if len(wave_filenames) > 0: - se_filename = 'se'+str(se_number)+'_info.json' + se_filename = "se" + str(se_number) + "_info.json" JSON_GLOBAL_DICTIONARY[se_filename] = Readable(data_obj) return flight_count, wave_filenames, se_filename, allwaves_maxline @@ -139,36 +164,43 @@ def send_my_headers(self): self.send_header("Expires", "0") def do_GET(self): - if '.png?' in self.path and self.path.split('/')[-1] not in JSON_GLOBAL_DICTIONARY.keys(): - selections = [int(s)!=0 for s in self.path.split('.png?')[-1]] - counters_json, imagebytes = GeneratePIC(self.drawinfo, selections[1:], selections[0]) - JSON_GLOBAL_DICTIONARY['graph_options.json'] = counters_json - JSON_GLOBAL_DICTIONARY[self.path.split('/')[-1]] = imagebytes[self.path.split('/')[-1].split('?')[0]] - - if '.json' in self.path or '.png' in self.path: + if ( + ".png?" 
in self.path + and self.path.split("/")[-1] not in JSON_GLOBAL_DICTIONARY.keys() + ): + selections = [int(s) != 0 for s in self.path.split(".png?")[-1]] + counters_json, imagebytes = GeneratePIC( + self.drawinfo, selections[1:], selections[0] + ) + JSON_GLOBAL_DICTIONARY["graph_options.json"] = counters_json + JSON_GLOBAL_DICTIONARY[self.path.split("/")[-1]] = imagebytes[ + self.path.split("/")[-1].split("?")[0] + ] + + if ".json" in self.path or ".png" in self.path: try: - response_file = JSON_GLOBAL_DICTIONARY[self.path.split('/')[-1]] + response_file = JSON_GLOBAL_DICTIONARY[self.path.split("/")[-1]] except: - print('Invalid json request:', self.path) + print("Invalid json request:", self.path) print(JSON_GLOBAL_DICTIONARY.keys()) self.send_error(HTTPStatus.NOT_FOUND, "File not found") return self.send_response(HTTPStatus.OK) self.send_header("Content-Length", str(len(response_file))) - if '.b' in self.path: - self.send_header("Content-type", 'application/octet-stream') + if ".b" in self.path: + self.send_header("Content-type", "application/octet-stream") response_file = BytesIO(response_file) - elif 'timeline.png' in self.path: - self.send_header("Content-type", 'image/png') + elif "timeline.png" in self.path: + self.send_header("Content-type", "image/png") else: - self.send_header("Content-type", 'application/json') + self.send_header("Content-type", "application/json") self.send_header("Last-Modified", self.date_time_string(time.time())) self.end_headers() self.copyfile(response_file, self.wfile) - elif self.path in ['/', '/styles.css', '/index.html', '/logo.svg']: + elif self.path in ["/", "/styles.css", "/index.html", "/logo.svg"]: http.server.SimpleHTTPRequestHandler.do_GET(self) else: - print('Invalid request:', self.path) + print("Invalid request:", self.path) self.send_error(HTTPStatus.NOT_FOUND, "File not found") @@ -181,8 +213,8 @@ def server_bind(self): def run_server(drawinfo): Handler = NoCacheHTTPRequestHandler Handler.drawinfo = drawinfo - os.chdir(os.path.join(os.path.dirname(os.path.abspath(__file__)),'ui/')) - #os.chdir('ui/') + os.chdir(os.path.join(os.path.dirname(os.path.abspath(__file__)), "ui/")) + # os.chdir('ui/') try: with RocTCPServer((IPAddr, PORT), Handler) as httpd: httpd.serve_forever() @@ -191,25 +223,32 @@ def run_server(drawinfo): def fix_space(line): - line = line.replace(' ', SP) - line = line.replace('\t', SP*4) + line = line.replace(" ", SP) + line = line.replace("\t", SP * 4) return line def WebSocketserver(websocket, path): data = websocket.recv() - cpp, ln, _ = data.split(':') + cpp, ln, _ = data.split(":") ln = int(ln) - HL, EMP = 'highlight', '' + HL, EMP = "highlight", "" content = None print("loading...") try: - f = open(cpp, 'r', errors='replace') - content = ''.join('
<li class="'+(HL if ln==i else EMP)+'">'+str(i).ljust(5)+fix_space(l)+'</li>' - for i, l in enumerate(f.readlines(), 1)) + f = open(cpp, "r", errors="replace") + content = "".join( + '<li class="' + + (HL if ln == i else EMP) + + '">' + + str(i).ljust(5) + + fix_space(l) + + "
  • " + for i, l in enumerate(f.readlines(), 1) + ) except FileNotFoundError: - content = cpp + ' not found!' + content = cpp + " not found!" websocket.send(content) @@ -223,12 +262,14 @@ def run_websocket(): def assign_ports(ports): - ps = [int(port) for port in ports.split(',')] + ps = [int(port) for port in ports.split(",")] if ps[0] <= 5000 or ps[1] <= 5000: - print('Need to have port values > 5000') + print("Need to have port values > 5000") sys.exit(1) elif ps[0] == ps[1]: - print('Can not use the same port for both web server and websocket server: '+ps[0]) + print( + "Can not use the same port for both web server and websocket server: " + ps[0] + ) sys.exit(1) global IPAddr, PORT, WebSocketPort PORT, WebSocketPort = ps[0], ps[1] @@ -236,35 +277,54 @@ def assign_ports(ports): def call_picture_callback(return_dict, drawinfo): response, imagebytes = GeneratePIC(drawinfo) - return_dict['graph_options.json'] = response + return_dict["graph_options.json"] = response for k, v in imagebytes.items(): return_dict[k] = v - for n, m in enumerate(drawinfo['TIMELINES']): - return_dict['wstates'+str(n)+'.json'] = Readable({"data": [int(n) for n in list(np.asarray(m))]}) - for n, e in enumerate(drawinfo['EVENTS']): - return_dict['se'+str(n)+'_perfcounter.json'] = Readable({"data": [v.toTuple() for v in e]}) - - -def view_trace(args, code, dbnames, att_filenames, bReturnLoc, OCCUPANCY, bDumpOnly, se_time_begin, gfxv, drawinfo, MPI_COMM, mpi_root): + for n, m in enumerate(drawinfo["TIMELINES"]): + return_dict["wstates" + str(n) + ".json"] = Readable( + {"data": [int(n) for n in list(np.asarray(m))]} + ) + for n, e in enumerate(drawinfo["EVENTS"]): + return_dict["se" + str(n) + "_perfcounter.json"] = Readable( + {"data": [v.toTuple() for v in e]} + ) + + +def view_trace( + args, + code, + dbnames, + att_filenames, + bReturnLoc, + OCCUPANCY, + bDumpOnly, + se_time_begin, + gfxv, + drawinfo, + MPI_COMM, + mpi_root, +): global JSON_GLOBAL_DICTIONARY pic_thread = None if mpi_root: manager = Manager() return_dict = manager.dict() - JSON_GLOBAL_DICTIONARY['occupancy.json'] = Readable({str(k): OCCUPANCY[k] for k in range(len(OCCUPANCY))}) + JSON_GLOBAL_DICTIONARY["occupancy.json"] = Readable( + {str(k): OCCUPANCY[k] for k in range(len(OCCUPANCY))} + ) pic_thread = Process(target=call_picture_callback, args=(return_dict, drawinfo)) pic_thread.start() att_filenames = [Path(f).name for f in att_filenames] - se_numbers = [int(a.split('_se')[1].split('.att')[0]) for a in att_filenames] + se_numbers = [int(a.split("_se")[1].split(".att")[0]) for a in att_filenames] flight_count = [] simd_wave_filenames = {} se_filenames = [] allse_maxline = 0 for se_number, dbname in zip(se_numbers, dbnames): - if len(dbname['id']) == 0: + if len(dbname["id"]) == 0: continue count, wv_filenames, se_filename, maxline = extract_data(dbname, se_number) @@ -282,12 +342,15 @@ def view_trace(args, code, dbnames, att_filenames, bReturnLoc, OCCUPANCY, bDumpO JSON_GLOBAL_DICTIONARY['code.json'] = Readable({"code": code_sel, "top_n": get_top_n(code_sel)}) for key in simd_wave_filenames.keys(): - wv_array = [[ - int(s[0].split('_sm')[1].split('_sl')[0]), - int(s[0].split('_sl')[1].split('_wv')[0]), - int(s[0].split('_wv')[1].split('.')[0]), - s - ] for s in simd_wave_filenames[key]] + wv_array = [ + [ + int(s[0].split("_sm")[1].split("_sl")[0]), + int(s[0].split("_sl")[1].split("_wv")[0]), + int(s[0].split("_wv")[1].split(".")[0]), + s, + ] + for s in simd_wave_filenames[key] + ] wv_dict = {} for wv in wv_array: @@ -309,13 +372,19 @@ def 
view_trace(args, code, dbnames, att_filenames, bReturnLoc, OCCUPANCY, bDumpO simd_wave_filenames = MPI_COMM.gather(simd_wave_filenames, root=0) if mpi_root: se_filenames = [e for elem in se_filenames for e in elem] - simd_wave_filenames = {k:v for smf in simd_wave_filenames for k,v in smf.items()} + simd_wave_filenames = { + k: v for smf in simd_wave_filenames for k, v in smf.items() + } if mpi_root: - JSON_GLOBAL_DICTIONARY['filenames.json'] = Readable({"wave_filenames": simd_wave_filenames, - "se_filenames": se_filenames, - "global_begin_time": int(se_time_begin), - "gfxv": gfxv}) + JSON_GLOBAL_DICTIONARY["filenames.json"] = Readable( + { + "wave_filenames": simd_wave_filenames, + "se_filenames": se_filenames, + "global_begin_time": int(se_time_begin), + "gfxv": gfxv, + } + ) if pic_thread is not None: pic_thread.join() @@ -330,14 +399,19 @@ def view_trace(args, code, dbnames, att_filenames, bReturnLoc, OCCUPANCY, bDumpO JSON_GLOBAL_DICTIONARY = MPI_COMM.gather(JSON_GLOBAL_DICTIONARY, root=0) if not mpi_root: quit() - JSON_GLOBAL_DICTIONARY = {k:v for smf in JSON_GLOBAL_DICTIONARY for k,v in smf.items()} + JSON_GLOBAL_DICTIONARY = { + k: v for smf in JSON_GLOBAL_DICTIONARY for k, v in smf.items() + } - JSON_GLOBAL_DICTIONARY['live.json'] = Readable({'live': 1}) + JSON_GLOBAL_DICTIONARY["live.json"] = Readable({"live": 1}) if args.ports: assign_ports(args.ports) - print('serving at ports: {0},{1}'.format(PORT, WebSocketPort)) + print("serving at ports: {0},{1}".format(PORT, WebSocketPort)) try: - PROCS = [Process(target=run_server, args=[drawinfo]), Process(target=run_websocket)] + PROCS = [ + Process(target=run_server, args=[drawinfo]), + Process(target=run_websocket), + ] for p in PROCS: p.start() for p in PROCS: @@ -345,10 +419,14 @@ def view_trace(args, code, dbnames, att_filenames, bReturnLoc, OCCUPANCY, bDumpO except KeyboardInterrupt: print("Exitting.") else: - os.makedirs('ui/', exist_ok=True) + os.makedirs("ui/", exist_ok=True) if mpi_root: - JSON_GLOBAL_DICTIONARY['live.json'] = Readable({'live': 0}) - os.system('cp ' + os.path.join(os.path.abspath(os.path.dirname(__file__)),'ui') + '/* ui/' ) + JSON_GLOBAL_DICTIONARY["live.json"] = Readable({"live": 0}) + os.system( + "cp " + + os.path.join(os.path.abspath(os.path.dirname(__file__)), "ui") + + "/* ui/" + ) for k, v in JSON_GLOBAL_DICTIONARY.items(): - with open(os.path.join('ui',k), 'w' if '.json' in k else 'wb') as f: + with open(os.path.join("ui", k), "w" if ".json" in k else "wb") as f: f.write(v.read()) diff --git a/plugin/att/ui/httpserver.py b/plugin/att/ui/httpserver.py index 8e75b7be..ce3e1b35 100644 --- a/plugin/att/ui/httpserver.py +++ b/plugin/att/ui/httpserver.py @@ -1,5 +1,6 @@ #!/usr/bin/env python3 import sys + if sys.version_info[0] < 3: raise Exception("Must be using Python 3") @@ -9,6 +10,7 @@ import os import sys + class NoCacheHTTPRequestHandler(http.server.SimpleHTTPRequestHandler): def end_headers(self): self.send_my_headers() @@ -20,25 +22,28 @@ def send_my_headers(self): self.send_header("Expires", "0") def do_GET(self): - if '.png?' in self.path: - self.path = self.path.split('.png?')[0]+'.png' + if ".png?" 
in self.path: + self.path = self.path.split(".png?")[0] + ".png" http.server.SimpleHTTPRequestHandler.do_GET(self) + class RocTCPServer(socketserver.TCPServer): def server_bind(self): self.socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) self.socket.bind(self.server_address) + def run_server(): Handler = NoCacheHTTPRequestHandler - os.chdir(os.path.join(os.path.dirname(os.path.abspath(__file__)),'.')) + os.chdir(os.path.join(os.path.dirname(os.path.abspath(__file__)), ".")) try: with RocTCPServer((IPAddr, PORT), Handler) as httpd: httpd.serve_forever() except KeyboardInterrupt: pass + def get_ip(): s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) s.settimeout(0) @@ -47,16 +52,17 @@ def get_ip(): IPAddr = socket.gethostbyname(hostname) s.connect(({IPAddr}, 1)) except Exception: - IPAddr = '127.0.0.1' + IPAddr = "127.0.0.1" finally: return IPAddr + IPAddr = get_ip() PORT = 8000 if len(sys.argv) > 1: PORT = int(sys.argv[1]) -print('serving at port: {0}'.format(PORT)) +print("serving at port: {0}".format(PORT)) try: run_server() diff --git a/plugin/ctf/CMakeLists.txt b/plugin/ctf/CMakeLists.txt index c523e1e2..2ed5c287 100644 --- a/plugin/ctf/CMakeLists.txt +++ b/plugin/ctf/CMakeLists.txt @@ -42,11 +42,9 @@ set(METADATA_STREAM_FILE_DIR "${CMAKE_INSTALL_DATADIR}/${PROJECT_NAME}/plugin/ct target_compile_definitions( ctf_plugin PUBLIC AMD_INTERNAL_BUILD - PRIVATE - HIP_PROF_HIP_API_STRING=1 - __HIP_PLATFORM_AMD__=1 - CTF_PLUGIN_METADATA_FILE_PATH="${CMAKE_INSTALL_PREFIX}/${METADATA_STREAM_FILE_DIR}/metadata" - ) + PRIVATE HIP_PROF_HIP_API_STRING=1 __HIP_PLATFORM_AMD__=1 + CTF_PLUGIN_METADATA_FILE_PATH="${METADATA_STREAM_FILE_DIR}/metadata" + CTF_PLUGIN_INSTALL_PREFIX="${CMAKE_INSTALL_PREFIX}") target_include_directories( ctf_plugin PRIVATE "${PROJECT_SOURCE_DIR}" "${CMAKE_BINARY_DIR}/src/api" "${CMAKE_CURRENT_BINARY_DIR}") diff --git a/plugin/ctf/ctf.cpp b/plugin/ctf/ctf.cpp index 5a90296e..97265f38 100644 --- a/plugin/ctf/ctf.cpp +++ b/plugin/ctf/ctf.cpp @@ -18,10 +18,18 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include #include #include #include +#include +#include +#include +#include +#include +#include #include +#include #include "rocprofiler.h" #include "rocprofiler_plugin.h" @@ -49,17 +57,53 @@ ROCPROFILER_EXPORT int rocprofiler_plugin_initialize(const uint32_t rocprofiler_ return -1; } - const char* output_dir = []() -> const char* { + auto output_dir = []() -> std::string { if (const char* output_dir_internal = getenv("OUTPUT_PATH"); output_dir_internal != nullptr) { return output_dir_internal; } return "./"; }(); + auto output_file = []() -> std::string { + auto _v = getenv("OUTPUT_FILE"); + return (_v) ? _v : "trace-{PID}"; + }(); + + auto _replace = [&output_dir, &output_file](const char* _key, auto _value) { + using value_type = std::remove_cv_t>>; + auto _value_str = std::to_string(_value); + + const auto _re = std::regex{_key, std::regex_constants::icase}; + output_dir = std::regex_replace(output_dir, _re, _value_str); + output_file = std::regex_replace(output_file, _re, _value_str); + }; + + _replace("\\{PID\\}", getpid()); + _replace("\\$ENV\\{PID\\}", getpid()); + _replace("\\{PPID\\}", getppid()); + _replace("\\$ENV\\{PPID\\}", getppid()); + // Create the plugin instance. 
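Taken together, the lambdas above make the trace location templated: OUTPUT_FILE defaults to "trace-{PID}", and the {PID}/{PPID} (or $ENV{PID}/$ENV{PPID}) placeholders are expanded with std::regex, case-insensitively, before the plugin opens its output. A rough Python sketch of that substitution, for illustration only; the helper name is hypothetical, and the $ENV spellings are checked first here so the plain {PID} pattern cannot eat them:

import os
import re

def expand_placeholders(template):
    # Hypothetical mirror of the C++ _replace lambda above; substitutions
    # are case-insensitive, matching std::regex_constants::icase.
    for pattern, value in [
        (r"\$ENV\{PID\}", os.getpid()),
        (r"\$ENV\{PPID\}", os.getppid()),
        (r"\{PID\}", os.getpid()),
        (r"\{PPID\}", os.getppid()),
    ]:
        template = re.sub(pattern, str(value), template, flags=re.IGNORECASE)
    return template

print(expand_placeholders("trace-{PID}"))  # e.g. trace-12345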
+ auto* this_plugin_handle = dlopen("libctf_plugin.so", RTLD_LAZY | RTLD_NOLOAD); + auto* librocprofiler_handle = dlopen("librocprofiler64.so", RTLD_LAZY | RTLD_NOLOAD); + auto metadata_path = std::string{CTF_PLUGIN_METADATA_FILE_PATH}; + struct link_map* _link_map = nullptr; + if (this_plugin_handle && dlinfo(this_plugin_handle, RTLD_DI_LINKMAP, &_link_map) == 0) { + metadata_path = fs::path{_link_map->l_name}.parent_path() / fs::path{"../.."} / + CTF_PLUGIN_METADATA_FILE_PATH; + } else if (librocprofiler_handle && + dlinfo(librocprofiler_handle, RTLD_DI_LINKMAP, &_link_map) == 0) { + metadata_path = + fs::path{_link_map->l_name}.parent_path() / ".." / CTF_PLUGIN_METADATA_FILE_PATH; + } + + if (!fs::exists(metadata_path)) { + metadata_path = fs::path{CTF_PLUGIN_INSTALL_PREFIX} / CTF_PLUGIN_METADATA_FILE_PATH; + } + try { - the_plugin = new rocm_ctf::Plugin{256 * 1024, fs::path{output_dir} / "trace", - CTF_PLUGIN_METADATA_FILE_PATH}; + the_plugin = new rocm_ctf::Plugin{256 * 1024, fs::path{output_dir} / output_file, + fs::absolute(metadata_path)}; } catch (const std::exception& exc) { std::cerr << "rocprofiler_plugin_initialize(): " << exc.what() << std::endl; return -1; diff --git a/plugin/ctf/gen_api_files.py b/plugin/ctf/gen_api_files.py index 58743bcb..c6e37468 100644 --- a/plugin/ctf/gen_api_files.py +++ b/plugin/ctf/gen_api_files.py @@ -33,12 +33,12 @@ class _NumericFt: # Returns the C++ expression to cast the expression `expr` to the C # type of this field type. def cast(self, expr): - return f'static_cast<{self.c_type}>({expr})' + return f"static_cast<{self.c_type}>({expr})" # Integer field type (abstract). class _IntFt(_NumericFt): - def __init__(self, size, pref_disp_base='dec'): + def __init__(self, size, pref_disp_base="dec"): self._size = size self._pref_disp_base = pref_disp_base @@ -56,8 +56,8 @@ def pref_disp_base(self): @property def barectf_yaml(self): return { - 'size': self._size, - 'preferred-display-base': self._pref_disp_base, + "size": self._size, + "preferred-display-base": self._pref_disp_base, } @@ -67,13 +67,13 @@ class _SIntFt(_IntFt): @property def barectf_yaml(self): ret = super().barectf_yaml - ret['class'] = 'sint' + ret["class"] = "sint" return ret # Equivalent C type @property def c_type(self): - return f'std::int{self._size}_t' + return f"std::int{self._size}_t" # Unsigned integer field type. @@ -82,24 +82,24 @@ class _UIntFt(_IntFt): @property def barectf_yaml(self): ret = super().barectf_yaml - ret['class'] = 'uint' + ret["class"] = "uint" return ret # Equivalent C type. @property def c_type(self): - return f'std::uint{self._size}_t' + return f"std::uint{self._size}_t" # Pointer field type. class _PointerFt(_UIntFt): def __init__(self): - super().__init__(64, 'hex') + super().__init__(64, "hex") # Returns the C++ expression to cast the expression `expr` to the C # type of this field type. def cast(self, expr): - return f'static_cast<{self.c_type}>(reinterpret_cast({expr}))' + return f"static_cast<{self.c_type}>(reinterpret_cast({expr}))" # Enumeration field type (abstract). 
@@ -122,7 +122,7 @@ def barectf_yaml(self): for name, val in self._mappings.items(): mappings[name] = [val] - ret['mappings'] = mappings + ret["mappings"] = mappings return ret @@ -132,7 +132,7 @@ class _UEnumFt(_EnumFt, _UIntFt): @property def barectf_yaml(self): ret = super().barectf_yaml - ret['class'] = 'uenum' + ret["class"] = "uenum" return ret @@ -142,7 +142,7 @@ class _SEnumFt(_EnumFt, _UIntFt): @property def barectf_yaml(self): ret = super().barectf_yaml - ret['class'] = 'senum' + ret["class"] = "senum" return ret @@ -152,7 +152,7 @@ class _OptStrFt: @property def barectf_yaml(self): return { - 'class': 'str', + "class": "str", } @@ -175,18 +175,18 @@ def size(self): @property def barectf_yaml(self): return { - 'class': 'real', - 'size': self._size, + "class": "real", + "size": self._size, } # Equivalent C type. @property def c_type(self): if self._size == 32: - return 'float' + return "float" else: assert self._size == 64 - return 'double' + return "double" # Event record type. @@ -210,16 +210,16 @@ def members(self): class _BeginErt(_Ert): # Name of event record type depending on the API prefix. def name(self, api_prefix): - suffix = '_begin' if api_prefix == 'hsa' else 'Begin' - return f'{self._api_func_name}{suffix}' + suffix = "_begin" if api_prefix == "hsa" else "Begin" + return f"{self._api_func_name}{suffix}" # End event record type. class _EndErt(_Ert): # Name of event record type depending on the API prefix. def name(self, api_prefix): - suffix = '_end' if api_prefix == 'hsa' else 'End' - return f'{self._api_func_name}{suffix}' + suffix = "_end" if api_prefix == "hsa" else "End" + return f"{self._api_func_name}{suffix}" # Event record type member. @@ -251,20 +251,20 @@ def ft(self): # This is an unconditional assertion. def _make_sure(cond, error_msg): if not cond: - print(f'Error: {error_msg}', file=sys.stderr) + print(f"Error: {error_msg}", file=sys.stderr) sys.exit(1) def _enumerator_effective_val(enum_val): # Try the value, but this value may be a string (an # enumerator/definition). - val = enum_val.get('value') + val = enum_val.get("value") if type(val) is int: return val # Try the raw value. - val = enum_val.get('raw_value') + val = enum_val.get("raw_value") if val is not None: if type(val) is int: @@ -277,58 +277,61 @@ def _enumerator_effective_val(enum_val): except: pass - _make_sure(False, - f'Cannot get the integral value of enumerator `{enum_val["name"]}`') + _make_sure(False, f'Cannot get the integral value of enumerator `{enum_val["name"]}`') # Returns the equivalent field type of the C type `c_type`. def _number_ft_from_c_type(cpp_header, c_type): # Check for known enumeration. - m = re.match(r'(?:enum\s+)?(\w+)', c_type) + m = re.match(r"(?:enum\s+)?(\w+)", c_type) if m: size = 32 for enum_info in cpp_header.enums: - if m.group(1) == enum_info.get('name'): + if m.group(1) == enum_info.get("name"): # Fill enumeration field type mappings. mappings = { - str(v['name']): _enumerator_effective_val(v) - for v in enum_info['values'] + str(v["name"]): _enumerator_effective_val(v) + for v in enum_info["values"] } if len(mappings) == 0: return _SIntFt(64) - if max(mappings.values()) >= 2**31 or min(mappings.values()) < -2**31: + if max(mappings.values()) >= 2**31 or min(mappings.values()) < -( + 2**31 + ): size = 64 - _make_sure(len(mappings) > 0, f'Enumeration `{enum_info["name"]}` is empty') + _make_sure( + len(mappings) > 0, f'Enumeration `{enum_info["name"]}` is empty' + ) # Create corresponding enumeration field type. 
return _SEnumFt(size, mappings) # Find corresponding basic field type. - is_unsigned = 'unsigned' in c_type + is_unsigned = "unsigned" in c_type - if 'long' in c_type: + if "long" in c_type: if is_unsigned: return _UIntFt(64) else: return _SIntFt(64) - elif 'short' in c_type: + elif "short" in c_type: if is_unsigned: return _UIntFt(16) else: return _SIntFt(16) - elif 'char' in c_type: + elif "char" in c_type: if is_unsigned: return _UIntFt(8) else: return _SIntFt(8) - elif 'float' in c_type: + elif "float" in c_type: return _FloatFt(32) - elif 'double' in c_type: + elif "double" in c_type: return _FloatFt(64) else: # Assume `int` (often an unresolved C enumeration). @@ -340,23 +343,23 @@ def _number_ft_from_c_type(cpp_header, c_type): # Returns whether or not a property has a pointer type. def _prop_is_pointer(prop, c_type): - if prop['pointer'] or prop['function_pointer']: + if prop["pointer"] or prop["function_pointer"]: return True - if prop['array'] and 'array_size' in prop: + if prop["array"] and "array_size" in prop: return True - if prop['unresolved']: + if prop["unresolved"]: # HSA API function pointers. - if prop['name'] in ('callback', 'handler'): + if prop["name"] in ("callback", "handler"): return True # HIP API function pointers. - if c_type.endswith('Fn_t'): + if c_type.endswith("Fn_t"): return True # Check the C type itself. - if '*' in c_type or '*' in prop.get('raw_type', ''): + if "*" in c_type or "*" in prop.get("raw_type", ""): return True return False @@ -369,24 +372,24 @@ def _get_ert_members_for_struct(cpp_header, struct, access, member_names): members = [] member_names = member_names.copy() member_names.append(None) - props = struct['properties']['public'] + props = struct["properties"]["public"] for index, prop in enumerate(props): # Property name. - name = prop['name'] + name = prop["name"] # Member names, access, and C type. member_names[-1] = str(name) - this_access = f'{access}.{name}' - c_type = prop['type'] - aliases = prop['aliases'] + this_access = f"{access}.{name}" + c_type = prop["type"] + aliases = prop["aliases"] # Skip no type. - if c_type == '': + if c_type == "": continue # Skip unnamed or union. - if name == '' or 'union' in name or re.match(r'\bunion\b', c_type): + if name == "" or "union" in name or re.match(r"\bunion\b", c_type): continue # Check for known C type alias. @@ -399,8 +402,7 @@ def _get_ert_members_for_struct(cpp_header, struct, access, member_names): c_type = c_type_alias # Check for C string. - if re.match(r'^((const\s+char)|(char\s+const)|char)\s*\*$', - c_type.strip()): + if re.match(r"^((const\s+char)|(char\s+const)|char)\s*\*$", c_type.strip()): members.append(_ErtMember(this_access, member_names, _OptStrFt())) continue @@ -417,13 +419,17 @@ def _get_ert_members_for_struct(cpp_header, struct, access, member_names): sub_struct = cpp_header.classes.get(aliases[0]) if sub_struct is not None: - members += _get_ert_members_for_struct(cpp_header, sub_struct, - this_access, member_names) + members += _get_ert_members_for_struct( + cpp_header, sub_struct, this_access, member_names + ) continue # Use a basic field type. 
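# Editorial aside, not part of the patch: a sketch of the C-type mapping
# implemented above. _number_ft_from_c_type() yields, for example,
#   "unsigned long" -> _UIntFt(64)    "short" -> _SIntFt(16)
#   "unsigned char" -> _UIntFt(8)     "float" -> _FloatFt(32)
# and anything unrecognized falls through to the signed `int` default.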
- members.append(_ErtMember(this_access, member_names, - _number_ft_from_c_type(cpp_header, c_type))) + members.append( + _ErtMember( + this_access, member_names, _number_ft_from_c_type(cpp_header, c_type) + ) + ) return members @@ -439,40 +445,48 @@ def _erts_from_cb_data_struct(api_prefix, cpp_header, retval_info, struct): if retval_info is not None: args_nested_cls_index = 1 retval_members = {} - nested_classes = struct['nested_classes'] - _make_sure(len(nested_classes) >= 1, - f"Return value union doesn't exist in `{struct['name']}`") + nested_classes = struct["nested_classes"] + _make_sure( + len(nested_classes) >= 1, + f"Return value union doesn't exist in `{struct['name']}`", + ) retval_union = nested_classes[0] - for prop in retval_union['properties']['public']: - name = str(prop['name']) - member = _ErtMember(f'GetApiData().{name}', ['retval'], - _number_ft_from_c_type(cpp_header, prop['type'])) - retval_members[prop['name']] = member + for prop in retval_union["properties"]["public"]: + name = str(prop["name"]) + member = _ErtMember( + f"GetApiData().{name}", + ["retval"], + _number_ft_from_c_type(cpp_header, prop["type"]), + ) + retval_members[prop["name"]] = member # Make sure we have everything we need. for api_func_name, retval_name in retval_info.items(): if retval_name is not None: - _make_sure(retval_name in retval_members, - f"Return value union member `{retval_name}` doesn't exist (function {api_func_name}())") + _make_sure( + retval_name in retval_members, + f"Return value union member `{retval_name}` doesn't exist (function {api_func_name}())", + ) # Create beginning/end event record type objects. begin_erts = [] end_erts = [] - nested_classes = struct['nested_classes'][args_nested_cls_index]['nested_classes'] - props = struct['nested_classes'][args_nested_cls_index]['properties']['public'] - _make_sure(len(nested_classes) == len(props), - f'Mismatch between nested structure and member count in `{struct["name"]}`') + nested_classes = struct["nested_classes"][args_nested_cls_index]["nested_classes"] + props = struct["nested_classes"][args_nested_cls_index]["properties"]["public"] + _make_sure( + len(nested_classes) == len(props), + f'Mismatch between nested structure and member count in `{struct["name"]}`', + ) for index, prop in enumerate(props): # API function name is the name of the member. - api_func_name = str(prop['name']) + api_func_name = str(prop["name"]) # Get the parameters. - members = _get_ert_members_for_struct(cpp_header, - nested_classes[index], - f'GetApiData().args.{api_func_name}', - []) + members = _get_ert_members_for_struct( + cpp_header, nested_classes[index], f"GetApiData().args.{api_func_name}", [] + ) # Append new beginning event record type object. begin_erts.append(_BeginErt(api_func_name, members)) @@ -499,7 +513,7 @@ def _erts_from_cb_data_struct(api_prefix, cpp_header, retval_info, struct): # This only applies to the HSA API: for other APIs, this function # returns `None`. def _get_retval_info(path): - if 'hsa' not in os.path.basename(path): + if "hsa" not in os.path.basename(path): return retval_info = {} @@ -508,7 +522,7 @@ def _get_retval_info(path): with open(path) as f: for line in f: if 'out << ")' in line and cur_api_func_name is not None: - m = re.search(r'api_data.(\w+_retval)', line) + m = re.search(r"api_data.(\w+_retval)", line) retval_info[cur_api_func_name] = m.group(1) if m else None else: m = re.search(r'out << "(hsa_\w+)\(";', line) @@ -525,7 +539,7 @@ def _yaml_dst_from_erts(api_prefix, erts): # Base. 
yaml_erts = {} yaml_dst = { - 'event-record-types': yaml_erts, + "event-record-types": yaml_erts, } # Create one event record type per API function. @@ -533,9 +547,9 @@ def _yaml_dst_from_erts(api_prefix, erts): # Base. yaml_members = [] yaml_ert = { - 'payload-field-type': { - 'class': 'struct', - 'members': yaml_members, + "payload-field-type": { + "class": "struct", + "members": yaml_members, }, } @@ -543,11 +557,14 @@ def _yaml_dst_from_erts(api_prefix, erts): for member in ert.members: # barectf doesn't support nested CTF structures, so join # individual member names with `__` to flatten. - yaml_members.append({ - '_' + '__'.join(member.member_names): { - 'field-type': member.ft.barectf_yaml, - }, - }) + yaml_members.append( + { + "_" + + "__".join(member.member_names): { + "field-type": member.ft.barectf_yaml, + }, + } + ) # Add event record type. yaml_erts[ert.name(api_prefix)] = yaml_ert @@ -560,23 +577,23 @@ def _yaml_dst_from_erts(api_prefix, erts): # tracing function depending on the API function operation ID. def _cpp_switch_statement_from_erts(api_prefix, erts): lines = [] - lines.append('switch (GetOp()) {') + lines.append("switch (GetOp()) {") for ert in erts: - lines.append(f' case {api_prefix.upper()}_API_ID_{ert.api_func_name}:') - lines.append(f' barectf_{api_prefix}_api_trace_{ert.name(api_prefix)}(') - lines.append(f' &barectf_ctx,') - lines.append(f' GetThreadId(),') - lines.append(f' GetQueueId(),') - lines.append(f' GetAgentId(),') - lines.append(f' GetCorrelationId(),') + lines.append(f" case {api_prefix.upper()}_API_ID_{ert.api_func_name}:") + lines.append(f" barectf_{api_prefix}_api_trace_{ert.name(api_prefix)}(") + lines.append(f" &barectf_ctx,") + lines.append(f" GetThreadId(),") + lines.append(f" GetQueueId(),") + lines.append(f" GetAgentId(),") + lines.append(f" GetCorrelationId(),") - if api_prefix == 'hip': - lines.append(f' GetKernelName().c_str(),') + if api_prefix == "hip": + lines.append(f" GetKernelName().c_str(),") if len(ert.members) == 0: # Remove last comma. - lines[-1] = lines[-1].replace(',', '') + lines[-1] = lines[-1].replace(",", "") for index, member in enumerate(ert.members): if type(member.ft) is _OptStrFt: @@ -584,17 +601,17 @@ def _cpp_switch_statement_from_erts(api_prefix, erts): # an empty string. lines.append(f' {member.access} ? {member.access} : ""') elif type(member.ft) is _StrFt: - lines.append(f' {member.access}') + lines.append(f" {member.access}") else: - lines.append(f' {member.ft.cast(member.access)}') + lines.append(f" {member.ft.cast(member.access)}") if index + 1 < len(ert.members): - lines[-1] += ',' + lines[-1] += "," - lines.append(' );') - lines.append(' break;') + lines.append(" );") + lines.append(" break;") - lines.append('}') + lines.append("}") return lines @@ -612,29 +629,28 @@ def _process_file(api_prefix, path): # Find callback data structure. for struct_name, struct in cpp_header.classes.items(): - if re.match(r'^' + api_prefix + r'_api_data\w+$', struct_name): + if re.match(r"^" + api_prefix + r"_api_data\w+$", struct_name): # Process callback data structure. - begin_erts, end_erts = _erts_from_cb_data_struct(api_prefix, - cpp_header, - retval_info, - struct) + begin_erts, end_erts = _erts_from_cb_data_struct( + api_prefix, cpp_header, retval_info, struct + ) # Write barectf YAML file. - with open(f'{api_prefix}_erts.yaml', 'w') as f: + with open(f"{api_prefix}_erts.yaml", "w") as f: f.write(_yaml_dst_from_erts(api_prefix, begin_erts + end_erts)) # Write C++ code (beginning event record). 
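# Editorial aside, not part of the patch: for an argument-free HSA function
# such as hsa_init, _cpp_switch_statement_from_erts() above emits roughly:
#
#   switch (GetOp()) {
#     case HSA_API_ID_hsa_init:
#       barectf_hsa_api_trace_hsa_init_begin(
#         &barectf_ctx,
#         GetThreadId(),
#         GetQueueId(),
#         GetAgentId(),
#         GetCorrelationId()
#       );
#       break;
#   }
#
# The trailing comma on the last argument is stripped when the call has no
# payload members, as the code above shows.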
-        with open(f'{api_prefix}_begin.cpp.i', 'w') as f:
-            f.write('\n'.join(_cpp_switch_statement_from_erts(api_prefix,
-                                                              begin_erts)))
+        with open(f"{api_prefix}_begin.cpp.i", "w") as f:
+            f.write(
+                "\n".join(_cpp_switch_statement_from_erts(api_prefix, begin_erts))
+            )

             # Write C++ code (end event record).
-        with open(f'{api_prefix}_end.cpp.i', 'w') as f:
-            f.write('\n'.join(_cpp_switch_statement_from_erts(api_prefix,
-                                                              end_erts)))
+        with open(f"{api_prefix}_end.cpp.i", "w") as f:
+            f.write("\n".join(_cpp_switch_statement_from_erts(api_prefix, end_erts)))


-if __name__ == '__main__':
+if __name__ == "__main__":
     # Disable `CppHeaderParser` printing to standard output.
     CppHeaderParser.CppHeaderParser.print_warnings = 0
     CppHeaderParser.CppHeaderParser.print_errors = 0
diff --git a/plugin/ctf/gen_env_yaml.py b/plugin/ctf/gen_env_yaml.py
index 009f3689..4cf2222a 100644
--- a/plugin/ctf/gen_env_yaml.py
+++ b/plugin/ctf/gen_env_yaml.py
@@ -24,10 +24,14 @@
 import yaml

-if __name__ == '__main__':
-    with open('env.yaml', 'w') as f:
-        f.write(yaml.dump({
-            'environment': {
-                'rocprofiler_version': sys.argv[1],
-            }
-        }))
+if __name__ == "__main__":
+    with open("env.yaml", "w") as f:
+        f.write(
+            yaml.dump(
+                {
+                    "environment": {
+                        "rocprofiler_version": sys.argv[1],
+                    }
+                }
+            )
+        )
diff --git a/plugin/perfetto/CMakeLists.txt b/plugin/perfetto/CMakeLists.txt
index 3db11c13..9c5ea030 100644
--- a/plugin/perfetto/CMakeLists.txt
+++ b/plugin/perfetto/CMakeLists.txt
@@ -1,7 +1,8 @@
 file(GLOB ROCPROFILER_UTIL_SRC_FILES ${PROJECT_SOURCE_DIR}/src/utils/helper.cpp)

-add_library(perfetto_plugin ${LIBRARY_TYPE} ${ROCPROFILER_UTIL_SRC_FILES} perfetto.cpp
-            perfetto_sdk/sdk/perfetto.cc)
+add_subdirectory(perfetto_sdk)
+
+add_library(perfetto_plugin ${LIBRARY_TYPE} ${ROCPROFILER_UTIL_SRC_FILES} perfetto.cpp)

 set_target_properties(
     perfetto_plugin
@@ -13,16 +14,14 @@ set_target_properties(
 target_compile_definitions(perfetto_plugin PRIVATE HIP_PROF_HIP_API_STRING=1
                                                    __HIP_PLATFORM_AMD__=1)

-target_include_directories(
-    perfetto_plugin PRIVATE ${PROJECT_SOURCE_DIR}
-                            ${PROJECT_SOURCE_DIR}/plugin/perfetto/perfetto_sdk/sdk)
+target_include_directories(perfetto_plugin PRIVATE ${PROJECT_SOURCE_DIR})

 target_link_options(
     perfetto_plugin PRIVATE
     -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exportmap -Wl,--no-undefined)

-target_link_libraries(perfetto_plugin PRIVATE rocprofiler-v2 Threads::Threads stdc++fs
-                                              amd_comgr)
+target_link_libraries(perfetto_plugin PRIVATE rocprofiler-v2 rocprofiler::perfetto-sdk
+                                              Threads::Threads stdc++fs amd_comgr)

 install(TARGETS perfetto_plugin LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}/${PROJECT_NAME}
         COMPONENT plugins)
diff --git a/plugin/perfetto/perfetto_sdk/CMakeLists.txt b/plugin/perfetto/perfetto_sdk/CMakeLists.txt
new file mode 100644
index 00000000..385d0820
--- /dev/null
+++ b/plugin/perfetto/perfetto_sdk/CMakeLists.txt
@@ -0,0 +1,8 @@
+set(CMAKE_CXX_CLANG_TIDY)
+set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+
+add_library(rocprofiler-perfetto-sdk STATIC sdk/perfetto.h sdk/perfetto.cc)
+add_library(rocprofiler::perfetto-sdk ALIAS rocprofiler-perfetto-sdk)
+
+target_include_directories(rocprofiler-perfetto-sdk
+                           PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/sdk>)
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 00000000..585d9d4c
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,25 @@
+
+[tool.black]
+line-length = 90
+target-version = ['py36', 'py37', 'py38', 'py39', 'py310']
+include = '\.py$'
+exclude = '''
+(
+    /(
+        \.eggs
+        | \.git
+        | \.github
+        | \.tox
+        | \.venv
+        | \.misc
+        | \.vscode
+        | \.cache
+        | \.pytest_cache
+        | dist
+        | external
+        | build
+        | build-release
+        | build-rocprofiler
+    )/
+)
+'''
diff --git a/requirements.txt b/requirements.txt
index 6cf1c143..69a7c594 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,13 +1,10 @@
-barectf==3.1.1
-bcrypt==3.2.0
-CppHeaderParser==2.7.4
-lxml==4.9.2
-matplotlib==3.7.1
-pandas==2.0.2
-plotly==5.15.0
-ply==3.11
-protobuf==3.20.3
-pycparser==2.21
-pyparsing==3.0.9
-websocket-client==1.5.2
-websockets==11.0.3
+barectf
+bcrypt
+CppHeaderParser
+lxml
+matplotlib
+pandas
+protobuf
+pycparser
+pyparsing
+websockets
\ No newline at end of file
diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt
index 6b280148..2eaadd49 100644
--- a/samples/CMakeLists.txt
+++ b/samples/CMakeLists.txt
@@ -23,6 +23,9 @@ set(CMAKE_EXECUTABLE_RPATH_LINK_HIP_FLAG ${CMAKE_SHARED_LIBRARY_RPATH_LINK_CXX_F
 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${ROCM_PATH}/lib/cmake/hip")
 set(CMAKE_HIP_ARCHITECTURES OFF)

+if(DEFINED ROCM_PATH)
+    set(HIP_ROOT_DIR "${ROCM_PATH}/bin")
+endif()
 find_package(HIP REQUIRED MODULE)

 find_package(
@@ -36,7 +39,7 @@ find_package(LibElf REQUIRED)
 find_package(LibDw REQUIRED)

 # Add a custom targets to build and run all the tests
-add_custom_target(samples)
+add_custom_target(samples ALL)
 add_dependencies(samples rocprofiler-v2)
 add_custom_target(
     run-samples
@@ -46,6 +49,32 @@ add_custom_target(
 file(GLOB ROCPROFILER_UTIL_SRC_FILES ${PROJECT_SOURCE_DIR}/src/utils/helper.cpp)

 # ########################################################################################
+function(rocprofiler_sample_add_test _TARGET _ARGS)
+    if(TARGET ${_TARGET})
+        if(NOT TEST ${_TARGET})
+            add_test(
+                NAME ${_TARGET}
+                COMMAND $<TARGET_FILE:${_TARGET}> ${_ARGS}
+                WORKING_DIRECTORY ${PROJECT_BINARY_DIR})
+        endif()
+        set_tests_properties(
+            ${_TARGET}
+            PROPERTIES
+                LABELS
+                "samples"
+                ENVIRONMENT
+                "ROCPROFILER_METRICS_PATH=${PROJECT_BINARY_DIR}/libexec/rocprofiler/counters/derived_counters.xml;${ROCPROFILER_MEMCHECK_PRELOAD_ENV}"
+                RUN_SERIAL
+                TRUE
+                ${ARGN})
+    endif()
+endfunction()
+
+function(rocprofiler_sample_add_executable _TARGET)
+    hip_add_executable(${_TARGET} ${ARGN})
+    rocprofiler_sample_add_test(${_TARGET} "")
+endfunction()
+
 # ########################################################################################
 # ########################################################################################
 # Samples Build & Run Script
@@ -59,7 +88,7 @@ file(GLOB ROCPROFILER_UTIL_SRC_FILES ${PROJECT_SOURCE_DIR}/src/utils/helper.cpp)
 # Build Kernel No Replay Sample
 set_source_files_properties(profiler/kernel_profiling_no_replay_sample.cpp
                             PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1)
-hip_add_executable(
+rocprofiler_sample_add_executable(
     profiler_kernel_no_replay profiler/kernel_profiling_no_replay_sample.cpp
     ${ROCPROFILER_UTIL_SRC_FILES})
 target_include_directories(
@@ -75,8 +104,9 @@ install(TARGETS profiler_kernel_no_replay
 # Build Device Profiling Sample
 set_source_files_properties(profiler/device_profiling_sample.cpp
                             PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1)
-hip_add_executable(profiler_device_profiling profiler/device_profiling_sample.cpp
-                   ${ROCPROFILER_UTIL_SRC_FILES})
+rocprofiler_sample_add_executable(
+    profiler_device_profiling profiler/device_profiling_sample.cpp
+    ${ROCPROFILER_UTIL_SRC_FILES})
 target_include_directories(
     profiler_device_profiling PRIVATE ${PROJECT_SOURCE_DIR}
                                       ${CMAKE_CURRENT_SOURCE_DIR}/common)
@@ -86,12 +116,14 @@ add_dependencies(samples profiler_device_profiling)
 install(TARGETS profiler_device_profiling
         RUNTIME DESTINATION
${CMAKE_INSTALL_DATAROOTDIR}/${PROJECT_NAME}/samples COMPONENT samples) +set_tests_properties(profiler_device_profiling PROPERTIES DISABLED TRUE) # Build Counters Sampling example set_source_files_properties(counters_sampler/pcie_counters_example.cpp PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) -hip_add_executable(pcie_counters_sampler counters_sampler/pcie_counters_example.cpp - ${ROCPROFILER_UTIL_SRC_FILES}) +rocprofiler_sample_add_executable( + pcie_counters_sampler counters_sampler/pcie_counters_example.cpp + ${ROCPROFILER_UTIL_SRC_FILES}) target_include_directories( pcie_counters_sampler PRIVATE ${PROJECT_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/common) @@ -105,7 +137,7 @@ install(TARGETS pcie_counters_sampler # Build XGMI Counters Sampling example set_source_files_properties(counters_sampler/xgmi_counters_sampler_example.cpp PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) -hip_add_executable( +rocprofiler_sample_add_executable( xgmi_counters_sampler counters_sampler/xgmi_counters_sampler_example.cpp ${ROCPROFILER_UTIL_SRC_FILES}) target_include_directories( @@ -117,6 +149,7 @@ add_dependencies(samples xgmi_counters_sampler) install(TARGETS xgmi_counters_sampler RUNTIME DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/${PROJECT_NAME}/samples COMPONENT samples) +set_tests_properties(xgmi_counters_sampler PROPERTIES DISABLED TRUE) # ######################################################################################## @@ -126,7 +159,8 @@ install(TARGETS xgmi_counters_sampler # Build HIP/HSA Trace Sample set_source_files_properties(tracer/sample.cpp PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) -hip_add_executable(tracer_hip_hsa tracer/sample.cpp ${ROCPROFILER_UTIL_SRC_FILES}) +rocprofiler_sample_add_executable(tracer_hip_hsa tracer/sample.cpp + ${ROCPROFILER_UTIL_SRC_FILES}) target_include_directories(tracer_hip_hsa PRIVATE ${PROJECT_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/common) target_link_libraries(tracer_hip_hsa PRIVATE rocprofiler-v2 amd_comgr) @@ -139,8 +173,8 @@ install(TARGETS tracer_hip_hsa # Build HIP/HSA Trace with async output api trace data Sample set_source_files_properties(tracer/sample_async.cpp PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) -hip_add_executable(tracer_hip_hsa_async tracer/sample_async.cpp - ${ROCPROFILER_UTIL_SRC_FILES}) +rocprofiler_sample_add_executable(tracer_hip_hsa_async tracer/sample_async.cpp + ${ROCPROFILER_UTIL_SRC_FILES}) target_include_directories( tracer_hip_hsa_async PRIVATE ${PROJECT_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/common) target_link_libraries(tracer_hip_hsa_async PRIVATE rocprofiler-v2 amd_comgr) @@ -162,6 +196,7 @@ hip_add_executable( pc_sampling_code_printing ${PC_SAMPLING_CODE_PRINTING_FILES} HIPCC_OPTIONS -std=c++17 # Include debugging symbols and source for the contextual disassembly -gdwarf-4) +rocprofiler_sample_add_test(pc_sampling_code_printing "-d;0;-n;100000000;10;43532") check_c_source_compiles( " diff --git a/samples/profiler/device_profiling_sample.cpp b/samples/profiler/device_profiling_sample.cpp index c6ad3f85..98d1495e 100644 --- a/samples/profiler/device_profiling_sample.cpp +++ b/samples/profiler/device_profiling_sample.cpp @@ -24,7 +24,7 @@ int main(int argc, char** argv) { int gpu_agent = 0; int cpu_agent = 0; CHECK_ROCPROFILER(rocprofiler_device_profiling_session_create( - &counters[0], counters.size(), &dp_session_id, gpu_agent, cpu_agent)); + &counters[0], counters.size(), &dp_session_id, cpu_agent, gpu_agent)); printf("session start \n"); // start GPU device profiling diff --git a/script/address-sanitizer-suppr.txt 
b/script/address-sanitizer-suppr.txt
new file mode 100644
index 00000000..e69de29b
diff --git a/script/gen_ostream_ops.py b/script/gen_ostream_ops.py
index acd0fdf0..9084d73f 100755
--- a/script/gen_ostream_ops.py
+++ b/script/gen_ostream_ops.py
@@ -27,118 +27,164 @@
 import argparse
 import string

-LICENSE = \
-'/*\n' + \
-'Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved.\n' + \
-'\n' + \
-'Permission is hereby granted, free of charge, to any person obtaining a copy\n' + \
-'of this software and associated documentation files (the "Software"), to deal\n' + \
-'in the Software without restriction, including without limitation the rights\n' + \
-'to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n' + \
-'copies of the Software, and to permit persons to whom the Software is\n' + \
-'furnished to do so, subject to the following conditions:\n' + \
-'\n' + \
-'The above copyright notice and this permission notice shall be included in\n' + \
-'all copies or substantial portions of the Software.\n' + \
-'\n' + \
-'THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n' + \
-'IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n' + \
-'FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n' + \
-'AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n' + \
-'LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n' + \
-'OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN\n' + \
-'THE SOFTWARE.\n' + \
-'*/\n'
-
-
-header_basic = \
-'namespace detail {\n' + \
-'template <typename T>\n' + \
-' inline static std::ostream& operator<<(std::ostream& out, const T& v) {\n' + \
-' using std::operator<<;\n' + \
-' static bool recursion = false;\n' + \
-' if (recursion == false) { recursion = true; out << v; recursion = false; }\n' + \
-' return out;\n }\n' + \
-'\n' + \
-' inline static std::ostream &operator<<(std::ostream &out, const unsigned char &v) {\n' + \
-' out << (unsigned int)v;\n' + \
-' return out;\n }\n' + \
-'\n' + \
-' inline static std::ostream &operator<<(std::ostream &out, const char &v) {\n' + \
-' out << (unsigned char)v;\n' + \
-' return out;\n }\n'
+LICENSE = (
+    "/*\n"
+    + "Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved.\n"
+    + "\n"
+    + "Permission is hereby granted, free of charge, to any person obtaining a copy\n"
+    + 'of this software and associated documentation files (the "Software"), to deal\n'
+    + "in the Software without restriction, including without limitation the rights\n"
+    + "to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n"
+    + "copies of the Software, and to permit persons to whom the Software is\n"
+    + "furnished to do so, subject to the following conditions:\n"
+    + "\n"
+    + "The above copyright notice and this permission notice shall be included in\n"
+    + "all copies or substantial portions of the Software.\n"
+    + "\n"
+    + 'THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n'
+    + "IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n"
+    + "FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n"
+    + "AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n"
+    + "LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n"
+    + "OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN\n"
+    + "THE SOFTWARE.\n"
+    + "*/\n"
+)
+
+
+header_basic = (
+    "namespace detail {\n"
+    + "template <typename T>\n"
+    + " inline static std::ostream& operator<<(std::ostream& out, const T& v) {\n"
+    + " using std::operator<<;\n"
+    + " static bool recursion = false;\n"
+    + " if (recursion == false) { recursion = true; out << v; recursion = false; }\n"
+    + " return out;\n }\n"
+    + "\n"
+    + " inline static std::ostream &operator<<(std::ostream &out, const unsigned char &v) {\n"
+    + " out << (unsigned int)v;\n"
+    + " return out;\n }\n"
+    + "\n"
+    + " inline static std::ostream &operator<<(std::ostream &out, const char &v) {\n"
+    + " out << (unsigned char)v;\n"
+    + " return out;\n }\n"
+)

 structs_analyzed = {}
-global_ops = ''
-global_str = ''
+global_ops = ""
+global_str = ""
 output_filename_h = None
 apiname = ""

+
 # process_struct traverses recursively all structs to extract all fields
 def process_struct(file_handle, cppHeader_struct, cppHeader, parent_hier_name, apiname):
-# file_handle: handle for output file {api_name}_ostream_ops.h to be generated
-# cppHeader_struct: cppHeader struct being processed
-# cppHeader: cppHeader object created by CppHeaderParser.CppHeader(...)
-# parent_hier_name: parent hierarchical name used for nested structs/enums
-# apiname: for example hip.
+    # file_handle: handle for output file {api_name}_ostream_ops.h to be generated
+    # cppHeader_struct: cppHeader struct being processed
+    # cppHeader: cppHeader object created by CppHeaderParser.CppHeader(...)
+    # parent_hier_name: parent hierarchical name used for nested structs/enums
+    # apiname: for example hip.
     global global_str
-    if cppHeader_struct == 'max_align_t': #function pointers not working in cppheaderparser
+    if (
+        cppHeader_struct == "max_align_t"
+    ):  # function pointers not working in cppheaderparser
         return
     if cppHeader_struct not in cppHeader.classes:
         return
     if cppHeader_struct in structs_analyzed:
         return
     structs_analyzed[cppHeader_struct] = 1
-    for l in reversed(range(len(cppHeader.classes[cppHeader_struct]["properties"]["public"]))):
-        key = 'name'
+    for l in reversed(
+        range(len(cppHeader.classes[cppHeader_struct]["properties"]["public"]))
+    ):
+        key = "name"
         name = ""
         if key in cppHeader.classes[cppHeader_struct]["properties"]["public"][l]:
-            if parent_hier_name != '':
-                name = parent_hier_name + '.' + cppHeader.classes[cppHeader_struct]["properties"]["public"][l][key]
-            else:
-                name = cppHeader.classes[cppHeader_struct]["properties"]["public"][l][key]
-            if name == '':
-                continue
-            key2 = 'type'
+            if parent_hier_name != "":
+                name = (
+                    parent_hier_name
+                    + "."
+ + cppHeader.classes[cppHeader_struct]["properties"]["public"][l][key] + ) + else: + name = cppHeader.classes[cppHeader_struct]["properties"]["public"][l][key] + if name == "": + continue + key2 = "type" mtype = "" if key2 in cppHeader.classes[cppHeader_struct]["properties"]["public"][l]: mtype = cppHeader.classes[cppHeader_struct]["properties"]["public"][l][key2] - if mtype == '': - continue - key3 = 'array_size' + if mtype == "": + continue + key3 = "array_size" array_size = "" if key3 in cppHeader.classes[cppHeader_struct]["properties"]["public"][l]: - array_size = cppHeader.classes[cppHeader_struct]["properties"]["public"][l][key3] - key4 = 'property_of_class' + array_size = cppHeader.classes[cppHeader_struct]["properties"]["public"][l][ + key3 + ] + key4 = "property_of_class" prop = "" - if key4 in cppHeader.classes[cppHeader_struct]["properties"]["public"][l]: + if key4 in cppHeader.classes[cppHeader_struct]["properties"]["public"][l]: prop = cppHeader.classes[cppHeader_struct]["properties"]["public"][l][key4] - str = '' + str = "" if "union" not in mtype: indent = "" - str += " if (std::string(\"" + cppHeader_struct + "::" + name + "\").find(" + apiname.upper() + "_structs_regex" + ") != std::string::npos) {\n" + str += ( + ' if (std::string("' + + cppHeader_struct + + "::" + + name + + '").find(' + + apiname.upper() + + "_structs_regex" + + ") != std::string::npos) {\n" + ) indent = " " - str += indent + " roctracer::" + apiname.lower() + "_support::detail::operator<<(out, \"" + name + "=\");\n" - str += indent + " roctracer::" + apiname.lower() + "_support::detail::operator<<(out, v." + name + ");\n" - str += indent + " roctracer::" + apiname.lower() + "_support::detail::operator<<(out, \", \");\n" + str += ( + indent + + " roctracer::" + + apiname.lower() + + '_support::detail::operator<<(out, "' + + name + + '=");\n' + ) + str += ( + indent + + " roctracer::" + + apiname.lower() + + "_support::detail::operator<<(out, v." 
+ + name + + ");\n" + ) + str += ( + indent + + " roctracer::" + + apiname.lower() + + '_support::detail::operator<<(out, ", ");\n' + ) str += " }\n" if "void" not in mtype: global_str += str else: - if prop != '': - next_cppHeader_struct = prop + "::" - process_struct(file_handle, next_cppHeader_struct, cppHeader, name, apiname) - next_cppHeader_struct = prop + "::" + mtype + " " - process_struct(file_handle, next_cppHeader_struct, cppHeader, name, apiname) + if prop != "": + next_cppHeader_struct = prop + "::" + process_struct( + file_handle, next_cppHeader_struct, cppHeader, name, apiname + ) + next_cppHeader_struct = prop + "::" + mtype + " " + process_struct( + file_handle, next_cppHeader_struct, cppHeader, name, apiname + ) next_cppHeader_struct = cppHeader_struct + "::" process_struct(file_handle, next_cppHeader_struct, cppHeader, name, apiname) + # Parses API header file and generates ostream ops files ostream_ops.h def gen_cppheader(infilepath, outfilepath, rank): -# infilepath: API Header file to be parsed -# outfilepath: Output file where ostream operators are written + # infilepath: API Header file to be parsed + # outfilepath: Output file where ostream operators are written global global_ops global output_filename_h global apiname @@ -149,94 +195,142 @@ def gen_cppheader(infilepath, outfilepath, rank): print(e) sys.exit(1) if rank == 0 or rank == 2: - mpath = os.path.dirname(outfilepath) - if mpath == "": - mpath = os.getcwd() - apiname = outfilepath.replace(mpath + "/","") - output_filename_h = open(outfilepath,"w+") - apiname = apiname.replace("_ostream_ops.h","") - apiname = apiname.upper() - output_filename_h.write("// automatically generated\n") - output_filename_h.write(LICENSE + '\n') - header_s = \ - '#ifndef INC_' + apiname + '_OSTREAM_OPS_H_\n' + \ - '#define INC_' + apiname + '_OSTREAM_OPS_H_\n' + \ - '\n' + \ - '#include "src/core/session/tracer/src/roctracer.h"\n' + \ - '\n' + \ - '#ifdef __cplusplus\n' + \ - '#include \n' + \ - '#include \n' - - output_filename_h.write(header_s) - output_filename_h.write('\n') - output_filename_h.write('namespace roctracer {\n') - output_filename_h.write('namespace ' + apiname.lower() + '_support {\n') - output_filename_h.write('static int ' + apiname.upper() + '_depth_max = 1;\n') - output_filename_h.write('static int ' + apiname.upper() + '_depth_max_cnt = 0;\n') - output_filename_h.write('static std::string ' + apiname.upper() + '_structs_regex = \"\";\n') - output_filename_h.write('// begin ostream ops for '+ apiname + ' \n') - output_filename_h.write("// basic ostream ops\n") - output_filename_h.write(header_basic) - output_filename_h.write("// End of basic ostream ops\n\n") + mpath = os.path.dirname(outfilepath) + if mpath == "": + mpath = os.getcwd() + apiname = outfilepath.replace(mpath + "/", "") + output_filename_h = open(outfilepath, "w+") + apiname = apiname.replace("_ostream_ops.h", "") + apiname = apiname.upper() + output_filename_h.write("// automatically generated\n") + output_filename_h.write(LICENSE + "\n") + header_s = ( + "#ifndef INC_" + + apiname + + "_OSTREAM_OPS_H_\n" + + "#define INC_" + + apiname + + "_OSTREAM_OPS_H_\n" + + "\n" + + '#include "src/core/session/tracer/src/roctracer.h"\n' + + "\n" + + "#ifdef __cplusplus\n" + + "#include \n" + + "#include \n" + ) + + output_filename_h.write(header_s) + output_filename_h.write("\n") + output_filename_h.write("namespace roctracer {\n") + output_filename_h.write("namespace " + apiname.lower() + "_support {\n") + output_filename_h.write("static int " + 
apiname.upper() + "_depth_max = 1;\n") + output_filename_h.write("static int " + apiname.upper() + "_depth_max_cnt = 0;\n") + output_filename_h.write( + "static std::string " + apiname.upper() + '_structs_regex = "";\n' + ) + output_filename_h.write("// begin ostream ops for " + apiname + " \n") + output_filename_h.write("// basic ostream ops\n") + output_filename_h.write(header_basic) + output_filename_h.write("// End of basic ostream ops\n\n") for c in cppHeader.classes: - if c[-2] == ':' and c[-1] == ':': continue #ostream operator cannot be overloaded for anonymous struct therefore it is skipped + if c[-2] == ":" and c[-1] == ":": + continue # ostream operator cannot be overloaded for anonymous struct therefore it is skipped if "union" in c: continue if c in structs_analyzed: continue - if c == 'max_align_t' or c == '__fsid_t': # Skipping as it is defined in multiple domains - continue + if ( + c == "max_align_t" or c == "__fsid_t" + ): # Skipping as it is defined in multiple domains + continue if len(cppHeader.classes[c]["properties"]["public"]) != 0: - output_filename_h.write("inline static std::ostream& operator<<(std::ostream& out, const " + c + "& v)\n") - output_filename_h.write("{\n") - output_filename_h.write(" std::operator<<(out, '{');\n") - output_filename_h.write(" " + apiname.upper() + "_depth_max_cnt++;\n") - output_filename_h.write(" if (" + apiname.upper() + "_depth_max == -1 || " + apiname.upper() + "_depth_max_cnt <= " + apiname.upper() + "_depth_max" + ") {\n" ) - process_struct(output_filename_h, c, cppHeader, "", apiname) - global_str = "\n".join(global_str.split("\n")[0:-3]) - if global_str != '': global_str += "\n }\n" - output_filename_h.write(global_str) - output_filename_h.write(" };\n") - output_filename_h.write(" " + apiname.upper() + "_depth_max_cnt--;\n") - output_filename_h.write(" std::operator<<(out, '}');\n") - output_filename_h.write(" return out;\n") - output_filename_h.write("}\n") - global_str = '' - global_ops += "inline static std::ostream& operator<<(std::ostream& out, const " + c + "& v)\n" + "{\n" + " roctracer::" + apiname.lower() + "_support::detail::operator<<(out, v);\n" + " return out;\n" + "}\n\n" + output_filename_h.write( + "inline static std::ostream& operator<<(std::ostream& out, const " + + c + + "& v)\n" + ) + output_filename_h.write("{\n") + output_filename_h.write(" std::operator<<(out, '{');\n") + output_filename_h.write(" " + apiname.upper() + "_depth_max_cnt++;\n") + output_filename_h.write( + " if (" + + apiname.upper() + + "_depth_max == -1 || " + + apiname.upper() + + "_depth_max_cnt <= " + + apiname.upper() + + "_depth_max" + + ") {\n" + ) + process_struct(output_filename_h, c, cppHeader, "", apiname) + global_str = "\n".join(global_str.split("\n")[0:-3]) + if global_str != "": + global_str += "\n }\n" + output_filename_h.write(global_str) + output_filename_h.write(" };\n") + output_filename_h.write(" " + apiname.upper() + "_depth_max_cnt--;\n") + output_filename_h.write(" std::operator<<(out, '}');\n") + output_filename_h.write(" return out;\n") + output_filename_h.write("}\n") + global_str = "" + global_ops += ( + "inline static std::ostream& operator<<(std::ostream& out, const " + + c + + "& v)\n" + + "{\n" + + " roctracer::" + + apiname.lower() + + "_support::detail::operator<<(out, v);\n" + + " return out;\n" + + "}\n\n" + ) if rank == 1 or rank == 2: - footer = '// end ostream ops for '+ apiname + ' \n' - footer += '};};};\n\n' - output_filename_h.write(footer) - output_filename_h.write(global_ops) - footer = '#endif 
//__cplusplus\n' + \ - '#endif // INC_' + apiname + '_OSTREAM_OPS_H_\n' + \ - ' \n' - output_filename_h.write(footer) - output_filename_h.write('#include ') - output_filename_h.close() - print('File ' + outfilepath + ' generated') + footer = "// end ostream ops for " + apiname + " \n" + footer += "};};};\n\n" + output_filename_h.write(footer) + output_filename_h.write(global_ops) + footer = ( + "#endif //__cplusplus\n" + + "#endif // INC_" + + apiname + + "_OSTREAM_OPS_H_\n" + + " \n" + ) + output_filename_h.write(footer) + output_filename_h.write("#include ") + output_filename_h.close() + print("File " + outfilepath + " generated") return -parser = argparse.ArgumentParser(description='genOstreamOps.py: generates ostream operators for all typedefs in provided input file.') -requiredNamed = parser.add_argument_group('Required arguments') -requiredNamed.add_argument('-in', metavar='fileList', help='Comma separated list of header files to be parsed', required=True) -requiredNamed.add_argument('-out', metavar='file', help='Output file with ostream operators', required=True) + +parser = argparse.ArgumentParser( + description="genOstreamOps.py: generates ostream operators for all typedefs in provided input file." +) +requiredNamed = parser.add_argument_group("Required arguments") +requiredNamed.add_argument( + "-in", + metavar="fileList", + help="Comma separated list of header files to be parsed", + required=True, +) +requiredNamed.add_argument( + "-out", metavar="file", help="Output file with ostream operators", required=True +) args = vars(parser.parse_args()) -if __name__ == '__main__': - flist = args['in'].split(',') - if len(flist) == 1: - gen_cppheader(flist[0], args['out'],2) - else: - for i in range(len(flist)): - if i == 0: - gen_cppheader(flist[i], args['out'],0) - elif i == len(flist)-1: - gen_cppheader(flist[i], args['out'],1) - else: - gen_cppheader(flist[i], args['out'],-1) +if __name__ == "__main__": + flist = args["in"].split(",") + if len(flist) == 1: + gen_cppheader(flist[0], args["out"], 2) + else: + for i in range(len(flist)): + if i == 0: + gen_cppheader(flist[i], args["out"], 0) + elif i == len(flist) - 1: + gen_cppheader(flist[i], args["out"], 1) + else: + gen_cppheader(flist[i], args["out"], -1) diff --git a/script/hsaap.py b/script/hsaap.py index 784a6432..153a5e00 100755 --- a/script/hsaap.py +++ b/script/hsaap.py @@ -25,507 +25,610 @@ from __future__ import print_function import os, sys, re -H_OUT='hsa_prof_str.h' -CPP_OUT='hsa_prof_str.inline.h' -API_TABLES_H = 'hsa_api_trace.h' +H_OUT = "hsa_prof_str.h" +CPP_OUT = "hsa_prof_str.inline.h" +API_TABLES_H = "hsa_api_trace.h" API_HEADERS_H = ( - ('CoreApi', 'hsa.h'), - ('AmdExt', 'hsa_ext_amd.h'), - ('ImageExt', 'hsa_ext_image.h'), - ('AmdExt', API_TABLES_H), + ("CoreApi", "hsa.h"), + ("AmdExt", "hsa_ext_amd.h"), + ("ImageExt", "hsa_ext_image.h"), + ("AmdExt", API_TABLES_H), +) + +LICENSE = ( + "/* Copyright (c) 2018-2022 Advanced Micro Devices, Inc.\n" + + "\n" + + " Permission is hereby granted, free of charge, to any person obtaining a copy\n" + + ' of this software and associated documentation files (the "Software"), to deal\n' + + " in the Software without restriction, including without limitation the rights\n" + + " to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n" + + " copies of the Software, and to permit persons to whom the Software is\n" + + " furnished to do so, subject to the following conditions:\n" + + "\n" + + " The above copyright notice and this permission notice shall be 
included in\n" + + " all copies or substantial portions of the Software.\n" + + "\n" + + ' THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n' + + " IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n" + + " FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n" + + " AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n" + + " LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n" + + " OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN\n" + + " THE SOFTWARE. */\n" ) -LICENSE = \ -'/* Copyright (c) 2018-2022 Advanced Micro Devices, Inc.\n' + \ -'\n' + \ -' Permission is hereby granted, free of charge, to any person obtaining a copy\n' + \ -' of this software and associated documentation files (the "Software"), to deal\n' + \ -' in the Software without restriction, including without limitation the rights\n' + \ -' to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n' + \ -' copies of the Software, and to permit persons to whom the Software is\n' + \ -' furnished to do so, subject to the following conditions:\n' + \ -'\n' + \ -' The above copyright notice and this permission notice shall be included in\n' + \ -' all copies or substantial portions of the Software.\n' + \ -'\n' + \ -' THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n' + \ -' IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n' + \ -' FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n' + \ -' AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n' + \ -' LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n' + \ -' OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN\n' + \ -' THE SOFTWARE. 
*/\n' ############################################################# # Error handler def fatal(module, msg): - print (module + ' Error: "' + msg + '"', file = sys.stderr) - sys.exit(1) + print(module + ' Error: "' + msg + '"', file=sys.stderr) + sys.exit(1) + # Get next text block def NextBlock(pos, record): - if len(record) == 0: return pos - - space_pattern = re.compile(r'(\s+)') - word_pattern = re.compile(r'([\w\*]+)') - if record[pos] != '(': - m = space_pattern.match(record, pos) - if not m: - m = word_pattern.match(record, pos) - if m: - return pos + len(m.group(1)) + if len(record) == 0: + return pos + + space_pattern = re.compile(r"(\s+)") + word_pattern = re.compile(r"([\w\*]+)") + if record[pos] != "(": + m = space_pattern.match(record, pos) + if not m: + m = word_pattern.match(record, pos) + if m: + return pos + len(m.group(1)) + else: + fatal("NextBlock", "bad record '" + record + "' pos(" + str(pos) + ")") else: - fatal('NextBlock', "bad record '" + record + "' pos(" + str(pos) + ")") - else: - count = 0 - for index in range(pos, len(record)): - if record[index] == '(': - count = count + 1 - elif record[index] == ')': - count = count - 1 - if count == 0: - index = index + 1 - break - if count != 0: - fatal('NextBlock', "count is not zero (" + str(count) + ")") - if record[index - 1] != ')': - fatal('NextBlock', "last char is not ')' '" + record[index - 1] + "'") - return index + count = 0 + for index in range(pos, len(record)): + if record[index] == "(": + count = count + 1 + elif record[index] == ")": + count = count - 1 + if count == 0: + index = index + 1 + break + if count != 0: + fatal("NextBlock", "count is not zero (" + str(count) + ")") + if record[index - 1] != ")": + fatal("NextBlock", "last char is not ')' '" + record[index - 1] + "'") + return index + ############################################################# # API table parser class class API_TableParser: - def fatal(self, msg): - fatal('API_TableParser', msg) - - def __init__(self, header, name): - self.name = name - - if not os.path.isfile(header): - self.fatal("file '" + header + "' not found") - - self.inp = open(header, 'r') - - self.beg_pattern = re.compile('^\s*struct\s+' + name + 'Table\s*{\s*$') - self.end_pattern = re.compile('^\s*};\s*$') - self.array = [] - self.parse() - - # normalizing a line - def norm_line(self, line): - return re.sub(r'^\s+', r' ', line[:-1]) - - # check for start record - def is_start(self, record): - return self.beg_pattern.match(record) - - # check for end record - def is_end(self, record): - return self.end_pattern.match(record) - - # check for declaration entry record - def is_entry(self, record): - return re.match(r'^\s*decltype\(([^\)]*)\)', record) - - # parse method - def parse(self): - active = 0 - for line in self.inp.readlines(): - record = self.norm_line(line) - if self.is_start(record): active = 1 - if active != 0: - if self.is_end(record): return - m = self.is_entry(record) - if m: - self.array.append(m.group(1)) + def fatal(self, msg): + fatal("API_TableParser", msg) + + def __init__(self, header, name): + self.name = name + + if not os.path.isfile(header): + self.fatal("file '" + header + "' not found") + + self.inp = open(header, "r") + + self.beg_pattern = re.compile("^\s*struct\s+" + name + "Table\s*{\s*$") + self.end_pattern = re.compile("^\s*};\s*$") + self.array = [] + self.parse() + + # normalizing a line + def norm_line(self, line): + return re.sub(r"^\s+", r" ", line[:-1]) + + # check for start record + def is_start(self, record): + return 
self.beg_pattern.match(record) + + # check for end record + def is_end(self, record): + return self.end_pattern.match(record) + + # check for declaration entry record + def is_entry(self, record): + return re.match(r"^\s*decltype\(([^\)]*)\)", record) + + # parse method + def parse(self): + active = 0 + for line in self.inp.readlines(): + record = self.norm_line(line) + if self.is_start(record): + active = 1 + if active != 0: + if self.is_end(record): + return + m = self.is_entry(record) + if m: + self.array.append(m.group(1)) + ############################################################# # API declaration parser class class API_DeclParser: - def fatal(self, msg): - fatal('API_DeclParser', msg) - - def __init__(self, header, array, data): - if not os.path.isfile(header): - self.fatal("file '" + header + "' not found") - - self.inp = open(header, 'r') - - self.end_pattern = re.compile('\);\s*$') - self.data = data - for call in array: - if call in data: - self.fatal(call + ' is already found') - self.parse(call) - - # api record filter - def api_filter(self, record): - record = re.sub(r'\sHSA_API\s', r' ', record) - record = re.sub(r'\sHSA_DEPRECATED\s', r' ', record) - return record - - # check for start record - def is_start(self, call, record): - return re.search('\s' + call + '\s*\(', record) - - # check for API method record - def is_api(self, call, record): - record = self.api_filter(record) - return re.match('\s+\S+\s+' + call + '\s*\(', record) - - # check for end record - def is_end(self, record): - return self.end_pattern.search(record) - - # parse method args - def get_args(self, record): - struct = {'ret': '', 'args': '', 'astr': {}, 'alst': [], 'tlst': []} - record = re.sub(r'^\s+', r'', record) - record = re.sub(r'\s*(\*+)\s*', r'\1 ', record) - rind = NextBlock(0, record) - struct['ret'] = record[0:rind] - pos = record.find('(') - end = NextBlock(pos, record); - args = record[pos:end] - args = re.sub(r'^\(\s*', r'', args) - args = re.sub(r'\s*\)$', r'', args) - args = re.sub(r'\s*,\s*', r',', args) - struct['args'] = re.sub(r',', r', ', args) - if len(args) == 0: return struct - - pos = 0 - args = args + ',' - while pos < len(args): - ind1 = NextBlock(pos, args) # type - ind2 = NextBlock(ind1, args) # space - if args[ind2] != '(': - while ind2 < len(args): - end = NextBlock(ind2, args) - if args[end] == ',': break - else: ind2 = end - name = args[ind2:end] - else: - ind3 = NextBlock(ind2, args) # field - m = re.match(r'\(\s*\*\s*(\S+)\s*\)', args[ind2:ind3]) - if not m: - self.fatal("bad block3 '" + args + "' : '" + args[ind2:ind3] + "'") - name = m.group(1) - end = NextBlock(ind3, args) # the rest - item = args[pos:end] - struct['astr'][name] = item - struct['alst'].append(name) - struct['tlst'].append(item) - if args[end] != ',': - self.fatal("no comma '" + args + "'") - pos = end + 1 - - return struct - - # parse given api - def parse(self, call): - record = '' - active = 0 - found = 0 - api_name = '' - prev_line = '' - - self.inp.seek(0) - for line in self.inp.readlines(): - record += ' ' + line[:-1] - record = re.sub(r'^\s*', r' ', record) - - if active == 0: - if self.is_start(call, record): - active = 1 - m = self.is_api(call, record) - if not m: - record = ' ' + prev_line + ' ' + record - m = self.is_api(call, record) - if not m: - self.fatal("bad api '" + line + "'") - - if active == 1: - if self.is_end(record): - self.data[call] = self.get_args(record) - active = 0 - found = 0 - - if active == 0: record = '' - prev_line = line + def fatal(self, msg): + 
fatal("API_DeclParser", msg) + + def __init__(self, header, array, data): + if not os.path.isfile(header): + self.fatal("file '" + header + "' not found") + + self.inp = open(header, "r") + + self.end_pattern = re.compile("\);\s*$") + self.data = data + for call in array: + if call in data: + self.fatal(call + " is already found") + self.parse(call) + + # api record filter + def api_filter(self, record): + record = re.sub(r"\sHSA_API\s", r" ", record) + record = re.sub(r"\sHSA_DEPRECATED\s", r" ", record) + return record + + # check for start record + def is_start(self, call, record): + return re.search("\s" + call + "\s*\(", record) + + # check for API method record + def is_api(self, call, record): + record = self.api_filter(record) + return re.match("\s+\S+\s+" + call + "\s*\(", record) + + # check for end record + def is_end(self, record): + return self.end_pattern.search(record) + + # parse method args + def get_args(self, record): + struct = {"ret": "", "args": "", "astr": {}, "alst": [], "tlst": []} + record = re.sub(r"^\s+", r"", record) + record = re.sub(r"\s*(\*+)\s*", r"\1 ", record) + rind = NextBlock(0, record) + struct["ret"] = record[0:rind] + pos = record.find("(") + end = NextBlock(pos, record) + args = record[pos:end] + args = re.sub(r"^\(\s*", r"", args) + args = re.sub(r"\s*\)$", r"", args) + args = re.sub(r"\s*,\s*", r",", args) + struct["args"] = re.sub(r",", r", ", args) + if len(args) == 0: + return struct + + pos = 0 + args = args + "," + while pos < len(args): + ind1 = NextBlock(pos, args) # type + ind2 = NextBlock(ind1, args) # space + if args[ind2] != "(": + while ind2 < len(args): + end = NextBlock(ind2, args) + if args[end] == ",": + break + else: + ind2 = end + name = args[ind2:end] + else: + ind3 = NextBlock(ind2, args) # field + m = re.match(r"\(\s*\*\s*(\S+)\s*\)", args[ind2:ind3]) + if not m: + self.fatal("bad block3 '" + args + "' : '" + args[ind2:ind3] + "'") + name = m.group(1) + end = NextBlock(ind3, args) # the rest + item = args[pos:end] + struct["astr"][name] = item + struct["alst"].append(name) + struct["tlst"].append(item) + if args[end] != ",": + self.fatal("no comma '" + args + "'") + pos = end + 1 + + return struct + + # parse given api + def parse(self, call): + record = "" + active = 0 + found = 0 + api_name = "" + prev_line = "" + + self.inp.seek(0) + for line in self.inp.readlines(): + record += " " + line[:-1] + record = re.sub(r"^\s*", r" ", record) + + if active == 0: + if self.is_start(call, record): + active = 1 + m = self.is_api(call, record) + if not m: + record = " " + prev_line + " " + record + m = self.is_api(call, record) + if not m: + self.fatal("bad api '" + line + "'") + + if active == 1: + if self.is_end(record): + self.data[call] = self.get_args(record) + active = 0 + found = 0 + + if active == 0: + record = "" + prev_line = line + ############################################################# # API description parser class class API_DescrParser: - def fatal(self, msg): - fatal('API_DescrParser', msg) - - def __init__(self, out_h_file, hsa_dir, api_table_h, api_headers, license): - out_macro = re.sub(r'[\/\.]', r'_', out_h_file.upper()) + '_' - - self.h_content = '' - self.cpp_content = '' - self.api_names = [] - self.api_calls = {} - self.api_rettypes = set() - self.api_id = {} - - api_data = {} - api_list = [] - ns_calls = [] - - for i in range(0, len(api_headers)): - (name, header) = api_headers[i] - - if i < len(api_headers) - 1: - api = API_TableParser(hsa_dir + api_table_h, name) - api_list = api.array - 
self.api_names.append(name) - self.api_calls[name] = api_list - else: - api_list = ns_calls - ns_calls = [] + def fatal(self, msg): + fatal("API_DescrParser", msg) - for call in api_list: - if call in api_data: - self.fatal("call '" + call + "' is already found") + def __init__(self, out_h_file, hsa_dir, api_table_h, api_headers, license): + out_macro = re.sub(r"[\/\.]", r"_", out_h_file.upper()) + "_" - API_DeclParser(hsa_dir + header, api_list, api_data) + self.h_content = "" + self.cpp_content = "" + self.api_names = [] + self.api_calls = {} + self.api_rettypes = set() + self.api_id = {} - for call in api_list: - if not call in api_data: - # Not-supported functions - ns_calls.append(call) + api_data = {} + api_list = [] + ns_calls = [] + + for i in range(0, len(api_headers)): + (name, header) = api_headers[i] + + if i < len(api_headers) - 1: + api = API_TableParser(hsa_dir + api_table_h, name) + api_list = api.array + self.api_names.append(name) + self.api_calls[name] = api_list + else: + api_list = ns_calls + ns_calls = [] + + for call in api_list: + if call in api_data: + self.fatal("call '" + call + "' is already found") + + API_DeclParser(hsa_dir + header, api_list, api_data) + + for call in api_list: + if not call in api_data: + # Not-supported functions + ns_calls.append(call) + else: + # API ID map + self.api_id[call] = "HSA_API_ID_" + call + # Return types + self.api_rettypes.add(api_data[call]["ret"]) + + self.api_rettypes.discard("void") + self.api_data = api_data + self.ns_calls = ns_calls + + self.h_content += ( + "/* Generated by " + os.path.basename(__file__) + " */\n" + license + "\n\n" + ) + + self.h_content += "/* HSA API tracing primitives\n" + for name, header in api_headers: + self.h_content += ( + " '" + + name + + "', header '" + + header + + "', " + + str(len(self.api_calls[name])) + + " funcs\n" + ) + for call in self.ns_calls: + self.h_content += " " + call + " was not parsed\n" + self.h_content += " */\n" + self.h_content += "\n" + self.h_content += "#ifndef " + out_macro + "\n" + self.h_content += "#define " + out_macro + "\n" + + self.h_content += self.add_section("API ID enumeration", " ", self.gen_id_enum) + + self.h_content += "/* Declarations of APIs intended for use only by tools. 
*/\n" + self.h_content += "typedef void (*hsa_amd_queue_intercept_packet_writer)(const void*, uint64_t);\n" + self.h_content += "typedef void (*hsa_amd_queue_intercept_handler)(const void*, uint64_t, uint64_t, void*,\n" + self.h_content += " hsa_amd_queue_intercept_packet_writer);\n" + self.h_content += "typedef void (*hsa_amd_runtime_queue_notifier)(const hsa_queue_t*, hsa_agent_t, void*);\n" + + self.h_content += self.add_section( + "API arg structure", " ", self.gen_arg_struct + ) + self.h_content += self.add_section( + "API output stream", " ", self.gen_out_stream + ) + self.h_content += "#endif /* " + out_macro + " */\n" + + self.cpp_content += ( + "/* Generated by " + os.path.basename(__file__) + " */\n" + license + "\n\n" + ) + + self.cpp_content += "#include \n" + self.cpp_content += "#include \n" + self.cpp_content += "namespace roctracer::hsa_support::detail {\n" + + self.cpp_content += "static CoreApiTable CoreApi_saved_before_cb;\n" + self.cpp_content += "static AmdExtTable AmdExt_saved_before_cb;\n" + self.cpp_content += "static ImageExtTable ImageExt_saved_before_cb;\n\n" + + self.cpp_content += self.add_section( + "API callback functions", "", self.gen_callbacks + ) + self.cpp_content += self.add_section( + "API intercepting code", "", self.gen_intercept + ) + self.cpp_content += self.add_section( + "API get_name function", " ", self.gen_get_name + ) + self.cpp_content += self.add_section( + "API get_code function", " ", self.gen_get_code + ) + self.cpp_content += "\n};\n" + + # add code section + def add_section(self, title, gap, fun): + content = "" + n = 0 + content += "\n/* section: " + title + " */\n\n" + content += fun(-1, "-", "-", {}) + for index in range(len(self.api_names)): + last = index == len(self.api_names) - 1 + name = self.api_names[index] + if n != 0: + if gap == "": + content += fun(n, name, "-", {}) + content += "\n" + content += gap + "/* block: " + name + " API */\n" + for call in self.api_calls[name]: + content += fun(n, name, call, self.api_data[call]) + n += 1 + content += fun(n, "-", "-", {}) + return content + + # generate API ID enumeration + def gen_id_enum(self, n, name, call, data): + content = "" + if n == -1: + content += "enum hsa_api_id_t {\n" + return content + if call != "-": + content += " " + self.api_id[call] + " = " + str(n) + ",\n" else: - # API ID map - self.api_id[call] = 'HSA_API_ID_' + call - # Return types - self.api_rettypes.add(api_data[call]['ret']) - - self.api_rettypes.discard('void') - self.api_data = api_data - self.ns_calls = ns_calls - - self.h_content += "/* Generated by " + os.path.basename(__file__) + " */\n" + license + "\n\n" - - self.h_content += "/* HSA API tracing primitives\n" - for (name, header) in api_headers: - self.h_content += " '" + name + "', header '" + header + "', " + str(len(self.api_calls[name])) + ' funcs\n' - for call in self.ns_calls: - self.h_content += ' ' + call + ' was not parsed\n' - self.h_content += " */\n" - self.h_content += '\n' - self.h_content += '#ifndef ' + out_macro + '\n' - self.h_content += '#define ' + out_macro + '\n' - - self.h_content += self.add_section('API ID enumeration', ' ', self.gen_id_enum) - - self.h_content += '/* Declarations of APIs intended for use only by tools. 
*/\n' - self.h_content += 'typedef void (*hsa_amd_queue_intercept_packet_writer)(const void*, uint64_t);\n' - self.h_content += 'typedef void (*hsa_amd_queue_intercept_handler)(const void*, uint64_t, uint64_t, void*,\n' - self.h_content += ' hsa_amd_queue_intercept_packet_writer);\n' - self.h_content += 'typedef void (*hsa_amd_runtime_queue_notifier)(const hsa_queue_t*, hsa_agent_t, void*);\n' - - self.h_content += self.add_section('API arg structure', ' ', self.gen_arg_struct) - self.h_content += self.add_section('API output stream', ' ', self.gen_out_stream) - self.h_content += '#endif /* ' + out_macro + ' */\n' - - self.cpp_content += "/* Generated by " + os.path.basename(__file__) + " */\n" + license + "\n\n" - - self.cpp_content += '#include \n' - self.cpp_content += '#include \n' - self.cpp_content += 'namespace roctracer::hsa_support::detail {\n' - - self.cpp_content += 'static CoreApiTable CoreApi_saved_before_cb;\n' - self.cpp_content += 'static AmdExtTable AmdExt_saved_before_cb;\n' - self.cpp_content += 'static ImageExtTable ImageExt_saved_before_cb;\n\n' - - self.cpp_content += self.add_section('API callback functions', '', self.gen_callbacks) - self.cpp_content += self.add_section('API intercepting code', '', self.gen_intercept) - self.cpp_content += self.add_section('API get_name function', ' ', self.gen_get_name) - self.cpp_content += self.add_section('API get_code function', ' ', self.gen_get_code) - self.cpp_content += '\n};\n' - - # add code section - def add_section(self, title, gap, fun): - content = '' - n = 0 - content += '\n/* section: ' + title + ' */\n\n' - content += fun(-1, '-', '-', {}) - for index in range(len(self.api_names)): - last = (index == len(self.api_names) - 1) - name = self.api_names[index] - if n != 0: - if gap == '': content += fun(n, name, '-', {}) - content += '\n' - content += gap + '/* block: ' + name + ' API */\n' - for call in self.api_calls[name]: - content += fun(n, name, call, self.api_data[call]) - n += 1 - content += fun(n, '-', '-', {}) - return content - - # generate API ID enumeration - def gen_id_enum(self, n, name, call, data): - content = '' - if n == -1: - content += 'enum hsa_api_id_t {\n' - return content - if call != '-': - content += ' ' + self.api_id[call] + ' = ' + str(n) + ',\n' - else: - content += '\n' - content += ' HSA_API_ID_DISPATCH = ' + str(n) + ',\n' - content += ' HSA_API_ID_NUMBER = ' + str(n + 1) + ',\n' - content += '};\n' - return content - - # generate API args structure - def gen_arg_struct(self, n, name, call, struct): - content = '' - if n == -1: - content += 'typedef struct hsa_api_data_s {\n' - content += ' uint64_t correlation_id;\n' - content += ' uint32_t phase;\n' - content += ' union {\n' - for ret_type in self.api_rettypes: - content += ' ' + ret_type + ' ' + ret_type + '_retval;\n' - content += ' };\n' - content += ' union {\n' - return content - if call != '-': - content += ' struct {\n' - for (var, item) in struct['astr'].items(): - content += ' ' + item + ';\n' - if call == "hsa_amd_memory_async_copy_rect" and item == "const hsa_dim3_t* range": - content += ' hsa_dim3_t range__val;\n' - content += ' } ' + call + ';\n' - else: - content += ' } args;\n' - content += ' uint64_t *phase_data;\n' - content += '} hsa_api_data_t;\n' - return content - - # generate API callbacks - def gen_callbacks(self, n, name, call, struct): - content = '' - if n == -1: - content += '/* section: Static declarations */\n' - content += '\n' - if call != '-': - call_id = self.api_id[call]; - ret_type = struct['ret'] - 
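For orientation, every API call handled by this emitter (old and new variants alike) is wrapped in the same shape; the sketch below is reassembled from the fragments emitted here, with hsa_iterate_agents standing in for an arbitrary CoreApi entry. It is illustrative only, not verbatim generator output:

# Approximate shape of one generated wrapper (illustrative reconstruction):
#
#   static hsa_status_t hsa_iterate_agents_callback(<args>) {
#     hsa_trace_data_t trace_data;
#     bool enabled{false};
#     if (auto function = report_activity.load(std::memory_order_relaxed); function &&
#         (enabled = function(ACTIVITY_DOMAIN_HSA_API,
#                             HSA_API_ID_hsa_iterate_agents, &trace_data) == 0)) {
#       if (trace_data.phase_enter != nullptr) {
#         /* capture each argument into trace_data.api_data.args.<call>... */
#         trace_data.phase_enter(HSA_API_ID_hsa_iterate_agents, &trace_data);
#       }
#     }
#     trace_data.api_data.hsa_status_t_retval =
#         CoreApi_saved_before_cb.hsa_iterate_agents_fn(<args>);
#     if (enabled && trace_data.phase_exit != nullptr)
#       trace_data.phase_exit(HSA_API_ID_hsa_iterate_agents, &trace_data);
#     return trace_data.api_data.hsa_status_t_retval;
#   }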
content += 'static ' + ret_type + ' ' + call + '_callback(' + struct['args'] + ') {\n' - - content += ' hsa_trace_data_t trace_data;\n' - content += ' bool enabled{false};\n' - content += '\n' - content += ' if (auto function = report_activity.load(std::memory_order_relaxed); function &&\n' - content += ' (enabled =\n' - content += ' function(ACTIVITY_DOMAIN_HSA_API, ' + call_id + ', &trace_data) == 0)) {\n' - content += ' if (trace_data.phase_enter != nullptr) {\n' - - for var in struct['alst']: - item = struct['astr'][var]; - if re.search(r'char\* ', item): - # FIXME: we should not strdup the char* arguments here, as the callback will not outlive the scope of this function. Instead, we - # should generate a helper function to capture the content of the arguments similar to hipApiArgsInit for HIP. We also need a - # helper to free the memory that is allocated to capture the content. - content += ' trace_data.api_data.args.' + call + '.' + var + ' = ' + '(' + var + ' != NULL) ? strdup(' + var + ')' + ' : NULL;\n' + content += "\n" + content += " HSA_API_ID_DISPATCH = " + str(n) + ",\n" + content += " HSA_API_ID_NUMBER = " + str(n + 1) + ",\n" + content += "};\n" + return content + + # generate API args structure + def gen_arg_struct(self, n, name, call, struct): + content = "" + if n == -1: + content += "typedef struct hsa_api_data_s {\n" + content += " uint64_t correlation_id;\n" + content += " uint32_t phase;\n" + content += " union {\n" + for ret_type in self.api_rettypes: + content += " " + ret_type + " " + ret_type + "_retval;\n" + content += " };\n" + content += " union {\n" + return content + if call != "-": + content += " struct {\n" + for var, item in struct["astr"].items(): + content += " " + item + ";\n" + if ( + call == "hsa_amd_memory_async_copy_rect" + and item == "const hsa_dim3_t* range" + ): + content += " hsa_dim3_t range__val;\n" + content += " } " + call + ";\n" else: - content += ' trace_data.api_data.args.' + call + '.' + var + ' = ' + var + ';\n' - if call == 'hsa_amd_memory_async_copy_rect' and var == 'range': - content += ' trace_data.api_data.args.' + call + '.' + var + '__val = ' + '*(' + var + ');\n' - - content += ' trace_data.phase_enter(' + call_id + ', &trace_data);\n' - content += ' }\n' - content += ' }\n' - content += '\n' - - if ret_type != 'void': - content += ' trace_data.api_data.' + ret_type + '_retval = ' - content += ' ' + name + '_saved_before_cb.' + call + '_fn(' + ', '.join(struct['alst']) + ');\n' - - content += '\n' - content += ' if (enabled && trace_data.phase_exit != nullptr)\n' - content += ' trace_data.phase_exit(' + call_id + ', &trace_data);\n' - - if ret_type != 'void': - content += ' return trace_data.api_data.' 
+ ret_type + '_retval;\n' - content += '}\n' - - return content - - # generate API intercepting code - def gen_intercept(self, n, name, call, struct): - content = '' - if n > 0 and call == '-': - content += '};\n' - if n == 0 or (call == '-' and name != '-'): - content += 'static void Install' + name + 'Wrappers(' + name + 'Table* table) {\n' - content += ' ' + name + '_saved_before_cb = *table;\n' - if call != '-': - if call != 'hsa_shut_down': - content += ' table->' + call + '_fn = ' + call + '_callback;\n' - else: - content += ' { void* p = (void*)' + call + '_callback; (void)p; }\n' - return content - - # generate API name function - def gen_get_name(self, n, name, call, struct): - content = '' - if n == -1: - content += 'static const char* GetApiName(uint32_t id) {\n' - content += ' switch (id) {\n' - return content - if call != '-': - content += ' case ' + self.api_id[call] + ': return "' + call + '";\n' - else: - content += ' }\n' - content += ' return "unknown";\n' - content += '}\n' - return content - - # generate API code function - def gen_get_code(self, n, name, call, struct): - content = '' - if n == -1: - content += 'static uint32_t GetApiCode(const char* str) {\n' - return content - if call != '-': - content += ' if (strcmp("' + call + '", str) == 0) return ' + self.api_id[call] + ';\n' - else: - content += ' return HSA_API_ID_NUMBER;\n' - content += '}\n' - return content - - # generate stream operator - def gen_out_stream(self, n, name, call, struct): - content = '' - if n == -1: - content += '#ifdef __cplusplus\n' - content += '#include "hsa_ostream_ops.h"\n' - content += 'typedef std::pair hsa_api_data_pair_t;\n' - content += 'inline std::ostream& operator<< (std::ostream& out, const hsa_api_data_pair_t& data_pair) {\n' - content += ' const uint32_t cid = data_pair.first;\n' - content += ' const hsa_api_data_t& api_data = data_pair.second;\n' - content += ' switch(cid) {\n' - return content - if call != '-': - content += ' case ' + self.api_id[call] + ': {\n' - content += ' out << "' + call + '(";\n' - arg_list = struct['alst'] - if len(arg_list) != 0: - for ind in range(len(arg_list)): - arg_var = arg_list[ind] - arg_val = 'api_data.args.' + call + '.' 
+ arg_var - if re.search(r'char\* ', struct['astr'][arg_var]): - content += ' out << "0x" << std::hex << (uint64_t)' + arg_val - else: - content += ' out << ' + arg_val - if call == "hsa_amd_memory_async_copy_rect" and arg_var == "range": - content += ' << ", ";\n' - content += ' out << ' + arg_val + '__val' - ''' + content += " } args;\n" + content += " uint64_t *phase_data;\n" + content += "} hsa_api_data_t;\n" + return content + + # generate API callbacks + def gen_callbacks(self, n, name, call, struct): + content = "" + if n == -1: + content += "/* section: Static declarations */\n" + content += "\n" + if call != "-": + call_id = self.api_id[call] + ret_type = struct["ret"] + content += ( + "static " + + ret_type + + " " + + call + + "_callback(" + + struct["args"] + + ") {\n" + ) + + content += " hsa_trace_data_t trace_data;\n" + content += " bool enabled{false};\n" + content += "\n" + content += " if (auto function = report_activity.load(std::memory_order_relaxed); function &&\n" + content += " (enabled =\n" + content += ( + " function(ACTIVITY_DOMAIN_HSA_API, " + + call_id + + ", &trace_data) == 0)) {\n" + ) + content += " if (trace_data.phase_enter != nullptr) {\n" + + for var in struct["alst"]: + item = struct["astr"][var] + if re.search(r"char\* ", item): + # FIXME: we should not strdup the char* arguments here, as the callback will not outlive the scope of this function. Instead, we + # should generate a helper function to capture the content of the arguments similar to hipApiArgsInit for HIP. We also need a + # helper to free the memory that is allocated to capture the content. + content += ( + " trace_data.api_data.args." + + call + + "." + + var + + " = " + + "(" + + var + + " != NULL) ? strdup(" + + var + + ")" + + " : NULL;\n" + ) + else: + content += ( + " trace_data.api_data.args." + + call + + "." + + var + + " = " + + var + + ";\n" + ) + if call == "hsa_amd_memory_async_copy_rect" and var == "range": + content += ( + " trace_data.api_data.args." + + call + + "." + + var + + "__val = " + + "*(" + + var + + ");\n" + ) + + content += " trace_data.phase_enter(" + call_id + ", &trace_data);\n" + content += " }\n" + content += " }\n" + content += "\n" + + if ret_type != "void": + content += " trace_data.api_data." + ret_type + "_retval = " + content += ( + " " + + name + + "_saved_before_cb." + + call + + "_fn(" + + ", ".join(struct["alst"]) + + ");\n" + ) + + content += "\n" + content += " if (enabled && trace_data.phase_exit != nullptr)\n" + content += " trace_data.phase_exit(" + call_id + ", &trace_data);\n" + + if ret_type != "void": + content += " return trace_data.api_data." 
+ ret_type + "_retval;\n" + content += "}\n" + + return content + + # generate API intercepting code + def gen_intercept(self, n, name, call, struct): + content = "" + if n > 0 and call == "-": + content += "};\n" + if n == 0 or (call == "-" and name != "-"): + content += ( + "static void Install" + name + "Wrappers(" + name + "Table* table) {\n" + ) + content += " " + name + "_saved_before_cb = *table;\n" + if call != "-": + if call != "hsa_shut_down": + content += " table->" + call + "_fn = " + call + "_callback;\n" + else: + content += " { void* p = (void*)" + call + "_callback; (void)p; }\n" + return content + + # generate API name function + def gen_get_name(self, n, name, call, struct): + content = "" + if n == -1: + content += "static const char* GetApiName(uint32_t id) {\n" + content += " switch (id) {\n" + return content + if call != "-": + content += " case " + self.api_id[call] + ': return "' + call + '";\n' + else: + content += " }\n" + content += ' return "unknown";\n' + content += "}\n" + return content + + # generate API code function + def gen_get_code(self, n, name, call, struct): + content = "" + if n == -1: + content += "static uint32_t GetApiCode(const char* str) {\n" + return content + if call != "-": + content += ( + ' if (strcmp("' + + call + + '", str) == 0) return ' + + self.api_id[call] + + ";\n" + ) + else: + content += " return HSA_API_ID_NUMBER;\n" + content += "}\n" + return content + + # generate stream operator + def gen_out_stream(self, n, name, call, struct): + content = "" + if n == -1: + content += "#ifdef __cplusplus\n" + content += '#include "hsa_ostream_ops.h"\n' + content += ( + "typedef std::pair hsa_api_data_pair_t;\n" + ) + content += "inline std::ostream& operator<< (std::ostream& out, const hsa_api_data_pair_t& data_pair) {\n" + content += " const uint32_t cid = data_pair.first;\n" + content += " const hsa_api_data_t& api_data = data_pair.second;\n" + content += " switch(cid) {\n" + return content + if call != "-": + content += " case " + self.api_id[call] + ": {\n" + content += ' out << "' + call + '(";\n' + arg_list = struct["alst"] + if len(arg_list) != 0: + for ind in range(len(arg_list)): + arg_var = arg_list[ind] + arg_val = "api_data.args." + call + "." + arg_var + if re.search(r"char\* ", struct["astr"][arg_var]): + content += ' out << "0x" << std::hex << (uint64_t)' + arg_val + else: + content += " out << " + arg_val + if ( + call == "hsa_amd_memory_async_copy_rect" + and arg_var == "range" + ): + content += ' << ", ";\n' + content += " out << " + arg_val + "__val" + """ arg_item = struct['tlst'][ind] if re.search(r'\(\* ', arg_item): arg_pref = '' elif re.search(r'void\* ', arg_item): arg_pref = '' @@ -536,46 +639,53 @@ def gen_out_stream(self, n, name, call, struct): content += ' if (' + arg_val + ') out << ' + arg_pref + '(' + arg_val + '); else out << ' + arg_val else: content += ' out << ' + arg_val - ''' - if ind < len(arg_list) - 1: content += ' << ", ";\n' - else: content += ';\n' - if struct['ret'] != 'void': - content += ' out << ") = " << api_data.' 
+ struct['ret'] + '_retval;\n' - else: - content += ' out << ") = void";\n' - content += ' break;\n' - content += ' }\n' - else: - content += ' default:\n' - content += ' out << "ERROR: unknown API";\n' - content += ' abort();\n' - content += ' }\n' - content += ' return out;\n' - content += '}\n' - content += '#endif\n' - return content + """ + if ind < len(arg_list) - 1: + content += ' << ", ";\n' + else: + content += ";\n" + if struct["ret"] != "void": + content += ( + ' out << ") = " << api_data.' + struct["ret"] + "_retval;\n" + ) + else: + content += ' out << ") = void";\n' + content += " break;\n" + content += " }\n" + else: + content += " default:\n" + content += ' out << "ERROR: unknown API";\n' + content += " abort();\n" + content += " }\n" + content += " return out;\n" + content += "}\n" + content += "#endif\n" + return content + ############################################################# # main # Usage if len(sys.argv) != 3: - print ("Usage:", sys.argv[0], " ", file=sys.stderr) - sys.exit(1) + print( + "Usage:", sys.argv[0], " ", file=sys.stderr + ) + sys.exit(1) else: - PREFIX = sys.argv[1] + '/' - HSA_DIR = sys.argv[2] + '/' + PREFIX = sys.argv[1] + "/" + HSA_DIR = sys.argv[2] + "/" descr = API_DescrParser(H_OUT, HSA_DIR, API_TABLES_H, API_HEADERS_H, LICENSE) out_file = PREFIX + H_OUT -print ('Generating "' + out_file + '"') -f = open(out_file, 'w') +print('Generating "' + out_file + '"') +f = open(out_file, "w") f.write(descr.h_content[:-1]) f.close() out_file = PREFIX + CPP_OUT -print ('Generating "' + out_file + '"') -f = open(out_file, 'w') +print('Generating "' + out_file + '"') +f = open(out_file, "w") f.write(descr.cpp_content[:-1]) f.close() ############################################################# diff --git a/script/leak-sanitizer-suppr.txt b/script/leak-sanitizer-suppr.txt new file mode 100644 index 00000000..8aad4454 --- /dev/null +++ b/script/leak-sanitizer-suppr.txt @@ -0,0 +1,8 @@ +# +# LeakSanitizer suppressions file for rocprofiler project. 
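+# Syntax: "leak:<pattern>"; a leak report is suppressed when <pattern> matches
+# a module, source file, or function name in the reported allocation stack.
+# The entries below cover known leaks in surrounding ROCm components rather
+# than in rocprofiler itself.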
+#
+
+leak:amd_comgr
+leak:hsa-runtime
+leak:amdhip
+leak:python
diff --git a/script/run-ci.py b/script/run-ci.py
new file mode 100755
index 00000000..51243891
--- /dev/null
+++ b/script/run-ci.py
@@ -0,0 +1,454 @@
+#!/usr/bin/env python3
+
+
+import os
+import re
+import sys
+import glob
+import socket
+import shutil
+import argparse
+import multiprocessing
+
+# these constants are used to define CTEST_PROJECT_NAME
+# and the default value for CTEST_SUBMIT_URL
+_PROJECT_NAME = "rocprofiler"
+_BASE_URL = "10.194.116.31/cdash"
+
+
+def which(cmd, require):
+    v = shutil.which(cmd)
+    if require and v is None:
+        raise RuntimeError(f"{cmd} not found")
+    return v if v is not None else ""
+
+
+def generate_custom(args, cmake_args, ctest_args):
+    if not os.path.exists(args.binary_dir):
+        os.makedirs(args.binary_dir)
+
+    if args.memcheck is not None:
+        if args.coverage:
+            raise ValueError(
+                f"Enabling --memcheck={args.memcheck} and --coverage not supported"
+            )
+        cmake_args += [f"-DROCPROFILER_MEMCHECK={args.memcheck}"]
+
+    NAME = args.name
+    SITE = args.site
+    BUILD_JOBS = args.build_jobs
+    SUBMIT_URL = args.submit_url
+    SOURCE_DIR = os.path.realpath(args.source_dir)
+    BINARY_DIR = os.path.realpath(args.binary_dir)
+    CMAKE_ARGS = " ".join(cmake_args)
+    CTEST_ARGS = " ".join(ctest_args)
+
+    GIT_CMD = which("git", require=True)
+    GCOV_CMD = which("gcov", require=False)
+    CMAKE_CMD = which("cmake", require=True)
+    # CTEST_CMD = which("ctest", require=True)
+
+    NAME = re.sub(r"(.*)-([0-9]+)/merge", "PR_\\2_\\1", NAME)
+
+    DEFAULT_CMAKE_ARGS = " ".join(
+        [f"-DROCPROFILER_BUILD_{x}=ON" for x in ["CI", "TESTS", "SAMPLES"]]
+    )
+
+    GPU_TARGETS = ";".join(args.gpu_targets)
+    MEMCHECK_TYPE = "" if args.memcheck is None else args.memcheck
+
+    MEMCHECK_SANITIZER_OPTIONS = ""
+    MEMCHECK_SUPPRESSION_FILE = ""
+
+    if MEMCHECK_TYPE == "AddressSanitizer":
+        MEMCHECK_SANITIZER_OPTIONS = "detect_leaks=0 use_sigaltstack=0"
+        MEMCHECK_SUPPRESSION_FILE = f"{SOURCE_DIR}/script/address-sanitizer-suppr.txt"
+    elif MEMCHECK_TYPE == "LeakSanitizer":
+        MEMCHECK_SUPPRESSION_FILE = f"{SOURCE_DIR}/script/leak-sanitizer-suppr.txt"
+    elif MEMCHECK_TYPE == "ThreadSanitizer":
+        external_symbolizer_path = ""
+        for version in range(8, 20):
+            _symbolizer = shutil.which(f"llvm-symbolizer-{version}")
+            if _symbolizer:
+                external_symbolizer_path = f"external_symbolizer_path={_symbolizer}"
+        os.environ["TSAN_OPTIONS"] = " ".join(
+            [
+                "history_size=5",
+                "second_deadlock_stack=1",
+                f"suppressions={SOURCE_DIR}/script/thread-sanitizer-suppr.txt",
+                external_symbolizer_path,
+                os.environ.get("TSAN_OPTIONS", ""),
+            ]
+        )
+
+    return f"""
+    set(CTEST_PROJECT_NAME "{_PROJECT_NAME}")
+    set(CTEST_NIGHTLY_START_TIME "05:00:00 UTC")
+
+    set(CTEST_DROP_METHOD "http")
+    set(CTEST_DROP_SITE_CDASH TRUE)
+    set(CTEST_SUBMIT_URL "http://{SUBMIT_URL}")
+
+    set(CTEST_UPDATE_TYPE git)
+    set(CTEST_UPDATE_VERSION_ONLY TRUE)
+    set(CTEST_GIT_COMMAND {GIT_CMD})
+    set(CTEST_GIT_INIT_SUBMODULES FALSE)
+
+    set(CTEST_OUTPUT_ON_FAILURE TRUE)
+    set(CTEST_USE_LAUNCHERS TRUE)
+    set(CMAKE_CTEST_ARGUMENTS --output-on-failure {CTEST_ARGS})
+
+    set(CTEST_CUSTOM_MAXIMUM_NUMBER_OF_ERRORS "100")
+    set(CTEST_CUSTOM_MAXIMUM_NUMBER_OF_WARNINGS "100")
+    set(CTEST_CUSTOM_MAXIMUM_PASSED_TEST_OUTPUT_SIZE "51200")
+    set(CTEST_CUSTOM_COVERAGE_EXCLUDE "/usr/.*;/opt/.*;.*external/.*;.*samples/.*;.*test/.*;.*tests-v2/.*;.*perfetto/perfetto_sdk/.*;.*ctf/barectf.*")
+
+    set(CTEST_MEMORYCHECK_TYPE "{MEMCHECK_TYPE}")
+    set(CTEST_MEMORYCHECK_SUPPRESSIONS_FILE "{MEMCHECK_SUPPRESSION_FILE}")
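+    # These variables drive ctest_memcheck(): MEMORYCHECK_TYPE above selects the
+    # sanitizer mode, and CTest is expected to fold the suppressions file plus the
+    # sanitizer options below into the matching *SAN_OPTIONS environment at test time.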
+    set(CTEST_MEMORYCHECK_SANITIZER_OPTIONS "{MEMCHECK_SANITIZER_OPTIONS}")
+
+    set(CTEST_SITE "{SITE}")
+    set(CTEST_BUILD_NAME "{NAME}")
+
+    set(CTEST_SOURCE_DIRECTORY {SOURCE_DIR})
+    set(CTEST_BINARY_DIRECTORY {BINARY_DIR})
+
+    set(CTEST_CONFIGURE_COMMAND "{CMAKE_CMD} -B {BINARY_DIR} {SOURCE_DIR} {DEFAULT_CMAKE_ARGS} -DGPU_TARGETS={GPU_TARGETS} {CMAKE_ARGS}")
+    set(CTEST_BUILD_COMMAND "{CMAKE_CMD} --build {BINARY_DIR} --target all --parallel {BUILD_JOBS}")
+    set(CTEST_COVERAGE_COMMAND {GCOV_CMD})
+    """
+
+
+def generate_dashboard_script(args):
+    CODECOV = 1 if args.coverage else 0
+    DASHBOARD_MODE = args.mode
+    SOURCE_DIR = os.path.realpath(args.source_dir)
+    BINARY_DIR = os.path.realpath(args.binary_dir)
+    MEMCHECK = 1 if args.memcheck is not None else 0
+    SUBMIT = 0 if args.disable_cdash else 1
+    ARGN = "${ARGN}"
+
+    if args.memcheck == "ThreadSanitizer":
+        MEMCHECK = 0
+
+    _script = f"""
+    macro(dashboard_submit)
+        if("{SUBMIT}" GREATER 0)
+            ctest_submit({ARGN})
+        endif()
+    endmacro()
+    """
+
+    _script += """
+
+    include("${CMAKE_CURRENT_LIST_DIR}/CTestCustom.cmake")
+
+    macro(handle_error _message _ret)
+        if(NOT ${${_ret}} EQUAL 0)
+            dashboard_submit(PARTS Done RETURN_VALUE _submit_ret)
+            message(FATAL_ERROR "${_message} failed: ${${_ret}}")
+        endif()
+    endmacro()
+    """
+
+    _script += f"""
+    ctest_start({DASHBOARD_MODE})
+    ctest_update(SOURCE "{SOURCE_DIR}" RETURN_VALUE _update_ret
+                 CAPTURE_CMAKE_ERROR _update_err)
+    ctest_configure(BUILD "{BINARY_DIR}" RETURN_VALUE _configure_ret)
+    dashboard_submit(PARTS Start Update Configure RETURN_VALUE _submit_ret)
+
+    if(NOT _update_err EQUAL 0)
+        message(WARNING "ctest_update failed")
+    endif()
+
+    handle_error("Configure" _configure_ret)
+
+    ctest_build(BUILD "{BINARY_DIR}" RETURN_VALUE _build_ret)
+    dashboard_submit(PARTS Build RETURN_VALUE _submit_ret)
+
+    handle_error("Build" _build_ret)
+
+    if("{MEMCHECK}" GREATER 0)
+        ctest_memcheck(BUILD "{BINARY_DIR}" RETURN_VALUE _test_ret)
+        dashboard_submit(PARTS Test RETURN_VALUE _submit_ret)
+    else()
+        ctest_test(BUILD "{BINARY_DIR}" RETURN_VALUE _test_ret)
+        dashboard_submit(PARTS Test RETURN_VALUE _submit_ret)
+    endif()
+
+    if("{CODECOV}" GREATER 0)
+        ctest_coverage(
+            BUILD "{BINARY_DIR}"
+            RETURN_VALUE _coverage_ret
+            CAPTURE_CMAKE_ERROR _coverage_err)
+        dashboard_submit(PARTS Coverage RETURN_VALUE _submit_ret)
+    endif()
+
+    handle_error("Testing" _test_ret)
+
+    dashboard_submit(PARTS Done RETURN_VALUE _submit_ret)
+    """
+    return _script
+
+
+def parse_cdash_args(args):
+    BUILD_JOBS = multiprocessing.cpu_count()
+    DASHBOARD_MODE = "Continuous"
+    DASHBOARD_STAGES = [
+        "Start",
+        "Update",
+        "Configure",
+        "Build",
+        "Test",
+        "MemCheck",
+        "Coverage",
+        "Submit",
+    ]
+    SOURCE_DIR = os.getcwd()
+    BINARY_DIR = os.path.join(SOURCE_DIR, "build")
+    SITE = socket.gethostname()
+    SUBMIT_URL = f"{_BASE_URL}/submit.php?project={_PROJECT_NAME}"
+
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "-n", "--name", help="Job name", default=None, type=str, required=True
+    )
+    parser.add_argument("-s", "--site", help="Site name", default=SITE, type=str)
+    parser.add_argument(
+        "-q", "--quiet", help="Disable printing logs", action="store_true"
+    )
+    parser.add_argument(
+        "-c", "--coverage", help="Enable code coverage", action="store_true"
+    )
+    parser.add_argument(
+        "-j",
+        "--build-jobs",
+        help="Number of build tasks",
+        default=BUILD_JOBS,
+        type=int,
+    )
+    parser.add_argument(
+        "-B",
+        "--binary-dir",
+        help="Build directory",
+        default=BINARY_DIR,
+        type=str,
+    )
+    parser.add_argument(
"-S", + "--source-dir", + help="Source directory", + default=SOURCE_DIR, + type=str, + ) + parser.add_argument( + "-F", + "--clean", + help="Remove existing build directory", + action="store_true", + ) + parser.add_argument( + "-M", + "--mode", + help="Dashboard mode", + default=DASHBOARD_MODE, + choices=("Continuous", "Nightly", "Experimental"), + type=str, + ) + parser.add_argument( + "-T", + "--stages", + help="Dashboard stages", + nargs="+", + default=DASHBOARD_STAGES, + choices=DASHBOARD_STAGES, + type=str, + ) + parser.add_argument( + "--submit-url", + help="CDash submission site", + default=SUBMIT_URL, + type=str, + ) + parser.add_argument( + "--repeat-until-pass", + help=" for --repeat until-pass:", + default=None, + type=int, + ) + parser.add_argument( + "--repeat-until-fail", + help=" for --repeat until-fail:", + default=None, + type=int, + ) + parser.add_argument( + "--repeat-after-timeout", + help=" for --repeat after-timeout:", + default=None, + type=int, + ) + parser.add_argument( + "--disable-cdash", + help="Disable submitting results to CDash dashboard", + action="store_true", + ) + parser.add_argument( + "--gpu-targets", + help="GPU build architectures", + default="gfx900 gfx906 gfx908 gfx90a gfx1030 gfx1100".split(), + type=str, + nargs="+", + ) + parser.add_argument( + "--memcheck", + help="Run dynamic analysis tool", + default=None, + type=str, + choices=( + "ThreadSanitizer", + "AddressSanitizer", + "LeakSanitizer", + "MemorySanitizer", + "UndefinedBehaviorSanitizer", + ), + ) + parser.add_argument( + "--linter", + help="Enable linting tool", + default=None, + type=str, + choices=("clang-tidy",), + ) + + return parser.parse_args(args) + + +def parse_args(args=None): + if args is None: + args = sys.argv[1:] + + index = 0 + input_args = [] + ctest_args = [] + cmake_args = [] + data = [input_args, cmake_args, ctest_args] + cmd = os.path.basename(sys.argv[0]) + + for itr in args: + if itr == "--": + index += 1 + if index > 2: + raise RuntimeError( + f"Usage: {cmd} -- -- " + ) + else: + data[index].append(itr) + + cdash_args = parse_cdash_args(input_args) + + if cdash_args.coverage: + cmake_args += ["-DROCPROFILER_BUILD_CODECOV=ON"] + + if cdash_args.linter == "clang-tidy": + cmake_args += ["-DROCPROFILER_ENABLE_CLANG_TIDY=ON"] + + def get_repeat_val(_param): + _value = getattr(cdash_args, f"repeat_{_param}".replace("-", "_")) + return [f"{_param}:{_value}"] if _value is not None and _value > 1 else [] + + repeat_args = ( + get_repeat_val("until-pass") + + get_repeat_val("until-fail") + + get_repeat_val("after-timeout") + ) + ctest_args += ["--repeat"] + repeat_args if len(repeat_args) > 0 else [] + + return [cdash_args, cmake_args, ctest_args] + + +def run(*args, **kwargs): + import subprocess + + return subprocess.run(*args, **kwargs) + + +if __name__ == "__main__": + args, cmake_args, ctest_args = parse_args() + + if args.clean and os.path.exists(args.binary_dir): + if args.source_dir == args.binary_dir: + raise RuntimeError( + f"cannot clean binary directory == source directory ({args.source_dir})" + ) + + shutil.rmtree(args.binary_dir) + + if not os.path.exists(args.binary_dir): + os.makedirs(args.binary_dir) + + from textwrap import dedent + + _config = dedent(generate_custom(args, cmake_args, ctest_args)) + _script = dedent(generate_dashboard_script(args)) + + if not args.quiet: + sys.stderr.write(f"##### CTestCustom.cmake #####\n\n{_config}\n\n") + sys.stderr.write(f"##### dashboard.cmake #####\n\n{_script}\n\n") + + with open(os.path.join(args.binary_dir, 
"CTestCustom.cmake"), "w") as f: + f.write(f"{_config}\n") + + with open(os.path.join(args.binary_dir, "dashboard.cmake"), "w") as f: + f.write(f"{_script}\n") + + CTEST_CMD = which("ctest", require=True) + + dashboard_args = ["-D"] + for itr in args.stages: + dashboard_args.append(f"{args.mode}{itr}") + + try: + if not args.quiet and len(ctest_args) == 0: + ctest_args = ["--output-on-failure", "-V"] + + run( + [CTEST_CMD] + + dashboard_args + + [ + "-S", + os.path.join(args.binary_dir, "dashboard.cmake"), + ] + + ctest_args, + check=True, + ) + finally: + if "-VV" not in ctest_args and not args.quiet: + for file in glob.glob( + os.path.join(args.binary_dir, "Testing/Temporary/**"), + recursive=True, + ): + if not os.path.isfile(file): + continue + if ( + re.match( + r"Last(Start|Update|Configure|Build|Test).*\.log$", + os.path.basename(file), + ) + is None + ): + continue + + print(f"\n\n\n###### Reading {file}... ######\n\n\n") + with open(file, "r") as inpf: + fdata = inpf.read() + if "LastTest" not in file and "Coverage" not in file: + print(fdata) + oname = os.path.basename(file) + if oname.endswith(".log"): + oname += ".log" + with open(os.path.join(args.binary_dir, oname), "w") as outf: + print(f"\n\n###### Writing {oname}... ######\n\n") + outf.write(fdata) diff --git a/script/thread-sanitizer-suppr.txt b/script/thread-sanitizer-suppr.txt new file mode 100644 index 00000000..32d5847d --- /dev/null +++ b/script/thread-sanitizer-suppr.txt @@ -0,0 +1,9 @@ +# +# ThreadSanitizer suppressions file for rocprofiler project. +# + +# leaked thread +thread:libhsa-runtime64.so + +# unlock of an unlocked mutex (or by a wrong thread) +mutex:librocm_smi64.so diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 8ab325d3..c97ad943 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -20,5 +20,9 @@ # THE SOFTWARE. 
################################################################################ +if(ROCPROFILER_BUILD_CODECOV) + set(CMAKE_BUILD_TYPE "Coverage") +endif() + add_subdirectory(api) add_subdirectory(tools) diff --git a/src/api/CMakeLists.txt b/src/api/CMakeLists.txt index f994227e..c184210a 100644 --- a/src/api/CMakeLists.txt +++ b/src/api/CMakeLists.txt @@ -242,8 +242,9 @@ target_include_directories( ${ROCPROFILER_TARGET} PUBLIC $ PRIVATE ${LIB_DIR} ${ROOT_DIR} ${PROJECT_SOURCE_DIR}/include/rocprofiler) -target_link_libraries(${ROCPROFILER_TARGET} PRIVATE ${AQLPROFILE_LIB} - hsa-runtime64::hsa-runtime64 c stdc++) +target_link_libraries( + ${ROCPROFILER_TARGET} PRIVATE ${AQLPROFILE_LIB} hsa-runtime64::hsa-runtime64 c stdc++ + dl rocprofiler::build-flags rocprofiler::memcheck) get_target_property(ROCPROFILER_LIBRARY_V1_NAME ${ROCPROFILER_TARGET} NAME) get_target_property(ROCPROFILER_LIBRARY_V1_VERSION ${ROCPROFILER_TARGET} VERSION) @@ -313,47 +314,26 @@ target_include_directories( $ PRIVATE ${LIB_DIR} ${ROOT_DIR} ${CMAKE_CURRENT_BINARY_DIR} ${PROJECT_SOURCE_DIR} ${PROJECT_SOURCE_DIR}/tools) -if(ASAN) - target_compile_options(rocprofiler-v2 PRIVATE -fsanitize=address) - target_link_options( - rocprofiler-v2 PRIVATE -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/exportmap - -Wl,--no-undefined,-fsanitize=address) - target_link_libraries( - rocprofiler-v2 - PRIVATE ${AQLPROFILE_LIB} - hsa-runtime64::hsa-runtime64 - Threads::Threads - atomic - numa - asan - dl - c - stdc++ - stdc++fs - amd_comgr - dw - elf - ${PCIACCESS_LIBRARIES}) -else() - target_link_options( - rocprofiler-v2 PRIVATE -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/exportmap - -Wl,--no-undefined) - target_link_libraries( - rocprofiler-v2 - PRIVATE ${AQLPROFILE_LIB} - hsa-runtime64::hsa-runtime64 - Threads::Threads - atomic - numa - dl - c - stdc++ - stdc++fs - amd_comgr - dw - elf - ${PCIACCESS_LIBRARIES}) -endif() +target_link_libraries(rocprofiler-v2 PRIVATE rocprofiler::build-flags) +target_link_options( + rocprofiler-v2 PRIVATE -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/exportmap + -Wl,--no-undefined) +target_link_libraries( + rocprofiler-v2 + PRIVATE ${AQLPROFILE_LIB} + hsa-runtime64::hsa-runtime64 + Threads::Threads + atomic + numa + dl + c + stdc++ + stdc++fs + amd_comgr + dw + elf + ${PCIACCESS_LIBRARIES} + rocprofiler::memcheck) get_target_property(ROCPROFILER_LIBRARY_V2_NAME rocprofiler-v2 OUTPUT_NAME) get_target_property(ROCPROFILER_LIBRARY_V2_VERSION rocprofiler-v2 VERSION) @@ -372,7 +352,12 @@ add_custom_command( COMMAND ${CMAKE_COMMAND} -E create_symlink lib${ROCPROFILER_LIBRARY_V2_NAME}.so.${ROCPROFILER_LIBRARY_V2_SOVERSION} - ${CMAKE_BINARY_DIR}/lib/lib${ROCPROFILER_LIBRARY_V2_NAME}v2.so) + ${CMAKE_BINARY_DIR}/lib/lib${ROCPROFILER_LIBRARY_V2_NAME}v2.so + # Temporarily up till Jenkins side is fixed + COMMAND + ${CMAKE_COMMAND} -E create_symlink + lib/lib${ROCPROFILER_LIBRARY_V1_NAME}.so + ${CMAKE_BINARY_DIR}/lib${ROCPROFILER_LIBRARY_V1_NAME}.so) # Add custom target to trigger the create_symlink command add_custom_target(create_rocprofiler_lib DEPENDS rocprofiler-v2 ${ROCPROFILER_TARGET}) diff --git a/src/core/counters/basic/xml_parser_basic.py b/src/core/counters/basic/xml_parser_basic.py index 9ada504d..1478815d 100644 --- a/src/core/counters/basic/xml_parser_basic.py +++ b/src/core/counters/basic/xml_parser_basic.py @@ -6,228 +6,334 @@ from lxml import etree import sys -CPP_OUT='basic_counter.cpp' +CPP_OUT = "basic_counter.cpp" -if (__name__ == "__main__"): - cpp_content = '' - cpp_content += '/* 
Copyright (c) 2022 Advanced Micro Devices, Inc.\n' - cpp_content += '\n' - cpp_content += ' Permission is hereby granted, free of charge, to any person obtaining a copy\n' - cpp_content += ' of this software and associated documentation files (the \"Software\"), to deal\n' - cpp_content += ' in the Software without restriction, including without limitation the rights\n' - cpp_content += ' to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n' - cpp_content += ' copies of the Software, and to permit persons to whom the Software is\n' - cpp_content += ' furnished to do so, subject to the following conditions:\n' - cpp_content += '\n' - cpp_content += ' The above copyright notice and this permission notice shall be included in\n' - cpp_content += ' all copies or substantial portions of the Software.\n' - cpp_content += '\n' - cpp_content += ' THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n' - cpp_content += ' IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n' - cpp_content += ' FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n' - cpp_content += ' AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n' - cpp_content += ' LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n' - cpp_content += ' OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN\n' - cpp_content += ' THE SOFTWARE. */\n' - cpp_content += '\n' - cpp_content += '#include \n' - cpp_content += '#include "src/utils/helper.h"\n' - cpp_content += '\n' - cpp_content += '#include \"src/core/counters/basic/basic_counter.h\"\n' - cpp_content += '#include \"src/core/hardware/hsa_info.h\"\n' - cpp_content += '\n' - cpp_content += '#define ASSERTM(exp, msg) assert(((void)msg, exp))\n' - cpp_content += '\n' - cpp_content += '#pragma GCC diagnostic push\n' - cpp_content += '#pragma GCC diagnostic ignored \"-Wmaybe-uninitialized\"\n' - cpp_content += 'namespace Counter {\n' - cpp_content += '\n' - cpp_content += 'BasicCounter::BasicCounter(uint64_t event_id, std::string block_id,\n' - cpp_content += ' std::string name, std::string description,\n' - cpp_content += ' std::string gpu_name)\n' - cpp_content += ' : Counter(name, description, gpu_name),\n' - cpp_content += ' event_id_(event_id),\n' - cpp_content += ' block_id_(block_id) {\n' - cpp_content += ' AddCounterToCounterMap();\n' - cpp_content += '}\n' - cpp_content += '\n' - cpp_content += 'BasicCounter::~BasicCounter() {}\n' - cpp_content += '\n' - cpp_content += 'uint64_t BasicCounter::GetBasicCounterID() {\n' - cpp_content += ' return GetCounterID();\n' - cpp_content += '}\n' - cpp_content += '\n' - cpp_content += 'uint64_t BasicCounter::GetEventId() { return event_id_; }\n' - cpp_content += 'std::string BasicCounter::GetBlockId() { return block_id_; }\n' - cpp_content += 'std::string BasicCounter::GetName() { return Counter::GetName(); }\n' - cpp_content += '\n' - cpp_content += 'bool BasicCounter::GetValue(uint64_t* value, int64_t instance_id = -1) {\n' - cpp_content += ' Agent::CounterHardwareInfo* agent_info =\n' - cpp_content += ' reinterpret_cast(counter_hw_info);\n' - cpp_content += ' if ((agent_info->getNumInstances() > 1 && instance_id == -1) ||\n' - cpp_content += ' instance_id < -1 || instance_id >= agent_info->getNumInstances())\n' - cpp_content += ' return false;\n' - cpp_content += ' if (instance_id == -1) *value = instances_values_[0];\n' - cpp_content += ' *value = instances_values_[instance_id];\n' - 
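# FIXME: in the emitted GetValue(uint64_t*, int64_t) above, the instance_id == -1
# branch assigns instances_values_[0] and is then unconditionally overwritten by
# instances_values_[instance_id] (i.e. index -1); unlike the single-argument
# overload below, it most likely needs an early return (or an else) after the
# instance_id == -1 case.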
cpp_content += ' return true;\n' - cpp_content += '}\n' - cpp_content += '\n' - cpp_content += 'uint64_t BasicCounter::GetValue(int64_t instance_id) {\n' - cpp_content += ' Agent::CounterHardwareInfo* agent_info =\n' - cpp_content += ' reinterpret_cast(counter_hw_info);\n' - cpp_content += ' if ((agent_info->getNumInstances() > 1 && instance_id == -1) ||\n' - cpp_content += ' instance_id < -1 || instance_id >= agent_info->getNumInstances())\n' - cpp_content += ' throw(std::string(\"Error: Wrong number of instances (\") +\n' - cpp_content += ' std::to_string(agent_info->getNumInstances()) +\n' - cpp_content += ' \") OR Instance ID is less than 0 \");\n' - cpp_content += ' if (instance_id == -1) return instances_values_[0];\n' - cpp_content += ' return instances_values_[instance_id];\n' - cpp_content += '}\n' - cpp_content += '\n' - cpp_content += 'uint64_t BasicCounter::avr(int64_t instances_count) {\n' - cpp_content += ' Agent::CounterHardwareInfo* agent_info =\n' - cpp_content += ' reinterpret_cast(counter_hw_info);\n' - cpp_content += ' if (agent_info->getNumInstances() > instances_count)\n' - cpp_content += ' throw(std::string(\"Error: Number of instances (\") +\n' - cpp_content += ' std::to_string(agent_info->getNumInstances()) +\n' - cpp_content += ' \") is greater than the given instance count(\" +\n' - cpp_content += ' std::to_string(instances_count) + \")\");\n' - cpp_content += ' uint64_t result = 0;\n' - cpp_content += ' int64_t instance_id;\n' - cpp_content += ' for (instance_id = 0; instance_id < instances_count; instance_id++) {\n' - cpp_content += ' uint64_t value;\n' - cpp_content += ' if (GetValue(&value, instance_id)) result += value;\n' - cpp_content += ' }\n' - cpp_content += ' return result / instances_count;\n' - cpp_content += '}\n' - cpp_content += 'uint64_t BasicCounter::max(int64_t instances_count) {\n' - cpp_content += ' uint64_t result = 0;\n' - cpp_content += ' int64_t instance_id;\n' - cpp_content += ' for (instance_id = 0; instance_id < instances_count; instance_id++) {\n' - cpp_content += ' uint64_t value;\n' - cpp_content += ' if (GetValue(&value, instance_id) && result < value) result = value;\n' - cpp_content += ' }\n' - cpp_content += ' return result;\n' - cpp_content += '}\n' - cpp_content += 'uint64_t BasicCounter::min(int64_t instances_count) {\n' - cpp_content += ' int64_t instance_id;\n' - cpp_content += ' uint64_t result = 0;\n' - cpp_content += ' for (instance_id = 0; instance_id < instances_count; instance_id++) {\n' - cpp_content += ' uint64_t value;\n' - cpp_content += ' if (GetValue(&value, instance_id) && result > value) result = value;\n' - cpp_content += ' }\n' - cpp_content += ' return result;\n' - cpp_content += '}\n' - cpp_content += 'uint64_t BasicCounter::sum(int64_t instances_count) {\n' - cpp_content += ' int64_t instance_id;\n' - cpp_content += ' uint64_t result = 0;\n' - cpp_content += ' for (instance_id = 0; instance_id < instances_count; instance_id++) {\n' - cpp_content += ' uint64_t value;\n' - cpp_content += ' if (GetValue(&value, instance_id)) result += value;\n' - cpp_content += ' }\n' - cpp_content += ' return result;\n' - cpp_content += '}\n' - cpp_content += '\n' - cpp_content += 'uint64_t operator+(BasicCounter counter, const uint64_t number) {\n' - cpp_content += ' [[maybe_unused]] uint64_t value = 0;\n' - cpp_content += ' ASSERTM(counter.GetValue(&value), \"Error: Counter has no value!\");\n' - cpp_content += ' return number + value;\n' - cpp_content += '}\n' - cpp_content += 'uint64_t operator-(BasicCounter counter, 
const uint64_t number) {\n' - cpp_content += ' [[maybe_unused]] uint64_t value = 0;\n' - cpp_content += ' ASSERTM(counter.GetValue(&value), \"Error: Counter has no value!\");\n' - cpp_content += ' return number - value;\n' - cpp_content += '}\n' - cpp_content += 'uint64_t operator*(BasicCounter counter, const uint64_t number) {\n' - cpp_content += ' [[maybe_unused]] uint64_t value = 0;\n' - cpp_content += ' ASSERTM(counter.GetValue(&value), \"Error: Counter has no value!\");\n' - cpp_content += ' return number * value;\n' - cpp_content += '}\n' - cpp_content += 'uint64_t operator/(BasicCounter counter, const uint64_t number) {\n' - cpp_content += ' [[maybe_unused]] uint64_t value = 0;\n' - cpp_content += ' ASSERTM(counter.GetValue(&value), \"Error: Counter has no value!\");\n' - cpp_content += ' return number / value;\n' - cpp_content += '}\n' - cpp_content += 'uint64_t operator^(BasicCounter counter, const uint64_t number) {\n' - cpp_content += ' [[maybe_unused]] uint64_t value = 0;\n' - cpp_content += ' ASSERTM(counter.GetValue(&value), \"Error: Counter has no value!\");\n' - cpp_content += ' return number ^ value;\n' - cpp_content += '}\n' - cpp_content += '\n' - cpp_content += 'uint64_t operator+(BasicCounter counter1, BasicCounter counter2) {\n' - cpp_content += ' [[maybe_unused]] uint64_t value1 = 0;\n' - cpp_content += ' ASSERTM(counter1.GetValue(&value1), \"Error: Counter has no value!\");\n' - cpp_content += ' [[maybe_unused]] uint64_t value2 = 0;\n' - cpp_content += ' ASSERTM(counter2.GetValue(&value2), \"Error: Counter has no value!\");\n' - cpp_content += ' return value1 + value2;\n' - cpp_content += '}\n' - cpp_content += 'uint64_t operator-(BasicCounter counter1, BasicCounter counter2) {\n' - cpp_content += ' [[maybe_unused]] uint64_t value1 = 0;\n' - cpp_content += ' ASSERTM(counter1.GetValue(&value1), \"Error: Counter has no value!\");\n' - cpp_content += ' [[maybe_unused]] uint64_t value2 = 0;\n' - cpp_content += ' ASSERTM(counter2.GetValue(&value2), \"Error: Counter has no value!\");\n' - cpp_content += ' return value1 - value2;\n' - cpp_content += '}\n' - cpp_content += 'uint64_t operator*(BasicCounter counter1, BasicCounter counter2) {\n' - cpp_content += ' [[maybe_unused]] uint64_t value1 = 0;\n' - cpp_content += ' ASSERTM(counter1.GetValue(&value1), \"Error: Counter has no value!\");\n' - cpp_content += ' [[maybe_unused]] uint64_t value2 = 0;\n' - cpp_content += ' ASSERTM(counter2.GetValue(&value2), \"Error: Counter has no value!\");\n' - cpp_content += ' return value1 * value2;\n' - cpp_content += '}\n' - cpp_content += 'uint64_t operator/(BasicCounter counter1, BasicCounter counter2) {\n' - cpp_content += ' [[maybe_unused]] uint64_t value1 = 0;\n' - cpp_content += ' ASSERTM(counter1.GetValue( & value1), \"Error: Counter has no value!\");\n' - cpp_content += ' [[maybe_unused]] uint64_t value2 = 0;\n' - cpp_content += ' ASSERTM(counter2.GetValue( & value2), \"Error: Counter has no value!\");\n' - cpp_content += ' return value1 / value2;\n' - cpp_content += '}\n' - cpp_content += 'uint64_t operator^(BasicCounter counter1, BasicCounter counter2) {\n' - cpp_content += ' [[maybe_unused]] uint64_t value1 = 0;\n' - cpp_content += ' ASSERTM(counter1.GetValue(&value1), \"Error: Counter has no value!\");\n' - cpp_content += ' [[maybe_unused]] uint64_t value2 = 0;\n' - cpp_content += ' ASSERTM(counter2.GetValue(&value2), \"Error: Counter has no value!\");\n' - cpp_content += ' return value1 ^ value2;\n' - cpp_content += '}\n' - cpp_content += '\n' - cpp_content += 'static 
std::map basic_counters;\n' - cpp_content += '\n' - cpp_content += 'BasicCounter* GetGeneratedBasicCounter(uint64_t id) {\n' - cpp_content += ' return &basic_counters.at(id);\n' - cpp_content += '}\n' - cpp_content += '\n' - cpp_content += 'void ClearBasicCounters() {\n' - cpp_content += ' basic_counters.clear();\n' - cpp_content += '}\n' - cpp_content += '\n' - cpp_content += '/**\n' - cpp_content += ' * @brief Basic Counters\n' - cpp_content += ' *\n' - cpp_content += ' * @{\n' - cpp_content += ' */\n' - cpp_content += 'uint64_t GetBasicCounter(const char* name, const char* gpu_name) {\n' - cpp_content += ' std::string gpu;\n' - parser=etree.XMLParser(recover=True, encoding='utf-8') - xml_file=ET.parse(sys.argv[1] + '/gfx_metrics.xml', parser=parser) - root=xml_file.getroot() - for gpu in root: - cpp_content += "\n\t/**\n\t * @brief Basic " + gpu.tag + " counters\n\t *\n\t * @{\n\t */\n" - cpp_content += "\tgpu = \"" + gpu.tag + "\";\n\n" - cpp_content += "\tif (strncmp(gpu_name, gpu.c_str(), gpu.length())==0) {\n" - for child in gpu: - cpp_content += "\t/**\n\t * Basic Counter: " + child.attrib['name'] + "\n\t *\n\t * " + child.attrib['descr'] + "\n\t */\n\tif (strcmp(name, \"" + child.attrib['name'] + "\")==0) {\n\t\tbasic_counters.emplace(" + child.attrib['event'] + ", BasicCounter{" + child.attrib['event'] + ", \"" + child.attrib['block'] + "\", \"" + child.attrib['name'] + "\", \"" + child.attrib['descr'] + "\", \"" + gpu.tag + "\"});\n\t\treturn " + child.attrib['event'] + ";\n\t}\n" - cpp_content += "\t}\n\n\t/**\n\t * @}\n\t */\n" - cpp_content += ' throw(\"Couldn\'t find the required Counter name for the mentioned GPU!\");\n' - cpp_content += ' return 0;\n' - cpp_content += '}\n' - cpp_content += '/**\n' - cpp_content += ' * @}\n' - cpp_content += ' */\n' - cpp_content += '\n' - cpp_content += '} // namespace Counter\n' - cpp_content += '\n' - cpp_content += '#pragma GCC diagnostic pop\n' - print ('Generating "' + sys.argv[2] + '"') - f = open(sys.argv[2], 'w') - f.write(cpp_content[:-1]) - f.close() +if __name__ == "__main__": + cpp_content = "" + cpp_content += "/* Copyright (c) 2022 Advanced Micro Devices, Inc.\n" + cpp_content += "\n" + cpp_content += ( + " Permission is hereby granted, free of charge, to any person obtaining a copy\n" + ) + cpp_content += ( + ' of this software and associated documentation files (the "Software"), to deal\n' + ) + cpp_content += ( + " in the Software without restriction, including without limitation the rights\n" + ) + cpp_content += ( + " to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n" + ) + cpp_content += ( + " copies of the Software, and to permit persons to whom the Software is\n" + ) + cpp_content += " furnished to do so, subject to the following conditions:\n" + cpp_content += "\n" + cpp_content += ( + " The above copyright notice and this permission notice shall be included in\n" + ) + cpp_content += " all copies or substantial portions of the Software.\n" + cpp_content += "\n" + cpp_content += ( + ' THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n' + ) + cpp_content += ( + " IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n" + ) + cpp_content += ( + " FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE\n" + ) + cpp_content += ( + " AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n" + ) + cpp_content += ( + " LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n" + ) + cpp_content += ( + " OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN\n" + ) + cpp_content += " THE SOFTWARE. */\n" + cpp_content += "\n" + cpp_content += "#include \n" + cpp_content += '#include "src/utils/helper.h"\n' + cpp_content += "\n" + cpp_content += '#include "src/core/counters/basic/basic_counter.h"\n' + cpp_content += '#include "src/core/hardware/hsa_info.h"\n' + cpp_content += "\n" + cpp_content += "#define ASSERTM(exp, msg) assert(((void)msg, exp))\n" + cpp_content += "\n" + cpp_content += "#pragma GCC diagnostic push\n" + cpp_content += '#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"\n' + cpp_content += "namespace Counter {\n" + cpp_content += "\n" + cpp_content += "BasicCounter::BasicCounter(uint64_t event_id, std::string block_id,\n" + cpp_content += ( + " std::string name, std::string description,\n" + ) + cpp_content += " std::string gpu_name)\n" + cpp_content += " : Counter(name, description, gpu_name),\n" + cpp_content += " event_id_(event_id),\n" + cpp_content += " block_id_(block_id) {\n" + cpp_content += " AddCounterToCounterMap();\n" + cpp_content += "}\n" + cpp_content += "\n" + cpp_content += "BasicCounter::~BasicCounter() {}\n" + cpp_content += "\n" + cpp_content += "uint64_t BasicCounter::GetBasicCounterID() {\n" + cpp_content += " return GetCounterID();\n" + cpp_content += "}\n" + cpp_content += "\n" + cpp_content += "uint64_t BasicCounter::GetEventId() { return event_id_; }\n" + cpp_content += "std::string BasicCounter::GetBlockId() { return block_id_; }\n" + cpp_content += "std::string BasicCounter::GetName() { return Counter::GetName(); }\n" + cpp_content += "\n" + cpp_content += ( + "bool BasicCounter::GetValue(uint64_t* value, int64_t instance_id = -1) {\n" + ) + cpp_content += " Agent::CounterHardwareInfo* agent_info =\n" + cpp_content += ( + " reinterpret_cast(counter_hw_info);\n" + ) + cpp_content += " if ((agent_info->getNumInstances() > 1 && instance_id == -1) ||\n" + cpp_content += ( + " instance_id < -1 || instance_id >= agent_info->getNumInstances())\n" + ) + cpp_content += " return false;\n" + cpp_content += " if (instance_id == -1) *value = instances_values_[0];\n" + cpp_content += " *value = instances_values_[instance_id];\n" + cpp_content += " return true;\n" + cpp_content += "}\n" + cpp_content += "\n" + cpp_content += "uint64_t BasicCounter::GetValue(int64_t instance_id) {\n" + cpp_content += " Agent::CounterHardwareInfo* agent_info =\n" + cpp_content += ( + " reinterpret_cast(counter_hw_info);\n" + ) + cpp_content += " if ((agent_info->getNumInstances() > 1 && instance_id == -1) ||\n" + cpp_content += ( + " instance_id < -1 || instance_id >= agent_info->getNumInstances())\n" + ) + cpp_content += ' throw(std::string("Error: Wrong number of instances (") +\n' + cpp_content += " std::to_string(agent_info->getNumInstances()) +\n" + cpp_content += ' ") OR Instance ID is less than 0 ");\n' + cpp_content += " if (instance_id == -1) return instances_values_[0];\n" + cpp_content += " return instances_values_[instance_id];\n" + cpp_content += "}\n" + cpp_content += "\n" + cpp_content += "uint64_t BasicCounter::avr(int64_t instances_count) {\n" + cpp_content += " Agent::CounterHardwareInfo* agent_info =\n" + cpp_content += ( + " reinterpret_cast(counter_hw_info);\n" + ) + cpp_content 
+= " if (agent_info->getNumInstances() > instances_count)\n" + cpp_content += ' throw(std::string("Error: Number of instances (") +\n' + cpp_content += " std::to_string(agent_info->getNumInstances()) +\n" + cpp_content += ' ") is greater than the given instance count(" +\n' + cpp_content += ' std::to_string(instances_count) + ")");\n' + cpp_content += " uint64_t result = 0;\n" + cpp_content += " int64_t instance_id;\n" + cpp_content += ( + " for (instance_id = 0; instance_id < instances_count; instance_id++) {\n" + ) + cpp_content += " uint64_t value;\n" + cpp_content += " if (GetValue(&value, instance_id)) result += value;\n" + cpp_content += " }\n" + cpp_content += " return result / instances_count;\n" + cpp_content += "}\n" + cpp_content += "uint64_t BasicCounter::max(int64_t instances_count) {\n" + cpp_content += " uint64_t result = 0;\n" + cpp_content += " int64_t instance_id;\n" + cpp_content += ( + " for (instance_id = 0; instance_id < instances_count; instance_id++) {\n" + ) + cpp_content += " uint64_t value;\n" + cpp_content += ( + " if (GetValue(&value, instance_id) && result < value) result = value;\n" + ) + cpp_content += " }\n" + cpp_content += " return result;\n" + cpp_content += "}\n" + cpp_content += "uint64_t BasicCounter::min(int64_t instances_count) {\n" + cpp_content += " int64_t instance_id;\n" + cpp_content += " uint64_t result = 0;\n" + cpp_content += ( + " for (instance_id = 0; instance_id < instances_count; instance_id++) {\n" + ) + cpp_content += " uint64_t value;\n" + cpp_content += ( + " if (GetValue(&value, instance_id) && result > value) result = value;\n" + ) + cpp_content += " }\n" + cpp_content += " return result;\n" + cpp_content += "}\n" + cpp_content += "uint64_t BasicCounter::sum(int64_t instances_count) {\n" + cpp_content += " int64_t instance_id;\n" + cpp_content += " uint64_t result = 0;\n" + cpp_content += ( + " for (instance_id = 0; instance_id < instances_count; instance_id++) {\n" + ) + cpp_content += " uint64_t value;\n" + cpp_content += " if (GetValue(&value, instance_id)) result += value;\n" + cpp_content += " }\n" + cpp_content += " return result;\n" + cpp_content += "}\n" + cpp_content += "\n" + cpp_content += "uint64_t operator+(BasicCounter counter, const uint64_t number) {\n" + cpp_content += " [[maybe_unused]] uint64_t value = 0;\n" + cpp_content += ( + ' ASSERTM(counter.GetValue(&value), "Error: Counter has no value!");\n' + ) + cpp_content += " return number + value;\n" + cpp_content += "}\n" + cpp_content += "uint64_t operator-(BasicCounter counter, const uint64_t number) {\n" + cpp_content += " [[maybe_unused]] uint64_t value = 0;\n" + cpp_content += ( + ' ASSERTM(counter.GetValue(&value), "Error: Counter has no value!");\n' + ) + cpp_content += " return number - value;\n" + cpp_content += "}\n" + cpp_content += "uint64_t operator*(BasicCounter counter, const uint64_t number) {\n" + cpp_content += " [[maybe_unused]] uint64_t value = 0;\n" + cpp_content += ( + ' ASSERTM(counter.GetValue(&value), "Error: Counter has no value!");\n' + ) + cpp_content += " return number * value;\n" + cpp_content += "}\n" + cpp_content += "uint64_t operator/(BasicCounter counter, const uint64_t number) {\n" + cpp_content += " [[maybe_unused]] uint64_t value = 0;\n" + cpp_content += ( + ' ASSERTM(counter.GetValue(&value), "Error: Counter has no value!");\n' + ) + cpp_content += " return number / value;\n" + cpp_content += "}\n" + cpp_content += "uint64_t operator^(BasicCounter counter, const uint64_t number) {\n" + cpp_content += " [[maybe_unused]] 
uint64_t value = 0;\n" + cpp_content += ( + ' ASSERTM(counter.GetValue(&value), "Error: Counter has no value!");\n' + ) + cpp_content += " return number ^ value;\n" + cpp_content += "}\n" + cpp_content += "\n" + cpp_content += "uint64_t operator+(BasicCounter counter1, BasicCounter counter2) {\n" + cpp_content += " [[maybe_unused]] uint64_t value1 = 0;\n" + cpp_content += ( + ' ASSERTM(counter1.GetValue(&value1), "Error: Counter has no value!");\n' + ) + cpp_content += " [[maybe_unused]] uint64_t value2 = 0;\n" + cpp_content += ( + ' ASSERTM(counter2.GetValue(&value2), "Error: Counter has no value!");\n' + ) + cpp_content += " return value1 + value2;\n" + cpp_content += "}\n" + cpp_content += "uint64_t operator-(BasicCounter counter1, BasicCounter counter2) {\n" + cpp_content += " [[maybe_unused]] uint64_t value1 = 0;\n" + cpp_content += ( + ' ASSERTM(counter1.GetValue(&value1), "Error: Counter has no value!");\n' + ) + cpp_content += " [[maybe_unused]] uint64_t value2 = 0;\n" + cpp_content += ( + ' ASSERTM(counter2.GetValue(&value2), "Error: Counter has no value!");\n' + ) + cpp_content += " return value1 - value2;\n" + cpp_content += "}\n" + cpp_content += "uint64_t operator*(BasicCounter counter1, BasicCounter counter2) {\n" + cpp_content += " [[maybe_unused]] uint64_t value1 = 0;\n" + cpp_content += ( + ' ASSERTM(counter1.GetValue(&value1), "Error: Counter has no value!");\n' + ) + cpp_content += " [[maybe_unused]] uint64_t value2 = 0;\n" + cpp_content += ( + ' ASSERTM(counter2.GetValue(&value2), "Error: Counter has no value!");\n' + ) + cpp_content += " return value1 * value2;\n" + cpp_content += "}\n" + cpp_content += "uint64_t operator/(BasicCounter counter1, BasicCounter counter2) {\n" + cpp_content += " [[maybe_unused]] uint64_t value1 = 0;\n" + cpp_content += ( + ' ASSERTM(counter1.GetValue( & value1), "Error: Counter has no value!");\n' + ) + cpp_content += " [[maybe_unused]] uint64_t value2 = 0;\n" + cpp_content += ( + ' ASSERTM(counter2.GetValue( & value2), "Error: Counter has no value!");\n' + ) + cpp_content += " return value1 / value2;\n" + cpp_content += "}\n" + cpp_content += "uint64_t operator^(BasicCounter counter1, BasicCounter counter2) {\n" + cpp_content += " [[maybe_unused]] uint64_t value1 = 0;\n" + cpp_content += ( + ' ASSERTM(counter1.GetValue(&value1), "Error: Counter has no value!");\n' + ) + cpp_content += " [[maybe_unused]] uint64_t value2 = 0;\n" + cpp_content += ( + ' ASSERTM(counter2.GetValue(&value2), "Error: Counter has no value!");\n' + ) + cpp_content += " return value1 ^ value2;\n" + cpp_content += "}\n" + cpp_content += "\n" + cpp_content += "static std::map basic_counters;\n" + cpp_content += "\n" + cpp_content += "BasicCounter* GetGeneratedBasicCounter(uint64_t id) {\n" + cpp_content += " return &basic_counters.at(id);\n" + cpp_content += "}\n" + cpp_content += "\n" + cpp_content += "void ClearBasicCounters() {\n" + cpp_content += " basic_counters.clear();\n" + cpp_content += "}\n" + cpp_content += "\n" + cpp_content += "/**\n" + cpp_content += " * @brief Basic Counters\n" + cpp_content += " *\n" + cpp_content += " * @{\n" + cpp_content += " */\n" + cpp_content += "uint64_t GetBasicCounter(const char* name, const char* gpu_name) {\n" + cpp_content += " std::string gpu;\n" + parser = etree.XMLParser(recover=True, encoding="utf-8") + xml_file = ET.parse(sys.argv[1] + "/gfx_metrics.xml", parser=parser) + root = xml_file.getroot() + for gpu in root: + cpp_content += ( + "\n\t/**\n\t * @brief Basic " + gpu.tag + " counters\n\t *\n\t * @{\n\t */\n" + ) + 
cpp_content += '\tgpu = "' + gpu.tag + '";\n\n' + cpp_content += "\tif (strncmp(gpu_name, gpu.c_str(), gpu.length())==0) {\n" + for child in gpu: + cpp_content += ( + "\t/**\n\t * Basic Counter: " + + child.attrib["name"] + + "\n\t *\n\t * " + + child.attrib["descr"] + + '\n\t */\n\tif (strcmp(name, "' + + child.attrib["name"] + + '")==0) {\n\t\tbasic_counters.emplace(' + + child.attrib["event"] + + ", BasicCounter{" + + child.attrib["event"] + + ', "' + + child.attrib["block"] + + '", "' + + child.attrib["name"] + + '", "' + + child.attrib["descr"] + + '", "' + + gpu.tag + + '"});\n\t\treturn ' + + child.attrib["event"] + + ";\n\t}\n" + ) + cpp_content += "\t}\n\n\t/**\n\t * @}\n\t */\n" + cpp_content += ( + ' throw("Couldn\'t find the required Counter name for the mentioned GPU!");\n' + ) + cpp_content += " return 0;\n" + cpp_content += "}\n" + cpp_content += "/**\n" + cpp_content += " * @}\n" + cpp_content += " */\n" + cpp_content += "\n" + cpp_content += "} // namespace Counter\n" + cpp_content += "\n" + cpp_content += "#pragma GCC diagnostic pop\n" + print('Generating "' + sys.argv[2] + '"') + f = open(sys.argv[2], "w") + f.write(cpp_content[:-1]) + f.close() diff --git a/src/core/counters/derived/xml_parser_derived.py b/src/core/counters/derived/xml_parser_derived.py index 07f5fe5b..be9e0283 100644 --- a/src/core/counters/derived/xml_parser_derived.py +++ b/src/core/counters/derived/xml_parser_derived.py @@ -7,82 +7,125 @@ import ast import sys -ops = {'Div': '/', 'Mult': '*', 'Add': '+', 'Sub': '-'} -calls = {'avr', 'max', 'min', 'sum'} +ops = {"Div": "/", "Mult": "*", "Add": "+", "Sub": "-"} +calls = {"avr", "max", "min", "sum"} + def parse_expr(gpu_tag, data): - global exprs_counters - global exprs_counters_init - global expr_print - global counter_count - global counters_dictionary - expr_queue = deque() - for line in data.split('\n'): - if 'Constant' in line: - number = line.split('(')[1].split(')')[0] - expr_queue.append('(uint64_t)' + number) - if 'Name' in line: - name = line.split('\'')[1] - if name in calls: - expr_queue.append(name) - else: - if not name in exprs_counters: - exprs_counters += "getGeneratedBasicCounter(" + name + "_id), " - exprs_counters_init += "\n\t\tuint64_t " + name + "_id = getBasicCounter(\"" + name + "\", \"" + gpu_tag + "\");" - counters_dictionary[name] = counter_count - counter_count+=1 - expr_queue.append("counter.getBasicCounterFromDerived(" + str(counters_dictionary[name]) + ")") - op = line.split('(')[0] - if op in ops: - expr_queue.append(ops[op]) - expr_print += "\n\t\t\t\treturn " - i = 0 - for element in expr_queue: - if element in calls: - i = 1 - call = element - elif i == 1: - expr_print += element + "." 
+ call + "(" - call = "" - i = 2 - elif i == 2: - expr_print += element + ")" - i = 0 - else: - expr_print += element - if "counter.getBasicCounterFromDerived" == element[0:34]: - expr_print += "->getValue()" + global exprs_counters + global exprs_counters_init + global expr_print + global counter_count + global counters_dictionary + expr_queue = deque() + for line in data.split("\n"): + if "Constant" in line: + number = line.split("(")[1].split(")")[0] + expr_queue.append("(uint64_t)" + number) + if "Name" in line: + name = line.split("'")[1] + if name in calls: + expr_queue.append(name) + else: + if not name in exprs_counters: + exprs_counters += "getGeneratedBasicCounter(" + name + "_id), " + exprs_counters_init += ( + "\n\t\tuint64_t " + + name + + '_id = getBasicCounter("' + + name + + '", "' + + gpu_tag + + '");' + ) + counters_dictionary[name] = counter_count + counter_count += 1 + expr_queue.append( + "counter.getBasicCounterFromDerived(" + + str(counters_dictionary[name]) + + ")" + ) + op = line.split("(")[0] + if op in ops: + expr_queue.append(ops[op]) + expr_print += "\n\t\t\t\treturn " + i = 0 + for element in expr_queue: + if element in calls: + i = 1 + call = element + elif i == 1: + expr_print += element + "." + call + "(" + call = "" + i = 2 + elif i == 2: + expr_print += element + ")" + i = 0 + else: + expr_print += element + if "counter.getBasicCounterFromDerived" == element[0:34]: + expr_print += "->getValue()" -if (__name__ == "__main__"): - global exprs_counters - global exprs_counters_init - global expr_print - global counter_count - parser = etree.XMLParser(recover=True, encoding='utf-8') - xml_file = ET.parse(sys.argv[1] + '/metrics.xml', parser=parser) - root = xml_file.getroot() - print( - "uint64_t getDerivedCounter(const char* name, const char* gpu_name) {") - for gpu in root: - print("\n\t/**\n\t * @brief Derived " + gpu.tag + " counters\n\t *\n\t * @{\n\t */") - print("\tif (strcmp(gpu_name, \"" + gpu.tag + "\")==0) {") - for child in gpu: - exprs_counters = "" - exprs_counters_init = "" - expr_print = "" - counter_count = 0 - counters_dictionary = {} - parse_expr(gpu.tag.split("_")[0], ast.dump(ast.parse( - child.attrib['expr'], mode='eval'), annotate_fields=False, include_attributes=False, indent=0)) - print("\t/**\n\t * Derived Counter: " + child.attrib['name'] + "\n\t *\n\t * " + child.attrib['descr'] + "\n\t */\n\tif (strcmp(name, \"" + - child.attrib['name'] + "\")==0) {" + exprs_counters_init + "\n\t\tDerivedCounter counter = DerivedCounter(\"" + child.attrib['name'] + - "\", \"" + child.attrib['descr'] + "\", \"" + gpu.tag.split("_")[0] + "\");") - exprs_counter_count = 0 - for expr_counter in exprs_counters[0:-2].split(", "): - print("\n\t\tcounter.addBasicCounter(" + str(exprs_counter_count) + ", " + expr_counter + ");") - exprs_counter_count += 1 - # print("\n\t\tcounter.evaluate_metric = [counter]() {" + expr_print + ";\n\t\t\t};") - print("\n\t\tderived_counters.emplace(counter.getMetricId(), counter);\n\t\treturn counter.getMetricId();\n\t}") - print("\t}\n\n\t/**\n\t * @}\n\t */") - print("\n\treturn 0;\n}\n") +if __name__ == "__main__": + global exprs_counters + global exprs_counters_init + global expr_print + global counter_count + parser = etree.XMLParser(recover=True, encoding="utf-8") + xml_file = ET.parse(sys.argv[1] + "/metrics.xml", parser=parser) + root = xml_file.getroot() + print("uint64_t getDerivedCounter(const char* name, const char* gpu_name) {") + for gpu in root: + print( + "\n\t/**\n\t * @brief Derived " + gpu.tag + " 
counters\n\t *\n\t * @{\n\t */" + ) + print('\tif (strcmp(gpu_name, "' + gpu.tag + '")==0) {') + for child in gpu: + exprs_counters = "" + exprs_counters_init = "" + expr_print = "" + counter_count = 0 + counters_dictionary = {} + parse_expr( + gpu.tag.split("_")[0], + ast.dump( + ast.parse(child.attrib["expr"], mode="eval"), + annotate_fields=False, + include_attributes=False, + indent=0, + ), + ) + print( + "\t/**\n\t * Derived Counter: " + + child.attrib["name"] + + "\n\t *\n\t * " + + child.attrib["descr"] + + '\n\t */\n\tif (strcmp(name, "' + + child.attrib["name"] + + '")==0) {' + + exprs_counters_init + + '\n\t\tDerivedCounter counter = DerivedCounter("' + + child.attrib["name"] + + '", "' + + child.attrib["descr"] + + '", "' + + gpu.tag.split("_")[0] + + '");' + ) + exprs_counter_count = 0 + for expr_counter in exprs_counters[0:-2].split(", "): + print( + "\n\t\tcounter.addBasicCounter(" + + str(exprs_counter_count) + + ", " + + expr_counter + + ");" + ) + exprs_counter_count += 1 + # print("\n\t\tcounter.evaluate_metric = [counter]() {" + expr_print + ";\n\t\t\t};") + print( + "\n\t\tderived_counters.emplace(counter.getMetricId(), counter);\n\t\treturn counter.getMetricId();\n\t}" + ) + print("\t}\n\n\t/**\n\t * @}\n\t */") + print("\n\treturn 0;\n}\n") diff --git a/src/core/counters/metrics/eval_metrics.cpp b/src/core/counters/metrics/eval_metrics.cpp index d8aebcfa..1a1abdd7 100644 --- a/src/core/counters/metrics/eval_metrics.cpp +++ b/src/core/counters/metrics/eval_metrics.cpp @@ -46,7 +46,7 @@ hsa_status_t pmcCallback(hsa_ven_amd_aqlprofile_info_type_t info_type, if (info_type == HSA_VEN_AMD_AQLPROFILE_INFO_PMC_DATA) { if (IsEventMatch(info_data->pmc_data.event, (*data_it)->event)) { uint32_t xcc_index = floor(passed_data->index / passed_data->single_xcc_buff_size); - (*data_it)->xcc_vals[xcc_index] += + (*data_it)->xcc_vals.at(xcc_index) += info_data->pmc_data.result; // stores event result from each xcc separately (*data_it)->val_double += info_data->pmc_data.result; // stores accumulated event result from all xccs diff --git a/src/core/hsa/hsa_support.cpp b/src/core/hsa/hsa_support.cpp index 62c20abd..ab676543 100644 --- a/src/core/hsa/hsa_support.cpp +++ b/src/core/hsa/hsa_support.cpp @@ -41,6 +41,7 @@ #include #include #include +#include #include "core/hardware/hsa_info.h" #include "src/core/session/tracer/src/correlation_id.h" @@ -68,15 +69,16 @@ hsa_status_t hsa_executable_iteration_callback(hsa_executable_t executable, hsa_ symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &name_length); // TODO(aelwazir): to be removed if the HSA fixed the issue of corrupted // names overflowing the length given + name_length = std::min(name_length, PATH_MAX); if (name_length > 1) { if (!(*static_cast(args))) { - char name[name_length + 1]; + auto name = std::vector(name_length + 1, '\0'); uint64_t kernel_object; hsasupport_singleton.GetCoreApiTable().hsa_executable_symbol_get_info_fn( - symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME, name); + symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME, name.data()); hsasupport_singleton.GetCoreApiTable().hsa_executable_symbol_get_info_fn( symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel_object); - std::string kernel_name = std::string(name).substr(0, name_length); + auto kernel_name = std::string{name.data()}.substr(0, name_length); rocprofiler::AddKernelName(kernel_object, kernel_name); } else { uint64_t kernel_object; @@ -751,8 +753,9 @@ void HSASupport_Singleton::SetHSALoaderApi() { const Agent::DeviceInfo& HSAAgentInfo::GetDeviceInfo() const { - if 
(type_ == HSA_DEVICE_TYPE_GPU) + if (type_ == HSA_DEVICE_TYPE_GPU) { return device_info_; + } assert("Attempting to read deviceInfo for a CPU agent"); } diff --git a/src/core/hsa/packets/packets_generator.cpp b/src/core/hsa/packets/packets_generator.cpp index 20cb11e1..85712153 100644 --- a/src/core/hsa/packets/packets_generator.cpp +++ b/src/core/hsa/packets/packets_generator.cpp @@ -82,7 +82,8 @@ static hsa_status_t FindGlobalPool(hsa_amd_memory_pool_t pool, void* data, bool if (nullptr == data) { return HSA_STATUS_ERROR_INVALID_ARGUMENT; } - rocprofiler::HSASupport_Singleton& hsasupport_singleton = rocprofiler::HSASupport_Singleton::GetInstance(); + rocprofiler::HSASupport_Singleton& hsasupport_singleton = + rocprofiler::HSASupport_Singleton::GetInstance(); err = hsasupport_singleton.GetAmdExtTable().hsa_amd_memory_pool_get_info_fn( pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment); ASSERTM(err != HSA_STATUS_ERROR, "hsa_amd_memory_pool_get_info"); @@ -115,10 +116,10 @@ hsa_status_t FindKernArgPool(hsa_amd_memory_pool_t pool, void* data) { } void InitializePools(hsa_agent_t cpu_agent, rocprofiler::HSAAgentInfo* agent_info) { - rocprofiler::HSASupport_Singleton& hsasupport_singleton = rocprofiler::HSASupport_Singleton::GetInstance(); - hsa_status_t status = - hsasupport_singleton.GetAmdExtTable().hsa_amd_agent_iterate_memory_pools_fn( - cpu_agent, FindStandardPool, &(agent_info->cpu_pool_)); + rocprofiler::HSASupport_Singleton& hsasupport_singleton = + rocprofiler::HSASupport_Singleton::GetInstance(); + hsa_status_t status = hsasupport_singleton.GetAmdExtTable().hsa_amd_agent_iterate_memory_pools_fn( + cpu_agent, FindStandardPool, &(agent_info->cpu_pool_)); CHECK_HSA_STATUS("Error: Command Buffer Pool is not initialized", status); status = hsasupport_singleton.GetAmdExtTable().hsa_amd_agent_iterate_memory_pools_fn( @@ -127,9 +128,10 @@ void InitializePools(hsa_agent_t cpu_agent, rocprofiler::HSAAgentInfo* agent_inf } void InitializeGPUPool(hsa_agent_t gpu_agent, rocprofiler::HSAAgentInfo* agent_info) { - rocprofiler::HSASupport_Singleton& hsasupport_singleton = rocprofiler::HSASupport_Singleton::GetInstance(); - hsa_status_t status = - hsasupport_singleton.GetAmdExtTable().hsa_amd_agent_iterate_memory_pools_fn(gpu_agent, FindStandardPool, &(agent_info->gpu_pool_)); + rocprofiler::HSASupport_Singleton& hsasupport_singleton = + rocprofiler::HSASupport_Singleton::GetInstance(); + hsa_status_t status = hsasupport_singleton.GetAmdExtTable().hsa_amd_agent_iterate_memory_pools_fn( + gpu_agent, FindStandardPool, &(agent_info->gpu_pool_)); CHECK_HSA_STATUS("hsa_amd_agent_iterate_memory_pools(gpu_pool)", status); } @@ -146,7 +148,8 @@ std::map metricsDict; void CheckPacketReqiurements() { - rocprofiler::HSASupport_Singleton& hsasupport_singleton = rocprofiler::HSASupport_Singleton::GetInstance(); + rocprofiler::HSASupport_Singleton& hsasupport_singleton = + rocprofiler::HSASupport_Singleton::GetInstance(); for (auto& gpu_agent : hsasupport_singleton.gpu_agents) { // get the instance of MetricsDict rocprofiler::HSAAgentInfo& agentInfo = hsasupport_singleton.GetHSAAgentInfo(gpu_agent.handle); @@ -162,8 +165,10 @@ InitializeAqlPackets(hsa_agent_t cpu_agent, hsa_agent_t gpu_agent, std::vector& counter_names, rocprofiler_session_id_t session_id, bool is_spm) { hsa_status_t status = HSA_STATUS_SUCCESS; - rocprofiler::ROCProfiler_Singleton& rocprofiler_singleton = rocprofiler::ROCProfiler_Singleton::GetInstance(); - rocprofiler::HSASupport_Singleton& hsasupport_singleton = 
rocprofiler::HSASupport_Singleton::GetInstance();
+  rocprofiler::ROCProfiler_Singleton& rocprofiler_singleton =
+      rocprofiler::ROCProfiler_Singleton::GetInstance();
+  rocprofiler::HSASupport_Singleton& hsasupport_singleton =
+      rocprofiler::HSASupport_Singleton::GetInstance();
   if (!counters_added.load(std::memory_order_acquire)) {
     for (auto& name : counter_names) {
       if (rocprofiler_singleton.HasActiveSession()) {
@@ -337,7 +342,7 @@
           << "Error: Command buffer given size is " << size << std::endl;
       abort();
     }
-    status =hsasupport_singleton.GetAmdExtTable().hsa_amd_memory_pool_allocate_fn(
+    status = hsasupport_singleton.GetAmdExtTable().hsa_amd_memory_pool_allocate_fn(
         agentInfo.cpu_pool_, size, 0, reinterpret_cast<void**>(&(profile->command_buffer.ptr)));
     if (status != HSA_STATUS_SUCCESS) {
       profile->command_buffer.ptr = malloc(size);
@@ -351,7 +356,7 @@
       }
     } else {
       // Both the CPU and GPU can access the memory
-      status =hsasupport_singleton.GetAmdExtTable().hsa_amd_agents_allow_access_fn(
+      status = hsasupport_singleton.GetAmdExtTable().hsa_amd_agents_allow_access_fn(
           ag_list_count, ag_list, NULL, profile->command_buffer.ptr);
       CHECK_HSA_STATUS("Error: Allowing access to Command Buffer", status);
     }
@@ -364,8 +369,9 @@
           << "Error: Output buffer given size is " << size << std::endl;
       abort();
     }
-    status =hsasupport_singleton.GetAmdExtTable().hsa_amd_memory_pool_allocate_fn(
-        agentInfo.kernarg_pool_, size, 0, reinterpret_cast<void**>(&profile->output_buffer.ptr));
+    status = hsasupport_singleton.GetAmdExtTable().hsa_amd_memory_pool_allocate_fn(
+        agentInfo.kernarg_pool_, size, 0,
+        reinterpret_cast<void**>(&profile->output_buffer.ptr));
     if (status != HSA_STATUS_SUCCESS) {
       profile->output_buffer.ptr = malloc(size);
       /*numa_alloc_onnode(
@@ -378,7 +384,7 @@
         abort();
       }
     } else {
-      status =hsasupport_singleton.GetAmdExtTable().hsa_amd_agents_allow_access_fn(
+      status = hsasupport_singleton.GetAmdExtTable().hsa_amd_agents_allow_access_fn(
           ag_list_count, ag_list, NULL, profile->output_buffer.ptr);
       CHECK_HSA_STATUS("Error: GPU Agent can't have output buffer access", status);
       memset(profile->output_buffer.ptr, 0x0, profile->output_buffer.size);
@@ -426,34 +432,33 @@ hsa_ven_amd_aqlprofile_profile_t* InitializeDeviceProfilingAqlPackets(
   // Preparing and getting the size of the command and output buffers
   status = hsa_ven_amd_aqlprofile_start(profile, NULL);
-  rocprofiler::HSASupport_Singleton& hsasupport_singleton = rocprofiler::HSASupport_Singleton::GetInstance();
+  rocprofiler::HSASupport_Singleton& hsasupport_singleton =
+      rocprofiler::HSASupport_Singleton::GetInstance();
   rocprofiler::HSAAgentInfo& agentInfo = hsasupport_singleton.GetHSAAgentInfo(gpu_agent.handle);
   size_t ag_list_count = 1;
   hsa_agent_t ag_list[ag_list_count];
   ag_list[0] = gpu_agent;
   // Allocating Command Buffer
-  //FixMe: Command buffer and output buffers are allocated repetatively.
+  // FixMe: Command buffer and output buffers are allocated repetitively.
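// A minimal sketch of the page-rounding idiom these allocators rely on,
// (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK, assuming a 4 KiB page; kPageSize
// and kPageMask below are illustrative stand-ins, not the project's actual
// MEM_PAGE_MASK definition.
#include <cstddef>

constexpr std::size_t kPageSize = 0x1000;         // assumed page size
constexpr std::size_t kPageMask = kPageSize - 1;  // low bits to clear

constexpr std::size_t RoundUpToPage(std::size_t n) {
  // Add the mask, then clear the low bits: 1 -> 4096, 4096 -> 4096, 4097 -> 8192.
  return (n + kPageMask) & ~kPageMask;
}

static_assert(RoundUpToPage(1) == 0x1000, "rounds up to one page");
static_assert(RoundUpToPage(0x1000) == 0x1000, "aligned sizes are unchanged");
static_assert(RoundUpToPage(0x1001) == 0x2000, "spills into the next page");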
status = HSA_STATUS_ERROR; size_t size = profile->command_buffer.size; profile->command_buffer.ptr = nullptr; if (size <= 0) return nullptr; size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; - status =hsasupport_singleton.GetAmdExtTable().hsa_amd_memory_pool_allocate_fn( + status = hsasupport_singleton.GetAmdExtTable().hsa_amd_memory_pool_allocate_fn( agentInfo.cpu_pool_, size, 0, reinterpret_cast(&(profile->command_buffer.ptr))); // Both the CPU and GPU can access the memory if (status == HSA_STATUS_SUCCESS) { - status =hsasupport_singleton.GetAmdExtTable().hsa_amd_agents_allow_access_fn( + status = hsasupport_singleton.GetAmdExtTable().hsa_amd_agents_allow_access_fn( ag_list_count, ag_list, NULL, profile->command_buffer.ptr); CHECK_HSA_STATUS("Error: GPU Agent can't have command buffer access", status); } else { hsa_agent_t near_cpu_node = agentInfo.GetNearCpuAgent(); uint32_t near_cpu_node_id = 0; - hsasupport_singleton.GetCoreApiTable().hsa_agent_get_info_fn(near_cpu_node, - HSA_AGENT_INFO_NODE, &near_cpu_node_id); - profile->command_buffer.ptr = numa_alloc_onnode( - profile->command_buffer.size, - near_cpu_node_id); + hsasupport_singleton.GetCoreApiTable().hsa_agent_get_info_fn(near_cpu_node, HSA_AGENT_INFO_NODE, + &near_cpu_node_id); + profile->command_buffer.ptr = numa_alloc_onnode(profile->command_buffer.size, near_cpu_node_id); if (profile->command_buffer.ptr != nullptr) { status = HSA_STATUS_SUCCESS; } else { @@ -466,12 +471,12 @@ hsa_ven_amd_aqlprofile_profile_t* InitializeDeviceProfilingAqlPackets( size = profile->output_buffer.size; profile->output_buffer.ptr = nullptr; size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; - status =hsasupport_singleton.GetAmdExtTable().hsa_amd_memory_pool_allocate_fn( + status = hsasupport_singleton.GetAmdExtTable().hsa_amd_memory_pool_allocate_fn( agentInfo.gpu_pool_, size, 0, reinterpret_cast(&(profile->output_buffer.ptr))); CHECK_HSA_STATUS("Error: Can't Allocate Output Buffer", status); // Both the CPU and GPU can access the kernel arguments if (status == HSA_STATUS_SUCCESS) { - status =hsasupport_singleton.GetAmdExtTable().hsa_amd_agents_allow_access_fn( + status = hsasupport_singleton.GetAmdExtTable().hsa_amd_agents_allow_access_fn( ag_list_count, ag_list, NULL, profile->output_buffer.ptr); CHECK_HSA_STATUS("Error: Can't allow access on the Output Buffer for the GPU", status); memset(profile->output_buffer.ptr, 0x0, profile->output_buffer.size); @@ -501,8 +506,9 @@ uint8_t* AllocateSysMemory(hsa_agent_t gpu_agent, size_t size, hsa_amd_memory_po hsa_status_t status = HSA_STATUS_ERROR; uint8_t* buffer = NULL; size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; - rocprofiler::HSASupport_Singleton& hsasupport_singleton = rocprofiler::HSASupport_Singleton::GetInstance(); - status =hsasupport_singleton.GetAmdExtTable().hsa_amd_memory_pool_allocate_fn( + rocprofiler::HSASupport_Singleton& hsasupport_singleton = + rocprofiler::HSASupport_Singleton::GetInstance(); + status = hsasupport_singleton.GetAmdExtTable().hsa_amd_memory_pool_allocate_fn( *cpu_pool, size, 0, reinterpret_cast(&buffer)); // Both the CPU and GPU can access the memory if (status == HSA_STATUS_SUCCESS) { @@ -516,16 +522,20 @@ uint8_t* AllocateSysMemory(hsa_agent_t gpu_agent, size_t size, hsa_amd_memory_po // Allocate memory for use by a kernel of specified size uint8_t* AllocateLocalMemory(size_t size, hsa_amd_memory_pool_t* gpu_pool) { hsa_status_t status = HSA_STATUS_ERROR; - rocprofiler::HSASupport_Singleton& hsasupport_singleton = rocprofiler::HSASupport_Singleton::GetInstance(); + 
rocprofiler::HSASupport_Singleton& hsasupport_singleton = + rocprofiler::HSASupport_Singleton::GetInstance(); uint8_t* buffer = NULL; size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; - status = hsasupport_singleton.GetAmdExtTable().hsa_amd_memory_pool_allocate_fn(*gpu_pool, size, 0, reinterpret_cast(&buffer)); + status = hsasupport_singleton.GetAmdExtTable().hsa_amd_memory_pool_allocate_fn( + *gpu_pool, size, 0, reinterpret_cast(&buffer)); uint8_t* ptr = (status == HSA_STATUS_SUCCESS) ? buffer : NULL; return ptr; } -hsa_status_t Allocate(hsa_agent_t gpu_agent, hsa_ven_amd_aqlprofile_profile_t* profile, size_t att_buffer_size) { - rocprofiler::HSAAgentInfo& agentInfo = rocprofiler::HSASupport_Singleton::GetInstance().GetHSAAgentInfo(gpu_agent.handle); +hsa_status_t Allocate(hsa_agent_t gpu_agent, hsa_ven_amd_aqlprofile_profile_t* profile, + size_t att_buffer_size) { + rocprofiler::HSAAgentInfo& agentInfo = + rocprofiler::HSASupport_Singleton::GetInstance().GetHSAAgentInfo(gpu_agent.handle); profile->command_buffer.ptr = AllocateSysMemory(gpu_agent, profile->command_buffer.size, &agentInfo.cpu_pool_); profile->output_buffer.size = att_buffer_size; @@ -538,11 +548,14 @@ hsa_status_t Allocate(hsa_agent_t gpu_agent, hsa_ven_amd_aqlprofile_profile_t* p bool AllocateMemoryPools(hsa_agent_t cpu_agent, hsa_agent_t gpu_agent, hsa_amd_memory_pool_t* cpu_pool, hsa_amd_memory_pool_t* gpu_pool) { - rocprofiler::HSASupport_Singleton& hsasupport_singleton = rocprofiler::HSASupport_Singleton::GetInstance(); - hsa_status_t status = hsasupport_singleton.GetAmdExtTable().hsa_amd_agent_iterate_memory_pools_fn(cpu_agent, FindStandardPool, cpu_pool); + rocprofiler::HSASupport_Singleton& hsasupport_singleton = + rocprofiler::HSASupport_Singleton::GetInstance(); + hsa_status_t status = hsasupport_singleton.GetAmdExtTable().hsa_amd_agent_iterate_memory_pools_fn( + cpu_agent, FindStandardPool, cpu_pool); CHECK_HSA_STATUS("hsa_amd_agent_iterate_memory_pools(cpu_pool)", status); - status = hsasupport_singleton.GetAmdExtTable().hsa_amd_agent_iterate_memory_pools_fn(gpu_agent, FindStandardPool, gpu_pool); + status = hsasupport_singleton.GetAmdExtTable().hsa_amd_agent_iterate_memory_pools_fn( + gpu_agent, FindStandardPool, gpu_pool); CHECK_HSA_STATUS("hsa_amd_agent_iterate_memory_pools(gpu_pool)", status); return true; diff --git a/src/core/hsa/queues/queue.cpp b/src/core/hsa/queues/queue.cpp index 4b8c2aa0..56fc00cf 100644 --- a/src/core/hsa/queues/queue.cpp +++ b/src/core/hsa/queues/queue.cpp @@ -114,7 +114,6 @@ std::string GetKernelNameUsingDispatchID(uint64_t given_id) { } - struct kernel_descriptor_t { uint8_t reserved0[16]; int64_t kernel_code_entry_byte_offset; @@ -126,7 +125,7 @@ struct kernel_descriptor_t { uint8_t reserved2[6]; }; // AMD Compute Program Resource Register Three. 
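// For context on the typedef change below: clang-tidy's modernize-use-using
// rewrites C-style typedefs as alias-declarations. The two spellings are
// equivalent for a plain integer alias, but only the alias form composes with
// templates. A small sketch; rsrc_word32_t and WordBuffer are hypothetical
// names used for illustration only.
#include <cstdint>
#include <vector>

using rsrc_word32_t = std::uint32_t;  // was: typedef uint32_t rsrc_word32_t;

template <typename T>
using WordBuffer = std::vector<T>;  // alias template; not expressible as a typedef

static WordBuffer<rsrc_word32_t> rsrc_words(4, 0U);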
-typedef uint32_t amd_compute_pgm_rsrc_three32_t; +using amd_compute_pgm_rsrc_three32_t = uint32_t; enum amd_compute_gfx9_pgm_rsrc_three_t { AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_THREE_ACCUM_OFFSET, 0, 5), AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_COMPUTE_PGM_RSRC_THREE_TG_SPLIT, 16, 1) @@ -158,17 +157,20 @@ enum amd_kernel_code_property_t { static const kernel_descriptor_t* GetKernelCode(uint64_t kernel_object) { const kernel_descriptor_t* kernel_code = NULL; - rocprofiler::HSASupport_Singleton& hsasupport_singleton = rocprofiler::HSASupport_Singleton::GetInstance(); - hsa_status_t status = hsasupport_singleton.GetHSALoaderApi().hsa_ven_amd_loader_query_host_address( - reinterpret_cast(kernel_object), reinterpret_cast(&kernel_code)); + rocprofiler::HSASupport_Singleton& hsasupport_singleton = + rocprofiler::HSASupport_Singleton::GetInstance(); + hsa_status_t status = + hsasupport_singleton.GetHSALoaderApi().hsa_ven_amd_loader_query_host_address( + reinterpret_cast(kernel_object), + reinterpret_cast(&kernel_code)); if (HSA_STATUS_SUCCESS != status) { kernel_code = reinterpret_cast(kernel_object); } return kernel_code; } -static uint32_t arch_vgpr_count(const std::string_view& name, const kernel_descriptor_t& kernel_code) { - +static uint32_t arch_vgpr_count(const std::string_view& name, + const kernel_descriptor_t& kernel_code) { std::string info_name(name.data(), name.size()); if (strcmp(name.data(), "gfx90a") == 0 || strncmp(name.data(), "gfx94", 5) == 0) return (AMD_HSA_BITS_GET(kernel_code.compute_pgm_rsrc3, @@ -184,8 +186,8 @@ static uint32_t arch_vgpr_count(const std::string_view& name, const kernel_descr ? 8 : 4); } -static uint32_t accum_vgpr_count(const std::string_view& name, const kernel_descriptor_t& kernel_code) { - +static uint32_t accum_vgpr_count(const std::string_view& name, + const kernel_descriptor_t& kernel_code) { std::string info_name(name.data(), name.size()); if (strcmp(info_name.c_str(), "gfx908") == 0) return arch_vgpr_count(name, kernel_code); if (strcmp(info_name.c_str(), "gfx90a") == 0 || strncmp(info_name.c_str(), "gfx94", 5) == 0) @@ -204,19 +206,19 @@ static uint32_t sgpr_count(const std::string_view& name, const kernel_descriptor // TODO(srnagara): Recheck the extraction of gfxip from gpu name const char* name_data = name.data(); const size_t gfxip_label_len = std::min(name.size() - 2, size_t{63}); - if (gfxip_label_len > 0 && strlen(name_data) >= gfxip_label_len) { - char gfxip[gfxip_label_len]; + if (gfxip_label_len > 0 && strnlen(name_data, gfxip_label_len + 1) >= gfxip_label_len) { + char gfxip[gfxip_label_len + 1]; memcpy(gfxip, name_data, gfxip_label_len); + gfxip[gfxip_label_len] = '\0'; // TODO(srnagara): Check if it is hardcoded - if (std::atoi(&gfxip[3]) >= 10) return 128; + if (std::stoi(&gfxip[3]) >= 10) return 128; return (AMD_HSA_BITS_GET(kernel_code.compute_pgm_rsrc1, AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WAVEFRONT_SGPR_COUNT) / 2 + 1) * 16; - } else { - return 0; } + return 0; } rocprofiler_kernel_properties_t set_kernel_properties(hsa_kernel_dispatch_packet_t packet, @@ -233,9 +235,11 @@ rocprofiler_kernel_properties_t set_kernel_properties(hsa_kernel_dispatch_packet kernel_properties_ptr.workgroup_size = (uint32_t)workgroup_size; kernel_properties_ptr.lds_size = packet.group_segment_size; kernel_properties_ptr.scratch_size = packet.private_segment_size; - HSAAgentInfo agent_info = HSASupport_Singleton::GetInstance().GetHSAAgentInfo(agent.handle); - kernel_properties_ptr.arch_vgpr_count = 
arch_vgpr_count(agent_info.GetDeviceInfo().getName(), *kernel_code); - kernel_properties_ptr.accum_vgpr_count = accum_vgpr_count(agent_info.GetDeviceInfo().getName(), *kernel_code); + HSAAgentInfo agent_info = HSASupport_Singleton::GetInstance().GetHSAAgentInfo(agent.handle); + kernel_properties_ptr.arch_vgpr_count = + arch_vgpr_count(agent_info.GetDeviceInfo().getName(), *kernel_code); + kernel_properties_ptr.accum_vgpr_count = + accum_vgpr_count(agent_info.GetDeviceInfo().getName(), *kernel_code); kernel_properties_ptr.sgpr_count = sgpr_count(agent_info.GetDeviceInfo().getName(), *kernel_code); kernel_properties_ptr.wave_size = AMD_HSA_BITS_GET(kernel_code->kernel_code_properties, @@ -249,7 +253,7 @@ rocprofiler_kernel_properties_t set_kernel_properties(hsa_kernel_dispatch_packet namespace queue { - hsa_status_t pmcCallback(hsa_ven_amd_aqlprofile_info_type_t info_type, +hsa_status_t pmcCallback(hsa_ven_amd_aqlprofile_info_type_t info_type, hsa_ven_amd_aqlprofile_info_data_t* info_data, void* data) { hsa_status_t status = HSA_STATUS_SUCCESS; pmc_callback_data_t* passed_data = reinterpret_cast(data); @@ -302,7 +306,8 @@ void AddRecordCounters(rocprofiler_record_profiler_t* record, const pending_sign rocprofiler_record_counter_value_t{value}}); } record->counters = counters; - rocprofiler::Session* session = rocprofiler::ROCProfiler_Singleton::GetInstance().GetSession(pending->session_id); + rocprofiler::Session* session = + rocprofiler::ROCProfiler_Singleton::GetInstance().GetSession(pending->session_id); void* initial_handle = const_cast(record->counters); if (session->FindBuffer(pending->buffer_id)) { Memory::GenericBuffer* buffer = session->GetBuffer(pending->buffer_id); @@ -365,10 +370,11 @@ void AddAttRecord(rocprofiler_record_att_tracer_t* record, hsa_agent_t gpu_agent bool AsyncSignalHandler(hsa_signal_value_t signal_value, void* data) { auto queue_info_session = static_cast(data); - rocprofiler::ROCProfiler_Singleton& rocprofiler_singleton = rocprofiler::ROCProfiler_Singleton::GetInstance(); - rocprofiler::HSASupport_Singleton& hsasupport_singleton = rocprofiler::HSASupport_Singleton::GetInstance(); - if (!queue_info_session || - !rocprofiler_singleton.GetSession(queue_info_session->session_id) || + rocprofiler::ROCProfiler_Singleton& rocprofiler_singleton = + rocprofiler::ROCProfiler_Singleton::GetInstance(); + rocprofiler::HSASupport_Singleton& hsasupport_singleton = + rocprofiler::HSASupport_Singleton::GetInstance(); + if (!queue_info_session || !rocprofiler_singleton.GetSession(queue_info_session->session_id) || !rocprofiler_singleton.GetSession(queue_info_session->session_id)->GetProfiler()) return true; rocprofiler::Session* session = rocprofiler_singleton.GetSession(queue_info_session->session_id); @@ -381,9 +387,10 @@ bool AsyncSignalHandler(hsa_signal_value_t signal_value, void* data) { for (auto it = pending_signals.begin(); it != pending_signals.end(); it = pending_signals.erase(it)) { auto& pending = *it; - if (hsasupport_singleton.GetCoreApiTable().hsa_signal_load_relaxed_fn(pending->new_signal)) return true; + if (hsasupport_singleton.GetCoreApiTable().hsa_signal_load_relaxed_fn(pending->new_signal)) + return true; hsa_amd_profiling_dispatch_time_t time; - hsasupport_singleton.GetAmdExtTable().hsa_amd_profiling_get_dispatch_time_fn( + hsasupport_singleton.GetAmdExtTable().hsa_amd_profiling_get_dispatch_time_fn( queue_info_session->agent, pending->original_signal, &time); uint32_t record_count = 1; bool is_individual_xcc_mode = false; @@ -429,7 +436,7 @@ bool 
AsyncSignalHandler(hsa_signal_value_t signal_value, void* data) { pending->context->metrics_list, time.end - time.start); AddRecordCounters(&record, pending); - }else { + } else { if (session->FindBuffer(pending->buffer_id)) { Memory::GenericBuffer* buffer = session->GetBuffer(pending->buffer_id); buffer->AddRecord(record); @@ -440,12 +447,12 @@ bool AsyncSignalHandler(hsa_signal_value_t signal_value, void* data) { // TODO(aelwazir): we need a better way of distributing events and free them // if (pending->profile->output_buffer.ptr) // numa_free(pending->profile->output_buffer.ptr, pending->profile->output_buffer.size); - hsa_status_t status =hsasupport_singleton.GetAmdExtTable().hsa_amd_memory_pool_free_fn( + hsa_status_t status = hsasupport_singleton.GetAmdExtTable().hsa_amd_memory_pool_free_fn( (pending->profile->output_buffer.ptr)); CHECK_HSA_STATUS("Error: Couldn't free output buffer memory", status); // if (pending->profile->command_buffer.ptr) // numa_free(pending->profile->command_buffer.ptr, pending->profile->command_buffer.size); - status =hsasupport_singleton.GetAmdExtTable().hsa_amd_memory_pool_free_fn( + status = hsasupport_singleton.GetAmdExtTable().hsa_amd_memory_pool_free_fn( (pending->profile->command_buffer.ptr)); CHECK_HSA_STATUS("Error: Couldn't free command buffer memory", status); delete pending->profile; @@ -455,9 +462,10 @@ bool AsyncSignalHandler(hsa_signal_value_t signal_value, void* data) { delete pending->context; } if (pending->new_signal.handle) - hsasupport_singleton.GetCoreApiTable().hsa_signal_destroy_fn(pending->new_signal); + hsasupport_singleton.GetCoreApiTable().hsa_signal_destroy_fn(pending->new_signal); if (queue_info_session->interrupt_signal.handle) - hsasupport_singleton.GetCoreApiTable().hsa_signal_destroy_fn(queue_info_session->interrupt_signal); + hsasupport_singleton.GetCoreApiTable().hsa_signal_destroy_fn( + queue_info_session->interrupt_signal); } } delete queue_info_session; @@ -466,12 +474,12 @@ bool AsyncSignalHandler(hsa_signal_value_t signal_value, void* data) { } bool AsyncSignalHandlerATT(hsa_signal_value_t /* signal */, void* data) { - auto queue_info_session = static_cast(data); - rocprofiler::ROCProfiler_Singleton& rocprofiler_singleton = rocprofiler::ROCProfiler_Singleton::GetInstance(); - rocprofiler::HSASupport_Singleton& hsasupport_singleton = rocprofiler::HSASupport_Singleton::GetInstance(); - if (!queue_info_session || - !rocprofiler_singleton.GetSession(queue_info_session->session_id) || + rocprofiler::ROCProfiler_Singleton& rocprofiler_singleton = + rocprofiler::ROCProfiler_Singleton::GetInstance(); + rocprofiler::HSASupport_Singleton& hsasupport_singleton = + rocprofiler::HSASupport_Singleton::GetInstance(); + if (!queue_info_session || !rocprofiler_singleton.GetSession(queue_info_session->session_id) || !rocprofiler_singleton.GetSession(queue_info_session->session_id)->GetAttTracer()) return true; rocprofiler::Session* session = rocprofiler_singleton.GetSession(queue_info_session->session_id); @@ -487,7 +495,8 @@ bool AsyncSignalHandlerATT(hsa_signal_value_t /* signal */, void* data) { it = pending_signals.erase(it)) { auto& pending = *it; std::lock_guard lock(session->GetSessionLock()); - if (hsasupport_singleton.GetCoreApiTable().hsa_signal_load_relaxed_fn(pending.new_signal)) return true; + if (hsasupport_singleton.GetCoreApiTable().hsa_signal_load_relaxed_fn(pending.new_signal)) + return true; rocprofiler_record_att_tracer_t record{}; record.kernel_id = rocprofiler_kernel_id_t{pending.kernel_descriptor}; record.gpu_id 
= rocprofiler_agent_id_t{(uint64_t)queue_info_session->gpu_index}; @@ -535,7 +544,7 @@ bool AsyncSignalHandlerATT(hsa_signal_value_t /* signal */, void* data) { void CreateBarrierPacket(const hsa_signal_t& packet_completion_signal, std::vector* transformed_packets) { - hsa_barrier_and_packet_t barrier{0}; + hsa_barrier_and_packet_t barrier{}; barrier.header = HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE; barrier.dep_signal[0] = packet_completion_signal; void* barrier_ptr = &barrier; @@ -549,20 +558,23 @@ void AddVendorSpecificPacket(const Packet::packet_t* packet, } void SignalAsyncHandler(const hsa_signal_t& signal, void* data) { - hsa_status_t status = HSASupport_Singleton::GetInstance().GetAmdExtTable().hsa_amd_signal_async_handler_fn( - signal, HSA_SIGNAL_CONDITION_EQ, 0, AsyncSignalHandler, data); + hsa_status_t status = + HSASupport_Singleton::GetInstance().GetAmdExtTable().hsa_amd_signal_async_handler_fn( + signal, HSA_SIGNAL_CONDITION_EQ, 0, AsyncSignalHandler, data); CHECK_HSA_STATUS("Error: hsa_amd_signal_async_handler failed", status); } void signalAsyncHandlerATT(const hsa_signal_t& signal, void* data) { - hsa_status_t status = HSASupport_Singleton::GetInstance().GetAmdExtTable().hsa_amd_signal_async_handler_fn( - signal, HSA_SIGNAL_CONDITION_EQ, 0, AsyncSignalHandlerATT, data); + hsa_status_t status = + HSASupport_Singleton::GetInstance().GetAmdExtTable().hsa_amd_signal_async_handler_fn( + signal, HSA_SIGNAL_CONDITION_EQ, 0, AsyncSignalHandlerATT, data); CHECK_HSA_STATUS("Error: hsa_amd_signal_async_handler for ATT failed", status); } void CreateSignal(uint32_t attribute, hsa_signal_t* signal) { hsa_status_t status = - HSASupport_Singleton::GetInstance().GetAmdExtTable().hsa_amd_signal_create_fn(1, 0, nullptr, attribute, signal); + HSASupport_Singleton::GetInstance().GetAmdExtTable().hsa_amd_signal_create_fn( + 1, 0, nullptr, attribute, signal); CHECK_HSA_STATUS("Error: hsa_amd_signal_create failed", status); } @@ -604,15 +616,16 @@ void ResetSessionID(rocprofiler_session_id_t id) { session_id = id; } void CheckNeededProfileConfigs() { rocprofiler_session_id_t internal_session_id; - // Getting Session ID - rocprofiler::ROCProfiler_Singleton& rocprofiler_singleton = rocprofiler::ROCProfiler_Singleton::GetInstance(); + // Getting Session ID + rocprofiler::ROCProfiler_Singleton& rocprofiler_singleton = + rocprofiler::ROCProfiler_Singleton::GetInstance(); internal_session_id = rocprofiler_singleton.GetCurrentSessionId(); if (session_id.handle == 0 || internal_session_id.handle != session_id.handle) { session_id = internal_session_id; // Getting Counters count from the Session - if (session_id.handle > 0 ) { + if (session_id.handle > 0) { session = rocprofiler_singleton.GetSession(session_id); if (session && session->FindFilterWithKind(ROCPROFILER_COUNTERS_COLLECTION)) { rocprofiler_filter_id_t filter_id = @@ -658,7 +671,8 @@ std::pair, bool> GetAllowedProfilesList(const void* packets, i std::vector can_profile_packet; bool b_can_profile_anypacket = false; can_profile_packet.reserve(pkt_count); - rocprofiler::HSASupport_Singleton& hsasupport_singleton = rocprofiler::HSASupport_Singleton::GetInstance(); + rocprofiler::HSASupport_Singleton& hsasupport_singleton = + rocprofiler::HSASupport_Singleton::GetInstance(); std::lock_guard lock(hsasupport_singleton.ksymbol_map_lock); assert(hsasupport_singleton.ksymbols); @@ -702,13 +716,9 @@ std::pair, bool> GetAllowedProfilesList(const void* packets, i return {can_profile_packet, b_can_profile_anypacket}; } -std::pair 
-ProcessATTParams( - Packet::packet_t& start_packet, - Packet::packet_t& stop_packet, - Queue& queue_info, - rocprofiler::HSAAgentInfo& agentInfo -) { +std::pair ProcessATTParams( + Packet::packet_t& start_packet, Packet::packet_t& stop_packet, Queue& queue_info, + rocprofiler::HSAAgentInfo& agentInfo) { std::vector att_params; int num_att_counters = 0; uint32_t att_buffer_size = DEFAULT_ATT_BUFFER_SIZE; @@ -762,8 +772,9 @@ ProcessATTParams( for (; num_att_counters < 16; num_att_counters++) att_params.push_back(zero_perf); } // Get the PM4 Packets using packets_generator - return {Packet::GenerateATTPackets(queue_info.GetCPUAgent(), queue_info.GetGPUAgent(), - att_params, &start_packet, &stop_packet, att_buffer_size), capture_mode}; + return {Packet::GenerateATTPackets(queue_info.GetCPUAgent(), queue_info.GetGPUAgent(), att_params, + &start_packet, &stop_packet, att_buffer_size), + capture_mode}; } /** @@ -773,9 +784,8 @@ ProcessATTParams( * pointer to the packet. This packet is written into the queue by this * interceptor by invoking the writer function. */ -void Queue::WriteInterceptor(const void* packets, uint64_t pkt_count, uint64_t user_pkt_index, void* data, - hsa_amd_queue_intercept_packet_writer writer) { - +void Queue::WriteInterceptor(const void* packets, uint64_t pkt_count, uint64_t user_pkt_index, + void* data, hsa_amd_queue_intercept_packet_writer writer) { static const char* env_MAX_ATT_PROFILES = getenv("ROCPROFILER_MAX_ATT_PROFILES"); static int MAX_ATT_PROFILES = env_MAX_ATT_PROFILES ? atoi(env_MAX_ATT_PROFILES) : 1; @@ -871,7 +881,7 @@ void Queue::WriteInterceptor(const void* packets, uint64_t pkt_count, uint64_t u // Make a copy of the original packet, adding its signal to a barrier // packet and create a new signal for it to get timestamps if (original_packet.completion_signal.handle) { - hsa_barrier_and_packet_t barrier{0}; + hsa_barrier_and_packet_t barrier{}; barrier.header = HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE; Packet::packet_t* __attribute__((__may_alias__)) pkt = (reinterpret_cast(&barrier)); @@ -897,26 +907,26 @@ void Queue::WriteInterceptor(const void* packets, uint64_t pkt_count, uint64_t u // Added Interrupt Signal with barrier and provided handler for it CreateBarrierPacket(interrupt_signal, &transformed_packets); } else { - hsa_barrier_and_packet_t barrier{0}; + hsa_barrier_and_packet_t barrier{}; barrier.header = HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE; barrier.completion_signal = interrupt_signal; Packet::packet_t* __attribute__((__may_alias__)) pkt = (reinterpret_cast(&barrier)); transformed_packets.emplace_back(*pkt); } - rocprofiler::HSAAgentInfo& agentInfo = - rocprofiler::HSASupport_Singleton::GetInstance().GetHSAAgentInfo(queue_info.GetGPUAgent().handle); + rocprofiler::HSAAgentInfo& agentInfo = + rocprofiler::HSASupport_Singleton::GetInstance().GetHSAAgentInfo( + queue_info.GetGPUAgent().handle); // Creating Async Handler to be called every time the interrupt signal is // marked complete - SignalAsyncHandler( - interrupt_signal, - new queue_info_session_t{queue_info.GetGPUAgent(), session_id_snapshot, queue_info.GetQueueID(), - writer_id, interrupt_signal, agentInfo.GetDeviceInfo().getGPUId(), - agentInfo.GetDeviceInfo().getXccCount()}); + SignalAsyncHandler(interrupt_signal, + new queue_info_session_t{ + queue_info.GetGPUAgent(), session_id_snapshot, queue_info.GetQueueID(), + writer_id, interrupt_signal, agentInfo.GetDeviceInfo().getGPUId(), + agentInfo.GetDeviceInfo().getXccCount()}); 
ACTIVE_INTERRUPT_SIGNAL_COUNT.fetch_add(1, std::memory_order_relaxed); // profile_id++; // } while (replay_mode_count > 0 && profile_id < replay_mode_count); // Profiles loop end - } /* Write the transformed packets to the hardware queue. */ @@ -927,7 +937,9 @@ void Queue::WriteInterceptor(const void* packets, uint64_t pkt_count, uint64_t u // Getting Queue Data and Information auto& queue_info = *static_cast(data); std::lock_guard lk(queue_info.qw_mutex); - rocprofiler::HSAAgentInfo& agentInfo = rocprofiler::HSASupport_Singleton::GetInstance().GetHSAAgentInfo(queue_info.GetGPUAgent().handle); + rocprofiler::HSAAgentInfo& agentInfo = + rocprofiler::HSASupport_Singleton::GetInstance().GetHSAAgentInfo( + queue_info.GetGPUAgent().handle); bool can_profile_anypacket = false; std::vector can_profile_packet; @@ -947,11 +959,8 @@ void Queue::WriteInterceptor(const void* packets, uint64_t pkt_count, uint64_t u rocprofiler_codeobj_capture_mode_t capture_mode = ROCPROFILER_CAPTURE_SYMBOLS_ONLY; if (att_parameters_data.size() > 0) { - std::tie(profile, capture_mode) = ProcessATTParams(start_packet, - stop_packet, - queue_info, - agentInfo - ); + std::tie(profile, capture_mode) = + ProcessATTParams(start_packet, stop_packet, queue_info, agentInfo); } // Searching across all the packets given during this write @@ -974,7 +983,7 @@ void Queue::WriteInterceptor(const void* packets, uint64_t pkt_count, uint64_t u KernelInterceptCount += 1; writer_id = WRITER_ID.fetch_add(1, std::memory_order_release); - if (att_parameters_data.size() > 0 && profile) { + if (!att_parameters_data.empty() && profile) { // Adding start packet and its barrier with a dummy signal hsa_signal_t dummy_signal{}; dummy_signal.handle = 0; @@ -1001,14 +1010,14 @@ void Queue::WriteInterceptor(const void* packets, uint64_t pkt_count, uint64_t u kernel_properties, (uint32_t)syscall(__NR_gettid), user_pkt_index); uint64_t off = dispatch_packet.kernel_object + - GetKernelCode(dispatch_packet.kernel_object)->kernel_code_entry_byte_offset; + GetKernelCode(dispatch_packet.kernel_object)->kernel_code_entry_byte_offset; codeobj_record::make_capture(rocprofiler_record_id_t{record_id}, capture_mode, off); codeobj_record::start_capture(rocprofiler_record_id_t{record_id}); codeobj_record::stop_capture(rocprofiler_record_id_t{record_id}); // Make a copy of the original packet, adding its signal to a barrier packet - if (original_packet.completion_signal.handle) { - hsa_barrier_and_packet_t barrier{0}; + if (original_packet.completion_signal.handle != 0U) { + hsa_barrier_and_packet_t barrier{}; barrier.header = HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE; Packet::packet_t* __attribute__((__may_alias__)) pkt = (reinterpret_cast(&barrier)); @@ -1028,7 +1037,7 @@ void Queue::WriteInterceptor(const void* packets, uint64_t pkt_count, uint64_t u // Added Interrupt Signal with barrier and provided handler for it CreateBarrierPacket(interrupt_signal, &transformed_packets); } else { - hsa_barrier_and_packet_t barrier{0}; + hsa_barrier_and_packet_t barrier{}; barrier.header = HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE; barrier.completion_signal = interrupt_signal; Packet::packet_t* __attribute__((__may_alias__)) pkt = @@ -1050,13 +1059,12 @@ void Queue::WriteInterceptor(const void* packets, uint64_t pkt_count, uint64_t u /* Write the original packets to the hardware queue if no profiling session * is active */ writer(packets, pkt_count); - } } Queue::Queue(const hsa_agent_t cpu_agent, const hsa_agent_t gpu_agent, hsa_queue_t* queue) - : 
cpu_agent_(cpu_agent), gpu_agent_(gpu_agent), intercept_queue_(queue) { } + : cpu_agent_(cpu_agent), gpu_agent_(gpu_agent), intercept_queue_(queue) {} Queue::~Queue() { while (ACTIVE_INTERRUPT_SIGNAL_COUNT.load(std::memory_order_acquire) > 0) { @@ -1071,8 +1079,7 @@ hsa_agent_t Queue::GetCPUAgent() { return cpu_agent_; } uint64_t Queue::GetQueueID() { return intercept_queue_->id; } -void CheckPacketReqiurements() { - Packet::CheckPacketReqiurements();} +void CheckPacketReqiurements() { Packet::CheckPacketReqiurements(); } } // namespace queue } // namespace rocprofiler diff --git a/src/core/profile.h b/src/core/profile.h index d91e265c..8c9c7561 100644 --- a/src/core/profile.h +++ b/src/core/profile.h @@ -198,7 +198,6 @@ class Profile { status = api->hsa_ven_amd_aqlprofile_stop(&profile_, &stop); if (status != HSA_STATUS_SUCCESS) AQL_EXC_RAISING(status, "aqlprofile_stop"); hsa_status_t rd_status = HSA_STATUS_ERROR; -#ifdef AQLPROF_NEW_API if (profile_.type == HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_PMC) { rd_status = api->hsa_ven_amd_aqlprofile_read(&profile_, &read); if (is_concurrent) { // concurrent: one more read @@ -208,7 +207,6 @@ class Profile { } #if 0 // Read API returns error if disabled if (rd_status != HSA_STATUS_SUCCESS) AQL_EXC_RAISING(status, "aqlprofile_read"); -#endif #endif // Set completion signal of start diff --git a/src/core/session/tracer/src/roctracer.cpp b/src/core/session/tracer/src/roctracer.cpp index 363108de..fd19bdbf 100644 --- a/src/core/session/tracer/src/roctracer.cpp +++ b/src/core/session/tracer/src/roctracer.cpp @@ -833,9 +833,7 @@ static std::string getKernelNameMultiKernelMultiDevice(hipLaunchParams* launchPa return name_str.str(); } -template struct Overloaded : Ts... { - using Ts::operator()...; -}; +template struct Overloaded : Ts... { using Ts::operator()...; }; template Overloaded(Ts...) 
-> Overloaded; std::optional GetHipKernelName(uint32_t cid, hip_api_data_t* data) { diff --git a/src/tools/CMakeLists.txt b/src/tools/CMakeLists.txt index 4cfa1e89..e17664c5 100644 --- a/src/tools/CMakeLists.txt +++ b/src/tools/CMakeLists.txt @@ -28,38 +28,17 @@ target_compile_definitions( PUBLIC AMD_INTERNAL_BUILD PRIVATE HIP_PROF_HIP_API_STRING=1 __HIP_PLATFORM_AMD__=1) -if(ASAN) - target_compile_options(rocprofiler_tool PRIVATE -fsanitize=address) - target_link_libraries( - rocprofiler_tool - rocprofiler-v2 - hsa-runtime64::hsa-runtime64 - Threads::Threads - atomic - asan - dl - rt - stdc++fs - amd_comgr) - target_link_options( - rocprofiler_tool PRIVATE - -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/exportmap - -Wl,--no-undefined,-fsanitize=address) -else() - target_link_libraries( - rocprofiler_tool - rocprofiler-v2 - hsa-runtime64::hsa-runtime64 - Threads::Threads - atomic - dl - rt - stdc++fs - amd_comgr) - target_link_options( - rocprofiler_tool PRIVATE - -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/exportmap -Wl,--no-undefined) -endif() +target_link_libraries(rocprofiler_tool + PRIVATE $) + +target_link_libraries( + rocprofiler_tool + PUBLIC rocprofiler-v2 hsa-runtime64::hsa-runtime64 Threads::Threads atomic dl rt + stdc++fs amd_comgr + PRIVATE rocprofiler::memcheck) +target_link_options( + rocprofiler_tool PRIVATE -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/exportmap + -Wl,--no-undefined) install(TARGETS rocprofiler_tool LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}/rocprofiler COMPONENT runtime) diff --git a/src/tools/rocsys/CMakeLists.txt b/src/tools/rocsys/CMakeLists.txt index 3b809211..d6d10c7a 100644 --- a/src/tools/rocsys/CMakeLists.txt +++ b/src/tools/rocsys/CMakeLists.txt @@ -9,10 +9,7 @@ file(GLOB ROCPROFILER_ROCSYS_SRC_FILES ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp) # Compiling/Installing ROCProfiler API add_executable(rocprofiler_rocsys_fe ${ROCPROFILER_ROCSYS_SRC_FILES}) -set_target_properties( - rocprofiler_rocsys_fe - PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_OUTPUT_DIRECTORY} OUTPUT_NAME - "rocsys") +set_target_properties(rocprofiler_rocsys_fe PROPERTIES OUTPUT_NAME "rocsys") target_include_directories( rocprofiler_rocsys_fe PRIVATE ${PROJECT_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR} diff --git a/src/util/hsa_rsrc_factory.cpp b/src/util/hsa_rsrc_factory.cpp index cbc63b7f..156d5252 100644 --- a/src/util/hsa_rsrc_factory.cpp +++ b/src/util/hsa_rsrc_factory.cpp @@ -53,8 +53,8 @@ namespace util { static const char* cpp_demangle(const char* symname) { size_t size = 0; int status; - const char* ret = abi::__cxa_demangle(symname, NULL, &size, &status); - return (ret != 0) ? ret : strdup(symname); + const char* ret = abi::__cxa_demangle(symname, nullptr, &size, &status); + return (ret != nullptr) ? 
ret : strdup(symname); } // Callback function to get available in the system agents @@ -62,7 +62,7 @@ hsa_status_t HsaRsrcFactory::GetHsaAgentsCallback(hsa_agent_t agent, void* data) hsa_status_t status = HSA_STATUS_ERROR; HsaRsrcFactory* hsa_rsrc = reinterpret_cast(data); const AgentInfo* agent_info = hsa_rsrc->AddAgentInfo(agent); - if (agent_info != NULL) status = HSA_STATUS_SUCCESS; + if (agent_info != nullptr) status = HSA_STATUS_SUCCESS; return status; } @@ -123,10 +123,10 @@ hsa_status_t FindKernArgPool(hsa_amd_memory_pool_t pool, void* data) { HsaRsrcFactory::HsaRsrcFactory(bool initialize_hsa) : initialize_hsa_(initialize_hsa) { hsa_status_t status; - cpu_pool_ = NULL; - kern_arg_pool_ = NULL; + cpu_pool_ = nullptr; + kern_arg_pool_ = nullptr; - InitHsaApiTable(NULL); + InitHsaApiTable(nullptr); // Initialize the Hsa Runtime if (initialize_hsa_) { @@ -137,11 +137,12 @@ HsaRsrcFactory::HsaRsrcFactory(bool initialize_hsa) : initialize_hsa_(initialize // Discover the set of Gpu devices available on the platform status = hsa_api_.hsa_iterate_agents(GetHsaAgentsCallback, this); CHECK_STATUS("Error Calling hsa_iterate_agents", status); - if (cpu_pool_ == NULL) CHECK_STATUS("CPU memory pool is not found", HSA_STATUS_ERROR); - if (kern_arg_pool_ == NULL) CHECK_STATUS("Kern-arg memory pool is not found", HSA_STATUS_ERROR); + if (cpu_pool_ == nullptr) CHECK_STATUS("CPU memory pool is not found", HSA_STATUS_ERROR); + if (kern_arg_pool_ == nullptr) + CHECK_STATUS("Kern-arg memory pool is not found", HSA_STATUS_ERROR); // Get AqlProfile API table - aqlprofile_api_ = {0}; + aqlprofile_api_ = {}; #ifdef ROCP_LD_AQLPROFILE status = LoadAqlProfileLib(&aqlprofile_api_); #else @@ -152,7 +153,7 @@ HsaRsrcFactory::HsaRsrcFactory(bool initialize_hsa) : initialize_hsa_(initialize CHECK_STATUS("aqlprofile API table load failed", status); // Get Loader API table - loader_api_ = {0}; + loader_api_ = {}; status = hsa_api_.hsa_system_get_major_extension_table(HSA_EXTENSION_AMD_LOADER, 1, sizeof(loader_api_), &loader_api_); CHECK_STATUS("loader API table query failed", status); @@ -160,7 +161,7 @@ HsaRsrcFactory::HsaRsrcFactory(bool initialize_hsa) : initialize_hsa_(initialize // Instantiate HSA timer timer_ = new HsaTimer(&hsa_api_); CHECK_STATUS("HSA timer allocation failed", - (timer_ == NULL) ? HSA_STATUS_ERROR : HSA_STATUS_SUCCESS); + (timer_ == nullptr) ? 
HSA_STATUS_ERROR : HSA_STATUS_SUCCESS); // Time correlation const uint32_t corr_iters = 1000; @@ -179,8 +180,8 @@ HsaRsrcFactory::HsaRsrcFactory(bool initialize_hsa) : initialize_hsa_(initialize // Destructor of the class HsaRsrcFactory::~HsaRsrcFactory() { delete timer_; - for (auto p : cpu_list_) delete p; - for (auto p : gpu_list_) delete p; + for (const auto* p : cpu_list_) delete p; + for (const auto* p : gpu_list_) delete p; if (initialize_hsa_) { hsa_status_t status = hsa_api_.hsa_shut_down(); CHECK_STATUS("Error in hsa_shut_down", status); @@ -190,8 +191,8 @@ HsaRsrcFactory::~HsaRsrcFactory() { void HsaRsrcFactory::InitHsaApiTable(HsaApiTable* table) { std::lock_guard lck(mutex_); - if (hsa_api_.hsa_init == NULL) { - if (table != NULL) { + if (hsa_api_.hsa_init == nullptr) { + if (table != nullptr) { hsa_api_.hsa_init = table->core_->hsa_init_fn; hsa_api_.hsa_shut_down = table->core_->hsa_shut_down_fn; hsa_api_.hsa_agent_get_info = table->core_->hsa_agent_get_info_fn; @@ -289,7 +290,7 @@ void HsaRsrcFactory::InitHsaApiTable(HsaApiTable* table) { hsa_status_t HsaRsrcFactory::LoadAqlProfileLib(aqlprofile_pfn_t* api) { void* handle = dlopen(kAqlProfileLib, RTLD_NOW); - if (handle == NULL) { + if (handle == nullptr) { fprintf(stderr, "Loading '%s' failed, %s\n", kAqlProfileLib, dlerror()); return HSA_STATUS_ERROR; } @@ -305,10 +306,8 @@ hsa_status_t HsaRsrcFactory::LoadAqlProfileLib(aqlprofile_pfn_t* api) { (decltype(::hsa_ven_amd_aqlprofile_start)*)dlsym(handle, "hsa_ven_amd_aqlprofile_start"); api->hsa_ven_amd_aqlprofile_stop = (decltype(::hsa_ven_amd_aqlprofile_stop)*)dlsym(handle, "hsa_ven_amd_aqlprofile_stop"); -#ifdef AQLPROF_NEW_API api->hsa_ven_amd_aqlprofile_read = (decltype(::hsa_ven_amd_aqlprofile_read)*)dlsym(handle, "hsa_ven_amd_aqlprofile_read"); -#endif api->hsa_ven_amd_aqlprofile_legacy_get_pm4 = (decltype(::hsa_ven_amd_aqlprofile_legacy_get_pm4)*)dlsym( handle, "hsa_ven_amd_aqlprofile_legacy_get_pm4"); @@ -325,7 +324,7 @@ hsa_status_t HsaRsrcFactory::LoadAqlProfileLib(aqlprofile_pfn_t* api) { const AgentInfo* HsaRsrcFactory::AddAgentInfo(const hsa_agent_t agent) { // Determine if device is a Gpu agent hsa_status_t status; - AgentInfo* agent_info = NULL; + AgentInfo* agent_info = nullptr; hsa_device_type_t type; status = hsa_api_.hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &type); @@ -339,10 +338,11 @@ const AgentInfo* HsaRsrcFactory::AddAgentInfo(const hsa_agent_t agent) { status = hsa_api_.hsa_amd_agent_iterate_memory_pools(agent, FindStandardPool, &agent_info->cpu_pool); - if ((status == HSA_STATUS_INFO_BREAK) && (cpu_pool_ == NULL)) cpu_pool_ = &agent_info->cpu_pool; + if ((status == HSA_STATUS_INFO_BREAK) && (cpu_pool_ == nullptr)) + cpu_pool_ = &agent_info->cpu_pool; status = hsa_api_.hsa_amd_agent_iterate_memory_pools(agent, FindKernArgPool, &agent_info->kern_arg_pool); - if ((status == HSA_STATUS_INFO_BREAK) && (kern_arg_pool_ == NULL)) + if ((status == HSA_STATUS_INFO_BREAK) && (kern_arg_pool_ == nullptr)) kern_arg_pool_ = &agent_info->kern_arg_pool; agent_info->gpu_pool = {}; @@ -362,7 +362,7 @@ const AgentInfo* HsaRsrcFactory::AddAgentInfo(const hsa_agent_t agent) { hsa_api_.hsa_agent_get_info(agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &agent_info->max_wave_size); hsa_api_.hsa_agent_get_info(agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &agent_info->max_queue_size); hsa_api_.hsa_agent_get_info(agent, HSA_AGENT_INFO_PROFILE, &agent_info->profile); - agent_info->is_apu = (agent_info->profile == HSA_PROFILE_FULL) ? 
true : false; + agent_info->is_apu = agent_info->profile == HSA_PROFILE_FULL; hsa_api_.hsa_agent_get_info( agent, static_cast(HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT), &agent_info->cu_num); @@ -407,7 +407,7 @@ const AgentInfo* HsaRsrcFactory::AddAgentInfo(const hsa_agent_t agent) { // Return systen agent info const AgentInfo* HsaRsrcFactory::GetAgentInfo(const hsa_agent_t agent) { - const AgentInfo* agent_info = NULL; + const AgentInfo* agent_info = nullptr; auto it = agent_map_.find(agent.handle); if (it != agent_map_.end()) { agent_info = it->second; @@ -482,8 +482,8 @@ bool HsaRsrcFactory::GetCpuAgentInfo(uint32_t idx, const AgentInfo** agent_info) bool HsaRsrcFactory::CreateQueue(const AgentInfo* agent_info, uint32_t num_pkts, hsa_queue_t** queue) { hsa_status_t status; - status = hsa_api_.hsa_queue_create(agent_info->dev_id, num_pkts, HSA_QUEUE_TYPE_MULTI, NULL, NULL, - UINT32_MAX, UINT32_MAX, queue); + status = hsa_api_.hsa_queue_create(agent_info->dev_id, num_pkts, HSA_QUEUE_TYPE_MULTI, nullptr, + nullptr, UINT32_MAX, UINT32_MAX, queue); return (status == HSA_STATUS_SUCCESS); } @@ -493,7 +493,7 @@ bool HsaRsrcFactory::CreateQueue(const AgentInfo* agent_info, uint32_t num_pkts, // @return bool true if successful, false otherwise bool HsaRsrcFactory::CreateSignal(uint32_t value, hsa_signal_t* signal) { hsa_status_t status; - status = hsa_api_.hsa_signal_create(value, 0, NULL, signal); + status = hsa_api_.hsa_signal_create(value, 0, nullptr, signal); return (status == HSA_STATUS_SUCCESS); } @@ -504,11 +504,11 @@ bool HsaRsrcFactory::CreateSignal(uint32_t value, hsa_signal_t* signal) { // @return uint8_t* Pointer to buffer, null if allocation fails. uint8_t* HsaRsrcFactory::AllocateLocalMemory(const AgentInfo* agent_info, size_t size) { hsa_status_t status = HSA_STATUS_ERROR; - uint8_t* buffer = NULL; + uint8_t* buffer = nullptr; size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; status = hsa_api_.hsa_amd_memory_pool_allocate(agent_info->gpu_pool, size, 0, reinterpret_cast(&buffer)); - uint8_t* ptr = (status == HSA_STATUS_SUCCESS) ? buffer : NULL; + uint8_t* ptr = (status == HSA_STATUS_SUCCESS) ? buffer : nullptr; return ptr; } @@ -519,7 +519,7 @@ uint8_t* HsaRsrcFactory::AllocateLocalMemory(const AgentInfo* agent_info, size_t // @return uint8_t* Pointer to buffer, null if allocation fails. uint8_t* HsaRsrcFactory::AllocateKernArgMemory(const AgentInfo* agent_info, size_t size) { hsa_status_t status = HSA_STATUS_ERROR; - uint8_t* buffer = NULL; + uint8_t* buffer = nullptr; if (!cpu_agents_.empty()) { size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; status = hsa_api_.hsa_amd_memory_pool_allocate(*kern_arg_pool_, size, 0, @@ -527,10 +527,10 @@ uint8_t* HsaRsrcFactory::AllocateKernArgMemory(const AgentInfo* agent_info, size // Both the CPU and GPU can access the kernel arguments if (status == HSA_STATUS_SUCCESS) { hsa_agent_t ag_list[1] = {agent_info->dev_id}; - status = hsa_api_.hsa_amd_agents_allow_access(1, ag_list, NULL, buffer); + status = hsa_api_.hsa_amd_agents_allow_access(1, ag_list, nullptr, buffer); } } - uint8_t* ptr = (status == HSA_STATUS_SUCCESS) ? buffer : NULL; + uint8_t* ptr = (status == HSA_STATUS_SUCCESS) ? buffer : nullptr; return ptr; } @@ -540,7 +540,7 @@ uint8_t* HsaRsrcFactory::AllocateKernArgMemory(const AgentInfo* agent_info, size // @return uint8_t* Pointer to buffer, null if allocation fails. 
uint8_t* HsaRsrcFactory::AllocateSysMemory(const AgentInfo* agent_info, size_t size) { hsa_status_t status = HSA_STATUS_ERROR; - uint8_t* buffer = NULL; + uint8_t* buffer = nullptr; size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; if (!cpu_agents_.empty()) { status = hsa_api_.hsa_amd_memory_pool_allocate(*cpu_pool_, size, 0, @@ -548,10 +548,10 @@ uint8_t* HsaRsrcFactory::AllocateSysMemory(const AgentInfo* agent_info, size_t s // Both the CPU and GPU can access the memory if (status == HSA_STATUS_SUCCESS) { hsa_agent_t ag_list[1] = {agent_info->dev_id}; - status = hsa_api_.hsa_amd_agents_allow_access(1, ag_list, NULL, buffer); + status = hsa_api_.hsa_amd_agents_allow_access(1, ag_list, nullptr, buffer); } } - uint8_t* ptr = (status == HSA_STATUS_SUCCESS) ? buffer : NULL; + uint8_t* ptr = (status == HSA_STATUS_SUCCESS) ? buffer : nullptr; return ptr; } @@ -562,8 +562,8 @@ uint8_t* HsaRsrcFactory::AllocateSysMemory(const AgentInfo* agent_info, size_t s uint8_t* HsaRsrcFactory::AllocateCmdMemory(const AgentInfo* agent_info, size_t size) { size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; uint8_t* ptr = (agent_info->is_apu && CMD_MEMORY_MMAP) - ? reinterpret_cast( - mmap(NULL, size, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_SHARED | MAP_ANONYMOUS, 0, 0)) + ? reinterpret_cast(mmap(nullptr, size, PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_SHARED | MAP_ANONYMOUS, 0, 0)) : AllocateSysMemory(agent_info, size); return ptr; } @@ -573,7 +573,7 @@ hsa_signal_value_t HsaRsrcFactory::SignalWait(const hsa_signal_t& signal, const hsa_signal_value_t& signal_value) const { const hsa_signal_value_t exp_value = signal_value - 1; hsa_signal_value_t ret_value = signal_value; - while (1) { + while (true) { ret_value = hsa_api_.hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_LT, signal_value, timeout_, HSA_WAIT_STATE_BLOCKED); if (ret_value == exp_value) break; @@ -599,9 +599,10 @@ bool HsaRsrcFactory::Memcpy(const hsa_agent_t& agent, void* dst, const void* src hsa_status_t status = HSA_STATUS_ERROR; if (!cpu_agents_.empty()) { hsa_signal_t s = {}; - status = hsa_api_.hsa_signal_create(1, 0, NULL, &s); + status = hsa_api_.hsa_signal_create(1, 0, nullptr, &s); CHECK_STATUS("hsa_signal_create()", status); - status = hsa_api_.hsa_amd_memory_async_copy(dst, cpu_agents_[0], src, agent, size, 0, NULL, s); + status = + hsa_api_.hsa_amd_memory_async_copy(dst, cpu_agents_[0], src, agent, size, 0, nullptr, s); CHECK_STATUS("hsa_amd_memory_async_copy()", status); SignalWait(s, 1); status = hsa_api_.hsa_signal_destroy(s); @@ -654,12 +655,12 @@ bool HsaRsrcFactory::LoadAndFinalize(const AgentInfo* agent_info, const char* br // Create executable. status = hsa_api_.hsa_executable_create_alt( - HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, NULL, executable); + HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, nullptr, executable); CHECK_STATUS("Error in creating executable object", status); // Load code object. status = hsa_api_.hsa_executable_load_agent_code_object(*executable, agent_info->dev_id, - code_obj_rdr, NULL, NULL); + code_obj_rdr, nullptr, nullptr); CHECK_STATUS("Error in loading executable object", status); // Freeze executable. @@ -668,8 +669,8 @@ bool HsaRsrcFactory::LoadAndFinalize(const AgentInfo* agent_info, const char* br // Get symbol handle. 
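The Memcpy() shown above is the canonical HSA copy-then-wait idiom: create a one-shot signal, issue hsa_amd_memory_async_copy with it as the completion signal, and block until it drops to zero. A stripped-down sketch under the same assumptions (error handling elided; the dst/src/agent names are illustrative):

#include <cstddef>
#include <cstdint>
#include <hsa/hsa.h>
#include <hsa/hsa_ext_amd.h>

bool CopyWithSignal(void* dst, hsa_agent_t dst_agent, const void* src, hsa_agent_t src_agent,
                    size_t size) {
  hsa_signal_t done{};
  if (hsa_signal_create(1, 0, nullptr, &done) != HSA_STATUS_SUCCESS) return false;
  // The runtime decrements the completion signal to 0 when the copy finishes.
  hsa_status_t status =
      hsa_amd_memory_async_copy(dst, dst_agent, src, src_agent, size, 0, nullptr, done);
  if (status == HSA_STATUS_SUCCESS) {
    while (hsa_signal_wait_scacquire(done, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX,
                                     HSA_WAIT_STATE_BLOCKED) != 0) {
    }
  }
  hsa_signal_destroy(done);
  return status == HSA_STATUS_SUCCESS;
}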
hsa_executable_symbol_t kernelSymbol; - status = hsa_api_.hsa_executable_get_symbol(*executable, NULL, kernel_name, agent_info->dev_id, 0, - &kernelSymbol); + status = hsa_api_.hsa_executable_get_symbol(*executable, nullptr, kernel_name, agent_info->dev_id, + 0, &kernelSymbol); CHECK_STATUS("Error in looking up kernel symbol", status); close(file_handle); @@ -799,7 +800,7 @@ hsa_status_t HsaRsrcFactory::executable_symbols_cb(hsa_executable_t exec, hsa_api_.hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME, symname); CHECK_STATUS("Error in getting kernel name", status); symname[len] = 0; - if (data == NULL) { + if (data == nullptr) { const char* name = cpp_demangle(symname); auto ret = symbols_map_->insert({addr, name}); if (ret.second == false) { @@ -816,16 +817,16 @@ hsa_status_t HsaRsrcFactory::executable_symbols_cb(hsa_executable_t exec, hsa_status_t HsaRsrcFactory::hsa_executable_freeze_interceptor(hsa_executable_t executable, const char* options) { std::lock_guard lck(mutex_); - if (symbols_map_ == NULL) symbols_map_ = new symbols_map_t; + if (symbols_map_ == nullptr) symbols_map_ = new symbols_map_t; hsa_status_t status = - hsa_api_.hsa_executable_iterate_symbols(executable, executable_symbols_cb, NULL); + hsa_api_.hsa_executable_iterate_symbols(executable, executable_symbols_cb, nullptr); CHECK_STATUS("Error in iterating executable symbols", status); return hsa_api_.hsa_executable_freeze(executable, options); } hsa_status_t HsaRsrcFactory::hsa_executable_destroy_interceptor(hsa_executable_t executable) { std::lock_guard lck(mutex_); - if (symbols_map_ != NULL) { + if (symbols_map_ != nullptr) { hsa_status_t status = hsa_api_.hsa_executable_iterate_symbols(executable, executable_symbols_cb, (void*)1); CHECK_STATUS("Error in iterating executable symbols", status); @@ -838,8 +839,8 @@ HsaRsrcFactory::mutex_t HsaRsrcFactory::mutex_; HsaRsrcFactory::timestamp_t HsaRsrcFactory::timeout_ns_ = HsaTimer::TIMESTAMP_MAX; hsa_pfn_t HsaRsrcFactory::hsa_api_{}; bool HsaRsrcFactory::executable_tracking_on_ = false; -HsaRsrcFactory::symbols_map_t* HsaRsrcFactory::symbols_map_ = NULL; -void* HsaRsrcFactory::to_dump_code_obj_ = NULL; +HsaRsrcFactory::symbols_map_t* HsaRsrcFactory::symbols_map_ = nullptr; +void* HsaRsrcFactory::to_dump_code_obj_ = nullptr; } // namespace util } // namespace rocprofiler diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 8473a421..46efbd5b 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -19,14 +19,29 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. 
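The freeze/destroy interceptors defined just above follow a plain decorator pattern: harvest the executable's kernel symbols, then forward to the real runtime entry point saved from the API table. A bare-bones sketch of the shape (real_freeze and the capture step are illustrative placeholders, not the tool's actual plumbing):

#include <hsa/hsa.h>

using hsa_executable_freeze_fn_t = hsa_status_t (*)(hsa_executable_t, const char*);

// Saved from the HsaApiTable before it is patched (illustrative).
static hsa_executable_freeze_fn_t real_freeze = nullptr;

static hsa_status_t freeze_interceptor(hsa_executable_t executable, const char* options) {
  // ... record symbols here, while the executable can still be iterated ...
  return real_freeze(executable, options);  // then delegate to the runtime
}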
################################################################################ -cmake_minimum_required(VERSION 3.16.0) +cmake_minimum_required(VERSION 3.18.0 FATAL_ERROR) set(EXE_NAME "rocprof-ctrl") +# Temporary until the independent build of tests that exists on the Jenkins side is removed if(NOT DEFINED TEST_DIR) set(TEST_DIR ${CMAKE_CURRENT_SOURCE_DIR}) project(${EXE_NAME} C CXX) # Set build environment - include(env) + list(INSERT CMAKE_MODULE_PATH 0 "${CMAKE_CURRENT_SOURCE_DIR}/../cmake_modules") + include(rocprofiler_options) + include(rocprofiler_utils) + include(rocprofiler_env) + include(rocprofiler_formatting) + include(rocprofiler_linting) + find_package(Threads REQUIRED) + find_package( + hsa-runtime64 REQUIRED CONFIG + HINTS ${CMAKE_INSTALL_PREFIX} + PATHS ${ROCM_PATH}) + find_package( + HIP REQUIRED CONFIG + HINTS ${CMAKE_INSTALL_PREFIX} + PATHS ${ROCM_PATH}) endif() set(THREADS_PREFER_PTHREAD_FLAG ON) @@ -136,8 +151,12 @@ foreach(target_id ${GPU_LIST}) generate_hsaco(${target_id} ${TEST_DIR}/${DUMMY_NAME}/${DUMMY_NAME}.cl ${target_id}_DummyKernel.hsaco) endforeach(target_id) -add_custom_target(test DEPENDS ${HSACO_TARGET_LIST}) -add_custom_target(mytest DEPENDS ${HSACO_TARGET_LIST}) +# +# NOTE (jomadsen): cannot create a target named "test" because it is a reserved target name +# -- cmake provides the "test" target to run tests +# +# add_custom_target(test DEPENDS ${HSACO_TARGET_LIST}) +add_custom_target(mytest ALL DEPENDS ${HSACO_TARGET_LIST}) add_custom_command( TARGET mytest POST_BUILD DEPENDS mytest @@ -183,9 +202,11 @@ target_include_directories(${EXE_NAME} PRIVATE ${TEST_DIR} ${ROOT_DIR} ${PROJECT_SOURCE_DIR}/include) target_link_libraries(${EXE_NAME} hsa-runtime64::hsa-runtime64 hsakmt::hsakmt Threads::Threads stdc++fs dl) -execute_process(COMMAND sh -xc "cp ${TEST_DIR}/run.sh ${PROJECT_BINARY_DIR}") -execute_process(COMMAND sh -xc "cp ${TEST_DIR}/tool/*.xml ${PROJECT_BINARY_DIR}") -execute_process(COMMAND sh -xc "mkdir -p ${PROJECT_BINARY_DIR}/RESULTS") + +file(GLOB XML_FILES "${TEST_DIR}/tool/*.xml") +configure_file(${TEST_DIR}/run.sh ${PROJECT_BINARY_DIR}/run.sh COPYONLY) +execute_process(COMMAND ${CMAKE_COMMAND} -E copy ${XML_FILES} ${PROJECT_BINARY_DIR}/) + # TODO(aelwazir): Should be replaced by the current location in the main CMakeLists.txt install( TARGETS ${EXE_NAME} @@ -219,14 +240,14 @@ install( DESTINATION ${CMAKE_INSTALL_LIBDIR}/${ROCPROFILER_NAME} COMPONENT asan) -# Build memory test bench -add_custom_target( - mbench - COMMAND sh -xc "cp -r ${TEST_DIR}/memory_validation ${PROJECT_BINARY_DIR}/test/." 
- COMMAND make -C "${PROJECT_BINARY_DIR}/test/memory_validation") +add_test(NAME v1-tests COMMAND ${PROJECT_BINARY_DIR}/run.sh) +set_tests_properties( + v1-tests PROPERTIES LABELS "v1" ENVIRONMENT "${ROCPROFILER_MEMCHECK_PRELOAD_ENV}" + RUN_SERIAL TRUE) # Copy OCL test -execute_process(COMMAND sh -xc "cp -r ${TEST_DIR}/ocl ${PROJECT_BINARY_DIR}/test/.") +execute_process(COMMAND ${CMAKE_COMMAND} -E copy_directory ${TEST_DIR}/ocl + ${PROJECT_BINARY_DIR}/test/ocl) install( DIRECTORY ${TEST_DIR}/ocl DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/${PROJECT_NAME}/tests-v1/test diff --git a/test/app/test.cpp b/test/app/test.cpp index e54bec82..f102d8fb 100644 --- a/test/app/test.cpp +++ b/test/app/test.cpp @@ -50,8 +50,8 @@ void thread_fun(const int kiter, const int diter, const uint32_t agents_number) for (int i = 0; i < kiter; ++i) { for (uint32_t n = 0; n < agents_number; ++n) { - // RunKernel(0, NULL, agent_info[n], queue[n], diter); - RunKernel(0, NULL, agent_info[n], queue[n], diter); + RunKernel(0, NULL, agent_info[n], queue[n], diter); + // RunKernel(0, NULL, agent_info[n], queue[n], diter); } } diff --git a/test/util/hsa_rsrc_factory.cpp b/test/util/hsa_rsrc_factory.cpp index fddf77e2..0a44d182 100644 --- a/test/util/hsa_rsrc_factory.cpp +++ b/test/util/hsa_rsrc_factory.cpp @@ -299,10 +299,8 @@ hsa_status_t HsaRsrcFactory::LoadAqlProfileLib(aqlprofile_pfn_t* api) { (decltype(::hsa_ven_amd_aqlprofile_start)*)dlsym(handle, "hsa_ven_amd_aqlprofile_start"); api->hsa_ven_amd_aqlprofile_stop = (decltype(::hsa_ven_amd_aqlprofile_stop)*)dlsym(handle, "hsa_ven_amd_aqlprofile_stop"); -#ifdef AQLPROF_NEW_API api->hsa_ven_amd_aqlprofile_read = (decltype(::hsa_ven_amd_aqlprofile_read)*)dlsym(handle, "hsa_ven_amd_aqlprofile_read"); -#endif api->hsa_ven_amd_aqlprofile_legacy_get_pm4 = (decltype(::hsa_ven_amd_aqlprofile_legacy_get_pm4)*)dlsym( handle, "hsa_ven_amd_aqlprofile_legacy_get_pm4"); diff --git a/tests-v2/featuretests/gtests_main.cpp b/tests-v2/featuretests/gtests_main.cpp index a861d36b..1d20fe2b 100644 --- a/tests-v2/featuretests/gtests_main.cpp +++ b/tests-v2/featuretests/gtests_main.cpp @@ -1,30 +1,25 @@ #include <gtest/gtest.h> +#include <hsa/hsa.h> + #include "src/core/hardware/hsa_info.h" int main(int argc, char** argv) { - testing::InitGoogleTest(&argc, argv); testing::FLAGS_gtest_death_test_style = "threadsafe"; - // Add line below to disable any problematic test - hsa_init(); - testing::GTEST_FLAG(filter) = - "-OpenMPTest.*:ProfilerSPMTest.*:ProfilerMQTest.*:ProfilerMPTest.*:MPITest.*"; // Disable ATT test fir gfx10 GPUs until its supported - // iterate for gpu's - hsa_iterate_agents( - [](hsa_agent_t agent, void*) { - char gpu_name[64]; - hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, gpu_name); - std::string gfx_name = gpu_name; - if (gfx_name.find("gfx10") != std::string::npos) { - testing::GTEST_FLAG(filter) = - "-ATTCollection.*:OpenMPTest.*:ProfilerSPMTest*:ProfilerMQTest.*:*ProfilerMPTest.*:" - "MPITest.*"; - } - return HSA_STATUS_SUCCESS; - }, - nullptr); -// Append filter above to disable any problematic test - int res = RUN_ALL_TESTS(); - hsa_shut_down(); - return res; + // scan the raw command-line arguments before InitGoogleTest() consumes them, and + // skip HSA initialization when the run only lists tests or prints help + bool skipInit = false; + for (int i = 0; i < argc; i++) { + if (std::string_view("--gtest_list_tests").compare(argv[i]) == 0 || + std::string_view("-h").compare(argv[i]) == 0 || + std::string_view("--help").compare(argv[i]) == 0) { + skipInit = true; + break; + } + } + if (!skipInit) hsa_init(); +
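The argument scan guarding hsa_init() above is just a linear flag lookup; expressed as a helper it reads as follows (has_flag is an illustrative name, not part of the test suite):

#include <string_view>

static bool has_flag(int argc, char** argv, std::string_view flag) {
  for (int i = 0; i < argc; ++i)
    if (flag == argv[i]) return true;  // string_view compares by content, not pointer
  return false;
}

// bool skipInit = has_flag(argc, argv, "--gtest_list_tests") ||
//                 has_flag(argc, argv, "-h") || has_flag(argc, argv, "--help");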
+ testing::InitGoogleTest(&argc, argv); + // hsa_shut_down(); // waiting for the hsa_shut_down bug to be fixed + // append a --gtest_filter above to disable any problematic test + return RUN_ALL_TESTS(); } diff --git a/tests-v2/featuretests/profiler/CMakeLists.txt b/tests-v2/featuretests/profiler/CMakeLists.txt index 125aaac4..4296ccf3 100644 --- a/tests-v2/featuretests/profiler/CMakeLists.txt +++ b/tests-v2/featuretests/profiler/CMakeLists.txt @@ -28,6 +28,9 @@ set(CMAKE_EXECUTABLE_RPATH_LINK_HIP_FLAG ${CMAKE_SHARED_LIBRARY_RPATH_LINK_CXX_F set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${ROCM_PATH}/lib/cmake/hip") set(CMAKE_HIP_ARCHITECTURES OFF) +if(NOT DEFINED HIP_ROOT_DIR) + set(HIP_ROOT_DIR "${CMAKE_INSTALL_PREFIX}") +endif() find_package(HIP REQUIRED MODULE) # Enable CLANG-TIDY for static analysis search for clang-tidy @@ -36,6 +39,28 @@ if(CLANG_TIDY_EXE) set(CMAKE_CXX_CLANG_TIDY ${CLANG_TIDY_EXE}; -format-style='file'; -header-filter=${CMAKE_CURRENT_SOURCE_DIR};) endif() + +# ######################################################################################## + +function(rocprofiler_featuretests_profiler_add_test _TARGET) + if(TARGET ${_TARGET}) + if(NOT TEST ${_TARGET}) + add_test( + NAME ${_TARGET} + COMMAND $<TARGET_FILE:${_TARGET}> + WORKING_DIRECTORY ${PROJECT_BINARY_DIR}) + endif() + set_tests_properties( + ${_TARGET} PROPERTIES LABELS "featuretests;profiler" ENVIRONMENT + "${ROCPROFILER_MEMCHECK_PRELOAD_ENV}" ${ARGN}) + endif() +endfunction() + +function(rocprofiler_featuretests_profiler_add_executable _TARGET) + hip_add_executable(${_TARGET} ${ARGN}) + rocprofiler_featuretests_profiler_add_test(${_TARGET}) +endfunction() + # ######################################################################################## # App Based FeatureTests # ######################################################################################## @@ -53,7 +78,7 @@ endforeach() # Compile Applications hip_helloworld set_source_files_properties(apps/hello_world_hip.cpp PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) -hip_add_executable(hip_helloworld apps/hello_world_hip.cpp) +rocprofiler_featuretests_profiler_add_executable(hip_helloworld apps/hello_world_hip.cpp) set_target_properties( hip_helloworld PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/tests-v2/featuretests/profiler/apps") @@ -68,7 +93,7 @@ install( # hip_vectoradd set_source_files_properties(apps/vector_add_hip.cpp PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) -hip_add_executable(hip_vectoradd apps/vector_add_hip.cpp) +rocprofiler_featuretests_profiler_add_executable(hip_vectoradd apps/vector_add_hip.cpp) set_target_properties( hip_vectoradd PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/tests-v2/featuretests/profiler/apps") @@ -81,12 +106,22 @@ install( COMPONENT tests) # mpi_vectoradd +add_library(rocprofiler-tests-mpi INTERFACE) +add_library(rocprofiler::tests-mpi ALIAS rocprofiler-tests-mpi) + find_package(MPI) if(MPI_CXX_FOUND) - include_directories(SYSTEM ${MPI_INCLUDE_PATH}) + set(USE_MPI 1) + target_compile_definitions(rocprofiler-tests-mpi INTERFACE USE_MPI=1) + target_link_libraries(rocprofiler-tests-mpi INTERFACE stdc++fs ${MPI_C_LIBRARIES}) + target_include_directories(rocprofiler-tests-mpi INTERFACE ${MPI_INCLUDE_PATH}) +endif() + +if(USE_MPI) set_source_files_properties(apps/vector_add_mpi.cpp PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) - hip_add_executable(mpi_vectoradd apps/vector_add_mpi.cpp) + rocprofiler_featuretests_profiler_add_executable(mpi_vectoradd + apps/vector_add_mpi.cpp) set_target_properties( mpi_vectoradd PROPERTIES 
RUNTIME_OUTPUT_DIRECTORY @@ -98,24 +133,50 @@ if(MPI_CXX_FOUND) DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/${PROJECT_NAME}/tests/featuretests/profiler/apps COMPONENT tests) - target_link_libraries(mpi_vectoradd ${MPI_C_LIBRARIES} stdc++fs) + target_link_libraries(mpi_vectoradd PRIVATE rocprofiler::tests-mpi) +endif() + +# openmp_helloworld +add_library(rocprofiler-tests-openmp INTERFACE) +add_library(rocprofiler::tests-openmp ALIAS rocprofiler-tests-openmp) + +find_package(OpenMP) +if(OpenMP_CXX_FOUND) + set(USE_OpenMP 1) + target_compile_definitions(rocprofiler-tests-openmp INTERFACE USE_OpenMP=1) + target_link_libraries(rocprofiler-tests-openmp INTERFACE OpenMP::OpenMP_CXX + hip::device) +elseif() + message(WARNING "OpenMP not found") + unset(USE_OpenMP) endif() -# openmp_helloworld find_package(hip REQUIRED) find_package(OpenMP) if(OpenMP_CXX_FOUND) # -# Source files. set_source_files_properties(gtests/apps/openmp/hello_world.cpp PROPERTIES -# HIP_SOURCE_PROPERTY_FORMAT 1) hip_add_executable(openmp_helloworld -# gtests/apps/openmp/hello_world.cpp) set_target_properties(openmp_helloworld PROPERTIES -# RUNTIME_OUTPUT_DIRECTORY -# "${PROJECT_BINARY_DIR}/tests-v2/featuretests/profiler/gtests/apps") +# if(USE_OpenMP) +# set_source_files_properties(apps/hello_world_omp.cpp +# PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) +# rocprofiler_featuretests_profiler_add_executable(openmp_helloworld +# apps/hello_world_omp.cpp) +# set_target_properties( +# openmp_helloworld +# PROPERTIES RUNTIME_OUTPUT_DIRECTORY +# "${PROJECT_BINARY_DIR}/tests-v2/featuretests/profiler/apps") + +# target_link_options(openmp_helloworld PRIVATE "-Wl,--build-id=md5") +# target_link_libraries(openmp_helloworld PRIVATE rocprofiler::tests-openmp) -# # Link Libraries - HIP Device and OpenMP. target_compile_options(openmp_helloworld -# PRIVATE ${OpenMP_CXX_FLAGS}) target_link_libraries(openmp_helloworld PRIVATE hip::device -# ${OpenMP_CXX_FLAGS}) endif() +# install( +# TARGETS openmp_helloworld +# RUNTIME +# DESTINATION +# ${CMAKE_INSTALL_DATAROOTDIR}/${PROJECT_NAME}/tests/featuretests/profiler/apps +# COMPONENT tests) +# endif() # hsa-mem_async_copy -- Not Enabled for Now set_source_files_properties(apps/async_mem_copy.cpp PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) -hip_add_executable(hsa_async_mem_copy apps/async_mem_copy.cpp) +rocprofiler_featuretests_profiler_add_executable(hsa_async_mem_copy + apps/async_mem_copy.cpp) set_target_properties( hsa_async_mem_copy PROPERTIES RUNTIME_OUTPUT_DIRECTORY @@ -147,8 +208,8 @@ file(GLOB GTEST_MAIN_SRC_FILE ${GTEST_MAIN_DIR}/gtests_main.cpp) set_source_files_properties(apps/multithreaded_testapp.cpp PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) -hip_add_executable(multithreaded_testapp apps/multithreaded_testapp.cpp - ../utils/test_utils.cpp) +rocprofiler_featuretests_profiler_add_executable( + multithreaded_testapp apps/multithreaded_testapp.cpp ../utils/test_utils.cpp) target_include_directories( multithreaded_testapp PRIVATE ${PROJECT_SOURCE_DIR} @@ -226,18 +287,20 @@ install( ${CMAKE_INSTALL_DATAROOTDIR}/${PROJECT_NAME}/tests/featuretests/profiler/apps COMPONENT tests) -# add_executable(profiler_multiqueue_test discretetests/binary/multiqueue_test.cpp -# utils/csv_parser.cpp utils/test_utils.cpp) -# target_include_directories(profiler_multiqueue_test PRIVATE ${PROJECT_SOURCE_DIR} -# ${PROJECT_SOURCE_DIR}/tests-v2/featuretests/profiler) -# target_link_libraries(profiler_multiqueue_test PRIVATE hsa-runtime64::hsa-runtime64 -# Threads::Threads dl stdc++fs amd_comgr) add_dependencies(tests 
profiler_multiqueue_test) +set(runFeatureTests_SOURCES + profiler_gtest.cpp apps/hip_kernels.cpp ${GTEST_MAIN_SRC_FILE} ${CORE_HSA_SRC_FILES} + ${CORE_HW_SRC_FILES} ${CORE_UTILS_SRC_FILES} ${TEST_UTILS_SRC_FILES}) set_source_files_properties(apps/hip_kernels.cpp PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) -hip_add_executable( - runFeatureTests profiler_gtest.cpp apps/hip_kernels.cpp ${GTEST_MAIN_SRC_FILE} - ${CORE_HSA_SRC_FILES} ${CORE_HW_SRC_FILES} ${CORE_UTILS_SRC_FILES} - ${TEST_UTILS_SRC_FILES}) +hip_add_executable(runFeatureTests ${runFeatureTests_SOURCES}) + +# link MPI and OpenMP to runFeatureTests for visibility +if(USE_MPI) + target_compile_definitions(runFeatureTests PRIVATE USE_MPI=1) +endif() +if(USE_OpenMP) + target_compile_definitions(runFeatureTests PRIVATE USE_OpenMP=1) +endif() target_include_directories( runFeatureTests @@ -253,7 +316,40 @@ target_link_options(runFeatureTests PRIVATE "-Wl,--build-id=md5") install(TARGETS runFeatureTests RUNTIME DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/${PROJECT_NAME}/tests COMPONENT tests) -add_test(AllTests runFeatureTests) + +# add_test(AllTests runFeatureTests) +include(GoogleTest) + +set(GTEST_DISCOVER_TESTS_TARGET runFeatureTests) +set(GTEST_DISCOVER_TESTS_LABELS "v2" "featuretests") +set(GTEST_DISCOVER_TESTS_ENVIRONMENT ${ROCPROFILER_MEMCHECK_PRELOAD_ENV}) +configure_file( + ${PROJECT_SOURCE_DIR}/cmake_modules/Templates/gtest_discover_tests_properties.cmake.in + ${CMAKE_CURRENT_BINARY_DIR}/runFeatureTests_TestProperties.cmake @ONLY) + +# we cannot set LD_PRELOAD during test discovery, but test discovery displays disabled tests +# in CDash, so the gtest_add_tests branch below is the work-around +# the gtest_discover_tests branch is commented out temporarily until it is fixed on the Jenkins dockers +# if(NOT ROCPROFILER_MEMCHECK MATCHES "(Thread|Address)Sanitizer") +# gtest_discover_tests(runFeatureTests) + +# set_property( +# DIRECTORY ${CMAKE_CURRENT_LIST_DIR} +# APPEND +# PROPERTY TEST_INCLUDE_FILES +# ${CMAKE_CURRENT_BINARY_DIR}/runFeatureTests_TestProperties.cmake) +# else() + gtest_add_tests( + TARGET runFeatureTests + SOURCES "${runFeatureTests_SOURCES}" + TEST_LIST runFeatureTests_TESTS) + include(${CMAKE_CURRENT_BINARY_DIR}/runFeatureTests_TestProperties.cmake) +# endif() + +# for the *_FilePlugin tests
if(NOT EXISTS "${PROJECT_BINARY_DIR}/test-output") + file(MAKE_DIRECTORY "${PROJECT_BINARY_DIR}/test-output") +endif() # Copy scripts, input files to samples folder configure_file(${CMAKE_CURRENT_SOURCE_DIR}/apps/goldentraces/basic_metrics.txt @@ -268,3 +364,21 @@ install( DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/${PROJECT_NAME}/tests/featuretests/profiler/apps/goldentraces COMPONENT tests) + +find_package( + Python3 + COMPONENTS Interpreter + REQUIRED) + + execute_process( + COMMAND ${Python3_EXECUTABLE} -c "import pandas" + RESULT_VARIABLE PANDAS_HEADER_PARSER + OUTPUT_QUIET) + if(NOT ${PANDAS_HEADER_PARSER} EQUAL 0) + message( + "The \"pandas\" Python3 package is not installed. 
\ + Please install it using the following command: \"${Python3_EXECUTABLE} -m pip install pandas\".") + else() + # cmake based tests + include(${CMAKE_CURRENT_LIST_DIR}/counter_validation_tests.cmake) + endif() diff --git a/tests-v2/featuretests/profiler/apps/goldentraces/mpi_vectoradd_golden_traces.txt b/tests-v2/featuretests/profiler/apps/goldentraces/mpi_vectoradd_golden_traces.txt index a8c4edda..59107d0a 100755 --- a/tests-v2/featuretests/profiler/apps/goldentraces/mpi_vectoradd_golden_traces.txt +++ b/tests-v2/featuretests/profiler/apps/goldentraces/mpi_vectoradd_golden_traces.txt @@ -4,14 +4,7 @@ Enabling Counter Collection ROCProfilerV2: Collecting the following counters: - GRBM_COUNT Enabling Counter Collection -ROCProfilerV2: Collecting the following counters: -- GRBM_COUNT -Enabling Counter Collection -device count and rank is8: 2 -Rank Id: 0 | Device Id : 0 | Num Devices: 8 -device count and rank is8: 2 -Rank Id: 1 | Device Id : 1 | Num Devices: 8 -Max error: 0.000000 +device count and rank is1: 1 +Rank Id: 0 | Device Id : 0 | Num Devices: 1 Max error: 0.000000 -Dispatch_ID(1), GPU_ID(5), Queue_ID(1), Queue_Index(0), Process_ID(2185441), Thread_ID(2185441), Grid_Size(1048576), Workgroup_Size(256), LDS(0), Scratch_Size(0), Arch_VGPR(12), Accumulative_VGPR(4), SGPR(32), Wave_Size(64), Kernel_Name("add"), Begin_Timestamp(139857691152944), End_Timestamp(139857835223272), Correlation_ID(0), GRBM_COUNT(499551.000000) -Dispatch_ID(1), GPU_ID(4), Queue_ID(1), Queue_Index(0), Process_ID(2185436), Thread_ID(2185436), Grid_Size(1048576), Workgroup_Size(256), LDS(0), Scratch_Size(0), Arch_VGPR(12), Accumulative_VGPR(4), SGPR(32), Wave_Size(64), Kernel_Name("add"), Begin_Timestamp(140429257347632), End_Timestamp(140429483317480), Correlation_ID(0), GRBM_COUNT(499406.000000) +Dispatch_ID(1), GPU_ID(1), Queue_ID(1), Queue_Index(0), Process_ID(6293), Thread_ID(6293), Grid_Size(1048576), Workgroup_Size(256), LDS(0), Scratch_Size(0), Arch_VGPR(12), Accumulative_VGPR(0), SGPR(32), Wave_Size(64), Kernel_Name("add"), Begin_Timestamp(140016470724832), End_Timestamp(5), Correlation_ID(0), GRBM_COUNT(1108537.000000) diff --git a/tests-v2/featuretests/profiler/apps/goldentraces/openmp_helloworld_golden_traces.txt b/tests-v2/featuretests/profiler/apps/goldentraces/openmp_helloworld_golden_traces.txt index e0007548..5333b219 100755 --- a/tests-v2/featuretests/profiler/apps/goldentraces/openmp_helloworld_golden_traces.txt +++ b/tests-v2/featuretests/profiler/apps/goldentraces/openmp_helloworld_golden_traces.txt @@ -1,4 +1,5 @@ ROCProfilerV2: Collecting the following counters: - GRBM_COUNT Enabling Counter Collection -Dispatch_ID(1), GPU_ID(4), Queue_ID(1), Queue_Index(1), Process_ID(2186189), Thread_ID(2186189), Grid_Size(10), Workgroup_Size(10), LDS(0), Scratch_Size(0), Arch_VGPR(8), Accumulative_VGPR(0), SGPR(16), Wave_Size(64), Kernel_Name("helloworld"), Begin_Timestamp(140284033765472), End_Timestamp(140288419293408), Correlation_ID(0), GRBM_COUNT(13839.000000) +PASSED! 
+Dispatch_ID(1), GPU_ID(1), Queue_ID(1), Queue_Index(0), Process_ID(11822), Thread_ID(11822), Grid_Size(1), Workgroup_Size(1), LDS(0), Scratch_Size(0), Arch_VGPR(4), Accumulative_VGPR(0), SGPR(16), Wave_Size(64), Kernel_Name("hip_helloworld"), Begin_Timestamp(140470675179888), End_Timestamp(140470675179776), Correlation_ID(0), GRBM_COUNT(22315.000000) \ No newline at end of file diff --git a/tests-v2/featuretests/profiler/counter_validation_tests.cmake b/tests-v2/featuretests/profiler/counter_validation_tests.cmake new file mode 100644 index 00000000..ff319780 --- /dev/null +++ b/tests-v2/featuretests/profiler/counter_validation_tests.cmake @@ -0,0 +1,32 @@ +# counter validation test - GRBM_COUNT +add_test( + NAME grbm_count_helloworld_test + COMMAND + ${PROJECT_BINARY_DIR}/rocprofv2 -i + ${PROJECT_BINARY_DIR}/tests-v2/featuretests/profiler/apps/input.txt -d + ${PROJECT_BINARY_DIR}/out-grbm_count -o grbm + tests-v2/featuretests/profiler/apps/hip_helloworld + WORKING_DIRECTORY "${PROJECT_BINARY_DIR}") + +set_tests_properties( + grbm_count_helloworld_test PROPERTIES LABELS "v2;rocprofv2" ENVIRONMENT + "${ROCPROFILER_MEMCHECK_PRELOAD_ENV}") + +add_test( + NAME grbm_count_helloworld_test_validation + COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/grbm_validate.py + "out-grbm_count/pmc_1/results_grbm.csv" + WORKING_DIRECTORY "${PROJECT_BINARY_DIR}") + +set_tests_properties( + grbm_count_helloworld_test_validation + PROPERTIES DEPENDS + grbm_count_helloworld_test + LABELS + "v2;validation" + PASS_REGULAR_EXPRESSION + "Test Passed" + FAIL_REGULAR_EXPRESSION + "Test Failed" + SKIP_REGULAR_EXPRESSION + "Skipped") diff --git a/tests-v2/featuretests/profiler/grbm_validate.py b/tests-v2/featuretests/profiler/grbm_validate.py new file mode 100644 index 00000000..05fdd4b2 --- /dev/null +++ b/tests-v2/featuretests/profiler/grbm_validate.py @@ -0,0 +1,27 @@ +import pandas as pd +import sys + + +def validate_grbm_count(filename): + df = pd.read_csv(filename) + + grbm_count = df.loc[0, "GRBM_COUNT"] + + # Validate the data + if grbm_count >= 0: + print("Test Passed: grbm count is valid.") + return 0 + else: + print("Test Failed: grbm count is not valid.") + return 1 + + +if __name__ == "__main__": + files = sys.argv[1:] + if not files: + raise RuntimeError("no input files provided") + for filename in files: + ec = validate_grbm_count(filename) + if ec != 0: + sys.stderr.write(f"{filename} did not pass validation\n") + sys.exit(ec) diff --git a/tests-v2/featuretests/profiler/profiler_gtest.cpp b/tests-v2/featuretests/profiler/profiler_gtest.cpp index b0848b20..8461e0cf 100644 --- a/tests-v2/featuretests/profiler/profiler_gtest.cpp +++ b/tests-v2/featuretests/profiler/profiler_gtest.cpp @@ -99,6 +99,10 @@ void ApplicationParser::SetApplicationEnv(const char* app_name) { setenv("COUNTERS_PATH", counter_path.str().c_str(), true); std::stringstream hsa_tools_lib_path; + auto _existing_ld_preload = getenv("LD_PRELOAD"); + if (_existing_ld_preload && strnlen(_existing_ld_preload, 1) > 0) + hsa_tools_lib_path << _existing_ld_preload << ":"; + hsa_tools_lib_path << app_path << lib_path; setenv("LD_PRELOAD", hsa_tools_lib_path.str().c_str(), true); @@ -247,9 +251,10 @@ TEST_F(HelloWorldTest, WhenRunningProfilerWithAppThenKernelNamessMatchWithGolden std::vector current_kernel_info; GetKernelInfoForRunningApplication(&current_kernel_info); - ASSERT_TRUE(current_kernel_info.size()); - ASSERT_TRUE(golden_kernel_info.size()); - EXPECT_EQ(golden_kernel_info[0].kernel_name, current_kernel_info[0].kernel_name); + 
ASSERT_EQ(golden_kernel_info.size(), current_kernel_info.size()); + for (size_t i = 0; i < current_kernel_info.size(); ++i) { + EXPECT_EQ(golden_kernel_info[i].kernel_name, current_kernel_info[i].kernel_name) << "i=" << i; + } } // Test:3 Compares order of kernel-names in golden output against current @@ -380,143 +385,106 @@ TEST_F(HSATest, WhenRunningProfilerWithAppThenKernelNumbersMatchWithGoldenOutput * ############ OpenMP Tests ################ * ################################################### */ - -class OpenMPTest : public ProfilerTest { - protected: - std::vector golden_kernel_info; - void SetUp() { - ProfilerTest::SetUp("openmp_helloworld"); - GetKernelInfoForGoldenOutput("openmp_helloworld", kGoldenOutputOpenMP, &golden_kernel_info); - } -}; - -// Test:1 Compares total num of kernel-names in golden output against current -// profiler output -TEST_F(OpenMPTest, WhenRunningProfilerWithAppThenKernelNumbersMatchWithGoldenOutput) { - std::vector current_kernel_info; - - GetKernelInfoForRunningApplication(¤t_kernel_info); - ASSERT_TRUE(current_kernel_info.size()); - - EXPECT_EQ(golden_kernel_info.size(), current_kernel_info.size()); -} - -// Test:2 Compares order of kernel-names in golden output against current -// profiler output -TEST_F(OpenMPTest, WhenRunningProfilerWithAppThenKernelNamessMatchWithGoldenOutput) { - std::vector current_kernel_info; - - GetKernelInfoForRunningApplication(¤t_kernel_info); - ASSERT_TRUE(current_kernel_info.size()); - - EXPECT_EQ(golden_kernel_info[0].kernel_name, current_kernel_info[0].kernel_name); - EXPECT_EQ(golden_kernel_info[1].kernel_name, current_kernel_info[1].kernel_name); -} - -// Test:3 Compares order of kernel-names in golden output against current -// profiler output -TEST_F(OpenMPTest, WhenRunningProfilerWithAppThenKernelDurationShouldBePositive) { - // kernel info in current profiler run - std::vector current_kernel_info; - - GetKernelInfoForRunningApplication(¤t_kernel_info); - ASSERT_TRUE(current_kernel_info.size()); - - EXPECT_GT(current_kernel_info.size(), 0); -} - -// Test:4 Compares end-time is greater than start-time in current -// profiler output -TEST_F(OpenMPTest, WhenRunningProfilerWithAppThenEndTimeIsGreaterThenStartTime) { - // kernel info in current profiler run - std::vector current_kernel_info; - - GetKernelInfoForRunningApplication(¤t_kernel_info); - ASSERT_TRUE(current_kernel_info.size()); - - for (auto& itr : current_kernel_info) { - if (!(itr.end_time).empty()) { - EXPECT_GT(itr.end_time, itr.begin_time); - } - } -} - +// #ifdef USE_OpenMP +// class OpenMPTest : public ProfilerTest { +// protected: +// std::vector golden_kernel_info; +// void SetUp() { +// ProfilerTest::SetUp("openmp_helloworld"); +// GetKernelInfoForGoldenOutput("openmp_helloworld", kGoldenOutputOpenMP, &golden_kernel_info); +// } +// }; + +// // Test:1 Compares total num of kernel-names in golden output against current +// // profiler output +// TEST_F(OpenMPTest, WhenRunningProfilerWithAppThenKernelNumbersMatchWithGoldenOutput) { +// std::vector current_kernel_info; + +// GetKernelInfoForRunningApplication(¤t_kernel_info); +// ASSERT_TRUE(current_kernel_info.size()); + +// EXPECT_EQ(golden_kernel_info.size(), current_kernel_info.size()); +// } + +// // Test:2 Compares order of kernel-names in golden output against current +// // profiler output +// TEST_F(OpenMPTest, WhenRunningProfilerWithAppThenKernelNamesMatchWithGoldenOutput) { +// std::vector current_kernel_info; + +// GetKernelInfoForRunningApplication(¤t_kernel_info); +// 
ASSERT_TRUE(current_kernel_info.size()); + +// EXPECT_EQ(golden_kernel_info[0].kernel_name, current_kernel_info[0].kernel_name); +// } + +// // Test:3 Compares order of kernel-names in golden output against current +// // profiler output +// TEST_F(OpenMPTest, WhenRunningProfilerWithAppThenKernelDurationShouldBePositive) { +// // kernel info in current profiler run +// std::vector current_kernel_info; + +// GetKernelInfoForRunningApplication(¤t_kernel_info); +// ASSERT_TRUE(current_kernel_info.size()); + +// EXPECT_GT(current_kernel_info.size(), 0); +// } + +// // Test:4 Compares end-time is greater than start-time in current +// // profiler output +// TEST_F(OpenMPTest, WhenRunningProfilerWithAppThenEndTimeIsGreaterThenStartTime) { +// // kernel info in current profiler run +// std::vector current_kernel_info; + +// GetKernelInfoForRunningApplication(¤t_kernel_info); +// ASSERT_TRUE(current_kernel_info.size()); + +// for (auto& itr : current_kernel_info) { +// if (!(itr.end_time).empty()) { +// EXPECT_GT(itr.end_time, itr.begin_time); +// } +// } +// } +// #endif /* * ################################################### * ############ MPI Tests ################ * ################################################### */ - +#ifdef USE_MPI class MPITest : public ProfilerTest { protected: void ProcessMPIApplication(const char* app_name); void ExecuteAndParseApplication(std::stringstream& ss); + std::vector golden_kernel_info; void SetUp() { /*To supress No protocol found prints*/ setenv("HWLOC_COMPONENTS", "-gl", 1); - - // run as standalone test ProfilerTest::SetUp("mpi_vectoradd"); - - // run mpirun script - // ProcessMPIApplication("mpi_run.sh"); + GetKernelInfoForGoldenOutput("mpi_vectoradd", kGoldenOutputMpi, &golden_kernel_info); } - - /*virtual void TearDown() override { - unsetenv("HWLOC_COMPONENTS"); - unsetenv("LD_PRELOAD"); - ProfilerTest::TearDown(); - }*/ }; -void MPITest::ProcessMPIApplication(const char* app_name) { - std::string app_path = GetRunningPath(running_path); - std::string lib_path = app_path; - - std::stringstream hsa_tools_lib_path; - - hsa_tools_lib_path << app_path << "librocprofiler_tool.so"; - setenv("LD_PRELOAD", hsa_tools_lib_path.str().c_str(), true); - - std::stringstream os; - os << app_path << "tests/featuretests/profiler/apps/" << app_name; - ExecuteAndParseApplication(os); -} - -void MPITest::ExecuteAndParseApplication(std::stringstream& ss) { - FILE* handle = popen(ss.str().c_str(), "r"); - ASSERT_NE(handle, nullptr); - char* ln{NULL}; - std::string temp{""}; - size_t len{0}; - - while (getline(&ln, &len, handle) != -1) { - temp = temp + std::string(ln); - } +// Test:1 if kernel-name exists in current profiler output +TEST_F(MPITest, WhenRunningProfilerWithAppThenKernelNumbersOutputGenerated) { + std::vector current_kernel_info; - free(ln); - size_t pos{0}; - std::string delimiter{"\n"}; - while ((pos = temp.find(delimiter)) != std::string::npos) { - output_lines.push_back(temp.substr(0, pos)); - temp.erase(0, pos + delimiter.length()); - } + GetKernelInfoForRunningApplication(¤t_kernel_info); + ASSERT_TRUE(current_kernel_info.size()); - pclose(handle); + EXPECT_GT(current_kernel_info.size(), 0); } -// Test:1 Compares total num of kernel-names in golden output against current -// profiler output -TEST_F(MPITest, WhenRunningProfilerWithAppThenKernelNumbersMatchWithGoldenOutput) { +// Test:1 if kernel-name matches with golden output +TEST_F(MPITest, WhenRunningProfilerWithAppThenKernelNameMatchWithGoldenOutput) { std::vector current_kernel_info; 
GetKernelInfoForRunningApplication(¤t_kernel_info); ASSERT_TRUE(current_kernel_info.size()); - EXPECT_GT(current_kernel_info.size(), 0); + EXPECT_EQ(golden_kernel_info[0].kernel_name, current_kernel_info[0].kernel_name); } - +#endif /* * ################################################### * ############ HSA Load Unload Tests ################ @@ -586,8 +554,8 @@ TEST_F(LoadUnloadTest, WhenLoadingSecondTimeThenToolLoadsUnloadsSuccessfully) { class ATTCollection : public ::testing::Test { public: - virtual void SetUp() { bCollected = false; }; - virtual void TearDown(){}; + void SetUp() override { bCollected = false; }; + void TearDown() override{}; static bool bCollected; static void FlushCallback(const rocprofiler_record_header_t* record, @@ -625,11 +593,42 @@ class ATTCollection : public ::testing::Test { bool ATTCollection::bCollected = false; TEST_F(ATTCollection, WhenRunningATTItCollectsTraceDataWithOldAPI) { + // iterate for gpu's + struct agent_info { + bool skip = false; + std::vector agents = {}; + + auto as_string() const { + auto _ss = std::stringstream{}; + for (const auto& itr : agents) _ss << ", " << itr; + auto _v = _ss.str(); + if (_v.length() > 2) return _v.substr(2); + return _v; + } + }; + + auto _info = agent_info{}; + hsa_iterate_agents( + [](hsa_agent_t agent, void* _arg) { + agent_info* _info_v = static_cast(_arg); + EXPECT_NE(_info_v, nullptr); + char gpu_name[64] = {'\0'}; + hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, gpu_name); + _info_v->agents.emplace_back(std::string{gpu_name}); + if (std::regex_search(_info_v->agents.back(), std::regex{"^gfx1[0-1][0-9][0-9]"})) { + _info_v->skip = true; + } + return HSA_STATUS_SUCCESS; + }, + static_cast(&_info)); + + if (_info.skip) GTEST_SKIP(); + int result = ROCPROFILER_STATUS_ERROR; // inititalize ROCProfiler result = rocprofiler_initialize(); - EXPECT_EQ(ROCPROFILER_STATUS_SUCCESS, result); + EXPECT_EQ(ROCPROFILER_STATUS_SUCCESS, result) << "agents: " << _info.as_string(); // Att trace collection parameters rocprofiler_session_id_t session_id; @@ -642,12 +641,12 @@ TEST_F(ATTCollection, WhenRunningATTItCollectsTraceDataWithOldAPI) { // create a session result = rocprofiler_create_session(ROCPROFILER_NONE_REPLAY_MODE, &session_id); - EXPECT_EQ(ROCPROFILER_STATUS_SUCCESS, result); + EXPECT_EQ(ROCPROFILER_STATUS_SUCCESS, result) << "agents: " << _info.as_string(); // create a buffer to hold att trace records for each kernel launch rocprofiler_buffer_id_t buffer_id; result = rocprofiler_create_buffer(session_id, FlushCallback, 0x9999, &buffer_id); - EXPECT_EQ(ROCPROFILER_STATUS_SUCCESS, result); + EXPECT_EQ(ROCPROFILER_STATUS_SUCCESS, result) << "agents: " << _info.as_string(); // create a filter for collecting att traces rocprofiler_filter_id_t filter_id; @@ -655,65 +654,93 @@ TEST_F(ATTCollection, WhenRunningATTItCollectsTraceDataWithOldAPI) { result = rocprofiler_create_filter(session_id, ROCPROFILER_ATT_TRACE_COLLECTION, rocprofiler_filter_data_t{.att_parameters = ¶meters[0]}, parameters.size(), &filter_id, property); - EXPECT_EQ(ROCPROFILER_STATUS_SUCCESS, result); + EXPECT_EQ(ROCPROFILER_STATUS_SUCCESS, result) << "agents: " << _info.as_string(); // set buffer for the filter result = rocprofiler_set_filter_buffer(session_id, filter_id, buffer_id); - EXPECT_EQ(ROCPROFILER_STATUS_SUCCESS, result); + EXPECT_EQ(ROCPROFILER_STATUS_SUCCESS, result) << "agents: " << _info.as_string(); // activating att tracing session result = rocprofiler_start_session(session_id); - EXPECT_EQ(ROCPROFILER_STATUS_SUCCESS, result); + 
EXPECT_EQ(ROCPROFILER_STATUS_SUCCESS, result) << "agents: " << _info.as_string(); // Launch a kernel LaunchVectorAddKernel(); - EXPECT_EQ(ROCPROFILER_STATUS_SUCCESS, result); + EXPECT_EQ(ROCPROFILER_STATUS_SUCCESS, result) << "agents: " << _info.as_string(); // deactivate att tracing session result = rocprofiler_terminate_session(session_id); - EXPECT_EQ(ROCPROFILER_STATUS_SUCCESS, result); + EXPECT_EQ(ROCPROFILER_STATUS_SUCCESS, result) << "agents: " << _info.as_string(); // dump att tracing data result = rocprofiler_flush_data(session_id, buffer_id); - EXPECT_EQ(ROCPROFILER_STATUS_SUCCESS, result); + EXPECT_EQ(ROCPROFILER_STATUS_SUCCESS, result) << "agents: " << _info.as_string(); // destroy session result = rocprofiler_destroy_session(session_id); - EXPECT_EQ(ROCPROFILER_STATUS_SUCCESS, result); + EXPECT_EQ(ROCPROFILER_STATUS_SUCCESS, result) << "agents: " << _info.as_string(); // finalize att tracing by destroying rocprofiler object result = rocprofiler_finalize(); - EXPECT_EQ(ROCPROFILER_STATUS_SUCCESS, result); + EXPECT_EQ(ROCPROFILER_STATUS_SUCCESS, result) << "agents: " << _info.as_string(); // check if we got data from any shader engine - EXPECT_EQ(bCollected, true); + EXPECT_EQ(bCollected, true) << "agents: " << _info.as_string(); } // New API TEST_F(ATTCollection, WhenRunningATTItCollectsTraceDataWithNewAPI) { - int result = ROCPROFILER_STATUS_ERROR; + // iterate for gpu's + struct agent_info { + bool skip = false; + std::vector agents = {}; + + auto as_string() const { + auto _ss = std::stringstream{}; + for (const auto& itr : agents) _ss << ", " << itr; + auto _v = _ss.str(); + if (_v.length() > 2) return _v.substr(2); + return _v; + } + }; + + auto _info = agent_info{}; + hsa_iterate_agents( + [](hsa_agent_t agent, void* _arg) { + agent_info* _info_v = static_cast(_arg); + EXPECT_NE(_info_v, nullptr); + char gpu_name[64] = {'\0'}; + hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, gpu_name); + _info_v->agents.emplace_back(std::string{gpu_name}); + if (std::regex_search(_info_v->agents.back(), std::regex{"^gfx1[0-1][0-9][0-9]"})) { + _info_v->skip = true; + } + return HSA_STATUS_SUCCESS; + }, + static_cast(&_info)); + + if (_info.skip) GTEST_SKIP(); + int result = ROCPROFILER_STATUS_ERROR; // inititalize ROCProfiler result = rocprofiler_initialize(); EXPECT_EQ(ROCPROFILER_STATUS_SUCCESS, result); - // Att trace collection parameters rocprofiler_session_id_t session_id; std::vector parameters; parameters.emplace_back(rocprofiler_att_parameter_t{ROCPROFILER_ATT_COMPUTE_UNIT, 0}); parameters.emplace_back(rocprofiler_att_parameter_t{ROCPROFILER_ATT_SE_MASK, 0xF}); - parameters.emplace_back(rocprofiler_att_parameter_t{ROCPROFILER_ATT_SIMD_SELECT, 0x3}); // Replace below tests once aqlprofile passes - parameters.emplace_back(rocprofiler_att_parameter_t{ROCPROFILER_ATT_BUFFER_SIZE, 0x1000000}); // Replace below tests once aqlprofile passes - + parameters.emplace_back(rocprofiler_att_parameter_t{ + ROCPROFILER_ATT_SIMD_SELECT, 0x3}); // Replace below tests once aqlprofile passes + parameters.emplace_back(rocprofiler_att_parameter_t{ + ROCPROFILER_ATT_BUFFER_SIZE, 0x1000000}); // Replace below tests once aqlprofile passes // create a session result = rocprofiler_create_session(ROCPROFILER_NONE_REPLAY_MODE, &session_id); EXPECT_EQ(ROCPROFILER_STATUS_SUCCESS, result); - // create a buffer to hold att trace records for each kernel launch rocprofiler_buffer_id_t buffer_id; result = rocprofiler_create_buffer(session_id, FlushCallback, 0x9999, &buffer_id); EXPECT_EQ(ROCPROFILER_STATUS_SUCCESS, 
result); - // create a filter for collecting att traces rocprofiler_filter_id_t filter_id; rocprofiler_filter_property_t property = {}; @@ -721,35 +748,27 @@ TEST_F(ATTCollection, WhenRunningATTItCollectsTraceDataWithNewAPI) { rocprofiler_filter_data_t{.att_parameters = ¶meters[0]}, parameters.size(), &filter_id, property); EXPECT_EQ(ROCPROFILER_STATUS_SUCCESS, result); - // set buffer for the filter result = rocprofiler_set_filter_buffer(session_id, filter_id, buffer_id); EXPECT_EQ(ROCPROFILER_STATUS_SUCCESS, result); - // activating att tracing session result = rocprofiler_start_session(session_id); EXPECT_EQ(ROCPROFILER_STATUS_SUCCESS, result); - // Launch a kernel LaunchVectorAddKernel(); EXPECT_EQ(ROCPROFILER_STATUS_SUCCESS, result); - // deactivate att tracing session result = rocprofiler_terminate_session(session_id); EXPECT_EQ(ROCPROFILER_STATUS_SUCCESS, result); - // dump att tracing data result = rocprofiler_flush_data(session_id, buffer_id); EXPECT_EQ(ROCPROFILER_STATUS_SUCCESS, result); - // destroy session result = rocprofiler_destroy_session(session_id); EXPECT_EQ(ROCPROFILER_STATUS_SUCCESS, result); - // finalize att tracing by destroying rocprofiler object result = rocprofiler_finalize(); EXPECT_EQ(ROCPROFILER_STATUS_SUCCESS, result); - // check if we got data from any shader engine EXPECT_EQ(bCollected, true); } @@ -767,6 +786,7 @@ class ProfilerAPITest : public ::testing::Test { std::stringstream gfx_path; gfx_path << app_path << metrics_path; setenv("ROCPROFILER_METRICS_PATH", gfx_path.str().c_str(), true); + setenv("ROCPROFILER_MAX_ATT_PROFILES", "2", 1); } // function to check profiler API status static void CheckApi(rocprofiler_status_t status) { @@ -932,96 +952,6 @@ TEST_F(DerivedMetricsReuseTest, WhenRunningRepeatedBaseMetricsAPIsWorkFine) { CheckApi(rocprofiler_finalize()); } -/* - * ################################################### - * ############ SPM Tests ################ - * ################################################### - */ - -class ProfilerSPMTest : public ::testing::Test { - // function to check spm tracing API status - protected: - // function to check profiler API status - static void CheckApi(rocprofiler_status_t status) { - ASSERT_EQ(status, ROCPROFILER_STATUS_SUCCESS); - }; - - static void FlushCallback(const rocprofiler_record_header_t* record, - const rocprofiler_record_header_t* end_record, - rocprofiler_session_id_t session_id, - rocprofiler_buffer_id_t buffer_id) { - while (record < end_record) { - if (!record) - break; - else if (record->kind == ROCPROFILER_SPM_RECORD) { - const rocprofiler_record_spm_t* spm_record = - reinterpret_cast(record); - int se_num = 4; - // iterate over each shader engine - for (int i = 0; i < se_num; i++) { - printf("\n\n-------------- shader_engine %d --------------\n\n", i); - rocprofiler_record_se_spm_data_t se_spm = spm_record->shader_engine_data[i]; - for (int i = 0; i < 32; i++) { - printf("%04x\n", se_spm.counters_data[i].value); - } - } - } - CheckApi(rocprofiler_next_record(record, &record, session_id, buffer_id)); - } - } -}; - -TEST_F(ProfilerSPMTest, WhenRunningSPMItCollectsSPMData) { - // initialize rocprofiler - hsa_init(); - CheckApi(rocprofiler_initialize()); - - // spm trace collection parameters - rocprofiler_session_id_t session_id; - rocprofiler_spm_parameter_t spm_parameters; - const char* counter_name = "SQ_WAVES"; - spm_parameters.counters_names = &counter_name; - spm_parameters.counters_count = 1; - spm_parameters.gpu_agent_id = NULL; - // spm_parameters.cpu_agent_id = NULL; - 
spm_parameters.sampling_rate = 10000; - // create a session - CheckApi(rocprofiler_create_session(ROCPROFILER_NONE_REPLAY_MODE, &session_id)); - - // create a buffer to hold spm trace records for each kernel launch - rocprofiler_buffer_id_t buffer_id; - CheckApi(rocprofiler_create_buffer(session_id, FlushCallback, 0x99999999, &buffer_id)); - - // create a filter for collecting spm traces - rocprofiler_filter_id_t filter_id; - rocprofiler_filter_property_t property = {}; - CheckApi(rocprofiler_create_filter(session_id, ROCPROFILER_SPM_COLLECTION, - rocprofiler_filter_data_t{.spm_parameters = &spm_parameters}, - 1, &filter_id, property)); - - // set buffer for the filter - CheckApi(rocprofiler_set_filter_buffer(session_id, filter_id, buffer_id)); - - // activating spm tracing session - CheckApi(rocprofiler_start_session(session_id)); - - // Launch a kernel - LaunchVectorAddKernel(); - - // deactivate spm tracing session - // dump spm tracing data - // - CheckApi(rocprofiler_terminate_session(session_id)); - // CheckApi(rocprofiler_flush_data(session_id, buffer_id)); - - // destroy session - CheckApi(rocprofiler_destroy_session(session_id)); - - // finalize spm tracing by destroying rocprofiler object - CheckApi(rocprofiler_finalize()); - hsa_shut_down(); -} - /* * ################################################### * ############ Multi Thread Binary Tests ############ @@ -1222,13 +1152,13 @@ TEST(ProfilerMPTest, WhenRunningMultiProcessTestItPasses) { */ class CodeobjTest : public ::testing::Test { -public: - virtual void SetUp(const char* app_name) {}; + public: + virtual void SetUp(const char* app_name){}; virtual void TearDown(){}; static void FlushCallback(const rocprofiler_record_header_t* record, const rocprofiler_record_header_t* end_record, rocprofiler_session_id_t session_id, - rocprofiler_buffer_id_t buffer_id) {}; + rocprofiler_buffer_id_t buffer_id){}; void SetupRocprofiler() { int result = ROCPROFILER_STATUS_ERROR; @@ -1279,7 +1209,7 @@ TEST_F(CodeobjTest, WhenRunningProfilerWithCodeobjCapture) { EXPECT_GE(capture.count, 1); bool bCaptured_itself = false; - for (int i=0; i<(int)capture.count; i++) { + for (int i = 0; i < (int)capture.count; i++) { const char* path = capture.symbols[i].filepath; if (!path) continue; std::string fpath(path); @@ -1334,7 +1264,7 @@ TEST_F(CodeobjTest, WhenRunningProfilerWithMultipleCaptureAndCopy) { EXPECT_GE(capture.count, 1); - for (int i=0; i<(int)capture.count; i++) { + for (int i = 0; i < (int)capture.count; i++) { EXPECT_NE(capture.symbols[i].base_address, 0); EXPECT_NE(capture.symbols[i].clock_start.value, 0); EXPECT_NE(capture.symbols[i].data, nullptr); @@ -1445,7 +1375,8 @@ class VectorAddPerfettoMPITest : public PerfettoPluginTest { protected: virtual void SetUp() { setenv("MPI_RANK", "7", true); - RunApplication("hip_vectoradd", " -d /tmp/tests-v2/perfetto/ -o test_%q{MPI_RANK}_ --plugin perfetto"); + RunApplication("hip_vectoradd", + " -d /tmp/tests-v2/perfetto/ -o test_%q{MPI_RANK}_ --plugin perfetto"); } virtual void TearDown() { std::experimental::filesystem::remove_all("/tmp/tests-v2/perfetto/"); @@ -1459,7 +1390,8 @@ TEST_F(VectorAddPerfettoMPITest, WhenRunningProfilerWithPerfettoTest) { } bool CTFPluginTest::hasMetadataInDir(const char* directory) { - for (const auto& entry : std::experimental::filesystem::directory_iterator(directory)) + auto path = std::experimental::filesystem::directory_iterator(directory)->path(); + for (const auto& entry : std::experimental::filesystem::directory_iterator(path)) if 
(std::string(entry.path().filename()) == "metadata") return true; return false; } @@ -1471,7 +1403,7 @@ class VectorAddCTFTest : public CTFPluginTest { std::experimental::filesystem::remove_all("/tmp/tests-v2/"); unsetenv("MPI_RANK"); } - bool hasFile() { return hasMetadataInDir("/tmp/tests-v2/ctf/trace/"); } + bool hasFile() { return hasMetadataInDir("/tmp/tests-v2/ctf/"); } }; TEST_F(VectorAddCTFTest, WhenRunningProfilerWithCTFTest) { EXPECT_EQ(hasFile(), true); } @@ -1486,7 +1418,7 @@ class VectorAddCTFMPITest : public CTFPluginTest { std::experimental::filesystem::remove_all("/tmp/tests-v2/"); unsetenv("MPI_RANK"); } - bool hasFile() { return hasMetadataInDir("/tmp/tests-v2/ctf_7/trace/"); } + bool hasFile() { return hasMetadataInDir("/tmp/tests-v2/ctf_7/"); } }; TEST_F(VectorAddCTFMPITest, WhenRunningProfilerWithCTFTest) { EXPECT_EQ(hasFile(), true); } diff --git a/tests-v2/featuretests/tracer/CMakeLists.txt b/tests-v2/featuretests/tracer/CMakeLists.txt index 12b10640..af7d9602 100644 --- a/tests-v2/featuretests/tracer/CMakeLists.txt +++ b/tests-v2/featuretests/tracer/CMakeLists.txt @@ -6,8 +6,32 @@ set(CMAKE_EXECUTABLE_RPATH_LINK_HIP_FLAG ${CMAKE_SHARED_LIBRARY_RPATH_LINK_CXX_F set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${ROCM_PATH}/lib/cmake/hip") set(CMAKE_HIP_ARCHITECTURES OFF) +if(DEFINED ROCM_PATH) + set(HIP_ROOT_DIR "${ROCM_PATH}/bin") +endif() find_package(HIP REQUIRED MODULE) +# ######################################################################################## +function(rocprofiler_featuretests_tracer_add_test _TARGET) + if(TARGET ${_TARGET}) + if(NOT TEST ${_TARGET}) + add_test( + NAME ${_TARGET} + COMMAND $<TARGET_FILE:${_TARGET}> + WORKING_DIRECTORY ${PROJECT_BINARY_DIR}) + endif() + + set_tests_properties( + ${_TARGET} PROPERTIES LABELS "featuretests;tracer" ENVIRONMENT + "${ROCPROFILER_MEMCHECK_PRELOAD_ENV}" ${ARGN}) + endif() +endfunction() + +function(rocprofiler_featuretests_tracer_add_executable _TARGET) + hip_add_executable(${_TARGET} ${ARGN}) + rocprofiler_featuretests_tracer_add_test(${_TARGET}) +endfunction() + # Setup testing enable_testing() find_package(GTest REQUIRED) @@ -29,7 +53,7 @@ file(GLOB GTEST_MAIN_SRC_FILE ${GTEST_MAIN_DIR}/*.cpp) # Compile Applications hip_helloworld set_source_files_properties(apps/hello_world.cpp PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) -hip_add_executable(tracer_hip_helloworld apps/hello_world.cpp) +rocprofiler_featuretests_tracer_add_executable(tracer_hip_helloworld apps/hello_world.cpp) set_target_properties( tracer_hip_helloworld PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/tests-v2/featuretests/tracer/apps") @@ -45,7 +69,7 @@ install( # hsa-mem_async_copy and async_copy_on_engine set_source_files_properties(apps/copy_on_engine.cpp PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) -hip_add_executable(copy_on_engine apps/copy_on_engine.cpp) +rocprofiler_featuretests_tracer_add_executable(copy_on_engine apps/copy_on_engine.cpp) set_target_properties( copy_on_engine PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/tests-v2/featuretests/tracer/apps") @@ -60,24 +84,27 @@ target_link_libraries(copy_on_engine hsa-runtime64::hsa-runtime64 Threads::Threa stdc++fs) # Compile MatrixTranspose App with ROCTX -find_library(ROCTX_LIBRARY NAMES roctx64 HINTS ${ROCM_PATH}/lib) +find_library( + ROCTX_LIBRARY + NAMES roctx64 + HINTS ${ROCM_PATH}/lib) if(ROCTX_LIBRARY) - set_source_files_properties(apps/MatrixTranspose.cpp PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) - hip_add_executable(tracer_matrix_transpose apps/MatrixTranspose.cpp) - set_target_properties( - tracer_matrix_transpose - PROPERTIES 
RUNTIME_OUTPUT_DIRECTORY - "${PROJECT_BINARY_DIR}/tests-v2/featuretests/tracer/apps") - target_link_options(tracer_matrix_transpose PRIVATE "-Wl,--build-id=md5") - target_include_directories( - tracer_matrix_transpose PRIVATE ${ROCM_PATH}) - target_link_libraries(tracer_matrix_transpose ${ROCTX_LIBRARY}) - install( - TARGETS tracer_matrix_transpose - RUNTIME - DESTINATION - ${CMAKE_INSTALL_DATAROOTDIR}/${PROJECT_NAME}/tests/featuretests/tracer/apps - COMPONENT tests) + set_source_files_properties(apps/MatrixTranspose.cpp + PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) + hip_add_executable(tracer_matrix_transpose apps/MatrixTranspose.cpp) + set_target_properties( + tracer_matrix_transpose + PROPERTIES RUNTIME_OUTPUT_DIRECTORY + "${PROJECT_BINARY_DIR}/tests-v2/featuretests/tracer/apps") + target_link_options(tracer_matrix_transpose PRIVATE "-Wl,--build-id=md5") + target_include_directories(tracer_matrix_transpose PRIVATE ${ROCM_PATH}) + target_link_libraries(tracer_matrix_transpose ${ROCTX_LIBRARY}) + install( + TARGETS tracer_matrix_transpose + RUNTIME + DESTINATION + ${CMAKE_INSTALL_DATAROOTDIR}/${PROJECT_NAME}/tests/featuretests/tracer/apps + COMPONENT tests) endif() # Add test cpp file @@ -102,3 +129,11 @@ install( DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/${PROJECT_NAME}/tests/featuretests/tracer/apps/goldentraces COMPONENT tests) + +find_package( + Python3 + COMPONENTS Interpreter + REQUIRED) + +# cmake based tests +include(${CMAKE_CURRENT_LIST_DIR}/hiptrace_validation_tests.cmake) diff --git a/tests-v2/featuretests/tracer/apps/MatrixTranspose.cpp b/tests-v2/featuretests/tracer/apps/MatrixTranspose.cpp index d6452395..81b2585c 100755 --- a/tests-v2/featuretests/tracer/apps/MatrixTranspose.cpp +++ b/tests-v2/featuretests/tracer/apps/MatrixTranspose.cpp @@ -39,52 +39,52 @@ THE SOFTWARE. 
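The reformatted MatrixTranspose.cpp below exercises both ROCTX annotation styles the tracer has to pick up: push/pop ranges, which nest, and start/stop ranges addressed by an id, which may overlap arbitrarily. A minimal sketch of the two (assuming the roctx.h header that ships with roctracer; the traced work itself is elided):

#include <roctx.h>

void annotated_work() {
  roctxRangePush("outer");                            // nested range: LIFO pop
  roctx_range_id_t id = roctxRangeStartA("overlap");  // id-based range: stop in any order
  roctxMark("point of interest");                     // instantaneous event
  roctxRangeStop(id);
  roctxRangePop();
}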
// Device (Kernel) function, it must be void __global__ void matrixTranspose(float* out, float* in, const int width) { - int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; - int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; + int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; - out[y * width + x] = in[x * width + y]; + out[y * width + x] = in[x * width + y]; } // CPU implementation of matrix transpose void matrixTransposeCPUReference(float* output, float* input, const unsigned int width) { - for (unsigned int j = 0; j < width; j++) { - for (unsigned int i = 0; i < width; i++) { - output[i * width + j] = input[j * width + i]; - } + for (unsigned int j = 0; j < width; j++) { + for (unsigned int i = 0; i < width; i++) { + output[i * width + j] = input[j * width + i]; } + } } int main() { - float* Matrix; - float* TransposeMatrix; - float* cpuTransposeMatrix; + float* Matrix; + float* TransposeMatrix; + float* cpuTransposeMatrix; - float* gpuMatrix; - float* gpuTransposeMatrix; + float* gpuMatrix; + float* gpuTransposeMatrix; - hipDeviceProp_t devProp; - hipGetDeviceProperties(&devProp, 0); + hipDeviceProp_t devProp; + hipGetDeviceProperties(&devProp, 0); - std::cout << "Device name " << devProp.name << std::endl; + std::cout << "Device name " << devProp.name << std::endl; - int i; - int errors; + int i; + int errors; - Matrix = (float*)malloc(NUM * sizeof(float)); - TransposeMatrix = (float*)malloc(NUM * sizeof(float)); - cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float)); + Matrix = (float*)malloc(NUM * sizeof(float)); + TransposeMatrix = (float*)malloc(NUM * sizeof(float)); + cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float)); - // initialize the input data - for (i = 0; i < NUM; i++) { - Matrix[i] = (float)i * 10.0f; - } + // initialize the input data + for (i = 0; i < NUM; i++) { + Matrix[i] = (float)i * 10.0f; + } - // allocate the memory on the device side - hipMalloc((void**)&gpuMatrix, NUM * sizeof(float)); - hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float)); + // allocate the memory on the device side + hipMalloc((void**)&gpuMatrix, NUM * sizeof(float)); + hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float)); - uint32_t iterations = 10; - while (iterations-- > 0) { + uint32_t iterations = 10; + while (iterations-- > 0) { std::cout << "## Iteration (" << iterations << ") #################" << std::endl; // Memory transfer from host to device @@ -96,11 +96,11 @@ int main() { roctx_range_id_t roctx_id = roctxRangeStartA("roctx_range with id"); // Lauching kernel from host - hipLaunchKernelGGL(matrixTranspose, dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y), - dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, gpuTransposeMatrix, - gpuMatrix, WIDTH); + hipLaunchKernelGGL( + matrixTranspose, dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y), + dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, gpuTransposeMatrix, gpuMatrix, WIDTH); - roctxRangeStop(roctx_id); + roctxRangeStop(roctx_id); roctxMark("ROCTX-MARK: after hipLaunchKernel"); // Memory transfer from device to host @@ -108,8 +108,8 @@ int main() { hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM * sizeof(float), hipMemcpyDeviceToHost); - roctxRangePop(); // for "hipMemcpy" - roctxRangePop(); // for "hipLaunchKernel" + roctxRangePop(); // for "hipMemcpy" + roctxRangePop(); // for "hipLaunchKernel" // CPU MatrixTranspose computation matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH); 
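The verification hunk that follows compares the GPU and CPU transposes element-wise against a small epsilon rather than with exact equality, since the device round-trip need not be bit-identical. The same check in isolation (CountMismatches is an illustrative helper name):

#include <cmath>
#include <cstddef>

int CountMismatches(const float* gpu, const float* cpu, std::size_t n, double eps = 1.0e-6) {
  int errors = 0;
  for (std::size_t i = 0; i < n; ++i)
    if (std::abs(static_cast<double>(gpu[i]) - static_cast<double>(cpu[i])) > eps) ++errors;
  return errors;
}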
@@ -118,26 +118,25 @@ int main() {
    errors = 0;
    double eps = 1.0E-6;
    for (i = 0; i < NUM; i++) {
-        if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps) {
-            errors++;
-        }
+      if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps) {
+        errors++;
+      }
    }

    if (errors != 0) {
-        printf("FAILED: %d errors\n", errors);
+      printf("FAILED: %d errors\n", errors);
    } else {
-        printf("PASSED!\n");
-    }
-
+      printf("PASSED!\n");
    }
+  }

-    // free the resources on device side
-    hipFree(gpuMatrix);
-    hipFree(gpuTransposeMatrix);
+  // free the resources on device side
+  hipFree(gpuMatrix);
+  hipFree(gpuTransposeMatrix);

-    // free the resources on host side
-    free(Matrix);
-    free(TransposeMatrix);
-    free(cpuTransposeMatrix);
+  // free the resources on host side
+  free(Matrix);
+  free(TransposeMatrix);
+  free(cpuTransposeMatrix);

-    return errors;
+  return errors;
 }
diff --git a/tests-v2/featuretests/tracer/apps/copy_on_engine.cpp b/tests-v2/featuretests/tracer/apps/copy_on_engine.cpp
index 7ce76b72..185a060e 100644
--- a/tests-v2/featuretests/tracer/apps/copy_on_engine.cpp
+++ b/tests-v2/featuretests/tracer/apps/copy_on_engine.cpp
@@ -195,8 +195,8 @@ static hsa_status_t AsyncCpyTest(async_mem_cpy_agent* dst, async_mem_cpy_agent*
   // Initialize the system and destination buffers with a value so we can later
   // validate it has been overwritten
   void* sysPtr = args->cpu.ptr;
-
-  *reinterpret_cast<uint32_t*>(src->ptr) = val;
+  err = hsa_amd_memory_fill(src->ptr, val, sz / sizeof(uint32_t));
+  RET_IF_HSA_ERR(err);

   // Make sure the target and destination agents have access to the buffer.
   hsa_agent_t ag_list[3] = {dst->dev, src->dev, args->cpu.dev};
@@ -231,14 +231,14 @@ static hsa_status_t AsyncCpyTest(async_mem_cpy_agent* dst, async_mem_cpy_agent*
   }

   // Check that the contents of the buffer are what is expected.
-  if (*reinterpret_cast<uint32_t*>(dst->ptr) != *reinterpret_cast<uint32_t*>(src->ptr)) {
-    fprintf(stderr,
-            "Expected 0x%x but got 0x%x in buffer when copying from %lu to %lu and CPU device is "
-            "%lu.\n",
-            *reinterpret_cast<uint32_t*>(src->ptr), *reinterpret_cast<uint32_t*>(dst->ptr),
-            src->dev.handle, dst->dev.handle, args->cpu.dev.handle);
-    return HSA_STATUS_ERROR;
+  for (uint32_t i = 0; i < sz / sizeof(uint32_t); ++i) {
+    if (reinterpret_cast<uint32_t*>(sysPtr)[i] != val) {
+      fprintf(stderr, "Expected 0x%x but got 0x%x in buffer at index %d.\n", val,
+              reinterpret_cast<uint32_t*>(sysPtr)[i], i);
+      return HSA_STATUS_ERROR;
+    }
   }
+
   return HSA_STATUS_SUCCESS;
 }
diff --git a/tests-v2/featuretests/tracer/hip_trace_validate.py b/tests-v2/featuretests/tracer/hip_trace_validate.py
new file mode 100644
index 00000000..a3880989
--- /dev/null
+++ b/tests-v2/featuretests/tracer/hip_trace_validate.py
@@ -0,0 +1,30 @@
+import pandas as pd
+import sys
+
+
+def validate_hip_trace(filename):
+    df = pd.read_csv(filename)
+
+    # note: only the first record is inspected; a start timestamp strictly
+    # before its end timestamp marks the trace as valid
+    start_time = df.loc[0, "Start_Timestamp"]
+    end_time = df.loc[0, "End_Timestamp"]
+
+    # Validate the data
+    if start_time < end_time:
+        print("Test Passed: Time stamps are valid.")
+        return 0
+    else:
+        print("Test Failed: Time stamps are not valid.")
+        return 1
+
+
+if __name__ == "__main__":
+    files = sys.argv[1:]
+    if not files:
+        raise RuntimeError("no input files provided")
+    for filename in files:
+        ec = validate_hip_trace(filename)
+        if ec != 0:
+            sys.stderr.write(f"{filename} did not pass validation\n")
+            sys.exit(ec)
diff --git a/tests-v2/featuretests/tracer/hiptrace_validation_tests.cmake b/tests-v2/featuretests/tracer/hiptrace_validation_tests.cmake
new file mode 100644
index 00000000..c690756e
--- /dev/null
+++ b/tests-v2/featuretests/tracer/hiptrace_validation_tests.cmake
@@ -0,0 +1,29 @@
+# hip-trace validation test - Timestamp
+add_test(
+    NAME hiptrace_helloworld_test
+    COMMAND ${PROJECT_BINARY_DIR}/rocprofv2 --hip-api -d ${PROJECT_BINARY_DIR}/out-trace
+            -o out tests-v2/featuretests/profiler/apps/hip_helloworld
+    WORKING_DIRECTORY "${PROJECT_BINARY_DIR}")
+
+set_tests_properties(
+    hiptrace_helloworld_test PROPERTIES LABELS "v2;rocprofv2" ENVIRONMENT
+                                        "${ROCPROFILER_MEMCHECK_PRELOAD_ENV}")
+
+add_test(
+    NAME hiptrace_helloworld_test_validation
+    COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/hip_trace_validate.py
+            "out-trace/hip_api_trace_out.csv"
+    WORKING_DIRECTORY "${PROJECT_BINARY_DIR}")
+
+set_tests_properties(
+    hiptrace_helloworld_test_validation
+    PROPERTIES DEPENDS
+               hiptrace_helloworld_test
+               LABELS
+               "v2;validation"
+               PASS_REGULAR_EXPRESSION
+               "Test Passed"
+               FAIL_REGULAR_EXPRESSION
+               "Test Failed"
+               SKIP_REGULAR_EXPRESSION
+               "Skipped")
diff --git a/tests-v2/featuretests/tracer/tracer_gtest.cpp b/tests-v2/featuretests/tracer/tracer_gtest.cpp
index 9d5d7a64..11bfcd74 100644
--- a/tests-v2/featuretests/tracer/tracer_gtest.cpp
+++ b/tests-v2/featuretests/tracer/tracer_gtest.cpp
@@ -69,6 +69,11 @@ void ApplicationParser::SetApplicationEnv(const char* app_name, const char* trac
   setenv("LD_LIBRARY_PATH", ld_library_path.str().c_str(), true);

   std::stringstream hsa_tools_lib_path;
+  // keep any LD_PRELOAD entries set by the caller ahead of the tool library
+  auto _existing_ld_preload = getenv("LD_PRELOAD");
+  if (_existing_ld_preload && strnlen(_existing_ld_preload, 1) > 0)
+    hsa_tools_lib_path << _existing_ld_preload << ":";
+
   hsa_tools_lib_path << app_path << lib_path;
   setenv("LD_PRELOAD", hsa_tools_lib_path.str().c_str(), true);

@@ -273,7 +277,7 @@ class AsyncCopyTest : public Tracertest {

 // Test:1 Compares total num of kernel-names in golden output against current
 // tracer output
-TEST_F(AsyncCopyTest, DISABLED_WhenRunningTracerWithAppThenAsyncCopyOutputIsgenerated) {
+TEST_F(AsyncCopyTest, WhenRunningTracerWithAppThenAsyncCopyOutputIsGenerated) {
   // kernel info in current profler run
   std::vector current_kernel_info;
@@ -282,7 +286,7 @@ TEST_F(AsyncCopyTest, DISABLED_WhenRunningTracerWithAppThenAsyncCopyOutputIsgene
 }

 // Test:2 Matches coelation Ids
-TEST_F(AsyncCopyTest, DISABLED_WhenRunningTracerWithAppThenAsyncCorelationCountIsCorrect) {
+TEST_F(AsyncCopyTest, WhenRunningTracerWithAppThenAsyncCorrelationCountIsCorrect) {
   // kernel info in current profler run
   std::vector current_kernel_info;
@@ -387,4 +391,4 @@ TEST_F(ROCTXTest, WhenRunningTracerWithAppThenROCTxOutputIsgenerated) {
   EXPECT_EQ(roctx_output.size(), i)
       << "Current Output number of records is greater than golden output number of records"
       << std::endl;
-}
\ No newline at end of file
+}
diff --git a/tests-v2/featuretests/utils/csv_parser.h b/tests-v2/featuretests/utils/csv_parser.h
index d060bbee..085a6d62 100644
--- a/tests-v2/featuretests/utils/csv_parser.h
+++ b/tests-v2/featuretests/utils/csv_parser.h
@@ -26,6 +26,7 @@ THE SOFTWARE.

 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
diff --git a/tests-v2/microbenchmarks/CMakeLists.txt b/tests-v2/microbenchmarks/CMakeLists.txt
index ec8e425b..d66cb4c6 100644
--- a/tests-v2/microbenchmarks/CMakeLists.txt
+++ b/tests-v2/microbenchmarks/CMakeLists.txt
@@ -6,6 +6,9 @@ set(CMAKE_EXECUTABLE_RPATH_LINK_HIP_FLAG ${CMAKE_SHARED_LIBRARY_RPATH_LINK_CXX_F
 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${ROCM_PATH}/lib/cmake/hip")
 set(CMAKE_HIP_ARCHITECTURES OFF)

+if(DEFINED ROCM_PATH)
+    set(HIP_ROOT_DIR "${ROCM_PATH}/bin")
+endif()
 find_package(HIP REQUIRED MODULE)

 set(TEST_DIR ${PROJECT_SOURCE_DIR}/tests-v2/microbenchmarks)
@@ -13,6 +16,14 @@ file(GLOB TEST_SRC_FILE ${TEST_DIR}/*.cpp)
 set_source_files_properties(${TEST_SRC_FILE} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1)
 hip_add_executable(pcie_bw_test ${TEST_SRC_FILE})

+add_test(
+    NAME pcie_bw_test
+    COMMAND $<TARGET_FILE:pcie_bw_test>
+    WORKING_DIRECTORY "${PROJECT_BINARY_DIR}")
+set_tests_properties(
+    pcie_bw_test
+    PROPERTIES LABELS "v2;benchmarks" ENVIRONMENT "${ROCPROFILER_MEMCHECK_PRELOAD_ENV}"
+               SKIP_REGULAR_EXPRESSION "SIGBUS error. Aborting test" DISABLED TRUE)
 target_link_libraries(pcie_bw_test PRIVATE rocm_smi64)
 target_link_options(pcie_bw_test PRIVATE "-Wl,--build-id=md5")
diff --git a/tests-v2/microbenchmarks/pcie_bw_test.cpp b/tests-v2/microbenchmarks/pcie_bw_test.cpp
index 5b5c9d7c..35f93d76 100644
--- a/tests-v2/microbenchmarks/pcie_bw_test.cpp
+++ b/tests-v2/microbenchmarks/pcie_bw_test.cpp
@@ -18,11 +18,17 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
 */

+// make sure assert works
+#ifdef NDEBUG
+#undef NDEBUG
+#endif
+
+#include <cassert>
 #include 
 #include 
 #include 

+#include <array>
 #include 
 #include 
 #include 
@@ -31,6 +37,7 @@
 #include 
 #include 
 #include 
+#include <csignal>
 #include "rocm_smi/rocm_smi.h"

@@ -71,22 +78,22 @@
 #define HANDLE_ERROR CHK_ERR_ASRT(ret);
 #define HIP_ASSERT(x) (assert((x) == hipSuccess))

-#define SEND_DATA() \
-  HIP_ASSERT(hipMemcpyAsync(dst, src, SIZE * sizeof(int), hipMemcpyDefault, stream));
-
-static float burn_hip(int dev, int* dst, int* src, size_t SIZE,
-                      std::atomic<bool>* transfer_started) {
+static float burn_hip(int dev, int* dst, int* src, size_t sz, std::atomic<bool>* transfer_started) {
   hipSetDevice(dev);
   hipStream_t stream;
   hipStreamCreate(&stream);
-  hipEvent_t events[3];
+  auto events = std::array<hipEvent_t, 3>{};
+
+  auto send_data = [dst, src, sz, stream]() {
+    HIP_ASSERT(hipMemcpyAsync(dst, src, sz * sizeof(int), hipMemcpyDefault, stream));
+  };

-  for (int i = 0; i < 3; i++) {
-    hipEventCreate(events + i);
-    SEND_DATA();
-    hipEventRecord(events[i], stream);
+  for (auto& event : events) {
+    hipEventCreate(&event);
+    send_data();
+    hipEventRecord(event, stream);
   }
-  SEND_DATA();
+  send_data();

   hipEventSynchronize(events[0]);
   transfer_started->store(true);
@@ -95,23 +102,30 @@
   while (elapsed < 1500.0f) {  // Transfer data for 1.5 seconds = 1500 ms
     float out;
-    hipEventSynchronize(events[(counter + 1) % 3]);
-    hipEventElapsedTime(&out, events[counter % 3], events[(counter + 1) % 3]);
+    hipEventSynchronize(events[(counter + 1) % events.size()]);
+    hipEventElapsedTime(&out, events[counter % events.size()],
+                        events[(counter + 1) % events.size()]);
     elapsed += out;
-    hipEventRecord(events[counter % 3], stream);
-    SEND_DATA();
+    hipEventRecord(events[counter % events.size()], stream);
+    send_data();
     counter += 1;
   }
   hipStreamSynchronize(stream);
-  for (int i = 0; i < 3; i++) hipEventDestroy(events[i]);
+  for (auto& event : events) hipEventDestroy(event);
   hipStreamDestroy(stream);
-  return float(SIZE * sizeof(int) * counter) / elapsed / 1E6;
+  return float(sz * sizeof(int) * counter) / elapsed / 1E6;
 }

+namespace {
+void signal_handler(int _sig);
+void activate_signal_handler();
+}  // namespace
+
 int main() {
+  activate_signal_handler();
   const size_t SIZE = 3 << 28;
   rsmi_status_t ret;
   uint16_t dev_id;
@@ -132,9 +146,10 @@ int main() {
     int* d_ptr;
     HIP_ASSERT(hipMalloc((void**)&d_ptr, SIZE * sizeof(int)));

-    std::cout << ">>> Device " << dev << std::endl;
+    std::cout << ">>> Device " << dev << std::flush;
     ret = rsmi_dev_id_get(dev, &dev_id);
     HANDLE_ERROR;
+    std::cout << " (rsmi device id: " << dev_id << ")" << std::endl;

     rsmi_pcie_bandwidth_t bandwidth;
     ret = rsmi_dev_pci_bandwidth_get(dev, &bandwidth);
@@ -147,7 +162,9 @@ int main() {
     std::cout << "Current: "
               << bandwidth.transfer_rate.frequency[bandwidth.transfer_rate.current] << '\n';

-    uint64_t sent = 0, received = 0, max_pkt_sz = 0;
+    uint64_t sent = 0;
+    uint64_t received = 0;
+    uint64_t max_pkt_sz = 0;
     std::atomic<bool> transfer_started;
     transfer_started.store(false);
     auto thread =
@@ -184,4 +201,23 @@ int main() {
   delete[] h_ptr;
   ret = rsmi_shut_down();
   return 0;
-}
\ No newline at end of file
+}
+
+namespace {
+// activate a signal handler to catch a SIGBUS on navi32 and
+// emit a message that we can use to skip the test in CTest
+void activate_signal_handler() {
+  struct sigaction _action = {};
+  sigemptyset(&_action.sa_mask);
+  _action.sa_flags = SA_RESTART;
+  _action.sa_handler = signal_handler;
+  sigaction(SIGBUS, &_action, nullptr);
+}
+
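+// note: writing to std::cerr from a signal handler is not strictly
+// async-signal-safe; this is a best-effort diagnostic, and quick_exit()
+// then bypasses static destructors and atexit handlers instead of
+// resuming the crashed transfer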
+void signal_handler(int _sig) {
+  if (_sig == SIGBUS) {
+    std::cerr << "SIGBUS error. Aborting test" << std::endl;
+  }
+  ::quick_exit(_sig);
+}
+}  // namespace
diff --git a/tests-v2/run_tests.sh b/tests-v2/run_tests.sh
index 1a1977e7..d86804c8 100755
--- a/tests-v2/run_tests.sh
+++ b/tests-v2/run_tests.sh
@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/bin/bash -e

 CURRENT_DIR="$( dirname -- "$0"; )";

@@ -15,4 +15,4 @@ echo -e "running feature tests for rocprofiler"
 eval ${CURRENT_DIR}/tests-v2/featuretests/profiler/runFeatureTests

 echo -e "Running Tracer Tests"
-eval ${CURRENT_DIR}/tests-v2/featuretests/tracer/runTracerFeatureTests
\ No newline at end of file
+eval ${CURRENT_DIR}/tests-v2/featuretests/tracer/runTracerFeatureTests
diff --git a/tests-v2/unittests/core/CMakeLists.txt b/tests-v2/unittests/core/CMakeLists.txt
index c85a8338..c255e124 100644
--- a/tests-v2/unittests/core/CMakeLists.txt
+++ b/tests-v2/unittests/core/CMakeLists.txt
@@ -20,6 +20,30 @@
 # SOFTWARE.
 # ##############################################################################
+# ##############################################################################
+# Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+# ##############################################################################
+
+# Setup unit testing env
+
 find_library(PCIACCESS_LIBRARIES pciaccess REQUIRED)

 enable_testing()
@@ -55,14 +79,14 @@
 file(GLOB ROCPROFILER_TRACER_SRC_FILES
      ${PROJECT_SOURCE_DIR}/src/core/session/tracer/*.cpp)
 file(GLOB ROCPROFILER_ROCTRACER_SRC_FILES
      ${PROJECT_SOURCE_DIR}/src/core/session/tracer/src/*.cpp)
-    file(GLOB ROCPROFILER_ATT_SRC_FILES ${PROJECT_SOURCE_DIR}/src/core/session/att/*.cpp)
-    file(GLOB ROCPROFILER_SRC_CLASS_FILES
-         ${CMAKE_CURRENT_SOURCE_DIR}/rocprofiler_singleton.cpp)
-    file(GLOB ROCPROFILER_ISA_SRC_FILES ${PROJECT_SOURCE_DIR}/src/core/isa_capture/*.cpp)
-    file(GLOB ROCPROFILER_SPM_SRC_FILES ${PROJECT_SOURCE_DIR}/src/core/session/spm/spm.cpp)
-    file(GLOB ROCPROFILER_SRC_API_FILES ${PROJECT_SOURCE_DIR}/src/api/*.cpp)
-    set(ROCPROFILER_SRC_FILES ${ROCPROFILER_SRC_API_FILES} ${ROCPROFILER_ATT_SRC_FILES}
-        ${ROCPROFILER_ISA_SRC_FILES} ${ROCPROFILER_SRC_PROFILER_FILES} ${ROCPROFILER_ATT_SRC_FILES})
+file(GLOB ROCPROFILER_ATT_SRC_FILES ${PROJECT_SOURCE_DIR}/src/core/session/att/*.cpp)
+file(GLOB ROCPROFILER_SRC_CLASS_FILES
+     ${CMAKE_CURRENT_SOURCE_DIR}/rocprofiler_singleton.cpp)
+file(GLOB ROCPROFILER_ISA_SRC_FILES ${PROJECT_SOURCE_DIR}/src/core/isa_capture/*.cpp)
+file(GLOB ROCPROFILER_SPM_SRC_FILES ${PROJECT_SOURCE_DIR}/src/core/session/spm/spm.cpp)
+file(GLOB ROCPROFILER_SRC_API_FILES ${PROJECT_SOURCE_DIR}/src/api/*.cpp)
+set(ROCPROFILER_SRC_FILES ${ROCPROFILER_SRC_API_FILES} ${ROCPROFILER_ATT_SRC_FILES}
+    ${ROCPROFILER_ISA_SRC_FILES} ${ROCPROFILER_SRC_PROFILER_FILES} ${ROCPROFILER_ATT_SRC_FILES})

 set(CORE_HSA_DIR ${PROJECT_SOURCE_DIR}/src/core/hsa)
 file(GLOB CORE_HSA_SRC_FILES ${CORE_HSA_DIR}/*.cpp)
@@ -81,10 +105,14 @@ file(GLOB CORE_COUNTERS_PARENT_SRC_FILES ${PROJECT_SOURCE_DIR}/src/core/counters
 file(GLOB CORE_COUNTERS_METRICS_SRC_FILES
      ${PROJECT_SOURCE_DIR}/src/core/counters/metrics/*.cpp)
 file(GLOB CORE_COUNTERS_MMIO_SRC_FILES ${PROJECT_SOURCE_DIR}/src/core/counters/mmio/*.cpp)

-set(GTEST_MAIN_DIR ${PROJECT_SOURCE_DIR}/tests-v2/unittests/core)
-file(GLOB GTEST_MAIN_SRC_FILE ${GTEST_MAIN_DIR}/gtests_main.cpp)
-add_executable(
-    runCoreUnitTests
+file(GLOB HSASingleton_TEST_SRC_FILES ${CMAKE_CURRENT_SOURCE_DIR}/HSASingleton/*.cpp)
+file(GLOB ROCProfiler_Singleton_TEST_SRC_FILES
+     ${CMAKE_CURRENT_SOURCE_DIR}/ROCProfiler_Singleton/*.cpp)
+file(GLOB GTEST_SRC_FILES ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp)
+
+set(runCoreUnitTests_SOURCES
+    ${GTEST_SRC_FILES}
+    ${HSASingleton_TEST_SRC_FILES}
+    ${ROCProfiler_Singleton_TEST_SRC_FILES}
     ${CORE_MEMORY_SRC_FILES}
     ${CORE_SESSION_SRC_FILES}
     ${CORE_FILTER_SRC_FILES}
@@ -103,31 +131,56 @@ add_executable(
     ${CORE_COUNTERS_METRICS_SRC_FILES}
     ${CORE_COUNTERS_MMIO_SRC_FILES}
     ${CORE_COUNTERS_PARENT_SRC_FILES}
-    ${CORE_PC_SAMPLING_FILES}
-    ${GTEST_MAIN_SRC_FILE}
-    ${CMAKE_CURRENT_SOURCE_DIR}/ROCProfiler_Singleton/ROCProfiler_Singleton_unittests.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/HSASingleton/HSASingleton_unittests.cpp
-    )
+    ${CORE_PC_SAMPLING_FILES})
+
+add_executable(runCoreUnitTests ${runCoreUnitTests_SOURCES})

 target_include_directories(
-    runCoreUnitTests
-    PRIVATE ${PROJECT_SOURCE_DIR} ${PROJECT_SOURCE_DIR}/src ${PROJECT_SOURCE_DIR}/inc
+    runCoreUnitTests
+    PRIVATE ${PROJECT_SOURCE_DIR} ${PROJECT_SOURCE_DIR}/src ${PROJECT_SOURCE_DIR}/inc
            ${CMAKE_CURRENT_SOURCE_DIR} ${PROJECT_BINARY_DIR} ${PROJECT_BINARY_DIR}/rocprofiler)

 target_compile_definitions(
-    runCoreUnitTests
-    PUBLIC AMD_INTERNAL_BUILD
-    PRIVATE PROF_API_IMPL HIP_PROF_HIP_API_STRING=1 __HIP_PLATFORM_AMD__=1)
-
+    runCoreUnitTests
+    PUBLIC AMD_INTERNAL_BUILD
+    PRIVATE PROF_API_IMPL HIP_PROF_HIP_API_STRING=1 __HIP_PLATFORM_AMD__=1)
 target_link_libraries(
-    runCoreUnitTests PRIVATE rocprofiler_tool test_hsatool_library ${AQLPROFILE_LIB} hsa-runtime64::hsa-runtime64
+    runCoreUnitTests PRIVATE rocprofiler_tool test_hsatool_library ${AQLPROFILE_LIB} hsa-runtime64::hsa-runtime64
    GTest::gtest GTest::gtest_main stdc++fs ${PCIACCESS_LIBRARIES})
-
 add_dependencies(tests runCoreUnitTests)

 install(TARGETS runCoreUnitTests RUNTIME
         DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/${PROJECT_NAME}/tests COMPONENT tests)
-add_test(AllTests runCoreUnitTests)
\ No newline at end of file
+
+# add_test(AllTests runCoreUnitTests)
+include(GoogleTest)
+
+set(GTEST_DISCOVER_TESTS_TARGET runCoreUnitTests)
+set(GTEST_DISCOVER_TESTS_LABELS "v2" "unittests")
+set(GTEST_DISCOVER_TESTS_ENVIRONMENT ${ROCPROFILER_MEMCHECK_PRELOAD_ENV})
+configure_file(
+    ${PROJECT_SOURCE_DIR}/cmake_modules/Templates/gtest_discover_tests_properties.cmake.in
+    ${CMAKE_CURRENT_BINARY_DIR}/runUnitTests_TestProperties.cmake @ONLY)
+
+if(NOT ROCPROFILER_MEMCHECK MATCHES "(Thread|Address)Sanitizer")
+    gtest_discover_tests(runCoreUnitTests)
+    set_property(
+        DIRECTORY ${CMAKE_CURRENT_LIST_DIR}
+        APPEND
+        PROPERTY TEST_INCLUDE_FILES
+                 ${CMAKE_CURRENT_BINARY_DIR}/runUnitTests_TestProperties.cmake)
+else()
+    gtest_add_tests(
+        TARGET runCoreUnitTests
+        SOURCES "${runCoreUnitTests_SOURCES}"
+        TEST_LIST runUnitTests_TESTS)
+    include(${CMAKE_CURRENT_BINARY_DIR}/runUnitTests_TestProperties.cmake)
+endif()
+
+# for the *_FilePlugin tests
+if(NOT EXISTS "${PROJECT_BINARY_DIR}/test-output")
+    file(MAKE_DIRECTORY "${PROJECT_BINARY_DIR}/test-output")
+endif()
\ No newline at end of file
diff --git a/tests-v2/unittests/profiler/CMakeLists.txt b/tests-v2/unittests/profiler/CMakeLists.txt
index 393be7f1..e84d1a78 100644
--- a/tests-v2/unittests/profiler/CMakeLists.txt
+++ b/tests-v2/unittests/profiler/CMakeLists.txt
@@ -1,3 +1,25 @@
+# ##############################################################################
+# Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+# ##############################################################################
+
 # Setup unit testing env

 find_library(PCIACCESS_LIBRARIES pciaccess REQUIRED)
@@ -63,8 +85,8 @@ file(GLOB CORE_COUNTERS_PARENT_SRC_FILES ${PROJECT_SOURCE_DIR}/src/core/counters
 file(GLOB CORE_COUNTERS_METRICS_SRC_FILES
      ${PROJECT_SOURCE_DIR}/src/core/counters/metrics/*.cpp)
 file(GLOB CORE_COUNTERS_MMIO_SRC_FILES ${PROJECT_SOURCE_DIR}/src/core/counters/mmio/*.cpp)

-add_executable(
-    runUnitTests
+
+set(runUnitTests_SOURCES
     ${CMAKE_CURRENT_SOURCE_DIR}/profiler_gtest.cpp
     ${CORE_MEMORY_SRC_FILES}
     ${CORE_SESSION_SRC_FILES}
@@ -84,27 +106,56 @@ add_executable(
     ${CORE_COUNTERS_METRICS_SRC_FILES}
     ${CORE_COUNTERS_MMIO_SRC_FILES}
     ${CORE_COUNTERS_PARENT_SRC_FILES}
-    ${CORE_PC_SAMPLING_FILES}
-    ${GTEST_MAIN_SRC_FILE}
-)
+    ${CORE_PC_SAMPLING_FILES})
+
+add_executable(runUnitTests ${runUnitTests_SOURCES})

 target_include_directories(
-    runUnitTests
-    PRIVATE ${PROJECT_SOURCE_DIR} ${PROJECT_SOURCE_DIR}/src ${PROJECT_SOURCE_DIR}/inc
+    runUnitTests
+    PRIVATE ${PROJECT_SOURCE_DIR} ${PROJECT_SOURCE_DIR}/src ${PROJECT_SOURCE_DIR}/inc
            ${CMAKE_CURRENT_SOURCE_DIR} ${PROJECT_BINARY_DIR} ${PROJECT_BINARY_DIR}/rocprofiler)

 target_compile_definitions(
-    runUnitTests
-    PUBLIC AMD_INTERNAL_BUILD
-    PRIVATE PROF_API_IMPL HIP_PROF_HIP_API_STRING=1 __HIP_PLATFORM_AMD__=1)
+    runUnitTests
+    PUBLIC AMD_INTERNAL_BUILD
+    PRIVATE PROF_API_IMPL HIP_PROF_HIP_API_STRING=1 __HIP_PLATFORM_AMD__=1)

 target_link_libraries(
-    runUnitTests PRIVATE rocprofiler_tool ${AQLPROFILE_LIB} hsa-runtime64::hsa-runtime64
-                         GTest::gtest GTest::gtest_main stdc++fs ${PCIACCESS_LIBRARIES} ${GDB} dw elf c dl)
+    runUnitTests PRIVATE rocprofiler_tool ${AQLPROFILE_LIB} hsa-runtime64::hsa-runtime64
+                         GTest::gtest GTest::gtest_main stdc++fs ${PCIACCESS_LIBRARIES} ${GDB} dw elf c dl)

 add_dependencies(tests runUnitTests)

 install(TARGETS runUnitTests RUNTIME
         DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/${PROJECT_NAME}/tests COMPONENT tests)
-add_test(AllTests runUnitTests)
\ No newline at end of file
+
+# add_test(AllTests runUnitTests)
+include(GoogleTest)
+
+set(GTEST_DISCOVER_TESTS_TARGET runUnitTests)
+set(GTEST_DISCOVER_TESTS_LABELS "v2" "unittests")
+set(GTEST_DISCOVER_TESTS_ENVIRONMENT ${ROCPROFILER_MEMCHECK_PRELOAD_ENV})
+configure_file(
+    ${PROJECT_SOURCE_DIR}/cmake_modules/Templates/gtest_discover_tests_properties.cmake.in
+    ${CMAKE_CURRENT_BINARY_DIR}/runUnitTests_TestProperties.cmake @ONLY)
+
+if(NOT ROCPROFILER_MEMCHECK MATCHES "(Thread|Address)Sanitizer")
+    gtest_discover_tests(runUnitTests)
+    set_property(
+        DIRECTORY ${CMAKE_CURRENT_LIST_DIR}
+        APPEND
+        PROPERTY TEST_INCLUDE_FILES
+                 ${CMAKE_CURRENT_BINARY_DIR}/runUnitTests_TestProperties.cmake)
+else()
+    gtest_add_tests(
+        TARGET runUnitTests
+        SOURCES "${runUnitTests_SOURCES}"
+        TEST_LIST runUnitTests_TESTS)
+    include(${CMAKE_CURRENT_BINARY_DIR}/runUnitTests_TestProperties.cmake)
+endif()
+
+# for the *_FilePlugin tests
+if(NOT EXISTS "${PROJECT_BINARY_DIR}/test-output")
+    file(MAKE_DIRECTORY "${PROJECT_BINARY_DIR}/test-output")
+endif()
\ No newline at end of file
diff --git a/tests-v2/unittests/profiler/profiler_gtest.cpp b/tests-v2/unittests/profiler/profiler_gtest.cpp
index 6145cd04..9827d560 100644
--- a/tests-v2/unittests/profiler/profiler_gtest.cpp
+++ b/tests-v2/unittests/profiler/profiler_gtest.cpp
@@ -23,6 +23,7 @@
 #include 
 #include 
 #include 
+#include 
 #include "api/rocprofiler_singleton.h"
 #include "core/memory/generic_buffer.h"

@@ -36,12 +37,9 @@
  * ###############################################
  */
-
 void buffer_callback_fun(const rocprofiler_record_header_t* begin, const rocprofiler_record_header_t* end,
-                         rocprofiler_session_id_t session_id, rocprofiler_buffer_id_t buffer_id) {
-  std::cout << "buffer callback" << std::endl;
-}
+                         rocprofiler_session_id_t session_id, rocprofiler_buffer_id_t buffer_id) {}

 /*
  * ###############################################
@@ -51,7 +49,7 @@ void buffer_callback_fun(const rocprofiler_record_header_t* begin,

 // A lot have changed in the class, since this test was written
 // Need to rewrite all the test cases again.
-TEST(WhenAddingARecordToBuffer, DISABLED_RecordGetsAddedSuccefully) {
+TEST(WhenAddingARecordToBuffer, RecordGetsAddedSuccessfully) {
   Memory::GenericBuffer* buffer = new Memory::GenericBuffer(
       rocprofiler_session_id_t{0}, rocprofiler_buffer_id_t{0}, 0x8000, buffer_callback_fun);

@@ -337,4 +335,4 @@ TEST(WhenTrucatingKokkossKernelNames, KernelNameGetsTruncatedProperly) {
   std::string trunkated_name = rocprofiler::truncate_name(long_kernel_name);

   EXPECT_EQ("hip_parallel_launch_local_memory", trunkated_name);
-}
\ No newline at end of file
+}