diff --git a/hatchet/graphframe.py b/hatchet/graphframe.py index 822720a8..d041597b 100644 --- a/hatchet/graphframe.py +++ b/hatchet/graphframe.py @@ -4,29 +4,28 @@ # SPDX-License-Identifier: MIT import copy +import json import sys import traceback - from collections import defaultdict -import pandas as pd -import numpy as np import multiprocess as mp -import json +import numpy as np +import pandas as pd -from .node import Node -from .graph import Graph +from .external.console import ConsoleRenderer from .frame import Frame +from .graph import Graph +from .node import Node from .query import ( - is_hatchet_query, + AbstractQuery, ObjectQuery, - parse_string_dialect, QueryEngine, - AbstractQuery, + is_hatchet_query, + parse_string_dialect, ) -from .external.console import ConsoleRenderer -from .util.dot import trees_to_dot from .util.deprecated import deprecated_params +from .util.dot import trees_to_dot try: from .cython_modules.libs import graphframe_modules as _gfm_cy @@ -110,6 +109,35 @@ def from_hpctoolkit(dirname): return HPCToolkitReader(dirname).read() + @staticmethod + def from_hpctoolkit_latest( + dirname: str, + max_depth: int = None, + min_percentage_of_application_time: int = None, + min_percentage_of_parent_time: int = None, + ): + """ + Read an HPCToolkit database directory into a new GraphFrame + + Arguments: + dirname (str): directory of an HPCToolkit performance database + max_depth (int): maximum depth that nodes in the CCT can have to be imported in Hatchet + min_percentage_of_application_time (int): minimum percentage of application time that nodes in the CCT must have to be imported in Hatchet + min_percentage_of_parent_time (int): minimum percentage of parent time that nodes in the CCT must have to be imported in Hatchet + + Returns: + (GraphFrame): new GraphFrame containing HPCToolkit profile data + """ + # import this lazily to avoid circular dependencies + from .readers.hpctoolkit_reader_latest import HPCToolkitReaderLatest + + return 
HPCToolkitReaderLatest( + dirname, + max_depth=max_depth, + min_application_percentage_time=min_percentage_of_application_time, + min_parent_percentage_time=min_percentage_of_parent_time, + ).read() + @staticmethod def from_caliper(filename_or_stream, query=None): """Read in a Caliper .cali or .json file. diff --git a/hatchet/readers/hpctoolkit_reader_latest.py b/hatchet/readers/hpctoolkit_reader_latest.py new file mode 100644 index 00000000..237d47dd --- /dev/null +++ b/hatchet/readers/hpctoolkit_reader_latest.py @@ -0,0 +1,432 @@ +# Copyright 2017-2023 Lawrence Livermore National Security, LLC and other +# Hatchet Project Developers. See the top-level LICENSE file for details. +# +# SPDX-License-Identifier: MIT + +import os +import re +import struct +from typing import Dict, Union + +import pandas as pd + +from hatchet.frame import Frame +from hatchet.graph import Graph +from hatchet.graphframe import GraphFrame +from hatchet.node import Node + + +def safe_unpack( + format: str, data: bytes, offset: int, index: int = None, index_length: int = None +) -> tuple: + length = struct.calcsize(format) + if index: + offset += index * (length if index_length is None else index_length) + return struct.unpack(format, data[offset : offset + length]) + + +def read_string(data: bytes, offset: int) -> str: + result = "" + while True: + (letter,) = struct.unpack(" None: + self._dir_path = dir_path + self._max_depth = max_depth + self._application_percentage = min_application_percentage_time + self._parent_percentage = min_parent_percentage_time + + self._meta_file = None + self._profile_file = None + + self._functions = {} + self._source_files = {} + self._load_modules = {} + self._metric_descriptions = {} + self._summary_profile = {} + + self._time_metric = None + self._inclusive_metrics = {} + self._exclusive_metrics = {} + + self._cct_roots = [] + self._metrics_table = [] + + for file_path in os.listdir(self._dir_path): + if file_path.split(".")[-1] == "db": + file_path = 
os.path.join(self._dir_path, file_path) + with open(file_path, "rb") as file: + file.seek(10) + db = file.read(4) + try: + format = db.decode("ascii") + if format == "meta": + self._meta_file = file_path + elif format == "prof": + self._profile_file = file_path + except Exception: + pass + + if self._meta_file is None: + raise ValueError("ERROR: meta.db not found.") + + if self._profile_file is None: + raise ValueError("ERROR: profile.db not found.") + + def _read_metric_descriptions(self) -> None: + with open(self._meta_file, "rb") as file: + file.seek(FILE_HEADER_OFFSET + 4 * 8) + formatMetrics = " Dict[str, str]: + if pFile not in self._source_files: + (pPath,) = safe_unpack( + " Dict[str, str]: + if pModule not in self._load_modules: + (pPath,) = safe_unpack( + " Dict[str, Union[str, int]]: + if pFunction not in self._functions: + (pName, pModule, offset, pFile, line) = safe_unpack( + " Node: + node = Node(Frame(frame), parent=parent, hnid=ctxId, depth=depth) + if parent is None: + self._cct_roots.append(node) + else: + parent.add_child(node) + node_value = { + "node": node, + "name": ( + # f"{frame['type']}: {frame['name']}" + frame["name"] + if frame["name"] != 1 + else "entry" + ), + } + + if ctxId in self._summary_profile: + node_value.update(self._summary_profile[ctxId]) + + self._metrics_table.append(node_value) + + return node + + def _parse_context( + self, + current_offset: int, + total_size: int, + parent: Node, + meta_db: bytes, + parent_time: int, + ) -> None: + + final_offset = current_offset + total_size + + while current_offset < final_offset: + (szChildren, pChildren, ctxId, _, lexicalType, nFlexWords) = safe_unpack( + " None: + + with open(self._profile_file, "rb") as file: + file.seek(FILE_HEADER_OFFSET) + formatProfileInfos = " None: + with open(self._meta_file, "rb") as file: + meta_db = file.read() + + (pContext,) = safe_unpack(" GraphFrame: + self._read_metric_descriptions() + self._read_summary_profile() + return self._read_cct() diff 
--git a/hatchet/tests/data/hpctoolkit-gamess/FORMATS.md b/hatchet/tests/data/hpctoolkit-gamess/FORMATS.md new file mode 100644 index 00000000..dd4ec9cc --- /dev/null +++ b/hatchet/tests/data/hpctoolkit-gamess/FORMATS.md @@ -0,0 +1,1163 @@ +A full HPCToolkit database consists of the following files and directories: + + database/ + |-- FORMATS.md This file + |-- metrics/ Taxonomic metric descriptions for analysis presentations + | |-- METRICS.yaml.ex Documentation for the metric taxonomy YAML format + | `-- default.yaml Default metric taxonomy, suitable for most cases + |-- meta.db Properties of the measured application execution + |-- profile.db Performance measurements arranged by application thread + |-- cct.db Performance measurements arranged by calling context + |-- trace.db Time-centric execution traces + `-- src/ Relevant application source files + +This file describes the format of the `*.db` files in an HPCToolkit database. +See `metrics/METRICS.yaml.ex` for a description of the format for defining +performance metric taxonomies. + +Table of contents: + - [Common properties for all formats (READ FIRST)](#common-properties-for-all-formats-read-first) + - [`meta.db` v4.0](#metadb-version-40) + - [`profile.db` v4.0](#profiledb-version-40) + - [`cct.db` v4.0](#cctdb-version-40) + - [`trace.db` v4.0](#tracedb-version-40) + +* * * + +Common properties for all formats (READ FIRST) +============================================== + +### Formats legend ### +[Formats legend]: #formats-legend + +All `*.db` formats are custom binary formats comprised of structures at various +positions within the file. These structures are described by tables of the +following form, where each row (except the first and last) describe a field in +the structure in a notation similar to C's `struct`: + + Hex | Type | Name | Ver. 
| Description (see the [Formats legend]) + ---:| ---- | ---- | ---- | ---------------------------------------------------- +`A 4`|| **ALIGNMENT** || See [Alignment properties] +`00:`|Ty|`field1` |4.0| Description of the value in `field1` +| | +`13:`|Ty2|`field2` |4.1| Description of the value in `field2` +`15:`|| **END** || Extendable, see [Reader compatibility] + + - The initial **ALIGNMENT** row indicates the minimum alignment of the absolute + file offset of the beginning of the structure. For instance, `A 4` in the + above table indicates that `field1` is placed on a 4-byte boundary within the + file. See [Alignment properties] for more details. + + - **Hex** lists the constant byte-offset from the beginning of the structure to + the beginning of the field, in hexadecimal. Fields are normally packed with + no padding in-between; if there may be a gap between fields, an empty row is + inserted into the table for readability. If the offset is not constant for a + field no value is listed. + + - **Type** lists the interpretation of the field's bytes, along with + (implicitly) its size. The standard types are as follows: + + u`N` is an unsigned integer of `N` bits. + Multi-byte integers are laid out in little-endian order. + + f64 is an IEEE 754 double-precision floating-point number, laid out in + little-endian order (ie. sign byte comes last). + + `Ty`[`N`] is an array of `N` elements of type `Ty`. There is no padding + between elements (ie. the stride is equal to the total size of `Ty`). + `N` may refer to a sibling field, or may be `...` if the size is defined + in the **Description**. + + `Ty`* is a u64, but additionally the value is the absolute byte-offset of a + structure of type `Ty` (ie. a pointer to `Ty`). The value is aligned to the + minimum alignment of `Ty`, as defined in [Alignment properties]. + + char* is a u64, but additionally the value is the absolute byte-offset of + the start byte of a null-terminated UTF-8 string. 
The value is generally + unaligned. + + - **Name** gives a short name to refer to the field in further descriptions. + + - **Ver.** lists the version in which the field first appeared. Note that if the + offset of the field changed over a major version, this number will not be + updated to match the new major version. + + - **Description** describes the value of the field. Longer and additional + descriptions are listed after the table in separate paragraphs or lists. + + - The final **END** row lists the total size of the structure. If this + structure is used in an array, this is the offset of the first field in the + following array element. The **Description** indicates whether the structure + may be modified in later minor versions (expandable) or not (fixed), see + [Reader compatibility] for more details. + + +### Common file structure ### +[Common file structure]: #common-file-structure + +All `*.db` files are structured as a file header containing format identifiers +and version information, and references to "sections" in the rest of the file. +The file header always has the following structure: + + Hex | Type | Name | Description (see the [Formats legend]) + ---:| ---- | -------- | ------------------------------------------------------- +`00:`|u8[10]|`magic` | Common format identifier, reads `HPCTOOLKIT` in ASCII +`0a:`|u8[4]|`format` | Specific format identifier +`0e:`|u8|`majorVersion`| Common major version, currently 4 +`0f:`|u8|`minorVersion`| Specific minor version +`10:`|u64|`szSection1` | Total size of section 1 +`18:`|u8*|`pSection1` | Pointer to the beginning of section 1 +`20:`|u64|`szSection2` | Total size of section 2 +`28:`|u8*|`pSection2` | Pointer to the beginning of section 2 +`30:`| | ...etc... | + +`majorVersion` and `minorVersion` indicate the version of the writer of the +file, see [Reader compatibility] for implications. 
+ +`format` identifies the specific format for the file, and always reads as a +4-character ASCII string (no terminator). Specifically: + - `meta` for [`meta.db` v4.0](#metadb-version-40) + - `prof` for [`profile.db` v4.0](#profiledb-version-40) + - `ctxt` for [`cct.db` v4.0](#cctdb-version-40) + - `trce` for [`trace.db` v4.0](#tracedb-version-40) + +Additional notes: + - The structure of file headers, including the value for `magic`, does not + change across major versions. + - The values for `format` follow the same rules as enumerations as defined in + [Reader compatibility]. + - `majorVersion` is consistent across all `*.db` files in one database. + `minorVersion` is not in general. + +The remainder of the file header is made up of pointers to and sizes of +contiguous regions of the file, hereafter termed sections. Many but not all +structures reside in a section. For simplicity, the top-level file header for +each format is written with a shorthand form of the structure table: + + Hex | Name | Ver. | Section (see the [Common file structure]) + ---:| ----------- | ---- | ---------------------------------------------------- +`00:`| || See [Common file structure] +`10:`|`{sz,p}Section1`|4.0| Short description of section 1 +`20:`|`{sz,p}Section2`|4.2| Short description of section 2 +`30:`| ...etc... + +The names `Section1` and `Section2` are replaced with more descriptive +identifiers in practice. + +Each section generally starts with a section header structure, which indicates +what is in the section and where it is located. Sections are generally grouped +based on related information, and generally contain little padding to facilitate +reading a large blob of related information. + +Additional notes: + - `p*` fields are aligned based on the alignment of the section header + structure, unless otherwise noted. 
+ - The order of sections in the file and the order they are listed in the header + may differ, do not rely on any section ordering properties without checking + first. + + +### Alignment properties ### +[Alignment properties]: #alignment-properties + +The `*.db` formats are designed for efficient access through memory-mapped +segments, to support this fields are aligned to improve access for +performance-critical readers. All types have a minimum alignment that is +respected (unless otherwise noted), defined as follows: + + - Integers (u`N`) have a minimum alignment equal to their width (`N/8`). + For example, u32 is 4-byte aligned. + - Floating point numbers (f64) are 8-byte aligned. + - Arrays (`Ty`[`N`]) have the same alignment as their elements (`Ty`). In this + case the total size of `Ty` is a multiple of the alignment of `Ty`, so there + is no implicit padding between elements. + - Pointers (`Ty`* and char*) are 8-byte aligned (same as u64). + - Structures listed with structure tables have the alignment listed in their + initial **ALIGNMENT** row. In general this is at least the alignment of all + contained fields. + +Note that 8-byte alignment is the maximum possible alignment. + +The following fields are not always aligned, see the notes in their defining +sections for recommendations on how to achieve performance in these cases: + - Performance data arrays in [Profile-Major][PSVB] and + [Context-Major Sparse Value Block][CSVB], and + - The array in a [trace line][THsec]. + +[PSVB]: #profile-major-sparse-value-block +[CSVB]: #context-major-sparse-value-block +[THsec]: #tracedb-trace-headers-section + + +### Reader compatibility ### +[Reader compatibility]: #reader-compatibility + +The `*.db` formats are also designed for high compatibility between readers and +writers as both continue to update. Readers are able to determine the level of +compatibility needed or available by inspecting the major and minor version +numbers in the [Common file structure]. 
Specifically, we define two kinds of +compatibility, taken from the reader's perspective: + - *Backward compatibility*, when the reader (eg. v4.5) is a newer version than + the writer of the file (eg. v4.3). + - *Forward compatibility*, when the reader (eg. v4.5) is an older version than + the writer of the file (eg. v4.7). + +Backward compatibility is implemented by the reader when required, in this case +the reader simply does not access fields that were not present in the listed +version. Note that the offsets of fields may change across major versions, the +reader is responsible for implementing any differences. + +Forward compatibility is implemented by the format specification and is only +available across minor versions. For readers, this means: + - All fields supported by the reader will be accessible, but fields added + later than the reader's supported version will not be accessible. + - Readers must ignore or error unknown enumeration values. This will not affect + the availability of any fields. The reader is responsible for synthesizing a + fallback result from available data. + - Readers must always use the saved structure size for "expandable" structures + (described below) as the stride for arrays, rather than the size in the + reader's supported version. + +"Expandable" structures may increase in size over the course of minor versions, +the converse are "fixed" structures which do not. The status of any particular +structure is noted in the **END** row of the structure's table. + +The format specification relies on the following restrictions to preserve +forward compatibility to the greatest extent possible: + - Fields and enumeration values must never be removed, replaced, or change + meaning in ways that would break older readers. + - The presence or interpretation of fields must not depend on enumeration + values. 
+ - Fields may be added in any previously uninterpreted region: in "gaps" between + previous fields or at the end of the structure if it is expandable. + - Enumeration values may be added in previously unallocated values. + +A breakage of any of these restrictions requires a major version bump, adding +new fields or enumeration values requires a minor version bump. + +* * * + +`meta.db` version 4.0 +=================================== +`meta.db` is a binary file listing various metadata for the database, including: + - Performance metrics for the metrics measured at application run-time, + - Calling contexts for metric values listed in sibling `*.db` files, and + - A human-readable description of the database's contents. + +The `meta.db` file starts with the following header: + + Hex | Name | Ver. | Section (see the [Common file structure]) + ---:| ------------ | ---- | --------------------------------------------------- +`00:`| || See [Common file structure] +`10:`|`{sz,p}General` |4.0| [General Properties][GPsec] +`20:`|`{sz,p}IdNames` |4.0| [Identifier Names][INsec] +`30:`|`{sz,p}Metrics` |4.0| [Performance Metrics][PMsec] +`40:`|`{sz,p}Context` |4.0| [Context Tree][CTsec] +`50:`|`{sz,p}Strings` |4.0| Common String Table +`60:`|`{sz,p}Modules` |4.0| [Load Modules][LMsec] +`70:`|`{sz,p}Files` |4.0| [Source Files][SFsec] +`80:`|`{sz,p}Functions`|4.0| [Functions][Fnsec] +`90:`| **END** || Extendable, see [Reader compatibility] + +[GPsec]: #metadb-general-properties-section +[INsec]: #metadb-hierarchical-identifier-names-section +[PMsec]: #metadb-performance-metrics-section +[CTsec]: #metadb-context-tree-section +[LMsec]: #metadb-load-modules-section +[SFsec]: #metadb-source-files-section +[Fnsec]: #metadb-functions-section + +The `meta.db` file ends with an 8-byte footer, reading `_meta.db` in ASCII. 
+ +Additional notes: + - The Common String Table section has no particular interpretation; it is used + as a section to store strings for the [Load Modules section][LMsec], + the [Source Files section][SFsec], and the [Functions section][Fnsec]. + + +`meta.db` General Properties section +----------------------------------------- +The General Properties section starts with the following header: + + Hex | Type | Name | Ver. | Description (see the [Formats legend]) + ---:| ---- | -------- | ---- | ----------------------------------------------- +`A 8`|| **ALIGNMENT** || See [Alignment properties] +`00:`|char*|`pTitle` |4.0| Title of the database. May be provided by the user. +`08:`|char*|`pDescription`|4.0| Human-readable Markdown description of the database. +`10:`|| **END** || Extendable, see [Reader compatibility] + +`*pDescription` provides information about the measured execution and subsequent +analysis that may be of interest to users. The exact layout and the information +contained may change without warning. + +Additional notes: + - The strings pointed to by `pTitle` and `pDescription` are fully contained + within the General Properties section, including the terminating NUL byte. + + +`meta.db` Hierarchical Identifier Names section +----------------------------------------------- +> In future versions of HPCToolkit new functionality may be added that requires +> new values for the `kind` field of a [Hierarchical Identifier Tuple][HIT]. +> The Hierarchical Identifier Names section provides human-readable names for +> all possible values for forward compatibility. + +The Hierarchical Identifier Names section starts with the following header: + + Hex | Type | Name | Ver. 
| Description (see the [Formats legend]) + ---:| ---- | ------ | ---- | -------------------------------------------------- +`A 8`|| **ALIGNMENT** || See [Alignment properties] +`00:`|{Names}*|`ppNames`|4.0| Human-readable names for Identifier kinds +`08:`|u8|`nKinds` |4.0| Number of names listed in this section +`09:`|| **END** || Extendable, see [Reader compatibility] + +{Names} above refers to a char*[`nKinds`] structure, ie. an array of `nKinds` +pointers to human-readable names for Identifier kinds. `ppNames[kind]` is the +human-readable name for the Identifier kind `kind`, where `kind` is part of a +[Hierarchical Identifier Tuple][HIT]. + +[HIT]: #profiledb-hierarchical-identifier-tuple-section + +Additional notes: + - The strings pointed to by `ppNames[...]` are fully contained within the + Hierarchical Identifier Names section, including the terminating NUL. + + +`meta.db` Performance Metrics section +------------------------------------- +> The Performance Metrics section lists the performance metrics measured at +> application runtime, and the analysis performed by HPCToolkit to generate the +> metric values within `profile.db` and `cct.db`. In summary: +> - Performance measurements for an application thread are first attributed to +> contexts, listed in the [Context Tree section](#metadb-context-tree-section). +> These are the raw metric values. +> - Propagated metric values are generated for each context by summing values +> attributed to children contexts, within the measurements for a single +> application thread. Which children are included in this sum is indicated +> by the `*pScope` {PS} structure. +> - Summary statistic values are generated for each context from the +> propagated metric values for each application thread, by first applying +> `*pFormula` to each value and then combining via `combine`. This generates +> a single statistic value for each context. 
+ +The Performance Metrics section starts with the following header: + + Hex | Type | Name | Ver. | Description (see the [Formats legend]) + ---:| ---- | ---------------- | ---- | ---------------------------------------- +`A 8`|| **ALIGNMENT** || See [Alignment properties] +`00:`|{MD}[`nMetrics`]*|`pMetrics`|4.0| Descriptions of performance metrics +`08:`|u32|`nMetrics` |4.0| Number of performance metrics +`0c:`|u8|`szMetric` |4.0| Size of the {MD} structure, currently 32 +`0d:`|u8|`szScopeInst` |4.0| Size of the {PSI} structure, currently 16 +`0e:`|u8|`szSummary` |4.0| Size of the {SS} structure, currently 24 +| | +`10:`|{PS}[`nScopes`]*|`pScopes` |4.0| Descriptions of propagation scopes +`18:`|u16|`nScopes` |4.0| Number of propagation scopes +`1a:`|u8|`szScope` |4.0| Size of the {PS} structure, currently 16 +`1b:`|| **END** || Extendable, see [Reader compatibility] + +{MD} above refers to the following sub-structure: + + Hex | Type | Name | Ver. | Description (see the [Formats legend]) + ---:| ---- | ----------------------- | ---- | ----------------------------------- +`A 8`|| **ALIGNMENT** || See [Alignment properties] +`00:`|char*|`pName` |4.0| Canonical name for the metric +`08:`|{PSI}[`nScopeInsts`]*|`pScopeInsts`|4.0| Instantiated propagated sub-metrics +`10:`|{SS}[`nSummaries`]*|`pSummaries` |4.0| Descriptions of generated summary statistics +`18:`|u16|`nScopeInsts` |4.0| Number of instantiated sub-metrics for this metric +`1a:`|u16|`nSummaries` |4.0| Number of summary statistics for this metric +| | +`20:`|| **END** || Extendable, see [Reader compatibility] + +{PSI} above refers to the following sub-structure: + + Hex | Type | Name | Ver. 
| Description (see the [Formats legend]) + ---:| ---- | --------- | ---- | -------------------------------------------------- +`A 8`|| **ALIGNMENT** || See [Alignment properties] +`00:`|{PS}*|`pScope` |4.0| Propagation scope instantiated +`08:`|u16|`propMetricId` |4.0| Unique identifier for propagated metric values +| | +`10:`|| **END** || Extendable, see [Reader compatibility] + +{SS} above refers to the following sub-structure: + + Hex | Type | Name | Ver. | Description (see the [Formats legend]) + ---:| ---- | ------ | ---- | -------------------------------------------------- +`A 8`|| **ALIGNMENT** || See [Alignment properties] +`00:`|{PS}*|`pScope` |4.0| Propagation scope summarized +`08:`|char*|`pFormula` |4.0| Canonical unary function used for summary values +`10:`|u8|`combine` |4.0| Combination n-ary function used for summary values +| | +`12:`|u16|`statMetricId`|4.0| Unique identifier for summary statistic values +| | +`18:`|| **END** || Extendable, see [Reader compatibility] + +The combination function `combine` is an enumeration with the following possible +values (the name after `/` is the matching name for `inputs:combine` in +METRICS.yaml): + - `0/sum`: Sum of input values + - `1/min`: Minimum of input values + - `2/max`: Maximum of input values + +> As noted before, propagated metric values are generated by summing the +> measured metric values from some or all of the descendants of a context, as +> determined by the "propagation scope." The fields of the {PS} structure +> describe the function mapping from each context to the subset of its +> descendants included in the sum. While presentable metric values can be +> produced based only on the `*pScopeName` and METRICS.yaml, more complex +> analysis can be performed if the propagation scope is defined in the meta.db. + +{PS} above refers to the following sub-structure: + + Hex | Type | Name | Ver. 
| Description (see the [Formats legend]) + ---:| ---- | --------- | ---- | ----------------------------------------------- +`A 8`|| **ALIGNMENT** || See [Alignment properties] +`00:`|char*|`pScopeName` |4.0| Name of the propagation scope +`08:`|u8|`type` |4.0| Type of propagation scope described +`09:`|u8|`propagationIndex`|4.0| Index of this propagation's propagation bit +| | +`10:`|| **END** || Extendable, see [Reader compatibility] + +> The propagation scope's name `*pScopeName` may be any string, however to aid +> writing and maintaining metric taxonomies (see METRICS.yaml) the propagation +> scope for a name rarely changes meaning. Regardless, readers are still +> strongly encouraged to use the definition provided by other fields described +> below to perform analysis requiring any non-trivial understanding of the +> propagation scope. + +The propagation scope `type` is an enumeration with the following values: + - `0`: Custom propagation scope, not defined in the meta.db. + + - `1`: Standard "point" propagation scope. No propagation occurs, all metric + values are recorded as measured. + + > The canonical `*pScopeName` for this case is "point". + + - `2`: Standard "execution" propagation scope. Propagation always occurs, the + propagated value sums values measured from all descendants without exception. + + > The canonical `*pScopeName` for this case is "execution". This case is + > often used for inclusive metric costs. + + - `3`: Transitive propagation scope. Propagation occurs from every context to + its parent when the `propagationIndex`th bit is set in the + [context's `propagation` bitmask][CT], and to further ancestors transitively + under the same condition. + + > An example transitive propagation scope is named "function," its propagated + > values sum measurements from all descendants not separated by a call, in + > other words it is the metric cost exclusive to the function. 
The appropriate + > bit in the `propagation` bitmask is set only if the context's `relation` + > is a caller-callee relationship (of some kind). + +[CT]: #metadb-context-tree-section + +Additional notes: + - Matching the size of the [context's `propagation` field][CT], + `propagationIndex` is always less than 16. + - The arrays pointed to by `pMetrics`, `pScopes` and `pSummaries` are fully + contained within the Performance Metrics section. + - The strings pointed to by `pName`, `pScope` and `pFormula` are fully contained + within the Performance Metrics section, including the terminating NUL. + - The format and interpretation of `*pFormula` matches the `inputs:formula` key + in METRICS.yaml, see there for details. + - `propMetricId` is the metric identifier used in `profile.db` and `cct.db` for + propagated metric values for the given `*pName` and `*pScope`. + - `statMetricId` is the metric identifier used in `profile.db` summary + profiles for summary statistic values for the given `*pName`, `*pScope`, + `*pFormula` and `combine`. + - The stride of `*pMetrics`, `*pScopes` and `*pSummaries` is `szMetric`, + `szScope` and `szSummary`, respectively. For forward compatibility these + values should be read and used whenever accessing these arrays. + +`meta.db` Load Modules section +----------------------------- +> The Load Modules section lists information about the binaries used during the +> measured application execution. + +The Load Modules section starts with the following header: + + Hex | Type | Name | Ver. 
| Description (see the [Formats legend]) + ---:| ---- | ---- | ---- | ---------------------------------------------------- +`A 8`|| **ALIGNMENT** || See [Alignment properties] +`00:`|[LMS]\[`nModules`]*|`pModules`|4.0| Load modules used in this database +`08:`|u32|`nModules` |4.0| Number of load modules listed in this section +`0c:`|u16|`szModule` |4.0| Size of a [Load Module Specification][LMS], currently 16 +`0e:`|| **END** || Extendable, see [Reader compatibility] + +[LMS]: #load-module-specification + +Additional notes: + - The array pointed to by `pModules` is completely within the Load Modules + section. + - The stride of `*pModules` is `szModule`, for forwards compatibility this + should always be read and used as the stride when accessing `*pModules`. + +### Load Module Specification ### +A Load Module Specification refers to the following structure: + + Hex | Type | Name | Ver. | Description (see the [Formats legend]) + ---:| ---- | ---- | ---- | ---------------------------------------------------- +`A 8`|| **ALIGNMENT** || See [Alignment properties] +`00:`|u32|`flags` |4.0| Reserved for future use +| | +`08:`|char*|`pPath` |4.0| Full path to the associated application binary +`10:`|| **END** || Extendable, see [Reader compatibility] + +Additional notes: + - The string pointed to by `pPath` is completely within the + [Common String Table section](#metadb-version-40), including the terminating + NUL byte. + + +`meta.db` Source Files section +----------------------------- +> The Source Files section lists information about the application's source +> files, as gathered through debugging information on application binaries. + +The Source Files section starts with the following header: + + Hex | Type | Name | Ver. 
| Description (see the [Formats legend]) + ---:| ---- | ---- | ---- | ---------------------------------------------------- +`A 8`|| **ALIGNMENT** || See [Alignment properties] +`00:`|[SFS]\[`nFiles`]*|`pFiles`|4.0| Source files used in this database +`08:`|u32|`nFiles` |4.0| Number of source files listed in this section +`0c:`|u16|`szFile` |4.0| Size of a [Source File Specification][SFS], currently 16 +`0e:`|| **END** || Extendable, see [Reader compatibility] + +[SFS]: #source-file-specification + +Additional notes: + - The array pointed to by `pFiles` is completely within the Source Files + section. + - The stride of `*pFiles` is `szFile`, for forwards compatibility this should + always be read and used as the stride when accessing `*pFiles`. + +### Source File Specification ### +A Source File Specification refers to the following structure: + + Hex | Type | Name | Ver. | Description (see the [Formats legend]) + ---:| ---- | ---- | ---- | ---------------------------------------------------- +`A 8`|| **ALIGNMENT** || See [Alignment properties] +`00:`|{Flags}|`flags` |4.0| See below +| | +`08:`|char*|`pPath` |4.0| Path to the source file. Absolute, or relative to the root database directory. +`10:`|| **END** || Extendable, see [Reader compatibility] + +{Flags} refers to an u32 bitfield with the following sub-fields (bit 0 is +least significant): + - Bit 0: `copied`. If 1, the source file was copied into the database and + should always be available. If 0, the source file was not copied and thus may + need to be searched for. + - Bits 1-31: Reserved for future use. + +Additional notes: + - The string pointed to by `pPath` is completely within the + [Common String Table section](#metadb-version-40), including the terminating + NUL byte. + + +`meta.db` Functions section +------------------------------ +> The Functions section lists various named source-level constructs observed in +> the application. 
These are inclusively called "functions," however this also +> includes other named constructs (e.g. ``). +> +> Counter-intuitively, sometimes we know a named source-level construct should +> exist, but not its actual name. These are "anonymous functions," in this case +> it is the reader's responsibility to construct a reasonable name with what +> information is available. + +The Functions section starts with the following header: + + Hex | Type | Name | Ver. | Description (see the [Formats legend]) + ---:| ---- | ------ | ---- | -------------------------------------------------- +`A 8`|| **ALIGNMENT** || See [Alignment properties] +`00:`|[FS]\[`nFunctions`]*|`pFunctions`|4.0| Functions used in this database +`08:`|u32|`nFunctions` |4.0| Number of functions listed in this section +`0c:`|u16|`szFunction` |4.0| Size of a [Function Specification][FS], currently 40 +`0e:`|| **END** || Extendable, see [Reader compatibility] + +[FS]: #function-specification + +Additional notes: + - The array pointed to by `pFunctions` is completely within the Functions + section. + - The stride of `*pFunctions` is `szFunction`, for forwards compatibility this + should always be read and used as the stride when accessing `*pFunctions`. + +### Function Specification ### +A Function Specification refers to the following structure: + + Hex | Type | Name | Ver. 
| Description (see the [Formats legend]) + ---:| ---- | ---- | ---- | ---------------------------------------------------- +`A 8`|| **ALIGNMENT** || See [Alignment properties] +`00:`|char*|`pName` |4.0| Human-readable name of the function, or 0 +`08:`|[LMS]*|`pModule`|4.0| Load module containing this function, or 0 +`10:`|u64|`offset` |4.0| Offset within `*pModule` of this function's entry point +`18:`|[SFS]*|`pFile` |4.0| Source file of the function's definition, or 0 +`20:`|u32|`line` |4.0| Source line in `*pFile` of the function's definition +`24:`|u32|`flags` |4.0| Reserved for future use +`28:`|| **END** || Extendable, see [Reader compatibility] + +[LMS]: #load-module-specification +[SFS]: #source-file-specification + + +Additional notes: + - If not 0, the string pointed to by `pName` is completely within the + [Common String Table section](#metadb-version-40), including the terminating + NUL byte. + - If not 0, `pModule` points within the [Load Module section](#metadb-load-module-section). + - If not 0, `pFile` points within the [Source File section](#metadb-source-file-section). + - At least one of `pName`, `pModule` and `pFile` will not be 0. + + +`meta.db` Context Tree section +------------------------------- +> The Context Tree section lists the source-level calling contexts in which +> performance data was gathered during application runtime. 
Each context +> ({Ctx} below) represents a source-level (lexical) context, and these can be +> nested to create paths in the tree: +> +> function foo() +> loop at foo.c:12 +> line foo.c:15 +> instruction libfoo.so@0x10123 +> +> The relation between contexts may be enclosing lexical context (as above), or +> can be marked as a call of various types (see `relation` below): +> +> instruction libfoo.so@0x10123 +> [normal call to] function bar() +> line bar.c:57 +> [inlined call to] function baz() +> instruction libbar.so@0x25120 +> +> Although some patterns in the context tree are more common than others, the +> format is very flexible and will allow almost any nested structure. It is up +> to the reader to interpret the context tree as appropriate. +> +> The top of the context tree is made of "entry points" that describe how the +> application's code was called from the operating system. + +The Context Tree section starts with the following header: + + Hex | Type | Name | Ver. | Description (see the [Formats legend]) + ---:| ---- | -------- | ---- | ------------------------------------------------ +`A 8`|| **ALIGNMENT** || See [Alignment properties] +`00:`|{Entry}[`nEntryPoints`]*|`pEntryPoints`|4.0| Pointer to an array of entry point specifications +`08:`|u16|`nEntryPoints` |4.0| Number of entry points in this context tree +`0a:`|u8|`szEntryPoint` |4.0| Size of a {Entry} structure in bytes, currently 32 +`0b:`|| **END** || Extendable, see [Reader compatibility] + +{Entry} above refers to the following structure: + + Hex | Type | Name | Ver. 
| Description (see the [Formats legend]) + ---:| ---- | ----------- | ---- | --------------------------------------------- +`A 8`|| **ALIGNMENT** || See [Alignment properties] +`00:`|u64|`szChildren` |4.0| Total size of `*pChildren`, in bytes +`08:`|{Ctx}[...]*|`pChildren`|4.0| Pointer to the array of child contexts +`10:`|u32|`ctxId` |4.0| Unique identifier for this context +`14:`|u16|`entryPoint` |4.0| Type of entry point used here +| | +`18:`|char*|`pPrettyName` |4.0| Human-readable name for the entry point +`20:`|| **END** || Extendable, see [Reader compatibility] + +`entryPoint` is an enumeration of the following possible values (name in quotes +is the associated canonical `*pPrettyName`): + - `0` "unknown entry": No recognized outside caller. + + > This can occur when the unwind fails due to incomplete unwind information. + - `1` "main thread": Setup code for the main thread. + - `2` "application thread": Setup code for threads created by the application, + via `pthread_create` or similar. + +Additional notes: + - The string pointed to by `pPrettyName` is completely within the + [Common String Table section](#metadb-version-40), including the terminating + NUL byte. + +{Ctx} above refers to the following structure: + + Hex | Type | Name | Ver. 
| Description (see the [Formats legend]) + ---:| ---- | --------------- | ---- | ----------------------------------------- +`A 8`|| **ALIGNMENT** || See [Alignment properties] +`00:`|u64|`szChildren` |4.0| Total size of `*pChildren`, in bytes +`08:`|{Ctx}[...]*|`pChildren` |4.0| Pointer to the array of child contexts +`10:`|u32|`ctxId` |4.0| Unique identifier for this context +`14:`|{Flags}|`flags` |4.0| See below +`15:`|u8|`relation` |4.0| Relation this context has with its parent +`16:`|u8|`lexicalType` |4.0| Type of lexical context represented +`17:`|u8|`nFlexWords` |4.0| Size of `flex`, in u8[8] "words" (bytes / 8) +`18:`|u16|`propagation` |4.0| Bitmask for defining [propagation scopes][PMS] +| | +`20:`|u8[8]\[`nFlexWords`]|`flex`|4.0| Flexible data region, see below +` *:`|| **END** + +[PMS]: #metadb-performance-metrics-section + +`flex` contains a dynamic sequence of sub-fields, which are sequentially +"packed" into the next unused bytes at the minimum alignment. In particular: + - An u64 sub-field will always take the next full u8[8] "word" and never span + two words, but + - Two u32 sub-fields will share a single u8[8] word even if an u64 sub-field + is between them in the packing order. + +The packing order is indicated by the index on `flex`, ie. `flex[1]` is the +sub-field next in the packing order after `flex[0]`. This order still holds +even if not all fields are present for any particular instance. + +{Flags} above refers to an u8 bitfield with the following sub-fields (bit 0 is +least significant): + - Bit 0: `hasFunction`. If 1, the following sub-fields of `flex` are present: + + `flex[0]:` [FS]* `pFunction`: Function associated with this context + - Bit 1: `hasSrcLoc`. If 1, the following sub-fields of `flex` are present: + + `flex[1]:` [SFS]* `pFile`: Source file associated with this context + + `flex[2]:` u32 `line`: Associated source line in `pFile` + - Bit 2: `hasPoint`. 
If 1, the following sub-fields of `flex` are present: + + `flex[3]:` [LMS]* `pModule`: Load module associated with this context + + `flex[4]:` u64 `offset`: Associated byte offset in `*pModule` + - Bits 3-7: Reserved for future use. + +[FS]: #function-specification +[SFS]: #source-file-specification +[LMS]: #load-module-specification + +`relation` is an enumeration with the following values: + - `0`: This context's parent is an enclosing lexical context, eg. source line + within a function. Specifically, no call occurred. + - `1`: This context's parent used a typical function call to reach this + context. The parent context is the source-level location of the call. + - `2`: This context's parent used an inlined function call (ie. the call was + inlined by the compiler). The parent context is the source-level location of + the original call. + +The lexical type `lexicalType` is an enumeration with the following values: + - `0`: Function-like construct. If `hasFunction` is 1, `*pFunction` indicates + the function represented by this context. Otherwise the function for this + context is unknown (ie. an unknown function). + - `1`: Loop construct. `*pFile` and `line` indicate the source line of the + loop header. + - `2`: Source line construct. `*pFile` and `line` indicate the source line + represented by this context. + - `3`: Single instruction. `*pModule` and `offset` indicate the first byte of the + instruction represented by this context. + +Additional notes: + - `propagation` is an extra field used to assist in defining some propagation + scopes, see the [Performance Metrics section][PMS] for details. + - `ctxId` is always larger than 0, the value 0 is reserved to indicate the + global context (ie. an implicit context above and enclosing all others). + + > The global context is used to represent corner cases where there should be + > no associated context. See notes on the usage of `ctxId` for details. 
+ - The arrays pointed to by `pRoots` and `pChildren` are completely within the + Context Tree section. The size of these arrays is given in `szRoots` or + `szChildren`, in bytes to allow for a singular read of all root/child + context structures. + - `pChildren` is 0 if there are no child Contexts, `pRoots` is 0 if there are + no Contexts in this section period. `szChildren` and `szRoots` are 0 in these + cases respectively. + - `pFunction` points within the [Function section](#metadb-function-section). + - `pFile` points within the [Source File section](#metadb-source-file-section). + - `pModule` points within the [Load Module section](#metadb-load-module-section). + - The size of a single {Ctx} is dynamic but can be derived from `nFlexWords`. + For forward compatibility, readers should always read and use this to read + arrays of {Ctx} elements. + + +* * * +`profile.db` version 4.0 +======================== + +The `profile.db` is a binary file containing the performance analysis results +generated by `hpcprof`, arranged by application thread and in a compact sparse +representation. Once an application thread is chosen, the analysis result for +a particular calling context can be obtained through a simple binary search. + +The `profile.db` file starts with the following header: + + Hex | Name | Ver. | Section (see the [Common file structure]) + ---:| --------------- | ---- | ------------------------------------------------ +`00:`| || See [Common file structure] +`10:`|`{sz,p}ProfileInfos`|4.0| [Profiles Information][PIsec] +`20:`|`{sz,p}IdTuples` |4.0| [Hierarchical Identifier Tuples][HITsec] +`30:`| **END** || Extendable, see [Reader compatibility] + +[PIsec]: #profiledb-profile-info-section +[HITsec]: #profiledb-hierarchical-identifier-tuple-section + +The `profile.db` file ends with an 8-byte footer, reading `_prof.db` in ASCII. 
+ + +`profile.db` Profile Info section +--------------------------------- +> The Profile Info section lists the CPU threads and GPU streams present in the +> application execution, and references the performance *profile* measured from +> it during application runtime. Profiles may contain summary statistic values +> or propagated metric values, the first always contains summary statistic +> values from the entire application execution, intended to aid top-down +> analysis. See the comment in the [`meta.db` Performance Metrics section](#metadb-performance-metrics-section) +> for more detail on how values are calculated. + +The Profile Info section starts with the following header: + + Hex | Type | Name | Ver. | Description (see the [Formats legend]) + ---:| ---- | ----- | ---- | --------------------------------------------------- +`A 8`|| **ALIGNMENT** || See [Alignment properties] +`00:`|{PI}[`nProfiles`]*|`pProfiles`|4.0| Description for each profile +`08:`|u32|`nProfiles` |4.0| Number of profiles listed in this section +`0c:`|u8|`szProfile` |4.0| Size of a {PI} structure, currently 40 +`0d:`|| **END** || Extendable, see [Reader compatibility] + +{PI} above refers to the following structure: + + Hex | Type | Name | Ver. | Description (see the [Formats legend]) + ---:| ---- | ------- | ---- | ------------------------------------------------- +`A 8`|| **ALIGNMENT** || See [Alignment properties] +`00:`|[PSVB]|`valueBlock`|4.0| Header for the values for this profile +`20:`|[HIT]*|`pIdTuple` |4.0| Identifier tuple for this profile +`28:`|{Flags}|`flags` |4.0| See below +| | +`30:`|| **END** || Extendable, see [Reader compatibility] + +[HIT]: #profiledb-hierarchical-identifier-tuple-section +[PSVB]: #profile-major-sparse-value-block + +{Flags} above refers to a u32 bitfield with the following sub-fields (bit 0 +is least significant): + - Bit 0: `isSummary`. If 0, this profile is a performance profile of the + application thread identified exactly by `*pIdTuple`. 
If 1, this profile is a + "summary profile" containing statistics across multiple measured application + threads where `*pIdTuple` lists common identifiers. + +Additional notes: + - The array pointed to by `pProfiles` is fully contained within the Profile + Info section. + - Profiles are unordered within this section, except the first which is + always the "canonical summary profile." This is always a summary profile and + contains statistics across all measured application threads. + - `pIdTuple` points within the [Identifier Tuple section](#profiledb-hierarchical-identifier-tuple-section), + except for the canonical summary profile where `pIdTuple` is 0. + - The stride of `*pProfiles` is equal to `szProfile`, for forward compatibility + this should always be read and used as the stride when accessing `*pProfiles`. + +### Profile-Major Sparse Value Block ### +> All performance data in the `profile.db` and `cct.db` is arranged in +> internally-sparse "blocks," the variant present in `profile.db` uses one for +> each application thread (or equiv.) measured during application runtime. +> Conceptually, these are individual planes of a 3-dimensional tensor indexed by +> application thread (profile), context, and metric, in that order. + +Each Profile-Major Sparse Value Block has the following structure: + + Hex | Type | Name | Ver. | Description (see the [Formats legend]) + ---:| ---- | ----------------- | ---- | --------------------------------------- +`A 8`|| **ALIGNMENT** || See [Alignment properties] +`00:`|u64|`nValues` |4.0| Number of non-zero values +`08:`|{Val}[`nValues`]*|`pValues` |4.0| Metric-value pairs +`10:`|u32|`nCtxs` |4.0| Number of non-empty contexts +| | +`18:`|{Idx}[`nCtxs`]*|`pCtxIndices`|4.0| Mapping from contexts to values +`20:`|| **END** || Fixed, see [Reader compatibility] + +{Val} above refers to the following structure: + + Hex | Type | Name | Ver.
| Description (see the [Formats legend]) + ---:| ---- | ---- | ---- | ---------------------------------------------------- +`A 2`|| **ALIGNMENT** || See [Alignment properties] +`00:`|u16|`metricId` |4.0| Unique identifier of a metric listed in the [`meta.db`](#metadb-performance-metrics-section) +`02:`|f64|`value` |4.0| Value of the metric indicated by `metricId` +`0a:`|| **END** || Fixed, see [Reader compatibility] + +{Idx} above refers to the following structure: + + Hex | Type | Name | Ver. | Description (see the [Formats legend]) + ---:| ---- | ---- | ---- | ---------------------------------------------------- +`A 4`|| **ALIGNMENT** || See [Alignment properties] +`00:`|u32|`ctxId` |4.0| Unique identifier of a context listed in the [`meta.db`](#metadb-context-tree-section) +`04:`|u64|`startIndex`|4.0| Start index of `*pValues` attributed to the referenced context +`0c:`|| **END** || Fixed, see [Reader compatibility] + +The sub-array of `*pValues` attributed to the context referenced by `ctxId` +starts at index `startIndex` and ends just before the `startIndex` of the +following {Idx} structure, if this {Idx} is the final element of `*pCtxIndices` +(index `nCtxs - 1`) then the end is the last element of `*pValues` (index +`nValues - 1`). + +Additional notes: + - `pValues` and `pCtxIndices` point outside the sections listed in the + [`profile.db` header](#profiledb-version-40). + - The arrays pointed to by `pValues` and `pCtxIndices` are subsequent: only + padding is placed between them and `pValues < pCtxIndices`. This allows + readers to read a plane of data in a single contiguous blob from `pValues` + to `pCtxIndices + nCtxs * 0xc`. + - `metricId` is a `propMetricId` or `statMetricId` listed in the + [`meta.db` performance metrics section](#metadb-performance-metrics-section), + this is a `statMetricId` if `isSummary` is 1 and a `propMetricId` otherwise. 
+ - `ctxId` is a `ctxId` listed in the [`meta.db` context tree section](#metadb-context-tree-section), + or 0 for metric values attributed to the implicit global context. + + > The values attributed to the global context are propagated from the root + > contexts, in effect these values give an "aggregate" view of the profile + > where the context dimension has been removed. + - `*pValues` and `*pCtxIndices` are sorted by `metricId` and `ctxId`, + respectively. This allows the use of binary search (or some variant thereof) + to locate the value(s) for a particular context or metric. + - `value` and `startIndex` are not aligned, however `metricId` and `ctxId` are. + This should in general not pose a significant performance penalty. + See [Alignment properties] above. + + +`profile.db` Hierarchical Identifier Tuple section +-------------------------------------------------- +> Application threads (or equiv.) are identified in a human-readable manner via +> the use of Hierarchical Identifier Tuples. Each tuple lists a series of +> identifications for an application thread, for instance compute node and +> MPI rank. The identifications within a tuple are ordered roughly by +> "hierarchy" from largest to smallest, e.g. compute node will appear before +> MPI rank if both are present. + +The Hierarchical Identifier Tuple section contains multiple Identifier Tuples, +each of the following structure: + + Hex | Type | Name | Ver. | Description (see the [Formats legend]) + ---:| ---- | ------ | ---- | -------------------------------------------------- +`A 8`|| **ALIGNMENT** || See [Alignment properties] +`00:`|u16|`nIds` |4.0| Number of identifications in this tuple +| | +`08:`|{Id}[`nIds`]|`ids`|4.0| Identifications for an application thread +` *:`|| **END** || Extendable, see [Reader compatibility] + +{Id} above refers to the following structure: + + Hex | Type | Name | Ver.
| Description (see the [Formats legend]) + ---:| ---- | ------- | ---- | ------------------------------------------------- +`A 8`|| **ALIGNMENT** || See [Alignment properties] +`00:`|u8|`kind` |4.0| One of the values listed in the [`meta.db` Identifier Names section][INsec]. +| | +`02:`|{Flags}|`flags` |4.0| See below. +`04:`|u32|`logicalId` |4.0| Logical identifier value, may be arbitrary but dense towards 0. +`08:`|u64|`physicalId` |4.0| Physical identifier value, eg. hostid or PCI bus index. +`10:`|| **END** || Fixed, see [Reader compatibility] + +[INsec]: #metadb-hierarchical-identifier-names-section + +{Flags} above refers to an u16 bitfield with the following sub-fields (bit 0 is +least significant): + - Bit 0: `isPhysical`. If 1, the `kind` represents a physical (hardware or VM) + construct for which `physicalId` is the identifier (and `logicalId` is + arbitrary but distinct). If 0, `kind` represents a logical (software-only) + construct (and `physicalId` is `logicalId` zero-extended to 64 bits). + - Bits 1-15: Reserved for future use. + +> The name associated with the `kind` in the [`meta.db`][INsec] indicates the +> meaning of `logicalId` (if `isPhysical == 0`) and/or `physicalId` (if +> `isPhysical == 1`). The following names are in current use with the given +> meanings: +> - "NODE": Compute node, `physicalId` indicates the hostid of the node. +> - "RANK": Rank of the process (from eg. MPI), `logicalId` indicates the rank. +> - "CORE": Core the application thread was bound to, `physicalId` indicates +> the index of the first hardware thread as listed in /proc/cpuinfo. +> - "THREAD": Application CPU thread, `logicalId` indicates the index. +> - "GPUCONTEXT": Context used to access a GPU, `logicalId` indicates the index +> as given by the underlying programming model (eg. CUDA context index). +> - "GPUSTREAM": Stream/queue used to push work to a GPU, `logicalId` indicates +> the index as given by the programming model (eg. CUDA stream index). 
+> +> These names/meanings are not stable and may change without a version bump, it +> is highly recommended that readers refrain from any special-case handling of +> particular `kind` values where possible. + +Additional notes: + - While `physicalId` (when valid) lists a physical identification for an + application thread, the contained value is often too obtuse for generating + human-readable output listing many identifiers. `logicalId` is a suitable + replacement in these cases, as these values are always dense towards 0. + + +* * * +`cct.db` version 4.0 +==================== + +The `cct.db` is a binary file containing the performance analysis results +generated by `hpcprof`, arranged by calling context and in a compact sparse +representation. Once a calling context is chosen, the analysis result for +a particular metric can be obtained through a simple binary search. + +The `cct.db` file starts with the following header: + + Hex | Name | Ver. | Section (see the [Common file structure]) + ---:| ----------- | ---- | ---------------------------------------------------- +`00:`| || See [Common file structure] +`10:`|`{sz,p}CtxInfos`|4.0| [Contexts Information][CIsec] +`20:`| **END** || Extendable, see [Reader compatibility] + +[CIsec]: #cctdb-context-info-section + +The `cct.db` file ends with an 8-byte footer, reading `__ctx.db` in ASCII. + + +`cct.db` Context Info section +----------------------------- +> The Context Info section associates contexts with a "block" of performance +> data, similar to what the [`profile.db` Profile Info section](#profiledb-profile-info-section) +> does for application threads. + +The Context Info section starts with the following header: + + Hex | Type | Name | Ver.
| Description (see the [Formats legend]) + ---:| ---- | ---------- | ---- | ---------------------------------------------- +`A 8`|| **ALIGNMENT** || See [Alignment properties] +`00:`|{CI}[`nCtxs`]*|`pCtxs`|4.0| Description for each context in this database +`08:`|u32|`nCtxs` |4.0| Number of contexts listed in this section +`0c:`|u8|`szCtx` |4.0| Size of a {CI} structure, currently 32 +`0d:`|| **END** || Extendable, see [Reader compatibility] + +{CI} above refers to the following structure: + + Hex | Type | Name | Ver. | Description (see the [Formats legend]) + ---:| ---- | ------ | ---- | ------------------------------------------------- +`A 8`|| **ALIGNMENT** || See [Alignment properties] +`00:`|[CSVB]|`valueBlock`|4.0| Header for the values for this Context +`20:`|| **END** || Extendable, see [Reader compatibility] + +[CSVB]: #context-major-sparse-value-block + +Additional notes: + - The array pointed to by `pCtxs` is fully contained within the Context Info + section. + - `(*pCtxs)[ctxId]` is associated with the context with the matching `ctxId` + as listed in the [`meta.db` context tree section](#metadb-context-tree-section). + `(*pCtxs)[0]` contains the metric values of the implicit global context. + + > The values attributed to the global context are propagated from the root + > contexts, in effect these values give an "aggregate" view of the profile + > where the context dimension has been removed. + - The stride of `*pCtxs` is `szCtx`, for forward compatibility this should + always be read and used as the stride when accessing `*pCtxs`. + +### Context-Major Sparse Value Block ### +> The Context-Major Sparse Value Block is very similar in structure to the +> [Profile-Major Sparse Value Block](#profile-major-sparse-value-block), they +> differ mainly in the order in which their 3 dimensions are indexed. +> For Context-Major, this order is context, metric, application thread (profile). 
+ +Each Context-Major Sparse Value Block has the following structure: + + Hex | Type | Name | Ver. | Description (see the [Formats legend]) + ---:| ---- | ----------------- | ---- | --------------------------------------- +`A 8`|| **ALIGNMENT** || See [Alignment properties] +`00:`|u64|`nValues` |4.0| Number of non-zero values +`08:`|{Val}[`nValues`]*|`pValues` |4.0| Profile-value pairs +`10:`|u16|`nMetrics` |4.0| Number of non-empty metrics +| | +`18:`|{Idx}[`nMetrics`]*|`pMetricIndices`|4.0| Mapping from metrics to values +`20:`|| **END** || Fixed, see [Reader compatibility] + +{Val} above refers to the following structure: + + Hex | Type | Name | Ver. | Description (see the [Formats legend]) + ---:| ---- | ---- | ---- | ---------------------------------------------------- +`A 4`|| **ALIGNMENT** || See [Alignment properties] +`00:`|u32|`profIndex` |4.0| Index of a profile listed in the [`profile.db`](#profiledb-profile-info-section) +`04:`|f64|`value` |4.0| Value attributed to the profile indicated by `profIndex` +`0c:`|| **END** || Fixed, see [Reader compatibility] + +{Idx} above refers to the following structure: + + Hex | Type | Name | Ver. | Description (see the [Formats legend]) + ---:| ---- | ---- | ---- | ---------------------------------------------------- +`A 2`|| **ALIGNMENT** || See [Alignment properties] +`00:`|u16|`metricId` |4.0| Unique identifier of a metric listed in the [`meta.db`](#metadb-performance-metrics-section) +`02:`|u64|`startIndex`|4.0| Start index of `*pValues` from the associated metric +`0a:`|| **END** || Fixed, see [Reader compatibility] + +The sub-array of `*pValues` attributed to the metric referenced by `metricId` starts at +index `startIndex` and ends just before the `startIndex` of the following {Idx} +structure, if this {Idx} is the final element of `*pMetricIndices` (index +`nMetrics - 1`) then the end is the last element of `*pValues` (index +`nValues - 1`).
+ +Additional notes: + - `pValues` and `pMetricIndices` point outside the sections listed in the + [`cct.db` header](#cctdb-version-40). + - The arrays pointed to by `pValues` and `pMetricIndices` are subsequent: only + padding is placed between them and `pValues < pMetricIndices`. This allows + readers to read a plane of data in a single contiguous blob from `pValues` + to `pMetricIndices + nMetrics * 0xa`. + - `metricId` is a `propMetricId` listed in the + [`meta.db` performance metrics section](#metadb-performance-metrics-section). + Unlike `profile.db`, the `cct.db` does not include any summary profiles. + - `*pValues` and `*pMetricIndices` are sorted by `profIndex` and `metricId`, + respectively. This allows the use of binary search (or some variant thereof) + to locate the value(s) for a particular metric or application thread. + - `value` and `startIndex` are not aligned, however `profIndex` and `metricId` + are. This should in general not pose a significant performance penalty. + See [Alignment properties] above. + + +* * * +`trace.db` version 4.0 +====================== + +The `trace.db` file starts with the following header: + + Hex | Name | Ver. | Section (see the [Common file structure]) + ---:| ------------ | ---- | --------------------------------------------------- +`00:`| || See [Common file structure] +`10:`|`{sz,p}CtxTraces`|4.0| [Context Trace Headers][CTHsec] +`20:`| **END** || Extendable, see [Reader compatibility] + +[CTHsec]: #tracedb-context-trace-headers-section + +The `trace.db` file ends with an 8-byte footer, reading `trace.db` in ASCII. + +`trace.db` Context Trace Headers section +-------------------------------- + +The Context Trace Headers section starts with the following structure: + + Hex | Type | Name | Ver.
| Description (see the [Formats legend]) + ---:| ---- | --------------- | ---- | ----------------------------------------- +`A 8`|| **ALIGNMENT** || See [Alignment properties] +`00:`|{CTH}[`nTraces`]*|`pTraces`|4.0| Header for each trace +`08:`|u32|`nTraces` |4.0| Number of traces listed in this section +`0c:`|u8|`szTrace` |4.0| Size of a {CTH} structure, currently 24 +| | +`10:`|u64|`minTimestamp` |4.0| Smallest timestamp of the traces listed in `*pTraces` +`18:`|u64|`maxTimestamp` |4.0| Largest timestamp of the traces listed in `*pTraces` +`20:`|| **END** || Extendable, see [Reader compatibility] + +{CTH} above refers to the following structure: + + Hex | Type | Name | Ver. | Description (see the [Formats legend]) + ---:| ---- | ----- | ---- | --------------------------------------------------- +`A 8`|| **ALIGNMENT** || See [Alignment properties] +`00:`|u32|`profIndex` |4.0| Index of a profile listed in the [`profile.db`](#profiledb-profile-info-section) +| | +`08:`|{Elem}*|`pStart` |4.0| Pointer to the first element of the trace line (array) +`10:`|{Elem}*|`pEnd` |4.0| Pointer to the after-end element of the trace line (array) +`18:`|| **END** || Extendable, see [Reader compatibility] + +{Elem} above refers to the following structure: + + Hex | Type | Name | Ver. | Description (see the [Formats legend]) + ---:| ---- | ---- | ---- | ---------------------------------------------------- +`A 4`|| **ALIGNMENT** || See [Alignment properties] +`00:`|u64|`timestamp` |4.0| Timestamp of the trace sample (nanoseconds since the epoch) +`08:`|u32|`ctxId` |4.0| Unique identifier of a context listed in [`meta.db`](#metadb-context-tree-section) +`0c:`|| **END** || Fixed, see [Reader compatibility] + +Additional notes: + - If `ctxId` is 0, the traced thread was not running at the `timestamp`. + Consecutive {Elem} elements cannot both have `ctxId` set to 0. + + > This is equivalent to attributing the trace element to the implicit global + > context.
+ - The array pointed to by `pTraces` is completely within the Context Trace + Headers section. The pointers `pStart` and `pEnd` point outside any of the + sections listed in the [`trace.db` header](#tracedb-version-40). + - The array starting at `pStart` and ending just before `pEnd` is sorted in + order of increasing `timestamp`. + - The stride of `*pTraces` is `szTrace`, for forward compatibility this value + should be read and used when accessing `*pTraces`. + - `timestamp` is only aligned for even elements in a trace line array. Where + possible, readers are encouraged to prefer accessing even elements. + See [Alignment properties] above. diff --git a/hatchet/tests/data/hpctoolkit-gamess/cct.db b/hatchet/tests/data/hpctoolkit-gamess/cct.db new file mode 100644 index 00000000..2eaa59c7 Binary files /dev/null and b/hatchet/tests/data/hpctoolkit-gamess/cct.db differ diff --git a/hatchet/tests/data/hpctoolkit-gamess/meta.db b/hatchet/tests/data/hpctoolkit-gamess/meta.db new file mode 100644 index 00000000..761c93b3 Binary files /dev/null and b/hatchet/tests/data/hpctoolkit-gamess/meta.db differ diff --git a/hatchet/tests/data/hpctoolkit-gamess/metrics/METRICS.yaml.ex b/hatchet/tests/data/hpctoolkit-gamess/metrics/METRICS.yaml.ex new file mode 100644 index 00000000..eed8da33 --- /dev/null +++ b/hatchet/tests/data/hpctoolkit-gamess/metrics/METRICS.yaml.ex @@ -0,0 +1,286 @@ +%YAML 1.2 +--- +# Specification and example document for metric taxonomies. + +# Each HPCToolkit database provides post-processed performance data for every +# calling context, application thread and performance metric. Performance +# metrics are generally very specific and the impact on the application +# performance is not always clear (eg. is 98% of the GPU L2 misses on a single +# line a problem?). + +# Files of this format provide a full "taxonomy" of metrics, structured to aid +# manual performance analysis. Very general metrics (eg. 
time) are presented +# first to give a sense for *where* significant performance issues are, which +# can be expanded to present increasingly specific metrics to determine the +# *why* and *how*. In other words, the majority of an HPCToolkit database +# (see FORMATS.md) provides raw performance metrics, while METRICS.yaml files +# provide the interpretation. + +# This format is primarily intended to be read by the GUI application of +# HPCToolkit, HPCViewer. A number of keys in this file only make sense in this +# context, for instance options on how to present the final metric values. + +# NOTE: !!-type specifiers when used below indicate the type(s) allowed for +# the various keys. They are not required and match up with the default type +# as interpreted by most general YAML parsers. + +# Version of the METRICS.yaml format required by this file. Can be used by +# readers to error gracefully without reading the entire file. If omitted +# version checks are disabled. +version: !!int 0 + +# Set of all performance metrics used by this taxonomy. These correspond to +# performance metrics listed in the meta.db file. +# Anchors are used to refer to these metrics later in the file. +inputs: !!seq + - &in-cycles-E + # Canonical name for the performance metric. + # See Performance Metric Specification in FORMATS.md for details. + metric: !!str perf::cycles + # Name of the propagation scope for the value referenced. + # See Performance Metric Specification in FORMATS.md for details. + scope: !!str function + # Unary function used to generate summary statistic values, see Performance + # Metric Specification in FORMATS.md for details. 
+ # This is a formula in the same format as the variants:formula:* keys + # in the metric description below, with the following differences: + # - The formula must consist of a single !!str, not a !!seq or other + # formula structure ("$$" is used as the variable), and + # - The formula is canonicalized: whitespace and extraneous parentheses + # should be removed to achieve a match. + # Defaults to '$$'. + formula: !!str $$ + # Combination function used to generate summary statistic values, see + # Performance Metric Specification in FORMATS.md for details. + # One of 'sum', 'min' or 'max'. Defaults to 'sum'. + combine: !!str sum + # Merge keys can be used to lower the repetition of common fields: + - &in-cycles-I + <<: *in-cycles-E + scope: execution + - &in-cycles-E-cnt + <<: *in-cycles-E + formula: 1 + - &in-cycles-I-cnt + <<: *in-cycles-I + formula: 1 + + - &in-l1-miss-E + metric: perf::l1-cache-miss + scope: function + - &in-l1-miss-I + <<: *in-l1-miss-E + scope: execution + - &in-l1-miss-E-cnt + <<: *in-l1-miss-E + formula: 1 + - &in-l1-miss-I-cnt + <<: *in-l1-miss-I + formula: 1 + + - &in-l2-miss-E + metric: perf::l2-cache-miss + scope: function + - &in-l2-miss-I + <<: *in-l2-miss-E + scope: execution + - &in-l2-miss-E-cnt + <<: *in-l2-miss-E + formula: 1 + - &in-l2-miss-I-cnt + <<: *in-l2-miss-I + formula: 1 + + - &in-l3-miss-E + metric: perf::l3-cache-miss + scope: function + - &in-l3-miss-I + <<: *in-l3-miss-E + scope: execution + - &in-l3-miss-E-cnt + <<: *in-l3-miss-E + formula: 1 + - &in-l3-miss-I-cnt + <<: *in-l3-miss-I + formula: 1 + +# Sequence of root metrics provided in this taxonomy. Every metric listed in the +# taxonomy is a descendant of one of these. +roots: + - # Name for the metric. + name: !!str CPU Cycles + # Longer description of the metric, written in Markdown. + # Defaults to the `short description:` if given.
+ description: > + Cycles spent: + - In the CPU doing actual work (FLOPs), or + - Waiting for outside operations to complete (memory stalls). + # Short description of the metric, used for cases where a long description + # would not be suitable. + # Defaults to `description:` up to the first period or newline. + short description: !!str Cycles spent in the CPU. + + # Whether this metric should be visible in the Viewer by default, default + # true. If false, the Viewer may require that the metric be enabled in the + # metric list before it will be presented. + visible by default: true + + # How the values in the metrics rooted here will be presented in the Viewer + # by default. One of: + # - 'column': Columns of data that can be expanded to show inner metrics. + # Defaults to 'column'. Only allowed on root metrics. + presentation: !!str column + + # Sequence of child metrics, format is the same as a root metric. + # If omitted there are no child metrics. + children: !!seq + - name: L2 Bound + description: Rough cycles spent accessing the L2 cache + + # List of formula variations for this taxonomic metric. Metric values are + # always attributed to an application thread, however for large executions + # this gives too much data to present clearly. Instead, the Viewer + # presents only "summary" values by applying statistics across threads. + # The `inputs:` key above lists the "partial" results required for + # calculating statistics, this key lists the final formulas to generate + # presentable values. + # + # Keys in this map are the human-readable names of the variants. + variants: !!map + !!str Sum: + # How the final value(s) for this metric variant should be rendered. + # Orderless set of elements to be rendered in the metric cell, the + # following options are available: + # - 'number': Numerical rendering (see `format:`). + # - 'percentage': Percentage of the global inclusive value. Only + # allowed if `formula:inclusive:` is given.
+ # - 'hidden': Mark as hiding (some) inner values (`*`). + # - 'colorbar': Color bar visually indicating the relative sizes of + # values in child metrics. An additional "grey" color is added to + # the bar to indicate the difference between sum-of-children and + # this metric variant's value. (Note that this difference will be + # exactly 0 if `formula:` is 'sum'.) + # The Viewer will order the elements reasonably, and may elide + # elements if screen real estate is tight. + render: !!seq [number, percent] # eg: 1.23e+04 56.7% + # Can also be given as a !!str for a single element: + render: !!str 'number' # eg: 1.23e+04 + + # Printf-like format to use when rendering the metric value(s) as a + # number (`render: number`). The input to "printf" is a single double + # value. Defaults to '%.2e'. + # + # In more detail, this string must be of the form: + # [prefix]%(#0- +')*[field width][.precision](eEfFgGaA)[suffix] + # Where "prefix" and "suffix" use %% to generate a literal %. + format: !!str '%.2e' + + # Which variant child metric values are gotten from. Also used as the + # default variant when first expanding this metric variant. Explicitly + # lists the variant to use for each child metric in order. + child variant: !!seq + - Sum # Use Sum value(s) from first child + - Mean # Use Mean value(s) from second child + # Or can also be given as a !!str if the variant is the same. + child variant: !!str Sum # Use Sum value(s) from all children + # Defaults to the name of this variant. + + # Formula(s) for calculating the final value(s) for this metric + # variant. Ignored unless `render:` contains a numerical element + # (ie. everything except 'hidden'). Can be one of: + # - 'first': Value(s) for this variant are copied from the value(s) + # of the first child. Invalid if `render:` contains 'colorbar'. + # - 'sum': Value(s) are generated by summing child value(s). + # In all cases value(s) are generated vector-wise (ie. 
inclusive + # values come from inclusive child values, exclusive from exclusive, + # etc.), and null child values generate null values in the parent + # (ie. they aren't replaced with 0). + formula: !!str first + # Can also be written as a !!map listing the vector of formulas. + formula: !!map + # The following keys define the formulas used to generate metrics. + # Formulas are roughly written as a C-like math expression, except: + # - "Variables" are references to other nodes, which can be other + # formulas (sub-expressions) or an entry in the global `inputs:`. + # Eg: `*in-cycles-E` is an input metric value. + # - Parentheses are represented with a YAML !!seq ([...]), breaks + # between elements (,) are considered whitespace. + # Eg: `2 * (3 + 4)` -> `[2 *,[3,+,4]]` + # - Number constants and infix operators can be represented by + # !!int, !!float and !!str YAML elements (as appropriate), and + # need not be separated by an element break (whitespace suffices). + # Eg: `[2 *,[3,+,4]]` == `[2,*,[3+4]]` + # The following operators are available in increasing precedence: + # + - # Addition and subtraction + # * / # Multiplication and (true) division + # ^ # Exponentiation + # - Function calls are represented by a YAML !!map with a single + # pair. The key is the function name and the value is a !!seq + # listing the arguments. + # Eg: `foo(1, 2, 3)` -> `[foo:[1,2,3]]`, + # and `foo(1+x)` -> `[foo:[ [1+,*x] ]]` + # The following functions are available: + # sum:[...] # Sum of arguments + # prod:[...] # Product of arguments + # pow:[a, b] # a raised to the b + # sqrt:[a] # Square root of a (pow(a, .5)) + # log:[a, b] # Logarithm of a base-b + # log:[a] # Natural logarithm of a + # min:[...] # Smallest of arguments + # max:[...] 
# Largest of arguments + # floor:[a] # Largest integer less than or equal to a + # strict floor:[a] # Largest integer less than a + # ceil:[a] # Smallest integer greater than or equal to a + # strict ceil:[a] # Smallest integer greater than a + + # Formulas to generate "inclusive" cost values. Defaults to null. + inclusive: + # Custom formula used when no special properties are required of + # the formulation. Defaults to the value of `standard:`. + custom: [4*,[*in-l1-miss-I,-,*in-l2-miss-I]] + + # Version of the formula based completely on well-defined metric + # inputs, which refer only to non-custom propagation scopes. Used + # in the bottom-up and flat views, where this property is required + # for accurate analysis. Defaults to null. + # See the meta.db Performance Metrics section for details. + standard: [4*,[*in-l1-miss-I,-,*in-l2-miss-I]] + + # Formulas to generate "exclusive" cost values. Defaults to null. + exclusive: + standard: [4*,[*in-l1-miss-E,-,*in-l2-miss-E]] + + # Another example variant for "L2 Bound" + Mean: + render: [number, percent] + formula: + inclusive: [4*,[*in-l1-miss-I,/,*in-l1-miss-I-cnt, -,*in-l2-miss-I,/,*in-l2-miss-I-cnt]] + exclusive: [4*,[*in-l1-miss-E,/,*in-l1-miss-E-cnt, -,*in-l2-miss-E,/,*in-l2-miss-E-cnt]] + + # Sibling metric, still under "CPU Cycles" + - name: L3 Bound + description: Rough cycles spent accessing L3 cache + variants: + Sum: + render: number + formula: + inclusive: [64*,[*in-l2-miss-I, -,*in-l3-miss-I]] + exclusive: [64*,[*in-l2-miss-E, -,*in-l3-miss-E]] + Mean: + render: [number, percent] + formula: + inclusive: [64*,[*in-l2-miss-I,/,*in-l2-miss-I-cnt, -,*in-l3-miss-I,/,*in-l3-miss-I-cnt]] + exclusive: [64*,[*in-l2-miss-E,/,*in-l2-miss-E-cnt, -,*in-l3-miss-E,/,*in-l3-miss-E-cnt]] + + # Parameters for the root "CPU Cycles" metric + variants: + Sum: + render: number + formula: + inclusive: *in-cycles-I + exclusive: *in-cycles-E + Mean: + render: [number, colorbar] + formula: + inclusive: 
[*in-cycles-I,/,*in-cycles-I-cnt] + exclusive: [*in-cycles-E,/,*in-cycles-E-cnt] diff --git a/hatchet/tests/data/hpctoolkit-gamess/metrics/default.yaml b/hatchet/tests/data/hpctoolkit-gamess/metrics/default.yaml new file mode 100644 index 00000000..14827891 --- /dev/null +++ b/hatchet/tests/data/hpctoolkit-gamess/metrics/default.yaml @@ -0,0 +1,1153 @@ +version: 0 +inputs: + - &GSYNCx3a_STRx20_x28_secx29_-sum-x5b_0x0x5d_-execution + metric: GSYNC:STR (sec) + scope: execution + formula: $$ + combine: sum + - &GSYNCx3a_STRx20_x28_secx29_-sum-x5b_0x0x5d_-lex_aware + metric: GSYNC:STR (sec) + scope: lex_aware + formula: $$ + combine: sum + - &GSYNCx3a_STRx20_x28_secx29_-sum-x5b_0x0x5d_-function + metric: GSYNC:STR (sec) + scope: function + formula: $$ + combine: sum + - &GSYNCx3a_UNKx20_x28_secx29_-sum-x5b_0x0x5d_-execution + metric: GSYNC:UNK (sec) + scope: execution + formula: $$ + combine: sum + - &GSYNCx3a_UNKx20_x28_secx29_-sum-x5b_0x0x5d_-lex_aware + metric: GSYNC:UNK (sec) + scope: lex_aware + formula: $$ + combine: sum + - &GSYNCx3a_UNKx20_x28_secx29_-sum-x5b_0x0x5d_-function + metric: GSYNC:UNK (sec) + scope: function + formula: $$ + combine: sum + - &GXCOPYx3a_D2Ax20_x28_Bx29_-sum-x5b_0x0x5d_-execution + metric: GXCOPY:D2A (B) + scope: execution + formula: $$ + combine: sum + - &GXCOPYx3a_D2Ax20_x28_Bx29_-sum-x5b_0x0x5d_-lex_aware + metric: GXCOPY:D2A (B) + scope: lex_aware + formula: $$ + combine: sum + - &GXCOPYx3a_D2Ax20_x28_Bx29_-sum-x5b_0x0x5d_-function + metric: GXCOPY:D2A (B) + scope: function + formula: $$ + combine: sum + - &GXCOPYx3a_A2Dx20_x28_Bx29_-sum-x5b_0x0x5d_-execution + metric: GXCOPY:A2D (B) + scope: execution + formula: $$ + combine: sum + - &GXCOPYx3a_A2Dx20_x28_Bx29_-sum-x5b_0x0x5d_-lex_aware + metric: GXCOPY:A2D (B) + scope: lex_aware + formula: $$ + combine: sum + - &GXCOPYx3a_A2Dx20_x28_Bx29_-sum-x5b_0x0x5d_-function + metric: GXCOPY:A2D (B) + scope: function + formula: $$ + combine: sum + - 
&GXCOPYx3a_A2Ax20_x28_Bx29_-sum-x5b_0x0x5d_-execution + metric: GXCOPY:A2A (B) + scope: execution + formula: $$ + combine: sum + - &GXCOPYx3a_A2Ax20_x28_Bx29_-sum-x5b_0x0x5d_-lex_aware + metric: GXCOPY:A2A (B) + scope: lex_aware + formula: $$ + combine: sum + - &GXCOPYx3a_A2Ax20_x28_Bx29_-sum-x5b_0x0x5d_-function + metric: GXCOPY:A2A (B) + scope: function + formula: $$ + combine: sum + - &GXCOPYx3a_A2Hx20_x28_Bx29_-sum-x5b_0x0x5d_-execution + metric: GXCOPY:A2H (B) + scope: execution + formula: $$ + combine: sum + - &GXCOPYx3a_A2Hx20_x28_Bx29_-sum-x5b_0x0x5d_-lex_aware + metric: GXCOPY:A2H (B) + scope: lex_aware + formula: $$ + combine: sum + - &GXCOPYx3a_A2Hx20_x28_Bx29_-sum-x5b_0x0x5d_-function + metric: GXCOPY:A2H (B) + scope: function + formula: $$ + combine: sum + - &GSYNCx3a_CTXx20_x28_secx29_-sum-x5b_0x0x5d_-execution + metric: GSYNC:CTX (sec) + scope: execution + formula: $$ + combine: sum + - &GSYNCx3a_CTXx20_x28_secx29_-sum-x5b_0x0x5d_-lex_aware + metric: GSYNC:CTX (sec) + scope: lex_aware + formula: $$ + combine: sum + - &GSYNCx3a_CTXx20_x28_secx29_-sum-x5b_0x0x5d_-function + metric: GSYNC:CTX (sec) + scope: function + formula: $$ + combine: sum + - &GXCOPYx3a_H2Ax20_x28_Bx29_-sum-x5b_0x0x5d_-execution + metric: GXCOPY:H2A (B) + scope: execution + formula: $$ + combine: sum + - &GXCOPYx3a_H2Ax20_x28_Bx29_-sum-x5b_0x0x5d_-lex_aware + metric: GXCOPY:H2A (B) + scope: lex_aware + formula: $$ + combine: sum + - &GXCOPYx3a_H2Ax20_x28_Bx29_-sum-x5b_0x0x5d_-function + metric: GXCOPY:H2A (B) + scope: function + formula: $$ + combine: sum + - &GSYNCx3a_STREx20_x28_secx29_-sum-x5b_0x0x5d_-execution + metric: GSYNC:STRE (sec) + scope: execution + formula: $$ + combine: sum + - &GSYNCx3a_STREx20_x28_secx29_-sum-x5b_0x0x5d_-lex_aware + metric: GSYNC:STRE (sec) + scope: lex_aware + formula: $$ + combine: sum + - &GSYNCx3a_STREx20_x28_secx29_-sum-x5b_0x0x5d_-function + metric: GSYNC:STRE (sec) + scope: function + formula: $$ + combine: sum + - 
&GMEMx3a_ARYx20_x28_Bx29_-sum-x5b_0x0x5d_-execution + metric: GMEM:ARY (B) + scope: execution + formula: $$ + combine: sum + - &GMEMx3a_ARYx20_x28_Bx29_-sum-x5b_0x0x5d_-lex_aware + metric: GMEM:ARY (B) + scope: lex_aware + formula: $$ + combine: sum + - &GMEMx3a_ARYx20_x28_Bx29_-sum-x5b_0x0x5d_-function + metric: GMEM:ARY (B) + scope: function + formula: $$ + combine: sum + - &GMSETx3a_DSTx20_x28_Bx29_-sum-x5b_0x0x5d_-execution + metric: GMSET:DST (B) + scope: execution + formula: $$ + combine: sum + - &GMSETx3a_DSTx20_x28_Bx29_-sum-x5b_0x0x5d_-lex_aware + metric: GMSET:DST (B) + scope: lex_aware + formula: $$ + combine: sum + - &GMSETx3a_DSTx20_x28_Bx29_-sum-x5b_0x0x5d_-function + metric: GMSET:DST (B) + scope: function + formula: $$ + combine: sum + - &GXCOPYx3a_P2Px20_x28_Bx29_-sum-x5b_0x0x5d_-execution + metric: GXCOPY:P2P (B) + scope: execution + formula: $$ + combine: sum + - &GXCOPYx3a_P2Px20_x28_Bx29_-sum-x5b_0x0x5d_-lex_aware + metric: GXCOPY:P2P (B) + scope: lex_aware + formula: $$ + combine: sum + - &GXCOPYx3a_P2Px20_x28_Bx29_-sum-x5b_0x0x5d_-function + metric: GXCOPY:P2P (B) + scope: function + formula: $$ + combine: sum + - &GMEMx3a_PINx20_x28_Bx29_-sum-x5b_0x0x5d_-execution + metric: GMEM:PIN (B) + scope: execution + formula: $$ + combine: sum + - &GMEMx3a_PINx20_x28_Bx29_-sum-x5b_0x0x5d_-lex_aware + metric: GMEM:PIN (B) + scope: lex_aware + formula: $$ + combine: sum + - &GMEMx3a_PINx20_x28_Bx29_-sum-x5b_0x0x5d_-function + metric: GMEM:PIN (B) + scope: function + formula: $$ + combine: sum + - &GMEMx3a_PAGx20_x28_Bx29_-sum-x5b_0x0x5d_-execution + metric: GMEM:PAG (B) + scope: execution + formula: $$ + combine: sum + - &GMEMx3a_PAGx20_x28_Bx29_-sum-x5b_0x0x5d_-lex_aware + metric: GMEM:PAG (B) + scope: lex_aware + formula: $$ + combine: sum + - &GMEMx3a_PAGx20_x28_Bx29_-sum-x5b_0x0x5d_-function + metric: GMEM:PAG (B) + scope: function + formula: $$ + combine: sum + - &GMEMx3a_UNKx20_x28_Bx29_-sum-x5b_0x0x5d_-execution + metric: GMEM:UNK (B) + scope: 
execution + formula: $$ + combine: sum + - &GMEMx3a_UNKx20_x28_Bx29_-sum-x5b_0x0x5d_-lex_aware + metric: GMEM:UNK (B) + scope: lex_aware + formula: $$ + combine: sum + - &GMEMx3a_UNKx20_x28_Bx29_-sum-x5b_0x0x5d_-function + metric: GMEM:UNK (B) + scope: function + formula: $$ + combine: sum + - &GXCOPYx20_x28_secx29_-sum-x5b_0x0x5d_-execution + metric: GXCOPY (sec) + scope: execution + formula: $$ + combine: sum + - &GXCOPYx20_x28_secx29_-sum-x5b_0x0x5d_-lex_aware + metric: GXCOPY (sec) + scope: lex_aware + formula: $$ + combine: sum + - &GXCOPYx20_x28_secx29_-sum-x5b_0x0x5d_-function + metric: GXCOPY (sec) + scope: function + formula: $$ + combine: sum + - &GMEMx3a_COUNT-sum-x5b_0x0x5d_-execution + metric: GMEM:COUNT + scope: execution + formula: $$ + combine: sum + - &GMEMx3a_COUNT-sum-x5b_0x0x5d_-lex_aware + metric: GMEM:COUNT + scope: lex_aware + formula: $$ + combine: sum + - &GMEMx3a_COUNT-sum-x5b_0x0x5d_-function + metric: GMEM:COUNT + scope: function + formula: $$ + combine: sum + - &REALTIMEx20_x28_secx29_-sum-x5b_0x0x5d_-execution + metric: REALTIME (sec) + scope: execution + formula: $$ + combine: sum + - &REALTIMEx20_x28_secx29_-sum-x5b_0x0x5d_-lex_aware + metric: REALTIME (sec) + scope: lex_aware + formula: $$ + combine: sum + - &REALTIMEx20_x28_secx29_-sum-x5b_0x0x5d_-function + metric: REALTIME (sec) + scope: function + formula: $$ + combine: sum + - &GXCOPYx3a_H2Dx20_x28_Bx29_-sum-x5b_0x0x5d_-execution + metric: GXCOPY:H2D (B) + scope: execution + formula: $$ + combine: sum + - &GXCOPYx3a_H2Dx20_x28_Bx29_-sum-x5b_0x0x5d_-lex_aware + metric: GXCOPY:H2D (B) + scope: lex_aware + formula: $$ + combine: sum + - &GXCOPYx3a_H2Dx20_x28_Bx29_-sum-x5b_0x0x5d_-function + metric: GXCOPY:H2D (B) + scope: function + formula: $$ + combine: sum + - &GMEMx3a_DEVx20_x28_Bx29_-sum-x5b_0x0x5d_-execution + metric: GMEM:DEV (B) + scope: execution + formula: $$ + combine: sum + - &GMEMx3a_DEVx20_x28_Bx29_-sum-x5b_0x0x5d_-lex_aware + metric: GMEM:DEV (B) + scope: lex_aware 
+ formula: $$ + combine: sum + - &GMEMx3a_DEVx20_x28_Bx29_-sum-x5b_0x0x5d_-function + metric: GMEM:DEV (B) + scope: function + formula: $$ + combine: sum + - &GMSETx3a_MSTx20_x28_Bx29_-sum-x5b_0x0x5d_-execution + metric: GMSET:MST (B) + scope: execution + formula: $$ + combine: sum + - &GMSETx3a_MSTx20_x28_Bx29_-sum-x5b_0x0x5d_-lex_aware + metric: GMSET:MST (B) + scope: lex_aware + formula: $$ + combine: sum + - &GMSETx3a_MSTx20_x28_Bx29_-sum-x5b_0x0x5d_-function + metric: GMSET:MST (B) + scope: function + formula: $$ + combine: sum + - &GMSETx20_x28_secx29_-sum-x5b_0x0x5d_-execution + metric: GMSET (sec) + scope: execution + formula: $$ + combine: sum + - &GMSETx20_x28_secx29_-sum-x5b_0x0x5d_-lex_aware + metric: GMSET (sec) + scope: lex_aware + formula: $$ + combine: sum + - &GMSETx20_x28_secx29_-sum-x5b_0x0x5d_-function + metric: GMSET (sec) + scope: function + formula: $$ + combine: sum + - &GKERx20_x28_secx29_-sum-x5b_0x0x5d_-execution + metric: GKER (sec) + scope: execution + formula: $$ + combine: sum + - &GKERx20_x28_secx29_-sum-x5b_0x0x5d_-lex_aware + metric: GKER (sec) + scope: lex_aware + formula: $$ + combine: sum + - &GKERx20_x28_secx29_-sum-x5b_0x0x5d_-function + metric: GKER (sec) + scope: function + formula: $$ + combine: sum + - &GMEMx20_x28_secx29_-sum-x5b_0x0x5d_-execution + metric: GMEM (sec) + scope: execution + formula: $$ + combine: sum + - &GMEMx20_x28_secx29_-sum-x5b_0x0x5d_-lex_aware + metric: GMEM (sec) + scope: lex_aware + formula: $$ + combine: sum + - &GMEMx20_x28_secx29_-sum-x5b_0x0x5d_-function + metric: GMEM (sec) + scope: function + formula: $$ + combine: sum + - &GSYNCx3a_COUNT-sum-x5b_0x0x5d_-execution + metric: GSYNC:COUNT + scope: execution + formula: $$ + combine: sum + - &GSYNCx3a_COUNT-sum-x5b_0x0x5d_-lex_aware + metric: GSYNC:COUNT + scope: lex_aware + formula: $$ + combine: sum + - &GSYNCx3a_COUNT-sum-x5b_0x0x5d_-function + metric: GSYNC:COUNT + scope: function + formula: $$ + combine: sum + - 
&GMEMx3a_MANx20_x28_Bx29_-sum-x5b_0x0x5d_-execution + metric: GMEM:MAN (B) + scope: execution + formula: $$ + combine: sum + - &GMEMx3a_MANx20_x28_Bx29_-sum-x5b_0x0x5d_-lex_aware + metric: GMEM:MAN (B) + scope: lex_aware + formula: $$ + combine: sum + - &GMEMx3a_MANx20_x28_Bx29_-sum-x5b_0x0x5d_-function + metric: GMEM:MAN (B) + scope: function + formula: $$ + combine: sum + - &GXCOPYx3a_COUNT-sum-x5b_0x0x5d_-execution + metric: GXCOPY:COUNT + scope: execution + formula: $$ + combine: sum + - &GXCOPYx3a_COUNT-sum-x5b_0x0x5d_-lex_aware + metric: GXCOPY:COUNT + scope: lex_aware + formula: $$ + combine: sum + - &GXCOPYx3a_COUNT-sum-x5b_0x0x5d_-function + metric: GXCOPY:COUNT + scope: function + formula: $$ + combine: sum + - &GMSETx3a_COUNT-sum-x5b_0x0x5d_-execution + metric: GMSET:COUNT + scope: execution + formula: $$ + combine: sum + - &GMSETx3a_COUNT-sum-x5b_0x0x5d_-lex_aware + metric: GMSET:COUNT + scope: lex_aware + formula: $$ + combine: sum + - &GMSETx3a_COUNT-sum-x5b_0x0x5d_-function + metric: GMSET:COUNT + scope: function + formula: $$ + combine: sum + - &GXCOPYx3a_D2Dx20_x28_Bx29_-sum-x5b_0x0x5d_-execution + metric: GXCOPY:D2D (B) + scope: execution + formula: $$ + combine: sum + - &GXCOPYx3a_D2Dx20_x28_Bx29_-sum-x5b_0x0x5d_-lex_aware + metric: GXCOPY:D2D (B) + scope: lex_aware + formula: $$ + combine: sum + - &GXCOPYx3a_D2Dx20_x28_Bx29_-sum-x5b_0x0x5d_-function + metric: GXCOPY:D2D (B) + scope: function + formula: $$ + combine: sum + - &GSYNCx20_x28_secx29_-sum-x5b_0x0x5d_-execution + metric: GSYNC (sec) + scope: execution + formula: $$ + combine: sum + - &GSYNCx20_x28_secx29_-sum-x5b_0x0x5d_-lex_aware + metric: GSYNC (sec) + scope: lex_aware + formula: $$ + combine: sum + - &GSYNCx20_x28_secx29_-sum-x5b_0x0x5d_-function + metric: GSYNC (sec) + scope: function + formula: $$ + combine: sum + - &GMSETx3a_ARYx20_x28_Bx29_-sum-x5b_0x0x5d_-execution + metric: GMSET:ARY (B) + scope: execution + formula: $$ + combine: sum + - 
&GMSETx3a_ARYx20_x28_Bx29_-sum-x5b_0x0x5d_-lex_aware + metric: GMSET:ARY (B) + scope: lex_aware + formula: $$ + combine: sum + - &GMSETx3a_ARYx20_x28_Bx29_-sum-x5b_0x0x5d_-function + metric: GMSET:ARY (B) + scope: function + formula: $$ + combine: sum + - &GSYNCx3a_EVTx20_x28_secx29_-sum-x5b_0x0x5d_-execution + metric: GSYNC:EVT (sec) + scope: execution + formula: $$ + combine: sum + - &GSYNCx3a_EVTx20_x28_secx29_-sum-x5b_0x0x5d_-lex_aware + metric: GSYNC:EVT (sec) + scope: lex_aware + formula: $$ + combine: sum + - &GSYNCx3a_EVTx20_x28_secx29_-sum-x5b_0x0x5d_-function + metric: GSYNC:EVT (sec) + scope: function + formula: $$ + combine: sum + - &GPUOPx20_x28_secx29_-sum-x5b_0x0x5d_-execution + metric: GPUOP (sec) + scope: execution + formula: $$ + combine: sum + - &GPUOPx20_x28_secx29_-sum-x5b_0x0x5d_-lex_aware + metric: GPUOP (sec) + scope: lex_aware + formula: $$ + combine: sum + - &GPUOPx20_x28_secx29_-sum-x5b_0x0x5d_-function + metric: GPUOP (sec) + scope: function + formula: $$ + combine: sum + - &GICOPYx20_x28_secx29_-sum-x5b_0x0x5d_-execution + metric: GICOPY (sec) + scope: execution + formula: $$ + combine: sum + - &GICOPYx20_x28_secx29_-sum-x5b_0x0x5d_-lex_aware + metric: GICOPY (sec) + scope: lex_aware + formula: $$ + combine: sum + - &GICOPYx20_x28_secx29_-sum-x5b_0x0x5d_-function + metric: GICOPY (sec) + scope: function + formula: $$ + combine: sum + - &GMSETx3a_DEVx20_x28_Bx29_-sum-x5b_0x0x5d_-execution + metric: GMSET:DEV (B) + scope: execution + formula: $$ + combine: sum + - &GMSETx3a_DEVx20_x28_Bx29_-sum-x5b_0x0x5d_-lex_aware + metric: GMSET:DEV (B) + scope: lex_aware + formula: $$ + combine: sum + - &GMSETx3a_DEVx20_x28_Bx29_-sum-x5b_0x0x5d_-function + metric: GMSET:DEV (B) + scope: function + formula: $$ + combine: sum + - &GMSETx3a_UNKx20_x28_Bx29_-sum-x5b_0x0x5d_-execution + metric: GMSET:UNK (B) + scope: execution + formula: $$ + combine: sum + - &GMSETx3a_UNKx20_x28_Bx29_-sum-x5b_0x0x5d_-lex_aware + metric: GMSET:UNK (B) + scope: lex_aware + 
formula: $$ + combine: sum + - &GMSETx3a_UNKx20_x28_Bx29_-sum-x5b_0x0x5d_-function + metric: GMSET:UNK (B) + scope: function + formula: $$ + combine: sum + - &GMEMx3a_DSTx20_x28_Bx29_-sum-x5b_0x0x5d_-execution + metric: GMEM:DST (B) + scope: execution + formula: $$ + combine: sum + - &GMEMx3a_DSTx20_x28_Bx29_-sum-x5b_0x0x5d_-lex_aware + metric: GMEM:DST (B) + scope: lex_aware + formula: $$ + combine: sum + - &GMEMx3a_DSTx20_x28_Bx29_-sum-x5b_0x0x5d_-function + metric: GMEM:DST (B) + scope: function + formula: $$ + combine: sum + - &GMSETx3a_PAGx20_x28_Bx29_-sum-x5b_0x0x5d_-execution + metric: GMSET:PAG (B) + scope: execution + formula: $$ + combine: sum + - &GMSETx3a_PAGx20_x28_Bx29_-sum-x5b_0x0x5d_-lex_aware + metric: GMSET:PAG (B) + scope: lex_aware + formula: $$ + combine: sum + - &GMSETx3a_PAGx20_x28_Bx29_-sum-x5b_0x0x5d_-function + metric: GMSET:PAG (B) + scope: function + formula: $$ + combine: sum + - &GMEMx3a_MSTx20_x28_Bx29_-sum-x5b_0x0x5d_-execution + metric: GMEM:MST (B) + scope: execution + formula: $$ + combine: sum + - &GMEMx3a_MSTx20_x28_Bx29_-sum-x5b_0x0x5d_-lex_aware + metric: GMEM:MST (B) + scope: lex_aware + formula: $$ + combine: sum + - &GMEMx3a_MSTx20_x28_Bx29_-sum-x5b_0x0x5d_-function + metric: GMEM:MST (B) + scope: function + formula: $$ + combine: sum + - &GMSETx3a_MANx20_x28_Bx29_-sum-x5b_0x0x5d_-execution + metric: GMSET:MAN (B) + scope: execution + formula: $$ + combine: sum + - &GMSETx3a_MANx20_x28_Bx29_-sum-x5b_0x0x5d_-lex_aware + metric: GMSET:MAN (B) + scope: lex_aware + formula: $$ + combine: sum + - &GMSETx3a_MANx20_x28_Bx29_-sum-x5b_0x0x5d_-function + metric: GMSET:MAN (B) + scope: function + formula: $$ + combine: sum + - &GXCOPYx3a_H2Hx20_x28_Bx29_-sum-x5b_0x0x5d_-execution + metric: GXCOPY:H2H (B) + scope: execution + formula: $$ + combine: sum + - &GXCOPYx3a_H2Hx20_x28_Bx29_-sum-x5b_0x0x5d_-lex_aware + metric: GXCOPY:H2H (B) + scope: lex_aware + formula: $$ + combine: sum + - 
&GXCOPYx3a_H2Hx20_x28_Bx29_-sum-x5b_0x0x5d_-function + metric: GXCOPY:H2H (B) + scope: function + formula: $$ + combine: sum + - &GMSETx3a_PINx20_x28_Bx29_-sum-x5b_0x0x5d_-execution + metric: GMSET:PIN (B) + scope: execution + formula: $$ + combine: sum + - &GMSETx3a_PINx20_x28_Bx29_-sum-x5b_0x0x5d_-lex_aware + metric: GMSET:PIN (B) + scope: lex_aware + formula: $$ + combine: sum + - &GMSETx3a_PINx20_x28_Bx29_-sum-x5b_0x0x5d_-function + metric: GMSET:PIN (B) + scope: function + formula: $$ + combine: sum + - &GXCOPYx3a_UNKx20_x28_Bx29_-sum-x5b_0x0x5d_-execution + metric: GXCOPY:UNK (B) + scope: execution + formula: $$ + combine: sum + - &GXCOPYx3a_UNKx20_x28_Bx29_-sum-x5b_0x0x5d_-lex_aware + metric: GXCOPY:UNK (B) + scope: lex_aware + formula: $$ + combine: sum + - &GXCOPYx3a_UNKx20_x28_Bx29_-sum-x5b_0x0x5d_-function + metric: GXCOPY:UNK (B) + scope: function + formula: $$ + combine: sum + - &GXCOPYx3a_D2Hx20_x28_Bx29_-sum-x5b_0x0x5d_-execution + metric: GXCOPY:D2H (B) + scope: execution + formula: $$ + combine: sum + - &GXCOPYx3a_D2Hx20_x28_Bx29_-sum-x5b_0x0x5d_-lex_aware + metric: GXCOPY:D2H (B) + scope: lex_aware + formula: $$ + combine: sum + - &GXCOPYx3a_D2Hx20_x28_Bx29_-sum-x5b_0x0x5d_-function + metric: GXCOPY:D2H (B) + scope: function + formula: $$ + combine: sum +roots: + - name: REALTIME (sec) + description: REALTIME (sec) + variants: + Sum: + render: [number, percent] + formula: + inclusive: + standard: *REALTIMEx20_x28_secx29_-sum-x5b_0x0x5d_-execution + exclusive: + custom: *REALTIMEx20_x28_secx29_-sum-x5b_0x0x5d_-lex_aware + standard: *REALTIMEx20_x28_secx29_-sum-x5b_0x0x5d_-function + - name: GPUOP (sec) + description: "GPU time: all operations (seconds)" + variants: + Sum: + render: [number, percent] + formula: + inclusive: + standard: *GPUOPx20_x28_secx29_-sum-x5b_0x0x5d_-execution + exclusive: + custom: *GPUOPx20_x28_secx29_-sum-x5b_0x0x5d_-lex_aware + standard: *GPUOPx20_x28_secx29_-sum-x5b_0x0x5d_-function + - name: GKER (sec) + description: 
"GPU time: kernel execution (seconds)" + variants: + Sum: + render: [number, percent] + formula: + inclusive: + standard: *GKERx20_x28_secx29_-sum-x5b_0x0x5d_-execution + exclusive: + custom: *GKERx20_x28_secx29_-sum-x5b_0x0x5d_-lex_aware + standard: *GKERx20_x28_secx29_-sum-x5b_0x0x5d_-function + - name: GMEM (sec) + description: "GPU time: memory allocation/deallocation (seconds)" + variants: + Sum: + render: [number, percent] + formula: + inclusive: + standard: *GMEMx20_x28_secx29_-sum-x5b_0x0x5d_-execution + exclusive: + custom: *GMEMx20_x28_secx29_-sum-x5b_0x0x5d_-lex_aware + standard: *GMEMx20_x28_secx29_-sum-x5b_0x0x5d_-function + - name: GMSET (sec) + description: "GPU time: memory set (seconds)" + variants: + Sum: + render: [number, percent] + formula: + inclusive: + standard: *GMSETx20_x28_secx29_-sum-x5b_0x0x5d_-execution + exclusive: + custom: *GMSETx20_x28_secx29_-sum-x5b_0x0x5d_-lex_aware + standard: *GMSETx20_x28_secx29_-sum-x5b_0x0x5d_-function + - name: GXCOPY (sec) + description: "GPU time: explicit data copy (seconds)" + variants: + Sum: + render: [number, percent] + formula: + inclusive: + standard: *GXCOPYx20_x28_secx29_-sum-x5b_0x0x5d_-execution + exclusive: + custom: *GXCOPYx20_x28_secx29_-sum-x5b_0x0x5d_-lex_aware + standard: *GXCOPYx20_x28_secx29_-sum-x5b_0x0x5d_-function + - name: GICOPY (sec) + description: "GPU time: implicit data copy (seconds)" + variants: + Sum: + render: [number, percent] + formula: + inclusive: + standard: *GICOPYx20_x28_secx29_-sum-x5b_0x0x5d_-execution + exclusive: + custom: *GICOPYx20_x28_secx29_-sum-x5b_0x0x5d_-lex_aware + standard: *GICOPYx20_x28_secx29_-sum-x5b_0x0x5d_-function + - name: GSYNC (sec) + description: "GPU time: synchronization (seconds)" + variants: + Sum: + render: [number, percent] + formula: + inclusive: + standard: *GSYNCx20_x28_secx29_-sum-x5b_0x0x5d_-execution + exclusive: + custom: *GSYNCx20_x28_secx29_-sum-x5b_0x0x5d_-lex_aware + standard: *GSYNCx20_x28_secx29_-sum-x5b_0x0x5d_-function + 
- name: GMEM:UNK (B) + description: "GPU memory alloc/free: unknown memory kind (bytes)" + variants: + Sum: + render: [number, percent] + formula: + inclusive: + standard: *GMEMx3a_UNKx20_x28_Bx29_-sum-x5b_0x0x5d_-execution + exclusive: + custom: *GMEMx3a_UNKx20_x28_Bx29_-sum-x5b_0x0x5d_-lex_aware + standard: *GMEMx3a_UNKx20_x28_Bx29_-sum-x5b_0x0x5d_-function + - name: GMEM:PAG (B) + description: "GPU memory alloc/free: pageable memory (bytes)" + variants: + Sum: + render: [number, percent] + formula: + inclusive: + standard: *GMEMx3a_PAGx20_x28_Bx29_-sum-x5b_0x0x5d_-execution + exclusive: + custom: *GMEMx3a_PAGx20_x28_Bx29_-sum-x5b_0x0x5d_-lex_aware + standard: *GMEMx3a_PAGx20_x28_Bx29_-sum-x5b_0x0x5d_-function + - name: GMEM:PIN (B) + description: "GPU memory alloc/free: pinned memory (bytes)" + variants: + Sum: + render: [number, percent] + formula: + inclusive: + standard: *GMEMx3a_PINx20_x28_Bx29_-sum-x5b_0x0x5d_-execution + exclusive: + custom: *GMEMx3a_PINx20_x28_Bx29_-sum-x5b_0x0x5d_-lex_aware + standard: *GMEMx3a_PINx20_x28_Bx29_-sum-x5b_0x0x5d_-function + - name: GMEM:DEV (B) + description: "GPU memory alloc/free: device memory (bytes)" + variants: + Sum: + render: [number, percent] + formula: + inclusive: + standard: *GMEMx3a_DEVx20_x28_Bx29_-sum-x5b_0x0x5d_-execution + exclusive: + custom: *GMEMx3a_DEVx20_x28_Bx29_-sum-x5b_0x0x5d_-lex_aware + standard: *GMEMx3a_DEVx20_x28_Bx29_-sum-x5b_0x0x5d_-function + - name: GMEM:ARY (B) + description: "GPU memory alloc/free: array memory (bytes)" + variants: + Sum: + render: [number, percent] + formula: + inclusive: + standard: *GMEMx3a_ARYx20_x28_Bx29_-sum-x5b_0x0x5d_-execution + exclusive: + custom: *GMEMx3a_ARYx20_x28_Bx29_-sum-x5b_0x0x5d_-lex_aware + standard: *GMEMx3a_ARYx20_x28_Bx29_-sum-x5b_0x0x5d_-function + - name: GMEM:MAN (B) + description: "GPU memory alloc/free: managed memory (bytes)" + variants: + Sum: + render: [number, percent] + formula: + inclusive: + standard: 
*GMEMx3a_MANx20_x28_Bx29_-sum-x5b_0x0x5d_-execution + exclusive: + custom: *GMEMx3a_MANx20_x28_Bx29_-sum-x5b_0x0x5d_-lex_aware + standard: *GMEMx3a_MANx20_x28_Bx29_-sum-x5b_0x0x5d_-function + - name: GMEM:DST (B) + description: "GPU memory alloc/free: device static memory (bytes)" + variants: + Sum: + render: [number, percent] + formula: + inclusive: + standard: *GMEMx3a_DSTx20_x28_Bx29_-sum-x5b_0x0x5d_-execution + exclusive: + custom: *GMEMx3a_DSTx20_x28_Bx29_-sum-x5b_0x0x5d_-lex_aware + standard: *GMEMx3a_DSTx20_x28_Bx29_-sum-x5b_0x0x5d_-function + - name: GMEM:MST (B) + description: "GPU memory alloc/free: managed static memory (bytes)" + variants: + Sum: + render: [number, percent] + formula: + inclusive: + standard: *GMEMx3a_MSTx20_x28_Bx29_-sum-x5b_0x0x5d_-execution + exclusive: + custom: *GMEMx3a_MSTx20_x28_Bx29_-sum-x5b_0x0x5d_-lex_aware + standard: *GMEMx3a_MSTx20_x28_Bx29_-sum-x5b_0x0x5d_-function + - name: GMEM:COUNT + description: "GPU memory alloc/free: count" + variants: + Sum: + render: [number, percent] + formula: + inclusive: + standard: *GMEMx3a_COUNT-sum-x5b_0x0x5d_-execution + exclusive: + custom: *GMEMx3a_COUNT-sum-x5b_0x0x5d_-lex_aware + standard: *GMEMx3a_COUNT-sum-x5b_0x0x5d_-function + - name: GMSET:UNK (B) + description: "GPU memory set: unknown memory kind (bytes)" + variants: + Sum: + render: [number, percent] + formula: + inclusive: + standard: *GMSETx3a_UNKx20_x28_Bx29_-sum-x5b_0x0x5d_-execution + exclusive: + custom: *GMSETx3a_UNKx20_x28_Bx29_-sum-x5b_0x0x5d_-lex_aware + standard: *GMSETx3a_UNKx20_x28_Bx29_-sum-x5b_0x0x5d_-function + - name: GMSET:PAG (B) + description: "GPU memory set: pageable memory (bytes)" + variants: + Sum: + render: [number, percent] + formula: + inclusive: + standard: *GMSETx3a_PAGx20_x28_Bx29_-sum-x5b_0x0x5d_-execution + exclusive: + custom: *GMSETx3a_PAGx20_x28_Bx29_-sum-x5b_0x0x5d_-lex_aware + standard: *GMSETx3a_PAGx20_x28_Bx29_-sum-x5b_0x0x5d_-function + - name: GMSET:PIN (B) + description: "GPU memory 
set: pinned memory (bytes)" + variants: + Sum: + render: [number, percent] + formula: + inclusive: + standard: *GMSETx3a_PINx20_x28_Bx29_-sum-x5b_0x0x5d_-execution + exclusive: + custom: *GMSETx3a_PINx20_x28_Bx29_-sum-x5b_0x0x5d_-lex_aware + standard: *GMSETx3a_PINx20_x28_Bx29_-sum-x5b_0x0x5d_-function + - name: GMSET:DEV (B) + description: "GPU memory set: device memory (bytes)" + variants: + Sum: + render: [number, percent] + formula: + inclusive: + standard: *GMSETx3a_DEVx20_x28_Bx29_-sum-x5b_0x0x5d_-execution + exclusive: + custom: *GMSETx3a_DEVx20_x28_Bx29_-sum-x5b_0x0x5d_-lex_aware + standard: *GMSETx3a_DEVx20_x28_Bx29_-sum-x5b_0x0x5d_-function + - name: GMSET:ARY (B) + description: "GPU memory set: array memory (bytes)" + variants: + Sum: + render: [number, percent] + formula: + inclusive: + standard: *GMSETx3a_ARYx20_x28_Bx29_-sum-x5b_0x0x5d_-execution + exclusive: + custom: *GMSETx3a_ARYx20_x28_Bx29_-sum-x5b_0x0x5d_-lex_aware + standard: *GMSETx3a_ARYx20_x28_Bx29_-sum-x5b_0x0x5d_-function + - name: GMSET:MAN (B) + description: "GPU memory set: managed memory (bytes)" + variants: + Sum: + render: [number, percent] + formula: + inclusive: + standard: *GMSETx3a_MANx20_x28_Bx29_-sum-x5b_0x0x5d_-execution + exclusive: + custom: *GMSETx3a_MANx20_x28_Bx29_-sum-x5b_0x0x5d_-lex_aware + standard: *GMSETx3a_MANx20_x28_Bx29_-sum-x5b_0x0x5d_-function + - name: GMSET:DST (B) + description: "GPU memory set: device static memory (bytes)" + variants: + Sum: + render: [number, percent] + formula: + inclusive: + standard: *GMSETx3a_DSTx20_x28_Bx29_-sum-x5b_0x0x5d_-execution + exclusive: + custom: *GMSETx3a_DSTx20_x28_Bx29_-sum-x5b_0x0x5d_-lex_aware + standard: *GMSETx3a_DSTx20_x28_Bx29_-sum-x5b_0x0x5d_-function + - name: GMSET:MST (B) + description: "GPU memory set: managed static memory (bytes)" + variants: + Sum: + render: [number, percent] + formula: + inclusive: + standard: *GMSETx3a_MSTx20_x28_Bx29_-sum-x5b_0x0x5d_-execution + exclusive: + custom: 
*GMSETx3a_MSTx20_x28_Bx29_-sum-x5b_0x0x5d_-lex_aware + standard: *GMSETx3a_MSTx20_x28_Bx29_-sum-x5b_0x0x5d_-function + - name: GMSET:COUNT + description: "GPU memory set: count" + variants: + Sum: + render: [number, percent] + formula: + inclusive: + standard: *GMSETx3a_COUNT-sum-x5b_0x0x5d_-execution + exclusive: + custom: *GMSETx3a_COUNT-sum-x5b_0x0x5d_-lex_aware + standard: *GMSETx3a_COUNT-sum-x5b_0x0x5d_-function + - name: GXCOPY:UNK (B) + description: "GPU explicit memory copy: unknown kind (bytes)" + variants: + Sum: + render: [number, percent] + formula: + inclusive: + standard: *GXCOPYx3a_UNKx20_x28_Bx29_-sum-x5b_0x0x5d_-execution + exclusive: + custom: *GXCOPYx3a_UNKx20_x28_Bx29_-sum-x5b_0x0x5d_-lex_aware + standard: *GXCOPYx3a_UNKx20_x28_Bx29_-sum-x5b_0x0x5d_-function + - name: GXCOPY:H2D (B) + description: "GPU explicit memory copy: host to device (bytes)" + variants: + Sum: + render: [number, percent] + formula: + inclusive: + standard: *GXCOPYx3a_H2Dx20_x28_Bx29_-sum-x5b_0x0x5d_-execution + exclusive: + custom: *GXCOPYx3a_H2Dx20_x28_Bx29_-sum-x5b_0x0x5d_-lex_aware + standard: *GXCOPYx3a_H2Dx20_x28_Bx29_-sum-x5b_0x0x5d_-function + - name: GXCOPY:D2H (B) + description: "GPU explicit memory copy: device to host (bytes)" + variants: + Sum: + render: [number, percent] + formula: + inclusive: + standard: *GXCOPYx3a_D2Hx20_x28_Bx29_-sum-x5b_0x0x5d_-execution + exclusive: + custom: *GXCOPYx3a_D2Hx20_x28_Bx29_-sum-x5b_0x0x5d_-lex_aware + standard: *GXCOPYx3a_D2Hx20_x28_Bx29_-sum-x5b_0x0x5d_-function + - name: GXCOPY:H2A (B) + description: "GPU explicit memory copy: host to array (bytes)" + variants: + Sum: + render: [number, percent] + formula: + inclusive: + standard: *GXCOPYx3a_H2Ax20_x28_Bx29_-sum-x5b_0x0x5d_-execution + exclusive: + custom: *GXCOPYx3a_H2Ax20_x28_Bx29_-sum-x5b_0x0x5d_-lex_aware + standard: *GXCOPYx3a_H2Ax20_x28_Bx29_-sum-x5b_0x0x5d_-function + - name: GXCOPY:A2H (B) + description: "GPU explicit memory copy: array to host (bytes)" + variants: 
+ Sum: + render: [number, percent] + formula: + inclusive: + standard: *GXCOPYx3a_A2Hx20_x28_Bx29_-sum-x5b_0x0x5d_-execution + exclusive: + custom: *GXCOPYx3a_A2Hx20_x28_Bx29_-sum-x5b_0x0x5d_-lex_aware + standard: *GXCOPYx3a_A2Hx20_x28_Bx29_-sum-x5b_0x0x5d_-function + - name: GXCOPY:A2A (B) + description: "GPU explicit memory copy: array to array (bytes)" + variants: + Sum: + render: [number, percent] + formula: + inclusive: + standard: *GXCOPYx3a_A2Ax20_x28_Bx29_-sum-x5b_0x0x5d_-execution + exclusive: + custom: *GXCOPYx3a_A2Ax20_x28_Bx29_-sum-x5b_0x0x5d_-lex_aware + standard: *GXCOPYx3a_A2Ax20_x28_Bx29_-sum-x5b_0x0x5d_-function + - name: GXCOPY:A2D (B) + description: "GPU explicit memory copy: array to device (bytes)" + variants: + Sum: + render: [number, percent] + formula: + inclusive: + standard: *GXCOPYx3a_A2Dx20_x28_Bx29_-sum-x5b_0x0x5d_-execution + exclusive: + custom: *GXCOPYx3a_A2Dx20_x28_Bx29_-sum-x5b_0x0x5d_-lex_aware + standard: *GXCOPYx3a_A2Dx20_x28_Bx29_-sum-x5b_0x0x5d_-function + - name: GXCOPY:D2A (B) + description: "GPU explicit memory copy: device to array (bytes)" + variants: + Sum: + render: [number, percent] + formula: + inclusive: + standard: *GXCOPYx3a_D2Ax20_x28_Bx29_-sum-x5b_0x0x5d_-execution + exclusive: + custom: *GXCOPYx3a_D2Ax20_x28_Bx29_-sum-x5b_0x0x5d_-lex_aware + standard: *GXCOPYx3a_D2Ax20_x28_Bx29_-sum-x5b_0x0x5d_-function + - name: GXCOPY:D2D (B) + description: "GPU explicit memory copy: device to device (bytes)" + variants: + Sum: + render: [number, percent] + formula: + inclusive: + standard: *GXCOPYx3a_D2Dx20_x28_Bx29_-sum-x5b_0x0x5d_-execution + exclusive: + custom: *GXCOPYx3a_D2Dx20_x28_Bx29_-sum-x5b_0x0x5d_-lex_aware + standard: *GXCOPYx3a_D2Dx20_x28_Bx29_-sum-x5b_0x0x5d_-function + - name: GXCOPY:H2H (B) + description: "GPU explicit memory copy: host to host (bytes)" + variants: + Sum: + render: [number, percent] + formula: + inclusive: + standard: *GXCOPYx3a_H2Hx20_x28_Bx29_-sum-x5b_0x0x5d_-execution + exclusive: + custom: 
*GXCOPYx3a_H2Hx20_x28_Bx29_-sum-x5b_0x0x5d_-lex_aware + standard: *GXCOPYx3a_H2Hx20_x28_Bx29_-sum-x5b_0x0x5d_-function + - name: GXCOPY:P2P (B) + description: "GPU explicit memory copy: peer to peer (bytes)" + variants: + Sum: + render: [number, percent] + formula: + inclusive: + standard: *GXCOPYx3a_P2Px20_x28_Bx29_-sum-x5b_0x0x5d_-execution + exclusive: + custom: *GXCOPYx3a_P2Px20_x28_Bx29_-sum-x5b_0x0x5d_-lex_aware + standard: *GXCOPYx3a_P2Px20_x28_Bx29_-sum-x5b_0x0x5d_-function + - name: GXCOPY:COUNT + description: "GPU explicit memory copy: count" + variants: + Sum: + render: [number, percent] + formula: + inclusive: + standard: *GXCOPYx3a_COUNT-sum-x5b_0x0x5d_-execution + exclusive: + custom: *GXCOPYx3a_COUNT-sum-x5b_0x0x5d_-lex_aware + standard: *GXCOPYx3a_COUNT-sum-x5b_0x0x5d_-function + - name: GSYNC:UNK (sec) + description: "GPU synchronizations: unknown kind" + visible by default: false + variants: + Sum: + render: [number, percent] + formula: + inclusive: + standard: *GSYNCx3a_UNKx20_x28_secx29_-sum-x5b_0x0x5d_-execution + exclusive: + custom: *GSYNCx3a_UNKx20_x28_secx29_-sum-x5b_0x0x5d_-lex_aware + standard: *GSYNCx3a_UNKx20_x28_secx29_-sum-x5b_0x0x5d_-function + - name: GSYNC:EVT (sec) + description: "GPU synchronizations: event" + visible by default: false + variants: + Sum: + render: [number, percent] + formula: + inclusive: + standard: *GSYNCx3a_EVTx20_x28_secx29_-sum-x5b_0x0x5d_-execution + exclusive: + custom: *GSYNCx3a_EVTx20_x28_secx29_-sum-x5b_0x0x5d_-lex_aware + standard: *GSYNCx3a_EVTx20_x28_secx29_-sum-x5b_0x0x5d_-function + - name: GSYNC:STRE (sec) + description: "GPU synchronizations: stream event wait" + visible by default: false + variants: + Sum: + render: [number, percent] + formula: + inclusive: + standard: *GSYNCx3a_STREx20_x28_secx29_-sum-x5b_0x0x5d_-execution + exclusive: + custom: *GSYNCx3a_STREx20_x28_secx29_-sum-x5b_0x0x5d_-lex_aware + standard: *GSYNCx3a_STREx20_x28_secx29_-sum-x5b_0x0x5d_-function + - name: GSYNC:STR (sec) + 
description: "GPU synchronizations: stream" + visible by default: false + variants: + Sum: + render: [number, percent] + formula: + inclusive: + standard: *GSYNCx3a_STRx20_x28_secx29_-sum-x5b_0x0x5d_-execution + exclusive: + custom: *GSYNCx3a_STRx20_x28_secx29_-sum-x5b_0x0x5d_-lex_aware + standard: *GSYNCx3a_STRx20_x28_secx29_-sum-x5b_0x0x5d_-function + - name: GSYNC:CTX (sec) + description: "GPU synchronizations: context" + visible by default: false + variants: + Sum: + render: [number, percent] + formula: + inclusive: + standard: *GSYNCx3a_CTXx20_x28_secx29_-sum-x5b_0x0x5d_-execution + exclusive: + custom: *GSYNCx3a_CTXx20_x28_secx29_-sum-x5b_0x0x5d_-lex_aware + standard: *GSYNCx3a_CTXx20_x28_secx29_-sum-x5b_0x0x5d_-function + - name: GSYNC:COUNT + description: "GPU synchronizations: count" + visible by default: false + variants: + Sum: + render: [number, percent] + formula: + inclusive: + standard: *GSYNCx3a_COUNT-sum-x5b_0x0x5d_-execution + exclusive: + custom: *GSYNCx3a_COUNT-sum-x5b_0x0x5d_-lex_aware + standard: *GSYNCx3a_COUNT-sum-x5b_0x0x5d_-function \ No newline at end of file diff --git a/hatchet/tests/data/hpctoolkit-gamess/profile.db b/hatchet/tests/data/hpctoolkit-gamess/profile.db new file mode 100644 index 00000000..77f55453 Binary files /dev/null and b/hatchet/tests/data/hpctoolkit-gamess/profile.db differ diff --git a/hatchet/tests/data/hpctoolkit-gamess/trace.db b/hatchet/tests/data/hpctoolkit-gamess/trace.db new file mode 100644 index 00000000..b7f102e7 Binary files /dev/null and b/hatchet/tests/data/hpctoolkit-gamess/trace.db differ diff --git a/hatchet/tests/hpctoolkit_latest.py b/hatchet/tests/hpctoolkit_latest.py new file mode 100644 index 00000000..75f74eb7 --- /dev/null +++ b/hatchet/tests/hpctoolkit_latest.py @@ -0,0 +1,213 @@ +# Copyright 2017-2023 Lawrence Livermore National Security, LLC and other +# Hatchet Project Developers. See the top-level LICENSE file for details. 
#
# SPDX-License-Identifier: MIT

from hatchet import GraphFrame
from hatchet.node import Node


# Columns every imported GraphFrame is expected to expose, in the order
# in which the tests check them.
_EXPECTED_COLUMNS = (
    "name",
    "time (inc)",
    "time",
    "gpuop (inc)",
    "gker (inc)",
    "gxcopy (inc)",
    "gxcopy:count (inc)",
)


def _read_gamess(data_dir: str, **reader_options):
    """Load the ``hpctoolkit-gamess`` database found under *data_dir*.

    Any keyword arguments are forwarded unchanged to
    ``GraphFrame.from_hpctoolkit_latest``.
    """
    return GraphFrame.from_hpctoolkit_latest(
        f"{data_dir}/hpctoolkit-gamess", **reader_options
    )


def _check_root_and_shape(gf, expected_rows: int) -> None:
    """Assert the single ``entry`` root, the row count and the columns."""
    roots = gf.graph.roots
    assert len(roots) == 1

    root = roots[0]
    assert root._hatchet_nid == 1195
    assert root._depth == 0
    assert root.frame["name"] == "entry"
    assert root.frame["type"] == "entry"

    assert len(gf.dataframe) == expected_rows
    for column in _EXPECTED_COLUMNS:
        assert column in gf.dataframe.columns


def _row(gf, hnid: int):
    """Return the dataframe row indexed by the node with id *hnid*."""
    return gf.dataframe.loc[Node(None, hnid=hnid)]


def _check_gpu_row(row, name: str, inclusive_time: float) -> None:
    """Assert a row's name, inclusive time and the shared GPU metrics.

    The GPU metric values are identical for every row checked this way,
    so only the name and the inclusive time vary between calls.
    """
    assert row["name"] == name
    assert round(row["time (inc)"], 2) == inclusive_time
    assert round(row["gpuop (inc)"], 2) == 608.09
    assert round(row["gker (inc)"], 2) == 608.00
    assert round(row["gxcopy (inc)"], 2) == 0.09
    assert row["gxcopy:count (inc)"] == 9688


def _check_time_row(row, name: str, seconds: float, digits: int) -> None:
    """Assert a row whose inclusive and exclusive time round to *seconds*."""
    assert row["name"] == name
    assert round(row["time (inc)"], digits) == seconds
    assert round(row["time"], digits) == seconds


def test_import_entire_db(data_dir: str) -> None:
    """Import the database without any filter and spot-check four nodes."""
    gf = _read_gamess(data_dir)

    _check_root_and_shape(gf, 10824)

    _check_gpu_row(_row(gf, 1195), "entry", 1608.49)
    _check_gpu_row(_row(gf, 1197), "gamess_", 1608.49)

    _check_time_row(_row(gf, 1004), "[libsci_cray.so.5.0]:0", 0.08, 2)
    _check_time_row(_row(gf, 1003), "[libsci_cray.so.5.0]:0", 0.08, 2)


def test_filter_by_max_depth(data_dir: str) -> None:
    """Import with ``max_depth=10`` and verify no node lies deeper."""
    depth_limit = 10
    gf = _read_gamess(data_dir, max_depth=depth_limit)

    _check_root_and_shape(gf, 133)

    _check_gpu_row(_row(gf, 1195), "entry", 1608.49)
    _check_gpu_row(_row(gf, 1197), "gamess_", 1608.49)
    _check_gpu_row(_row(gf, 9846), "wfn_", 786.09)
    _check_gpu_row(_row(gf, 9845), "[gamess.00.x]:0", 786.09)

    for node in gf.graph.traverse():
        assert node._depth <= depth_limit


def test_filter_by_min_percentage_of_application_time(data_dir: str) -> None:
    """Import with a 1% application-time threshold and verify every node."""
    gf = _read_gamess(data_dir, min_percentage_of_application_time=1)

    _check_root_and_shape(gf, 164)

    root_row = _row(gf, 1195)
    _check_gpu_row(root_row, "entry", 1608.49)
    # The root's inclusive time serves as the whole-application time below.
    application_time = root_row["time (inc)"]

    _check_gpu_row(_row(gf, 1197), "gamess_", 1608.49)

    _check_time_row(_row(gf, 2856), "__GI___sched_yield", 159.238, 3)
    _check_time_row(_row(gf, 251), "[libc-2.31.so]:0", 159.238, 3)

    for node in gf.graph.traverse():
        share = gf.dataframe.loc[node]["time (inc)"] / application_time
        assert share >= 0.01


def test_filter_by_min_percentage_of_parent_time(data_dir: str) -> None:
    """Import with a 1% parent-time threshold and verify every non-entry node."""
    gf = _read_gamess(data_dir, min_percentage_of_parent_time=1)

    _check_root_and_shape(gf, 4576)

    _check_gpu_row(_row(gf, 1195), "entry", 1608.49)
    _check_gpu_row(_row(gf, 1197), "gamess_", 1608.49)

    _check_time_row(_row(gf, 2856), "__GI___sched_yield", 159.238, 3)
    _check_time_row(_row(gf, 251), "[libc-2.31.so]:0", 159.238, 3)

    for node in gf.graph.traverse():
        node_time = gf.dataframe.loc[node]["time (inc)"]
        # Nodes of type "entry" are skipped: they are not compared to a parent.
        if node.frame["type"] == "entry":
            continue
        parent_time = gf.dataframe.loc[node.parents[0]]["time (inc)"]
        assert node_time / parent_time >= 0.01