diff --git a/cpp/.clang-format b/.clang-format similarity index 100% rename from cpp/.clang-format rename to .clang-format diff --git a/.gitattributes b/.gitattributes index fbfe7434d50..ed8e5e1425a 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,4 +1,5 @@ python/cudf/cudf/_version.py export-subst +python/strings_udf/strings_udf/_version.py export-subst python/cudf_kafka/cudf_kafka/_version.py export-subst python/custreamz/custreamz/_version.py export-subst python/dask_cudf/dask_cudf/_version.py export-subst diff --git a/.gitignore b/.gitignore index 29df683e9ec..0d63c76bf9f 100644 --- a/.gitignore +++ b/.gitignore @@ -35,6 +35,8 @@ python/cudf_kafka/*/_lib/**/*.cpp python/cudf_kafka/*/_lib/**/*.h python/custreamz/*/_lib/**/*.cpp python/custreamz/*/_lib/**/*.h +python/strings_udf/strings_udf/_lib/*.cpp +python/strings_udf/strings_udf/*.ptx .Python env/ develop-eggs/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 08e35fb47b0..a1debd52720 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -9,7 +9,6 @@ repos: # project can specify its own first/third-party packages. args: ["--config-root=python/", "--resolve-all-configs"] files: python/.* - exclude: (__init__.py|setup.py)$ types_or: [python, cython, pyi] - repo: https://github.com/psf/black rev: 22.3.0 diff --git a/build.sh b/build.sh index eee3ee512fa..ac283d01fc9 100755 --- a/build.sh +++ b/build.sh @@ -17,7 +17,7 @@ ARGS=$* # script, and that this script resides in the repo dir! REPODIR=$(cd $(dirname $0); pwd) -VALIDARGS="clean libcudf cudf cudfjar dask_cudf benchmarks tests libcudf_kafka cudf_kafka custreamz -v -g -n -l --allgpuarch --disable_nvtx --opensource_nvcomp --show_depr_warn --ptds -h --build_metrics --incl_cache_stats" +VALIDARGS="clean libcudf cudf cudfjar dask_cudf benchmarks tests libcudf_kafka cudf_kafka custreamz strings_udf -v -g -n -l --allgpuarch --disable_nvtx --opensource_nvcomp --show_depr_warn --ptds -h --build_metrics --incl_cache_stats" HELP="$0 [clean] [libcudf] [cudf] [cudfjar] [dask_cudf] [benchmarks] [tests] [libcudf_kafka] [cudf_kafka] [custreamz] [-v] [-g] [-n] [-h] [--cmake-args=\\\"\\\"] clean - remove all existing build artifacts and configuration (start over) @@ -335,6 +335,15 @@ if buildAll || hasArg cudf; then fi fi +if buildAll || hasArg strings_udf; then + + cd ${REPODIR}/python/strings_udf + python setup.py build_ext --inplace -- -DCMAKE_PREFIX_PATH=${INSTALL_PREFIX} -DCMAKE_LIBRARY_PATH=${LIBCUDF_BUILD_DIR} ${EXTRA_CMAKE_ARGS} -- -j${PARALLEL_LEVEL:-1} + if [[ ${INSTALL_TARGET} != "" ]]; then + python setup.py install --single-version-externally-managed --record=record.txt -- -DCMAKE_PREFIX_PATH=${INSTALL_PREFIX} -DCMAKE_LIBRARY_PATH=${LIBCUDF_BUILD_DIR} ${EXTRA_CMAKE_ARGS} -- -j${PARALLEL_LEVEL:-1} + fi +fi + # Build and install the dask_cudf Python package if buildAll || hasArg dask_cudf; then diff --git a/ci/cpu/build.sh b/ci/cpu/build.sh index 0cdea59b2de..514b2463685 100755 --- a/ci/cpu/build.sh +++ b/ci/cpu/build.sh @@ -82,6 +82,14 @@ fi if [ "$BUILD_LIBCUDF" == '1' ]; then gpuci_logger "Build conda pkg for libcudf" gpuci_conda_retry mambabuild --no-build-id --croot ${CONDA_BLD_DIR} conda/recipes/libcudf $CONDA_BUILD_ARGS + + # BUILD_LIBCUDF == 1 means this job is being run on the cpu_build jobs + # that is where we must also build the strings_udf package + gpuci_logger "Build conda pkg for strings_udf (python 3.8)" + gpuci_conda_retry mambabuild --no-build-id --croot ${CONDA_BLD_DIR} conda/recipes/strings_udf $CONDA_BUILD_ARGS 
--python=3.8 + gpuci_logger "Build conda pkg for strings_udf (python 3.9)" + gpuci_conda_retry mambabuild --no-build-id --croot ${CONDA_BLD_DIR} conda/recipes/strings_udf $CONDA_BUILD_ARGS --python=3.9 + mkdir -p ${CONDA_BLD_DIR}/libcudf/work cp -r ${CONDA_BLD_DIR}/work/* ${CONDA_BLD_DIR}/libcudf/work gpuci_logger "sccache stats" @@ -110,6 +118,10 @@ if [ "$BUILD_CUDF" == '1' ]; then gpuci_logger "Build conda pkg for custreamz" gpuci_conda_retry mambabuild --croot ${CONDA_BLD_DIR} conda/recipes/custreamz --python=$PYTHON $CONDA_BUILD_ARGS $CONDA_CHANNEL + + gpuci_logger "Build conda pkg for strings_udf" + gpuci_conda_retry mambabuild --croot ${CONDA_BLD_DIR} conda/recipes/strings_udf --python=$PYTHON $CONDA_BUILD_ARGS $CONDA_CHANNEL + fi ################################################################################ # UPLOAD - Conda packages diff --git a/ci/cpu/upload.sh b/ci/cpu/upload.sh index 29f6265ec63..5fe35ef7466 100755 --- a/ci/cpu/upload.sh +++ b/ci/cpu/upload.sh @@ -33,6 +33,19 @@ if [[ "$BUILD_LIBCUDF" == "1" && "$UPLOAD_LIBCUDF" == "1" ]]; then export LIBCUDF_FILES=$(conda build --no-build-id --croot "${CONDA_BLD_DIR}" conda/recipes/libcudf --output) LIBCUDF_FILES=$(echo "$LIBCUDF_FILES" | sed 's/.*libcudf-example.*//') # skip libcudf-example pkg upload gpuci_retry anaconda -t ${MY_UPLOAD_KEY} upload -u ${CONDA_USERNAME:-rapidsai} ${LABEL_OPTION} --skip-existing --no-progress $LIBCUDF_FILES + + # since strings_udf compiles libcudf code, we require it be built in the same environment as libcudf + # however since libcudf is agnostic to the python version that is present, we must vary it explicitly + # here if we want packages for both python 3.8 and 3.9 + export STRINGS_UDF_FILE=$(conda build --croot "${CONDA_BLD_DIR}" conda/recipes/strings_udf --python=3.8 --output) + test -e ${STRINGS_UDF_FILE} + echo "Upload strings_udf (python 3.8): ${STRINGS_UDF_FILE}" + gpuci_retry anaconda -t ${MY_UPLOAD_KEY} upload -u ${CONDA_USERNAME:-rapidsai} ${LABEL_OPTION} --skip-existing ${STRINGS_UDF_FILE} --no-progress + + export STRINGS_UDF_FILE=$(conda build --croot "${CONDA_BLD_DIR}" conda/recipes/strings_udf --python=3.9 --output) + test -e ${STRINGS_UDF_FILE} + echo "Upload strings_udf (python 3.9): ${STRINGS_UDF_FILE}" + gpuci_retry anaconda -t ${MY_UPLOAD_KEY} upload -u ${CONDA_USERNAME:-rapidsai} ${LABEL_OPTION} --skip-existing ${STRINGS_UDF_FILE} --no-progress fi if [[ "$BUILD_CUDF" == "1" && "$UPLOAD_CUDF" == "1" ]]; then diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 60dc8e2ae33..b491399f630 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -81,7 +81,6 @@ gpuci_logger "Check conda environment" conda info conda config --show-sources conda list --show-channel-urls - gpuci_logger "Check compiler versions" python --version @@ -123,11 +122,11 @@ if [[ -z "$PROJECT_FLASH" || "$PROJECT_FLASH" == "0" ]]; then install_dask ################################################################################ - # BUILD - Build libcudf, cuDF, libcudf_kafka, and dask_cudf from source + # BUILD - Build libcudf, cuDF, libcudf_kafka, dask_cudf, and strings_udf from source ################################################################################ gpuci_logger "Build from source" - "$WORKSPACE/build.sh" clean libcudf cudf dask_cudf libcudf_kafka cudf_kafka benchmarks tests --ptds + "$WORKSPACE/build.sh" clean libcudf cudf dask_cudf libcudf_kafka cudf_kafka strings_udf benchmarks tests --ptds ################################################################################ # TEST - Run 
GoogleTest
@@ -185,7 +184,11 @@ else
    gpuci_conda_retry mambabuild --croot ${CONDA_BLD_DIR} conda/recipes/cudf_kafka --python=$PYTHON -c ${CONDA_ARTIFACT_PATH}
    gpuci_conda_retry mambabuild --croot ${CONDA_BLD_DIR} conda/recipes/custreamz --python=$PYTHON -c ${CONDA_ARTIFACT_PATH}

-    gpuci_logger "Installing cudf, dask-cudf, cudf_kafka and custreamz"
+    # the CUDA component of strings_udf must be built on CUDA 11.5 just like libcudf,
+    # but because there is no separate Python package, we must also build the Python package on the 11.5 jobs.
+    # this means that at this point (on the GPU test jobs) the whole package is already built and has been
+    # copied by CI from the upstream 11.5 jobs into $CONDA_ARTIFACT_PATH
+    gpuci_logger "Installing cudf, dask-cudf, cudf_kafka, and custreamz"
    gpuci_mamba_retry install cudf dask-cudf cudf_kafka custreamz -c "${CONDA_BLD_DIR}" -c "${CONDA_ARTIFACT_PATH}"

    gpuci_logger "GoogleTests"
@@ -249,6 +252,8 @@ fi
 cd "$WORKSPACE/python/cudf/cudf"
 # It is essential to cd into $WORKSPACE/python/cudf/cudf as `pytest-xdist` + `coverage` seem to work only at this directory level.

+gpuci_logger "Check conda packages"
+conda list
 gpuci_logger "Python py.test for cuDF"
 py.test -n 8 --cache-clear --basetemp="$WORKSPACE/cudf-cuda-tmp" --ignore="$WORKSPACE/python/cudf/cudf/benchmarks" --junitxml="$WORKSPACE/junit-cudf.xml" -v --cov-config="$WORKSPACE/python/cudf/.coveragerc" --cov=cudf --cov-report=xml:"$WORKSPACE/python/cudf/cudf-coverage.xml" --cov-report term --dist=loadscope tests

@@ -260,6 +265,31 @@ cd "$WORKSPACE/python/custreamz"
 gpuci_logger "Python py.test for cuStreamz"
 py.test -n 8 --cache-clear --basetemp="$WORKSPACE/custreamz-cuda-tmp" --junitxml="$WORKSPACE/junit-custreamz.xml" -v --cov-config=.coveragerc --cov=custreamz --cov-report=xml:"$WORKSPACE/python/custreamz/custreamz-coverage.xml" --cov-report term custreamz

+gpuci_logger "Installing strings_udf"
+gpuci_mamba_retry install strings_udf -c "${CONDA_BLD_DIR}" -c "${CONDA_ARTIFACT_PATH}"
+
+cd "$WORKSPACE/python/strings_udf/strings_udf"
+gpuci_logger "Python py.test for strings_udf"
+
+# We do not want to exit with a nonzero exit code in the case where no
+# strings_udf tests are run because that will always happen when the local CUDA
+# version is not 11.5. We need to suppress the exit code because this script is
+# run with set -e and we're already setting a trap that we don't want to
+# override here.
+
+STRINGS_UDF_PYTEST_RETCODE=0
+py.test -n 8 --cache-clear --basetemp="$WORKSPACE/strings-udf-cuda-tmp" --junitxml="$WORKSPACE/junit-strings-udf.xml" -v --cov-config=.coveragerc --cov=strings_udf --cov-report=xml:"$WORKSPACE/python/strings_udf/strings-udf-coverage.xml" --cov-report term tests || STRINGS_UDF_PYTEST_RETCODE=$?
+
+if [ ${STRINGS_UDF_PYTEST_RETCODE} -eq 5 ]; then
+  echo "No strings UDF tests were run, but this script will continue to execute."
+elif [ ${STRINGS_UDF_PYTEST_RETCODE} -ne 0 ]; then
+  exit ${STRINGS_UDF_PYTEST_RETCODE}
+else
+  cd "$WORKSPACE/python/cudf/cudf"
+  gpuci_logger "Python py.test retest cuDF UDFs"
+  py.test tests/test_udf_masked_ops.py -n 8 --cache-clear
+fi
+
 # Run benchmarks with both cudf and pandas to ensure compatibility is maintained.
 # Benchmarks are run in DEBUG_ONLY mode, meaning that only small data sizes are used.
 # Therefore, these runs only verify that benchmarks are valid.
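Note on the exit-code handling above: pytest exits with status 5 when it collects zero tests, which is exactly what happens here whenever the local CUDA version is not 11.5. A minimal standalone sketch of the same pattern (the test path is illustrative, not part of this patch):

    RETCODE=0
    py.test tests || RETCODE=$?      # capture the status instead of letting `set -e` abort
    if [ ${RETCODE} -eq 5 ]; then
        echo "No tests were collected; continuing."   # benign: status 5 == nothing collected
    elif [ ${RETCODE} -ne 0 ]; then
        exit ${RETCODE}              # genuine test failures still fail the job
    fi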
diff --git a/ci/gpu/test-notebooks.sh b/ci/gpu/test-notebooks.sh index 1a5c2614000..36d093d0d28 100755 --- a/ci/gpu/test-notebooks.sh +++ b/ci/gpu/test-notebooks.sh @@ -1,4 +1,5 @@ #!/bin/bash +# Copyright (c) 2020-2022, NVIDIA CORPORATION. NOTEBOOKS_DIR="$WORKSPACE/notebooks" NBTEST="$WORKSPACE/ci/utils/nbtest.sh" diff --git a/ci/local/build.sh b/ci/local/build.sh index 345db967264..f6479cd76cc 100755 --- a/ci/local/build.sh +++ b/ci/local/build.sh @@ -1,4 +1,5 @@ #!/bin/bash +# Copyright (c) 2019-2022, NVIDIA CORPORATION. GIT_DESCRIBE_TAG=`git describe --tags` MINOR_VERSION=`echo $GIT_DESCRIBE_TAG | grep -o -E '([0-9]+\.[0-9]+)'` diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index 34783a414bd..8fad4e08c56 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -37,6 +37,9 @@ sed_runner 's/'"VERSION ${CURRENT_SHORT_TAG}.*"'/'"VERSION ${NEXT_FULL_TAG}"'/g' # Python update sed_runner 's/'"cudf_version .*)"'/'"cudf_version ${NEXT_FULL_TAG})"'/g' python/cudf/CMakeLists.txt +# Strings UDF update +sed_runner 's/'"strings_udf_version .*)"'/'"strings_udf_version ${NEXT_FULL_TAG})"'/g' python/strings_udf/CMakeLists.txt + # cpp libcudf_kafka update sed_runner 's/'"VERSION ${CURRENT_SHORT_TAG}.*"'/'"VERSION ${NEXT_FULL_TAG}"'/g' cpp/libcudf_kafka/CMakeLists.txt diff --git a/ci/utils/nbtest.sh b/ci/utils/nbtest.sh index 1b39f267c65..2a94e2d0695 100755 --- a/ci/utils/nbtest.sh +++ b/ci/utils/nbtest.sh @@ -1,4 +1,5 @@ #!/bin/bash +# Copyright (c) 2020-2022, NVIDIA CORPORATION. MAGIC_OVERRIDE_CODE=" def my_run_line_magic(*args, **kwargs): diff --git a/conda/environments/cudf_dev_cuda11.5.yml b/conda/environments/cudf_dev_cuda11.5.yml index 57dbfdf6d0a..69b9f6b24ba 100644 --- a/conda/environments/cudf_dev_cuda11.5.yml +++ b/conda/environments/cudf_dev_cuda11.5.yml @@ -21,7 +21,7 @@ dependencies: - python>=3.8,<3.10 - numba>=0.56.2 - numpy - - pandas>=1.0,<1.5.0dev0 + - pandas>=1.0,<1.6.0dev0 - pyarrow=9 - fastavro>=0.22.9 - python-snappy>=0.6.0 diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index 65e97d3755c..9b8e379b25e 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -48,7 +48,7 @@ requirements: - protobuf>=3.20.1,<3.21.0a0 - python - typing_extensions - - pandas >=1.0,<1.5.0dev0 + - pandas >=1.0,<1.6.0dev0 - cupy >=9.5.0,<12.0.0a0 - numba >=0.56.2 - numpy diff --git a/conda/recipes/strings_udf/build.sh b/conda/recipes/strings_udf/build.sh new file mode 100644 index 00000000000..2de1325347b --- /dev/null +++ b/conda/recipes/strings_udf/build.sh @@ -0,0 +1,4 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. + +# This assumes the script is executed from the root of the repo directory +./build.sh strings_udf diff --git a/conda/recipes/strings_udf/conda_build_config.yaml b/conda/recipes/strings_udf/conda_build_config.yaml new file mode 100644 index 00000000000..d9c3f21448f --- /dev/null +++ b/conda/recipes/strings_udf/conda_build_config.yaml @@ -0,0 +1,14 @@ +c_compiler_version: + - 9 + +cxx_compiler_version: + - 9 + +sysroot_version: + - "2.17" + +cmake_version: + - ">=3.20.1,!=3.23.0" + +cuda_compiler: + - nvcc diff --git a/conda/recipes/strings_udf/meta.yaml b/conda/recipes/strings_udf/meta.yaml new file mode 100644 index 00000000000..e29fb55ce63 --- /dev/null +++ b/conda/recipes/strings_udf/meta.yaml @@ -0,0 +1,65 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. 
+ +{% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') + environ.get('VERSION_SUFFIX', '') %} +{% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} +{% set py_version=environ.get('CONDA_PY', 38) %} +{% set cuda_version='.'.join(environ.get('CUDA', '11.5').split('.')[:2]) %} +{% set cuda_major=cuda_version.split('.')[0] %} + +package: + name: strings_udf + version: {{ version }} + +source: + git_url: ../../.. + +build: + number: {{ GIT_DESCRIBE_NUMBER }} + string: cuda_{{ cuda_major }}_py{{ py_version }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} + script_env: + - VERSION_SUFFIX + - PARALLEL_LEVEL + # libcudf's run_exports pinning is looser than we would like + ignore_run_exports: + - libcudf + ignore_run_exports_from: + - {{ compiler('cuda') }} + +requirements: + build: + - cmake {{ cmake_version }} + - {{ compiler('c') }} + - {{ compiler('cxx') }} + - {{ compiler('cuda') }} {{ cuda_version }} + - sysroot_{{ target_platform }} {{ sysroot_version }} + host: + - python + - cython >=0.29,<0.30 + - scikit-build>=0.13.1 + - setuptools + - numba >=0.54 + - libcudf ={{ version }} + - cudf ={{ version }} + - cudatoolkit ={{ cuda_version }} + run: + - python + - typing_extensions + - numba >=0.54 + - numpy + - libcudf ={{ version }} + - cudf ={{ version }} + - {{ pin_compatible('cudatoolkit', max_pin='x', min_pin='x') }} + - cachetools + - ptxcompiler # [linux64] # CUDA enhanced compatibility. See https://github.com/rapidsai/ptxcompiler +test: # [linux64] + requires: # [linux64] + - cudatoolkit {{ cuda_version }}.* # [linux64] + imports: # [linux64] + - strings_udf # [linux64] + +about: + home: https://rapids.ai/ + license: Apache-2.0 + license_family: APACHE + license_file: LICENSE + summary: strings_udf library diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index ae33ad260d2..7efa186aede 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -328,6 +328,7 @@ add_library( src/io/csv/writer_impl.cu src/io/functions.cpp src/io/json/json_gpu.cu + src/io/json/json_tree.cu src/io/json/nested_json_gpu.cu src/io/json/reader_impl.cu src/io/json/experimental/read_json.cpp diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index bbd51546668..d1ff177a25e 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -223,8 +223,10 @@ ConfigureBench(MERGE_BENCH merge/merge.cpp) ConfigureBench(NULLMASK_BENCH null_mask/set_null_mask.cpp) # ################################################################################################## -# * parquet writer chunks benchmark --------------------------------------------------------------- -ConfigureBench(PARQUET_WRITER_CHUNKS_BENCH io/parquet/parquet_writer_chunks.cpp) +# * parquet writer benchmark ---------------------------------------------------------------------- +ConfigureNVBench( + PARQUET_WRITER_NVBENCH io/parquet/parquet_writer.cpp io/parquet/parquet_writer_chunks.cpp +) # ################################################################################################## # * parquet reader benchmark ---------------------------------------------------------------------- @@ -238,11 +240,7 @@ ConfigureNVBench(ORC_READER_NVBENCH io/orc/orc_reader_input.cpp io/orc/orc_reade # ################################################################################################## # * csv reader benchmark -------------------------------------------------------------------------- -ConfigureBench(CSV_READER_BENCH io/csv/csv_reader.cpp) - -# 
##################################################################################################
-# * parquet writer benchmark ----------------------------------------------------------------------
-ConfigureBench(PARQUET_WRITER_BENCH io/parquet/parquet_writer.cpp)
+ConfigureNVBench(CSV_READER_NVBENCH io/csv/csv_reader_input.cpp io/csv/csv_reader_options.cpp)

 # ##################################################################################################
 # * orc writer benchmark --------------------------------------------------------------------------
diff --git a/cpp/benchmarks/io/csv/csv_reader.cpp b/cpp/benchmarks/io/csv/csv_reader.cpp
deleted file mode 100644
index 17b4a342dea..00000000000
--- a/cpp/benchmarks/io/csv/csv_reader.cpp
+++ /dev/null
@@ -1,174 +0,0 @@
-/*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <benchmarks/common/generate_input.hpp>
-#include <benchmarks/fixture/benchmark_fixture.hpp>
-#include <benchmarks/io/cuio_common.hpp>
-#include <benchmarks/synchronization/synchronization.hpp>
-
-#include <cudf/io/csv.hpp>
-
-// to enable, run cmake with -DBUILD_BENCHMARKS=ON
-
-constexpr size_t data_size = 256 << 20;
-constexpr cudf::size_type num_cols = 64;
-
-namespace cudf_io = cudf::io;
-
-class CsvRead : public cudf::benchmark {
-};
-
-void BM_csv_read_varying_input(benchmark::State& state)
-{
-  auto const data_types  = get_type_or_group(state.range(0));
-  auto const source_type = static_cast<io_type>(state.range(1));
-
-  auto const tbl =
-    create_random_table(cycle_dtypes(data_types, num_cols), table_size_bytes{data_size});
-  auto const view = tbl->view();
-
-  cuio_source_sink_pair source_sink(source_type);
-  cudf_io::csv_writer_options options =
-    cudf_io::csv_writer_options::builder(source_sink.make_sink_info(), view).include_header(true);
-  cudf_io::write_csv(options);
-
-  cudf_io::csv_reader_options const read_options =
-    cudf_io::csv_reader_options::builder(source_sink.make_source_info());
-
-  auto mem_stats_logger = cudf::memory_stats_logger();
-  for (auto _ : state) {
-    try_drop_l3_cache();
-    cuda_event_timer raii(state, true);  // flush_l2_cache = true, stream = 0
-    cudf_io::read_csv(read_options);
-  }
-
-  state.SetBytesProcessed(data_size * state.iterations());
-  state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage();
-  state.counters["encoded_file_size"] = source_sink.size();
-}
-
-void BM_csv_read_varying_options(benchmark::State& state)
-{
-  auto const col_sel    = static_cast<column_selection>(state.range(0));
-  auto const row_sel    = static_cast<row_selection>(state.range(1));
-  auto const num_chunks = state.range(2);
-
-  auto const data_types =
-    dtypes_for_column_selection(get_type_or_group({int32_t(type_group_id::INTEGRAL),
-                                                   int32_t(type_group_id::FLOATING_POINT),
-                                                   int32_t(type_group_id::FIXED_POINT),
-                                                   int32_t(type_group_id::TIMESTAMP),
-                                                   int32_t(type_group_id::DURATION),
-                                                   int32_t(cudf::type_id::STRING)}),
-                                col_sel);
-  auto const cols_to_read = select_column_indexes(data_types.size(), col_sel);
-
-  auto const tbl  = create_random_table(data_types, table_size_bytes{data_size});
-  auto const view = tbl->view();
-
-  cuio_source_sink_pair source_sink(io_type::HOST_BUFFER);
- 
cudf_io::csv_writer_options options = - cudf_io::csv_writer_options::builder(source_sink.make_sink_info(), view) - .include_header(true) - .line_terminator("\r\n"); - cudf_io::write_csv(options); - - cudf_io::csv_reader_options read_options = - cudf_io::csv_reader_options::builder(source_sink.make_source_info()) - .use_cols_indexes(cols_to_read) - .thousands('\'') - .windowslinetermination(true) - .comment('#') - .prefix("BM_"); - - size_t const chunk_size = source_sink.size() / num_chunks; - cudf::size_type const chunk_row_cnt = view.num_rows() / num_chunks; - auto mem_stats_logger = cudf::memory_stats_logger(); - for (auto _ : state) { - try_drop_l3_cache(); - cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 - for (int32_t chunk = 0; chunk < num_chunks; ++chunk) { - // only read the header in the first chunk - read_options.set_header(chunk == 0 ? 0 : -1); - - auto const is_last_chunk = chunk == (num_chunks - 1); - switch (row_sel) { - case row_selection::ALL: break; - case row_selection::BYTE_RANGE: - read_options.set_byte_range_offset(chunk * chunk_size); - read_options.set_byte_range_size(chunk_size); - if (is_last_chunk) read_options.set_byte_range_size(0); - break; - case row_selection::NROWS: - read_options.set_skiprows(chunk * chunk_row_cnt); - read_options.set_nrows(chunk_row_cnt); - if (is_last_chunk) read_options.set_nrows(-1); - break; - case row_selection::SKIPFOOTER: - read_options.set_skiprows(chunk * chunk_row_cnt); - read_options.set_skipfooter(view.num_rows() - (chunk + 1) * chunk_row_cnt); - if (is_last_chunk) read_options.set_skipfooter(0); - break; - default: CUDF_FAIL("Unsupported row selection method"); - } - - cudf_io::read_csv(read_options); - } - } - - auto const data_processed = data_size * cols_to_read.size() / view.num_columns(); - state.SetBytesProcessed(data_processed * state.iterations()); - state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage(); - state.counters["encoded_file_size"] = source_sink.size(); -} - -#define CSV_RD_BM_INPUTS_DEFINE(name, type_or_group, src_type) \ - BENCHMARK_DEFINE_F(CsvRead, name) \ - (::benchmark::State & state) { BM_csv_read_varying_input(state); } \ - BENCHMARK_REGISTER_F(CsvRead, name) \ - ->Args({int32_t(type_or_group), src_type}) \ - ->Unit(benchmark::kMillisecond) \ - ->UseManualTime(); - -RD_BENCHMARK_DEFINE_ALL_SOURCES(CSV_RD_BM_INPUTS_DEFINE, integral, type_group_id::INTEGRAL); -RD_BENCHMARK_DEFINE_ALL_SOURCES(CSV_RD_BM_INPUTS_DEFINE, floats, type_group_id::FLOATING_POINT); -RD_BENCHMARK_DEFINE_ALL_SOURCES(CSV_RD_BM_INPUTS_DEFINE, decimal, type_group_id::FIXED_POINT); -RD_BENCHMARK_DEFINE_ALL_SOURCES(CSV_RD_BM_INPUTS_DEFINE, timestamps, type_group_id::TIMESTAMP); -RD_BENCHMARK_DEFINE_ALL_SOURCES(CSV_RD_BM_INPUTS_DEFINE, durations, type_group_id::DURATION); -RD_BENCHMARK_DEFINE_ALL_SOURCES(CSV_RD_BM_INPUTS_DEFINE, string, cudf::type_id::STRING); - -BENCHMARK_DEFINE_F(CsvRead, column_selection) -(::benchmark::State& state) { BM_csv_read_varying_options(state); } -BENCHMARK_REGISTER_F(CsvRead, column_selection) - ->ArgsProduct({{int32_t(column_selection::ALL), - int32_t(column_selection::ALTERNATE), - int32_t(column_selection::FIRST_HALF), - int32_t(column_selection::SECOND_HALF)}, - {int32_t(row_selection::ALL)}, - {1}}) - ->Unit(benchmark::kMillisecond) - ->UseManualTime(); - -BENCHMARK_DEFINE_F(CsvRead, row_selection) -(::benchmark::State& state) { BM_csv_read_varying_options(state); } -BENCHMARK_REGISTER_F(CsvRead, row_selection) - 
->ArgsProduct({{int32_t(column_selection::ALL)},
-                 {int32_t(row_selection::BYTE_RANGE),
-                  int32_t(row_selection::NROWS),
-                  int32_t(row_selection::SKIPFOOTER)},
-                 {1, 8}})
-  ->Unit(benchmark::kMillisecond)
-  ->UseManualTime();
diff --git a/cpp/benchmarks/io/csv/csv_reader_input.cpp b/cpp/benchmarks/io/csv/csv_reader_input.cpp
new file mode 100644
index 00000000000..4f895e13f1b
--- /dev/null
+++ b/cpp/benchmarks/io/csv/csv_reader_input.cpp
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <benchmarks/common/generate_input.hpp>
+#include <benchmarks/fixture/benchmark_fixture.hpp>
+#include <benchmarks/fixture/rmm_pool_raii.hpp>
+#include <benchmarks/io/cuio_common.hpp>
+#include <benchmarks/io/nvbench_helpers.hpp>
+
+#include <cudf/io/csv.hpp>
+#include <cudf/utilities/default_stream.hpp>
+
+#include <nvbench/nvbench.cuh>
+
+constexpr size_t data_size = 256 << 20;
+constexpr cudf::size_type num_cols = 64;
+
+template <typename DataType>
+void csv_read_common(DataType const& data_types,
+                     cudf::io::io_type const& source_type,
+                     nvbench::state& state)
+{
+  auto const tbl =
+    create_random_table(cycle_dtypes(data_types, num_cols), table_size_bytes{data_size});
+  auto const view = tbl->view();
+
+  cuio_source_sink_pair source_sink(source_type);
+  cudf::io::csv_writer_options options =
+    cudf::io::csv_writer_options::builder(source_sink.make_sink_info(), view).include_header(true);
+
+  cudf::io::write_csv(options);
+
+  cudf::io::csv_reader_options const read_options =
+    cudf::io::csv_reader_options::builder(source_sink.make_source_info());
+
+  auto const mem_stats_logger = cudf::memory_stats_logger();  // init stats logger
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::default_stream_value.value()));
+  state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer,
+             [&](nvbench::launch& launch, auto& timer) {
+               try_drop_l3_cache();  // Drop L3 cache for accurate measurement
+
+               timer.start();
+               cudf::io::read_csv(read_options);
+               timer.stop();
+             });
+
+  auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
+  state.add_element_count(static_cast<double>(data_size) / time, "bytes_per_second");
+  state.add_buffer_size(
+    mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage");
+  state.add_buffer_size(source_sink.size(), "encoded_file_size", "encoded_file_size");
+}
+
+template <data_type DataType>
+void BM_csv_read_input(nvbench::state& state, nvbench::type_list<nvbench::enum_type<DataType>>)
+{
+  cudf::rmm_pool_raii rmm_pool;
+
+  auto const d_type      = get_type_or_group(static_cast<int32_t>(DataType));
+  auto const source_type = io_type::FILEPATH;
+
+  csv_read_common(d_type, source_type, state);
+}
+
+template <cudf::io::io_type IO>
+void BM_csv_read_io(nvbench::state& state, nvbench::type_list<nvbench::enum_type<IO>>)
+{
+  cudf::rmm_pool_raii rmm_pool;
+
+  auto const d_type = get_type_or_group({static_cast<int32_t>(data_type::INTEGRAL),
+                                         static_cast<int32_t>(data_type::FLOAT),
+                                         static_cast<int32_t>(data_type::DECIMAL),
+                                         static_cast<int32_t>(data_type::TIMESTAMP),
+                                         static_cast<int32_t>(data_type::DURATION),
+                                         static_cast<int32_t>(data_type::STRING)});
+  auto const source_type = IO;
+
+  csv_read_common(d_type, source_type, state);
+}
+
+using d_type_list = nvbench::enum_type_list<data_type::INTEGRAL,
+                                            data_type::FLOAT,
+                                            data_type::DECIMAL,
+                                            data_type::TIMESTAMP,
+                                            data_type::DURATION,
+                                            data_type::STRING>;
+
+using io_list =
+  nvbench::enum_type_list<io_type::FILEPATH, io_type::HOST_BUFFER, io_type::DEVICE_BUFFER>;
+
+NVBENCH_BENCH_TYPES(BM_csv_read_input, 
NVBENCH_TYPE_AXES(d_type_list))
+  .set_name("csv_read_data_type")
+  .set_type_axes_names({"data_type"})
+  .set_min_samples(4);
+
+NVBENCH_BENCH_TYPES(BM_csv_read_io, NVBENCH_TYPE_AXES(io_list))
+  .set_name("csv_read_io")
+  .set_type_axes_names({"io"})
+  .set_min_samples(4);
diff --git a/cpp/benchmarks/io/csv/csv_reader_options.cpp b/cpp/benchmarks/io/csv/csv_reader_options.cpp
new file mode 100644
index 00000000000..b569dc65f3d
--- /dev/null
+++ b/cpp/benchmarks/io/csv/csv_reader_options.cpp
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <benchmarks/common/generate_input.hpp>
+#include <benchmarks/fixture/benchmark_fixture.hpp>
+#include <benchmarks/fixture/rmm_pool_raii.hpp>
+#include <benchmarks/io/cuio_common.hpp>
+#include <benchmarks/io/nvbench_helpers.hpp>
+
+#include <cudf/io/csv.hpp>
+#include <cudf/utilities/default_stream.hpp>
+
+#include <nvbench/nvbench.cuh>
+
+constexpr size_t data_size = 256 << 20;
+
+template <column_selection ColSelection, row_selection RowSelection>
+void BM_csv_read_varying_options(
+  nvbench::state& state,
+  nvbench::type_list<nvbench::enum_type<ColSelection>, nvbench::enum_type<RowSelection>>)
+{
+  cudf::rmm_pool_raii rmm_pool;
+
+  auto const data_types =
+    dtypes_for_column_selection(get_type_or_group({static_cast<int32_t>(data_type::INTEGRAL),
+                                                   static_cast<int32_t>(data_type::FLOAT),
+                                                   static_cast<int32_t>(data_type::DECIMAL),
+                                                   static_cast<int32_t>(data_type::TIMESTAMP),
+                                                   static_cast<int32_t>(data_type::DURATION),
+                                                   static_cast<int32_t>(data_type::STRING)}),
+                                ColSelection);
+  auto const cols_to_read = select_column_indexes(data_types.size(), ColSelection);
+  auto const num_chunks   = state.get_int64("num_chunks");
+
+  auto const tbl  = create_random_table(data_types, table_size_bytes{data_size});
+  auto const view = tbl->view();
+
+  cuio_source_sink_pair source_sink(io_type::HOST_BUFFER);
+  cudf::io::csv_writer_options options =
+    cudf::io::csv_writer_options::builder(source_sink.make_sink_info(), view)
+      .include_header(true)
+      .line_terminator("\r\n");
+  cudf::io::write_csv(options);
+
+  cudf::io::csv_reader_options read_options =
+    cudf::io::csv_reader_options::builder(source_sink.make_source_info())
+      .use_cols_indexes(cols_to_read)
+      .thousands('\'')
+      .windowslinetermination(true)
+      .comment('#')
+      .prefix("BM_");
+
+  size_t const chunk_size             = source_sink.size() / num_chunks;
+  cudf::size_type const chunk_row_cnt = view.num_rows() / num_chunks;
+  auto const mem_stats_logger         = cudf::memory_stats_logger();
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::default_stream_value.value()));
+  state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer,
+             [&](nvbench::launch& launch, auto& timer) {
+               try_drop_l3_cache();  // Drop L3 cache for accurate measurement
+
+               timer.start();
+               for (int32_t chunk = 0; chunk < num_chunks; ++chunk) {
+                 // only read the header in the first chunk
+                 read_options.set_header(chunk == 0 ? 
0 : -1);
+
+                 auto const is_last_chunk = chunk == (num_chunks - 1);
+                 switch (RowSelection) {
+                   case row_selection::ALL: break;
+                   case row_selection::BYTE_RANGE:
+                     read_options.set_byte_range_offset(chunk * chunk_size);
+                     read_options.set_byte_range_size(chunk_size);
+                     if (is_last_chunk) read_options.set_byte_range_size(0);
+                     break;
+                   case row_selection::NROWS:
+                     read_options.set_skiprows(chunk * chunk_row_cnt);
+                     read_options.set_nrows(chunk_row_cnt);
+                     if (is_last_chunk) read_options.set_nrows(-1);
+                     break;
+                   case row_selection::SKIPFOOTER:
+                     read_options.set_skiprows(chunk * chunk_row_cnt);
+                     read_options.set_skipfooter(view.num_rows() - (chunk + 1) * chunk_row_cnt);
+                     if (is_last_chunk) read_options.set_skipfooter(0);
+                     break;
+                   default: CUDF_FAIL("Unsupported row selection method");
+                 }
+
+                 cudf::io::read_csv(read_options);
+               }
+               timer.stop();
+             });
+
+  auto const elapsed_time   = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
+  auto const data_processed = data_size * cols_to_read.size() / view.num_columns();
+  state.add_element_count(static_cast<double>(data_processed) / elapsed_time, "bytes_per_second");
+  state.add_buffer_size(
+    mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage");
+  state.add_buffer_size(source_sink.size(), "encoded_file_size", "encoded_file_size");
+}
+
+using col_selections = nvbench::enum_type_list<column_selection::ALL,
+                                               column_selection::ALTERNATE,
+                                               column_selection::FIRST_HALF,
+                                               column_selection::SECOND_HALF>;
+
+using row_selections = nvbench::
+  enum_type_list<row_selection::BYTE_RANGE, row_selection::NROWS, row_selection::SKIPFOOTER>;
+
+NVBENCH_BENCH_TYPES(BM_csv_read_varying_options,
+                    NVBENCH_TYPE_AXES(col_selections, nvbench::enum_type_list<row_selection::ALL>))
+  .set_name("csv_read_column_selection")
+  .set_type_axes_names({"column_selection", "row_selection"})
+  .set_min_samples(4)
+  .add_int64_axis("num_chunks", {1});
+
+NVBENCH_BENCH_TYPES(BM_csv_read_varying_options,
+                    NVBENCH_TYPE_AXES(nvbench::enum_type_list<column_selection::ALL>,
+                                      row_selections))
+  .set_name("csv_read_row_selection")
+  .set_type_axes_names({"column_selection", "row_selection"})
+  .set_min_samples(4)
+  .add_int64_axis("num_chunks", {1, 8});
diff --git a/cpp/benchmarks/io/json/nested_json.cpp b/cpp/benchmarks/io/json/nested_json.cpp
index 1e84d0ee7c5..e2d4c3b77d8 100644
--- a/cpp/benchmarks/io/json/nested_json.cpp
+++ b/cpp/benchmarks/io/json/nested_json.cpp
@@ -15,6 +15,7 @@
  */

 #include
+#include
 #include
 #include

@@ -71,15 +72,21 @@ void BM_NESTED_JSON(nvbench::state& state)
   state.add_element_count(input.size());

   // Run algorithm
+  auto const mem_stats_logger = cudf::memory_stats_logger();
   state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::default_stream_value.value()));
   state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
     // Allocate device-side temporary storage & run algorithm
     cudf::io::json::detail::parse_nested_json(input, default_options, cudf::default_stream_value);
   });
+
+  auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
+  state.add_element_count(static_cast<double>(string_size) / time, "bytes_per_second");
+  state.add_buffer_size(
+    mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage");
 }

 NVBENCH_BENCH(BM_NESTED_JSON)
   .set_name("nested_json_gpu_parser")
-  .add_int64_power_of_two_axis("string_size", nvbench::range(20, 31, 1));
+  .add_int64_power_of_two_axis("string_size", nvbench::range(20, 30, 1));

 }  // namespace cudf
diff --git a/cpp/benchmarks/io/nvbench_helpers.hpp b/cpp/benchmarks/io/nvbench_helpers.hpp
index 6c52c9a91cb..3ebef4fe0bd 100644
--- a/cpp/benchmarks/io/nvbench_helpers.hpp
+++ b/cpp/benchmarks/io/nvbench_helpers.hpp
@@ -122,7 +122,11 @@ NVBENCH_DECLARE_ENUM_TYPE_STRINGS(
[](auto value) { switch (value) { case row_selection::ALL: return "ALL"; + case row_selection::BYTE_RANGE: return "BYTE_RANGE"; case row_selection::NROWS: return "NROWS"; + case row_selection::SKIPFOOTER: return "SKIPFOOTER"; + case row_selection::STRIPES: return "STRIPES"; + case row_selection::ROW_GROUPS: return "ROW_GROUPS"; default: return "Unknown"; } }, @@ -139,18 +143,6 @@ NVBENCH_DECLARE_ENUM_TYPE_STRINGS( }, [](auto) { return std::string{}; }) -NVBENCH_DECLARE_ENUM_TYPE_STRINGS( - cudf::io::statistics_freq, - [](auto value) { - switch (value) { - case cudf::io::statistics_freq::STATISTICS_NONE: return "STATISTICS_NONE"; - case cudf::io::statistics_freq::STATISTICS_ROWGROUP: return "ORC_STATISTICS_STRIPE"; - case cudf::io::statistics_freq::STATISTICS_PAGE: return "ORC_STATISTICS_ROW_GROUP"; - default: return "Unknown"; - } - }, - [](auto) { return std::string{}; }) - enum class converts_strings : bool { YES, NO }; enum class uses_pandas_metadata : bool { YES, NO }; diff --git a/cpp/benchmarks/io/orc/orc_writer.cpp b/cpp/benchmarks/io/orc/orc_writer.cpp index 73f6812ac8e..ddf699b0eaa 100644 --- a/cpp/benchmarks/io/orc/orc_writer.cpp +++ b/cpp/benchmarks/io/orc/orc_writer.cpp @@ -26,6 +26,18 @@ #include +NVBENCH_DECLARE_ENUM_TYPE_STRINGS( + cudf::io::statistics_freq, + [](auto value) { + switch (value) { + case cudf::io::statistics_freq::STATISTICS_NONE: return "STATISTICS_NONE"; + case cudf::io::statistics_freq::STATISTICS_ROWGROUP: return "ORC_STATISTICS_STRIPE"; + case cudf::io::statistics_freq::STATISTICS_PAGE: return "ORC_STATISTICS_ROW_GROUP"; + default: return "Unknown"; + } + }, + [](auto) { return std::string{}; }) + constexpr int64_t data_size = 512 << 20; constexpr cudf::size_type num_cols = 64; @@ -38,7 +50,7 @@ void BM_orc_write_encode(nvbench::state& state, nvbench::type_list #include +#include #include -#include +#include #include +#include + +#include // to enable, run cmake with -DBUILD_BENCHMARKS=ON +NVBENCH_DECLARE_ENUM_TYPE_STRINGS( + cudf::io::statistics_freq, + [](auto value) { + switch (value) { + case cudf::io::statistics_freq::STATISTICS_NONE: return "STATISTICS_NONE"; + case cudf::io::statistics_freq::STATISTICS_ROWGROUP: return "STATISTICS_ROWGROUP"; + case cudf::io::statistics_freq::STATISTICS_PAGE: return "STATISTICS_PAGE"; + default: return "Unknown"; + } + }, + [](auto) { return std::string{}; }) + constexpr size_t data_size = 512 << 20; constexpr cudf::size_type num_cols = 64; -namespace cudf_io = cudf::io; +template +void BM_parq_write_encode(nvbench::state& state, nvbench::type_list>) +{ + cudf::rmm_pool_raii rmm_pool; -class ParquetWrite : public cudf::benchmark { -}; + auto const data_types = get_type_or_group(static_cast(DataType)); + cudf::size_type const cardinality = state.get_int64("cardinality"); + cudf::size_type const run_length = state.get_int64("run_length"); + auto const compression = cudf::io::compression_type::SNAPPY; + auto const sink_type = io_type::VOID; -void BM_parq_write_varying_inout(benchmark::State& state) + auto const tbl = + create_random_table(cycle_dtypes(data_types, num_cols), + table_size_bytes{data_size}, + data_profile_builder().cardinality(cardinality).avg_run_length(run_length)); + auto const view = tbl->view(); + + std::size_t encoded_file_size = 0; + + auto const mem_stats_logger = cudf::memory_stats_logger(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::default_stream_value.value())); + state.exec(nvbench::exec_tag::timer | nvbench::exec_tag::sync, + [&](nvbench::launch& launch, auto& timer) { + 
cuio_source_sink_pair source_sink(sink_type);
+
+               timer.start();
+               cudf::io::parquet_writer_options opts =
+                 cudf::io::parquet_writer_options::builder(source_sink.make_sink_info(), view)
+                   .compression(compression);
+               cudf::io::write_parquet(opts);
+               timer.stop();
+
+               encoded_file_size = source_sink.size();
+             });
+
+  auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
+  state.add_element_count(static_cast<double>(data_size) / time, "bytes_per_second");
+  state.add_buffer_size(
+    mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage");
+  state.add_buffer_size(encoded_file_size, "encoded_file_size", "encoded_file_size");
+}
+
+template <cudf::io::io_type IO, cudf::io::compression_type Compression>
+void BM_parq_write_io_compression(
+  nvbench::state& state,
+  nvbench::type_list<nvbench::enum_type<IO>, nvbench::enum_type<Compression>>)
 {
-  auto const data_types = get_type_or_group(state.range(0));
-  cudf::size_type const cardinality = state.range(1);
-  cudf::size_type const run_length  = state.range(2);
-  cudf_io::compression_type const compression =
-    state.range(3) ? cudf_io::compression_type::SNAPPY : cudf_io::compression_type::NONE;
-  auto const sink_type = static_cast<io_type>(state.range(4));
+  cudf::rmm_pool_raii rmm_pool;
+
+  auto const data_types = get_type_or_group({static_cast<int32_t>(data_type::INTEGRAL),
+                                             static_cast<int32_t>(data_type::FLOAT),
+                                             static_cast<int32_t>(data_type::DECIMAL),
+                                             static_cast<int32_t>(data_type::TIMESTAMP),
+                                             static_cast<int32_t>(data_type::DURATION),
+                                             static_cast<int32_t>(data_type::STRING),
+                                             static_cast<int32_t>(data_type::LIST),
+                                             static_cast<int32_t>(data_type::STRUCT)});
+
+  cudf::size_type const cardinality = state.get_int64("cardinality");
+  cudf::size_type const run_length  = state.get_int64("run_length");
+  auto const compression            = Compression;
+  auto const sink_type              = IO;

   auto const tbl =
     create_random_table(cycle_dtypes(data_types, num_cols),
@@ -46,80 +111,114 @@ void BM_parq_write_varying_inout(benchmark::State& state)
                         data_profile_builder().cardinality(cardinality).avg_run_length(run_length));
   auto const view = tbl->view();

-  cuio_source_sink_pair source_sink(sink_type);
-  auto mem_stats_logger = cudf::memory_stats_logger();
-  for (auto _ : state) {
-    cuda_event_timer raii(state, true);  // flush_l2_cache = true, stream = 0
-    cudf_io::parquet_writer_options opts =
-      cudf_io::parquet_writer_options::builder(source_sink.make_sink_info(), view)
-        .compression(compression);
-    cudf_io::write_parquet(opts);
-  }
-
-  state.SetBytesProcessed(data_size * state.iterations());
-  state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage();
-  state.counters["encoded_file_size"] = source_sink.size();
+  std::size_t encoded_file_size = 0;
+
+  auto const mem_stats_logger = cudf::memory_stats_logger();
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::default_stream_value.value()));
+  state.exec(nvbench::exec_tag::timer | nvbench::exec_tag::sync,
+             [&](nvbench::launch& launch, auto& timer) {
+               cuio_source_sink_pair source_sink(sink_type);
+
+               timer.start();
+               cudf::io::parquet_writer_options opts =
+                 cudf::io::parquet_writer_options::builder(source_sink.make_sink_info(), view)
+                   .compression(compression);
+               cudf::io::write_parquet(opts);
+               timer.stop();
+
+               encoded_file_size = source_sink.size();
+             });
+
+  auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
+  state.add_element_count(static_cast<double>(data_size) / time, "bytes_per_second");
+  state.add_buffer_size(
+    mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage");
+  state.add_buffer_size(encoded_file_size, "encoded_file_size", "encoded_file_size");
 }

-void 
BM_parq_write_varying_options(benchmark::State& state)
+template <cudf::io::statistics_freq Statistics, cudf::io::compression_type Compression>
+void BM_parq_write_varying_options(
+  nvbench::state& state,
+  nvbench::type_list<nvbench::enum_type<Statistics>, nvbench::enum_type<Compression>>)
 {
-  auto const compression  = static_cast<cudf::io::compression_type>(state.range(0));
-  auto const enable_stats = static_cast<cudf::io::statistics_freq>(state.range(1));
-  auto const file_path    = state.range(2) != 0 ? "unused_path.parquet" : "";
-
-  auto const data_types = get_type_or_group({int32_t(type_group_id::INTEGRAL_SIGNED),
-                                             int32_t(type_group_id::FLOATING_POINT),
-                                             int32_t(type_group_id::FIXED_POINT),
-                                             int32_t(type_group_id::TIMESTAMP),
-                                             int32_t(type_group_id::DURATION),
-                                             int32_t(cudf::type_id::STRING),
-                                             int32_t(cudf::type_id::LIST)});
+  auto const enable_stats = Statistics;
+  auto const compression  = Compression;
+  auto const file_path    = state.get_string("file_path");
+
+  auto const data_types = get_type_or_group({static_cast<int32_t>(data_type::INTEGRAL_SIGNED),
+                                             static_cast<int32_t>(data_type::FLOAT),
+                                             static_cast<int32_t>(data_type::DECIMAL),
+                                             static_cast<int32_t>(data_type::TIMESTAMP),
+                                             static_cast<int32_t>(data_type::DURATION),
+                                             static_cast<int32_t>(data_type::STRING),
+                                             static_cast<int32_t>(data_type::LIST)});

   auto const tbl  = create_random_table(data_types, table_size_bytes{data_size});
   auto const view = tbl->view();

-  cuio_source_sink_pair source_sink(io_type::FILEPATH);
+  std::size_t encoded_file_size = 0;
+
   auto mem_stats_logger = cudf::memory_stats_logger();
-  for (auto _ : state) {
-    cuda_event_timer raii(state, true);  // flush_l2_cache = true, stream = 0
-    cudf_io::parquet_writer_options const options =
-      cudf_io::parquet_writer_options::builder(source_sink.make_sink_info(), view)
-        .compression(compression)
-        .stats_level(enable_stats)
-        .column_chunks_file_paths({file_path});
-    cudf_io::write_parquet(options);
-  }
-
-  state.SetBytesProcessed(data_size * state.iterations());
-  state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage();
-  state.counters["encoded_file_size"] = source_sink.size();
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::default_stream_value.value()));
+  state.exec(nvbench::exec_tag::timer | nvbench::exec_tag::sync,
+             [&](nvbench::launch& launch, auto& timer) {
+               cuio_source_sink_pair source_sink(io_type::FILEPATH);
+
+               timer.start();
+               cudf::io::parquet_writer_options const options =
+                 cudf::io::parquet_writer_options::builder(source_sink.make_sink_info(), view)
+                   .compression(compression)
+                   .stats_level(enable_stats)
+                   .column_chunks_file_paths({file_path});
+               cudf::io::write_parquet(options);
+               timer.stop();
+
+               encoded_file_size = source_sink.size();
+             });
+
+  auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
+  state.add_element_count(static_cast<double>(data_size) / time, "bytes_per_second");
+  state.add_buffer_size(
+    mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage");
+  state.add_buffer_size(encoded_file_size, "encoded_file_size", "encoded_file_size");
 }

-#define PARQ_WR_BM_INOUTS_DEFINE(name, type_or_group, sink_type)                              \
-  BENCHMARK_DEFINE_F(ParquetWrite, name)                                                      \
-  (::benchmark::State & state) { BM_parq_write_varying_inout(state); }                        \
-  BENCHMARK_REGISTER_F(ParquetWrite, name)                                                    \
-    ->ArgsProduct({{int32_t(type_or_group)}, {0, 1000}, {1, 32}, {true, false}, {sink_type}}) \
-    ->Unit(benchmark::kMillisecond)                                                           \
-    ->UseManualTime();
-
-WR_BENCHMARK_DEFINE_ALL_SINKS(PARQ_WR_BM_INOUTS_DEFINE, integral, type_group_id::INTEGRAL);
-WR_BENCHMARK_DEFINE_ALL_SINKS(PARQ_WR_BM_INOUTS_DEFINE, floats, type_group_id::FLOATING_POINT);
-WR_BENCHMARK_DEFINE_ALL_SINKS(PARQ_WR_BM_INOUTS_DEFINE, decimal, 
type_group_id::FIXED_POINT);
-WR_BENCHMARK_DEFINE_ALL_SINKS(PARQ_WR_BM_INOUTS_DEFINE, timestamps, type_group_id::TIMESTAMP);
-WR_BENCHMARK_DEFINE_ALL_SINKS(PARQ_WR_BM_INOUTS_DEFINE, durations, type_group_id::DURATION);
-WR_BENCHMARK_DEFINE_ALL_SINKS(PARQ_WR_BM_INOUTS_DEFINE, string, cudf::type_id::STRING);
-WR_BENCHMARK_DEFINE_ALL_SINKS(PARQ_WR_BM_INOUTS_DEFINE, list, cudf::type_id::LIST);
-WR_BENCHMARK_DEFINE_ALL_SINKS(PARQ_WR_BM_INOUTS_DEFINE, struct, cudf::type_id::STRUCT);
-
-BENCHMARK_DEFINE_F(ParquetWrite, writer_options)
-(::benchmark::State& state) { BM_parq_write_varying_options(state); }
-BENCHMARK_REGISTER_F(ParquetWrite, writer_options)
-  ->ArgsProduct({{int32_t(cudf::io::compression_type::NONE),
-                  int32_t(cudf::io::compression_type::SNAPPY)},
-                 {int32_t(cudf::io::statistics_freq::STATISTICS_NONE),
-                  int32_t(cudf::io::statistics_freq::STATISTICS_ROWGROUP),
-                  int32_t(cudf::io::statistics_freq::STATISTICS_PAGE)},
-                 {false, true}})
-  ->Unit(benchmark::kMillisecond)
-  ->UseManualTime();
+using d_type_list = nvbench::enum_type_list<data_type::INTEGRAL,
+                                            data_type::FLOAT,
+                                            data_type::DECIMAL,
+                                            data_type::TIMESTAMP,
+                                            data_type::DURATION,
+                                            data_type::STRING,
+                                            data_type::LIST,
+                                            data_type::STRUCT>;
+
+using io_list = nvbench::enum_type_list<io_type::FILEPATH, io_type::HOST_BUFFER, io_type::VOID>;
+
+using compression_list =
+  nvbench::enum_type_list<cudf::io::compression_type::SNAPPY, cudf::io::compression_type::NONE>;
+
+using stats_list = nvbench::enum_type_list<cudf::io::statistics_freq::STATISTICS_NONE,
+                                           cudf::io::statistics_freq::STATISTICS_ROWGROUP,
+                                           cudf::io::statistics_freq::STATISTICS_PAGE>;
+
+NVBENCH_BENCH_TYPES(BM_parq_write_encode, NVBENCH_TYPE_AXES(d_type_list))
+  .set_name("parquet_write_encode")
+  .set_type_axes_names({"data_type"})
+  .set_min_samples(4)
+  .add_int64_axis("cardinality", {0, 1000})
+  .add_int64_axis("run_length", {1, 32});
+
+NVBENCH_BENCH_TYPES(BM_parq_write_io_compression, NVBENCH_TYPE_AXES(io_list, compression_list))
+  .set_name("parquet_write_io_compression")
+  .set_type_axes_names({"io", "compression"})
+  .set_min_samples(4)
+  .add_int64_axis("cardinality", {0, 1000})
+  .add_int64_axis("run_length", {1, 32});
+
+NVBENCH_BENCH_TYPES(BM_parq_write_varying_options, NVBENCH_TYPE_AXES(stats_list, compression_list))
+  .set_name("parquet_write_options")
+  .set_type_axes_names({"statistics", "compression"})
+  .set_min_samples(4)
+  .add_string_axis("file_path", {"unused_path.parquet", ""});
diff --git a/cpp/benchmarks/io/parquet/parquet_writer_chunks.cpp b/cpp/benchmarks/io/parquet/parquet_writer_chunks.cpp
index e22696b9c01..6c8500a2a70 100644
--- a/cpp/benchmarks/io/parquet/parquet_writer_chunks.cpp
+++ b/cpp/benchmarks/io/parquet/parquet_writer_chunks.cpp
@@ -16,50 +16,61 @@

 #include
 #include
+#include
 #include
-#include
 #include
 #include
 #include
+#include
+
+#include

 // to enable, run cmake with -DBUILD_BENCHMARKS=ON

 constexpr int64_t data_size = 512 << 20;

-namespace cudf_io = cudf::io;
+void PQ_write(nvbench::state& state)
+{
+  cudf::rmm_pool_raii rmm_pool;

-class ParquetWrite : public cudf::benchmark {
-};
-class ParquetWriteChunked : public cudf::benchmark {
-};
+  cudf::size_type const num_cols = state.get_int64("num_cols");

-void PQ_write(benchmark::State& state)
-{
-  cudf::size_type num_cols = state.range(0);
-
-  auto tbl = create_random_table(cycle_dtypes({cudf::type_id::INT32}, num_cols),
-                                 table_size_bytes{data_size});
-  cudf::table_view view = tbl->view();
-
-  auto mem_stats_logger = cudf::memory_stats_logger();
-  cuio_source_sink_pair source_sink(io_type::VOID);
-  for (auto _ : state) {
-    cuda_event_timer raii(state, true);  // flush_l2_cache = true, stream = 0
-    cudf_io::parquet_writer_options opts =
-      cudf_io::parquet_writer_options::builder(source_sink.make_sink_info(), view);
-    cudf_io::write_parquet(opts);
-  }
+  auto const tbl  = create_random_table(cycle_dtypes({cudf::type_id::INT32}, num_cols),
+                                        table_size_bytes{data_size});
+  auto const view = 
tbl->view();
+
+  std::size_t encoded_file_size = 0;
+  auto const mem_stats_logger   = cudf::memory_stats_logger();
+
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::default_stream_value.value()));
+  state.exec(nvbench::exec_tag::timer | nvbench::exec_tag::sync,
+             [&](nvbench::launch& launch, auto& timer) {
+               cuio_source_sink_pair source_sink(io_type::VOID);

-  state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) * state.range(0));
-  state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage();
-  state.counters["encoded_file_size"] = source_sink.size();
+               timer.start();
+               cudf::io::parquet_writer_options opts =
+                 cudf::io::parquet_writer_options::builder(source_sink.make_sink_info(), view);
+               cudf::io::write_parquet(opts);
+               timer.stop();
+
+               encoded_file_size = source_sink.size();
+             });
+
+  auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
+  state.add_element_count(static_cast<double>(data_size) / time, "bytes_per_second");
+  state.add_buffer_size(
+    mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage");
+  state.add_buffer_size(encoded_file_size, "encoded_file_size", "encoded_file_size");
 }

-void PQ_write_chunked(benchmark::State& state)
+void PQ_write_chunked(nvbench::state& state)
 {
-  cudf::size_type num_cols   = state.range(0);
-  cudf::size_type num_tables = state.range(1);
+  cudf::rmm_pool_raii rmm_pool;
+
+  cudf::size_type const num_cols   = state.get_int64("num_cols");
+  cudf::size_type const num_tables = state.get_int64("num_chunks");

   std::vector<std::unique_ptr<cudf::table>> tables;
   for (cudf::size_type idx = 0; idx < num_tables; idx++) {
@@ -67,47 +78,41 @@ void PQ_write_chunked(benchmark::State& state)
                                          table_size_bytes{size_t(data_size / num_tables)}));
   }

-  auto mem_stats_logger = cudf::memory_stats_logger();
-  cuio_source_sink_pair source_sink(io_type::VOID);
-  for (auto _ : state) {
-    cuda_event_timer raii(state, true);  // flush_l2_cache = true, stream = 0
-    cudf_io::chunked_parquet_writer_options opts =
-      cudf_io::chunked_parquet_writer_options::builder(source_sink.make_sink_info());
-    cudf_io::parquet_chunked_writer writer(opts);
-    std::for_each(tables.begin(), tables.end(), [&writer](std::unique_ptr<cudf::table> const& tbl) {
-      writer.write(*tbl);
+  auto const mem_stats_logger   = cudf::memory_stats_logger();
+  std::size_t encoded_file_size = 0;
+
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::default_stream_value.value()));
+  state.exec(
+    nvbench::exec_tag::timer | nvbench::exec_tag::sync, [&](nvbench::launch& launch, auto& timer) {
+      cuio_source_sink_pair source_sink(io_type::VOID);
+
+      timer.start();
+      cudf::io::chunked_parquet_writer_options opts =
+        cudf::io::chunked_parquet_writer_options::builder(source_sink.make_sink_info());
+      cudf::io::parquet_chunked_writer writer(opts);
+      std::for_each(tables.begin(),
+                    tables.end(),
+                    [&writer](std::unique_ptr<cudf::table> const& tbl) { writer.write(*tbl); });
+      writer.close();
+      timer.stop();
+
+      encoded_file_size = source_sink.size();
     });
-    writer.close();
-  }

-  state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) * state.range(0));
-  state.counters["peak_memory_usage"] = mem_stats_logger.peak_memory_usage();
-  state.counters["encoded_file_size"] = source_sink.size();
+  auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
+  state.add_element_count(static_cast<double>(data_size) / time, "bytes_per_second");
+  state.add_buffer_size(
+    mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage");
+  state.add_buffer_size(encoded_file_size, 
"encoded_file_size", "encoded_file_size"); } -#define PWBM_BENCHMARK_DEFINE(name, size, num_columns) \ - BENCHMARK_DEFINE_F(ParquetWrite, name)(::benchmark::State & state) { PQ_write(state); } \ - BENCHMARK_REGISTER_F(ParquetWrite, name) \ - ->Args({num_columns}) \ - ->Unit(benchmark::kMillisecond) \ - ->UseManualTime() - -PWBM_BENCHMARK_DEFINE(3Gb8Cols, data_size, 8); -PWBM_BENCHMARK_DEFINE(3Gb1024Cols, data_size, 1024); - -#define PWCBM_BENCHMARK_DEFINE(name, num_columns, num_chunks) \ - BENCHMARK_DEFINE_F(ParquetWriteChunked, name)(::benchmark::State & state) \ - { \ - PQ_write_chunked(state); \ - } \ - BENCHMARK_REGISTER_F(ParquetWriteChunked, name) \ - ->Args({num_columns, num_chunks}) \ - ->Unit(benchmark::kMillisecond) \ - ->UseManualTime() \ - ->Iterations(4) - -PWCBM_BENCHMARK_DEFINE(3Gb8Cols64Chunks, 8, 8); -PWCBM_BENCHMARK_DEFINE(3Gb1024Cols64Chunks, 1024, 8); - -PWCBM_BENCHMARK_DEFINE(3Gb8Cols128Chunks, 8, 64); -PWCBM_BENCHMARK_DEFINE(3Gb1024Cols128Chunks, 1024, 64); +NVBENCH_BENCH(PQ_write) + .set_name("parquet_write_num_cols") + .set_min_samples(4) + .add_int64_axis("num_cols", {8, 1024}); + +NVBENCH_BENCH(PQ_write_chunked) + .set_name("parquet_chunked_write") + .set_min_samples(4) + .add_int64_axis("num_cols", {8, 1024}) + .add_int64_axis("num_chunks", {8, 64}); diff --git a/cpp/cmake/thirdparty/get_thrust.cmake b/cpp/cmake/thirdparty/get_thrust.cmake index cbdaf5520ff..379b1521bf0 100644 --- a/cpp/cmake/thirdparty/get_thrust.cmake +++ b/cpp/cmake/thirdparty/get_thrust.cmake @@ -41,8 +41,8 @@ function(find_and_configure_thrust VERSION) CPM_ARGS GIT_REPOSITORY https://github.com/NVIDIA/thrust.git GIT_TAG ${VERSION} - GIT_SHALLOW TRUE ${cpm_thrust_disconnect_update} PATCH_COMMAND patch --reject-file=- -p1 -N < - ${CUDF_SOURCE_DIR}/cmake/thrust.patch || true + GIT_SHALLOW TRUE ${cpm_thrust_disconnect_update} + PATCH_COMMAND patch --reject-file=- -p1 -N < ${CUDF_SOURCE_DIR}/cmake/thrust.patch || true OPTIONS "THRUST_INSTALL TRUE" ) @@ -80,6 +80,6 @@ function(find_and_configure_thrust VERSION) endif() endfunction() -set(CUDF_MIN_VERSION_Thrust 1.17.0) +set(CUDF_MIN_VERSION_Thrust 1.17.2) find_and_configure_thrust(${CUDF_MIN_VERSION_Thrust}) diff --git a/cpp/doxygen/developer_guide/DOCUMENTATION.md b/cpp/doxygen/developer_guide/DOCUMENTATION.md index c9f38d5849b..8a7d89c8dbd 100644 --- a/cpp/doxygen/developer_guide/DOCUMENTATION.md +++ b/cpp/doxygen/developer_guide/DOCUMENTATION.md @@ -437,7 +437,7 @@ We recommend installing Doxygen using conda (`conda install doxygen`) or a Linux Alternatively you can [build and install doxygen from source](https://www.doxygen.nl/manual/install.html). To build the libcudf HTML documentation simply run the `doxygen` command from the `cpp/doxygen` directory containing the `Doxyfile`. -The libcudf documentation can also be built using `make docs_cudf` from the cmake build directory (e.g. `cpp/build`). +The libcudf documentation can also be built using `cmake --build . --target docs_cudf` from the cmake build directory (e.g. `cpp/build`). Doxygen reads and processes all appropriate source files under the `cpp/include/` directory. The output is generated in the `cpp/doxygen/html/` directory. You can load the local `index.html` file generated there into any web browser to view the result. 
diff --git a/cpp/examples/basic/src/process_csv.cpp b/cpp/examples/basic/src/process_csv.cpp
index 1d6e718717d..5a3914da453 100644
--- a/cpp/examples/basic/src/process_csv.cpp
+++ b/cpp/examples/basic/src/process_csv.cpp
@@ -1,3 +1,19 @@
+/*
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
 #include <cudf/groupby.hpp>
 #include <cudf/io/csv.hpp>
 #include <cudf/table/table.hpp>
diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp
index 73724b99589..66ac6d74cff 100644
--- a/cpp/include/cudf/io/json.hpp
+++ b/cpp/include/cudf/io/json.hpp
@@ -38,6 +38,22 @@ namespace io {

 class json_reader_options_builder;

+/**
+ * @brief Allows specifying the target types for nested JSON data via json_reader_options'
+ * `set_dtypes` method.
+ */
+struct schema_element {
+  /**
+   * @brief The type that this column should be converted to
+   */
+  data_type type;
+
+  /**
+   * @brief Allows specifying this column's child columns target type
+   */
+  std::map<std::string, schema_element> child_types;
+};
+
 /**
  * @brief Input arguments to the `read_json` interface.
  *
@@ -65,7 +81,10 @@ class json_reader_options {
   source_info _source;

   // Data types of the column; empty to infer dtypes
-  std::variant<std::vector<data_type>, std::map<std::string, data_type>> _dtypes;
+  std::variant<std::vector<data_type>,
+               std::map<std::string, data_type>,
+               std::map<std::string, schema_element>>
+    _dtypes;

   // Specify the compression format of the source or infer from file extension
   compression_type _compression = compression_type::AUTO;
@@ -83,6 +102,9 @@ class json_reader_options {
   // Whether to use the experimental reader
   bool _experimental = false;

+  // Whether to keep the quote characters of string values
+  bool _keep_quotes = false;
+
   /**
    * @brief Constructor from source info.
    *
@@ -120,7 +142,10 @@ class json_reader_options {
    *
    * @returns Data types of the columns
    */
-  std::variant<std::vector<data_type>, std::map<std::string, data_type>> const& get_dtypes() const
+  std::variant<std::vector<data_type>,
+               std::map<std::string, data_type>,
+               std::map<std::string, schema_element>> const&
+  get_dtypes() const
   {
     return _dtypes;
   }
@@ -203,6 +228,13 @@ class json_reader_options {
    */
   bool is_enabled_experimental() const { return _experimental; }

+  /**
+   * @brief Whether the experimental reader should keep quotes of string values.
+   *
+   * @returns true if the experimental reader should keep quotes, false otherwise
+   */
+  bool is_enabled_keep_quotes() const { return _keep_quotes; }
+
   /**
    * @brief Set data types for columns to be read.
    *
@@ -217,6 +249,13 @@ class json_reader_options {
    */
   void set_dtypes(std::map<std::string, data_type> types) { _dtypes = std::move(types); }

+  /**
+   * @brief Set data types for a potentially nested column hierarchy.
+   *
+   * @param types Map of column names to schema_element to support arbitrary nesting of data types
+   */
+  void set_dtypes(std::map<std::string, schema_element> types) { _dtypes = std::move(types); }
+
   /**
    * @brief Set the compression type.
    *
@@ -258,6 +297,14 @@ class json_reader_options {
    * @param val Boolean value to enable/disable the experimental reader
    */
   void enable_experimental(bool val) { _experimental = val; }
+
+  /**
+   * @brief Set whether the experimental reader should keep quotes of string values.
+ * + * @param val Boolean value to indicate whether the experimental reader should keep quotes + * of string values + */ + void enable_keep_quotes(bool val) { _keep_quotes = val; } }; /** @@ -305,6 +352,18 @@ class json_reader_options_builder { return *this; } + /** + * @brief Set data types for columns to be read. + * + * @param types Column name -> schema_element map + * @return this for chaining + */ + json_reader_options_builder& dtypes(std::map types) + { + options._dtypes = std::move(types); + return *this; + } + /** * @brief Set the compression type. * @@ -377,6 +436,19 @@ class json_reader_options_builder { return *this; } + /** + * @brief Set whether the experimental reader should keep quotes of string values. + * + * @param val Boolean value to indicate whether the experimental reader should keep quotes + * of string values + * @return this for chaining + */ + json_reader_options_builder& keep_quotes(bool val) + { + options._keep_quotes = val; + return *this; + } + /** * @brief move json_reader_options member once it's built. */ diff --git a/cpp/include/cudf/io/orc_metadata.hpp b/cpp/include/cudf/io/orc_metadata.hpp index e5b89cc0f91..d974eaa103a 100644 --- a/cpp/include/cudf/io/orc_metadata.hpp +++ b/cpp/include/cudf/io/orc_metadata.hpp @@ -107,7 +107,7 @@ struct double_statistics : minmax_statistics, sum_statistics { * order. The `sum` is the total length of elements in the column. * Note: According to ORC specs, the sum should be signed, but pyarrow uses unsigned value */ -struct string_statistics : minmax_statistics, sum_statistics { +struct string_statistics : minmax_statistics, sum_statistics { }; /** @@ -163,6 +163,7 @@ struct column_statistics; */ struct column_statistics { std::optional number_of_values; ///< number of statistics + std::optional has_null; ///< column has any nulls std::variant; auto find_key = static_cast(key).value(stream); auto keys_view = column_device_view::create(input.keys(), stream); - auto iter = thrust::equal_range(thrust::device, // segfaults: rmm::exec_policy(stream) and - // thrust::cuda::par.on(stream) + auto iter = thrust::equal_range(rmm::exec_policy(cudf::default_stream_value), keys_view->begin(), keys_view->end(), find_key); diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu index 5850b3af1c4..eeb27c2ac05 100644 --- a/cpp/src/interop/to_arrow.cu +++ b/cpp/src/interop/to_arrow.cu @@ -30,6 +30,7 @@ #include #include +#include #include #include @@ -153,7 +154,8 @@ std::shared_ptr dispatch_to_arrow::operator()( auto count = thrust::make_counting_iterator(0); - thrust::for_each(count, + thrust::for_each(rmm::exec_policy(cudf::default_stream_value), + count, count + input.size(), [in = input.begin(), out = buf.data()] __device__(auto in_idx) { auto const out_idx = in_idx * 2; diff --git a/cpp/src/io/fst/agent_dfa.cuh b/cpp/src/io/fst/agent_dfa.cuh index d847598d6dd..e02c7ff85fa 100644 --- a/cpp/src/io/fst/agent_dfa.cuh +++ b/cpp/src/io/fst/agent_dfa.cuh @@ -422,7 +422,7 @@ struct AgentDFA { OffsetT const num_total_symbols, StateIndexT& state, CallbackOpT& callback_op, - cub::Int2Type /**/) + cub::Int2Type) { using StateTransitionOpT = StateTransitionOp; diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index 6f702a489a0..aabaa941daf 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -287,6 +287,7 @@ raw_orc_statistics read_raw_orc_statistics(source_info const& src_info) column_statistics::column_statistics(cudf::io::orc::column_statistics&& cs) { number_of_values = cs.number_of_values; + 
has_null = cs.has_null; if (cs.int_stats) { type_specific_stats = *cs.int_stats; } else if (cs.double_stats) { diff --git a/cpp/src/io/json/experimental/read_json.cpp b/cpp/src/io/json/experimental/read_json.cpp index ceac40ba4f9..7d78bd34b19 100644 --- a/cpp/src/io/json/experimental/read_json.cpp +++ b/cpp/src/io/json/experimental/read_json.cpp @@ -47,9 +47,6 @@ table_with_metadata read_json(host_span> sources, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto const dtypes_empty = - std::visit([](const auto& dtypes) { return dtypes.empty(); }, reader_opts.get_dtypes()); - CUDF_EXPECTS(dtypes_empty, "user specified dtypes are not yet supported"); CUDF_EXPECTS(reader_opts.get_byte_range_offset() == 0 and reader_opts.get_byte_range_size() == 0, "specifying a byte range is not yet supported"); diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu new file mode 100644 index 00000000000..3a26a1479e5 --- /dev/null +++ b/cpp/src/io/json/json_tree.cu @@ -0,0 +1,714 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "nested_json.hpp" +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace cudf::io::json { +namespace detail { + +// The node that a token represents +struct token_to_node { + __device__ auto operator()(PdaTokenT const token) -> NodeT + { + switch (token) { + case token_t::StructBegin: return NC_STRUCT; + case token_t::ListBegin: return NC_LIST; + case token_t::StringBegin: return NC_STR; + case token_t::ValueBegin: return NC_VAL; + case token_t::FieldNameBegin: return NC_FN; + default: return NC_ERR; + }; + } +}; + +// Convert token indices to node range for each valid node. 
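+// Illustrative example (assuming StringBegin/StringEnd are emitted at the two
+// quote positions): for the input
+//     {"a":"x"}
+// the value's quotes sit at offsets 5 and 7, so with include_quote_char == true
+// the node range is [5, 8) ("x" with quotes), and with include_quote_char ==
+// false it is [6, 7) (just x).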
+struct node_ranges { + device_span tokens; + device_span token_indices; + bool include_quote_char; + __device__ auto operator()(size_type i) -> thrust::tuple + { + // Whether a token expects to be followed by its respective end-of-* token partner + auto is_begin_of_section = [] __device__(PdaTokenT const token) { + switch (token) { + case token_t::StringBegin: + case token_t::ValueBegin: + case token_t::FieldNameBegin: return true; + default: return false; + }; + }; + // The end-of-* partner token for a given beginning-of-* token + auto end_of_partner = [] __device__(PdaTokenT const token) { + switch (token) { + case token_t::StringBegin: return token_t::StringEnd; + case token_t::ValueBegin: return token_t::ValueEnd; + case token_t::FieldNameBegin: return token_t::FieldNameEnd; + default: return token_t::ErrorBegin; + }; + }; + // Includes quote char for end-of-string token or Skips the quote char for + // beginning-of-field-name token + auto get_token_index = [include_quote_char = include_quote_char] __device__( + PdaTokenT const token, SymbolOffsetT const token_index) { + constexpr SymbolOffsetT quote_char_size = 1; + switch (token) { + // Strip off quote char included for StringBegin + case token_t::StringBegin: return token_index + (include_quote_char ? 0 : quote_char_size); + // Strip off or Include trailing quote char for string values for StringEnd + case token_t::StringEnd: return token_index + (include_quote_char ? quote_char_size : 0); + // Strip off quote char included for FieldNameBegin + case token_t::FieldNameBegin: return token_index + quote_char_size; + default: return token_index; + }; + }; + PdaTokenT const token = tokens[i]; + // The section from the original JSON input that this token demarcates + SymbolOffsetT range_begin = get_token_index(token, token_indices[i]); + SymbolOffsetT range_end = range_begin + 1; // non-leaf, non-field nodes ignore this value. + if (is_begin_of_section(token)) { + if ((i + 1) < tokens.size() && end_of_partner(token) == tokens[i + 1]) { + // Update the range_end for this pair of tokens + range_end = get_token_index(tokens[i + 1], token_indices[i + 1]); + } + } + return thrust::make_tuple(range_begin, range_end); + } +}; + +// Generates a tree representation of the given tokens, token_indices. 
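+// Illustrative example: for the input [{"a": 1}] the tree has four nodes,
+//     node id:    0        1          2      3
+//     category:   NC_LIST  NC_STRUCT  NC_FN  NC_VAL
+//     level:      0        1          2      3
+//     parent id:  -1       0          1      2
+// with node_range_begin/end giving each node's span in the input text.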
+tree_meta_t get_tree_representation(device_span tokens, + device_span token_indices, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + // Whether a token does represent a node in the tree representation + auto is_node = [] __device__(PdaTokenT const token) -> bool { + switch (token) { + case token_t::StructBegin: + case token_t::ListBegin: + case token_t::StringBegin: + case token_t::ValueBegin: + case token_t::FieldNameBegin: + case token_t::ErrorBegin: return true; + default: return false; + }; + }; + + // Whether the token pops from the parent node stack + auto does_pop = [] __device__(PdaTokenT const token) { + switch (token) { + case token_t::StructMemberEnd: + case token_t::StructEnd: + case token_t::ListEnd: return true; + default: return false; + }; + }; + + // Whether the token pushes onto the parent node stack + auto does_push = [] __device__(PdaTokenT const token) { + switch (token) { + case token_t::FieldNameBegin: + case token_t::StructBegin: + case token_t::ListBegin: return true; + default: return false; + }; + }; + + auto num_tokens = tokens.size(); + auto is_node_it = thrust::make_transform_iterator( + tokens.begin(), + [is_node] __device__(auto t) -> size_type { return static_cast(is_node(t)); }); + auto num_nodes = thrust::count_if( + rmm::exec_policy(stream), tokens.begin(), tokens.begin() + num_tokens, is_node); + + // Node categories: copy_if with transform. + rmm::device_uvector node_categories(num_nodes, stream, mr); + auto node_categories_it = + thrust::make_transform_output_iterator(node_categories.begin(), token_to_node{}); + auto node_categories_end = thrust::copy_if(rmm::exec_policy(stream), + tokens.begin(), + tokens.begin() + num_tokens, + node_categories_it, + is_node); + CUDF_EXPECTS(node_categories_end - node_categories_it == num_nodes, + "node category count mismatch"); + + // Node levels: transform_exclusive_scan, copy_if. + rmm::device_uvector token_levels(num_tokens, stream); + auto push_pop_it = thrust::make_transform_iterator( + tokens.begin(), [does_push, does_pop] __device__(PdaTokenT const token) -> size_type { + return does_push(token) - does_pop(token); + }); + thrust::exclusive_scan( + rmm::exec_policy(stream), push_pop_it, push_pop_it + num_tokens, token_levels.begin()); + + rmm::device_uvector node_levels(num_nodes, stream, mr); + auto node_levels_end = thrust::copy_if(rmm::exec_policy(stream), + token_levels.begin(), + token_levels.begin() + num_tokens, + tokens.begin(), + node_levels.begin(), + is_node); + CUDF_EXPECTS(node_levels_end - node_levels.begin() == num_nodes, "node level count mismatch"); + + // Node ranges: copy_if with transform. 
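+  // The transform_output_iterator applies node_ranges{} on write, so the single
+  // copy_if pass below both filters token positions down to node positions and
+  // converts each surviving token into its (range_begin, range_end) pair without
+  // materializing an intermediate buffer of node token indices.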
+ rmm::device_uvector node_range_begin(num_nodes, stream, mr); + rmm::device_uvector node_range_end(num_nodes, stream, mr); + auto node_range_tuple_it = + thrust::make_zip_iterator(node_range_begin.begin(), node_range_end.begin()); + // Whether the tokenizer stage should keep quote characters for string values + // If the tokenizer keeps the quote characters, they may be stripped during type casting + constexpr bool include_quote_char = true; + auto node_range_out_it = thrust::make_transform_output_iterator( + node_range_tuple_it, node_ranges{tokens, token_indices, include_quote_char}); + + auto node_range_out_end = + thrust::copy_if(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(0) + num_tokens, + node_range_out_it, + [is_node, tokens_gpu = tokens.begin()] __device__(size_type i) -> bool { + return is_node(tokens_gpu[i]); + }); + CUDF_EXPECTS(node_range_out_end - node_range_out_it == num_nodes, "node range count mismatch"); + + // Node parent ids: previous push token_id transform, stable sort, segmented scan with Max, + // reorder, copy_if. This one is sort of logical stack. But more generalized. + // TODO: make it own function. + rmm::device_uvector parent_token_ids(num_tokens, stream); + rmm::device_uvector initial_order(num_tokens, stream); + + thrust::sequence(rmm::exec_policy(stream), initial_order.begin(), initial_order.end()); + thrust::tabulate(rmm::exec_policy(stream), + parent_token_ids.begin(), + parent_token_ids.end(), + [does_push, tokens_gpu = tokens.begin()] __device__(auto i) -> size_type { + return (i > 0) && does_push(tokens_gpu[i - 1]) ? i - 1 : -1; + // -1, not sentinel used here because of max operation below + }); + + auto out_pid = thrust::make_zip_iterator(parent_token_ids.data(), initial_order.data()); + // Uses radix sort for builtin types. + thrust::stable_sort_by_key(rmm::exec_policy(stream), + token_levels.data(), + token_levels.data() + token_levels.size(), + out_pid); + + // SegmentedScan Max. + thrust::inclusive_scan_by_key(rmm::exec_policy(stream), + token_levels.data(), + token_levels.data() + token_levels.size(), + parent_token_ids.data(), + parent_token_ids.data(), + thrust::equal_to{}, + thrust::maximum{}); + // Reusing token_levels memory & use scatter to restore the original order. + + std::swap(token_levels, parent_token_ids); + auto& sorted_parent_token_ids = token_levels; + thrust::scatter(rmm::exec_policy(stream), + sorted_parent_token_ids.begin(), + sorted_parent_token_ids.end(), + initial_order.data(), + parent_token_ids.data()); + + rmm::device_uvector node_ids_gpu(num_tokens, stream); + thrust::exclusive_scan( + rmm::exec_policy(stream), is_node_it, is_node_it + num_tokens, node_ids_gpu.begin()); + + rmm::device_uvector parent_node_ids(num_nodes, stream, mr); + auto parent_node_ids_it = thrust::make_transform_iterator( + parent_token_ids.begin(), + [node_ids_gpu = node_ids_gpu.begin()] __device__(size_type const pid) -> NodeIndexT { + return pid < 0 ? 
parent_node_sentinel : node_ids_gpu[pid]; + }); + auto parent_node_ids_end = thrust::copy_if(rmm::exec_policy(stream), + parent_node_ids_it, + parent_node_ids_it + parent_token_ids.size(), + tokens.begin(), + parent_node_ids.begin(), + is_node); + CUDF_EXPECTS(parent_node_ids_end - parent_node_ids.begin() == num_nodes, + "parent node id gather mismatch"); + + return {std::move(node_categories), + std::move(parent_node_ids), + std::move(node_levels), + std::move(node_range_begin), + std::move(node_range_end)}; +} + +/** + * @brief Generates unique node_type id for each node. + * Field nodes with the same name are assigned the same node_type id. + * List, Struct, and String nodes are assigned their category values as node_type ids. + * + * All inputs and outputs are in node_id order. + * @param d_input JSON string in device memory + * @param d_tree Tree representation of the JSON + * @param stream CUDA stream used for device memory operations and kernel launches. + * @return Vector of node_type ids + */ +rmm::device_uvector hash_node_type_with_field_name(device_span d_input, + tree_meta_t const& d_tree, + rmm::cuda_stream_view stream) +{ + CUDF_FUNC_RANGE(); + using hash_table_allocator_type = rmm::mr::stream_allocator_adaptor>; + using hash_map_type = + cuco::static_map; + auto num_nodes = d_tree.node_categories.size(); + + constexpr size_type empty_node_index_sentinel = -1; + hash_map_type key_map{compute_hash_table_size(num_nodes), // TODO reduce oversubscription + cuco::sentinel::empty_key{empty_node_index_sentinel}, + cuco::sentinel::empty_value{empty_node_index_sentinel}, + hash_table_allocator_type{default_allocator{}, stream}, + stream.value()}; + auto d_hasher = [d_input = d_input.data(), + node_range_begin = d_tree.node_range_begin.data(), + node_range_end = d_tree.node_range_end.data()] __device__(auto node_id) { + auto const field_name = cudf::string_view(d_input + node_range_begin[node_id], + node_range_end[node_id] - node_range_begin[node_id]); + return cudf::detail::default_hash{}(field_name); + }; + auto d_equal = [d_input = d_input.data(), + node_range_begin = d_tree.node_range_begin.data(), + node_range_end = d_tree.node_range_end.data()] __device__(auto node_id1, + auto node_id2) { + auto const field_name1 = cudf::string_view( + d_input + node_range_begin[node_id1], node_range_end[node_id1] - node_range_begin[node_id1]); + auto const field_name2 = cudf::string_view( + d_input + node_range_begin[node_id2], node_range_end[node_id2] - node_range_begin[node_id2]); + return field_name1 == field_name2; + }; + auto is_field_name_node = [node_categories = d_tree.node_categories.data()] __device__( + auto node_id) { return node_categories[node_id] == node_t::NC_FN; }; + // key-value pairs: uses node_id itself as node_type. (unique node_id for a field name due to + // hashing) + auto iter = cudf::detail::make_counting_transform_iterator( + 0, [] __device__(size_type i) { return cuco::make_pair(i, i); }); + + key_map.insert_if(iter, + iter + num_nodes, + thrust::counting_iterator(0), // stencil + is_field_name_node, + d_hasher, + d_equal, + stream.value()); + auto get_hash_value = + [key_map = key_map.get_device_view(), d_hasher, d_equal] __device__(auto node_id) -> size_type { + auto it = key_map.find(node_id, d_hasher, d_equal); + return (it == key_map.end()) ? size_type{0} : it->second.load(cuda::std::memory_order_relaxed); + }; + // convert field nodes to node indices, and other nodes to enum value. 
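+  // Illustrative example (the hash-assigned ids are hypothetical; they depend on
+  // insertion order): for node categories
+  //     [NC_STRUCT, NC_FN("a"), NC_STR, NC_FN("b"), NC_STR, NC_FN("a"), NC_STR]
+  // the node types could become
+  //     [NC_STRUCT, NUM_NODE_CLASSES + 1, NC_STR, NUM_NODE_CLASSES + 3, NC_STR,
+  //      NUM_NODE_CLASSES + 1, NC_STR]
+  // i.e. both "a" fields share one type while "a" and "b" remain distinct.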
+ rmm::device_uvector node_type(num_nodes, stream); + thrust::tabulate(rmm::exec_policy(stream), + node_type.begin(), + node_type.end(), + [node_categories = d_tree.node_categories.data(), + is_field_name_node, + get_hash_value] __device__(auto node_id) -> size_type { + if (is_field_name_node(node_id)) + return static_cast(NUM_NODE_CLASSES) + get_hash_value(node_id); + else + return static_cast(node_categories[node_id]); + }); + return node_type; +} + +/** + * @brief Translates sorted parent_node_ids to parent_indices with indices from scatter_indices + * + * @param scatter_indices The sorted order of parent_node_ids + * @param parent_node_ids The sorted parent_node_ids + * @param stream CUDA stream used for device memory operations and kernel launches + * @return Translated parent_indices pointing to sorted node_ids positions + */ +rmm::device_uvector translate_sorted_parent_node_indices( + device_span scatter_indices, + device_span parent_node_ids, + rmm::cuda_stream_view stream) +{ + auto const num_nodes = scatter_indices.size(); + auto const gather_indices = cudf::detail::scatter_to_gather( + scatter_indices.begin(), scatter_indices.end(), num_nodes, stream); + + rmm::device_uvector parent_indices(num_nodes, stream); + // gather, except parent sentinels + thrust::transform(rmm::exec_policy(stream), + parent_node_ids.begin(), + parent_node_ids.end(), + parent_indices.begin(), + [gather_indices = gather_indices.data()] __device__(auto parent_node_id) { + return (parent_node_id == parent_node_sentinel) + ? parent_node_sentinel + : gather_indices[parent_node_id]; + }); + return parent_indices; +}; + +/** + * @brief Generates column id and parent column id for each node from the node_level sorted inputs + * + * 4. Per-Level Processing: Propagate parent node ids for each level. + * For each level, + * a. gather col_id from previous level results. input=col_id, gather_map is parent_indices. + * b. stable sort by {parent_col_id, node_type} + * c. scan sum of unique {parent_col_id, node_type} + * d. scatter the col_id back to stable node_level order (using scatter_indices) + * + * pre-condition: All input arguments are stable sorted by node_level (stable in node_id order) + * post-condition: Returned column_id, parent_col_id are level sorted. + * @param node_type Unique id to identify node type, field with different name has different id. 
+ * @param parent_indices Parent node indices in the sorted node_level order + * @param d_level_boundaries The boundaries of each level in the sorted node_level order + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return column_id, parent_column_id + */ +std::pair, rmm::device_uvector> generate_column_id( + device_span node_type, // level sorted + device_span parent_indices, // level sorted + device_span d_level_boundaries, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + + auto const num_nodes = node_type.size(); + rmm::device_uvector scatter_indices(num_nodes, stream); + thrust::sequence(rmm::exec_policy(stream), scatter_indices.begin(), scatter_indices.end()); + rmm::device_uvector col_id(num_nodes, stream, mr); + rmm::device_uvector parent_col_id(num_nodes, stream); + // scatter 1 to level_boundaries alone, useful for scan later + thrust::scatter(rmm::exec_policy(stream), + thrust::make_constant_iterator(1), + thrust::make_constant_iterator(1) + d_level_boundaries.size() - 1, + d_level_boundaries.begin(), + col_id.begin()); + auto level_boundaries = cudf::detail::make_std_vector_async(d_level_boundaries, stream); + // Initialize First level node's node col_id to 0 + thrust::fill(rmm::exec_policy(stream), col_id.begin(), col_id.begin() + level_boundaries[0], 0); + // Initialize First level node's parent_col_id to parent_node_sentinel sentinel + thrust::fill(rmm::exec_policy(stream), + parent_col_id.begin(), + parent_col_id.begin() + level_boundaries[0], + parent_node_sentinel); + + // Per-level processing + auto const num_levels = level_boundaries.size(); + for (size_t level = 1; level < num_levels; level++) { + // Gather the each node's parent's column id for the nodes of the current level + thrust::gather(rmm::exec_policy(stream), + parent_indices.data() + level_boundaries[level - 1], + parent_indices.data() + level_boundaries[level], + col_id.data(), + parent_col_id.data() + level_boundaries[level - 1]); + + // To invoke Radix sort for keys {parent_col_id, node_type} instead of merge sort, + // we need to split to 2 Radix sorts. + // Secondary sort on node_type + thrust::stable_sort_by_key( + rmm::exec_policy(stream), + node_type.data() + level_boundaries[level - 1], + node_type.data() + level_boundaries[level], + thrust::make_zip_iterator(parent_col_id.begin() + level_boundaries[level - 1], + scatter_indices.begin())); + // Primary sort on parent_col_id + thrust::stable_sort_by_key( + rmm::exec_policy(stream), + parent_col_id.begin() + level_boundaries[level - 1], + parent_col_id.begin() + level_boundaries[level], + thrust::make_zip_iterator(node_type.data() + level_boundaries[level - 1], + scatter_indices.begin())); + + auto start_it = thrust::make_zip_iterator(parent_col_id.begin() + level_boundaries[level - 1], + node_type.data() + level_boundaries[level - 1]); + auto adjacent_pair_it = thrust::make_zip_iterator(start_it - 1, start_it); + // Compares two adjacent items, beginning with the first and second item from the current level. + // Writes flags to the index of the rhs item. + // First index holds next col_id from previous level. 
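+    // Illustrative example (hypothetical values): suppose this level's sorted
+    // {parent_col_id, node_type} keys are  A A B B C  and the previous level's
+    // scan left 3 in this level's first slot (its last col_id, 2, plus the
+    // scattered 1). The transform writes difference flags into the remaining
+    // slots, giving  3 0 1 0 1, and the inclusive_scan below turns that into
+    // col ids  3 3 4 4 5.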
+ thrust::transform(rmm::exec_policy(stream), + adjacent_pair_it + 1, + adjacent_pair_it + level_boundaries[level] - level_boundaries[level - 1], + col_id.data() + level_boundaries[level - 1] + 1, + [] __device__(auto adjacent_pair) -> size_type { + auto const lhs = thrust::get<0>(adjacent_pair); + auto const rhs = thrust::get<1>(adjacent_pair); + return lhs != rhs ? 1 : 0; + }); + + // includes previous level last col_id to continue the index. + thrust::inclusive_scan(rmm::exec_policy(stream), + col_id.data() + level_boundaries[level - 1], + col_id.data() + level_boundaries[level] + (level != num_levels - 1), + // +1 only for not-last-levels, for next level start col_id + col_id.data() + level_boundaries[level - 1]); + // scatter to restore original order. + auto const num_nodes_per_level = level_boundaries[level] - level_boundaries[level - 1]; + { + rmm::device_uvector tmp_col_id(num_nodes_per_level, stream); + rmm::device_uvector tmp_parent_col_id(num_nodes_per_level, stream); + thrust::scatter(rmm::exec_policy(stream), + thrust::make_zip_iterator(col_id.begin() + level_boundaries[level - 1], + parent_col_id.data() + level_boundaries[level - 1]), + thrust::make_zip_iterator(col_id.begin() + level_boundaries[level], + parent_col_id.data() + level_boundaries[level]), + scatter_indices.begin(), + thrust::make_zip_iterator(tmp_col_id.begin(), tmp_parent_col_id.begin())); + thrust::copy(rmm::exec_policy(stream), + tmp_col_id.begin(), + tmp_col_id.end(), + col_id.begin() + level_boundaries[level - 1]); + thrust::copy(rmm::exec_policy(stream), + tmp_parent_col_id.begin(), + tmp_parent_col_id.end(), + parent_col_id.begin() + level_boundaries[level - 1]); + } + thrust::sequence(rmm::exec_policy(stream), + scatter_indices.begin(), + scatter_indices.begin() + num_nodes_per_level); + } + + return {std::move(col_id), std::move(parent_col_id)}; +} + +/** + * @brief Computes row indices of each node in the hierarchy. + * 5. Generate row_offset. + * a. stable_sort by parent_col_id. + * b. scan_by_key {parent_col_id} (required only on nodes who's parent is list) + * c. propagate to non-list leaves from parent list node by recursion + * + * pre-condition: + * scatter_indices is a sequence, representing node_id. + * d_tree.node_categories, d_tree.parent_node_ids, parent_col_id are in order of node_id. + * post-condition: row_offsets is in order of node_id. + * parent_col_id and scatter_indices are sorted by parent_col_id. (unused after this function) + * @param scatter_indices node_id + * @param parent_col_id parent node's column id + * @param d_tree Tree representation of the JSON string + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. + * @return row_offsets + */ +rmm::device_uvector compute_row_offsets(device_span scatter_indices, + rmm::device_uvector&& parent_col_id, + tree_meta_t& d_tree, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + auto const num_nodes = d_tree.node_categories.size(); + // TODO generate scatter_indices sequences here itself + thrust::stable_sort_by_key( + rmm::exec_policy(stream), parent_col_id.begin(), parent_col_id.end(), scatter_indices.begin()); + rmm::device_uvector row_offsets(num_nodes, stream, mr); + // TODO is it possible to generate list child_offsets too here? 
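+  // Illustrative example: with parent_col_id sorted as  5 5 5 7 7  and a
+  // constant input of 1, the exclusive scan-by-key below yields  0 1 2 0 1,
+  // i.e. each node's row index within its parent column.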
+ thrust::exclusive_scan_by_key( + rmm::exec_policy(stream), + parent_col_id.begin(), // TODO: is there any way to limit this to list parents alone? + parent_col_id.end(), + thrust::make_constant_iterator(1), + row_offsets.begin()); + + // Using scatter instead of sort. + auto& temp_storage = parent_col_id; // reuse parent_col_id as temp storage + thrust::scatter(rmm::exec_policy(stream), + row_offsets.begin(), + row_offsets.end(), + scatter_indices.begin(), + temp_storage.begin()); + row_offsets = std::move(temp_storage); + + // Propagate row offsets to non-list leaves from list's immediate children node by recursion + thrust::transform_if( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_nodes), + row_offsets.begin(), + [node_categories = d_tree.node_categories.data(), + parent_node_ids = d_tree.parent_node_ids.begin(), + row_offsets = row_offsets.begin()] __device__(size_type node_id) { + auto parent_node_id = parent_node_ids[node_id]; + while (parent_node_id != parent_node_sentinel and + node_categories[parent_node_id] != node_t::NC_LIST) { + node_id = parent_node_id; + parent_node_id = parent_node_ids[parent_node_id]; + } + return row_offsets[node_id]; + }, + [node_categories = d_tree.node_categories.data(), + parent_node_ids = d_tree.parent_node_ids.begin()] __device__(size_type node_id) { + auto parent_node_id = parent_node_ids[node_id]; + return parent_node_id != parent_node_sentinel and + !(node_categories[parent_node_id] == node_t::NC_LIST); + }); + return row_offsets; +} + +/** +@note +This algorithm assigns a unique column id to each node in the tree. +The row offset is the row index of the node in that column id. +Algorithm: +1. Convert node_category+fieldname to node_type. + a. Create a hashmap to hash field name and assign unique node id as values. + b. Convert the node categories to node types. + Node type is defined as node category enum value if it is not a field node, + otherwise it is the unique node id assigned by the hashmap (value shifted by #NUM_CATEGORY). +2. Preprocessing: Translate parent node ids after sorting by level. + a. sort by level + b. get gather map of sorted indices + c. translate parent_node_ids to new sorted indices +3. Find level boundaries. + copy_if index of first unique values of sorted levels. +4. Per-Level Processing: Propagate parent node ids for each level. + For each level, + a. gather col_id from previous level results. input=col_id, gather_map is parent_indices. + b. stable sort by {parent_col_id, node_type} + c. scan sum of unique {parent_col_id, node_type} + d. scatter the col_id back to stable node_level order (using scatter_indices) + Restore original node_id order +5. Generate row_offset. + a. stable_sort by parent_col_id. + b. scan_by_key {parent_col_id} (required only on nodes whose parent is a list) + c. propagate to non-list leaves from parent list node by recursion +**/ +std::tuple, rmm::device_uvector> +records_orient_tree_traversal(device_span d_input, + tree_meta_t& d_tree, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + // 1. Convert node_category + field_name to node_type. + + auto num_nodes = d_tree.node_categories.size(); + rmm::device_uvector node_type = + hash_node_type_with_field_name(d_input, d_tree, stream); + // TODO two-level hashing: one for field names + // and another for {node-level, node_category} + field hash for the entire path + + // 2. Preprocessing: Translate parent node ids after sorting by level. + // a. 
sort by level + // b. get gather map of sorted indices + // c. translate parent_node_ids to sorted indices + + rmm::device_uvector scatter_indices(num_nodes, stream); + thrust::sequence(rmm::exec_policy(stream), scatter_indices.begin(), scatter_indices.end()); + + rmm::device_uvector parent_node_ids(d_tree.parent_node_ids, stream); // make a copy + auto out_pid = + thrust::make_zip_iterator(scatter_indices.data(), parent_node_ids.data(), node_type.data()); + // Uses cub radix sort. sort by level + thrust::stable_sort_by_key(rmm::exec_policy(stream), + d_tree.node_levels.data(), + d_tree.node_levels.data() + num_nodes, + out_pid); + + rmm::device_uvector parent_indices = + translate_sorted_parent_node_indices(scatter_indices, parent_node_ids, stream); + // TODO optimize memory usage: parent_node_ids is no longer needed + + // 3. Find level boundaries. + auto level_boundaries = [&]() { + // Already node_levels is sorted + auto max_level = d_tree.node_levels.back_element(stream); + rmm::device_uvector level_boundaries(max_level + 1, stream); + // TODO try reduce_by_key + auto level_end = + thrust::copy_if(rmm::exec_policy(stream), + thrust::make_counting_iterator(1), + thrust::make_counting_iterator(num_nodes + 1), + level_boundaries.begin(), + [num_nodes, node_levels = d_tree.node_levels.begin()] __device__(auto index) { + return index == num_nodes || node_levels[index] != node_levels[index - 1]; + }); + CUDF_EXPECTS(thrust::distance(level_boundaries.begin(), level_end) == max_level + 1, + "num_levels != max_level + 1"); + return level_boundaries; + }; + + // 4. Per-Level Processing: Propagate parent node ids for each level. + auto [col_id, parent_col_id] = generate_column_id(node_type, // level sorted + parent_indices, // level sorted + level_boundaries(), + stream, + mr); + + // restore original order of col_id, parent_col_id and used d_tree members + { + rmm::device_uvector tmp_col_id(num_nodes, stream); + rmm::device_uvector tmp_parent_col_id(num_nodes, stream); + rmm::device_uvector tmp_node_levels(num_nodes, stream); + thrust::scatter( + rmm::exec_policy(stream), + thrust::make_zip_iterator(col_id.begin(), parent_col_id.begin(), d_tree.node_levels.begin()), + thrust::make_zip_iterator(col_id.end(), parent_col_id.end(), d_tree.node_levels.end()), + scatter_indices.begin(), + thrust::make_zip_iterator( + tmp_col_id.begin(), tmp_parent_col_id.begin(), tmp_node_levels.begin())); + col_id = std::move(tmp_col_id); + parent_col_id = std::move(tmp_parent_col_id); + d_tree.node_levels = std::move(tmp_node_levels); + thrust::sequence(rmm::exec_policy(stream), scatter_indices.begin(), scatter_indices.end()); + } + + // 5. Generate row_offset. 
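+  // Illustrative end-to-end example (the concrete ids are hypothetical): for the
+  // input [{"a": 1}, {"a": 2}], one column id is assigned per distinct path,
+  // e.g. list -> 0, struct -> 1, field "a" -> 2, value -> 3, and the two structs
+  // (and, after propagation, their values) receive row offsets 0 and 1.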
+ auto row_offsets = + compute_row_offsets(scatter_indices, std::move(parent_col_id), d_tree, stream, mr); + return std::tuple{std::move(col_id), std::move(row_offsets)}; +} + +} // namespace detail +} // namespace cudf::io::json diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index 4e930f86591..548f5c4e9e9 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -21,10 +21,13 @@ #include #include #include +#include #include #include +#include +#include #include namespace cudf::io::json { @@ -58,7 +61,7 @@ using NodeT = char; /// Type used to index into the nodes within the tree of structs, lists, field names, and value /// nodes -using NodeIndexT = uint32_t; +using NodeIndexT = size_type; /// Type large enough to represent tree depth from [0, max-tree-depth); may be an unsigned type using TreeDepthT = StackLevelT; @@ -67,14 +70,14 @@ using TreeDepthT = StackLevelT; * @brief Struct that encapsulate all information of a columnar tree representation. */ struct tree_meta_t { - std::vector node_categories; - std::vector parent_node_ids; - std::vector node_levels; - std::vector node_range_begin; - std::vector node_range_end; + rmm::device_uvector node_categories; + rmm::device_uvector parent_node_ids; + rmm::device_uvector node_levels; + rmm::device_uvector node_range_begin; + rmm::device_uvector node_range_end; }; -constexpr NodeIndexT parent_node_sentinel = std::numeric_limits::max(); +constexpr NodeIndexT parent_node_sentinel = -1; /** * @brief Class of a node (or a node "category") within the tree representation @@ -125,6 +128,7 @@ struct json_column { // Following "items" as the default child column's name of a list column // Using the struct's field names std::map child_columns; + std::vector column_order; // Counting the current number of items in this column row_offset_t current_offset = 0; @@ -140,19 +144,7 @@ struct json_column { * * @param up_to_row_offset The row offset up to which to fill with nulls. */ - void null_fill(row_offset_t up_to_row_offset) - { - // Fill all the rows up to up_to_row_offset with "empty"/null rows - validity.resize(word_index(up_to_row_offset) + 1); - std::fill_n(std::back_inserter(string_offsets), - up_to_row_offset - string_offsets.size(), - (string_offsets.size() > 0) ? string_offsets.back() : 0); - std::fill_n(std::back_inserter(string_lengths), up_to_row_offset - string_lengths.size(), 0); - std::fill_n(std::back_inserter(child_offsets), - up_to_row_offset + 1 - child_offsets.size(), - (child_offsets.size() > 0) ? child_offsets.back() : 0); - current_offset = up_to_row_offset; - } + void null_fill(row_offset_t up_to_row_offset); /** * @brief Recursively iterates through the tree of columns making sure that all child columns of a @@ -160,26 +152,7 @@ struct json_column { * * @param min_row_count The minimum number of rows to be filled. 
*/ - void level_child_cols_recursively(row_offset_t min_row_count) - { - // Fill this columns with nulls up to the given row count - null_fill(min_row_count); - - // If this is a struct column, we need to level all its child columns - if (type == json_col_t::StructColumn) { - for (auto it = std::begin(child_columns); it != std::end(child_columns); it++) { - it->second.level_child_cols_recursively(min_row_count); - } - } - // If this is a list column, we need to make sure that its child column levels its children - else if (type == json_col_t::ListColumn) { - auto it = std::begin(child_columns); - // Make that child column fill its child columns up to its own row count - if (it != std::end(child_columns)) { - it->second.level_child_cols_recursively(it->second.current_offset); - } - } - } + void level_child_cols_recursively(row_offset_t min_row_count); /** * @brief Appends the row at the given index to the column, filling all rows between the column's @@ -193,42 +166,10 @@ struct json_column { * the offsets */ void append_row(uint32_t row_index, - json_col_t const& row_type, + json_col_t row_type, uint32_t string_offset, uint32_t string_end, - uint32_t child_count) - { - // If, thus far, the column's type couldn't be inferred, we infer it to the given type - if (type == json_col_t::Unknown) { type = row_type; } - - // We shouldn't run into this, as we shouldn't be asked to append an "unknown" row type - // CUDF_EXPECTS(type != json_col_t::Unknown, "Encountered invalid JSON token sequence"); - - // Fill all the omitted rows with "empty"/null rows (if needed) - null_fill(row_index); - - // Table listing what we intend to use for a given column type and row type combination - // col type | row type => {valid, FAIL, null} - // ----------------------------------------------- - // List | List => valid - // List | Struct => FAIL - // List | String => null - // Struct | List => FAIL - // Struct | Struct => valid - // Struct | String => null - // String | List => null - // String | Struct => null - // String | String => valid - bool const is_valid = (type == row_type); - if (static_cast(validity.size()) < word_index(current_offset)) - validity.push_back({}); - set_bit_unsafe(&validity.back(), intra_word_index(current_offset)); - valid_count += (is_valid) ? 1U : 0U; - string_offsets.push_back(string_offset); - string_lengths.push_back(string_end - string_offset); - child_offsets.push_back((child_offsets.size() > 0) ? child_offsets.back() + child_count : 0); - current_offset++; - }; + uint32_t child_count); }; /** @@ -243,6 +184,10 @@ enum token_t : PdaTokenT { ListBegin, /// End-of-list token (on encounter of semantic ']') ListEnd, + // Beginning-of-struct-member token + StructMemberBegin, + // End-of-struct-member token + StructMemberEnd, /// Beginning-of-field-name token (on encounter of first quote) FieldNameBegin, /// End-of-field-name token (on encounter of a field name's second quote) @@ -298,6 +243,39 @@ std::pair, rmm::device_uvector> ge rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Parses the given JSON string and generates a tree representation of the given input. 
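+ *
+ * The members of the returned tree_meta_t are parallel arrays indexed by node
+ * id (nodes in token order): each node's category, its parent's node id
+ * (parent_node_sentinel for roots), its tree depth, and the begin/end offsets
+ * of its text span in the input string.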
+ * + * @param tokens Vector of token types in the json string + * @param token_indices The indices within the input string corresponding to each token + * @param stream The CUDA stream to which kernels are dispatched + * @param mr Optional, resource with which to allocate + * @return A tree representation of the input JSON string as vectors of node type, parent index, + * level, begin index, and end index in the input JSON string + */ +tree_meta_t get_tree_representation( + device_span tokens, + device_span token_indices, + rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Traverse the tree representation of the JSON input in records orient format and populate + * the output columns indices and row offsets within that column. + * + * @param d_input The JSON input + * @param d_tree A tree representation of the input JSON string as vectors of node type, parent + * index, level, begin index, and end index in the input JSON string + * @param stream The CUDA stream to which kernels are dispatched + * @param mr Optional, resource with which to allocate + * @return A tuple of the output column indices and the row offsets within each column for each node + */ +std::tuple, rmm::device_uvector> +records_orient_tree_traversal( + device_span d_input, + tree_meta_t& d_tree, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Parses the given JSON string and generates table from the given input. * diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 63b1ad32ba1..552cd1e6167 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -18,11 +18,15 @@ #include #include -#include +#include +#include #include +#include #include +#include #include +#include #include #include #include @@ -30,10 +34,12 @@ #include #include +#include #include #include #include +#include #include #include @@ -129,9 +135,9 @@ std::array, TT_NUM_STATES> const trans // Translation table (i.e., for each transition, what are the symbols that we output) std::array, NUM_SYMBOL_GROUPS>, TT_NUM_STATES> const translation_table{ {/* IN_STATE { [ } ] " \ OTHER */ - /* TT_OOS */ {{{'{'}, {'['}, {'}'}, {']'}, {'x'}, {'x'}, {'x'}}}, - /* TT_STR */ {{{'x'}, {'x'}, {'x'}, {'x'}, {'x'}, {'x'}, {'x'}}}, - /* TT_ESC */ {{{'x'}, {'x'}, {'x'}, {'x'}, {'x'}, {'x'}, {'x'}}}}}; + /* TT_OOS */ {{{'{'}, {'['}, {'}'}, {']'}, {}, {}, {}}}, + /* TT_STR */ {{{}, {}, {}, {}, {}, {}, {}}}, + /* TT_ESC */ {{{}, {}, {}, {}, {}, {}, {}}}}}; // The DFA's starting state constexpr auto start_state = static_cast(TT_OOS); @@ -472,17 +478,19 @@ auto get_transition_table(bool newline_delimited_json) */ auto get_translation_table() { - constexpr auto StructBegin = token_t::StructBegin; - constexpr auto StructEnd = token_t::StructEnd; - constexpr auto ListBegin = token_t::ListBegin; - constexpr auto ListEnd = token_t::ListEnd; - constexpr auto FieldNameBegin = token_t::FieldNameBegin; - constexpr auto FieldNameEnd = token_t::FieldNameEnd; - constexpr auto StringBegin = token_t::StringBegin; - constexpr auto StringEnd = token_t::StringEnd; - constexpr auto ValueBegin = token_t::ValueBegin; - constexpr auto ValueEnd = token_t::ValueEnd; - constexpr auto ErrorBegin = token_t::ErrorBegin; + constexpr auto StructBegin = token_t::StructBegin; + constexpr auto StructEnd = token_t::StructEnd; + constexpr auto ListBegin = 
token_t::ListBegin; + constexpr auto ListEnd = token_t::ListEnd; + constexpr auto StructMemberBegin = token_t::StructMemberBegin; + constexpr auto StructMemberEnd = token_t::StructMemberEnd; + constexpr auto FieldNameBegin = token_t::FieldNameBegin; + constexpr auto FieldNameEnd = token_t::FieldNameEnd; + constexpr auto StringBegin = token_t::StringBegin; + constexpr auto StringEnd = token_t::StringEnd; + constexpr auto ValueBegin = token_t::ValueBegin; + constexpr auto ValueEnd = token_t::ValueEnd; + constexpr auto ErrorBegin = token_t::ErrorBegin; std::array, NUM_PDA_SGIDS>, PD_NUM_STATES> pda_tlt; pda_tlt[static_cast(pda_state_t::PD_BOV)] = {{ /*ROOT*/ @@ -521,78 +529,80 @@ auto get_translation_table() {}, // WHITE_SPACE {}, // LINE_BREAK {ValueBegin}}}; // OTHER - pda_tlt[static_cast(pda_state_t::PD_BOA)] = {{ /*ROOT*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {ErrorBegin}, // COLON - {ErrorBegin}, // WHITE_SPACE - {ErrorBegin}, // LINE_BREAK - {ErrorBegin}, // OTHER - /*LIST*/ - {StructBegin}, // OPENING_BRACE - {ListBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ListEnd}, // CLOSING_BRACKET - {StringBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {ErrorBegin}, // COLON - {}, // WHITE_SPACE - {}, // LINE_BREAK - {ValueBegin}, // OTHER - /*STRUCT*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {StructEnd}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {FieldNameBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {ErrorBegin}, // COLON - {}, // WHITE_SPACE - {}, // LINE_BREAK - {ErrorBegin}}}; // OTHER - pda_tlt[static_cast(pda_state_t::PD_LON)] = {{ /*ROOT*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {ErrorBegin}, // COLON - {ValueEnd}, // WHITE_SPACE - {ValueEnd}, // LINE_BREAK - {}, // OTHER - /*LIST*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ValueEnd, ListEnd}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ValueEnd}, // COMMA - {ErrorBegin}, // COLON - {ValueEnd}, // WHITE_SPACE - {ValueEnd}, // LINE_BREAK - {}, // OTHER - /*STRUCT*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ValueEnd, StructEnd}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ValueEnd}, // COMMA - {ErrorBegin}, // COLON - {ValueEnd}, // WHITE_SPACE - {ValueEnd}, // LINE_BREAK - {}}}; // OTHER + pda_tlt[static_cast(pda_state_t::PD_BOA)] = { + { /*ROOT*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {ErrorBegin}, // WHITE_SPACE + {ErrorBegin}, // LINE_BREAK + {ErrorBegin}, // OTHER + /*LIST*/ + {StructBegin}, // OPENING_BRACE + {ListBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ListEnd}, // CLOSING_BRACKET + {StringBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {}, // WHITE_SPACE + {}, // LINE_BREAK + {ValueBegin}, // OTHER + /*STRUCT*/ + {ErrorBegin}, // 
OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {StructEnd}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {StructMemberBegin, FieldNameBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {}, // WHITE_SPACE + {}, // LINE_BREAK + {ErrorBegin}}}; // OTHER + pda_tlt[static_cast(pda_state_t::PD_LON)] = { + { /*ROOT*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {ValueEnd}, // WHITE_SPACE + {ValueEnd}, // LINE_BREAK + {}, // OTHER + /*LIST*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ValueEnd, ListEnd}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ValueEnd}, // COMMA + {ErrorBegin}, // COLON + {ValueEnd}, // WHITE_SPACE + {ValueEnd}, // LINE_BREAK + {}, // OTHER + /*STRUCT*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ValueEnd, StructMemberEnd, StructEnd}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ValueEnd, StructMemberEnd}, // COMMA + {ErrorBegin}, // COLON + {ValueEnd}, // WHITE_SPACE + {ValueEnd}, // LINE_BREAK + {}}}; // OTHER pda_tlt[static_cast(pda_state_t::PD_STR)] = {{ /*ROOT*/ {}, // OPENING_BRACE @@ -668,79 +678,81 @@ auto get_translation_table() {}, // LINE_BREAK {}}}; // OTHER - pda_tlt[static_cast(pda_state_t::PD_PVL)] = {{ /*ROOT*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {ErrorBegin}, // COLON - {}, // WHITE_SPACE - {}, // LINE_BREAK - {ErrorBegin}, // OTHER - /*LIST*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ListEnd}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {}, // COMMA - {ErrorBegin}, // COLON - {}, // WHITE_SPACE - {}, // LINE_BREAK - {ErrorBegin}, // OTHER - /*STRUCT*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {StructEnd}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {}, // COMMA - {ErrorBegin}, // COLON - {}, // WHITE_SPACE - {}, // LINE_BREAK - {ErrorBegin}}}; // OTHER - - pda_tlt[static_cast(pda_state_t::PD_BFN)] = {{ /*ROOT*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {ErrorBegin}, // COLON - {ErrorBegin}, // WHITE_SPACE - {ErrorBegin}, // LINE_BREAK - {ErrorBegin}, // OTHER - /*LIST*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {ErrorBegin}, // COLON - {ErrorBegin}, // WHITE_SPACE - {ErrorBegin}, // LINE_BREAK - {ErrorBegin}, // OTHER - /*STRUCT*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {FieldNameBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {ErrorBegin}, // COLON - {}, // WHITE_SPACE - {}, // LINE_BREAK - {ErrorBegin}}}; // OTHER + 
pda_tlt[static_cast(pda_state_t::PD_PVL)] = { + { /*ROOT*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {}, // WHITE_SPACE + {}, // LINE_BREAK + {ErrorBegin}, // OTHER + /*LIST*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ListEnd}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {}, // COMMA + {ErrorBegin}, // COLON + {}, // WHITE_SPACE + {}, // LINE_BREAK + {ErrorBegin}, // OTHER + /*STRUCT*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {StructMemberEnd, StructEnd}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {StructMemberEnd}, // COMMA + {ErrorBegin}, // COLON + {}, // WHITE_SPACE + {}, // LINE_BREAK + {ErrorBegin}}}; // OTHER + + pda_tlt[static_cast(pda_state_t::PD_BFN)] = { + { /*ROOT*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {ErrorBegin}, // WHITE_SPACE + {ErrorBegin}, // LINE_BREAK + {ErrorBegin}, // OTHER + /*LIST*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {ErrorBegin}, // WHITE_SPACE + {ErrorBegin}, // LINE_BREAK + {ErrorBegin}, // OTHER + /*STRUCT*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {StructMemberBegin, FieldNameBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {}, // WHITE_SPACE + {}, // LINE_BREAK + {ErrorBegin}}}; // OTHER pda_tlt[static_cast(pda_state_t::PD_FLN)] = {{ /*ROOT*/ {ErrorBegin}, // OPENING_BRACE @@ -908,13 +920,107 @@ struct JSONToStackOp { } }; +void json_column::null_fill(row_offset_t up_to_row_offset) +{ + // Fill all the rows up to up_to_row_offset with "empty"/null rows + validity.resize(word_index(up_to_row_offset) + 1); + std::fill_n(std::back_inserter(string_offsets), + up_to_row_offset - string_offsets.size(), + (string_offsets.size() > 0) ? string_offsets.back() : 0); + std::fill_n(std::back_inserter(string_lengths), up_to_row_offset - string_lengths.size(), 0); + std::fill_n(std::back_inserter(child_offsets), + up_to_row_offset + 1 - child_offsets.size(), + (child_offsets.size() > 0) ? 
child_offsets.back() : 0); + current_offset = up_to_row_offset; +} + +void json_column::level_child_cols_recursively(row_offset_t min_row_count) +{ + // Fill this columns with nulls up to the given row count + null_fill(min_row_count); + + // If this is a struct column, we need to level all its child columns + if (type == json_col_t::StructColumn) { + for (auto it = std::begin(child_columns); it != std::end(child_columns); it++) { + it->second.level_child_cols_recursively(min_row_count); + } + } + // If this is a list column, we need to make sure that its child column levels its children + else if (type == json_col_t::ListColumn) { + auto it = std::begin(child_columns); + // Make that child column fill its child columns up to its own row count + if (it != std::end(child_columns)) { + it->second.level_child_cols_recursively(it->second.current_offset); + } + } +}; + +void json_column::append_row(uint32_t row_index, + json_col_t row_type, + uint32_t string_offset, + uint32_t string_end, + uint32_t child_count) +{ + // If, thus far, the column's type couldn't be inferred, we infer it to the given type + if (type == json_col_t::Unknown) { + type = row_type; + } + // If, at some point within a column, we encounter a nested type (list or struct), + // we change that column's type to that respective nested type and invalidate all previous rows + else if (type == json_col_t::StringColumn && + (row_type == json_col_t::ListColumn || row_type == json_col_t::StructColumn)) { + // Change the column type + type = row_type; + + // Invalidate all previous entries, as they were _not_ of the nested type to which we just + // converted + std::fill_n(validity.begin(), validity.size(), 0); + valid_count = 0U; + } + // If this is a nested column but we're trying to insert either (a) a list node into a struct + // column or (b) a struct node into a list column, we fail + CUDF_EXPECTS(not((type == json_col_t::ListColumn and row_type == json_col_t::StructColumn) or + (type == json_col_t::StructColumn and row_type == json_col_t::ListColumn)), + "A mix of lists and structs within the same column is not supported"); + + // We shouldn't run into this, as we shouldn't be asked to append an "unknown" row type + CUDF_EXPECTS(type != json_col_t::Unknown, "Encountered invalid JSON token sequence"); + + // Fill all the omitted rows with "empty"/null rows (if needed) + null_fill(row_index); + + // Table listing what we intend to use for a given column type and row type combination + // col type | row type => {valid, FAIL, null} + // ----------------------------------------------- + // List | List => valid + // List | Struct => FAIL + // List | String => null + // Struct | List => FAIL + // Struct | Struct => valid + // Struct | String => null + // String | List => valid (we switch col type to list, null'ing all previous rows) + // String | Struct => valid (we switch col type to list, null'ing all previous rows) + // String | String => valid + bool const is_valid = (type == row_type); + if (static_cast(validity.size()) < word_index(current_offset)) validity.push_back({}); + if (is_valid) { set_bit_unsafe(&validity.back(), intra_word_index(current_offset)); } + valid_count += (is_valid) ? 1U : 0U; + string_offsets.push_back(string_offset); + string_lengths.push_back(string_end - string_offset); + child_offsets.push_back((child_offsets.size() > 0) ? 
child_offsets.back() + child_count : 0); + current_offset++; +}; + namespace detail { void get_stack_context(device_span json_in, SymbolT* d_top_of_stack, rmm::cuda_stream_view stream) { - constexpr std::size_t single_item = 1; + // Range of encapsulating function that comprises: + // -> DFA simulation for filtering out brackets and braces inside of quotes + // -> Logical stack to infer the stack context + CUDF_FUNC_RANGE(); // Symbol representing the JSON-root (i.e., we're at nesting level '0') constexpr StackSymbolT root_symbol = '_'; @@ -922,7 +1028,7 @@ void get_stack_context(device_span json_in, constexpr StackSymbolT read_symbol = 'x'; // Number of stack operations in the input (i.e., number of '{', '}', '[', ']' outside of quotes) - hostdevice_vector num_stack_ops(single_item, stream); + rmm::device_scalar d_num_stack_ops(stream); // Sequence of stack symbols and their position in the original input (sparse representation) rmm::device_uvector stack_ops{json_in.size(), stream}; @@ -945,14 +1051,17 @@ void get_stack_context(device_span json_in, static_cast(json_in.size()), stack_ops.data(), stack_op_indices.data(), - num_stack_ops.device_ptr(), + d_num_stack_ops.data(), to_stack_op::start_state, stream); + // Copy back to actual number of stack operations + auto const num_stack_ops = d_num_stack_ops.value(stream); + // stack operations with indices are converted to top of the stack for each character in the input fst::sparse_stack_op_to_top_of_stack( stack_ops.data(), - device_span{stack_op_indices.data(), stack_op_indices.size()}, + device_span{stack_op_indices.data(), num_stack_ops}, JSONToStackOp{}, d_top_of_stack, root_symbol, @@ -967,26 +1076,28 @@ std::pair, rmm::device_uvector> ge rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - rmm::device_uvector tokens{json_in.size(), stream, mr}; - rmm::device_uvector tokens_indices{json_in.size(), stream, mr}; - rmm::device_scalar num_written_tokens{stream, mr}; + // Range of encapsulating function that parses to internal columnar data representation + CUDF_FUNC_RANGE(); auto const new_line_delimited_json = options.is_enabled_lines(); - // Memory holding the top-of-stack stack context for the input - rmm::device_uvector stack_op_indices{json_in.size(), stream}; - - // Identify what is the stack context for each input character (is it: JSON-root, struct, or list) - get_stack_context(json_in, stack_op_indices.data(), stream); - // Prepare for PDA transducer pass, merging input symbols with stack symbols - rmm::device_uvector pda_sgids{json_in.size(), stream}; - auto zip_in = thrust::make_zip_iterator(json_in.data(), stack_op_indices.data()); - thrust::transform(rmm::exec_policy(stream), - zip_in, - zip_in + json_in.size(), - pda_sgids.data(), - tokenizer_pda::PdaSymbolToSymbolGroupId{}); + rmm::device_uvector pda_sgids = [json_in, stream]() { + rmm::device_uvector pda_sgids{json_in.size(), stream}; + // Memory holding the top-of-stack stack context for the input + rmm::device_uvector stack_op_indices{json_in.size(), stream}; + + // Identify what is the stack context for each input character (JSON-root, struct, or list) + get_stack_context(json_in, stack_op_indices.data(), stream); + + auto zip_in = thrust::make_zip_iterator(json_in.data(), stack_op_indices.data()); + thrust::transform(rmm::exec_policy(stream), + zip_in, + zip_in + json_in.size(), + pda_sgids.data(), + tokenizer_pda::PdaSymbolToSymbolGroupId{}); + return pda_sgids; + }(); // PDA transducer alias using ToTokenStreamFstT = @@ -1006,6 +1117,9 @@ std::pair, 
rmm::device_uvector> ge stream}; // Perform a PDA-transducer pass
+ rmm::device_scalar num_written_tokens{stream};
+ rmm::device_uvector tokens{json_in.size(), stream, mr};
+ rmm::device_uvector tokens_indices{json_in.size(), stream, mr};
json_to_tokens_fst.Transduce(pda_sgids.begin(), static_cast(json_in.size()), tokens.data(), @@ -1014,7 +1128,7 @@ std::pair, rmm::device_uvector> ge tokenizer_pda::start_state, stream);
- auto num_total_tokens = num_written_tokens.value(stream);
+ auto const num_total_tokens = num_written_tokens.value(stream);
tokens.resize(num_total_tokens, stream); tokens_indices.resize(num_total_tokens, stream); @@ -1030,6 +1144,8 @@ std::pair, rmm::device_uvector> ge * @param[in] input The JSON input in host memory * @param[in] d_input The JSON input in device memory * @param[in] options Parsing options specifying the parsing behaviour
+ * @param[in] include_quote_char Whether to include the original quote chars around string values,
+ * which makes it possible to distinguish string values from numeric and literal values
* @param[in] stream The CUDA stream to which kernels are dispatched * @param[in] mr Optional, resource with which to allocate * @return The columnar representation of the data from the given JSON input @@ -1039,9 +1155,13 @@ void make_json_column(json_column& root_column, host_span input, device_span d_input, cudf::io::json_reader_options const& options,
+ bool include_quote_char,
rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) {
+ // Range of encapsulating function that parses to internal columnar data representation
+ CUDF_FUNC_RANGE();
+
// Default name for a list's child column std::string const list_child_name = "element"; @@ -1079,12 +1199,20 @@ void make_json_column(json_column& root_column, }; };
- // Skips the quote char if the token is a beginning-of-string or beginning-of-field-name token
- auto get_token_index = [](PdaTokenT const token, SymbolOffsetT const token_index) {
- constexpr SymbolOffsetT skip_quote_char = 1;
+ // Depending on whether we want to include the quotes of strings or not, respectively, we:
+ // (a) strip off the beginning quote included in StringBegin and FieldNameBegin, or
+ // (b) include the end quote excluded from StringEnd and strip off the beginning quote
+ // included in FieldNameBegin
+ auto get_token_index = [include_quote_char](PdaTokenT const token,
+ SymbolOffsetT const token_index) {
+ constexpr SymbolOffsetT quote_char_size = 1;
switch (token) {
- case token_t::StringBegin: return token_index + skip_quote_char;
- case token_t::FieldNameBegin: return token_index + skip_quote_char;
+ // Optionally strip off quote char included for StringBegin
+ case token_t::StringBegin: return token_index + (include_quote_char ? 0 : quote_char_size);
+ // Optionally include trailing quote char for string values excluded for StringEnd
+ case token_t::StringEnd: return token_index + (include_quote_char ?
quote_char_size : 0); + // Strip off quote char included for FieldNameBegin + case token_t::FieldNameBegin: return token_index + quote_char_size; default: return token_index; }; }; @@ -1116,6 +1244,8 @@ void make_json_column(json_column& root_column, case token_t::StructEnd: return "StructEnd"; case token_t::ListBegin: return "ListBegin"; case token_t::ListEnd: return "ListEnd"; + case token_t::StructMemberBegin: return "StructMemberBegin"; + case token_t::StructMemberEnd: return "StructMemberEnd"; case token_t::FieldNameBegin: return "FieldNameBegin"; case token_t::FieldNameEnd: return "FieldNameEnd"; case token_t::StringBegin: return "StringBegin"; @@ -1169,6 +1299,7 @@ void make_json_column(json_column& root_column, if (current_data_path.top().column->child_columns.size() == 0) { current_data_path.top().column->child_columns.emplace(std::string{list_child_name}, json_column{json_col_t::Unknown}); + current_data_path.top().column->column_order.push_back(list_child_name); } current_data_path.top().current_selected_col = ¤t_data_path.top().column->child_columns.begin()->second; @@ -1208,6 +1339,7 @@ void make_json_column(json_column& root_column, // The field name's column does not exist yet, so we have to append the child column to the // struct column + struct_col->column_order.push_back(field_name); return &struct_col->child_columns.emplace(field_name, json_column{}).first->second; }; @@ -1401,16 +1533,38 @@ void make_json_column(json_column& root_column, root_column.level_child_cols_recursively(root_column.current_offset); } +/** + * @brief Retrieves the parse_options to be used for type inference and type casting + * + * @param options The reader options to influence the relevant type inference and type casting + * options + */ +auto parsing_options(cudf::io::json_reader_options const& options) +{ + auto parse_opts = cudf::io::parse_options{',', '\n', '\"', '.'}; + + auto const stream = cudf::default_stream_value; + parse_opts.dayfirst = options.is_enabled_dayfirst(); + parse_opts.keepquotes = options.is_enabled_keep_quotes(); + parse_opts.trie_true = cudf::detail::create_serialized_trie({"true"}, stream); + parse_opts.trie_false = cudf::detail::create_serialized_trie({"false"}, stream); + parse_opts.trie_na = cudf::detail::create_serialized_trie({"", "null"}, stream); + return parse_opts; +} + std::pair, std::vector> json_column_to_cudf_column( json_column const& json_col, device_span d_input, + cudf::io::json_reader_options const& options, + std::optional schema, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { + // Range of orchestrating/encapsulating function + CUDF_FUNC_RANGE(); + auto make_validity = [stream, mr](json_column const& json_col) -> std::pair { - if (json_col.current_offset == json_col.valid_count) { return {rmm::device_buffer{}, 0}; } - return {rmm::device_buffer{json_col.validity.data(), bitmask_allocation_size_bytes(json_col.current_offset), stream, @@ -1418,31 +1572,85 @@ std::pair, std::vector> json_column_to json_col.current_offset - json_col.valid_count}; }; + auto get_child_schema = [schema](auto child_name) -> std::optional { + if (schema.has_value()) { + auto const result = schema.value().child_types.find(child_name); + if (result != std::end(schema.value().child_types)) { return result->second; } + } + return {}; + }; + switch (json_col.type) { case json_col_t::StringColumn: { - // move string_offsets to GPU and transform to string column - auto const col_size = json_col.string_offsets.size(); - using char_length_pair_t = thrust::pair; 
+ auto const col_size = json_col.string_offsets.size(); CUDF_EXPECTS(json_col.string_offsets.size() == json_col.string_lengths.size(), "string offset, string length mismatch"); - rmm::device_uvector d_string_data(col_size, stream); + + // Move string_offsets and string_lengths to GPU rmm::device_uvector d_string_offsets = cudf::detail::make_device_uvector_async(json_col.string_offsets, stream); rmm::device_uvector d_string_lengths = cudf::detail::make_device_uvector_async(json_col.string_lengths, stream); + + // Prepare iterator that returns (string_offset, string_length)-tuples auto offset_length_it = thrust::make_zip_iterator(d_string_offsets.begin(), d_string_lengths.begin()); - thrust::transform(rmm::exec_policy(stream), - offset_length_it, - offset_length_it + col_size, - d_string_data.data(), - [data = d_input.data()] __device__(auto ip) { - return char_length_pair_t{data + thrust::get<0>(ip), thrust::get<1>(ip)}; - }); - auto str_col_ptr = make_strings_column(d_string_data, stream, mr); - auto [result_bitmask, null_count] = make_validity(json_col); - str_col_ptr->set_null_mask(result_bitmask, null_count); - return {std::move(str_col_ptr), {{"offsets"}, {"chars"}}}; + + // Prepare iterator that returns (string_offset, string_length)-pairs needed by inference + auto string_ranges_it = + thrust::make_transform_iterator(offset_length_it, [] __device__(auto ip) { + return thrust::pair{ + thrust::get<0>(ip), static_cast(thrust::get<1>(ip))}; + }); + + // Prepare iterator that returns (string_ptr, string_length)-pairs needed by type conversion + auto string_spans_it = thrust::make_transform_iterator( + offset_length_it, [data = d_input.data()] __device__(auto ip) { + return thrust::pair{ + data + thrust::get<0>(ip), static_cast(thrust::get<1>(ip))}; + }); + + data_type target_type{}; + + if (schema.has_value()) { +#ifdef NJP_DEBUG_PRINT + std::cout << "-> explicit type: " + << (schema.has_value() ? 
std::to_string(static_cast(schema->type.id())) + : "n/a"); +#endif + target_type = schema.value().type; + } + // Infer column type, if we don't have an explicit type for it + else { + target_type = cudf::io::detail::infer_data_type( + parsing_options(options).json_view(), d_input, string_ranges_it, col_size, stream); + } + + // Convert strings to the inferred data type + auto col = experimental::detail::parse_data(string_spans_it, + col_size, + target_type, + make_validity(json_col).first, + parsing_options(options).view(), + stream, + mr); + + // Reset nullable if we do not have nulls + // This is to match the existing JSON reader's behaviour: + // - Non-string columns will always be returned as nullable + // - String columns will be returned as nullable, iff there's at least one null entry + if (target_type.id() == type_id::STRING and col->null_count() == 0) { + col->set_null_mask(rmm::device_buffer{0, stream, mr}, 0); + } + + // For string columns return ["offsets", "char"] schema + if (target_type.id() == type_id::STRING) { + return {std::move(col), {{"offsets"}, {"chars"}}}; + } + // Non-string leaf-columns (e.g., numeric) do not have child columns in the schema + else { + return {std::move(col), {}}; + } break; } case json_col_t::StructColumn: { @@ -1450,10 +1658,12 @@ std::pair, std::vector> json_column_to std::vector column_names{}; size_type num_rows{json_col.current_offset}; // Create children columns - for (auto const& col : json_col.child_columns) { - column_names.emplace_back(col.first); - auto const& child_col = col.second; - auto [child_column, names] = json_column_to_cudf_column(child_col, d_input, stream, mr); + for (auto const& col_name : json_col.column_order) { + auto const& col = json_col.child_columns.find(col_name); + column_names.emplace_back(col->first); + auto const& child_col = col->second; + auto [child_column, names] = json_column_to_cudf_column( + child_col, d_input, options, get_child_schema(col_name), stream, mr); CUDF_EXPECTS(num_rows == child_column->size(), "All children columns must have the same size"); child_columns.push_back(std::move(child_column)); @@ -1478,7 +1688,12 @@ std::pair, std::vector> json_column_to std::make_unique(data_type{type_id::INT32}, num_rows, d_offsets.release()); // Create children column auto [child_column, names] = - json_column_to_cudf_column(json_col.child_columns.begin()->second, d_input, stream, mr); + json_column_to_cudf_column(json_col.child_columns.begin()->second, + d_input, + options, + get_child_schema(json_col.child_columns.begin()->first), + stream, + mr); column_names.back().children = names; auto [result_bitmask, null_count] = make_validity(json_col); return {make_lists_column(num_rows - 1, @@ -1502,6 +1717,9 @@ table_with_metadata parse_nested_json(host_span input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { + // Range of orchestrating/encapsulating function + CUDF_FUNC_RANGE(); + auto const new_line_delimited_json = options.is_enabled_lines(); // Allocate device memory for the JSON input & copy over to device @@ -1516,6 +1734,10 @@ table_with_metadata parse_nested_json(host_span input, constexpr uint32_t token_end_offset_zero = 0; constexpr uint32_t node_init_child_count_zero = 0; + // Whether the tokenizer stage should keep quote characters for string values + // If the tokenizer keeps the quote characters, they may be stripped during type casting + constexpr bool include_quote_chars = true; + // We initialize the very root node and root column, which represent the JSON document being 
// parsed. That root node is a list node and that root column is a list column. The column has the // root node as its only row. The values parsed from the JSON input will be treated as follows: @@ -1529,7 +1751,8 @@ table_with_metadata parse_nested_json(host_span input, // Push the root node onto the stack for the data path data_path.push({&root_column, row_offset_zero, nullptr, node_init_child_count_zero}); - make_json_column(root_column, data_path, input, d_input, options, stream, mr); + make_json_column( + root_column, data_path, input, d_input, options, include_quote_chars, stream, mr); // data_root refers to the root column of the data represented by the given JSON string auto const& data_root = @@ -1550,14 +1773,61 @@ table_with_metadata parse_nested_json(host_span input, std::vector out_column_names; // Iterate over the struct's child columns and convert to cudf column - for (auto const& [col_name, json_col] : root_struct_col.child_columns) { + size_type column_index = 0; + for (auto const& col_name : root_struct_col.column_order) { + auto const& json_col = root_struct_col.child_columns.find(col_name)->second; // Insert this columns name into the schema out_column_names.emplace_back(col_name); + std::optional child_schema_element = std::visit( + cudf::detail::visitor_overload{ + [column_index](const std::vector& user_dtypes) -> std::optional { + auto ret = (static_cast(column_index) < user_dtypes.size()) + ? std::optional{{user_dtypes[column_index]}} + : std::optional{}; +#ifdef NJP_DEBUG_PRINT + std::cout << "Column by index: #" << column_index << ", type id: " + << (ret.has_value() ? std::to_string(static_cast(ret->type.id())) : "n/a") + << ", with " << (ret.has_value() ? ret->child_types.size() : 0) << " children" + << "\n"; +#endif + return ret; + }, + [col_name]( + std::map const& user_dtypes) -> std::optional { + auto ret = (user_dtypes.find(col_name) != std::end(user_dtypes)) + ? std::optional{{user_dtypes.find(col_name)->second}} + : std::optional{}; +#ifdef NJP_DEBUG_PRINT + std::cout << "Column by flat name: '" << col_name << "', type id: " + << (ret.has_value() ? std::to_string(static_cast(ret->type.id())) : "n/a") + << ", with " << (ret.has_value() ? ret->child_types.size() : 0) << " children" + << "\n"; +#endif + return ret; + }, + [col_name](std::map const& user_dtypes) + -> std::optional { + auto ret = (user_dtypes.find(col_name) != std::end(user_dtypes)) + ? user_dtypes.find(col_name)->second + : std::optional{}; +#ifdef NJP_DEBUG_PRINT + std::cout << "Column by nested name: #" << col_name << ", type id: " + << (ret.has_value() ? std::to_string(static_cast(ret->type.id())) : "n/a") + << ", with " << (ret.has_value() ? 
ret->child_types.size() : 0) << " children" + << "\n"; +#endif + return ret; + }}, + options.get_dtypes()); + // Get this JSON column's cudf column and schema info - auto [cudf_col, col_name_info] = json_column_to_cudf_column(json_col, d_input, stream, mr); + auto [cudf_col, col_name_info] = + json_column_to_cudf_column(json_col, d_input, options, child_schema_element, stream, mr); out_column_names.back().children = std::move(col_name_info); out_columns.emplace_back(std::move(cudf_col)); + + column_index++; } return table_with_metadata{std::make_unique(std::move(out_columns)), diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index da6e7621449..48b2af81fcd 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -432,6 +432,18 @@ std::vector get_data_types(json_reader_options const& reader_opts, return it->second; }); return sorted_dtypes; + }, + [&](const std::map& dtypes) { + std::vector sorted_dtypes; + std::transform(std::cbegin(column_names), + std::cend(column_names), + std::back_inserter(sorted_dtypes), + [&](auto const& column_name) { + auto const it = dtypes.find(column_name); + CUDF_EXPECTS(it != dtypes.end(), "Must specify types for all columns"); + return it->second.type; + }); + return sorted_dtypes; }}, reader_opts.get_dtypes()); } else { diff --git a/cpp/src/io/orc/orc.cpp b/cpp/src/io/orc/orc.cpp index 3f63c8240ae..2f49416e681 100644 --- a/cpp/src/io/orc/orc.cpp +++ b/cpp/src/io/orc/orc.cpp @@ -184,7 +184,8 @@ void ProtobufReader::read(column_statistics& s, size_t maxlen) make_field_reader(6, s.decimal_stats), make_field_reader(7, s.date_stats), make_field_reader(8, s.binary_stats), - make_field_reader(9, s.timestamp_stats)); + make_field_reader(9, s.timestamp_stats), + make_field_reader(10, s.has_null)); function_builder(s, maxlen, op); } diff --git a/cpp/src/io/orc/orc.hpp b/cpp/src/io/orc/orc.hpp index a007750d264..2018024f566 100644 --- a/cpp/src/io/orc/orc.hpp +++ b/cpp/src/io/orc/orc.hpp @@ -122,7 +122,7 @@ struct column_statistics { std::optional date_stats; std::optional binary_stats; std::optional timestamp_stats; - // TODO: hasNull (issue #7087) + std::optional has_null; }; struct StripeStatistics { @@ -423,6 +423,12 @@ inline uint8_t ProtobufReader::get() return (m_cur < m_end) ? 
*m_cur++ : 0; };
+template <>
+inline bool ProtobufReader::get()
+{
+ return static_cast(get());
+};
+
template <> inline uint32_t ProtobufReader::get() { diff --git a/cpp/src/io/orc/stats_enc.cu b/cpp/src/io/orc/stats_enc.cu index fb1db395922..bbff689082e 100644 --- a/cpp/src/io/orc/stats_enc.cu +++ b/cpp/src/io/orc/stats_enc.cu @@ -281,7 +281,7 @@ __global__ void __launch_bounds__(encode_threads_per_block) // optional sint64 sum = 3; // sum will store the total length of all strings // } if (s->chunk.has_minmax && s->chunk.has_sum) {
- uint32_t sz = (pb_put_uint(cur, 3, s->chunk.sum.i_val) - cur) +
+ uint32_t sz = (pb_put_int(cur, 3, s->chunk.sum.i_val) - cur) +
(pb_put_uint(cur, 1, s->chunk.min_value.str_val.length) - cur) + (pb_put_uint(cur, 2, s->chunk.max_value.str_val.length) - cur) + s->chunk.min_value.str_val.length + s->chunk.max_value.str_val.length; @@ -291,7 +291,7 @@ __global__ void __launch_bounds__(encode_threads_per_block) cur, 1, s->chunk.min_value.str_val.ptr, s->chunk.min_value.str_val.length); cur = pb_put_binary( cur, 2, s->chunk.max_value.str_val.ptr, s->chunk.max_value.str_val.length);
- cur = pb_put_uint(cur, 3, s->chunk.sum.i_val);
+ cur = pb_put_int(cur, 3, s->chunk.sum.i_val);
} break; case dtype_bool: diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu index 4fa407f4e88..c9cc0f04b3c 100644 --- a/cpp/src/io/orc/stripe_data.cu +++ b/cpp/src/io/orc/stripe_data.cu @@ -1770,7 +1770,8 @@ __global__ void __launch_bounds__(block_size) // Adjust seconds only for negative timestamps with positive nanoseconds. // Alternative way to represent negative timestamps is with negative nanoseconds // in which case the adjustment is not needed.
- if (seconds < 0 && nanos > 0) { seconds -= 1; }
+ // Comparing with 999999 instead of zero to match the Apache writer.
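// A small worked example with assumed values: a timestamp of -2.5s relative to
// the ORC epoch arrives as seconds = -2 (truncated toward zero) and
// nanos = 500000000, so the reader must apply seconds -= 1 to reconstruct
// -3s + 0.5s = -2.5s. Values with nanos <= 999999 are presumably encoded by the
// Apache writer such that no correction applies, hence the threshold below.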
+ if (seconds < 0 and nanos > 999999) { seconds -= 1; } duration_ns d_ns{nanos}; duration_s d_s{seconds}; diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index 77984ee3c27..cdee066a06a 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -478,15 +478,16 @@ inline __device__ void PackLiteralsShuffle( constexpr uint32_t MASK2T = 1; // mask for 2 thread leader constexpr uint32_t MASK4T = 3; // mask for 4 thread leader constexpr uint32_t MASK8T = 7; // mask for 8 thread leader - uint64_t vt; + uint64_t v64; if (t > (count | 0x1f)) { return; } switch (w) { case 1: - v |= shuffle_xor(v, 1) << 1; - v |= shuffle_xor(v, 2) << 2; - v |= shuffle_xor(v, 4) << 4; + v |= shuffle_xor(v, 1) << 1; // grab bit 1 from neighbor + v |= shuffle_xor(v, 2) << 2; // grab bits 2-3 from 2 lanes over + v |= shuffle_xor(v, 4) << 4; // grab bits 4-7 from 4 lanes over + // sub-warp leader writes the combined bits if (t < count && !(t & MASK8T)) { dst[(t * w) >> 3] = v; } return; case 2: @@ -511,14 +512,13 @@ inline __device__ void PackLiteralsShuffle( case 5: v |= shuffle_xor(v, 1) << 5; v |= shuffle_xor(v, 2) << 10; - vt = shuffle_xor(v, 4); - vt = vt << 20 | v; + v64 = static_cast(shuffle_xor(v, 4)) << 20 | v; if (t < count && !(t & MASK8T)) { - dst[(t >> 3) * 5 + 0] = vt; - dst[(t >> 3) * 5 + 1] = vt >> 8; - dst[(t >> 3) * 5 + 2] = vt >> 16; - dst[(t >> 3) * 5 + 3] = vt >> 24; - dst[(t >> 3) * 5 + 4] = vt >> 32; + dst[(t >> 3) * 5 + 0] = v64; + dst[(t >> 3) * 5 + 1] = v64 >> 8; + dst[(t >> 3) * 5 + 2] = v64 >> 16; + dst[(t >> 3) * 5 + 3] = v64 >> 24; + dst[(t >> 3) * 5 + 4] = v64 >> 32; } return; case 6: @@ -535,14 +535,13 @@ inline __device__ void PackLiteralsShuffle( return; case 10: v |= shuffle_xor(v, 1) << 10; - vt = shuffle_xor(v, 2); - vt = vt << 20 | v; + v64 = static_cast(shuffle_xor(v, 2)) << 20 | v; if (t < count && !(t & MASK4T)) { - dst[(t >> 2) * 5 + 0] = vt; - dst[(t >> 2) * 5 + 1] = vt >> 8; - dst[(t >> 2) * 5 + 2] = vt >> 16; - dst[(t >> 2) * 5 + 3] = vt >> 24; - dst[(t >> 2) * 5 + 4] = vt >> 32; + dst[(t >> 2) * 5 + 0] = v64; + dst[(t >> 2) * 5 + 1] = v64 >> 8; + dst[(t >> 2) * 5 + 2] = v64 >> 16; + dst[(t >> 2) * 5 + 3] = v64 >> 24; + dst[(t >> 2) * 5 + 4] = v64 >> 32; } return; case 12: @@ -560,14 +559,13 @@ inline __device__ void PackLiteralsShuffle( } return; case 20: - vt = shuffle_xor(v, 1); - vt = vt << 20 | v; + v64 = static_cast(shuffle_xor(v, 1)) << 20 | v; if (t < count && !(t & MASK2T)) { - dst[(t >> 1) * 5 + 0] = vt; - dst[(t >> 1) * 5 + 1] = vt >> 8; - dst[(t >> 1) * 5 + 2] = vt >> 16; - dst[(t >> 1) * 5 + 3] = vt >> 24; - dst[(t >> 1) * 5 + 4] = vt >> 32; + dst[(t >> 1) * 5 + 0] = v64; + dst[(t >> 1) * 5 + 1] = v64 >> 8; + dst[(t >> 1) * 5 + 2] = v64 >> 16; + dst[(t >> 1) * 5 + 3] = v64 >> 24; + dst[(t >> 1) * 5 + 4] = v64 >> 32; } return; case 24: @@ -590,10 +588,14 @@ inline __device__ void PackLiteralsRoundRobin( { // Scratch space to temporarily write to. Needed because we will use atomics to write 32 bit // words but the destination mem may not be a multiple of 4 bytes. - // TODO (dm): This assumes blockdim = 128 and max bits per value = 16. Reduce magic numbers. - // To allow up to 24 bit this needs to be sized at 96 words. - __shared__ uint32_t scratch[64]; - if (t < 64) { scratch[t] = 0; } + // TODO (dm): This assumes blockdim = 128. Reduce magic numbers. 
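// Sizing sketch under the stated assumption (blockdim = 128) and with
// MAX_DICT_BITS = 24 from parquet_gpu.hpp: NUM_BYTES = 128 * 24 / 8 = 384,
// so SCRATCH_SIZE = 384 / sizeof(uint32_t) = 96 words, the "96 words" the
// removed TODO asked for; the old fixed 64-word array only covered bit
// widths up to 16.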
+ constexpr uint32_t NUM_THREADS = 128; // this needs to match gpuEncodePages block_size parameter + constexpr uint32_t NUM_BYTES = (NUM_THREADS * MAX_DICT_BITS) >> 3; + constexpr uint32_t SCRATCH_SIZE = NUM_BYTES / sizeof(uint32_t); + __shared__ uint32_t scratch[SCRATCH_SIZE]; + for (uint32_t i = t; i < SCRATCH_SIZE; i += NUM_THREADS) { + scratch[i] = 0; + } __syncthreads(); if (t <= count) { @@ -615,10 +617,9 @@ inline __device__ void PackLiteralsRoundRobin( auto available_bytes = (count * w + 7) / 8; auto scratch_bytes = reinterpret_cast(&scratch[0]); - if (t < available_bytes) { dst[t] = scratch_bytes[t]; } - if (t + 128 < available_bytes) { dst[t + 128] = scratch_bytes[t + 128]; } - // would need the following for up to 24 bits - // if (t + 256 < available_bytes) { dst[t + 256] = scratch_bytes[t + 256]; } + for (uint32_t i = t; i < available_bytes; i += NUM_THREADS) { + dst[i] = scratch_bytes[i]; + } __syncthreads(); } @@ -628,6 +629,7 @@ inline __device__ void PackLiteralsRoundRobin( inline __device__ void PackLiterals( uint8_t* dst, uint32_t v, uint32_t count, uint32_t w, uint32_t t) { + if (w > 24) { CUDF_UNREACHABLE("Unsupported bit width"); } switch (w) { case 1: case 2: @@ -644,11 +646,9 @@ inline __device__ void PackLiterals( // bit widths that lie on easy boundaries can be handled either directly // (8, 16, 24) or through fast shuffle operations. PackLiteralsShuffle(dst, v, count, w, t); - break; + return; default: - if (w > 16) { CUDF_UNREACHABLE("Unsupported bit width"); } - // less efficient bit packing that uses atomics, but can handle arbitrary - // bit widths up to 16. used for repetition and definition level encoding + // bit packing that uses atomics, but can handle arbitrary bit widths up to 24. PackLiteralsRoundRobin(dst, v, count, w, t); } } diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index d0d367df962..8f4cd5c6f3b 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -45,8 +45,11 @@ namespace parquet { using cudf::io::detail::string_index_pair; +// Largest number of bits to use for dictionary keys +constexpr int MAX_DICT_BITS = 24; + // Total number of unsigned 24 bit values -constexpr size_type MAX_DICT_SIZE = (1 << 24) - 1; +constexpr size_type MAX_DICT_SIZE = (1 << MAX_DICT_BITS) - 1; /** * @brief Struct representing an input column in the file. diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 2bfd7c1ba4d..9514b053451 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -1051,31 +1051,23 @@ auto build_chunk_dictionaries(hostdevice_2dvector& chunks, // Make decision about which chunks have dictionary for (auto& ck : h_chunks) { if (not ck.use_dictionary) { continue; } - std::tie(ck.use_dictionary, ck.dict_rle_bits) = [&]() { + std::tie(ck.use_dictionary, ck.dict_rle_bits) = [&]() -> std::pair { // calculate size of chunk if dictionary is used // If we have N unique values then the idx for the last value is N - 1 and nbits is the number // of bits required to encode indices into the dictionary auto max_dict_index = (ck.num_dict_entries > 0) ? 
ck.num_dict_entries - 1 : 0; - auto nbits = CompactProtocolReader::NumRequiredBits(max_dict_index); + auto nbits = std::max(CompactProtocolReader::NumRequiredBits(max_dict_index), 1); - // We don't use dictionary if the indices are > 24 bits because that's the maximum bitpacking - // bitsize we efficiently support - if (nbits > 24) { return std::pair(false, 0); } - - // Only these bit sizes are allowed for RLE encoding because it's compute optimized - constexpr auto allowed_bitsizes = - std::array{1, 2, 3, 4, 5, 6, 8, 10, 12, 16, 20, 24}; - - // ceil to (1/2/3/4/5/6/8/10/12/16/20/24) - auto rle_bits = *std::lower_bound(allowed_bitsizes.begin(), allowed_bitsizes.end(), nbits); - auto rle_byte_size = util::div_rounding_up_safe(ck.num_values * rle_bits, 8); + // We don't use dictionary if the indices are > MAX_DICT_BITS bits because that's the maximum + // bitpacking bitsize we efficiently support + if (nbits > MAX_DICT_BITS) { return {false, 0}; } + auto rle_byte_size = util::div_rounding_up_safe(ck.num_values * nbits, 8); auto dict_enc_size = ck.uniq_data_size + rle_byte_size; + if (ck.plain_data_size <= dict_enc_size) { return {false, 0}; } - bool use_dict = (ck.plain_data_size > dict_enc_size); - if (not use_dict) { rle_bits = 0; } - return std::pair(use_dict, rle_bits); + return {true, nbits}; }(); } diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index 5313d7a89ba..133c5fe9826 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -302,6 +302,73 @@ __global__ __launch_bounds__(THREADS_PER_TILE) void multibyte_split_kernel( } } +__global__ __launch_bounds__(THREADS_PER_TILE) void byte_split_kernel( + cudf::size_type base_tile_idx, + int64_t base_input_offset, + int64_t base_offset_offset, + cudf::io::text::detail::scan_tile_state_view tile_output_offsets, + char delim, + cudf::device_span chunk_input_chars, + int64_t byte_range_end, + cudf::split_device_span output_offsets) +{ + using InputLoad = + cub::BlockLoad; + using OffsetScan = cub::BlockScan; + using OffsetScanCallback = cudf::io::text::detail::scan_tile_state_callback; + + __shared__ union { + typename InputLoad::TempStorage input_load; + typename OffsetScan::TempStorage offset_scan; + } temp_storage; + + int32_t const tile_idx = base_tile_idx + blockIdx.x; + int32_t const tile_input_offset = blockIdx.x * ITEMS_PER_TILE; + int32_t const thread_input_offset = tile_input_offset + threadIdx.x * ITEMS_PER_THREAD; + int32_t const thread_input_size = chunk_input_chars.size() - thread_input_offset; + + // STEP 1: Load inputs + + char thread_chars[ITEMS_PER_THREAD]; + + InputLoad(temp_storage.input_load) + .Load(chunk_input_chars.data() + tile_input_offset, + thread_chars, + chunk_input_chars.size() - tile_input_offset); + + // STEP 2: Flag matches + + cutoff_offset thread_offset; + uint32_t thread_match_mask[(ITEMS_PER_THREAD + 31) / 32]{}; + + for (int32_t i = 0; i < ITEMS_PER_THREAD; i++) { + auto const is_match = i < thread_input_size and thread_chars[i] == delim; + auto const match_end = base_input_offset + thread_input_offset + i + 1; + auto const is_past_range = match_end >= byte_range_end; + thread_match_mask[i / 32] |= uint32_t{is_match} << (i % 32); + thread_offset = thread_offset + cutoff_offset{is_match, is_past_range}; + } + + // STEP 3: Scan flags to determine absolute thread output offset + + auto prefix_callback = OffsetScanCallback(tile_output_offsets, tile_idx); + + __syncthreads(); // required before temp_memory re-use + 
OffsetScan(temp_storage.offset_scan).ExclusiveSum(thread_offset, thread_offset, prefix_callback); + + // Step 4: Assign outputs from each thread using match offsets. + + for (int32_t i = 0; i < ITEMS_PER_THREAD; i++) { + auto const is_match = (thread_match_mask[i / 32] >> (i % 32)) & 1u; + if (is_match && !thread_offset.is_past_end()) { + auto const match_end = base_input_offset + thread_input_offset + i + 1; + auto const is_past_range = match_end >= byte_range_end; + output_offsets[thread_offset.offset() - base_offset_offset] = match_end; + thread_offset = thread_offset + cutoff_offset{true, is_past_range}; + } + } +} + } // namespace namespace cudf { @@ -615,19 +682,35 @@ std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source cudaStreamWaitEvent(scan_stream.value(), last_launch_event); - multibyte_split_kernel<<>>( // - base_tile_idx, - chunk_offset, - offset_storage.size(), - tile_multistates, - tile_offsets, - {device_delim.data(), static_cast(device_delim.size())}, - *chunk, - byte_range_end, - offset_output); + if (delimiter.size() == 1) { + // the single-byte case allows for a much more efficient kernel, so we special-case it + byte_split_kernel<<>>( // + base_tile_idx, + chunk_offset, + offset_storage.size(), + tile_offsets, + delimiter[0], + *chunk, + byte_range_end, + offset_output); + } else { + multibyte_split_kernel<<>>( // + base_tile_idx, + chunk_offset, + offset_storage.size(), + tile_multistates, + tile_offsets, + {device_delim.data(), static_cast(device_delim.size())}, + *chunk, + byte_range_end, + offset_output); + } // load the next chunk auto next_chunk = reader->get_next_chunk(ITEMS_PER_CHUNK, read_stream); diff --git a/cpp/src/io/utilities/column_type_histogram.hpp b/cpp/src/io/utilities/column_type_histogram.hpp index 99762595693..8bd2d3a89cf 100644 --- a/cpp/src/io/utilities/column_type_histogram.hpp +++ b/cpp/src/io/utilities/column_type_histogram.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -25,14 +25,14 @@ namespace io { * @brief Per-column histogram struct containing detected occurrences of each dtype */ struct column_type_histogram { - cudf::size_type null_count; - cudf::size_type float_count; - cudf::size_type datetime_count; - cudf::size_type string_count; - cudf::size_type negative_small_int_count; - cudf::size_type positive_small_int_count; - cudf::size_type big_int_count; - cudf::size_type bool_count; + cudf::size_type null_count{}; + cudf::size_type float_count{}; + cudf::size_type datetime_count{}; + cudf::size_type string_count{}; + cudf::size_type negative_small_int_count{}; + cudf::size_type positive_small_int_count{}; + cudf::size_type big_int_count{}; + cudf::size_type bool_count{}; }; } // namespace io diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh index a3699acb934..388c9b28001 100644 --- a/cpp/src/io/utilities/parsing_utils.cuh +++ b/cpp/src/io/utilities/parsing_utils.cuh @@ -42,6 +42,16 @@ using cudf::device_span; namespace cudf { namespace io { +/** + * @brief Non-owning view for json type inference options + */ +struct json_inference_options_view { + char quote_char; + cudf::detail::trie_view trie_true; + cudf::detail::trie_view trie_false; + cudf::detail::trie_view trie_na; +}; + /** * @brief Structure for holding various options used when parsing and * converting CSV/json data to cuDF data type values. @@ -79,6 +89,14 @@ struct parse_options { cudf::detail::optional_trie trie_na; bool multi_delimiter; + [[nodiscard]] json_inference_options_view json_view() const + { + return {quotechar, + cudf::detail::make_trie_view(trie_true), + cudf::detail::make_trie_view(trie_false), + cudf::detail::make_trie_view(trie_na)}; + } + [[nodiscard]] parse_options_view view() const { return {delimiter, @@ -545,12 +563,22 @@ __inline__ __device__ T decode_value(char const* begin, char const* end, parse_options_view const& opts) { + // If this is a string value, remove quotes + if ((thrust::distance(begin, end) >= 2 && *begin == '\"' && *thrust::prev(end) == '\"')) { + thrust::advance(begin, 1); + thrust::advance(end, -1); + } return to_timestamp(begin, end, opts.dayfirst); } template ())> __inline__ __device__ T decode_value(char const* begin, char const* end, parse_options_view const&) { + // If this is a string value, remove quotes + if ((thrust::distance(begin, end) >= 2 && *begin == '\"' && *thrust::prev(end) == '\"')) { + thrust::advance(begin, 1); + thrust::advance(end, -1); + } return to_duration(begin, end); } diff --git a/cpp/src/io/utilities/type_inference.cuh b/cpp/src/io/utilities/type_inference.cuh new file mode 100644 index 00000000000..578c72fc316 --- /dev/null +++ b/cpp/src/io/utilities/type_inference.cuh @@ -0,0 +1,315 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include +#include +#include + +#include +#include +#include + +#include +#include + +#include +#include + +#include + +#include + +namespace cudf::io::detail { +/** + * @brief Custom column_type_histogram sum reduction callable + */ +struct custom_sum { + __device__ inline cudf::io::column_type_histogram operator()( + cudf::io::column_type_histogram const& lhs, cudf::io::column_type_histogram const& rhs) + { + return {lhs.null_count + rhs.null_count, + lhs.float_count + rhs.float_count, + lhs.datetime_count + rhs.datetime_count, + lhs.string_count + rhs.string_count, + lhs.negative_small_int_count + rhs.negative_small_int_count, + lhs.positive_small_int_count + rhs.positive_small_int_count, + lhs.big_int_count + rhs.big_int_count, + lhs.bool_count + rhs.bool_count}; + } +}; + +/** + * @brief Returns true if the input character is a valid digit. + * Supports both decimal and hexadecimal digits (uppercase and lowercase). + * + * @param c Character to check + * @param is_hex Whether to check as a hexadecimal + * + * @return `true` if it is digit-like, `false` otherwise + */ +__device__ __inline__ bool is_digit(char const c, bool const is_hex = false) +{ + if (c >= '0' && c <= '9') return true; + + if (is_hex) { + if (c >= 'A' && c <= 'F') return true; + if (c >= 'a' && c <= 'f') return true; + } + + return false; +} + +/** + * @brief Returns true if the counters indicate a potentially valid float. + * False positives are possible because positions are not taken into account. + * For example, field "e.123-" would match the pattern. + */ +__device__ __inline__ bool is_like_float(std::size_t len, + uint32_t digit_cnt, + uint32_t decimal_cnt, + uint32_t dash_cnt, + uint32_t exponent_cnt) +{ + // Can't have more than one exponent and one decimal point + if (decimal_cnt > 1) return false; + if (exponent_cnt > 1) return false; + // Without the exponent or a decimal point, this is an integer, not a float + if (decimal_cnt == 0 && exponent_cnt == 0) return false; + + // Can only have one '-' per component + if (dash_cnt > 1 + exponent_cnt) return false; + + // If anything other than these characters is present, it's not a float + if (digit_cnt + decimal_cnt + dash_cnt + exponent_cnt != len) return false; + + // Needs at least 1 digit, 2 if exponent is present + if (digit_cnt < 1 + exponent_cnt) return false; + + return true; +} + +/** + * @brief Constructs column type histogram for a given column string input `data`. + * + * @tparam BlockSize Number of threads in each block + * @tparam OptionsView Type of inference options view + * @tparam ColumnStringIter Iterator type whose `value_type` is a + * `thrust::tuple`, where `offset_t` and `length_t` are of integral type and + * `offset_t` needs to be convertible to `std::size_t`. 
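 * For instance (an illustrative pairing, mirroring how the JSON reader builds
 * its inference input),
 *   thrust::make_zip_iterator(d_string_offsets.begin(), d_string_lengths.begin())
 * over device vectors of offsets and lengths yields such tuples.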
+ *
+ * @param[in] options View of inference options
+ * @param[in] data JSON string input
+ * @param[in] column_strings_begin The beginning of an offset-length tuple sequence
+ * @param[in] size Size of the string input
+ * @param[out] column_info Histogram of column type counters
+ */
+template
+__global__ void infer_column_type_kernel(OptionsView options,
+ device_span data,
+ ColumnStringIter column_strings_begin,
+ std::size_t size,
+ cudf::io::column_type_histogram* column_info)
+{
+ auto thread_type_histogram = cudf::io::column_type_histogram{};
+
+ for (auto idx = threadIdx.x + blockDim.x * blockIdx.x; idx < size;
+ idx += gridDim.x * blockDim.x) {
+ auto const field_offset = thrust::get<0>(*(column_strings_begin + idx));
+ auto const field_len = thrust::get<1>(*(column_strings_begin + idx));
+ auto const field_begin = data.begin() + field_offset;
+
+ if (cudf::detail::serialized_trie_contains(
+ options.trie_na, {field_begin, static_cast(field_len)})) {
+ ++thread_type_histogram.null_count;
+ continue;
+ }
+
+ // Handling strings
+ if (field_len >= 2 and *field_begin == options.quote_char and
+ field_begin[field_len - 1] == options.quote_char) {
+ ++thread_type_histogram.string_count;
+ continue;
+ }
+
+ uint32_t digit_count = 0;
+ uint32_t decimal_count = 0;
+ uint32_t slash_count = 0;
+ uint32_t dash_count = 0;
+ uint32_t plus_count = 0;
+ uint32_t colon_count = 0;
+ uint32_t exponent_count = 0;
+ uint32_t other_count = 0;
+
+ auto const maybe_hex =
+ (field_len > 2 && field_begin[0] == '0' && field_begin[1] == 'x') ||
+ (field_len > 3 && field_begin[0] == '-' && field_begin[1] == '0' && field_begin[2] == 'x');
+ auto const field_end = field_begin + field_len;
+
+ for (auto pos = field_begin; pos < field_end; ++pos) {
+ if (is_digit(*pos, maybe_hex)) {
+ digit_count++;
+ continue;
+ }
+ // Looking for unique characters that will help identify column types
+ switch (*pos) {
+ case '.': decimal_count++; break;
+ case '-': dash_count++; break;
+ case '+': plus_count++; break;
+ case '/': slash_count++; break;
+ case ':': colon_count++; break;
+ case 'e':
+ case 'E':
+ if (!maybe_hex && pos > field_begin && pos < field_end - 1) exponent_count++;
+ break;
+ default: other_count++; break;
+ }
+ }
+
+ // All characters must be digits in an integer, except for the starting sign and 'x' in the
+ // hexadecimal prefix
+ auto const int_req_number_cnt =
+ static_cast(field_len) -
+ ((*field_begin == '-' || *field_begin == '+') && field_len > 1) - maybe_hex;
+ if (cudf::detail::serialized_trie_contains(
+ options.trie_true, {field_begin, static_cast(field_len)}) ||
+ cudf::detail::serialized_trie_contains(
+ options.trie_false, {field_begin, static_cast(field_len)})) {
+ ++thread_type_histogram.bool_count;
+ } else if (digit_count == int_req_number_cnt) {
+ auto const is_negative = (*field_begin == '-');
+ char const* data_begin = field_begin + (is_negative || (*field_begin == '+'));
+ cudf::size_type* ptr = cudf::io::gpu::infer_integral_field_counter(
+ data_begin, data_begin + digit_count, is_negative, thread_type_histogram);
+ ++*ptr;
+ } else if (is_like_float(
+ field_len, digit_count, decimal_count, dash_count + plus_count, exponent_count)) {
+ ++thread_type_histogram.float_count;
+ }
+ // All invalid JSON values are treated as string
+ else {
+ ++thread_type_histogram.string_count;
+ }
+ } // grid-stride for loop
+
+ using BlockReduce = cub::BlockReduce;
+ __shared__ typename BlockReduce::TempStorage temp_storage;
+ auto const block_type_histogram =
+
BlockReduce(temp_storage).Reduce(thread_type_histogram, custom_sum{});
+ if (threadIdx.x == 0) {
+ atomicAdd(&column_info->null_count, block_type_histogram.null_count);
+ atomicAdd(&column_info->float_count, block_type_histogram.float_count);
+ atomicAdd(&column_info->datetime_count, block_type_histogram.datetime_count);
+ atomicAdd(&column_info->string_count, block_type_histogram.string_count);
+ atomicAdd(&column_info->negative_small_int_count,
+ block_type_histogram.negative_small_int_count);
+ atomicAdd(&column_info->positive_small_int_count,
+ block_type_histogram.positive_small_int_count);
+ atomicAdd(&column_info->big_int_count, block_type_histogram.big_int_count);
+ atomicAdd(&column_info->bool_count, block_type_histogram.bool_count);
+ }
+}
+
+/**
+ * @brief Constructs column type histogram for a given column string input `data`.
+ *
+ * @tparam OptionsView Type of inference options view
+ * @tparam ColumnStringIter Iterator type whose `value_type` is a
+ * `thrust::tuple`, where `offset_t` and `length_t` are of integral type and
+ * `offset_t` needs to be convertible to `std::size_t`.
+ *
+ * @param options View of inference options
+ * @param data JSON string input
+ * @param column_strings_begin The beginning of an offset-length tuple sequence
+ * @param size Size of the string input
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @return A histogram containing column-specific type counters
+ */
+template
+cudf::io::column_type_histogram infer_column_type(OptionsView const& options,
+ cudf::device_span data,
+ ColumnStringIter column_strings_begin,
+ std::size_t const size,
+ rmm::cuda_stream_view stream)
+{
+ constexpr int block_size = 128;
+
+ auto const grid_size = (size + block_size - 1) / block_size;
+ auto d_column_info = rmm::device_scalar(stream);
+ CUDF_CUDA_TRY(cudaMemsetAsync(
+ d_column_info.data(), 0, sizeof(cudf::io::column_type_histogram), stream.value()));
+
+ infer_column_type_kernel<<>>(
+ options, data, column_strings_begin, size, d_column_info.data());
+
+ return d_column_info.value(stream);
+}
+
+/**
+ * @brief Infers data type for a given JSON string input `data`.
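 * The mapping from the aggregated histogram to a type is, roughly (see
 * get_type_id below): an all-null column becomes INT8, any quoted field forces
 * STRING, floats or integers mixed with nulls become FLOAT64, remaining
 * integers become INT64 or UINT64 depending on magnitude and sign, and pure
 * true/false fields become BOOL8.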
+ *
+ * @throw cudf::logic_error if input size is 0
+ * @throw cudf::logic_error if date time is not inferred as string
+ * @throw cudf::logic_error if data type inference failed
+ *
+ * @tparam OptionsView Type of inference options view
+ * @tparam ColumnStringIter Iterator type whose `value_type` is convertible to
+ * `thrust::tuple`
+ *
+ * @param options View of inference options
+ * @param data JSON string input
+ * @param column_strings_begin The beginning of an offset-length tuple sequence
+ * @param size Size of the string input
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @return The inferred data type
+ */
+template
+cudf::data_type infer_data_type(OptionsView const& options,
+ device_span data,
+ ColumnStringIter column_strings_begin,
+ std::size_t const size,
+ rmm::cuda_stream_view stream)
+{
+ CUDF_EXPECTS(size != 0, "No data available for data type inference.\n");
+
+ auto const h_column_info = infer_column_type(options, data, column_strings_begin, size, stream);
+
+ auto get_type_id = [&](auto const& cinfo) {
+ auto int_count_total =
+ cinfo.big_int_count + cinfo.negative_small_int_count + cinfo.positive_small_int_count;
+ if (cinfo.null_count == static_cast(size)) {
+ // Entire column is NULL; allocate the smallest amount of memory
+ return type_id::INT8;
+ } else if (cinfo.string_count > 0) {
+ return type_id::STRING;
+ } else if (cinfo.datetime_count > 0) {
+ CUDF_FAIL("Date time is inferred as string.\n");
+ } else if (cinfo.float_count > 0 || (int_count_total > 0 && cinfo.null_count > 0)) {
+ return type_id::FLOAT64;
+ } else if (cinfo.big_int_count == 0 && int_count_total != 0) {
+ return type_id::INT64;
+ } else if (cinfo.big_int_count != 0 && cinfo.negative_small_int_count != 0) {
+ return type_id::STRING;
+ } else if (cinfo.big_int_count != 0) {
+ return type_id::UINT64;
+ } else if (cinfo.bool_count > 0) {
+ return type_id::BOOL8;
+ }
+ CUDF_FAIL("Data type inference failed.\n");
+ };
+ return cudf::data_type{get_type_id(h_column_info)};
+}
+} // namespace cudf::io::detail
diff --git a/cpp/src/lists/segmented_sort.cu b/cpp/src/lists/segmented_sort.cu index 04456d970bf..ea35977e8e4 100644 --- a/cpp/src/lists/segmented_sort.cu +++ b/cpp/src/lists/segmented_sort.cu @@ -46,6 +46,17 @@ namespace lists { namespace detail { struct SegmentedSortColumn {
+ /**
+ * @brief Compile-time check for allowing radix sort for column type.
+ *
+ * Floating point is not included here because of the special handling of NaNs.
+ */
+ template
+ static constexpr bool is_radix_sort_supported()
+ {
+ return std::is_integral();
+ }
+
template void SortPairsAscending(KeyT const* keys_in, KeyT* keys_out, @@ -133,7 +144,7 @@ struct SegmentedSortColumn { } template
- std::enable_if_t(), std::unique_ptr> operator()(
+ std::enable_if_t(), std::unique_ptr> operator()(
column_view const& child, column_view const& segment_offsets, order column_order, @@ -152,7 +163,7 @@ struct SegmentedSortColumn { } template
- std::enable_if_t(), std::unique_ptr> operator()(
+ std::enable_if_t(), std::unique_ptr> operator()(
column_view const& child, column_view const& offsets, order column_order, @@ -253,14 +264,14 @@ std::unique_ptr sort_lists(lists_column_view const& input, }); // for numeric columns, calls Faster segmented radix sort path // for non-numeric columns, calls segmented_sort_by_key.
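// Note: radix sort orders raw bit patterns, so floating-point NaNs (which
// compare unordered) would not land where the comparison-based
// segmented_sort_by_key places them; is_radix_sort_supported() above therefore
// admits only integral child types.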
- auto output_child = type_dispatcher(input.child().type(), - SegmentedSortColumn{}, - input.get_sliced_child(stream), - output_offset->view(), - column_order, - null_precedence, - stream, - mr); + auto output_child = type_dispatcher(input.child().type(), + SegmentedSortColumn{}, + input.get_sliced_child(stream), + output_offset->view(), + column_order, + null_precedence, + stream, + mr); auto null_mask = cudf::detail::copy_bitmask(input.parent(), stream, mr); diff --git a/cpp/src/strings/contains.cu b/cpp/src/strings/contains.cu index 431e9ae26e3..b7d154c4808 100644 --- a/cpp/src/strings/contains.cu +++ b/cpp/src/strings/contains.cu @@ -71,7 +71,7 @@ std::unique_ptr contains_impl(strings_column_view const& input, mr); if (input.is_empty()) { return results; } - auto d_prog = reprog_device::create(pattern, flags, stream); + auto d_prog = reprog_device::create(pattern, flags, capture_groups::NON_CAPTURE, stream); auto d_results = results->mutable_view().data(); auto const d_strings = column_device_view::create(input.parent(), stream); @@ -114,7 +114,7 @@ std::unique_ptr count_re( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { // compile regex into device object - auto d_prog = reprog_device::create(pattern, flags, stream); + auto d_prog = reprog_device::create(pattern, flags, capture_groups::NON_CAPTURE, stream); auto const d_strings = column_device_view::create(input.parent(), stream); diff --git a/cpp/src/strings/extract/extract.cu b/cpp/src/strings/extract/extract.cu index 3889f3e2833..76d2f84b1a0 100644 --- a/cpp/src/strings/extract/extract.cu +++ b/cpp/src/strings/extract/extract.cu @@ -92,7 +92,7 @@ std::unique_ptr
extract(strings_column_view const& input, rmm::mr::device_memory_resource* mr) { // compile regex into device object - auto d_prog = reprog_device::create(pattern, flags, stream); + auto d_prog = reprog_device::create(pattern, flags, capture_groups::EXTRACT, stream); auto const groups = d_prog->group_counts(); CUDF_EXPECTS(groups > 0, "Group indicators not found in regex pattern"); diff --git a/cpp/src/strings/extract/extract_all.cu b/cpp/src/strings/extract/extract_all.cu index b3c251ca5b7..76c2788c1be 100644 --- a/cpp/src/strings/extract/extract_all.cu +++ b/cpp/src/strings/extract/extract_all.cu @@ -106,7 +106,7 @@ std::unique_ptr extract_all_record( auto const d_strings = column_device_view::create(input.parent(), stream); // Compile regex into device object. - auto d_prog = reprog_device::create(pattern, flags, stream); + auto d_prog = reprog_device::create(pattern, flags, capture_groups::EXTRACT, stream); // The extract pattern should always include groups. auto const groups = d_prog->group_counts(); CUDF_EXPECTS(groups > 0, "extract_all requires group indicators in the regex pattern."); diff --git a/cpp/src/strings/regex/regcomp.cpp b/cpp/src/strings/regex/regcomp.cpp index 9b1013bae09..5b86aedc409 100644 --- a/cpp/src/strings/regex/regcomp.cpp +++ b/cpp/src/strings/regex/regcomp.cpp @@ -36,7 +36,7 @@ namespace detail { namespace { // Bitmask of all operators #define OPERATOR_MASK 0200 -enum OperatorType { +enum OperatorType : int32_t { START = 0200, // Start, used for marker on stack LBRA_NC = 0203, // non-capturing group CAT = 0205, // Concatentation, implicit operator @@ -176,6 +176,7 @@ class regex_parser { char32_t const* _expr_ptr; bool _lex_done{false}; regex_flags const _flags; + capture_groups const _capture; int32_t _id_cclass_w{-1}; // alphanumeric [a-zA-Z0-9_] int32_t _id_cclass_W{-1}; // not alphanumeric plus '\n' @@ -528,7 +529,8 @@ class regex_parser { _expr_ptr += 2; return LBRA_NC; } - return LBRA; + return (_capture == capture_groups::NON_CAPTURE) ? static_cast(LBRA_NC) + : static_cast(LBRA); case ')': return RBRA; case '^': { _chr = chr; @@ -752,8 +754,11 @@ class regex_parser { } public: - regex_parser(const char32_t* pattern, regex_flags const flags, reprog& prog) - : _prog(prog), _pattern_begin(pattern), _expr_ptr(pattern), _flags(flags) + regex_parser(const char32_t* pattern, + regex_flags const flags, + capture_groups const capture, + reprog& prog) + : _prog(prog), _pattern_begin(pattern), _expr_ptr(pattern), _flags(flags), _capture(capture) { auto const dot_type = is_dotall(_flags) ? 
ANYNL : ANY; @@ -956,11 +961,14 @@ class regex_compiler { } public: - regex_compiler(const char32_t* pattern, regex_flags const flags, reprog& prog) + regex_compiler(const char32_t* pattern, + regex_flags const flags, + capture_groups const capture, + reprog& prog) : _prog(prog), _last_was_and(false), _bracket_count(0), _flags(flags) { // Parse pattern into items - auto const items = regex_parser(pattern, _flags, _prog).get_items(); + auto const items = regex_parser(pattern, _flags, capture, _prog).get_items(); int cur_subid{}; int push_subid{}; @@ -995,28 +1003,29 @@ class regex_compiler { CUDF_EXPECTS(_bracket_count == 0, "unmatched left parenthesis"); _prog.set_start_inst(_and_stack.top().id_first); - _prog.finalize(); + _prog.optimize(); _prog.check_for_errors(); + _prog.finalize(); _prog.set_groups_count(cur_subid); } }; // Convert pattern into program -reprog reprog::create_from(std::string_view pattern, regex_flags const flags) +reprog reprog::create_from(std::string_view pattern, + regex_flags const flags, + capture_groups const capture) { reprog rtn; auto pattern32 = string_to_char32_vector(pattern); - regex_compiler compiler(pattern32.data(), flags, rtn); + regex_compiler compiler(pattern32.data(), flags, capture, rtn); // for debugging, it can be helpful to call rtn.print(flags) here to dump // out the instructions that have been created from the given pattern return rtn; } -void reprog::finalize() -{ - collapse_nops(); - build_start_ids(); -} +void reprog::optimize() { collapse_nops(); } + +void reprog::finalize() { build_start_ids(); } void reprog::collapse_nops() { diff --git a/cpp/src/strings/regex/regcomp.h b/cpp/src/strings/regex/regcomp.h index 10092137c77..7ad7f481436 100644 --- a/cpp/src/strings/regex/regcomp.h +++ b/cpp/src/strings/regex/regcomp.h @@ -31,7 +31,7 @@ namespace detail { * 03xx are tokens, i.e. operands for operators * ``` */ -enum InstType { +enum InstType : int32_t { CHAR = 0177, // Literal character RBRA = 0201, // Right bracket, ) LBRA = 0202, // Left bracket, ( @@ -109,9 +109,12 @@ class reprog { * * @param pattern Regex pattern encoded as UTF-8 * @param flags For interpreting certain `pattern` characters + * @param capture For controlling how capture groups are processed * @return Instance of reprog */ - static reprog create_from(std::string_view pattern, regex_flags const flags); + static reprog create_from(std::string_view pattern, + regex_flags const flags, + capture_groups const capture = capture_groups::EXTRACT); int32_t add_inst(int32_t type); int32_t add_inst(reinst const& inst); @@ -134,6 +137,7 @@ class reprog { void set_start_inst(int32_t id); [[nodiscard]] int32_t get_start_inst() const; + void optimize(); void finalize(); void check_for_errors(); #ifndef NDEBUG diff --git a/cpp/src/strings/regex/regex.cuh b/cpp/src/strings/regex/regex.cuh index 11cc1a493a0..98631680800 100644 --- a/cpp/src/strings/regex/regex.cuh +++ b/cpp/src/strings/regex/regex.cuh @@ -91,15 +91,19 @@ class reprog_device { std::string_view pattern, rmm::cuda_stream_view stream); /** - * @brief Create the device program instance from a regex pattern. + * @brief Create the device program instance from a regex pattern * - * @param pattern The regex pattern to compile. - * @param re_flags Regex flags for interpreting special characters in the pattern. 
+ * @param pattern The regex pattern to compile + * @param re_flags Regex flags for interpreting special characters in the pattern + * @param capture Control how capture groups are processed * @param stream CUDA stream used for device memory operations and kernel launches - * @return The program device object. + * @return The program device object */ static std::unique_ptr> create( - std::string_view pattern, regex_flags const re_flags, rmm::cuda_stream_view stream); + std::string_view pattern, + regex_flags const re_flags, + capture_groups const capture, + rmm::cuda_stream_view stream); /** * @brief Called automatically by the unique_ptr returned from create(). diff --git a/cpp/src/strings/regex/regexec.cu b/cpp/src/strings/regex/regexec.cu index 5b97271cb21..03247d24ba3 100644 --- a/cpp/src/strings/regex/regexec.cu +++ b/cpp/src/strings/regex/regexec.cu @@ -47,15 +47,19 @@ reprog_device::reprog_device(reprog& prog) std::unique_ptr> reprog_device::create( std::string_view pattern, rmm::cuda_stream_view stream) { - return reprog_device::create(pattern, regex_flags::MULTILINE, stream); + return reprog_device::create( + pattern, regex_flags::MULTILINE, capture_groups::NON_CAPTURE, stream); } // Create instance of the reprog that can be passed into a device kernel std::unique_ptr> reprog_device::create( - std::string_view pattern, regex_flags const flags, rmm::cuda_stream_view stream) + std::string_view pattern, + regex_flags const flags, + capture_groups const capture, + rmm::cuda_stream_view stream) { // compile pattern into host object - reprog h_prog = reprog::create_from(pattern, flags); + reprog h_prog = reprog::create_from(pattern, flags, capture); // compute size to hold all the member data auto const insts_count = h_prog.insts_count(); diff --git a/cpp/src/strings/replace/backref_re.cu b/cpp/src/strings/replace/backref_re.cu index ab391c29ab8..e0a995c26b9 100644 --- a/cpp/src/strings/replace/backref_re.cu +++ b/cpp/src/strings/replace/backref_re.cu @@ -114,7 +114,7 @@ std::unique_ptr replace_with_backrefs(strings_column_view const& input, CUDF_EXPECTS(!replacement.empty(), "Parameter replacement must not be empty"); // compile regex into device object - auto d_prog = reprog_device::create(pattern, flags, stream); + auto d_prog = reprog_device::create(pattern, flags, capture_groups::EXTRACT, stream); // parse the repl string for back-ref indicators auto group_count = std::min(99, d_prog->group_counts()); // group count should NOT exceed 99 diff --git a/cpp/src/strings/replace/multi_re.cu b/cpp/src/strings/replace/multi_re.cu index 149ccb6167d..a5b9ad37e65 100644 --- a/cpp/src/strings/replace/multi_re.cu +++ b/cpp/src/strings/replace/multi_re.cu @@ -145,7 +145,7 @@ std::unique_ptr replace_re( patterns.size()); std::transform( patterns.begin(), patterns.end(), h_progs.begin(), [flags, stream](auto const& ptn) { - return reprog_device::create(ptn, flags, stream); + return reprog_device::create(ptn, flags, capture_groups::NON_CAPTURE, stream); }); // get the longest regex for the dispatcher diff --git a/cpp/src/strings/replace/replace_re.cu b/cpp/src/strings/replace/replace_re.cu index 131fa8decba..fd0049d7c89 100644 --- a/cpp/src/strings/replace/replace_re.cu +++ b/cpp/src/strings/replace/replace_re.cu @@ -115,7 +115,7 @@ std::unique_ptr replace_re( string_view d_repl(replacement.data(), replacement.size()); // compile regex into device object - auto d_prog = reprog_device::create(pattern, flags, stream); + auto d_prog = reprog_device::create(pattern, flags, capture_groups::NON_CAPTURE, 
stream); auto const maxrepl = max_replace_count.value_or(-1); diff --git a/cpp/src/strings/search/findall.cu b/cpp/src/strings/search/findall.cu index ab89936f541..73470bde867 100644 --- a/cpp/src/strings/search/findall.cu +++ b/cpp/src/strings/search/findall.cu @@ -103,7 +103,7 @@ std::unique_ptr findall( auto const d_strings = column_device_view::create(input.parent(), stream); // compile regex into device object - auto const d_prog = reprog_device::create(pattern, flags, stream); + auto const d_prog = reprog_device::create(pattern, flags, capture_groups::NON_CAPTURE, stream); // Create lists offsets column auto offsets = count_matches(*d_strings, *d_prog, strings_count + 1, stream, mr); diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 3710bc6cdfa..e630e842f4e 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -226,12 +226,13 @@ ConfigureTest(ORC_TEST io/orc_test.cpp) ConfigureTest(PARQUET_TEST io/parquet_test.cpp) ConfigureTest(JSON_TEST io/json_test.cpp) ConfigureTest(JSON_TYPE_CAST_TEST io/json_type_cast_test.cu) -ConfigureTest(NESTED_JSON_TEST io/nested_json_test.cpp) +ConfigureTest(NESTED_JSON_TEST io/nested_json_test.cpp io/json_tree.cpp) ConfigureTest(ARROW_IO_SOURCE_TEST io/arrow_io_source_test.cpp) ConfigureTest(MULTIBYTE_SPLIT_TEST io/text/multibyte_split_test.cpp) ConfigureTest(DATA_CHUNK_SOURCE_TEST io/text/data_chunk_source_test.cpp) ConfigureTest(LOGICAL_STACK_TEST io/fst/logical_stack_test.cu) ConfigureTest(FST_TEST io/fst/fst_test.cu) +ConfigureTest(TYPE_INFERENCE_TEST io/type_inference_test.cu) if(CUDF_ENABLE_ARROW_S3) target_compile_definitions(ARROW_IO_SOURCE_TEST PRIVATE "S3_ENABLED") endif() diff --git a/cpp/tests/column/column_test.cu b/cpp/tests/column/column_test.cu index 801cee285b6..51f37ecac6c 100644 --- a/cpp/tests/column/column_test.cu +++ b/cpp/tests/column/column_test.cu @@ -31,6 +31,8 @@ #include #include +#include + #include #include @@ -46,8 +48,10 @@ struct TypedColumnTest : public cudf::test::BaseFixture { { auto typed_data = static_cast(data.data()); auto typed_mask = static_cast(mask.data()); - thrust::sequence(thrust::device, typed_data, typed_data + data.size()); - thrust::sequence(thrust::device, typed_mask, typed_mask + mask.size()); + thrust::sequence( + rmm::exec_policy(cudf::default_stream_value), typed_data, typed_data + data.size()); + thrust::sequence( + rmm::exec_policy(cudf::default_stream_value), typed_mask, typed_mask + mask.size()); } cudf::size_type num_elements() { return _num_elements; } @@ -349,7 +353,7 @@ TYPED_TEST(TypedColumnTest, DeviceUvectorConstructorNoMask) { rmm::device_uvector original{static_cast(this->num_elements()), cudf::default_stream_value}; - thrust::copy(thrust::device, + thrust::copy(rmm::exec_policy(cudf::default_stream_value), static_cast(this->data.data()), static_cast(this->data.data()) + this->num_elements(), original.begin()); @@ -366,7 +370,7 @@ TYPED_TEST(TypedColumnTest, DeviceUvectorConstructorWithMask) { rmm::device_uvector original{static_cast(this->num_elements()), cudf::default_stream_value}; - thrust::copy(thrust::device, + thrust::copy(rmm::exec_policy(cudf::default_stream_value), static_cast(this->data.data()), static_cast(this->data.data()) + this->num_elements(), original.begin()); diff --git a/cpp/tests/column/compound_test.cu b/cpp/tests/column/compound_test.cu index a6d15b50150..58be2b2f316 100644 --- a/cpp/tests/column/compound_test.cu +++ b/cpp/tests/column/compound_test.cu @@ -66,7 +66,7 @@ struct checker_for_level2 { TEST_F(CompoundColumnTest, 
ChildrenLevel1) { rmm::device_uvector data(1000, cudf::default_stream_value); - thrust::sequence(rmm::exec_policy(), data.begin(), data.end(), 1); + thrust::sequence(rmm::exec_policy(cudf::default_stream_value), data.begin(), data.end(), 1); auto null_mask = cudf::create_null_mask(100, cudf::mask_state::UNALLOCATED); rmm::device_buffer data1{data.data() + 100, 100 * sizeof(int32_t), cudf::default_stream_value}; @@ -105,14 +105,14 @@ TEST_F(CompoundColumnTest, ChildrenLevel1) { auto column = cudf::column_device_view::create(parent->view()); - EXPECT_TRUE(thrust::any_of(rmm::exec_policy(), + EXPECT_TRUE(thrust::any_of(rmm::exec_policy(cudf::default_stream_value), thrust::make_counting_iterator(0), thrust::make_counting_iterator(100), checker_for_level1{*column})); } { auto column = cudf::mutable_column_device_view::create(parent->mutable_view()); - EXPECT_TRUE(thrust::any_of(rmm::exec_policy(), + EXPECT_TRUE(thrust::any_of(rmm::exec_policy(cudf::default_stream_value), thrust::make_counting_iterator(0), thrust::make_counting_iterator(100), checker_for_level1{*column})); @@ -122,7 +122,7 @@ TEST_F(CompoundColumnTest, ChildrenLevel1) TEST_F(CompoundColumnTest, ChildrenLevel2) { rmm::device_uvector data(1000, cudf::default_stream_value); - thrust::sequence(rmm::exec_policy(), data.begin(), data.end(), 1); + thrust::sequence(rmm::exec_policy(cudf::default_stream_value), data.begin(), data.end(), 1); auto null_mask = cudf::create_null_mask(100, cudf::mask_state::UNALLOCATED); rmm::device_buffer data11{data.data() + 100, 100 * sizeof(int32_t), cudf::default_stream_value}; @@ -202,14 +202,14 @@ TEST_F(CompoundColumnTest, ChildrenLevel2) { auto column = cudf::column_device_view::create(parent->view()); - EXPECT_TRUE(thrust::any_of(rmm::exec_policy(), + EXPECT_TRUE(thrust::any_of(rmm::exec_policy(cudf::default_stream_value), thrust::make_counting_iterator(0), thrust::make_counting_iterator(100), checker_for_level2{*column})); } { auto column = cudf::mutable_column_device_view::create(parent->mutable_view()); - EXPECT_TRUE(thrust::any_of(rmm::exec_policy(), + EXPECT_TRUE(thrust::any_of(rmm::exec_policy(cudf::default_stream_value), thrust::make_counting_iterator(0), thrust::make_counting_iterator(100), checker_for_level2{*column})); diff --git a/cpp/tests/copying/concatenate_tests.cu b/cpp/tests/copying/concatenate_tests.cu index f5efd616e3d..c06afe85cff 100644 --- a/cpp/tests/copying/concatenate_tests.cu +++ b/cpp/tests/copying/concatenate_tests.cu @@ -522,11 +522,11 @@ TEST_F(OverflowTest, Presliced) // try and concatenate 4 string columns of with ~1/2 billion chars in each auto offsets = cudf::make_fixed_width_column(data_type{type_id::INT32}, num_rows + 1); - thrust::fill(rmm::exec_policy(), + thrust::fill(rmm::exec_policy(cudf::default_stream_value), offsets->mutable_view().begin(), offsets->mutable_view().end(), string_size); - thrust::exclusive_scan(rmm::exec_policy(), + thrust::exclusive_scan(rmm::exec_policy(cudf::default_stream_value), offsets->view().begin(), offsets->view().end(), offsets->mutable_view().begin()); @@ -596,11 +596,11 @@ TEST_F(OverflowTest, Presliced) // try and concatenate 4 struct columns of with ~1/2 billion elements in each auto offsets = cudf::make_fixed_width_column(data_type{type_id::INT32}, num_rows + 1); - thrust::fill(rmm::exec_policy(), + thrust::fill(rmm::exec_policy(cudf::default_stream_value), offsets->mutable_view().begin(), offsets->mutable_view().end(), list_size); - thrust::exclusive_scan(rmm::exec_policy(), + 
thrust::exclusive_scan(rmm::exec_policy(cudf::default_stream_value), offsets->view().begin(), offsets->view().end(), offsets->mutable_view().begin()); @@ -688,11 +688,11 @@ TEST_F(OverflowTest, BigColumnsSmallSlices) constexpr size_type string_size = inner_size / num_rows; auto offsets = cudf::make_fixed_width_column(data_type{type_id::INT32}, num_rows + 1); - thrust::fill(rmm::exec_policy(), + thrust::fill(rmm::exec_policy(cudf::default_stream_value), offsets->mutable_view().begin(), offsets->mutable_view().end(), string_size); - thrust::exclusive_scan(rmm::exec_policy(), + thrust::exclusive_scan(rmm::exec_policy(cudf::default_stream_value), offsets->view().begin(), offsets->view().end(), offsets->mutable_view().begin()); @@ -715,11 +715,11 @@ TEST_F(OverflowTest, BigColumnsSmallSlices) constexpr size_type list_size = inner_size / num_rows; auto offsets = cudf::make_fixed_width_column(data_type{type_id::INT32}, num_rows + 1); - thrust::fill(rmm::exec_policy(), + thrust::fill(rmm::exec_policy(cudf::default_stream_value), offsets->mutable_view().begin(), offsets->mutable_view().end(), list_size); - thrust::exclusive_scan(rmm::exec_policy(), + thrust::exclusive_scan(rmm::exec_policy(cudf::default_stream_value), offsets->view().begin(), offsets->view().end(), offsets->mutable_view().begin()); @@ -742,11 +742,11 @@ TEST_F(OverflowTest, BigColumnsSmallSlices) constexpr size_type list_size = inner_size / num_rows; auto offsets = cudf::make_fixed_width_column(data_type{type_id::INT32}, num_rows + 1); - thrust::fill(rmm::exec_policy(), + thrust::fill(rmm::exec_policy(cudf::default_stream_value), offsets->mutable_view().begin(), offsets->mutable_view().end(), list_size); - thrust::exclusive_scan(rmm::exec_policy(), + thrust::exclusive_scan(rmm::exec_policy(cudf::default_stream_value), offsets->view().begin(), offsets->view().end(), offsets->mutable_view().begin()); diff --git a/cpp/tests/copying/detail_gather_tests.cu b/cpp/tests/copying/detail_gather_tests.cu index 55623eec218..e3cd975ab41 100644 --- a/cpp/tests/copying/detail_gather_tests.cu +++ b/cpp/tests/copying/detail_gather_tests.cu @@ -48,7 +48,8 @@ TYPED_TEST(GatherTest, GatherDetailDeviceVectorTest) { constexpr cudf::size_type source_size{1000}; rmm::device_uvector gather_map(source_size, cudf::default_stream_value); - thrust::sequence(thrust::device, gather_map.begin(), gather_map.end()); + thrust::sequence( + rmm::exec_policy(cudf::default_stream_value), gather_map.begin(), gather_map.end()); auto data = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i; }); cudf::test::fixed_width_column_wrapper source_column(data, data + source_size); diff --git a/cpp/tests/copying/split_tests.cpp b/cpp/tests/copying/split_tests.cpp index b4add7d4123..85e28a5ec16 100644 --- a/cpp/tests/copying/split_tests.cpp +++ b/cpp/tests/copying/split_tests.cpp @@ -1360,7 +1360,8 @@ TEST_F(ContiguousSplitUntypedTest, ValidityEdgeCase) } } -TEST_F(ContiguousSplitUntypedTest, CalculationOverflow) +// This test requires about 25GB of device memory when used with the arena allocator +TEST_F(ContiguousSplitUntypedTest, DISABLED_VeryLargeColumnTest) { // tests an edge case where buf.elements * buf.element_size overflows an INT32. 
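// A back-of-the-envelope illustration of the overflow (hypothetical numbers,
// not the ones used by this test): 400'000'000 elements of 8 bytes each is
// 3'200'000'000 bytes, which exceeds INT32_MAX (2'147'483'647). The byte count
// therefore has to be computed in a 64-bit type, e.g.
//   auto const nbytes = static_cast<std::size_t>(buf.elements) * buf.element_size;
// rather than in a 32-bit intermediate.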
auto col = cudf::make_fixed_width_column( diff --git a/cpp/tests/device_atomics/device_atomics_test.cu b/cpp/tests/device_atomics/device_atomics_test.cu index d9788388fb1..17e67da6227 100644 --- a/cpp/tests/device_atomics/device_atomics_test.cu +++ b/cpp/tests/device_atomics/device_atomics_test.cu @@ -147,9 +147,11 @@ struct AtomicsTest : public cudf::test::BaseFixture { if (block_size == 0) { block_size = vec_size; } if (is_cas_test) { - gpu_atomicCAS_test<<>>(dev_result.data(), dev_data.data(), vec_size); + gpu_atomicCAS_test<<>>( + dev_result.data(), dev_data.data(), vec_size); } else { - gpu_atomic_test<<>>(dev_result.data(), dev_data.data(), vec_size); + gpu_atomic_test<<>>( + dev_result.data(), dev_data.data(), vec_size); } auto host_result = cudf::detail::make_host_vector_sync(dev_result); @@ -296,7 +298,7 @@ struct AtomicsBitwiseOpTest : public cudf::test::BaseFixture { if (block_size == 0) { block_size = vec_size; } - gpu_atomic_bitwiseOp_test<<>>( + gpu_atomic_bitwiseOp_test<<>>( reinterpret_cast(dev_result.data()), reinterpret_cast(dev_data.data()), vec_size); auto host_result = cudf::detail::make_host_vector_sync(dev_result); diff --git a/cpp/tests/error/error_handling_test.cu b/cpp/tests/error/error_handling_test.cu index a83f72bb9c8..e34cf23eee4 100644 --- a/cpp/tests/error/error_handling_test.cu +++ b/cpp/tests/error/error_handling_test.cu @@ -17,6 +17,7 @@ #include #include +#include #include #include @@ -89,7 +90,7 @@ TEST(DeathTest, CudaFatalError) { testing::FLAGS_gtest_death_test_style = "threadsafe"; auto call_kernel = []() { - kernel<<<1, 1>>>(); + kernel<<<1, 1, 0, cudf::default_stream_value.value()>>>(); try { CUDF_CUDA_TRY(cudaDeviceSynchronize()); } catch (const cudf::fatal_cuda_error& fe) { diff --git a/cpp/tests/fixed_point/fixed_point_tests.cu b/cpp/tests/fixed_point/fixed_point_tests.cu index c6f2bb6ce47..a4e0736e22f 100644 --- a/cpp/tests/fixed_point/fixed_point_tests.cu +++ b/cpp/tests/fixed_point/fixed_point_tests.cu @@ -85,8 +85,10 @@ TEST_F(FixedPointTest, DecimalXXThrustOnDevice) std::vector vec1(1000, decimal32{1, scale_type{-2}}); auto d_vec1 = cudf::detail::make_device_uvector_sync(vec1); - auto const sum = thrust::reduce( - rmm::exec_policy(), std::cbegin(d_vec1), std::cend(d_vec1), decimal32{0, scale_type{-2}}); + auto const sum = thrust::reduce(rmm::exec_policy(cudf::default_stream_value), + std::cbegin(d_vec1), + std::cend(d_vec1), + decimal32{0, scale_type{-2}}); EXPECT_EQ(static_cast(sum), 1000); @@ -99,8 +101,10 @@ TEST_F(FixedPointTest, DecimalXXThrustOnDevice) std::vector vec2(1000); std::iota(std::begin(vec2), std::end(vec2), 1); - auto const res1 = thrust::reduce( - rmm::exec_policy(), std::cbegin(d_vec1), std::cend(d_vec1), decimal32{0, scale_type{-2}}); + auto const res1 = thrust::reduce(rmm::exec_policy(cudf::default_stream_value), + std::cbegin(d_vec1), + std::cend(d_vec1), + decimal32{0, scale_type{-2}}); auto const res2 = std::accumulate(std::cbegin(vec2), std::cend(vec2), 0); @@ -108,7 +112,7 @@ TEST_F(FixedPointTest, DecimalXXThrustOnDevice) rmm::device_uvector d_vec3(1000, cudf::default_stream_value); - thrust::transform(rmm::exec_policy(), + thrust::transform(rmm::exec_policy(cudf::default_stream_value), std::cbegin(d_vec1), std::cend(d_vec1), std::begin(d_vec3), diff --git a/cpp/tests/groupby/lists_tests.cu b/cpp/tests/groupby/lists_tests.cu index c37e78f70f2..45c6b8fe2e6 100644 --- a/cpp/tests/groupby/lists_tests.cu +++ b/cpp/tests/groupby/lists_tests.cu @@ -28,6 +28,8 @@ #include #include +#include + #include #include @@ -122,7 
+124,8 @@ inline void test_hash_based_sum_agg(column_view const& keys, // For each row in expected table `t[0, num_rows)`, there must be a match // in the resulting table `t[num_rows, 2 * num_rows)` - EXPECT_TRUE(thrust::all_of(thrust::make_counting_iterator(0), + EXPECT_TRUE(thrust::all_of(rmm::exec_policy(cudf::default_stream_value), + thrust::make_counting_iterator(0), thrust::make_counting_iterator(num_rows), func)); } diff --git a/cpp/tests/hash_map/map_test.cu b/cpp/tests/hash_map/map_test.cu index c88f6a28f79..84e64027c5d 100644 --- a/cpp/tests/hash_map/map_test.cu +++ b/cpp/tests/hash_map/map_test.cu @@ -140,16 +140,18 @@ TYPED_TEST(InsertTest, UniqueKeysUniqueValues) { using map_type = typename TypeParam::map_type; using pair_type = typename TypeParam::pair_type; - thrust::tabulate( - rmm::exec_policy(), this->pairs.begin(), this->pairs.end(), unique_pair_generator{}); + thrust::tabulate(rmm::exec_policy(cudf::default_stream_value), + this->pairs.begin(), + this->pairs.end(), + unique_pair_generator{}); // All pairs should be new inserts - EXPECT_TRUE(thrust::all_of(rmm::exec_policy(), + EXPECT_TRUE(thrust::all_of(rmm::exec_policy(cudf::default_stream_value), this->pairs.begin(), this->pairs.end(), insert_pair{*this->map})); // All pairs should be present in the map - EXPECT_TRUE(thrust::all_of(rmm::exec_policy(), + EXPECT_TRUE(thrust::all_of(rmm::exec_policy(cudf::default_stream_value), this->pairs.begin(), this->pairs.end(), find_pair{*this->map})); @@ -159,23 +161,23 @@ TYPED_TEST(InsertTest, IdenticalKeysIdenticalValues) { using map_type = typename TypeParam::map_type; using pair_type = typename TypeParam::pair_type; - thrust::tabulate(rmm::exec_policy(), + thrust::tabulate(rmm::exec_policy(cudf::default_stream_value), this->pairs.begin(), this->pairs.end(), identical_pair_generator{}); // Insert a single pair - EXPECT_TRUE(thrust::all_of(rmm::exec_policy(), + EXPECT_TRUE(thrust::all_of(rmm::exec_policy(cudf::default_stream_value), this->pairs.begin(), this->pairs.begin() + 1, insert_pair{*this->map})); // Identical inserts should all return false (no new insert) - EXPECT_FALSE(thrust::all_of(rmm::exec_policy(), + EXPECT_FALSE(thrust::all_of(rmm::exec_policy(cudf::default_stream_value), this->pairs.begin(), this->pairs.end(), insert_pair{*this->map})); // All pairs should be present in the map - EXPECT_TRUE(thrust::all_of(rmm::exec_policy(), + EXPECT_TRUE(thrust::all_of(rmm::exec_policy(cudf::default_stream_value), this->pairs.begin(), this->pairs.end(), find_pair{*this->map})); @@ -185,30 +187,30 @@ TYPED_TEST(InsertTest, IdenticalKeysUniqueValues) { using map_type = typename TypeParam::map_type; using pair_type = typename TypeParam::pair_type; - thrust::tabulate(rmm::exec_policy(), + thrust::tabulate(rmm::exec_policy(cudf::default_stream_value), this->pairs.begin(), this->pairs.end(), identical_key_generator{}); // Insert a single pair - EXPECT_TRUE(thrust::all_of(rmm::exec_policy(), + EXPECT_TRUE(thrust::all_of(rmm::exec_policy(cudf::default_stream_value), this->pairs.begin(), this->pairs.begin() + 1, insert_pair{*this->map})); // Identical key inserts should all return false (no new insert) - EXPECT_FALSE(thrust::all_of(rmm::exec_policy(), + EXPECT_FALSE(thrust::all_of(rmm::exec_policy(cudf::default_stream_value), this->pairs.begin() + 1, this->pairs.end(), insert_pair{*this->map})); // Only first pair is present in map - EXPECT_TRUE(thrust::all_of(rmm::exec_policy(), + EXPECT_TRUE(thrust::all_of(rmm::exec_policy(cudf::default_stream_value), this->pairs.begin(), 
this->pairs.begin() + 1, find_pair{*this->map})); - EXPECT_FALSE(thrust::all_of(rmm::exec_policy(), + EXPECT_FALSE(thrust::all_of(rmm::exec_policy(cudf::default_stream_value), this->pairs.begin() + 1, this->pairs.end(), find_pair{*this->map})); diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index 232aaa51ef3..5a0db6e3c64 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -27,6 +27,9 @@ #include #include #include +#include + +#include #include @@ -156,14 +159,91 @@ void check_float_column(cudf::column_view const& col, struct JsonReaderTest : public cudf::test::BaseFixture { }; -TEST_F(JsonReaderTest, BasicJsonLines) +/** + * @brief Enum class to be used to specify the test case of parametrized tests + */ +enum class json_test_t { + // Run test with the existing JSON lines reader using row-orient input data + json_lines_row_orient, + // Run test with the existing JSON lines reader using record-orient input data + json_lines_record_orient, + // Run test with the nested JSON lines reader using record-orient input data + json_experimental_record_orient +}; + +/** + * @brief Test fixture for parametrized JSON reader tests + */ +struct JsonReaderParamTest : public cudf::test::BaseFixture, + public testing::WithParamInterface { +}; + +/** + * @brief Test fixture for parametrized JSON reader tests, testing record orient-only for existing + * JSON lines reader and the new experimental reader + */ +struct JsonReaderDualTest : public cudf::test::BaseFixture, + public testing::WithParamInterface { +}; + +/** + * @brief Generates a JSON lines string that uses the record orient + * + * @param records An array of a map of key-value pairs + * @param record_delimiter The delimiter to be used to delimit a record + * @param prefix The prefix prepended to the whole string + * @param suffix The suffix to be appended after the whole string + * @return The JSON lines string that uses the record orient + */ +std::string to_records_orient(std::vector> const& records, + std::string record_delimiter, + std::string prefix = "", + std::string suffix = "") { - std::string data = "[1, 1.1]\n[2, 2.2]\n[3, 3.3]\n"; + std::string result = prefix; + for (auto record_it = std::cbegin(records); record_it != std::cend(records); record_it++) { + result += "{"; + for (auto kv_pair_it = std::cbegin(*record_it); kv_pair_it != std::cend(*record_it); + kv_pair_it++) { + auto const& [key, value] = *kv_pair_it; + result += "\"" + key + "\":" + value; + result += (kv_pair_it != std::prev(std::end(*record_it))) ? 
", " : ""; + } + result += "}"; + if (record_it != std::prev(std::end(records))) { result += record_delimiter; } + } + return (result + suffix); +} + +// Parametrize qualifying JSON tests for executing both experimental reader and existing JSON lines +// reader +INSTANTIATE_TEST_CASE_P(JsonReaderParamTest, + JsonReaderParamTest, + ::testing::Values(json_test_t::json_lines_row_orient, + json_test_t::json_lines_record_orient, + json_test_t::json_experimental_record_orient)); + +// Parametrize qualifying JSON tests for executing both experimental reader and existing JSON lines +// reader +INSTANTIATE_TEST_CASE_P(JsonReaderDualTest, + JsonReaderDualTest, + ::testing::Values(json_test_t::json_lines_record_orient, + json_test_t::json_experimental_record_orient)); + +TEST_P(JsonReaderParamTest, BasicJsonLines) +{ + auto const test_opt = GetParam(); + bool const test_experimental = (test_opt == json_test_t::json_experimental_record_orient); + std::string row_orient = "[1, 1.1]\n[2, 2.2]\n[3, 3.3]\n"; + std::string record_orient = to_records_orient( + {{{"0", "1"}, {"1", "1.1"}}, {{"0", "2"}, {"1", "2.2"}}, {{"0", "3"}, {"1", "3.3"}}}, "\n"); + std::string data = (test_opt == json_test_t::json_lines_row_orient) ? row_orient : record_orient; cudf_io::json_reader_options in_options = cudf_io::json_reader_options::builder(cudf_io::source_info{data.data(), data.size()}) .dtypes(std::vector{dtype(), dtype()}) - .lines(true); + .lines(true) + .experimental(test_experimental); cudf_io::table_with_metadata result = cudf_io::read_json(in_options); EXPECT_EQ(result.tbl->num_columns(), 2); @@ -182,19 +262,36 @@ TEST_F(JsonReaderTest, BasicJsonLines) float64_wrapper{{1.1, 2.2, 3.3}, validity}); } -TEST_F(JsonReaderTest, FloatingPoint) +TEST_P(JsonReaderParamTest, FloatingPoint) { + auto const test_opt = GetParam(); + bool const test_experimental = (test_opt == json_test_t::json_experimental_record_orient); + std::string row_orient = + "[5.6]\n[0.5679e2]\n[1.2e10]\n[0.07e1]\n[3000e-3]\n[12.34e0]\n[3.1e-001]\n[-73." + "98007199999998]\n"; + std::string record_orient = to_records_orient({{{"0", "5.6"}}, + {{"0", "0.5679e2"}}, + {{"0", "1.2e10"}}, + {{"0", "0.07e1"}}, + {{"0", "3000e-3"}}, + {{"0", "12.34e0"}}, + {{"0", "3.1e-001"}}, + {{"0", "-73.98007199999998"}}}, + "\n"); + std::string data = (test_opt == json_test_t::json_lines_row_orient) ? row_orient : record_orient; + auto filepath = temp_env->get_temp_dir() + "FloatingPoint.json"; { std::ofstream outfile(filepath, std::ofstream::out); - outfile << "[5.6]\n[0.5679e2]\n[1.2e10]\n[0.07e1]\n[3000e-3]\n[12.34e0]\n[3.1e-001]\n[-73." 
- "98007199999998]\n"; + outfile << data; } cudf_io::json_reader_options in_options = cudf_io::json_reader_options::builder(cudf_io::source_info{filepath}) .dtypes({dtype()}) - .lines(true); + .lines(true) + .experimental(test_experimental); + cudf_io::table_with_metadata result = cudf_io::read_json(in_options); EXPECT_EQ(result.tbl->num_columns(), 1); @@ -211,14 +308,21 @@ TEST_F(JsonReaderTest, FloatingPoint) ASSERT_EQ((1u << result.tbl->get_column(0).size()) - 1, bitmask[0]); } -TEST_F(JsonReaderTest, JsonLinesStrings) +TEST_P(JsonReaderParamTest, JsonLinesStrings) { - std::string data = "[1, 1.1, \"aa \"]\n[2, 2.2, \" bbb\"]"; + auto const test_opt = GetParam(); + bool const test_experimental = (test_opt == json_test_t::json_experimental_record_orient); + std::string row_orient = "[1, 1.1, \"aa \"]\n[2, 2.2, \" bbb\"]"; + std::string record_orient = to_records_orient({{{"0", "1"}, {"1", "1.1"}, {"2", R"("aa ")"}}, + {{"0", "2"}, {"1", "2.2"}, {"2", R"(" bbb")"}}}, + "\n"); + std::string data = (test_opt == json_test_t::json_lines_row_orient) ? row_orient : record_orient; cudf_io::json_reader_options in_options = cudf_io::json_reader_options::builder(cudf_io::source_info{data.data(), data.size()}) .dtypes({{"2", dtype()}, {"0", dtype()}, {"1", dtype()}}) - .lines(true); + .lines(true) + .experimental(test_experimental); cudf_io::table_with_metadata result = cudf_io::read_json(in_options); @@ -241,8 +345,12 @@ TEST_F(JsonReaderTest, JsonLinesStrings) cudf::test::strings_column_wrapper({"aa ", " bbb"})); } -TEST_F(JsonReaderTest, MultiColumn) +TEST_P(JsonReaderParamTest, MultiColumn) { + auto const test_opt = GetParam(); + bool const test_experimental = (test_opt == json_test_t::json_experimental_record_orient); + bool const row_orient = (test_opt == json_test_t::json_lines_row_orient); + constexpr auto num_rows = 10; auto int8_values = random_values(num_rows); auto int16_values = random_values(num_rows); @@ -254,10 +362,25 @@ TEST_F(JsonReaderTest, MultiColumn) auto filepath = temp_env->get_temp_dir() + "MultiColumn.json"; { std::ostringstream line; - for (int i = 0; i < num_rows; ++i) { - line << "[" << std::to_string(int8_values[i]) << "," << int16_values[i] << "," - << int32_values[i] << "," << int64_values[i] << "," << float32_values[i] << "," - << float64_values[i] << "]\n"; + if (row_orient) { + for (int i = 0; i < num_rows; ++i) { + line << "[" << std::to_string(int8_values[i]) << "," << int16_values[i] << "," + << int32_values[i] << "," << int64_values[i] << "," << float32_values[i] << "," + << float64_values[i] << "]\n"; + } + } else { + std::vector> records; + for (int i = 0; i < num_rows; ++i) { + records.push_back({ + {"0", std::to_string(int8_values[i])}, // + {"1", std::to_string(int16_values[i])}, // + {"2", std::to_string(int32_values[i])}, // + {"3", std::to_string(int64_values[i])}, // + {"4", std::to_string(float32_values[i])}, // + {"5", std::to_string(float64_values[i])}, // + }); + } + line << to_records_orient(records, "\n"); } std::ofstream outfile(filepath, std::ofstream::out); outfile << line.str(); @@ -271,7 +394,8 @@ TEST_F(JsonReaderTest, MultiColumn) dtype(), dtype(), dtype()}) - .lines(true); + .lines(true) + .experimental(test_experimental); cudf_io::table_with_metadata result = cudf_io::read_json(in_options); auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); @@ -297,18 +421,33 @@ TEST_F(JsonReaderTest, MultiColumn) check_float_column(view.column(5), float64_values, validity); } -TEST_F(JsonReaderTest, 
Booleans) +TEST_P(JsonReaderParamTest, Booleans) { + auto const test_opt = GetParam(); + bool const test_experimental = (test_opt == json_test_t::json_experimental_record_orient); + std::string row_orient = "[true]\n[true]\n[false]\n[false]\n[true]"; + std::string record_orient = to_records_orient( + { + {{"0", "true"}}, + {{"0", "true"}}, + {{"0", "false"}}, + {{"0", "false"}}, + {{"0", "true"}}, + }, + "\n"); + std::string data = (test_opt == json_test_t::json_lines_row_orient) ? row_orient : record_orient; + auto filepath = temp_env->get_temp_dir() + "Booleans.json"; { std::ofstream outfile(filepath, std::ofstream::out); - outfile << "[true]\n[true]\n[false]\n[false]\n[true]"; + outfile << data; } cudf_io::json_reader_options in_options = cudf_io::json_reader_options::builder(cudf_io::source_info{filepath}) .dtypes({dtype()}) - .lines(true); + .lines(true) + .experimental(test_experimental); cudf_io::table_with_metadata result = cudf_io::read_json(in_options); // Booleans are the same (integer) data type, but valued at 0 or 1 @@ -322,21 +461,39 @@ TEST_F(JsonReaderTest, Booleans) bool_wrapper{{true, true, false, false, true}, validity}); } -TEST_F(JsonReaderTest, Dates) +TEST_P(JsonReaderParamTest, Dates) { + auto const test_opt = GetParam(); + bool const test_experimental = (test_opt == json_test_t::json_experimental_record_orient); + std::string row_orient = + "[05/03/2001]\n[31/10/2010]\n[20/10/1994]\n[18/10/1990]\n[1/1/1970]\n" + "[18/04/1995]\n[14/07/1994]\n[07/06/2006 11:20:30.400]\n" + "[16/09/2005T1:2:30.400PM]\n[2/2/1970]\n"; + std::string record_orient = to_records_orient({{{"0", R"("05/03/2001")"}}, + {{"0", R"("31/10/2010")"}}, + {{"0", R"("20/10/1994")"}}, + {{"0", R"("18/10/1990")"}}, + {{"0", R"("1/1/1970")"}}, + {{"0", R"("18/04/1995")"}}, + {{"0", R"("14/07/1994")"}}, + {{"0", R"("07/06/2006 11:20:30.400")"}}, + {{"0", R"("16/09/2005T1:2:30.400PM")"}}, + {{"0", R"("2/2/1970")"}}}, + "\n"); + std::string data = (test_opt == json_test_t::json_lines_row_orient) ? 
row_orient : record_orient; + auto filepath = temp_env->get_temp_dir() + "Dates.json"; { std::ofstream outfile(filepath, std::ofstream::out); - outfile << "[05/03/2001]\n[31/10/2010]\n[20/10/1994]\n[18/10/1990]\n[1/1/1970]\n"; - outfile << "[18/04/1995]\n[14/07/1994]\n[07/06/2006 11:20:30.400]\n"; - outfile << "[16/09/2005T1:2:30.400PM]\n[2/2/1970]\n"; + outfile << data; } cudf_io::json_reader_options in_options = cudf_io::json_reader_options::builder(cudf_io::source_info{filepath}) .dtypes({data_type{type_id::TIMESTAMP_MILLISECONDS}}) .lines(true) - .dayfirst(true); + .dayfirst(true) + .experimental(test_experimental); cudf_io::table_with_metadata result = cudf_io::read_json(in_options); const auto view = result.tbl->view(); @@ -359,21 +516,39 @@ TEST_F(JsonReaderTest, Dates) validity}); } -TEST_F(JsonReaderTest, Durations) +TEST_P(JsonReaderParamTest, Durations) { - auto filepath = temp_env->get_temp_dir() + "Durations.json"; + auto const test_opt = GetParam(); + bool const test_experimental = (test_opt == json_test_t::json_experimental_record_orient); + std::string row_orient = + "[-2]\n[-1]\n[0]\n" + "[1 days]\n[0 days 23:01:00]\n[0 days 00:00:00.000000123]\n" + "[0:0:0.000123]\n[0:0:0.000123000]\n[00:00:00.100000001]\n" + "[-2147483648]\n[2147483647]\n"; + std::string record_orient = to_records_orient({{{"0", "-2"}}, + {{"0", "-1"}}, + {{"0", "0"}}, + {{"0", R"("1 days")"}}, + {{"0", R"("0 days 23:01:00")"}}, + {{"0", R"("0 days 00:00:00.000000123")"}}, + {{"0", R"("0:0:0.000123")"}}, + {{"0", R"("0:0:0.000123000")"}}, + {{"0", R"("00:00:00.100000001")"}}, + {{"0", R"(-2147483648)"}}, + {{"0", R"(2147483647)"}}}, + "\n"); + std::string data = (test_opt == json_test_t::json_lines_row_orient) ? row_orient : record_orient; + auto filepath = temp_env->get_temp_dir() + "Durations.json"; { std::ofstream outfile(filepath, std::ofstream::out); - outfile << "[-2]\n[-1]\n[0]\n"; - outfile << "[1 days]\n[0 days 23:01:00]\n[0 days 00:00:00.000000123]\n"; - outfile << "[0:0:0.000123]\n[0:0:0.000123000]\n[00:00:00.100000001]\n"; - outfile << "[-2147483648]\n[2147483647]\n"; + outfile << data; } cudf_io::json_reader_options in_options = cudf_io::json_reader_options::builder(cudf_io::source_info{filepath}) .dtypes({data_type{type_id::DURATION_NANOSECONDS}}) - .lines(true); + .lines(true) + .experimental(test_experimental); cudf_io::table_with_metadata result = cudf_io::read_json(in_options); const auto view = result.tbl->view(); @@ -398,13 +573,20 @@ TEST_F(JsonReaderTest, Durations) validity}); } -TEST_F(JsonReaderTest, JsonLinesDtypeInference) +TEST_P(JsonReaderParamTest, JsonLinesDtypeInference) { - std::string data = "[100, 1.1, \"aa \"]\n[200, 2.2, \" bbb\"]"; + auto const test_opt = GetParam(); + bool const test_experimental = (test_opt == json_test_t::json_experimental_record_orient); + std::string row_orient = "[100, 1.1, \"aa \"]\n[200, 2.2, \" bbb\"]"; + std::string record_orient = to_records_orient({{{"0", "100"}, {"1", "1.1"}, {"2", R"("aa ")"}}, + {{"0", "200"}, {"1", "2.2"}, {"2", R"(" bbb")"}}}, + "\n"); + std::string data = (test_opt == json_test_t::json_lines_row_orient) ? 
row_orient : record_orient; cudf_io::json_reader_options in_options = cudf_io::json_reader_options::builder(cudf_io::source_info{data.data(), data.size()}) - .lines(true); + .lines(true) + .experimental(test_experimental); cudf_io::table_with_metadata result = cudf_io::read_json(in_options); @@ -427,15 +609,24 @@ TEST_F(JsonReaderTest, JsonLinesDtypeInference) cudf::test::strings_column_wrapper({"aa ", " bbb"})); } -TEST_F(JsonReaderTest, JsonLinesFileInput) +TEST_P(JsonReaderParamTest, JsonLinesFileInput) { + auto const test_opt = GetParam(); + bool const test_experimental = (test_opt == json_test_t::json_experimental_record_orient); + std::string row_orient = "[11, 1.1]\n[22, 2.2]"; + std::string record_orient = + to_records_orient({{{"0", "11"}, {"1", "1.1"}}, {{"0", "22"}, {"1", "2.2"}}}, "\n"); + std::string data = (test_opt == json_test_t::json_lines_row_orient) ? row_orient : record_orient; + const std::string fname = temp_env->get_temp_dir() + "JsonLinesFileTest.json"; std::ofstream outfile(fname, std::ofstream::out); - outfile << "[11, 1.1]\n[22, 2.2]"; + outfile << data; outfile.close(); cudf_io::json_reader_options in_options = - cudf_io::json_reader_options::builder(cudf_io::source_info{fname}).lines(true); + cudf_io::json_reader_options::builder(cudf_io::source_info{fname}) + .lines(true) + .experimental(test_experimental); cudf_io::table_with_metadata result = cudf_io::read_json(in_options); @@ -481,15 +672,19 @@ TEST_F(JsonReaderTest, JsonLinesByteRange) int64_wrapper{{3000, 4000, 5000}, validity}); } -TEST_F(JsonReaderTest, JsonLinesObjects) +TEST_P(JsonReaderDualTest, JsonLinesObjects) { - const std::string fname = temp_env->get_temp_dir() + "JsonLinesObjectsTest.json"; + auto const test_opt = GetParam(); + bool const test_experimental = (test_opt == json_test_t::json_experimental_record_orient); + const std::string fname = temp_env->get_temp_dir() + "JsonLinesObjectsTest.json"; std::ofstream outfile(fname, std::ofstream::out); outfile << " {\"co\\\"l1\" : 1, \"col2\" : 2.0} \n"; outfile.close(); cudf_io::json_reader_options in_options = - cudf_io::json_reader_options::builder(cudf_io::source_info{fname}).lines(true); + cudf_io::json_reader_options::builder(cudf_io::source_info{fname}) + .lines(true) + .experimental(test_experimental); cudf_io::table_with_metadata result = cudf_io::read_json(in_options); @@ -507,12 +702,15 @@ TEST_F(JsonReaderTest, JsonLinesObjects) CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(1), float64_wrapper{{2.0}, validity}); } -TEST_F(JsonReaderTest, JsonLinesObjectsStrings) +TEST_P(JsonReaderDualTest, JsonLinesObjectsStrings) { - auto test_json_objects = [](std::string const& data) { + auto const test_opt = GetParam(); + bool const test_experimental = (test_opt == json_test_t::json_experimental_record_orient); + auto test_json_objects = [test_experimental](std::string const& data) { cudf_io::json_reader_options in_options = cudf_io::json_reader_options::builder(cudf_io::source_info{data.data(), data.size()}) - .lines(true); + .lines(true) + .experimental(test_experimental); cudf_io::table_with_metadata result = cudf_io::read_json(in_options); @@ -545,15 +743,18 @@ TEST_F(JsonReaderTest, JsonLinesObjectsStrings) "{\"col3\":\"bbb\", \"col1\":200, \"col2\":2.2}\n"); } -TEST_F(JsonReaderTest, JsonLinesObjectsMissingData) +TEST_P(JsonReaderDualTest, JsonLinesObjectsMissingData) { + auto const test_opt = GetParam(); + bool const test_experimental = (test_opt == json_test_t::json_experimental_record_orient); // Note: columns will be ordered based 
on which fields appear first std::string const data = "{ \"col2\":1.1, \"col3\":\"aaa\"}\n" "{\"col1\":200, \"col3\":\"bbb\"}\n"; cudf_io::json_reader_options in_options = cudf_io::json_reader_options::builder(cudf_io::source_info{data.data(), data.size()}) - .lines(true); + .lines(true) + .experimental(test_experimental); cudf_io::table_with_metadata result = cudf_io::read_json(in_options); @@ -581,15 +782,18 @@ TEST_F(JsonReaderTest, JsonLinesObjectsMissingData) cudf::test::strings_column_wrapper({"aaa", "bbb"})); } -TEST_F(JsonReaderTest, JsonLinesObjectsOutOfOrder) +TEST_P(JsonReaderDualTest, JsonLinesObjectsOutOfOrder) { + auto const test_opt = GetParam(); + bool const test_experimental = (test_opt == json_test_t::json_experimental_record_orient); std::string const data = "{\"col1\":100, \"col2\":1.1, \"col3\":\"aaa\"}\n" "{\"col3\":\"bbb\", \"col1\":200, \"col2\":2.2}\n"; cudf_io::json_reader_options in_options = cudf_io::json_reader_options::builder(cudf_io::source_info{data.data(), data.size()}) - .lines(true); + .lines(true) + .experimental(test_experimental); cudf_io::table_with_metadata result = cudf_io::read_json(in_options); @@ -675,18 +879,31 @@ TEST_F(JsonReaderTest, ArrowFileSource) int8_wrapper{{9, 8, 7, 6, 5, 4, 3, 2}, validity}); } -TEST_F(JsonReaderTest, InvalidFloatingPoint) +TEST_P(JsonReaderParamTest, InvalidFloatingPoint) { + auto const test_opt = GetParam(); + bool const test_experimental = (test_opt == json_test_t::json_experimental_record_orient); + std::string row_orient = "[1.2e1+]\n[3.4e2-]\n[5.6e3e]\n[7.8e3A]\n[9.0Be1]\n[1C.2]"; + std::string record_orient = to_records_orient({{{"0", "1.2e1+"}}, + {{"0", "3.4e2-"}}, + {{"0", "5.6e3e"}}, + {{"0", "7.8e3A"}}, + {{"0", "9.0Be1"}}, + {{"0", "1C.2"}}}, + "\n"); + std::string data = (test_opt == json_test_t::json_lines_row_orient) ? row_orient : record_orient; + const auto filepath = temp_env->get_temp_dir() + "InvalidFloatingPoint.json"; { std::ofstream outfile(filepath, std::ofstream::out); - outfile << "[1.2e1+]\n[3.4e2-]\n[5.6e3e]\n[7.8e3A]\n[9.0Be1]\n[1C.2]"; + outfile << data; } cudf_io::json_reader_options in_options = cudf_io::json_reader_options::builder(cudf_io::source_info{filepath}) .dtypes({dtype()}) - .lines(true); + .lines(true) + .experimental(test_experimental); cudf_io::table_with_metadata result = cudf_io::read_json(in_options); EXPECT_EQ(result.tbl->num_columns(), 1); @@ -700,20 +917,30 @@ TEST_F(JsonReaderTest, InvalidFloatingPoint) ASSERT_EQ(0u, col_data.second[0]); } -TEST_F(JsonReaderTest, StringInference) +TEST_P(JsonReaderParamTest, StringInference) { - std::string buffer = "[\"-1\"]"; + auto const test_opt = GetParam(); + bool const test_experimental = (test_opt == json_test_t::json_experimental_record_orient); + std::string row_orient = "[\"-1\"]"; + std::string record_orient = to_records_orient({{{"0", R"("-1")"}}}, "\n"); + std::string data = (test_opt == json_test_t::json_lines_row_orient) ? 
row_orient : record_orient; + cudf_io::json_reader_options in_options = - cudf_io::json_reader_options::builder(cudf_io::source_info{buffer.c_str(), buffer.size()}) - .lines(true); + cudf_io::json_reader_options::builder(cudf_io::source_info{data.c_str(), data.size()}) + .lines(true) + .experimental(test_experimental); cudf_io::table_with_metadata result = cudf_io::read_json(in_options); EXPECT_EQ(result.tbl->num_columns(), 1); EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::STRING); } -TEST_F(JsonReaderTest, ParseInRangeIntegers) +TEST_P(JsonReaderParamTest, ParseInRangeIntegers) { + auto const test_opt = GetParam(); + bool const test_experimental = (test_opt == json_test_t::json_experimental_record_orient); + bool const row_orient = (test_opt == json_test_t::json_lines_row_orient); + constexpr auto num_rows = 4; std::vector small_int = {0, -10, 20, -30}; std::vector less_equal_int64_max = {std::numeric_limits::max() - 3, @@ -751,19 +978,41 @@ TEST_F(JsonReaderTest, ParseInRangeIntegers) auto filepath = temp_env->get_temp_dir() + "ParseInRangeIntegers.json"; { std::ostringstream line; - for (int i = 0; i < num_rows; ++i) { - line << "[" << small_int[i] << "," << less_equal_int64_max[i] << "," - << greater_equal_int64_min[i] << "," << greater_int64_max[i] << "," - << less_equal_uint64_max[i] << "," << small_int_append_zeros[i] << "," - << less_equal_int64_max_append_zeros[i] << "," << greater_equal_int64_min_append_zeros[i] - << "," << greater_int64_max_append_zeros[i] << "," - << less_equal_uint64_max_append_zeros[i] << "]\n"; + if (row_orient) { + for (int i = 0; i < num_rows; ++i) { + line << "[" << small_int[i] << "," << less_equal_int64_max[i] << "," + << greater_equal_int64_min[i] << "," << greater_int64_max[i] << "," + << less_equal_uint64_max[i] << "," << small_int_append_zeros[i] << "," + << less_equal_int64_max_append_zeros[i] << "," + << greater_equal_int64_min_append_zeros[i] << "," << greater_int64_max_append_zeros[i] + << "," << less_equal_uint64_max_append_zeros[i] << "]\n"; + } + } else { + std::vector> records; + for (int i = 0; i < num_rows; ++i) { + records.push_back({ + {"0", std::to_string(small_int[i])}, // + {"1", std::to_string(less_equal_int64_max[i])}, // + {"2", std::to_string(greater_equal_int64_min[i])}, // + {"3", std::to_string(greater_int64_max[i])}, // + {"4", std::to_string(less_equal_uint64_max[i])}, // + {"5", small_int_append_zeros[i]}, // + {"6", less_equal_int64_max_append_zeros[i]}, // + {"7", greater_equal_int64_min_append_zeros[i]}, // + {"8", greater_int64_max_append_zeros[i]}, // + {"9", less_equal_uint64_max_append_zeros[i]}, // + }); + } + line << to_records_orient(records, "\n"); } + std::ofstream outfile(filepath, std::ofstream::out); outfile << line.str(); } cudf_io::json_reader_options in_options = - cudf_io::json_reader_options::builder(cudf_io::source_info{filepath}).lines(true); + cudf_io::json_reader_options::builder(cudf_io::source_info{filepath}) + .lines(true) + .experimental(test_experimental); cudf_io::table_with_metadata result = cudf_io::read_json(in_options); @@ -782,8 +1031,12 @@ TEST_F(JsonReaderTest, ParseInRangeIntegers) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(input_less_equal_uint64_max, view.column(9)); } -TEST_F(JsonReaderTest, ParseOutOfRangeIntegers) +TEST_P(JsonReaderParamTest, ParseOutOfRangeIntegers) { + auto const test_opt = GetParam(); + bool const test_experimental = (test_opt == json_test_t::json_experimental_record_orient); + bool const row_orient = (test_opt == json_test_t::json_lines_row_orient); + 
constexpr auto num_rows = 4; std::vector out_of_range_positive = {"111111111111111111111", "2222222222222222222222", @@ -830,18 +1083,41 @@ TEST_F(JsonReaderTest, ParseOutOfRangeIntegers) auto filepath = temp_env->get_temp_dir() + "ParseOutOfRangeIntegers.json"; { std::ostringstream line; - for (int i = 0; i < num_rows; ++i) { - line << "[" << out_of_range_positive[i] << "," << out_of_range_negative[i] << "," - << greater_uint64_max[i] << "," << less_int64_min[i] << "," << mixed_range[i] << "," - << out_of_range_positive_append_zeros[i] << "," << out_of_range_negative_append_zeros[i] - << "," << greater_uint64_max_append_zeros[i] << "," << less_int64_min_append_zeros[i] - << "," << mixed_range_append_zeros[i] << "]\n"; + if (row_orient) { + for (int i = 0; i < num_rows; ++i) { + line << "[" << out_of_range_positive[i] << "," << out_of_range_negative[i] << "," + << greater_uint64_max[i] << "," << less_int64_min[i] << "," << mixed_range[i] << "," + << out_of_range_positive_append_zeros[i] << "," + << out_of_range_negative_append_zeros[i] << "," << greater_uint64_max_append_zeros[i] + << "," << less_int64_min_append_zeros[i] << "," << mixed_range_append_zeros[i] + << "]\n"; + } + } else { + std::vector> records; + for (int i = 0; i < num_rows; ++i) { + records.push_back({ + {"0", out_of_range_positive[i]}, // + {"1", out_of_range_negative[i]}, // + {"2", greater_uint64_max[i]}, // + {"3", less_int64_min[i]}, // + {"4", mixed_range[i]}, // + {"5", out_of_range_positive_append_zeros[i]}, // + {"6", out_of_range_negative_append_zeros[i]}, // + {"7", greater_uint64_max_append_zeros[i]}, // + {"8", less_int64_min_append_zeros[i]}, // + {"9", mixed_range_append_zeros[i]}, // + }); + } + line << to_records_orient(records, "\n"); } + std::ofstream outfile(filepath, std::ofstream::out); outfile << line.str(); } cudf_io::json_reader_options in_options = - cudf_io::json_reader_options::builder(cudf_io::source_info{filepath}).lines(true); + cudf_io::json_reader_options::builder(cudf_io::source_info{filepath}) + .lines(true) + .experimental(test_experimental); cudf_io::table_with_metadata result = cudf_io::read_json(in_options); @@ -859,20 +1135,30 @@ TEST_F(JsonReaderTest, ParseOutOfRangeIntegers) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(input_mixed_range_append, view.column(9)); } -TEST_F(JsonReaderTest, JsonLinesMultipleFileInputs) +TEST_P(JsonReaderParamTest, JsonLinesMultipleFileInputs) { + auto const test_opt = GetParam(); + bool const test_experimental = (test_opt == json_test_t::json_experimental_record_orient); + std::vector row_orient{"[11, 1.1]\n[22, 2.2]\n", "[33, 3.3]\n[44, 4.4]"}; + std::vector record_orient{ + to_records_orient({{{"0", "11"}, {"1", "1.1"}}, {{"0", "22"}, {"1", "2.2"}}}, "\n") + "\n", + to_records_orient({{{"0", "33"}, {"1", "3.3"}}, {{"0", "44"}, {"1", "4.4"}}}, "\n") + "\n"}; + auto const& data = (test_opt == json_test_t::json_lines_row_orient) ? 
row_orient : record_orient; + const std::string file1 = temp_env->get_temp_dir() + "JsonLinesFileTest1.json"; std::ofstream outfile(file1, std::ofstream::out); - outfile << "[11, 1.1]\n[22, 2.2]\n"; + outfile << data[0]; outfile.close(); const std::string file2 = temp_env->get_temp_dir() + "JsonLinesFileTest2.json"; std::ofstream outfile2(file2, std::ofstream::out); - outfile2 << "[33, 3.3]\n[44, 4.4]"; + outfile2 << data[1]; outfile2.close(); cudf_io::json_reader_options in_options = - cudf_io::json_reader_options::builder(cudf_io::source_info{{file1, file2}}).lines(true); + cudf_io::json_reader_options::builder(cudf_io::source_info{{file1, file2}}) + .lines(true) + .experimental(test_experimental); cudf_io::table_with_metadata result = cudf_io::read_json(in_options); @@ -1000,10 +1286,168 @@ TEST_F(JsonReaderTest, ExperimentalLinesNoOmissions) json_lines_options.enable_experimental(true); cudf::io::table_with_metadata new_reader_table = cudf::io::read_json(json_lines_options); - // Verify that the data read via non-nested JSON lines reader matches the data read via nested - // JSON reader + // Verify that the data read via non-nested JSON lines reader matches the data read via + // nested JSON reader CUDF_TEST_EXPECT_TABLES_EQUAL(current_reader_table.tbl->view(), new_reader_table.tbl->view()); } } +TEST_F(JsonReaderTest, TestColumnOrder) +{ + std::string const json_string = + // Expected order: + // root: b, c, a, d + // a: 2, 0, 1 + {R"({"b":"b0"} + {"c":"c1","a":{"2":null}} + {"d":"d2","a":{"0":"a2.0", "2":"a2.2"}} + {"b":"b3","a":{"1":null, "2":"a3.2"}})"}; + + std::vector const root_col_names{"b", "c", "a", "d"}; + std::vector const a_child_col_names{"2", "0", "1"}; + + // Initialize parsing options (reading json lines) + cudf::io::json_reader_options json_lines_options = + cudf::io::json_reader_options::builder( + cudf::io::source_info{json_string.c_str(), json_string.size()}) + .lines(true) + .experimental(true); + + // Read in data using nested JSON reader + cudf::io::table_with_metadata new_reader_table = cudf::io::read_json(json_lines_options); + + // Verify root column order (assert to avoid OOB access) + ASSERT_EQ(new_reader_table.metadata.schema_info.size(), root_col_names.size()); + + for (std::size_t i = 0; i < a_child_col_names.size(); i++) { + auto const& root_col_name = root_col_names[i]; + EXPECT_EQ(new_reader_table.metadata.schema_info[i].name, root_col_name); + } + + // Verify nested child column order (assert to avoid OOB access) + ASSERT_EQ(new_reader_table.metadata.schema_info[2].children.size(), a_child_col_names.size()); + for (std::size_t i = 0; i < a_child_col_names.size(); i++) { + auto const& a_child_col_name = a_child_col_names[i]; + EXPECT_EQ(new_reader_table.metadata.schema_info[2].children[i].name, a_child_col_name); + } + + // Verify data of root columns + ASSERT_EQ(root_col_names.size(), new_reader_table.tbl->num_columns()); + column_wrapper root_col_data_b{{"b0", "", "", "b3"}, + {true, false, false, true}}; + column_wrapper root_col_data_c{{"", "c1", "", ""}, + {false, true, false, false}}; + column_wrapper root_col_data_d{{"", "", "d2", ""}, + {false, false, true, false}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(root_col_data_b, new_reader_table.tbl->get_column(0)); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(root_col_data_c, new_reader_table.tbl->get_column(1)); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(root_col_data_d, new_reader_table.tbl->get_column(3)); + + // Verify data of child columns of column 'a' + auto const col_a = new_reader_table.tbl->get_column(2); + 
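// The assertions below rely on two properties that the input above is built to
// exercise: the children of "a" are materialized in first-seen order ("2",
// then "0", then "1"), and the field "1", which only ever appears with a null
// value, is still materialized as a (fully null) column.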
ASSERT_EQ(a_child_col_names.size(), col_a.num_children()); + column_wrapper col_a2{{"", "", "a2.2", "a3.2"}, {false, false, true, true}}; + column_wrapper col_a0{{"", "", "a2.0", ""}, {false, false, true, false}}; + // col a.1 is inferred as all-null + int8_wrapper col_a1{{0, 0, 0, 0}, {false, false, false, false}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(col_a2, col_a.child(0)); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(col_a0, col_a.child(1)); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(col_a1, col_a.child(2)); +} + +TEST_P(JsonReaderParamTest, JsonDtypeSchema) +{ + auto const test_opt = GetParam(); + bool const test_experimental = (test_opt == json_test_t::json_experimental_record_orient); + std::string row_orient = "[1, 1.1, \"aa \"]\n[2, 2.2, \" bbb\"]"; + std::string record_orient = to_records_orient({{{"0", "1"}, {"1", "1.1"}, {"2", R"("aa ")"}}, + {{"0", "2"}, {"1", "2.2"}, {"2", R"(" bbb")"}}}, + "\n"); + + std::string data = (test_opt == json_test_t::json_lines_row_orient) ? row_orient : record_orient; + + std::map dtype_schema{ + {"2", {dtype()}}, {"0", {dtype()}}, {"1", {dtype()}}}; + cudf_io::json_reader_options in_options = + cudf_io::json_reader_options::builder(cudf_io::source_info{data.data(), data.size()}) + .dtypes(dtype_schema) + .lines(true) + .experimental(test_experimental); + + cudf_io::table_with_metadata result = cudf_io::read_json(in_options); + + EXPECT_EQ(result.tbl->num_columns(), 3); + EXPECT_EQ(result.tbl->num_rows(), 2); + + EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::INT32); + EXPECT_EQ(result.tbl->get_column(1).type().id(), cudf::type_id::FLOAT64); + EXPECT_EQ(result.tbl->get_column(2).type().id(), cudf::type_id::STRING); + + EXPECT_EQ(result.metadata.schema_info[0].name, "0"); + EXPECT_EQ(result.metadata.schema_info[1].name, "1"); + EXPECT_EQ(result.metadata.schema_info[2].name, "2"); + + auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), int_wrapper{{1, 2}, validity}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(1), float64_wrapper{{1.1, 2.2}, validity}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(2), + cudf::test::strings_column_wrapper({"aa ", " bbb"})); +} + +TEST_F(JsonReaderTest, JsonNestedDtypeSchema) +{ + std::string json_string = R"( [{"a":[123, {"0": 123}], "b":1.0}, {"b":1.1}, {"b":2.1}])"; + + std::map dtype_schema{ + {"a", + { + data_type{cudf::type_id::LIST}, + {{"element", {data_type{cudf::type_id::STRUCT}, {{"0", {dtype()}}}}}}, + }}, + {"b", {dtype()}}, + }; + + cudf_io::json_reader_options in_options = + cudf_io::json_reader_options::builder( + cudf_io::source_info{json_string.data(), json_string.size()}) + .dtypes(dtype_schema) + .lines(false) + .experimental(true); + + cudf_io::table_with_metadata result = cudf_io::read_json(in_options); + + // Make sure we have columns "a" and "b" + ASSERT_EQ(result.tbl->num_columns(), 2); + ASSERT_EQ(result.metadata.schema_info.size(), 2); + EXPECT_EQ(result.metadata.schema_info[0].name, "a"); + EXPECT_EQ(result.metadata.schema_info[1].name, "b"); + // Make sure column "a" is a list column (offsets and elements) + ASSERT_EQ(result.tbl->get_column(0).num_children(), 2); + ASSERT_EQ(result.metadata.schema_info[0].children.size(), 2); + // Make sure column "b" is a leaf column + ASSERT_EQ(result.tbl->get_column(1).num_children(), 0); + ASSERT_EQ(result.metadata.schema_info[1].children.size(), 0); + // Offsets child with no other child columns + 
ASSERT_EQ(result.tbl->get_column(0).child(0).num_children(), 0); + ASSERT_EQ(result.metadata.schema_info[0].children[0].children.size(), 0); + EXPECT_EQ(result.metadata.schema_info[0].children[0].name, "offsets"); + // Elements is the struct column with a single child column "0" + ASSERT_EQ(result.tbl->get_column(0).child(1).num_children(), 1); + ASSERT_EQ(result.metadata.schema_info[0].children[1].children.size(), 1); + EXPECT_EQ(result.metadata.schema_info[0].children[1].name, "element"); + + // Verify column "a" being a list column + EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::LIST); + // Verify column "a->element->0" is a float column + EXPECT_EQ(result.tbl->get_column(0).child(1).child(0).type().id(), cudf::type_id::FLOAT32); + // Verify column "b" is an int column + EXPECT_EQ(result.tbl->get_column(1).type().id(), cudf::type_id::INT32); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0).child(1).child(0), + float_wrapper{{0.0, 123.0}, {false, true}}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(1), + int_wrapper{{1, 1, 2}, {true, true, true}}); +} + CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/io/json_tree.cpp b/cpp/tests/io/json_tree.cpp new file mode 100644 index 00000000000..940d9d8ca0a --- /dev/null +++ b/cpp/tests/io/json_tree.cpp @@ -0,0 +1,826 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include <io/json/nested_json.hpp>
+#include <io/utilities/hostdevice_vector.hpp>
+
+#include <cudf/detail/utilities/vector_factories.hpp>
+#include <cudf/io/json.hpp>
+#include <cudf/scalar/scalar.hpp>
+#include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/span.hpp>
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_utilities.hpp>
+#include <cudf_test/cudf_gtest.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_uvector.hpp>
+
+#include <stack>
+
+namespace cuio_json = cudf::io::json;
+namespace cudf::io::json {
+// Host copy of tree_meta_t
+struct tree_meta_t2 {
+  std::vector<NodeT> node_categories;
+  std::vector<NodeIndexT> parent_node_ids;
+  std::vector<TreeDepthT> node_levels;
+  std::vector<SymbolOffsetT> node_range_begin;
+  std::vector<SymbolOffsetT> node_range_end;
+};
+}  // namespace cudf::io::json
+
+namespace {
+std::string get_node_string(std::size_t const node_id,
+                            cuio_json::tree_meta_t2 const& tree_rep,
+                            std::string const& json_input)
+{
+  auto node_to_str = [](cuio_json::PdaTokenT const token) {
+    switch (token) {
+      case cuio_json::NC_STRUCT: return "STRUCT";
+      case cuio_json::NC_LIST: return "LIST";
+      case cuio_json::NC_FN: return "FN";
+      case cuio_json::NC_STR: return "STR";
+      case cuio_json::NC_VAL: return "VAL";
+      case cuio_json::NC_ERR: return "ERR";
+      default: return "N/A";
+    };
+  };
+
+  return "<" + std::to_string(node_id) + ":" + node_to_str(tree_rep.node_categories[node_id]) +
+         ":[" + std::to_string(tree_rep.node_range_begin[node_id]) + ", " +
+         std::to_string(tree_rep.node_range_end[node_id]) + ") '" +
+         json_input.substr(tree_rep.node_range_begin[node_id],
+                           tree_rep.node_range_end[node_id] - tree_rep.node_range_begin[node_id]) +
+         "'>";
+}
+
+void print_tree_representation(std::string const& json_input,
+                               cuio_json::tree_meta_t2 const& tree_rep)
+{
+  for (std::size_t i = 0; i < tree_rep.node_categories.size(); i++) {
+    auto parent_id = tree_rep.parent_node_ids[i];
+    std::stack<std::size_t> path;
+    path.push(i);
+    while (parent_id != cuio_json::parent_node_sentinel) {
+      path.push(parent_id);
+      parent_id = tree_rep.parent_node_ids[parent_id];
+    }
+
+    while (path.size()) {
+      auto const node_id = path.top();
+      std::cout << get_node_string(node_id, tree_rep, json_input)
+                << (path.size() > 1 ? " -> " : "");
+      path.pop();
+    }
+    std::cout << "\n";
+  }
+}
+}  // namespace
+
+namespace cudf::io::json {
+namespace test {
+
+tree_meta_t2 to_cpu_tree(tree_meta_t const& d_value, rmm::cuda_stream_view stream)
+{
+  return {cudf::detail::make_std_vector_async(d_value.node_categories, stream),
+          cudf::detail::make_std_vector_async(d_value.parent_node_ids, stream),
+          cudf::detail::make_std_vector_async(d_value.node_levels, stream),
+          cudf::detail::make_std_vector_async(d_value.node_range_begin, stream),
+          cudf::detail::make_std_vector_async(d_value.node_range_end, stream)};
+}
+
+// DEBUG prints
+auto to_cat = [](auto v) -> std::string {
+  switch (v) {
+    case NC_STRUCT: return " S";
+    case NC_LIST: return " L";
+    case NC_STR: return " \"";
+    case NC_VAL: return " V";
+    case NC_FN: return " F";
+    case NC_ERR: return "ER";
+    default: return "UN";
+  };
+};
+auto to_int    = [](auto v) { return std::to_string(static_cast<int>(v)); };
+auto print_vec = [](auto const& cpu, auto const name, auto converter) {
+  for (auto const& v : cpu)
+    printf("%3s,", converter(v).c_str());
+  std::cout << name << std::endl;
+};
+void print_tree(tree_meta_t2 const& cpu_tree)
+{
+  print_vec(cpu_tree.node_categories, "node_categories", to_cat);
+  print_vec(cpu_tree.parent_node_ids, "parent_node_ids", to_int);
+  print_vec(cpu_tree.node_levels, "node_levels", to_int);
+  print_vec(cpu_tree.node_range_begin, "node_range_begin", to_int);
+  print_vec(cpu_tree.node_range_end, "node_range_end", to_int);
+}
+void print_tree(tree_meta_t const& d_gpu_tree)
+{
+  auto const cpu_tree = to_cpu_tree(d_gpu_tree, rmm::cuda_stream_default);
+  print_tree(cpu_tree);
+}
+
+template <typename T>
+bool compare_vector(std::vector<T> const& cpu_vec,
+                    std::vector<T> const& gpu_vec,
+                    std::string const& name)
+{
+  EXPECT_EQ(cpu_vec.size(), gpu_vec.size());
+  bool mismatch = false;
+  if (!std::equal(cpu_vec.begin(), cpu_vec.end(), gpu_vec.begin())) {
+    print_vec(cpu_vec, name + "(cpu)", to_int);
+    print_vec(gpu_vec, name + "(gpu)", to_int);
+    for (auto i = 0lu; i < cpu_vec.size(); i++) {
+      mismatch |= (cpu_vec[i] != gpu_vec[i]);
+      printf("%3s,", (cpu_vec[i] == gpu_vec[i] ? " " : "x"));
+    }
+    printf("\n");
+  }
+  EXPECT_FALSE(mismatch);
+  return mismatch;
+}
+
+template <typename T>
+bool compare_vector(std::vector<T> const& cpu_vec,
+                    rmm::device_uvector<T> const& d_vec,
+                    std::string const& name)
+{
+  auto gpu_vec = cudf::detail::make_std_vector_async(d_vec, cudf::default_stream_value);
+  return compare_vector(cpu_vec, gpu_vec, name);
+}
+
+void compare_trees(tree_meta_t2 const& cpu_tree, tree_meta_t const& d_gpu_tree, bool print = false)
+{
+  auto cpu_num_nodes = cpu_tree.node_categories.size();
+  EXPECT_EQ(cpu_num_nodes, d_gpu_tree.node_categories.size());
+  EXPECT_EQ(cpu_num_nodes, d_gpu_tree.parent_node_ids.size());
+  EXPECT_EQ(cpu_num_nodes, d_gpu_tree.node_levels.size());
+  EXPECT_EQ(cpu_num_nodes, d_gpu_tree.node_range_begin.size());
+  EXPECT_EQ(cpu_num_nodes, d_gpu_tree.node_range_end.size());
+  auto gpu_tree = to_cpu_tree(d_gpu_tree, cudf::default_stream_value);
+  bool mismatch = false;
+
+#define COMPARE_MEMBER(member)                                                       \
+  for (std::size_t i = 0; i < cpu_num_nodes; i++) {                                  \
+    EXPECT_EQ(cpu_tree.member[i], gpu_tree.member[i]) << #member << "[" << i << "]"; \
+  }
+  COMPARE_MEMBER(node_categories);
+  COMPARE_MEMBER(parent_node_ids);
+  COMPARE_MEMBER(node_levels);
+  COMPARE_MEMBER(node_range_begin);
+  COMPARE_MEMBER(node_range_end);
+#undef COMPARE_MEMBER
+
+#define PRINT_VEC(vec, conv) print_vec(vec, #vec, conv);
+#define PRINT_COMPARISON(vec, conv)                                                  \
+  PRINT_VEC(cpu_tree.vec, conv);                                                     \
+  PRINT_VEC(gpu_tree.vec, conv);                                                     \
+  if (!std::equal(cpu_tree.vec.begin(), cpu_tree.vec.end(), gpu_tree.vec.begin())) { \
+    for (auto i = 0lu; i < cpu_tree.vec.size(); i++) {                               \
+      mismatch |= (gpu_tree.vec[i] != cpu_tree.vec[i]);                              \
+      printf("%3s,", (gpu_tree.vec[i] == cpu_tree.vec[i] ? " " : "x"));              \
+    }                                                                                \
+    printf("\n");                                                                    \
+  }
+  if (print) {
+    for (int i = 0; i < int(cpu_num_nodes); i++)
+      printf("%3d,", i);
+    printf(" node_id\n");
+    PRINT_COMPARISON(node_categories, to_cat);   // Works
+    PRINT_COMPARISON(node_levels, to_int);       // Works
+    PRINT_COMPARISON(node_range_begin, to_int);  // Works
+    PRINT_COMPARISON(node_range_end, to_int);    // Works
+    PRINT_COMPARISON(parent_node_ids, to_int);   // Works
+    EXPECT_FALSE(mismatch);
+  }
+#undef PRINT_VEC
+#undef PRINT_COMPARISON
+}
+
+template <typename T>
+auto translate_col_id(T const& col_id)
+{
+  using value_type = typename T::value_type;
+  std::unordered_map<value_type, value_type> col_id_map;
+  std::vector<value_type> new_col_ids(col_id.size());
+  value_type unique_id = 0;
+  for (auto id : col_id) {
+    if (col_id_map.count(id) == 0) { col_id_map[id] = unique_id++; }
+  }
+  for (size_t i = 0; i < col_id.size(); i++) {
+    new_col_ids[i] = col_id_map[col_id[i]];
+  }
+  return new_col_ids;
+}
+
+tree_meta_t2 get_tree_representation_cpu(device_span<PdaTokenT const> tokens_gpu,
+                                         device_span<SymbolOffsetT const> token_indices_gpu1,
+                                         cudf::io::json_reader_options const& options,
+                                         rmm::cuda_stream_view stream)
+{
+  constexpr bool include_quote_char = true;
+  // Copy the JSON tokens to the host
+  thrust::host_vector<PdaTokenT> tokens = cudf::detail::make_host_vector_async(tokens_gpu, stream);
+  thrust::host_vector<SymbolOffsetT> token_indices =
+    cudf::detail::make_host_vector_async(token_indices_gpu1, stream);
+
+  // Make sure tokens have been copied to the host
+  stream.synchronize();
+
+  // DEBUG print
+  [[maybe_unused]] auto to_token_str = [](PdaTokenT token) {
+    switch (token) {
+      case token_t::StructBegin: return " {";
+      case token_t::StructEnd: return " }";
+      case token_t::ListBegin: return " [";
+      case token_t::ListEnd: return " ]";
+      case token_t::FieldNameBegin: return "FB";
+      case token_t::FieldNameEnd: return "FE";
+      case token_t::StringBegin: return "SB";
+      case token_t::StringEnd: return "SE";
+      case token_t::ErrorBegin: return "er";
+      case token_t::ValueBegin: return "VB";
+      case token_t::ValueEnd: return "VE";
+      case token_t::StructMemberBegin: return " <";
+      case token_t::StructMemberEnd: return " >";
+      default: return ".";
+    }
+  };
+  if (std::getenv("NJP_DEBUG_DUMP") != nullptr) {
+    std::cout << "Tokens: \n";
+    for (auto i = 0u; i < tokens.size(); i++) {
+      std::cout << to_token_str(tokens[i]) << " ";
+    }
+    std::cout << std::endl;
+  }
+
+  // Whether a token represents a node in the tree representation
+  auto is_node = [](PdaTokenT const token) {
+    switch (token) {
+      case token_t::StructBegin:
+      case token_t::ListBegin:
+      case token_t::StringBegin:
+      case token_t::ValueBegin:
+      case token_t::FieldNameBegin:
+      case token_t::ErrorBegin: return true;
+      default: return false;
+    };
+  };
+
+  // The node that a token represents
+  auto token_to_node = [](PdaTokenT const token) {
+    switch (token) {
+      case token_t::StructBegin: return NC_STRUCT;
+      case token_t::ListBegin: return NC_LIST;
+      case token_t::StringBegin: return NC_STR;
+      case token_t::ValueBegin: return NC_VAL;
+      case token_t::FieldNameBegin: return NC_FN;
+      default: return NC_ERR;
+    };
+  };
+
+  // Includes the quote char for the end-of-string token, or skips the quote char for the
+  // beginning-of-field-name token
+  auto get_token_index = [include_quote_char](PdaTokenT const token,
+                                              SymbolOffsetT const token_index) {
+    constexpr SymbolOffsetT quote_char_size = 1;
+    switch (token) {
+      // Strip off or include quote char for StringBegin
+      case token_t::StringBegin: return token_index + (include_quote_char ? 0 : quote_char_size);
+      // Strip off or include trailing quote char for string values for StringEnd
+      case token_t::StringEnd: return token_index + (include_quote_char ? quote_char_size : 0);
+      // Strip off quote char included for FieldNameBegin
+      case token_t::FieldNameBegin: return token_index + quote_char_size;
+      default: return token_index;
+    };
+  };
+
+  // Whether a token expects to be followed by its respective end-of-* token partner
+  auto is_begin_of_section = [](PdaTokenT const token) {
+    switch (token) {
+      case token_t::StringBegin:
+      case token_t::ValueBegin:
+      case token_t::FieldNameBegin: return true;
+      default: return false;
+    };
+  };
+
+  // The end-of-* partner token for a given beginning-of-* token
+  auto end_of_partner = [](PdaTokenT const token) {
+    switch (token) {
+      case token_t::StringBegin: return token_t::StringEnd;
+      case token_t::ValueBegin: return token_t::ValueEnd;
+      case token_t::FieldNameBegin: return token_t::FieldNameEnd;
+      default: return token_t::ErrorBegin;
+    };
+  };
+
+  // Whether the token pops from the parent node stack
+  auto does_pop = [](PdaTokenT const token) {
+    switch (token) {
+      case token_t::StructEnd:
+      case token_t::ListEnd: return true;
+      default: return false;
+    };
+  };
+
+  // Whether the token pushes onto the parent node stack
+  auto does_push = [](PdaTokenT const token) {
+    switch (token) {
+      case token_t::StructBegin:
+      case token_t::ListBegin: return true;
+      default: return false;
+    };
+  };
+
+  // The node id sitting on top of the stack becomes the node's parent
+  // The full stack represents the path from the root to the current node
+  std::stack<std::pair<NodeIndexT, bool>> parent_stack;
+
+  constexpr bool field_name_node    = true;
+  constexpr bool no_field_name_node = false;
+
+  std::vector<NodeT> node_categories;
+  std::vector<NodeIndexT> parent_node_ids;
+  std::vector<TreeDepthT> node_levels;
+  std::vector<SymbolOffsetT> node_range_begin;
+  std::vector<SymbolOffsetT> node_range_end;
+
+  std::size_t node_id = 0;
+  for (std::size_t i = 0; i < tokens.size(); i++) {
+    auto token = tokens[i];
+
+    // The section from the original JSON input that this token demarcates
+    std::size_t range_begin = get_token_index(token, token_indices[i]);
+    std::size_t range_end   = range_begin + 1;
+
+    // Identify this node's parent node id
+    std::size_t parent_node_id =
+      (parent_stack.size() > 0) ? parent_stack.top().first : parent_node_sentinel;
+
+    // If this token is the beginning-of-{value, string, field name}, also consume the next
+    // end-of-* token
+    if (is_begin_of_section(token)) {
+      if ((i + 1) < tokens.size() && end_of_partner(token) == tokens[i + 1]) {
+        // Update the range_end for this pair of tokens
+        range_end = get_token_index(tokens[i + 1], token_indices[i + 1]);
+        // We can skip the subsequent end-of-* token
+        i++;
+      }
+    }
+
+    // Emit node if this token becomes a node in the tree
+    if (is_node(token)) {
+      node_categories.push_back(token_to_node(token));
+      parent_node_ids.push_back(parent_node_id);
+      node_levels.push_back(parent_stack.size());
+      node_range_begin.push_back(range_begin);
+      node_range_end.push_back(range_end);
+    }
+
+    // Modify the stack if needed
+    if (token == token_t::FieldNameBegin) {
+      parent_stack.push({node_id, field_name_node});
+    } else {
+      if (does_push(token)) {
+        parent_stack.push({node_id, no_field_name_node});
+      } else if (does_pop(token)) {
+        CUDF_EXPECTS(parent_stack.size() >= 1, "Invalid JSON input.");
+        parent_stack.pop();
+      }
+
+      // If what we're left with is a field name on top of stack, we need to pop it
+      if (parent_stack.size() >= 1 && parent_stack.top().second == field_name_node) {
+        parent_stack.pop();
+      }
+    }
+
+    // Update node_id
+    if (is_node(token)) { node_id++; }
+  }
+
+  return {std::move(node_categories),
+          std::move(parent_node_ids),
+          std::move(node_levels),
+          std::move(node_range_begin),
+          std::move(node_range_end)};
+}
+
+std::tuple<std::vector<NodeIndexT>, std::vector<size_type>> records_orient_tree_traversal_cpu(
+  host_span<SymbolT const> input, tree_meta_t2 const& tree, rmm::cuda_stream_view stream)
+{
+  std::vector<NodeIndexT> node_ids(tree.parent_node_ids.size());
+  std::iota(node_ids.begin(), node_ids.end(), 0);
+
+  if (std::getenv("NJP_DEBUG_DUMP") != nullptr) {
+    for (int i = 0; i < int(tree.node_range_begin.size()); i++) {
+      printf("%3s ",
+             std::string(input.data() + tree.node_range_begin[i],
+                         tree.node_range_end[i] - tree.node_range_begin[i])
+               .c_str());
+    }
+    printf(" (JSON)\n");
+    print_vec(tree.node_categories, "node_categories", to_cat);
+    print_vec(node_ids, "cpu.node_ids", to_int);
+  }
+
+  // print_vec(tree.parent_node_ids, "tree.parent_node_ids (before)");
+  constexpr NodeIndexT top_node = -1;
+  // CPU version of the algorithm
+  // Calculate row offsets too.
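+  // Column-id assignment below, in brief (worked example; the concrete ids depend on
+  // first-visit order): two nodes land in the same column iff their root-to-node paths of
+  // (level, category[, field name]) compare equal. For records-orient input
+  // [{"a":1},{"a":2}], the two STRUCT nodes, the two FN("a") nodes and the two value nodes
+  // each share a path, so each pair collapses onto one column id; the enclosing LIST then
+  // yields row offsets 0 and 1 for the two records in the loop further below.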
+  auto hash_path = [&](auto node_id) {
+    size_t seed = 0;
+    while (node_id != top_node) {
+      seed = cudf::detail::hash_combine(seed, std::hash<TreeDepthT>{}(tree.node_levels[node_id]));
+      seed = cudf::detail::hash_combine(seed, std::hash<NodeT>{}(tree.node_categories[node_id]));
+      if (tree.node_categories[node_id] == node_t::NC_FN) {
+        auto field_name =
+          std::string_view(input.data() + tree.node_range_begin[node_id],
+                           tree.node_range_end[node_id] - tree.node_range_begin[node_id]);
+        seed = cudf::detail::hash_combine(seed, std::hash<std::string_view>{}(field_name));
+      }
+      node_id = tree.parent_node_ids[node_id];
+    }
+    return seed;
+  };
+  auto equal_path = [&](auto node_id1, auto node_id2) {
+    bool is_equal = true;
+    while (is_equal and node_id1 != top_node and node_id2 != top_node) {
+      is_equal &= tree.node_levels[node_id1] == tree.node_levels[node_id2];
+      is_equal &= tree.node_categories[node_id1] == tree.node_categories[node_id2];
+      if (is_equal and tree.node_categories[node_id1] == node_t::NC_FN) {
+        auto field_name1 =
+          std::string_view(input.data() + tree.node_range_begin[node_id1],
+                           tree.node_range_end[node_id1] - tree.node_range_begin[node_id1]);
+        auto field_name2 =
+          std::string_view(input.data() + tree.node_range_begin[node_id2],
+                           tree.node_range_end[node_id2] - tree.node_range_begin[node_id2]);
+        is_equal &= field_name1 == field_name2;
+      }
+      node_id1 = tree.parent_node_ids[node_id1];
+      node_id2 = tree.parent_node_ids[node_id2];
+    }
+    return is_equal and node_id1 == top_node and node_id2 == top_node;
+  };
+  std::unordered_map<NodeIndexT, NodeIndexT, decltype(hash_path), decltype(equal_path)> node_id_map(
+    10, hash_path, equal_path);
+  auto unique_col_id = 0;
+  for (auto& node_idx : node_ids) {
+    if (node_id_map.count(node_idx) == 0) {
+      node_id_map[node_idx] = unique_col_id++;  // node_idx;
+      node_idx              = node_id_map[node_idx];
+    } else {
+      node_idx = node_id_map[node_idx];
+    }
+  }
+  // Translate parent_node_ids
+  auto parent_col_ids(tree.parent_node_ids);
+  for (auto& parent_node_id : parent_col_ids) {
+    if (parent_node_id != top_node) parent_node_id = node_ids[parent_node_id];
+  }
+  if (std::getenv("NJP_DEBUG_DUMP") != nullptr) {
+    print_vec(node_ids, "cpu.node_ids (after)", to_int);
+    print_vec(tree.parent_node_ids, "cpu.parent_node_ids (after)", to_int);
+  }
+  // row_offsets
+  std::vector<size_type> row_offsets(tree.parent_node_ids.size(), 0);
+  std::unordered_map<NodeIndexT, size_type> col_id_current_offset;
+  for (std::size_t i = 0; i < tree.parent_node_ids.size(); i++) {
+    auto current_col_id = node_ids[i];
+    auto parent_col_id  = parent_col_ids[i];
+    auto parent_node_id = tree.parent_node_ids[i];
+    if (parent_col_id == top_node) {
+      // row_offsets[current_col_id] = 0; // JSON lines treats top node as list.
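+      // A parent equal to the sentinel means this node is an immediate child of the root,
+      // i.e. one record in records orient / JSON lines input, so each occurrence of this
+      // column starts a new row: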
+      col_id_current_offset[current_col_id]++;
+      row_offsets[i] = col_id_current_offset[current_col_id] - 1;
+    } else {
+      if (tree.node_categories[parent_node_id] == node_t::NC_LIST) {
+        col_id_current_offset[current_col_id]++;
+        row_offsets[i] = col_id_current_offset[current_col_id] - 1;
+      } else {
+        row_offsets[i]                         = col_id_current_offset[parent_col_id] - 1;
+        col_id_current_offset[current_col_id] = col_id_current_offset[parent_col_id];
+      }
+    }
+  }
+  if (std::getenv("NJP_DEBUG_DUMP") != nullptr) {
+    print_vec(row_offsets, "cpu.row_offsets (generated)", to_int);
+  }
+  return {std::move(node_ids), std::move(row_offsets)};
+}
+
+}  // namespace test
+}  // namespace cudf::io::json
+
+namespace json_test = cudf::io::json::test;
+
+// Base test fixture for tests
+struct JsonTest : public cudf::test::BaseFixture {
+};
+
+TEST_F(JsonTest, TreeRepresentation)
+{
+  auto const stream = cudf::default_stream_value;
+
+  // Test input
+  std::string const input = R"(  [{)"
+                            R"("category": "reference",)"
+                            R"("index:": [4,12,42],)"
+                            R"("author": "Nigel Rees",)"
+                            R"("title": "[Sayings of the Century]",)"
+                            R"("price": 8.95)"
+                            R"(},  )"
+                            R"({)"
+                            R"("category": "reference",)"
+                            R"("index": [4,{},null,{"a":[{ }, {}] } ],)"
+                            R"("author": "Nigel Rees",)"
+                            R"("title": "{}[], <=semantic-symbols-string",)"
+                            R"("price": 8.95)"
+                            R"(}] )";
+  // Prepare input & output buffers
+  cudf::string_scalar const d_scalar(input, true, stream);
+  auto const d_input = cudf::device_span<cuio_json::SymbolT const>{
+    d_scalar.data(), static_cast<size_t>(d_scalar.size())};
+
+  cudf::io::json_reader_options const options{};
+
+  // Parse the JSON and get the token stream
+  const auto [tokens_gpu, token_indices_gpu] =
+    cudf::io::json::detail::get_token_stream(d_input, options, stream);
+
+  // Get the JSON's tree representation
+  auto gpu_tree = cuio_json::detail::get_tree_representation(tokens_gpu, token_indices_gpu, stream);
+  // host tree generation
+  auto cpu_tree =
+    cuio_json::test::get_tree_representation_cpu(tokens_gpu, token_indices_gpu, options, stream);
+  json_test::compare_trees(cpu_tree, gpu_tree);
+
+  // Print tree representation
+  if (std::getenv("NJP_DEBUG_DUMP") != nullptr) { print_tree_representation(input, cpu_tree); }
+
+  // Golden sample of node categories
+  std::vector<cuio_json::node_t> golden_node_categories = {
+    cuio_json::NC_LIST, cuio_json::NC_STRUCT, cuio_json::NC_FN,     cuio_json::NC_STR,
+    cuio_json::NC_FN,   cuio_json::NC_LIST,   cuio_json::NC_VAL,    cuio_json::NC_VAL,
+    cuio_json::NC_VAL,  cuio_json::NC_FN,     cuio_json::NC_STR,    cuio_json::NC_FN,
+    cuio_json::NC_STR,  cuio_json::NC_FN,     cuio_json::NC_VAL,    cuio_json::NC_STRUCT,
+    cuio_json::NC_FN,   cuio_json::NC_STR,    cuio_json::NC_FN,     cuio_json::NC_LIST,
+    cuio_json::NC_VAL,  cuio_json::NC_STRUCT, cuio_json::NC_VAL,    cuio_json::NC_STRUCT,
+    cuio_json::NC_FN,   cuio_json::NC_LIST,   cuio_json::NC_STRUCT, cuio_json::NC_STRUCT,
+    cuio_json::NC_FN,   cuio_json::NC_STR,    cuio_json::NC_FN,     cuio_json::NC_STR,
+    cuio_json::NC_FN,   cuio_json::NC_VAL};
+
+  // Golden sample of node ids
+  // clang-format off
+  std::vector<cuio_json::NodeIndexT> golden_parent_node_ids = {
+    cuio_json::parent_node_sentinel, 0, 1, 2,
+    1, 4, 5, 5,
+    5, 1, 9, 1,
+    11, 1, 13, 0,
+    15, 16, 15, 18,
+    19, 19, 19, 19,
+    23, 24, 25, 25,
+    15, 28, 15, 30,
+    15, 32};
+  // clang-format on
+
+  // Golden sample of node levels
+  std::vector<cuio_json::TreeDepthT> golden_node_levels = {0, 1, 2, 3, 2, 3, 4, 4, 4, 2, 3, 2,
+                                                           3, 2, 3, 1, 2, 3, 2, 3, 4, 4, 4, 4,
+                                                           5, 6, 7, 7, 2, 3, 2, 3, 2, 3};
+
+  // Golden sample of the character-ranges from the original input that each node demarcates
+  std::vector<cuio_json::SymbolOffsetT> golden_node_range_begin = {
+    2,   3,   5,   16,  29,  38,  39,  41,  44,  49,  58,  72,  80,  108, 116, 124, 126,
+    137, 150, 158, 159, 161, 164, 169, 171, 174, 175, 180, 189, 198, 212, 220, 255, 263};
+
+  // Golden sample of the character-ranges from the original input that each node demarcates
+  std::vector<cuio_json::SymbolOffsetT> golden_node_range_end = {
+    3,   4,   13,  27,  35,  39,  40,  43,  46,  55,  70,  77,  106, 113, 120, 125, 134,
+    148, 155, 159, 160, 162, 168, 170, 172, 175, 176, 181, 195, 210, 217, 253, 260, 267};
+
+  // Check results against golden samples
+  ASSERT_EQ(golden_node_categories.size(), cpu_tree.node_categories.size());
+  ASSERT_EQ(golden_parent_node_ids.size(), cpu_tree.parent_node_ids.size());
+  ASSERT_EQ(golden_node_levels.size(), cpu_tree.node_levels.size());
+  ASSERT_EQ(golden_node_range_begin.size(), cpu_tree.node_range_begin.size());
+  ASSERT_EQ(golden_node_range_end.size(), cpu_tree.node_range_end.size());
+
+  for (std::size_t i = 0; i < golden_node_categories.size(); i++) {
+    ASSERT_EQ(golden_node_categories[i], cpu_tree.node_categories[i]) << "[" << i << "]";
+    ASSERT_EQ(golden_parent_node_ids[i], cpu_tree.parent_node_ids[i]) << "[" << i << "]";
+    ASSERT_EQ(golden_node_levels[i], cpu_tree.node_levels[i]) << "[" << i << "]";
+    ASSERT_EQ(golden_node_range_begin[i], cpu_tree.node_range_begin[i]) << "[" << i << "]";
+    ASSERT_EQ(golden_node_range_end[i], cpu_tree.node_range_end[i]) << "[" << i << "]";
+  }
+}
+
+TEST_F(JsonTest, TreeRepresentation2)
+{
+  auto const stream = cudf::default_stream_value;
+  // Test input: value end with comma, space, close-brace ", }"
+  std::string const input =
+    //  0         1         2         3         4         5         6         7         8         9
+    //  0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890
+    R"([ {}, { "a": { "y" : 6, "z": [] }}, { "a" : { "x" : 8, "y": 9 }, "b" : {"x": 10 , "z": 11)"
+    "\n}}]";
+  // Prepare input & output buffers
+  cudf::string_scalar d_scalar(input, true, stream);
+  auto d_input = cudf::device_span<cuio_json::SymbolT const>{
+    d_scalar.data(), static_cast<size_t>(d_scalar.size())};
+
+  cudf::io::json_reader_options const options{};
+
+  // Parse the JSON and get the token stream
+  const auto [tokens_gpu, token_indices_gpu] =
+    cudf::io::json::detail::get_token_stream(d_input, options, stream);
+
+  // Get the JSON's tree representation
+  auto gpu_tree = cuio_json::detail::get_tree_representation(tokens_gpu, token_indices_gpu, stream);
+  // host tree generation
+  auto cpu_tree =
+    cuio_json::test::get_tree_representation_cpu(tokens_gpu, token_indices_gpu, options, stream);
+  json_test::compare_trees(cpu_tree, gpu_tree);
+
+  // Print tree representation
+  if (std::getenv("NJP_DEBUG_DUMP") != nullptr) { print_tree_representation(input, cpu_tree); }
+
+  // Golden sample of node categories
+  // clang-format off
+  std::vector<cuio_json::node_t> golden_node_categories = {
+    cuio_json::NC_LIST, cuio_json::NC_STRUCT,
+    cuio_json::NC_STRUCT, cuio_json::NC_FN, cuio_json::NC_STRUCT, cuio_json::NC_FN, cuio_json::NC_VAL, cuio_json::NC_FN, cuio_json::NC_LIST,
+    cuio_json::NC_STRUCT, cuio_json::NC_FN, cuio_json::NC_STRUCT, cuio_json::NC_FN, cuio_json::NC_VAL, cuio_json::NC_FN, cuio_json::NC_VAL,
+    cuio_json::NC_FN, cuio_json::NC_STRUCT, cuio_json::NC_FN, cuio_json::NC_VAL, cuio_json::NC_FN, cuio_json::NC_VAL};
+
+  // Golden sample of node ids
+  std::vector<cuio_json::NodeIndexT> golden_parent_node_ids = {
+    cuio_json::parent_node_sentinel, 0,
+    0, 2, 3, 4, 5, 4, 7,
+    0, 9, 10, 11, 12, 11, 14,
+    9, 16, 17, 18, 17, 20};
+  // clang-format on
+
+  // Golden sample of node levels
+  std::vector<cuio_json::TreeDepthT> golden_node_levels = {
+    0, 1, 1, 2, 3, 4, 5, 4, 5, 1, 2, 3, 4, 5, 4, 5, 2, 3, 4, 5, 4, 5,
+  };
+
+  // Golden sample of the character-ranges from the original input that each node demarcates
+  std::vector<cuio_json::SymbolOffsetT> golden_node_range_begin = {0,  2,  6,  9,  13, 16, 21, 25, 29, 36, 39,
+                                                                   44, 47, 52, 56, 60, 66, 71, 73, 77, 83, 87};
+
+  // Golden sample of the character-ranges from the original input that each node demarcates
+  std::vector<cuio_json::SymbolOffsetT> golden_node_range_end = {1,  3,  7,  10, 14, 17, 22, 26, 30, 37, 40,
+                                                                 45, 48, 53, 57, 61, 67, 72, 74, 79, 84, 89};
+
+  // Check results against golden samples
+  ASSERT_EQ(golden_node_categories.size(), cpu_tree.node_categories.size());
+  ASSERT_EQ(golden_parent_node_ids.size(), cpu_tree.parent_node_ids.size());
+  ASSERT_EQ(golden_node_levels.size(), cpu_tree.node_levels.size());
+  ASSERT_EQ(golden_node_range_begin.size(), cpu_tree.node_range_begin.size());
+  ASSERT_EQ(golden_node_range_end.size(), cpu_tree.node_range_end.size());
+
+  for (std::size_t i = 0; i < golden_node_categories.size(); i++) {
+    ASSERT_EQ(golden_node_categories[i], cpu_tree.node_categories[i]);
+    ASSERT_EQ(golden_parent_node_ids[i], cpu_tree.parent_node_ids[i]);
+    ASSERT_EQ(golden_node_levels[i], cpu_tree.node_levels[i]);
+    ASSERT_EQ(golden_node_range_begin[i], cpu_tree.node_range_begin[i]);
+    ASSERT_EQ(golden_node_range_end[i], cpu_tree.node_range_end[i]);
+  }
+}
+
+TEST_F(JsonTest, TreeRepresentation3)
+{
+  auto const stream = cudf::default_stream_value;
+  // Test input: JSON lines with the same input as TreeRepresentation2
+  std::string const input =
+    R"( {}
+ { "a": { "y" : 6, "z": [] }}
+ { "a" : { "x" : 8, "y": 9 }, "b" : {"x": 10 , "z": 11 }} )";  // Prepare input & output buffers
+  cudf::string_scalar d_scalar(input, true, stream);
+  auto d_input = cudf::device_span<cuio_json::SymbolT const>{
+    d_scalar.data(), static_cast<size_t>(d_scalar.size())};
+
+  cudf::io::json_reader_options options{};
+  options.enable_lines(true);
+
+  // Parse the JSON and get the token stream
+  const auto [tokens_gpu, token_indices_gpu] =
+    cudf::io::json::detail::get_token_stream(d_input, options, stream);
+
+  // Get the JSON's tree representation
+  auto gpu_tree = cuio_json::detail::get_tree_representation(tokens_gpu, token_indices_gpu, stream);
+  // host tree generation
+  auto cpu_tree =
+    cuio_json::test::get_tree_representation_cpu(tokens_gpu, token_indices_gpu, options, stream);
+  json_test::compare_trees(cpu_tree, gpu_tree);
+
+  // Print tree representation
+  if (std::getenv("NJP_DEBUG_DUMP") != nullptr) { print_tree_representation(input, cpu_tree); }
+}
+
+/**
+ * @brief Test fixture for parametrized JSON tree traversal tests
+ */
+struct JsonTreeTraversalTest : public cudf::test::BaseFixture,
+                               public testing::WithParamInterface<std::tuple<bool, std::string>> {
+};
+
+//
+std::vector<std::string> json_list = {
+  "value",
+  "\"string\"",
+  "[1, 2, 3]",
+  R"({"a": 1, "b": 2, "c": 3})",
+  // input a: {x:i, y:i, z:[]}, b: {x:i, z:i}
+  R"([ {}, { "a": { "y" : 6, "z": [] }}, { "a" : { "x" : 8, "y": 9}, "b" : {"x": 10, "z": 11}}])",
+  // input a: {x:i, y:i, z:[]}, b: {x:i, z: {p: i, q: i}}
+  R"([ {}, { "a": { "y" : 1, "z": [] }},
+ { "a": { "x" : 2, "y": 3}, "b" : {"x": 4, "z": [ {"p": 1, "q": 2}]}},
+ { "a": { "y" : 6, "z": [7, 8, 9]}, "b": {"x": 10, "z": [{}, {"q": 3}, {"p": 4}]}},
+ { "a": { "z": [12, 13, 14, 15]}},
+ { "a": { "z": [16], "x": 2}}
+ ])"
+  //^row offset  a  a.x  a.y  a.z  b  b.x  b.z
+  // 1           1       1
+  // 2           2  2    2    2    2  2    b.z[] 0  b.z.p 0, b.z.q 0
+  // 3           3  3    3    3    3  3    a.z[] 0, 1, 2  b.z[] 1, 2, 3  b.z.q 2, b.z.p 3
+  // 4           4            a.z[] 3, 4, 5, 6
+  // 5           5       5    a.z[] 7
+};
+
+std::vector<std::string> json_lines_list = {
+  // Test input a: {x:i, y:i, z:[]}, b: {x:i, z:i} with JSON-lines
+  R"( {}
+ { "a": { "y" : 6, "z": [] }}
+ { "a": { "y" : 6, "z": [2, 3, 4, 5] }}
+ { "a": { "z": [4], "y" : 6 }}
+ { "a" : { "x" : 8, "y": 9 }, "b" : {"x": 10 , "z": 11 }} )"};
+INSTANTIATE_TEST_SUITE_P(Mixed_And_Records,
+                         JsonTreeTraversalTest,
+                         ::testing::Combine(::testing::Values(false),
+                                            ::testing::ValuesIn(json_list)));
+INSTANTIATE_TEST_SUITE_P(JsonLines,
+                         JsonTreeTraversalTest,
+                         ::testing::Combine(::testing::Values(true),
+                                            ::testing::ValuesIn(json_lines_list)));
+
+TEST_P(JsonTreeTraversalTest, CPUvsGPUTraversal)
+{
+  auto [json_lines, input] = GetParam();
+  auto stream              = cudf::default_stream_value;
+  cudf::io::json_reader_options options{};
+  options.enable_lines(json_lines);
+
+  // std::cout << json_lines << input << std::endl;
+  cudf::string_scalar d_scalar(input, true, stream);
+  auto d_input = cudf::device_span<cuio_json::SymbolT const>{
+    d_scalar.data(), static_cast<size_t>(d_scalar.size())};
+
+  // Parse the JSON and get the token stream
+  const auto [tokens_gpu, token_indices_gpu] =
+    cudf::io::json::detail::get_token_stream(d_input, options, stream);
+  // host tree generation
+  auto cpu_tree =
+    cuio_json::test::get_tree_representation_cpu(tokens_gpu, token_indices_gpu, options, stream);
+  // host tree traversal
+  auto [cpu_col_id, cpu_row_offsets] =
+    cuio_json::test::records_orient_tree_traversal_cpu(input, cpu_tree, stream);
+  // gpu tree generation
+  auto gpu_tree = cuio_json::detail::get_tree_representation(tokens_gpu, token_indices_gpu, stream);
+  // Print tree representation
+  if (std::getenv("NJP_DEBUG_DUMP") != nullptr) {
+    printf("BEFORE traversal (gpu_tree):\n");
+    json_test::print_tree(gpu_tree);
+  }
+  // gpu tree traversal
+  auto [gpu_col_id, gpu_row_offsets] =
+    cuio_json::detail::records_orient_tree_traversal(d_input, gpu_tree, stream);
+  // Print tree representation
+  if (std::getenv("NJP_DEBUG_DUMP") != nullptr) {
+    printf("AFTER traversal (gpu_tree):\n");
+    json_test::print_tree(gpu_tree);
+  }
+
+  // convert to a sequence because the gpu col ids may be assigned in random order
+  auto gpu_col_id2 =
+    json_test::translate_col_id(cudf::detail::make_std_vector_async(gpu_col_id, stream));
+  EXPECT_FALSE(json_test::compare_vector(cpu_col_id, gpu_col_id2, "col_id"));
+  EXPECT_FALSE(json_test::compare_vector(cpu_row_offsets, gpu_row_offsets, "row_offsets"));
+}
diff --git a/cpp/tests/io/json_type_cast_test.cu b/cpp/tests/io/json_type_cast_test.cu
index b5c97a5f6c2..43702f1f7e7 100644
--- a/cpp/tests/io/json_type_cast_test.cu
+++ b/cpp/tests/io/json_type_cast_test.cu
@@ -70,7 +70,7 @@ TEST_F(JSONTypeCastTest, String)
   auto d_column = cudf::column_device_view::create(input);
   rmm::device_uvector> svs(d_column->size(), stream);
 
-  thrust::transform(thrust::device,
+  thrust::transform(rmm::exec_policy(cudf::default_stream_value),
                     d_column->pair_begin(),
                     d_column->pair_end(),
                     svs.begin(),
@@ -100,7 +100,7 @@ TEST_F(JSONTypeCastTest, Int)
   cudf::test::strings_column_wrapper data({"1", "null", "3", "true", "5", "false"});
   auto d_column = cudf::column_device_view::create(data);
   rmm::device_uvector> svs(d_column->size(), stream);
 
-  thrust::transform(thrust::device,
+  thrust::transform(rmm::exec_policy(cudf::default_stream_value),
                     d_column->pair_begin(),
                     d_column->pair_end(),
                     svs.begin(),
@@ -137,7 +137,7 @@ TEST_F(JSONTypeCastTest, StringEscapes)
   });
   auto d_column = cudf::column_device_view::create(data);
   rmm::device_uvector> svs(d_column->size(), stream);
 
-  thrust::transform(thrust::device,
+  thrust::transform(rmm::exec_policy(cudf::default_stream_value),
                     d_column->pair_begin(),
                     d_column->pair_end(),
                     svs.begin(),
diff --git 
a/cpp/tests/io/nested_json_test.cpp b/cpp/tests/io/nested_json_test.cpp index f0ececaf4eb..bcfde4eedeb 100644 --- a/cpp/tests/io/nested_json_test.cpp +++ b/cpp/tests/io/nested_json_test.cpp @@ -21,7 +21,9 @@ #include #include #include +#include #include +#include #include #include @@ -137,7 +139,7 @@ TEST_F(JsonTest, StackContext) using StackSymbolT = char; // Prepare cuda stream for data transfers & kernels - constexpr auto stream = cudf::default_stream_value; + auto const stream = cudf::default_stream_value; // Test input std::string const input = R"( [{)" @@ -156,15 +158,11 @@ TEST_F(JsonTest, StackContext) R"(}] )"; // Prepare input & output buffers - rmm::device_uvector d_input(input.size(), stream); + cudf::string_scalar const d_scalar(input, true, stream); + auto const d_input = + cudf::device_span{d_scalar.data(), static_cast(d_scalar.size())}; hostdevice_vector stack_context(input.size(), stream); - ASSERT_CUDA_SUCCEEDED(cudaMemcpyAsync(d_input.data(), - input.data(), - input.size() * sizeof(SymbolT), - cudaMemcpyHostToDevice, - stream.value())); - // Run algorithm cuio_json::detail::get_stack_context(d_input, stack_context.device_ptr(), stream); @@ -174,7 +172,7 @@ TEST_F(JsonTest, StackContext) // Make sure we copied back the stack context stream.synchronize(); - std::vector golden_stack_context{ + std::vector const golden_stack_context{ '_', '_', '_', '[', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '[', '[', '[', '[', '[', '[', '[', '[', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', @@ -202,21 +200,17 @@ TEST_F(JsonTest, StackContextUtf8) using StackSymbolT = char; // Prepare cuda stream for data transfers & kernels - constexpr auto stream = cudf::default_stream_value; + auto const stream = cudf::default_stream_value; // Test input std::string const input = R"([{"a":{"year":1882,"author": "Bharathi"}, {"a":"filip ʒakotɛ"}}])"; // Prepare input & output buffers - rmm::device_uvector d_input(input.size(), stream); + cudf::string_scalar const d_scalar(input, true, stream); + auto const d_input = + cudf::device_span{d_scalar.data(), static_cast(d_scalar.size())}; hostdevice_vector stack_context(input.size(), stream); - ASSERT_CUDA_SUCCEEDED(cudaMemcpyAsync(d_input.data(), - input.data(), - input.size() * sizeof(SymbolT), - cudaMemcpyHostToDevice, - stream.value())); - // Run algorithm cuio_json::detail::get_stack_context(d_input, stack_context.device_ptr(), stream); @@ -226,7 +220,7 @@ TEST_F(JsonTest, StackContextUtf8) // Make sure we copied back the stack context stream.synchronize(); - std::vector golden_stack_context{ + std::vector const golden_stack_context{ '_', '[', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', '{', @@ -241,13 +235,6 @@ TEST_F(JsonTest, TokenStream) using cuio_json::PdaTokenT; using cuio_json::SymbolOffsetT; using cuio_json::SymbolT; - - // Prepare cuda stream for data transfers & kernels - constexpr auto stream = cudf::default_stream_value; - - // Default parsing options - cudf::io::json_reader_options default_options{}; - // Test input std::string const input = R"( [{)" R"("category": "reference",)" @@ -264,54 +251,191 @@ TEST_F(JsonTest, TokenStream) R"("price": 8.95)" R"(}] )"; - // Prepare input & output buffers - 
rmm::device_uvector<SymbolT> d_input(input.size(), stream);
+  auto const stream = cudf::default_stream_value;
 
-  ASSERT_CUDA_SUCCEEDED(cudaMemcpyAsync(d_input.data(),
-                                        input.data(),
-                                        input.size() * sizeof(SymbolT),
-                                        cudaMemcpyHostToDevice,
-                                        stream.value()));
+  // Default parsing options
+  cudf::io::json_reader_options default_options{};
+
+  // Prepare input & output buffers
+  cudf::string_scalar const d_scalar(input, true, stream);
+  auto const d_input =
+    cudf::device_span<SymbolT const>{d_scalar.data(), static_cast<size_t>(d_scalar.size())};
 
   // Parse the JSON and get the token stream
-  const auto [d_tokens_gpu, d_token_indices_gpu] =
+  auto [d_tokens_gpu, d_token_indices_gpu] =
     cuio_json::detail::get_token_stream(d_input, default_options, stream);
-  // Copy back the number of tokens that were written
-  thrust::host_vector tokens_gpu =
+  thrust::host_vector<PdaTokenT> const tokens_gpu =
     cudf::detail::make_host_vector_async(d_tokens_gpu, stream);
-  thrust::host_vector token_indices_gpu =
+  thrust::host_vector<SymbolOffsetT> const token_indices_gpu =
     cudf::detail::make_host_vector_async(d_token_indices_gpu, stream);
 
-  // Make sure we copied back all relevant data
-  stream.synchronize();
+  // Golden token stream sample
+  using token_t = cuio_json::token_t;
+  std::vector<std::pair<std::size_t, cuio_json::PdaTokenT>> const golden_token_stream = {
+    {2, token_t::ListBegin},
+    {3, token_t::StructBegin},
+    {4, token_t::StructMemberBegin},
+    {4, token_t::FieldNameBegin},
+    {13, token_t::FieldNameEnd},
+    {16, token_t::StringBegin},
+    {26, token_t::StringEnd},
+    {27, token_t::StructMemberEnd},
+    {28, token_t::StructMemberBegin},
+    {28, token_t::FieldNameBegin},
+    {35, token_t::FieldNameEnd},
+    {38, token_t::ListBegin},
+    {39, token_t::ValueBegin},
+    {40, token_t::ValueEnd},
+    {41, token_t::ValueBegin},
+    {43, token_t::ValueEnd},
+    {44, token_t::ValueBegin},
+    {46, token_t::ValueEnd},
+    {46, token_t::ListEnd},
+    {47, token_t::StructMemberEnd},
+    {48, token_t::StructMemberBegin},
+    {48, token_t::FieldNameBegin},
+    {55, token_t::FieldNameEnd},
+    {58, token_t::StringBegin},
+    {69, token_t::StringEnd},
+    {70, token_t::StructMemberEnd},
+    {71, token_t::StructMemberBegin},
+    {71, token_t::FieldNameBegin},
+    {77, token_t::FieldNameEnd},
+    {80, token_t::StringBegin},
+    {105, token_t::StringEnd},
+    {106, token_t::StructMemberEnd},
+    {107, token_t::StructMemberBegin},
+    {107, token_t::FieldNameBegin},
+    {113, token_t::FieldNameEnd},
+    {116, token_t::ValueBegin},
+    {120, token_t::ValueEnd},
+    {120, token_t::StructMemberEnd},
+    {120, token_t::StructEnd},
+    {124, token_t::StructBegin},
+    {125, token_t::StructMemberBegin},
+    {125, token_t::FieldNameBegin},
+    {134, token_t::FieldNameEnd},
+    {137, token_t::StringBegin},
+    {147, token_t::StringEnd},
+    {148, token_t::StructMemberEnd},
+    {149, token_t::StructMemberBegin},
+    {149, token_t::FieldNameBegin},
+    {155, token_t::FieldNameEnd},
+    {158, token_t::ListBegin},
+    {159, token_t::ValueBegin},
+    {160, token_t::ValueEnd},
+    {161, token_t::StructBegin},
+    {162, token_t::StructEnd},
+    {164, token_t::ValueBegin},
+    {168, token_t::ValueEnd},
+    {169, token_t::StructBegin},
+    {170, token_t::StructMemberBegin},
+    {170, token_t::FieldNameBegin},
+    {172, token_t::FieldNameEnd},
+    {174, token_t::ListBegin},
+    {175, token_t::StructBegin},
+    {177, token_t::StructEnd},
+    {180, token_t::StructBegin},
+    {181, token_t::StructEnd},
+    {182, token_t::ListEnd},
+    {184, token_t::StructMemberEnd},
+    {184, token_t::StructEnd},
+    {186, token_t::ListEnd},
+    {187, token_t::StructMemberEnd},
+    {188, token_t::StructMemberBegin},
+    {188, token_t::FieldNameBegin},
+    {195, token_t::FieldNameEnd},
+    {198, token_t::StringBegin},
+    {209, token_t::StringEnd},
+    {210, token_t::StructMemberEnd},
+    {211, token_t::StructMemberBegin},
+    {211, token_t::FieldNameBegin},
+    {217, token_t::FieldNameEnd},
+    {220, token_t::StringBegin},
+    {252, token_t::StringEnd},
+    {253, token_t::StructMemberEnd},
+    {254, token_t::StructMemberBegin},
+    {254, token_t::FieldNameBegin},
+    {260, token_t::FieldNameEnd},
+    {263, token_t::ValueBegin},
+    {267, token_t::ValueEnd},
+    {267, token_t::StructMemberEnd},
+    {267, token_t::StructEnd},
+    {268, token_t::ListEnd}};
+
+  // Verify the number of tokens matches
+  ASSERT_EQ(golden_token_stream.size(), tokens_gpu.size());
+  ASSERT_EQ(golden_token_stream.size(), token_indices_gpu.size());
+
+  for (std::size_t i = 0; i < tokens_gpu.size(); i++) {
+    // Ensure the index the tokens are pointing to does match
+    EXPECT_EQ(golden_token_stream[i].first, token_indices_gpu[i]) << "Mismatch at #" << i;
+
+    // Ensure the token category is correct
+    EXPECT_EQ(golden_token_stream[i].second, tokens_gpu[i]) << "Mismatch at #" << i;
+  }
+}
+
+TEST_F(JsonTest, TokenStream2)
+{
+  using cuio_json::PdaTokenT;
+  using cuio_json::SymbolOffsetT;
+  using cuio_json::SymbolT;
+  // value end with comma, space, close-brace ", }"
+  std::string const input =
+    R"([ {}, { "a": { "y" : 6, "z": [] }}, { "a" : { "x" : 8, "y": 9}, "b" : {"x": 10 , "z": 11)"
+    "\n}}]";
+
+  auto const stream = cudf::default_stream_value;
+
+  // Default parsing options
+  cudf::io::json_reader_options default_options{};
+
+  // Prepare input & output buffers
+  cudf::string_scalar const d_scalar(input, true, stream);
+  auto const d_input =
+    cudf::device_span<SymbolT const>{d_scalar.data(), static_cast<size_t>(d_scalar.size())};
+
+  // Parse the JSON and get the token stream
+  auto [d_tokens_gpu, d_token_indices_gpu] =
+    cuio_json::detail::get_token_stream(d_input, default_options, stream);
+  // Copy back the number of tokens that were written
+  thrust::host_vector<PdaTokenT> const tokens_gpu =
+    cudf::detail::make_host_vector_async(d_tokens_gpu, stream);
+  thrust::host_vector<SymbolOffsetT> const token_indices_gpu =
+    cudf::detail::make_host_vector_async(d_token_indices_gpu, stream);
 
   // Golden token stream sample
   using token_t = cuio_json::token_t;
-  std::vector<std::pair<std::size_t, cuio_json::PdaTokenT>> golden_token_stream = {
-    {2, token_t::ListBegin},       {3, token_t::StructBegin},     {4, token_t::FieldNameBegin},
-    {13, token_t::FieldNameEnd},   {16, token_t::StringBegin},    {26, token_t::StringEnd},
-    {28, token_t::FieldNameBegin}, {35, token_t::FieldNameEnd},   {38, token_t::ListBegin},
-    {39, token_t::ValueBegin},     {40, token_t::ValueEnd},       {41, token_t::ValueBegin},
-    {43, token_t::ValueEnd},       {44, token_t::ValueBegin},     {46, token_t::ValueEnd},
-    {46, token_t::ListEnd},        {48, token_t::FieldNameBegin}, {55, token_t::FieldNameEnd},
-    {58, token_t::StringBegin},    {69, token_t::StringEnd},      {71, token_t::FieldNameBegin},
-    {77, token_t::FieldNameEnd},   {80, token_t::StringBegin},    {105, token_t::StringEnd},
-    {107, token_t::FieldNameBegin}, {113, token_t::FieldNameEnd}, {116, token_t::ValueBegin},
-    {120, token_t::ValueEnd},      {120, token_t::StructEnd},     {124, token_t::StructBegin},
-    {125, token_t::FieldNameBegin}, {134, token_t::FieldNameEnd}, {137, token_t::StringBegin},
-    {147, token_t::StringEnd},     {149, token_t::FieldNameBegin}, {155, token_t::FieldNameEnd},
-    {158, token_t::ListBegin},     {159, token_t::ValueBegin},    {160, token_t::ValueEnd},
-    {161, token_t::StructBegin},   {162, token_t::StructEnd},     {164, token_t::ValueBegin},
-    {168, token_t::ValueEnd},      {169, token_t::StructBegin},   {170, token_t::FieldNameBegin},
-    {172, token_t::FieldNameEnd},  {174, token_t::ListBegin},     {175, token_t::StructBegin},
-    {177, token_t::StructEnd},     {180, token_t::StructBegin},   {181, token_t::StructEnd},
-    {182, token_t::ListEnd},       {184, token_t::StructEnd},     {186, token_t::ListEnd},
-    {188, token_t::FieldNameBegin}, {195, token_t::FieldNameEnd}, {198, token_t::StringBegin},
-    {209, token_t::StringEnd},     {211, token_t::FieldNameBegin}, {217, token_t::FieldNameEnd},
-    {220, token_t::StringBegin},   {252, token_t::StringEnd},     {254, token_t::FieldNameBegin},
-    {260, token_t::FieldNameEnd},  {263, token_t::ValueBegin},    {267, token_t::ValueEnd},
-    {267, token_t::StructEnd},     {268, token_t::ListEnd}};
+  // clang-format off
+  std::vector<std::pair<std::size_t, cuio_json::PdaTokenT>> const golden_token_stream = {
+    {0, token_t::ListBegin},
+    {2, token_t::StructBegin}, {3, token_t::StructEnd}, //{}
+    {6, token_t::StructBegin},
+    {8, token_t::StructMemberBegin}, {8, token_t::FieldNameBegin}, {10, token_t::FieldNameEnd}, //a
+    {13, token_t::StructBegin},
+    {15, token_t::StructMemberBegin}, {15, token_t::FieldNameBegin}, {17, token_t::FieldNameEnd}, {21, token_t::ValueBegin}, {22, token_t::ValueEnd}, {22, token_t::StructMemberEnd}, //a.y
+    {24, token_t::StructMemberBegin}, {24, token_t::FieldNameBegin}, {26, token_t::FieldNameEnd}, {29, token_t::ListBegin}, {30, token_t::ListEnd}, {32, token_t::StructMemberEnd}, //a.z
+    {32, token_t::StructEnd},
+    {33, token_t::StructMemberEnd},
+    {33, token_t::StructEnd},
+    {36, token_t::StructBegin},
+    {38, token_t::StructMemberBegin}, {38, token_t::FieldNameBegin}, {40, token_t::FieldNameEnd}, //a
+    {44, token_t::StructBegin},
+    {46, token_t::StructMemberBegin}, {46, token_t::FieldNameBegin}, {48, token_t::FieldNameEnd}, {52, token_t::ValueBegin}, {53, token_t::ValueEnd}, {53, token_t::StructMemberEnd}, //a.x
+    {55, token_t::StructMemberBegin}, {55, token_t::FieldNameBegin}, {57, token_t::FieldNameEnd}, {60, token_t::ValueBegin}, {61, token_t::ValueEnd}, {61, token_t::StructMemberEnd}, //a.y
+    {61, token_t::StructEnd},
+    {62, token_t::StructMemberEnd},
+    {64, token_t::StructMemberBegin}, {64, token_t::FieldNameBegin}, {66, token_t::FieldNameEnd}, //b
+    {70, token_t::StructBegin},
+    {71, token_t::StructMemberBegin}, {71, token_t::FieldNameBegin}, {73, token_t::FieldNameEnd}, {76, token_t::ValueBegin}, {78, token_t::ValueEnd}, {79, token_t::StructMemberEnd}, //b.x
+    {81, token_t::StructMemberBegin}, {81, token_t::FieldNameBegin}, {83, token_t::FieldNameEnd}, {86, token_t::ValueBegin}, {88, token_t::ValueEnd}, {89, token_t::StructMemberEnd}, //b.z
+    {89, token_t::StructEnd},
+    {90, token_t::StructMemberEnd},
+    {90, token_t::StructEnd},
+    {91, token_t::ListEnd}};
+  // clang-format on
 
   // Verify the number of tokens matches
   ASSERT_EQ(golden_token_stream.size(), tokens_gpu.size());
@@ -331,7 +455,7 @@ TEST_F(JsonTest, ExtractColumn)
   using cuio_json::SymbolT;
 
   // Prepare cuda stream for data transfers & kernels
-  constexpr auto stream = cudf::default_stream_value;
+  auto const stream = cudf::default_stream_value;
 
   // Default parsing options
   cudf::io::json_reader_options default_options{};
@@ -346,8 +470,10 @@ TEST_F(JsonTest, ExtractColumn)
   auto const second_column_index = 1;
   EXPECT_EQ(cudf_table.tbl->num_columns(), expected_col_count);
 
-  auto expected_col1 = cudf::test::strings_column_wrapper({"0.0", "0.1", "0.2"});
-  auto expected_col2 = cudf::test::strings_column_wrapper({"1.0", "1.1", "1.2"});
+  auto expected_col1 =
+    cudf::test::fixed_width_column_wrapper<double>({0.0, 0.1, 0.2}, {true, true, true});
+  auto expected_col2 =
+
cudf::test::fixed_width_column_wrapper({1.0, 1.1, 1.2}, {true, true, true}); cudf::column_view parsed_col1 = cudf_table.tbl->get_column(first_column_index); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_col1, parsed_col1); cudf::column_view parsed_col2 = cudf_table.tbl->get_column(second_column_index); @@ -357,7 +483,7 @@ TEST_F(JsonTest, ExtractColumn) TEST_F(JsonTest, UTF_JSON) { // Prepare cuda stream for data transfers & kernels - constexpr auto stream = cudf::default_stream_value; + auto const stream = cudf::default_stream_value; // Default parsing options cudf::io::json_reader_options default_options{}; @@ -395,110 +521,65 @@ TEST_F(JsonTest, UTF_JSON) CUDF_EXPECT_NO_THROW(cuio_json::detail::parse_nested_json(utf_pass, default_options, stream)); } -TEST_F(JsonTest, FromParquet) +TEST_F(JsonTest, ExtractColumnWithQuotes) { using cuio_json::SymbolT; - std::string const input = - R"([{"0":{},"1":[],"2":{}},{"1":[[""],[]],"2":{"2":""}},{"0":{"a":"1"},"2":{"0":"W&RR=+I","1":""}}])"; - // Prepare cuda stream for data transfers & kernels - constexpr auto stream = cudf::default_stream_value; + auto const stream = cudf::default_stream_value; // Default parsing options - cudf::io::json_reader_options default_options{}; + cudf::io::json_reader_options options{}; + options.enable_keep_quotes(true); - // Binary parquet data containing the same data as the data represented by the JSON string. - // We could add a dataset to include this file, but we don't want tests in cudf to have data. - const unsigned char parquet_data[] = { - 0x50, 0x41, 0x52, 0x31, 0x15, 0x00, 0x15, 0x18, 0x15, 0x18, 0x2C, 0x15, 0x06, 0x15, 0x00, 0x15, - 0x06, 0x15, 0x06, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x21, 0x00, 0x01, 0x00, 0x00, 0x00, - 0x31, 0x15, 0x00, 0x15, 0x24, 0x15, 0x20, 0x2C, 0x15, 0x08, 0x15, 0x00, 0x15, 0x06, 0x15, 0x06, - 0x00, 0x00, 0x12, 0x18, 0x03, 0x00, 0x00, 0x00, 0x03, 0x10, 0x00, 0x05, 0x07, 0x04, 0x2D, 0x00, - 0x01, 0x01, 0x15, 0x00, 0x15, 0x22, 0x15, 0x22, 0x2C, 0x15, 0x06, 0x15, 0x00, 0x15, 0x06, 0x15, - 0x06, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x03, 0x04, 0x07, 0x00, 0x00, 0x00, 0x57, 0x26, 0x52, - 0x52, 0x3D, 0x2B, 0x49, 0x15, 0x00, 0x15, 0x14, 0x15, 0x14, 0x2C, 0x15, 0x06, 0x15, 0x00, 0x15, - 0x06, 0x15, 0x06, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x03, 0x04, 0x00, 0x00, 0x00, 0x00, 0x15, - 0x00, 0x15, 0x14, 0x15, 0x14, 0x2C, 0x15, 0x06, 0x15, 0x00, 0x15, 0x06, 0x15, 0x06, 0x00, 0x00, - 0x02, 0x00, 0x00, 0x00, 0x03, 0x02, 0x00, 0x00, 0x00, 0x00, 0x15, 0x02, 0x19, 0xCC, 0x48, 0x06, - 0x73, 0x63, 0x68, 0x65, 0x6D, 0x61, 0x15, 0x06, 0x00, 0x35, 0x02, 0x18, 0x01, 0x30, 0x15, 0x02, - 0x00, 0x15, 0x0C, 0x25, 0x02, 0x18, 0x01, 0x61, 0x25, 0x00, 0x00, 0x35, 0x02, 0x18, 0x01, 0x31, - 0x15, 0x02, 0x15, 0x06, 0x00, 0x35, 0x04, 0x18, 0x04, 0x6C, 0x69, 0x73, 0x74, 0x15, 0x02, 0x00, - 0x35, 0x00, 0x18, 0x07, 0x65, 0x6C, 0x65, 0x6D, 0x65, 0x6E, 0x74, 0x15, 0x02, 0x15, 0x06, 0x00, - 0x35, 0x04, 0x18, 0x04, 0x6C, 0x69, 0x73, 0x74, 0x15, 0x02, 0x00, 0x15, 0x0C, 0x25, 0x00, 0x18, - 0x07, 0x65, 0x6C, 0x65, 0x6D, 0x65, 0x6E, 0x74, 0x25, 0x00, 0x00, 0x35, 0x00, 0x18, 0x01, 0x32, - 0x15, 0x06, 0x00, 0x15, 0x0C, 0x25, 0x02, 0x18, 0x01, 0x30, 0x25, 0x00, 0x00, 0x15, 0x0C, 0x25, - 0x02, 0x18, 0x01, 0x31, 0x25, 0x00, 0x00, 0x15, 0x0C, 0x25, 0x02, 0x18, 0x01, 0x32, 0x25, 0x00, - 0x00, 0x16, 0x06, 0x19, 0x1C, 0x19, 0x5C, 0x26, 0x00, 0x1C, 0x15, 0x0C, 0x19, 0x25, 0x00, 0x06, - 0x19, 0x28, 0x01, 0x30, 0x01, 0x61, 0x15, 0x00, 0x16, 0x06, 0x16, 0x3A, 0x16, 0x3A, 0x26, 0x08, - 0x3C, 0x36, 0x04, 0x28, 0x01, 0x31, 0x18, 0x01, 0x31, 
0x00, 0x00, 0x00, 0x26, 0x00, 0x1C, 0x15, - 0x0C, 0x19, 0x25, 0x00, 0x06, 0x19, 0x58, 0x01, 0x31, 0x04, 0x6C, 0x69, 0x73, 0x74, 0x07, 0x65, - 0x6C, 0x65, 0x6D, 0x65, 0x6E, 0x74, 0x04, 0x6C, 0x69, 0x73, 0x74, 0x07, 0x65, 0x6C, 0x65, 0x6D, - 0x65, 0x6E, 0x74, 0x15, 0x02, 0x16, 0x08, 0x16, 0x46, 0x16, 0x42, 0x26, 0x42, 0x3C, 0x36, 0x00, - 0x28, 0x00, 0x18, 0x00, 0x00, 0x00, 0x00, 0x26, 0x00, 0x1C, 0x15, 0x0C, 0x19, 0x25, 0x00, 0x06, - 0x19, 0x28, 0x01, 0x32, 0x01, 0x30, 0x15, 0x00, 0x16, 0x06, 0x16, 0x44, 0x16, 0x44, 0x26, 0x84, - 0x01, 0x3C, 0x36, 0x04, 0x28, 0x07, 0x57, 0x26, 0x52, 0x52, 0x3D, 0x2B, 0x49, 0x18, 0x07, 0x57, - 0x26, 0x52, 0x52, 0x3D, 0x2B, 0x49, 0x00, 0x00, 0x00, 0x26, 0x00, 0x1C, 0x15, 0x0C, 0x19, 0x25, - 0x00, 0x06, 0x19, 0x28, 0x01, 0x32, 0x01, 0x31, 0x15, 0x00, 0x16, 0x06, 0x16, 0x36, 0x16, 0x36, - 0x26, 0xC8, 0x01, 0x3C, 0x36, 0x04, 0x28, 0x00, 0x18, 0x00, 0x00, 0x00, 0x00, 0x26, 0x00, 0x1C, - 0x15, 0x0C, 0x19, 0x25, 0x00, 0x06, 0x19, 0x28, 0x01, 0x32, 0x01, 0x32, 0x15, 0x00, 0x16, 0x06, - 0x16, 0x36, 0x16, 0x36, 0x26, 0xFE, 0x01, 0x3C, 0x36, 0x04, 0x28, 0x00, 0x18, 0x00, 0x00, 0x00, - 0x00, 0x16, 0xAC, 0x02, 0x16, 0x06, 0x00, 0x19, 0x1C, 0x18, 0x06, 0x70, 0x61, 0x6E, 0x64, 0x61, - 0x73, 0x18, 0xFE, 0x04, 0x7B, 0x22, 0x69, 0x6E, 0x64, 0x65, 0x78, 0x5F, 0x63, 0x6F, 0x6C, 0x75, - 0x6D, 0x6E, 0x73, 0x22, 0x3A, 0x20, 0x5B, 0x7B, 0x22, 0x6B, 0x69, 0x6E, 0x64, 0x22, 0x3A, 0x20, - 0x22, 0x72, 0x61, 0x6E, 0x67, 0x65, 0x22, 0x2C, 0x20, 0x22, 0x6E, 0x61, 0x6D, 0x65, 0x22, 0x3A, - 0x20, 0x6E, 0x75, 0x6C, 0x6C, 0x2C, 0x20, 0x22, 0x73, 0x74, 0x61, 0x72, 0x74, 0x22, 0x3A, 0x20, - 0x30, 0x2C, 0x20, 0x22, 0x73, 0x74, 0x6F, 0x70, 0x22, 0x3A, 0x20, 0x33, 0x2C, 0x20, 0x22, 0x73, - 0x74, 0x65, 0x70, 0x22, 0x3A, 0x20, 0x31, 0x7D, 0x5D, 0x2C, 0x20, 0x22, 0x63, 0x6F, 0x6C, 0x75, - 0x6D, 0x6E, 0x5F, 0x69, 0x6E, 0x64, 0x65, 0x78, 0x65, 0x73, 0x22, 0x3A, 0x20, 0x5B, 0x7B, 0x22, - 0x6E, 0x61, 0x6D, 0x65, 0x22, 0x3A, 0x20, 0x6E, 0x75, 0x6C, 0x6C, 0x2C, 0x20, 0x22, 0x66, 0x69, - 0x65, 0x6C, 0x64, 0x5F, 0x6E, 0x61, 0x6D, 0x65, 0x22, 0x3A, 0x20, 0x6E, 0x75, 0x6C, 0x6C, 0x2C, - 0x20, 0x22, 0x70, 0x61, 0x6E, 0x64, 0x61, 0x73, 0x5F, 0x74, 0x79, 0x70, 0x65, 0x22, 0x3A, 0x20, - 0x22, 0x75, 0x6E, 0x69, 0x63, 0x6F, 0x64, 0x65, 0x22, 0x2C, 0x20, 0x22, 0x6E, 0x75, 0x6D, 0x70, - 0x79, 0x5F, 0x74, 0x79, 0x70, 0x65, 0x22, 0x3A, 0x20, 0x22, 0x6F, 0x62, 0x6A, 0x65, 0x63, 0x74, - 0x22, 0x2C, 0x20, 0x22, 0x6D, 0x65, 0x74, 0x61, 0x64, 0x61, 0x74, 0x61, 0x22, 0x3A, 0x20, 0x7B, - 0x22, 0x65, 0x6E, 0x63, 0x6F, 0x64, 0x69, 0x6E, 0x67, 0x22, 0x3A, 0x20, 0x22, 0x55, 0x54, 0x46, - 0x2D, 0x38, 0x22, 0x7D, 0x7D, 0x5D, 0x2C, 0x20, 0x22, 0x63, 0x6F, 0x6C, 0x75, 0x6D, 0x6E, 0x73, - 0x22, 0x3A, 0x20, 0x5B, 0x7B, 0x22, 0x6E, 0x61, 0x6D, 0x65, 0x22, 0x3A, 0x20, 0x22, 0x30, 0x22, - 0x2C, 0x20, 0x22, 0x66, 0x69, 0x65, 0x6C, 0x64, 0x5F, 0x6E, 0x61, 0x6D, 0x65, 0x22, 0x3A, 0x20, - 0x22, 0x30, 0x22, 0x2C, 0x20, 0x22, 0x70, 0x61, 0x6E, 0x64, 0x61, 0x73, 0x5F, 0x74, 0x79, 0x70, - 0x65, 0x22, 0x3A, 0x20, 0x22, 0x6F, 0x62, 0x6A, 0x65, 0x63, 0x74, 0x22, 0x2C, 0x20, 0x22, 0x6E, - 0x75, 0x6D, 0x70, 0x79, 0x5F, 0x74, 0x79, 0x70, 0x65, 0x22, 0x3A, 0x20, 0x22, 0x6F, 0x62, 0x6A, - 0x65, 0x63, 0x74, 0x22, 0x2C, 0x20, 0x22, 0x6D, 0x65, 0x74, 0x61, 0x64, 0x61, 0x74, 0x61, 0x22, - 0x3A, 0x20, 0x6E, 0x75, 0x6C, 0x6C, 0x7D, 0x2C, 0x20, 0x7B, 0x22, 0x6E, 0x61, 0x6D, 0x65, 0x22, - 0x3A, 0x20, 0x22, 0x31, 0x22, 0x2C, 0x20, 0x22, 0x66, 0x69, 0x65, 0x6C, 0x64, 0x5F, 0x6E, 0x61, - 0x6D, 0x65, 0x22, 0x3A, 0x20, 0x22, 0x31, 0x22, 0x2C, 0x20, 0x22, 0x70, 0x61, 
0x6E, 0x64, 0x61, - 0x73, 0x5F, 0x74, 0x79, 0x70, 0x65, 0x22, 0x3A, 0x20, 0x22, 0x6C, 0x69, 0x73, 0x74, 0x5B, 0x6C, - 0x69, 0x73, 0x74, 0x5B, 0x75, 0x6E, 0x69, 0x63, 0x6F, 0x64, 0x65, 0x5D, 0x5D, 0x22, 0x2C, 0x20, - 0x22, 0x6E, 0x75, 0x6D, 0x70, 0x79, 0x5F, 0x74, 0x79, 0x70, 0x65, 0x22, 0x3A, 0x20, 0x22, 0x6F, - 0x62, 0x6A, 0x65, 0x63, 0x74, 0x22, 0x2C, 0x20, 0x22, 0x6D, 0x65, 0x74, 0x61, 0x64, 0x61, 0x74, - 0x61, 0x22, 0x3A, 0x20, 0x6E, 0x75, 0x6C, 0x6C, 0x7D, 0x2C, 0x20, 0x7B, 0x22, 0x6E, 0x61, 0x6D, - 0x65, 0x22, 0x3A, 0x20, 0x22, 0x32, 0x22, 0x2C, 0x20, 0x22, 0x66, 0x69, 0x65, 0x6C, 0x64, 0x5F, - 0x6E, 0x61, 0x6D, 0x65, 0x22, 0x3A, 0x20, 0x22, 0x32, 0x22, 0x2C, 0x20, 0x22, 0x70, 0x61, 0x6E, - 0x64, 0x61, 0x73, 0x5F, 0x74, 0x79, 0x70, 0x65, 0x22, 0x3A, 0x20, 0x22, 0x6F, 0x62, 0x6A, 0x65, - 0x63, 0x74, 0x22, 0x2C, 0x20, 0x22, 0x6E, 0x75, 0x6D, 0x70, 0x79, 0x5F, 0x74, 0x79, 0x70, 0x65, - 0x22, 0x3A, 0x20, 0x22, 0x6F, 0x62, 0x6A, 0x65, 0x63, 0x74, 0x22, 0x2C, 0x20, 0x22, 0x6D, 0x65, - 0x74, 0x61, 0x64, 0x61, 0x74, 0x61, 0x22, 0x3A, 0x20, 0x6E, 0x75, 0x6C, 0x6C, 0x7D, 0x5D, 0x2C, - 0x20, 0x22, 0x63, 0x72, 0x65, 0x61, 0x74, 0x6F, 0x72, 0x22, 0x3A, 0x20, 0x7B, 0x22, 0x6C, 0x69, - 0x62, 0x72, 0x61, 0x72, 0x79, 0x22, 0x3A, 0x20, 0x22, 0x70, 0x79, 0x61, 0x72, 0x72, 0x6F, 0x77, - 0x22, 0x2C, 0x20, 0x22, 0x76, 0x65, 0x72, 0x73, 0x69, 0x6F, 0x6E, 0x22, 0x3A, 0x20, 0x22, 0x38, - 0x2E, 0x30, 0x2E, 0x31, 0x22, 0x7D, 0x2C, 0x20, 0x22, 0x70, 0x61, 0x6E, 0x64, 0x61, 0x73, 0x5F, - 0x76, 0x65, 0x72, 0x73, 0x69, 0x6F, 0x6E, 0x22, 0x3A, 0x20, 0x22, 0x31, 0x2E, 0x34, 0x2E, 0x33, - 0x22, 0x7D, 0x00, 0x29, 0x5C, 0x1C, 0x00, 0x00, 0x1C, 0x00, 0x00, 0x1C, 0x00, 0x00, 0x1C, 0x00, - 0x00, 0x1C, 0x00, 0x00, 0x00, 0x0B, 0x04, 0x00, 0x00, 0x50, 0x41, 0x52, 0x31}; - - // Read in the data via parquet reader - cudf::io::parquet_reader_options read_opts = cudf::io::parquet_reader_options::builder( - cudf::io::source_info{reinterpret_cast(parquet_data), sizeof(parquet_data)}); - auto result = cudf::io::read_parquet(read_opts); - - // Read in the data via the JSON parser + std::string const input = R"( [{"a":"0.0", "b":1.0}, {"b":1.1}, {"b":2.1, "a":"2.0"}] )"; + // Get the JSON's tree representation auto const cudf_table = cuio_json::detail::parse_nested_json( - cudf::host_span{input.data(), input.size()}, default_options, stream); + cudf::host_span{input.data(), input.size()}, options, stream); + + auto constexpr expected_col_count = 2; + EXPECT_EQ(cudf_table.tbl->num_columns(), expected_col_count); + + auto expected_col1 = + cudf::test::strings_column_wrapper({R"("0.0")", R"()", R"("2.0")"}, {true, false, true}); + auto expected_col2 = + cudf::test::fixed_width_column_wrapper({1.0, 1.1, 2.1}, {true, true, true}); + cudf::column_view parsed_col1 = cudf_table.tbl->get_column(0); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_col1, parsed_col1); + cudf::column_view parsed_col2 = cudf_table.tbl->get_column(1); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_col2, parsed_col2); +} - // Verify that the data read via parquet matches the data read via JSON - CUDF_TEST_EXPECT_TABLES_EQUAL(cudf_table.tbl->view(), result.tbl->view()); +TEST_F(JsonTest, ExpectFailMixStructAndList) +{ + using cuio_json::SymbolT; + + // Prepare cuda stream for data transfers & kernels + auto const stream = cudf::default_stream_value; - // Verify that the schema read via parquet matches the schema read via JSON - cudf::test::expect_metadata_equal(cudf_table.metadata, result.metadata); + // Default parsing options + cudf::io::json_reader_options options{}; + 
options.enable_keep_quotes(true); + + std::vector const inputs_fail{ + R"( [{"a":[123], "b":1.0}, {"b":1.1}, {"b":2.1, "a":{"0":123}}] )", + R"( [{"a":{"0":"foo"}, "b":1.0}, {"b":1.1}, {"b":2.1, "a":[123]}] )", + R"( [{"a":{"0":null}, "b":1.0}, {"b":1.1}, {"b":2.1, "a":[123]}] )"}; + + std::vector const inputs_succeed{ + R"( [{"a":[123, {"0": 123}], "b":1.0}, {"b":1.1}, {"b":2.1}] )", + R"( [{"a":[123, "123"], "b":1.0}, {"b":1.1}, {"b":2.1}] )"}; + + for (auto const& input : inputs_fail) { + CUDF_EXPECT_THROW_MESSAGE( + auto const cudf_table = cuio_json::detail::parse_nested_json( + cudf::host_span{input.data(), input.size()}, options, stream), + "A mix of lists and structs within the same column is not supported"); + } + + for (auto const& input : inputs_succeed) { + CUDF_EXPECT_NO_THROW( + auto const cudf_table = cuio_json::detail::parse_nested_json( + cudf::host_span{input.data(), input.size()}, options, stream)); + } } diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index 76ffc92e243..a658ed0a55d 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -1246,7 +1246,7 @@ TEST_F(OrcStatisticsTest, Overflow) not_too_small_seq, not_too_small_seq + num_rows, validity); table_view tbl({col1, col2, col3, col4}); - auto filepath = temp_env->get_temp_filepath("OrcStatsMerge.orc"); + auto filepath = temp_env->get_temp_filepath("OrcStatsOverflow.orc"); cudf_io::orc_writer_options out_opts = cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, tbl); @@ -1264,6 +1264,63 @@ TEST_F(OrcStatisticsTest, Overflow) check_sum_exist(3, true); check_sum_exist(4, true); } + +TEST_F(OrcStatisticsTest, HasNull) +{ + // cudf's ORC writer doesn't yet support the ability to encode the hasNull value in statistics so + // we're embedding a file created using pyorc + // + // Method to create file: + // >>> import pyorc + // >>> output = open("./temp.orc", "wb") + // >>> writer = pyorc.Writer(output, pyorc.Struct(a=pyorc.BigInt(), b=pyorc.BigInt())) + // >>> writer.write((1, 3)) + // >>> writer.write((2, 4)) + // >>> writer.write((None, 5)) + // >>> writer.close() + // + // Contents of file: + // >>> import pyarrow.orc as po + // >>> po.ORCFile('new.orc').read() + // pyarrow.Table + // a: int64 + // b: int64 + // ---- + // a: [[1,2,null]] + // b: [[3,4,5]] + auto nulls_orc = std::array{ + 0x4F, 0x52, 0x43, 0x1D, 0x00, 0x00, 0x0A, 0x0C, 0x0A, 0x04, 0x00, 0x00, 0x00, 0x00, 0x12, 0x04, + 0x08, 0x03, 0x50, 0x00, 0x2C, 0x00, 0x00, 0xE3, 0x12, 0xE7, 0x62, 0x67, 0x80, 0x00, 0x21, 0x1E, + 0x0E, 0x26, 0x21, 0x36, 0x0E, 0x26, 0x01, 0x16, 0x09, 0xB6, 0x00, 0x46, 0x00, 0x2C, 0x00, 0x00, + 0xE3, 0x12, 0xE7, 0x62, 0x67, 0x80, 0x00, 0x21, 0x1E, 0x0E, 0x66, 0x21, 0x36, 0x0E, 0x36, 0x01, + 0x2E, 0x09, 0x89, 0x00, 0x06, 0x00, 0x05, 0x00, 0x00, 0xFF, 0xE0, 0x05, 0x00, 0x00, 0xFF, 0xC0, + 0x07, 0x00, 0x00, 0x46, 0x01, 0x24, 0x05, 0x00, 0x00, 0xFF, 0xE0, 0x09, 0x00, 0x00, 0x46, 0x02, + 0x68, 0xA0, 0x68, 0x00, 0x00, 0xE3, 0x62, 0xE3, 0x60, 0x13, 0x60, 0x90, 0x10, 0xE4, 0x02, 0xD1, + 0x8C, 0x12, 0x92, 0x60, 0x9A, 0x09, 0x4C, 0x33, 0x00, 0xC5, 0x59, 0xC1, 0x34, 0x23, 0x98, 0x66, + 0x04, 0xD2, 0x6C, 0x60, 0x3E, 0x13, 0x94, 0xCF, 0x24, 0xC1, 0x2E, 0xC4, 0x02, 0x52, 0x07, 0x24, + 0x99, 0x60, 0xA4, 0x14, 0x73, 0x68, 0x88, 0x33, 0x00, 0x46, 0x00, 0x00, 0xE3, 0x52, 0xE2, 0x62, + 0xE1, 0x60, 0x0E, 0x60, 0xE0, 0xE2, 0xE1, 0x60, 0x12, 0x62, 0xE3, 0x60, 0x12, 0x60, 0x91, 0x60, + 0x0B, 0x60, 0x04, 0xF2, 0x98, 0x81, 0x3C, 0x36, 0x01, 0x2E, 0x09, 0x89, 0x00, 0x06, 0x00, 0xB4, + 0x00, 0x00, 0xE3, 0x60, 
0x16, 0x98, 0xC6, 0x28, 0xC5, 0xC5, 0xC1, 0x2C, 0xE0, 0x2C, 0x21, 0xA3, + 0x60, 0xAE, 0xC1, 0xAC, 0x24, 0xC4, 0xC1, 0x23, 0xC4, 0xC4, 0xC8, 0x24, 0xC5, 0x98, 0x28, 0xC5, + 0x98, 0xA4, 0xC0, 0xA0, 0xC1, 0x60, 0xC0, 0xA0, 0xC4, 0xC1, 0xC1, 0x82, 0xCE, 0x32, 0x60, 0xB6, + 0x62, 0xE1, 0x60, 0x0E, 0x60, 0xB0, 0xE2, 0xE1, 0x60, 0x12, 0x62, 0xE3, 0x60, 0x12, 0x60, 0x91, + 0x60, 0x0B, 0x60, 0x04, 0xF2, 0x98, 0x81, 0x3C, 0x36, 0x01, 0x2E, 0x09, 0x89, 0x00, 0x06, 0x87, + 0x09, 0x7E, 0x1E, 0x8C, 0x49, 0xAC, 0x86, 0x7A, 0xE6, 0x7A, 0xA6, 0x00, 0x08, 0x5D, 0x10, 0x01, + 0x18, 0x80, 0x80, 0x04, 0x22, 0x02, 0x00, 0x0C, 0x28, 0x26, 0x30, 0x06, 0x82, 0xF4, 0x03, 0x03, + 0x4F, 0x52, 0x43, 0x17, + }; + + auto const stats = cudf_io::read_parsed_orc_statistics( + cudf_io::source_info{reinterpret_cast(nulls_orc.data()), nulls_orc.size()}); + + EXPECT_EQ(stats.file_stats[1].has_null, true); + EXPECT_EQ(stats.file_stats[2].has_null, false); + + EXPECT_EQ(stats.stripes_stats[0][1].has_null, true); + EXPECT_EQ(stats.stripes_stats[0][2].has_null, false); +} + struct OrcWriterTestStripes : public OrcWriterTest, public ::testing::WithParamInterface> { diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index c5000bc0add..cf22ab8a525 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -388,11 +388,10 @@ class ParquetSizedTest : public ::testing::TestWithParam { // test the allowed bit widths for dictionary encoding // values chosen to trigger 1, 2, 3, 4, 5, 6, 8, 10, 12, 16, 20, and 24 bit dictionaries -INSTANTIATE_TEST_SUITE_P( - ParquetDictionaryTest, - ParquetSizedTest, - testing::Values(2, 4, 8, 16, 32, 64, 256, 1024, 4096, 65536, 128 * 1024, 2 * 1024 * 1024), - testing::PrintToStringParamName()); +INSTANTIATE_TEST_SUITE_P(ParquetDictionaryTest, + ParquetSizedTest, + testing::Range(1, 25), + testing::PrintToStringParamName()); namespace { // Generates a vector of uniform random values of type T @@ -4594,12 +4593,61 @@ TEST_F(ParquetReaderTest, StructByteArray) CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); } +TEST_F(ParquetWriterTest, SingleValueDictionaryTest) +{ + constexpr unsigned int expected_bits = 1; + constexpr unsigned int nrows = 1'000'000U; + + auto elements = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return "a unique string value suffixed with 1"; }); + auto const col0 = cudf::test::strings_column_wrapper(elements, elements + nrows); + auto const expected = table_view{{col0}}; + + auto const filepath = temp_env->get_temp_filepath("SingleValueDictionaryTest.parquet"); + // set row group size so that there will be only one row group + // no compression so we can easily read page data + cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) + .compression(cudf::io::compression_type::NONE) + .stats_level(cudf::io::statistics_freq::STATISTICS_COLUMN) + .row_group_size_rows(nrows); + cudf::io::write_parquet(out_opts); + + cudf::io::parquet_reader_options default_in_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + auto const result = cudf::io::read_parquet(default_in_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); + + // make sure dictionary was used + auto const source = cudf::io::datasource::create(filepath); + cudf::io::parquet::FileMetaData fmd; + + read_footer(source, &fmd); + auto used_dict = [&fmd]() { + for (auto enc : fmd.row_groups[0].columns[0].meta_data.encodings) { + if (enc 
== cudf::io::parquet::Encoding::PLAIN_DICTIONARY or + enc == cudf::io::parquet::Encoding::RLE_DICTIONARY) { + return true; + } + } + return false; + }; + EXPECT_TRUE(used_dict()); + + // and check that the correct number of bits was used + auto const oi = read_offset_index(source, fmd.row_groups[0].columns[0]); + auto const nbits = read_dict_bits(source, oi.page_locations[0]); + EXPECT_EQ(nbits, expected_bits); +} + TEST_P(ParquetSizedTest, DictionaryTest) { - constexpr int nrows = 3'000'000; + const unsigned int cardinality = (1 << (GetParam() - 1)) + 1; + const unsigned int nrows = std::max(cardinality * 3 / 2, 3'000'000U); - auto elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) { - return "a unique string value suffixed with " + std::to_string(i % GetParam()); + auto elements = cudf::detail::make_counting_transform_iterator(0, [cardinality](auto i) { + return "a unique string value suffixed with " + std::to_string(i % cardinality); }); auto const col0 = cudf::test::strings_column_wrapper(elements, elements + nrows); auto const expected = table_view{{col0}}; @@ -4608,16 +4656,16 @@ TEST_P(ParquetSizedTest, DictionaryTest) // set row group size so that there will be only one row group // no compression so we can easily read page data cudf::io::parquet_writer_options out_opts = - cudf_io::parquet_writer_options::builder(cudf_io::sink_info{filepath}, expected) + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) .compression(cudf::io::compression_type::NONE) .stats_level(cudf::io::statistics_freq::STATISTICS_COLUMN) .row_group_size_rows(nrows) - .row_group_size_bytes(256 * 1024 * 1024); + .row_group_size_bytes(512 * 1024 * 1024); cudf::io::write_parquet(out_opts); cudf::io::parquet_reader_options default_in_opts = - cudf::io::parquet_reader_options::builder(cudf_io::source_info{filepath}); - auto const result = cudf_io::read_parquet(default_in_opts); + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + auto const result = cudf::io::read_parquet(default_in_opts); CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); @@ -4640,16 +4688,7 @@ TEST_P(ParquetSizedTest, DictionaryTest) // and check that the correct number of bits was used auto const oi = read_offset_index(source, fmd.row_groups[0].columns[0]); auto const nbits = read_dict_bits(source, oi.page_locations[0]); - auto const expected_bits = - cudf::io::parquet::CompactProtocolReader::NumRequiredBits(GetParam() - 1); - - // copied from writer_impl.cu - constexpr auto allowed_bitsizes = - std::array{1, 2, 3, 4, 5, 6, 8, 10, 12, 16, 20, 24}; - auto const rle_bits = - *std::lower_bound(allowed_bitsizes.begin(), allowed_bitsizes.end(), expected_bits); - - EXPECT_EQ(nbits, rle_bits); + EXPECT_EQ(nbits, GetParam()); } CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/io/text/multibyte_split_test.cpp b/cpp/tests/io/text/multibyte_split_test.cpp index 8ec88696355..43debf3d5b3 100644 --- a/cpp/tests/io/text/multibyte_split_test.cpp +++ b/cpp/tests/io/text/multibyte_split_test.cpp @@ -197,7 +197,7 @@ TEST_F(MultibyteSplitTest, LargeInputMultipleRange) auto host_input = std::string(); auto host_expected = std::vector(); - for (auto i = 0; i < 1000; i++) { + for (auto i = 0; i < (2 * 32 * 128 * 1024); i++) { host_input += "...:|"; } @@ -222,7 +222,7 @@ TEST_F(MultibyteSplitTest, LargeInputSparseMultipleRange) auto host_input = std::string(); auto host_expected = std::vector(); - for (auto i = 0; i < 1000; i++) { + for (auto i = 0; i < (2 * 32 * 128 * 1024); i++) { 
host_input += "....."; } @@ -244,6 +244,57 @@ TEST_F(MultibyteSplitTest, LargeInputSparseMultipleRange) CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected->view(), *out, debug_output_level::ALL_ERRORS); } +TEST_F(MultibyteSplitTest, LargeInputMultipleRangeSingleByte) +{ + auto host_input = std::string(); + auto host_expected = std::vector(); + + for (auto i = 0; i < (2 * 32 * 128 * 1024); i++) { + host_input += "...:|"; + } + + auto delimiter = std::string("|"); + auto source = cudf::io::text::make_source(host_input); + + auto byte_ranges = cudf::io::text::create_byte_range_infos_consecutive(host_input.size(), 3); + auto out0 = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[0]); + auto out1 = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[1]); + auto out2 = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[2]); + + auto out_views = std::vector({out0->view(), out1->view(), out2->view()}); + auto out = cudf::concatenate(out_views); + + auto expected = cudf::io::text::multibyte_split(*source, delimiter); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected->view(), *out, debug_output_level::ALL_ERRORS); +} + +TEST_F(MultibyteSplitTest, LargeInputSparseMultipleRangeSingleByte) +{ + auto host_input = std::string(); + auto host_expected = std::vector(); + + for (auto i = 0; i < (2 * 32 * 128 * 1024); i++) { + host_input += "....."; + } + + auto delimiter = std::string("|"); + host_input[host_input.size() / 2] = '|'; + auto source = cudf::io::text::make_source(host_input); + + auto byte_ranges = cudf::io::text::create_byte_range_infos_consecutive(host_input.size(), 3); + auto out0 = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[0]); + auto out1 = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[1]); + auto out2 = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[2]); + + auto out_views = std::vector({out0->view(), out1->view(), out2->view()}); + auto out = cudf::concatenate(out_views); + + auto expected = cudf::io::text::multibyte_split(*source, delimiter); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected->view(), *out, debug_output_level::ALL_ERRORS); +} + TEST_F(MultibyteSplitTest, SmallInputAllPossibleRanges) { using namespace cudf::io::text; diff --git a/cpp/tests/io/type_inference_test.cu b/cpp/tests/io/type_inference_test.cu new file mode 100644 index 00000000000..04bb7507934 --- /dev/null +++ b/cpp/tests/io/type_inference_test.cu @@ -0,0 +1,265 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include +#include + +#include +#include + +#include +#include + +#include +#include +#include + +using cudf::io::parse_options; +using cudf::io::detail::infer_data_type; + +// Base test fixture for tests +struct TypeInference : public cudf::test::BaseFixture { +}; + +TEST_F(TypeInference, Basic) +{ + auto const stream = cudf::default_stream_value; + + auto options = parse_options{',', '\n', '\"'}; + options.trie_true = cudf::detail::create_serialized_trie({"true"}, stream); + options.trie_false = cudf::detail::create_serialized_trie({"false"}, stream); + options.trie_na = cudf::detail::create_serialized_trie({"", "null"}, stream); + + std::string data = R"json([42,52,5])json"; + auto d_data = cudf::make_string_scalar(data); + auto& d_string_scalar = static_cast(*d_data); + + std::size_t constexpr size = 3; + auto const string_offset = std::vector{1, 4, 7}; + auto const string_length = std::vector{2, 2, 1}; + rmm::device_vector d_string_offset{string_offset}; + rmm::device_vector d_string_length{string_length}; + + auto d_col_strings = + thrust::make_zip_iterator(make_tuple(d_string_offset.begin(), d_string_length.begin())); + + auto res_type = + infer_data_type(options.json_view(), + {d_string_scalar.data(), static_cast(d_string_scalar.size())}, + d_col_strings, + size, + stream); + + EXPECT_EQ(res_type, cudf::data_type{cudf::type_id::INT64}); +} + +TEST_F(TypeInference, Null) +{ + auto const stream = cudf::default_stream_value; + + auto options = parse_options{',', '\n', '\"'}; + options.trie_true = cudf::detail::create_serialized_trie({"true"}, stream); + options.trie_false = cudf::detail::create_serialized_trie({"false"}, stream); + options.trie_na = cudf::detail::create_serialized_trie({"", "null"}, stream); + + std::string data = R"json([52,5])json"; + auto d_data = cudf::make_string_scalar(data); + auto& d_string_scalar = static_cast(*d_data); + + std::size_t constexpr size = 3; + auto const string_offset = std::vector{1, 1, 4}; + auto const string_length = std::vector{0, 2, 1}; + rmm::device_vector d_string_offset{string_offset}; + rmm::device_vector d_string_length{string_length}; + + auto d_col_strings = + thrust::make_zip_iterator(make_tuple(d_string_offset.begin(), d_string_length.begin())); + + auto res_type = + infer_data_type(options.json_view(), + {d_string_scalar.data(), static_cast(d_string_scalar.size())}, + d_col_strings, + size, + stream); + + EXPECT_EQ(res_type, + cudf::data_type{cudf::type_id::FLOAT64}); // FLOAT64 to align with pandas's behavior +} + +TEST_F(TypeInference, AllNull) +{ + auto const stream = cudf::default_stream_value; + + auto options = parse_options{',', '\n', '\"'}; + options.trie_true = cudf::detail::create_serialized_trie({"true"}, stream); + options.trie_false = cudf::detail::create_serialized_trie({"false"}, stream); + options.trie_na = cudf::detail::create_serialized_trie({"", "null"}, stream); + + std::string data = R"json([null])json"; + auto d_data = cudf::make_string_scalar(data); + auto& d_string_scalar = static_cast(*d_data); + + std::size_t constexpr size = 3; + auto const string_offset = std::vector{1, 1, 1}; + auto const string_length = std::vector{0, 0, 4}; + rmm::device_vector d_string_offset{string_offset}; + rmm::device_vector d_string_length{string_length}; + + auto d_col_strings = + thrust::make_zip_iterator(make_tuple(d_string_offset.begin(), d_string_length.begin())); + + auto res_type = + infer_data_type(options.json_view(), + {d_string_scalar.data(), static_cast(d_string_scalar.size())}, + 
d_col_strings,
+                    size,
+                    stream);
+
+  EXPECT_EQ(res_type, cudf::data_type{cudf::type_id::INT8});  // INT8 if all nulls
+}
+
+TEST_F(TypeInference, String)
+{
+  auto const stream = cudf::default_stream_value;
+
+  auto options       = parse_options{',', '\n', '\"'};
+  options.trie_true  = cudf::detail::create_serialized_trie({"true"}, stream);
+  options.trie_false = cudf::detail::create_serialized_trie({"false"}, stream);
+  options.trie_na    = cudf::detail::create_serialized_trie({"", "null"}, stream);
+
+  std::string data = R"json(["1990","8","25"])json";
+  auto d_data      = cudf::make_string_scalar(data);
+  auto& d_string_scalar = static_cast<cudf::string_scalar&>(*d_data);
+
+  std::size_t constexpr size = 3;
+  auto const string_offset   = std::vector<std::size_t>{1, 8, 12};
+  auto const string_length   = std::vector<std::size_t>{6, 3, 4};
+  rmm::device_vector<std::size_t> d_string_offset{string_offset};
+  rmm::device_vector<std::size_t> d_string_length{string_length};
+
+  auto d_col_strings =
+    thrust::make_zip_iterator(thrust::make_tuple(d_string_offset.begin(), d_string_length.begin()));
+
+  auto res_type =
+    infer_data_type(options.json_view(),
+                    {d_string_scalar.data(), static_cast<std::size_t>(d_string_scalar.size())},
+                    d_col_strings,
+                    size,
+                    stream);
+
+  EXPECT_EQ(res_type, cudf::data_type{cudf::type_id::STRING});
+}
+
+TEST_F(TypeInference, Bool)
+{
+  auto const stream = cudf::default_stream_value;
+
+  auto options       = parse_options{',', '\n', '\"'};
+  options.trie_true  = cudf::detail::create_serialized_trie({"true"}, stream);
+  options.trie_false = cudf::detail::create_serialized_trie({"false"}, stream);
+  options.trie_na    = cudf::detail::create_serialized_trie({"", "null"}, stream);
+
+  std::string data = R"json([true,false,false])json";
+  auto d_data      = cudf::make_string_scalar(data);
+  auto& d_string_scalar = static_cast<cudf::string_scalar&>(*d_data);
+
+  std::size_t constexpr size = 3;
+  auto const string_offset   = std::vector<std::size_t>{1, 6, 12};
+  auto const string_length   = std::vector<std::size_t>{4, 5, 5};
+  rmm::device_vector<std::size_t> d_string_offset{string_offset};
+  rmm::device_vector<std::size_t> d_string_length{string_length};
+
+  auto d_col_strings =
+    thrust::make_zip_iterator(thrust::make_tuple(d_string_offset.begin(), d_string_length.begin()));
+
+  auto res_type =
+    infer_data_type(options.json_view(),
+                    {d_string_scalar.data(), static_cast<std::size_t>(d_string_scalar.size())},
+                    d_col_strings,
+                    size,
+                    stream);
+
+  EXPECT_EQ(res_type, cudf::data_type{cudf::type_id::BOOL8});
+}
+
+TEST_F(TypeInference, Timestamp)
+{
+  auto const stream = cudf::default_stream_value;
+
+  auto options       = parse_options{',', '\n', '\"'};
+  options.trie_true  = cudf::detail::create_serialized_trie({"true"}, stream);
+  options.trie_false = cudf::detail::create_serialized_trie({"false"}, stream);
+  options.trie_na    = cudf::detail::create_serialized_trie({"", "null"}, stream);
+
+  std::string data = R"json([1970/2/5,1970/8/25])json";
+  auto d_data      = cudf::make_string_scalar(data);
+  auto& d_string_scalar = static_cast<cudf::string_scalar&>(*d_data);
+
+  // two fields in the input, so two (offset, length) pairs
+  std::size_t constexpr size = 2;
+  auto const string_offset   = std::vector<std::size_t>{1, 10};
+  auto const string_length   = std::vector<std::size_t>{8, 9};
+  rmm::device_vector<std::size_t> d_string_offset{string_offset};
+  rmm::device_vector<std::size_t> d_string_length{string_length};
+
+  auto d_col_strings =
+    thrust::make_zip_iterator(thrust::make_tuple(d_string_offset.begin(), d_string_length.begin()));
+
+  auto res_type =
+    infer_data_type(options.json_view(),
+                    {d_string_scalar.data(), static_cast<std::size_t>(d_string_scalar.size())},
+                    d_col_strings,
+                    size,
+                    stream);
+
+  // All date/time values (quoted and unquoted) are inferred as strings for now
+  EXPECT_EQ(res_type, cudf::data_type{cudf::type_id::STRING});
+}
+
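All of the TypeInference cases above describe their input the same way: infer_data_type receives the raw JSON buffer plus a zip iterator over one (offset, length) pair per field, and a zero-length field stands for a null. A minimal host-side sketch of that field encoding, using only standard C++ and the Basic test's [42,52,5] input (an illustration of the layout, not the GPU code path):

// Host-side sketch: how the tests address fields of the JSON buffer
// via (offset, length) pairs.
#include <cstddef>
#include <iostream>
#include <string>
#include <utility>
#include <vector>

int main()
{
  std::string const data = R"([42,52,5])";
  // one (offset, length) pair per field, matching TypeInference.Basic
  std::vector<std::pair<std::size_t, std::size_t>> fields{{1, 2}, {4, 2}, {7, 1}};
  for (auto const& [offset, length] : fields) {
    // a zero-length field would represent a null, as in TypeInference.Null
    std::cout << "field: '" << data.substr(offset, length) << "'\n";
  }
  // prints: field: '42' / field: '52' / field: '5'
}
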
+TEST_F(TypeInference, InvalidInput) +{ + auto const stream = cudf::default_stream_value; + + auto options = parse_options{',', '\n', '\"'}; + options.trie_true = cudf::detail::create_serialized_trie({"true"}, stream); + options.trie_false = cudf::detail::create_serialized_trie({"false"}, stream); + options.trie_na = cudf::detail::create_serialized_trie({"", "null"}, stream); + + std::string data = R"json([1,2,3,a,5])json"; + auto d_data = cudf::make_string_scalar(data); + auto& d_string_scalar = static_cast(*d_data); + + std::size_t constexpr size = 5; + auto const string_offset = std::vector{1, 3, 5, 7, 9}; + auto const string_length = std::vector{1, 1, 1, 1, 1}; + rmm::device_vector d_string_offset{string_offset}; + rmm::device_vector d_string_length{string_length}; + + auto d_col_strings = + thrust::make_zip_iterator(make_tuple(d_string_offset.begin(), d_string_length.begin())); + + auto res_type = + infer_data_type(options.json_view(), + {d_string_scalar.data(), static_cast(d_string_scalar.size())}, + d_col_strings, + size, + stream); + + // Invalid input is inferred as string for now + EXPECT_EQ(res_type, cudf::data_type{cudf::type_id::STRING}); +} diff --git a/cpp/tests/iterator/iterator_tests.cuh b/cpp/tests/iterator/iterator_tests.cuh index 0bc5d3c4a15..26902b43662 100644 --- a/cpp/tests/iterator/iterator_tests.cuh +++ b/cpp/tests/iterator/iterator_tests.cuh @@ -54,8 +54,14 @@ struct IteratorTest : public cudf::test::BaseFixture { // Get temporary storage size size_t temp_storage_bytes = 0; - cub::DeviceReduce::Reduce( - nullptr, temp_storage_bytes, d_in, dev_result.begin(), num_items, thrust::minimum{}, init); + cub::DeviceReduce::Reduce(nullptr, + temp_storage_bytes, + d_in, + dev_result.begin(), + num_items, + thrust::minimum{}, + init, + cudf::default_stream_value.value()); // Allocate temporary storage rmm::device_buffer d_temp_storage(temp_storage_bytes, cudf::default_stream_value); @@ -67,7 +73,8 @@ struct IteratorTest : public cudf::test::BaseFixture { dev_result.begin(), num_items, thrust::minimum{}, - init); + init, + cudf::default_stream_value.value()); evaluate(expected, dev_result, "cub test"); } @@ -85,14 +92,16 @@ struct IteratorTest : public cudf::test::BaseFixture { // using a temporary vector and calling transform and all_of separately is // equivalent to thrust::equal but compiles ~3x faster auto dev_results = rmm::device_uvector(num_items, cudf::default_stream_value); - thrust::transform(thrust::device, + thrust::transform(rmm::exec_policy(cudf::default_stream_value), d_in, d_in_last, dev_expected.begin(), dev_results.begin(), thrust::equal_to{}); - auto result = thrust::all_of( - thrust::device, dev_results.begin(), dev_results.end(), thrust::identity{}); + auto result = thrust::all_of(rmm::exec_policy(cudf::default_stream_value), + dev_results.begin(), + dev_results.end(), + thrust::identity{}); EXPECT_TRUE(result) << "thrust test"; } diff --git a/cpp/tests/iterator/optional_iterator_test_numeric.cu b/cpp/tests/iterator/optional_iterator_test_numeric.cu index 2756247c368..c5b7393550a 100644 --- a/cpp/tests/iterator/optional_iterator_test_numeric.cu +++ b/cpp/tests/iterator/optional_iterator_test_numeric.cu @@ -112,12 +112,13 @@ TYPED_TEST(NumericOptionalIteratorTest, mean_var_output) // this can be computed with a single reduce and without a temporary output vector // but the approach increases the compile time by ~2x auto results = rmm::device_uvector(d_col->size(), cudf::default_stream_value); - thrust::transform(thrust::device, + 
thrust::transform(rmm::exec_policy(cudf::default_stream_value), it_dev_squared, it_dev_squared + d_col->size(), results.begin(), optional_to_meanvar{}); - auto result = thrust::reduce(thrust::device, results.begin(), results.end(), T_output{}); + auto result = thrust::reduce( + rmm::exec_policy(cudf::default_stream_value), results.begin(), results.end(), T_output{}); if (not std::is_floating_point()) { EXPECT_EQ(expected_value, result) << "optional iterator reduction sum"; diff --git a/cpp/tests/iterator/pair_iterator_test_numeric.cu b/cpp/tests/iterator/pair_iterator_test_numeric.cu index 41dd9b65e42..f570df44286 100644 --- a/cpp/tests/iterator/pair_iterator_test_numeric.cu +++ b/cpp/tests/iterator/pair_iterator_test_numeric.cu @@ -14,6 +14,8 @@ */ #include +#include + #include #include #include @@ -111,7 +113,8 @@ TYPED_TEST(NumericPairIteratorTest, mean_var_output) // GPU test auto it_dev = d_col->pair_begin(); auto it_dev_squared = thrust::make_transform_iterator(it_dev, transformer); - auto result = thrust::reduce(it_dev_squared, + auto result = thrust::reduce(rmm::exec_policy(cudf::default_stream_value), + it_dev_squared, it_dev_squared + d_col->size(), thrust::make_pair(T_output{}, true), sum_if_not_null{}); diff --git a/cpp/tests/join/conditional_join_tests.cu b/cpp/tests/join/conditional_join_tests.cu index 9e75d6c6eb9..bc2a96b5adf 100644 --- a/cpp/tests/join/conditional_join_tests.cu +++ b/cpp/tests/join/conditional_join_tests.cu @@ -24,6 +24,8 @@ #include #include +#include + #include #include #include @@ -256,7 +258,7 @@ struct ConditionalJoinPairReturnTest : public ConditionalJoinTest { thrust::device_vector> reference_pairs( reference.first->size()); - thrust::transform(thrust::device, + thrust::transform(rmm::exec_policy(cudf::default_stream_value), result.first->begin(), result.first->end(), result.second->begin(), @@ -264,7 +266,7 @@ struct ConditionalJoinPairReturnTest : public ConditionalJoinTest { [] __device__(cudf::size_type first, cudf::size_type second) { return thrust::make_pair(first, second); }); - thrust::transform(thrust::device, + thrust::transform(rmm::exec_policy(cudf::default_stream_value), reference.first->begin(), reference.first->end(), reference.second->begin(), @@ -273,11 +275,15 @@ struct ConditionalJoinPairReturnTest : public ConditionalJoinTest { return thrust::make_pair(first, second); }); - thrust::sort(thrust::device, result_pairs.begin(), result_pairs.end()); - thrust::sort(thrust::device, reference_pairs.begin(), reference_pairs.end()); + thrust::sort( + rmm::exec_policy(cudf::default_stream_value), result_pairs.begin(), result_pairs.end()); + thrust::sort( + rmm::exec_policy(cudf::default_stream_value), reference_pairs.begin(), reference_pairs.end()); - EXPECT_TRUE(thrust::equal( - thrust::device, reference_pairs.begin(), reference_pairs.end(), result_pairs.begin())); + EXPECT_TRUE(thrust::equal(rmm::exec_policy(cudf::default_stream_value), + reference_pairs.begin(), + reference_pairs.end(), + result_pairs.begin())); } void compare_to_hash_join(ColumnVector left_data, ColumnVector right_data) @@ -696,9 +702,13 @@ struct ConditionalJoinSingleReturnTest : public ConditionalJoinTest { void _compare_to_hash_join(std::unique_ptr> const& result, std::unique_ptr> const& reference) { - thrust::sort(thrust::device, result->begin(), result->end()); - thrust::sort(thrust::device, reference->begin(), reference->end()); - EXPECT_TRUE(thrust::equal(thrust::device, result->begin(), result->end(), reference->begin())); + 
thrust::sort(rmm::exec_policy(cudf::default_stream_value), result->begin(), result->end()); + thrust::sort( + rmm::exec_policy(cudf::default_stream_value), reference->begin(), reference->end()); + EXPECT_TRUE(thrust::equal(rmm::exec_policy(cudf::default_stream_value), + result->begin(), + result->end(), + reference->begin())); } /* diff --git a/cpp/tests/lists/sort_lists_tests.cpp b/cpp/tests/lists/sort_lists_tests.cpp index ade626a5c2b..a26ae5c2f48 100644 --- a/cpp/tests/lists/sort_lists_tests.cpp +++ b/cpp/tests/lists/sort_lists_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -24,39 +24,21 @@ template using LCW = cudf::test::lists_column_wrapper; -namespace cudf { -namespace test { - -auto generate_sorted_lists(lists_column_view const& input, - order column_order, - null_order null_precedence) +auto generate_sorted_lists(cudf::lists_column_view const& input, + cudf::order column_order, + cudf::null_order null_precedence) { - return std::pair{lists::sort_lists(input, column_order, null_precedence), - lists::stable_sort_lists(input, column_order, null_precedence)}; + return std::pair{cudf::lists::sort_lists(input, column_order, null_precedence), + cudf::lists::stable_sort_lists(input, column_order, null_precedence)}; } template -struct SortLists : public BaseFixture { +struct SortLists : public cudf::test::BaseFixture { }; -TYPED_TEST_SUITE(SortLists, NumericTypes); -using SortListsInt = SortLists; +using TypesForTest = cudf::test::Concat; +TYPED_TEST_SUITE(SortLists, TypesForTest); -/* -empty case - empty list - single row with empty list - multi row with empty lists -single case - single list with single element - single list with multi element -normal case without nulls -Null cases - null rows - null elements in list. 
-Error: - depth>1 -*/ TYPED_TEST(SortLists, NoNull) { using T = TypeParam; @@ -68,14 +50,14 @@ TYPED_TEST(SortLists, NoNull) // LCW order{{2, 1, 0, 3}, {0}, {1, 2, 0}, {0, 1}}; LCW expected{{1, 2, 3, 4}, {5}, {8, 9, 10}, {6, 7}}; { - auto const [sorted_lists, stable_sorted_lists] = - generate_sorted_lists(lists_column_view{list}, order::ASCENDING, null_order::AFTER); + auto const [sorted_lists, stable_sorted_lists] = generate_sorted_lists( + cudf::lists_column_view{list}, cudf::order::ASCENDING, cudf::null_order::AFTER); CUDF_TEST_EXPECT_COLUMNS_EQUAL(sorted_lists->view(), expected); CUDF_TEST_EXPECT_COLUMNS_EQUAL(stable_sorted_lists->view(), expected); } { - auto const [sorted_lists, stable_sorted_lists] = - generate_sorted_lists(lists_column_view{list}, order::ASCENDING, null_order::BEFORE); + auto const [sorted_lists, stable_sorted_lists] = generate_sorted_lists( + cudf::lists_column_view{list}, cudf::order::ASCENDING, cudf::null_order::BEFORE); CUDF_TEST_EXPECT_COLUMNS_EQUAL(sorted_lists->view(), expected); CUDF_TEST_EXPECT_COLUMNS_EQUAL(stable_sorted_lists->view(), expected); } @@ -84,14 +66,14 @@ TYPED_TEST(SortLists, NoNull) // LCW order{{3, 0, 1, 2}, {0}, {0, 1, 2}, {1, 0}}; LCW expected2{{4, 3, 2, 1}, {5}, {10, 9, 8}, {7, 6}}; { - auto const [sorted_lists, stable_sorted_lists] = - generate_sorted_lists(lists_column_view{list}, order::DESCENDING, null_order::AFTER); + auto const [sorted_lists, stable_sorted_lists] = generate_sorted_lists( + cudf::lists_column_view{list}, cudf::order::DESCENDING, cudf::null_order::AFTER); CUDF_TEST_EXPECT_COLUMNS_EQUAL(sorted_lists->view(), expected2); CUDF_TEST_EXPECT_COLUMNS_EQUAL(stable_sorted_lists->view(), expected2); } { - auto const [sorted_lists, stable_sorted_lists] = - generate_sorted_lists(lists_column_view{list}, order::DESCENDING, null_order::BEFORE); + auto const [sorted_lists, stable_sorted_lists] = generate_sorted_lists( + cudf::lists_column_view{list}, cudf::order::DESCENDING, cudf::null_order::BEFORE); CUDF_TEST_EXPECT_COLUMNS_EQUAL(sorted_lists->view(), expected2); CUDF_TEST_EXPECT_COLUMNS_EQUAL(stable_sorted_lists->view(), expected2); } @@ -111,16 +93,16 @@ TYPED_TEST(SortLists, Null) { LCW expected{{{1, 2, 3, 4}, valids_a.begin()}, {5}, {8, 9, 10}, {6, 7}}; - auto const [sorted_lists, stable_sorted_lists] = - generate_sorted_lists(lists_column_view{list}, order::ASCENDING, null_order::AFTER); + auto const [sorted_lists, stable_sorted_lists] = generate_sorted_lists( + cudf::lists_column_view{list}, cudf::order::ASCENDING, cudf::null_order::AFTER); CUDF_TEST_EXPECT_COLUMNS_EQUAL(sorted_lists->view(), expected); CUDF_TEST_EXPECT_COLUMNS_EQUAL(stable_sorted_lists->view(), expected); } { LCW expected{{{4, 1, 2, 3}, valids_b.begin()}, {5}, {8, 9, 10}, {6, 7}}; - auto const [sorted_lists, stable_sorted_lists] = - generate_sorted_lists(lists_column_view{list}, order::ASCENDING, null_order::BEFORE); + auto const [sorted_lists, stable_sorted_lists] = generate_sorted_lists( + cudf::lists_column_view{list}, cudf::order::ASCENDING, cudf::null_order::BEFORE); CUDF_TEST_EXPECT_COLUMNS_EQUAL(sorted_lists->view(), expected); CUDF_TEST_EXPECT_COLUMNS_EQUAL(stable_sorted_lists->view(), expected); } @@ -129,21 +111,23 @@ TYPED_TEST(SortLists, Null) // LCW order{{3, 0, 1, 2}, {0}, {0, 1, 2}, {1, 0}}; { LCW expected{{{4, 3, 2, 1}, valids_b.begin()}, {5}, {10, 9, 8}, {7, 6}}; - auto const [sorted_lists, stable_sorted_lists] = - generate_sorted_lists(lists_column_view{list}, order::DESCENDING, null_order::AFTER); + auto const [sorted_lists, 
stable_sorted_lists] = generate_sorted_lists( + cudf::lists_column_view{list}, cudf::order::DESCENDING, cudf::null_order::AFTER); CUDF_TEST_EXPECT_COLUMNS_EQUAL(sorted_lists->view(), expected); CUDF_TEST_EXPECT_COLUMNS_EQUAL(stable_sorted_lists->view(), expected); } { LCW expected{{{3, 2, 1, 4}, valids_a.begin()}, {5}, {10, 9, 8}, {7, 6}}; - auto const [sorted_lists, stable_sorted_lists] = - generate_sorted_lists(lists_column_view{list}, order::DESCENDING, null_order::BEFORE); + auto const [sorted_lists, stable_sorted_lists] = generate_sorted_lists( + cudf::lists_column_view{list}, cudf::order::DESCENDING, cudf::null_order::BEFORE); CUDF_TEST_EXPECT_COLUMNS_EQUAL(sorted_lists->view(), expected); CUDF_TEST_EXPECT_COLUMNS_EQUAL(stable_sorted_lists->view(), expected); } } +using SortListsInt = SortLists; + TEST_F(SortListsInt, Empty) { using T = int; @@ -151,21 +135,21 @@ TEST_F(SortListsInt, Empty) { LCW l{}; auto const [sorted_lists, stable_sorted_lists] = - generate_sorted_lists(lists_column_view{l}, {}, {}); + generate_sorted_lists(cudf::lists_column_view{l}, {}, {}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(sorted_lists->view(), l); CUDF_TEST_EXPECT_COLUMNS_EQUAL(stable_sorted_lists->view(), l); } { LCW l{LCW{}}; auto const [sorted_lists, stable_sorted_lists] = - generate_sorted_lists(lists_column_view{l}, {}, {}); + generate_sorted_lists(cudf::lists_column_view{l}, {}, {}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(sorted_lists->view(), l); CUDF_TEST_EXPECT_COLUMNS_EQUAL(stable_sorted_lists->view(), l); } { LCW l{LCW{}, LCW{}}; auto const [sorted_lists, stable_sorted_lists] = - generate_sorted_lists(lists_column_view{l}, {}, {}); + generate_sorted_lists(cudf::lists_column_view{l}, {}, {}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(sorted_lists->view(), l); CUDF_TEST_EXPECT_COLUMNS_EQUAL(stable_sorted_lists->view(), l); } @@ -178,14 +162,14 @@ TEST_F(SortListsInt, Single) { LCW l{1}; auto const [sorted_lists, stable_sorted_lists] = - generate_sorted_lists(lists_column_view{l}, {}, {}); + generate_sorted_lists(cudf::lists_column_view{l}, {}, {}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(sorted_lists->view(), l); CUDF_TEST_EXPECT_COLUMNS_EQUAL(stable_sorted_lists->view(), l); } { LCW l{{1, 2, 3}}; auto const [sorted_lists, stable_sorted_lists] = - generate_sorted_lists(lists_column_view{l}, {}, {}); + generate_sorted_lists(cudf::lists_column_view{l}, {}, {}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(sorted_lists->view(), l); CUDF_TEST_EXPECT_COLUMNS_EQUAL(stable_sorted_lists->view(), l); } @@ -198,22 +182,20 @@ TEST_F(SortListsInt, NullRows) LCW l{{{1, 2, 3}, {4, 5, 6}, {7}}, valids.begin()}; // offset 0, 0, 3, 3 auto const [sorted_lists, stable_sorted_lists] = - generate_sorted_lists(lists_column_view{l}, {}, {}); + generate_sorted_lists(cudf::lists_column_view{l}, {}, {}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(sorted_lists->view(), l); CUDF_TEST_EXPECT_COLUMNS_EQUAL(stable_sorted_lists->view(), l); } -/* // Disabling this test. 
// Reason: After this exception "cudaErrorAssert device-side assert triggered", further tests fail -TEST_F(SortListsInt, Depth) +TEST_F(SortListsInt, DISABLED_Depth) { using T = int; LCW l1{LCW{{1, 2}, {3}}, LCW{{4, 5}}}; // device exception - EXPECT_THROW(sort_lists(lists_column_view{l1}, {}, {}), std::exception); + EXPECT_THROW(cudf::lists::sort_lists(cudf::lists_column_view{l1}, {}, {}), std::exception); } -*/ TEST_F(SortListsInt, Sliced) { @@ -224,7 +206,7 @@ TEST_F(SortListsInt, Sliced) auto const sliced_list = cudf::slice(l, {0, 4})[0]; auto const expected = LCW{{1, 2, 3, 4}, {5, 6, 7}, {8, 9}, {10}}; auto const [sorted_lists, stable_sorted_lists] = - generate_sorted_lists(lists_column_view{sliced_list}, {}, {}); + generate_sorted_lists(cudf::lists_column_view{sliced_list}, {}, {}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(sorted_lists->view(), expected); CUDF_TEST_EXPECT_COLUMNS_EQUAL(stable_sorted_lists->view(), expected); } @@ -233,7 +215,7 @@ TEST_F(SortListsInt, Sliced) auto const sliced_list = cudf::slice(l, {1, 4})[0]; auto const expected = LCW{{5, 6, 7}, {8, 9}, {10}}; auto const [sorted_lists, stable_sorted_lists] = - generate_sorted_lists(lists_column_view{sliced_list}, {}, {}); + generate_sorted_lists(cudf::lists_column_view{sliced_list}, {}, {}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(sorted_lists->view(), expected); CUDF_TEST_EXPECT_COLUMNS_EQUAL(stable_sorted_lists->view(), expected); } @@ -242,7 +224,7 @@ TEST_F(SortListsInt, Sliced) auto const sliced_list = cudf::slice(l, {1, 2})[0]; auto const expected = LCW{{5, 6, 7}}; auto const [sorted_lists, stable_sorted_lists] = - generate_sorted_lists(lists_column_view{sliced_list}, {}, {}); + generate_sorted_lists(cudf::lists_column_view{sliced_list}, {}, {}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(sorted_lists->view(), expected); CUDF_TEST_EXPECT_COLUMNS_EQUAL(stable_sorted_lists->view(), expected); } @@ -251,11 +233,60 @@ TEST_F(SortListsInt, Sliced) auto const sliced_list = cudf::slice(l, {0, 2})[0]; auto const expected = LCW{{1, 2, 3, 4}, {5, 6, 7}}; auto const [sorted_lists, stable_sorted_lists] = - generate_sorted_lists(lists_column_view{sliced_list}, {}, {}); + generate_sorted_lists(cudf::lists_column_view{sliced_list}, {}, {}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(sorted_lists->view(), expected); CUDF_TEST_EXPECT_COLUMNS_EQUAL(stable_sorted_lists->view(), expected); } } -} // namespace test -} // namespace cudf +using SortListsDouble = SortLists; +TEST_F(SortListsDouble, InfinityAndNaN) +{ + auto constexpr NaN = std::numeric_limits::quiet_NaN(); + auto constexpr Inf = std::numeric_limits::infinity(); + + using LCW = cudf::test::lists_column_wrapper; + { + LCW input{-0.0, -NaN, -NaN, NaN, Inf, -Inf, 7, 5, 6, NaN, Inf, -Inf, -NaN, -NaN, -0.0}; + auto [sorted_lists, stable_sorted_lists] = + generate_sorted_lists(cudf::lists_column_view{input}, {}, {}); + LCW expected{-Inf, -Inf, -0, -0, 5, 6, 7, Inf, Inf, -NaN, -NaN, NaN, NaN, -NaN, -NaN}; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(sorted_lists->view(), expected); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(stable_sorted_lists->view(), expected); + } + // This data includes a row with over 200 elements to test the + // radix sort is not used in the logic path in segmented_sort. + // Technically radix sort is not expected to be used in either case. 
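The expected rows in the test that follows encode the floating-point ordering these sort tests rely on: -Inf first, finite values in between, +Inf next, and NaNs (whatever their sign bit) after everything else. A small host-side illustration of that ordering contract with std::sort and a NaN-aware comparator; this is only a sketch of the comparison semantics, not cudf's segmented-sort implementation:

#include <algorithm>
#include <cmath>
#include <iostream>
#include <limits>
#include <vector>

int main()
{
  auto const NaN = std::numeric_limits<double>::quiet_NaN();
  auto const Inf = std::numeric_limits<double>::infinity();

  std::vector<double> v{-0.0, NaN, Inf, -Inf, 7, 5, 6, NaN};

  // NaNs compare greater than every other value, so they sort to the end,
  // matching the expected columns in the test below.
  std::sort(v.begin(), v.end(), [](double a, double b) {
    if (std::isnan(a)) return false;  // a is NaN: never "less"
    if (std::isnan(b)) return true;   // b is NaN: any non-NaN a is "less"
    return a < b;
  });

  for (double d : v) std::cout << d << ' ';  // -inf -0 5 6 7 inf nan nan
}
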
+ { + // clang-format off + LCW input{0.0, -0.0, -NaN, -NaN, NaN, Inf, -Inf, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, + NaN, Inf, -Inf, -NaN, -NaN, -0.0, 0.0}; + LCW expected{-Inf, -Inf, 0.0, -0.0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -0.0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + Inf, Inf, -NaN, -NaN, NaN, NaN, -NaN, -NaN}; + // clang-format on + auto [sorted_lists, stable_sorted_lists] = + generate_sorted_lists(cudf::lists_column_view{input}, {}, {}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(sorted_lists->view(), expected); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(stable_sorted_lists->view(), expected); + } +} diff --git a/cpp/tests/scalar/scalar_device_view_test.cu b/cpp/tests/scalar/scalar_device_view_test.cu index ec005412a61..f4a1c94c3e6 100644 --- a/cpp/tests/scalar/scalar_device_view_test.cu +++ b/cpp/tests/scalar/scalar_device_view_test.cu @@ -59,13 +59,15 @@ TYPED_TEST(TypedScalarDeviceViewTest, Value) auto scalar_device_view1 = cudf::get_scalar_device_view(s1); rmm::device_scalar result{cudf::default_stream_value}; - test_set_value<<<1, 1>>>(scalar_device_view, scalar_device_view1); + test_set_value<<<1, 1, 0, cudf::default_stream_value.value()>>>(scalar_device_view, + scalar_device_view1); CUDF_CHECK_CUDA(0); EXPECT_EQ(s1.value(), value); EXPECT_TRUE(s1.is_valid()); - test_value<<<1, 1>>>(scalar_device_view, scalar_device_view1, result.data()); + test_value<<<1, 1, 0, cudf::default_stream_value.value()>>>( + scalar_device_view, scalar_device_view1, result.data()); CUDF_CHECK_CUDA(0); EXPECT_TRUE(result.value(cudf::default_stream_value)); @@ -84,7 +86,7 @@ TYPED_TEST(TypedScalarDeviceViewTest, ConstructNull) auto scalar_device_view = cudf::get_scalar_device_view(s); rmm::device_scalar result{cudf::default_stream_value}; - test_null<<<1, 1>>>(scalar_device_view, result.data()); + test_null<<<1, 1, 0, cudf::default_stream_value.value()>>>(scalar_device_view, result.data()); CUDF_CHECK_CUDA(0); EXPECT_FALSE(result.value(cudf::default_stream_value)); @@ -104,7 +106,7 @@ TYPED_TEST(TypedScalarDeviceViewTest, SetNull) s.set_valid_async(true); EXPECT_TRUE(s.is_valid()); - test_setnull<<<1, 1>>>(scalar_device_view); + test_setnull<<<1, 1, 0, cudf::default_stream_value.value()>>>(scalar_device_view); CUDF_CHECK_CUDA(0); EXPECT_FALSE(s.is_valid()); @@ -130,7 +132,8 @@ TEST_F(StringScalarDeviceViewTest, Value) rmm::device_scalar result{cudf::default_stream_value}; auto value_v = 
cudf::detail::make_device_uvector_sync(value); - test_string_value<<<1, 1>>>(scalar_device_view, value_v.data(), value.size(), result.data()); + test_string_value<<<1, 1, 0, cudf::default_stream_value.value()>>>( + scalar_device_view, value_v.data(), value.size(), result.data()); CUDF_CHECK_CUDA(0); EXPECT_TRUE(result.value(cudf::default_stream_value)); diff --git a/cpp/tests/strings/contains_tests.cpp b/cpp/tests/strings/contains_tests.cpp index 9ca4fbb6cb7..ba738f7b616 100644 --- a/cpp/tests/strings/contains_tests.cpp +++ b/cpp/tests/strings/contains_tests.cpp @@ -330,6 +330,7 @@ TEST_F(StringsContainsTests, Errors) auto strings_view = cudf::strings_column_view(input); EXPECT_THROW(cudf::strings::contains_re(strings_view, "(3?)+"), cudf::logic_error); + EXPECT_THROW(cudf::strings::contains_re(strings_view, "(?:3?)+"), cudf::logic_error); EXPECT_THROW(cudf::strings::contains_re(strings_view, "3?+"), cudf::logic_error); EXPECT_THROW(cudf::strings::count_re(strings_view, "{3}a"), cudf::logic_error); } diff --git a/cpp/tests/strings/factories_test.cu b/cpp/tests/strings/factories_test.cu index e998be92216..36fdd423168 100644 --- a/cpp/tests/strings/factories_test.cu +++ b/cpp/tests/strings/factories_test.cu @@ -30,6 +30,7 @@ #include #include +#include #include #include @@ -213,7 +214,7 @@ TEST_F(StringsFactoriesTest, StringPairWithNullsAndEmpty) auto d_column = cudf::column_device_view::create(data); rmm::device_uvector pairs(d_column->size(), cudf::default_stream_value); - thrust::transform(thrust::device, + thrust::transform(rmm::exec_policy(cudf::default_stream_value), d_column->pair_begin(), d_column->pair_end(), pairs.data(), diff --git a/cpp/tests/transform/row_bit_count_test.cu b/cpp/tests/transform/row_bit_count_test.cu index 9c3326cf575..61c2fa12895 100644 --- a/cpp/tests/transform/row_bit_count_test.cu +++ b/cpp/tests/transform/row_bit_count_test.cu @@ -27,7 +27,6 @@ #include -#include #include #include #include @@ -54,7 +53,7 @@ TYPED_TEST(RowBitCountTyped, SimpleTypes) // expect size of the type per row auto expected = make_fixed_width_column(data_type{type_id::INT32}, 16); cudf::mutable_column_view mcv(*expected); - thrust::fill(rmm::exec_policy(), + thrust::fill(rmm::exec_policy(cudf::default_stream_value), mcv.begin(), mcv.end(), sizeof(device_storage_type_t) * CHAR_BIT); @@ -77,7 +76,7 @@ TYPED_TEST(RowBitCountTyped, SimpleTypesWithNulls) // expect size of the type + 1 bit per row auto expected = make_fixed_width_column(data_type{type_id::INT32}, 16); cudf::mutable_column_view mcv(*expected); - thrust::fill(rmm::exec_policy(), + thrust::fill(rmm::exec_policy(cudf::default_stream_value), mcv.begin(), mcv.end(), (sizeof(device_storage_type_t) * CHAR_BIT) + 1); @@ -241,13 +240,15 @@ TEST_F(RowBitCount, StructsWithLists_RowsExceedingASingleBlock) // List child column = {0, 1, 2, 3, 4, ..., 2*num_rows}; auto ints = make_numeric_column(data_type{type_id::INT32}, num_rows * 2); auto ints_view = ints->mutable_view(); - thrust::tabulate( - thrust::device, ints_view.begin(), ints_view.end(), thrust::identity{}); + thrust::tabulate(rmm::exec_policy(cudf::default_stream_value), + ints_view.begin(), + ints_view.end(), + thrust::identity{}); // List offsets = {0, 2, 4, 6, 8, ..., num_rows*2}; auto list_offsets = make_numeric_column(data_type{type_id::INT32}, num_rows + 1); auto list_offsets_view = list_offsets->mutable_view(); - thrust::tabulate(thrust::device, + thrust::tabulate(rmm::exec_policy(cudf::default_stream_value), list_offsets_view.begin(), list_offsets_view.end(), times_2{}); 
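The Thrust-policy hunks in this and the surrounding test files all apply one mechanical change: the unparameterized thrust::device policy, which enqueues work on the legacy default stream, is replaced by a policy bound to the stream the library itself uses (rmm::exec_policy(cudf::default_stream_value) also threads RMM's device allocator through). A standalone CUDA sketch of the same idea using raw Thrust, with a hand-created stream standing in for cudf's default stream value:

// Compile with nvcc. Shows work being ordered on an explicit stream via
// thrust::cuda::par.on(...); the diff uses the equivalent rmm::exec_policy.
#include <thrust/device_vector.h>
#include <thrust/sequence.h>
#include <thrust/system/cuda/execution_policy.h>

#include <cuda_runtime_api.h>

int main()
{
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  thrust::device_vector<int> v(1024);
  // Enqueued on `stream`, not the legacy default stream.
  thrust::sequence(thrust::cuda::par.on(stream), v.begin(), v.end());

  cudaStreamSynchronize(stream);
  cudaStreamDestroy(stream);
  return 0;
}
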
@@ -263,7 +264,7 @@ TEST_F(RowBitCount, StructsWithLists_RowsExceedingASingleBlock) // Compute row_bit_count, and compare. auto row_bit_counts = row_bit_count(table_view{{structs_column->view()}}); auto expected_row_bit_counts = make_numeric_column(data_type{type_id::INT32}, num_rows); - thrust::fill_n(thrust::device, + thrust::fill_n(rmm::exec_policy(cudf::default_stream_value), expected_row_bit_counts->mutable_view().begin(), num_rows, CHAR_BIT * (2 * sizeof(int32_t) + sizeof(offset_type))); @@ -612,7 +613,7 @@ TEST_F(RowBitCount, Table) auto expected = cudf::make_fixed_width_column(data_type{type_id::INT32}, t.num_rows()); cudf::mutable_column_view mcv(*expected); thrust::transform( - rmm::exec_policy(), + rmm::exec_policy(cudf::default_stream_value), thrust::make_counting_iterator(0), thrust::make_counting_iterator(0) + t.num_rows(), mcv.begin(), diff --git a/cpp/tests/types/type_dispatcher_test.cu b/cpp/tests/types/type_dispatcher_test.cu index eee9cd46e4f..3280339ea85 100644 --- a/cpp/tests/types/type_dispatcher_test.cu +++ b/cpp/tests/types/type_dispatcher_test.cu @@ -70,7 +70,8 @@ __global__ void dispatch_test_kernel(cudf::type_id id, bool* d_result) TYPED_TEST(TypedDispatcherTest, DeviceDispatch) { auto result = cudf::detail::make_zeroed_device_uvector_sync(1); - dispatch_test_kernel<<<1, 1>>>(cudf::type_to_id(), result.data()); + dispatch_test_kernel<<<1, 1, 0, cudf::default_stream_value.value()>>>( + cudf::type_to_id(), result.data()); CUDF_CUDA_TRY(cudaDeviceSynchronize()); EXPECT_EQ(true, result.front_element(cudf::default_stream_value)); } @@ -130,7 +131,7 @@ __global__ void double_dispatch_test_kernel(cudf::type_id id1, cudf::type_id id2 TYPED_TEST(TypedDoubleDispatcherTest, DeviceDoubleDispatch) { auto result = cudf::detail::make_zeroed_device_uvector_sync(1); - double_dispatch_test_kernel<<<1, 1>>>( + double_dispatch_test_kernel<<<1, 1, 0, cudf::default_stream_value.value()>>>( cudf::type_to_id(), cudf::type_to_id(), result.data()); CUDF_CUDA_TRY(cudaDeviceSynchronize()); EXPECT_EQ(true, result.front_element(cudf::default_stream_value)); diff --git a/cpp/tests/utilities/column_utilities.cu b/cpp/tests/utilities/column_utilities.cu index f978f6bbc66..5106196a58f 100644 --- a/cpp/tests/utilities/column_utilities.cu +++ b/cpp/tests/utilities/column_utilities.cu @@ -66,7 +66,7 @@ std::unique_ptr generate_all_row_indices(size_type num_rows) { auto indices = cudf::make_fixed_width_column(data_type{type_id::INT32}, num_rows, mask_state::UNALLOCATED); - thrust::sequence(rmm::exec_policy(), + thrust::sequence(rmm::exec_policy(cudf::default_stream_value), indices->mutable_view().begin(), indices->mutable_view().end(), 0); @@ -132,8 +132,9 @@ std::unique_ptr generate_child_row_indices(lists_column_view const& c, ? (offsets[true_index + 1] - offsets[true_index]) : 0; }); - auto const output_size = - thrust::reduce(rmm::exec_policy(), row_size_iter, row_size_iter + row_indices.size()); + auto const output_size = thrust::reduce(rmm::exec_policy(cudf::default_stream_value), + row_size_iter, + row_size_iter + row_indices.size()); // no output. done. 
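The steps that follow in generate_child_row_indices (exclusive_scan, generate, scatter_if, inclusive_scan_by_key) are a classic expand-by-scan: per-row child counts become runs of consecutive child indices. A host-only sketch with the worked values from the comments in this function (two selected rows of sizes 2 and 3, starting at child offsets 6 and 11), using std algorithms in place of the Thrust calls:

#include <iostream>
#include <numeric>
#include <vector>

int main()
{
  std::vector<int> sizes{2, 3};    // child elements per selected row
  std::vector<int> starts{6, 11};  // each row's first child offset

  // 1) exclusive scan of sizes -> where each row's run begins in the output
  std::vector<int> run_begin(sizes.size());
  std::exclusive_scan(sizes.begin(), sizes.end(), run_begin.begin(), 0);  // {0, 2}
  int const total = run_begin.back() + sizes.back();                      // 5

  // 2) ones everywhere, with each row's first child index scattered in,
  //    plus segment keys built the same way
  std::vector<int> result(total, 1);  // {1, 1, 1, 1, 1}
  std::vector<int> keys(total, 0);
  for (std::size_t i = 0; i < sizes.size(); ++i) {
    if (sizes[i] == 0) continue;      // skip empty rows (the scatter_if stencil)
    result[run_begin[i]] = starts[i];  // {6, 1, 11, 1, 1}
    keys[run_begin[i]]   = 1;          // mark segment boundaries
  }
  std::inclusive_scan(keys.begin(), keys.end(), keys.begin());  // {1, 1, 2, 2, 2}

  // 3) inclusive scan-by-key: consecutive child indices within each segment
  for (int i = 1; i < total; ++i) {
    if (keys[i] == keys[i - 1]) result[i] += result[i - 1];
  }
  for (int v : result) std::cout << v << ' ';  // 6 7 11 12 13
}
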
auto result = cudf::make_fixed_width_column(data_type{type_id::INT32}, output_size, mask_state::UNALLOCATED); @@ -146,7 +147,7 @@ std::unique_ptr generate_child_row_indices(lists_column_view const& c, // auto output_row_start = cudf::make_fixed_width_column( data_type{type_id::INT32}, row_indices.size(), mask_state::UNALLOCATED); - thrust::exclusive_scan(rmm::exec_policy(), + thrust::exclusive_scan(rmm::exec_policy(cudf::default_stream_value), row_size_iter, row_size_iter + row_indices.size(), output_row_start->mutable_view().begin()); @@ -155,7 +156,7 @@ std::unique_ptr generate_child_row_indices(lists_column_view const& c, // // result = [1, 1, 1, 1, 1] // - thrust::generate(rmm::exec_policy(), + thrust::generate(rmm::exec_policy(cudf::default_stream_value), result->mutable_view().begin(), result->mutable_view().end(), [] __device__() { return 1; }); @@ -174,7 +175,7 @@ std::unique_ptr generate_child_row_indices(lists_column_view const& c, auto const true_index = row_indices[index] + offset; return offsets[true_index] - first_offset; }); - thrust::scatter_if(rmm::exec_policy(), + thrust::scatter_if(rmm::exec_policy(cudf::default_stream_value), output_row_iter, output_row_iter + row_indices.size(), output_row_start->view().begin(), @@ -188,18 +189,18 @@ std::unique_ptr generate_child_row_indices(lists_column_view const& c, // auto keys = cudf::make_fixed_width_column(data_type{type_id::INT32}, output_size, mask_state::UNALLOCATED); - thrust::generate(rmm::exec_policy(), + thrust::generate(rmm::exec_policy(cudf::default_stream_value), keys->mutable_view().begin(), keys->mutable_view().end(), [] __device__() { return 0; }); - thrust::scatter_if(rmm::exec_policy(), + thrust::scatter_if(rmm::exec_policy(cudf::default_stream_value), row_size_iter, row_size_iter + row_indices.size(), output_row_start->view().begin(), row_size_iter, keys->mutable_view().begin(), [] __device__(auto row_size) { return row_size != 0; }); - thrust::inclusive_scan(rmm::exec_policy(), + thrust::inclusive_scan(rmm::exec_policy(cudf::default_stream_value), keys->view().begin(), keys->view().end(), keys->mutable_view().begin()); @@ -212,7 +213,7 @@ std::unique_ptr generate_child_row_indices(lists_column_view const& c, // output // result = [6, 7, 11, 12, 13] // - thrust::inclusive_scan_by_key(rmm::exec_policy(), + thrust::inclusive_scan_by_key(rmm::exec_policy(cudf::default_stream_value), keys->view().begin(), keys->view().end(), result->view().begin(), @@ -255,7 +256,9 @@ struct column_property_comparator { auto const true_index = row_indices[index] + offset; return !validity || cudf::bit_is_set(validity, true_index) ? 
0 : 1; }); - return thrust::reduce(rmm::exec_policy(), validity_iter, validity_iter + row_indices.size()); + return thrust::reduce(rmm::exec_policy(cudf::default_stream_value), + validity_iter, + validity_iter + row_indices.size()); } bool compare_common(cudf::column_view const& lhs, @@ -549,7 +552,7 @@ struct column_comparator_impl { lhs.size(), cudf::default_stream_value); // worst case: everything different auto input_iter = thrust::make_counting_iterator(0); auto diff_iter = thrust::copy_if( - rmm::exec_policy(), + rmm::exec_policy(cudf::default_stream_value), input_iter, input_iter + lhs_row_indices.size(), differences.begin(), @@ -640,7 +643,7 @@ struct column_comparator_impl { // auto input_iter = thrust::make_counting_iterator(0); auto diff_iter = thrust::copy_if( - rmm::exec_policy(), + rmm::exec_policy(cudf::default_stream_value), input_iter, input_iter + lhs_row_indices.size(), differences.begin(), @@ -862,7 +865,8 @@ void expect_equal_buffers(void const* lhs, void const* rhs, std::size_t size_byt } auto typed_lhs = static_cast(lhs); auto typed_rhs = static_cast(rhs); - EXPECT_TRUE(thrust::equal(thrust::device, typed_lhs, typed_lhs + size_bytes, typed_rhs)); + EXPECT_TRUE(thrust::equal( + rmm::exec_policy(cudf::default_stream_value), typed_lhs, typed_lhs + size_bytes, typed_rhs)); } /** @@ -957,7 +961,7 @@ std::string nested_offsets_to_string(NestedColumnView const& c, std::string cons // normalize the offset values for the column offset size_type const* d_offsets = offsets.head() + c.offset(); thrust::transform( - rmm::exec_policy(), + rmm::exec_policy(cudf::default_stream_value), d_offsets, d_offsets + output_size, shifted_offsets.begin(), diff --git a/cpp/tests/utilities/table_utilities.cu b/cpp/tests/utilities/table_utilities.cu index e7fe97efa96..e1740eb5023 100644 --- a/cpp/tests/utilities/table_utilities.cu +++ b/cpp/tests/utilities/table_utilities.cu @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + #include #include diff --git a/cpp/tests/utilities_tests/span_tests.cu b/cpp/tests/utilities_tests/span_tests.cu index 78b76ff043e..fc4104c765b 100644 --- a/cpp/tests/utilities_tests/span_tests.cu +++ b/cpp/tests/utilities_tests/span_tests.cu @@ -238,7 +238,7 @@ TEST(SpanTest, CanUseDeviceSpan) auto d_span = device_span(d_message.data().get(), d_message.size()); - simple_device_kernel<<<1, 1>>>(d_span); + simple_device_kernel<<<1, 1, 0, cudf::default_stream_value.value()>>>(d_span); cudaDeviceSynchronize(); @@ -277,8 +277,8 @@ TEST(MdSpanTest, DeviceReadWrite) { auto vector = hostdevice_2dvector(11, 23, cudf::default_stream_value); - readwrite_kernel<<<1, 1>>>(vector); - readwrite_kernel<<<1, 1>>>(vector); + readwrite_kernel<<<1, 1, 0, cudf::default_stream_value.value()>>>(vector); + readwrite_kernel<<<1, 1, 0, cudf::default_stream_value.value()>>>(vector); vector.device_to_host(cudf::default_stream_value, true); EXPECT_EQ(vector[5][6], 30); } diff --git a/cpp/tests/wrappers/timestamps_test.cu b/cpp/tests/wrappers/timestamps_test.cu index 05acf635d90..73bfd15744a 100644 --- a/cpp/tests/wrappers/timestamps_test.cu +++ b/cpp/tests/wrappers/timestamps_test.cu @@ -94,8 +94,8 @@ TYPED_TEST(ChronoColumnTest, ChronoDurationsMatchPrimitiveRepresentation) fixed_width_column_wrapper(chrono_col_data.begin(), chrono_col_data.end()); rmm::device_uvector indices(this->size(), cudf::default_stream_value); - thrust::sequence(rmm::exec_policy(), indices.begin(), indices.end()); - EXPECT_TRUE(thrust::all_of(rmm::exec_policy(), + thrust::sequence(rmm::exec_policy(cudf::default_stream_value), indices.begin(), indices.end()); + EXPECT_TRUE(thrust::all_of(rmm::exec_policy(cudf::default_stream_value), indices.begin(), indices.end(), compare_chrono_elements_to_primitive_representation{ @@ -148,10 +148,10 @@ TYPED_TEST(ChronoColumnTest, ChronosCanBeComparedInDeviceCode) generate_timestamps(this->size(), time_point_ms(start_rhs), time_point_ms(stop_rhs)); rmm::device_uvector indices(this->size(), cudf::default_stream_value); - thrust::sequence(rmm::exec_policy(), indices.begin(), indices.end()); + thrust::sequence(rmm::exec_policy(cudf::default_stream_value), indices.begin(), indices.end()); EXPECT_TRUE(thrust::all_of( - rmm::exec_policy(), + rmm::exec_policy(cudf::default_stream_value), indices.begin(), indices.end(), compare_chrono_elements{cudf::binary_operator::LESS, @@ -159,7 +159,7 @@ TYPED_TEST(ChronoColumnTest, ChronosCanBeComparedInDeviceCode) *cudf::column_device_view::create(chrono_rhs_col)})); EXPECT_TRUE(thrust::all_of( - rmm::exec_policy(), + rmm::exec_policy(cudf::default_stream_value), indices.begin(), indices.end(), compare_chrono_elements{cudf::binary_operator::GREATER, @@ -167,7 +167,7 @@ TYPED_TEST(ChronoColumnTest, ChronosCanBeComparedInDeviceCode) *cudf::column_device_view::create(chrono_lhs_col)})); EXPECT_TRUE(thrust::all_of( - rmm::exec_policy(), + rmm::exec_policy(cudf::default_stream_value), indices.begin(), indices.end(), compare_chrono_elements{cudf::binary_operator::LESS_EQUAL, @@ -175,7 +175,7 @@ TYPED_TEST(ChronoColumnTest, ChronosCanBeComparedInDeviceCode) *cudf::column_device_view::create(chrono_lhs_col)})); EXPECT_TRUE(thrust::all_of( - rmm::exec_policy(), + rmm::exec_policy(cudf::default_stream_value), indices.begin(), indices.end(), compare_chrono_elements{cudf::binary_operator::GREATER_EQUAL, diff --git a/docs/cudf/source/_ext/PandasCompat.py b/docs/cudf/source/_ext/PandasCompat.py index 50fc801e4c1..af2b16035c3 100644 --- a/docs/cudf/source/_ext/PandasCompat.py +++ 
b/docs/cudf/source/_ext/PandasCompat.py
@@ -1,3 +1,5 @@
+# Copyright (c) 2021-2022, NVIDIA CORPORATION
+
 # This file is adapted from official sphinx tutorial for `todo` extension:
 # https://www.sphinx-doc.org/en/master/development/tutorials/todo.html
diff --git a/docs/cudf/source/api_docs/index.rst b/docs/cudf/source/api_docs/index.rst
index b77c98f3ac3..ef04167c327 100644
--- a/docs/cudf/source/api_docs/index.rst
+++ b/docs/cudf/source/api_docs/index.rst
@@ -19,4 +19,6 @@ This page provides a list of all publicly accessible modules, methods and classes
     io
     subword_tokenize
     string_handling
+    list_handling
+    struct_handling
     options
diff --git a/docs/cudf/source/api_docs/list_handling.rst b/docs/cudf/source/api_docs/list_handling.rst
new file mode 100644
index 00000000000..f1fb6d1ca74
--- /dev/null
+++ b/docs/cudf/source/api_docs/list_handling.rst
@@ -0,0 +1,21 @@
+List handling
+~~~~~~~~~~~~~
+
+``Series.list`` can be used to access the values of the series as
+lists and apply list methods to it. These can be accessed like
+``Series.list.<function/property>``.
+
+.. currentmodule:: cudf.core.column.lists.ListMethods
+.. autosummary::
+   :toctree: api/
+
+   astype
+   concat
+   contains
+   index
+   get
+   leaves
+   len
+   sort_values
+   take
+   unique
diff --git a/docs/cudf/source/api_docs/series.rst b/docs/cudf/source/api_docs/series.rst
index 1e53c90b44d..53042041f6d 100644
--- a/docs/cudf/source/api_docs/series.rst
+++ b/docs/cudf/source/api_docs/series.rst
@@ -310,21 +310,6 @@ Timedelta properties
 .. include:: string_handling.rst
 
-..
-   The following is needed to ensure the generated pages are created with the
-   correct template (otherwise they would be created in the Series/Index class page)
-
-..
-   .. currentmodule:: cudf
-   .. autosummary::
-      :toctree: api/
-      :template: autosummary/accessor.rst
-
-      Series.str
-      Series.cat
-      Series.dt
-      Index.str
-
 .. _api.series.cat:
 
 Categorical accessor
@@ -349,42 +334,27 @@ the ``Series.cat`` accessor.
 
 .. _api.series.list:
 
-
-List handling
-~~~~~~~~~~~~~
-
-``Series.list`` can be used to access the values of the series as
-lists and apply list methods to it. These can be accessed like
-``Series.list.<function/property>``.
-
-.. currentmodule:: cudf.core.column.lists.ListMethods
-.. autosummary::
-   :toctree: api/
-
-   concat
-   contains
-   get
-   len
-   sort_values
-   take
-   unique
+.. include:: list_handling.rst
 
 .. _api.series.struct:
 
+.. include:: struct_handling.rst
 
-Struct handling
-~~~~~~~~~~~~~~~
-
-``Series.struct`` can be used to access the values of the series as
-Structs and apply struct methods to it. These can be accessed like
-``Series.struct.<function/property>``.
-
-.. currentmodule:: cudf.core.column.struct.StructMethods
-.. autosummary::
-   :toctree: api/
+..
+   The following is needed to ensure the generated pages are created with the
+   correct template (otherwise they would be created in the Series/Index class page)
 
-   field
-   explode
+..
+   .. currentmodule:: cudf
+   .. autosummary::
+      :toctree: api/
+      :template: autosummary/accessor.rst
+
+      Series.str
+      Series.cat
+      Series.dt
+      Index.str
 
 Serialization / IO / conversion
diff --git a/docs/cudf/source/api_docs/struct_handling.rst b/docs/cudf/source/api_docs/struct_handling.rst
new file mode 100644
index 00000000000..05ba990382a
--- /dev/null
+++ b/docs/cudf/source/api_docs/struct_handling.rst
@@ -0,0 +1,13 @@
+Struct handling
+~~~~~~~~~~~~~~~
+
+``Series.struct`` can be used to access the values of the series as
+Structs and apply struct methods to it. These can be accessed like
+``Series.struct.<function/property>``.
+
+.. currentmodule:: cudf.core.column.struct.StructMethods
+.. 
autosummary:: + :toctree: api/ + + field + explode diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index 4bab7c1a403..dbc2a28c38c 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -3922,7 +3922,7 @@ public Table aggregateWindowsOverRanges(AggregationOverWindow... windowAggregate break; default: throw new IllegalArgumentException("Expected range-based window orderBy's " + - "type: integral (Boolean-exclusive) and timestamp"); + "type: integral (Boolean-exclusive), decimal, and timestamp"); } ColumnWindowOps ops = groupedOps.computeIfAbsent(agg.getColumnIndex(), (idx) -> new ColumnWindowOps()); diff --git a/print_env.sh b/print_env.sh index c929794da2b..6774f94b540 100755 --- a/print_env.sh +++ b/print_env.sh @@ -1,7 +1,8 @@ #!/usr/bin/env bash +# Copyright (c) 2022, NVIDIA CORPORATION. # Reports relevant environment information useful for diagnosing and # debugging cuDF issues. -# Usage: +# Usage: # "./print_env.sh" - prints to stdout # "./print_env.sh > env.txt" - prints to file "env.txt" @@ -14,16 +15,16 @@ git submodule status --recursive else echo "Not inside a git repository" fi -echo +echo echo "***OS Information***" cat /etc/*-release uname -a -echo +echo echo "***GPU Information***" nvidia-smi -echo +echo echo "***CPU***" lscpu @@ -31,15 +32,15 @@ echo echo "***CMake***" which cmake && cmake --version -echo +echo echo "***g++***" which g++ && g++ --version -echo +echo echo "***nvcc***" which nvcc && nvcc --version -echo +echo echo "***Python***" which python && python -c "import sys; print('Python {0}.{1}.{2}'.format(sys.version_info[0], sys.version_info[1], sys.version_info[2]))" diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py index f34c11de1e9..28eb380f7cb 100644 --- a/python/cudf/cudf/__init__.py +++ b/python/cudf/cudf/__init__.py @@ -9,7 +9,6 @@ import rmm -from cudf.api.types import dtype from cudf import api, core, datasets, testing from cudf._version import get_versions from cudf.api.extensions import ( @@ -17,15 +16,28 @@ register_index_accessor, register_series_accessor, ) -from cudf.core.scalar import Scalar +from cudf.api.types import dtype +from cudf.core.algorithms import factorize +from cudf.core.cut import cut +from cudf.core.dataframe import DataFrame, from_dataframe, from_pandas, merge +from cudf.core.dtypes import ( + CategoricalDtype, + Decimal32Dtype, + Decimal64Dtype, + Decimal128Dtype, + IntervalDtype, + ListDtype, + StructDtype, +) +from cudf.core.groupby import Grouper from cudf.core.index import ( BaseIndex, CategoricalIndex, DatetimeIndex, Float32Index, Float64Index, - Index, GenericIndex, + Index, Int8Index, Int16Index, Int32Index, @@ -40,22 +52,8 @@ UInt64Index, interval_range, ) -from cudf.core.dataframe import DataFrame, from_pandas, merge, from_dataframe -from cudf.core.series import Series from cudf.core.missing import NA from cudf.core.multiindex import MultiIndex -from cudf.core.cut import cut -from cudf.core.algorithms import factorize -from cudf.core.dtypes import ( - CategoricalDtype, - Decimal64Dtype, - Decimal32Dtype, - Decimal128Dtype, - IntervalDtype, - ListDtype, - StructDtype, -) -from cudf.core.groupby import Grouper from cudf.core.reshape import ( concat, crosstab, @@ -65,8 +63,9 @@ pivot_table, unstack, ) -from cudf.core.series import isclose -from cudf.core.tools.datetimes import DateOffset, to_datetime +from cudf.core.scalar import Scalar +from cudf.core.series import Series, isclose +from 
cudf.core.tools.datetimes import DateOffset, date_range, to_datetime from cudf.core.tools.numeric import to_numeric from cudf.io import ( from_dlpack, @@ -79,15 +78,9 @@ read_parquet, read_text, ) -from cudf.core.tools.datetimes import date_range +from cudf.options import describe_option, get_option, set_option from cudf.utils.dtypes import _NA_REP -from cudf.utils.utils import set_allocator, clear_cache - -from cudf.options import ( - get_option, - set_option, - describe_option, -) +from cudf.utils.utils import clear_cache, set_allocator try: from cubinlinker.patch import patch_numba_linker_if_needed diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py index bab28433c41..8ecb9a57426 100644 --- a/python/cudf/cudf/_lib/__init__.py +++ b/python/cudf/cudf/_lib/__init__.py @@ -33,9 +33,9 @@ stream_compaction, string_casting, strings, + text, transpose, unary, - text, ) MAX_COLUMN_SIZE = np.iinfo(np.int32).max diff --git a/python/cudf/cudf/_lib/cpp/datetime.pxd b/python/cudf/cudf/_lib/cpp/datetime.pxd index 498fc313cf9..74addb87357 100644 --- a/python/cudf/cudf/_lib/cpp/datetime.pxd +++ b/python/cudf/cudf/_lib/cpp/datetime.pxd @@ -1,3 +1,5 @@ +# Copyright (c) 2020-2022, NVIDIA CORPORATION. + from libcpp.memory cimport unique_ptr from cudf._lib.cpp.column.column cimport column diff --git a/python/cudf/cudf/_lib/strings/__init__.py b/python/cudf/cudf/_lib/strings/__init__.py index aea733c2445..ff558a06d87 100644 --- a/python/cudf/cudf/_lib/strings/__init__.py +++ b/python/cudf/cudf/_lib/strings/__init__.py @@ -62,7 +62,7 @@ startswith_multiple, ) from cudf._lib.strings.findall import findall -from cudf._lib.strings.json import get_json_object, GetJsonObjectOptions +from cudf._lib.strings.json import GetJsonObjectOptions, get_json_object from cudf._lib.strings.padding import ( SideType, center, diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 549b8bae12a..6898ae4941c 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -101,7 +101,9 @@ def __getitem__(self, key): def __contains__(self, item): return item in self._values - def _copy_type_metadata(self: BaseIndexT, other: BaseIndexT) -> BaseIndexT: + def _copy_type_metadata( + self: BaseIndexT, other: BaseIndexT, *, override_dtypes=None + ) -> BaseIndexT: raise NotImplementedError def get_level_values(self, level): diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py index 231a42a27b7..6d4a2990e34 100644 --- a/python/cudf/cudf/core/_internals/where.py +++ b/python/cudf/cudf/core/_internals/where.py @@ -16,6 +16,7 @@ from cudf.core.missing import NA from cudf.utils.dtypes import ( _can_cast, + _dtype_can_hold_element, find_common_type, is_mixed_with_object_dtype, ) @@ -84,6 +85,12 @@ def _check_and_cast_columns_with_other( other, source_dtype ): common_dtype = source_dtype + elif ( + isinstance(source_col, cudf.core.column.NumericalColumn) + and other_is_scalar + and _dtype_can_hold_element(source_dtype, other) + ): + common_dtype = source_dtype else: common_dtype = find_common_type( [ diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index d438f47e1c4..601ad707ba6 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -755,7 +755,7 @@ def __setitem__(self, key, value): ) if to_add_categories > 0: - raise ValueError( + raise TypeError( "Cannot setitem on a Categorical with a new " 
"category, set the categories first" ) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 4fe365768ef..8b2c51dae90 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -62,6 +62,7 @@ is_string_dtype, is_struct_dtype, ) +from cudf.core._compat import PANDAS_GE_150 from cudf.core.abc import Serializable from cudf.core.buffer import Buffer, DeviceBufferLike, as_device_buffer_like from cudf.core.dtypes import ( @@ -83,6 +84,11 @@ ) from cudf.utils.utils import _array_ufunc, mask_dtype +if PANDAS_GE_150: + from pandas.core.arrays.arrow.extension_types import ArrowIntervalType +else: + from pandas.core.arrays._arrow_utils import ArrowIntervalType + T = TypeVar("T", bound="ColumnBase") # TODO: This workaround allows type hints for `slice`, since `slice` is a # method in ColumnBase. @@ -290,9 +296,7 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase: size=codes.size, ordered=array.type.ordered, ) - elif isinstance( - array.type, pd.core.arrays._arrow_utils.ArrowIntervalType - ): + elif isinstance(array.type, ArrowIntervalType): return cudf.core.column.IntervalColumn.from_arrow(array) result = libcudf.interop.from_arrow(data)[0] @@ -1591,6 +1595,14 @@ def build_list_column( offset: int, optional """ dtype = ListDtype(element_type=elements.dtype) + if size is None: + if indices.size == 0: + size = 0 + else: + # one less because the last element of offsets is the number of + # bytes in the data buffer + size = indices.size - 1 + size = size - offset result = build_column( data=None, diff --git a/python/cudf/cudf/core/column/interval.py b/python/cudf/cudf/core/column/interval.py index ad73eaf2b93..657403a6082 100644 --- a/python/cudf/cudf/core/column/interval.py +++ b/python/cudf/cudf/core/column/interval.py @@ -131,5 +131,5 @@ def to_pandas(self, index: pd.Index = None, **kwargs) -> "pd.Series": # types into pandas (trying to convert the underlying numerical columns # directly is problematic), so we're stuck with this for now. return pd.Series( - pd.IntervalDtype().__from_arrow__(self.to_arrow()), index=index + self.dtype.to_pandas().__from_arrow__(self.to_arrow()), index=index ) diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 4b74dde129c..a66c11c8bdc 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -34,6 +34,7 @@ is_integer, is_integer_dtype, is_number, + is_scalar, ) from cudf.core.buffer import DeviceBufferLike, as_device_buffer_like from cudf.core.column import ( @@ -128,6 +129,43 @@ def has_nulls(self, include_nan=False): self.nan_count != 0 if include_nan else False ) + def __setitem__(self, key: Any, value: Any): + """ + Set the value of ``self[key]`` to ``value``. + + If ``value`` and ``self`` are of different types, ``value`` is coerced + to ``self.dtype``. + """ + + # Normalize value to scalar/column + device_value = ( + cudf.Scalar( + value, + dtype=self.dtype + if cudf._lib.scalar._is_null_host_scalar(value) + else None, + ) + if is_scalar(value) + else as_column(value) + ) + + if not is_bool_dtype(self.dtype) and is_bool_dtype(device_value.dtype): + raise TypeError(f"Invalid value {value} for dtype {self.dtype}") + else: + device_value = device_value.astype(self.dtype) + + out: Optional[ColumnBase] # If None, no need to perform mimic inplace. 
+ if isinstance(key, slice): + out = self._scatter_by_slice(key, device_value) + else: + key = as_column(key) + if not isinstance(key, cudf.core.column.NumericalColumn): + raise ValueError(f"Invalid scatter map type {key.dtype}.") + out = self._scatter_by_column(key, device_value) + + if out: + self._mimic_inplace(out, inplace=True) + @property def __cuda_array_interface__(self) -> Mapping[str, Any]: output = { diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 172a1ed9edc..287e68531f8 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -23,6 +23,7 @@ from numba import cuda import cudf +import cudf.api.types from cudf import _lib as libcudf from cudf._lib import string_casting as str_cast, strings as libstrings from cudf._lib.column import Column @@ -58,47 +59,47 @@ def str_to_boolean(column: StringColumn): _str_to_numeric_typecast_functions = { - cudf.dtype("int8"): str_cast.stoi8, - cudf.dtype("int16"): str_cast.stoi16, - cudf.dtype("int32"): str_cast.stoi, - cudf.dtype("int64"): str_cast.stol, - cudf.dtype("uint8"): str_cast.stoui8, - cudf.dtype("uint16"): str_cast.stoui16, - cudf.dtype("uint32"): str_cast.stoui, - cudf.dtype("uint64"): str_cast.stoul, - cudf.dtype("float32"): str_cast.stof, - cudf.dtype("float64"): str_cast.stod, - cudf.dtype("bool"): str_to_boolean, + cudf.api.types.dtype("int8"): str_cast.stoi8, + cudf.api.types.dtype("int16"): str_cast.stoi16, + cudf.api.types.dtype("int32"): str_cast.stoi, + cudf.api.types.dtype("int64"): str_cast.stol, + cudf.api.types.dtype("uint8"): str_cast.stoui8, + cudf.api.types.dtype("uint16"): str_cast.stoui16, + cudf.api.types.dtype("uint32"): str_cast.stoui, + cudf.api.types.dtype("uint64"): str_cast.stoul, + cudf.api.types.dtype("float32"): str_cast.stof, + cudf.api.types.dtype("float64"): str_cast.stod, + cudf.api.types.dtype("bool"): str_to_boolean, } _numeric_to_str_typecast_functions = { - cudf.dtype("int8"): str_cast.i8tos, - cudf.dtype("int16"): str_cast.i16tos, - cudf.dtype("int32"): str_cast.itos, - cudf.dtype("int64"): str_cast.ltos, - cudf.dtype("uint8"): str_cast.ui8tos, - cudf.dtype("uint16"): str_cast.ui16tos, - cudf.dtype("uint32"): str_cast.uitos, - cudf.dtype("uint64"): str_cast.ultos, - cudf.dtype("float32"): str_cast.ftos, - cudf.dtype("float64"): str_cast.dtos, - cudf.dtype("bool"): str_cast.from_booleans, + cudf.api.types.dtype("int8"): str_cast.i8tos, + cudf.api.types.dtype("int16"): str_cast.i16tos, + cudf.api.types.dtype("int32"): str_cast.itos, + cudf.api.types.dtype("int64"): str_cast.ltos, + cudf.api.types.dtype("uint8"): str_cast.ui8tos, + cudf.api.types.dtype("uint16"): str_cast.ui16tos, + cudf.api.types.dtype("uint32"): str_cast.uitos, + cudf.api.types.dtype("uint64"): str_cast.ultos, + cudf.api.types.dtype("float32"): str_cast.ftos, + cudf.api.types.dtype("float64"): str_cast.dtos, + cudf.api.types.dtype("bool"): str_cast.from_booleans, } _datetime_to_str_typecast_functions = { # TODO: support Date32 UNIX days - # cudf.dtype("datetime64[D]"): str_cast.int2timestamp, - cudf.dtype("datetime64[s]"): str_cast.int2timestamp, - cudf.dtype("datetime64[ms]"): str_cast.int2timestamp, - cudf.dtype("datetime64[us]"): str_cast.int2timestamp, - cudf.dtype("datetime64[ns]"): str_cast.int2timestamp, + # cudf.api.types.dtype("datetime64[D]"): str_cast.int2timestamp, + cudf.api.types.dtype("datetime64[s]"): str_cast.int2timestamp, + cudf.api.types.dtype("datetime64[ms]"): str_cast.int2timestamp, + 
cudf.api.types.dtype("datetime64[us]"): str_cast.int2timestamp, + cudf.api.types.dtype("datetime64[ns]"): str_cast.int2timestamp, } _timedelta_to_str_typecast_functions = { - cudf.dtype("timedelta64[s]"): str_cast.int2timedelta, - cudf.dtype("timedelta64[ms]"): str_cast.int2timedelta, - cudf.dtype("timedelta64[us]"): str_cast.int2timedelta, - cudf.dtype("timedelta64[ns]"): str_cast.int2timedelta, + cudf.api.types.dtype("timedelta64[s]"): str_cast.int2timedelta, + cudf.api.types.dtype("timedelta64[ms]"): str_cast.int2timedelta, + cudf.api.types.dtype("timedelta64[us]"): str_cast.int2timedelta, + cudf.api.types.dtype("timedelta64[ns]"): str_cast.int2timedelta, } @@ -1642,7 +1643,7 @@ def isnumeric(self) -> SeriesOrIndex: also includes other characters that can represent quantities such as unicode fractions. - >>> s2 = pd.Series(['23', '³', '⅕', '']) + >>> s2 = pd.Series(['23', '³', '⅕', ''], dtype='str') >>> s2.str.isnumeric() 0 True 1 True @@ -3751,8 +3752,9 @@ def endswith(self, pat: str) -> SeriesOrIndex: dtype: bool """ if pat is None: - result_col = column.column_empty( - len(self._column), dtype="bool", masked=True + raise TypeError( + f"expected a string or a sequence-like object, not " + f"{type(pat).__name__}" ) elif is_scalar(pat): result_col = libstrings.endswith( @@ -3813,8 +3815,9 @@ def startswith(self, pat: Union[str, Sequence]) -> SeriesOrIndex: dtype: bool """ if pat is None: - result_col = column.column_empty( - len(self._column), dtype="bool", masked=True + raise TypeError( + f"expected a string or a sequence-like object, not " + f"{type(pat).__name__}" ) elif is_scalar(pat): result_col = libstrings.startswith( @@ -5146,7 +5149,7 @@ def __init__( null_count: int = None, children: Tuple["column.ColumnBase", ...] = (), ): - dtype = cudf.dtype("object") + dtype = cudf.api.types.dtype("object") if size is None: for child in children: @@ -5304,7 +5307,7 @@ def __contains__(self, item: ScalarLike) -> bool: def as_numerical_column( self, dtype: Dtype, **kwargs ) -> "cudf.core.column.NumericalColumn": - out_dtype = cudf.dtype(dtype) + out_dtype = cudf.api.types.dtype(dtype) string_col = self if out_dtype.kind in {"i", "u"}: if not libstrings.is_integer(string_col).all(): @@ -5346,7 +5349,7 @@ def _as_datetime_or_timedelta_column(self, dtype, format): def as_datetime_column( self, dtype: Dtype, **kwargs ) -> "cudf.core.column.DatetimeColumn": - out_dtype = cudf.dtype(dtype) + out_dtype = cudf.api.types.dtype(dtype) # infer on host from the first not na element # or return all null column if all values @@ -5370,7 +5373,7 @@ def as_datetime_column( def as_timedelta_column( self, dtype: Dtype, **kwargs ) -> "cudf.core.column.TimeDeltaColumn": - out_dtype = cudf.dtype(dtype) + out_dtype = cudf.api.types.dtype(dtype) format = "%D days %H:%M:%S" return self._as_datetime_or_timedelta_column(out_dtype, format) @@ -5412,7 +5415,7 @@ def to_pandas( return pd_series def can_cast_safely(self, to_dtype: Dtype) -> bool: - to_dtype = cudf.dtype(to_dtype) + to_dtype = cudf.api.types.dtype(to_dtype) if self.dtype == to_dtype: return True @@ -5585,7 +5588,7 @@ def view(self, dtype) -> "cudf.core.column.ColumnBase": raise ValueError( "Can not produce a view of a string column with nulls" ) - dtype = cudf.dtype(dtype) + dtype = cudf.api.types.dtype(dtype) str_byte_offset = self.base_children[0].element_indexing(self.offset) str_end_byte_offset = self.base_children[0].element_indexing( self.offset + self.size diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 
2f1695e4445..77aeec286a5 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -39,7 +39,7 @@ import cudf import cudf.core.common from cudf import _lib as libcudf -from cudf._typing import ColumnLike, NotImplementedType +from cudf._typing import ColumnLike, Dtype, NotImplementedType from cudf.api.types import ( _is_scalar_or_zero_d_array, is_bool_dtype, @@ -1057,7 +1057,7 @@ def dtypes(self): string object dtype: object """ - return pd.Series(self._dtypes) + return pd.Series(self._dtypes, dtype="object") @property def ndim(self): @@ -6536,9 +6536,14 @@ def _from_columns_like_self( columns: List[ColumnBase], column_names: abc.Iterable[str], index_names: Optional[List[str]] = None, + *, + override_dtypes: Optional[abc.Iterable[Optional[Dtype]]] = None, ) -> DataFrame: result = super()._from_columns_like_self( - columns, column_names, index_names + columns, + column_names, + index_names, + override_dtypes=override_dtypes, ) result._set_column_names_like(self) return result @@ -6972,7 +6977,7 @@ def from_pandas(obj, nan_as_null=None): Converting a Pandas Series to cuDF Series: - >>> psr = pd.Series(['a', 'b', 'c', 'd'], name='apple') + >>> psr = pd.Series(['a', 'b', 'c', 'd'], name='apple', dtype='str') >>> psr 0 a 1 b diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index ad352dc6543..d770f4f6130 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -529,6 +529,16 @@ def __init__( self._nan_as_null = nan_as_null self._allow_copy = allow_copy + def __dataframe__( + self, nan_as_null: bool = False, allow_copy: bool = True + ) -> "_CuDFDataFrame": + """ + See the docstring of the `cudf.DataFrame.__dataframe__` for details + """ + return _CuDFDataFrame( + self._df, nan_as_null=nan_as_null, allow_copy=allow_copy + ) + @property def metadata(self): # `index` isn't a regular column, and the protocol doesn't support row diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 1e342871ace..5cff057ce7c 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -10,7 +10,6 @@ import pyarrow as pa from pandas.api import types as pd_types from pandas.api.extensions import ExtensionDtype -from pandas.core.arrays._arrow_utils import ArrowIntervalType from pandas.core.dtypes.dtypes import ( CategoricalDtype as pd_CategoricalDtype, CategoricalDtypeType as pd_CategoricalDtypeType, @@ -18,10 +17,15 @@ import cudf from cudf._typing import Dtype -from cudf.core._compat import PANDAS_GE_130 +from cudf.core._compat import PANDAS_GE_130, PANDAS_GE_150 from cudf.core.abc import Serializable from cudf.core.buffer import DeviceBufferLike +if PANDAS_GE_150: + from pandas.core.arrays.arrow.extension_types import ArrowIntervalType +else: + from pandas.core.arrays._arrow_utils import ArrowIntervalType + def dtype(arbitrary): """ @@ -610,6 +614,12 @@ def from_pandas(cls, pd_dtype: pd.IntervalDtype) -> "IntervalDtype": else: return cls(subtype=pd_dtype.subtype) + def to_pandas(self) -> pd.IntervalDtype: + if PANDAS_GE_130: + return pd.IntervalDtype(subtype=self.subtype, closed=self.closed) + else: + return pd.IntervalDtype(subtype=self.subtype) + def __eq__(self, other): if isinstance(other, str): # This means equality isn't transitive but mimics pandas diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index ec78a8a37cf..40926a1c8cb 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -3,6 +3,7 
@@ from __future__ import annotations import copy +import itertools import operator import pickle import warnings @@ -131,6 +132,8 @@ def _from_columns_like_self( self, columns: List[ColumnBase], column_names: Optional[abc.Iterable[str]] = None, + *, + override_dtypes: Optional[abc.Iterable[Optional[Dtype]]] = None, ): """Construct a Frame from a list of columns with metadata from self. @@ -139,7 +142,7 @@ if column_names is None: column_names = self._column_names frame = self.__class__._from_columns(columns, column_names) - return frame._copy_type_metadata(self) + return frame._copy_type_metadata(self, override_dtypes=override_dtypes) def _mimic_inplace( self: T, result: T, inplace: bool = False @@ -1012,8 +1015,8 @@ def from_arrow(cls, data): ) column_names = data.column_names - pandas_dtypes = None - np_dtypes = None + pandas_dtypes = {} + np_dtypes = {} if isinstance(data.schema.pandas_metadata, dict): metadata = data.schema.pandas_metadata pandas_dtypes = { @@ -1085,42 +1088,45 @@ # There are some special cases that need to be handled # based on metadata. - if pandas_dtypes: - for name in result: - dtype = None - if ( - len(result[name]) == 0 - and pandas_dtypes[name] == "categorical" - ): - # When pandas_dtype is a categorical column and the size - # of column is 0(i.e., empty) then we will have an - # int8 column in result._data[name] returned by libcudf, - # which needs to be type-casted to 'category' dtype. - dtype = "category" - elif ( - pandas_dtypes[name] == "empty" - and np_dtypes[name] == "object" - ): - # When a string column has all null values, pandas_dtype is - # is specified as 'empty' and np_dtypes as 'object', - # hence handling this special case to type-cast the empty - # float column to str column. - dtype = np_dtypes[name] - elif pandas_dtypes[ - name - ] == "object" and cudf.api.types.is_struct_dtype( - np_dtypes[name] - ): - # Incase of struct column, libcudf is not aware of names of - # struct fields, hence renaming the struct fields is - # necessary by extracting the field names from arrow - # struct types. - result[name] = result[name]._rename_fields( - [field.name for field in data[name].type] - ) - - if dtype is not None: - result[name] = result[name].astype(dtype) + for name in result: + if ( + len(result[name]) == 0 + and pandas_dtypes.get(name) == "categorical" + ): + # When pandas_dtype is a categorical column and the size + # of column is 0 (i.e., empty) then we will have an + # int8 column in result._data[name] returned by libcudf, + # which needs to be type-casted to 'category' dtype. + result[name] = result[name].as_categorical_column("category") + elif ( + pandas_dtypes.get(name) == "empty" + and np_dtypes.get(name) == "object" + ): + # When a string column has all null values, pandas_dtype + # is specified as 'empty' and np_dtypes as 'object', + # hence handling this special case to type-cast the empty + # float column to str column. + result[name] = result[name].as_string_column(cudf.dtype("str")) + elif name in data.column_names and isinstance( + data[name].type, + (pa.StructType, pa.ListType, pa.Decimal128Type), + ): + # In case of struct column, libcudf is not aware of names of + # struct fields, hence renaming the struct fields is + # necessary by extracting the field names from arrow + # struct types. + + # In case of decimal column, libcudf is not aware of the + # decimal precision. 
+ + # In case of list column, there is a possibility of nested + # list columns to have struct or decimal columns inside them. + + # All of these cases are handled by calling the + # _with_type_metadata method on the column. + result[name] = result[name]._with_type_metadata( + cudf.utils.dtypes.cudf_dtype_from_pa_type(data[name].type) + ) return cls._from_data({name: result[name] for name in column_names}) @@ -1160,17 +1166,31 @@ def _positions_from_column_names(self, column_names): if name in set(column_names) ] - def _copy_type_metadata(self: T, other: T) -> T: + def _copy_type_metadata( + self: T, + other: T, + *, + override_dtypes: Optional[abc.Iterable[Optional[Dtype]]] = None, + ) -> T: """ Copy type metadata from each column of `other` to the corresponding column of `self`. + + If override_dtypes is provided, any non-None entry + will be used in preference to the relevant column of other to + provide the new dtype. + See `ColumnBase._with_type_metadata` for more information. """ - for name, col, other_col in zip( - self._data.keys(), self._data.values(), other._data.values() - ): + if override_dtypes is None: + override_dtypes = itertools.repeat(None) + dtypes = ( + dtype if dtype is not None else col.dtype + for (dtype, col) in zip(override_dtypes, other._data.values()) + ) + for (name, col), dtype in zip(self._data.items(), dtypes): self._data.set_by_label( - name, col._with_type_metadata(other_col.dtype), validate=False + name, col._with_type_metadata(dtype), validate=False ) return self diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index d1995615e0c..57a10358561 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -184,7 +184,9 @@ def __init__( # whereas _stop is an upper bound. self._end = self._start + self._step * (len(self._range) - 1) - def _copy_type_metadata(self: RangeIndex, other: RangeIndex) -> RangeIndex: + def _copy_type_metadata( + self: RangeIndex, other: RangeIndex, *, override_dtypes=None + ) -> RangeIndex: # There is no metadata to be copied for RangeIndex since it does not # have an underlying column. return self @@ -978,9 +980,11 @@ def _binaryop( # Override just to make mypy happy. 
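The reworked `Frame._copy_type_metadata` resolves each column's target dtype by preferring a non-None entry of `override_dtypes` and falling back to the dtype of the matching column in `other`. A minimal standalone sketch of that selection rule (the dtype strings here are illustrative):

    import itertools

    def resolve_dtypes(override_dtypes, other_dtypes):
        # override_dtypes=None (no overrides at all) behaves like all-None
        if override_dtypes is None:
            override_dtypes = itertools.repeat(None)
        # a non-None override wins; otherwise keep the source column's dtype
        return [
            ovr if ovr is not None else src
            for ovr, src in zip(override_dtypes, other_dtypes)
        ]

    print(resolve_dtypes(None, ["int64", "object"]))                # ['int64', 'object']
    print(resolve_dtypes([None, "category"], ["int64", "object"]))  # ['int64', 'category']
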
@_cudf_nvtx_annotate def _copy_type_metadata( - self: GenericIndex, other: GenericIndex + self: GenericIndex, other: GenericIndex, *, override_dtypes=None ) -> GenericIndex: - return super()._copy_type_metadata(other) + return super()._copy_type_metadata( + other, override_dtypes=override_dtypes + ) @property # type: ignore @_cudf_nvtx_annotate diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 9bda475589a..30b1bc704c8 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -30,7 +30,12 @@ import cudf import cudf._lib as libcudf -from cudf._typing import ColumnLike, DataFrameOrSeries, NotImplementedType +from cudf._typing import ( + ColumnLike, + DataFrameOrSeries, + Dtype, + NotImplementedType, +) from cudf.api.types import ( _is_non_decimal_numeric_dtype, is_bool_dtype, @@ -45,13 +50,19 @@ from cudf.core._base_index import BaseIndex from cudf.core.column import ColumnBase, as_column, full from cudf.core.column_accessor import ColumnAccessor +from cudf.core.dtypes import ListDtype from cudf.core.frame import Frame from cudf.core.groupby.groupby import GroupBy from cudf.core.index import Index, RangeIndex, _index_from_columns from cudf.core.missing import NA from cudf.core.multiindex import MultiIndex from cudf.core.resample import _Resampler -from cudf.core.udf.utils import _compile_or_get, _supported_cols_from_frame +from cudf.core.udf.utils import ( + _compile_or_get, + _get_input_args_from_frame, + _post_process_output_col, + _return_arr_from_dtype, +) from cudf.utils import docutils from cudf.utils.utils import _cudf_nvtx_annotate @@ -327,18 +338,28 @@ def _from_columns_like_self( columns: List[ColumnBase], column_names: Optional[abc.Iterable[str]] = None, index_names: Optional[List[str]] = None, + *, + override_dtypes: Optional[abc.Iterable[Optional[Dtype]]] = None, ): """Construct a `Frame` from a list of columns with metadata from self. If `index_names` is set, the first `len(index_names)` columns are used to construct the index of the frame. + + If override_dtypes is provided then any non-None entry will be + used for the dtype of the matching column in preference to the + dtype of the column in self. """ if column_names is None: column_names = self._column_names frame = self.__class__._from_columns( columns, column_names, index_names ) - return frame._copy_type_metadata(self, include_index=bool(index_names)) + return frame._copy_type_metadata( + self, + include_index=bool(index_names), + override_dtypes=override_dtypes, + ) def _mimic_inplace( self: T, result: T, inplace: bool = False @@ -899,40 +920,44 @@ def clip(self, lower=None, upper=None, inplace=False, axis=1): return self._mimic_inplace(output, inplace=inplace) def _copy_type_metadata( - self: T, other: T, include_index: bool = True + self: T, + other: T, + include_index: bool = True, + *, + override_dtypes: Optional[abc.Iterable[Optional[Dtype]]] = None, ) -> T: """ Copy type metadata from each column of `other` to the corresponding column of `self`. See `ColumnBase._with_type_metadata` for more information. """ - super()._copy_type_metadata(other) - - if include_index: - if self._index is not None and other._index is not None: - self._index._copy_type_metadata(other._index) - # When other._index is a CategoricalIndex, the current index - # will be a NumericalIndex with an underlying CategoricalColumn - # (the above _copy_type_metadata call will have converted the - # column). 
Calling cudf.Index on that column generates the - # appropriate index. - if isinstance( - other._index, cudf.core.index.CategoricalIndex - ) and not isinstance( - self._index, cudf.core.index.CategoricalIndex - ): - self._index = cudf.Index( - cast( - cudf.core.index.NumericIndex, self._index - )._column, - name=self._index.name, - ) - elif isinstance( - other._index, cudf.MultiIndex - ) and not isinstance(self._index, cudf.MultiIndex): - self._index = cudf.MultiIndex._from_data( - self._index._data, name=self._index.name - ) + super()._copy_type_metadata(other, override_dtypes=override_dtypes) + if ( + include_index + and self._index is not None + and other._index is not None + ): + self._index._copy_type_metadata(other._index) + # When other._index is a CategoricalIndex, the current index + # will be a NumericalIndex with an underlying CategoricalColumn + # (the above _copy_type_metadata call will have converted the + # column). Calling cudf.Index on that column generates the + # appropriate index. + if isinstance( + other._index, cudf.core.index.CategoricalIndex + ) and not isinstance( + self._index, cudf.core.index.CategoricalIndex + ): + self._index = cudf.Index( + cast(cudf.core.index.NumericIndex, self._index)._column, + name=self._index.name, + ) + elif isinstance(other._index, cudf.MultiIndex) and not isinstance( + self._index, cudf.MultiIndex + ): + self._index = cudf.MultiIndex._from_data( + self._index._data, name=self._index.name + ) return self @_cudf_nvtx_annotate @@ -1799,30 +1824,19 @@ def _apply(self, func, kernel_getter, *args, **kwargs): ) from e # Mask and data column preallocated - ans_col = cp.empty(len(self), dtype=retty) + ans_col = _return_arr_from_dtype(retty, len(self)) ans_mask = cudf.core.column.column_empty(len(self), dtype="bool") - launch_args = [(ans_col, ans_mask), len(self)] - offsets = [] - - # if _compile_or_get succeeds, it is safe to create a kernel that only - # consumes the columns that are of supported dtype - for col in _supported_cols_from_frame(self).values(): - data = col.data - mask = col.mask - if mask is None: - launch_args.append(data) - else: - launch_args.append((data, mask)) - offsets.append(col.offset) - launch_args += offsets - launch_args += list(args) + output_args = [(ans_col, ans_mask), len(self)] + input_args = _get_input_args_from_frame(self) + launch_args = output_args + input_args + list(args) try: kernel.forall(len(self))(*launch_args) except Exception as e: raise RuntimeError("UDF kernel execution failed.") from e - col = cudf.core.column.as_column(ans_col) + col = _post_process_output_col(ans_col, retty) + col.set_base_mask(libcudf.transform.bools_to_mask(ans_mask)) result = cudf.Series._from_data({None: col}, self._index) @@ -3476,22 +3490,32 @@ def _explode(self, explode_column: Any, ignore_index: bool): idx = None if ignore_index else self._index.copy(deep=True) return self.__class__._from_data(data, index=idx) - explode_column_num = self._column_names.index(explode_column) + column_index = self._column_names.index(explode_column) if not ignore_index and self._index is not None: - explode_column_num += self._index.nlevels + index_offset = self._index.nlevels + else: + index_offset = 0 exploded = libcudf.lists.explode_outer( [ *(self._index._data.columns if not ignore_index else ()), *self._columns, ], - explode_column_num, + column_index + index_offset, ) - + # We must copy inner datatype of the exploded list column to + # maintain struct dtype key names + exploded_dtype = cast( + ListDtype, 
self._columns[column_index].dtype + ).element_type return self._from_columns_like_self( exploded, self._column_names, self._index_names if not ignore_index else None, + override_dtypes=( + exploded_dtype if i == column_index else None + for i in range(len(self._columns)) + ), ) @_cudf_nvtx_annotate diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 06a2cc33c1f..650d1d0d83a 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -20,7 +20,7 @@ from cudf._typing import DataFrameOrSeries from cudf.api.types import is_integer, is_list_like, is_object_dtype from cudf.core import column -from cudf.core._compat import PANDAS_GE_120 +from cudf.core._compat import PANDAS_GE_120, PANDAS_GE_150 from cudf.core.frame import Frame from cudf.core.index import ( BaseIndex, @@ -451,8 +451,8 @@ def __repr__(self): ) ) - if PANDAS_GE_120: - # TODO: Remove this whole `if` block, + if PANDAS_GE_120 and not PANDAS_GE_150: + # Need this whole `if` block, # this is a workaround for the following issue: # https://github.com/pandas-dev/pandas/issues/39984 preprocess_pdf = pd.DataFrame( @@ -1854,7 +1854,9 @@ def _intersection(self, other, sort=None): return midx @_cudf_nvtx_annotate - def _copy_type_metadata(self: MultiIndex, other: MultiIndex) -> MultiIndex: + def _copy_type_metadata( + self: MultiIndex, other: MultiIndex, *, override_dtypes=None + ) -> MultiIndex: res = super()._copy_type_metadata(other) res._names = other._names return res diff --git a/python/cudf/cudf/core/resample.py b/python/cudf/cudf/core/resample.py index 57630e7d4a9..df901f05787 100644 --- a/python/cudf/cudf/core/resample.py +++ b/python/cudf/cudf/core/resample.py @@ -19,6 +19,7 @@ import cudf import cudf._lib.labeling +import cudf.core.index from cudf._typing import DataFrameOrSeries from cudf.core.groupby.groupby import ( DataFrameGroupBy, @@ -40,7 +41,7 @@ def __init__(self, obj, by, axis=None, kind=None): def agg(self, func): result = super().agg(func) if len(self.grouping.bin_labels) != len(result): - index = cudf.Index( + index = cudf.core.index.Index( self.grouping.bin_labels, name=self.grouping.names[0] ) return result._align_to_index( @@ -92,7 +93,7 @@ class SeriesResampler(_Resampler, SeriesGroupBy): class _ResampleGrouping(_Grouping): - bin_labels: cudf.Index + bin_labels: cudf.core.index.Index def _handle_frequency_grouper(self, by): # if `by` is a time frequency grouper, we bin the key column diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 4ab28cab5a0..e94ca8d653d 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -589,7 +589,7 @@ def from_pandas(cls, s, nan_as_null=None): >>> import pandas as pd >>> import numpy as np >>> data = [10, 20, 30, np.nan] - >>> pds = pd.Series(data) + >>> pds = pd.Series(data, dtype='float64') >>> cudf.Series.from_pandas(pds) 0 10.0 1 20.0 diff --git a/python/cudf/cudf/core/udf/__init__.py b/python/cudf/cudf/core/udf/__init__.py index 4608cae3228..c128bc2436c 100644 --- a/python/cudf/cudf/core/udf/__init__.py +++ b/python/cudf/cudf/core/udf/__init__.py @@ -1 +1,65 @@ -from . import typing, lowering +# Copyright (c) 2022, NVIDIA CORPORATION. +import numpy as np +from numba import cuda, types +from numba.cuda.cudaimpl import ( + lower as cuda_lower, + registry as cuda_lowering_registry, +) + +from cudf.core.dtypes import dtype +from cudf.core.udf import api, row_function, utils +from cudf.utils.dtypes import STRING_TYPES + +from . 
import masked_lowering, masked_typing + +_units = ["ns", "ms", "us", "s"] +_datetime_cases = {types.NPDatetime(u) for u in _units} +_timedelta_cases = {types.NPTimedelta(u) for u in _units} + + +_supported_masked_types = ( + types.integer_domain + | types.real_domain + | _datetime_cases + | _timedelta_cases + | {types.boolean} +) + +_STRING_UDFS_ENABLED = False +try: + import strings_udf + + if strings_udf.ENABLED: + from . import strings_typing # isort: skip + from . import strings_lowering # isort: skip + from strings_udf import ptxpath + from strings_udf._lib.cudf_jit_udf import to_string_view_array + from strings_udf._typing import str_view_arg_handler, string_view + + # add an overload of MaskedType.__init__(string_view, bool) + cuda_lower(api.Masked, strings_typing.string_view, types.boolean)( + masked_lowering.masked_constructor + ) + + # add an overload of pack_return(string_view) + cuda_lower(api.pack_return, strings_typing.string_view)( + masked_lowering.pack_return_scalar_impl + ) + + _supported_masked_types |= {strings_typing.string_view} + utils.launch_arg_getters[dtype("O")] = to_string_view_array + utils.masked_array_types[dtype("O")] = string_view + utils.JIT_SUPPORTED_TYPES |= STRING_TYPES + utils.ptx_files.append(ptxpath) + utils.arg_handlers.append(str_view_arg_handler) + row_function.itemsizes[dtype("O")] = string_view.size_bytes + + _STRING_UDFS_ENABLED = True + else: + del strings_udf + +except ImportError as e: + # allow cuDF to work without strings_udf + pass + +masked_typing.register_masked_constructor(_supported_masked_types) diff --git a/python/cudf/cudf/core/udf/_ops.py b/python/cudf/cudf/core/udf/_ops.py index 7307b43fceb..559a5bfad4f 100644 --- a/python/cudf/cudf/core/udf/_ops.py +++ b/python/cudf/cudf/core/udf/_ops.py @@ -1,3 +1,5 @@ +# Copyright (c) 2021-2022, NVIDIA CORPORATION. + import math import operator diff --git a/python/cudf/cudf/core/udf/api.py b/python/cudf/cudf/core/udf/api.py index 23b4d02c57d..7a68cffeed2 100644 --- a/python/cudf/cudf/core/udf/api.py +++ b/python/cudf/cudf/core/udf/api.py @@ -1,3 +1,6 @@ +# Copyright (c) 2021-2022, NVIDIA CORPORATION. + + class Masked: """ Most of the time, MaskedType as defined in typing.py diff --git a/python/cudf/cudf/core/udf/lowering.py b/python/cudf/cudf/core/udf/masked_lowering.py similarity index 99% rename from python/cudf/cudf/core/udf/lowering.py rename to python/cudf/cudf/core/udf/masked_lowering.py index 7dfe8427bfd..f825b6538bf 100644 --- a/python/cudf/cudf/core/udf/lowering.py +++ b/python/cudf/cudf/core/udf/masked_lowering.py @@ -18,7 +18,7 @@ comparison_ops, unary_ops, ) -from cudf.core.udf.typing import MaskedType, NAType +from cudf.core.udf.masked_typing import MaskedType, NAType @cuda_lowering_registry.lower_constant(NAType) @@ -62,7 +62,6 @@ def masked_scalar_op_impl(context, builder, sig, args): result = cgutils.create_struct_proxy(masked_return_type)( context, builder ) - # compute output validity valid = builder.and_(m1.valid, m2.valid) result.valid = valid diff --git a/python/cudf/cudf/core/udf/typing.py b/python/cudf/cudf/core/udf/masked_typing.py similarity index 85% rename from python/cudf/cudf/core/udf/typing.py rename to python/cudf/cudf/core/udf/masked_typing.py index 073900d115d..a815a9f6dae 100644 --- a/python/cudf/cudf/core/udf/typing.py +++ b/python/cudf/cudf/core/udf/masked_typing.py @@ -1,6 +1,7 @@ # Copyright (c) 2020-2022, NVIDIA CORPORATION. 
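String support in `cudf/core/udf/__init__.py` above is strictly opt-in: the registries consulted by the kernel builder are extended only when the optional `strings_udf` package imports successfully, and cuDF keeps working without it. A stripped-down sketch of that pattern (the registry names here are illustrative, not the actual cuDF internals):

    # Illustrative opt-in plugin pattern: extend shared registries only if
    # the optional extension is importable; otherwise degrade gracefully.
    SUPPORTED_DTYPES = {"int64", "float64", "bool"}
    PTX_FILES = []

    STRING_UDFS_ENABLED = False
    try:
        import strings_udf  # optional compiled extension

        if strings_udf.ENABLED:
            SUPPORTED_DTYPES.add("str")            # dtype becomes JIT-able
            PTX_FILES.append(strings_udf.ptxpath)  # PTX linked into kernels
            STRING_UDFS_ENABLED = True
    except ImportError:
        pass  # no strings_udf installed: string UDFs stay disabled
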
import operator +from typing import Any, Dict from numba import types from numba.core.extending import ( @@ -26,6 +27,12 @@ comparison_ops, unary_ops, ) +from cudf.utils.dtypes import ( + DATETIME_TYPES, + NUMERIC_TYPES, + STRING_TYPES, + TIMEDELTA_TYPES, +) SUPPORTED_NUMBA_TYPES = ( types.Number, @@ -34,29 +41,60 @@ types.NPTimedelta, ) +SUPPORTED_NUMPY_TYPES = ( + NUMERIC_TYPES | DATETIME_TYPES | TIMEDELTA_TYPES | STRING_TYPES +) +supported_type_str = "\n".join(sorted(list(SUPPORTED_NUMPY_TYPES) + ["bool"])) +MASKED_INIT_MAP: Dict[Any, Any] = {} -class MaskedType(types.Type): + +def _format_error_string(err): """ - A Numba type consisting of a value of some primitive type - and a validity boolean, over which we can define math ops + Wrap an error message in newlines and color it red. """ + return "\033[91m" + "\n" + err + "\n" + "\033[0m" - def __init__(self, value): - # MaskedType in Numba shall be parameterized - # with a value type - if isinstance(value, SUPPORTED_NUMBA_TYPES): - self.value_type = value + +def _type_to_masked_type(t): + result = MASKED_INIT_MAP.get(t) + if result is None: + if isinstance(t, SUPPORTED_NUMBA_TYPES): + return t else: # Unsupported Dtype. Numba tends to print out the type info # for whatever operands and operation failed to type and then # output its own error message. Putting the message in the repr # then is one way of getting the true cause to the user - self.value_type = types.Poison( - "\n\n\n Unsupported MaskedType. This is usually caused by " + err = _format_error_string( + "Unsupported MaskedType. This is usually caused by " "attempting to use a column of unsupported dtype in a UDF. " - f"Supported dtypes are {SUPPORTED_NUMBA_TYPES}" + f"Supported dtypes are:\n{supported_type_str}" ) - super().__init__(name=f"Masked{self.value_type}") + return types.Poison(err) + else: + return result + + +MASKED_INIT_MAP[types.pyobject] = types.Poison( + _format_error_string( + "strings_udf library required for usage of string dtypes " + "inside user defined functions." + ) +) + + +# Masked scalars of all types +class MaskedType(types.Type): + """ + A Numba type consisting of a value of some primitive type + and a validity boolean, over which we can define math ops + """ + + def __init__(self, value): + # MaskedType in Numba shall be parameterized + # with a value type + self.value_type = _type_to_masked_type(value) + super().__init__(name=f"Masked({self.value_type})") def __hash__(self): """ @@ -131,44 +169,35 @@ def typeof_masked(val, c): # Implemented typing for Masked(value, valid) - the construction of a Masked # type in a kernel. -@cuda_decl_registry.register -class MaskedConstructor(ConcreteTemplate): - key = api.Masked - units = ["ns", "ms", "us", "s"] - datetime_cases = {types.NPDatetime(u) for u in units} - timedelta_cases = {types.NPTimedelta(u) for u in units} - cases = [ - nb_signature(MaskedType(t), t, types.boolean) - for t in ( - types.integer_domain - | types.real_domain - | datetime_cases - | timedelta_cases - | {types.boolean} - ) - ] +def register_masked_constructor(supported_masked_types): + class MaskedConstructor(ConcreteTemplate): + key = api.Masked + cases = [ + nb_signature(MaskedType(t), t, types.boolean) + for t in supported_masked_types + ] + cuda_decl_registry.register(MaskedConstructor) -# Provide access to `m.value` and `m.valid` in a kernel for a Masked `m`. 
-make_attribute_wrapper(MaskedType, "value", "value") -make_attribute_wrapper(MaskedType, "valid", "valid") - + # Typing for `api.Masked` + @cuda_decl_registry.register_attr + class ClassesTemplate(AttributeTemplate): + key = types.Module(api) -# Typing for `api.Masked` -@cuda_decl_registry.register_attr -class ClassesTemplate(AttributeTemplate): - key = types.Module(api) + def resolve_Masked(self, mod): + return types.Function(MaskedConstructor) - def resolve_Masked(self, mod): - return types.Function(MaskedConstructor) + # Registration of the global is also needed for Numba to type api.Masked + cuda_decl_registry.register_global(api, types.Module(api)) + # For typing bare Masked (as in `from .api import Masked`) + cuda_decl_registry.register_global( + api.Masked, types.Function(MaskedConstructor) + ) -# Registration of the global is also needed for Numba to type api.Masked -cuda_decl_registry.register_global(api, types.Module(api)) -# For typing bare Masked (as in `from .api import Masked` -cuda_decl_registry.register_global( - api.Masked, types.Function(MaskedConstructor) -) +# Provide access to `m.value` and `m.valid` in a kernel for a Masked `m`. +make_attribute_wrapper(MaskedType, "value", "value") +make_attribute_wrapper(MaskedType, "valid", "valid") # Tell numba how `MaskedType` is constructed on the backend in terms diff --git a/python/cudf/cudf/core/udf/row_function.py b/python/cudf/cudf/core/udf/row_function.py index 1d0bd5ac99d..8d887a37706 100644 --- a/python/cudf/cudf/core/udf/row_function.py +++ b/python/cudf/cudf/core/udf/row_function.py @@ -1,5 +1,6 @@ # Copyright (c) 2021-2022, NVIDIA CORPORATION. import math +from typing import Any, Dict import numpy as np from numba import cuda @@ -7,13 +8,13 @@ from numba.types import Record from cudf.core.udf.api import Masked, pack_return +from cudf.core.udf.masked_typing import MaskedType from cudf.core.udf.templates import ( masked_input_initializer_template, row_initializer_template, row_kernel_template, unmasked_input_initializer_template, ) -from cudf.core.udf.typing import MaskedType from cudf.core.udf.utils import ( _all_dtypes_from_frame, _construct_signature, @@ -24,6 +25,8 @@ _supported_dtypes_from_frame, ) +itemsizes: Dict[Any, int] = {} + def _get_frame_row_type(dtype): """ @@ -31,12 +34,10 @@ Models each column and its mask as a MaskedType and models the row as a dictionary like data structure containing these MaskedTypes. - Large parts of this function are copied with comments from the Numba internals and slightly modified to account for validity bools to be present in the final struct. - See numba.np.numpy_support.from_struct_dtype for details. @@ -45,7 +46,9 @@ fields = [] offset = 0 - sizes = [val[0].itemsize for val in dtype.fields.values()] + sizes = [ + itemsizes.get(val[0], val[0].itemsize) for val in dtype.fields.values() + ] for i, (name, info) in enumerate(dtype.fields.items()): # *info* consists of the element dtype, its offset from the beginning # of the record, and an optional "title" containing metadata. 
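The `itemsizes` lookup added to `_get_frame_row_type`, together with the offset arithmetic in the next hunk, means each field of the packed row struct contributes its item size plus one validity byte, with the running offset then aligned for the following member. A simplified model of that layout rule (the fixed 8-byte alignment is illustrative; the real code aligns per the PTX ISA):

    def row_layout(field_sizes, align=8):
        # Each field: value bytes + 1 validity byte, then round the running
        # offset up to the alignment boundary for the next member.
        offsets, offset = [], 0
        for size in field_sizes:
            offsets.append(offset)
            offset += size + 1
            offset = -(-offset // align) * align  # ceil to a multiple of align
        return offsets, offset

    print(row_layout([8, 4]))  # int64 then float32 field -> ([0, 16], 24)
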
@@ -62,7 +65,8 @@ def _get_frame_row_type(dtype): fields.append((name, infos)) # increment offset by itemsize plus one byte for validity - offset += elemdtype.itemsize + 1 + itemsize = itemsizes.get(elemdtype, elemdtype.itemsize) + offset += itemsize + 1 # Align the next member of the struct to be a multiple of the # memory access size, per PTX ISA 7.4/5.4.5 @@ -127,10 +131,8 @@ def _get_row_kernel(frame, func, args): np.dtype(list(_all_dtypes_from_frame(frame).items())) ) scalar_return_type = _get_udf_return_type(row_type, func, args) - # this is the signature for the final full kernel compilation sig = _construct_signature(frame, scalar_return_type, args) - # this row type is used within the kernel to pack up the column and # mask data into the dict like data structure the user udf expects np_field_types = np.dtype( diff --git a/python/cudf/cudf/core/udf/scalar_function.py b/python/cudf/cudf/core/udf/scalar_function.py index a7b887dd2d5..31599f4151e 100644 --- a/python/cudf/cudf/core/udf/scalar_function.py +++ b/python/cudf/cudf/core/udf/scalar_function.py @@ -4,12 +4,12 @@ from numba.np import numpy_support from cudf.core.udf.api import Masked, pack_return +from cudf.core.udf.masked_typing import MaskedType from cudf.core.udf.templates import ( masked_input_initializer_template, scalar_kernel_template, unmasked_input_initializer_template, ) -from cudf.core.udf.typing import MaskedType from cudf.core.udf.utils import ( _construct_signature, _get_kernel, diff --git a/python/cudf/cudf/core/udf/strings_lowering.py b/python/cudf/cudf/core/udf/strings_lowering.py new file mode 100644 index 00000000000..5b69d1a9da3 --- /dev/null +++ b/python/cudf/cudf/core/udf/strings_lowering.py @@ -0,0 +1,125 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. + +import operator + +from numba import types +from numba.core import cgutils +from numba.core.typing import signature as nb_signature +from numba.cuda.cudaimpl import lower as cuda_lower + +from strings_udf._typing import size_type, string_view +from strings_udf.lowering import ( + contains_impl, + count_impl, + endswith_impl, + find_impl, + isalnum_impl, + isalpha_impl, + isdecimal_impl, + isdigit_impl, + islower_impl, + isspace_impl, + isupper_impl, + len_impl, + rfind_impl, + startswith_impl, +) + +from cudf.core.udf.masked_typing import MaskedType + + +@cuda_lower(len, MaskedType(string_view)) +def masked_len_impl(context, builder, sig, args): + ret = cgutils.create_struct_proxy(sig.return_type)(context, builder) + masked_sv_ty = sig.args[0] + masked_sv = cgutils.create_struct_proxy(masked_sv_ty)( + context, builder, value=args[0] + ) + result = len_impl( + context, builder, size_type(string_view), (masked_sv.value,) + ) + ret.value = result + ret.valid = masked_sv.valid + + return ret._getvalue() + + +def create_binary_string_func(op, cuda_func, retty): + """ + Provide a wrapper around numba's low-level extension API which + produces the boilerplate needed to implement a binary function + of two masked strings. 
+ """ + + def masked_binary_func_impl(context, builder, sig, args): + ret = cgutils.create_struct_proxy(sig.return_type)(context, builder) + + lhs_masked = cgutils.create_struct_proxy(sig.args[0])( + context, builder, value=args[0] + ) + rhs_masked = cgutils.create_struct_proxy(sig.args[0])( + context, builder, value=args[1] + ) + + result = cuda_func( + context, + builder, + nb_signature(retty, string_view, string_view), + (lhs_masked.value, rhs_masked.value), + ) + + ret.value = result + ret.valid = builder.and_(lhs_masked.valid, rhs_masked.valid) + + return ret._getvalue() + + cuda_lower(op, MaskedType(string_view), MaskedType(string_view))( + masked_binary_func_impl + ) + + +create_binary_string_func( + "MaskedType.startswith", + startswith_impl, + types.boolean, +) +create_binary_string_func("MaskedType.endswith", endswith_impl, types.boolean) +create_binary_string_func("MaskedType.find", find_impl, size_type) +create_binary_string_func("MaskedType.rfind", rfind_impl, size_type) +create_binary_string_func("MaskedType.count", count_impl, size_type) +create_binary_string_func(operator.contains, contains_impl, types.boolean) + + +def create_masked_unary_identifier_func(op, cuda_func): + """ + Provide a wrapper around numba's low-level extension API which + produces the boilerplate needed to implement a unary function + of a masked string. + """ + + def masked_unary_func_impl(context, builder, sig, args): + ret = cgutils.create_struct_proxy(sig.return_type)(context, builder) + masked_str = cgutils.create_struct_proxy(sig.args[0])( + context, builder, value=args[0] + ) + + result = cuda_func( + context, + builder, + types.boolean(string_view, string_view), + (masked_str.value,), + ) + ret.value = result + ret.valid = masked_str.valid + return ret._getvalue() + + cuda_lower(op, MaskedType(string_view))(masked_unary_func_impl) + + +create_masked_unary_identifier_func("MaskedType.isalnum", isalnum_impl) +create_masked_unary_identifier_func("MaskedType.isalpha", isalpha_impl) +create_masked_unary_identifier_func("MaskedType.isdigit", isdigit_impl) +create_masked_unary_identifier_func("MaskedType.isupper", isupper_impl) +create_masked_unary_identifier_func("MaskedType.islower", islower_impl) +create_masked_unary_identifier_func("MaskedType.isspace", isspace_impl) +create_masked_unary_identifier_func("MaskedType.isdecimal", isdecimal_impl) diff --git a/python/cudf/cudf/core/udf/strings_typing.py b/python/cudf/cudf/core/udf/strings_typing.py new file mode 100644 index 00000000000..1179688651f --- /dev/null +++ b/python/cudf/cudf/core/udf/strings_typing.py @@ -0,0 +1,182 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. 
+ +import operator + +from numba import types +from numba.core.typing import signature as nb_signature +from numba.core.typing.templates import AbstractTemplate, AttributeTemplate +from numba.cuda.cudadecl import registry as cuda_decl_registry + +from strings_udf._typing import ( + StringView, + bool_binary_funcs, + id_unary_funcs, + int_binary_funcs, + size_type, + string_view, +) + +from cudf.core.udf import masked_typing +from cudf.core.udf._ops import comparison_ops +from cudf.core.udf.masked_typing import MaskedType + +masked_typing.MASKED_INIT_MAP[types.pyobject] = string_view +masked_typing.MASKED_INIT_MAP[string_view] = string_view + + +def _is_valid_string_arg(ty): + return ( + isinstance(ty, MaskedType) and isinstance(ty.value_type, StringView) + ) or isinstance(ty, types.StringLiteral) + + +def register_string_function(func): + """ + Helper function wrapping numba's low level extension API. Provides + the boilerplate needed to associate a signature with a function or + operator to be overloaded. + """ + + def deco(generic): + class MaskedStringFunction(AbstractTemplate): + pass + + MaskedStringFunction.generic = generic + cuda_decl_registry.register_global(func)(MaskedStringFunction) + + return deco + + +@register_string_function(len) +def len_typing(self, args, kws): + if isinstance(args[0], MaskedType) and isinstance( + args[0].value_type, StringView + ): + return nb_signature(MaskedType(size_type), args[0]) + elif isinstance(args[0], types.StringLiteral) and len(args) == 1: + return nb_signature(size_type, args[0]) + + +@register_string_function(operator.contains) +def contains_typing(self, args, kws): + if _is_valid_string_arg(args[0]) and _is_valid_string_arg(args[1]): + return nb_signature( + MaskedType(types.boolean), + MaskedType(string_view), + MaskedType(string_view), + ) + + +class MaskedStringViewCmpOp(AbstractTemplate): + """ + return the boolean result of `cmpop` between two strings; + since the typing is the same for every comparison operator, + we can reuse this class for all of them. + """ + + def generic(self, args, kws): + if _is_valid_string_arg(args[0]) and _is_valid_string_arg(args[1]): + return nb_signature( + MaskedType(types.boolean), + MaskedType(string_view), + MaskedType(string_view), + ) + + +for op in comparison_ops: + cuda_decl_registry.register_global(op)(MaskedStringViewCmpOp) + + +def create_masked_binary_attr(attrname, retty): + """ + Helper function wrapping numba's low level extension API. Provides + the boilerplate needed to register a binary function of two masked + string objects as an attribute of one, e.g. `string.func(other)`. + """ + + class MaskedStringViewBinaryAttr(AbstractTemplate): + key = attrname + + def generic(self, args, kws): + return nb_signature( + MaskedType(retty), MaskedType(string_view), recvr=self.this + ) + + def attr(self, mod): + return types.BoundFunction( + MaskedStringViewBinaryAttr, + MaskedType(string_view), + ) + + return attr + + +def create_masked_identifier_attr(attrname): + """ + Helper function wrapping numba's low level extension API. Provides + the boilerplate needed to register a unary function of a masked + string object as an attribute, e.g. `string.func()`. 
+ """ + + class MaskedStringViewIdentifierAttr(AbstractTemplate): + key = attrname + + def generic(self, args, kws): + return nb_signature(MaskedType(types.boolean), recvr=self.this) + + def attr(self, mod): + return types.BoundFunction( + MaskedStringViewIdentifierAttr, + MaskedType(string_view), + ) + + return attr + + +class MaskedStringViewCount(AbstractTemplate): + key = "MaskedType.count" + + def generic(self, args, kws): + return nb_signature( + MaskedType(size_type), MaskedType(string_view), recvr=self.this + ) + + +class MaskedStringViewAttrs(AttributeTemplate): + key = MaskedType(string_view) + + def resolve_count(self, mod): + return types.BoundFunction( + MaskedStringViewCount, MaskedType(string_view) + ) + + def resolve_value(self, mod): + return string_view + + def resolve_valid(self, mod): + return types.boolean + + +# Build attributes for `MaskedType(string_view)` +for func in bool_binary_funcs: + setattr( + MaskedStringViewAttrs, + f"resolve_{func}", + create_masked_binary_attr(f"MaskedType.{func}", types.boolean), + ) + +for func in int_binary_funcs: + setattr( + MaskedStringViewAttrs, + f"resolve_{func}", + create_masked_binary_attr(f"MaskedType.{func}", size_type), + ) + +for func in id_unary_funcs: + setattr( + MaskedStringViewAttrs, + f"resolve_{func}", + create_masked_identifier_attr(f"MaskedType.{func}"), + ) + +cuda_decl_registry.register_attr(MaskedStringViewAttrs) diff --git a/python/cudf/cudf/core/udf/utils.py b/python/cudf/cudf/core/udf/utils.py index 5e46c6d0d77..fa79088046c 100644 --- a/python/cudf/cudf/core/udf/utils.py +++ b/python/cudf/cudf/core/udf/utils.py @@ -1,16 +1,18 @@ # Copyright (c) 2020-2022, NVIDIA CORPORATION. -from typing import Callable +from typing import Any, Callable, Dict, List import cachetools +import cupy as cp import numpy as np from numba import cuda, typeof from numba.core.errors import TypingError from numba.np import numpy_support -from numba.types import Poison, Tuple, boolean, int64, void +from numba.types import CPointer, Poison, Tuple, boolean, int64, void +from cudf.core.column.column import as_column from cudf.core.dtypes import CategoricalDtype -from cudf.core.udf.typing import MaskedType +from cudf.core.udf.masked_typing import MaskedType from cudf.utils import cudautils from cudf.utils.dtypes import ( BOOL_TYPES, @@ -23,11 +25,12 @@ JIT_SUPPORTED_TYPES = ( NUMERIC_TYPES | BOOL_TYPES | DATETIME_TYPES | TIMEDELTA_TYPES ) - libcudf_bitmask_type = numpy_support.from_dtype(np.dtype("int32")) MASK_BITSIZE = np.dtype("int32").itemsize * 8 precompiled: cachetools.LRUCache = cachetools.LRUCache(maxsize=32) +arg_handlers: List[Any] = [] +ptx_files: List[Any] = [] @_cudf_nvtx_annotate @@ -109,6 +112,9 @@ def _supported_cols_from_frame(frame): } +masked_array_types: Dict[Any, Any] = {} + + def _masked_array_type_from_col(col): """ Return a type representing a tuple of arrays, @@ -116,11 +122,18 @@ def _masked_array_type_from_col(col): corresponding to `dtype`, and the second an array of bools representing a mask. 
""" - nb_scalar_ty = numpy_support.from_dtype(col.dtype) + + col_type = masked_array_types.get(col.dtype) + if col_type: + col_type = CPointer(col_type) + else: + nb_scalar_ty = numpy_support.from_dtype(col.dtype) + col_type = nb_scalar_ty[::1] + if col.mask is None: - return nb_scalar_ty[::1] + return col_type else: - return Tuple((nb_scalar_ty[::1], libcudf_bitmask_type[::1])) + return Tuple((col_type, libcudf_bitmask_type[::1])) def _construct_signature(frame, return_type, args): @@ -200,7 +213,6 @@ def _compile_or_get(frame, func, args, kernel_getter=None): # could be a MaskedType or a scalar type. kernel, scalar_return_type = kernel_getter(frame, func, args) - np_return_type = numpy_support.as_dtype(scalar_return_type) precompiled[cache_key] = (kernel, np_return_type) @@ -213,6 +225,37 @@ def _get_kernel(kernel_string, globals_, sig, func): globals_["f_"] = f_ exec(kernel_string, globals_) _kernel = globals_["_kernel"] - kernel = cuda.jit(sig)(_kernel) + kernel = cuda.jit(sig, link=ptx_files, extensions=arg_handlers)(_kernel) return kernel + + +launch_arg_getters: Dict[Any, Any] = {} + + +def _get_input_args_from_frame(fr): + args = [] + offsets = [] + for col in _supported_cols_from_frame(fr).values(): + getter = launch_arg_getters.get(col.dtype) + if getter: + data = getter(col) + else: + data = col.data + if col.mask is not None: + # argument is a tuple of data, mask + args.append((data, col.mask)) + else: + # argument is just the data pointer + args.append(data) + offsets.append(col.offset) + + return args + offsets + + +def _return_arr_from_dtype(dt, size): + return cp.empty(size, dtype=dt) + + +def _post_process_output_col(col, retty): + return as_column(col, retty) diff --git a/python/cudf/cudf/core/window/__init__.py b/python/cudf/cudf/core/window/__init__.py index 76e034e4a5b..8ea3eb0179b 100644 --- a/python/cudf/cudf/core/window/__init__.py +++ b/python/cudf/cudf/core/window/__init__.py @@ -1 +1,3 @@ +# Copyright (c) 2019-2022, NVIDIA CORPORATION + from cudf.core.window.rolling import Rolling diff --git a/python/cudf/cudf/core/window/rolling.py b/python/cudf/cudf/core/window/rolling.py index 8d6d0171ee7..fb1cafa5625 100644 --- a/python/cudf/cudf/core/window/rolling.py +++ b/python/cudf/cudf/core/window/rolling.py @@ -10,6 +10,7 @@ from cudf import _lib as libcudf from cudf.api.types import is_integer, is_number from cudf.core import column +from cudf.core._compat import PANDAS_GE_150 from cudf.core.column.column import as_column from cudf.core.mixins import Reducible from cudf.utils import cudautils @@ -215,12 +216,21 @@ def _apply_agg_column(self, source_column, agg_name): following_window = None window = self.window elif isinstance(self.window, BaseIndexer): - start, end = self.window.get_window_bounds( - num_values=len(self.obj), - min_periods=self.min_periods, - center=self.center, - closed=None, - ) + if PANDAS_GE_150: + start, end = self.window.get_window_bounds( + num_values=len(self.obj), + min_periods=self.min_periods, + center=self.center, + closed=None, + step=None, + ) + else: + start, end = self.window.get_window_bounds( + num_values=len(self.obj), + min_periods=self.min_periods, + center=self.center, + closed=None, + ) start = as_column(start, dtype="int32") end = as_column(end, dtype="int32") diff --git a/python/cudf/cudf/io/__init__.py b/python/cudf/cudf/io/__init__.py index 4ec84ecbc74..6d4b44d5ecc 100644 --- a/python/cudf/cudf/io/__init__.py +++ b/python/cudf/cudf/io/__init__.py @@ -7,9 +7,9 @@ from cudf.io.json import read_json from cudf.io.orc import 
read_orc, read_orc_metadata, to_orc from cudf.io.parquet import ( + ParquetDatasetWriter, merge_parquet_filemetadata, read_parquet, - ParquetDatasetWriter, read_parquet_metadata, write_to_dataset, ) diff --git a/python/cudf/cudf/tests/data/orc/TestOrcFile.apache_timestamp.orc b/python/cudf/cudf/tests/data/orc/TestOrcFile.apache_timestamp.orc new file mode 100644 index 00000000000..dd51856c3f7 Binary files /dev/null and b/python/cudf/cudf/tests/data/orc/TestOrcFile.apache_timestamp.orc differ diff --git a/python/cudf/cudf/tests/test_apply_rows.py b/python/cudf/cudf/tests/test_apply_rows.py index f025549971f..8870eb421c7 100644 --- a/python/cudf/cudf/tests/test_apply_rows.py +++ b/python/cudf/cudf/tests/test_apply_rows.py @@ -1,3 +1,5 @@ +# Copyright (c) 2019-2022, NVIDIA CORPORATION. + import pytest import cudf diff --git a/python/cudf/cudf/tests/test_array_ufunc.py b/python/cudf/cudf/tests/test_array_ufunc.py index 3ff5210ed94..b3be097b878 100644 --- a/python/cudf/cudf/tests/test_array_ufunc.py +++ b/python/cudf/cudf/tests/test_array_ufunc.py @@ -10,6 +10,7 @@ import pytest import cudf +from cudf.core._compat import PANDAS_GE_150 from cudf.testing._utils import assert_eq, set_random_null_mask_inplace _UFUNCS = [ @@ -84,14 +85,19 @@ def test_ufunc_index(ufunc): assert_eq(g, e, check_exact=False) else: assert_eq(got, expect, check_exact=False) - except AssertionError: + except AssertionError as e: # TODO: This branch can be removed when # https://github.com/rapidsai/cudf/issues/10178 is resolved if fname in ("power", "float_power"): if (got - expect).abs().max() == 1: pytest.xfail("https://github.com/rapidsai/cudf/issues/10178") elif fname in ("bitwise_and", "bitwise_or", "bitwise_xor"): - pytest.xfail("https://github.com/pandas-dev/pandas/issues/46769") + if PANDAS_GE_150: + raise e + else: + pytest.xfail( + "https://github.com/pandas-dev/pandas/issues/46769" + ) raise diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py index c1a08e507b3..2229bcc1938 100644 --- a/python/cudf/cudf/tests/test_binops.py +++ b/python/cudf/cudf/tests/test_binops.py @@ -13,6 +13,7 @@ import cudf from cudf import Series +from cudf.core._compat import PANDAS_GE_150 from cudf.core.index import as_index from cudf.testing import _utils as utils from cudf.utils.dtypes import ( @@ -768,7 +769,7 @@ def test_operator_func_between_series_logical( @pytest.mark.parametrize("func", _operators_comparison) @pytest.mark.parametrize("has_nulls", [True, False]) @pytest.mark.parametrize("scalar", [-59.0, np.nan, 0, 59.0]) -@pytest.mark.parametrize("fill_value", [None, True, False, 1.0]) +@pytest.mark.parametrize("fill_value", [None, 1.0]) @pytest.mark.parametrize("use_cudf_scalar", [False, True]) def test_operator_func_series_and_scalar_logical( dtype, func, has_nulls, scalar, fill_value, use_cudf_scalar @@ -1561,7 +1562,8 @@ def test_scalar_null_binops(op, dtype_l, dtype_r): pytest.param( "nanoseconds", marks=pytest.mark.xfail( - reason="https://github.com/pandas-dev/pandas/issues/36589" + condition=not PANDAS_GE_150, + reason="https://github.com/pandas-dev/pandas/issues/36589", ), ), ], @@ -1668,7 +1670,8 @@ def test_datetime_dateoffset_binaryop_multiple(date_col, kwargs, op): pytest.param( "nanoseconds", marks=pytest.mark.xfail( - reason="https://github.com/pandas-dev/pandas/issues/36589" + condition=not PANDAS_GE_150, + reason="https://github.com/pandas-dev/pandas/issues/36589", ), ), ], diff --git 
a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py index df18dbb291e..46998c6830a 100644 --- a/python/cudf/cudf/tests/test_categorical.py +++ b/python/cudf/cudf/tests/test_categorical.py @@ -414,7 +414,7 @@ def test_categorical_as_unordered(pd_str_cat, inplace): pytest.param( True, marks=pytest.mark.skipif( - not PANDAS_GE_134, + condition=not PANDAS_GE_134, reason="https://github.com/pandas-dev/pandas/issues/43232", ), ), @@ -454,7 +454,7 @@ def test_categorical_reorder_categories( pytest.param( True, marks=pytest.mark.skipif( - not PANDAS_GE_134, + condition=not PANDAS_GE_134, reason="https://github.com/pandas-dev/pandas/issues/43232", ), ), @@ -491,7 +491,7 @@ def test_categorical_add_categories(pd_str_cat, inplace): pytest.param( True, marks=pytest.mark.skipif( - not PANDAS_GE_134, + condition=not PANDAS_GE_134, reason="https://github.com/pandas-dev/pandas/issues/43232", ), ), diff --git a/python/cudf/cudf/tests/test_concat.py b/python/cudf/cudf/tests/test_concat.py index bf1e9de5d1a..8f6dce4828a 100644 --- a/python/cudf/cudf/tests/test_concat.py +++ b/python/cudf/cudf/tests/test_concat.py @@ -9,7 +9,7 @@ import cudf as gd from cudf.api.types import is_categorical_dtype -from cudf.core._compat import PANDAS_LT_140 +from cudf.core._compat import PANDAS_GE_150, PANDAS_LT_140 from cudf.core.dtypes import Decimal32Dtype, Decimal64Dtype, Decimal128Dtype from cudf.testing._utils import assert_eq, assert_exceptions_equal @@ -811,10 +811,13 @@ def test_concat_join_axis_1(objs, ignore_index, sort, join, axis): axis=axis, ) - # TODO: Remove special handling of check_index_type below - # after the following bug from pandas is fixed: - # https://github.com/pandas-dev/pandas/issues/47501 - assert_eq(expected, actual, check_index_type=not (axis == 1 and sort)) + if PANDAS_GE_150: + assert_eq(expected, actual, check_index_type=True) + else: + # special handling of check_index_type below + # required because: + # https://github.com/pandas-dev/pandas/issues/47501 + assert_eq(expected, actual, check_index_type=not (axis == 1 and sort)) @pytest.mark.parametrize("ignore_index", [True, False]) @@ -881,10 +884,13 @@ def test_concat_join_one_df(ignore_index, sort, join, axis): [gdf1], sort=sort, join=join, ignore_index=ignore_index, axis=axis ) - # TODO: Remove special handling of check_index_type below - # after the following bug from pandas is fixed: - # https://github.com/pandas-dev/pandas/issues/47501 - assert_eq(expected, actual, check_index_type=not (axis == 1 and sort)) + if PANDAS_GE_150: + assert_eq(expected, actual, check_index_type=True) + else: + # special handling of check_index_type below + # required because: + # https://github.com/pandas-dev/pandas/issues/47501 + assert_eq(expected, actual, check_index_type=not (axis == 1 and sort)) @pytest.mark.parametrize( @@ -933,10 +939,13 @@ def test_concat_join_no_overlapping_columns( axis=axis, ) - # TODO: Remove special handling of check_index_type below - # after the following bug from pandas is fixed: - # https://github.com/pandas-dev/pandas/issues/47501 - assert_eq(expected, actual, check_index_type=not (axis == 1 and sort)) + if PANDAS_GE_150: + assert_eq(expected, actual, check_index_type=True) + else: + # special handling of check_index_type below + # required because: + # https://github.com/pandas-dev/pandas/issues/47501 + assert_eq(expected, actual, check_index_type=not (axis == 1 and sort)) 
@pytest.mark.parametrize("ignore_index", [False, True]) @@ -1124,15 +1133,21 @@ def test_concat_join_series(ignore_index, sort, join, axis): axis=axis, ) - # TODO: Remove special handling of check_index_type below - # after the following bugs from pandas are fixed: - # https://github.com/pandas-dev/pandas/issues/46675 - # https://github.com/pandas-dev/pandas/issues/47501 - assert_eq( - expected, - actual, - check_index_type=(axis == 0), - ) + if PANDAS_GE_150: + assert_eq( + expected, + actual, + check_index_type=True, + ) + else: + # special handling of check_index_type required below: + # https://github.com/pandas-dev/pandas/issues/46675 + # https://github.com/pandas-dev/pandas/issues/47501 + assert_eq( + expected, + actual, + check_index_type=(axis == 0), + ) @pytest.mark.parametrize( @@ -1299,7 +1314,8 @@ def test_concat_join_empty_dataframes( pytest.param( "outer", marks=pytest.mark.xfail( - reason="https://github.com/pandas-dev/pandas/issues/37937" + condition=not PANDAS_GE_150, + reason="https://github.com/pandas-dev/pandas/issues/37937", ), ), ], diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index af719958c1a..d9e9a4dbba1 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -3106,7 +3106,7 @@ def test_to_frame(pdf, gdf): gdf_new_name = gdf.x.to_frame(name=name) pdf_new_name = pdf.x.to_frame(name=name) assert_eq(gdf_new_name, pdf_new_name) - assert gdf_new_name.columns[0] is name + assert gdf_new_name.columns[0] == np.bool(name) def test_dataframe_empty_sort_index(): @@ -4424,8 +4424,8 @@ def test_isin_dataframe(data, values): except ValueError as e: if str(e) == "Lengths must match.": pytest.xfail( - not PANDAS_GE_110, - "https://github.com/pandas-dev/pandas/issues/34256", + condition=not PANDAS_GE_110, + reason="https://github.com/pandas-dev/pandas/issues/34256", ) except TypeError as e: # Can't do isin with different categories @@ -9549,3 +9549,23 @@ def test_non_string_column_name_to_arrow(data): actual = pa.Table.from_pandas(df.to_pandas()) assert expected.equals(actual) + + +def test_complex_types_from_arrow(): + + expected = pa.Table.from_arrays( + [ + pa.array([1, 2, 3]), + pa.array([10, 20, 30]), + pa.array([{"a": 9}, {"b": 10}, {"c": 11}]), + pa.array([[{"a": 1}], [{"b": 2}], [{"c": 3}]]), + pa.array([10, 11, 12]).cast(pa.decimal128(21, 2)), + pa.array([{"a": 9}, {"b": 10, "c": {"g": 43}}, {"c": {"a": 10}}]), + ], + names=["a", "b", "c", "d", "e", "f"], + ) + + df = cudf.DataFrame.from_arrow(expected) + actual = df.to_arrow() + + assert expected.equals(actual) diff --git a/python/cudf/cudf/tests/test_datasets.py b/python/cudf/cudf/tests/test_datasets.py index c9f07eab5dd..98f801d0cba 100644 --- a/python/cudf/cudf/tests/test_datasets.py +++ b/python/cudf/cudf/tests/test_datasets.py @@ -1,3 +1,5 @@ +# Copyright (c) 2019-2022, NVIDIA CORPORATION. 
+ import numpy as np import cudf as gd diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index 04ff5b88214..800a8aeeab5 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -657,7 +657,12 @@ def test_to_datetime_errors(data): gd_data = pd_data assert_exceptions_equal( - pd.to_datetime, cudf.to_datetime, ([pd_data],), ([gd_data],) + pd.to_datetime, + cudf.to_datetime, + ([pd_data],), + ([gd_data],), + compare_error_message=False, + expected_error_message="Given date string not likely a datetime.", ) diff --git a/python/cudf/cudf/tests/test_df_protocol.py b/python/cudf/cudf/tests/test_df_protocol.py index c88b6ac9228..7b83eec9b63 100644 --- a/python/cudf/cudf/tests/test_df_protocol.py +++ b/python/cudf/cudf/tests/test_df_protocol.py @@ -124,6 +124,9 @@ def test_from_dataframe(): df2 = cudf.from_dataframe(df1) assert_eq(df1, df2) + df3 = cudf.from_dataframe(df2) + assert_eq(df1, df3) + def test_int_dtype(): data_int = dict(a=[1, 2, 3], b=[9, 10, 11]) diff --git a/python/cudf/cudf/tests/test_dtypes.py b/python/cudf/cudf/tests/test_dtypes.py index 811cae929d8..2f8e1ac5c2f 100644 --- a/python/cudf/cudf/tests/test_dtypes.py +++ b/python/cudf/cudf/tests/test_dtypes.py @@ -6,7 +6,7 @@ import pytest import cudf -from cudf.core._compat import PANDAS_GE_130 +from cudf.core._compat import PANDAS_GE_130, PANDAS_GE_150 from cudf.core.column import ColumnBase from cudf.core.dtypes import ( CategoricalDtype, @@ -20,6 +20,11 @@ from cudf.testing._utils import assert_eq from cudf.utils.dtypes import np_to_pa_dtype +if PANDAS_GE_150: + from pandas.core.arrays.arrow.extension_types import ArrowIntervalType +else: + from pandas.core.arrays._arrow_utils import ArrowIntervalType + def test_cdt_basic(): psr = pd.Series(["a", "b", "a", "c"], dtype="category") @@ -176,7 +181,7 @@ def closed(request): def test_interval_dtype_pyarrow_round_trip(subtype, closed): - pa_array = pd.core.arrays._arrow_utils.ArrowIntervalType(subtype, closed) + pa_array = ArrowIntervalType(subtype, closed) expect = pa_array got = IntervalDtype.from_arrow(expect).to_arrow() assert expect.equals(got) diff --git a/python/cudf/cudf/tests/test_extension_compilation.py b/python/cudf/cudf/tests/test_extension_compilation.py index 692f40873d7..f1ed17c5df5 100644 --- a/python/cudf/cudf/tests/test_extension_compilation.py +++ b/python/cudf/cudf/tests/test_extension_compilation.py @@ -10,7 +10,7 @@ from cudf import NA from cudf.core.udf.api import Masked -from cudf.core.udf.typing import MaskedType +from cudf.core.udf.masked_typing import MaskedType from cudf.testing._utils import parametrize_numeric_dtypes_pairwise arith_ops = ( diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 911b1d5443e..c4c8e81dda2 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -699,7 +699,8 @@ def test_advanced_groupby_levels(): pytest.param( lambda df: df.groupby(["x", "y", "z"]).sum(), marks=pytest.mark.xfail( - reason="https://github.com/pandas-dev/pandas/issues/32464" + condition=not PANDAS_GE_150, + reason="https://github.com/pandas-dev/pandas/issues/32464", ), ), lambda df: df.groupby(["x", "y"]).sum(), @@ -1578,8 +1579,10 @@ def test_groupby_list_of_structs(list_agg): ) gdf = cudf.from_pandas(pdf) - with pytest.raises(pd.core.base.DataError): - gdf.groupby("a").agg({"b": list_agg}), + with pytest.raises( + pd.errors.DataError if PANDAS_GE_150 else 
pd.core.base.DataError + ): + gdf.groupby("a").agg({"b": list_agg}) @pytest.mark.parametrize("list_agg", [list, "collect"]) diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py index f3d9180d44d..f6ca4691669 100644 --- a/python/cudf/cudf/tests/test_json.py +++ b/python/cudf/cudf/tests/test_json.py @@ -615,6 +615,48 @@ def test_json_nested_lines(data): ) bytes.seek(0) pdf = pd.read_json(bytes, orient="records", lines=True) - # In the second test-case: - # Pandas omits "f1" in first row, so we have to enforce a common schema - assert df.to_arrow().equals(pa.Table.from_pandas(pdf)) + # In the second test-case we need to take a detour via pyarrow + # Pandas omits "f1" in first row, so we have to enforce a common schema, + # such that pandas would have the f1 member with null + # Also, pyarrow chooses a different ordering for a nested column's + # children, though the key-value pairs are correct. + pa_table_pdf = pa.Table.from_pandas( + pdf, schema=df.to_arrow().schema, safe=False + ) + assert df.to_arrow().equals(pa_table_pdf) + + +def test_json_nested_data(): + json_str = ( + '[{"0":{},"2":{}},{"1":[[""],[]],"2":{"2":""}},' + '{"0":{"a":"1"},"2":{"0":"W&RR=+I","1":""}}]' + ) + df = cudf.read_json( + StringIO(json_str), engine="cudf_experimental", orient="records" + ) + pdf = pd.read_json(StringIO(json_str), orient="records") + pdf.columns = pdf.columns.astype("str") + pa_table_pdf = pa.Table.from_pandas( + pdf, schema=df.to_arrow().schema, safe=False + ) + assert df.to_arrow().equals(pa_table_pdf) + + +def test_json_types_data(): + # 0:<0:string,1:float> + # 1:list<int> + # 2:<0:bool> + json_str = ( + '[{"0":null,"2":{}},' + '{"1":[123],"0":{"0":"foo","1":123.4},"2":{"0":false}},' + '{"0":{},"1":[],"2":{"0":null}}]' + ) + df = cudf.read_json( + StringIO(json_str), engine="cudf_experimental", orient="records" + ) + pdf = pd.read_json(StringIO(json_str), orient="records") + pdf.columns = pdf.columns.astype("str") + pa_table_pdf = pa.Table.from_pandas( + pdf, schema=df.to_arrow().schema, safe=False + ) + assert df.to_arrow().equals(pa_table_pdf) diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py index aa4e5393e5b..8ea11382419 100644 --- a/python/cudf/cudf/tests/test_list.py +++ b/python/cudf/cudf/tests/test_list.py @@ -107,6 +107,51 @@ def test_listdtype_hash(): assert hash(a) != hash(c) +@pytest.fixture(params=["int", "float", "datetime", "timedelta"]) +def leaf_value(request): + if request.param == "int": + return np.int32(1) + elif request.param == "float": + return np.float64(1) + elif request.param == "datetime": + return pd.to_datetime("1900-01-01") + elif request.param == "timedelta": + return pd.to_timedelta("10d") + else: + raise ValueError("Unhandled data type") + + +@pytest.fixture(params=["list", "struct"]) +def list_or_struct(request, leaf_value): + if request.param == "list": + return [[leaf_value], [leaf_value]] + elif request.param == "struct": + return {"a": leaf_value, "b": [leaf_value], "c": {"d": [leaf_value]}} + else: + raise ValueError("Unhandled data type") + + +@pytest.fixture(params=["list", "struct"]) +def nested_list(request, list_or_struct, leaf_value): + if request.param == "list": + return [list_or_struct, list_or_struct] + elif request.param == "struct": + return [ + { + "a": list_or_struct, + "b": leaf_value, + "c": {"d": list_or_struct, "e": leaf_value}, + } + ] + else: + raise ValueError("Unhandled data type") + + +def test_list_dtype_explode(nested_list): + sr = cudf.Series([nested_list]) + assert
sr.dtype.element_type == sr.explode().dtype + + @pytest.mark.parametrize( "data", [ diff --git a/python/cudf/cudf/tests/test_numerical.py b/python/cudf/cudf/tests/test_numerical.py index 160db7053b9..e2fbd55c051 100644 --- a/python/cudf/cudf/tests/test_numerical.py +++ b/python/cudf/cudf/tests/test_numerical.py @@ -5,6 +5,7 @@ import pytest import cudf +from cudf.core._compat import PANDAS_GE_150 from cudf.testing._utils import NUMERIC_TYPES, assert_eq from cudf.utils.dtypes import np_dtypes_to_pandas_dtypes @@ -263,9 +264,12 @@ def test_to_numeric_downcast_large_float_pd_bug(data, downcast): expected = pd.to_numeric(ps, downcast=downcast) got = cudf.to_numeric(gs, downcast=downcast) - # Pandas bug: https://github.com/pandas-dev/pandas/issues/19729 - with pytest.raises(AssertionError, match="Series are different"): + if PANDAS_GE_150: assert_eq(expected, got) + else: + # Pandas bug: https://github.com/pandas-dev/pandas/issues/19729 + with pytest.raises(AssertionError, match="Series are different"): + assert_eq(expected, got) @pytest.mark.parametrize( @@ -342,9 +346,12 @@ def test_to_numeric_downcast_string_large_float(data, downcast): expected = pd.to_numeric(ps, downcast=downcast) got = cudf.to_numeric(gs, downcast=downcast) - # Pandas bug: https://github.com/pandas-dev/pandas/issues/19729 - with pytest.raises(AssertionError, match="Series are different"): + if PANDAS_GE_150: assert_eq(expected, got) + else: + # Pandas bug: https://github.com/pandas-dev/pandas/issues/19729 + with pytest.raises(AssertionError, match="Series are different"): + assert_eq(expected, got) else: expected = pd.Series([np.inf, -np.inf]) with pytest.warns( diff --git a/python/cudf/cudf/tests/test_numpy_interop.py b/python/cudf/cudf/tests/test_numpy_interop.py index 55b5a38c3e5..46324a85bb4 100644 --- a/python/cudf/cudf/tests/test_numpy_interop.py +++ b/python/cudf/cudf/tests/test_numpy_interop.py @@ -1,3 +1,5 @@ +# Copyright (c) 2019-2022, NVIDIA CORPORATION. + import numpy as np import pytest diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index 18d159bc423..c6bf17110c2 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -1871,3 +1871,22 @@ def test_orc_writer_negative_timestamp(negative_timestamp_df): assert_eq(negative_timestamp_df, pd.read_orc(buffer)) assert_eq(negative_timestamp_df, pyarrow.orc.ORCFile(buffer).read()) + + +def test_orc_reader_apache_negative_timestamp(datadir): + path = datadir / "TestOrcFile.apache_timestamp.orc" + + pdf = pd.read_orc(path) + gdf = cudf.read_orc(path) + + assert_eq(pdf, gdf) + + +def test_statistics_string_sum(): + strings = ["a string", "another string!"] + buff = BytesIO() + df = cudf.DataFrame({"str": strings}) + df.to_orc(buff) + + file_stats, stripe_stats = cudf.io.orc.read_orc_statistics([buff]) + assert_eq(file_stats[0]["str"].get("sum"), sum(len(s) for s in strings)) diff --git a/python/cudf/cudf/tests/test_query_mask.py b/python/cudf/cudf/tests/test_query_mask.py index ab1c085c6c0..ae5171f28d4 100644 --- a/python/cudf/cudf/tests/test_query_mask.py +++ b/python/cudf/cudf/tests/test_query_mask.py @@ -1,3 +1,5 @@ +# Copyright (c) 2019-2022, NVIDIA CORPORATION. 
+ import numpy as np import pandas as pd import pytest diff --git a/python/cudf/cudf/tests/test_rolling.py b/python/cudf/cudf/tests/test_rolling.py index bede054037d..08188c25ffa 100644 --- a/python/cudf/cudf/tests/test_rolling.py +++ b/python/cudf/cudf/tests/test_rolling.py @@ -8,7 +8,12 @@ import pytest import cudf -from cudf.core._compat import PANDAS_GE_110, PANDAS_LT_140 +from cudf.core._compat import ( + PANDAS_GE_110, + PANDAS_GE_130, + PANDAS_GE_150, + PANDAS_LT_140, +) from cudf.testing._utils import _create_pandas_series, assert_eq from cudf.testing.dataset_generator import rand_dataframe @@ -214,12 +219,14 @@ def test_rolling_var_std_large(agg, ddof, center, seed, window_size): assert_eq(expect, got, **kwargs) -@pytest.mark.xfail +@pytest.mark.xfail( + condition=not PANDAS_GE_130, + reason="https://github.com/pandas-dev/pandas/issues/37051", +) def test_rolling_var_uniform_window(): """ Pandas adopts an online variance calculation algorithm. This gives a floating point artifact. - https://github.com/pandas-dev/pandas/issues/37051 In cudf, each window is computed independently from the previous window, this gives better numeric precision. @@ -492,7 +499,9 @@ def test_rolling_custom_index_support(): from pandas.api.indexers import BaseIndexer class CustomIndexer(BaseIndexer): - def get_window_bounds(self, num_values, min_periods, center, closed): + def custom_get_window_bounds( + self, num_values, min_periods, center, closed, step=None + ): start = np.empty(num_values, dtype=np.int64) end = np.empty(num_values, dtype=np.int64) @@ -506,6 +515,24 @@ def get_window_bounds(self, num_values, min_periods, center, closed): return start, end + if PANDAS_GE_150: + + def get_window_bounds( + self, num_values, min_periods, center, closed, step + ): + return self.custom_get_window_bounds( + num_values, min_periods, center, closed, step + ) + + else: + + def get_window_bounds( + self, num_values, min_periods, center, closed + ): + return self.custom_get_window_bounds( + num_values, min_periods, center, closed + ) + use_expanding = [True, False, True, False, True] indexer = CustomIndexer(window_size=1, use_expanding=use_expanding) diff --git a/python/cudf/cudf/tests/test_s3.py b/python/cudf/cudf/tests/test_s3.py index 1fdd2dae31d..3219a6ad847 100644 --- a/python/cudf/cudf/tests/test_s3.py +++ b/python/cudf/cudf/tests/test_s3.py @@ -416,16 +416,6 @@ def test_write_parquet(s3_base, s3so, pdf, partition_cols): def test_read_json(s3_base, s3so): fname = "test_json_reader.json" bucket = "json" - # TODO: After following bug is fixed switch - # back to using bytes: - # https://github.com/pandas-dev/pandas/issues/46935 - - # buffer = ( - # b'{"amount": 100, "name": "Alice"}\n' - # b'{"amount": 200, "name": "Bob"}\n' - # b'{"amount": 300, "name": "Charlie"}\n' - # b'{"amount": 400, "name": "Dennis"}\n' - # ) buffer = ( '{"amount": 100, "name": "Alice"}\n' '{"amount": 200, "name": "Bob"}\n' diff --git a/python/cudf/cudf/tests/test_serialize.py b/python/cudf/cudf/tests/test_serialize.py index 61eee6bba43..53318eef1c8 100644 --- a/python/cudf/cudf/tests/test_serialize.py +++ b/python/cudf/cudf/tests/test_serialize.py @@ -8,6 +8,7 @@ import pytest import cudf +from cudf.core._compat import PANDAS_GE_150 from cudf.testing import _utils as utils from cudf.testing._utils import assert_eq @@ -86,13 +87,14 @@ ), ), pd._testing.makeRangeIndex, - pd._testing.makeStringIndex, pd._testing.makeStringSeries, pd._testing.makeTimeDataFrame, pd._testing.makeTimeSeries, 
pd._testing.makeTimedeltaIndex, pd._testing.makeUIntIndex, - pd._testing.makeUnicodeIndex, + pd._testing.makeUnicodeIndex + if not PANDAS_GE_150 + else pd._testing.makeStringIndex, ], ) @pytest.mark.parametrize("to_host", [True, False]) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index b1ecb38e4d4..c0b99f56238 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -395,7 +395,7 @@ def test_series_describe_numeric(dtype): actual = gs.describe() expected = ps.describe() - assert_eq(expected, actual) + assert_eq(expected, actual, check_dtype=True) @pytest.mark.parametrize("dtype", ["datetime64[ns]"]) @@ -1650,7 +1650,7 @@ def test_isin_numeric(data, values): assert_eq(got, expected) -@pytest.mark.xfail(raises=ValueError) +@pytest.mark.xfail(raises=TypeError) def test_fill_new_category(): gs = cudf.Series(pd.Categorical(["a", "b", "c"])) gs[0:1] = "d" @@ -1937,3 +1937,17 @@ def test_series_ordered_dedup(): expect = pd.Series(sr.to_pandas().unique()) got = cudf.Series(sr._column.unique(preserve_order=True)) assert_eq(expect.values, got.values) + + +@pytest.mark.parametrize("dtype", ["int64", "float64"]) +@pytest.mark.parametrize("bool_scalar", [True, False]) +def test_set_bool_error(dtype, bool_scalar): + sr = cudf.Series([1, 2, 3], dtype=dtype) + psr = sr.to_pandas(nullable=True) + + assert_exceptions_equal( + lfunc=sr.__setitem__, + rfunc=psr.__setitem__, + lfunc_args_and_kwargs=([bool_scalar],), + rfunc_args_and_kwargs=([bool_scalar],), + ) diff --git a/python/cudf/cudf/tests/test_setitem.py b/python/cudf/cudf/tests/test_setitem.py index 733fb4d5e4d..cb455ae831c 100644 --- a/python/cudf/cudf/tests/test_setitem.py +++ b/python/cudf/cudf/tests/test_setitem.py @@ -5,7 +5,7 @@ import pytest import cudf -from cudf.core._compat import PANDAS_GE_120, PANDAS_LE_122 +from cudf.core._compat import PANDAS_GE_120, PANDAS_GE_150, PANDAS_LE_122 from cudf.testing._utils import assert_eq, assert_exceptions_equal @@ -220,23 +220,25 @@ def test_column_set_unequal_length_object_by_mask(): def test_categorical_setitem_invalid(): - # ps = pd.Series([1, 2, 3], dtype="category") + ps = pd.Series([1, 2, 3], dtype="category") gs = cudf.Series([1, 2, 3], dtype="category") - # TODO: After https://github.com/pandas-dev/pandas/issues/46646 - # is fixed remove the following workaround and - # uncomment assert_exceptions_equal - # WORKAROUND - with pytest.raises( - ValueError, - match="Cannot setitem on a Categorical with a new category, set the " - "categories first", - ): - gs[0] = 5 - - # assert_exceptions_equal( - # lfunc=ps.__setitem__, - # rfunc=gs.__setitem__, - # lfunc_args_and_kwargs=([0, 5], {}), - # rfunc_args_and_kwargs=([0, 5], {}), - # ) + if PANDAS_GE_150: + assert_exceptions_equal( + lfunc=ps.__setitem__, + rfunc=gs.__setitem__, + lfunc_args_and_kwargs=([0, 5], {}), + rfunc_args_and_kwargs=([0, 5], {}), + compare_error_message=False, + expected_error_message="Cannot setitem on a Categorical with a " + "new category, set the categories first", + ) + else: + # Following workaround is needed because: + # https://github.com/pandas-dev/pandas/issues/46646 + with pytest.raises( + ValueError, + match="Cannot setitem on a Categorical with a new category, set " + "the categories first", + ): + gs[0] = 5 diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index 046d4498a2a..74d602c2cf1 100644 --- a/python/cudf/cudf/tests/test_string.py +++ 
b/python/cudf/cudf/tests/test_string.py @@ -15,7 +15,7 @@ import cudf from cudf import concat -from cudf.core._compat import PANDAS_GE_110 +from cudf.core._compat import PANDAS_GE_110, PANDAS_GE_150 from cudf.core.column.string import StringColumn from cudf.core.index import StringIndex, as_index from cudf.testing._utils import ( @@ -1769,7 +1769,8 @@ def test_strings_filling_tests(data, width, fillchar): pytest.param( ["hello", "there", "world", "+1234", "-1234", None, "accént", ""], marks=pytest.mark.xfail( - reason="pandas 1.5 upgrade TODO", + condition=not PANDAS_GE_150, + reason="https://github.com/pandas-dev/pandas/issues/20868", ), ), [" ", "\t\r\n ", ""], @@ -2012,10 +2013,32 @@ def test_string_starts_ends(data, pat): ps = pd.Series(data) gs = cudf.Series(data) - assert_eq( - ps.str.startswith(pat), gs.str.startswith(pat), check_dtype=False - ) - assert_eq(ps.str.endswith(pat), gs.str.endswith(pat), check_dtype=False) + if pat is None: + assert_exceptions_equal( + lfunc=ps.str.startswith, + rfunc=gs.str.startswith, + lfunc_args_and_kwargs=([pat],), + rfunc_args_and_kwargs=([pat],), + compare_error_message=False, + expected_error_message="expected a string or a sequence-like " + "object, not NoneType", + ) + assert_exceptions_equal( + lfunc=ps.str.endswith, + rfunc=gs.str.endswith, + lfunc_args_and_kwargs=([pat],), + rfunc_args_and_kwargs=([pat],), + compare_error_message=False, + expected_error_message="expected a string or a sequence-like " + "object, not NoneType", + ) + else: + assert_eq( + ps.str.startswith(pat), gs.str.startswith(pat), check_dtype=False + ) + assert_eq( + ps.str.endswith(pat), gs.str.endswith(pat), check_dtype=False + ) @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_udf_masked_ops.py b/python/cudf/cudf/tests/test_udf_masked_ops.py index 4f385656405..2b96c920765 100644 --- a/python/cudf/cudf/tests/test_udf_masked_ops.py +++ b/python/cudf/cudf/tests/test_udf_masked_ops.py @@ -8,6 +8,7 @@ import cudf from cudf.core.missing import NA +from cudf.core.udf import _STRING_UDFS_ENABLED from cudf.core.udf._ops import ( arith_ops, bitwise_ops, @@ -22,6 +23,49 @@ ) +# only run string udf tests if library exists and is enabled +def string_udf_test(f): + if _STRING_UDFS_ENABLED: + return f + else: + return pytest.mark.skip(reason="String UDFs not enabled")(f) + + +@pytest.fixture(scope="module") +def str_udf_data(): + return cudf.DataFrame( + { + "str_col": [ + "abc", + "ABC", + "AbC", + "123", + "123aBc", + "123@.!", + "", + "rapids ai", + "gpu", + "True", + "False", + "1.234", + ".123a", + "0.013", + "1.0", + "01", + "20010101", + "cudf", + "cuda", + "gpu", + ] + } + ) + + +@pytest.fixture(params=["a", "cu", "2", "gpu", "", " "]) +def substr(request): + return request.param + + def run_masked_udf_test(func, data, args=(), **kwargs): gdf = data pdf = data.to_pandas(nullable=True) @@ -537,7 +581,6 @@ def func(row): @pytest.mark.parametrize( "unsupported_col", [ - ["a", "b", "c"], _decimal_series( ["1.0", "2.0", "3.0"], dtype=cudf.Decimal64Dtype(2, 1) ), @@ -682,6 +725,128 @@ def f(x): assert precompiled.currsize == 1 +@string_udf_test +def test_string_udf_len(str_udf_data): + def func(row): + return len(row["str_col"]) + + run_masked_udf_test(func, str_udf_data, check_dtype=False) + + +@string_udf_test +def test_string_udf_startswith(str_udf_data, substr): + def func(row): + return row["str_col"].startswith(substr) + + run_masked_udf_test(func, str_udf_data, check_dtype=False) + + +@string_udf_test +def test_string_udf_endswith(str_udf_data, 
substr): + def func(row): + return row["str_col"].endswith(substr) + + run_masked_udf_test(func, str_udf_data, check_dtype=False) + + +@string_udf_test +def test_string_udf_find(str_udf_data, substr): + def func(row): + return row["str_col"].find(substr) + + run_masked_udf_test(func, str_udf_data, check_dtype=False) + + +@string_udf_test +def test_string_udf_rfind(str_udf_data, substr): + def func(row): + return row["str_col"].rfind(substr) + + run_masked_udf_test(func, str_udf_data, check_dtype=False) + + +@string_udf_test +def test_string_udf_contains(str_udf_data, substr): + def func(row): + return substr in row["str_col"] + + run_masked_udf_test(func, str_udf_data, check_dtype=False) + + +@string_udf_test +@pytest.mark.parametrize("other", ["cudf", "123", "", " "]) +@pytest.mark.parametrize("cmpop", comparison_ops) +def test_string_udf_cmpops(str_udf_data, other, cmpop): + def func(row): + return cmpop(row["str_col"], other) + + run_masked_udf_test(func, str_udf_data, check_dtype=False) + + +@string_udf_test +def test_string_udf_isalnum(str_udf_data): + def func(row): + return row["str_col"].isalnum() + + run_masked_udf_test(func, str_udf_data, check_dtype=False) + + +@string_udf_test +def test_string_udf_isalpha(str_udf_data): + def func(row): + return row["str_col"].isalpha() + + run_masked_udf_test(func, str_udf_data, check_dtype=False) + + +@string_udf_test +def test_string_udf_isdigit(str_udf_data): + def func(row): + return row["str_col"].isdigit() + + run_masked_udf_test(func, str_udf_data, check_dtype=False) + + +@string_udf_test +def test_string_udf_isdecimal(str_udf_data): + def func(row): + return row["str_col"].isdecimal() + + run_masked_udf_test(func, str_udf_data, check_dtype=False) + + +@string_udf_test +def test_string_udf_isupper(str_udf_data): + def func(row): + return row["str_col"].isupper() + + run_masked_udf_test(func, str_udf_data, check_dtype=False) + + +@string_udf_test +def test_string_udf_islower(str_udf_data): + def func(row): + return row["str_col"].islower() + + run_masked_udf_test(func, str_udf_data, check_dtype=False) + + +@string_udf_test +def test_string_udf_isspace(str_udf_data): + def func(row): + return row["str_col"].isspace() + + run_masked_udf_test(func, str_udf_data, check_dtype=False) + + +@string_udf_test +def test_string_udf_count(str_udf_data, substr): + def func(row): + return row["str_col"].count(substr) + + run_masked_udf_test(func, str_udf_data, check_dtype=False) + + @pytest.mark.parametrize( "data", [[1.0, 0.0, 1.5], [1, 0, 2], [True, False, True]] ) diff --git a/python/cudf/cudf/utils/cudautils.py b/python/cudf/cudf/utils/cudautils.py index 8b9a6be0ffe..e2bd4556ce8 100755 --- a/python/cudf/cudf/utils/cudautils.py +++ b/python/cudf/cudf/utils/cudautils.py @@ -197,13 +197,15 @@ def make_cache_key(udf, sig): """ codebytes = udf.__code__.co_code constants = udf.__code__.co_consts + names = udf.__code__.co_names + if udf.__closure__ is not None: cvars = tuple(x.cell_contents for x in udf.__closure__) cvarbytes = dumps(cvars) else: cvarbytes = b"" - return constants, codebytes, cvarbytes, sig + return names, constants, codebytes, cvarbytes, sig def compile_udf(udf, type_signature): @@ -248,7 +250,7 @@ def compile_udf(udf, type_signature): ptx_code, return_type = cuda.compile_ptx_for_current_device( udf, type_signature, device=True ) - if not isinstance(return_type, cudf.core.udf.typing.MaskedType): + if not isinstance(return_type, cudf.core.udf.masked_typing.MaskedType): output_type = numpy_support.as_dtype(return_type).type else: 
output_type = return_type diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 29d2337e9d6..92c23d8b97b 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -11,6 +11,7 @@ from pandas.core.dtypes.common import infer_dtype_from_object import cudf +from cudf.api.types import is_bool, is_float, is_integer from cudf.core._compat import PANDAS_GE_120 from cudf.core.missing import NA @@ -671,6 +672,43 @@ def _maybe_convert_to_default_type(dtype): return dtype +def _dtype_can_hold_range(rng: range, dtype: np.dtype) -> bool: + if not len(rng): + return True + return np.can_cast(rng[0], dtype) and np.can_cast(rng[-1], dtype) + + +def _dtype_can_hold_element(dtype: np.dtype, element) -> bool: + if dtype.kind in {"i", "u"}: + if isinstance(element, range): + if _dtype_can_hold_range(element, dtype): + return True + return False + + elif is_integer(element) or ( + is_float(element) and element.is_integer() + ): + info = np.iinfo(dtype) + if info.min <= element <= info.max: + return True + return False + + elif dtype.kind == "f": + if is_integer(element) or is_float(element): + casted = dtype.type(element) + if np.isnan(casted) or casted == element: + return True + # otherwise e.g. overflow see TestCoercionFloat32 + return False + + elif dtype.kind == "b": + if is_bool(element): + return True + return False + + raise NotImplementedError(f"Unsupported dtype: {dtype}") + + # Type dispatch loops similar to what are found in `np.add.types` # In NumPy, whether or not an op can be performed between two # operands is determined by checking to see if NumPy has a c/c++ diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index 52830fe219f..63bc6d59524 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -13,11 +13,12 @@ import rmm import cudf +import cudf.api.types from cudf.core import column from cudf.core.buffer import as_device_buffer_like # The size of the mask in bytes -mask_dtype = cudf.dtype(np.int32) +mask_dtype = cudf.api.types.dtype(np.int32) mask_bitsize = mask_dtype.itemsize * 8 # Mapping from ufuncs to the corresponding binary operators. 
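The `_dtype_can_hold_element` helper added to `cudf/utils/dtypes.py` above ports pandas' value-based casting rules: integer dtypes bounds-check integers (and whole ranges via `_dtype_can_hold_range`), float dtypes accept values that survive a round-trip cast, and bool dtypes accept only genuine booleans. A minimal sketch of the intended behavior (illustrative only; the helper is private, so calling it directly is an assumption rather than public API):

    import numpy as np
    from cudf.utils.dtypes import _dtype_can_hold_element

    assert _dtype_can_hold_element(np.dtype("int8"), 127)             # within int8 bounds
    assert not _dtype_can_hold_element(np.dtype("int8"), 128)         # overflows int8
    assert _dtype_can_hold_element(np.dtype("uint8"), range(0, 256))  # both endpoints castable
    assert _dtype_can_hold_element(np.dtype("float32"), 1.5)          # round-trips exactly
    assert not _dtype_can_hold_element(np.dtype("bool"), 1)           # only real bools qualify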
diff --git a/python/cudf/setup.cfg b/python/cudf/setup.cfg index 1f7cfeb49ae..8a648097ac8 100644 --- a/python/cudf/setup.cfg +++ b/python/cudf/setup.cfg @@ -25,6 +25,7 @@ known_dask= dask_cuda known_rapids= rmm + strings_udf known_first_party= cudf default_section=THIRDPARTY @@ -41,4 +42,4 @@ skip= buck-out build dist - __init__.py \ No newline at end of file + __init__.py diff --git a/python/cudf/setup.py b/python/cudf/setup.py index cfc31fccc03..93948afc0f6 100644 --- a/python/cudf/setup.py +++ b/python/cudf/setup.py @@ -7,12 +7,11 @@ import sys from distutils.spawn import find_executable +import versioneer from setuptools import find_packages from skbuild import setup from skbuild.command.build_ext import build_ext -import versioneer - install_requires = [ "cachetools", "cuda-python>=11.5,<11.7.1", @@ -21,7 +20,7 @@ "numpy", "nvtx>=0.2.1", "packaging", - "pandas>=1.0,<1.5.0dev0", + "pandas>=1.0,<1.6.0dev0", "protobuf>=3.20.1,<3.21.0a0", "typing_extensions", ] diff --git a/python/cudf_kafka/setup.py b/python/cudf_kafka/setup.py index 6416bfb550d..aafb5630b94 100644 --- a/python/cudf_kafka/setup.py +++ b/python/cudf_kafka/setup.py @@ -5,12 +5,11 @@ from distutils.sysconfig import get_python_lib import numpy as np +import versioneer from Cython.Build import cythonize from setuptools import find_packages, setup from setuptools.extension import Extension -import versioneer - install_requires = ["cudf", "cython"] extras_require = {"test": ["pytest", "pytest-xdist"]} diff --git a/python/custreamz/setup.py b/python/custreamz/setup.py index 37a45729921..911f83ff6bd 100644 --- a/python/custreamz/setup.py +++ b/python/custreamz/setup.py @@ -1,8 +1,7 @@ # Copyright (c) 2020-2022, NVIDIA CORPORATION. -from setuptools import find_packages, setup - import versioneer +from setuptools import find_packages, setup install_requires = ["cudf_kafka", "cudf"] diff --git a/python/dask_cudf/dask_cudf/groupby.py b/python/dask_cudf/dask_cudf/groupby.py index 764b05f149f..159602f195a 100644 --- a/python/dask_cudf/dask_cudf/groupby.py +++ b/python/dask_cudf/dask_cudf/groupby.py @@ -57,10 +57,10 @@ def wrapper(*args, **kwargs): class CudfDataFrameGroupBy(DataFrameGroupBy): @_dask_cudf_nvtx_annotate - def __init__(self, *args, **kwargs): + def __init__(self, *args, sort=None, **kwargs): self.sep = kwargs.pop("sep", "___") self.as_index = kwargs.pop("as_index", True) - super().__init__(*args, **kwargs) + super().__init__(*args, sort=sort, **kwargs) @_dask_cudf_nvtx_annotate def __getitem__(self, key): @@ -280,10 +280,10 @@ def aggregate(self, arg, split_every=None, split_out=1, shuffle=None): class CudfSeriesGroupBy(SeriesGroupBy): @_dask_cudf_nvtx_annotate - def __init__(self, *args, **kwargs): + def __init__(self, *args, sort=None, **kwargs): self.sep = kwargs.pop("sep", "___") self.as_index = kwargs.pop("as_index", True) - super().__init__(*args, **kwargs) + super().__init__(*args, sort=sort, **kwargs) @_dask_cudf_nvtx_annotate @_check_groupby_supported diff --git a/python/dask_cudf/dask_cudf/io/json.py b/python/dask_cudf/dask_cudf/io/json.py index 266693dafe2..6c3c95d1a2e 100644 --- a/python/dask_cudf/dask_cudf/io/json.py +++ b/python/dask_cudf/dask_cudf/io/json.py @@ -1,3 +1,5 @@ +# Copyright (c) 2019-2022, NVIDIA CORPORATION. 
+ from functools import partial import dask diff --git a/python/dask_cudf/dask_cudf/io/tests/test_csv.py b/python/dask_cudf/dask_cudf/io/tests/test_csv.py index 32960a90bd7..564a719fb86 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_csv.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_csv.py @@ -1,3 +1,5 @@ +# Copyright (c) 2019-2022, NVIDIA CORPORATION. + import gzip import os import warnings diff --git a/python/dask_cudf/dask_cudf/io/tests/test_json.py b/python/dask_cudf/dask_cudf/io/tests/test_json.py index 3a1e98feb31..3f854bb343b 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_json.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_json.py @@ -1,3 +1,5 @@ +# Copyright (c) 2019-2022, NVIDIA CORPORATION. + import os import pandas as pd diff --git a/python/dask_cudf/dask_cudf/tests/test_delayed_io.py b/python/dask_cudf/dask_cudf/tests/test_delayed_io.py index 7789664afae..6c68d92a8df 100644 --- a/python/dask_cudf/dask_cudf/tests/test_delayed_io.py +++ b/python/dask_cudf/dask_cudf/tests/test_delayed_io.py @@ -1,6 +1,9 @@ +# Copyright (c) 2019-2022, NVIDIA CORPORATION. + """ Test IO with dask.delayed API """ + import numpy as np import pytest from pandas.testing import assert_frame_equal diff --git a/python/dask_cudf/dask_cudf/tests/test_dispatch.py b/python/dask_cudf/dask_cudf/tests/test_dispatch.py index 6bf4b956404..5d17a605577 100644 --- a/python/dask_cudf/dask_cudf/tests/test_dispatch.py +++ b/python/dask_cudf/dask_cudf/tests/test_dispatch.py @@ -1,3 +1,5 @@ +# Copyright (c) 2021-2022, NVIDIA CORPORATION. + import pandas as pd from dask.dataframe.methods import is_categorical_dtype diff --git a/python/dask_cudf/dask_cudf/tests/test_groupby.py b/python/dask_cudf/dask_cudf/tests/test_groupby.py index a8e9e8e92aa..cc27c7f2a86 100644 --- a/python/dask_cudf/dask_cudf/tests/test_groupby.py +++ b/python/dask_cudf/dask_cudf/tests/test_groupby.py @@ -577,14 +577,18 @@ def test_groupby_categorical_key(): got = gddf.groupby("name", sort=True).agg( {"x": ["mean", "max"], "y": ["mean", "count"]} ) - expect = ddf.groupby("name", sort=True).agg( - {"x": ["mean", "max"], "y": ["mean", "count"]} + # Use `compute` to avoid upstream issue for now + # (See: https://github.com/dask/dask/issues/9515) + expect = ( + ddf.compute() + .groupby("name", sort=True) + .agg({"x": ["mean", "max"], "y": ["mean", "count"]}) ) dd.assert_eq(expect, got) @pytest.mark.parametrize("as_index", [True, False]) -@pytest.mark.parametrize("split_out", [None, 1, 2]) +@pytest.mark.parametrize("split_out", ["use_dask_default", 1, 2]) @pytest.mark.parametrize("split_every", [False, 4]) @pytest.mark.parametrize("npartitions", [1, 10]) def test_groupby_agg_params(npartitions, split_every, split_out, as_index): @@ -602,14 +606,17 @@ def test_groupby_agg_params(npartitions, split_every, split_out, as_index): "c": ["mean", "std", "var"], } + split_kwargs = {"split_every": split_every, "split_out": split_out} + if split_out == "use_dask_default": + split_kwargs.pop("split_out") + # Check `sort=True` behavior if split_out == 1: gf = ( ddf.groupby(["name", "a"], sort=True, as_index=as_index) .aggregate( agg_dict, - split_every=split_every, - split_out=split_out, + **split_kwargs, ) .compute() ) @@ -630,13 +637,11 @@ def test_groupby_agg_params(npartitions, split_every, split_out, as_index): # Full check (`sort=False`) gr = ddf.groupby(["name", "a"], sort=False, as_index=as_index).aggregate( agg_dict, - split_every=split_every, - split_out=split_out, + **split_kwargs, ) pr = pddf.groupby(["name", "a"], sort=False).agg( 
agg_dict, - split_every=split_every, - split_out=split_out, + **split_kwargs, ) # Test `as_index` argument @@ -648,7 +653,9 @@ def test_groupby_agg_params(npartitions, split_every, split_out, as_index): assert ("name", "") in gr.columns and ("a", "") in gr.columns # Check `split_out` argument - assert gr.npartitions == (split_out or 1) + assert gr.npartitions == ( + 1 if split_out == "use_dask_default" else split_out + ) # Compute for easier multiindex handling gf = gr.compute() diff --git a/python/dask_cudf/dask_cudf/tests/test_join.py b/python/dask_cudf/dask_cudf/tests/test_join.py index 8b2d85c59d7..15e383ac4d9 100644 --- a/python/dask_cudf/dask_cudf/tests/test_join.py +++ b/python/dask_cudf/dask_cudf/tests/test_join.py @@ -1,3 +1,5 @@ +# Copyright (c) 2019-2022, NVIDIA CORPORATION. + from functools import partial import numpy as np diff --git a/python/dask_cudf/dask_cudf/tests/test_sort.py b/python/dask_cudf/dask_cudf/tests/test_sort.py index 0b258dd33e7..770a52316b6 100644 --- a/python/dask_cudf/dask_cudf/tests/test_sort.py +++ b/python/dask_cudf/dask_cudf/tests/test_sort.py @@ -1,3 +1,5 @@ +# Copyright (c) 2019-2022, NVIDIA CORPORATION. + import cupy as cp import numpy as np import pytest diff --git a/python/dask_cudf/dask_cudf/tests/test_struct.py b/python/dask_cudf/dask_cudf/tests/test_struct.py index 4164c683b51..6abac4cf53b 100644 --- a/python/dask_cudf/dask_cudf/tests/test_struct.py +++ b/python/dask_cudf/dask_cudf/tests/test_struct.py @@ -1,3 +1,5 @@ +# Copyright (c) 2021-2022, NVIDIA CORPORATION. + import pytest import cudf diff --git a/python/dask_cudf/setup.py b/python/dask_cudf/setup.py index 0477fea8ee9..107fd5e7d33 100644 --- a/python/dask_cudf/setup.py +++ b/python/dask_cudf/setup.py @@ -4,9 +4,8 @@ import re import shutil -from setuptools import find_packages, setup - import versioneer +from setuptools import find_packages, setup install_requires = [ "cudf", @@ -14,13 +13,13 @@ "distributed>=2022.7.1", "fsspec>=0.6.0", "numpy", - "pandas>=1.0,<1.5.0dev0", + "pandas>=1.0,<1.6.0dev0", ] extras_require = { "test": [ "numpy", - "pandas>=1.0,<1.5.0dev0", + "pandas>=1.0,<1.6.0dev0", "pytest", "numba>=0.56.2", "dask>=2021.09.1", diff --git a/python/strings_udf/CMakeLists.txt b/python/strings_udf/CMakeLists.txt new file mode 100644 index 00000000000..59d8ae795f2 --- /dev/null +++ b/python/strings_udf/CMakeLists.txt @@ -0,0 +1,43 @@ +# ============================================================================= +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= + +cmake_minimum_required(VERSION 3.20.1 FATAL_ERROR) + +set(strings_udf_version 22.10.00) + +include(../../fetch_rapids.cmake) + +project( + strings-udf-python + VERSION ${strings_udf_version} + LANGUAGES CXX + # TODO: Building Python extension modules via the python_extension_module requires the C + # language to be enabled here. 
The test project that is built in scikit-build to verify + # various linking options for the python library is hardcoded to build with C, so until + # that is fixed we need to keep C. + C + # TODO: Enabling CUDA will not be necessary once we upgrade to CMake 3.22, which will + # pull in the required languages for the C++ project even if this project does not + # require those languages. + CUDA +) + +find_package(cudf ${strings_udf_version} REQUIRED) + +add_subdirectory(cpp) + +include(rapids-cython) +rapids_cython_init() + +add_subdirectory(strings_udf/_lib) diff --git a/python/strings_udf/cpp/CMakeLists.txt b/python/strings_udf/cpp/CMakeLists.txt new file mode 100644 index 00000000000..d157acfefde --- /dev/null +++ b/python/strings_udf/cpp/CMakeLists.txt @@ -0,0 +1,111 @@ +# ============================================================================= +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= + +cmake_minimum_required(VERSION 3.20.1) + +include(rapids-cmake) +include(rapids-cpm) +include(rapids-cuda) +include(rapids-find) + +rapids_cpm_init() + +rapids_cuda_init_architectures(STRINGS_UDF) + +# Create a project so that we can enable CUDA architectures in this file. +project( + strings-udf-cpp + VERSION ${strings_udf_version} + LANGUAGES CUDA +) + +rapids_find_package( + CUDAToolkit REQUIRED + BUILD_EXPORT_SET strings-udf-exports + INSTALL_EXPORT_SET strings-udf-exports +) + +include(${rapids-cmake-dir}/cpm/libcudacxx.cmake) +rapids_cpm_libcudacxx(BUILD_EXPORT_SET strings-udf-exports INSTALL_EXPORT_SET strings-udf-exports) + +add_library(cudf_strings_udf SHARED src/strings/udf/udf_apis.cu) +target_include_directories( + cudf_strings_udf PUBLIC "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>" +) + +set_target_properties( + cudf_strings_udf + PROPERTIES BUILD_RPATH "\$ORIGIN" + INSTALL_RPATH "\$ORIGIN" + CXX_STANDARD 17 + CXX_STANDARD_REQUIRED ON + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON + POSITION_INDEPENDENT_CODE ON + INTERFACE_POSITION_INDEPENDENT_CODE ON +) + +set(UDF_CXX_FLAGS) +set(UDF_CUDA_FLAGS --expt-extended-lambda --expt-relaxed-constexpr) +target_compile_options( + cudf_strings_udf PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:${UDF_CXX_FLAGS}>" + "$<$<COMPILE_LANGUAGE:CUDA>:${UDF_CUDA_FLAGS}>" +) +target_link_libraries(cudf_strings_udf PUBLIC cudf::cudf CUDA::nvrtc) +install(TARGETS cudf_strings_udf DESTINATION ./strings_udf/_lib/) + +# This function will copy the generated PTX file from its generator-specific location in the build +# tree into a specified location in the build tree from which we can install it.
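+# As an illustrative sketch only (exact object paths depend on the CMake generator and are
+# assumptions here), the helper script generated for a target `shim_70` would contain roughly:
+#   set(ptx_paths "CMakeFiles/shim_70.dir/src/strings/udf/shim.ptx")
+#   file(COPY ${ptx_paths} DESTINATION "${CMAKE_CURRENT_BINARY_DIR}/../strings_udf")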
+function(copy_ptx_to_location target destination) + set(cmake_generated_file + "${CMAKE_CURRENT_BINARY_DIR}/cmake/cp_${target}_$<UPPER_CASE:$<CONFIG>>_ptx.cmake" + ) + file( + GENERATE + OUTPUT "${cmake_generated_file}" + CONTENT + " +set(ptx_paths \"$<TARGET_OBJECTS:${target}>\") +file(COPY \${ptx_paths} DESTINATION \"${destination}\")" + ) + + add_custom_target( + ${target}_cp_ptx ALL + COMMAND ${CMAKE_COMMAND} -P "${cmake_generated_file}" + DEPENDS $<TARGET_OBJECTS:${target}> + COMMENT "Copying PTX files to '${destination}'" + ) +endfunction() + +# Create the shim library for each architecture. +set(SHIM_CUDA_FLAGS --expt-relaxed-constexpr -rdc=true) + +foreach(arch IN LISTS CMAKE_CUDA_ARCHITECTURES) + set(tgt shim_${arch}) + + add_library(${tgt} OBJECT src/strings/udf/shim.cu) + + set_target_properties(${tgt} PROPERTIES CUDA_ARCHITECTURES ${arch} CUDA_PTX_COMPILATION ON) + + target_include_directories(${tgt} PUBLIC include) + target_compile_options(${tgt} PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:${SHIM_CUDA_FLAGS}>") + target_link_libraries(${tgt} PUBLIC cudf::cudf) + + copy_ptx_to_location(${tgt} "${CMAKE_CURRENT_BINARY_DIR}/../strings_udf") + install( + FILES $<TARGET_OBJECTS:${tgt}> + DESTINATION ./strings_udf + RENAME ${tgt}.ptx + ) +endforeach() diff --git a/python/strings_udf/cpp/include/cudf/strings/udf/char_types.cuh b/python/strings_udf/cpp/include/cudf/strings/udf/char_types.cuh new file mode 100644 index 00000000000..e28111fd1f2 --- /dev/null +++ b/python/strings_udf/cpp/include/cudf/strings/udf/char_types.cuh @@ -0,0 +1,188 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include <cudf/strings/char_types/char_types.hpp> +#include <cudf/strings/detail/char_tables.hpp> +#include <cudf/strings/detail/utf8.hpp> +#include <cudf/strings/string_view.cuh> + +namespace cudf { +namespace strings { +namespace udf { + +/** + * @brief Returns true if all characters in the string are of the type specified. + * + * The output will be false if the string is empty or has at least one character + * not of the specified type. If all characters fit the type then true is returned. + * + * To ignore all but specific types, set the `verify_types` to those types + * which should be checked. Otherwise, the default `ALL_TYPES` will verify all + * characters match `types`. + * + * @code{.pseudo} + * Examples: + * s = ['ab', 'a b', 'a7', 'a B'] + * all_characters_of_type('ab', LOWER) => true + * all_characters_of_type('a b', LOWER) => false + * all_characters_of_type('a7', LOWER) => false + * all_characters_of_type('a B', LOWER) => false + * all_characters_of_type('ab', LOWER, LOWER|UPPER) => true + * all_characters_of_type('a b', LOWER, LOWER|UPPER) => true + * all_characters_of_type('a7', LOWER, LOWER|UPPER) => true + * all_characters_of_type('a B', LOWER, LOWER|UPPER) => false + * @endcode + * + * @param flags_table Table of character-type flags + * @param d_str String for this operation + * @param types The character types to check in the string + * @param verify_types Only verify against these character types. + * Default `ALL_TYPES` means return `true` + * iff all characters match `types`.
+ * @return True if all characters match the type conditions + */ +__device__ inline bool all_characters_of_type( + cudf::strings::detail::character_flags_table_type* flags_table, + string_view d_str, + string_character_types types, + string_character_types verify_types = string_character_types::ALL_TYPES) +{ + bool check = !d_str.empty(); // require at least one character + size_type check_count = 0; + for (auto itr = d_str.begin(); check && (itr != d_str.end()); ++itr) { + auto code_point = cudf::strings::detail::utf8_to_codepoint(*itr); + // lookup flags in table by code-point + auto flag = code_point <= 0x00FFFF ? flags_table[code_point] : 0; + if ((verify_types & flag) || // should flag be verified + (flag == 0 && verify_types == ALL_TYPES)) // special edge case + { + check = (types & flag) > 0; + ++check_count; + } + } + return check && (check_count > 0); +} + +/** + * @brief Returns true if all characters are alphabetic only + * + * @param flags_table Table required for checking character types + * @param d_str Input string to check + * @return True if characters are alphabetic + */ +__device__ inline bool is_alpha(cudf::strings::detail::character_flags_table_type* flags_table, + string_view d_str) +{ + return all_characters_of_type(flags_table, d_str, string_character_types::ALPHA); +} + +/** + * @brief Returns true if all characters are alphanumeric only + * + * @param flags_table Table required for checking character types + * @param d_str Input string to check + * @return True if characters are alphanumeric + */ +__device__ inline bool is_alpha_numeric( + cudf::strings::detail::character_flags_table_type* flags_table, string_view d_str) +{ + return all_characters_of_type(flags_table, d_str, string_character_types::ALPHANUM); +} + +/** + * @brief Returns true if all characters are numeric only + * + * @param flags_table Table required for checking character types + * @param d_str Input string to check + * @return True if characters are numeric + */ +__device__ inline bool is_numeric(cudf::strings::detail::character_flags_table_type* flags_table, + string_view d_str) +{ + return all_characters_of_type(flags_table, d_str, string_character_types::NUMERIC); +} + +/** + * @brief Returns true if all characters are digits only + * + * @param flags_table Table required for checking character types + * @param d_str Input string to check + * @return True if characters are digits + */ +__device__ inline bool is_digit(cudf::strings::detail::character_flags_table_type* flags_table, + string_view d_str) +{ + return all_characters_of_type(flags_table, d_str, string_character_types::DIGIT); +} + +/** + * @brief Returns true if all characters are decimal only + * + * @param flags_table Table required for checking character types + * @param d_str Input string to check + * @return True if characters are decimal + */ +__device__ inline bool is_decimal(cudf::strings::detail::character_flags_table_type* flags_table, + string_view d_str) +{ + return all_characters_of_type(flags_table, d_str, string_character_types::DECIMAL); +} + +/** + * @brief Returns true if all characters are spaces only + * + * @param flags_table Table required for checking character types + * @param d_str Input string to check + * @return True if characters are spaces + */ +__device__ inline bool is_space(cudf::strings::detail::character_flags_table_type* flags_table, + string_view d_str) +{ + return all_characters_of_type(flags_table, d_str, string_character_types::SPACE); +} + +/** + * @brief Returns true if all characters are upper
diff --git a/python/strings_udf/cpp/include/cudf/strings/udf/search.cuh b/python/strings_udf/cpp/include/cudf/strings/udf/search.cuh
new file mode 100644
index 00000000000..ef15886f1f5
--- /dev/null
+++ b/python/strings_udf/cpp/include/cudf/strings/udf/search.cuh
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cudf/strings/string_view.cuh>
+
+namespace cudf {
+namespace strings {
+namespace udf {
+
+/**
+ * @brief Returns the number of times that the target string appears
+ * in the source string.
+ *
+ * If `start <= 0` the search begins at the beginning of the `source` string.
+ * If `end <= 0` or `end` is greater than the length of the `source` string,
+ * the search stops at the end of the string.
+ *
+ * @param source Source string to search
+ * @param target String to match within source
+ * @param start First character position within source to start the search
+ * @param end Last character position (exclusive) within source to search
+ * @return Number of matches
+ */
+__device__ inline cudf::size_type count(string_view const source,
+                                        string_view const target,
+                                        cudf::size_type start = 0,
+                                        cudf::size_type end   = -1)
+{
+  auto const tgt_length = target.length();
+  auto const src_length = source.length();
+
+  start = start < 0 ? 0 : start;
+  end   = (end < 0 || end > src_length) ? src_length : end;
+
+  if (tgt_length == 0) { return (end - start) + 1; }
+  cudf::size_type count = 0;
+  cudf::size_type pos   = start;
+  while (pos != cudf::string_view::npos) {
+    pos = source.find(target, pos, end - pos);
+    if (pos != cudf::string_view::npos) {
+      ++count;
+      pos += tgt_length;
+    }
+  }
+  return count;
+}
+
+} // namespace udf
+} // namespace strings
+} // namespace cudf
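count() above mirrors Python's str.count: non-overlapping matches, and an empty target matches once between every pair of characters (and at both ends). A short pure-Python model of the same algorithm, checked against str.count:

    # Pure-Python model of udf::count: non-overlapping occurrences of
    # `target` in `source[start:end]`, with Python's empty-target rule.
    def count(source, target, start=0, end=-1):
        start = max(start, 0)
        end = len(source) if (end < 0 or end > len(source)) else end
        if not target:
            return (end - start) + 1
        n, pos = 0, start
        while True:
            pos = source.find(target, pos, end)
            if pos == -1:
                return n
            n += 1
            pos += len(target)  # skip past the match: non-overlapping

    assert count("banana", "an") == "banana".count("an")   # 2
    assert count("aaaa", "aa") == "aaaa".count("aa")        # 2, not 3
    assert count("abc", "") == "abc".count("")              # 4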
diff --git a/python/strings_udf/cpp/include/cudf/strings/udf/starts_with.cuh b/python/strings_udf/cpp/include/cudf/strings/udf/starts_with.cuh
new file mode 100644
index 00000000000..38c609ae505
--- /dev/null
+++ b/python/strings_udf/cpp/include/cudf/strings/udf/starts_with.cuh
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cudf/strings/string_view.cuh>
+
+namespace cudf {
+namespace strings {
+namespace udf {
+
+/**
+ * @brief Returns true if the beginning of the specified string
+ * matches the given character array.
+ *
+ * @param dstr String to check
+ * @param tgt Character array encoded in UTF-8
+ * @param bytes Number of bytes to read from `tgt`
+ * @return true if `tgt` matches the beginning of `dstr`
+ */
+__device__ inline bool starts_with(cudf::string_view const dstr,
+                                   char const* tgt,
+                                   cudf::size_type bytes)
+{
+  if (bytes > dstr.size_bytes()) { return false; }
+  auto const start_str = cudf::string_view{dstr.data(), bytes};
+  return start_str.compare(tgt, bytes) == 0;
+}
+
+/**
+ * @brief Returns true if the beginning of the specified string
+ * matches the given target string.
+ *
+ * @param dstr String to check
+ * @param tgt String to match
+ * @return true if `tgt` matches the beginning of `dstr`
+ */
+__device__ inline bool starts_with(cudf::string_view const dstr, cudf::string_view const& tgt)
+{
+  return starts_with(dstr, tgt.data(), tgt.size_bytes());
+}
+
+/**
+ * @brief Returns true if the end of the specified string
+ * matches the given character array.
+ *
+ * @param dstr String to check
+ * @param tgt Character array encoded in UTF-8
+ * @param bytes Number of bytes to read from `tgt`
+ * @return true if `tgt` matches the end of `dstr`
+ */
+__device__ inline bool ends_with(cudf::string_view const dstr,
+                                 char const* tgt,
+                                 cudf::size_type bytes)
+{
+  if (bytes > dstr.size_bytes()) { return false; }
+  auto const end_str = cudf::string_view{dstr.data() + dstr.size_bytes() - bytes, bytes};
+  return end_str.compare(tgt, bytes) == 0;
+}
+
+/**
+ * @brief Returns true if the end of the specified string
+ * matches the given target string.
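Both helpers above work on raw UTF-8 bytes rather than decoded characters: they compare only the first (or last) `bytes` bytes of the string, which is safe because a UTF-8 prefix or suffix match is exactly a byte prefix or suffix match. The same idea on Python bytes objects:

    # What starts_with/ends_with do, modeled on the UTF-8 byte sequence.
    def starts_with(s: bytes, tgt: bytes) -> bool:
        if len(tgt) > len(s):
            return False
        return s[: len(tgt)] == tgt

    def ends_with(s: bytes, tgt: bytes) -> bool:
        if len(tgt) > len(s):
            return False
        return s[len(s) - len(tgt):] == tgt

    assert starts_with("héllo".encode(), "hé".encode())
    assert ends_with(b"shim_70.ptx", b".ptx")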
+ *
+ * @param dstr String to check
+ * @param tgt String to match
+ * @return true if `tgt` matches the end of `dstr`
+ */
+__device__ inline bool ends_with(cudf::string_view const dstr, cudf::string_view const& tgt)
+{
+  return ends_with(dstr, tgt.data(), tgt.size_bytes());
+}
+
+} // namespace udf
+} // namespace strings
+} // namespace cudf
diff --git a/python/strings_udf/cpp/include/cudf/strings/udf/udf_apis.hpp b/python/strings_udf/cpp/include/cudf/strings/udf/udf_apis.hpp
new file mode 100644
index 00000000000..6de9b91de08
--- /dev/null
+++ b/python/strings_udf/cpp/include/cudf/strings/udf/udf_apis.hpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cudf/column/column_view.hpp>
+
+#include <rmm/device_buffer.hpp>
+
+#include <memory>
+
+namespace cudf {
+namespace strings {
+namespace udf {
+
+/**
+ * @brief Return a cudf::string_view array for the given strings column
+ *
+ * @param input Strings column to convert to a string_view array.
+ * @throw cudf::logic_error if input is not a strings column.
+ * @return Device buffer containing the array of cudf::string_view objects
+ */
+std::unique_ptr<rmm::device_buffer> to_string_view_array(cudf::column_view const input);
+
+} // namespace udf
+} // namespace strings
+} // namespace cudf
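to_string_view_array is the host-side entry point the Python layer uses to turn a strings column into a flat device array of string_view structs that a numba kernel can index. A hedged usage sketch; the Cython binding (cudf_jit_udf.pyx) appears later in this diff, and running this requires a GPU plus a built strings_udf:

    # Sketch: convert a cudf strings column into a device buffer of
    # cudf::string_view structs, ready to hand to a numba kernel.
    import cudf
    from strings_udf._lib.cudf_jit_udf import to_string_view_array

    col = cudf.core.column.as_column(["abc", "defg", ""])
    str_view_buffer = to_string_view_array(col)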
diff --git a/python/strings_udf/cpp/src/strings/udf/shim.cu b/python/strings_udf/cpp/src/strings/udf/shim.cu
new file mode 100644
index 00000000000..656861f9cd6
--- /dev/null
+++ b/python/strings_udf/cpp/src/strings/udf/shim.cu
@@ -0,0 +1,208 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf/strings/udf/char_types.cuh>
+#include <cudf/strings/udf/search.cuh>
+#include <cudf/strings/udf/starts_with.cuh>
+
+using namespace cudf::strings::udf;
+
+extern "C" __device__ int len(int* nb_retval, void const* str)
+{
+  auto sv    = reinterpret_cast<cudf::string_view const*>(str);
+  *nb_retval = sv->length();
+  return 0;
+}
+
+extern "C" __device__ int startswith(bool* nb_retval, void const* str, void const* substr)
+{
+  auto str_view    = reinterpret_cast<cudf::string_view const*>(str);
+  auto substr_view = reinterpret_cast<cudf::string_view const*>(substr);
+
+  *nb_retval = starts_with(*str_view, *substr_view);
+  return 0;
+}
+
+extern "C" __device__ int endswith(bool* nb_retval, void const* str, void const* substr)
+{
+  auto str_view    = reinterpret_cast<cudf::string_view const*>(str);
+  auto substr_view = reinterpret_cast<cudf::string_view const*>(substr);
+
+  *nb_retval = ends_with(*str_view, *substr_view);
+  return 0;
+}
+
+extern "C" __device__ int contains(bool* nb_retval, void const* str, void const* substr)
+{
+  auto str_view    = reinterpret_cast<cudf::string_view const*>(str);
+  auto substr_view = reinterpret_cast<cudf::string_view const*>(substr);
+
+  *nb_retval = (str_view->find(*substr_view) != cudf::string_view::npos);
+  return 0;
+}
+
+extern "C" __device__ int find(int* nb_retval, void const* str, void const* substr)
+{
+  auto str_view    = reinterpret_cast<cudf::string_view const*>(str);
+  auto substr_view = reinterpret_cast<cudf::string_view const*>(substr);
+
+  *nb_retval = str_view->find(*substr_view);
+  return 0;
+}
+
+extern "C" __device__ int rfind(int* nb_retval, void const* str, void const* substr)
+{
+  auto str_view    = reinterpret_cast<cudf::string_view const*>(str);
+  auto substr_view = reinterpret_cast<cudf::string_view const*>(substr);
+
+  *nb_retval = str_view->rfind(*substr_view);
+  return 0;
+}
+
+extern "C" __device__ int eq(bool* nb_retval, void const* str, void const* rhs)
+{
+  auto str_view = reinterpret_cast<cudf::string_view const*>(str);
+  auto rhs_view = reinterpret_cast<cudf::string_view const*>(rhs);
+
+  *nb_retval = (*str_view == *rhs_view);
+  return 0;
+}
+
+extern "C" __device__ int ne(bool* nb_retval, void const* str, void const* rhs)
+{
+  auto str_view = reinterpret_cast<cudf::string_view const*>(str);
+  auto rhs_view = reinterpret_cast<cudf::string_view const*>(rhs);
+
+  *nb_retval = (*str_view != *rhs_view);
+  return 0;
+}
+
+extern "C" __device__ int ge(bool* nb_retval, void const* str, void const* rhs)
+{
+  auto str_view = reinterpret_cast<cudf::string_view const*>(str);
+  auto rhs_view = reinterpret_cast<cudf::string_view const*>(rhs);
+
+  *nb_retval = (*str_view >= *rhs_view);
+  return 0;
+}
+
+extern "C" __device__ int le(bool* nb_retval, void const* str, void const* rhs)
+{
+  auto str_view = reinterpret_cast<cudf::string_view const*>(str);
+  auto rhs_view = reinterpret_cast<cudf::string_view const*>(rhs);
+
+  *nb_retval = (*str_view <= *rhs_view);
+  return 0;
+}
+
+extern "C" __device__ int gt(bool* nb_retval, void const* str, void const* rhs)
+{
+  auto str_view = reinterpret_cast<cudf::string_view const*>(str);
+  auto rhs_view = reinterpret_cast<cudf::string_view const*>(rhs);
+
+  *nb_retval = (*str_view > *rhs_view);
+  return 0;
+}
+
+extern "C" __device__ int lt(bool* nb_retval, void const* str, void const* rhs)
+{
+  auto str_view = reinterpret_cast<cudf::string_view const*>(str);
+  auto rhs_view = reinterpret_cast<cudf::string_view const*>(rhs);
+
+  *nb_retval = (*str_view < *rhs_view);
+  return 0;
+}
+
+extern "C" __device__ int pyislower(bool* nb_retval, void const* str, std::int64_t chars_table)
+{
+  auto str_view = reinterpret_cast<cudf::string_view const*>(str);
+
+  *nb_retval = is_lower(
+    reinterpret_cast<cudf::strings::detail::character_flags_table_type*>(chars_table), *str_view);
+  return 0;
+}
+
+extern "C" __device__ int pyisupper(bool* nb_retval, void const* str, std::int64_t chars_table)
+{
+  auto str_view = reinterpret_cast<cudf::string_view const*>(str);
+
+  *nb_retval = is_upper(
+    reinterpret_cast<cudf::strings::detail::character_flags_table_type*>(chars_table), *str_view);
+  return 0;
+}
+
+extern "C" __device__ int pyisspace(bool* nb_retval, void const* str, std::int64_t chars_table)
+{
+  auto str_view = reinterpret_cast<cudf::string_view const*>(str);
+
+  *nb_retval = is_space(
+    reinterpret_cast<cudf::strings::detail::character_flags_table_type*>(chars_table), *str_view);
+  return 0;
+}
+
+extern "C" __device__ int pyisdecimal(bool* nb_retval, void const* str, std::int64_t chars_table)
+{
+  auto str_view = reinterpret_cast<cudf::string_view const*>(str);
+
+  *nb_retval = is_decimal(
+    reinterpret_cast<cudf::strings::detail::character_flags_table_type*>(chars_table), *str_view);
+  return 0;
+}
+
+extern "C" __device__ int pyisnumeric(bool* nb_retval, void const* str, std::int64_t chars_table)
+{
+  auto str_view = reinterpret_cast<cudf::string_view const*>(str);
+
+  *nb_retval = is_numeric(
+    reinterpret_cast<cudf::strings::detail::character_flags_table_type*>(chars_table), *str_view);
+  return 0;
+}
+
+extern "C" __device__ int pyisdigit(bool* nb_retval, void const* str, std::int64_t chars_table)
+{
+  auto str_view = reinterpret_cast<cudf::string_view const*>(str);
+
+  *nb_retval = is_digit(
+    reinterpret_cast<cudf::strings::detail::character_flags_table_type*>(chars_table), *str_view);
+  return 0;
+}
+
+extern "C" __device__ int pyisalnum(bool* nb_retval, void const* str, std::int64_t chars_table)
+{
+  auto str_view = reinterpret_cast<cudf::string_view const*>(str);
+
+  *nb_retval = is_alpha_numeric(
+    reinterpret_cast<cudf::strings::detail::character_flags_table_type*>(chars_table), *str_view);
+  return 0;
+}
+
+extern "C" __device__ int pyisalpha(bool* nb_retval, void const* str, std::int64_t chars_table)
+{
+  auto str_view = reinterpret_cast<cudf::string_view const*>(str);
+
+  *nb_retval = is_alpha(
+    reinterpret_cast<cudf::strings::detail::character_flags_table_type*>(chars_table), *str_view);
+  return 0;
+}
+
+extern "C" __device__ int pycount(int* nb_retval, void const* str, void const* substr)
+{
+  auto str_view    = reinterpret_cast<cudf::string_view const*>(str);
+  auto substr_view = reinterpret_cast<cudf::string_view const*>(substr);
+
+  *nb_retval = count(*str_view, *substr_view);
+  return 0;
+}
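Every shim above follows the same calling convention: return an int status code and write the logical result through the first pointer argument, with strings passed as opaque void* addresses of cudf::string_view structs. That is exactly the ABI numba expects from cuda.declare_device, which is how lowering.py (later in this diff) binds to these symbols without ever spelling out the out-parameter. A sketch, where the pointer type stands in for the CPointer(string_view) defined in _typing.py:

    # How the Python side declares a shim such as
    #     int startswith(bool* nb_retval, void const* str, void const* substr)
    # numba hides the status return and *nb_retval, so only the logical
    # boolean(str, substr) signature is written out.
    from numba import cuda, types

    _STR_VIEW_PTR = types.CPointer(types.char)  # illustrative stand-in
    _string_view_startswith = cuda.declare_device(
        "startswith", types.boolean(_STR_VIEW_PTR, _STR_VIEW_PTR)
    )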
diff --git a/python/strings_udf/cpp/src/strings/udf/udf_apis.cu b/python/strings_udf/cpp/src/strings/udf/udf_apis.cu
new file mode 100644
index 00000000000..dfef1be39f5
--- /dev/null
+++ b/python/strings_udf/cpp/src/strings/udf/udf_apis.cu
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf/strings/udf/udf_apis.hpp>
+
+#include <cudf/column/column_view.hpp>
+#include <cudf/strings/detail/utilities.cuh>
+#include <cudf/strings/strings_column_view.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+
+namespace cudf {
+namespace strings {
+namespace udf {
+namespace detail {
+
+std::unique_ptr<rmm::device_buffer> to_string_view_array(cudf::column_view const input,
+                                                         rmm::cuda_stream_view stream)
+{
+  return std::make_unique<rmm::device_buffer>(
+    std::move(cudf::strings::detail::create_string_vector_from_column(
+                cudf::strings_column_view(input), stream)
+                .release()));
+}
+
+} // namespace detail
+
+std::unique_ptr<rmm::device_buffer> to_string_view_array(cudf::column_view const input)
+{
+  return detail::to_string_view_array(input, rmm::cuda_stream_default);
+}
+
+} // namespace udf
+} // namespace strings
+} // namespace cudf
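One loose end from shim.cu: the pyis* entry points receive libcudf's character-flags table as a raw std::int64_t rather than a typed pointer, because the Python side can only bake a plain integer constant into the generated code. The pointer value comes from tables.pyx (later in this diff); a sketch of the handoff:

    # Sketch: obtain the device address of libcudf's character-flags table.
    from strings_udf._lib.tables import get_character_flags_table_ptr

    tbl_ptr = get_character_flags_table_ptr()  # numpy int64 pointer value
    # lowering.py (below) then passes it as the hidden chars_table argument:
    #     context.get_constant(types.int64, tbl_ptr)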
diff --git a/python/strings_udf/setup.cfg b/python/strings_udf/setup.cfg
new file mode 100644
index 00000000000..9f29b26b5e0
--- /dev/null
+++ b/python/strings_udf/setup.cfg
@@ -0,0 +1,41 @@
+# Copyright (c) 2022, NVIDIA CORPORATION.
+
+[versioneer]
+VCS = git
+style = pep440
+versionfile_source = strings_udf/_version.py
+versionfile_build = strings_udf/_version.py
+tag_prefix = v
+parentdir_prefix = strings_udf-
+
+[isort]
+line_length=79
+multi_line_output=3
+include_trailing_comma=True
+force_grid_wrap=0
+combine_as_imports=True
+order_by_type=True
+known_dask=
+    dask
+    distributed
+    dask_cuda
+known_rapids=
+    rmm
+    cudf
+known_first_party=
+    strings_udf
+default_section=THIRDPARTY
+sections=FUTURE,STDLIB,THIRDPARTY,DASK,RAPIDS,FIRSTPARTY,LOCALFOLDER
+skip=
+    thirdparty
+    .eggs
+    .git
+    .hg
+    .mypy_cache
+    .tox
+    .venv
+    _build
+    buck-out
+    build
+    dist
+    __init__.py
diff --git a/python/strings_udf/setup.py b/python/strings_udf/setup.py
new file mode 100644
index 00000000000..c8cafe978f7
--- /dev/null
+++ b/python/strings_udf/setup.py
@@ -0,0 +1,81 @@
+# Copyright (c) 2022, NVIDIA CORPORATION.
+
+import os
+import re
+import shutil
+
+import versioneer
+from setuptools import find_packages
+from skbuild import setup
+
+install_requires = ["numba>=0.53.1", "numpy", "cudf"]
+
+extras_require = {
+    "test": [
+        "pytest",
+    ]
+}
+
+
+def get_cuda_version_from_header(cuda_include_dir, delimiter=""):
+
+    cuda_version = None
+
+    with open(os.path.join(cuda_include_dir, "cuda.h"), encoding="utf-8") as f:
+        for line in f.readlines():
+            if re.search(r"#define CUDA_VERSION ", line) is not None:
+                cuda_version = line
+                break
+
+    if cuda_version is None:
+        raise TypeError("CUDA_VERSION not found in cuda.h")
+    cuda_version = int(cuda_version.split()[2])
+    return "%d%s%d" % (
+        cuda_version // 1000,
+        delimiter,
+        (cuda_version % 1000) // 10,
+    )
+
+
+CUDA_HOME = os.environ.get("CUDA_HOME", False)
+if not CUDA_HOME:
+    path_to_cuda_gdb = shutil.which("cuda-gdb")
+    if path_to_cuda_gdb is None:
+        raise OSError(
+            "Could not locate CUDA. "
+            "Please set the environment variable "
+            "CUDA_HOME to the path to the CUDA installation "
+            "and try again."
+        )
+    CUDA_HOME = os.path.dirname(os.path.dirname(path_to_cuda_gdb))
+
+if not os.path.isdir(CUDA_HOME):
+    raise OSError(f"Invalid CUDA_HOME: directory does not exist: {CUDA_HOME}")
+
+cuda_include_dir = os.path.join(CUDA_HOME, "include")
+
+setup(
+    name="strings_udf",
+    version=versioneer.get_version(),
+    description="Strings UDF Library",
+    url="https://github.com/rapidsai/cudf",
+    author="NVIDIA Corporation",
+    license="Apache 2.0",
+    classifiers=[
+        "Intended Audience :: Developers",
+        "Topic :: Database",
+        "Topic :: Scientific/Engineering",
+        "License :: OSI Approved :: Apache Software License",
+        "Programming Language :: Python",
+        "Programming Language :: Python :: 3.8",
+        "Programming Language :: Python :: 3.9",
+    ],
+    packages=find_packages(include=["strings_udf", "strings_udf.*"]),
+    package_data={
+        key: ["*.pxd"] for key in find_packages(include=["strings_udf._lib*"])
+    },
+    cmdclass=versioneer.get_cmdclass(),
+    install_requires=install_requires,
+    extras_require=extras_require,
+    zip_safe=False,
+)
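A worked example of the arithmetic in get_cuda_version_from_header above: cuda.h defines something like `#define CUDA_VERSION 11050`, where the encoding is major*1000 + minor*10.

    # 11050 -> major 11, minor 5; with delimiter="." the function yields "11.5".
    cuda_version = 11050
    major = cuda_version // 1000          # 11
    minor = (cuda_version % 1000) // 10   # 5
    assert "%d%s%d" % (major, ".", minor) == "11.5"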
diff --git a/python/strings_udf/strings_udf/__init__.py b/python/strings_udf/strings_udf/__init__.py
new file mode 100644
index 00000000000..94bd2531779
--- /dev/null
+++ b/python/strings_udf/strings_udf/__init__.py
@@ -0,0 +1,75 @@
+# Copyright (c) 2022, NVIDIA CORPORATION.
+import glob
+import os
+import re
+import subprocess
+import sys
+
+from numba import cuda
+from ptxcompiler.patch import CMD
+
+from . import _version
+
+ENABLED = False
+
+
+def compiler_from_ptx_file(path):
+    """Parse a PTX file header and extract the CUDA version used to compile it.
+
+    Here is an example PTX header that this function should parse:
+
+    // Generated by NVIDIA NVVM Compiler
+    //
+    // Compiler Build ID: CL-30672275
+    // Cuda compilation tools, release 11.5, V11.5.119
+    // Based on NVVM 7
+    """
+    with open(path) as f:
+        text = f.read()
+    major, minor = (
+        re.search(r"Cuda compilation tools, release ([0-9\.]+)", text)
+        .group(1)
+        .split(".")
+    )
+    return int(major), int(minor)
+
+
+# adapted from PTXCompiler
+cp = subprocess.run([sys.executable, "-c", CMD], capture_output=True)
+if cp.returncode == 0:
+    # must have a driver to proceed
+    versions = [int(s) for s in cp.stdout.strip().split()]
+    driver_version = tuple(versions[:2])
+    runtime_version = tuple(versions[2:])
+
+    # CUDA enhanced compatibility not yet enabled
+    if driver_version >= runtime_version:
+        # Load the highest compute capability file available that is no
+        # greater than the current device's.
+        files = glob.glob(
+            os.path.join(os.path.dirname(__file__), "shim_*.ptx")
+        )
+        if len(files) == 0:
+            raise RuntimeError(
+                "This strings_udf installation is missing the necessary PTX "
+                "files. Please file an issue reporting this error and how you "
+                "installed cudf and strings_udf."
+            )
+        dev = cuda.get_current_device()
+        cc = int("".join(str(x) for x in dev.compute_capability))
+        sms = [
+            int(os.path.basename(f).rstrip(".ptx").lstrip("shim_"))
+            for f in files
+        ]
+        selected_sm = max(sm for sm in sms if sm <= cc)
+        ptxpath = os.path.join(
+            os.path.dirname(__file__), f"shim_{selected_sm}.ptx"
+        )
+
+        if driver_version >= compiler_from_ptx_file(ptxpath):
+            ENABLED = True
+        else:
+            del ptxpath
+
+__version__ = _version.get_versions()["version"]
diff --git a/python/strings_udf/strings_udf/_lib/CMakeLists.txt b/python/strings_udf/strings_udf/_lib/CMakeLists.txt
new file mode 100644
index 00000000000..91069a43891
--- /dev/null
+++ b/python/strings_udf/strings_udf/_lib/CMakeLists.txt
@@ -0,0 +1,25 @@
+# =============================================================================
+# Copyright (c) 2022, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+# or implied. See the License for the specific language governing permissions and limitations under
+# the License.
+# ============================================================================= + +set(cython_sources cudf_jit_udf.pyx tables.pyx) +set(linked_libraries cudf::cudf cudf_strings_udf) +rapids_cython_create_modules( + CXX + SOURCE_FILES "${cython_sources}" + LINKED_LIBRARIES "${linked_libraries}" +) + +foreach(cython_module IN LISTS _RAPIDS_CYTHON_CREATED_TARGETS) + set_target_properties(${cython_module} PROPERTIES INSTALL_RPATH "\$ORIGIN;\$ORIGIN/cpp") +endforeach() diff --git a/python/strings_udf/strings_udf/_lib/__init__.py b/python/strings_udf/strings_udf/_lib/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/strings_udf/strings_udf/_lib/cpp/__init__.pxd b/python/strings_udf/strings_udf/_lib/cpp/__init__.pxd new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/strings_udf/strings_udf/_lib/cpp/strings_udf.pxd b/python/strings_udf/strings_udf/_lib/cpp/strings_udf.pxd new file mode 100644 index 00000000000..fb8e3a949bf --- /dev/null +++ b/python/strings_udf/strings_udf/_lib/cpp/strings_udf.pxd @@ -0,0 +1,20 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. + +from libc.stdint cimport uint8_t +from libcpp.memory cimport unique_ptr +from libcpp.string cimport string +from libcpp.vector cimport vector + +from cudf._lib.cpp.column.column cimport column +from cudf._lib.cpp.column.column_view cimport column_view +from cudf._lib.cpp.types cimport size_type +from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer + + +cdef extern from "cudf/strings/udf/udf_apis.hpp" namespace \ + "cudf::strings::udf" nogil: + cdef unique_ptr[device_buffer] to_string_view_array(column_view) except + + +cdef extern from "cudf/strings/detail/char_tables.hpp" namespace \ + "cudf::strings::detail" nogil: + cdef const uint8_t* get_character_flags_table() except + diff --git a/python/strings_udf/strings_udf/_lib/cudf_jit_udf.pyx b/python/strings_udf/strings_udf/_lib/cudf_jit_udf.pyx new file mode 100644 index 00000000000..bb1892a4d26 --- /dev/null +++ b/python/strings_udf/strings_udf/_lib/cudf_jit_udf.pyx @@ -0,0 +1,24 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move + +from cudf.core.buffer import Buffer + +from cudf._lib.column cimport Column +from cudf._lib.cpp.column.column cimport column, column_view +from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer + +from strings_udf._lib.cpp.strings_udf cimport ( + to_string_view_array as cpp_to_string_view_array, +) + + +def to_string_view_array(Column strings_col): + cdef unique_ptr[device_buffer] c_buffer + cdef column_view input_view = strings_col.view() + with nogil: + c_buffer = move(cpp_to_string_view_array(input_view)) + + device_buffer = DeviceBuffer.c_from_unique_ptr(move(c_buffer)) + return Buffer(device_buffer) diff --git a/python/strings_udf/strings_udf/_lib/tables.pyx b/python/strings_udf/strings_udf/_lib/tables.pyx new file mode 100644 index 00000000000..5443364a4a7 --- /dev/null +++ b/python/strings_udf/strings_udf/_lib/tables.pyx @@ -0,0 +1,14 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. 
+
+from libc.stdint cimport uint8_t, uintptr_t
+
+from strings_udf._lib.cpp.strings_udf cimport (
+    get_character_flags_table as cpp_get_character_flags_table,
+)
+
+import numpy as np
+
+
+def get_character_flags_table_ptr():
+    cdef const uint8_t* tbl_ptr = cpp_get_character_flags_table()
+    return np.int64(<uintptr_t>tbl_ptr)
diff --git a/python/strings_udf/strings_udf/_typing.py b/python/strings_udf/strings_udf/_typing.py
new file mode 100644
index 00000000000..2e4519a01fe
--- /dev/null
+++ b/python/strings_udf/strings_udf/_typing.py
@@ -0,0 +1,229 @@
+# Copyright (c) 2022, NVIDIA CORPORATION.
+
+import operator
+
+import llvmlite.binding as ll
+from numba import types
+from numba.core.datamodel import default_manager
+from numba.core.extending import models, register_model
+from numba.core.typing import signature as nb_signature
+from numba.core.typing.templates import AbstractTemplate, AttributeTemplate
+from numba.cuda.cudadecl import registry as cuda_decl_registry
+from numba.cuda.cudadrv import nvvm
+
+data_layout = nvvm.data_layout
+
+# libcudf size_type
+size_type = types.int32
+
+# workaround for numba < 0.56
+if isinstance(data_layout, dict):
+    data_layout = data_layout[64]
+target_data = ll.create_target_data(data_layout)
+
+
+# String object definitions
+class DString(types.Type):
+    def __init__(self):
+        super().__init__(name="dstring")
+        llty = default_manager[self].get_value_type()
+        self.size_bytes = llty.get_abi_size(target_data)
+
+
+class StringView(types.Type):
+    def __init__(self):
+        super().__init__(name="string_view")
+        llty = default_manager[self].get_value_type()
+        self.size_bytes = llty.get_abi_size(target_data)
+
+
+@register_model(StringView)
+class stringview_model(models.StructModel):
+    # from string_view.hpp:
+    _members = (
+        # const char* _data{}
+        # Pointer to device memory containing the char array for this string
+        ("data", types.CPointer(types.char)),
+        # size_type _bytes{};
+        # Number of bytes in _data for this string
+        ("bytes", size_type),
+        # mutable size_type _length{};
+        # Number of characters in this string (computed)
+        ("length", size_type),
+    )
+
+    def __init__(self, dmm, fe_type):
+        super().__init__(dmm, fe_type, self._members)
+
+
+@register_model(DString)
+class dstring_model(models.StructModel):
+    # from dstring.hpp:
+    # private:
+    #   char* m_data{};
+    #   cudf::size_type m_bytes{};
+    #   cudf::size_type m_size{};
+
+    _members = (
+        ("m_data", types.CPointer(types.char)),
+        ("m_bytes", size_type),
+        ("m_size", size_type),
+    )
+
+    def __init__(self, dmm, fe_type):
+        super().__init__(dmm, fe_type, self._members)
+
+
+any_string_ty = (StringView, DString, types.StringLiteral)
+string_view = StringView()
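The struct model registered above has to match libcudf's cudf::string_view layout byte for byte, since kernels receive raw pointers into the buffer produced by to_string_view_array. A ctypes mirror of the same layout shows why size_bytes matters; the 16-byte figure assumes a 64-bit target:

    # ctypes sketch of the string_view layout registered above.
    import ctypes

    class StringViewMirror(ctypes.Structure):
        _fields_ = [
            ("data", ctypes.c_void_p),   # const char* _data
            ("bytes", ctypes.c_int32),   # size_type _bytes
            ("length", ctypes.c_int32),  # mutable size_type _length (computed)
        ]

    # 8-byte pointer + two 4-byte ints, 8-byte aligned -> 16 bytes total
    assert ctypes.sizeof(StringViewMirror) == 16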
+
+
+class StrViewArgHandler:
+    """
+    As part of Numba's preprocessing step, incoming function arguments are
+    modified based on the associated type for that argument that was used
+    to JIT the kernel. However, it only knows how to handle built-in array
+    types natively. With string UDFs, the jitted type is string_view*,
+    which numba does not know how to handle.
+
+    This class converts string_view* to raw pointer arguments, which Numba
+    knows how to use.
+
+    See numba.cuda.compiler._prepare_args for details.
+    """
+
+    def prepare_args(self, ty, val, **kwargs):
+        if isinstance(ty, types.CPointer) and isinstance(ty.dtype, StringView):
+            return types.uint64, val.ptr
+        else:
+            return ty, val
+
+
+str_view_arg_handler = StrViewArgHandler()
+
+
+# String functions
+@cuda_decl_registry.register_global(len)
+class StringLength(AbstractTemplate):
+    """
+    Provide the length of a cudf::string_view-like struct.
+    """
+
+    def generic(self, args, kws):
+        if isinstance(args[0], any_string_ty) and len(args) == 1:
+            # length:
+            # string_view -> int32
+            # dstring -> int32
+            # literal -> int32
+            return nb_signature(size_type, args[0])
+
+
+def register_stringview_binaryop(op, retty):
+    """
+    Helper function wrapping numba's low level extension API. Provides
+    the boilerplate needed to associate a signature with a function or
+    operator expecting a string.
+    """
+
+    class StringViewBinaryOp(AbstractTemplate):
+        def generic(self, args, kws):
+            if isinstance(args[0], any_string_ty) and isinstance(
+                args[1], any_string_ty
+            ):
+                return nb_signature(retty, string_view, string_view)
+
+    cuda_decl_registry.register_global(op)(StringViewBinaryOp)
+
+
+register_stringview_binaryop(operator.eq, types.boolean)
+register_stringview_binaryop(operator.ne, types.boolean)
+register_stringview_binaryop(operator.lt, types.boolean)
+register_stringview_binaryop(operator.gt, types.boolean)
+register_stringview_binaryop(operator.le, types.boolean)
+register_stringview_binaryop(operator.ge, types.boolean)
+register_stringview_binaryop(operator.contains, types.boolean)
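These registrations are what make ordinary Python operators type-check on string_view values inside a jitted function. A hedged sketch of the end-user code they enable (the actual Series.apply plumbing lives in cudf proper, not in this diff; see also the tests at the end of the diff):

    # Once the typing above and the lowering in lowering.py are registered,
    # a UDF can compare and search strings with plain Python syntax.
    def udf(st):
        return st == "cudf" or "gpu" in st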
+
+
+def create_binary_attr(attrname, retty):
+    """
+    Helper function wrapping numba's low level extension API. Provides
+    the boilerplate needed to register a binary function of two string
+    objects as an attribute of one, e.g. `string.func(other)`.
+    """
+
+    class StringViewBinaryAttr(AbstractTemplate):
+        key = f"StringView.{attrname}"
+
+        def generic(self, args, kws):
+            return nb_signature(retty, string_view, recvr=self.this)
+
+    def attr(self, mod):
+        return types.BoundFunction(StringViewBinaryAttr, string_view)
+
+    return attr
+
+
+def create_identifier_attr(attrname):
+    """
+    Helper function wrapping numba's low level extension API. Provides
+    the boilerplate needed to register a unary function of a string
+    object as an attribute, e.g. `string.func()`.
+    """
+
+    class StringViewIdentifierAttr(AbstractTemplate):
+        key = f"StringView.{attrname}"
+
+        def generic(self, args, kws):
+            return nb_signature(types.boolean, recvr=self.this)
+
+    def attr(self, mod):
+        return types.BoundFunction(StringViewIdentifierAttr, string_view)
+
+    return attr
+
+
+class StringViewCount(AbstractTemplate):
+    key = "StringView.count"
+
+    def generic(self, args, kws):
+        return nb_signature(size_type, string_view, recvr=self.this)
+
+
+@cuda_decl_registry.register_attr
+class StringViewAttrs(AttributeTemplate):
+    key = string_view
+
+    def resolve_count(self, mod):
+        return types.BoundFunction(StringViewCount, string_view)
+
+
+# Build attributes for `MaskedType(string_view)`
+bool_binary_funcs = ["startswith", "endswith"]
+int_binary_funcs = ["find", "rfind"]
+id_unary_funcs = [
+    "isalpha",
+    "isalnum",
+    "isdecimal",
+    "isdigit",
+    "isupper",
+    "islower",
+    "isspace",
+    "isnumeric",
+]
+
+for func in bool_binary_funcs:
+    setattr(
+        StringViewAttrs,
+        f"resolve_{func}",
+        create_binary_attr(func, types.boolean),
+    )
+
+for func in int_binary_funcs:
+    setattr(
+        StringViewAttrs, f"resolve_{func}", create_binary_attr(func, size_type)
+    )
+
+for func in id_unary_funcs:
+    setattr(StringViewAttrs, f"resolve_{func}", create_identifier_attr(func))
+
+cuda_decl_registry.register_attr(StringViewAttrs)
diff --git a/python/strings_udf/strings_udf/_version.py b/python/strings_udf/strings_udf/_version.py
new file mode 100644
index 00000000000..14ff9ec314d
--- /dev/null
+++ b/python/strings_udf/strings_udf/_version.py
@@ -0,0 +1,711 @@
+# Copyright (c) 2022, NVIDIA CORPORATION.
+
+# This file helps to compute a version number in source trees obtained from
+# git-archive tarball (such as those provided by githubs download-from-tag
+# feature). Distribution tarballs (built by setup.py sdist) and build
+# directories (produced by setup.py build) will contain a much shorter file
+# that just contains the computed version number.
+
+# This file is released into the public domain. Generated by
+# versioneer-0.23 (https://github.com/python-versioneer/python-versioneer)
+
+"""Git implementation of _version.py."""
+
+import errno
+import functools
+import os
+import re
+import subprocess
+import sys
+from typing import Callable, Dict
+
+
+def get_keywords():
+    """Get the keywords needed to look up the version information."""
+    # these strings will be replaced by git during git-archive.
+    # setup.py/versioneer.py will grep for the variable names, so they must
+    # each be defined on a line of their own. _version.py will just call
+    # get_keywords().
+ git_refnames = "$Format:%d$" + git_full = "$Format:%H$" + git_date = "$Format:%ci$" + keywords = {"refnames": git_refnames, "full": git_full, "date": git_date} + return keywords + + +class VersioneerConfig: + """Container for Versioneer configuration parameters.""" + + +def get_config(): + """Create, populate and return the VersioneerConfig() object.""" + # these strings are filled in when 'setup.py versioneer' creates + # _version.py + cfg = VersioneerConfig() + cfg.VCS = "git" + cfg.style = "pep440" + cfg.tag_prefix = "v" + cfg.parentdir_prefix = "strings_udf-" + cfg.versionfile_source = "strings_udf/_version.py" + cfg.verbose = False + return cfg + + +class NotThisMethod(Exception): + """Exception raised if a method is not valid for the current scenario.""" + + +LONG_VERSION_PY: Dict[str, str] = {} +HANDLERS: Dict[str, Dict[str, Callable]] = {} + + +def register_vcs_handler(vcs, method): # decorator + """Create decorator to mark a method as the handler of a VCS.""" + + def decorate(f): + """Store f in HANDLERS[vcs][method].""" + if vcs not in HANDLERS: + HANDLERS[vcs] = {} + HANDLERS[vcs][method] = f + return f + + return decorate + + +def run_command( + commands, args, cwd=None, verbose=False, hide_stderr=False, env=None +): + """Call the given command(s).""" + assert isinstance(commands, list) + process = None + + popen_kwargs = {} + if sys.platform == "win32": + # This hides the console window if pythonw.exe is used + startupinfo = subprocess.STARTUPINFO() + startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW + popen_kwargs["startupinfo"] = startupinfo + + for command in commands: + try: + dispcmd = str([command] + args) + # remember shell=False, so use git.cmd on windows, not just git + process = subprocess.Popen( + [command] + args, + cwd=cwd, + env=env, + stdout=subprocess.PIPE, + stderr=(subprocess.PIPE if hide_stderr else None), + **popen_kwargs, + ) + break + except OSError: + e = sys.exc_info()[1] + if e.errno == errno.ENOENT: + continue + if verbose: + print("unable to run %s" % dispcmd) + print(e) + return None, None + else: + if verbose: + print("unable to find command, tried %s" % (commands,)) + return None, None + stdout = process.communicate()[0].strip().decode() + if process.returncode != 0: + if verbose: + print("unable to run %s (error)" % dispcmd) + print("stdout was %s" % stdout) + return None, process.returncode + return stdout, process.returncode + + +def versions_from_parentdir(parentdir_prefix, root, verbose): + """Try to determine the version from the parent directory name. + + Source tarballs conventionally unpack into a directory that includes both + the project name and a version string. We will also support searching up + two directory levels for an appropriately named parent directory + """ + rootdirs = [] + + for _ in range(3): + dirname = os.path.basename(root) + if dirname.startswith(parentdir_prefix): + return { + "version": dirname[len(parentdir_prefix) :], + "full-revisionid": None, + "dirty": False, + "error": None, + "date": None, + } + rootdirs.append(root) + root = os.path.dirname(root) # up a level + + if verbose: + print( + "Tried directories %s but none started with prefix %s" + % (str(rootdirs), parentdir_prefix) + ) + raise NotThisMethod("rootdir doesn't start with parentdir_prefix") + + +@register_vcs_handler("git", "get_keywords") +def git_get_keywords(versionfile_abs): + """Extract version information from the given file.""" + # the code embedded in _version.py can just fetch the value of these + # keywords. 
When used from setup.py, we don't want to import _version.py, + # so we do it with a regexp instead. This function is not used from + # _version.py. + keywords = {} + try: + with open(versionfile_abs, "r") as fobj: + for line in fobj: + if line.strip().startswith("git_refnames ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["refnames"] = mo.group(1) + if line.strip().startswith("git_full ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["full"] = mo.group(1) + if line.strip().startswith("git_date ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["date"] = mo.group(1) + except OSError: + pass + return keywords + + +@register_vcs_handler("git", "keywords") +def git_versions_from_keywords(keywords, tag_prefix, verbose): + """Get version information from git keywords.""" + if "refnames" not in keywords: + raise NotThisMethod("Short version file found") + date = keywords.get("date") + if date is not None: + # Use only the last line. Previous lines may contain GPG signature + # information. + date = date.splitlines()[-1] + + # git-2.2.0 added "%cI", which expands to an ISO-8601 -compliant + # datestamp. However we prefer "%ci" (which expands to an "ISO-8601 + # -like" string, which we must then edit to make compliant), because + # it's been around since git-1.5.3, and it's too difficult to + # discover which version we're using, or to work around using an + # older one. + date = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + refnames = keywords["refnames"].strip() + if refnames.startswith("$Format"): + if verbose: + print("keywords are unexpanded, not using") + raise NotThisMethod("unexpanded keywords, not a git-archive tarball") + refs = {r.strip() for r in refnames.strip("()").split(",")} + # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of + # just "foo-1.0". If we see a "tag: " prefix, prefer those. + TAG = "tag: " + tags = {r[len(TAG) :] for r in refs if r.startswith(TAG)} + if not tags: + # Either we're using git < 1.8.3, or there really are no tags. We use + # a heuristic: assume all version tags have a digit. The old git %d + # expansion behaves like git log --decorate=short and strips out the + # refs/heads/ and refs/tags/ prefixes that would let us distinguish + # between branches and tags. By ignoring refnames without digits, we + # filter out many common branch names like "release" and + # "stabilization", as well as "HEAD" and "master". + tags = {r for r in refs if re.search(r"\d", r)} + if verbose: + print("discarding '%s', no digits" % ",".join(refs - tags)) + if verbose: + print("likely tags: %s" % ",".join(sorted(tags))) + for ref in sorted(tags): + # sorting will prefer e.g. 
"2.0" over "2.0rc1" + if ref.startswith(tag_prefix): + r = ref[len(tag_prefix) :] + # Filter out refs that exactly match prefix or that don't start + # with a number once the prefix is stripped (mostly a concern + # when prefix is '') + if not re.match(r"\d", r): + continue + if verbose: + print("picking %s" % r) + return { + "version": r, + "full-revisionid": keywords["full"].strip(), + "dirty": False, + "error": None, + "date": date, + } + # no suitable tags, so version is "0+unknown", but full hex is still there + if verbose: + print("no suitable tags, using unknown + full revision id") + return { + "version": "0+unknown", + "full-revisionid": keywords["full"].strip(), + "dirty": False, + "error": "no suitable tags", + "date": None, + } + + +@register_vcs_handler("git", "pieces_from_vcs") +def git_pieces_from_vcs(tag_prefix, root, verbose, runner=run_command): + """Get version from 'git describe' in the root of the source tree. + + This only gets called if the git-archive 'subst' keywords were *not* + expanded, and _version.py hasn't already been rewritten with a short + version string, meaning we're inside a checked out source tree. + """ + GITS = ["git"] + if sys.platform == "win32": + GITS = ["git.cmd", "git.exe"] + + # GIT_DIR can interfere with correct operation of Versioneer. + # It may be intended to be passed to the Versioneer-versioned project, + # but that should not change where we get our version from. + env = os.environ.copy() + env.pop("GIT_DIR", None) + runner = functools.partial(runner, env=env) + + _, rc = runner( + GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=True + ) + if rc != 0: + if verbose: + print("Directory %s not under git control" % root) + raise NotThisMethod("'git rev-parse --git-dir' returned error") + + # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] + # if there isn't one, this yields HEX[-dirty] (no NUM) + describe_out, rc = runner( + GITS, + [ + "describe", + "--tags", + "--dirty", + "--always", + "--long", + "--match", + f"{tag_prefix}[[:digit:]]*", + ], + cwd=root, + ) + # --long was added in git-1.5.5 + if describe_out is None: + raise NotThisMethod("'git describe' failed") + describe_out = describe_out.strip() + full_out, rc = runner(GITS, ["rev-parse", "HEAD"], cwd=root) + if full_out is None: + raise NotThisMethod("'git rev-parse' failed") + full_out = full_out.strip() + + pieces = {} + pieces["long"] = full_out + pieces["short"] = full_out[:7] # maybe improved later + pieces["error"] = None + + branch_name, rc = runner( + GITS, ["rev-parse", "--abbrev-ref", "HEAD"], cwd=root + ) + # --abbrev-ref was added in git-1.6.3 + if rc != 0 or branch_name is None: + raise NotThisMethod("'git rev-parse --abbrev-ref' returned error") + branch_name = branch_name.strip() + + if branch_name == "HEAD": + # If we aren't exactly on a branch, pick a branch which represents + # the current commit. If all else fails, we are on a branchless + # commit. + branches, rc = runner(GITS, ["branch", "--contains"], cwd=root) + # --contains was added in git-1.5.4 + if rc != 0 or branches is None: + raise NotThisMethod("'git branch --contains' returned error") + branches = branches.split("\n") + + # Remove the first line if we're running detached + if "(" in branches[0]: + branches.pop(0) + + # Strip off the leading "* " from the list of branches. + branches = [branch[2:] for branch in branches] + if "master" in branches: + branch_name = "master" + elif not branches: + branch_name = None + else: + # Pick the first branch that is returned. 
Good or bad. + branch_name = branches[0] + + pieces["branch"] = branch_name + + # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] + # TAG might have hyphens. + git_describe = describe_out + + # look for -dirty suffix + dirty = git_describe.endswith("-dirty") + pieces["dirty"] = dirty + if dirty: + git_describe = git_describe[: git_describe.rindex("-dirty")] + + # now we have TAG-NUM-gHEX or HEX + + if "-" in git_describe: + # TAG-NUM-gHEX + mo = re.search(r"^(.+)-(\d+)-g([0-9a-f]+)$", git_describe) + if not mo: + # unparsable. Maybe git-describe is misbehaving? + pieces["error"] = ( + "unable to parse git-describe output: '%s'" % describe_out + ) + return pieces + + # tag + full_tag = mo.group(1) + if not full_tag.startswith(tag_prefix): + if verbose: + fmt = "tag '%s' doesn't start with prefix '%s'" + print(fmt % (full_tag, tag_prefix)) + pieces["error"] = "tag '%s' doesn't start with prefix '%s'" % ( + full_tag, + tag_prefix, + ) + return pieces + pieces["closest-tag"] = full_tag[len(tag_prefix) :] + + # distance: number of commits since tag + pieces["distance"] = int(mo.group(2)) + + # commit: short hex revision ID + pieces["short"] = mo.group(3) + + else: + # HEX: no tags + pieces["closest-tag"] = None + out, rc = runner(GITS, ["rev-list", "HEAD", "--left-right"], cwd=root) + pieces["distance"] = len(out.split()) # total number of commits + + # commit date: see ISO-8601 comment in git_versions_from_keywords() + date = runner(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[ + 0 + ].strip() + # Use only the last line. Previous lines may contain GPG signature + # information. + date = date.splitlines()[-1] + pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + + return pieces + + +def plus_or_dot(pieces): + """Return a + if we don't already have one, else return a .""" + if "+" in pieces.get("closest-tag", ""): + return "." + return "+" + + +def render_pep440(pieces): + """Build up version string, with post-release "local version identifier". + + Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you + get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty + + Exceptions: + 1: no tags. git_describe was just HEX. 0+untagged.DISTANCE.gHEX[.dirty] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += plus_or_dot(pieces) + rendered += "%d.g%s" % (pieces["distance"], pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + else: + # exception #1 + rendered = "0+untagged.%d.g%s" % (pieces["distance"], pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + return rendered + + +def render_pep440_branch(pieces): + """TAG[[.dev0]+DISTANCE.gHEX[.dirty]] . + + The ".dev0" means not master branch. Note that .dev0 sorts backwards + (a feature branch will appear "older" than the master branch). + + Exceptions: + 1: no tags. 
0[.dev0]+untagged.DISTANCE.gHEX[.dirty] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + if pieces["branch"] != "master": + rendered += ".dev0" + rendered += plus_or_dot(pieces) + rendered += "%d.g%s" % (pieces["distance"], pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + else: + # exception #1 + rendered = "0" + if pieces["branch"] != "master": + rendered += ".dev0" + rendered += "+untagged.%d.g%s" % (pieces["distance"], pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + return rendered + + +def pep440_split_post(ver): + """Split pep440 version string at the post-release segment. + + Returns the release segments before the post-release and the + post-release version number (or -1 if no post-release segment is present). + """ + vc = str.split(ver, ".post") + return vc[0], int(vc[1] or 0) if len(vc) == 2 else None + + +def render_pep440_pre(pieces): + """TAG[.postN.devDISTANCE] -- No -dirty. + + Exceptions: + 1: no tags. 0.post0.devDISTANCE + """ + if pieces["closest-tag"]: + if pieces["distance"]: + # update the post release segment + tag_version, post_version = pep440_split_post( + pieces["closest-tag"] + ) + rendered = tag_version + if post_version is not None: + rendered += ".post%d.dev%d" % ( + post_version + 1, + pieces["distance"], + ) + else: + rendered += ".post0.dev%d" % (pieces["distance"]) + else: + # no commits, use the tag as the version + rendered = pieces["closest-tag"] + else: + # exception #1 + rendered = "0.post0.dev%d" % pieces["distance"] + return rendered + + +def render_pep440_post(pieces): + """TAG[.postDISTANCE[.dev0]+gHEX] . + + The ".dev0" means dirty. Note that .dev0 sorts backwards + (a dirty tree will appear "older" than the corresponding clean one), + but you shouldn't be releasing software with -dirty anyways. + + Exceptions: + 1: no tags. 0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += plus_or_dot(pieces) + rendered += "g%s" % pieces["short"] + else: + # exception #1 + rendered = "0.post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += "+g%s" % pieces["short"] + return rendered + + +def render_pep440_post_branch(pieces): + """TAG[.postDISTANCE[.dev0]+gHEX[.dirty]] . + + The ".dev0" means not master branch. + + Exceptions: + 1: no tags. 0.postDISTANCE[.dev0]+gHEX[.dirty] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%d" % pieces["distance"] + if pieces["branch"] != "master": + rendered += ".dev0" + rendered += plus_or_dot(pieces) + rendered += "g%s" % pieces["short"] + if pieces["dirty"]: + rendered += ".dirty" + else: + # exception #1 + rendered = "0.post%d" % pieces["distance"] + if pieces["branch"] != "master": + rendered += ".dev0" + rendered += "+g%s" % pieces["short"] + if pieces["dirty"]: + rendered += ".dirty" + return rendered + + +def render_pep440_old(pieces): + """TAG[.postDISTANCE[.dev0]] . + + The ".dev0" means dirty. + + Exceptions: + 1: no tags. 
0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + else: + # exception #1 + rendered = "0.post%d" % pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + return rendered + + +def render_git_describe(pieces): + """TAG[-DISTANCE-gHEX][-dirty]. + + Like 'git describe --tags --dirty --always'. + + Exceptions: + 1: no tags. HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"]: + rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render_git_describe_long(pieces): + """TAG-DISTANCE-gHEX[-dirty]. + + Like 'git describe --tags --dirty --always -long'. + The distance/hash is unconditional. + + Exceptions: + 1: no tags. HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render(pieces, style): + """Render the given version pieces into the requested style.""" + if pieces["error"]: + return { + "version": "unknown", + "full-revisionid": pieces.get("long"), + "dirty": None, + "error": pieces["error"], + "date": None, + } + + if not style or style == "default": + style = "pep440" # the default + + if style == "pep440": + rendered = render_pep440(pieces) + elif style == "pep440-branch": + rendered = render_pep440_branch(pieces) + elif style == "pep440-pre": + rendered = render_pep440_pre(pieces) + elif style == "pep440-post": + rendered = render_pep440_post(pieces) + elif style == "pep440-post-branch": + rendered = render_pep440_post_branch(pieces) + elif style == "pep440-old": + rendered = render_pep440_old(pieces) + elif style == "git-describe": + rendered = render_git_describe(pieces) + elif style == "git-describe-long": + rendered = render_git_describe_long(pieces) + else: + raise ValueError("unknown style '%s'" % style) + + return { + "version": rendered, + "full-revisionid": pieces["long"], + "dirty": pieces["dirty"], + "error": None, + "date": pieces.get("date"), + } + + +def get_versions(): + """Get version information or return default if unable to do so.""" + # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have + # __file__, we can work backwards from there to the root. Some + # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which + # case we can only use expanded keywords. + + cfg = get_config() + verbose = cfg.verbose + + try: + return git_versions_from_keywords( + get_keywords(), cfg.tag_prefix, verbose + ) + except NotThisMethod: + pass + + try: + root = os.path.realpath(__file__) + # versionfile_source is the relative path from the top of the source + # tree (where the .git directory might live) to this file. Invert + # this to find the root from __file__. 
+        for _ in cfg.versionfile_source.split("/"):
+            root = os.path.dirname(root)
+    except NameError:
+        return {
+            "version": "0+unknown",
+            "full-revisionid": None,
+            "dirty": None,
+            "error": "unable to find root of source tree",
+            "date": None,
+        }
+
+    try:
+        pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose)
+        return render(pieces, cfg.style)
+    except NotThisMethod:
+        pass
+
+    try:
+        if cfg.parentdir_prefix:
+            return versions_from_parentdir(cfg.parentdir_prefix, root, verbose)
+    except NotThisMethod:
+        pass
+
+    return {
+        "version": "0+unknown",
+        "full-revisionid": None,
+        "dirty": None,
+        "error": "unable to compute version",
+        "date": None,
+    }
diff --git a/python/strings_udf/strings_udf/lowering.py b/python/strings_udf/strings_udf/lowering.py
new file mode 100644
index 00000000000..fd965a7a187
--- /dev/null
+++ b/python/strings_udf/strings_udf/lowering.py
@@ -0,0 +1,287 @@
+# Copyright (c) 2022, NVIDIA CORPORATION.
+
+import operator
+from functools import partial
+
+from numba import cuda, types
+from numba.core import cgutils
+from numba.core.typing import signature as nb_signature
+from numba.cuda.cudadrv import nvvm
+from numba.cuda.cudaimpl import (
+    lower as cuda_lower,
+    registry as cuda_lowering_registry,
+)
+
+from strings_udf._lib.tables import get_character_flags_table_ptr
+from strings_udf._typing import size_type, string_view
+
+character_flags_table_ptr = get_character_flags_table_ptr()
+
+
+# read-only functions
+# We will provide only one overload for this set of functions, which will
+# expect a string_view. When a literal is encountered, numba will promote it to
+# a string_view whereas when a dstring is encountered, numba will convert it to
+# a view via its native view() method.
+
+_STR_VIEW_PTR = types.CPointer(string_view)
+
+
+# CUDA function declarations
+_string_view_len = cuda.declare_device("len", size_type(_STR_VIEW_PTR))
+
+
+def _declare_binary_func(lhs, rhs, out, name):
+    # Declare a binary function
+    return cuda.declare_device(
+        name,
+        out(lhs, rhs),
+    )
+
+
+# A binary function of the form f(string, string) -> bool
+_declare_bool_str_str_func = partial(
+    _declare_binary_func, _STR_VIEW_PTR, _STR_VIEW_PTR, types.boolean
+)
+
+_declare_size_type_str_str_func = partial(
+    _declare_binary_func, _STR_VIEW_PTR, _STR_VIEW_PTR, size_type
+)
+
+_string_view_contains = _declare_bool_str_str_func("contains")
+_string_view_eq = _declare_bool_str_str_func("eq")
+_string_view_ne = _declare_bool_str_str_func("ne")
+_string_view_ge = _declare_bool_str_str_func("ge")
+_string_view_le = _declare_bool_str_str_func("le")
+_string_view_gt = _declare_bool_str_str_func("gt")
+_string_view_lt = _declare_bool_str_str_func("lt")
+_string_view_startswith = _declare_bool_str_str_func("startswith")
+_string_view_endswith = _declare_bool_str_str_func("endswith")
+_string_view_find = _declare_size_type_str_str_func("find")
+_string_view_rfind = _declare_size_type_str_str_func("rfind")
+
+
+# A binary function of the form f(string, int) -> bool
+_declare_bool_str_int_func = partial(
+    _declare_binary_func, _STR_VIEW_PTR, types.int64, types.boolean
+)
+
+
+_string_view_isdigit = _declare_bool_str_int_func("pyisdigit")
+_string_view_isalnum = _declare_bool_str_int_func("pyisalnum")
+_string_view_isalpha = _declare_bool_str_int_func("pyisalpha")
+_string_view_isdecimal = _declare_bool_str_int_func("pyisdecimal")
+_string_view_isnumeric = _declare_bool_str_int_func("pyisnumeric")
+_string_view_isspace =
_declare_bool_str_int_func("pyisspace") +_string_view_isupper = _declare_bool_str_int_func("pyisupper") +_string_view_islower = _declare_bool_str_int_func("pyislower") + + +_string_view_count = cuda.declare_device( + "pycount", + size_type(_STR_VIEW_PTR, _STR_VIEW_PTR), +) + + +# casts +@cuda_lowering_registry.lower_cast(types.StringLiteral, string_view) +def cast_string_literal_to_string_view(context, builder, fromty, toty, val): + """ + Cast a literal to a string_view + """ + # create an empty string_view + sv = cgutils.create_struct_proxy(string_view)(context, builder) + + # set the empty strview data pointer to point to the literal value + s = context.insert_const_string(builder.module, fromty.literal_value) + sv.data = context.insert_addrspace_conv( + builder, s, nvvm.ADDRSPACE_CONSTANT + ) + sv.length = context.get_constant(size_type, len(fromty.literal_value)) + sv.bytes = context.get_constant( + size_type, len(fromty.literal_value.encode("UTF-8")) + ) + + return sv._getvalue() + + +# String function implementations +def call_len_string_view(st): + return _string_view_len(st) + + +@cuda_lower(len, string_view) +def len_impl(context, builder, sig, args): + sv_ptr = builder.alloca(args[0].type) + builder.store(args[0], sv_ptr) + result = context.compile_internal( + builder, + call_len_string_view, + nb_signature(size_type, _STR_VIEW_PTR), + (sv_ptr,), + ) + + return result + + +def create_binary_string_func(binary_func, retty): + """ + Provide a wrapper around numba's low-level extension API which + produces the boilerplate needed to implement a binary function + of two strings. + """ + + def deco(cuda_func): + @cuda_lower(binary_func, string_view, string_view) + def binary_func_impl(context, builder, sig, args): + lhs_ptr = builder.alloca(args[0].type) + rhs_ptr = builder.alloca(args[1].type) + + builder.store(args[0], lhs_ptr) + builder.store(args[1], rhs_ptr) + result = context.compile_internal( + builder, + cuda_func, + nb_signature(retty, _STR_VIEW_PTR, _STR_VIEW_PTR), + (lhs_ptr, rhs_ptr), + ) + + return result + + return binary_func_impl + + return deco + + +@create_binary_string_func(operator.contains, types.boolean) +def contains_impl(st, substr): + return _string_view_contains(st, substr) + + +@create_binary_string_func(operator.eq, types.boolean) +def eq_impl(st, rhs): + return _string_view_eq(st, rhs) + + +@create_binary_string_func(operator.ne, types.boolean) +def ne_impl(st, rhs): + return _string_view_ne(st, rhs) + + +@create_binary_string_func(operator.ge, types.boolean) +def ge_impl(st, rhs): + return _string_view_ge(st, rhs) + + +@create_binary_string_func(operator.le, types.boolean) +def le_impl(st, rhs): + return _string_view_le(st, rhs) + + +@create_binary_string_func(operator.gt, types.boolean) +def gt_impl(st, rhs): + return _string_view_gt(st, rhs) + + +@create_binary_string_func(operator.lt, types.boolean) +def lt_impl(st, rhs): + return _string_view_lt(st, rhs) + + +@create_binary_string_func("StringView.startswith", types.boolean) +def startswith_impl(sv, substr): + return _string_view_startswith(sv, substr) + + +@create_binary_string_func("StringView.endswith", types.boolean) +def endswith_impl(sv, substr): + return _string_view_endswith(sv, substr) + + +@create_binary_string_func("StringView.count", size_type) +def count_impl(st, substr): + return _string_view_count(st, substr) + + +@create_binary_string_func("StringView.find", size_type) +def find_impl(sv, substr): + return _string_view_find(sv, substr) + + +@create_binary_string_func("StringView.rfind", 
size_type) +def rfind_impl(sv, substr): + return _string_view_rfind(sv, substr) + + +def create_unary_identifier_func(id_func): + """ + Provide a wrapper around numba's low-level extension API which + produces the boilerplate needed to implement a unary function + of a string. + """ + + def deco(cuda_func): + @cuda_lower(id_func, string_view) + def id_func_impl(context, builder, sig, args): + str_ptr = builder.alloca(args[0].type) + builder.store(args[0], str_ptr) + + # Lookup table required for conversion functions + # must be resolved at runtime after context initialization, + # therefore cannot be a global variable + tbl_ptr = context.get_constant( + types.int64, character_flags_table_ptr + ) + result = context.compile_internal( + builder, + cuda_func, + nb_signature(types.boolean, _STR_VIEW_PTR, types.int64), + (str_ptr, tbl_ptr), + ) + + return result + + return id_func_impl + + return deco + + +@create_unary_identifier_func("StringView.isdigit") +def isdigit_impl(st, tbl): + return _string_view_isdigit(st, tbl) + + +@create_unary_identifier_func("StringView.isalnum") +def isalnum_impl(st, tbl): + return _string_view_isalnum(st, tbl) + + +@create_unary_identifier_func("StringView.isalpha") +def isalpha_impl(st, tbl): + return _string_view_isalpha(st, tbl) + + +@create_unary_identifier_func("StringView.isnumeric") +def isnumeric_impl(st, tbl): + return _string_view_isnumeric(st, tbl) + + +@create_unary_identifier_func("StringView.isdecimal") +def isdecimal_impl(st, tbl): + return _string_view_isdecimal(st, tbl) + + +@create_unary_identifier_func("StringView.isspace") +def isspace_impl(st, tbl): + return _string_view_isspace(st, tbl) + + +@create_unary_identifier_func("StringView.isupper") +def isupper_impl(st, tbl): + return _string_view_isupper(st, tbl) + + +@create_unary_identifier_func("StringView.islower") +def islower_impl(st, tbl): + return _string_view_islower(st, tbl) diff --git a/python/strings_udf/strings_udf/tests/test_string_udfs.py b/python/strings_udf/strings_udf/tests/test_string_udfs.py new file mode 100644 index 00000000000..9038f4cc79a --- /dev/null +++ b/python/strings_udf/strings_udf/tests/test_string_udfs.py @@ -0,0 +1,249 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. 
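The tests below all share one harness: a scalar Python function is compiled as a CUDA device function and applied elementwise by a one-dimensional kernel. As a minimal sketch of that pattern, assuming only numba and numpy and using numeric data in place of string_view (the real harness in this file additionally links `strings_udf.ptxpath` and passes the `str_view_arg_handler` extension so kernels can accept arrays of `cudf::string_view`; the names `make_kernel` and `double` here are illustrative only):

    # Toy sketch of the elementwise-kernel harness used by the tests below.
    # Numeric data stands in for string_view, so no PTX linking is needed.
    import numpy as np
    from numba import cuda

    def make_kernel(func):
        scalar = cuda.jit(device=True)(func)  # compile the scalar UDF

        @cuda.jit
        def kernel(inp, out):
            i = cuda.grid(1)  # one thread per element
            if i < out.size:
                out[i] = scalar(inp[i])

        return kernel

    def double(x):  # illustrative scalar function
        return x * 2

    inp = np.arange(10, dtype=np.int64)
    out = np.zeros_like(inp)
    make_kernel(double).forall(len(inp))(inp, out)
    assert (out == inp * 2).all()

`forall` chooses a launch configuration for `len(inp)` threads, which is why each test below only needs to supply the element count.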
+ +import numba +import numpy as np +import pandas as pd +import pytest +from numba import cuda +from numba.core.typing import signature as nb_signature +from numba.types import CPointer, void + +import cudf +from cudf.testing._utils import assert_eq + +import strings_udf +from strings_udf._lib.cudf_jit_udf import to_string_view_array +from strings_udf._typing import str_view_arg_handler, string_view + +if not strings_udf.ENABLED: + pytest.skip("Strings UDF not enabled.", allow_module_level=True) + + +def get_kernel(func, dtype): + """ + Create a kernel for testing a single scalar string function + Allocates an output vector with a dtype specified by the caller + The returned kernel executes the input function on each data + element of the input and returns the output into the output vector + """ + + func = cuda.jit(device=True)(func) + outty = numba.np.numpy_support.from_dtype(dtype) + sig = nb_signature(void, CPointer(string_view), outty[::1]) + + @cuda.jit( + sig, link=[strings_udf.ptxpath], extensions=[str_view_arg_handler] + ) + def kernel(input_strings, output_col): + id = cuda.grid(1) + if id < len(output_col): + st = input_strings[id] + result = func(st) + output_col[id] = result + + return kernel + + +def run_udf_test(data, func, dtype): + """ + Run a test kernel on a set of input data + Converts the input data to a cuDF column and subsequently + to an array of cudf::string_view objects. It then creates + a CUDA kernel using get_kernel which calls the input function, + and then assembles the result back into a cuDF series before + comparing it with the equivalent pandas result + """ + dtype = np.dtype(dtype) + cudf_column = cudf.core.column.as_column(data) + str_view_ary = to_string_view_array(cudf_column) + + output_ary = cudf.core.column.column_empty(len(data), dtype=dtype) + + kernel = get_kernel(func, dtype) + kernel.forall(len(data))(str_view_ary, output_ary) + got = cudf.Series(output_ary, dtype=dtype) + expect = pd.Series(data).apply(func) + assert_eq(expect, got, check_dtype=False) + + +@pytest.fixture(scope="module") +def data(): + return [ + "abc", + "ABC", + "AbC", + "123", + "123aBc", + "123@.!", + "", + "rapids ai", + "gpu", + "True", + "False", + "1.234", + ".123a", + "0.013", + "1.0", + "01", + "20010101", + "cudf", + "cuda", + "gpu", + ] + + +@pytest.fixture(params=["cudf", "cuda", "gpucudf", "abc"]) +def rhs(request): + return request.param + + +@pytest.fixture(params=["c", "cu", "2", "abc", "", "gpu"]) +def substr(request): + return request.param + + +def test_string_udf_eq(data, rhs): + def func(st): + return st == rhs + + run_udf_test(data, func, "bool") + + +def test_string_udf_ne(data, rhs): + def func(st): + return st != rhs + + run_udf_test(data, func, "bool") + + +def test_string_udf_ge(data, rhs): + def func(st): + return st >= rhs + + run_udf_test(data, func, "bool") + + +def test_string_udf_le(data, rhs): + def func(st): + return st <= rhs + + run_udf_test(data, func, "bool") + + +def test_string_udf_gt(data, rhs): + def func(st): + return st > rhs + + run_udf_test(data, func, "bool") + + +def test_string_udf_lt(data, rhs): + def func(st): + return st < rhs + + run_udf_test(data, func, "bool") + + +def test_string_udf_contains(data, substr): + def func(st): + return substr in st + + run_udf_test(data, func, "bool") + + +def test_string_udf_count(data, substr): + def func(st): + return st.count(substr) + + run_udf_test(data, func, "int32") + + +def test_string_udf_find(data, substr): + def func(st): + return st.find(substr) + + run_udf_test(data, func, 
"int32") + + +def test_string_udf_endswith(data, substr): + def func(st): + return st.endswith(substr) + + run_udf_test(data, func, "bool") + + +def test_string_udf_isalnum(data): + def func(st): + return st.isalnum() + + run_udf_test(data, func, "bool") + + +def test_string_udf_isalpha(data): + def func(st): + return st.isalpha() + + run_udf_test(data, func, "bool") + + +def test_string_udf_isdecimal(data): + def func(st): + return st.isdecimal() + + run_udf_test(data, func, "bool") + + +def test_string_udf_isdigit(data): + def func(st): + return st.isdigit() + + run_udf_test(data, func, "bool") + + +def test_string_udf_islower(data): + def func(st): + return st.islower() + + run_udf_test(data, func, "bool") + + +def test_string_udf_isnumeric(data): + def func(st): + return st.isnumeric() + + run_udf_test(data, func, "bool") + + +def test_string_udf_isspace(data): + def func(st): + return st.isspace() + + run_udf_test(data, func, "bool") + + +def test_string_udf_isupper(data): + def func(st): + return st.isupper() + + run_udf_test(data, func, "bool") + + +def test_string_udf_len(data): + def func(st): + return len(st) + + run_udf_test(data, func, "int64") + + +def test_string_udf_rfind(data, substr): + def func(st): + return st.rfind(substr) + + run_udf_test(data, func, "int32") + + +def test_string_udf_startswith(data, substr): + def func(st): + return st.startswith(substr) + + run_udf_test(data, func, "bool") diff --git a/python/strings_udf/versioneer.py b/python/strings_udf/versioneer.py new file mode 100644 index 00000000000..6194b6a5698 --- /dev/null +++ b/python/strings_udf/versioneer.py @@ -0,0 +1,2245 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. + +# Version: 0.23 + +"""The Versioneer - like a rocketeer, but for versions. + +The Versioneer +============== + +* like a rocketeer, but for versions! +* https://github.com/python-versioneer/python-versioneer +* Brian Warner +* License: Public Domain (CC0-1.0) +* Compatible with: Python 3.7, 3.8, 3.9, 3.10 and pypy3 +* [![Latest Version][pypi-image]][pypi-url] +* [![Build Status][travis-image]][travis-url] + +This is a tool for managing a recorded version number in +distutils/setuptools-based python projects. The goal is to +remove the tedious and error-prone "update the embedded version string" +step from your release process. Making a new release should be as easy +as recording a new tag in your version-control +system, and maybe making new tarballs. + + +## Quick Install + +* `pip install versioneer` to somewhere in your $PATH +* add a `[versioneer]` section to your setup.cfg (see [Install](INSTALL.md)) +* run `versioneer install` in your source tree, commit the results +* Verify version information with `python setup.py version` + +## Version Identifiers + +Source trees come from a variety of places: + +* a version-control system checkout (mostly used by developers) +* a nightly tarball, produced by build automation +* a snapshot tarball, produced by a web-based VCS browser, like github's + "tarball from tag" feature +* a release tarball, produced by "setup.py sdist", distributed through PyPI + +Within each source tree, the version identifier (either a string or a number, +this tool is format-agnostic) can come from a variety of places: + +* ask the VCS tool itself, e.g. 
"git describe" (for checkouts), which knows + about recent "tags" and an absolute revision-id +* the name of the directory into which the tarball was unpacked +* an expanded VCS keyword ($Id$, etc) +* a `_version.py` created by some earlier build step + +For released software, the version identifier is closely related to a VCS +tag. Some projects use tag names that include more than just the version +string (e.g. "myproject-1.2" instead of just "1.2"), in which case the tool +needs to strip the tag prefix to extract the version identifier. For +unreleased software (between tags), the version identifier should provide +enough information to help developers recreate the same tree, while also +giving them an idea of roughly how old the tree is (after version 1.2, before +version 1.3). Many VCS systems can report a description that captures this, +for example `git describe --tags --dirty --always` reports things like +"0.7-1-g574ab98-dirty" to indicate that the checkout is one revision past the +0.7 tag, has a unique revision id of "574ab98", and is "dirty" (it has +uncommitted changes). + +The version identifier is used for multiple purposes: + +* to allow the module to self-identify its version: `myproject.__version__` +* to choose a name and prefix for a 'setup.py sdist' tarball + +## Theory of Operation + +Versioneer works by adding a special `_version.py` file into your source +tree, where your `__init__.py` can import it. This `_version.py` knows how to +dynamically ask the VCS tool for version information at import time. + +`_version.py` also contains `$Revision$` markers, and the installation +process marks `_version.py` to have this marker rewritten with a tag name +during the `git archive` command. As a result, generated tarballs will +contain enough information to get the proper version. + +To allow `setup.py` to compute a version too, a `versioneer.py` is added to +the top level of your source tree, next to `setup.py` and the `setup.cfg` +that configures it. This overrides several distutils/setuptools commands to +compute the version when invoked, and changes `setup.py build` and `setup.py +sdist` to replace `_version.py` with a small static file that contains just +the generated version data. + +## Installation + +See [INSTALL.md](./INSTALL.md) for detailed installation instructions. + +## Version-String Flavors + +Code which uses Versioneer can learn about its version string at runtime by +importing `_version` from your main `__init__.py` file and running the +`get_versions()` function. From the "outside" (e.g. in `setup.py`), you can +import the top-level `versioneer.py` and run `get_versions()`. + +Both functions return a dictionary with different flavors of version +information: + +* `['version']`: A condensed version string, rendered using the selected + style. This is the most commonly used value for the project's version + string. The default "pep440" style yields strings like `0.11`, + `0.11+2.g1076c97`, or `0.11+2.g1076c97.dirty`. See the "Styles" section + below for alternative styles. + +* `['full-revisionid']`: detailed revision identifier. For Git, this is the + full SHA1 commit id, e.g. "1076c978a8d3cfc70f408fe5974aa6c092c949ac". + +* `['date']`: Date and time of the latest `HEAD` commit. For Git, it is the + commit date in ISO 8601 format. This will be None if the date is not + available. + +* `['dirty']`: a boolean, True if the tree has uncommitted changes. 
Note that
+  this is only accurate if run in a VCS checkout, otherwise it is likely to
+  be False or None
+
+* `['error']`: if the version string could not be computed, this will be set
+  to a string describing the problem, otherwise it will be None. It may be
+  useful to throw an exception in setup.py if this is set, to avoid e.g.
+  creating tarballs with a version string of "unknown".
+
+Some variants are more useful than others. Including `full-revisionid` in a
+bug report should allow developers to reconstruct the exact code being tested
+(or indicate the presence of local changes that should be shared with the
+developers). `version` is suitable for display in an "about" box or a CLI
+`--version` output: it can be easily compared against release notes and lists
+of bugs fixed in various releases.
+
+The installer adds the following text to your `__init__.py` to place a basic
+version in `YOURPROJECT.__version__`:
+
+    from ._version import get_versions
+    __version__ = get_versions()['version']
+    del get_versions
+
+## Styles
+
+The setup.cfg `style=` configuration controls how the VCS information is
+rendered into a version string.
+
+The default style, "pep440", produces a PEP440-compliant string, equal to the
+un-prefixed tag name for actual releases, and containing an additional "local
+version" section with more detail for in-between builds. For Git, this is
+TAG[+DISTANCE.gHEX[.dirty]] , using information from `git describe --tags
+--dirty --always`. For example "0.11+2.g1076c97.dirty" indicates that the
+tree is like the "1076c97" commit but has uncommitted changes (".dirty"), and
+that this commit is two revisions ("+2") beyond the "0.11" tag. For released
+software (exactly equal to a known tag), the identifier will only contain the
+stripped tag, e.g. "0.11".
+
+Other styles are available. See [details.md](details.md) in the Versioneer
+source tree for descriptions.
+
+## Debugging
+
+Versioneer tries to avoid fatal errors: if something goes wrong, it will tend
+to return a version of "0+unknown". To investigate the problem, run `setup.py
+version`, which will run the version-lookup code in a verbose mode, and will
+display the full contents of `get_versions()` (including the `error` string,
+which may help identify what went wrong).
+
+## Known Limitations
+
+Some situations are known to cause problems for Versioneer. This section
+details the most significant ones. More can be found on the GitHub
+[issues page](https://github.com/python-versioneer/python-versioneer/issues).
+
+### Subprojects
+
+Versioneer has limited support for source trees in which `setup.py` is not in
+the root directory (e.g. `setup.py` and `.git/` are *not* siblings). There
+are two common reasons why `setup.py` might not be in the root:
+
+* Source trees which contain multiple subprojects, such as
+  [Buildbot](https://github.com/buildbot/buildbot), which contains both
+  "master" and "slave" subprojects, each with their own `setup.py`,
+  `setup.cfg`, and `tox.ini`. Projects like these produce multiple PyPI
+  distributions (and upload multiple independently-installable tarballs).
+* Source trees whose main purpose is to contain a C library, but which also
+  provide bindings to Python (and perhaps other languages) in subdirectories.
+
+Versioneer will look for `.git` in parent directories, and most operations
+should get the right version string.
However `pip` and `setuptools` have bugs +and implementation details which frequently cause `pip install .` from a +subproject directory to fail to find a correct version string (so it usually +defaults to `0+unknown`). + +`pip install --editable .` should work correctly. `setup.py install` might +work too. + +Pip-8.1.1 is known to have this problem, but hopefully it will get fixed in +some later version. + +[Bug #38](https://github.com/python-versioneer/python-versioneer/issues/38) +is tracking this issue. The discussion in +[PR #61](https://github.com/python-versioneer/python-versioneer/pull/61) +describes the issue from the Versioneer side in more detail. +[pip PR#3176](https://github.com/pypa/pip/pull/3176) and +[pip PR#3615](https://github.com/pypa/pip/pull/3615) contain work to improve +pip to let Versioneer work correctly. + +Versioneer-0.16 and earlier only looked for a `.git` directory next to the +`setup.cfg`, so subprojects were completely unsupported with those releases. + +### Editable installs with setuptools <= 18.5 + +`setup.py develop` and `pip install --editable .` allow you to install a +project into a virtualenv once, then continue editing the source code (and +test) without re-installing after every change. + +"Entry-point scripts" (`setup(entry_points={"console_scripts": ..})`) are a +convenient way to specify executable scripts that should be installed along +with the python package. + +These both work as expected when using modern setuptools. When using +setuptools-18.5 or earlier, however, certain operations will cause +`pkg_resources.DistributionNotFound` errors when running the entrypoint +script, which must be resolved by re-installing the package. This happens +when the install happens with one version, then the egg_info data is +regenerated while a different version is checked out. Many setup.py commands +cause egg_info to be rebuilt (including `sdist`, `wheel`, and installing into +a different virtualenv), so this can be surprising. + +[Bug #83](https://github.com/python-versioneer/python-versioneer/issues/83) +describes this one, but upgrading to a newer version of setuptools should +probably resolve it. + + +## Updating Versioneer + +To upgrade your project to a new release of Versioneer, do the following: + +* install the new Versioneer (`pip install -U versioneer` or equivalent) +* edit `setup.cfg`, if necessary, to include any new configuration settings + indicated by the release notes. See [UPGRADING](./UPGRADING.md) for details. +* re-run `versioneer install` in your source tree, to replace + `SRC/_version.py` +* commit any changed files + +## Future Directions + +This tool is designed to make it easily extended to other version-control +systems: all VCS-specific components are in separate directories like +src/git/ . The top-level `versioneer.py` script is assembled from these +components by running make-versioneer.py . In the future, make-versioneer.py +will take a VCS name as an argument, and will construct a version of +`versioneer.py` that is specific to the given VCS. It might also take the +configuration arguments that are currently provided manually during +installation by editing setup.py . Alternatively, it might go the other +direction and include code from all supported VCS systems, reducing the +number of intermediate scripts. 
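Pulling the pieces above together: a project that vendors this file typically wires it into `setup.py` as shown below (a minimal sketch; the project name is illustrative, while `get_version()` and `get_cmdclass()` are the two entry points this file provides):

    # Minimal setup.py for a project vendoring versioneer.py at its root;
    # the project name is illustrative.
    from setuptools import setup

    import versioneer  # the vendored file next to setup.py

    setup(
        name="example",
        version=versioneer.get_version(),    # computed from VCS metadata
        cmdclass=versioneer.get_cmdclass(),  # overridden build/sdist commands
    )

`get_cmdclass()` is what swaps in the command variants described above, so the static `_version.py` lands in built distributions.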
+
+## Similar projects
+
+* [setuptools_scm](https://github.com/pypa/setuptools_scm/) - a non-vendored
+  build-time dependency
+* [miniver](https://github.com/jbweston/miniver) - a lightweight
+  reimplementation of versioneer
+* [versioningit](https://github.com/jwodder/versioningit) - a PEP 518-based
+  setuptools plugin
+
+## License
+
+To make Versioneer easier to embed, all its code is dedicated to the public
+domain. The `_version.py` that it creates is also in the public domain.
+Specifically, both are released under the Creative Commons "Public Domain
+Dedication" license (CC0-1.0), as described in
+https://creativecommons.org/publicdomain/zero/1.0/ .
+
+[pypi-image]: https://img.shields.io/pypi/v/versioneer.svg
+[pypi-url]: https://pypi.python.org/pypi/versioneer/
+[travis-image]:
+https://img.shields.io/travis/com/python-versioneer/python-versioneer.svg
+[travis-url]: https://travis-ci.com/github/python-versioneer/python-versioneer
+
+"""
+# pylint:disable=invalid-name,import-outside-toplevel,missing-function-docstring
+# pylint:disable=missing-class-docstring,too-many-branches,too-many-statements
+# pylint:disable=raise-missing-from,too-many-lines,too-many-locals,import-error
+# pylint:disable=too-few-public-methods,redefined-outer-name,consider-using-with
+# pylint:disable=attribute-defined-outside-init,too-many-arguments
+
+import configparser
+import errno
+import functools
+import json
+import os
+import re
+import subprocess
+import sys
+from typing import Callable, Dict
+
+
+class VersioneerConfig:
+    """Container for Versioneer configuration parameters."""
+
+
+def get_root():
+    """Get the project root directory.
+
+    We require that all commands are run from the project root, i.e. the
+    directory that contains setup.py, setup.cfg, and versioneer.py .
+    """
+    root = os.path.realpath(os.path.abspath(os.getcwd()))
+    setup_py = os.path.join(root, "setup.py")
+    versioneer_py = os.path.join(root, "versioneer.py")
+    if not (os.path.exists(setup_py) or os.path.exists(versioneer_py)):
+        # allow 'python path/to/setup.py COMMAND'
+        root = os.path.dirname(os.path.realpath(os.path.abspath(sys.argv[0])))
+        setup_py = os.path.join(root, "setup.py")
+        versioneer_py = os.path.join(root, "versioneer.py")
+    if not (os.path.exists(setup_py) or os.path.exists(versioneer_py)):
+        err = (
+            "Versioneer was unable to find the project root directory. "
+            "Versioneer requires setup.py to be executed from "
+            "its immediate directory (like 'python setup.py COMMAND'), "
+            "or in a way that lets it use sys.argv[0] to find the root "
+            "(like 'python path/to/setup.py COMMAND')."
+        )
+        raise VersioneerBadRootError(err)
+    try:
+        # Certain runtime workflows (setup.py install/develop in a setuptools
+        # tree) execute all dependencies in a single python process, so
+        # "versioneer" may be imported multiple times, and python's shared
+        # module-import table will cache the first one. So we can't use
+        # os.path.dirname(__file__), as that will find whichever
+        # versioneer.py was first imported, even in later projects.
+ my_path = os.path.realpath(os.path.abspath(__file__)) + me_dir = os.path.normcase(os.path.splitext(my_path)[0]) + vsr_dir = os.path.normcase(os.path.splitext(versioneer_py)[0]) + if me_dir != vsr_dir: + print( + "Warning: build in %s is using versioneer.py from %s" + % (os.path.dirname(my_path), versioneer_py) + ) + except NameError: + pass + return root + + +def get_config_from_root(root): + """Read the project setup.cfg file to determine Versioneer config.""" + # This might raise OSError (if setup.cfg is missing), or + # configparser.NoSectionError (if it lacks a [versioneer] section), or + # configparser.NoOptionError (if it lacks "VCS="). See the docstring at + # the top of versioneer.py for instructions on writing your setup.cfg . + setup_cfg = os.path.join(root, "setup.cfg") + parser = configparser.ConfigParser() + with open(setup_cfg, "r") as cfg_file: + parser.read_file(cfg_file) + VCS = parser.get("versioneer", "VCS") # mandatory + + # Dict-like interface for non-mandatory entries + section = parser["versioneer"] + + cfg = VersioneerConfig() + cfg.VCS = VCS + cfg.style = section.get("style", "") + cfg.versionfile_source = section.get("versionfile_source") + cfg.versionfile_build = section.get("versionfile_build") + cfg.tag_prefix = section.get("tag_prefix") + if cfg.tag_prefix in ("''", '""', None): + cfg.tag_prefix = "" + cfg.parentdir_prefix = section.get("parentdir_prefix") + cfg.verbose = section.get("verbose") + return cfg + + +class NotThisMethod(Exception): + """Exception raised if a method is not valid for the current scenario.""" + + +# these dictionaries contain VCS-specific tools +LONG_VERSION_PY: Dict[str, str] = {} +HANDLERS: Dict[str, Dict[str, Callable]] = {} + + +def register_vcs_handler(vcs, method): # decorator + """Create decorator to mark a method as the handler of a VCS.""" + + def decorate(f): + """Store f in HANDLERS[vcs][method].""" + HANDLERS.setdefault(vcs, {})[method] = f + return f + + return decorate + + +def run_command( + commands, args, cwd=None, verbose=False, hide_stderr=False, env=None +): + """Call the given command(s).""" + assert isinstance(commands, list) + process = None + + popen_kwargs = {} + if sys.platform == "win32": + # This hides the console window if pythonw.exe is used + startupinfo = subprocess.STARTUPINFO() + startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW + popen_kwargs["startupinfo"] = startupinfo + + for command in commands: + try: + dispcmd = str([command] + args) + # remember shell=False, so use git.cmd on windows, not just git + process = subprocess.Popen( + [command] + args, + cwd=cwd, + env=env, + stdout=subprocess.PIPE, + stderr=(subprocess.PIPE if hide_stderr else None), + **popen_kwargs, + ) + break + except OSError: + e = sys.exc_info()[1] + if e.errno == errno.ENOENT: + continue + if verbose: + print("unable to run %s" % dispcmd) + print(e) + return None, None + else: + if verbose: + print("unable to find command, tried %s" % (commands,)) + return None, None + stdout = process.communicate()[0].strip().decode() + if process.returncode != 0: + if verbose: + print("unable to run %s (error)" % dispcmd) + print("stdout was %s" % stdout) + return None, process.returncode + return stdout, process.returncode + + +LONG_VERSION_PY[ + "git" +] = r''' +# This file helps to compute a version number in source trees obtained from +# git-archive tarball (such as those provided by githubs download-from-tag +# feature). 
Distribution tarballs (built by setup.py sdist) and build +# directories (produced by setup.py build) will contain a much shorter file +# that just contains the computed version number. + +# This file is released into the public domain. Generated by +# versioneer-0.23 (https://github.com/python-versioneer/python-versioneer) + +"""Git implementation of _version.py.""" + +import errno +import os +import re +import subprocess +import sys +from typing import Callable, Dict +import functools + + +def get_keywords(): + """Get the keywords needed to look up the version information.""" + # these strings will be replaced by git during git-archive. + # setup.py/versioneer.py will grep for the variable names, so they must + # each be defined on a line of their own. _version.py will just call + # get_keywords(). + git_refnames = "%(DOLLAR)sFormat:%%d%(DOLLAR)s" + git_full = "%(DOLLAR)sFormat:%%H%(DOLLAR)s" + git_date = "%(DOLLAR)sFormat:%%ci%(DOLLAR)s" + keywords = {"refnames": git_refnames, "full": git_full, "date": git_date} + return keywords + + +class VersioneerConfig: + """Container for Versioneer configuration parameters.""" + + +def get_config(): + """Create, populate and return the VersioneerConfig() object.""" + # these strings are filled in when 'setup.py versioneer' creates + # _version.py + cfg = VersioneerConfig() + cfg.VCS = "git" + cfg.style = "%(STYLE)s" + cfg.tag_prefix = "%(TAG_PREFIX)s" + cfg.parentdir_prefix = "%(PARENTDIR_PREFIX)s" + cfg.versionfile_source = "%(VERSIONFILE_SOURCE)s" + cfg.verbose = False + return cfg + + +class NotThisMethod(Exception): + """Exception raised if a method is not valid for the current scenario.""" + + +LONG_VERSION_PY: Dict[str, str] = {} +HANDLERS: Dict[str, Dict[str, Callable]] = {} + + +def register_vcs_handler(vcs, method): # decorator + """Create decorator to mark a method as the handler of a VCS.""" + def decorate(f): + """Store f in HANDLERS[vcs][method].""" + if vcs not in HANDLERS: + HANDLERS[vcs] = {} + HANDLERS[vcs][method] = f + return f + return decorate + + +def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, + env=None): + """Call the given command(s).""" + assert isinstance(commands, list) + process = None + + popen_kwargs = {} + if sys.platform == "win32": + # This hides the console window if pythonw.exe is used + startupinfo = subprocess.STARTUPINFO() + startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW + popen_kwargs["startupinfo"] = startupinfo + + for command in commands: + try: + dispcmd = str([command] + args) + # remember shell=False, so use git.cmd on windows, not just git + process = subprocess.Popen([command] + args, cwd=cwd, env=env, + stdout=subprocess.PIPE, + stderr=(subprocess.PIPE if hide_stderr + else None), **popen_kwargs) + break + except OSError: + e = sys.exc_info()[1] + if e.errno == errno.ENOENT: + continue + if verbose: + print("unable to run %%s" %% dispcmd) + print(e) + return None, None + else: + if verbose: + print("unable to find command, tried %%s" %% (commands,)) + return None, None + stdout = process.communicate()[0].strip().decode() + if process.returncode != 0: + if verbose: + print("unable to run %%s (error)" %% dispcmd) + print("stdout was %%s" %% stdout) + return None, process.returncode + return stdout, process.returncode + + +def versions_from_parentdir(parentdir_prefix, root, verbose): + """Try to determine the version from the parent directory name. 
+ + Source tarballs conventionally unpack into a directory that includes both + the project name and a version string. We will also support searching up + two directory levels for an appropriately named parent directory + """ + rootdirs = [] + + for _ in range(3): + dirname = os.path.basename(root) + if dirname.startswith(parentdir_prefix): + return {"version": dirname[len(parentdir_prefix):], + "full-revisionid": None, + "dirty": False, "error": None, "date": None} + rootdirs.append(root) + root = os.path.dirname(root) # up a level + + if verbose: + print("Tried directories %%s but none started with prefix %%s" %% + (str(rootdirs), parentdir_prefix)) + raise NotThisMethod("rootdir doesn't start with parentdir_prefix") + + +@register_vcs_handler("git", "get_keywords") +def git_get_keywords(versionfile_abs): + """Extract version information from the given file.""" + # the code embedded in _version.py can just fetch the value of these + # keywords. When used from setup.py, we don't want to import _version.py, + # so we do it with a regexp instead. This function is not used from + # _version.py. + keywords = {} + try: + with open(versionfile_abs, "r") as fobj: + for line in fobj: + if line.strip().startswith("git_refnames ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["refnames"] = mo.group(1) + if line.strip().startswith("git_full ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["full"] = mo.group(1) + if line.strip().startswith("git_date ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["date"] = mo.group(1) + except OSError: + pass + return keywords + + +@register_vcs_handler("git", "keywords") +def git_versions_from_keywords(keywords, tag_prefix, verbose): + """Get version information from git keywords.""" + if "refnames" not in keywords: + raise NotThisMethod("Short version file found") + date = keywords.get("date") + if date is not None: + # Use only the last line. Previous lines may contain GPG signature + # information. + date = date.splitlines()[-1] + + # git-2.2.0 added "%%cI", which expands to an ISO-8601 -compliant + # datestamp. However we prefer "%%ci" (which expands to an "ISO-8601 + # -like" string, which we must then edit to make compliant), because + # it's been around since git-1.5.3, and it's too difficult to + # discover which version we're using, or to work around using an + # older one. + date = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + refnames = keywords["refnames"].strip() + if refnames.startswith("$Format"): + if verbose: + print("keywords are unexpanded, not using") + raise NotThisMethod("unexpanded keywords, not a git-archive tarball") + refs = {r.strip() for r in refnames.strip("()").split(",")} + # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of + # just "foo-1.0". If we see a "tag: " prefix, prefer those. + TAG = "tag: " + tags = {r[len(TAG):] for r in refs if r.startswith(TAG)} + if not tags: + # Either we're using git < 1.8.3, or there really are no tags. We use + # a heuristic: assume all version tags have a digit. The old git %%d + # expansion behaves like git log --decorate=short and strips out the + # refs/heads/ and refs/tags/ prefixes that would let us distinguish + # between branches and tags. By ignoring refnames without digits, we + # filter out many common branch names like "release" and + # "stabilization", as well as "HEAD" and "master". 
+ tags = {r for r in refs if re.search(r'\d', r)} + if verbose: + print("discarding '%%s', no digits" %% ",".join(refs - tags)) + if verbose: + print("likely tags: %%s" %% ",".join(sorted(tags))) + for ref in sorted(tags): + # sorting will prefer e.g. "2.0" over "2.0rc1" + if ref.startswith(tag_prefix): + r = ref[len(tag_prefix):] + # Filter out refs that exactly match prefix or that don't start + # with a number once the prefix is stripped (mostly a concern + # when prefix is '') + if not re.match(r'\d', r): + continue + if verbose: + print("picking %%s" %% r) + return {"version": r, + "full-revisionid": keywords["full"].strip(), + "dirty": False, "error": None, + "date": date} + # no suitable tags, so version is "0+unknown", but full hex is still there + if verbose: + print("no suitable tags, using unknown + full revision id") + return {"version": "0+unknown", + "full-revisionid": keywords["full"].strip(), + "dirty": False, "error": "no suitable tags", "date": None} + + +@register_vcs_handler("git", "pieces_from_vcs") +def git_pieces_from_vcs(tag_prefix, root, verbose, runner=run_command): + """Get version from 'git describe' in the root of the source tree. + + This only gets called if the git-archive 'subst' keywords were *not* + expanded, and _version.py hasn't already been rewritten with a short + version string, meaning we're inside a checked out source tree. + """ + GITS = ["git"] + if sys.platform == "win32": + GITS = ["git.cmd", "git.exe"] + + # GIT_DIR can interfere with correct operation of Versioneer. + # It may be intended to be passed to the Versioneer-versioned project, + # but that should not change where we get our version from. + env = os.environ.copy() + env.pop("GIT_DIR", None) + runner = functools.partial(runner, env=env) + + _, rc = runner(GITS, ["rev-parse", "--git-dir"], cwd=root, + hide_stderr=True) + if rc != 0: + if verbose: + print("Directory %%s not under git control" %% root) + raise NotThisMethod("'git rev-parse --git-dir' returned error") + + # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] + # if there isn't one, this yields HEX[-dirty] (no NUM) + describe_out, rc = runner(GITS, [ + "describe", "--tags", "--dirty", "--always", "--long", + "--match", f"{tag_prefix}[[:digit:]]*" + ], cwd=root) + # --long was added in git-1.5.5 + if describe_out is None: + raise NotThisMethod("'git describe' failed") + describe_out = describe_out.strip() + full_out, rc = runner(GITS, ["rev-parse", "HEAD"], cwd=root) + if full_out is None: + raise NotThisMethod("'git rev-parse' failed") + full_out = full_out.strip() + + pieces = {} + pieces["long"] = full_out + pieces["short"] = full_out[:7] # maybe improved later + pieces["error"] = None + + branch_name, rc = runner(GITS, ["rev-parse", "--abbrev-ref", "HEAD"], + cwd=root) + # --abbrev-ref was added in git-1.6.3 + if rc != 0 or branch_name is None: + raise NotThisMethod("'git rev-parse --abbrev-ref' returned error") + branch_name = branch_name.strip() + + if branch_name == "HEAD": + # If we aren't exactly on a branch, pick a branch which represents + # the current commit. If all else fails, we are on a branchless + # commit. 
+ branches, rc = runner(GITS, ["branch", "--contains"], cwd=root) + # --contains was added in git-1.5.4 + if rc != 0 or branches is None: + raise NotThisMethod("'git branch --contains' returned error") + branches = branches.split("\n") + + # Remove the first line if we're running detached + if "(" in branches[0]: + branches.pop(0) + + # Strip off the leading "* " from the list of branches. + branches = [branch[2:] for branch in branches] + if "master" in branches: + branch_name = "master" + elif not branches: + branch_name = None + else: + # Pick the first branch that is returned. Good or bad. + branch_name = branches[0] + + pieces["branch"] = branch_name + + # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] + # TAG might have hyphens. + git_describe = describe_out + + # look for -dirty suffix + dirty = git_describe.endswith("-dirty") + pieces["dirty"] = dirty + if dirty: + git_describe = git_describe[:git_describe.rindex("-dirty")] + + # now we have TAG-NUM-gHEX or HEX + + if "-" in git_describe: + # TAG-NUM-gHEX + mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) + if not mo: + # unparsable. Maybe git-describe is misbehaving? + pieces["error"] = ("unable to parse git-describe output: '%%s'" + %% describe_out) + return pieces + + # tag + full_tag = mo.group(1) + if not full_tag.startswith(tag_prefix): + if verbose: + fmt = "tag '%%s' doesn't start with prefix '%%s'" + print(fmt %% (full_tag, tag_prefix)) + pieces["error"] = ("tag '%%s' doesn't start with prefix '%%s'" + %% (full_tag, tag_prefix)) + return pieces + pieces["closest-tag"] = full_tag[len(tag_prefix):] + + # distance: number of commits since tag + pieces["distance"] = int(mo.group(2)) + + # commit: short hex revision ID + pieces["short"] = mo.group(3) + + else: + # HEX: no tags + pieces["closest-tag"] = None + out, rc = runner(GITS, ["rev-list", "HEAD", "--left-right"], cwd=root) + pieces["distance"] = len(out.split()) # total number of commits + + # commit date: see ISO-8601 comment in git_versions_from_keywords() + date = runner(GITS, ["show", "-s", "--format=%%ci", "HEAD"], + cwd=root)[0].strip() + # Use only the last line. Previous lines may contain GPG signature + # information. + date = date.splitlines()[-1] + pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + + return pieces + + +def plus_or_dot(pieces): + """Return a + if we don't already have one, else return a .""" + if "+" in pieces.get("closest-tag", ""): + return "." + return "+" + + +def render_pep440(pieces): + """Build up version string, with post-release "local version identifier". + + Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you + get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty + + Exceptions: + 1: no tags. git_describe was just HEX. 0+untagged.DISTANCE.gHEX[.dirty] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += plus_or_dot(pieces) + rendered += "%%d.g%%s" %% (pieces["distance"], pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + else: + # exception #1 + rendered = "0+untagged.%%d.g%%s" %% (pieces["distance"], + pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + return rendered + + +def render_pep440_branch(pieces): + """TAG[[.dev0]+DISTANCE.gHEX[.dirty]] . + + The ".dev0" means not master branch. Note that .dev0 sorts backwards + (a feature branch will appear "older" than the master branch). + + Exceptions: + 1: no tags. 
0[.dev0]+untagged.DISTANCE.gHEX[.dirty] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + if pieces["branch"] != "master": + rendered += ".dev0" + rendered += plus_or_dot(pieces) + rendered += "%%d.g%%s" %% (pieces["distance"], pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + else: + # exception #1 + rendered = "0" + if pieces["branch"] != "master": + rendered += ".dev0" + rendered += "+untagged.%%d.g%%s" %% (pieces["distance"], + pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + return rendered + + +def pep440_split_post(ver): + """Split pep440 version string at the post-release segment. + + Returns the release segments before the post-release and the + post-release version number (or -1 if no post-release segment is present). + """ + vc = str.split(ver, ".post") + return vc[0], int(vc[1] or 0) if len(vc) == 2 else None + + +def render_pep440_pre(pieces): + """TAG[.postN.devDISTANCE] -- No -dirty. + + Exceptions: + 1: no tags. 0.post0.devDISTANCE + """ + if pieces["closest-tag"]: + if pieces["distance"]: + # update the post release segment + tag_version, post_version = pep440_split_post( + pieces["closest-tag"] + ) + rendered = tag_version + if post_version is not None: + rendered += ".post%%d.dev%%d" %% ( + post_version + 1, pieces["distance"] + ) + else: + rendered += ".post0.dev%%d" %% (pieces["distance"]) + else: + # no commits, use the tag as the version + rendered = pieces["closest-tag"] + else: + # exception #1 + rendered = "0.post0.dev%%d" %% pieces["distance"] + return rendered + + +def render_pep440_post(pieces): + """TAG[.postDISTANCE[.dev0]+gHEX] . + + The ".dev0" means dirty. Note that .dev0 sorts backwards + (a dirty tree will appear "older" than the corresponding clean one), + but you shouldn't be releasing software with -dirty anyways. + + Exceptions: + 1: no tags. 0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%%d" %% pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += plus_or_dot(pieces) + rendered += "g%%s" %% pieces["short"] + else: + # exception #1 + rendered = "0.post%%d" %% pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += "+g%%s" %% pieces["short"] + return rendered + + +def render_pep440_post_branch(pieces): + """TAG[.postDISTANCE[.dev0]+gHEX[.dirty]] . + + The ".dev0" means not master branch. + + Exceptions: + 1: no tags. 0.postDISTANCE[.dev0]+gHEX[.dirty] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%%d" %% pieces["distance"] + if pieces["branch"] != "master": + rendered += ".dev0" + rendered += plus_or_dot(pieces) + rendered += "g%%s" %% pieces["short"] + if pieces["dirty"]: + rendered += ".dirty" + else: + # exception #1 + rendered = "0.post%%d" %% pieces["distance"] + if pieces["branch"] != "master": + rendered += ".dev0" + rendered += "+g%%s" %% pieces["short"] + if pieces["dirty"]: + rendered += ".dirty" + return rendered + + +def render_pep440_old(pieces): + """TAG[.postDISTANCE[.dev0]] . + + The ".dev0" means dirty. + + Exceptions: + 1: no tags. 
0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%%d" %% pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + else: + # exception #1 + rendered = "0.post%%d" %% pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + return rendered + + +def render_git_describe(pieces): + """TAG[-DISTANCE-gHEX][-dirty]. + + Like 'git describe --tags --dirty --always'. + + Exceptions: + 1: no tags. HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"]: + rendered += "-%%d-g%%s" %% (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render_git_describe_long(pieces): + """TAG-DISTANCE-gHEX[-dirty]. + + Like 'git describe --tags --dirty --always -long'. + The distance/hash is unconditional. + + Exceptions: + 1: no tags. HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + rendered += "-%%d-g%%s" %% (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render(pieces, style): + """Render the given version pieces into the requested style.""" + if pieces["error"]: + return {"version": "unknown", + "full-revisionid": pieces.get("long"), + "dirty": None, + "error": pieces["error"], + "date": None} + + if not style or style == "default": + style = "pep440" # the default + + if style == "pep440": + rendered = render_pep440(pieces) + elif style == "pep440-branch": + rendered = render_pep440_branch(pieces) + elif style == "pep440-pre": + rendered = render_pep440_pre(pieces) + elif style == "pep440-post": + rendered = render_pep440_post(pieces) + elif style == "pep440-post-branch": + rendered = render_pep440_post_branch(pieces) + elif style == "pep440-old": + rendered = render_pep440_old(pieces) + elif style == "git-describe": + rendered = render_git_describe(pieces) + elif style == "git-describe-long": + rendered = render_git_describe_long(pieces) + else: + raise ValueError("unknown style '%%s'" %% style) + + return {"version": rendered, "full-revisionid": pieces["long"], + "dirty": pieces["dirty"], "error": None, + "date": pieces.get("date")} + + +def get_versions(): + """Get version information or return default if unable to do so.""" + # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have + # __file__, we can work backwards from there to the root. Some + # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which + # case we can only use expanded keywords. + + cfg = get_config() + verbose = cfg.verbose + + try: + return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, + verbose) + except NotThisMethod: + pass + + try: + root = os.path.realpath(__file__) + # versionfile_source is the relative path from the top of the source + # tree (where the .git directory might live) to this file. Invert + # this to find the root from __file__. 
+ for _ in cfg.versionfile_source.split('/'): + root = os.path.dirname(root) + except NameError: + return {"version": "0+unknown", "full-revisionid": None, + "dirty": None, + "error": "unable to find root of source tree", + "date": None} + + try: + pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose) + return render(pieces, cfg.style) + except NotThisMethod: + pass + + try: + if cfg.parentdir_prefix: + return versions_from_parentdir(cfg.parentdir_prefix, root, verbose) + except NotThisMethod: + pass + + return {"version": "0+unknown", "full-revisionid": None, + "dirty": None, + "error": "unable to compute version", "date": None} +''' + + +@register_vcs_handler("git", "get_keywords") +def git_get_keywords(versionfile_abs): + """Extract version information from the given file.""" + # the code embedded in _version.py can just fetch the value of these + # keywords. When used from setup.py, we don't want to import _version.py, + # so we do it with a regexp instead. This function is not used from + # _version.py. + keywords = {} + try: + with open(versionfile_abs, "r") as fobj: + for line in fobj: + if line.strip().startswith("git_refnames ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["refnames"] = mo.group(1) + if line.strip().startswith("git_full ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["full"] = mo.group(1) + if line.strip().startswith("git_date ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["date"] = mo.group(1) + except OSError: + pass + return keywords + + +@register_vcs_handler("git", "keywords") +def git_versions_from_keywords(keywords, tag_prefix, verbose): + """Get version information from git keywords.""" + if "refnames" not in keywords: + raise NotThisMethod("Short version file found") + date = keywords.get("date") + if date is not None: + # Use only the last line. Previous lines may contain GPG signature + # information. + date = date.splitlines()[-1] + + # git-2.2.0 added "%cI", which expands to an ISO-8601 -compliant + # datestamp. However we prefer "%ci" (which expands to an "ISO-8601 + # -like" string, which we must then edit to make compliant), because + # it's been around since git-1.5.3, and it's too difficult to + # discover which version we're using, or to work around using an + # older one. + date = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + refnames = keywords["refnames"].strip() + if refnames.startswith("$Format"): + if verbose: + print("keywords are unexpanded, not using") + raise NotThisMethod("unexpanded keywords, not a git-archive tarball") + refs = {r.strip() for r in refnames.strip("()").split(",")} + # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of + # just "foo-1.0". If we see a "tag: " prefix, prefer those. + TAG = "tag: " + tags = {r[len(TAG) :] for r in refs if r.startswith(TAG)} + if not tags: + # Either we're using git < 1.8.3, or there really are no tags. We use + # a heuristic: assume all version tags have a digit. The old git %d + # expansion behaves like git log --decorate=short and strips out the + # refs/heads/ and refs/tags/ prefixes that would let us distinguish + # between branches and tags. By ignoring refnames without digits, we + # filter out many common branch names like "release" and + # "stabilization", as well as "HEAD" and "master". 
+ tags = {r for r in refs if re.search(r"\d", r)} + if verbose: + print("discarding '%s', no digits" % ",".join(refs - tags)) + if verbose: + print("likely tags: %s" % ",".join(sorted(tags))) + for ref in sorted(tags): + # sorting will prefer e.g. "2.0" over "2.0rc1" + if ref.startswith(tag_prefix): + r = ref[len(tag_prefix) :] + # Filter out refs that exactly match prefix or that don't start + # with a number once the prefix is stripped (mostly a concern + # when prefix is '') + if not re.match(r"\d", r): + continue + if verbose: + print("picking %s" % r) + return { + "version": r, + "full-revisionid": keywords["full"].strip(), + "dirty": False, + "error": None, + "date": date, + } + # no suitable tags, so version is "0+unknown", but full hex is still there + if verbose: + print("no suitable tags, using unknown + full revision id") + return { + "version": "0+unknown", + "full-revisionid": keywords["full"].strip(), + "dirty": False, + "error": "no suitable tags", + "date": None, + } + + +@register_vcs_handler("git", "pieces_from_vcs") +def git_pieces_from_vcs(tag_prefix, root, verbose, runner=run_command): + """Get version from 'git describe' in the root of the source tree. + + This only gets called if the git-archive 'subst' keywords were *not* + expanded, and _version.py hasn't already been rewritten with a short + version string, meaning we're inside a checked out source tree. + """ + GITS = ["git"] + if sys.platform == "win32": + GITS = ["git.cmd", "git.exe"] + + # GIT_DIR can interfere with correct operation of Versioneer. + # It may be intended to be passed to the Versioneer-versioned project, + # but that should not change where we get our version from. + env = os.environ.copy() + env.pop("GIT_DIR", None) + runner = functools.partial(runner, env=env) + + _, rc = runner( + GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=True + ) + if rc != 0: + if verbose: + print("Directory %s not under git control" % root) + raise NotThisMethod("'git rev-parse --git-dir' returned error") + + # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] + # if there isn't one, this yields HEX[-dirty] (no NUM) + describe_out, rc = runner( + GITS, + [ + "describe", + "--tags", + "--dirty", + "--always", + "--long", + "--match", + f"{tag_prefix}[[:digit:]]*", + ], + cwd=root, + ) + # --long was added in git-1.5.5 + if describe_out is None: + raise NotThisMethod("'git describe' failed") + describe_out = describe_out.strip() + full_out, rc = runner(GITS, ["rev-parse", "HEAD"], cwd=root) + if full_out is None: + raise NotThisMethod("'git rev-parse' failed") + full_out = full_out.strip() + + pieces = {} + pieces["long"] = full_out + pieces["short"] = full_out[:7] # maybe improved later + pieces["error"] = None + + branch_name, rc = runner( + GITS, ["rev-parse", "--abbrev-ref", "HEAD"], cwd=root + ) + # --abbrev-ref was added in git-1.6.3 + if rc != 0 or branch_name is None: + raise NotThisMethod("'git rev-parse --abbrev-ref' returned error") + branch_name = branch_name.strip() + + if branch_name == "HEAD": + # If we aren't exactly on a branch, pick a branch which represents + # the current commit. If all else fails, we are on a branchless + # commit. 
+ branches, rc = runner(GITS, ["branch", "--contains"], cwd=root) + # --contains was added in git-1.5.4 + if rc != 0 or branches is None: + raise NotThisMethod("'git branch --contains' returned error") + branches = branches.split("\n") + + # Remove the first line if we're running detached + if "(" in branches[0]: + branches.pop(0) + + # Strip off the leading "* " from the list of branches. + branches = [branch[2:] for branch in branches] + if "master" in branches: + branch_name = "master" + elif not branches: + branch_name = None + else: + # Pick the first branch that is returned. Good or bad. + branch_name = branches[0] + + pieces["branch"] = branch_name + + # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] + # TAG might have hyphens. + git_describe = describe_out + + # look for -dirty suffix + dirty = git_describe.endswith("-dirty") + pieces["dirty"] = dirty + if dirty: + git_describe = git_describe[: git_describe.rindex("-dirty")] + + # now we have TAG-NUM-gHEX or HEX + + if "-" in git_describe: + # TAG-NUM-gHEX + mo = re.search(r"^(.+)-(\d+)-g([0-9a-f]+)$", git_describe) + if not mo: + # unparsable. Maybe git-describe is misbehaving? + pieces["error"] = ( + "unable to parse git-describe output: '%s'" % describe_out + ) + return pieces + + # tag + full_tag = mo.group(1) + if not full_tag.startswith(tag_prefix): + if verbose: + fmt = "tag '%s' doesn't start with prefix '%s'" + print(fmt % (full_tag, tag_prefix)) + pieces["error"] = "tag '%s' doesn't start with prefix '%s'" % ( + full_tag, + tag_prefix, + ) + return pieces + pieces["closest-tag"] = full_tag[len(tag_prefix) :] + + # distance: number of commits since tag + pieces["distance"] = int(mo.group(2)) + + # commit: short hex revision ID + pieces["short"] = mo.group(3) + + else: + # HEX: no tags + pieces["closest-tag"] = None + out, rc = runner(GITS, ["rev-list", "HEAD", "--left-right"], cwd=root) + pieces["distance"] = len(out.split()) # total number of commits + + # commit date: see ISO-8601 comment in git_versions_from_keywords() + date = runner(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[ + 0 + ].strip() + # Use only the last line. Previous lines may contain GPG signature + # information. + date = date.splitlines()[-1] + pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + + return pieces + + +def do_vcs_install(versionfile_source, ipy): + """Git-specific installation logic for Versioneer. + + For Git, this means creating/changing .gitattributes to mark _version.py + for export-subst keyword substitution. + """ + GITS = ["git"] + if sys.platform == "win32": + GITS = ["git.cmd", "git.exe"] + files = [versionfile_source] + if ipy: + files.append(ipy) + try: + my_path = __file__ + if my_path.endswith(".pyc") or my_path.endswith(".pyo"): + my_path = os.path.splitext(my_path)[0] + ".py" + versioneer_file = os.path.relpath(my_path) + except NameError: + versioneer_file = "versioneer.py" + files.append(versioneer_file) + present = False + try: + with open(".gitattributes", "r") as fobj: + for line in fobj: + if line.strip().startswith(versionfile_source): + if "export-subst" in line.strip().split()[1:]: + present = True + break + except OSError: + pass + if not present: + with open(".gitattributes", "a+") as fobj: + fobj.write(f"{versionfile_source} export-subst\n") + files.append(".gitattributes") + run_command(GITS, ["add", "--"] + files) + + +def versions_from_parentdir(parentdir_prefix, root, verbose): + """Try to determine the version from the parent directory name. 
+ + Source tarballs conventionally unpack into a directory that includes both + the project name and a version string. We will also support searching up + two directory levels for an appropriately named parent directory + """ + rootdirs = [] + + for _ in range(3): + dirname = os.path.basename(root) + if dirname.startswith(parentdir_prefix): + return { + "version": dirname[len(parentdir_prefix) :], + "full-revisionid": None, + "dirty": False, + "error": None, + "date": None, + } + rootdirs.append(root) + root = os.path.dirname(root) # up a level + + if verbose: + print( + "Tried directories %s but none started with prefix %s" + % (str(rootdirs), parentdir_prefix) + ) + raise NotThisMethod("rootdir doesn't start with parentdir_prefix") + + +SHORT_VERSION_PY = """ +# This file was generated by 'versioneer.py' (0.23) from +# revision-control system data, or from the parent directory name of an +# unpacked source archive. Distribution tarballs contain a pre-generated copy +# of this file. + +import json + +version_json = ''' +%s +''' # END VERSION_JSON + + +def get_versions(): + return json.loads(version_json) +""" + + +def versions_from_file(filename): + """Try to determine the version from _version.py if present.""" + try: + with open(filename) as f: + contents = f.read() + except OSError: + raise NotThisMethod("unable to read _version.py") + mo = re.search( + r"version_json = '''\n(.*)''' # END VERSION_JSON", + contents, + re.M | re.S, + ) + if not mo: + mo = re.search( + r"version_json = '''\r\n(.*)''' # END VERSION_JSON", + contents, + re.M | re.S, + ) + if not mo: + raise NotThisMethod("no version_json in _version.py") + return json.loads(mo.group(1)) + + +def write_to_version_file(filename, versions): + """Write the given version number to the given _version.py file.""" + os.unlink(filename) + contents = json.dumps( + versions, sort_keys=True, indent=1, separators=(",", ": ") + ) + with open(filename, "w") as f: + f.write(SHORT_VERSION_PY % contents) + + print("set %s to '%s'" % (filename, versions["version"])) + + +def plus_or_dot(pieces): + """Return a + if we don't already have one, else return a .""" + if "+" in pieces.get("closest-tag", ""): + return "." + return "+" + + +def render_pep440(pieces): + """Build up version string, with post-release "local version identifier". + + Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you + get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty + + Exceptions: + 1: no tags. git_describe was just HEX. 0+untagged.DISTANCE.gHEX[.dirty] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += plus_or_dot(pieces) + rendered += "%d.g%s" % (pieces["distance"], pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + else: + # exception #1 + rendered = "0+untagged.%d.g%s" % (pieces["distance"], pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + return rendered + + +def render_pep440_branch(pieces): + """TAG[[.dev0]+DISTANCE.gHEX[.dirty]] . + + The ".dev0" means not master branch. Note that .dev0 sorts backwards + (a feature branch will appear "older" than the master branch). + + Exceptions: + 1: no tags. 
0[.dev0]+untagged.DISTANCE.gHEX[.dirty]
+    """
+    if pieces["closest-tag"]:
+        rendered = pieces["closest-tag"]
+        if pieces["distance"] or pieces["dirty"]:
+            if pieces["branch"] != "master":
+                rendered += ".dev0"
+            rendered += plus_or_dot(pieces)
+            rendered += "%d.g%s" % (pieces["distance"], pieces["short"])
+            if pieces["dirty"]:
+                rendered += ".dirty"
+    else:
+        # exception #1
+        rendered = "0"
+        if pieces["branch"] != "master":
+            rendered += ".dev0"
+        rendered += "+untagged.%d.g%s" % (pieces["distance"], pieces["short"])
+        if pieces["dirty"]:
+            rendered += ".dirty"
+    return rendered
+
+
+def pep440_split_post(ver):
+    """Split pep440 version string at the post-release segment.
+
+    Returns the release segments before the post-release and the
+    post-release version number (or None if no post-release segment is
+    present).
+    """
+    vc = str.split(ver, ".post")
+    return vc[0], int(vc[1] or 0) if len(vc) == 2 else None
+
+
+def render_pep440_pre(pieces):
+    """TAG[.postN.devDISTANCE] -- No -dirty.
+
+    Exceptions:
+    1: no tags. 0.post0.devDISTANCE
+    """
+    if pieces["closest-tag"]:
+        if pieces["distance"]:
+            # update the post release segment
+            tag_version, post_version = pep440_split_post(
+                pieces["closest-tag"]
+            )
+            rendered = tag_version
+            if post_version is not None:
+                rendered += ".post%d.dev%d" % (
+                    post_version + 1,
+                    pieces["distance"],
+                )
+            else:
+                rendered += ".post0.dev%d" % (pieces["distance"])
+        else:
+            # no commits, use the tag as the version
+            rendered = pieces["closest-tag"]
+    else:
+        # exception #1
+        rendered = "0.post0.dev%d" % pieces["distance"]
+    return rendered
+
+
+def render_pep440_post(pieces):
+    """TAG[.postDISTANCE[.dev0]+gHEX] .
+
+    The ".dev0" means dirty. Note that .dev0 sorts backwards
+    (a dirty tree will appear "older" than the corresponding clean one),
+    but you shouldn't be releasing software with -dirty anyway.
+
+    Exceptions:
+    1: no tags. 0.postDISTANCE[.dev0]
+    """
+    if pieces["closest-tag"]:
+        rendered = pieces["closest-tag"]
+        if pieces["distance"] or pieces["dirty"]:
+            rendered += ".post%d" % pieces["distance"]
+            if pieces["dirty"]:
+                rendered += ".dev0"
+            rendered += plus_or_dot(pieces)
+            rendered += "g%s" % pieces["short"]
+    else:
+        # exception #1
+        rendered = "0.post%d" % pieces["distance"]
+        if pieces["dirty"]:
+            rendered += ".dev0"
+        rendered += "+g%s" % pieces["short"]
+    return rendered
+
+
+def render_pep440_post_branch(pieces):
+    """TAG[.postDISTANCE[.dev0]+gHEX[.dirty]] .
+
+    The ".dev0" means not master branch.
+
+    Exceptions:
+    1: no tags. 0.postDISTANCE[.dev0]+gHEX[.dirty]
+    """
+    if pieces["closest-tag"]:
+        rendered = pieces["closest-tag"]
+        if pieces["distance"] or pieces["dirty"]:
+            rendered += ".post%d" % pieces["distance"]
+            if pieces["branch"] != "master":
+                rendered += ".dev0"
+            rendered += plus_or_dot(pieces)
+            rendered += "g%s" % pieces["short"]
+            if pieces["dirty"]:
+                rendered += ".dirty"
+    else:
+        # exception #1
+        rendered = "0.post%d" % pieces["distance"]
+        if pieces["branch"] != "master":
+            rendered += ".dev0"
+        rendered += "+g%s" % pieces["short"]
+        if pieces["dirty"]:
+            rendered += ".dirty"
+    return rendered
+
+
+def render_pep440_old(pieces):
+    """TAG[.postDISTANCE[.dev0]] .
+
+    The ".dev0" means dirty.
+
+    Exceptions:
+    1: no tags. 
0.postDISTANCE[.dev0]
+    """
+    if pieces["closest-tag"]:
+        rendered = pieces["closest-tag"]
+        if pieces["distance"] or pieces["dirty"]:
+            rendered += ".post%d" % pieces["distance"]
+            if pieces["dirty"]:
+                rendered += ".dev0"
+    else:
+        # exception #1
+        rendered = "0.post%d" % pieces["distance"]
+        if pieces["dirty"]:
+            rendered += ".dev0"
+    return rendered
+
+
+def render_git_describe(pieces):
+    """TAG[-DISTANCE-gHEX][-dirty].
+
+    Like 'git describe --tags --dirty --always'.
+
+    Exceptions:
+    1: no tags. HEX[-dirty] (note: no 'g' prefix)
+    """
+    if pieces["closest-tag"]:
+        rendered = pieces["closest-tag"]
+        if pieces["distance"]:
+            rendered += "-%d-g%s" % (pieces["distance"], pieces["short"])
+    else:
+        # exception #1
+        rendered = pieces["short"]
+    if pieces["dirty"]:
+        rendered += "-dirty"
+    return rendered
+
+
+def render_git_describe_long(pieces):
+    """TAG-DISTANCE-gHEX[-dirty].
+
+    Like 'git describe --tags --dirty --always --long'.
+    The distance/hash is unconditional.
+
+    Exceptions:
+    1: no tags. HEX[-dirty] (note: no 'g' prefix)
+    """
+    if pieces["closest-tag"]:
+        rendered = pieces["closest-tag"]
+        rendered += "-%d-g%s" % (pieces["distance"], pieces["short"])
+    else:
+        # exception #1
+        rendered = pieces["short"]
+    if pieces["dirty"]:
+        rendered += "-dirty"
+    return rendered
+
+
+def render(pieces, style):
+    """Render the given version pieces into the requested style."""
+    if pieces["error"]:
+        return {
+            "version": "unknown",
+            "full-revisionid": pieces.get("long"),
+            "dirty": None,
+            "error": pieces["error"],
+            "date": None,
+        }
+
+    if not style or style == "default":
+        style = "pep440"  # the default
+
+    if style == "pep440":
+        rendered = render_pep440(pieces)
+    elif style == "pep440-branch":
+        rendered = render_pep440_branch(pieces)
+    elif style == "pep440-pre":
+        rendered = render_pep440_pre(pieces)
+    elif style == "pep440-post":
+        rendered = render_pep440_post(pieces)
+    elif style == "pep440-post-branch":
+        rendered = render_pep440_post_branch(pieces)
+    elif style == "pep440-old":
+        rendered = render_pep440_old(pieces)
+    elif style == "git-describe":
+        rendered = render_git_describe(pieces)
+    elif style == "git-describe-long":
+        rendered = render_git_describe_long(pieces)
+    else:
+        raise ValueError("unknown style '%s'" % style)
+
+    return {
+        "version": rendered,
+        "full-revisionid": pieces["long"],
+        "dirty": pieces["dirty"],
+        "error": None,
+        "date": pieces.get("date"),
+    }
+
+
+class VersioneerBadRootError(Exception):
+    """The project root directory is unknown or missing key files."""
+
+
+def get_versions(verbose=False):
+    """Get the project version from whatever source is available.
+
+    Returns a dict with keys 'version', 'full-revisionid', 'dirty',
+    'error', and 'date'.
+    """
+    if "versioneer" in sys.modules:
+        # see the discussion in cmdclass.py:get_cmdclass()
+        del sys.modules["versioneer"]
+
+    root = get_root()
+    cfg = get_config_from_root(root)
+
+    assert cfg.VCS is not None, "please set [versioneer]VCS= in setup.cfg"
+    handlers = HANDLERS.get(cfg.VCS)
+    assert handlers, "unrecognized VCS '%s'" % cfg.VCS
+    verbose = verbose or cfg.verbose
+    assert (
+        cfg.versionfile_source is not None
+    ), "please set versioneer.versionfile_source"
+    assert cfg.tag_prefix is not None, "please set versioneer.tag_prefix"
+
+    versionfile_abs = os.path.join(root, cfg.versionfile_source)
+
+    # extract version from first of: _version.py, VCS command (e.g. 'git
+    # describe'), parentdir. 
This is meant to work for developers using a
+    # source checkout, for users of a tarball created by 'setup.py sdist',
+    # and for users of a tarball/zipball created by 'git archive' or github's
+    # download-from-tag feature or the equivalent in other VCSes.
+
+    get_keywords_f = handlers.get("get_keywords")
+    from_keywords_f = handlers.get("keywords")
+    if get_keywords_f and from_keywords_f:
+        try:
+            keywords = get_keywords_f(versionfile_abs)
+            ver = from_keywords_f(keywords, cfg.tag_prefix, verbose)
+            if verbose:
+                print("got version from expanded keyword %s" % ver)
+            return ver
+        except NotThisMethod:
+            pass
+
+    try:
+        ver = versions_from_file(versionfile_abs)
+        if verbose:
+            print("got version from file %s %s" % (versionfile_abs, ver))
+        return ver
+    except NotThisMethod:
+        pass
+
+    from_vcs_f = handlers.get("pieces_from_vcs")
+    if from_vcs_f:
+        try:
+            pieces = from_vcs_f(cfg.tag_prefix, root, verbose)
+            ver = render(pieces, cfg.style)
+            if verbose:
+                print("got version from VCS %s" % ver)
+            return ver
+        except NotThisMethod:
+            pass
+
+    try:
+        if cfg.parentdir_prefix:
+            ver = versions_from_parentdir(cfg.parentdir_prefix, root, verbose)
+            if verbose:
+                print("got version from parentdir %s" % ver)
+            return ver
+    except NotThisMethod:
+        pass
+
+    if verbose:
+        print("unable to compute version")
+
+    return {
+        "version": "0+unknown",
+        "full-revisionid": None,
+        "dirty": None,
+        "error": "unable to compute version",
+        "date": None,
+    }
+
+
+def get_version():
+    """Get the short version string for this project."""
+    return get_versions()["version"]
+
+
+def get_cmdclass(cmdclass=None):
+    """Get the custom setuptools subclasses used by Versioneer.
+
+    If the package uses a different cmdclass (e.g. one from numpy), it
+    should be provided as an argument.
+    """
+    if "versioneer" in sys.modules:
+        del sys.modules["versioneer"]
+        # this fixes the "python setup.py develop" case (also 'install' and
+        # 'easy_install .'), in which subdependencies of the main project are
+        # built (using setup.py bdist_egg) in the same python process. Assume
+        # a main project A and a dependency B, which use different versions
+        # of Versioneer. A's setup.py imports A's Versioneer, leaving it in
+        # sys.modules by the time B's setup.py is executed, causing B to run
+        # with the wrong versioneer. Setuptools wraps the sub-dep builds in a
+        # sandbox that restores sys.modules to its pre-build state, so the
+        # parent is protected against the child's "import versioneer". By
+        # removing ourselves from sys.modules here, before the child build
+        # happens, we protect the child from the parent's versioneer too.
+        # Also see
+        # https://github.com/python-versioneer/python-versioneer/issues/52
+
+    cmds = {} if cmdclass is None else cmdclass.copy()
+
+    # we add "version" to setuptools
+    from setuptools import Command
+
+    class cmd_version(Command):
+        description = "report generated version string"
+        user_options = []
+        boolean_options = []
+
+        def initialize_options(self):
+            pass
+
+        def finalize_options(self):
+            pass
+
+        def run(self):
+            vers = get_versions(verbose=True)
+            print("Version: %s" % vers["version"])
+            print(" full-revisionid: %s" % vers.get("full-revisionid"))
+            print(" dirty: %s" % vers.get("dirty"))
+            print(" date: %s" % vers.get("date"))
+            if vers["error"]:
+                print(" error: %s" % vers["error"])
+
+    cmds["version"] = cmd_version
+
+    # we override "build_py" in setuptools
+    #
+    # most invocation pathways end up running build_py:
+    #  distutils/build -> build_py
+    #  distutils/install -> distutils/build ->..
+    #  setuptools/bdist_wheel -> distutils/install ->..
+    #  setuptools/bdist_egg -> distutils/install_lib -> build_py
+    #  setuptools/install -> bdist_egg ->..
+    #  setuptools/develop -> ?
+    #  pip install:
+    #   copies source tree to a tempdir before running egg_info/etc
+    #   if .git isn't copied too, 'git describe' will fail
+    #   then does setup.py bdist_wheel, or sometimes setup.py install
+    #  setup.py egg_info -> ?
+
+    # pip install -e . and setuptools/editable_wheel will invoke build_py
+    # but the build_py command is not expected to copy any files.
+
+    # we override different "build_py" commands for both environments
+    if "build_py" in cmds:
+        _build_py = cmds["build_py"]
+    else:
+        from setuptools.command.build_py import build_py as _build_py
+
+    class cmd_build_py(_build_py):
+        def run(self):
+            root = get_root()
+            cfg = get_config_from_root(root)
+            versions = get_versions()
+            _build_py.run(self)
+            if getattr(self, "editable_mode", False):
+                # During editable installs `.py` and data files are
+                # not copied to build_lib
+                return
+            # now locate _version.py in the new build/ directory and replace
+            # it with an updated value
+            if cfg.versionfile_build:
+                target_versionfile = os.path.join(
+                    self.build_lib, cfg.versionfile_build
+                )
+                print("UPDATING %s" % target_versionfile)
+                write_to_version_file(target_versionfile, versions)
+
+    cmds["build_py"] = cmd_build_py
+
+    if "build_ext" in cmds:
+        _build_ext = cmds["build_ext"]
+    else:
+        from setuptools.command.build_ext import build_ext as _build_ext
+
+    class cmd_build_ext(_build_ext):
+        def run(self):
+            root = get_root()
+            cfg = get_config_from_root(root)
+            versions = get_versions()
+            _build_ext.run(self)
+            if self.inplace:
+                # build_ext --inplace will only build extensions in
+                # build/lib<..> dir with no _version.py to write to.
+                # As in-place builds will already have a _version.py
+                # in the module dir, we do not need to write one.
+                return
+            # now locate _version.py in the new build/ directory and replace
+            # it with an updated value
+            target_versionfile = os.path.join(
+                self.build_lib, cfg.versionfile_build
+            )
+            if not os.path.exists(target_versionfile):
+                print(
+                    f"Warning: {target_versionfile} does not exist, skipping "
+                    "version update. This can happen if you are running "
+                    "build_ext without first running build_py."
+                )
+                return
+            print("UPDATING %s" % target_versionfile)
+            write_to_version_file(target_versionfile, versions)
+
+    cmds["build_ext"] = cmd_build_ext
+
+    if "cx_Freeze" in sys.modules:  # cx_freeze enabled?
+        from cx_Freeze.dist import build_exe as _build_exe
+
+        # nczeczulin reports that py2exe won't like the pep440-style string
+        # as FILEVERSION, but it can be used for PRODUCTVERSION, e.g.
+        # setup(console=[{
+        #   "version": versioneer.get_version().split("+", 1)[0],  # FILEVERSION
+        #   "product_version": versioneer.get_version(),
+        #   ...
+
+        class cmd_build_exe(_build_exe):
+            def run(self):
+                root = get_root()
+                cfg = get_config_from_root(root)
+                versions = get_versions()
+                target_versionfile = cfg.versionfile_source
+                print("UPDATING %s" % target_versionfile)
+                write_to_version_file(target_versionfile, versions)
+
+                _build_exe.run(self)
+                os.unlink(target_versionfile)
+                with open(cfg.versionfile_source, "w") as f:
+                    LONG = LONG_VERSION_PY[cfg.VCS]
+                    f.write(
+                        LONG
+                        % {
+                            "DOLLAR": "$",
+                            "STYLE": cfg.style,
+                            "TAG_PREFIX": cfg.tag_prefix,
+                            "PARENTDIR_PREFIX": cfg.parentdir_prefix,
+                            "VERSIONFILE_SOURCE": cfg.versionfile_source,
+                        }
+                    )
+
+        cmds["build_exe"] = cmd_build_exe
+        del cmds["build_py"]
+
+    if "py2exe" in sys.modules:  # py2exe enabled?
+        from py2exe.distutils_buildexe import py2exe as _py2exe
+
+        class cmd_py2exe(_py2exe):
+            def run(self):
+                root = get_root()
+                cfg = get_config_from_root(root)
+                versions = get_versions()
+                target_versionfile = cfg.versionfile_source
+                print("UPDATING %s" % target_versionfile)
+                write_to_version_file(target_versionfile, versions)
+
+                _py2exe.run(self)
+                os.unlink(target_versionfile)
+                with open(cfg.versionfile_source, "w") as f:
+                    LONG = LONG_VERSION_PY[cfg.VCS]
+                    f.write(
+                        LONG
+                        % {
+                            "DOLLAR": "$",
+                            "STYLE": cfg.style,
+                            "TAG_PREFIX": cfg.tag_prefix,
+                            "PARENTDIR_PREFIX": cfg.parentdir_prefix,
+                            "VERSIONFILE_SOURCE": cfg.versionfile_source,
+                        }
+                    )
+
+        cmds["py2exe"] = cmd_py2exe
+
+    # sdist farms its file list building out to egg_info
+    if "egg_info" in cmds:
+        _egg_info = cmds["egg_info"]
+    else:
+        from setuptools.command.egg_info import egg_info as _egg_info
+
+    class cmd_egg_info(_egg_info):
+        def find_sources(self):
+            # egg_info.find_sources builds the manifest list and writes it
+            # in one shot
+            super().find_sources()
+
+            # Modify the filelist and normalize it
+            root = get_root()
+            cfg = get_config_from_root(root)
+            self.filelist.append("versioneer.py")
+            if cfg.versionfile_source:
+                # There are rare cases where versionfile_source might not be
+                # included by default, so we must be explicit
+                self.filelist.append(cfg.versionfile_source)
+            self.filelist.sort()
+            self.filelist.remove_duplicates()
+
+            # The write method is hidden in the manifest_maker instance that
+            # generated the filelist and was thrown away
+            # We will instead replicate their final normalization (to unicode,
+            # and POSIX-style paths)
+            from setuptools import unicode_utils
+
+            normalized = [
+                unicode_utils.filesys_decode(f).replace(os.sep, "/")
+                for f in self.filelist.files
+            ]
+
+            manifest_filename = os.path.join(self.egg_info, "SOURCES.txt")
+            with open(manifest_filename, "w") as fobj:
+                fobj.write("\n".join(normalized))
+
+    cmds["egg_info"] = cmd_egg_info
+
+    # we override different "sdist" commands for both environments
+    if "sdist" in cmds:
+        _sdist = cmds["sdist"]
+    else:
+        from setuptools.command.sdist import sdist as _sdist
+
+    class cmd_sdist(_sdist):
+        def run(self):
+            versions = get_versions()
+            self._versioneer_generated_versions = versions
+            # unless we update this, the command will keep using the old
+            # version
+            self.distribution.metadata.version = versions["version"]
+            return _sdist.run(self)
+
+        def make_release_tree(self, base_dir, files):
+            root = get_root()
+            cfg = get_config_from_root(root)
+            _sdist.make_release_tree(self, base_dir, files)
+            # now locate _version.py in the new base_dir directory
+            # (remembering that it may be a hardlink) and replace it with an
+            # updated value
+            target_versionfile = os.path.join(base_dir, cfg.versionfile_source)
+            print("UPDATING %s" % 
target_versionfile) + write_to_version_file( + target_versionfile, self._versioneer_generated_versions + ) + + cmds["sdist"] = cmd_sdist + + return cmds + + +CONFIG_ERROR = """ +setup.cfg is missing the necessary Versioneer configuration. You need +a section like: + + [versioneer] + VCS = git + style = pep440 + versionfile_source = src/myproject/_version.py + versionfile_build = myproject/_version.py + tag_prefix = + parentdir_prefix = myproject- + +You will also need to edit your setup.py to use the results: + + import versioneer + setup(version=versioneer.get_version(), + cmdclass=versioneer.get_cmdclass(), ...) + +Please read the docstring in ./versioneer.py for configuration instructions, +edit setup.cfg, and re-run the installer or 'python versioneer.py setup'. +""" + +SAMPLE_CONFIG = """ +# See the docstring in versioneer.py for instructions. Note that you must +# re-run 'versioneer.py setup' after changing this section, and commit the +# resulting files. + +[versioneer] +#VCS = git +#style = pep440 +#versionfile_source = +#versionfile_build = +#tag_prefix = +#parentdir_prefix = + +""" + +OLD_SNIPPET = """ +from ._version import get_versions +__version__ = get_versions()['version'] +del get_versions +""" + +INIT_PY_SNIPPET = """ +from . import {0} +__version__ = {0}.get_versions()['version'] +""" + + +def do_setup(): + """Do main VCS-independent setup function for installing Versioneer.""" + root = get_root() + try: + cfg = get_config_from_root(root) + except ( + OSError, + configparser.NoSectionError, + configparser.NoOptionError, + ) as e: + if isinstance(e, (OSError, configparser.NoSectionError)): + print( + "Adding sample versioneer config to setup.cfg", file=sys.stderr + ) + with open(os.path.join(root, "setup.cfg"), "a") as f: + f.write(SAMPLE_CONFIG) + print(CONFIG_ERROR, file=sys.stderr) + return 1 + + print(" creating %s" % cfg.versionfile_source) + with open(cfg.versionfile_source, "w") as f: + LONG = LONG_VERSION_PY[cfg.VCS] + f.write( + LONG + % { + "DOLLAR": "$", + "STYLE": cfg.style, + "TAG_PREFIX": cfg.tag_prefix, + "PARENTDIR_PREFIX": cfg.parentdir_prefix, + "VERSIONFILE_SOURCE": cfg.versionfile_source, + } + ) + + ipy = os.path.join(os.path.dirname(cfg.versionfile_source), "__init__.py") + if os.path.exists(ipy): + try: + with open(ipy, "r") as f: + old = f.read() + except OSError: + old = "" + module = os.path.splitext(os.path.basename(cfg.versionfile_source))[0] + snippet = INIT_PY_SNIPPET.format(module) + if OLD_SNIPPET in old: + print(" replacing boilerplate in %s" % ipy) + with open(ipy, "w") as f: + f.write(old.replace(OLD_SNIPPET, snippet)) + elif snippet not in old: + print(" appending to %s" % ipy) + with open(ipy, "a") as f: + f.write(snippet) + else: + print(" %s unmodified" % ipy) + else: + print(" %s doesn't exist, ok" % ipy) + ipy = None + + # Make VCS-specific changes. For git, this means creating/changing + # .gitattributes to mark _version.py for export-subst keyword + # substitution. 
+ do_vcs_install(cfg.versionfile_source, ipy) + return 0 + + +def scan_setup_py(): + """Validate the contents of setup.py against Versioneer's expectations.""" + found = set() + setters = False + errors = 0 + with open("setup.py", "r") as f: + for line in f.readlines(): + if "import versioneer" in line: + found.add("import") + if "versioneer.get_cmdclass()" in line: + found.add("cmdclass") + if "versioneer.get_version()" in line: + found.add("get_version") + if "versioneer.VCS" in line: + setters = True + if "versioneer.versionfile_source" in line: + setters = True + if len(found) != 3: + print("") + print("Your setup.py appears to be missing some important items") + print("(but I might be wrong). Please make sure it has something") + print("roughly like the following:") + print("") + print(" import versioneer") + print(" setup( version=versioneer.get_version(),") + print(" cmdclass=versioneer.get_cmdclass(), ...)") + print("") + errors += 1 + if setters: + print("You should remove lines like 'versioneer.VCS = ' and") + print("'versioneer.versionfile_source = ' . This configuration") + print("now lives in setup.cfg, and should be removed from setup.py") + print("") + errors += 1 + return errors + + +if __name__ == "__main__": + cmd = sys.argv[1] + if cmd == "setup": + errors = do_setup() + errors += scan_setup_py() + if errors: + sys.exit(1)
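For readers skimming the render_* helpers vendored above, it may help to see how the same git metadata comes out under a few of the styles. The sketch below is illustrative only and is not part of the patch: the tag, distance, hash, and branch values are invented, and it assumes this versioneer.py is importable as `versioneer` (e.g. from the repo root).

    # Illustrative only -- sample values are invented, not taken from the repo.
    import versioneer

    # The same shape of dict that git_pieces_from_vcs() builds.
    pieces = {
        "error": None,               # render() short-circuits if this is set
        "closest-tag": "22.10.0",    # nearest tag, with tag_prefix stripped
        "distance": 3,               # commits since that tag
        "short": "abc1234",          # abbreviated commit hash
        "long": "abc1234deadbeef",   # full commit hash (hypothetical)
        "dirty": True,               # uncommitted changes in the working tree
        "branch": "feature-x",       # only consulted by the *-branch styles
        "date": "2022-08-01T12:00:00+0000",
    }

    for style in ("pep440", "pep440-pre", "pep440-post", "git-describe"):
        print(style, "->", versioneer.render(pieces, style)["version"])
    # pep440       -> 22.10.0+3.gabc1234.dirty
    # pep440-pre   -> 22.10.0.post0.dev3   (ignores the dirty flag by design)
    # pep440-post  -> 22.10.0.post3.dev0+gabc1234
    # git-describe -> 22.10.0-3-gabc1234-dirty

Note that the pep440-branch and pep440-post-branch styles would additionally insert ".dev0" for this dict, since "branch" is not "master".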