From e46437c39e53a1f952e060d9159477617347b130 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 1 Oct 2024 12:13:47 -1000 Subject: [PATCH 1/5] Add remaining string.char_types APIs to pylibcudf (#16788) Contributes to https://github.com/rapidsai/cudf/issues/15162 Authors: - Matthew Roeschke (https://github.com/mroeschke) - Matthew Murray (https://github.com/Matt711) Approvers: - Lawrence Mitchell (https://github.com/wence-) - Matthew Murray (https://github.com/Matt711) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/16788 --- .../cudf/strings/char_types/char_types.hpp | 5 +- python/cudf/cudf/_lib/strings/char_types.pyx | 178 ++++++------------ .../pylibcudf/libcudf/strings/char_types.pxd | 3 - .../pylibcudf/strings/char_types.pxd | 16 ++ .../pylibcudf/strings/char_types.pyx | 89 +++++++++ .../pylibcudf/tests/test_string_char_types.py | 29 +++ 6 files changed, 195 insertions(+), 125 deletions(-) create mode 100644 python/pylibcudf/pylibcudf/tests/test_string_char_types.py diff --git a/cpp/include/cudf/strings/char_types/char_types.hpp b/cpp/include/cudf/strings/char_types/char_types.hpp index 3ebe5cb53e9..f229facca08 100644 --- a/cpp/include/cudf/strings/char_types/char_types.hpp +++ b/cpp/include/cudf/strings/char_types/char_types.hpp @@ -30,7 +30,7 @@ namespace strings { */ /** - * @brief Returns a boolean column identifying strings entries in which all + * @brief Returns a boolean column identifying string entries where all * characters are of the type specified. * * The output row entry will be set to false if the corresponding string element @@ -105,7 +105,8 @@ std::unique_ptr all_characters_of_type( * `types_to_remove` will be filtered. * @param mr Device memory resource used to allocate the returned column's device memory * @param stream CUDA stream used for device memory operations and kernel launches - * @return New column of boolean results for each string + * @return New strings column with the characters of specified types filtered out and replaced by + * the specified replacement string */ std::unique_ptr filter_characters_of_type( strings_column_view const& input, diff --git a/python/cudf/cudf/_lib/strings/char_types.pyx b/python/cudf/cudf/_lib/strings/char_types.pyx index 376a6f8af97..a57ce29eb45 100644 --- a/python/cudf/cudf/_lib/strings/char_types.pyx +++ b/python/cudf/cudf/_lib/strings/char_types.pyx @@ -1,23 +1,12 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. - from libcpp cimport bool -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.strings.char_types cimport ( - all_characters_of_type as cpp_all_characters_of_type, - filter_characters_of_type as cpp_filter_characters_of_type, - string_character_types, -) - from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar + +from pylibcudf.strings import char_types @acquire_spill_lock() @@ -25,26 +14,15 @@ def filter_alphanum(Column source_strings, object py_repl, bool keep=True): """ Returns a Column of strings keeping only alphanumeric character types. """ - - cdef DeviceScalar repl = py_repl.device_value - - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - cdef const string_scalar* scalar_repl = ( - repl.get_raw_ptr() + plc_column = char_types.filter_characters_of_type( + source_strings.to_pylibcudf(mode="read"), + char_types.StringCharacterTypes.ALL_TYPES if keep + else char_types.StringCharacterTypes.ALPHANUM, + py_repl.device_value.c_value, + char_types.StringCharacterTypes.ALPHANUM if keep + else char_types.StringCharacterTypes.ALL_TYPES ) - - with nogil: - c_result = move(cpp_filter_characters_of_type( - source_view, - string_character_types.ALL_TYPES if keep - else string_character_types.ALPHANUM, - scalar_repl[0], - string_character_types.ALPHANUM if keep - else string_character_types.ALL_TYPES - )) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -54,17 +32,12 @@ def is_decimal(Column source_strings): that contain only decimal characters -- those that can be used to extract base10 numbers. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_all_characters_of_type( - source_view, - string_character_types.DECIMAL, - string_character_types.ALL_TYPES - )) - - return Column.from_unique_ptr(move(c_result)) + plc_column = char_types.all_characters_of_type( + source_strings.to_pylibcudf(mode="read"), + char_types.StringCharacterTypes.DECIMAL, + char_types.StringCharacterTypes.ALL_TYPES + ) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -75,17 +48,12 @@ def is_alnum(Column source_strings): Equivalent to: is_alpha() or is_digit() or is_numeric() or is_decimal() """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_all_characters_of_type( - source_view, - string_character_types.ALPHANUM, - string_character_types.ALL_TYPES - )) - - return Column.from_unique_ptr(move(c_result)) + plc_column = char_types.all_characters_of_type( + source_strings.to_pylibcudf(mode="read"), + char_types.StringCharacterTypes.ALPHANUM, + char_types.StringCharacterTypes.ALL_TYPES + ) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -94,17 +62,12 @@ def is_alpha(Column source_strings): Returns a Column of boolean values with True for `source_strings` that contain only alphabetic characters. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_all_characters_of_type( - source_view, - string_character_types.ALPHA, - string_character_types.ALL_TYPES - )) - - return Column.from_unique_ptr(move(c_result)) + plc_column = char_types.all_characters_of_type( + source_strings.to_pylibcudf(mode="read"), + char_types.StringCharacterTypes.ALPHA, + char_types.StringCharacterTypes.ALL_TYPES + ) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -113,17 +76,12 @@ def is_digit(Column source_strings): Returns a Column of boolean values with True for `source_strings` that contain only decimal and digit characters. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_all_characters_of_type( - source_view, - string_character_types.DIGIT, - string_character_types.ALL_TYPES - )) - - return Column.from_unique_ptr(move(c_result)) + plc_column = char_types.all_characters_of_type( + source_strings.to_pylibcudf(mode="read"), + char_types.StringCharacterTypes.DIGIT, + char_types.StringCharacterTypes.ALL_TYPES + ) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -133,17 +91,12 @@ def is_numeric(Column source_strings): that contain only numeric characters. These include digit and numeric characters. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_all_characters_of_type( - source_view, - string_character_types.NUMERIC, - string_character_types.ALL_TYPES - )) - - return Column.from_unique_ptr(move(c_result)) + plc_column = char_types.all_characters_of_type( + source_strings.to_pylibcudf(mode="read"), + char_types.StringCharacterTypes.NUMERIC, + char_types.StringCharacterTypes.ALL_TYPES + ) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -152,17 +105,12 @@ def is_upper(Column source_strings): Returns a Column of boolean values with True for `source_strings` that contain only upper-case characters. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_all_characters_of_type( - source_view, - string_character_types.UPPER, - string_character_types.CASE_TYPES - )) - - return Column.from_unique_ptr(move(c_result)) + plc_column = char_types.all_characters_of_type( + source_strings.to_pylibcudf(mode="read"), + char_types.StringCharacterTypes.UPPER, + char_types.StringCharacterTypes.CASE_TYPES + ) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -171,17 +119,12 @@ def is_lower(Column source_strings): Returns a Column of boolean values with True for `source_strings` that contain only lower-case characters. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_all_characters_of_type( - source_view, - string_character_types.LOWER, - string_character_types.CASE_TYPES - )) - - return Column.from_unique_ptr(move(c_result)) + plc_column = char_types.all_characters_of_type( + source_strings.to_pylibcudf(mode="read"), + char_types.StringCharacterTypes.LOWER, + char_types.StringCharacterTypes.CASE_TYPES + ) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -190,14 +133,9 @@ def is_space(Column source_strings): Returns a Column of boolean values with True for `source_strings` that contains all characters which are spaces only. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_all_characters_of_type( - source_view, - string_character_types.SPACE, - string_character_types.ALL_TYPES - )) - - return Column.from_unique_ptr(move(c_result)) + plc_column = char_types.all_characters_of_type( + source_strings.to_pylibcudf(mode="read"), + char_types.StringCharacterTypes.SPACE, + char_types.StringCharacterTypes.ALL_TYPES + ) + return Column.from_pylibcudf(plc_column) diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/char_types.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/char_types.pxd index 5d54c1c3593..76afe047e8c 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/char_types.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/char_types.pxd @@ -22,9 +22,6 @@ cdef extern from "cudf/strings/char_types/char_types.hpp" \ CASE_TYPES ALL_TYPES -cdef extern from "cudf/strings/char_types/char_types.hpp" \ - namespace "cudf::strings" nogil: - cdef unique_ptr[column] all_characters_of_type( column_view source_strings, string_character_types types, diff --git a/python/pylibcudf/pylibcudf/strings/char_types.pxd b/python/pylibcudf/pylibcudf/strings/char_types.pxd index ad4e4cf61d8..f9f7d244212 100644 --- a/python/pylibcudf/pylibcudf/strings/char_types.pxd +++ b/python/pylibcudf/pylibcudf/strings/char_types.pxd @@ -1,3 +1,19 @@ # Copyright (c) 2024, NVIDIA CORPORATION. +from pylibcudf.column cimport Column from pylibcudf.libcudf.strings.char_types cimport string_character_types +from pylibcudf.scalar cimport Scalar + + +cpdef Column all_characters_of_type( + Column source_strings, + string_character_types types, + string_character_types verify_types +) + +cpdef Column filter_characters_of_type( + Column source_strings, + string_character_types types_to_remove, + Scalar replacement, + string_character_types types_to_keep +) diff --git a/python/pylibcudf/pylibcudf/strings/char_types.pyx b/python/pylibcudf/pylibcudf/strings/char_types.pyx index e7621fb4d84..6a24d79bc4b 100644 --- a/python/pylibcudf/pylibcudf/strings/char_types.pyx +++ b/python/pylibcudf/pylibcudf/strings/char_types.pyx @@ -1,4 +1,93 @@ # Copyright (c) 2024, NVIDIA CORPORATION. +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.strings cimport char_types as cpp_char_types +from pylibcudf.scalar cimport Scalar + +from cython.operator import dereference from pylibcudf.libcudf.strings.char_types import \ string_character_types as StringCharacterTypes # no-cython-lint + + +cpdef Column all_characters_of_type( + Column source_strings, + string_character_types types, + string_character_types verify_types +): + """ + Identifies strings where all characters match the specified type. + + Parameters + ---------- + source_strings : Column + Strings instance for this operation + types : StringCharacterTypes + The character types to check in each string + verify_types : StringCharacterTypes + Only verify against these character types. + + Returns + ------- + Column + New column of boolean results for each string + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_char_types.all_characters_of_type( + source_strings.view(), + types, + verify_types, + ) + ) + + return Column.from_libcudf(move(c_result)) + +cpdef Column filter_characters_of_type( + Column source_strings, + string_character_types types_to_remove, + Scalar replacement, + string_character_types types_to_keep +): + """ + Filter specific character types from a column of strings. + + Parameters + ---------- + source_strings : Column + Strings instance for this operation + types_to_remove : StringCharacterTypes + The character types to check in each string. + replacement : Scalar + The replacement character to use when removing characters + types_to_keep : StringCharacterTypes + Default `ALL_TYPES` means all characters of `types_to_remove` + will be filtered. + + Returns + ------- + Column + New column with the specified characters filtered out and + replaced with the specified replacement string. + """ + cdef const string_scalar* c_replacement = ( + replacement.c_obj.get() + ) + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_char_types.filter_characters_of_type( + source_strings.view(), + types_to_remove, + dereference(c_replacement), + types_to_keep, + ) + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_char_types.py b/python/pylibcudf/pylibcudf/tests/test_string_char_types.py new file mode 100644 index 00000000000..bcd030c019e --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_char_types.py @@ -0,0 +1,29 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pyarrow.compute as pc +import pylibcudf as plc +from utils import assert_column_eq + + +def test_all_characters_of_type(): + pa_array = pa.array(["1", "A"]) + result = plc.strings.char_types.all_characters_of_type( + plc.interop.from_arrow(pa_array), + plc.strings.char_types.StringCharacterTypes.ALPHA, + plc.strings.char_types.StringCharacterTypes.ALL_TYPES, + ) + expected = pc.utf8_is_alpha(pa_array) + assert_column_eq(result, expected) + + +def test_filter_characters_of_type(): + pa_array = pa.array(["=A="]) + result = plc.strings.char_types.filter_characters_of_type( + plc.interop.from_arrow(pa_array), + plc.strings.char_types.StringCharacterTypes.ALPHANUM, + plc.interop.from_arrow(pa.scalar(" ")), + plc.strings.char_types.StringCharacterTypes.ALL_TYPES, + ) + expected = pc.replace_substring(pa_array, "A", " ") + assert_column_eq(result, expected) From dae9d6899dd722c52bd42dd0fee51f4a6b336c93 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 1 Oct 2024 12:50:27 -1000 Subject: [PATCH 2/5] Add string.translate APIs to pylibcudf (#16934) Contributes to https://github.com/rapidsai/cudf/issues/15162 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16934 --- python/cudf/cudf/_lib/strings/translate.pyx | 93 ++----------- .../pylibcudf/libcudf/strings/CMakeLists.txt | 2 +- .../pylibcudf/libcudf/strings/translate.pxd | 14 +- .../pylibcudf/libcudf/strings/translate.pyx | 0 .../pylibcudf/strings/CMakeLists.txt | 1 + .../pylibcudf/pylibcudf/strings/__init__.pxd | 2 + .../pylibcudf/pylibcudf/strings/__init__.py | 2 + .../pylibcudf/pylibcudf/strings/translate.pxd | 14 ++ .../pylibcudf/pylibcudf/strings/translate.pyx | 122 ++++++++++++++++++ .../pylibcudf/tests/test_string_translate.py | 69 ++++++++++ 10 files changed, 232 insertions(+), 87 deletions(-) create mode 100644 python/pylibcudf/pylibcudf/libcudf/strings/translate.pyx create mode 100644 python/pylibcudf/pylibcudf/strings/translate.pxd create mode 100644 python/pylibcudf/pylibcudf/strings/translate.pyx create mode 100644 python/pylibcudf/pylibcudf/tests/test_string_translate.py diff --git a/python/cudf/cudf/_lib/strings/translate.pyx b/python/cudf/cudf/_lib/strings/translate.pyx index 3fad91bbfc0..3ef478532c2 100644 --- a/python/cudf/cudf/_lib/strings/translate.pyx +++ b/python/cudf/cudf/_lib/strings/translate.pyx @@ -1,25 +1,12 @@ # Copyright (c) 2018-2024, NVIDIA CORPORATION. from libcpp cimport bool -from libcpp.memory cimport unique_ptr -from libcpp.pair cimport pair -from libcpp.utility cimport move -from libcpp.vector cimport vector from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.strings.translate cimport ( - filter_characters as cpp_filter_characters, - filter_type, - translate as cpp_translate, -) -from pylibcudf.libcudf.types cimport char_utf8 - from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar + +import pylibcudf as plc @acquire_spill_lock() @@ -29,30 +16,11 @@ def translate(Column source_strings, Translates individual characters within each string if present in the mapping_table. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - cdef int table_size - table_size = len(mapping_table) - - cdef vector[pair[char_utf8, char_utf8]] c_mapping_table - c_mapping_table.reserve(table_size) - - for key in mapping_table: - value = mapping_table[key] - if type(value) is int: - value = chr(value) - if type(value) is str: - value = int.from_bytes(value.encode(), byteorder='big') - if type(key) is int: - key = chr(key) - if type(key) is str: - key = int.from_bytes(key.encode(), byteorder='big') - c_mapping_table.push_back((key, value)) - - with nogil: - c_result = move(cpp_translate(source_view, c_mapping_table)) - - return Column.from_unique_ptr(move(c_result)) + plc_result = plc.strings.translate.translate( + source_strings.to_pylibcudf(mode="read"), + mapping_table, + ) + return Column.from_pylibcudf(plc_result) @acquire_spill_lock() @@ -64,44 +32,11 @@ def filter_characters(Column source_strings, Removes or keeps individual characters within each string using the provided mapping_table. """ - - cdef DeviceScalar repl = py_repl.device_value - - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - cdef const string_scalar* scalar_repl = ( - repl.get_raw_ptr() + plc_result = plc.strings.translate.filter_characters( + source_strings.to_pylibcudf(mode="read"), + mapping_table, + plc.strings.translate.FilterType.KEEP + if keep else plc.strings.translate.FilterType.REMOVE, + py_repl.device_value.c_value ) - cdef int table_size - table_size = len(mapping_table) - - cdef vector[pair[char_utf8, char_utf8]] c_mapping_table - c_mapping_table.reserve(table_size) - - for key in mapping_table: - value = mapping_table[key] - if type(value) is int: - value = chr(value) - if type(value) is str: - value = int.from_bytes(value.encode(), byteorder='big') - if type(key) is int: - key = chr(key) - if type(key) is str: - key = int.from_bytes(key.encode(), byteorder='big') - c_mapping_table.push_back((key, value)) - - cdef filter_type c_keep - if keep is True: - c_keep = filter_type.KEEP - else: - c_keep = filter_type.REMOVE - - with nogil: - c_result = move(cpp_filter_characters( - source_view, - c_mapping_table, - c_keep, - scalar_repl[0] - )) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf(plc_result) diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/CMakeLists.txt b/python/pylibcudf/pylibcudf/libcudf/strings/CMakeLists.txt index abf4357f862..b8b4343173e 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/libcudf/strings/CMakeLists.txt @@ -12,7 +12,7 @@ # the License. # ============================================================================= -set(cython_sources char_types.pyx regex_flags.pyx side_type.pyx) +set(cython_sources char_types.pyx regex_flags.pyx side_type.pyx translate.pyx) set(linked_libraries cudf::cudf) diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/translate.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/translate.pxd index 85fa719128a..9fd24f2987b 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/translate.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/translate.pxd @@ -13,15 +13,15 @@ from pylibcudf.libcudf.types cimport char_utf8 cdef extern from "cudf/strings/translate.hpp" namespace "cudf::strings" nogil: cdef unique_ptr[column] translate( - column_view source_strings, + column_view input, vector[pair[char_utf8, char_utf8]] chars_table) except + - ctypedef enum filter_type: - KEEP 'cudf::strings::filter_type::KEEP', - REMOVE 'cudf::strings::filter_type::REMOVE' + cpdef enum class filter_type(bool): + KEEP + REMOVE cdef unique_ptr[column] filter_characters( - column_view source_strings, - vector[pair[char_utf8, char_utf8]] chars_table, - filter_type keep, + column_view input, + vector[pair[char_utf8, char_utf8]] characters_to_filter, + filter_type keep_characters, string_scalar replacement) except + diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/translate.pyx b/python/pylibcudf/pylibcudf/libcudf/strings/translate.pyx new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt index 142bc124ca2..052a0cf3c56 100644 --- a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt @@ -28,6 +28,7 @@ set(cython_sources side_type.pyx slice.pyx strip.pyx + translate.pyx ) set(linked_libraries cudf::cudf) diff --git a/python/pylibcudf/pylibcudf/strings/__init__.pxd b/python/pylibcudf/pylibcudf/strings/__init__.pxd index d8afccc7336..142637ff577 100644 --- a/python/pylibcudf/pylibcudf/strings/__init__.pxd +++ b/python/pylibcudf/pylibcudf/strings/__init__.pxd @@ -15,6 +15,7 @@ from . cimport ( replace, slice, strip, + translate, ) from .side_type cimport side_type @@ -34,4 +35,5 @@ __all__ = [ "slice", "strip", "side_type", + "translate", ] diff --git a/python/pylibcudf/pylibcudf/strings/__init__.py b/python/pylibcudf/pylibcudf/strings/__init__.py index 22452812e42..decfadd63a4 100644 --- a/python/pylibcudf/pylibcudf/strings/__init__.py +++ b/python/pylibcudf/pylibcudf/strings/__init__.py @@ -16,6 +16,7 @@ replace, slice, strip, + translate, ) from .side_type import SideType @@ -35,4 +36,5 @@ "slice", "strip", "SideType", + "translate", ] diff --git a/python/pylibcudf/pylibcudf/strings/translate.pxd b/python/pylibcudf/pylibcudf/strings/translate.pxd new file mode 100644 index 00000000000..0ca746801d7 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/translate.pxd @@ -0,0 +1,14 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from pylibcudf.column cimport Column +from pylibcudf.libcudf.strings.translate cimport filter_type +from pylibcudf.scalar cimport Scalar + + +cpdef Column translate(Column input, dict chars_table) + +cpdef Column filter_characters( + Column input, + dict characters_to_filter, + filter_type keep_characters, + Scalar replacement +) diff --git a/python/pylibcudf/pylibcudf/strings/translate.pyx b/python/pylibcudf/pylibcudf/strings/translate.pyx new file mode 100644 index 00000000000..a62c7ec4528 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/translate.pyx @@ -0,0 +1,122 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from libcpp.memory cimport unique_ptr +from libcpp.pair cimport pair +from libcpp.utility cimport move +from libcpp.vector cimport vector +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.strings cimport translate as cpp_translate +from pylibcudf.libcudf.types cimport char_utf8 +from pylibcudf.scalar cimport Scalar + +from cython.operator import dereference +from pylibcudf.libcudf.strings.translate import \ + filter_type as FilterType # no-cython-lint + + +cdef vector[pair[char_utf8, char_utf8]] _table_to_c_table(dict table): + """ + Convert str.maketrans table to cudf compatible table. + """ + cdef int table_size = len(table) + cdef vector[pair[char_utf8, char_utf8]] c_table + + c_table.reserve(table_size) + for key, value in table.items(): + if isinstance(value, int): + value = chr(value) + if isinstance(value, str): + value = int.from_bytes(value.encode(), byteorder='big') + if isinstance(key, int): + key = chr(key) + if isinstance(key, str): + key = int.from_bytes(key.encode(), byteorder='big') + c_table.push_back((key, value)) + + return c_table + + +cpdef Column translate(Column input, dict chars_table): + """ + Translates individual characters within each string. + + For details, see :cpp:func:`cudf::strings::translate`. + + Parameters + ---------- + input : Column + Strings instance for this operation + + chars_table : dict + Table of UTF-8 character mappings + + Returns + ------- + Column + New column with padded strings. + """ + cdef unique_ptr[column] c_result + cdef vector[pair[char_utf8, char_utf8]] c_chars_table = _table_to_c_table( + chars_table + ) + + with nogil: + c_result = move( + cpp_translate.translate( + input.view(), + c_chars_table + ) + ) + return Column.from_libcudf(move(c_result)) + + +cpdef Column filter_characters( + Column input, + dict characters_to_filter, + filter_type keep_characters, + Scalar replacement +): + """ + Removes ranges of characters from each string in a strings column. + + For details, see :cpp:func:`cudf::strings::filter_characters`. + + Parameters + ---------- + input : Column + Strings instance for this operation + + characters_to_filter : dict + Table of character ranges to filter on + + keep_characters : FilterType + If true, the `characters_to_filter` are retained + and all other characters are removed. + + replacement : Scalar + Replacement string for each character removed. + + Returns + ------- + Column + New column with filtered strings. + """ + cdef unique_ptr[column] c_result + cdef vector[pair[char_utf8, char_utf8]] c_characters_to_filter = _table_to_c_table( + characters_to_filter + ) + cdef const string_scalar* c_replacement = ( + replacement.c_obj.get() + ) + + with nogil: + c_result = move( + cpp_translate.filter_characters( + input.view(), + c_characters_to_filter, + keep_characters, + dereference(c_replacement), + ) + ) + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_translate.py b/python/pylibcudf/pylibcudf/tests/test_string_translate.py new file mode 100644 index 00000000000..2ae893e69fb --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_translate.py @@ -0,0 +1,69 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pylibcudf as plc +import pytest +from utils import assert_column_eq + + +@pytest.fixture +def data_col(): + pa_data_col = pa.array( + ["aa", "bbb", "cccc", "abcd", None], + type=pa.string(), + ) + return pa_data_col, plc.interop.from_arrow(pa_data_col) + + +@pytest.fixture +def trans_table(): + return str.maketrans("abd", "A Q") + + +def test_translate(data_col, trans_table): + pa_array, plc_col = data_col + result = plc.strings.translate.translate(plc_col, trans_table) + expected = pa.array( + [ + val.translate(trans_table) if isinstance(val, str) else None + for val in pa_array.to_pylist() + ] + ) + assert_column_eq(expected, result) + + +@pytest.mark.parametrize( + "keep", + [ + plc.strings.translate.FilterType.KEEP, + plc.strings.translate.FilterType.REMOVE, + ], +) +def test_filter_characters(data_col, trans_table, keep): + pa_array, plc_col = data_col + result = plc.strings.translate.filter_characters( + plc_col, trans_table, keep, plc.interop.from_arrow(pa.scalar("*")) + ) + exp_data = [] + flat_trans = set(trans_table.keys()).union(trans_table.values()) + for val in pa_array.to_pylist(): + if not isinstance(val, str): + exp_data.append(val) + else: + new_val = "" + for ch in val: + if ( + ch in flat_trans + and keep == plc.strings.translate.FilterType.KEEP + ): + new_val += ch + elif ( + ch not in flat_trans + and keep == plc.strings.translate.FilterType.REMOVE + ): + new_val += ch + else: + new_val += "*" + exp_data.append(new_val) + expected = pa.array(exp_data) + assert_column_eq(expected, result) From 76cae874a6f75c741055e50ebb839620ea98c8a0 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 1 Oct 2024 15:35:06 -1000 Subject: [PATCH 3/5] Add string.find_multiple APIs to pylibcudf (#16920) Redo at https://github.com/rapidsai/cudf/pull/16824 Contributes to https://github.com/rapidsai/cudf/issues/15162 Authors: - Matthew Roeschke (https://github.com/mroeschke) - Matthew Murray (https://github.com/Matt711) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/16920 --- .../pylibcudf/strings/find_multiple.rst | 6 +++ .../api_docs/pylibcudf/strings/index.rst | 1 + .../cudf/cudf/_lib/strings/find_multiple.pyx | 27 ++++--------- .../libcudf/strings/find_multiple.pxd | 2 +- .../pylibcudf/strings/CMakeLists.txt | 1 + .../pylibcudf/pylibcudf/strings/__init__.pxd | 1 + .../pylibcudf/pylibcudf/strings/__init__.py | 1 + .../pylibcudf/strings/find_multiple.pxd | 6 +++ .../pylibcudf/strings/find_multiple.pyx | 39 +++++++++++++++++++ .../tests/test_string_find_multiple.py | 22 +++++++++++ 10 files changed, 85 insertions(+), 21 deletions(-) create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/strings/find_multiple.rst create mode 100644 python/pylibcudf/pylibcudf/strings/find_multiple.pxd create mode 100644 python/pylibcudf/pylibcudf/strings/find_multiple.pyx create mode 100644 python/pylibcudf/pylibcudf/tests/test_string_find_multiple.py diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/find_multiple.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/find_multiple.rst new file mode 100644 index 00000000000..8e86b33b1a0 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/find_multiple.rst @@ -0,0 +1,6 @@ +============= +find_multiple +============= + +.. automodule:: pylibcudf.strings.find_multiple + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst index 9b1a6b72a88..7e0d128cfb2 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst @@ -9,6 +9,7 @@ strings contains extract find + find_multiple findall regex_flags regex_program diff --git a/python/cudf/cudf/_lib/strings/find_multiple.pyx b/python/cudf/cudf/_lib/strings/find_multiple.pyx index 1358f8e3c2c..39e0013769f 100644 --- a/python/cudf/cudf/_lib/strings/find_multiple.pyx +++ b/python/cudf/cudf/_lib/strings/find_multiple.pyx @@ -1,18 +1,11 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.strings.find_multiple cimport ( - find_multiple as cpp_find_multiple, -) - from cudf._lib.column cimport Column +import pylibcudf as plc + @acquire_spill_lock() def find_multiple(Column source_strings, Column target_strings): @@ -20,14 +13,8 @@ def find_multiple(Column source_strings, Column target_strings): Returns a column with character position values where each of the `target_strings` are found in each string of `source_strings`. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - cdef column_view target_view = target_strings.view() - - with nogil: - c_result = move(cpp_find_multiple( - source_view, - target_view - )) - - return Column.from_unique_ptr(move(c_result)) + plc_result = plc.strings.find_multiple.find_multiple( + source_strings.to_pylibcudf(mode="read"), + target_strings.to_pylibcudf(mode="read") + ) + return Column.from_pylibcudf(plc_result) diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/find_multiple.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/find_multiple.pxd index 0491644a10a..3d048c1f50b 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/find_multiple.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/find_multiple.pxd @@ -9,5 +9,5 @@ cdef extern from "cudf/strings/find_multiple.hpp" namespace "cudf::strings" \ nogil: cdef unique_ptr[column] find_multiple( - column_view source_strings, + column_view input, column_view targets) except + diff --git a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt index 052a0cf3c56..71b1e29afcb 100644 --- a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt @@ -20,6 +20,7 @@ set(cython_sources contains.pyx extract.pyx find.pyx + find_multiple.pyx findall.pyx regex_flags.pyx regex_program.pyx diff --git a/python/pylibcudf/pylibcudf/strings/__init__.pxd b/python/pylibcudf/pylibcudf/strings/__init__.pxd index 142637ff577..e6e6bee2750 100644 --- a/python/pylibcudf/pylibcudf/strings/__init__.pxd +++ b/python/pylibcudf/pylibcudf/strings/__init__.pxd @@ -9,6 +9,7 @@ from . cimport ( convert, extract, find, + find_multiple, findall, regex_flags, regex_program, diff --git a/python/pylibcudf/pylibcudf/strings/__init__.py b/python/pylibcudf/pylibcudf/strings/__init__.py index decfadd63a4..7f121279969 100644 --- a/python/pylibcudf/pylibcudf/strings/__init__.py +++ b/python/pylibcudf/pylibcudf/strings/__init__.py @@ -9,6 +9,7 @@ convert, extract, find, + find_multiple, findall, regex_flags, regex_program, diff --git a/python/pylibcudf/pylibcudf/strings/find_multiple.pxd b/python/pylibcudf/pylibcudf/strings/find_multiple.pxd new file mode 100644 index 00000000000..b7b3aefa336 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/find_multiple.pxd @@ -0,0 +1,6 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column + + +cpdef Column find_multiple(Column input, Column targets) diff --git a/python/pylibcudf/pylibcudf/strings/find_multiple.pyx b/python/pylibcudf/pylibcudf/strings/find_multiple.pyx new file mode 100644 index 00000000000..413fc1cb79d --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/find_multiple.pyx @@ -0,0 +1,39 @@ +# Copyright (c) 2020-2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.strings cimport find_multiple as cpp_find_multiple + + +cpdef Column find_multiple(Column input, Column targets): + """ + Returns a lists column with character position values where each + of the target strings are found in each string. + + For details, see :cpp:func:`cudf::strings::find_multiple`. + + Parameters + ---------- + input : Column + Strings instance for this operation + targets : Column + Strings to search for in each string + + Returns + ------- + Column + Lists column with character position values + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_find_multiple.find_multiple( + input.view(), + targets.view() + ) + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_find_multiple.py b/python/pylibcudf/pylibcudf/tests/test_string_find_multiple.py new file mode 100644 index 00000000000..d6b37a388f0 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_find_multiple.py @@ -0,0 +1,22 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pylibcudf as plc +from utils import assert_column_eq + + +def test_find_multiple(): + arr = pa.array(["abc", "def"]) + targets = pa.array(["a", "c", "e"]) + result = plc.strings.find_multiple.find_multiple( + plc.interop.from_arrow(arr), + plc.interop.from_arrow(targets), + ) + expected = pa.array( + [ + [elem.find(target) for target in targets.to_pylist()] + for elem in arr.to_pylist() + ], + type=pa.list_(pa.int32()), + ) + assert_column_eq(expected, result) From 6c9064ad074351591f8a4ad757b4d4e32789b8e5 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Tue, 1 Oct 2024 18:42:10 -0700 Subject: [PATCH 4/5] Refactor the `cuda_memcpy` functions to make them more usable (#16945) As we expanded the use of the `cuda_memcpy` functions, we realized that they are not very ergonomic, as they require caller to query `is_device_accessible` and pass the correct `PAGEABLE`/`PINNED` enum based on this. This PR aims to make the `cuda_memcpy` functions easier to use, and the call site changes hopefully showcase this. The new implementation takes spans as parameters and relies on the `host_span::is_device_accessible` to enable copy strategies for pinned memory. Host spans set this flag during construction; creating a host span from a `cudf::detail::host_vector` will correctly propagate `is_device_accessible`. Thus, call can simply* call the `cuda_memcpy` functions with their containers as parameters and rely on implicit conversion to `host_span`/`device_span`. Bonus - there's no way to mix up host and device memory pointers :+1: Sharp edges: * Conversion prevents template deduction, so calls that pass containers as parameters need to specify the template parameter (see changes in this PR). * ~The API copies the `min(input.size(), output.size())` bytes, as this is what we can do safely. This might cause surprises to users if they unintentionally pass spans of different sizes. We could instead throw in this case.~ Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Paul Mattione (https://github.com/pmattione-nvidia) - Bradley Dice (https://github.com/bdice) - MithunR (https://github.com/mythrocks) URL: https://github.com/rapidsai/cudf/pull/16945 --- .../cudf/detail/utilities/cuda_memcpy.hpp | 78 +++++++++++++++---- .../detail/utilities/vector_factories.hpp | 16 +--- cpp/src/io/json/host_tree_algorithms.cu | 13 +--- cpp/src/io/utilities/hostdevice_vector.hpp | 14 +--- cpp/src/utilities/cuda_memcpy.cu | 11 +-- 5 files changed, 76 insertions(+), 56 deletions(-) diff --git a/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp b/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp index 632d5a732ec..4f0c52c5954 100644 --- a/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp +++ b/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp @@ -17,6 +17,7 @@ #pragma once #include +#include #include @@ -25,33 +26,82 @@ namespace detail { enum class host_memory_kind : uint8_t { PINNED, PAGEABLE }; +void cuda_memcpy_async_impl( + void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream); + /** - * @brief Asynchronously copies data between the host and device. + * @brief Asynchronously copies data from host to device memory. * * Implementation may use different strategies depending on the size and type of host data. * - * @param dst Destination memory address - * @param src Source memory address - * @param size Number of bytes to copy - * @param kind Type of host memory + * @param dst Destination device memory + * @param src Source host memory * @param stream CUDA stream used for the copy */ -void cuda_memcpy_async( - void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream); +template +void cuda_memcpy_async(device_span dst, host_span src, rmm::cuda_stream_view stream) +{ + CUDF_EXPECTS(dst.size() == src.size(), "Mismatched sizes in cuda_memcpy_async"); + auto const is_pinned = src.is_device_accessible(); + cuda_memcpy_async_impl(dst.data(), + src.data(), + src.size_bytes(), + is_pinned ? host_memory_kind::PINNED : host_memory_kind::PAGEABLE, + stream); +} /** - * @brief Synchronously copies data between the host and device. + * @brief Asynchronously copies data from device to host memory. * * Implementation may use different strategies depending on the size and type of host data. * - * @param dst Destination memory address - * @param src Source memory address - * @param size Number of bytes to copy - * @param kind Type of host memory + * @param dst Destination host memory + * @param src Source device memory * @param stream CUDA stream used for the copy */ -void cuda_memcpy( - void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream); +template +void cuda_memcpy_async(host_span dst, device_span src, rmm::cuda_stream_view stream) +{ + CUDF_EXPECTS(dst.size() == src.size(), "Mismatched sizes in cuda_memcpy_async"); + auto const is_pinned = dst.is_device_accessible(); + cuda_memcpy_async_impl(dst.data(), + src.data(), + src.size_bytes(), + is_pinned ? host_memory_kind::PINNED : host_memory_kind::PAGEABLE, + stream); +} + +/** + * @brief Synchronously copies data from host to device memory. + * + * Implementation may use different strategies depending on the size and type of host data. + * + * @param dst Destination device memory + * @param src Source host memory + * @param stream CUDA stream used for the copy + */ +template +void cuda_memcpy(device_span dst, host_span src, rmm::cuda_stream_view stream) +{ + cuda_memcpy_async(dst, src, stream); + stream.synchronize(); +} + +/** + * @brief Synchronously copies data from device to host memory. + * + * Implementation may use different strategies depending on the size and type of host data. + * + * @param dst Destination host memory + * @param src Source device memory + * @param stream CUDA stream used for the copy + */ +template +void cuda_memcpy(host_span dst, device_span src, rmm::cuda_stream_view stream) +{ + cuda_memcpy_async(dst, src, stream); + stream.synchronize(); +} } // namespace detail } // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/detail/utilities/vector_factories.hpp b/cpp/include/cudf/detail/utilities/vector_factories.hpp index 953ae5b9308..1f1e7a2db77 100644 --- a/cpp/include/cudf/detail/utilities/vector_factories.hpp +++ b/cpp/include/cudf/detail/utilities/vector_factories.hpp @@ -101,12 +101,7 @@ rmm::device_uvector make_device_uvector_async(host_span source_data, rmm::device_async_resource_ref mr) { rmm::device_uvector ret(source_data.size(), stream, mr); - auto const is_pinned = source_data.is_device_accessible(); - cuda_memcpy_async(ret.data(), - source_data.data(), - source_data.size() * sizeof(T), - is_pinned ? host_memory_kind::PINNED : host_memory_kind::PAGEABLE, - stream); + cuda_memcpy_async(ret, source_data, stream); return ret; } @@ -405,13 +400,8 @@ host_vector make_empty_host_vector(size_t capacity, rmm::cuda_stream_view str template host_vector make_host_vector_async(device_span v, rmm::cuda_stream_view stream) { - auto result = make_host_vector(v.size(), stream); - auto const is_pinned = result.get_allocator().is_device_accessible(); - cuda_memcpy_async(result.data(), - v.data(), - v.size() * sizeof(T), - is_pinned ? host_memory_kind::PINNED : host_memory_kind::PAGEABLE, - stream); + auto result = make_host_vector(v.size(), stream); + cuda_memcpy_async(result, v, stream); return result; } diff --git a/cpp/src/io/json/host_tree_algorithms.cu b/cpp/src/io/json/host_tree_algorithms.cu index 5855f1b5a5f..f7e8134b68d 100644 --- a/cpp/src/io/json/host_tree_algorithms.cu +++ b/cpp/src/io/json/host_tree_algorithms.cu @@ -634,11 +634,8 @@ std::pair, hashmap_of_device_columns> build_tree is_mixed_type_column[this_col_id] == 1) column_categories[this_col_id] = NC_STR; } - cudf::detail::cuda_memcpy_async(d_column_tree.node_categories.begin(), - column_categories.data(), - column_categories.size() * sizeof(column_categories[0]), - cudf::detail::host_memory_kind::PAGEABLE, - stream); + cudf::detail::cuda_memcpy_async( + d_column_tree.node_categories, column_categories, stream); } // ignore all children of columns forced as string @@ -653,11 +650,7 @@ std::pair, hashmap_of_device_columns> build_tree forced_as_string_column[this_col_id]) column_categories[this_col_id] = NC_STR; } - cudf::detail::cuda_memcpy_async(d_column_tree.node_categories.begin(), - column_categories.data(), - column_categories.size() * sizeof(column_categories[0]), - cudf::detail::host_memory_kind::PAGEABLE, - stream); + cudf::detail::cuda_memcpy_async(d_column_tree.node_categories, column_categories, stream); // restore unique_col_ids order std::sort(h_range_col_id_it, h_range_col_id_it + num_columns, [](auto const& a, auto const& b) { diff --git a/cpp/src/io/utilities/hostdevice_vector.hpp b/cpp/src/io/utilities/hostdevice_vector.hpp index aed745c42dd..634e6d78ebc 100644 --- a/cpp/src/io/utilities/hostdevice_vector.hpp +++ b/cpp/src/io/utilities/hostdevice_vector.hpp @@ -125,23 +125,17 @@ class hostdevice_vector { void host_to_device_async(rmm::cuda_stream_view stream) { - cuda_memcpy_async(device_ptr(), host_ptr(), size_bytes(), host_memory_kind::PINNED, stream); + cuda_memcpy_async(d_data, h_data, stream); } - void host_to_device_sync(rmm::cuda_stream_view stream) - { - cuda_memcpy(device_ptr(), host_ptr(), size_bytes(), host_memory_kind::PINNED, stream); - } + void host_to_device_sync(rmm::cuda_stream_view stream) { cuda_memcpy(d_data, h_data, stream); } void device_to_host_async(rmm::cuda_stream_view stream) { - cuda_memcpy_async(host_ptr(), device_ptr(), size_bytes(), host_memory_kind::PINNED, stream); + cuda_memcpy_async(h_data, d_data, stream); } - void device_to_host_sync(rmm::cuda_stream_view stream) - { - cuda_memcpy(host_ptr(), device_ptr(), size_bytes(), host_memory_kind::PINNED, stream); - } + void device_to_host_sync(rmm::cuda_stream_view stream) { cuda_memcpy(h_data, d_data, stream); } /** * @brief Converts a hostdevice_vector into a hostdevice_span. diff --git a/cpp/src/utilities/cuda_memcpy.cu b/cpp/src/utilities/cuda_memcpy.cu index 0efb881eb3e..c0af27a1748 100644 --- a/cpp/src/utilities/cuda_memcpy.cu +++ b/cpp/src/utilities/cuda_memcpy.cu @@ -30,7 +30,7 @@ namespace cudf::detail { namespace { // Simple kernel to copy between device buffers -CUDF_KERNEL void copy_kernel(char const* src, char* dst, size_t n) +CUDF_KERNEL void copy_kernel(char const* __restrict__ src, char* __restrict__ dst, size_t n) { auto const idx = cudf::detail::grid_1d::global_thread_id(); if (idx < n) { dst[idx] = src[idx]; } @@ -61,7 +61,7 @@ void copy_pageable(void* dst, void const* src, std::size_t size, rmm::cuda_strea }; // namespace -void cuda_memcpy_async( +void cuda_memcpy_async_impl( void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream) { if (kind == host_memory_kind::PINNED) { @@ -73,11 +73,4 @@ void cuda_memcpy_async( } } -void cuda_memcpy( - void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream) -{ - cuda_memcpy_async(dst, src, size, kind, stream); - stream.synchronize(); -} - } // namespace cudf::detail From bac81cb8f4c61c9a81e30e79d03c323406bf657a Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 1 Oct 2024 15:54:05 -1000 Subject: [PATCH 5/5] Add string.split APIs to pylibcudf (#16940) Contributes to https://github.com/rapidsai/cudf/issues/15162 Includes `split/split.pxd` and `split/partition.pxd` Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - https://github.com/brandon-b-miller URL: https://github.com/rapidsai/cudf/pull/16940 --- .../api_docs/pylibcudf/strings/index.rst | 1 + .../api_docs/pylibcudf/strings/split.rst | 6 + .../cudf/_lib/strings/split/partition.pyx | 59 +--- python/cudf/cudf/_lib/strings/split/split.pyx | 217 +++--------- python/cudf/cudf/core/column/string.py | 12 +- .../libcudf/strings/split/partition.pxd | 4 +- .../pylibcudf/libcudf/strings/split/split.pxd | 24 +- .../pylibcudf/strings/CMakeLists.txt | 1 + .../pylibcudf/pylibcudf/strings/__init__.pxd | 2 + .../pylibcudf/pylibcudf/strings/__init__.py | 2 + .../pylibcudf/strings/split/CMakeLists.txt | 22 ++ .../pylibcudf/strings/split/__init__.pxd | 2 + .../pylibcudf/strings/split/__init__.py | 2 + .../pylibcudf/strings/split/partition.pxd | 10 + .../pylibcudf/strings/split/partition.pyx | 95 +++++ .../pylibcudf/strings/split/split.pxd | 24 ++ .../pylibcudf/strings/split/split.pyx | 326 ++++++++++++++++++ .../tests/test_string_split_partition.py | 43 +++ .../tests/test_string_split_split.py | 130 +++++++ 19 files changed, 750 insertions(+), 232 deletions(-) create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/strings/split.rst create mode 100644 python/pylibcudf/pylibcudf/strings/split/CMakeLists.txt create mode 100644 python/pylibcudf/pylibcudf/strings/split/__init__.pxd create mode 100644 python/pylibcudf/pylibcudf/strings/split/__init__.py create mode 100644 python/pylibcudf/pylibcudf/strings/split/partition.pxd create mode 100644 python/pylibcudf/pylibcudf/strings/split/partition.pyx create mode 100644 python/pylibcudf/pylibcudf/strings/split/split.pxd create mode 100644 python/pylibcudf/pylibcudf/strings/split/split.pyx create mode 100644 python/pylibcudf/pylibcudf/tests/test_string_split_partition.py create mode 100644 python/pylibcudf/pylibcudf/tests/test_string_split_split.py diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst index 7e0d128cfb2..e73ea3370ec 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst @@ -16,4 +16,5 @@ strings repeat replace slice + split strip diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/split.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/split.rst new file mode 100644 index 00000000000..cba96e86f45 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/split.rst @@ -0,0 +1,6 @@ +===== +split +===== + +.. automodule:: pylibcudf.strings.split + :members: diff --git a/python/cudf/cudf/_lib/strings/split/partition.pyx b/python/cudf/cudf/_lib/strings/split/partition.pyx index a81fb18e752..5319addc41c 100644 --- a/python/cudf/cudf/_lib/strings/split/partition.pyx +++ b/python/cudf/cudf/_lib/strings/split/partition.pyx @@ -1,21 +1,10 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.strings.split.partition cimport ( - partition as cpp_partition, - rpartition as cpp_rpartition, -) -from pylibcudf.libcudf.table.table cimport table - from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar -from cudf._lib.utils cimport data_from_unique_ptr + +import pylibcudf as plc @acquire_spill_lock() @@ -25,25 +14,11 @@ def partition(Column source_strings, Returns data by splitting the `source_strings` column at the first occurrence of the specified `py_delimiter`. """ - - cdef DeviceScalar delimiter = py_delimiter.device_value - - cdef unique_ptr[table] c_result - cdef column_view source_view = source_strings.view() - cdef const string_scalar* scalar_str = ( - delimiter.get_raw_ptr() - ) - - with nogil: - c_result = move(cpp_partition( - source_view, - scalar_str[0] - )) - - return data_from_unique_ptr( - move(c_result), - column_names=range(0, c_result.get()[0].num_columns()) + plc_table = plc.strings.split.partition.partition( + source_strings.to_pylibcudf(mode="read"), + py_delimiter.device_value.c_value ) + return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns())) @acquire_spill_lock() @@ -53,22 +28,8 @@ def rpartition(Column source_strings, Returns a Column by splitting the `source_strings` column at the last occurrence of the specified `py_delimiter`. """ - - cdef DeviceScalar delimiter = py_delimiter.device_value - - cdef unique_ptr[table] c_result - cdef column_view source_view = source_strings.view() - cdef const string_scalar* scalar_str = ( - delimiter.get_raw_ptr() - ) - - with nogil: - c_result = move(cpp_rpartition( - source_view, - scalar_str[0] - )) - - return data_from_unique_ptr( - move(c_result), - column_names=range(0, c_result.get()[0].num_columns()) + plc_table = plc.strings.split.partition.rpartition( + source_strings.to_pylibcudf(mode="read"), + py_delimiter.device_value.c_value ) + return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns())) diff --git a/python/cudf/cudf/_lib/strings/split/split.pyx b/python/cudf/cudf/_lib/strings/split/split.pyx index f481fea4c51..4ec6c7073d8 100644 --- a/python/cudf/cudf/_lib/strings/split/split.pyx +++ b/python/cudf/cudf/_lib/strings/split/split.pyx @@ -1,33 +1,12 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. -from cython.operator cimport dereference -from libcpp.memory cimport unique_ptr -from libcpp.string cimport string -from libcpp.utility cimport move - from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.strings.regex_flags cimport regex_flags -from pylibcudf.libcudf.strings.regex_program cimport regex_program -from pylibcudf.libcudf.strings.split.split cimport ( - rsplit as cpp_rsplit, - rsplit_re as cpp_rsplit_re, - rsplit_record as cpp_rsplit_record, - rsplit_record_re as cpp_rsplit_record_re, - split as cpp_split, - split_re as cpp_split_re, - split_record as cpp_split_record, - split_record_re as cpp_split_record_re, -) -from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.types cimport size_type from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar -from cudf._lib.utils cimport data_from_unique_ptr + +import pylibcudf as plc @acquire_spill_lock() @@ -39,26 +18,12 @@ def split(Column source_strings, column around the specified `py_delimiter`. The split happens from beginning. """ - - cdef DeviceScalar delimiter = py_delimiter.device_value - - cdef unique_ptr[table] c_result - cdef column_view source_view = source_strings.view() - cdef const string_scalar* scalar_str = ( - delimiter.get_raw_ptr() - ) - - with nogil: - c_result = move(cpp_split( - source_view, - scalar_str[0], - maxsplit - )) - - return data_from_unique_ptr( - move(c_result), - column_names=range(0, c_result.get()[0].num_columns()) + plc_table = plc.strings.split.split.split( + source_strings.to_pylibcudf(mode="read"), + py_delimiter.device_value.c_value, + maxsplit, ) + return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns())) @acquire_spill_lock() @@ -70,25 +35,12 @@ def split_record(Column source_strings, column around the specified `py_delimiter`. The split happens from beginning. """ - - cdef DeviceScalar delimiter = py_delimiter.device_value - - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - cdef const string_scalar* scalar_str = ( - delimiter.get_raw_ptr() - ) - - with nogil: - c_result = move(cpp_split_record( - source_view, - scalar_str[0], - maxsplit - )) - - return Column.from_unique_ptr( - move(c_result), + plc_column = plc.strings.split.split.split_record( + source_strings.to_pylibcudf(mode="read"), + py_delimiter.device_value.c_value, + maxsplit, ) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -100,26 +52,12 @@ def rsplit(Column source_strings, column around the specified `py_delimiter`. The split happens from the end. """ - - cdef DeviceScalar delimiter = py_delimiter.device_value - - cdef unique_ptr[table] c_result - cdef column_view source_view = source_strings.view() - cdef const string_scalar* scalar_str = ( - delimiter.get_raw_ptr() - ) - - with nogil: - c_result = move(cpp_rsplit( - source_view, - scalar_str[0], - maxsplit - )) - - return data_from_unique_ptr( - move(c_result), - column_names=range(0, c_result.get()[0].num_columns()) + plc_table = plc.strings.split.split.rsplit( + source_strings.to_pylibcudf(mode="read"), + py_delimiter.device_value.c_value, + maxsplit, ) + return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns())) @acquire_spill_lock() @@ -131,25 +69,12 @@ def rsplit_record(Column source_strings, column around the specified `py_delimiter`. The split happens from the end. """ - - cdef DeviceScalar delimiter = py_delimiter.device_value - - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - cdef const string_scalar* scalar_str = ( - delimiter.get_raw_ptr() - ) - - with nogil: - c_result = move(cpp_rsplit_record( - source_view, - scalar_str[0], - maxsplit - )) - - return Column.from_unique_ptr( - move(c_result), + plc_column = plc.strings.split.split.rsplit_record( + source_strings.to_pylibcudf(mode="read"), + py_delimiter.device_value.c_value, + maxsplit, ) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -160,24 +85,15 @@ def split_re(Column source_strings, Returns data by splitting the `source_strings` column around the delimiters identified by `pattern`. """ - cdef unique_ptr[table] c_result - cdef column_view source_view = source_strings.view() - cdef string pattern_string = str(pattern).encode() - cdef regex_flags c_flags = regex_flags.DEFAULT - cdef unique_ptr[regex_program] c_prog - - with nogil: - c_prog = move(regex_program.create(pattern_string, c_flags)) - c_result = move(cpp_split_re( - source_view, - dereference(c_prog), - maxsplit - )) - - return data_from_unique_ptr( - move(c_result), - column_names=range(0, c_result.get()[0].num_columns()) + plc_table = plc.strings.split.split.split_re( + source_strings.to_pylibcudf(mode="read"), + plc.strings.regex_program.RegexProgram.create( + str(pattern), + plc.strings.regex_flags.RegexFlags.DEFAULT, + ), + maxsplit, ) + return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns())) @acquire_spill_lock() @@ -189,24 +105,15 @@ def rsplit_re(Column source_strings, column around the delimiters identified by `pattern`. The delimiters are searched starting from the end of each string. """ - cdef unique_ptr[table] c_result - cdef column_view source_view = source_strings.view() - cdef string pattern_string = str(pattern).encode() - cdef regex_flags c_flags = regex_flags.DEFAULT - cdef unique_ptr[regex_program] c_prog - - with nogil: - c_prog = move(regex_program.create(pattern_string, c_flags)) - c_result = move(cpp_rsplit_re( - source_view, - dereference(c_prog), - maxsplit - )) - - return data_from_unique_ptr( - move(c_result), - column_names=range(0, c_result.get()[0].num_columns()) + plc_table = plc.strings.split.split.rsplit_re( + source_strings.to_pylibcudf(mode="read"), + plc.strings.regex_program.RegexProgram.create( + str(pattern), + plc.strings.regex_flags.RegexFlags.DEFAULT, + ), + maxsplit, ) + return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns())) @acquire_spill_lock() @@ -217,23 +124,15 @@ def split_record_re(Column source_strings, Returns a Column by splitting the `source_strings` column around the delimiters identified by `pattern`. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - cdef string pattern_string = str(pattern).encode() - cdef regex_flags c_flags = regex_flags.DEFAULT - cdef unique_ptr[regex_program] c_prog - - with nogil: - c_prog = move(regex_program.create(pattern_string, c_flags)) - c_result = move(cpp_split_record_re( - source_view, - dereference(c_prog), - maxsplit - )) - - return Column.from_unique_ptr( - move(c_result), + plc_column = plc.strings.split.split.split_record_re( + source_strings.to_pylibcudf(mode="read"), + plc.strings.regex_program.RegexProgram.create( + str(pattern), + plc.strings.regex_flags.RegexFlags.DEFAULT, + ), + maxsplit, ) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -245,20 +144,12 @@ def rsplit_record_re(Column source_strings, column around the delimiters identified by `pattern`. The delimiters are searched starting from the end of each string. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - cdef string pattern_string = str(pattern).encode() - cdef regex_flags c_flags = regex_flags.DEFAULT - cdef unique_ptr[regex_program] c_prog - - with nogil: - c_prog = move(regex_program.create(pattern_string, c_flags)) - c_result = move(cpp_rsplit_record_re( - source_view, - dereference(c_prog), - maxsplit - )) - - return Column.from_unique_ptr( - move(c_result), + plc_column = plc.strings.split.split.rsplit_record_re( + source_strings.to_pylibcudf(mode="read"), + plc.strings.regex_program.RegexProgram.create( + str(pattern), + plc.strings.regex_flags.RegexFlags.DEFAULT, + ), + maxsplit, ) + return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 4463e3280df..da422db5eae 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -2546,9 +2546,9 @@ def split( result_table = {0: self._column.copy()} else: if regex is True: - data, _ = libstrings.split_re(self._column, pat, n) + data = libstrings.split_re(self._column, pat, n) else: - data, _ = libstrings.split( + data = libstrings.split( self._column, cudf.Scalar(pat, "str"), n ) if len(data) == 1 and data[0].null_count == len(self._column): @@ -2719,9 +2719,9 @@ def rsplit( result_table = {0: self._column.copy()} else: if regex is True: - data, _ = libstrings.rsplit_re(self._column, pat, n) + data = libstrings.rsplit_re(self._column, pat, n) else: - data, _ = libstrings.rsplit( + data = libstrings.rsplit( self._column, cudf.Scalar(pat, "str"), n ) if len(data) == 1 and data[0].null_count == len(self._column): @@ -2820,7 +2820,7 @@ def partition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex: sep = " " return self._return_or_inplace( - libstrings.partition(self._column, cudf.Scalar(sep, "str"))[0], + libstrings.partition(self._column, cudf.Scalar(sep, "str")), expand=expand, ) @@ -2885,7 +2885,7 @@ def rpartition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex: sep = " " return self._return_or_inplace( - libstrings.rpartition(self._column, cudf.Scalar(sep, "str"))[0], + libstrings.rpartition(self._column, cudf.Scalar(sep, "str")), expand=expand, ) diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/split/partition.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/split/partition.pxd index 4162e886a7d..4299cf62e99 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/split/partition.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/split/partition.pxd @@ -12,9 +12,9 @@ cdef extern from "cudf/strings/split/partition.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[table] partition( - column_view source_strings, + column_view input, string_scalar delimiter) except + cdef unique_ptr[table] rpartition( - column_view source_strings, + column_view input, string_scalar delimiter) except + diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/split/split.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/split/split.pxd index 3046149aebb..a22a79fc7d7 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/split/split.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/split/split.pxd @@ -14,22 +14,22 @@ cdef extern from "cudf/strings/split/split.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[table] split( - column_view source_strings, + column_view strings_column, string_scalar delimiter, size_type maxsplit) except + cdef unique_ptr[table] rsplit( - column_view source_strings, + column_view strings_column, string_scalar delimiter, size_type maxsplit) except + cdef unique_ptr[column] split_record( - column_view source_strings, + column_view strings, string_scalar delimiter, size_type maxsplit) except + cdef unique_ptr[column] rsplit_record( - column_view source_strings, + column_view strings, string_scalar delimiter, size_type maxsplit) except + @@ -38,21 +38,21 @@ cdef extern from "cudf/strings/split/split_re.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[table] split_re( - const column_view& source_strings, - regex_program, + const column_view& input, + regex_program prog, size_type maxsplit) except + cdef unique_ptr[table] rsplit_re( - const column_view& source_strings, - regex_program, + const column_view& input, + regex_program prog, size_type maxsplit) except + cdef unique_ptr[column] split_record_re( - const column_view& source_strings, - regex_program, + const column_view& input, + regex_program prog, size_type maxsplit) except + cdef unique_ptr[column] rsplit_record_re( - const column_view& source_strings, - regex_program, + const column_view& input, + regex_program prog, size_type maxsplit) except + diff --git a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt index 71b1e29afcb..d92f806efbe 100644 --- a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt @@ -40,3 +40,4 @@ rapids_cython_create_modules( ) add_subdirectory(convert) +add_subdirectory(split) diff --git a/python/pylibcudf/pylibcudf/strings/__init__.pxd b/python/pylibcudf/pylibcudf/strings/__init__.pxd index e6e6bee2750..788e2c99ab1 100644 --- a/python/pylibcudf/pylibcudf/strings/__init__.pxd +++ b/python/pylibcudf/pylibcudf/strings/__init__.pxd @@ -15,6 +15,7 @@ from . cimport ( regex_program, replace, slice, + split, strip, translate, ) @@ -35,6 +36,7 @@ __all__ = [ "replace", "slice", "strip", + "split", "side_type", "translate", ] diff --git a/python/pylibcudf/pylibcudf/strings/__init__.py b/python/pylibcudf/pylibcudf/strings/__init__.py index 7f121279969..bcaeb073d0b 100644 --- a/python/pylibcudf/pylibcudf/strings/__init__.py +++ b/python/pylibcudf/pylibcudf/strings/__init__.py @@ -16,6 +16,7 @@ repeat, replace, slice, + split, strip, translate, ) @@ -36,6 +37,7 @@ "replace", "slice", "strip", + "split", "SideType", "translate", ] diff --git a/python/pylibcudf/pylibcudf/strings/split/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/split/CMakeLists.txt new file mode 100644 index 00000000000..8f544f6f537 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/split/CMakeLists.txt @@ -0,0 +1,22 @@ +# ============================================================================= +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= + +set(cython_sources partition.pyx split.pyx) + +set(linked_libraries cudf::cudf) +rapids_cython_create_modules( + CXX + SOURCE_FILES "${cython_sources}" + LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_strings_ ASSOCIATED_TARGETS cudf +) diff --git a/python/pylibcudf/pylibcudf/strings/split/__init__.pxd b/python/pylibcudf/pylibcudf/strings/split/__init__.pxd new file mode 100644 index 00000000000..72086e57d9f --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/split/__init__.pxd @@ -0,0 +1,2 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from . cimport partition, split diff --git a/python/pylibcudf/pylibcudf/strings/split/__init__.py b/python/pylibcudf/pylibcudf/strings/split/__init__.py new file mode 100644 index 00000000000..2033e5e275b --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/split/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from . import partition, split diff --git a/python/pylibcudf/pylibcudf/strings/split/partition.pxd b/python/pylibcudf/pylibcudf/strings/split/partition.pxd new file mode 100644 index 00000000000..c18257a4787 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/split/partition.pxd @@ -0,0 +1,10 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column +from pylibcudf.scalar cimport Scalar +from pylibcudf.table cimport Table + + +cpdef Table partition(Column input, Scalar delimiter=*) + +cpdef Table rpartition(Column input, Scalar delimiter=*) diff --git a/python/pylibcudf/pylibcudf/strings/split/partition.pyx b/python/pylibcudf/pylibcudf/strings/split/partition.pyx new file mode 100644 index 00000000000..ecc959e65b0 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/split/partition.pyx @@ -0,0 +1,95 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.scalar.scalar_factories cimport ( + make_string_scalar as cpp_make_string_scalar, +) +from pylibcudf.libcudf.strings.split cimport partition as cpp_partition +from pylibcudf.libcudf.table.table cimport table +from pylibcudf.scalar cimport Scalar +from pylibcudf.table cimport Table + +from cython.operator import dereference + + +cpdef Table partition(Column input, Scalar delimiter=None): + """ + Returns a set of 3 columns by splitting each string using the + specified delimiter. + + For details, see :cpp:func:`cudf::strings::partition`. + + Parameters + ---------- + input : Column + Strings instance for this operation + + delimiter : Scalar + UTF-8 encoded string indicating where to split each string. + + Returns + ------- + Table + New table of strings columns + """ + cdef unique_ptr[table] c_result + cdef const string_scalar* c_delimiter = ( + delimiter.c_obj.get() + ) + + if delimiter is None: + delimiter = Scalar.from_libcudf( + cpp_make_string_scalar("".encode()) + ) + + with nogil: + c_result = move( + cpp_partition.partition( + input.view(), + dereference(c_delimiter) + ) + ) + + return Table.from_libcudf(move(c_result)) + +cpdef Table rpartition(Column input, Scalar delimiter=None): + """ + Returns a set of 3 columns by splitting each string using the + specified delimiter starting from the end of each string. + + For details, see :cpp:func:`cudf::strings::rpartition`. + + Parameters + ---------- + input : Column + Strings instance for this operation + + delimiter : Scalar + UTF-8 encoded string indicating where to split each string. + + Returns + ------- + Table + New strings columns + """ + cdef unique_ptr[table] c_result + cdef const string_scalar* c_delimiter = ( + delimiter.c_obj.get() + ) + + if delimiter is None: + delimiter = Scalar.from_libcudf( + cpp_make_string_scalar("".encode()) + ) + + with nogil: + c_result = move( + cpp_partition.rpartition( + input.view(), + dereference(c_delimiter) + ) + ) + + return Table.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/split/split.pxd b/python/pylibcudf/pylibcudf/strings/split/split.pxd new file mode 100644 index 00000000000..355a1874298 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/split/split.pxd @@ -0,0 +1,24 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column +from pylibcudf.libcudf.types cimport size_type +from pylibcudf.scalar cimport Scalar +from pylibcudf.strings.regex_program cimport RegexProgram +from pylibcudf.table cimport Table + + +cpdef Table split(Column strings_column, Scalar delimiter, size_type maxsplit) + +cpdef Table rsplit(Column strings_column, Scalar delimiter, size_type maxsplit) + +cpdef Column split_record(Column strings, Scalar delimiter, size_type maxsplit) + +cpdef Column rsplit_record(Column strings, Scalar delimiter, size_type maxsplit) + +cpdef Table split_re(Column input, RegexProgram prog, size_type maxsplit) + +cpdef Table rsplit_re(Column input, RegexProgram prog, size_type maxsplit) + +cpdef Column split_record_re(Column input, RegexProgram prog, size_type maxsplit) + +cpdef Column rsplit_record_re(Column input, RegexProgram prog, size_type maxsplit) diff --git a/python/pylibcudf/pylibcudf/strings/split/split.pyx b/python/pylibcudf/pylibcudf/strings/split/split.pyx new file mode 100644 index 00000000000..a7d7f39fc47 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/split/split.pyx @@ -0,0 +1,326 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.strings.split cimport split as cpp_split +from pylibcudf.libcudf.table.table cimport table +from pylibcudf.libcudf.types cimport size_type +from pylibcudf.scalar cimport Scalar +from pylibcudf.strings.regex_program cimport RegexProgram +from pylibcudf.table cimport Table + +from cython.operator import dereference + + +cpdef Table split(Column strings_column, Scalar delimiter, size_type maxsplit): + """ + Returns a list of columns by splitting each string using the + specified delimiter. + + For details, see :cpp:func:`cudf::strings::split`. + + Parameters + ---------- + strings_column : Column + Strings instance for this operation + + delimiter : Scalar + UTF-8 encoded string indicating the split points in each string. + + maxsplit : int + Maximum number of splits to perform. -1 indicates all possible + splits on each string. + + Returns + ------- + Table + New table of strings columns + """ + cdef unique_ptr[table] c_result + cdef const string_scalar* c_delimiter = ( + delimiter.c_obj.get() + ) + + with nogil: + c_result = move( + cpp_split.split( + strings_column.view(), + dereference(c_delimiter), + maxsplit, + ) + ) + + return Table.from_libcudf(move(c_result)) + + +cpdef Table rsplit(Column strings_column, Scalar delimiter, size_type maxsplit): + """ + Returns a list of columns by splitting each string using the + specified delimiter starting from the end of each string. + + For details, see :cpp:func:`cudf::strings::rsplit`. + + Parameters + ---------- + strings_column : Column + Strings instance for this operation + + delimiter : Scalar + UTF-8 encoded string indicating the split points in each string. + + maxsplit : int + Maximum number of splits to perform. -1 indicates all possible + splits on each string. + + Returns + ------- + Table + New table of strings columns. + """ + cdef unique_ptr[table] c_result + cdef const string_scalar* c_delimiter = ( + delimiter.c_obj.get() + ) + + with nogil: + c_result = move( + cpp_split.rsplit( + strings_column.view(), + dereference(c_delimiter), + maxsplit, + ) + ) + + return Table.from_libcudf(move(c_result)) + +cpdef Column split_record(Column strings, Scalar delimiter, size_type maxsplit): + """ + Splits individual strings elements into a list of strings. + + For details, see :cpp:func:`cudf::strings::split_record`. + + Parameters + ---------- + strings : Column + A column of string elements to be split. + + delimiter : Scalar + The string to identify split points in each string. + + maxsplit : int + Maximum number of splits to perform. -1 indicates all possible + splits on each string. + + Returns + ------- + Column + Lists column of strings. + """ + cdef unique_ptr[column] c_result + cdef const string_scalar* c_delimiter = ( + delimiter.c_obj.get() + ) + + with nogil: + c_result = move( + cpp_split.split_record( + strings.view(), + dereference(c_delimiter), + maxsplit, + ) + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column rsplit_record(Column strings, Scalar delimiter, size_type maxsplit): + """ + Splits individual strings elements into a list of strings starting + from the end of each string. + + For details, see :cpp:func:`cudf::strings::rsplit_record`. + + Parameters + ---------- + strings : Column + A column of string elements to be split. + + delimiter : Scalar + The string to identify split points in each string. + + maxsplit : int + Maximum number of splits to perform. -1 indicates all possible + splits on each string. + + Returns + ------- + Column + Lists column of strings. + """ + cdef unique_ptr[column] c_result + cdef const string_scalar* c_delimiter = ( + delimiter.c_obj.get() + ) + + with nogil: + c_result = move( + cpp_split.rsplit_record( + strings.view(), + dereference(c_delimiter), + maxsplit, + ) + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Table split_re(Column input, RegexProgram prog, size_type maxsplit): + """ + Splits strings elements into a table of strings columns + using a regex_program's pattern to delimit each string. + + For details, see :cpp:func:`cudf::strings::split_re`. + + Parameters + ---------- + input : Column + A column of string elements to be split. + + prog : RegexProgram + Regex program instance. + + maxsplit : int + Maximum number of splits to perform. -1 indicates all possible + splits on each string. + + Returns + ------- + Table + A table of columns of strings. + """ + cdef unique_ptr[table] c_result + + with nogil: + c_result = move( + cpp_split.split_re( + input.view(), + prog.c_obj.get()[0], + maxsplit, + ) + ) + + return Table.from_libcudf(move(c_result)) + +cpdef Table rsplit_re(Column input, RegexProgram prog, size_type maxsplit): + """ + Splits strings elements into a table of strings columns + using a regex_program's pattern to delimit each string starting from + the end of the string. + + For details, see :cpp:func:`cudf::strings::rsplit_re`. + + Parameters + ---------- + input : Column + A column of string elements to be split. + + prog : RegexProgram + Regex program instance. + + maxsplit : int + Maximum number of splits to perform. -1 indicates all possible + splits on each string. + + Returns + ------- + Table + A table of columns of strings. + """ + cdef unique_ptr[table] c_result + + with nogil: + c_result = move( + cpp_split.rsplit_re( + input.view(), + prog.c_obj.get()[0], + maxsplit, + ) + ) + + return Table.from_libcudf(move(c_result)) + +cpdef Column split_record_re(Column input, RegexProgram prog, size_type maxsplit): + """ + Splits strings elements into a list column of strings using the given + regex_program to delimit each string. + + For details, see :cpp:func:`cudf::strings::split_record_re`. + + Parameters + ---------- + input : Column + A column of string elements to be split. + + prog : RegexProgram + Regex program instance. + + maxsplit : int + Maximum number of splits to perform. -1 indicates all possible + splits on each string. + + Returns + ------- + Column + Lists column of strings. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_split.split_record_re( + input.view(), + prog.c_obj.get()[0], + maxsplit, + ) + ) + + return Column.from_libcudf(move(c_result)) + +cpdef Column rsplit_record_re(Column input, RegexProgram prog, size_type maxsplit): + """ + Splits strings elements into a list column of strings using the given + regex_program to delimit each string starting from the end of the string. + + For details, see :cpp:func:`cudf::strings::rsplit_record_re`. + + Parameters + ---------- + input : Column + A column of string elements to be split. + + prog : RegexProgram + Regex program instance. + + maxsplit : int + Maximum number of splits to perform. -1 indicates all possible + splits on each string. + + Returns + ------- + Column + Lists column of strings. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_split.rsplit_record_re( + input.view(), + prog.c_obj.get()[0], + maxsplit, + ) + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_split_partition.py b/python/pylibcudf/pylibcudf/tests/test_string_split_partition.py new file mode 100644 index 00000000000..80cae8d1c6b --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_split_partition.py @@ -0,0 +1,43 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pylibcudf as plc +import pytest +from utils import assert_table_eq + + +@pytest.fixture +def data_col(): + pa_arr = pa.array(["ab_cd", "def_g_h", None]) + plc_column = plc.interop.from_arrow(pa_arr) + return pa_arr, plc_column + + +def test_partition(data_col): + pa_arr, plc_column = data_col + result = plc.strings.split.partition.partition( + plc_column, plc.interop.from_arrow(pa.scalar("_")) + ) + expected = pa.table( + { + "a": ["ab", "def", None], + "b": ["_", "_", None], + "c": ["cd", "g_h", None], + } + ) + assert_table_eq(expected, result) + + +def test_rpartition(data_col): + pa_arr, plc_column = data_col + result = plc.strings.split.partition.rpartition( + plc_column, plc.interop.from_arrow(pa.scalar("_")) + ) + expected = pa.table( + { + "a": ["ab", "def_g", None], + "b": ["_", "_", None], + "c": ["cd", "h", None], + } + ) + assert_table_eq(expected, result) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_split_split.py b/python/pylibcudf/pylibcudf/tests/test_string_split_split.py new file mode 100644 index 00000000000..2aeffac8209 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_split_split.py @@ -0,0 +1,130 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pyarrow.compute as pc +import pylibcudf as plc +import pytest +from utils import assert_column_eq, assert_table_eq + + +@pytest.fixture +def data_col(): + pa_array = pa.array(["a_b_c", "d-e-f", None]) + plc_column = plc.interop.from_arrow(pa_array) + return pa_array, plc_column + + +@pytest.fixture +def delimiter(): + delimiter = "_" + plc_delimiter = plc.interop.from_arrow(pa.scalar(delimiter)) + return delimiter, plc_delimiter + + +@pytest.fixture +def re_delimiter(): + return "[_-]" + + +def test_split(data_col, delimiter): + _, plc_column = data_col + _, plc_delimiter = delimiter + result = plc.strings.split.split.split(plc_column, plc_delimiter, 1) + expected = pa.table( + { + "a": ["a", "d-e-f", None], + "b": ["b_c", None, None], + } + ) + assert_table_eq(expected, result) + + +def test_rsplit(data_col, delimiter): + _, plc_column = data_col + _, plc_delimiter = delimiter + result = plc.strings.split.split.rsplit(plc_column, plc_delimiter, 1) + expected = pa.table( + { + "a": ["a_b", "d-e-f", None], + "b": ["c", None, None], + } + ) + assert_table_eq(expected, result) + + +def test_split_record(data_col, delimiter): + pa_array, plc_column = data_col + delim, plc_delim = delimiter + result = plc.strings.split.split.split_record(plc_column, plc_delim, 1) + expected = pc.split_pattern(pa_array, delim, max_splits=1) + assert_column_eq(expected, result) + + +def test_rsplit_record(data_col, delimiter): + pa_array, plc_column = data_col + delim, plc_delim = delimiter + result = plc.strings.split.split.split_record(plc_column, plc_delim, 1) + expected = pc.split_pattern(pa_array, delim, max_splits=1) + assert_column_eq(expected, result) + + +def test_split_re(data_col, re_delimiter): + _, plc_column = data_col + result = plc.strings.split.split.split_re( + plc_column, + plc.strings.regex_program.RegexProgram.create( + re_delimiter, plc.strings.regex_flags.RegexFlags.DEFAULT + ), + 1, + ) + expected = pa.table( + { + "a": ["a", "d", None], + "b": ["b_c", "e-f", None], + } + ) + assert_table_eq(expected, result) + + +def test_rsplit_re(data_col, re_delimiter): + _, plc_column = data_col + result = plc.strings.split.split.rsplit_re( + plc_column, + plc.strings.regex_program.RegexProgram.create( + re_delimiter, plc.strings.regex_flags.RegexFlags.DEFAULT + ), + 1, + ) + expected = pa.table( + { + "a": ["a_b", "d-e", None], + "b": ["c", "f", None], + } + ) + assert_table_eq(expected, result) + + +def test_split_record_re(data_col, re_delimiter): + pa_array, plc_column = data_col + result = plc.strings.split.split.split_record_re( + plc_column, + plc.strings.regex_program.RegexProgram.create( + re_delimiter, plc.strings.regex_flags.RegexFlags.DEFAULT + ), + 1, + ) + expected = pc.split_pattern_regex(pa_array, re_delimiter, max_splits=1) + assert_column_eq(expected, result) + + +def test_rsplit_record_re(data_col, re_delimiter): + pa_array, plc_column = data_col + result = plc.strings.split.split.rsplit_record_re( + plc_column, + plc.strings.regex_program.RegexProgram.create( + re_delimiter, plc.strings.regex_flags.RegexFlags.DEFAULT + ), + -1, + ) + expected = pc.split_pattern_regex(pa_array, re_delimiter) + assert_column_eq(expected, result)