From db1b36592ba5d76158d1c6e1a3c6440c25a382e7 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Wed, 5 Jun 2024 09:48:20 -0700 Subject: [PATCH] Migrate string replace.pxd to pylibcudf (#15839) xref #15162 Change replace.pxd to use pylibcudf APIs. Authors: - Thomas Li (https://github.com/lithomas1) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/15839 --- .../user_guide/api_docs/pylibcudf/index.rst | 8 +- .../api_docs/pylibcudf/strings/index.rst | 7 + .../api_docs/pylibcudf/strings/replace.rst | 6 + .../_lib/pylibcudf/strings/CMakeLists.txt | 4 +- .../cudf/_lib/pylibcudf/strings/__init__.pxd | 2 +- .../cudf/_lib/pylibcudf/strings/__init__.py | 2 +- .../cudf/_lib/pylibcudf/strings/replace.pxd | 25 +++ .../cudf/_lib/pylibcudf/strings/replace.pyx | 162 ++++++++++++++++++ python/cudf/cudf/_lib/strings/replace.pyx | 99 +++-------- .../pylibcudf_tests/test_string_replace.py | 126 ++++++++++++++ 10 files changed, 362 insertions(+), 79 deletions(-) create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/strings/replace.rst create mode 100644 python/cudf/cudf/_lib/pylibcudf/strings/replace.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/strings/replace.pyx create mode 100644 python/cudf/cudf/pylibcudf_tests/test_string_replace.py diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst index 58fea77adaa..b6ad1157511 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst @@ -6,7 +6,7 @@ This page provides API documentation for pylibcudf. .. toctree:: :maxdepth: 1 - :caption: API Documentation + :caption: Top-level modules aggregation binaryop @@ -32,3 +32,9 @@ This page provides API documentation for pylibcudf. table types unary + +.. 
toctree:: + :maxdepth: 2 + :caption: Subpackages + + strings/index.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst new file mode 100644 index 00000000000..8970fc80c0b --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst @@ -0,0 +1,7 @@ +strings +======= + +.. toctree:: + :maxdepth: 1 + + replace diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/replace.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/replace.rst new file mode 100644 index 00000000000..9575ec226a7 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/replace.rst @@ -0,0 +1,6 @@ +======= +replace +======= + +.. automodule:: cudf._lib.pylibcudf.strings.replace + :members: diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt index 0e9c1c916f0..c9a983e24f4 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt @@ -12,11 +12,11 @@ # the License. 
# ============================================================================= -set(cython_sources capitalize.pyx case.pyx char_types.pyx find.pyx) +set(cython_sources capitalize.pyx case.pyx char_types.pyx find.pyx replace.pyx) set(linked_libraries cudf::cudf) rapids_cython_create_modules( CXX SOURCE_FILES "${cython_sources}" - LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_ ASSOCIATED_TARGETS cudf + LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_strings_ ASSOCIATED_TARGETS cudf ) diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd index ec3dbc150b5..7563df8a107 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd @@ -1,3 +1,3 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . cimport capitalize, case, char_types, find +from . cimport capitalize, case, char_types, find, replace diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py index 3793bda0aa4..cb4f0e38f97 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py +++ b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py @@ -1,3 +1,3 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . import capitalize, case, char_types, find +from . import capitalize, case, char_types, find, replace diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/replace.pxd b/python/cudf/cudf/_lib/pylibcudf/strings/replace.pxd new file mode 100644 index 00000000000..52e2dc3c738 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/strings/replace.pxd @@ -0,0 +1,25 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+
+from cudf._lib.pylibcudf.column cimport Column
+from cudf._lib.pylibcudf.libcudf.types cimport size_type
+from cudf._lib.pylibcudf.scalar cimport Scalar
+
+
+cpdef Column replace(
+    Column input,
+    Scalar target,
+    Scalar repl,
+    size_type maxrepl = *
+)
+cpdef Column replace_multiple(
+    Column input,
+    Column target,
+    Column repl,
+    size_type maxrepl = *
+)
+cpdef Column replace_slice(
+    Column input,
+    Scalar repl = *,
+    size_type start = *,
+    size_type stop = *
+)
diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/replace.pyx b/python/cudf/cudf/_lib/pylibcudf/strings/replace.pyx
new file mode 100644
index 00000000000..c757150a600
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/strings/replace.pyx
@@ -0,0 +1,169 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+
+from cudf._lib.pylibcudf.column cimport Column
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from cudf._lib.pylibcudf.libcudf.scalar.scalar_factories cimport (
+    make_string_scalar as cpp_make_string_scalar,
+)
+from cudf._lib.pylibcudf.libcudf.strings.replace cimport (
+    replace as cpp_replace,
+    replace_multiple as cpp_replace_multiple,
+    replace_slice as cpp_replace_slice,
+)
+from cudf._lib.pylibcudf.libcudf.types cimport size_type
+from cudf._lib.pylibcudf.scalar cimport Scalar
+
+
+cpdef Column replace(
+    Column input,
+    Scalar target,
+    Scalar repl,
+    size_type maxrepl = -1
+):
+    """Replaces target string within each string with the specified replacement string.
+
+    Null string entries will return null output string entries.
+
+    For details, see :cpp:func:`replace`.
+
+    Parameters
+    ----------
+    input : Column
+        The input strings
+    target : Scalar
+        String to search for in each string.
+    repl : Scalar
+        String to replace target with.
+    maxrepl : size_type, default -1
+        Maximum times to replace if target appears multiple times in the input string.
+        Default of -1 specifies to replace all occurrences of target in each string.
+
+    Returns
+    -------
+    pylibcudf.Column
+        New string column with target replaced.
+    """
+    cdef:
+        unique_ptr[column] c_result
+        const string_scalar* target_str
+        const string_scalar* repl_str
+
+    # NOTE(review): unchecked casts; assumes target and repl hold strings.
+    target_str = <const string_scalar*>(target.c_obj.get())
+    repl_str = <const string_scalar*>(repl.c_obj.get())
+
+    with nogil:
+        c_result = move(cpp_replace(
+            input.view(),
+            target_str[0],
+            repl_str[0],
+            maxrepl,
+        ))
+
+    return Column.from_libcudf(move(c_result))
+
+
+cpdef Column replace_multiple(
+    Column input,
+    Column target,
+    Column repl,
+    size_type maxrepl = -1
+):
+    """Replaces each target string found within each string with its repl string.
+
+    Null string entries will return null output string entries.
+
+    For details, see :cpp:func:`replace_multiple`.
+
+    Parameters
+    ----------
+    input : Column
+        The input strings
+    target : Column
+        Column containing strings to search for in the input column.
+    repl : Column
+        Column containing strings to replace target with.
+        Each target, when found, will be replaced by the value at the
+        corresponding index in the repl Column.
+
+        Must be of the same length as target.
+    maxrepl : size_type, default -1
+        Currently unused: this value is not forwarded to libcudf, so it
+        has no effect on the result.
+
+    Returns
+    -------
+    pylibcudf.Column
+        New string column with target replaced.
+    """
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = move(cpp_replace_multiple(
+            input.view(),
+            target.view(),
+            repl.view(),
+        ))
+
+    return Column.from_libcudf(move(c_result))
+
+
+cpdef Column replace_slice(
+    Column input,
+    # TODO: default scalar values
+    # https://github.com/rapidsai/cudf/issues/15505
+    Scalar repl = None,
+    size_type start = 0,
+    size_type stop = -1
+):
+    """Replaces each string in the column with the provided repl string
+    within the [start,stop) character position range.
+
+    Null string entries will return null output string entries.
+    This function can be used to insert a string into specific position
+    by specifying the same position value for start and stop.
+    The repl string can be appended to each string by specifying -1
+    for both start and stop.
+
+    For details, see :cpp:func:`replace_slice`.
+
+    Parameters
+    ----------
+    input : Column
+        The input strings
+    repl : Scalar, default None
+        String scalar to place into the [start, stop) range of each string.
+        If None, an empty string scalar is used.
+    start : size_type, default 0
+        Start position where repl will be added.
+    stop : size_type, default -1
+        End position (exclusive) to use for replacement.
+
+    Returns
+    -------
+    pylibcudf.Column
+        New string column
+    """
+    cdef unique_ptr[column] c_result
+
+    if repl is None:
+        repl = Scalar.from_libcudf(
+            cpp_make_string_scalar("".encode())
+        )
+
+    # NOTE(review): unchecked cast; assumes repl holds a string scalar.
+    cdef const string_scalar* scalar_str = <const string_scalar*>(repl.c_obj.get())
+
+    with nogil:
+        c_result = move(cpp_replace_slice(
+            input.view(),
+            scalar_str[0],
+            start,
+            stop
+        ))
+
+    return Column.from_libcudf(move(c_result))
diff --git a/python/cudf/cudf/_lib/strings/replace.pyx b/python/cudf/cudf/_lib/strings/replace.pyx
index 2d9330a8a24..374831f1833 100644
--- a/python/cudf/cudf/_lib/strings/replace.pyx
+++ b/python/cudf/cudf/_lib/strings/replace.pyx
@@ -1,23 +1,15 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 from libc.stdint cimport int32_t -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move from cudf.core.buffer import acquire_spill_lock from cudf._lib.column cimport Column -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view -from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar -from cudf._lib.pylibcudf.libcudf.strings.replace cimport ( - replace as cpp_replace, - replace_multiple as cpp_replace_multiple, - replace_slice as cpp_replace_slice, -) from cudf._lib.pylibcudf.libcudf.types cimport size_type from cudf._lib.scalar cimport DeviceScalar +import cudf._lib.pylibcudf as plc + @acquire_spill_lock() def slice_replace(Column source_strings, @@ -32,22 +24,12 @@ def slice_replace(Column source_strings, cdef DeviceScalar repl = py_repl.device_value - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef const string_scalar* scalar_str = <const string_scalar*>( - repl.get_raw_ptr() - ) - - with nogil: - c_result = move(cpp_replace_slice( - source_view, - scalar_str[0], - start, - stop - )) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf(plc.strings.replace.replace_slice( + source_strings.to_pylibcudf(mode="read"), + repl.c_value, + start, + stop + )) @acquire_spill_lock() @@ -61,22 +43,12 @@ def insert(Column source_strings, cdef DeviceScalar repl = py_repl.device_value - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef const string_scalar* scalar_str = <const string_scalar*>( - repl.get_raw_ptr() - ) - - with nogil: - c_result = move(cpp_replace_slice( - source_view, - scalar_str[0], - start, - start - )) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf(plc.strings.replace.replace_slice( + source_strings.to_pylibcudf(mode="read"), + repl.c_value, + start, + start, + )) @acquire_spill_lock() @@ -92,25 +64,12 @@ def replace(Column 
source_strings, cdef DeviceScalar target = py_target.device_value cdef DeviceScalar repl = py_repl.device_value - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef const string_scalar* scalar_target = <const string_scalar*>( - target.get_raw_ptr() - ) - cdef const string_scalar* scalar_repl = <const string_scalar*>( - repl.get_raw_ptr() - ) - - with nogil: - c_result = move(cpp_replace( - source_view, - scalar_target[0], - scalar_repl[0], - maxrepl - )) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf(plc.strings.replace.replace( + source_strings.to_pylibcudf(mode="read"), + target.c_value, + repl.c_value, + maxrepl + )) @acquire_spill_lock() @@ -121,16 +80,8 @@ def replace_multi(Column source_strings, Returns a Column after replacing occurrences of patterns `target_strings` with `repl_strings` in `source_strings`. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - cdef column_view target_view = target_strings.view() - cdef column_view repl_view = repl_strings.view() - - with nogil: - c_result = move(cpp_replace_multiple( - source_view, - target_view, - repl_view - )) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf(plc.strings.replace.replace_multiple( + source_strings.to_pylibcudf(mode="read"), + target_strings.to_pylibcudf(mode="read"), + repl_strings.to_pylibcudf(mode="read"), + )) diff --git a/python/cudf/cudf/pylibcudf_tests/test_string_replace.py b/python/cudf/cudf/pylibcudf_tests/test_string_replace.py new file mode 100644 index 00000000000..f20edf6a506 --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/test_string_replace.py @@ -0,0 +1,126 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+
+import pyarrow as pa
+import pytest
+from utils import assert_column_eq
+
+import cudf._lib.pylibcudf as plc
+
+
+@pytest.fixture(scope="module")
+def data_col():
+    pa_data_col = pa.array(
+        ["a", "c", "A", "aa", None, "aaaaaaaaa", "AAAA", "ÁÁÁÁ"],
+        type=pa.string(),
+    )
+    return pa_data_col, plc.interop.from_arrow(pa_data_col)
+
+
+@pytest.fixture(scope="module", params=["a", "c", "A", "Á", "aa", "ÁÁÁ"])
+def scalar_repl_target(request):
+    pa_target = pa.scalar(request.param, type=pa.string())
+    return request.param, plc.interop.from_arrow(pa_target)
+
+
+@pytest.fixture(scope="module", params=["b", "B", "", "B́"])
+def scalar_repl(request):
+    pa_repl = pa.scalar(request.param, type=pa.string())
+    return request.param, plc.interop.from_arrow(pa_repl)
+
+
+@pytest.fixture(
+    scope="module",
+    params=[
+        ["a", "c", "A", "ÁÁÁÁ"],
+    ],
+)
+def col_repl_target(request):
+    pa_target = pa.array(request.param, type=pa.string())
+    return (pa_target, plc.interop.from_arrow(pa_target))
+
+
+@pytest.fixture(
+    scope="module",
+    params=[
+        [
+            "",
+            "z",
+            "XX",
+            "blahblah",
+        ]
+    ],
+)
+def col_repl(request):
+    pa_repl = pa.array(request.param, type=pa.string())
+    return (pa_repl, plc.interop.from_arrow(pa_repl))
+
+
+@pytest.mark.parametrize("maxrepl", [-1, 1, 2, 10])
+def test_replace(data_col, scalar_repl_target, scalar_repl, maxrepl):
+    pa_data_col, plc_data_col = data_col
+    pa_target, plc_target = scalar_repl_target
+    pa_repl, plc_repl = scalar_repl
+    got = plc.strings.replace.replace(
+        plc_data_col, plc_target, plc_repl, maxrepl
+    )
+
+    expected = pa.compute.replace_substring(
+        pa_data_col,
+        pattern=pa_target,
+        replacement=pa_repl,
+        max_replacements=maxrepl,
+    )
+
+    assert_column_eq(expected, got)
+
+
+@pytest.mark.parametrize("startstop", [(0, -1), (0, 0), (1, 3)])
+def test_replace_slice(data_col, scalar_repl, startstop):
+    pa_data_col, plc_data_col = data_col
+    pa_repl, plc_repl = scalar_repl
+    start, stop = startstop
+    got = plc.strings.replace.replace_slice(
+        plc_data_col, plc_repl, start, stop
+    )
+
+    if stop == -1:
+        # pyarrow doesn't support -1 as stop, so just set to really big number
+
+        # TODO: once libcudf's count_characters() is migrated, we can call
+        # count_characters on the input, take the max and set stop to that
+        stop = 1000
+
+    expected = pa.compute.utf8_replace_slice(pa_data_col, start, stop, pa_repl)
+
+    assert_column_eq(expected, got)
+
+
+def test_replace_col(data_col, col_repl_target, col_repl):
+    pa_data_col, plc_data_col = data_col
+    pa_target, plc_target = col_repl_target
+    pa_repl, plc_repl = col_repl
+    got = plc.strings.replace.replace_multiple(
+        plc_data_col, plc_target, plc_repl
+    )
+
+    # pyarrow has no column-wise string replace, so emulate it in Python:
+    # the first target found in elem is replaced, otherwise elem is unchanged
+
+    def replace_list(elem, targets, repls):
+        for target, repl in zip(targets, repls):
+            if target in elem:
+                return elem.replace(target, repl)
+        return elem
+
+    targets = pa_target.to_pylist()
+    repls = pa_repl.to_pylist()
+
+    expected = pa.array(
+        [
+            replace_list(elem, targets, repls) if elem is not None else None
+            for elem in pa_data_col.to_pylist()
+        ],
+        type=pa.string(),
+    )
+
+    assert_column_eq(expected, got)