Skip to content

Commit

Permalink
Add string.find_multiple APIs to pylibcudf (#16920)
Browse files Browse the repository at this point in the history
Redo at #16824

Contributes to #15162

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Matthew Murray (https://github.com/Matt711)

URL: #16920
  • Loading branch information
mroeschke authored Oct 2, 2024
1 parent dae9d68 commit 76cae87
Show file tree
Hide file tree
Showing 10 changed files with 85 additions and 21 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
=============
find_multiple
=============

.. automodule:: pylibcudf.strings.find_multiple
    :members:
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ strings
contains
extract
find
find_multiple
findall
regex_flags
regex_program
Expand Down
27 changes: 7 additions & 20 deletions python/cudf/cudf/_lib/strings/find_multiple.pyx
Original file line number Diff line number Diff line change
@@ -1,33 +1,20 @@
# Copyright (c) 2020-2024, NVIDIA CORPORATION.

from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move

from cudf.core.buffer import acquire_spill_lock

from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.column.column_view cimport column_view
from pylibcudf.libcudf.strings.find_multiple cimport (
find_multiple as cpp_find_multiple,
)

from cudf._lib.column cimport Column

import pylibcudf as plc


@acquire_spill_lock()
def find_multiple(Column source_strings, Column target_strings):
"""
Returns a column with character position values where each
of the `target_strings` are found in each string of `source_strings`.
"""
cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()
cdef column_view target_view = target_strings.view()

with nogil:
c_result = move(cpp_find_multiple(
source_view,
target_view
))

return Column.from_unique_ptr(move(c_result))
plc_result = plc.strings.find_multiple.find_multiple(
source_strings.to_pylibcudf(mode="read"),
target_strings.to_pylibcudf(mode="read")
)
return Column.from_pylibcudf(plc_result)
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,5 @@ cdef extern from "cudf/strings/find_multiple.hpp" namespace "cudf::strings" \
nogil:

cdef unique_ptr[column] find_multiple(
column_view source_strings,
column_view input,
column_view targets) except +
1 change: 1 addition & 0 deletions python/pylibcudf/pylibcudf/strings/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ set(cython_sources
contains.pyx
extract.pyx
find.pyx
find_multiple.pyx
findall.pyx
regex_flags.pyx
regex_program.pyx
Expand Down
1 change: 1 addition & 0 deletions python/pylibcudf/pylibcudf/strings/__init__.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ from . cimport (
convert,
extract,
find,
find_multiple,
findall,
regex_flags,
regex_program,
Expand Down
1 change: 1 addition & 0 deletions python/pylibcudf/pylibcudf/strings/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
convert,
extract,
find,
find_multiple,
findall,
regex_flags,
regex_program,
Expand Down
6 changes: 6 additions & 0 deletions python/pylibcudf/pylibcudf/strings/find_multiple.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from pylibcudf.column cimport Column


# Signature of find_multiple, exposed so it can be cimported from other
# Cython modules; the implementation lives in find_multiple.pyx.
cpdef Column find_multiple(Column input, Column targets)
39 changes: 39 additions & 0 deletions python/pylibcudf/pylibcudf/strings/find_multiple.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# Copyright (c) 2020-2024, NVIDIA CORPORATION.

from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move
from pylibcudf.column cimport Column
from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.strings cimport find_multiple as cpp_find_multiple


cpdef Column find_multiple(Column input, Column targets):
    """
    Locate each of the target strings within each input string.

    The result is a lists column holding the character position values
    at which each target string is found in each string of ``input``.

    For details, see :cpp:func:`cudf::strings::find_multiple`.

    Parameters
    ----------
    input : Column
        Strings instance for this operation
    targets : Column
        Strings to search for in each string

    Returns
    -------
    Column
        Lists column with character position values
    """
    cdef unique_ptr[column] found

    # The libcudf call is made with the GIL released.
    with nogil:
        found = move(
            cpp_find_multiple.find_multiple(input.view(), targets.view())
        )

    return Column.from_libcudf(move(found))
22 changes: 22 additions & 0 deletions python/pylibcudf/pylibcudf/tests/test_string_find_multiple.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

import pyarrow as pa
import pylibcudf as plc
from utils import assert_column_eq


def test_find_multiple():
    """Compare ``find_multiple`` with positions computed via ``str.find``."""
    haystacks = pa.array(["abc", "def"])
    needles = pa.array(["a", "c", "e"])

    # Reference answer: one row per input string and one position per
    # target; ``str.find`` yields -1 when the target is absent.
    needle_values = needles.to_pylist()
    expected_rows = []
    for text in haystacks.to_pylist():
        expected_rows.append([text.find(needle) for needle in needle_values])
    expected = pa.array(expected_rows, type=pa.list_(pa.int32()))

    plc_haystacks = plc.interop.from_arrow(haystacks)
    plc_needles = plc.interop.from_arrow(needles)
    got = plc.strings.find_multiple.find_multiple(plc_haystacks, plc_needles)

    assert_column_eq(expected, got)

0 comments on commit 76cae87

Please sign in to comment.