Skip to content

Commit

Permalink
Migrate strings contains operations to pylibcudf (#15880)
Browse files Browse the repository at this point in the history
This PR adds pylibcudf strings `contains` APIs and migrates the cuDF Cython code to use them. Part of #15162.

Authors:
  - https://github.com/brandon-b-miller

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)

URL: #15880
  • Loading branch information
brandon-b-miller authored Jun 6, 2024
1 parent d91380e commit 7fd6918
Show file tree
Hide file tree
Showing 17 changed files with 215 additions and 25 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
========
contains
========

.. automodule:: cudf._lib.pylibcudf.strings.contains
    :members:
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,5 @@ strings
.. toctree::
:maxdepth: 1

contains
replace
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
# the License.
# =============================================================================

set(cython_sources char_types.pyx)
set(cython_sources char_types.pyx regex_flags.pyx)

set(linked_libraries cudf::cudf)

Expand Down
13 changes: 8 additions & 5 deletions python/cudf/cudf/_lib/pylibcudf/libcudf/strings/regex_flags.pxd
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
# Copyright (c) 2022, NVIDIA CORPORATION.
# Copyright (c) 2022-2024, NVIDIA CORPORATION.

from libc.stdint cimport int32_t


cdef extern from "cudf/strings/regex/flags.hpp" \
namespace "cudf::strings" nogil:

ctypedef enum regex_flags:
DEFAULT 'cudf::strings::regex_flags::DEFAULT'
MULTILINE 'cudf::strings::regex_flags::MULTILINE'
DOTALL 'cudf::strings::regex_flags::DOTALL'
cpdef enum class regex_flags(int32_t):
DEFAULT
MULTILINE
DOTALL
Empty file.
4 changes: 3 additions & 1 deletion python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,9 @@
# the License.
# =============================================================================

set(cython_sources capitalize.pyx case.pyx char_types.pyx find.pyx replace.pyx)
set(cython_sources capitalize.pyx case.pyx char_types.pyx contains.pyx find.pyx regex_flags.pyx
regex_program.pyx replace.pyx
)

set(linked_libraries cudf::cudf)
rapids_cython_create_modules(
Expand Down
11 changes: 10 additions & 1 deletion python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd
Original file line number Diff line number Diff line change
@@ -1,3 +1,12 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from . cimport capitalize, case, char_types, find, replace
from . cimport (
capitalize,
case,
char_types,
contains,
find,
regex_flags,
regex_program,
replace,
)
11 changes: 10 additions & 1 deletion python/cudf/cudf/_lib/pylibcudf/strings/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,12 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from . import capitalize, case, char_types, find, replace
from . import (
capitalize,
case,
char_types,
contains,
find,
regex_flags,
regex_program,
replace,
)
7 changes: 7 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/strings/contains.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from cudf._lib.pylibcudf.column cimport Column
from cudf._lib.pylibcudf.strings.regex_program cimport RegexProgram


# Declaration of contains_re so that other Cython modules can cimport it;
# the implementation is in contains.pyx.
cpdef Column contains_re(Column input, RegexProgram prog)
41 changes: 41 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/strings/contains.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# Copyright (c) 2024, NVIDIA CORPORATION.
from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move

from cudf._lib.pylibcudf.column cimport Column
from cudf._lib.pylibcudf.libcudf.column.column cimport column
from cudf._lib.pylibcudf.libcudf.strings cimport contains as cpp_contains
from cudf._lib.pylibcudf.strings.regex_program cimport RegexProgram


cpdef Column contains_re(Column input, RegexProgram prog):
    """Return a boolean column flagging the rows that match a regex program.

    For details, see :cpp:func:`cudf::strings::contains_re`.

    Parameters
    ----------
    input : Column
        The input strings
    prog : RegexProgram
        Regex program instance

    Returns
    -------
    pylibcudf.Column
        New column of boolean results for each string
    """
    cdef unique_ptr[column] c_result

    # The libcudf call needs no Python objects, so run it without the GIL.
    with nogil:
        c_result = cpp_contains.contains_re(
            input.view(),
            prog.c_obj.get()[0]
        )

    return Column.from_libcudf(move(c_result))
2 changes: 2 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/strings/regex_flags.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Copyright (c) 2020-2024, NVIDIA CORPORATION.
from cudf._lib.pylibcudf.libcudf.strings.regex_flags cimport regex_flags
4 changes: 4 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/strings/regex_flags.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from cudf._lib.pylibcudf.libcudf.strings.regex_flags import \
regex_flags as RegexFlags # no-cython-lint
10 changes: 10 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/strings/regex_program.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from libcpp.memory cimport unique_ptr
from libcpp.string cimport string

from cudf._lib.pylibcudf.libcudf.strings.regex_program cimport regex_program


# Extension type that holds a libcudf regex_program through a unique_ptr
# (c_obj); the methods are implemented in regex_program.pyx.
cdef class RegexProgram:
cdef unique_ptr[regex_program] c_obj
37 changes: 37 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/strings/regex_program.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# Copyright (c) 2024, NVIDIA CORPORATION.


from libcpp.memory cimport unique_ptr
from libcpp.string cimport string
from libcpp.utility cimport move

from cudf._lib.pylibcudf.libcudf.strings.regex_flags cimport regex_flags
from cudf._lib.pylibcudf.libcudf.strings.regex_program cimport regex_program

from cudf._lib.pylibcudf.strings.regex_flags import RegexFlags
from cudf._lib.pylibcudf.strings.regex_flags cimport regex_flags


cdef class RegexProgram:
    """Wrapper around a compiled libcudf ``regex_program``.

    Instances are not constructed directly; use :meth:`create`.
    """

    def __init__(self, *args, **kwargs):
        raise ValueError("Do not instantiate RegexProgram directly, use create")

    @staticmethod
    def create(str pattern, int flags):
        """Compile ``pattern`` into a RegexProgram.

        Parameters
        ----------
        pattern : str
            The regex pattern. It is encoded with ``str.encode()`` before
            being handed to libcudf.
        flags : int
            Regex flags value (for example a ``RegexFlags`` member); it is
            cast to the libcudf ``regex_flags`` type.

        Returns
        -------
        RegexProgram
            A new instance holding the compiled program.

        Raises
        ------
        RuntimeError
            Presumably propagated from libcudf when the pattern is invalid
            (this is the behaviour the pylibcudf tests expect).
        """
        # ``flags`` is declared as a C int, so it has already been coerced to
        # an integer when the body runs. Runtime isinstance checks on it can
        # never fail and are therefore omitted.
        cdef regex_flags c_flags = <regex_flags>flags
        cdef string c_pattern = pattern.encode()
        cdef unique_ptr[regex_program] c_prog
        cdef RegexProgram ret = RegexProgram.__new__(RegexProgram)

        with nogil:
            c_prog = regex_program.create(c_pattern, c_flags)

        ret.c_obj = move(c_prog)
        return ret
23 changes: 7 additions & 16 deletions python/cudf/cudf/_lib/strings/contains.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ from cudf._lib.pylibcudf.libcudf.column.column cimport column
from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
from cudf._lib.pylibcudf.libcudf.strings.contains cimport (
contains_re as cpp_contains_re,
count_re as cpp_count_re,
like as cpp_like,
matches_re as cpp_matches_re,
Expand All @@ -23,28 +22,20 @@ from cudf._lib.pylibcudf.libcudf.strings.regex_flags cimport regex_flags
from cudf._lib.pylibcudf.libcudf.strings.regex_program cimport regex_program
from cudf._lib.scalar cimport DeviceScalar

from cudf._lib.pylibcudf.strings import contains
from cudf._lib.pylibcudf.strings.regex_program import RegexProgram


@acquire_spill_lock()
def contains_re(Column source_strings, object reg_ex, uint32_t flags):
"""
Returns a Column of boolean values with True for `source_strings`
that contain regular expression `reg_ex`.
"""
cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

cdef string reg_ex_string = <string>str(reg_ex).encode()
cdef regex_flags c_flags = <regex_flags>flags
cdef unique_ptr[regex_program] c_prog

with nogil:
c_prog = move(regex_program.create(reg_ex_string, c_flags))
c_result = move(cpp_contains_re(
source_view,
dereference(c_prog)
))

return Column.from_unique_ptr(move(c_result))
prog = RegexProgram.create(str(reg_ex), flags)
return Column.from_pylibcudf(
contains.contains_re(source_strings.to_pylibcudf(mode="read"), prog)
)


@acquire_spill_lock()
Expand Down
13 changes: 13 additions & 0 deletions python/cudf/cudf/pylibcudf_tests/test_regex_program.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

import pytest

import cudf._lib.pylibcudf as plc


@pytest.mark.parametrize("pat", ["(", "*", "\\"])
def test_regex_program_invalid(pat):
    """Creating a program from a malformed pattern raises RuntimeError."""
    default_flags = plc.strings.regex_flags.RegexFlags.DEFAULT
    make_program = plc.strings.regex_program.RegexProgram.create
    with pytest.raises(RuntimeError):
        make_program(pat, default_flags)
55 changes: 55 additions & 0 deletions python/cudf/cudf/pylibcudf_tests/test_string_contains.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

import pyarrow as pa
import pytest
from utils import assert_column_eq

import cudf._lib.pylibcudf as plc


@pytest.fixture(scope="module")
def pa_target_col():
    """Arrow array of mixed-case strings, including nulls, to search in."""
    values = [
        "AbC",
        "de",
        "FGHI",
        "j",
        "kLm",
        "nOPq",
        None,
        "RsT",
        None,
        "uVw",
    ]
    return pa.array(values)


@pytest.fixture(scope="module")
def plc_target_col(pa_target_col):
    """pylibcudf column converted from the Arrow target array."""
    target_column = plc.interop.from_arrow(pa_target_col)
    return target_column


@pytest.fixture(
    scope="module",
    params=[
        "A",
        "de",
        ".*",
        "^a",
        "^A",
        "[^a-z]",
        "[a-z]{3,}",
        "^[A-Z]{2,}",
        "j|u",
    ],
)
def pa_target_scalar(request):
    """Arrow string scalar holding the regex pattern for the current param."""
    pattern = request.param
    return pa.scalar(pattern, type=pa.string())


@pytest.fixture(scope="module")
def plc_target_pat(pa_target_scalar):
    """RegexProgram built from the current pattern with DEFAULT flags."""
    pattern = pa_target_scalar.as_py()
    flags = plc.strings.regex_flags.RegexFlags.DEFAULT
    return plc.strings.regex_program.RegexProgram.create(pattern, flags)


def test_contains_re(
    pa_target_col, plc_target_col, pa_target_scalar, plc_target_pat
):
    """Check contains_re against Arrow's ``match_substring_regex`` result."""
    # Import the compute submodule explicitly instead of relying on
    # ``pa.compute`` having been loaded as a side effect of another import.
    import pyarrow.compute as pc

    got = plc.strings.contains.contains_re(plc_target_col, plc_target_pat)
    expected = pc.match_substring_regex(
        pa_target_col, pa_target_scalar.as_py()
    )
    assert_column_eq(got, expected)

0 comments on commit 7fd6918

Please sign in to comment.