Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Migrate strings contains operations to pylibcudf #15880

Merged
Merged
Show file tree
Hide file tree
Changes from 15 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions docs/cudf/source/user_guide/api_docs/pylibcudf/contains.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
========
Contains
========

.. automodule:: cudf._lib.pylibcudf.strings.contains
:members:
9 changes: 9 additions & 0 deletions docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -32,3 +32,12 @@ This page provides API documentation for pylibcudf.
table
types
unary

String Functions
================

.. toctree::
:maxdepth: 1
:caption: String Functions

contains
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
# the License.
# =============================================================================

set(cython_sources char_types.pyx)
set(cython_sources char_types.pyx regex_flags.pyx)

set(linked_libraries cudf::cudf)

Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
# Copyright (c) 2022, NVIDIA CORPORATION.
# Copyright (c) 2022-2024, NVIDIA CORPORATION.

from libc.stdint cimport int32_t


cdef extern from "cudf/strings/regex/flags.hpp" \
namespace "cudf::strings" nogil:

ctypedef enum regex_flags:
DEFAULT 'cudf::strings::regex_flags::DEFAULT'
MULTILINE 'cudf::strings::regex_flags::MULTILINE'
DOTALL 'cudf::strings::regex_flags::DOTALL'
cpdef enum class regex_flags(int32_t):
DEFAULT
MULTILINE
DOTALL
Empty file.
4 changes: 3 additions & 1 deletion python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,9 @@
# the License.
# =============================================================================

set(cython_sources capitalize.pyx case.pyx char_types.pyx find.pyx)
set(cython_sources case.pyx capitalize.pyx char_types.pyx contains.pyx find.pyx regex_flags.pyx
regex_program.pyx
)

set(linked_libraries cudf::cudf)
rapids_cython_create_modules(
Expand Down
10 changes: 9 additions & 1 deletion python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd
Original file line number Diff line number Diff line change
@@ -1,3 +1,11 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from . cimport capitalize, case, char_types, find
from . cimport (
capitalize,
case,
char_types,
contains,
find,
regex_flags,
regex_program,
)
10 changes: 9 additions & 1 deletion python/cudf/cudf/_lib/pylibcudf/strings/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,11 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from . import capitalize, case, char_types, find
from . import (
capitalize,
case,
char_types,
contains,
find,
regex_flags,
regex_program,
)
7 changes: 7 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/strings/contains.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from cudf._lib.pylibcudf.column cimport Column
from cudf._lib.pylibcudf.strings.regex_program cimport RegexProgram

brandon-b-miller marked this conversation as resolved.
Show resolved Hide resolved

cpdef Column contains_re(Column input, RegexProgram prog)
41 changes: 41 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/strings/contains.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# Copyright (c) 2024, NVIDIA CORPORATION.
from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move

from cudf._lib.pylibcudf.column cimport Column
from cudf._lib.pylibcudf.libcudf.column.column cimport column
from cudf._lib.pylibcudf.libcudf.strings cimport contains as cpp_contains
from cudf._lib.pylibcudf.strings.regex_program cimport RegexProgram


cpdef Column contains_re(
Column input,
RegexProgram prog
):
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

docstring needed

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We might also want to start the sphinx docs for the strings submodule for pylibcudf.

As far as I can tell, there's no .rst files for strings in the api_docs/pylibcudf folder.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added a few sphinx docs here

"""Returns a boolean column identifying rows which match the given
regex_program object.

For details, see :cpp:func:`cudf::strings::contains_re`.

Parameters
----------
input : Column
The input strings
prog : RegexProgram
Regex program instance

Returns
-------
pylibcudf.Column
New column of boolean results for each string
"""

cdef unique_ptr[column] result

with nogil:
result = cpp_contains.contains_re(
input.view(),
prog.c_obj.get()[0]
)

return Column.from_libcudf(move(result))
2 changes: 2 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/strings/regex_flags.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Copyright (c) 2020-2024, NVIDIA CORPORATION.
from cudf._lib.pylibcudf.libcudf.strings.regex_flags cimport regex_flags
4 changes: 4 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/strings/regex_flags.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from cudf._lib.pylibcudf.libcudf.strings.regex_flags import \
regex_flags as RegexFlags # no-cython-lint
10 changes: 10 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/strings/regex_program.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from libcpp.memory cimport unique_ptr
from libcpp.string cimport string

from cudf._lib.pylibcudf.libcudf.strings.regex_program cimport regex_program


cdef class RegexProgram:
cdef unique_ptr[regex_program] c_obj
37 changes: 37 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/strings/regex_program.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# Copyright (c) 2024, NVIDIA CORPORATION.


from libcpp.memory cimport unique_ptr
from libcpp.string cimport string
from libcpp.utility cimport move

from cudf._lib.pylibcudf.libcudf.strings.regex_flags cimport regex_flags
from cudf._lib.pylibcudf.libcudf.strings.regex_program cimport regex_program

from cudf._lib.pylibcudf.strings.regex_flags import RegexFlags
from cudf._lib.pylibcudf.strings.regex_flags cimport regex_flags


cdef class RegexProgram:

def __init__(self, *args, **kwargs):
raise ValueError("Do not instantiate RegexProgram directly, use create")

@staticmethod
def create(str pattern, int flags):
cdef unique_ptr[regex_program] c_prog
cdef regex_flags c_flags
cdef string c_pattern = pattern.encode()

cdef RegexProgram ret = RegexProgram.__new__(RegexProgram)
if isinstance(flags, object):
if isinstance(flags, (int, RegexFlags)):
c_flags = <regex_flags>flags
with nogil:
c_prog = regex_program.create(c_pattern, c_flags)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

question: Whose job is it to ensure that the provided input is a "valid" regex?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The libcudf method should validate the pattern at the c++ level:

cudf::logic_error If pattern is invalid or contains unsupported features

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we add some simple tests of this?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah I think that's reasonable. Will add some

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Couple of tests at b15588a


ret.c_obj = move(c_prog)
else:
raise ValueError("flags must be of type RegexFlags")

return ret
23 changes: 7 additions & 16 deletions python/cudf/cudf/_lib/strings/contains.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ from cudf._lib.pylibcudf.libcudf.column.column cimport column
from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
from cudf._lib.pylibcudf.libcudf.strings.contains cimport (
contains_re as cpp_contains_re,
count_re as cpp_count_re,
like as cpp_like,
matches_re as cpp_matches_re,
Expand All @@ -23,28 +22,20 @@ from cudf._lib.pylibcudf.libcudf.strings.regex_flags cimport regex_flags
from cudf._lib.pylibcudf.libcudf.strings.regex_program cimport regex_program
from cudf._lib.scalar cimport DeviceScalar

from cudf._lib.pylibcudf.strings import contains
from cudf._lib.pylibcudf.strings.regex_program import RegexProgram


@acquire_spill_lock()
def contains_re(Column source_strings, object reg_ex, uint32_t flags):
"""
Returns a Column of boolean values with True for `source_strings`
that contain regular expression `reg_ex`.
"""
cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

cdef string reg_ex_string = <string>str(reg_ex).encode()
cdef regex_flags c_flags = <regex_flags>flags
cdef unique_ptr[regex_program] c_prog

with nogil:
c_prog = move(regex_program.create(reg_ex_string, c_flags))
c_result = move(cpp_contains_re(
source_view,
dereference(c_prog)
))

return Column.from_unique_ptr(move(c_result))
prog = RegexProgram.create(str(reg_ex), flags)
return Column.from_pylibcudf(
contains.contains_re(source_strings.to_pylibcudf(mode="read"), prog)
)


@acquire_spill_lock()
Expand Down
13 changes: 13 additions & 0 deletions python/cudf/cudf/pylibcudf_tests/test_regex_program.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

import pytest

import cudf._lib.pylibcudf as plc


@pytest.mark.parametrize("pat", ["(", "*", "\\"])
def test_regex_program_invalid(pat):
with pytest.raises(RuntimeError):
plc.strings.regex_program.RegexProgram.create(
pat, plc.strings.regex_flags.RegexFlags.DEFAULT
)
55 changes: 55 additions & 0 deletions python/cudf/cudf/pylibcudf_tests/test_string_contains.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

import pyarrow as pa
import pytest
from utils import assert_column_eq

import cudf._lib.pylibcudf as plc


@pytest.fixture(scope="module")
def pa_target_col():
return pa.array(
["AbC", "de", "FGHI", "j", "kLm", "nOPq", None, "RsT", None, "uVw"]
)


@pytest.fixture(scope="module")
def plc_target_col(pa_target_col):
return plc.interop.from_arrow(pa_target_col)


@pytest.fixture(
params=[
"A",
"de",
".*",
"^a",
"^A",
"[^a-z]",
"[a-z]{3,}",
"^[A-Z]{2,}",
"j|u",
],
scope="module",
)
def pa_target_scalar(request):
return pa.scalar(request.param, type=pa.string())


@pytest.fixture(scope="module")
def plc_target_pat(pa_target_scalar):
prog = plc.strings.regex_program.RegexProgram.create(
pa_target_scalar.as_py(), plc.strings.regex_flags.RegexFlags.DEFAULT
)
return prog


def test_contains_re(
pa_target_col, plc_target_col, pa_target_scalar, plc_target_pat
):
got = plc.strings.contains.contains_re(plc_target_col, plc_target_pat)
expected = pa.compute.match_substring_regex(
pa_target_col, pa_target_scalar.as_py()
)
assert_column_eq(got, expected)
Loading