Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add string.contains APIs to pylibcudf #16814

Merged
merged 4 commits into from
Sep 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 13 additions & 67 deletions python/cudf/cudf/_lib/strings/contains.pyx
Original file line number Diff line number Diff line change
@@ -1,27 +1,10 @@
# Copyright (c) 2020-2024, NVIDIA CORPORATION.

from cython.operator cimport dereference
from libc.stdint cimport uint32_t

from cudf.core.buffer import acquire_spill_lock

from libcpp.memory cimport unique_ptr
from libcpp.string cimport string
from libcpp.utility cimport move

from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.column.column_view cimport column_view
from pylibcudf.libcudf.scalar.scalar cimport string_scalar
from pylibcudf.libcudf.strings.contains cimport (
count_re as cpp_count_re,
like as cpp_like,
matches_re as cpp_matches_re,
)
from pylibcudf.libcudf.strings.regex_flags cimport regex_flags
from pylibcudf.libcudf.strings.regex_program cimport regex_program

from cudf._lib.column cimport Column
from cudf._lib.scalar cimport DeviceScalar

from pylibcudf.strings import contains
from pylibcudf.strings.regex_program import RegexProgram
Expand All @@ -45,21 +28,10 @@ def count_re(Column source_strings, object reg_ex, uint32_t flags):
Returns a Column with count of occurrences of `reg_ex` in
each string of `source_strings`
"""
cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

cdef string reg_ex_string = <string>str(reg_ex).encode()
cdef regex_flags c_flags = <regex_flags>flags
cdef unique_ptr[regex_program] c_prog

with nogil:
c_prog = move(regex_program.create(reg_ex_string, c_flags))
c_result = move(cpp_count_re(
source_view,
dereference(c_prog)
))

return Column.from_unique_ptr(move(c_result))
prog = RegexProgram.create(str(reg_ex), flags)
return Column.from_pylibcudf(
contains.count_re(source_strings.to_pylibcudf(mode="read"), prog)
)


@acquire_spill_lock()
Expand All @@ -68,21 +40,10 @@ def match_re(Column source_strings, object reg_ex, uint32_t flags):
Returns a Column with each value True if the string matches `reg_ex`
regular expression with each record of `source_strings`
"""
cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

cdef string reg_ex_string = <string>str(reg_ex).encode()
cdef regex_flags c_flags = <regex_flags>flags
cdef unique_ptr[regex_program] c_prog

with nogil:
c_prog = move(regex_program.create(reg_ex_string, c_flags))
c_result = move(cpp_matches_re(
source_view,
dereference(c_prog)
))

return Column.from_unique_ptr(move(c_result))
prog = RegexProgram.create(str(reg_ex), flags)
return Column.from_pylibcudf(
contains.matches_re(source_strings.to_pylibcudf(mode="read"), prog)
)


@acquire_spill_lock()
Expand All @@ -91,24 +52,9 @@ def like(Column source_strings, object py_pattern, object py_escape):
Returns a Column with each value True if the string matches the
`py_pattern` like expression with each record of `source_strings`
"""
cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

cdef DeviceScalar pattern = py_pattern.device_value
cdef DeviceScalar escape = py_escape.device_value

cdef const string_scalar* scalar_ptn = <const string_scalar*>(
pattern.get_raw_ptr()
)
cdef const string_scalar* scalar_esc = <const string_scalar*>(
escape.get_raw_ptr()
plc_column = contains.like(
source_strings.to_pylibcudf(mode="read"),
py_pattern.device_value.c_value,
py_escape.device_value.c_value,
)

with nogil:
c_result = move(cpp_like(
source_view,
scalar_ptn[0],
scalar_esc[0]
))

return Column.from_unique_ptr(move(c_result))
return Column.from_pylibcudf(plc_column)
7 changes: 6 additions & 1 deletion python/pylibcudf/pylibcudf/libcudf/strings/contains.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -24,4 +24,9 @@ cdef extern from "cudf/strings/contains.hpp" namespace "cudf::strings" nogil:
cdef unique_ptr[column] like(
column_view source_strings,
string_scalar pattern,
string_scalar escape) except +
string_scalar escape_character) except +

cdef unique_ptr[column] like(
column_view source_strings,
column_view patterns,
string_scalar escape_character) except +
14 changes: 14 additions & 0 deletions python/pylibcudf/pylibcudf/strings/contains.pxd
Original file line number Diff line number Diff line change
@@ -1,7 +1,21 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from pylibcudf.column cimport Column
from pylibcudf.scalar cimport Scalar
from pylibcudf.strings.regex_program cimport RegexProgram

# Fused type used by `like` below so that its `pattern` argument may be
# supplied either as a Column or as a Scalar.
ctypedef fused ColumnOrScalar:
Column
Scalar

# Regex-based string functions: each takes the input strings Column and a
# RegexProgram and returns a new Column.
cpdef Column contains_re(Column input, RegexProgram prog)

cpdef Column count_re(Column input, RegexProgram prog)

cpdef Column matches_re(Column input, RegexProgram prog)

# `escape_character = *` declares the argument as optional; the concrete
# default value is given on the implementation in the matching .pyx file.
cpdef Column like(
Column input,
ColumnOrScalar pattern,
Scalar escape_character = *
)
130 changes: 129 additions & 1 deletion python/pylibcudf/pylibcudf/strings/contains.pyx
Original file line number Diff line number Diff line change
@@ -1,8 +1,14 @@
# Copyright (c) 2024, NVIDIA CORPORATION.
from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move
from cython.operator import dereference

from pylibcudf.column cimport Column
from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.scalar.scalar cimport string_scalar
from pylibcudf.libcudf.scalar.scalar_factories cimport (
make_string_scalar as cpp_make_string_scalar,
)
from pylibcudf.libcudf.strings cimport contains as cpp_contains
from pylibcudf.strings.regex_program cimport RegexProgram

Expand Down Expand Up @@ -32,9 +38,131 @@ cpdef Column contains_re(
cdef unique_ptr[column] result

with nogil:
result = cpp_contains.contains_re(
result = move(cpp_contains.contains_re(
input.view(),
prog.c_obj.get()[0]
))

return Column.from_libcudf(move(result))


cpdef Column count_re(Column input, RegexProgram prog):
    """Count the matches of a regex program's pattern in each string.

    For details, see :cpp:func:`cudf::strings::count_re`.

    Parameters
    ----------
    input : Column
        The input strings
    prog : RegexProgram
        Regex program instance

    Returns
    -------
    pylibcudf.Column
        New column of match counts for each string
    """
    cdef unique_ptr[column] c_counts

    # The libcudf call is made with the GIL released.
    with nogil:
        c_counts = move(
            cpp_contains.count_re(input.view(), dereference(prog.c_obj))
        )

    return Column.from_libcudf(move(c_counts))


cpdef Column matches_re(Column input, RegexProgram prog):
    """Return a boolean column identifying rows that match the given
    regex program, but only at the beginning of the string.

    For details, see :cpp:func:`cudf::strings::matches_re`.

    Parameters
    ----------
    input : Column
        The input strings
    prog : RegexProgram
        Regex program instance

    Returns
    -------
    pylibcudf.Column
        New column of boolean results for each string
    """
    cdef unique_ptr[column] c_matches

    # The libcudf call is made with the GIL released.
    with nogil:
        c_matches = move(
            cpp_contains.matches_re(input.view(), dereference(prog.c_obj))
        )

    return Column.from_libcudf(move(c_matches))


cpdef Column like(
    Column input,
    ColumnOrScalar pattern,
    Scalar escape_character=None
):
    """Return a boolean column identifying rows that match the given
    like pattern.

    For details, see :cpp:func:`cudf::strings::like`.

    Parameters
    ----------
    input : Column
        The input strings
    pattern : Column or Scalar
        Like patterns to match within each string
    escape_character : Scalar
        Optional character that specifies the escape prefix.
        Default is no escape character.

    Returns
    -------
    pylibcudf.Column
        New column of boolean results for each string
    """
    cdef unique_ptr[column] c_result
    cdef const string_scalar* c_escape
    cdef const string_scalar* c_pattern

    # When no escape character is supplied, substitute an empty string scalar.
    if escape_character is None:
        escape_character = Scalar.from_libcudf(
            cpp_make_string_scalar("".encode())
        )

    # NOTE(review): this cast (and the one for `pattern` below) is unchecked;
    # callers are presumably expected to pass string scalars -- confirm.
    c_escape = <const string_scalar*>(escape_character.c_obj.get())

    # `ColumnOrScalar` is a fused type, so the branch is chosen per
    # specialization of this function.
    if ColumnOrScalar is Column:
        with nogil:
            c_result = move(
                cpp_contains.like(
                    input.view(),
                    pattern.view(),
                    dereference(c_escape)
                )
            )
    elif ColumnOrScalar is Scalar:
        c_pattern = <const string_scalar*>(pattern.c_obj.get())
        with nogil:
            c_result = move(
                cpp_contains.like(
                    input.view(),
                    dereference(c_pattern),
                    dereference(c_escape)
                )
            )
    else:
        raise ValueError("pattern must be a Column or a Scalar")

    return Column.from_libcudf(move(c_result))
37 changes: 37 additions & 0 deletions python/pylibcudf/pylibcudf/tests/test_string_contains.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,3 +48,40 @@ def test_contains_re(target_col, pa_target_scalar, plc_target_pat):
pa_target_col, pa_target_scalar.as_py()
)
assert_column_eq(got, expected)


def test_count_re():
    """count_re should agree with pyarrow's count_substring_regex,
    including on a null row."""
    regex = "[1-9][a-z]"
    pa_strings = pa.array(["A1a2A3a4", "A1A2A3", None])

    plc_strings = plc.interop.from_arrow(pa_strings)
    prog = plc.strings.regex_program.RegexProgram.create(
        regex, plc.strings.regex_flags.RegexFlags.DEFAULT
    )
    got = plc.strings.contains.count_re(plc_strings, prog)

    want = pc.count_substring_regex(pa_strings, regex)
    assert_column_eq(got, want)


def test_match_re():
    """matches_re should agree with pyarrow's match_substring_regex when
    the pattern is anchored to the start of the string."""
    regex = "[1-9][a-z]"
    pa_strings = pa.array(["1a2b", "b1a2", None])

    plc_strings = plc.interop.from_arrow(pa_strings)
    prog = plc.strings.regex_program.RegexProgram.create(
        regex, plc.strings.regex_flags.RegexFlags.DEFAULT
    )
    got = plc.strings.contains.matches_re(plc_strings, prog)

    # The pyarrow reference is anchored with "^" for the comparison.
    want = pc.match_substring_regex(pa_strings, f"^{regex}")
    assert_column_eq(got, want)


def test_like():
    """like with a one-element pattern column should agree with pyarrow's
    match_like for the same pattern."""
    like_pattern = "%a"
    pa_strings = pa.array(["1a2aa3aaa"])

    plc_strings = plc.interop.from_arrow(pa_strings)
    plc_patterns = plc.interop.from_arrow(pa.array([like_pattern]))
    got = plc.strings.contains.like(plc_strings, plc_patterns)

    want = pc.match_like(pa_strings, like_pattern)
    assert_column_eq(got, want)
Loading