Skip to content

Commit

Permalink
Add string.translate APIs to pylibcudf (#16934)
Browse files Browse the repository at this point in the history
Contributes to #15162

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: #16934
  • Loading branch information
mroeschke authored Oct 1, 2024
1 parent e46437c commit dae9d68
Show file tree
Hide file tree
Showing 10 changed files with 232 additions and 87 deletions.
93 changes: 14 additions & 79 deletions python/cudf/cudf/_lib/strings/translate.pyx
Original file line number Diff line number Diff line change
@@ -1,25 +1,12 @@
# Copyright (c) 2018-2024, NVIDIA CORPORATION.

from libcpp cimport bool
from libcpp.memory cimport unique_ptr
from libcpp.pair cimport pair
from libcpp.utility cimport move
from libcpp.vector cimport vector

from cudf.core.buffer import acquire_spill_lock

from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.column.column_view cimport column_view
from pylibcudf.libcudf.scalar.scalar cimport string_scalar
from pylibcudf.libcudf.strings.translate cimport (
filter_characters as cpp_filter_characters,
filter_type,
translate as cpp_translate,
)
from pylibcudf.libcudf.types cimport char_utf8

from cudf._lib.column cimport Column
from cudf._lib.scalar cimport DeviceScalar

import pylibcudf as plc


@acquire_spill_lock()
Expand All @@ -29,30 +16,11 @@ def translate(Column source_strings,
Translates individual characters within each string
if present in the mapping_table.
"""
cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()
cdef int table_size
table_size = len(mapping_table)

cdef vector[pair[char_utf8, char_utf8]] c_mapping_table
c_mapping_table.reserve(table_size)

for key in mapping_table:
value = mapping_table[key]
if type(value) is int:
value = chr(value)
if type(value) is str:
value = int.from_bytes(value.encode(), byteorder='big')
if type(key) is int:
key = chr(key)
if type(key) is str:
key = int.from_bytes(key.encode(), byteorder='big')
c_mapping_table.push_back((key, value))

with nogil:
c_result = move(cpp_translate(source_view, c_mapping_table))

return Column.from_unique_ptr(move(c_result))
plc_result = plc.strings.translate.translate(
source_strings.to_pylibcudf(mode="read"),
mapping_table,
)
return Column.from_pylibcudf(plc_result)


@acquire_spill_lock()
Expand All @@ -64,44 +32,11 @@ def filter_characters(Column source_strings,
Removes or keeps individual characters within each string
using the provided mapping_table.
"""

cdef DeviceScalar repl = py_repl.device_value

cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()
cdef const string_scalar* scalar_repl = <const string_scalar*>(
repl.get_raw_ptr()
plc_result = plc.strings.translate.filter_characters(
source_strings.to_pylibcudf(mode="read"),
mapping_table,
plc.strings.translate.FilterType.KEEP
if keep else plc.strings.translate.FilterType.REMOVE,
py_repl.device_value.c_value
)
cdef int table_size
table_size = len(mapping_table)

cdef vector[pair[char_utf8, char_utf8]] c_mapping_table
c_mapping_table.reserve(table_size)

for key in mapping_table:
value = mapping_table[key]
if type(value) is int:
value = chr(value)
if type(value) is str:
value = int.from_bytes(value.encode(), byteorder='big')
if type(key) is int:
key = chr(key)
if type(key) is str:
key = int.from_bytes(key.encode(), byteorder='big')
c_mapping_table.push_back((key, value))

cdef filter_type c_keep
if keep is True:
c_keep = filter_type.KEEP
else:
c_keep = filter_type.REMOVE

with nogil:
c_result = move(cpp_filter_characters(
source_view,
c_mapping_table,
c_keep,
scalar_repl[0]
))

return Column.from_unique_ptr(move(c_result))
return Column.from_pylibcudf(plc_result)
2 changes: 1 addition & 1 deletion python/pylibcudf/pylibcudf/libcudf/strings/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
# the License.
# =============================================================================

set(cython_sources char_types.pyx regex_flags.pyx side_type.pyx)
set(cython_sources char_types.pyx regex_flags.pyx side_type.pyx translate.pyx)

set(linked_libraries cudf::cudf)

Expand Down
14 changes: 7 additions & 7 deletions python/pylibcudf/pylibcudf/libcudf/strings/translate.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,15 @@ from pylibcudf.libcudf.types cimport char_utf8
cdef extern from "cudf/strings/translate.hpp" namespace "cudf::strings" nogil:

cdef unique_ptr[column] translate(
column_view source_strings,
column_view input,
vector[pair[char_utf8, char_utf8]] chars_table) except +

ctypedef enum filter_type:
KEEP 'cudf::strings::filter_type::KEEP',
REMOVE 'cudf::strings::filter_type::REMOVE'
cpdef enum class filter_type(bool):
KEEP
REMOVE

cdef unique_ptr[column] filter_characters(
column_view source_strings,
vector[pair[char_utf8, char_utf8]] chars_table,
filter_type keep,
column_view input,
vector[pair[char_utf8, char_utf8]] characters_to_filter,
filter_type keep_characters,
string_scalar replacement) except +
Empty file.
1 change: 1 addition & 0 deletions python/pylibcudf/pylibcudf/strings/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ set(cython_sources
side_type.pyx
slice.pyx
strip.pyx
translate.pyx
)

set(linked_libraries cudf::cudf)
Expand Down
2 changes: 2 additions & 0 deletions python/pylibcudf/pylibcudf/strings/__init__.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ from . cimport (
replace,
slice,
strip,
translate,
)
from .side_type cimport side_type

Expand All @@ -34,4 +35,5 @@ __all__ = [
"slice",
"strip",
"side_type",
"translate",
]
2 changes: 2 additions & 0 deletions python/pylibcudf/pylibcudf/strings/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
replace,
slice,
strip,
translate,
)
from .side_type import SideType

Expand All @@ -35,4 +36,5 @@
"slice",
"strip",
"SideType",
"translate",
]
14 changes: 14 additions & 0 deletions python/pylibcudf/pylibcudf/strings/translate.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Copyright (c) 2024, NVIDIA CORPORATION.
from pylibcudf.column cimport Column
from pylibcudf.libcudf.strings.translate cimport filter_type
from pylibcudf.scalar cimport Scalar


cpdef Column translate(Column input, dict chars_table)

cpdef Column filter_characters(
Column input,
dict characters_to_filter,
filter_type keep_characters,
Scalar replacement
)
122 changes: 122 additions & 0 deletions python/pylibcudf/pylibcudf/strings/translate.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
# Copyright (c) 2024, NVIDIA CORPORATION.
from libcpp.memory cimport unique_ptr
from libcpp.pair cimport pair
from libcpp.utility cimport move
from libcpp.vector cimport vector
from pylibcudf.column cimport Column
from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.scalar.scalar cimport string_scalar
from pylibcudf.libcudf.strings cimport translate as cpp_translate
from pylibcudf.libcudf.types cimport char_utf8
from pylibcudf.scalar cimport Scalar

from cython.operator import dereference
from pylibcudf.libcudf.strings.translate import \
filter_type as FilterType # no-cython-lint


cdef vector[pair[char_utf8, char_utf8]] _table_to_c_table(dict table):
"""
Convert str.maketrans table to cudf compatible table.
"""
cdef int table_size = len(table)
cdef vector[pair[char_utf8, char_utf8]] c_table

c_table.reserve(table_size)
for key, value in table.items():
if isinstance(value, int):
value = chr(value)
if isinstance(value, str):
value = int.from_bytes(value.encode(), byteorder='big')
if isinstance(key, int):
key = chr(key)
if isinstance(key, str):
key = int.from_bytes(key.encode(), byteorder='big')
c_table.push_back((key, value))

return c_table


cpdef Column translate(Column input, dict chars_table):
"""
Translates individual characters within each string.
For details, see :cpp:func:`cudf::strings::translate`.
Parameters
----------
input : Column
Strings instance for this operation
chars_table : dict
Table of UTF-8 character mappings
Returns
-------
Column
New column with padded strings.
"""
cdef unique_ptr[column] c_result
cdef vector[pair[char_utf8, char_utf8]] c_chars_table = _table_to_c_table(
chars_table
)

with nogil:
c_result = move(
cpp_translate.translate(
input.view(),
c_chars_table
)
)
return Column.from_libcudf(move(c_result))


cpdef Column filter_characters(
Column input,
dict characters_to_filter,
filter_type keep_characters,
Scalar replacement
):
"""
Removes ranges of characters from each string in a strings column.
For details, see :cpp:func:`cudf::strings::filter_characters`.
Parameters
----------
input : Column
Strings instance for this operation
characters_to_filter : dict
Table of character ranges to filter on
keep_characters : FilterType
If true, the `characters_to_filter` are retained
and all other characters are removed.
replacement : Scalar
Replacement string for each character removed.
Returns
-------
Column
New column with filtered strings.
"""
cdef unique_ptr[column] c_result
cdef vector[pair[char_utf8, char_utf8]] c_characters_to_filter = _table_to_c_table(
characters_to_filter
)
cdef const string_scalar* c_replacement = <const string_scalar*>(
replacement.c_obj.get()
)

with nogil:
c_result = move(
cpp_translate.filter_characters(
input.view(),
c_characters_to_filter,
keep_characters,
dereference(c_replacement),
)
)
return Column.from_libcudf(move(c_result))
69 changes: 69 additions & 0 deletions python/pylibcudf/pylibcudf/tests/test_string_translate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

import pyarrow as pa
import pylibcudf as plc
import pytest
from utils import assert_column_eq


@pytest.fixture
def data_col():
pa_data_col = pa.array(
["aa", "bbb", "cccc", "abcd", None],
type=pa.string(),
)
return pa_data_col, plc.interop.from_arrow(pa_data_col)


@pytest.fixture
def trans_table():
return str.maketrans("abd", "A Q")


def test_translate(data_col, trans_table):
pa_array, plc_col = data_col
result = plc.strings.translate.translate(plc_col, trans_table)
expected = pa.array(
[
val.translate(trans_table) if isinstance(val, str) else None
for val in pa_array.to_pylist()
]
)
assert_column_eq(expected, result)


@pytest.mark.parametrize(
"keep",
[
plc.strings.translate.FilterType.KEEP,
plc.strings.translate.FilterType.REMOVE,
],
)
def test_filter_characters(data_col, trans_table, keep):
pa_array, plc_col = data_col
result = plc.strings.translate.filter_characters(
plc_col, trans_table, keep, plc.interop.from_arrow(pa.scalar("*"))
)
exp_data = []
flat_trans = set(trans_table.keys()).union(trans_table.values())
for val in pa_array.to_pylist():
if not isinstance(val, str):
exp_data.append(val)
else:
new_val = ""
for ch in val:
if (
ch in flat_trans
and keep == plc.strings.translate.FilterType.KEEP
):
new_val += ch
elif (
ch not in flat_trans
and keep == plc.strings.translate.FilterType.REMOVE
):
new_val += ch
else:
new_val += "*"
exp_data.append(new_val)
expected = pa.array(exp_data)
assert_column_eq(expected, result)

0 comments on commit dae9d68

Please sign in to comment.