Skip to content

Commit

Permalink
Merge branch 'branch-24.12' into pylibcudf/strings/wrap
Browse files Browse the repository at this point in the history
  • Loading branch information
mroeschke authored Oct 1, 2024
2 parents 4ce4f6a + dae9d68 commit 2bbf84c
Show file tree
Hide file tree
Showing 16 changed files with 427 additions and 212 deletions.
5 changes: 3 additions & 2 deletions cpp/include/cudf/strings/char_types/char_types.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ namespace strings {
*/

/**
* @brief Returns a boolean column identifying strings entries in which all
* @brief Returns a boolean column identifying string entries where all
* characters are of the type specified.
*
* The output row entry will be set to false if the corresponding string element
Expand Down Expand Up @@ -105,7 +105,8 @@ std::unique_ptr<column> all_characters_of_type(
* `types_to_remove` will be filtered.
* @param mr Device memory resource used to allocate the returned column's device memory
* @param stream CUDA stream used for device memory operations and kernel launches
* @return New column of boolean results for each string
* @return New strings column with the characters of specified types filtered out and replaced by
* the specified replacement string
*/
std::unique_ptr<column> filter_characters_of_type(
strings_column_view const& input,
Expand Down
178 changes: 58 additions & 120 deletions python/cudf/cudf/_lib/strings/char_types.pyx
Original file line number Diff line number Diff line change
@@ -1,50 +1,28 @@
# Copyright (c) 2021-2024, NVIDIA CORPORATION.


from libcpp cimport bool
from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move

from cudf.core.buffer import acquire_spill_lock

from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.column.column_view cimport column_view
from pylibcudf.libcudf.scalar.scalar cimport string_scalar
from pylibcudf.libcudf.strings.char_types cimport (
all_characters_of_type as cpp_all_characters_of_type,
filter_characters_of_type as cpp_filter_characters_of_type,
string_character_types,
)

from cudf._lib.column cimport Column
from cudf._lib.scalar cimport DeviceScalar

from pylibcudf.strings import char_types


@acquire_spill_lock()
def filter_alphanum(Column source_strings, object py_repl, bool keep=True):
"""
Returns a Column of strings keeping only alphanumeric character types.
"""

cdef DeviceScalar repl = py_repl.device_value

cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()
cdef const string_scalar* scalar_repl = <const string_scalar*>(
repl.get_raw_ptr()
plc_column = char_types.filter_characters_of_type(
source_strings.to_pylibcudf(mode="read"),
char_types.StringCharacterTypes.ALL_TYPES if keep
else char_types.StringCharacterTypes.ALPHANUM,
py_repl.device_value.c_value,
char_types.StringCharacterTypes.ALPHANUM if keep
else char_types.StringCharacterTypes.ALL_TYPES
)

with nogil:
c_result = move(cpp_filter_characters_of_type(
source_view,
string_character_types.ALL_TYPES if keep
else string_character_types.ALPHANUM,
scalar_repl[0],
string_character_types.ALPHANUM if keep
else string_character_types.ALL_TYPES
))

return Column.from_unique_ptr(move(c_result))
return Column.from_pylibcudf(plc_column)


@acquire_spill_lock()
Expand All @@ -54,17 +32,12 @@ def is_decimal(Column source_strings):
that contain only decimal characters -- those that can be used
to extract base10 numbers.
"""
cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

with nogil:
c_result = move(cpp_all_characters_of_type(
source_view,
string_character_types.DECIMAL,
string_character_types.ALL_TYPES
))

return Column.from_unique_ptr(move(c_result))
plc_column = char_types.all_characters_of_type(
source_strings.to_pylibcudf(mode="read"),
char_types.StringCharacterTypes.DECIMAL,
char_types.StringCharacterTypes.ALL_TYPES
)
return Column.from_pylibcudf(plc_column)


@acquire_spill_lock()
Expand All @@ -75,17 +48,12 @@ def is_alnum(Column source_strings):
Equivalent to: is_alpha() or is_digit() or is_numeric() or is_decimal()
"""
cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

with nogil:
c_result = move(cpp_all_characters_of_type(
source_view,
string_character_types.ALPHANUM,
string_character_types.ALL_TYPES
))

return Column.from_unique_ptr(move(c_result))
plc_column = char_types.all_characters_of_type(
source_strings.to_pylibcudf(mode="read"),
char_types.StringCharacterTypes.ALPHANUM,
char_types.StringCharacterTypes.ALL_TYPES
)
return Column.from_pylibcudf(plc_column)


@acquire_spill_lock()
Expand All @@ -94,17 +62,12 @@ def is_alpha(Column source_strings):
Returns a Column of boolean values with True for `source_strings`
that contain only alphabetic characters.
"""
cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

with nogil:
c_result = move(cpp_all_characters_of_type(
source_view,
string_character_types.ALPHA,
string_character_types.ALL_TYPES
))

return Column.from_unique_ptr(move(c_result))
plc_column = char_types.all_characters_of_type(
source_strings.to_pylibcudf(mode="read"),
char_types.StringCharacterTypes.ALPHA,
char_types.StringCharacterTypes.ALL_TYPES
)
return Column.from_pylibcudf(plc_column)


@acquire_spill_lock()
Expand All @@ -113,17 +76,12 @@ def is_digit(Column source_strings):
Returns a Column of boolean values with True for `source_strings`
that contain only decimal and digit characters.
"""
cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

with nogil:
c_result = move(cpp_all_characters_of_type(
source_view,
string_character_types.DIGIT,
string_character_types.ALL_TYPES
))

return Column.from_unique_ptr(move(c_result))
plc_column = char_types.all_characters_of_type(
source_strings.to_pylibcudf(mode="read"),
char_types.StringCharacterTypes.DIGIT,
char_types.StringCharacterTypes.ALL_TYPES
)
return Column.from_pylibcudf(plc_column)


@acquire_spill_lock()
Expand All @@ -133,17 +91,12 @@ def is_numeric(Column source_strings):
that contain only numeric characters. These include digit and
numeric characters.
"""
cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

with nogil:
c_result = move(cpp_all_characters_of_type(
source_view,
string_character_types.NUMERIC,
string_character_types.ALL_TYPES
))

return Column.from_unique_ptr(move(c_result))
plc_column = char_types.all_characters_of_type(
source_strings.to_pylibcudf(mode="read"),
char_types.StringCharacterTypes.NUMERIC,
char_types.StringCharacterTypes.ALL_TYPES
)
return Column.from_pylibcudf(plc_column)


@acquire_spill_lock()
Expand All @@ -152,17 +105,12 @@ def is_upper(Column source_strings):
Returns a Column of boolean values with True for `source_strings`
that contain only upper-case characters.
"""
cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

with nogil:
c_result = move(cpp_all_characters_of_type(
source_view,
string_character_types.UPPER,
string_character_types.CASE_TYPES
))

return Column.from_unique_ptr(move(c_result))
plc_column = char_types.all_characters_of_type(
source_strings.to_pylibcudf(mode="read"),
char_types.StringCharacterTypes.UPPER,
char_types.StringCharacterTypes.CASE_TYPES
)
return Column.from_pylibcudf(plc_column)


@acquire_spill_lock()
Expand All @@ -171,17 +119,12 @@ def is_lower(Column source_strings):
Returns a Column of boolean values with True for `source_strings`
that contain only lower-case characters.
"""
cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

with nogil:
c_result = move(cpp_all_characters_of_type(
source_view,
string_character_types.LOWER,
string_character_types.CASE_TYPES
))

return Column.from_unique_ptr(move(c_result))
plc_column = char_types.all_characters_of_type(
source_strings.to_pylibcudf(mode="read"),
char_types.StringCharacterTypes.LOWER,
char_types.StringCharacterTypes.CASE_TYPES
)
return Column.from_pylibcudf(plc_column)


@acquire_spill_lock()
Expand All @@ -190,14 +133,9 @@ def is_space(Column source_strings):
Returns a Column of boolean values with True for `source_strings`
that contain only space characters.
"""
cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

with nogil:
c_result = move(cpp_all_characters_of_type(
source_view,
string_character_types.SPACE,
string_character_types.ALL_TYPES
))

return Column.from_unique_ptr(move(c_result))
plc_column = char_types.all_characters_of_type(
source_strings.to_pylibcudf(mode="read"),
char_types.StringCharacterTypes.SPACE,
char_types.StringCharacterTypes.ALL_TYPES
)
return Column.from_pylibcudf(plc_column)
93 changes: 14 additions & 79 deletions python/cudf/cudf/_lib/strings/translate.pyx
Original file line number Diff line number Diff line change
@@ -1,25 +1,12 @@
# Copyright (c) 2018-2024, NVIDIA CORPORATION.

from libcpp cimport bool
from libcpp.memory cimport unique_ptr
from libcpp.pair cimport pair
from libcpp.utility cimport move
from libcpp.vector cimport vector

from cudf.core.buffer import acquire_spill_lock

from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.column.column_view cimport column_view
from pylibcudf.libcudf.scalar.scalar cimport string_scalar
from pylibcudf.libcudf.strings.translate cimport (
filter_characters as cpp_filter_characters,
filter_type,
translate as cpp_translate,
)
from pylibcudf.libcudf.types cimport char_utf8

from cudf._lib.column cimport Column
from cudf._lib.scalar cimport DeviceScalar

import pylibcudf as plc


@acquire_spill_lock()
Expand All @@ -29,30 +16,11 @@ def translate(Column source_strings,
Translates individual characters within each string
if present in the mapping_table.
"""
cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()
cdef int table_size
table_size = len(mapping_table)

cdef vector[pair[char_utf8, char_utf8]] c_mapping_table
c_mapping_table.reserve(table_size)

for key in mapping_table:
value = mapping_table[key]
if type(value) is int:
value = chr(value)
if type(value) is str:
value = int.from_bytes(value.encode(), byteorder='big')
if type(key) is int:
key = chr(key)
if type(key) is str:
key = int.from_bytes(key.encode(), byteorder='big')
c_mapping_table.push_back((key, value))

with nogil:
c_result = move(cpp_translate(source_view, c_mapping_table))

return Column.from_unique_ptr(move(c_result))
plc_result = plc.strings.translate.translate(
source_strings.to_pylibcudf(mode="read"),
mapping_table,
)
return Column.from_pylibcudf(plc_result)


@acquire_spill_lock()
Expand All @@ -64,44 +32,11 @@ def filter_characters(Column source_strings,
Removes or keeps individual characters within each string
using the provided mapping_table.
"""

cdef DeviceScalar repl = py_repl.device_value

cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()
cdef const string_scalar* scalar_repl = <const string_scalar*>(
repl.get_raw_ptr()
plc_result = plc.strings.translate.filter_characters(
source_strings.to_pylibcudf(mode="read"),
mapping_table,
plc.strings.translate.FilterType.KEEP
if keep else plc.strings.translate.FilterType.REMOVE,
py_repl.device_value.c_value
)
cdef int table_size
table_size = len(mapping_table)

cdef vector[pair[char_utf8, char_utf8]] c_mapping_table
c_mapping_table.reserve(table_size)

for key in mapping_table:
value = mapping_table[key]
if type(value) is int:
value = chr(value)
if type(value) is str:
value = int.from_bytes(value.encode(), byteorder='big')
if type(key) is int:
key = chr(key)
if type(key) is str:
key = int.from_bytes(key.encode(), byteorder='big')
c_mapping_table.push_back((key, value))

cdef filter_type c_keep
if keep is True:
c_keep = filter_type.KEEP
else:
c_keep = filter_type.REMOVE

with nogil:
c_result = move(cpp_filter_characters(
source_view,
c_mapping_table,
c_keep,
scalar_repl[0]
))

return Column.from_unique_ptr(move(c_result))
return Column.from_pylibcudf(plc_result)
2 changes: 1 addition & 1 deletion python/pylibcudf/pylibcudf/libcudf/strings/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
# the License.
# =============================================================================

set(cython_sources char_types.pyx regex_flags.pyx side_type.pyx)
set(cython_sources char_types.pyx regex_flags.pyx side_type.pyx translate.pyx)

set(linked_libraries cudf::cudf)

Expand Down
Loading

0 comments on commit 2bbf84c

Please sign in to comment.