Merge branch 'branch-24.12' into pylibcudf/strings/wrap

rapidsai · Oct 1, 2024 · 2bbf84c
2 parents 4ce4f6a + dae9d68
commit 2bbf84c
Show file tree

Hide file tree

Showing 16 changed files with 427 additions and 212 deletions.
diff --git a/cpp/include/cudf/strings/char_types/char_types.hpp b/cpp/include/cudf/strings/char_types/char_types.hpp
@@ -30,7 +30,7 @@ namespace strings {
  */
 
 /**
- * @brief Returns a boolean column identifying strings entries in which all
+ * @brief Returns a boolean column identifying string entries where all
  * characters are of the type specified.
  *
  * The output row entry will be set to false if the corresponding string element
@@ -105,7 +105,8 @@ std::unique_ptr<column> all_characters_of_type(
  *        `types_to_remove` will be filtered.
  * @param mr Device memory resource used to allocate the returned column's device memory
  * @param stream CUDA stream used for device memory operations and kernel launches
- * @return New column of boolean results for each string
+ * @return New strings column with the characters of specified types filtered out and replaced by
+ * the specified replacement string
  */
 std::unique_ptr<column> filter_characters_of_type(
   strings_column_view const& input,

diff --git a/python/cudf/cudf/_lib/strings/char_types.pyx b/python/cudf/cudf/_lib/strings/char_types.pyx
@@ -1,50 +1,28 @@
 # Copyright (c) 2021-2024, NVIDIA CORPORATION.
 
-
 from libcpp cimport bool
-from libcpp.memory cimport unique_ptr
-from libcpp.utility cimport move
 
 from cudf.core.buffer import acquire_spill_lock
 
-from pylibcudf.libcudf.column.column cimport column
-from pylibcudf.libcudf.column.column_view cimport column_view
-from pylibcudf.libcudf.scalar.scalar cimport string_scalar
-from pylibcudf.libcudf.strings.char_types cimport (
-    all_characters_of_type as cpp_all_characters_of_type,
-    filter_characters_of_type as cpp_filter_characters_of_type,
-    string_character_types,
-)
-
 from cudf._lib.column cimport Column
-from cudf._lib.scalar cimport DeviceScalar
+
+from pylibcudf.strings import char_types
 
 
 @acquire_spill_lock()
 def filter_alphanum(Column source_strings, object py_repl, bool keep=True):
     """
     Returns a Column of strings keeping only alphanumeric character types.
     """
-
-    cdef DeviceScalar repl = py_repl.device_value
-
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-    cdef const string_scalar* scalar_repl = <const string_scalar*>(
-        repl.get_raw_ptr()
+    plc_column = char_types.filter_characters_of_type(
+        source_strings.to_pylibcudf(mode="read"),
+        char_types.StringCharacterTypes.ALL_TYPES if keep
+        else char_types.StringCharacterTypes.ALPHANUM,
+        py_repl.device_value.c_value,
+        char_types.StringCharacterTypes.ALPHANUM if keep
+        else char_types.StringCharacterTypes.ALL_TYPES
     )
-
-    with nogil:
-        c_result = move(cpp_filter_characters_of_type(
-            source_view,
-            string_character_types.ALL_TYPES if keep
-            else string_character_types.ALPHANUM,
-            scalar_repl[0],
-            string_character_types.ALPHANUM if keep
-            else string_character_types.ALL_TYPES
-        ))
-
-    return Column.from_unique_ptr(move(c_result))
+    return Column.from_pylibcudf(plc_column)
 
 
 @acquire_spill_lock()
@@ -54,17 +32,12 @@ def is_decimal(Column source_strings):
     that contain only decimal characters -- those that can be used
     to extract base10 numbers.
     """
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-
-    with nogil:
-        c_result = move(cpp_all_characters_of_type(
-            source_view,
-            string_character_types.DECIMAL,
-            string_character_types.ALL_TYPES
-        ))
-
-    return Column.from_unique_ptr(move(c_result))
+    plc_column = char_types.all_characters_of_type(
+        source_strings.to_pylibcudf(mode="read"),
+        char_types.StringCharacterTypes.DECIMAL,
+        char_types.StringCharacterTypes.ALL_TYPES
+    )
+    return Column.from_pylibcudf(plc_column)
 
 
 @acquire_spill_lock()
@@ -75,17 +48,12 @@ def is_alnum(Column source_strings):
 
     Equivalent to: is_alpha() or is_digit() or is_numeric() or is_decimal()
     """
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-
-    with nogil:
-        c_result = move(cpp_all_characters_of_type(
-            source_view,
-            string_character_types.ALPHANUM,
-            string_character_types.ALL_TYPES
-        ))
-
-    return Column.from_unique_ptr(move(c_result))
+    plc_column = char_types.all_characters_of_type(
+        source_strings.to_pylibcudf(mode="read"),
+        char_types.StringCharacterTypes.ALPHANUM,
+        char_types.StringCharacterTypes.ALL_TYPES
+    )
+    return Column.from_pylibcudf(plc_column)
 
 
 @acquire_spill_lock()
@@ -94,17 +62,12 @@ def is_alpha(Column source_strings):
     Returns a Column of boolean values with True for `source_strings`
     that contain only alphabetic characters.
     """
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-
-    with nogil:
-        c_result = move(cpp_all_characters_of_type(
-            source_view,
-            string_character_types.ALPHA,
-            string_character_types.ALL_TYPES
-        ))
-
-    return Column.from_unique_ptr(move(c_result))
+    plc_column = char_types.all_characters_of_type(
+        source_strings.to_pylibcudf(mode="read"),
+        char_types.StringCharacterTypes.ALPHA,
+        char_types.StringCharacterTypes.ALL_TYPES
+    )
+    return Column.from_pylibcudf(plc_column)
 
 
 @acquire_spill_lock()
@@ -113,17 +76,12 @@ def is_digit(Column source_strings):
     Returns a Column of boolean values with True for `source_strings`
     that contain only decimal and digit characters.
     """
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-
-    with nogil:
-        c_result = move(cpp_all_characters_of_type(
-            source_view,
-            string_character_types.DIGIT,
-            string_character_types.ALL_TYPES
-        ))
-
-    return Column.from_unique_ptr(move(c_result))
+    plc_column = char_types.all_characters_of_type(
+        source_strings.to_pylibcudf(mode="read"),
+        char_types.StringCharacterTypes.DIGIT,
+        char_types.StringCharacterTypes.ALL_TYPES
+    )
+    return Column.from_pylibcudf(plc_column)
 
 
 @acquire_spill_lock()
@@ -133,17 +91,12 @@ def is_numeric(Column source_strings):
     that contain only numeric characters. These include digit and
     numeric characters.
     """
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-
-    with nogil:
-        c_result = move(cpp_all_characters_of_type(
-            source_view,
-            string_character_types.NUMERIC,
-            string_character_types.ALL_TYPES
-        ))
-
-    return Column.from_unique_ptr(move(c_result))
+    plc_column = char_types.all_characters_of_type(
+        source_strings.to_pylibcudf(mode="read"),
+        char_types.StringCharacterTypes.NUMERIC,
+        char_types.StringCharacterTypes.ALL_TYPES
+    )
+    return Column.from_pylibcudf(plc_column)
 
 
 @acquire_spill_lock()
@@ -152,17 +105,12 @@ def is_upper(Column source_strings):
     Returns a Column of boolean values with True for `source_strings`
     that contain only upper-case characters.
     """
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-
-    with nogil:
-        c_result = move(cpp_all_characters_of_type(
-            source_view,
-            string_character_types.UPPER,
-            string_character_types.CASE_TYPES
-        ))
-
-    return Column.from_unique_ptr(move(c_result))
+    plc_column = char_types.all_characters_of_type(
+        source_strings.to_pylibcudf(mode="read"),
+        char_types.StringCharacterTypes.UPPER,
+        char_types.StringCharacterTypes.CASE_TYPES
+    )
+    return Column.from_pylibcudf(plc_column)
 
 
 @acquire_spill_lock()
@@ -171,17 +119,12 @@ def is_lower(Column source_strings):
     Returns a Column of boolean values with True for `source_strings`
     that contain only lower-case characters.
     """
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-
-    with nogil:
-        c_result = move(cpp_all_characters_of_type(
-            source_view,
-            string_character_types.LOWER,
-            string_character_types.CASE_TYPES
-        ))
-
-    return Column.from_unique_ptr(move(c_result))
+    plc_column = char_types.all_characters_of_type(
+        source_strings.to_pylibcudf(mode="read"),
+        char_types.StringCharacterTypes.LOWER,
+        char_types.StringCharacterTypes.CASE_TYPES
+    )
+    return Column.from_pylibcudf(plc_column)
 
 
 @acquire_spill_lock()
@@ -190,14 +133,9 @@ def is_space(Column source_strings):
     Returns a Column of boolean values with True for `source_strings`
     that contain only whitespace characters.
     """
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-
-    with nogil:
-        c_result = move(cpp_all_characters_of_type(
-            source_view,
-            string_character_types.SPACE,
-            string_character_types.ALL_TYPES
-        ))
-
-    return Column.from_unique_ptr(move(c_result))
+    plc_column = char_types.all_characters_of_type(
+        source_strings.to_pylibcudf(mode="read"),
+        char_types.StringCharacterTypes.SPACE,
+        char_types.StringCharacterTypes.ALL_TYPES
+    )
+    return Column.from_pylibcudf(plc_column)
diff --git a/python/cudf/cudf/_lib/strings/translate.pyx b/python/cudf/cudf/_lib/strings/translate.pyx
@@ -1,25 +1,12 @@
 # Copyright (c) 2018-2024, NVIDIA CORPORATION.
 
 from libcpp cimport bool
-from libcpp.memory cimport unique_ptr
-from libcpp.pair cimport pair
-from libcpp.utility cimport move
-from libcpp.vector cimport vector
 
 from cudf.core.buffer import acquire_spill_lock
 
-from pylibcudf.libcudf.column.column cimport column
-from pylibcudf.libcudf.column.column_view cimport column_view
-from pylibcudf.libcudf.scalar.scalar cimport string_scalar
-from pylibcudf.libcudf.strings.translate cimport (
-    filter_characters as cpp_filter_characters,
-    filter_type,
-    translate as cpp_translate,
-)
-from pylibcudf.libcudf.types cimport char_utf8
-
 from cudf._lib.column cimport Column
-from cudf._lib.scalar cimport DeviceScalar
+
+import pylibcudf as plc
 
 
 @acquire_spill_lock()
@@ -29,30 +16,11 @@ def translate(Column source_strings,
     Translates individual characters within each string
     if present in the mapping_table.
     """
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-    cdef int table_size
-    table_size = len(mapping_table)
-
-    cdef vector[pair[char_utf8, char_utf8]] c_mapping_table
-    c_mapping_table.reserve(table_size)
-
-    for key in mapping_table:
-        value = mapping_table[key]
-        if type(value) is int:
-            value = chr(value)
-        if type(value) is str:
-            value = int.from_bytes(value.encode(), byteorder='big')
-        if type(key) is int:
-            key = chr(key)
-        if type(key) is str:
-            key = int.from_bytes(key.encode(), byteorder='big')
-        c_mapping_table.push_back((key, value))
-
-    with nogil:
-        c_result = move(cpp_translate(source_view, c_mapping_table))
-
-    return Column.from_unique_ptr(move(c_result))
+    plc_result = plc.strings.translate.translate(
+        source_strings.to_pylibcudf(mode="read"),
+        mapping_table,
+    )
+    return Column.from_pylibcudf(plc_result)
 
 
 @acquire_spill_lock()
@@ -64,44 +32,11 @@ def filter_characters(Column source_strings,
     Removes or keeps individual characters within each string
     using the provided mapping_table.
     """
-
-    cdef DeviceScalar repl = py_repl.device_value
-
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-    cdef const string_scalar* scalar_repl = <const string_scalar*>(
-        repl.get_raw_ptr()
+    plc_result = plc.strings.translate.filter_characters(
+        source_strings.to_pylibcudf(mode="read"),
+        mapping_table,
+        plc.strings.translate.FilterType.KEEP
+        if keep else plc.strings.translate.FilterType.REMOVE,
+        py_repl.device_value.c_value
     )
-    cdef int table_size
-    table_size = len(mapping_table)
-
-    cdef vector[pair[char_utf8, char_utf8]] c_mapping_table
-    c_mapping_table.reserve(table_size)
-
-    for key in mapping_table:
-        value = mapping_table[key]
-        if type(value) is int:
-            value = chr(value)
-        if type(value) is str:
-            value = int.from_bytes(value.encode(), byteorder='big')
-        if type(key) is int:
-            key = chr(key)
-        if type(key) is str:
-            key = int.from_bytes(key.encode(), byteorder='big')
-        c_mapping_table.push_back((key, value))
-
-    cdef filter_type c_keep
-    if keep is True:
-        c_keep = filter_type.KEEP
-    else:
-        c_keep = filter_type.REMOVE
-
-    with nogil:
-        c_result = move(cpp_filter_characters(
-            source_view,
-            c_mapping_table,
-            c_keep,
-            scalar_repl[0]
-        ))
-
-    return Column.from_unique_ptr(move(c_result))
+    return Column.from_pylibcudf(plc_result)
diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/CMakeLists.txt b/python/pylibcudf/pylibcudf/libcudf/strings/CMakeLists.txt
@@ -12,7 +12,7 @@
 # the License.
 # =============================================================================
 
-set(cython_sources char_types.pyx regex_flags.pyx side_type.pyx)
+set(cython_sources char_types.pyx regex_flags.pyx side_type.pyx translate.pyx)
 
 set(linked_libraries cudf::cudf)