From 52abc0e2a3f36328e3b170fea54000fbaee69851 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 3 Oct 2024 13:05:34 -0700 Subject: [PATCH] Add string.convert.convert_integers APIs to pylibcudf --- python/cudf/cudf/_lib/string_casting.pyx | 83 ++----- .../strings/convert/convert_integers.pxd | 17 +- .../pylibcudf/strings/convert/CMakeLists.txt | 2 +- .../pylibcudf/strings/convert/__init__.pxd | 2 +- .../pylibcudf/strings/convert/__init__.py | 2 +- .../strings/convert/convert_integers.pxd | 17 ++ .../strings/convert/convert_integers.pyx | 206 ++++++++++++++++++ .../tests/test_string_convert_integers.py | 69 ++++++ 8 files changed, 327 insertions(+), 71 deletions(-) create mode 100644 python/pylibcudf/pylibcudf/strings/convert/convert_integers.pxd create mode 100644 python/pylibcudf/pylibcudf/strings/convert/convert_integers.pyx create mode 100644 python/pylibcudf/pylibcudf/tests/test_string_convert_integers.py diff --git a/python/cudf/cudf/_lib/string_casting.pyx b/python/cudf/cudf/_lib/string_casting.pyx index 60a6795a402..59bf40443f4 100644 --- a/python/cudf/cudf/_lib/string_casting.pyx +++ b/python/cudf/cudf/_lib/string_casting.pyx @@ -26,13 +26,6 @@ from pylibcudf.libcudf.strings.convert.convert_floats cimport ( from_floats as cpp_from_floats, to_floats as cpp_to_floats, ) -from pylibcudf.libcudf.strings.convert.convert_integers cimport ( - from_integers as cpp_from_integers, - hex_to_integers as cpp_hex_to_integers, - integers_to_hex as cpp_integers_to_hex, - is_hex as cpp_is_hex, - to_integers as cpp_to_integers, -) from pylibcudf.libcudf.strings.convert.convert_ipv4 cimport ( integers_to_ipv4 as cpp_integers_to_ipv4, ipv4_to_integers as cpp_ipv4_to_integers, @@ -143,32 +136,18 @@ def stof(Column input_col): def integer_to_string(Column input_col): - cdef column_view input_column_view = input_col.view() - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_from_integers( - 
input_column_view)) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_integers.from_integers( + input_col.to_pylibcudf(mode="read"), + ) + return Column.from_pylibcudf(plc_column) def string_to_integer(Column input_col, object out_type): - cdef column_view input_column_view = input_col.view() - cdef unique_ptr[column] c_result - cdef type_id tid = ( - ( - SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[out_type] - ) + plc_column = plc.strings.convert.convert_integers.to_integers( + input_col.to_pylibcudf(mode="read"), + dtype_to_pylibcudf_type(out_type) ) - cdef data_type c_out_type = data_type(tid) - with nogil: - c_result = move( - cpp_to_integers( - input_column_view, - c_out_type)) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf(plc_column) def i8tos(Column input_col): @@ -696,7 +675,7 @@ def is_ipv4(Column source_strings): return Column.from_unique_ptr(move(c_result)) -def htoi(Column input_col, **kwargs): +def htoi(Column input_col): """ Converting input column of type string having hex values to integer of out_type @@ -709,22 +688,11 @@ def htoi(Column input_col, **kwargs): ------- A Column of integers parsed from hexadecimal string values. """ - - cdef column_view input_column_view = input_col.view() - cdef type_id tid = ( - ( - SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[cudf.dtype("int64")] - ) + plc_column = plc.strings.convert.convert_integers.hex_to_integers( + input_col.to_pylibcudf(mode="read"), + dtype_to_pylibcudf_type(cudf.dtype("int64")) ) - cdef data_type c_out_type = data_type(tid) - - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_hex_to_integers(input_column_view, - c_out_type)) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf(plc_column) def is_hex(Column source_strings): @@ -732,15 +700,10 @@ def is_hex(Column source_strings): Returns a Column of boolean values with True for `source_strings` that have hex characters. 
""" - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_is_hex( - source_view - )) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_integers.is_hex( + source_strings.to_pylibcudf(mode="read"), + ) + return Column.from_pylibcudf(plc_column) def itoh(Column input_col): @@ -756,11 +719,7 @@ def itoh(Column input_col): ------- A Column of strings with hexadecimal characters. """ - - cdef column_view input_column_view = input_col.view() - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_integers_to_hex(input_column_view)) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_integers.integers_to_hex( + input_col.to_pylibcudf(mode="read"), + ) + return Column.from_pylibcudf(plc_column) diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_integers.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_integers.pxd index f12aab0a2e4..63d0dd13a60 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_integers.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_integers.pxd @@ -9,23 +9,28 @@ from pylibcudf.libcudf.types cimport data_type cdef extern from "cudf/strings/convert/convert_integers.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[column] to_integers( - column_view input_col, + column_view input, data_type output_type) except + cdef unique_ptr[column] from_integers( - column_view input_col) except + + column_view integers) except + cdef unique_ptr[column] is_integer( - column_view source_strings + column_view input + ) except + + + cdef unique_ptr[column] is_integer( + column_view input, + data_type int_type ) except + cdef unique_ptr[column] hex_to_integers( - column_view input_col, + column_view input, data_type output_type) except + cdef unique_ptr[column] is_hex( - column_view source_strings + column_view input ) 
except + cdef unique_ptr[column] integers_to_hex( - column_view input_col) except + + column_view input) except + diff --git a/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt index 175c9b3738e..cfea23e302a 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt @@ -12,7 +12,7 @@ # the License. # ============================================================================= -set(cython_sources convert_durations.pyx convert_datetime.pyx) +set(cython_sources convert_durations.pyx convert_datetime.pyx convert_integers.pyx) set(linked_libraries cudf::cudf) rapids_cython_create_modules( diff --git a/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd b/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd index 05324cb49df..791980aab34 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd +++ b/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd @@ -1,2 +1,2 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . cimport convert_datetime, convert_durations +from . cimport convert_datetime, convert_durations, convert_integers diff --git a/python/pylibcudf/pylibcudf/strings/convert/__init__.py b/python/pylibcudf/pylibcudf/strings/convert/__init__.py index d803399d53c..fbed5a4e1f2 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/__init__.py +++ b/python/pylibcudf/pylibcudf/strings/convert/__init__.py @@ -1,2 +1,2 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . import convert_datetime, convert_durations +from . 
import convert_datetime, convert_durations, convert_integers diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pxd new file mode 100644 index 00000000000..eff2e080c27 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pxd @@ -0,0 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column +from pylibcudf.types cimport DataType + + +cpdef Column to_integers(Column input, DataType output_type) + +cpdef Column from_integers(Column integers) + +cpdef Column is_integer(Column input, DataType int_type=*) + +cpdef Column hex_to_integers(Column input, DataType output_type) + +cpdef Column is_hex(Column input) + +cpdef Column integers_to_hex(Column input) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pyx new file mode 100644 index 00000000000..5558683a502 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pyx @@ -0,0 +1,206 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.strings.convert cimport ( + convert_integers as cpp_convert_integers, +) +from pylibcudf.types cimport DataType + + +cpdef Column to_integers(Column input, DataType output_type): + """ + Returns a new integer numeric column parsing integer values from the + provided strings column. + + For details, see :cpp:func:`cudf::strings::to_integers`. + + Parameters + ---------- + input : Column + Strings instance for this operation. + + output_type : DataType + Type of integer numeric column to return. + + Returns + ------- + Column + New column with integers converted from strings.
+ """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_convert_integers.to_integers( + input.view(), + output_type.c_obj + ) + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column from_integers(Column integers): + """ + Returns a new strings column converting the integer values from the + provided column into strings. + + For details, see :cpp:func:`cudf::strings::from_integers`. + + Parameters + ---------- + integers : Column + Integer column to convert to strings. + + Returns + ------- + Column + New strings column with integers as strings. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_convert_integers.from_integers( + integers.view(), + ) + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column is_integer(Column input, DataType int_type=None): + """ + Returns a boolean column identifying strings in which all + characters are valid for conversion to integers. + + For details, see :cpp:func:`cudf::strings::is_integer`. + + Parameters + ---------- + input : Column + Strings instance for this operation. + + int_type : DataType + Integer type used for checking underflow and overflow. + By default, does not check an integer type for underflow + or overflow. + + Returns + ------- + Column + New column of boolean results for each string. + """ + cdef unique_ptr[column] c_result + + if int_type is None: + with nogil: + c_result = move( + cpp_convert_integers.is_integer( + input.view(), + ) + ) + else: + with nogil: + c_result = move( + cpp_convert_integers.is_integer( + input.view(), + int_type.c_obj + ) + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column hex_to_integers(Column input, DataType output_type): + """ + Returns a new integer numeric column parsing hexadecimal values + from the provided strings column. + + For details, see :cpp:func:`cudf::strings::hex_to_integers`. + + Parameters + ---------- + input : Column + Strings instance for this operation.
+ + output_type : DataType + Type of integer numeric column to return. + + Returns + ------- + Column + New column with integers converted from strings. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_convert_integers.hex_to_integers( + input.view(), + output_type.c_obj + ) + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column is_hex(Column input): + """ + Returns a boolean column identifying strings in which all + characters are valid for conversion to integers from hex. + + For details, see :cpp:func:`cudf::strings::is_hex`. + + Parameters + ---------- + input : Column + Strings instance for this operation. + + Returns + ------- + Column + New column of boolean results for each string. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_convert_integers.is_hex( + input.view(), + ) + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column integers_to_hex(Column input): + """ + Returns a new strings column converting integer columns to hexadecimal + characters. + + For details, see :cpp:func:`cudf::strings::integers_to_hex`. + + Parameters + ---------- + input : Column + Integer column to convert to hex. + + Returns + ------- + Column + New strings column with hexadecimal characters. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_convert_integers.integers_to_hex( + input.view(), + ) + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_integers.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_integers.py new file mode 100644 index 00000000000..6d1d565af30 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_integers.py @@ -0,0 +1,69 @@ +# Copyright (c) 2024, NVIDIA CORPORATION.
+import pyarrow as pa +import pylibcudf as plc +from utils import assert_column_eq + + +def test_to_integers(): + typ = pa.int8() + arr = pa.array(["1", "-1", None]) + result = plc.strings.convert.convert_integers.to_integers( + plc.interop.from_arrow(arr), plc.interop.from_arrow(typ) + ) + expected = arr.cast(typ) + assert_column_eq(result, expected) + + +def test_from_integers(): + arr = pa.array([1, -1, None]) + result = plc.strings.convert.convert_integers.from_integers( + plc.interop.from_arrow(arr) + ) + expected = pa.array(["1", "-1", None]) + assert_column_eq(result, expected) + + +def test_is_integer(): + arr = pa.array(["1", "-1", "1.2", "A", None]) + plc_column = plc.interop.from_arrow(arr) + result = plc.strings.convert.convert_integers.is_integer(plc_column) + expected = pa.array([True, True, False, False, None]) + assert_column_eq(result, expected) + + result = plc.strings.convert.convert_integers.is_integer( + plc_column, plc.interop.from_arrow(pa.uint8()) + ) + expected = pa.array([True, False, False, False, None]) + assert_column_eq(result, expected) + + +def test_hex_to_integers(): + typ = pa.int32() + data = ["0xff", "0x2a", None] + result = plc.strings.convert.convert_integers.hex_to_integers( + plc.interop.from_arrow(pa.array(data)), plc.interop.from_arrow(typ) + ) + expected = pa.array( + [int(val, 16) if isinstance(val, str) else val for val in data], + type=typ, + ) + assert_column_eq(result, expected) + + +def test_is_hex(): + arr = pa.array(["0xff", "123", "!", None]) + result = plc.strings.convert.convert_integers.is_hex( + plc.interop.from_arrow(arr) + ) + expected = pa.array([True, True, False, None]) + assert_column_eq(result, expected) + + +def test_integers_to_hex(): + data = [255, -42, None] + arr = pa.array(data) + result = plc.strings.convert.convert_integers.integers_to_hex( + plc.interop.from_arrow(arr) + ) + expected = pa.array(["FF", "FFFFFFFFFFFFFFD6", None]) + assert_column_eq(result, expected)