diff --git a/CHANGELOG.md b/CHANGELOG.md index 5589a2e1669..7b8624e596d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -29,6 +29,7 @@ - PR #5497 Add `.str.isinteger` & `.str.isfloat` - PR #5528 Add unsigned int reading and writing support to parquet - PR #5510 Add support for `cudf.Index` to create Indexes +- PR #5612 Add `is_hex` strings API ## Improvements diff --git a/cpp/include/cudf/strings/convert/convert_integers.hpp b/cpp/include/cudf/strings/convert/convert_integers.hpp index 426f4b00428..74fb48f8cdd 100644 --- a/cpp/include/cudf/strings/convert/convert_integers.hpp +++ b/cpp/include/cudf/strings/convert/convert_integers.hpp @@ -100,6 +100,31 @@ std::unique_ptr hex_to_integers( data_type output_type, rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource()); +/** + * @brief Returns a boolean column identifying strings in which all + * characters are valid for conversion to integers from hex. + * + * The output row entry will be set to `true` if the corresponding string element + * has at least one character and all characters are in [0-9A-Fa-f]. Also, the string may start + * with '0x' or '0X'. + * + * @code{.pseudo} + * Example: + * s = ['123', '-456', '', 'AGE', '+17EA', '0x9EF', '123ABC'] + * b = is_hex(s) + * b is [true, false, false, false, false, true, true] + * @endcode + * + * Any null row results in a null entry for that row in the output column. + * + * @param strings Strings instance for this operation. + * @param mr Device memory resource used to allocate the returned column's device memory. + * @return New column of boolean results for each string. 
+ */ +std::unique_ptr is_hex( + strings_column_view const& strings, + rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource()); + /** @} */ // end of doxygen group } // namespace strings } // namespace cudf diff --git a/cpp/src/strings/convert/convert_hex.cu b/cpp/src/strings/convert/convert_hex.cu index 36d18ed2dae..a7241a71a51 100644 --- a/cpp/src/strings/convert/convert_hex.cu +++ b/cpp/src/strings/convert/convert_hex.cu @@ -27,6 +27,7 @@ #include #include +#include #include namespace cudf { @@ -141,6 +142,44 @@ std::unique_ptr hex_to_integers( return results; } +std::unique_ptr is_hex(strings_column_view const& strings, + cudaStream_t stream, + rmm::mr::device_memory_resource* mr) +{ + auto strings_column = column_device_view::create(strings.parent(), stream); + auto d_column = *strings_column; + // create output column + auto results = make_numeric_column(data_type{type_id::BOOL8}, + strings.size(), + copy_bitmask(strings.parent(), stream, mr), + strings.null_count(), + stream, + mr); + auto d_results = results->mutable_view().data(); + thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(strings.size()), + d_results, + [d_column] __device__(size_type idx) { + if (d_column.is_null(idx)) return false; + auto const d_str = d_column.element(idx); + if (d_str.empty()) return false; + auto const starts_with_0x = [](auto const& sv) { + return sv.length() > 1 && (sv.substr(0, 2) == string_view("0x", 2) || + sv.substr(0, 2) == string_view("0X", 2)); + }; + auto begin = d_str.begin() + (starts_with_0x(d_str) ? 
2 : 0); + auto end = d_str.end(); + return (thrust::distance(begin, end) > 0) && + thrust::all_of(thrust::seq, begin, end, [] __device__(auto chr) { + return (chr >= '0' && chr <= '9') || (chr >= 'A' && chr <= 'F') || + (chr >= 'a' && chr <= 'f'); + }); + }); + results->set_null_count(strings.null_count()); + return results; +} + } // namespace detail // external API @@ -152,5 +191,12 @@ std::unique_ptr hex_to_integers(strings_column_view const& strings, return detail::hex_to_integers(strings, output_type, mr); } +std::unique_ptr is_hex(strings_column_view const& strings, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::is_hex(strings, 0, mr); +} + } // namespace strings } // namespace cudf diff --git a/cpp/tests/strings/integers_tests.cu b/cpp/tests/strings/integers_tests.cu index 919ee1b6cbe..7540aa905d3 100644 --- a/cpp/tests/strings/integers_tests.cu +++ b/cpp/tests/strings/integers_tests.cu @@ -199,3 +199,27 @@ TEST_F(StringsConvertTest, HexToInteger) cudf::test::expect_columns_equal(*results, expected); } } + +TEST_F(StringsConvertTest, IsHex) +{ + std::vector h_strings{"", + "1234", + nullptr, + "98BEEF", + "1a5", + "2face", + "0xAABBCCDD", + "112233445566", + "XYZ", + "0", + "0x", + "x"}; + cudf::test::strings_column_wrapper strings( + h_strings.begin(), + h_strings.end(), + thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); + cudf::test::fixed_width_column_wrapper expected({0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0}, + {1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1}); + auto results = cudf::strings::is_hex(cudf::strings_column_view(strings)); + cudf::test::expect_columns_equal(*results, expected); +} diff --git a/python/cudf/cudf/_lib/cpp/strings/convert/convert_integers.pxd b/python/cudf/cudf/_lib/cpp/strings/convert/convert_integers.pxd index 767eb49a5d0..92f99a2f5cb 100644 --- a/python/cudf/cudf/_lib/cpp/strings/convert/convert_integers.pxd +++ 
b/python/cudf/cudf/_lib/cpp/strings/convert/convert_integers.pxd @@ -18,3 +18,7 @@ cdef extern from "cudf/strings/convert/convert_integers.hpp" namespace \ cdef unique_ptr[column] hex_to_integers( column_view input_col, data_type output_type) except + + + cdef unique_ptr[column] is_hex( + column_view source_strings + ) except + diff --git a/python/cudf/cudf/_lib/string_casting.pyx b/python/cudf/cudf/_lib/string_casting.pyx index 09884f398c7..878c3c2befb 100644 --- a/python/cudf/cudf/_lib/string_casting.pyx +++ b/python/cudf/cudf/_lib/string_casting.pyx @@ -29,7 +29,8 @@ from cudf._lib.cpp.strings.convert.convert_floats cimport ( from cudf._lib.cpp.strings.convert.convert_integers cimport ( to_integers as cpp_to_integers, from_integers as cpp_from_integers, - hex_to_integers as cpp_hex_to_integers + hex_to_integers as cpp_hex_to_integers, + is_hex as cpp_is_hex ) from cudf._lib.cpp.strings.convert.convert_ipv4 cimport ( ipv4_to_integers as cpp_ipv4_to_integers, @@ -643,3 +644,19 @@ def htoi(Column input_col, **kwargs): c_out_type)) return Column.from_unique_ptr(move(c_result)) + + +def is_hex(Column source_strings): + """ + Returns a Column of boolean values with True for each string in `source_strings` + that contains only hex characters (optionally preceded by '0x'). + """ + cdef unique_ptr[column] c_result + cdef column_view source_view = source_strings.view() + + with nogil: + c_result = move(cpp_is_hex( + source_view + )) + + return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 399e4c3b494..36f9667d1aa 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -904,6 +904,45 @@ def isinteger(self, **kwargs): """ return self._return_or_inplace(cpp_is_integer(self._column), **kwargs) + def ishex(self, **kwargs): + """ + Check whether all characters in each string form a hex integer. + + If a string has zero characters, False is returned for + that check. 
+ + Returns : Series or Index of bool + Series or Index of boolean values with the same + length as the original Series/Index. + + See also + -------- + isdecimal + Check whether all characters are decimal. + + isdigit + Check whether all characters are digits. + + isnumeric + Check whether all characters are numeric. + + isfloat + Check whether all characters are float. + + Examples + -------- + >>> import cudf + >>> s = cudf.Series(["", "123DEF", "0x2D3", "-15", "abc"]) + >>> s.str.ishex() + 0 False + 1 True + 2 True + 3 False + 4 True + dtype: bool + """ + return self._return_or_inplace(str_cast.is_hex(self._column), **kwargs) + def isfloat(self, **kwargs): """ Check whether all characters in each string form floating value. diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index 2862271717b..97d83d43eb5 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -2059,6 +2059,13 @@ def test_string_hex_to_int(data): assert_eq(expected, got) +def test_string_ishex(): + gsr = Series(["", None, "0x01a2b3c4d5e6f", "0789", "ABCDEF0"]) + got = gsr.str.ishex() + expected = Series([False, None, True, True, True]) + assert_eq(expected, got) + + def test_string_ip4_to_int(): gsr = Series(["", None, "hello", "41.168.0.1", "127.0.0.1", "41.197.0.1"]) expected = Series([0, None, 0, 698875905, 2130706433, 700776449])