Skip to content

Commit

Permalink
Merge pull request rapidsai#5612 from davidwendt/fea-strings-is-hex
Browse files Browse the repository at this point in the history
[REVIEW] Add is_hex strings API
  • Loading branch information
davidwendt authored Jul 1, 2020
2 parents 4e5401e + 55f3733 commit 25a6e31
Show file tree
Hide file tree
Showing 8 changed files with 164 additions and 1 deletion.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
- PR #5497 Add `.str.isinteger` & `.str.isfloat`
- PR #5528 Add unsigned int reading and writing support to parquet
- PR #5510 Add support for `cudf.Index` to create Indexes
- PR #5612 Add `is_hex` strings API

## Improvements

Expand Down
25 changes: 25 additions & 0 deletions cpp/include/cudf/strings/convert/convert_integers.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,31 @@ std::unique_ptr<column> hex_to_integers(
data_type output_type,
rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource());

/**
 * @brief Returns a boolean column identifying strings in which all
 * characters are valid for conversion to integers from hex.
 *
 * The output row entry will be set to `true` if the corresponding string element
 * has at least one character and every character is in [0-9A-Fa-f].
 * Also, the string may start with '0x' or '0X'; this prefix is skipped,
 * but at least one hex character must follow it.
 *
 * @code{.pseudo}
 * Example:
 * s = ['123', '-456', '', 'AGE', '+17EA', '0x9EF', '123ABC']
 * b = is_hex(s)
 * b is [true, false, false, false, false, true, true]
 * @endcode
 *
 * Any null row results in a null entry for that row in the output column.
 *
 * @param strings Strings instance for this operation.
 * @param mr Device memory resource used to allocate the returned column's device memory.
 * @return New column of boolean results for each string.
 */
std::unique_ptr<column> is_hex(
strings_column_view const& strings,
rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource());

/** @} */ // end of doxygen group
} // namespace strings
} // namespace cudf
46 changes: 46 additions & 0 deletions cpp/src/strings/convert/convert_hex.cu
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@

#include <rmm/thrust_rmm_allocator.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/logical.h>
#include <thrust/transform.h>

namespace cudf {
Expand Down Expand Up @@ -141,6 +142,44 @@ std::unique_ptr<column> hex_to_integers(
return results;
}

/**
 * @brief Builds a BOOL8 column flagging which rows of `strings` are hex strings.
 *
 * A row is `true` when, after skipping an optional leading "0x" or "0X",
 * at least one character remains and every remaining character is in
 * [0-9A-Fa-f]. Empty rows yield `false`. Null rows are written as `false`
 * and the output carries a copy of the input's null mask.
 *
 * @param strings Strings column to inspect.
 * @param stream Stream passed to the device-view creation, the bitmask copy,
 *               the output allocation and the transform.
 * @param mr Memory resource passed to the bitmask copy and output allocation.
 * @return New BOOL8 column with one entry per input row.
 */
std::unique_ptr<column> is_hex(strings_column_view const& strings,
                               cudaStream_t stream,
                               rmm::mr::device_memory_resource* mr)
{
  auto device_view = column_device_view::create(strings.parent(), stream);
  auto d_strings   = *device_view;

  // Output shares the input's null mask and null count.
  auto output   = make_numeric_column(data_type{type_id::BOOL8},
                                    strings.size(),
                                    copy_bitmask(strings.parent(), stream, mr),
                                    strings.null_count(),
                                    stream,
                                    mr);
  auto d_output = output->mutable_view().data<bool>();

  thrust::transform(
    rmm::exec_policy(stream)->on(stream),
    thrust::make_counting_iterator<size_type>(0),
    thrust::make_counting_iterator<size_type>(strings.size()),
    d_output,
    [d_strings] __device__(size_type row) {
      if (d_strings.is_null(row)) { return false; }
      auto const d_str = d_strings.element<string_view>(row);
      if (d_str.empty()) { return false; }

      // An optional "0x"/"0X" prefix is skipped before the digits are checked.
      bool has_prefix = false;
      if (d_str.length() > 1) {
        auto const lead = d_str.substr(0, 2);
        has_prefix      = (lead == string_view("0x", 2)) || (lead == string_view("0X", 2));
      }
      auto const first = d_str.begin() + (has_prefix ? 2 : 0);
      auto const last  = d_str.end();

      // A bare prefix leaves nothing to validate and is not a hex string.
      if (!(thrust::distance(first, last) > 0)) { return false; }

      return thrust::all_of(thrust::seq, first, last, [] __device__(auto chr) {
        bool const is_digit = (chr >= '0' && chr <= '9');
        bool const is_upper = (chr >= 'A' && chr <= 'F');
        bool const is_lower = (chr >= 'a' && chr <= 'f');
        return is_digit || is_upper || is_lower;
      });
    });

  output->set_null_count(strings.null_count());
  return output;
}

} // namespace detail

// external API
Expand All @@ -152,5 +191,12 @@ std::unique_ptr<column> hex_to_integers(strings_column_view const& strings,
return detail::hex_to_integers(strings, output_type, mr);
}

/**
 * @brief Public `is_hex` entry point; forwards to `detail::is_hex` on stream 0.
 *
 * @param strings Strings column to inspect.
 * @param mr Memory resource forwarded to the detail implementation.
 * @return Column returned by `detail::is_hex`.
 */
std::unique_ptr<column> is_hex(strings_column_view const& strings,
                               rmm::mr::device_memory_resource* mr)
{
  CUDF_FUNC_RANGE();
  // The public API exposes no stream parameter, so stream 0 is always used.
  cudaStream_t const stream_zero = 0;
  return detail::is_hex(strings, stream_zero, mr);
}

} // namespace strings
} // namespace cudf
24 changes: 24 additions & 0 deletions cpp/tests/strings/integers_tests.cu
Original file line number Diff line number Diff line change
Expand Up @@ -199,3 +199,27 @@ TEST_F(StringsConvertTest, HexToInteger)
cudf::test::expect_columns_equal(*results, expected);
}
}

// Verifies is_hex over empty, null, plain-hex, "0x"-prefixed and non-hex rows.
TEST_F(StringsConvertTest, IsHex)
{
  // nullptr marks the null row; "0x" (bare prefix) and "x" must not count as hex.
  std::vector<const char*> host_rows{"",
                                     "1234",
                                     nullptr,
                                     "98BEEF",
                                     "1a5",
                                     "2face",
                                     "0xAABBCCDD",
                                     "112233445566",
                                     "XYZ",
                                     "0",
                                     "0x",
                                     "x"};
  cudf::test::strings_column_wrapper input(
    host_rows.begin(),
    host_rows.end(),
    thrust::make_transform_iterator(host_rows.begin(),
                                    [](auto text) { return text != nullptr; }));

  // First list: expected values per row; second list: validity (0 for the null row).
  cudf::test::fixed_width_column_wrapper<bool> expected({0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0},
                                                        {1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1});

  auto actual = cudf::strings::is_hex(cudf::strings_column_view(input));
  cudf::test::expect_columns_equal(*actual, expected);
}
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,7 @@ cdef extern from "cudf/strings/convert/convert_integers.hpp" namespace \
# Cython declaration of the C++ `hex_to_integers` from convert_integers.hpp.
# `except +` makes Cython translate a thrown C++ exception into a Python one.
cdef unique_ptr[column] hex_to_integers(
column_view input_col,
data_type output_type) except +

# Cython declaration of the C++ `is_hex` from convert_integers.hpp.
# The C++ signature's memory-resource parameter is not declared here.
# `except +` makes Cython translate a thrown C++ exception into a Python one.
cdef unique_ptr[column] is_hex(
column_view source_strings
) except +
19 changes: 18 additions & 1 deletion python/cudf/cudf/_lib/string_casting.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,8 @@ from cudf._lib.cpp.strings.convert.convert_floats cimport (
from cudf._lib.cpp.strings.convert.convert_integers cimport (
to_integers as cpp_to_integers,
from_integers as cpp_from_integers,
hex_to_integers as cpp_hex_to_integers
hex_to_integers as cpp_hex_to_integers,
is_hex as cpp_is_hex
)
from cudf._lib.cpp.strings.convert.convert_ipv4 cimport (
ipv4_to_integers as cpp_ipv4_to_integers,
Expand Down Expand Up @@ -643,3 +644,19 @@ def htoi(Column input_col, **kwargs):
c_out_type))

return Column.from_unique_ptr(move(c_result))


def is_hex(Column source_strings):
    """
    Returns a Column of boolean values that is True for each entry of
    `source_strings` that the C++ `is_hex` API reports as a hex string.
    """
    cdef column_view c_strings = source_strings.view()
    cdef unique_ptr[column] c_output

    # The C++ call touches no Python objects, so it runs without the GIL.
    with nogil:
        c_output = move(cpp_is_hex(c_strings))

    return Column.from_unique_ptr(move(c_output))
39 changes: 39 additions & 0 deletions python/cudf/cudf/core/column/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -904,6 +904,45 @@ def isinteger(self, **kwargs):
"""
return self._return_or_inplace(cpp_is_integer(self._column), **kwargs)

def ishex(self, **kwargs):
    """
    Check whether all characters in each string form a hex integer.

    If a string has zero characters, False is returned for
    that check.

    Returns
    -------
    Series or Index of bool
        Series or Index of boolean values with the same
        length as the original Series/Index.

    See also
    --------
    isdecimal
        Check whether all characters are decimal.

    isdigit
        Check whether all characters are digits.

    isnumeric
        Check whether all characters are numeric.

    isfloat
        Check whether all characters are float.

    Examples
    --------
    >>> import cudf
    >>> s = cudf.Series(["", "123DEF", "0x2D3", "-15", "abc"])
    >>> s.str.ishex()
    0    False
    1     True
    2     True
    3    False
    4     True
    dtype: bool
    """
    hex_mask = str_cast.is_hex(self._column)
    return self._return_or_inplace(hex_mask, **kwargs)

def isfloat(self, **kwargs):
"""
Check whether all characters in each string form floating value.
Expand Down
7 changes: 7 additions & 0 deletions python/cudf/cudf/tests/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -2059,6 +2059,13 @@ def test_string_hex_to_int(data):
assert_eq(expected, got)


def test_string_ishex():
    # Covers an empty string, a null, a "0x"-prefixed value and plain hex digits.
    values = ["", None, "0x01a2b3c4d5e6f", "0789", "ABCDEF0"]
    expected = Series([False, None, True, True, True])
    actual = Series(values).str.ishex()
    assert_eq(expected, actual)


def test_string_ip4_to_int():
gsr = Series(["", None, "hello", "41.168.0.1", "127.0.0.1", "41.197.0.1"])
expected = Series([0, None, 0, 698875905, 2130706433, 700776449])
Expand Down

0 comments on commit 25a6e31

Please sign in to comment.