Skip to content

Commit

Permalink
Fix strings handling of hex in regex pattern (#10220)
Browse files Browse the repository at this point in the history
Closes #10213 

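Fixes the parsing logic for `\x` hex escapes in a regex pattern: the hex digits `a` and `A` were rejected because the range checks used `>` where `>=` was required.
This also adds a gtest that checks each of the 127 matchable single-byte hex values (0x00 through 0x7e).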

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: #10220
  • Loading branch information
davidwendt authored Feb 8, 2022
1 parent bd98bfe commit fff51b8
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 5 deletions.
8 changes: 4 additions & 4 deletions cpp/src/strings/regex/regcomp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -280,15 +280,15 @@ class regex_parser {
yy = 0;
if (a >= '0' && a <= '9')
yy += (a - '0') << 4;
else if (a > 'a' && a <= 'f')
else if (a >= 'a' && a <= 'f')
yy += (a - 'a' + 10) << 4;
else if (a > 'A' && a <= 'F')
else if (a >= 'A' && a <= 'F')
yy += (a - 'A' + 10) << 4;
if (b >= '0' && b <= '9')
yy += b - '0';
else if (b > 'a' && b <= 'f')
else if (b >= 'a' && b <= 'f')
yy += b - 'a' + 10;
else if (b > 'A' && b <= 'F')
else if (b >= 'A' && b <= 'F')
yy += b - 'A' + 10;
break;
}
Expand Down
29 changes: 28 additions & 1 deletion cpp/tests/strings/contains_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,12 @@
* limitations under the License.
*/

#include <cudf/detail/utilities/vector_factories.hpp>
#include <cudf/strings/contains.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf_test/base_fixture.hpp>
#include <cudf_test/column_utilities.hpp>
#include <cudf_test/column_wrapper.hpp>
#include <tests/strings/utilities.h>

#include <algorithm>
#include <vector>
Expand Down Expand Up @@ -250,6 +250,33 @@ TEST_F(StringsContainsTests, OctalTest)
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
}

// Verifies that a two-digit `\x` hex escape in a regex pattern matches exactly
// the row holding that byte value, for every byte value from 0 through 126.
TEST_F(StringsContainsTests, HexTest)
{
  // Host data: 127 consecutive byte values, one per row of the strings column.
  auto const first_char = thrust::make_counting_iterator<char>(0);
  auto const last_char  = thrust::make_counting_iterator<char>(127);
  std::vector<char> ascii_chars(first_char, last_char);
  auto const count = static_cast<cudf::size_type>(ascii_chars.size());

  // Offsets 0, 1, ..., count so that every row is exactly one byte long.
  auto const first_offset = thrust::make_counting_iterator<cudf::offset_type>(0);
  std::vector<cudf::offset_type> offsets(first_offset, first_offset + count + 1);

  // Copy both buffers to the device and assemble the strings column from them.
  auto d_chars   = cudf::detail::make_device_uvector_sync(ascii_chars);
  auto d_offsets = cudf::detail::make_device_uvector_sync(offsets);
  auto input     = cudf::make_strings_column(d_chars, d_offsets);

  auto strings_view = cudf::strings_column_view(input->view());
  for (char const ch : ascii_chars) {
    // Format the pattern as `\x` followed by two zero-padded lowercase hex digits.
    std::stringstream hex_stream;
    hex_stream << "\\x";
    hex_stream << std::setfill('0') << std::setw(2) << std::hex;
    hex_stream << static_cast<int32_t>(ch);
    std::string pattern = hex_stream.str();

    auto results = cudf::strings::contains_re(strings_view, pattern);

    // Row idx holds byte value idx, so only the row equal to ch is expected to match.
    auto is_target_row = [ch](auto idx) { return static_cast<char>(idx) == ch; };
    auto expected_iter = cudf::detail::make_counting_transform_iterator(0, is_target_row);
    cudf::test::fixed_width_column_wrapper<bool> expected(expected_iter, expected_iter + count);
    CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
  }
}

TEST_F(StringsContainsTests, EmbeddedNullCharacter)
{
std::vector<std::string> data(10);
Expand Down

0 comments on commit fff51b8

Please sign in to comment.