Skip to content

Commit

Permalink
Fix regex non-multiline EOL/$ matching strings ending with a new-line (
Browse files Browse the repository at this point in the history
…#9715)

Closes #9620 

Fixes an edge case described in https://docs.python.org/3/library/re.html#re.MULTILINE:
without `MULTILINE` set, the '$' end-of-line (EOL) regex pattern character should match at the very end of the string and also just before a new-line when that new-line is the last character of the string.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Christopher Harris (https://github.com/cwharris)
  - Vukasin Milovanovic (https://github.com/vuule)
  - Sheilah Kirui (https://github.com/skirui-source)

URL: #9715
  • Loading branch information
davidwendt authored Nov 19, 2021
1 parent fc82b1d commit c1bfb26
Show file tree
Hide file tree
Showing 3 changed files with 16 additions and 11 deletions.
5 changes: 4 additions & 1 deletion cpp/src/strings/regex/regex.inl
Original file line number Diff line number Diff line change
Expand Up @@ -276,7 +276,10 @@ __device__ inline int32_t reprog_device::regexec(
}
break;
case EOL:
if (last_character || (inst->u1.c == '$' && c == '\n')) {
if (last_character ||
(c == '\n' && (inst->u1.c == '$' ||
// edge case where \n appears at the end of the string
pos + 1 == dstr.length()))) {
id_activate = inst->u2.next_id;
expanded = true;
}
Expand Down
17 changes: 9 additions & 8 deletions cpp/tests/strings/contains_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -302,28 +302,29 @@ TEST_F(StringsContainsTests, CountTest)

TEST_F(StringsContainsTests, MultiLine)
{
auto input = cudf::test::strings_column_wrapper({"abc\nfff\nabc", "fff\nabc\nlll", "abc", ""});
auto view = cudf::strings_column_view(input);
auto input =
cudf::test::strings_column_wrapper({"abc\nfff\nabc", "fff\nabc\nlll", "abc", "", "abc\n"});
auto view = cudf::strings_column_view(input);

auto results = cudf::strings::contains_re(view, "^abc$", cudf::strings::regex_flags::MULTILINE);
auto expected_contains = cudf::test::fixed_width_column_wrapper<bool>({1, 1, 1, 0});
auto expected_contains = cudf::test::fixed_width_column_wrapper<bool>({1, 1, 1, 0, 1});
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_contains);
results = cudf::strings::contains_re(view, "^abc$");
expected_contains = cudf::test::fixed_width_column_wrapper<bool>({0, 0, 1, 0});
expected_contains = cudf::test::fixed_width_column_wrapper<bool>({0, 0, 1, 0, 1});
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_contains);

results = cudf::strings::matches_re(view, "^abc$", cudf::strings::regex_flags::MULTILINE);
auto expected_matches = cudf::test::fixed_width_column_wrapper<bool>({1, 0, 1, 0});
auto expected_matches = cudf::test::fixed_width_column_wrapper<bool>({1, 0, 1, 0, 1});
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_matches);
results = cudf::strings::matches_re(view, "^abc$");
expected_matches = cudf::test::fixed_width_column_wrapper<bool>({0, 0, 1, 0});
expected_matches = cudf::test::fixed_width_column_wrapper<bool>({0, 0, 1, 0, 1});
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_matches);

results = cudf::strings::count_re(view, "^abc$", cudf::strings::regex_flags::MULTILINE);
auto expected_count = cudf::test::fixed_width_column_wrapper<int32_t>({2, 1, 1, 0});
auto expected_count = cudf::test::fixed_width_column_wrapper<int32_t>({2, 1, 1, 0, 1});
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_count);
results = cudf::strings::count_re(view, "^abc$");
expected_count = cudf::test::fixed_width_column_wrapper<int32_t>({0, 0, 1, 0});
expected_count = cudf::test::fixed_width_column_wrapper<int32_t>({0, 0, 1, 0, 1});
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_count);
}

Expand Down
5 changes: 3 additions & 2 deletions python/cudf/cudf/tests/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -1746,12 +1746,13 @@ def test_string_wrap(data, width):
["A B", "1.5", "3,000"],
["23", "³", "⅕", ""],
[" ", "\t\r\n ", ""],
["$", "B", "Aab$", "$$ca", "C$B$", "cat"],
["$", "B", "Aab$", "$$ca", "C$B$", "cat", "cat\n"],
["line\nto be wrapped", "another\nline\nto be wrapped"],
],
)
@pytest.mark.parametrize(
"pat", ["a", " ", "\t", "another", "0", r"\$", "^line$", "line.*be"]
"pat",
["a", " ", "\t", "another", "0", r"\$", "^line$", "line.*be", "cat$"],
)
@pytest.mark.parametrize("flags", [0, re.MULTILINE, re.DOTALL])
def test_string_count(data, pat, flags):
Expand Down

0 comments on commit c1bfb26

Please sign in to comment.