From e3425a632b0712bf242e3f3759c6231eeac656f5 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 29 Jul 2024 14:07:26 -0400 Subject: [PATCH] add support for ANY inst --- cpp/benchmarks/string/contains.cpp | 2 +- cpp/src/strings/regex/regcomp.cpp | 9 ++++++--- cpp/src/strings/regex/regex.inl | 11 ++++++----- cpp/tests/strings/contains_tests.cpp | 12 +++++++++++- 4 files changed, 24 insertions(+), 10 deletions(-) diff --git a/cpp/benchmarks/string/contains.cpp b/cpp/benchmarks/string/contains.cpp index ae6c8b844c8..80752110090 100644 --- a/cpp/benchmarks/string/contains.cpp +++ b/cpp/benchmarks/string/contains.cpp @@ -80,7 +80,7 @@ std::unique_ptr build_input_column(cudf::size_type n_rows, } // longer pattern lengths demand more working memory per string -std::string patterns[] = {"^\\d+ [a-z]+", "[A-Z ]+\\d+ +\\d+[A-Z]+\\d+$", "5W43"}; +std::string patterns[] = {"^\\d+ [a-z]+", "[A-Z ]+\\d+ +\\d+[A-Z]+\\d+$", "5W43$"}; static void bench_contains(nvbench::state& state) { diff --git a/cpp/src/strings/regex/regcomp.cpp b/cpp/src/strings/regex/regcomp.cpp index 5c31eb94853..7c4c89bd3fb 100644 --- a/cpp/src/strings/regex/regcomp.cpp +++ b/cpp/src/strings/regex/regcomp.cpp @@ -555,7 +555,10 @@ class regex_parser { return EOL; } case '[': return build_cclass(); - case '.': return dot_type; + case '.': { + _chr = is_ext_newline(_flags) ? 'N' : chr; + return dot_type; + } } if (std::find(quantifiers.begin(), quantifiers.end(), static_cast(chr)) == @@ -967,7 +970,7 @@ class regex_compiler { _prog.inst_at(inst_id).u1.cls_id = class_id; } else if (token == CHAR) { _prog.inst_at(inst_id).u1.c = yy; - } else if (token == BOL || token == EOL) { + } else if (token == BOL || token == EOL || token == ANY) { _prog.inst_at(inst_id).u1.c = yy; } push_and(inst_id, inst_id); @@ -1202,7 +1205,7 @@ void reprog::print(regex_flags const flags) case STAR: printf(" STAR next=%d", inst.u2.next_id); break; case PLUS: printf(" PLUS next=%d", inst.u2.next_id); break; case QUEST: printf(" QUEST next=%d", inst.u2.next_id); break; - case ANY: printf(" ANY next=%d", inst.u2.next_id); break; + case ANY: printf(" ANY '%c', next=%d", inst.u1.c, inst.u2.next_id); break; case ANYNL: printf(" ANYNL next=%d", inst.u2.next_id); break; case NOP: printf(" NOP next=%d", inst.u2.next_id); break; case BOL: { diff --git a/cpp/src/strings/regex/regex.inl b/cpp/src/strings/regex/regex.inl index b78ee3ae774..ea8c6bec3ab 100644 --- a/cpp/src/strings/regex/regex.inl +++ b/cpp/src/strings/regex/regex.inl @@ -131,7 +131,7 @@ __device__ __forceinline__ void reprog_device::reljunk::swaplist() * * '\n, \r, \u0085, \u2028, or \u2029' */ -__device__ __forceinline__ bool is_newline(char32_t const ch) +constexpr bool is_newline(char32_t const ch) { return (ch == '\n' || ch == '\r' || ch == 0x00c285 || ch == 0x00e280a8 || ch == 0x00e280a9); } @@ -382,11 +382,12 @@ __device__ __forceinline__ match_result reprog_device::regexec(string_view const case CHAR: if (inst.u1.c == c) id_activate = inst.u2.next_id; break; - case ANY: - if (!is_newline(c)) { id_activate = inst.u2.next_id; } - break; + case ANY: { + if ((c == '\n') || ((inst.u1.c == 'N') && is_newline(c))) { break; } + [[fallthrough]]; + } case ANYNL: id_activate = inst.u2.next_id; break; - case NCCLASS: + case NCCLASS: [[fallthrough]]; case CCLASS: { auto const cls = get_class(inst.u1.cls_id); if (cls.is_match(static_cast(c), _codepoint_flags) == (inst.type == CCLASS)) { diff --git a/cpp/tests/strings/contains_tests.cpp b/cpp/tests/strings/contains_tests.cpp index 2eb5a61f3c4..8965a1b78a6 100644 --- a/cpp/tests/strings/contains_tests.cpp +++ b/cpp/tests/strings/contains_tests.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include @@ -650,6 +649,17 @@ TEST_F(StringsContainsTests, SpecialNewLines) counts = cudf::test::fixed_width_column_wrapper({2, 1, 1, 0, 1, 1}); results = cudf::strings::count_re(view, *prog_ml); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, counts); + + pattern = std::string("q.*l"); + prog = cudf::strings::regex_program::create(pattern); + expected = cudf::test::fixed_width_column_wrapper({0, 1, 0, 0, 0, 0}); + results = cudf::strings::contains_re(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); + // inst ANY will stop matching on first 'newline' and so should not match anything here + prog = cudf::strings::regex_program::create(pattern, cudf::strings::regex_flags::EXT_NEWLINE); + expected = cudf::test::fixed_width_column_wrapper({0, 0, 0, 0, 0, 0}); + results = cudf::strings::contains_re(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); } TEST_F(StringsContainsTests, EndOfString)