Skip to content

Commit

Permalink
add support for ANY inst
Browse files Browse the repository at this point in the history
  • Loading branch information
davidwendt committed Jul 29, 2024
1 parent 57f3567 commit e3425a6
Show file tree
Hide file tree
Showing 4 changed files with 24 additions and 10 deletions.
2 changes: 1 addition & 1 deletion cpp/benchmarks/string/contains.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ std::unique_ptr<cudf::column> build_input_column(cudf::size_type n_rows,
}

// longer pattern lengths demand more working memory per string
std::string patterns[] = {"^\\d+ [a-z]+", "[A-Z ]+\\d+ +\\d+[A-Z]+\\d+$", "5W43"};
std::string patterns[] = {"^\\d+ [a-z]+", "[A-Z ]+\\d+ +\\d+[A-Z]+\\d+$", "5W43$"};

static void bench_contains(nvbench::state& state)
{
Expand Down
9 changes: 6 additions & 3 deletions cpp/src/strings/regex/regcomp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -555,7 +555,10 @@ class regex_parser {
return EOL;
}
case '[': return build_cclass();
case '.': return dot_type;
case '.': {
_chr = is_ext_newline(_flags) ? 'N' : chr;
return dot_type;
}
}

if (std::find(quantifiers.begin(), quantifiers.end(), static_cast<char>(chr)) ==
Expand Down Expand Up @@ -967,7 +970,7 @@ class regex_compiler {
_prog.inst_at(inst_id).u1.cls_id = class_id;
} else if (token == CHAR) {
_prog.inst_at(inst_id).u1.c = yy;
} else if (token == BOL || token == EOL) {
} else if (token == BOL || token == EOL || token == ANY) {
_prog.inst_at(inst_id).u1.c = yy;
}
push_and(inst_id, inst_id);
Expand Down Expand Up @@ -1202,7 +1205,7 @@ void reprog::print(regex_flags const flags)
case STAR: printf(" STAR next=%d", inst.u2.next_id); break;
case PLUS: printf(" PLUS next=%d", inst.u2.next_id); break;
case QUEST: printf(" QUEST next=%d", inst.u2.next_id); break;
case ANY: printf(" ANY next=%d", inst.u2.next_id); break;
case ANY: printf(" ANY '%c', next=%d", inst.u1.c, inst.u2.next_id); break;
case ANYNL: printf(" ANYNL next=%d", inst.u2.next_id); break;
case NOP: printf(" NOP next=%d", inst.u2.next_id); break;
case BOL: {
Expand Down
11 changes: 6 additions & 5 deletions cpp/src/strings/regex/regex.inl
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ __device__ __forceinline__ void reprog_device::reljunk::swaplist()
*
* '\n, \r, \u0085, \u2028, or \u2029'
*/
__device__ __forceinline__ bool is_newline(char32_t const ch)
constexpr bool is_newline(char32_t const ch)
{
  // The multi-byte constants are the UTF-8 byte sequences of U+0085, U+2028
  // and U+2029 packed into a single integer value.
  switch (ch) {
    case '\n':        // line feed
    case '\r':        // carriage return
    case 0x00c285:    // U+0085
    case 0x00e280a8:  // U+2028
    case 0x00e280a9:  // U+2029
      return true;
    default: return false;
  }
}
Expand Down Expand Up @@ -382,11 +382,12 @@ __device__ __forceinline__ match_result reprog_device::regexec(string_view const
case CHAR:
if (inst.u1.c == c) id_activate = inst.u2.next_id;
break;
case ANY:
if (!is_newline(c)) { id_activate = inst.u2.next_id; }
break;
case ANY: {
if ((c == '\n') || ((inst.u1.c == 'N') && is_newline(c))) { break; }
[[fallthrough]];
}
case ANYNL: id_activate = inst.u2.next_id; break;
case NCCLASS:
case NCCLASS: [[fallthrough]];
case CCLASS: {
auto const cls = get_class(inst.u1.cls_id);
if (cls.is_match(static_cast<char32_t>(c), _codepoint_flags) == (inst.type == CCLASS)) {
Expand Down
12 changes: 11 additions & 1 deletion cpp/tests/strings/contains_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
#include <cudf_test/base_fixture.hpp>
#include <cudf_test/column_utilities.hpp>
#include <cudf_test/column_wrapper.hpp>
#include <cudf_test/debug_utilities.hpp>
#include <cudf_test/iterator_utilities.hpp>

#include <cudf/detail/utilities/vector_factories.hpp>
Expand Down Expand Up @@ -650,6 +649,17 @@ TEST_F(StringsContainsTests, SpecialNewLines)
counts = cudf::test::fixed_width_column_wrapper<int32_t>({2, 1, 1, 0, 1, 1});
results = cudf::strings::count_re(view, *prog_ml);
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, counts);

pattern = std::string("q.*l");
prog = cudf::strings::regex_program::create(pattern);
expected = cudf::test::fixed_width_column_wrapper<bool>({0, 1, 0, 0, 0, 0});
results = cudf::strings::contains_re(view, *prog);
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected);
// inst ANY will stop matching on first 'newline' and so should not match anything here
prog = cudf::strings::regex_program::create(pattern, cudf::strings::regex_flags::EXT_NEWLINE);
expected = cudf::test::fixed_width_column_wrapper<bool>({0, 0, 0, 0, 0, 0});
results = cudf::strings::contains_re(view, *prog);
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected);
}

TEST_F(StringsContainsTests, EndOfString)
Expand Down

0 comments on commit e3425a6

Please sign in to comment.