From cbbfc2f11ad49bfa849cd2ec3585bbb9699953ba Mon Sep 17 00:00:00 2001 From: Stenzek Date: Tue, 13 Aug 2024 14:47:11 +1000 Subject: [PATCH] StringUtil: Add BytePatternSearch() --- src/common/string_util.cpp | 98 +++++++++++++++++++++++++++++++++++++- src/common/string_util.h | 6 ++- 2 files changed, 102 insertions(+), 2 deletions(-) diff --git a/src/common/string_util.cpp b/src/common/string_util.cpp index 5a92fb5293..7b84400535 100644 --- a/src/common/string_util.cpp +++ b/src/common/string_util.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2019-2023 Connor McLaughlin +// SPDX-FileCopyrightText: 2019-2024 Connor McLaughlin // SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0) #include "string_util.h" @@ -9,6 +9,12 @@ #include #include +#ifndef __APPLE__ +#include // alloca +#else +#include +#endif + #ifdef _WIN32 #include "windows_headers.h" #endif @@ -427,6 +433,96 @@ void StringUtil::EllipsiseInPlace(std::string& str, u32 max_length, const char* } } +std::optional StringUtil::BytePatternSearch(const std::span bytes, const std::string_view pattern) +{ + // Parse the pattern into a bytemask. + size_t pattern_length = 0; + bool hinibble = true; + for (size_t i = 0; i < pattern.size(); i++) + { + if ((pattern[i] >= '0' && pattern[i] <= '9') || (pattern[i] >= 'a' && pattern[i] <= 'f') || + (pattern[i] >= 'A' && pattern[i] <= 'F') || pattern[i] == '?') + { + hinibble ^= true; + if (hinibble) + pattern_length++; + } + else if (pattern[i] == ' ' || pattern[i] == '\r' || pattern[i] == '\n') + { + continue; + } + else + { + break; + } + } + if (pattern_length == 0) + return std::nullopt; + + const bool allocate_on_heap = (pattern_length >= 512); + u8* match_bytes = allocate_on_heap ? static_cast(alloca(pattern_length * 2)) : new u8[pattern_length * 2]; + u8* match_masks = match_bytes + pattern_length; + + hinibble = true; + u8 match_byte = 0; + u8 match_mask = 0; + for (size_t i = 0, match_len = 0; i < pattern.size(); i++) + { + u8 nibble = 0, nibble_mask = 0xF; + if (pattern[i] >= '0' && pattern[i] <= '9') + nibble = pattern[i] - '0'; + else if (pattern[i] >= 'a' && pattern[i] <= 'f') + nibble = pattern[i] - 'a' + 0xa; + else if (pattern[i] >= 'A' && pattern[i] <= 'F') + nibble = pattern[i] - 'A' + 0xa; + else if (pattern[i] == '?') + nibble_mask = 0; + else if (pattern[i] == ' ' || pattern[i] == '\r' || pattern[i] == '\n') + continue; + else + break; + + hinibble ^= true; + if (hinibble) + { + match_bytes[match_len] = nibble | (match_byte << 4); + match_masks[match_len] = nibble_mask | (match_mask << 4); + match_len++; + } + else + { + match_byte = nibble; + match_mask = nibble_mask; + } + } + if (pattern_length == 0) + return std::nullopt; + + std::optional ret; + const size_t max_search_offset = bytes.size() - pattern_length; + for (size_t offset = 0; offset < max_search_offset; offset++) + { + const u8* start = bytes.data() + offset; + for (size_t match_offset = 0;;) + { + if ((start[match_offset] & match_masks[match_offset]) != match_bytes[match_offset]) + break; + + match_offset++; + if (match_offset == pattern_length) + { + // found it! + ret = offset; + } + } + } + + if (allocate_on_heap) + delete[] match_bytes; + + return ret; +} + size_t StringUtil::DecodeUTF8(const std::string_view str, size_t offset, char32_t* ch) { return DecodeUTF8(str.data() + offset, str.length() - offset, ch); diff --git a/src/common/string_util.h b/src/common/string_util.h index 9e80fec4ad..388bd70df3 100644 --- a/src/common/string_util.h +++ b/src/common/string_util.h @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2019-2022 Connor McLaughlin +// SPDX-FileCopyrightText: 2019-2024 Connor McLaughlin // SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0) #pragma once @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -275,6 +276,9 @@ size_t DecodeUTF8(const std::string& str, size_t offset, char32_t* ch); std::string Ellipsise(const std::string_view str, u32 max_length, const char* ellipsis = "..."); void EllipsiseInPlace(std::string& str, u32 max_length, const char* ellipsis = "..."); +/// Searches for the specified byte pattern in the given memory span. Wildcards (i.e. ??) are supported. +std::optional BytePatternSearch(const std::span bytes, const std::string_view pattern); + /// Strided memcpy/memcmp. ALWAYS_INLINE static void StrideMemCpy(void* dst, std::size_t dst_stride, const void* src, std::size_t src_stride, std::size_t copy_size, std::size_t count)