diff --git a/liblangutil/CharStream.h b/liblangutil/CharStream.h
index aa8f01af266c..f6aaf049fa9d 100644
--- a/liblangutil/CharStream.h
+++ b/liblangutil/CharStream.h
@@ -85,6 +85,25 @@ class CharStream
 	/// @returns The character of the current location after update is returned.
 	char setPosition(size_t _location);
 
+	/// Tests whether or not given octet sequence is present at the current reading position.
+	/// The comparison is octet-wise and leaves the reading position untouched.
+	/// @param _sequence the octets expected to start at the current reading position.
+	/// @returns true if the sequence could be found, false otherwise (in particular when
+	/// fewer than _sequence.size() octets are left in the source).
+	bool prefixMatch(std::string_view _sequence) const
+	{
+		// The last octet compared sits at m_position + _sequence.size() - 1 (assuming get(i)
+		// reads i octets ahead of m_position), so a sequence ending exactly at the end of the
+		// source is still in bounds; only a strictly larger end offset is out of range.
+		if (m_position + _sequence.size() > m_source.size())
+			return false;
+
+		for (size_t i = 0; i < _sequence.size(); ++i)
+			if (_sequence[i] != get(i))
+				return false;
+		return true;
+	}
+
 	void reset() { m_position = 0; }
 
 	std::string const& source() const noexcept { return m_source; }
diff --git a/liblangutil/Scanner.cpp b/liblangutil/Scanner.cpp
index 1a3903d5c4df..43653a0e0288 100644
--- a/liblangutil/Scanner.cpp
+++ b/liblangutil/Scanner.cpp
@@ -79,6 +79,7 @@ string to_string(ScannerError _errorCode)
 	case ScannerError::IllegalExponent: return "Invalid exponent.";
 	case ScannerError::IllegalNumberEnd: return "Identifier-start is not allowed at end of a number.";
 	case ScannerError::OctalNotAllowed: return "Octal numbers not allowed.";
+	case ScannerError::MismatchingDirectionalOverridesInComment: return "Mismatching directional override markers in comment.";
 	default:
 		solAssert(false, "Unhandled case in to_string(ScannerError)");
 		return "";
@@ -273,10 +274,40 @@ bool Scanner::skipWhitespaceExceptUnicodeLinebreak()
 
 Token Scanner::skipSingleLineComment()
 {
+	// NOTE(review): U+202C also terminates LRE/RLE embeddings (U+202A/U+202B), which are not
+	// counted here, so a comment with a balanced embedding is reported as mismatched - confirm.
+	// Number of LRO/RLO overrides opened in this comment and not yet closed by a PDF.
+	int rtlOverrideDepth = 0;
+	// Set once a PDF is seen while no override is open.
+	bool unmatchedPop = false;
+
 	// Line terminator is not part of the comment. If it is a
 	// non-ascii line terminator, it will result in a parser error.
 	while (!isUnicodeLinebreak())
-		if (!advance()) break;
+	{
+		if (tryScanByteSequence("\xE2\x80\xAD") || // U+202D (LRO - Left-to-Right Override)
+			tryScanByteSequence("\xE2\x80\xAE") // U+202E (RLO - Right-to-Left Override)
+		)
+		{
+			rtlOverrideDepth++;
+		}
+		else if (tryScanByteSequence("\xE2\x80\xAC")) // U+202C (PDF - Pop Directional Formatting)
+		{
+			// A PDF with nothing open is a mismatch in its own right. Recording it as a
+			// negative depth would let a later LRO/RLO cancel it out and leave that
+			// override unterminated without being reported.
+			if (rtlOverrideDepth == 0)
+				unmatchedPop = true;
+			else
+				rtlOverrideDepth--;
+		}
+		else if (!advance())
+			break;
+	}
+
+	if (unmatchedPop || rtlOverrideDepth != 0)
+		// Unbalanced RLO/LRO/PDF codepoint sequences in comment.
+		return setError(ScannerError::MismatchingDirectionalOverridesInComment);
 
 	return Token::Whitespace;
 }
@@ -349,18 +380,44 @@ size_t Scanner::scanSingleLineDocComment()
 
 Token Scanner::skipMultiLineComment()
 {
+	// Number of LRO/RLO overrides opened in this comment and not yet closed by a PDF.
+	int rtlOverrideDepth = 0;
+	// Set once a PDF is seen while no override is open.
+	bool unmatchedPop = false;
 	while (!isSourcePastEndOfInput())
 	{
-		char ch = m_char;
-		advance();
-
-		// If we have reached the end of the multi-line comment, we
-		// consume the '/' and insert a whitespace. This way all
-		// multi-line comments are treated as whitespace.
-		if (ch == '*' && m_char == '/')
+		if (tryScanByteSequence("\xE2\x80\xAD") || // U+202D (LRO - Left-to-Right Override)
+			tryScanByteSequence("\xE2\x80\xAE") // U+202E (RLO - Right-to-Left Override)
+		)
 		{
-			m_char = ' ';
-			return Token::Whitespace;
+			rtlOverrideDepth++;
+		}
+		else if (tryScanByteSequence("\xE2\x80\xAC")) // U+202C (PDF - Pop Directional Formatting)
+		{
+			// A PDF with nothing open is a mismatch in its own right; it must not become a
+			// negative depth that a later LRO/RLO could cancel out.
+			if (rtlOverrideDepth == 0)
+				unmatchedPop = true;
+			else
+				rtlOverrideDepth--;
+		}
+		else
+		{
+			char ch = m_char;
+			advance();
+
+			// If we have reached the end of the multi-line comment, we
+			// consume the '/' and insert a whitespace. This way all
+			// multi-line comments are treated as whitespace.
+			if (ch == '*' && m_char == '/')
+			{
+				if (unmatchedPop || rtlOverrideDepth != 0)
+					// Unbalanced RLO/LRO/PDF codepoint sequences in comment.
+					return setError(ScannerError::MismatchingDirectionalOverridesInComment);
+
+				m_char = ' ';
+				return Token::Whitespace;
+			}
 		}
 	}
 	// Unterminated multi-line comment.
diff --git a/liblangutil/Scanner.h b/liblangutil/Scanner.h
index f44a9940a7ac..1d267e1b1335 100644
--- a/liblangutil/Scanner.h
+++ b/liblangutil/Scanner.h
@@ -89,6 +89,8 @@ enum class ScannerError
 	IllegalExponent,
 	IllegalNumberEnd,
 
+	MismatchingDirectionalOverridesInComment,
+
 	OctalNotAllowed,
 };
 
@@ -248,6 +250,18 @@ class Scanner
 	/// Scans a slash '/' and depending on the characters returns the appropriate token
 	Token scanSlash();
 
+	/// Consumes the given octet sequence if, and only if, the source reports it as a prefix match.
+	/// On a match, advance() is called once per octet of the sequence; otherwise advance() is not called.
+	/// @returns true if the sequence was matched and consumed, false otherwise.
+	bool tryScanByteSequence(std::string_view _sequence)
+	{
+		bool const matched = m_source->prefixMatch(_sequence);
+		if (matched)
+			for (size_t consumed = 0; consumed < _sequence.size(); ++consumed)
+				advance();
+		return matched;
+	}
+
 	/// Scans an escape-sequence which is part of a string and adds the
 	/// decoded character to the current literal. Returns true if a pattern
 	/// is scanned.