Skip to content

Commit

Permalink
Scanner: Generate error on inbalanced RLO/LRO/PDF override markers.
Browse files Browse the repository at this point in the history
  • Loading branch information
christianparpart authored and mijovic committed Dec 15, 2020
1 parent e3b009d commit 89d09cd
Show file tree
Hide file tree
Showing 28 changed files with 306 additions and 6 deletions.
1 change: 1 addition & 0 deletions Changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ Language Features:
* Code generator: Support copying of nested arrays from calldata to memory.
* The fallback function can now also have a single ``calldata`` argument (equaling ``msg.data``) and return ``bytes memory`` (which will not be ABI-encoded but returned as-is).
* Wasm backend: Add ``i32.select`` and ``i64.select`` instructions.
* Scanner: Generate a parser error when comments or unicode strings do contain an unbalanced or underflowing set of unicode direction override markers (LRO, RLO, PDF)

Compiler Features:
* Build System: Optionally support dynamic loading of Z3 and use that mechanism for Linux release builds.
Expand Down
14 changes: 14 additions & 0 deletions liblangutil/CharStream.h
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,20 @@ class CharStream
std::tuple<int, int> translatePositionToLineColumn(int _position) const;
///@}

/// Tests whether or not given octet sequence is present at the current position in stream.
/// @returns true if the sequence could be found, false otherwise.
bool prefixMatch(std::string_view _sequence)
{
if (isPastEndOfInput(_sequence.size()))
return false;

for (size_t i = 0; i < _sequence.size(); ++i)
if (_sequence[i] != get(i))
return false;

return true;
}

private:
std::string m_source;
std::string m_name;
Expand Down
73 changes: 68 additions & 5 deletions liblangutil/Scanner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,9 +54,10 @@
#include <liblangutil/Exceptions.h>
#include <liblangutil/Scanner.h>

#include <algorithm>
#include <boost/algorithm/string/classification.hpp>

#include <optional>
#include <ostream>
#include <string_view>
#include <tuple>

using namespace std;
Expand All @@ -79,6 +80,8 @@ string to_string(ScannerError _errorCode)
case ScannerError::IllegalExponent: return "Invalid exponent.";
case ScannerError::IllegalNumberEnd: return "Identifier-start is not allowed at end of a number.";
case ScannerError::OctalNotAllowed: return "Octal numbers not allowed.";
case ScannerError::DirectionalOverrideUnderflowInComment: return "Unicode direction override underflow in comment or string literal.";
case ScannerError::DirectionalOverrideMismatchInComment: return "Mismatching directional override markers in comment or string literal.";
default:
solAssert(false, "Unhandled case in to_string(ScannerError)");
return "";
Expand Down Expand Up @@ -271,12 +274,58 @@ bool Scanner::skipWhitespaceExceptUnicodeLinebreak()
return sourcePos() != startPosition;
}


namespace {
/// Tries to scan for an RLO/LRO/RLE/LRE/PDF and keeps track of script writing direction override depth.
///
/// @returns ScannerError::NoError in case of successful parsing and directional encodings are paired
/// and error code in case the input's lexical parser state is invalid and this error should be reported
/// to the user.
static ScannerError validateBiDiMarkup(CharStream& _stream, size_t _startPosition)
{
static array<pair<string_view, int>, 5> constexpr directionalSequences{
pair<string_view, int>{"\xE2\x80\xAD", 1}, // U+202D (LRO - Left-to-Right Override)
pair<string_view, int>{"\xE2\x80\xAE", 1}, // U+202E (RLO - Right-to-Left Override)
pair<string_view, int>{"\xE2\x80\xAA", 1}, // U+202A (LRE - Left-to-Right Embedding)
pair<string_view, int>{"\xE2\x80\xAB", 1}, // U+202B (RLE - Right-to-Left Embedding)
pair<string_view, int>{"\xE2\x80\xAC", -1} // PDF - Pop Directional Formatting
};

size_t endPosition = _stream.position();
_stream.setPosition(_startPosition);

int directionOverrideDepth = 0;

for (size_t currentPos = _startPosition; currentPos < endPosition; ++currentPos)
{
_stream.setPosition(currentPos);

for (auto const& [sequence, depthChange]: directionalSequences)
if (_stream.prefixMatch(sequence))
directionOverrideDepth += depthChange;

if (directionOverrideDepth < 0)
return ScannerError::DirectionalOverrideUnderflowInComment;
}

_stream.setPosition(endPosition);

return directionOverrideDepth > 0 ? ScannerError::DirectionalOverrideMismatchInComment : ScannerError::NoError;
}
}

Token Scanner::skipSingleLineComment()
{
// Line terminator is not part of the comment. If it is a
// non-ascii line terminator, it will result in a parser error.
size_t startPosition = m_source->position();
while (!isUnicodeLinebreak())
if (!advance()) break;
if (!advance())
break;

ScannerError unicodeDirectionError = validateBiDiMarkup(*m_source, startPosition);
if (unicodeDirectionError != ScannerError::NoError)
return setError(unicodeDirectionError);

return Token::Whitespace;
}
Expand Down Expand Up @@ -349,16 +398,21 @@ size_t Scanner::scanSingleLineDocComment()

Token Scanner::skipMultiLineComment()
{
size_t startPosition = m_source->position();
while (!isSourcePastEndOfInput())
{
char ch = m_char;
char prevChar = m_char;
advance();

// If we have reached the end of the multi-line comment, we
// consume the '/' and insert a whitespace. This way all
// multi-line comments are treated as whitespace.
if (ch == '*' && m_char == '/')
if (prevChar == '*' && m_char == '/')
{
ScannerError unicodeDirectionError = validateBiDiMarkup(*m_source, startPosition);
if (unicodeDirectionError != ScannerError::NoError)
return setError(unicodeDirectionError);

m_char = ' ';
return Token::Whitespace;
}
Expand Down Expand Up @@ -785,6 +839,7 @@ bool Scanner::isUnicodeLinebreak()

Token Scanner::scanString(bool const _isUnicode)
{
size_t startPosition = m_source->position();
char const quote = m_char;
advance(); // consume quote
LiteralScope literal(this, LITERAL_TYPE_STRING);
Expand Down Expand Up @@ -812,6 +867,14 @@ Token Scanner::scanString(bool const _isUnicode)
}
if (m_char != quote)
return setError(ScannerError::IllegalStringEndQuote);

if (_isUnicode)
{
ScannerError unicodeDirectionError = validateBiDiMarkup(*m_source, startPosition);
if (unicodeDirectionError != ScannerError::NoError)
return setError(unicodeDirectionError);
}

literal.complete();
advance(); // consume quote
return _isUnicode ? Token::UnicodeStringLiteral : Token::StringLiteral;
Expand Down
4 changes: 4 additions & 0 deletions liblangutil/Scanner.h
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,9 @@ enum class ScannerError
IllegalExponent,
IllegalNumberEnd,

DirectionalOverrideUnderflowInComment,
DirectionalOverrideMismatchInComment,

OctalNotAllowed,
};

Expand Down Expand Up @@ -183,6 +186,7 @@ class Scanner
///@}

private:

inline Token setError(ScannerError _error) noexcept
{
m_tokens[NextNext].error = _error;
Expand Down
5 changes: 4 additions & 1 deletion scripts/test_antlr_grammar.sh
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,10 @@ done < <(
grep -riL -E \
"^\/\/ (Syntax|Type|Declaration)Error|^\/\/ ParserError (2837|3716|3997|5333|6275|6281|6933|7319)|^==== Source:" \
"${ROOT_DIR}/test/libsolidity/syntaxTests" \
"${ROOT_DIR}/test/libsolidity/semanticTests" \
"${ROOT_DIR}/test/libsolidity/semanticTests" |
grep -v -E 'comments/.*_direction_override.*.sol' |
grep -v -E 'literals/.*_direction_override.*.sol'
# Skipping the unicode tests as I couldn't adapt the lexical grammar to recursively counting RLO/LRO/PDF's.
)

YUL_FILES=()
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
contract C {
function f() public pure
{
// pop 1
/*underflow ‬*/
}
}
// ----
// ParserError 8936: (73-85): Unicode direction override underflow in comment or string literal.
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
contract C {
function f() public pure
{
// pop 2
/*underflow ‬‬*/
}
}
// ----
// ParserError 8936: (73-85): Unicode direction override underflow in comment or string literal.
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
contract C {
function f() public pure
{
// push 1
/*overflow ‮*/
}
}
// ----
// ParserError 8936: (74-89): Mismatching directional override markers in comment or string literal.
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
contract C {
function f() public pure
{
// push 2
/*overflow ‮‮*/
}
}
// ----
// ParserError 8936: (74-92): Mismatching directional override markers in comment or string literal.
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
contract C {
function f() public pure
{
// first push 1, then pop 1
/*ok ‮‬*/

// first push 2, then pop 2
/*ok ‮‮‬‬*/

// first push 3, then pop 3
/*ok ‮‮‮‬‬‬*/
}
}
// ----
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
contract C {
function f() public pure
{
// first pop, then push
/*overflow ‬‮*/
}
}
// ----
// ParserError 8936: (88-99): Unicode direction override underflow in comment or string literal.
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
contract C {
function f() public pure
{
// pop 1
// underflow ‬
}
}
// ----
// ParserError 8936: (73-86): Unicode direction override underflow in comment or string literal.
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
contract C {
function f() public pure
{
// pop 2
// underflow ‬‬
}
}
// ----
// ParserError 8936: (73-86): Unicode direction override underflow in comment or string literal.
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
contract C {
function f() public pure
{
// push 1
// overflow ‮
}
}
// ----
// ParserError 8936: (74-89): Mismatching directional override markers in comment or string literal.
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
contract C {
function f() public pure
{
// push 2
// overflow ‮‮
}
}
// ----
// ParserError 8936: (74-92): Mismatching directional override markers in comment or string literal.
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
contract C {
function f() public pure
{
// first push 1, then pop 1
// ok ‮‬

// first push 2, then pop 2
// ok ‮‮‬‬

// first push 3, then pop 3
// ok ‮‮‮‬‬‬
}
}
// ----
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
contract C {
function f() public pure
{
// first pop, then push
// underflow ‬‮
}
}
// ----
// ParserError 8936: (88-101): Unicode direction override underflow in comment or string literal.
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
contract C {
function f(bool b) public pure
{
if ‬(b) { return; }
}
}
// ----
// ParserError 2314: (65-66): Expected '(' but got 'ILLEGAL'
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
contract C {
function f(bool b) public pure
{
uint a = 10; ‬
}
}
// ----
// ParserError 8936: (75-76): Invalid token.
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
contract TimelockUpgrade {
function confirmUpgrade() external {
uint256 m;
uint256 d;
(/*year*/,/*month‮*/,d/*yad*/,m/*‬‬hour*/,/*minute*/,/*second*/) = BokkyDateTime.timestampToDateTime(block.timestamp);
}
}

// ----
// ParserError 8936: (128-139): Mismatching directional override markers in comment or string literal.
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
contract C {
function f() public pure
{
// pop 1
bytes memory s = unicode"underflow ‬";
}
}
// ----
// ParserError 8936: (90-108): Unicode direction override underflow in comment or string literal.
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
contract C {
function f() public pure
{
// pop 2
bytes memory m = unicode"underflow ‬‬";
}
}
// ----
// ParserError 8936: (90-108): Unicode direction override underflow in comment or string literal.
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
contract C {
function f() public pure
{
// push 1
bytes memory m = unicode"overflow ‮";
}
}
// ----
// ParserError 8936: (91-111): Mismatching directional override markers in comment or string literal.
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
contract C {
function f() public pure
{
// push 2
bytes memory m = unicode"overflow ‮‮";
}
}
// ----
// ParserError 8936: (91-114): Mismatching directional override markers in comment or string literal.
Loading

0 comments on commit 89d09cd

Please sign in to comment.