pythongh-105017: Include CRLF lines in strings and column numbers (pythonGH-105030)

(cherry picked from commit 96fff35)

Co-authored-by: Marta Gómez Macías <mgmacias@google.com>
Co-authored-by: Pablo Galindo <pablogsal@gmail.com>
2 people authored and miss-islington committed May 28, 2023
1 parent 41b622b commit c220d12
Showing 6 changed files with 74 additions and 26 deletions.
44 changes: 36 additions & 8 deletions Lib/test/test_tokenize.py
@@ -85,11 +85,29 @@ def test_basic(self):
     DEDENT     ''            (5, 0) (5, 0)
     """)

-        self.check_tokenize("foo='bar'\r\n", """\
-    NAME       'foo'         (1, 0) (1, 3)
-    OP         '='           (1, 3) (1, 4)
-    STRING     "'bar'"       (1, 4) (1, 9)
-    NEWLINE    '\\n'          (1, 9) (1, 10)
+        self.check_tokenize("if True:\r\n    # NL\r\n    foo='bar'\r\n\r\n", """\
+    NAME       'if'          (1, 0) (1, 2)
+    NAME       'True'        (1, 3) (1, 7)
+    OP         ':'           (1, 7) (1, 8)
+    NEWLINE    '\\r\\n'        (1, 8) (1, 10)
+    COMMENT    '# NL'        (2, 4) (2, 8)
+    NL         '\\r\\n'        (2, 8) (2, 10)
+    INDENT     '    '        (3, 0) (3, 4)
+    NAME       'foo'         (3, 4) (3, 7)
+    OP         '='           (3, 7) (3, 8)
+    STRING     "\'bar\'"     (3, 8) (3, 13)
+    NEWLINE    '\\r\\n'        (3, 13) (3, 15)
+    NL         '\\r\\n'        (4, 0) (4, 2)
+    DEDENT     ''            (5, 0) (5, 0)
     """)
+
+        self.check_tokenize("x = 1 + \\\r\n1\r\n", """\
+    NAME       'x'           (1, 0) (1, 1)
+    OP         '='           (1, 2) (1, 3)
+    NUMBER     '1'           (1, 4) (1, 5)
+    OP         '+'           (1, 6) (1, 7)
+    NUMBER     '1'           (2, 0) (2, 1)
+    NEWLINE    '\\r\\n'        (2, 1) (2, 3)
+    """)

     indent_error_file = b"""\
@@ -1784,9 +1802,9 @@ def test_random_files(self):
             if support.verbose >= 2:
                 print('tokenize', testfile)
             with open(testfile, 'rb') as f:
-                # with self.subTest(file=testfile):
-                self.check_roundtrip(f)
-                self.check_line_extraction(f)
+                with self.subTest(file=testfile):
+                    self.check_roundtrip(f)
+                    self.check_line_extraction(f)


def roundtrip(self, code):
@@ -2084,6 +2102,10 @@ def test_string(self):
 b\
 c"""', """\
     STRING     'rb"\""a\\\\\\nb\\\\\\nc"\""' (1, 0) (3, 4)
     """)
+
+        self.check_tokenize(r'"hola\\\r\ndfgf"', """\
+    STRING     \'"hola\\\\\\\\\\\\r\\\\ndfgf"\' (1, 0) (1, 16)
+    """)

self.check_tokenize('f"abc"', """\
@@ -2120,6 +2142,12 @@ def test_string(self):
     FSTRING_START 'Rf"' (1, 0) (1, 3)
     FSTRING_MIDDLE 'abc\\\\\\ndef' (1, 3) (2, 3)
     FSTRING_END '"' (2, 3) (2, 4)
     """)
+
+        self.check_tokenize(r'f"hola\\\r\ndfgf"', """\
+    FSTRING_START \'f"\' (1, 0) (1, 2)
+    FSTRING_MIDDLE 'hola\\\\\\\\\\\\r\\\\ndfgf' (1, 2) (1, 16)
+    FSTRING_END \'"\' (1, 16) (1, 17)
+    """)

def test_function(self):
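The new expectations are easier to follow next to a live run. Below is a minimal sketch of what these tests assert, assuming an interpreter that already carries this fix (on older builds the NL and NEWLINE strings for this input come back normalized to '\n'); the source string mirrors the new test_basic case:

import io
import tokenize

# CRLF source, as in the new test above.
source = "if True:\r\n    # NL\r\n    foo='bar'\r\n\r\n"
for tok in tokenize.generate_tokens(io.StringIO(source).readline):
    if tok.type in (tokenize.NEWLINE, tokenize.NL):
        # With this change the token string keeps the literal '\r\n'
        # and the end column covers both characters.
        print(tokenize.tok_name[tok.type], repr(tok.string), tok.start, tok.end)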
@@ -0,0 +1 @@
+Show CRLF lines in the tokenize string attribute in both NL and NEWLINE tokens. Patch by Marta Gómez.
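The practical payoff of this NEWS entry is that a full five-tuple round-trip through tokenize.untokenize can reproduce CRLF source exactly, which is what the re-enabled check_roundtrip calls in test_random_files exercise. A hedged sketch, assuming a build with this fix (the sample source is illustrative):

import io
import tokenize

source = "x = 1\r\ny = 2\r\n"
tokens = list(tokenize.generate_tokens(io.StringIO(source).readline))
# NL/NEWLINE token strings now carry '\r\n' verbatim, so untokenizing
# the full five-tuples should give back the CRLF line endings.
assert tokenize.untokenize(tokens) == source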
4 changes: 2 additions & 2 deletions Parser/pegen.c
@@ -924,9 +924,9 @@ _PyPegen_run_parser_from_string(const char *str, int start_rule, PyObject *filen

     struct tok_state *tok;
     if (flags != NULL && flags->cf_flags & PyCF_IGNORE_COOKIE) {
-        tok = _PyTokenizer_FromUTF8(str, exec_input);
+        tok = _PyTokenizer_FromUTF8(str, exec_input, 0);
     } else {
-        tok = _PyTokenizer_FromString(str, exec_input);
+        tok = _PyTokenizer_FromString(str, exec_input, 0);
     }
     if (tok == NULL) {
         if (PyErr_Occurred()) {
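Note that both parser entry points pass 0 for the new preserve_crlf argument, so regular compilation keeps translating \r\n to \n and is deliberately unaffected; only the tokenize module (see Python/Python-tokenize.c below) opts in with 1. A quick check of that invariant, with an illustrative snippet and filename:

# CRLF source must still compile and run exactly like LF source; only
# tokenize's reported token strings changed, not what the parser accepts.
code = compile("a = 'x'\r\nb = a\r\n", "<crlf>", "exec")
ns = {}
exec(code, ns)
assert ns["b"] == "x"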
38 changes: 26 additions & 12 deletions Parser/tokenizer.c
@@ -772,7 +772,8 @@ translate_into_utf8(const char* str, const char* enc) {


 static char *
-translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
+translate_newlines(const char *s, int exec_input, int preserve_crlf,
+                   struct tok_state *tok) {
     int skip_next_lf = 0;
     size_t needed_length = strlen(s) + 2, final_length;
     char *buf, *current;
@@ -792,7 +793,7 @@ translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
                 break;
             }
         }
-        if (c == '\r') {
+        if (!preserve_crlf && c == '\r') {
             skip_next_lf = 1;
             c = '\n';
         }
@@ -822,14 +823,14 @@ translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
        inside TOK. */

 static char *
-decode_str(const char *input, int single, struct tok_state *tok)
+decode_str(const char *input, int single, struct tok_state *tok, int preserve_crlf)
 {
     PyObject* utf8 = NULL;
     char *str;
     const char *s;
     const char *newl[2] = {NULL, NULL};
     int lineno = 0;
-    tok->input = str = translate_newlines(input, single, tok);
+    tok->input = str = translate_newlines(input, single, preserve_crlf, tok);
     if (str == NULL)
         return NULL;
     tok->enc = NULL;
@@ -881,14 +882,14 @@ decode_str(const char *input, int single, struct tok_state *tok)
 /* Set up tokenizer for string */

 struct tok_state *
-_PyTokenizer_FromString(const char *str, int exec_input)
+_PyTokenizer_FromString(const char *str, int exec_input, int preserve_crlf)
 {
     struct tok_state *tok = tok_new();
     char *decoded;

     if (tok == NULL)
         return NULL;
-    decoded = decode_str(str, exec_input, tok);
+    decoded = decode_str(str, exec_input, tok, preserve_crlf);
     if (decoded == NULL) {
         _PyTokenizer_Free(tok);
         return NULL;
@@ -902,13 +903,13 @@ _PyTokenizer_FromString(const char *str, int exec_input)
 /* Set up tokenizer for UTF-8 string */

 struct tok_state *
-_PyTokenizer_FromUTF8(const char *str, int exec_input)
+_PyTokenizer_FromUTF8(const char *str, int exec_input, int preserve_crlf)
 {
     struct tok_state *tok = tok_new();
     char *translated;
     if (tok == NULL)
         return NULL;
-    tok->input = translated = translate_newlines(str, exec_input, tok);
+    tok->input = translated = translate_newlines(str, exec_input, preserve_crlf, tok);
     if (translated == NULL) {
         _PyTokenizer_Free(tok);
         return NULL;
@@ -1050,7 +1051,7 @@ tok_underflow_interactive(struct tok_state *tok) {
     }
     char *newtok = PyOS_Readline(tok->fp ? tok->fp : stdin, stdout, tok->prompt);
     if (newtok != NULL) {
-        char *translated = translate_newlines(newtok, 0, tok);
+        char *translated = translate_newlines(newtok, 0, 0, tok);
         PyMem_Free(newtok);
         if (translated == NULL) {
             return 0;
@@ -1594,6 +1595,9 @@ tok_decimal_tail(struct tok_state *tok)
 static inline int
 tok_continuation_line(struct tok_state *tok) {
     int c = tok_nextc(tok);
+    if (c == '\r') {
+        c = tok_nextc(tok);
+    }
     if (c != '\n') {
         tok->done = E_LINECONT;
         return -1;
@@ -1693,7 +1697,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
             }
         }
         tok_backup(tok, c);
-        if (c == '#' || c == '\n') {
+        if (c == '#' || c == '\n' || c == '\r') {
             /* Lines with only whitespace and/or comments
                shouldn't affect the indentation and are
                not passed to the parser as NEWLINE tokens,
@@ -1822,7 +1826,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
         const char *prefix, *type_start;
         int current_starting_col_offset;

-        while (c != EOF && c != '\n') {
+        while (c != EOF && c != '\n' && c != '\r') {
             c = tok_nextc(tok);
         }

@@ -2002,6 +2006,10 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
         return MAKE_TOKEN(NAME);
     }

+    if (c == '\r') {
+        c = tok_nextc(tok);
+    }
+
     /* Newline */
     if (c == '\n') {
         tok->atbol = 1;
@@ -2405,7 +2413,10 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
             else {
                 end_quote_size = 0;
                 if (c == '\\') {
-                    tok_nextc(tok);  /* skip escaped char */
+                    c = tok_nextc(tok);  /* skip escaped char */
+                    if (c == '\r') {
+                        c = tok_nextc(tok);
+                    }
                 }
             }
         }
@@ -2696,6 +2707,9 @@ tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct
         return MAKE_TOKEN(FSTRING_MIDDLE);
     } else if (c == '\\') {
         int peek = tok_nextc(tok);
+        if (peek == '\r') {
+            peek = tok_nextc(tok);
+        }
         // Special case when the backslash is right before a curly
         // brace. We have to restore and return the control back
         // to the loop for the next iteration.
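The pattern across these hunks: once '\r' survives newline translation, every consumer of a newline must step over an optional '\r' first, in continuation lines, blank or comment-only lines, the NEWLINE path, escaped characters inside strings, and f-string middles. The tok_continuation_line change is the easiest one to observe from Python; a sketch mirroring the new test, assuming a build with this fix:

import io
import tokenize

# Backslash followed by CRLF is an explicit line join. Within this change,
# tok_continuation_line must skip the preserved '\r' before checking for
# '\n', or backslash + CRLF would be rejected as a bad line continuation.
source = "x = 1 + \\\r\n1\r\n"
for tok in tokenize.generate_tokens(io.StringIO(source).readline):
    print(tokenize.tok_name[tok.type], repr(tok.string), tok.start, tok.end)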
4 changes: 2 additions & 2 deletions Parser/tokenizer.h
@@ -135,8 +135,8 @@ struct tok_state {
 #endif
 };

-extern struct tok_state *_PyTokenizer_FromString(const char *, int);
-extern struct tok_state *_PyTokenizer_FromUTF8(const char *, int);
+extern struct tok_state *_PyTokenizer_FromString(const char *, int, int);
+extern struct tok_state *_PyTokenizer_FromUTF8(const char *, int, int);
 extern struct tok_state *_PyTokenizer_FromFile(FILE *, const char*,
                                                const char *, const char *);
 extern void _PyTokenizer_Free(struct tok_state *);
9 changes: 7 additions & 2 deletions Python/Python-tokenize.c
@@ -55,7 +55,7 @@ tokenizeriter_new_impl(PyTypeObject *type, const char *source,
     if (filename == NULL) {
         return NULL;
     }
-    self->tok = _PyTokenizer_FromUTF8(source, 1);
+    self->tok = _PyTokenizer_FromUTF8(source, 1, 1);
     if (self->tok == NULL) {
         Py_DECREF(filename);
         return NULL;
@@ -240,7 +240,12 @@ tokenizeriter_next(tokenizeriterobject *it)
             type = NAME;
         }
         else if (type == NEWLINE) {
-            str = PyUnicode_FromString("\n");
+            Py_DECREF(str);
+            if (it->tok->start[0] == '\r') {
+                str = PyUnicode_FromString("\r\n");
+            } else {
+                str = PyUnicode_FromString("\n");
+            }
             end_col_offset++;
         }
     }
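On the Python-visible side, tokenizeriter_next used to hardcode the NEWLINE string to "\n"; it now releases the previously built string (the Py_DECREF) and checks the first raw character of the token to decide between "\r\n" and "\n". A hedged sketch of the observable result (positions computed for this particular input, on a build with this fix):

import io
import tokenize

toks = list(tokenize.generate_tokens(io.StringIO("a = 1\r\n").readline))
newline = next(t for t in toks if t.type == tokenize.NEWLINE)
# The token string is the literal CRLF and the end column spans both
# characters: (1, 5) to (1, 7) for this seven-character line.
assert newline.string == "\r\n"
assert (newline.start, newline.end) == ((1, 5), (1, 7))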
