From bc010de9df09f2d730a69734e05e5175ea8bd2d7 Mon Sep 17 00:00:00 2001 From: Zuckjet <1083941774@qq.com> Date: Fri, 14 Aug 2020 06:47:59 +0800 Subject: [PATCH] tokenizer: fix parse bug when tag name is not ASCII alpha (#497) Co-authored-by: zhuyujie --- src/Tokenizer.ts | 21 +++++++++++++++++++ .../Events/34-not-alpha-tags.json | 12 +++++++++++ 2 files changed, 33 insertions(+) create mode 100644 src/__fixtures__/Events/34-not-alpha-tags.json diff --git a/src/Tokenizer.ts b/src/Tokenizer.ts index 34d7d394b..c67bef934 100644 --- a/src/Tokenizer.ts +++ b/src/Tokenizer.ts @@ -32,6 +32,7 @@ const enum State { //comments BeforeComment, InComment, + InSpecialComment, AfterComment1, AfterComment2, @@ -99,6 +100,10 @@ function whitespace(c: string): boolean { return c === " " || c === "\n" || c === "\t" || c === "\f" || c === "\r"; } +function isASCIIAlpha(c: string): boolean { + return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'); +} + interface Callbacks { onattribdata(value: string): void; //TODO implement the new event onattribend(): void; @@ -305,6 +310,8 @@ export default class Tokenizer { } else if (c === "?") { this._state = State.InProcessingInstruction; this._sectionStart = this._index + 1; + } else if (!isASCIIAlpha(c)) { + this._state = State.Text; } else { this._state = !this._xmlMode && (c === "s" || c === "S") @@ -336,6 +343,9 @@ export default class Tokenizer { this._state = State.Text; this._index--; } + } else if (!isASCIIAlpha(c)) { + this._state = State.InSpecialComment; + this._sectionStart = this._index; } else { this._state = State.InClosingTagName; this._sectionStart = this._index; @@ -481,6 +491,15 @@ export default class Tokenizer { _stateInComment(c: string) { if (c === "-") this._state = State.AfterComment1; } + _stateInSpecialComment(c: string) { + if (c === ">") { + this._cbs.oncomment( + this._buffer.substring(this._sectionStart, this._index) + ); + this._state = State.Text; + this._sectionStart = this._index + 1; + } + } 
_stateAfterComment1(c: string) { if (c === "-") { this._state = State.AfterComment2; @@ -718,6 +737,8 @@ export default class Tokenizer { this._stateInAttributeName(c); } else if (this._state === State.InComment) { this._stateInComment(c); + } else if (this._state === State.InSpecialComment) { + this._stateInSpecialComment(c); } else if (this._state === State.BeforeAttributeName) { this._stateBeforeAttributeName(c); } else if (this._state === State.InTagName) { diff --git a/src/__fixtures__/Events/34-not-alpha-tags.json b/src/__fixtures__/Events/34-not-alpha-tags.json new file mode 100644 index 000000000..06f9b687c --- /dev/null +++ b/src/__fixtures__/Events/34-not-alpha-tags.json @@ -0,0 +1,12 @@ +{ + "name": "tag names are not ASCII alpha", + "options": { + "parser": {} + }, + "html": "<12>text</12>", + "expected": [ + { "event": "text", "data": ["<12>text"] }, + { "event": "comment", "data": ["12"] }, + { "event": "commentend", "data": [] } + ] +}