Skip to content

Commit

Permalink
HTML API: Add explicit handling or failure for all tags.
Browse files Browse the repository at this point in the history
The HTML API HTML processor does not yet support all tags. Many tags (e.g. list elements) have some complicated rules in the [https://html.spec.whatwg.org/#parsing-main-inbody "in body" insertion mode].

Implementing these special rules is blocking the implementation for a catch-all rule for "any other tag" because we need to prevent special rules from being handled by the catch-all.

  Any other start tag
  Reconstruct the active formatting elements, if any.

  Insert an HTML element for the token.

  …

This change ensures the HTML Processor fails when handling special tags. This is the same as existing behavior, but will allow us to implement the catch-all "any other tag" handling without unintentionally handling special elements.

Additionally, we add tests that assert the special elements are unhandled. As these tags are implemented, this should help to ensure they're removed from the unsupported tag list.

Props jonsurrell, dmsnell.
Fixes #60092.

git-svn-id: https://develop.svn.wordpress.org/trunk@57248 602fd350-edb4-49c9-b593-d223f7449a82
  • Loading branch information
ockham committed Jan 8, 2024
1 parent b315d4e commit 8b2ed2f
Show file tree
Hide file tree
Showing 4 changed files with 290 additions and 154 deletions.
153 changes: 124 additions & 29 deletions src/wp-includes/html-api/class-wp-html-processor.php
Original file line number Diff line number Diff line change
Expand Up @@ -100,15 +100,19 @@
* The following list specifies the HTML tags that _are_ supported:
*
* - Containers: ADDRESS, BLOCKQUOTE, DETAILS, DIALOG, DIV, FOOTER, HEADER, MAIN, MENU, SPAN, SUMMARY.
* - Form elements: BUTTON, FIELDSET, SEARCH.
* - Custom elements: All custom elements are supported. :)
* - Form elements: BUTTON, DATALIST, FIELDSET, LABEL, LEGEND, METER, PROGRESS, SEARCH.
* - Formatting elements: B, BIG, CODE, EM, FONT, I, SMALL, STRIKE, STRONG, TT, U.
* - Heading elements: H1, H2, H3, H4, H5, H6, HGROUP.
* - Links: A.
* - Lists: DL.
* - Media elements: FIGCAPTION, FIGURE, IMG.
* - Media elements: AUDIO, CANVAS, FIGCAPTION, FIGURE, IMG, MAP, PICTURE, VIDEO.
* - Paragraph: P.
* - Sectioning elements: ARTICLE, ASIDE, NAV, SECTION
* - Deprecated elements: CENTER, DIR
* - Phrasing elements: ABBR, BDI, BDO, CITE, DATA, DEL, DFN, INS, MARK, OUTPUT, Q, SAMP, SUB, SUP, TIME, VAR.
* - Sectioning elements: ARTICLE, ASIDE, NAV, SECTION.
* - Templating elements: SLOT.
* - Text decoration: RUBY.
* - Deprecated elements: ACRONYM, BLINK, CENTER, DIR, ISINDEX, MULTICOL, NEXTID, SPACER.
*
* ### Supported markup
*
Expand Down Expand Up @@ -830,41 +834,132 @@ private function step_in_body() {
$this->reconstruct_active_formatting_elements();
$this->insert_html_element( $this->state->current_token );
return true;
}

/*
* These tags require special handling in the 'in body' insertion mode
* but that handling hasn't yet been implemented.
*
* As the rules for each tag are implemented, the corresponding tag
* name should be removed from this list. An accompanying test should
* help ensure this list is maintained.
*
* @see Tests_HtmlApi_WpHtmlProcessor::test_step_in_body_fails_on_unsupported_tags
*
* Since this switch structure throws a WP_HTML_Unsupported_Exception, it's
* possible to handle "any other start tag" and "any other end tag" below,
* as that guarantees execution doesn't proceed for the unimplemented tags.
*
* @see https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inbody
*/
switch ( $tag_name ) {
case 'APPLET':
case 'AREA':
case 'BASE':
case 'BASEFONT':
case 'BGSOUND':
case 'BODY':
case 'BR':
case 'CAPTION':
case 'COL':
case 'COLGROUP':
case 'DD':
case 'DT':
case 'EMBED':
case 'FORM':
case 'FRAME':
case 'FRAMESET':
case 'HEAD':
case 'HR':
case 'HTML':
case 'IFRAME':
case 'INPUT':
case 'KEYGEN':
case 'LI':
case 'LINK':
case 'LISTING':
case 'MARQUEE':
case 'MATH':
case 'META':
case 'NOBR':
case 'NOEMBED':
case 'NOFRAMES':
case 'NOSCRIPT':
case 'OBJECT':
case 'OL':
case 'OPTGROUP':
case 'OPTION':
case 'PARAM':
case 'PLAINTEXT':
case 'PRE':
case 'RB':
case 'RP':
case 'RT':
case 'RTC':
case 'SARCASM':
case 'SCRIPT':
case 'SELECT':
case 'SOURCE':
case 'STYLE':
case 'SVG':
case 'TABLE':
case 'TBODY':
case 'TD':
case 'TEMPLATE':
case 'TEXTAREA':
case 'TFOOT':
case 'TH':
case 'THEAD':
case 'TITLE':
case 'TR':
case 'TRACK':
case 'UL':
case 'WBR':
case 'XMP':
$this->last_error = self::ERROR_UNSUPPORTED;
throw new WP_HTML_Unsupported_Exception( "Cannot process {$tag_name} element." );
}

if ( ! $this->is_tag_closer() ) {
/*
* > Any other start tag
*/
case '+SPAN':
$this->reconstruct_active_formatting_elements();
$this->insert_html_element( $this->state->current_token );
return true;
$this->reconstruct_active_formatting_elements();
$this->insert_html_element( $this->state->current_token );
return true;
} else {
/*
* > Any other end tag
*/

/*
* Any other end tag
* Find the corresponding tag opener in the stack of open elements, if
* it exists before reaching a special element, which provides a kind
* of boundary in the stack. For example, a `</custom-tag>` should not
* close anything beyond its containing `P` or `DIV` element.
*/
case '-SPAN':
foreach ( $this->state->stack_of_open_elements->walk_up() as $item ) {
// > If node is an HTML element with the same tag name as the token, then:
if ( $item->node_name === $tag_name ) {
$this->generate_implied_end_tags( $tag_name );
foreach ( $this->state->stack_of_open_elements->walk_up() as $node ) {
if ( $tag_name === $node->node_name ) {
break;
}

// > If node is not the current node, then this is a parse error.
if ( self::is_special( $node->node_name ) ) {
// This is a parse error, ignore the token.
return $this->step();
}
}

$this->state->stack_of_open_elements->pop_until( $tag_name );
return true;
}
$this->generate_implied_end_tags( $tag_name );
if ( $node !== $this->state->stack_of_open_elements->current_node() ) {
// @todo Record parse error: this error doesn't impact parsing.
}

// > Otherwise, if node is in the special category, then this is a parse error; ignore the token, and return.
if ( self::is_special( $item->node_name ) ) {
return $this->step();
}
foreach ( $this->state->stack_of_open_elements->walk_up() as $item ) {
$this->state->stack_of_open_elements->pop();
if ( $node === $item ) {
return true;
}
// Execution should not reach here; if it does then something went wrong.
return false;

default:
$this->last_error = self::ERROR_UNSUPPORTED;
throw new WP_HTML_Unsupported_Exception( "Cannot process {$tag_name} element." );
}
}
}

Expand Down Expand Up @@ -1264,7 +1359,7 @@ private function run_adoption_agency_algorithm() {

// > If formatting element is not in the stack of open elements, then this is a parse error; remove the element from the list, and return.
if ( ! $this->state->stack_of_open_elements->contains_node( $formatting_element ) ) {
$this->state->active_formatting_elements->remove_node( $formatting_element->bookmark_name );
$this->state->active_formatting_elements->remove_node( $formatting_element );
return;
}

Expand Down
108 changes: 92 additions & 16 deletions tests/phpunit/tests/html-api/wpHtmlProcessor.php
Original file line number Diff line number Diff line change
Expand Up @@ -60,22 +60,6 @@ public function test_get_tag_is_null_once_document_is_finished() {
$this->assertNull( $p->get_tag() );
}

/**
* Ensures that if the HTML Processor encounters inputs that it can't properly handle,
* that it stops processing the rest of the document. This prevents data corruption.
*
* @ticket 59167
*
* @covers WP_HTML_Processor::next_tag
*/
public function test_stops_processing_after_unsupported_elements() {
$p = WP_HTML_Processor::create_fragment( '<p><x-not-supported></p><p></p>' );
$p->next_tag( 'P' );
$this->assertFalse( $p->next_tag(), 'Stepped into a tag after encountering X-NOT-SUPPORTED element when it should have aborted.' );
$this->assertNull( $p->get_tag(), "Should have aborted processing, but still reported tag {$p->get_tag()} after properly failing to step into tag." );
$this->assertFalse( $p->next_tag( 'P' ), 'Stepped into normal P element after X-NOT-SUPPORTED element when it should have aborted.' );
}

/**
* Ensures that the HTML Processor maintains its internal state through seek calls.
*
Expand Down Expand Up @@ -147,4 +131,96 @@ public function test_fails_to_reconstruct_formatting_elements() {
$this->assertTrue( $p->next_tag( 'EM' ), 'Could not find first EM.' );
$this->assertFalse( $p->next_tag( 'EM' ), 'Should have aborted before finding second EM as it required reconstructing the first EM.' );
}

/**
* Ensures that special handling of unsupported tags is cleaned up
* as handling is implemented. Otherwise there's risk of leaving special
* handling (that is never reached) when tag handling is implemented.
*
* @ticket 60092
*
* @dataProvider data_unsupported_special_in_body_tags
*
* @covers WP_HTML_Processor::step_in_body
*
* @param string $tag_name Name of the tag to test.
*/
public function test_step_in_body_fails_on_unsupported_tags( $tag_name ) {
$fragment = WP_HTML_Processor::create_fragment( '<' . $tag_name . '></' . $tag_name . '>' );
$this->assertFalse( $fragment->next_tag(), 'Should fail to find tag: ' . $tag_name . '.' );
$this->assertEquals( $fragment->get_last_error(), WP_HTML_Processor::ERROR_UNSUPPORTED, 'Should have unsupported last error.' );
}

/**
* Data provider.
*
* @return array[]
*/
public function data_unsupported_special_in_body_tags() {
return array(
'APPLET' => array( 'APPLET' ),
'AREA' => array( 'AREA' ),
'BASE' => array( 'BASE' ),
'BASEFONT' => array( 'BASEFONT' ),
'BGSOUND' => array( 'BGSOUND' ),
'BODY' => array( 'BODY' ),
'BR' => array( 'BR' ),
'CAPTION' => array( 'CAPTION' ),
'COL' => array( 'COL' ),
'COLGROUP' => array( 'COLGROUP' ),
'DD' => array( 'DD' ),
'DT' => array( 'DT' ),
'EMBED' => array( 'EMBED' ),
'FORM' => array( 'FORM' ),
'FRAME' => array( 'FRAME' ),
'FRAMESET' => array( 'FRAMESET' ),
'HEAD' => array( 'HEAD' ),
'HR' => array( 'HR' ),
'HTML' => array( 'HTML' ),
'IFRAME' => array( 'IFRAME' ),
'INPUT' => array( 'INPUT' ),
'KEYGEN' => array( 'KEYGEN' ),
'LI' => array( 'LI' ),
'LINK' => array( 'LINK' ),
'LISTING' => array( 'LISTING' ),
'MARQUEE' => array( 'MARQUEE' ),
'MATH' => array( 'MATH' ),
'META' => array( 'META' ),
'NOBR' => array( 'NOBR' ),
'NOEMBED' => array( 'NOEMBED' ),
'NOFRAMES' => array( 'NOFRAMES' ),
'NOSCRIPT' => array( 'NOSCRIPT' ),
'OBJECT' => array( 'OBJECT' ),
'OL' => array( 'OL' ),
'OPTGROUP' => array( 'OPTGROUP' ),
'OPTION' => array( 'OPTION' ),
'PARAM' => array( 'PARAM' ),
'PLAINTEXT' => array( 'PLAINTEXT' ),
'PRE' => array( 'PRE' ),
'RB' => array( 'RB' ),
'RP' => array( 'RP' ),
'RT' => array( 'RT' ),
'RTC' => array( 'RTC' ),
'SARCASM' => array( 'SARCASM' ),
'SCRIPT' => array( 'SCRIPT' ),
'SELECT' => array( 'SELECT' ),
'SOURCE' => array( 'SOURCE' ),
'STYLE' => array( 'STYLE' ),
'SVG' => array( 'SVG' ),
'TABLE' => array( 'TABLE' ),
'TBODY' => array( 'TBODY' ),
'TD' => array( 'TD' ),
'TEMPLATE' => array( 'TEMPLATE' ),
'TEXTAREA' => array( 'TEXTAREA' ),
'TFOOT' => array( 'TFOOT' ),
'TH' => array( 'TH' ),
'THEAD' => array( 'THEAD' ),
'TITLE' => array( 'TITLE' ),
'TR' => array( 'TR' ),
'TRACK' => array( 'TRACK' ),
'UL' => array( 'UL' ),
'WBR' => array( 'WBR' ),
'XMP' => array( 'XMP' ),
);
}
}
Loading

0 comments on commit 8b2ed2f

Please sign in to comment.