summary refs log tree commit diff stats
path: root/wp-includes/html-api/class-wp-html-processor.php
diff options
context:
space:
mode:
Diffstat (limited to 'wp-includes/html-api/class-wp-html-processor.php')
-rw-r--r-- wp-includes/html-api/class-wp-html-processor.php 712
1 files changed, 599 insertions, 113 deletions
diff --git a/wp-includes/html-api/class-wp-html-processor.php b/wp-includes/html-api/class-wp-html-processor.php
index f27f83b..c76cc19 100644
--- a/wp-includes/html-api/class-wp-html-processor.php
+++ b/wp-includes/html-api/class-wp-html-processor.php
@@ -99,12 +99,20 @@
*
* The following list specifies the HTML tags that _are_ supported:
*
+ * - Containers: ADDRESS, BLOCKQUOTE, DETAILS, DIALOG, DIV, FOOTER, HEADER, MAIN, MENU, SPAN, SUMMARY.
+ * - Custom elements: All custom elements are supported. :)
+ * - Form elements: BUTTON, DATALIST, FIELDSET, INPUT, LABEL, LEGEND, METER, PROGRESS, SEARCH.
+ * - Formatting elements: B, BIG, CODE, EM, FONT, I, PRE, SMALL, STRIKE, STRONG, TT, U, WBR.
+ * - Heading elements: H1, H2, H3, H4, H5, H6, HGROUP.
* - Links: A.
- * - The formatting elements: B, BIG, CODE, EM, FONT, I, SMALL, STRIKE, STRONG, TT, U.
- * - Containers: DIV, FIGCAPTION, FIGURE, SPAN.
- * - Form elements: BUTTON.
- * - Paragraph: P.
- * - Void elements: IMG.
+ * - Lists: DD, DL, DT, LI, OL, UL.
+ * - Media elements: AUDIO, CANVAS, EMBED, FIGCAPTION, FIGURE, IMG, MAP, PICTURE, SOURCE, TRACK, VIDEO.
+ * - Paragraph: BR, P.
+ * - Phrasing elements: ABBR, AREA, BDI, BDO, CITE, DATA, DEL, DFN, INS, MARK, OUTPUT, Q, SAMP, SUB, SUP, TIME, VAR.
+ * - Sectioning elements: ARTICLE, ASIDE, HR, NAV, SECTION.
+ * - Templating elements: SLOT.
+ * - Text decoration: RUBY.
+ * - Deprecated elements: ACRONYM, BLINK, CENTER, DIR, ISINDEX, KEYGEN, LISTING, MULTICOL, NEXTID, PARAM, SPACER.
*
* ### Supported markup
*
@@ -142,17 +150,6 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
const MAX_BOOKMARKS = 100;
/**
- * Static query for instructing the Tag Processor to visit every token.
- *
- * @access private
- *
- * @since 6.4.0
- *
- * @var array
- */
- const VISIT_EVERYTHING = array( 'tag_closers' => 'visit' );
-
- /**
* Holds the working state of the parser, including the stack of
* open elements and the stack of active formatting elements.
*
@@ -244,15 +241,15 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
return null;
}
- $p = new self( $html, self::CONSTRUCTOR_UNLOCK_CODE );
- $p->state->context_node = array( 'BODY', array() );
- $p->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_BODY;
+ $processor = new self( $html, self::CONSTRUCTOR_UNLOCK_CODE );
+ $processor->state->context_node = array( 'BODY', array() );
+ $processor->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_BODY;
- // @TODO: Create "fake" bookmarks for non-existent but implied nodes.
- $p->bookmarks['root-node'] = new WP_HTML_Span( 0, 0 );
- $p->bookmarks['context-node'] = new WP_HTML_Span( 0, 0 );
+ // @todo Create "fake" bookmarks for non-existent but implied nodes.
+ $processor->bookmarks['root-node'] = new WP_HTML_Span( 0, 0 );
+ $processor->bookmarks['context-node'] = new WP_HTML_Span( 0, 0 );
- $p->state->stack_of_open_elements->push(
+ $processor->state->stack_of_open_elements->push(
new WP_HTML_Token(
'root-node',
'HTML',
@@ -260,15 +257,15 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
)
);
- $p->state->stack_of_open_elements->push(
+ $processor->state->stack_of_open_elements->push(
new WP_HTML_Token(
'context-node',
- $p->state->context_node[0],
+ $processor->state->context_node[0],
false
)
);
- return $p;
+ return $processor;
}
/**
@@ -342,7 +339,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
/**
* Finds the next tag matching the $query.
*
- * @TODO: Support matching the class name and tag name.
+ * @todo Support matching the class name and tag name.
*
* @since 6.4.0
*
@@ -364,6 +361,10 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
public function next_tag( $query = null ) {
if ( null === $query ) {
while ( $this->step() ) {
+ if ( '#tag' !== $this->get_token_type() ) {
+ continue;
+ }
+
if ( ! $this->is_tag_closer() ) {
return true;
}
@@ -387,6 +388,10 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
if ( ! ( array_key_exists( 'breadcrumbs', $query ) && is_array( $query['breadcrumbs'] ) ) ) {
while ( $this->step() ) {
+ if ( '#tag' !== $this->get_token_type() ) {
+ continue;
+ }
+
if ( ! $this->is_tag_closer() ) {
return true;
}
@@ -408,6 +413,10 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
$match_offset = isset( $query['match_offset'] ) ? (int) $query['match_offset'] : 1;
while ( $match_offset > 0 && $this->step() ) {
+ if ( '#tag' !== $this->get_token_type() ) {
+ continue;
+ }
+
if ( $this->matches_breadcrumbs( $breadcrumbs ) && 0 === --$match_offset ) {
return true;
}
@@ -417,6 +426,24 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
}
/**
+ * Advances by delegating to step(), so that internal accounting is maintained for HTML
+ * semantic rules while the underlying Tag Processor class is seeking to a bookmark.
+ *
+ * This doesn't currently have a way to represent non-tags and doesn't process
+ * semantic rules for text nodes. For access to the raw tokens consider using
+ * WP_HTML_Tag_Processor instead. NOTE(review): step_in_body() handles '#text' tokens; confirm this caveat still holds.
+ *
+ * @since 6.5.0 Added for internal support; do not use.
+ *
+ * @access private
+ *
+ * @return bool The result of step(); false when no further token could be processed.
+ */
+ public function next_token() {
+ return $this->step();
+ }
+
+ /**
* Indicates if the currently-matched tag matches the given breadcrumbs.
*
* A "*" represents a single tag wildcard: it matches any one tag, but never the absence of a tag.
@@ -442,10 +469,6 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
* @return bool Whether the currently-matched tag is found at the given nested structure.
*/
public function matches_breadcrumbs( $breadcrumbs ) {
- if ( ! $this->get_tag() ) {
- return false;
- }
-
// Everything matches when there are zero constraints.
if ( 0 === count( $breadcrumbs ) ) {
return true;
@@ -492,7 +515,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
return false;
}
- if ( self::PROCESS_NEXT_NODE === $node_to_process ) {
+ if ( self::REPROCESS_CURRENT_NODE !== $node_to_process ) {
/*
* Void elements still hop onto the stack of open elements even though
* there's no corresponding closing tag. This is important for managing
@@ -502,28 +525,42 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
* When moving on to the next node, therefore, if the bottom-most element
* on the stack is a void element, it must be closed.
*
- * @TODO: Once self-closing foreign elements and BGSOUND are supported,
+ * @todo Once self-closing foreign elements and BGSOUND are supported,
* they must also be implicitly closed here too. BGSOUND is
* special since it's only self-closing if the self-closing flag
* is provided in the opening tag, otherwise it expects a tag closer.
*/
$top_node = $this->state->stack_of_open_elements->current_node();
- if ( $top_node && self::is_void( $top_node->node_name ) ) {
+ if (
+ $top_node && (
+ // Void elements.
+ self::is_void( $top_node->node_name ) ||
+ // Comments, text nodes, and other atomic tokens.
+ '#' === $top_node->node_name[0] ||
+ // Doctype declarations.
+ 'html' === $top_node->node_name
+ )
+ ) {
$this->state->stack_of_open_elements->pop();
}
+ }
- parent::next_tag( self::VISIT_EVERYTHING );
+ if ( self::PROCESS_NEXT_NODE === $node_to_process ) {
+ parent::next_token();
}
// Finish stepping when there are no more tokens in the document.
- if ( null === $this->get_tag() ) {
+ if (
+ WP_HTML_Tag_Processor::STATE_INCOMPLETE_INPUT === $this->parser_state ||
+ WP_HTML_Tag_Processor::STATE_COMPLETE === $this->parser_state
+ ) {
return false;
}
$this->state->current_token = new WP_HTML_Token(
- $this->bookmark_tag(),
- $this->get_tag(),
- $this->is_tag_closer(),
+ $this->bookmark_token(),
+ $this->get_token_name(),
+ $this->has_self_closing_flag(),
$this->release_internal_bookmark_on_destruct
);
@@ -551,9 +588,9 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
* Breadcrumbs start at the outermost parent and descend toward the matched element.
* They always include the entire path from the root HTML node to the matched element.
*
- * @TODO: It could be more efficient to expose a generator-based version of this function
- * to avoid creating the array copy on tag iteration. If this is done, it would likely
- * be more useful to walk up the stack when yielding instead of starting at the top.
+ * @todo It could be more efficient to expose a generator-based version of this function
+ * to avoid creating the array copy on tag iteration. If this is done, it would likely
+ * be more useful to walk up the stack when yielding instead of starting at the top.
*
* Example
*
@@ -566,10 +603,6 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
* @return string[]|null Array of tag names representing path to matched node, if matched, otherwise NULL.
*/
public function get_breadcrumbs() {
- if ( ! $this->get_tag() ) {
- return null;
- }
-
$breadcrumbs = array();
foreach ( $this->state->stack_of_open_elements->walk_down() as $stack_item ) {
$breadcrumbs[] = $stack_item->node_name;
@@ -594,17 +627,67 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
* @return bool Whether an element was found.
*/
private function step_in_body() {
- $tag_name = $this->get_tag();
- $op_sigil = $this->is_tag_closer() ? '-' : '+';
- $op = "{$op_sigil}{$tag_name}";
+ $token_name = $this->get_token_name();
+ $token_type = $this->get_token_type();
+ $op_sigil = '#tag' === $token_type ? ( $this->is_tag_closer() ? '-' : '+' ) : '';
+ $op = "{$op_sigil}{$token_name}";
switch ( $op ) {
+ case '#comment':
+ case '#funky-comment':
+ case '#presumptuous-tag':
+ $this->insert_html_element( $this->state->current_token );
+ return true;
+
+ case '#text':
+ $current_token = $this->bookmarks[ $this->state->current_token->bookmark_name ];
+
+ /*
+ * > A character token that is U+0000 NULL
+ *
+ * Any successive sequence of NULL bytes is ignored and won't
+ * trigger active format reconstruction. Therefore, if the text
+ * only comprises NULL bytes then the token should be ignored
+ * here, but if there are any other characters in the stream
+ * the active formats should be reconstructed (done just below).
+ */
+ if (
+ 1 <= $current_token->length &&
+ "\x00" === $this->html[ $current_token->start ] &&
+ strspn( $this->html, "\x00", $current_token->start, $current_token->length ) === $current_token->length
+ ) {
+ // Parse error: ignore the token before any formatting elements are reconstructed.
+ return $this->step();
+ }
+
+ $this->reconstruct_active_formatting_elements();
+
+ /*
+ * Whitespace-only text does not affect the frameset-ok flag.
+ * It is probably inter-element whitespace, but it may also
+ * contain character references which decode only to whitespace.
+ */
+ $text = $this->get_modifiable_text();
+ if ( strlen( $text ) !== strspn( $text, " \t\n\f\r" ) ) {
+ $this->state->frameset_ok = false;
+ }
+
+ $this->insert_html_element( $this->state->current_token );
+ return true;
+
+ case 'html':
+ /*
+ * > A DOCTYPE token
+ * > Parse error. Ignore the token.
+ */
+ return $this->step();
+
/*
* > A start tag whose tag name is "button"
*/
case '+BUTTON':
if ( $this->state->stack_of_open_elements->has_element_in_scope( 'BUTTON' ) ) {
- // @TODO: Indicate a parse error once it's possible. This error does not impact the logic here.
+ // @todo Indicate a parse error once it's possible. This error does not impact the logic here.
$this->generate_implied_end_tags();
$this->state->stack_of_open_elements->pop_until( 'BUTTON' );
}
@@ -621,11 +704,31 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
* > "fieldset", "figcaption", "figure", "footer", "header", "hgroup",
* > "main", "menu", "nav", "ol", "p", "search", "section", "summary", "ul"
*/
+ case '+ADDRESS':
+ case '+ARTICLE':
+ case '+ASIDE':
case '+BLOCKQUOTE':
+ case '+CENTER':
+ case '+DETAILS':
+ case '+DIALOG':
+ case '+DIR':
case '+DIV':
+ case '+DL':
+ case '+FIELDSET':
case '+FIGCAPTION':
case '+FIGURE':
+ case '+FOOTER':
+ case '+HEADER':
+ case '+HGROUP':
+ case '+MAIN':
+ case '+MENU':
+ case '+NAV':
+ case '+OL':
case '+P':
+ case '+SEARCH':
+ case '+SECTION':
+ case '+SUMMARY':
+ case '+UL':
if ( $this->state->stack_of_open_elements->has_p_in_button_scope() ) {
$this->close_a_p_element();
}
@@ -639,22 +742,213 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
* > "figcaption", "figure", "footer", "header", "hgroup", "listing", "main",
* > "menu", "nav", "ol", "pre", "search", "section", "summary", "ul"
*/
+ case '-ADDRESS':
+ case '-ARTICLE':
+ case '-ASIDE':
case '-BLOCKQUOTE':
case '-BUTTON':
+ case '-CENTER':
+ case '-DETAILS':
+ case '-DIALOG':
+ case '-DIR':
case '-DIV':
+ case '-DL':
+ case '-FIELDSET':
case '-FIGCAPTION':
case '-FIGURE':
- if ( ! $this->state->stack_of_open_elements->has_element_in_scope( $tag_name ) ) {
- // @TODO: Report parse error.
+ case '-FOOTER':
+ case '-HEADER':
+ case '-HGROUP':
+ case '-LISTING':
+ case '-MAIN':
+ case '-MENU':
+ case '-NAV':
+ case '-OL':
+ case '-PRE':
+ case '-SEARCH':
+ case '-SECTION':
+ case '-SUMMARY':
+ case '-UL':
+ if ( ! $this->state->stack_of_open_elements->has_element_in_scope( $token_name ) ) {
+ // @todo Report parse error.
// Ignore the token.
return $this->step();
}
$this->generate_implied_end_tags();
- if ( $this->state->stack_of_open_elements->current_node()->node_name !== $tag_name ) {
- // @TODO: Record parse error: this error doesn't impact parsing.
+ if ( $this->state->stack_of_open_elements->current_node()->node_name !== $token_name ) {
+ // @todo Record parse error: this error doesn't impact parsing.
+ }
+ $this->state->stack_of_open_elements->pop_until( $token_name );
+ return true;
+
+ /*
+ * > A start tag whose tag name is one of: "h1", "h2", "h3", "h4", "h5", "h6"
+ */
+ case '+H1':
+ case '+H2':
+ case '+H3':
+ case '+H4':
+ case '+H5':
+ case '+H6':
+ if ( $this->state->stack_of_open_elements->has_p_in_button_scope() ) {
+ $this->close_a_p_element();
+ }
+
+ if (
+ in_array(
+ $this->state->stack_of_open_elements->current_node()->node_name,
+ array( 'H1', 'H2', 'H3', 'H4', 'H5', 'H6' ),
+ true
+ )
+ ) {
+ // @todo Indicate a parse error once it's possible.
+ $this->state->stack_of_open_elements->pop();
+ }
+
+ $this->insert_html_element( $this->state->current_token );
+ return true;
+
+ /*
+ * > A start tag whose tag name is one of: "pre", "listing"
+ */
+ case '+PRE':
+ case '+LISTING':
+ if ( $this->state->stack_of_open_elements->has_p_in_button_scope() ) {
+ $this->close_a_p_element();
+ }
+ $this->insert_html_element( $this->state->current_token );
+ $this->state->frameset_ok = false;
+ return true;
+
+ /*
+ * > An end tag whose tag name is one of: "h1", "h2", "h3", "h4", "h5", "h6"
+ */
+ case '-H1':
+ case '-H2':
+ case '-H3':
+ case '-H4':
+ case '-H5':
+ case '-H6':
+ if ( ! $this->state->stack_of_open_elements->has_element_in_scope( '(internal: H1 through H6 - do not use)' ) ) {
+ /*
+ * This is a parse error; ignore the token.
+ *
+ * @todo Indicate a parse error once it's possible.
+ */
+ return $this->step();
+ }
+
+ $this->generate_implied_end_tags();
+
+ if ( $this->state->stack_of_open_elements->current_node()->node_name !== $token_name ) {
+ // @todo Record parse error: this error doesn't impact parsing.
}
- $this->state->stack_of_open_elements->pop_until( $tag_name );
+
+ $this->state->stack_of_open_elements->pop_until( '(internal: H1 through H6 - do not use)' );
+ return true;
+
+ /*
+ * > A start tag whose tag name is "li"
+ * > A start tag whose tag name is one of: "dd", "dt"
+ */
+ case '+DD':
+ case '+DT':
+ case '+LI':
+ $this->state->frameset_ok = false;
+ $node = $this->state->stack_of_open_elements->current_node();
+ $is_li = 'LI' === $token_name;
+
+ in_body_list_loop:
+ /*
+ * The logic for LI and DT/DD is the same except for one point: LI elements _only_
+ * close other LI elements, but a DT or DD element closes _any_ open DT or DD element.
+ */
+ if ( $is_li ? 'LI' === $node->node_name : ( 'DD' === $node->node_name || 'DT' === $node->node_name ) ) {
+ $node_name = $is_li ? 'LI' : $node->node_name;
+ $this->generate_implied_end_tags( $node_name );
+ if ( $node_name !== $this->state->stack_of_open_elements->current_node()->node_name ) {
+ // @todo Indicate a parse error once it's possible. This error does not impact the logic here.
+ }
+
+ $this->state->stack_of_open_elements->pop_until( $node_name );
+ goto in_body_list_done;
+ }
+
+ if (
+ 'ADDRESS' !== $node->node_name &&
+ 'DIV' !== $node->node_name &&
+ 'P' !== $node->node_name &&
+ $this->is_special( $node->node_name )
+ ) {
+ /*
+ * > If node is in the special category, but is not an address, div,
+ * > or p element, then jump to the step labeled done below.
+ */
+ goto in_body_list_done;
+ } else {
+ /*
+ * > Otherwise, set node to the previous entry in the stack of open elements
+ * > and return to the step labeled loop.
+ */
+ foreach ( $this->state->stack_of_open_elements->walk_up( $node ) as $item ) {
+ $node = $item;
+ break;
+ }
+ goto in_body_list_loop;
+ }
+
+ in_body_list_done:
+ if ( $this->state->stack_of_open_elements->has_p_in_button_scope() ) {
+ $this->close_a_p_element();
+ }
+
+ $this->insert_html_element( $this->state->current_token );
+ return true;
+
+ /*
+ * > An end tag whose tag name is "li"
+ * > An end tag whose tag name is one of: "dd", "dt"
+ */
+ case '-DD':
+ case '-DT':
+ case '-LI':
+ if (
+ /*
+ * An end tag whose tag name is "li":
+ * If the stack of open elements does not have an li element in list item scope,
+ * then this is a parse error; ignore the token.
+ */
+ (
+ 'LI' === $token_name &&
+ ! $this->state->stack_of_open_elements->has_element_in_list_item_scope( 'LI' )
+ ) ||
+ /*
+ * An end tag whose tag name is one of: "dd", "dt":
+ * If the stack of open elements does not have an element in scope that is an
+ * HTML element with the same tag name as that of the token, then this is a
+ * parse error; ignore the token.
+ */
+ (
+ 'LI' !== $token_name &&
+ ! $this->state->stack_of_open_elements->has_element_in_scope( $token_name )
+ )
+ ) {
+ /*
+ * This is a parse error, ignore the token.
+ *
+ * @todo Indicate a parse error once it's possible.
+ */
+ return $this->step();
+ }
+
+ $this->generate_implied_end_tags( $token_name );
+
+ if ( $token_name !== $this->state->stack_of_open_elements->current_node()->node_name ) {
+ // @todo Indicate a parse error once it's possible. This error does not impact the logic here.
+ }
+
+ $this->state->stack_of_open_elements->pop_until( $token_name );
return true;
/*
@@ -730,47 +1024,174 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
return true;
/*
+ * > An end tag whose tag name is "br"
+ * > Parse error. Drop the attributes from the token, and act as described in the next
+ * > entry; i.e. act as if this was a "br" start tag token with no attributes, rather
+ * > than the end tag token that it actually is.
+ */
+ case '-BR':
+ $this->last_error = self::ERROR_UNSUPPORTED;
+ throw new WP_HTML_Unsupported_Exception( 'Closing BR tags require unimplemented special handling.' );
+
+ /*
* > A start tag whose tag name is one of: "area", "br", "embed", "img", "keygen", "wbr"
*/
+ case '+AREA':
+ case '+BR':
+ case '+EMBED':
case '+IMG':
+ case '+KEYGEN':
+ case '+WBR':
$this->reconstruct_active_formatting_elements();
$this->insert_html_element( $this->state->current_token );
+ $this->state->frameset_ok = false;
return true;
/*
- * > Any other start tag
+ * > A start tag whose tag name is "input"
*/
- case '+SPAN':
+ case '+INPUT':
$this->reconstruct_active_formatting_elements();
$this->insert_html_element( $this->state->current_token );
+ $type_attribute = $this->get_attribute( 'type' );
+ /*
+ * > If the token does not have an attribute with the name "type", or if it does,
+ * > but that attribute's value is not an ASCII case-insensitive match for the
+ * > string "hidden", then: set the frameset-ok flag to "not ok".
+ */
+ if ( ! is_string( $type_attribute ) || 'hidden' !== strtolower( $type_attribute ) ) {
+ $this->state->frameset_ok = false;
+ }
return true;
/*
- * Any other end tag
+ * > A start tag whose tag name is "hr"
*/
- case '-SPAN':
- foreach ( $this->state->stack_of_open_elements->walk_up() as $item ) {
- // > If node is an HTML element with the same tag name as the token, then:
- if ( $item->node_name === $tag_name ) {
- $this->generate_implied_end_tags( $tag_name );
+ case '+HR':
+ if ( $this->state->stack_of_open_elements->has_p_in_button_scope() ) {
+ $this->close_a_p_element();
+ }
+ $this->insert_html_element( $this->state->current_token );
+ $this->state->frameset_ok = false;
+ return true;
- // > If node is not the current node, then this is a parse error.
+ /*
+ * > A start tag whose tag name is one of: "param", "source", "track"
+ */
+ case '+PARAM':
+ case '+SOURCE':
+ case '+TRACK':
+ $this->insert_html_element( $this->state->current_token );
+ return true;
+ }
- $this->state->stack_of_open_elements->pop_until( $tag_name );
- return true;
- }
+ /*
+ * These tags require special handling in the 'in body' insertion mode
+ * but that handling hasn't yet been implemented.
+ *
+ * As the rules for each tag are implemented, the corresponding tag
+ * name should be removed from this list. An accompanying test should
+ * help ensure this list is maintained.
+ *
+ * @see Tests_HtmlApi_WpHtmlProcessor::test_step_in_body_fails_on_unsupported_tags
+ *
+ * Since this switch structure throws a WP_HTML_Unsupported_Exception, it's
+ * possible to handle "any other start tag" and "any other end tag" below,
+ * as that guarantees execution doesn't proceed for the unimplemented tags.
+ *
+ * @see https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inbody
+ */
+ switch ( $token_name ) {
+ case 'APPLET':
+ case 'BASE':
+ case 'BASEFONT':
+ case 'BGSOUND':
+ case 'BODY':
+ case 'CAPTION':
+ case 'COL':
+ case 'COLGROUP':
+ case 'FORM':
+ case 'FRAME':
+ case 'FRAMESET':
+ case 'HEAD':
+ case 'HTML':
+ case 'IFRAME':
+ case 'LINK':
+ case 'MARQUEE':
+ case 'MATH':
+ case 'META':
+ case 'NOBR':
+ case 'NOEMBED':
+ case 'NOFRAMES':
+ case 'NOSCRIPT':
+ case 'OBJECT':
+ case 'OPTGROUP':
+ case 'OPTION':
+ case 'PLAINTEXT':
+ case 'RB':
+ case 'RP':
+ case 'RT':
+ case 'RTC':
+ case 'SARCASM':
+ case 'SCRIPT':
+ case 'SELECT':
+ case 'STYLE':
+ case 'SVG':
+ case 'TABLE':
+ case 'TBODY':
+ case 'TD':
+ case 'TEMPLATE':
+ case 'TEXTAREA':
+ case 'TFOOT':
+ case 'TH':
+ case 'THEAD':
+ case 'TITLE':
+ case 'TR':
+ case 'XMP':
+ $this->last_error = self::ERROR_UNSUPPORTED;
+ throw new WP_HTML_Unsupported_Exception( "Cannot process {$token_name} element." );
+ }
- // > Otherwise, if node is in the special category, then this is a parse error; ignore the token, and return.
- if ( self::is_special( $item->node_name ) ) {
- return $this->step();
- }
+ if ( ! $this->is_tag_closer() ) {
+ /*
+ * > Any other start tag
+ */
+ $this->reconstruct_active_formatting_elements();
+ $this->insert_html_element( $this->state->current_token );
+ return true;
+ } else {
+ /*
+ * > Any other end tag
+ */
+
+ /*
+ * Find the corresponding tag opener in the stack of open elements, if
+ * it exists before reaching a special element, which provides a kind
+ * of boundary in the stack. For example, a `</custom-tag>` should not
+ * close anything beyond its containing `P` or `DIV` element.
+ */
+ foreach ( $this->state->stack_of_open_elements->walk_up() as $node ) {
+ if ( $token_name === $node->node_name ) {
+ break;
}
- // Execution should not reach here; if it does then something went wrong.
- return false;
- default:
- $this->last_error = self::ERROR_UNSUPPORTED;
- throw new WP_HTML_Unsupported_Exception( "Cannot process {$tag_name} element." );
+ if ( self::is_special( $node->node_name ) ) {
+ // This is a parse error, ignore the token.
+ return $this->step();
+ }
+ }
+
+ $this->generate_implied_end_tags( $token_name );
+ if ( $node !== $this->state->stack_of_open_elements->current_node() ) {
+ // @todo Record parse error: this error doesn't impact parsing.
+ }
+
+ foreach ( $this->state->stack_of_open_elements->walk_up() as $item ) {
+ $this->state->stack_of_open_elements->pop();
+ if ( $node === $item ) {
+ return true;
+ }
+ }
}
}
@@ -779,19 +1200,16 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
*/
/**
- * Creates a new bookmark for the currently-matched tag and returns the generated name.
+ * Creates a new bookmark for the currently-matched token and returns the generated name.
*
* @since 6.4.0
+ * @since 6.5.0 Renamed from bookmark_tag() to bookmark_token().
*
* @throws Exception When unable to allocate requested bookmark.
*
* @return string|false Name of created bookmark, or false if unable to create.
*/
- private function bookmark_tag() {
- if ( ! $this->get_tag() ) {
- return false;
- }
-
+ private function bookmark_token() {
if ( ! parent::set_bookmark( ++$this->bookmark_counter ) ) {
$this->last_error = self::ERROR_EXCEEDED_MAX_BOOKMARKS;
throw new Exception( 'could not allocate bookmark' );
@@ -863,6 +1281,10 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
/**
* Moves the internal cursor in the HTML Processor to a given bookmark's location.
*
+ * Be careful! Seeking backwards to a previous location resets the parser to the
+ * start of the document and reparses the entire contents up until it finds the
+ * sought-after bookmarked location.
+ *
* In order to prevent accidental infinite loops, there's a
* maximum limit on the number of times seek() can be called.
*
@@ -874,6 +1296,9 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
* @return bool Whether the internal cursor was successfully moved to the bookmark's location.
*/
public function seek( $bookmark_name ) {
+ // Flush any pending updates to the document before beginning.
+ $this->get_updated_html();
+
$actual_bookmark_name = "_{$bookmark_name}";
$processor_started_at = $this->state->current_token
? $this->bookmarks[ $this->state->current_token->bookmark_name ]->start
@@ -881,44 +1306,73 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
$bookmark_starts_at = $this->bookmarks[ $actual_bookmark_name ]->start;
$direction = $bookmark_starts_at > $processor_started_at ? 'forward' : 'backward';
- switch ( $direction ) {
- case 'forward':
- // When moving forwards, re-parse the document until reaching the same location as the original bookmark.
- while ( $this->step() ) {
- if ( $bookmark_starts_at === $this->bookmarks[ $this->state->current_token->bookmark_name ]->start ) {
- return true;
- }
+ /*
+ * If seeking backwards, it's possible that the sought-after bookmark exists within an element
+ * which has been closed before the current cursor; in other words, it has already been removed
+ * from the stack of open elements. This means that it's insufficient to simply pop off elements
+ * from the stack of open elements which appear after the bookmarked location and then jump to
+ * that location, as the elements which were open before won't be re-opened.
+ *
+ * In order to maintain consistency, the HTML Processor rewinds to the start of the document
+ * and reparses everything until it finds the sought-after bookmark.
+ *
+ * There are potentially better ways to do this: cache the parser state for each bookmark and
+ * restore it when seeking; store an immutable and idempotent register of where elements open
+ * and close.
+ *
+ * If caching the parser state it will be essential to properly maintain the cached stack of
+ * open elements and active formatting elements when modifying the document. This could be a
+ * tedious and time-consuming process as well, and so for now will not be performed.
+ *
+ * It may be possible to track bookmarks for where elements open and close, and in doing so
+ * be able to quickly recalculate breadcrumbs for any element in the document. It may even
+ * be possible to remove the stack of open elements and compute it on the fly this way.
+ * If doing this, the parser would need to track the opening and closing locations for all
+ * tokens in the breadcrumb path for any and all bookmarks. By utilizing bookmarks themselves
+ * this list could be automatically maintained while modifying the document. Finding the
+ * breadcrumbs would then amount to traversing that list from the start until the token
+ * being inspected. Once an element closes, if there are no bookmarks pointing to locations
+ * within that element, then all of these locations may be forgotten to save on memory use
+ * and computation time.
+ */
+ if ( 'backward' === $direction ) {
+ /*
+ * Instead of clearing the parser state and starting fresh, calling the stack methods
+ * maintains the proper flags in the parser.
+ */
+ foreach ( $this->state->stack_of_open_elements->walk_up() as $item ) {
+ if ( 'context-node' === $item->bookmark_name ) {
+ break;
}
- return false;
-
- case 'backward':
- /*
- * When moving backwards, clear out all existing stack entries which appear after the destination
- * bookmark. These could be stored for later retrieval, but doing so would require additional
- * memory overhead and also demand that references and bookmarks are updated as the document
- * changes. In time this could be a valuable optimization, but it's okay to give up that
- * optimization in exchange for more CPU time to recompute the stack, to re-parse the
- * document that may have already been parsed once.
- */
- foreach ( $this->state->stack_of_open_elements->walk_up() as $item ) {
- if ( $bookmark_starts_at >= $this->bookmarks[ $item->bookmark_name ]->start ) {
- break;
- }
+ $this->state->stack_of_open_elements->remove_node( $item );
+ }
- $this->state->stack_of_open_elements->remove_node( $item );
+ foreach ( $this->state->active_formatting_elements->walk_up() as $item ) {
+ if ( 'context-node' === $item->bookmark_name ) {
+ break;
}
- foreach ( $this->state->active_formatting_elements->walk_up() as $item ) {
- if ( $bookmark_starts_at >= $this->bookmarks[ $item->bookmark_name ]->start ) {
- break;
- }
+ $this->state->active_formatting_elements->remove_node( $item );
+ }
- $this->state->active_formatting_elements->remove_node( $item );
- }
+ parent::seek( 'context-node' );
+ $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_BODY;
+ $this->state->frameset_ok = true;
+ }
- return parent::seek( $actual_bookmark_name );
+ // Moving forwards (including after the rewind above), reparse the document until reaching the same location as the original bookmark.
+ if ( $bookmark_starts_at === $this->bookmarks[ $this->state->current_token->bookmark_name ]->start ) {
+ return true;
}
+
+ while ( $this->step() ) {
+ if ( $bookmark_starts_at === $this->bookmarks[ $this->state->current_token->bookmark_name ]->start ) {
+ return true;
+ }
+ }
+
+ return false;
}
/**
@@ -1005,6 +1459,18 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
return parent::set_bookmark( "_{$bookmark_name}" );
}
+ /**
+ * Checks whether a bookmark with the given name exists, querying the parent class under the "_"-prefixed name that set_bookmark() also uses.
+ *
+ * @since 6.5.0
+ *
+ * @param string $bookmark_name Name to identify a bookmark that potentially exists, given without the internal "_" prefix.
+ * @return bool Whether that bookmark exists.
+ */
+ public function has_bookmark( $bookmark_name ) {
+ return parent::has_bookmark( "_{$bookmark_name}" );
+ }
+
/*
* HTML Parsing Algorithms
*/
@@ -1034,6 +1500,9 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
*/
private function generate_implied_end_tags( $except_for_this_element = null ) {
$elements_with_implied_end_tags = array(
+ 'DD',
+ 'DT',
+ 'LI',
'P',
);
@@ -1059,6 +1528,9 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
*/
private function generate_implied_end_tags_thoroughly() {
$elements_with_implied_end_tags = array(
+ 'DD',
+ 'DT',
+ 'LI',
'P',
);
@@ -1170,7 +1642,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
// > If formatting element is not in the stack of open elements, then this is a parse error; remove the element from the list, and return.
if ( ! $this->state->stack_of_open_elements->contains_node( $formatting_element ) ) {
- $this->state->active_formatting_elements->remove_node( $formatting_element->bookmark_name );
+ $this->state->active_formatting_elements->remove_node( $formatting_element );
return;
}
@@ -1373,14 +1845,19 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
return (
'AREA' === $tag_name ||
'BASE' === $tag_name ||
+ 'BASEFONT' === $tag_name || // Obsolete but still treated as void.
+ 'BGSOUND' === $tag_name || // Obsolete but still treated as void.
'BR' === $tag_name ||
'COL' === $tag_name ||
'EMBED' === $tag_name ||
+ 'FRAME' === $tag_name ||
'HR' === $tag_name ||
'IMG' === $tag_name ||
'INPUT' === $tag_name ||
+ 'KEYGEN' === $tag_name || // Obsolete but still treated as void.
'LINK' === $tag_name ||
'META' === $tag_name ||
+ 'PARAM' === $tag_name || // Obsolete but still treated as void.
'SOURCE' === $tag_name ||
'TRACK' === $tag_name ||
'WBR' === $tag_name
@@ -1410,6 +1887,15 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
const REPROCESS_CURRENT_NODE = 'reprocess-current-node';
/**
+ * Indicates that the current HTML token should be processed without advancing the parser. Unlike REPROCESS_CURRENT_NODE, a void or atomic node left atop the stack of open elements is still popped first.
+ *
+ * @since 6.5.0
+ *
+ * @var string
+ */
+ const PROCESS_CURRENT_NODE = 'process-current-node';
+
+ /**
* Indicates that the parser encountered unsupported markup and has bailed.
*
* @since 6.4.0