diff options
Diffstat (limited to 'wp-includes/html-api/class-wp-html-processor.php')
-rw-r--r-- | wp-includes/html-api/class-wp-html-processor.php | 712 |
1 files changed, 599 insertions, 113 deletions
diff --git a/wp-includes/html-api/class-wp-html-processor.php b/wp-includes/html-api/class-wp-html-processor.php index f27f83b..c76cc19 100644 --- a/wp-includes/html-api/class-wp-html-processor.php +++ b/wp-includes/html-api/class-wp-html-processor.php @@ -99,12 +99,20 @@ * * The following list specifies the HTML tags that _are_ supported: * + * - Containers: ADDRESS, BLOCKQUOTE, DETAILS, DIALOG, DIV, FOOTER, HEADER, MAIN, MENU, SPAN, SUMMARY. + * - Custom elements: All custom elements are supported. :) + * - Form elements: BUTTON, DATALIST, FIELDSET, INPUT, LABEL, LEGEND, METER, PROGRESS, SEARCH. + * - Formatting elements: B, BIG, CODE, EM, FONT, I, PRE, SMALL, STRIKE, STRONG, TT, U, WBR. + * - Heading elements: H1, H2, H3, H4, H5, H6, HGROUP. * - Links: A. - * - The formatting elements: B, BIG, CODE, EM, FONT, I, SMALL, STRIKE, STRONG, TT, U. - * - Containers: DIV, FIGCAPTION, FIGURE, SPAN. - * - Form elements: BUTTON. - * - Paragraph: P. - * - Void elements: IMG. + * - Lists: DD, DL, DT, LI, OL, UL. + * - Media elements: AUDIO, CANVAS, EMBED, FIGCAPTION, FIGURE, IMG, MAP, PICTURE, SOURCE, TRACK, VIDEO. + * - Paragraph: BR, P. + * - Phrasing elements: ABBR, AREA, BDI, BDO, CITE, DATA, DEL, DFN, INS, MARK, OUTPUT, Q, SAMP, SUB, SUP, TIME, VAR. + * - Sectioning elements: ARTICLE, ASIDE, HR, NAV, SECTION. + * - Templating elements: SLOT. + * - Text decoration: RUBY. + * - Deprecated elements: ACRONYM, BLINK, CENTER, DIR, ISINDEX, KEYGEN, LISTING, MULTICOL, NEXTID, PARAM, SPACER. * * ### Supported markup * @@ -142,17 +150,6 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { const MAX_BOOKMARKS = 100; /** - * Static query for instructing the Tag Processor to visit every token. - * - * @access private - * - * @since 6.4.0 - * - * @var array - */ - const VISIT_EVERYTHING = array( 'tag_closers' => 'visit' ); - - /** * Holds the working state of the parser, including the stack of * open elements and the stack of active formatting elements. * @@ -244,15 +241,15 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { return null; } - $p = new self( $html, self::CONSTRUCTOR_UNLOCK_CODE ); - $p->state->context_node = array( 'BODY', array() ); - $p->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_BODY; + $processor = new self( $html, self::CONSTRUCTOR_UNLOCK_CODE ); + $processor->state->context_node = array( 'BODY', array() ); + $processor->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_BODY; - // @TODO: Create "fake" bookmarks for non-existent but implied nodes. - $p->bookmarks['root-node'] = new WP_HTML_Span( 0, 0 ); - $p->bookmarks['context-node'] = new WP_HTML_Span( 0, 0 ); + // @todo Create "fake" bookmarks for non-existent but implied nodes. + $processor->bookmarks['root-node'] = new WP_HTML_Span( 0, 0 ); + $processor->bookmarks['context-node'] = new WP_HTML_Span( 0, 0 ); - $p->state->stack_of_open_elements->push( + $processor->state->stack_of_open_elements->push( new WP_HTML_Token( 'root-node', 'HTML', @@ -260,15 +257,15 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { ) ); - $p->state->stack_of_open_elements->push( + $processor->state->stack_of_open_elements->push( new WP_HTML_Token( 'context-node', - $p->state->context_node[0], + $processor->state->context_node[0], false ) ); - return $p; + return $processor; } /** @@ -342,7 +339,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { /** * Finds the next tag matching the $query. * - * @TODO: Support matching the class name and tag name. + * @todo Support matching the class name and tag name. * * @since 6.4.0 * @@ -364,6 +361,10 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { public function next_tag( $query = null ) { if ( null === $query ) { while ( $this->step() ) { + if ( '#tag' !== $this->get_token_type() ) { + continue; + } + if ( ! $this->is_tag_closer() ) { return true; } @@ -387,6 +388,10 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { if ( ! ( array_key_exists( 'breadcrumbs', $query ) && is_array( $query['breadcrumbs'] ) ) ) { while ( $this->step() ) { + if ( '#tag' !== $this->get_token_type() ) { + continue; + } + if ( ! $this->is_tag_closer() ) { return true; } @@ -408,6 +413,10 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { $match_offset = isset( $query['match_offset'] ) ? (int) $query['match_offset'] : 1; while ( $match_offset > 0 && $this->step() ) { + if ( '#tag' !== $this->get_token_type() ) { + continue; + } + if ( $this->matches_breadcrumbs( $breadcrumbs ) && 0 === --$match_offset ) { return true; } @@ -417,6 +426,24 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { } /** + * Ensures internal accounting is maintained for HTML semantic rules while + * the underlying Tag Processor class is seeking to a bookmark. + * + * This doesn't currently have a way to represent non-tags and doesn't process + * semantic rules for text nodes. For access to the raw tokens consider using + * WP_HTML_Tag_Processor instead. + * + * @since 6.5.0 Added for internal support; do not use. + * + * @access private + * + * @return bool + */ + public function next_token() { + return $this->step(); + } + + /** * Indicates if the currently-matched tag matches the given breadcrumbs. * * A "*" represents a single tag wildcard, where any tag matches, but not no tags. @@ -442,10 +469,6 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { * @return bool Whether the currently-matched tag is found at the given nested structure. */ public function matches_breadcrumbs( $breadcrumbs ) { - if ( ! $this->get_tag() ) { - return false; - } - // Everything matches when there are zero constraints. if ( 0 === count( $breadcrumbs ) ) { return true; @@ -492,7 +515,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { return false; } - if ( self::PROCESS_NEXT_NODE === $node_to_process ) { + if ( self::REPROCESS_CURRENT_NODE !== $node_to_process ) { /* * Void elements still hop onto the stack of open elements even though * there's no corresponding closing tag. This is important for managing @@ -502,28 +525,42 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { * When moving on to the next node, therefore, if the bottom-most element * on the stack is a void element, it must be closed. * - * @TODO: Once self-closing foreign elements and BGSOUND are supported, + * @todo Once self-closing foreign elements and BGSOUND are supported, * they must also be implicitly closed here too. BGSOUND is * special since it's only self-closing if the self-closing flag * is provided in the opening tag, otherwise it expects a tag closer. */ $top_node = $this->state->stack_of_open_elements->current_node(); - if ( $top_node && self::is_void( $top_node->node_name ) ) { + if ( + $top_node && ( + // Void elements. + self::is_void( $top_node->node_name ) || + // Comments, text nodes, and other atomic tokens. + '#' === $top_node->node_name[0] || + // Doctype declarations. + 'html' === $top_node->node_name + ) + ) { $this->state->stack_of_open_elements->pop(); } + } - parent::next_tag( self::VISIT_EVERYTHING ); + if ( self::PROCESS_NEXT_NODE === $node_to_process ) { + parent::next_token(); } // Finish stepping when there are no more tokens in the document. - if ( null === $this->get_tag() ) { + if ( + WP_HTML_Tag_Processor::STATE_INCOMPLETE_INPUT === $this->parser_state || + WP_HTML_Tag_Processor::STATE_COMPLETE === $this->parser_state + ) { return false; } $this->state->current_token = new WP_HTML_Token( - $this->bookmark_tag(), - $this->get_tag(), - $this->is_tag_closer(), + $this->bookmark_token(), + $this->get_token_name(), + $this->has_self_closing_flag(), $this->release_internal_bookmark_on_destruct ); @@ -551,9 +588,9 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { * Breadcrumbs start at the outermost parent and descend toward the matched element. * They always include the entire path from the root HTML node to the matched element. * - * @TODO: It could be more efficient to expose a generator-based version of this function - * to avoid creating the array copy on tag iteration. If this is done, it would likely - * be more useful to walk up the stack when yielding instead of starting at the top. + * @todo It could be more efficient to expose a generator-based version of this function + * to avoid creating the array copy on tag iteration. If this is done, it would likely + * be more useful to walk up the stack when yielding instead of starting at the top. * * Example * @@ -566,10 +603,6 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { * @return string[]|null Array of tag names representing path to matched node, if matched, otherwise NULL. */ public function get_breadcrumbs() { - if ( ! $this->get_tag() ) { - return null; - } - $breadcrumbs = array(); foreach ( $this->state->stack_of_open_elements->walk_down() as $stack_item ) { $breadcrumbs[] = $stack_item->node_name; @@ -594,17 +627,67 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { * @return bool Whether an element was found. */ private function step_in_body() { - $tag_name = $this->get_tag(); - $op_sigil = $this->is_tag_closer() ? '-' : '+'; - $op = "{$op_sigil}{$tag_name}"; + $token_name = $this->get_token_name(); + $token_type = $this->get_token_type(); + $op_sigil = '#tag' === $token_type ? ( $this->is_tag_closer() ? '-' : '+' ) : ''; + $op = "{$op_sigil}{$token_name}"; switch ( $op ) { + case '#comment': + case '#funky-comment': + case '#presumptuous-tag': + $this->insert_html_element( $this->state->current_token ); + return true; + + case '#text': + $this->reconstruct_active_formatting_elements(); + + $current_token = $this->bookmarks[ $this->state->current_token->bookmark_name ]; + + /* + * > A character token that is U+0000 NULL + * + * Any successive sequence of NULL bytes is ignored and won't + * trigger active format reconstruction. Therefore, if the text + * only comprises NULL bytes then the token should be ignored + * here, but if there are any other characters in the stream + * the active formats should be reconstructed. + */ + if ( + 1 <= $current_token->length && + "\x00" === $this->html[ $current_token->start ] && + strspn( $this->html, "\x00", $current_token->start, $current_token->length ) === $current_token->length + ) { + // Parse error: ignore the token. + return $this->step(); + } + + /* + * Whitespace-only text does not affect the frameset-ok flag. + * It is probably inter-element whitespace, but it may also + * contain character references which decode only to whitespace. + */ + $text = $this->get_modifiable_text(); + if ( strlen( $text ) !== strspn( $text, " \t\n\f\r" ) ) { + $this->state->frameset_ok = false; + } + + $this->insert_html_element( $this->state->current_token ); + return true; + + case 'html': + /* + * > A DOCTYPE token + * > Parse error. Ignore the token. + */ + return $this->step(); + /* * > A start tag whose tag name is "button" */ case '+BUTTON': if ( $this->state->stack_of_open_elements->has_element_in_scope( 'BUTTON' ) ) { - // @TODO: Indicate a parse error once it's possible. This error does not impact the logic here. + // @todo Indicate a parse error once it's possible. This error does not impact the logic here. $this->generate_implied_end_tags(); $this->state->stack_of_open_elements->pop_until( 'BUTTON' ); } @@ -621,11 +704,31 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { * > "fieldset", "figcaption", "figure", "footer", "header", "hgroup", * > "main", "menu", "nav", "ol", "p", "search", "section", "summary", "ul" */ + case '+ADDRESS': + case '+ARTICLE': + case '+ASIDE': case '+BLOCKQUOTE': + case '+CENTER': + case '+DETAILS': + case '+DIALOG': + case '+DIR': case '+DIV': + case '+DL': + case '+FIELDSET': case '+FIGCAPTION': case '+FIGURE': + case '+FOOTER': + case '+HEADER': + case '+HGROUP': + case '+MAIN': + case '+MENU': + case '+NAV': + case '+OL': case '+P': + case '+SEARCH': + case '+SECTION': + case '+SUMMARY': + case '+UL': if ( $this->state->stack_of_open_elements->has_p_in_button_scope() ) { $this->close_a_p_element(); } @@ -639,22 +742,213 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { * > "figcaption", "figure", "footer", "header", "hgroup", "listing", "main", * > "menu", "nav", "ol", "pre", "search", "section", "summary", "ul" */ + case '-ADDRESS': + case '-ARTICLE': + case '-ASIDE': case '-BLOCKQUOTE': case '-BUTTON': + case '-CENTER': + case '-DETAILS': + case '-DIALOG': + case '-DIR': case '-DIV': + case '-DL': + case '-FIELDSET': case '-FIGCAPTION': case '-FIGURE': - if ( ! $this->state->stack_of_open_elements->has_element_in_scope( $tag_name ) ) { - // @TODO: Report parse error. + case '-FOOTER': + case '-HEADER': + case '-HGROUP': + case '-LISTING': + case '-MAIN': + case '-MENU': + case '-NAV': + case '-OL': + case '-PRE': + case '-SEARCH': + case '-SECTION': + case '-SUMMARY': + case '-UL': + if ( ! $this->state->stack_of_open_elements->has_element_in_scope( $token_name ) ) { + // @todo Report parse error. // Ignore the token. return $this->step(); } $this->generate_implied_end_tags(); - if ( $this->state->stack_of_open_elements->current_node()->node_name !== $tag_name ) { - // @TODO: Record parse error: this error doesn't impact parsing. + if ( $this->state->stack_of_open_elements->current_node()->node_name !== $token_name ) { + // @todo Record parse error: this error doesn't impact parsing. + } + $this->state->stack_of_open_elements->pop_until( $token_name ); + return true; + + /* + * > A start tag whose tag name is one of: "h1", "h2", "h3", "h4", "h5", "h6" + */ + case '+H1': + case '+H2': + case '+H3': + case '+H4': + case '+H5': + case '+H6': + if ( $this->state->stack_of_open_elements->has_p_in_button_scope() ) { + $this->close_a_p_element(); + } + + if ( + in_array( + $this->state->stack_of_open_elements->current_node()->node_name, + array( 'H1', 'H2', 'H3', 'H4', 'H5', 'H6' ), + true + ) + ) { + // @todo Indicate a parse error once it's possible. + $this->state->stack_of_open_elements->pop(); + } + + $this->insert_html_element( $this->state->current_token ); + return true; + + /* + * > A start tag whose tag name is one of: "pre", "listing" + */ + case '+PRE': + case '+LISTING': + if ( $this->state->stack_of_open_elements->has_p_in_button_scope() ) { + $this->close_a_p_element(); + } + $this->insert_html_element( $this->state->current_token ); + $this->state->frameset_ok = false; + return true; + + /* + * > An end tag whose tag name is one of: "h1", "h2", "h3", "h4", "h5", "h6" + */ + case '-H1': + case '-H2': + case '-H3': + case '-H4': + case '-H5': + case '-H6': + if ( ! $this->state->stack_of_open_elements->has_element_in_scope( '(internal: H1 through H6 - do not use)' ) ) { + /* + * This is a parse error; ignore the token. + * + * @todo Indicate a parse error once it's possible. + */ + return $this->step(); + } + + $this->generate_implied_end_tags(); + + if ( $this->state->stack_of_open_elements->current_node()->node_name !== $token_name ) { + // @todo Record parse error: this error doesn't impact parsing. } - $this->state->stack_of_open_elements->pop_until( $tag_name ); + + $this->state->stack_of_open_elements->pop_until( '(internal: H1 through H6 - do not use)' ); + return true; + + /* + * > A start tag whose tag name is "li" + * > A start tag whose tag name is one of: "dd", "dt" + */ + case '+DD': + case '+DT': + case '+LI': + $this->state->frameset_ok = false; + $node = $this->state->stack_of_open_elements->current_node(); + $is_li = 'LI' === $token_name; + + in_body_list_loop: + /* + * The logic for LI and DT/DD is the same except for one point: LI elements _only_ + * close other LI elements, but a DT or DD element closes _any_ open DT or DD element. + */ + if ( $is_li ? 'LI' === $node->node_name : ( 'DD' === $node->node_name || 'DT' === $node->node_name ) ) { + $node_name = $is_li ? 'LI' : $node->node_name; + $this->generate_implied_end_tags( $node_name ); + if ( $node_name !== $this->state->stack_of_open_elements->current_node()->node_name ) { + // @todo Indicate a parse error once it's possible. This error does not impact the logic here. + } + + $this->state->stack_of_open_elements->pop_until( $node_name ); + goto in_body_list_done; + } + + if ( + 'ADDRESS' !== $node->node_name && + 'DIV' !== $node->node_name && + 'P' !== $node->node_name && + $this->is_special( $node->node_name ) + ) { + /* + * > If node is in the special category, but is not an address, div, + * > or p element, then jump to the step labeled done below. + */ + goto in_body_list_done; + } else { + /* + * > Otherwise, set node to the previous entry in the stack of open elements + * > and return to the step labeled loop. + */ + foreach ( $this->state->stack_of_open_elements->walk_up( $node ) as $item ) { + $node = $item; + break; + } + goto in_body_list_loop; + } + + in_body_list_done: + if ( $this->state->stack_of_open_elements->has_p_in_button_scope() ) { + $this->close_a_p_element(); + } + + $this->insert_html_element( $this->state->current_token ); + return true; + + /* + * > An end tag whose tag name is "li" + * > An end tag whose tag name is one of: "dd", "dt" + */ + case '-DD': + case '-DT': + case '-LI': + if ( + /* + * An end tag whose tag name is "li": + * If the stack of open elements does not have an li element in list item scope, + * then this is a parse error; ignore the token. + */ + ( + 'LI' === $token_name && + ! $this->state->stack_of_open_elements->has_element_in_list_item_scope( 'LI' ) + ) || + /* + * An end tag whose tag name is one of: "dd", "dt": + * If the stack of open elements does not have an element in scope that is an + * HTML element with the same tag name as that of the token, then this is a + * parse error; ignore the token. + */ + ( + 'LI' !== $token_name && + ! $this->state->stack_of_open_elements->has_element_in_scope( $token_name ) + ) + ) { + /* + * This is a parse error, ignore the token. + * + * @todo Indicate a parse error once it's possible. + */ + return $this->step(); + } + + $this->generate_implied_end_tags( $token_name ); + + if ( $token_name !== $this->state->stack_of_open_elements->current_node()->node_name ) { + // @todo Indicate a parse error once it's possible. This error does not impact the logic here. + } + + $this->state->stack_of_open_elements->pop_until( $token_name ); return true; /* @@ -730,47 +1024,174 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { return true; /* + * > An end tag whose tag name is "br" + * > Parse error. Drop the attributes from the token, and act as described in the next + * > entry; i.e. act as if this was a "br" start tag token with no attributes, rather + * > than the end tag token that it actually is. + */ + case '-BR': + $this->last_error = self::ERROR_UNSUPPORTED; + throw new WP_HTML_Unsupported_Exception( 'Closing BR tags require unimplemented special handling.' ); + + /* * > A start tag whose tag name is one of: "area", "br", "embed", "img", "keygen", "wbr" */ + case '+AREA': + case '+BR': + case '+EMBED': case '+IMG': + case '+KEYGEN': + case '+WBR': $this->reconstruct_active_formatting_elements(); $this->insert_html_element( $this->state->current_token ); + $this->state->frameset_ok = false; return true; /* - * > Any other start tag + * > A start tag whose tag name is "input" */ - case '+SPAN': + case '+INPUT': $this->reconstruct_active_formatting_elements(); $this->insert_html_element( $this->state->current_token ); + $type_attribute = $this->get_attribute( 'type' ); + /* + * > If the token does not have an attribute with the name "type", or if it does, + * > but that attribute's value is not an ASCII case-insensitive match for the + * > string "hidden", then: set the frameset-ok flag to "not ok". + */ + if ( ! is_string( $type_attribute ) || 'hidden' !== strtolower( $type_attribute ) ) { + $this->state->frameset_ok = false; + } return true; /* - * Any other end tag + * > A start tag whose tag name is "hr" */ - case '-SPAN': - foreach ( $this->state->stack_of_open_elements->walk_up() as $item ) { - // > If node is an HTML element with the same tag name as the token, then: - if ( $item->node_name === $tag_name ) { - $this->generate_implied_end_tags( $tag_name ); + case '+HR': + if ( $this->state->stack_of_open_elements->has_p_in_button_scope() ) { + $this->close_a_p_element(); + } + $this->insert_html_element( $this->state->current_token ); + $this->state->frameset_ok = false; + return true; - // > If node is not the current node, then this is a parse error. + /* + * > A start tag whose tag name is one of: "param", "source", "track" + */ + case '+PARAM': + case '+SOURCE': + case '+TRACK': + $this->insert_html_element( $this->state->current_token ); + return true; + } - $this->state->stack_of_open_elements->pop_until( $tag_name ); - return true; - } + /* + * These tags require special handling in the 'in body' insertion mode + * but that handling hasn't yet been implemented. + * + * As the rules for each tag are implemented, the corresponding tag + * name should be removed from this list. An accompanying test should + * help ensure this list is maintained. + * + * @see Tests_HtmlApi_WpHtmlProcessor::test_step_in_body_fails_on_unsupported_tags + * + * Since this switch structure throws a WP_HTML_Unsupported_Exception, it's + * possible to handle "any other start tag" and "any other end tag" below, + * as that guarantees execution doesn't proceed for the unimplemented tags. + * + * @see https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inbody + */ + switch ( $token_name ) { + case 'APPLET': + case 'BASE': + case 'BASEFONT': + case 'BGSOUND': + case 'BODY': + case 'CAPTION': + case 'COL': + case 'COLGROUP': + case 'FORM': + case 'FRAME': + case 'FRAMESET': + case 'HEAD': + case 'HTML': + case 'IFRAME': + case 'LINK': + case 'MARQUEE': + case 'MATH': + case 'META': + case 'NOBR': + case 'NOEMBED': + case 'NOFRAMES': + case 'NOSCRIPT': + case 'OBJECT': + case 'OPTGROUP': + case 'OPTION': + case 'PLAINTEXT': + case 'RB': + case 'RP': + case 'RT': + case 'RTC': + case 'SARCASM': + case 'SCRIPT': + case 'SELECT': + case 'STYLE': + case 'SVG': + case 'TABLE': + case 'TBODY': + case 'TD': + case 'TEMPLATE': + case 'TEXTAREA': + case 'TFOOT': + case 'TH': + case 'THEAD': + case 'TITLE': + case 'TR': + case 'XMP': + $this->last_error = self::ERROR_UNSUPPORTED; + throw new WP_HTML_Unsupported_Exception( "Cannot process {$token_name} element." ); + } - // > Otherwise, if node is in the special category, then this is a parse error; ignore the token, and return. - if ( self::is_special( $item->node_name ) ) { - return $this->step(); - } + if ( ! $this->is_tag_closer() ) { + /* + * > Any other start tag + */ + $this->reconstruct_active_formatting_elements(); + $this->insert_html_element( $this->state->current_token ); + return true; + } else { + /* + * > Any other end tag + */ + + /* + * Find the corresponding tag opener in the stack of open elements, if + * it exists before reaching a special element, which provides a kind + * of boundary in the stack. For example, a `</custom-tag>` should not + * close anything beyond its containing `P` or `DIV` element. + */ + foreach ( $this->state->stack_of_open_elements->walk_up() as $node ) { + if ( $token_name === $node->node_name ) { + break; } - // Execution should not reach here; if it does then something went wrong. - return false; - default: - $this->last_error = self::ERROR_UNSUPPORTED; - throw new WP_HTML_Unsupported_Exception( "Cannot process {$tag_name} element." ); + if ( self::is_special( $node->node_name ) ) { + // This is a parse error, ignore the token. + return $this->step(); + } + } + + $this->generate_implied_end_tags( $token_name ); + if ( $node !== $this->state->stack_of_open_elements->current_node() ) { + // @todo Record parse error: this error doesn't impact parsing. + } + + foreach ( $this->state->stack_of_open_elements->walk_up() as $item ) { + $this->state->stack_of_open_elements->pop(); + if ( $node === $item ) { + return true; + } + } } } @@ -779,19 +1200,16 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { */ /** - * Creates a new bookmark for the currently-matched tag and returns the generated name. + * Creates a new bookmark for the currently-matched token and returns the generated name. * * @since 6.4.0 + * @since 6.5.0 Renamed from bookmark_tag() to bookmark_token(). * * @throws Exception When unable to allocate requested bookmark. * * @return string|false Name of created bookmark, or false if unable to create. */ - private function bookmark_tag() { - if ( ! $this->get_tag() ) { - return false; - } - + private function bookmark_token() { if ( ! parent::set_bookmark( ++$this->bookmark_counter ) ) { $this->last_error = self::ERROR_EXCEEDED_MAX_BOOKMARKS; throw new Exception( 'could not allocate bookmark' ); @@ -863,6 +1281,10 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { /** * Moves the internal cursor in the HTML Processor to a given bookmark's location. * + * Be careful! Seeking backwards to a previous location resets the parser to the + * start of the document and reparses the entire contents up until it finds the + * sought-after bookmarked location. + * * In order to prevent accidental infinite loops, there's a * maximum limit on the number of times seek() can be called. * @@ -874,6 +1296,9 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { * @return bool Whether the internal cursor was successfully moved to the bookmark's location. */ public function seek( $bookmark_name ) { + // Flush any pending updates to the document before beginning. + $this->get_updated_html(); + $actual_bookmark_name = "_{$bookmark_name}"; $processor_started_at = $this->state->current_token ? $this->bookmarks[ $this->state->current_token->bookmark_name ]->start @@ -881,44 +1306,73 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { $bookmark_starts_at = $this->bookmarks[ $actual_bookmark_name ]->start; $direction = $bookmark_starts_at > $processor_started_at ? 'forward' : 'backward'; - switch ( $direction ) { - case 'forward': - // When moving forwards, re-parse the document until reaching the same location as the original bookmark. - while ( $this->step() ) { - if ( $bookmark_starts_at === $this->bookmarks[ $this->state->current_token->bookmark_name ]->start ) { - return true; - } + /* + * If seeking backwards, it's possible that the sought-after bookmark exists within an element + * which has been closed before the current cursor; in other words, it has already been removed + * from the stack of open elements. This means that it's insufficient to simply pop off elements + * from the stack of open elements which appear after the bookmarked location and then jump to + * that location, as the elements which were open before won't be re-opened. + * + * In order to maintain consistency, the HTML Processor rewinds to the start of the document + * and reparses everything until it finds the sought-after bookmark. + * + * There are potentially better ways to do this: cache the parser state for each bookmark and + * restore it when seeking; store an immutable and idempotent register of where elements open + * and close. + * + * If caching the parser state it will be essential to properly maintain the cached stack of + * open elements and active formatting elements when modifying the document. This could be a + * tedious and time-consuming process as well, and so for now will not be performed. + * + * It may be possible to track bookmarks for where elements open and close, and in doing so + * be able to quickly recalculate breadcrumbs for any element in the document. It may even + * be possible to remove the stack of open elements and compute it on the fly this way. + * If doing this, the parser would need to track the opening and closing locations for all + * tokens in the breadcrumb path for any and all bookmarks. By utilizing bookmarks themselves + * this list could be automatically maintained while modifying the document. Finding the + * breadcrumbs would then amount to traversing that list from the start until the token + * being inspected. Once an element closes, if there are no bookmarks pointing to locations + * within that element, then all of these locations may be forgotten to save on memory use + * and computation time. + */ + if ( 'backward' === $direction ) { + /* + * Instead of clearing the parser state and starting fresh, calling the stack methods + * maintains the proper flags in the parser. + */ + foreach ( $this->state->stack_of_open_elements->walk_up() as $item ) { + if ( 'context-node' === $item->bookmark_name ) { + break; } - return false; - - case 'backward': - /* - * When moving backwards, clear out all existing stack entries which appear after the destination - * bookmark. These could be stored for later retrieval, but doing so would require additional - * memory overhead and also demand that references and bookmarks are updated as the document - * changes. In time this could be a valuable optimization, but it's okay to give up that - * optimization in exchange for more CPU time to recompute the stack, to re-parse the - * document that may have already been parsed once. - */ - foreach ( $this->state->stack_of_open_elements->walk_up() as $item ) { - if ( $bookmark_starts_at >= $this->bookmarks[ $item->bookmark_name ]->start ) { - break; - } + $this->state->stack_of_open_elements->remove_node( $item ); + } - $this->state->stack_of_open_elements->remove_node( $item ); + foreach ( $this->state->active_formatting_elements->walk_up() as $item ) { + if ( 'context-node' === $item->bookmark_name ) { + break; } - foreach ( $this->state->active_formatting_elements->walk_up() as $item ) { - if ( $bookmark_starts_at >= $this->bookmarks[ $item->bookmark_name ]->start ) { - break; - } + $this->state->active_formatting_elements->remove_node( $item ); + } - $this->state->active_formatting_elements->remove_node( $item ); - } + parent::seek( 'context-node' ); + $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_BODY; + $this->state->frameset_ok = true; + } - return parent::seek( $actual_bookmark_name ); + // When moving forwards, reparse the document until reaching the same location as the original bookmark. + if ( $bookmark_starts_at === $this->bookmarks[ $this->state->current_token->bookmark_name ]->start ) { + return true; } + + while ( $this->step() ) { + if ( $bookmark_starts_at === $this->bookmarks[ $this->state->current_token->bookmark_name ]->start ) { + return true; + } + } + + return false; } /** @@ -1005,6 +1459,18 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { return parent::set_bookmark( "_{$bookmark_name}" ); } + /** + * Checks whether a bookmark with the given name exists. + * + * @since 6.5.0 + * + * @param string $bookmark_name Name to identify a bookmark that potentially exists. + * @return bool Whether that bookmark exists. + */ + public function has_bookmark( $bookmark_name ) { + return parent::has_bookmark( "_{$bookmark_name}" ); + } + /* * HTML Parsing Algorithms */ @@ -1034,6 +1500,9 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { */ private function generate_implied_end_tags( $except_for_this_element = null ) { $elements_with_implied_end_tags = array( + 'DD', + 'DT', + 'LI', 'P', ); @@ -1059,6 +1528,9 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { */ private function generate_implied_end_tags_thoroughly() { $elements_with_implied_end_tags = array( + 'DD', + 'DT', + 'LI', 'P', ); @@ -1170,7 +1642,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { // > If formatting element is not in the stack of open elements, then this is a parse error; remove the element from the list, and return. if ( ! $this->state->stack_of_open_elements->contains_node( $formatting_element ) ) { - $this->state->active_formatting_elements->remove_node( $formatting_element->bookmark_name ); + $this->state->active_formatting_elements->remove_node( $formatting_element ); return; } @@ -1373,14 +1845,19 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { return ( 'AREA' === $tag_name || 'BASE' === $tag_name || + 'BASEFONT' === $tag_name || // Obsolete but still treated as void. + 'BGSOUND' === $tag_name || // Obsolete but still treated as void. 'BR' === $tag_name || 'COL' === $tag_name || 'EMBED' === $tag_name || + 'FRAME' === $tag_name || 'HR' === $tag_name || 'IMG' === $tag_name || 'INPUT' === $tag_name || + 'KEYGEN' === $tag_name || // Obsolete but still treated as void. 'LINK' === $tag_name || 'META' === $tag_name || + 'PARAM' === $tag_name || // Obsolete but still treated as void. 'SOURCE' === $tag_name || 'TRACK' === $tag_name || 'WBR' === $tag_name @@ -1410,6 +1887,15 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { const REPROCESS_CURRENT_NODE = 'reprocess-current-node'; /** + * Indicates that the current HTML token should be processed without advancing the parser. + * + * @since 6.5.0 + * + * @var string + */ + const PROCESS_CURRENT_NODE = 'process-current-node'; + + /** * Indicates that the parser encountered unsupported markup and has bailed. * * @since 6.4.0 |