diff options
Diffstat (limited to 'wp-includes/html-api')
7 files changed, 1992 insertions, 345 deletions
diff --git a/wp-includes/html-api/class-wp-html-active-formatting-elements.php b/wp-includes/html-api/class-wp-html-active-formatting-elements.php index 9598991..9f7fee9 100644 --- a/wp-includes/html-api/class-wp-html-active-formatting-elements.php +++ b/wp-includes/html-api/class-wp-html-active-formatting-elements.php @@ -105,7 +105,7 @@ class WP_HTML_Active_Formatting_Elements { * > paired such that the two attributes in each pair have identical names, namespaces, and values * > (the order of the attributes does not matter). * - * @TODO: Implement the "Noah's Ark clause" to only add up to three of any given kind of formatting elements to the stack. + * @todo Implement the "Noah's Ark clause" to only add up to three of any given kind of formatting elements to the stack. */ // > Add element to the list of active formatting elements. $this->stack[] = $token; diff --git a/wp-includes/html-api/class-wp-html-attribute-token.php b/wp-includes/html-api/class-wp-html-attribute-token.php index f938609..74d4132 100644 --- a/wp-includes/html-api/class-wp-html-attribute-token.php +++ b/wp-includes/html-api/class-wp-html-attribute-token.php @@ -15,6 +15,7 @@ * * @access private * @since 6.2.0 + * @since 6.5.0 Replaced `end` with `length` to more closely match `substr()`. * * @see WP_HTML_Tag_Processor */ @@ -23,6 +24,7 @@ class WP_HTML_Attribute_Token { * Attribute name. * * @since 6.2.0 + * * @var string */ public $name; @@ -31,6 +33,7 @@ class WP_HTML_Attribute_Token { * Attribute value. * * @since 6.2.0 + * * @var int */ public $value_starts_at; @@ -39,6 +42,7 @@ class WP_HTML_Attribute_Token { * How many bytes the value occupies in the input HTML. * * @since 6.2.0 + * * @var int */ public $value_length; @@ -47,22 +51,43 @@ class WP_HTML_Attribute_Token { * The string offset where the attribute name starts. * * @since 6.2.0 + * * @var int */ public $start; /** - * The string offset after the attribute value or its name. 
+ * Byte length of text spanning the attribute inside a tag. + * + * This span starts at the first character of the attribute name + * and it ends after one of three cases: + * + * - at the end of the attribute name for boolean attributes. + * - at the end of the value for unquoted attributes. + * - at the final single or double quote for quoted attributes. + * + * Example: + * + * <div class="post"> + * ------------ length is 12, including quotes + * + * <input type="checked" checked id="selector"> + * ------- length is 7 + * + * <a rel=noopener> + * ------------ length is 12 + * + * @since 6.5.0 Replaced `end` with `length` to more closely match `substr()`. * - * @since 6.2.0 * @var int */ - public $end; + public $length; /** * Whether the attribute is a boolean attribute with value `true`. * * @since 6.2.0 + * * @var bool */ public $is_true; @@ -71,20 +96,21 @@ class WP_HTML_Attribute_Token { * Constructor. * * @since 6.2.0 + * @since 6.5.0 Replaced `end` with `length` to more closely match `substr()`. * * @param string $name Attribute name. * @param int $value_start Attribute value. * @param int $value_length Number of bytes attribute value spans. * @param int $start The string offset where the attribute name starts. - * @param int $end The string offset after the attribute value or its name. + * @param int $length Byte length of the entire attribute name or name and value pair expression. * @param bool $is_true Whether the attribute is a boolean attribute with true value. 
*/ - public function __construct( $name, $value_start, $value_length, $start, $end, $is_true ) { + public function __construct( $name, $value_start, $value_length, $start, $length, $is_true ) { $this->name = $name; $this->value_starts_at = $value_start; $this->value_length = $value_length; $this->start = $start; - $this->end = $end; + $this->length = $length; $this->is_true = $is_true; } } diff --git a/wp-includes/html-api/class-wp-html-open-elements.php b/wp-includes/html-api/class-wp-html-open-elements.php index fe56255..1234abc 100644 --- a/wp-includes/html-api/class-wp-html-open-elements.php +++ b/wp-includes/html-api/class-wp-html-open-elements.php @@ -116,13 +116,20 @@ class WP_HTML_Open_Elements { return true; } + if ( + '(internal: H1 through H6 - do not use)' === $tag_name && + in_array( $node->node_name, array( 'H1', 'H2', 'H3', 'H4', 'H5', 'H6' ), true ) + ) { + return true; + } + switch ( $node->node_name ) { case 'HTML': return false; } if ( in_array( $node->node_name, $termination_list, true ) ) { - return true; + return false; } } @@ -159,18 +166,22 @@ class WP_HTML_Open_Elements { * Returns whether a particular element is in list item scope. * * @since 6.4.0 + * @since 6.5.0 Implemented: no longer throws on every invocation. * * @see https://html.spec.whatwg.org/#has-an-element-in-list-item-scope * - * @throws WP_HTML_Unsupported_Exception Always until this function is implemented. - * * @param string $tag_name Name of tag to check. * @return bool Whether given element is in scope. */ public function has_element_in_list_item_scope( $tag_name ) { - throw new WP_HTML_Unsupported_Exception( 'Cannot process elements depending on list item scope.' ); - - return false; // The linter requires this unreachable code until the function is implemented and can return. + return $this->has_element_in_specific_scope( + $tag_name, + array( + // There are more elements that belong here which aren't currently supported. 
+ 'OL', + 'UL', + ) + ); } /** @@ -270,6 +281,13 @@ class WP_HTML_Open_Elements { foreach ( $this->walk_up() as $item ) { $this->pop(); + if ( + '(internal: H1 through H6 - do not use)' === $tag_name && + in_array( $item->node_name, array( 'H1', 'H2', 'H3', 'H4', 'H5', 'H6' ), true ) + ) { + return true; + } + if ( $tag_name === $item->node_name ) { return true; } @@ -361,10 +379,22 @@ class WP_HTML_Open_Elements { * see WP_HTML_Open_Elements::walk_down(). * * @since 6.4.0 + * @since 6.5.0 Accepts $above_this_node to start traversal above a given node, if it exists. + * + * @param ?WP_HTML_Token $above_this_node Start traversing above this node, if provided and if the node exists. */ - public function walk_up() { + public function walk_up( $above_this_node = null ) { + $has_found_node = null === $above_this_node; + for ( $i = count( $this->stack ) - 1; $i >= 0; $i-- ) { - yield $this->stack[ $i ]; + $node = $this->stack[ $i ]; + + if ( ! $has_found_node ) { + $has_found_node = $node === $above_this_node; + continue; + } + + yield $node; } } diff --git a/wp-includes/html-api/class-wp-html-processor.php b/wp-includes/html-api/class-wp-html-processor.php index f27f83b..c76cc19 100644 --- a/wp-includes/html-api/class-wp-html-processor.php +++ b/wp-includes/html-api/class-wp-html-processor.php @@ -99,12 +99,20 @@ * * The following list specifies the HTML tags that _are_ supported: * + * - Containers: ADDRESS, BLOCKQUOTE, DETAILS, DIALOG, DIV, FOOTER, HEADER, MAIN, MENU, SPAN, SUMMARY. + * - Custom elements: All custom elements are supported. :) + * - Form elements: BUTTON, DATALIST, FIELDSET, INPUT, LABEL, LEGEND, METER, PROGRESS, SEARCH. + * - Formatting elements: B, BIG, CODE, EM, FONT, I, PRE, SMALL, STRIKE, STRONG, TT, U, WBR. + * - Heading elements: H1, H2, H3, H4, H5, H6, HGROUP. * - Links: A. - * - The formatting elements: B, BIG, CODE, EM, FONT, I, SMALL, STRIKE, STRONG, TT, U. - * - Containers: DIV, FIGCAPTION, FIGURE, SPAN. - * - Form elements: BUTTON. 
- * - Paragraph: P. - * - Void elements: IMG. + * - Lists: DD, DL, DT, LI, OL, UL. + * - Media elements: AUDIO, CANVAS, EMBED, FIGCAPTION, FIGURE, IMG, MAP, PICTURE, SOURCE, TRACK, VIDEO. + * - Paragraph: BR, P. + * - Phrasing elements: ABBR, AREA, BDI, BDO, CITE, DATA, DEL, DFN, INS, MARK, OUTPUT, Q, SAMP, SUB, SUP, TIME, VAR. + * - Sectioning elements: ARTICLE, ASIDE, HR, NAV, SECTION. + * - Templating elements: SLOT. + * - Text decoration: RUBY. + * - Deprecated elements: ACRONYM, BLINK, CENTER, DIR, ISINDEX, KEYGEN, LISTING, MULTICOL, NEXTID, PARAM, SPACER. * * ### Supported markup * @@ -142,17 +150,6 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { const MAX_BOOKMARKS = 100; /** - * Static query for instructing the Tag Processor to visit every token. - * - * @access private - * - * @since 6.4.0 - * - * @var array - */ - const VISIT_EVERYTHING = array( 'tag_closers' => 'visit' ); - - /** * Holds the working state of the parser, including the stack of * open elements and the stack of active formatting elements. * @@ -244,15 +241,15 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { return null; } - $p = new self( $html, self::CONSTRUCTOR_UNLOCK_CODE ); - $p->state->context_node = array( 'BODY', array() ); - $p->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_BODY; + $processor = new self( $html, self::CONSTRUCTOR_UNLOCK_CODE ); + $processor->state->context_node = array( 'BODY', array() ); + $processor->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_BODY; - // @TODO: Create "fake" bookmarks for non-existent but implied nodes. - $p->bookmarks['root-node'] = new WP_HTML_Span( 0, 0 ); - $p->bookmarks['context-node'] = new WP_HTML_Span( 0, 0 ); + // @todo Create "fake" bookmarks for non-existent but implied nodes. 
+ $processor->bookmarks['root-node'] = new WP_HTML_Span( 0, 0 ); + $processor->bookmarks['context-node'] = new WP_HTML_Span( 0, 0 ); - $p->state->stack_of_open_elements->push( + $processor->state->stack_of_open_elements->push( new WP_HTML_Token( 'root-node', 'HTML', @@ -260,15 +257,15 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { ) ); - $p->state->stack_of_open_elements->push( + $processor->state->stack_of_open_elements->push( new WP_HTML_Token( 'context-node', - $p->state->context_node[0], + $processor->state->context_node[0], false ) ); - return $p; + return $processor; } /** @@ -342,7 +339,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { /** * Finds the next tag matching the $query. * - * @TODO: Support matching the class name and tag name. + * @todo Support matching the class name and tag name. * * @since 6.4.0 * @@ -364,6 +361,10 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { public function next_tag( $query = null ) { if ( null === $query ) { while ( $this->step() ) { + if ( '#tag' !== $this->get_token_type() ) { + continue; + } + if ( ! $this->is_tag_closer() ) { return true; } @@ -387,6 +388,10 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { if ( ! ( array_key_exists( 'breadcrumbs', $query ) && is_array( $query['breadcrumbs'] ) ) ) { while ( $this->step() ) { + if ( '#tag' !== $this->get_token_type() ) { + continue; + } + if ( ! $this->is_tag_closer() ) { return true; } @@ -408,6 +413,10 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { $match_offset = isset( $query['match_offset'] ) ? 
(int) $query['match_offset'] : 1; while ( $match_offset > 0 && $this->step() ) { + if ( '#tag' !== $this->get_token_type() ) { + continue; + } + if ( $this->matches_breadcrumbs( $breadcrumbs ) && 0 === --$match_offset ) { return true; } @@ -417,6 +426,24 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { } /** + * Ensures internal accounting is maintained for HTML semantic rules while + * the underlying Tag Processor class is seeking to a bookmark. + * + * This doesn't currently have a way to represent non-tags and doesn't process + * semantic rules for text nodes. For access to the raw tokens consider using + * WP_HTML_Tag_Processor instead. + * + * @since 6.5.0 Added for internal support; do not use. + * + * @access private + * + * @return bool + */ + public function next_token() { + return $this->step(); + } + + /** * Indicates if the currently-matched tag matches the given breadcrumbs. * * A "*" represents a single tag wildcard, where any tag matches, but not no tags. @@ -442,10 +469,6 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { * @return bool Whether the currently-matched tag is found at the given nested structure. */ public function matches_breadcrumbs( $breadcrumbs ) { - if ( ! $this->get_tag() ) { - return false; - } - // Everything matches when there are zero constraints. if ( 0 === count( $breadcrumbs ) ) { return true; @@ -492,7 +515,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { return false; } - if ( self::PROCESS_NEXT_NODE === $node_to_process ) { + if ( self::REPROCESS_CURRENT_NODE !== $node_to_process ) { /* * Void elements still hop onto the stack of open elements even though * there's no corresponding closing tag. This is important for managing @@ -502,28 +525,42 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { * When moving on to the next node, therefore, if the bottom-most element * on the stack is a void element, it must be closed. 
* - * @TODO: Once self-closing foreign elements and BGSOUND are supported, + * @todo Once self-closing foreign elements and BGSOUND are supported, * they must also be implicitly closed here too. BGSOUND is * special since it's only self-closing if the self-closing flag * is provided in the opening tag, otherwise it expects a tag closer. */ $top_node = $this->state->stack_of_open_elements->current_node(); - if ( $top_node && self::is_void( $top_node->node_name ) ) { + if ( + $top_node && ( + // Void elements. + self::is_void( $top_node->node_name ) || + // Comments, text nodes, and other atomic tokens. + '#' === $top_node->node_name[0] || + // Doctype declarations. + 'html' === $top_node->node_name + ) + ) { $this->state->stack_of_open_elements->pop(); } + } - parent::next_tag( self::VISIT_EVERYTHING ); + if ( self::PROCESS_NEXT_NODE === $node_to_process ) { + parent::next_token(); } // Finish stepping when there are no more tokens in the document. - if ( null === $this->get_tag() ) { + if ( + WP_HTML_Tag_Processor::STATE_INCOMPLETE_INPUT === $this->parser_state || + WP_HTML_Tag_Processor::STATE_COMPLETE === $this->parser_state + ) { return false; } $this->state->current_token = new WP_HTML_Token( - $this->bookmark_tag(), - $this->get_tag(), - $this->is_tag_closer(), + $this->bookmark_token(), + $this->get_token_name(), + $this->has_self_closing_flag(), $this->release_internal_bookmark_on_destruct ); @@ -551,9 +588,9 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { * Breadcrumbs start at the outermost parent and descend toward the matched element. * They always include the entire path from the root HTML node to the matched element. * - * @TODO: It could be more efficient to expose a generator-based version of this function - * to avoid creating the array copy on tag iteration. If this is done, it would likely - * be more useful to walk up the stack when yielding instead of starting at the top. 
+ * @todo It could be more efficient to expose a generator-based version of this function + * to avoid creating the array copy on tag iteration. If this is done, it would likely + * be more useful to walk up the stack when yielding instead of starting at the top. * * Example * @@ -566,10 +603,6 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { * @return string[]|null Array of tag names representing path to matched node, if matched, otherwise NULL. */ public function get_breadcrumbs() { - if ( ! $this->get_tag() ) { - return null; - } - $breadcrumbs = array(); foreach ( $this->state->stack_of_open_elements->walk_down() as $stack_item ) { $breadcrumbs[] = $stack_item->node_name; @@ -594,17 +627,67 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { * @return bool Whether an element was found. */ private function step_in_body() { - $tag_name = $this->get_tag(); - $op_sigil = $this->is_tag_closer() ? '-' : '+'; - $op = "{$op_sigil}{$tag_name}"; + $token_name = $this->get_token_name(); + $token_type = $this->get_token_type(); + $op_sigil = '#tag' === $token_type ? ( $this->is_tag_closer() ? '-' : '+' ) : ''; + $op = "{$op_sigil}{$token_name}"; switch ( $op ) { + case '#comment': + case '#funky-comment': + case '#presumptuous-tag': + $this->insert_html_element( $this->state->current_token ); + return true; + + case '#text': + $this->reconstruct_active_formatting_elements(); + + $current_token = $this->bookmarks[ $this->state->current_token->bookmark_name ]; + + /* + * > A character token that is U+0000 NULL + * + * Any successive sequence of NULL bytes is ignored and won't + * trigger active format reconstruction. Therefore, if the text + * only comprises NULL bytes then the token should be ignored + * here, but if there are any other characters in the stream + * the active formats should be reconstructed. 
+ */ + if ( + 1 <= $current_token->length && + "\x00" === $this->html[ $current_token->start ] && + strspn( $this->html, "\x00", $current_token->start, $current_token->length ) === $current_token->length + ) { + // Parse error: ignore the token. + return $this->step(); + } + + /* + * Whitespace-only text does not affect the frameset-ok flag. + * It is probably inter-element whitespace, but it may also + * contain character references which decode only to whitespace. + */ + $text = $this->get_modifiable_text(); + if ( strlen( $text ) !== strspn( $text, " \t\n\f\r" ) ) { + $this->state->frameset_ok = false; + } + + $this->insert_html_element( $this->state->current_token ); + return true; + + case 'html': + /* + * > A DOCTYPE token + * > Parse error. Ignore the token. + */ + return $this->step(); + /* * > A start tag whose tag name is "button" */ case '+BUTTON': if ( $this->state->stack_of_open_elements->has_element_in_scope( 'BUTTON' ) ) { - // @TODO: Indicate a parse error once it's possible. This error does not impact the logic here. + // @todo Indicate a parse error once it's possible. This error does not impact the logic here. 
$this->generate_implied_end_tags(); $this->state->stack_of_open_elements->pop_until( 'BUTTON' ); } @@ -621,11 +704,31 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { * > "fieldset", "figcaption", "figure", "footer", "header", "hgroup", * > "main", "menu", "nav", "ol", "p", "search", "section", "summary", "ul" */ + case '+ADDRESS': + case '+ARTICLE': + case '+ASIDE': case '+BLOCKQUOTE': + case '+CENTER': + case '+DETAILS': + case '+DIALOG': + case '+DIR': case '+DIV': + case '+DL': + case '+FIELDSET': case '+FIGCAPTION': case '+FIGURE': + case '+FOOTER': + case '+HEADER': + case '+HGROUP': + case '+MAIN': + case '+MENU': + case '+NAV': + case '+OL': case '+P': + case '+SEARCH': + case '+SECTION': + case '+SUMMARY': + case '+UL': if ( $this->state->stack_of_open_elements->has_p_in_button_scope() ) { $this->close_a_p_element(); } @@ -639,22 +742,213 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { * > "figcaption", "figure", "footer", "header", "hgroup", "listing", "main", * > "menu", "nav", "ol", "pre", "search", "section", "summary", "ul" */ + case '-ADDRESS': + case '-ARTICLE': + case '-ASIDE': case '-BLOCKQUOTE': case '-BUTTON': + case '-CENTER': + case '-DETAILS': + case '-DIALOG': + case '-DIR': case '-DIV': + case '-DL': + case '-FIELDSET': case '-FIGCAPTION': case '-FIGURE': - if ( ! $this->state->stack_of_open_elements->has_element_in_scope( $tag_name ) ) { - // @TODO: Report parse error. + case '-FOOTER': + case '-HEADER': + case '-HGROUP': + case '-LISTING': + case '-MAIN': + case '-MENU': + case '-NAV': + case '-OL': + case '-PRE': + case '-SEARCH': + case '-SECTION': + case '-SUMMARY': + case '-UL': + if ( ! $this->state->stack_of_open_elements->has_element_in_scope( $token_name ) ) { + // @todo Report parse error. // Ignore the token. 
return $this->step(); } $this->generate_implied_end_tags(); - if ( $this->state->stack_of_open_elements->current_node()->node_name !== $tag_name ) { - // @TODO: Record parse error: this error doesn't impact parsing. + if ( $this->state->stack_of_open_elements->current_node()->node_name !== $token_name ) { + // @todo Record parse error: this error doesn't impact parsing. + } + $this->state->stack_of_open_elements->pop_until( $token_name ); + return true; + + /* + * > A start tag whose tag name is one of: "h1", "h2", "h3", "h4", "h5", "h6" + */ + case '+H1': + case '+H2': + case '+H3': + case '+H4': + case '+H5': + case '+H6': + if ( $this->state->stack_of_open_elements->has_p_in_button_scope() ) { + $this->close_a_p_element(); + } + + if ( + in_array( + $this->state->stack_of_open_elements->current_node()->node_name, + array( 'H1', 'H2', 'H3', 'H4', 'H5', 'H6' ), + true + ) + ) { + // @todo Indicate a parse error once it's possible. + $this->state->stack_of_open_elements->pop(); + } + + $this->insert_html_element( $this->state->current_token ); + return true; + + /* + * > A start tag whose tag name is one of: "pre", "listing" + */ + case '+PRE': + case '+LISTING': + if ( $this->state->stack_of_open_elements->has_p_in_button_scope() ) { + $this->close_a_p_element(); + } + $this->insert_html_element( $this->state->current_token ); + $this->state->frameset_ok = false; + return true; + + /* + * > An end tag whose tag name is one of: "h1", "h2", "h3", "h4", "h5", "h6" + */ + case '-H1': + case '-H2': + case '-H3': + case '-H4': + case '-H5': + case '-H6': + if ( ! $this->state->stack_of_open_elements->has_element_in_scope( '(internal: H1 through H6 - do not use)' ) ) { + /* + * This is a parse error; ignore the token. + * + * @todo Indicate a parse error once it's possible. 
+ */ + return $this->step(); + } + + $this->generate_implied_end_tags(); + + if ( $this->state->stack_of_open_elements->current_node()->node_name !== $token_name ) { + // @todo Record parse error: this error doesn't impact parsing. } - $this->state->stack_of_open_elements->pop_until( $tag_name ); + + $this->state->stack_of_open_elements->pop_until( '(internal: H1 through H6 - do not use)' ); + return true; + + /* + * > A start tag whose tag name is "li" + * > A start tag whose tag name is one of: "dd", "dt" + */ + case '+DD': + case '+DT': + case '+LI': + $this->state->frameset_ok = false; + $node = $this->state->stack_of_open_elements->current_node(); + $is_li = 'LI' === $token_name; + + in_body_list_loop: + /* + * The logic for LI and DT/DD is the same except for one point: LI elements _only_ + * close other LI elements, but a DT or DD element closes _any_ open DT or DD element. + */ + if ( $is_li ? 'LI' === $node->node_name : ( 'DD' === $node->node_name || 'DT' === $node->node_name ) ) { + $node_name = $is_li ? 'LI' : $node->node_name; + $this->generate_implied_end_tags( $node_name ); + if ( $node_name !== $this->state->stack_of_open_elements->current_node()->node_name ) { + // @todo Indicate a parse error once it's possible. This error does not impact the logic here. + } + + $this->state->stack_of_open_elements->pop_until( $node_name ); + goto in_body_list_done; + } + + if ( + 'ADDRESS' !== $node->node_name && + 'DIV' !== $node->node_name && + 'P' !== $node->node_name && + $this->is_special( $node->node_name ) + ) { + /* + * > If node is in the special category, but is not an address, div, + * > or p element, then jump to the step labeled done below. + */ + goto in_body_list_done; + } else { + /* + * > Otherwise, set node to the previous entry in the stack of open elements + * > and return to the step labeled loop. 
+ */ + foreach ( $this->state->stack_of_open_elements->walk_up( $node ) as $item ) { + $node = $item; + break; + } + goto in_body_list_loop; + } + + in_body_list_done: + if ( $this->state->stack_of_open_elements->has_p_in_button_scope() ) { + $this->close_a_p_element(); + } + + $this->insert_html_element( $this->state->current_token ); + return true; + + /* + * > An end tag whose tag name is "li" + * > An end tag whose tag name is one of: "dd", "dt" + */ + case '-DD': + case '-DT': + case '-LI': + if ( + /* + * An end tag whose tag name is "li": + * If the stack of open elements does not have an li element in list item scope, + * then this is a parse error; ignore the token. + */ + ( + 'LI' === $token_name && + ! $this->state->stack_of_open_elements->has_element_in_list_item_scope( 'LI' ) + ) || + /* + * An end tag whose tag name is one of: "dd", "dt": + * If the stack of open elements does not have an element in scope that is an + * HTML element with the same tag name as that of the token, then this is a + * parse error; ignore the token. + */ + ( + 'LI' !== $token_name && + ! $this->state->stack_of_open_elements->has_element_in_scope( $token_name ) + ) + ) { + /* + * This is a parse error, ignore the token. + * + * @todo Indicate a parse error once it's possible. + */ + return $this->step(); + } + + $this->generate_implied_end_tags( $token_name ); + + if ( $token_name !== $this->state->stack_of_open_elements->current_node()->node_name ) { + // @todo Indicate a parse error once it's possible. This error does not impact the logic here. + } + + $this->state->stack_of_open_elements->pop_until( $token_name ); return true; /* @@ -730,47 +1024,174 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { return true; /* + * > An end tag whose tag name is "br" + * > Parse error. Drop the attributes from the token, and act as described in the next + * > entry; i.e. 
act as if this was a "br" start tag token with no attributes, rather + * > than the end tag token that it actually is. + */ + case '-BR': + $this->last_error = self::ERROR_UNSUPPORTED; + throw new WP_HTML_Unsupported_Exception( 'Closing BR tags require unimplemented special handling.' ); + + /* * > A start tag whose tag name is one of: "area", "br", "embed", "img", "keygen", "wbr" */ + case '+AREA': + case '+BR': + case '+EMBED': case '+IMG': + case '+KEYGEN': + case '+WBR': $this->reconstruct_active_formatting_elements(); $this->insert_html_element( $this->state->current_token ); + $this->state->frameset_ok = false; return true; /* - * > Any other start tag + * > A start tag whose tag name is "input" */ - case '+SPAN': + case '+INPUT': $this->reconstruct_active_formatting_elements(); $this->insert_html_element( $this->state->current_token ); + $type_attribute = $this->get_attribute( 'type' ); + /* + * > If the token does not have an attribute with the name "type", or if it does, + * > but that attribute's value is not an ASCII case-insensitive match for the + * > string "hidden", then: set the frameset-ok flag to "not ok". + */ + if ( ! is_string( $type_attribute ) || 'hidden' !== strtolower( $type_attribute ) ) { + $this->state->frameset_ok = false; + } return true; /* - * Any other end tag + * > A start tag whose tag name is "hr" */ - case '-SPAN': - foreach ( $this->state->stack_of_open_elements->walk_up() as $item ) { - // > If node is an HTML element with the same tag name as the token, then: - if ( $item->node_name === $tag_name ) { - $this->generate_implied_end_tags( $tag_name ); + case '+HR': + if ( $this->state->stack_of_open_elements->has_p_in_button_scope() ) { + $this->close_a_p_element(); + } + $this->insert_html_element( $this->state->current_token ); + $this->state->frameset_ok = false; + return true; - // > If node is not the current node, then this is a parse error. 
+ /* + * > A start tag whose tag name is one of: "param", "source", "track" + */ + case '+PARAM': + case '+SOURCE': + case '+TRACK': + $this->insert_html_element( $this->state->current_token ); + return true; + } - $this->state->stack_of_open_elements->pop_until( $tag_name ); - return true; - } + /* + * These tags require special handling in the 'in body' insertion mode + * but that handling hasn't yet been implemented. + * + * As the rules for each tag are implemented, the corresponding tag + * name should be removed from this list. An accompanying test should + * help ensure this list is maintained. + * + * @see Tests_HtmlApi_WpHtmlProcessor::test_step_in_body_fails_on_unsupported_tags + * + * Since this switch structure throws a WP_HTML_Unsupported_Exception, it's + * possible to handle "any other start tag" and "any other end tag" below, + * as that guarantees execution doesn't proceed for the unimplemented tags. + * + * @see https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inbody + */ + switch ( $token_name ) { + case 'APPLET': + case 'BASE': + case 'BASEFONT': + case 'BGSOUND': + case 'BODY': + case 'CAPTION': + case 'COL': + case 'COLGROUP': + case 'FORM': + case 'FRAME': + case 'FRAMESET': + case 'HEAD': + case 'HTML': + case 'IFRAME': + case 'LINK': + case 'MARQUEE': + case 'MATH': + case 'META': + case 'NOBR': + case 'NOEMBED': + case 'NOFRAMES': + case 'NOSCRIPT': + case 'OBJECT': + case 'OPTGROUP': + case 'OPTION': + case 'PLAINTEXT': + case 'RB': + case 'RP': + case 'RT': + case 'RTC': + case 'SARCASM': + case 'SCRIPT': + case 'SELECT': + case 'STYLE': + case 'SVG': + case 'TABLE': + case 'TBODY': + case 'TD': + case 'TEMPLATE': + case 'TEXTAREA': + case 'TFOOT': + case 'TH': + case 'THEAD': + case 'TITLE': + case 'TR': + case 'XMP': + $this->last_error = self::ERROR_UNSUPPORTED; + throw new WP_HTML_Unsupported_Exception( "Cannot process {$token_name} element." 
); + } - // > Otherwise, if node is in the special category, then this is a parse error; ignore the token, and return. - if ( self::is_special( $item->node_name ) ) { - return $this->step(); - } + if ( ! $this->is_tag_closer() ) { + /* + * > Any other start tag + */ + $this->reconstruct_active_formatting_elements(); + $this->insert_html_element( $this->state->current_token ); + return true; + } else { + /* + * > Any other end tag + */ + + /* + * Find the corresponding tag opener in the stack of open elements, if + * it exists before reaching a special element, which provides a kind + * of boundary in the stack. For example, a `</custom-tag>` should not + * close anything beyond its containing `P` or `DIV` element. + */ + foreach ( $this->state->stack_of_open_elements->walk_up() as $node ) { + if ( $token_name === $node->node_name ) { + break; } - // Execution should not reach here; if it does then something went wrong. - return false; - default: - $this->last_error = self::ERROR_UNSUPPORTED; - throw new WP_HTML_Unsupported_Exception( "Cannot process {$tag_name} element." ); + if ( self::is_special( $node->node_name ) ) { + // This is a parse error, ignore the token. + return $this->step(); + } + } + + $this->generate_implied_end_tags( $token_name ); + if ( $node !== $this->state->stack_of_open_elements->current_node() ) { + // @todo Record parse error: this error doesn't impact parsing. + } + + foreach ( $this->state->stack_of_open_elements->walk_up() as $item ) { + $this->state->stack_of_open_elements->pop(); + if ( $node === $item ) { + return true; + } + } } } @@ -779,19 +1200,16 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { */ /** - * Creates a new bookmark for the currently-matched tag and returns the generated name. + * Creates a new bookmark for the currently-matched token and returns the generated name. * * @since 6.4.0 + * @since 6.5.0 Renamed from bookmark_tag() to bookmark_token(). 
* * @throws Exception When unable to allocate requested bookmark. * * @return string|false Name of created bookmark, or false if unable to create. */ - private function bookmark_tag() { - if ( ! $this->get_tag() ) { - return false; - } - + private function bookmark_token() { if ( ! parent::set_bookmark( ++$this->bookmark_counter ) ) { $this->last_error = self::ERROR_EXCEEDED_MAX_BOOKMARKS; throw new Exception( 'could not allocate bookmark' ); @@ -863,6 +1281,10 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { /** * Moves the internal cursor in the HTML Processor to a given bookmark's location. * + * Be careful! Seeking backwards to a previous location resets the parser to the + * start of the document and reparses the entire contents up until it finds the + * sought-after bookmarked location. + * * In order to prevent accidental infinite loops, there's a * maximum limit on the number of times seek() can be called. * @@ -874,6 +1296,9 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { * @return bool Whether the internal cursor was successfully moved to the bookmark's location. */ public function seek( $bookmark_name ) { + // Flush any pending updates to the document before beginning. + $this->get_updated_html(); + $actual_bookmark_name = "_{$bookmark_name}"; $processor_started_at = $this->state->current_token ? $this->bookmarks[ $this->state->current_token->bookmark_name ]->start @@ -881,44 +1306,73 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { $bookmark_starts_at = $this->bookmarks[ $actual_bookmark_name ]->start; $direction = $bookmark_starts_at > $processor_started_at ? 'forward' : 'backward'; - switch ( $direction ) { - case 'forward': - // When moving forwards, re-parse the document until reaching the same location as the original bookmark. 
- while ( $this->step() ) { - if ( $bookmark_starts_at === $this->bookmarks[ $this->state->current_token->bookmark_name ]->start ) { - return true; - } + /* + * If seeking backwards, it's possible that the sought-after bookmark exists within an element + * which has been closed before the current cursor; in other words, it has already been removed + * from the stack of open elements. This means that it's insufficient to simply pop off elements + * from the stack of open elements which appear after the bookmarked location and then jump to + * that location, as the elements which were open before won't be re-opened. + * + * In order to maintain consistency, the HTML Processor rewinds to the start of the document + * and reparses everything until it finds the sought-after bookmark. + * + * There are potentially better ways to do this: cache the parser state for each bookmark and + * restore it when seeking; store an immutable and idempotent register of where elements open + * and close. + * + * If caching the parser state it will be essential to properly maintain the cached stack of + * open elements and active formatting elements when modifying the document. This could be a + * tedious and time-consuming process as well, and so for now will not be performed. + * + * It may be possible to track bookmarks for where elements open and close, and in doing so + * be able to quickly recalculate breadcrumbs for any element in the document. It may even + * be possible to remove the stack of open elements and compute it on the fly this way. + * If doing this, the parser would need to track the opening and closing locations for all + * tokens in the breadcrumb path for any and all bookmarks. By utilizing bookmarks themselves + * this list could be automatically maintained while modifying the document. Finding the + * breadcrumbs would then amount to traversing that list from the start until the token + * being inspected. 
Once an element closes, if there are no bookmarks pointing to locations + * within that element, then all of these locations may be forgotten to save on memory use + * and computation time. + */ + if ( 'backward' === $direction ) { + /* + * Instead of clearing the parser state and starting fresh, calling the stack methods + * maintains the proper flags in the parser. + */ + foreach ( $this->state->stack_of_open_elements->walk_up() as $item ) { + if ( 'context-node' === $item->bookmark_name ) { + break; } - return false; - - case 'backward': - /* - * When moving backwards, clear out all existing stack entries which appear after the destination - * bookmark. These could be stored for later retrieval, but doing so would require additional - * memory overhead and also demand that references and bookmarks are updated as the document - * changes. In time this could be a valuable optimization, but it's okay to give up that - * optimization in exchange for more CPU time to recompute the stack, to re-parse the - * document that may have already been parsed once. 
- */ - foreach ( $this->state->stack_of_open_elements->walk_up() as $item ) { - if ( $bookmark_starts_at >= $this->bookmarks[ $item->bookmark_name ]->start ) { - break; - } + $this->state->stack_of_open_elements->remove_node( $item ); + } - $this->state->stack_of_open_elements->remove_node( $item ); + foreach ( $this->state->active_formatting_elements->walk_up() as $item ) { + if ( 'context-node' === $item->bookmark_name ) { + break; } - foreach ( $this->state->active_formatting_elements->walk_up() as $item ) { - if ( $bookmark_starts_at >= $this->bookmarks[ $item->bookmark_name ]->start ) { - break; - } + $this->state->active_formatting_elements->remove_node( $item ); + } - $this->state->active_formatting_elements->remove_node( $item ); - } + parent::seek( 'context-node' ); + $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_BODY; + $this->state->frameset_ok = true; + } - return parent::seek( $actual_bookmark_name ); + // When moving forwards, reparse the document until reaching the same location as the original bookmark. + if ( $bookmark_starts_at === $this->bookmarks[ $this->state->current_token->bookmark_name ]->start ) { + return true; } + + while ( $this->step() ) { + if ( $bookmark_starts_at === $this->bookmarks[ $this->state->current_token->bookmark_name ]->start ) { + return true; + } + } + + return false; } /** @@ -1005,6 +1459,18 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { return parent::set_bookmark( "_{$bookmark_name}" ); } + /** + * Checks whether a bookmark with the given name exists. + * + * @since 6.5.0 + * + * @param string $bookmark_name Name to identify a bookmark that potentially exists. + * @return bool Whether that bookmark exists. 
+ */ + public function has_bookmark( $bookmark_name ) { + return parent::has_bookmark( "_{$bookmark_name}" ); + } + /* * HTML Parsing Algorithms */ @@ -1034,6 +1500,9 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { */ private function generate_implied_end_tags( $except_for_this_element = null ) { $elements_with_implied_end_tags = array( + 'DD', + 'DT', + 'LI', 'P', ); @@ -1059,6 +1528,9 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { */ private function generate_implied_end_tags_thoroughly() { $elements_with_implied_end_tags = array( + 'DD', + 'DT', + 'LI', 'P', ); @@ -1170,7 +1642,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { // > If formatting element is not in the stack of open elements, then this is a parse error; remove the element from the list, and return. if ( ! $this->state->stack_of_open_elements->contains_node( $formatting_element ) ) { - $this->state->active_formatting_elements->remove_node( $formatting_element->bookmark_name ); + $this->state->active_formatting_elements->remove_node( $formatting_element ); return; } @@ -1373,14 +1845,19 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { return ( 'AREA' === $tag_name || 'BASE' === $tag_name || + 'BASEFONT' === $tag_name || // Obsolete but still treated as void. + 'BGSOUND' === $tag_name || // Obsolete but still treated as void. 'BR' === $tag_name || 'COL' === $tag_name || 'EMBED' === $tag_name || + 'FRAME' === $tag_name || 'HR' === $tag_name || 'IMG' === $tag_name || 'INPUT' === $tag_name || + 'KEYGEN' === $tag_name || // Obsolete but still treated as void. 'LINK' === $tag_name || 'META' === $tag_name || + 'PARAM' === $tag_name || // Obsolete but still treated as void. 
'SOURCE' === $tag_name || 'TRACK' === $tag_name || 'WBR' === $tag_name @@ -1410,6 +1887,15 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { const REPROCESS_CURRENT_NODE = 'reprocess-current-node'; /** + * Indicates that the current HTML token should be processed without advancing the parser. + * + * @since 6.5.0 + * + * @var string + */ + const PROCESS_CURRENT_NODE = 'process-current-node'; + + /** * Indicates that the parser encountered unsupported markup and has bailed. * * @since 6.4.0 diff --git a/wp-includes/html-api/class-wp-html-span.php b/wp-includes/html-api/class-wp-html-span.php index 46227eb..b1ab865 100644 --- a/wp-includes/html-api/class-wp-html-span.php +++ b/wp-includes/html-api/class-wp-html-span.php @@ -18,6 +18,7 @@ * * @access private * @since 6.2.0 + * @since 6.5.0 Replaced `end` with `length` to more closely align with `substr()`. * * @see WP_HTML_Tag_Processor */ @@ -26,28 +27,30 @@ class WP_HTML_Span { * Byte offset into document where span begins. * * @since 6.2.0 + * * @var int */ public $start; /** - * Byte offset into document where span ends. + * Byte length of this span. + * + * @since 6.5.0 * - * @since 6.2.0 * @var int */ - public $end; + public $length; /** * Constructor. * * @since 6.2.0 * - * @param int $start Byte offset into document where replacement span begins. - * @param int $end Byte offset into document where replacement span ends. + * @param int $start Byte offset into document where replacement span begins. + * @param int $length Byte length of span. 
*/ - public function __construct( $start, $end ) { - $this->start = $start; - $this->end = $end; + public function __construct( $start, $length ) { + $this->start = $start; + $this->length = $length; } } diff --git a/wp-includes/html-api/class-wp-html-tag-processor.php b/wp-includes/html-api/class-wp-html-tag-processor.php index 0572c46..c540ea9 100644 --- a/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/wp-includes/html-api/class-wp-html-tag-processor.php @@ -15,9 +15,6 @@ * - Prune the whitespace when removing classes/attributes: e.g. "a b c" -> "c" not " c". * This would increase the size of the changes for some operations but leave more * natural-looking output HTML. - * - Decode HTML character references within class names when matching. E.g. match having - * class `1<"2` needs to recognize `class="1<"2"`. Currently the Tag Processor - * will fail to find the right tag if the class name is encoded as such. * - Properly decode HTML character references in `get_attribute()`. PHP's * `html_entity_decode()` is wrong in a couple ways: it doesn't account for the * no-ambiguous-ampersand rule, and it improperly handles the way semicolons may @@ -107,6 +104,56 @@ * given, it will return `true` (the only way to set `false` for an * attribute is to remove it). * + * #### When matching fails + * + * When `next_tag()` returns `false` it could mean different things: + * + * - The requested tag wasn't found in the input document. + * - The input document ended in the middle of an HTML syntax element. + * + * When a document ends in the middle of a syntax element it will pause + * the processor. This is to make it possible in the future to extend the + * input document and proceed - an important requirement for chunked + * streaming parsing of a document. 
+ * + * Example: + * + * $processor = new WP_HTML_Tag_Processor( 'This <div is="a" partial="token' ); + * false === $processor->next_tag(); + * + * If a special element (see next section) is encountered but no closing tag + * is found it will count as an incomplete tag. The parser will pause as if + * the opening tag were incomplete. + * + * Example: + * + * $processor = new WP_HTML_Tag_Processor( '<style>// there could be more styling to come' ); + * false === $processor->next_tag(); + * + * $processor = new WP_HTML_Tag_Processor( '<style>// this is everything</style><div>' ); + * true === $processor->next_tag( 'DIV' ); + * + * #### Special elements + * + * Some HTML elements are handled in a special way; their start and end tags + * act like a void tag. These are special because their contents can't contain + * HTML markup. Everything inside these elements is handled in a special way + * and content that _appears_ like HTML tags inside of them isn't. There can + * be no nesting in these elements. + * + * In the following list, "raw text" means that all of the content in the HTML + * until the matching closing tag is treated verbatim without any replacements + * and without any parsing. + * + * - IFRAME allows no content but requires a closing tag. + * - NOEMBED (deprecated) content is raw text. + * - NOFRAMES (deprecated) content is raw text. + * - SCRIPT content is plaintext apart from legacy rules allowing `</script>` inside an HTML comment. + * - STYLE content is raw text. + * - TITLE content is plain text but character references are decoded. + * - TEXTAREA content is plain text but character references are decoded. + * - XMP (deprecated) content is raw text. + * * ### Modifying HTML attributes for a found tag * * Once you've found the start of an opening tag you can modify @@ -200,6 +247,95 @@ * } * } * + * ## Tokens and finer-grained processing. 
+ * + * It's possible to scan through every lexical token in the + * HTML document using the `next_token()` function. This + * alternative form takes no argument and provides no built-in + * query syntax. + * + * Example: + * + * $title = '(untitled)'; + * $text = ''; + * while ( $processor->next_token() ) { + * switch ( $processor->get_token_name() ) { + * case '#text': + * $text .= $processor->get_modifiable_text(); + * break; + * + * case 'BR': + * $text .= "\n"; + * break; + * + * case 'TITLE': + * $title = $processor->get_modifiable_text(); + * break; + * } + * } + * return trim( "# {$title}\n\n{$text}" ); + * + * ### Tokens and _modifiable text_. + * + * #### Special "atomic" HTML elements. + * + * Not all HTML elements are able to contain other elements inside of them. + * For instance, the contents inside a TITLE element are plaintext (except + * that character references like &amp; will be decoded). This means that + * if the string `<img>` appears inside a TITLE element, then it's not an + * image tag, but rather it's text describing an image tag. Likewise, the + * contents of a SCRIPT or STYLE element are handled entirely separately in + * a browser than the contents of other elements because they represent a + * different language than HTML. + * + * For these elements the Tag Processor treats the entire sequence as one, + * from the opening tag, including its contents, through its closing tag. + * This means that it's not possible to match the closing tag for a + * SCRIPT element unless it's unexpected; the Tag Processor already matched + * it when it found the opening tag. + * + * The inner contents of these elements are that element's _modifiable text_. + * + * The special elements are: + * - `SCRIPT` whose contents are treated as raw plaintext but supports a legacy + * style of including Javascript inside of HTML comments to avoid accidentally + * closing the SCRIPT from inside a Javascript string. E.g. `console.log( '</script>' )`.
+ * - `TITLE` and `TEXTAREA` whose contents are treated as plaintext and then any + * character references are decoded. E.g. `1 &lt; 2 &lt; 3` becomes `1 < 2 < 3`. + * - `IFRAME`, `NOEMBED`, `NOFRAMES`, `STYLE`, `XMP` whose contents are treated as + * raw plaintext and left as-is. E.g. `1 &lt; 2 &lt; 3` remains `1 &lt; 2 &lt; 3`. + * + * #### Other tokens with modifiable text. + * + * There are also non-elements which are void/self-closing in nature and contain + * modifiable text that is part of that individual syntax token itself. + * + * - `#text` nodes, whose entire token _is_ the modifiable text. + * - HTML comments and tokens that become comments due to some syntax error. The + * text for these tokens is the portion of the comment inside of the syntax. + * E.g. for `<!-- comment -->` the text is `" comment "` (note the spaces are included). + * - `CDATA` sections, whose text is the content inside of the section itself. E.g. for + * `<![CDATA[some content]]>` the text is `"some content"` (with restrictions [1]). + * - "Funky comments," which are a special case of invalid closing tags whose name is + * invalid. The text for these nodes is the text that a browser would transform into + * an HTML comment when parsing. E.g. for `</%post_author>` the text is `%post_author`. + * - `DOCTYPE` declarations like `<!DOCTYPE html>` which have no closing tag. + * - XML Processing instruction nodes like `<?wp __( "Like" ); ?>` (with restrictions [2]). + * - The empty end tag `</>` which is ignored in the browser and DOM. + * + * [1]: There are no CDATA sections in HTML. When encountering `<![CDATA[`, everything + * until the next `>` becomes a bogus HTML comment, meaning there can be no CDATA + * section in an HTML document containing `>`. The Tag Processor will first find + * all valid and bogus HTML comments, and then if the comment _would_ have been a + * CDATA section _were they to exist_, it will indicate this as the type of comment.
+ * + * [2]: XML allows a broader range of characters in a processing instruction's target name + * and disallows "xml" as a name, since it's special. The Tag Processor only recognizes + * target names with an ASCII-representable subset of characters. It also exhibits the + * same constraint as with CDATA sections, in that `>` cannot exist within the token + * since Processing Instructions do not exist within HTML and their syntax transforms + * into a bogus comment in the DOM. + * * ## Design and limitations * * The Tag Processor is designed to linearly scan HTML documents and tokenize @@ -241,9 +377,40 @@ * double-quoted strings, meaning that attributes on input with single-quoted or * unquoted values will appear in the output with double-quotes. * + * ### Scripting Flag + * + * The Tag Processor parses HTML with the "scripting flag" disabled. This means + * that it doesn't run any scripts while parsing the page. In a browser with + * JavaScript enabled, for example, the script can change the parse of the + * document as it loads. On the server, however, evaluating JavaScript is not + * only impractical, but also unwanted. + * + * Practically this means that the Tag Processor will descend into NOSCRIPT + * elements and process their child tags. Were the scripting flag enabled, such + * as in a typical browser, the contents of NOSCRIPT would be skipped entirely. + * + * This allows the HTML API to process the content that will be presented in + * a browser when scripting is disabled, but it offers a different view of a + * page than most browser sessions will experience. E.g. the tags inside the + * NOSCRIPT disappear. + * + * ### Text Encoding + * + * The Tag Processor assumes that the input HTML document is encoded with a + * text encoding compatible with 7-bit ASCII's '<', '>', '&', ';', '/', '=', + * "'", '"', 'a' - 'z', 'A' - 'Z', and the whitespace characters ' ', tab, + * carriage-return, newline, and form-feed.
+ * + * In practice, this includes almost every single-byte encoding as well as + * UTF-8. Notably, however, it does not include UTF-16. If providing input + * that's incompatible, then convert the encoding beforehand. + * * @since 6.2.0 * @since 6.2.1 Fix: Support for various invalid comments; attribute updates are case-insensitive. * @since 6.3.2 Fix: Skip HTML-like content inside rawtext elements such as STYLE. + * @since 6.5.0 Pauses processor when input ends in an incomplete syntax token. + * Introduces "special" elements which act like void elements, e.g. TITLE, STYLE. + * Allows scanning through all tokens and processing modifiable text, where applicable. */ class WP_HTML_Tag_Processor { /** @@ -317,6 +484,51 @@ class WP_HTML_Tag_Processor { private $stop_on_tag_closers; /** + * Specifies mode of operation of the parser at any given time. + * + * | State | Meaning | + * | ----------------|----------------------------------------------------------------------| + * | *Ready* | The parser is ready to run. | + * | *Complete* | There is nothing left to parse. | + * | *Incomplete* | The HTML ended in the middle of a token; nothing more can be parsed. | + * | *Matched tag* | Found an HTML tag; it's possible to modify its attributes. | + * | *Text node* | Found a #text node; this is plaintext and modifiable. | + * | *CDATA node* | Found a CDATA section; this is modifiable. | + * | *Comment* | Found a comment or bogus comment; this is modifiable. | + * | *Presumptuous* | Found an empty tag closer: `</>`. | + * | *Funky comment* | Found a tag closer with an invalid tag name; this is modifiable. 
| + * + * @since 6.5.0 + * + * @see WP_HTML_Tag_Processor::STATE_READY + * @see WP_HTML_Tag_Processor::STATE_COMPLETE + * @see WP_HTML_Tag_Processor::STATE_INCOMPLETE_INPUT + * @see WP_HTML_Tag_Processor::STATE_MATCHED_TAG + * @see WP_HTML_Tag_Processor::STATE_TEXT_NODE + * @see WP_HTML_Tag_Processor::STATE_CDATA_NODE + * @see WP_HTML_Tag_Processor::STATE_COMMENT + * @see WP_HTML_Tag_Processor::STATE_DOCTYPE + * @see WP_HTML_Tag_Processor::STATE_PRESUMPTUOUS_TAG + * @see WP_HTML_Tag_Processor::STATE_FUNKY_COMMENT + * + * @var string + */ + protected $parser_state = self::STATE_READY; + + /** + * What kind of syntax token became an HTML comment. + * + * Since there are many ways in which HTML syntax can create an HTML comment, + * this indicates which of those caused it. This allows the Tag Processor to + * represent more from the original input document than would appear in the DOM. + * + * @since 6.5.0 + * + * @var string|null + */ + protected $comment_type = null; + + /** * How many bytes from the original HTML document have been read and parsed. * * This value points to the latest byte offset in the input document which @@ -329,6 +541,40 @@ class WP_HTML_Tag_Processor { private $bytes_already_parsed = 0; /** + * Byte offset in input document where current token starts. + * + * Example: + * + * <div id="test">... + * 01234 + * - token starts at 0 + * + * @since 6.5.0 + * + * @var int|null + */ + private $token_starts_at; + + /** + * Byte length of current token. + * + * Example: + * + * <div id="test">... + * 012345678901234 + * - token length is 14 - 0 = 14 + * + * a <!-- comment --> is a token. + * 0123456789 123456789 123456789 + * - token length is 17 - 2 = 15 + * + * @since 6.5.0 + * + * @var int|null + */ + private $token_length; + + /** * Byte offset in input document where current tag name starts. 
* * Example: @@ -338,6 +584,7 @@ class WP_HTML_Tag_Processor { * - tag name starts at 1 * * @since 6.2.0 + * * @var int|null */ private $tag_name_starts_at; @@ -352,24 +599,28 @@ class WP_HTML_Tag_Processor { * --- tag name length is 3 * * @since 6.2.0 + * * @var int|null */ private $tag_name_length; /** - * Byte offset in input document where current tag token ends. + * Byte offset into input document where current modifiable text starts. * - * Example: + * @since 6.5.0 * - * <div id="test">... - * 0 1 | - * 01234567890123456 - * --- tag name ends at 14 + * @var int + */ + private $text_starts_at; + + /** + * Byte length of modifiable text. * - * @since 6.2.0 - * @var int|null + * @since 6.5.0 + * + * @var int */ - private $tag_ends_at; + private $text_length; /** * Whether the current tag is an opening tag, e.g. <div>, or a closing tag, e.g. </div>. @@ -388,14 +639,14 @@ class WP_HTML_Tag_Processor { * // <div id="test-4" class=outline title="data:text/plain;base64=asdk3nk1j3fo8"> * // ^ parsing will continue from this point. * $this->attributes = array( - * 'id' => new WP_HTML_Attribute_Match( 'id', null, 6, 17 ) + * 'id' => new WP_HTML_Attribute_Token( 'id', 9, 6, 5, 11, false ) * ); * * // When picking up parsing again, or when asking to find the * // `class` attribute we will continue and add to this array. * $this->attributes = array( - * 'id' => new WP_HTML_Attribute_Match( 'id', null, 6, 17 ), - * 'class' => new WP_HTML_Attribute_Match( 'class', 'outline', 18, 32 ) + * 'id' => new WP_HTML_Attribute_Token( 'id', 9, 6, 5, 11, false ), + * 'class' => new WP_HTML_Attribute_Token( 'class', 23, 7, 17, 13, false ) * ); * * // Note that only the `class` attribute value is stored in the index. @@ -484,9 +735,9 @@ class WP_HTML_Tag_Processor { * * // Replace an attribute stored with a new value, indices * // sourced from the lazily-parsed HTML recognizer.
- * $start = $attributes['src']->start; - * $end = $attributes['src']->end; - * $modifications[] = new WP_HTML_Text_Replacement( $start, $end, $new_value ); + * $start = $attributes['src']->start; + * $length = $attributes['src']->length; + * $modifications[] = new WP_HTML_Text_Replacement( $start, $length, $new_value ); * * // Correspondingly, something like this will appear in this array. * $lexical_updates = array( @@ -523,6 +774,7 @@ class WP_HTML_Tag_Processor { * Finds the next tag matching the $query. * * @since 6.2.0 + * @since 6.5.0 No longer processes incomplete tokens at end of document; pauses the processor at start of token. * * @param array|string|null $query { * Optional. Which tag name to find, having which class, etc. Default is to find any tag. @@ -541,90 +793,253 @@ class WP_HTML_Tag_Processor { $already_found = 0; do { - if ( $this->bytes_already_parsed >= strlen( $this->html ) ) { + if ( false === $this->next_token() ) { return false; } - // Find the next tag if it exists. - if ( false === $this->parse_next_tag() ) { - $this->bytes_already_parsed = strlen( $this->html ); - - return false; - } - - // Parse all of its attributes. - while ( $this->parse_next_attribute() ) { + if ( self::STATE_MATCHED_TAG !== $this->parser_state ) { continue; } - // Ensure that the tag closes before the end of the document. - if ( $this->bytes_already_parsed >= strlen( $this->html ) ) { - return false; + if ( $this->matches() ) { + ++$already_found; } + } while ( $already_found < $this->sought_match_offset ); - $tag_ends_at = strpos( $this->html, '>', $this->bytes_already_parsed ); - if ( false === $tag_ends_at ) { - return false; - } - $this->tag_ends_at = $tag_ends_at; - $this->bytes_already_parsed = $tag_ends_at; + return true; + } - // Finally, check if the parsed tag and its attributes match the search query. - if ( $this->matches() ) { - ++$already_found; + /** + * Finds the next token in the HTML document. 
+ * + * An HTML document can be viewed as a stream of tokens, + * where tokens are things like HTML tags, HTML comments, + * text nodes, etc. This method finds the next token in + * the HTML document and returns whether it found one. + * + * If it starts parsing a token and reaches the end of the + * document then it will seek to the start of the last + * token and pause, returning `false` to indicate that it + * failed to find a complete token. + * + * Possible token types, based on the HTML specification: + * + * - an HTML tag, whether opening, closing, or void. + * - a text node - the plaintext inside tags. + * - an HTML comment. + * - a DOCTYPE declaration. + * - a processing instruction, e.g. `<?xml version="1.0" ?>`. + * + * The Tag Processor currently only supports the tag token. + * + * @since 6.5.0 + * + * @return bool Whether a token was parsed. + */ + public function next_token() { + return $this->base_class_next_token(); + } + + /** + * Internal method which finds the next token in the HTML document. + * + * This method is a protected internal function which implements the logic for + * finding the next token in a document. It exists so that the parser can update + * its state without affecting the location of the cursor in the document and + * without triggering subclass methods for things like `next_token()`, e.g. when + * applying patches before searching for the next token. + * + * @since 6.5.0 + * + * @access private + * + * @return bool Whether a token was parsed. + */ + private function base_class_next_token() { + $was_at = $this->bytes_already_parsed; + $this->after_tag(); + + // Don't proceed if there's nothing more to scan. + if ( + self::STATE_COMPLETE === $this->parser_state || + self::STATE_INCOMPLETE_INPUT === $this->parser_state + ) { + return false; + } + + /* + * The next step in the parsing loop determines the parsing state; + * clear it so that state doesn't linger from the previous step. 
+ */ + $this->parser_state = self::STATE_READY; + + if ( $this->bytes_already_parsed >= strlen( $this->html ) ) { + $this->parser_state = self::STATE_COMPLETE; + return false; + } + + // Find the next tag if it exists. + if ( false === $this->parse_next_tag() ) { + if ( self::STATE_INCOMPLETE_INPUT === $this->parser_state ) { + $this->bytes_already_parsed = $was_at; } + return false; + } + + /* + * For legacy reasons the rest of this function handles tags and their + * attributes. If the processor has reached the end of the document + * or if it matched any other token then it should return here to avoid + * attempting to process tag-specific syntax. + */ + if ( + self::STATE_INCOMPLETE_INPUT !== $this->parser_state && + self::STATE_COMPLETE !== $this->parser_state && + self::STATE_MATCHED_TAG !== $this->parser_state + ) { + return true; + } + + // Parse all of its attributes. + while ( $this->parse_next_attribute() ) { + continue; + } + + // Ensure that the tag closes before the end of the document. + if ( + self::STATE_INCOMPLETE_INPUT === $this->parser_state || + $this->bytes_already_parsed >= strlen( $this->html ) + ) { + // Does this appropriately clear state (parsed attributes)? + $this->parser_state = self::STATE_INCOMPLETE_INPUT; + $this->bytes_already_parsed = $was_at; + + return false; + } + + $tag_ends_at = strpos( $this->html, '>', $this->bytes_already_parsed ); + if ( false === $tag_ends_at ) { + $this->parser_state = self::STATE_INCOMPLETE_INPUT; + $this->bytes_already_parsed = $was_at; + + return false; + } + $this->parser_state = self::STATE_MATCHED_TAG; + $this->token_length = $tag_ends_at - $this->token_starts_at; + $this->bytes_already_parsed = $tag_ends_at + 1; + + /* + * For non-DATA sections which might contain text that looks like HTML tags but + * isn't, scan with the appropriate alternative mode. Looking at the first letter + * of the tag name as a pre-check avoids a string allocation when it's not needed. 
+ */ + $t = $this->html[ $this->tag_name_starts_at ]; + if ( + $this->is_closing_tag || + ! ( + 'i' === $t || 'I' === $t || + 'n' === $t || 'N' === $t || + 's' === $t || 'S' === $t || + 't' === $t || 'T' === $t || + 'x' === $t || 'X' === $t + ) + ) { + return true; + } + + $tag_name = $this->get_tag(); + + /* + * Preserve the opening tag pointers, as these will be overwritten + * when finding the closing tag. They will be reset after finding + * the closing to tag to point to the opening of the special atomic + * tag sequence. + */ + $tag_name_starts_at = $this->tag_name_starts_at; + $tag_name_length = $this->tag_name_length; + $tag_ends_at = $this->token_starts_at + $this->token_length; + $attributes = $this->attributes; + $duplicate_attributes = $this->duplicate_attributes; + + // Find the closing tag if necessary. + $found_closer = false; + switch ( $tag_name ) { + case 'SCRIPT': + $found_closer = $this->skip_script_data(); + break; + + case 'TEXTAREA': + case 'TITLE': + $found_closer = $this->skip_rcdata( $tag_name ); + break; + /* - * For non-DATA sections which might contain text that looks like HTML tags but - * isn't, scan with the appropriate alternative mode. Looking at the first letter - * of the tag name as a pre-check avoids a string allocation when it's not needed. + * In the browser this list would include the NOSCRIPT element, + * but the Tag Processor is an environment with the scripting + * flag disabled, meaning that it needs to descend into the + * NOSCRIPT element to be able to properly process what will be + * sent to a browser. + * + * Note that this rule makes HTML5 syntax incompatible with XML, + * because the parsing of this token depends on client application. + * The NOSCRIPT element cannot be represented in the XHTML syntax. */ - $t = $this->html[ $this->tag_name_starts_at ]; - if ( - ! 
$this->is_closing_tag && - ( - 'i' === $t || 'I' === $t || - 'n' === $t || 'N' === $t || - 's' === $t || 'S' === $t || - 't' === $t || 'T' === $t - ) ) { - $tag_name = $this->get_tag(); - - if ( 'SCRIPT' === $tag_name && ! $this->skip_script_data() ) { - $this->bytes_already_parsed = strlen( $this->html ); - return false; - } elseif ( - ( 'TEXTAREA' === $tag_name || 'TITLE' === $tag_name ) && - ! $this->skip_rcdata( $tag_name ) - ) { - $this->bytes_already_parsed = strlen( $this->html ); - return false; - } elseif ( - ( - 'IFRAME' === $tag_name || - 'NOEMBED' === $tag_name || - 'NOFRAMES' === $tag_name || - 'NOSCRIPT' === $tag_name || - 'STYLE' === $tag_name - ) && - ! $this->skip_rawtext( $tag_name ) - ) { - /* - * "XMP" should be here too but its rules are more complicated and require the - * complexity of the HTML Processor (it needs to close out any open P element, - * meaning it can't be skipped here or else the HTML Processor will lose its - * place). For now, it can be ignored as it's a rare HTML tag in practice and - * any normative HTML should be using PRE instead. - */ - $this->bytes_already_parsed = strlen( $this->html ); - return false; - } - } - } while ( $already_found < $this->sought_match_offset ); + case 'IFRAME': + case 'NOEMBED': + case 'NOFRAMES': + case 'STYLE': + case 'XMP': + $found_closer = $this->skip_rawtext( $tag_name ); + break; + + // No other tags should be treated in their entirety here. + default: + return true; + } + + if ( ! $found_closer ) { + $this->parser_state = self::STATE_INCOMPLETE_INPUT; + $this->bytes_already_parsed = $was_at; + return false; + } + + /* + * The values here look like they reference the opening tag but they reference + * the closing tag instead. This is why the opening tag values were stored + * above in a variable. It reads confusingly here, but that's because the + * functions that skip the contents have moved all the internal cursors past + * the inner content of the tag. 
+ */ + $this->token_starts_at = $was_at; + $this->token_length = $this->bytes_already_parsed - $this->token_starts_at; + $this->text_starts_at = $tag_ends_at + 1; + $this->text_length = $this->tag_name_starts_at - $this->text_starts_at; + $this->tag_name_starts_at = $tag_name_starts_at; + $this->tag_name_length = $tag_name_length; + $this->attributes = $attributes; + $this->duplicate_attributes = $duplicate_attributes; return true; } + /** + * Whether the processor paused because the input HTML document ended + * in the middle of a syntax element, such as in the middle of a tag. + * + * Example: + * + * $processor = new WP_HTML_Tag_Processor( '<input type="text" value="Th' ); + * false === $processor->get_next_tag(); + * true === $processor->paused_at_incomplete_token(); + * + * @since 6.5.0 + * + * @return bool Whether the parse paused at the start of an incomplete token. + */ + public function paused_at_incomplete_token() { + return self::STATE_INCOMPLETE_INPUT === $this->parser_state; + } /** * Generator for a foreach loop to step through each class name for the matched tag. @@ -643,6 +1058,10 @@ class WP_HTML_Tag_Processor { * @since 6.4.0 */ public function class_list() { + if ( self::STATE_MATCHED_TAG !== $this->parser_state ) { + return; + } + /** @var string $class contains the string value of the class attribute, with character references decoded. */ $class = $this->get_attribute( 'class' ); @@ -698,7 +1117,7 @@ class WP_HTML_Tag_Processor { * @return bool|null Whether the matched tag contains the given class name, or null if not matched. */ public function has_class( $wanted_class ) { - if ( ! $this->tag_name_starts_at ) { + if ( self::STATE_MATCHED_TAG !== $this->parser_state ) { return null; } @@ -795,7 +1214,11 @@ class WP_HTML_Tag_Processor { * @return bool Whether the bookmark was successfully created. 
*/ public function set_bookmark( $name ) { - if ( null === $this->tag_name_starts_at ) { + // It only makes sense to set a bookmark if the parser has paused on a concrete token. + if ( + self::STATE_COMPLETE === $this->parser_state || + self::STATE_INCOMPLETE_INPUT === $this->parser_state + ) { return false; } @@ -808,10 +1231,7 @@ class WP_HTML_Tag_Processor { return false; } - $this->bookmarks[ $name ] = new WP_HTML_Span( - $this->tag_name_starts_at - ( $this->is_closing_tag ? 2 : 1 ), - $this->tag_ends_at - ); + $this->bookmarks[ $name ] = new WP_HTML_Span( $this->token_starts_at, $this->token_length ); return true; } @@ -873,16 +1293,15 @@ class WP_HTML_Tag_Processor { $at = $this->bytes_already_parsed; while ( false !== $at && $at < $doc_length ) { - $at = strpos( $this->html, '</', $at ); + $at = strpos( $this->html, '</', $at ); + $this->tag_name_starts_at = $at; - // If there is no possible tag closer then fail. + // Fail if there is no possible tag closer. if ( false === $at || ( $at + $tag_length ) >= $doc_length ) { - $this->bytes_already_parsed = $doc_length; return false; } - $closer_potentially_starts_at = $at; - $at += 2; + $at += 2; /* * Find a case-insensitive match to the tag name. @@ -905,6 +1324,10 @@ class WP_HTML_Tag_Processor { $at += $tag_length; $this->bytes_already_parsed = $at; + if ( $at >= strlen( $html ) ) { + return false; + } + /* * Ensure that the tag name terminates to avoid matching on * substrings of a longer tag name. 
For example, the sequence @@ -919,13 +1342,23 @@ class WP_HTML_Tag_Processor { while ( $this->parse_next_attribute() ) { continue; } + $at = $this->bytes_already_parsed; if ( $at >= strlen( $this->html ) ) { return false; } - if ( '>' === $html[ $at ] || '/' === $html[ $at ] ) { - $this->bytes_already_parsed = $closer_potentially_starts_at; + if ( '>' === $html[ $at ] ) { + $this->bytes_already_parsed = $at + 1; + return true; + } + + if ( $at + 1 >= strlen( $this->html ) ) { + return false; + } + + if ( '/' === $html[ $at ] && '>' === $html[ $at + 1 ] ) { + $this->bytes_already_parsed = $at + 2; return true; } } @@ -1047,6 +1480,7 @@ class WP_HTML_Tag_Processor { if ( $is_closing ) { $this->bytes_already_parsed = $closer_potentially_starts_at; + $this->tag_name_starts_at = $closer_potentially_starts_at; if ( $this->bytes_already_parsed >= $doc_length ) { return false; } @@ -1055,8 +1489,14 @@ class WP_HTML_Tag_Processor { continue; } + if ( $this->bytes_already_parsed >= $doc_length ) { + $this->parser_state = self::STATE_INCOMPLETE_INPUT; + + return false; + } + if ( '>' === $html[ $this->bytes_already_parsed ] ) { - $this->bytes_already_parsed = $closer_potentially_starts_at; + ++$this->bytes_already_parsed; return true; } } @@ -1085,15 +1525,66 @@ class WP_HTML_Tag_Processor { $html = $this->html; $doc_length = strlen( $html ); - $at = $this->bytes_already_parsed; + $was_at = $this->bytes_already_parsed; + $at = $was_at; while ( false !== $at && $at < $doc_length ) { $at = strpos( $html, '<', $at ); + + /* + * This does not imply an incomplete parse; it indicates that there + * can be nothing left in the document other than a #text node. 
+ */ if ( false === $at ) { - return false; + $this->parser_state = self::STATE_TEXT_NODE; + $this->token_starts_at = $was_at; + $this->token_length = strlen( $html ) - $was_at; + $this->text_starts_at = $was_at; + $this->text_length = $this->token_length; + $this->bytes_already_parsed = strlen( $html ); + return true; + } + + if ( $at > $was_at ) { + /* + * A "<" normally starts a new HTML tag or syntax token, but in cases where the + * following character can't produce a valid token, the "<" is instead treated + * as plaintext and the parser should skip over it. This avoids a problem when + * following earlier practices of typing emoji with text, e.g. "<3". This + * should be a heart, not a tag. It's supposed to be rendered, not hidden. + * + * At this point the parser checks if this is one of those cases and if it is + * will continue searching for the next "<" in search of a token boundary. + * + * @see https://html.spec.whatwg.org/#tag-open-state + */ + if ( strlen( $html ) > $at + 1 ) { + $next_character = $html[ $at + 1 ]; + $at_another_node = ( + '!' === $next_character || + '/' === $next_character || + '?' === $next_character || + ( 'A' <= $next_character && $next_character <= 'Z' ) || + ( 'a' <= $next_character && $next_character <= 'z' ) + ); + if ( ! 
$at_another_node ) { + ++$at; + continue; + } + } + + $this->parser_state = self::STATE_TEXT_NODE; + $this->token_starts_at = $was_at; + $this->token_length = $at - $was_at; + $this->text_starts_at = $was_at; + $this->text_length = $this->token_length; + $this->bytes_already_parsed = $at; + return true; } - if ( '/' === $this->html[ $at + 1 ] ) { + $this->token_starts_at = $at; + + if ( $at + 1 < $doc_length && '/' === $this->html[ $at + 1 ] ) { $this->is_closing_tag = true; ++$at; } else { @@ -1117,8 +1608,9 @@ class WP_HTML_Tag_Processor { $tag_name_prefix_length = strspn( $html, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ', $at + 1 ); if ( $tag_name_prefix_length > 0 ) { ++$at; - $this->tag_name_length = $tag_name_prefix_length + strcspn( $html, " \t\f\r\n/>", $at + $tag_name_prefix_length ); + $this->parser_state = self::STATE_MATCHED_TAG; $this->tag_name_starts_at = $at; + $this->tag_name_length = $tag_name_prefix_length + strcspn( $html, " \t\f\r\n/>", $at + $tag_name_prefix_length ); $this->bytes_already_parsed = $at + $this->tag_name_length; return true; } @@ -1127,35 +1619,58 @@ class WP_HTML_Tag_Processor { * Abort if no tag is found before the end of * the document. There is nothing left to parse. */ - if ( $at + 1 >= strlen( $html ) ) { + if ( $at + 1 >= $doc_length ) { + $this->parser_state = self::STATE_INCOMPLETE_INPUT; + return false; } /* - * <! transitions to markup declaration open state + * `<!` transitions to markup declaration open state * https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state */ if ( '!' === $html[ $at + 1 ] ) { /* - * <!-- transitions to a bogus comment state – skip to the nearest --> + * `<!--` transitions to a comment state – apply further comment rules. 
* https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state */ if ( - strlen( $html ) > $at + 3 && + $doc_length > $at + 3 && '-' === $html[ $at + 2 ] && '-' === $html[ $at + 3 ] ) { $closer_at = $at + 4; // If it's not possible to close the comment then there is nothing more to scan. - if ( strlen( $html ) <= $closer_at ) { + if ( $doc_length <= $closer_at ) { + $this->parser_state = self::STATE_INCOMPLETE_INPUT; + return false; } // Abruptly-closed empty comments are a sequence of dashes followed by `>`. $span_of_dashes = strspn( $html, '-', $closer_at ); if ( '>' === $html[ $closer_at + $span_of_dashes ] ) { - $at = $closer_at + $span_of_dashes + 1; - continue; + /* + * @todo When implementing `set_modifiable_text()` ensure that updates to this token + * don't break the syntax for short comments, e.g. `<!--->`. Unlike other comment + * and bogus comment syntax, these leave no clear insertion point for text and + * they need to be modified specially in order to contain text. E.g. to store + * `?` as the modifiable text, the `<!--->` needs to become `<!--?-->`, which + * involves inserting an additional `-` into the token after the modifiable text. + */ + $this->parser_state = self::STATE_COMMENT; + $this->comment_type = self::COMMENT_AS_ABRUPTLY_CLOSED_COMMENT; + $this->token_length = $closer_at + $span_of_dashes + 1 - $this->token_starts_at; + + // Only provide modifiable text if the token is long enough to contain it. + if ( $span_of_dashes >= 2 ) { + $this->comment_type = self::COMMENT_AS_HTML_COMMENT; + $this->text_starts_at = $this->token_starts_at + 4; + $this->text_length = $span_of_dashes - 2; + } + + $this->bytes_already_parsed = $closer_at + $span_of_dashes + 1; + return true; } /* @@ -1165,55 +1680,47 @@ class WP_HTML_Tag_Processor { * See https://html.spec.whatwg.org/#parse-error-incorrectly-closed-comment */ --$closer_at; // Pre-increment inside condition below reduces risk of accidental infinite looping. 
- while ( ++$closer_at < strlen( $html ) ) { + while ( ++$closer_at < $doc_length ) { $closer_at = strpos( $html, '--', $closer_at ); if ( false === $closer_at ) { + $this->parser_state = self::STATE_INCOMPLETE_INPUT; + return false; } - if ( $closer_at + 2 < strlen( $html ) && '>' === $html[ $closer_at + 2 ] ) { - $at = $closer_at + 3; - continue 2; + if ( $closer_at + 2 < $doc_length && '>' === $html[ $closer_at + 2 ] ) { + $this->parser_state = self::STATE_COMMENT; + $this->comment_type = self::COMMENT_AS_HTML_COMMENT; + $this->token_length = $closer_at + 3 - $this->token_starts_at; + $this->text_starts_at = $this->token_starts_at + 4; + $this->text_length = $closer_at - $this->text_starts_at; + $this->bytes_already_parsed = $closer_at + 3; + return true; } - if ( $closer_at + 3 < strlen( $html ) && '!' === $html[ $closer_at + 2 ] && '>' === $html[ $closer_at + 3 ] ) { - $at = $closer_at + 4; - continue 2; + if ( + $closer_at + 3 < $doc_length && + '!' === $html[ $closer_at + 2 ] && + '>' === $html[ $closer_at + 3 ] + ) { + $this->parser_state = self::STATE_COMMENT; + $this->comment_type = self::COMMENT_AS_HTML_COMMENT; + $this->token_length = $closer_at + 4 - $this->token_starts_at; + $this->text_starts_at = $this->token_starts_at + 4; + $this->text_length = $closer_at - $this->text_starts_at; + $this->bytes_already_parsed = $closer_at + 4; + return true; } } } /* - * <![CDATA[ transitions to CDATA section state – skip to the nearest ]]> - * The CDATA is case-sensitive. 
- * https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state - */ - if ( - strlen( $html ) > $at + 8 && - '[' === $html[ $at + 2 ] && - 'C' === $html[ $at + 3 ] && - 'D' === $html[ $at + 4 ] && - 'A' === $html[ $at + 5 ] && - 'T' === $html[ $at + 6 ] && - 'A' === $html[ $at + 7 ] && - '[' === $html[ $at + 8 ] - ) { - $closer_at = strpos( $html, ']]>', $at + 9 ); - if ( false === $closer_at ) { - return false; - } - - $at = $closer_at + 3; - continue; - } - - /* - * <!DOCTYPE transitions to DOCTYPE state – skip to the nearest > + * `<!DOCTYPE` transitions to DOCTYPE state – skip to the nearest > * These are ASCII-case-insensitive. * https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state */ if ( - strlen( $html ) > $at + 8 && + $doc_length > $at + 8 && ( 'D' === $html[ $at + 2 ] || 'd' === $html[ $at + 2 ] ) && ( 'O' === $html[ $at + 3 ] || 'o' === $html[ $at + 3 ] ) && ( 'C' === $html[ $at + 4 ] || 'c' === $html[ $at + 4 ] ) && @@ -1224,59 +1731,179 @@ class WP_HTML_Tag_Processor { ) { $closer_at = strpos( $html, '>', $at + 9 ); if ( false === $closer_at ) { + $this->parser_state = self::STATE_INCOMPLETE_INPUT; + return false; } - $at = $closer_at + 1; - continue; + $this->parser_state = self::STATE_DOCTYPE; + $this->token_length = $closer_at + 1 - $this->token_starts_at; + $this->text_starts_at = $this->token_starts_at + 9; + $this->text_length = $closer_at - $this->text_starts_at; + $this->bytes_already_parsed = $closer_at + 1; + return true; } /* * Anything else here is an incorrectly-opened comment and transitions - * to the bogus comment state - skip to the nearest >. + * to the bogus comment state - skip to the nearest >. If no closer is + * found then the HTML was truncated inside the markup declaration. 
*/ - $at = strpos( $html, '>', $at + 1 ); - continue; + $closer_at = strpos( $html, '>', $at + 1 ); + if ( false === $closer_at ) { + $this->parser_state = self::STATE_INCOMPLETE_INPUT; + + return false; + } + + $this->parser_state = self::STATE_COMMENT; + $this->comment_type = self::COMMENT_AS_INVALID_HTML; + $this->token_length = $closer_at + 1 - $this->token_starts_at; + $this->text_starts_at = $this->token_starts_at + 2; + $this->text_length = $closer_at - $this->text_starts_at; + $this->bytes_already_parsed = $closer_at + 1; + + /* + * Identify nodes that would be CDATA if HTML had CDATA sections. + * + * This section must occur after identifying the bogus comment end + * because in an HTML parser it will span to the nearest `>`, even + * if there's no `]]>` as would be required in an XML document. It + * is therefore not possible to parse a CDATA section containing + * a `>` in the HTML syntax. + * + * Inside foreign elements there is a discrepancy between browsers + * and the specification on this. + * + * @todo Track whether the Tag Processor is inside a foreign element + * and require the proper closing `]]>` in those cases. + */ + if ( + $this->token_length >= 10 && + '[' === $html[ $this->token_starts_at + 2 ] && + 'C' === $html[ $this->token_starts_at + 3 ] && + 'D' === $html[ $this->token_starts_at + 4 ] && + 'A' === $html[ $this->token_starts_at + 5 ] && + 'T' === $html[ $this->token_starts_at + 6 ] && + 'A' === $html[ $this->token_starts_at + 7 ] && + '[' === $html[ $this->token_starts_at + 8 ] && + ']' === $html[ $closer_at - 1 ] && + ']' === $html[ $closer_at - 2 ] + ) { + $this->parser_state = self::STATE_COMMENT; + $this->comment_type = self::COMMENT_AS_CDATA_LOOKALIKE; + $this->text_starts_at += 7; + $this->text_length -= 9; + } + + return true; } /* * </> is a missing end tag name, which is ignored. 
* + * This was also known as the "presumptuous empty tag" + * in early discussions as it was proposed to close + * the nearest previous opening tag. + * * See https://html.spec.whatwg.org/#parse-error-missing-end-tag-name */ if ( '>' === $html[ $at + 1 ] ) { - ++$at; - continue; + $this->parser_state = self::STATE_PRESUMPTUOUS_TAG; + $this->token_length = $at + 2 - $this->token_starts_at; + $this->bytes_already_parsed = $at + 2; + return true; } /* - * <? transitions to a bogus comment state – skip to the nearest > + * `<?` transitions to a bogus comment state – skip to the nearest > * See https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state */ if ( '?' === $html[ $at + 1 ] ) { $closer_at = strpos( $html, '>', $at + 2 ); if ( false === $closer_at ) { + $this->parser_state = self::STATE_INCOMPLETE_INPUT; + return false; } - $at = $closer_at + 1; - continue; + $this->parser_state = self::STATE_COMMENT; + $this->comment_type = self::COMMENT_AS_INVALID_HTML; + $this->token_length = $closer_at + 1 - $this->token_starts_at; + $this->text_starts_at = $this->token_starts_at + 2; + $this->text_length = $closer_at - $this->text_starts_at; + $this->bytes_already_parsed = $closer_at + 1; + + /* + * Identify a Processing Instruction node were HTML to have them. + * + * This section must occur after identifying the bogus comment end + * because in an HTML parser it will span to the nearest `>`, even + * if there's no `?>` as would be required in an XML document. It + * is therefore not possible to parse a Processing Instruction node + * containing a `>` in the HTML syntax. + * + * XML allows for more target names, but this code only identifies + * those with ASCII-representable target names. This means that it + * may identify some Processing Instruction nodes as bogus comments, + * but it will not misinterpret the HTML structure. By limiting the + * identification to these target names the Tag Processor can avoid + * the need to start parsing UTF-8 sequences. 
+ * + * > NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | + * [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | + * [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | + * [#x10000-#xEFFFF] + * > NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040] + * + * @see https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PITarget + */ + if ( $this->token_length >= 5 && '?' === $html[ $closer_at - 1 ] ) { + $comment_text = substr( $html, $this->token_starts_at + 2, $this->token_length - 4 ); + $pi_target_length = strspn( $comment_text, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ:_' ); + + if ( 0 < $pi_target_length ) { + $pi_target_length += strspn( $comment_text, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789:_-.', $pi_target_length ); + + $this->comment_type = self::COMMENT_AS_PI_NODE_LOOKALIKE; + $this->tag_name_starts_at = $this->token_starts_at + 2; + $this->tag_name_length = $pi_target_length; + $this->text_starts_at += $pi_target_length; + $this->text_length -= $pi_target_length + 1; + } + } + + return true; } /* * If a non-alpha starts the tag name in a tag closer it's a comment. * Find the first `>`, which closes the comment. * + * This parser classifies these particular comments as special "funky comments" + * which are made available for further processing. + * * See https://html.spec.whatwg.org/#parse-error-invalid-first-character-of-tag-name */ if ( $this->is_closing_tag ) { + // No chance of finding a closer. 
+ if ( $at + 3 > $doc_length ) { + return false; + } + $closer_at = strpos( $html, '>', $at + 3 ); if ( false === $closer_at ) { + $this->parser_state = self::STATE_INCOMPLETE_INPUT; + return false; } - $at = $closer_at + 1; - continue; + $this->parser_state = self::STATE_FUNKY_COMMENT; + $this->token_length = $closer_at + 1 - $this->token_starts_at; + $this->text_starts_at = $this->token_starts_at + 2; + $this->text_length = $closer_at - $this->text_starts_at; + $this->bytes_already_parsed = $closer_at + 1; + return true; } ++$at; @@ -1296,6 +1923,8 @@ class WP_HTML_Tag_Processor { // Skip whitespace and slashes. $this->bytes_already_parsed += strspn( $this->html, " \t\f\r\n/", $this->bytes_already_parsed ); if ( $this->bytes_already_parsed >= strlen( $this->html ) ) { + $this->parser_state = self::STATE_INCOMPLETE_INPUT; + return false; } @@ -1318,11 +1947,15 @@ class WP_HTML_Tag_Processor { $attribute_name = substr( $this->html, $attribute_start, $name_length ); $this->bytes_already_parsed += $name_length; if ( $this->bytes_already_parsed >= strlen( $this->html ) ) { + $this->parser_state = self::STATE_INCOMPLETE_INPUT; + return false; } $this->skip_whitespace(); if ( $this->bytes_already_parsed >= strlen( $this->html ) ) { + $this->parser_state = self::STATE_INCOMPLETE_INPUT; + return false; } @@ -1331,6 +1964,8 @@ class WP_HTML_Tag_Processor { ++$this->bytes_already_parsed; $this->skip_whitespace(); if ( $this->bytes_already_parsed >= strlen( $this->html ) ) { + $this->parser_state = self::STATE_INCOMPLETE_INPUT; + return false; } @@ -1357,6 +1992,8 @@ class WP_HTML_Tag_Processor { } if ( $attribute_end >= strlen( $this->html ) ) { + $this->parser_state = self::STATE_INCOMPLETE_INPUT; + return false; } @@ -1381,7 +2018,7 @@ class WP_HTML_Tag_Processor { $value_start, $value_length, $attribute_start, - $attribute_end, + $attribute_end - $attribute_start, ! 
$has_value ); @@ -1396,7 +2033,7 @@ class WP_HTML_Tag_Processor { * an array when encountering duplicates avoids needless allocations in the * normative case of parsing tags with no duplicate attributes. */ - $duplicate_span = new WP_HTML_Span( $attribute_start, $attribute_end ); + $duplicate_span = new WP_HTML_Span( $attribute_start, $attribute_end - $attribute_start ); if ( null === $this->duplicate_attributes ) { $this->duplicate_attributes = array( $comparable_name => array( $duplicate_span ) ); } elseif ( ! array_key_exists( $comparable_name, $this->duplicate_attributes ) ) { @@ -1423,12 +2060,54 @@ class WP_HTML_Tag_Processor { * @since 6.2.0 */ private function after_tag() { - $this->get_updated_html(); + /* + * There could be lexical updates enqueued for an attribute that + * also exists on the next tag. In order to avoid conflating the + * attributes across the two tags, lexical updates with names + * need to be flushed to raw lexical updates. + */ + $this->class_name_updates_to_attributes_updates(); + + /* + * Purge updates if there are too many. The actual count isn't + * scientific, but a few values from 100 to a few thousand were + * tests to find a practially-useful limit. + * + * If the update queue grows too big, then the Tag Processor + * will spend more time iterating through them and lose the + * efficiency gains of deferring applying them. + */ + if ( 1000 < count( $this->lexical_updates ) ) { + $this->get_updated_html(); + } + + foreach ( $this->lexical_updates as $name => $update ) { + /* + * Any updates appearing after the cursor should be applied + * before proceeding, otherwise they may be overlooked. 
+ */ + if ( $update->start >= $this->bytes_already_parsed ) { + $this->get_updated_html(); + break; + } + + if ( is_int( $name ) ) { + continue; + } + + $this->lexical_updates[] = $update; + unset( $this->lexical_updates[ $name ] ); + } + + $this->token_starts_at = null; + $this->token_length = null; $this->tag_name_starts_at = null; $this->tag_name_length = null; - $this->tag_ends_at = null; + $this->text_starts_at = 0; + $this->text_length = 0; $this->is_closing_tag = null; $this->attributes = array(); + $this->comment_type = null; $this->duplicate_attributes = null; } @@ -1606,10 +2285,10 @@ class WP_HTML_Tag_Processor { $bytes_already_copied = 0; $output_buffer = ''; foreach ( $this->lexical_updates as $diff ) { - $shift = strlen( $diff->text ) - ( $diff->end - $diff->start ); + $shift = strlen( $diff->text ) - $diff->length; // Adjust the cursor position by however much an update affects it. - if ( $diff->start <= $this->bytes_already_parsed ) { + if ( $diff->start < $this->bytes_already_parsed ) { $this->bytes_already_parsed += $shift; } @@ -1620,7 +2299,7 @@ class WP_HTML_Tag_Processor { $output_buffer .= substr( $this->html, $bytes_already_copied, $diff->start - $bytes_already_copied ); $output_buffer .= $diff->text; - $bytes_already_copied = $diff->end; + $bytes_already_copied = $diff->start + $diff->length; } $this->html = $output_buffer . substr( $this->html, $bytes_already_copied ); @@ -1630,6 +2309,8 @@ class WP_HTML_Tag_Processor { * replacements adjust offsets in the input document. */ foreach ( $this->bookmarks as $bookmark_name => $bookmark ) { + $bookmark_end = $bookmark->start + $bookmark->length; + /* * Each lexical update which appears before the bookmark's endpoints * might shift the offsets for those endpoints. 
Loop through each change @@ -1640,28 +2321,30 @@ class WP_HTML_Tag_Processor { $tail_delta = 0; foreach ( $this->lexical_updates as $diff ) { - if ( $bookmark->start < $diff->start && $bookmark->end < $diff->start ) { + $diff_end = $diff->start + $diff->length; + + if ( $bookmark->start < $diff->start && $bookmark_end < $diff->start ) { break; } - if ( $bookmark->start >= $diff->start && $bookmark->end < $diff->end ) { + if ( $bookmark->start >= $diff->start && $bookmark_end < $diff_end ) { $this->release_bookmark( $bookmark_name ); continue 2; } - $delta = strlen( $diff->text ) - ( $diff->end - $diff->start ); + $delta = strlen( $diff->text ) - $diff->length; if ( $bookmark->start >= $diff->start ) { $head_delta += $delta; } - if ( $bookmark->end >= $diff->end ) { + if ( $bookmark_end >= $diff_end ) { $tail_delta += $delta; } } - $bookmark->start += $head_delta; - $bookmark->end += $tail_delta; + $bookmark->start += $head_delta; + $bookmark->length += $tail_delta - $head_delta; } $this->lexical_updates = array(); @@ -1716,7 +2399,8 @@ class WP_HTML_Tag_Processor { // Point this tag processor before the sought tag opener and consume it. $this->bytes_already_parsed = $this->bookmarks[ $bookmark_name ]->start; - return $this->next_tag( array( 'tag_closers' => 'visit' ) ); + $this->parser_state = self::STATE_READY; + return $this->next_token(); } /** @@ -1743,7 +2427,7 @@ class WP_HTML_Tag_Processor { * This code should be unreachable, because it implies the two replacements * start at the same location and contain the same text. */ - return $a->end - $b->end; + return $a->length - $b->length; } /** @@ -1761,6 +2445,10 @@ class WP_HTML_Tag_Processor { * @return string|boolean|null Value of enqueued update if present, otherwise false. */ private function get_enqueued_attribute_value( $comparable_name ) { + if ( self::STATE_MATCHED_TAG !== $this->parser_state ) { + return false; + } + if ( ! 
isset( $this->lexical_updates[ $comparable_name ] ) ) { return false; } @@ -1828,7 +2516,7 @@ class WP_HTML_Tag_Processor { * @return string|true|null Value of attribute or `null` if not available. Boolean attributes return `true`. */ public function get_attribute( $name ) { - if ( null === $this->tag_name_starts_at ) { + if ( self::STATE_MATCHED_TAG !== $this->parser_state ) { return null; } @@ -1908,7 +2596,10 @@ class WP_HTML_Tag_Processor { * @return array|null List of attribute names, or `null` when no tag opener is matched. */ public function get_attribute_names_with_prefix( $prefix ) { - if ( $this->is_closing_tag || null === $this->tag_name_starts_at ) { + if ( + self::STATE_MATCHED_TAG !== $this->parser_state || + $this->is_closing_tag + ) { return null; } @@ -1946,7 +2637,18 @@ class WP_HTML_Tag_Processor { $tag_name = substr( $this->html, $this->tag_name_starts_at, $this->tag_name_length ); - return strtoupper( $tag_name ); + if ( self::STATE_MATCHED_TAG === $this->parser_state ) { + return strtoupper( $tag_name ); + } + + if ( + self::STATE_COMMENT === $this->parser_state && + self::COMMENT_AS_PI_NODE_LOOKALIKE === $this->get_comment_type() + ) { + return $tag_name; + } + + return null; } /** @@ -1967,11 +2669,19 @@ class WP_HTML_Tag_Processor { * @return bool Whether the currently matched tag contains the self-closing flag. */ public function has_self_closing_flag() { - if ( ! $this->tag_name_starts_at ) { + if ( self::STATE_MATCHED_TAG !== $this->parser_state ) { return false; } - return '/' === $this->html[ $this->tag_ends_at - 1 ]; + /* + * The self-closing flag is the solidus at the _end_ of the tag, not the beginning. + * + * Example: + * + * <figure /> + * ^ this appears one character before the end of the closing ">". + */ + return '/' === $this->html[ $this->token_starts_at + $this->token_length - 1 ]; } /** @@ -1991,7 +2701,191 @@ class WP_HTML_Tag_Processor { * @return bool Whether the current tag is a tag closer. 
*/ public function is_tag_closer() { - return $this->is_closing_tag; + return ( + self::STATE_MATCHED_TAG === $this->parser_state && + $this->is_closing_tag + ); + } + + /** + * Indicates the kind of matched token, if any. + * + * This differs from `get_token_name()` in that it always + * returns a static string indicating the type, whereas + * `get_token_name()` may return values derived from the + * token itself, such as a tag name or processing + * instruction tag. + * + * Possible values: + * - `#tag` when matched on a tag. + * - `#text` when matched on a text node. + * - `#cdata-section` when matched on a CDATA node. + * - `#comment` when matched on a comment. + * - `#doctype` when matched on a DOCTYPE declaration. + * - `#presumptuous-tag` when matched on an empty tag closer. + * - `#funky-comment` when matched on a funky comment. + * + * @since 6.5.0 + * + * @return string|null What kind of token is matched, or null. + */ + public function get_token_type() { + switch ( $this->parser_state ) { + case self::STATE_MATCHED_TAG: + return '#tag'; + + case self::STATE_DOCTYPE: + return '#doctype'; + + default: + return $this->get_token_name(); + } + } + + /** + * Returns the node name represented by the token. + * + * This matches the DOM API value `nodeName`. Some values + * are static, such as `#text` for a text node, while others + * are dynamically generated from the token itself. + * + * Dynamic names: + * - Uppercase tag name for tag matches. + * - `html` for DOCTYPE declarations. + * + * Note that if the Tag Processor is not matched on a token + * then this function will return `null`, either because it + * hasn't yet found a token or because it reached the end + * of the document without matching a token. + * + * @since 6.5.0 + * + * @return string|null Name of the matched token. 
+ */ + public function get_token_name() { + switch ( $this->parser_state ) { + case self::STATE_MATCHED_TAG: + return $this->get_tag(); + + case self::STATE_TEXT_NODE: + return '#text'; + + case self::STATE_CDATA_NODE: + return '#cdata-section'; + + case self::STATE_COMMENT: + return '#comment'; + + case self::STATE_DOCTYPE: + return 'html'; + + case self::STATE_PRESUMPTUOUS_TAG: + return '#presumptuous-tag'; + + case self::STATE_FUNKY_COMMENT: + return '#funky-comment'; + } + } + + /** + * Indicates what kind of comment produced the comment node. + * + * Because there are different kinds of HTML syntax which produce + * comments, the Tag Processor tracks and exposes this as a type + * for the comment. Nominally only regular HTML comments exist as + * they are commonly known, but a number of unrelated syntax errors + * also produce comments. + * + * @see self::COMMENT_AS_ABRUPTLY_CLOSED_COMMENT + * @see self::COMMENT_AS_CDATA_LOOKALIKE + * @see self::COMMENT_AS_INVALID_HTML + * @see self::COMMENT_AS_HTML_COMMENT + * @see self::COMMENT_AS_PI_NODE_LOOKALIKE + * + * @since 6.5.0 + * + * @return string|null + */ + public function get_comment_type() { + if ( self::STATE_COMMENT !== $this->parser_state ) { + return null; + } + + return $this->comment_type; + } + + /** + * Returns the modifiable text for a matched token, or an empty string. + * + * Modifiable text is text content that may be read and changed without + * changing the HTML structure of the document around it. This includes + * the contents of `#text` nodes in the HTML as well as the inner + * contents of HTML comments, Processing Instructions, and others, even + * though these nodes aren't part of a parsed DOM tree. They also contain + * the contents of SCRIPT and STYLE tags, of TEXTAREA tags, and of any + * other section in an HTML document which cannot contain HTML markup (DATA). + * + * If a token has no modifiable text then an empty string is returned to + * avoid needless crashing or type errors. 
An empty string does not mean + * that a token has modifiable text, and a token with modifiable text may + * have an empty string (e.g. a comment with no contents). + * + * @since 6.5.0 + * + * @return string + */ + public function get_modifiable_text() { + if ( null === $this->text_starts_at ) { + return ''; + } + + $text = substr( $this->html, $this->text_starts_at, $this->text_length ); + + // Comment data is not decoded. + if ( + self::STATE_CDATA_NODE === $this->parser_state || + self::STATE_COMMENT === $this->parser_state || + self::STATE_DOCTYPE === $this->parser_state || + self::STATE_FUNKY_COMMENT === $this->parser_state + ) { + return $text; + } + + $tag_name = $this->get_tag(); + if ( + // Script data is not decoded. + 'SCRIPT' === $tag_name || + + // RAWTEXT data is not decoded. + 'IFRAME' === $tag_name || + 'NOEMBED' === $tag_name || + 'NOFRAMES' === $tag_name || + 'STYLE' === $tag_name || + 'XMP' === $tag_name + ) { + return $text; + } + + $decoded = html_entity_decode( $text, ENT_QUOTES | ENT_HTML5 | ENT_SUBSTITUTE ); + + /* + * TEXTAREA skips a leading newline, but this newline may appear not only as the + * literal character `\n`, but also as a character reference, such as in the + * following markup: `<textarea>
Content</textarea>`. + * + * For these cases it's important to first decode the text content before checking + * for a leading newline and removing it. + */ + if ( + self::STATE_MATCHED_TAG === $this->parser_state && + 'TEXTAREA' === $tag_name && + strlen( $decoded ) > 0 && + "\n" === $decoded[0] + ) { + return substr( $decoded, 1 ); + } + + return $decoded; } /** @@ -2011,7 +2905,10 @@ class WP_HTML_Tag_Processor { * @return bool Whether an attribute value was set. */ public function set_attribute( $name, $value ) { - if ( $this->is_closing_tag || null === $this->tag_name_starts_at ) { + if ( + self::STATE_MATCHED_TAG !== $this->parser_state || + $this->is_closing_tag + ) { return false; } @@ -2031,8 +2928,8 @@ class WP_HTML_Tag_Processor { * * @see https://html.spec.whatwg.org/#attributes-2 * - * @TODO as the only regex pattern maybe we should take it out? are - * Unicode patterns available broadly in Core? + * @todo As the only regex pattern maybe we should take it out? + * Are Unicode patterns available broadly in Core? */ if ( preg_match( '~[' . @@ -2101,7 +2998,7 @@ class WP_HTML_Tag_Processor { $existing_attribute = $this->attributes[ $comparable_name ]; $this->lexical_updates[ $comparable_name ] = new WP_HTML_Text_Replacement( $existing_attribute->start, - $existing_attribute->end, + $existing_attribute->length, $updated_attribute ); } else { @@ -2119,7 +3016,7 @@ class WP_HTML_Tag_Processor { */ $this->lexical_updates[ $comparable_name ] = new WP_HTML_Text_Replacement( $this->tag_name_starts_at + $this->tag_name_length, - $this->tag_name_starts_at + $this->tag_name_length, + 0, ' ' . $updated_attribute ); } @@ -2144,7 +3041,10 @@ class WP_HTML_Tag_Processor { * @return bool Whether an attribute was removed. 
*/ public function remove_attribute( $name ) { - if ( $this->is_closing_tag ) { + if ( + self::STATE_MATCHED_TAG !== $this->parser_state || + $this->is_closing_tag + ) { return false; } @@ -2194,7 +3094,7 @@ class WP_HTML_Tag_Processor { */ $this->lexical_updates[ $name ] = new WP_HTML_Text_Replacement( $this->attributes[ $name ]->start, - $this->attributes[ $name ]->end, + $this->attributes[ $name ]->length, '' ); @@ -2203,7 +3103,7 @@ class WP_HTML_Tag_Processor { foreach ( $this->duplicate_attributes[ $name ] as $attribute_token ) { $this->lexical_updates[] = new WP_HTML_Text_Replacement( $attribute_token->start, - $attribute_token->end, + $attribute_token->length, '' ); } @@ -2221,13 +3121,14 @@ class WP_HTML_Tag_Processor { * @return bool Whether the class was set to be added. */ public function add_class( $class_name ) { - if ( $this->is_closing_tag ) { + if ( + self::STATE_MATCHED_TAG !== $this->parser_state || + $this->is_closing_tag + ) { return false; } - if ( null !== $this->tag_name_starts_at ) { - $this->classname_updates[ $class_name ] = self::ADD_CLASS; - } + $this->classname_updates[ $class_name ] = self::ADD_CLASS; return true; } @@ -2241,7 +3142,10 @@ class WP_HTML_Tag_Processor { * @return bool Whether the class was set to be removed. */ public function remove_class( $class_name ) { - if ( $this->is_closing_tag ) { + if ( + self::STATE_MATCHED_TAG !== $this->parser_state || + $this->is_closing_tag + ) { return false; } @@ -2289,7 +3193,7 @@ class WP_HTML_Tag_Processor { * Keep track of the position right before the current tag. This will * be necessary for reparsing the current tag after updating the HTML. */ - $before_current_tag = $this->tag_name_starts_at - 1; + $before_current_tag = $this->token_starts_at; /* * 1. Apply the enqueued edits and update all the pointers to reflect those changes. 
@@ -2318,15 +3222,7 @@ class WP_HTML_Tag_Processor { * └←─┘ back up by strlen("em") + 1 ==> 3 */ $this->bytes_already_parsed = $before_current_tag; - $this->parse_next_tag(); - // Reparse the attributes. - while ( $this->parse_next_attribute() ) { - continue; - } - - $tag_ends_at = strpos( $this->html, '>', $this->bytes_already_parsed ); - $this->tag_ends_at = $tag_ends_at; - $this->bytes_already_parsed = $tag_ends_at; + $this->base_class_next_token(); return $this->html; } @@ -2447,4 +3343,206 @@ class WP_HTML_Tag_Processor { return true; } + + /** + * Parser Ready State. + * + * Indicates that the parser is ready to run and waiting for a state transition. + * It may not have started yet, or it may have just finished parsing a token and + * is ready to find the next one. + * + * @since 6.5.0 + * + * @access private + */ + const STATE_READY = 'STATE_READY'; + + /** + * Parser Complete State. + * + * Indicates that the parser has reached the end of the document and there is + * nothing left to scan. It finished parsing the last token completely. + * + * @since 6.5.0 + * + * @access private + */ + const STATE_COMPLETE = 'STATE_COMPLETE'; + + /** + * Parser Incomplete Input State. + * + * Indicates that the parser has reached the end of the document before finishing + * a token. It started parsing a token but there is a possibility that the input + * HTML document was truncated in the middle of a token. + * + * The parser is reset at the start of the incomplete token and has paused. There + * is nothing more that can be scanned unless provided a more complete document. + * + * @since 6.5.0 + * + * @access private + */ + const STATE_INCOMPLETE_INPUT = 'STATE_INCOMPLETE_INPUT'; + + /** + * Parser Matched Tag State. + * + * Indicates that the parser has found an HTML tag and it's possible to get + * the tag name and read or modify its attributes (if it's not a closing tag). 
+ * + * @since 6.5.0 + * + * @access private + */ + const STATE_MATCHED_TAG = 'STATE_MATCHED_TAG'; + + /** + * Parser Text Node State. + * + * Indicates that the parser has found a text node and it's possible + * to read and modify that text. + * + * @since 6.5.0 + * + * @access private + */ + const STATE_TEXT_NODE = 'STATE_TEXT_NODE'; + + /** + * Parser CDATA Node State. + * + * Indicates that the parser has found a CDATA node and it's possible + * to read and modify its modifiable text. Note that in HTML there are + * no CDATA nodes outside of foreign content (SVG and MathML). Outside + * of foreign content, they are treated as HTML comments. + * + * @since 6.5.0 + * + * @access private + */ + const STATE_CDATA_NODE = 'STATE_CDATA_NODE'; + + /** + * Indicates that the parser has found an HTML comment and it's + * possible to read and modify its modifiable text. + * + * @since 6.5.0 + * + * @access private + */ + const STATE_COMMENT = 'STATE_COMMENT'; + + /** + * Indicates that the parser has found a DOCTYPE node and it's + * possible to read and modify its modifiable text. + * + * @since 6.5.0 + * + * @access private + */ + const STATE_DOCTYPE = 'STATE_DOCTYPE'; + + /** + * Indicates that the parser has found an empty tag closer `</>`. + * + * Note that in HTML there are no empty tag closers, and they + * are ignored. Nonetheless, the Tag Processor still + * recognizes them as they appear in the HTML stream. + * + * These were historically discussed as a "presumptuous tag + * closer," which would close the nearest open tag, but were + * dismissed in favor of explicitly-closing tags. + * + * @since 6.5.0 + * + * @access private + */ + const STATE_PRESUMPTUOUS_TAG = 'STATE_PRESUMPTUOUS_TAG'; + + /** + * Indicates that the parser has found a "funky comment" + * and it's possible to read and modify its modifiable text. + * + * Example: + * + * </%url> + * </{"wp-bit":"query/post-author"}> + * </2> + * + * Funky comments are tag closers with invalid tag names. 
Note + * that in HTML these are turned into bogus comments. Nonetheless, + * the Tag Processor recognizes them in a stream of HTML and + * exposes them for inspection and modification. + * + * @since 6.5.0 + * + * @access private + */ + const STATE_FUNKY_COMMENT = 'STATE_WP_FUNKY'; + + /** + * Indicates that a comment was created when encountering an abruptly-closed HTML comment. + * + * Example: + * + * <!--> + * <!---> + * + * @since 6.5.0 + */ + const COMMENT_AS_ABRUPTLY_CLOSED_COMMENT = 'COMMENT_AS_ABRUPTLY_CLOSED_COMMENT'; + + /** + * Indicates that a comment would be parsed as a CDATA node, + * were HTML to allow CDATA nodes outside of foreign content. + * + * Example: + * + * <![CDATA[This is a CDATA node.]]> + * + * This is an HTML comment, but it looks like a CDATA node. + * + * @since 6.5.0 + */ + const COMMENT_AS_CDATA_LOOKALIKE = 'COMMENT_AS_CDATA_LOOKALIKE'; + + /** + * Indicates that a comment was created when encountering + * normative HTML comment syntax. + * + * Example: + * + * <!-- this is a comment --> + * + * @since 6.5.0 + */ + const COMMENT_AS_HTML_COMMENT = 'COMMENT_AS_HTML_COMMENT'; + + /** + * Indicates that a comment would be parsed as a Processing + * Instruction node, were they to exist within HTML. + * + * Example: + * + * <?wp __( 'Like' ) ?> + * + * This is an HTML comment, but it looks like a Processing Instruction node. + * + * @since 6.5.0 + */ + const COMMENT_AS_PI_NODE_LOOKALIKE = 'COMMENT_AS_PI_NODE_LOOKALIKE'; + + /** + * Indicates that a comment was created when encountering invalid + * HTML input, a so-called "bogus comment." 
+ * + * Example: + * + * <?nothing special> + * <!{nothing special}> + * + * @since 6.5.0 + */ + const COMMENT_AS_INVALID_HTML = 'COMMENT_AS_INVALID_HTML'; } diff --git a/wp-includes/html-api/class-wp-html-text-replacement.php b/wp-includes/html-api/class-wp-html-text-replacement.php index 26b7bb2..4b8a6a6 100644 --- a/wp-includes/html-api/class-wp-html-text-replacement.php +++ b/wp-includes/html-api/class-wp-html-text-replacement.php @@ -15,6 +15,7 @@ * * @access private * @since 6.2.0 + * @since 6.5.0 Replace `end` with `length` to more closely match `substr()`. * * @see WP_HTML_Tag_Processor */ @@ -23,22 +24,25 @@ class WP_HTML_Text_Replacement { * Byte offset into document where replacement span begins. * * @since 6.2.0 + * * @var int */ public $start; /** - * Byte offset into document where replacement span ends. + * Byte length of span being replaced. + * + * @since 6.5.0 * - * @since 6.2.0 * @var int */ - public $end; + public $length; /** * Span of text to insert in document to replace existing content from start to end. * * @since 6.2.0 + * * @var string */ public $text; @@ -48,13 +52,13 @@ class WP_HTML_Text_Replacement { * * @since 6.2.0 * - * @param int $start Byte offset into document where replacement span begins. - * @param int $end Byte offset into document where replacement span ends. - * @param string $text Span of text to insert in document to replace existing content from start to end. + * @param int $start Byte offset into document where replacement span begins. + * @param int $length Byte length of span in document being replaced. + * @param string $text Span of text to insert in document to replace existing content from start to end. */ - public function __construct( $start, $end, $text ) { - $this->start = $start; - $this->end = $end; - $this->text = $text; + public function __construct( $start, $length, $text ) { + $this->start = $start; + $this->length = $length; + $this->text = $text; } } |