summary | refs | log | tree | commit | diff | stats
path: root/wp-includes/html-api
diff options
context:
space:
mode:
Diffstat (limited to 'wp-includes/html-api')
-rw-r--r-- wp-includes/html-api/class-wp-html-active-formatting-elements.php 2
-rw-r--r-- wp-includes/html-api/class-wp-html-attribute-token.php 38
-rw-r--r-- wp-includes/html-api/class-wp-html-open-elements.php 46
-rw-r--r-- wp-includes/html-api/class-wp-html-processor.php 712
-rw-r--r-- wp-includes/html-api/class-wp-html-span.php 19
-rw-r--r-- wp-includes/html-api/class-wp-html-tag-processor.php 1496
-rw-r--r-- wp-includes/html-api/class-wp-html-text-replacement.php 24
7 files changed, 1992 insertions, 345 deletions
diff --git a/wp-includes/html-api/class-wp-html-active-formatting-elements.php b/wp-includes/html-api/class-wp-html-active-formatting-elements.php
index 9598991..9f7fee9 100644
--- a/wp-includes/html-api/class-wp-html-active-formatting-elements.php
+++ b/wp-includes/html-api/class-wp-html-active-formatting-elements.php
@@ -105,7 +105,7 @@ class WP_HTML_Active_Formatting_Elements {
* > paired such that the two attributes in each pair have identical names, namespaces, and values
* > (the order of the attributes does not matter).
*
- * @TODO: Implement the "Noah's Ark clause" to only add up to three of any given kind of formatting elements to the stack.
+ * @todo Implement the "Noah's Ark clause" to only add up to three of any given kind of formatting elements to the stack.
*/
// > Add element to the list of active formatting elements.
$this->stack[] = $token;
diff --git a/wp-includes/html-api/class-wp-html-attribute-token.php b/wp-includes/html-api/class-wp-html-attribute-token.php
index f938609..74d4132 100644
--- a/wp-includes/html-api/class-wp-html-attribute-token.php
+++ b/wp-includes/html-api/class-wp-html-attribute-token.php
@@ -15,6 +15,7 @@
*
* @access private
* @since 6.2.0
+ * @since 6.5.0 Replaced `end` with `length` to more closely match `substr()`.
*
* @see WP_HTML_Tag_Processor
*/
@@ -23,6 +24,7 @@ class WP_HTML_Attribute_Token {
* Attribute name.
*
* @since 6.2.0
+ *
* @var string
*/
public $name;
@@ -31,6 +33,7 @@ class WP_HTML_Attribute_Token {
* Attribute value.
*
* @since 6.2.0
+ *
* @var int
*/
public $value_starts_at;
@@ -39,6 +42,7 @@ class WP_HTML_Attribute_Token {
* How many bytes the value occupies in the input HTML.
*
* @since 6.2.0
+ *
* @var int
*/
public $value_length;
@@ -47,22 +51,43 @@ class WP_HTML_Attribute_Token {
* The string offset where the attribute name starts.
*
* @since 6.2.0
+ *
* @var int
*/
public $start;
/**
- * The string offset after the attribute value or its name.
+ * Byte length of text spanning the attribute inside a tag.
+ *
+ * This span starts at the first character of the attribute name
+ * and it ends at one of three places:
+ *
+ * - at the end of the attribute name for boolean attributes.
+ * - at the end of the value for unquoted attributes.
+ * - at the final single or double quote for quoted attributes.
+ *
+ * Example:
+ *
+ * <div class="post">
+ * ------------ length is 12, including quotes
+ *
+ * <input type="checked" checked id="selector">
+ * ------- length is 7
+ *
+ * <a rel=noopener>
+ * ------------ length is 12
+ *
+ * @since 6.5.0 Replaced `end` with `length` to more closely match `substr()`.
*
- * @since 6.2.0
* @var int
*/
- public $end;
+ public $length;
/**
* Whether the attribute is a boolean attribute with value `true`.
*
* @since 6.2.0
+ *
* @var bool
*/
public $is_true;
@@ -71,20 +96,21 @@ class WP_HTML_Attribute_Token {
* Constructor.
*
* @since 6.2.0
+ * @since 6.5.0 Replaced `end` with `length` to more closely match `substr()`.
*
* @param string $name Attribute name.
* @param int $value_start Attribute value.
* @param int $value_length Number of bytes attribute value spans.
* @param int $start The string offset where the attribute name starts.
- * @param int $end The string offset after the attribute value or its name.
+ * @param int $length Byte length of the entire attribute name or name and value pair expression.
* @param bool $is_true Whether the attribute is a boolean attribute with true value.
*/
- public function __construct( $name, $value_start, $value_length, $start, $end, $is_true ) {
+ public function __construct( $name, $value_start, $value_length, $start, $length, $is_true ) {
$this->name = $name;
$this->value_starts_at = $value_start;
$this->value_length = $value_length;
$this->start = $start;
- $this->end = $end;
+ $this->length = $length;
$this->is_true = $is_true;
}
}
diff --git a/wp-includes/html-api/class-wp-html-open-elements.php b/wp-includes/html-api/class-wp-html-open-elements.php
index fe56255..1234abc 100644
--- a/wp-includes/html-api/class-wp-html-open-elements.php
+++ b/wp-includes/html-api/class-wp-html-open-elements.php
@@ -116,13 +116,20 @@ class WP_HTML_Open_Elements {
return true;
}
+ if (
+ '(internal: H1 through H6 - do not use)' === $tag_name &&
+ in_array( $node->node_name, array( 'H1', 'H2', 'H3', 'H4', 'H5', 'H6' ), true )
+ ) {
+ return true;
+ }
+
switch ( $node->node_name ) {
case 'HTML':
return false;
}
if ( in_array( $node->node_name, $termination_list, true ) ) {
- return true;
+ return false;
}
}
@@ -159,18 +166,22 @@ class WP_HTML_Open_Elements {
* Returns whether a particular element is in list item scope.
*
* @since 6.4.0
+ * @since 6.5.0 Implemented: no longer throws on every invocation.
*
* @see https://html.spec.whatwg.org/#has-an-element-in-list-item-scope
*
- * @throws WP_HTML_Unsupported_Exception Always until this function is implemented.
- *
* @param string $tag_name Name of tag to check.
* @return bool Whether given element is in scope.
*/
public function has_element_in_list_item_scope( $tag_name ) {
- throw new WP_HTML_Unsupported_Exception( 'Cannot process elements depending on list item scope.' );
-
- return false; // The linter requires this unreachable code until the function is implemented and can return.
+ return $this->has_element_in_specific_scope(
+ $tag_name,
+ array(
+ // There are more elements that belong here which aren't currently supported.
+ 'OL',
+ 'UL',
+ )
+ );
}
/**
@@ -270,6 +281,13 @@ class WP_HTML_Open_Elements {
foreach ( $this->walk_up() as $item ) {
$this->pop();
+ if (
+ '(internal: H1 through H6 - do not use)' === $tag_name &&
+ in_array( $item->node_name, array( 'H1', 'H2', 'H3', 'H4', 'H5', 'H6' ), true )
+ ) {
+ return true;
+ }
+
if ( $tag_name === $item->node_name ) {
return true;
}
@@ -361,10 +379,22 @@ class WP_HTML_Open_Elements {
* see WP_HTML_Open_Elements::walk_down().
*
* @since 6.4.0
+ * @since 6.5.0 Accepts $above_this_node to start traversal above a given node, if it exists.
+ *
+ * @param ?WP_HTML_Token $above_this_node Start traversing above this node, if provided and if the node exists.
*/
- public function walk_up() {
+ public function walk_up( $above_this_node = null ) {
+ $has_found_node = null === $above_this_node;
+
for ( $i = count( $this->stack ) - 1; $i >= 0; $i-- ) {
- yield $this->stack[ $i ];
+ $node = $this->stack[ $i ];
+
+ if ( ! $has_found_node ) {
+ $has_found_node = $node === $above_this_node;
+ continue;
+ }
+
+ yield $node;
}
}
diff --git a/wp-includes/html-api/class-wp-html-processor.php b/wp-includes/html-api/class-wp-html-processor.php
index f27f83b..c76cc19 100644
--- a/wp-includes/html-api/class-wp-html-processor.php
+++ b/wp-includes/html-api/class-wp-html-processor.php
@@ -99,12 +99,20 @@
*
* The following list specifies the HTML tags that _are_ supported:
*
+ * - Containers: ADDRESS, BLOCKQUOTE, DETAILS, DIALOG, DIV, FOOTER, HEADER, MAIN, MENU, SPAN, SUMMARY.
+ * - Custom elements: All custom elements are supported. :)
+ * - Form elements: BUTTON, DATALIST, FIELDSET, INPUT, LABEL, LEGEND, METER, PROGRESS, SEARCH.
+ * - Formatting elements: B, BIG, CODE, EM, FONT, I, PRE, SMALL, STRIKE, STRONG, TT, U, WBR.
+ * - Heading elements: H1, H2, H3, H4, H5, H6, HGROUP.
* - Links: A.
- * - The formatting elements: B, BIG, CODE, EM, FONT, I, SMALL, STRIKE, STRONG, TT, U.
- * - Containers: DIV, FIGCAPTION, FIGURE, SPAN.
- * - Form elements: BUTTON.
- * - Paragraph: P.
- * - Void elements: IMG.
+ * - Lists: DD, DL, DT, LI, OL, UL.
+ * - Media elements: AUDIO, CANVAS, EMBED, FIGCAPTION, FIGURE, IMG, MAP, PICTURE, SOURCE, TRACK, VIDEO.
+ * - Paragraph: BR, P.
+ * - Phrasing elements: ABBR, AREA, BDI, BDO, CITE, DATA, DEL, DFN, INS, MARK, OUTPUT, Q, SAMP, SUB, SUP, TIME, VAR.
+ * - Sectioning elements: ARTICLE, ASIDE, HR, NAV, SECTION.
+ * - Templating elements: SLOT.
+ * - Text decoration: RUBY.
+ * - Deprecated elements: ACRONYM, BLINK, CENTER, DIR, ISINDEX, KEYGEN, LISTING, MULTICOL, NEXTID, PARAM, SPACER.
*
* ### Supported markup
*
@@ -142,17 +150,6 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
const MAX_BOOKMARKS = 100;
/**
- * Static query for instructing the Tag Processor to visit every token.
- *
- * @access private
- *
- * @since 6.4.0
- *
- * @var array
- */
- const VISIT_EVERYTHING = array( 'tag_closers' => 'visit' );
-
- /**
* Holds the working state of the parser, including the stack of
* open elements and the stack of active formatting elements.
*
@@ -244,15 +241,15 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
return null;
}
- $p = new self( $html, self::CONSTRUCTOR_UNLOCK_CODE );
- $p->state->context_node = array( 'BODY', array() );
- $p->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_BODY;
+ $processor = new self( $html, self::CONSTRUCTOR_UNLOCK_CODE );
+ $processor->state->context_node = array( 'BODY', array() );
+ $processor->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_BODY;
- // @TODO: Create "fake" bookmarks for non-existent but implied nodes.
- $p->bookmarks['root-node'] = new WP_HTML_Span( 0, 0 );
- $p->bookmarks['context-node'] = new WP_HTML_Span( 0, 0 );
+ // @todo Create "fake" bookmarks for non-existent but implied nodes.
+ $processor->bookmarks['root-node'] = new WP_HTML_Span( 0, 0 );
+ $processor->bookmarks['context-node'] = new WP_HTML_Span( 0, 0 );
- $p->state->stack_of_open_elements->push(
+ $processor->state->stack_of_open_elements->push(
new WP_HTML_Token(
'root-node',
'HTML',
@@ -260,15 +257,15 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
)
);
- $p->state->stack_of_open_elements->push(
+ $processor->state->stack_of_open_elements->push(
new WP_HTML_Token(
'context-node',
- $p->state->context_node[0],
+ $processor->state->context_node[0],
false
)
);
- return $p;
+ return $processor;
}
/**
@@ -342,7 +339,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
/**
* Finds the next tag matching the $query.
*
- * @TODO: Support matching the class name and tag name.
+ * @todo Support matching the class name and tag name.
*
* @since 6.4.0
*
@@ -364,6 +361,10 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
public function next_tag( $query = null ) {
if ( null === $query ) {
while ( $this->step() ) {
+ if ( '#tag' !== $this->get_token_type() ) {
+ continue;
+ }
+
if ( ! $this->is_tag_closer() ) {
return true;
}
@@ -387,6 +388,10 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
if ( ! ( array_key_exists( 'breadcrumbs', $query ) && is_array( $query['breadcrumbs'] ) ) ) {
while ( $this->step() ) {
+ if ( '#tag' !== $this->get_token_type() ) {
+ continue;
+ }
+
if ( ! $this->is_tag_closer() ) {
return true;
}
@@ -408,6 +413,10 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
$match_offset = isset( $query['match_offset'] ) ? (int) $query['match_offset'] : 1;
while ( $match_offset > 0 && $this->step() ) {
+ if ( '#tag' !== $this->get_token_type() ) {
+ continue;
+ }
+
if ( $this->matches_breadcrumbs( $breadcrumbs ) && 0 === --$match_offset ) {
return true;
}
@@ -417,6 +426,24 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
}
/**
+ * Ensures internal accounting is maintained for HTML semantic rules while
+ * the underlying Tag Processor class is seeking to a bookmark.
+ *
+ * This doesn't currently have a way to represent non-tags and doesn't process
+ * semantic rules for text nodes. For access to the raw tokens consider using
+ * WP_HTML_Tag_Processor instead.
+ *
+ * @since 6.5.0 Added for internal support; do not use.
+ *
+ * @access private
+ *
+ * @return bool
+ */
+ public function next_token() {
+ return $this->step();
+ }
+
+ /**
* Indicates if the currently-matched tag matches the given breadcrumbs.
*
* A "*" represents a single tag wildcard, where any tag matches, but not no tags.
@@ -442,10 +469,6 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
* @return bool Whether the currently-matched tag is found at the given nested structure.
*/
public function matches_breadcrumbs( $breadcrumbs ) {
- if ( ! $this->get_tag() ) {
- return false;
- }
-
// Everything matches when there are zero constraints.
if ( 0 === count( $breadcrumbs ) ) {
return true;
@@ -492,7 +515,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
return false;
}
- if ( self::PROCESS_NEXT_NODE === $node_to_process ) {
+ if ( self::REPROCESS_CURRENT_NODE !== $node_to_process ) {
/*
* Void elements still hop onto the stack of open elements even though
* there's no corresponding closing tag. This is important for managing
@@ -502,28 +525,42 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
* When moving on to the next node, therefore, if the bottom-most element
* on the stack is a void element, it must be closed.
*
- * @TODO: Once self-closing foreign elements and BGSOUND are supported,
+ * @todo Once self-closing foreign elements and BGSOUND are supported,
* they must also be implicitly closed here too. BGSOUND is
* special since it's only self-closing if the self-closing flag
* is provided in the opening tag, otherwise it expects a tag closer.
*/
$top_node = $this->state->stack_of_open_elements->current_node();
- if ( $top_node && self::is_void( $top_node->node_name ) ) {
+ if (
+ $top_node && (
+ // Void elements.
+ self::is_void( $top_node->node_name ) ||
+ // Comments, text nodes, and other atomic tokens.
+ '#' === $top_node->node_name[0] ||
+ // Doctype declarations.
+ 'html' === $top_node->node_name
+ )
+ ) {
$this->state->stack_of_open_elements->pop();
}
+ }
- parent::next_tag( self::VISIT_EVERYTHING );
+ if ( self::PROCESS_NEXT_NODE === $node_to_process ) {
+ parent::next_token();
}
// Finish stepping when there are no more tokens in the document.
- if ( null === $this->get_tag() ) {
+ if (
+ WP_HTML_Tag_Processor::STATE_INCOMPLETE_INPUT === $this->parser_state ||
+ WP_HTML_Tag_Processor::STATE_COMPLETE === $this->parser_state
+ ) {
return false;
}
$this->state->current_token = new WP_HTML_Token(
- $this->bookmark_tag(),
- $this->get_tag(),
- $this->is_tag_closer(),
+ $this->bookmark_token(),
+ $this->get_token_name(),
+ $this->has_self_closing_flag(),
$this->release_internal_bookmark_on_destruct
);
@@ -551,9 +588,9 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
* Breadcrumbs start at the outermost parent and descend toward the matched element.
* They always include the entire path from the root HTML node to the matched element.
*
- * @TODO: It could be more efficient to expose a generator-based version of this function
- * to avoid creating the array copy on tag iteration. If this is done, it would likely
- * be more useful to walk up the stack when yielding instead of starting at the top.
+ * @todo It could be more efficient to expose a generator-based version of this function
+ * to avoid creating the array copy on tag iteration. If this is done, it would likely
+ * be more useful to walk up the stack when yielding instead of starting at the top.
*
* Example
*
@@ -566,10 +603,6 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
* @return string[]|null Array of tag names representing path to matched node, if matched, otherwise NULL.
*/
public function get_breadcrumbs() {
- if ( ! $this->get_tag() ) {
- return null;
- }
-
$breadcrumbs = array();
foreach ( $this->state->stack_of_open_elements->walk_down() as $stack_item ) {
$breadcrumbs[] = $stack_item->node_name;
@@ -594,17 +627,67 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
* @return bool Whether an element was found.
*/
private function step_in_body() {
- $tag_name = $this->get_tag();
- $op_sigil = $this->is_tag_closer() ? '-' : '+';
- $op = "{$op_sigil}{$tag_name}";
+ $token_name = $this->get_token_name();
+ $token_type = $this->get_token_type();
+ $op_sigil = '#tag' === $token_type ? ( $this->is_tag_closer() ? '-' : '+' ) : '';
+ $op = "{$op_sigil}{$token_name}";
switch ( $op ) {
+ case '#comment':
+ case '#funky-comment':
+ case '#presumptuous-tag':
+ $this->insert_html_element( $this->state->current_token );
+ return true;
+
+ case '#text':
+ $this->reconstruct_active_formatting_elements();
+
+ $current_token = $this->bookmarks[ $this->state->current_token->bookmark_name ];
+
+ /*
+ * > A character token that is U+0000 NULL
+ *
+ * A text token comprising only NULL bytes is ignored here. Note
+ * that the active formatting elements were already reconstructed
+ * above, before this check runs, so an all-NULL token still
+ * triggers reconstruction even though the token itself is then
+ * dropped. Any other text continues on to be inserted below.
+ */
+ if (
+ 1 <= $current_token->length &&
+ "\x00" === $this->html[ $current_token->start ] &&
+ strspn( $this->html, "\x00", $current_token->start, $current_token->length ) === $current_token->length
+ ) {
+ // Parse error: ignore the token.
+ return $this->step();
+ }
+
+ /*
+ * Whitespace-only text does not affect the frameset-ok flag.
+ * It is probably inter-element whitespace, but it may also
+ * contain character references which decode only to whitespace.
+ */
+ $text = $this->get_modifiable_text();
+ if ( strlen( $text ) !== strspn( $text, " \t\n\f\r" ) ) {
+ $this->state->frameset_ok = false;
+ }
+
+ $this->insert_html_element( $this->state->current_token );
+ return true;
+
+ case 'html':
+ /*
+ * > A DOCTYPE token
+ * > Parse error. Ignore the token.
+ */
+ return $this->step();
+
/*
* > A start tag whose tag name is "button"
*/
case '+BUTTON':
if ( $this->state->stack_of_open_elements->has_element_in_scope( 'BUTTON' ) ) {
- // @TODO: Indicate a parse error once it's possible. This error does not impact the logic here.
+ // @todo Indicate a parse error once it's possible. This error does not impact the logic here.
$this->generate_implied_end_tags();
$this->state->stack_of_open_elements->pop_until( 'BUTTON' );
}
@@ -621,11 +704,31 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
* > "fieldset", "figcaption", "figure", "footer", "header", "hgroup",
* > "main", "menu", "nav", "ol", "p", "search", "section", "summary", "ul"
*/
+ case '+ADDRESS':
+ case '+ARTICLE':
+ case '+ASIDE':
case '+BLOCKQUOTE':
+ case '+CENTER':
+ case '+DETAILS':
+ case '+DIALOG':
+ case '+DIR':
case '+DIV':
+ case '+DL':
+ case '+FIELDSET':
case '+FIGCAPTION':
case '+FIGURE':
+ case '+FOOTER':
+ case '+HEADER':
+ case '+HGROUP':
+ case '+MAIN':
+ case '+MENU':
+ case '+NAV':
+ case '+OL':
case '+P':
+ case '+SEARCH':
+ case '+SECTION':
+ case '+SUMMARY':
+ case '+UL':
if ( $this->state->stack_of_open_elements->has_p_in_button_scope() ) {
$this->close_a_p_element();
}
@@ -639,22 +742,213 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
* > "figcaption", "figure", "footer", "header", "hgroup", "listing", "main",
* > "menu", "nav", "ol", "pre", "search", "section", "summary", "ul"
*/
+ case '-ADDRESS':
+ case '-ARTICLE':
+ case '-ASIDE':
case '-BLOCKQUOTE':
case '-BUTTON':
+ case '-CENTER':
+ case '-DETAILS':
+ case '-DIALOG':
+ case '-DIR':
case '-DIV':
+ case '-DL':
+ case '-FIELDSET':
case '-FIGCAPTION':
case '-FIGURE':
- if ( ! $this->state->stack_of_open_elements->has_element_in_scope( $tag_name ) ) {
- // @TODO: Report parse error.
+ case '-FOOTER':
+ case '-HEADER':
+ case '-HGROUP':
+ case '-LISTING':
+ case '-MAIN':
+ case '-MENU':
+ case '-NAV':
+ case '-OL':
+ case '-PRE':
+ case '-SEARCH':
+ case '-SECTION':
+ case '-SUMMARY':
+ case '-UL':
+ if ( ! $this->state->stack_of_open_elements->has_element_in_scope( $token_name ) ) {
+ // @todo Report parse error.
// Ignore the token.
return $this->step();
}
$this->generate_implied_end_tags();
- if ( $this->state->stack_of_open_elements->current_node()->node_name !== $tag_name ) {
- // @TODO: Record parse error: this error doesn't impact parsing.
+ if ( $this->state->stack_of_open_elements->current_node()->node_name !== $token_name ) {
+ // @todo Record parse error: this error doesn't impact parsing.
+ }
+ $this->state->stack_of_open_elements->pop_until( $token_name );
+ return true;
+
+ /*
+ * > A start tag whose tag name is one of: "h1", "h2", "h3", "h4", "h5", "h6"
+ */
+ case '+H1':
+ case '+H2':
+ case '+H3':
+ case '+H4':
+ case '+H5':
+ case '+H6':
+ if ( $this->state->stack_of_open_elements->has_p_in_button_scope() ) {
+ $this->close_a_p_element();
+ }
+
+ if (
+ in_array(
+ $this->state->stack_of_open_elements->current_node()->node_name,
+ array( 'H1', 'H2', 'H3', 'H4', 'H5', 'H6' ),
+ true
+ )
+ ) {
+ // @todo Indicate a parse error once it's possible.
+ $this->state->stack_of_open_elements->pop();
+ }
+
+ $this->insert_html_element( $this->state->current_token );
+ return true;
+
+ /*
+ * > A start tag whose tag name is one of: "pre", "listing"
+ */
+ case '+PRE':
+ case '+LISTING':
+ if ( $this->state->stack_of_open_elements->has_p_in_button_scope() ) {
+ $this->close_a_p_element();
+ }
+ $this->insert_html_element( $this->state->current_token );
+ $this->state->frameset_ok = false;
+ return true;
+
+ /*
+ * > An end tag whose tag name is one of: "h1", "h2", "h3", "h4", "h5", "h6"
+ */
+ case '-H1':
+ case '-H2':
+ case '-H3':
+ case '-H4':
+ case '-H5':
+ case '-H6':
+ if ( ! $this->state->stack_of_open_elements->has_element_in_scope( '(internal: H1 through H6 - do not use)' ) ) {
+ /*
+ * This is a parse error; ignore the token.
+ *
+ * @todo Indicate a parse error once it's possible.
+ */
+ return $this->step();
+ }
+
+ $this->generate_implied_end_tags();
+
+ if ( $this->state->stack_of_open_elements->current_node()->node_name !== $token_name ) {
+ // @todo Record parse error: this error doesn't impact parsing.
}
- $this->state->stack_of_open_elements->pop_until( $tag_name );
+
+ $this->state->stack_of_open_elements->pop_until( '(internal: H1 through H6 - do not use)' );
+ return true;
+
+ /*
+ * > A start tag whose tag name is "li"
+ * > A start tag whose tag name is one of: "dd", "dt"
+ */
+ case '+DD':
+ case '+DT':
+ case '+LI':
+ $this->state->frameset_ok = false;
+ $node = $this->state->stack_of_open_elements->current_node();
+ $is_li = 'LI' === $token_name;
+
+ in_body_list_loop:
+ /*
+ * The logic for LI and DT/DD is the same except for one point: LI elements _only_
+ * close other LI elements, but a DT or DD element closes _any_ open DT or DD element.
+ */
+ if ( $is_li ? 'LI' === $node->node_name : ( 'DD' === $node->node_name || 'DT' === $node->node_name ) ) {
+ $node_name = $is_li ? 'LI' : $node->node_name;
+ $this->generate_implied_end_tags( $node_name );
+ if ( $node_name !== $this->state->stack_of_open_elements->current_node()->node_name ) {
+ // @todo Indicate a parse error once it's possible. This error does not impact the logic here.
+ }
+
+ $this->state->stack_of_open_elements->pop_until( $node_name );
+ goto in_body_list_done;
+ }
+
+ if (
+ 'ADDRESS' !== $node->node_name &&
+ 'DIV' !== $node->node_name &&
+ 'P' !== $node->node_name &&
+ $this->is_special( $node->node_name )
+ ) {
+ /*
+ * > If node is in the special category, but is not an address, div,
+ * > or p element, then jump to the step labeled done below.
+ */
+ goto in_body_list_done;
+ } else {
+ /*
+ * > Otherwise, set node to the previous entry in the stack of open elements
+ * > and return to the step labeled loop.
+ */
+ foreach ( $this->state->stack_of_open_elements->walk_up( $node ) as $item ) {
+ $node = $item;
+ break;
+ }
+ goto in_body_list_loop;
+ }
+
+ in_body_list_done:
+ if ( $this->state->stack_of_open_elements->has_p_in_button_scope() ) {
+ $this->close_a_p_element();
+ }
+
+ $this->insert_html_element( $this->state->current_token );
+ return true;
+
+ /*
+ * > An end tag whose tag name is "li"
+ * > An end tag whose tag name is one of: "dd", "dt"
+ */
+ case '-DD':
+ case '-DT':
+ case '-LI':
+ if (
+ /*
+ * An end tag whose tag name is "li":
+ * If the stack of open elements does not have an li element in list item scope,
+ * then this is a parse error; ignore the token.
+ */
+ (
+ 'LI' === $token_name &&
+ ! $this->state->stack_of_open_elements->has_element_in_list_item_scope( 'LI' )
+ ) ||
+ /*
+ * An end tag whose tag name is one of: "dd", "dt":
+ * If the stack of open elements does not have an element in scope that is an
+ * HTML element with the same tag name as that of the token, then this is a
+ * parse error; ignore the token.
+ */
+ (
+ 'LI' !== $token_name &&
+ ! $this->state->stack_of_open_elements->has_element_in_scope( $token_name )
+ )
+ ) {
+ /*
+ * This is a parse error, ignore the token.
+ *
+ * @todo Indicate a parse error once it's possible.
+ */
+ return $this->step();
+ }
+
+ $this->generate_implied_end_tags( $token_name );
+
+ if ( $token_name !== $this->state->stack_of_open_elements->current_node()->node_name ) {
+ // @todo Indicate a parse error once it's possible. This error does not impact the logic here.
+ }
+
+ $this->state->stack_of_open_elements->pop_until( $token_name );
return true;
/*
@@ -730,47 +1024,174 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
return true;
/*
+ * > An end tag whose tag name is "br"
+ * > Parse error. Drop the attributes from the token, and act as described in the next
+ * > entry; i.e. act as if this was a "br" start tag token with no attributes, rather
+ * > than the end tag token that it actually is.
+ */
+ case '-BR':
+ $this->last_error = self::ERROR_UNSUPPORTED;
+ throw new WP_HTML_Unsupported_Exception( 'Closing BR tags require unimplemented special handling.' );
+
+ /*
* > A start tag whose tag name is one of: "area", "br", "embed", "img", "keygen", "wbr"
*/
+ case '+AREA':
+ case '+BR':
+ case '+EMBED':
case '+IMG':
+ case '+KEYGEN':
+ case '+WBR':
$this->reconstruct_active_formatting_elements();
$this->insert_html_element( $this->state->current_token );
+ $this->state->frameset_ok = false;
return true;
/*
- * > Any other start tag
+ * > A start tag whose tag name is "input"
*/
- case '+SPAN':
+ case '+INPUT':
$this->reconstruct_active_formatting_elements();
$this->insert_html_element( $this->state->current_token );
+ $type_attribute = $this->get_attribute( 'type' );
+ /*
+ * > If the token does not have an attribute with the name "type", or if it does,
+ * > but that attribute's value is not an ASCII case-insensitive match for the
+ * > string "hidden", then: set the frameset-ok flag to "not ok".
+ */
+ if ( ! is_string( $type_attribute ) || 'hidden' !== strtolower( $type_attribute ) ) {
+ $this->state->frameset_ok = false;
+ }
return true;
/*
- * Any other end tag
+ * > A start tag whose tag name is "hr"
*/
- case '-SPAN':
- foreach ( $this->state->stack_of_open_elements->walk_up() as $item ) {
- // > If node is an HTML element with the same tag name as the token, then:
- if ( $item->node_name === $tag_name ) {
- $this->generate_implied_end_tags( $tag_name );
+ case '+HR':
+ if ( $this->state->stack_of_open_elements->has_p_in_button_scope() ) {
+ $this->close_a_p_element();
+ }
+ $this->insert_html_element( $this->state->current_token );
+ $this->state->frameset_ok = false;
+ return true;
- // > If node is not the current node, then this is a parse error.
+ /*
+ * > A start tag whose tag name is one of: "param", "source", "track"
+ */
+ case '+PARAM':
+ case '+SOURCE':
+ case '+TRACK':
+ $this->insert_html_element( $this->state->current_token );
+ return true;
+ }
- $this->state->stack_of_open_elements->pop_until( $tag_name );
- return true;
- }
+ /*
+ * These tags require special handling in the 'in body' insertion mode
+ * but that handling hasn't yet been implemented.
+ *
+ * As the rules for each tag are implemented, the corresponding tag
+ * name should be removed from this list. An accompanying test should
+ * help ensure this list is maintained.
+ *
+ * @see Tests_HtmlApi_WpHtmlProcessor::test_step_in_body_fails_on_unsupported_tags
+ *
+ * Since this switch structure throws a WP_HTML_Unsupported_Exception, it's
+ * possible to handle "any other start tag" and "any other end tag" below,
+ * as that guarantees execution doesn't proceed for the unimplemented tags.
+ *
+ * @see https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inbody
+ */
+ switch ( $token_name ) {
+ case 'APPLET':
+ case 'BASE':
+ case 'BASEFONT':
+ case 'BGSOUND':
+ case 'BODY':
+ case 'CAPTION':
+ case 'COL':
+ case 'COLGROUP':
+ case 'FORM':
+ case 'FRAME':
+ case 'FRAMESET':
+ case 'HEAD':
+ case 'HTML':
+ case 'IFRAME':
+ case 'LINK':
+ case 'MARQUEE':
+ case 'MATH':
+ case 'META':
+ case 'NOBR':
+ case 'NOEMBED':
+ case 'NOFRAMES':
+ case 'NOSCRIPT':
+ case 'OBJECT':
+ case 'OPTGROUP':
+ case 'OPTION':
+ case 'PLAINTEXT':
+ case 'RB':
+ case 'RP':
+ case 'RT':
+ case 'RTC':
+ case 'SARCASM':
+ case 'SCRIPT':
+ case 'SELECT':
+ case 'STYLE':
+ case 'SVG':
+ case 'TABLE':
+ case 'TBODY':
+ case 'TD':
+ case 'TEMPLATE':
+ case 'TEXTAREA':
+ case 'TFOOT':
+ case 'TH':
+ case 'THEAD':
+ case 'TITLE':
+ case 'TR':
+ case 'XMP':
+ $this->last_error = self::ERROR_UNSUPPORTED;
+ throw new WP_HTML_Unsupported_Exception( "Cannot process {$token_name} element." );
+ }
- // > Otherwise, if node is in the special category, then this is a parse error; ignore the token, and return.
- if ( self::is_special( $item->node_name ) ) {
- return $this->step();
- }
+ if ( ! $this->is_tag_closer() ) {
+ /*
+ * > Any other start tag
+ */
+ $this->reconstruct_active_formatting_elements();
+ $this->insert_html_element( $this->state->current_token );
+ return true;
+ } else {
+ /*
+ * > Any other end tag
+ */
+
+ /*
+ * Find the corresponding tag opener in the stack of open elements, if
+ * it exists before reaching a special element, which provides a kind
+ * of boundary in the stack. For example, a `</custom-tag>` should not
+ * close anything beyond its containing `P` or `DIV` element.
+ */
+ foreach ( $this->state->stack_of_open_elements->walk_up() as $node ) {
+ if ( $token_name === $node->node_name ) {
+ break;
}
- // Execution should not reach here; if it does then something went wrong.
- return false;
- default:
- $this->last_error = self::ERROR_UNSUPPORTED;
- throw new WP_HTML_Unsupported_Exception( "Cannot process {$tag_name} element." );
+ if ( self::is_special( $node->node_name ) ) {
+ // This is a parse error, ignore the token.
+ return $this->step();
+ }
+ }
+
+ $this->generate_implied_end_tags( $token_name );
+ if ( $node !== $this->state->stack_of_open_elements->current_node() ) {
+ // @todo Record parse error: this error doesn't impact parsing.
+ }
+
+ foreach ( $this->state->stack_of_open_elements->walk_up() as $item ) {
+ $this->state->stack_of_open_elements->pop();
+ if ( $node === $item ) {
+ return true;
+ }
+ }
}
}
@@ -779,19 +1200,16 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
*/
/**
- * Creates a new bookmark for the currently-matched tag and returns the generated name.
+ * Creates a new bookmark for the currently-matched token and returns the generated name.
*
* @since 6.4.0
+ * @since 6.5.0 Renamed from bookmark_tag() to bookmark_token().
*
* @throws Exception When unable to allocate requested bookmark.
*
* @return string|false Name of created bookmark, or false if unable to create.
*/
- private function bookmark_tag() {
- if ( ! $this->get_tag() ) {
- return false;
- }
-
+ private function bookmark_token() {
if ( ! parent::set_bookmark( ++$this->bookmark_counter ) ) {
$this->last_error = self::ERROR_EXCEEDED_MAX_BOOKMARKS;
throw new Exception( 'could not allocate bookmark' );
@@ -863,6 +1281,10 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
/**
* Moves the internal cursor in the HTML Processor to a given bookmark's location.
*
+ * Be careful! Seeking backwards to a previous location resets the parser to the
+ * start of the document and reparses the entire contents up until it finds the
+ * sought-after bookmarked location.
+ *
* In order to prevent accidental infinite loops, there's a
* maximum limit on the number of times seek() can be called.
*
@@ -874,6 +1296,9 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
* @return bool Whether the internal cursor was successfully moved to the bookmark's location.
*/
public function seek( $bookmark_name ) {
+ // Flush any pending updates to the document before beginning.
+ $this->get_updated_html();
+
$actual_bookmark_name = "_{$bookmark_name}";
$processor_started_at = $this->state->current_token
? $this->bookmarks[ $this->state->current_token->bookmark_name ]->start
@@ -881,44 +1306,73 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
$bookmark_starts_at = $this->bookmarks[ $actual_bookmark_name ]->start;
$direction = $bookmark_starts_at > $processor_started_at ? 'forward' : 'backward';
- switch ( $direction ) {
- case 'forward':
- // When moving forwards, re-parse the document until reaching the same location as the original bookmark.
- while ( $this->step() ) {
- if ( $bookmark_starts_at === $this->bookmarks[ $this->state->current_token->bookmark_name ]->start ) {
- return true;
- }
+ /*
+ * If seeking backwards, it's possible that the sought-after bookmark exists within an element
+ * which has been closed before the current cursor; in other words, it has already been removed
+ * from the stack of open elements. This means that it's insufficient to simply pop off elements
+ * from the stack of open elements which appear after the bookmarked location and then jump to
+ * that location, as the elements which were open before won't be re-opened.
+ *
+ * In order to maintain consistency, the HTML Processor rewinds to the start of the document
+ * and reparses everything until it finds the sought-after bookmark.
+ *
+ * There are potentially better ways to do this: cache the parser state for each bookmark and
+ * restore it when seeking; store an immutable and idempotent register of where elements open
+ * and close.
+ *
+ * If caching the parser state it will be essential to properly maintain the cached stack of
+ * open elements and active formatting elements when modifying the document. This could be a
+ * tedious and time-consuming process as well, and so for now will not be performed.
+ *
+ * It may be possible to track bookmarks for where elements open and close, and in doing so
+ * be able to quickly recalculate breadcrumbs for any element in the document. It may even
+ * be possible to remove the stack of open elements and compute it on the fly this way.
+ * If doing this, the parser would need to track the opening and closing locations for all
+ * tokens in the breadcrumb path for any and all bookmarks. By utilizing bookmarks themselves
+ * this list could be automatically maintained while modifying the document. Finding the
+ * breadcrumbs would then amount to traversing that list from the start until the token
+ * being inspected. Once an element closes, if there are no bookmarks pointing to locations
+ * within that element, then all of these locations may be forgotten to save on memory use
+ * and computation time.
+ */
+ if ( 'backward' === $direction ) {
+ /*
+ * Instead of clearing the parser state and starting fresh, calling the stack methods
+ * maintains the proper flags in the parser.
+ */
+ foreach ( $this->state->stack_of_open_elements->walk_up() as $item ) {
+ if ( 'context-node' === $item->bookmark_name ) {
+ break;
}
- return false;
-
- case 'backward':
- /*
- * When moving backwards, clear out all existing stack entries which appear after the destination
- * bookmark. These could be stored for later retrieval, but doing so would require additional
- * memory overhead and also demand that references and bookmarks are updated as the document
- * changes. In time this could be a valuable optimization, but it's okay to give up that
- * optimization in exchange for more CPU time to recompute the stack, to re-parse the
- * document that may have already been parsed once.
- */
- foreach ( $this->state->stack_of_open_elements->walk_up() as $item ) {
- if ( $bookmark_starts_at >= $this->bookmarks[ $item->bookmark_name ]->start ) {
- break;
- }
+ $this->state->stack_of_open_elements->remove_node( $item );
+ }
- $this->state->stack_of_open_elements->remove_node( $item );
+ foreach ( $this->state->active_formatting_elements->walk_up() as $item ) {
+ if ( 'context-node' === $item->bookmark_name ) {
+ break;
}
- foreach ( $this->state->active_formatting_elements->walk_up() as $item ) {
- if ( $bookmark_starts_at >= $this->bookmarks[ $item->bookmark_name ]->start ) {
- break;
- }
+ $this->state->active_formatting_elements->remove_node( $item );
+ }
- $this->state->active_formatting_elements->remove_node( $item );
- }
+ parent::seek( 'context-node' );
+ $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_BODY;
+ $this->state->frameset_ok = true;
+ }
- return parent::seek( $actual_bookmark_name );
+ // When moving forwards, reparse the document until reaching the same location as the original bookmark.
+ if ( $bookmark_starts_at === $this->bookmarks[ $this->state->current_token->bookmark_name ]->start ) {
+ return true;
}
+
+ while ( $this->step() ) {
+ if ( $bookmark_starts_at === $this->bookmarks[ $this->state->current_token->bookmark_name ]->start ) {
+ return true;
+ }
+ }
+
+ return false;
}
/**
@@ -1005,6 +1459,18 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
return parent::set_bookmark( "_{$bookmark_name}" );
}
+ /**
+ * Checks whether a bookmark with the given name exists.
+ *
+ * The given name is prefixed with an underscore before the lookup is
+ * delegated to the parent class, matching the prefix this class applies
+ * to the name when it sets a bookmark or seeks to one.
+ *
+ * @since 6.5.0
+ *
+ * @param string $bookmark_name Name to identify a bookmark that potentially exists.
+ * @return bool Whether that bookmark exists.
+ */
+ public function has_bookmark( $bookmark_name ) {
+ return parent::has_bookmark( "_{$bookmark_name}" );
+ }
+
/*
* HTML Parsing Algorithms
*/
@@ -1034,6 +1500,9 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
*/
private function generate_implied_end_tags( $except_for_this_element = null ) {
$elements_with_implied_end_tags = array(
+ 'DD',
+ 'DT',
+ 'LI',
'P',
);
@@ -1059,6 +1528,9 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
*/
private function generate_implied_end_tags_thoroughly() {
$elements_with_implied_end_tags = array(
+ 'DD',
+ 'DT',
+ 'LI',
'P',
);
@@ -1170,7 +1642,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
// > If formatting element is not in the stack of open elements, then this is a parse error; remove the element from the list, and return.
if ( ! $this->state->stack_of_open_elements->contains_node( $formatting_element ) ) {
- $this->state->active_formatting_elements->remove_node( $formatting_element->bookmark_name );
+ $this->state->active_formatting_elements->remove_node( $formatting_element );
return;
}
@@ -1373,14 +1845,19 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
return (
'AREA' === $tag_name ||
'BASE' === $tag_name ||
+ 'BASEFONT' === $tag_name || // Obsolete but still treated as void.
+ 'BGSOUND' === $tag_name || // Obsolete but still treated as void.
'BR' === $tag_name ||
'COL' === $tag_name ||
'EMBED' === $tag_name ||
+ 'FRAME' === $tag_name ||
'HR' === $tag_name ||
'IMG' === $tag_name ||
'INPUT' === $tag_name ||
+ 'KEYGEN' === $tag_name || // Obsolete but still treated as void.
'LINK' === $tag_name ||
'META' === $tag_name ||
+ 'PARAM' === $tag_name || // Obsolete but still treated as void.
'SOURCE' === $tag_name ||
'TRACK' === $tag_name ||
'WBR' === $tag_name
@@ -1410,6 +1887,15 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
const REPROCESS_CURRENT_NODE = 'reprocess-current-node';
/**
+ * Indicates that the current HTML token should be processed without advancing the parser.
+ *
+ * @since 6.5.0
+ *
+ * @var string
+ */
+ const PROCESS_CURRENT_NODE = 'process-current-node';
+
+ /**
* Indicates that the parser encountered unsupported markup and has bailed.
*
* @since 6.4.0
diff --git a/wp-includes/html-api/class-wp-html-span.php b/wp-includes/html-api/class-wp-html-span.php
index 46227eb..b1ab865 100644
--- a/wp-includes/html-api/class-wp-html-span.php
+++ b/wp-includes/html-api/class-wp-html-span.php
@@ -18,6 +18,7 @@
*
* @access private
* @since 6.2.0
+ * @since 6.5.0 Replaced `end` with `length` to more closely align with `substr()`.
*
* @see WP_HTML_Tag_Processor
*/
@@ -26,28 +27,30 @@ class WP_HTML_Span {
* Byte offset into document where span begins.
*
* @since 6.2.0
+ *
* @var int
*/
public $start;
/**
- * Byte offset into document where span ends.
+ * Byte length of this span.
+ *
+ * @since 6.5.0
*
- * @since 6.2.0
* @var int
*/
- public $end;
+ public $length;
/**
* Constructor.
*
* @since 6.2.0
*
- * @param int $start Byte offset into document where replacement span begins.
- * @param int $end Byte offset into document where replacement span ends.
+ * @param int $start Byte offset into document where replacement span begins.
+ * @param int $length Byte length of span.
*/
- public function __construct( $start, $end ) {
- $this->start = $start;
- $this->end = $end;
+ public function __construct( $start, $length ) {
+ $this->start = $start;
+ $this->length = $length;
}
}
diff --git a/wp-includes/html-api/class-wp-html-tag-processor.php b/wp-includes/html-api/class-wp-html-tag-processor.php
index 0572c46..c540ea9 100644
--- a/wp-includes/html-api/class-wp-html-tag-processor.php
+++ b/wp-includes/html-api/class-wp-html-tag-processor.php
@@ -15,9 +15,6 @@
* - Prune the whitespace when removing classes/attributes: e.g. "a b c" -> "c" not " c".
* This would increase the size of the changes for some operations but leave more
* natural-looking output HTML.
- * - Decode HTML character references within class names when matching. E.g. match having
- * class `1<"2` needs to recognize `class="1&lt;&quot;2"`. Currently the Tag Processor
- * will fail to find the right tag if the class name is encoded as such.
* - Properly decode HTML character references in `get_attribute()`. PHP's
* `html_entity_decode()` is wrong in a couple ways: it doesn't account for the
* no-ambiguous-ampersand rule, and it improperly handles the way semicolons may
@@ -107,6 +104,56 @@
* given, it will return `true` (the only way to set `false` for an
* attribute is to remove it).
*
+ * #### When matching fails
+ *
+ * When `next_tag()` returns `false` it could mean different things:
+ *
+ * - The requested tag wasn't found in the input document.
+ * - The input document ended in the middle of an HTML syntax element.
+ *
+ * When a document ends in the middle of a syntax element it will pause
+ * the processor. This is to make it possible in the future to extend the
+ * input document and proceed - an important requirement for chunked
+ * streaming parsing of a document.
+ *
+ * Example:
+ *
+ * $processor = new WP_HTML_Tag_Processor( 'This <div is="a" partial="token' );
+ * false === $processor->next_tag();
+ *
+ * If a special element (see next section) is encountered but no closing tag
+ * is found it will count as an incomplete tag. The parser will pause as if
+ * the opening tag were incomplete.
+ *
+ * Example:
+ *
+ * $processor = new WP_HTML_Tag_Processor( '<style>// there could be more styling to come' );
+ * false === $processor->next_tag();
+ *
+ * $processor = new WP_HTML_Tag_Processor( '<style>// this is everything</style><div>' );
+ * true === $processor->next_tag( 'DIV' );
+ *
+ * #### Special elements
+ *
+ * Some HTML elements are handled in a special way; their start and end tags
+ * act like a void tag. These are special because their contents can't contain
+ * HTML markup. Everything inside these elements is handled in a special way
+ * and content that _appears_ like HTML tags inside of them isn't. There can
+ * be no nesting in these elements.
+ *
+ * In the following list, "raw text" means that all of the content in the HTML
+ * until the matching closing tag is treated verbatim without any replacements
+ * and without any parsing.
+ *
+ * - IFRAME allows no content but requires a closing tag.
+ * - NOEMBED (deprecated) content is raw text.
+ * - NOFRAMES (deprecated) content is raw text.
+ * - SCRIPT content is plaintext apart from legacy rules allowing `</script>` inside an HTML comment.
+ * - STYLE content is raw text.
+ * - TITLE content is plain text but character references are decoded.
+ * - TEXTAREA content is plain text but character references are decoded.
+ * - XMP (deprecated) content is raw text.
+ *
* ### Modifying HTML attributes for a found tag
*
* Once you've found the start of an opening tag you can modify
@@ -200,6 +247,95 @@
* }
* }
*
+ * ## Tokens and finer-grained processing.
+ *
+ * It's possible to scan through every lexical token in the
+ * HTML document using the `next_token()` function. This
+ * alternative form takes no argument and provides no built-in
+ * query syntax.
+ *
+ * Example:
+ *
+ * $title = '(untitled)';
+ * $text = '';
+ * while ( $processor->next_token() ) {
+ * switch ( $processor->get_token_name() ) {
+ * case '#text':
+ * $text .= $processor->get_modifiable_text();
+ * break;
+ *
+ * case 'BR':
+ * $text .= "\n";
+ * break;
+ *
+ * case 'TITLE':
+ * $title = $processor->get_modifiable_text();
+ * break;
+ * }
+ * }
+ * return trim( "# {$title}\n\n{$text}" );
+ *
+ * ### Tokens and _modifiable text_.
+ *
+ * #### Special "atomic" HTML elements.
+ *
+ * Not all HTML elements are able to contain other elements inside of them.
+ * For instance, the contents inside a TITLE element are plaintext (except
+ * that character references like &amp; will be decoded). This means that
+ * if the string `<img>` appears inside a TITLE element, then it's not an
+ * image tag, but rather it's text describing an image tag. Likewise, the
+ * contents of a SCRIPT or STYLE element are handled entirely separately in
+ * a browser than the contents of other elements because they represent a
+ * different language than HTML.
+ *
+ * For these elements the Tag Processor treats the entire sequence as one,
+ * from the opening tag, including its contents, through its closing tag.
+ * This means that it's not possible to match the closing tag for a
+ * SCRIPT element unless it's unexpected; the Tag Processor already matched
+ * it when it found the opening tag.
+ *
+ * The inner contents of these elements are that element's _modifiable text_.
+ *
+ * The special elements are:
+ * - `SCRIPT` whose contents are treated as raw plaintext but supports a legacy
+ * style of including Javascript inside of HTML comments to avoid accidentally
+ * closing the SCRIPT from inside a Javascript string. E.g. `console.log( '</script>' )`.
+ * - `TITLE` and `TEXTAREA` whose contents are treated as plaintext and then any
+ * character references are decoded. E.g. `1 &lt; 2 < 3` becomes `1 < 2 < 3`.
+ * - `IFRAME`, `NOEMBED`, `NOFRAMES`, `STYLE`, `XMP` whose contents are treated as
+ * raw plaintext and left as-is. E.g. `1 &lt; 2 < 3` remains `1 &lt; 2 < 3`.
+ *
+ * #### Other tokens with modifiable text.
+ *
+ * There are also non-elements which are void/self-closing in nature and contain
+ * modifiable text that is part of that individual syntax token itself.
+ *
+ * - `#text` nodes, whose entire token _is_ the modifiable text.
+ * - HTML comments and tokens that become comments due to some syntax error. The
+ * text for these tokens is the portion of the comment inside of the syntax.
+ * E.g. for `<!-- comment -->` the text is `" comment "` (note the spaces are included).
+ * - `CDATA` sections, whose text is the content inside of the section itself. E.g. for
+ * `<![CDATA[some content]]>` the text is `"some content"` (with restrictions [1]).
+ * - "Funky comments," which are a special case of invalid closing tags whose name is
+ * invalid. The text for these nodes is the text that a browser would transform into
+ * an HTML comment when parsing. E.g. for `</%post_author>` the text is `%post_author`.
+ * - `DOCTYPE` declarations like `<!DOCTYPE html>` which have no closing tag.
+ * - XML Processing instruction nodes like `<?wp __( "Like" ); ?>` (with restrictions [2]).
+ * - The empty end tag `</>` which is ignored in the browser and DOM.
+ *
+ * [1]: There are no CDATA sections in HTML. When encountering `<![CDATA[`, everything
+ * until the next `>` becomes a bogus HTML comment, meaning there can be no CDATA
+ * section in an HTML document containing `>`. The Tag Processor will first find
+ * all valid and bogus HTML comments, and then if the comment _would_ have been a
+ * CDATA section _were they to exist_, it will indicate this as the type of comment.
+ *
+ * [2]: XML allows a broader range of characters in a processing instruction's target name
+ * and disallows "xml" as a name, since it's special. The Tag Processor only recognizes
+ * target names with an ASCII-representable subset of characters. It also exhibits the
+ * same constraint as with CDATA sections, in that `>` cannot exist within the token
+ * since Processing Instructions do not exist within HTML and their syntax transforms
+ * into a bogus comment in the DOM.
+ *
* ## Design and limitations
*
* The Tag Processor is designed to linearly scan HTML documents and tokenize
@@ -241,9 +377,40 @@
* double-quoted strings, meaning that attributes on input with single-quoted or
* unquoted values will appear in the output with double-quotes.
*
+ * ### Scripting Flag
+ *
+ * The Tag Processor parses HTML with the "scripting flag" disabled. This means
+ * that it doesn't run any scripts while parsing the page. In a browser with
+ * JavaScript enabled, for example, the script can change the parse of the
+ * document as it loads. On the server, however, evaluating JavaScript is not
+ * only impractical, but also unwanted.
+ *
+ * Practically this means that the Tag Processor will descend into NOSCRIPT
+ * elements and process its child tags. Were the scripting flag enabled, such
+ * as in a typical browser, the contents of NOSCRIPT are skipped entirely.
+ *
+ * This allows the HTML API to process the content that will be presented in
+ * a browser when scripting is disabled, but it offers a different view of a
+ * page than most browser sessions will experience. E.g. the tags inside the
+ * NOSCRIPT disappear.
+ *
+ * ### Text Encoding
+ *
+ * The Tag Processor assumes that the input HTML document is encoded with a
+ * text encoding compatible with 7-bit ASCII's '<', '>', '&', ';', '/', '=',
+ * "'", '"', 'a' - 'z', 'A' - 'Z', and the whitespace characters ' ', tab,
+ * carriage-return, newline, and form-feed.
+ *
+ * In practice, this includes almost every single-byte encoding as well as
+ * UTF-8. Notably, however, it does not include UTF-16. If providing input
+ * that's incompatible, then convert the encoding beforehand.
+ *
* @since 6.2.0
* @since 6.2.1 Fix: Support for various invalid comments; attribute updates are case-insensitive.
* @since 6.3.2 Fix: Skip HTML-like content inside rawtext elements such as STYLE.
+ * @since 6.5.0 Pauses processor when input ends in an incomplete syntax token.
+ * Introduces "special" elements which act like void elements, e.g. TITLE, STYLE.
+ * Allows scanning through all tokens and processing modifiable text, where applicable.
*/
class WP_HTML_Tag_Processor {
/**
@@ -317,6 +484,51 @@ class WP_HTML_Tag_Processor {
private $stop_on_tag_closers;
/**
+ * Specifies mode of operation of the parser at any given time.
+ *
+ * | State | Meaning |
+ * | ----------------|----------------------------------------------------------------------|
+ * | *Ready* | The parser is ready to run. |
+ * | *Complete* | There is nothing left to parse. |
+ * | *Incomplete* | The HTML ended in the middle of a token; nothing more can be parsed. |
+ * | *Matched tag* | Found an HTML tag; it's possible to modify its attributes. |
+ * | *Text node* | Found a #text node; this is plaintext and modifiable. |
+ * | *CDATA node* | Found a CDATA section; this is modifiable. |
+ * | *Comment* | Found a comment or bogus comment; this is modifiable. |
+ * | *Presumptuous* | Found an empty tag closer: `</>`. |
+ * | *Funky comment* | Found a tag closer with an invalid tag name; this is modifiable. |
+ *
+ * @since 6.5.0
+ *
+ * @see WP_HTML_Tag_Processor::STATE_READY
+ * @see WP_HTML_Tag_Processor::STATE_COMPLETE
+ * @see WP_HTML_Tag_Processor::STATE_INCOMPLETE_INPUT
+ * @see WP_HTML_Tag_Processor::STATE_MATCHED_TAG
+ * @see WP_HTML_Tag_Processor::STATE_TEXT_NODE
+ * @see WP_HTML_Tag_Processor::STATE_CDATA_NODE
+ * @see WP_HTML_Tag_Processor::STATE_COMMENT
+ * @see WP_HTML_Tag_Processor::STATE_DOCTYPE
+ * @see WP_HTML_Tag_Processor::STATE_PRESUMPTUOUS_TAG
+ * @see WP_HTML_Tag_Processor::STATE_FUNKY_COMMENT
+ *
+ * @var string
+ */
+ protected $parser_state = self::STATE_READY;
+
+ /**
+ * What kind of syntax token became an HTML comment.
+ *
+ * Since there are many ways in which HTML syntax can create an HTML comment,
+ * this indicates which of those caused it. This allows the Tag Processor to
+ * represent more from the original input document than would appear in the DOM.
+ *
+ * @since 6.5.0
+ *
+ * @var string|null
+ */
+ protected $comment_type = null;
+
+ /**
* How many bytes from the original HTML document have been read and parsed.
*
* This value points to the latest byte offset in the input document which
@@ -329,6 +541,40 @@ class WP_HTML_Tag_Processor {
private $bytes_already_parsed = 0;
/**
+ * Byte offset in input document where current token starts.
+ *
+ * Example:
+ *
+ * <div id="test">...
+ * 01234
+ * - token starts at 0
+ *
+ * @since 6.5.0
+ *
+ * @var int|null
+ */
+ private $token_starts_at;
+
+ /**
+ * Byte length of current token.
+ *
+ * Example:
+ *
+ * <div id="test">...
+ * 012345678901234
+ * - token length is 14 - 0 = 14
+ *
+ * a <!-- comment --> is a token.
+ * 0123456789 123456789 123456789
+ * - token length is 17 - 2 = 15
+ *
+ * @since 6.5.0
+ *
+ * @var int|null
+ */
+ private $token_length;
+
+ /**
* Byte offset in input document where current tag name starts.
*
* Example:
@@ -338,6 +584,7 @@ class WP_HTML_Tag_Processor {
* - tag name starts at 1
*
* @since 6.2.0
+ *
* @var int|null
*/
private $tag_name_starts_at;
@@ -352,24 +599,28 @@ class WP_HTML_Tag_Processor {
* --- tag name length is 3
*
* @since 6.2.0
+ *
* @var int|null
*/
private $tag_name_length;
/**
- * Byte offset in input document where current tag token ends.
+ * Byte offset into input document where current modifiable text starts.
*
- * Example:
+ * @since 6.5.0
*
- * <div id="test">...
- * 0 1 |
- * 01234567890123456
- * --- tag name ends at 14
+ * @var int
+ */
+ private $text_starts_at;
+
+ /**
+ * Byte length of modifiable text.
*
- * @since 6.2.0
- * @var int|null
+ * @since 6.5.0
+ *
+ * @var int
*/
- private $tag_ends_at;
+ private $text_length;
/**
* Whether the current tag is an opening tag, e.g. <div>, or a closing tag, e.g. </div>.
@@ -388,14 +639,14 @@ class WP_HTML_Tag_Processor {
* // <div id="test-4" class=outline title="data:text/plain;base64=asdk3nk1j3fo8">
* // ^ parsing will continue from this point.
* $this->attributes = array(
- * 'id' => new WP_HTML_Attribute_Match( 'id', null, 6, 17 )
+ * 'id' => new WP_HTML_Attribute_Token( 'id', 9, 6, 5, 11, false )
* );
*
* // When picking up parsing again, or when asking to find the
* // `class` attribute we will continue and add to this array.
* $this->attributes = array(
- * 'id' => new WP_HTML_Attribute_Match( 'id', null, 6, 17 ),
- * 'class' => new WP_HTML_Attribute_Match( 'class', 'outline', 18, 32 )
+ * 'id' => new WP_HTML_Attribute_Token( 'id', 9, 6, 5, 11, false ),
+ * 'class' => new WP_HTML_Attribute_Token( 'class', 23, 7, 17, 13, false )
* );
*
* // Note that only the `class` attribute value is stored in the index.
@@ -484,9 +735,9 @@ class WP_HTML_Tag_Processor {
*
* // Replace an attribute stored with a new value, indices
* // sourced from the lazily-parsed HTML recognizer.
- * $start = $attributes['src']->start;
- * $end = $attributes['src']->end;
- * $modifications[] = new WP_HTML_Text_Replacement( $start, $end, $new_value );
+ * $start = $attributes['src']->start;
+ * $length = $attributes['src']->length;
+ * $modifications[] = new WP_HTML_Text_Replacement( $start, $length, $new_value );
*
* // Correspondingly, something like this will appear in this array.
* $lexical_updates = array(
@@ -523,6 +774,7 @@ class WP_HTML_Tag_Processor {
* Finds the next tag matching the $query.
*
* @since 6.2.0
+ * @since 6.5.0 No longer processes incomplete tokens at end of document; pauses the processor at start of token.
*
* @param array|string|null $query {
* Optional. Which tag name to find, having which class, etc. Default is to find any tag.
@@ -541,90 +793,253 @@ class WP_HTML_Tag_Processor {
$already_found = 0;
do {
- if ( $this->bytes_already_parsed >= strlen( $this->html ) ) {
+ if ( false === $this->next_token() ) {
return false;
}
- // Find the next tag if it exists.
- if ( false === $this->parse_next_tag() ) {
- $this->bytes_already_parsed = strlen( $this->html );
-
- return false;
- }
-
- // Parse all of its attributes.
- while ( $this->parse_next_attribute() ) {
+ if ( self::STATE_MATCHED_TAG !== $this->parser_state ) {
continue;
}
- // Ensure that the tag closes before the end of the document.
- if ( $this->bytes_already_parsed >= strlen( $this->html ) ) {
- return false;
+ if ( $this->matches() ) {
+ ++$already_found;
}
+ } while ( $already_found < $this->sought_match_offset );
- $tag_ends_at = strpos( $this->html, '>', $this->bytes_already_parsed );
- if ( false === $tag_ends_at ) {
- return false;
- }
- $this->tag_ends_at = $tag_ends_at;
- $this->bytes_already_parsed = $tag_ends_at;
+ return true;
+ }
- // Finally, check if the parsed tag and its attributes match the search query.
- if ( $this->matches() ) {
- ++$already_found;
+ /**
+ * Finds the next token in the HTML document.
+ *
+ * An HTML document can be viewed as a stream of tokens,
+ * where tokens are things like HTML tags, HTML comments,
+ * text nodes, etc. This method finds the next token in
+ * the HTML document and returns whether it found one.
+ *
+ * If it starts parsing a token and reaches the end of the
+ * document then it will seek to the start of the last
+ * token and pause, returning `false` to indicate that it
+ * failed to find a complete token.
+ *
+ * Possible token types, based on the HTML specification:
+ *
+ * - an HTML tag, whether opening, closing, or void.
+ * - a text node - the plaintext inside tags.
+ * - an HTML comment.
+ * - a DOCTYPE declaration.
+ * - a processing instruction, e.g. `<?xml version="1.0" ?>`.
+ *
+ * Unlike `next_tag()`, which skips every token that is not a tag,
+ * this method stops on each token it finds.
+ *
+ * This public method only delegates to the private implementation, which
+ * exists so that the base class can advance without triggering a subclass
+ * override of `next_token()`.
+ *
+ * @since 6.5.0
+ *
+ * @return bool Whether a token was parsed.
+ */
+ public function next_token() {
+ return $this->base_class_next_token();
+ }
+
+ /**
+ * Internal method which finds the next token in the HTML document.
+ *
+ * This method is a protected internal function which implements the logic for
+ * finding the next token in a document. It exists so that the parser can update
+ * its state without affecting the location of the cursor in the document and
+ * without triggering subclass methods for things like `next_token()`, e.g. when
+ * applying patches before searching for the next token.
+ *
+ * @since 6.5.0
+ *
+ * @access private
+ *
+ * @return bool Whether a token was parsed.
+ */
+ private function base_class_next_token() {
+ $was_at = $this->bytes_already_parsed;
+ $this->after_tag();
+
+ // Don't proceed if there's nothing more to scan.
+ if (
+ self::STATE_COMPLETE === $this->parser_state ||
+ self::STATE_INCOMPLETE_INPUT === $this->parser_state
+ ) {
+ return false;
+ }
+
+ /*
+ * The next step in the parsing loop determines the parsing state;
+ * clear it so that state doesn't linger from the previous step.
+ */
+ $this->parser_state = self::STATE_READY;
+
+ if ( $this->bytes_already_parsed >= strlen( $this->html ) ) {
+ $this->parser_state = self::STATE_COMPLETE;
+ return false;
+ }
+
+ // Find the next tag if it exists.
+ if ( false === $this->parse_next_tag() ) {
+ if ( self::STATE_INCOMPLETE_INPUT === $this->parser_state ) {
+ $this->bytes_already_parsed = $was_at;
}
+ return false;
+ }
+
+ /*
+ * For legacy reasons the rest of this function handles tags and their
+ * attributes. If the processor has reached the end of the document
+ * or if it matched any other token then it should return here to avoid
+ * attempting to process tag-specific syntax.
+ */
+ if (
+ self::STATE_INCOMPLETE_INPUT !== $this->parser_state &&
+ self::STATE_COMPLETE !== $this->parser_state &&
+ self::STATE_MATCHED_TAG !== $this->parser_state
+ ) {
+ return true;
+ }
+
+ // Parse all of its attributes.
+ while ( $this->parse_next_attribute() ) {
+ continue;
+ }
+
+ // Ensure that the tag closes before the end of the document.
+ if (
+ self::STATE_INCOMPLETE_INPUT === $this->parser_state ||
+ $this->bytes_already_parsed >= strlen( $this->html )
+ ) {
+ // Does this appropriately clear state (parsed attributes)?
+ $this->parser_state = self::STATE_INCOMPLETE_INPUT;
+ $this->bytes_already_parsed = $was_at;
+
+ return false;
+ }
+
+ $tag_ends_at = strpos( $this->html, '>', $this->bytes_already_parsed );
+ if ( false === $tag_ends_at ) {
+ $this->parser_state = self::STATE_INCOMPLETE_INPUT;
+ $this->bytes_already_parsed = $was_at;
+
+ return false;
+ }
+ $this->parser_state = self::STATE_MATCHED_TAG;
+ $this->token_length = $tag_ends_at - $this->token_starts_at;
+ $this->bytes_already_parsed = $tag_ends_at + 1;
+
+ /*
+ * For non-DATA sections which might contain text that looks like HTML tags but
+ * isn't, scan with the appropriate alternative mode. Looking at the first letter
+ * of the tag name as a pre-check avoids a string allocation when it's not needed.
+ */
+ $t = $this->html[ $this->tag_name_starts_at ];
+ if (
+ $this->is_closing_tag ||
+ ! (
+ 'i' === $t || 'I' === $t ||
+ 'n' === $t || 'N' === $t ||
+ 's' === $t || 'S' === $t ||
+ 't' === $t || 'T' === $t ||
+ 'x' === $t || 'X' === $t
+ )
+ ) {
+ return true;
+ }
+
+ $tag_name = $this->get_tag();
+
+ /*
+ * Preserve the opening tag pointers, as these will be overwritten
+ * when finding the closing tag. They will be reset after finding
+ * the closing tag to point to the opening of the special atomic
+ * tag sequence.
+ */
+ $tag_name_starts_at = $this->tag_name_starts_at;
+ $tag_name_length = $this->tag_name_length;
+ $tag_ends_at = $this->token_starts_at + $this->token_length;
+ $attributes = $this->attributes;
+ $duplicate_attributes = $this->duplicate_attributes;
+
+ // Find the closing tag if necessary.
+ $found_closer = false;
+ switch ( $tag_name ) {
+ case 'SCRIPT':
+ $found_closer = $this->skip_script_data();
+ break;
+
+ case 'TEXTAREA':
+ case 'TITLE':
+ $found_closer = $this->skip_rcdata( $tag_name );
+ break;
+
/*
- * For non-DATA sections which might contain text that looks like HTML tags but
- * isn't, scan with the appropriate alternative mode. Looking at the first letter
- * of the tag name as a pre-check avoids a string allocation when it's not needed.
+ * In the browser this list would include the NOSCRIPT element,
+ * but the Tag Processor is an environment with the scripting
+ * flag disabled, meaning that it needs to descend into the
+ * NOSCRIPT element to be able to properly process what will be
+ * sent to a browser.
+ *
+ * Note that this rule makes HTML5 syntax incompatible with XML,
+ * because the parsing of this token depends on client application.
+ * The NOSCRIPT element cannot be represented in the XHTML syntax.
*/
- $t = $this->html[ $this->tag_name_starts_at ];
- if (
- ! $this->is_closing_tag &&
- (
- 'i' === $t || 'I' === $t ||
- 'n' === $t || 'N' === $t ||
- 's' === $t || 'S' === $t ||
- 't' === $t || 'T' === $t
- ) ) {
- $tag_name = $this->get_tag();
-
- if ( 'SCRIPT' === $tag_name && ! $this->skip_script_data() ) {
- $this->bytes_already_parsed = strlen( $this->html );
- return false;
- } elseif (
- ( 'TEXTAREA' === $tag_name || 'TITLE' === $tag_name ) &&
- ! $this->skip_rcdata( $tag_name )
- ) {
- $this->bytes_already_parsed = strlen( $this->html );
- return false;
- } elseif (
- (
- 'IFRAME' === $tag_name ||
- 'NOEMBED' === $tag_name ||
- 'NOFRAMES' === $tag_name ||
- 'NOSCRIPT' === $tag_name ||
- 'STYLE' === $tag_name
- ) &&
- ! $this->skip_rawtext( $tag_name )
- ) {
- /*
- * "XMP" should be here too but its rules are more complicated and require the
- * complexity of the HTML Processor (it needs to close out any open P element,
- * meaning it can't be skipped here or else the HTML Processor will lose its
- * place). For now, it can be ignored as it's a rare HTML tag in practice and
- * any normative HTML should be using PRE instead.
- */
- $this->bytes_already_parsed = strlen( $this->html );
- return false;
- }
- }
- } while ( $already_found < $this->sought_match_offset );
+ case 'IFRAME':
+ case 'NOEMBED':
+ case 'NOFRAMES':
+ case 'STYLE':
+ case 'XMP':
+ $found_closer = $this->skip_rawtext( $tag_name );
+ break;
+
+ // No other tags should be treated in their entirety here.
+ default:
+ return true;
+ }
+
+ if ( ! $found_closer ) {
+ $this->parser_state = self::STATE_INCOMPLETE_INPUT;
+ $this->bytes_already_parsed = $was_at;
+ return false;
+ }
+
+ /*
+ * The values here look like they reference the opening tag but they reference
+ * the closing tag instead. This is why the opening tag values were stored
+ * above in a variable. It reads confusingly here, but that's because the
+ * functions that skip the contents have moved all the internal cursors past
+ * the inner content of the tag.
+ */
+ $this->token_starts_at = $was_at;
+ $this->token_length = $this->bytes_already_parsed - $this->token_starts_at;
+ $this->text_starts_at = $tag_ends_at + 1;
+ $this->text_length = $this->tag_name_starts_at - $this->text_starts_at;
+ $this->tag_name_starts_at = $tag_name_starts_at;
+ $this->tag_name_length = $tag_name_length;
+ $this->attributes = $attributes;
+ $this->duplicate_attributes = $duplicate_attributes;
return true;
}
+ /**
+ * Whether the processor paused because the input HTML document ended
+ * in the middle of a syntax element, such as in the middle of a tag.
+ *
+ * Example:
+ *
+ * $processor = new WP_HTML_Tag_Processor( '<input type="text" value="Th' );
+ * false === $processor->next_tag();
+ * true === $processor->paused_at_incomplete_token();
+ *
+ * @since 6.5.0
+ *
+ * @return bool Whether the parse paused at the start of an incomplete token.
+ */
+ public function paused_at_incomplete_token() {
+ return self::STATE_INCOMPLETE_INPUT === $this->parser_state;
+ }
/**
* Generator for a foreach loop to step through each class name for the matched tag.
@@ -643,6 +1058,10 @@ class WP_HTML_Tag_Processor {
* @since 6.4.0
*/
public function class_list() {
+ if ( self::STATE_MATCHED_TAG !== $this->parser_state ) {
+ return;
+ }
+
/** @var string $class contains the string value of the class attribute, with character references decoded. */
$class = $this->get_attribute( 'class' );
@@ -698,7 +1117,7 @@ class WP_HTML_Tag_Processor {
* @return bool|null Whether the matched tag contains the given class name, or null if not matched.
*/
public function has_class( $wanted_class ) {
- if ( ! $this->tag_name_starts_at ) {
+ if ( self::STATE_MATCHED_TAG !== $this->parser_state ) {
return null;
}
@@ -795,7 +1214,11 @@ class WP_HTML_Tag_Processor {
* @return bool Whether the bookmark was successfully created.
*/
public function set_bookmark( $name ) {
- if ( null === $this->tag_name_starts_at ) {
+ // It only makes sense to set a bookmark if the parser has paused on a concrete token.
+ if (
+ self::STATE_COMPLETE === $this->parser_state ||
+ self::STATE_INCOMPLETE_INPUT === $this->parser_state
+ ) {
return false;
}
@@ -808,10 +1231,7 @@ class WP_HTML_Tag_Processor {
return false;
}
- $this->bookmarks[ $name ] = new WP_HTML_Span(
- $this->tag_name_starts_at - ( $this->is_closing_tag ? 2 : 1 ),
- $this->tag_ends_at
- );
+ $this->bookmarks[ $name ] = new WP_HTML_Span( $this->token_starts_at, $this->token_length );
return true;
}
@@ -873,16 +1293,15 @@ class WP_HTML_Tag_Processor {
$at = $this->bytes_already_parsed;
while ( false !== $at && $at < $doc_length ) {
- $at = strpos( $this->html, '</', $at );
+ $at = strpos( $this->html, '</', $at );
+ $this->tag_name_starts_at = $at;
- // If there is no possible tag closer then fail.
+ // Fail if there is no possible tag closer.
if ( false === $at || ( $at + $tag_length ) >= $doc_length ) {
- $this->bytes_already_parsed = $doc_length;
return false;
}
- $closer_potentially_starts_at = $at;
- $at += 2;
+ $at += 2;
/*
* Find a case-insensitive match to the tag name.
@@ -905,6 +1324,10 @@ class WP_HTML_Tag_Processor {
$at += $tag_length;
$this->bytes_already_parsed = $at;
+ if ( $at >= strlen( $html ) ) {
+ return false;
+ }
+
/*
* Ensure that the tag name terminates to avoid matching on
* substrings of a longer tag name. For example, the sequence
@@ -919,13 +1342,23 @@ class WP_HTML_Tag_Processor {
while ( $this->parse_next_attribute() ) {
continue;
}
+
$at = $this->bytes_already_parsed;
if ( $at >= strlen( $this->html ) ) {
return false;
}
- if ( '>' === $html[ $at ] || '/' === $html[ $at ] ) {
- $this->bytes_already_parsed = $closer_potentially_starts_at;
+ if ( '>' === $html[ $at ] ) {
+ $this->bytes_already_parsed = $at + 1;
+ return true;
+ }
+
+ if ( $at + 1 >= strlen( $this->html ) ) {
+ return false;
+ }
+
+ if ( '/' === $html[ $at ] && '>' === $html[ $at + 1 ] ) {
+ $this->bytes_already_parsed = $at + 2;
return true;
}
}
@@ -1047,6 +1480,7 @@ class WP_HTML_Tag_Processor {
if ( $is_closing ) {
$this->bytes_already_parsed = $closer_potentially_starts_at;
+ $this->tag_name_starts_at = $closer_potentially_starts_at;
if ( $this->bytes_already_parsed >= $doc_length ) {
return false;
}
@@ -1055,8 +1489,14 @@ class WP_HTML_Tag_Processor {
continue;
}
+ if ( $this->bytes_already_parsed >= $doc_length ) {
+ $this->parser_state = self::STATE_INCOMPLETE_INPUT;
+
+ return false;
+ }
+
if ( '>' === $html[ $this->bytes_already_parsed ] ) {
- $this->bytes_already_parsed = $closer_potentially_starts_at;
+ ++$this->bytes_already_parsed;
return true;
}
}
@@ -1085,15 +1525,66 @@ class WP_HTML_Tag_Processor {
$html = $this->html;
$doc_length = strlen( $html );
- $at = $this->bytes_already_parsed;
+ $was_at = $this->bytes_already_parsed;
+ $at = $was_at;
while ( false !== $at && $at < $doc_length ) {
$at = strpos( $html, '<', $at );
+
+ /*
+ * This does not imply an incomplete parse; it indicates that there
+ * can be nothing left in the document other than a #text node.
+ */
if ( false === $at ) {
- return false;
+ $this->parser_state = self::STATE_TEXT_NODE;
+ $this->token_starts_at = $was_at;
+ $this->token_length = strlen( $html ) - $was_at;
+ $this->text_starts_at = $was_at;
+ $this->text_length = $this->token_length;
+ $this->bytes_already_parsed = strlen( $html );
+ return true;
+ }
+
+ if ( $at > $was_at ) {
+ /*
+ * A "<" normally starts a new HTML tag or syntax token, but in cases where the
+ * following character can't produce a valid token, the "<" is instead treated
+ * as plaintext and the parser should skip over it. This avoids a problem when
+ * following earlier practices of typing emoji with text, e.g. "<3". This
+ * should be a heart, not a tag. It's supposed to be rendered, not hidden.
+ *
+ * At this point the parser checks if this is one of those cases and if it is
+ * will continue searching for the next "<" in search of a token boundary.
+ *
+ * @see https://html.spec.whatwg.org/#tag-open-state
+ */
+ if ( strlen( $html ) > $at + 1 ) {
+ $next_character = $html[ $at + 1 ];
+ $at_another_node = (
+ '!' === $next_character ||
+ '/' === $next_character ||
+ '?' === $next_character ||
+ ( 'A' <= $next_character && $next_character <= 'Z' ) ||
+ ( 'a' <= $next_character && $next_character <= 'z' )
+ );
+ if ( ! $at_another_node ) {
+ ++$at;
+ continue;
+ }
+ }
+
+ $this->parser_state = self::STATE_TEXT_NODE;
+ $this->token_starts_at = $was_at;
+ $this->token_length = $at - $was_at;
+ $this->text_starts_at = $was_at;
+ $this->text_length = $this->token_length;
+ $this->bytes_already_parsed = $at;
+ return true;
}
- if ( '/' === $this->html[ $at + 1 ] ) {
+ $this->token_starts_at = $at;
+
+ if ( $at + 1 < $doc_length && '/' === $this->html[ $at + 1 ] ) {
$this->is_closing_tag = true;
++$at;
} else {
@@ -1117,8 +1608,9 @@ class WP_HTML_Tag_Processor {
$tag_name_prefix_length = strspn( $html, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ', $at + 1 );
if ( $tag_name_prefix_length > 0 ) {
++$at;
- $this->tag_name_length = $tag_name_prefix_length + strcspn( $html, " \t\f\r\n/>", $at + $tag_name_prefix_length );
+ $this->parser_state = self::STATE_MATCHED_TAG;
$this->tag_name_starts_at = $at;
+ $this->tag_name_length = $tag_name_prefix_length + strcspn( $html, " \t\f\r\n/>", $at + $tag_name_prefix_length );
$this->bytes_already_parsed = $at + $this->tag_name_length;
return true;
}
@@ -1127,35 +1619,58 @@ class WP_HTML_Tag_Processor {
* Abort if no tag is found before the end of
* the document. There is nothing left to parse.
*/
- if ( $at + 1 >= strlen( $html ) ) {
+ if ( $at + 1 >= $doc_length ) {
+ $this->parser_state = self::STATE_INCOMPLETE_INPUT;
+
return false;
}
/*
- * <! transitions to markup declaration open state
+ * `<!` transitions to markup declaration open state
* https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
*/
if ( '!' === $html[ $at + 1 ] ) {
/*
- * <!-- transitions to a bogus comment state – skip to the nearest -->
+ * `<!--` transitions to a comment state – apply further comment rules.
* https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
*/
if (
- strlen( $html ) > $at + 3 &&
+ $doc_length > $at + 3 &&
'-' === $html[ $at + 2 ] &&
'-' === $html[ $at + 3 ]
) {
$closer_at = $at + 4;
// If it's not possible to close the comment then there is nothing more to scan.
- if ( strlen( $html ) <= $closer_at ) {
+ if ( $doc_length <= $closer_at ) {
+ $this->parser_state = self::STATE_INCOMPLETE_INPUT;
+
return false;
}
// Abruptly-closed empty comments are a sequence of dashes followed by `>`.
$span_of_dashes = strspn( $html, '-', $closer_at );
if ( '>' === $html[ $closer_at + $span_of_dashes ] ) {
- $at = $closer_at + $span_of_dashes + 1;
- continue;
+ /*
+ * @todo When implementing `set_modifiable_text()` ensure that updates to this token
+ * don't break the syntax for short comments, e.g. `<!--->`. Unlike other comment
+ * and bogus comment syntax, these leave no clear insertion point for text and
+ * they need to be modified specially in order to contain text. E.g. to store
+ * `?` as the modifiable text, the `<!--->` needs to become `<!--?-->`, which
+ * involves inserting an additional `-` into the token after the modifiable text.
+ */
+ $this->parser_state = self::STATE_COMMENT;
+ $this->comment_type = self::COMMENT_AS_ABRUPTLY_CLOSED_COMMENT;
+ $this->token_length = $closer_at + $span_of_dashes + 1 - $this->token_starts_at;
+
+ // Only provide modifiable text if the token is long enough to contain it.
+ if ( $span_of_dashes >= 2 ) {
+ $this->comment_type = self::COMMENT_AS_HTML_COMMENT;
+ $this->text_starts_at = $this->token_starts_at + 4;
+ $this->text_length = $span_of_dashes - 2;
+ }
+
+ $this->bytes_already_parsed = $closer_at + $span_of_dashes + 1;
+ return true;
}
/*
@@ -1165,55 +1680,47 @@ class WP_HTML_Tag_Processor {
* See https://html.spec.whatwg.org/#parse-error-incorrectly-closed-comment
*/
--$closer_at; // Pre-increment inside condition below reduces risk of accidental infinite looping.
- while ( ++$closer_at < strlen( $html ) ) {
+ while ( ++$closer_at < $doc_length ) {
$closer_at = strpos( $html, '--', $closer_at );
if ( false === $closer_at ) {
+ $this->parser_state = self::STATE_INCOMPLETE_INPUT;
+
return false;
}
- if ( $closer_at + 2 < strlen( $html ) && '>' === $html[ $closer_at + 2 ] ) {
- $at = $closer_at + 3;
- continue 2;
+ if ( $closer_at + 2 < $doc_length && '>' === $html[ $closer_at + 2 ] ) {
+ $this->parser_state = self::STATE_COMMENT;
+ $this->comment_type = self::COMMENT_AS_HTML_COMMENT;
+ $this->token_length = $closer_at + 3 - $this->token_starts_at;
+ $this->text_starts_at = $this->token_starts_at + 4;
+ $this->text_length = $closer_at - $this->text_starts_at;
+ $this->bytes_already_parsed = $closer_at + 3;
+ return true;
}
- if ( $closer_at + 3 < strlen( $html ) && '!' === $html[ $closer_at + 2 ] && '>' === $html[ $closer_at + 3 ] ) {
- $at = $closer_at + 4;
- continue 2;
+ if (
+ $closer_at + 3 < $doc_length &&
+ '!' === $html[ $closer_at + 2 ] &&
+ '>' === $html[ $closer_at + 3 ]
+ ) {
+ $this->parser_state = self::STATE_COMMENT;
+ $this->comment_type = self::COMMENT_AS_HTML_COMMENT;
+ $this->token_length = $closer_at + 4 - $this->token_starts_at;
+ $this->text_starts_at = $this->token_starts_at + 4;
+ $this->text_length = $closer_at - $this->text_starts_at;
+ $this->bytes_already_parsed = $closer_at + 4;
+ return true;
}
}
}
/*
- * <![CDATA[ transitions to CDATA section state – skip to the nearest ]]>
- * The CDATA is case-sensitive.
- * https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
- */
- if (
- strlen( $html ) > $at + 8 &&
- '[' === $html[ $at + 2 ] &&
- 'C' === $html[ $at + 3 ] &&
- 'D' === $html[ $at + 4 ] &&
- 'A' === $html[ $at + 5 ] &&
- 'T' === $html[ $at + 6 ] &&
- 'A' === $html[ $at + 7 ] &&
- '[' === $html[ $at + 8 ]
- ) {
- $closer_at = strpos( $html, ']]>', $at + 9 );
- if ( false === $closer_at ) {
- return false;
- }
-
- $at = $closer_at + 3;
- continue;
- }
-
- /*
- * <!DOCTYPE transitions to DOCTYPE state – skip to the nearest >
+ * `<!DOCTYPE` transitions to DOCTYPE state – skip to the nearest >
* These are ASCII-case-insensitive.
* https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
*/
if (
- strlen( $html ) > $at + 8 &&
+ $doc_length > $at + 8 &&
( 'D' === $html[ $at + 2 ] || 'd' === $html[ $at + 2 ] ) &&
( 'O' === $html[ $at + 3 ] || 'o' === $html[ $at + 3 ] ) &&
( 'C' === $html[ $at + 4 ] || 'c' === $html[ $at + 4 ] ) &&
@@ -1224,59 +1731,179 @@ class WP_HTML_Tag_Processor {
) {
$closer_at = strpos( $html, '>', $at + 9 );
if ( false === $closer_at ) {
+ $this->parser_state = self::STATE_INCOMPLETE_INPUT;
+
return false;
}
- $at = $closer_at + 1;
- continue;
+ $this->parser_state = self::STATE_DOCTYPE;
+ $this->token_length = $closer_at + 1 - $this->token_starts_at;
+ $this->text_starts_at = $this->token_starts_at + 9;
+ $this->text_length = $closer_at - $this->text_starts_at;
+ $this->bytes_already_parsed = $closer_at + 1;
+ return true;
}
/*
* Anything else here is an incorrectly-opened comment and transitions
- * to the bogus comment state - skip to the nearest >.
+ * to the bogus comment state - skip to the nearest >. If no closer is
+ * found then the HTML was truncated inside the markup declaration.
*/
- $at = strpos( $html, '>', $at + 1 );
- continue;
+ $closer_at = strpos( $html, '>', $at + 1 );
+ if ( false === $closer_at ) {
+ $this->parser_state = self::STATE_INCOMPLETE_INPUT;
+
+ return false;
+ }
+
+ $this->parser_state = self::STATE_COMMENT;
+ $this->comment_type = self::COMMENT_AS_INVALID_HTML;
+ $this->token_length = $closer_at + 1 - $this->token_starts_at;
+ $this->text_starts_at = $this->token_starts_at + 2;
+ $this->text_length = $closer_at - $this->text_starts_at;
+ $this->bytes_already_parsed = $closer_at + 1;
+
+ /*
+ * Identify nodes that would be CDATA if HTML had CDATA sections.
+ *
+ * This section must occur after identifying the bogus comment end
+ * because in an HTML parser it will span to the nearest `>`, even
+ * if there's no `]]>` as would be required in an XML document. It
+ * is therefore not possible to parse a CDATA section containing
+ * a `>` in the HTML syntax.
+ *
+ * Inside foreign elements there is a discrepancy between browsers
+ * and the specification on this.
+ *
+ * @todo Track whether the Tag Processor is inside a foreign element
+ * and require the proper closing `]]>` in those cases.
+ */
+ if (
+ $this->token_length >= 10 &&
+ '[' === $html[ $this->token_starts_at + 2 ] &&
+ 'C' === $html[ $this->token_starts_at + 3 ] &&
+ 'D' === $html[ $this->token_starts_at + 4 ] &&
+ 'A' === $html[ $this->token_starts_at + 5 ] &&
+ 'T' === $html[ $this->token_starts_at + 6 ] &&
+ 'A' === $html[ $this->token_starts_at + 7 ] &&
+ '[' === $html[ $this->token_starts_at + 8 ] &&
+ ']' === $html[ $closer_at - 1 ] &&
+ ']' === $html[ $closer_at - 2 ]
+ ) {
+ $this->parser_state = self::STATE_COMMENT;
+ $this->comment_type = self::COMMENT_AS_CDATA_LOOKALIKE;
+ $this->text_starts_at += 7;
+ $this->text_length -= 9;
+ }
+
+ return true;
}
/*
* </> is a missing end tag name, which is ignored.
*
+ * This was also known as the "presumptuous empty tag"
+ * in early discussions as it was proposed to close
+ * the nearest previous opening tag.
+ *
* See https://html.spec.whatwg.org/#parse-error-missing-end-tag-name
*/
if ( '>' === $html[ $at + 1 ] ) {
- ++$at;
- continue;
+ $this->parser_state = self::STATE_PRESUMPTUOUS_TAG;
+ $this->token_length = $at + 2 - $this->token_starts_at;
+ $this->bytes_already_parsed = $at + 2;
+ return true;
}
/*
- * <? transitions to a bogus comment state – skip to the nearest >
+ * `<?` transitions to a bogus comment state – skip to the nearest >
* See https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
*/
if ( '?' === $html[ $at + 1 ] ) {
$closer_at = strpos( $html, '>', $at + 2 );
if ( false === $closer_at ) {
+ $this->parser_state = self::STATE_INCOMPLETE_INPUT;
+
return false;
}
- $at = $closer_at + 1;
- continue;
+ $this->parser_state = self::STATE_COMMENT;
+ $this->comment_type = self::COMMENT_AS_INVALID_HTML;
+ $this->token_length = $closer_at + 1 - $this->token_starts_at;
+ $this->text_starts_at = $this->token_starts_at + 2;
+ $this->text_length = $closer_at - $this->text_starts_at;
+ $this->bytes_already_parsed = $closer_at + 1;
+
+ /*
+ * Identify what would be a Processing Instruction node if HTML had them.
+ *
+ * This section must occur after identifying the bogus comment end
+ * because in an HTML parser it will span to the nearest `>`, even
+ * if there's no `?>` as would be required in an XML document. It
+ * is therefore not possible to parse a Processing Instruction node
+ * containing a `>` in the HTML syntax.
+ *
+ * XML allows for more target names, but this code only identifies
+ * those with ASCII-representable target names. This means that it
+ * may identify some Processing Instruction nodes as bogus comments,
+ * but it will not misinterpret the HTML structure. By limiting the
+ * identification to these target names the Tag Processor can avoid
+ * the need to start parsing UTF-8 sequences.
+ *
+ * > NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] |
+ * [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] |
+ * [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] |
+ * [#x10000-#xEFFFF]
+ * > NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]
+ *
+ * @see https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PITarget
+ */
+ if ( $this->token_length >= 5 && '?' === $html[ $closer_at - 1 ] ) {
+ $comment_text = substr( $html, $this->token_starts_at + 2, $this->token_length - 4 );
+ $pi_target_length = strspn( $comment_text, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ:_' );
+
+ if ( 0 < $pi_target_length ) {
+ $pi_target_length += strspn( $comment_text, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789:_-.', $pi_target_length );
+
+ $this->comment_type = self::COMMENT_AS_PI_NODE_LOOKALIKE;
+ $this->tag_name_starts_at = $this->token_starts_at + 2;
+ $this->tag_name_length = $pi_target_length;
+ $this->text_starts_at += $pi_target_length;
+ $this->text_length -= $pi_target_length + 1;
+ }
+ }
+
+ return true;
}
/*
* If a non-alpha starts the tag name in a tag closer it's a comment.
* Find the first `>`, which closes the comment.
*
+ * This parser classifies these particular comments as special "funky comments"
+ * which are made available for further processing.
+ *
* See https://html.spec.whatwg.org/#parse-error-invalid-first-character-of-tag-name
*/
if ( $this->is_closing_tag ) {
+ // No chance of finding a closer.
+ if ( $at + 3 > $doc_length ) {
+ return false;
+ }
+
$closer_at = strpos( $html, '>', $at + 3 );
if ( false === $closer_at ) {
+ $this->parser_state = self::STATE_INCOMPLETE_INPUT;
+
return false;
}
- $at = $closer_at + 1;
- continue;
+ $this->parser_state = self::STATE_FUNKY_COMMENT;
+ $this->token_length = $closer_at + 1 - $this->token_starts_at;
+ $this->text_starts_at = $this->token_starts_at + 2;
+ $this->text_length = $closer_at - $this->text_starts_at;
+ $this->bytes_already_parsed = $closer_at + 1;
+ return true;
}
++$at;
@@ -1296,6 +1923,8 @@ class WP_HTML_Tag_Processor {
// Skip whitespace and slashes.
$this->bytes_already_parsed += strspn( $this->html, " \t\f\r\n/", $this->bytes_already_parsed );
if ( $this->bytes_already_parsed >= strlen( $this->html ) ) {
+ $this->parser_state = self::STATE_INCOMPLETE_INPUT;
+
return false;
}
@@ -1318,11 +1947,15 @@ class WP_HTML_Tag_Processor {
$attribute_name = substr( $this->html, $attribute_start, $name_length );
$this->bytes_already_parsed += $name_length;
if ( $this->bytes_already_parsed >= strlen( $this->html ) ) {
+ $this->parser_state = self::STATE_INCOMPLETE_INPUT;
+
return false;
}
$this->skip_whitespace();
if ( $this->bytes_already_parsed >= strlen( $this->html ) ) {
+ $this->parser_state = self::STATE_INCOMPLETE_INPUT;
+
return false;
}
@@ -1331,6 +1964,8 @@ class WP_HTML_Tag_Processor {
++$this->bytes_already_parsed;
$this->skip_whitespace();
if ( $this->bytes_already_parsed >= strlen( $this->html ) ) {
+ $this->parser_state = self::STATE_INCOMPLETE_INPUT;
+
return false;
}
@@ -1357,6 +1992,8 @@ class WP_HTML_Tag_Processor {
}
if ( $attribute_end >= strlen( $this->html ) ) {
+ $this->parser_state = self::STATE_INCOMPLETE_INPUT;
+
return false;
}
@@ -1381,7 +2018,7 @@ class WP_HTML_Tag_Processor {
$value_start,
$value_length,
$attribute_start,
- $attribute_end,
+ $attribute_end - $attribute_start,
! $has_value
);
@@ -1396,7 +2033,7 @@ class WP_HTML_Tag_Processor {
* an array when encountering duplicates avoids needless allocations in the
* normative case of parsing tags with no duplicate attributes.
*/
- $duplicate_span = new WP_HTML_Span( $attribute_start, $attribute_end );
+ $duplicate_span = new WP_HTML_Span( $attribute_start, $attribute_end - $attribute_start );
if ( null === $this->duplicate_attributes ) {
$this->duplicate_attributes = array( $comparable_name => array( $duplicate_span ) );
} elseif ( ! array_key_exists( $comparable_name, $this->duplicate_attributes ) ) {
@@ -1423,12 +2060,54 @@ class WP_HTML_Tag_Processor {
* @since 6.2.0
*/
private function after_tag() {
- $this->get_updated_html();
+ /*
+ * There could be lexical updates enqueued for an attribute that
+ * also exists on the next tag. In order to avoid conflating the
+ * attributes across the two tags, lexical updates with names
+ * need to be flushed to raw lexical updates.
+ */
+ $this->class_name_updates_to_attributes_updates();
+
+ /*
+ * Purge updates if there are too many. The actual count isn't
+ * scientific, but a few values from 100 to a few thousand were
+ * tested to find a practically-useful limit.
+ *
+ * If the update queue grows too big, then the Tag Processor
+ * will spend more time iterating through them and lose the
+ * efficiency gains of deferring applying them.
+ */
+ if ( 1000 < count( $this->lexical_updates ) ) {
+ $this->get_updated_html();
+ }
+
+ foreach ( $this->lexical_updates as $name => $update ) {
+ /*
+ * Any updates appearing after the cursor should be applied
+ * before proceeding, otherwise they may be overlooked.
+ */
+ if ( $update->start >= $this->bytes_already_parsed ) {
+ $this->get_updated_html();
+ break;
+ }
+
+ if ( is_int( $name ) ) {
+ continue;
+ }
+
+ $this->lexical_updates[] = $update;
+ unset( $this->lexical_updates[ $name ] );
+ }
+
+ $this->token_starts_at = null;
+ $this->token_length = null;
$this->tag_name_starts_at = null;
$this->tag_name_length = null;
- $this->tag_ends_at = null;
+ $this->text_starts_at = 0;
+ $this->text_length = 0;
$this->is_closing_tag = null;
$this->attributes = array();
+ $this->comment_type = null;
$this->duplicate_attributes = null;
}
@@ -1606,10 +2285,10 @@ class WP_HTML_Tag_Processor {
$bytes_already_copied = 0;
$output_buffer = '';
foreach ( $this->lexical_updates as $diff ) {
- $shift = strlen( $diff->text ) - ( $diff->end - $diff->start );
+ $shift = strlen( $diff->text ) - $diff->length;
// Adjust the cursor position by however much an update affects it.
- if ( $diff->start <= $this->bytes_already_parsed ) {
+ if ( $diff->start < $this->bytes_already_parsed ) {
$this->bytes_already_parsed += $shift;
}
@@ -1620,7 +2299,7 @@ class WP_HTML_Tag_Processor {
$output_buffer .= substr( $this->html, $bytes_already_copied, $diff->start - $bytes_already_copied );
$output_buffer .= $diff->text;
- $bytes_already_copied = $diff->end;
+ $bytes_already_copied = $diff->start + $diff->length;
}
$this->html = $output_buffer . substr( $this->html, $bytes_already_copied );
@@ -1630,6 +2309,8 @@ class WP_HTML_Tag_Processor {
* replacements adjust offsets in the input document.
*/
foreach ( $this->bookmarks as $bookmark_name => $bookmark ) {
+ $bookmark_end = $bookmark->start + $bookmark->length;
+
/*
* Each lexical update which appears before the bookmark's endpoints
* might shift the offsets for those endpoints. Loop through each change
@@ -1640,28 +2321,30 @@ class WP_HTML_Tag_Processor {
$tail_delta = 0;
foreach ( $this->lexical_updates as $diff ) {
- if ( $bookmark->start < $diff->start && $bookmark->end < $diff->start ) {
+ $diff_end = $diff->start + $diff->length;
+
+ if ( $bookmark->start < $diff->start && $bookmark_end < $diff->start ) {
break;
}
- if ( $bookmark->start >= $diff->start && $bookmark->end < $diff->end ) {
+ if ( $bookmark->start >= $diff->start && $bookmark_end < $diff_end ) {
$this->release_bookmark( $bookmark_name );
continue 2;
}
- $delta = strlen( $diff->text ) - ( $diff->end - $diff->start );
+ $delta = strlen( $diff->text ) - $diff->length;
if ( $bookmark->start >= $diff->start ) {
$head_delta += $delta;
}
- if ( $bookmark->end >= $diff->end ) {
+ if ( $bookmark_end >= $diff_end ) {
$tail_delta += $delta;
}
}
- $bookmark->start += $head_delta;
- $bookmark->end += $tail_delta;
+ $bookmark->start += $head_delta;
+ $bookmark->length += $tail_delta - $head_delta;
}
$this->lexical_updates = array();
@@ -1716,7 +2399,8 @@ class WP_HTML_Tag_Processor {
// Point this tag processor before the sought tag opener and consume it.
$this->bytes_already_parsed = $this->bookmarks[ $bookmark_name ]->start;
- return $this->next_tag( array( 'tag_closers' => 'visit' ) );
+ $this->parser_state = self::STATE_READY;
+ return $this->next_token();
}
/**
@@ -1743,7 +2427,7 @@ class WP_HTML_Tag_Processor {
* This code should be unreachable, because it implies the two replacements
* start at the same location and contain the same text.
*/
- return $a->end - $b->end;
+ return $a->length - $b->length;
}
/**
@@ -1761,6 +2445,10 @@ class WP_HTML_Tag_Processor {
* @return string|boolean|null Value of enqueued update if present, otherwise false.
*/
private function get_enqueued_attribute_value( $comparable_name ) {
+ if ( self::STATE_MATCHED_TAG !== $this->parser_state ) {
+ return false;
+ }
+
if ( ! isset( $this->lexical_updates[ $comparable_name ] ) ) {
return false;
}
@@ -1828,7 +2516,7 @@ class WP_HTML_Tag_Processor {
* @return string|true|null Value of attribute or `null` if not available. Boolean attributes return `true`.
*/
public function get_attribute( $name ) {
- if ( null === $this->tag_name_starts_at ) {
+ if ( self::STATE_MATCHED_TAG !== $this->parser_state ) {
return null;
}
@@ -1908,7 +2596,10 @@ class WP_HTML_Tag_Processor {
* @return array|null List of attribute names, or `null` when no tag opener is matched.
*/
public function get_attribute_names_with_prefix( $prefix ) {
- if ( $this->is_closing_tag || null === $this->tag_name_starts_at ) {
+ if (
+ self::STATE_MATCHED_TAG !== $this->parser_state ||
+ $this->is_closing_tag
+ ) {
return null;
}
@@ -1946,7 +2637,18 @@ class WP_HTML_Tag_Processor {
$tag_name = substr( $this->html, $this->tag_name_starts_at, $this->tag_name_length );
- return strtoupper( $tag_name );
+ if ( self::STATE_MATCHED_TAG === $this->parser_state ) {
+ return strtoupper( $tag_name );
+ }
+
+ if (
+ self::STATE_COMMENT === $this->parser_state &&
+ self::COMMENT_AS_PI_NODE_LOOKALIKE === $this->get_comment_type()
+ ) {
+ return $tag_name;
+ }
+
+ return null;
}
/**
@@ -1967,11 +2669,19 @@ class WP_HTML_Tag_Processor {
* @return bool Whether the currently matched tag contains the self-closing flag.
*/
public function has_self_closing_flag() {
- if ( ! $this->tag_name_starts_at ) {
+ if ( self::STATE_MATCHED_TAG !== $this->parser_state ) {
return false;
}
- return '/' === $this->html[ $this->tag_ends_at - 1 ];
+ /*
+ * The self-closing flag is the solidus at the _end_ of the tag, not the beginning.
+ *
+ * Example:
+ *
+ * <figure />
+ * ^ this appears one character before the end of the closing ">".
+ */
+ return '/' === $this->html[ $this->token_starts_at + $this->token_length - 1 ];
}
/**
@@ -1991,7 +2701,191 @@ class WP_HTML_Tag_Processor {
* @return bool Whether the current tag is a tag closer.
*/
public function is_tag_closer() {
- return $this->is_closing_tag;
+ return (
+ self::STATE_MATCHED_TAG === $this->parser_state &&
+ $this->is_closing_tag
+ );
+ }
+
+ /**
+ * Indicates the kind of matched token, if any.
+ *
+ * This differs from `get_token_name()` in that it always
+ * returns a static string indicating the type, whereas
+ * `get_token_name()` may return values derived from the
+ * token itself, such as a tag name or processing
+ * instruction target.
+ *
+ * Possible values:
+ * - `#tag` when matched on a tag.
+ * - `#text` when matched on a text node.
+ * - `#cdata-section` when matched on a CDATA node.
+ * - `#comment` when matched on a comment.
+ * - `#doctype` when matched on a DOCTYPE declaration.
+ * - `#presumptuous-tag` when matched on an empty tag closer.
+ * - `#funky-comment` when matched on a funky comment.
+ *
+ * @since 6.5.0
+ *
+ * @return string|null What kind of token is matched, or null.
+ */
+ public function get_token_type() {
+ switch ( $this->parser_state ) {
+ case self::STATE_MATCHED_TAG:
+ return '#tag';
+
+ case self::STATE_DOCTYPE:
+ return '#doctype';
+
+ default:
+ return $this->get_token_name();
+ }
+ }
+
+ /**
+ * Returns the node name represented by the token.
+ *
+ * This matches the DOM API value `nodeName`. Some values
+ * are static, such as `#text` for a text node, while others
+ * are dynamically generated from the token itself.
+ *
+ * Dynamic names:
+ * - Uppercase tag name for tag matches.
+ * - `html` for DOCTYPE declarations.
+ *
+ * Note that if the Tag Processor is not matched on a token
+ * then this function will return `null`, either because it
+ * hasn't yet found a token or because it reached the end
+ * of the document without matching a token.
+ *
+ * @since 6.5.0
+ *
+ * @return string|null Name of the matched token.
+ */
+ public function get_token_name() {
+ switch ( $this->parser_state ) {
+ case self::STATE_MATCHED_TAG:
+ return $this->get_tag();
+
+ case self::STATE_TEXT_NODE:
+ return '#text';
+
+ case self::STATE_CDATA_NODE:
+ return '#cdata-section';
+
+ case self::STATE_COMMENT:
+ return '#comment';
+
+ case self::STATE_DOCTYPE:
+ return 'html';
+
+ case self::STATE_PRESUMPTUOUS_TAG:
+ return '#presumptuous-tag';
+
+ case self::STATE_FUNKY_COMMENT:
+ return '#funky-comment';
+ }
+ }
+
+ /**
+ * Indicates what kind of comment produced the comment node.
+ *
+ * Because there are different kinds of HTML syntax which produce
+ * comments, the Tag Processor tracks and exposes this as a type
+ * for the comment. Nominally only regular HTML comments exist as
+ * they are commonly known, but a number of unrelated syntax errors
+ * also produce comments.
+ *
+ * @see self::COMMENT_AS_ABRUPTLY_CLOSED_COMMENT
+ * @see self::COMMENT_AS_CDATA_LOOKALIKE
+ * @see self::COMMENT_AS_INVALID_HTML
+ * @see self::COMMENT_AS_HTML_COMMENT
+ * @see self::COMMENT_AS_PI_NODE_LOOKALIKE
+ *
+ * @since 6.5.0
+ *
+ * @return string|null
+ */
+ public function get_comment_type() {
+ if ( self::STATE_COMMENT !== $this->parser_state ) {
+ return null;
+ }
+
+ return $this->comment_type;
+ }
+
+ /**
+ * Returns the modifiable text for a matched token, or an empty string.
+ *
+ * Modifiable text is text content that may be read and changed without
+ * changing the HTML structure of the document around it. This includes
+ * the contents of `#text` nodes in the HTML as well as the inner
+ * contents of HTML comments, Processing Instructions, and others, even
+ * though these nodes aren't part of a parsed DOM tree. They also contain
+ * the contents of SCRIPT and STYLE tags, of TEXTAREA tags, and of any
+ * other section in an HTML document which cannot contain HTML markup (DATA).
+ *
+ * If a token has no modifiable text then an empty string is returned to
+ * avoid needless crashing or type errors. An empty string does not mean
+ * that a token has modifiable text, and a token with modifiable text may
+ * have an empty string (e.g. a comment with no contents).
+ *
+ * @since 6.5.0
+ *
+ * @return string
+ */
+ public function get_modifiable_text() {
+ if ( null === $this->text_starts_at ) {
+ return '';
+ }
+
+ $text = substr( $this->html, $this->text_starts_at, $this->text_length );
+
+ // Comment data is not decoded.
+ if (
+ self::STATE_CDATA_NODE === $this->parser_state ||
+ self::STATE_COMMENT === $this->parser_state ||
+ self::STATE_DOCTYPE === $this->parser_state ||
+ self::STATE_FUNKY_COMMENT === $this->parser_state
+ ) {
+ return $text;
+ }
+
+ $tag_name = $this->get_tag();
+ if (
+ // Script data is not decoded.
+ 'SCRIPT' === $tag_name ||
+
+ // RAWTEXT data is not decoded.
+ 'IFRAME' === $tag_name ||
+ 'NOEMBED' === $tag_name ||
+ 'NOFRAMES' === $tag_name ||
+ 'STYLE' === $tag_name ||
+ 'XMP' === $tag_name
+ ) {
+ return $text;
+ }
+
+ $decoded = html_entity_decode( $text, ENT_QUOTES | ENT_HTML5 | ENT_SUBSTITUTE );
+
+ /*
+ * TEXTAREA skips a leading newline, but this newline may appear not only as the
+ * literal character `\n`, but also as a character reference, such as in the
+ * following markup: `<textarea>&#x0a;Content</textarea>`.
+ *
+ * For these cases it's important to first decode the text content before checking
+ * for a leading newline and removing it.
+ */
+ if (
+ self::STATE_MATCHED_TAG === $this->parser_state &&
+ 'TEXTAREA' === $tag_name &&
+ strlen( $decoded ) > 0 &&
+ "\n" === $decoded[0]
+ ) {
+ return substr( $decoded, 1 );
+ }
+
+ return $decoded;
}
/**
@@ -2011,7 +2905,10 @@ class WP_HTML_Tag_Processor {
* @return bool Whether an attribute value was set.
*/
public function set_attribute( $name, $value ) {
- if ( $this->is_closing_tag || null === $this->tag_name_starts_at ) {
+ if (
+ self::STATE_MATCHED_TAG !== $this->parser_state ||
+ $this->is_closing_tag
+ ) {
return false;
}
@@ -2031,8 +2928,8 @@ class WP_HTML_Tag_Processor {
*
* @see https://html.spec.whatwg.org/#attributes-2
*
- * @TODO as the only regex pattern maybe we should take it out? are
- * Unicode patterns available broadly in Core?
+ * @todo As the only regex pattern maybe we should take it out?
+ * Are Unicode patterns available broadly in Core?
*/
if ( preg_match(
'~[' .
@@ -2101,7 +2998,7 @@ class WP_HTML_Tag_Processor {
$existing_attribute = $this->attributes[ $comparable_name ];
$this->lexical_updates[ $comparable_name ] = new WP_HTML_Text_Replacement(
$existing_attribute->start,
- $existing_attribute->end,
+ $existing_attribute->length,
$updated_attribute
);
} else {
@@ -2119,7 +3016,7 @@ class WP_HTML_Tag_Processor {
*/
$this->lexical_updates[ $comparable_name ] = new WP_HTML_Text_Replacement(
$this->tag_name_starts_at + $this->tag_name_length,
- $this->tag_name_starts_at + $this->tag_name_length,
+ 0,
' ' . $updated_attribute
);
}
@@ -2144,7 +3041,10 @@ class WP_HTML_Tag_Processor {
* @return bool Whether an attribute was removed.
*/
public function remove_attribute( $name ) {
- if ( $this->is_closing_tag ) {
+ if (
+ self::STATE_MATCHED_TAG !== $this->parser_state ||
+ $this->is_closing_tag
+ ) {
return false;
}
@@ -2194,7 +3094,7 @@ class WP_HTML_Tag_Processor {
*/
$this->lexical_updates[ $name ] = new WP_HTML_Text_Replacement(
$this->attributes[ $name ]->start,
- $this->attributes[ $name ]->end,
+ $this->attributes[ $name ]->length,
''
);
@@ -2203,7 +3103,7 @@ class WP_HTML_Tag_Processor {
foreach ( $this->duplicate_attributes[ $name ] as $attribute_token ) {
$this->lexical_updates[] = new WP_HTML_Text_Replacement(
$attribute_token->start,
- $attribute_token->end,
+ $attribute_token->length,
''
);
}
@@ -2221,13 +3121,14 @@ class WP_HTML_Tag_Processor {
* @return bool Whether the class was set to be added.
*/
public function add_class( $class_name ) {
- if ( $this->is_closing_tag ) {
+ if (
+ self::STATE_MATCHED_TAG !== $this->parser_state ||
+ $this->is_closing_tag
+ ) {
return false;
}
- if ( null !== $this->tag_name_starts_at ) {
- $this->classname_updates[ $class_name ] = self::ADD_CLASS;
- }
+ $this->classname_updates[ $class_name ] = self::ADD_CLASS;
return true;
}
@@ -2241,7 +3142,10 @@ class WP_HTML_Tag_Processor {
* @return bool Whether the class was set to be removed.
*/
public function remove_class( $class_name ) {
- if ( $this->is_closing_tag ) {
+ if (
+ self::STATE_MATCHED_TAG !== $this->parser_state ||
+ $this->is_closing_tag
+ ) {
return false;
}
@@ -2289,7 +3193,7 @@ class WP_HTML_Tag_Processor {
* Keep track of the position right before the current tag. This will
* be necessary for reparsing the current tag after updating the HTML.
*/
- $before_current_tag = $this->tag_name_starts_at - 1;
+ $before_current_tag = $this->token_starts_at;
/*
* 1. Apply the enqueued edits and update all the pointers to reflect those changes.
@@ -2318,15 +3222,7 @@ class WP_HTML_Tag_Processor {
* └←─┘ back up by strlen("em") + 1 ==> 3
*/
$this->bytes_already_parsed = $before_current_tag;
- $this->parse_next_tag();
- // Reparse the attributes.
- while ( $this->parse_next_attribute() ) {
- continue;
- }
-
- $tag_ends_at = strpos( $this->html, '>', $this->bytes_already_parsed );
- $this->tag_ends_at = $tag_ends_at;
- $this->bytes_already_parsed = $tag_ends_at;
+ $this->base_class_next_token();
return $this->html;
}
@@ -2447,4 +3343,206 @@ class WP_HTML_Tag_Processor {
return true;
}
+
+ /**
+ * Parser Ready State.
+ *
+ * Indicates that the parser is ready to run and waiting for a state transition.
+ * It may not have started yet, or it may have just finished parsing a token and
+ * is ready to find the next one.
+ *
+ * @since 6.5.0
+ *
+ * @access private
+ */
+ const STATE_READY = 'STATE_READY';
+
+ /**
+ * Parser Complete State.
+ *
+ * Indicates that the parser has reached the end of the document and there is
+ * nothing left to scan. It finished parsing the last token completely.
+ *
+ * @since 6.5.0
+ *
+ * @access private
+ */
+ const STATE_COMPLETE = 'STATE_COMPLETE';
+
+ /**
+ * Parser Incomplete Input State.
+ *
+ * Indicates that the parser has reached the end of the document before finishing
+ * a token. It started parsing a token but there is a possibility that the input
+ * HTML document was truncated in the middle of a token.
+ *
+ * The parser is reset at the start of the incomplete token and has paused. There
+ * is nothing more that can be scanned unless provided a more complete document.
+ *
+ * @since 6.5.0
+ *
+ * @access private
+ */
+ const STATE_INCOMPLETE_INPUT = 'STATE_INCOMPLETE_INPUT';
+
+ /**
+ * Parser Matched Tag State.
+ *
+ * Indicates that the parser has found an HTML tag and it's possible to get
+ * the tag name and read or modify its attributes (if it's not a closing tag).
+ *
+ * @since 6.5.0
+ *
+ * @access private
+ */
+ const STATE_MATCHED_TAG = 'STATE_MATCHED_TAG';
+
+ /**
+ * Parser Text Node State.
+ *
+ * Indicates that the parser has found a text node and it's possible
+ * to read and modify that text.
+ *
+ * @since 6.5.0
+ *
+ * @access private
+ */
+ const STATE_TEXT_NODE = 'STATE_TEXT_NODE';
+
+ /**
+ * Parser CDATA Node State.
+ *
+ * Indicates that the parser has found a CDATA node and it's possible
+ * to read and modify its modifiable text. Note that in HTML there are
+ * no CDATA nodes outside of foreign content (SVG and MathML). Outside
+ * of foreign content, they are treated as HTML comments.
+ *
+ * @since 6.5.0
+ *
+ * @access private
+ */
+ const STATE_CDATA_NODE = 'STATE_CDATA_NODE';
+
+ /**
+ * Indicates that the parser has found an HTML comment and it's
+ * possible to read and modify its modifiable text.
+ *
+ * @since 6.5.0
+ *
+ * @access private
+ */
+ const STATE_COMMENT = 'STATE_COMMENT';
+
+ /**
+ * Indicates that the parser has found a DOCTYPE node and it's
+ * possible to read and modify its modifiable text.
+ *
+ * @since 6.5.0
+ *
+ * @access private
+ */
+ const STATE_DOCTYPE = 'STATE_DOCTYPE';
+
+ /**
+ * Indicates that the parser has found an empty tag closer `</>`.
+ *
+ * Note that in HTML there are no empty tag closers, and they
+ * are ignored. Nonetheless, the Tag Processor still
+ * recognizes them as they appear in the HTML stream.
+ *
+ * These were historically discussed as a "presumptuous tag
+ * closer," which would close the nearest open tag, but were
+ * dismissed in favor of explicitly-closing tags.
+ *
+ * @since 6.5.0
+ *
+ * @access private
+ */
+ const STATE_PRESUMPTUOUS_TAG = 'STATE_PRESUMPTUOUS_TAG';
+
+ /**
+ * Indicates that the parser has found a "funky comment"
+ * and it's possible to read and modify its modifiable text.
+ *
+ * Example:
+ *
+ * </%url>
+ * </{"wp-bit":"query/post-author"}>
+ * </2>
+ *
+ * Funky comments are tag closers with invalid tag names. Note
+ * that in HTML these are turned into bogus comments. Nonetheless,
+ * the Tag Processor recognizes them in a stream of HTML and
+ * exposes them for inspection and modification.
+ *
+ * @since 6.5.0
+ *
+ * @access private
+ */
+ const STATE_FUNKY_COMMENT = 'STATE_WP_FUNKY';
+
+ /**
+ * Indicates that a comment was created when encountering an abruptly-closed HTML comment.
+ *
+ * Example:
+ *
+ * <!-->
+ * <!--->
+ *
+ * @since 6.5.0
+ */
+ const COMMENT_AS_ABRUPTLY_CLOSED_COMMENT = 'COMMENT_AS_ABRUPTLY_CLOSED_COMMENT';
+
+ /**
+ * Indicates that a comment would be parsed as a CDATA node,
+ * were HTML to allow CDATA nodes outside of foreign content.
+ *
+ * Example:
+ *
+ * <![CDATA[This is a CDATA node.]]>
+ *
+ * This is an HTML comment, but it looks like a CDATA node.
+ *
+ * @since 6.5.0
+ */
+ const COMMENT_AS_CDATA_LOOKALIKE = 'COMMENT_AS_CDATA_LOOKALIKE';
+
+ /**
+ * Indicates that a comment was created when encountering
+ * normative HTML comment syntax.
+ *
+ * Example:
+ *
+ * <!-- this is a comment -->
+ *
+ * @since 6.5.0
+ */
+ const COMMENT_AS_HTML_COMMENT = 'COMMENT_AS_HTML_COMMENT';
+
+ /**
+ * Indicates that a comment would be parsed as a Processing
+ * Instruction node, were they to exist within HTML.
+ *
+ * Example:
+ *
+ * <?wp __( 'Like' ) ?>
+ *
+ * This is an HTML comment, but it looks like a Processing Instruction node.
+ *
+ * @since 6.5.0
+ */
+ const COMMENT_AS_PI_NODE_LOOKALIKE = 'COMMENT_AS_PI_NODE_LOOKALIKE';
+
+ /**
+ * Indicates that a comment was created when encountering invalid
+ * HTML input, a so-called "bogus comment."
+ *
+ * Example:
+ *
+ * <?nothing special>
+ * <!{nothing special}>
+ *
+ * @since 6.5.0
+ */
+ const COMMENT_AS_INVALID_HTML = 'COMMENT_AS_INVALID_HTML';
}
diff --git a/wp-includes/html-api/class-wp-html-text-replacement.php b/wp-includes/html-api/class-wp-html-text-replacement.php
index 26b7bb2..4b8a6a6 100644
--- a/wp-includes/html-api/class-wp-html-text-replacement.php
+++ b/wp-includes/html-api/class-wp-html-text-replacement.php
@@ -15,6 +15,7 @@
*
* @access private
* @since 6.2.0
+ * @since 6.5.0 Replaced `end` with `length` to more closely match `substr()`.
*
* @see WP_HTML_Tag_Processor
*/
@@ -23,22 +24,25 @@ class WP_HTML_Text_Replacement {
* Byte offset into document where replacement span begins.
*
* @since 6.2.0
+ *
* @var int
*/
public $start;
/**
- * Byte offset into document where replacement span ends.
+ * Byte length of span being replaced.
+ *
+ * @since 6.5.0
*
- * @since 6.2.0
* @var int
*/
- public $end;
+ public $length;
/**
* Span of text to insert in document to replace existing content from start to end.
*
* @since 6.2.0
+ *
* @var string
*/
public $text;
@@ -48,13 +52,13 @@ class WP_HTML_Text_Replacement {
*
* @since 6.2.0
*
- * @param int $start Byte offset into document where replacement span begins.
- * @param int $end Byte offset into document where replacement span ends.
- * @param string $text Span of text to insert in document to replace existing content from start to end.
+ * @param int $start Byte offset into document where replacement span begins.
+ * @param int $length Byte length of span in document being replaced.
+ * @param string $text Span of text to insert in document to replace existing content from start to end.
*/
- public function __construct( $start, $end, $text ) {
- $this->start = $start;
- $this->end = $end;
- $this->text = $text;
+ public function __construct( $start, $length, $text ) {
+ $this->start = $start;
+ $this->length = $length;
+ $this->text = $text;
}
}