7 files changed, 1992 insertions, 345 deletions
diff --git a/wp-includes/html-api/class-wp-html-active-formatting-elements.php b/wp-includes/html-api/class-wp-html-active-formatting-elements.php
index 9598991..9f7fee9 100644
--- a/wp-includes/html-api/class-wp-html-active-formatting-elements.php
+++ b/wp-includes/html-api/class-wp-html-active-formatting-elements.php
@@ -105,7 +105,7 @@ class WP_HTML_Active_Formatting_Elements {
 		 * > paired such that the two attributes in each pair have identical names, namespaces, and values
 		 * > (the order of the attributes does not matter).
 		 *
-		 * @TODO: Implement the "Noah's Ark clause" to only add up to three of any given kind of formatting elements to the stack.
+		 * @todo Implement the "Noah's Ark clause" to only add up to three of any given kind of formatting elements to the stack.
 		 */
 		// > Add element to the list of active formatting elements.
 		$this->stack[] = $token;
diff --git a/wp-includes/html-api/class-wp-html-attribute-token.php b/wp-includes/html-api/class-wp-html-attribute-token.php
index f938609..74d4132 100644
--- a/wp-includes/html-api/class-wp-html-attribute-token.php
+++ b/wp-includes/html-api/class-wp-html-attribute-token.php
@@ -15,6 +15,7 @@
  *
  * @access private
  * @since 6.2.0
+ * @since 6.5.0 Replaced `end` with `length` to more closely match `substr()`.
  *
  * @see WP_HTML_Tag_Processor
  */
@@ -23,6 +24,7 @@ class WP_HTML_Attribute_Token {
 	 * Attribute name.
 	 *
 	 * @since 6.2.0
+	 *
 	 * @var string
 	 */
 	public $name;
@@ -31,6 +33,7 @@ class WP_HTML_Attribute_Token {
 	 * Attribute value.
 	 *
 	 * @since 6.2.0
+	 *
 	 * @var int
 	 */
 	public $value_starts_at;
@@ -39,6 +42,7 @@ class WP_HTML_Attribute_Token {
 	 * How many bytes the value occupies in the input HTML.
 	 *
 	 * @since 6.2.0
+	 *
 	 * @var int
 	 */
 	public $value_length;
@@ -47,22 +51,43 @@ class WP_HTML_Attribute_Token {
 	 * The string offset where the attribute name starts.
 	 *
 	 * @since 6.2.0
+	 *
 	 * @var int
 	 */
 	public $start;
 
 	/**
-	 * The string offset after the attribute value or its name.
+	 * Byte length of text spanning the attribute inside a tag.
+	 *
+	 * This span starts at the first character of the attribute name
+	 * and ends at one of three places:
+	 *
+	 *  - at the end of the attribute name for boolean attributes.
+	 *  - at the end of the value for unquoted attributes.
+	 *  - at the final single or double quote for quoted attributes.
+	 *
+	 * Example:
+	 *
+	 *     <div class="post">
+	 *          ------------ length is 12, including quotes
+	 *
+	 *     <input type="checked" checked id="selector">
+	 *                           ------- length is 7
+	 *
+	 *     <a rel=noopener>
+	 *        ------------ length is 12
+	 *
+	 * @since 6.5.0 Replaced `end` with `length` to more closely match `substr()`.
 	 *
-	 * @since 6.2.0
 	 * @var int
 	 */
-	public $end;
+	public $length;
 
 	/**
 	 * Whether the attribute is a boolean attribute with value `true`.
 	 *
 	 * @since 6.2.0
+	 *
 	 * @var bool
 	 */
 	public $is_true;
@@ -71,20 +96,21 @@ class WP_HTML_Attribute_Token {
 	 * Constructor.
 	 *
 	 * @since 6.2.0
+	 * @since 6.5.0 Replaced `end` with `length` to more closely match `substr()`.
 	 *
 	 * @param string $name         Attribute name.
 	 * @param int    $value_start  Attribute value.
 	 * @param int    $value_length Number of bytes attribute value spans.
 	 * @param int    $start        The string offset where the attribute name starts.
-	 * @param int    $end          The string offset after the attribute value or its name.
+	 * @param int    $length       Byte length of the entire attribute name or name and value pair expression.
 	 * @param bool   $is_true      Whether the attribute is a boolean attribute with true value.
 	 */
-	public function __construct( $name, $value_start, $value_length, $start, $end, $is_true ) {
+	public function __construct( $name, $value_start, $value_length, $start, $length, $is_true ) {
 		$this->name            = $name;
 		$this->value_starts_at = $value_start;
 		$this->value_length    = $value_length;
 		$this->start           = $start;
-		$this->end             = $end;
+		$this->length          = $length;
 		$this->is_true         = $is_true;
 	}
 }
diff --git a/wp-includes/html-api/class-wp-html-open-elements.php b/wp-includes/html-api/class-wp-html-open-elements.php
index fe56255..1234abc 100644
--- a/wp-includes/html-api/class-wp-html-open-elements.php
+++ b/wp-includes/html-api/class-wp-html-open-elements.php
@@ -116,13 +116,20 @@ class WP_HTML_Open_Elements {
 				return true;
 			}
 
+			if (
+				'(internal: H1 through H6 - do not use)' === $tag_name &&
+				in_array( $node->node_name, array( 'H1', 'H2', 'H3', 'H4', 'H5', 'H6' ), true )
+			) {
+				return true;
+			}
+
 			switch ( $node->node_name ) {
 				case 'HTML':
 					return false;
 			}
 
 			if ( in_array( $node->node_name, $termination_list, true ) ) {
-				return true;
+				return false;
 			}
 		}
 
@@ -159,18 +166,22 @@ class WP_HTML_Open_Elements {
 	 * Returns whether a particular element is in list item scope.
 	 *
 	 * @since 6.4.0
+	 * @since 6.5.0 Implemented: no longer throws on every invocation.
 	 *
 	 * @see https://html.spec.whatwg.org/#has-an-element-in-list-item-scope
 	 *
-	 * @throws WP_HTML_Unsupported_Exception Always until this function is implemented.
-	 *
 	 * @param string $tag_name Name of tag to check.
 	 * @return bool Whether given element is in scope.
 	 */
 	public function has_element_in_list_item_scope( $tag_name ) {
-		throw new WP_HTML_Unsupported_Exception( 'Cannot process elements depending on list item scope.' );
-
-		return false; // The linter requires this unreachable code until the function is implemented and can return.
+		return $this->has_element_in_specific_scope(
+			$tag_name,
+			array(
+				// There are more elements that belong here which aren't currently supported.
+				'OL',
+				'UL',
+			)
+		);
 	}
 
 	/**
@@ -270,6 +281,13 @@ class WP_HTML_Open_Elements {
 		foreach ( $this->walk_up() as $item ) {
 			$this->pop();
 
+			if (
+				'(internal: H1 through H6 - do not use)' === $tag_name &&
+				in_array( $item->node_name, array( 'H1', 'H2', 'H3', 'H4', 'H5', 'H6' ), true )
+			) {
+				return true;
+			}
+
 			if ( $tag_name === $item->node_name ) {
 				return true;
 			}
@@ -361,10 +379,22 @@ class WP_HTML_Open_Elements {
 	 * see WP_HTML_Open_Elements::walk_down().
 	 *
 	 * @since 6.4.0
+	 * @since 6.5.0 Accepts $above_this_node to start traversal above a given node, if it exists.
+	 *
+	 * @param ?WP_HTML_Token $above_this_node Start traversing above this node, if provided and if the node exists.
 	 */
-	public function walk_up() {
+	public function walk_up( $above_this_node = null ) {
+		$has_found_node = null === $above_this_node;
+
 		for ( $i = count( $this->stack ) - 1; $i >= 0; $i-- ) {
-			yield $this->stack[ $i ];
+			$node = $this->stack[ $i ];
+
+			if ( ! $has_found_node ) {
+				$has_found_node = $node === $above_this_node;
+				continue;
+			}
+
+			yield $node;
 		}
 	}
 
diff --git a/wp-includes/html-api/class-wp-html-processor.php b/wp-includes/html-api/class-wp-html-processor.php
index f27f83b..c76cc19 100644
--- a/wp-includes/html-api/class-wp-html-processor.php
+++ b/wp-includes/html-api/class-wp-html-processor.php
@@ -99,12 +99,20 @@
  *
  * The following list specifies the HTML tags that _are_ supported:
  *
+ *  - Containers: ADDRESS, BLOCKQUOTE, DETAILS, DIALOG, DIV, FOOTER, HEADER, MAIN, MENU, SPAN, SUMMARY.
+ *  - Custom elements: All custom elements are supported. :)
+ *  - Form elements: BUTTON, DATALIST, FIELDSET, INPUT, LABEL, LEGEND, METER, PROGRESS, SEARCH.
+ *  - Formatting elements: B, BIG, CODE, EM, FONT, I, PRE, SMALL, STRIKE, STRONG, TT, U, WBR.
+ *  - Heading elements: H1, H2, H3, H4, H5, H6, HGROUP.
  *  - Links: A.
- *  - The formatting elements: B, BIG, CODE, EM, FONT, I, SMALL, STRIKE, STRONG, TT, U.
- *  - Containers: DIV, FIGCAPTION, FIGURE, SPAN.
- *  - Form elements: BUTTON.
- *  - Paragraph: P.
- *  - Void elements: IMG.
+ *  - Lists: DD, DL, DT, LI, OL, UL.
+ *  - Media elements: AUDIO, CANVAS, EMBED, FIGCAPTION, FIGURE, IMG, MAP, PICTURE, SOURCE, TRACK, VIDEO.
+ *  - Paragraph: BR, P.
+ *  - Phrasing elements: ABBR, AREA, BDI, BDO, CITE, DATA, DEL, DFN, INS, MARK, OUTPUT, Q, SAMP, SUB, SUP, TIME, VAR.
+ *  - Sectioning elements: ARTICLE, ASIDE, HR, NAV, SECTION.
+ *  - Templating elements: SLOT.
+ *  - Text decoration: RUBY.
+ *  - Deprecated elements: ACRONYM, BLINK, CENTER, DIR, ISINDEX, KEYGEN, LISTING, MULTICOL, NEXTID, PARAM, SPACER.
  *
  * ### Supported markup
  *
@@ -142,17 +150,6 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
 	const MAX_BOOKMARKS = 100;
 
 	/**
-	 * Static query for instructing the Tag Processor to visit every token.
-	 *
-	 * @access private
-	 *
-	 * @since 6.4.0
-	 *
-	 * @var array
-	 */
-	const VISIT_EVERYTHING = array( 'tag_closers' => 'visit' );
-
-	/**
 	 * Holds the working state of the parser, including the stack of
 	 * open elements and the stack of active formatting elements.
 	 *
@@ -244,15 +241,15 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
 			return null;
 		}
 
-		$p                        = new self( $html, self::CONSTRUCTOR_UNLOCK_CODE );
-		$p->state->context_node   = array( 'BODY', array() );
-		$p->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_BODY;
+		$processor                        = new self( $html, self::CONSTRUCTOR_UNLOCK_CODE );
+		$processor->state->context_node   = array( 'BODY', array() );
+		$processor->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_BODY;
 
-		// @TODO: Create "fake" bookmarks for non-existent but implied nodes.
-		$p->bookmarks['root-node']    = new WP_HTML_Span( 0, 0 );
-		$p->bookmarks['context-node'] = new WP_HTML_Span( 0, 0 );
+		// @todo Create "fake" bookmarks for non-existent but implied nodes.
+		$processor->bookmarks['root-node']    = new WP_HTML_Span( 0, 0 );
+		$processor->bookmarks['context-node'] = new WP_HTML_Span( 0, 0 );
 
-		$p->state->stack_of_open_elements->push(
+		$processor->state->stack_of_open_elements->push(
 			new WP_HTML_Token(
 				'root-node',
 				'HTML',
@@ -260,15 +257,15 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
 			)
 		);
 
-		$p->state->stack_of_open_elements->push(
+		$processor->state->stack_of_open_elements->push(
 			new WP_HTML_Token(
 				'context-node',
-				$p->state->context_node[0],
+				$processor->state->context_node[0],
 				false
 			)
 		);
 
-		return $p;
+		return $processor;
 	}
 
 	/**
@@ -342,7 +339,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
 	/**
 	 * Finds the next tag matching the $query.
 	 *
-	 * @TODO: Support matching the class name and tag name.
+	 * @todo Support matching the class name and tag name.
 	 *
 	 * @since 6.4.0
 	 *
@@ -364,6 +361,10 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
 	public function next_tag( $query = null ) {
 		if ( null === $query ) {
 			while ( $this->step() ) {
+				if ( '#tag' !== $this->get_token_type() ) {
+					continue;
+				}
+
 				if ( ! $this->is_tag_closer() ) {
 					return true;
 				}
@@ -387,6 +388,10 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
 
 		if ( ! ( array_key_exists( 'breadcrumbs', $query ) && is_array( $query['breadcrumbs'] ) ) ) {
 			while ( $this->step() ) {
+				if ( '#tag' !== $this->get_token_type() ) {
+					continue;
+				}
+
 				if ( ! $this->is_tag_closer() ) {
 					return true;
 				}
@@ -408,6 +413,10 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
 		$match_offset = isset( $query['match_offset'] ) ? (int) $query['match_offset'] : 1;
 
 		while ( $match_offset > 0 && $this->step() ) {
+			if ( '#tag' !== $this->get_token_type() ) {
+				continue;
+			}
+
 			if ( $this->matches_breadcrumbs( $breadcrumbs ) && 0 === --$match_offset ) {
 				return true;
 			}
@@ -417,6 +426,24 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
 	}
 
 	/**
+	 * Ensures internal accounting is maintained for HTML semantic rules while
+	 * the underlying Tag Processor class is seeking to a bookmark.
+	 *
+	 * This doesn't currently have a way to represent non-tags and doesn't process
+	 * semantic rules for text nodes. For access to the raw tokens consider using
+	 * WP_HTML_Tag_Processor instead.
+	 *
+	 * @since 6.5.0 Added for internal support; do not use.
+	 *
+	 * @access private
+	 *
+	 * @return bool Whether a token was found.
+	 */
+	public function next_token() {
+		return $this->step();
+	}
+
+	/**
 	 * Indicates if the currently-matched tag matches the given breadcrumbs.
 	 *
 	 * A "*" represents a single tag wildcard, where any tag matches, but not no tags.
@@ -442,10 +469,6 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
 	 * @return bool Whether the currently-matched tag is found at the given nested structure.
 	 */
 	public function matches_breadcrumbs( $breadcrumbs ) {
-		if ( ! $this->get_tag() ) {
-			return false;
-		}
-
 		// Everything matches when there are zero constraints.
 		if ( 0 === count( $breadcrumbs ) ) {
 			return true;
@@ -492,7 +515,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
 			return false;
 		}
 
-		if ( self::PROCESS_NEXT_NODE === $node_to_process ) {
+		if ( self::REPROCESS_CURRENT_NODE !== $node_to_process ) {
 			/*
 			 * Void elements still hop onto the stack of open elements even though
 			 * there's no corresponding closing tag. This is important for managing
@@ -502,28 +525,42 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
 			 * When moving on to the next node, therefore, if the bottom-most element
 			 * on the stack is a void element, it must be closed.
 			 *
-			 * @TODO: Once self-closing foreign elements and BGSOUND are supported,
+			 * @todo Once self-closing foreign elements and BGSOUND are supported,
 			 *        they must also be implicitly closed here too. BGSOUND is
 			 *        special since it's only self-closing if the self-closing flag
 			 *        is provided in the opening tag, otherwise it expects a tag closer.
 			 */
 			$top_node = $this->state->stack_of_open_elements->current_node();
-			if ( $top_node && self::is_void( $top_node->node_name ) ) {
+			if (
+				$top_node && (
+					// Void elements.
+					self::is_void( $top_node->node_name ) ||
+					// Comments, text nodes, and other atomic tokens.
+					'#' === $top_node->node_name[0] ||
+					// Doctype declarations.
+					'html' === $top_node->node_name
+				)
+			) {
 				$this->state->stack_of_open_elements->pop();
 			}
+		}
 
-			parent::next_tag( self::VISIT_EVERYTHING );
+		if ( self::PROCESS_NEXT_NODE === $node_to_process ) {
+			parent::next_token();
 		}
 
 		// Finish stepping when there are no more tokens in the document.
-		if ( null === $this->get_tag() ) {
+		if (
+			WP_HTML_Tag_Processor::STATE_INCOMPLETE_INPUT === $this->parser_state ||
+			WP_HTML_Tag_Processor::STATE_COMPLETE === $this->parser_state
+		) {
 			return false;
 		}
 
 		$this->state->current_token = new WP_HTML_Token(
-			$this->bookmark_tag(),
-			$this->get_tag(),
-			$this->is_tag_closer(),
+			$this->bookmark_token(),
+			$this->get_token_name(),
+			$this->has_self_closing_flag(),
 			$this->release_internal_bookmark_on_destruct
 		);
 
@@ -551,9 +588,9 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
 	 * Breadcrumbs start at the outermost parent and descend toward the matched element.
 	 * They always include the entire path from the root HTML node to the matched element.
 	 *
-	 * @TODO: It could be more efficient to expose a generator-based version of this function
-	 *        to avoid creating the array copy on tag iteration. If this is done, it would likely
-	 *        be more useful to walk up the stack when yielding instead of starting at the top.
+	 * @todo It could be more efficient to expose a generator-based version of this function
+	 *       to avoid creating the array copy on tag iteration. If this is done, it would likely
+	 *       be more useful to walk up the stack when yielding instead of starting at the top.
 	 *
 	 * Example
 	 *
@@ -566,10 +603,6 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
 	 * @return string[]|null Array of tag names representing path to matched node, if matched, otherwise NULL.
 	 */
 	public function get_breadcrumbs() {
-		if ( ! $this->get_tag() ) {
-			return null;
-		}
-
 		$breadcrumbs = array();
 		foreach ( $this->state->stack_of_open_elements->walk_down() as $stack_item ) {
 			$breadcrumbs[] = $stack_item->node_name;
@@ -594,17 +627,67 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
 	 * @return bool Whether an element was found.
 	 */
 	private function step_in_body() {
-		$tag_name = $this->get_tag();
-		$op_sigil = $this->is_tag_closer() ? '-' : '+';
-		$op       = "{$op_sigil}{$tag_name}";
+		$token_name = $this->get_token_name();
+		$token_type = $this->get_token_type();
+		$op_sigil   = '#tag' === $token_type ? ( $this->is_tag_closer() ? '-' : '+' ) : '';
+		$op         = "{$op_sigil}{$token_name}";
 
 		switch ( $op ) {
+			case '#comment':
+			case '#funky-comment':
+			case '#presumptuous-tag':
+				$this->insert_html_element( $this->state->current_token );
+				return true;
+
+			case '#text':
+				$current_token = $this->bookmarks[ $this->state->current_token->bookmark_name ];
+
+				/*
+				 * > A character token that is U+0000 NULL
+				 * >   Parse error. Ignore the token.
+				 *
+				 * A text node comprising only NULL bytes is ignored and must
+				 * not trigger active format reconstruction, so this check runs
+				 * before reconstructing. Text containing any other character
+				 * falls through and reconstructs the active formats below.
+				 */
+				if (
+					1 <= $current_token->length &&
+					"\x00" === $this->html[ $current_token->start ] &&
+					strspn( $this->html, "\x00", $current_token->start, $current_token->length ) === $current_token->length
+				) {
+					// Parse error: ignore the token.
+					return $this->step();
+				}
+
+				$this->reconstruct_active_formatting_elements();
+
+				/*
+				 * Whitespace-only text does not affect the frameset-ok flag.
+				 * It is probably inter-element whitespace, but it may also
+				 * contain character references which decode only to whitespace.
+				 */
+				$text = $this->get_modifiable_text();
+				if ( strlen( $text ) !== strspn( $text, " \t\n\f\r" ) ) {
+					$this->state->frameset_ok = false;
+				}
+
+				$this->insert_html_element( $this->state->current_token );
+				return true;
+
+			case 'html':
+				/*
+				 * > A DOCTYPE token
+				 * > Parse error. Ignore the token.
+				 */
+				return $this->step();
+
 			/*
 			 * > A start tag whose tag name is "button"
 			 */
 			case '+BUTTON':
 				if ( $this->state->stack_of_open_elements->has_element_in_scope( 'BUTTON' ) ) {
-					// @TODO: Indicate a parse error once it's possible. This error does not impact the logic here.
+					// @todo Indicate a parse error once it's possible. This error does not impact the logic here.
 					$this->generate_implied_end_tags();
 					$this->state->stack_of_open_elements->pop_until( 'BUTTON' );
 				}
@@ -621,11 +704,31 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
 			 * > "fieldset", "figcaption", "figure", "footer", "header", "hgroup",
 			 * > "main", "menu", "nav", "ol", "p", "search", "section", "summary", "ul"
 			 */
+			case '+ADDRESS':
+			case '+ARTICLE':
+			case '+ASIDE':
 			case '+BLOCKQUOTE':
+			case '+CENTER':
+			case '+DETAILS':
+			case '+DIALOG':
+			case '+DIR':
 			case '+DIV':
+			case '+DL':
+			case '+FIELDSET':
 			case '+FIGCAPTION':
 			case '+FIGURE':
+			case '+FOOTER':
+			case '+HEADER':
+			case '+HGROUP':
+			case '+MAIN':
+			case '+MENU':
+			case '+NAV':
+			case '+OL':
 			case '+P':
+			case '+SEARCH':
+			case '+SECTION':
+			case '+SUMMARY':
+			case '+UL':
 				if ( $this->state->stack_of_open_elements->has_p_in_button_scope() ) {
 					$this->close_a_p_element();
 				}
@@ -639,22 +742,213 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
 			 * > "figcaption", "figure", "footer", "header", "hgroup", "listing", "main",
 			 * > "menu", "nav", "ol", "pre", "search", "section", "summary", "ul"
 			 */
+			case '-ADDRESS':
+			case '-ARTICLE':
+			case '-ASIDE':
 			case '-BLOCKQUOTE':
 			case '-BUTTON':
+			case '-CENTER':
+			case '-DETAILS':
+			case '-DIALOG':
+			case '-DIR':
 			case '-DIV':
+			case '-DL':
+			case '-FIELDSET':
 			case '-FIGCAPTION':
 			case '-FIGURE':
-				if ( ! $this->state->stack_of_open_elements->has_element_in_scope( $tag_name ) ) {
-					// @TODO: Report parse error.
+			case '-FOOTER':
+			case '-HEADER':
+			case '-HGROUP':
+			case '-LISTING':
+			case '-MAIN':
+			case '-MENU':
+			case '-NAV':
+			case '-OL':
+			case '-PRE':
+			case '-SEARCH':
+			case '-SECTION':
+			case '-SUMMARY':
+			case '-UL':
+				if ( ! $this->state->stack_of_open_elements->has_element_in_scope( $token_name ) ) {
+					// @todo Report parse error.
 					// Ignore the token.
 					return $this->step();
 				}
 
 				$this->generate_implied_end_tags();
-				if ( $this->state->stack_of_open_elements->current_node()->node_name !== $tag_name ) {
-					// @TODO: Record parse error: this error doesn't impact parsing.
+				if ( $this->state->stack_of_open_elements->current_node()->node_name !== $token_name ) {
+					// @todo Record parse error: this error doesn't impact parsing.
+				}
+				$this->state->stack_of_open_elements->pop_until( $token_name );
+				return true;
+
+			/*
+			 * > A start tag whose tag name is one of: "h1", "h2", "h3", "h4", "h5", "h6"
+			 */
+			case '+H1':
+			case '+H2':
+			case '+H3':
+			case '+H4':
+			case '+H5':
+			case '+H6':
+				if ( $this->state->stack_of_open_elements->has_p_in_button_scope() ) {
+					$this->close_a_p_element();
+				}
+
+				if (
+					in_array(
+						$this->state->stack_of_open_elements->current_node()->node_name,
+						array( 'H1', 'H2', 'H3', 'H4', 'H5', 'H6' ),
+						true
+					)
+				) {
+					// @todo Indicate a parse error once it's possible.
+					$this->state->stack_of_open_elements->pop();
+				}
+
+				$this->insert_html_element( $this->state->current_token );
+				return true;
+
+			/*
+			 * > A start tag whose tag name is one of: "pre", "listing"
+			 */
+			case '+PRE':
+			case '+LISTING':
+				if ( $this->state->stack_of_open_elements->has_p_in_button_scope() ) {
+					$this->close_a_p_element();
+				}
+				$this->insert_html_element( $this->state->current_token );
+				$this->state->frameset_ok = false;
+				return true;
+
+			/*
+			 * > An end tag whose tag name is one of: "h1", "h2", "h3", "h4", "h5", "h6"
+			 */
+			case '-H1':
+			case '-H2':
+			case '-H3':
+			case '-H4':
+			case '-H5':
+			case '-H6':
+				if ( ! $this->state->stack_of_open_elements->has_element_in_scope( '(internal: H1 through H6 - do not use)' ) ) {
+					/*
+					 * This is a parse error; ignore the token.
+					 *
+					 * @todo Indicate a parse error once it's possible.
+					 */
+					return $this->step();
+				}
+
+				$this->generate_implied_end_tags();
+
+				if ( $this->state->stack_of_open_elements->current_node()->node_name !== $token_name ) {
+					// @todo Record parse error: this error doesn't impact parsing.
 				}
-				$this->state->stack_of_open_elements->pop_until( $tag_name );
+
+				$this->state->stack_of_open_elements->pop_until( '(internal: H1 through H6 - do not use)' );
+				return true;
+
+			/*
+			 * > A start tag whose tag name is "li"
+			 * > A start tag whose tag name is one of: "dd", "dt"
+			 */
+			case '+DD':
+			case '+DT':
+			case '+LI':
+				$this->state->frameset_ok = false;
+				$node                     = $this->state->stack_of_open_elements->current_node();
+				$is_li                    = 'LI' === $token_name;
+
+				in_body_list_loop:
+				/*
+				 * The logic for LI and DT/DD is the same except for one point: LI elements _only_
+				 * close other LI elements, but a DT or DD element closes _any_ open DT or DD element.
+				 */
+				if ( $is_li ? 'LI' === $node->node_name : ( 'DD' === $node->node_name || 'DT' === $node->node_name ) ) {
+					$node_name = $is_li ? 'LI' : $node->node_name;
+					$this->generate_implied_end_tags( $node_name );
+					if ( $node_name !== $this->state->stack_of_open_elements->current_node()->node_name ) {
+						// @todo Indicate a parse error once it's possible. This error does not impact the logic here.
+					}
+
+					$this->state->stack_of_open_elements->pop_until( $node_name );
+					goto in_body_list_done;
+				}
+
+				if (
+					'ADDRESS' !== $node->node_name &&
+					'DIV' !== $node->node_name &&
+					'P' !== $node->node_name &&
+					$this->is_special( $node->node_name )
+				) {
+					/*
+					 * > If node is in the special category, but is not an address, div,
+					 * > or p element, then jump to the step labeled done below.
+					 */
+					goto in_body_list_done;
+				} else {
+					/*
+					 * > Otherwise, set node to the previous entry in the stack of open elements
+					 * > and return to the step labeled loop.
+					 */
+					foreach ( $this->state->stack_of_open_elements->walk_up( $node ) as $item ) {
+						$node = $item;
+						break;
+					}
+					goto in_body_list_loop;
+				}
+
+				in_body_list_done:
+				if ( $this->state->stack_of_open_elements->has_p_in_button_scope() ) {
+					$this->close_a_p_element();
+				}
+
+				$this->insert_html_element( $this->state->current_token );
+				return true;
+
+			/*
+			 * > An end tag whose tag name is "li"
+			 * > An end tag whose tag name is one of: "dd", "dt"
+			 */
+			case '-DD':
+			case '-DT':
+			case '-LI':
+				if (
+					/*
+					 * An end tag whose tag name is "li":
+					 * If the stack of open elements does not have an li element in list item scope,
+					 * then this is a parse error; ignore the token.
+					 */
+					(
+						'LI' === $token_name &&
+						! $this->state->stack_of_open_elements->has_element_in_list_item_scope( 'LI' )
+					) ||
+					/*
+					 * An end tag whose tag name is one of: "dd", "dt":
+					 * If the stack of open elements does not have an element in scope that is an
+					 * HTML element with the same tag name as that of the token, then this is a
+					 * parse error; ignore the token.
+					 */
+					(
+						'LI' !== $token_name &&
+						! $this->state->stack_of_open_elements->has_element_in_scope( $token_name )
+					)
+				) {
+					/*
+					 * This is a parse error, ignore the token.
+					 *
+					 * @todo Indicate a parse error once it's possible.
+					 */
+					return $this->step();
+				}
+
+				$this->generate_implied_end_tags( $token_name );
+
+				if ( $token_name !== $this->state->stack_of_open_elements->current_node()->node_name ) {
+					// @todo Indicate a parse error once it's possible. This error does not impact the logic here.
+				}
+
+				$this->state->stack_of_open_elements->pop_until( $token_name );
 				return true;
 
 			/*
@@ -730,47 +1024,174 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
 				return true;
 
 			/*
+			 * > An end tag whose tag name is "br"
+			 * >   Parse error. Drop the attributes from the token, and act as described in the next
+			 * >   entry; i.e. act as if this was a "br" start tag token with no attributes, rather
+			 * >   than the end tag token that it actually is.
+			 */
+			case '-BR':
+				$this->last_error = self::ERROR_UNSUPPORTED;
+				throw new WP_HTML_Unsupported_Exception( 'Closing BR tags require unimplemented special handling.' );
+
+			/*
 			 * > A start tag whose tag name is one of: "area", "br", "embed", "img", "keygen", "wbr"
 			 */
+			case '+AREA':
+			case '+BR':
+			case '+EMBED':
 			case '+IMG':
+			case '+KEYGEN':
+			case '+WBR':
 				$this->reconstruct_active_formatting_elements();
 				$this->insert_html_element( $this->state->current_token );
+				$this->state->frameset_ok = false;
 				return true;
 
 			/*
-			 * > Any other start tag
+			 * > A start tag whose tag name is "input"
 			 */
-			case '+SPAN':
+			case '+INPUT':
 				$this->reconstruct_active_formatting_elements();
 				$this->insert_html_element( $this->state->current_token );
+				$type_attribute = $this->get_attribute( 'type' );
+				/*
+				 * > If the token does not have an attribute with the name "type", or if it does,
+				 * > but that attribute's value is not an ASCII case-insensitive match for the
+				 * > string "hidden", then: set the frameset-ok flag to "not ok".
+				 */
+				if ( ! is_string( $type_attribute ) || 'hidden' !== strtolower( $type_attribute ) ) {
+					$this->state->frameset_ok = false;
+				}
 				return true;
 
 			/*
-			 * Any other end tag
+			 * > A start tag whose tag name is "hr"
 			 */
-			case '-SPAN':
-				foreach ( $this->state->stack_of_open_elements->walk_up() as $item ) {
-					// > If node is an HTML element with the same tag name as the token, then:
-					if ( $item->node_name === $tag_name ) {
-						$this->generate_implied_end_tags( $tag_name );
+			case '+HR':
+				if ( $this->state->stack_of_open_elements->has_p_in_button_scope() ) {
+					$this->close_a_p_element();
+				}
+				$this->insert_html_element( $this->state->current_token );
+				$this->state->frameset_ok = false;
+				return true;
 
-						// > If node is not the current node, then this is a parse error.
+			/*
+			 * > A start tag whose tag name is one of: "param", "source", "track"
+			 */
+			case '+PARAM':
+			case '+SOURCE':
+			case '+TRACK':
+				$this->insert_html_element( $this->state->current_token );
+				return true;
+		}
 
-						$this->state->stack_of_open_elements->pop_until( $tag_name );
-						return true;
-					}
+		/*
+		 * These tags require special handling in the 'in body' insertion mode
+		 * but that handling hasn't yet been implemented.
+		 *
+		 * As the rules for each tag are implemented, the corresponding tag
+		 * name should be removed from this list. An accompanying test should
+		 * help ensure this list is maintained.
+		 *
+		 * @see Tests_HtmlApi_WpHtmlProcessor::test_step_in_body_fails_on_unsupported_tags
+		 *
+		 * Since this switch structure throws a WP_HTML_Unsupported_Exception, it's
+		 * possible to handle "any other start tag" and "any other end tag" below,
+		 * as that guarantees execution doesn't proceed for the unimplemented tags.
+		 *
+		 * @see https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inbody
+		 */
+		switch ( $token_name ) {
+			case 'APPLET':
+			case 'BASE':
+			case 'BASEFONT':
+			case 'BGSOUND':
+			case 'BODY':
+			case 'CAPTION':
+			case 'COL':
+			case 'COLGROUP':
+			case 'FORM':
+			case 'FRAME':
+			case 'FRAMESET':
+			case 'HEAD':
+			case 'HTML':
+			case 'IFRAME':
+			case 'LINK':
+			case 'MARQUEE':
+			case 'MATH':
+			case 'META':
+			case 'NOBR':
+			case 'NOEMBED':
+			case 'NOFRAMES':
+			case 'NOSCRIPT':
+			case 'OBJECT':
+			case 'OPTGROUP':
+			case 'OPTION':
+			case 'PLAINTEXT':
+			case 'RB':
+			case 'RP':
+			case 'RT':
+			case 'RTC':
+			case 'SARCASM':
+			case 'SCRIPT':
+			case 'SELECT':
+			case 'STYLE':
+			case 'SVG':
+			case 'TABLE':
+			case 'TBODY':
+			case 'TD':
+			case 'TEMPLATE':
+			case 'TEXTAREA':
+			case 'TFOOT':
+			case 'TH':
+			case 'THEAD':
+			case 'TITLE':
+			case 'TR':
+			case 'XMP':
+				$this->last_error = self::ERROR_UNSUPPORTED;
+				throw new WP_HTML_Unsupported_Exception( "Cannot process {$token_name} element." );
+		}
 
-					// > Otherwise, if node is in the special category, then this is a parse error; ignore the token, and return.
-					if ( self::is_special( $item->node_name ) ) {
-						return $this->step();
-					}
+		if ( ! $this->is_tag_closer() ) {
+			/*
+			 * > Any other start tag
+			 */
+			$this->reconstruct_active_formatting_elements();
+			$this->insert_html_element( $this->state->current_token );
+			return true;
+		} else {
+			/*
+			 * > Any other end tag
+			 */
+
+			/*
+			 * Find the corresponding tag opener in the stack of open elements, if
+			 * it exists before reaching a special element, which provides a kind
+			 * of boundary in the stack. For example, a `</custom-tag>` should not
+			 * close anything beyond its containing `P` or `DIV` element.
+			 */
+			foreach ( $this->state->stack_of_open_elements->walk_up() as $node ) {
+				if ( $token_name === $node->node_name ) {
+					break;
 				}
-				// Execution should not reach here; if it does then something went wrong.
-				return false;
 
-			default:
-				$this->last_error = self::ERROR_UNSUPPORTED;
-				throw new WP_HTML_Unsupported_Exception( "Cannot process {$tag_name} element." );
+				if ( self::is_special( $node->node_name ) ) {
+					// This is a parse error, ignore the token.
+					return $this->step();
+				}
+			}
+
+			$this->generate_implied_end_tags( $token_name );
+			if ( $node !== $this->state->stack_of_open_elements->current_node() ) {
+				// @todo Record parse error: this error doesn't impact parsing.
+			}
+
+			foreach ( $this->state->stack_of_open_elements->walk_up() as $item ) {
+				$this->state->stack_of_open_elements->pop();
+				if ( $node === $item ) {
+					return true;
+				}
+			}
 		}
 	}
 
@@ -779,19 +1200,16 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
 	 */
 
 	/**
-	 * Creates a new bookmark for the currently-matched tag and returns the generated name.
+	 * Creates a new bookmark for the currently-matched token and returns the generated name.
 	 *
 	 * @since 6.4.0
+	 * @since 6.5.0 Renamed from bookmark_tag() to bookmark_token().
 	 *
 	 * @throws Exception When unable to allocate requested bookmark.
 	 *
 	 * @return string|false Name of created bookmark, or false if unable to create.
 	 */
-	private function bookmark_tag() {
-		if ( ! $this->get_tag() ) {
-			return false;
-		}
-
+	private function bookmark_token() {
 		if ( ! parent::set_bookmark( ++$this->bookmark_counter ) ) {
 			$this->last_error = self::ERROR_EXCEEDED_MAX_BOOKMARKS;
 			throw new Exception( 'could not allocate bookmark' );
@@ -863,6 +1281,10 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
 	/**
 	 * Moves the internal cursor in the HTML Processor to a given bookmark's location.
 	 *
+	 * Be careful! Seeking backwards to a previous location resets the parser to the
+	 * start of the document and reparses the entire contents up until it finds the
+	 * sought-after bookmarked location.
+	 *
 	 * In order to prevent accidental infinite loops, there's a
 	 * maximum limit on the number of times seek() can be called.
 	 *
@@ -874,6 +1296,9 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
 	 * @return bool Whether the internal cursor was successfully moved to the bookmark's location.
 	 */
 	public function seek( $bookmark_name ) {
+		// Flush any pending updates to the document before beginning.
+		$this->get_updated_html();
+
 		$actual_bookmark_name = "_{$bookmark_name}";
 		$processor_started_at = $this->state->current_token
 			? $this->bookmarks[ $this->state->current_token->bookmark_name ]->start
@@ -881,44 +1306,73 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
 		$bookmark_starts_at   = $this->bookmarks[ $actual_bookmark_name ]->start;
 		$direction            = $bookmark_starts_at > $processor_started_at ? 'forward' : 'backward';
 
-		switch ( $direction ) {
-			case 'forward':
-				// When moving forwards, re-parse the document until reaching the same location as the original bookmark.
-				while ( $this->step() ) {
-					if ( $bookmark_starts_at === $this->bookmarks[ $this->state->current_token->bookmark_name ]->start ) {
-						return true;
-					}
+		/*
+		 * If seeking backwards, it's possible that the sought-after bookmark exists within an element
+		 * which has been closed before the current cursor; in other words, it has already been removed
+		 * from the stack of open elements. This means that it's insufficient to simply pop off elements
+		 * from the stack of open elements which appear after the bookmarked location and then jump to
+		 * that location, as the elements which were open before won't be re-opened.
+		 *
+		 * In order to maintain consistency, the HTML Processor rewinds to the start of the document
+		 * and reparses everything until it finds the sought-after bookmark.
+		 *
+		 * There are potentially better ways to do this: cache the parser state for each bookmark and
+		 * restore it when seeking; store an immutable and idempotent register of where elements open
+		 * and close.
+		 *
+		 * If caching the parser state it will be essential to properly maintain the cached stack of
+		 * open elements and active formatting elements when modifying the document. This could be a
+		 * tedious and time-consuming process as well, and so for now will not be performed.
+		 *
+		 * It may be possible to track bookmarks for where elements open and close, and in doing so
+		 * be able to quickly recalculate breadcrumbs for any element in the document. It may even
+		 * be possible to remove the stack of open elements and compute it on the fly this way.
+		 * If doing this, the parser would need to track the opening and closing locations for all
+		 * tokens in the breadcrumb path for any and all bookmarks. By utilizing bookmarks themselves
+		 * this list could be automatically maintained while modifying the document. Finding the
+		 * breadcrumbs would then amount to traversing that list from the start until the token
+		 * being inspected. Once an element closes, if there are no bookmarks pointing to locations
+		 * within that element, then all of these locations may be forgotten to save on memory use
+		 * and computation time.
+		 */
+		if ( 'backward' === $direction ) {
+			/*
+			 * Instead of clearing the parser state and starting fresh, calling the stack methods
+			 * maintains the proper flags in the parser.
+			 */
+			foreach ( $this->state->stack_of_open_elements->walk_up() as $item ) {
+				if ( 'context-node' === $item->bookmark_name ) {
+					break;
 				}
 
-				return false;
-
-			case 'backward':
-				/*
-				 * When moving backwards, clear out all existing stack entries which appear after the destination
-				 * bookmark. These could be stored for later retrieval, but doing so would require additional
-				 * memory overhead and also demand that references and bookmarks are updated as the document
-				 * changes. In time this could be a valuable optimization, but it's okay to give up that
-				 * optimization in exchange for more CPU time to recompute the stack, to re-parse the
-				 * document that may have already been parsed once.
-				 */
-				foreach ( $this->state->stack_of_open_elements->walk_up() as $item ) {
-					if ( $bookmark_starts_at >= $this->bookmarks[ $item->bookmark_name ]->start ) {
-						break;
-					}
+				$this->state->stack_of_open_elements->remove_node( $item );
+			}
 
-					$this->state->stack_of_open_elements->remove_node( $item );
+			foreach ( $this->state->active_formatting_elements->walk_up() as $item ) {
+				if ( 'context-node' === $item->bookmark_name ) {
+					break;
 				}
 
-				foreach ( $this->state->active_formatting_elements->walk_up() as $item ) {
-					if ( $bookmark_starts_at >= $this->bookmarks[ $item->bookmark_name ]->start ) {
-						break;
-					}
+				$this->state->active_formatting_elements->remove_node( $item );
+			}
 
-					$this->state->active_formatting_elements->remove_node( $item );
-				}
+			parent::seek( 'context-node' );
+			$this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_BODY;
+			$this->state->frameset_ok    = true;
+		}
 
-				return parent::seek( $actual_bookmark_name );
+		// When moving forwards, reparse the document until reaching the same location as the original bookmark.
+		if ( $bookmark_starts_at === $this->bookmarks[ $this->state->current_token->bookmark_name ]->start ) {
+			return true;
 		}
+
+		while ( $this->step() ) {
+			if ( $bookmark_starts_at === $this->bookmarks[ $this->state->current_token->bookmark_name ]->start ) {
+				return true;
+			}
+		}
+
+		return false;
 	}
 
 	/**
@@ -1005,6 +1459,18 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
 		return parent::set_bookmark( "_{$bookmark_name}" );
 	}
 
+	/**
+	 * Checks whether a bookmark with the given name exists.
+	 *
+	 * @since 6.5.0
+	 *
+	 * @param string $bookmark_name Name to identify a bookmark that potentially exists.
+	 * @return bool Whether that bookmark exists.
+	 */
+	public function has_bookmark( $bookmark_name ) {
+		return parent::has_bookmark( "_{$bookmark_name}" );
+	}
+
 	/*
 	 * HTML Parsing Algorithms
 	 */
@@ -1034,6 +1500,9 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
 	 */
 	private function generate_implied_end_tags( $except_for_this_element = null ) {
 		$elements_with_implied_end_tags = array(
+			'DD',
+			'DT',
+			'LI',
 			'P',
 		);
 
@@ -1059,6 +1528,9 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
 	 */
 	private function generate_implied_end_tags_thoroughly() {
 		$elements_with_implied_end_tags = array(
+			'DD',
+			'DT',
+			'LI',
 			'P',
 		);
 
@@ -1170,7 +1642,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
 
 			// > If formatting element is not in the stack of open elements, then this is a parse error; remove the element from the list, and return.
 			if ( ! $this->state->stack_of_open_elements->contains_node( $formatting_element ) ) {
-				$this->state->active_formatting_elements->remove_node( $formatting_element->bookmark_name );
+				$this->state->active_formatting_elements->remove_node( $formatting_element );
 				return;
 			}
 
@@ -1373,14 +1845,19 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
 		return (
 			'AREA' === $tag_name ||
 			'BASE' === $tag_name ||
+			'BASEFONT' === $tag_name || // Obsolete but still treated as void.
+			'BGSOUND' === $tag_name || // Obsolete but still treated as void.
 			'BR' === $tag_name ||
 			'COL' === $tag_name ||
 			'EMBED' === $tag_name ||
+			'FRAME' === $tag_name ||
 			'HR' === $tag_name ||
 			'IMG' === $tag_name ||
 			'INPUT' === $tag_name ||
+			'KEYGEN' === $tag_name || // Obsolete but still treated as void.
 			'LINK' === $tag_name ||
 			'META' === $tag_name ||
+			'PARAM' === $tag_name || // Obsolete but still treated as void.
 			'SOURCE' === $tag_name ||
 			'TRACK' === $tag_name ||
 			'WBR' === $tag_name
@@ -1410,6 +1887,15 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
 	const REPROCESS_CURRENT_NODE = 'reprocess-current-node';
 
 	/**
+	 * Indicates that the current HTML token should be processed without advancing the parser.
+	 *
+	 * @since 6.5.0
+	 *
+	 * @var string
+	 */
+	const PROCESS_CURRENT_NODE = 'process-current-node';
+
+	/**
 	 * Indicates that the parser encountered unsupported markup and has bailed.
 	 *
 	 * @since 6.4.0
diff --git a/wp-includes/html-api/class-wp-html-span.php b/wp-includes/html-api/class-wp-html-span.php
index 46227eb..b1ab865 100644
--- a/wp-includes/html-api/class-wp-html-span.php
+++ b/wp-includes/html-api/class-wp-html-span.php
@@ -18,6 +18,7 @@
  *
  * @access private
  * @since 6.2.0
+ * @since 6.5.0 Replaced `end` with `length` to more closely align with `substr()`.
  *
  * @see WP_HTML_Tag_Processor
  */
@@ -26,28 +27,30 @@ class WP_HTML_Span {
 	 * Byte offset into document where span begins.
 	 *
 	 * @since 6.2.0
+	 *
 	 * @var int
 	 */
 	public $start;
 
 	/**
-	 * Byte offset into document where span ends.
+	 * Byte length of this span.
+	 *
+	 * @since 6.5.0
 	 *
-	 * @since 6.2.0
 	 * @var int
 	 */
-	public $end;
+	public $length;
 
 	/**
 	 * Constructor.
 	 *
 	 * @since 6.2.0
 	 *
-	 * @param int $start Byte offset into document where replacement span begins.
-	 * @param int $end   Byte offset into document where replacement span ends.
+	 * @param int $start  Byte offset into document where replacement span begins.
+	 * @param int $length Byte length of span.
 	 */
-	public function __construct( $start, $end ) {
-		$this->start = $start;
-		$this->end   = $end;
+	public function __construct( $start, $length ) {
+		$this->start  = $start;
+		$this->length = $length;
 	}
 }
diff --git a/wp-includes/html-api/class-wp-html-tag-processor.php b/wp-includes/html-api/class-wp-html-tag-processor.php
index 0572c46..c540ea9 100644
--- a/wp-includes/html-api/class-wp-html-tag-processor.php
+++ b/wp-includes/html-api/class-wp-html-tag-processor.php
@@ -15,9 +15,6 @@
  *  - Prune the whitespace when removing classes/attributes: e.g. "a b c" -> "c" not " c".
  *    This would increase the size of the changes for some operations but leave more
  *    natural-looking output HTML.
- *  - Decode HTML character references within class names when matching. E.g. match having
- *    class `1<"2` needs to recognize `class="1&lt;&quot;2"`. Currently the Tag Processor
- *    will fail to find the right tag if the class name is encoded as such.
  *  - Properly decode HTML character references in `get_attribute()`. PHP's
  *    `html_entity_decode()` is wrong in a couple ways: it doesn't account for the
  *    no-ambiguous-ampersand rule, and it improperly handles the way semicolons may
@@ -107,6 +104,56 @@
  * given, it will return `true` (the only way to set `false` for an
  * attribute is to remove it).
  *
+ * #### When matching fails
+ *
+ * When `next_tag()` returns `false` it could mean different things:
+ *
+ *  - The requested tag wasn't found in the input document.
+ *  - The input document ended in the middle of an HTML syntax element.
+ *
+ * When a document ends in the middle of a syntax element it will pause
+ * the processor. This is to make it possible in the future to extend the
+ * input document and proceed - an important requirement for chunked
+ * streaming parsing of a document.
+ *
+ * Example:
+ *
+ *     $processor = new WP_HTML_Tag_Processor( 'This <div is="a" partial="token' );
+ *     false === $processor->next_tag();
+ *
+ * If a special element (see next section) is encountered but no closing tag
+ * is found it will count as an incomplete tag. The parser will pause as if
+ * the opening tag were incomplete.
+ *
+ * Example:
+ *
+ *     $processor = new WP_HTML_Tag_Processor( '<style>// there could be more styling to come' );
+ *     false === $processor->next_tag();
+ *
+ *     $processor = new WP_HTML_Tag_Processor( '<style>// this is everything</style><div>' );
+ *     true === $processor->next_tag( 'DIV' );
+ *
+ * #### Special elements
+ *
+ * Some HTML elements are handled in a special way; their start and end tags
+ * act like a void tag. These are special because their contents can't contain
+ * HTML markup. Everything inside these elements is handled in a special way
+ * and content that _appears_ like HTML tags inside of them isn't. There can
+ * be no nesting in these elements.
+ *
+ * In the following list, "raw text" means that all of the content in the HTML
+ * until the matching closing tag is treated verbatim without any replacements
+ * and without any parsing.
+ *
+ *  - IFRAME allows no content but requires a closing tag.
+ *  - NOEMBED (deprecated) content is raw text.
+ *  - NOFRAMES (deprecated) content is raw text.
+ *  - SCRIPT content is plaintext apart from legacy rules allowing `</script>` inside an HTML comment.
+ *  - STYLE content is raw text.
+ *  - TITLE content is plain text but character references are decoded.
+ *  - TEXTAREA content is plain text but character references are decoded.
+ *  - XMP (deprecated) content is raw text.
+ *
  * ### Modifying HTML attributes for a found tag
  *
  * Once you've found the start of an opening tag you can modify
@@ -200,6 +247,95 @@
  *         }
  *     }
  *
+ * ## Tokens and finer-grained processing.
+ *
+ * It's possible to scan through every lexical token in the
+ * HTML document using the `next_token()` function. This
+ * alternative form takes no argument and provides no built-in
+ * query syntax.
+ *
+ * Example:
+ *
+ *      $title = '(untitled)';
+ *      $text  = '';
+ *      while ( $processor->next_token() ) {
+ *          switch ( $processor->get_token_name() ) {
+ *              case '#text':
+ *                  $text .= $processor->get_modifiable_text();
+ *                  break;
+ *
+ *              case 'BR':
+ *                  $text .= "\n";
+ *                  break;
+ *
+ *              case 'TITLE':
+ *                  $title = $processor->get_modifiable_text();
+ *                  break;
+ *          }
+ *      }
+ *      return trim( "# {$title}\n\n{$text}" );
+ *
+ * ### Tokens and _modifiable text_.
+ *
+ * #### Special "atomic" HTML elements.
+ *
+ * Not all HTML elements are able to contain other elements inside of them.
+ * For instance, the contents inside a TITLE element are plaintext (except
+ * that character references like &amp; will be decoded). This means that
+ * if the string `<img>` appears inside a TITLE element, then it's not an
+ * image tag, but rather it's text describing an image tag. Likewise, the
+ * contents of a SCRIPT or STYLE element are handled entirely separately in
+ * a browser from the contents of other elements because they represent a
+ * different language than HTML.
+ *
+ * For these elements the Tag Processor treats the entire sequence as one,
+ * from the opening tag, including its contents, through its closing tag.
+ * This means that it's not possible to match the closing tag for a
+ * SCRIPT element unless it's unexpected; the Tag Processor already matched
+ * it when it found the opening tag.
+ *
+ * The inner contents of these elements are that element's _modifiable text_.
+ *
+ * The special elements are:
+ *  - `SCRIPT` whose contents are treated as raw plaintext but supports a legacy
+ *    style of including Javascript inside of HTML comments to avoid accidentally
+ *    closing the SCRIPT from inside a Javascript string. E.g. `console.log( '</script>' )`.
+ *  - `TITLE` and `TEXTAREA` whose contents are treated as plaintext and then any
+ *    character references are decoded. E.g. `1 &lt; 2 < 3` becomes `1 < 2 < 3`.
+ *  - `IFRAME`, `NOSCRIPT`, `NOEMBED`, `NOFRAMES`, `STYLE` whose contents are treated as
+ *    raw plaintext and left as-is. E.g. `1 &lt; 2 < 3` remains `1 &lt; 2 < 3`.
+ *
+ * #### Other tokens with modifiable text.
+ *
+ * There are also non-elements which are void/self-closing in nature and contain
+ * modifiable text that is part of that individual syntax token itself.
+ *
+ *  - `#text` nodes, whose entire token _is_ the modifiable text.
+ *  - HTML comments and tokens that become comments due to some syntax error. The
+ *    text for these tokens is the portion of the comment inside of the syntax.
+ *    E.g. for `<!-- comment -->` the text is `" comment "` (note the spaces are included).
+ *  - `CDATA` sections, whose text is the content inside of the section itself. E.g. for
+ *    `<![CDATA[some content]]>` the text is `"some content"` (with restrictions [1]).
+ *  - "Funky comments," which are a special case of invalid closing tags whose name is
+ *    invalid. The text for these nodes is the text that a browser would transform into
+ *    an HTML comment when parsing. E.g. for `</%post_author>` the text is `%post_author`.
+ *  - `DOCTYPE` declarations like `<!DOCTYPE html>` which have no closing tag.
+ *  - XML Processing instruction nodes like `<?wp __( "Like" ); ?>` (with restrictions [2]).
+ *  - The empty end tag `</>` which is ignored in the browser and DOM.
+ *
+ * [1]: There are no CDATA sections in HTML. When encountering `<![CDATA[`, everything
+ *      until the next `>` becomes a bogus HTML comment, meaning there can be no CDATA
+ *      section in an HTML document containing `>`. The Tag Processor will first find
+ *      all valid and bogus HTML comments, and then if the comment _would_ have been a
+ *      CDATA section _were they to exist_, it will indicate this as the type of comment.
+ *
+ * [2]: XML allows a broader range of characters in a processing instruction's target name
+ *      and disallows "xml" as a name, since it's special. The Tag Processor only recognizes
+ *      target names with an ASCII-representable subset of characters. It also exhibits the
+ *      same constraint as with CDATA sections, in that `>` cannot exist within the token
+ *      since Processing Instructions do not exist within HTML and their syntax transforms
+ *      into a bogus comment in the DOM.
+ *
  * ## Design and limitations
  *
  * The Tag Processor is designed to linearly scan HTML documents and tokenize
@@ -241,9 +377,40 @@
  * double-quoted strings, meaning that attributes on input with single-quoted or
  * unquoted values will appear in the output with double-quotes.
  *
+ * ### Scripting Flag
+ *
+ * The Tag Processor parses HTML with the "scripting flag" disabled. This means
+ * that it doesn't run any scripts while parsing the page. In a browser with
+ * JavaScript enabled, for example, the script can change the parse of the
+ * document as it loads. On the server, however, evaluating JavaScript is not
+ * only impractical, but also unwanted.
+ *
+ * Practically this means that the Tag Processor will descend into NOSCRIPT
+ * elements and process their child tags. Were the scripting flag enabled, such
+ * as in a typical browser, the contents of NOSCRIPT are skipped entirely.
+ *
+ * This allows the HTML API to process the content that will be presented in
+ * a browser when scripting is disabled, but it offers a different view of a
+ * page than most browser sessions will experience. E.g. the tags inside the
+ * NOSCRIPT disappear.
+ *
+ * ### Text Encoding
+ *
+ * The Tag Processor assumes that the input HTML document is encoded with a
+ * text encoding compatible with 7-bit ASCII's '<', '>', '&', ';', '/', '=',
+ * "'", '"', 'a' - 'z', 'A' - 'Z', and the whitespace characters ' ', tab,
+ * carriage-return, newline, and form-feed.
+ *
+ * In practice, this includes almost every single-byte encoding as well as
+ * UTF-8. Notably, however, it does not include UTF-16. If providing input
+ * that's incompatible, then convert the encoding beforehand.
+ *
  * @since 6.2.0
  * @since 6.2.1 Fix: Support for various invalid comments; attribute updates are case-insensitive.
  * @since 6.3.2 Fix: Skip HTML-like content inside rawtext elements such as STYLE.
+ * @since 6.5.0 Pauses processor when input ends in an incomplete syntax token.
+ *              Introduces "special" elements which act like void elements, e.g. TITLE, STYLE.
+ *              Allows scanning through all tokens and processing modifiable text, where applicable.
  */
 class WP_HTML_Tag_Processor {
 	/**
@@ -317,6 +484,51 @@ class WP_HTML_Tag_Processor {
 	private $stop_on_tag_closers;
 
 	/**
+	 * Specifies mode of operation of the parser at any given time.
+	 *
+	 * | State           | Meaning                                                              |
+	 * | ----------------|----------------------------------------------------------------------|
+	 * | *Ready*         | The parser is ready to run.                                          |
+	 * | *Complete*      | There is nothing left to parse.                                      |
+	 * | *Incomplete*    | The HTML ended in the middle of a token; nothing more can be parsed. |
+	 * | *Matched tag*   | Found an HTML tag; it's possible to modify its attributes.           |
+	 * | *Text node*     | Found a #text node; this is plaintext and modifiable.                |
+	 * | *CDATA node*    | Found a CDATA section; this is modifiable.                           |
+	 * | *Comment*       | Found a comment or bogus comment; this is modifiable.                |
+	 * | *Presumptuous*  | Found an empty tag closer: `</>`.                                    |
+	 * | *Funky comment* | Found a tag closer with an invalid tag name; this is modifiable.     |
+	 *
+	 * @since 6.5.0
+	 *
+	 * @see WP_HTML_Tag_Processor::STATE_READY
+	 * @see WP_HTML_Tag_Processor::STATE_COMPLETE
+	 * @see WP_HTML_Tag_Processor::STATE_INCOMPLETE_INPUT
+	 * @see WP_HTML_Tag_Processor::STATE_MATCHED_TAG
+	 * @see WP_HTML_Tag_Processor::STATE_TEXT_NODE
+	 * @see WP_HTML_Tag_Processor::STATE_CDATA_NODE
+	 * @see WP_HTML_Tag_Processor::STATE_COMMENT
+	 * @see WP_HTML_Tag_Processor::STATE_DOCTYPE
+	 * @see WP_HTML_Tag_Processor::STATE_PRESUMPTUOUS_TAG
+	 * @see WP_HTML_Tag_Processor::STATE_FUNKY_COMMENT
+	 *
+	 * @var string
+	 */
+	protected $parser_state = self::STATE_READY;
+
+	/**
+	 * What kind of syntax token became an HTML comment.
+	 *
+	 * Since there are many ways in which HTML syntax can create an HTML comment,
+	 * this indicates which of those caused it. This allows the Tag Processor to
+	 * represent more from the original input document than would appear in the DOM.
+	 *
+	 * @since 6.5.0
+	 *
+	 * @var string|null
+	 */
+	protected $comment_type = null;
+
+	/**
 	 * How many bytes from the original HTML document have been read and parsed.
 	 *
 	 * This value points to the latest byte offset in the input document which
@@ -329,6 +541,40 @@ class WP_HTML_Tag_Processor {
 	private $bytes_already_parsed = 0;
 
 	/**
+	 * Byte offset in input document where current token starts.
+	 *
+	 * Example:
+	 *
+	 *     <div id="test">...
+	 *     01234
+	 *     - token starts at 0
+	 *
+	 * @since 6.5.0
+	 *
+	 * @var int|null
+	 */
+	private $token_starts_at;
+
+	/**
+	 * Byte length of current token.
+	 *
+	 * Example:
+	 *
+	 *     <div id="test">...
+	 *     012345678901234
+	 *     - token length is 14 - 0 + 1 = 15
+	 *
+	 *     a <!-- comment --> is a token.
+	 *     0123456789 123456789 123456789
+	 *     - token length is 17 - 2 + 1 = 16
+	 *
+	 * @since 6.5.0
+	 *
+	 * @var int|null
+	 */
+	private $token_length;
+
+	/**
 	 * Byte offset in input document where current tag name starts.
 	 *
 	 * Example:
@@ -338,6 +584,7 @@ class WP_HTML_Tag_Processor {
 	 *      - tag name starts at 1
 	 *
 	 * @since 6.2.0
+	 *
 	 * @var int|null
 	 */
 	private $tag_name_starts_at;
@@ -352,24 +599,28 @@ class WP_HTML_Tag_Processor {
 	 *      --- tag name length is 3
 	 *
 	 * @since 6.2.0
+	 *
 	 * @var int|null
 	 */
 	private $tag_name_length;
 
 	/**
-	 * Byte offset in input document where current tag token ends.
+	 * Byte offset into input document where current modifiable text starts.
 	 *
-	 * Example:
+	 * @since 6.5.0
 	 *
-	 *     <div id="test">...
-	 *     0         1   |
-	 *     01234567890123456
-	 *      --- tag name ends at 14
+	 * @var int
+	 */
+	private $text_starts_at;
+
+	/**
+	 * Byte length of modifiable text.
 	 *
-	 * @since 6.2.0
-	 * @var int|null
+	 * @since 6.5.0
+	 *
+	 * @var int
 	 */
-	private $tag_ends_at;
+	private $text_length;
 
 	/**
 	 * Whether the current tag is an opening tag, e.g. <div>, or a closing tag, e.g. </div>.
@@ -388,14 +639,14 @@ class WP_HTML_Tag_Processor {
 	 *     // <div id="test-4" class=outline title="data:text/plain;base64=asdk3nk1j3fo8">
 	 *     //                 ^ parsing will continue from this point.
 	 *     $this->attributes = array(
-	 *         'id' => new WP_HTML_Attribute_Match( 'id', null, 6, 17 )
+	 *         'id' => new WP_HTML_Attribute_Token( 'id', 9, 6, 5, 11, false )
 	 *     );
 	 *
 	 *     // When picking up parsing again, or when asking to find the
 	 *     // `class` attribute we will continue and add to this array.
 	 *     $this->attributes = array(
-	 *         'id'    => new WP_HTML_Attribute_Match( 'id', null, 6, 17 ),
-	 *         'class' => new WP_HTML_Attribute_Match( 'class', 'outline', 18, 32 )
+	 *         'id'    => new WP_HTML_Attribute_Token( 'id', 9, 6, 5, 11, false ),
+	 *         'class' => new WP_HTML_Attribute_Token( 'class', 23, 7, 17, 13, false )
 	 *     );
 	 *
 	 *     // Note that only the `class` attribute value is stored in the index.
@@ -484,9 +735,9 @@ class WP_HTML_Tag_Processor {
 	 *
 	 *     // Replace an attribute stored with a new value, indices
 	 *     // sourced from the lazily-parsed HTML recognizer.
-	 *     $start = $attributes['src']->start;
-	 *     $end   = $attributes['src']->end;
-	 *     $modifications[] = new WP_HTML_Text_Replacement( $start, $end, $new_value );
+	 *     $start  = $attributes['src']->start;
+	 *     $length = $attributes['src']->length;
+	 *     $modifications[] = new WP_HTML_Text_Replacement( $start, $length, $new_value );
 	 *
 	 *     // Correspondingly, something like this will appear in this array.
 	 *     $lexical_updates = array(
@@ -523,6 +774,7 @@ class WP_HTML_Tag_Processor {
 	 * Finds the next tag matching the $query.
 	 *
 	 * @since 6.2.0
+	 * @since 6.5.0 No longer processes incomplete tokens at end of document; pauses the processor at start of token.
 	 *
 	 * @param array|string|null $query {
 	 *     Optional. Which tag name to find, having which class, etc. Default is to find any tag.
@@ -541,90 +793,253 @@ class WP_HTML_Tag_Processor {
 		$already_found = 0;
 
 		do {
-			if ( $this->bytes_already_parsed >= strlen( $this->html ) ) {
+			if ( false === $this->next_token() ) {
 				return false;
 			}
 
-			// Find the next tag if it exists.
-			if ( false === $this->parse_next_tag() ) {
-				$this->bytes_already_parsed = strlen( $this->html );
-
-				return false;
-			}
-
-			// Parse all of its attributes.
-			while ( $this->parse_next_attribute() ) {
+			if ( self::STATE_MATCHED_TAG !== $this->parser_state ) {
 				continue;
 			}
 
-			// Ensure that the tag closes before the end of the document.
-			if ( $this->bytes_already_parsed >= strlen( $this->html ) ) {
-				return false;
+			if ( $this->matches() ) {
+				++$already_found;
 			}
+		} while ( $already_found < $this->sought_match_offset );
 
-			$tag_ends_at = strpos( $this->html, '>', $this->bytes_already_parsed );
-			if ( false === $tag_ends_at ) {
-				return false;
-			}
-			$this->tag_ends_at          = $tag_ends_at;
-			$this->bytes_already_parsed = $tag_ends_at;
+		return true;
+	}
 
-			// Finally, check if the parsed tag and its attributes match the search query.
-			if ( $this->matches() ) {
-				++$already_found;
+	/**
+	 * Finds the next token in the HTML document.
+	 *
+	 * An HTML document can be viewed as a stream of tokens,
+	 * where tokens are things like HTML tags, HTML comments,
+	 * text nodes, etc. This method finds the next token in
+	 * the HTML document and returns whether it found one.
+	 *
+	 * If it starts parsing a token and reaches the end of the
+	 * document then it will seek to the start of the last
+	 * token and pause, returning `false` to indicate that it
+	 * failed to find a complete token.
+	 *
+	 * Possible token types, based on the HTML specification:
+	 *
+	 *  - an HTML tag, whether opening, closing, or void.
+	 *  - a text node - the plaintext inside tags.
+	 *  - an HTML comment.
+	 *  - a DOCTYPE declaration.
+	 *  - a processing instruction, e.g. `<?xml version="1.0" ?>`.
+	 *
+	 * The Tag Processor currently only supports the tag token.
+	 *
+	 * @since 6.5.0
+	 *
+	 * @return bool Whether a token was parsed.
+	 */
+	public function next_token() {
+		return $this->base_class_next_token();
+	}
+
+	/**
+	 * Internal method which finds the next token in the HTML document.
+	 *
+	 * This method is a private internal function which implements the logic for
+	 * finding the next token in a document. It exists so that the parser can update
+	 * its state without affecting the location of the cursor in the document and
+	 * without triggering subclass methods for things like `next_token()`, e.g. when
+	 * applying patches before searching for the next token.
+	 *
+	 * @since 6.5.0
+	 *
+	 * @access private
+	 *
+	 * @return bool Whether a token was parsed.
+	 */
+	private function base_class_next_token() {
+		$was_at = $this->bytes_already_parsed;
+		$this->after_tag();
+
+		// Don't proceed if there's nothing more to scan.
+		if (
+			self::STATE_COMPLETE === $this->parser_state ||
+			self::STATE_INCOMPLETE_INPUT === $this->parser_state
+		) {
+			return false;
+		}
+
+		/*
+		 * The next step in the parsing loop determines the parsing state;
+		 * clear it so that state doesn't linger from the previous step.
+		 */
+		$this->parser_state = self::STATE_READY;
+
+		if ( $this->bytes_already_parsed >= strlen( $this->html ) ) {
+			$this->parser_state = self::STATE_COMPLETE;
+			return false;
+		}
+
+		// Find the next tag if it exists.
+		if ( false === $this->parse_next_tag() ) {
+			if ( self::STATE_INCOMPLETE_INPUT === $this->parser_state ) {
+				$this->bytes_already_parsed = $was_at;
 			}
 
+			return false;
+		}
+
+		/*
+		 * For legacy reasons the rest of this function handles tags and their
+		 * attributes. If the processor has reached the end of the document
+		 * or if it matched any other token then it should return here to avoid
+		 * attempting to process tag-specific syntax.
+		 */
+		if (
+			self::STATE_INCOMPLETE_INPUT !== $this->parser_state &&
+			self::STATE_COMPLETE !== $this->parser_state &&
+			self::STATE_MATCHED_TAG !== $this->parser_state
+		) {
+			return true;
+		}
+
+		// Parse all of its attributes.
+		while ( $this->parse_next_attribute() ) {
+			continue;
+		}
+
+		// Ensure that the tag closes before the end of the document.
+		if (
+			self::STATE_INCOMPLETE_INPUT === $this->parser_state ||
+			$this->bytes_already_parsed >= strlen( $this->html )
+		) {
+			// Does this appropriately clear state (parsed attributes)?
+			$this->parser_state         = self::STATE_INCOMPLETE_INPUT;
+			$this->bytes_already_parsed = $was_at;
+
+			return false;
+		}
+
+		$tag_ends_at = strpos( $this->html, '>', $this->bytes_already_parsed );
+		if ( false === $tag_ends_at ) {
+			$this->parser_state         = self::STATE_INCOMPLETE_INPUT;
+			$this->bytes_already_parsed = $was_at;
+
+			return false;
+		}
+		$this->parser_state         = self::STATE_MATCHED_TAG;
+		$this->token_length         = $tag_ends_at - $this->token_starts_at;
+		$this->bytes_already_parsed = $tag_ends_at + 1;
+
+		/*
+		 * For non-DATA sections which might contain text that looks like HTML tags but
+		 * isn't, scan with the appropriate alternative mode. Looking at the first letter
+		 * of the tag name as a pre-check avoids a string allocation when it's not needed.
+		 */
+		$t = $this->html[ $this->tag_name_starts_at ];
+		if (
+			$this->is_closing_tag ||
+			! (
+				'i' === $t || 'I' === $t ||
+				'n' === $t || 'N' === $t ||
+				's' === $t || 'S' === $t ||
+				't' === $t || 'T' === $t ||
+				'x' === $t || 'X' === $t
+			)
+		) {
+			return true;
+		}
+
+		$tag_name = $this->get_tag();
+
+		/*
+		 * Preserve the opening tag pointers, as these will be overwritten
+		 * when finding the closing tag. They will be reset after finding
+		 * the closing tag to point to the opening of the special atomic
+		 * tag sequence.
+		 */
+		$tag_name_starts_at   = $this->tag_name_starts_at;
+		$tag_name_length      = $this->tag_name_length;
+		$tag_ends_at          = $this->token_starts_at + $this->token_length;
+		$attributes           = $this->attributes;
+		$duplicate_attributes = $this->duplicate_attributes;
+
+		// Find the closing tag if necessary.
+		$found_closer = false;
+		switch ( $tag_name ) {
+			case 'SCRIPT':
+				$found_closer = $this->skip_script_data();
+				break;
+
+			case 'TEXTAREA':
+			case 'TITLE':
+				$found_closer = $this->skip_rcdata( $tag_name );
+				break;
+
 			/*
-			 * For non-DATA sections which might contain text that looks like HTML tags but
-			 * isn't, scan with the appropriate alternative mode. Looking at the first letter
-			 * of the tag name as a pre-check avoids a string allocation when it's not needed.
+			 * In the browser this list would include the NOSCRIPT element,
+			 * but the Tag Processor is an environment with the scripting
+			 * flag disabled, meaning that it needs to descend into the
+			 * NOSCRIPT element to be able to properly process what will be
+			 * sent to a browser.
+			 *
+			 * Note that this rule makes HTML5 syntax incompatible with XML,
+			 * because the parsing of this token depends on client application.
+			 * The NOSCRIPT element cannot be represented in the XHTML syntax.
 			 */
-			$t = $this->html[ $this->tag_name_starts_at ];
-			if (
-				! $this->is_closing_tag &&
-				(
-					'i' === $t || 'I' === $t ||
-					'n' === $t || 'N' === $t ||
-					's' === $t || 'S' === $t ||
-					't' === $t || 'T' === $t
-				) ) {
-				$tag_name = $this->get_tag();
-
-				if ( 'SCRIPT' === $tag_name && ! $this->skip_script_data() ) {
-					$this->bytes_already_parsed = strlen( $this->html );
-					return false;
-				} elseif (
-					( 'TEXTAREA' === $tag_name || 'TITLE' === $tag_name ) &&
-					! $this->skip_rcdata( $tag_name )
-				) {
-					$this->bytes_already_parsed = strlen( $this->html );
-					return false;
-				} elseif (
-					(
-						'IFRAME' === $tag_name ||
-						'NOEMBED' === $tag_name ||
-						'NOFRAMES' === $tag_name ||
-						'NOSCRIPT' === $tag_name ||
-						'STYLE' === $tag_name
-					) &&
-					! $this->skip_rawtext( $tag_name )
-				) {
-					/*
-					 * "XMP" should be here too but its rules are more complicated and require the
-					 * complexity of the HTML Processor (it needs to close out any open P element,
-					 * meaning it can't be skipped here or else the HTML Processor will lose its
-					 * place). For now, it can be ignored as it's a rare HTML tag in practice and
-					 * any normative HTML should be using PRE instead.
-					 */
-					$this->bytes_already_parsed = strlen( $this->html );
-					return false;
-				}
-			}
-		} while ( $already_found < $this->sought_match_offset );
+			case 'IFRAME':
+			case 'NOEMBED':
+			case 'NOFRAMES':
+			case 'STYLE':
+			case 'XMP':
+				$found_closer = $this->skip_rawtext( $tag_name );
+				break;
+
+			// No other tags should be treated in their entirety here.
+			default:
+				return true;
+		}
+
+		if ( ! $found_closer ) {
+			$this->parser_state         = self::STATE_INCOMPLETE_INPUT;
+			$this->bytes_already_parsed = $was_at;
+			return false;
+		}
+
+		/*
+		 * The values here look like they reference the opening tag but they reference
+		 * the closing tag instead. This is why the opening tag values were stored
+		 * above in a variable. It reads confusingly here, but that's because the
+		 * functions that skip the contents have moved all the internal cursors past
+		 * the inner content of the tag.
+		 */
+		$this->token_starts_at      = $was_at;
+		$this->token_length         = $this->bytes_already_parsed - $this->token_starts_at;
+		$this->text_starts_at       = $tag_ends_at + 1;
+		$this->text_length          = $this->tag_name_starts_at - $this->text_starts_at;
+		$this->tag_name_starts_at   = $tag_name_starts_at;
+		$this->tag_name_length      = $tag_name_length;
+		$this->attributes           = $attributes;
+		$this->duplicate_attributes = $duplicate_attributes;
 
 		return true;
 	}
 
+	/**
+	 * Whether the processor paused because the input HTML document ended
+	 * in the middle of a syntax element, such as in the middle of a tag.
+	 *
+	 * Example:
+	 *
+	 *     $processor = new WP_HTML_Tag_Processor( '<input type="text" value="Th' );
+	 *     false      === $processor->next_tag();
+	 *     true       === $processor->paused_at_incomplete_token();
+	 *
+	 * @since 6.5.0
+	 *
+	 * @return bool Whether the parse paused at the start of an incomplete token.
+	 */
+	public function paused_at_incomplete_token() {
+		return self::STATE_INCOMPLETE_INPUT === $this->parser_state;
+	}
 
 	/**
 	 * Generator for a foreach loop to step through each class name for the matched tag.
@@ -643,6 +1058,10 @@ class WP_HTML_Tag_Processor {
 	 * @since 6.4.0
 	 */
 	public function class_list() {
+		if ( self::STATE_MATCHED_TAG !== $this->parser_state ) {
+			return;
+		}
+
 		/** @var string $class contains the string value of the class attribute, with character references decoded. */
 		$class = $this->get_attribute( 'class' );
 
@@ -698,7 +1117,7 @@ class WP_HTML_Tag_Processor {
 	 * @return bool|null Whether the matched tag contains the given class name, or null if not matched.
 	 */
 	public function has_class( $wanted_class ) {
-		if ( ! $this->tag_name_starts_at ) {
+		if ( self::STATE_MATCHED_TAG !== $this->parser_state ) {
 			return null;
 		}
 
@@ -795,7 +1214,11 @@ class WP_HTML_Tag_Processor {
 	 * @return bool Whether the bookmark was successfully created.
 	 */
 	public function set_bookmark( $name ) {
-		if ( null === $this->tag_name_starts_at ) {
+		// It only makes sense to set a bookmark if the parser has paused on a concrete token.
+		if (
+			self::STATE_COMPLETE === $this->parser_state ||
+			self::STATE_INCOMPLETE_INPUT === $this->parser_state
+		) {
 			return false;
 		}
 
@@ -808,10 +1231,7 @@ class WP_HTML_Tag_Processor {
 			return false;
 		}
 
-		$this->bookmarks[ $name ] = new WP_HTML_Span(
-			$this->tag_name_starts_at - ( $this->is_closing_tag ? 2 : 1 ),
-			$this->tag_ends_at
-		);
+		$this->bookmarks[ $name ] = new WP_HTML_Span( $this->token_starts_at, $this->token_length );
 
 		return true;
 	}
@@ -873,16 +1293,15 @@ class WP_HTML_Tag_Processor {
 		$at = $this->bytes_already_parsed;
 
 		while ( false !== $at && $at < $doc_length ) {
-			$at = strpos( $this->html, '</', $at );
+			$at                       = strpos( $this->html, '</', $at );
+			$this->tag_name_starts_at = $at;
 
-			// If there is no possible tag closer then fail.
+			// Fail if there is no possible tag closer.
 			if ( false === $at || ( $at + $tag_length ) >= $doc_length ) {
-				$this->bytes_already_parsed = $doc_length;
 				return false;
 			}
 
-			$closer_potentially_starts_at = $at;
-			$at                          += 2;
+			$at += 2;
 
 			/*
 			 * Find a case-insensitive match to the tag name.
@@ -905,6 +1324,10 @@ class WP_HTML_Tag_Processor {
 			$at                        += $tag_length;
 			$this->bytes_already_parsed = $at;
 
+			if ( $at >= strlen( $html ) ) {
+				return false;
+			}
+
 			/*
 			 * Ensure that the tag name terminates to avoid matching on
 			 * substrings of a longer tag name. For example, the sequence
@@ -919,13 +1342,23 @@ class WP_HTML_Tag_Processor {
 			while ( $this->parse_next_attribute() ) {
 				continue;
 			}
+
 			$at = $this->bytes_already_parsed;
 			if ( $at >= strlen( $this->html ) ) {
 				return false;
 			}
 
-			if ( '>' === $html[ $at ] || '/' === $html[ $at ] ) {
-				$this->bytes_already_parsed = $closer_potentially_starts_at;
+			if ( '>' === $html[ $at ] ) {
+				$this->bytes_already_parsed = $at + 1;
+				return true;
+			}
+
+			if ( $at + 1 >= strlen( $this->html ) ) {
+				return false;
+			}
+
+			if ( '/' === $html[ $at ] && '>' === $html[ $at + 1 ] ) {
+				$this->bytes_already_parsed = $at + 2;
 				return true;
 			}
 		}
@@ -1047,6 +1480,7 @@ class WP_HTML_Tag_Processor {
 
 			if ( $is_closing ) {
 				$this->bytes_already_parsed = $closer_potentially_starts_at;
+				$this->tag_name_starts_at   = $closer_potentially_starts_at;
 				if ( $this->bytes_already_parsed >= $doc_length ) {
 					return false;
 				}
@@ -1055,8 +1489,14 @@ class WP_HTML_Tag_Processor {
 					continue;
 				}
 
+				if ( $this->bytes_already_parsed >= $doc_length ) {
+					$this->parser_state = self::STATE_INCOMPLETE_INPUT;
+
+					return false;
+				}
+
 				if ( '>' === $html[ $this->bytes_already_parsed ] ) {
-					$this->bytes_already_parsed = $closer_potentially_starts_at;
+					++$this->bytes_already_parsed;
 					return true;
 				}
 			}
@@ -1085,15 +1525,66 @@ class WP_HTML_Tag_Processor {
 
 		$html       = $this->html;
 		$doc_length = strlen( $html );
-		$at         = $this->bytes_already_parsed;
+		$was_at     = $this->bytes_already_parsed;
+		$at         = $was_at;
 
 		while ( false !== $at && $at < $doc_length ) {
 			$at = strpos( $html, '<', $at );
+
+			/*
+			 * This does not imply an incomplete parse; it indicates that there
+			 * can be nothing left in the document other than a #text node.
+			 */
 			if ( false === $at ) {
-				return false;
+				$this->parser_state         = self::STATE_TEXT_NODE;
+				$this->token_starts_at      = $was_at;
+				$this->token_length         = strlen( $html ) - $was_at;
+				$this->text_starts_at       = $was_at;
+				$this->text_length          = $this->token_length;
+				$this->bytes_already_parsed = strlen( $html );
+				return true;
+			}
+
+			if ( $at > $was_at ) {
+				/*
+				 * A "<" normally starts a new HTML tag or syntax token, but in cases where the
+				 * following character can't produce a valid token, the "<" is instead treated
+				 * as plaintext and the parser should skip over it. This avoids a problem when
+				 * following earlier practices of typing emoji with text, e.g. "<3". This
+				 * should be a heart, not a tag. It's supposed to be rendered, not hidden.
+				 *
+				 * At this point the parser checks if this is one of those cases and if it is
+				 * will continue searching for the next "<" in search of a token boundary.
+				 *
+				 * @see https://html.spec.whatwg.org/#tag-open-state
+				 */
+				if ( strlen( $html ) > $at + 1 ) {
+					$next_character  = $html[ $at + 1 ];
+					$at_another_node = (
+						'!' === $next_character ||
+						'/' === $next_character ||
+						'?' === $next_character ||
+						( 'A' <= $next_character && $next_character <= 'Z' ) ||
+						( 'a' <= $next_character && $next_character <= 'z' )
+					);
+					if ( ! $at_another_node ) {
+						++$at;
+						continue;
+					}
+				}
+
+				$this->parser_state         = self::STATE_TEXT_NODE;
+				$this->token_starts_at      = $was_at;
+				$this->token_length         = $at - $was_at;
+				$this->text_starts_at       = $was_at;
+				$this->text_length          = $this->token_length;
+				$this->bytes_already_parsed = $at;
+				return true;
 			}
 
-			if ( '/' === $this->html[ $at + 1 ] ) {
+			$this->token_starts_at = $at;
+
+			if ( $at + 1 < $doc_length && '/' === $this->html[ $at + 1 ] ) {
 				$this->is_closing_tag = true;
 				++$at;
 			} else {
@@ -1117,8 +1608,9 @@ class WP_HTML_Tag_Processor {
 			$tag_name_prefix_length = strspn( $html, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ', $at + 1 );
 			if ( $tag_name_prefix_length > 0 ) {
 				++$at;
-				$this->tag_name_length      = $tag_name_prefix_length + strcspn( $html, " \t\f\r\n/>", $at + $tag_name_prefix_length );
+				$this->parser_state         = self::STATE_MATCHED_TAG;
 				$this->tag_name_starts_at   = $at;
+				$this->tag_name_length      = $tag_name_prefix_length + strcspn( $html, " \t\f\r\n/>", $at + $tag_name_prefix_length );
 				$this->bytes_already_parsed = $at + $this->tag_name_length;
 				return true;
 			}
@@ -1127,35 +1619,58 @@ class WP_HTML_Tag_Processor {
 			 * Abort if no tag is found before the end of
 			 * the document. There is nothing left to parse.
 			 */
-			if ( $at + 1 >= strlen( $html ) ) {
+			if ( $at + 1 >= $doc_length ) {
+				$this->parser_state = self::STATE_INCOMPLETE_INPUT;
+
 				return false;
 			}
 
 			/*
-			 * <! transitions to markup declaration open state
+			 * `<!` transitions to markup declaration open state
 			 * https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
 			 */
 			if ( '!' === $html[ $at + 1 ] ) {
 				/*
-				 * <!-- transitions to a bogus comment state – skip to the nearest -->
+				 * `<!--` transitions to a comment state – apply further comment rules.
 				 * https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
 				 */
 				if (
-					strlen( $html ) > $at + 3 &&
+					$doc_length > $at + 3 &&
 					'-' === $html[ $at + 2 ] &&
 					'-' === $html[ $at + 3 ]
 				) {
 					$closer_at = $at + 4;
 					// If it's not possible to close the comment then there is nothing more to scan.
-					if ( strlen( $html ) <= $closer_at ) {
+					if ( $doc_length <= $closer_at ) {
+						$this->parser_state = self::STATE_INCOMPLETE_INPUT;
+
 						return false;
 					}
 
 					// Abruptly-closed empty comments are a sequence of dashes followed by `>`.
 					$span_of_dashes = strspn( $html, '-', $closer_at );
 					if ( '>' === $html[ $closer_at + $span_of_dashes ] ) {
-						$at = $closer_at + $span_of_dashes + 1;
-						continue;
+						/*
+						 * @todo When implementing `set_modifiable_text()` ensure that updates to this token
+						 *       don't break the syntax for short comments, e.g. `<!--->`. Unlike other comment
+						 *       and bogus comment syntax, these leave no clear insertion point for text and
+						 *       they need to be modified specially in order to contain text. E.g. to store
+						 *       `?` as the modifiable text, the `<!--->` needs to become `<!--?-->`, which
+						 *       involves inserting an additional `-` into the token after the modifiable text.
+						 */
+						$this->parser_state = self::STATE_COMMENT;
+						$this->comment_type = self::COMMENT_AS_ABRUPTLY_CLOSED_COMMENT;
+						$this->token_length = $closer_at + $span_of_dashes + 1 - $this->token_starts_at;
+
+						// Only provide modifiable text if the token is long enough to contain it.
+						if ( $span_of_dashes >= 2 ) {
+							$this->comment_type   = self::COMMENT_AS_HTML_COMMENT;
+							$this->text_starts_at = $this->token_starts_at + 4;
+							$this->text_length    = $span_of_dashes - 2;
+						}
+
+						$this->bytes_already_parsed = $closer_at + $span_of_dashes + 1;
+						return true;
 					}
 
 					/*
@@ -1165,55 +1680,47 @@ class WP_HTML_Tag_Processor {
 					 * See https://html.spec.whatwg.org/#parse-error-incorrectly-closed-comment
 					 */
 					--$closer_at; // Pre-increment inside condition below reduces risk of accidental infinite looping.
-					while ( ++$closer_at < strlen( $html ) ) {
+					while ( ++$closer_at < $doc_length ) {
 						$closer_at = strpos( $html, '--', $closer_at );
 						if ( false === $closer_at ) {
+							$this->parser_state = self::STATE_INCOMPLETE_INPUT;
+
 							return false;
 						}
 
-						if ( $closer_at + 2 < strlen( $html ) && '>' === $html[ $closer_at + 2 ] ) {
-							$at = $closer_at + 3;
-							continue 2;
+						if ( $closer_at + 2 < $doc_length && '>' === $html[ $closer_at + 2 ] ) {
+							$this->parser_state         = self::STATE_COMMENT;
+							$this->comment_type         = self::COMMENT_AS_HTML_COMMENT;
+							$this->token_length         = $closer_at + 3 - $this->token_starts_at;
+							$this->text_starts_at       = $this->token_starts_at + 4;
+							$this->text_length          = $closer_at - $this->text_starts_at;
+							$this->bytes_already_parsed = $closer_at + 3;
+							return true;
 						}
 
-						if ( $closer_at + 3 < strlen( $html ) && '!' === $html[ $closer_at + 2 ] && '>' === $html[ $closer_at + 3 ] ) {
-							$at = $closer_at + 4;
-							continue 2;
+						if (
+							$closer_at + 3 < $doc_length &&
+							'!' === $html[ $closer_at + 2 ] &&
+							'>' === $html[ $closer_at + 3 ]
+						) {
+							$this->parser_state         = self::STATE_COMMENT;
+							$this->comment_type         = self::COMMENT_AS_HTML_COMMENT;
+							$this->token_length         = $closer_at + 4 - $this->token_starts_at;
+							$this->text_starts_at       = $this->token_starts_at + 4;
+							$this->text_length          = $closer_at - $this->text_starts_at;
+							$this->bytes_already_parsed = $closer_at + 4;
+							return true;
 						}
 					}
 				}
 
 				/*
-				 * <![CDATA[ transitions to CDATA section state – skip to the nearest ]]>
-				 * The CDATA is case-sensitive.
-				 * https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
-				 */
-				if (
-					strlen( $html ) > $at + 8 &&
-					'[' === $html[ $at + 2 ] &&
-					'C' === $html[ $at + 3 ] &&
-					'D' === $html[ $at + 4 ] &&
-					'A' === $html[ $at + 5 ] &&
-					'T' === $html[ $at + 6 ] &&
-					'A' === $html[ $at + 7 ] &&
-					'[' === $html[ $at + 8 ]
-				) {
-					$closer_at = strpos( $html, ']]>', $at + 9 );
-					if ( false === $closer_at ) {
-						return false;
-					}
-
-					$at = $closer_at + 3;
-					continue;
-				}
-
-				/*
-				 * <!DOCTYPE transitions to DOCTYPE state – skip to the nearest >
+				 * `<!DOCTYPE` transitions to DOCTYPE state – skip to the nearest >
 				 * These are ASCII-case-insensitive.
 				 * https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
 				 */
 				if (
-					strlen( $html ) > $at + 8 &&
+					$doc_length > $at + 8 &&
 					( 'D' === $html[ $at + 2 ] || 'd' === $html[ $at + 2 ] ) &&
 					( 'O' === $html[ $at + 3 ] || 'o' === $html[ $at + 3 ] ) &&
 					( 'C' === $html[ $at + 4 ] || 'c' === $html[ $at + 4 ] ) &&
@@ -1224,59 +1731,179 @@ class WP_HTML_Tag_Processor {
 				) {
 					$closer_at = strpos( $html, '>', $at + 9 );
 					if ( false === $closer_at ) {
+						$this->parser_state = self::STATE_INCOMPLETE_INPUT;
+
 						return false;
 					}
 
-					$at = $closer_at + 1;
-					continue;
+					$this->parser_state         = self::STATE_DOCTYPE;
+					$this->token_length         = $closer_at + 1 - $this->token_starts_at;
+					$this->text_starts_at       = $this->token_starts_at + 9;
+					$this->text_length          = $closer_at - $this->text_starts_at;
+					$this->bytes_already_parsed = $closer_at + 1;
+					return true;
 				}
 
 				/*
 				 * Anything else here is an incorrectly-opened comment and transitions
-				 * to the bogus comment state - skip to the nearest >.
+				 * to the bogus comment state - skip to the nearest >. If no closer is
+				 * found then the HTML was truncated inside the markup declaration.
 				 */
-				$at = strpos( $html, '>', $at + 1 );
-				continue;
+				$closer_at = strpos( $html, '>', $at + 1 );
+				if ( false === $closer_at ) {
+					$this->parser_state = self::STATE_INCOMPLETE_INPUT;
+
+					return false;
+				}
+
+				$this->parser_state         = self::STATE_COMMENT;
+				$this->comment_type         = self::COMMENT_AS_INVALID_HTML;
+				$this->token_length         = $closer_at + 1 - $this->token_starts_at;
+				$this->text_starts_at       = $this->token_starts_at + 2;
+				$this->text_length          = $closer_at - $this->text_starts_at;
+				$this->bytes_already_parsed = $closer_at + 1;
+
+				/*
+				 * Identify nodes that would be CDATA if HTML had CDATA sections.
+				 *
+				 * This section must occur after identifying the bogus comment end
+				 * because in an HTML parser it will span to the nearest `>`, even
+				 * if there's no `]]>` as would be required in an XML document. It
+				 * is therefore not possible to parse a CDATA section containing
+				 * a `>` in the HTML syntax.
+				 *
+				 * Inside foreign elements there is a discrepancy between browsers
+				 * and the specification on this.
+				 *
+				 * @todo Track whether the Tag Processor is inside a foreign element
+				 *       and require the proper closing `]]>` in those cases.
+				 */
+				if (
+					$this->token_length >= 10 &&
+					'[' === $html[ $this->token_starts_at + 2 ] &&
+					'C' === $html[ $this->token_starts_at + 3 ] &&
+					'D' === $html[ $this->token_starts_at + 4 ] &&
+					'A' === $html[ $this->token_starts_at + 5 ] &&
+					'T' === $html[ $this->token_starts_at + 6 ] &&
+					'A' === $html[ $this->token_starts_at + 7 ] &&
+					'[' === $html[ $this->token_starts_at + 8 ] &&
+					']' === $html[ $closer_at - 1 ] &&
+					']' === $html[ $closer_at - 2 ]
+				) {
+					$this->parser_state    = self::STATE_COMMENT;
+					$this->comment_type    = self::COMMENT_AS_CDATA_LOOKALIKE;
+					$this->text_starts_at += 7;
+					$this->text_length    -= 9;
+				}
+
+				return true;
 			}
 
 			/*
 			 * </> is a missing end tag name, which is ignored.
 			 *
+			 * This was also known as the "presumptuous empty tag"
+			 * in early discussions as it was proposed to close
+			 * the nearest previous opening tag.
+			 *
 			 * See https://html.spec.whatwg.org/#parse-error-missing-end-tag-name
 			 */
 			if ( '>' === $html[ $at + 1 ] ) {
-				++$at;
-				continue;
+				$this->parser_state         = self::STATE_PRESUMPTUOUS_TAG;
+				$this->token_length         = $at + 2 - $this->token_starts_at;
+				$this->bytes_already_parsed = $at + 2;
+				return true;
 			}
 
 			/*
-			 * <? transitions to a bogus comment state – skip to the nearest >
+			 * `<?` transitions to a bogus comment state – skip to the nearest >
 			 * See https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
 			 */
 			if ( '?' === $html[ $at + 1 ] ) {
 				$closer_at = strpos( $html, '>', $at + 2 );
 				if ( false === $closer_at ) {
+					$this->parser_state = self::STATE_INCOMPLETE_INPUT;
+
 					return false;
 				}
 
-				$at = $closer_at + 1;
-				continue;
+				$this->parser_state         = self::STATE_COMMENT;
+				$this->comment_type         = self::COMMENT_AS_INVALID_HTML;
+				$this->token_length         = $closer_at + 1 - $this->token_starts_at;
+				$this->text_starts_at       = $this->token_starts_at + 2;
+				$this->text_length          = $closer_at - $this->text_starts_at;
+				$this->bytes_already_parsed = $closer_at + 1;
+
+				/*
+				 * Identify a Processing Instruction node were HTML to have them.
+				 *
+				 * This section must occur after identifying the bogus comment end
+				 * because in an HTML parser it will span to the nearest `>`, even
+				 * if there's no `?>` as would be required in an XML document. It
+				 * is therefore not possible to parse a Processing Instruction node
+				 * containing a `>` in the HTML syntax.
+				 *
+				 * XML allows for more target names, but this code only identifies
+				 * those with ASCII-representable target names. This means that it
+				 * may identify some Processing Instruction nodes as bogus comments,
+				 * but it will not misinterpret the HTML structure. By limiting the
+				 * identification to these target names the Tag Processor can avoid
+				 * the need to start parsing UTF-8 sequences.
+				 *
+				 * > NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] |
+				 *                     [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] |
+				 *                     [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] |
+				 *                     [#x10000-#xEFFFF]
+				 * > NameChar      ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]
+				 *
+				 * @see https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PITarget
+				 */
+				if ( $this->token_length >= 5 && '?' === $html[ $closer_at - 1 ] ) {
+					$comment_text     = substr( $html, $this->token_starts_at + 2, $this->token_length - 4 );
+					$pi_target_length = strspn( $comment_text, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ:_' );
+
+					if ( 0 < $pi_target_length ) {
+						$pi_target_length += strspn( $comment_text, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789:_-.', $pi_target_length );
+
+						$this->comment_type       = self::COMMENT_AS_PI_NODE_LOOKALIKE;
+						$this->tag_name_starts_at = $this->token_starts_at + 2;
+						$this->tag_name_length    = $pi_target_length;
+						$this->text_starts_at    += $pi_target_length;
+						$this->text_length       -= $pi_target_length + 1;
+					}
+				}
+
+				return true;
 			}
 
 			/*
 			 * If a non-alpha starts the tag name in a tag closer it's a comment.
 			 * Find the first `>`, which closes the comment.
 			 *
+			 * This parser classifies these particular comments as special "funky comments"
+			 * which are made available for further processing.
+			 *
 			 * See https://html.spec.whatwg.org/#parse-error-invalid-first-character-of-tag-name
 			 */
 			if ( $this->is_closing_tag ) {
+				// No chance of finding a closer.
+				if ( $at + 3 > $doc_length ) {
+					return false;
+				}
+
 				$closer_at = strpos( $html, '>', $at + 3 );
 				if ( false === $closer_at ) {
+					$this->parser_state = self::STATE_INCOMPLETE_INPUT;
+
 					return false;
 				}
 
-				$at = $closer_at + 1;
-				continue;
+				$this->parser_state         = self::STATE_FUNKY_COMMENT;
+				$this->token_length         = $closer_at + 1 - $this->token_starts_at;
+				$this->text_starts_at       = $this->token_starts_at + 2;
+				$this->text_length          = $closer_at - $this->text_starts_at;
+				$this->bytes_already_parsed = $closer_at + 1;
+				return true;
 			}
 
 			++$at;
@@ -1296,6 +1923,8 @@ class WP_HTML_Tag_Processor {
 		// Skip whitespace and slashes.
 		$this->bytes_already_parsed += strspn( $this->html, " \t\f\r\n/", $this->bytes_already_parsed );
 		if ( $this->bytes_already_parsed >= strlen( $this->html ) ) {
+			$this->parser_state = self::STATE_INCOMPLETE_INPUT;
+
 			return false;
 		}
 
@@ -1318,11 +1947,15 @@ class WP_HTML_Tag_Processor {
 		$attribute_name              = substr( $this->html, $attribute_start, $name_length );
 		$this->bytes_already_parsed += $name_length;
 		if ( $this->bytes_already_parsed >= strlen( $this->html ) ) {
+			$this->parser_state = self::STATE_INCOMPLETE_INPUT;
+
 			return false;
 		}
 
 		$this->skip_whitespace();
 		if ( $this->bytes_already_parsed >= strlen( $this->html ) ) {
+			$this->parser_state = self::STATE_INCOMPLETE_INPUT;
+
 			return false;
 		}
 
@@ -1331,6 +1964,8 @@ class WP_HTML_Tag_Processor {
 			++$this->bytes_already_parsed;
 			$this->skip_whitespace();
 			if ( $this->bytes_already_parsed >= strlen( $this->html ) ) {
+				$this->parser_state = self::STATE_INCOMPLETE_INPUT;
+
 				return false;
 			}
 
@@ -1357,6 +1992,8 @@ class WP_HTML_Tag_Processor {
 		}
 
 		if ( $attribute_end >= strlen( $this->html ) ) {
+			$this->parser_state = self::STATE_INCOMPLETE_INPUT;
+
 			return false;
 		}
 
@@ -1381,7 +2018,7 @@ class WP_HTML_Tag_Processor {
 				$value_start,
 				$value_length,
 				$attribute_start,
-				$attribute_end,
+				$attribute_end - $attribute_start,
 				! $has_value
 			);
 
@@ -1396,7 +2033,7 @@ class WP_HTML_Tag_Processor {
 		 * an array when encountering duplicates avoids needless allocations in the
 		 * normative case of parsing tags with no duplicate attributes.
 		 */
-		$duplicate_span = new WP_HTML_Span( $attribute_start, $attribute_end );
+		$duplicate_span = new WP_HTML_Span( $attribute_start, $attribute_end - $attribute_start );
 		if ( null === $this->duplicate_attributes ) {
 			$this->duplicate_attributes = array( $comparable_name => array( $duplicate_span ) );
 		} elseif ( ! array_key_exists( $comparable_name, $this->duplicate_attributes ) ) {
@@ -1423,12 +2060,54 @@ class WP_HTML_Tag_Processor {
 	 * @since 6.2.0
 	 */
 	private function after_tag() {
-		$this->get_updated_html();
+		/*
+		 * There could be lexical updates enqueued for an attribute that
+		 * also exists on the next tag. In order to avoid conflating the
+		 * attributes across the two tags, lexical updates with names
+		 * need to be flushed to raw lexical updates.
+		 */
+		$this->class_name_updates_to_attributes_updates();
+
+		/*
+		 * Purge updates if there are too many. The actual count isn't
+		 * scientific, but a few values from 100 to a few thousand were
+		 * tested to find a practically-useful limit.
+		 *
+		 * If the update queue grows too big, then the Tag Processor
+		 * will spend more time iterating through them and lose the
+		 * efficiency gains of deferring applying them.
+		 */
+		if ( 1000 < count( $this->lexical_updates ) ) {
+			$this->get_updated_html();
+		}
+
+		foreach ( $this->lexical_updates as $name => $update ) {
+			/*
+			 * Any updates appearing after the cursor should be applied
+			 * before proceeding, otherwise they may be overlooked.
+			 */
+			if ( $update->start >= $this->bytes_already_parsed ) {
+				$this->get_updated_html();
+				break;
+			}
+
+			if ( is_int( $name ) ) {
+				continue;
+			}
+
+			$this->lexical_updates[] = $update;
+			unset( $this->lexical_updates[ $name ] );
+		}
+
+		$this->token_starts_at      = null;
+		$this->token_length         = null;
 		$this->tag_name_starts_at   = null;
 		$this->tag_name_length      = null;
-		$this->tag_ends_at          = null;
+		$this->text_starts_at       = 0;
+		$this->text_length          = 0;
 		$this->is_closing_tag       = null;
 		$this->attributes           = array();
+		$this->comment_type         = null;
 		$this->duplicate_attributes = null;
 	}
 
@@ -1606,10 +2285,10 @@ class WP_HTML_Tag_Processor {
 		$bytes_already_copied = 0;
 		$output_buffer        = '';
 		foreach ( $this->lexical_updates as $diff ) {
-			$shift = strlen( $diff->text ) - ( $diff->end - $diff->start );
+			$shift = strlen( $diff->text ) - $diff->length;
 
 			// Adjust the cursor position by however much an update affects it.
-			if ( $diff->start <= $this->bytes_already_parsed ) {
+			if ( $diff->start < $this->bytes_already_parsed ) {
 				$this->bytes_already_parsed += $shift;
 			}
 
@@ -1620,7 +2299,7 @@ class WP_HTML_Tag_Processor {
 
 			$output_buffer       .= substr( $this->html, $bytes_already_copied, $diff->start - $bytes_already_copied );
 			$output_buffer       .= $diff->text;
-			$bytes_already_copied = $diff->end;
+			$bytes_already_copied = $diff->start + $diff->length;
 		}
 
 		$this->html = $output_buffer . substr( $this->html, $bytes_already_copied );
@@ -1630,6 +2309,8 @@ class WP_HTML_Tag_Processor {
 		 * replacements adjust offsets in the input document.
 		 */
 		foreach ( $this->bookmarks as $bookmark_name => $bookmark ) {
+			$bookmark_end = $bookmark->start + $bookmark->length;
+
 			/*
 			 * Each lexical update which appears before the bookmark's endpoints
 			 * might shift the offsets for those endpoints. Loop through each change
@@ -1640,28 +2321,30 @@ class WP_HTML_Tag_Processor {
 			$tail_delta = 0;
 
 			foreach ( $this->lexical_updates as $diff ) {
-				if ( $bookmark->start < $diff->start && $bookmark->end < $diff->start ) {
+				$diff_end = $diff->start + $diff->length;
+
+				if ( $bookmark->start < $diff->start && $bookmark_end < $diff->start ) {
 					break;
 				}
 
-				if ( $bookmark->start >= $diff->start && $bookmark->end < $diff->end ) {
+				if ( $bookmark->start >= $diff->start && $bookmark_end < $diff_end ) {
 					$this->release_bookmark( $bookmark_name );
 					continue 2;
 				}
 
-				$delta = strlen( $diff->text ) - ( $diff->end - $diff->start );
+				$delta = strlen( $diff->text ) - $diff->length;
 
 				if ( $bookmark->start >= $diff->start ) {
 					$head_delta += $delta;
 				}
 
-				if ( $bookmark->end >= $diff->end ) {
+				if ( $bookmark_end >= $diff_end ) {
 					$tail_delta += $delta;
 				}
 			}
 
-			$bookmark->start += $head_delta;
-			$bookmark->end   += $tail_delta;
+			$bookmark->start  += $head_delta;
+			$bookmark->length += $tail_delta - $head_delta;
 		}
 
 		$this->lexical_updates = array();
@@ -1716,7 +2399,8 @@ class WP_HTML_Tag_Processor {
 
 		// Point this tag processor before the sought tag opener and consume it.
 		$this->bytes_already_parsed = $this->bookmarks[ $bookmark_name ]->start;
-		return $this->next_tag( array( 'tag_closers' => 'visit' ) );
+		$this->parser_state         = self::STATE_READY;
+		return $this->next_token();
 	}
 
 	/**
@@ -1743,7 +2427,7 @@ class WP_HTML_Tag_Processor {
 		 * This code should be unreachable, because it implies the two replacements
 		 * start at the same location and contain the same text.
 		 */
-		return $a->end - $b->end;
+		return $a->length - $b->length;
 	}
 
 	/**
@@ -1761,6 +2445,10 @@ class WP_HTML_Tag_Processor {
 	 * @return string|boolean|null Value of enqueued update if present, otherwise false.
 	 */
 	private function get_enqueued_attribute_value( $comparable_name ) {
+		if ( self::STATE_MATCHED_TAG !== $this->parser_state ) {
+			return false;
+		}
+
 		if ( ! isset( $this->lexical_updates[ $comparable_name ] ) ) {
 			return false;
 		}
@@ -1828,7 +2516,7 @@ class WP_HTML_Tag_Processor {
 	 * @return string|true|null Value of attribute or `null` if not available. Boolean attributes return `true`.
 	 */
 	public function get_attribute( $name ) {
-		if ( null === $this->tag_name_starts_at ) {
+		if ( self::STATE_MATCHED_TAG !== $this->parser_state ) {
 			return null;
 		}
 
@@ -1908,7 +2596,10 @@ class WP_HTML_Tag_Processor {
 	 * @return array|null List of attribute names, or `null` when no tag opener is matched.
 	 */
 	public function get_attribute_names_with_prefix( $prefix ) {
-		if ( $this->is_closing_tag || null === $this->tag_name_starts_at ) {
+		if (
+			self::STATE_MATCHED_TAG !== $this->parser_state ||
+			$this->is_closing_tag
+		) {
 			return null;
 		}
 
@@ -1946,7 +2637,18 @@ class WP_HTML_Tag_Processor {
 
 		$tag_name = substr( $this->html, $this->tag_name_starts_at, $this->tag_name_length );
 
-		return strtoupper( $tag_name );
+		if ( self::STATE_MATCHED_TAG === $this->parser_state ) {
+			return strtoupper( $tag_name );
+		}
+
+		if (
+			self::STATE_COMMENT === $this->parser_state &&
+			self::COMMENT_AS_PI_NODE_LOOKALIKE === $this->get_comment_type()
+		) {
+			return $tag_name;
+		}
+
+		return null;
 	}
 
 	/**
@@ -1967,11 +2669,19 @@ class WP_HTML_Tag_Processor {
 	 * @return bool Whether the currently matched tag contains the self-closing flag.
 	 */
 	public function has_self_closing_flag() {
-		if ( ! $this->tag_name_starts_at ) {
+		if ( self::STATE_MATCHED_TAG !== $this->parser_state ) {
 			return false;
 		}
 
-		return '/' === $this->html[ $this->tag_ends_at - 1 ];
+		/*
+		 * The self-closing flag is the solidus at the _end_ of the tag, not the beginning.
+		 *
+		 * Example:
+		 *
+		 *     <figure />
+		 *             ^ this appears one character before the end of the closing ">".
+		 */
+		return '/' === $this->html[ $this->token_starts_at + $this->token_length - 1 ];
 	}
 
 	/**
@@ -1991,7 +2701,191 @@ class WP_HTML_Tag_Processor {
 	 * @return bool Whether the current tag is a tag closer.
 	 */
 	public function is_tag_closer() {
-		return $this->is_closing_tag;
+		return (
+			self::STATE_MATCHED_TAG === $this->parser_state &&
+			$this->is_closing_tag
+		);
+	}
+
+	/**
+	 * Indicates the kind of matched token, if any.
+	 *
+	 * This differs from `get_token_name()` in that it always
+	 * returns a static string indicating the type, whereas
+	 * `get_token_name()` may return values derived from the
+	 * token itself, such as a tag name or processing
+	 * instruction tag.
+	 *
+	 * Possible values:
+	 *  - `#tag` when matched on a tag.
+	 *  - `#text` when matched on a text node.
+	 *  - `#cdata-section` when matched on a CDATA node.
+	 *  - `#comment` when matched on a comment.
+	 *  - `#doctype` when matched on a DOCTYPE declaration.
+	 *  - `#presumptuous-tag` when matched on an empty tag closer.
+	 *  - `#funky-comment` when matched on a funky comment.
+	 *
+	 * @since 6.5.0
+	 *
+	 * @return string|null What kind of token is matched, or null.
+	 */
+	public function get_token_type() {
+		switch ( $this->parser_state ) {
+			case self::STATE_MATCHED_TAG:
+				return '#tag';
+
+			case self::STATE_DOCTYPE:
+				return '#doctype';
+
+			default:
+				return $this->get_token_name();
+		}
+	}
+
+	/**
+	 * Returns the node name represented by the token.
+	 *
+	 * This matches the DOM API value `nodeName`. Some values
+	 * are static, such as `#text` for a text node, while others
+	 * are dynamically generated from the token itself.
+	 *
+	 * Dynamic names:
+	 *  - Uppercase tag name for tag matches.
+	 *  - `html` for DOCTYPE declarations.
+	 *
+	 * Note that if the Tag Processor is not matched on a token
+	 * then this function will return `null`, either because it
+	 * hasn't yet found a token or because it reached the end
+	 * of the document without matching a token.
+	 *
+	 * @since 6.5.0
+	 *
+	 * @return string|null Name of the matched token.
+	 */
+	public function get_token_name() {
+		switch ( $this->parser_state ) {
+			case self::STATE_MATCHED_TAG:
+				return $this->get_tag();
+
+			case self::STATE_TEXT_NODE:
+				return '#text';
+
+			case self::STATE_CDATA_NODE:
+				return '#cdata-section';
+
+			case self::STATE_COMMENT:
+				return '#comment';
+
+			case self::STATE_DOCTYPE:
+				return 'html';
+
+			case self::STATE_PRESUMPTUOUS_TAG:
+				return '#presumptuous-tag';
+
+			case self::STATE_FUNKY_COMMENT:
+				return '#funky-comment';
+		}
+	}
+
+	/**
+	 * Indicates what kind of comment produced the comment node.
+	 *
+	 * Because there are different kinds of HTML syntax which produce
+	 * comments, the Tag Processor tracks and exposes this as a type
+	 * for the comment. Nominally only regular HTML comments exist as
+	 * they are commonly known, but a number of unrelated syntax errors
+	 * also produce comments.
+	 *
+	 * @see self::COMMENT_AS_ABRUPTLY_CLOSED_COMMENT
+	 * @see self::COMMENT_AS_CDATA_LOOKALIKE
+	 * @see self::COMMENT_AS_INVALID_HTML
+	 * @see self::COMMENT_AS_HTML_COMMENT
+	 * @see self::COMMENT_AS_PI_NODE_LOOKALIKE
+	 *
+	 * @since 6.5.0
+	 *
+	 * @return string|null
+	 */
+	public function get_comment_type() {
+		if ( self::STATE_COMMENT !== $this->parser_state ) {
+			return null;
+		}
+
+		return $this->comment_type;
+	}
+
+	/**
+	 * Returns the modifiable text for a matched token, or an empty string.
+	 *
+	 * Modifiable text is text content that may be read and changed without
+	 * changing the HTML structure of the document around it. This includes
+	 * the contents of `#text` nodes in the HTML as well as the inner
+	 * contents of HTML comments, Processing Instructions, and others, even
+	 * though these nodes aren't part of a parsed DOM tree. They also contain
+	 * the contents of SCRIPT and STYLE tags, of TEXTAREA tags, and of any
+	 * other section in an HTML document which cannot contain HTML markup (DATA).
+	 *
+	 * If a token has no modifiable text then an empty string is returned to
+	 * avoid needless crashing or type errors. An empty string does not mean
+	 * that a token has modifiable text, and a token with modifiable text may
+	 * have an empty string (e.g. a comment with no contents).
+	 *
+	 * @since 6.5.0
+	 *
+	 * @return string
+	 */
+	public function get_modifiable_text() {
+		if ( null === $this->text_starts_at ) {
+			return '';
+		}
+
+		$text = substr( $this->html, $this->text_starts_at, $this->text_length );
+
+		// Comment data is not decoded.
+		if (
+			self::STATE_CDATA_NODE === $this->parser_state ||
+			self::STATE_COMMENT === $this->parser_state ||
+			self::STATE_DOCTYPE === $this->parser_state ||
+			self::STATE_FUNKY_COMMENT === $this->parser_state
+		) {
+			return $text;
+		}
+
+		$tag_name = $this->get_tag();
+		if (
+			// Script data is not decoded.
+			'SCRIPT' === $tag_name ||
+
+			// RAWTEXT data is not decoded.
+			'IFRAME' === $tag_name ||
+			'NOEMBED' === $tag_name ||
+			'NOFRAMES' === $tag_name ||
+			'STYLE' === $tag_name ||
+			'XMP' === $tag_name
+		) {
+			return $text;
+		}
+
+		$decoded = html_entity_decode( $text, ENT_QUOTES | ENT_HTML5 | ENT_SUBSTITUTE );
+
+		/*
+		 * TEXTAREA skips a leading newline, but this newline may appear not only as the
+		 * literal character `\n`, but also as a character reference, such as in the
+		 * following markup: `<textarea>&#x0a;Content</textarea>`.
+		 *
+		 * For these cases it's important to first decode the text content before checking
+		 * for a leading newline and removing it.
+		 */
+		if (
+			self::STATE_MATCHED_TAG === $this->parser_state &&
+			'TEXTAREA' === $tag_name &&
+			strlen( $decoded ) > 0 &&
+			"\n" === $decoded[0]
+		) {
+			return substr( $decoded, 1 );
+		}
+
+		return $decoded;
 	}
 
 	/**
@@ -2011,7 +2905,10 @@ class WP_HTML_Tag_Processor {
 	 * @return bool Whether an attribute value was set.
 	 */
 	public function set_attribute( $name, $value ) {
-		if ( $this->is_closing_tag || null === $this->tag_name_starts_at ) {
+		if (
+			self::STATE_MATCHED_TAG !== $this->parser_state ||
+			$this->is_closing_tag
+		) {
 			return false;
 		}
 
@@ -2031,8 +2928,8 @@ class WP_HTML_Tag_Processor {
 		 *
 		 * @see https://html.spec.whatwg.org/#attributes-2
 		 *
-		 * @TODO as the only regex pattern maybe we should take it out? are
-		 *       Unicode patterns available broadly in Core?
+		 * @todo As the only regex pattern maybe we should take it out?
+		 *       Are Unicode patterns available broadly in Core?
 		 */
 		if ( preg_match(
 			'~[' .
@@ -2101,7 +2998,7 @@ class WP_HTML_Tag_Processor {
 			$existing_attribute                        = $this->attributes[ $comparable_name ];
 			$this->lexical_updates[ $comparable_name ] = new WP_HTML_Text_Replacement(
 				$existing_attribute->start,
-				$existing_attribute->end,
+				$existing_attribute->length,
 				$updated_attribute
 			);
 		} else {
@@ -2119,7 +3016,7 @@ class WP_HTML_Tag_Processor {
 			 */
 			$this->lexical_updates[ $comparable_name ] = new WP_HTML_Text_Replacement(
 				$this->tag_name_starts_at + $this->tag_name_length,
-				$this->tag_name_starts_at + $this->tag_name_length,
+				0,
 				' ' . $updated_attribute
 			);
 		}
@@ -2144,7 +3041,10 @@ class WP_HTML_Tag_Processor {
 	 * @return bool Whether an attribute was removed.
 	 */
 	public function remove_attribute( $name ) {
-		if ( $this->is_closing_tag ) {
+		if (
+			self::STATE_MATCHED_TAG !== $this->parser_state ||
+			$this->is_closing_tag
+		) {
 			return false;
 		}
 
@@ -2194,7 +3094,7 @@ class WP_HTML_Tag_Processor {
 		 */
 		$this->lexical_updates[ $name ] = new WP_HTML_Text_Replacement(
 			$this->attributes[ $name ]->start,
-			$this->attributes[ $name ]->end,
+			$this->attributes[ $name ]->length,
 			''
 		);
 
@@ -2203,7 +3103,7 @@ class WP_HTML_Tag_Processor {
 			foreach ( $this->duplicate_attributes[ $name ] as $attribute_token ) {
 				$this->lexical_updates[] = new WP_HTML_Text_Replacement(
 					$attribute_token->start,
-					$attribute_token->end,
+					$attribute_token->length,
 					''
 				);
 			}
@@ -2221,13 +3121,14 @@ class WP_HTML_Tag_Processor {
 	 * @return bool Whether the class was set to be added.
 	 */
 	public function add_class( $class_name ) {
-		if ( $this->is_closing_tag ) {
+		if (
+			self::STATE_MATCHED_TAG !== $this->parser_state ||
+			$this->is_closing_tag
+		) {
 			return false;
 		}
 
-		if ( null !== $this->tag_name_starts_at ) {
-			$this->classname_updates[ $class_name ] = self::ADD_CLASS;
-		}
+		$this->classname_updates[ $class_name ] = self::ADD_CLASS;
 
 		return true;
 	}
@@ -2241,7 +3142,10 @@ class WP_HTML_Tag_Processor {
 	 * @return bool Whether the class was set to be removed.
 	 */
 	public function remove_class( $class_name ) {
-		if ( $this->is_closing_tag ) {
+		if (
+			self::STATE_MATCHED_TAG !== $this->parser_state ||
+			$this->is_closing_tag
+		) {
 			return false;
 		}
 
@@ -2289,7 +3193,7 @@ class WP_HTML_Tag_Processor {
 		 * Keep track of the position right before the current tag. This will
 		 * be necessary for reparsing the current tag after updating the HTML.
 		 */
-		$before_current_tag = $this->tag_name_starts_at - 1;
+		$before_current_tag = $this->token_starts_at;
 
 		/*
 		 * 1. Apply the enqueued edits and update all the pointers to reflect those changes.
@@ -2318,15 +3222,7 @@ class WP_HTML_Tag_Processor {
 		 *                 └←─┘ back up by strlen("em") + 1 ==> 3
 		 */
 		$this->bytes_already_parsed = $before_current_tag;
-		$this->parse_next_tag();
-		// Reparse the attributes.
-		while ( $this->parse_next_attribute() ) {
-			continue;
-		}
-
-		$tag_ends_at                = strpos( $this->html, '>', $this->bytes_already_parsed );
-		$this->tag_ends_at          = $tag_ends_at;
-		$this->bytes_already_parsed = $tag_ends_at;
+		$this->base_class_next_token();
 
 		return $this->html;
 	}
@@ -2447,4 +3343,206 @@ class WP_HTML_Tag_Processor {
 
 		return true;
 	}
+
+	/**
+	 * Parser Ready State.
+	 *
+	 * Indicates that the parser is ready to run and waiting for a state transition.
+	 * It may not have started yet, or it may have just finished parsing a token and
+	 * is ready to find the next one.
+	 *
+	 * @since 6.5.0
+	 *
+	 * @access private
+	 */
+	const STATE_READY = 'STATE_READY';
+
+	/**
+	 * Parser Complete State.
+	 *
+	 * Indicates that the parser has reached the end of the document and there is
+	 * nothing left to scan. It finished parsing the last token completely.
+	 *
+	 * @since 6.5.0
+	 *
+	 * @access private
+	 */
+	const STATE_COMPLETE = 'STATE_COMPLETE';
+
+	/**
+	 * Parser Incomplete Input State.
+	 *
+	 * Indicates that the parser has reached the end of the document before finishing
+	 * a token. It started parsing a token but there is a possibility that the input
+	 * HTML document was truncated in the middle of a token.
+	 *
+	 * The parser is reset at the start of the incomplete token and has paused. There
+	 * is nothing more that can be scanned unless provided a more complete document.
+	 *
+	 * @since 6.5.0
+	 *
+	 * @access private
+	 */
+	const STATE_INCOMPLETE_INPUT = 'STATE_INCOMPLETE_INPUT';
+
+	/**
+	 * Parser Matched Tag State.
+	 *
+	 * Indicates that the parser has found an HTML tag and it's possible to get
+	 * the tag name and read or modify its attributes (if it's not a closing tag).
+	 *
+	 * @since 6.5.0
+	 *
+	 * @access private
+	 */
+	const STATE_MATCHED_TAG = 'STATE_MATCHED_TAG';
+
+	/**
+	 * Parser Text Node State.
+	 *
+	 * Indicates that the parser has found a text node and it's possible
+	 * to read and modify that text.
+	 *
+	 * @since 6.5.0
+	 *
+	 * @access private
+	 */
+	const STATE_TEXT_NODE = 'STATE_TEXT_NODE';
+
+	/**
+	 * Parser CDATA Node State.
+	 *
+	 * Indicates that the parser has found a CDATA node and it's possible
+	 * to read and modify its modifiable text. Note that in HTML there are
+	 * no CDATA nodes outside of foreign content (SVG and MathML). Outside
+	 * of foreign content, they are treated as HTML comments.
+	 *
+	 * @since 6.5.0
+	 *
+	 * @access private
+	 */
+	const STATE_CDATA_NODE = 'STATE_CDATA_NODE';
+
+	/**
+	 * Indicates that the parser has found an HTML comment and it's
+	 * possible to read and modify its modifiable text.
+	 *
+	 * @since 6.5.0
+	 *
+	 * @access private
+	 */
+	const STATE_COMMENT = 'STATE_COMMENT';
+
+	/**
+	 * Indicates that the parser has found a DOCTYPE node and it's
+	 * possible to read and modify its modifiable text.
+	 *
+	 * @since 6.5.0
+	 *
+	 * @access private
+	 */
+	const STATE_DOCTYPE = 'STATE_DOCTYPE';
+
+	/**
+	 * Indicates that the parser has found an empty tag closer `</>`.
+	 *
+	 * Note that in HTML there are no empty tag closers, and they
+	 * are ignored. Nonetheless, the Tag Processor still
+	 * recognizes them as they appear in the HTML stream.
+	 *
+	 * These were historically discussed as a "presumptuous tag
+	 * closer," which would close the nearest open tag, but were
+	 * dismissed in favor of explicitly-closing tags.
+	 *
+	 * @since 6.5.0
+	 *
+	 * @access private
+	 */
+	const STATE_PRESUMPTUOUS_TAG = 'STATE_PRESUMPTUOUS_TAG';
+
+	/**
+	 * Indicates that the parser has found a "funky comment"
+	 * and it's possible to read and modify its modifiable text.
+	 *
+	 * Example:
+	 *
+	 *     </%url>
+	 *     </{"wp-bit":"query/post-author"}>
+	 *     </2>
+	 *
+	 * Funky comments are tag closers with invalid tag names. Note
+	 * that in HTML these are turned into bogus comments. Nonetheless,
+	 * the Tag Processor recognizes them in a stream of HTML and
+	 * exposes them for inspection and modification.
+	 *
+	 * @since 6.5.0
+	 *
+	 * @access private
+	 */
+	const STATE_FUNKY_COMMENT = 'STATE_WP_FUNKY';
+
+	/**
+	 * Indicates that a comment was created when encountering abruptly-closed HTML comment.
+	 *
+	 * Example:
+	 *
+	 *     <!-->
+	 *     <!--->
+	 *
+	 * @since 6.5.0
+	 */
+	const COMMENT_AS_ABRUPTLY_CLOSED_COMMENT = 'COMMENT_AS_ABRUPTLY_CLOSED_COMMENT';
+
+	/**
+	 * Indicates that a comment would be parsed as a CDATA node,
+	 * were HTML to allow CDATA nodes outside of foreign content.
+	 *
+	 * Example:
+	 *
+	 *     <![CDATA[This is a CDATA node.]]>
+	 *
+	 * This is an HTML comment, but it looks like a CDATA node.
+	 *
+	 * @since 6.5.0
+	 */
+	const COMMENT_AS_CDATA_LOOKALIKE = 'COMMENT_AS_CDATA_LOOKALIKE';
+
+	/**
+	 * Indicates that a comment was created when encountering
+	 * normative HTML comment syntax.
+	 *
+	 * Example:
+	 *
+	 *     <!-- this is a comment -->
+	 *
+	 * @since 6.5.0
+	 */
+	const COMMENT_AS_HTML_COMMENT = 'COMMENT_AS_HTML_COMMENT';
+
+	/**
+	 * Indicates that a comment would be parsed as a Processing
+	 * Instruction node, were they to exist within HTML.
+	 *
+	 * Example:
+	 *
+	 *     <?wp __( 'Like' ) ?>
+	 *
+	 * This is an HTML comment, but it looks like a Processing Instruction node.
+	 *
+	 * @since 6.5.0
+	 */
+	const COMMENT_AS_PI_NODE_LOOKALIKE = 'COMMENT_AS_PI_NODE_LOOKALIKE';
+
+	/**
+	 * Indicates that a comment was created when encountering invalid
+	 * HTML input, a so-called "bogus comment."
+	 *
+	 * Example:
+	 *
+	 *     <?nothing special>
+	 *     <!{nothing special}>
+	 *
+	 * @since 6.5.0
+	 */
+	const COMMENT_AS_INVALID_HTML = 'COMMENT_AS_INVALID_HTML';
 }
diff --git a/wp-includes/html-api/class-wp-html-text-replacement.php b/wp-includes/html-api/class-wp-html-text-replacement.php
index 26b7bb2..4b8a6a6 100644
--- a/wp-includes/html-api/class-wp-html-text-replacement.php
+++ b/wp-includes/html-api/class-wp-html-text-replacement.php
@@ -15,6 +15,7 @@
  *
  * @access private
  * @since 6.2.0
+ * @since 6.5.0 Replaced `end` with `length` to more closely match `substr()`.
  *
  * @see WP_HTML_Tag_Processor
  */
@@ -23,22 +24,25 @@ class WP_HTML_Text_Replacement {
 	 * Byte offset into document where replacement span begins.
 	 *
 	 * @since 6.2.0
+	 *
 	 * @var int
 	 */
 	public $start;
 
 	/**
-	 * Byte offset into document where replacement span ends.
+	 * Byte length of span being replaced.
+	 *
+	 * @since 6.5.0
 	 *
-	 * @since 6.2.0
 	 * @var int
 	 */
-	public $end;
+	public $length;
 
 	/**
 	 * Span of text to insert in document to replace existing content from start to end.
 	 *
 	 * @since 6.2.0
+	 *
 	 * @var string
 	 */
 	public $text;
@@ -48,13 +52,13 @@ class WP_HTML_Text_Replacement {
 	 *
 	 * @since 6.2.0
 	 *
-	 * @param int    $start Byte offset into document where replacement span begins.
-	 * @param int    $end   Byte offset into document where replacement span ends.
-	 * @param string $text  Span of text to insert in document to replace existing content from start to end.
+	 * @param int    $start  Byte offset into document where replacement span begins.
+	 * @param int    $length Byte length of span in document being replaced.
+	 * @param string $text   Span of text to insert in document to replace existing content from start to end.
 	 */
-	public function __construct( $start, $end, $text ) {
-		$this->start = $start;
-		$this->end   = $end;
-		$this->text  = $text;
+	public function __construct( $start, $length, $text ) {
+		$this->start  = $start;
+		$this->length = $length;
+		$this->text   = $text;
 	}
 }