diff options
Diffstat (limited to 'wp-includes/html-api')
-rw-r--r-- | wp-includes/html-api/class-wp-html-active-formatting-elements.php | 187 | ||||
-rw-r--r-- | wp-includes/html-api/class-wp-html-attribute-token.php | 90 | ||||
-rw-r--r-- | wp-includes/html-api/class-wp-html-open-elements.php | 432 | ||||
-rw-r--r-- | wp-includes/html-api/class-wp-html-processor-state.php | 143 | ||||
-rw-r--r-- | wp-includes/html-api/class-wp-html-processor.php | 1443 | ||||
-rw-r--r-- | wp-includes/html-api/class-wp-html-span.php | 53 | ||||
-rw-r--r-- | wp-includes/html-api/class-wp-html-tag-processor.php | 2450 | ||||
-rw-r--r-- | wp-includes/html-api/class-wp-html-text-replacement.php | 60 | ||||
-rw-r--r-- | wp-includes/html-api/class-wp-html-token.php | 106 | ||||
-rw-r--r-- | wp-includes/html-api/class-wp-html-unsupported-exception.php | 31 |
10 files changed, 4995 insertions, 0 deletions
diff --git a/wp-includes/html-api/class-wp-html-active-formatting-elements.php b/wp-includes/html-api/class-wp-html-active-formatting-elements.php new file mode 100644 index 0000000..9598991 --- /dev/null +++ b/wp-includes/html-api/class-wp-html-active-formatting-elements.php @@ -0,0 +1,187 @@ +<?php +/** + * HTML API: WP_HTML_Active_Formatting_Elements class + * + * @package WordPress + * @subpackage HTML-API + * @since 6.4.0 + */ + +/** + * Core class used by the HTML processor during HTML parsing + * for managing the stack of active formatting elements. + * + * This class is designed for internal use by the HTML processor. + * + * > Initially, the list of active formatting elements is empty. + * > It is used to handle mis-nested formatting element tags. + * > + * > The list contains elements in the formatting category, and markers. + * > The markers are inserted when entering applet, object, marquee, + * > template, td, th, and caption elements, and are used to prevent + * > formatting from "leaking" into applet, object, marquee, template, + * > td, th, and caption elements. + * > + * > In addition, each element in the list of active formatting elements + * > is associated with the token for which it was created, so that + * > further elements can be created for that token if necessary. + * + * @since 6.4.0 + * + * @access private + * + * @see https://html.spec.whatwg.org/#list-of-active-formatting-elements + * @see WP_HTML_Processor + */ +class WP_HTML_Active_Formatting_Elements { + /** + * Holds the stack of active formatting element references. + * + * @since 6.4.0 + * + * @var WP_HTML_Token[] + */ + private $stack = array(); + + /** + * Reports if a specific node is in the stack of active formatting elements. + * + * @since 6.4.0 + * + * @param WP_HTML_Token $token Look for this node in the stack. + * @return bool Whether the referenced node is in the stack of active formatting elements. 
+ */ + public function contains_node( $token ) { + foreach ( $this->walk_up() as $item ) { + if ( $token->bookmark_name === $item->bookmark_name ) { + return true; + } + } + + return false; + } + + /** + * Returns how many nodes are currently in the stack of active formatting elements. + * + * @since 6.4.0 + * + * @return int How many nodes are in the stack of active formatting elements. + */ + public function count() { + return count( $this->stack ); + } + + /** + * Returns the node at the end of the stack of active formatting elements, + * if one exists. If the stack is empty, returns null. + * + * @since 6.4.0 + * + * @return WP_HTML_Token|null Last node in the stack of active formatting elements, if one exists, otherwise null. + */ + public function current_node() { + $current_node = end( $this->stack ); + + return $current_node ? $current_node : null; + } + + /** + * Pushes a node onto the stack of active formatting elements. + * + * @since 6.4.0 + * + * @see https://html.spec.whatwg.org/#push-onto-the-list-of-active-formatting-elements + * + * @param WP_HTML_Token $token Push this node onto the stack. + */ + public function push( $token ) { + /* + * > If there are already three elements in the list of active formatting elements after the last marker, + * > if any, or anywhere in the list if there are no markers, that have the same tag name, namespace, and + * > attributes as element, then remove the earliest such element from the list of active formatting + * > elements. For these purposes, the attributes must be compared as they were when the elements were + * > created by the parser; two elements have the same attributes if all their parsed attributes can be + * > paired such that the two attributes in each pair have identical names, namespaces, and values + * > (the order of the attributes does not matter). + * + * @TODO: Implement the "Noah's Ark clause" to only add up to three of any given kind of formatting elements to the stack.
+ */ + // > Add element to the list of active formatting elements. + $this->stack[] = $token; + } + + /** + * Removes a node from the stack of active formatting elements. + * + * @since 6.4.0 + * + * @param WP_HTML_Token $token Remove this node from the stack, if it's there already. + * @return bool Whether the node was found and removed from the stack of active formatting elements. + */ + public function remove_node( $token ) { + foreach ( $this->walk_up() as $position_from_end => $item ) { + if ( $token->bookmark_name !== $item->bookmark_name ) { + continue; + } + + $position_from_start = $this->count() - $position_from_end - 1; + array_splice( $this->stack, $position_from_start, 1 ); + return true; + } + + return false; + } + + /** + * Steps through the stack of active formatting elements, starting with the + * top element (added first) and walking downwards to the one added last. + * + * This generator function is designed to be used inside a "foreach" loop. + * + * Example: + * + * $html = '<em><strong><a>We are here'; + * foreach ( $stack->walk_down() as $node ) { + * echo "{$node->node_name} -> "; + * } + * > EM -> STRONG -> A -> + * + * To start with the most-recently added element and walk towards the top, + * see WP_HTML_Active_Formatting_Elements::walk_up(). + * + * @since 6.4.0 + */ + public function walk_down() { + $count = count( $this->stack ); + + for ( $i = 0; $i < $count; $i++ ) { + yield $this->stack[ $i ]; + } + } + + /** + * Steps through the stack of active formatting elements, starting with the + * bottom element (added last) and walking upwards to the one added first. + * + * This generator function is designed to be used inside a "foreach" loop. 
+ * + * Example: + * + * $html = '<em><strong><a>We are here'; + * foreach ( $stack->walk_up() as $node ) { + * echo "{$node->node_name} -> "; + * } + * > A -> STRONG -> EM -> + * + * To start with the first added element and walk towards the bottom, + * see WP_HTML_Active_Formatting_Elements::walk_down(). + * + * @since 6.4.0 + */ + public function walk_up() { + for ( $i = count( $this->stack ) - 1; $i >= 0; $i-- ) { + yield $this->stack[ $i ]; + } + } +} diff --git a/wp-includes/html-api/class-wp-html-attribute-token.php b/wp-includes/html-api/class-wp-html-attribute-token.php new file mode 100644 index 0000000..f938609 --- /dev/null +++ b/wp-includes/html-api/class-wp-html-attribute-token.php @@ -0,0 +1,90 @@ +<?php +/** + * HTML API: WP_HTML_Attribute_Token class + * + * @package WordPress + * @subpackage HTML-API + * @since 6.2.0 + */ + +/** + * Core class used by the HTML tag processor as a data structure for the attribute token, + * allowing to drastically improve performance. + * + * This class is for internal usage of the WP_HTML_Tag_Processor class. + * + * @access private + * @since 6.2.0 + * + * @see WP_HTML_Tag_Processor + */ +class WP_HTML_Attribute_Token { + /** + * Attribute name. + * + * @since 6.2.0 + * @var string + */ + public $name; + + /** + * Position at which the attribute value starts. + * + * @since 6.2.0 + * @var int + */ + public $value_starts_at; + + /** + * How many bytes the value occupies in the input HTML. + * + * @since 6.2.0 + * @var int + */ + public $value_length; + + /** + * The string offset where the attribute name starts. + * + * @since 6.2.0 + * @var int + */ + public $start; + + /** + * The string offset after the attribute value or its name. + * + * @since 6.2.0 + * @var int + */ + public $end; + + /** + * Whether the attribute is a boolean attribute with value `true`. + * + * @since 6.2.0 + * @var bool + */ + public $is_true; + + /** + * Constructor. + * + * @since 6.2.0 + * + * @param string $name Attribute name.
+ * @param int $value_start Position at which the attribute value starts. + * @param int $value_length Number of bytes attribute value spans. + * @param int $start The string offset where the attribute name starts. + * @param int $end The string offset after the attribute value or its name. + * @param bool $is_true Whether the attribute is a boolean attribute with true value. + */ + public function __construct( $name, $value_start, $value_length, $start, $end, $is_true ) { + $this->name = $name; + $this->value_starts_at = $value_start; + $this->value_length = $value_length; + $this->start = $start; + $this->end = $end; + $this->is_true = $is_true; + } +} diff --git a/wp-includes/html-api/class-wp-html-open-elements.php b/wp-includes/html-api/class-wp-html-open-elements.php new file mode 100644 index 0000000..fe56255 --- /dev/null +++ b/wp-includes/html-api/class-wp-html-open-elements.php @@ -0,0 +1,432 @@ +<?php +/** + * HTML API: WP_HTML_Open_Elements class + * + * @package WordPress + * @subpackage HTML-API + * @since 6.4.0 + */ + +/** + * Core class used by the HTML processor during HTML parsing + * for managing the stack of open elements. + * + * This class is designed for internal use by the HTML processor. + * + * > Initially, the stack of open elements is empty. The stack grows + * > downwards; the topmost node on the stack is the first one added + * > to the stack, and the bottommost node of the stack is the most + * > recently added node in the stack (notwithstanding when the stack + * > is manipulated in a random access fashion as part of the handling + * > for misnested tags). + * + * @since 6.4.0 + * + * @access private + * + * @see https://html.spec.whatwg.org/#stack-of-open-elements + * @see WP_HTML_Processor + */ +class WP_HTML_Open_Elements { + /** + * Holds the stack of open element references. + * + * @since 6.4.0 + * + * @var WP_HTML_Token[] + */ + public $stack = array(); + + /** + * Whether a P element is in button scope currently.
+ * + * This class optimizes scope lookup by pre-calculating + * this value when elements are added and removed to the + * stack of open elements which might change its value. + * This avoids frequent iteration over the stack. + * + * @since 6.4.0 + * + * @var bool + */ + private $has_p_in_button_scope = false; + + /** + * Reports if a specific node is in the stack of open elements. + * + * @since 6.4.0 + * + * @param WP_HTML_Token $token Look for this node in the stack. + * @return bool Whether the referenced node is in the stack of open elements. + */ + public function contains_node( $token ) { + foreach ( $this->walk_up() as $item ) { + if ( $token->bookmark_name === $item->bookmark_name ) { + return true; + } + } + + return false; + } + + /** + * Returns how many nodes are currently in the stack of open elements. + * + * @since 6.4.0 + * + * @return int How many nodes are in the stack of open elements. + */ + public function count() { + return count( $this->stack ); + } + + /** + * Returns the node at the end of the stack of open elements, + * if one exists. If the stack is empty, returns null. + * + * @since 6.4.0 + * + * @return WP_HTML_Token|null Last node in the stack of open elements, if one exists, otherwise null. + */ + public function current_node() { + $current_node = end( $this->stack ); + + return $current_node ? $current_node : null; + } + + /** + * Returns whether an element is in a specific scope. + * + * ## HTML Support + * + * The walk up the stack ends at the `HTML` element and at any element + * named in `$termination_list`, whichever is reached first. + * + * @since 6.4.0 + * + * @see https://html.spec.whatwg.org/#has-an-element-in-the-specific-scope + * + * @param string $tag_name Name of tag to check. + * @param string[] $termination_list List of elements that terminate the search. + * @return bool Whether the element was found in a specific scope.
+ */ + public function has_element_in_specific_scope( $tag_name, $termination_list ) { + foreach ( $this->walk_up() as $node ) { + if ( $node->node_name === $tag_name ) { + return true; + } + + switch ( $node->node_name ) { + case 'HTML': + return false; + } + + /* + * Per the HTML specification, reaching a termination element + * before the target ends the search in a failure state: the + * target is not in this scope. + */ + if ( in_array( $node->node_name, $termination_list, true ) ) { + return false; + } + } + + return false; + } + + /** + * Returns whether a particular element is in scope. + * + * @since 6.4.0 + * + * @see https://html.spec.whatwg.org/#has-an-element-in-scope + * + * @param string $tag_name Name of tag to check. + * @return bool Whether given element is in scope. + */ + public function has_element_in_scope( $tag_name ) { + return $this->has_element_in_specific_scope( + $tag_name, + array( + + /* + * Because it's not currently possible to encounter + * one of the termination elements, they don't need + * to be listed here. If they were, they would be + * unreachable and only waste CPU cycles while + * scanning through HTML. + */ + ) + ); + } + + /** + * Returns whether a particular element is in list item scope. + * + * @since 6.4.0 + * + * @see https://html.spec.whatwg.org/#has-an-element-in-list-item-scope + * + * @throws WP_HTML_Unsupported_Exception Always until this function is implemented. + * + * @param string $tag_name Name of tag to check. + * @return bool Whether given element is in scope. + */ + public function has_element_in_list_item_scope( $tag_name ) { + throw new WP_HTML_Unsupported_Exception( 'Cannot process elements depending on list item scope.' ); + + return false; // The linter requires this unreachable code until the function is implemented and can return. + } + + /** + * Returns whether a particular element is in button scope. + * + * @since 6.4.0 + * + * @see https://html.spec.whatwg.org/#has-an-element-in-button-scope + * + * @param string $tag_name Name of tag to check. + * @return bool Whether given element is in scope.
+ */ + public function has_element_in_button_scope( $tag_name ) { + return $this->has_element_in_specific_scope( $tag_name, array( 'BUTTON' ) ); + } + + /** + * Returns whether a particular element is in table scope. + * + * @since 6.4.0 + * + * @see https://html.spec.whatwg.org/#has-an-element-in-table-scope + * + * @throws WP_HTML_Unsupported_Exception Always until this function is implemented. + * + * @param string $tag_name Name of tag to check. + * @return bool Whether given element is in scope. + */ + public function has_element_in_table_scope( $tag_name ) { + throw new WP_HTML_Unsupported_Exception( 'Cannot process elements depending on table scope.' ); + + return false; // The linter requires this unreachable code until the function is implemented and can return. + } + + /** + * Returns whether a particular element is in select scope. + * + * @since 6.4.0 + * + * @see https://html.spec.whatwg.org/#has-an-element-in-select-scope + * + * @throws WP_HTML_Unsupported_Exception Always until this function is implemented. + * + * @param string $tag_name Name of tag to check. + * @return bool Whether given element is in scope. + */ + public function has_element_in_select_scope( $tag_name ) { + throw new WP_HTML_Unsupported_Exception( 'Cannot process elements depending on select scope.' ); + + return false; // The linter requires this unreachable code until the function is implemented and can return. + } + + /** + * Returns whether a P is in BUTTON scope. + * + * @since 6.4.0 + * + * @see https://html.spec.whatwg.org/#has-an-element-in-button-scope + * + * @return bool Whether a P is in BUTTON scope. + */ + public function has_p_in_button_scope() { + return $this->has_p_in_button_scope; + } + + /** + * Pops a node off of the stack of open elements. + * + * @since 6.4.0 + * + * @see https://html.spec.whatwg.org/#stack-of-open-elements + * + * @return bool Whether a node was popped off of the stack. 
+ */ + public function pop() { + $item = array_pop( $this->stack ); + + if ( null === $item ) { + return false; + } + + $this->after_element_pop( $item ); + return true; + } + + /** + * Pops nodes off of the stack of open elements until one with the given tag name has been popped. + * + * @since 6.4.0 + * + * @see WP_HTML_Open_Elements::pop + * + * @param string $tag_name Name of tag that needs to be popped off of the stack of open elements. + * @return bool Whether a tag of the given name was found and popped off of the stack of open elements. + */ + public function pop_until( $tag_name ) { + foreach ( $this->walk_up() as $item ) { + $this->pop(); + + if ( $tag_name === $item->node_name ) { + return true; + } + } + + return false; + } + + /** + * Pushes a node onto the stack of open elements. + * + * @since 6.4.0 + * + * @see https://html.spec.whatwg.org/#stack-of-open-elements + * + * @param WP_HTML_Token $stack_item Item to add onto stack. + */ + public function push( $stack_item ) { + $this->stack[] = $stack_item; + $this->after_element_push( $stack_item ); + } + + /** + * Removes a specific node from the stack of open elements. + * + * @since 6.4.0 + * + * @param WP_HTML_Token $token The node to remove from the stack of open elements. + * @return bool Whether the node was found and removed from the stack of open elements. + */ + public function remove_node( $token ) { + foreach ( $this->walk_up() as $position_from_end => $item ) { + if ( $token->bookmark_name !== $item->bookmark_name ) { + continue; + } + + $position_from_start = $this->count() - $position_from_end - 1; + array_splice( $this->stack, $position_from_start, 1 ); + $this->after_element_pop( $item ); + return true; + } + + return false; + } + + + /** + * Steps through the stack of open elements, starting with the top element + * (added first) and walking downwards to the one added last. + * + * This generator function is designed to be used inside a "foreach" loop. 
+ * + * Example: + * + * $html = '<em><strong><a>We are here'; + * foreach ( $stack->walk_down() as $node ) { + * echo "{$node->node_name} -> "; + * } + * > EM -> STRONG -> A -> + * + * To start with the most-recently added element and walk towards the top, + * see WP_HTML_Open_Elements::walk_up(). + * + * @since 6.4.0 + */ + public function walk_down() { + $count = count( $this->stack ); + + for ( $i = 0; $i < $count; $i++ ) { + yield $this->stack[ $i ]; + } + } + + /** + * Steps through the stack of open elements, starting with the bottom element + * (added last) and walking upwards to the one added first. + * + * This generator function is designed to be used inside a "foreach" loop. + * + * Example: + * + * $html = '<em><strong><a>We are here'; + * foreach ( $stack->walk_up() as $node ) { + * echo "{$node->node_name} -> "; + * } + * > A -> STRONG -> EM -> + * + * To start with the first added element and walk towards the bottom, + * see WP_HTML_Open_Elements::walk_down(). + * + * @since 6.4.0 + */ + public function walk_up() { + for ( $i = count( $this->stack ) - 1; $i >= 0; $i-- ) { + yield $this->stack[ $i ]; + } + } + + /* + * Internal helpers. + */ + + /** + * Updates internal flags after adding an element. + * + * Certain conditions (such as "has_p_in_button_scope") are maintained here as + * flags that are only modified when adding and removing elements. This allows + * the HTML Processor to quickly check for these conditions instead of iterating + * over the open stack elements upon each new tag it encounters. These flags, + * however, need to be maintained as items are added and removed from the stack. + * + * @since 6.4.0 + * + * @param WP_HTML_Token $item Element that was added to the stack of open elements. + */ + public function after_element_push( $item ) { + /* + * When adding support for new elements, expand this switch to trap + * cases where the precalculated value needs to change. 
+ */ + switch ( $item->node_name ) { + case 'BUTTON': + $this->has_p_in_button_scope = false; + break; + + case 'P': + $this->has_p_in_button_scope = true; + break; + } + } + + /** + * Updates internal flags after removing an element. + * + * Certain conditions (such as "has_p_in_button_scope") are maintained here as + * flags that are only modified when adding and removing elements. This allows + * the HTML Processor to quickly check for these conditions instead of iterating + * over the open stack elements upon each new tag it encounters. These flags, + * however, need to be maintained as items are added and removed from the stack. + * + * @since 6.4.0 + * + * @param WP_HTML_Token $item Element that was removed from the stack of open elements. + */ + public function after_element_pop( $item ) { + /* + * When adding support for new elements, expand this switch to trap + * cases where the precalculated value needs to change. + */ + switch ( $item->node_name ) { + case 'BUTTON': + $this->has_p_in_button_scope = $this->has_element_in_button_scope( 'P' ); + break; + + case 'P': + $this->has_p_in_button_scope = $this->has_element_in_button_scope( 'P' ); + break; + } + } +} diff --git a/wp-includes/html-api/class-wp-html-processor-state.php b/wp-includes/html-api/class-wp-html-processor-state.php new file mode 100644 index 0000000..9cf10c3 --- /dev/null +++ b/wp-includes/html-api/class-wp-html-processor-state.php @@ -0,0 +1,143 @@ +<?php +/** + * HTML API: WP_HTML_Processor_State class + * + * @package WordPress + * @subpackage HTML-API + * @since 6.4.0 + */ + +/** + * Core class used by the HTML processor during HTML parsing + * for managing the internal parsing state. + * + * This class is designed for internal use by the HTML processor. + * + * @since 6.4.0 + * + * @access private + * + * @see WP_HTML_Processor + */ +class WP_HTML_Processor_State { + /* + * Insertion mode constants. 
+ * + * These constants exist and are named to make it easier to + * discover and recognize the supported insertion modes in + * the parser. + * + * Out of all the possible insertion modes, only those + * supported by the parser are listed here. As support + * is added to the parser for more modes, add them here + * following the same naming and value pattern. + * + * @see https://html.spec.whatwg.org/#the-insertion-mode + */ + + /** + * Initial insertion mode for full HTML parser. + * + * @since 6.4.0 + * + * @see https://html.spec.whatwg.org/#the-initial-insertion-mode + * @see WP_HTML_Processor_State::$insertion_mode + * + * @var string + */ + const INSERTION_MODE_INITIAL = 'insertion-mode-initial'; + + /** + * In body insertion mode for full HTML parser. + * + * @since 6.4.0 + * + * @see https://html.spec.whatwg.org/#parsing-main-inbody + * @see WP_HTML_Processor_State::$insertion_mode + * + * @var string + */ + const INSERTION_MODE_IN_BODY = 'insertion-mode-in-body'; + + /** + * Tracks open elements while scanning HTML. + * + * This property is initialized in the constructor and never null. + * + * @since 6.4.0 + * + * @see https://html.spec.whatwg.org/#stack-of-open-elements + * + * @var WP_HTML_Open_Elements + */ + public $stack_of_open_elements = null; + + /** + * Tracks open formatting elements, used to handle mis-nested formatting element tags. + * + * This property is initialized in the constructor and never null. + * + * @since 6.4.0 + * + * @see https://html.spec.whatwg.org/#list-of-active-formatting-elements + * + * @var WP_HTML_Active_Formatting_Elements + */ + public $active_formatting_elements = null; + + /** + * Refers to the currently-matched tag, if any. + * + * @since 6.4.0 + * + * @var WP_HTML_Token|null + */ + public $current_token = null; + + /** + * Tree construction insertion mode. 
+ * + * @since 6.4.0 + * + * @see https://html.spec.whatwg.org/#insertion-mode + * + * @var string + */ + public $insertion_mode = self::INSERTION_MODE_INITIAL; + + /** + * Context node initializing fragment parser, if created as a fragment parser. + * + * @since 6.4.0 + * + * @see https://html.spec.whatwg.org/#concept-frag-parse-context + * + * @var [string, array]|null + */ + public $context_node = null; + + /** + * The frameset-ok flag indicates if a `FRAMESET` element is allowed in the current state. + * + * > The frameset-ok flag is set to "ok" when the parser is created. It is set to "not ok" after certain tokens are seen. + * + * @since 6.4.0 + * + * @see https://html.spec.whatwg.org/#frameset-ok-flag + * + * @var bool + */ + public $frameset_ok = true; + + /** + * Constructor - creates a new and empty state value. + * + * @since 6.4.0 + * + * @see WP_HTML_Processor + */ + public function __construct() { + $this->stack_of_open_elements = new WP_HTML_Open_Elements(); + $this->active_formatting_elements = new WP_HTML_Active_Formatting_Elements(); + } +} diff --git a/wp-includes/html-api/class-wp-html-processor.php b/wp-includes/html-api/class-wp-html-processor.php new file mode 100644 index 0000000..f27f83b --- /dev/null +++ b/wp-includes/html-api/class-wp-html-processor.php @@ -0,0 +1,1443 @@ +<?php +/** + * HTML API: WP_HTML_Processor class + * + * @package WordPress + * @subpackage HTML-API + * @since 6.4.0 + */ + +/** + * Core class used to safely parse and modify an HTML document. + * + * The HTML Processor class properly parses and modifies HTML5 documents. + * + * It supports a subset of the HTML5 specification, and when it encounters + * unsupported markup, it aborts early to avoid unintentionally breaking + * the document. The HTML Processor should never break an HTML document. 
+ * + * While the `WP_HTML_Tag_Processor` is a valuable tool for modifying + * attributes on individual HTML tags, the HTML Processor is more capable + * and useful for the following operations: + * + * - Querying based on nested HTML structure. + * + * Eventually the HTML Processor will also support: + * - Wrapping a tag in surrounding HTML. + * - Unwrapping a tag by removing its parent. + * - Inserting and removing nodes. + * - Reading and changing inner content. + * - Navigating up or around HTML structure. + * + * ## Usage + * + * Use of this class requires three steps: + * + * 1. Call a static creator method with your input HTML document. + * 2. Find the location in the document you are looking for. + * 3. Request changes to the document at that location. + * + * Example: + * + * $processor = WP_HTML_Processor::create_fragment( $html ); + * if ( $processor->next_tag( array( 'breadcrumbs' => array( 'DIV', 'FIGURE', 'IMG' ) ) ) ) { + * $processor->add_class( 'responsive-image' ); + * } + * + * #### Breadcrumbs + * + * Breadcrumbs represent the stack of open elements from the root + * of the document or fragment down to the currently-matched node, + * if one is currently selected. Call WP_HTML_Processor::get_breadcrumbs() + * to inspect the breadcrumbs for a matched tag. + * + * Breadcrumbs can specify nested HTML structure and are equivalent + * to a CSS selector comprising tag names separated by the child + * combinator, such as "DIV > FIGURE > IMG". + * + * Since all elements find themselves inside a full HTML document + * when parsed, the return value from `get_breadcrumbs()` will always + * contain any implicit outermost elements. For example, when parsing + * with `create_fragment()` in the `BODY` context (the default), any + * tag in the given HTML document will contain `array( 'HTML', 'BODY', … )` + * in its breadcrumbs. 
+ * + * Despite containing the implied outermost elements in their breadcrumbs, + * tags may be found with the shortest-matching breadcrumb query. That is, + * `array( 'IMG' )` matches all IMG elements and `array( 'P', 'IMG' )` + * matches all IMG elements directly inside a P element. To ensure that no + * partial matches erroneously match, it's possible to specify in a query + * the full breadcrumb match all the way down from the root HTML element. + * + * Example: + * + * $html = '<figure><img><figcaption>A <em>lovely</em> day outside</figcaption></figure>'; + * // ----- Matches here. + * $processor->next_tag( array( 'breadcrumbs' => array( 'FIGURE', 'IMG' ) ) ); + * + * $html = '<figure><img><figcaption>A <em>lovely</em> day outside</figcaption></figure>'; + * // ---- Matches here. + * $processor->next_tag( array( 'breadcrumbs' => array( 'FIGURE', 'FIGCAPTION', 'EM' ) ) ); + * + * $html = '<div><img></div><img>'; + * // ----- Matches here, because IMG must be a direct child of the implicit BODY. + * $processor->next_tag( array( 'breadcrumbs' => array( 'BODY', 'IMG' ) ) ); + * + * ## HTML Support + * + * This class implements a small part of the HTML5 specification. + * It's designed to operate within its support and abort early whenever + * encountering circumstances it can't properly handle. This is + * the principal way in which this class remains as simple as possible + * without cutting corners and breaking compliance. + * + * ### Supported elements + * + * If any unsupported element appears in the HTML input the HTML Processor + * will abort early and stop all processing. This draconian measure ensures + * that the HTML Processor won't break any HTML it doesn't fully understand. + * + * The following list specifies the HTML tags that _are_ supported: + * + * - Links: A. + * - The formatting elements: B, BIG, CODE, EM, FONT, I, SMALL, STRIKE, STRONG, TT, U. + * - Containers: DIV, FIGCAPTION, FIGURE, SPAN. + * - Form elements: BUTTON. + * - Paragraph: P.
+ * - Void elements: IMG. + * + * ### Supported markup + * + * Some kinds of non-normative HTML involve reconstruction of formatting elements and + * re-parenting of mis-nested elements. For example, a DIV tag found inside a TABLE + * may in fact belong _before_ the table in the DOM. If the HTML Processor encounters + * such a case it will stop processing. + * + * The following list specifies HTML markup that _is_ supported: + * + * - Markup involving only those tags listed above. + * - Fully-balanced and non-overlapping tags. + * - HTML with unexpected tag closers. + * - Some unbalanced or overlapping tags. + * - P tags after unclosed P tags. + * - BUTTON tags after unclosed BUTTON tags. + * - A tags after unclosed A tags that don't involve any active formatting elements. + * + * @since 6.4.0 + * + * @see WP_HTML_Tag_Processor + * @see https://html.spec.whatwg.org/ + */ +class WP_HTML_Processor extends WP_HTML_Tag_Processor { + /** + * The maximum number of bookmarks allowed to exist at any given time. + * + * HTML processing requires more bookmarks than basic tag processing, + * so this class constant from the Tag Processor is overwritten. + * + * @since 6.4.0 + * + * @var int + */ + const MAX_BOOKMARKS = 100; + + /** + * Static query for instructing the Tag Processor to visit every token. + * + * @access private + * + * @since 6.4.0 + * + * @var array + */ + const VISIT_EVERYTHING = array( 'tag_closers' => 'visit' ); + + /** + * Holds the working state of the parser, including the stack of + * open elements and the stack of active formatting elements. + * + * Initialized in the constructor. + * + * @since 6.4.0 + * + * @var WP_HTML_Processor_State + */ + private $state = null; + + /** + * Used to create unique bookmark names. + * + * This class sets a bookmark for every tag in the HTML document that it encounters. + * The bookmark name is auto-generated and increments, starting with `1`. 
These are + * internal bookmarks and are automatically released when the referring WP_HTML_Token + * goes out of scope and is garbage-collected. + * + * @since 6.4.0 + * + * @see WP_HTML_Processor::$release_internal_bookmark_on_destruct + * + * @var int + */ + private $bookmark_counter = 0; + + /** + * Stores an explanation for why something failed, if it did. + * + * @see self::get_last_error + * + * @since 6.4.0 + * + * @var string|null + */ + private $last_error = null; + + /** + * Releases a bookmark when PHP garbage-collects its wrapping WP_HTML_Token instance. + * + * This function is created inside the class constructor so that it can be passed to + * the stack of open elements and the stack of active formatting elements without + * exposing it as a public method on the class. + * + * @since 6.4.0 + * + * @var closure + */ + private $release_internal_bookmark_on_destruct = null; + + /* + * Public Interface Functions + */ + + /** + * Creates an HTML processor in the fragment parsing mode. + * + * Use this for cases where you are processing chunks of HTML that + * will be found within a bigger HTML document, such as rendered + * block output that exists within a post, `the_content` inside a + * rendered site layout. + * + * Fragment parsing occurs within a context, which is an HTML element + * that the document will eventually be placed in. It becomes important + * when special elements have different rules than others, such as inside + * a TEXTAREA or a TITLE tag where things that look like tags are text, + * or inside a SCRIPT tag where things that look like HTML syntax are JS. + * + * The context value should be a representation of the tag into which the + * HTML is found. For most cases this will be the body element. The HTML + * form is provided because a context element may have attributes that + * impact the parse, such as with a SCRIPT tag and its `type` attribute. 
*
	 * ## Current HTML Support
	 *
	 *  - The only supported context is `<body>`, which is the default value.
	 *  - The only supported document encoding is `UTF-8`, which is the default value.
	 *
	 * @since 6.4.0
	 *
	 * @param string $html     Input HTML fragment to process.
	 * @param string $context  Context element for the fragment, must be default of `<body>`.
	 * @param string $encoding Text encoding of the document; must be default of 'UTF-8'.
	 * @return WP_HTML_Processor|null The created processor if successful, otherwise null.
	 */
	public static function create_fragment( $html, $context = '<body>', $encoding = 'UTF-8' ) {
		// Anything other than the default context and encoding is rejected.
		$is_supported = '<body>' === $context && 'UTF-8' === $encoding;
		if ( ! $is_supported ) {
			return null;
		}

		$processor = new self( $html, self::CONSTRUCTOR_UNLOCK_CODE );

		$processor->state->context_node   = array( 'BODY', array() );
		$processor->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_BODY;

		/*
		 * Nodes which are implied by the fragment context but which don't
		 * appear in the input HTML, listed from the outermost inward as
		 * bookmark name => node name.
		 */
		$implied_nodes = array(
			'root-node'    => 'HTML',
			'context-node' => $processor->state->context_node[0],
		);

		// @TODO: Create "fake" bookmarks for non-existent but implied nodes.
		foreach ( array_keys( $implied_nodes ) as $bookmark_name ) {
			$processor->bookmarks[ $bookmark_name ] = new WP_HTML_Span( 0, 0 );
		}

		// Seed the stack of open elements once every implied bookmark exists.
		foreach ( $implied_nodes as $bookmark_name => $node_name ) {
			$processor->state->stack_of_open_elements->push(
				new WP_HTML_Token( $bookmark_name, $node_name, false )
			);
		}

		return $processor;
	}

	/**
	 * Constructor.
	 *
	 * Do not use this method. Use the static creator methods instead.
	 *
	 * @access private
	 *
	 * @since 6.4.0
	 *
	 * @see WP_HTML_Processor::create_fragment()
	 *
	 * @param string      $html                                  HTML to process.
	 * @param string|null $use_the_static_create_methods_instead This constructor should not be called manually.
*/
	public function __construct( $html, $use_the_static_create_methods_instead = null ) {
		parent::__construct( $html );

		// Only the static creator methods know the unlock code.
		$was_created_statically = self::CONSTRUCTOR_UNLOCK_CODE === $use_the_static_create_methods_instead;
		if ( ! $was_created_statically ) {
			$message = sprintf(
				/* translators: %s: WP_HTML_Processor::create_fragment(). */
				__( 'Call %s to create an HTML Processor instead of calling the constructor directly.' ),
				'<code>WP_HTML_Processor::create_fragment()</code>'
			);

			_doing_it_wrong( __METHOD__, $message, '6.4.0' );
		}

		$this->state = new WP_HTML_Processor_State();

		/*
		 * Wrap the parent's bookmark release in a closure so that it can
		 * be handed to WP_HTML_Token instances without adding anything
		 * to the public API of this class.
		 */
		$this->release_internal_bookmark_on_destruct = function ( $bookmark_name ) {
			parent::release_bookmark( $bookmark_name );
		};
	}

	/**
	 * Reports the most recent error, if there was one.
	 *
	 * Every kind of parsing failure makes this class return `false`,
	 * so the return value alone cannot tell a tag that wasn't found
	 * apart from content that made the processor give up and abort.
	 * Ask for the last error to distinguish between the two.
	 *
	 * Example
	 *
	 *     $processor = WP_HTML_Processor::create_fragment( '<template><strong><button><em><p><em>' );
	 *     false === $processor->next_tag();
	 *     WP_HTML_Processor::ERROR_UNSUPPORTED === $processor->get_last_error();
	 *
	 * @since 6.4.0
	 *
	 * @see self::ERROR_UNSUPPORTED
	 * @see self::ERROR_EXCEEDED_MAX_BOOKMARKS
	 *
	 * @return string|null The last error, if one exists, otherwise null.
	 */
	public function get_last_error() {
		return $this->last_error;
	}

	/**
	 * Finds the next tag matching the $query.
	 *
	 * @TODO: Support matching the class name and tag name.
	 *
	 * @since 6.4.0
	 *
	 * @throws Exception When unable to allocate a bookmark for the next token in the input HTML document.
	 *
	 * @param array|string|null $query {
	 *     Optional.
Which tag name to find, having which class, etc. Default is to find any tag.
	 *
	 *     @type string|null $tag_name     Which tag to find, or `null` for "any tag."
	 *     @type int|null    $match_offset Find the Nth tag matching all search criteria.
	 *                                     1 for "first" tag, 3 for "third," etc.
	 *                                     Defaults to first tag.
	 *     @type string|null $class_name   Tag must contain this whole class name to match.
	 *     @type string[]    $breadcrumbs  DOM sub-path at which element is found, e.g. `array( 'FIGURE', 'IMG' )`.
	 *                                     May also contain the wildcard `*` which matches a single element, e.g. `array( 'SECTION', '*' )`.
	 * }
	 * @return bool Whether a tag was matched.
	 */
	public function next_tag( $query = null ) {
		// A bare string is shorthand for a single-element breadcrumb query.
		if ( is_string( $query ) ) {
			$query = array( 'breadcrumbs' => array( $query ) );
		}

		if ( null !== $query && ! is_array( $query ) ) {
			_doing_it_wrong(
				__METHOD__,
				__( 'Please pass a query array to this function.' ),
				'6.4.0'
			);
			return false;
		}

		$has_breadcrumbs = (
			is_array( $query ) &&
			array_key_exists( 'breadcrumbs', $query ) &&
			is_array( $query['breadcrumbs'] )
		);

		// Without breadcrumbs any tag opener matches; tag closers are skipped.
		if ( ! $has_breadcrumbs ) {
			while ( $this->step() ) {
				if ( $this->is_tag_closer() ) {
					continue;
				}

				return true;
			}

			return false;
		}

		$wants_tag_closers = isset( $query['tag_closers'] ) && 'visit' === $query['tag_closers'];
		if ( $wants_tag_closers ) {
			_doing_it_wrong(
				__METHOD__,
				__( 'Cannot visit tag closers in HTML Processor.' ),
				'6.4.0'
			);
			return false;
		}

		$breadcrumbs       = $query['breadcrumbs'];
		$remaining_matches = isset( $query['match_offset'] ) ? (int) $query['match_offset'] : 1;

		// A non-positive offset can never be reached; don't advance the parser for it.
		if ( $remaining_matches <= 0 ) {
			return false;
		}

		while ( $this->step() ) {
			if ( ! $this->matches_breadcrumbs( $breadcrumbs ) ) {
				continue;
			}

			--$remaining_matches;
			if ( 0 === $remaining_matches ) {
				return true;
			}
		}

		return false;
	}

	/**
	 * Indicates if the currently-matched tag matches the given breadcrumbs.
	 *
	 * A "*" represents a single tag wildcard, where any tag matches, but not no tags.
*
	 * At some point this function _may_ support a `**` syntax for matching any number
	 * of unspecified tags in the breadcrumb stack. This has been intentionally left
	 * out, however, to keep this function simple and to avoid introducing backtracking,
	 * which could open up surprising performance breakdowns.
	 *
	 * Example:
	 *
	 *     $processor = WP_HTML_Processor::create_fragment( '<div><span><figure><img></figure></span></div>' );
	 *     $processor->next_tag( 'img' );
	 *     true  === $processor->matches_breadcrumbs( array( 'figure', 'img' ) );
	 *     true  === $processor->matches_breadcrumbs( array( 'span', 'figure', 'img' ) );
	 *     false === $processor->matches_breadcrumbs( array( 'span', 'img' ) );
	 *     true  === $processor->matches_breadcrumbs( array( 'span', '*', 'img' ) );
	 *
	 * @since 6.4.0
	 *
	 * @param string[] $breadcrumbs DOM sub-path at which element is found, e.g. `array( 'FIGURE', 'IMG' )`.
	 *                              May also contain the wildcard `*` which matches a single element, e.g. `array( 'SECTION', '*' )`.
	 * @return bool Whether the currently-matched tag is found at the given nested structure.
	 */
	public function matches_breadcrumbs( $breadcrumbs ) {
		$matched_tag = $this->get_tag();
		if ( ! $matched_tag ) {
			return false;
		}

		// Everything matches when there are zero constraints.
		$crumb_count = count( $breadcrumbs );
		if ( 0 === $crumb_count ) {
			return true;
		}

		// Compare from the innermost crumb outward, the same direction the stack is walked.
		$crumbs_from_leaf = array_values( array_reverse( $breadcrumbs ) );
		$leaf_crumb       = $crumbs_from_leaf[0];

		if ( '*' !== $leaf_crumb && strtoupper( $leaf_crumb ) !== $matched_tag ) {
			return false;
		}

		$depth      = 0;
		$last_depth = $crumb_count - 1;

		foreach ( $this->state->stack_of_open_elements->walk_up() as $open_element ) {
			$expected_name = strtoupper( $crumbs_from_leaf[ $depth ] );

			if ( '*' !== $expected_name && $expected_name !== $open_element->node_name ) {
				return false;
			}

			// Every crumb has been satisfied once the outermost one matches.
			if ( $depth === $last_depth ) {
				return true;
			}

			++$depth;
		}

		// The stack ran out before all of the crumbs were matched.
		return false;
	}

	/**
	 * Steps through the HTML document and stop at the next tag, if any.
	 *
	 * @since 6.4.0
	 *
	 * @throws Exception When unable to allocate a bookmark for the next token in the input HTML document.
*
	 * @see self::PROCESS_NEXT_NODE
	 * @see self::REPROCESS_CURRENT_NODE
	 *
	 * @param string $node_to_process Whether to parse the next node or reprocess the current node.
	 * @return bool Whether a tag was matched.
	 */
	public function step( $node_to_process = self::PROCESS_NEXT_NODE ) {
		// Refuse to proceed if there was a previous error.
		if ( null !== $this->last_error ) {
			return false;
		}

		if ( self::PROCESS_NEXT_NODE === $node_to_process ) {
			/*
			 * Void elements still hop onto the stack of open elements even though
			 * there's no corresponding closing tag. This is important for managing
			 * stack-based operations such as "navigate to parent node" or checking
			 * on an element's breadcrumbs.
			 *
			 * When moving on to the next node, therefore, if the bottom-most element
			 * on the stack is a void element, it must be closed.
			 *
			 * @TODO: Once self-closing foreign elements and BGSOUND are supported,
			 *        they must also be implicitly closed here too. BGSOUND is
			 *        special since it's only self-closing if the self-closing flag
			 *        is provided in the opening tag, otherwise it expects a tag closer.
			 */
			$top_node = $this->state->stack_of_open_elements->current_node();
			if ( $top_node && self::is_void( $top_node->node_name ) ) {
				$this->state->stack_of_open_elements->pop();
			}

			parent::next_tag( self::VISIT_EVERYTHING );
		}

		// Finish stepping when there are no more tokens in the document.
		if ( null === $this->get_tag() ) {
			return false;
		}

		$this->state->current_token = new WP_HTML_Token(
			$this->bookmark_tag(),
			$this->get_tag(),
			$this->is_tag_closer(),
			$this->release_internal_bookmark_on_destruct
		);

		try {
			switch ( $this->state->insertion_mode ) {
				case WP_HTML_Processor_State::INSERTION_MODE_IN_BODY:
					return $this->step_in_body();

				default:
					$this->last_error = self::ERROR_UNSUPPORTED;
					throw new WP_HTML_Unsupported_Exception( "No support for parsing in the '{$this->state->insertion_mode}' state." );
			}
		} catch ( WP_HTML_Unsupported_Exception $e ) {
			/*
			 * Exceptions are used in this class to escape deep call stacks that
			 * otherwise might involve messier calling and return conventions.
			 */
			return false;
		}
	}

	/**
	 * Computes the HTML breadcrumbs for the currently-matched node, if matched.
	 *
	 * Breadcrumbs start at the outermost parent and descend toward the matched element.
	 * They always include the entire path from the root HTML node to the matched element.
	 *
	 * @TODO: It could be more efficient to expose a generator-based version of this function
	 *        to avoid creating the array copy on tag iteration. If this is done, it would likely
	 *        be more useful to walk up the stack when yielding instead of starting at the top.
	 *
	 * Example
	 *
	 *     $processor = WP_HTML_Processor::create_fragment( '<p><strong><em><img></em></strong></p>' );
	 *     $processor->next_tag( 'IMG' );
	 *     $processor->get_breadcrumbs() === array( 'HTML', 'BODY', 'P', 'STRONG', 'EM', 'IMG' );
	 *
	 * @since 6.4.0
	 *
	 * @return string[]|null Array of tag names representing path to matched node, if matched, otherwise NULL.
	 */
	public function get_breadcrumbs() {
		if ( ! $this->get_tag() ) {
			return null;
		}

		$breadcrumbs = array();
		foreach ( $this->state->stack_of_open_elements->walk_down() as $stack_item ) {
			$breadcrumbs[] = $stack_item->node_name;
		}

		return $breadcrumbs;
	}

	/**
	 * Parses next element in the 'in body' insertion mode.
	 *
	 * This internal function performs the 'in body' insertion mode
	 * logic for the generalized WP_HTML_Processor::step() function.
	 *
	 * Each token is dispatched on an operation string made of a sigil
	 * and the tag name: `+DIV` for a tag opener, `-DIV` for a tag closer.
	 *
	 * @since 6.4.0
	 *
	 * @throws WP_HTML_Unsupported_Exception When encountering unsupported HTML input.
	 *
	 * @see https://html.spec.whatwg.org/#parsing-main-inbody
	 * @see WP_HTML_Processor::step
	 *
	 * @return bool Whether an element was found.
	 */
	private function step_in_body() {
		$tag_name = $this->get_tag();
		$op_sigil = $this->is_tag_closer() ? '-' : '+';
		$op       = "{$op_sigil}{$tag_name}";

		switch ( $op ) {
			/*
			 * > A start tag whose tag name is "button"
			 */
			case '+BUTTON':
				if ( $this->state->stack_of_open_elements->has_element_in_scope( 'BUTTON' ) ) {
					// @TODO: Indicate a parse error once it's possible. This error does not impact the logic here.
					$this->generate_implied_end_tags();
					$this->state->stack_of_open_elements->pop_until( 'BUTTON' );
				}

				$this->reconstruct_active_formatting_elements();
				$this->insert_html_element( $this->state->current_token );
				$this->state->frameset_ok = false;

				return true;

			/*
			 * > A start tag whose tag name is one of: "address", "article", "aside",
			 * > "blockquote", "center", "details", "dialog", "dir", "div", "dl",
			 * > "fieldset", "figcaption", "figure", "footer", "header", "hgroup",
			 * > "main", "menu", "nav", "ol", "p", "search", "section", "summary", "ul"
			 */
			case '+BLOCKQUOTE':
			case '+DIV':
			case '+FIGCAPTION':
			case '+FIGURE':
			case '+P':
				if ( $this->state->stack_of_open_elements->has_p_in_button_scope() ) {
					$this->close_a_p_element();
				}

				$this->insert_html_element( $this->state->current_token );
				return true;

			/*
			 * > An end tag whose tag name is one of: "address", "article", "aside", "blockquote",
			 * > "button", "center", "details", "dialog", "dir", "div", "dl", "fieldset",
			 * > "figcaption", "figure", "footer", "header", "hgroup", "listing", "main",
			 * > "menu", "nav", "ol", "pre", "search", "section", "summary", "ul"
			 */
			case '-BLOCKQUOTE':
			case '-BUTTON':
			case '-DIV':
			case '-FIGCAPTION':
			case '-FIGURE':
				if ( ! $this->state->stack_of_open_elements->has_element_in_scope( $tag_name ) ) {
					// @TODO: Report parse error.
					// Ignore the token.
					return $this->step();
				}

				$this->generate_implied_end_tags();
				if ( $this->state->stack_of_open_elements->current_node()->node_name !== $tag_name ) {
					// @TODO: Record parse error: this error doesn't impact parsing.
				}
				$this->state->stack_of_open_elements->pop_until( $tag_name );
				return true;

			/*
			 * > An end tag whose tag name is "p"
			 */
			case '-P':
				if ( ! $this->state->stack_of_open_elements->has_p_in_button_scope() ) {
					$this->insert_html_element( $this->state->current_token );
				}

				$this->close_a_p_element();
				return true;

			// > A start tag whose tag name is "a"
			case '+A':
				/*
				 * Only the entries between the end of the list and the last marker
				 * are of interest, and at most one A element among them is handled.
				 *
				 * A plain `break` here would only leave the inner `switch` and the
				 * walk would continue past the marker, or continue over a list that
				 * has just had entries removed. `break 2` leaves the `foreach` too.
				 */
				foreach ( $this->state->active_formatting_elements->walk_up() as $item ) {
					switch ( $item->node_name ) {
						case 'marker':
							break 2;

						case 'A':
							$this->run_adoption_agency_algorithm();
							$this->state->active_formatting_elements->remove_node( $item );
							$this->state->stack_of_open_elements->remove_node( $item );
							break 2;
					}
				}

				$this->reconstruct_active_formatting_elements();
				$this->insert_html_element( $this->state->current_token );
				$this->state->active_formatting_elements->push( $this->state->current_token );
				return true;

			/*
			 * > A start tag whose tag name is one of: "b", "big", "code", "em", "font", "i",
			 * > "s", "small", "strike", "strong", "tt", "u"
			 */
			case '+B':
			case '+BIG':
			case '+CODE':
			case '+EM':
			case '+FONT':
			case '+I':
			case '+S':
			case '+SMALL':
			case '+STRIKE':
			case '+STRONG':
			case '+TT':
			case '+U':
				$this->reconstruct_active_formatting_elements();
				$this->insert_html_element( $this->state->current_token );
				$this->state->active_formatting_elements->push( $this->state->current_token );
				return true;

			/*
			 * > An end tag whose tag name is one of: "a", "b", "big", "code", "em", "font", "i",
			 * > "nobr", "s", "small", "strike", "strong", "tt", "u"
			 */
			case '-A':
			case '-B':
			case '-BIG':
			case '-CODE':
			case '-EM':
			case '-FONT':
			case '-I':
			case '-S':
			case '-SMALL':
			case '-STRIKE':
			case '-STRONG':
			case '-TT':
			case '-U':
				$this->run_adoption_agency_algorithm();
				return true;

			/*
			 * > A start tag whose tag name is one of: "area", "br", "embed", "img", "keygen", "wbr"
			 */
			case '+IMG':
				$this->reconstruct_active_formatting_elements();
				$this->insert_html_element( $this->state->current_token );
				return true;

			/*
			 * > Any other start tag
			 */
			case '+SPAN':
				$this->reconstruct_active_formatting_elements();
				$this->insert_html_element( $this->state->current_token );
				return true;

			/*
			 * Any other end tag
			 */
			case '-SPAN':
				foreach ( $this->state->stack_of_open_elements->walk_up() as $item ) {
					// > If node is an HTML element with the same tag name as the token, then:
					if ( $item->node_name === $tag_name ) {
						$this->generate_implied_end_tags( $tag_name );

						// > If node is not the current node, then this is a parse error.

						$this->state->stack_of_open_elements->pop_until( $tag_name );
						return true;
					}

					// > Otherwise, if node is in the special category, then this is a parse error; ignore the token, and return.
					if ( self::is_special( $item->node_name ) ) {
						return $this->step();
					}
				}
				// Execution should not reach here; if it does then something went wrong.
				return false;

			default:
				$this->last_error = self::ERROR_UNSUPPORTED;
				throw new WP_HTML_Unsupported_Exception( "Cannot process {$tag_name} element." );
		}
	}

	/*
	 * Internal helpers
	 */

	/**
	 * Creates a new bookmark for the currently-matched tag and returns the generated name.
	 *
	 * The name is the next value of an incrementing counter, returned as a string.
	 *
	 * @since 6.4.0
	 *
	 * @throws Exception When unable to allocate requested bookmark.
	 *
	 * @return string|false Name of created bookmark, or false if unable to create.
	 */
	private function bookmark_tag() {
		if ( ! $this->get_tag() ) {
			return false;
		}

		if ( ! parent::set_bookmark( ++$this->bookmark_counter ) ) {
			$this->last_error = self::ERROR_EXCEEDED_MAX_BOOKMARKS;
			throw new Exception( 'could not allocate bookmark' );
		}

		return "{$this->bookmark_counter}";
	}

	/*
	 * HTML semantic overrides for Tag Processor
	 */

	/**
	 * Returns the uppercase name of the matched tag.
*
	 * The semantic rules for HTML specify that certain tags be reprocessed
	 * with a different tag name. Because of this, the tag name presented
	 * by the HTML Processor may differ from the one reported by the HTML
	 * Tag Processor, which doesn't apply these semantic rules.
	 *
	 * Example:
	 *
	 *     $processor = new WP_HTML_Tag_Processor( '<div class="test">Test</div>' );
	 *     $processor->next_tag() === true;
	 *     $processor->get_tag() === 'DIV';
	 *
	 *     $processor->next_tag() === false;
	 *     $processor->get_tag() === null;
	 *
	 * @since 6.4.0
	 *
	 * @return string|null Name of currently matched tag in input HTML, or `null` if none found.
	 */
	public function get_tag() {
		// No tag is reported once the processor has failed.
		if ( null !== $this->last_error ) {
			return null;
		}

		$raw_tag_name = parent::get_tag();

		/*
		 * > A start tag whose tag name is "image"
		 * > Change the token's tag name to "img" and reprocess it. (Don't ask.)
		 */
		return 'IMAGE' === $raw_tag_name ? 'IMG' : $raw_tag_name;
	}

	/**
	 * Removes a bookmark that is no longer needed.
	 *
	 * Releasing a bookmark frees up the small
	 * performance overhead it requires.
	 *
	 * The given name is prefixed with an underscore before
	 * the request is passed on to the Tag Processor.
	 *
	 * @since 6.4.0
	 *
	 * @param string $bookmark_name Name of the bookmark to remove.
	 * @return bool Whether the bookmark already existed before removal.
	 */
	public function release_bookmark( $bookmark_name ) {
		$actual_bookmark_name = '_' . $bookmark_name;

		return parent::release_bookmark( $actual_bookmark_name );
	}

	/**
	 * Moves the internal cursor in the HTML Processor to a given bookmark's location.
	 *
	 * In order to prevent accidental infinite loops, there's a
	 * maximum limit on the number of times seek() can be called.
	 *
	 * @throws Exception When unable to allocate a bookmark for the next token in the input HTML document.
	 *
	 * @since 6.4.0
	 *
	 * @param string $bookmark_name Jump to the place in the document identified by this bookmark name.
	 * @return bool Whether the internal cursor was successfully moved to the bookmark's location.
*/
	public function seek( $bookmark_name ) {
		$actual_bookmark_name = "_{$bookmark_name}";

		/*
		 * An unknown bookmark has no position to compare against. Hand the
		 * request to the Tag Processor, the same call the backward path ends
		 * with, instead of dereferencing a missing array entry below.
		 *
		 * NOTE(review): presumably the Tag Processor reports the unknown
		 * bookmark and returns false; confirm against the parent class.
		 */
		if ( ! isset( $this->bookmarks[ $actual_bookmark_name ] ) ) {
			return parent::seek( $actual_bookmark_name );
		}

		$processor_started_at = $this->state->current_token
			? $this->bookmarks[ $this->state->current_token->bookmark_name ]->start
			: 0;
		$bookmark_starts_at   = $this->bookmarks[ $actual_bookmark_name ]->start;

		// Seeking to the current position is treated as a backward seek.
		if ( $bookmark_starts_at > $processor_started_at ) {
			// When moving forwards, re-parse the document until reaching the same location as the original bookmark.
			while ( $this->step() ) {
				if ( $bookmark_starts_at === $this->bookmarks[ $this->state->current_token->bookmark_name ]->start ) {
					return true;
				}
			}

			return false;
		}

		/*
		 * When moving backwards, clear out all existing stack entries which appear after the destination
		 * bookmark. These could be stored for later retrieval, but doing so would require additional
		 * memory overhead and also demand that references and bookmarks are updated as the document
		 * changes. In time this could be a valuable optimization, but it's okay to give up that
		 * optimization in exchange for more CPU time to recompute the stack, to re-parse the
		 * document that may have already been parsed once.
		 */
		foreach ( $this->state->stack_of_open_elements->walk_up() as $item ) {
			if ( $bookmark_starts_at >= $this->bookmarks[ $item->bookmark_name ]->start ) {
				break;
			}

			$this->state->stack_of_open_elements->remove_node( $item );
		}

		foreach ( $this->state->active_formatting_elements->walk_up() as $item ) {
			if ( $bookmark_starts_at >= $this->bookmarks[ $item->bookmark_name ]->start ) {
				break;
			}

			$this->state->active_formatting_elements->remove_node( $item );
		}

		return parent::seek( $actual_bookmark_name );
	}

	/**
	 * Sets a bookmark in the HTML document.
	 *
	 * Bookmarks represent specific places or tokens in the HTML
	 * document, such as a tag opener or closer.
When applying + * edits to a document, such as setting an attribute, the + * text offsets of that token may shift; the bookmark is + * kept updated with those shifts and remains stable unless + * the entire span of text in which the token sits is removed. + * + * Release bookmarks when they are no longer needed. + * + * Example: + * + * <main><h2>Surprising fact you may not know!</h2></main> + * ^ ^ + * \-|-- this `H2` opener bookmark tracks the token + * + * <main class="clickbait"><h2>Surprising fact you may no… + * ^ ^ + * \-|-- it shifts with edits + * + * Bookmarks provide the ability to seek to a previously-scanned + * place in the HTML document. This avoids the need to re-scan + * the entire document. + * + * Example: + * + * <ul><li>One</li><li>Two</li><li>Three</li></ul> + * ^^^^ + * want to note this last item + * + * $p = new WP_HTML_Tag_Processor( $html ); + * $in_list = false; + * while ( $p->next_tag( array( 'tag_closers' => $in_list ? 'visit' : 'skip' ) ) ) { + * if ( 'UL' === $p->get_tag() ) { + * if ( $p->is_tag_closer() ) { + * $in_list = false; + * $p->set_bookmark( 'resume' ); + * if ( $p->seek( 'last-li' ) ) { + * $p->add_class( 'last-li' ); + * } + * $p->seek( 'resume' ); + * $p->release_bookmark( 'last-li' ); + * $p->release_bookmark( 'resume' ); + * } else { + * $in_list = true; + * } + * } + * + * if ( 'LI' === $p->get_tag() ) { + * $p->set_bookmark( 'last-li' ); + * } + * } + * + * Bookmarks intentionally hide the internal string offsets + * to which they refer. They are maintained internally as + * updates are applied to the HTML document and therefore + * retain their "position" - the location to which they + * originally pointed. The inability to use bookmarks with + * functions like `substr` is therefore intentional to guard + * against accidentally breaking the HTML. + * + * Because bookmarks allocate memory and require processing + * for every applied update, they are limited and require + * a name. 
They should not be created with programmatically-made
	 * names, such as "li_{$index}" with some loop. As a general
	 * rule they should only be created with string-literal names
	 * like "start-of-section" or "last-paragraph".
	 *
	 * Bookmarks are a powerful tool to enable complicated behavior.
	 * Consider double-checking that you need this tool if you are
	 * reaching for it, as inappropriate use could lead to broken
	 * HTML structure or unwanted processing overhead.
	 *
	 * @since 6.4.0
	 *
	 * @param string $bookmark_name Identifies this particular bookmark.
	 * @return bool Whether the bookmark was successfully created.
	 */
	public function set_bookmark( $bookmark_name ) {
		return parent::set_bookmark( "_{$bookmark_name}" );
	}

	/*
	 * HTML Parsing Algorithms
	 */

	/**
	 * Closes a P element.
	 *
	 * @since 6.4.0
	 *
	 * @throws WP_HTML_Unsupported_Exception When encountering unsupported HTML input.
	 *
	 * @see https://html.spec.whatwg.org/#close-a-p-element
	 */
	private function close_a_p_element() {
		$this->generate_implied_end_tags( 'P' );
		$this->state->stack_of_open_elements->pop_until( 'P' );
	}

	/**
	 * Closes elements that have implied end tags.
	 *
	 * Pops the current node off the stack of open elements for as long as
	 * it is one of the elements with an implied end tag and is not the
	 * excepted element.
	 *
	 * @since 6.4.0
	 *
	 * @see https://html.spec.whatwg.org/#generate-implied-end-tags
	 *
	 * @param string|null $except_for_this_element Perform as if this element doesn't exist in the stack of open elements.
	 */
	private function generate_implied_end_tags( $except_for_this_element = null ) {
		$elements_with_implied_end_tags = array(
			'P',
		);

		/*
		 * The stack holds token objects while the list holds tag names, so
		 * the comparison has to be made against the node's name; a strict
		 * search for the token itself can never match a string.
		 */
		$current_node = $this->state->stack_of_open_elements->current_node();
		while (
			$current_node &&
			$current_node->node_name !== $except_for_this_element &&
			in_array( $current_node->node_name, $elements_with_implied_end_tags, true )
		) {
			$this->state->stack_of_open_elements->pop();

			// Popping exposes a new current node, which must be examined in turn.
			$current_node = $this->state->stack_of_open_elements->current_node();
		}
	}

	/**
	 * Closes elements that have implied end tags, thoroughly.
*
	 * See the HTML specification for an explanation why this is
	 * different from generating end tags in the normal sense.
	 *
	 * @since 6.4.0
	 *
	 * @see WP_HTML_Processor::generate_implied_end_tags
	 * @see https://html.spec.whatwg.org/#generate-implied-end-tags
	 */
	private function generate_implied_end_tags_thoroughly() {
		$elements_with_implied_end_tags = array(
			'P',
		);

		/*
		 * The stack holds token objects while the list holds tag names, so
		 * the comparison has to be made against the node's name; a strict
		 * search for the token itself can never match a string.
		 */
		$current_node = $this->state->stack_of_open_elements->current_node();
		while ( $current_node && in_array( $current_node->node_name, $elements_with_implied_end_tags, true ) ) {
			$this->state->stack_of_open_elements->pop();

			// Popping exposes a new current node, which must be examined in turn.
			$current_node = $this->state->stack_of_open_elements->current_node();
		}
	}

	/**
	 * Reconstructs the active formatting elements.
	 *
	 * > This has the effect of reopening all the formatting elements that were opened
	 * > in the current body, cell, or caption (whichever is youngest) that haven't
	 * > been explicitly closed.
	 *
	 * Only the cases in which there is nothing to reconstruct are handled;
	 * any other case is reported as unsupported.
	 *
	 * @since 6.4.0
	 *
	 * @throws WP_HTML_Unsupported_Exception When encountering unsupported HTML input.
	 *
	 * @see https://html.spec.whatwg.org/#reconstruct-the-active-formatting-elements
	 *
	 * @return bool Whether any formatting elements needed to be reconstructed.
	 */
	private function reconstruct_active_formatting_elements() {
		/*
		 * > If there are no entries in the list of active formatting elements, then there is nothing
		 * > to reconstruct; stop this algorithm.
		 */
		if ( 0 === $this->state->active_formatting_elements->count() ) {
			return false;
		}

		$last_entry = $this->state->active_formatting_elements->current_node();
		if (

			/*
			 * > If the last (most recently added) entry in the list of active formatting elements is a marker;
			 * > stop this algorithm.
			 */
			'marker' === $last_entry->node_name ||

			/*
			 * > If the last (most recently added) entry in the list of active formatting elements is an
			 * > element that is in the stack of open elements, then there is nothing to reconstruct;
			 * > stop this algorithm.
			 */
			$this->state->stack_of_open_elements->contains_node( $last_entry )
		) {
			return false;
		}

		$this->last_error = self::ERROR_UNSUPPORTED;
		throw new WP_HTML_Unsupported_Exception( 'Cannot reconstruct active formatting elements when advancing and rewinding is required.' );
	}

	/**
	 * Runs the adoption agency algorithm.
	 *
	 * Only the paths of the algorithm which finish without a furthest
	 * block are handled; every other path is reported as unsupported.
	 *
	 * @since 6.4.0
	 *
	 * @throws WP_HTML_Unsupported_Exception When encountering unsupported HTML input.
	 *
	 * @see https://html.spec.whatwg.org/#adoption-agency-algorithm
	 */
	private function run_adoption_agency_algorithm() {
		$budget       = 1000;
		$subject      = $this->get_tag();
		$current_node = $this->state->stack_of_open_elements->current_node();

		if (
			// > If the current node is an HTML element whose tag name is subject
			$current_node && $subject === $current_node->node_name &&
			// > the current node is not in the list of active formatting elements
			! $this->state->active_formatting_elements->contains_node( $current_node )
		) {
			$this->state->stack_of_open_elements->pop();
			return;
		}

		$outer_loop_counter = 0;
		while ( $budget-- > 0 ) {
			if ( $outer_loop_counter++ >= 8 ) {
				return;
			}

			/*
			 * > Let formatting element be the last element in the list of active formatting elements that:
			 * >   - is between the end of the list and the last marker in the list,
			 * >     if any, or the start of the list otherwise,
			 * >   - and has the tag name subject.
			 */
			$formatting_element = null;
			foreach ( $this->state->active_formatting_elements->walk_up() as $item ) {
				if ( 'marker' === $item->node_name ) {
					break;
				}

				if ( $subject === $item->node_name ) {
					$formatting_element = $item;
					break;
				}
			}

			// > If there is no such element, then return and instead act as described in the "any other end tag" entry above.
			if ( null === $formatting_element ) {
				$this->last_error = self::ERROR_UNSUPPORTED;
				throw new WP_HTML_Unsupported_Exception( 'Cannot run adoption agency when "any other end tag" is required.' );
			}

			// > If formatting element is not in the stack of open elements, then this is a parse error; remove the element from the list, and return.
			if ( ! $this->state->stack_of_open_elements->contains_node( $formatting_element ) ) {
				// Pass the token itself, as every other caller of remove_node() does, not its bookmark name.
				$this->state->active_formatting_elements->remove_node( $formatting_element );
				return;
			}

			// > If formatting element is in the stack of open elements, but the element is not in scope, then this is a parse error; return.
			if ( ! $this->state->stack_of_open_elements->has_element_in_scope( $formatting_element->node_name ) ) {
				return;
			}

			/*
			 * > Let furthest block be the topmost node in the stack of open elements that is lower in the stack
			 * > than formatting element, and is an element in the special category. There might not be one.
			 */
			$is_above_formatting_element = true;
			$furthest_block              = null;
			foreach ( $this->state->stack_of_open_elements->walk_down() as $item ) {
				if ( $is_above_formatting_element && $formatting_element->bookmark_name !== $item->bookmark_name ) {
					continue;
				}

				if ( $is_above_formatting_element ) {
					$is_above_formatting_element = false;
					continue;
				}

				if ( self::is_special( $item->node_name ) ) {
					$furthest_block = $item;
					break;
				}
			}

			/*
			 * > If there is no furthest block, then the UA must first pop all the nodes from the bottom of the
			 * > stack of open elements, from the current node up to and including formatting element, then
			 * > remove formatting element from the list of active formatting elements, and finally return.
			 */
			if ( null === $furthest_block ) {
				foreach ( $this->state->stack_of_open_elements->walk_up() as $item ) {
					$this->state->stack_of_open_elements->pop();

					if ( $formatting_element->bookmark_name === $item->bookmark_name ) {
						$this->state->active_formatting_elements->remove_node( $formatting_element );
						return;
					}
				}
			}

			$this->last_error = self::ERROR_UNSUPPORTED;
			throw new WP_HTML_Unsupported_Exception( 'Cannot extract common ancestor in adoption agency algorithm.' );
		}

		$this->last_error = self::ERROR_UNSUPPORTED;
		throw new WP_HTML_Unsupported_Exception( 'Cannot run adoption agency when looping required.' );
	}

	/**
	 * Inserts an HTML element on the stack of open elements.
	 *
	 * @since 6.4.0
	 *
	 * @see https://html.spec.whatwg.org/#insert-a-foreign-element
	 *
	 * @param WP_HTML_Token $token Token for the element to push onto the stack of open elements.
	 */
	private function insert_html_element( $token ) {
		$this->state->stack_of_open_elements->push( $token );
	}

	/*
	 * HTML Specification Helpers
	 */

	/**
	 * Returns whether an element of a given name is in the HTML special category.
	 *
	 * @since 6.4.0
	 *
	 * @see https://html.spec.whatwg.org/#special
	 *
	 * @param string $tag_name Name of element to check.
	 * @return bool Whether the element of the given name is in the special category.
+ */ + public static function is_special( $tag_name ) { + $tag_name = strtoupper( $tag_name ); + + return ( + 'ADDRESS' === $tag_name || + 'APPLET' === $tag_name || + 'AREA' === $tag_name || + 'ARTICLE' === $tag_name || + 'ASIDE' === $tag_name || + 'BASE' === $tag_name || + 'BASEFONT' === $tag_name || + 'BGSOUND' === $tag_name || + 'BLOCKQUOTE' === $tag_name || + 'BODY' === $tag_name || + 'BR' === $tag_name || + 'BUTTON' === $tag_name || + 'CAPTION' === $tag_name || + 'CENTER' === $tag_name || + 'COL' === $tag_name || + 'COLGROUP' === $tag_name || + 'DD' === $tag_name || + 'DETAILS' === $tag_name || + 'DIR' === $tag_name || + 'DIV' === $tag_name || + 'DL' === $tag_name || + 'DT' === $tag_name || + 'EMBED' === $tag_name || + 'FIELDSET' === $tag_name || + 'FIGCAPTION' === $tag_name || + 'FIGURE' === $tag_name || + 'FOOTER' === $tag_name || + 'FORM' === $tag_name || + 'FRAME' === $tag_name || + 'FRAMESET' === $tag_name || + 'H1' === $tag_name || + 'H2' === $tag_name || + 'H3' === $tag_name || + 'H4' === $tag_name || + 'H5' === $tag_name || + 'H6' === $tag_name || + 'HEAD' === $tag_name || + 'HEADER' === $tag_name || + 'HGROUP' === $tag_name || + 'HR' === $tag_name || + 'HTML' === $tag_name || + 'IFRAME' === $tag_name || + 'IMG' === $tag_name || + 'INPUT' === $tag_name || + 'KEYGEN' === $tag_name || + 'LI' === $tag_name || + 'LINK' === $tag_name || + 'LISTING' === $tag_name || + 'MAIN' === $tag_name || + 'MARQUEE' === $tag_name || + 'MENU' === $tag_name || + 'META' === $tag_name || + 'NAV' === $tag_name || + 'NOEMBED' === $tag_name || + 'NOFRAMES' === $tag_name || + 'NOSCRIPT' === $tag_name || + 'OBJECT' === $tag_name || + 'OL' === $tag_name || + 'P' === $tag_name || + 'PARAM' === $tag_name || + 'PLAINTEXT' === $tag_name || + 'PRE' === $tag_name || + 'SCRIPT' === $tag_name || + 'SEARCH' === $tag_name || + 'SECTION' === $tag_name || + 'SELECT' === $tag_name || + 'SOURCE' === $tag_name || + 'STYLE' === $tag_name || + 'SUMMARY' === $tag_name || + 'TABLE' === $tag_name || 
+ 'TBODY' === $tag_name || + 'TD' === $tag_name || + 'TEMPLATE' === $tag_name || + 'TEXTAREA' === $tag_name || + 'TFOOT' === $tag_name || + 'TH' === $tag_name || + 'THEAD' === $tag_name || + 'TITLE' === $tag_name || + 'TR' === $tag_name || + 'TRACK' === $tag_name || + 'UL' === $tag_name || + 'WBR' === $tag_name || + 'XMP' === $tag_name || + + // MathML. + 'MI' === $tag_name || + 'MO' === $tag_name || + 'MN' === $tag_name || + 'MS' === $tag_name || + 'MTEXT' === $tag_name || + 'ANNOTATION-XML' === $tag_name || + + // SVG. + 'FOREIGNOBJECT' === $tag_name || + 'DESC' === $tag_name || + 'TITLE' === $tag_name + ); + } + + /** + * Returns whether a given element is an HTML Void Element + * + * > area, base, br, col, embed, hr, img, input, link, meta, source, track, wbr + * + * @since 6.4.0 + * + * @see https://html.spec.whatwg.org/#void-elements + * + * @param string $tag_name Name of HTML tag to check. + * @return bool Whether the given tag is an HTML Void Element. + */ + public static function is_void( $tag_name ) { + $tag_name = strtoupper( $tag_name ); + + return ( + 'AREA' === $tag_name || + 'BASE' === $tag_name || + 'BR' === $tag_name || + 'COL' === $tag_name || + 'EMBED' === $tag_name || + 'HR' === $tag_name || + 'IMG' === $tag_name || + 'INPUT' === $tag_name || + 'LINK' === $tag_name || + 'META' === $tag_name || + 'SOURCE' === $tag_name || + 'TRACK' === $tag_name || + 'WBR' === $tag_name + ); + } + + /* + * Constants that would pollute the top of the class if they were found there. + */ + + /** + * Indicates that the next HTML token should be parsed and processed. + * + * @since 6.4.0 + * + * @var string + */ + const PROCESS_NEXT_NODE = 'process-next-node'; + + /** + * Indicates that the current HTML token should be reprocessed in the newly-selected insertion mode. + * + * @since 6.4.0 + * + * @var string + */ + const REPROCESS_CURRENT_NODE = 'reprocess-current-node'; + + /** + * Indicates that the parser encountered unsupported markup and has bailed. 
+ * + * @since 6.4.0 + * + * @var string + */ + const ERROR_UNSUPPORTED = 'unsupported'; + + /** + * Indicates that the parser encountered more HTML tokens than it + * was able to process and has bailed. + * + * @since 6.4.0 + * + * @var string + */ + const ERROR_EXCEEDED_MAX_BOOKMARKS = 'exceeded-max-bookmarks'; + + /** + * Unlock code that must be passed into the constructor to create this class. + * + * This class extends the WP_HTML_Tag_Processor, which has a public class + * constructor. Therefore, it's not possible to have a private constructor here. + * + * This unlock code is used to ensure that anyone calling the constructor is + * doing so with a full understanding that it's intended to be a private API. + * + * @access private + */ + const CONSTRUCTOR_UNLOCK_CODE = 'Use WP_HTML_Processor::create_fragment() instead of calling the class constructor directly.'; +} diff --git a/wp-includes/html-api/class-wp-html-span.php b/wp-includes/html-api/class-wp-html-span.php new file mode 100644 index 0000000..46227eb --- /dev/null +++ b/wp-includes/html-api/class-wp-html-span.php @@ -0,0 +1,53 @@ +<?php +/** + * HTML API: WP_HTML_Span class + * + * @package WordPress + * @subpackage HTML-API + * @since 6.2.0 + */ + +/** + * Core class used by the HTML tag processor to represent a textual span + * inside an HTML document. + * + * This is a two-tuple in disguise, used to avoid the memory overhead + * involved in using an array for the same purpose. + * + * This class is for internal usage of the WP_HTML_Tag_Processor class. + * + * @access private + * @since 6.2.0 + * + * @see WP_HTML_Tag_Processor + */ +class WP_HTML_Span { + /** + * Byte offset into document where span begins. + * + * @since 6.2.0 + * @var int + */ + public $start; + + /** + * Byte offset into document where span ends. + * + * @since 6.2.0 + * @var int + */ + public $end; + + /** + * Constructor. + * + * @since 6.2.0 + * + * @param int $start Byte offset into document where replacement span begins. 
+ * @param int $end Byte offset into document where replacement span ends. + */ + public function __construct( $start, $end ) { + $this->start = $start; + $this->end = $end; + } +} diff --git a/wp-includes/html-api/class-wp-html-tag-processor.php b/wp-includes/html-api/class-wp-html-tag-processor.php new file mode 100644 index 0000000..0572c46 --- /dev/null +++ b/wp-includes/html-api/class-wp-html-tag-processor.php @@ -0,0 +1,2450 @@ +<?php +/** + * HTML API: WP_HTML_Tag_Processor class + * + * Scans through an HTML document to find specific tags, then + * transforms those tags by adding, removing, or updating the + * values of the HTML attributes within that tag (opener). + * + * Does not fully parse HTML or _recurse_ into the HTML structure. + * Instead this scans linearly through a document and only parses + * the HTML tag openers. + * + * ### Possible future direction for this module + * + * - Prune the whitespace when removing classes/attributes: e.g. "a b c" -> "c" not " c". + * This would increase the size of the changes for some operations but leave more + * natural-looking output HTML. + * - Decode HTML character references within class names when matching. E.g. match having + * class `1<"2` needs to recognize `class="1&lt;&quot;2"`. Currently the Tag Processor + * will fail to find the right tag if the class name is encoded as such. + * - Properly decode HTML character references in `get_attribute()`. PHP's + * `html_entity_decode()` is wrong in a couple of ways: it doesn't account for the + * no-ambiguous-ampersand rule, and it improperly handles the way semicolons may + * or may not terminate a character reference. + * + * @package WordPress + * @subpackage HTML-API + * @since 6.2.0 + */ + +/** + * Core class used to modify attributes in an HTML document for tags matching a query. + * + * ## Usage + * + * Use of this class requires three steps: + * + * 1. Create a new class instance with your input HTML document. + * 2. Find the tag(s) you are looking for. + * 3. 
Request changes to the attributes in those tag(s). + * + * Example: + * + * $tags = new WP_HTML_Tag_Processor( $html ); + * if ( $tags->next_tag( 'option' ) ) { + * $tags->set_attribute( 'selected', true ); + * } + * + * ### Finding tags + * + * The `next_tag()` function moves the internal cursor through + * your input HTML document until it finds a tag meeting any of + * the supplied restrictions in the optional query argument. If + * no argument is provided then it will find the next HTML tag, + * regardless of what kind it is. + * + * If you want to _find whatever the next tag is_: + * + * $tags->next_tag(); + * + * | Goal | Query | + * |-----------------------------------------------------------|---------------------------------------------------------------------------------| + * | Find any tag. | `$tags->next_tag();` | + * | Find next image tag. | `$tags->next_tag( array( 'tag_name' => 'img' ) );` | + * | Find next image tag (without passing the array). | `$tags->next_tag( 'img' );` | + * | Find next tag containing the `fullwidth` CSS class. | `$tags->next_tag( array( 'class_name' => 'fullwidth' ) );` | + * | Find next image tag containing the `fullwidth` CSS class. | `$tags->next_tag( array( 'tag_name' => 'img', 'class_name' => 'fullwidth' ) );` | + * + * If a tag was found meeting your criteria then `next_tag()` + * will return `true` and you can proceed to modify it. If it + * returns `false`, however, it failed to find the tag and + * moved the cursor to the end of the file. + * + * Once the cursor reaches the end of the file the processor + * is done and if you want to reach an earlier tag you will + * need to recreate the processor and start over, as it's + * unable to back up or move in reverse. + * + * See the section on bookmarks for an exception to this + * no-backing-up rule. + * + * #### Custom queries + * + * Sometimes it's necessary to further inspect an HTML tag than + * the query syntax here permits. 
In these cases one may further + * inspect the search results using the read-only functions + * provided by the processor or external state or variables. + * + * Example: + * + * // Paint up to the first five DIV or SPAN tags marked with the "jazzy" style. + * $remaining_count = 5; + * while ( $remaining_count > 0 && $tags->next_tag() ) { + * if ( + * ( 'DIV' === $tags->get_tag() || 'SPAN' === $tags->get_tag() ) && + * 'jazzy' === $tags->get_attribute( 'data-style' ) + * ) { + * $tags->add_class( 'theme-style-everest-jazz' ); + * $remaining_count--; + * } + * } + * + * `get_attribute()` will return `null` if the attribute wasn't present + * on the tag when it was called. It may return `""` (the empty string) + * in cases where the attribute was present but its value was empty. + * For boolean attributes, those whose name is present but no value is + * given, it will return `true` (the only way to set `false` for an + * attribute is to remove it). + * + * ### Modifying HTML attributes for a found tag + * + * Once you've found the start of an opening tag you can modify + * any number of the attributes on that tag. You can set a new + * value for an attribute, remove the entire attribute, or do + * nothing and move on to the next opening tag. + * + * Example: + * + * if ( $tags->next_tag( array( 'class_name' => 'wp-group-block' ) ) ) { + * $tags->set_attribute( 'title', 'This groups the contained content.' ); + * $tags->remove_attribute( 'data-test-id' ); + * } + * + * If `set_attribute()` is called for an existing attribute it will + * overwrite the existing value. Similarly, calling `remove_attribute()` + * for a non-existing attribute has no effect on the document. Both + * of these methods are safe to call without knowing if a given attribute + * exists beforehand. + * + * ### Modifying CSS classes for a found tag + * + * The tag processor treats the `class` attribute as a special case. 
+ * Because it's a common operation to add or remove CSS classes, this + * interface adds helper methods to make that easier. + * + * As with attribute values, adding or removing CSS classes is a safe + * operation that doesn't require checking if the attribute or class + * exists before making changes. If removing the only class then the + * entire `class` attribute will be removed. + * + * Example: + * + * // from `<span>Yippee!</span>` + * // to `<span class="is-active">Yippee!</span>` + * $tags->add_class( 'is-active' ); + * + * // from `<span class="excited">Yippee!</span>` + * // to `<span class="excited is-active">Yippee!</span>` + * $tags->add_class( 'is-active' ); + * + * // from `<span class="is-active heavy-accent">Yippee!</span>` + * // to `<span class="is-active heavy-accent">Yippee!</span>` + * $tags->add_class( 'is-active' ); + * + * // from `<input type="text" class="is-active rugby not-disabled" length="24">` + * // to `<input type="text" class="is-active not-disabled" length="24">` + * $tags->remove_class( 'rugby' ); + * + * // from `<input type="text" class="rugby" length="24">` + * // to `<input type="text" length="24">` + * $tags->remove_class( 'rugby' ); + * + * // from `<input type="text" length="24">` + * // to `<input type="text" length="24">` + * $tags->remove_class( 'rugby' ); + * + * When class changes are enqueued but a direct change to `class` is made via + * `set_attribute` then the changes made through `set_attribute` (or `remove_attribute`) + * will take precedence over those made through `add_class` and `remove_class`. + * + * ### Bookmarks + * + * While scanning through the input HTML document it's possible to set + * a named bookmark when a particular tag is found. Later on, after + * continuing to scan other tags, it's possible to `seek` to one of + * the set bookmarks and then proceed again from that point forward. + * + * Because bookmarks create processing overhead one should avoid + * creating too many of them. 
As a rule, create only bookmarks + * of known string literal names; avoid creating "mark_{$index}" + * and so on. It's fine from a performance standpoint to create a + * bookmark and update it frequently, such as within a loop. + * + * $total_todos = 0; + * while ( $p->next_tag( array( 'tag_name' => 'UL', 'class_name' => 'todo' ) ) ) { + * $p->set_bookmark( 'list-start' ); + * while ( $p->next_tag( array( 'tag_closers' => 'visit' ) ) ) { + * if ( 'UL' === $p->get_tag() && $p->is_tag_closer() ) { + * $p->set_bookmark( 'list-end' ); + * $p->seek( 'list-start' ); + * $p->set_attribute( 'data-contained-todos', (string) $total_todos ); + * $total_todos = 0; + * $p->seek( 'list-end' ); + * break; + * } + * + * if ( 'LI' === $p->get_tag() && ! $p->is_tag_closer() ) { + * $total_todos++; + * } + * } + * } + * + * ## Design and limitations + * + * The Tag Processor is designed to linearly scan HTML documents and tokenize + * HTML tags and their attributes. It's designed to do this as efficiently as + * possible without compromising parsing integrity. Therefore it will be + * slower than some methods of modifying HTML, such as those incorporating + * over-simplified PCRE patterns, but will not introduce the defects and + * failures that those methods bring in, which lead to broken page renders + * and often to security vulnerabilities. On the other hand, it will be faster + * than full-blown HTML parsers such as DOMDocument and use considerably + * less memory. It requires a negligible memory overhead, enough to consider + * it a zero-overhead system. + * + * The performance characteristics are maintained by avoiding tree construction + * and semantic cleanups which are specified in HTML5. Because of this, for + * example, it's not possible for the Tag Processor to associate any given + * opening tag with its corresponding closing tag, or to return the inner markup + * inside an element. 
Systems may be built on top of the Tag Processor to do + * this, but the Tag Processor is and should be constrained so it can remain an + * efficient, low-level, and reliable HTML scanner. + * + * The Tag Processor's design incorporates a "garbage-in-garbage-out" philosophy. + * HTML5 specifies that certain invalid content be transformed into different forms + * for display, such as removing null bytes from an input document and replacing + * invalid characters with the Unicode replacement character `U+FFFD` (visually "�"). + * Where errors or transformations exist within the HTML5 specification, the Tag Processor + * leaves those invalid inputs untouched, passing them through to the final browser + * to handle. While this implies that certain operations will be non-spec-compliant, + * such as reading the value of an attribute with invalid content, it also preserves a + * simplicity and efficiency for handling those error cases. + * + * Most operations within the Tag Processor are designed to minimize the difference + * between an input and output document for any given change. For example, the + * `add_class` and `remove_class` methods preserve whitespace and the class ordering + * within the `class` attribute; and when encountering tags with duplicated attributes, + * the Tag Processor will leave those invalid duplicate attributes where they are but + * update the proper attribute which the browser will read for parsing its value. An + * exception to this rule is that all attribute updates store their values as + * double-quoted strings, meaning that attributes on input with single-quoted or + * unquoted values will appear in the output with double-quotes. + * + * @since 6.2.0 + * @since 6.2.1 Fix: Support for various invalid comments; attribute updates are case-insensitive. + * @since 6.3.2 Fix: Skip HTML-like content inside rawtext elements such as STYLE. 
+ */ +class WP_HTML_Tag_Processor { + /** + * The maximum number of bookmarks allowed to exist at + * any given time. + * + * @since 6.2.0 + * @var int + * + * @see WP_HTML_Tag_Processor::set_bookmark() + */ + const MAX_BOOKMARKS = 10; + + /** + * Maximum number of times seek() can be called. + * Prevents accidental infinite loops. + * + * @since 6.2.0 + * @var int + * + * @see WP_HTML_Tag_Processor::seek() + */ + const MAX_SEEK_OPS = 1000; + + /** + * The HTML document to parse. + * + * @since 6.2.0 + * @var string + */ + protected $html; + + /** + * The last query passed to next_tag(). + * + * @since 6.2.0 + * @var array|null + */ + private $last_query; + + /** + * The tag name this processor currently scans for. + * + * @since 6.2.0 + * @var string|null + */ + private $sought_tag_name; + + /** + * The CSS class name this processor currently scans for. + * + * @since 6.2.0 + * @var string|null + */ + private $sought_class_name; + + /** + * The match offset this processor currently scans for. + * + * @since 6.2.0 + * @var int|null + */ + private $sought_match_offset; + + /** + * Whether to visit tag closers, e.g. </div>, when walking an input document. + * + * @since 6.2.0 + * @var bool + */ + private $stop_on_tag_closers; + + /** + * How many bytes from the original HTML document have been read and parsed. + * + * This value points to the latest byte offset in the input document which + * has been already parsed. It is the internal cursor for the Tag Processor + * and updates while scanning through the HTML tokens. + * + * @since 6.2.0 + * @var int + */ + private $bytes_already_parsed = 0; + + /** + * Byte offset in input document where current tag name starts. + * + * Example: + * + * <div id="test">... + * 01234 + * - tag name starts at 1 + * + * @since 6.2.0 + * @var int|null + */ + private $tag_name_starts_at; + + /** + * Byte length of current tag name. + * + * Example: + * + * <div id="test">... 
+ * 01234 + * --- tag name length is 3 + * + * @since 6.2.0 + * @var int|null + */ + private $tag_name_length; + + /** + * Byte offset in input document where current tag token ends. + * + * Example: + * + * <div id="test">... + * 0 1 | + * 01234567890123456 + * --- tag name ends at 14 + * + * @since 6.2.0 + * @var int|null + */ + private $tag_ends_at; + + /** + * Whether the current tag is an opening tag, e.g. <div>, or a closing tag, e.g. </div>. + * + * @var bool + */ + private $is_closing_tag; + + /** + * Lazily-built index of attributes found within an HTML tag, keyed by the attribute name. + * + * Example: + * + * // Supposing the parser is working through this content + * // and stops after recognizing the `id` attribute. + * // <div id="test-4" class=outline title="data:text/plain;base64=asdk3nk1j3fo8"> + * // ^ parsing will continue from this point. + * $this->attributes = array( + * 'id' => new WP_HTML_Attribute_Match( 'id', null, 6, 17 ) + * ); + * + * // When picking up parsing again, or when asking to find the + * // `class` attribute we will continue and add to this array. + * $this->attributes = array( + * 'id' => new WP_HTML_Attribute_Match( 'id', null, 6, 17 ), + * 'class' => new WP_HTML_Attribute_Match( 'class', 'outline', 18, 32 ) + * ); + * + * // Note that only the `class` attribute value is stored in the index. + * // That's because it is the only value used by this class at the moment. + * + * @since 6.2.0 + * @var WP_HTML_Attribute_Token[] + */ + private $attributes = array(); + + /** + * Tracks spans of duplicate attributes on a given tag, used for removing + * all copies of an attribute when calling `remove_attribute()`. + * + * @since 6.3.2 + * + * @var (WP_HTML_Span[])[]|null + */ + private $duplicate_attributes = null; + + /** + * Which class names to add or remove from a tag. 
+ * + * These are tracked separately from attribute updates because they are + * semantically distinct, whereas this interface exists for the common + * case of adding and removing class names while other attributes are + * generally modified as with DOM `setAttribute` calls. + * + * When modifying an HTML document these will eventually be collapsed + * into a single `set_attribute( 'class', $changes )` call. + * + * Example: + * + * // Add the `wp-block-group` class, remove the `wp-group` class. + * $classname_updates = array( + * // Indexed by a comparable class name. + * 'wp-block-group' => WP_HTML_Tag_Processor::ADD_CLASS, + * 'wp-group' => WP_HTML_Tag_Processor::REMOVE_CLASS + * ); + * + * @since 6.2.0 + * @var bool[] + */ + private $classname_updates = array(); + + /** + * Tracks a semantic location in the original HTML which + * shifts with updates as they are applied to the document. + * + * @since 6.2.0 + * @var WP_HTML_Span[] + */ + protected $bookmarks = array(); + + const ADD_CLASS = true; + const REMOVE_CLASS = false; + const SKIP_CLASS = null; + + /** + * Lexical replacements to apply to input HTML document. + * + * "Lexical" in this class refers to the part of this class which + * operates on pure text _as text_ and not as HTML. There's a line + * between the public interface, with HTML-semantic methods like + * `set_attribute` and `add_class`, and an internal state that tracks + * text offsets in the input document. + * + * When higher-level HTML methods are called, those have to transform their + * operations (such as setting an attribute's value) into text diffing + * operations (such as replacing the sub-string from indices A to B with + * some given new string). These text-diffing operations are the lexical + * updates. + * + * As new higher-level methods are added they need to collapse their + * operations into these lower-level lexical updates since that's the + * Tag Processor's internal language of change. 
Any code which creates + * these lexical updates must ensure that they do not cross HTML syntax + * boundaries, however, so these should never be exposed outside of this + * class or any classes which intentionally expand its functionality. + * + * These are enqueued while editing the document instead of being immediately + * applied to avoid processing overhead, string allocations, and string + * copies when applying many updates to a single document. + * + * Example: + * + * // Replace an attribute stored with a new value, indices + * // sourced from the lazily-parsed HTML recognizer. + * $start = $attributes['src']->start; + * $end = $attributes['src']->end; + * $modifications[] = new WP_HTML_Text_Replacement( $start, $end, $new_value ); + * + * // Correspondingly, something like this will appear in this array. + * $lexical_updates = array( + * WP_HTML_Text_Replacement( 14, 28, 'https://my-site.my-domain/wp-content/uploads/2014/08/kittens.jpg' ) + * ); + * + * @since 6.2.0 + * @var WP_HTML_Text_Replacement[] + */ + protected $lexical_updates = array(); + + /** + * Tracks and limits `seek()` calls to prevent accidental infinite loops. + * + * @since 6.2.0 + * @var int + * + * @see WP_HTML_Tag_Processor::seek() + */ + protected $seek_count = 0; + + /** + * Constructor. + * + * @since 6.2.0 + * + * @param string $html HTML to process. + */ + public function __construct( $html ) { + $this->html = $html; + } + + /** + * Finds the next tag matching the $query. + * + * @since 6.2.0 + * + * @param array|string|null $query { + * Optional. Which tag name to find, having which class, etc. Default is to find any tag. + * + * @type string|null $tag_name Which tag to find, or `null` for "any tag." + * @type int|null $match_offset Find the Nth tag matching all search criteria. + * 1 for "first" tag, 3 for "third," etc. + * Defaults to first tag. + * @type string|null $class_name Tag must contain this whole class name to match. 
+ * @type string|null $tag_closers "visit" or "skip": whether to stop on tag closers, e.g. </div>. + * } + * @return bool Whether a tag was matched. + */ + public function next_tag( $query = null ) { + $this->parse_query( $query ); + $already_found = 0; + + do { + if ( $this->bytes_already_parsed >= strlen( $this->html ) ) { + return false; + } + + // Find the next tag if it exists. + if ( false === $this->parse_next_tag() ) { + $this->bytes_already_parsed = strlen( $this->html ); + + return false; + } + + // Parse all of its attributes. + while ( $this->parse_next_attribute() ) { + continue; + } + + // Ensure that the tag closes before the end of the document. + if ( $this->bytes_already_parsed >= strlen( $this->html ) ) { + return false; + } + + $tag_ends_at = strpos( $this->html, '>', $this->bytes_already_parsed ); + if ( false === $tag_ends_at ) { + return false; + } + $this->tag_ends_at = $tag_ends_at; + $this->bytes_already_parsed = $tag_ends_at; + + // Finally, check if the parsed tag and its attributes match the search query. + if ( $this->matches() ) { + ++$already_found; + } + + /* + * For non-DATA sections which might contain text that looks like HTML tags but + * isn't, scan with the appropriate alternative mode. Looking at the first letter + * of the tag name as a pre-check avoids a string allocation when it's not needed. + */ + $t = $this->html[ $this->tag_name_starts_at ]; + if ( + ! $this->is_closing_tag && + ( + 'i' === $t || 'I' === $t || + 'n' === $t || 'N' === $t || + 's' === $t || 'S' === $t || + 't' === $t || 'T' === $t + ) ) { + $tag_name = $this->get_tag(); + + if ( 'SCRIPT' === $tag_name && ! $this->skip_script_data() ) { + $this->bytes_already_parsed = strlen( $this->html ); + return false; + } elseif ( + ( 'TEXTAREA' === $tag_name || 'TITLE' === $tag_name ) && + ! 
$this->skip_rcdata( $tag_name ) + ) { + $this->bytes_already_parsed = strlen( $this->html ); + return false; + } elseif ( + ( + 'IFRAME' === $tag_name || + 'NOEMBED' === $tag_name || + 'NOFRAMES' === $tag_name || + 'NOSCRIPT' === $tag_name || + 'STYLE' === $tag_name + ) && + ! $this->skip_rawtext( $tag_name ) + ) { + /* + * "XMP" should be here too but its rules are more complicated and require the + * complexity of the HTML Processor (it needs to close out any open P element, + * meaning it can't be skipped here or else the HTML Processor will lose its + * place). For now, it can be ignored as it's a rare HTML tag in practice and + * any normative HTML should be using PRE instead. + */ + $this->bytes_already_parsed = strlen( $this->html ); + return false; + } + } + } while ( $already_found < $this->sought_match_offset ); + + return true; + } + + + /** + * Generator for a foreach loop to step through each class name for the matched tag. + * + * This generator function is designed to be used inside a "foreach" loop. + * + * Example: + * + * $p = new WP_HTML_Tag_Processor( "<div class='free <egg<\tlang-en'>" ); + * $p->next_tag(); + * foreach ( $p->class_list() as $class_name ) { + * echo "{$class_name} "; + * } + * // Outputs: "free <egg> lang-en " + * + * @since 6.4.0 + */ + public function class_list() { + /** @var string $class contains the string value of the class attribute, with character references decoded. */ + $class = $this->get_attribute( 'class' ); + + if ( ! is_string( $class ) ) { + return; + } + + $seen = array(); + + $at = 0; + while ( $at < strlen( $class ) ) { + // Skip past any initial boundary characters. + $at += strspn( $class, " \t\f\r\n", $at ); + if ( $at >= strlen( $class ) ) { + return; + } + + // Find the byte length until the next boundary. + $length = strcspn( $class, " \t\f\r\n", $at ); + if ( 0 === $length ) { + return; + } + + /* + * CSS class names are case-insensitive in the ASCII range. 
+ * + * @see https://www.w3.org/TR/CSS2/syndata.html#x1 + */ + $name = strtolower( substr( $class, $at, $length ) ); + $at += $length; + + /* + * It's expected that the number of class names for a given tag is relatively small. + * Given this, it is probably faster overall to scan an array for a value rather + * than to use the class name as a key and check if it's a key of $seen. + */ + if ( in_array( $name, $seen, true ) ) { + continue; + } + + $seen[] = $name; + yield $name; + } + } + + + /** + * Returns if a matched tag contains the given ASCII case-insensitive class name. + * + * @since 6.4.0 + * + * @param string $wanted_class Look for this CSS class name, ASCII case-insensitive. + * @return bool|null Whether the matched tag contains the given class name, or null if not matched. + */ + public function has_class( $wanted_class ) { + if ( ! $this->tag_name_starts_at ) { + return null; + } + + $wanted_class = strtolower( $wanted_class ); + + foreach ( $this->class_list() as $class_name ) { + if ( $class_name === $wanted_class ) { + return true; + } + } + + return false; + } + + + /** + * Sets a bookmark in the HTML document. + * + * Bookmarks represent specific places or tokens in the HTML + * document, such as a tag opener or closer. When applying + * edits to a document, such as setting an attribute, the + * text offsets of that token may shift; the bookmark is + * kept updated with those shifts and remains stable unless + * the entire span of text in which the token sits is removed. + * + * Release bookmarks when they are no longer needed. + * + * Example: + * + * <main><h2>Surprising fact you may not know!</h2></main> + * ^ ^ + * \-|-- this `H2` opener bookmark tracks the token + * + * <main class="clickbait"><h2>Surprising fact you may no… + * ^ ^ + * \-|-- it shifts with edits + * + * Bookmarks provide the ability to seek to a previously-scanned + * place in the HTML document. This avoids the need to re-scan + * the entire document. 
+ * + * Example: + * + * <ul><li>One</li><li>Two</li><li>Three</li></ul> + * ^^^^ + * want to note this last item + * + * $p = new WP_HTML_Tag_Processor( $html ); + * $in_list = false; + * while ( $p->next_tag( array( 'tag_closers' => $in_list ? 'visit' : 'skip' ) ) ) { + * if ( 'UL' === $p->get_tag() ) { + * if ( $p->is_tag_closer() ) { + * $in_list = false; + * $p->set_bookmark( 'resume' ); + * if ( $p->seek( 'last-li' ) ) { + * $p->add_class( 'last-li' ); + * } + * $p->seek( 'resume' ); + * $p->release_bookmark( 'last-li' ); + * $p->release_bookmark( 'resume' ); + * } else { + * $in_list = true; + * } + * } + * + * if ( 'LI' === $p->get_tag() ) { + * $p->set_bookmark( 'last-li' ); + * } + * } + * + * Bookmarks intentionally hide the internal string offsets + * to which they refer. They are maintained internally as + * updates are applied to the HTML document and therefore + * retain their "position" - the location to which they + * originally pointed. The inability to use bookmarks with + * functions like `substr` is therefore intentional to guard + * against accidentally breaking the HTML. + * + * Because bookmarks allocate memory and require processing + * for every applied update, they are limited and require + * a name. They should not be created with programmatically-made + * names, such as "li_{$index}" with some loop. As a general + * rule they should only be created with string-literal names + * like "start-of-section" or "last-paragraph". + * + * Bookmarks are a powerful tool to enable complicated behavior. + * Consider double-checking that you need this tool if you are + * reaching for it, as inappropriate use could lead to broken + * HTML structure or unwanted processing overhead. + * + * @since 6.2.0 + * + * @param string $name Identifies this particular bookmark. + * @return bool Whether the bookmark was successfully created. + */ + public function set_bookmark( $name ) { + if ( null === $this->tag_name_starts_at ) { + return false; + } + + if ( ! 
array_key_exists( $name, $this->bookmarks ) && count( $this->bookmarks ) >= static::MAX_BOOKMARKS ) { + _doing_it_wrong( + __METHOD__, + __( 'Too many bookmarks: cannot create any more.' ), + '6.2.0' + ); + return false; + } + + $this->bookmarks[ $name ] = new WP_HTML_Span( + $this->tag_name_starts_at - ( $this->is_closing_tag ? 2 : 1 ), + $this->tag_ends_at + ); + + return true; + } + + + /** + * Removes a bookmark that is no longer needed. + * + * Releasing a bookmark frees up the small + * performance overhead it requires. + * + * @param string $name Name of the bookmark to remove. + * @return bool Whether the bookmark already existed before removal. + */ + public function release_bookmark( $name ) { + if ( ! array_key_exists( $name, $this->bookmarks ) ) { + return false; + } + + unset( $this->bookmarks[ $name ] ); + + return true; + } + + /** + * Skips contents of generic rawtext elements. + * + * @since 6.3.2 + * + * @see https://html.spec.whatwg.org/#generic-raw-text-element-parsing-algorithm + * + * @param string $tag_name The uppercase tag name which will close the RAWTEXT region. + * @return bool Whether an end to the RAWTEXT region was found before the end of the document. + */ + private function skip_rawtext( $tag_name ) { + /* + * These two functions distinguish themselves on whether character references are + * decoded, and since functionality to read the inner markup isn't supported, it's + * not necessary to implement these two functions separately. + */ + return $this->skip_rcdata( $tag_name ); + } + + /** + * Skips contents of RCDATA elements, namely title and textarea tags. + * + * @since 6.2.0 + * + * @see https://html.spec.whatwg.org/multipage/parsing.html#rcdata-state + * + * @param string $tag_name The uppercase tag name which will close the RCDATA region. + * @return bool Whether an end to the RCDATA region was found before the end of the document. 
+ */ + private function skip_rcdata( $tag_name ) { + $html = $this->html; + $doc_length = strlen( $html ); + $tag_length = strlen( $tag_name ); + + $at = $this->bytes_already_parsed; + + while ( false !== $at && $at < $doc_length ) { + $at = strpos( $this->html, '</', $at ); + + // If there is no possible tag closer then fail. + if ( false === $at || ( $at + $tag_length ) >= $doc_length ) { + $this->bytes_already_parsed = $doc_length; + return false; + } + + $closer_potentially_starts_at = $at; + $at += 2; + + /* + * Find a case-insensitive match to the tag name. + * + * Because tag names are limited to US-ASCII there is no + * need to perform any kind of Unicode normalization when + * comparing; any character which could be impacted by such + * normalization could not be part of a tag name. + */ + for ( $i = 0; $i < $tag_length; $i++ ) { + $tag_char = $tag_name[ $i ]; + $html_char = $html[ $at + $i ]; + + if ( $html_char !== $tag_char && strtoupper( $html_char ) !== $tag_char ) { + $at += $i; + continue 2; + } + } + + $at += $tag_length; + $this->bytes_already_parsed = $at; + + /* + * Ensure that the tag name terminates to avoid matching on + * substrings of a longer tag name. For example, the sequence + * "</textarearug" should not match for "</textarea" even + * though "textarea" is found within the text. + */ + $c = $html[ $at ]; + if ( ' ' !== $c && "\t" !== $c && "\r" !== $c && "\n" !== $c && '/' !== $c && '>' !== $c ) { + continue; + } + + while ( $this->parse_next_attribute() ) { + continue; + } + $at = $this->bytes_already_parsed; + if ( $at >= strlen( $this->html ) ) { + return false; + } + + if ( '>' === $html[ $at ] || '/' === $html[ $at ] ) { + $this->bytes_already_parsed = $closer_potentially_starts_at; + return true; + } + } + + return false; + } + + /** + * Skips contents of script tags. + * + * @since 6.2.0 + * + * @return bool Whether the script tag was closed before the end of the document. 
+ */ + private function skip_script_data() { + $state = 'unescaped'; + $html = $this->html; + $doc_length = strlen( $html ); + $at = $this->bytes_already_parsed; + + while ( false !== $at && $at < $doc_length ) { + $at += strcspn( $html, '-<', $at ); + + /* + * For all script states a "-->" transitions + * back into the normal unescaped script mode, + * even if that's the current state. + */ + if ( + $at + 2 < $doc_length && + '-' === $html[ $at ] && + '-' === $html[ $at + 1 ] && + '>' === $html[ $at + 2 ] + ) { + $at += 3; + $state = 'unescaped'; + continue; + } + + // Everything of interest past here starts with "<". + if ( $at + 1 >= $doc_length || '<' !== $html[ $at++ ] ) { + continue; + } + + /* + * Unlike with "-->", the "<!--" only transitions + * into the escaped mode if not already there. + * + * Inside the escaped modes it will be ignored; and + * should never break out of the double-escaped + * mode and back into the escaped mode. + * + * While this requires a mode change, it does not + * impact the parsing otherwise, so continue + * parsing after updating the state. + */ + if ( + $at + 2 < $doc_length && + '!' === $html[ $at ] && + '-' === $html[ $at + 1 ] && + '-' === $html[ $at + 2 ] + ) { + $at += 3; + $state = 'unescaped' === $state ? 'escaped' : $state; + continue; + } + + if ( '/' === $html[ $at ] ) { + $closer_potentially_starts_at = $at - 1; + $is_closing = true; + ++$at; + } else { + $is_closing = false; + } + + /* + * At this point the only remaining state-changes occur with the + * <script> and </script> tags; unless one of these appears next, + * proceed scanning to the next potential token in the text. + */ + if ( ! 
( + $at + 6 < $doc_length && + ( 's' === $html[ $at ] || 'S' === $html[ $at ] ) && + ( 'c' === $html[ $at + 1 ] || 'C' === $html[ $at + 1 ] ) && + ( 'r' === $html[ $at + 2 ] || 'R' === $html[ $at + 2 ] ) && + ( 'i' === $html[ $at + 3 ] || 'I' === $html[ $at + 3 ] ) && + ( 'p' === $html[ $at + 4 ] || 'P' === $html[ $at + 4 ] ) && + ( 't' === $html[ $at + 5 ] || 'T' === $html[ $at + 5 ] ) + ) ) { + ++$at; + continue; + } + + /* + * Ensure that the script tag terminates to avoid matching on + * substrings of a non-match. For example, the sequence + * "<script123" should not end a script region even though + * "<script" is found within the text. + */ + if ( $at + 6 >= $doc_length ) { + continue; + } + $at += 6; + $c = $html[ $at ]; + if ( ' ' !== $c && "\t" !== $c && "\r" !== $c && "\n" !== $c && '/' !== $c && '>' !== $c ) { + ++$at; + continue; + } + + if ( 'escaped' === $state && ! $is_closing ) { + $state = 'double-escaped'; + continue; + } + + if ( 'double-escaped' === $state && $is_closing ) { + $state = 'escaped'; + continue; + } + + if ( $is_closing ) { + $this->bytes_already_parsed = $closer_potentially_starts_at; + if ( $this->bytes_already_parsed >= $doc_length ) { + return false; + } + + while ( $this->parse_next_attribute() ) { + continue; + } + + if ( '>' === $html[ $this->bytes_already_parsed ] ) { + $this->bytes_already_parsed = $closer_potentially_starts_at; + return true; + } + } + + ++$at; + } + + return false; + } + + /** + * Parses the next tag. + * + * This will find and start parsing the next tag, including + * the opening `<`, the potential closer `/`, and the tag + * name. It does not parse the attributes or scan to the + * closing `>`; these are left for other methods. + * + * @since 6.2.0 + * @since 6.2.1 Support abruptly-closed comments, invalid-tag-closer-comments, and empty elements. + * + * @return bool Whether a tag was found before the end of the document. 
+ */ + private function parse_next_tag() { + $this->after_tag(); + + $html = $this->html; + $doc_length = strlen( $html ); + $at = $this->bytes_already_parsed; + + while ( false !== $at && $at < $doc_length ) { + $at = strpos( $html, '<', $at ); + if ( false === $at ) { + return false; + } + + if ( '/' === $this->html[ $at + 1 ] ) { + $this->is_closing_tag = true; + ++$at; + } else { + $this->is_closing_tag = false; + } + + /* + * HTML tag names must start with [a-zA-Z] otherwise they are not tags. + * For example, "<3" is rendered as text, not a tag opener. If at least + * one letter follows the "<" then _it is_ a tag, but if the following + * character is anything else it _is not a tag_. + * + * It's not uncommon to find non-tags starting with `<` in an HTML + * document, so it's good for performance to make this pre-check before + * continuing to attempt to parse a tag name. + * + * Reference: + * * https://html.spec.whatwg.org/multipage/parsing.html#data-state + * * https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state + */ + $tag_name_prefix_length = strspn( $html, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ', $at + 1 ); + if ( $tag_name_prefix_length > 0 ) { + ++$at; + $this->tag_name_length = $tag_name_prefix_length + strcspn( $html, " \t\f\r\n/>", $at + $tag_name_prefix_length ); + $this->tag_name_starts_at = $at; + $this->bytes_already_parsed = $at + $this->tag_name_length; + return true; + } + + /* + * Abort if no tag is found before the end of + * the document. There is nothing left to parse. + */ + if ( $at + 1 >= strlen( $html ) ) { + return false; + } + + /* + * <! transitions to markup declaration open state + * https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state + */ + if ( '!' 
=== $html[ $at + 1 ] ) { + /* + * <!-- transitions to a bogus comment state – skip to the nearest --> + * https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state + */ + if ( + strlen( $html ) > $at + 3 && + '-' === $html[ $at + 2 ] && + '-' === $html[ $at + 3 ] + ) { + $closer_at = $at + 4; + // If it's not possible to close the comment then there is nothing more to scan. + if ( strlen( $html ) <= $closer_at ) { + return false; + } + + // Abruptly-closed empty comments are a sequence of dashes followed by `>`. + $span_of_dashes = strspn( $html, '-', $closer_at ); + if ( '>' === $html[ $closer_at + $span_of_dashes ] ) { + $at = $closer_at + $span_of_dashes + 1; + continue; + } + + /* + * Comments may be closed by either a --> or an invalid --!>. + * The first occurrence closes the comment. + * + * See https://html.spec.whatwg.org/#parse-error-incorrectly-closed-comment + */ + --$closer_at; // Pre-increment inside condition below reduces risk of accidental infinite looping. + while ( ++$closer_at < strlen( $html ) ) { + $closer_at = strpos( $html, '--', $closer_at ); + if ( false === $closer_at ) { + return false; + } + + if ( $closer_at + 2 < strlen( $html ) && '>' === $html[ $closer_at + 2 ] ) { + $at = $closer_at + 3; + continue 2; + } + + if ( $closer_at + 3 < strlen( $html ) && '!' === $html[ $closer_at + 2 ] && '>' === $html[ $closer_at + 3 ] ) { + $at = $closer_at + 4; + continue 2; + } + } + } + + /* + * <![CDATA[ transitions to CDATA section state – skip to the nearest ]]> + * The CDATA is case-sensitive. 
+ * https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state + */ + if ( + strlen( $html ) > $at + 8 && + '[' === $html[ $at + 2 ] && + 'C' === $html[ $at + 3 ] && + 'D' === $html[ $at + 4 ] && + 'A' === $html[ $at + 5 ] && + 'T' === $html[ $at + 6 ] && + 'A' === $html[ $at + 7 ] && + '[' === $html[ $at + 8 ] + ) { + $closer_at = strpos( $html, ']]>', $at + 9 ); + if ( false === $closer_at ) { + return false; + } + + $at = $closer_at + 3; + continue; + } + + /* + * <!DOCTYPE transitions to DOCTYPE state – skip to the nearest > + * These are ASCII-case-insensitive. + * https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state + */ + if ( + strlen( $html ) > $at + 8 && + ( 'D' === $html[ $at + 2 ] || 'd' === $html[ $at + 2 ] ) && + ( 'O' === $html[ $at + 3 ] || 'o' === $html[ $at + 3 ] ) && + ( 'C' === $html[ $at + 4 ] || 'c' === $html[ $at + 4 ] ) && + ( 'T' === $html[ $at + 5 ] || 't' === $html[ $at + 5 ] ) && + ( 'Y' === $html[ $at + 6 ] || 'y' === $html[ $at + 6 ] ) && + ( 'P' === $html[ $at + 7 ] || 'p' === $html[ $at + 7 ] ) && + ( 'E' === $html[ $at + 8 ] || 'e' === $html[ $at + 8 ] ) + ) { + $closer_at = strpos( $html, '>', $at + 9 ); + if ( false === $closer_at ) { + return false; + } + + $at = $closer_at + 1; + continue; + } + + /* + * Anything else here is an incorrectly-opened comment and transitions + * to the bogus comment state - skip to the nearest >. + */ + $at = strpos( $html, '>', $at + 1 ); + continue; + } + + /* + * </> is a missing end tag name, which is ignored. + * + * See https://html.spec.whatwg.org/#parse-error-missing-end-tag-name + */ + if ( '>' === $html[ $at + 1 ] ) { + ++$at; + continue; + } + + /* + * <? transitions to a bogus comment state – skip to the nearest > + * See https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state + */ + if ( '?' 
=== $html[ $at + 1 ] ) { + $closer_at = strpos( $html, '>', $at + 2 ); + if ( false === $closer_at ) { + return false; + } + + $at = $closer_at + 1; + continue; + } + + /* + * If a non-alpha starts the tag name in a tag closer it's a comment. + * Find the first `>`, which closes the comment. + * + * See https://html.spec.whatwg.org/#parse-error-invalid-first-character-of-tag-name + */ + if ( $this->is_closing_tag ) { + $closer_at = strpos( $html, '>', $at + 3 ); + if ( false === $closer_at ) { + return false; + } + + $at = $closer_at + 1; + continue; + } + + ++$at; + } + + return false; + } + + /** + * Parses the next attribute. + * + * @since 6.2.0 + * + * @return bool Whether an attribute was found before the end of the document. + */ + private function parse_next_attribute() { + // Skip whitespace and slashes. + $this->bytes_already_parsed += strspn( $this->html, " \t\f\r\n/", $this->bytes_already_parsed ); + if ( $this->bytes_already_parsed >= strlen( $this->html ) ) { + return false; + } + + /* + * Treat the equal sign as a part of the attribute + * name if it is the first encountered byte. + * + * @see https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-name-state + */ + $name_length = '=' === $this->html[ $this->bytes_already_parsed ] + ? 1 + strcspn( $this->html, "=/> \t\f\r\n", $this->bytes_already_parsed + 1 ) + : strcspn( $this->html, "=/> \t\f\r\n", $this->bytes_already_parsed ); + + // No attribute, just tag closer. 
+ if ( 0 === $name_length || $this->bytes_already_parsed + $name_length >= strlen( $this->html ) ) { + return false; + } + + $attribute_start = $this->bytes_already_parsed; + $attribute_name = substr( $this->html, $attribute_start, $name_length ); + $this->bytes_already_parsed += $name_length; + if ( $this->bytes_already_parsed >= strlen( $this->html ) ) { + return false; + } + + $this->skip_whitespace(); + if ( $this->bytes_already_parsed >= strlen( $this->html ) ) { + return false; + } + + $has_value = '=' === $this->html[ $this->bytes_already_parsed ]; + if ( $has_value ) { + ++$this->bytes_already_parsed; + $this->skip_whitespace(); + if ( $this->bytes_already_parsed >= strlen( $this->html ) ) { + return false; + } + + switch ( $this->html[ $this->bytes_already_parsed ] ) { + case "'": + case '"': + $quote = $this->html[ $this->bytes_already_parsed ]; + $value_start = $this->bytes_already_parsed + 1; + $value_length = strcspn( $this->html, $quote, $value_start ); + $attribute_end = $value_start + $value_length + 1; + $this->bytes_already_parsed = $attribute_end; + break; + + default: + $value_start = $this->bytes_already_parsed; + $value_length = strcspn( $this->html, "> \t\f\r\n", $value_start ); + $attribute_end = $value_start + $value_length; + $this->bytes_already_parsed = $attribute_end; + } + } else { + $value_start = $this->bytes_already_parsed; + $value_length = 0; + $attribute_end = $attribute_start + $name_length; + } + + if ( $attribute_end >= strlen( $this->html ) ) { + return false; + } + + if ( $this->is_closing_tag ) { + return true; + } + + /* + * > There must never be two or more attributes on + * > the same start tag whose names are an ASCII + * > case-insensitive match for each other. 
+ * - HTML 5 spec + * + * @see https://html.spec.whatwg.org/multipage/syntax.html#attributes-2:ascii-case-insensitive + */ + $comparable_name = strtolower( $attribute_name ); + + // If an attribute is listed many times, only use the first declaration and ignore the rest. + if ( ! array_key_exists( $comparable_name, $this->attributes ) ) { + $this->attributes[ $comparable_name ] = new WP_HTML_Attribute_Token( + $attribute_name, + $value_start, + $value_length, + $attribute_start, + $attribute_end, + ! $has_value + ); + + return true; + } + + /* + * Track the duplicate attributes so if we remove it, all disappear together. + * + * While `$this->duplicated_attributes` could always be stored as an `array()`, + * which would simplify the logic here, storing a `null` and only allocating + * an array when encountering duplicates avoids needless allocations in the + * normative case of parsing tags with no duplicate attributes. + */ + $duplicate_span = new WP_HTML_Span( $attribute_start, $attribute_end ); + if ( null === $this->duplicate_attributes ) { + $this->duplicate_attributes = array( $comparable_name => array( $duplicate_span ) ); + } elseif ( ! array_key_exists( $comparable_name, $this->duplicate_attributes ) ) { + $this->duplicate_attributes[ $comparable_name ] = array( $duplicate_span ); + } else { + $this->duplicate_attributes[ $comparable_name ][] = $duplicate_span; + } + + return true; + } + + /** + * Move the internal cursor past any immediate successive whitespace. + * + * @since 6.2.0 + */ + private function skip_whitespace() { + $this->bytes_already_parsed += strspn( $this->html, " \t\f\r\n", $this->bytes_already_parsed ); + } + + /** + * Applies attribute updates and cleans up once a tag is fully parsed. 
+ * + * @since 6.2.0 + */ + private function after_tag() { + $this->get_updated_html(); + $this->tag_name_starts_at = null; + $this->tag_name_length = null; + $this->tag_ends_at = null; + $this->is_closing_tag = null; + $this->attributes = array(); + $this->duplicate_attributes = null; + } + + /** + * Converts class name updates into tag attributes updates + * (they are accumulated in different data formats for performance). + * + * @since 6.2.0 + * + * @see WP_HTML_Tag_Processor::$lexical_updates + * @see WP_HTML_Tag_Processor::$classname_updates + */ + private function class_name_updates_to_attributes_updates() { + if ( count( $this->classname_updates ) === 0 ) { + return; + } + + $existing_class = $this->get_enqueued_attribute_value( 'class' ); + if ( null === $existing_class || true === $existing_class ) { + $existing_class = ''; + } + + if ( false === $existing_class && isset( $this->attributes['class'] ) ) { + $existing_class = substr( + $this->html, + $this->attributes['class']->value_starts_at, + $this->attributes['class']->value_length + ); + } + + if ( false === $existing_class ) { + $existing_class = ''; + } + + /** + * Updated "class" attribute value. + * + * This is incrementally built while scanning through the existing class + * attribute, skipping removed classes on the way, and then appending + * added classes at the end. Only when finished processing will the + * value contain the final new value. + + * @var string $class + */ + $class = ''; + + /** + * Tracks the cursor position in the existing + * class attribute value while parsing. + * + * @var int $at + */ + $at = 0; + + /** + * Indicates if there's any need to modify the existing class attribute. + * + * If a call to `add_class()` and `remove_class()` wouldn't impact + * the `class` attribute value then there's no need to rebuild it. + * For example, when adding a class that's already present or + * removing one that isn't. 
+ * + * This flag enables a performance optimization when none of the enqueued + * class updates would impact the `class` attribute; namely, that the + * processor can continue without modifying the input document, as if + * none of the `add_class()` or `remove_class()` calls had been made. + * + * This flag is set upon the first change that requires a string update. + * + * @var bool $modified + */ + $modified = false; + + // Remove unwanted classes by only copying the new ones. + $existing_class_length = strlen( $existing_class ); + while ( $at < $existing_class_length ) { + // Skip to the first non-whitespace character. + $ws_at = $at; + $ws_length = strspn( $existing_class, " \t\f\r\n", $ws_at ); + $at += $ws_length; + + // Capture the class name – it's everything until the next whitespace. + $name_length = strcspn( $existing_class, " \t\f\r\n", $at ); + if ( 0 === $name_length ) { + // If no more class names are found then that's the end. + break; + } + + $name = substr( $existing_class, $at, $name_length ); + $at += $name_length; + + // If this class is marked for removal, start processing the next one. + $remove_class = ( + isset( $this->classname_updates[ $name ] ) && + self::REMOVE_CLASS === $this->classname_updates[ $name ] + ); + + // If a class has already been seen then skip it; it should not be added twice. + if ( ! $remove_class ) { + $this->classname_updates[ $name ] = self::SKIP_CLASS; + } + + if ( $remove_class ) { + $modified = true; + continue; + } + + /* + * Otherwise, append it to the new "class" attribute value. + * + * There are options for handling whitespace between tags. + * Preserving the existing whitespace produces fewer changes + * to the HTML content and should clarify the before/after + * content when debugging the modified output. + * + * This approach contrasts normalizing the inter-class + * whitespace to a single space, which might appear cleaner + * in the output HTML but produce a noisier change. 
+ */ + $class .= substr( $existing_class, $ws_at, $ws_length ); + $class .= $name; + } + + // Add new classes by appending those which haven't already been seen. + foreach ( $this->classname_updates as $name => $operation ) { + if ( self::ADD_CLASS === $operation ) { + $modified = true; + + $class .= strlen( $class ) > 0 ? ' ' : ''; + $class .= $name; + } + } + + $this->classname_updates = array(); + if ( ! $modified ) { + return; + } + + if ( strlen( $class ) > 0 ) { + $this->set_attribute( 'class', $class ); + } else { + $this->remove_attribute( 'class' ); + } + } + + /** + * Applies attribute updates to HTML document. + * + * @since 6.2.0 + * @since 6.2.1 Accumulates shift for internal cursor and passed pointer. + * @since 6.3.0 Invalidate any bookmarks whose targets are overwritten. + * + * @param int $shift_this_point Accumulate and return shift for this position. + * @return int How many bytes the given pointer moved in response to the updates. + */ + private function apply_attributes_updates( $shift_this_point = 0 ) { + if ( ! count( $this->lexical_updates ) ) { + return 0; + } + + $accumulated_shift_for_given_point = 0; + + /* + * Attribute updates can be enqueued in any order but updates + * to the document must occur in lexical order; that is, each + * replacement must be made before all others which follow it + * at later string indices in the input document. + * + * Sorting avoid making out-of-order replacements which + * can lead to mangled output, partially-duplicated + * attributes, and overwritten attributes. + */ + usort( $this->lexical_updates, array( self::class, 'sort_start_ascending' ) ); + + $bytes_already_copied = 0; + $output_buffer = ''; + foreach ( $this->lexical_updates as $diff ) { + $shift = strlen( $diff->text ) - ( $diff->end - $diff->start ); + + // Adjust the cursor position by however much an update affects it. 
+ if ( $diff->start <= $this->bytes_already_parsed ) { + $this->bytes_already_parsed += $shift; + } + + // Accumulate shift of the given pointer within this function call. + if ( $diff->start <= $shift_this_point ) { + $accumulated_shift_for_given_point += $shift; + } + + $output_buffer .= substr( $this->html, $bytes_already_copied, $diff->start - $bytes_already_copied ); + $output_buffer .= $diff->text; + $bytes_already_copied = $diff->end; + } + + $this->html = $output_buffer . substr( $this->html, $bytes_already_copied ); + + /* + * Adjust bookmark locations to account for how the text + * replacements adjust offsets in the input document. + */ + foreach ( $this->bookmarks as $bookmark_name => $bookmark ) { + /* + * Each lexical update which appears before the bookmark's endpoints + * might shift the offsets for those endpoints. Loop through each change + * and accumulate the total shift for each bookmark, then apply that + * shift after tallying the full delta. + */ + $head_delta = 0; + $tail_delta = 0; + + foreach ( $this->lexical_updates as $diff ) { + if ( $bookmark->start < $diff->start && $bookmark->end < $diff->start ) { + break; + } + + if ( $bookmark->start >= $diff->start && $bookmark->end < $diff->end ) { + $this->release_bookmark( $bookmark_name ); + continue 2; + } + + $delta = strlen( $diff->text ) - ( $diff->end - $diff->start ); + + if ( $bookmark->start >= $diff->start ) { + $head_delta += $delta; + } + + if ( $bookmark->end >= $diff->end ) { + $tail_delta += $delta; + } + } + + $bookmark->start += $head_delta; + $bookmark->end += $tail_delta; + } + + $this->lexical_updates = array(); + + return $accumulated_shift_for_given_point; + } + + /** + * Checks whether a bookmark with the given name exists. + * + * @since 6.3.0 + * + * @param string $bookmark_name Name to identify a bookmark that potentially exists. + * @return bool Whether that bookmark exists. 
+ */ + public function has_bookmark( $bookmark_name ) { + return array_key_exists( $bookmark_name, $this->bookmarks ); + } + + /** + * Move the internal cursor in the Tag Processor to a given bookmark's location. + * + * In order to prevent accidental infinite loops, there's a + * maximum limit on the number of times seek() can be called. + * + * @since 6.2.0 + * + * @param string $bookmark_name Jump to the place in the document identified by this bookmark name. + * @return bool Whether the internal cursor was successfully moved to the bookmark's location. + */ + public function seek( $bookmark_name ) { + if ( ! array_key_exists( $bookmark_name, $this->bookmarks ) ) { + _doing_it_wrong( + __METHOD__, + __( 'Unknown bookmark name.' ), + '6.2.0' + ); + return false; + } + + if ( ++$this->seek_count > static::MAX_SEEK_OPS ) { + _doing_it_wrong( + __METHOD__, + __( 'Too many calls to seek() - this can lead to performance issues.' ), + '6.2.0' + ); + return false; + } + + // Flush out any pending updates to the document. + $this->get_updated_html(); + + // Point this tag processor before the sought tag opener and consume it. + $this->bytes_already_parsed = $this->bookmarks[ $bookmark_name ]->start; + return $this->next_tag( array( 'tag_closers' => 'visit' ) ); + } + + /** + * Compare two WP_HTML_Text_Replacement objects. + * + * @since 6.2.0 + * + * @param WP_HTML_Text_Replacement $a First attribute update. + * @param WP_HTML_Text_Replacement $b Second attribute update. + * @return int Comparison value for string order. + */ + private static function sort_start_ascending( $a, $b ) { + $by_start = $a->start - $b->start; + if ( 0 !== $by_start ) { + return $by_start; + } + + $by_text = isset( $a->text, $b->text ) ? strcmp( $a->text, $b->text ) : 0; + if ( 0 !== $by_text ) { + return $by_text; + } + + /* + * This code should be unreachable, because it implies the two replacements + * start at the same location and contain the same text. 
+ */ + return $a->end - $b->end; + } + + /** + * Return the enqueued value for a given attribute, if one exists. + * + * Enqueued updates can take different data types: + * - If an update is enqueued and is boolean, the return will be `true` + * - If an update is otherwise enqueued, the return will be the string value of that update. + * - If an attribute is enqueued to be removed, the return will be `null` to indicate that. + * - If no updates are enqueued, the return will be `false` to differentiate from "removed." + * + * @since 6.2.0 + * + * @param string $comparable_name The attribute name in its comparable form. + * @return string|boolean|null Value of enqueued update if present, otherwise false. + */ + private function get_enqueued_attribute_value( $comparable_name ) { + if ( ! isset( $this->lexical_updates[ $comparable_name ] ) ) { + return false; + } + + $enqueued_text = $this->lexical_updates[ $comparable_name ]->text; + + // Removed attributes erase the entire span. + if ( '' === $enqueued_text ) { + return null; + } + + /* + * Boolean attribute updates are just the attribute name without a corresponding value. + * + * This value might differ from the given comparable name in that there could be leading + * or trailing whitespace, and that the casing follows the name given in `set_attribute`. + * + * Example: + * + * $p->set_attribute( 'data-TEST-id', 'update' ); + * 'update' === $p->get_enqueued_attribute_value( 'data-test-id' ); + * + * Detect this difference based on the absence of the `=`, which _must_ exist in any + * attribute containing a value, e.g. `<input type="text" enabled />`. + * ¹ ² + * 1. Attribute with a string value. + * 2. Boolean attribute whose value is `true`. + */ + $equals_at = strpos( $enqueued_text, '=' ); + if ( false === $equals_at ) { + return true; + } + + /* + * Finally, a normal update's value will appear after the `=` and + * be double-quoted, as performed incidentally by `set_attribute`. + * + * e.g. 
`type="text"` + * ¹² ³ + * 1. Equals is here. + * 2. Double-quoting starts one after the equals sign. + * 3. Double-quoting ends at the last character in the update. + */ + $enqueued_value = substr( $enqueued_text, $equals_at + 2, -1 ); + return html_entity_decode( $enqueued_value ); + } + + /** + * Returns the value of a requested attribute from a matched tag opener if that attribute exists. + * + * Example: + * + * $p = new WP_HTML_Tag_Processor( '<div enabled class="test" data-test-id="14">Test</div>' ); + * $p->next_tag( array( 'class_name' => 'test' ) ) === true; + * $p->get_attribute( 'data-test-id' ) === '14'; + * $p->get_attribute( 'enabled' ) === true; + * $p->get_attribute( 'aria-label' ) === null; + * + * $p->next_tag() === false; + * $p->get_attribute( 'class' ) === null; + * + * @since 6.2.0 + * + * @param string $name Name of attribute whose value is requested. + * @return string|true|null Value of attribute or `null` if not available. Boolean attributes return `true`. + */ + public function get_attribute( $name ) { + if ( null === $this->tag_name_starts_at ) { + return null; + } + + $comparable = strtolower( $name ); + + /* + * For every attribute other than `class` it's possible to perform a quick check if + * there's an enqueued lexical update whose value takes priority over what's found in + * the input document. + * + * The `class` attribute is special though because of the exposed helpers `add_class` + * and `remove_class`. These form a builder for the `class` attribute, so an additional + * check for enqueued class changes is required in addition to the check for any enqueued + * attribute values. If any exist, those enqueued class changes must first be flushed out + * into an attribute value update. + */ + if ( 'class' === $name ) { + $this->class_name_updates_to_attributes_updates(); + } + + // Return any enqueued attribute value updates if they exist. 
+ $enqueued_value = $this->get_enqueued_attribute_value( $comparable ); + if ( false !== $enqueued_value ) { + return $enqueued_value; + } + + if ( ! isset( $this->attributes[ $comparable ] ) ) { + return null; + } + + $attribute = $this->attributes[ $comparable ]; + + /* + * This flag distinguishes an attribute with no value + * from an attribute with an empty string value. For + * unquoted attributes this could look very similar. + * It refers to whether an `=` follows the name. + * + * e.g. <div boolean-attribute empty-attribute=></div> + * ¹ ² + * 1. Attribute `boolean-attribute` is `true`. + * 2. Attribute `empty-attribute` is `""`. + */ + if ( true === $attribute->is_true ) { + return true; + } + + $raw_value = substr( $this->html, $attribute->value_starts_at, $attribute->value_length ); + + return html_entity_decode( $raw_value ); + } + + /** + * Gets lowercase names of all attributes matching a given prefix in the current tag. + * + * Note that matching is case-insensitive. This is in accordance with the spec: + * + * > There must never be two or more attributes on + * > the same start tag whose names are an ASCII + * > case-insensitive match for each other. + * - HTML 5 spec + * + * Example: + * + * $p = new WP_HTML_Tag_Processor( '<div data-ENABLED class="test" DATA-test-id="14">Test</div>' ); + * $p->next_tag( array( 'class_name' => 'test' ) ) === true; + * $p->get_attribute_names_with_prefix( 'data-' ) === array( 'data-enabled', 'data-test-id' ); + * + * $p->next_tag() === false; + * $p->get_attribute_names_with_prefix( 'data-' ) === null; + * + * @since 6.2.0 + * + * @see https://html.spec.whatwg.org/multipage/syntax.html#attributes-2:ascii-case-insensitive + * + * @param string $prefix Prefix of requested attribute names. + * @return array|null List of attribute names, or `null` when no tag opener is matched. 
*/
	public function get_attribute_names_with_prefix( $prefix ) {
		if ( $this->is_closing_tag || null === $this->tag_name_starts_at ) {
			return null;
		}

		// Keys of `$this->attributes` are compared against the lower-cased prefix.
		$comparable = strtolower( $prefix );

		$matches = array();
		foreach ( array_keys( $this->attributes ) as $attr_name ) {
			if ( str_starts_with( $attr_name, $comparable ) ) {
				$matches[] = $attr_name;
			}
		}
		return $matches;
	}

	/**
	 * Returns the uppercase name of the matched tag.
	 *
	 * Example:
	 *
	 *     $p = new WP_HTML_Tag_Processor( '<div class="test">Test</div>' );
	 *     $p->next_tag() === true;
	 *     $p->get_tag() === 'DIV';
	 *
	 *     $p->next_tag() === false;
	 *     $p->get_tag() === null;
	 *
	 * @since 6.2.0
	 *
	 * @return string|null Name of currently matched tag in input HTML, or `null` if none found.
	 */
	public function get_tag() {
		if ( null === $this->tag_name_starts_at ) {
			return null;
		}

		$tag_name = substr( $this->html, $this->tag_name_starts_at, $this->tag_name_length );

		return strtoupper( $tag_name );
	}

	/**
	 * Indicates if the currently matched tag contains the self-closing flag.
	 *
	 * No HTML elements ought to have the self-closing flag and for those, the self-closing
	 * flag will be ignored. For void elements this is benign because they "self close"
	 * automatically. For non-void HTML elements though problems will appear if someone
	 * intends to use a self-closing element in place of that element with an empty body.
	 * For HTML foreign elements and custom elements the self-closing flag determines if
	 * they self-close or not.
	 *
	 * This function does not determine if a tag is self-closing,
	 * but only if the self-closing flag is present in the syntax.
	 *
	 * @since 6.3.0
	 *
	 * @return bool Whether the currently matched tag contains the self-closing flag.
	 */
	public function has_self_closing_flag() {
		// Use the same explicit "no tag matched" check as the sibling accessors.
		if ( null === $this->tag_name_starts_at ) {
			return false;
		}

		// The flag is the `/` immediately preceding the byte at `tag_ends_at`.
		return '/' === $this->html[ $this->tag_ends_at - 1 ];
	}

	/**
	 * Indicates if the current tag token is a tag closer.
	 *
	 * Example:
	 *
	 *     $p = new WP_HTML_Tag_Processor( '<div></div>' );
	 *     $p->next_tag( array( 'tag_name' => 'div', 'tag_closers' => 'visit' ) );
	 *     $p->is_tag_closer() === false;
	 *
	 *     $p->next_tag( array( 'tag_name' => 'div', 'tag_closers' => 'visit' ) );
	 *     $p->is_tag_closer() === true;
	 *
	 * @since 6.2.0
	 *
	 * @return bool Whether the current tag is a tag closer.
	 */
	public function is_tag_closer() {
		return $this->is_closing_tag;
	}

	/**
	 * Updates or creates a new attribute on the currently matched tag with the passed value.
	 *
	 * For boolean attributes special handling is provided:
	 *  - When `true` is passed as the value, then only the attribute name is added to the tag.
	 *  - When `false` is passed, the attribute gets removed if it existed before.
	 *
	 * For string attributes, the value is escaped using the `esc_attr` function.
	 *
	 * @since 6.2.0
	 * @since 6.2.1 Fix: Only create a single update for multiple calls with case-variant attribute names.
	 *
	 * @param string      $name  The attribute name to target.
	 * @param string|bool $value The new attribute value.
	 * @return bool Whether an attribute value was set.
	 */
	public function set_attribute( $name, $value ) {
		if ( $this->is_closing_tag || null === $this->tag_name_starts_at ) {
			return false;
		}

		/*
		 * WordPress rejects more characters than are strictly forbidden
		 * in HTML5. This is to prevent additional security risks deeper
		 * in the WordPress and plugin stack. Specifically the
		 * less-than (<) greater-than (>) and ampersand (&) aren't allowed.
		 *
		 * The use of a PCRE match enables looking for specific Unicode
		 * code points without writing a UTF-8 decoder. Whereas scanning
		 * for one-byte characters is trivial (with `strcspn`), scanning
		 * for the longer byte sequences would be more complicated. Given
		 * that this shouldn't be in the hot path for execution, it's a
		 * reasonable compromise in efficiency without introducing a
		 * noticeable impact on the overall system.
		 *
		 * @see https://html.spec.whatwg.org/#attributes-2
		 *
		 * @TODO as the only regex pattern maybe we should take it out? are
		 *       Unicode patterns available broadly in Core?
		 */
		if ( preg_match(
			'~[' .
				// Syntax-like characters.
				'"\'>&</ =' .
				// Control characters.
				'\x{00}-\x{1F}' .
				// HTML noncharacters.
				'\x{FDD0}-\x{FDEF}' .
				'\x{FFFE}\x{FFFF}\x{1FFFE}\x{1FFFF}\x{2FFFE}\x{2FFFF}\x{3FFFE}\x{3FFFF}' .
				'\x{4FFFE}\x{4FFFF}\x{5FFFE}\x{5FFFF}\x{6FFFE}\x{6FFFF}\x{7FFFE}\x{7FFFF}' .
				'\x{8FFFE}\x{8FFFF}\x{9FFFE}\x{9FFFF}\x{AFFFE}\x{AFFFF}\x{BFFFE}\x{BFFFF}' .
				'\x{CFFFE}\x{CFFFF}\x{DFFFE}\x{DFFFF}\x{EFFFE}\x{EFFFF}\x{FFFFE}\x{FFFFF}' .
				'\x{10FFFE}\x{10FFFF}' .
			']~Ssu',
			$name
		) ) {
			_doing_it_wrong(
				__METHOD__,
				__( 'Invalid attribute name.' ),
				'6.2.0'
			);

			return false;
		}

		/*
		 * > The values "true" and "false" are not allowed on boolean attributes.
		 * > To represent a false value, the attribute has to be omitted altogether.
		 *     - HTML5 spec, https://html.spec.whatwg.org/#boolean-attributes
		 */
		if ( false === $value ) {
			return $this->remove_attribute( $name );
		}

		if ( true === $value ) {
			$updated_attribute = $name;
		} else {
			$escaped_new_value = esc_attr( $value );
			$updated_attribute = "{$name}=\"{$escaped_new_value}\"";
		}

		/*
		 * > There must never be two or more attributes on
		 * > the same start tag whose names are an ASCII
		 * > case-insensitive match for each other.
		 *     - HTML 5 spec
		 *
		 * @see https://html.spec.whatwg.org/multipage/syntax.html#attributes-2:ascii-case-insensitive
		 */
		$comparable_name = strtolower( $name );

		if ( isset( $this->attributes[ $comparable_name ] ) ) {
			/*
			 * Update an existing attribute.
			 *
			 * Example – set attribute id to "new" in <div id="initial_id" />:
			 *
			 *     <div id="initial_id"/>
			 *          ^-------------^
			 *          start         end
			 *     replacement: `id="new"`
			 *
			 *     Result: <div id="new"/>
			 */
			$existing_attribute                        = $this->attributes[ $comparable_name ];
			$this->lexical_updates[ $comparable_name ] = new WP_HTML_Text_Replacement(
				$existing_attribute->start,
				$existing_attribute->end,
				$updated_attribute
			);
		} else {
			/*
			 * Create a new attribute at the tag's name end.
			 *
			 * Example – add attribute id="new" to <div />:
			 *
			 *     <div/>
			 *         ^
			 *         start and end
			 *     replacement: ` id="new"`
			 *
			 *     Result: <div id="new"/>
			 */
			$this->lexical_updates[ $comparable_name ] = new WP_HTML_Text_Replacement(
				$this->tag_name_starts_at + $this->tag_name_length,
				$this->tag_name_starts_at + $this->tag_name_length,
				' ' . $updated_attribute
			);
		}

		/*
		 * Any calls to update the `class` attribute directly should wipe out any
		 * enqueued class changes from `add_class` and `remove_class`.
		 */
		if ( 'class' === $comparable_name && ! empty( $this->classname_updates ) ) {
			$this->classname_updates = array();
		}

		return true;
	}

	/**
	 * Remove an attribute from the currently-matched tag.
	 *
	 * The attribute name is matched case-insensitively. Nothing is changed and
	 * `false` is returned when no tag is matched or the matched tag is a closer.
	 *
	 * @since 6.2.0
	 *
	 * @param string $name The attribute name to remove.
	 * @return bool Whether an attribute was removed.
	 */
	public function remove_attribute( $name ) {
		// Same precondition as `set_attribute()`: only a matched tag opener can be edited.
		if ( $this->is_closing_tag || null === $this->tag_name_starts_at ) {
			return false;
		}

		/*
		 * > There must never be two or more attributes on
		 * > the same start tag whose names are an ASCII
		 * > case-insensitive match for each other.
		 *     - HTML 5 spec
		 *
		 * @see https://html.spec.whatwg.org/multipage/syntax.html#attributes-2:ascii-case-insensitive
		 */
		$name = strtolower( $name );

		/*
		 * Any calls to update the `class` attribute directly should wipe out any
		 * enqueued class changes from `add_class` and `remove_class`.
		 */
		if ( 'class' === $name && count( $this->classname_updates ) !== 0 ) {
			$this->classname_updates = array();
		}

		/*
		 * If updating an attribute that didn't exist in the input
		 * document, then remove the enqueued update and move on.
		 *
		 * For example, this might occur when calling `remove_attribute()`
		 * after calling `set_attribute()` for the same attribute
		 * and when that attribute wasn't originally present.
		 */
		if ( ! isset( $this->attributes[ $name ] ) ) {
			if ( isset( $this->lexical_updates[ $name ] ) ) {
				unset( $this->lexical_updates[ $name ] );
			}
			return false;
		}

		/*
		 * Removes an existing tag attribute.
		 *
		 * Example – remove the attribute id from <div id="initial_id"/>:
		 *     <div id="initial_id"/>
		 *          ^-------------^
		 *          start         end
		 *     replacement: ``
		 *
		 *     Result: <div />
		 */
		$this->lexical_updates[ $name ] = new WP_HTML_Text_Replacement(
			$this->attributes[ $name ]->start,
			$this->attributes[ $name ]->end,
			''
		);

		// Removes any duplicated attributes if they were also present.
		if ( null !== $this->duplicate_attributes && array_key_exists( $name, $this->duplicate_attributes ) ) {
			foreach ( $this->duplicate_attributes[ $name ] as $attribute_token ) {
				$this->lexical_updates[] = new WP_HTML_Text_Replacement(
					$attribute_token->start,
					$attribute_token->end,
					''
				);
			}
		}

		return true;
	}

	/**
	 * Adds a new class name to the currently matched tag.
	 *
	 * @since 6.2.0
	 *
	 * @param string $class_name The class name to add.
	 * @return bool Whether the class was set to be added.
*/
	public function add_class( $class_name ) {
		/*
		 * Nothing can be enqueued on a tag closer or when no tag is matched,
		 * so report `false` instead of claiming the class was set.
		 */
		if ( $this->is_closing_tag || null === $this->tag_name_starts_at ) {
			return false;
		}

		$this->classname_updates[ $class_name ] = self::ADD_CLASS;

		return true;
	}

	/**
	 * Removes a class name from the currently matched tag.
	 *
	 * The removal is only enqueued when a tag opener is currently matched.
	 *
	 * @since 6.2.0
	 *
	 * @param string $class_name The class name to remove.
	 * @return bool Whether the class was set to be removed. `false` when no tag
	 *              is matched or the matched tag is a closer.
	 */
	public function remove_class( $class_name ) {
		/*
		 * Nothing can be enqueued on a tag closer or when no tag is matched,
		 * so report `false` instead of claiming the class was removed.
		 */
		if ( $this->is_closing_tag || null === $this->tag_name_starts_at ) {
			return false;
		}

		$this->classname_updates[ $class_name ] = self::REMOVE_CLASS;

		return true;
	}

	/**
	 * Returns the string representation of the HTML Tag Processor.
	 *
	 * @since 6.2.0
	 *
	 * @see WP_HTML_Tag_Processor::get_updated_html()
	 *
	 * @return string The processed HTML.
	 */
	public function __toString() {
		return $this->get_updated_html();
	}

	/**
	 * Returns the string representation of the HTML Tag Processor.
	 *
	 * When class or attribute updates are enqueued they are applied to the
	 * document first, after which the current tag and its attributes are
	 * reparsed so that further edits apply at the same location. When nothing
	 * is enqueued the document is returned as-is.
	 *
	 * @since 6.2.0
	 * @since 6.2.1 Shifts the internal cursor corresponding to the applied updates.
	 * @since 6.4.0 No longer calls subclass method `next_tag()` after updating HTML.
	 *
	 * @return string The processed HTML.
	 */
	public function get_updated_html() {
		$requires_no_updating = 0 === count( $this->classname_updates ) && 0 === count( $this->lexical_updates );

		/*
		 * When there is nothing more to update and nothing has already been
		 * updated, return the original document and avoid a string copy.
		 */
		if ( $requires_no_updating ) {
			return $this->html;
		}

		/*
		 * Keep track of the position right before the current tag. This will
		 * be necessary for reparsing the current tag after updating the HTML.
		 */
		$before_current_tag = $this->tag_name_starts_at - 1;

		/*
		 * 1. Apply the enqueued edits and update all the pointers to reflect those changes.
		 */
		$this->class_name_updates_to_attributes_updates();
		$before_current_tag += $this->apply_attributes_updates( $before_current_tag );

		/*
		 * 2. Rewind to before the current tag and reparse to get updated attributes.
		 *
		 * At this point the internal cursor points to the end of the tag name.
		 * Rewind before the tag name starts so that it's as if the cursor didn't
		 * move; a call to `next_tag()` will reparse the recently-updated attributes
		 * and additional calls to modify the attributes will apply at this same
		 * location, but in order to avoid issues with subclasses that might add
		 * behaviors to `next_tag()`, the internal methods should be called here
		 * instead.
		 *
		 * It's important to note that in this specific place there will be no change
		 * because the processor was already at a tag when this was called and it's
		 * rewinding only to the beginning of this very tag before reprocessing it
		 * and its attributes.
		 *
		 *     <p>Previous HTML<em>More HTML</em></p>
		 *                     ↑  │ back up by the length of the tag name plus the opening <
		 *                     └←─┘ back up by strlen("em") + 1 ==> 3
		 */
		$this->bytes_already_parsed = $before_current_tag;
		$this->parse_next_tag();
		// Reparse the attributes.
		while ( $this->parse_next_attribute() ) {
			continue;
		}

		// Restore the end-of-tag pointers by locating the next `>` after the reparsed attributes.
		$tag_ends_at                = strpos( $this->html, '>', $this->bytes_already_parsed );
		$this->tag_ends_at          = $tag_ends_at;
		$this->bytes_already_parsed = $tag_ends_at;

		return $this->html;
	}

	/**
	 * Parses tag query input into internal search criteria.
	 *
	 * @since 6.2.0
	 *
	 * @param array|string|null $query {
	 *     Optional. Which tag name to find, having which class, etc. Default is to find any tag.
	 *
	 *     @type string|null $tag_name     Which tag to find, or `null` for "any tag."
	 *     @type int|null    $match_offset Find the Nth tag matching all search criteria.
	 *                                     1 for "first" tag, 3 for "third," etc.
	 *                                     Defaults to first tag.
	 *     @type string|null $class_name   Tag must contain this class name to match.
*     @type string      $tag_closers  "visit" or "skip": whether to stop on tag closers, e.g. </div>.
	 * }
	 */
	private function parse_query( $query ) {
		// An identical, non-null query was already parsed; keep the existing criteria.
		if ( null !== $query && $query === $this->last_query ) {
			return;
		}

		// Reset every criterion to its default before applying the new query.
		$this->last_query          = $query;
		$this->sought_tag_name     = null;
		$this->sought_class_name   = null;
		$this->sought_match_offset = 1;
		$this->stop_on_tag_closers = false;

		// A single string value means "find the tag of this name".
		if ( is_string( $query ) ) {
			$this->sought_tag_name = $query;
			return;
		}

		// An empty query parameter applies no restrictions on the search.
		if ( null === $query ) {
			return;
		}

		// If not using the string interface, an associative array is required.
		if ( ! is_array( $query ) ) {
			_doing_it_wrong(
				__METHOD__,
				__( 'The query argument must be an array or a tag name.' ),
				'6.2.0'
			);
			return;
		}

		if ( isset( $query['tag_name'] ) && is_string( $query['tag_name'] ) ) {
			$this->sought_tag_name = $query['tag_name'];
		}

		if ( isset( $query['class_name'] ) && is_string( $query['class_name'] ) ) {
			$this->sought_class_name = $query['class_name'];
		}

		if ( isset( $query['match_offset'] ) && is_int( $query['match_offset'] ) && 0 < $query['match_offset'] ) {
			$this->sought_match_offset = $query['match_offset'];
		}

		if ( isset( $query['tag_closers'] ) ) {
			$this->stop_on_tag_closers = 'visit' === $query['tag_closers'];
		}
	}


	/**
	 * Checks whether a given tag and its attributes match the search criteria.
	 *
	 * The tag name comparison is case-insensitive regardless of the letter case
	 * used in the query or in the document.
	 *
	 * @since 6.2.0
	 *
	 * @return bool Whether the given tag and its attribute match the search criteria.
	 */
	private function matches() {
		if ( $this->is_closing_tag && ! $this->stop_on_tag_closers ) {
			return false;
		}

		// Does the tag name match the requested tag name in a case-insensitive manner?
		if ( null !== $this->sought_tag_name ) {
			/*
			 * String (byte) length lookup is fast. If they aren't the
			 * same length then they can't be the same string values.
			 */
			if ( strlen( $this->sought_tag_name ) !== $this->tag_name_length ) {
				return false;
			}

			/*
			 * Check each character to determine if they are the same.
			 * Defer calls to `strtoupper()` to avoid them when possible.
			 * Calling `strcasecmp()` here tested slower than comparing each
			 * character, so unless benchmarks show otherwise, it should
			 * not be used.
			 *
			 * It's expected that most of the time that this runs, a
			 * lower-case tag name will be supplied and the input will
			 * contain lower-case tag names, thus normally bypassing
			 * the case comparison code.
			 *
			 * When the bytes differ, both sides are upper-cased before being
			 * compared. Upper-casing only the document byte would make a
			 * lower-case query such as `div` fail to match `<DIV>`.
			 */
			for ( $i = 0; $i < $this->tag_name_length; $i++ ) {
				$html_char = $this->html[ $this->tag_name_starts_at + $i ];
				$tag_char  = $this->sought_tag_name[ $i ];

				if ( $html_char !== $tag_char && strtoupper( $html_char ) !== strtoupper( $tag_char ) ) {
					return false;
				}
			}
		}

		if ( null !== $this->sought_class_name && ! $this->has_class( $this->sought_class_name ) ) {
			return false;
		}

		return true;
	}
}
diff --git a/wp-includes/html-api/class-wp-html-text-replacement.php b/wp-includes/html-api/class-wp-html-text-replacement.php
new file mode 100644
index 0000000..26b7bb2
--- /dev/null
+++ b/wp-includes/html-api/class-wp-html-text-replacement.php
@@ -0,0 +1,60 @@
<?php
/**
 * HTML API: WP_HTML_Text_Replacement class
 *
 * @package WordPress
 * @subpackage HTML-API
 * @since 6.2.0
 */

/**
 * Core class used by the HTML tag processor as a data structure for replacing
 * existing content from start to end, allowing to drastically improve performance.
 *
 * This class is for internal usage of the WP_HTML_Tag_Processor class.
 *
 * @access private
 * @since 6.2.0
 *
 * @see WP_HTML_Tag_Processor
 */
class WP_HTML_Text_Replacement {
	/**
	 * Byte offset into document where replacement span begins.
	 *
	 * @since 6.2.0
	 * @var int
	 */
	public $start;

	/**
	 * Byte offset into document where replacement span ends.
	 *
	 * @since 6.2.0
	 * @var int
	 */
	public $end;

	/**
	 * Span of text to insert in document to replace existing content from start to end.
	 *
	 * @since 6.2.0
	 * @var string
	 */
	public $text;

	/**
	 * Constructor.
	 *
	 * @since 6.2.0
	 *
	 * @param int    $start Byte offset into document where replacement span begins.
	 * @param int    $end   Byte offset into document where replacement span ends.
	 * @param string $text  Span of text to insert in document to replace existing content from start to end.
	 */
	public function __construct( $start, $end, $text ) {
		$this->start = $start;
		$this->end   = $end;
		$this->text  = $text;
	}
}
diff --git a/wp-includes/html-api/class-wp-html-token.php b/wp-includes/html-api/class-wp-html-token.php
new file mode 100644
index 0000000..86dd765
--- /dev/null
+++ b/wp-includes/html-api/class-wp-html-token.php
@@ -0,0 +1,106 @@
<?php
/**
 * HTML API: WP_HTML_Token class
 *
 * @package WordPress
 * @subpackage HTML-API
 * @since 6.4.0
 */

/**
 * Core class used by the HTML processor during HTML parsing
 * for referring to tokens in the input HTML string.
 *
 * This class is designed for internal use by the HTML processor.
 *
 * @since 6.4.0
 *
 * @access private
 *
 * @see WP_HTML_Processor
 */
class WP_HTML_Token {
	/**
	 * Name of bookmark corresponding to source of token in input HTML string.
	 *
	 * Having a bookmark name does not imply that the token still exists. It
	 * may be that the source token and underlying bookmark was wiped out by
	 * some modification to the source HTML.
	 *
	 * @since 6.4.0
	 *
	 * @var string
	 */
	public $bookmark_name = null;

	/**
	 * Name of node; lowercase names such as "marker" are not HTML elements.
	 *
	 * For HTML elements/tags this value should come from WP_HTML_Processor::get_tag().
	 *
	 * @since 6.4.0
	 *
	 * @see WP_HTML_Processor::get_tag()
	 *
	 * @var string
	 */
	public $node_name = null;

	/**
	 * Whether node contains the self-closing flag.
	 *
	 * A node may have a self-closing flag when it shouldn't. This value
	 * only reports if the flag is present in the original HTML.
	 *
	 * @since 6.4.0
	 *
	 * @see https://html.spec.whatwg.org/#self-closing-flag
	 *
	 * @var bool
	 */
	public $has_self_closing_flag = false;

	/**
	 * Called when token is garbage-collected or otherwise destroyed.
	 *
	 * The callback receives the token's bookmark name as its only argument.
	 *
	 * @since 6.4.0
	 *
	 * @var callable|null
	 */
	public $on_destroy = null;

	/**
	 * Constructor - creates a reference to a token in some external HTML string.
	 *
	 * @since 6.4.0
	 *
	 * @param string        $bookmark_name         Name of bookmark corresponding to location in HTML where token is found.
	 * @param string        $node_name             Name of node token represents; if uppercase, an HTML element; if lowercase, a special value like "marker".
	 * @param bool          $has_self_closing_flag Whether the source token contains the self-closing flag, regardless of whether it's valid.
	 * @param callable|null $on_destroy            Optional. Function to call when destroying token, useful for releasing the bookmark.
	 *                                             Default null.
	 */
	public function __construct( $bookmark_name, $node_name, $has_self_closing_flag, $on_destroy = null ) {
		$this->bookmark_name         = $bookmark_name;
		$this->node_name             = $node_name;
		$this->has_self_closing_flag = $has_self_closing_flag;
		$this->on_destroy            = $on_destroy;
	}

	/**
	 * Destructor.
	 *
	 * Invokes the `on_destroy` callback, when one is set and callable,
	 * passing it this token's bookmark name.
	 *
	 * @since 6.4.0
	 */
	public function __destruct() {
		if ( is_callable( $this->on_destroy ) ) {
			call_user_func( $this->on_destroy, $this->bookmark_name );
		}
	}

	/**
	 * Wakeup magic method.
	 *
	 * Always throws: tokens must not be restored through unserialization.
	 *
	 * @since 6.4.2
	 *
	 * @throws \LogicException Whenever an attempt is made to unserialize a token.
	 */
	public function __wakeup() {
		throw new \LogicException( __CLASS__ . ' should never be unserialized' );
	}
}
diff --git a/wp-includes/html-api/class-wp-html-unsupported-exception.php b/wp-includes/html-api/class-wp-html-unsupported-exception.php
new file mode 100644
index 0000000..6e72286
--- /dev/null
+++ b/wp-includes/html-api/class-wp-html-unsupported-exception.php
@@ -0,0 +1,31 @@
<?php
/**
 * HTML API: WP_HTML_Unsupported_Exception class
 *
 * @package WordPress
 * @subpackage HTML-API
 * @since 6.4.0
 */

/**
 * Core class used by the HTML processor during HTML parsing
 * for indicating that a given operation is unsupported.
 *
 * This class is designed for internal use by the HTML processor.
 *
 * The HTML API aims to operate in compliance with the HTML5
 * specification, but does not implement the full specification.
 * In cases where it lacks support it should not cause breakage
 * or unexpected behavior. In the cases where it recognizes that
 * it cannot proceed, this class is used to abort from any
 * operation and signify that the given HTML cannot be processed.
 *
 * @since 6.4.0
 *
 * @access private
 *
 * @see WP_HTML_Processor
 */
class WP_HTML_Unsupported_Exception extends Exception {

}