From d1fd2a4d6fa80da8406408fac6e49c7c163bde6e Mon Sep 17 00:00:00 2001 From: naokomura Date: Wed, 22 Nov 2023 11:22:25 +0900 Subject: [PATCH 1/5] fix regex for html tag split in wptexturize --- src/wp-includes/formatting.php | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/wp-includes/formatting.php b/src/wp-includes/formatting.php index bc150c8a537cb..c046ecc5c7dca 100644 --- a/src/wp-includes/formatting.php +++ b/src/wp-includes/formatting.php @@ -696,11 +696,20 @@ function _get_wptexturize_split_regex( $shortcode_regex = '' ) { . ')*+' // Loop possessively. . '(?:-->)?'; // End of comment. If not found, match all input. + /** + * @see https://html.spec.whatwg.org/multipage/syntax.html#attributes-2 + */ + $attribute_regex = + '[^>"\']*' // Find before end of element, or before start of attribute value. + . '(?:"[^"]*"[^>"]*)*' // Double-quoted attribute value + . '(?:\'[^\']*\'[^>\']*)*'; // Single-quoted attribute value + $html_regex = // Needs replaced with wp_html_split() per Shortcode API Roadmap. '<' // Find start of element. . '(?(?=!--)' // Is this a comment? . $comment_regex // Find end of comment. . '|' + . $attribute_regex // Exclude matching within attribute values. . '[^>]*>?' // Find end of element. If not found, match all input. . ')'; // phpcs:enable From 0b144722046fd75ec2b20a00d17814e15235773b Mon Sep 17 00:00:00 2001 From: naokomura Date: Wed, 22 Nov 2023 15:03:48 +0900 Subject: [PATCH 2/5] refactor attribute regex, add attribute regex to get_html_split_regex() --- src/wp-includes/formatting.php | 38 ++++++++++++++++++++++------------ 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/src/wp-includes/formatting.php b/src/wp-includes/formatting.php index c046ecc5c7dca..f5f43d5641614 100644 --- a/src/wp-includes/formatting.php +++ b/src/wp-includes/formatting.php @@ -657,13 +657,22 @@ function get_html_split_regex() { . $cdata . ')'; + $attribute = + '[^>"\']*' // Match any character except >, ", or ' + . '(?:(?:' // Start of non-capturing group. + . '"[^"]*"' // Double-quoted attribute value. + . '|' // or + . '\'[^\']*\'' // Single-quoted attribute value. + . '))*'; // End of attribute value. Proceed to next regex. + $regex = - '/(' // Capture the entire match. - . '<' // Find start of element. - . '(?' // Conditional expression follows. - . $escaped // Find end of escaped element. - . '|' // ...else... - . '[^>]*>?' // Find end of normal element. + '/(' // Capture the entire match. + . '<' // Find start of element. + . '(?' // Conditional expression follows. + . $escaped // Find end of escaped element. + . '|' // ...else... + . $attribute // Exclude matching within attribute values. + . '[^>]*>?' // Find end of normal element. . ')' . ')/'; // phpcs:enable @@ -700,17 +709,20 @@ function _get_wptexturize_split_regex( $shortcode_regex = '' ) { * @see https://html.spec.whatwg.org/multipage/syntax.html#attributes-2 */ $attribute_regex = - '[^>"\']*' // Find before end of element, or before start of attribute value. - . '(?:"[^"]*"[^>"]*)*' // Double-quoted attribute value - . '(?:\'[^\']*\'[^>\']*)*'; // Single-quoted attribute value + '[^>"\']*' // Match any character except >, ", or ' + . '(?:(?:' // Start of non-capturing group. + . '"[^"]*"' // Double-quoted attribute value. + . '|' // or + . '\'[^\']*\'' // Single-quoted attribute value. + . '))*'; // End of attribute value. Proceed to next regex. $html_regex = // Needs replaced with wp_html_split() per Shortcode API Roadmap. - '<' // Find start of element. - . '(?(?=!--)' // Is this a comment? - . $comment_regex // Find end of comment. + '<' // Find start of element. + . '(?(?=!--)' // Is this a comment? + . $comment_regex // Find end of comment. . '|' . $attribute_regex // Exclude matching within attribute values. - . '[^>]*>?' // Find end of element. If not found, match all input. + . '[^>]*>?' // Find end of element. If not found, match all input. . ')'; // phpcs:enable } From 3e9aa12eb14362e8133342b559a82f406c4e652d Mon Sep 17 00:00:00 2001 From: naokomura Date: Wed, 22 Nov 2023 16:13:22 +0900 Subject: [PATCH 3/5] add tests for 'greater than' in html attribute value --- .../phpunit/tests/formatting/wpHtmlSplit.php | 8 +++++ .../phpunit/tests/formatting/wpTexturize.php | 33 +++++++++++++++++++ 2 files changed, 41 insertions(+) diff --git a/tests/phpunit/tests/formatting/wpHtmlSplit.php b/tests/phpunit/tests/formatting/wpHtmlSplit.php index 750ad3821cc54..121f96c7fb8be 100644 --- a/tests/phpunit/tests/formatting/wpHtmlSplit.php +++ b/tests/phpunit/tests/formatting/wpHtmlSplit.php @@ -34,6 +34,14 @@ public function data_basic_features() { 'abcd ]]> efgh', array( 'abcd ', ' ]]>', ' efgh' ), ), + array( + 'abcd efgh', + array( 'abcd ', '', ' efgh' ), + ), + array( + 'abcd bar\' /> efgh', + array( 'abcd ', 'bar\' />', ' efgh' ), + ), ); } diff --git a/tests/phpunit/tests/formatting/wpTexturize.php b/tests/phpunit/tests/formatting/wpTexturize.php index 3202db4ba760f..e603ae97415a8 100644 --- a/tests/phpunit/tests/formatting/wpTexturize.php +++ b/tests/phpunit/tests/formatting/wpTexturize.php @@ -2115,4 +2115,37 @@ public function data_whole_posts() { require_once DIR_TESTDATA . '/formatting/whole-posts.php'; return data_whole_posts(); } + + /** + * @ticket 57381 + * @dataProvider data_greater_than_in_attribute_value + */ + public function test_greater_than_in_attribute_value( $input, $output ) { + $this->assertSame( $output, wptexturize( $input ) ); + } + + public function data_greater_than_in_attribute_value() { + return array( + array( + ' + + + ', + ' + + + ', + ), + array( + ' + + bar\' /> + ', + ' + + bar\' /> + ', + ), + ); + } } From 6879be1deba27d5b4404ef33c977894a8b58384d Mon Sep 17 00:00:00 2001 From: naokomura Date: Wed, 22 Nov 2023 18:48:27 +0900 Subject: [PATCH 4/5] fix bug of attribute regex in some cases, add test for that cases --- src/wp-includes/formatting.php | 56 ++++++++++--------- .../phpunit/tests/formatting/wpHtmlSplit.php | 10 ++++ .../phpunit/tests/formatting/wpTexturize.php | 10 ++++ 3 files changed, 50 insertions(+), 26 deletions(-) diff --git a/src/wp-includes/formatting.php b/src/wp-includes/formatting.php index f5f43d5641614..27634491d15fa 100644 --- a/src/wp-includes/formatting.php +++ b/src/wp-includes/formatting.php @@ -657,22 +657,24 @@ function get_html_split_regex() { . $cdata . ')'; - $attribute = - '[^>"\']*' // Match any character except >, ", or ' - . '(?:(?:' // Start of non-capturing group. - . '"[^"]*"' // Double-quoted attribute value. - . '|' // or - . '\'[^\']*\'' // Single-quoted attribute value. - . '))*'; // End of attribute value. Proceed to next regex. + $ignore_attr = + '(?:' + . '[^>"\']*' // Match any characters except >, " or '. + . '(?:' + . '"[^"]*"' // Double-quoted attribute value. + . '|' + . '\'[^\']*\'' // Single-quoted attribute value. + . ')?' + . ')*'; // End of attribute value. $regex = - '/(' // Capture the entire match. - . '<' // Find start of element. - . '(?' // Conditional expression follows. - . $escaped // Find end of escaped element. - . '|' // ...else... - . $attribute // Exclude matching within attribute values. - . '[^>]*>?' // Find end of normal element. + '/(' // Capture the entire match. + . '<' // Find start of element. + . '(?' // Conditional expression follows. + . $escaped // Find end of escaped element. + . '|' // ...else... + . $ignore_attr // Exclude matching within attribute values. + . '[^>]*>?' // Find end of normal element. . ')' . ')/'; // phpcs:enable @@ -708,21 +710,23 @@ function _get_wptexturize_split_regex( $shortcode_regex = '' ) { /** * @see https://html.spec.whatwg.org/multipage/syntax.html#attributes-2 */ - $attribute_regex = - '[^>"\']*' // Match any character except >, ", or ' - . '(?:(?:' // Start of non-capturing group. - . '"[^"]*"' // Double-quoted attribute value. - . '|' // or - . '\'[^\']*\'' // Single-quoted attribute value. - . '))*'; // End of attribute value. Proceed to next regex. + $ignore_attr_regex = + '(?:' + . '[^>"\']*' // Match any characters except >, " or '. + . '(?:' + . '"[^"]*"' // Double-quoted attribute value. + . '|' + . '\'[^\']*\'' // Single-quoted attribute value. + . ')?' + . ')*'; // End of attribute value. $html_regex = // Needs replaced with wp_html_split() per Shortcode API Roadmap. - '<' // Find start of element. - . '(?(?=!--)' // Is this a comment? - . $comment_regex // Find end of comment. + '<' // Find start of element. + . '(?(?=!--)' // Is this a comment? + . $comment_regex // Find end of comment. . '|' - . $attribute_regex // Exclude matching within attribute values. - . '[^>]*>?' // Find end of element. If not found, match all input. + . $ignore_attr_regex // Ignore matching of element end within attribute values. + . '[^>]*>?' // Find end of element. If not found, match all input. . ')'; // phpcs:enable } diff --git a/tests/phpunit/tests/formatting/wpHtmlSplit.php b/tests/phpunit/tests/formatting/wpHtmlSplit.php index 121f96c7fb8be..bf0ade5c04123 100644 --- a/tests/phpunit/tests/formatting/wpHtmlSplit.php +++ b/tests/phpunit/tests/formatting/wpHtmlSplit.php @@ -42,6 +42,16 @@ public function data_basic_features() { 'abcd bar\' /> efgh', array( 'abcd ', 'bar\' />', ' efgh' ), ), + array( + '

numbers

', + array( + '', + '

', + 'numbers', + '

', + '', + ), + ), ); } diff --git a/tests/phpunit/tests/formatting/wpTexturize.php b/tests/phpunit/tests/formatting/wpTexturize.php index e603ae97415a8..90ca62902fcb3 100644 --- a/tests/phpunit/tests/formatting/wpTexturize.php +++ b/tests/phpunit/tests/formatting/wpTexturize.php @@ -2117,6 +2117,8 @@ public function data_whole_posts() { } /** + * @ticket 43457 + * @ticket 45387 * @ticket 57381 * @dataProvider data_greater_than_in_attribute_value */ @@ -2146,6 +2148,14 @@ public function data_greater_than_in_attribute_value() { bar\' /> ', ), + array( + 'loading...', + 'loading…', + ), + array( + '

Go to WordPress ->

', + '

Go to WordPress ->

', + ), ); } } From 8940e5ce3cbd689d05f7265bdbbbfb9f24fb3951 Mon Sep 17 00:00:00 2001 From: naokomura Date: Wed, 22 Nov 2023 23:48:55 +0900 Subject: [PATCH 5/5] remove reference transformation process for '&' entities in html attr values --- src/wp-includes/formatting.php | 4 ---- tests/phpunit/tests/formatting/wpTexturize.php | 8 ++++---- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/src/wp-includes/formatting.php b/src/wp-includes/formatting.php index 27634491d15fa..48300acb3e872 100644 --- a/src/wp-includes/formatting.php +++ b/src/wp-includes/formatting.php @@ -249,10 +249,6 @@ function wptexturize( $text, $reset = false ) { continue; } else { // This is an HTML element delimiter. - - // Replace each & with & unless it already looks like an entity. - $curl = preg_replace( '/&(?!#(?:\d+|x[a-f0-9]+);|[a-z1-4]{1,8};)/i', '&', $curl ); - _wptexturize_pushpop_element( $curl, $no_texturize_tags_stack, $no_texturize_tags ); } } elseif ( '' === trim( $curl ) ) { diff --git a/tests/phpunit/tests/formatting/wpTexturize.php b/tests/phpunit/tests/formatting/wpTexturize.php index 90ca62902fcb3..4ff0fa49d08f2 100644 --- a/tests/phpunit/tests/formatting/wpTexturize.php +++ b/tests/phpunit/tests/formatting/wpTexturize.php @@ -1278,11 +1278,11 @@ public function data_tag_avoidance() { ), array( '[ photos by this guy & that guy ]', - '[ photos by this guy & that guy ]', + '[ photos by this guy & that guy ]', ), array( '[photos by this guy & that guy ]', - '[photos by this guy & that guy ]', + '[photos by this guy & that guy ]', ), array( '& ', @@ -2134,7 +2134,7 @@ public function data_greater_than_in_attribute_value() { ', ' - + ', ), @@ -2144,7 +2144,7 @@ public function data_greater_than_in_attribute_value() { bar\' /> ', ' - + bar\' /> ', ),