From 1c66dd0f1efe09fc1ebd9b83c5054e20ad79607b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Tue, 17 Dec 2024 13:36:04 +0100 Subject: [PATCH] [Data Liberation] Add HTML to Blocks converter Adds a basic WP_HTML_To_Blocks class that accepts HTML and outputs block markup. It only considers the markup and won't consider any visual changes introduced via CSS or JavaScript. A part of #1894 ## Example ```html $html = <<<HTML
<meta name="post_title" content="My first post">
<p>Hello world!</p>
HTML; $converter = new WP_HTML_To_Blocks( $html ); $converter->convert(); var_dump( $converter->get_all_metadata() ); /* * array( 'post_title' => array( 'My first post' ) ) */ var_dump( $converter->get_block_markup() ); /* * <!-- wp:paragraph -->
 * <p>Hello world!</p>
 * <!-- /wp:paragraph -->
* */ ``` ## Testing instructions This PR mostly adds new code. Just confirm the unit tests pass in CI. --- .../data-liberation/blueprints-library | 2 +- .../playground/data-liberation/bootstrap.php | 5 + .../playground/data-liberation/phpunit.xml | 2 + .../WP_Block_Markup_Converter.php | 8 + .../src/block-markup/WP_HTML_To_Blocks.php | 395 + .../src/entity-readers/WP_Entity_Reader.php | 70 + .../entity-readers/WP_HTML_Entity_Reader.php | 95 + .../src/import/WP_Import_Utils.php | 96 + .../class-wp-html-processor.php | 10 +- .../tests/WPHTMLEntityReaderTests.php | 75 + .../tests/WPHTMLToBlocksTests.php | 141 + .../html-to-blocks/excerpt.input.html | 189 + .../html-to-blocks/excerpt.output.html | 9251 +++++++++++++++++ 13 files changed, 10336 insertions(+), 3 deletions(-) create mode 100644 packages/playground/data-liberation/src/block-markup/WP_Block_Markup_Converter.php create mode 100644 packages/playground/data-liberation/src/block-markup/WP_HTML_To_Blocks.php create mode 100644 packages/playground/data-liberation/src/entity-readers/WP_Entity_Reader.php create mode 100644 packages/playground/data-liberation/src/entity-readers/WP_HTML_Entity_Reader.php create mode 100644 packages/playground/data-liberation/src/import/WP_Import_Utils.php create mode 100644 packages/playground/data-liberation/tests/WPHTMLEntityReaderTests.php create mode 100644 packages/playground/data-liberation/tests/WPHTMLToBlocksTests.php create mode 100644 packages/playground/data-liberation/tests/fixtures/html-to-blocks/excerpt.input.html create mode 100644 packages/playground/data-liberation/tests/fixtures/html-to-blocks/excerpt.output.html diff --git a/packages/playground/data-liberation/blueprints-library b/packages/playground/data-liberation/blueprints-library index 966e692625..b52a93ce17 160000 --- a/packages/playground/data-liberation/blueprints-library +++ b/packages/playground/data-liberation/blueprints-library @@ -1 +1 @@ -Subproject commit 966e6926256dc56c8473c6257d0d474be0f20811 
+Subproject commit b52a93ce17562a1964fb27df770792fe165b217b diff --git a/packages/playground/data-liberation/bootstrap.php b/packages/playground/data-liberation/bootstrap.php index 9c38ff0a6e..978acd3942 100644 --- a/packages/playground/data-liberation/bootstrap.php +++ b/packages/playground/data-liberation/bootstrap.php @@ -44,14 +44,17 @@ require_once __DIR__ . '/src/wordpress-core-html-api/html5-named-character-references.php'; } +require_once __DIR__ . '/src/block-markup/WP_Block_Markup_Converter.php'; require_once __DIR__ . '/src/block-markup/WP_Block_Markup_Processor.php'; require_once __DIR__ . '/src/block-markup/WP_Block_Markup_Url_Processor.php'; require_once __DIR__ . '/src/block-markup/WP_URL_In_Text_Processor.php'; +require_once __DIR__ . '/src/block-markup/WP_HTML_To_Blocks.php'; require_once __DIR__ . '/src/block-markup/WP_URL.php'; require_once __DIR__ . '/src/xml-api/WP_XML_Decoder.php'; require_once __DIR__ . '/src/xml-api/WP_XML_Processor.php'; require_once __DIR__ . '/src/wxr/WP_WXR_Reader.php'; +require_once __DIR__ . '/src/import/WP_Import_Utils.php'; require_once __DIR__ . '/src/import/WP_Block_Object.php'; require_once __DIR__ . '/src/import/WP_Entity_Importer.php'; require_once __DIR__ . '/src/import/WP_File_Visitor.php'; @@ -64,6 +67,8 @@ require_once __DIR__ . '/src/import/WP_Entity_Iterator_Chain.php'; require_once __DIR__ . '/src/import/WP_Retry_Frontloading_Iterator.php'; require_once __DIR__ . '/src/import/WP_Markdown_Importer.php'; +require_once __DIR__ . '/src/entity-readers/WP_Entity_Reader.php'; +require_once __DIR__ . '/src/entity-readers/WP_HTML_Entity_Reader.php'; require_once __DIR__ . 
'/src/utf8_decoder.php'; diff --git a/packages/playground/data-liberation/phpunit.xml b/packages/playground/data-liberation/phpunit.xml index 800b55f189..9646f33205 100644 --- a/packages/playground/data-liberation/phpunit.xml +++ b/packages/playground/data-liberation/phpunit.xml @@ -2,6 +2,8 @@ + tests/WPHTMLEntityReaderTests.php + tests/WPHTMLToBlocksTests.php tests/WPWXRReaderTests.php tests/WPRewriteUrlsTests.php tests/WPURLInTextProcessorTests.php diff --git a/packages/playground/data-liberation/src/block-markup/WP_Block_Markup_Converter.php b/packages/playground/data-liberation/src/block-markup/WP_Block_Markup_Converter.php new file mode 100644 index 0000000000..e3cd04b6de --- /dev/null +++ b/packages/playground/data-liberation/src/block-markup/WP_Block_Markup_Converter.php @@ -0,0 +1,8 @@ + + *

Hello world!

+ * + * Becomes: + * + * + *

Hello world!

+ * + * + * With the following metadata: + * + * array( + * 'post_title' => array( 'My first post' ), + * ) + */ +class WP_HTML_To_Blocks implements WP_Block_Markup_Converter { + const STATE_READY = 'STATE_READY'; + const STATE_COMPLETE = 'STATE_COMPLETE'; + + private $state = self::STATE_READY; + private $block_stack = array(); + private $html; + private $ignore_text = false; + private $in_ephemeral_paragraph = false; + private $block_markup = ''; + private $metadata = array(); + + public function __construct( $html ) { + $this->html = new \WP_HTML_Processor( $html ); + } + + public function convert() { + if ( self::STATE_READY !== $this->state ) { + return false; + } + + while ( $this->html->next_token() ) { + switch ( $this->html->get_token_type() ) { + case '#text': + if ( $this->ignore_text ) { + break; + } + $this->append_rich_text( htmlspecialchars( $this->html->get_modifiable_text() ) ); + break; + case '#tag': + $this->handle_tag(); + break; + } + } + + $this->close_ephemeral_paragraph(); + return true; + } + + public function get_meta_value( $key ) { + if ( ! array_key_exists( $key, $this->metadata ) ) { + return null; + } + return $this->metadata[ $key ][0]; + } + + public function get_all_metadata() { + return $this->metadata; + } + + public function get_block_markup() { + return $this->block_markup; + } + + private function handle_tag() { + $html = $this->html; + $tag = $html->get_tag(); + $tag_lowercase = strtolower( $tag ); + + $is_tag_opener = ! $html->is_tag_closer(); + if ( ! $html->expects_closer() ) { + switch ( $tag ) { + case 'META': + $key = $html->get_attribute( 'name' ); + $value = $html->get_attribute( 'content' ); + if ( ! 
array_key_exists( $key, $this->metadata ) ) { + $this->metadata[ $key ] = array(); + } + $this->metadata[ $key ][] = $value; + break; + case 'IMG': + $template = new \WP_HTML_Tag_Processor( '' ); + $template->next_tag(); + foreach ( array( 'alt', 'title', 'src' ) as $attr ) { + if ( $html->get_attribute( $attr ) ) { + $template->set_attribute( $attr, $html->get_attribute( $attr ) ); + } + } + /** + * + */ + $this->append_rich_text( $template->get_updated_html() ); + break; + default: + // @TODO: What to do with other void tags, e.g. ? + // Just insert an HTML block or what? + break; + } + } elseif ( $is_tag_opener ) { + switch ( $tag ) { + // Block elements + case 'SCRIPT': + $this->ignore_text = true; + break; + case 'UL': + case 'OL': + $this->push_block( 'list', array( 'ordered' => $tag === 'ol' ) ); + $this->block_markup .= ''; + $this->pop_block(); + break; + + case 'LI': + case 'BLOCKQUOTE': + case 'PRE': + case 'HR': + case 'P': + case 'H1': + case 'H2': + case 'H3': + case 'H4': + case 'H5': + case 'H6': + $this->block_markup .= ''; + $this->pop_block(); + break; + + case 'A': + $this->block_markup .= ''; + break; + + // Formats + default: + if ( $this->should_preserve_tag_in_rich_text( $tag ) ) { + $this->block_markup .= ''; + } + break; + } + } + } + + /** + * Checks whether the given tag is an inline formatting element + * that we want to preserve when parsing rich text. For example, + * tags are meaningful from the rich text perspective, but + *
tags are not. + * + * @param string $tag The tag to check. + * @return bool Whether the tag should be preserved in rich text. + */ + private function should_preserve_tag_in_rich_text( $tag ) { + return in_array( + $tag, + array( + 'B', + 'STRONG', + 'I', + 'U', + 'S', + 'SMALL', + 'SUP', + 'SUB', + 'MARK', + 'EM', + 'CITE', + 'DFN', + 'CODE', + 'KBD', + 'SAMP', + 'VAR', + ), + true + ); + } + + private function is_at_inline_code_element() { + $breadcrumbs = $this->html->get_breadcrumbs(); + foreach ( $breadcrumbs as $tag ) { + switch ( $tag ) { + case 'A': + case 'P': + case 'LI': + case 'TABLE': + case 'H1': + case 'H2': + case 'H3': + case 'H4': + case 'H5': + case 'H6': + return true; + } + } + return false; + } + + /** + * Appends a snippet of HTML to the block markup. + * Ensures given $html is a part of a block. If no block is + * currently open, it appends a new paragraph block. + * + * @param string $html The HTML snippet to append. + */ + private function append_rich_text( $html ) { + $html = trim( $html ); + if ( empty( $html ) ) { + return; + } + // Make sure two subsequent append_text() calls don't merge the text. + $html .= ' '; + $this->ensure_open_block(); + $this->block_markup .= $html; + } + + /** + * Pushes a new block onto the stack of open blocks and appends the block + * opener to the block markup. + * + * @param string $name The name of the block to push. + * @param array $attributes The attributes of the block to push. + */ + private function push_block( $name, $attributes = array() ) { + $this->close_ephemeral_paragraph(); + $block = new \WP_Block_Object( $name, $attributes ); + array_push( $this->block_stack, $block ); + $this->block_markup .= WP_Import_Utils::block_opener( $block->block_name, $block->attrs ) . "\n"; + } + + /** + * Pops the last block from the stack of open blocks and appends the block + * closer to the block markup. + * + * @return \WP_Block_Object The last block that was popped. 
+ */ + private function pop_block() { + if ( ! empty( $this->block_stack ) ) { + $popped = array_pop( $this->block_stack ); + $this->block_markup .= WP_Import_Utils::block_closer( $popped->block_name ) . "\n"; + return $popped; + } + } + + /** + * Ensures that a block is open. If no block is currently open, it appends + * a new, ephemeral paragraph block that will be automatically closed + * when the next block opens OR when the HTML ends. + */ + private function ensure_open_block() { + if ( empty( $this->block_stack ) && ! $this->in_ephemeral_paragraph ) { + $this->block_markup .= WP_Import_Utils::block_opener( 'paragraph' ) . "\n"; + $this->block_markup .= '

'; + $this->in_ephemeral_paragraph = true; + } + } + + /** + * Closes the ephemeral paragraph if it is currently open. + */ + private function close_ephemeral_paragraph() { + if ( $this->in_ephemeral_paragraph ) { + $this->block_markup .= '

'; + $this->block_markup .= WP_Import_Utils::block_closer( 'paragraph' ); + $this->in_ephemeral_paragraph = false; + } + } +} diff --git a/packages/playground/data-liberation/src/entity-readers/WP_Entity_Reader.php b/packages/playground/data-liberation/src/entity-readers/WP_Entity_Reader.php new file mode 100644 index 0000000000..77261f4b35 --- /dev/null +++ b/packages/playground/data-liberation/src/entity-readers/WP_Entity_Reader.php @@ -0,0 +1,70 @@ +get_entity() && ! $this->is_finished() && ! $this->get_last_error() ) { + $this->next(); + } + return $this->get_entity(); + } + + private $last_next_result = null; + public function next(): void { + // @TODO: Don't keep track of this. Just make sure the next_entity() + // call will make the is_finished() true. + $this->last_next_result = $this->next_entity(); + } + + public function key(): string { + return $this->get_reentrancy_cursor(); + } + + public function valid(): bool { + return false !== $this->last_next_result && ! $this->is_finished() && ! $this->get_last_error(); + } + + public function rewind(): void { + // Haven't started yet. + if ( null === $this->last_next_result ) { + return; + } + _doing_it_wrong( + __METHOD__, + 'WP_WXR_Entity_Reader does not support rewinding.', + null + ); + } +} diff --git a/packages/playground/data-liberation/src/entity-readers/WP_HTML_Entity_Reader.php b/packages/playground/data-liberation/src/entity-readers/WP_HTML_Entity_Reader.php new file mode 100644 index 0000000000..b01bd0c875 --- /dev/null +++ b/packages/playground/data-liberation/src/entity-readers/WP_HTML_Entity_Reader.php @@ -0,0 +1,95 @@ +html = $html; + $this->post_id = $post_id; + } + + public function next_entity() { + // If we're finished, we're finished. + if ( $this->finished ) { + return false; + } + + // If we've already read some entities, skip to the next one. 
+ if ( null !== $this->entities ) { + if ( count( $this->entities ) <= 1 ) { + $this->finished = true; + return false; + } + array_shift( $this->entities ); + return true; + } + + // We did not read any entities yet. Let's convert the HTML document into entities. + $converter = new WP_HTML_To_Blocks( $this->html ); + if ( false === $converter->convert() ) { + return false; + } + + $all_metadata = $converter->get_all_metadata(); + $post_fields = array(); + $other_metadata = array(); + foreach ( $all_metadata as $key => $values ) { + if ( in_array( $key, WP_Imported_Entity::POST_FIELDS, true ) ) { + $post_fields[ $key ] = $values[0]; + } else { + $other_metadata[ $key ] = $values[0]; + } + } + + // Yield the post entity. + $this->entities[] = new WP_Imported_Entity( + 'post', + array_merge( + $post_fields, + array( + 'post_id' => $this->post_id, + 'content' => $converter->get_block_markup(), + ) + ) + ); + + // Yield all the metadata that don't belong to the post entity. + foreach ( $other_metadata as $key => $value ) { + $this->entities[] = new WP_Imported_Entity( + 'post_meta', + array( + 'post_id' => $this->post_id, + 'meta_key' => $key, + 'meta_value' => $value, + ) + ); + } + return true; + } + + public function get_entity() { + if ( $this->is_finished() ) { + return false; + } + return $this->entities[0]; + } + + public function is_finished(): bool { + return $this->finished; + } + + public function get_last_error(): ?string { + return null; + } +} diff --git a/packages/playground/data-liberation/src/import/WP_Import_Utils.php b/packages/playground/data-liberation/src/import/WP_Import_Utils.php new file mode 100644 index 0000000000..91761e4223 --- /dev/null +++ b/packages/playground/data-liberation/src/import/WP_Import_Utils.php @@ -0,0 +1,96 @@ +"; + } + + public static function block_closer( $block_name ) { + return ""; + } + + /** + * Convert an array of WP_Block_Object objects to HTML markup. + * + * @param array $blocks The blocks to convert to markup. 
+ * @return string The HTML markup. + */ + public static function convert_blocks_to_markup( $blocks ) { + $block_markup = ''; + + foreach ( $blocks as $block ) { + // Allow mixing of inner blocks and content strings. + if ( is_string( $block ) ) { + $block_markup .= $block; + continue; + } + // Start of block comment + $block_markup .= self::block_opener( $block->block_name, $block->attrs ); + $block_markup .= $block->attrs['content'] ?? ''; + $block_markup .= self::convert_blocks_to_markup( $block->inner_blocks ); + $block_markup .= self::block_closer( $block->block_name ); + } + + return $block_markup; + } + + public static function slug_to_title( $filename ) { + $name = pathinfo( $filename, PATHINFO_FILENAME ); + $name = preg_replace( '/^\d+/', '', $name ); + $name = str_replace( + array( '-', '_' ), + ' ', + $name + ); + $name = ucwords( $name ); + return $name; + } + + public static function remove_first_h1_block_from_block_markup( $html ) { + $p = WP_Import_HTML_Processor::create_fragment( $html ); + if ( false === $p->next_tag() ) { + return false; + } + if ( $p->get_tag() !== 'H1' ) { + return false; + } + $depth = $p->get_current_depth(); + $title = ''; + do { + if ( false === $p->next_token() ) { + break; + } + if ( $p->get_token_type() === '#text' ) { + $title .= $p->get_modifiable_text() . ' '; + } + } while ( $p->get_current_depth() > $depth ); + + if ( ! 
$title ) { + return false; + } + + // Move past the closing comment + $p->next_token(); + if ( $p->get_token_type() === '#text' ) { + $p->next_token(); + } + if ( $p->get_token_type() !== '#comment' ) { + return false; + } + + return array( + 'content' => trim( $title ), + 'remaining_html' => substr( + $html, + $p->get_string_index_after_current_token() + ), + ); + } +} diff --git a/packages/playground/data-liberation/src/wordpress-core-html-api/class-wp-html-processor.php b/packages/playground/data-liberation/src/wordpress-core-html-api/class-wp-html-processor.php index 14cb296d43..c2109168b4 100644 --- a/packages/playground/data-liberation/src/wordpress-core-html-api/class-wp-html-processor.php +++ b/packages/playground/data-liberation/src/wordpress-core-html-api/class-wp-html-processor.php @@ -1613,7 +1613,10 @@ private function step_in_head(): bool { */ $charset = $this->get_attribute( 'charset' ); if ( is_string( $charset ) && 'tentative' === $this->state->encoding_confidence ) { - $this->bail( 'Cannot yet process META tags with charset to determine encoding.' ); + // Commenting this out for now. We're assuming UTF-8 in WP_HTML_To_Blocks and + // we don't want to fail just because a document contained a meta tag with a UTF-8 charset. + // @TODO: Bail on non-utf8 charsets. + // $this->bail( 'Cannot yet process META tags with charset to determine encoding.' ); } /* @@ -1632,7 +1635,10 @@ private function step_in_head(): bool { 0 === strcasecmp( $http_equiv, 'Content-Type' ) && 'tentative' === $this->state->encoding_confidence ) { - $this->bail( 'Cannot yet process META tags with http-equiv Content-Type to determine encoding.' ); + // Commenting this out for now. We're assuming UTF-8 in WP_HTML_To_Blocks and + // we don't want to fail just because a document contained a meta tag with a UTF-8 charset. + // @TODO: Bail on non-utf8 charsets. + // $this->bail( 'Cannot yet process META tags with http-equiv Content-Type to determine encoding.' 
); } return true; diff --git a/packages/playground/data-liberation/tests/WPHTMLEntityReaderTests.php b/packages/playground/data-liberation/tests/WPHTMLEntityReaderTests.php new file mode 100644 index 0000000000..be233599fa --- /dev/null +++ b/packages/playground/data-liberation/tests/WPHTMLEntityReaderTests.php @@ -0,0 +1,75 @@ + + + + +

It is our pleasure to announce that WordPress 6.8 was released

+

Last week, WordPress 6.8 was released.

+HTML; + $reader = new WP_HTML_Entity_Reader( $html, 1 ); + $entities = []; + while ( $reader->next_entity() ) { + $data = $reader->get_entity()->get_data(); + if(isset($data['content'])) { + $data['content'] = $this->normalize_markup( $data['content'] ); + } + $entities[] = [ + 'type' => $reader->get_entity()->get_type(), + 'data' => $data, + ]; + } + $expected_entities = [ + [ + 'type' => 'post', + 'data' => [ + 'post_title' => 'WordPress 6.8 was released', + 'post_date' => '2024-12-16', + 'post_id' => 1, + 'content' => $this->normalize_markup(<< +

It is our pleasure to announce that WordPress 6.8 was released

+ + + +

Last week, WordPress 6.8 was released.

+ +HTML) + ] + ], + [ + 'type' => 'post_meta', + 'data' => [ + 'post_id' => 1, + 'meta_key' => 'custom_post_meta', + 'meta_value' => 'custom_post_meta_value', + ] + ], + [ + 'type' => 'post_meta', + 'data' => [ + 'post_id' => 1, + 'meta_key' => 'color_palette', + 'meta_value' => 'use_that_pretty_one', + ] + ], + ]; + $this->assertEquals( $expected_entities, $entities ); + } + + private function normalize_markup( $markup ) { + $processor = new WP_HTML_Processor( $markup ); + $serialized = $processor->serialize(); + if(str_ends_with($serialized, "")) { + $serialized = substr($serialized, 0, strlen("")); + } + return $serialized; + } + +} diff --git a/packages/playground/data-liberation/tests/WPHTMLToBlocksTests.php b/packages/playground/data-liberation/tests/WPHTMLToBlocksTests.php new file mode 100644 index 0000000000..fc78ecc98a --- /dev/null +++ b/packages/playground/data-liberation/tests/WPHTMLToBlocksTests.php @@ -0,0 +1,141 @@ + + + + + + + +

WordPress 6.8 was released

+

Last week, WordPress 6.8 was released. This release includes a new default theme, a new block editor experience, and a new block library. It also includes a new block editor experience, and a new block library.

+HTML; + $converter = new WP_HTML_To_Blocks( $html ); + $converter->convert( $html ); + $metadata = $converter->get_all_metadata(); + $expected_metadata = [ + 'post_title' => ['WordPress 6.8 was released'], + 'post_date' => ['2024-12-16'], + 'post_modified' => ['2024-12-16'], + 'post_author' => ['1'], + 'post_author_name' => ['The WordPress Team'], + 'post_author_url' => ['https://wordpress.org'], + 'post_author_avatar' => ['https://wordpress.org/wp-content/uploads/2024/04/wordpress-logo-2024.png'], + ]; + $this->assertEquals( $expected_metadata, $metadata ); + } + + /** + * @dataProvider provider_test_conversion + */ + public function test_html_to_blocks_conversion( $html, $expected ) { + $converter = new WP_HTML_To_Blocks( $html ); + $converter->convert( $html ); + $blocks = $converter->get_block_markup(); + + $this->assertEquals( $this->normalize_markup($expected), $this->normalize_markup($blocks) ); + } + + private function normalize_markup( $markup ) { + $processor = new WP_HTML_Processor( $markup ); + $serialized = $processor->serialize(); + $serialized = trim( + str_replace( + [ + // Naively remove parts of the HTML that serialize() + // adds that we don't want. + '', + '', + // Even more naively, remove all the newlines. + "\n" + ], + '', + $serialized + ) + ); + return $serialized; + } + + public function provider_test_conversion() { + return [ + 'A simple paragraph' => [ + 'html' => '

A simple paragraph

', + 'expected' => "

A simple paragraph

" + ], + 'A simple list' => [ + 'html' => '
  • Item 1
  • Item 2
', + 'expected' => <<
    \n
  • Item 1
  • Item 2
+HTML + ], + 'A non-normative list' => [ + 'html' => '