diff --git a/packages/playground/data-liberation/bootstrap.php b/packages/playground/data-liberation/bootstrap.php index 091dcc4ec0..dcfab2623f 100644 --- a/packages/playground/data-liberation/bootstrap.php +++ b/packages/playground/data-liberation/bootstrap.php @@ -61,6 +61,7 @@ require_once __DIR__ . '/src/entity-readers/WP_Entity_Reader.php'; require_once __DIR__ . '/src/entity-readers/WP_HTML_Entity_Reader.php'; +require_once __DIR__ . '/src/entity-readers/WP_EPub_Entity_Reader.php'; require_once __DIR__ . '/src/entity-readers/WP_WXR_Entity_Reader.php'; require_once __DIR__ . '/src/entity-readers/WP_Directory_Tree_Entity_Reader.php'; diff --git a/packages/playground/data-liberation/src/block-markup/WP_HTML_To_Blocks.php b/packages/playground/data-liberation/src/block-markup/WP_HTML_To_Blocks.php index 5e82314ca9..0d36c5629e 100644 --- a/packages/playground/data-liberation/src/block-markup/WP_HTML_To_Blocks.php +++ b/packages/playground/data-liberation/src/block-markup/WP_HTML_To_Blocks.php @@ -28,14 +28,15 @@ class WP_HTML_To_Blocks implements WP_Block_Markup_Converter { private $state = self::STATE_READY; private $block_stack = array(); - private $html; + private $markup_processor; private $ignore_text = false; private $in_ephemeral_paragraph = false; private $block_markup = ''; private $metadata = array(); + private $last_error = null; - public function __construct( $html ) { - $this->html = new \WP_HTML_Processor( $html ); + public function __construct( $markup_processor ) { + $this->markup_processor = $markup_processor; } public function convert() { @@ -43,13 +44,13 @@ public function convert() { return false; } - while ( $this->html->next_token() ) { - switch ( $this->html->get_token_type() ) { + while ( $this->markup_processor->next_token() ) { + switch ( $this->markup_processor->get_token_type() ) { case '#text': if ( $this->ignore_text ) { break; } - $this->append_rich_text( htmlspecialchars( $this->html->get_modifiable_text() ) ); + $this->append_rich_text( htmlspecialchars( $this->markup_processor->get_modifiable_text() ) ); break; case '#tag': $this->handle_tag(); @@ -57,7 +58,13 @@ public function convert() { } } + if ( $this->markup_processor->get_last_error() ) { + $this->last_error = $this->markup_processor->get_last_error(); + return false; + } + $this->close_ephemeral_paragraph(); + return true; } @@ -77,12 +84,12 @@ public function get_block_markup() { } private function handle_tag() { - $html = $this->html; - $tag = $html->get_tag(); + $html = $this->markup_processor; + $tag = strtoupper( $html->get_tag() ); $tag_lowercase = strtolower( $tag ); - $is_tag_opener = ! $html->is_tag_closer(); - if ( ! $html->expects_closer() ) { + $is_void_tag = ! $html->expects_closer() && ! $html->is_tag_closer(); + if ( $is_void_tag ) { switch ( $tag ) { case 'META': $key = $html->get_attribute( 'name' ); @@ -110,7 +117,7 @@ private function handle_tag() { // Just insert an HTML block or what? break; } - } elseif ( $is_tag_opener ) { + } elseif ( ! $html->is_tag_closer() ) { switch ( $tag ) { // Block elements case 'SCRIPT': @@ -304,7 +311,7 @@ private function should_preserve_tag_in_rich_text( $tag ) { } private function is_at_inline_code_element() { - $breadcrumbs = $this->html->get_breadcrumbs(); + $breadcrumbs = $this->markup_processor->get_breadcrumbs(); foreach ( $breadcrumbs as $tag ) { switch ( $tag ) { case 'A': @@ -392,4 +399,8 @@ private function close_ephemeral_paragraph() { $this->in_ephemeral_paragraph = false; } } + + public function get_last_error() { + return $this->last_error; + } } diff --git a/packages/playground/data-liberation/src/entity-readers/WP_EPub_Entity_Reader.php b/packages/playground/data-liberation/src/entity-readers/WP_EPub_Entity_Reader.php new file mode 100644 index 0000000000..db7b8b9df3 --- /dev/null +++ b/packages/playground/data-liberation/src/entity-readers/WP_EPub_Entity_Reader.php @@ -0,0 +1,123 @@ +zip = $zip; + $this->current_post_id = $first_post_id; + } + + public function next_entity() { + if ( $this->last_error ) { + return false; + } + + if ( $this->finished ) { + return false; + } + + if ( null === $this->remaining_html_files ) { + $path = false; + foreach ( array( '/OEBPS', '/EPUB' ) as $path_candidate ) { + if ( $this->zip->is_dir( $path_candidate ) ) { + $path = $path_candidate; + break; + } + } + if ( false === $path ) { + _doing_it_wrong( __METHOD__, 'The EPUB file did not contain any HTML files.', '1.0.0' ); + $this->finished = true; + return false; + } + + $files = $this->zip->ls( $path ); + if ( false === $files ) { + _doing_it_wrong( __METHOD__, 'The EPUB file did not contain any HTML files.', '1.0.0' ); + $this->finished = true; + return false; + } + $this->remaining_html_files = array(); + foreach ( $files as $file ) { + if ( str_ends_with( $file, '.xhtml' ) || str_ends_with( $file, '.html' ) ) { + $this->remaining_html_files[] = $path . '/' . $file; + } + } + } + + while ( true ) { + if ( null !== $this->current_html_reader ) { + if ( + ! $this->current_html_reader->is_finished() && + $this->current_html_reader->next_entity() + ) { + return true; + } + if ( $this->current_html_reader->get_last_error() ) { + _doing_it_wrong( + __METHOD__, + 'The EPUB file did not contain any HTML files.', + '1.0.0' + ); + $this->finished = true; + return false; + } + } + + if ( count( $this->remaining_html_files ) === 0 ) { + $this->finished = true; + return false; + } + + $html_file = array_shift( $this->remaining_html_files ); + $html = $this->zip->read_file( $html_file ); + $this->current_html_reader = new WP_HTML_Entity_Reader( + WP_XML_Processor::create_from_string( $html ), + $this->current_post_id + ); + if ( $this->current_html_reader->get_last_error() ) { + $this->last_error = $this->current_html_reader->get_last_error(); + return false; + } + ++$this->current_post_id; + } + + return false; + } + + public function get_entity() { + return $this->current_html_reader->get_entity(); + } + + public function is_finished(): bool { + return $this->finished; + } + + public function get_last_error(): ?string { + return $this->last_error; + } +} diff --git a/packages/playground/data-liberation/src/entity-readers/WP_HTML_Entity_Reader.php b/packages/playground/data-liberation/src/entity-readers/WP_HTML_Entity_Reader.php index b01bd0c875..67ceadad7d 100644 --- a/packages/playground/data-liberation/src/entity-readers/WP_HTML_Entity_Reader.php +++ b/packages/playground/data-liberation/src/entity-readers/WP_HTML_Entity_Reader.php @@ -1,7 +1,5 @@ html = $html; - $this->post_id = $post_id; + public function __construct( $html_processor, $post_id ) { + $this->html_processor = $html_processor; + $this->post_id = $post_id; } public function next_entity() { @@ -36,8 +35,9 @@ public function next_entity() { } // We did not read any entities yet. Let's convert the HTML document into entities. - $converter = new WP_HTML_To_Blocks( $this->html ); + $converter = new WP_HTML_To_Blocks( $this->html_processor ); if ( false === $converter->convert() ) { + $this->last_error = $converter->get_last_error(); return false; } @@ -90,6 +90,6 @@ public function is_finished(): bool { } public function get_last_error(): ?string { - return null; + return $this->last_error; } } diff --git a/packages/playground/data-liberation/src/xml-api/WP_XML_Processor.php b/packages/playground/data-liberation/src/xml-api/WP_XML_Processor.php index 881e689020..b6b2a7669e 100644 --- a/packages/playground/data-liberation/src/xml-api/WP_XML_Processor.php +++ b/packages/playground/data-liberation/src/xml-api/WP_XML_Processor.php @@ -1558,7 +1558,6 @@ private function parse_next_tag() { * See https://www.w3.org/TR/xml11.xml/#sec-cdata-sect */ if ( - ! $this->is_closing_tag && $doc_length > $this->token_starts_at + 8 && '[' === $xml[ $this->token_starts_at + 2 ] && 'C' === $xml[ $this->token_starts_at + 3 ] && @@ -1583,6 +1582,59 @@ private function parse_next_tag() { return true; } + /* + * Identify DOCTYPE nodes. + * + * See https://www.w3.org/TR/xml11.html/#dtd + */ + if ( + $doc_length > $this->token_starts_at + 8 && + 'D' === $xml[ $at + 2 ] && + 'O' === $xml[ $at + 3 ] && + 'C' === $xml[ $at + 4 ] && + 'T' === $xml[ $at + 5 ] && + 'Y' === $xml[ $at + 6 ] && + 'P' === $xml[ $at + 7 ] && + 'E' === $xml[ $at + 8 ] + ) { + $at += 9; + // Skip whitespace. + $at += strspn( $this->xml, " \t\f\r\n", $at ); + + if ( $doc_length <= $at ) { + $this->mark_incomplete_input( 'Unclosed DOCTYPE declaration.' ); + + return false; + } + + // @TODO: Expose the "name" value instead of skipping it like that + $at += $this->parse_name( $at ); + + // Skip whitespace. + $at += strspn( $this->xml, " \t\f\r\n", $at ); + + if ( $doc_length <= $at ) { + $this->mark_incomplete_input( 'Unclosed DOCTYPE declaration.' ); + return false; + } + + if ( $this->xml[ $at ] !== '>' ) { + $this->last_error = self::ERROR_SYNTAX; + _doing_it_wrong( + __METHOD__, + __( 'Unsupported DOCTYPE syntax. Only a simple is supported.' ), + 'WP_VERSION' + ); + return false; + } + + $closer_at = $at; + $this->parser_state = self::STATE_DOCTYPE_NODE; + $this->token_length = $closer_at + 1 - $this->token_starts_at; + $this->bytes_already_parsed = $closer_at + 1; + return true; + } + /* * Anything else here is either unsupported at this point or invalid * syntax. See the class-level @TODO annotations for more information. @@ -2471,6 +2523,22 @@ public function get_tag() { return null; } + /** + * Indicates if the currently matched tag is expected to be closed. + * Returns true for tag openers (
Last week, WordPress 6.8 was released.
HTML; - $reader = new WP_HTML_Entity_Reader( $html, 1 ); + $reader = new WP_HTML_Entity_Reader( new WP_HTML_Processor( $html ), 1 ); $entities = []; while ( $reader->next_entity() ) { $data = $reader->get_entity()->get_data(); diff --git a/packages/playground/data-liberation/tests/WPHTMLToBlocksTests.php b/packages/playground/data-liberation/tests/WPHTMLToBlocksTests.php index 66cae9670e..91359b9e47 100644 --- a/packages/playground/data-liberation/tests/WPHTMLToBlocksTests.php +++ b/packages/playground/data-liberation/tests/WPHTMLToBlocksTests.php @@ -16,7 +16,7 @@ public function test_metadata_extraction() {Last week, WordPress 6.8 was released. This release includes a new default theme, a new block editor experience, and a new block library. It also includes a new block editor experience, and a new block library.
HTML; - $converter = new WP_HTML_To_Blocks( $html ); + $converter = new WP_HTML_To_Blocks( new WP_HTML_Processor( $html ) ); $converter->convert( $html ); $metadata = $converter->get_all_metadata(); $expected_metadata = [ @@ -35,7 +35,7 @@ public function test_metadata_extraction() { * @dataProvider provider_test_conversion */ public function test_html_to_blocks_conversion( $html, $expected ) { - $converter = new WP_HTML_To_Blocks( $html ); + $converter = new WP_HTML_To_Blocks( new WP_HTML_Processor( $html ) ); $converter->convert( $html ); $blocks = $converter->get_block_markup(); @@ -136,7 +136,7 @@ public function provider_test_conversion() { public function test_html_to_blocks_excerpt() { $input = file_get_contents( __DIR__ . '/fixtures/html-to-blocks/excerpt.input.html' ); - $converter = new WP_HTML_To_Blocks( $input ); + $converter = new WP_HTML_To_Blocks( new WP_HTML_Processor( $input ) ); $converter->convert( $input ); $blocks = $converter->get_block_markup(); @@ -146,7 +146,29 @@ public function test_html_to_blocks_excerpt() { } $this->assertEquals( file_get_contents( $output_file ), $blocks ); - + } + + public function test_xhtml_to_blocks_conversion() { + $input = <<And some content
+ + +XML; + $converter = new WP_HTML_To_Blocks( WP_XML_Processor::create_from_string( $input ) ); + $converter->convert( $input ); + $blocks = $converter->get_block_markup(); + $expected = <<And some content
+HTML; + $this->assertEquals( + $this->normalize_markup( $expected ), + $this->normalize_markup( $blocks ) + ); } } diff --git a/packages/playground/data-liberation/tests/WPXMLProcessorTests.php b/packages/playground/data-liberation/tests/WPXMLProcessorTests.php index 2c3646dada..0e1dbf1ec4 100644 --- a/packages/playground/data-liberation/tests/WPXMLProcessorTests.php +++ b/packages/playground/data-liberation/tests/WPXMLProcessorTests.php @@ -1749,4 +1749,46 @@ public function test_pause_and_resume() { $this->assertEquals( 'Hello there', $resumed->get_modifiable_text(), 'Did not find the expected text.' ); } -} \ No newline at end of file + /** + * @ticket 61365 + * + * @covers WP_XML_Processor::next_token + */ + public function test_doctype_parsing() { + $processor = WP_XML_Processor::create_from_string( + '