From 869f6bd04c7060143bb6300a1d5ab2eefdea625c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Tue, 17 Dec 2024 13:06:21 +0100 Subject: [PATCH] [Data Liberation] Move Markdown importer to a separate package (#2093) Moves the Markdown importer to a `data-liberation-markdown` package so that it can be shipped as a separate `.phar` file and downloaded only when needed. ## Testing instructions This only moves code around. To test, confirm the CI PHP unit tests keep working. A part of: * https://github.com/WordPress/wordpress-playground/pull/2080 * #1894 --- .../src/WP_Markdown_Importer.php | 91 ++++++ .../src}/WP_Markdown_To_Blocks.php | 53 ++-- .../src/bootstrap.php | 6 + .../playground/data-liberation/bootstrap.php | 14 - .../WP_Markdown_Directory_Tree_Reader.php | 283 ------------------ .../WP_Markdown_HTML_Processor.php | 19 -- 6 files changed, 114 insertions(+), 352 deletions(-) create mode 100644 packages/playground/data-liberation-markdown/src/WP_Markdown_Importer.php rename packages/playground/{data-liberation/src/markdown-api => data-liberation-markdown/src}/WP_Markdown_To_Blocks.php (90%) create mode 100644 packages/playground/data-liberation-markdown/src/bootstrap.php delete mode 100644 packages/playground/data-liberation/src/markdown-api/WP_Markdown_Directory_Tree_Reader.php delete mode 100644 packages/playground/data-liberation/src/markdown-api/WP_Markdown_HTML_Processor.php diff --git a/packages/playground/data-liberation-markdown/src/WP_Markdown_Importer.php b/packages/playground/data-liberation-markdown/src/WP_Markdown_Importer.php new file mode 100644 index 0000000000..ceda2728c9 --- /dev/null +++ b/packages/playground/data-liberation-markdown/src/WP_Markdown_Importer.php @@ -0,0 +1,91 @@ + $markdown_directory, + 'first_post_id' => 1, + 'allowed_extensions' => array( 'md' ), + 'index_file_patterns' => array( '#^index\.md$#' ), + 'markup_converter_factory' => function( $content ) { + return new WP_Markdown_To_Blocks( $content ); + }, + ) + ); + }, + $options, + $cursor + ); + } + + protected static function parse_options( $options ) { + if ( ! isset( $options['source_site_url'] ) ) { + _doing_it_wrong( __METHOD__, 'The source_site_url option is required.', '__WP_VERSION__' ); + return false; + } + $options['default_source_site_url'] = $options['source_site_url']; + + if ( ! isset( $options['local_markdown_assets_root'] ) ) { + _doing_it_wrong( __METHOD__, 'The markdown_assets_root option is required.', '__WP_VERSION__' ); + return false; + } + if ( ! is_dir( $options['local_markdown_assets_root'] ) ) { + _doing_it_wrong( __METHOD__, 'The markdown_assets_root option must point to a directory.', '__WP_VERSION__' ); + return false; + } + $options['local_markdown_assets_root'] = rtrim( $options['local_markdown_assets_root'], '/' ); + + return parent::parse_options( $options ); + } + + protected function rewrite_attachment_url( string $raw_url, $context_path = null ) { + /** + * For Docusaurus docs, URLs starting with `@site` are referring + * to local files. Let's convert them to file:// URLs. + */ + if ( + isset( $this->options['local_markdown_assets_url_prefix'] ) && + str_starts_with( $raw_url, $this->options['local_markdown_assets_url_prefix'] ) + ) { + // @TODO: Source the file from the current input stream if we can. + // This would allow stream-importing zipped Markdown and WXR directory + // structures. + // Maybe for v1 we could just support importing them from ZIP files + // that are already downloaded and available in a local directory just + // to avoid additional data transfer and the hurdle with implementing + // multiple range requests. + $relative_asset_path = substr( $raw_url, strlen( $this->options['local_markdown_assets_url_prefix'] ) ); + $relative_asset_path = '/' . ltrim( $relative_asset_path, '/' ); + $raw_url = ( + 'file://' . + $this->options['local_markdown_assets_root'] . + $relative_asset_path + ); + } + + return parent::rewrite_attachment_url( $raw_url, $context_path ); + } + + /** + * When processing Markdown, we'll download all the images + * referenced in the image tags. + * + * @TODO: Actually, should we? + * @TODO: How can we process the videos? + * @TODO: What other asset types are there? + */ + protected function url_processor_matched_asset_url( WP_Block_Markup_Url_Processor $p ) { + return ( + $p->get_tag() === 'IMG' && + $p->get_inspected_attribute_name() === 'src' + ); + } +} diff --git a/packages/playground/data-liberation/src/markdown-api/WP_Markdown_To_Blocks.php b/packages/playground/data-liberation-markdown/src/WP_Markdown_To_Blocks.php similarity index 90% rename from packages/playground/data-liberation/src/markdown-api/WP_Markdown_To_Blocks.php rename to packages/playground/data-liberation-markdown/src/WP_Markdown_To_Blocks.php index 424ebd5b8d..2316dcee61 100644 --- a/packages/playground/data-liberation/src/markdown-api/WP_Markdown_To_Blocks.php +++ b/packages/playground/data-liberation-markdown/src/WP_Markdown_To_Blocks.php @@ -21,8 +21,7 @@ use League\CommonMark\Extension\Table\TableRow; use League\CommonMark\Extension\Table\TableSection; - -class WP_Markdown_To_Blocks { +class WP_Markdown_To_Blocks implements WP_Block_Markup_Converter { const STATE_READY = 'STATE_READY'; const STATE_COMPLETE = 'STATE_COMPLETE'; @@ -40,19 +39,26 @@ public function __construct( $markdown ) { $this->markdown = $markdown; } - public function parse() { + public function convert() { if ( self::STATE_READY !== $this->state ) { return false; } $this->convert_markdown_to_blocks(); - $this->block_markup = self::convert_blocks_to_markup( $this->parsed_blocks ); + $this->block_markup = WP_Import_Utils::convert_blocks_to_markup( $this->parsed_blocks ); return true; } - public function get_frontmatter() { + public function get_all_metadata() { return $this->frontmatter; } + public function get_meta_value( $key ) { + if ( ! array_key_exists( $key, $this->frontmatter ) ) { + return null; + } + return $this->frontmatter[ $key ][0]; + } + public function get_block_markup() { return $this->block_markup; } @@ -74,7 +80,11 @@ private function convert_markdown_to_blocks() { $parser = new MarkdownParser( $environment ); $document = $parser->parse( $this->markdown ); - $this->frontmatter = $document->data; + $this->frontmatter = []; + foreach( $document->data as $key => $value ) { + // Use an array as a value to comply with the WP_Block_Markup_Converter interface. + $this->frontmatter[ $key ] = [$value]; + } $walker = $document->walker(); while ( true ) { @@ -163,7 +173,7 @@ private function convert_markdown_to_blocks() { 'content' => '
' . trim( str_replace( "\n", '
', htmlspecialchars( $node->getLiteral() ) ) ) . '
', ) ); - if ( $node->getInfo() ) { + if ( method_exists( $node, 'getInfo' ) && $node->getInfo() ) { $this->current_block->attrs['language'] = preg_replace( '/[ \t\r\n\f].*/', '', $node->getInfo() ); } break; @@ -339,35 +349,6 @@ private function convert_markdown_to_blocks() { $this->parsed_blocks = $this->root_block->inner_blocks; } - private static function convert_blocks_to_markup( $blocks ) { - $block_markup = ''; - - foreach ( $blocks as $block ) { - // Start of block comment - $comment = ''; - $p = new WP_HTML_Tag_Processor( $comment ); - $p->next_token(); - $attrs = $block->attrs; - $content = $block->attrs['content'] ?? ''; - unset( $attrs['content'] ); - $encoded_attrs = json_encode( $attrs ); - if ( $encoded_attrs === '[]' ) { - $encoded_attrs = ''; - } - $p->set_modifiable_text( " wp:{$block->block_name} " . $encoded_attrs . ' ' ); - $open_comment = $p->get_updated_html(); - - $block_markup .= $open_comment . "\n"; - $block_markup .= $content . "\n"; - $block_markup .= self::convert_blocks_to_markup( $block->inner_blocks ); - - // End of block comment - $block_markup .= "\n"; - } - - return $block_markup; - } - private function append_content( $content ) { if ( ! isset( $this->current_block->attrs['content'] ) ) { $this->current_block->attrs['content'] = ''; diff --git a/packages/playground/data-liberation-markdown/src/bootstrap.php b/packages/playground/data-liberation-markdown/src/bootstrap.php new file mode 100644 index 0000000000..f815de02b7 --- /dev/null +++ b/packages/playground/data-liberation-markdown/src/bootstrap.php @@ -0,0 +1,6 @@ +file_visitor = new WP_File_Visitor( realpath( $root_dir ) ); - $this->next_post_id = $first_post_id; - } - - public function next_entity() { - while ( true ) { - if ( null !== $this->pending_directory_index ) { - $dir = $this->file_visitor->get_event()->dir; - $parent_id = $this->parent_ids[ $this->file_visitor->get_current_depth() - 1 ] ?? null; - - if ( false === $this->pending_directory_index ) { - // No directory index candidate – let's create a fake page - // just to have something in the page tree. - $markdown = ''; - $source_path = $dir->getPathName(); - } else { - $markdown = file_get_contents( $this->pending_directory_index->getRealPath() ); - $source_path = $this->pending_directory_index->getRealPath(); - } - $post_id = $this->next_post_id; - ++$this->next_post_id; - ++$this->entities_read_so_far; - $this->entity = $this->markdown_to_post_entity( - array( - 'markdown' => $markdown, - 'source_path' => $source_path, - 'post_id' => $post_id, - 'parent_id' => $parent_id, - 'title_fallback' => $this->slug_to_title( $dir->getFileName() ), - ) - ); - $this->pending_directory_index = null; - $depth = $this->file_visitor->get_current_depth(); - $this->parent_ids[ $depth ] = $post_id; - return true; - } - - while ( count( $this->pending_files ) ) { - $parent_id = $this->parent_ids[ $this->file_visitor->get_current_depth() ] ?? null; - $file = array_shift( $this->pending_files ); - $this->entity = $this->markdown_to_post_entity( - array( - 'markdown' => file_get_contents( $file->getRealPath() ), - 'source_path' => $file->getRealPath(), - 'post_id' => $this->next_post_id, - 'parent_id' => $parent_id, - 'title_fallback' => $this->slug_to_title( $file->getFileName() ), - ) - ); - ++$this->next_post_id; - ++$this->entities_read_so_far; - return true; - } - - if ( false === $this->next_file() ) { - break; - } - } - $this->is_finished = true; - return false; - } - - public function get_entity(): WP_Imported_Entity { - return $this->entity; - } - - protected function markdown_to_post_entity( $options ) { - $converter = new WP_Markdown_To_Blocks( $options['markdown'] ); - $converter->parse(); - $block_markup = $converter->get_block_markup(); - $frontmatter = $converter->get_frontmatter(); - - $removed_title = $this->remove_first_h1_block_from_block_markup( $block_markup ); - if ( false !== $removed_title ) { - $block_markup = $removed_title['remaining_html']; - } - - $post_title = ''; - if ( ! $post_title && ! empty( $removed_title['content'] ) ) { - $post_title = $removed_title['content']; - } - if ( ! $post_title && ! empty( $frontmatter['title'] ) ) { - // In WordPress Playground docs, the frontmatter title - // is actually a worse candidate than the first H1 block - // - // There will, inevitably, be 10,000 ways people will want - // to use this importer with different projects. Let's just - // enable plugins to customize the title resolution. - $post_title = $frontmatter['title']; - } - if ( ! $post_title ) { - $post_title = $options['title_fallback']; - } - - $entity_data = array( - 'post_id' => $options['post_id'], - 'post_type' => 'page', - 'guid' => $options['source_path'], - 'post_title' => $post_title, - 'post_content' => $block_markup, - 'post_excerpt' => $frontmatter['description'] ?? '', - 'post_status' => 'publish', - ); - - /** - * Technically `source_path` isn't a part of the WordPress post object, - * but we need it to resolve relative URLs in the imported content. - * - * This path is relative to the root directory traversed by this class. - */ - if ( ! empty( $options['source_path'] ) ) { - $source_path = $options['source_path']; - $root_dir = $this->file_visitor->get_root_dir(); - if ( str_starts_with( $source_path, $root_dir ) ) { - $source_path = substr( $source_path, strlen( $root_dir ) ); - } - $source_path = ltrim( $source_path, '/' ); - $entity_data['source_path'] = $source_path; - } - - if ( ! empty( $frontmatter['slug'] ) ) { - $slug = $frontmatter['slug']; - $last_segment = substr( $slug, strrpos( $slug, '/' ) + 1 ); - $entity_data['post_name'] = $last_segment; - } - - if ( isset( $frontmatter['sidebar_position'] ) ) { - $entity_data['post_order'] = $frontmatter['sidebar_position']; - } - - if ( $options['parent_id'] ) { - $entity_data['post_parent'] = $options['parent_id']; - } - return new WP_Imported_Entity( 'post', $entity_data ); - } - - private function next_file() { - $this->pending_files = array(); - $this->entity = null; - while ( $this->file_visitor->next() ) { - $event = $this->file_visitor->get_event(); - - $is_root = $event->dir->getPathName() === $this->file_visitor->get_root_dir(); - if ( $is_root ) { - continue; - } - if ( $event->is_exiting() ) { - // Clean up stale IDs to save some memory when processing - // large directory trees. - unset( $this->parent_ids[ $event->dir->getRealPath() ] ); - continue; - } - - $this->pending_files = $this->choose_relevant_files( $event->files ); - $directory_index_idx = $this->choose_directory_index( $this->pending_files ); - if ( -1 !== $directory_index_idx ) { - $this->pending_directory_index = $this->pending_files[ $directory_index_idx ]; - unset( $this->pending_files[ $directory_index_idx ] ); - } else { - $this->pending_directory_index = false; - } - return true; - } - return false; - } - - protected function choose_directory_index( $files ) { - foreach ( $files as $idx => $file ) { - if ( $this->looks_like_directory_index( $file ) ) { - return $idx; - } - } - return -1; - } - - protected function looks_like_directory_index( $file ) { - return str_contains( $file->getFilename(), 'index' ); - } - - protected function choose_relevant_files( $files ) { - return array_filter( $files, array( $this, 'is_valid_file' ) ); - } - - protected function is_valid_file( $file ) { - return 'md' === $file->getExtension(); - } - - protected function slug_to_title( $filename ) { - $name = pathinfo( $filename, PATHINFO_FILENAME ); - $name = preg_replace( '/^\d+/', '', $name ); - $name = str_replace( - array( '-', '_' ), - ' ', - $name - ); - $name = ucwords( $name ); - return $name; - } - - private function remove_first_h1_block_from_block_markup( $html ) { - $p = WP_Markdown_HTML_Processor::create_fragment( $html ); - if ( false === $p->next_tag() ) { - return false; - } - if ( $p->get_tag() !== 'H1' ) { - return false; - } - $depth = $p->get_current_depth(); - $title = ''; - do { - if ( false === $p->next_token() ) { - break; - } - if ( $p->get_token_type() === '#text' ) { - $title .= $p->get_modifiable_text() . ' '; - } - } while ( $p->get_current_depth() > $depth ); - - if ( ! $title ) { - return false; - } - - // Move past the closing comment - $p->next_token(); - if ( $p->get_token_type() === '#text' ) { - $p->next_token(); - } - if ( $p->get_token_type() !== '#comment' ) { - return false; - } - - return array( - 'content' => trim( $title ), - 'remaining_html' => substr( - $html, - $p->get_string_index_after_current_token() - ), - ); - } - - public function current(): object { - return $this->get_entity(); - } - - public function next(): void { - $this->next_entity(); - } - - public function key(): int { - return $this->entities_read_so_far - 1; - } - - public function valid(): bool { - return ! $this->is_finished; - } - - public function rewind(): void { - // noop - } -} diff --git a/packages/playground/data-liberation/src/markdown-api/WP_Markdown_HTML_Processor.php b/packages/playground/data-liberation/src/markdown-api/WP_Markdown_HTML_Processor.php deleted file mode 100644 index afb6ec9c2a..0000000000 --- a/packages/playground/data-liberation/src/markdown-api/WP_Markdown_HTML_Processor.php +++ /dev/null @@ -1,19 +0,0 @@ -set_bookmark( $name ); - $bookmark = $this->bookmarks[ '_' . $name ]; - $this->release_bookmark( $name ); - return $bookmark->start + $bookmark->length; - } -}