From 7778714791198ec336db90526b4a93f2d3c6bab2 Mon Sep 17 00:00:00 2001 From: Francesco Bigiarini Date: Tue, 26 Nov 2024 14:49:02 +0100 Subject: [PATCH] First topological sorter draft --- .../playground/data-liberation/bootstrap.php | 2 + .../playground/data-liberation/plugin.php | 14 +- .../src/cli/WP_Import_Command.php | 173 ++++++++++++++++++ .../src/cli/WP_Import_Logger.php | 51 ++++++ .../src/import/WP_Entity_Importer.php | 56 +----- .../data-liberation/src/import/WP_Logger.php | 51 ++++++ .../src/import/WP_Stream_Importer.php | 34 +++- .../src/import/WP_Topological_Sorter.php | 103 +++++++++++ .../data-liberation/src/wxr/WP_WXR_Reader.php | 4 + 9 files changed, 413 insertions(+), 75 deletions(-) create mode 100644 packages/playground/data-liberation/src/cli/WP_Import_Command.php create mode 100644 packages/playground/data-liberation/src/cli/WP_Import_Logger.php create mode 100644 packages/playground/data-liberation/src/import/WP_Logger.php create mode 100644 packages/playground/data-liberation/src/import/WP_Topological_Sorter.php diff --git a/packages/playground/data-liberation/bootstrap.php b/packages/playground/data-liberation/bootstrap.php index eb6d45c096..4b0a206079 100644 --- a/packages/playground/data-liberation/bootstrap.php +++ b/packages/playground/data-liberation/bootstrap.php @@ -55,6 +55,8 @@ require_once __DIR__ . '/src/import/WP_Attachment_Downloader_Event.php'; require_once __DIR__ . '/src/import/WP_Stream_Importer.php'; require_once __DIR__ . '/src/import/WP_Markdown_Importer.php'; +require_once __DIR__ . '/src/import/WP_Logger.php'; +require_once __DIR__ . '/src/import/WP_Topological_Sorter.php'; require_once __DIR__ . '/src/utf8_decoder.php'; diff --git a/packages/playground/data-liberation/plugin.php b/packages/playground/data-liberation/plugin.php index 5f383b4a69..d49d692a85 100644 --- a/packages/playground/data-liberation/plugin.php +++ b/packages/playground/data-liberation/plugin.php @@ -27,20 +27,10 @@ add_action('init', function() { if ( defined( 'WP_CLI' ) && WP_CLI ) { - /** - * Import a WXR file. - * - * - * : The WXR file to import. - */ - $command = function ( $args, $assoc_args ) { - $file = $args[0]; - data_liberation_import( $file ); - }; + require_once __DIR__ . '/src/cli/WP_Import_Command.php'; // Register the WP-CLI import command. - // Example usage: wp data-liberation /path/to/file.xml - WP_CLI::add_command( 'data-liberation', $command ); + WP_CLI::add_command( 'data-liberation', WP_Import_Command::class ); } }); diff --git a/packages/playground/data-liberation/src/cli/WP_Import_Command.php b/packages/playground/data-liberation/src/cli/WP_Import_Command.php new file mode 100644 index 0000000000..fe49ced08e --- /dev/null +++ b/packages/playground/data-liberation/src/cli/WP_Import_Command.php @@ -0,0 +1,173 @@ + + * : The path to the WXR file. Either a file, a directory or a URL. + * + * [--dry-run] + * : Perform a dry run if set. + * + * ## EXAMPLES + * + * wp data-liberation import /path/to/file.xml + * + * @param array $args + * @param array $assoc_args + * @return void + */ + public function import( $args, $assoc_args ) { + $path = $args[0]; + $this->dry_run = WP_CLI\Utils\get_flag_value( $assoc_args, 'dry-run', false ); + $options = array( + 'logger' => new WP_Import_logger(), + ); + + if ( extension_loaded( 'pcntl' ) ) { + // Set the signal handler. + $this->register_handlers(); + } + + if ( filter_var( $path, FILTER_VALIDATE_URL ) ) { + // Import URL. + $this->import_wxr_url( $path, $options ); + } elseif ( is_dir( $path ) ) { + $count = 0; + // Get all the WXR files in the directory. + foreach ( wp_visit_file_tree( $path ) as $event ) { + foreach ( $event->files as $file ) { + if ( $file->isFile() && 'xml' === pathinfo( $file->getPathname(), PATHINFO_EXTENSION ) ) { + ++$count; + + // Import the WXR file. + $this->import_wxr_file( $file->getPathname(), $options ); + } + } + } + + if ( ! $count ) { + WP_CLI::error( WP_CLI::colorize( "No WXR files found in the {$path} directory" ) ); + } + } else { + if ( ! is_file( $path ) ) { + WP_CLI::error( WP_CLI::colorize( "File not found: %R{$path}%n" ) ); + } + + // Import the WXR file. + $this->import_wxr_file( $path, $options ); + } + } + + /** + * Import a WXR file. + * + * @param string $file_path The path to the WXR file. + * @return void + */ + private function import_wxr_file( $file_path, $options = array() ) { + $this->wxr_path = $file_path; + $this->importer = WP_Stream_Importer::create_for_wxr_file( $file_path, $options ); + + $this->import_wxr(); + } + + /** + * Import a WXR file from a URL. + * + * @param string $url The URL to the WXR file. + * @return void + */ + private function import_wxr_url( $url, $options = array() ) { + $this->wxr_path = $url; + $this->importer = WP_Stream_Importer::create_for_wxr_url( $url, $options ); + + $this->import_wxr(); + } + + /** + * Import the WXR file. + */ + private function import_wxr() { + if ( ! $this->importer ) { + WP_CLI::error( 'Could not create importer' ); + } + + WP_CLI::line( "Importing {$this->wxr_path}" ); + + if ( $this->dry_run ) { + WP_CLI::line( 'Dry run enabled.' ); + } else { + while ( $this->importer->next_step() ) { + $current_stage = $this->importer->get_current_stage(); + // WP_CLI::line( "Stage {$current_stage}" ); + } + } + + WP_CLI::success( 'Import finished' ); + } + + /** + * Callback function registered to `pcntl_signal` to handle signals. + * + * @param int $signal The signal number. + * @return void + */ + protected function signal_handler( $signal ) { + switch ( $signal ) { + case SIGINT: + WP_CLI::line( 'Received SIGINT signal' ); + exit( 0 ); + + case SIGTERM: + WP_CLI::line( 'Received SIGTERM signal' ); + exit( 0 ); + } + } + + /** + * Register signal handlers for the command. + * + * @return void + */ + private function register_handlers() { + // Handle the Ctrl + C signal to terminate the program. + pcntl_signal( SIGINT, array( $this, 'signal_handler' ) ); + + // Handle the `kill` command to terminate the program. + pcntl_signal( SIGTERM, array( $this, 'signal_handler' ) ); + } +} diff --git a/packages/playground/data-liberation/src/cli/WP_Import_Logger.php b/packages/playground/data-liberation/src/cli/WP_Import_Logger.php new file mode 100644 index 0000000000..103ab3d9e2 --- /dev/null +++ b/packages/playground/data-liberation/src/cli/WP_Import_Logger.php @@ -0,0 +1,51 @@ +mapping['term_id'] = array(); $this->requires_remapping = $empty_types; $this->exists = $empty_types; - $this->logger = new Logger(); + $this->logger = isset( $options['logger'] ) ? $options['logger'] : new WP_Logger(); $this->options = wp_parse_args( $options, @@ -1191,57 +1191,3 @@ public static function sort_comments_by_id( $a, $b ) { return $a['comment_id'] - $b['comment_id']; } } - -/** - * @TODO how to treat this? Should this class even exist? - * how does WordPress handle different levels? It - * seems useful for usage in wp-cli, Blueprints, - * and other non-web environments. - */ -// phpcs:ignore Generic.Files.OneObjectStructurePerFile.MultipleFound -class Logger { - /** - * Log a debug message. - * - * @param string $message Message to log - */ - public function debug( $message ) { - // echo( '[DEBUG] ' . $message ); - } - - /** - * Log an info message. - * - * @param string $message Message to log - */ - public function info( $message ) { - // echo( '[INFO] ' . $message ); - } - - /** - * Log a warning message. - * - * @param string $message Message to log - */ - public function warning( $message ) { - echo( '[WARNING] ' . $message ); - } - - /** - * Log an error message. - * - * @param string $message Message to log - */ - public function error( $message ) { - echo( '[ERROR] ' . $message ); - } - - /** - * Log a notice message. - * - * @param string $message Message to log - */ - public function notice( $message ) { - // echo( '[NOTICE] ' . $message ); - } -} diff --git a/packages/playground/data-liberation/src/import/WP_Logger.php b/packages/playground/data-liberation/src/import/WP_Logger.php new file mode 100644 index 0000000000..87605336fe --- /dev/null +++ b/packages/playground/data-liberation/src/import/WP_Logger.php @@ -0,0 +1,51 @@ +entity_iterator ) { - $this->entity_iterator = $this->create_entity_iterator(); - $this->downloader = new WP_Attachment_Downloader( $this->options ); + $this->entity_iterator = $this->create_entity_iterator(); + $this->topological_sorter = new WP_Topological_Sorter(); + $this->downloader = new WP_Attachment_Downloader( $this->options ); } $this->frontloading_advance_reentrancy_cursor(); @@ -253,11 +259,15 @@ private function next_frontloading_step() { if ( ! empty( $this->active_downloads ) ) { _doing_it_wrong( __METHOD__, 'Frontloading queue is not empty.', '1.0' ); } - $this->stage = self::STAGE_IMPORT_ENTITIES; - $this->downloader = null; - $this->active_downloads = array(); - $this->entity_iterator = null; - $this->resume_at_entity = null; + + print_r( $this->topological_sorter->mapping ); + + $this->stage = self::STAGE_IMPORT_ENTITIES; + $this->topological_sorter = null; + $this->downloader = null; + $this->active_downloads = array(); + $this->entity_iterator = null; + $this->resume_at_entity = null; return false; } @@ -288,14 +298,22 @@ private function next_frontloading_step() { $cursor = $this->entity_iterator->get_reentrancy_cursor(); $this->active_downloads[ $cursor ] = array(); - $data = $entity->get_data(); + $data = $entity->get_data(); + $upstream = $this->entity_iterator->get_upstream(); + switch ( $entity->get_type() ) { + case 'category': + case 'term': + $this->topological_sorter->map_term( $upstream, $data ); + break; case 'site_option': if ( $data['option_name'] === 'home' ) { $this->source_site_url = $data['option_value']; } break; case 'post': + $this->topological_sorter->map_post( $upstream, $data ); + if ( isset( $data['post_type'] ) && $data['post_type'] === 'attachment' ) { $this->enqueue_attachment_download( $data['attachment_url'], null ); } elseif ( isset( $data['post_content'] ) ) { diff --git a/packages/playground/data-liberation/src/import/WP_Topological_Sorter.php b/packages/playground/data-liberation/src/import/WP_Topological_Sorter.php new file mode 100644 index 0000000000..291421aae5 --- /dev/null +++ b/packages/playground/data-liberation/src/import/WP_Topological_Sorter.php @@ -0,0 +1,103 @@ +terms[ $data['slug'] ] = array( + 'upstream' => $upstream, + 'visited' => false, + ); + } + + public function map_post( $upstream, $data ) { + if ( empty( $data ) ) { + return false; + } + + // No parent, no need to sort. + if ( ! isset( $data['post_type'] ) ) { + return false; + } + + if ( 'post' === $data['post_type'] || 'page' === $data['post_type'] ) { + if ( ! $data['post_id'] ) { + $this->last_post_id = $this->orphan_post_counter; + --$this->orphan_post_counter; + } + + $this->unsorted_posts[ $data['post_id'] ] = array( + 'upstream' => $upstream, + 'parent' => $data['post_parent'], + 'visited' => false, + ); + } + } + + /** + * Sort posts topologically. + * + * Children posts should not be processed before their parent has been processed. + * This method sorts the posts in the order they should be processed. + * + * Sorted posts will be stored as attachments and posts/pages separately. + */ + public function sort_posts_topologically() { + foreach ( $this->unsorted_posts as $id => $post ) { + $this->topological_sort( $id, $post ); + } + + // Empty the unsorted posts + $this->unsorted_posts = array(); + } + + /** + * Recursive topological sorting. + * + * @param int $id The id of the post to sort. + * @param array $post The post to sort. + * + * @todo Check for circular dependencies. + */ + private function topological_sort( $id, $post ) { + if ( isset( $this->posts[ $id ]['visited'] ) ) { + return; + } + + $this->unsorted_posts[ $id ]['visited'] = true; + + if ( isset( $this->posts[ $post['parent'] ] ) ) { + $this->topological_sort( $post['parent'], $this->unsorted_posts[ $post['parent'] ] ); + } + + $this->index[] = $post['upstream']; + } +} diff --git a/packages/playground/data-liberation/src/wxr/WP_WXR_Reader.php b/packages/playground/data-liberation/src/wxr/WP_WXR_Reader.php index c37d952714..3b2988457e 100644 --- a/packages/playground/data-liberation/src/wxr/WP_WXR_Reader.php +++ b/packages/playground/data-liberation/src/wxr/WP_WXR_Reader.php @@ -389,6 +389,10 @@ protected function __construct( WP_XML_Processor $xml ) { $this->xml = $xml; } + public function get_upstream() { + return $this->entity_byte_offset; + } + public function get_reentrancy_cursor() { /** * @TODO: Instead of adjusting the XML cursor internals, adjust the get_reentrancy_cursor()