From e95618eea67e64f43c552cd7bb0091d56049a58e Mon Sep 17 00:00:00 2001 From: Francesco Bigiarini Date: Wed, 4 Dec 2024 14:53:10 +0100 Subject: [PATCH] Add new topo sorting query --- .../src/import/WP_Topological_Sorter.php | 286 +++++++++++++----- 1 file changed, 207 insertions(+), 79 deletions(-) diff --git a/packages/playground/data-liberation/src/import/WP_Topological_Sorter.php b/packages/playground/data-liberation/src/import/WP_Topological_Sorter.php index 60ebe10d3c..8f48bff58c 100644 --- a/packages/playground/data-liberation/src/import/WP_Topological_Sorter.php +++ b/packages/playground/data-liberation/src/import/WP_Topological_Sorter.php @@ -9,8 +9,24 @@ */ class WP_Topological_Sorter { - public $posts = array(); - public $categories = array(); + /** + * The base name of the table. + */ + const TABLE_NAME = 'data_liberation_index'; + + /** + * The option name for the database version. + */ + const OPTION_NAME = 'data_liberation_db_version'; + + /** + * The current database version, to be used with dbDelta. + */ + const DB_VERSION = 1; + + // Element types. + const ELEMENT_TYPE_POST = 1; + const ELEMENT_TYPE_CATEGORY = 2; /** * Variable for keeping counts of orphaned posts/attachments, it'll also be assigned as temporarly post ID. @@ -34,27 +50,135 @@ class WP_Topological_Sorter { */ protected $sorted = false; + public static function get_table_name() { + global $wpdb; + + // Default is wp_{TABLE_NAME} + return $wpdb->prefix . self::TABLE_NAME; + } + + /** + * Run by register_activation_hook. + */ + public static function activate() { + global $wpdb; + + // See wp_get_db_schema + $max_index_length = 191; + $table_name = self::get_table_name(); + + // Create the table if it doesn't exist. + // @TODO: remove this custom SQLite declaration after first phase of unit tests is done. + if ( self::is_sqlite() ) { + $sql = $wpdb->prepare( + 'CREATE TABLE IF NOT EXISTS %i ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + element_type INTEGER NOT NULL default %d, + element_id INTEGER NOT NULL, + parent_id INTEGER, + parent TEXT NOT NULL default "", + byte_offset INTEGER NOT NULL, + hierarchy_level INTEGER DEFAULT NULL + ); + + CREATE UNIQUE INDEX IF NOT EXISTS idx_element_id ON %i (element_id); + CREATE INDEX IF NOT EXISTS idx_element_parent ON %i (parent); + CREATE INDEX IF NOT EXISTS idx_byte_offset ON %i (byte_offset);', + $table_name, + self::ELEMENT_TYPE_POST, + $table_name, + $table_name, + $table_name + ); + } else { + // MySQL, MariaDB. + $sql = $wpdb->prepare( + 'CREATE TABLE IF NOT EXISTS %i ( + id bigint(20) unsigned NOT NULL AUTO_INCREMENT, + element_type tinyint(1) NOT NULL default %d, + element_id unsigned bigint(20) NOT NULL, + parent_id unsigned bigint(20) DEFAULT NULL, + parent varchar(200) NOT NULL default "", + byte_offset bigint(20) unsigned NOT NULL, + hierarchy_level INT DEFAULT NULL, + PRIMARY KEY (id), + UNIQUE KEY element_id (element_id(%d)) + KEY element_parent (element_parent(%d)) + KEY byte_offset (byte_offset(%d)) + ) ' . $wpdb->get_charset_collate(), + self::get_table_name(), + self::ELEMENT_TYPE_POST, + $max_index_length, + $max_index_length, + $max_index_length + ); + } + + require_once ABSPATH . 'wp-admin/includes/upgrade.php'; + dbDelta( $sql ); + + update_option( self::OPTION_NAME, self::DB_VERSION ); + } + + public static function is_sqlite() { + return defined( 'DB_ENGINE' ) || 'sqlite' === DB_ENGINE; + } + + /** + * Run in the 'plugins_loaded' action. + */ + public static function load() { + if ( self::DB_VERSION !== (int) get_site_option( self::OPTION_NAME ) ) { + // Used to update the database with dbDelta, if needed in the future. + self::activate(); + } + } + + /** + * Run by register_deactivation_hook. + */ + public static function deactivate() { + global $wpdb; + $table_name = self::get_table_name(); + + // Drop the table. + $wpdb->query( $wpdb->prepare( 'DROP TABLE IF EXISTS %s', $table_name ) ); + + // Delete the option. + delete_option( self::OPTION_NAME ); + } + + /** + * Run by register_uninstall_hook. + */ public function reset() { - $this->posts = array(); - $this->categories = array(); - $this->category_index = array(); $this->orphan_post_counter = 0; $this->last_post_id = 0; $this->sorted = false; } public function map_category( $byte_offset, $data ) { + global $wpdb; + if ( empty( $data ) ) { return false; } - $this->categories[ $data['slug'] ] = array( - array_key_exists( 'parent', $data ) ? $data['parent'] : '', - $byte_offset, + $wpdb->insert( + self::get_table_name(), + array( + 'element_type' => self::ELEMENT_TYPE_CATEGORY, + 'element_id' => $data['term_id'], + 'parent_id' => $data['parent_id'], + 'parent' => array_key_exists( 'parent', $data ) ? $data['parent'] : '', + 'byte_offset' => $byte_offset, + ) ); } public function map_post( $byte_offset, $data ) { + global $wpdb; + if ( empty( $data ) ) { return false; } @@ -70,11 +194,15 @@ public function map_post( $byte_offset, $data ) { --$this->orphan_post_counter; } - // This is an array saved as: [ parent, byte_offset ], to save - // space and not using an associative one. - $this->posts[ $data['post_id'] ] = array( - $data['post_parent'], - $byte_offset, + $wpdb->insert( + self::get_table_name(), + array( + 'element_type' => self::ELEMENT_TYPE_POST, + 'element_id' => $data['post_id'], + 'parent_id' => $data['post_parent'], + 'parent' => '', + 'byte_offset' => $byte_offset, + ) ); } @@ -89,25 +217,20 @@ public function map_post( $byte_offset, $data ) { * @return int|bool The byte offset of the post, or false if the post is not found. */ public function get_post_byte_offset( $id ) { + global $wpdb; + if ( ! $this->sorted ) { return false; } - if ( isset( $this->posts[ $id ] ) ) { - $ret = $this->posts[ $id ]; - - // Remove the element from the array. - unset( $this->posts[ $id ] ); - - if ( 0 === count( $this->categories ) && 0 === count( $this->posts ) ) { - // All posts have been processed. - $this->reset(); - } - - return $ret; - } - - return false; + return $wpdb->get_var( + $wpdb->prepare( + 'SELECT byte_offset FROM %s WHERE element_id = %d AND element_type = %d', + self::get_table_name(), + $id, + self::ELEMENT_TYPE_POST + ) + ); } /** @@ -118,25 +241,20 @@ public function get_post_byte_offset( $id ) { * @return int|bool The byte offset of the category, or false if the category is not found. */ public function get_category_byte_offset( $slug ) { + global $wpdb; + if ( ! $this->sorted ) { return false; } - if ( isset( $this->categories[ $slug ] ) ) { - $ret = $this->categories[ $slug ]; - - // Remove the element from the array. - unset( $this->categories[ $slug ] ); - - if ( 0 === count( $this->categories ) && 0 === count( $this->posts ) ) { - // All categories have been processed. - $this->reset(); - } - - return $ret; - } - - return false; + return $wpdb->get_var( + $wpdb->prepare( + 'SELECT byte_offset FROM %s WHERE element_id = %d AND element_type = %d', + self::get_table_name(), + $id, + self::ELEMENT_TYPE_CATEGORY + ) + ); } public function is_sorted() { @@ -150,30 +268,30 @@ public function is_sorted() { * This method sorts the elements in the order they should be processed. */ public function sort_topologically( $free_space = true ) { - foreach ( $this->categories as $slug => $category ) { - $this->topological_category_sort( $slug, $category ); - } + /*foreach ( $this->categories as $slug => $category ) { + // $this->topological_category_sort( $slug, $category ); + }*/ - $this->sort_elements( $this->posts ); - $this->sort_elements( $this->categories ); + $this->sort_elements( self::ELEMENT_TYPE_POST ); + $this->sort_elements( self::ELEMENT_TYPE_CATEGORY ); // Free some space. if ( $free_space ) { - /** + /* * @TODO: all the elements that have not been moved can be flushed away. - */ + * foreach ( $this->posts as $id => $element ) { // Save only the byte offset. $this->posts[ $id ] = $element[1]; } - /** + /* * @TODO: all the elements that have not been moved can be flushed away. - */ + * foreach ( $this->categories as $slug => $element ) { // Save only the byte offset. $this->categories[ $slug ] = $element[1]; - } + }*/ } $this->sorted = true; @@ -182,34 +300,44 @@ public function sort_topologically( $free_space = true ) { /** * Recursive sort elements. Posts with parents will be moved to the correct position. * + * @param int $type The type of element to sort. * @return true */ - private function sort_elements( &$elements ) { - $sort_callback = function ( $a, $b ) use ( &$elements ) { - $parent_a = $elements[ $a ][0]; - $parent_b = $elements[ $b ][0]; - - if ( ! $parent_a && ! $parent_b ) { - // No parents. - return 0; - } elseif ( $a === $parent_b ) { - // A is the parent of B. - return -1; - } elseif ( $b === $parent_a ) { - // B is the parent of A. - return 1; - } - - return 0; - }; - - /** - * @TODO: PHP uses quicksort: https://github.com/php/php-src/blob/master/Zend/zend_sort.c - * WordPress export posts by ID and so are likely to be already in order. - * Quicksort performs badly on already sorted arrays, O(n^2) is the worst case. - * Let's consider using a different sorting algorithm. - */ - uksort( $elements, $sort_callback ); + private function sort_elements( $type ) { + global $wpdb; + $table_name = self::get_table_name(); + + return $wpdb->query( + $wpdb->prepare( + // Perform a topological sort CTE. + 'WITH RECURSIVE hierarchy_cte AS ( + -- Select all root nodes (where parent_id is NULL) + SELECT id, parent_id, 1 AS hierarchy_level + FROM %i + WHERE parent_id IS NULL AND element_type = %d + + UNION ALL + + -- Recursive member: Join the CTE with the table to find children + SELECT yt.id, yt.parent_id, hc.hierarchy_level + 1 + FROM %i yt + WHERE element_type = %d + INNER JOIN hierarchy_cte hc ON yt.parent_id = hc.id + ) + + -- Update the hierarchy_level based on the computed hierarchy_level + UPDATE %i + SET hierarchy_level = hc.hierarchy_level + FROM hierarchy_cte hc + WHERE %i.id = hc.id;', + $table_name, + $type, + $table_name, + $type, + $table_name, + $table_name + ) + ); } /**