Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Data Liberation] Add EPub to Blocks converter #2097

Open
wants to merge 3 commits into
base: refactor-readers
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions packages/playground/data-liberation/bootstrap.php
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@

require_once __DIR__ . '/src/entity-readers/WP_Entity_Reader.php';
require_once __DIR__ . '/src/entity-readers/WP_HTML_Entity_Reader.php';
require_once __DIR__ . '/src/entity-readers/WP_EPub_Entity_Reader.php';
require_once __DIR__ . '/src/entity-readers/WP_WXR_Entity_Reader.php';
require_once __DIR__ . '/src/entity-readers/WP_Directory_Tree_Entity_Reader.php';

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,36 +28,43 @@ class WP_HTML_To_Blocks implements WP_Block_Markup_Converter {

private $state = self::STATE_READY;
private $block_stack = array();
private $html;
private $markup_processor;
private $ignore_text = false;
private $in_ephemeral_paragraph = false;
private $block_markup = '';
private $metadata = array();
private $last_error = null;

public function __construct( $html ) {
$this->html = new \WP_HTML_Processor( $html );
public function __construct( $markup_processor ) {
$this->markup_processor = $markup_processor;
}

public function convert() {
if ( self::STATE_READY !== $this->state ) {
return false;
}

while ( $this->html->next_token() ) {
switch ( $this->html->get_token_type() ) {
while ( $this->markup_processor->next_token() ) {
switch ( $this->markup_processor->get_token_type() ) {
case '#text':
if ( $this->ignore_text ) {
break;
}
$this->append_rich_text( htmlspecialchars( $this->html->get_modifiable_text() ) );
$this->append_rich_text( htmlspecialchars( $this->markup_processor->get_modifiable_text() ) );
break;
case '#tag':
$this->handle_tag();
break;
}
}

if ( $this->markup_processor->get_last_error() ) {
$this->last_error = $this->markup_processor->get_last_error();
return false;
}

$this->close_ephemeral_paragraph();

return true;
}

Expand All @@ -77,12 +84,12 @@ public function get_block_markup() {
}

private function handle_tag() {
$html = $this->html;
$tag = $html->get_tag();
$html = $this->markup_processor;
$tag = strtoupper( $html->get_tag() );
$tag_lowercase = strtolower( $tag );

$is_tag_opener = ! $html->is_tag_closer();
if ( ! $html->expects_closer() ) {
$is_void_tag = ! $html->expects_closer() && ! $html->is_tag_closer();
if ( $is_void_tag ) {
switch ( $tag ) {
case 'META':
$key = $html->get_attribute( 'name' );
Expand Down Expand Up @@ -110,7 +117,7 @@ private function handle_tag() {
// Just insert an HTML block or what?
break;
}
} elseif ( $is_tag_opener ) {
} elseif ( ! $html->is_tag_closer() ) {
switch ( $tag ) {
// Block elements
case 'SCRIPT':
Expand Down Expand Up @@ -304,7 +311,7 @@ private function should_preserve_tag_in_rich_text( $tag ) {
}

private function is_at_inline_code_element() {
$breadcrumbs = $this->html->get_breadcrumbs();
$breadcrumbs = $this->markup_processor->get_breadcrumbs();
foreach ( $breadcrumbs as $tag ) {
switch ( $tag ) {
case 'A':
Expand Down Expand Up @@ -392,4 +399,8 @@ private function close_ephemeral_paragraph() {
$this->in_ephemeral_paragraph = false;
}
}

/**
 * Returns the error captured by convert(), if any.
 *
 * convert() copies the markup processor's last error into this property
 * right before it returns false; until then the value stays null.
 *
 * @return mixed The markup processor's error value, or null when no error was recorded.
 */
public function get_last_error() {
return $this->last_error;
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
<?php

use WordPress\Zip\WP_Zip_Filesystem;

/**
* https://www.w3.org/AudioVideo/ebook/
*
* An EPUB Publication is transported as a single file (a "portable document") that contains:
* * a Package Document (OPF file) which specifies all the Publication's constituent content documents and their required resources, defines a reading order and associates Publication-level metadata and navigation information.
* * A metadata element including and/or referencing metadata applicable to the entire Publication and particular resources within it.
* * A manifest element: identifies (via IRI) and describes (via MIME media type) the set of resources that constitute the EPUB Publication.
* * A spine element: defines the default reading order of the Publication (an ordered list of Publication Resources, i.e. EPUB Content Documents).
* * A Bindings element defines a set of custom handlers for media types not supported by EPUB3. If the Reading System cannot support the specific media type, it could use scripting fallback if supported.
* * all Content Documents
* * all other required resources for processing the Publication.
*
* The OCF Container is packaged into a physical single ZIP file containing:
* * Mime Type file: application/epub+zip.
* * META-INF folder: holds the container file, which points to the location of the .opf file, plus signatures, encryption, and rights files (all XML).
* * OEBPS folder: stores the book content (opf, ncx, html, svg, png, css, etc. files).
*/
class WP_EPub_Entity_Reader extends WP_Entity_Reader {

	/**
	 * Filesystem view of the EPUB (ZIP) archive.
	 *
	 * @var WP_Zip_Filesystem
	 */
	protected $zip;

	/**
	 * Whether the reader has stopped producing entities.
	 *
	 * @var bool
	 */
	protected $finished = false;

	/**
	 * Post ID handed to the next content document reader.
	 *
	 * @var int
	 */
	protected $current_post_id;

	/**
	 * Archive paths of the content documents that were not read yet.
	 * Null until the archive has been listed.
	 *
	 * @var string[]|null
	 */
	protected $remaining_html_files;

	/**
	 * Reader for the content document that is currently being processed.
	 *
	 * @var WP_HTML_Entity_Reader|null
	 */
	protected $current_html_reader;

	/**
	 * Description of the failure that stopped the reader, if any.
	 *
	 * @var string|null
	 */
	protected $last_error;

	/**
	 * @param WP_Zip_Filesystem $zip           The opened EPUB archive.
	 * @param int               $first_post_id Post ID assigned to the first content document;
	 *                                         every following document gets the next integer.
	 */
	public function __construct( WP_Zip_Filesystem $zip, $first_post_id = 1 ) {
		$this->zip             = $zip;
		$this->current_post_id = $first_post_id;
	}

	/**
	 * Advances to the next entity found in the EPUB content documents.
	 *
	 * Content documents are processed one at a time: each one is handed to
	 * a WP_HTML_Entity_Reader, and once that reader runs out of entities
	 * the next document is opened.
	 *
	 * @return bool True when a new entity is available via get_entity(),
	 *              false when the reader is finished or has failed.
	 */
	public function next_entity() {
		if ( $this->last_error || $this->finished ) {
			return false;
		}

		if ( null === $this->remaining_html_files ) {
			$html_files = $this->find_html_files();
			if ( false === $html_files ) {
				_doing_it_wrong( __METHOD__, 'The EPUB file did not contain any HTML files.', '1.0.0' );
				$this->finished = true;
				return false;
			}
			$this->remaining_html_files = $html_files;
		}

		while ( true ) {
			if ( null !== $this->current_html_reader ) {
				if (
					! $this->current_html_reader->is_finished() &&
					$this->current_html_reader->next_entity()
				) {
					return true;
				}

				// The reader stopped. Tell "ran out of entities" apart from
				// "failed", and expose the failure through get_last_error().
				$reader_error = $this->current_html_reader->get_last_error();
				if ( $reader_error ) {
					$this->last_error = $reader_error;
					_doing_it_wrong(
						__METHOD__,
						'An EPUB content document could not be converted into entities.',
						'1.0.0'
					);
					$this->finished = true;
					return false;
				}
			}

			if ( count( $this->remaining_html_files ) === 0 ) {
				$this->finished = true;
				return false;
			}

			$html_file = array_shift( $this->remaining_html_files );
			$html      = $this->zip->read_file( $html_file );
			if ( ! is_string( $html ) ) {
				// NOTE(review): assumes read_file() signals failure with a
				// non-string value, as ls() does with false – confirm.
				$this->last_error = sprintf( 'Could not read "%s" from the EPUB archive.', $html_file );
				$this->finished   = true;
				return false;
			}

			// Constructing the reader only stores its arguments; parse errors
			// surface when it is advanced on the next loop iteration.
			$this->current_html_reader = new WP_HTML_Entity_Reader(
				WP_XML_Processor::create_from_string( $html ),
				$this->current_post_id
			);
			++$this->current_post_id;
		}
	}

	/**
	 * Returns the entity the reader is currently positioned at.
	 *
	 * @return mixed The current entity of the active content document reader,
	 *               or false when no content document has been opened yet.
	 */
	public function get_entity() {
		if ( null === $this->current_html_reader ) {
			return false;
		}
		return $this->current_html_reader->get_entity();
	}

	/**
	 * @return bool Whether the reader has stopped producing entities.
	 */
	public function is_finished(): bool {
		return $this->finished;
	}

	/**
	 * @return string|null Description of the failure that stopped the reader, or null.
	 */
	public function get_last_error(): ?string {
		return $this->last_error;
	}

	/**
	 * Lists the .xhtml and .html files stored directly in the content directory.
	 *
	 * Only the `/OEBPS` and `/EPUB` directories are considered, and the files
	 * are returned in the order reported by the archive listing – the
	 * package document's spine is not consulted.
	 *
	 * @return string[]|false Archive paths of the content documents, or false
	 *                        when no content directory exists or it cannot be listed.
	 */
	private function find_html_files() {
		$content_dir = false;
		foreach ( array( '/OEBPS', '/EPUB' ) as $candidate ) {
			if ( $this->zip->is_dir( $candidate ) ) {
				$content_dir = $candidate;
				break;
			}
		}
		if ( false === $content_dir ) {
			return false;
		}

		$entries = $this->zip->ls( $content_dir );
		if ( false === $entries ) {
			return false;
		}

		$html_files = array();
		foreach ( $entries as $entry ) {
			if ( str_ends_with( $entry, '.xhtml' ) || str_ends_with( $entry, '.html' ) ) {
				$html_files[] = $content_dir . '/' . $entry;
			}
		}
		return $html_files;
	}
}
Original file line number Diff line number Diff line change
@@ -1,22 +1,21 @@
<?php

use WordPress\Data_Liberation\Block_Markup\WP_HTML_To_Blocks;

/**
* Converts a single HTML file into a stream of WordPress entities.
*
* @TODO: Support post meta.
*/
class WP_HTML_Entity_Reader extends WP_Entity_Reader {

protected $html;
protected $html_processor;
protected $entities;
protected $finished = false;
protected $post_id;
protected $last_error;

public function __construct( $html, $post_id ) {
$this->html = $html;
$this->post_id = $post_id;
public function __construct( $html_processor, $post_id ) {
$this->html_processor = $html_processor;
$this->post_id = $post_id;
}

public function next_entity() {
Expand All @@ -36,8 +35,9 @@ public function next_entity() {
}

// We did not read any entities yet. Let's convert the HTML document into entities.
$converter = new WP_HTML_To_Blocks( $this->html );
$converter = new WP_HTML_To_Blocks( $this->html_processor );
if ( false === $converter->convert() ) {
$this->last_error = $converter->get_last_error();
return false;
}

Expand Down Expand Up @@ -90,6 +90,6 @@ public function is_finished(): bool {
}

public function get_last_error(): ?string {
return null;
return $this->last_error;
}
}
Loading
Loading