Skip to content

Commit

Permalink
Port NewZipStreamReader from adamziel/wxr-normalize
Browse files Browse the repository at this point in the history
This new ZipStreamReader opens its own file handles which means
it can be paused, resumed, and is more reliable. The original
implementation was built as a part of adamziel/wxr-normalize#1

This is all new code so there are no testing instructions. Eventually
this implementation will replace the existing ZipStreamReader.
  • Loading branch information
adamziel committed Oct 30, 2024
1 parent 3b8943b commit d7b184a
Showing 1 changed file with 356 additions and 0 deletions.
356 changes: 356 additions & 0 deletions src/WordPress/Zip/NewZipStreamReader.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,356 @@
<?php

namespace WordPress\Zip;

/**
* Improves on ZipStreamReader – it keeps track of its own parsing state
* without relying on $fp objects, which enables pausing and resuming.
*
* @TODO: Replace ZipStreamReader with this class once the consumers of
* ZipStreamReader have been updated to use the new interface.
*/
class NewZipStreamReader {

const SIGNATURE_FILE = 0x04034b50;
const SIGNATURE_CENTRAL_DIRECTORY = 0x02014b50;
const SIGNATURE_CENTRAL_DIRECTORY_END = 0x06054b50;
const COMPRESSION_DEFLATE = 8;

private $file_path;
private $zip_file_bytes_parsed_so_far = 0;
private $file_entry_body_bytes_parsed_so_far = 0;
private $state = NewZipStreamReader::STATE_SCAN;
private $header = null;
private $file_body_chunk = null;
private $paused_incomplete_input = false;
private $error_message;

private $inflate_handle;
private $fp;

const STATE_SCAN = 'scan';
const STATE_FILE_ENTRY = 'file-entry';
const STATE_CENTRAL_DIRECTORY_ENTRY = 'central-directory-entry';
const STATE_CENTRAL_DIRECTORY_ENTRY_EXTRA = 'central-directory-entry-extra';
const STATE_END_CENTRAL_DIRECTORY_ENTRY = 'end-central-directory-entry';
const STATE_END_CENTRAL_DIRECTORY_ENTRY_EXTRA = 'end-central-directory-entry-extra';
const STATE_COMPLETE = 'complete';
const STATE_ERROR = 'error';

public function pause() {
return [
'file_path' => $this->file_path,
'zip_file_bytes_parsed_so_far' => $this->zip_file_bytes_parsed_so_far,
'file_entry_body_bytes_parsed_so_far' => $this->file_entry_body_bytes_parsed_so_far,
'state' => $this->state,
'header' => $this->header,
'file_body_chunk' => $this->file_body_chunk,
'paused_incomplete_input' => $this->paused_incomplete_input,
];
}

public function resume($paused) {
$this->file_path = $paused['file_path'];
$this->zip_file_bytes_parsed_so_far = 0;
$this->state = $paused['state'];
$this->header = $paused['header'];
$this->file_body_chunk = $paused['file_body_chunk'];
$this->paused_incomplete_input = $paused['paused_incomplete_input'];

$this->fp = fopen($this->file_path, 'rb');
if($paused['file_entry_body_bytes_parsed_so_far'] > 0) {
$this->inflate_handle = inflate_init(ZLIB_ENCODING_RAW);
$file_starts_at = $paused['zip_file_bytes_parsed_so_far'] - $paused['file_entry_body_bytes_parsed_so_far'];
$this->zip_file_bytes_parsed_so_far = $file_starts_at;
fseek($this->fp, $file_starts_at);
while(true) {
$missing_bytes = $paused['file_entry_body_bytes_parsed_so_far'] - $this->file_entry_body_bytes_parsed_so_far;
$missing_bytes = max(0, min(4096, $missing_bytes));
if($missing_bytes === 0) {
break;
}
$this->read_file_entry_body_chunk($missing_bytes);
}
} else {
$this->zip_file_bytes_parsed_so_far = $paused['zip_file_bytes_parsed_so_far'];
fseek($this->fp, $this->zip_file_bytes_parsed_so_far);
}
}

public function __construct($file_path) {
$this->file_path = $file_path;
}

public function is_paused_at_incomplete_input(): bool {
return $this->paused_incomplete_input;
}

public function is_finished(): bool
{
return self::STATE_COMPLETE === $this->state || self::STATE_ERROR === $this->state;
}

public function get_state()
{
return $this->state;
}

public function get_header()
{
return $this->header;
}

public function get_file_path()
{
if(!$this->header) {
return null;
}

return $this->header['path'];
}

public function get_file_body_chunk()
{
return $this->file_body_chunk;
}

public function get_last_error(): ?string
{
return $this->error_message;
}

public function next()
{
do {
if(self::STATE_SCAN === $this->state) {
if(false === $this->scan()) {
return false;
}
}

switch ($this->state) {
case self::STATE_ERROR:
case self::STATE_COMPLETE:
return false;

case self::STATE_FILE_ENTRY:
if (false === $this->read_file_entry()) {
return false;
}
break;

case self::STATE_CENTRAL_DIRECTORY_ENTRY:
if (false === $this->read_central_directory_entry()) {
return false;
}
break;

case self::STATE_END_CENTRAL_DIRECTORY_ENTRY:
if (false === $this->read_end_central_directory_entry()) {
return false;
}
break;

default:
return false;
}
} while (self::STATE_SCAN === $this->state);

return true;
}

private function read_central_directory_entry()
{
if ($this->header && !empty($this->header['path'])) {
$this->header = null;
$this->state = self::STATE_SCAN;
return;
}

if (!$this->header) {
$data = $this->consume_bytes(42);
if ($data === false) {
$this->paused_incomplete_input = true;
return false;
}
$this->header = unpack(
'vversionCreated/vversionNeeded/vgeneralPurpose/vcompressionMethod/vlastModifiedTime/vlastModifiedDate/Vcrc/VcompressedSize/VuncompressedSize/vpathLength/vextraLength/vfileCommentLength/vdiskNumber/vinternalAttributes/VexternalAttributes/VfirstByteAt',
$data
);
}

if($this->header) {
$n = $this->header['pathLength'] + $this->header['extraLength'] + $this->header['fileCommentLength'];
$this->header['path'] = $this->consume_bytes($this->header['pathLength']);
$this->header['extra'] = $this->consume_bytes($this->header['extraLength']);
$this->header['fileComment'] = $this->consume_bytes($this->header['fileCommentLength']);
if(!$this->header['path']) {
$this->set_error('Empty path in central directory entry');
}
}
}

private function read_end_central_directory_entry()
{
if ($this->header && ( !empty($this->header['comment']) || 0 === $this->header['commentLength'] )) {
$this->header = null;
$this->state = self::STATE_SCAN;
return;
}

if(!$this->header) {
$data = $this->consume_bytes(18);
if ($data === false) {
$this->paused_incomplete_input = true;
return false;
}
$this->header = unpack(
'vdiskNumber/vcentralDirectoryStartDisk/vnumberCentralDirectoryRecordsOnThisDisk/vnumberCentralDirectoryRecords/VcentralDirectorySize/VcentralDirectoryOffset/vcommentLength',
$data
);
}

if($this->header && empty($this->header['comment']) && $this->header['commentLength'] > 0) {
$comment = $this->consume_bytes($this->header['commentLength']);
if(false === $comment) {
$this->paused_incomplete_input = true;
return false;
}
$this->header['comment'] = $comment;
}
}

private function scan() {
$signature = $this->consume_bytes(4);
if ($signature === false || 0 === strlen($signature)) {
$this->paused_incomplete_input = true;
return false;
}
$signature = unpack('V', $signature)[1];
switch($signature) {
case self::SIGNATURE_FILE:
$this->state = self::STATE_FILE_ENTRY;
break;
case self::SIGNATURE_CENTRAL_DIRECTORY:
$this->state = self::STATE_CENTRAL_DIRECTORY_ENTRY;
break;
case self::SIGNATURE_CENTRAL_DIRECTORY_END:
$this->state = self::STATE_END_CENTRAL_DIRECTORY_ENTRY;
break;
default:
$this->set_error('Invalid signature ' . $signature);
return false;
}
}

/**
* Reads a file entry from a zip file.
*
* The file entry is structured as follows:
*
* ```
* Offset Bytes Description
* 0 4 Local file header signature = 0x04034b50 (PK♥♦ or "PK\3\4")
* 4 2 Version needed to extract (minimum)
* 6 2 General purpose bit flag
* 8 2 Compression method; e.g. none = 0, DEFLATE = 8 (or "\0x08\0x00")
* 10 2 File last modification time
* 12 2 File last modification date
* 14 4 CRC-32 of uncompressed data
* 18 4 Compressed size (or 0xffffffff for ZIP64)
* 22 4 Uncompressed size (or 0xffffffff for ZIP64)
* 26 2 File name length (n)
* 28 2 Extra field length (m)
* 30 n File name
* 30+n m Extra field
* ```
*
* @param resource $stream
*/
private function read_file_entry()
{
if(false === $this->read_file_entry_header()) {
return false;
}
if(false === $this->read_file_entry_body_chunk()) {
return false;
}
}

private function read_file_entry_header() {
if (null === $this->header) {
$data = $this->consume_bytes(26);
if ($data === false) {
$this->paused_incomplete_input = true;
return false;
}
$this->header = unpack(
'vversionNeeded/vgeneralPurpose/vcompressionMethod/vlastModifiedTime/vlastModifiedDate/Vcrc/VcompressedSize/VuncompressedSize/vpathLength/vextraLength',
$data
);
$this->file_entry_body_bytes_parsed_so_far = 0;
}

if($this->header && empty($this->header['path'])) {
$this->header['path'] = $this->consume_bytes($this->header['pathLength']);
$this->header['extra'] = $this->consume_bytes($this->header['extraLength']);
if($this->header['compressionMethod'] === self::COMPRESSION_DEFLATE) {
$this->inflate_handle = inflate_init(ZLIB_ENCODING_RAW);
}
}
}

private function read_file_entry_body_chunk($max_bytes_to_read=4096) {
$this->file_body_chunk = null;

$file_body_bytes_left = $this->header['compressedSize'] - $this->file_entry_body_bytes_parsed_so_far;
if($file_body_bytes_left === 0) {
$this->header = null;
$this->inflate_handle = null;
$this->file_entry_body_bytes_parsed_so_far = 0;
$this->state = self::STATE_SCAN;
return;
}

$chunk_size = min($max_bytes_to_read, $file_body_bytes_left);
$compressed_bytes = $this->consume_bytes($chunk_size);
$this->file_entry_body_bytes_parsed_so_far += strlen($compressed_bytes);

if ($this->header['compressionMethod'] === self::COMPRESSION_DEFLATE) {
$uncompressed_bytes = inflate_add($this->inflate_handle, $compressed_bytes, ZLIB_PARTIAL_FLUSH);
if ( $uncompressed_bytes === false || inflate_get_status( $this->inflate_handle ) === false ) {
$this->set_error('Failed to inflate');
return false;
}
} else {
$uncompressed_bytes = $compressed_bytes;
}

$this->file_body_chunk = $uncompressed_bytes;
}

private function set_error($message) {
$this->state = self::STATE_ERROR;
$this->error_message = $message;
$this->paused_incomplete_input = false;
}

private function consume_bytes($n) {
if(0 === $n) {
return '';
}
if(null === $this->fp) {
$this->fp = fopen($this->file_path, 'rb');
}

$this->zip_file_bytes_parsed_so_far += $n;
$bytes_read = fread($this->fp, $n);
if(false === $bytes_read || '' === $bytes_read) {
fclose($this->fp);
$this->state = self::STATE_COMPLETE;
return false;
}
return $bytes_read;
}

}

0 comments on commit d7b184a

Please sign in to comment.