Skip to content

Commit

Permalink
Fixes an issue with merging PDFs and the ordering
Browse files Browse the repository at this point in the history
  • Loading branch information
stumpylog committed Oct 16, 2023
1 parent 52c264e commit cf27d2c
Show file tree
Hide file tree
Showing 12 changed files with 91 additions and 39 deletions.
6 changes: 3 additions & 3 deletions .docker/docker-compose.ci-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@

version: "3"
services:
gotenberg:
gotenberg-client-test-server:
image: docker.io/gotenberg/gotenberg:7.9.2
hostname: gotenberg
container_name: gotenberg
hostname: gotenberg-client-test-server
container_name: gotenberg-client-test-server
network_mode: host
restart: unless-stopped
9 changes: 8 additions & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,11 @@ jobs:
docker compose --file ${GITHUB_WORKSPACE}/.docker/docker-compose.ci-test.yml up --detach
echo "Wait for container to be started"
sleep 5
-
name: Install poppler-utils
run: |
sudo apt-get update
sudo apt-get install --yes --no-install-recommends poppler-utils
-
name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
Expand All @@ -72,7 +77,9 @@ jobs:
cache: 'pip'
-
name: Install Hatch
run: pip install --upgrade hatch
run: |
python3 -m pip install --upgrade pip
pip install --upgrade hatch
-
name: Run tests
run: hatch run cov
Expand Down
10 changes: 10 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,16 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [Unreleased]

### Fixed

- An issue with the sorting of merging PDFs. Expanded testing to cover the merged ordering

### Changed

- Multiple merge calls on the same route will maintain the ordering of all files, rather than just per merge call

## [0.2.0] - 2023-10-16

### Added
Expand Down
7 changes: 3 additions & 4 deletions src/gotenberg_client/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,23 +79,22 @@ def get_files(self) -> RequestFiles:
files = {}
for filename in self._file_map:
file_path = self._file_map[filename]
# Gotenberg requires these to have the specific name
filepath_name = filename if filename in {"index.html", "header.html", "footer.html"} else file_path.name

# Helpful but not necessary to provide the mime type when possible
mime_type = guess_mime_type(file_path)
if mime_type is not None:
files.update(
{filepath_name: (filepath_name, self._stack.enter_context(file_path.open("rb")), mime_type)},
{filename: (filename, self._stack.enter_context(file_path.open("rb")), mime_type)},
)
else: # pragma: no cover
files.update({filepath_name: (filepath_name, self._stack.enter_context(file_path.open("rb")))}) # type: ignore
files.update({filename: (filename, self._stack.enter_context(file_path.open("rb")))}) # type: ignore
return files

def _add_file_map(self, filepath: Path, name: Optional[str] = None) -> None:
"""
Small helper to handle bookkeeping of files for later opening. The name is
optional to support those things which are required to have a certain name
generally for ordering or just to be found at all
"""
if name is None:
name = filepath.name
Expand Down
11 changes: 9 additions & 2 deletions src/gotenberg_client/_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
from pathlib import Path
from typing import List

from httpx import Client

from gotenberg_client._base import BaseApi
from gotenberg_client._base import BaseRoute

Expand All @@ -13,15 +15,20 @@ class MergeRoute(BaseRoute):
Handles the merging of a given set of files
"""

def __init__(self, client: Client, api_route: str) -> None:
super().__init__(client, api_route)
self._next = 1

def merge(self, files: List[Path]) -> "MergeRoute":
"""
Adds the given files into the file mapping. This method will maintain the
ordering of the list. Calling this method multiple times may not merge
in the expected ordering
"""
for idx, filepath in enumerate(files):
for filepath in files:
# Include index to enforce ordering
self._add_file_map(filepath, f"{idx}_{filepath.name}")
self._add_file_map(filepath, f"{self._next}_{filepath.name}")
self._next += 1
return self


Expand Down
4 changes: 2 additions & 2 deletions src/gotenberg_client/_types_compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

import sys

if sys.version_info >= (3, 11):
if sys.version_info >= (3, 11): # pragma: no cover
from typing import Self
else:
else: # pragma: no cover
from typing_extensions import Self # noqa: F401
2 changes: 1 addition & 1 deletion src/gotenberg_client/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def optional_to_form(value: Optional[Union[bool, int, float, str]], name: str) -
return {name: str(value).lower()}


def guess_mime_type_stdlib(url: Path) -> Optional[str]:
def guess_mime_type_stdlib(url: Path) -> Optional[str]: # pragma: no cover
"""
Uses the standard library to guess a mimetype
"""
Expand Down
Binary file added tests/samples/a_merge_second.pdf
Binary file not shown.
Empty file modified tests/samples/sample1.pdf
100755 → 100644
Empty file.
Binary file added tests/samples/z_first_merge.pdf
Binary file not shown.
69 changes: 49 additions & 20 deletions tests/test_merge.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import shutil
import subprocess
import tempfile
from pathlib import Path
from typing import List

import pikepdf
import pytest
Expand All @@ -15,18 +15,27 @@
from tests.utils import call_run_with_server_error_handling


@pytest.fixture()
def create_files():
def extract_text(pdf_path: Path) -> str:
"""
Creates 2 files in a temporary directory and cleans them up
after their use
Using pdftotext from poppler, extracts the text of a PDF into a file,
then reads the file contents and returns it
"""
temp_dir = Path(tempfile.mkdtemp())
test_file = SAMPLE_DIR / "sample1.pdf"
other_test_file = temp_dir / "sample2.pdf"
other_test_file.write_bytes(test_file.read_bytes())
yield [test_file, other_test_file]
shutil.rmtree(temp_dir, ignore_errors=True)
with tempfile.NamedTemporaryFile(
mode="w+",
) as tmp:
subprocess.run(
[ # noqa: S603
shutil.which("pdftotext"),
"-q",
"-layout",
"-enc",
"UTF-8",
str(pdf_path),
tmp.name,
],
check=True,
)
return tmp.read()


class TestMergePdfs:
Expand All @@ -37,12 +46,15 @@ class TestMergePdfs:
def test_merge_files_pdf_a(
self,
client: GotenbergClient,
create_files: List[Path],
gt_format: PdfAFormat,
pike_format: str,
):
with client.merge.merge() as route:
resp = call_run_with_server_error_handling(route.merge(create_files).pdf_format(gt_format))
resp = call_run_with_server_error_handling(
route.merge([SAMPLE_DIR / "z_first_merge.pdf", SAMPLE_DIR / "a_merge_second.pdf"]).pdf_format(
gt_format,
),
)

assert resp.status_code == codes.OK
assert "Content-Type" in resp.headers
Expand All @@ -58,14 +70,31 @@ def test_merge_files_pdf_a(
if SAVE_OUTPUTS:
(SAVE_DIR / f"test_libre_office_convert_xlsx_format_{pike_format}.pdf").write_bytes(resp.content)

def test_pdf_a_multiple_file(
def test_merge_multiple_file(
self,
client: GotenbergClient,
create_files: List[Path],
):
with client.merge.merge() as route:
resp = call_run_with_server_error_handling(route.merge(create_files))
if shutil.which("pdftotext") is None:
pytest.skip("No pdftotext executable found")
else:
with client.merge.merge() as route:
# By default, these would not merge correctly
route.merge([SAMPLE_DIR / "z_first_merge.pdf", SAMPLE_DIR / "a_merge_second.pdf"])
resp = call_run_with_server_error_handling(route)

assert resp.status_code == codes.OK
assert "Content-Type" in resp.headers
assert resp.headers["Content-Type"] == "application/pdf"

with tempfile.NamedTemporaryFile(mode="wb") as tmp:
tmp.write(resp.content)

text = extract_text(tmp.name)
lines = text.split("\n")
# Extra is empty line
assert len(lines) == 3
assert "first PDF to be merged." in lines[0]
assert "second PDF to be merged." in lines[1]

assert resp.status_code == codes.OK
assert "Content-Type" in resp.headers
assert resp.headers["Content-Type"] == "application/pdf"
if SAVE_OUTPUTS:
(SAVE_DIR / "test_pdf_a_multiple_file.pdf").write_bytes(resp.content)
12 changes: 6 additions & 6 deletions tests/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,16 +33,16 @@ def call_run_with_server_error_handling(route: BaseRoute) -> Response:
one attempt to parse.
This will wait the following:
- Attempt 1 - 20s following failure
- Attempt 2 - 40s following failure
- Attempt 3 - 80s following failure
- Attempt 4 - 160s
- Attempt 5 - 320s
- Attempt 1 - 5s following failure
- Attempt 2 - 10s following failure
- Attempt 3 - 20s following failure
- Attempt 4 - 40s following failure
- Attempt 5 - 80s following failure
"""
result = None
succeeded = False
retry_time = 20.0
retry_time = 5.0
retry_count = 0
max_retry_count = 5

Expand Down

0 comments on commit cf27d2c

Please sign in to comment.