Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .python-version
Original file line number Diff line number Diff line change
@@ -1 +1 @@
3.10
3.11
14 changes: 9 additions & 5 deletions dpsprep/dpsprep.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@ def dpsprep( # noqa: C901, PLR0912, PLR0913, PLR0915
for task in tasks:
try:
task.get(timeout=25)
except multiprocessing.TimeoutError: # noqa: PERF203
except multiprocessing.TimeoutError:
pool_is_working = True

pool.join()
Expand All @@ -179,12 +179,16 @@ def dpsprep( # noqa: C901, PLR0912, PLR0913, PLR0915
if no_text:
combine_pdfs_on_fs_without_text(workdir, outline, len(document.pages))

if ocr_options is None:
ocr_success = False

if ocr_options:
loguru.logger.info('Performing OCR.')
ocr_success = perform_ocr(workdir, ocr_options)
else:
loguru.logger.info('Skipping the text layer.')

if not ocr_success:
shutil.copy(workdir.combined_pdf_without_text_path, workdir.combined_pdf_path)
else:
loguru.logger.info('Performing OCR.')
perform_ocr(workdir, ocr_options)
else:
combine_pdfs_on_fs_with_text(workdir, outline)

Expand Down
56 changes: 13 additions & 43 deletions dpsprep/ocrmypdf.py
Original file line number Diff line number Diff line change
@@ -1,63 +1,34 @@
import argparse
import shutil
from typing import Any

import loguru

from .workdir import WorkingDirectory

# We use OCRmyPDF in a non-canonical way: only optimize the file without performing any OCR.
# The optimization procedure provides good results and preserves the text layer and outline.
# The code here is based on
# https://github.com/ocrmypdf/OCRmyPDF/blob/fb006ef39f7f8842dec1976bebe4bcd5ca2e8df8/src/ocrmypdf/optimize.py#L724
# https://github.com/ocrmypdf/OCRmyPDF/blob/fb006ef39f7f8842dec1976bebe4bcd5ca2e8df8/src/ocrmypdf/optimize.py#L724
# with some simplifications for OCRmyPDF 17

import shutil
from typing import Any

class OptimizeOptions(argparse.Namespace):
"""Emulate ocrmypdf's options."""

input_file: str
jobs: int
optimize: int
jpeg_quality: int
png_quality: int
jbig2_page_group_size: int
jbig2_lossy: bool
jbig2_threshold: float
quiet: bool
progress_bar: bool
import loguru

def __init__(
self, input_file: str, jobs: int, optimize_: int, jpeg_quality: int, png_quality: int,
) -> None:
self.input_file = input_file
self.jobs = jobs
self.optimize = optimize_
self.jpeg_quality = jpeg_quality
self.png_quality = png_quality
self.jbig2_page_group_size = 0 # When 0, this should be adjusted inside OCRmyPDF's "optimize" function
self.jbig2_lossy = False
self.jbig2_threshold = 0.85 # This seems to be the default
# Changing the two verbosity options seems to have no effect in this concrete case
self.quiet = True
self.progress_bar = False
from .workdir import WorkingDirectory


def optimize_pdf(workdir: WorkingDirectory, optlevel: int, quality: int | None, pool_size: int) -> bool:
try:
# ObjectStreamMode is actually from pikepdf, but I did not want to include that as a dependency
from ocrmypdf._options import OcrOptions
from ocrmypdf.optimize import ObjectStreamMode, PdfContext, optimize
from ocrmypdf.pdfinfo import PdfInfo
except ImportError:
loguru.logger.warning('Cannot detect OCRmyPDF. No optimizations will be performed on the output file.')
shutil.copy(workdir.combined_pdf_path, workdir.optimized_pdf_path)
return False

options = OptimizeOptions(
input_file=str(workdir.combined_pdf_path),
options = OcrOptions(
input_file=workdir.combined_pdf_without_text_path,
output_file=workdir.combined_pdf_path,
jobs=pool_size, # These correspond to CPU cores rather than threads, but it seems better to use the available pool size parameter
optimize_=optlevel,
optimize=optlevel,
# When 0, these should be adjusted inside OCRmyPDF's "optimize" function
jpeg_quality=quality or 0,
jpg_quality=quality or 0,
png_quality=quality or 0
)

Expand All @@ -83,12 +54,11 @@ def perform_ocr(workdir: WorkingDirectory, options: dict[str, Any]) -> bool:
from ocrmypdf import api
except ImportError:
loguru.logger.warning('Cannot detect OCRmyPDF. No OCR will be performed on the output file.')
shutil.copy(workdir.combined_pdf_without_text_path, workdir.combined_pdf_path)
return False

try:
api.ocr(
input_file=workdir.combined_pdf_without_text_path,
input_file_or_options=workdir.combined_pdf_without_text_path,
output_file=workdir.combined_pdf_path,
**options,
)
Expand Down
16 changes: 8 additions & 8 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
[project]
name = "dpsprep"
version = "2.4.0"
version = "2.4.1"
description = "A DjVu to PDF converter with a focus on small output size and the ability to preserve document outlines and text layers"
requires-python = ">=3.10,<4.0"
requires-python = ">=3.11, <4.0"
authors = [
{ name = "Kevin Arthur Schiff Croker" },
{ name = "Ianis Vasilev", email = "ianis@ivasilev.net" }
Expand All @@ -20,6 +20,11 @@ dependencies = [
[project.urls]
Repository = "https://github.com/kcroker/dpsprep.git"

[project.optional-dependencies]
compress = [
"ocrmypdf (>=17.3.0)"
]

[project.scripts]
dpsprep = "dpsprep:dpsprep"

Expand All @@ -33,13 +38,8 @@ dev = [
"types-pillow (>=10.2.0.20240822)",
]

[optional-dependencies]
compress = [
"ocrmypdf (>=15.4.4)"
]

[build-system]
requires = ["uv_build (>=0.10.5,<0.11.0)"]
requires = ["uv_build (>=0.10.5, <0.11.0)"]
build-backend = "uv_build"

[tool.uv.build-backend]
Expand Down
Loading
Loading