qurator-spk · kba · Oct 10, 2025 · Aug 19, 2025 · Aug 19, 2025 · Sep 19, 2025
diff --git a/.github/workflows/test-eynollah.yml b/.github/workflows/test-eynollah.yml
@@ -24,24 +24,39 @@ jobs:
         sudo rm -rf "$AGENT_TOOLSDIRECTORY"
         df -h
     - uses: actions/checkout@v4
-    - uses: actions/cache@v4
+    - uses: actions/cache/restore@v4
       id: seg_model_cache
       with:
         path: models_layout_v0_5_0
-        key: ${{ runner.os }}-models
-    - uses: actions/cache@v4
+        key: seg-models
+    - uses: actions/cache/restore@v4
       id: ocr_model_cache
       with:
-        path: models_ocr_v0_5_0
-        key: ${{ runner.os }}-models
-    - uses: actions/cache@v4
+        path: models_ocr_v0_5_1
+        key: ocr-models
+    - uses: actions/cache/restore@v4
       id: bin_model_cache
       with:
         path: default-2021-03-09
-        key: ${{ runner.os }}-modelbin
+        key: bin-models
     - name: Download models
-      if: steps.seg_model_cache.outputs.cache-hit != 'true' || steps.bin_model_cache.outputs.cache-hit != 'true' || steps.ocr_model_cache.outputs.cache-hit != true
+      if: steps.seg_model_cache.outputs.cache-hit != 'true' || steps.bin_model_cache.outputs.cache-hit != 'true' || steps.ocr_model_cache.outputs.cache-hit != 'true'
       run: make models
+    - uses: actions/cache/save@v4
+      if: steps.seg_model_cache.outputs.cache-hit != 'true'
+      with:
+        path: models_layout_v0_5_0
+        key: seg-models
+    - uses: actions/cache/save@v4
+      if: steps.ocr_model_cache.outputs.cache-hit != 'true'
+      with:
+        path: models_ocr_v0_5_1
+        key: ocr-models
+    - uses: actions/cache/save@v4
+      if: steps.bin_model_cache.outputs.cache-hit != 'true'
+      with:
+        path: default-2021-03-09
+        key: bin-models
     - name: Set up Python ${{ matrix.python-version }}
       uses: actions/setup-python@v5
       with:
@@ -50,7 +65,12 @@ jobs:
       run: |
         python -m pip install --upgrade pip
         make install-dev EXTRAS=OCR,plotting
-        make deps-test
+        make deps-test EXTRAS=OCR,plotting
+        ls -l models_*
+    - name: Lint with ruff
+      uses: astral-sh/ruff-action@v3
+      with:
+        src: "./src"
     - name: Test with pytest
       run: make coverage PYTEST_ARGS="-vv --junitxml=pytest.xml"
     - name: Get coverage results

diff --git a/.gitignore b/.gitignore
@@ -2,7 +2,11 @@
 __pycache__
 sbb_newspapers_org_image/pylint.log
 models_eynollah*
+models_ocr*
+models_layout*
+default-2021-03-09
 output.html
 /build
 /dist
 *.tif
+TAGS
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,55 @@ Versioned according to [Semantic Versioning](http://semver.org/).
 
 ## Unreleased
 
+Fixed:
+
+ * continue processing when no columns detected but text regions exist
+ * convert marginalia to main text if no main text is present
+ * reset deskewing angle to 0° when text covers <30% image area and detected angle >45°
+ * :fire: polygons: avoid invalid paths (use `Polygon.buffer()` instead of dilation etc.)
+ * `return_boxes_of_images_by_order_of_reading_new`: avoid Numpy.dtype mismatch, simplify
+ * `return_boxes_of_images_by_order_of_reading_new`: log any exceptions instead of ignoring
+ * `filter_contours_without_textline_inside`: avoid removing from duplicate lists twice
+ * `get_marginals`: exit early if no peaks found to avoid spurious overlap mask
+ * `get_smallest_skew`: after shifting search range of rotation angle, use overall best result
+ * Dockerfile: fix CUDA installation (cuDNN contested between Torch and TF due to extra OCR)
+ * OCR: re-instate missing methods and fix `utils_ocr` function calls
+ * mbreorder/enhancement CLIs: missing imports
+ * :fire: writer: `SeparatorRegion` needs `SeparatorRegionType` (not `ImageRegionType`)
+   (commit f458e3e)
+ * tests: switch from `pytest-subtests` to `parametrize` so we can use `pytest-isolate`
+   (so CUDA memory gets freed between tests if running on GPU)
+
+Added:
+ * :fire: `layout` CLI: new option `--model_version` to override default choices
+ * test coverage for OCR options in `layout`
+ * test coverage for table detection in `layout`
+ * CI linting with ruff
+
+Changed:
+
+ * polygons: slightly widen for regions and lines, increase for separators
+ * various refactorings, some code style and identifier improvements
+ * deskewing/multiprocessing: switch back to ProcessPoolExecutor (faster),
+   but use shared memory if necessary, and switch back from `loky` to stdlib,
+   and shutdown in `del()` instead of `atexit`
+ * :fire: OCR: switch CNN-RNN model to `20250930` version compatible with TF 2.12 on CPU, too
+ * OCR: allow running `-tr` without `-fl`, too
+ * :fire: writer: use `@type='heading'` instead of `'header'` for headings
+ * :fire: performance gains via refactoring (simplification, less copy-code, vectorization,
+   avoiding unused calculations, avoiding unnecessary 3-channel image operations)
+ * :fire: heuristic reading order detection: many improvements
+    - contour vs splitter box matching:
+      * contour must be contained in box exactly instead of heuristics
+      * as fallback, match by center: center must be contained in box
+    - original vs deskewed contour matching:
+      * same min-area filter on both sides
+      * similar area score in addition to center proximity
+      * avoid duplicate and missing mappings by allowing N:M
+        matches and splitting+joining where necessary
+ * CI: update+improve model caching
+
+
 ## [0.5.0] - 2025-09-26
 
 Fixed:

diff --git a/Dockerfile b/Dockerfile
@@ -40,6 +40,8 @@ RUN ocrd ocrd-tool ocrd-tool.json dump-tools > $(dirname $(ocrd bashlib filename
 RUN ocrd ocrd-tool ocrd-tool.json dump-module-dirs > $(dirname $(ocrd bashlib filename))/ocrd-all-module-dir.json
 # install everything and reduce image size
 RUN make install EXTRAS=OCR && rm -rf /build/eynollah
+# fixup for broken cuDNN installation (Torch pulls in 8.5.0, which is incompatible with Tensorflow)
+RUN pip install nvidia-cudnn-cu11==8.6.0.163
 # smoke test
 RUN eynollah --help
 

diff --git a/Makefile b/Makefile
@@ -13,12 +13,18 @@ DOCKER ?= docker
 #SEG_MODEL := https://github.com/qurator-spk/eynollah/releases/download/v0.3.0/models_eynollah.tar.gz
 #SEG_MODEL := https://github.com/qurator-spk/eynollah/releases/download/v0.3.1/models_eynollah.tar.gz
 SEG_MODEL := https://zenodo.org/records/17194824/files/models_layout_v0_5_0.tar.gz?download=1
+SEG_MODELFILE = $(notdir $(patsubst %?download=1,%,$(SEG_MODEL)))
+SEG_MODELNAME = $(SEG_MODELFILE:%.tar.gz=%)
 
 BIN_MODEL := https://github.com/qurator-spk/sbb_binarization/releases/download/v0.0.11/saved_model_2021_03_09.zip
+BIN_MODELFILE = $(notdir $(BIN_MODEL))
+BIN_MODELNAME := default-2021-03-09
 
-OCR_MODEL := https://zenodo.org/records/17194824/files/models_ocr_v0_5_0.tar.gz?download=1
+OCR_MODEL := https://zenodo.org/records/17236998/files/models_ocr_v0_5_1.tar.gz?download=1
+OCR_MODELFILE = $(notdir $(patsubst %?download=1,%,$(OCR_MODEL)))
+OCR_MODELNAME = $(OCR_MODELFILE:%.tar.gz=%)
 
-PYTEST_ARGS ?= -vv
+PYTEST_ARGS ?= -vv --isolate
 
 # BEGIN-EVAL makefile-parser --make-help Makefile
 
@@ -31,7 +37,8 @@ help:
 	@echo "    install      Install package with pip"
 	@echo "    install-dev  Install editable with pip"
 	@echo "    deps-test    Install test dependencies with pip"
-	@echo "    models       Download and extract models to $(CURDIR)/models_layout_v0_5_0"
+	@echo "    models       Download and extract models to $(CURDIR):"
+	@echo "                 $(BIN_MODELNAME) $(SEG_MODELNAME) $(OCR_MODELNAME)"
 	@echo "    smoke-test   Run simple CLI check"
 	@echo "    ocrd-test    Run OCR-D CLI check"
 	@echo "    test         Run unit tests"
@@ -42,33 +49,32 @@ help:
 	@echo "    PYTEST_ARGS  pytest args for 'test' (Set to '-s' to see log output during test execution, '-vv' to see individual tests. [$(PYTEST_ARGS)]"
 	@echo "    SEG_MODEL    URL of 'models' archive to download for segmentation 'test' [$(SEG_MODEL)]"
 	@echo "    BIN_MODEL    URL of 'models' archive to download for binarization 'test' [$(BIN_MODEL)]"
+	@echo "    OCR_MODEL    URL of 'models' archive to download for OCR 'test' [$(OCR_MODEL)]"
 	@echo ""
 
 # END-EVAL
 
 
-# Download and extract models to $(PWD)/models_layout_v0_5_0
+# Download and extract models to $(CURDIR): $(BIN_MODELNAME) $(SEG_MODELNAME) $(OCR_MODELNAME)
-models: models_layout_v0_5_0 models_ocr_v0_5_0 default-2021-03-09
+models: $(BIN_MODELNAME) $(SEG_MODELNAME) $(OCR_MODELNAME)
 
-models_layout_v0_5_0: models_layout_v0_5_0.tar.gz
-	tar zxf models_layout_v0_5_0.tar.gz
+# do not download these files if we already have the directories
+.INTERMEDIATE: $(BIN_MODELFILE) $(SEG_MODELFILE) $(OCR_MODELFILE)
 
-models_layout_v0_5_0.tar.gz:
+$(BIN_MODELFILE):
+	wget -O $@ $(BIN_MODEL)
+$(SEG_MODELFILE):
 	wget -O $@ $(SEG_MODEL)
-
-models_ocr_v0_5_0: models_ocr_v0_5_0.tar.gz
-	tar zxf models_ocr_v0_5_0.tar.gz
-
-models_ocr_v0_5_0.tar.gz:
+$(OCR_MODELFILE):
 	wget -O $@ $(OCR_MODEL)
 
-default-2021-03-09: $(notdir $(BIN_MODEL))
-	unzip $(notdir $(BIN_MODEL))
+$(BIN_MODELNAME): $(BIN_MODELFILE)
 	mkdir $@
-	mv $(basename $(notdir $(BIN_MODEL))) $@
-
-$(notdir $(BIN_MODEL)):
-	wget $(BIN_MODEL)
+	unzip -d $@ $<
+$(SEG_MODELNAME): $(SEG_MODELFILE)
+	tar zxf $<
+$(OCR_MODELNAME): $(OCR_MODELFILE)
+	tar zxf $<
 
 build:
 	$(PIP) install build
@@ -82,28 +88,34 @@ install:
 install-dev:
 	$(PIP) install -e .$(and $(EXTRAS),[$(EXTRAS)])
 
-deps-test: models_layout_v0_5_0
+ifeq (OCR,$(findstring OCR, $(EXTRAS)))
+deps-test: $(OCR_MODELNAME)
+endif
+deps-test: $(BIN_MODELNAME) $(SEG_MODELNAME)
 	$(PIP) install -r requirements-test.txt
+ifeq (OCR,$(findstring OCR, $(EXTRAS)))
+	ln -rsf $(OCR_MODELNAME)/* $(SEG_MODELNAME)/
+endif
 
 smoke-test: TMPDIR != mktemp -d
 smoke-test: tests/resources/kant_aufklaerung_1784_0020.tif
 	# layout analysis:
-	eynollah layout -i $< -o $(TMPDIR) -m $(CURDIR)/models_layout_v0_5_0
+	eynollah layout -i $< -o $(TMPDIR) -m $(CURDIR)/$(SEG_MODELNAME)
 	fgrep -q http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15 $(TMPDIR)/$(basename $(<F)).xml
 	fgrep -c -e TextRegion -e ImageRegion -e SeparatorRegion $(TMPDIR)/$(basename $(<F)).xml
 	# layout, directory mode (skip one, add one):
-	eynollah layout -di $(<D) -o $(TMPDIR) -m $(CURDIR)/models_layout_v0_5_0
+	eynollah layout -di $(<D) -o $(TMPDIR) -m $(CURDIR)/$(SEG_MODELNAME)
 	test -s $(TMPDIR)/euler_rechenkunst01_1738_0025.xml
 	# mbreorder, directory mode (overwrite):
-	eynollah machine-based-reading-order -di $(<D) -o $(TMPDIR) -m $(CURDIR)/models_layout_v0_5_0
+	eynollah machine-based-reading-order -di $(<D) -o $(TMPDIR) -m $(CURDIR)/$(SEG_MODELNAME)
 	fgrep -q http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15 $(TMPDIR)/$(basename $(<F)).xml
 	fgrep -c -e RegionRefIndexed $(TMPDIR)/$(basename $(<F)).xml
 	# binarize:
-	eynollah binarization -m $(CURDIR)/default-2021-03-09 -i $< -o $(TMPDIR)/$(<F)
+	eynollah binarization -m $(CURDIR)/$(BIN_MODELNAME) -i $< -o $(TMPDIR)/$(<F)
 	test -s $(TMPDIR)/$(<F)
 	@set -x; test "$$(identify -format '%w %h' $<)" = "$$(identify -format '%w %h' $(TMPDIR)/$(<F))"
 	# enhance:
-	eynollah enhancement -m $(CURDIR)/models_layout_v0_5_0 -sos -i $< -o $(TMPDIR) -O
+	eynollah enhancement -m $(CURDIR)/$(SEG_MODELNAME) -sos -i $< -o $(TMPDIR) -O
 	test -s $(TMPDIR)/$(<F)
 	@set -x; test "$$(identify -format '%w %h' $<)" = "$$(identify -format '%w %h' $(TMPDIR)/$(<F))"
 	$(RM) -r $(TMPDIR)
@@ -114,18 +126,18 @@ ocrd-test: tests/resources/kant_aufklaerung_1784_0020.tif
 	cp $< $(TMPDIR)
 	ocrd workspace -d $(TMPDIR) init
 	ocrd workspace -d $(TMPDIR) add -G OCR-D-IMG -g PHYS_0020 -i OCR-D-IMG_0020 $(<F)
-	ocrd-eynollah-segment -w $(TMPDIR) -I OCR-D-IMG -O OCR-D-SEG -P models $(CURDIR)/models_layout_v0_5_0
+	ocrd-eynollah-segment -w $(TMPDIR) -I OCR-D-IMG -O OCR-D-SEG -P models $(CURDIR)/$(SEG_MODELNAME)
 	result=$$(ocrd workspace -d $(TMPDIR) find -G OCR-D-SEG); \
 	fgrep -q http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15 $(TMPDIR)/$$result && \
 	fgrep -c -e TextRegion -e ImageRegion -e SeparatorRegion $(TMPDIR)/$$result
-	ocrd-sbb-binarize -w $(TMPDIR) -I OCR-D-IMG -O OCR-D-BIN -P model $(CURDIR)/default-2021-03-09
-	ocrd-sbb-binarize -w $(TMPDIR) -I OCR-D-SEG -O OCR-D-SEG-BIN -P model $(CURDIR)/default-2021-03-09 -P operation_level region
+	ocrd-sbb-binarize -w $(TMPDIR) -I OCR-D-IMG -O OCR-D-BIN -P model $(CURDIR)/$(BIN_MODELNAME)
+	ocrd-sbb-binarize -w $(TMPDIR) -I OCR-D-SEG -O OCR-D-SEG-BIN -P model $(CURDIR)/$(BIN_MODELNAME) -P operation_level region
 	$(RM) -r $(TMPDIR)
 
 # Run unit tests
-test: export MODELS_LAYOUT=$(CURDIR)/models_layout_v0_5_0
-test: export MODELS_OCR=$(CURDIR)/models_ocr_v0_5_0
-test: export MODELS_BIN=$(CURDIR)/default-2021-03-09
+test: export MODELS_LAYOUT=$(CURDIR)/$(SEG_MODELNAME)
+test: export MODELS_OCR=$(CURDIR)/$(OCR_MODELNAME)
+test: export MODELS_BIN=$(CURDIR)/$(BIN_MODELNAME)
 test:
 	$(PYTHON) -m pytest tests --durations=0 --continue-on-collection-errors $(PYTEST_ARGS)
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -51,3 +51,21 @@ where = ["src"]
 [tool.coverage.run]
 branch = true
 source = ["eynollah"]
+
+[tool.ruff]
+line-length = 120
+
+[tool.ruff.lint]
+ignore = [
+# disable unused imports
+"F401",
+# disable module-level import not at top of file
+"E402",
+# disable unused variables
+"F841",
+# disable bare except
+"E722",
+]
+
+[tool.ruff.format]
+quote-style = "preserve"
diff --git a/requirements-test.txt b/requirements-test.txt
@@ -1,4 +1,4 @@
 pytest
-pytest-subtests
+pytest-isolate
 coverage[toml]
 black
diff --git a/requirements.txt b/requirements.txt
@@ -5,5 +5,4 @@ scikit-learn >= 0.23.2
 tensorflow < 2.13
 numba <= 0.58.1
 scikit-image
-loky
 biopython
diff --git a/src/eynollah/cli.py b/src/eynollah/cli.py
@@ -202,6 +202,13 @@ def enhancement(image, out, overwrite, dir_in, model, num_col_upper, num_col_low
     type=click.Path(exists=True, file_okay=False),
     required=True,
 )
+@click.option(
+    "--model_version",
+    "-mv",
+    help="override default versions of model categories",
+    type=(str, str),
+    multiple=True,
+)
 @click.option(
     "--save_images",
     "-si",
@@ -373,7 +380,7 @@ def enhancement(image, out, overwrite, dir_in, model, num_col_upper, num_col_low
     help="Setup a basic console logger",
 )
 
-def layout(image, out, overwrite, dir_in, model, save_images, save_layout, save_deskewed, save_all, extract_only_images, save_page, enable_plotting, allow_enhancement, curved_line, textline_light, full_layout, tables, right2left, input_binary, allow_scaling, headers_off, light_version, reading_order_machine_based, do_ocr, transformer_ocr, batch_size_ocr, num_col_upper, num_col_lower, threshold_art_class_textline, threshold_art_class_layout, skip_layout_and_reading_order, ignore_page_extraction, log_level, setup_logging):
+def layout(image, out, overwrite, dir_in, model, model_version, save_images, save_layout, save_deskewed, save_all, extract_only_images, save_page, enable_plotting, allow_enhancement, curved_line, textline_light, full_layout, tables, right2left, input_binary, allow_scaling, headers_off, light_version, reading_order_machine_based, do_ocr, transformer_ocr, batch_size_ocr, num_col_upper, num_col_lower, threshold_art_class_textline, threshold_art_class_layout, skip_layout_and_reading_order, ignore_page_extraction, log_level, setup_logging):
     if setup_logging:
         console_handler = logging.StreamHandler(sys.stdout)
         console_handler.setLevel(logging.INFO)
@@ -404,6 +411,7 @@ def layout(image, out, overwrite, dir_in, model, save_images, save_layout, save_
     assert bool(image) != bool(dir_in), "Either -i (single input) or -di (directory) must be provided, but not both."
     eynollah = Eynollah(
         model,
+        model_versions=model_version,
         extract_only_images=extract_only_images,
         enable_plotting=enable_plotting,
         allow_enhancement=allow_enhancement,