Skip to content

Commit 9754ac6

Browse files
authored
Merge pull request #107 from swisstopo/LGVISIUM-100/extract_depth_ValueError_handling
Lgvisium 100/extract depth value error handling
2 parents 23cda82 + b659a85 commit 9754ac6

File tree

11 files changed

+229
-126
lines changed

11 files changed

+229
-126
lines changed

src/app/api/v1/endpoints/extract_data.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -147,7 +147,7 @@ def create_response(coord, srs):
147147
),
148148
)
149149

150-
coord_extractor = CoordinateExtractor(pdf_page)
150+
coord_extractor = CoordinateExtractor()
151151
extracted_coord = coord_extractor.extract_coordinates_from_bbox(
152152
pdf_page, extract_data_request.page_number, user_defined_bbox
153153
)

src/stratigraphy/data_extractor/data_extractor.py

Lines changed: 29 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@
1010

1111
import fitz
1212
import regex
13-
from stratigraphy.data_extractor.utility import get_lines_near_rect
1413
from stratigraphy.lines.line import TextLine
1514
from stratigraphy.util.util import read_params
1615

@@ -96,7 +95,6 @@ class DataExtractor(ABC):
9695
This class defines the interface for extracting data from stratigraphy data files.
9796
"""
9897

99-
doc: fitz.Document = None
10098
feature_keys: list[str] = None
10199
feature_fp_keys: list[str] = None
102100
feature_name: str = None
@@ -112,17 +110,15 @@ class DataExtractor(ABC):
112110

113111
preprocess_replacements: dict[str, str] = {}
114112

115-
def __init__(self, document: fitz.Document):
113+
def __init__(self):
116114
"""Initializes the DataExtractor object.
117115
118116
Args:
119-
document (fitz.Document): A PDF document.
120117
feature_name (str): The name of the feature to extract.
121118
"""
122119
if not self.feature_name:
123120
raise ValueError("Feature name must be specified.")
124121

125-
self.doc = document
126122
self.feature_keys = read_params("matching_params.yml")[f"{self.feature_name}_keys"]
127123
self.feature_fp_keys = read_params("matching_params.yml")[f"{self.feature_name}_fp_keys"] or []
128124

@@ -193,7 +189,7 @@ def get_lines_near_key(self, lines, key_line: TextLine) -> list[TextLine]:
193189
list[TextLine]: The lines close to the key.
194190
"""
195191
key_rect = key_line.rect
196-
feature_lines = self.get_lines_near_rect(lines, key_rect)
192+
feature_lines = self.get_axis_aligned_lines(lines, key_rect)
197193

198194
# Insert key_line first and remove duplicates
199195
feature_lines.insert(0, key_line)
@@ -204,21 +200,38 @@ def get_lines_near_key(self, lines, key_line: TextLine) -> list[TextLine]:
204200

205201
return feature_lines_sorted
206202

207-
def get_lines_near_rect(self, lines, rect: fitz.Rect) -> list[TextLine]:
208-
"""Find the lines of the text that are close to a given rectangle.
203+
def get_axis_aligned_lines(self, lines: list[TextLine], rect: fitz.Rect) -> list[TextLine]:
204+
"""Find the lines of text that are horizontally and vertically close to a given rectangle.
205+
206+
Lines that are found both horizontally and vertically are included only once.
209207
210208
Args:
211209
lines (list[TextLine]): Arbitrary text lines to search in.
212210
rect (fitz.Rect): The rectangle to search around.
213211
214212
Returns:
215-
list[TextLine]: The lines close to the rectangle.
213+
list[TextLine]: A combined list of lines close to the rectangle within the horizontal
214+
(left/right) and vertical (above/below) regions, with intersection included only once.
216215
"""
217-
return get_lines_near_rect(
218-
self.search_left_factor,
219-
self.search_right_factor,
220-
self.search_above_factor,
221-
self.search_below_factor,
222-
lines,
223-
rect,
216+
# Horizontal rectangle (left-right limits)
217+
horizontal_rect = fitz.Rect(
218+
rect.x0 - self.search_left_factor * rect.width,
219+
rect.y0,
220+
rect.x1 + self.search_right_factor * rect.width,
221+
rect.y1,
222+
)
223+
224+
# Vertical rectangle (above-below limits)
225+
vertical_rect = fitz.Rect(
226+
rect.x0,
227+
rect.y0 - self.search_above_factor * rect.height,
228+
rect.x1,
229+
rect.y1 + self.search_below_factor * rect.height,
224230
)
231+
232+
horizontal_lines = {line for line in lines if line.rect.intersects(horizontal_rect)}
233+
vertical_lines = {line for line in lines if line.rect.intersects(vertical_rect)}
234+
235+
feature_lines = horizontal_lines | vertical_lines
236+
237+
return list(feature_lines)

src/stratigraphy/groundwater/groundwater_extraction.py

Lines changed: 73 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -5,15 +5,15 @@
55
from dataclasses import dataclass
66
from datetime import date as dt
77
from datetime import datetime
8-
from pathlib import Path
98

109
import fitz
1110
import numpy as np
1211
from stratigraphy.data_extractor.data_extractor import DataExtractor, ExtractedFeature, FeatureOnPage
12+
from stratigraphy.data_extractor.utility import get_lines_near_rect
13+
from stratigraphy.depths_materials_column_pairs.bounding_boxes import BoundingBox
1314
from stratigraphy.groundwater.utility import extract_date, extract_depth, extract_elevation
1415
from stratigraphy.lines.line import TextLine
1516
from stratigraphy.metadata.elevation_extraction import Elevation
16-
from stratigraphy.text.extract_text import extract_text_lines
1717

1818
logger = logging.getLogger(__name__)
1919

@@ -114,30 +114,43 @@ class GroundwaterInDocument:
114114
filename: str
115115

116116
@classmethod
117-
def from_document(cls, doc: fitz.Document, terrain_elevation: Elevation | None = None) -> "GroundwaterInDocument":
118-
"""Initializes the GroundwaterInDocument object and extracts the groundwater from the document.
117+
def near_material_description(
118+
cls,
119+
document: fitz.Document,
120+
page_number: int,
121+
lines: list[TextLine],
122+
material_description_bbox: BoundingBox,
123+
terrain_elevation: Elevation | None = None,
124+
) -> list[FeatureOnPage[Groundwater]]:
125+
"""Extracts groundwater information from the text lines near a material description bounding box on a page.
119126
120127
Args:
121-
doc (fitz.Document): The PDF document.
128+
document (fitz.Document): The PDF document.
129+
page_number (int): The page number (1-based) to process.
130+
lines (list[TextLine]): The list of text lines to retrieve the groundwater from.
131+
material_description_bbox (BoundingBox): The material description box from which
122132
terrain_elevation (Elevation | None): The elevation of the terrain.
123133
124134
Returns:
125-
GroundwaterInDocument: The extracted groundwater information from the document.
135+
list[FeatureOnPage[Groundwater]]: The groundwater information near a material description bounding box.
126136
"""
127-
filename = Path(doc.name).name
128-
129-
groundwater_extractor = GroundwaterLevelExtractor(document=doc)
130-
groundwater: list[FeatureOnPage[Groundwater]] = groundwater_extractor.extract_groundwater(terrain_elevation)
131-
132-
return GroundwaterInDocument(groundwater=groundwater, filename=filename)
133-
134-
def get_groundwater_per_page(self) -> list[FeatureOnPage[Groundwater]]:
135-
"""Returns the groundwater information in the document.
137+
groundwater_extractor = GroundwaterLevelExtractor()
138+
139+
lines_for_groundwater_key = get_lines_near_rect(
140+
search_left_factor=4,
141+
search_right_factor=4,
142+
search_above_factor=2,
143+
search_below_factor=3,
144+
lines=lines,
145+
rect=material_description_bbox.rect,
146+
)
136147

137-
Returns:
138-
list[FeatureOnPage[Groundwater]]: The groundwater information in the document.
139-
"""
140-
return self.groundwater
148+
return groundwater_extractor.extract_groundwater(
149+
page_number=page_number,
150+
lines=lines_for_groundwater_key,
151+
document=document,
152+
terrain_elevation=terrain_elevation,
153+
)
141154

142155
def to_json(self) -> list[dict]:
143156
"""Converts the object to a list of dictionaries.
@@ -157,14 +170,14 @@ class GroundwaterLevelExtractor(DataExtractor):
157170

158171
# look for elevation values to the left, right and/or immediately below the key
159172
search_left_factor: float = 2
160-
search_right_factor: float = 10
161-
search_below_factor: float = 4
173+
search_right_factor: float = 8
174+
search_below_factor: float = 2
162175
search_above_factor: float = 0
163176

164177
preprocess_replacements = {",": ".", "'": ".", "o": "0", "\n": " ", "ü": "u"}
165178

166-
def __init__(self, document):
167-
super().__init__(document)
179+
def __init__(self):
180+
super().__init__()
168181

169182
self.is_searching_groundwater_illustration = os.getenv("IS_SEARCHING_GROUNDWATER_ILLUSTRATION") == "True"
170183
if self.is_searching_groundwater_illustration:
@@ -194,18 +207,14 @@ def get_groundwater_near_key(self, lines: list[TextLine], page: int) -> list[Fea
194207
key_center = (key_rect.x0 + key_rect.x1) / 2
195208
groundwater_info_lines.sort(key=lambda line: abs((line.rect.x0 + line.rect.x1) / 2 - key_center))
196209

197-
try:
198-
extracted_gw = self.get_groundwater_info_from_lines(groundwater_info_lines, page)
199-
if extracted_gw.feature.depth or extracted_gw.feature.elevation:
200-
# if the depth or elevation is extracted, add the extracted groundwater information to the list
201-
extracted_groundwater_list.append(extracted_gw)
202-
except ValueError as error:
203-
logger.warning("ValueError: %s", error)
204-
logger.warning("Could not extract groundwater information from the lines near the key.")
210+
extracted_groundwater = self.get_groundwater_info_from_lines(groundwater_info_lines, page)
211+
if extracted_groundwater:
212+
# if the depth or elevation is extracted, add the extracted groundwater information to the list
213+
extracted_groundwater_list.append(extracted_groundwater)
205214

206215
return extracted_groundwater_list
207216

208-
def get_groundwater_info_from_lines(self, lines: list[TextLine], page: int) -> FeatureOnPage[Groundwater]:
217+
def get_groundwater_info_from_lines(self, lines: list[TextLine], page: int) -> FeatureOnPage[Groundwater] | None:
209218
"""Extracts the groundwater information from a list of text lines.
210219
211220
Args:
@@ -219,7 +228,6 @@ def get_groundwater_info_from_lines(self, lines: list[TextLine], page: int) -> F
219228
elevation: float | None = None
220229

221230
matched_lines_rect = []
222-
223231
for idx, line in enumerate(lines):
224232
text = self.preprocess(line.text)
225233

@@ -303,9 +311,11 @@ def get_groundwater_info_from_lines(self, lines: list[TextLine], page: int) -> F
303311
page=page,
304312
)
305313
else:
306-
raise ValueError("Could not extract all required information from the lines provided.")
314+
logger.warning("Could not extract groundwater depth nor elevation from the lines near the key.")
307315

308-
def extract_groundwater(self, terrain_elevation: Elevation | None) -> list[FeatureOnPage[Groundwater]]:
316+
def extract_groundwater(
317+
self, page_number: int, lines: list[TextLine], document: fitz.Document, terrain_elevation: Elevation | None
318+
) -> list[FeatureOnPage[Groundwater]]:
309319
"""Extracts the groundwater information from a borehole profile.
310320
311321
Processes the borehole profile page by page and tries to find the groundwater information in the respective text of the
@@ -314,41 +324,40 @@ def extract_groundwater(self, terrain_elevation: Elevation | None) -> list[Featu
314324
1. if that gives no results, search for groundwater information close to an explicit "groundwater" label (e.g. "Gswp")
315325
316326
Args:
317-
terrain_elevation (ElevationInformation | None): The elevation of the borehole.
327+
page_number (int): The page number (1-based) of the PDF document.
328+
lines (list[TextLine]): The lines of text to extract the groundwater information from.
329+
document (fitz.Document): The document used to extract groundwater from illustration.
330+
terrain_elevation (Elevation | None): The elevation of the borehole.
318331
319332
Returns:
320333
list[FeatureOnPage[Groundwater]]: the extracted groundwater information (if any)
321334
"""
322-
for page in self.doc:
323-
lines = extract_text_lines(page)
324-
page_number = page.number + 1 # NOTE: page.number is 0-based
325-
326-
found_groundwater = self.get_groundwater_near_key(lines, page_number)
327-
if not found_groundwater and self.is_searching_groundwater_illustration:
328-
from stratigraphy.groundwater.gw_illustration_template_matching import (
329-
get_groundwater_from_illustration,
330-
)
331-
332-
# Extract groundwater from illustration
333-
found_groundwater, confidence_list = get_groundwater_from_illustration(
334-
self, lines, page_number, terrain_elevation
335-
)
336-
if found_groundwater:
337-
logger.info("Confidence list: %s", confidence_list)
338-
logger.info("Found groundwater from illustration on page %s: %s", page_number, found_groundwater)
339-
340-
if terrain_elevation:
341-
# If the elevation is provided, calculate the depth of the groundwater
342-
for entry in found_groundwater:
343-
if not entry.feature.depth and entry.feature.elevation:
344-
entry.feature.depth = round(terrain_elevation.elevation - entry.feature.elevation, 2)
345-
if not entry.feature.elevation and entry.feature.depth:
346-
entry.feature.elevation = round(terrain_elevation.elevation - entry.feature.depth, 2)
335+
found_groundwater = self.get_groundwater_near_key(lines, page_number)
336+
if not found_groundwater and self.is_searching_groundwater_illustration:
337+
from stratigraphy.groundwater.gw_illustration_template_matching import (
338+
get_groundwater_from_illustration,
339+
)
347340

341+
# Extract groundwater from illustration
342+
found_groundwater, confidence_list = get_groundwater_from_illustration(
343+
self, lines, page_number, document, terrain_elevation
344+
)
348345
if found_groundwater:
349-
groundwater_output = ", ".join([str(entry.feature) for entry in found_groundwater])
350-
logger.info("Found groundwater information on page %s: %s", page_number, groundwater_output)
351-
return found_groundwater
346+
logger.info("Confidence list: %s", confidence_list)
347+
logger.info("Found groundwater from illustration on page %s: %s", page_number, found_groundwater)
348+
349+
if terrain_elevation:
350+
# If the elevation is provided, calculate the depth of the groundwater
351+
for entry in found_groundwater:
352+
if not entry.feature.depth and entry.feature.elevation:
353+
entry.feature.depth = round(terrain_elevation.elevation - entry.feature.elevation, 2)
354+
if not entry.feature.elevation and entry.feature.depth:
355+
entry.feature.elevation = round(terrain_elevation.elevation - entry.feature.depth, 2)
356+
357+
if found_groundwater:
358+
groundwater_output = ", ".join([str(entry.feature) for entry in found_groundwater])
359+
logger.info("Found groundwater information on page %s: %s", page_number, groundwater_output)
360+
return found_groundwater
352361

353362
logger.info("No groundwater found in this borehole profile.")
354363
return []

src/stratigraphy/groundwater/gw_illustration_template_matching.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -39,14 +39,16 @@ def get_groundwater_from_illustration(
3939
groundwater_extractor: GroundwaterLevelExtractor,
4040
lines: list[TextLine],
4141
page_number: int,
42+
document: fitz.Document,
4243
terrain_elevation: Elevation | None,
4344
) -> tuple[list[FeatureOnPage[Groundwater]], list[float]]:
4445
"""Extracts the groundwater information from an illustration.
4546
4647
Args:
47-
groundwater_extractor (GroundwaterLevelExtractor): the groundwater level extractor
48-
lines (list[TextLine]): the lines of text to extract the groundwater information from
49-
page_number (int): the page number (1-based) of the PDF document
48+
groundwater_extractor (GroundwaterLevelExtractor): the groundwater level extractor.
49+
lines (list[TextLine]): The lines of text to extract the groundwater information from.
50+
page_number (int): The page number (1-based) of the PDF document.
51+
document (fitz.Document): The document whose page is rendered to search the illustration for groundwater.
5052
terrain_elevation (Elevation | None): The elevation of the terrain.
5153
5254
Returns:
@@ -57,8 +59,8 @@ def get_groundwater_from_illustration(
5759
confidence_list = []
5860

5961
# convert the doc to an image
60-
page = groundwater_extractor.doc.load_page(page_number - 1)
61-
filename = Path(groundwater_extractor.doc.name).stem
62+
page = document.load_page(page_number - 1)
63+
filename = Path(document.name).stem
6264
png_filename = f"{filename}-{page_number + 1}.png"
6365
png_path = f"/tmp/{png_filename}" # Local path to save the PNG
6466
fitz.utils.get_pixmap(page, matrix=fitz.Matrix(2, 2), clip=page.rect).save(png_path)

src/stratigraphy/groundwater/utility.py

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -30,20 +30,23 @@ def extract_depth(text: str, max_depth: int) -> float | None:
3030
depth_patterns = [
3131
r"([\d.]+)\s*m\s*u\.t\.", # e.g. "5.13 m u.T."
3232
r"([\d.]+)\s*m\s*u\.t",
33-
r"(\d+.\d+)",
33+
r"(\d+\.\d+)",
3434
]
3535

3636
depth = None
3737
corrected_text = correct_ocr_text(text).lower()
3838
for pattern in depth_patterns:
3939
depth_match = regex.search(pattern, corrected_text)
40-
if depth_match:
41-
depth = float(depth_match.group(1).replace(",", "."))
42-
if depth > max_depth:
43-
# If the extracted depth is greater than the max depth, set it to None and continue searching.
44-
depth = None
45-
else:
46-
break
40+
try:
41+
if depth_match:
42+
depth = float(depth_match.group(1).replace(",", "."))
43+
if depth > max_depth:
44+
# If the extracted depth is greater than the max depth, set it to None and continue searching.
45+
depth = None
46+
else:
47+
break
48+
except ValueError:
49+
continue
4750
return depth
4851

4952

0 commit comments

Comments
 (0)