5
5
from dataclasses import dataclass
6
6
from datetime import date as dt
7
7
from datetime import datetime
8
- from pathlib import Path
9
8
10
9
import fitz
11
10
import numpy as np
12
11
from stratigraphy .data_extractor .data_extractor import DataExtractor , ExtractedFeature , FeatureOnPage
12
+ from stratigraphy .data_extractor .utility import get_lines_near_rect
13
+ from stratigraphy .depths_materials_column_pairs .bounding_boxes import BoundingBox
13
14
from stratigraphy .groundwater .utility import extract_date , extract_depth , extract_elevation
14
15
from stratigraphy .lines .line import TextLine
15
16
from stratigraphy .metadata .elevation_extraction import Elevation
16
- from stratigraphy .text .extract_text import extract_text_lines
17
17
18
18
logger = logging .getLogger (__name__ )
19
19
@@ -114,30 +114,43 @@ class GroundwaterInDocument:
114
114
filename : str
115
115
116
116
@classmethod
def near_material_description(
    cls,
    document: fitz.Document,
    page_number: int,
    lines: list[TextLine],
    material_description_bbox: BoundingBox,
    terrain_elevation: Elevation | None = None,
) -> list[FeatureOnPage[Groundwater]]:
    """Extracts groundwater information from the text lines surrounding a material description box.

    Only the lines that ``get_lines_near_rect`` selects around the bounding box are handed to a freshly
    created ``GroundwaterLevelExtractor``.

    Args:
        document (fitz.Document): The PDF document; forwarded unchanged to the extractor.
        page_number (int): The page number (1-based) to process.
        lines (list[TextLine]): The candidate text lines of the page.
        material_description_bbox (BoundingBox): The material description box whose surroundings are
            searched for groundwater information.
        terrain_elevation (Elevation | None): The elevation of the terrain, if known.

    Returns:
        list[FeatureOnPage[Groundwater]]: The groundwater information found near the bounding box.
    """
    extractor = GroundwaterLevelExtractor()

    # Search factors handed to get_lines_near_rect for each side of the bounding box.
    search_window = {
        "search_left_factor": 4,
        "search_right_factor": 4,
        "search_above_factor": 2,
        "search_below_factor": 3,
    }
    nearby_lines = get_lines_near_rect(
        lines=lines,
        rect=material_description_bbox.rect,
        **search_window,
    )

    return extractor.extract_groundwater(
        page_number=page_number,
        lines=nearby_lines,
        document=document,
        terrain_elevation=terrain_elevation,
    )
141
154
142
155
def to_json (self ) -> list [dict ]:
143
156
"""Converts the object to a list of dictionaries.
@@ -157,14 +170,14 @@ class GroundwaterLevelExtractor(DataExtractor):
157
170
158
171
# look for elevation values to the left, right and/or immediately below the key
159
172
search_left_factor : float = 2
160
- search_right_factor : float = 10
161
- search_below_factor : float = 4
173
+ search_right_factor : float = 8
174
+ search_below_factor : float = 2
162
175
search_above_factor : float = 0
163
176
164
177
preprocess_replacements = {"," : "." , "'" : "." , "o" : "0" , "\n " : " " , "ü" : "u" }
165
178
166
- def __init__ (self , document ):
167
- super ().__init__ (document )
179
+ def __init__ (self ):
180
+ super ().__init__ ()
168
181
169
182
self .is_searching_groundwater_illustration = os .getenv ("IS_SEARCHING_GROUNDWATER_ILLUSTRATION" ) == "True"
170
183
if self .is_searching_groundwater_illustration :
@@ -194,18 +207,14 @@ def get_groundwater_near_key(self, lines: list[TextLine], page: int) -> list[Fea
194
207
key_center = (key_rect .x0 + key_rect .x1 ) / 2
195
208
groundwater_info_lines .sort (key = lambda line : abs ((line .rect .x0 + line .rect .x1 ) / 2 - key_center ))
196
209
197
- try :
198
- extracted_gw = self .get_groundwater_info_from_lines (groundwater_info_lines , page )
199
- if extracted_gw .feature .depth or extracted_gw .feature .elevation :
200
- # if the depth or elevation is extracted, add the extracted groundwater information to the list
201
- extracted_groundwater_list .append (extracted_gw )
202
- except ValueError as error :
203
- logger .warning ("ValueError: %s" , error )
204
- logger .warning ("Could not extract groundwater information from the lines near the key." )
210
+ extracted_groundwater = self .get_groundwater_info_from_lines (groundwater_info_lines , page )
211
+ if extracted_groundwater :
212
+ # if the depth or elevation is extracted, add the extracted groundwater information to the list
213
+ extracted_groundwater_list .append (extracted_groundwater )
205
214
206
215
return extracted_groundwater_list
207
216
208
- def get_groundwater_info_from_lines (self , lines : list [TextLine ], page : int ) -> FeatureOnPage [Groundwater ]:
217
+ def get_groundwater_info_from_lines (self , lines : list [TextLine ], page : int ) -> FeatureOnPage [Groundwater ] | None :
209
218
"""Extracts the groundwater information from a list of text lines.
210
219
211
220
Args:
@@ -219,7 +228,6 @@ def get_groundwater_info_from_lines(self, lines: list[TextLine], page: int) -> F
219
228
elevation : float | None = None
220
229
221
230
matched_lines_rect = []
222
-
223
231
for idx , line in enumerate (lines ):
224
232
text = self .preprocess (line .text )
225
233
@@ -303,9 +311,11 @@ def get_groundwater_info_from_lines(self, lines: list[TextLine], page: int) -> F
303
311
page = page ,
304
312
)
305
313
else :
306
- raise ValueError ("Could not extract all required information from the lines provided ." )
314
+ logger . warning ("Could not extract groundwater depth nor elevation from the lines near the key ." )
307
315
308
def extract_groundwater(
    self, page_number: int, lines: list[TextLine], document: fitz.Document, terrain_elevation: Elevation | None
) -> list[FeatureOnPage[Groundwater]]:
    """Extracts the groundwater information from the text lines of a single page.

    The lines are first searched with ``get_groundwater_near_key``. If that gives no result and
    ``is_searching_groundwater_illustration`` is set, ``get_groundwater_from_illustration`` is tried
    instead. When a terrain elevation is given, a missing depth is derived from the elevation and a
    missing elevation is derived from the depth.

    Args:
        page_number (int): The page number (1-based) of the PDF document.
        lines (list[TextLine]): The lines of text to extract the groundwater information from.
        document (fitz.Document): The document, passed on to the illustration-based extraction.
        terrain_elevation (Elevation | None): The elevation of the borehole.

    Returns:
        list[FeatureOnPage[Groundwater]]: The extracted groundwater entries, or an empty list if none
            were found.
    """
    found_groundwater = self.get_groundwater_near_key(lines, page_number)

    if not found_groundwater and self.is_searching_groundwater_illustration:
        # Kept as a function-scope import, as in the original; presumably so the template-matching
        # module is only loaded when the illustration search is enabled -- TODO confirm.
        from stratigraphy.groundwater.gw_illustration_template_matching import (
            get_groundwater_from_illustration,
        )

        # Extract groundwater from illustration
        found_groundwater, confidence_list = get_groundwater_from_illustration(
            self, lines, page_number, document, terrain_elevation
        )
        if found_groundwater:
            logger.info("Confidence list: %s", confidence_list)
            logger.info("Found groundwater from illustration on page %s: %s", page_number, found_groundwater)

    if not found_groundwater:
        logger.info("No groundwater found in this borehole profile.")
        return []

    if terrain_elevation:
        # Fill in whichever of depth / elevation is missing. The checks compare against None on
        # purpose: a value of 0.0 is a real measurement and must neither be overwritten nor be
        # treated as "not available".
        for entry in found_groundwater:
            feature = entry.feature
            if feature.depth is None and feature.elevation is not None:
                feature.depth = round(terrain_elevation.elevation - feature.elevation, 2)
            if feature.elevation is None and feature.depth is not None:
                feature.elevation = round(terrain_elevation.elevation - feature.depth, 2)

    groundwater_output = ", ".join([str(entry.feature) for entry in found_groundwater])
    logger.info("Found groundwater information on page %s: %s", page_number, groundwater_output)
    return found_groundwater
0 commit comments