Skip to content

Commit a5d0947

Browse files
authored
Merge pull request #103 from swisstopo/LGVISIUM-89
LGVISIUM-89: Improved clipping behavior documentation
2 parents 498a4e3 + 530702c commit a5d0947

File tree

6 files changed

+417
-35
lines changed

6 files changed

+417
-35
lines changed

example/clipping_test-1.png

96.4 KB
Loading

example/clipping_test.pdf

37.8 KB
Binary file not shown.

src/app/api/v1/router.py

Lines changed: 55 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,30 @@ class BadRequestResponse(BaseModel):
3838
},
3939
)
4040
def post_create_pngs(request: PNGRequest) -> PNGResponse:
41-
"""Create PNGs from the given data."""
41+
"""Create PNG images from a PDF stored in the S3 bucket.
42+
43+
This endpoint generates PNG images from each page of a specified PDF document stored in the AWS S3 bucket.
44+
The PDF file must be accessible in the bucket with a valid filename provided in the request.
45+
46+
### Request Body
47+
- **request** (`PNGRequest`): Contains the `filename` of the PDF document in the S3 bucket from which PNGs
48+
should be generated.
49+
50+
### Returns
51+
- **PNGResponse**: Response containing a list of keys (filenames) for the generated PNG images stored in the
52+
S3 bucket.
53+
54+
### Status Codes
55+
- **200 OK**: PNG images were successfully created and stored in the S3 bucket.
56+
- **400 Bad Request**: The request format or content is invalid. Verify that `filename` is correctly specified.
57+
- **404 Not Found**: PDF file not found in S3 bucket.
58+
- **500 Internal Server Error**: An error occurred on the server while creating PNGs.
59+
60+
### Additional Information
61+
- The endpoint connects to AWS S3 to retrieve the specified PDF, converts its pages to PNGs, and stores
62+
the generated images back in S3. Ensure the PDF file exists in the S3 bucket and is accessible before
63+
making a request.
64+
"""
4265
return create_pngs(request.filename)
4366

4467

@@ -58,7 +81,37 @@ def post_create_pngs(request: PNGRequest) -> PNGResponse:
5881
def post_extract_data(
5982
extract_data_request: ExtractDataRequest,
6083
) -> ExtractCoordinatesResponse | ExtractTextResponse | ExtractNumberResponse:
61-
"""Extract data from the given PNGs."""
84+
"""Extract specified data from a given document based on the bounding box coordinates and format.
85+
86+
Data is extracted from the specified bounding box on a per-letter
87+
basis, which means that as soon as the specified bounding box overlaps (partially or fully) with a letter
88+
or number, then this character is added to the extracted text. This behavior is consistent with the
89+
clipping behavior of the `PyMuPDF` library.
90+
91+
### Prerequisites
92+
Ensure that the PDF file has been processed by the create_pngs endpoint first.
93+
94+
### Request Body
95+
- **extract_data_request**: Instance of `ExtractDataRequest`, containing file details, page number, bounding
96+
box, and data format. The bounding box in PNG coordinates helps locate the region to extract data from.
97+
98+
### Returns
99+
The endpoint responds with one of the following response models based on the extracted data:
100+
- **ExtractCoordinatesResponse**: If geographic coordinates are extracted.
101+
- **ExtractTextResponse**: If text content is extracted.
102+
- **ExtractNumberResponse**: If numerical data is extracted.
103+
104+
### Status Codes
105+
- **200 OK**: Successful extraction, returning the specified data type.
106+
- **400 Bad Request**: Input request was invalid, typically due to misformatted or missing parameters.
107+
- **404 Not Found**: Requested data could not be found within the specified bounding box or page.
108+
- **500 Internal Server Error**: An error occurred on the server side during data extraction.
109+
110+
### Error Handling
111+
Known `ValueError`s (e.g., invalid input data) result in a `400 Bad Request` response with a relevant error
112+
message.
113+
For other errors, the endpoint returns a `500 Internal Server Error`.
114+
"""
62115
try:
63116
# Extract the data based on the request
64117
response = extract_data(extract_data_request)

src/app/common/schemas.py

Lines changed: 176 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,17 @@
1717

1818

1919
def validate_filename(value: str) -> str:
20-
"""Ensure the filename is not empty."""
20+
"""Ensure the filename is not empty.
21+
22+
Args:
23+
value (str): The filename to validate.
24+
25+
Returns:
26+
str: The validated filename.
27+
28+
Raises:
29+
ValueError: If the filename is empty.
30+
"""
2131
if value == "":
2232
raise ValueError("Filename must not be empty.")
2333
return value
@@ -31,18 +41,28 @@ class PNGRequest(BaseModel):
3141
@field_validator("filename", mode="before")
3242
@classmethod
3343
def validate_filename(cls, value: str) -> str:
44+
"""Ensure the filename is not empty."""
3445
return validate_filename(value)
3546

3647
class Config:
3748
"""Allow using non-standard types like Path."""
3849

39-
arbitrary_types_allowed = True # This allows using non-standard types like Path
50+
arbitrary_types_allowed: bool = True # This allows using non-standard types like Path
4051

4152

4253
class PNGResponse(BaseModel):
43-
"""Response schema for the create_pngs endpoint."""
54+
"""Response schema for the `create_pngs` endpoint, representing the output of PNG file creation and storage.
4455
45-
keys: list[str] # keys in the S3 bucket
56+
This schema lists the keys (identifiers) of the created PNG files stored in an S3 bucket,
57+
enabling users to retrieve or reference them as needed.
58+
"""
59+
60+
keys: list[str] = Field(
61+
...,
62+
description="""List of unique identifiers (keys) for the generated PNG files stored in the S3 bucket. Each key
63+
allows access to a specific file within the bucket.""",
64+
example=["dataextraction/file1-1.png", "dataextraction/file1-2.png", "dataextraction/file1-3.png"],
65+
)
4666

4767

4868
########################################################################################################################
@@ -59,19 +79,44 @@ class FormatTypes(str, Enum):
5979

6080

6181
class BoundingBox(BaseModel):
62-
"""Bounding box schema."""
82+
"""Bounding box schema for defining a rectangular area within an image.
6383
64-
x0: float = Field(..., example=0.0)
65-
y0: float = Field(..., example=0.0)
66-
x1: float = Field(..., example=100.0)
67-
y1: float = Field(..., example=100.0)
84+
This schema represents the coordinates of the box’s corners, which can be used
85+
to specify an area of interest in image processing tasks. Coordinates are
86+
defined with the origin at the top-left of the image. Coordinates are in pixels.
87+
"""
88+
89+
x0: float = Field(
90+
...,
91+
description="""The x-coordinate of the top-left corner of the bounding box. This value marks the
92+
horizontal starting point of the box.""",
93+
example=0.0,
94+
)
95+
y0: float = Field(
96+
...,
97+
description="""The y-coordinate of the top-left corner of the bounding box. This value marks the vertical
98+
starting point of the box.""",
99+
example=0.0,
100+
)
101+
x1: float = Field(
102+
...,
103+
description="""The x-coordinate of the bottom-right corner of the bounding box. This value marks the
104+
horizontal endpoint of the box.""",
105+
example=100.0,
106+
)
107+
y1: float = Field(
108+
...,
109+
description="""The y-coordinate of the bottom-right corner of the bounding box. This value marks the vertical
110+
endpoint of the box.""",
111+
example=100.0,
112+
)
68113

69114
@field_validator("x0", "y0", "x1", "y1")
70115
@classmethod
71-
def page_number_must_be_positive(cls, v: int) -> int:
72-
"""Validate that the page number is positive."""
116+
def bbox_corners_must_be_positive(cls, v: int) -> int:
117+
"""Validate that the bounding box coordinates are non-negative."""
73118
if v < 0.0:
74-
raise ValueError("Bounding box coordinate must be a positive integer")
119+
raise ValueError("Bounding box coordinates must be positive")
75120
return v
76121

77122
def rescale(
@@ -125,28 +170,96 @@ def load_from_fitz_rect(rect: fitz.Rect) -> "BoundingBox":
125170

126171

127172
class Coordinates(BaseModel):
128-
"""Coordinates schema."""
173+
"""Coordinates schema for representing geographical data points.
129174
130-
east: float = Field(..., example=1.0)
131-
north: float = Field(..., example=2.0)
132-
projection: str = Field(..., example="LV95")
175+
This schema defines the format for specifying location data using east/north coordinates
176+
along with the projection system used.
177+
"""
178+
179+
east: float = Field(
180+
...,
181+
description="""Easting coordinate. The value should be in the units of the specified projection system.""",
182+
example=1.0,
183+
)
184+
north: float = Field(
185+
...,
186+
description="""Northing coordinate. The value should be in the units of the specified projection system.""",
187+
example=2.0,
188+
)
189+
projection: str = Field(
190+
...,
191+
description="""Projection system used to reference the coordinates. This defines the coordinate reference
192+
system, such as 'LV95' for Swiss coordinate systems.""",
193+
example="LV95",
194+
)
133195

134196

135197
class ExtractDataRequest(ABC, BaseModel):
136-
"""Request schema for the extract_data endpoint.
198+
"""Request schema for the `extract_data` endpoint.
199+
200+
**Requirements:**
201+
Before using this schema, ensure that the PDF file has been processed by the create_pngs endpoint first.
202+
203+
**Coordinate Systems:**
204+
- **PNG coordinates:** Pixels are measured from the top-left corner (0, 0), where x increases rightward
205+
and y downward.
206+
207+
### Fields
208+
Each field below includes inline examples to aid users in creating requests. See `json_schema_extra`
209+
for a complete example.
210+
211+
**Attributes:**
212+
- **filename** (`Path`): Path to the PDF file. _Example_: `"document.pdf"`
213+
- **page_number** (`int`): Target page for data extraction. This is a 1-based index. _Example_: `1`
214+
- **bbox** (`BoundingBox`): Bounding box for the extraction area, in PNG coordinates. Origin is the
215+
top-left, with x increasing rightward and y increasing downward.
216+
- Example format: `{"x0": 0.0, "y0": 0.0, "x1": 100.0, "y1": 100.0}`
217+
- **format** (`FormatTypes`): Specifies the expected format for extracted data, e.g., `"coordinates"`.
218+
219+
### Validation
220+
Custom validators ensure data integrity:
221+
- **Filename Validator:** Ensures filename is not empty.
222+
- **Page Number Validator:** Confirms page number is positive.
223+
- **Format Validator:** Checks format is valid as per `FormatTypes`.
224+
225+
The bounding box should be provided in PNG coordinates.
137226
138227
Each field in the Pydantic model can have an example parameter, which provides an inline
139228
example for that specific field.
140229
"""
141230

142-
filename: Path = Field(..., example=Path("document.png"))
143-
page_number: int = Field(..., example=1) # 1-based index
144-
bbox: BoundingBox = Field(..., example={"x0": 0.0, "y0": 0.0, "x1": 100.0, "y1": 100.0})
145-
format: FormatTypes = Field(..., example=FormatTypes.COORDINATES.value)
231+
filename: Path = Field(
232+
...,
233+
description="""Path to the input PDF document file that contains the data to be extracted. This should be
234+
a valid file path, and the file should be accessible to the API.""",
235+
example=Path("document.pdf"),
236+
)
237+
page_number: int = Field(
238+
...,
239+
description="""Page number within the document where the extraction is to be performed. This is a 1-based
240+
index (e.g., 1 for the first page), applicable for multi-page files like PDFs.""",
241+
example=1,
242+
)
243+
bbox: BoundingBox = Field(
244+
...,
245+
description="""Bounding box defining the area for data extraction within the PNG version of the specified
246+
PDF file. The box is specified in pixels with the top-left as the origin (0,0), where x increases to the
247+
right and y increases downward. This box should be provided in PNG coordinates, and any
248+
transformations to PDF coordinates are managed internally.
249+
""",
250+
example={"x0": 0.0, "y0": 0.0, "x1": 100.0, "y1": 100.0},
251+
)
252+
format: FormatTypes = Field(
253+
...,
254+
description="""Specifies the desired format for extracted data, allowing for options like `coordinates` or
255+
other defined `FormatTypes` values. This dictates the structure of the output returned by the API.""",
256+
example=FormatTypes.COORDINATES.value,
257+
)
146258

147259
@field_validator("filename", mode="before")
148260
@classmethod
149261
def validate_filename(cls, value: str) -> str:
262+
"""Ensure the filename is not empty."""
150263
return validate_filename(value)
151264

152265
@field_validator("page_number")
@@ -183,40 +296,73 @@ class Config:
183296

184297

185298
class ExtractDataResponse(ABC, BaseModel):
186-
"""Response schema for the extract_data endpoint."""
299+
"""Base response schema for the `extract_data` endpoint, representing the extracted data's bounding box.
300+
301+
This abstract base class provides a bounding box field for data localization and an abstract property
302+
`response_type` to be implemented by subclasses, indicating the type of extracted content.
303+
"""
187304

188-
bbox: BoundingBox = Field(..., example={"x0": 0.0, "y0": 0.0, "x1": 100.0, "y1": 100.0})
305+
bbox: BoundingBox = Field(
306+
...,
307+
description="""Bounding box coordinates that define the area within the document where data was extracted.
308+
The box is specified in PNG coordinates, with the origin at the top-left corner (0,0).""",
309+
example={"x0": 0.0, "y0": 0.0, "x1": 100.0, "y1": 100.0},
310+
)
189311

190312
@property
191313
@abstractmethod
192314
def response_type(self):
193-
"""Abstract property to be implemented by subclasses to define response type."""
315+
"""Abstract property to be implemented by subclasses to define the type of response content."""
194316

195317

196318
class ExtractCoordinatesResponse(ExtractDataResponse):
197-
"""Response schema for the extract_data endpoint."""
319+
"""Response schema for the `extract_data` endpoint when returning geographic coordinates.
198320
199-
coordinates: Coordinates = Field(..., example={"east": 1.0, "north": 2.0, "page": 1, "projection": "LV95"})
321+
This schema includes a `coordinates` field with east/north values and projection information.
322+
"""
323+
324+
coordinates: Coordinates = Field(
325+
...,
326+
description="""Geographical coordinates extracted from the document, including east and north values,
327+
and projection type.""",
328+
example={"east": 1.0, "north": 2.0, "projection": "LV95"},
329+
)
200330

201331
@property
202332
def response_type(self):
203333
return "coordinates"
204334

205335

206336
class ExtractTextResponse(ExtractDataResponse):
207-
"""Response schema for the extract_data endpoint."""
337+
"""Response schema for the `extract_data` endpoint when returning extracted text content.
338+
339+
This schema includes a `text` field with the extracted textual content from the specified bounding box.
340+
"""
208341

209-
text: str = Field(..., example="text")
342+
text: str = Field(
343+
...,
344+
description="""Text content extracted from the specified bounding box within the document.""",
345+
example="text",
346+
)
210347

211348
@property
212349
def response_type(self):
213350
return "text"
214351

215352

216353
class ExtractNumberResponse(ExtractDataResponse):
217-
"""Response schema for the extract_data endpoint."""
354+
"""Response schema for the `extract_data` endpoint when returning numerical data.
355+
356+
This schema includes a `number` field for extracted numeric content, such as measurements or other
357+
quantitative data.
358+
"""
218359

219-
number: float = Field(..., example=1.0)
360+
number: float = Field(
361+
...,
362+
description="""Numeric value extracted from the specified bounding box within the document, representing a
363+
measurement or quantitative data.""",
364+
example=1.0,
365+
)
220366

221367
@property
222368
def response_type(self):

0 commit comments

Comments
 (0)