17
17
18
18
19
19
def validate_filename(value: str) -> str:
    """Check that a filename is not the empty string.

    Args:
        value (str): The filename to validate.

    Returns:
        str: The same filename, unchanged, when it is not empty.

    Raises:
        ValueError: If the filename is the empty string.
    """
    # Only the exact empty string is rejected; any other value passes through untouched.
    if value != "":
        return value
    raise ValueError("Filename must not be empty.")
@@ -31,18 +41,28 @@ class PNGRequest(BaseModel):
31
41
@field_validator("filename", mode="before")
@classmethod
def validate_filename(cls, value: str) -> str:
    """Ensure the filename is not empty.

    Registered for the ``filename`` field with ``mode="before"`` and delegates the
    check to the module-level ``validate_filename`` function.

    Args:
        value (str): The filename to validate.

    Returns:
        str: The value returned by the module-level ``validate_filename``.
    """
    return validate_filename(value)
35
46
36
47
class Config:
    """Model configuration that allows non-standard field types such as Path."""

    arbitrary_types_allowed: bool = True  # This allows using non-standard types like Path
40
51
41
52
42
53
class PNGResponse(BaseModel):
    """Response schema for the `create_pngs` endpoint, representing the output of PNG file creation and storage.

    This schema lists the keys (identifiers) of the created PNG files stored in an S3 bucket,
    enabling users to retrieve or reference them as needed.
    """

    keys: list[str] = Field(
        ...,
        description="""List of unique identifiers (keys) for the generated PNG files stored in the S3 bucket. Each key
        allows access to a specific file within the bucket.""",
        # Extra keyword arguments on Field (such as `example=`) are deprecated in pydantic v2;
        # json_schema_extra puts the same "example" key into the generated JSON schema.
        json_schema_extra={
            "example": ["dataextraction/file1-1.png", "dataextraction/file1-2.png", "dataextraction/file1-3.png"]
        },
    )
46
66
47
67
48
68
########################################################################################################################
@@ -59,19 +79,44 @@ class FormatTypes(str, Enum):
59
79
60
80
61
81
class BoundingBox (BaseModel ):
62
- """Bounding box schema."""
82
+ """Bounding box schema for defining a rectangular area within an image.
63
83
64
- x0 : float = Field (..., example = 0.0 )
65
- y0 : float = Field (..., example = 0.0 )
66
- x1 : float = Field (..., example = 100.0 )
67
- y1 : float = Field (..., example = 100.0 )
84
+ This schema represents the coordinates of the box’s corners, which can be used
85
+ to specify an area of interest in image processing tasks. Coordinates are
86
+ defined with the origin at the top-left of the image. Coordinates are in pixels.
87
+ """
88
+
89
+ x0 : float = Field (
90
+ ...,
91
+ description = """The x-coordinate of the top-left corner of the bounding box. This value marks the
92
+ horizontal starting point of the box.""" ,
93
+ example = 0.0 ,
94
+ )
95
+ y0 : float = Field (
96
+ ...,
97
+ description = """The y-coordinate of the top-left corner of the bounding box. This value marks the vertical
98
+ starting point of the box.""" ,
99
+ example = 0.0 ,
100
+ )
101
+ x1 : float = Field (
102
+ ...,
103
+ description = """The x-coordinate of the bottom-right corner of the bounding box. This value marks the
104
+ horizontal endpoint of the box.""" ,
105
+ example = 100.0 ,
106
+ )
107
+ y1 : float = Field (
108
+ ...,
109
+ description = """The y-coordinate of the bottom-right corner of the bounding box. This value marks the vertical
110
+ endpoint of the box.""" ,
111
+ example = 100.0 ,
112
+ )
68
113
69
114
@field_validator("x0", "y0", "x1", "y1")
@classmethod
def bbox_corners_must_be_positive(cls, v: float) -> float:
    """Validate that a bounding box coordinate is not negative.

    Args:
        v (float): Value of one of the `x0`, `y0`, `x1`, `y1` fields.

    Returns:
        float: The coordinate, unchanged.

    Raises:
        ValueError: If the coordinate is below zero. Zero itself is accepted.
    """
    if v < 0.0:
        raise ValueError("Bounding box coordinates must be positive")
    return v
76
121
77
122
def rescale (
@@ -125,28 +170,96 @@ def load_from_fitz_rect(rect: fitz.Rect) -> "BoundingBox":
125
170
126
171
127
172
class Coordinates(BaseModel):
    """Coordinates schema for representing geographical data points.

    This schema defines the format for specifying location data using east/north coordinates
    along with the projection system used.
    """

    # Examples are supplied through json_schema_extra because extra keyword arguments on Field
    # (such as `example=`) are deprecated in pydantic v2; the generated schema key is unchanged.
    east: float = Field(
        ...,
        description="""Easting coordinate. The value should be in the units of the specified projection system.""",
        json_schema_extra={"example": 1.0},
    )
    north: float = Field(
        ...,
        description="""Northing coordinate. The value should be in the units of the specified projection system.""",
        json_schema_extra={"example": 2.0},
    )
    projection: str = Field(
        ...,
        description="""Projection system used to reference the coordinates. This defines the coordinate reference
        system, such as 'LV95' for Swiss coordinate systems.""",
        json_schema_extra={"example": "LV95"},
    )
133
195
134
196
135
197
class ExtractDataRequest (ABC , BaseModel ):
136
- """Request schema for the extract_data endpoint.
198
+ """Request schema for the `extract_data` endpoint.
199
+
200
+ **Requirements:**
201
+ Before using this schema, ensure that the PDF file has been processed by the create_pngs endpoint first.
202
+
203
+ **Coordinate Systems:**
204
+ - **PNG coordinates:** Pixels are measured from the top-left corner (0, 0), where x increases rightward
205
+ and y downward.
206
+
207
+ ### Fields
208
+ Each field below includes inline examples to aid users in creating requests. See `json_schema_extra`
209
+ for a complete example.
210
+
211
+ **Attributes:**
212
+ - **filename** (`Path`): Path to the PDF file. _Example_: `"document.pdf"`
213
+ - **page_number** (`int`): Target page for data extraction. This is a 1-based index. _Example_: `1`
214
+ - **bbox** (`BoundingBox`): Bounding box for the extraction area, in PNG coordinates. Origin is the
215
+ top-left, with x increasing rightward and y increasing downward.
216
+ - Example format: `{"x0": 0.0, "y0": 0.0, "x1": 100.0, "y1": 100.0}`
217
+ - **format** (`FormatTypes`): Specifies the expected format for extracted data, e.g., `"coordinates"`.
218
+
219
+ ### Validation
220
+ Custom validators ensure data integrity:
221
+ - **Filename Validator:** Ensures filename is not empty.
222
+ - **Page Number Validator:** Confirms page number is positive.
223
+ - **Format Validator:** Checks format is valid as per `FormatTypes`.
224
+
225
+ The bounding box should be provided in PNG coordinates.
137
226
138
227
Each field in the Pydantic model can have an example parameter, which provides an inline
139
228
example for that specific field.
140
229
"""
141
230
142
- filename : Path = Field (..., example = Path ("document.png" ))
143
- page_number : int = Field (..., example = 1 ) # 1-based index
144
- bbox : BoundingBox = Field (..., example = {"x0" : 0.0 , "y0" : 0.0 , "x1" : 100.0 , "y1" : 100.0 })
145
- format : FormatTypes = Field (..., example = FormatTypes .COORDINATES .value )
231
+ filename : Path = Field (
232
+ ...,
233
+ description = """Path to the input PDF document file that contains the data to be extracted. This should be
234
+ a valid file path, and the file should be accessible to the API.""" ,
235
+ example = Path ("document.pdf" ),
236
+ )
237
+ page_number : int = Field (
238
+ ...,
239
+ description = """Page number within the document where the extraction is to be performed. This is a 1-based
240
+ index (e.g., 1 for the first page), applicable for multi-page files like PDFs.""" ,
241
+ example = 1 ,
242
+ )
243
+ bbox : BoundingBox = Field (
244
+ ...,
245
+ description = """Bounding box defining the area for data extraction within the PNG version of the specified
246
+ PDF file. The box is specified in pixels with the top-left as the origin (0,0), where x increases to the
247
+ right and y increases downward. This box should be provided in PNG coordinates, and any
248
+ transformations to PDF coordinates are managed internally.
249
+ """ ,
250
+ example = {"x0" : 0.0 , "y0" : 0.0 , "x1" : 100.0 , "y1" : 100.0 },
251
+ )
252
+ format : FormatTypes = Field (
253
+ ...,
254
+ description = """Specifies the desired format for extracted data, allowing for options like `coordinates` or
255
+ other defined `FormatTypes` values. This dictates the structure of the output returned by the API.""" ,
256
+ example = FormatTypes .COORDINATES .value ,
257
+ )
146
258
147
259
@field_validator("filename", mode="before")
@classmethod
def validate_filename(cls, value: str) -> str:
    """Ensure the filename is not empty.

    Registered for the ``filename`` field with ``mode="before"`` and delegates the
    check to the module-level ``validate_filename`` function.

    Args:
        value (str): The filename to validate.

    Returns:
        str: The value returned by the module-level ``validate_filename``.
    """
    return validate_filename(value)
151
264
152
265
@field_validator ("page_number" )
@@ -183,40 +296,73 @@ class Config:
183
296
184
297
185
298
class ExtractDataResponse(ABC, BaseModel):
    """Base response schema for the `extract_data` endpoint, representing the extracted data's bounding box.

    This abstract base class provides a bounding box field for data localization and an abstract property
    `response_type` to be implemented by subclasses, indicating the type of extracted content.
    """

    bbox: BoundingBox = Field(
        ...,
        description="""Bounding box coordinates that define the area within the document where data was extracted.
        The box is specified in PNG coordinates, with the origin at the top-left corner (0,0).""",
        # Extra keyword arguments on Field (such as `example=`) are deprecated in pydantic v2;
        # json_schema_extra puts the same "example" key into the generated JSON schema.
        json_schema_extra={"example": {"x0": 0.0, "y0": 0.0, "x1": 100.0, "y1": 100.0}},
    )

    @property
    @abstractmethod
    def response_type(self) -> str:
        """Abstract property to be implemented by subclasses to define the type of response content."""
194
316
195
317
196
318
class ExtractCoordinatesResponse(ExtractDataResponse):
    """Response schema for the `extract_data` endpoint when returning geographic coordinates.

    This schema includes a `coordinates` field with east/north values and projection information.
    """

    coordinates: Coordinates = Field(
        ...,
        description="""Geographical coordinates extracted from the document, including east and north values,
        and projection type.""",
        # Extra keyword arguments on Field (such as `example=`) are deprecated in pydantic v2;
        # json_schema_extra puts the same "example" key into the generated JSON schema.
        json_schema_extra={"example": {"east": 1.0, "north": 2.0, "projection": "LV95"}},
    )

    @property
    def response_type(self) -> str:
        """Return the response type identifier, always ``"coordinates"``."""
        return "coordinates"
204
334
205
335
206
336
class ExtractTextResponse(ExtractDataResponse):
    """Response schema for the `extract_data` endpoint when returning extracted text content.

    This schema includes a `text` field with the extracted textual content from the specified bounding box.
    """

    text: str = Field(
        ...,
        description="""Text content extracted from the specified bounding box within the document.""",
        # Extra keyword arguments on Field (such as `example=`) are deprecated in pydantic v2;
        # json_schema_extra puts the same "example" key into the generated JSON schema.
        json_schema_extra={"example": "text"},
    )

    @property
    def response_type(self) -> str:
        """Return the response type identifier, always ``"text"``."""
        return "text"
214
351
215
352
216
353
class ExtractNumberResponse (ExtractDataResponse ):
217
- """Response schema for the extract_data endpoint."""
354
+ """Response schema for the `extract_data` endpoint when returning numerical data.
355
+
356
+ This schema includes a `number` field for extracted numeric content, such as measurements or other
357
+ quantitative data.
358
+ """
218
359
219
- number : float = Field (..., example = 1.0 )
360
+ number : float = Field (
361
+ ...,
362
+ description = """Numeric value extracted from the specified bounding box within the document, representing a
363
+ measurement or quantitative data.""" ,
364
+ example = 1.0 ,
365
+ )
220
366
221
367
@property
222
368
def response_type (self ):
0 commit comments