swisstopo
diff --git a/‎example/clipping_test-1.png‎
96.4 KB b/‎example/clipping_test-1.png‎
96.4 KB
diff --git a/‎example/clipping_test.pdf‎
37.8 KB b/‎example/clipping_test.pdf‎
37.8 KB
diff --git a/‎src/app/api/v1/router.py‎
Lines changed: 55 additions & 2 deletions b/‎src/app/api/v1/router.py‎
Lines changed: 55 additions & 2 deletions
diff --git a/‎src/app/common/schemas.py‎
Lines changed: 176 additions & 30 deletions b/‎src/app/common/schemas.py‎
Lines changed: 176 additions & 30 deletions
@@ -38,7 +38,30 @@ class BadRequestResponse(BaseModel):
     },
 )
 def post_create_pngs(request: PNGRequest) -> PNGResponse:
-    """Create PNGs from the given data."""
+    """Create PNG images from a PDF stored in the S3 bucket.
+
+    This endpoint generates PNG images from each page of a specified PDF document stored in the AWS S3 bucket.
+    The PDF file must be accessible in the bucket with a valid filename provided in the request.
+
+    ### Request Body
+    - **request** (`PNGRequest`): Contains the `filename` of the PDF document in the S3 bucket from which PNGs
+    should be generated.
+
+    ### Returns
+    - **PNGResponse**: Response containing a list of keys (filenames) for the generated PNG images stored in the
+    S3 bucket.
+
+    ### Status Codes
+    - **200 OK**: PNG images were successfully created and stored in the S3 bucket.
+    - **400 Bad Request**: The request format or content is invalid. Verify that `filename` is correctly specified.
+    - **404 Not Found**: PDF file not found in S3 bucket.
+    - **500 Internal Server Error**: An error occurred on the server while creating PNGs.
+
+    ### Additional Information
+    - The endpoint connects to AWS S3 to retrieve the specified PDF, converts its pages to PNGs, and stores
+    the generated images back in S3. Ensure the PDF file exists in the S3 bucket and is accessible before
+    making a request.
+    """
     return create_pngs(request.filename)
 
 
@@ -58,7 +81,37 @@ def post_create_pngs(request: PNGRequest) -> PNGResponse:
 def post_extract_data(
     extract_data_request: ExtractDataRequest,
 ) -> ExtractCoordinatesResponse | ExtractTextResponse | ExtractNumberResponse:
-    """Extract data from the given PNGs."""
+    """Extract specified data from a given document based on the bounding box coordinates and format.
+
+    Data is extracted from the specified bounding box on a per-character basis: as soon as the bounding
+    box overlaps (partially or fully) with a letter or digit, that character is added to the extracted
+    text. Characters that are only partially covered are therefore still included in full. This behavior
+    is consistent with the clipping behavior of the `PyMuPDF` library.
+
+    ### Prerequisites
+    Ensure that the PDF file has been processed by the `create_pngs` endpoint first.
+
+    ### Request Body
+    - **extract_data_request**: Instance of `ExtractDataRequest`, containing file details, page number, bounding
+    box, and data format. The bounding box in PNG coordinates helps locate the region to extract data from.
+
+    ### Returns
+    The endpoint responds with one of the following response models based on the extracted data:
+    - **ExtractCoordinatesResponse**: If geographic coordinates are extracted.
+    - **ExtractTextResponse**: If text content is extracted.
+    - **ExtractNumberResponse**: If numerical data is extracted.
+
+    ### Status Codes
+    - **200 OK**: Successful extraction, returning the specified data type.
+    - **400 Bad Request**: Input request was invalid, typically due to misformatted or missing parameters.
+    - **404 Not Found**: Requested data could not be found within the specified bounding box or page.
+    - **500 Internal Server Error**: An error occurred on the server side during data extraction.
+
+    ### Error Handling
+    Known `ValueError`s (e.g., invalid input data) result in a `400 Bad Request` response with a relevant error
+    message.
+    For other errors, the endpoint returns a `500 Internal Server Error`.
+    """
     try:
         # Extract the data based on the request
         response = extract_data(extract_data_request)
 
@@ -17,7 +17,17 @@
 
 
 def validate_filename(value: str) -> str:
-    """Ensure the filename is not empty."""
+    """Ensure the filename is not empty.
+
+    Args:
+        value (str): The filename to validate.
+
+    Returns:
+        str: The validated filename.
+
+    Raises:
+        ValueError: If the filename is empty.
+    """
     if value == "":
         raise ValueError("Filename must not be empty.")
     return value
@@ -31,18 +41,28 @@ class PNGRequest(BaseModel):
     @field_validator("filename", mode="before")
     @classmethod
     def validate_filename(cls, value: str) -> str:
+        """Ensure the filename is not empty."""
         return validate_filename(value)
 
     class Config:
         """Model configuration allowing non-standard field types such as Path."""
 
-        arbitrary_types_allowed = True  # This allows using non-standard types like Path
+        arbitrary_types_allowed: bool = True  # This allows using non-standard types like Path
 
 
 class PNGResponse(BaseModel):
-    """Response schema for the create_pngs endpoint."""
+    """Response schema for the `create_pngs` endpoint, representing the output of PNG file creation and storage.
 
-    keys: list[str]  # keys in the S3 bucket
+    This schema lists the keys (identifiers) of the created PNG files stored in an S3 bucket,
+    enabling users to retrieve or reference them as needed.
+    """
+
+    keys: list[str] = Field(
+        ...,
+        description="""List of unique identifiers (keys) for the generated PNG files stored in the S3 bucket. Each key 
+        allows access to a specific file within the bucket.""",
+        example=["dataextraction/file1-1.png", "dataextraction/file1-2.png", "dataextraction/file1-3.png"],
+    )
 
 
 ########################################################################################################################
@@ -59,19 +79,44 @@ class FormatTypes(str, Enum):
 
 
 class BoundingBox(BaseModel):
-    """Bounding box schema."""
+    """Bounding box schema for defining a rectangular area within an image.
 
-    x0: float = Field(..., example=0.0)
-    y0: float = Field(..., example=0.0)
-    x1: float = Field(..., example=100.0)
-    y1: float = Field(..., example=100.0)
+    This schema represents the coordinates of the box’s corners, which can be used
+    to specify an area of interest in image processing tasks. Coordinates are
+    defined with the origin at the top-left of the image. Coordinates are in pixels.
+    """
+
+    x0: float = Field(
+        ...,
+        description="""The x-coordinate of the top-left corner of the bounding box. This value marks the 
+        horizontal starting point of the box.""",
+        example=0.0,
+    )
+    y0: float = Field(
+        ...,
+        description="""The y-coordinate of the top-left corner of the bounding box. This value marks the vertical 
+        starting point of the box.""",
+        example=0.0,
+    )
+    x1: float = Field(
+        ...,
+        description="""The x-coordinate of the bottom-right corner of the bounding box. This value marks the 
+        horizontal endpoint of the box.""",
+        example=100.0,
+    )
+    y1: float = Field(
+        ...,
+        description="""The y-coordinate of the bottom-right corner of the bounding box. This value marks the vertical 
+        endpoint of the box.""",
+        example=100.0,
+    )
 
     @field_validator("x0", "y0", "x1", "y1")
     @classmethod
-    def page_number_must_be_positive(cls, v: int) -> int:
-        """Validate that the page number is positive."""
+    def bbox_corners_must_be_positive(cls, v: int) -> int:
+        """Validate that the bounding box coordinates are non-negative (zero is allowed)."""
         if v < 0.0:
-            raise ValueError("Bounding box coordinate must be a positive integer")
+            raise ValueError("Bounding box coordinates must be positive")
         return v
 
     def rescale(
@@ -125,28 +170,96 @@ def load_from_fitz_rect(rect: fitz.Rect) -> "BoundingBox":
 
 
 class Coordinates(BaseModel):
-    """Coordinates schema."""
+    """Coordinates schema for representing geographical data points.
 
-    east: float = Field(..., example=1.0)
-    north: float = Field(..., example=2.0)
-    projection: str = Field(..., example="LV95")
+    This schema defines the format for specifying location data using east/north coordinates
+    along with the projection system used.
+    """
+
+    east: float = Field(
+        ...,
+        description="""Easting coordinate. The value should be in the units of the specified projection system.""",
+        example=1.0,
+    )
+    north: float = Field(
+        ...,
+        description="""Northing coordinate. The value should be in the units of the specified projection system.""",
+        example=2.0,
+    )
+    projection: str = Field(
+        ...,
+        description="""Projection system used to reference the coordinates. This defines the coordinate reference
+        system, such as 'LV95' for Swiss coordinate systems.""",
+        example="LV95",
+    )
 
 
 class ExtractDataRequest(ABC, BaseModel):
-    """Request schema for the extract_data endpoint.
+    """Request schema for the `extract_data` endpoint.
+
+    **Requirements:**
+    Before using this schema, ensure that the PDF file has been processed by the `create_pngs` endpoint first.
+
+    **Coordinate Systems:**
+    - **PNG coordinates:** Pixels are measured from the top-left corner (0, 0), where x increases rightward
+    and y downward.
+
+    ### Fields
+    Each field below includes inline examples to aid users in creating requests. See `json_schema_extra`
+    for a complete example.
+
+    **Attributes:**
+    - **filename** (`Path`): Path to the PDF file. _Example_: `"document.pdf"`
+    - **page_number** (`int`): Target page for data extraction. This is a 1-based index. _Example_: `1`
+    - **bbox** (`BoundingBox`): Bounding box for the extraction area, in PNG coordinates. Origin is the
+    top-left, with x increasing rightward and y increasing downward.
+        - Example format: `{"x0": 0.0, "y0": 0.0, "x1": 100.0, "y1": 100.0}`
+    - **format** (`FormatTypes`): Specifies the expected format for extracted data, e.g., `"coordinates"`.
+
+    ### Validation
+    Custom validators ensure data integrity:
+    - **Filename Validator:** Ensures filename is not empty.
+    - **Page Number Validator:** Confirms page number is positive.
+    - **Format Validator:** Checks format is valid as per `FormatTypes`.
+
+    The bounding box should be provided in PNG coordinates.
 
     Each field in the Pydantic model can have an example parameter, which provides an inline
     example for that specific field.
     """
 
-    filename: Path = Field(..., example=Path("document.png"))
-    page_number: int = Field(..., example=1)  # 1-based index
-    bbox: BoundingBox = Field(..., example={"x0": 0.0, "y0": 0.0, "x1": 100.0, "y1": 100.0})
-    format: FormatTypes = Field(..., example=FormatTypes.COORDINATES.value)
+    filename: Path = Field(
+        ...,
+        description="""Path to the input PDF document file that contains the data to be extracted. This should be
+        a valid file path, and the file should be accessible to the API.""",
+        example=Path("document.pdf"),
+    )
+    page_number: int = Field(
+        ...,
+        description="""Page number within the document where the extraction is to be performed. This is a 1-based 
+        index (e.g., 1 for the first page), applicable for multi-page files like PDFs.""",
+        example=1,
+    )
+    bbox: BoundingBox = Field(
+        ...,
+        description="""Bounding box defining the area for data extraction within the PNG version of the specified 
+        PDF file. The box is specified in pixels with the top-left as the origin (0,0), where x increases to the 
+        right and y increases downward. This box should be provided in PNG coordinates, and any 
+        transformations to PDF coordinates are managed internally.
+        """,
+        example={"x0": 0.0, "y0": 0.0, "x1": 100.0, "y1": 100.0},
+    )
+    format: FormatTypes = Field(
+        ...,
+        description="""Specifies the desired format for extracted data, allowing for options like `coordinates` or 
+        other defined `FormatTypes` values. This dictates the structure of the output returned by the API.""",
+        example=FormatTypes.COORDINATES.value,
+    )
 
     @field_validator("filename", mode="before")
     @classmethod
     def validate_filename(cls, value: str) -> str:
+        """Ensure the filename is not empty."""
         return validate_filename(value)
 
     @field_validator("page_number")
@@ -183,40 +296,73 @@ class Config:
 
 
 class ExtractDataResponse(ABC, BaseModel):
-    """Response schema for the extract_data endpoint."""
+    """Base response schema for the `extract_data` endpoint, representing the extracted data's bounding box.
+
+    This abstract base class provides a bounding box field for data localization and an abstract property
+    `response_type` to be implemented by subclasses, indicating the type of extracted content.
+    """
 
-    bbox: BoundingBox = Field(..., example={"x0": 0.0, "y0": 0.0, "x1": 100.0, "y1": 100.0})
+    bbox: BoundingBox = Field(
+        ...,
+        description="""Bounding box coordinates that define the area within the document where data was extracted.
+        The box is specified in PNG coordinates, with the origin at the top-left corner (0,0).""",
+        example={"x0": 0.0, "y0": 0.0, "x1": 100.0, "y1": 100.0},
+    )
 
     @property
     @abstractmethod
     def response_type(self):
-        """Abstract property to be implemented by subclasses to define response type."""
+        """Abstract property to be implemented by subclasses to define the type of response content."""
 
 
 class ExtractCoordinatesResponse(ExtractDataResponse):
-    """Response schema for the extract_data endpoint."""
+    """Response schema for the `extract_data` endpoint when returning geographic coordinates.
 
-    coordinates: Coordinates = Field(..., example={"east": 1.0, "north": 2.0, "page": 1, "projection": "LV95"})
+    This schema includes a `coordinates` field with east/north values and projection information.
+    """
+
+    coordinates: Coordinates = Field(
+        ...,
+        description="""Geographical coordinates extracted from the document, including east and north values, 
+        and projection type.""",
+        example={"east": 1.0, "north": 2.0, "projection": "LV95"},
+    )
 
     @property
     def response_type(self):
         return "coordinates"
 
 
 class ExtractTextResponse(ExtractDataResponse):
-    """Response schema for the extract_data endpoint."""
+    """Response schema for the `extract_data` endpoint when returning extracted text content.
+
+    This schema includes a `text` field with the extracted textual content from the specified bounding box.
+    """
 
-    text: str = Field(..., example="text")
+    text: str = Field(
+        ...,
+        description="""Text content extracted from the specified bounding box within the document.""",
+        example="text",
+    )
 
     @property
     def response_type(self):
         return "text"
 
 
 class ExtractNumberResponse(ExtractDataResponse):
-    """Response schema for the extract_data endpoint."""
+    """Response schema for the `extract_data` endpoint when returning numerical data.
+
+    This schema includes a `number` field for extracted numeric content, such as measurements or other
+    quantitative data.
+    """
 
-    number: float = Field(..., example=1.0)
+    number: float = Field(
+        ...,
+        description="""Numeric value extracted from the specified bounding box within the document, representing a
+        measurement or quantitative data.""",
+        example=1.0,
+    )
 
     @property
     def response_type(self):