Merge pull request #10 from maxent-ai/pipeline

Pipeline
maxent-ai · Jul 3, 2022 · 06b4ebf · 06b4ebf
2 parents e70afa3 + 4166192
commit 06b4ebf
Show file tree

Hide file tree

Showing 5 changed files with 62 additions and 3 deletions.
diff --git a/ocrpy/__init__.py b/ocrpy/__init__.py
@@ -1,4 +1,4 @@
-__version__ = "0.3.3"
+__version__ = "0.3.4"
 
 from .io import *
 from .parsers import *

diff --git a/ocrpy/io/reader.py b/ocrpy/io/reader.py
@@ -33,7 +33,7 @@ class DocumentReader:
 
     file: str = field()
     credentials: str = field(default=None)
-    storage_type = field(default=None, init=False)
+    storage_type: str = field(default=None, init=False)
 
     def __attrs_post_init__(self):
         self.storage_type = guess_storage(self.file)

diff --git a/ocrpy/parsers/text/__init__.py b/ocrpy/parsers/text/__init__.py
@@ -1,3 +1,4 @@
 from .gcp_text import *
 from .aws_text import *
 from .tesseract_text import *
+from .text_parser import *
diff --git a/ocrpy/parsers/text/text_parser.py b/ocrpy/parsers/text/text_parser.py
@@ -0,0 +1,54 @@
+from attrs import define, field
+from typing import Any, Optional
+from .aws_text import AwsTextOCR
+from .gcp_text import GcpTextOCR
+from .tesseract_text import TesseractTextOCR
+from ...utils import BackendNotSupported
+
+__all__ = ["TextParser"]
+
+
+@define
+class TextParser:
+    """
+    High level interface for multiple text ocr backends.
+    Note: Currently only supports Pytesseract, Google Cloud Vision and Amazon Textract.
+
+    Attributes
+    ----------
+    reader : Any
+        The reader object to use. It is handed unchanged to the selected
+        backend parser as its first positional argument.
+    credentials : Optional[str]
+        The credentials to use for the selected backend. It is handed to the
+        backend parser as its second positional argument.
+        default: None
+    backend : str
+        The name of the backend to use.
+        default: "pytesseract"
+        supported options: "pytesseract", "aws-textract", "google-cloud-vision"
+
+    Raises
+    ------
+    BackendNotSupported
+        If ``backend`` is not one of the supported options.
+    """
+
+    # Field order is the positional constructor order: reader, credentials, backend.
+    reader: Any = field()
+    credentials: Optional[str] = field(default=None)
+    backend: str = field(default="pytesseract")
+
+    @backend.validator
+    def supported_backends(self, attribute, value):
+        """
+        Validator for ``backend``: reject any name that is not a supported backend.
+
+        Raises
+        ------
+        BackendNotSupported
+            If ``value`` is not in the list of supported backend names.
+        """
+        # NOTE: keep these names in sync with the registry keys in _dispatch_parser;
+        # a name accepted here but missing there would surface as a KeyError in parse().
+        _backends = ["pytesseract", "aws-textract", "google-cloud-vision"]
+        if value not in _backends:
+            raise BackendNotSupported(
+                f"backend type {value} not supported. choose one of these instead: {', '.join(_backends)}"
+            )
+
+    def _dispatch_parser(self):
+        """Return the parser class (not an instance) registered for ``self.backend``."""
+        parser_registry = {
+            "pytesseract": TesseractTextOCR,
+            "aws-textract": AwsTextOCR,
+            "google-cloud-vision": GcpTextOCR,
+        }
+
+        return parser_registry[self.backend]
+
+    def parse(self):
+        """
+        Run the selected backend and return whatever its ``parse()`` call returns.
+
+        The backend class is instantiated with ``(reader, credentials)`` positionally.
+        """
+        parser = self._dispatch_parser()(self.reader, self.credentials)
+        parsed_doc = parser.parse()
+        return parsed_doc
diff --git a/ocrpy/utils/exceptions.py b/ocrpy/utils/exceptions.py
@@ -1,4 +1,4 @@
-__all__ = ["FileTypeNotSupported", "AttributeNotSupported"]
+__all__ = ["FileTypeNotSupported", "AttributeNotSupported", "BackendNotSupported"]
 
 
 class FileTypeNotSupported(Exception):
@@ -11,3 +11,7 @@ class AttributeNotSupported(Exception):
     """Raise when an Attribute like block or line extraction is not supported by the backends."""
 
     pass
+
+
+class BackendNotSupported(Exception):
+    """Raise when the requested OCR backend name is not one of the supported backends."""
+
+    pass