Skip to content

Commit

Permalink
Merge pull request #10 from maxent-ai/pipeline
Browse files Browse the repository at this point in the history
Pipeline
  • Loading branch information
bharathgs authored Jul 3, 2022
2 parents e70afa3 + 4166192 commit 06b4ebf
Show file tree
Hide file tree
Showing 5 changed files with 62 additions and 3 deletions.
2 changes: 1 addition & 1 deletion ocrpy/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__version__ = "0.3.3"
__version__ = "0.3.4"

from .io import *
from .parsers import *
Expand Down
2 changes: 1 addition & 1 deletion ocrpy/io/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ class DocumentReader:

file: str = field()
credentials: str = field(default=None)
storage_type = field(default=None, init=False)
storage_type: str = field(default=None, init=False)

def __attrs_post_init__(self):
self.storage_type = guess_storage(self.file)
Expand Down
1 change: 1 addition & 0 deletions ocrpy/parsers/text/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from .gcp_text import *
from .aws_text import *
from .tesseract_text import *
from .text_parser import *
54 changes: 54 additions & 0 deletions ocrpy/parsers/text/text_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
from attrs import define, field
from typing import Any, Optional
from .aws_text import AwsTextOCR
from .gcp_text import GcpTextOCR
from .tesseract_text import TesseractTextOCR
from ...utils import BackendNotSupported

__all__ = ["TextParser"]


@define
class TextParser:
    """
    Single entry point that forwards text OCR to one of several backends.

    Note: the only backends wired in are Pytesseract, Google Cloud Vision
    and Amazon Textract.

    Attributes
    ----------
    reader : Any
        Object handed to the selected backend parser as its first argument.
    credentials : Optional[str]
        Value handed to the selected backend parser as its second argument.
        default: None
    backend : str
        Name that selects the backend parser.
        default: "pytesseract"
        accepted values: "pytesseract", "aws-textract", "google-cloud-vision"
    """

    reader: Any = field()
    credentials: Optional[str] = field(default=None)
    backend: str = field(default="pytesseract")

    @backend.validator
    def supported_backends(self, attribute, value):
        """Raise ``BackendNotSupported`` unless *value* names a known backend."""
        allowed = ["pytesseract", "aws-textract", "google-cloud-vision"]
        if value in allowed:
            return

        options = ", ".join(allowed)
        raise BackendNotSupported(
            f"backend type {value} not supported. choose one of these instead: {options}"
        )

    def _dispatch_parser(self):
        """Return the parser class registered under ``self.backend``."""
        backend_classes = {
            "google-cloud-vision": GcpTextOCR,
            "aws-textract": AwsTextOCR,
            "pytesseract": TesseractTextOCR,
        }
        return backend_classes[self.backend]

    def parse(self):
        """Build the selected backend parser and return the result of its ``parse()``."""
        ocr_cls = self._dispatch_parser()
        engine = ocr_cls(self.reader, self.credentials)
        return engine.parse()
6 changes: 5 additions & 1 deletion ocrpy/utils/exceptions.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__all__ = ["FileTypeNotSupported", "AttributeNotSupported"]
__all__ = ["FileTypeNotSupported", "AttributeNotSupported", "BackendNotSupported"]


class FileTypeNotSupported(Exception):
Expand All @@ -11,3 +11,7 @@ class AttributeNotSupported(Exception):
"""Raise when an Attribute like block or line extraction is not supported by the backends."""

pass


class BackendNotSupported(Exception):
    """Raise when the requested backend is not one of the supported backends."""

0 comments on commit 06b4ebf

Please sign in to comment.