-
Notifications
You must be signed in to change notification settings - Fork 12
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #10 from maxent-ai/pipeline
Pipeline
- Loading branch information
Showing
5 changed files
with
62 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,4 @@ | ||
__version__ = "0.3.3" | ||
__version__ = "0.3.4" | ||
|
||
from .io import * | ||
from .parsers import * | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,4 @@ | ||
from .gcp_text import * | ||
from .aws_text import * | ||
from .tesseract_text import * | ||
from .text_parser import * |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
from attrs import define, field | ||
from typing import Any, Optional | ||
from .aws_text import AwsTextOCR | ||
from .gcp_text import GcpTextOCR | ||
from .tesseract_text import TesseractTextOCR | ||
from ...utils import BackendNotSupported | ||
|
||
__all__ = ["TextParser"] | ||
|
||
|
||
@define
class TextParser:
    """
    High level interface for multiple text OCR backends.

    Note: Currently only supports Pytesseract, Google Cloud Vision and
    Amazon Textract.

    Attributes
    ----------
    reader : Any
        The reader object to use. It is handed unchanged to the selected
        backend parser.
    credentials : Optional[str]
        The credentials to use for the selected backend.
        default: None
    backend : str
        The name of the backend to use.
        default: "pytesseract"
        supported options: "pytesseract", "aws-textract", "google-cloud-vision"
    """

    reader: Any = field()
    credentials: Optional[str] = field(default=None)
    backend: str = field(default="pytesseract")

    @staticmethod
    def _parser_registry():
        """
        Return the mapping of backend name to backend parser class.

        This is the single source of truth for which backends exist: both
        the ``backend`` validator and ``_dispatch_parser`` read from it, so
        the two can never disagree.
        """
        return {
            "pytesseract": TesseractTextOCR,
            "aws-textract": AwsTextOCR,
            "google-cloud-vision": GcpTextOCR,
        }

    @backend.validator
    def supported_backends(self, attribute, value):
        """
        Validate that ``backend`` names one of the registered backends.

        Raises
        ------
        BackendNotSupported
            If ``value`` is not one of the registered backend names.
        """
        # Compare against a list rather than the dict itself so that an
        # unhashable value is still reported as BackendNotSupported instead
        # of failing with TypeError during the membership test.
        _backends = list(self._parser_registry())
        if value not in _backends:
            raise BackendNotSupported(
                f"backend type {value} not supported. choose one of these instead: {', '.join(_backends)}"
            )

    def _dispatch_parser(self):
        """Return the parser class registered for ``self.backend``."""
        return self._parser_registry()[self.backend]

    def parse(self):
        """
        Run the selected backend and return its result.

        The backend class is instantiated with ``(reader, credentials)`` and
        the value returned by its ``parse()`` method is returned as-is.
        """
        parser_cls = self._dispatch_parser()
        parser = parser_cls(self.reader, self.credentials)
        return parser.parse()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters