Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add paddleocr #83

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions src/layoutparser/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
is_effdet_available,
is_pytesseract_available,
is_gcv_available,
is_paddleocr_available,
)

_import_structure = {
Expand Down Expand Up @@ -51,6 +52,7 @@
"is_paddle_available",
"is_pytesseract_available",
"is_gcv_available",
"is_paddleocr_available",
"requires_backends"
],
"tools": [
Expand Down Expand Up @@ -80,6 +82,9 @@
if is_gcv_available():
_import_structure["ocr.gcv_agent"] = ["GCVAgent", "GCVFeatureType"]

if is_paddleocr_available():
    # Export only the names that ocr/paddleocr_agent.py actually defines.
    # That module provides PaddleOCRAgent but no PaddleOCRFeatureType, so
    # listing the latter would advertise an attribute that cannot be resolved
    # when the lazy module tries to load it.
    _import_structure["ocr.paddleocr_agent"] = ["PaddleOCRAgent"]

sys.modules[__name__] = _LazyModule(
__name__,
globals()["__file__"],
Expand Down
19 changes: 19 additions & 0 deletions src/layoutparser/file_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,16 @@
except ModuleNotFoundError:
_gcv_available = False

# Detect PaddleOCR once, at import time.  find_spec() only tells us that a
# module named "paddleocr" is importable; the metadata lookup additionally
# requires an installed distribution and gives us the version for the log.
try:
    _paddleocr_available = importlib.util.find_spec("paddleocr") is not None
    try:
        _paddleocr_version = importlib_metadata.version("paddleocr")
    except importlib_metadata.PackageNotFoundError:
        # No distribution metadata: treat the backend as missing.
        _paddleocr_available = False
    else:
        logger.debug(f"PaddleOCR version {_paddleocr_version} available.")
except ModuleNotFoundError:
    _paddleocr_available = False


def is_torch_available():
    """Return the module-level ``_torch_available`` flag."""
    return _torch_available
Expand Down Expand Up @@ -121,6 +131,9 @@ def is_pytesseract_available():
def is_gcv_available():
    """Return the module-level ``_gcv_available`` flag."""
    return _gcv_available

def is_paddleocr_available():
    """Return the module-level ``_paddleocr_available`` flag."""
    return _paddleocr_available


PYTORCH_IMPORT_ERROR = """
{0} requires the PyTorch library but it was not found in your environment. Checkout the instructions on the
Expand Down Expand Up @@ -154,6 +167,11 @@ def is_gcv_available():
`pip install google-cloud-vision==1`
"""

PADDLEOCR_IMPORT_ERROR = """
{0} requires the PaddleOCR library but it was not found in your environment. You can install it with pip:
`pip install paddleocr`
"""

BACKENDS_MAPPING = dict(
[
("torch", (is_torch_available, PYTORCH_IMPORT_ERROR)),
Expand All @@ -162,6 +180,7 @@ def is_gcv_available():
("effdet", (is_effdet_available, EFFDET_IMPORT_ERROR)),
("pytesseract", (is_pytesseract_available, PYTESSERACT_IMPORT_ERROR)),
("google-cloud-vision", (is_gcv_available, GCV_IMPORT_ERROR)),
("paddleocr", (is_paddleocr_available, PADDLEOCR_IMPORT_ERROR)),
]
)

Expand Down
3 changes: 2 additions & 1 deletion src/layoutparser/ocr/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,5 @@
# limitations under the License.

from .gcv_agent import GCVAgent, GCVFeatureType
from .tesseract_agent import TesseractAgent, TesseractFeatureType
from .tesseract_agent import TesseractAgent, TesseractFeatureType
from .paddleocr_agent import PaddleOCRAgent
121 changes: 121 additions & 0 deletions src/layoutparser/ocr/paddleocr_agent.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
# Copyright 2021 The Layout Parser team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import io
import csv
import pickle
import numpy as np

import pandas as pd

from .base import BaseOCRAgent, BaseOCRElementType
from ..io import load_dataframe
from ..file_utils import is_paddleocr_available

if is_paddleocr_available():
import paddleocr


class PaddleOCRAgent(BaseOCRAgent):
"""
A wrapper for `PaddleOCR <https://github.com/PaddlePaddle/PaddleOCR>`_ Text
Detection APIs based on `PaddleOCR <https://github.com/PaddlePaddle/PaddleOCR>`_.
"""

DEPENDENCIES = ["paddleocr"]

def __init__(self, languages="en", use_gpu=True, use_angle_cls=False, det=True, rec=True, cls=False, **kwargs):

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  1. The default value of use_gpu should be False for better compatibility.

"""Create a Tesseract OCR Agent.

Args:
languages (:obj:`list` or :obj:`str`, optional):
You can specify the language code(s) of the documents to detect to improve
accuracy. The supported language and their code can be found on
`its github repo <https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.1/doc/doc_ch/whl.md>`_.
It supports llaguages:`ch`, `en`, `french`, `german`, `korean`, `japan`.
Defaults to 'eng'.
"""
self.lang = languages
self.use_gpu = use_gpu
self.use_angle_cls = use_angle_cls
self.configs = kwargs
self.ocr = paddleocr.PaddleOCR(use_gpu=self.use_gpu, use_angle_cls=self.use_angle_cls, lang=self.lang)

def resized_long(self, image, target=480):
    """Upscale ``image`` so that its longer edge is ``target`` pixels long.

    An image whose longer edge is already at least ``target`` is returned
    unchanged (the very same object). A smaller image is enlarged with its
    aspect ratio preserved.

    Args:
        image (:obj:`np.ndarray`):
            The input image array; its first two axes are read as
            height and width.
        target (:obj:`int`, optional):
            The desired length of the longer edge.
            Defaults to 480.

    Returns:
        :obj:`np.ndarray`: ``image`` itself, or a resized copy of it.
    """
    height, width = image.shape[0], image.shape[1]
    longer_edge = max(height, width)
    if longer_edge >= target:
        return image

    # cv2 is not imported at the top of this module, so bring it into scope
    # here.  Importing after the early return also keeps the no-op path free
    # of the OpenCV dependency.
    import cv2

    ratio = 1.0 * target / longer_edge
    if height >= width:
        dsize = (int(width * ratio), target)
    else:
        dsize = (target, int(height * ratio))
    # cv2.resize expects dsize as a (width, height) tuple.
    return cv2.resize(image, dsize)

def pad_img_to_longer_edge(self, image):
max_shape = max(image.shape[0], image.shape[1])
out_img = np.ones([max_shape, max_shape, 3]) * 127

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The padding value 127 should also be made configurable, to cover other special situations.

out_img[:image.shape[0], :image.shape[1], :image.shape[2]] = image
return out_img

def detect(
self, image, det=True, rec=True, cls=True,
return_response=False, return_only_text=True
):
"""Send the input image for OCR.

Args:
image (:obj:`np.ndarray` or :obj:`str`):
The input image array or the name of the image file
det (:obj:`bool`, optional):
use text detection or not, if false, only rec will be exec.
Default to `True`.
rec (:obj:`bool`, optional):
Use text recognition or not, if false, only det will be exec.
Default to `True`.
cls (:obj:`bool`, optional):
Use 180 degree rotation text recognition or not.
Default to `True`.
return_response (:obj:`bool`, optional):
Whether directly return all output (string and boxes
info) from Tesseract.
Defaults to `False`.
return_only_text (:obj:`bool`, optional):
Whether return only the texts in the OCR results.
Defaults to `False`.
"""
image = self.resized_long(image)

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems that target=480 cannot be changed by the caller; this value should be an initialization parameter of the PaddleOCRAgent class.

image = self.pad_img_to_longer_edge(image)
res = self.ocr.ocr(image, det=det, rec=rec, cls=cls)

if return_response:
return res

if return_only_text:

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  1. A score is needed to filter the recognition results and their corresponding polys.
  2. The format should be the same as that of the other agents.

return ['\n'.join(line[1][0] for line in res)]

return ['\n'.join(line[1][0] for line in res)]

@staticmethod
def load_response(filename):
with open(filename, "rb") as fp:
res = pickle.load(fp)
return res

@staticmethod
def save_response(res, file_name):

with open(file_name, "wb") as fp:
pickle.dump(res, fp, protocol=pickle.HIGHEST_PROTOCOL)
14 changes: 13 additions & 1 deletion tests/test_ocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
GCVFeatureType,
TesseractAgent,
TesseractFeatureType,
PaddleOCRAgent,
)
import json, cv2, os

Expand Down Expand Up @@ -76,4 +77,15 @@ def test_tesseract(test_detect=False):
assert r2 == ocr_agent.gather_data(res, agg_level=TesseractFeatureType.BLOCK)
assert r3 == ocr_agent.gather_data(res, agg_level=TesseractFeatureType.PARA)
assert r4 == ocr_agent.gather_data(res, agg_level=TesseractFeatureType.LINE)
assert r5 == ocr_agent.gather_data(res, agg_level=TesseractFeatureType.WORD)
assert r5 == ocr_agent.gather_data(res, agg_level=TesseractFeatureType.WORD)


def test_paddleocr(test_detect=False):
    """Build a PaddleOCRAgent and, when ``test_detect`` is set, run detection."""
    ocr_agent = PaddleOCRAgent(languages="en")

    # The results could be different if using another version of PaddleOCR Engine.
    # PaddleOCR 2.0.1 is used for generating the result.
    if not test_detect:
        return

    res = ocr_agent.detect(image)
    print(res)