OCA · len-foss · Jun 21, 2016 · May 9, 2017 · Jun 20, 2017 · Jun 20, 2017
diff --git a/attachment_indexation_mupdf/README.rst b/attachment_indexation_mupdf/README.rst
@@ -0,0 +1,5 @@
+=====================================================
+Attachments List and Document Indexation with PyMuPDF
+=====================================================
+
+Module to index PDF documents using the PyMuPDF library.
diff --git a/attachment_indexation_mupdf/__init__.py b/attachment_indexation_mupdf/__init__.py
@@ -0,0 +1,4 @@
+# Copyright 2023 len-foss/Financial Way
+# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
+
+from . import models
diff --git a/attachment_indexation_mupdf/__manifest__.py b/attachment_indexation_mupdf/__manifest__.py
@@ -0,0 +1,17 @@
+# Copyright 2023 len-foss/Financial Way
+# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
+{
+    "name": "Attachments List and Document Indexation with PyMuPDF",
+    "category": "Hidden/Tools",
+    "version": "16.0.0.0.0",
+    "summary": "Attachments List and Document Indexation with PyMuPDF",
+    "author": "len-foss/FinancialWay,Odoo Community Association (OCA)",
+    "website": "https://github.com/OCA/knowledge",
+    "license": "AGPL-3",
+    # models/ir_attachment.py overrides ``_index_pdf`` on ``ir.attachment``;
+    # presumably this dependency provides the method being extended -- confirm
+    "depends": ["attachment_indexation"],
+    "auto_install": True,
+    "installable": True,
+    "data": [],
+    "assets": {},
+    # provides the ``fitz`` module imported by models/ir_attachment.py
+    "external_dependencies": {"python": ["PyMuPDF"]},
+}
diff --git a/attachment_indexation_mupdf/models/__init__.py b/attachment_indexation_mupdf/models/__init__.py
@@ -0,0 +1,4 @@
+# Copyright 2023 len-foss/Financial Way
+# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
+
+from . import ir_attachment
diff --git a/attachment_indexation_mupdf/models/ir_attachment.py b/attachment_indexation_mupdf/models/ir_attachment.py
@@ -0,0 +1,36 @@
+# Copyright 2023 len-foss/Financial Way
+# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
+
+import io
+import logging
+
+from odoo import models
+
+_logger = logging.getLogger(__name__)
+
+try:
+    import fitz
+except ImportError:
+    # PyMuPDF is optional at import time: with ``fitz = None`` the override of
+    # ``_index_pdf`` below defers to the parent implementation.
+    fitz = None
+    _logger.warning(
+        "Attachment indexation of PDF documents is unavailable "
+        "because PyMuPDF cannot be loaded."
+    )
+
+
+class IrAttachment(models.Model):
+    """Extract the text of PDF attachments with PyMuPDF when it is available."""
+
+    _inherit = "ir.attachment"
+
+    def _index_pdf(self, bin_data):
+        """Index PDF documents with MuPDF if available.
+
+        :param bin_data: raw content of the PDF document
+        :return: the text of the pages concatenated in document order; when
+            parsing fails, whatever was extracted before the failure
+            (possibly an empty string). Without PyMuPDF, the result of the
+            parent implementation.
+        """
+        if fitz is None:
+            return super()._index_pdf(bin_data)
+        chunks = []
+        try:
+            # the context manager closes the document even if a page fails
+            with fitz.open(stream=io.BytesIO(bin_data), filetype="pdf") as doc:
+                for page in doc:
+                    chunks.append(page.get_text())
+        except Exception:
+            # Indexing stays best effort: keep what was extracted so far, but
+            # leave a trace in the log instead of silently ignoring the error.
+            _logger.warning(
+                "Failed to index PDF document with PyMuPDF", exc_info=True
+            )
+        return "".join(chunks)
diff --git a/attachment_indexation_mupdf/tests/__init__.py b/attachment_indexation_mupdf/tests/__init__.py
@@ -0,0 +1,2 @@
+# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
+from . import test_indexation
diff --git a/attachment_indexation_mupdf/tests/files/test_content.pdf b/attachment_indexation_mupdf/tests/files/test_content.pdf
diff --git a/attachment_indexation_mupdf/tests/test_indexation.py b/attachment_indexation_mupdf/tests/test_indexation.py
@@ -0,0 +1,27 @@
+# Copyright 2023 len-foss/Financial Way
+# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
+
+import os
+from unittest import skipIf
+
+from odoo.tests.common import TransactionCase, tagged
+
+# Directory of this test module; the sample PDF is read from ``files`` below it.
+directory = os.path.dirname(__file__)
+
+# PyMuPDF is imported as ``fitz``; when it is missing the test below is skipped.
+try:
+    import fitz
+except ImportError:
+    fitz = None
+
+
+@tagged("post_install", "-at_install")
+class TestCaseIndexation(TransactionCase):
+    """Check the text extracted from a sample PDF by ``ir.attachment._index``."""
+
+    @skipIf(fitz is None, "PyMuPDF is not installed")
+    def test_attachment_pdf_indexation(self):
+        """The indexed content of the sample PDF matches its text."""
+        path = os.path.join(directory, "files", "test_content.pdf")
+        # only the read needs the file handle; release it before indexing
+        with open(path, "rb") as file:
+            pdf = file.read()
+        text = self.env["ir.attachment"]._index(pdf, "application/pdf")
+        # note that the whitespace character is not the same as with pdfminer
+        self.assertEqual(
+            text, "TestContent!!\n", "the index content should be correct"
+        )
diff --git a/attachment_indexation_ocr/README.rst b/attachment_indexation_ocr/README.rst
@@ -0,0 +1,107 @@
+=================
+OCR for documents
+=================
+
+.. 
+   !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+   !! This file is generated by oca-gen-addon-readme !!
+   !! changes will be overwritten.                   !!
+   !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+   !! source digest: sha256:488ceb3b031015c08770a769f1357f5dcd462d28eaca37048790a61ef9a5feab
+   !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+
+.. |badge1| image:: https://img.shields.io/badge/maturity-Beta-yellow.png
+    :target: https://odoo-community.org/page/development-status
+    :alt: Beta
+.. |badge2| image:: https://img.shields.io/badge/licence-AGPL--3-blue.png
+    :target: http://www.gnu.org/licenses/agpl-3.0-standalone.html
+    :alt: License: AGPL-3
+.. |badge3| image:: https://img.shields.io/badge/github-OCA%2Fknowledge-lightgray.png?logo=github
+    :target: https://github.com/OCA/knowledge/tree/16.0/attachment_indexation_ocr
+    :alt: OCA/knowledge
+.. |badge4| image:: https://img.shields.io/badge/weblate-Translate%20me-F47D42.png
+    :target: https://translation.odoo-community.org/projects/knowledge-16-0/knowledge-16-0-attachment_indexation_ocr
+    :alt: Translate me on Weblate
+.. |badge5| image:: https://img.shields.io/badge/runboat-Try%20me-875A7B.png
+    :target: https://runboat.odoo-community.org/builds?repo=OCA/knowledge&target_branch=16.0
+    :alt: Try me on Runboat
+
+|badge1| |badge2| |badge3| |badge4| |badge5|
+
+This module was written to make uploaded documents, for example scans, searchable by running OCR on them.
+
+It supports all image formats `Pillow supports <http://pillow.readthedocs.io/en/3.2.x/handbook/image-file-formats.html>`_ for reading and PDFs.
+
+**Table of contents**
+
+.. contents::
+   :local:
+
+Installation
+============
+
+To install this module, you need to:
+
+#. install tesseract and the language(s) your documents use
+#. if you want to support OCR on PDFs, install the PyMuPDF Python package
+#. install the module itself
+
+On a Debian or Ubuntu system you would typically run::
+
+    $ sudo apt-get install tesseract-ocr imagemagick
+
+Configuration
+=============
+
+To configure this module, go to:
+
+#. Settings/Technical/Parameters/System parameters and review the parameters with names ocr.*
+
+Usage
+=====
+
+By default, character recognition is done asynchronously by a cronjob at night.
+This is because the recognition process takes a while and you don't want to make your users wait for the indexation to finish.
+The interval to run the cronjob can be adjusted to your needs in the ``Scheduled Actions`` menu, under ``Settings``.
+In case you want to force the OCR to be done immediately, set configuration parameter ``ocr.synchronous`` to value ``True``.
+
+Bug Tracker
+===========
+
+Bugs are tracked on `GitHub Issues <https://github.com/OCA/knowledge/issues>`_.
+In case of trouble, please check there if your issue has already been reported.
+If you spotted it first, help us to smash it by providing a detailed and welcomed
+`feedback <https://github.com/OCA/knowledge/issues/new?body=module:%20attachment_indexation_ocr%0Aversion:%2016.0%0A%0A**Steps%20to%20reproduce**%0A-%20...%0A%0A**Current%20behavior**%0A%0A**Expected%20behavior**>`_.
+
+Do not contact contributors directly about support or help with technical issues.
+
+Credits
+=======
+
+Authors
+~~~~~~~
+
+* Therp BV
+
+Contributors
+~~~~~~~~~~~~
+
+* Holger Brunn <[email protected]>
+* len-foss <[email protected]>
+
+Maintainers
+~~~~~~~~~~~
+
+This module is maintained by the OCA.
+
+.. image:: https://odoo-community.org/logo.png
+   :alt: Odoo Community Association
+   :target: https://odoo-community.org
+
+OCA, or the Odoo Community Association, is a nonprofit organization whose
+mission is to support the collaborative development of Odoo features and
+promote its widespread use.
+
+This module is part of the `OCA/knowledge <https://github.com/OCA/knowledge/tree/16.0/attachment_indexation_ocr>`_ project on GitHub.
+
+You are welcome to contribute. To learn how please visit https://odoo-community.org/page/Contribute.
diff --git a/attachment_indexation_ocr/__init__.py b/attachment_indexation_ocr/__init__.py
@@ -0,0 +1,3 @@
+# © 2016 Therp BV <http://therp.nl>
+# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
+from . import models
diff --git a/attachment_indexation_ocr/__manifest__.py b/attachment_indexation_ocr/__manifest__.py
@@ -0,0 +1,18 @@
+# © 2016 Therp BV <http://therp.nl>
+# Copyright 2023 len-foss/Financial Way
+# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
+{
+    "name": "OCR for documents",
+    "version": "16.0.1.0.0",
+    "author": "Therp BV,Odoo Community Association (OCA)",
+    "license": "AGPL-3",
+    "website": "https://github.com/OCA/knowledge",
+    "category": "Knowledge Management",
+    "summary": "Run character recognition on uploaded files",
+    "depends": ["attachment_indexation"],
+    "data": [
+        # scheduled action calling ``_ocr_cron``
+        "data/ir_cron.xml",
+        # ``ocr.synchronous`` and ``ocr.dpi`` system parameters
+        "data/ir_config_parameter.xml",
+    ],
+    # ``tesseract`` is run as a subprocess; PyMuPDF (``fitz``) renders PDF pages
+    "external_dependencies": {"bin": ["tesseract"], "python": ["PyMuPDF"]},
+}
diff --git a/attachment_indexation_ocr/data/ir_config_parameter.xml b/attachment_indexation_ocr/data/ir_config_parameter.xml
@@ -0,0 +1,11 @@
+<?xml version="1.0" encoding="UTF-8" ?>
+<odoo noupdate="1">
+        <record id="param_synchronous" model="ir.config_parameter">
+            <field name="key">ocr.synchronous</field>
+            <field name="value">False</field>
+        </record>
+        <record id="param_dpi" model="ir.config_parameter">
+            <field name="key">ocr.dpi</field>
+            <field name="value">300</field>
+        </record>
+</odoo>
diff --git a/attachment_indexation_ocr/data/ir_cron.xml b/attachment_indexation_ocr/data/ir_cron.xml
@@ -0,0 +1,12 @@
+<?xml version="1.0" encoding="UTF-8" ?>
+<odoo noupdate="1">
+        <record id="cron" model="ir.cron">
+            <field name="name">Run OCR on uploaded documents</field>
+            <field name="interval_type">days</field>
+            <field name="interval_number">1</field>
+            <field name="model_id" ref="model_ir_attachment" />
+            <field name="state">code</field>
+            <field name="code">model._ocr_cron(limit=100)</field>
+            <field name="numbercall">-1</field>
+        </record>
+</odoo>
diff --git a/attachment_indexation_ocr/models/__init__.py b/attachment_indexation_ocr/models/__init__.py
@@ -0,0 +1,3 @@
+# © 2016 Therp BV <http://therp.nl>
+# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
+from . import ir_attachment
diff --git a/attachment_indexation_ocr/models/ir_attachment.py b/attachment_indexation_ocr/models/ir_attachment.py
@@ -0,0 +1,106 @@
+# © 2016 Therp BV <http://therp.nl>
+# License AGPL-3.0 or later (http://www.gnu.org/licenses/agpl.html).
+
+import base64
+import logging
+import subprocess
+from io import BytesIO
+
+import fitz
+from PIL import Image
+
+from odoo import api, models
+
+_logger = logging.getLogger(__name__)
+_MARKER_PHRASE = "[[waiting for OCR]]"
+
+
+class IrAttachment(models.Model):
+    """Index attachments without extractable text by running tesseract on them.
+
+    Depending on the ``ocr.synchronous`` system parameter, the recognition is
+    done while indexing or deferred: deferred records get ``_MARKER_PHRASE`` as
+    index content and are picked up later by ``_ocr_cron``.
+    """
+
+    _inherit = "ir.attachment"
+
+    @api.model
+    def _get_no_content_strings(self):
+        """Return the index values that mean no text was extracted."""
+        return ["image", "application"]
+
+    @api.model
+    def _not_content(self, text):
+        """Tell whether ``text`` is empty or one of the no-content strings."""
+        return not text or text in self._get_no_content_strings()
+
+    @api.model
+    def _index(self, bin_data, file_type, checksum=None):
+        """Index ``bin_data``, falling back to OCR when no text was found.
+
+        :param bin_data: raw content of the file
+        :param file_type: mimetype of the file
+        :param checksum: passed through to the parent implementation
+        :return: the parent's result when it is usable; otherwise the OCR
+            result if ``ocr.synchronous`` is ``"True"`` or the context key
+            ``ocr_force`` is set, else ``_MARKER_PHRASE``
+        """
+        content = super()._index(bin_data, file_type, checksum)
+        if not (bin_data and file_type and self._not_content(content)):
+            return content
+        synchronous = self.env["ir.config_parameter"].get_param("ocr.synchronous")
+        if synchronous == "True" or self.env.context.get("ocr_force"):
+            return self._index_ocr(bin_data, file_type)
+        return _MARKER_PHRASE
+
+    @api.model
+    def _index_ocr(self, bin_data, file_type, dpi=0):
+        """Run tesseract on an image, or on every page of a PDF.
+
+        The context key ``ocr_lang``, when set, is passed to tesseract with
+        ``-l``.
+
+        :param bin_data: raw content of the file
+        :param file_type: mimetype of the file
+        :param dpi: resolution used for the images handed to tesseract; when
+            falsy, the ``ocr.dpi`` system parameter is used
+        :return: the recognized text, or ``None`` when the mimetype is invalid
+            or the file cannot be converted to images
+        """
+        if not dpi:
+            icp = self.env["ir.config_parameter"]
+            # NOTE(review): this fallback differs from the value of 300 shipped
+            # in data/ir_config_parameter.xml -- confirm which one is intended
+            dpi = int(icp.get_param("ocr.dpi", "500"))
+        if "/" not in file_type:
+            _logger.warning("Invalid mimetype %s", file_type)
+            return None
+        sub_type = file_type.split("/", 1)[1]
+        if sub_type == "pdf":
+            try:
+                images = list(self._index_ocr_get_data_pdf(bin_data, dpi))
+            except Exception:
+                # A document that cannot be rendered must only make this file
+                # fail: letting the error escape would abort the whole batch
+                # of ``perform_ocr`` and keep the marker on the record.
+                _logger.exception("Failed to OCR PDF")
+                return None
+        else:
+            image_data = BytesIO()
+            try:
+                with Image.open(BytesIO(bin_data)) as image:
+                    image.save(image_data, "png", dpi=(dpi, dpi))
+            except IOError:
+                _logger.exception("Failed to OCR image")
+                return None
+            images = [image_data]
+        tesseract_command = ["tesseract", "stdin", "stdout"]
+        ocr_lang = self.env.context.get("ocr_lang")
+        if ocr_lang:
+            # no check that this lang has been correctly installed;
+            # the corresponding tessdata should be listed by `tesseract --list-langs`
+            tesseract_command += ["-l", ocr_lang]
+        texts = []
+        for im in images:
+            process = subprocess.Popen(
+                tesseract_command,
+                stdin=subprocess.PIPE,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+            )
+            stdout, stderr = process.communicate(im.getvalue())
+            if process.returncode:
+                _logger.error("Error during OCR: %s", stderr)
+            # whatever tesseract printed is kept, even after an error
+            texts.append(stdout.decode("utf-8"))
+        return "".join(texts)
+
+    @api.model
+    def _index_ocr_get_data_pdf(self, bin_data, dpi):
+        """Render each page of a PDF as a PNG image.
+
+        :param bin_data: raw content of the PDF document
+        :param dpi: resolution used to render the pages
+        :return: list of ``BytesIO`` objects, one PNG per page
+        """
+        # tesseract only supports image of at most 32K pixels in any dimension
+        # it is thus better to have a list of images than a single one
+        with fitz.open(stream=bin_data, filetype="pdf") as doc:
+            return [
+                BytesIO(page.get_pixmap(dpi=dpi, alpha=False).tobytes("png"))
+                for page in doc
+            ]
+
+    @api.model
+    def _ocr_cron(self, limit=None):
+        """Run OCR on at most ``limit`` attachments still holding the marker."""
+        domain = [("index_content", "=", _MARKER_PHRASE)]
+        recs = self.with_context(ocr_force=True).search(domain, limit=limit)
+        recs.perform_ocr()
+
+    def perform_ocr(self, tesseract_lang=None):
+        """Recompute ``index_content`` of the records, forcing the OCR.
+
+        :param tesseract_lang: optional language passed to tesseract through
+            the ``ocr_lang`` context key
+        """
+        for rec in self:
+            if not rec.datas:
+                index_content = ""  # the _MARKER_PHRASE should be removed
+            else:
+                bin_data = base64.b64decode(rec.datas)
+                ctx = {"ocr_force": True}
+                if tesseract_lang:
+                    ctx["ocr_lang"] = tesseract_lang
+                index_content = rec.with_context(**ctx)._index(bin_data, rec.mimetype)
+            rec.write({"index_content": index_content})
diff --git a/attachment_indexation_ocr/readme/CONFIGURE.rst b/attachment_indexation_ocr/readme/CONFIGURE.rst
@@ -0,0 +1,3 @@
+To configure this module, go to:
+
+#. Settings/Technical/Parameters/System parameters and review the parameters with names ocr.*
diff --git a/attachment_indexation_ocr/readme/CONTRIBUTORS.rst b/attachment_indexation_ocr/readme/CONTRIBUTORS.rst
@@ -0,0 +1,2 @@
+* Holger Brunn <[email protected]>
+* len-foss <[email protected]>
diff --git a/attachment_indexation_ocr/readme/DESCRIPTION.rst b/attachment_indexation_ocr/readme/DESCRIPTION.rst
@@ -0,0 +1,3 @@
+This module was written to make uploaded documents, for example scans, searchable by running OCR on them.
+
+It supports all image formats `Pillow supports <http://pillow.readthedocs.io/en/3.2.x/handbook/image-file-formats.html>`_ for reading and PDFs.
diff --git a/attachment_indexation_ocr/readme/INSTALL.rst b/attachment_indexation_ocr/readme/INSTALL.rst
@@ -0,0 +1,9 @@
+To install this module, you need to:
+
+#. install tesseract and the language(s) your documents use
+#. if you want to support OCR on PDFs, install the PyMuPDF Python package
+#. install the module itself
+
+On a Debian or Ubuntu system you would typically run::
+
+    $ sudo apt-get install tesseract-ocr imagemagick
diff --git a/attachment_indexation_ocr/readme/USAGE.rst b/attachment_indexation_ocr/readme/USAGE.rst
@@ -0,0 +1,4 @@
+By default, character recognition is done asynchronously by a cronjob at night.
+This is because the recognition process takes a while and you don't want to make your users wait for the indexation to finish.
+The interval to run the cronjob can be adjusted to your needs in the ``Scheduled Actions`` menu, under ``Settings``.
+In case you want to force the OCR to be done immediately, set configuration parameter ``ocr.synchronous`` to value ``True``.
diff --git a/attachment_indexation_ocr/static/description/icon.png b/attachment_indexation_ocr/static/description/icon.png