diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index f1b3a51..95e9674 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -8,6 +8,7 @@ The key maintainers of this codebase are: - For small changes or a bug fixes, go ahead and create a PR - For large changes, create an issue on github with the proposal and @ one of the key maintainers for discussion before working on it - Run existing test cases and add test cases for changes to line_parser +- Note that the ingestor tests are not yet runnable and have database dependencies (work in progress), so the likelihood of breaking things is very high ## Contribution areas - Make the new_indent_parser more accurate diff --git a/nlm_ingestor/ingestion_daemon/__main__.py b/nlm_ingestor/ingestion_daemon/__main__.py index 306e1b0..fc53c3e 100644 --- a/nlm_ingestor/ingestion_daemon/__main__.py +++ b/nlm_ingestor/ingestion_daemon/__main__.py @@ -64,7 +64,7 @@ def parse_document( return make_response(jsonify({"status": status, "reason": msg}), rc) def main(): - logger.info("Starting parser service..") + logger.info("Starting ingestor service..") app.run(host="0.0.0.0", port=5001, debug=False) diff --git a/nlm_ingestor/ingestor/visual_ingestor/block_renderer.py b/nlm_ingestor/ingestor/visual_ingestor/block_renderer.py index c9bd5f3..31d79bf 100644 --- a/nlm_ingestor/ingestor/visual_ingestor/block_renderer.py +++ b/nlm_ingestor/ingestor/visual_ingestor/block_renderer.py @@ -255,6 +255,12 @@ def render_json(self): "page_idx": block["page_idx"], "block_class": block["block_class"], "sentences": [block_text], + "bbox": [ + block["box_style"][1], + block["box_style"][0], + block["box_style"][1] + block["box_style"][3], + block["box_style"][0] + block["box_style"][4], + ] } elif block_type == "list_item" and not is_rendering_table: block_dict = self.render_nested_block_as_dict(block, "list_item") @@ -266,6 +272,12 @@ def render_json(self): "page_idx": block["page_idx"], "block_class": block["block_class"], "sentences": [block_text], + "bbox": [ + block["box_style"][1], + block["box_style"][0], + block["box_style"][1] + block["box_style"][3], + block["box_style"][0] + block["box_style"][4], + ] } if block_dict: @@ -329,7 +341,14 @@ def render_json(self): if 'is_table_end' in block: is_rendering_table = False - render_dict["blocks"][-1]["table_rows"] = table_rows + table_block = render_dict["blocks"][-1] + table_block["table_rows"] = table_rows + table_block["bbox"] = [ + table_block["left"], + table_block["top"], + table_block["left"] + block["box_style"][3], + table_block["top"] + block["box_style"][4], + ] table_rows = [] return render_dict @@ -349,6 +368,12 @@ def render_nested_block_as_dict(self, block, tag): "block_class": block["block_class"], "sentences": [sent for sent in block["block_sents"]], "block_idx": block["block_idx"], + "bbox": [ + block["box_style"][1], + block["box_style"][0], + block["box_style"][1] + block["box_style"][3], + block["box_style"][0] + block["box_style"][4], + ] } return block_dict diff --git a/nlm_ingestor/ingestor_utils/parsing_utils.py b/nlm_ingestor/ingestor_utils/parsing_utils.py index ed66251..ae8c41d 100644 --- a/nlm_ingestor/ingestor_utils/parsing_utils.py +++ b/nlm_ingestor/ingestor_utils/parsing_utils.py @@ -131,7 +131,7 @@ def format_to_tr_block(prev_block, gap_threshold): child_x1 = child['box_style'][1] if gap_threshold <= round(child_x1 - prev_child_x2): - new_child_block = ingestor.visual_ingestor.Doc.merge_vls(block_buff) + new_child_block = nlm_ingestor.ingestor.visual_ingestor.Doc.merge_vls(block_buff) new_visual_lines.append(new_child_block) new_block_children.append({"text": block_text, "centroid": get_centroid(block_buff[0]['box_style'][1], @@ -152,7 +152,7 @@ def format_to_tr_block(prev_block, gap_threshold): block_buff[-1]['box_style'][2]), "span": (prev_child['box_style'][1], child['box_style'][2]) }) - new_child_block = ingestor.visual_ingestor.Doc.merge_vls(block_buff) + new_child_block = nlm_ingestor.ingestor.visual_ingestor.Doc.merge_vls(block_buff) new_visual_lines.append(new_child_block) return new_block_children, new_visual_lines diff --git a/notebooks/test_llmsherpa_api.ipynb b/notebooks/test_llmsherpa_api.ipynb index 9dc3270..e38d39e 100644 --- a/notebooks/test_llmsherpa_api.ipynb +++ b/notebooks/test_llmsherpa_api.ipynb @@ -40,7 +40,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 61, "id": "d765b72f-5d58-4343-9f48-432acb31b7d6", "metadata": {}, "outputs": [ @@ -48,7 +48,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/var/folders/10/036rtqts0zv6b9spnjgkl1hh0000gn/T/ipykernel_74969/2792954116.py:2: DeprecationWarning: Importing display from IPython.core.display is deprecated since IPython 7.14, please import from IPython display\n", + "/var/folders/10/036rtqts0zv6b9spnjgkl1hh0000gn/T/ipykernel_74969/3984827310.py:2: DeprecationWarning: Importing display from IPython.core.display is deprecated since IPython 7.14, please import from IPython display\n", " from IPython.core.display import display, HTML\n" ] } @@ -58,11 +58,11 @@ "from IPython.core.display import display, HTML\n", "# llmsherpa_api_url = \"https://readers.llmsherpa.com/api/document/developer/parseDocument?renderFormat=all\"\n", "llmsherpa_api_url = \"http://localhost:5001/api/parseDocument?renderFormat=all\"\n", - "# pdf_url = \"https://arxiv.org/pdf/1910.13461.pdf\" # also allowed is a file path e.g. /home/downloads/xyz.pdf\n", + "pdf_url = \"https://arxiv.org/pdf/1910.13461.pdf\" # also allowed is a file path e.g. /home/downloads/xyz.pdf\n", "# pdf_url = \"https://www.apache.org/licenses/LICENSE-2.0.txt\"\n", "# pdf_url = \"https://microsoft.gcs-web.com/static-files/931d7780-ccfc-47e3-97ad-09d87e12b795\"\n", "# pdf_url = \"https://podcasts.ceu.edu/sites/podcasts.ceu.edu/files/sample.doc\"\n", - "pdf_url = \"/Users/ambikasukla/projects/data/Kubernetes.docx\"\n", + "# pdf_url = \"/Users/ambikasukla/projects/data/Kubernetes.docx\"\n", "# pdf_url = \"https://en.wikipedia.org/wiki/Language_model\"\n", "# pdf_url = \"https://raw.githubusercontent.com/nlmatics/llmsherpa/main/README.md\"\n", "# pdf_url = \"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&id=35362092&retmode=xml\"\n", @@ -72,29 +72,30 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 74, "id": "274fc39e-a574-4312-9d44-53b7758fa961", "metadata": {}, "outputs": [ { "data": { - "text/html": [ - "

NLMatics Kubernetes Cluster

1. Connect to the cluster.

e.g.\n", - "to connect to LLMSherpa AKS.

2. List the pods

3. Log tailing for a specific pod

4. Certificate Renewal

· Delete the certificate.

· Delete the secret associated with the certificate.

· Restart the ingress-nginx-controller.

$ kubectl delete certificate tls-secret; kubectl delete secret tls-secret

Find the ingress-nginx-controller deployment file.

Edit the deployment file.

Modify the “replicas” to “0”.\n", - "Save and quit.

After 5 seconds, re-edit the file and modify the “replicas” back to 1.\n", - "Wait for ~30 seconds and the new certificate will be applied.

" - ], "text/plain": [ - "" + "{'bbox': [179.52, 165.33, 421.01, 177.29000000000002],\n", + " 'block_class': 'cls_5',\n", + " 'block_idx': 2,\n", + " 'level': 1,\n", + " 'page_idx': 0,\n", + " 'sentences': ['{mikelewis,yinhanliu,naman}@fb.com'],\n", + " 'tag': 'header'}" ] }, - "execution_count": 28, + "execution_count": 74, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "HTML(doc.sections()[0].to_html(include_children=True, recurse=True))" + "# HTML(doc.sections()[0].to_html(include_children=True, recurse=True))\n", + "doc.sections()[1].block_json" ] } ], diff --git a/setup.py b/setup.py index 7309b40..945a0e3 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import setup, find_packages setup( name='nlm-ingestor', - version='0.1.1', + version='0.1.2', description='Parsers and ingestors for different file types and formats', long_description=open('README.md').read(), long_description_content_type='text/markdown', @@ -11,7 +11,7 @@ license='Apache License 2.0', packages=find_packages(), include_package_data=True, - package_data={'': ['ingestor_utils/*.txt']}, + package_data={'': ['ingestor_utils/*.txt', ]}, install_requires=[ "flask", "flask_restful", diff --git a/tests/run_ingestor_page_test.py b/tests/run_ingestor_page_test.py index 046262d..9dccf22 100644 --- a/tests/run_ingestor_page_test.py +++ b/tests/run_ingestor_page_test.py @@ -9,8 +9,8 @@ from pymongo import MongoClient from tika import parser -from ingestor import table_parser -from ingestor import visual_ingestor +from nlm_ingestor.ingestor import table_parser +from nlm_ingestor.ingestor import visual_ingestor db_client = MongoClient(os.getenv("MONGO_HOST", "localhost")) db = db_client[os.getenv("MONGO_DATABASE", "doc-store-dev")] diff --git a/tests/run_ingestor_table_tests.py b/tests/run_ingestor_table_tests.py index 452587f..6312260 100644 --- a/tests/run_ingestor_table_tests.py +++ b/tests/run_ingestor_table_tests.py @@ -9,8 +9,8 @@ from pymongo import MongoClient from tika import parser -from ingestor import table_parser -from ingestor import visual_ingestor +from nlm_ingestor.ingestor import table_parser +from nlm_ingestor.ingestor import visual_ingestor db_client = MongoClient(os.getenv("MONGO_HOST", "localhost")) db = db_client[os.getenv("MONGO_DATABASE", "doc-store-dev")] diff --git a/tests/run_ingestor_test_full_doc.py b/tests/run_ingestor_test_full_doc.py index c22c812..a976e48 100644 --- a/tests/run_ingestor_test_full_doc.py +++ b/tests/run_ingestor_test_full_doc.py @@ -10,8 +10,8 @@ from pymongo import MongoClient from tika import parser -from ingestor import table_parser -from ingestor import visual_ingestor +from nlm_ingestor.ingestor import table_parser +from nlm_ingestor.ingestor import visual_ingestor db_client = MongoClient(os.getenv("MONGO_HOST", "localhost")) db = db_client[os.getenv("MONGO_DATABASE", "doc-store-dev")]