Skip to content

Commit

Permalink
added bbox, fixed imports and bumped version
Browse files Browse the repository at this point in the history
  • Loading branch information
Ambika Sukla committed Jan 23, 2024
1 parent 62ff190 commit e59831b
Show file tree
Hide file tree
Showing 9 changed files with 53 additions and 26 deletions.
1 change: 1 addition & 0 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ The key maintainers of this codebase are:
- For small changes or bug fixes, go ahead and create a PR
- For large changes, create an issue on GitHub with the proposal and @-mention one of the key maintainers for discussion before working on it
- Run existing test cases and add test cases for changes to line_parser
- Note that the ingestor tests are not yet runnable because they have database dependencies (work in progress), so changes cannot be verified by those tests and the likelihood of breaking things is very high

## Contribution areas
- Make the new_indent_parser more accurate
Expand Down
2 changes: 1 addition & 1 deletion nlm_ingestor/ingestion_daemon/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ def parse_document(
return make_response(jsonify({"status": status, "reason": msg}), rc)

def main():
logger.info("Starting parser service..")
logger.info("Starting ingestor service..")
app.run(host="0.0.0.0", port=5001, debug=False)


Expand Down
27 changes: 26 additions & 1 deletion nlm_ingestor/ingestor/visual_ingestor/block_renderer.py
Original file line number Diff line number Diff line change
Expand Up @@ -255,6 +255,12 @@ def render_json(self):
"page_idx": block["page_idx"],
"block_class": block["block_class"],
"sentences": [block_text],
"bbox": [
block["box_style"][1],
block["box_style"][0],
block["box_style"][1] + block["box_style"][3],
block["box_style"][0] + block["box_style"][4],
]
}
elif block_type == "list_item" and not is_rendering_table:
block_dict = self.render_nested_block_as_dict(block, "list_item")
Expand All @@ -266,6 +272,12 @@ def render_json(self):
"page_idx": block["page_idx"],
"block_class": block["block_class"],
"sentences": [block_text],
"bbox": [
block["box_style"][1],
block["box_style"][0],
block["box_style"][1] + block["box_style"][3],
block["box_style"][0] + block["box_style"][4],
]
}

if block_dict:
Expand Down Expand Up @@ -329,7 +341,14 @@ def render_json(self):

if 'is_table_end' in block:
is_rendering_table = False
render_dict["blocks"][-1]["table_rows"] = table_rows
table_block = render_dict["blocks"][-1]
table_block["table_rows"] = table_rows
table_block["bbox"] = [
table_block["left"],
table_block["top"],
table_block["left"] + block["box_style"][3],
table_block["top"] + block["box_style"][4],
]
table_rows = []

return render_dict
Expand All @@ -349,6 +368,12 @@ def render_nested_block_as_dict(self, block, tag):
"block_class": block["block_class"],
"sentences": [sent for sent in block["block_sents"]],
"block_idx": block["block_idx"],
"bbox": [
block["box_style"][1],
block["box_style"][0],
block["box_style"][1] + block["box_style"][3],
block["box_style"][0] + block["box_style"][4],
]
}
return block_dict

4 changes: 2 additions & 2 deletions nlm_ingestor/ingestor_utils/parsing_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ def format_to_tr_block(prev_block, gap_threshold):
child_x1 = child['box_style'][1]

if gap_threshold <= round(child_x1 - prev_child_x2):
new_child_block = ingestor.visual_ingestor.Doc.merge_vls(block_buff)
new_child_block = nlm_ingestor.ingestor.visual_ingestor.Doc.merge_vls(block_buff)
new_visual_lines.append(new_child_block)
new_block_children.append({"text": block_text,
"centroid": get_centroid(block_buff[0]['box_style'][1],
Expand All @@ -152,7 +152,7 @@ def format_to_tr_block(prev_block, gap_threshold):
block_buff[-1]['box_style'][2]),
"span": (prev_child['box_style'][1], child['box_style'][2])
})
new_child_block = ingestor.visual_ingestor.Doc.merge_vls(block_buff)
new_child_block = nlm_ingestor.ingestor.visual_ingestor.Doc.merge_vls(block_buff)
new_visual_lines.append(new_child_block)
return new_block_children, new_visual_lines

Expand Down
29 changes: 15 additions & 14 deletions notebooks/test_llmsherpa_api.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -40,15 +40,15 @@
},
{
"cell_type": "code",
"execution_count": 27,
"execution_count": 61,
"id": "d765b72f-5d58-4343-9f48-432acb31b7d6",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/var/folders/10/036rtqts0zv6b9spnjgkl1hh0000gn/T/ipykernel_74969/2792954116.py:2: DeprecationWarning: Importing display from IPython.core.display is deprecated since IPython 7.14, please import from IPython display\n",
"/var/folders/10/036rtqts0zv6b9spnjgkl1hh0000gn/T/ipykernel_74969/3984827310.py:2: DeprecationWarning: Importing display from IPython.core.display is deprecated since IPython 7.14, please import from IPython display\n",
" from IPython.core.display import display, HTML\n"
]
}
Expand All @@ -58,11 +58,11 @@
"from IPython.core.display import display, HTML\n",
"# llmsherpa_api_url = \"https://readers.llmsherpa.com/api/document/developer/parseDocument?renderFormat=all\"\n",
"llmsherpa_api_url = \"http://localhost:5001/api/parseDocument?renderFormat=all\"\n",
"# pdf_url = \"https://arxiv.org/pdf/1910.13461.pdf\" # also allowed is a file path e.g. /home/downloads/xyz.pdf\n",
"pdf_url = \"https://arxiv.org/pdf/1910.13461.pdf\" # also allowed is a file path e.g. /home/downloads/xyz.pdf\n",
"# pdf_url = \"https://www.apache.org/licenses/LICENSE-2.0.txt\"\n",
"# pdf_url = \"https://microsoft.gcs-web.com/static-files/931d7780-ccfc-47e3-97ad-09d87e12b795\"\n",
"# pdf_url = \"https://podcasts.ceu.edu/sites/podcasts.ceu.edu/files/sample.doc\"\n",
"pdf_url = \"/Users/ambikasukla/projects/data/Kubernetes.docx\"\n",
"# pdf_url = \"/Users/ambikasukla/projects/data/Kubernetes.docx\"\n",
"# pdf_url = \"https://en.wikipedia.org/wiki/Language_model\"\n",
"# pdf_url = \"https://raw.githubusercontent.com/nlmatics/llmsherpa/main/README.md\"\n",
"# pdf_url = \"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&id=35362092&retmode=xml\"\n",
Expand All @@ -72,29 +72,30 @@
},
{
"cell_type": "code",
"execution_count": 28,
"execution_count": 74,
"id": "274fc39e-a574-4312-9d44-53b7758fa961",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<h1>NLMatics Kubernetes Cluster</h1><h2>1. Connect to the cluster.</h2><p>e.g.\n",
"to connect to LLMSherpa AKS.</p><p>2. List the pods</p><p>3. Log tailing for a specific pod</p><h2>4. Certificate Renewal</h2><p>· Delete the certificate.</p><p>· Delete the secret associated with the certificate.</p><p>· Restart the ingress-nginx-controller.</p><p>$ kubectl delete certificate tls-secret; kubectl delete secret tls-secret</p><p>Find the ingress-nginx-controller deployment file.</p><p>Edit the deployment file.</p><p>Modify the “replicas” to “0”.\n",
"Save and quit.</p><p>After 5 seconds, re-edit the file and modify the “replicas” back to 1.\n",
"Wait for ~30 seconds and the new certificate will be applied.</p>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
"{'bbox': [179.52, 165.33, 421.01, 177.29000000000002],\n",
" 'block_class': 'cls_5',\n",
" 'block_idx': 2,\n",
" 'level': 1,\n",
" 'page_idx': 0,\n",
" 'sentences': ['{mikelewis,yinhanliu,naman}@fb.com'],\n",
" 'tag': 'header'}"
]
},
"execution_count": 28,
"execution_count": 74,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"HTML(doc.sections()[0].to_html(include_children=True, recurse=True))"
"# HTML(doc.sections()[0].to_html(include_children=True, recurse=True))\n",
"doc.sections()[1].block_json"
]
}
],
Expand Down
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from setuptools import setup, find_packages
setup(
name='nlm-ingestor',
version='0.1.1',
version='0.1.2',
description='Parsers and ingestors for different file types and formats',
long_description=open('README.md').read(),
long_description_content_type='text/markdown',
Expand All @@ -11,7 +11,7 @@
license='Apache License 2.0',
packages=find_packages(),
include_package_data=True,
package_data={'': ['ingestor_utils/*.txt']},
package_data={'': ['ingestor_utils/*.txt', ]},
install_requires=[
"flask",
"flask_restful",
Expand Down
4 changes: 2 additions & 2 deletions tests/run_ingestor_page_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@
from pymongo import MongoClient
from tika import parser

from ingestor import table_parser
from ingestor import visual_ingestor
from nlm_ingestor.ingestor import table_parser
from nlm_ingestor.ingestor import visual_ingestor

db_client = MongoClient(os.getenv("MONGO_HOST", "localhost"))
db = db_client[os.getenv("MONGO_DATABASE", "doc-store-dev")]
Expand Down
4 changes: 2 additions & 2 deletions tests/run_ingestor_table_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@
from pymongo import MongoClient
from tika import parser

from ingestor import table_parser
from ingestor import visual_ingestor
from nlm_ingestor.ingestor import table_parser
from nlm_ingestor.ingestor import visual_ingestor

db_client = MongoClient(os.getenv("MONGO_HOST", "localhost"))
db = db_client[os.getenv("MONGO_DATABASE", "doc-store-dev")]
Expand Down
4 changes: 2 additions & 2 deletions tests/run_ingestor_test_full_doc.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@
from pymongo import MongoClient
from tika import parser

from ingestor import table_parser
from ingestor import visual_ingestor
from nlm_ingestor.ingestor import table_parser
from nlm_ingestor.ingestor import visual_ingestor

db_client = MongoClient(os.getenv("MONGO_HOST", "localhost"))
db = db_client[os.getenv("MONGO_DATABASE", "doc-store-dev")]
Expand Down

0 comments on commit e59831b

Please sign in to comment.