Skip to content

Commit

Permalink
feat: add an experimental v2 parser to improve performance (#29)
Browse files Browse the repository at this point in the history

---------

Signed-off-by: Peter Staar <[email protected]>
Signed-off-by: Michele Dolfi <[email protected]>
Signed-off-by: rmdg88 <[email protected]>
Signed-off-by: Christoph Auer <[email protected]>
Co-authored-by: Peter Staar <[email protected]>
Co-authored-by: Peter W. J. Staar <[email protected]>
Co-authored-by: Michele Dolfi <[email protected]>
Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
Co-authored-by: Rui Dias Gomes <[email protected]>
Co-authored-by: Christoph Auer <[email protected]>
Co-authored-by: rmdg88 <[email protected]>
Co-authored-by: Michele Dolfi <[email protected]>
  • Loading branch information
9 people authored Oct 11, 2024
1 parent 179b784 commit e5856f0
Show file tree
Hide file tree
Showing 705 changed files with 3,582,536 additions and 706 deletions.
38 changes: 28 additions & 10 deletions .github/scripts/build_rhel.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,31 +3,49 @@
set -e # trigger failure on error - do not remove!
set -x # display command on output

# Build the sdist
# Build the Python package with Poetry
poetry build -f sdist

# Compile the wheel from sdist in centos stream
USE_SYSTEM_DEPS="ON"

docker build --progress=plain \
--build-arg USE_SYSTEM_DEPS="$USE_SYSTEM_DEPS" \
-f - . <<EOF
# syntax=docker/dockerfile:1
docker build -f - . <<EOF
FROM quay.io/centos/centos:stream9
RUN dnf config-manager --set-enabled crb
# RUN dnf copr -y enable cheimes/deepsearch-glm rhel-9-x86_64
RUN dnf copr -y enable cheimes/deepsearch-glm rhel-9-x86_64
RUN dnf install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm \
&& dnf clean all
RUN dnf install -y --nodocs \
gcc gcc-c++ git make cmake pkgconfig glibc-devel \
autoconf automake binutils cmake gcc gcc-c++ git glibc-devel glibc-headers glibc-static kernel-devel libtool libstdc++-devel make ninja-build pkgconfig zlib-devel \
python3.11 python3.11-pip python3.11-devel \
libjpeg-turbo-devel libpng-devel qpdf-devel json-devel utf8cpp-devel zlib-devel \
loguru-devel \
libjpeg-turbo-devel libpng-devel qpdf-devel json-devel utf8cpp-devel \
&& dnf clean all
# # RUN dnf install -y --nodocs loguru-devel
# TEMPORARY loguru install method
# https://koji.fedoraproject.org/koji/buildinfo?buildID=2563067
RUN curl -O https://kojipkgs.fedoraproject.org//packages/loguru/2.2.0%5E20230406git4adaa18/5.el9/x86_64/loguru-2.2.0%5E20230406git4adaa18-5.el9.x86_64.rpm
RUN dnf install -y loguru-2.2.0%5E20230406git4adaa18-5.el9.x86_64.rpm
RUN curl -O https://kojipkgs.fedoraproject.org//packages/loguru/2.2.0%5E20230406git4adaa18/5.el9/x86_64/loguru-devel-2.2.0%5E20230406git4adaa18-5.el9.x86_64.rpm
RUN dnf install -y loguru-devel-2.2.0%5E20230406git4adaa18-5.el9.x86_64.rpm
RUN mkdir /src
COPY ./dist/*.tar.gz /src/
RUN USE_SYSTEM_DEPS=ON pip3.11 install /src/docling_parse*.tar.gz \
&& python3.11 -c 'from docling_parse.docling_parse import pdf_parser'
RUN USE_SYSTEM_DEPS=\$USE_SYSTEM_DEPS pip3.11 install /src/docling_parse*.tar.gz \
&& python3.11 -c 'from docling_parse.docling_parse import pdf_parser, pdf_parser_v2'
COPY ./tests /src/tests
RUN cd /src \
&& pip3.11 install pytest \
&& pytest
Expand Down
78 changes: 51 additions & 27 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -100,16 +100,16 @@ if(NOT USE_SYSTEM_DEPS)
endif()
endif()


# include dependencies
include(cmake/extlib_cxxopts.cmake)
include(cmake/extlib_loguru.cmake)
include(cmake/extlib_json.cmake)
include(cmake/extlib_utf8.git.cmake)
include(cmake/extlib_jpeg.cmake)
include(cmake/extlib_qpdf_v11.cmake)

# aggregate the targets created by the dependencies
set(DEPENDENCIES qpdf jpeg utf8 json)

set(DEPENDENCIES qpdf jpeg utf8 json loguru cxxopts)

# ************************
# *** libraries ***
Expand All @@ -136,21 +136,53 @@ include_directories(${SUBDIRS})
# *** Executables ***
# **********************

add_executable(parse.exe "${TOPLEVEL_PREFIX_PATH}/app/parse.cpp")
set_property(TARGET parse.exe PROPERTY CXX_STANDARD 20)
add_dependencies(parse.exe ${DEPENDENCIES})
target_include_directories(parse.exe INTERFACE ${DEPENDENCIES})
target_link_libraries(parse.exe ${DEPENDENCIES} ${LIB_LINK})
# Report the collected source sub-directories, dependency targets and link
# libraries at configure time. Each *_SEMICOLON variable holds the
# corresponding list unchanged, i.e. in its native ";"-separated form.
set(SUBDIRS_SEMICOLON "${SUBDIRS}")
set(DEPENDENCIES_SEMICOLON "${DEPENDENCIES}")
set(LIBLINK_SEMICOLON "${LIB_LINK}")

message(STATUS "subdirs: ${SUBDIRS_SEMICOLON}")
message(STATUS "cmake dependencies: ${DEPENDENCIES_SEMICOLON}")
message(STATUS "cmake lib-link: ${LIBLINK_SEMICOLON}")

# Command-line front-ends: one "<name>.exe" target per app/<name>.cpp source.
# Every target is configured the same way: C++20, built after the dependency
# targets, and linked against them plus the extra LIB_LINK libraries.
foreach(parse_app IN ITEMS parse_v1 parse_v2 parse_v2_fonts)
  add_executable(${parse_app}.exe "${TOPLEVEL_PREFIX_PATH}/app/${parse_app}.cpp")
  set_property(TARGET ${parse_app}.exe PROPERTY CXX_STANDARD 20)
  add_dependencies(${parse_app}.exe ${DEPENDENCIES})
  target_include_directories(${parse_app}.exe INTERFACE ${DEPENDENCIES})
  target_link_libraries(${parse_app}.exe ${DEPENDENCIES} ${LIB_LINK})
endforeach()

# **********************
# *** Libraries ***
# **********************

add_library(libparse STATIC "${TOPLEVEL_PREFIX_PATH}/app/parse.cpp")
add_dependencies(libparse ${DEPENDENCIES})
target_include_directories(libparse INTERFACE ${DEPENDENCIES})
set_target_properties(libparse PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_link_libraries(libparse ${DEPENDENCIES} ${LIB_LINK})
# Static libraries, one per parser version, each built from its own source
# file. parse_v2 previously compiled app/parse_v1.cpp (copy-paste slip); it now
# uses app/parse_v2.cpp, the same source as the parse_v2.exe executable.
add_library(parse_v1 STATIC "${TOPLEVEL_PREFIX_PATH}/app/parse_v1.cpp")
add_library(parse_v2 STATIC "${TOPLEVEL_PREFIX_PATH}/app/parse_v2.cpp")

# Make sure the dependency targets are built before the libraries.
add_dependencies(parse_v1 ${DEPENDENCIES})
add_dependencies(parse_v2 ${DEPENDENCIES})

# NOTE(review): this passes the dependency *target names* as include
# directories; kept as-is to preserve behaviour -- confirm it is intended.
target_include_directories(parse_v1 INTERFACE ${DEPENDENCIES})
target_include_directories(parse_v2 INTERFACE ${DEPENDENCIES})

# Position-independent code; presumably required because these static
# archives are linked into the Python extension module.
set_target_properties(parse_v1 PROPERTIES POSITION_INDEPENDENT_CODE ON)
set_target_properties(parse_v2 PROPERTIES POSITION_INDEPENDENT_CODE ON)

target_link_libraries(parse_v1 ${DEPENDENCIES} ${LIB_LINK})
target_link_libraries(parse_v2 ${DEPENDENCIES} ${LIB_LINK})

# ***************************
# *** Python-binding ***
Expand All @@ -161,25 +193,17 @@ target_link_libraries(libparse ${DEPENDENCIES} ${LIB_LINK})
find_package(pybind11 CONFIG REQUIRED)

pybind11_add_module(docling_parse "${TOPLEVEL_PREFIX_PATH}/app/pybind_parse.cpp")
add_dependencies(docling_parse libparse)

add_dependencies(docling_parse parse_v1 parse_v2)

target_include_directories(docling_parse INTERFACE ${DEPENDENCIES})

target_compile_definitions(docling_parse PRIVATE VERSION_INFO=${CMAKE_PROJECT_VERSION})
target_link_libraries(docling_parse PRIVATE libparse)

target_link_libraries(docling_parse PRIVATE parse_v1 parse_v2)

# *****************
# *** Install ***
# *****************

install(TARGETS docling_parse DESTINATION "${TOPLEVEL_PREFIX_PATH}/docling_parse")

# *****************
# *** Testing ***
# *****************

#function(do_test target arg result)
# add_test(NAME Comp${arg} COMMAND ${target} ${arg})
# set_tests_properties(Comp${arg} PROPERTIES PASS_REGULAR_EXPRESSION ${result})
# endfunction()

# do a bunch of result based tests
# do_test(Tutorial 4 "4 is 2")
119 changes: 70 additions & 49 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,32 @@
[![Platforms](https://img.shields.io/badge/platform-macos%20|%20linux%20|%20windows-blue)](https://github.com/DS4SD/docling-parse/)
[![License MIT](https://img.shields.io/github/license/DS4SD/docling-parse)](https://opensource.org/licenses/MIT)

Simple package to extract text with coordinates from programmatic PDFs.
This package is part of the [Docling](https://github.com/DS4SD/docling) conversion.

Simple package to extract text, paths and bitmap images with coordinates from programmatic PDFs.
This package is used in the [Docling](https://github.com/DS4SD/docling) PDF conversion.

<table>
<tr>
<th>Version</th>
<th>Original</th>
<th>Word-level</th>
<th>Snippet-level</th>
<th>Performance</th>
</tr>
<tr>
<th>V1</th>
<td rowspan="2"><img src="./docs/example_visualisations/2305.14962v1.pdf_page=0.png" alt="screenshot" width="100"/></td>
<td>Not Supported</td>
<td><img src="./docs/example_visualisations/2305.14962v1.pdf_page=0.v1.png" alt="v1 snippet" width="100"/></td>
<td>~0.250 sec/page</td>
</tr>
<tr>
<th>V2</th>
<!-- The "Original" column image spans from the previous row -->
<td><img src="./docs/example_visualisations/2305.14962v1.pdf_page=0.v2.original.png" alt="v2 word" width="100"/></td>
<td><img src="./docs/example_visualisations/2305.14962v1.pdf_page=0.v2.sanitized.png" alt="v2 snippet" width="100"/></td>
<td>~0.050 sec/page <br><br>[~5-10X faster than v1]</td>
</tr>
</table>

## Quick start

Expand All @@ -19,13 +42,13 @@ Install the package from Pypi
pip install docling-parse
```

Convert a PDF
Convert a PDF (see [visualise.py](docling_parse/visualise.py) for a more detailed example)

```python
from docling_parse.docling_parse import pdf_parser
from docling_parse.docling_parse import pdf_parser_v2

# Do this only once to load fonts (avoid initialising it many times)
parser = pdf_parser()
parser = pdf_parser_v2()

# parser.set_loglevel(1) # 1=error, 2=warning, 3=success, 4=info

Expand Down Expand Up @@ -64,39 +87,7 @@ for page in range(0, num_pages):
# parsed page is the first one!
json_page = json_doc["pages"][0]

page_dimensions = [json_page["dimensions"]["width"], json_page["dimensions"]["height"]]

# find text cells
cells=[]
for cell_id,cell in enumerate(json_page["cells"]):
cells.append([page,
cell_id,
cell["content"]["rnormalized"], # text
cell["box"]["device"][0], # x0 (lower left x)
cell["box"]["device"][1], # y0 (lower left y)
cell["box"]["device"][2], # x1 (upper right x)
cell["box"]["device"][3], # y1 (upper right y)
])

# find bitmap images
images=[]
for image_id,image in enumerate(json_page["images"]):
images.append([page,
image_id,
image["box"][0], # x0 (lower left x)
image["box"][1], # y0 (lower left y)
image["box"][2], # x1 (upper right x)
image["box"][3], # y1 (upper right y)
])

# find paths
paths=[]
for path_id,path in enumerate(json_page["paths"]):
paths.append([page,
path_id,
path["x-values"], # array of x values
path["y-values"], # array of y values
])
# <Insert your own code>

# Unload the (QPDF) document and buffers
parser.unload_document(doc_key)
Expand Down Expand Up @@ -128,10 +119,38 @@ To build the parse, simply run the following command in the root folder,
rm -rf build; cmake -B ./build; cd build; make
```

You can run the parser from your build folder with
You can run the parser from your build folder. Example from parse_v1,

```sh
% ./parse_v1.exe -h
A program to process PDF files or configuration files
Usage:
PDFProcessor [OPTION...]

-i, --input arg Input PDF file
-c, --config arg Config file
--create-config arg Create config file
-o, --output arg Output file
-l, --loglevel arg loglevel [error;warning;success;info]
-h, --help Print usage
```

Example from parse_v2,

```sh
./parse.exe <input-file> <optional-logging:true>
% ./parse_v2.exe -h
program to process PDF files or configuration files
Usage:
PDFProcessor [OPTION...]

-i, --input arg Input PDF file
-c, --config arg Config file
--create-config arg Create config file
-p, --page arg Pages to process (default: -1 for all) (default:
-1)
-o, --output arg Output file
-l, --loglevel arg loglevel [error;warning;success;info]
-h, --help Print usage
```

If you don't have an input file, then a template input file will be printed on the terminal.
Expand All @@ -148,7 +167,7 @@ poetry build
To test the package, run,

```
poetry run pytest ./tests/test_parse.py
poetry run pytest ./tests -v -s
```


Expand All @@ -162,13 +181,15 @@ Please read [Contributing to Docling Parse](https://github.com/DS4SD/docling-par
If you use Docling in your projects, please consider citing the following:

```bib
@software{Docling,
author = {Deep Search Team},
month = {7},
title = {{Docling}},
url = {https://github.com/DS4SD/docling},
version = {main},
year = {2024}
@techreport{Docling,
author = {Deep Search Team},
month = {8},
title = {Docling Technical Report},
url = {https://arxiv.org/abs/2408.09869},
eprint = {2408.09869},
doi = {10.48550/arXiv.2408.09869},
version = {1.0.0},
year = {2024}
}
```

Expand Down
Loading

0 comments on commit e5856f0

Please sign in to comment.