Skip to content

Commit

Permalink
feat: fixed the v2 parser to only return the pages that are requested (#47)
Browse files Browse the repository at this point in the history

* fixed the v2 parser to only return the pages that are requested

Signed-off-by: Peter Staar <[email protected]>

* updated the visualize script

Signed-off-by: Peter Staar <[email protected]>

* fixed the default args for compilation

Signed-off-by: Peter Staar <[email protected]>

* use std::make_pair to avoid warnings

Signed-off-by: Peter Staar <[email protected]>

---------

Signed-off-by: Peter Staar <[email protected]>
  • Loading branch information
PeterStaar-IBM authored Oct 23, 2024
1 parent 836571a commit 48451ad
Show file tree
Hide file tree
Showing 32 changed files with 568 additions and 2,024,476 deletions.
6 changes: 3 additions & 3 deletions app/parse_v2.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,9 @@ void set_loglevel(std::string level)
//loguru::set_verbosity(loguru::Verbosity_ERROR);
}
else
loguru::g_stderr_verbosity = loguru::Verbosity_ERROR; {

}
{
loguru::g_stderr_verbosity = loguru::Verbosity_ERROR;
}
}

nlohmann::json create_config(std::filesystem::path ifile,
Expand Down
2 changes: 1 addition & 1 deletion docling_parse/visualize.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

from tabulate import tabulate

from docling_parse.docling_parse import pdf_parser, pdf_parser_v2
from docling_parse import pdf_parser, pdf_parser_v2

try:
from PIL import Image, ImageDraw
Expand Down
6 changes: 1 addition & 5 deletions src/pybind/docling_parser_v2.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,10 +43,7 @@ namespace docling

std::string pdf_resources_dir;

//std::map<std::string, std::filesystem::path> key2doc;
std::map<std::string, decoder_ptr_type> key2doc;

//plib::parser parser;
};

docling_parser_v2::docling_parser_v2():
Expand Down Expand Up @@ -82,7 +79,6 @@ namespace docling
std::map<std::string, double> timings = {};
pdflib::pdf_resource<pdflib::PAGE_FONT>::initialise(data, timings);
}


void docling_parser_v2::set_loglevel(int level)
{
Expand Down Expand Up @@ -114,7 +110,7 @@ namespace docling
{
loguru::g_stderr_verbosity = loguru::Verbosity_INFO;
}
else if(level=="warning")
else if(level=="warning" or level=="warn")
{
loguru::g_stderr_verbosity = loguru::Verbosity_WARNING;
}
Expand Down
25 changes: 17 additions & 8 deletions src/v2/pdf_decoders/document.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ namespace pdflib

private:

void update_timings(std::map<std::string, double>& timings_);
void update_timings(std::map<std::string, double>& timings_, bool set_timer);

private:

Expand Down Expand Up @@ -181,7 +181,10 @@ namespace pdflib
utils::timer timer;

nlohmann::json& json_pages = json_document["pages"];

json_pages = nlohmann::json::array({});

bool set_timer=true;

int page_number=0;
for(QPDFObjectHandle page : qpdf_document.getAllPages())
{
Expand All @@ -190,7 +193,8 @@ namespace pdflib
pdf_decoder<PAGE> page_decoder(page);

auto timings_ = page_decoder.decode_page();
update_timings(timings_);
update_timings(timings_, set_timer);
set_timer = false;

json_pages.push_back(page_decoder.get());

Expand All @@ -208,10 +212,13 @@ namespace pdflib
LOG_S(INFO) << "start decoding selected pages ...";
utils::timer timer;

// make sure that we only return the page from the page-numbers
nlohmann::json& json_pages = json_document["pages"];

json_pages = nlohmann::json::array({});

std::vector<QPDFObjectHandle> pages = qpdf_document.getAllPages();


bool set_timer=true; // make sure we override all timings for this page-set
for(auto page_number:page_numbers)
{
utils::timer timer;
Expand All @@ -223,7 +230,9 @@ namespace pdflib
pdf_decoder<PAGE> page_decoder(pages.at(page_number));

auto timings_ = page_decoder.decode_page();
update_timings(timings_);

update_timings(timings_, set_timer);
set_timer=false;

json_pages.push_back(page_decoder.get());

Expand All @@ -244,11 +253,11 @@ namespace pdflib
timings[__FUNCTION__] = timer.get_time();
}

void pdf_decoder<DOCUMENT>::update_timings(std::map<std::string, double>& timings_)
void pdf_decoder<DOCUMENT>::update_timings(std::map<std::string, double>& timings_, bool set_timer)
{
for(auto itr=timings_.begin(); itr!=timings_.end(); itr++)
{
if(timings.count(itr->first)==0)
if(timings.count(itr->first)==0 or set_timer)
{
timings[itr->first] = itr->second;
}
Expand Down
9 changes: 6 additions & 3 deletions src/v2/pdf_resources/page_line.h
Original file line number Diff line number Diff line change
Expand Up @@ -82,19 +82,22 @@ namespace pdflib
std::pair<double, double> pdf_resource<PAGE_LINE>::front()
{
assert(x.size()>0);
return std::pair<double, double>(x.front(), y.front());
//return std::pair<double, double>(x.front(), y.front());
return std::make_pair(x.front(), y.front());
}

std::pair<double, double> pdf_resource<PAGE_LINE>::back()
{
assert(x.size()>0);
return std::pair<double, double>(x.back(), y.back());
//return std::pair<double, double>(x.back(), y.back());
return std::make_pair(x.back(), y.back());
}

std::pair<double, double> pdf_resource<PAGE_LINE>::operator[](int i)
{
assert(x.size()>0 and i<x.size());
return std::pair<double, double>(x[i], y[i]);
//return std::pair<double, double>(x[i], y[i]);
return std::make_pair(x[i], y[i]);
}

void pdf_resource<PAGE_LINE>::transform(std::array<double, 9> trafo_matrix)
Expand Down
126 changes: 63 additions & 63 deletions tests/pdf_docs/tests/2305.14962v1.pdf.v2.bytesio.json
Original file line number Diff line number Diff line change
Expand Up @@ -16684,11 +16684,11 @@
"lines": []
},
"timings": {
"decode_contents": 0.003507,
"decode_dimensions": 0.0,
"decode_page": 0.015862,
"decode_resources": 0.008806,
"sanitise_contents": 4.1e-05
"decode_contents": 0.003358,
"decode_dimensions": 4e-06,
"decode_page": 0.015178,
"decode_resources": 0.008326,
"sanitise_contents": 3.9e-05
}
},
{
Expand Down Expand Up @@ -29861,11 +29861,11 @@
]
},
"timings": {
"decode_contents": 0.015415,
"decode_contents": 0.015422,
"decode_dimensions": 0.0,
"decode_page": 0.02518,
"decode_resources": 0.007428,
"sanitise_contents": 3.2e-05
"decode_page": 0.025149,
"decode_resources": 0.007392,
"sanitise_contents": 2.9e-05
}
},
{
Expand Down Expand Up @@ -40840,10 +40840,10 @@
]
},
"timings": {
"decode_contents": 0.002348,
"decode_contents": 0.00241,
"decode_dimensions": 0.0,
"decode_page": 0.011055,
"decode_resources": 0.006615,
"decode_page": 0.011236,
"decode_resources": 0.006414,
"sanitise_contents": 2.5e-05
}
},
Expand Down Expand Up @@ -54715,10 +54715,10 @@
]
},
"timings": {
"decode_contents": 0.004488,
"decode_contents": 0.004505,
"decode_dimensions": 0.0,
"decode_page": 0.012061,
"decode_resources": 0.005828,
"decode_page": 0.012243,
"decode_resources": 0.005641,
"sanitise_contents": 2.8e-05
}
},
Expand Down Expand Up @@ -71744,11 +71744,11 @@
]
},
"timings": {
"decode_contents": 0.002497,
"decode_contents": 0.002465,
"decode_dimensions": 0.0,
"decode_page": 0.011164,
"decode_resources": 0.006387,
"sanitise_contents": 4.9e-05
"decode_page": 0.011008,
"decode_resources": 0.006174,
"sanitise_contents": 4.1e-05
}
},
{
Expand Down Expand Up @@ -88941,11 +88941,11 @@
]
},
"timings": {
"decode_contents": 0.004848,
"decode_contents": 0.004823,
"decode_dimensions": 0.0,
"decode_page": 0.015907,
"decode_resources": 0.008227,
"sanitise_contents": 4e-05
"decode_page": 0.016107,
"decode_resources": 0.00802,
"sanitise_contents": 3.5e-05
}
},
{
Expand Down Expand Up @@ -109738,11 +109738,11 @@
]
},
"timings": {
"decode_contents": 0.010908,
"decode_contents": 0.01093,
"decode_dimensions": 0.0,
"decode_page": 0.018141,
"decode_resources": 0.005595,
"sanitise_contents": 4e-05
"decode_page": 0.018393,
"decode_resources": 0.005545,
"sanitise_contents": 3.8e-05
}
},
{
Expand Down Expand Up @@ -126017,11 +126017,11 @@
"lines": []
},
"timings": {
"decode_contents": 0.002365,
"decode_contents": 0.00238,
"decode_dimensions": 0.0,
"decode_page": 0.008046,
"decode_resources": 0.003733,
"sanitise_contents": 3.8e-05
"decode_page": 0.007619,
"decode_resources": 0.003613,
"sanitise_contents": 3.7e-05
}
},
{
Expand Down Expand Up @@ -140088,11 +140088,11 @@
"lines": []
},
"timings": {
"decode_contents": 0.002037,
"decode_contents": 0.002042,
"decode_dimensions": 0.0,
"decode_page": 0.008592,
"decode_resources": 0.004611,
"sanitise_contents": 3.2e-05
"decode_page": 0.008342,
"decode_resources": 0.004467,
"sanitise_contents": 3.5e-05
}
},
{
Expand Down Expand Up @@ -156805,10 +156805,10 @@
]
},
"timings": {
"decode_contents": 0.002362,
"decode_contents": 0.002402,
"decode_dimensions": 0.0,
"decode_page": 0.010877,
"decode_resources": 0.006402,
"decode_page": 0.010979,
"decode_resources": 0.006272,
"sanitise_contents": 3.8e-05
}
},
Expand Down Expand Up @@ -176492,10 +176492,10 @@
"lines": []
},
"timings": {
"decode_contents": 0.003171,
"decode_contents": 0.003227,
"decode_dimensions": 0.0,
"decode_page": 0.008376,
"decode_resources": 0.002849,
"decode_page": 0.007453,
"decode_resources": 0.002728,
"sanitise_contents": 4.7e-05
}
},
Expand Down Expand Up @@ -186267,33 +186267,33 @@
"lines": []
},
"timings": {
"decode_contents": 0.001524,
"decode_contents": 0.001489,
"decode_dimensions": 0.0,
"decode_page": 0.004688,
"decode_resources": 0.001911,
"decode_page": 0.004278,
"decode_resources": 0.001836,
"sanitise_contents": 2.2e-05
}
}
],
"timings": {
"decode_contents": 0.05547,
"decode_dimensions": 0.0,
"decode_document": 0.154582,
"decode_page": 0.149949,
"decode_resources": 0.068392,
"decoding page 0": 0.016121,
"decoding page 1": 0.025364,
"decoding page 10": 0.008664,
"decoding page 11": 0.004831,
"decoding page 2": 0.011215,
"decoding page 3": 0.01223,
"decoding page 4": 0.011415,
"decoding page 5": 0.016156,
"decoding page 6": 0.018376,
"decoding page 7": 0.008278,
"decoding page 8": 0.008797,
"decoding page 9": 0.011129,
"process_document_from_bytesio": 0.000393,
"sanitise_contents": 0.00043200000000000004
"decode_contents": 0.055453,
"decode_dimensions": 4e-06,
"decode_document": 0.152719,
"decode_page": 0.14798499999999998,
"decode_resources": 0.066428,
"decoding page 0": 0.015425,
"decoding page 1": 0.025324,
"decoding page 10": 0.007762,
"decoding page 11": 0.004436,
"decoding page 2": 0.011409,
"decoding page 3": 0.012426,
"decoding page 4": 0.011271,
"decoding page 5": 0.016368,
"decoding page 6": 0.018673,
"decoding page 7": 0.007881,
"decoding page 8": 0.008561,
"decoding page 9": 0.011247,
"process_document_from_bytesio": 0.000421,
"sanitise_contents": 0.000414
}
}
Loading

0 comments on commit 48451ad

Please sign in to comment.