Skip to content

Commit

Permalink
feat: add the export of annotations and ToC (#58)
Browse files Browse the repository at this point in the history
Signed-off-by: Peter Staar <[email protected]>
  • Loading branch information
PeterStaar-IBM authored Nov 20, 2024
1 parent b3a33a2 commit 22cf280
Show file tree
Hide file tree
Showing 199 changed files with 4,850,762 additions and 120,856 deletions.
6 changes: 6 additions & 0 deletions app/pybind_parse.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,12 @@ PYBIND11_MODULE(docling_parse, m) {
.def("unload_document", &docling::docling_parser_v2::unload_document)

.def("number_of_pages", &docling::docling_parser_v2::number_of_pages)

.def("get_annotations", &docling::docling_parser_v2::get_annotations,
"Get annotations at the top-level of the document")

.def("get_table_of_contents", &docling::docling_parser_v2::get_table_of_contents,
"Get the table-of-contents (None if not available)")

.def("parse_pdf_from_key",
pybind11::overload_cast<std::string>(&docling::docling_parser_v2::parse_pdf_from_key),
Expand Down
60 changes: 42 additions & 18 deletions docling_parse/visualize.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,24 @@ def visualise_v1(
img.save(oname)


def draw_annotations(draw, annot, H, W):
    """Draw an annotation, and recursively all of its "/Kids", as rectangles.

    Args:
        draw: object exposing ``polygon(points, outline=..., fill=...)``.
        annot: annotation mapping; may hold a "/Rect" entry, indexed as
            ``[x0, y0, x1, y1]``, and a "/Kids" entry with child annotations.
        H: height subtracted from to flip every y-coordinate for drawing.
        W: width; accepted for interface compatibility, not used here.
    """
    if "/Rect" in annot:
        bbox = annot["/Rect"]

        # Flip the y-axis (H - y) for each of the four corners.
        bl = (bbox[0], H - bbox[1])
        br = (bbox[2], H - bbox[1])
        tr = (bbox[2], H - bbox[3])
        tl = (bbox[0], H - bbox[3])

        # Draw the rectangle as a polygon
        draw.polygon([bl, br, tr, tl], outline="white", fill="green")

    if "/Kids" in annot:
        for kid in annot["/Kids"]:
            # Recurse into the child itself; recursing on `annot` again would
            # never terminate. Entries that are not mappings are skipped.
            if isinstance(kid, dict):
                draw_annotations(draw, kid, H, W)


def visualise_v2(
log_level: str,
pdf_path: str,
Expand Down Expand Up @@ -260,6 +278,8 @@ def visualise_v2(

lines = page[_]["lines"]

annots = page["annotations"]

if PIL_INSTALLED:

W = dimension["width"]
Expand All @@ -286,24 +306,6 @@ def visualise_v2(
# Draw the rectangle as a polygon
draw.polygon([bl, br, tr, tl], outline="green", fill="yellow")

# Draw each rectangle by connecting its four points
for line in lines:

i = line["i"]
x = line["x"]
y = line["y"]

for l in range(0, len(i), 2):
i0 = i[l + 0]
i1 = i[l + 1]

for k in range(i0, i1 - 1):
draw.line(
(x[k], H - y[k], x[k + 1], H - y[k + 1]),
fill="black",
width=3,
)

# Draw each rectangle by connecting its four points
for row in cells:

Expand All @@ -330,6 +332,28 @@ def visualise_v2(
# You can change the outline and fill color
draw.polygon(rect, outline="red", fill="blue")

# Draw widgets
for annot in annots:
draw_annotations(draw, annot, H, W)

# Draw each line by connecting its consecutive points
for line in lines:

i = line["i"]
x = line["x"]
y = line["y"]

for l in range(0, len(i), 2):
i0 = i[l + 0]
i1 = i[l + 1]

for k in range(i0, i1 - 1):
draw.line(
(x[k], H - y[k], x[k + 1], H - y[k + 1]),
fill="black",
width=1,
)

# Show the image
if interactive:
img.show()
Expand Down
35 changes: 34 additions & 1 deletion src/pybind/docling_parser_v2.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,9 @@ namespace docling

int number_of_pages(std::string key);

nlohmann::json get_annotations(std::string key);
nlohmann::json get_table_of_contents(std::string key);

nlohmann::json parse_pdf_from_key(std::string key);

nlohmann::json parse_pdf_from_key_on_page(std::string key, int page);
Expand Down Expand Up @@ -230,10 +233,40 @@ namespace docling

return -1;
}

// Look up the document registered under `key` in `key2doc` and forward to its
// get_annotations(). An unknown key is logged as an error and yields JSON null.
nlohmann::json docling_parser_v2::get_annotations(std::string key)
{
  LOG_S(INFO) << __FUNCTION__;

  auto doc_itr = key2doc.find(key);

  if(doc_itr!=key2doc.end())
    {
      return (doc_itr->second)->get_annotations();
    }

  LOG_S(ERROR) << "key not found: " << key;
  return nlohmann::json::value_t::null;
}

// Look up the document registered under `key` in `key2doc` and forward to its
// get_table_of_contents(). An unknown key is logged as an error and yields
// JSON null.
nlohmann::json docling_parser_v2::get_table_of_contents(std::string key)
{
  LOG_S(INFO) << __FUNCTION__;

  auto doc_itr = key2doc.find(key);

  if(doc_itr!=key2doc.end())
    {
      return (doc_itr->second)->get_table_of_contents();
    }

  LOG_S(ERROR) << "key not found: " << key;
  return nlohmann::json::value_t::null;
}

nlohmann::json docling_parser_v2::parse_pdf_from_key(std::string key)
{
LOG_S(WARNING) << __FUNCTION__;
LOG_S(INFO) << __FUNCTION__;

auto itr = key2doc.find(key);

Expand Down
2 changes: 0 additions & 2 deletions src/v2/parser.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,6 @@ namespace plib
bool initialise(nlohmann::json& data);

private:



void execute_parse();

Expand Down
34 changes: 26 additions & 8 deletions src/v2/pdf_decoders/document.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,9 @@ namespace pdflib
nlohmann::json get();

// Current value of `number_of_pages`.
int get_number_of_pages()
{
  return number_of_pages;
}

// Copy of the annotations JSON held in `json_annots`.
nlohmann::json get_annotations()
{
  return json_annots;
}
// Copy of the "table_of_contents" member of `json_annots`, or JSON null when
// `json_annots` is not an object or has no such member.
//
// The lookup uses find() instead of the non-const operator[]: operator[] would
// turn a null `json_annots` into an object, insert a null member for a missing
// key, and throw for any other non-object value.
nlohmann::json get_table_of_contents()
{
  if(json_annots.is_object())
    {
      auto itr = json_annots.find("table_of_contents");

      if(itr!=json_annots.end())
        {
          return *itr;
        }
    }

  return nlohmann::json::value_t::null;
}

bool process_document_from_file(std::string& _filename);
bool process_document_from_bytesio(std::string& _buffer);
Expand All @@ -47,6 +50,8 @@ namespace pdflib

int number_of_pages;

//nlohmann::json json_toc; // table-of-contents
nlohmann::json json_annots;
nlohmann::json json_document;
};

Expand All @@ -56,15 +61,15 @@ namespace pdflib

timings({}),
qpdf_document(),

//qpdf_root(NULL),
//qpdf_pages(NULL),

// have compatibility between QPDF v10 and v11
qpdf_root(),
qpdf_pages(),

number_of_pages(-1),

//json_toc(nlohmann::json::value_t::null),
json_annots(nlohmann::json::value_t::null),
json_document(nlohmann::json::value_t::null)
{}

Expand All @@ -75,14 +80,14 @@ namespace pdflib
timings(timings_),
qpdf_document(),

//qpdf_root(NULL),
//qpdf_pages(NULL),

// have compatibility between QPDF v10 and v11
qpdf_root(),
qpdf_pages(),

number_of_pages(-1),

//json_toc(nlohmann::json::value_t::null),
json_annots(nlohmann::json::value_t::null),
json_document(nlohmann::json::value_t::null)
{}

Expand All @@ -91,6 +96,11 @@ namespace pdflib

nlohmann::json pdf_decoder<DOCUMENT>::get()
{
{
//json_document["table_of_contents"] = json_toc;
json_document["annotations"] = json_annots;
}

{
nlohmann::json& timings_ = json_document["timings"];

Expand Down Expand Up @@ -118,6 +128,9 @@ namespace pdflib
qpdf_root = qpdf_document.getRoot();
qpdf_pages = qpdf_root.getKey("/Pages");

//json_toc = extract_toc_in_json(qpdf_root);
json_annots = extract_document_annotations_in_json(qpdf_document, qpdf_root);

number_of_pages = qpdf_pages.getKey("/Count").getIntValue();
LOG_S(INFO) << "#-pages: " << number_of_pages;

Expand Down Expand Up @@ -148,13 +161,17 @@ namespace pdflib
try
{
std::string description = "processing buffer";
qpdf_document.processMemoryFile(description.c_str(), buffer.c_str(), buffer.size());
qpdf_document.processMemoryFile(description.c_str(),
buffer.c_str(), buffer.size());

LOG_S(INFO) << "buffer processed by qpdf!";

qpdf_root = qpdf_document.getRoot();
qpdf_pages = qpdf_root.getKey("/Pages");

//json_toc = extract_toc_in_json(qpdf_root);
json_annots = extract_document_annotations_in_json(qpdf_document, qpdf_root);

number_of_pages = qpdf_pages.getKey("/Count").getIntValue();
LOG_S(INFO) << "#-pages: " << number_of_pages;

Expand Down Expand Up @@ -253,7 +270,8 @@ namespace pdflib
timings[__FUNCTION__] = timer.get_time();
}

void pdf_decoder<DOCUMENT>::update_timings(std::map<std::string, double>& timings_, bool set_timer)
void pdf_decoder<DOCUMENT>::update_timings(std::map<std::string, double>& timings_,
bool set_timer)
{
for(auto itr=timings_.begin(); itr!=timings_.end(); itr++)
{
Expand Down
88 changes: 87 additions & 1 deletion src/v2/pdf_decoders/page.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@ namespace pdflib
// Contents
void decode_contents();

void decode_annots();

void sanitise_contents();

private:
Expand All @@ -49,6 +51,8 @@ namespace pdflib
QPDFObjectHandle qpdf_fonts;
QPDFObjectHandle qpdf_xobjects;

nlohmann::json json_annots;

nlohmann::json json_page;
nlohmann::json json_resources;
nlohmann::json json_grphs;
Expand Down Expand Up @@ -85,6 +89,8 @@ namespace pdflib
{
nlohmann::json result;
{
result["annotations"] = json_annots;

nlohmann::json& timings_ = result["timings"];
{
for(auto itr=timings.begin(); itr!=timings.end(); itr++)
Expand Down Expand Up @@ -126,6 +132,8 @@ namespace pdflib
utils::timer timer;

json_page = to_json(qpdf_page);

json_annots = extract_annots_in_json(qpdf_page);

try
{
Expand Down Expand Up @@ -153,8 +161,10 @@ namespace pdflib

decode_contents();

decode_annots();

sanitise_contents();

timings[__FUNCTION__] = timer.get_time();

return timings;
Expand Down Expand Up @@ -264,6 +274,82 @@ namespace pdflib
}
}

timings[__FUNCTION__] = timer.get_time();
}

// Convert the form widgets found in the page annotations into page cells.
//
// Walks `json_annots` (ignored unless it is a JSON array). For every entry
// that is an "/Annot" of subtype "/Widget" carrying "/Rect", "/V" and "/T",
// a cell is appended to `page_cells` with the "/Rect" as bounding box and the
// "/V" value as text. Widgets whose "/V" is not a string, or whose "/Rect" is
// not an array of four numbers, are skipped with a warning instead of letting
// the JSON conversion throw. The elapsed time is recorded in `timings`.
void pdf_decoder<PAGE>::decode_annots()
{
  LOG_S(INFO) << __FUNCTION__;
  utils::timer timer;

  // True when `obj` has a string member `key` equal to `expected`.
  auto has_name = [](const nlohmann::json& obj, const char* key, const char* expected) -> bool
  {
    auto itr = obj.find(key);
    return (itr!=obj.end() and itr->is_string() and itr->get<std::string>()==expected);
  };

  // True when `rect` is an array of exactly four numbers.
  auto is_bbox = [](const nlohmann::json& rect) -> bool
  {
    if((not rect.is_array()) or rect.size()!=4)
      {
        return false;
      }

    for(const auto& coor:rect)
      {
        if(not coor.is_number())
          {
            return false;
          }
      }

    return true;
  };

  if(json_annots.is_array())
    {
      // Iterate by const reference: copying each annotation is wasted work.
      for(const auto& item:json_annots)
        {
          LOG_S(INFO) << "analyzing: " << item.dump(2);

          if(not (has_name(item, "/Type", "/Annot") and
                  has_name(item, "/Subtype", "/Widget") and
                  item.count("/Rect")==1 and
                  item.count("/V")==1 and
                  item.count("/T")==1))
            {
              continue;
            }

          const nlohmann::json& rect = item.at("/Rect");
          const nlohmann::json& value = item.at("/V");

          if((not is_bbox(rect)) or (not value.is_string()))
            {
              LOG_S(WARNING) << "skipping widget with a malformed /Rect or a non-string /V";
              continue;
            }

          std::array<double, 4> bbox = rect.get<std::array<double, 4> >();
          std::string text = value.get<std::string>();

          pdf_resource<PAGE_CELL> cell;
          {
            cell.widget = true;

            cell.x0 = bbox[0];
            cell.y0 = bbox[1];
            cell.x1 = bbox[2];
            cell.y1 = bbox[3];

            // corners of the box: (x0,y0), (x1,y0), (x1,y1), (x0,y1)
            cell.r_x0 = bbox[0];
            cell.r_y0 = bbox[1];
            cell.r_x1 = bbox[2];
            cell.r_y1 = bbox[1];
            cell.r_x2 = bbox[2];
            cell.r_y2 = bbox[3];
            cell.r_x3 = bbox[0];
            cell.r_y3 = bbox[3];

            cell.text = text;
            cell.rendering_mode = 0;

            cell.space_width = 0;
            cell.chars = {};
            cell.widths = {};

            // no font is resolved for form values: use a fixed placeholder
            cell.enc_name = "Form-font";

            cell.font_enc = "Form-font";
            cell.font_key = "Form-font";

            cell.font_name = "Form-font";
            cell.font_size = 0;

            cell.italic = false;
            cell.bold = false;

            cell.ocr = false;
            cell.confidence = -1.0;

            cell.stack_size = -1;
            cell.block_count = -1;
            cell.instr_count = -1;
          }

          page_cells.push_back(cell);
        }
    }

  timings[__FUNCTION__] = timer.get_time();
}

Expand Down
Loading

0 comments on commit 22cf280

Please sign in to comment.