Skip to content

Commit

Permalink
fix: removing asserts that break parse-v2 (#55)
Browse files: browse the repository at this point in the history
Signed-off-by: Peter Staar <[email protected]>
  • Loading branch information
PeterStaar-IBM authored Nov 13, 2024
1 parent e59c904 commit bb978c2
Show file tree
Hide file tree
Showing 12 changed files with 349 additions and 148 deletions.
9 changes: 5 additions & 4 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ pytest = "^7.4.2"

[tool.poetry.group.visualisation.dependencies]
pillow = "^10.4.0"
tqdm = "^4.67.0"

[tool.poetry.scripts]
docling-parse = "docling_parse.run:main"
Expand Down
72 changes: 56 additions & 16 deletions src/v2/pdf_resources/page_font.h
Original file line number Diff line number Diff line change
Expand Up @@ -671,11 +671,19 @@ namespace pdflib
if(subtype==TYPE_0 and utils::json::has(keys_0, json_font))
{
auto desc_fonts = utils::json::get(keys_0, json_font);
assert(desc_fonts.size()==1);
//assert(desc_fonts.size()==1);

desc_font = desc_fonts[0];

LOG_S(INFO) << "found the descendant font";// << desc_font.dump(2);
if(desc_fonts.size()==1)
{
LOG_S(INFO) << "found the descendant font";// << desc_font.dump(2);
desc_font = desc_fonts[0];
}
else
{
std::string message = "no descendant font!";
LOG_S(ERROR) << message;
throw std::logic_error(message);
}
}
else if(subtype==TYPE_0)
{
Expand Down Expand Up @@ -777,7 +785,7 @@ namespace pdflib
}
else if(utils::json::has(keys_1, json_font))
{
assert(subtype==TYPE_3);
//assert(subtype==TYPE_3);

auto result = utils::json::get(keys_1, json_font);

Expand All @@ -788,7 +796,7 @@ namespace pdflib
}
else if(utils::json::has(keys_1, desc_font))
{
assert(subtype==TYPE_3);
//assert(subtype==TYPE_3);

auto result = utils::json::get(keys_1, desc_font);

Expand Down Expand Up @@ -1102,8 +1110,8 @@ namespace pdflib
{
//LOG_S(INFO) << l << "\t" << ws[l].is_number() << "\t beg: " << ws[l].dump();

assert(l<ws.size());

//assert(l<ws.size());
beg = ws[l].get<int>();
l += 1;

Expand All @@ -1116,13 +1124,26 @@ namespace pdflib
{
//LOG_S(INFO) << l << "\t" << ws[l].is_number() << "\t end: " << ws[l].dump();

assert(l<ws.size());
//assert(l<ws.size());

if(l>=ws.size())
{
LOG_S(WARNING) << "index " << l << " is out of bounds " << ws.size();
continue;
}

end = ws[l].get<int>();
l += 1;

//LOG_S(INFO) << l << "\t" << ws[l].is_number() << "\t w: " << ws[l].dump();

assert(l<ws.size());
//assert(l<ws.size());
if(l>=ws.size())
{
LOG_S(WARNING) << "index " << l << " is out of bounds " << ws.size();
continue;
}

double w = ws[l].get<double>();
l += 1;

Expand All @@ -1136,7 +1157,13 @@ namespace pdflib
{
//LOG_S(INFO) << l << "\t" << ws[l].is_number() << "\t widths: " << ws[l].dump();

assert(l<ws.size());
//assert(l<ws.size());
if(l>=ws.size())
{
LOG_S(WARNING) << "index " << l << " is out of bounds " << ws.size();
continue;
}

std::vector<double> w = ws[l].get<std::vector<double> >();
l += 1;

Expand Down Expand Up @@ -1180,8 +1207,15 @@ namespace pdflib
}

auto qpdf_obj = qpdf_font.getKey("/ToUnicode");
assert(qpdf_obj.isStream());
//assert(qpdf_obj.isStream());

if(not qpdf_obj.isStream())
{
std::string message = "not qpdf_obj.isStream()";
LOG_S(ERROR) << message;
throw std::logic_error(message);
}

std::vector<qpdf_instruction> stream;

// decode the stream
Expand Down Expand Up @@ -1396,7 +1430,7 @@ namespace pdflib
else if(name_to_descr.count(name)==1 and
cmap_numb_to_char.count(numb)==0)
{
assert(subtype==TYPE_3);
//assert(subtype==TYPE_3);
LOG_S(WARNING) << "could not resolve the character (name="<<name
<<", numb="<<numb<<") for TYPE_3 font:" << font_name;
Expand Down Expand Up @@ -1462,7 +1496,7 @@ namespace pdflib

if(utils::json::has(keys, json_font))
{
assert(subtype==TYPE_3);
//assert(subtype==TYPE_3);

QPDFObjectHandle qpdf_char_procs = qpdf_font.getKey(keys.front());
LOG_S(WARNING) << "found CharProcs: " << qpdf_char_procs.getTypeName();
Expand All @@ -1478,8 +1512,14 @@ namespace pdflib
QPDFObjectHandle qpdf_char_proc = qpdf_char_procs.getKey(key);
//LOG_S(INFO) << "decoding: " << key << " -> " << qpdf_char_proc.getTypeName();

assert(qpdf_char_proc.isStream());

//assert(qpdf_char_proc.isStream());
if(not qpdf_char_proc.isStream())
{
std::string message = "not qpdf_obj.isStream()";
LOG_S(ERROR) << message;
throw std::logic_error(message);
}

std::vector<qpdf_instruction> stream={};

// decode the stream
Expand Down
Loading

0 comments on commit bb978c2

Please sign in to comment.