Skip to content

Commit

Permalink
fix: Robustify parser v2 (#49)
Browse files Browse the repository at this point in the history
* avoid fatal aborts: downgrade LOG_S(FATAL) to LOG_S(ERROR) so that parsing can continue

Signed-off-by: Peter Staar <[email protected]>

* ran entire DLN-v1 with only two errors

Signed-off-by: Peter Staar <[email protected]>

* added a few more safeguards

Signed-off-by: Peter Staar <[email protected]>

---------

Signed-off-by: Peter Staar <[email protected]>
  • Loading branch information
PeterStaar-IBM authored Oct 25, 2024
1 parent 09224e4 commit 1815e7d
Show file tree
Hide file tree
Showing 10 changed files with 130 additions and 28 deletions.
12 changes: 9 additions & 3 deletions src/v2/pdf_resources/page_font.h
Original file line number Diff line number Diff line change
Expand Up @@ -688,7 +688,7 @@ namespace pdflib
else
{
subtype=NULL_TYPE;
LOG_S(FATAL) << "could not find subtype in font: " << json_font.dump(2);
LOG_S(ERROR) << "could not find subtype in font: " << json_font.dump(2);
}
}

Expand Down Expand Up @@ -1050,15 +1050,21 @@ namespace pdflib

if(values.size()!=(lchar-fchar+1))
{
LOG_S(FATAL) << "values.size()!=(lchar-fchar+1) -> "
LOG_S(ERROR) << "values.size()!=(lchar-fchar+1) -> "
<< values.size() << "!=" << lchar << "-" << fchar << "+1";
}

int cnt=0;
for(int ind=fchar; ind<=lchar; ind++)
{
if(cnt>=values.size())
{
LOG_S(ERROR) << "going out of bounds with " << cnt << " >= " << values.size();
continue;
}

numb_to_widths[ind] = values[cnt++];
LOG_S(INFO) << "index: " << ind << " -> width: " << numb_to_widths.at(ind);
//LOG_S(INFO) << "index: " << ind << " -> width: " << numb_to_widths.at(ind);
}
}

Expand Down
15 changes: 13 additions & 2 deletions src/v2/pdf_resources/page_font/base_fonts.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,12 +43,15 @@ namespace pdflib

private:

bool initialized;

std::set<std::string> core_14_fonts;

std::map<std::string, base_font_type> name_to_basefont;
};

/// Default constructor.
///
/// The font-name containers start out empty and `initialized` starts out
/// false, so that the first call to initialise() performs the actual load and
/// later calls are skipped.
base_fonts::base_fonts():
  initialized(false)
{}

base_fonts::~base_fonts()
Expand Down Expand Up @@ -159,6 +162,12 @@ namespace pdflib
template<typename glyphs_type>
void base_fonts::initialise(std::string dirname, glyphs_type& glyphs)
{
if(initialized)
{
LOG_S(WARNING) << "skipping base_fonts::initialise, already initialized ...";
return;
}

std::vector<std::string> standard = utils::filesystem::list_files(dirname+"/standard");
std::sort(standard.begin(), standard.end());

Expand Down Expand Up @@ -217,7 +226,9 @@ namespace pdflib
{
//LOG_S(WARNING) << "\t font-name (=" << fontname << ") already read";
}
}
}

initialized = true;
}

std::string base_fonts::read_fontname(std::string filename)
Expand Down
33 changes: 29 additions & 4 deletions src/v2/pdf_resources/page_font/cmap.h
Original file line number Diff line number Diff line change
Expand Up @@ -504,7 +504,7 @@ namespace pdflib

if(_map.count(begin+i)==1)
{
LOG_S(FATAL) << "overwriting number c=" << begin+i;
LOG_S(ERROR) << "overwriting number c=" << begin+i;
}

_map[begin + i] = tgt.at(i);
Expand All @@ -514,18 +514,43 @@ namespace pdflib
/// Handles a `beginbfchar` instruction while parsing a cmap.
///
/// Stores the announced number of bfchar entries in `char_count`; that value
/// is later used by parse_endbfchar as its loop bound.
///
/// @param parameters instructions handed over for this operator (presumably
///                   the operands preceding `beginbfchar`); normally exactly
///                   one entry holding the entry count. Malformed input is
///                   logged instead of aborting.
void cmap_parser::parse_beginbfchar(std::vector<qpdf_instruction>& parameters)
{
  LOG_S(INFO) << __FUNCTION__;

  // No unconditional assert / parameters[0] access here: an empty parameter
  // list must reach the error branch below instead of indexing out of bounds.
  if(parameters.size()==1)
    {
      char_count = parameters[0].to_int();
    }
  else if(parameters.size()>0)
    {
      // More than one parameter: be lenient and use the first one.
      LOG_S(WARNING) << "parameters.size()>0 for parse_beginbfchar";
      char_count = parameters[0].to_int();
    }
  else
    {
      LOG_S(ERROR) << "parameters.size()!=1 for parse_beginbfchar";

      // Reset the count so that parse_endbfchar does not iterate with a stale
      // value left over from an earlier bfchar section.
      char_count = 0;
    }
}

void cmap_parser::parse_endbfchar(std::vector<qpdf_instruction>& parameters)
{
LOG_S(INFO) << __FUNCTION__ << ": starting ...";
assert(parameters.size()==2*char_count);

if(parameters.size()!=2*char_count)
{
LOG_S(WARNING) << "parameters.size()!=2*char_count -> "
<< "parameters: " << parameters.size() << ", "
<< "char_count: " << char_count;
}
//assert(parameters.size()==2*char_count);

for(size_t i=0; i<char_count; i++)
{
if(2*i>=parameters.size())
{
LOG_S(ERROR) << "going out of bounds: skipping parse_endbfchar";
continue;
}

QPDFObjectHandle source_ = parameters[2*i+0].obj;
QPDFObjectHandle target_ = parameters[2*i+1].obj;

Expand Down
13 changes: 12 additions & 1 deletion src/v2/pdf_resources/page_font/encodings.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,13 @@ namespace pdflib

private:

bool initialized;

std::map<font_encoding_name, font_encoding> name_to_encoding;
};

/// Default constructor.
///
/// `name_to_encoding` starts out empty and `initialized` starts out false, so
/// that the first call to initialise() loads the encodings and later calls
/// are skipped.
font_encodings::font_encodings():
  initialized(false)
{}

font_encodings::~font_encodings()
Expand All @@ -38,6 +41,12 @@ namespace pdflib
template<typename glyphs_type>
void font_encodings::initialise(std::string dirname, glyphs_type& glyphs)
{
if(initialized)
{
LOG_S(WARNING) << "skipping font_encodings::initialise, already initialized ...";
return;
}

std::vector<std::pair<font_encoding_name, std::string> > items = {
{STANDARD, "std.dat"},
{MACROMAN, "macroman.dat"},
Expand All @@ -50,6 +59,8 @@ namespace pdflib
font_encoding& encoding = name_to_encoding[item.first];
encoding.initialise(item.first, dirname+"/"+item.second, glyphs);
}

initialized = true;
}

}
Expand Down
14 changes: 12 additions & 2 deletions src/v2/pdf_resources/page_font/font_cids.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ namespace pdflib

private:

bool initialized;
std::string directory;

std::map<std::string, int> ro_2_sup;
Expand All @@ -45,7 +46,8 @@ namespace pdflib
std::map<std::string, font_cid> cids;
};

/// Default constructor.
///
/// All lookup maps start out empty and `initialized` starts out false, so
/// that the first call to initialise() does the setup and later calls are
/// skipped.
font_cids::font_cids():
  initialized(false)
{}

font_cids::~font_cids()
Expand Down Expand Up @@ -84,7 +86,13 @@ namespace pdflib

void font_cids::initialise(std::string dirname)
{
LOG_S(INFO) << __FUNCTION__;
if(initialized)
{
LOG_S(WARNING) << "skipping font_cids::initialise, already initialized ...";
return;
}

LOG_S(INFO) << "initialise font_cids";

directory = dirname;
directory += (directory.back()=='/'? "" : "/");
Expand Down Expand Up @@ -126,6 +134,8 @@ namespace pdflib
cmap_2_filename[file] = cdir+"/"+file;
}
}

initialized = true;
}

bool font_cids::decode_cmap_resource(std::string cmap_name)
Expand Down
21 changes: 16 additions & 5 deletions src/v2/pdf_resources/page_font/glyphs.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ namespace pdflib
std::string operator[](std::string key);

void initialise(std::string dirname);

private:

void read_file_hex(std::string filename);
Expand All @@ -39,13 +39,16 @@ namespace pdflib

private:

bool initialized;

std::set<std::string> unknown_glyphs;

std::map<std::string, std::string> name_to_code;
std::map<std::string, std::string> name_to_utf8;
};

/// Default constructor.
///
/// The glyph tables start out empty and `initialized` starts out false, so
/// that the first call to initialise() reads the glyph files and later calls
/// are skipped.
font_glyphs::font_glyphs():
  initialized(false)
{}

font_glyphs::~font_glyphs()
Expand Down Expand Up @@ -103,6 +106,12 @@ namespace pdflib

void font_glyphs::initialise(std::string dirname)
{
if(initialized)
{
LOG_S(WARNING) << "skipping font_glyphs::initialise, already initialized ...";
return;
}

LOG_S(INFO) << "font-glyphs initialise from directory: "
<< dirname;

Expand All @@ -116,7 +125,7 @@ namespace pdflib
"/custom/MathematicalPi/MathematicalPi.hex.dat"
};

for(auto path : paths_hex)
for(auto path:paths_hex)
{
std::string fpath = dirname + path;
read_file_hex(fpath);
Expand All @@ -126,11 +135,13 @@ namespace pdflib
"/custom/MathematicalPi/MathematicalPi.uni.dat"
};

for(auto path : paths_uni)
for(auto path:paths_uni)
{
std::string fpath = dirname + path;
read_file_uni(fpath);
}

initialized = true;
}

void font_glyphs::read_file_hex(std::string filename)
Expand Down Expand Up @@ -165,7 +176,7 @@ namespace pdflib
{
name_to_utf8[key] = utils::string::hex_to_utf8(val_, 4);
}
else if(name_to_utf8.count(key)==1)
else if(name_to_utf8.count(key)==1) // already present
{
LOG_S(ERROR) << "key [" << key << "] is defined twice";
}
Expand Down
30 changes: 25 additions & 5 deletions src/v2/pdf_states/grph.h
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ namespace pdflib
int line_cap;
int line_join;

int dash_phase;
double dash_phase;
std::vector<double> dash_array;

double flatness;
Expand Down Expand Up @@ -213,8 +213,20 @@ namespace pdflib
dash_array.push_back(val);
}

assert(instructions[1].is_integer());
dash_phase = instructions[1].to_int();
if(instructions[1].is_integer())
{
dash_phase = instructions[1].to_int();
}
else if(instructions[1].is_number())
{
dash_phase = instructions[1].to_double();
}
else
{
dash_phase = 0;
LOG_S(ERROR) << "failed instructions[1] with is_integer() and is_number"
<< instructions[1].unparse();
}
}

void pdf_state<GRPH>::ri(std::vector<qpdf_instruction>& instructions)
Expand All @@ -227,8 +239,16 @@ namespace pdflib
//assert(instructions.size()==1);
if(not verify(instructions, 1, __FUNCTION__) ) { return; }

assert(instructions[0].is_number());
flatness = instructions[0].to_double();
if(instructions[0].is_number())
{
flatness = instructions[0].to_double();
}
else
{
flatness = 0;
LOG_S(ERROR) << "failed instructions[0].is_number(): "
<< instructions[0].unparse();
}
}

void pdf_state<GRPH>::gs(std::vector<qpdf_instruction>& instructions)
Expand Down
7 changes: 7 additions & 0 deletions src/v2/pdf_states/line.h
Original file line number Diff line number Diff line change
Expand Up @@ -518,6 +518,13 @@ namespace pdflib

// first close
auto& line = curr_lines.back();

if(line.size()==0)
{
LOG_S(WARNING) << "applying 'h' on empty line";
return;
}

std::pair<double, double> coor = line.front();

line.append(coor.first, coor.second);
Expand Down
7 changes: 4 additions & 3 deletions src/v2/pdf_states/text.h
Original file line number Diff line number Diff line change
Expand Up @@ -260,7 +260,7 @@ namespace pdflib
}
else
{
LOG_S(FATAL) << "unknown page-font: " << font_name;
LOG_S(ERROR) << "unknown page-font: " << font_name;
}
}

Expand Down Expand Up @@ -313,8 +313,9 @@ namespace pdflib
}
else
{
LOG_S(FATAL) << "item is not a string nor a value: "
<< item.unparse() << " [" << item.getTypeName() << "]";
LOG_S(ERROR) << "item is not a string nor a value: "
<< item.unparse() << " [" << item.getTypeName() << "]"
<< " -> skipping for now ...";
}
}
}
Expand Down
6 changes: 3 additions & 3 deletions src/v2/qpdf/to_json.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@ namespace pdflib
// FIXME: add a begin time to cap the max time spent in this routine
nlohmann::json to_json(QPDFObjectHandle obj, std::set<std::string> prev_objs={}, int level=0)
{
//const static int max_level=32;
const static int max_level=128;
const static int max_level=32;
//const static int max_level=128;

LOG_S(INFO) << "to_json (level=" << level << "): " << prev_objs.size();

Expand Down Expand Up @@ -47,7 +47,7 @@ namespace pdflib

if(level<max_level)
{
const static std::set<std::string> keys_to_be_skipped = {"/Parent", "/P", "/Annots"};
const static std::set<std::string> keys_to_be_skipped = {"/Parent", "/P", "/Annots", "/B"};

if(obj.isDictionary())
{
Expand Down

0 comments on commit 1815e7d

Please sign in to comment.