Skip to content

Commit

Permalink
fix: Replace all the FATAL with ERROR messages in the v2 parser (#53)
Browse files Browse the repository at this point in the history
* updated the visualize script

Signed-off-by: Peter Staar <[email protected]>

* replaced all the fatals with errors

Signed-off-by: Peter Staar <[email protected]>

* reformatted the python code

Signed-off-by: Peter Staar <[email protected]>

---------

Signed-off-by: Peter Staar <[email protected]>
  • Loading branch information
PeterStaar-IBM authored Nov 5, 2024
1 parent 004df07 commit cd15d00
Show file tree
Hide file tree
Showing 14 changed files with 186 additions and 69 deletions.
46 changes: 37 additions & 9 deletions docling_parse/visualize.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,13 @@ def parse_args():
help="Enable interactive mode (default: False)",
)

# Add an optional boolean argument for printing the text of each cell
parser.add_argument(
"--display-text",
action="store_true",
help="Enable interactive mode (default: False)",
)

# Add an argument for the output directory, defaulting to "./tmp"
parser.add_argument(
"-o",
Expand Down Expand Up @@ -91,11 +98,17 @@ def parse_args():
args.interactive,
args.output_dir,
int(args.page),
args.display_text,
)


def visualise_v1(
log_level: str, pdf_path: str, interactive: str, output_dir: str, page_num: int
log_level: str,
pdf_path: str,
interactive: str,
output_dir: str,
page_num: int,
display_text: bool,
):

parser = pdf_parser_v1()
Expand Down Expand Up @@ -200,7 +213,12 @@ def visualise_v1(


def visualise_v2(
log_level: str, pdf_path: str, interactive: str, output_dir: str, page_num: int
log_level: str,
pdf_path: str,
interactive: str,
output_dir: str,
page_num: int,
display_text: bool,
):

parser = pdf_parser_v2(log_level)
Expand All @@ -214,10 +232,17 @@ def visualise_v2(

doc = None

if page_num == -1:
doc = parser.parse_pdf_from_key(doc_key)
else:
doc = parser.parse_pdf_from_key_on_page(doc_key, page_num)
try:
if page_num == -1:
doc = parser.parse_pdf_from_key(doc_key)
else:
doc = parser.parse_pdf_from_key_on_page(doc_key, page_num)
except Exception as exc:
print(f"Could not parse pdf-document: {exc}")
doc = None

if doc == None:
return

parser.unload_document(doc_key)

Expand Down Expand Up @@ -295,6 +320,9 @@ def visualise_v2(
(x[3], H - y[3]),
]

if display_text:
print(row[cells_header.index("text")])

if "glyph" in row[cells_header.index("text")]:
print(f" skip cell -> {row}")
continue
Expand Down Expand Up @@ -328,12 +356,12 @@ def visualise_v2(

def main():

log_level, version, pdf, interactive, output_dir, page = parse_args()
log_level, version, pdf, interactive, output_dir, page, display_text = parse_args()

if version == "v1":
visualise_v1(log_level, pdf, interactive, output_dir, page)
visualise_v1(log_level, pdf, interactive, output_dir, page, display_text)
elif version == "v2":
visualise_v2(log_level, pdf, interactive, output_dir, page)
visualise_v2(log_level, pdf, interactive, output_dir, page, display_text)
else:
return -1

Expand Down
8 changes: 4 additions & 4 deletions src/v2/enums.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ namespace pdflib
else if (name=="CID_FONT_TYPE_2" or name=="/CIDFontType2") { return CID_FONT_TYPE_2; }
else
{
LOG_S(FATAL) << "unknown subtype " << name;
LOG_S(ERROR) << "unknown subtype " << name;
return NULL_TYPE;
}
}
Expand All @@ -59,7 +59,7 @@ namespace pdflib

default:
{
LOG_S(FATAL) << "encountered a NULL_ENCODING";
LOG_S(ERROR) << "encountered a NULL_ENCODING";
return "NULL_ENCODING";
}
}
Expand Down Expand Up @@ -90,7 +90,7 @@ namespace pdflib
else if(name=="CMAP_RESOURCES" ) { return CMAP_RESOURCES; }
else
{
LOG_S(FATAL) << __FILE__ << ":" << __LINE__ << " --> unknown encoding " << name;
LOG_S(ERROR) << __FILE__ << ":" << __LINE__ << " --> unknown encoding " << name;
return NULL_ENCODING;
}
}
Expand All @@ -109,7 +109,7 @@ namespace pdflib

default:
{
LOG_S(FATAL) << "encountered a NULL_ENCODING";
LOG_S(ERROR) << "encountered a NULL_ENCODING";
return "NULL_ENCODING";
}
}
Expand Down
6 changes: 5 additions & 1 deletion src/v2/pdf_decoders/stream.h
Original file line number Diff line number Diff line change
Expand Up @@ -253,7 +253,11 @@ namespace pdflib
{
if(stack.size()==0)
{
LOG_S(FATAL) << "stack-size is zero!";
std::stringstream message;
message << "stack-size is zero in " << __FILE__ << ":" << __LINE__;

LOG_S(ERROR) << message.str();
throw std::logic_error(message.str());
}

pdf_state<GLOBAL>& state = stack.back();
Expand Down
10 changes: 7 additions & 3 deletions src/v2/pdf_resources/page_dimension.h
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ namespace pdflib
}
else
{
LOG_S(FATAL) << "The page is missing the required '/MediaBox'";
LOG_S(ERROR) << "The page is missing the required '/MediaBox'";
}

if(json_resources.count("/CropBox"))
Expand Down Expand Up @@ -173,8 +173,12 @@ namespace pdflib
}
else
{
LOG_S(FATAL) << "could not find the page-dimensions: "
<< json_resources.dump(4);
std::stringstream ss;
ss << "could not find the page-dimensions: "
<< json_resources.dump(4);

LOG_S(ERROR) << ss.str();
throw std::logic_error(ss.str());
}
}

Expand Down
18 changes: 14 additions & 4 deletions src/v2/pdf_resources/page_font.h
Original file line number Diff line number Diff line change
Expand Up @@ -180,8 +180,9 @@ namespace pdflib
}
else
{
LOG_S(FATAL) << "no existing pdf_resources_dir: "
<< pdf_resources_dir;
std::string message = "no existing pdf_resources_dir: " + pdf_resources_dir;
LOG_S(ERROR) << message;
throw std::logic_error(message);
}

utils::timer timer;
Expand Down Expand Up @@ -1148,7 +1149,11 @@ namespace pdflib
}
else
{
LOG_S(FATAL) << "unknown type in " << __FUNCTION__;
std::stringstream message;
message << "unknown type in " << __FUNCTION__;

LOG_S(ERROR) << message.str();
throw std::logic_error(message.str());
}
}
}
Expand All @@ -1166,7 +1171,12 @@ namespace pdflib
if(not qpdf_font.hasKey("/ToUnicode"))
{
auto tmp = to_json(qpdf_font);
LOG_S(FATAL) << "qpdf-font: " << tmp.dump();

std::stringstream ss;
ss << "qpdf-font: " << tmp.dump();

LOG_S(ERROR) << ss.str();
throw std::logic_error(ss.str());
}

auto qpdf_obj = qpdf_font.getKey("/ToUnicode");
Expand Down
34 changes: 26 additions & 8 deletions src/v2/pdf_resources/page_font/base_font.h
Original file line number Diff line number Diff line change
Expand Up @@ -157,9 +157,15 @@ namespace pdflib
return bbox[3];
}

LOG_S(FATAL) << "properties does not have key 'Ascender': "
<< properties.dump(2);

{
std::stringstream ss;
ss << "properties does not have key 'Ascender': "
<< properties.dump(2);

LOG_S(ERROR) << ss.str();
throw std::logic_error(ss.str());
}

return -1.;
}

Expand All @@ -179,9 +185,15 @@ namespace pdflib
return bbox[1];
}

LOG_S(FATAL) << "properties does not have key 'Descender': "
<< properties.dump(2);
{
std::stringstream ss;
ss << "properties does not have key 'Descender': "
<< properties.dump(2);

LOG_S(ERROR) << ss.str();
throw std::logic_error(ss.str());
}

return -1.;
}

Expand All @@ -194,9 +206,15 @@ namespace pdflib
return properties["FontBBox"].get<std::array<double, 4> >();
}

LOG_S(FATAL) << "properties does not have key 'FontBBox': "
<< properties.dump(2);

{
std::stringstream ss;
ss << "properties does not have key 'FontBBox': "
<< properties.dump(2);

LOG_S(ERROR) << ss.str();
throw std::logic_error(ss.str());
}

return {0.0, 0.0, 0.0, 0.0};
}

Expand Down
12 changes: 7 additions & 5 deletions src/v2/pdf_resources/page_font/base_fonts.h
Original file line number Diff line number Diff line change
Expand Up @@ -121,8 +121,6 @@ namespace pdflib
{
if(norm_name.find(itr->first)!=std::string::npos)
{
//return itr->first;

// we have to be careful that "Helvetica" is not returned for "Helvetica-Bold"!
if(result.size()<(itr->first).size())
{
Expand All @@ -136,8 +134,8 @@ namespace pdflib
return result;
}

LOG_S(FATAL) << "unkown " << font_name << "[norm_name=" << norm_name << "]";

LOG_S(ERROR) << "unkown " << font_name << "[norm_name=" << norm_name << "]";
return "Unknown";
}

Expand Down Expand Up @@ -269,7 +267,11 @@ namespace pdflib

if(fontname=="unknown")
{
LOG_S(FATAL) << "no FontName found in " << filename;
std::stringstream ss;
ss << "no FontName found in " << filename;

LOG_S(ERROR) << ss.str();
throw std::logic_error(ss.str());
}

return fontname;
Expand Down
2 changes: 1 addition & 1 deletion src/v2/pdf_resources/page_font/cmap.h
Original file line number Diff line number Diff line change
Expand Up @@ -434,7 +434,7 @@ namespace pdflib

if(_map.count(begin+i)==1)
{
LOG_S(FATAL) << "overwriting number c=" << begin+i;
LOG_S(WARNING) << "overwriting number c=" << begin+i;
}

_map[begin + i] = tmp;
Expand Down
12 changes: 8 additions & 4 deletions src/v2/pdf_resources/page_font/font_cid.h
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,8 @@ namespace pdflib
if(file.fail())
{
LOG_S(ERROR) << "filename does not exists: " << filename;
LOG_S(FATAL) << "unknown data-file!";


}

bool cmap=false;
Expand Down Expand Up @@ -143,8 +144,11 @@ namespace pdflib

if(file.fail())
{
LOG_S(ERROR) << "filename does not exists: " << filename;
LOG_S(FATAL) << "unknown data-file!";
std::stringstream ss;
ss << "filename does not exists: " << filename;

LOG_S(ERROR) << ss.str();
throw std::logic_error(ss.str());
}

std::vector<int> col_inds = {};
Expand Down Expand Up @@ -246,7 +250,7 @@ namespace pdflib
}
else
{
LOG_S(FATAL) << "we should never arrive here!";
LOG_S(ERROR) << "all options exhausted for " << __FUNCTION__;
}
}

Expand Down
14 changes: 10 additions & 4 deletions src/v2/pdf_resources/page_font/glyphs.h
Original file line number Diff line number Diff line change
Expand Up @@ -152,8 +152,11 @@ namespace pdflib

if(file.fail())
{
LOG_S(ERROR) << "filename does not exists: " << filename;
LOG_S(FATAL) << "unknown data-file!";
std::stringstream ss;
ss << "filename does not exists: " << filename;

LOG_S(ERROR) << ss.str();
throw std::logic_error(ss.str());
}

std::string line;
Expand Down Expand Up @@ -218,8 +221,11 @@ namespace pdflib

if(file.fail())
{
LOG_S(ERROR) << "filename does not exists: " << filename;
LOG_S(FATAL) << "unknown data-file!";
std::stringstream ss;
ss << "filename does not exists: " << filename;

LOG_S(ERROR) << ss.str();
throw std::logic_error(ss.str());
}

std::string line;
Expand Down
Loading

0 comments on commit cd15d00

Please sign in to comment.