Skip to content

Commit 9371854

Browse files
Merge pull request camelot-dev#189 from camelot-dev/fix-179
[MRG] Prevent taking max of an empty set
2 parents 9087429 + 5d20d56 commit 9371854

File tree

6 files changed

+67
-43
lines changed

6 files changed

+67
-43
lines changed

camelot/parsers/stream.py

Lines changed: 43 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,7 @@ def _group_rows(text, row_tol=2):
121121
row_y = 0
122122
rows = []
123123
temp = []
124+
124125
for t in text:
125126
# is checking for upright necessary?
126127
# if t.get_text().strip() and all([obj.upright for obj in t._objs if
@@ -131,8 +132,10 @@ def _group_rows(text, row_tol=2):
131132
temp = []
132133
row_y = t.y0
133134
temp.append(t)
135+
134136
rows.append(sorted(temp, key=lambda t: t.x0))
135-
__ = rows.pop(0) # TODO: hacky
137+
if len(rows) > 1:
138+
__ = rows.pop(0) # TODO: hacky
136139
return rows
137140

138141
@staticmethod
@@ -345,43 +348,46 @@ def _generate_columns_and_rows(self, table_idx, tk):
345348
else:
346349
# calculate mode of the list of number of elements in
347350
# each row to guess the number of columns
348-
ncols = max(set(elements), key=elements.count)
349-
if ncols == 1:
350-
# if mode is 1, the page usually contains no tables
351-
# but there can be cases where the list can be skewed,
352-
# try to remove all 1s from list in this case and
353-
# see if the list contains elements, if yes, then use
354-
# the mode after removing 1s
355-
elements = list(filter(lambda x: x != 1, elements))
356-
if len(elements):
357-
ncols = max(set(elements), key=elements.count)
358-
else:
359-
warnings.warn(
360-
f"No tables found in table area {table_idx + 1}"
351+
if not len(elements):
352+
cols = [(text_x_min, text_x_max)]
353+
else:
354+
ncols = max(set(elements), key=elements.count)
355+
if ncols == 1:
356+
# if mode is 1, the page usually contains no tables
357+
# but there can be cases where the list can be skewed,
358+
# try to remove all 1s from list in this case and
359+
# see if the list contains elements, if yes, then use
360+
# the mode after removing 1s
361+
elements = list(filter(lambda x: x != 1, elements))
362+
if len(elements):
363+
ncols = max(set(elements), key=elements.count)
364+
else:
365+
warnings.warn(
366+
f"No tables found in table area {table_idx + 1}"
367+
)
368+
cols = [(t.x0, t.x1) for r in rows_grouped if len(r) == ncols for t in r]
369+
cols = self._merge_columns(sorted(cols), column_tol=self.column_tol)
370+
inner_text = []
371+
for i in range(1, len(cols)):
372+
left = cols[i - 1][1]
373+
right = cols[i][0]
374+
inner_text.extend(
375+
[
376+
t
377+
for direction in self.t_bbox
378+
for t in self.t_bbox[direction]
379+
if t.x0 > left and t.x1 < right
380+
]
361381
)
362-
cols = [(t.x0, t.x1) for r in rows_grouped if len(r) == ncols for t in r]
363-
cols = self._merge_columns(sorted(cols), column_tol=self.column_tol)
364-
inner_text = []
365-
for i in range(1, len(cols)):
366-
left = cols[i - 1][1]
367-
right = cols[i][0]
368-
inner_text.extend(
369-
[
370-
t
371-
for direction in self.t_bbox
372-
for t in self.t_bbox[direction]
373-
if t.x0 > left and t.x1 < right
374-
]
375-
)
376-
outer_text = [
377-
t
378-
for direction in self.t_bbox
379-
for t in self.t_bbox[direction]
380-
if t.x0 > cols[-1][1] or t.x1 < cols[0][0]
381-
]
382-
inner_text.extend(outer_text)
383-
cols = self._add_columns(cols, inner_text, self.row_tol)
384-
cols = self._join_columns(cols, text_x_min, text_x_max)
382+
outer_text = [
383+
t
384+
for direction in self.t_bbox
385+
for t in self.t_bbox[direction]
386+
if t.x0 > cols[-1][1] or t.x1 < cols[0][0]
387+
]
388+
inner_text.extend(outer_text)
389+
cols = self._add_columns(cols, inner_text, self.row_tol)
390+
cols = self._join_columns(cols, text_x_min, text_x_max)
385391

386392
return cols, rows
387393

tests/files/blank.pdf

-4.79 KB
Binary file not shown.

tests/files/empty.pdf

2.3 KB
Binary file not shown.

tests/files/only_page_number.pdf

2.38 KB
Binary file not shown.

tests/test_cli.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -160,8 +160,8 @@ def test_cli_output_format():
160160

161161
def test_cli_quiet():
162162
with TemporaryDirectory() as tempdir:
163-
infile = os.path.join(testdir, "blank.pdf")
164-
outfile = os.path.join(tempdir, "blank.csv")
163+
infile = os.path.join(testdir, "empty.pdf")
164+
outfile = os.path.join(tempdir, "empty.csv")
165165
runner = CliRunner()
166166

167167
result = runner.invoke(

tests/test_errors.py

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -55,15 +55,33 @@ def test_image_warning():
5555
)
5656

5757

58-
def test_no_tables_found():
59-
filename = os.path.join(testdir, "blank.pdf")
58+
def test_lattice_no_tables_on_page():
59+
filename = os.path.join(testdir, "empty.pdf")
6060
with warnings.catch_warnings():
6161
warnings.simplefilter("error")
6262
with pytest.raises(UserWarning) as e:
63-
tables = camelot.read_pdf(filename)
63+
tables = camelot.read_pdf(filename, flavor="lattice")
64+
assert str(e.value) == "No tables found on page-1"
65+
66+
67+
def test_stream_no_tables_on_page():
68+
filename = os.path.join(testdir, "empty.pdf")
69+
with warnings.catch_warnings():
70+
warnings.simplefilter("error")
71+
with pytest.raises(UserWarning) as e:
72+
tables = camelot.read_pdf(filename, flavor="stream")
6473
assert str(e.value) == "No tables found on page-1"
6574

6675

76+
def test_stream_no_tables_in_area():
77+
filename = os.path.join(testdir, "only_page_number.pdf")
78+
with warnings.catch_warnings():
79+
warnings.simplefilter("error")
80+
with pytest.raises(UserWarning) as e:
81+
tables = camelot.read_pdf(filename, flavor="stream")
82+
assert str(e.value) == "No tables found in table area 1"
83+
84+
6785
def test_no_tables_found_logs_suppressed():
6886
filename = os.path.join(testdir, "foo.pdf")
6987
with warnings.catch_warnings():
@@ -77,7 +95,7 @@ def test_no_tables_found_logs_suppressed():
7795

7896

7997
def test_no_tables_found_warnings_suppressed():
80-
filename = os.path.join(testdir, "blank.pdf")
98+
filename = os.path.join(testdir, "empty.pdf")
8199
with warnings.catch_warnings():
82100
# the test should fail if any warning is thrown
83101
warnings.simplefilter("error")

0 commit comments

Comments
 (0)