@@ -121,6 +121,7 @@ def _group_rows(text, row_tol=2):
121
121
row_y = 0
122
122
rows = []
123
123
temp = []
124
+
124
125
for t in text :
125
126
# is checking for upright necessary?
126
127
# if t.get_text().strip() and all([obj.upright for obj in t._objs if
@@ -131,8 +132,10 @@ def _group_rows(text, row_tol=2):
131
132
temp = []
132
133
row_y = t .y0
133
134
temp .append (t )
135
+
134
136
rows .append (sorted (temp , key = lambda t : t .x0 ))
135
- __ = rows .pop (0 ) # TODO: hacky
137
+ if len (rows ) > 1 :
138
+ __ = rows .pop (0 ) # TODO: hacky
136
139
return rows
137
140
138
141
@staticmethod
@@ -345,43 +348,46 @@ def _generate_columns_and_rows(self, table_idx, tk):
345
348
else :
346
349
# calculate mode of the list of number of elements in
347
350
# each row to guess the number of columns
348
- ncols = max (set (elements ), key = elements .count )
349
- if ncols == 1 :
350
- # if mode is 1, the page usually contains not tables
351
- # but there can be cases where the list can be skewed,
352
- # try to remove all 1s from list in this case and
353
- # see if the list contains elements, if yes, then use
354
- # the mode after removing 1s
355
- elements = list (filter (lambda x : x != 1 , elements ))
356
- if len (elements ):
357
- ncols = max (set (elements ), key = elements .count )
358
- else :
359
- warnings .warn (
360
- f"No tables found in table area { table_idx + 1 } "
351
+ if not len (elements ):
352
+ cols = [(text_x_min , text_x_max )]
353
+ else :
354
+ ncols = max (set (elements ), key = elements .count )
355
+ if ncols == 1 :
356
+ # if mode is 1, the page usually contains no tables
357
+ # but there can be cases where the list can be skewed,
358
+ # try to remove all 1s from list in this case and
359
+ # see if the list contains elements, if yes, then use
360
+ # the mode after removing 1s
361
+ elements = list (filter (lambda x : x != 1 , elements ))
362
+ if len (elements ):
363
+ ncols = max (set (elements ), key = elements .count )
364
+ else :
365
+ warnings .warn (
366
+ f"No tables found in table area { table_idx + 1 } "
367
+ )
368
+ cols = [(t .x0 , t .x1 ) for r in rows_grouped if len (r ) == ncols for t in r ]
369
+ cols = self ._merge_columns (sorted (cols ), column_tol = self .column_tol )
370
+ inner_text = []
371
+ for i in range (1 , len (cols )):
372
+ left = cols [i - 1 ][1 ]
373
+ right = cols [i ][0 ]
374
+ inner_text .extend (
375
+ [
376
+ t
377
+ for direction in self .t_bbox
378
+ for t in self .t_bbox [direction ]
379
+ if t .x0 > left and t .x1 < right
380
+ ]
361
381
)
362
- cols = [(t .x0 , t .x1 ) for r in rows_grouped if len (r ) == ncols for t in r ]
363
- cols = self ._merge_columns (sorted (cols ), column_tol = self .column_tol )
364
- inner_text = []
365
- for i in range (1 , len (cols )):
366
- left = cols [i - 1 ][1 ]
367
- right = cols [i ][0 ]
368
- inner_text .extend (
369
- [
370
- t
371
- for direction in self .t_bbox
372
- for t in self .t_bbox [direction ]
373
- if t .x0 > left and t .x1 < right
374
- ]
375
- )
376
- outer_text = [
377
- t
378
- for direction in self .t_bbox
379
- for t in self .t_bbox [direction ]
380
- if t .x0 > cols [- 1 ][1 ] or t .x1 < cols [0 ][0 ]
381
- ]
382
- inner_text .extend (outer_text )
383
- cols = self ._add_columns (cols , inner_text , self .row_tol )
384
- cols = self ._join_columns (cols , text_x_min , text_x_max )
382
+ outer_text = [
383
+ t
384
+ for direction in self .t_bbox
385
+ for t in self .t_bbox [direction ]
386
+ if t .x0 > cols [- 1 ][1 ] or t .x1 < cols [0 ][0 ]
387
+ ]
388
+ inner_text .extend (outer_text )
389
+ cols = self ._add_columns (cols , inner_text , self .row_tol )
390
+ cols = self ._join_columns (cols , text_x_min , text_x_max )
385
391
386
392
return cols , rows
387
393
0 commit comments