From 33ddd7b2f15386e39218c638fedc308aa7f32dc5 Mon Sep 17 00:00:00 2001 From: BobLd Date: Tue, 22 Sep 2020 12:33:34 +0100 Subject: [PATCH 1/4] start implementing nurminen --- Tabula.Csv/Tabula.Csv.csproj | 3 +- Tabula.Json/Tabula.Json.csproj | 4 +- .../SimpleNurminenDetectionAlgorithm.cs | 818 ++++++++++++++++++ Tabula/PageArea.cs | 2 +- Tabula/Ruling.cs | 18 +- Tabula/Tabula.csproj | 2 + Tabula/Tabula.xml | 29 +- Tabula/TextStripper.cs | 18 +- 8 files changed, 877 insertions(+), 17 deletions(-) create mode 100644 Tabula/Detectors/SimpleNurminenDetectionAlgorithm.cs diff --git a/Tabula.Csv/Tabula.Csv.csproj b/Tabula.Csv/Tabula.Csv.csproj index d39a5c8..e5fca44 100644 --- a/Tabula.Csv/Tabula.Csv.csproj +++ b/Tabula.Csv/Tabula.Csv.csproj @@ -1,10 +1,11 @@ - + netcoreapp3.1;netstandard2.0;net45;net451;net452;net46;net461;net462;net47 Extract tables from PDF files (port of tabula-java using PdfPig). Csv and Tsv writers. https://github.com/BobLd/tabula-sharp 0.1.0-alpha001 + BobLd diff --git a/Tabula.Json/Tabula.Json.csproj b/Tabula.Json/Tabula.Json.csproj index 2cd4422..efe0ffa 100644 --- a/Tabula.Json/Tabula.Json.csproj +++ b/Tabula.Json/Tabula.Json.csproj @@ -1,10 +1,12 @@ - + netcoreapp3.1;netstandard2.0;net45;net451;net452;net46;net461;net462;net47 Extract tables from PDF files (port of tabula-java using PdfPig). Json writer. https://github.com/BobLd/tabula-sharp 0.1.0-alpha001 + BobLd + BobLd diff --git a/Tabula/Detectors/SimpleNurminenDetectionAlgorithm.cs b/Tabula/Detectors/SimpleNurminenDetectionAlgorithm.cs new file mode 100644 index 0000000..c2c39fc --- /dev/null +++ b/Tabula/Detectors/SimpleNurminenDetectionAlgorithm.cs @@ -0,0 +1,818 @@ +using System; +using System.Collections.Generic; +using System.Diagnostics.CodeAnalysis; +using System.Linq; +using System.Text; +using Tabula.Extractors; +using UglyToad.PdfPig; +using UglyToad.PdfPig.Content; +using UglyToad.PdfPig.Core; +using UglyToad.PdfPig.DocumentLayoutAnalysis; + +namespace Tabula.Detectors +{ + public class SimpleNurminenDetectionAlgorithm : IDetectionAlgorithm + { + + private static int GRAYSCALE_INTENSITY_THRESHOLD = 25; + private static int HORIZONTAL_EDGE_WIDTH_MINIMUM = 50; + private static int VERTICAL_EDGE_HEIGHT_MINIMUM = 10; + private static int CELL_CORNER_DISTANCE_MAXIMUM = 10; + private static float POINT_SNAP_DISTANCE_THRESHOLD = 8f; + private static float TABLE_PADDING_AMOUNT = 1.0f; + private static int REQUIRED_TEXT_LINES_FOR_EDGE = 4; + private static int REQUIRED_CELLS_FOR_TABLE = 4; + private static float IDENTICAL_TABLE_OVERLAP_RATIO = 0.9f; + + /// + /// Helper class that encapsulates a text edge + /// + private class TextEdge + { + public PdfLine Line; //: Line2D.Float + + // types of text edges + public const int LEFT = 0; + public const int MID = 1; + public const int RIGHT = 2; + public const int NUM_TYPES = 3; + + public int intersectingTextRowCount; + + public TextEdge(double x1, double y1, double x2, double y2) + { + Line = new PdfLine(x1, y1, x2, y2); //super(x1, y1, x2, y2); + this.intersectingTextRowCount = 0; + } + } + + /// + /// Helper container for all text edges on a page + /// + private class TextEdges : List> + { + public TextEdges(List leftEdges, List midEdges, List rightEdges) : base(3) + { + //super(3); + this.Add(leftEdges); + this.Add(midEdges); + this.Add(rightEdges); + } + } + + /// + /// Helper container for relevant text edge info + /// + private class RelevantEdges + { + public int edgeType; + public int edgeCount; + + public RelevantEdges(int edgeType, int edgeCount) + { + this.edgeType = edgeType; + this.edgeCount = edgeCount; + } + } + + public List Detect(PageArea page) + { + // get horizontal & vertical lines + // we get these from an image of the PDF and not the PDF itself because sometimes there are invisible PDF + // instructions that are interpreted incorrectly as visible elements - we really want to capture what a + // person sees when they look at the PDF + // hack here, we don't convert to an image + var pageRulings = page.GetRulings(); + List horizontalRulings = this.getHorizontalRulings(pageRulings); + List verticalRulings = this.getVerticalRulings(pageRulings); + // end hack here + + List allEdges = new List(horizontalRulings); //ArrayList<>(horizontalRulings); + allEdges.AddRange(verticalRulings); + + List tableAreas = new List(); // ArrayList<>(); + + // if we found some edges, try to find some tables based on them + if (allEdges.Count > 0) + { + // now we need to snap edge endpoints to a grid + Utils.SnapPoints(allEdges, POINT_SNAP_DISTANCE_THRESHOLD, POINT_SNAP_DISTANCE_THRESHOLD); + + // normalize the rulings to make sure snapping didn't create any wacky non-horizontal/vertical rulings + foreach (List rulings in new[] { horizontalRulings, verticalRulings }) //Arrays.asList(horizontalRulings, verticalRulings)) + { + //for (Iterator iterator = rulings.iterator(); iterator.hasNext();) + foreach (var ruling in rulings.ToList()) // use ToList to be able to remove + { + //Ruling ruling = iterator.next(); + + ruling.Normalize(); + if (ruling.IsOblique) + { + rulings.Remove(ruling);//iterator.remove(); + } + } + } + + // merge the edge lines into rulings - this makes finding edges between crossing points in the next step easier + // we use a larger pixel expansion than the normal spreadsheet extraction method to cover gaps in the + // edge detection/pixel snapping steps + horizontalRulings = Ruling.CollapseOrientedRulings(horizontalRulings, 5); + verticalRulings = Ruling.CollapseOrientedRulings(verticalRulings, 5); + + // use the rulings and points to find cells + List cells = SpreadsheetExtractionAlgorithm.FindCells(horizontalRulings, verticalRulings).Cast().ToList(); + + // then use those cells to make table areas + tableAreas = getTableAreasFromCells(cells); + } + + // next find any vertical rulings that intersect tables - sometimes these won't have completely been captured as + // cells if there are missing horizontal lines (which there often are) + // let's assume though that these lines should be part of the table + foreach (Ruling verticalRuling in verticalRulings) // Line2D.Float + { + foreach (TableRectangle tableArea in tableAreas) + { + if (verticalRuling.Intersects(tableArea) && + !(tableArea.Contains(verticalRuling.P1) && tableArea.Contains(verticalRuling.P2))) + { + tableArea.SetTop((float)Math.Floor(Math.Min(tableArea.Top, verticalRuling.Y1))); // bobld: top and bottom! min and max + tableArea.SetBottom((float)Math.Ceiling(Math.Max(tableArea.Bottom, verticalRuling.Y2))); // bobld: top and bottom! min and max + break; + } + } + } + + /* BobLd: not sure this is the case in tabula-sharp/PdfPig + // the tabula Page coordinate space is half the size of the PDFBox image coordinate space + // so halve the table area size before proceeding and add a bit of padding to make sure we capture everything + foreach (TableRectangle area in tableAreas) + { + area.x = (float)Math.floor(area.x / 2) - TABLE_PADDING_AMOUNT; + area.y = (float)Math.floor(area.y / 2) - TABLE_PADDING_AMOUNT; + area.width = (float)Math.ceil(area.width / 2) + TABLE_PADDING_AMOUNT; + area.height = (float)Math.ceil(area.height / 2) + TABLE_PADDING_AMOUNT; + } + + // we're going to want halved horizontal lines later too + foreach (Ruling ruling in horizontalRulings) // Line2D.Float + { + ruling.x1 = ruling.x1 / 2; + ruling.y1 = ruling.y1 / 2; + ruling.x2 = ruling.x2 / 2; + ruling.y2 = ruling.y2 / 2; + } + */ + + // now look at text rows to help us find more tables and flesh out existing ones + List textChunks = TextElement.MergeWords(page.GetText()); + List lines = TextChunk.GroupByLines(textChunks); + + // first look for text rows that intersect an existing table - those lines should probably be part of the table + foreach (TableLine textRow in lines) + { + foreach (TableRectangle tableArea in tableAreas) + { + if (!tableArea.Contains(textRow) && textRow.Intersects(tableArea)) + { + tableArea.SetLeft((float)Math.Floor(Math.Min(textRow.Left, tableArea.Left))); + tableArea.SetRight((float)Math.Ceiling(Math.Max(textRow.Right, tableArea.Right))); + } + } + } + + // get rid of tables that DO NOT intersect any text areas - these are likely graphs or some sort of graphic + //for (Iterator iterator = tableAreas.iterator(); iterator.hasNext();) + foreach (TableRectangle table in tableAreas.ToList()) // use tolist to be abnle to remove + { + //Rectangle table = iterator.next(); + + bool intersectsText = false; + foreach (TableLine textRow in lines) + { + if (table.Intersects(textRow)) + { + intersectsText = true; + break; + } + } + + if (!intersectsText) + { + tableAreas.Remove(table); + //iterator.remove(); + } + } + + // lastly, there may be some tables that don't have any vertical rulings at all + // we'll use text edges we've found to try and guess which text rows are part of a table + + // in his thesis nurminen goes through every row to try to assign a probability that the line is in a table + // we're going to try a general heuristic instead, trying to find what type of edge (left/right/mid) intersects + // the most text rows, and then use that magic number of "relevant" edges to decide what text rows should be + // part of a table. + + bool foundTable; + + do + { + foundTable = false; + + // get rid of any text lines contained within existing tables, this allows us to find more tables + //for (Iterator iterator = lines.iterator(); iterator.hasNext();) + foreach (var textRow in lines.ToList()) + { + //TableLine textRow = iterator.next(); + foreach (TableRectangle table in tableAreas) + { + if (table.Contains(textRow)) + { + //iterator.remove(); + lines.Remove(textRow); + break; + } + } + } + + // get text edges from remaining lines in the document + TextEdges textEdges = getTextEdges(lines); + List leftTextEdges = textEdges[TextEdge.LEFT]; + List midTextEdges = textEdges[TextEdge.MID]; + List rightTextEdges = textEdges[TextEdge.RIGHT]; + + // find the relevant text edges (the ones we think define where a table is) + RelevantEdges relevantEdgeInfo = getRelevantEdges(textEdges, lines); + + // we found something relevant so let's look for rows that fit our criteria + if (relevantEdgeInfo.edgeType != -1) + { + List relevantEdges = null; + switch (relevantEdgeInfo.edgeType) + { + case TextEdge.LEFT: + relevantEdges = leftTextEdges; + break; + case TextEdge.MID: + relevantEdges = midTextEdges; + break; + case TextEdge.RIGHT: + relevantEdges = rightTextEdges; + break; + } + + TableRectangle table = getTableFromText(lines, relevantEdges, relevantEdgeInfo.edgeCount, horizontalRulings); + + if (table != null) + { + foundTable = true; + tableAreas.Add(table); + } + } + } while (foundTable); + + // create a set of our current tables that will eliminate duplicate tables + SortedSet tableSet = new SortedSet(new TreeSetComparer()); //Set tableSet = new TreeSet<>(new Comparator() {... + //tableSet.addAll(tableAreas); + foreach (var table in tableAreas) + { + tableSet.Add(table); + } + + return tableSet.ToList(); //new ArrayList<>(tableSet); + } + + private class TreeSetComparer : IComparer + { + public int Compare(TableRectangle o1, TableRectangle o2) + { + if (o1.Equals(o2)) + { + return 0; + } + + // o1 is "equal" to o2 if o2 contains all of o1 + if (o2.Contains(o1)) + { + return 0; + } + + if (o1.Contains(o2)) + { + return 0; + } + + // otherwise see if these tables are "mostly" the same + double overlap = o1.OverlapRatio(o2); + if (overlap >= IDENTICAL_TABLE_OVERLAP_RATIO) + { + return 0; + } + else + { + return 1; + } + } + } + + private TableRectangle getTableFromText(List lines, List relevantEdges, int relevantEdgeCount, List horizontalRulings) + { + TableRectangle table = new TableRectangle(); + + TableLine prevRow = null; + TableLine firstTableRow = null; + TableLine lastTableRow = null; + + int tableSpaceCount = 0; + double totalRowSpacing = 0; + + // go through the lines and find the ones that have the correct count of the relevant edges + foreach (TableLine textRow in lines) + { + int numRelevantEdges = 0; + + if (firstTableRow != null && tableSpaceCount > 0) + { + // check to make sure this text row is within a line or so of the other lines already added + // if it's not, we should stop the table here + double tableLineThreshold = (totalRowSpacing / tableSpaceCount) * 2.5; + double lineDistance = textRow.Top - prevRow.Top; // bobld: top or bottom??? + + if (lineDistance > tableLineThreshold) + { + lastTableRow = prevRow; + break; + } + } + + // for larger tables, be a little lenient on the number of relevant rows the text intersects + // for smaller tables, not so much - otherwise we'll end up treating paragraphs as tables too + int relativeEdgeDifferenceThreshold = 1; + if (relevantEdgeCount <= 3) + { + relativeEdgeDifferenceThreshold = 0; + } + + foreach (TextEdge edge in relevantEdges) + { + if (textRow.IntersectsLine(edge.Line)) + { + numRelevantEdges++; + } + } + + // see if we have a candidate text row + if (numRelevantEdges >= (relevantEdgeCount - relativeEdgeDifferenceThreshold)) + { + // keep track of table row spacing + if (prevRow != null && firstTableRow != null) + { + tableSpaceCount++; + totalRowSpacing += textRow.Top - prevRow.Top; // bobld: top or bottom??? + } + + // row is part of a table + if (table.Area == 0) + { + firstTableRow = textRow; + table.SetRect(textRow); + } + else + { + table.SetLeft(Math.Min(table.Left, textRow.Left)); + table.SetBottom(Math.Max(table.Bottom, textRow.Bottom)); // bobld: max or min? + table.SetRight(Math.Max(table.Right, textRow.Right)); + } + } + else + { + // no dice + // if we're at the end of the table, save the last row + if (firstTableRow != null && lastTableRow == null) + { + lastTableRow = prevRow; + } + } + + prevRow = textRow; + } + + // if we don't have a table now, we won't after the next step either + if (table.Area == 0) + { + return null; + } + + if (lastTableRow == null) + { + // takes care of one-row tables or tables that end at the bottom of a page + lastTableRow = prevRow; + } + + // use the average row height and nearby horizontal lines to extend the table area + double avgRowHeight; + if (tableSpaceCount > 0) + { + avgRowHeight = totalRowSpacing / tableSpaceCount; + } + else + { + avgRowHeight = lastTableRow.Height; + } + + double rowHeightThreshold = avgRowHeight * 1.5; + + // check lines after the bottom of the table + foreach (Ruling ruling in horizontalRulings) //Line2D.Float + { + + if (ruling.Y1 < table.Bottom) // bobld: warning top and bottom + { + continue; + } + + double distanceFromTable = ruling.Y1 - table.Bottom; + if (distanceFromTable <= rowHeightThreshold) + { + // use this ruling to help define the table + table.SetBottom(Math.Max(table.Bottom, ruling.Y1)); // bobld: max or min? + table.SetLeft(Math.Min(table.Left, ruling.X1)); + table.SetRight(Math.Max(table.Right, ruling.X2)); + } + else + { + // no use checking any further + break; + } + } + + // do the same for lines at the top, but make the threshold greater since table headings tend to be + // larger to fit up to three-ish rows of text (at least but we don't want to grab too much) + rowHeightThreshold = avgRowHeight * 3.8; + + for (int i = horizontalRulings.Count - 1; i >= 0; i--) + { + Ruling ruling = horizontalRulings[i]; //.get(i); // Line2D.Float + + if (ruling.Y1 > table.Top) //.getTop()) + { + continue; + } + + double distanceFromTable = table.Top - ruling.Y1; + if (distanceFromTable <= rowHeightThreshold) + { + table.SetTop(Math.Min(table.Top, ruling.Y1)); // bobld: max or min? + table.SetLeft(Math.Min(table.Left, ruling.X1)); + table.SetRight(Math.Max(table.Right, ruling.X2)); + } + else + { + break; + } + } + + // add a bit of padding since the halved horizontal lines are a little fuzzy anyways + table.SetTop(Math.Floor(table.Top) - TABLE_PADDING_AMOUNT);// Bobld: Ceiling/Floor Top/Bottom?? + table.SetBottom(Math.Ceiling(table.Bottom) + TABLE_PADDING_AMOUNT); // Bobld: Ceiling/Floor Top/Bottom?? + table.SetLeft(Math.Floor(table.Left) - TABLE_PADDING_AMOUNT); + table.SetRight(Math.Ceiling(table.Right) + TABLE_PADDING_AMOUNT); + + return table; + } + + private RelevantEdges getRelevantEdges(TextEdges textEdges, List lines) + { + List leftTextEdges = textEdges[TextEdge.LEFT]; + List midTextEdges = textEdges[TextEdge.MID]; + List rightTextEdges = textEdges[TextEdge.RIGHT]; + + // first we'll find the number of lines each type of edge crosses + int[][] edgeCountsPerLine = new int[lines.Count][]; //[TextEdge.NUM_TYPES]; + for (int i = 0; i < edgeCountsPerLine.Length; i++) + { + edgeCountsPerLine[i] = new int[TextEdge.NUM_TYPES]; + } + + foreach (TextEdge edge in leftTextEdges) + { + edgeCountsPerLine[edge.intersectingTextRowCount - 1][TextEdge.LEFT]++; + } + + foreach (TextEdge edge in midTextEdges) + { + edgeCountsPerLine[edge.intersectingTextRowCount - 1][TextEdge.MID]++; + } + + foreach (TextEdge edge in rightTextEdges) + { + edgeCountsPerLine[edge.intersectingTextRowCount - 1][TextEdge.RIGHT]++; + } + + // now let's find the relevant edge type and the number of those edges we should look for + // we'll only take a minimum of two edges to look for tables + int relevantEdgeType = -1; + int relevantEdgeCount = 0; + for (int i = edgeCountsPerLine.Length - 1; i > 2; i--) + { + if (edgeCountsPerLine[i][TextEdge.LEFT] > 2 && + edgeCountsPerLine[i][TextEdge.LEFT] >= edgeCountsPerLine[i][TextEdge.RIGHT] && + edgeCountsPerLine[i][TextEdge.LEFT] >= edgeCountsPerLine[i][TextEdge.MID]) + { + relevantEdgeCount = edgeCountsPerLine[i][TextEdge.LEFT]; + relevantEdgeType = TextEdge.LEFT; + break; + } + + if (edgeCountsPerLine[i][TextEdge.RIGHT] > 1 && + edgeCountsPerLine[i][TextEdge.RIGHT] >= edgeCountsPerLine[i][TextEdge.LEFT] && + edgeCountsPerLine[i][TextEdge.RIGHT] >= edgeCountsPerLine[i][TextEdge.MID]) + { + relevantEdgeCount = edgeCountsPerLine[i][TextEdge.RIGHT]; + relevantEdgeType = TextEdge.RIGHT; + break; + } + + if (edgeCountsPerLine[i][TextEdge.MID] > 1 && + edgeCountsPerLine[i][TextEdge.MID] >= edgeCountsPerLine[i][TextEdge.RIGHT] && + edgeCountsPerLine[i][TextEdge.MID] >= edgeCountsPerLine[i][TextEdge.LEFT]) + { + relevantEdgeCount = edgeCountsPerLine[i][TextEdge.MID]; + relevantEdgeType = TextEdge.MID; + break; + } + } + + return new RelevantEdges(relevantEdgeType, relevantEdgeCount); + } + + + private TextEdges getTextEdges(List lines) + { + // get all text edges (lines that align with the left, middle and right of chunks of text) that extend + // uninterrupted over at least REQUIRED_TEXT_LINES_FOR_EDGE lines of text + + List leftTextEdges = new List(); //ArrayList<>(); + List midTextEdges = new List(); // ArrayList<>(); + List rightTextEdges = new List(); // ArrayList<>(); + + Dictionary> currLeftEdges = new Dictionary>(); //HashMap<>(); + Dictionary> currMidEdges = new Dictionary>(); //HashMap<>(); + Dictionary> currRightEdges = new Dictionary>(); //HashMap<>(); + + foreach (TableLine textRow in lines) + { + foreach (TextChunk text in textRow.TextElements) //.getTextElements()) + { + int left = (int)Math.Floor(text.Left); //.getLeft())); + int right = (int)Math.Floor(text.Right); //.getRight())); + int mid = (int)(left + ((right - left) / 2)); + + // first put this chunk into any edge buckets it belongs to + if (!currLeftEdges.TryGetValue(left, out List leftEdge)) + { + leftEdge = new List(); //ArrayList<>(); + currLeftEdges[left] = leftEdge; + } + leftEdge.Add(text); + + if (!currMidEdges.TryGetValue(mid, out List midEdge)) + { + midEdge = new List(); //ArrayList<>(); + currMidEdges[mid] = midEdge; + } + midEdge.Add(text); + + if (!currRightEdges.TryGetValue(right, out List rightEdge)) + { + rightEdge = new List(); //ArrayList<>(); + currRightEdges[right] = rightEdge; + } + rightEdge.Add(text); + + // now see if this text chunk blows up any other edges + //for (Iterator>> iterator = currLeftEdges.entrySet().iterator(); iterator.hasNext();) + foreach (var entry in currLeftEdges.ToList()) // use tolist to be able to remove + { + //Map.Entry> entry = iterator.next(); + int key = entry.Key; //.getKey(); + if (key > left && key < right) + { + //iterator.remove(); + currLeftEdges.Remove(key); + List edgeChunks = entry.Value; //.getValue(); + if (edgeChunks.Count >= REQUIRED_TEXT_LINES_FOR_EDGE) + { + TextChunk first = edgeChunks[0]; //.get(0); + TextChunk last = edgeChunks[edgeChunks.Count - 1]; //.get(edgeChunks.size() - 1); + + TextEdge edge = new TextEdge(key, first.Top, key, last.Bottom); // bobld: carefull here top/bottom + edge.intersectingTextRowCount = Math.Min(edgeChunks.Count, lines.Count); + + leftTextEdges.Add(edge); + } + } + } + + //for (Iterator>> iterator = currMidEdges.entrySet().iterator(); iterator.hasNext();) + foreach (var entry in currMidEdges.ToList()) + { + //Map.Entry> entry = iterator.next(); + int key = entry.Key; //.getKey(); + if (key > left && key < right && Math.Abs(key - mid) > 2) + { + //iterator.remove(); + currMidEdges.Remove(key); + List edgeChunks = entry.Value; //.getValue(); + if (edgeChunks.Count >= REQUIRED_TEXT_LINES_FOR_EDGE) + { + TextChunk first = edgeChunks[0]; //.get(0); + TextChunk last = edgeChunks[edgeChunks.Count - 1]; //.get(edgeChunks.size() - 1); + + TextEdge edge = new TextEdge(key, first.Top, key, last.Bottom); + edge.intersectingTextRowCount = Math.Min(edgeChunks.Count, lines.Count); + + midTextEdges.Add(edge); + } + } + } + + //for (Iterator>> iterator = currRightEdges.entrySet().iterator(); iterator.hasNext();) + foreach (var entry in currRightEdges.ToList()) + { + //Map.Entry> entry = iterator.next(); + int key = entry.Key; //.getKey(); + if (key > left && key < right) + { + //iterator.remove(); + currRightEdges.Remove(key); + List edgeChunks = entry.Value; //.getValue(); + if (edgeChunks.Count >= REQUIRED_TEXT_LINES_FOR_EDGE) + { + TextChunk first = edgeChunks[0]; //.get(0); + TextChunk last = edgeChunks[edgeChunks.Count - 1]; //.get(edgeChunks.size() - 1); + + TextEdge edge = new TextEdge(key, first.Top, key, last.Bottom); + edge.intersectingTextRowCount = Math.Min(edgeChunks.Count, lines.Count); + + rightTextEdges.Add(edge); + } + } + } + } + } + + // add the leftovers + foreach (int key in currLeftEdges.Keys) //.keySet()) + { + List edgeChunks = currLeftEdges[key]; //.get(key); + if (edgeChunks.Count >= REQUIRED_TEXT_LINES_FOR_EDGE) + { + TextChunk first = edgeChunks[0]; //.get(0); + TextChunk last = edgeChunks[edgeChunks.Count - 1]; //.get(edgeChunks.size() - 1); + + TextEdge edge = new TextEdge(key, first.Top, key, last.Bottom); + edge.intersectingTextRowCount = Math.Min(edgeChunks.Count, lines.Count); + + leftTextEdges.Add(edge); + } + } + + foreach (int key in currMidEdges.Keys)//.keySet()) + { + List edgeChunks = currMidEdges[key]; //.get(key); + if (edgeChunks.Count >= REQUIRED_TEXT_LINES_FOR_EDGE) + { + TextChunk first = edgeChunks[0]; //.get(0); + TextChunk last = edgeChunks[edgeChunks.Count - 1]; //.get(edgeChunks.size() - 1); + + TextEdge edge = new TextEdge(key, first.Top, key, last.Bottom); + edge.intersectingTextRowCount = Math.Min(edgeChunks.Count, lines.Count); + + midTextEdges.Add(edge); + } + } + + foreach (int key in currRightEdges.Keys) //.keySet()) + { + List edgeChunks = currRightEdges[key]; //.get(key); + if (edgeChunks.Count >= REQUIRED_TEXT_LINES_FOR_EDGE) + { + TextChunk first = edgeChunks[0]; //.get(0); + TextChunk last = edgeChunks[edgeChunks.Count - 1]; //.get(edgeChunks.size() - 1); + + TextEdge edge = new TextEdge(key, first.Top, key, last.Bottom); + edge.intersectingTextRowCount = Math.Min(edgeChunks.Count, lines.Count); + + rightTextEdges.Add(edge); + } + } + + return new TextEdges(leftTextEdges, midTextEdges, rightTextEdges); + } + + [Obsolete("Is it redundant??????")] + private List getTableAreasFromCells(List cells) + { + List> cellGroups = new List>(); //ArrayList<>(); + foreach (TableRectangle cell in cells) + { + bool addedToGroup = false; + + foreach (List cellGroup in cellGroups) + { + //if (!cellCheck) break; // equivalent to break cellCheck; + foreach (TableRectangle groupCell in cellGroup) + { + //if (!cellCheck) break; // equivalent to break cellCheck; + PdfPoint[] groupCellCorners = groupCell.Points; + PdfPoint[] candidateCorners = cell.Points; + + for (int i = 0; i < candidateCorners.Length; i++) + { + //if (!cellCheck) break; // equivalent to break cellCheck; + for (int j = 0; j < groupCellCorners.Length; j++) + { + //if (candidateCorners[i].distance(groupCellCorners[j]) < CELL_CORNER_DISTANCE_MAXIMUM) + if (Distances.Euclidean(candidateCorners[i], groupCellCorners[j]) < CELL_CORNER_DISTANCE_MAXIMUM) + { + cellGroup.Add(cell); + addedToGroup = true; + //cellCheck = false; + //break;// cellCheck; + goto cellCheck; + } + } + } + } + } + + cellCheck: + if (!addedToGroup) + { + List cellGroup = new List(); //ArrayList<>(); + cellGroup.Add(cell); + cellGroups.Add(cellGroup); + } + } + + // create table areas based on cell group + List tableAreas = new List(); //ArrayList<>(); + foreach (List cellGroup in cellGroups) + { + // less than four cells should not make a table + if (cellGroup.Count < REQUIRED_CELLS_FOR_TABLE) + { + continue; + } + + // warning below: min/max and top/bottom + double top = double.MaxValue; //Float.MAX_VALUE; + double left = double.MaxValue; // Float.MAX_VALUE; + double bottom = double.MinValue; // Float.MIN_VALUE; + double right = double.MinValue; // Float.MIN_VALUE; + + foreach (TableRectangle cell in cellGroup) + { + if (cell.Top < top) top = cell.Top; //.getTop(); + if (cell.Left < left) left = cell.Left; //.getLeft(); + if (cell.Bottom > bottom) bottom = cell.Bottom; //.getBottom(); + if (cell.Right > right) right = cell.Right; //.getRight(); + } + + //below is deprecated + tableAreas.Add(new TableRectangle(new PdfRectangle(left, bottom, right, top))); //top, left, right - left, bottom - top)); + } + + return tableAreas; + } + + private List getHorizontalRulings(IReadOnlyList rulings) + { + List horizontalR = new List(); + foreach (Ruling r in rulings) + { + if (r.IsHorizontal) + { + horizontalR.Add(r); + } + } + + return Ruling.CollapseOrientedRulings(horizontalR); + } + + private List getVerticalRulings(IReadOnlyList rulings) + { + List verticalR = new List(); + foreach (Ruling r in rulings) + { + if (r.IsVertical) + { + verticalR.Add(r); + } + } + + return Ruling.CollapseOrientedRulings(verticalR); + } + } +} diff --git a/Tabula/PageArea.cs b/Tabula/PageArea.cs index 4c6e604..36cc4b9 100644 --- a/Tabula/PageArea.cs +++ b/Tabula/PageArea.cs @@ -306,7 +306,7 @@ public IReadOnlyList GetRulings() /// public void AddRuling(Ruling r) { - if (r.Oblique) + if (r.IsOblique) { throw new InvalidOperationException("Can't add an oblique ruling"); } diff --git a/Tabula/Ruling.cs b/Tabula/Ruling.cs index bc8b535..b1a32ac 100644 --- a/Tabula/Ruling.cs +++ b/Tabula/Ruling.cs @@ -85,7 +85,7 @@ public void Normalize() /// /// Is the blique? Neither vertical nor horizontal. /// - public bool Oblique => !(this.IsVertical || this.IsHorizontal); + public bool IsOblique => !(this.IsVertical || this.IsHorizontal); /// /// attributes that make sense only for non-oblique lines @@ -95,7 +95,7 @@ public double Position { get { - if (this.Oblique) + if (this.IsOblique) { throw new InvalidOperationException(); } @@ -106,7 +106,7 @@ public double Position public void SetPosition(float v) { - if (this.Oblique) + if (this.IsOblique) { throw new InvalidOperationException(); } @@ -127,7 +127,7 @@ public double Start { get { - if (this.Oblique) + if (this.IsOblique) { throw new InvalidOperationException(); } @@ -138,7 +138,7 @@ public double Start public void SetStart(double v) { - if (this.Oblique) + if (this.IsOblique) { throw new InvalidOperationException(); } @@ -157,7 +157,7 @@ public double End { get { - if (this.Oblique) + if (this.IsOblique) { throw new InvalidOperationException(); } @@ -168,7 +168,7 @@ public double End public void SetEnd(double v) { - if (this.Oblique) + if (this.IsOblique) { throw new InvalidOperationException(); } @@ -185,7 +185,7 @@ public void SetEnd(double v) private void SetStartEnd(double start, double end) { - if (this.Oblique) + if (this.IsOblique) { throw new InvalidOperationException(); } @@ -599,7 +599,7 @@ public static List CollapseOrientedRulings(List lines, int expan double newEnd = lastFlipped ? Math.Min(nextE, lastEnd) : Math.Max(nextE, lastEnd); last.SetStartEnd(newStart, newEnd); - Debug.Assert(!last.Oblique); + Debug.Assert(!last.IsOblique); } else if (next_line.Length == 0) { diff --git a/Tabula/Tabula.csproj b/Tabula/Tabula.csproj index fb4602c..c596522 100644 --- a/Tabula/Tabula.csproj +++ b/Tabula/Tabula.csproj @@ -7,6 +7,8 @@ 0.1.0.0 0.1.0.0 0.1.0-alpha001 + BobLd + BobLd diff --git a/Tabula/Tabula.xml b/Tabula/Tabula.xml index aa6043b..cd41346 100644 --- a/Tabula/Tabula.xml +++ b/Tabula/Tabula.xml @@ -78,6 +78,21 @@ Helper container for relevant text edge info + + + Helper class that encapsulates a text edge + + + + + Helper container for all text edges on a page + + + + + Helper container for relevant text edge info + + This is the basic spreadsheet table detection algorithm currently implemented in tabula (web). @@ -437,7 +452,7 @@ Is the horizontal? - + Is the blique? Neither vertical nor horizontal. @@ -848,6 +863,18 @@ + + + Create a TextStripper for the given page. + + + + + + + Process the page. + + Unicode extensions. diff --git a/Tabula/TextStripper.cs b/Tabula/TextStripper.cs index 9d88e7b..f67f26f 100644 --- a/Tabula/TextStripper.cs +++ b/Tabula/TextStripper.cs @@ -22,12 +22,20 @@ public class TextStripper public double totalHeight; public int countHeight; + /// + /// Create a TextStripper for the given page. + /// + /// + /// public TextStripper(PdfDocument document, int pageNumber) { this.document = document; this.pageNumber = pageNumber; } + /// + /// Process the page. + /// public void Process() { var page = document.GetPage(pageNumber); @@ -48,14 +56,16 @@ public void Process() double wos = GetExpectedWhitespaceSize(letter); //textPosition.getWidthOfSpace(); - TextElement te = new TextElement(GetBbox(letter), letter.Font, letter.PointSize, c, wos, letter.GlyphRectangle.Rotation); // Rotation->The direction of the text(0, 90, 180, or 270) - te.letter = letter; + TextElement te = new TextElement(GetBbox(letter), letter.Font, letter.PointSize, c, wos, letter.GlyphRectangle.Rotation) + { + letter = letter + }; this.minCharWidth = Math.Min(this.minCharWidth, te.Width); - this.minCharHeight = Math.Min(this.minCharHeight, te.Height); + this.minCharHeight = Math.Min(this.minCharHeight, Math.Max(te.Height, 1)); // added by bobld: min height value to 1 countHeight++; - totalHeight += te.Height; + totalHeight += Math.Max(te.Height, 1); // added by bobld: min height value to 1 double avgHeight = totalHeight / countHeight; if (avgHeight > 0 && te.Height >= (avgHeight * AVG_HEIGHT_MULT_THRESHOLD) && (te.GetText()?.Trim().Equals("") != false)) From 8b97c98e4917f97cee24a102ae2b59cf3af4cf7d Mon Sep 17 00:00:00 2001 From: BobLd Date: Tue, 22 Sep 2020 16:43:22 +0100 Subject: [PATCH 2/4] implement nurminen cont.' --- Tabula.Tests/TestsNurminenDetector.cs | 13 ++ .../SimpleNurminenDetectionAlgorithm.cs | 153 +++++++++++------- Tabula/TableRectangle.cs | 28 ++-- Tabula/Tabula.xml | 12 ++ Tabula/TextStripper.cs | 4 +- 5 files changed, 143 insertions(+), 67 deletions(-) create mode 100644 Tabula.Tests/TestsNurminenDetector.cs diff --git a/Tabula.Tests/TestsNurminenDetector.cs b/Tabula.Tests/TestsNurminenDetector.cs new file mode 100644 index 0000000..813fbc3 --- /dev/null +++ b/Tabula.Tests/TestsNurminenDetector.cs @@ -0,0 +1,13 @@ +using Xunit; + +namespace Tabula.Tests +{ + public class TestsNurminenDetector + { + [Fact(Skip = "TO DO")] + public void TestLinesToCells() + { + + } + } +} diff --git a/Tabula/Detectors/SimpleNurminenDetectionAlgorithm.cs b/Tabula/Detectors/SimpleNurminenDetectionAlgorithm.cs index c2c39fc..ce9a3a4 100644 --- a/Tabula/Detectors/SimpleNurminenDetectionAlgorithm.cs +++ b/Tabula/Detectors/SimpleNurminenDetectionAlgorithm.cs @@ -13,8 +13,7 @@ namespace Tabula.Detectors { public class SimpleNurminenDetectionAlgorithm : IDetectionAlgorithm { - - private static int GRAYSCALE_INTENSITY_THRESHOLD = 25; + //private static int GRAYSCALE_INTENSITY_THRESHOLD = 25; private static int HORIZONTAL_EDGE_WIDTH_MINIMUM = 50; private static int VERTICAL_EDGE_HEIGHT_MINIMUM = 10; private static int CELL_CORNER_DISTANCE_MAXIMUM = 10; @@ -44,6 +43,11 @@ public TextEdge(double x1, double y1, double x2, double y2) Line = new PdfLine(x1, y1, x2, y2); //super(x1, y1, x2, y2); this.intersectingTextRowCount = 0; } + + public override string ToString() + { + return $"{Line.Point1}-{Line.Point2}"; + } } /// @@ -75,22 +79,26 @@ public RelevantEdges(int edgeType, int edgeCount) } } + /// + /// + /// + /// public List Detect(PageArea page) { // get horizontal & vertical lines // we get these from an image of the PDF and not the PDF itself because sometimes there are invisible PDF // instructions that are interpreted incorrectly as visible elements - we really want to capture what a // person sees when they look at the PDF - // hack here, we don't convert to an image + // BobLd: hack here, we don't convert to an image var pageRulings = page.GetRulings(); List horizontalRulings = this.getHorizontalRulings(pageRulings); List verticalRulings = this.getVerticalRulings(pageRulings); // end hack here - List allEdges = new List(horizontalRulings); //ArrayList<>(horizontalRulings); + List allEdges = new List(horizontalRulings); allEdges.AddRange(verticalRulings); - List tableAreas = new List(); // ArrayList<>(); + List tableAreas = new List(); // if we found some edges, try to find some tables based on them if (allEdges.Count > 0) @@ -137,8 +145,8 @@ public List Detect(PageArea page) if (verticalRuling.Intersects(tableArea) && !(tableArea.Contains(verticalRuling.P1) && tableArea.Contains(verticalRuling.P2))) { - tableArea.SetTop((float)Math.Floor(Math.Min(tableArea.Top, verticalRuling.Y1))); // bobld: top and bottom! min and max - tableArea.SetBottom((float)Math.Ceiling(Math.Max(tableArea.Bottom, verticalRuling.Y2))); // bobld: top and bottom! min and max + tableArea.SetTop(Math.Ceiling(Math.Max(tableArea.Top, verticalRuling.Y2))); // bobld: Floor and Min, Y1 + tableArea.SetBottom(Math.Floor(Math.Min(tableArea.Bottom, verticalRuling.Y1))); // bobld: Ceiling and Max, Y2 break; } } @@ -176,15 +184,15 @@ public List Detect(PageArea page) { if (!tableArea.Contains(textRow) && textRow.Intersects(tableArea)) { - tableArea.SetLeft((float)Math.Floor(Math.Min(textRow.Left, tableArea.Left))); - tableArea.SetRight((float)Math.Ceiling(Math.Max(textRow.Right, tableArea.Right))); + tableArea.SetLeft(Math.Floor(Math.Min(textRow.Left, tableArea.Left))); + tableArea.SetRight(Math.Ceiling(Math.Max(textRow.Right, tableArea.Right))); } } } // get rid of tables that DO NOT intersect any text areas - these are likely graphs or some sort of graphic //for (Iterator iterator = tableAreas.iterator(); iterator.hasNext();) - foreach (TableRectangle table in tableAreas.ToList()) // use tolist to be abnle to remove + foreach (TableRectangle table in tableAreas.ToList()) // use tolist to be able to remove { //Rectangle table = iterator.next(); @@ -274,12 +282,12 @@ public List Detect(PageArea page) // create a set of our current tables that will eliminate duplicate tables SortedSet tableSet = new SortedSet(new TreeSetComparer()); //Set tableSet = new TreeSet<>(new Comparator() {... //tableSet.addAll(tableAreas); - foreach (var table in tableAreas) + foreach (var table in tableAreas.OrderByDescending(t => t.Area)) { tableSet.Add(table); } - return tableSet.ToList(); //new ArrayList<>(tableSet); + return tableSet.ToList(); } private class TreeSetComparer : IComparer @@ -317,6 +325,8 @@ public int Compare(TableRectangle o1, TableRectangle o2) private TableRectangle getTableFromText(List lines, List relevantEdges, int relevantEdgeCount, List horizontalRulings) { + //var sortedHorizontalRulings = horizontalRulings.OrderByDescending(h => h.Y1).ToList(); // sort by Y, from top to bottom + TableRectangle table = new TableRectangle(); TableLine prevRow = null; @@ -336,7 +346,7 @@ private TableRectangle getTableFromText(List lines, List re // check to make sure this text row is within a line or so of the other lines already added // if it's not, we should stop the table here double tableLineThreshold = (totalRowSpacing / tableSpaceCount) * 2.5; - double lineDistance = textRow.Top - prevRow.Top; // bobld: top or bottom??? + double lineDistance = textRow.Bottom - prevRow.Bottom; // bobld: Top Top if (lineDistance > tableLineThreshold) { @@ -353,9 +363,14 @@ private TableRectangle getTableFromText(List lines, List re relativeEdgeDifferenceThreshold = 0; } + var rect = new TableLine(); + rect.SetLeft(Math.Floor(textRow.Left)); + rect.SetBottom(Math.Floor(textRow.Bottom)); + rect.SetRight(Math.Ceiling(textRow.Right)); + rect.SetTop(Math.Ceiling(textRow.Top)); foreach (TextEdge edge in relevantEdges) { - if (textRow.IntersectsLine(edge.Line)) + if (rect.IntersectsLine(edge.Line)) { numRelevantEdges++; } @@ -368,7 +383,7 @@ private TableRectangle getTableFromText(List lines, List re if (prevRow != null && firstTableRow != null) { tableSpaceCount++; - totalRowSpacing += textRow.Top - prevRow.Top; // bobld: top or bottom??? + totalRowSpacing += prevRow.Bottom - textRow.Bottom; // bobld: textRow.Top - prevRow.Top } // row is part of a table @@ -380,7 +395,7 @@ private TableRectangle getTableFromText(List lines, List re else { table.SetLeft(Math.Min(table.Left, textRow.Left)); - table.SetBottom(Math.Max(table.Bottom, textRow.Bottom)); // bobld: max or min? + table.SetBottom(Math.Min(table.Bottom, textRow.Bottom)); // bobld: Max table.SetRight(Math.Max(table.Right, textRow.Right)); } } @@ -423,19 +438,20 @@ private TableRectangle getTableFromText(List lines, List re double rowHeightThreshold = avgRowHeight * 1.5; // check lines after the bottom of the table - foreach (Ruling ruling in horizontalRulings) //Line2D.Float + //foreach (Ruling ruling in sortedHorizontalRulings) //Line2D.Float + for (int i = horizontalRulings.Count - 1; i >= 0; i--) // reverse order { - - if (ruling.Y1 < table.Bottom) // bobld: warning top and bottom + var ruling = horizontalRulings[i]; + if (ruling.Y1 > table.Bottom) // bobld: < { continue; } - double distanceFromTable = ruling.Y1 - table.Bottom; + double distanceFromTable = table.Bottom - ruling.Y2; // bobld: Y1 if (distanceFromTable <= rowHeightThreshold) { // use this ruling to help define the table - table.SetBottom(Math.Max(table.Bottom, ruling.Y1)); // bobld: max or min? + table.SetBottom(Math.Min(table.Bottom, ruling.Y2)); // bobld: Max Y1 table.SetLeft(Math.Min(table.Left, ruling.X1)); table.SetRight(Math.Max(table.Right, ruling.X2)); } @@ -450,19 +466,20 @@ private TableRectangle getTableFromText(List lines, List re // larger to fit up to three-ish rows of text (at least but we don't want to grab too much) rowHeightThreshold = avgRowHeight * 3.8; - for (int i = horizontalRulings.Count - 1; i >= 0; i--) + //for (int i = horizontalRulings.Count - 1; i >= 0; i--) + for (int i = 0; i < horizontalRulings.Count; i++) { Ruling ruling = horizontalRulings[i]; //.get(i); // Line2D.Float - if (ruling.Y1 > table.Top) //.getTop()) + if (ruling.Y1 < table.Top) //bobld: > { continue; } - double distanceFromTable = table.Top - ruling.Y1; + double distanceFromTable = ruling.Y1 - table.Top; // bobld: table.Top - ruling.Y1 if (distanceFromTable <= rowHeightThreshold) { - table.SetTop(Math.Min(table.Top, ruling.Y1)); // bobld: max or min? + table.SetTop(Math.Max(table.Top, ruling.Y2)); // bobld: Min Y1 table.SetLeft(Math.Min(table.Left, ruling.X1)); table.SetRight(Math.Max(table.Right, ruling.X2)); } @@ -473,8 +490,8 @@ private TableRectangle getTableFromText(List lines, List re } // add a bit of padding since the halved horizontal lines are a little fuzzy anyways - table.SetTop(Math.Floor(table.Top) - TABLE_PADDING_AMOUNT);// Bobld: Ceiling/Floor Top/Bottom?? - table.SetBottom(Math.Ceiling(table.Bottom) + TABLE_PADDING_AMOUNT); // Bobld: Ceiling/Floor Top/Bottom?? + table.SetTop(Math.Ceiling(table.Top) + TABLE_PADDING_AMOUNT); // bobld: Floor - + table.SetBottom(Math.Floor(table.Bottom) - TABLE_PADDING_AMOUNT); // bobld: Ceiling + table.SetLeft(Math.Floor(table.Left) - TABLE_PADDING_AMOUNT); table.SetRight(Math.Ceiling(table.Right) + TABLE_PADDING_AMOUNT); @@ -546,46 +563,47 @@ private RelevantEdges getRelevantEdges(TextEdges textEdges, List line return new RelevantEdges(relevantEdgeType, relevantEdgeCount); } - private TextEdges getTextEdges(List lines) { // get all text edges (lines that align with the left, middle and right of chunks of text) that extend // uninterrupted over at least REQUIRED_TEXT_LINES_FOR_EDGE lines of text - List leftTextEdges = new List(); //ArrayList<>(); - List midTextEdges = new List(); // ArrayList<>(); - List rightTextEdges = new List(); // ArrayList<>(); + List leftTextEdges = new List(); + List midTextEdges = new List(); + List rightTextEdges = new List(); - Dictionary> currLeftEdges = new Dictionary>(); //HashMap<>(); - Dictionary> currMidEdges = new Dictionary>(); //HashMap<>(); - Dictionary> currRightEdges = new Dictionary>(); //HashMap<>(); + Dictionary> currLeftEdges = new Dictionary>(); + Dictionary> currMidEdges = new Dictionary>(); + Dictionary> currRightEdges = new Dictionary>(); foreach (TableLine textRow in lines) { - foreach (TextChunk text in textRow.TextElements) //.getTextElements()) + foreach (TextChunk text in textRow.TextElements) { - int left = (int)Math.Floor(text.Left); //.getLeft())); - int right = (int)Math.Floor(text.Right); //.getRight())); + if (text.GetText().Equals("")) continue; // added by bobld + + int left = (int)Math.Floor(text.Left); //.getLeft())); + int right = (int)Math.Floor(text.Right); //.getRight())); int mid = (int)(left + ((right - left) / 2)); // first put this chunk into any edge buckets it belongs to if (!currLeftEdges.TryGetValue(left, out List leftEdge)) { - leftEdge = new List(); //ArrayList<>(); + leftEdge = new List(); currLeftEdges[left] = leftEdge; } leftEdge.Add(text); if (!currMidEdges.TryGetValue(mid, out List midEdge)) { - midEdge = new List(); //ArrayList<>(); + midEdge = new List(); currMidEdges[mid] = midEdge; } midEdge.Add(text); if (!currRightEdges.TryGetValue(right, out List rightEdge)) { - rightEdge = new List(); //ArrayList<>(); + rightEdge = new List(); currRightEdges[right] = rightEdge; } rightEdge.Add(text); @@ -606,7 +624,7 @@ private TextEdges getTextEdges(List lines) TextChunk first = edgeChunks[0]; //.get(0); TextChunk last = edgeChunks[edgeChunks.Count - 1]; //.get(edgeChunks.size() - 1); - TextEdge edge = new TextEdge(key, first.Top, key, last.Bottom); // bobld: carefull here top/bottom + TextEdge edge = new TextEdge(key, last.Bottom, key, first.Top); // bobld: (key, first.Top, key, last.Bottom) edge.intersectingTextRowCount = Math.Min(edgeChunks.Count, lines.Count); leftTextEdges.Add(edge); @@ -629,7 +647,7 @@ private TextEdges getTextEdges(List lines) TextChunk first = edgeChunks[0]; //.get(0); TextChunk last = edgeChunks[edgeChunks.Count - 1]; //.get(edgeChunks.size() - 1); - TextEdge edge = new TextEdge(key, first.Top, key, last.Bottom); + TextEdge edge = new TextEdge(key, last.Bottom, key, first.Top); // bobld: (key, first.Top, key, last.Bottom) edge.intersectingTextRowCount = Math.Min(edgeChunks.Count, lines.Count); midTextEdges.Add(edge); @@ -652,7 +670,7 @@ private TextEdges getTextEdges(List lines) TextChunk first = edgeChunks[0]; //.get(0); TextChunk last = edgeChunks[edgeChunks.Count - 1]; //.get(edgeChunks.size() - 1); - TextEdge edge = new TextEdge(key, first.Top, key, last.Bottom); + TextEdge edge = new TextEdge(key, last.Bottom, key, first.Top); // bobld: (key, first.Top, key, last.Bottom) edge.intersectingTextRowCount = Math.Min(edgeChunks.Count, lines.Count); rightTextEdges.Add(edge); @@ -671,7 +689,7 @@ private TextEdges getTextEdges(List lines) TextChunk first = edgeChunks[0]; //.get(0); TextChunk last = edgeChunks[edgeChunks.Count - 1]; //.get(edgeChunks.size() - 1); - TextEdge edge = new TextEdge(key, first.Top, key, last.Bottom); + TextEdge edge = new TextEdge(key, last.Bottom, key, first.Top); // bobld: (key, first.Top, key, last.Bottom) edge.intersectingTextRowCount = Math.Min(edgeChunks.Count, lines.Count); leftTextEdges.Add(edge); @@ -686,7 +704,7 @@ private TextEdges getTextEdges(List lines) TextChunk first = edgeChunks[0]; //.get(0); TextChunk last = edgeChunks[edgeChunks.Count - 1]; //.get(edgeChunks.size() - 1); - TextEdge edge = new TextEdge(key, first.Top, key, last.Bottom); + TextEdge edge = new TextEdge(key, last.Bottom, key, first.Top); // bobld: (key, first.Top, key, last.Bottom); edge.intersectingTextRowCount = Math.Min(edgeChunks.Count, lines.Count); midTextEdges.Add(edge); @@ -701,7 +719,7 @@ private TextEdges getTextEdges(List lines) TextChunk first = edgeChunks[0]; //.get(0); TextChunk last = edgeChunks[edgeChunks.Count - 1]; //.get(edgeChunks.size() - 1); - TextEdge edge = new TextEdge(key, first.Top, key, last.Bottom); + TextEdge edge = new TextEdge(key, last.Bottom, key, first.Top); // bobld: (key, first.Top, key, last.Bottom) edge.intersectingTextRowCount = Math.Min(edgeChunks.Count, lines.Count); rightTextEdges.Add(edge); @@ -767,17 +785,17 @@ private List getTableAreasFromCells(List cells) } // warning below: min/max and top/bottom - double top = double.MaxValue; //Float.MAX_VALUE; - double left = double.MaxValue; // Float.MAX_VALUE; - double bottom = double.MinValue; // Float.MIN_VALUE; - double right = double.MinValue; // Float.MIN_VALUE; + double top = double.MinValue; // bobld: MaxValue + double left = double.MaxValue; + double bottom = double.MaxValue; // bobld: MinValue + double right = double.MinValue; foreach (TableRectangle cell in cellGroup) { - if (cell.Top < top) top = cell.Top; //.getTop(); - if (cell.Left < left) left = cell.Left; //.getLeft(); - if (cell.Bottom > bottom) bottom = cell.Bottom; //.getBottom(); - if (cell.Right > right) right = cell.Right; //.getRight(); + if (cell.Top > top) top = cell.Top; // bobld: < + if (cell.Left < left) left = cell.Left; + if (cell.Bottom < bottom) bottom = cell.Bottom; // bobld: > + if (cell.Right > right) right = cell.Right; } //below is deprecated @@ -798,7 +816,19 @@ private List getHorizontalRulings(IReadOnlyList rulings) } } - return Ruling.CollapseOrientedRulings(horizontalR); + List horizontalRulings = new List(); + foreach (var r in horizontalR) + { + var endX = r.Right + 1; + var startY = r.Left - 1; + if (endX - startY > HORIZONTAL_EDGE_WIDTH_MINIMUM) + { + horizontalRulings.Add(new Ruling(new PdfPoint(startY, r.Bottom), new PdfPoint(endX, r.Top))); + } + } + + return horizontalRulings; + //return Ruling.CollapseOrientedRulings(horizontalR).Where(h => h.Width > HORIZONTAL_EDGE_WIDTH_MINIMUM).ToList(); } private List getVerticalRulings(IReadOnlyList rulings) @@ -812,7 +842,18 @@ private List getVerticalRulings(IReadOnlyList rulings) } } - return Ruling.CollapseOrientedRulings(verticalR); + List verticalRulings = new List(); + foreach (var r in verticalR) + { + var endY = r.Top + 1; + var startY = r.Bottom - 1; + if (endY - startY > VERTICAL_EDGE_HEIGHT_MINIMUM) + { + verticalRulings.Add(new Ruling(new PdfPoint(r.Left, startY), new PdfPoint(r.Right, endY))); + } + } + //return Ruling.CollapseOrientedRulings(verticalR).Where(v => v.Height > VERTICAL_EDGE_HEIGHT_MINIMUM).ToList(); + return verticalRulings; } } } diff --git a/Tabula/TableRectangle.cs b/Tabula/TableRectangle.cs index b59d46e..98f86c3 100644 --- a/Tabula/TableRectangle.cs +++ b/Tabula/TableRectangle.cs @@ -101,8 +101,8 @@ public virtual int IsLtrDominant() public double VerticalOverlap(TableRectangle other) { - return Math.Max(0, Math.Min(this.BoundingBox.Top, other.BoundingBox.Top) - - Math.Max(this.BoundingBox.Bottom, other.BoundingBox.Bottom)); + return Math.Max(0, Math.Min(this.BoundingBox.TopLeft.Y, other.BoundingBox.TopLeft.Y) + - Math.Max(this.BoundingBox.BottomLeft.Y, other.BoundingBox.BottomLeft.Y)); } public bool VerticallyOverlaps(TableRectangle other) @@ -122,8 +122,8 @@ public bool HorizontallyOverlaps(TableRectangle other) public double VerticalOverlapRatio(TableRectangle other) { - double delta = Math.Min(this.BoundingBox.Top - this.BoundingBox.Bottom, - other.BoundingBox.Top - other.BoundingBox.Bottom); + double delta = Math.Min(this.BoundingBox.TopLeft.Y - this.BoundingBox.BottomLeft.Y, + other.BoundingBox.TopLeft.Y - other.BoundingBox.BottomLeft.Y); var overl = VerticalOverlap(other); return overl / delta; } @@ -146,7 +146,7 @@ public TableRectangle Merge(TableRectangle other) /// /// Get the 's top coordinate. /// - public double Top => BoundingBox.Top; + public double Top => BoundingBox.TopRight.Y; //.Top; /// /// Set the 's top coordinate. @@ -163,7 +163,7 @@ public void SetTop(double top) /// /// Get the 's right coordinate. /// - public double Right => BoundingBox.Right; + public double Right => BoundingBox.TopRight.X; //.Right; /// /// Set the 's right coordinate. @@ -178,7 +178,7 @@ public void SetRight(double right) /// /// Get the 's left coordinate. /// - public double Left => BoundingBox.Left; + public double Left => BoundingBox.BottomLeft.X; //.Left; /// /// Set the 's left coordinate. @@ -194,7 +194,7 @@ public void SetLeft(double left) /// /// Get the 's bottom coordinate. /// - public double Bottom => BoundingBox.Bottom; + public double Bottom => BoundingBox.BottomLeft.Y; //.Bottom; /// /// Set the 's bottom coordinate. @@ -270,10 +270,20 @@ public bool IntersectsLine(Ruling ruling) return IntersectsLine(ruling.Line); } + + /// + /// hack to include border + /// + /// + private PdfRectangle Expand(PdfRectangle rectangle) + { + return new PdfRectangle(rectangle.Left - 1, rectangle.Bottom - 1, rectangle.Right + 1, rectangle.Top + 1); + } + public bool IntersectsLine(PdfLine line) { var clipper = new Clipper(); - clipper.AddPath(Clipper.ToClipperIntPoints(this.BoundingBox), PolyType.ptClip, true); + clipper.AddPath(Clipper.ToClipperIntPoints(Expand(this.BoundingBox)), PolyType.ptClip, true); clipper.AddPath(Clipper.ToClipperIntPoints(line), PolyType.ptSubject, false); diff --git a/Tabula/Tabula.xml b/Tabula/Tabula.xml index cd41346..973839d 100644 --- a/Tabula/Tabula.xml +++ b/Tabula/Tabula.xml @@ -93,6 +93,12 @@ Helper container for relevant text edge info + + + + + + This is the basic spreadsheet table detection algorithm currently implemented in tabula (web). @@ -675,6 +681,12 @@ Counter-clockwise, starting from bottom left point. + + + hack to include border + + + The 's top-left X coordinate. diff --git a/Tabula/TextStripper.cs b/Tabula/TextStripper.cs index f67f26f..37555e6 100644 --- a/Tabula/TextStripper.cs +++ b/Tabula/TextStripper.cs @@ -61,8 +61,8 @@ public void Process() letter = letter }; - this.minCharWidth = Math.Min(this.minCharWidth, te.Width); - this.minCharHeight = Math.Min(this.minCharHeight, Math.Max(te.Height, 1)); // added by bobld: min height value to 1 + if (!string.IsNullOrWhiteSpace(c)) this.minCharWidth = Math.Min(this.minCharWidth, te.Width); + if (!string.IsNullOrWhiteSpace(c)) this.minCharHeight = Math.Min(this.minCharHeight, Math.Max(te.Height, 1)); // added by bobld: min height value to 1 countHeight++; totalHeight += Math.Max(te.Height, 1); // added by bobld: min height value to 1 From e4de67e3c51c2f90066aa57b3980747a1c4600b4 Mon Sep 17 00:00:00 2001 From: BobLd Date: Tue, 22 Sep 2020 17:08:24 +0100 Subject: [PATCH 3/4] implement nurminen cont.' 2 --- Tabula.Tests/TestsNurminenDetector.cs | 22 +- .../Detectors/NurminenDetectionAlgorithm.cs | 1003 ----------------- .../SimpleNurminenDetectionAlgorithm.cs | 143 ++- .../SpreadsheetDetectionAlgorithm.cs | 5 +- Tabula/Tabula.xml | 23 +- 5 files changed, 98 insertions(+), 1098 deletions(-) diff --git a/Tabula.Tests/TestsNurminenDetector.cs b/Tabula.Tests/TestsNurminenDetector.cs index 813fbc3..26dca81 100644 --- a/Tabula.Tests/TestsNurminenDetector.cs +++ b/Tabula.Tests/TestsNurminenDetector.cs @@ -1,4 +1,10 @@ -using Xunit; +using System; +using System.Collections.Generic; +using System.Text; +using Tabula.Detectors; +using Tabula.Extractors; +using UglyToad.PdfPig; +using Xunit; namespace Tabula.Tests { @@ -7,7 +13,21 @@ public class TestsNurminenDetector [Fact(Skip = "TO DO")] public void TestLinesToCells() { + using (PdfDocument document = PdfDocument.Open(@"test3.pdf", new ParsingOptions() { ClipPaths = true })) + { + ObjectExtractor oe = new ObjectExtractor(document); + PageArea page = oe.Extract(1); + SimpleNurminenDetectionAlgorithm detector = new SimpleNurminenDetectionAlgorithm(); + var regions = detector.Detect(page); + + foreach (var a in regions) + { + IExtractionAlgorithm ea = new BasicExtractionAlgorithm(); + var newArea = page.GetArea(a.BoundingBox); + List tables = ea.Extract(newArea); + } + } } } } diff --git a/Tabula/Detectors/NurminenDetectionAlgorithm.cs b/Tabula/Detectors/NurminenDetectionAlgorithm.cs index 0b28550..6356321 100644 --- a/Tabula/Detectors/NurminenDetectionAlgorithm.cs +++ b/Tabula/Detectors/NurminenDetectionAlgorithm.cs @@ -1,10 +1,5 @@ using System; using System.Collections.Generic; -using System.Linq; -using UglyToad.PdfPig; -using UglyToad.PdfPig.Content; -using UglyToad.PdfPig.Core; -using UglyToad.PdfPig.DocumentLayoutAnalysis; namespace Tabula.Detectors { @@ -22,1007 +17,9 @@ namespace Tabula.Detectors /// public class NurminenDetectionAlgorithm : IDetectionAlgorithm { - private static int GRAYSCALE_INTENSITY_THRESHOLD = 25; - private static int HORIZONTAL_EDGE_WIDTH_MINIMUM = 50; - private static int VERTICAL_EDGE_HEIGHT_MINIMUM = 10; - private static int CELL_CORNER_DISTANCE_MAXIMUM = 10; - private static float POINT_SNAP_DISTANCE_THRESHOLD = 8f; - private static float TABLE_PADDING_AMOUNT = 1.0f; - private static int REQUIRED_TEXT_LINES_FOR_EDGE = 4; - private static int REQUIRED_CELLS_FOR_TABLE = 4; - private static float IDENTICAL_TABLE_OVERLAP_RATIO = 0.9f; - - /// - /// Helper class that encapsulates a text edge - /// - private class TextEdge // static - { - public readonly PdfLine line; - // types of text edges - public const int LEFT = 0; - public const int MID = 1; - public const int RIGHT = 2; - public const int NUM_TYPES = 3; - - public int intersectingTextRowCount; - - public TextEdge(double x1, double y1, double x2, double y2) - { - this.line = new PdfLine(x1, y1, x2, y2); // bobld: careful with order here - //super(x1, y1, x2, y2); - this.intersectingTextRowCount = 0; - } - } - - /// - /// Helper container for all text edges on a page - /// - private class TextEdges : List> // ArrayList> // static - { - public TextEdges(List leftEdges, List midEdges, List rightEdges) - : base(3) - { - //super(3); - this.Add(leftEdges); - this.Add(midEdges); - this.Add(rightEdges); - } - } - - /// - /// Helper container for relevant text edge info - /// - private class RelevantEdges // static - { - public int edgeType; - public int edgeCount; - - public RelevantEdges(int edgeType, int edgeCount) - { - this.edgeType = edgeType; - this.edgeCount = edgeCount; - } - } - public List Detect(PageArea page) { throw new NotImplementedException(); - - /* - // get horizontal & vertical lines - // we get these from an image of the PDF and not the PDF itself because sometimes there are invisible PDF - // instructions that are interpreted incorrectly as visible elements - we really want to capture what a - // person sees when they look at the PDF - BufferedImage image; - Page pdfPage = page.getPDPage(); - try - { - image = Utils.pageConvertToImage(page.getPDDoc(), pdfPage, 144, ImageType.GRAY); - } - catch (IOException e) - { - return new List(); //ArrayList<>(); - } - - List horizontalRulings = this.getHorizontalRulings(image); - - // now check the page for vertical lines, but remove the text first to make things less confusing - PdfDocument removeTextDocument = null; - try - { - removeTextDocument = this.removeText(pdfPage); - pdfPage = removeTextDocument.GetPage(1); //.getPage(0); - image = Utils.pageConvertToImage(removeTextDocument, pdfPage, 144); //, ImageType.GRAY); - } - catch (Exception e) - { - return new List(); //ArrayList<>(); - } - finally - { - if (removeTextDocument != null) - { - try - { - removeTextDocument.Dispose(); //.close(); - } - catch (IOException e) - { - // TODO Auto-generated catch block - // e.printStackTrace(); - } - } - } - - List verticalRulings = this.getVerticalRulings(image); - - List allEdges = new List(horizontalRulings); - allEdges.AddRange(verticalRulings); - - List tableAreas = new List(); - - // if we found some edges, try to find some tables based on them - if (allEdges.Count > 0) - { - // now we need to snap edge endpoints to a grid - Utils.snapPoints(allEdges, POINT_SNAP_DISTANCE_THRESHOLD, POINT_SNAP_DISTANCE_THRESHOLD); - - // normalize the rulings to make sure snapping didn't create any wacky non-horizontal/vertical rulings - foreach (List rulings in new[] { horizontalRulings, verticalRulings }) // Arrays.asList(horizontalRulings, verticalRulings)) - { - //foreach (Iterator iterator = rulings.iterator(); iterator.hasNext();) - foreach (var ruling in rulings.ToList()) // ToList() to do a copy to allow remove in original - { - //Ruling ruling = iterator.next(); - - ruling.normalize(); - if (ruling.oblique()) - { - rulings.Remove(ruling); //iterator.remove(); - } - } - } - - // merge the edge lines into rulings - this makes finding edges between crossing points in the next step easier - // we use a larger pixel expansion than the normal spreadsheet extraction method to cover gaps in the - // edge detection/pixel snapping steps - horizontalRulings = Ruling.collapseOrientedRulings(horizontalRulings, 5); - verticalRulings = Ruling.collapseOrientedRulings(verticalRulings, 5); - - // use the rulings and points to find cells - var cells = SpreadsheetExtractionAlgorithm.findCells(horizontalRulings, verticalRulings); // List - - // then use those cells to make table areas - tableAreas = this.getTableAreasFromCells(cells.Cast().ToList()); - } - - // next find any vertical rulings that intersect tables - sometimes these won't have completely been captured as - // cells if there are missing horizontal lines (which there often are) - // let's assume though that these lines should be part of the table - foreach (Ruling verticalRuling in verticalRulings) // Line2D.Float - { - foreach (TableRectangle tableArea in tableAreas) - { - if (verticalRuling.intersects(tableArea) && !(tableArea.contains(verticalRuling.getP1()) && tableArea.contains(verticalRuling.getP2()))) - { - tableArea.setTop((float)Math.Floor(Math.Max(tableArea.getTop(), verticalRuling.getY1()))); // min - tableArea.setBottom((float)Math.Ceiling(Math.Min(tableArea.getBottom(), verticalRuling.getY2()))); // max - break; - } - } - } - - // the tabula Page coordinate space is half the size of the PDFBox image coordinate space - // so halve the table area size before proceeding and add a bit of padding to make sure we capture everything - foreach (TableRectangle area in tableAreas) - { - area.x = (float)Math.Floor(area.x / 2) - TABLE_PADDING_AMOUNT; - area.y = (float)Math.Floor(area.y / 2) - TABLE_PADDING_AMOUNT; - area.width = (float)Math.Ceiling(area.width / 2) + TABLE_PADDING_AMOUNT; - area.height = (float)Math.Ceiling(area.height / 2) + TABLE_PADDING_AMOUNT; - } - - // we're going to want halved horizontal lines later too - foreach (Ruling ruling in horizontalRulings) // Line2D.Float - { - ruling.x1 = ruling.x1 / 2; - ruling.y1 = ruling.y1 / 2; - ruling.x2 = ruling.x2 / 2; - ruling.y2 = ruling.y2 / 2; - } - - // now look at text rows to help us find more tables and flesh out existing ones - List textChunks = TextElement.mergeWords(page.getText()); - List lines = TextChunk.groupByLines(textChunks); - - // first look for text rows that intersect an existing table - those lines should probably be part of the table - foreach (TableLine textRow in lines) - { - foreach (TableRectangle tableArea in tableAreas) - { - if (!tableArea.contains(textRow) && textRow.intersects(tableArea)) - { - tableArea.setLeft((float)Math.Floor(Math.Min(textRow.getLeft(), tableArea.getLeft()))); - tableArea.setRight((float)Math.Ceiling(Math.Max(textRow.getRight(), tableArea.getRight()))); - } - } - } - - // get rid of tables that DO NOT intersect any text areas - these are likely graphs or some sort of graphic - //for (Iterator iterator = tableAreas.iterator(); iterator.hasNext();) - foreach (var table in tableAreas.ToList()) // ToList() to do a copy to allow remove in original - { - //TableRectangle table = iterator.next(); - - bool intersectsText = false; - foreach (TableLine textRow in lines) - { - if (table.intersects(textRow)) - { - intersectsText = true; - break; - } - } - - if (!intersectsText) - { - //iterator.remove(); - tableAreas.Remove(table); - } - } - - // lastly, there may be some tables that don't have any vertical rulings at all - // we'll use text edges we've found to try and guess which text rows are part of a table - - // in his thesis nurminen goes through every row to try to assign a probability that the line is in a table - // we're going to try a general heuristic instead, trying to find what type of edge (left/right/mid) intersects - // the most text rows, and then use that magic number of "relevant" edges to decide what text rows should be - // part of a table. - - bool foundTable; - - do - { - foundTable = false; - - // get rid of any text lines contained within existing tables, this allows us to find more tables - //for (Iterator iterator = lines.iterator(); iterator.hasNext();) - foreach (var textRow in lines) - { - //TableLine textRow = iterator.next(); - foreach (TableRectangle table in tableAreas.ToList()) // ToList() to do a copy to allow remove in original - { - if (table.contains(textRow)) - { - //iterator.remove(); - lines.Remove(textRow); - break; - } - } - } - - // get text edges from remaining lines in the document - TextEdges textEdges = this.getTextEdges(lines); - List leftTextEdges = textEdges.get(TextEdge.LEFT); - List midTextEdges = textEdges.get(TextEdge.MID); - List rightTextEdges = textEdges.get(TextEdge.RIGHT); - - // find the relevant text edges (the ones we think define where a table is) - RelevantEdges relevantEdgeInfo = this.getRelevantEdges(textEdges, lines); - - // we found something relevant so let's look for rows that fit our criteria - if (relevantEdgeInfo.edgeType != -1) - { - List relevantEdges = null; - switch (relevantEdgeInfo.edgeType) - { - case TextEdge.LEFT: - relevantEdges = leftTextEdges; - break; - case TextEdge.MID: - relevantEdges = midTextEdges; - break; - case TextEdge.RIGHT: - relevantEdges = rightTextEdges; - break; - } - - TableRectangle table = this.getTableFromText(lines, relevantEdges, relevantEdgeInfo.edgeCount, horizontalRulings); - - if (table != null) - { - foundTable = true; - tableAreas.Add(table); - } - } - } while (foundTable); - - // create a set of our current tables that will eliminate duplicate tables - // Set tableSet = new TreeSet<>(new Comparator() { - // not sure if works with sorted set?? - SortedSet tableSet = new SortedSet(new TreeSetRectangleComparer()); - - //tableSet.addAll(tableAreas); - foreach (var ta in tableAreas) - { - tableSet.Add(ta); - } - - return new List(tableSet); //ArrayList<>(tableSet); - */ - } - - public class TreeSetRectangleComparer : IComparer - { - public int Compare(TableRectangle o1, TableRectangle o2) - { - if (o1.Equals(o2)) - { - return 0; - } - - // o1 is "equal" to o2 if o2 contains all of o1 - if (o2.Contains(o1)) - { - return 0; - } - - if (o1.Contains(o2)) - { - return 0; - } - - // otherwise see if these tables are "mostly" the same - double overlap = o1.OverlapRatio(o2); - if (overlap >= IDENTICAL_TABLE_OVERLAP_RATIO) - { - return 0; - } - else - { - return 1; - } - } - } - - private TableRectangle getTableFromText(List lines, - List relevantEdges, - int relevantEdgeCount, - List horizontalRulings) - { - - TableRectangle table = new TableRectangle(); - - TableLine prevRow = null; - TableLine firstTableRow = null; - TableLine lastTableRow = null; - - int tableSpaceCount = 0; - double totalRowSpacing = 0; - - // go through the lines and find the ones that have the correct count of the relevant edges - foreach (TableLine textRow in lines) - { - int numRelevantEdges = 0; - - if (firstTableRow != null && tableSpaceCount > 0) - { - // check to make sure this text row is within a line or so of the other lines already added - // if it's not, we should stop the table here - double tableLineThreshold = (totalRowSpacing / tableSpaceCount) * 2.5; - double lineDistance = textRow.Top - prevRow.Top; - - if (lineDistance > tableLineThreshold) - { - lastTableRow = prevRow; - break; - } - } - - // for larger tables, be a little lenient on the number of relevant rows the text intersects - // for smaller tables, not so much - otherwise we'll end up treating paragraphs as tables too - int relativeEdgeDifferenceThreshold = 1; - if (relevantEdgeCount <= 3) - { - relativeEdgeDifferenceThreshold = 0; - } - - foreach (TextEdge edge in relevantEdges) - { - if (textRow.IntersectsLine(edge.line)) - { - numRelevantEdges++; - } - } - - // see if we have a candidate text row - if (numRelevantEdges >= (relevantEdgeCount - relativeEdgeDifferenceThreshold)) - { - // keep track of table row spacing - if (prevRow != null && firstTableRow != null) - { - tableSpaceCount++; - totalRowSpacing += textRow.Top - prevRow.Top; - } - - // row is part of a table - if (table.Area == 0) - { - firstTableRow = textRow; - table.SetRect(textRow); - } - else - { - table.SetLeft(Math.Min(table.Left, textRow.Left)); - table.SetBottom(Math.Min(table.Bottom, textRow.Bottom)); // max - table.SetRight(Math.Max(table.Right, textRow.Right)); - } - } - else - { - // no dice - // if we're at the end of the table, save the last row - if (firstTableRow != null && lastTableRow == null) - { - lastTableRow = prevRow; - } - } - - prevRow = textRow; - } - - // if we don't have a table now, we won't after the next step either - if (table.Area == 0) - { - return null; - } - - if (lastTableRow == null) - { - // takes care of one-row tables or tables that end at the bottom of a page - lastTableRow = prevRow; - } - - // use the average row height and nearby horizontal lines to extend the table area - double avgRowHeight; - if (tableSpaceCount > 0) - { - avgRowHeight = totalRowSpacing / tableSpaceCount; - } - else - { - avgRowHeight = lastTableRow.Height; - } - - double rowHeightThreshold = avgRowHeight * 1.5; - - // check lines after the bottom of the table - foreach (Ruling ruling in horizontalRulings) // Line2D.Float - { - - if (ruling.Y1 < table.Bottom) - { - continue; - } - - double distanceFromTable = ruling.Y1 - table.Bottom; - if (distanceFromTable <= rowHeightThreshold) - { - // use this ruling to help define the table - table.SetBottom(Math.Min(table.Bottom, ruling.Y1)); // max - table.SetLeft(Math.Min(table.Left, ruling.X1)); - table.SetRight(Math.Max(table.Right, ruling.X2)); - } - else - { - // no use checking any further - break; - } - } - - // do the same for lines at the top, but make the threshold greater since table headings tend to be - // larger to fit up to three-ish rows of text (at least but we don't want to grab too much) - rowHeightThreshold = avgRowHeight * 3.8f; - - for (int i = horizontalRulings.Count - 1; i >= 0; i--) - { - Ruling ruling = horizontalRulings[i];//.get(i); Line2D.Float - - if (ruling.Y1 > table.Top) // bobld or lines) - { - List leftTextEdges = textEdges[TextEdge.LEFT]; //.get(TextEdge.LEFT); - List midTextEdges = textEdges[TextEdge.MID]; //.get(TextEdge.MID); - List rightTextEdges = textEdges[TextEdge.RIGHT]; //.get(TextEdge.RIGHT); - - // first we'll find the number of lines each type of edge crosses - int[][] edgeCountsPerLine = new int[lines.Count][]; //[TextEdge.NUM_TYPES]; - for (int i = 0; i < edgeCountsPerLine.Length; i++) - { - edgeCountsPerLine[i] = new int[TextEdge.NUM_TYPES]; - } - - foreach (TextEdge edge in leftTextEdges) - { - edgeCountsPerLine[edge.intersectingTextRowCount - 1][TextEdge.LEFT]++; - } - - foreach (TextEdge edge in midTextEdges) - { - edgeCountsPerLine[edge.intersectingTextRowCount - 1][TextEdge.MID]++; - } - - foreach (TextEdge edge in rightTextEdges) - { - edgeCountsPerLine[edge.intersectingTextRowCount - 1][TextEdge.RIGHT]++; - } - - // now let's find the relevant edge type and the number of those edges we should look for - // we'll only take a minimum of two edges to look for tables - int relevantEdgeType = -1; - int relevantEdgeCount = 0; - for (int i = edgeCountsPerLine.Length - 1; i > 2; i--) - { - if (edgeCountsPerLine[i][TextEdge.LEFT] > 2 && - edgeCountsPerLine[i][TextEdge.LEFT] >= edgeCountsPerLine[i][TextEdge.RIGHT] && - edgeCountsPerLine[i][TextEdge.LEFT] >= edgeCountsPerLine[i][TextEdge.MID]) - { - relevantEdgeCount = edgeCountsPerLine[i][TextEdge.LEFT]; - relevantEdgeType = TextEdge.LEFT; - break; - } - - if (edgeCountsPerLine[i][TextEdge.RIGHT] > 1 && - edgeCountsPerLine[i][TextEdge.RIGHT] >= edgeCountsPerLine[i][TextEdge.LEFT] && - edgeCountsPerLine[i][TextEdge.RIGHT] >= edgeCountsPerLine[i][TextEdge.MID]) - { - relevantEdgeCount = edgeCountsPerLine[i][TextEdge.RIGHT]; - relevantEdgeType = TextEdge.RIGHT; - break; - } - - if (edgeCountsPerLine[i][TextEdge.MID] > 1 && - edgeCountsPerLine[i][TextEdge.MID] >= edgeCountsPerLine[i][TextEdge.RIGHT] && - edgeCountsPerLine[i][TextEdge.MID] >= edgeCountsPerLine[i][TextEdge.LEFT]) - { - relevantEdgeCount = edgeCountsPerLine[i][TextEdge.MID]; - relevantEdgeType = TextEdge.MID; - break; - } - } - - return new RelevantEdges(relevantEdgeType, relevantEdgeCount); - } - - private TextEdges getTextEdges(List lines) - { - // get all text edges (lines that align with the left, middle and right of chunks of text) that extend - // uninterrupted over at least REQUIRED_TEXT_LINES_FOR_EDGE lines of text - List leftTextEdges = new List(); // ArrayList<>(); - List midTextEdges = new List(); //ArrayList<>(); - List rightTextEdges = new List(); // ArrayList<>(); - - var currLeftEdges = new Dictionary>(); // Map> currLeftEdges = new HashMap<>(); - var currMidEdges = new Dictionary>(); // Map> currMidEdges = new HashMap<>(); - var currRightEdges = new Dictionary>(); // Map> currRightEdges = new HashMap<>(); - - foreach (TableLine textRow in lines) - { - foreach (TextChunk text in textRow.TextElements) - { - int left = (int)Math.Floor(text.Left); // new Integer( - int right = (int)Math.Floor(text.Right); //new Integer( - int mid = left + ((right - left) / 2);//new Integer( - - // first put this chunk into any edge buckets it belongs to - List leftEdge = currLeftEdges[left]; //.get(left); - if (leftEdge == null) - { - leftEdge = new List(); //ArrayList<>(); - currLeftEdges[left] = leftEdge; //.put(left, leftEdge); - } - leftEdge.Add(text); - - List midEdge = currMidEdges[mid];//.get(mid); - if (midEdge == null) - { - midEdge = new List(); //ArrayList<>(); - currMidEdges[mid] = midEdge; //.put(mid, midEdge); - } - midEdge.Add(text); - - List rightEdge = currRightEdges[right]; //.get(right); - if (rightEdge == null) - { - rightEdge = new List(); //ArrayList<>(); - currRightEdges[right] = rightEdge; //.put(right, rightEdge); - } - rightEdge.Add(text); - - // now see if this text chunk blows up any other edges - //for (Iterator>> iterator = currLeftEdges.entrySet().iterator(); iterator.hasNext();) - foreach (var entry in currLeftEdges.ToList()) - { - //Map.Entry> entry = iterator.next(); - int key = entry.Key; //.getKey(); - if (key > left && key < right) - { - //iterator.remove(); - currLeftEdges.Remove(key); - List edgeChunks = entry.Value; //.getValue(); - if (edgeChunks.Count >= REQUIRED_TEXT_LINES_FOR_EDGE) - { - TextChunk first = edgeChunks[0];//.get(0); - TextChunk last = edgeChunks[edgeChunks.Count - 1];//.get(edgeChunks.size() - 1); - - TextEdge edge = new TextEdge(key, first.Top, key, last.Bottom); - edge.intersectingTextRowCount = Math.Min(edgeChunks.Count, lines.Count); - - leftTextEdges.Add(edge); - } - } - } - - //for (Iterator>> iterator = currMidEdges.entrySet().iterator(); iterator.hasNext();) - foreach (var entry in currMidEdges.ToList()) - { - //Map.Entry> entry = iterator.next(); - int key = entry.Key; //.getKey(); - if (key > left && key < right && Math.Abs(key - mid) > 2) - { - //iterator.remove(); - currMidEdges.Remove(key); - List edgeChunks = entry.Value; //.getValue(); - if (edgeChunks.Count >= REQUIRED_TEXT_LINES_FOR_EDGE) - { - TextChunk first = edgeChunks[0];//.get(0); - TextChunk last = edgeChunks[edgeChunks.Count - 1]; //.get(edgeChunks.size() - 1); - - TextEdge edge = new TextEdge(key, first.Top, key, last.Bottom); - edge.intersectingTextRowCount = Math.Min(edgeChunks.Count, lines.Count); - - midTextEdges.Add(edge); - } - } - } - - //for (Iterator>> iterator = currRightEdges.entrySet().iterator(); iterator.hasNext();) - foreach (var entry in currRightEdges.ToList()) - { - //Map.Entry> entry = iterator.next(); - int key = entry.Key; //.getKey(); - if (key > left && key < right) - { - //iterator.remove(); - currRightEdges.Remove(key); - List edgeChunks = entry.Value; //.getValue(); - if (edgeChunks.Count >= REQUIRED_TEXT_LINES_FOR_EDGE) - { - TextChunk first = edgeChunks[0];//.get(0); - TextChunk last = edgeChunks[edgeChunks.Count - 1]; //.get(edgeChunks.size() - 1); - - TextEdge edge = new TextEdge(key, first.Top, key, last.Bottom); - edge.intersectingTextRowCount = Math.Min(edgeChunks.Count, lines.Count); - - rightTextEdges.Add(edge); - } - } - } - } - } - - // add the leftovers - //foreach (Integer key in currLeftEdges.keySet()) - foreach (var key in currLeftEdges.Keys) - { - List edgeChunks = currLeftEdges[key]; //.get(key); - if (edgeChunks.Count >= REQUIRED_TEXT_LINES_FOR_EDGE) - { - TextChunk first = edgeChunks[0]; //.get(0); - TextChunk last = edgeChunks[edgeChunks.Count - 1]; //.get(edgeChunks.size() - 1); - - TextEdge edge = new TextEdge(key, first.Top, key, last.Bottom); - edge.intersectingTextRowCount = Math.Min(edgeChunks.Count, lines.Count); - - leftTextEdges.Add(edge); - } - } - - foreach (int key in currMidEdges.Keys) //.keySet()) - { - List edgeChunks = currMidEdges[key]; // .get(key); - if (edgeChunks.Count >= REQUIRED_TEXT_LINES_FOR_EDGE) - { - TextChunk first = edgeChunks[0]; //.get(0); - TextChunk last = edgeChunks[edgeChunks.Count - 1];//.get(edgeChunks.size() - 1); - - TextEdge edge = new TextEdge(key, first.Top, key, last.Bottom); - edge.intersectingTextRowCount = Math.Min(edgeChunks.Count, lines.Count); - - midTextEdges.Add(edge); - } - } - - foreach (int key in currRightEdges.Keys) //.keySet()) - { - List edgeChunks = currRightEdges[key]; //.get(key); - if (edgeChunks.Count >= REQUIRED_TEXT_LINES_FOR_EDGE) - { - TextChunk first = edgeChunks[0];//.get(0); - TextChunk last = edgeChunks[edgeChunks.Count - 1]; // .get(edgeChunks.size() - 1); - - TextEdge edge = new TextEdge(key, first.Top, key, last.Bottom); - edge.intersectingTextRowCount = Math.Min(edgeChunks.Count, lines.Count); - - rightTextEdges.Add(edge); - } - } - - return new TextEdges(leftTextEdges, midTextEdges, rightTextEdges); - } - - private List getTableAreasFromCells(List cells) - { - List> cellGroups = new List>(); // ArrayList<>(); - foreach (TableRectangle cell in cells) - { - bool addedToGroup = false; - - bool breakCellCheck = false; - //cellCheck: - foreach (List cellGroup in cellGroups) - { - if (breakCellCheck) break; // simulates 'break cellCheck;' - foreach (TableRectangle groupCell in cellGroup) - { - PdfPoint[] groupCellCorners = groupCell.Points; - PdfPoint[] candidateCorners = cell.Points; - - for (int i = 0; i < candidateCorners.Length; i++) - { - for (int j = 0; j < groupCellCorners.Length; j++) - { - //if (candidateCorners[i].distance(groupCellCorners[j]) < CELL_CORNER_DISTANCE_MAXIMUM) - if (Distances.Euclidean(candidateCorners[i], groupCellCorners[j]) < CELL_CORNER_DISTANCE_MAXIMUM) - { - cellGroup.Add(cell); - addedToGroup = true; - //break cellCheck; - breakCellCheck = true; - break; - } - } - } - } - } - - if (!addedToGroup) - { - List cellGroup = new List(); //ArrayList cellGroup = new ArrayList<>(); - cellGroup.Add(cell); - cellGroups.Add(cellGroup); - } - } - - // create table areas based on cell group - List tableAreas = new List(); //new ArrayList<>(); - foreach (List cellGroup in cellGroups) - { - // less than four cells should not make a table - if (cellGroup.Count < REQUIRED_CELLS_FOR_TABLE) - { - continue; - } - - double top = double.MinValue; // Float.MAX_VALUE; - double left = double.MaxValue; // Float.MAX_VALUE; - double bottom = double.MaxValue; // Float.MIN_VALUE; - double right = double.MinValue; // Float.MIN_VALUE; - - foreach (TableRectangle cell in cellGroup) - { - if (cell.Top > top) top = cell.Top; // (cell.getTop() < top) - if (cell.Left < left) left = cell.Left; - if (cell.Bottom < bottom) bottom = cell.Bottom; // (cell.getBottom() > bottom) - if (cell.Right > right) right = cell.Right; - } - - tableAreas.Add(new TableRectangle(top, left, right - left, bottom - top)); - } - - return tableAreas; - } - - private List getHorizontalRulings(object image) // BufferedImage - { - throw new NotImplementedException(); - - /* - // get all horizontal edges, which we'll define as a change in grayscale colour - // along a straight line of a certain length - List horizontalRulings = new List(); // ArrayList<>(); - - Raster r = image.getRaster(); - int width = r.getWidth(); - int height = r.getHeight(); - - for (int x = 0; x < width; x++) - { - int[] lastPixel = r.getPixel(x, 0, (int[])null); - - for (int y = 1; y < height - 1; y++) - { - int[] currPixel = r.getPixel(x, y, (int[])null); - - int diff = Math.Abs(currPixel[0] - lastPixel[0]); - if (diff > GRAYSCALE_INTENSITY_THRESHOLD) - { - // we hit what could be a line - // don't bother scanning it if we've hit a pixel in the line before - bool alreadyChecked = false; - foreach (var line in horizontalRulings) // - { - if (y == line.getY1() && x >= line.getX1() && x <= line.getX2()) - { - alreadyChecked = true; - break; - } - } - - if (alreadyChecked) - { - lastPixel = currPixel; - continue; - } - - int lineX = x + 1; - - while (lineX < width) - { - int[] linePixel = r.getPixel(lineX, y, (int[])null); - int[] abovePixel = r.getPixel(lineX, y - 1, (int[])null); - - if (Math.Abs(linePixel[0] - abovePixel[0]) <= GRAYSCALE_INTENSITY_THRESHOLD - || Math.Abs(currPixel[0] - linePixel[0]) > GRAYSCALE_INTENSITY_THRESHOLD) - { - break; - } - - lineX++; - } - - int endX = lineX - 1; - int lineWidth = endX - x; - if (lineWidth > HORIZONTAL_EDGE_WIDTH_MINIMUM) - { - horizontalRulings.Add(new Ruling(new PdfPoint(x, y), new PdfPoint(endX, y))); - } - } - - lastPixel = currPixel; - } - } - - return horizontalRulings; - */ - } - - private List getVerticalRulings(object image) // BufferedImage - { - throw new NotImplementedException(); - - /* - // get all vertical edges, which we'll define as a change in grayscale colour - // along a straight line of a certain length - List verticalRulings = new List();//new ArrayList<>(); - - Raster r = image.getRaster(); - int width = r.getWidth(); - int height = r.getHeight(); - - for (int y = 0; y < height; y++) - { - int[] lastPixel = r.getPixel(0, y, (int[])null); - - for (int x = 1; x < width - 1; x++) - { - int[] currPixel = r.getPixel(x, y, (int[])null); - - int diff = Math.Abs(currPixel[0] - lastPixel[0]); - if (diff > GRAYSCALE_INTENSITY_THRESHOLD) - { - // we hit what could be a line - // don't bother scanning it if we've hit a pixel in the line before - bool alreadyChecked = false; - foreach (var line in verticalRulings) - { - if (x == line.getX1() && y >= line.getY1() && y <= line.getY2()) - { - alreadyChecked = true; - break; - } - } - - if (alreadyChecked) - { - lastPixel = currPixel; - continue; - } - - int lineY = y + 1; - - while (lineY < height) - { - int[] linePixel = r.getPixel(x, lineY, (int[])null); - int[] leftPixel = r.getPixel(x - 1, lineY, (int[])null); - - if (Math.Abs(linePixel[0] - leftPixel[0]) <= GRAYSCALE_INTENSITY_THRESHOLD - || Math.Abs(currPixel[0] - linePixel[0]) > GRAYSCALE_INTENSITY_THRESHOLD) - { - break; - } - - lineY++; - } - - int endY = lineY - 1; - int lineLength = endY - y; - if (lineLength > VERTICAL_EDGE_HEIGHT_MINIMUM) - { - verticalRulings.Add(new Ruling(new PdfPoint(x, y), new PdfPoint(x, endY))); - } - } - - lastPixel = currPixel; - } - } - - return verticalRulings; - */ - } - - // taken from http://www.docjar.com/html/api/org/apache/pdfbox/examples/util/RemoveAllText.java.html - private PdfDocument removeText(Page page) - { - throw new NotImplementedException(); - /* - PDFStreamParser parser = new PDFStreamParser(page); - parser.parse(); - List tokens = parser.getTokens(); - List newTokens = new List(); //ArrayList<>(); - foreach (object token in tokens) - { - if (token is Operator op) // instanceof - { - //Operator op = (Operator)token; - if (op.getName().equals("TJ") || op.getName().equals("Tj")) - { - //remove the one argument to this operator - newTokens.remove(newTokens.size() - 1); - continue; - } - } - newTokens.Add(token); - } - - PdfDocument document = new PdfDocument(); - Page newPage = document.importPage(page); - newPage.setResources(page.getResources()); - - PDStream newContents = new PDStream(document); - OutputStream outp = newContents.createOutputStream(COSName.FLATE_DECODE); - ContentStreamWriter writer = new ContentStreamWriter(outp); - writer.writeTokens(newTokens); - outp.close(); - newPage.setContents(newContents); - return document; - */ } } } diff --git a/Tabula/Detectors/SimpleNurminenDetectionAlgorithm.cs b/Tabula/Detectors/SimpleNurminenDetectionAlgorithm.cs index ce9a3a4..00e9ca1 100644 --- a/Tabula/Detectors/SimpleNurminenDetectionAlgorithm.cs +++ b/Tabula/Detectors/SimpleNurminenDetectionAlgorithm.cs @@ -1,34 +1,42 @@ using System; using System.Collections.Generic; -using System.Diagnostics.CodeAnalysis; using System.Linq; -using System.Text; using Tabula.Extractors; -using UglyToad.PdfPig; -using UglyToad.PdfPig.Content; using UglyToad.PdfPig.Core; using UglyToad.PdfPig.DocumentLayoutAnalysis; namespace Tabula.Detectors { + /* + * ** tabula/detectors/NurminenDetectionAlgorithm.java ** + * Created by matt on 2015-12-17. + *

+ * Attempt at an implementation of the table finding algorithm described by + * Anssi Nurminen's master's thesis: + * http://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3 + */ + + ///

+ /// Simplified Nurminen detection algorithm. + /// Does not do any image processing. + /// public class SimpleNurminenDetectionAlgorithm : IDetectionAlgorithm { - //private static int GRAYSCALE_INTENSITY_THRESHOLD = 25; private static int HORIZONTAL_EDGE_WIDTH_MINIMUM = 50; private static int VERTICAL_EDGE_HEIGHT_MINIMUM = 10; private static int CELL_CORNER_DISTANCE_MAXIMUM = 10; - private static float POINT_SNAP_DISTANCE_THRESHOLD = 8f; - private static float TABLE_PADDING_AMOUNT = 1.0f; + private static double POINT_SNAP_DISTANCE_THRESHOLD = 8.0; + private static double TABLE_PADDING_AMOUNT = 1.0; private static int REQUIRED_TEXT_LINES_FOR_EDGE = 4; private static int REQUIRED_CELLS_FOR_TABLE = 4; - private static float IDENTICAL_TABLE_OVERLAP_RATIO = 0.9f; + private static double IDENTICAL_TABLE_OVERLAP_RATIO = 0.9; /// /// Helper class that encapsulates a text edge /// private class TextEdge { - public PdfLine Line; //: Line2D.Float + public PdfLine Line; // types of text edges public const int LEFT = 0; @@ -40,7 +48,7 @@ private class TextEdge public TextEdge(double x1, double y1, double x2, double y2) { - Line = new PdfLine(x1, y1, x2, y2); //super(x1, y1, x2, y2); + Line = new PdfLine(x1, y1, x2, y2); this.intersectingTextRowCount = 0; } @@ -57,7 +65,6 @@ private class TextEdges : List> { public TextEdges(List leftEdges, List midEdges, List rightEdges) : base(3) { - //super(3); this.Add(leftEdges); this.Add(midEdges); this.Add(rightEdges); @@ -80,7 +87,14 @@ public RelevantEdges(int edgeType, int edgeCount) } /// - /// + /// Simplified Nurminen detection algorithm. + /// Does not do any image processing. + /// + public SimpleNurminenDetectionAlgorithm() + { } + + /// + /// Detects the tables in the page. /// /// public List Detect(PageArea page) @@ -112,12 +126,10 @@ public List Detect(PageArea page) //for (Iterator iterator = rulings.iterator(); iterator.hasNext();) foreach (var ruling in rulings.ToList()) // use ToList to be able to remove { - //Ruling ruling = iterator.next(); - ruling.Normalize(); if (ruling.IsOblique) { - rulings.Remove(ruling);//iterator.remove(); + rulings.Remove(ruling); } } } @@ -194,7 +206,6 @@ public List Detect(PageArea page) //for (Iterator iterator = tableAreas.iterator(); iterator.hasNext();) foreach (TableRectangle table in tableAreas.ToList()) // use tolist to be able to remove { - //Rectangle table = iterator.next(); bool intersectsText = false; foreach (TableLine textRow in lines) @@ -209,7 +220,6 @@ public List Detect(PageArea page) if (!intersectsText) { tableAreas.Remove(table); - //iterator.remove(); } } @@ -231,12 +241,10 @@ public List Detect(PageArea page) //for (Iterator iterator = lines.iterator(); iterator.hasNext();) foreach (var textRow in lines.ToList()) { - //TableLine textRow = iterator.next(); foreach (TableRectangle table in tableAreas) { if (table.Contains(textRow)) { - //iterator.remove(); lines.Remove(textRow); break; } @@ -281,7 +289,6 @@ public List Detect(PageArea page) // create a set of our current tables that will eliminate duplicate tables SortedSet tableSet = new SortedSet(new TreeSetComparer()); //Set tableSet = new TreeSet<>(new Comparator() {... - //tableSet.addAll(tableAreas); foreach (var table in tableAreas.OrderByDescending(t => t.Area)) { tableSet.Add(table); @@ -325,8 +332,6 @@ public int Compare(TableRectangle o1, TableRectangle o2) private TableRectangle getTableFromText(List lines, List relevantEdges, int relevantEdgeCount, List horizontalRulings) { - //var sortedHorizontalRulings = horizontalRulings.OrderByDescending(h => h.Y1).ToList(); // sort by Y, from top to bottom - TableRectangle table = new TableRectangle(); TableLine prevRow = null; @@ -346,7 +351,9 @@ private TableRectangle getTableFromText(List lines, List re // check to make sure this text row is within a line or so of the other lines already added // if it's not, we should stop the table here double tableLineThreshold = (totalRowSpacing / tableSpaceCount) * 2.5; - double lineDistance = textRow.Bottom - prevRow.Bottom; // bobld: Top Top + double lineDistance = prevRow.Bottom - textRow.Bottom; // bobld: textRow.Top - prevRow.Top + + System.Diagnostics.Debug.Assert(lineDistance >= 0); if (lineDistance > tableLineThreshold) { @@ -363,14 +370,9 @@ private TableRectangle getTableFromText(List lines, List re relativeEdgeDifferenceThreshold = 0; } - var rect = new TableLine(); - rect.SetLeft(Math.Floor(textRow.Left)); - rect.SetBottom(Math.Floor(textRow.Bottom)); - rect.SetRight(Math.Ceiling(textRow.Right)); - rect.SetTop(Math.Ceiling(textRow.Top)); foreach (TextEdge edge in relevantEdges) { - if (rect.IntersectsLine(edge.Line)) + if (textRow.IntersectsLine(edge.Line)) { numRelevantEdges++; } @@ -428,6 +430,7 @@ private TableRectangle getTableFromText(List lines, List re double avgRowHeight; if (tableSpaceCount > 0) { + System.Diagnostics.Debug.Assert(totalRowSpacing >= 0); avgRowHeight = totalRowSpacing / tableSpaceCount; } else @@ -448,6 +451,7 @@ private TableRectangle getTableFromText(List lines, List re } double distanceFromTable = table.Bottom - ruling.Y2; // bobld: Y1 + System.Diagnostics.Debug.Assert(distanceFromTable >= 0); if (distanceFromTable <= rowHeightThreshold) { // use this ruling to help define the table @@ -469,7 +473,7 @@ private TableRectangle getTableFromText(List lines, List re //for (int i = horizontalRulings.Count - 1; i >= 0; i--) for (int i = 0; i < horizontalRulings.Count; i++) { - Ruling ruling = horizontalRulings[i]; //.get(i); // Line2D.Float + Ruling ruling = horizontalRulings[i]; if (ruling.Y1 < table.Top) //bobld: > { @@ -477,6 +481,7 @@ private TableRectangle getTableFromText(List lines, List re } double distanceFromTable = ruling.Y1 - table.Top; // bobld: table.Top - ruling.Y1 + System.Diagnostics.Debug.Assert(distanceFromTable >= 0); if (distanceFromTable <= rowHeightThreshold) { table.SetTop(Math.Max(table.Top, ruling.Y2)); // bobld: Min Y1 @@ -505,7 +510,7 @@ private RelevantEdges getRelevantEdges(TextEdges textEdges, List line List rightTextEdges = textEdges[TextEdge.RIGHT]; // first we'll find the number of lines each type of edge crosses - int[][] edgeCountsPerLine = new int[lines.Count][]; //[TextEdge.NUM_TYPES]; + int[][] edgeCountsPerLine = new int[lines.Count][]; for (int i = 0; i < edgeCountsPerLine.Length; i++) { edgeCountsPerLine[i] = new int[TextEdge.NUM_TYPES]; @@ -582,8 +587,8 @@ private TextEdges getTextEdges(List lines) { if (text.GetText().Equals("")) continue; // added by bobld - int left = (int)Math.Floor(text.Left); //.getLeft())); - int right = (int)Math.Floor(text.Right); //.getRight())); + int left = (int)Math.Floor(text.Left); + int right = (int)Math.Floor(text.Right); int mid = (int)(left + ((right - left) / 2)); // first put this chunk into any edge buckets it belongs to @@ -612,17 +617,15 @@ private TextEdges getTextEdges(List lines) //for (Iterator>> iterator = currLeftEdges.entrySet().iterator(); iterator.hasNext();) foreach (var entry in currLeftEdges.ToList()) // use tolist to be able to remove { - //Map.Entry> entry = iterator.next(); - int key = entry.Key; //.getKey(); + int key = entry.Key; if (key > left && key < right) { - //iterator.remove(); currLeftEdges.Remove(key); - List edgeChunks = entry.Value; //.getValue(); + List edgeChunks = entry.Value; if (edgeChunks.Count >= REQUIRED_TEXT_LINES_FOR_EDGE) { - TextChunk first = edgeChunks[0]; //.get(0); - TextChunk last = edgeChunks[edgeChunks.Count - 1]; //.get(edgeChunks.size() - 1); + TextChunk first = edgeChunks[0]; + TextChunk last = edgeChunks[edgeChunks.Count - 1]; TextEdge edge = new TextEdge(key, last.Bottom, key, first.Top); // bobld: (key, first.Top, key, last.Bottom) edge.intersectingTextRowCount = Math.Min(edgeChunks.Count, lines.Count); @@ -635,17 +638,15 @@ private TextEdges getTextEdges(List lines) //for (Iterator>> iterator = currMidEdges.entrySet().iterator(); iterator.hasNext();) foreach (var entry in currMidEdges.ToList()) { - //Map.Entry> entry = iterator.next(); - int key = entry.Key; //.getKey(); + int key = entry.Key; if (key > left && key < right && Math.Abs(key - mid) > 2) { - //iterator.remove(); currMidEdges.Remove(key); - List edgeChunks = entry.Value; //.getValue(); + List edgeChunks = entry.Value; if (edgeChunks.Count >= REQUIRED_TEXT_LINES_FOR_EDGE) { - TextChunk first = edgeChunks[0]; //.get(0); - TextChunk last = edgeChunks[edgeChunks.Count - 1]; //.get(edgeChunks.size() - 1); + TextChunk first = edgeChunks[0]; + TextChunk last = edgeChunks[edgeChunks.Count - 1]; TextEdge edge = new TextEdge(key, last.Bottom, key, first.Top); // bobld: (key, first.Top, key, last.Bottom) edge.intersectingTextRowCount = Math.Min(edgeChunks.Count, lines.Count); @@ -658,17 +659,15 @@ private TextEdges getTextEdges(List lines) //for (Iterator>> iterator = currRightEdges.entrySet().iterator(); iterator.hasNext();) foreach (var entry in currRightEdges.ToList()) { - //Map.Entry> entry = iterator.next(); - int key = entry.Key; //.getKey(); + int key = entry.Key; if (key > left && key < right) { - //iterator.remove(); currRightEdges.Remove(key); - List edgeChunks = entry.Value; //.getValue(); + List edgeChunks = entry.Value; if (edgeChunks.Count >= REQUIRED_TEXT_LINES_FOR_EDGE) { - TextChunk first = edgeChunks[0]; //.get(0); - TextChunk last = edgeChunks[edgeChunks.Count - 1]; //.get(edgeChunks.size() - 1); + TextChunk first = edgeChunks[0]; + TextChunk last = edgeChunks[edgeChunks.Count - 1]; TextEdge edge = new TextEdge(key, last.Bottom, key, first.Top); // bobld: (key, first.Top, key, last.Bottom) edge.intersectingTextRowCount = Math.Min(edgeChunks.Count, lines.Count); @@ -681,13 +680,13 @@ private TextEdges getTextEdges(List lines) } // add the leftovers - foreach (int key in currLeftEdges.Keys) //.keySet()) + foreach (int key in currLeftEdges.Keys) { - List edgeChunks = currLeftEdges[key]; //.get(key); + List edgeChunks = currLeftEdges[key]; if (edgeChunks.Count >= REQUIRED_TEXT_LINES_FOR_EDGE) { - TextChunk first = edgeChunks[0]; //.get(0); - TextChunk last = edgeChunks[edgeChunks.Count - 1]; //.get(edgeChunks.size() - 1); + TextChunk first = edgeChunks[0]; + TextChunk last = edgeChunks[edgeChunks.Count - 1]; TextEdge edge = new TextEdge(key, last.Bottom, key, first.Top); // bobld: (key, first.Top, key, last.Bottom) edge.intersectingTextRowCount = Math.Min(edgeChunks.Count, lines.Count); @@ -696,13 +695,13 @@ private TextEdges getTextEdges(List lines) } } - foreach (int key in currMidEdges.Keys)//.keySet()) + foreach (int key in currMidEdges.Keys) { - List edgeChunks = currMidEdges[key]; //.get(key); + List edgeChunks = currMidEdges[key]; if (edgeChunks.Count >= REQUIRED_TEXT_LINES_FOR_EDGE) { - TextChunk first = edgeChunks[0]; //.get(0); - TextChunk last = edgeChunks[edgeChunks.Count - 1]; //.get(edgeChunks.size() - 1); + TextChunk first = edgeChunks[0]; + TextChunk last = edgeChunks[edgeChunks.Count - 1]; TextEdge edge = new TextEdge(key, last.Bottom, key, first.Top); // bobld: (key, first.Top, key, last.Bottom); edge.intersectingTextRowCount = Math.Min(edgeChunks.Count, lines.Count); @@ -711,13 +710,13 @@ private TextEdges getTextEdges(List lines) } } - foreach (int key in currRightEdges.Keys) //.keySet()) + foreach (int key in currRightEdges.Keys) { - List edgeChunks = currRightEdges[key]; //.get(key); + List edgeChunks = currRightEdges[key]; if (edgeChunks.Count >= REQUIRED_TEXT_LINES_FOR_EDGE) { - TextChunk first = edgeChunks[0]; //.get(0); - TextChunk last = edgeChunks[edgeChunks.Count - 1]; //.get(edgeChunks.size() - 1); + TextChunk first = edgeChunks[0]; + TextChunk last = edgeChunks[edgeChunks.Count - 1]; TextEdge edge = new TextEdge(key, last.Bottom, key, first.Top); // bobld: (key, first.Top, key, last.Bottom) edge.intersectingTextRowCount = Math.Min(edgeChunks.Count, lines.Count); @@ -729,26 +728,22 @@ private TextEdges getTextEdges(List lines) return new TextEdges(leftTextEdges, midTextEdges, rightTextEdges); } - [Obsolete("Is it redundant??????")] private List getTableAreasFromCells(List cells) { - List> cellGroups = new List>(); //ArrayList<>(); + List> cellGroups = new List>(); foreach (TableRectangle cell in cells) { bool addedToGroup = false; foreach (List cellGroup in cellGroups) { - //if (!cellCheck) break; // equivalent to break cellCheck; foreach (TableRectangle groupCell in cellGroup) { - //if (!cellCheck) break; // equivalent to break cellCheck; PdfPoint[] groupCellCorners = groupCell.Points; PdfPoint[] candidateCorners = cell.Points; for (int i = 0; i < candidateCorners.Length; i++) { - //if (!cellCheck) break; // equivalent to break cellCheck; for (int j = 0; j < groupCellCorners.Length; j++) { //if (candidateCorners[i].distance(groupCellCorners[j]) < CELL_CORNER_DISTANCE_MAXIMUM) @@ -756,8 +751,6 @@ private List getTableAreasFromCells(List cells) { cellGroup.Add(cell); addedToGroup = true; - //cellCheck = false; - //break;// cellCheck; goto cellCheck; } } @@ -768,14 +761,14 @@ private List getTableAreasFromCells(List cells) cellCheck: if (!addedToGroup) { - List cellGroup = new List(); //ArrayList<>(); + List cellGroup = new List(); cellGroup.Add(cell); cellGroups.Add(cellGroup); } } // create table areas based on cell group - List tableAreas = new List(); //ArrayList<>(); + List tableAreas = new List(); foreach (List cellGroup in cellGroups) { // less than four cells should not make a table @@ -784,7 +777,6 @@ private List getTableAreasFromCells(List cells) continue; } - // warning below: min/max and top/bottom double top = double.MinValue; // bobld: MaxValue double left = double.MaxValue; double bottom = double.MaxValue; // bobld: MinValue @@ -798,8 +790,7 @@ private List getTableAreasFromCells(List cells) if (cell.Right > right) right = cell.Right; } - //below is deprecated - tableAreas.Add(new TableRectangle(new PdfRectangle(left, bottom, right, top))); //top, left, right - left, bottom - top)); + tableAreas.Add(new TableRectangle(new PdfRectangle(left, bottom, right, top))); } return tableAreas; @@ -828,7 +819,6 @@ private List getHorizontalRulings(IReadOnlyList rulings) } return horizontalRulings; - //return Ruling.CollapseOrientedRulings(horizontalR).Where(h => h.Width > HORIZONTAL_EDGE_WIDTH_MINIMUM).ToList(); } private List getVerticalRulings(IReadOnlyList rulings) @@ -852,7 +842,6 @@ private List getVerticalRulings(IReadOnlyList rulings) verticalRulings.Add(new Ruling(new PdfPoint(r.Left, startY), new PdfPoint(r.Right, endY))); } } - //return Ruling.CollapseOrientedRulings(verticalR).Where(v => v.Height > VERTICAL_EDGE_HEIGHT_MINIMUM).ToList(); return verticalRulings; } } diff --git a/Tabula/Detectors/SpreadsheetDetectionAlgorithm.cs b/Tabula/Detectors/SpreadsheetDetectionAlgorithm.cs index 77e08ae..722f339 100644 --- a/Tabula/Detectors/SpreadsheetDetectionAlgorithm.cs +++ b/Tabula/Detectors/SpreadsheetDetectionAlgorithm.cs @@ -24,13 +24,10 @@ public List Detect(PageArea page) { List cells = SpreadsheetExtractionAlgorithm.FindCells(page.HorizontalRulings, page.VerticalRulings); - //SpreadsheetExtractionAlgorithm sea = new SpreadsheetExtractionAlgorithm(); - List tables = SpreadsheetExtractionAlgorithm.FindSpreadsheetsFromCells(cells.Cast().ToList()); // we want tables to be returned from top to bottom on the page - //Collections.sort(tables, TableRectangle.ILL_DEFINED_ORDER); - Utils.Sort(tables, new TableRectangle.ILL_DEFINED_ORDER()); // tables.Sort(new TableRectangle.ILL_DEFINED_ORDER()); + Utils.Sort(tables, new TableRectangle.ILL_DEFINED_ORDER()); return tables; } } diff --git a/Tabula/Tabula.xml b/Tabula/Tabula.xml index 973839d..aaffd3c 100644 --- a/Tabula/Tabula.xml +++ b/Tabula/Tabula.xml @@ -63,19 +63,10 @@ Nurminen detection algorithm. - + - Helper class that encapsulates a text edge - - - - - Helper container for all text edges on a page - - - - - Helper container for relevant text edge info + Simplified Nurminen detection algorithm. + Does not do any image processing. @@ -93,9 +84,15 @@ Helper container for relevant text edge info + + + Simplified Nurminen detection algorithm. + Does not do any image processing. + + - + Detects the tables in the page. From e9584ea749e050b30e822f769b91e0fad747ae04 Mon Sep 17 00:00:00 2001 From: BobLd Date: Tue, 22 Sep 2020 17:10:15 +0100 Subject: [PATCH 4/4] switch to v0.1.0-alpha002 --- Tabula.Csv/Tabula.Csv.csproj | 4 ++-- Tabula.Json/Tabula.Json.csproj | 4 ++-- Tabula/Tabula.csproj | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Tabula.Csv/Tabula.Csv.csproj b/Tabula.Csv/Tabula.Csv.csproj index e5fca44..061fe0e 100644 --- a/Tabula.Csv/Tabula.Csv.csproj +++ b/Tabula.Csv/Tabula.Csv.csproj @@ -1,10 +1,10 @@ - + netcoreapp3.1;netstandard2.0;net45;net451;net452;net46;net461;net462;net47 Extract tables from PDF files (port of tabula-java using PdfPig). Csv and Tsv writers. https://github.com/BobLd/tabula-sharp - 0.1.0-alpha001 + 0.1.0-alpha002 BobLd diff --git a/Tabula.Json/Tabula.Json.csproj b/Tabula.Json/Tabula.Json.csproj index efe0ffa..d7344e3 100644 --- a/Tabula.Json/Tabula.Json.csproj +++ b/Tabula.Json/Tabula.Json.csproj @@ -1,10 +1,10 @@ - + netcoreapp3.1;netstandard2.0;net45;net451;net452;net46;net461;net462;net47 Extract tables from PDF files (port of tabula-java using PdfPig). Json writer. https://github.com/BobLd/tabula-sharp - 0.1.0-alpha001 + 0.1.0-alpha002 BobLd BobLd diff --git a/Tabula/Tabula.csproj b/Tabula/Tabula.csproj index c596522..7e4fc80 100644 --- a/Tabula/Tabula.csproj +++ b/Tabula/Tabula.csproj @@ -6,7 +6,7 @@ https://github.com/BobLd/tabula-sharp 0.1.0.0 0.1.0.0 - 0.1.0-alpha001 + 0.1.0-alpha002 BobLd BobLd