diff --git a/Tabula.Csv/Tabula.Csv.csproj b/Tabula.Csv/Tabula.Csv.csproj index d39a5c8..061fe0e 100644 --- a/Tabula.Csv/Tabula.Csv.csproj +++ b/Tabula.Csv/Tabula.Csv.csproj @@ -4,7 +4,8 @@ netcoreapp3.1;netstandard2.0;net45;net451;net452;net46;net461;net462;net47 Extract tables from PDF files (port of tabula-java using PdfPig). Csv and Tsv writers. https://github.com/BobLd/tabula-sharp - 0.1.0-alpha001 + 0.1.0-alpha002 + BobLd diff --git a/Tabula.Json/Tabula.Json.csproj b/Tabula.Json/Tabula.Json.csproj index 2cd4422..d7344e3 100644 --- a/Tabula.Json/Tabula.Json.csproj +++ b/Tabula.Json/Tabula.Json.csproj @@ -4,7 +4,9 @@ netcoreapp3.1;netstandard2.0;net45;net451;net452;net46;net461;net462;net47 Extract tables from PDF files (port of tabula-java using PdfPig). Json writer. https://github.com/BobLd/tabula-sharp - 0.1.0-alpha001 + 0.1.0-alpha002 + BobLd + BobLd diff --git a/Tabula.Tests/TestsNurminenDetector.cs b/Tabula.Tests/TestsNurminenDetector.cs new file mode 100644 index 0000000..26dca81 --- /dev/null +++ b/Tabula.Tests/TestsNurminenDetector.cs @@ -0,0 +1,33 @@ +using System; +using System.Collections.Generic; +using System.Text; +using Tabula.Detectors; +using Tabula.Extractors; +using UglyToad.PdfPig; +using Xunit; + +namespace Tabula.Tests +{ + public class TestsNurminenDetector + { + [Fact(Skip = "TO DO")] + public void TestLinesToCells() + { + using (PdfDocument document = PdfDocument.Open(@"test3.pdf", new ParsingOptions() { ClipPaths = true })) + { + ObjectExtractor oe = new ObjectExtractor(document); + PageArea page = oe.Extract(1); + + SimpleNurminenDetectionAlgorithm detector = new SimpleNurminenDetectionAlgorithm(); + var regions = detector.Detect(page); + + foreach (var a in regions) + { + IExtractionAlgorithm ea = new BasicExtractionAlgorithm(); + var newArea = page.GetArea(a.BoundingBox); + List tables = ea.Extract(newArea); + } + } + } + } +} diff --git a/Tabula/Detectors/NurminenDetectionAlgorithm.cs b/Tabula/Detectors/NurminenDetectionAlgorithm.cs index 0b28550..6356321 100644 --- a/Tabula/Detectors/NurminenDetectionAlgorithm.cs +++ b/Tabula/Detectors/NurminenDetectionAlgorithm.cs @@ -1,10 +1,5 @@ using System; using System.Collections.Generic; -using System.Linq; -using UglyToad.PdfPig; -using UglyToad.PdfPig.Content; -using UglyToad.PdfPig.Core; -using UglyToad.PdfPig.DocumentLayoutAnalysis; namespace Tabula.Detectors { @@ -22,1007 +17,9 @@ namespace Tabula.Detectors /// public class NurminenDetectionAlgorithm : IDetectionAlgorithm { - private static int GRAYSCALE_INTENSITY_THRESHOLD = 25; - private static int HORIZONTAL_EDGE_WIDTH_MINIMUM = 50; - private static int VERTICAL_EDGE_HEIGHT_MINIMUM = 10; - private static int CELL_CORNER_DISTANCE_MAXIMUM = 10; - private static float POINT_SNAP_DISTANCE_THRESHOLD = 8f; - private static float TABLE_PADDING_AMOUNT = 1.0f; - private static int REQUIRED_TEXT_LINES_FOR_EDGE = 4; - private static int REQUIRED_CELLS_FOR_TABLE = 4; - private static float IDENTICAL_TABLE_OVERLAP_RATIO = 0.9f; - - /// - /// Helper class that encapsulates a text edge - /// - private class TextEdge // static - { - public readonly PdfLine line; - // types of text edges - public const int LEFT = 0; - public const int MID = 1; - public const int RIGHT = 2; - public const int NUM_TYPES = 3; - - public int intersectingTextRowCount; - - public TextEdge(double x1, double y1, double x2, double y2) - { - this.line = new PdfLine(x1, y1, x2, y2); // bobld: careful with order here - //super(x1, y1, x2, y2); - this.intersectingTextRowCount = 0; - } - } - - /// - /// Helper container for all text edges on a page - /// - private class TextEdges : List> // ArrayList> // static - { - public TextEdges(List leftEdges, List midEdges, List rightEdges) - : base(3) - { - //super(3); - this.Add(leftEdges); - this.Add(midEdges); - this.Add(rightEdges); - } - } - - /// - /// Helper container for relevant text edge info - /// - private class RelevantEdges // static - { - public int edgeType; - public int edgeCount; - - public RelevantEdges(int edgeType, int edgeCount) - { - this.edgeType = edgeType; - this.edgeCount = edgeCount; - } - } - public List Detect(PageArea page) { throw new NotImplementedException(); - - /* - // get horizontal & vertical lines - // we get these from an image of the PDF and not the PDF itself because sometimes there are invisible PDF - // instructions that are interpreted incorrectly as visible elements - we really want to capture what a - // person sees when they look at the PDF - BufferedImage image; - Page pdfPage = page.getPDPage(); - try - { - image = Utils.pageConvertToImage(page.getPDDoc(), pdfPage, 144, ImageType.GRAY); - } - catch (IOException e) - { - return new List(); //ArrayList<>(); - } - - List horizontalRulings = this.getHorizontalRulings(image); - - // now check the page for vertical lines, but remove the text first to make things less confusing - PdfDocument removeTextDocument = null; - try - { - removeTextDocument = this.removeText(pdfPage); - pdfPage = removeTextDocument.GetPage(1); //.getPage(0); - image = Utils.pageConvertToImage(removeTextDocument, pdfPage, 144); //, ImageType.GRAY); - } - catch (Exception e) - { - return new List(); //ArrayList<>(); - } - finally - { - if (removeTextDocument != null) - { - try - { - removeTextDocument.Dispose(); //.close(); - } - catch (IOException e) - { - // TODO Auto-generated catch block - // e.printStackTrace(); - } - } - } - - List verticalRulings = this.getVerticalRulings(image); - - List allEdges = new List(horizontalRulings); - allEdges.AddRange(verticalRulings); - - List tableAreas = new List(); - - // if we found some edges, try to find some tables based on them - if (allEdges.Count > 0) - { - // now we need to snap edge endpoints to a grid - Utils.snapPoints(allEdges, POINT_SNAP_DISTANCE_THRESHOLD, POINT_SNAP_DISTANCE_THRESHOLD); - - // normalize the rulings to make sure snapping didn't create any wacky non-horizontal/vertical rulings - foreach (List rulings in new[] { horizontalRulings, verticalRulings }) // Arrays.asList(horizontalRulings, verticalRulings)) - { - //foreach (Iterator iterator = rulings.iterator(); iterator.hasNext();) - foreach (var ruling in rulings.ToList()) // ToList() to do a copy to allow remove in original - { - //Ruling ruling = iterator.next(); - - ruling.normalize(); - if (ruling.oblique()) - { - rulings.Remove(ruling); //iterator.remove(); - } - } - } - - // merge the edge lines into rulings - this makes finding edges between crossing points in the next step easier - // we use a larger pixel expansion than the normal spreadsheet extraction method to cover gaps in the - // edge detection/pixel snapping steps - horizontalRulings = Ruling.collapseOrientedRulings(horizontalRulings, 5); - verticalRulings = Ruling.collapseOrientedRulings(verticalRulings, 5); - - // use the rulings and points to find cells - var cells = SpreadsheetExtractionAlgorithm.findCells(horizontalRulings, verticalRulings); // List - - // then use those cells to make table areas - tableAreas = this.getTableAreasFromCells(cells.Cast().ToList()); - } - - // next find any vertical rulings that intersect tables - sometimes these won't have completely been captured as - // cells if there are missing horizontal lines (which there often are) - // let's assume though that these lines should be part of the table - foreach (Ruling verticalRuling in verticalRulings) // Line2D.Float - { - foreach (TableRectangle tableArea in tableAreas) - { - if (verticalRuling.intersects(tableArea) && !(tableArea.contains(verticalRuling.getP1()) && tableArea.contains(verticalRuling.getP2()))) - { - tableArea.setTop((float)Math.Floor(Math.Max(tableArea.getTop(), verticalRuling.getY1()))); // min - tableArea.setBottom((float)Math.Ceiling(Math.Min(tableArea.getBottom(), verticalRuling.getY2()))); // max - break; - } - } - } - - // the tabula Page coordinate space is half the size of the PDFBox image coordinate space - // so halve the table area size before proceeding and add a bit of padding to make sure we capture everything - foreach (TableRectangle area in tableAreas) - { - area.x = (float)Math.Floor(area.x / 2) - TABLE_PADDING_AMOUNT; - area.y = (float)Math.Floor(area.y / 2) - TABLE_PADDING_AMOUNT; - area.width = (float)Math.Ceiling(area.width / 2) + TABLE_PADDING_AMOUNT; - area.height = (float)Math.Ceiling(area.height / 2) + TABLE_PADDING_AMOUNT; - } - - // we're going to want halved horizontal lines later too - foreach (Ruling ruling in horizontalRulings) // Line2D.Float - { - ruling.x1 = ruling.x1 / 2; - ruling.y1 = ruling.y1 / 2; - ruling.x2 = ruling.x2 / 2; - ruling.y2 = ruling.y2 / 2; - } - - // now look at text rows to help us find more tables and flesh out existing ones - List textChunks = TextElement.mergeWords(page.getText()); - List lines = TextChunk.groupByLines(textChunks); - - // first look for text rows that intersect an existing table - those lines should probably be part of the table - foreach (TableLine textRow in lines) - { - foreach (TableRectangle tableArea in tableAreas) - { - if (!tableArea.contains(textRow) && textRow.intersects(tableArea)) - { - tableArea.setLeft((float)Math.Floor(Math.Min(textRow.getLeft(), tableArea.getLeft()))); - tableArea.setRight((float)Math.Ceiling(Math.Max(textRow.getRight(), tableArea.getRight()))); - } - } - } - - // get rid of tables that DO NOT intersect any text areas - these are likely graphs or some sort of graphic - //for (Iterator iterator = tableAreas.iterator(); iterator.hasNext();) - foreach (var table in tableAreas.ToList()) // ToList() to do a copy to allow remove in original - { - //TableRectangle table = iterator.next(); - - bool intersectsText = false; - foreach (TableLine textRow in lines) - { - if (table.intersects(textRow)) - { - intersectsText = true; - break; - } - } - - if (!intersectsText) - { - //iterator.remove(); - tableAreas.Remove(table); - } - } - - // lastly, there may be some tables that don't have any vertical rulings at all - // we'll use text edges we've found to try and guess which text rows are part of a table - - // in his thesis nurminen goes through every row to try to assign a probability that the line is in a table - // we're going to try a general heuristic instead, trying to find what type of edge (left/right/mid) intersects - // the most text rows, and then use that magic number of "relevant" edges to decide what text rows should be - // part of a table. - - bool foundTable; - - do - { - foundTable = false; - - // get rid of any text lines contained within existing tables, this allows us to find more tables - //for (Iterator iterator = lines.iterator(); iterator.hasNext();) - foreach (var textRow in lines) - { - //TableLine textRow = iterator.next(); - foreach (TableRectangle table in tableAreas.ToList()) // ToList() to do a copy to allow remove in original - { - if (table.contains(textRow)) - { - //iterator.remove(); - lines.Remove(textRow); - break; - } - } - } - - // get text edges from remaining lines in the document - TextEdges textEdges = this.getTextEdges(lines); - List leftTextEdges = textEdges.get(TextEdge.LEFT); - List midTextEdges = textEdges.get(TextEdge.MID); - List rightTextEdges = textEdges.get(TextEdge.RIGHT); - - // find the relevant text edges (the ones we think define where a table is) - RelevantEdges relevantEdgeInfo = this.getRelevantEdges(textEdges, lines); - - // we found something relevant so let's look for rows that fit our criteria - if (relevantEdgeInfo.edgeType != -1) - { - List relevantEdges = null; - switch (relevantEdgeInfo.edgeType) - { - case TextEdge.LEFT: - relevantEdges = leftTextEdges; - break; - case TextEdge.MID: - relevantEdges = midTextEdges; - break; - case TextEdge.RIGHT: - relevantEdges = rightTextEdges; - break; - } - - TableRectangle table = this.getTableFromText(lines, relevantEdges, relevantEdgeInfo.edgeCount, horizontalRulings); - - if (table != null) - { - foundTable = true; - tableAreas.Add(table); - } - } - } while (foundTable); - - // create a set of our current tables that will eliminate duplicate tables - // Set tableSet = new TreeSet<>(new Comparator() { - // not sure if works with sorted set?? - SortedSet tableSet = new SortedSet(new TreeSetRectangleComparer()); - - //tableSet.addAll(tableAreas); - foreach (var ta in tableAreas) - { - tableSet.Add(ta); - } - - return new List(tableSet); //ArrayList<>(tableSet); - */ - } - - public class TreeSetRectangleComparer : IComparer - { - public int Compare(TableRectangle o1, TableRectangle o2) - { - if (o1.Equals(o2)) - { - return 0; - } - - // o1 is "equal" to o2 if o2 contains all of o1 - if (o2.Contains(o1)) - { - return 0; - } - - if (o1.Contains(o2)) - { - return 0; - } - - // otherwise see if these tables are "mostly" the same - double overlap = o1.OverlapRatio(o2); - if (overlap >= IDENTICAL_TABLE_OVERLAP_RATIO) - { - return 0; - } - else - { - return 1; - } - } - } - - private TableRectangle getTableFromText(List lines, - List relevantEdges, - int relevantEdgeCount, - List horizontalRulings) - { - - TableRectangle table = new TableRectangle(); - - TableLine prevRow = null; - TableLine firstTableRow = null; - TableLine lastTableRow = null; - - int tableSpaceCount = 0; - double totalRowSpacing = 0; - - // go through the lines and find the ones that have the correct count of the relevant edges - foreach (TableLine textRow in lines) - { - int numRelevantEdges = 0; - - if (firstTableRow != null && tableSpaceCount > 0) - { - // check to make sure this text row is within a line or so of the other lines already added - // if it's not, we should stop the table here - double tableLineThreshold = (totalRowSpacing / tableSpaceCount) * 2.5; - double lineDistance = textRow.Top - prevRow.Top; - - if (lineDistance > tableLineThreshold) - { - lastTableRow = prevRow; - break; - } - } - - // for larger tables, be a little lenient on the number of relevant rows the text intersects - // for smaller tables, not so much - otherwise we'll end up treating paragraphs as tables too - int relativeEdgeDifferenceThreshold = 1; - if (relevantEdgeCount <= 3) - { - relativeEdgeDifferenceThreshold = 0; - } - - foreach (TextEdge edge in relevantEdges) - { - if (textRow.IntersectsLine(edge.line)) - { - numRelevantEdges++; - } - } - - // see if we have a candidate text row - if (numRelevantEdges >= (relevantEdgeCount - relativeEdgeDifferenceThreshold)) - { - // keep track of table row spacing - if (prevRow != null && firstTableRow != null) - { - tableSpaceCount++; - totalRowSpacing += textRow.Top - prevRow.Top; - } - - // row is part of a table - if (table.Area == 0) - { - firstTableRow = textRow; - table.SetRect(textRow); - } - else - { - table.SetLeft(Math.Min(table.Left, textRow.Left)); - table.SetBottom(Math.Min(table.Bottom, textRow.Bottom)); // max - table.SetRight(Math.Max(table.Right, textRow.Right)); - } - } - else - { - // no dice - // if we're at the end of the table, save the last row - if (firstTableRow != null && lastTableRow == null) - { - lastTableRow = prevRow; - } - } - - prevRow = textRow; - } - - // if we don't have a table now, we won't after the next step either - if (table.Area == 0) - { - return null; - } - - if (lastTableRow == null) - { - // takes care of one-row tables or tables that end at the bottom of a page - lastTableRow = prevRow; - } - - // use the average row height and nearby horizontal lines to extend the table area - double avgRowHeight; - if (tableSpaceCount > 0) - { - avgRowHeight = totalRowSpacing / tableSpaceCount; - } - else - { - avgRowHeight = lastTableRow.Height; - } - - double rowHeightThreshold = avgRowHeight * 1.5; - - // check lines after the bottom of the table - foreach (Ruling ruling in horizontalRulings) // Line2D.Float - { - - if (ruling.Y1 < table.Bottom) - { - continue; - } - - double distanceFromTable = ruling.Y1 - table.Bottom; - if (distanceFromTable <= rowHeightThreshold) - { - // use this ruling to help define the table - table.SetBottom(Math.Min(table.Bottom, ruling.Y1)); // max - table.SetLeft(Math.Min(table.Left, ruling.X1)); - table.SetRight(Math.Max(table.Right, ruling.X2)); - } - else - { - // no use checking any further - break; - } - } - - // do the same for lines at the top, but make the threshold greater since table headings tend to be - // larger to fit up to three-ish rows of text (at least but we don't want to grab too much) - rowHeightThreshold = avgRowHeight * 3.8f; - - for (int i = horizontalRulings.Count - 1; i >= 0; i--) - { - Ruling ruling = horizontalRulings[i];//.get(i); Line2D.Float - - if (ruling.Y1 > table.Top) // bobld or lines) - { - List leftTextEdges = textEdges[TextEdge.LEFT]; //.get(TextEdge.LEFT); - List midTextEdges = textEdges[TextEdge.MID]; //.get(TextEdge.MID); - List rightTextEdges = textEdges[TextEdge.RIGHT]; //.get(TextEdge.RIGHT); - - // first we'll find the number of lines each type of edge crosses - int[][] edgeCountsPerLine = new int[lines.Count][]; //[TextEdge.NUM_TYPES]; - for (int i = 0; i < edgeCountsPerLine.Length; i++) - { - edgeCountsPerLine[i] = new int[TextEdge.NUM_TYPES]; - } - - foreach (TextEdge edge in leftTextEdges) - { - edgeCountsPerLine[edge.intersectingTextRowCount - 1][TextEdge.LEFT]++; - } - - foreach (TextEdge edge in midTextEdges) - { - edgeCountsPerLine[edge.intersectingTextRowCount - 1][TextEdge.MID]++; - } - - foreach (TextEdge edge in rightTextEdges) - { - edgeCountsPerLine[edge.intersectingTextRowCount - 1][TextEdge.RIGHT]++; - } - - // now let's find the relevant edge type and the number of those edges we should look for - // we'll only take a minimum of two edges to look for tables - int relevantEdgeType = -1; - int relevantEdgeCount = 0; - for (int i = edgeCountsPerLine.Length - 1; i > 2; i--) - { - if (edgeCountsPerLine[i][TextEdge.LEFT] > 2 && - edgeCountsPerLine[i][TextEdge.LEFT] >= edgeCountsPerLine[i][TextEdge.RIGHT] && - edgeCountsPerLine[i][TextEdge.LEFT] >= edgeCountsPerLine[i][TextEdge.MID]) - { - relevantEdgeCount = edgeCountsPerLine[i][TextEdge.LEFT]; - relevantEdgeType = TextEdge.LEFT; - break; - } - - if (edgeCountsPerLine[i][TextEdge.RIGHT] > 1 && - edgeCountsPerLine[i][TextEdge.RIGHT] >= edgeCountsPerLine[i][TextEdge.LEFT] && - edgeCountsPerLine[i][TextEdge.RIGHT] >= edgeCountsPerLine[i][TextEdge.MID]) - { - relevantEdgeCount = edgeCountsPerLine[i][TextEdge.RIGHT]; - relevantEdgeType = TextEdge.RIGHT; - break; - } - - if (edgeCountsPerLine[i][TextEdge.MID] > 1 && - edgeCountsPerLine[i][TextEdge.MID] >= edgeCountsPerLine[i][TextEdge.RIGHT] && - edgeCountsPerLine[i][TextEdge.MID] >= edgeCountsPerLine[i][TextEdge.LEFT]) - { - relevantEdgeCount = edgeCountsPerLine[i][TextEdge.MID]; - relevantEdgeType = TextEdge.MID; - break; - } - } - - return new RelevantEdges(relevantEdgeType, relevantEdgeCount); - } - - private TextEdges getTextEdges(List lines) - { - // get all text edges (lines that align with the left, middle and right of chunks of text) that extend - // uninterrupted over at least REQUIRED_TEXT_LINES_FOR_EDGE lines of text - List leftTextEdges = new List(); // ArrayList<>(); - List midTextEdges = new List(); //ArrayList<>(); - List rightTextEdges = new List(); // ArrayList<>(); - - var currLeftEdges = new Dictionary>(); // Map> currLeftEdges = new HashMap<>(); - var currMidEdges = new Dictionary>(); // Map> currMidEdges = new HashMap<>(); - var currRightEdges = new Dictionary>(); // Map> currRightEdges = new HashMap<>(); - - foreach (TableLine textRow in lines) - { - foreach (TextChunk text in textRow.TextElements) - { - int left = (int)Math.Floor(text.Left); // new Integer( - int right = (int)Math.Floor(text.Right); //new Integer( - int mid = left + ((right - left) / 2);//new Integer( - - // first put this chunk into any edge buckets it belongs to - List leftEdge = currLeftEdges[left]; //.get(left); - if (leftEdge == null) - { - leftEdge = new List(); //ArrayList<>(); - currLeftEdges[left] = leftEdge; //.put(left, leftEdge); - } - leftEdge.Add(text); - - List midEdge = currMidEdges[mid];//.get(mid); - if (midEdge == null) - { - midEdge = new List(); //ArrayList<>(); - currMidEdges[mid] = midEdge; //.put(mid, midEdge); - } - midEdge.Add(text); - - List rightEdge = currRightEdges[right]; //.get(right); - if (rightEdge == null) - { - rightEdge = new List(); //ArrayList<>(); - currRightEdges[right] = rightEdge; //.put(right, rightEdge); - } - rightEdge.Add(text); - - // now see if this text chunk blows up any other edges - //for (Iterator>> iterator = currLeftEdges.entrySet().iterator(); iterator.hasNext();) - foreach (var entry in currLeftEdges.ToList()) - { - //Map.Entry> entry = iterator.next(); - int key = entry.Key; //.getKey(); - if (key > left && key < right) - { - //iterator.remove(); - currLeftEdges.Remove(key); - List edgeChunks = entry.Value; //.getValue(); - if (edgeChunks.Count >= REQUIRED_TEXT_LINES_FOR_EDGE) - { - TextChunk first = edgeChunks[0];//.get(0); - TextChunk last = edgeChunks[edgeChunks.Count - 1];//.get(edgeChunks.size() - 1); - - TextEdge edge = new TextEdge(key, first.Top, key, last.Bottom); - edge.intersectingTextRowCount = Math.Min(edgeChunks.Count, lines.Count); - - leftTextEdges.Add(edge); - } - } - } - - //for (Iterator>> iterator = currMidEdges.entrySet().iterator(); iterator.hasNext();) - foreach (var entry in currMidEdges.ToList()) - { - //Map.Entry> entry = iterator.next(); - int key = entry.Key; //.getKey(); - if (key > left && key < right && Math.Abs(key - mid) > 2) - { - //iterator.remove(); - currMidEdges.Remove(key); - List edgeChunks = entry.Value; //.getValue(); - if (edgeChunks.Count >= REQUIRED_TEXT_LINES_FOR_EDGE) - { - TextChunk first = edgeChunks[0];//.get(0); - TextChunk last = edgeChunks[edgeChunks.Count - 1]; //.get(edgeChunks.size() - 1); - - TextEdge edge = new TextEdge(key, first.Top, key, last.Bottom); - edge.intersectingTextRowCount = Math.Min(edgeChunks.Count, lines.Count); - - midTextEdges.Add(edge); - } - } - } - - //for (Iterator>> iterator = currRightEdges.entrySet().iterator(); iterator.hasNext();) - foreach (var entry in currRightEdges.ToList()) - { - //Map.Entry> entry = iterator.next(); - int key = entry.Key; //.getKey(); - if (key > left && key < right) - { - //iterator.remove(); - currRightEdges.Remove(key); - List edgeChunks = entry.Value; //.getValue(); - if (edgeChunks.Count >= REQUIRED_TEXT_LINES_FOR_EDGE) - { - TextChunk first = edgeChunks[0];//.get(0); - TextChunk last = edgeChunks[edgeChunks.Count - 1]; //.get(edgeChunks.size() - 1); - - TextEdge edge = new TextEdge(key, first.Top, key, last.Bottom); - edge.intersectingTextRowCount = Math.Min(edgeChunks.Count, lines.Count); - - rightTextEdges.Add(edge); - } - } - } - } - } - - // add the leftovers - //foreach (Integer key in currLeftEdges.keySet()) - foreach (var key in currLeftEdges.Keys) - { - List edgeChunks = currLeftEdges[key]; //.get(key); - if (edgeChunks.Count >= REQUIRED_TEXT_LINES_FOR_EDGE) - { - TextChunk first = edgeChunks[0]; //.get(0); - TextChunk last = edgeChunks[edgeChunks.Count - 1]; //.get(edgeChunks.size() - 1); - - TextEdge edge = new TextEdge(key, first.Top, key, last.Bottom); - edge.intersectingTextRowCount = Math.Min(edgeChunks.Count, lines.Count); - - leftTextEdges.Add(edge); - } - } - - foreach (int key in currMidEdges.Keys) //.keySet()) - { - List edgeChunks = currMidEdges[key]; // .get(key); - if (edgeChunks.Count >= REQUIRED_TEXT_LINES_FOR_EDGE) - { - TextChunk first = edgeChunks[0]; //.get(0); - TextChunk last = edgeChunks[edgeChunks.Count - 1];//.get(edgeChunks.size() - 1); - - TextEdge edge = new TextEdge(key, first.Top, key, last.Bottom); - edge.intersectingTextRowCount = Math.Min(edgeChunks.Count, lines.Count); - - midTextEdges.Add(edge); - } - } - - foreach (int key in currRightEdges.Keys) //.keySet()) - { - List edgeChunks = currRightEdges[key]; //.get(key); - if (edgeChunks.Count >= REQUIRED_TEXT_LINES_FOR_EDGE) - { - TextChunk first = edgeChunks[0];//.get(0); - TextChunk last = edgeChunks[edgeChunks.Count - 1]; // .get(edgeChunks.size() - 1); - - TextEdge edge = new TextEdge(key, first.Top, key, last.Bottom); - edge.intersectingTextRowCount = Math.Min(edgeChunks.Count, lines.Count); - - rightTextEdges.Add(edge); - } - } - - return new TextEdges(leftTextEdges, midTextEdges, rightTextEdges); - } - - private List getTableAreasFromCells(List cells) - { - List> cellGroups = new List>(); // ArrayList<>(); - foreach (TableRectangle cell in cells) - { - bool addedToGroup = false; - - bool breakCellCheck = false; - //cellCheck: - foreach (List cellGroup in cellGroups) - { - if (breakCellCheck) break; // simulates 'break cellCheck;' - foreach (TableRectangle groupCell in cellGroup) - { - PdfPoint[] groupCellCorners = groupCell.Points; - PdfPoint[] candidateCorners = cell.Points; - - for (int i = 0; i < candidateCorners.Length; i++) - { - for (int j = 0; j < groupCellCorners.Length; j++) - { - //if (candidateCorners[i].distance(groupCellCorners[j]) < CELL_CORNER_DISTANCE_MAXIMUM) - if (Distances.Euclidean(candidateCorners[i], groupCellCorners[j]) < CELL_CORNER_DISTANCE_MAXIMUM) - { - cellGroup.Add(cell); - addedToGroup = true; - //break cellCheck; - breakCellCheck = true; - break; - } - } - } - } - } - - if (!addedToGroup) - { - List cellGroup = new List(); //ArrayList cellGroup = new ArrayList<>(); - cellGroup.Add(cell); - cellGroups.Add(cellGroup); - } - } - - // create table areas based on cell group - List tableAreas = new List(); //new ArrayList<>(); - foreach (List cellGroup in cellGroups) - { - // less than four cells should not make a table - if (cellGroup.Count < REQUIRED_CELLS_FOR_TABLE) - { - continue; - } - - double top = double.MinValue; // Float.MAX_VALUE; - double left = double.MaxValue; // Float.MAX_VALUE; - double bottom = double.MaxValue; // Float.MIN_VALUE; - double right = double.MinValue; // Float.MIN_VALUE; - - foreach (TableRectangle cell in cellGroup) - { - if (cell.Top > top) top = cell.Top; // (cell.getTop() < top) - if (cell.Left < left) left = cell.Left; - if (cell.Bottom < bottom) bottom = cell.Bottom; // (cell.getBottom() > bottom) - if (cell.Right > right) right = cell.Right; - } - - tableAreas.Add(new TableRectangle(top, left, right - left, bottom - top)); - } - - return tableAreas; - } - - private List getHorizontalRulings(object image) // BufferedImage - { - throw new NotImplementedException(); - - /* - // get all horizontal edges, which we'll define as a change in grayscale colour - // along a straight line of a certain length - List horizontalRulings = new List(); // ArrayList<>(); - - Raster r = image.getRaster(); - int width = r.getWidth(); - int height = r.getHeight(); - - for (int x = 0; x < width; x++) - { - int[] lastPixel = r.getPixel(x, 0, (int[])null); - - for (int y = 1; y < height - 1; y++) - { - int[] currPixel = r.getPixel(x, y, (int[])null); - - int diff = Math.Abs(currPixel[0] - lastPixel[0]); - if (diff > GRAYSCALE_INTENSITY_THRESHOLD) - { - // we hit what could be a line - // don't bother scanning it if we've hit a pixel in the line before - bool alreadyChecked = false; - foreach (var line in horizontalRulings) // - { - if (y == line.getY1() && x >= line.getX1() && x <= line.getX2()) - { - alreadyChecked = true; - break; - } - } - - if (alreadyChecked) - { - lastPixel = currPixel; - continue; - } - - int lineX = x + 1; - - while (lineX < width) - { - int[] linePixel = r.getPixel(lineX, y, (int[])null); - int[] abovePixel = r.getPixel(lineX, y - 1, (int[])null); - - if (Math.Abs(linePixel[0] - abovePixel[0]) <= GRAYSCALE_INTENSITY_THRESHOLD - || Math.Abs(currPixel[0] - linePixel[0]) > GRAYSCALE_INTENSITY_THRESHOLD) - { - break; - } - - lineX++; - } - - int endX = lineX - 1; - int lineWidth = endX - x; - if (lineWidth > HORIZONTAL_EDGE_WIDTH_MINIMUM) - { - horizontalRulings.Add(new Ruling(new PdfPoint(x, y), new PdfPoint(endX, y))); - } - } - - lastPixel = currPixel; - } - } - - return horizontalRulings; - */ - } - - private List getVerticalRulings(object image) // BufferedImage - { - throw new NotImplementedException(); - - /* - // get all vertical edges, which we'll define as a change in grayscale colour - // along a straight line of a certain length - List verticalRulings = new List();//new ArrayList<>(); - - Raster r = image.getRaster(); - int width = r.getWidth(); - int height = r.getHeight(); - - for (int y = 0; y < height; y++) - { - int[] lastPixel = r.getPixel(0, y, (int[])null); - - for (int x = 1; x < width - 1; x++) - { - int[] currPixel = r.getPixel(x, y, (int[])null); - - int diff = Math.Abs(currPixel[0] - lastPixel[0]); - if (diff > GRAYSCALE_INTENSITY_THRESHOLD) - { - // we hit what could be a line - // don't bother scanning it if we've hit a pixel in the line before - bool alreadyChecked = false; - foreach (var line in verticalRulings) - { - if (x == line.getX1() && y >= line.getY1() && y <= line.getY2()) - { - alreadyChecked = true; - break; - } - } - - if (alreadyChecked) - { - lastPixel = currPixel; - continue; - } - - int lineY = y + 1; - - while (lineY < height) - { - int[] linePixel = r.getPixel(x, lineY, (int[])null); - int[] leftPixel = r.getPixel(x - 1, lineY, (int[])null); - - if (Math.Abs(linePixel[0] - leftPixel[0]) <= GRAYSCALE_INTENSITY_THRESHOLD - || Math.Abs(currPixel[0] - linePixel[0]) > GRAYSCALE_INTENSITY_THRESHOLD) - { - break; - } - - lineY++; - } - - int endY = lineY - 1; - int lineLength = endY - y; - if (lineLength > VERTICAL_EDGE_HEIGHT_MINIMUM) - { - verticalRulings.Add(new Ruling(new PdfPoint(x, y), new PdfPoint(x, endY))); - } - } - - lastPixel = currPixel; - } - } - - return verticalRulings; - */ - } - - // taken from http://www.docjar.com/html/api/org/apache/pdfbox/examples/util/RemoveAllText.java.html - private PdfDocument removeText(Page page) - { - throw new NotImplementedException(); - /* - PDFStreamParser parser = new PDFStreamParser(page); - parser.parse(); - List tokens = parser.getTokens(); - List newTokens = new List(); //ArrayList<>(); - foreach (object token in tokens) - { - if (token is Operator op) // instanceof - { - //Operator op = (Operator)token; - if (op.getName().equals("TJ") || op.getName().equals("Tj")) - { - //remove the one argument to this operator - newTokens.remove(newTokens.size() - 1); - continue; - } - } - newTokens.Add(token); - } - - PdfDocument document = new PdfDocument(); - Page newPage = document.importPage(page); - newPage.setResources(page.getResources()); - - PDStream newContents = new PDStream(document); - OutputStream outp = newContents.createOutputStream(COSName.FLATE_DECODE); - ContentStreamWriter writer = new ContentStreamWriter(outp); - writer.writeTokens(newTokens); - outp.close(); - newPage.setContents(newContents); - return document; - */ } } } diff --git a/Tabula/Detectors/SimpleNurminenDetectionAlgorithm.cs b/Tabula/Detectors/SimpleNurminenDetectionAlgorithm.cs new file mode 100644 index 0000000..00e9ca1 --- /dev/null +++ b/Tabula/Detectors/SimpleNurminenDetectionAlgorithm.cs @@ -0,0 +1,848 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using Tabula.Extractors; +using UglyToad.PdfPig.Core; +using UglyToad.PdfPig.DocumentLayoutAnalysis; + +namespace Tabula.Detectors +{ + /* + * ** tabula/detectors/NurminenDetectionAlgorithm.java ** + * Created by matt on 2015-12-17. + *

+ * Attempt at an implementation of the table finding algorithm described by + * Anssi Nurminen's master's thesis: + * http://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3 + */ + + ///

+ /// Simplified Nurminen detection algorithm. + /// Does not do any image processing. + /// + public class SimpleNurminenDetectionAlgorithm : IDetectionAlgorithm + { + private static int HORIZONTAL_EDGE_WIDTH_MINIMUM = 50; + private static int VERTICAL_EDGE_HEIGHT_MINIMUM = 10; + private static int CELL_CORNER_DISTANCE_MAXIMUM = 10; + private static double POINT_SNAP_DISTANCE_THRESHOLD = 8.0; + private static double TABLE_PADDING_AMOUNT = 1.0; + private static int REQUIRED_TEXT_LINES_FOR_EDGE = 4; + private static int REQUIRED_CELLS_FOR_TABLE = 4; + private static double IDENTICAL_TABLE_OVERLAP_RATIO = 0.9; + + /// + /// Helper class that encapsulates a text edge + /// + private class TextEdge + { + public PdfLine Line; + + // types of text edges + public const int LEFT = 0; + public const int MID = 1; + public const int RIGHT = 2; + public const int NUM_TYPES = 3; + + public int intersectingTextRowCount; + + public TextEdge(double x1, double y1, double x2, double y2) + { + Line = new PdfLine(x1, y1, x2, y2); + this.intersectingTextRowCount = 0; + } + + public override string ToString() + { + return $"{Line.Point1}-{Line.Point2}"; + } + } + + /// + /// Helper container for all text edges on a page + /// + private class TextEdges : List> + { + public TextEdges(List leftEdges, List midEdges, List rightEdges) : base(3) + { + this.Add(leftEdges); + this.Add(midEdges); + this.Add(rightEdges); + } + } + + /// + /// Helper container for relevant text edge info + /// + private class RelevantEdges + { + public int edgeType; + public int edgeCount; + + public RelevantEdges(int edgeType, int edgeCount) + { + this.edgeType = edgeType; + this.edgeCount = edgeCount; + } + } + + /// + /// Simplified Nurminen detection algorithm. + /// Does not do any image processing. + /// + public SimpleNurminenDetectionAlgorithm() + { } + + /// + /// Detects the tables in the page. + /// + /// + public List Detect(PageArea page) + { + // get horizontal & vertical lines + // we get these from an image of the PDF and not the PDF itself because sometimes there are invisible PDF + // instructions that are interpreted incorrectly as visible elements - we really want to capture what a + // person sees when they look at the PDF + // BobLd: hack here, we don't convert to an image + var pageRulings = page.GetRulings(); + List horizontalRulings = this.getHorizontalRulings(pageRulings); + List verticalRulings = this.getVerticalRulings(pageRulings); + // end hack here + + List allEdges = new List(horizontalRulings); + allEdges.AddRange(verticalRulings); + + List tableAreas = new List(); + + // if we found some edges, try to find some tables based on them + if (allEdges.Count > 0) + { + // now we need to snap edge endpoints to a grid + Utils.SnapPoints(allEdges, POINT_SNAP_DISTANCE_THRESHOLD, POINT_SNAP_DISTANCE_THRESHOLD); + + // normalize the rulings to make sure snapping didn't create any wacky non-horizontal/vertical rulings + foreach (List rulings in new[] { horizontalRulings, verticalRulings }) //Arrays.asList(horizontalRulings, verticalRulings)) + { + //for (Iterator iterator = rulings.iterator(); iterator.hasNext();) + foreach (var ruling in rulings.ToList()) // use ToList to be able to remove + { + ruling.Normalize(); + if (ruling.IsOblique) + { + rulings.Remove(ruling); + } + } + } + + // merge the edge lines into rulings - this makes finding edges between crossing points in the next step easier + // we use a larger pixel expansion than the normal spreadsheet extraction method to cover gaps in the + // edge detection/pixel snapping steps + horizontalRulings = Ruling.CollapseOrientedRulings(horizontalRulings, 5); + verticalRulings = Ruling.CollapseOrientedRulings(verticalRulings, 5); + + // use the rulings and points to find cells + List cells = SpreadsheetExtractionAlgorithm.FindCells(horizontalRulings, verticalRulings).Cast().ToList(); + + // then use those cells to make table areas + tableAreas = getTableAreasFromCells(cells); + } + + // next find any vertical rulings that intersect tables - sometimes these won't have completely been captured as + // cells if there are missing horizontal lines (which there often are) + // let's assume though that these lines should be part of the table + foreach (Ruling verticalRuling in verticalRulings) // Line2D.Float + { + foreach (TableRectangle tableArea in tableAreas) + { + if (verticalRuling.Intersects(tableArea) && + !(tableArea.Contains(verticalRuling.P1) && tableArea.Contains(verticalRuling.P2))) + { + tableArea.SetTop(Math.Ceiling(Math.Max(tableArea.Top, verticalRuling.Y2))); // bobld: Floor and Min, Y1 + tableArea.SetBottom(Math.Floor(Math.Min(tableArea.Bottom, verticalRuling.Y1))); // bobld: Ceiling and Max, Y2 + break; + } + } + } + + /* BobLd: not sure this is the case in tabula-sharp/PdfPig + // the tabula Page coordinate space is half the size of the PDFBox image coordinate space + // so halve the table area size before proceeding and add a bit of padding to make sure we capture everything + foreach (TableRectangle area in tableAreas) + { + area.x = (float)Math.floor(area.x / 2) - TABLE_PADDING_AMOUNT; + area.y = (float)Math.floor(area.y / 2) - TABLE_PADDING_AMOUNT; + area.width = (float)Math.ceil(area.width / 2) + TABLE_PADDING_AMOUNT; + area.height = (float)Math.ceil(area.height / 2) + TABLE_PADDING_AMOUNT; + } + + // we're going to want halved horizontal lines later too + foreach (Ruling ruling in horizontalRulings) // Line2D.Float + { + ruling.x1 = ruling.x1 / 2; + ruling.y1 = ruling.y1 / 2; + ruling.x2 = ruling.x2 / 2; + ruling.y2 = ruling.y2 / 2; + } + */ + + // now look at text rows to help us find more tables and flesh out existing ones + List textChunks = TextElement.MergeWords(page.GetText()); + List lines = TextChunk.GroupByLines(textChunks); + + // first look for text rows that intersect an existing table - those lines should probably be part of the table + foreach (TableLine textRow in lines) + { + foreach (TableRectangle tableArea in tableAreas) + { + if (!tableArea.Contains(textRow) && textRow.Intersects(tableArea)) + { + tableArea.SetLeft(Math.Floor(Math.Min(textRow.Left, tableArea.Left))); + tableArea.SetRight(Math.Ceiling(Math.Max(textRow.Right, tableArea.Right))); + } + } + } + + // get rid of tables that DO NOT intersect any text areas - these are likely graphs or some sort of graphic + //for (Iterator iterator = tableAreas.iterator(); iterator.hasNext();) + foreach (TableRectangle table in tableAreas.ToList()) // use tolist to be able to remove + { + + bool intersectsText = false; + foreach (TableLine textRow in lines) + { + if (table.Intersects(textRow)) + { + intersectsText = true; + break; + } + } + + if (!intersectsText) + { + tableAreas.Remove(table); + } + } + + // lastly, there may be some tables that don't have any vertical rulings at all + // we'll use text edges we've found to try and guess which text rows are part of a table + + // in his thesis nurminen goes through every row to try to assign a probability that the line is in a table + // we're going to try a general heuristic instead, trying to find what type of edge (left/right/mid) intersects + // the most text rows, and then use that magic number of "relevant" edges to decide what text rows should be + // part of a table. + + bool foundTable; + + do + { + foundTable = false; + + // get rid of any text lines contained within existing tables, this allows us to find more tables + //for (Iterator iterator = lines.iterator(); iterator.hasNext();) + foreach (var textRow in lines.ToList()) + { + foreach (TableRectangle table in tableAreas) + { + if (table.Contains(textRow)) + { + lines.Remove(textRow); + break; + } + } + } + + // get text edges from remaining lines in the document + TextEdges textEdges = getTextEdges(lines); + List leftTextEdges = textEdges[TextEdge.LEFT]; + List midTextEdges = textEdges[TextEdge.MID]; + List rightTextEdges = textEdges[TextEdge.RIGHT]; + + // find the relevant text edges (the ones we think define where a table is) + RelevantEdges relevantEdgeInfo = getRelevantEdges(textEdges, lines); + + // we found something relevant so let's look for rows that fit our criteria + if (relevantEdgeInfo.edgeType != -1) + { + List relevantEdges = null; + switch (relevantEdgeInfo.edgeType) + { + case TextEdge.LEFT: + relevantEdges = leftTextEdges; + break; + case TextEdge.MID: + relevantEdges = midTextEdges; + break; + case TextEdge.RIGHT: + relevantEdges = rightTextEdges; + break; + } + + TableRectangle table = getTableFromText(lines, relevantEdges, relevantEdgeInfo.edgeCount, horizontalRulings); + + if (table != null) + { + foundTable = true; + tableAreas.Add(table); + } + } + } while (foundTable); + + // create a set of our current tables that will eliminate duplicate tables + SortedSet tableSet = new SortedSet(new TreeSetComparer()); //Set tableSet = new TreeSet<>(new Comparator() {... + foreach (var table in tableAreas.OrderByDescending(t => t.Area)) + { + tableSet.Add(table); + } + + return tableSet.ToList(); + } + + private class TreeSetComparer : IComparer + { + public int Compare(TableRectangle o1, TableRectangle o2) + { + if (o1.Equals(o2)) + { + return 0; + } + + // o1 is "equal" to o2 if o2 contains all of o1 + if (o2.Contains(o1)) + { + return 0; + } + + if (o1.Contains(o2)) + { + return 0; + } + + // otherwise see if these tables are "mostly" the same + double overlap = o1.OverlapRatio(o2); + if (overlap >= IDENTICAL_TABLE_OVERLAP_RATIO) + { + return 0; + } + else + { + return 1; + } + } + } + + private TableRectangle getTableFromText(List lines, List relevantEdges, int relevantEdgeCount, List horizontalRulings) + { + TableRectangle table = new TableRectangle(); + + TableLine prevRow = null; + TableLine firstTableRow = null; + TableLine lastTableRow = null; + + int tableSpaceCount = 0; + double totalRowSpacing = 0; + + // go through the lines and find the ones that have the correct count of the relevant edges + foreach (TableLine textRow in lines) + { + int numRelevantEdges = 0; + + if (firstTableRow != null && tableSpaceCount > 0) + { + // check to make sure this text row is within a line or so of the other lines already added + // if it's not, we should stop the table here + double tableLineThreshold = (totalRowSpacing / tableSpaceCount) * 2.5; + double lineDistance = prevRow.Bottom - textRow.Bottom; // bobld: textRow.Top - prevRow.Top + + System.Diagnostics.Debug.Assert(lineDistance >= 0); + + if (lineDistance > tableLineThreshold) + { + lastTableRow = prevRow; + break; + } + } + + // for larger tables, be a little lenient on the number of relevant rows the text intersects + // for smaller tables, not so much - otherwise we'll end up treating paragraphs as tables too + int relativeEdgeDifferenceThreshold = 1; + if (relevantEdgeCount <= 3) + { + relativeEdgeDifferenceThreshold = 0; + } + + foreach (TextEdge edge in relevantEdges) + { + if (textRow.IntersectsLine(edge.Line)) + { + numRelevantEdges++; + } + } + + // see if we have a candidate text row + if (numRelevantEdges >= (relevantEdgeCount - relativeEdgeDifferenceThreshold)) + { + // keep track of table row spacing + if (prevRow != null && firstTableRow != null) + { + tableSpaceCount++; + totalRowSpacing += prevRow.Bottom - textRow.Bottom; // bobld: textRow.Top - prevRow.Top + } + + // row is part of a table + if (table.Area == 0) + { + firstTableRow = textRow; + table.SetRect(textRow); + } + else + { + table.SetLeft(Math.Min(table.Left, textRow.Left)); + table.SetBottom(Math.Min(table.Bottom, textRow.Bottom)); // bobld: Max + table.SetRight(Math.Max(table.Right, textRow.Right)); + } + } + else + { + // no dice + // if we're at the end of the table, save the last row + if (firstTableRow != null && lastTableRow == null) + { + lastTableRow = prevRow; + } + } + + prevRow = textRow; + } + + // if we don't have a table now, we won't after the next step either + if (table.Area == 0) + { + return null; + } + + if (lastTableRow == null) + { + // takes care of one-row tables or tables that end at the bottom of a page + lastTableRow = prevRow; + } + + // use the average row height and nearby horizontal lines to extend the table area + double avgRowHeight; + if (tableSpaceCount > 0) + { + System.Diagnostics.Debug.Assert(totalRowSpacing >= 0); + avgRowHeight = totalRowSpacing / tableSpaceCount; + } + else + { + avgRowHeight = lastTableRow.Height; + } + + double rowHeightThreshold = avgRowHeight * 1.5; + + // check lines after the bottom of the table + //foreach (Ruling ruling in sortedHorizontalRulings) //Line2D.Float + for (int i = horizontalRulings.Count - 1; i >= 0; i--) // reverse order + { + var ruling = horizontalRulings[i]; + if (ruling.Y1 > table.Bottom) // bobld: < + { + continue; + } + + double distanceFromTable = table.Bottom - ruling.Y2; // bobld: Y1 + System.Diagnostics.Debug.Assert(distanceFromTable >= 0); + if (distanceFromTable <= rowHeightThreshold) + { + // use this ruling to help define the table + table.SetBottom(Math.Min(table.Bottom, ruling.Y2)); // bobld: Max Y1 + table.SetLeft(Math.Min(table.Left, ruling.X1)); + table.SetRight(Math.Max(table.Right, ruling.X2)); + } + else + { + // no use checking any further + break; + } + } + + // do the same for lines at the top, but make the threshold greater since table headings tend to be + // larger to fit up to three-ish rows of text (at least but we don't want to grab too much) + rowHeightThreshold = avgRowHeight * 3.8; + + //for (int i = horizontalRulings.Count - 1; i >= 0; i--) + for (int i = 0; i < horizontalRulings.Count; i++) + { + Ruling ruling = horizontalRulings[i]; + + if (ruling.Y1 < table.Top) //bobld: > + { + continue; + } + + double distanceFromTable = ruling.Y1 - table.Top; // bobld: table.Top - ruling.Y1 + System.Diagnostics.Debug.Assert(distanceFromTable >= 0); + if (distanceFromTable <= rowHeightThreshold) + { + table.SetTop(Math.Max(table.Top, ruling.Y2)); // bobld: Min Y1 + table.SetLeft(Math.Min(table.Left, ruling.X1)); + table.SetRight(Math.Max(table.Right, ruling.X2)); + } + else + { + break; + } + } + + // add a bit of padding since the halved horizontal lines are a little fuzzy anyways + table.SetTop(Math.Ceiling(table.Top) + TABLE_PADDING_AMOUNT); // bobld: Floor - + table.SetBottom(Math.Floor(table.Bottom) - TABLE_PADDING_AMOUNT); // bobld: Ceiling + + table.SetLeft(Math.Floor(table.Left) - TABLE_PADDING_AMOUNT); + table.SetRight(Math.Ceiling(table.Right) + TABLE_PADDING_AMOUNT); + + return table; + } + + private RelevantEdges getRelevantEdges(TextEdges textEdges, List lines) + { + List leftTextEdges = textEdges[TextEdge.LEFT]; + List midTextEdges = textEdges[TextEdge.MID]; + List rightTextEdges = textEdges[TextEdge.RIGHT]; + + // first we'll find the number of lines each type of edge crosses + int[][] edgeCountsPerLine = new int[lines.Count][]; + for (int i = 0; i < edgeCountsPerLine.Length; i++) + { + edgeCountsPerLine[i] = new int[TextEdge.NUM_TYPES]; + } + + foreach (TextEdge edge in leftTextEdges) + { + edgeCountsPerLine[edge.intersectingTextRowCount - 1][TextEdge.LEFT]++; + } + + foreach (TextEdge edge in midTextEdges) + { + edgeCountsPerLine[edge.intersectingTextRowCount - 1][TextEdge.MID]++; + } + + foreach (TextEdge edge in rightTextEdges) + { + edgeCountsPerLine[edge.intersectingTextRowCount - 1][TextEdge.RIGHT]++; + } + + // now let's find the relevant edge type and the number of those edges we should look for + // we'll only take a minimum of two edges to look for tables + int relevantEdgeType = -1; + int relevantEdgeCount = 0; + for (int i = edgeCountsPerLine.Length - 1; i > 2; i--) + { + if (edgeCountsPerLine[i][TextEdge.LEFT] > 2 && + edgeCountsPerLine[i][TextEdge.LEFT] >= edgeCountsPerLine[i][TextEdge.RIGHT] && + edgeCountsPerLine[i][TextEdge.LEFT] >= edgeCountsPerLine[i][TextEdge.MID]) + { + relevantEdgeCount = edgeCountsPerLine[i][TextEdge.LEFT]; + relevantEdgeType = TextEdge.LEFT; + break; + } + + if (edgeCountsPerLine[i][TextEdge.RIGHT] > 1 && + edgeCountsPerLine[i][TextEdge.RIGHT] >= edgeCountsPerLine[i][TextEdge.LEFT] && + edgeCountsPerLine[i][TextEdge.RIGHT] >= edgeCountsPerLine[i][TextEdge.MID]) + { + relevantEdgeCount = edgeCountsPerLine[i][TextEdge.RIGHT]; + relevantEdgeType = TextEdge.RIGHT; + break; + } + + if (edgeCountsPerLine[i][TextEdge.MID] > 1 && + edgeCountsPerLine[i][TextEdge.MID] >= edgeCountsPerLine[i][TextEdge.RIGHT] && + edgeCountsPerLine[i][TextEdge.MID] >= edgeCountsPerLine[i][TextEdge.LEFT]) + { + relevantEdgeCount = edgeCountsPerLine[i][TextEdge.MID]; + relevantEdgeType = TextEdge.MID; + break; + } + } + + return new RelevantEdges(relevantEdgeType, relevantEdgeCount); + } + + private TextEdges getTextEdges(List lines) + { + // get all text edges (lines that align with the left, middle and right of chunks of text) that extend + // uninterrupted over at least REQUIRED_TEXT_LINES_FOR_EDGE lines of text + + List leftTextEdges = new List(); + List midTextEdges = new List(); + List rightTextEdges = new List(); + + Dictionary> currLeftEdges = new Dictionary>(); + Dictionary> currMidEdges = new Dictionary>(); + Dictionary> currRightEdges = new Dictionary>(); + + foreach (TableLine textRow in lines) + { + foreach (TextChunk text in textRow.TextElements) + { + if (text.GetText().Equals("")) continue; // added by bobld + + int left = (int)Math.Floor(text.Left); + int right = (int)Math.Floor(text.Right); + int mid = (int)(left + ((right - left) / 2)); + + // first put this chunk into any edge buckets it belongs to + if (!currLeftEdges.TryGetValue(left, out List leftEdge)) + { + leftEdge = new List(); + currLeftEdges[left] = leftEdge; + } + leftEdge.Add(text); + + if (!currMidEdges.TryGetValue(mid, out List midEdge)) + { + midEdge = new List(); + currMidEdges[mid] = midEdge; + } + midEdge.Add(text); + + if (!currRightEdges.TryGetValue(right, out List rightEdge)) + { + rightEdge = new List(); + currRightEdges[right] = rightEdge; + } + rightEdge.Add(text); + + // now see if this text chunk blows up any other edges + //for (Iterator>> iterator = currLeftEdges.entrySet().iterator(); iterator.hasNext();) + foreach (var entry in currLeftEdges.ToList()) // use tolist to be able to remove + { + int key = entry.Key; + if (key > left && key < right) + { + currLeftEdges.Remove(key); + List edgeChunks = entry.Value; + if (edgeChunks.Count >= REQUIRED_TEXT_LINES_FOR_EDGE) + { + TextChunk first = edgeChunks[0]; + TextChunk last = edgeChunks[edgeChunks.Count - 1]; + + TextEdge edge = new TextEdge(key, last.Bottom, key, first.Top); // bobld: (key, first.Top, key, last.Bottom) + edge.intersectingTextRowCount = Math.Min(edgeChunks.Count, lines.Count); + + leftTextEdges.Add(edge); + } + } + } + + //for (Iterator>> iterator = currMidEdges.entrySet().iterator(); iterator.hasNext();) + foreach (var entry in currMidEdges.ToList()) + { + int key = entry.Key; + if (key > left && key < right && Math.Abs(key - mid) > 2) + { + currMidEdges.Remove(key); + List edgeChunks = entry.Value; + if (edgeChunks.Count >= REQUIRED_TEXT_LINES_FOR_EDGE) + { + TextChunk first = edgeChunks[0]; + TextChunk last = edgeChunks[edgeChunks.Count - 1]; + + TextEdge edge = new TextEdge(key, last.Bottom, key, first.Top); // bobld: (key, first.Top, key, last.Bottom) + edge.intersectingTextRowCount = Math.Min(edgeChunks.Count, lines.Count); + + midTextEdges.Add(edge); + } + } + } + + //for (Iterator>> iterator = currRightEdges.entrySet().iterator(); iterator.hasNext();) + foreach (var entry in currRightEdges.ToList()) + { + int key = entry.Key; + if (key > left && key < right) + { + currRightEdges.Remove(key); + List edgeChunks = entry.Value; + if (edgeChunks.Count >= REQUIRED_TEXT_LINES_FOR_EDGE) + { + TextChunk first = edgeChunks[0]; + TextChunk last = edgeChunks[edgeChunks.Count - 1]; + + TextEdge edge = new TextEdge(key, last.Bottom, key, first.Top); // bobld: (key, first.Top, key, last.Bottom) + edge.intersectingTextRowCount = Math.Min(edgeChunks.Count, lines.Count); + + rightTextEdges.Add(edge); + } + } + } + } + } + + // add the leftovers + foreach (int key in currLeftEdges.Keys) + { + List edgeChunks = currLeftEdges[key]; + if (edgeChunks.Count >= REQUIRED_TEXT_LINES_FOR_EDGE) + { + TextChunk first = edgeChunks[0]; + TextChunk last = edgeChunks[edgeChunks.Count - 1]; + + TextEdge edge = new TextEdge(key, last.Bottom, key, first.Top); // bobld: (key, first.Top, key, last.Bottom) + edge.intersectingTextRowCount = Math.Min(edgeChunks.Count, lines.Count); + + leftTextEdges.Add(edge); + } + } + + foreach (int key in currMidEdges.Keys) + { + List edgeChunks = currMidEdges[key]; + if (edgeChunks.Count >= REQUIRED_TEXT_LINES_FOR_EDGE) + { + TextChunk first = edgeChunks[0]; + TextChunk last = edgeChunks[edgeChunks.Count - 1]; + + TextEdge edge = new TextEdge(key, last.Bottom, key, first.Top); // bobld: (key, first.Top, key, last.Bottom); + edge.intersectingTextRowCount = Math.Min(edgeChunks.Count, lines.Count); + + midTextEdges.Add(edge); + } + } + + foreach (int key in currRightEdges.Keys) + { + List edgeChunks = currRightEdges[key]; + if (edgeChunks.Count >= REQUIRED_TEXT_LINES_FOR_EDGE) + { + TextChunk first = edgeChunks[0]; + TextChunk last = edgeChunks[edgeChunks.Count - 1]; + + TextEdge edge = new TextEdge(key, last.Bottom, key, first.Top); // bobld: (key, first.Top, key, last.Bottom) + edge.intersectingTextRowCount = Math.Min(edgeChunks.Count, lines.Count); + + rightTextEdges.Add(edge); + } + } + + return new TextEdges(leftTextEdges, midTextEdges, rightTextEdges); + } + + private List getTableAreasFromCells(List cells) + { + List> cellGroups = new List>(); + foreach (TableRectangle cell in cells) + { + bool addedToGroup = false; + + foreach (List cellGroup in cellGroups) + { + foreach (TableRectangle groupCell in cellGroup) + { + PdfPoint[] groupCellCorners = groupCell.Points; + PdfPoint[] candidateCorners = cell.Points; + + for (int i = 0; i < candidateCorners.Length; i++) + { + for (int j = 0; j < groupCellCorners.Length; j++) + { + //if (candidateCorners[i].distance(groupCellCorners[j]) < CELL_CORNER_DISTANCE_MAXIMUM) + if (Distances.Euclidean(candidateCorners[i], groupCellCorners[j]) < CELL_CORNER_DISTANCE_MAXIMUM) + { + cellGroup.Add(cell); + addedToGroup = true; + goto cellCheck; + } + } + } + } + } + + cellCheck: + if (!addedToGroup) + { + List cellGroup = new List(); + cellGroup.Add(cell); + cellGroups.Add(cellGroup); + } + } + + // create table areas based on cell group + List tableAreas = new List(); + foreach (List cellGroup in cellGroups) + { + // less than four cells should not make a table + if (cellGroup.Count < REQUIRED_CELLS_FOR_TABLE) + { + continue; + } + + double top = double.MinValue; // bobld: MaxValue + double left = double.MaxValue; + double bottom = double.MaxValue; // bobld: MinValue + double right = double.MinValue; + + foreach (TableRectangle cell in cellGroup) + { + if (cell.Top > top) top = cell.Top; // bobld: < + if (cell.Left < left) left = cell.Left; + if (cell.Bottom < bottom) bottom = cell.Bottom; // bobld: > + if (cell.Right > right) right = cell.Right; + } + + tableAreas.Add(new TableRectangle(new PdfRectangle(left, bottom, right, top))); + } + + return tableAreas; + } + + private List getHorizontalRulings(IReadOnlyList rulings) + { + List horizontalR = new List(); + foreach (Ruling r in rulings) + { + if (r.IsHorizontal) + { + horizontalR.Add(r); + } + } + + List horizontalRulings = new List(); + foreach (var r in horizontalR) + { + var endX = r.Right + 1; + var startY = r.Left - 1; + if (endX - startY > HORIZONTAL_EDGE_WIDTH_MINIMUM) + { + horizontalRulings.Add(new Ruling(new PdfPoint(startY, r.Bottom), new PdfPoint(endX, r.Top))); + } + } + + return horizontalRulings; + } + + private List getVerticalRulings(IReadOnlyList rulings) + { + List verticalR = new List(); + foreach (Ruling r in rulings) + { + if (r.IsVertical) + { + verticalR.Add(r); + } + } + + List verticalRulings = new List(); + foreach (var r in verticalR) + { + var endY = r.Top + 1; + var startY = r.Bottom - 1; + if (endY - startY > VERTICAL_EDGE_HEIGHT_MINIMUM) + { + verticalRulings.Add(new Ruling(new PdfPoint(r.Left, startY), new PdfPoint(r.Right, endY))); + } + } + return verticalRulings; + } + } +} diff --git a/Tabula/Detectors/SpreadsheetDetectionAlgorithm.cs b/Tabula/Detectors/SpreadsheetDetectionAlgorithm.cs index 77e08ae..722f339 100644 --- a/Tabula/Detectors/SpreadsheetDetectionAlgorithm.cs +++ b/Tabula/Detectors/SpreadsheetDetectionAlgorithm.cs @@ -24,13 +24,10 @@ public List Detect(PageArea page) { List cells = SpreadsheetExtractionAlgorithm.FindCells(page.HorizontalRulings, page.VerticalRulings); - //SpreadsheetExtractionAlgorithm sea = new SpreadsheetExtractionAlgorithm(); - List tables = SpreadsheetExtractionAlgorithm.FindSpreadsheetsFromCells(cells.Cast().ToList()); // we want tables to be returned from top to bottom on the page - //Collections.sort(tables, TableRectangle.ILL_DEFINED_ORDER); - Utils.Sort(tables, new TableRectangle.ILL_DEFINED_ORDER()); // tables.Sort(new TableRectangle.ILL_DEFINED_ORDER()); + Utils.Sort(tables, new TableRectangle.ILL_DEFINED_ORDER()); return tables; } } diff --git a/Tabula/PageArea.cs b/Tabula/PageArea.cs index 4c6e604..36cc4b9 100644 --- a/Tabula/PageArea.cs +++ b/Tabula/PageArea.cs @@ -306,7 +306,7 @@ public IReadOnlyList GetRulings() /// public void AddRuling(Ruling r) { - if (r.Oblique) + if (r.IsOblique) { throw new InvalidOperationException("Can't add an oblique ruling"); } diff --git a/Tabula/Ruling.cs b/Tabula/Ruling.cs index bc8b535..b1a32ac 100644 --- a/Tabula/Ruling.cs +++ b/Tabula/Ruling.cs @@ -85,7 +85,7 @@ public void Normalize() /// /// Is the blique? Neither vertical nor horizontal. /// - public bool Oblique => !(this.IsVertical || this.IsHorizontal); + public bool IsOblique => !(this.IsVertical || this.IsHorizontal); /// /// attributes that make sense only for non-oblique lines @@ -95,7 +95,7 @@ public double Position { get { - if (this.Oblique) + if (this.IsOblique) { throw new InvalidOperationException(); } @@ -106,7 +106,7 @@ public double Position public void SetPosition(float v) { - if (this.Oblique) + if (this.IsOblique) { throw new InvalidOperationException(); } @@ -127,7 +127,7 @@ public double Start { get { - if (this.Oblique) + if (this.IsOblique) { throw new InvalidOperationException(); } @@ -138,7 +138,7 @@ public double Start public void SetStart(double v) { - if (this.Oblique) + if (this.IsOblique) { throw new InvalidOperationException(); } @@ -157,7 +157,7 @@ public double End { get { - if (this.Oblique) + if (this.IsOblique) { throw new InvalidOperationException(); } @@ -168,7 +168,7 @@ public double End public void SetEnd(double v) { - if (this.Oblique) + if (this.IsOblique) { throw new InvalidOperationException(); } @@ -185,7 +185,7 @@ public void SetEnd(double v) private void SetStartEnd(double start, double end) { - if (this.Oblique) + if (this.IsOblique) { throw new InvalidOperationException(); } @@ -599,7 +599,7 @@ public static List CollapseOrientedRulings(List lines, int expan double newEnd = lastFlipped ? Math.Min(nextE, lastEnd) : Math.Max(nextE, lastEnd); last.SetStartEnd(newStart, newEnd); - Debug.Assert(!last.Oblique); + Debug.Assert(!last.IsOblique); } else if (next_line.Length == 0) { diff --git a/Tabula/TableRectangle.cs b/Tabula/TableRectangle.cs index b59d46e..98f86c3 100644 --- a/Tabula/TableRectangle.cs +++ b/Tabula/TableRectangle.cs @@ -101,8 +101,8 @@ public virtual int IsLtrDominant() public double VerticalOverlap(TableRectangle other) { - return Math.Max(0, Math.Min(this.BoundingBox.Top, other.BoundingBox.Top) - - Math.Max(this.BoundingBox.Bottom, other.BoundingBox.Bottom)); + return Math.Max(0, Math.Min(this.BoundingBox.TopLeft.Y, other.BoundingBox.TopLeft.Y) + - Math.Max(this.BoundingBox.BottomLeft.Y, other.BoundingBox.BottomLeft.Y)); } public bool VerticallyOverlaps(TableRectangle other) @@ -122,8 +122,8 @@ public bool HorizontallyOverlaps(TableRectangle other) public double VerticalOverlapRatio(TableRectangle other) { - double delta = Math.Min(this.BoundingBox.Top - this.BoundingBox.Bottom, - other.BoundingBox.Top - other.BoundingBox.Bottom); + double delta = Math.Min(this.BoundingBox.TopLeft.Y - this.BoundingBox.BottomLeft.Y, + other.BoundingBox.TopLeft.Y - other.BoundingBox.BottomLeft.Y); var overl = VerticalOverlap(other); return overl / delta; } @@ -146,7 +146,7 @@ public TableRectangle Merge(TableRectangle other) /// /// Get the 's top coordinate. /// - public double Top => BoundingBox.Top; + public double Top => BoundingBox.TopRight.Y; //.Top; /// /// Set the 's top coordinate. @@ -163,7 +163,7 @@ public void SetTop(double top) /// /// Get the 's right coordinate. /// - public double Right => BoundingBox.Right; + public double Right => BoundingBox.TopRight.X; //.Right; /// /// Set the 's right coordinate. @@ -178,7 +178,7 @@ public void SetRight(double right) /// /// Get the 's left coordinate. /// - public double Left => BoundingBox.Left; + public double Left => BoundingBox.BottomLeft.X; //.Left; /// /// Set the 's left coordinate. @@ -194,7 +194,7 @@ public void SetLeft(double left) /// /// Get the 's bottom coordinate. /// - public double Bottom => BoundingBox.Bottom; + public double Bottom => BoundingBox.BottomLeft.Y; //.Bottom; /// /// Set the 's bottom coordinate. @@ -270,10 +270,20 @@ public bool IntersectsLine(Ruling ruling) return IntersectsLine(ruling.Line); } + + /// + /// hack to include border + /// + /// + private PdfRectangle Expand(PdfRectangle rectangle) + { + return new PdfRectangle(rectangle.Left - 1, rectangle.Bottom - 1, rectangle.Right + 1, rectangle.Top + 1); + } + public bool IntersectsLine(PdfLine line) { var clipper = new Clipper(); - clipper.AddPath(Clipper.ToClipperIntPoints(this.BoundingBox), PolyType.ptClip, true); + clipper.AddPath(Clipper.ToClipperIntPoints(Expand(this.BoundingBox)), PolyType.ptClip, true); clipper.AddPath(Clipper.ToClipperIntPoints(line), PolyType.ptSubject, false); diff --git a/Tabula/Tabula.csproj b/Tabula/Tabula.csproj index fb4602c..7e4fc80 100644 --- a/Tabula/Tabula.csproj +++ b/Tabula/Tabula.csproj @@ -6,7 +6,9 @@ https://github.com/BobLd/tabula-sharp 0.1.0.0 0.1.0.0 - 0.1.0-alpha001 + 0.1.0-alpha002 + BobLd + BobLd diff --git a/Tabula/Tabula.xml b/Tabula/Tabula.xml index aa6043b..aaffd3c 100644 --- a/Tabula/Tabula.xml +++ b/Tabula/Tabula.xml @@ -63,21 +63,39 @@ Nurminen detection algorithm. - + + + Simplified Nurminen detection algorithm. + Does not do any image processing. + + + Helper class that encapsulates a text edge - + Helper container for all text edges on a page - + Helper container for relevant text edge info + + + Simplified Nurminen detection algorithm. + Does not do any image processing. + + + + + Detects the tables in the page. + + + This is the basic spreadsheet table detection algorithm currently implemented in tabula (web). @@ -437,7 +455,7 @@ Is the horizontal? - + Is the blique? Neither vertical nor horizontal. @@ -660,6 +678,12 @@ Counter-clockwise, starting from bottom left point. + + + hack to include border + + + The 's top-left X coordinate. @@ -848,6 +872,18 @@ + + + Create a TextStripper for the given page. + + + + + + + Process the page. + + Unicode extensions. diff --git a/Tabula/TextStripper.cs b/Tabula/TextStripper.cs index 9d88e7b..37555e6 100644 --- a/Tabula/TextStripper.cs +++ b/Tabula/TextStripper.cs @@ -22,12 +22,20 @@ public class TextStripper public double totalHeight; public int countHeight; + /// + /// Create a TextStripper for the given page. + /// + /// + /// public TextStripper(PdfDocument document, int pageNumber) { this.document = document; this.pageNumber = pageNumber; } + /// + /// Process the page. + /// public void Process() { var page = document.GetPage(pageNumber); @@ -48,14 +56,16 @@ public void Process() double wos = GetExpectedWhitespaceSize(letter); //textPosition.getWidthOfSpace(); - TextElement te = new TextElement(GetBbox(letter), letter.Font, letter.PointSize, c, wos, letter.GlyphRectangle.Rotation); // Rotation->The direction of the text(0, 90, 180, or 270) - te.letter = letter; + TextElement te = new TextElement(GetBbox(letter), letter.Font, letter.PointSize, c, wos, letter.GlyphRectangle.Rotation) + { + letter = letter + }; - this.minCharWidth = Math.Min(this.minCharWidth, te.Width); - this.minCharHeight = Math.Min(this.minCharHeight, te.Height); + if (!string.IsNullOrWhiteSpace(c)) this.minCharWidth = Math.Min(this.minCharWidth, te.Width); + if (!string.IsNullOrWhiteSpace(c)) this.minCharHeight = Math.Min(this.minCharHeight, Math.Max(te.Height, 1)); // added by bobld: min height value to 1 countHeight++; - totalHeight += te.Height; + totalHeight += Math.Max(te.Height, 1); // added by bobld: min height value to 1 double avgHeight = totalHeight / countHeight; if (avgHeight > 0 && te.Height >= (avgHeight * AVG_HEIGHT_MULT_THRESHOLD) && (te.GetText()?.Trim().Equals("") != false))