diff --git a/Tabula.Csv/Tabula.Csv.csproj b/Tabula.Csv/Tabula.Csv.csproj
index d39a5c8..061fe0e 100644
--- a/Tabula.Csv/Tabula.Csv.csproj
+++ b/Tabula.Csv/Tabula.Csv.csproj
@@ -4,7 +4,8 @@
netcoreapp3.1;netstandard2.0;net45;net451;net452;net46;net461;net462;net47Extract tables from PDF files (port of tabula-java using PdfPig). Csv and Tsv writers.https://github.com/BobLd/tabula-sharp
- 0.1.0-alpha001
+ 0.1.0-alpha002
+ BobLd
diff --git a/Tabula.Json/Tabula.Json.csproj b/Tabula.Json/Tabula.Json.csproj
index 2cd4422..d7344e3 100644
--- a/Tabula.Json/Tabula.Json.csproj
+++ b/Tabula.Json/Tabula.Json.csproj
@@ -4,7 +4,9 @@
netcoreapp3.1;netstandard2.0;net45;net451;net452;net46;net461;net462;net47Extract tables from PDF files (port of tabula-java using PdfPig). Json writer.https://github.com/BobLd/tabula-sharp
- 0.1.0-alpha001
+ 0.1.0-alpha002
+ BobLd
+ BobLd
diff --git a/Tabula.Tests/TestsNurminenDetector.cs b/Tabula.Tests/TestsNurminenDetector.cs
new file mode 100644
index 0000000..26dca81
--- /dev/null
+++ b/Tabula.Tests/TestsNurminenDetector.cs
@@ -0,0 +1,33 @@
+using System;
+using System.Collections.Generic;
+using System.Text;
+using Tabula.Detectors;
+using Tabula.Extractors;
+using UglyToad.PdfPig;
+using Xunit;
+
+namespace Tabula.Tests
+{
+ public class TestsNurminenDetector
+ {
+ [Fact(Skip = "TO DO")]
+ public void TestLinesToCells()
+ {
+ using (PdfDocument document = PdfDocument.Open(@"test3.pdf", new ParsingOptions() { ClipPaths = true }))
+ {
+ ObjectExtractor oe = new ObjectExtractor(document);
+ PageArea page = oe.Extract(1);
+
+ SimpleNurminenDetectionAlgorithm detector = new SimpleNurminenDetectionAlgorithm();
+ var regions = detector.Detect(page);
+
+ foreach (var a in regions)
+ {
+ IExtractionAlgorithm ea = new BasicExtractionAlgorithm();
+ var newArea = page.GetArea(a.BoundingBox);
+ List
tables = ea.Extract(newArea);
+ }
+ }
+ }
+ }
+}
diff --git a/Tabula/Detectors/NurminenDetectionAlgorithm.cs b/Tabula/Detectors/NurminenDetectionAlgorithm.cs
index 0b28550..6356321 100644
--- a/Tabula/Detectors/NurminenDetectionAlgorithm.cs
+++ b/Tabula/Detectors/NurminenDetectionAlgorithm.cs
@@ -1,10 +1,5 @@
using System;
using System.Collections.Generic;
-using System.Linq;
-using UglyToad.PdfPig;
-using UglyToad.PdfPig.Content;
-using UglyToad.PdfPig.Core;
-using UglyToad.PdfPig.DocumentLayoutAnalysis;
namespace Tabula.Detectors
{
@@ -22,1007 +17,9 @@ namespace Tabula.Detectors
///
public class NurminenDetectionAlgorithm : IDetectionAlgorithm
{
- private static int GRAYSCALE_INTENSITY_THRESHOLD = 25;
- private static int HORIZONTAL_EDGE_WIDTH_MINIMUM = 50;
- private static int VERTICAL_EDGE_HEIGHT_MINIMUM = 10;
- private static int CELL_CORNER_DISTANCE_MAXIMUM = 10;
- private static float POINT_SNAP_DISTANCE_THRESHOLD = 8f;
- private static float TABLE_PADDING_AMOUNT = 1.0f;
- private static int REQUIRED_TEXT_LINES_FOR_EDGE = 4;
- private static int REQUIRED_CELLS_FOR_TABLE = 4;
- private static float IDENTICAL_TABLE_OVERLAP_RATIO = 0.9f;
-
- ///
- /// Helper class that encapsulates a text edge
- ///
- private class TextEdge // static
- {
- public readonly PdfLine line;
- // types of text edges
- public const int LEFT = 0;
- public const int MID = 1;
- public const int RIGHT = 2;
- public const int NUM_TYPES = 3;
-
- public int intersectingTextRowCount;
-
- public TextEdge(double x1, double y1, double x2, double y2)
- {
- this.line = new PdfLine(x1, y1, x2, y2); // bobld: careful with order here
- //super(x1, y1, x2, y2);
- this.intersectingTextRowCount = 0;
- }
- }
-
- ///
- /// Helper container for all text edges on a page
- ///
- private class TextEdges : List> // ArrayList> // static
- {
- public TextEdges(List leftEdges, List midEdges, List rightEdges)
- : base(3)
- {
- //super(3);
- this.Add(leftEdges);
- this.Add(midEdges);
- this.Add(rightEdges);
- }
- }
-
- ///
- /// Helper container for relevant text edge info
- ///
- private class RelevantEdges // static
- {
- public int edgeType;
- public int edgeCount;
-
- public RelevantEdges(int edgeType, int edgeCount)
- {
- this.edgeType = edgeType;
- this.edgeCount = edgeCount;
- }
- }
-
public List Detect(PageArea page)
{
throw new NotImplementedException();
-
- /*
- // get horizontal & vertical lines
- // we get these from an image of the PDF and not the PDF itself because sometimes there are invisible PDF
- // instructions that are interpreted incorrectly as visible elements - we really want to capture what a
- // person sees when they look at the PDF
- BufferedImage image;
- Page pdfPage = page.getPDPage();
- try
- {
- image = Utils.pageConvertToImage(page.getPDDoc(), pdfPage, 144, ImageType.GRAY);
- }
- catch (IOException e)
- {
- return new List(); //ArrayList<>();
- }
-
- List horizontalRulings = this.getHorizontalRulings(image);
-
- // now check the page for vertical lines, but remove the text first to make things less confusing
- PdfDocument removeTextDocument = null;
- try
- {
- removeTextDocument = this.removeText(pdfPage);
- pdfPage = removeTextDocument.GetPage(1); //.getPage(0);
- image = Utils.pageConvertToImage(removeTextDocument, pdfPage, 144); //, ImageType.GRAY);
- }
- catch (Exception e)
- {
- return new List(); //ArrayList<>();
- }
- finally
- {
- if (removeTextDocument != null)
- {
- try
- {
- removeTextDocument.Dispose(); //.close();
- }
- catch (IOException e)
- {
- // TODO Auto-generated catch block
- // e.printStackTrace();
- }
- }
- }
-
- List verticalRulings = this.getVerticalRulings(image);
-
- List allEdges = new List(horizontalRulings);
- allEdges.AddRange(verticalRulings);
-
- List tableAreas = new List();
-
- // if we found some edges, try to find some tables based on them
- if (allEdges.Count > 0)
- {
- // now we need to snap edge endpoints to a grid
- Utils.snapPoints(allEdges, POINT_SNAP_DISTANCE_THRESHOLD, POINT_SNAP_DISTANCE_THRESHOLD);
-
- // normalize the rulings to make sure snapping didn't create any wacky non-horizontal/vertical rulings
- foreach (List rulings in new[] { horizontalRulings, verticalRulings }) // Arrays.asList(horizontalRulings, verticalRulings))
- {
- //foreach (Iterator iterator = rulings.iterator(); iterator.hasNext();)
- foreach (var ruling in rulings.ToList()) // ToList() to do a copy to allow remove in original
- {
- //Ruling ruling = iterator.next();
-
- ruling.normalize();
- if (ruling.oblique())
- {
- rulings.Remove(ruling); //iterator.remove();
- }
- }
- }
-
- // merge the edge lines into rulings - this makes finding edges between crossing points in the next step easier
- // we use a larger pixel expansion than the normal spreadsheet extraction method to cover gaps in the
- // edge detection/pixel snapping steps
- horizontalRulings = Ruling.collapseOrientedRulings(horizontalRulings, 5);
- verticalRulings = Ruling.collapseOrientedRulings(verticalRulings, 5);
-
- // use the rulings and points to find cells
- var cells = SpreadsheetExtractionAlgorithm.findCells(horizontalRulings, verticalRulings); // List
-
- // then use those cells to make table areas
- tableAreas = this.getTableAreasFromCells(cells.Cast().ToList());
- }
-
- // next find any vertical rulings that intersect tables - sometimes these won't have completely been captured as
- // cells if there are missing horizontal lines (which there often are)
- // let's assume though that these lines should be part of the table
- foreach (Ruling verticalRuling in verticalRulings) // Line2D.Float
- {
- foreach (TableRectangle tableArea in tableAreas)
- {
- if (verticalRuling.intersects(tableArea) && !(tableArea.contains(verticalRuling.getP1()) && tableArea.contains(verticalRuling.getP2())))
- {
- tableArea.setTop((float)Math.Floor(Math.Max(tableArea.getTop(), verticalRuling.getY1()))); // min
- tableArea.setBottom((float)Math.Ceiling(Math.Min(tableArea.getBottom(), verticalRuling.getY2()))); // max
- break;
- }
- }
- }
-
- // the tabula Page coordinate space is half the size of the PDFBox image coordinate space
- // so halve the table area size before proceeding and add a bit of padding to make sure we capture everything
- foreach (TableRectangle area in tableAreas)
- {
- area.x = (float)Math.Floor(area.x / 2) - TABLE_PADDING_AMOUNT;
- area.y = (float)Math.Floor(area.y / 2) - TABLE_PADDING_AMOUNT;
- area.width = (float)Math.Ceiling(area.width / 2) + TABLE_PADDING_AMOUNT;
- area.height = (float)Math.Ceiling(area.height / 2) + TABLE_PADDING_AMOUNT;
- }
-
- // we're going to want halved horizontal lines later too
- foreach (Ruling ruling in horizontalRulings) // Line2D.Float
- {
- ruling.x1 = ruling.x1 / 2;
- ruling.y1 = ruling.y1 / 2;
- ruling.x2 = ruling.x2 / 2;
- ruling.y2 = ruling.y2 / 2;
- }
-
- // now look at text rows to help us find more tables and flesh out existing ones
- List textChunks = TextElement.mergeWords(page.getText());
- List lines = TextChunk.groupByLines(textChunks);
-
- // first look for text rows that intersect an existing table - those lines should probably be part of the table
- foreach (TableLine textRow in lines)
- {
- foreach (TableRectangle tableArea in tableAreas)
- {
- if (!tableArea.contains(textRow) && textRow.intersects(tableArea))
- {
- tableArea.setLeft((float)Math.Floor(Math.Min(textRow.getLeft(), tableArea.getLeft())));
- tableArea.setRight((float)Math.Ceiling(Math.Max(textRow.getRight(), tableArea.getRight())));
- }
- }
- }
-
- // get rid of tables that DO NOT intersect any text areas - these are likely graphs or some sort of graphic
- //for (Iterator iterator = tableAreas.iterator(); iterator.hasNext();)
- foreach (var table in tableAreas.ToList()) // ToList() to do a copy to allow remove in original
- {
- //TableRectangle table = iterator.next();
-
- bool intersectsText = false;
- foreach (TableLine textRow in lines)
- {
- if (table.intersects(textRow))
- {
- intersectsText = true;
- break;
- }
- }
-
- if (!intersectsText)
- {
- //iterator.remove();
- tableAreas.Remove(table);
- }
- }
-
- // lastly, there may be some tables that don't have any vertical rulings at all
- // we'll use text edges we've found to try and guess which text rows are part of a table
-
- // in his thesis nurminen goes through every row to try to assign a probability that the line is in a table
- // we're going to try a general heuristic instead, trying to find what type of edge (left/right/mid) intersects
- // the most text rows, and then use that magic number of "relevant" edges to decide what text rows should be
- // part of a table.
-
- bool foundTable;
-
- do
- {
- foundTable = false;
-
- // get rid of any text lines contained within existing tables, this allows us to find more tables
- //for (Iterator iterator = lines.iterator(); iterator.hasNext();)
- foreach (var textRow in lines)
- {
- //TableLine textRow = iterator.next();
- foreach (TableRectangle table in tableAreas.ToList()) // ToList() to do a copy to allow remove in original
- {
- if (table.contains(textRow))
- {
- //iterator.remove();
- lines.Remove(textRow);
- break;
- }
- }
- }
-
- // get text edges from remaining lines in the document
- TextEdges textEdges = this.getTextEdges(lines);
- List leftTextEdges = textEdges.get(TextEdge.LEFT);
- List midTextEdges = textEdges.get(TextEdge.MID);
- List rightTextEdges = textEdges.get(TextEdge.RIGHT);
-
- // find the relevant text edges (the ones we think define where a table is)
- RelevantEdges relevantEdgeInfo = this.getRelevantEdges(textEdges, lines);
-
- // we found something relevant so let's look for rows that fit our criteria
- if (relevantEdgeInfo.edgeType != -1)
- {
- List relevantEdges = null;
- switch (relevantEdgeInfo.edgeType)
- {
- case TextEdge.LEFT:
- relevantEdges = leftTextEdges;
- break;
- case TextEdge.MID:
- relevantEdges = midTextEdges;
- break;
- case TextEdge.RIGHT:
- relevantEdges = rightTextEdges;
- break;
- }
-
- TableRectangle table = this.getTableFromText(lines, relevantEdges, relevantEdgeInfo.edgeCount, horizontalRulings);
-
- if (table != null)
- {
- foundTable = true;
- tableAreas.Add(table);
- }
- }
- } while (foundTable);
-
- // create a set of our current tables that will eliminate duplicate tables
- // Set tableSet = new TreeSet<>(new Comparator() {
- // not sure if works with sorted set??
- SortedSet tableSet = new SortedSet(new TreeSetRectangleComparer());
-
- //tableSet.addAll(tableAreas);
- foreach (var ta in tableAreas)
- {
- tableSet.Add(ta);
- }
-
- return new List(tableSet); //ArrayList<>(tableSet);
- */
- }
-
- public class TreeSetRectangleComparer : IComparer
- {
- public int Compare(TableRectangle o1, TableRectangle o2)
- {
- if (o1.Equals(o2))
- {
- return 0;
- }
-
- // o1 is "equal" to o2 if o2 contains all of o1
- if (o2.Contains(o1))
- {
- return 0;
- }
-
- if (o1.Contains(o2))
- {
- return 0;
- }
-
- // otherwise see if these tables are "mostly" the same
- double overlap = o1.OverlapRatio(o2);
- if (overlap >= IDENTICAL_TABLE_OVERLAP_RATIO)
- {
- return 0;
- }
- else
- {
- return 1;
- }
- }
- }
-
- private TableRectangle getTableFromText(List lines,
- List relevantEdges,
- int relevantEdgeCount,
- List horizontalRulings)
- {
-
- TableRectangle table = new TableRectangle();
-
- TableLine prevRow = null;
- TableLine firstTableRow = null;
- TableLine lastTableRow = null;
-
- int tableSpaceCount = 0;
- double totalRowSpacing = 0;
-
- // go through the lines and find the ones that have the correct count of the relevant edges
- foreach (TableLine textRow in lines)
- {
- int numRelevantEdges = 0;
-
- if (firstTableRow != null && tableSpaceCount > 0)
- {
- // check to make sure this text row is within a line or so of the other lines already added
- // if it's not, we should stop the table here
- double tableLineThreshold = (totalRowSpacing / tableSpaceCount) * 2.5;
- double lineDistance = textRow.Top - prevRow.Top;
-
- if (lineDistance > tableLineThreshold)
- {
- lastTableRow = prevRow;
- break;
- }
- }
-
- // for larger tables, be a little lenient on the number of relevant rows the text intersects
- // for smaller tables, not so much - otherwise we'll end up treating paragraphs as tables too
- int relativeEdgeDifferenceThreshold = 1;
- if (relevantEdgeCount <= 3)
- {
- relativeEdgeDifferenceThreshold = 0;
- }
-
- foreach (TextEdge edge in relevantEdges)
- {
- if (textRow.IntersectsLine(edge.line))
- {
- numRelevantEdges++;
- }
- }
-
- // see if we have a candidate text row
- if (numRelevantEdges >= (relevantEdgeCount - relativeEdgeDifferenceThreshold))
- {
- // keep track of table row spacing
- if (prevRow != null && firstTableRow != null)
- {
- tableSpaceCount++;
- totalRowSpacing += textRow.Top - prevRow.Top;
- }
-
- // row is part of a table
- if (table.Area == 0)
- {
- firstTableRow = textRow;
- table.SetRect(textRow);
- }
- else
- {
- table.SetLeft(Math.Min(table.Left, textRow.Left));
- table.SetBottom(Math.Min(table.Bottom, textRow.Bottom)); // max
- table.SetRight(Math.Max(table.Right, textRow.Right));
- }
- }
- else
- {
- // no dice
- // if we're at the end of the table, save the last row
- if (firstTableRow != null && lastTableRow == null)
- {
- lastTableRow = prevRow;
- }
- }
-
- prevRow = textRow;
- }
-
- // if we don't have a table now, we won't after the next step either
- if (table.Area == 0)
- {
- return null;
- }
-
- if (lastTableRow == null)
- {
- // takes care of one-row tables or tables that end at the bottom of a page
- lastTableRow = prevRow;
- }
-
- // use the average row height and nearby horizontal lines to extend the table area
- double avgRowHeight;
- if (tableSpaceCount > 0)
- {
- avgRowHeight = totalRowSpacing / tableSpaceCount;
- }
- else
- {
- avgRowHeight = lastTableRow.Height;
- }
-
- double rowHeightThreshold = avgRowHeight * 1.5;
-
- // check lines after the bottom of the table
- foreach (Ruling ruling in horizontalRulings) // Line2D.Float
- {
-
- if (ruling.Y1 < table.Bottom)
- {
- continue;
- }
-
- double distanceFromTable = ruling.Y1 - table.Bottom;
- if (distanceFromTable <= rowHeightThreshold)
- {
- // use this ruling to help define the table
- table.SetBottom(Math.Min(table.Bottom, ruling.Y1)); // max
- table.SetLeft(Math.Min(table.Left, ruling.X1));
- table.SetRight(Math.Max(table.Right, ruling.X2));
- }
- else
- {
- // no use checking any further
- break;
- }
- }
-
- // do the same for lines at the top, but make the threshold greater since table headings tend to be
- // larger to fit up to three-ish rows of text (at least but we don't want to grab too much)
- rowHeightThreshold = avgRowHeight * 3.8f;
-
- for (int i = horizontalRulings.Count - 1; i >= 0; i--)
- {
- Ruling ruling = horizontalRulings[i];//.get(i); Line2D.Float
-
- if (ruling.Y1 > table.Top) // bobld or ?
- {
- continue;
- }
-
- double distanceFromTable = table.Top - ruling.Y1;
- if (distanceFromTable <= rowHeightThreshold)
- {
- table.SetTop((float)Math.Max(table.Top, ruling.Y1)); //min
- table.SetLeft((float)Math.Min(table.Left, ruling.X1));
- table.SetRight((float)Math.Max(table.Right, ruling.X2));
- }
- else
- {
- break;
- }
- }
-
- // add a bit of padding since the halved horizontal lines are a little fuzzy anyways
- table.SetTop((float)Math.Floor(table.Top) - TABLE_PADDING_AMOUNT);
- table.SetBottom((float)Math.Ceiling(table.Bottom) + TABLE_PADDING_AMOUNT);
- table.SetLeft((float)Math.Floor(table.Left) - TABLE_PADDING_AMOUNT);
- table.SetRight((float)Math.Ceiling(table.Right) + TABLE_PADDING_AMOUNT);
-
- return table;
- }
-
- private RelevantEdges getRelevantEdges(TextEdges textEdges, List lines)
- {
- List leftTextEdges = textEdges[TextEdge.LEFT]; //.get(TextEdge.LEFT);
- List midTextEdges = textEdges[TextEdge.MID]; //.get(TextEdge.MID);
- List rightTextEdges = textEdges[TextEdge.RIGHT]; //.get(TextEdge.RIGHT);
-
- // first we'll find the number of lines each type of edge crosses
- int[][] edgeCountsPerLine = new int[lines.Count][]; //[TextEdge.NUM_TYPES];
- for (int i = 0; i < edgeCountsPerLine.Length; i++)
- {
- edgeCountsPerLine[i] = new int[TextEdge.NUM_TYPES];
- }
-
- foreach (TextEdge edge in leftTextEdges)
- {
- edgeCountsPerLine[edge.intersectingTextRowCount - 1][TextEdge.LEFT]++;
- }
-
- foreach (TextEdge edge in midTextEdges)
- {
- edgeCountsPerLine[edge.intersectingTextRowCount - 1][TextEdge.MID]++;
- }
-
- foreach (TextEdge edge in rightTextEdges)
- {
- edgeCountsPerLine[edge.intersectingTextRowCount - 1][TextEdge.RIGHT]++;
- }
-
- // now let's find the relevant edge type and the number of those edges we should look for
- // we'll only take a minimum of two edges to look for tables
- int relevantEdgeType = -1;
- int relevantEdgeCount = 0;
- for (int i = edgeCountsPerLine.Length - 1; i > 2; i--)
- {
- if (edgeCountsPerLine[i][TextEdge.LEFT] > 2 &&
- edgeCountsPerLine[i][TextEdge.LEFT] >= edgeCountsPerLine[i][TextEdge.RIGHT] &&
- edgeCountsPerLine[i][TextEdge.LEFT] >= edgeCountsPerLine[i][TextEdge.MID])
- {
- relevantEdgeCount = edgeCountsPerLine[i][TextEdge.LEFT];
- relevantEdgeType = TextEdge.LEFT;
- break;
- }
-
- if (edgeCountsPerLine[i][TextEdge.RIGHT] > 1 &&
- edgeCountsPerLine[i][TextEdge.RIGHT] >= edgeCountsPerLine[i][TextEdge.LEFT] &&
- edgeCountsPerLine[i][TextEdge.RIGHT] >= edgeCountsPerLine[i][TextEdge.MID])
- {
- relevantEdgeCount = edgeCountsPerLine[i][TextEdge.RIGHT];
- relevantEdgeType = TextEdge.RIGHT;
- break;
- }
-
- if (edgeCountsPerLine[i][TextEdge.MID] > 1 &&
- edgeCountsPerLine[i][TextEdge.MID] >= edgeCountsPerLine[i][TextEdge.RIGHT] &&
- edgeCountsPerLine[i][TextEdge.MID] >= edgeCountsPerLine[i][TextEdge.LEFT])
- {
- relevantEdgeCount = edgeCountsPerLine[i][TextEdge.MID];
- relevantEdgeType = TextEdge.MID;
- break;
- }
- }
-
- return new RelevantEdges(relevantEdgeType, relevantEdgeCount);
- }
-
- private TextEdges getTextEdges(List lines)
- {
- // get all text edges (lines that align with the left, middle and right of chunks of text) that extend
- // uninterrupted over at least REQUIRED_TEXT_LINES_FOR_EDGE lines of text
- List leftTextEdges = new List(); // ArrayList<>();
- List midTextEdges = new List(); //ArrayList<>();
- List rightTextEdges = new List(); // ArrayList<>();
-
- var currLeftEdges = new Dictionary>(); // Map> currLeftEdges = new HashMap<>();
- var currMidEdges = new Dictionary>(); // Map> currMidEdges = new HashMap<>();
- var currRightEdges = new Dictionary>(); // Map> currRightEdges = new HashMap<>();
-
- foreach (TableLine textRow in lines)
- {
- foreach (TextChunk text in textRow.TextElements)
- {
- int left = (int)Math.Floor(text.Left); // new Integer(
- int right = (int)Math.Floor(text.Right); //new Integer(
- int mid = left + ((right - left) / 2);//new Integer(
-
- // first put this chunk into any edge buckets it belongs to
- List leftEdge = currLeftEdges[left]; //.get(left);
- if (leftEdge == null)
- {
- leftEdge = new List(); //ArrayList<>();
- currLeftEdges[left] = leftEdge; //.put(left, leftEdge);
- }
- leftEdge.Add(text);
-
- List midEdge = currMidEdges[mid];//.get(mid);
- if (midEdge == null)
- {
- midEdge = new List(); //ArrayList<>();
- currMidEdges[mid] = midEdge; //.put(mid, midEdge);
- }
- midEdge.Add(text);
-
- List rightEdge = currRightEdges[right]; //.get(right);
- if (rightEdge == null)
- {
- rightEdge = new List(); //ArrayList<>();
- currRightEdges[right] = rightEdge; //.put(right, rightEdge);
- }
- rightEdge.Add(text);
-
- // now see if this text chunk blows up any other edges
- //for (Iterator>> iterator = currLeftEdges.entrySet().iterator(); iterator.hasNext();)
- foreach (var entry in currLeftEdges.ToList())
- {
- //Map.Entry> entry = iterator.next();
- int key = entry.Key; //.getKey();
- if (key > left && key < right)
- {
- //iterator.remove();
- currLeftEdges.Remove(key);
- List edgeChunks = entry.Value; //.getValue();
- if (edgeChunks.Count >= REQUIRED_TEXT_LINES_FOR_EDGE)
- {
- TextChunk first = edgeChunks[0];//.get(0);
- TextChunk last = edgeChunks[edgeChunks.Count - 1];//.get(edgeChunks.size() - 1);
-
- TextEdge edge = new TextEdge(key, first.Top, key, last.Bottom);
- edge.intersectingTextRowCount = Math.Min(edgeChunks.Count, lines.Count);
-
- leftTextEdges.Add(edge);
- }
- }
- }
-
- //for (Iterator>> iterator = currMidEdges.entrySet().iterator(); iterator.hasNext();)
- foreach (var entry in currMidEdges.ToList())
- {
- //Map.Entry> entry = iterator.next();
- int key = entry.Key; //.getKey();
- if (key > left && key < right && Math.Abs(key - mid) > 2)
- {
- //iterator.remove();
- currMidEdges.Remove(key);
- List edgeChunks = entry.Value; //.getValue();
- if (edgeChunks.Count >= REQUIRED_TEXT_LINES_FOR_EDGE)
- {
- TextChunk first = edgeChunks[0];//.get(0);
- TextChunk last = edgeChunks[edgeChunks.Count - 1]; //.get(edgeChunks.size() - 1);
-
- TextEdge edge = new TextEdge(key, first.Top, key, last.Bottom);
- edge.intersectingTextRowCount = Math.Min(edgeChunks.Count, lines.Count);
-
- midTextEdges.Add(edge);
- }
- }
- }
-
- //for (Iterator>> iterator = currRightEdges.entrySet().iterator(); iterator.hasNext();)
- foreach (var entry in currRightEdges.ToList())
- {
- //Map.Entry> entry = iterator.next();
- int key = entry.Key; //.getKey();
- if (key > left && key < right)
- {
- //iterator.remove();
- currRightEdges.Remove(key);
- List edgeChunks = entry.Value; //.getValue();
- if (edgeChunks.Count >= REQUIRED_TEXT_LINES_FOR_EDGE)
- {
- TextChunk first = edgeChunks[0];//.get(0);
- TextChunk last = edgeChunks[edgeChunks.Count - 1]; //.get(edgeChunks.size() - 1);
-
- TextEdge edge = new TextEdge(key, first.Top, key, last.Bottom);
- edge.intersectingTextRowCount = Math.Min(edgeChunks.Count, lines.Count);
-
- rightTextEdges.Add(edge);
- }
- }
- }
- }
- }
-
- // add the leftovers
- //foreach (Integer key in currLeftEdges.keySet())
- foreach (var key in currLeftEdges.Keys)
- {
- List edgeChunks = currLeftEdges[key]; //.get(key);
- if (edgeChunks.Count >= REQUIRED_TEXT_LINES_FOR_EDGE)
- {
- TextChunk first = edgeChunks[0]; //.get(0);
- TextChunk last = edgeChunks[edgeChunks.Count - 1]; //.get(edgeChunks.size() - 1);
-
- TextEdge edge = new TextEdge(key, first.Top, key, last.Bottom);
- edge.intersectingTextRowCount = Math.Min(edgeChunks.Count, lines.Count);
-
- leftTextEdges.Add(edge);
- }
- }
-
- foreach (int key in currMidEdges.Keys) //.keySet())
- {
- List edgeChunks = currMidEdges[key]; // .get(key);
- if (edgeChunks.Count >= REQUIRED_TEXT_LINES_FOR_EDGE)
- {
- TextChunk first = edgeChunks[0]; //.get(0);
- TextChunk last = edgeChunks[edgeChunks.Count - 1];//.get(edgeChunks.size() - 1);
-
- TextEdge edge = new TextEdge(key, first.Top, key, last.Bottom);
- edge.intersectingTextRowCount = Math.Min(edgeChunks.Count, lines.Count);
-
- midTextEdges.Add(edge);
- }
- }
-
- foreach (int key in currRightEdges.Keys) //.keySet())
- {
- List edgeChunks = currRightEdges[key]; //.get(key);
- if (edgeChunks.Count >= REQUIRED_TEXT_LINES_FOR_EDGE)
- {
- TextChunk first = edgeChunks[0];//.get(0);
- TextChunk last = edgeChunks[edgeChunks.Count - 1]; // .get(edgeChunks.size() - 1);
-
- TextEdge edge = new TextEdge(key, first.Top, key, last.Bottom);
- edge.intersectingTextRowCount = Math.Min(edgeChunks.Count, lines.Count);
-
- rightTextEdges.Add(edge);
- }
- }
-
- return new TextEdges(leftTextEdges, midTextEdges, rightTextEdges);
- }
-
- private List getTableAreasFromCells(List cells)
- {
- List> cellGroups = new List>(); // ArrayList<>();
- foreach (TableRectangle cell in cells)
- {
- bool addedToGroup = false;
-
- bool breakCellCheck = false;
- //cellCheck:
- foreach (List cellGroup in cellGroups)
- {
- if (breakCellCheck) break; // simulates 'break cellCheck;'
- foreach (TableRectangle groupCell in cellGroup)
- {
- PdfPoint[] groupCellCorners = groupCell.Points;
- PdfPoint[] candidateCorners = cell.Points;
-
- for (int i = 0; i < candidateCorners.Length; i++)
- {
- for (int j = 0; j < groupCellCorners.Length; j++)
- {
- //if (candidateCorners[i].distance(groupCellCorners[j]) < CELL_CORNER_DISTANCE_MAXIMUM)
- if (Distances.Euclidean(candidateCorners[i], groupCellCorners[j]) < CELL_CORNER_DISTANCE_MAXIMUM)
- {
- cellGroup.Add(cell);
- addedToGroup = true;
- //break cellCheck;
- breakCellCheck = true;
- break;
- }
- }
- }
- }
- }
-
- if (!addedToGroup)
- {
- List cellGroup = new List(); //ArrayList cellGroup = new ArrayList<>();
- cellGroup.Add(cell);
- cellGroups.Add(cellGroup);
- }
- }
-
- // create table areas based on cell group
- List tableAreas = new List(); //new ArrayList<>();
- foreach (List cellGroup in cellGroups)
- {
- // less than four cells should not make a table
- if (cellGroup.Count < REQUIRED_CELLS_FOR_TABLE)
- {
- continue;
- }
-
- double top = double.MinValue; // Float.MAX_VALUE;
- double left = double.MaxValue; // Float.MAX_VALUE;
- double bottom = double.MaxValue; // Float.MIN_VALUE;
- double right = double.MinValue; // Float.MIN_VALUE;
-
- foreach (TableRectangle cell in cellGroup)
- {
- if (cell.Top > top) top = cell.Top; // (cell.getTop() < top)
- if (cell.Left < left) left = cell.Left;
- if (cell.Bottom < bottom) bottom = cell.Bottom; // (cell.getBottom() > bottom)
- if (cell.Right > right) right = cell.Right;
- }
-
- tableAreas.Add(new TableRectangle(top, left, right - left, bottom - top));
- }
-
- return tableAreas;
- }
-
- private List getHorizontalRulings(object image) // BufferedImage
- {
- throw new NotImplementedException();
-
- /*
- // get all horizontal edges, which we'll define as a change in grayscale colour
- // along a straight line of a certain length
- List horizontalRulings = new List(); // ArrayList<>();
-
- Raster r = image.getRaster();
- int width = r.getWidth();
- int height = r.getHeight();
-
- for (int x = 0; x < width; x++)
- {
- int[] lastPixel = r.getPixel(x, 0, (int[])null);
-
- for (int y = 1; y < height - 1; y++)
- {
- int[] currPixel = r.getPixel(x, y, (int[])null);
-
- int diff = Math.Abs(currPixel[0] - lastPixel[0]);
- if (diff > GRAYSCALE_INTENSITY_THRESHOLD)
- {
- // we hit what could be a line
- // don't bother scanning it if we've hit a pixel in the line before
- bool alreadyChecked = false;
- foreach (var line in horizontalRulings) //
- {
- if (y == line.getY1() && x >= line.getX1() && x <= line.getX2())
- {
- alreadyChecked = true;
- break;
- }
- }
-
- if (alreadyChecked)
- {
- lastPixel = currPixel;
- continue;
- }
-
- int lineX = x + 1;
-
- while (lineX < width)
- {
- int[] linePixel = r.getPixel(lineX, y, (int[])null);
- int[] abovePixel = r.getPixel(lineX, y - 1, (int[])null);
-
- if (Math.Abs(linePixel[0] - abovePixel[0]) <= GRAYSCALE_INTENSITY_THRESHOLD
- || Math.Abs(currPixel[0] - linePixel[0]) > GRAYSCALE_INTENSITY_THRESHOLD)
- {
- break;
- }
-
- lineX++;
- }
-
- int endX = lineX - 1;
- int lineWidth = endX - x;
- if (lineWidth > HORIZONTAL_EDGE_WIDTH_MINIMUM)
- {
- horizontalRulings.Add(new Ruling(new PdfPoint(x, y), new PdfPoint(endX, y)));
- }
- }
-
- lastPixel = currPixel;
- }
- }
-
- return horizontalRulings;
- */
- }
-
- private List getVerticalRulings(object image) // BufferedImage
- {
- throw new NotImplementedException();
-
- /*
- // get all vertical edges, which we'll define as a change in grayscale colour
- // along a straight line of a certain length
- List verticalRulings = new List();//new ArrayList<>();
-
- Raster r = image.getRaster();
- int width = r.getWidth();
- int height = r.getHeight();
-
- for (int y = 0; y < height; y++)
- {
- int[] lastPixel = r.getPixel(0, y, (int[])null);
-
- for (int x = 1; x < width - 1; x++)
- {
- int[] currPixel = r.getPixel(x, y, (int[])null);
-
- int diff = Math.Abs(currPixel[0] - lastPixel[0]);
- if (diff > GRAYSCALE_INTENSITY_THRESHOLD)
- {
- // we hit what could be a line
- // don't bother scanning it if we've hit a pixel in the line before
- bool alreadyChecked = false;
- foreach (var line in verticalRulings)
- {
- if (x == line.getX1() && y >= line.getY1() && y <= line.getY2())
- {
- alreadyChecked = true;
- break;
- }
- }
-
- if (alreadyChecked)
- {
- lastPixel = currPixel;
- continue;
- }
-
- int lineY = y + 1;
-
- while (lineY < height)
- {
- int[] linePixel = r.getPixel(x, lineY, (int[])null);
- int[] leftPixel = r.getPixel(x - 1, lineY, (int[])null);
-
- if (Math.Abs(linePixel[0] - leftPixel[0]) <= GRAYSCALE_INTENSITY_THRESHOLD
- || Math.Abs(currPixel[0] - linePixel[0]) > GRAYSCALE_INTENSITY_THRESHOLD)
- {
- break;
- }
-
- lineY++;
- }
-
- int endY = lineY - 1;
- int lineLength = endY - y;
- if (lineLength > VERTICAL_EDGE_HEIGHT_MINIMUM)
- {
- verticalRulings.Add(new Ruling(new PdfPoint(x, y), new PdfPoint(x, endY)));
- }
- }
-
- lastPixel = currPixel;
- }
- }
-
- return verticalRulings;
- */
- }
-
- // taken from http://www.docjar.com/html/api/org/apache/pdfbox/examples/util/RemoveAllText.java.html
- private PdfDocument removeText(Page page)
- {
- throw new NotImplementedException();
- /*
- PDFStreamParser parser = new PDFStreamParser(page);
- parser.parse();
- List