diff --git a/Tabula.Csv/Tabula.Csv.csproj b/Tabula.Csv/Tabula.Csv.csproj index d40e2e8..d39a5c8 100644 --- a/Tabula.Csv/Tabula.Csv.csproj +++ b/Tabula.Csv/Tabula.Csv.csproj @@ -2,6 +2,14 @@ netcoreapp3.1;netstandard2.0;net45;net451;net452;net46;net461;net462;net47 + Extract tables from PDF files (port of tabula-java using PdfPig). Csv and Tsv writers. + https://github.com/BobLd/tabula-sharp + 0.1.0-alpha001 + + + + D:\VS2017\source\repos\tabula-sharp\Tabula.Csv\Tabula.Csv.xml + 3 diff --git a/Tabula.Csv/Tabula.Csv.xml b/Tabula.Csv/Tabula.Csv.xml new file mode 100644 index 0000000..b172aa2 --- /dev/null +++ b/Tabula.Csv/Tabula.Csv.xml @@ -0,0 +1,8 @@ + + + + Tabula.Csv + + + + diff --git a/Tabula.Json/Tabula.Json.csproj b/Tabula.Json/Tabula.Json.csproj index fe6363d..2cd4422 100644 --- a/Tabula.Json/Tabula.Json.csproj +++ b/Tabula.Json/Tabula.Json.csproj @@ -2,6 +2,13 @@ netcoreapp3.1;netstandard2.0;net45;net451;net452;net46;net461;net462;net47 + Extract tables from PDF files (port of tabula-java using PdfPig). Json writer. + https://github.com/BobLd/tabula-sharp + 0.1.0-alpha001 + + + + D:\VS2017\source\repos\tabula-sharp\Tabula.Json\Tabula.Json.xml diff --git a/Tabula.Json/Tabula.Json.xml b/Tabula.Json/Tabula.Json.xml new file mode 100644 index 0000000..a2251a8 --- /dev/null +++ b/Tabula.Json/Tabula.Json.xml @@ -0,0 +1,8 @@ + + + + Tabula.Json + + + + diff --git a/Tabula/Cell.cs b/Tabula/Cell.cs index 05fc13f..a199bfb 100644 --- a/Tabula/Cell.cs +++ b/Tabula/Cell.cs @@ -9,7 +9,7 @@ namespace Tabula * ** tabula/Cell.java ** */ /// - /// + /// A cell in a table. /// public class Cell : RectangularTextContainer { @@ -19,7 +19,7 @@ public class Cell : RectangularTextContainer public static Cell EMPTY => new Cell(new PdfRectangle()); /// - /// + /// Create a cell in a table. /// /// public Cell(PdfRectangle pdfRectangle) @@ -31,7 +31,7 @@ public Cell(PdfRectangle pdfRectangle) } /// - /// + /// Create a cell in a table. 
/// /// public Cell(TextChunk chunk) @@ -41,7 +41,7 @@ public Cell(TextChunk chunk) } /// - /// + /// Create a cell in a table. /// /// /// @@ -59,6 +59,10 @@ public Cell(PdfPoint topLeft, PdfPoint bottomRight) } } + /// + /// Gets the cell's text. + /// + /// public override string GetText(bool useLineReturns) { if (base.textElements.Count == 0) @@ -81,6 +85,9 @@ public override string GetText(bool useLineReturns) return sb.ToString().Trim(); } + /// + /// Gets the cell's text. + /// public override string GetText() { return GetText(true); @@ -100,6 +107,7 @@ public void SetPlaceholder(bool placeholder) this.IsPlaceholder = placeholder; } + /// public override string ToString() { return GetText(); diff --git a/Tabula/Detectors/IDetectionAlgorithm.cs b/Tabula/Detectors/IDetectionAlgorithm.cs index b5c57bc..fe4ce57 100644 --- a/Tabula/Detectors/IDetectionAlgorithm.cs +++ b/Tabula/Detectors/IDetectionAlgorithm.cs @@ -3,12 +3,20 @@ namespace Tabula.Detectors { // https://github.com/tabulapdf/tabula-java/blob/master/src/main/java/technology/ - /** + /* * ** tabula/detectors/DetectionAlgorithm.java ** * Created by matt on 2015-12-14. */ + + /// + /// Table detection algorithm. + /// public interface IDetectionAlgorithm { + /// + /// Detects the tables in the page. + /// + /// The page where to detect the tables. List Detect(PageArea page); } } diff --git a/Tabula/Detectors/NurminenDetectionAlgorithm.cs b/Tabula/Detectors/NurminenDetectionAlgorithm.cs index 26fd6bc..0b28550 100644 --- a/Tabula/Detectors/NurminenDetectionAlgorithm.cs +++ b/Tabula/Detectors/NurminenDetectionAlgorithm.cs @@ -8,7 +8,7 @@ namespace Tabula.Detectors { - /** + /* * ** tabula/detectors/NurminenDetectionAlgorithm.java ** * Created by matt on 2015-12-17. *

@@ -16,6 +16,10 @@ namespace Tabula.Detectors * Anssi Nurminen's master's thesis: * http://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3 */ + + ///

+ /// Nurminen detection algorithm. + /// public class NurminenDetectionAlgorithm : IDetectionAlgorithm { private static int GRAYSCALE_INTENSITY_THRESHOLD = 25; diff --git a/Tabula/Detectors/SpreadsheetDetectionAlgorithm.cs b/Tabula/Detectors/SpreadsheetDetectionAlgorithm.cs index ff371af..77e08ae 100644 --- a/Tabula/Detectors/SpreadsheetDetectionAlgorithm.cs +++ b/Tabula/Detectors/SpreadsheetDetectionAlgorithm.cs @@ -16,6 +16,10 @@ namespace Tabula.Detectors /// public class SpreadsheetDetectionAlgorithm : IDetectionAlgorithm { + /// + /// Detects the tables in the page. + /// + /// The page where to detect the tables. public List Detect(PageArea page) { List cells = SpreadsheetExtractionAlgorithm.FindCells(page.HorizontalRulings, page.VerticalRulings); diff --git a/Tabula/Extractors/BasicExtractionAlgorithm.cs b/Tabula/Extractors/BasicExtractionAlgorithm.cs index aec6f7b..f1ecf86 100644 --- a/Tabula/Extractors/BasicExtractionAlgorithm.cs +++ b/Tabula/Extractors/BasicExtractionAlgorithm.cs @@ -4,27 +4,33 @@ namespace Tabula.Extractors { /// - /// stream + /// Stream extraction algorithm. /// public class BasicExtractionAlgorithm : IExtractionAlgorithm { private IReadOnlyList verticalRulings; /// - /// stream + /// Stream extraction algorithm. /// public BasicExtractionAlgorithm() { } /// - /// stream + /// Stream extraction algorithm. /// + /// List of vertical rulings. public BasicExtractionAlgorithm(IReadOnlyList verticalRulings) { this.verticalRulings = verticalRulings; } + /// + /// Extracts the tables in the page. + /// + /// The page where to extract the tables. + /// List of vertical rulings, indicated by their x position. public List Extract(PageArea page, IReadOnlyList verticalRulingPositions) { List verticalRulings = new List(verticalRulingPositions.Count); @@ -36,6 +42,10 @@ public List
Extract(PageArea page, IReadOnlyList verticalRulingPos return this.Extract(page); } + /// + /// Extracts the tables in the page. + /// + /// The page where to extract the tables. public List
Extract(PageArea page) { List textElements = page.GetText(); @@ -113,13 +123,14 @@ public List
Extract(PageArea page) return new Table[] { table }.ToList(); } + /// public override string ToString() { return "stream"; } /// - /// + /// Gets columns positions. /// /// Must be an array of lines sorted by their +top+ attribute. /// a list of column boundaries (x axis). diff --git a/Tabula/Extractors/IExtractionAlgorithm.cs b/Tabula/Extractors/IExtractionAlgorithm.cs index 05fa906..eed7435 100644 --- a/Tabula/Extractors/IExtractionAlgorithm.cs +++ b/Tabula/Extractors/IExtractionAlgorithm.cs @@ -2,8 +2,15 @@ namespace Tabula.Extractors { + /// + /// Table extraction algorithm. + /// public interface IExtractionAlgorithm { + /// + /// Extracts the tables in the page. + /// + /// The page where to extract the tables. List
Extract(PageArea page); } } diff --git a/Tabula/Extractors/SpreadsheetExtractionAlgorithm.cs b/Tabula/Extractors/SpreadsheetExtractionAlgorithm.cs index 13c585b..b1fdc60 100644 --- a/Tabula/Extractors/SpreadsheetExtractionAlgorithm.cs +++ b/Tabula/Extractors/SpreadsheetExtractionAlgorithm.cs @@ -7,12 +7,12 @@ namespace Tabula.Extractors { //https://github.com/tabulapdf/tabula-java/blob/master/src/main/java/technology/tabula/extractors/SpreadsheetExtractionAlgorithm.java /// - /// lattice + /// Lattice extraction algorithm. /// public class SpreadsheetExtractionAlgorithm : IExtractionAlgorithm { /// - /// lattice + /// Lattice extraction algorithm. /// public SpreadsheetExtractionAlgorithm() { @@ -81,13 +81,17 @@ public int Compare(PdfPoint arg0, PdfPoint arg1) } } + /// + /// Extracts the tables in the page. + /// + /// The page where to extract the tables. public List
Extract(PageArea page) { return Extract(page, page.GetRulings()); } /// - /// Extract a list of Table from page using rulings as separators + /// Extracts the tables in the page using rulings as separators. /// /// /// @@ -159,6 +163,10 @@ public List
Extract(PageArea page, IReadOnlyList rulings) return spreadsheets; } + /// + /// + /// + /// public bool IsTabular(PageArea page) { // if there's no text at all on the page, it's not a table @@ -177,6 +185,7 @@ public bool IsTabular(PageArea page) { return false; } + Table table = tables[0]; int rowsDefinedByLines = table.RowCount; int colsDefinedByLines = table.ColumnCount; @@ -187,6 +196,7 @@ public bool IsTabular(PageArea page) // TODO WHAT DO WE DO HERE? System.Diagnostics.Debug.Write("SpreadsheetExtractionAlgorithm.isTabular(): no table found."); } + table = tables[0]; int rowsDefinedWithoutLines = table.RowCount; int colsDefinedWithoutLines = table.ColumnCount; @@ -196,6 +206,11 @@ public bool IsTabular(PageArea page) return ratio > MAGIC_HEURISTIC_NUMBER && ratio < (1 / MAGIC_HEURISTIC_NUMBER); } + /// + /// Find cells from horizontal and vertical ruling lines. + /// + /// + /// public static List FindCells(IReadOnlyList horizontalRulingLines, IReadOnlyList verticalRulingLines) { List cellsFound = new List(); @@ -266,6 +281,11 @@ public static List FindCells(IReadOnlyList horizontalRulingLines, return cellsFound; } + /// + /// Find spreadsheets areas from cells. + /// Based on O'Rourke's `Uniqueness of orthogonal connect-the-dots`. 
+ /// + /// public static List FindSpreadsheetsFromCells(List cells) { // via: http://stackoverflow.com/questions/13746284/merging-multiple-adjacent-rectangles-into-one-polygon @@ -390,6 +410,7 @@ public static List FindSpreadsheetsFromCells(List public override string ToString() { return "lattice"; @@ -419,12 +440,6 @@ public override bool Equals(object other) return this.point.Equals(o.point); } return false; - /* - if (this == other) - return true; - if (!(other is PolygonVertex)) return false; - return this.point.Equals(((PolygonVertex)other).point); - */ } public override int GetHashCode() diff --git a/Tabula/ObjectExtractor.cs b/Tabula/ObjectExtractor.cs index e3eed6e..51aee93 100644 --- a/Tabula/ObjectExtractor.cs +++ b/Tabula/ObjectExtractor.cs @@ -8,16 +8,24 @@ namespace Tabula { - /** + /* * ** tabula/ObjectExtractor.java ** * ** tabula/ObjectExtractorStreamEngine.java ** */ + + /// + /// Tabula object extractor. + /// public class ObjectExtractor { private const int rounding = 6; private PdfDocument pdfDocument; + /// + /// Create a Tabula object extractor. + /// + /// public ObjectExtractor(PdfDocument pdfDocument) { this.pdfDocument = pdfDocument; @@ -49,6 +57,10 @@ private PdfPoint RoundPdfPoint(PdfPoint pdfPoint, int decimalPlace) return new PdfPoint(Utils.Round(pdfPoint.X, decimalPlace), Utils.Round(pdfPoint.Y, decimalPlace)); } + /// + /// Extract the , with its text elements (letters) and rulings (processed PdfPath and PdfSubpath). + /// + /// The page number to extract. public PageArea ExtractPage(int pageNumber) { if (pageNumber > this.pdfDocument.NumberOfPages || pageNumber < 1) @@ -156,21 +168,35 @@ public PageArea ExtractPage(int pageNumber) pdfTextStripper.spatialIndex); } + /// + /// Enumerate and extract over the given pages. + /// + /// public PageIterator Extract(IEnumerable pages) { return new PageIterator(this, pages); } + /// + /// Enumerate and extract over all the pages. 
+ /// public PageIterator Extract() { return Extract(Utils.Range(1, this.pdfDocument.NumberOfPages + 1)); } + /// + /// Extract the , with its text elements (letters) and rulings (processed PdfPath and PdfSubpath). + /// + /// The page number to extract. public PageArea Extract(int pageNumber) { return Extract(Utils.Range(pageNumber, pageNumber + 1)).Next(); } + /// + /// Close the ObjectExtractor. + /// public void Close() { this.pdfDocument.Dispose(); diff --git a/Tabula/PageArea.cs b/Tabula/PageArea.cs index 918af94..4c6e604 100644 --- a/Tabula/PageArea.cs +++ b/Tabula/PageArea.cs @@ -11,7 +11,7 @@ namespace Tabula // TODO: this class should probably be called "PageArea" or something like that /// - /// + /// A tabula page. /// public class PageArea : TableRectangle { @@ -43,22 +43,22 @@ public class PageArea : TableRectangle public PdfDocument PdfDocument { get; } /// - /// + /// The minimum character width. /// public double MinCharWidth { get; } /// - /// + /// The minimum character height. /// public double MinCharHeight { get; } /// - /// + /// True if the page contains text. /// public bool HasText => this.texts.Count > 0; /// - /// Get the vertical rulings. + /// Gets the vertical rulings. /// This is a read-only list. Use to add a . /// public IReadOnlyList VerticalRulings @@ -75,7 +75,7 @@ public IReadOnlyList VerticalRulings } /// - /// Get the horizontal rulings. + /// Gets the horizontal rulings. /// This is a read-only list. Use to add a . /// public IReadOnlyList HorizontalRulings @@ -92,7 +92,7 @@ public IReadOnlyList HorizontalRulings } /// - /// Get the unprocessed rulings. + /// Gets the unprocessed rulings. /// This is a read-only list. Use to add a . 
/// public IReadOnlyList UnprocessedRulings => this.rulings; @@ -135,12 +135,25 @@ public PageArea(double top, double left, double width, double height, int rotati } */ - public PageArea(PdfRectangle area, int rotation, int page_number, Page pdPage, PdfDocument doc, + /// + /// Create a new page area. + /// + /// + /// + /// + /// + /// + /// + /// + /// + /// + /// + public PageArea(PdfRectangle area, int rotation, int pageNumber, Page pdPage, PdfDocument doc, List characters, List rulings, double minCharWidth, double minCharHeight, RectangleSpatialIndex index) : base(area) { this.Rotation = rotation; - this.PageNumber = page_number; + this.PageNumber = pageNumber; this.PdfPage = pdPage; this.PdfDocument = doc; this.texts = characters; @@ -150,6 +163,10 @@ public PageArea(PdfRectangle area, int rotation, int page_number, Page pdPage, P this.spatial_index = index; } + /// + /// Gets the page area from the given area. + /// + /// public PageArea GetArea(PdfRectangle area) { List t = GetText(area); @@ -192,21 +209,39 @@ public PageArea GetArea(PdfRectangle area) return rv; } + /// + /// Gets the page area from the given area. + /// + /// + /// + /// + /// + /// public PageArea GetArea(double top, double left, double bottom, double right) { return this.GetArea(new PdfRectangle(left, bottom, right, top)); } + /// + /// Gets the page's text. + /// public List GetText() { return texts; } + /// + /// Gets the page's text contained in the area. + /// + /// public List GetText(PdfRectangle area) { return this.spatial_index.Contains(area); } + /// + /// Gets the bounding box containing the text. + /// public TableRectangle GetTextBounds() { List texts = this.GetText(); @@ -265,6 +300,10 @@ public IReadOnlyList GetRulings() return this.cleanRulings; } + /// + /// Add a vertical or a horizontal ruling lines. 
+ /// + /// public void AddRuling(Ruling r) { if (r.Oblique) diff --git a/Tabula/PageIterator.cs b/Tabula/PageIterator.cs index c0e4456..680ca20 100644 --- a/Tabula/PageIterator.cs +++ b/Tabula/PageIterator.cs @@ -4,17 +4,26 @@ namespace Tabula { + /// + /// A tabula page iterator. + /// public class PageIterator : IEnumerator { private ObjectExtractor oe; private IEnumerator pageIndexIterator; + /// + /// Create a tabula page iterator. + /// + /// + /// public PageIterator(ObjectExtractor oe, IEnumerable pages) : base() { this.oe = oe; this.pageIndexIterator = pages.GetEnumerator(); } + /// public PageArea Current { get @@ -30,8 +39,13 @@ public PageArea Current } } + /// object IEnumerator.Current => Current; + /// + /// Helper function that does MoveNext() + Current; + /// + /// public PageArea Next() { if (MoveNext()) @@ -44,17 +58,20 @@ public PageArea Next() } } + /// public void Dispose() { this.oe.Close(); this.pageIndexIterator.Dispose(); } + /// public bool MoveNext() { return this.pageIndexIterator.MoveNext(); } + /// public void Reset() { this.pageIndexIterator.Reset(); diff --git a/Tabula/Table.cs b/Tabula/Table.cs index 3675265..4d3e3ec 100644 --- a/Tabula/Table.cs +++ b/Tabula/Table.cs @@ -5,6 +5,9 @@ namespace Tabula { + /// + /// A tabula table. + /// public class Table : TableRectangle { /// @@ -12,11 +15,19 @@ public class Table : TableRectangle /// public static Table EMPTY => new Table(""); - private Table(string extractionMethod) + /// + /// Create a table. + /// + /// + private Table(string extractionMethod) : base() { this.ExtractionMethod = extractionMethod; } + /// + /// Create a table. + /// + /// public Table(IExtractionAlgorithm extractionAlgorithm) : this(extractionAlgorithm.ToString()) { } @@ -31,20 +42,26 @@ public Table(IExtractionAlgorithm extractionAlgorithm) public IReadOnlyList Cells => cells.Values.ToList(); /// - /// + /// Gets the number of rows in the table. 
/// public int RowCount { get; private set; } /// - /// + /// Gets the number of columns in the table. /// public int ColumnCount { get; private set; } /// - /// + /// Gets the extraction method used to build the table. /// public string ExtractionMethod { get; } + /// + /// Add a cell at the given [row, column] position. + /// + /// + /// + /// public void Add(RectangularTextContainer chunk, int row, int col) { if (chunk is Cell cell) @@ -74,7 +91,7 @@ public void Add(RectangularTextContainer chunk, int row, int col) private List> memoizedRows; /// - /// + /// Gets the table's rows. /// public IReadOnlyList> Rows { @@ -108,7 +125,6 @@ private List> ComputeRows() /// /// Row. /// Column. - /// public Cell this[int i, int j] { get diff --git a/Tabula/TableLine.cs b/Tabula/TableLine.cs index 80b8198..8c7b4d9 100644 --- a/Tabula/TableLine.cs +++ b/Tabula/TableLine.cs @@ -8,22 +8,38 @@ namespace Tabula // TODO this class seems superfluous - get rid of it + /// + /// A Tabula Line. + /// public class TableLine : TableRectangle { - private List textChunks = new List(); + /// + /// List of white space characters. + /// public static readonly char[] WHITE_SPACE_CHARS = { ' ', '\t', '\r', '\n', '\f' }; + private List textChunks = new List(); + /// - /// Get the list of text elements. + /// Gets the list of text elements. /// This is a read-only list. Use to add a . /// public IReadOnlyList TextElements => textChunks; + /// + /// Sets the TextElements. + /// + /// public void SetTextElements(List textChunks) { this.textChunks = textChunks; } + /// + /// Add a text chunk to the text elements at a given index. + /// + /// The index at which to add the chunk. + /// The chunk to be added. public void AddTextChunk(int i, TextChunk textChunk) { if (i < 0) @@ -47,6 +63,10 @@ public void AddTextChunk(int i, TextChunk textChunk) this.Merge(textChunk); } + /// + /// Add a text chunk to the text elements. + /// + /// The chunk to be added. 
public void AddTextChunk(TextChunk textChunk) { if (this.textChunks.Count == 0) @@ -60,6 +80,7 @@ public void AddTextChunk(TextChunk textChunk) this.textChunks.Add(textChunk); } + /// public override string ToString() { StringBuilder sb = new StringBuilder(); diff --git a/Tabula/TableWithRulingLines.cs b/Tabula/TableWithRulingLines.cs index d59c2f1..566c73c 100644 --- a/Tabula/TableWithRulingLines.cs +++ b/Tabula/TableWithRulingLines.cs @@ -5,6 +5,9 @@ namespace Tabula { + /// + /// A tabula table with ruling lines. + /// public class TableWithRulingLines : Table { private class CellComparer : IComparer @@ -19,10 +22,26 @@ public int Compare(Cell arg0, Cell arg1) private readonly List horizontalRulings; private readonly RectangleSpatialIndex si = new RectangleSpatialIndex(); + /// + /// Gets the vertical rulings. + /// This is a read-only list. + /// public IReadOnlyList VerticalRulings => verticalRulings; + /// + /// Gets the horizontal rulings. + /// This is a read-only list. + /// public IReadOnlyList HorizontalRulings => horizontalRulings; + /// + /// Create a table. + /// + /// + /// + /// + /// + /// public TableWithRulingLines(TableRectangle area, List cells, List horizontalRulings, List verticalRulings, IExtractionAlgorithm extractionAlgorithm) : base(extractionAlgorithm) { diff --git a/Tabula/Tabula.csproj b/Tabula/Tabula.csproj index 6cadf1b..fb4602c 100644 --- a/Tabula/Tabula.csproj +++ b/Tabula/Tabula.csproj @@ -2,6 +2,11 @@ netcoreapp3.1;netstandard2.0;net45;net451;net452;net46;net461;net462;net47 + Extract tables from PDF files (port of tabula-java using PdfPig). + https://github.com/BobLd/tabula-sharp + 0.1.0.0 + 0.1.0.0 + 0.1.0-alpha001 @@ -9,7 +14,6 @@ - diff --git a/Tabula/Tabula.xml b/Tabula/Tabula.xml index e837f8c..aa6043b 100644 --- a/Tabula/Tabula.xml +++ b/Tabula/Tabula.xml @@ -5,13 +5,64 @@ - ** tabula/Cell.java ** + + A cell in a table. + + + + + An empty Cell, with coordinates [0, 0, 0, 0]. + + + + + Create a cell in a table. 
+ + + + + + Create a cell in a table. + + + + + + Create a cell in a table. + + + + + + + Gets the cell's text. + + + + + + Gets the cell's text. + + + + - - ** tabula/detectors/DetectionAlgorithm.java ** - Created by matt on 2015-12-14. + + + Table detection algorithm. + + + + + Detects the tables in the page. + + The page where to detect the tables. + + + + Nurminen detection algorithm. + - Helper class that encapsulates a text edge @@ -28,52 +79,325 @@ - Created by matt on 2015-12-14. + This is the basic spreadsheet table detection algorithm currently implemented in tabula (web). It uses intersecting ruling lines to find tables. + + + + + Detects the tables in the page. + + The page where to detect the tables. - stream + Stream extraction algorithm. - stream + Stream extraction algorithm. + + + + + Stream extraction algorithm. + + List of vertical rulings. + + + + Extracts the tables in the page. + + The page where to extract the tables. + List of vertical rulings, indicated by there x position. + + + + Extracts the tables in the page. + + The page where to extract the tables. + + + + + + + Gets columns positions. + Must be an array of lines sorted by their +top+ attribute. + a list of column boundaries (x axis). - + - stream + Table extraction algorithm. + + + Extracts the tables in the page. + + The page where to extract the tables. + - lattice + Lattice extraction algorithm. - lattice + Lattice extraction algorithm. + + + + + Extracts the tables in the page. + The page where to extract the tables. - + - Extract a list of Table from page using rulings as separators + Extracts the tables in the page using rulings as separators. - + + + + + + + + + + Find cells from horizontal and vertical ruling lines. + + + + + + + Find spreadsheets areas from cells. + Based on O'Rourke's `Uniqueness of orthogonal connect-the-dots`. + + + + + - ** tabula/ObjectExtractor.java ** - ** tabula/ObjectExtractorStreamEngine.java ** + + Tabula object extractor. 
+ + + + + Create a Tabula object extractor. + + + + + + Extract the , with its text elements (letters) and rulings (processed PdfPath and PdfSubpath). + + The page number to extract. + + + + Enumerate and extract over the given pages. + + + + + + Enumerate and extract over all the pages. + + + + + Extract the , with its text elements (letters) and rulings (processed PdfPath and PdfSubpath). + + The page number to extract. + + + + Close the ObjectExtractor. + + + + + A tabula page. + + + + + The page rotation. + + + + + The page number. + + + + + The original page. + + + + + The original document. + + + + + The minimum character width. + + + + + The minimum character height. + + + + + True if the page contains text. + + + + + Gets the vertical rulings. + This is a read-only list. Use to add a . + + + + + Gets the horizontal rulings. + This is a read-only list. Use to add a . + + + + + Gets the unprocessed rulings. + This is a read-only list. Use to add a . + + + + + Create a new page area. + + + + + + + + + + + + + + + Gets the page area from the given area. + + + + + + Gets the page area from the given area. + + + + + + + + + + Gets the page's text. + + + + + Gets the page's text contained in the area. + + + + + + Gets the bounding box containing the text. + + + + + Get the cleaned rulings. + + + + + Add a vertical or a horizontal ruling lines. + + + + + + A tabula page iterator. + + + + + Create a tabula page iterator. + + + + + + + + + + + + + Helper function that does MoveNext() + Current; + + + + + + + + + + + + + + + + + hack + + + + + + + + + + + + + @@ -81,11 +405,15 @@ TO REMOVE: need to check PdfPig's 'IntersectsWith' for bug with empty rectangles. they should instersect - + Minimum bounding box of all the Rectangles contained on this RectangleSpatialIndex. - + + + + + @@ -94,19 +422,33 @@ bottom point. top point - + + + Normalize almost horizontal or almost vertical lines. + + + + + Is the vertical? + + + + + Is the horizontal? 
+ + + - Normalize almost horizontal or almost vertical lines + Is the blique? Neither vertical nor horizontal. - + attributes that make sense only for non-oblique lines these are used to have a single collapse method (in page, currently) - - + if the lines we're comparing are colinear or parallel, we expand them by a only 1 pixel, because the expansions are additive @@ -119,29 +461,138 @@ - + + + + + + + + Compute the angle. [0, +180]. + + + log(n) implementation of find_intersections based on http://people.csail.mit.edu/indyk/6.838-old/handouts/lec2.pdf - - + True if both horizontal, aligned and overlap (i.e. infinite intersection points). True if both vertical, aligned and overlap (i.e. infinite intersection points). True if not parallel and intersect (i.e. in intersection point). - - + Deep copy. + + + A tabula table. + + + + + An empty table. + + + + + Create a table. + + + + + + Create a table. + + + + + + Get the list of cells. + This is a read-only list. Use to add a . + + + + + Gets the number of rows in the table. + + + + + Gets the number of columns in the table. + + + + + Gets the extraction method used to build to table. + + + + + Add a cell at the given [row, column] position. + + + + + + + + Gets the table's rows. + + + + + Get the cell at position [i, j]. + + Row. + Column. + + + + A Tabula Line. + + + + + List of white space characters. + + + + + Gets the list of text elements. + This is a read-only list. Use to add a . + + + + + Sets the TextElements. + + + + + + Add a text chunk to the text elements at a given index. + + The index at which to add the chunk. + The chunk to be added. + + + + Add a text chunk to the text elements. + + The chunk to be added. + + + + Sort top to bottom (as in reading order). @@ -157,58 +608,186 @@ - + 1 is LTR, 0 is neutral, -1 is RTL. Need this for fancy sorting in Tabula.TextChunk - + + + Get the 's top coordinate. + + + + + Set the 's top coordinate. + + + + + Get the 's right coordinate. 
+ + + + + Set the 's right coordinate. + + + + + Get the 's left coordinate. + + + + + Set the 's left coordinate. + + + + Get the 's bottom coordinate. + + + + + Set the 's bottom coordinate. + + + + + Get the 's points. Counter-clockwise, starting from bottom left point. - + + + The 's top-left X coordinate. + + + + + The 's top-left Y coordinate. + + + - Sets the location and size of this Rectangle2D to the specified double values. + The 's width. - the X coordinate of the upper-left corner of this Rectangle2D - the Y coordinate of the upper-left corner of this Rectangle2D - the width of this Rectangle2D - the height of this Rectangle2D - + + + The 's height. + + + + + The 's left coordinate. + + + + + The 's right coordinate. + + + + + The 's bottom coordinate. + + + + + The 's top coordinate. + + + + + A tabula table with ruling lines. + + + + + Gets the vertical rulings. + This is a read-only list. + + + + + Gets the horizontal rulings. + This is a read-only list. + + + + + Create a table. + + + + + + + + + + An empty text chunk. + + + + + + + + + + + + + + + + + + + + Splits a TextChunk into N TextChunks, where each chunk is of a single directionality, and then reverse the RTL ones. what we're doing here is *reversing* the Unicode bidi algorithm - in the language of that algorithm, each chunk is a(maximal) directional run. - We attach whitespace to the beginning of non-RTL + in the language of that algorithm, each chunk is a (maximal) directional run. + We attach whitespace to the beginning of non-RTL. - - + - 1 is LtR, 0 is neutral, -1 is RtL. + 1 is Left-to-Right, 0 is neutral, -1 is Right-to-Left. - + + + Returns null. + TODO Auto-generated method stub + + + + + Returns true if text contained in this TextChunk is the same repeated character - + Splits a TextChunk in two, at the position of the i-th TextElement - + - Removes runs of identical TextElements in this TextChunk + Removes runs of identical TextElements in this TextChunk. 
For example, if the TextChunk contains this string of characters: "1234xxxxx56xx" and c == 'x' and minRunLength == 4, this method will return a list of TextChunk such that: ["1234", "56xx"] @@ -217,7 +796,49 @@ - + + + + + + + + + The purpose is basically just to return true iff there are 2+ TextChunks and they're identical. + + + + + + A tabula, text element. Equivalent to a letter. + + + + + Create a text element. + + + + + + + The direction of the text (0, 90, 180, or 270). Can be any number with PdfPig. + + + + The direction of the text (0, 90, 180, or 270). Can be any number with PdfPig. + + + + + + + + + + + + heuristically merge a list of TextElement into a list of TextChunk ported from PDFBox's PDFTextStripper.writePage, with modifications. @@ -227,14 +848,33 @@ + + + Unicode extensions. + As of Unicode v13.0. + + + + + Gets the character abbreviated type (i.e. 'BN', 'S', 'NSM', 'LRO'), used in the Unicode Bidirectional Algorithm. + + The character value. + + + + Gets the character abbreviated type (i.e. 'BN', 'S', 'NSM', 'LRO'), used in the Unicode Bidirectional Algorithm. + + The integer value of a char. + + @author manuel - + Wrap Collections.sort so we can fallback to a non-stable quicksort if we're running on JDK7+ - + re-implemented. @@ -242,15 +882,47 @@ - + - low endpoint (inclusive) of the subList - high endpoint (exclusive) of the subList - + Low endpoint (inclusive) of the subList + High endpoint (exclusive) of the subList + + + + Base interface for tabula writer. + + + + + Write the table to the stream. + + + + + + + Write the tables to the stream. + + + + + + + Write the table to the stream. + + + + + + + Write the tables to the stream. + + + diff --git a/Tabula/TextChunk.cs b/Tabula/TextChunk.cs index 7f9d0d4..7201ffb 100644 --- a/Tabula/TextChunk.cs +++ b/Tabula/TextChunk.cs @@ -10,7 +10,7 @@ namespace Tabula public class TextChunk : RectangularTextContainer, IHasText { /// - /// + /// An empty text chunk. 
/// public static TextChunk EMPTY => new TextChunk(); @@ -170,7 +170,7 @@ public TextChunk GroupByDirectionality(bool isLtrDominant) } /// - /// 1 is LtR, 0 is neutral, -1 is RtL. + /// 1 is Left-to-Right, 0 is neutral, -1 is Right-to-Left. /// public override int IsLtrDominant() { @@ -231,6 +231,12 @@ public override string GetText() return sb.ToString().Normalize(NormalizationForm.FormKC).Trim(); } + /// + /// Returns null. + /// TODO Auto-generated method stub + /// + /// + /// public override string GetText(bool useLineReturns) { // TODO Auto-generated method stub @@ -355,6 +361,7 @@ public List Squeeze(char c, int minRunLength) return rv; } + /// public override int GetHashCode() { const int prime = 31; @@ -362,6 +369,7 @@ public override int GetHashCode() return prime * result + ((textElements?.GetHashCode()) ?? 0); } + /// public override bool Equals(object obj) { if (obj is TextChunk other) @@ -380,6 +388,10 @@ public override bool Equals(object obj) return false; } + /// + /// The purpose is basically just to return true iff there are 2+ TextChunks and they're identical. + /// + /// public static bool AllSameChar(IReadOnlyList textChunks) { /* diff --git a/Tabula/TextElement.cs b/Tabula/TextElement.cs index 959341c..0b3c7dd 100644 --- a/Tabula/TextElement.cs +++ b/Tabula/TextElement.cs @@ -8,6 +8,9 @@ namespace Tabula { + /// + /// A tabula, text element. Equivalent to a letter. + /// public class TextElement : TableRectangle, IHasText { internal Letter letter; // do we really use it? @@ -15,6 +18,15 @@ public class TextElement : TableRectangle, IHasText private string text; private static double AVERAGE_CHAR_TOLERANCE = 0.3; + /// + /// Create a text element. + /// + /// + /// + /// + /// + /// + /// The direction of the text (0, 90, 180, or 270). Can be any number with PdfPig. 
public TextElement(PdfRectangle pdfRectangle, FontDetails font, double fontSize, string c, double widthOfSpace, double dir) : base(pdfRectangle) { @@ -26,6 +38,9 @@ public TextElement(PdfRectangle pdfRectangle, FontDetails font, double fontSize, } public string GetText() => text; + /// + /// The direction of the text (0, 90, 180, or 270). Can be any number with PdfPig. + /// public double Direction { get; } public double WidthOfSpace { get; } @@ -34,6 +49,7 @@ public TextElement(PdfRectangle pdfRectangle, FontDetails font, double fontSize, public double FontSize { get; } + /// public override string ToString() { StringBuilder sb = new StringBuilder(); @@ -43,6 +59,7 @@ public override string ToString() return sb.ToString(); } + /// public override int GetHashCode() { const int prime = 31; @@ -56,6 +73,7 @@ public override int GetHashCode() return result; } + /// public override bool Equals(object obj) { if (obj is TextElement other) diff --git a/Tabula/UnicodeExtensions.cs b/Tabula/UnicodeExtensions.cs index 02dd267..607e29a 100644 --- a/Tabula/UnicodeExtensions.cs +++ b/Tabula/UnicodeExtensions.cs @@ -2,19 +2,32 @@ namespace Tabula { + /// + /// Unicode extensions. + /// As of Unicode v13.0. + /// public static class UnicodeExtensions { + /// + /// Gets the character abbreviated type (i.e. 'BN', 'S', 'NSM', 'LRO'), used in the Unicode Bidirectional Algorithm. + /// + /// The character value. public static string GetDirectionality(this char c) { var val = char.ConvertToUtf32(c.ToString(), 0); return GetDirectionality(val); } + /// + /// Gets the character abbreviated type (i.e. 'BN', 'S', 'NSM', 'LRO'), used in the Unicode Bidirectional Algorithm. + /// + /// The integer value of a char. 
+ /// public static string GetDirectionality(int val) { if (val < 0) { - throw new ArgumentOutOfRangeException(); + throw new ArgumentOutOfRangeException(nameof(val), "Char integer value needs to be more or equal to 0."); } // need to add check for max value diff --git a/Tabula/Writers/IWriter.cs b/Tabula/Writers/IWriter.cs index 737fa01..ac9e827 100644 --- a/Tabula/Writers/IWriter.cs +++ b/Tabula/Writers/IWriter.cs @@ -4,14 +4,37 @@ namespace Tabula.Writers { + /// + /// Base interface for tabula writer. + /// public interface IWriter { + /// + /// Write the table to the stream. + /// + /// + /// void Write(StreamWriter sb, Table table); + /// + /// Write the tables to the stream. + /// + /// + /// void Write(StreamWriter sb, IReadOnlyList
tables); + /// + /// Write the table to the string builder. + /// + /// + /// void Write(StringBuilder sb, Table table); + /// + /// Write the tables to the string builder. + /// + /// + /// void Write(StringBuilder sb, IReadOnlyList
tables); } }