Skip to content

Commit

Permalink
Merge pull request #12 from BobLd/0.1.0-alpha001
Browse files Browse the repository at this point in the history
version 0.1.0-alpha001
  • Loading branch information
BobLd committed Sep 21, 2020
2 parents 540d5a7 + dcbbb5d commit ccdb2d4
Show file tree
Hide file tree
Showing 23 changed files with 1,063 additions and 95 deletions.
8 changes: 8 additions & 0 deletions Tabula.Csv/Tabula.Csv.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,14 @@

<PropertyGroup>
<TargetFrameworks>netcoreapp3.1;netstandard2.0;net45;net451;net452;net46;net461;net462;net47</TargetFrameworks>
<Description>Extract tables from PDF files (port of tabula-java using PdfPig). Csv and Tsv writers.</Description>
<PackageProjectUrl>https://github.com/BobLd/tabula-sharp</PackageProjectUrl>
<Version>0.1.0-alpha001</Version>
</PropertyGroup>

<!-- The documentation file path is relative to the project directory, so the build does not depend on one machine's disk layout. -->
<PropertyGroup Condition="'$(Configuration)|$(TargetFramework)|$(Platform)'=='Release|netcoreapp3.1|AnyCPU'">
  <DocumentationFile>Tabula.Csv.xml</DocumentationFile>
  <WarningLevel>3</WarningLevel>
</PropertyGroup>

<ItemGroup>
Expand Down
8 changes: 8 additions & 0 deletions Tabula.Csv/Tabula.Csv.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 7 additions & 0 deletions Tabula.Json/Tabula.Json.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,13 @@

<PropertyGroup>
<TargetFrameworks>netcoreapp3.1;netstandard2.0;net45;net451;net452;net46;net461;net462;net47</TargetFrameworks>
<Description>Extract tables from PDF files (port of tabula-java using PdfPig). Json writer.</Description>
<PackageProjectUrl>https://github.com/BobLd/tabula-sharp</PackageProjectUrl>
<Version>0.1.0-alpha001</Version>
</PropertyGroup>

<!-- The documentation file path is relative to the project directory, so the build does not depend on one machine's disk layout. -->
<PropertyGroup Condition="'$(Configuration)|$(TargetFramework)|$(Platform)'=='Release|netcoreapp3.1|AnyCPU'">
  <DocumentationFile>Tabula.Json.xml</DocumentationFile>
</PropertyGroup>

<ItemGroup>
Expand Down
8 changes: 8 additions & 0 deletions Tabula.Json/Tabula.Json.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

16 changes: 12 additions & 4 deletions Tabula/Cell.cs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ namespace Tabula
* ** tabula/Cell.java **
*/
/// <summary>
///
/// A cell in a table.
/// </summary>
public class Cell : RectangularTextContainer<TextChunk>
{
Expand All @@ -19,7 +19,7 @@ public class Cell : RectangularTextContainer<TextChunk>
public static Cell EMPTY => new Cell(new PdfRectangle());

/// <summary>
///
/// Create a cell in a table.
/// </summary>
/// <param name="pdfRectangle"></param>
public Cell(PdfRectangle pdfRectangle)
Expand All @@ -31,7 +31,7 @@ public Cell(PdfRectangle pdfRectangle)
}

/// <summary>
///
/// Create a cell in a table.
/// </summary>
/// <param name="chunk"></param>
public Cell(TextChunk chunk)
Expand All @@ -41,7 +41,7 @@ public Cell(TextChunk chunk)
}

/// <summary>
///
/// Create a cell in a table.
/// </summary>
/// <param name="topLeft"></param>
/// <param name="bottomRight"></param>
Expand All @@ -59,6 +59,10 @@ public Cell(PdfPoint topLeft, PdfPoint bottomRight)
}
}

/// <summary>
/// Gets the cell's text.
/// </summary>
/// <param name="useLineReturns"></param>
public override string GetText(bool useLineReturns)
{
if (base.textElements.Count == 0)
Expand All @@ -81,6 +85,9 @@ public override string GetText(bool useLineReturns)
return sb.ToString().Trim();
}

/// <summary>
/// Gets the cell's text.
/// </summary>
public override string GetText()
{
return GetText(true);
Expand All @@ -100,6 +107,7 @@ public void SetPlaceholder(bool placeholder)
this.IsPlaceholder = placeholder;
}

/// <inheritdoc/>
public override string ToString()
{
return GetText();
Expand Down
10 changes: 9 additions & 1 deletion Tabula/Detectors/IDetectionAlgorithm.cs
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,20 @@
namespace Tabula.Detectors
{
// https://github.com/tabulapdf/tabula-java/blob/master/src/main/java/technology/
/**
/*
* ** tabula/detectors/DetectionAlgorithm.java **
* Created by matt on 2015-12-14.
*/

/// <summary>
/// Table detection algorithm.
/// </summary>
public interface IDetectionAlgorithm
{
/// <summary>
/// Detects the tables in the page.
/// </summary>
/// <param name="page">The page where to detect the tables.</param>
/// <returns>A list of <see cref="TableRectangle"/>; presumably one rectangle per detected table area (confirm against the implementations).</returns>
List<TableRectangle> Detect(PageArea page);
}
}
6 changes: 5 additions & 1 deletion Tabula/Detectors/NurminenDetectionAlgorithm.cs
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,18 @@

namespace Tabula.Detectors
{
/**
/*
* ** tabula/detectors/NurminenDetectionAlgorithm.java **
* Created by matt on 2015-12-17.
* <p>
* Attempt at an implementation of the table finding algorithm described by
* Anssi Nurminen's master's thesis:
* http://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3
*/

/// <summary>
/// Nurminen detection algorithm.
/// </summary>
public class NurminenDetectionAlgorithm : IDetectionAlgorithm
{
private static int GRAYSCALE_INTENSITY_THRESHOLD = 25;
Expand Down
4 changes: 4 additions & 0 deletions Tabula/Detectors/SpreadsheetDetectionAlgorithm.cs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,10 @@ namespace Tabula.Detectors
/// </summary>
public class SpreadsheetDetectionAlgorithm : IDetectionAlgorithm
{
/// <summary>
/// Detects the tables in the page.
/// </summary>
/// <param name="page">The page where to detect the tables.</param>
public List<TableRectangle> Detect(PageArea page)
{
List<Cell> cells = SpreadsheetExtractionAlgorithm.FindCells(page.HorizontalRulings, page.VerticalRulings);
Expand Down
19 changes: 15 additions & 4 deletions Tabula/Extractors/BasicExtractionAlgorithm.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,27 +4,33 @@
namespace Tabula.Extractors
{
/// <summary>
/// stream
/// Stream extraction algorithm.
/// </summary>
public class BasicExtractionAlgorithm : IExtractionAlgorithm
{
private IReadOnlyList<Ruling> verticalRulings;

/// <summary>
/// stream
/// Stream extraction algorithm.
/// </summary>
public BasicExtractionAlgorithm()
{
}

/// <summary>
/// stream
/// Stream extraction algorithm.
/// </summary>
/// <param name="verticalRulings">List of vertical rulings.</param>
public BasicExtractionAlgorithm(IReadOnlyList<Ruling> verticalRulings)
{
this.verticalRulings = verticalRulings;
}

/// <summary>
/// Extracts the tables in the page.
/// </summary>
/// <param name="page">The page where to extract the tables.</param>
/// <param name="verticalRulingPositions">List of vertical rulings, indicated by their x position.</param>
public List<Table> Extract(PageArea page, IReadOnlyList<float> verticalRulingPositions)
{
List<Ruling> verticalRulings = new List<Ruling>(verticalRulingPositions.Count);
Expand All @@ -36,6 +42,10 @@ public List<Table> Extract(PageArea page, IReadOnlyList<float> verticalRulingPos
return this.Extract(page);
}

/// <summary>
/// Extracts the tables in the page.
/// </summary>
/// <param name="page">The page where to extract the tables.</param>
public List<Table> Extract(PageArea page)
{
List<TextElement> textElements = page.GetText();
Expand Down Expand Up @@ -113,13 +123,14 @@ public List<Table> Extract(PageArea page)
return new Table[] { table }.ToList();
}

/// <summary>
/// Returns the short identifier of this algorithm.
/// </summary>
/// <returns>The literal string <c>"stream"</c>.</returns>
public override string ToString() => "stream";

/// <summary>
///
/// Gets the column positions.
/// </summary>
/// <param name="lines">Must be an array of lines sorted by their +top+ attribute.</param>
/// <returns>a list of column boundaries (x axis).</returns>
Expand Down
7 changes: 7 additions & 0 deletions Tabula/Extractors/IExtractionAlgorithm.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,15 @@

namespace Tabula.Extractors
{
/// <summary>
/// Table extraction algorithm.
/// </summary>
public interface IExtractionAlgorithm
{
/// <summary>
/// Extracts the tables in the page.
/// </summary>
/// <param name="page">The page where to extract the tables.</param>
/// <returns>A list of the <see cref="Table"/> instances extracted from the page.</returns>
List<Table> Extract(PageArea page);
}
}
33 changes: 24 additions & 9 deletions Tabula/Extractors/SpreadsheetExtractionAlgorithm.cs
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,12 @@ namespace Tabula.Extractors
{
//https://github.com/tabulapdf/tabula-java/blob/master/src/main/java/technology/tabula/extractors/SpreadsheetExtractionAlgorithm.java
/// <summary>
/// lattice
/// Lattice extraction algorithm.
/// </summary>
public class SpreadsheetExtractionAlgorithm : IExtractionAlgorithm
{
/// <summary>
/// lattice
/// Lattice extraction algorithm.
/// </summary>
public SpreadsheetExtractionAlgorithm()
{
Expand Down Expand Up @@ -81,13 +81,17 @@ public int Compare(PdfPoint arg0, PdfPoint arg1)
}
}

/// <summary>
/// Extracts the tables in the page, using the rulings obtained from the page itself.
/// </summary>
/// <param name="page">The page where to extract the tables.</param>
/// <returns>The result of the rulings-based overload called with <c>page.GetRulings()</c>.</returns>
public List<Table> Extract(PageArea page) => Extract(page, page.GetRulings());

/// <summary>
/// Extract a list of Table from page using rulings as separators
/// Extracts the tables in the page using rulings as separators.
/// </summary>
/// <param name="page"></param>
/// <param name="rulings"></param>
Expand Down Expand Up @@ -159,6 +163,10 @@ public List<Table> Extract(PageArea page, IReadOnlyList<Ruling> rulings)
return spreadsheets;
}

/// <summary>
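/// Checks whether the page looks tabular, comparing the row and column counts of the table defined by ruling lines with those of the table found without lines.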
/// </summary>
/// <param name="page"></param>
public bool IsTabular(PageArea page)
{
// if there's no text at all on the page, it's not a table
Expand All @@ -177,6 +185,7 @@ public bool IsTabular(PageArea page)
{
return false;
}

Table table = tables[0];
int rowsDefinedByLines = table.RowCount;
int colsDefinedByLines = table.ColumnCount;
Expand All @@ -187,6 +196,7 @@ public bool IsTabular(PageArea page)
// TODO WHAT DO WE DO HERE?
System.Diagnostics.Debug.Write("SpreadsheetExtractionAlgorithm.isTabular(): no table found.");
}

table = tables[0];
int rowsDefinedWithoutLines = table.RowCount;
int colsDefinedWithoutLines = table.ColumnCount;
Expand All @@ -196,6 +206,11 @@ public bool IsTabular(PageArea page)
return ratio > MAGIC_HEURISTIC_NUMBER && ratio < (1 / MAGIC_HEURISTIC_NUMBER);
}

/// <summary>
/// Find cells from horizontal and vertical ruling lines.
/// </summary>
/// <param name="horizontalRulingLines"></param>
/// <param name="verticalRulingLines"></param>
public static List<Cell> FindCells(IReadOnlyList<Ruling> horizontalRulingLines, IReadOnlyList<Ruling> verticalRulingLines)
{
List<Cell> cellsFound = new List<Cell>();
Expand Down Expand Up @@ -266,6 +281,11 @@ public static List<Cell> FindCells(IReadOnlyList<Ruling> horizontalRulingLines,
return cellsFound;
}

/// <summary>
/// Finds the spreadsheet areas from cells.
/// <para>Based on O'Rourke's `Uniqueness of orthogonal connect-the-dots`.</para>
/// </summary>
/// <param name="cells"></param>
public static List<TableRectangle> FindSpreadsheetsFromCells(List<TableRectangle> cells)
{
// via: http://stackoverflow.com/questions/13746284/merging-multiple-adjacent-rectangles-into-one-polygon
Expand Down Expand Up @@ -390,6 +410,7 @@ public static List<TableRectangle> FindSpreadsheetsFromCells(List<TableRectangle
return rectangles;
}

/// <inheritdoc/>
public override string ToString()
{
return "lattice";
Expand Down Expand Up @@ -419,12 +440,6 @@ public override bool Equals(object other)
return this.point.Equals(o.point);
}
return false;
/*
if (this == other)
return true;
if (!(other is PolygonVertex)) return false;
return this.point.Equals(((PolygonVertex)other).point);
*/
}

public override int GetHashCode()
Expand Down
Loading

0 comments on commit ccdb2d4

Please sign in to comment.