Skip to content

Commit

Permalink
Update to 0.1.4-alpha001
Browse files Browse the repository at this point in the history
Update to 0.1.4-alpha001
- Remove reference to net5
- Update supported dotnet versions in README
- Make ObjectExtractor static and add PdfPigExtensionsTests
- Update PdfPig NuGet package to 0.1.9-alpha-20231019-c6e2d
- Seal classes, make clipper internal
  • Loading branch information
BobLd committed Oct 19, 2023
2 parents fe6e6e5 + 4eef723 commit b29d4f8
Show file tree
Hide file tree
Showing 40 changed files with 5,160 additions and 4,654 deletions.
6 changes: 1 addition & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
![Linux](https://github.com/BobLd/tabula-sharp/workflows/Linux/badge.svg)
![Mac OS](https://github.com/BobLd/tabula-sharp/workflows/Mac%20OS/badge.svg)

- Supports .NET 5, .NET Core 3.1, .NET Standard 2.0, .NET Framework 4.5, 4.51, 4.52, 4.6, 4.61, 4.62, 4.7
- Supports .NET 6, .NET Core 3.1, .NET Standard 2.0, .NET Framework 4.5.2, 4.6, 4.6.1, 4.6.2, 4.7
- No Java bindings

NuGet packages are available on the [releases](https://github.com/BobLd/tabula-sharp/releases) page and on www.nuget.org:
Expand Down Expand Up @@ -56,7 +56,3 @@ using (PdfDocument document = PdfDocument.Open("doc.pdf", new ParsingOptions() {
![example](images/stream-us-018.png)
## Lattice mode - SpreadsheetExtractionAlgorithm
![example](images/lattice-eu-004.png)

# HELP WANTED
- The original java implementation uses STR trees in [`RectangleSpatialIndex`](https://github.com/tabulapdf/tabula-java/blob/master/src/main/java/technology/tabula/RectangleSpatialIndex.java). This is not the case here so it might be a bit slower. Any help implementing a similar approach is welcome.

6 changes: 3 additions & 3 deletions Tabula.Csv/Tabula.Csv.csproj
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
<Project Sdk="Microsoft.NET.Sdk">

<PropertyGroup>
<TargetFrameworks>netcoreapp3.1;netstandard2.0;net452;net46;net461;net462;net47;net5.0;net6.0</TargetFrameworks>
<TargetFrameworks>netcoreapp3.1;netstandard2.0;net452;net46;net461;net462;net47;net6.0</TargetFrameworks>
<Description>Extract tables from PDF files (port of tabula-java using PdfPig). Csv and Tsv writers.</Description>
<PackageProjectUrl>https://github.com/BobLd/tabula-sharp</PackageProjectUrl>
<Version>0.1.3</Version>
<Version>0.1.4-alpha001</Version>
<Authors>BobLd</Authors>
<PackageTags>pdf, extract, table, tabula, pdfpig, parse, extraction, csv, tsv, excel, export</PackageTags>
<PackageLicenseExpression>MIT</PackageLicenseExpression>
Expand All @@ -22,7 +22,7 @@
</ItemGroup>

<ItemGroup>
<PackageReference Include="CsvHelper" Version="27.2.1" />
<PackageReference Include="CsvHelper" Version="30.0.1" />
</ItemGroup>

<ItemGroup>
Expand Down
6 changes: 3 additions & 3 deletions Tabula.Json/Tabula.Json.csproj
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
<Project Sdk="Microsoft.NET.Sdk">

<PropertyGroup>
<TargetFrameworks>netcoreapp3.1;netstandard2.0;net452;net46;net461;net462;net47;net5.0;net6.0</TargetFrameworks>
<TargetFrameworks>netcoreapp3.1;netstandard2.0;net452;net46;net461;net462;net47;net6.0</TargetFrameworks>
<Description>Extract tables from PDF files (port of tabula-java using PdfPig). Json writer.</Description>
<PackageProjectUrl>https://github.com/BobLd/tabula-sharp</PackageProjectUrl>
<Version>0.1.3</Version>
<Version>0.1.4-alpha001</Version>
<Company>BobLd</Company>
<Authors>BobLd</Authors>
<PackageTags>pdf, extract, table, tabula, pdfpig, parse, extraction, json, export</PackageTags>
Expand All @@ -22,7 +22,7 @@
</ItemGroup>

<ItemGroup>
<PackageReference Include="Newtonsoft.Json" Version="13.0.1" />
<PackageReference Include="Newtonsoft.Json" Version="13.0.3" />
</ItemGroup>

<ItemGroup>
Expand Down
462 changes: 462 additions & 0 deletions Tabula.Tests/PdfPigExtensionsTests.cs

Large diffs are not rendered by default.

18 changes: 12 additions & 6 deletions Tabula.Tests/Tabula.Tests.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,18 @@
</PropertyGroup>

<ItemGroup>
<PackageReference Include="CsvHelper" Version="27.2.1" />
<PackageReference Include="Microsoft.NET.Test.Sdk" Version="16.5.0" />
<PackageReference Include="Newtonsoft.Json" Version="13.0.1" />
<PackageReference Include="xunit" Version="2.4.0" />
<PackageReference Include="xunit.runner.visualstudio" Version="2.4.0" />
<PackageReference Include="coverlet.collector" Version="1.2.0" />
<PackageReference Include="CsvHelper" Version="30.0.1" />
<PackageReference Include="Microsoft.NET.Test.Sdk" Version="17.6.2" />
<PackageReference Include="Newtonsoft.Json" Version="13.0.3" />
<PackageReference Include="xunit" Version="2.4.2" />
<PackageReference Include="xunit.runner.visualstudio" Version="2.4.5">
<PrivateAssets>all</PrivateAssets>
<IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
</PackageReference>
<PackageReference Include="coverlet.collector" Version="6.0.0">
<PrivateAssets>all</PrivateAssets>
<IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
</PackageReference>
</ItemGroup>

<ItemGroup>
Expand Down
29 changes: 9 additions & 20 deletions Tabula.Tests/TestObjectExtractor.cs
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,7 @@ public void TestCanReadPDFWithOwnerEncryption()
{
using (PdfDocument pdf_document = PdfDocument.Open("Resources/S2MNCEbirdisland.pdf"))
{
ObjectExtractor oe = new ObjectExtractor(pdf_document);
PageIterator pi = oe.Extract();
PageIterator pi = ObjectExtractor.Extract(pdf_document);
int i = 0;
while (pi.MoveNext())
{
Expand All @@ -39,9 +38,8 @@ public void TestGoodPassword()
{
using (PdfDocument pdf_document = PdfDocument.Open("Resources/encrypted.pdf", new ParsingOptions() { Password = "userpassword" }))
{
ObjectExtractor oe = new ObjectExtractor(pdf_document);
List<PageArea> pages = new List<PageArea>();
PageIterator pi = oe.Extract();
PageIterator pi = ObjectExtractor.Extract(pdf_document);
while (pi.MoveNext())
{
pages.Add(pi.Current);
Expand All @@ -55,8 +53,7 @@ public void TestTextExtractionDoesNotRaise()
{
using (PdfDocument pdf_document = PdfDocument.Open("Resources/rotated_page.pdf", new ParsingOptions() { ClipPaths = true }))
{
ObjectExtractor oe = new ObjectExtractor(pdf_document);
PageIterator pi = oe.Extract();
PageIterator pi = ObjectExtractor.Extract(pdf_document);

Assert.True(pi.MoveNext());
Assert.NotNull(pi.Current);
Expand All @@ -69,8 +66,7 @@ public void TestShouldDetectRulings()
{
using (PdfDocument pdf_document = PdfDocument.Open("Resources/should_detect_rulings.pdf", new ParsingOptions() { ClipPaths = true }))
{
ObjectExtractor oe = new ObjectExtractor(pdf_document);
PageIterator pi = oe.Extract();
PageIterator pi = ObjectExtractor.Extract(pdf_document);

PageArea page = pi.Next();
IReadOnlyList<Ruling> rulings = page.GetRulings();
Expand All @@ -87,8 +83,7 @@ public void TestDontThrowNPEInShfill()
{
using (PdfDocument pdf_document = PdfDocument.Open("Resources/labor.pdf", new ParsingOptions() { ClipPaths = true }))
{
ObjectExtractor oe = new ObjectExtractor(pdf_document);
PageIterator pi = oe.Extract();
PageIterator pi = ObjectExtractor.Extract(pdf_document);
Assert.True(pi.MoveNext());

PageArea p = pi.Current;
Expand All @@ -103,8 +98,7 @@ public void TestExtractOnePage()
{
Assert.Equal(2, pdf_document.NumberOfPages);

ObjectExtractor oe = new ObjectExtractor(pdf_document);
PageArea page = oe.Extract(2);
PageArea page = ObjectExtractor.Extract(pdf_document, 2);

Assert.NotNull(page);
}
Expand All @@ -117,8 +111,7 @@ public void TestExtractWrongPageNumber()// throws IOException
{
Assert.Equal(2, pdf_document.NumberOfPages);

ObjectExtractor oe = new ObjectExtractor(pdf_document);
Assert.Throws<IndexOutOfRangeException>(() => oe.Extract(3));
Assert.Throws<IndexOutOfRangeException>(() => ObjectExtractor.Extract(pdf_document, 3));
}
}

Expand All @@ -127,9 +120,7 @@ public void TestTextElementsContainedInPage()
{
using (PdfDocument pdf_document = PdfDocument.Open("Resources/cs-en-us-pbms.pdf", new ParsingOptions() { ClipPaths = true }))
{
ObjectExtractor oe = new ObjectExtractor(pdf_document);

PageArea page = oe.ExtractPage(1);
PageArea page = ObjectExtractor.ExtractPage(pdf_document, 1);

foreach (TextElement te in page.GetText())
{
Expand All @@ -143,9 +134,7 @@ public void TestDoNotNPEInPointComparator()
{
using (PdfDocument pdf_document = PdfDocument.Open("Resources/npe_issue_206.pdf", new ParsingOptions() { ClipPaths = true }))
{
ObjectExtractor oe = new ObjectExtractor(pdf_document);

PageArea p = oe.ExtractPage(1);
PageArea p = ObjectExtractor.ExtractPage(pdf_document, 1);
Assert.NotNull(p);
}
}
Expand Down
28 changes: 14 additions & 14 deletions Tabula.Tests/TestSpreadsheetExtractor.cs
Original file line number Diff line number Diff line change
Expand Up @@ -212,7 +212,7 @@ public void TestSpanningCells()
PageArea page = UtilsForTesting.GetPage("Resources/spanning_cells.pdf", 1);
string expectedJson = UtilsForTesting.LoadJson("Resources/json/spanning_cells.json");
SpreadsheetExtractionAlgorithm se = new SpreadsheetExtractionAlgorithm();
List<Table> tables = se.Extract(page);
IReadOnlyList<Table> tables = se.Extract(page);
Assert.Equal(2, tables.Count);

var expectedJObject = (JArray)JsonConvert.DeserializeObject(expectedJson);
Expand Down Expand Up @@ -268,7 +268,7 @@ public void TestSpanningCellsToCsv()
PageArea page = UtilsForTesting.GetPage("Resources/spanning_cells.pdf", 1);
string expectedCsv = UtilsForTesting.LoadCsv("Resources/csv/spanning_cells.csv");
SpreadsheetExtractionAlgorithm se = new SpreadsheetExtractionAlgorithm();
List<Table> tables = se.Extract(page);
IReadOnlyList<Table> tables = se.Extract(page);
Assert.Equal(2, tables.Count);

StringBuilder sb = new StringBuilder();
Expand All @@ -281,7 +281,7 @@ public void TestIncompleteGrid()
{
PageArea page = UtilsForTesting.GetPage("Resources/china.pdf", 1);
SpreadsheetExtractionAlgorithm se = new SpreadsheetExtractionAlgorithm();
List<Table> tables = se.Extract(page);
IReadOnlyList<Table> tables = se.Extract(page);
Assert.Equal(2, tables.Count);
}

Expand All @@ -290,7 +290,7 @@ public void TestNaturalOrderOfRectanglesDoesNotBreakContract()
{
PageArea page = UtilsForTesting.GetPage("Resources/us-017.pdf", 2);
SpreadsheetExtractionAlgorithm se = new SpreadsheetExtractionAlgorithm();
List<Table> tables = se.Extract(page);
IReadOnlyList<Table> tables = se.Extract(page);

string expected = "Project,Agency,Institution\r\nNanotechnology and its publics,NSF,Pennsylvania State University\r\n\"Public information and deliberation in nanoscience and\rnanotechnology policy (SGER)\",Interagency,\"North Carolina State\rUniversity\"\r\n\"Social and ethical research and education in agrifood\rnanotechnology (NIRT)\",NSF,Michigan State University\r\n\"From laboratory to society: developing an informed\rapproach to nanoscale science and engineering (NIRT)\",NSF,University of South Carolina\r\nDatabase and innovation timeline for nanotechnology,NSF,UCLA\r\nSocial and ethical dimensions of nanotechnology,NSF,University of Virginia\r\n\"Undergraduate exploration of nanoscience,\rapplications and societal implications (NUE)\",NSF,\"Michigan Technological\rUniversity\"\r\n\"Ethics and belief inside the development of\rnanotechnology (CAREER)\",NSF,University of Virginia\r\n\"All centers, NNIN and NCN have a societal\rimplications components\",\"NSF, DOE,\rDOD, and NIH\",\"All nanotechnology centers\rand networks\""; // \r\n

Expand Down Expand Up @@ -325,7 +325,7 @@ public void TestSpreadsheetWithNoBoundingFrameShouldBeSpreadsheet()
SpreadsheetExtractionAlgorithm se = new SpreadsheetExtractionAlgorithm();
bool isTabular = se.IsTabular(page);
Assert.True(isTabular);
List<Table> tables = se.Extract(page);
IReadOnlyList<Table> tables = se.Extract(page);

StringBuilder sb = new StringBuilder();
(new CSVWriter()).Write(sb, tables[0]);
Expand All @@ -337,7 +337,7 @@ public void TestExtractSpreadsheetWithinAnArea()
{
PageArea page = UtilsForTesting.GetAreaFromPage("Resources/puertos1.pdf", 1, new PdfRectangle(30.32142857142857, 793 - 554.8821428571429, 546.7964285714286, 793 - 273.9035714285714)); // 273.9035714285714f, 30.32142857142857f, 554.8821428571429f, 546.7964285714286f);
SpreadsheetExtractionAlgorithm se = new SpreadsheetExtractionAlgorithm();
List<Table> tables = se.Extract(page);
IReadOnlyList<Table> tables = se.Extract(page);
Table table = tables[0];
Assert.Equal(15, table.Rows.Count);

Expand Down Expand Up @@ -417,7 +417,7 @@ public void TestShouldDetectASingleSpreadsheet()
{
PageArea page = UtilsForTesting.GetAreaFromPage("Resources/offense.pdf", 1, new PdfRectangle(16.44, 792 - 680.85, 597.84, 792 - 16.44)); // 68.08f, 16.44f, 680.85f, 597.84f);
SpreadsheetExtractionAlgorithm bea = new SpreadsheetExtractionAlgorithm();
List<Table> tables = bea.Extract(page);
IReadOnlyList<Table> tables = bea.Extract(page);
Assert.Single(tables);
}

Expand All @@ -426,7 +426,7 @@ public void TestExtractTableWithExternallyDefinedRulings()
{
PageArea page = UtilsForTesting.GetPage("Resources/us-007.pdf", 1);
SpreadsheetExtractionAlgorithm bea = new SpreadsheetExtractionAlgorithm();
List<Table> tables = bea.Extract(page, EXTERNALLY_DEFINED_RULINGS.ToList());
IReadOnlyList<Table> tables = bea.Extract(page, EXTERNALLY_DEFINED_RULINGS.ToList());
Assert.Single(tables);
Table table = tables[0];
Assert.Equal(18, table.Cells.Count);
Expand Down Expand Up @@ -458,7 +458,7 @@ public void TestAnotherExtractTableWithExternallyDefinedRulings()
{
PageArea page = UtilsForTesting.GetPage("Resources/us-024.pdf", 1);
SpreadsheetExtractionAlgorithm bea = new SpreadsheetExtractionAlgorithm();
List<Table> tables = bea.Extract(page, EXTERNALLY_DEFINED_RULINGS2.ToList());
IReadOnlyList<Table> tables = bea.Extract(page, EXTERNALLY_DEFINED_RULINGS2.ToList());
Assert.Single(tables);
Table table = tables[0];

Expand All @@ -472,7 +472,7 @@ public void TestSpreadsheetsSortedByTopAndRight()
PageArea page = UtilsForTesting.GetPage("Resources/sydney_disclosure_contract.pdf", 1);

SpreadsheetExtractionAlgorithm sea = new SpreadsheetExtractionAlgorithm();
List<Table> tables = sea.Extract(page);
IReadOnlyList<Table> tables = sea.Extract(page);
for (int i = 1; i < tables.Count; i++)
{
Assert.True(tables[i - 1].Top >= tables[i].Top); // Assert.True(tables[i - 1].getTop() <= tables[i].getTop());
Expand All @@ -485,7 +485,7 @@ public void TestDontStackOverflowQuicksort()
PageArea page = UtilsForTesting.GetPage("Resources/failing_sort.pdf", 1);

SpreadsheetExtractionAlgorithm sea = new SpreadsheetExtractionAlgorithm();
List<Table> tables = sea.Extract(page);
IReadOnlyList<Table> tables = sea.Extract(page);
for (int i = 1; i < tables.Count; i++)
{
Assert.True(tables[i - 1].Top >= tables[i].Top); //Assert.True(tables[i - 1].getTop() <= tables[i].getTop());
Expand All @@ -497,7 +497,7 @@ public void TestRTL()
{
PageArea page = UtilsForTesting.GetPage("Resources/arabic.pdf", 1);
SpreadsheetExtractionAlgorithm sea = new SpreadsheetExtractionAlgorithm();
List<Table> tables = sea.Extract(page);
IReadOnlyList<Table> tables = sea.Extract(page);
// Assert.Equal(1, tables.size());
Table table = tables[0];

Expand Down Expand Up @@ -528,7 +528,7 @@ public void TestRealLifeRTL()
{
PageArea page = UtilsForTesting.GetPage("Resources/mednine.pdf", 1);
SpreadsheetExtractionAlgorithm sea = new SpreadsheetExtractionAlgorithm();
List<Table> tables = sea.Extract(page);
IReadOnlyList<Table> tables = sea.Extract(page);
Assert.Single(tables);
Table table = tables[0];
var rows = table.Rows;
Expand Down Expand Up @@ -580,7 +580,7 @@ public void TestSpreadsheetExtractionIssue656()
string expectedCsv = UtilsForTesting.LoadCsv("Resources/csv/Publication_of_award_of_Bids_for_Transport_Sector__August_2016.csv");

SpreadsheetExtractionAlgorithm sea = new SpreadsheetExtractionAlgorithm();
List<Table> tables = sea.Extract(page);
IReadOnlyList<Table> tables = sea.Extract(page);
Assert.Single(tables);
Table table = tables[0];

Expand Down
2 changes: 1 addition & 1 deletion Tabula.Tests/TestTableDetection.cs
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ public class TestTableDetection

//private static Level defaultLogLevel;

private class TestStatus
private sealed class TestStatus
{
public int numExpectedTables;
public int numCorrectlyDetectedTables;
Expand Down
6 changes: 3 additions & 3 deletions Tabula.Tests/TestWriters.cs
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ private Table GetTable()
return bea.Extract(page)[0];
}

private List<Table> GetTables()
private IReadOnlyList<Table> GetTables()
{
PageArea page = UtilsForTesting.GetPage("Resources/twotables.pdf", 1);
SpreadsheetExtractionAlgorithm sea = new SpreadsheetExtractionAlgorithm();
Expand Down Expand Up @@ -144,7 +144,7 @@ public void TestCSVSerializeInfinity()
public void TestJSONSerializeTwoTables()
{
string expectedJson = UtilsForTesting.LoadJson("Resources/json/twotables.json");
List<Table> tables = this.GetTables();
IReadOnlyList<Table> tables = this.GetTables();

StringBuilder sb = new StringBuilder();
(new JSONWriter()).Write(sb, tables);
Expand Down Expand Up @@ -178,7 +178,7 @@ public void TestJSONSerializeTwoTables()
public void TestCSVSerializeTwoTables()
{
string expectedCsv = UtilsForTesting.LoadCsv("Resources/csv/twotables.csv");
List<Table> tables = this.GetTables();
IReadOnlyList<Table> tables = this.GetTables();

/*
StringBuilder sb = new StringBuilder();
Expand Down
3 changes: 1 addition & 2 deletions Tabula.Tests/TestsIcdar2013.cs
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,7 @@ public void Eu004()
{
using (PdfDocument document = PdfDocument.Open("Resources/icdar2013-dataset/competition-dataset-eu/eu-004.pdf", new ParsingOptions() { ClipPaths = true }))
{
ObjectExtractor oe = new ObjectExtractor(document);
PageArea page = oe.Extract(3);
PageArea page = ObjectExtractor.Extract(document, 3);

var detector = new SimpleNurminenDetectionAlgorithm();
var regions = detector.Detect(page);
Expand Down
5 changes: 2 additions & 3 deletions Tabula.Tests/TestsNurminenDetector.cs
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,7 @@ public void TestLinesToCells()
{
using (PdfDocument document = PdfDocument.Open("test3.pdf", new ParsingOptions() { ClipPaths = true }))
{
ObjectExtractor oe = new ObjectExtractor(document);
PageArea page = oe.Extract(1);
PageArea page = ObjectExtractor.Extract(document, 1);

SimpleNurminenDetectionAlgorithm detector = new SimpleNurminenDetectionAlgorithm();
var regions = detector.Detect(page);
Expand All @@ -25,7 +24,7 @@ public void TestLinesToCells()
{
IExtractionAlgorithm ea = new BasicExtractionAlgorithm();
var newArea = page.GetArea(a.BoundingBox);
List<Table> tables = ea.Extract(newArea);
IReadOnlyList<Table> tables = ea.Extract(newArea);
}
}
}
Expand Down
Loading

0 comments on commit b29d4f8

Please sign in to comment.