Update to 0.1.4-alpha001

Update to 0.1.4-alpha001
- Remove reference to net5
- Update supported dotnet versions in README
- Make ObjectExtractor static and add PdfPigExtensionsTests
- Update PdfPig NuGet package to 0.1.9-alpha-20231019-c6e2d
- Seal classes, make clipper internal
BobLd · Oct 19, 2023 · b29d4f8
2 parents fe6e6e5 + 4eef723
commit b29d4f8
Show file tree

Hide file tree

Showing 40 changed files with 5,160 additions and 4,654 deletions.
diff --git a/README.md b/README.md
@@ -5,7 +5,7 @@
 ![Linux](https://github.com/BobLd/tabula-sharp/workflows/Linux/badge.svg)
 ![Mac OS](https://github.com/BobLd/tabula-sharp/workflows/Mac%20OS/badge.svg)
 
-- Supports .NET 5, .NET Core 3.1, .NET Standard 2.0, .NET Framework 4.5, 4.51, 4.52, 4.6, 4.61, 4.62, 4.7
+- Supports .NET 6, .NET Core 3.1, .NET Standard 2.0, .NET Framework 4.52, 4.6, 4.61, 4.62, 4.7
 - No java bindings
 
 NuGet packages available on the [releases](https://github.com/BobLd/tabula-sharp/releases) page and on www.nuget.org:
@@ -56,7 +56,3 @@ using (PdfDocument document = PdfDocument.Open("doc.pdf", new ParsingOptions() {
 ![example](images/stream-us-018.png)
 ## Lattice mode - SpreadsheetExtractionAlgorithm
 ![example](images/lattice-eu-004.png)
-
-# HELP WANTED
-- The original java implementation uses STR trees in [`RectangleSpatialIndex`](https://github.com/tabulapdf/tabula-java/blob/master/src/main/java/technology/tabula/RectangleSpatialIndex.java). This is not the case here so it might be a bit slower. Any help implementing a similar approach is welcome.
-
diff --git a/Tabula.Csv/Tabula.Csv.csproj b/Tabula.Csv/Tabula.Csv.csproj
@@ -1,10 +1,10 @@
 <Project Sdk="Microsoft.NET.Sdk">
 
   <PropertyGroup>
-    <TargetFrameworks>netcoreapp3.1;netstandard2.0;net452;net46;net461;net462;net47;net5.0;net6.0</TargetFrameworks>
+    <TargetFrameworks>netcoreapp3.1;netstandard2.0;net452;net46;net461;net462;net47;net6.0</TargetFrameworks>
     <Description>Extract tables from PDF files (port of tabula-java using PdfPig). Csv and Tsv writers.</Description>
     <PackageProjectUrl>https://github.com/BobLd/tabula-sharp</PackageProjectUrl>
-    <Version>0.1.3</Version>
+    <Version>0.1.4-alpha001</Version>
     <Authors>BobLd</Authors>
     <PackageTags>pdf, extract, table, tabula, pdfpig, parse, extraction, csv, tsv, excel, export</PackageTags>
     <PackageLicenseExpression>MIT</PackageLicenseExpression>
@@ -22,7 +22,7 @@
   </ItemGroup>
 
   <ItemGroup>
-    <PackageReference Include="CsvHelper" Version="27.2.1" />
+    <PackageReference Include="CsvHelper" Version="30.0.1" />
   </ItemGroup>
 
   <ItemGroup>

diff --git a/Tabula.Json/Tabula.Json.csproj b/Tabula.Json/Tabula.Json.csproj
@@ -1,10 +1,10 @@
 <Project Sdk="Microsoft.NET.Sdk">
 
   <PropertyGroup>
-    <TargetFrameworks>netcoreapp3.1;netstandard2.0;net452;net46;net461;net462;net47;net5.0;net6.0</TargetFrameworks>
+    <TargetFrameworks>netcoreapp3.1;netstandard2.0;net452;net46;net461;net462;net47;net6.0</TargetFrameworks>
     <Description>Extract tables from PDF files (port of tabula-java using PdfPig). Json writer.</Description>
     <PackageProjectUrl>https://github.com/BobLd/tabula-sharp</PackageProjectUrl>
-    <Version>0.1.3</Version>
+    <Version>0.1.4-alpha001</Version>
     <Company>BobLd</Company>
     <Authors>BobLd</Authors>
     <PackageTags>pdf, extract, table, tabula, pdfpig, parse, extraction, json, export</PackageTags>
@@ -22,7 +22,7 @@
   </ItemGroup>
 
   <ItemGroup>
-    <PackageReference Include="Newtonsoft.Json" Version="13.0.1" />
+    <PackageReference Include="Newtonsoft.Json" Version="13.0.3" />
   </ItemGroup>
 
   <ItemGroup>

diff --git a/Tabula.Tests/PdfPigExtensionsTests.cs b/Tabula.Tests/PdfPigExtensionsTests.cs
diff --git a/Tabula.Tests/Tabula.Tests.csproj b/Tabula.Tests/Tabula.Tests.csproj
@@ -9,12 +9,18 @@
   </PropertyGroup>
 
   <ItemGroup>
-    <PackageReference Include="CsvHelper" Version="27.2.1" />
-    <PackageReference Include="Microsoft.NET.Test.Sdk" Version="16.5.0" />
-    <PackageReference Include="Newtonsoft.Json" Version="13.0.1" />
-    <PackageReference Include="xunit" Version="2.4.0" />
-    <PackageReference Include="xunit.runner.visualstudio" Version="2.4.0" />
-    <PackageReference Include="coverlet.collector" Version="1.2.0" />
+    <PackageReference Include="CsvHelper" Version="30.0.1" />
+    <PackageReference Include="Microsoft.NET.Test.Sdk" Version="17.6.2" />
+    <PackageReference Include="Newtonsoft.Json" Version="13.0.3" />
+    <PackageReference Include="xunit" Version="2.4.2" />
+    <PackageReference Include="xunit.runner.visualstudio" Version="2.4.5">
+      <PrivateAssets>all</PrivateAssets>
+      <IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
+    </PackageReference>
+    <PackageReference Include="coverlet.collector" Version="6.0.0">
+      <PrivateAssets>all</PrivateAssets>
+      <IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
+    </PackageReference>
   </ItemGroup>
 
   <ItemGroup>

diff --git a/Tabula.Tests/TestObjectExtractor.cs b/Tabula.Tests/TestObjectExtractor.cs
@@ -23,8 +23,7 @@ public void TestCanReadPDFWithOwnerEncryption()
         {
             using (PdfDocument pdf_document = PdfDocument.Open("Resources/S2MNCEbirdisland.pdf"))
             {
-                ObjectExtractor oe = new ObjectExtractor(pdf_document);
-                PageIterator pi = oe.Extract();
+                PageIterator pi = ObjectExtractor.Extract(pdf_document);
                 int i = 0;
                 while (pi.MoveNext())
                 {
@@ -39,9 +38,8 @@ public void TestGoodPassword()
         {
             using (PdfDocument pdf_document = PdfDocument.Open("Resources/encrypted.pdf", new ParsingOptions() { Password = "userpassword" }))
             {
-                ObjectExtractor oe = new ObjectExtractor(pdf_document);
                 List<PageArea> pages = new List<PageArea>();
-                PageIterator pi = oe.Extract();
+                PageIterator pi = ObjectExtractor.Extract(pdf_document);
                 while (pi.MoveNext())
                 {
                     pages.Add(pi.Current);
@@ -55,8 +53,7 @@ public void TestTextExtractionDoesNotRaise()
         {
             using (PdfDocument pdf_document = PdfDocument.Open("Resources/rotated_page.pdf", new ParsingOptions() { ClipPaths = true }))
             {
-                ObjectExtractor oe = new ObjectExtractor(pdf_document);
-                PageIterator pi = oe.Extract();
+                PageIterator pi = ObjectExtractor.Extract(pdf_document);
 
                 Assert.True(pi.MoveNext());
                 Assert.NotNull(pi.Current);
@@ -69,8 +66,7 @@ public void TestShouldDetectRulings()
         {
             using (PdfDocument pdf_document = PdfDocument.Open("Resources/should_detect_rulings.pdf", new ParsingOptions() { ClipPaths = true }))
             {
-                ObjectExtractor oe = new ObjectExtractor(pdf_document);
-                PageIterator pi = oe.Extract();
+                PageIterator pi = ObjectExtractor.Extract(pdf_document);
 
                 PageArea page = pi.Next();
                 IReadOnlyList<Ruling> rulings = page.GetRulings();
@@ -87,8 +83,7 @@ public void TestDontThrowNPEInShfill()
         {
             using (PdfDocument pdf_document = PdfDocument.Open("Resources/labor.pdf", new ParsingOptions() { ClipPaths = true }))
             {
-                ObjectExtractor oe = new ObjectExtractor(pdf_document);
-                PageIterator pi = oe.Extract();
+                PageIterator pi = ObjectExtractor.Extract(pdf_document);
                 Assert.True(pi.MoveNext());
 
                 PageArea p = pi.Current;
@@ -103,8 +98,7 @@ public void TestExtractOnePage()
             {
                 Assert.Equal(2, pdf_document.NumberOfPages);
 
-                ObjectExtractor oe = new ObjectExtractor(pdf_document);
-                PageArea page = oe.Extract(2);
+                PageArea page = ObjectExtractor.Extract(pdf_document, 2);
 
                 Assert.NotNull(page);
             }
@@ -117,8 +111,7 @@ public void TestExtractWrongPageNumber()// throws IOException
             {
                 Assert.Equal(2, pdf_document.NumberOfPages);
 
-                ObjectExtractor oe = new ObjectExtractor(pdf_document);
-                Assert.Throws<IndexOutOfRangeException>(() => oe.Extract(3));
+                Assert.Throws<IndexOutOfRangeException>(() => ObjectExtractor.Extract(pdf_document, 3));
             }
         }
 
@@ -127,9 +120,7 @@ public void TestTextElementsContainedInPage()
         {
             using (PdfDocument pdf_document = PdfDocument.Open("Resources/cs-en-us-pbms.pdf", new ParsingOptions() { ClipPaths = true }))
             {
-                ObjectExtractor oe = new ObjectExtractor(pdf_document);
-
-                PageArea page = oe.ExtractPage(1);
+                PageArea page = ObjectExtractor.ExtractPage(pdf_document, 1);
 
                 foreach (TextElement te in page.GetText())
                 {
@@ -143,9 +134,7 @@ public void TestDoNotNPEInPointComparator()
         {
             using (PdfDocument pdf_document = PdfDocument.Open("Resources/npe_issue_206.pdf", new ParsingOptions() { ClipPaths = true }))
             {
-                ObjectExtractor oe = new ObjectExtractor(pdf_document);
-
-                PageArea p = oe.ExtractPage(1);
+                PageArea p = ObjectExtractor.ExtractPage(pdf_document, 1);
                 Assert.NotNull(p);
             }
         }

diff --git a/Tabula.Tests/TestSpreadsheetExtractor.cs b/Tabula.Tests/TestSpreadsheetExtractor.cs
@@ -212,7 +212,7 @@ public void TestSpanningCells()
             PageArea page = UtilsForTesting.GetPage("Resources/spanning_cells.pdf", 1);
             string expectedJson = UtilsForTesting.LoadJson("Resources/json/spanning_cells.json");
             SpreadsheetExtractionAlgorithm se = new SpreadsheetExtractionAlgorithm();
-            List<Table> tables = se.Extract(page);
+            IReadOnlyList<Table> tables = se.Extract(page);
             Assert.Equal(2, tables.Count);
 
             var expectedJObject = (JArray)JsonConvert.DeserializeObject(expectedJson);
@@ -268,7 +268,7 @@ public void TestSpanningCellsToCsv()
             PageArea page = UtilsForTesting.GetPage("Resources/spanning_cells.pdf", 1);
             string expectedCsv = UtilsForTesting.LoadCsv("Resources/csv/spanning_cells.csv");
             SpreadsheetExtractionAlgorithm se = new SpreadsheetExtractionAlgorithm();
-            List<Table> tables = se.Extract(page);
+            IReadOnlyList<Table> tables = se.Extract(page);
             Assert.Equal(2, tables.Count);
 
             StringBuilder sb = new StringBuilder();
@@ -281,7 +281,7 @@ public void TestIncompleteGrid()
         {
             PageArea page = UtilsForTesting.GetPage("Resources/china.pdf", 1);
             SpreadsheetExtractionAlgorithm se = new SpreadsheetExtractionAlgorithm();
-            List<Table> tables = se.Extract(page);
+            IReadOnlyList<Table> tables = se.Extract(page);
             Assert.Equal(2, tables.Count);
         }
 
@@ -290,7 +290,7 @@ public void TestNaturalOrderOfRectanglesDoesNotBreakContract()
         {
             PageArea page = UtilsForTesting.GetPage("Resources/us-017.pdf", 2);
             SpreadsheetExtractionAlgorithm se = new SpreadsheetExtractionAlgorithm();
-            List<Table> tables = se.Extract(page);
+            IReadOnlyList<Table> tables = se.Extract(page);
 
             string expected = "Project,Agency,Institution\r\nNanotechnology and its publics,NSF,Pennsylvania State University\r\n\"Public information and deliberation in nanoscience and\rnanotechnology policy (SGER)\",Interagency,\"North Carolina State\rUniversity\"\r\n\"Social and ethical research and education in agrifood\rnanotechnology (NIRT)\",NSF,Michigan State University\r\n\"From laboratory to society: developing an informed\rapproach to nanoscale science and engineering (NIRT)\",NSF,University of South Carolina\r\nDatabase and innovation timeline for nanotechnology,NSF,UCLA\r\nSocial and ethical dimensions of nanotechnology,NSF,University of Virginia\r\n\"Undergraduate exploration of nanoscience,\rapplications and societal implications (NUE)\",NSF,\"Michigan Technological\rUniversity\"\r\n\"Ethics and belief inside the development of\rnanotechnology (CAREER)\",NSF,University of Virginia\r\n\"All centers, NNIN and NCN have a societal\rimplications components\",\"NSF, DOE,\rDOD, and NIH\",\"All nanotechnology centers\rand networks\""; // \r\n
 
@@ -325,7 +325,7 @@ public void TestSpreadsheetWithNoBoundingFrameShouldBeSpreadsheet()
             SpreadsheetExtractionAlgorithm se = new SpreadsheetExtractionAlgorithm();
             bool isTabular = se.IsTabular(page);
             Assert.True(isTabular);
-            List<Table> tables = se.Extract(page);
+            IReadOnlyList<Table> tables = se.Extract(page);
 
             StringBuilder sb = new StringBuilder();
             (new CSVWriter()).Write(sb, tables[0]);
@@ -337,7 +337,7 @@ public void TestExtractSpreadsheetWithinAnArea()
         {
             PageArea page = UtilsForTesting.GetAreaFromPage("Resources/puertos1.pdf", 1, new PdfRectangle(30.32142857142857, 793 - 554.8821428571429, 546.7964285714286, 793 - 273.9035714285714)); // 273.9035714285714f, 30.32142857142857f, 554.8821428571429f, 546.7964285714286f);
             SpreadsheetExtractionAlgorithm se = new SpreadsheetExtractionAlgorithm();
-            List<Table> tables = se.Extract(page);
+            IReadOnlyList<Table> tables = se.Extract(page);
             Table table = tables[0];
             Assert.Equal(15, table.Rows.Count);
 
@@ -417,7 +417,7 @@ public void TestShouldDetectASingleSpreadsheet()
         {
             PageArea page = UtilsForTesting.GetAreaFromPage("Resources/offense.pdf", 1, new PdfRectangle(16.44, 792 - 680.85, 597.84, 792 - 16.44)); // 68.08f, 16.44f, 680.85f, 597.84f);
             SpreadsheetExtractionAlgorithm bea = new SpreadsheetExtractionAlgorithm();
-            List<Table> tables = bea.Extract(page);
+            IReadOnlyList<Table> tables = bea.Extract(page);
             Assert.Single(tables);
         }
 
@@ -426,7 +426,7 @@ public void TestExtractTableWithExternallyDefinedRulings()
         {
             PageArea page = UtilsForTesting.GetPage("Resources/us-007.pdf", 1);
             SpreadsheetExtractionAlgorithm bea = new SpreadsheetExtractionAlgorithm();
-            List<Table> tables = bea.Extract(page, EXTERNALLY_DEFINED_RULINGS.ToList());
+            IReadOnlyList<Table> tables = bea.Extract(page, EXTERNALLY_DEFINED_RULINGS.ToList());
             Assert.Single(tables);
             Table table = tables[0];
             Assert.Equal(18, table.Cells.Count);
@@ -458,7 +458,7 @@ public void TestAnotherExtractTableWithExternallyDefinedRulings()
         {
             PageArea page = UtilsForTesting.GetPage("Resources/us-024.pdf", 1);
             SpreadsheetExtractionAlgorithm bea = new SpreadsheetExtractionAlgorithm();
-            List<Table> tables = bea.Extract(page, EXTERNALLY_DEFINED_RULINGS2.ToList());
+            IReadOnlyList<Table> tables = bea.Extract(page, EXTERNALLY_DEFINED_RULINGS2.ToList());
             Assert.Single(tables);
             Table table = tables[0];
 
@@ -472,7 +472,7 @@ public void TestSpreadsheetsSortedByTopAndRight()
             PageArea page = UtilsForTesting.GetPage("Resources/sydney_disclosure_contract.pdf", 1);
 
             SpreadsheetExtractionAlgorithm sea = new SpreadsheetExtractionAlgorithm();
-            List<Table> tables = sea.Extract(page);
+            IReadOnlyList<Table> tables = sea.Extract(page);
             for (int i = 1; i < tables.Count; i++)
             {
                 Assert.True(tables[i - 1].Top >= tables[i].Top); // Assert.True(tables[i - 1].getTop() <= tables[i].getTop());
@@ -485,7 +485,7 @@ public void TestDontStackOverflowQuicksort()
             PageArea page = UtilsForTesting.GetPage("Resources/failing_sort.pdf", 1);
 
             SpreadsheetExtractionAlgorithm sea = new SpreadsheetExtractionAlgorithm();
-            List<Table> tables = sea.Extract(page);
+            IReadOnlyList<Table> tables = sea.Extract(page);
             for (int i = 1; i < tables.Count; i++)
             {
                 Assert.True(tables[i - 1].Top >= tables[i].Top); //Assert.True(tables[i - 1].getTop() <= tables[i].getTop());
@@ -497,7 +497,7 @@ public void TestRTL()
         {
             PageArea page = UtilsForTesting.GetPage("Resources/arabic.pdf", 1);
             SpreadsheetExtractionAlgorithm sea = new SpreadsheetExtractionAlgorithm();
-            List<Table> tables = sea.Extract(page);
+            IReadOnlyList<Table> tables = sea.Extract(page);
             // Assert.Equal(1, tables.size());
             Table table = tables[0];
 
@@ -528,7 +528,7 @@ public void TestRealLifeRTL()
         {
             PageArea page = UtilsForTesting.GetPage("Resources/mednine.pdf", 1);
             SpreadsheetExtractionAlgorithm sea = new SpreadsheetExtractionAlgorithm();
-            List<Table> tables = sea.Extract(page);
+            IReadOnlyList<Table> tables = sea.Extract(page);
             Assert.Single(tables);
             Table table = tables[0];
             var rows = table.Rows;
@@ -580,7 +580,7 @@ public void TestSpreadsheetExtractionIssue656()
             string expectedCsv = UtilsForTesting.LoadCsv("Resources/csv/Publication_of_award_of_Bids_for_Transport_Sector__August_2016.csv");
 
             SpreadsheetExtractionAlgorithm sea = new SpreadsheetExtractionAlgorithm();
-            List<Table> tables = sea.Extract(page);
+            IReadOnlyList<Table> tables = sea.Extract(page);
             Assert.Single(tables);
             Table table = tables[0];
 

diff --git a/Tabula.Tests/TestTableDetection.cs b/Tabula.Tests/TestTableDetection.cs
@@ -16,7 +16,7 @@ public class TestTableDetection
 
         //private static Level defaultLogLevel;
 
-        private class TestStatus
+        private sealed class TestStatus
         {
             public int numExpectedTables;
             public int numCorrectlyDetectedTables;

diff --git a/Tabula.Tests/TestWriters.cs b/Tabula.Tests/TestWriters.cs
@@ -20,7 +20,7 @@ private Table GetTable()
             return bea.Extract(page)[0];
         }
 
-        private List<Table> GetTables()
+        private IReadOnlyList<Table> GetTables()
         {
             PageArea page = UtilsForTesting.GetPage("Resources/twotables.pdf", 1);
             SpreadsheetExtractionAlgorithm sea = new SpreadsheetExtractionAlgorithm();
@@ -144,7 +144,7 @@ public void TestCSVSerializeInfinity()
         public void TestJSONSerializeTwoTables()
         {
             string expectedJson = UtilsForTesting.LoadJson("Resources/json/twotables.json");
-            List<Table> tables = this.GetTables();
+            IReadOnlyList<Table> tables = this.GetTables();
 
             StringBuilder sb = new StringBuilder();
             (new JSONWriter()).Write(sb, tables);
@@ -178,7 +178,7 @@ public void TestJSONSerializeTwoTables()
         public void TestCSVSerializeTwoTables()
         {
             string expectedCsv = UtilsForTesting.LoadCsv("Resources/csv/twotables.csv");
-            List<Table> tables = this.GetTables();
+            IReadOnlyList<Table> tables = this.GetTables();
 
             /*
             StringBuilder sb = new StringBuilder();

diff --git a/Tabula.Tests/TestsIcdar2013.cs b/Tabula.Tests/TestsIcdar2013.cs
@@ -15,8 +15,7 @@ public void Eu004()
         {
             using (PdfDocument document = PdfDocument.Open("Resources/icdar2013-dataset/competition-dataset-eu/eu-004.pdf", new ParsingOptions() { ClipPaths = true }))
             {
-                ObjectExtractor oe = new ObjectExtractor(document);
-                PageArea page = oe.Extract(3);
+                PageArea page = ObjectExtractor.Extract(document, 3);
 
                 var detector = new SimpleNurminenDetectionAlgorithm();
                 var regions = detector.Detect(page);

diff --git a/Tabula.Tests/TestsNurminenDetector.cs b/Tabula.Tests/TestsNurminenDetector.cs
@@ -15,8 +15,7 @@ public void TestLinesToCells()
         {
             using (PdfDocument document = PdfDocument.Open("test3.pdf", new ParsingOptions() { ClipPaths = true }))
             {
-                ObjectExtractor oe = new ObjectExtractor(document);
-                PageArea page = oe.Extract(1);
+                PageArea page = ObjectExtractor.Extract(document, 1);
 
                 SimpleNurminenDetectionAlgorithm detector = new SimpleNurminenDetectionAlgorithm();
                 var regions = detector.Detect(page);
@@ -25,7 +24,7 @@ public void TestLinesToCells()
                 {
                     IExtractionAlgorithm ea = new BasicExtractionAlgorithm();
                     var newArea = page.GetArea(a.BoundingBox);
-                    List<Table> tables = ea.Extract(newArea);
+                    IReadOnlyList<Table> tables = ea.Extract(newArea);
                 }
             }
         }