Skip to content

Commit b4c6e68

Browse files
authored
✨ add getting page count from a local input source (#273)
1 parent 45845d4 commit b4c6e68

File tree

9 files changed

+409
-331
lines changed

9 files changed

+409
-331
lines changed

.github/workflows/_static-analysis.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ jobs:
1313
submodules: recursive
1414

1515
- name: Set up JDK
16-
uses: actions/setup-java@v4
16+
uses: actions/setup-java@v5
1717
with:
1818
java-version: "11"
1919
distribution: "temurin"

src/main/java/com/mindee/extraction/PDFExtractor.java

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -111,26 +111,24 @@ public List<ExtractedPDF> extractSubDocuments(List<List<Integer>> pageIndexes)
111111
return extractedPDFs;
112112
}
113113

114-
115114
/**
116115
* Extract invoices from the given page indexes (from an invoice-splitter prediction).
117116
*
118117
* @param pageIndexes List of page indexes.
119118
* @return a list of extracted files.
120119
* @throws IOException Throws if the file can't be accessed.
121120
*/
122-
public List<ExtractedPDF> extractInvoices(List<InvoiceSplitterV1InvoicePageGroup> pageIndexes)
123-
throws IOException {
121+
public List<ExtractedPDF> extractInvoices(
122+
List<InvoiceSplitterV1InvoicePageGroup> pageIndexes
123+
) throws IOException {
124124

125125
List<List<Integer>> indexes =
126126
pageIndexes.stream().map(InvoiceSplitterV1InvoicePageGroup::getPageIndexes)
127127
.collect(Collectors.toList());
128128

129-
130129
return extractSubDocuments(indexes);
131130
}
132131

133-
134132
/**
135133
* Extract invoices from the given page indexes (from an invoice-splitter prediction).
136134
*
@@ -139,8 +137,10 @@ public List<ExtractedPDF> extractInvoices(List<InvoiceSplitterV1InvoicePageGroup
139137
* @return a list of extracted files.
140138
* @throws IOException Throws if the file can't be accessed.
141139
*/
142-
public List<ExtractedPDF> extractInvoices(List<InvoiceSplitterV1InvoicePageGroup> pageIndexes,
143-
boolean strict) throws IOException {
140+
public List<ExtractedPDF> extractInvoices(
141+
List<InvoiceSplitterV1InvoicePageGroup> pageIndexes,
142+
boolean strict
143+
) throws IOException {
144144
List<List<Integer>> correctPageIndexes = new ArrayList<>();
145145
if (!strict) {
146146
return extractInvoices(pageIndexes);

src/main/java/com/mindee/input/LocalInputSource.java

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
package com.mindee.input;
22

33
import com.mindee.image.ImageCompressor;
4+
import com.mindee.pdf.PDFUtils;
45
import com.mindee.pdf.PdfBoxApi;
56
import com.mindee.pdf.PdfCompressor;
67
import com.mindee.pdf.PdfOperation;
@@ -48,6 +49,17 @@ public LocalInputSource(String fileAsBase64, String filename) {
4849
this.filename = filename;
4950
}
5051

52+
/**
53+
* Get the number of pages in the document; a non-PDF file always counts as a single page.
54+
* @return the number of pages in the current file.
55+
* @throws IOException If an I/O error occurs during the PDF operation.
56+
*/
57+
public int getPageCount() throws IOException {
58+
if (!this.isPdf()) {
59+
return 1;
60+
}
61+
return PDFUtils.getNumberOfPages(this.file);
62+
}
5163

5264
/**
5365
* Applies PDF-specific operations on the current file based on the specified {@code PageOptions}.

src/main/java/com/mindee/pdf/PDFUtils.java

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,16 @@ private PDFUtils() {
3939
* @param inputSource The PDF file.
4040
*/
4141
public static int getNumberOfPages(LocalInputSource inputSource) throws IOException {
42-
PDDocument document = Loader.loadPDF(inputSource.getFile());
42+
return getNumberOfPages(inputSource.getFile());
43+
}
44+
45+
/**
46+
* Get the number of pages in a PDF supplied as a raw byte array.
47+
*
48+
* @param pdfBytes The PDF file as a byte array.
49+
*/
50+
public static int getNumberOfPages(byte[] pdfBytes) throws IOException {
51+
PDDocument document = Loader.loadPDF(pdfBytes);
4352
int pageCount = document.getNumberOfPages();
4453
document.close();
4554
return pageCount;

src/test/java/com/mindee/extraction/InvoiceSplitterAutoExtractionIT.java

Lines changed: 25 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -42,10 +42,8 @@ protected Document<InvoiceSplitterV1> getInvoiceSplitterPrediction() throws
4242
protected PredictResponse<InvoiceV4> getInvoicePrediction(LocalInputSource invoicePDF) throws
4343
IOException, MindeeException {
4444
return client.parse(InvoiceV4.class, invoicePDF);
45-
4645
}
4746

48-
4947
protected String prepareInvoiceReturn(String rstFilePath, Document<InvoiceV4> invoicePrediction)
5048
throws IOException {
5149
List<String> rstRefLines = Files.readAllLines(Paths.get(rstFilePath));
@@ -60,7 +58,7 @@ protected String prepareInvoiceReturn(String rstFilePath, Document<InvoiceV4> in
6058
}
6159

6260
@Test
63-
public void givenAPDF_shouldExtractInvoicesStrict() throws IOException, InterruptedException {
61+
public void givenAPDF_shouldExtractInvoices() throws IOException, InterruptedException {
6462
Document<InvoiceSplitterV1> document = getInvoiceSplitterPrediction();
6563
InvoiceSplitterV1 inference = document.getInference();
6664

@@ -71,29 +69,37 @@ public void givenAPDF_shouldExtractInvoicesStrict() throws IOException, Interrup
7169
Assertions.assertEquals(2, extractedPDFsStrict.size());
7270
Assertions.assertEquals("default_sample_001-001.pdf", extractedPDFsStrict.get(0).getFilename());
7371
Assertions.assertEquals("default_sample_002-002.pdf", extractedPDFsStrict.get(1).getFilename());
74-
PredictResponse<InvoiceV4> invoice0 =
75-
getInvoicePrediction(extractedPDFsStrict.get(0).asInputSource());
7672

73+
PredictResponse<InvoiceV4> invoice0 = getInvoicePrediction(
74+
extractedPDFsStrict.get(0).asInputSource()
75+
);
7776
String testStringRSTInvoice0 = prepareInvoiceReturn(
7877
"src/test/resources/products/invoices/response_v4/summary_full_invoice_p1.rst",
79-
invoice0.getDocument());
80-
Assertions.assertEquals(testStringRSTInvoice0, String.join(String.format("%n"),
81-
invoice0.getDocument().toString().split(System.lineSeparator())));
82-
83-
PredictResponse<InvoiceV4> invoice1 =
84-
getInvoicePrediction(extractedPDFsStrict.get(1).asInputSource());
78+
invoice0.getDocument()
79+
);
80+
double invoice0Ratio = levenshteinRatio(
81+
testStringRSTInvoice0,
82+
String.join(
83+
String.format("%n"),
84+
invoice0.getDocument().toString().split(System.lineSeparator())
85+
)
86+
);
87+
Assertions.assertTrue(invoice0Ratio > 0.90);
8588

89+
PredictResponse<InvoiceV4> invoice1 = getInvoicePrediction(
90+
extractedPDFsStrict.get(1).asInputSource()
91+
);
8692
String testStringRSTInvoice1 = prepareInvoiceReturn(
8793
"src/test/resources/products/invoices/response_v4/summary_full_invoice_p2.rst",
88-
invoice1.getDocument());
89-
Assertions.assertTrue(
90-
levenshteinRatio(
94+
invoice1.getDocument()
95+
);
96+
double invoice1Ratio = levenshteinRatio(
9197
testStringRSTInvoice1,
92-
String.join(String.format("%n"),
93-
invoice1.getDocument().toString().split(System.lineSeparator())
98+
String.join(
99+
String.format("%n"),
100+
invoice1.getDocument().toString().split(System.lineSeparator())
94101
)
95-
) > 0.97);
96-
97-
102+
);
103+
Assertions.assertTrue(invoice1Ratio > 0.90);
98104
}
99105
}

0 commit comments

Comments
 (0)