55import java .io .ByteArrayOutputStream ;
66import java .io .File ;
77import java .io .IOException ;
8- import java .io .InputStream ;
98import java .util .ArrayList ;
109import java .util .List ;
1110import org .apache .pdfbox .cos .COSName ;
2120 */
2221public final class PDFUtils {
2322
24- private PDFUtils () {
25- }
23+ private PDFUtils () {}
2624
27- private static int countPDDocumentPages (PDDocument document ) throws IOException {
25+ /**
26+ * Get the number of pages in the PDF.
27+ * @param inputSource The PDF file.
28+ */
29+ public static int getNumberOfPages (LocalInputSource inputSource ) throws IOException {
30+ PDDocument document = PDDocument .load (inputSource .getFile ());
2831 int pageCount = document .getNumberOfPages ();
2932 document .close ();
3033 return pageCount ;
3134 }
3235
33- public static int countPdfPages (InputStream inputStream ) throws IOException {
34- try {
35- PDDocument document = PDDocument .load (inputStream );
36- int pageCount = countPDDocumentPages (document );
37- document .close ();
38- return pageCount ;
39- } finally {
40- inputStream .close ();
41- }
42- }
43-
4436 private static byte [] createPdfFromExistingPdf (
4537 PDDocument document ,
4638 List <Integer > pageNumbers
@@ -61,6 +53,11 @@ private static byte[] createPdfFromExistingPdf(
6153 return output ;
6254 }
6355
56+ /**
57+ * Merge specified PDF pages together.
58+ * @param file The PDF file.
59+ * @param pageNumbers List of page numbers to merge together.
60+ */
6461 public static byte [] mergePdfPages (
6562 File file ,
6663 List <Integer > pageNumbers
@@ -74,7 +71,6 @@ public static boolean isPdfEmpty(File file) throws IOException {
7471 }
7572
7673 private static boolean checkIfPdfIsEmpty (PDDocument document ) throws IOException {
77-
7874 boolean isEmpty = true ;
7975 for (PDPage page : document .getPages ()) {
8076 PDResources resources = page .getResources ();
@@ -97,29 +93,80 @@ private static boolean checkIfPdfIsEmpty(PDDocument document) throws IOException
9793 return isEmpty ;
9894 }
9995
96+ /**
97+ * Render all pages of a PDF as images.
98+ * Converting PDFs with hundreds of pages may result in a heap space error.
99+ * @param filePath The path to the PDF file.
100+ * @return List of all pages as images.
101+ */
100102 public static List <PdfPageImage > pdfToImages (String filePath ) throws IOException {
101103 return pdfToImages (new LocalInputSource (filePath ));
102104 }
103105
106+ /**
107+ * Render all pages of a PDF as images.
108+ * Converting PDFs with hundreds of pages may result in a heap space error.
109+ * @param source The PDF file.
110+ * @return List of all pages as images.
111+ */
104112 public static List <PdfPageImage > pdfToImages (LocalInputSource source ) throws IOException {
105113 PDDocument document = PDDocument .load (source .getFile ());
106114 PDFRenderer pdfRenderer = new PDFRenderer (document );
107115 List <PdfPageImage > pdfPageImages = new ArrayList <>();
108116 for (int i = 0 ; i < document .getNumberOfPages (); i ++) {
109- PDRectangle bbox = document .getPage (i ).getBBox ();
110- float dimension = bbox .getWidth () * bbox .getHeight ();
111- int dpi ;
112- if (dimension < 200000 ) {
113- dpi = 300 ;
114- } else if (dimension < 300000 ) {
115- dpi = 250 ;
116- } else {
117- dpi = 200 ;
118- }
119- BufferedImage imageBuffer = pdfRenderer .renderImageWithDPI (i , dpi , ImageType .RGB );
117+ BufferedImage imageBuffer = pdfPageToImageBuffer (i , document , pdfRenderer );
120118 pdfPageImages .add (new PdfPageImage (imageBuffer , i , source .getFilename (), "jpg" ));
121119 }
122120 document .close ();
123121 return pdfPageImages ;
124122 }
123+
124+ /**
125+ * Render a single page of a PDF as an image.
126+ * Main use case is for processing PDFs with hundreds of pages.
127+ * If you need to only render some pages from the PDF, use <code>mergePdfPages</code> and then <code>pdfToImages</code>.
128+ * @param filePath The path to the PDF file.
129+ * @param pageNumber The page number to render, first page is 1.
130+ * @return The page as an image.
131+ */
132+ public static PdfPageImage pdfPageToImage (String filePath , int pageNumber ) throws IOException {
133+ return pdfPageToImage (new LocalInputSource (filePath ), pageNumber );
134+ }
135+
136+ /**
137+ * Render a single page of a PDF as an image.
138+ * Main use case is for processing PDFs with hundreds of pages.
139+ * If you need to only render some pages from the PDF, use <code>mergePdfPages</code> and then <code>pdfToImages</code>.
140+ * @param source The PDF file.
141+ * @param pageNumber The page number to render, first page is 1.
142+ * @return The page as an image.
143+ */
144+ public static PdfPageImage pdfPageToImage (
145+ LocalInputSource source ,
146+ int pageNumber
147+ ) throws IOException {
148+ int index = pageNumber - 1 ;
149+ PDDocument document = PDDocument .load (source .getFile ());
150+ PDFRenderer pdfRenderer = new PDFRenderer (document );
151+ BufferedImage imageBuffer = pdfPageToImageBuffer (index , document , pdfRenderer );
152+ return new PdfPageImage (imageBuffer , index , source .getFilename (), "jpg" );
153+ }
154+
155+ private static BufferedImage pdfPageToImageBuffer (
156+ int index ,
157+ PDDocument document ,
158+ PDFRenderer pdfRenderer
159+ ) throws IOException {
160+ PDRectangle bbox = document .getPage (index ).getBBox ();
161+ float dimension = bbox .getWidth () * bbox .getHeight ();
162+ int dpi ;
163+ if (dimension < 200000 ) {
164+ dpi = 300 ;
165+ } else if (dimension < 300000 ) {
166+ dpi = 250 ;
167+ } else {
168+ dpi = 200 ;
169+ }
170+ return pdfRenderer .renderImageWithDPI (index , dpi , ImageType .RGB );
171+ }
125172}
0 commit comments