diff --git a/index.html b/index.html index 981c34d..2a70873 100755 --- a/index.html +++ b/index.html @@ -172,7 +172,10 @@

Usage

The Docsplit gem includes both the docsplit command-line utility as well as a Ruby API. The available commands and options are identical in both.
--output or -o can be passed to any command in order to - store the generated files in a directory of your choosing. + store the generated files in a directory of your choosing.
+ --leading_zeros can be passed to any command that extracts individual + pages in order to pad the page numbers in the generated file names with leading zeros, so that + the files sort in page order in environments that sort file names alphabetically.

diff --git a/lib/docsplit/command_line.rb b/lib/docsplit/command_line.rb index 60ee7ef..ae52dfb 100755 --- a/lib/docsplit/command_line.rb +++ b/lib/docsplit/command_line.rb @@ -101,6 +101,9 @@ def parse_options opts.on('-r', '--rolling', 'generate images from each previous image') do |r| @options[:rolling] = true end + opts.on('--leading_zeros', 'include leading zeros when naming a page') do |l| + @options[:leading_zeros] = true + end opts.on_tail('-v', '--version', 'display docsplit version') do puts "Docsplit version #{Docsplit::VERSION}" exit diff --git a/lib/docsplit/image_extractor.rb b/lib/docsplit/image_extractor.rb index 8c29bbc..510e14c 100755 --- a/lib/docsplit/image_extractor.rb +++ b/lib/docsplit/image_extractor.rb @@ -32,6 +32,7 @@ def convert(pdf, size, format, previous=nil) basename = File.basename(pdf, File.extname(pdf)) directory = directory_for(size) pages = @pages || '1-' + Docsplit.extract_length(pdf).to_s + page_format = page_number_format(pdf) escaped_pdf = ESCAPE[pdf] FileUtils.mkdir_p(directory) unless File.exists?(directory) common = "#{MEMORY_ARGS} -density #{@density} #{resize_arg(size)} #{quality_arg(format)}" @@ -41,7 +42,8 @@ def convert(pdf, size, format, previous=nil) raise ExtractionFailed, result if $? != 0 else page_list(pages).each do |page| - out_file = ESCAPE[File.join(directory, "#{basename}_#{page}.#{format}")] + page_number = sprintf(page_format, page) + out_file = ESCAPE[File.join(directory, "#{basename}_#{page_number}.#{format}")] cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin -define pdf:use-cropbox=true #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1".chomp result = `#{cmd}`.chomp raise ExtractionFailed, result if $? != 0 @@ -63,6 +65,7 @@ def extract_options(options) @sizes = [options[:size]].flatten.compact @sizes = [nil] if @sizes.empty? 
@rolling = !!options[:rolling] + @zeros = !!options[:leading_zeros] end # If there's only one size requested, generate the images directly into @@ -98,6 +101,12 @@ def page_list(pages) }.flatten.uniq.sort end + # Generate the appropriate page number format. + def page_number_format(pdf) + digits = Docsplit.extract_length(pdf).to_s.length + @zeros ? "%0#{digits}d" : "%d" + end + end end diff --git a/lib/docsplit/page_extractor.rb b/lib/docsplit/page_extractor.rb index 1b9bf7f..9815c63 100644 --- a/lib/docsplit/page_extractor.rb +++ b/lib/docsplit/page_extractor.rb @@ -9,7 +9,8 @@ def extract(pdfs, opts) extract_options opts [pdfs].flatten.each do |pdf| pdf_name = File.basename(pdf, File.extname(pdf)) - page_path = File.join(@output, "#{pdf_name}_%d.pdf") + page_format = page_number_format(pdf) + page_path = File.join(@output, "#{pdf_name}_#{page_format}.pdf") FileUtils.mkdir_p @output unless File.exists?(@output) cmd = if DEPENDENCIES[:pdftailor] # prefer pdftailor, but keep pdftk for backwards compatability @@ -29,6 +30,14 @@ def extract(pdfs, opts) def extract_options(options) @output = options[:output] || '.' + @zeros = !!options[:leading_zeros] + end + + # Generate the appropriate page number format. + def page_number_format(pdf) + digits = Docsplit.extract_length(pdf).to_s.length + # PDFTailor doesn't support printf-style format in the output, yet + (!DEPENDENCIES[:pdftailor] && @zeros) ? 
"%0#{digits}d" : "%d" end end diff --git a/lib/docsplit/text_extractor.rb b/lib/docsplit/text_extractor.rb index 0d55f32..6a1ab39 100644 --- a/lib/docsplit/text_extractor.rb +++ b/lib/docsplit/text_extractor.rb @@ -59,12 +59,14 @@ def extract_from_pdf(pdf, pages) def extract_from_ocr(pdf, pages) tempdir = Dir.mktmpdir base_path = File.join(@output, @pdf_name) + page_format = page_number_format(pdf) escaped_pdf = ESCAPE[pdf] if pages pages.each do |page| - tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif" + page_number = sprintf(page_format, page) + tiff = "#{tempdir}/#{@pdf_name}_#{page_number}.tif" escaped_tiff = ESCAPE[tiff] - file = "#{base_path}_#{page}" + file = "#{base_path}_#{page_number}" run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1" run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} 2>&1" clean_text(file + '.txt') if @clean_ocr @@ -109,7 +111,8 @@ def extract_full(pdf) # Extract the contents of a single page of text, directly, adding it to # the `@pages_to_ocr` list if the text length is inadequate. def extract_page(pdf, page) - text_path = File.join(@output, "#{@pdf_name}_#{page}.txt") + page_number = sprintf(page_number_format(pdf), page) + text_path = File.join(@output, "#{@pdf_name}_#{page_number}.txt") run "pdftotext -enc UTF-8 -f #{page} -l #{page} #{ESCAPE[pdf]} #{ESCAPE[text_path]} 2>&1" unless @forbid_ocr @pages_to_ocr.push(page) if File.read(text_path).length < MIN_TEXT_PER_PAGE @@ -123,6 +126,13 @@ def extract_options(options) @forbid_ocr = options[:ocr] == false @clean_ocr = !(options[:clean] == false) @language = options[:language] || 'eng' + @zeros = !!options[:leading_zeros] + end + + # Generate the appropriate page number format. + def page_number_format(pdf) + digits = Docsplit.extract_length(pdf).to_s.length + @zeros ? 
"%0#{digits}d" : "%d" end end diff --git a/test/fixtures/leading_zeros.pdf b/test/fixtures/leading_zeros.pdf new file mode 100644 index 0000000..18f84c1 Binary files /dev/null and b/test/fixtures/leading_zeros.pdf differ diff --git a/test/unit/test_extract_images.rb b/test/unit/test_extract_images.rb index 8ccfc58..2e833b6 100755 --- a/test/unit/test_extract_images.rb +++ b/test/unit/test_extract_images.rb @@ -48,4 +48,13 @@ def test_name_escaping_while_extracting_images 'PDF file with spaces \'single\' and "double quotes"_1.gif']) end + def test_leading_zeros_while_extracting_images + Docsplit.extract_images('test/fixtures/leading_zeros.pdf', :leading_zeros => true, :output => OUTPUT) + assert_directory_contains(OUTPUT, ['leading_zeros_01.png', 'leading_zeros_02.png', + 'leading_zeros_03.png', 'leading_zeros_04.png', + 'leading_zeros_05.png', 'leading_zeros_06.png', + 'leading_zeros_07.png', 'leading_zeros_08.png', + 'leading_zeros_09.png', 'leading_zeros_10.png']) + end + end diff --git a/test/unit/test_extract_pages.rb b/test/unit/test_extract_pages.rb index e0b1015..42e5a81 100755 --- a/test/unit/test_extract_pages.rb +++ b/test/unit/test_extract_pages.rb @@ -24,4 +24,17 @@ def test_name_escaping_while_extracting_pages assert Dir["#{OUTPUT}/*.pdf"].length == 2 end + def test_leading_zeros_while_extracting_pages + Docsplit.extract_pages('test/fixtures/leading_zeros.pdf', :leading_zeros => true, :output => OUTPUT) + + doc_data_path = File.join(OUTPUT, 'doc_data.txt') + File.delete(doc_data_path) if File.exists?(doc_data_path) + + assert_directory_contains(OUTPUT, ['leading_zeros_01.pdf', 'leading_zeros_02.pdf', + 'leading_zeros_03.pdf', 'leading_zeros_04.pdf', + 'leading_zeros_05.pdf', 'leading_zeros_06.pdf', + 'leading_zeros_07.pdf', 'leading_zeros_08.pdf', + 'leading_zeros_09.pdf', 'leading_zeros_10.pdf']) + end + end diff --git a/test/unit/test_extract_text.rb b/test/unit/test_extract_text.rb index 00d24e3..79a1f32 100755 --- a/test/unit/test_extract_text.rb 
+++ b/test/unit/test_extract_text.rb @@ -54,4 +54,13 @@ def test_name_escaping_while_extracting_text assert Dir["#{OUTPUT}/*.txt"].length == 2 end + def test_leading_zeros_while_extracting_text + Docsplit.extract_text('test/fixtures/leading_zeros.pdf', :pages => 'all', :leading_zeros => true, :output => OUTPUT) + assert_directory_contains(OUTPUT, ['leading_zeros_01.txt', 'leading_zeros_02.txt', + 'leading_zeros_03.txt', 'leading_zeros_04.txt', + 'leading_zeros_05.txt', 'leading_zeros_06.txt', + 'leading_zeros_07.txt', 'leading_zeros_08.txt', + 'leading_zeros_09.txt', 'leading_zeros_10.txt']) + end + end