diff --git a/lib/docsplit.rb b/lib/docsplit.rb
index 1c49e91..15cd14a 100755
--- a/lib/docsplit.rb
+++ b/lib/docsplit.rb
@@ -100,6 +100,7 @@ def self.normalize_value(value)
 
 end
 
+require "#{Docsplit::ROOT}/lib/docsplit/external_process"
 require "#{Docsplit::ROOT}/lib/docsplit/image_extractor"
 require "#{Docsplit::ROOT}/lib/docsplit/transparent_pdfs"
 require "#{Docsplit::ROOT}/lib/docsplit/text_extractor"
diff --git a/lib/docsplit/external_process.rb b/lib/docsplit/external_process.rb
new file mode 100644
index 0000000..914ccce
--- /dev/null
+++ b/lib/docsplit/external_process.rb
@@ -0,0 +1,48 @@
+module Docsplit
+  # Runs external command line tools with a time limit and filtered output.
+  module ExternalProcess
+    COMMAND_TIMEOUT = 300 # seconds
+
+    # Run an external process and raise an exception if it fails.
+    #
+    # If a corrupt PDF is parsed, it generates an infinite amount of identical
+    # warnings (with blank lines in between). Blank lines and consecutive
+    # duplicate lines are filtered out to avoid memory bloat while the output
+    # is captured, and the command runs under a timeout so a stuck process
+    # doesn't block our Ruby process forever.
+    # See https://github.com/GetSilverfin/silverfin/issues/1998
+    #
+    # command - The shell command line to run.
+    # env     - Optional "NAME=value" assignments placed in front of the command.
+    #
+    # Returns the filtered, chomped output of the command.
+    # Raises ExtractionFailed, with that output as its message, when the
+    # command exits with a non-zero status.
+    def run(command, env = "")
+      pipeline = [env, timeout, command].map(&:to_s).reject(&:empty?).join(" ")
+
+      # Run through bash so PIPESTATUS is available: the exit status has to be
+      # the one of the command itself, not the one of `uniq`.
+      script = "#{pipeline} | grep -v '^$' | uniq; exit ${PIPESTATUS[0]}"
+
+      # Pass the script to bash as one argument instead of wrapping it in a
+      # single-quoted shell string, which a single quote anywhere in the
+      # command would break out of.
+      result = IO.popen(["bash", "-c", script], &:read).chomp
+
+      raise ExtractionFailed, result unless $?.success?
+      result
+    end
+
+    # Command prefix that puts the wrapped command under the COMMAND_TIMEOUT limit.
+    def timeout
+      "#{timeout_bin} #{COMMAND_TIMEOUT}"
+    end
+
+    # Name of the timeout binary: `timeout` when `which` finds it, otherwise
+    # `gtimeout` (Mac). The lookup is done once per object and cached.
+    def timeout_bin
+      @timeout_bin ||= system("which timeout > /dev/null 2>&1") ? "timeout" : "gtimeout"
+    end
+  end
+end
diff --git a/lib/docsplit/image_extractor.rb b/lib/docsplit/image_extractor.rb
index 8c29bbc..fa6ca1a 100755
--- a/lib/docsplit/image_extractor.rb
+++ b/lib/docsplit/image_extractor.rb
@@ -3,11 +3,12 @@ module Docsplit
   # Delegates to GraphicsMagick in order to convert PDF documents into
   # nicely sized images.
   class ImageExtractor
+    include ExternalProcess
 
     MEMORY_ARGS = "-limit memory 256MiB -limit map 512MiB"
     DEFAULT_FORMAT = :png
     DEFAULT_DENSITY = '150'
 
     # Extract a list of PDFs as rasterized page images, according to the
     # configuration in options.
     def extract(pdfs, options)
@@ -34,24 +35,24 @@ def convert(pdf, size, format, previous=nil)
       pages = @pages || '1-' + Docsplit.extract_length(pdf).to_s
       escaped_pdf = ESCAPE[pdf]
       FileUtils.mkdir_p(directory) unless File.exists?(directory)
-      common = "#{MEMORY_ARGS} -density #{@density} #{resize_arg(size)} #{quality_arg(format)}"
+      env = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2"
+      common = "#{MEMORY_ARGS} -density #{@density} #{resize_arg(size)} #{quality_arg(format)}"
+
       if previous
         FileUtils.cp(Dir[directory_for(previous) + '/*'], directory)
-        result = `MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1`.chomp
-        raise ExtractionFailed, result if $? != 0
+        # `run` filters blank and repeated output lines and applies a timeout, because a corrupt PDF
+        # can produce endless identical warnings. See https://github.com/GetSilverfin/silverfin/issues/1998
+        run("gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1", env)
       else
         page_list(pages).each do |page|
-          out_file = ESCAPE[File.join(directory, "#{basename}_#{page}.#{format}")]
-          cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin -define pdf:use-cropbox=true #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1".chomp
-          result = `#{cmd}`.chomp
-          raise ExtractionFailed, result if $? != 0
+          out_file = ESCAPE[File.join(directory, "#{basename}_#{page}.#{format}")]
+          run("gm convert +adjoin -define pdf:use-cropbox=true #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1", env)
         end
       end
     ensure
       FileUtils.remove_entry_secure tempdir if File.exists?(tempdir)
     end
 
-
     private
 
     # Extract the relevant GraphicsMagick options from the options hash.
diff --git a/lib/docsplit/text_extractor.rb b/lib/docsplit/text_extractor.rb
index 0e51476..0a54267 100644
--- a/lib/docsplit/text_extractor.rb
+++ b/lib/docsplit/text_extractor.rb
@@ -13,6 +13,7 @@ module Docsplit
   # * Re-OCR each page in the `@pages_to_ocr` list at the end.
   #
   class TextExtractor
+    include ExternalProcess
 
     NO_TEXT_DETECTED = /---------\n\Z/
 
@@ -61,22 +62,24 @@ def extract_from_ocr(pdf, pages)
       base_path = File.join(@output, @pdf_name)
       escaped_pdf = ESCAPE[pdf]
       psm = @detect_orientation ? "-psm 1" : ""
+      env = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2"
+
       if pages
         pages.each do |page|
           tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif"
           escaped_tiff = ESCAPE[tiff]
           file = "#{base_path}_#{page}"
-          run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1"
-          run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} #{psm} 2>&1"
+          run("gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1", env)
+          run("tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} #{psm} 2>&1")
           clean_text(file + '.txt') if @clean_ocr
           FileUtils.remove_entry_secure tiff
         end
       else
         tiff = "#{tempdir}/#{@pdf_name}.tif"
         escaped_tiff = ESCAPE[tiff]
-        run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1"
+        run("gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1", env)
         #if the user says don't do orientation detection or the plugin is not installed, set psm to 0
-        run "tesseract #{escaped_tiff} #{ESCAPE[base_path]} -l #{@language} #{psm} 2>&1"
+        run("tesseract #{escaped_tiff} #{ESCAPE[base_path]} -l #{@language} #{psm} 2>&1")
         clean_text(base_path + '.txt') if @clean_ocr
       end
     ensure
@@ -95,13 +98,6 @@ def clean_text(file)
       end
     end
 
-    # Run an external process and raise an exception if it fails.
-    def run(command)
-      result = `#{command}`
-      raise ExtractionFailed, result if $? != 0
-      result
-    end
-
     # Extract the full contents of a pdf as a single file, directly.
     def extract_full(pdf)
       text_path = File.join(@output, "#{@pdf_name}.txt")