From dbbcdab2867d73a16b66cc77cd1f4618a90aee36 Mon Sep 17 00:00:00 2001 From: David Verhasselt Date: Thu, 15 Jun 2017 12:15:40 +0300 Subject: [PATCH 01/10] Add a timeout and pipe warnings to /dev/null --- lib/docsplit/image_extractor.rb | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lib/docsplit/image_extractor.rb b/lib/docsplit/image_extractor.rb index 8c29bbc..b8fa8f1 100755 --- a/lib/docsplit/image_extractor.rb +++ b/lib/docsplit/image_extractor.rb @@ -35,14 +35,15 @@ def convert(pdf, size, format, previous=nil) escaped_pdf = ESCAPE[pdf] FileUtils.mkdir_p(directory) unless File.exists?(directory) common = "#{MEMORY_ARGS} -density #{@density} #{resize_arg(size)} #{quality_arg(format)}" + timeout = 5.minutes.to_i if previous FileUtils.cp(Dir[directory_for(previous) + '/*'], directory) - result = `MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1`.chomp + result = `MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 timeout #{timeout} gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>/dev/null`.chomp raise ExtractionFailed, result if $? != 0 else page_list(pages).each do |page| out_file = ESCAPE[File.join(directory, "#{basename}_#{page}.#{format}")] - cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin -define pdf:use-cropbox=true #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1".chomp + cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 timeout #{timeout} gm convert +adjoin -define pdf:use-cropbox=true #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>/dev/null".chomp result = `#{cmd}`.chomp raise ExtractionFailed, result if $? != 0 end From 227da4b8ad0069a78a436fbca6b026257e8e361c Mon Sep 17 00:00:00 2001 From: David Verhasselt Date: Thu, 15 Jun 2017 13:54:42 +0300 Subject: [PATCH 02/10] =?UTF-8?q?Filter=20on=20dupe=20lines=20in=20output,?= =?UTF-8?q?=20don=E2=80=99t=20blackhole=20everything?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Don’t throw away the baby with the bathwater --- lib/docsplit/image_extractor.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/docsplit/image_extractor.rb b/lib/docsplit/image_extractor.rb index b8fa8f1..b5a005b 100755 --- a/lib/docsplit/image_extractor.rb +++ b/lib/docsplit/image_extractor.rb @@ -38,12 +38,12 @@ def convert(pdf, size, format, previous=nil) timeout = 5.minutes.to_i if previous FileUtils.cp(Dir[directory_for(previous) + '/*'], directory) - result = `MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 timeout #{timeout} gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>/dev/null`.chomp + result = `MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 timeout #{timeout} gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1 | grep -v '^$' | uniq`.chomp raise ExtractionFailed, result if $? != 0 else page_list(pages).each do |page| out_file = ESCAPE[File.join(directory, "#{basename}_#{page}.#{format}")] - cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 timeout #{timeout} gm convert +adjoin -define pdf:use-cropbox=true #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>/dev/null".chomp + cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 timeout #{timeout} gm convert +adjoin -define pdf:use-cropbox=true #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1 | grep -v '^$' | uniq".chomp result = `#{cmd}`.chomp raise ExtractionFailed, result if $? != 0 end From f403e27b0b295f85ddd9a1a5cb776a58409b37a5 Mon Sep 17 00:00:00 2001 From: David Verhasselt Date: Thu, 15 Jun 2017 13:57:27 +0300 Subject: [PATCH 03/10] Add timeouts to TextExtractor too --- lib/docsplit/text_extractor.rb | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/lib/docsplit/text_extractor.rb b/lib/docsplit/text_extractor.rb index 0e51476..c6ef8a3 100644 --- a/lib/docsplit/text_extractor.rb +++ b/lib/docsplit/text_extractor.rb @@ -61,22 +61,23 @@ def extract_from_ocr(pdf, pages) base_path = File.join(@output, @pdf_name) escaped_pdf = ESCAPE[pdf] psm = @detect_orientation ? "-psm 1" : "" + timeout = 5.minutes.to_i if pages pages.each do |page| tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif" escaped_tiff = ESCAPE[tiff] file = "#{base_path}_#{page}" - run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1" - run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} #{psm} 2>&1" + run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 timeout #{timeout} gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1" + run "timeout #{timeout} tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} #{psm} 2>&1" clean_text(file + '.txt') if @clean_ocr FileUtils.remove_entry_secure tiff end else tiff = "#{tempdir}/#{@pdf_name}.tif" escaped_tiff = ESCAPE[tiff] - run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1" + run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 timeout #{timeout} gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1" #if the user says don't do orientation detection or the plugin is not installed, set psm to 0 - run "tesseract #{escaped_tiff} #{ESCAPE[base_path]} -l #{@language} #{psm} 2>&1" + run "timeout #{timeout} tesseract #{escaped_tiff} #{ESCAPE[base_path]} -l #{@language} #{psm} 2>&1" clean_text(base_path + '.txt') if @clean_ocr end ensure From 73c2cef089cafd2f20c1b3af4540b5e5ecc80616 Mon Sep 17 00:00:00 2001 From: David Verhasselt Date: Thu, 15 Jun 2017 14:15:13 +0300 Subject: [PATCH 04/10] Add comment --- lib/docsplit/image_extractor.rb | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lib/docsplit/image_extractor.rb b/lib/docsplit/image_extractor.rb index b5a005b..a55a60c 100755 --- a/lib/docsplit/image_extractor.rb +++ b/lib/docsplit/image_extractor.rb @@ -38,6 +38,9 @@ def convert(pdf, size, format, previous=nil) timeout = 5.minutes.to_i if previous FileUtils.cp(Dir[directory_for(previous) + '/*'], directory) + # We're adding `| grep -v '^$' | uniq` here and below because if a corrupt PDF is parsed, it generates an infinite amount of identical warnings (with blank lines in between). + # By filtering these we avoid memory bloat when the executing process tries to capture stdout. + # See https://github.com/GetSilverfin/silverfin/issues/1998 result = `MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 timeout #{timeout} gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1 | grep -v '^$' | uniq`.chomp raise ExtractionFailed, result if $? != 0 else From 5a9ae540bda4c61ebee9368ed2cbf9ef45f1653f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tarmo=20T=C3=A4nav?= Date: Thu, 15 Jun 2017 22:10:13 +0300 Subject: [PATCH 05/10] Return the exit status of the first piped command instead of the (default) last --- lib/docsplit/image_extractor.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/docsplit/image_extractor.rb b/lib/docsplit/image_extractor.rb index a55a60c..2721787 100755 --- a/lib/docsplit/image_extractor.rb +++ b/lib/docsplit/image_extractor.rb @@ -41,12 +41,12 @@ def convert(pdf, size, format, previous=nil) # We're adding `| grep -v '^$' | uniq` here and below because if a corrupt PDF is parsed, it generates an infinite amount of identical warnings (with blank lines in between). # By filtering these we avoid memory bloat when the executing process tries to capture stdout. # See https://github.com/GetSilverfin/silverfin/issues/1998 - result = `MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 timeout #{timeout} gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1 | grep -v '^$' | uniq`.chomp + result = `MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 timeout #{timeout} gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1 | grep -v '^$' | uniq; exit ${PIPESTATUS[0]}`.chomp raise ExtractionFailed, result if $? != 0 else page_list(pages).each do |page| out_file = ESCAPE[File.join(directory, "#{basename}_#{page}.#{format}")] - cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 timeout #{timeout} gm convert +adjoin -define pdf:use-cropbox=true #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1 | grep -v '^$' | uniq".chomp + cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 timeout #{timeout} gm convert +adjoin -define pdf:use-cropbox=true #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1 | grep -v '^$' | uniq; exit ${PIPESTATUS[0]}".chomp result = `#{cmd}`.chomp raise ExtractionFailed, result if $? != 0 end From ba77c8fc86bbf6e1d4e33d15f212e2f63270d4fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tarmo=20T=C3=A4nav?= Date: Fri, 16 Jun 2017 00:37:06 +0300 Subject: [PATCH 06/10] Try to send convert commands explicitly through bash so PIPESTATUS would be reliably available --- lib/docsplit/image_extractor.rb | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/lib/docsplit/image_extractor.rb b/lib/docsplit/image_extractor.rb index 2721787..57119e9 100755 --- a/lib/docsplit/image_extractor.rb +++ b/lib/docsplit/image_extractor.rb @@ -41,13 +41,14 @@ def convert(pdf, size, format, previous=nil) # We're adding `| grep -v '^$' | uniq` here and below because if a corrupt PDF is parsed, it generates an infinite amount of identical warnings (with blank lines in between). # By filtering these we avoid memory bloat when the executing process tries to capture stdout. # See https://github.com/GetSilverfin/silverfin/issues/1998 - result = `MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 timeout #{timeout} gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1 | grep -v '^$' | uniq; exit ${PIPESTATUS[0]}`.chomp + cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 timeout #{timeout} gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1 | grep -v \"^$\" | uniq; exit ${PIPESTATUS[0]}" + result = `bash -c '#{cmd}'`.chomp raise ExtractionFailed, result if $? != 0 else page_list(pages).each do |page| out_file = ESCAPE[File.join(directory, "#{basename}_#{page}.#{format}")] - cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 timeout #{timeout} gm convert +adjoin -define pdf:use-cropbox=true #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1 | grep -v '^$' | uniq; exit ${PIPESTATUS[0]}".chomp - result = `#{cmd}`.chomp + cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 timeout #{timeout} gm convert +adjoin -define pdf:use-cropbox=true #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1 | grep -v \"^$\" | uniq; exit ${PIPESTATUS[0]}" + result = `bash -c '#{cmd}'`.chomp raise ExtractionFailed, result if $? != 0 end end From e505222b9fe6861f7314d50e86de6a610f9a1f44 Mon Sep 17 00:00:00 2001 From: David Verhasselt Date: Fri, 16 Jun 2017 14:07:54 +0300 Subject: [PATCH 07/10] Fix timeout on Mac --- lib/docsplit/image_extractor.rb | 47 ++++++++++++++++++++++++++------- 1 file changed, 37 insertions(+), 10 deletions(-) diff --git a/lib/docsplit/image_extractor.rb b/lib/docsplit/image_extractor.rb index 57119e9..86852a0 100755 --- a/lib/docsplit/image_extractor.rb +++ b/lib/docsplit/image_extractor.rb @@ -7,6 +7,7 @@ class ImageExtractor MEMORY_ARGS = "-limit memory 256MiB -limit map 512MiB" DEFAULT_FORMAT = :png DEFAULT_DENSITY = '150' + COMMAND_TIMEOUT = 300 # seconds # Extract a list of PDFs as rasterized page images, according to the # configuration in options. @@ -34,31 +35,57 @@ def convert(pdf, size, format, previous=nil) pages = @pages || '1-' + Docsplit.extract_length(pdf).to_s escaped_pdf = ESCAPE[pdf] FileUtils.mkdir_p(directory) unless File.exists?(directory) - common = "#{MEMORY_ARGS} -density #{@density} #{resize_arg(size)} #{quality_arg(format)}" - timeout = 5.minutes.to_i + env = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2" + common = "#{MEMORY_ARGS} -density #{@density} #{resize_arg(size)} #{quality_arg(format)}" + if previous FileUtils.cp(Dir[directory_for(previous) + '/*'], directory) # We're adding `| grep -v '^$' | uniq` here and below because if a corrupt PDF is parsed, it generates an infinite amount of identical warnings (with blank lines in between). # By filtering these we avoid memory bloat when the executing process tries to capture stdout. # See https://github.com/GetSilverfin/silverfin/issues/1998 - cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 timeout #{timeout} gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1 | grep -v \"^$\" | uniq; exit ${PIPESTATUS[0]}" - result = `bash -c '#{cmd}'`.chomp - raise ExtractionFailed, result if $? != 0 + + run(env, "gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1") else page_list(pages).each do |page| - out_file = ESCAPE[File.join(directory, "#{basename}_#{page}.#{format}")] - cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 timeout #{timeout} gm convert +adjoin -define pdf:use-cropbox=true #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1 | grep -v \"^$\" | uniq; exit ${PIPESTATUS[0]}" - result = `bash -c '#{cmd}'`.chomp - raise ExtractionFailed, result if $? != 0 + out_file = ESCAPE[File.join(directory, "#{basename}_#{page}.#{format}")] + run(env, "gm convert +adjoin -define pdf:use-cropbox=true #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1") end end ensure FileUtils.remove_entry_secure tempdir if File.exists?(tempdir) end - private + # Run an external process and raise an exception if it fails. + def run(env, command) + # If a corrupt PDF is parsed, it generates an infinite amount of identical warnings (with blank lines in between). + # By filtering these we avoid memory bloat when the executing process tries to capture stdout. The timeout makes + # sure we exit at some point. + # + # - See https://github.com/GetSilverfin/silverfin/issues/1998 + # - Add timeout so a stuck process doesn't block our Ruby process forever + # - Remove blank lines + # - Remove duplicate lines + run_command = "#{env} #{timeout} #{command} | grep -v \"^$\" | uniq" + + # - Run through bash so we can use PIPESTATUS + # - Use PIPESTATUS to return the exit status of #{command} instead of `uniq` + result = `bash -c '#{run_command}; exit ${PIPESTATUS[0]}'`.chomp + + raise ExtractionFailed, result if $? != 0 + result + end + + def timeout + "#{timeout_bin} #{COMMAND_TIMEOUT}" + end + + def timeout_bin + # gtimeout on Mac + `which timeout` != "" ? "timeout" : "gtimeout" + end + # Extract the relevant GraphicsMagick options from the options hash. def extract_options(options) @output = options[:output] || '.' From fa29009f34aa6d7b8bad9934f49fe2ba85c21296 Mon Sep 17 00:00:00 2001 From: David Verhasselt Date: Fri, 16 Jun 2017 15:55:01 +0300 Subject: [PATCH 08/10] Extract ExternalProcess module --- lib/docsplit.rb | 1 + lib/docsplit/external_process.rb | 34 ++++++++++++++++++++++++++++++++ lib/docsplit/image_extractor.rb | 32 ++---------------------------- 3 files changed, 37 insertions(+), 30 deletions(-) create mode 100644 lib/docsplit/external_process.rb diff --git a/lib/docsplit.rb b/lib/docsplit.rb index 1c49e91..15cd14a 100755 --- a/lib/docsplit.rb +++ b/lib/docsplit.rb @@ -100,6 +100,7 @@ def self.normalize_value(value) end +require "#{Docsplit::ROOT}/lib/docsplit/external_process" require "#{Docsplit::ROOT}/lib/docsplit/image_extractor" require "#{Docsplit::ROOT}/lib/docsplit/transparent_pdfs" require "#{Docsplit::ROOT}/lib/docsplit/text_extractor" diff --git a/lib/docsplit/external_process.rb b/lib/docsplit/external_process.rb new file mode 100644 index 0000000..55dd943 --- /dev/null +++ b/lib/docsplit/external_process.rb @@ -0,0 +1,34 @@ +module Docsplit + module ExternalProcess + COMMAND_TIMEOUT = 300 # seconds + + # Run an external process and raise an exception if it fails. + def run(env, command) + # If a corrupt PDF is parsed, it generates an infinite amount of identical warnings (with blank lines in between). + # By filtering these we avoid memory bloat when the executing process tries to capture stdout. The timeout makes + # sure we exit at some point. + # + # - See https://github.com/GetSilverfin/silverfin/issues/1998 + # - Add timeout so a stuck process doesn't block our Ruby process forever + # - Remove blank lines + # - Remove duplicate lines + run_command = "#{env} #{timeout} #{command} | grep -v \"^$\" | uniq" + + # - Run through bash so we can use PIPESTATUS + # - Use PIPESTATUS to return the exit status of #{command} instead of `uniq` + result = `bash -c '#{run_command}; exit ${PIPESTATUS[0]}'`.chomp + + raise ExtractionFailed, result if $? != 0 + result + end + + def timeout + "#{timeout_bin} #{COMMAND_TIMEOUT}" + end + + def timeout_bin + # gtimeout on Mac + `which timeout` != "" ? "timeout" : "gtimeout" + end + end +end diff --git a/lib/docsplit/image_extractor.rb b/lib/docsplit/image_extractor.rb index 86852a0..acf437d 100755 --- a/lib/docsplit/image_extractor.rb +++ b/lib/docsplit/image_extractor.rb @@ -3,11 +3,12 @@ module Docsplit # Delegates to GraphicsMagick in order to convert PDF documents into # nicely sized images. class ImageExtractor + include ExternalProcess MEMORY_ARGS = "-limit memory 256MiB -limit map 512MiB" DEFAULT_FORMAT = :png DEFAULT_DENSITY = '150' - COMMAND_TIMEOUT = 300 # seconds + # Extract a list of PDFs as rasterized page images, according to the # configuration in options. @@ -57,35 +58,6 @@ def convert(pdf, size, format, previous=nil) private - # Run an external process and raise an exception if it fails. - def run(env, command) - # If a corrupt PDF is parsed, it generates an infinite amount of identical warnings (with blank lines in between). - # By filtering these we avoid memory bloat when the executing process tries to capture stdout. The timeout makes - # sure we exit at some point. - # - # - See https://github.com/GetSilverfin/silverfin/issues/1998 - # - Add timeout so a stuck process doesn't block our Ruby process forever - # - Remove blank lines - # - Remove duplicate lines - run_command = "#{env} #{timeout} #{command} | grep -v \"^$\" | uniq" - - # - Run through bash so we can use PIPESTATUS - # - Use PIPESTATUS to return the exit status of #{command} instead of `uniq` - result = `bash -c '#{run_command}; exit ${PIPESTATUS[0]}'`.chomp - - raise ExtractionFailed, result if $? != 0 - result - end - - def timeout - "#{timeout_bin} #{COMMAND_TIMEOUT}" - end - - def timeout_bin - # gtimeout on Mac - `which timeout` != "" ? "timeout" : "gtimeout" - end - # Extract the relevant GraphicsMagick options from the options hash. def extract_options(options) @output = options[:output] || '.' From e7adf3ebbeff0dfea6e93d279d71888f19e5cfd2 Mon Sep 17 00:00:00 2001 From: David Verhasselt Date: Fri, 16 Jun 2017 17:07:05 +0300 Subject: [PATCH 09/10] Cleanup TextExtractor --- lib/docsplit/text_extractor.rb | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/lib/docsplit/text_extractor.rb b/lib/docsplit/text_extractor.rb index c6ef8a3..a236a3c 100644 --- a/lib/docsplit/text_extractor.rb +++ b/lib/docsplit/text_extractor.rb @@ -13,6 +13,7 @@ module Docsplit # * Re-OCR each page in the `@pages_to_ocr` list at the end. # class TextExtractor + include ExternalProcess NO_TEXT_DETECTED = /---------\n\Z/ @@ -62,22 +63,24 @@ def extract_from_ocr(pdf, pages) escaped_pdf = ESCAPE[pdf] psm = @detect_orientation ? "-psm 1" : "" timeout = 5.minutes.to_i + env = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2" + if pages pages.each do |page| tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif" escaped_tiff = ESCAPE[tiff] file = "#{base_path}_#{page}" - run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 timeout #{timeout} gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1" - run "timeout #{timeout} tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} #{psm} 2>&1" + run(env, "gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1") + run("", "tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} #{psm} 2>&1") clean_text(file + '.txt') if @clean_ocr FileUtils.remove_entry_secure tiff end else tiff = "#{tempdir}/#{@pdf_name}.tif" escaped_tiff = ESCAPE[tiff] - run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 timeout #{timeout} gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1" + run(env, "gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1") #if the user says don't do orientation detection or the plugin is not installed, set psm to 0 - run "timeout #{timeout} tesseract #{escaped_tiff} #{ESCAPE[base_path]} -l #{@language} #{psm} 2>&1" + run("", "tesseract #{escaped_tiff} #{ESCAPE[base_path]} -l #{@language} #{psm} 2>&1") clean_text(base_path + '.txt') if @clean_ocr end ensure @@ -96,13 +99,6 @@ def clean_text(file) end end - # Run an external process and raise an exception if it fails. - def run(command) - result = `#{command}` - raise ExtractionFailed, result if $? != 0 - result - end - # Extract the full contents of a pdf as a single file, directly. def extract_full(pdf) text_path = File.join(@output, "#{@pdf_name}.txt") From 93ecf0b4f767e16841397f5d202d6833e9974c7c Mon Sep 17 00:00:00 2001 From: David Verhasselt Date: Thu, 22 Jun 2017 12:05:33 +0300 Subject: [PATCH 10/10] Fix bug in remaining `run` calls --- lib/docsplit/external_process.rb | 2 +- lib/docsplit/image_extractor.rb | 4 ++-- lib/docsplit/text_extractor.rb | 8 ++++---- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/lib/docsplit/external_process.rb b/lib/docsplit/external_process.rb index 55dd943..914ccce 100644 --- a/lib/docsplit/external_process.rb +++ b/lib/docsplit/external_process.rb @@ -3,7 +3,7 @@ module ExternalProcess COMMAND_TIMEOUT = 300 # seconds # Run an external process and raise an exception if it fails. - def run(env, command) + def run(command, env = "") # If a corrupt PDF is parsed, it generates an infinite amount of identical warnings (with blank lines in between). # By filtering these we avoid memory bloat when the executing process tries to capture stdout. The timeout makes # sure we exit at some point. diff --git a/lib/docsplit/image_extractor.rb b/lib/docsplit/image_extractor.rb index acf437d..fa6ca1a 100755 --- a/lib/docsplit/image_extractor.rb +++ b/lib/docsplit/image_extractor.rb @@ -45,11 +45,11 @@ def convert(pdf, size, format, previous=nil) # By filtering these we avoid memory bloat when the executing process tries to capture stdout. # See https://github.com/GetSilverfin/silverfin/issues/1998 - run(env, "gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1") + run("gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1", env) else page_list(pages).each do |page| out_file = ESCAPE[File.join(directory, "#{basename}_#{page}.#{format}")] - run(env, "gm convert +adjoin -define pdf:use-cropbox=true #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1") + run("gm convert +adjoin -define pdf:use-cropbox=true #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1", env) end end ensure diff --git a/lib/docsplit/text_extractor.rb b/lib/docsplit/text_extractor.rb index a236a3c..0a54267 100644 --- a/lib/docsplit/text_extractor.rb +++ b/lib/docsplit/text_extractor.rb @@ -70,17 +70,17 @@ def extract_from_ocr(pdf, pages) tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif" escaped_tiff = ESCAPE[tiff] file = "#{base_path}_#{page}" - run(env, "gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1") - run("", "tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} #{psm} 2>&1") + run("gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1", env) + run("tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} #{psm} 2>&1") clean_text(file + '.txt') if @clean_ocr FileUtils.remove_entry_secure tiff end else tiff = "#{tempdir}/#{@pdf_name}.tif" escaped_tiff = ESCAPE[tiff] - run(env, "gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1") + run("gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1", env) #if the user says don't do orientation detection or the plugin is not installed, set psm to 0 - run("", "tesseract #{escaped_tiff} #{ESCAPE[base_path]} -l #{@language} #{psm} 2>&1") + run("tesseract #{escaped_tiff} #{ESCAPE[base_path]} -l #{@language} #{psm} 2>&1") clean_text(base_path + '.txt') if @clean_ocr end ensure