From dbbcdab2867d73a16b66cc77cd1f4618a90aee36 Mon Sep 17 00:00:00 2001
From: David Verhasselt <david@crowdway.com>
Date: Thu, 15 Jun 2017 12:15:40 +0300
Subject: [PATCH 01/10] Add a timeout and redirect stderr to /dev/null

---
 lib/docsplit/image_extractor.rb | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/lib/docsplit/image_extractor.rb b/lib/docsplit/image_extractor.rb
index 8c29bbc..b8fa8f1 100755
--- a/lib/docsplit/image_extractor.rb
+++ b/lib/docsplit/image_extractor.rb
@@ -35,14 +35,15 @@ def convert(pdf, size, format, previous=nil)
       escaped_pdf = ESCAPE[pdf]
       FileUtils.mkdir_p(directory) unless File.exists?(directory)
       common    = "#{MEMORY_ARGS} -density #{@density} #{resize_arg(size)} #{quality_arg(format)}"
+      timeout   = 5.minutes.to_i
       if previous
         FileUtils.cp(Dir[directory_for(previous) + '/*'], directory)
-        result = `MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1`.chomp
+        result = `MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 timeout #{timeout} gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>/dev/null`.chomp
         raise ExtractionFailed, result if $? != 0
       else
         page_list(pages).each do |page|
           out_file  = ESCAPE[File.join(directory, "#{basename}_#{page}.#{format}")]
-          cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin -define pdf:use-cropbox=true #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1".chomp
+          cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 timeout #{timeout} gm convert +adjoin -define pdf:use-cropbox=true #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>/dev/null".chomp
           result = `#{cmd}`.chomp
           raise ExtractionFailed, result if $? != 0
         end

From 227da4b8ad0069a78a436fbca6b026257e8e361c Mon Sep 17 00:00:00 2001
From: David Verhasselt <david@crowdway.com>
Date: Thu, 15 Jun 2017 13:54:42 +0300
Subject: [PATCH 02/10] =?UTF-8?q?Filter=20on=20dupe=20lines=20in=20output,?=
 =?UTF-8?q?=20don=E2=80=99t=20blackhole=20everything?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Don’t throw away the baby with the bathwater
---
 lib/docsplit/image_extractor.rb | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/docsplit/image_extractor.rb b/lib/docsplit/image_extractor.rb
index b8fa8f1..b5a005b 100755
--- a/lib/docsplit/image_extractor.rb
+++ b/lib/docsplit/image_extractor.rb
@@ -38,12 +38,12 @@ def convert(pdf, size, format, previous=nil)
       timeout   = 5.minutes.to_i
       if previous
         FileUtils.cp(Dir[directory_for(previous) + '/*'], directory)
-        result = `MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 timeout #{timeout} gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>/dev/null`.chomp
+        result = `MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 timeout #{timeout} gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1 | grep -v '^$' | uniq`.chomp
         raise ExtractionFailed, result if $? != 0
       else
         page_list(pages).each do |page|
           out_file  = ESCAPE[File.join(directory, "#{basename}_#{page}.#{format}")]
-          cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 timeout #{timeout} gm convert +adjoin -define pdf:use-cropbox=true #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>/dev/null".chomp
+          cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 timeout #{timeout} gm convert +adjoin -define pdf:use-cropbox=true #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1 | grep -v '^$' | uniq".chomp
           result = `#{cmd}`.chomp
           raise ExtractionFailed, result if $? != 0
         end

From f403e27b0b295f85ddd9a1a5cb776a58409b37a5 Mon Sep 17 00:00:00 2001
From: David Verhasselt <david@crowdway.com>
Date: Thu, 15 Jun 2017 13:57:27 +0300
Subject: [PATCH 03/10] Add timeouts to TextExtractor too

---
 lib/docsplit/text_extractor.rb | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/lib/docsplit/text_extractor.rb b/lib/docsplit/text_extractor.rb
index 0e51476..c6ef8a3 100644
--- a/lib/docsplit/text_extractor.rb
+++ b/lib/docsplit/text_extractor.rb
@@ -61,22 +61,23 @@ def extract_from_ocr(pdf, pages)
       base_path = File.join(@output, @pdf_name)
       escaped_pdf = ESCAPE[pdf]
       psm = @detect_orientation ? "-psm 1" : ""
+      timeout = 5.minutes.to_i
       if pages
         pages.each do |page|
           tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif"
           escaped_tiff = ESCAPE[tiff]
           file = "#{base_path}_#{page}"
-          run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1"
-          run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} #{psm} 2>&1"
+          run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 timeout #{timeout} gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1"
+          run "timeout #{timeout} tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} #{psm} 2>&1"
           clean_text(file + '.txt') if @clean_ocr
           FileUtils.remove_entry_secure tiff
         end
       else
         tiff = "#{tempdir}/#{@pdf_name}.tif"
         escaped_tiff = ESCAPE[tiff]
-        run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1"
+        run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 timeout #{timeout} gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1"
         #if the user says don't do orientation detection or the plugin is not installed, set psm to 0
-        run "tesseract #{escaped_tiff} #{ESCAPE[base_path]} -l #{@language} #{psm} 2>&1"
+        run "timeout #{timeout} tesseract #{escaped_tiff} #{ESCAPE[base_path]} -l #{@language} #{psm} 2>&1"
         clean_text(base_path + '.txt') if @clean_ocr
       end
     ensure

From 73c2cef089cafd2f20c1b3af4540b5e5ecc80616 Mon Sep 17 00:00:00 2001
From: David Verhasselt <david@crowdway.com>
Date: Thu, 15 Jun 2017 14:15:13 +0300
Subject: [PATCH 04/10] Add comment explaining the grep/uniq output filter

---
 lib/docsplit/image_extractor.rb | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/lib/docsplit/image_extractor.rb b/lib/docsplit/image_extractor.rb
index b5a005b..a55a60c 100755
--- a/lib/docsplit/image_extractor.rb
+++ b/lib/docsplit/image_extractor.rb
@@ -38,6 +38,9 @@ def convert(pdf, size, format, previous=nil)
       timeout   = 5.minutes.to_i
       if previous
         FileUtils.cp(Dir[directory_for(previous) + '/*'], directory)
+        # We're adding `| grep -v '^$' | uniq` here and below because if a corrupt PDF is parsed, it generates an infinite amount of identical warnings (with blank lines in between).
+        # By filtering these we avoid memory bloat when the executing process tries to capture stdout.
+        # See https://github.com/GetSilverfin/silverfin/issues/1998
         result = `MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 timeout #{timeout} gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1 | grep -v '^$' | uniq`.chomp
         raise ExtractionFailed, result if $? != 0
       else

From 5a9ae540bda4c61ebee9368ed2cbf9ef45f1653f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tarmo=20T=C3=A4nav?= <tarmo@itech.ee>
Date: Thu, 15 Jun 2017 22:10:13 +0300
Subject: [PATCH 05/10] Return the exit status of the first piped command
 instead of the (default) last

---
 lib/docsplit/image_extractor.rb | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/docsplit/image_extractor.rb b/lib/docsplit/image_extractor.rb
index a55a60c..2721787 100755
--- a/lib/docsplit/image_extractor.rb
+++ b/lib/docsplit/image_extractor.rb
@@ -41,12 +41,12 @@ def convert(pdf, size, format, previous=nil)
         # We're adding `| grep -v '^$' | uniq` here and below because if a corrupt PDF is parsed, it generates an infinite amount of identical warnings (with blank lines in between).
         # By filtering these we avoid memory bloat when the executing process tries to capture stdout.
         # See https://github.com/GetSilverfin/silverfin/issues/1998
-        result = `MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 timeout #{timeout} gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1 | grep -v '^$' | uniq`.chomp
+        result = `MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 timeout #{timeout} gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1 | grep -v '^$' | uniq; exit ${PIPESTATUS[0]}`.chomp
         raise ExtractionFailed, result if $? != 0
       else
         page_list(pages).each do |page|
           out_file  = ESCAPE[File.join(directory, "#{basename}_#{page}.#{format}")]
-          cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 timeout #{timeout} gm convert +adjoin -define pdf:use-cropbox=true #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1 | grep -v '^$' | uniq".chomp
+          cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 timeout #{timeout} gm convert +adjoin -define pdf:use-cropbox=true #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1 | grep -v '^$' | uniq; exit ${PIPESTATUS[0]}".chomp
           result = `#{cmd}`.chomp
           raise ExtractionFailed, result if $? != 0
         end

From ba77c8fc86bbf6e1d4e33d15f212e2f63270d4fa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tarmo=20T=C3=A4nav?= <tarmo@itech.ee>
Date: Fri, 16 Jun 2017 00:37:06 +0300
Subject: [PATCH 06/10] Try to send convert commands explicitly through bash so
 PIPESTATUS would be reliably available

---
 lib/docsplit/image_extractor.rb | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/lib/docsplit/image_extractor.rb b/lib/docsplit/image_extractor.rb
index 2721787..57119e9 100755
--- a/lib/docsplit/image_extractor.rb
+++ b/lib/docsplit/image_extractor.rb
@@ -41,13 +41,14 @@ def convert(pdf, size, format, previous=nil)
         # We're adding `| grep -v '^$' | uniq` here and below because if a corrupt PDF is parsed, it generates an infinite amount of identical warnings (with blank lines in between).
         # By filtering these we avoid memory bloat when the executing process tries to capture stdout.
         # See https://github.com/GetSilverfin/silverfin/issues/1998
-        result = `MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 timeout #{timeout} gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1 | grep -v '^$' | uniq; exit ${PIPESTATUS[0]}`.chomp
+        cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 timeout #{timeout} gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1 | grep -v \"^$\" | uniq; exit ${PIPESTATUS[0]}"
+        result = `bash -c '#{cmd}'`.chomp
         raise ExtractionFailed, result if $? != 0
       else
         page_list(pages).each do |page|
           out_file  = ESCAPE[File.join(directory, "#{basename}_#{page}.#{format}")]
-          cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 timeout #{timeout} gm convert +adjoin -define pdf:use-cropbox=true #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1 | grep -v '^$' | uniq; exit ${PIPESTATUS[0]}".chomp
-          result = `#{cmd}`.chomp
+          cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 timeout #{timeout} gm convert +adjoin -define pdf:use-cropbox=true #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1 | grep -v \"^$\" | uniq; exit ${PIPESTATUS[0]}"
+          result = `bash -c '#{cmd}'`.chomp
           raise ExtractionFailed, result if $? != 0
         end
       end

From e505222b9fe6861f7314d50e86de6a610f9a1f44 Mon Sep 17 00:00:00 2001
From: David Verhasselt <david@crowdway.com>
Date: Fri, 16 Jun 2017 14:07:54 +0300
Subject: [PATCH 07/10] Fix timeout on Mac

---
 lib/docsplit/image_extractor.rb | 47 ++++++++++++++++++++++++++-------
 1 file changed, 37 insertions(+), 10 deletions(-)

diff --git a/lib/docsplit/image_extractor.rb b/lib/docsplit/image_extractor.rb
index 57119e9..86852a0 100755
--- a/lib/docsplit/image_extractor.rb
+++ b/lib/docsplit/image_extractor.rb
@@ -7,6 +7,7 @@ class ImageExtractor
     MEMORY_ARGS     = "-limit memory 256MiB -limit map 512MiB"
     DEFAULT_FORMAT  = :png
     DEFAULT_DENSITY = '150'
+    COMMAND_TIMEOUT = 300 # seconds
 
     # Extract a list of PDFs as rasterized page images, according to the
     # configuration in options.
@@ -34,31 +35,57 @@ def convert(pdf, size, format, previous=nil)
       pages     = @pages || '1-' + Docsplit.extract_length(pdf).to_s
       escaped_pdf = ESCAPE[pdf]
       FileUtils.mkdir_p(directory) unless File.exists?(directory)
-      common    = "#{MEMORY_ARGS} -density #{@density} #{resize_arg(size)} #{quality_arg(format)}"
-      timeout   = 5.minutes.to_i
+      env = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2"
+      common = "#{MEMORY_ARGS} -density #{@density} #{resize_arg(size)} #{quality_arg(format)}"
+
       if previous
         FileUtils.cp(Dir[directory_for(previous) + '/*'], directory)
         # We're adding `| grep -v '^$' | uniq` here and below because if a corrupt PDF is parsed, it generates an infinite amount of identical warnings (with blank lines in between).
         # By filtering these we avoid memory bloat when the executing process tries to capture stdout.
         # See https://github.com/GetSilverfin/silverfin/issues/1998
-        cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 timeout #{timeout} gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1 | grep -v \"^$\" | uniq; exit ${PIPESTATUS[0]}"
-        result = `bash -c '#{cmd}'`.chomp
-        raise ExtractionFailed, result if $? != 0
+
+        run(env, "gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1")
       else
         page_list(pages).each do |page|
-          out_file  = ESCAPE[File.join(directory, "#{basename}_#{page}.#{format}")]
-          cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 timeout #{timeout} gm convert +adjoin -define pdf:use-cropbox=true #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1 | grep -v \"^$\" | uniq; exit ${PIPESTATUS[0]}"
-          result = `bash -c '#{cmd}'`.chomp
-          raise ExtractionFailed, result if $? != 0
+          out_file = ESCAPE[File.join(directory, "#{basename}_#{page}.#{format}")]
+          run(env, "gm convert +adjoin -define pdf:use-cropbox=true #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1")
         end
       end
     ensure
       FileUtils.remove_entry_secure tempdir if File.exists?(tempdir)
     end
 
-
     private
 
+    # Run an external process and raise an exception if it fails.
+    def run(env, command)
+      # If a corrupt PDF is parsed, it generates an infinite amount of identical warnings (with blank lines in between).
+      # By filtering these we avoid memory bloat when the executing process tries to capture stdout. The timeout makes
+      # sure we exit at some point.
+      #
+      # - See https://github.com/GetSilverfin/silverfin/issues/1998
+      # - Add timeout so a stuck process doesn't block our Ruby process forever
+      # - Remove blank lines
+      # - Remove duplicate lines
+      run_command = "#{env} #{timeout} #{command} | grep -v \"^$\" | uniq"
+
+      # - Run through bash so we can use PIPESTATUS
+      # - Use PIPESTATUS to return the exit status of #{command} instead of `uniq`
+      result = `bash -c '#{run_command}; exit ${PIPESTATUS[0]}'`.chomp
+
+      raise ExtractionFailed, result if $? != 0
+      result
+    end
+
+    def timeout
+      "#{timeout_bin} #{COMMAND_TIMEOUT}"
+    end
+
+    def timeout_bin
+      # gtimeout on Mac
+      `which timeout` != "" ? "timeout" : "gtimeout"
+    end
+
     # Extract the relevant GraphicsMagick options from the options hash.
     def extract_options(options)
       @output  = options[:output]  || '.'

From fa29009f34aa6d7b8bad9934f49fe2ba85c21296 Mon Sep 17 00:00:00 2001
From: David Verhasselt <david@crowdway.com>
Date: Fri, 16 Jun 2017 15:55:01 +0300
Subject: [PATCH 08/10] Extract ExternalProcess module

---
 lib/docsplit.rb                  |  1 +
 lib/docsplit/external_process.rb | 34 ++++++++++++++++++++++++++++++++
 lib/docsplit/image_extractor.rb  | 32 ++----------------------------
 3 files changed, 37 insertions(+), 30 deletions(-)
 create mode 100644 lib/docsplit/external_process.rb

diff --git a/lib/docsplit.rb b/lib/docsplit.rb
index 1c49e91..15cd14a 100755
--- a/lib/docsplit.rb
+++ b/lib/docsplit.rb
@@ -100,6 +100,7 @@ def self.normalize_value(value)
 
 end
 
+require "#{Docsplit::ROOT}/lib/docsplit/external_process"
 require "#{Docsplit::ROOT}/lib/docsplit/image_extractor"
 require "#{Docsplit::ROOT}/lib/docsplit/transparent_pdfs"
 require "#{Docsplit::ROOT}/lib/docsplit/text_extractor"
diff --git a/lib/docsplit/external_process.rb b/lib/docsplit/external_process.rb
new file mode 100644
index 0000000..55dd943
--- /dev/null
+++ b/lib/docsplit/external_process.rb
@@ -0,0 +1,34 @@
+module Docsplit
+  module ExternalProcess
+    COMMAND_TIMEOUT = 300 # seconds
+
+    # Run an external process and raise an exception if it fails.
+    def run(env, command)
+      # If a corrupt PDF is parsed, it generates an infinite amount of identical warnings (with blank lines in between).
+      # By filtering these we avoid memory bloat when the executing process tries to capture stdout. The timeout makes
+      # sure we exit at some point.
+      #
+      # - See https://github.com/GetSilverfin/silverfin/issues/1998
+      # - Add timeout so a stuck process doesn't block our Ruby process forever
+      # - Remove blank lines
+      # - Remove duplicate lines
+      run_command = "#{env} #{timeout} #{command} | grep -v \"^$\" | uniq"
+
+      # - Run through bash so we can use PIPESTATUS
+      # - Use PIPESTATUS to return the exit status of #{command} instead of `uniq`
+      result = `bash -c '#{run_command}; exit ${PIPESTATUS[0]}'`.chomp
+
+      raise ExtractionFailed, result if $? != 0
+      result
+    end
+
+    def timeout
+      "#{timeout_bin} #{COMMAND_TIMEOUT}"
+    end
+
+    def timeout_bin
+      # gtimeout on Mac
+      `which timeout` != "" ? "timeout" : "gtimeout"
+    end
+  end
+end
diff --git a/lib/docsplit/image_extractor.rb b/lib/docsplit/image_extractor.rb
index 86852a0..acf437d 100755
--- a/lib/docsplit/image_extractor.rb
+++ b/lib/docsplit/image_extractor.rb
@@ -3,11 +3,12 @@ module Docsplit
   # Delegates to GraphicsMagick in order to convert PDF documents into
   # nicely sized images.
   class ImageExtractor
+    include ExternalProcess
 
     MEMORY_ARGS     = "-limit memory 256MiB -limit map 512MiB"
     DEFAULT_FORMAT  = :png
     DEFAULT_DENSITY = '150'
-    COMMAND_TIMEOUT = 300 # seconds
+
 
     # Extract a list of PDFs as rasterized page images, according to the
     # configuration in options.
@@ -57,35 +58,6 @@ def convert(pdf, size, format, previous=nil)
 
     private
 
-    # Run an external process and raise an exception if it fails.
-    def run(env, command)
-      # If a corrupt PDF is parsed, it generates an infinite amount of identical warnings (with blank lines in between).
-      # By filtering these we avoid memory bloat when the executing process tries to capture stdout. The timeout makes
-      # sure we exit at some point.
-      #
-      # - See https://github.com/GetSilverfin/silverfin/issues/1998
-      # - Add timeout so a stuck process doesn't block our Ruby process forever
-      # - Remove blank lines
-      # - Remove duplicate lines
-      run_command = "#{env} #{timeout} #{command} | grep -v \"^$\" | uniq"
-
-      # - Run through bash so we can use PIPESTATUS
-      # - Use PIPESTATUS to return the exit status of #{command} instead of `uniq`
-      result = `bash -c '#{run_command}; exit ${PIPESTATUS[0]}'`.chomp
-
-      raise ExtractionFailed, result if $? != 0
-      result
-    end
-
-    def timeout
-      "#{timeout_bin} #{COMMAND_TIMEOUT}"
-    end
-
-    def timeout_bin
-      # gtimeout on Mac
-      `which timeout` != "" ? "timeout" : "gtimeout"
-    end
-
     # Extract the relevant GraphicsMagick options from the options hash.
     def extract_options(options)
       @output  = options[:output]  || '.'

From e7adf3ebbeff0dfea6e93d279d71888f19e5cfd2 Mon Sep 17 00:00:00 2001
From: David Verhasselt <david@crowdway.com>
Date: Fri, 16 Jun 2017 17:07:05 +0300
Subject: [PATCH 09/10] Clean up TextExtractor

---
 lib/docsplit/text_extractor.rb | 18 +++++++-----------
 1 file changed, 7 insertions(+), 11 deletions(-)

diff --git a/lib/docsplit/text_extractor.rb b/lib/docsplit/text_extractor.rb
index c6ef8a3..a236a3c 100644
--- a/lib/docsplit/text_extractor.rb
+++ b/lib/docsplit/text_extractor.rb
@@ -13,6 +13,7 @@ module Docsplit
   #  * Re-OCR each page in the `@pages_to_ocr` list at the end.
   #
   class TextExtractor
+    include ExternalProcess
 
     NO_TEXT_DETECTED = /---------\n\Z/
 
@@ -62,22 +63,24 @@ def extract_from_ocr(pdf, pages)
       escaped_pdf = ESCAPE[pdf]
       psm = @detect_orientation ? "-psm 1" : ""
       timeout = 5.minutes.to_i
+      env = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2"
+
       if pages
         pages.each do |page|
           tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif"
           escaped_tiff = ESCAPE[tiff]
           file = "#{base_path}_#{page}"
-          run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 timeout #{timeout} gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1"
-          run "timeout #{timeout} tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} #{psm} 2>&1"
+          run(env, "gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1")
+          run("", "tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} #{psm} 2>&1")
           clean_text(file + '.txt') if @clean_ocr
           FileUtils.remove_entry_secure tiff
         end
       else
         tiff = "#{tempdir}/#{@pdf_name}.tif"
         escaped_tiff = ESCAPE[tiff]
-        run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 timeout #{timeout} gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1"
+        run(env, "gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1")
         #if the user says don't do orientation detection or the plugin is not installed, set psm to 0
-        run "timeout #{timeout} tesseract #{escaped_tiff} #{ESCAPE[base_path]} -l #{@language} #{psm} 2>&1"
+        run("", "tesseract #{escaped_tiff} #{ESCAPE[base_path]} -l #{@language} #{psm} 2>&1")
         clean_text(base_path + '.txt') if @clean_ocr
       end
     ensure
@@ -96,13 +99,6 @@ def clean_text(file)
       end
     end
 
-    # Run an external process and raise an exception if it fails.
-    def run(command)
-      result = `#{command}`
-      raise ExtractionFailed, result if $? != 0
-      result
-    end
-
     # Extract the full contents of a pdf as a single file, directly.
     def extract_full(pdf)
       text_path = File.join(@output, "#{@pdf_name}.txt")

From 93ecf0b4f767e16841397f5d202d6833e9974c7c Mon Sep 17 00:00:00 2001
From: David Verhasselt <david@crowdway.com>
Date: Thu, 22 Jun 2017 12:05:33 +0300
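Subject: [PATCH 10/10] Fix bug in remaining `run` calls

Change ExternalProcess#run from `run(env, command)` to
`run(command, env = "")` and update the call sites in ImageExtractor and
TextExtractor to pass the command first.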

---
 lib/docsplit/external_process.rb | 2 +-
 lib/docsplit/image_extractor.rb  | 4 ++--
 lib/docsplit/text_extractor.rb   | 8 ++++----
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/lib/docsplit/external_process.rb b/lib/docsplit/external_process.rb
index 55dd943..914ccce 100644
--- a/lib/docsplit/external_process.rb
+++ b/lib/docsplit/external_process.rb
@@ -3,7 +3,7 @@ module ExternalProcess
     COMMAND_TIMEOUT = 300 # seconds
 
     # Run an external process and raise an exception if it fails.
-    def run(env, command)
+    def run(command, env = "")
       # If a corrupt PDF is parsed, it generates an infinite amount of identical warnings (with blank lines in between).
       # By filtering these we avoid memory bloat when the executing process tries to capture stdout. The timeout makes
       # sure we exit at some point.
diff --git a/lib/docsplit/image_extractor.rb b/lib/docsplit/image_extractor.rb
index acf437d..fa6ca1a 100755
--- a/lib/docsplit/image_extractor.rb
+++ b/lib/docsplit/image_extractor.rb
@@ -45,11 +45,11 @@ def convert(pdf, size, format, previous=nil)
         # By filtering these we avoid memory bloat when the executing process tries to capture stdout.
         # See https://github.com/GetSilverfin/silverfin/issues/1998
 
-        run(env, "gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1")
+        run("gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1", env)
       else
         page_list(pages).each do |page|
           out_file = ESCAPE[File.join(directory, "#{basename}_#{page}.#{format}")]
-          run(env, "gm convert +adjoin -define pdf:use-cropbox=true #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1")
+          run("gm convert +adjoin -define pdf:use-cropbox=true #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1", env)
         end
       end
     ensure
diff --git a/lib/docsplit/text_extractor.rb b/lib/docsplit/text_extractor.rb
index a236a3c..0a54267 100644
--- a/lib/docsplit/text_extractor.rb
+++ b/lib/docsplit/text_extractor.rb
@@ -70,17 +70,17 @@ def extract_from_ocr(pdf, pages)
           tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif"
           escaped_tiff = ESCAPE[tiff]
           file = "#{base_path}_#{page}"
-          run(env, "gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1")
-          run("", "tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} #{psm} 2>&1")
+          run("gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1", env)
+          run("tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} #{psm} 2>&1")
           clean_text(file + '.txt') if @clean_ocr
           FileUtils.remove_entry_secure tiff
         end
       else
         tiff = "#{tempdir}/#{@pdf_name}.tif"
         escaped_tiff = ESCAPE[tiff]
-        run(env, "gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1")
+        run("gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1", env)
         #if the user says don't do orientation detection or the plugin is not installed, set psm to 0
-        run("", "tesseract #{escaped_tiff} #{ESCAPE[base_path]} -l #{@language} #{psm} 2>&1")
+        run("tesseract #{escaped_tiff} #{ESCAPE[base_path]} -l #{@language} #{psm} 2>&1")
         clean_text(base_path + '.txt') if @clean_ocr
       end
     ensure