Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix timeout on mac #3

Open
wants to merge 12 commits into
base: master
Choose a base branch
from
1 change: 1 addition & 0 deletions lib/docsplit.rb
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@ def self.normalize_value(value)

end

require "#{Docsplit::ROOT}/lib/docsplit/external_process"
require "#{Docsplit::ROOT}/lib/docsplit/image_extractor"
require "#{Docsplit::ROOT}/lib/docsplit/transparent_pdfs"
require "#{Docsplit::ROOT}/lib/docsplit/text_extractor"
Expand Down
34 changes: 34 additions & 0 deletions lib/docsplit/external_process.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
module Docsplit
  # Mixin that runs external command-line tools under a time limit and with
  # de-duplicated output. Including classes call #run; a failed command is
  # reported by raising ExtractionFailed.
  module ExternalProcess
    COMMAND_TIMEOUT = 300 # seconds

    # Run an external process and raise an exception if it fails.
    #
    # command - shell command line to execute (may contain redirections such
    #           as `2>&1`).
    # env     - optional string of `NAME=value` assignments placed in front of
    #           the command.
    #
    # Returns the captured stdout of the pipeline with the trailing newline
    # removed. Raises ExtractionFailed, carrying that output as its message,
    # when the command exits with a non-zero status (or bash cannot be started).
    def run(command, env = "")
      # If a corrupt PDF is parsed, it generates an infinite amount of identical warnings (with blank lines in between).
      # By filtering these we avoid memory bloat when the executing process tries to capture stdout. The timeout makes
      # sure we exit at some point.
      #
      # - See https://github.com/GetSilverfin/silverfin/issues/1998
      # - Add timeout so a stuck process doesn't block our Ruby process forever
      # - Remove blank lines
      # - Remove duplicate lines
      run_command = "#{env} #{timeout} #{command} | grep -v \"^$\" | uniq"

      # - Run through bash so we can use PIPESTATUS
      # - Use PIPESTATUS to return the exit status of #{command} instead of `uniq`
      script = "#{run_command}; exit ${PIPESTATUS[0]}"

      # The script is passed to bash as a single argv element rather than being
      # wrapped in single quotes inside another shell string; that way quotes
      # inside the command (e.g. an escaped apostrophe in a file name) cannot
      # terminate the wrapper's quoting.
      result = IO.popen(["bash", "-c", script], &:read).chomp

      raise ExtractionFailed, result unless $?.success?
      result
    rescue Errno::ENOENT => e
      # bash itself could not be started; keep reporting this as a failed extraction.
      raise ExtractionFailed, e.message
    end

    # Command prefix that limits the wrapped command to COMMAND_TIMEOUT seconds.
    def timeout
      "#{timeout_bin} #{COMMAND_TIMEOUT}"
    end

    # Name of the timeout executable: "timeout" when the shell can resolve it,
    # otherwise "gtimeout".
    def timeout_bin
      # gtimeout on Mac
      # `command -v` is the POSIX lookup; its output is discarded so a miss stays silent.
      system("command -v timeout >/dev/null 2>&1") ? "timeout" : "gtimeout"
    end
  end
end
20 changes: 12 additions & 8 deletions lib/docsplit/image_extractor.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,13 @@ module Docsplit
# Delegates to GraphicsMagick in order to convert PDF documents into
# nicely sized images.
class ImageExtractor
include ExternalProcess

MEMORY_ARGS = "-limit memory 256MiB -limit map 512MiB"
DEFAULT_FORMAT = :png
DEFAULT_DENSITY = '150'


# Extract a list of PDFs as rasterized page images, according to the
# configuration in options.
def extract(pdfs, options)
Expand All @@ -34,24 +36,26 @@ def convert(pdf, size, format, previous=nil)
pages = @pages || '1-' + Docsplit.extract_length(pdf).to_s
escaped_pdf = ESCAPE[pdf]
FileUtils.mkdir_p(directory) unless File.exists?(directory)
common = "#{MEMORY_ARGS} -density #{@density} #{resize_arg(size)} #{quality_arg(format)}"
env = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2"
common = "#{MEMORY_ARGS} -density #{@density} #{resize_arg(size)} #{quality_arg(format)}"

if previous
FileUtils.cp(Dir[directory_for(previous) + '/*'], directory)
result = `MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1`.chomp
raise ExtractionFailed, result if $? != 0
# We're adding `| grep -v '^$' | uniq` here and below because if a corrupt PDF is parsed, it generates an infinite amount of identical warnings (with blank lines in between).
# By filtering these we avoid memory bloat when the executing process tries to capture stdout.
# See https://github.com/GetSilverfin/silverfin/issues/1998

run("gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1", env)
else
page_list(pages).each do |page|
out_file = ESCAPE[File.join(directory, "#{basename}_#{page}.#{format}")]
cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin -define pdf:use-cropbox=true #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1".chomp
result = `#{cmd}`.chomp
raise ExtractionFailed, result if $? != 0
out_file = ESCAPE[File.join(directory, "#{basename}_#{page}.#{format}")]
run("gm convert +adjoin -define pdf:use-cropbox=true #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1", env)
end
end
ensure
FileUtils.remove_entry_secure tempdir if File.exists?(tempdir)
end


private

# Extract the relevant GraphicsMagick options from the options hash.
Expand Down
19 changes: 8 additions & 11 deletions lib/docsplit/text_extractor.rb
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ module Docsplit
# * Re-OCR each page in the `@pages_to_ocr` list at the end.
#
class TextExtractor
include ExternalProcess

NO_TEXT_DETECTED = /---------\n\Z/

Expand Down Expand Up @@ -61,22 +62,25 @@ def extract_from_ocr(pdf, pages)
base_path = File.join(@output, @pdf_name)
escaped_pdf = ESCAPE[pdf]
psm = @detect_orientation ? "-psm 1" : ""
timeout = 5.minutes.to_i
env = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2"

if pages
pages.each do |page|
tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif"
escaped_tiff = ESCAPE[tiff]
file = "#{base_path}_#{page}"
run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1"
run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} #{psm} 2>&1"
run("gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1", env)
run("tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} #{psm} 2>&1")
clean_text(file + '.txt') if @clean_ocr
FileUtils.remove_entry_secure tiff
end
else
tiff = "#{tempdir}/#{@pdf_name}.tif"
escaped_tiff = ESCAPE[tiff]
run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1"
run("gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1", env)
#if the user says don't do orientation detection or the plugin is not installed, set psm to 0
run "tesseract #{escaped_tiff} #{ESCAPE[base_path]} -l #{@language} #{psm} 2>&1"
run("tesseract #{escaped_tiff} #{ESCAPE[base_path]} -l #{@language} #{psm} 2>&1")
clean_text(base_path + '.txt') if @clean_ocr
end
ensure
Expand All @@ -95,13 +99,6 @@ def clean_text(file)
end
end

# Execute +command+ via backticks and hand back whatever it wrote to stdout.
# A non-zero exit status is turned into ExtractionFailed, with the captured
# output used as the exception message.
def run(command)
  output = `#{command}`
  return output if $? == 0

  raise ExtractionFailed, output
end

# Extract the full contents of a pdf as a single file, directly.
def extract_full(pdf)
text_path = File.join(@output, "#{@pdf_name}.txt")
Expand Down