forked from tleyden/open-ocr
-
Notifications
You must be signed in to change notification settings - Fork 2
/
sandwich_engine.go
488 lines (423 loc) · 16.2 KB
/
sandwich_engine.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
package ocrworker
import (
"context"
"encoding/base64"
"fmt"
"net/url"
"os"
"os/exec"
"path/filepath"
"strings"
"time"
"github.com/rs/zerolog"
"github.com/rs/zerolog/log"
)
// SandwichEngine is an OCR engine that calls the external "pdfsandwich"
// program via exec. It carries no state of its own.
// Depending on the requested ocr_type the engine returns the pdf with the
// ocr layer only, the merged variant of the input pdf plus ocr layer (with
// the ability to optimize the output pdf file by calling the "gs" tool), or
// the plain text extracted from the ocr layer.
type SandwichEngine struct{}
// SandwichEngineArgs holds the per-request settings used to drive pdfsandwich
// and the follow-up tools. Instances are built by NewSandwichEngineArgs.
//
// NOTE(review): all fields are unexported, so encoding/json ignores them and
// the json tags below have no effect; they only document the name of the
// engine argument each field is filled from.
type SandwichEngineArgs struct {
// configVars are the key/value pairs from the "config_vars" engine argument;
// Export renders each of them as "key=value".
configVars map[string]string `json:"config_vars"`
// lang is the "lang" engine argument, passed to pdfsandwich as "-lang".
lang string `json:"lang"`
// ocrType is the "ocr_type" engine argument; it selects the kind of result
// (COMBINEDPDF, OCRLAYERONLY or TXT, compared case-insensitively).
ocrType string `json:"ocr_type"`
// ocrOptimize is the "result_optimize" engine argument; when set, a
// COMBINEDPDF result is additionally passed through "gs".
ocrOptimize bool `json:"result_optimize"`
// saveFiles is copied from WorkerConfig.SaveFiles; when true the temporary
// files of a request are kept.
saveFiles bool
// t2pConverter is copied from WorkerConfig.Tiff2pdfConverter and names the
// converter ("convert" or "tiff2pdf") tried first for TIFF input.
t2pConverter string
// requestID is copied from OcrRequest.RequestID and is attached to log entries.
requestID string
// component is the component name attached to log entries ("OCR_WORKER").
component string
}
// NewSandwichEngineArgs generates the arguments for SandwichEngine which are
// used to start the involved tools.
//
// The worker-level settings (SaveFiles, Tiff2pdfConverter) are always copied
// from workerConfig, also when the request carries no engine arguments. From
// ocrRequest.EngineArgs the following keys are read:
//
//	"config_vars"     object whose values must all be strings
//	"lang"            string
//	"ocr_type"        string
//	"result_optimize" bool
//
// A key that is absent or nil leaves the corresponding field at its zero
// value. A key holding a value of an unexpected type yields a nil result and
// an error describing the offending value.
func NewSandwichEngineArgs(ocrRequest *OcrRequest, workerConfig *WorkerConfig) (*SandwichEngineArgs, error) {
	engineArgs := &SandwichEngineArgs{
		component: "OCR_WORKER",
		requestID: ocrRequest.RequestID,
	}
	// Worker settings do not depend on the request, so apply them before any
	// early return; a nil workerConfig simply leaves the zero values in place.
	if workerConfig != nil {
		// if true temp files won't be deleted
		engineArgs.saveFiles = workerConfig.SaveFiles
		engineArgs.t2pConverter = workerConfig.Tiff2pdfConverter
	}

	if ocrRequest.EngineArgs == nil {
		return engineArgs, nil
	}

	logger := zerolog.New(os.Stdout).With().
		Str("RequestID", engineArgs.requestID).Str("component", engineArgs.component).Timestamp().Logger()

	// config vars
	if rawConfigVars := ocrRequest.EngineArgs["config_vars"]; rawConfigVars != nil {
		logger.Info().Interface("configVarsMap", rawConfigVars).
			Msg("got configVarsMap")

		// Checked assertion: a client sending e.g. a string or a list here
		// must get an error back instead of crashing the worker.
		rawMap, ok := rawConfigVars.(map[string]interface{})
		if !ok {
			return nil, fmt.Errorf("could not convert config_vars into a map: %v", rawConfigVars)
		}

		configVars := make(map[string]string, len(rawMap))
		for key, rawValue := range rawMap {
			value, ok := rawValue.(string)
			if !ok {
				// Report the original value, not the zero value of the
				// failed assertion.
				return nil, fmt.Errorf("could not convert configVar into string: %v", rawValue)
			}
			configVars[key] = value
		}
		engineArgs.configVars = configVars
	}

	// language
	if lang := ocrRequest.EngineArgs["lang"]; lang != nil {
		langStr, ok := lang.(string)
		if !ok {
			return nil, fmt.Errorf("could not convert lang into string: %v", lang)
		}
		engineArgs.lang = langStr
	}

	// select from pdf, layer 1:pdf + layer 2:ocr_pdf
	if ocrType := ocrRequest.EngineArgs["ocr_type"]; ocrType != nil {
		ocrTypeStr, ok := ocrType.(string)
		if !ok {
			return nil, fmt.Errorf("could not convert into string: %v", ocrType)
		}
		engineArgs.ocrType = ocrTypeStr
	}

	// set optimize flag
	if ocrOptimize := ocrRequest.EngineArgs["result_optimize"]; ocrOptimize != nil {
		ocrOptimizeFlag, ok := ocrOptimize.(bool)
		if !ok {
			return nil, fmt.Errorf("could not convert into boolean: %v", ocrOptimize)
		}
		engineArgs.ocrOptimize = ocrOptimizeFlag
	}

	return engineArgs, nil
}
// Export returns the arguments as a slice that can be appended to the
// pdfsandwich command line, eg,
// ["-lang", "eng", "-tesso", "-c textonly_pdf=1", "foo=bar"].
//
// The language is only emitted when one is set; the "-tesso" option with the
// text-only setting is always emitted. Every config var follows as its own
// "key=value" element, in map iteration order (i.e. unspecified order).
//
// NOTE(review): the config vars end up as separate arguments rather than
// inside the "-tesso" string - confirm that pdfsandwich picks them up.
func (t *SandwichEngineArgs) Export() []string {
	args := make([]string, 0, 4+len(t.configVars))

	if len(t.lang) > 0 {
		args = append(args, "-lang", t.lang)
	}

	// tesseract options are handed to pdfsandwich through -tesso
	args = append(args, "-tesso", "-c textonly_pdf=1")

	// ranging over a nil map is a no-op, so no explicit nil check is needed
	for name, value := range t.configVars {
		args = append(args, fmt.Sprintf("%s=%s", name, value))
	}

	return args
}
// ProcessRequest processes an incoming OCR request by routing it through the
// whole process chain:
//
//  1. the request payload (base64 string, URL or raw bytes, in that order of
//     precedence) is stored in a temporary file,
//  2. the first bytes of that file are inspected to detect the file type,
//  3. the engine arguments are built from the request and the worker config,
//  4. the file is handed to processImageFile together with the request timeout.
//
// On failure the returned OcrResult has Status "error" and the error is
// returned alongside it.
func (t SandwichEngine) ProcessRequest(ocrRequest *OcrRequest, workerConfig *WorkerConfig) (OcrResult, error) {
	logger := zerolog.New(os.Stdout).With().
		Str("component", "OCR_SANDWICH").
		Str("RequestID", ocrRequest.RequestID).Timestamp().Logger()

	// Copy the configuration for logging purposes to prevent leaking passwords
	// to logs. This must be a copy of the struct, not of the pointer:
	// otherwise the password would be stripped from the configuration the
	// worker itself keeps using.
	workerConfigToLog := *workerConfig
	if parsedURL, err := url.Parse(workerConfigToLog.AmqpAPIURI); err == nil {
		workerConfigToLog.AmqpAPIURI = StripPasswordFromUrl(parsedURL)
	} else {
		// an unparsable URI may still contain credentials, never log it verbatim
		workerConfigToLog.AmqpAPIURI = "<redacted: unparsable URI>"
	}
	if parsedURL, err := url.Parse(workerConfigToLog.AmqpURI); err == nil {
		workerConfigToLog.AmqpURI = StripPasswordFromUrl(parsedURL)
	} else {
		workerConfigToLog.AmqpURI = "<redacted: unparsable URI>"
	}
	logger.Debug().Interface("workerConfig", &workerConfigToLog).Msg("worker configuration for this request")

	logger.Info().Str("DocType", ocrRequest.DocType).
		Str("ImgUrl", ocrRequest.ImgUrl).
		Str("ReplyTo", ocrRequest.ReplyTo).
		Bool("Deferred", ocrRequest.Deferred).
		Uint16("PageNumber", ocrRequest.PageNumber).
		Uint("TimeOut", ocrRequest.TimeOut).
		Int("ImgBase64Size", len(ocrRequest.ImgBase64)).
		Int("ImgBytesSize", len(ocrRequest.ImgBytes)).
		Str("UserAgent", ocrRequest.UserAgent).
		Str("ReferenceID", ocrRequest.ReferenceID).
		Msg("ocr request data")

	// store the payload in a temporary file; the external tools work on files
	var (
		tmpFileName string
		err         error
	)
	switch {
	case ocrRequest.ImgBase64 != "":
		tmpFileName, err = t.tmpFileFromImageBase64(ocrRequest.ImgBase64, ocrRequest.RequestID)
	case ocrRequest.ImgUrl != "":
		tmpFileName, err = t.tmpFileFromImageURL(ocrRequest.ImgUrl, ocrRequest.RequestID)
	default:
		tmpFileName, err = t.tmpFileFromImageBytes(ocrRequest.ImgBytes, ocrRequest.RequestID)
	}
	if err != nil {
		logger.Error().Caller().Err(err).Msg("error getting tmpFileName")
		return OcrResult{Text: "Internal server error", Status: "error"}, err
	}

	// detect if file type is supported
	buffer, err := readFirstBytes(tmpFileName, 64)
	if err != nil {
		logger.Warn().Err(err).
			Str("file_name", tmpFileName).
			Msg("safety check can not be completed, processing of current file will be aborted")
		return OcrResult{Text: "WARNING: provided file format is not supported", Status: "error"}, err
	}
	uplFileType := detectFileType(buffer)
	if uplFileType == "UNKNOWN" {
		errUnknown := fmt.Errorf("file format not understood")
		logger.Warn().Caller().Err(errUnknown).
			Str("file_type", uplFileType).
			Msg("only support TIFF and PDF input files")
		return OcrResult{Text: "only support TIFF and PDF input files", Status: "error"}, errUnknown
	}
	// a zerolog event is only written once Msg (or Send) is called
	logger.Info().Str("file_type", uplFileType).Msg("detected input file type")

	engineArgs, err := NewSandwichEngineArgs(ocrRequest, workerConfig)
	if err != nil {
		logger.Error().Err(err).Caller().Msg("error getting engineArgs")
		return OcrResult{Text: "can not build arguments", Status: "error"}, err
	}

	// the request timeout (in seconds) bounds the pdfsandwich run
	return t.processImageFile(tmpFileName, uplFileType, engineArgs, ocrRequest.TimeOut)
}
// tmpFileFromImageBytes stores imgBytes in a temporary file and returns the
// name of that file. The file name is always obtained from createTempFileName,
// which is given the proposed tmpFileName. On any failure an empty name is
// returned together with the error.
func (SandwichEngine) tmpFileFromImageBytes(imgBytes []byte, tmpFileName string) (string, error) {
	log.Info().Str("component", "OCR_SANDWICH").Msg("Use pdfsandwich with bytes image")

	target, err := createTempFileName(tmpFileName)
	if err != nil {
		return "", err
	}

	// the payload has to live in a file because the external tools are
	// pointed at file names
	if err := saveBytesToFileName(imgBytes, target); err != nil {
		return "", err
	}

	return target, nil
}
// tmpFileFromImageBase64 decodes the standard-base64 encoded base64Image and
// stores the decoded bytes in a temporary file. When tmpFileName is empty a
// name is generated with createTempFileName, otherwise tmpFileName is used as
// is. It returns the name of the written file, or an empty name and the error
// when generating the name, decoding or writing fails.
func (SandwichEngine) tmpFileFromImageBase64(base64Image, tmpFileName string) (string, error) {
	log.Info().Str("component", "OCR_SANDWICH").Msg("Use pdfsandwich with base 64")

	if tmpFileName == "" {
		generated, err := createTempFileName("")
		if err != nil {
			return "", err
		}
		tmpFileName = generated
	}

	// decoding into bytes the base64 string
	decoded, err := base64.StdEncoding.DecodeString(base64Image)
	if err != nil {
		// The decode error itself must be reported; returning a nil error
		// here would let the caller continue with an empty file name.
		return "", err
	}

	if err := saveBytesToFileName(decoded, tmpFileName); err != nil {
		return "", err
	}

	return tmpFileName, nil
}
// tmpFileFromImageURL stores the content found at imgURL in a temporary file
// and returns the name of that file. When tmpFileName is empty a name is
// generated with createTempFileName, otherwise tmpFileName is used as is. On
// any failure an empty name is returned together with the error.
func (SandwichEngine) tmpFileFromImageURL(imgURL, tmpFileName string) (string, error) {
	log.Info().Str("component", "OCR_SANDWICH").Msg("Use pdfsandwich with url")

	target := tmpFileName
	if target == "" {
		generated, err := createTempFileName("")
		if err != nil {
			return "", err
		}
		target = generated
	}

	// the content of the url has to live in a file because the external
	// tools are pointed at file names
	if err := saveUrlContentToFileName(imgURL, target); err != nil {
		return "", err
	}

	return target, nil
}
// buildCmdLineArgs builds the argument list for the pdfsandwich call and
// determines the name of the file pdfsandwich writes its result to.
//
// The output file is the input file name extended by "_ocr.pdf". The argument
// list consists of the exported engine arguments followed by the input file,
// "-o" and the output file. Since pdfsandwich can only produce pdf files, the
// pipeline works with pdf intermediates.
func (SandwichEngine) buildCmdLineArgs(inputFilename string, engineArgs *SandwichEngineArgs) (cmdArgs []string, ocrLayerFile string) {
	ocrLayerFile = inputFilename + "_ocr.pdf"

	flags := engineArgs.Export()
	cmdArgs = make([]string, 0, len(flags)+3)
	cmdArgs = append(cmdArgs, flags...)
	cmdArgs = append(cmdArgs, inputFilename, "-o", ocrLayerFile)

	// a zerolog event is only written once Msg (or Send) is called
	log.Info().Str("component", "OCR_SANDWICH").Interface("cmdArgs", cmdArgs).
		Msg("built pdfsandwich command line arguments")

	return cmdArgs, ocrLayerFile
}
// runExternalCmd runs commandToRun with cmdArgs and returns its combined
// stdout/stderr output.
//
// defaultTimeOutSeconds is the maximum run time as a time.Duration (despite
// its name it is not a plain number of seconds); when it elapses the command
// is killed through the context. In that case the output is discarded and an
// error starting with "command timed out, terminated" is returned which wraps
// the error reported for the command. A duration of zero or less expires
// immediately, so the command is not run at all.
//
// For any other failure the output captured so far is returned together with
// the error of the command.
func (SandwichEngine) runExternalCmd(commandToRun string, cmdArgs []string, defaultTimeOutSeconds time.Duration) (string, error) {
	ctx, cancel := context.WithTimeout(context.Background(), defaultTimeOutSeconds)
	defer cancel()

	log.Debug().Str("component", "OCR_SANDWICH").
		Str("command", commandToRun).
		Interface("cmdArgs", cmdArgs).
		Msg("running external command")

	output, err := exec.CommandContext(ctx, commandToRun, cmdArgs...).CombinedOutput()

	// Only a failed command can have timed out: a command that finished
	// successfully right before the deadline must not be turned into an error
	// just because the deadline passed before this check.
	if err != nil && ctx.Err() == context.DeadlineExceeded {
		// on deadline cancellation the output doesnt matter;
		// err is typically "signal: killed"
		return "", fmt.Errorf("command timed out, terminated: %w", err)
	}

	return string(output), err
}
// processImageFile runs the OCR pipeline for the file inputFilename and
// returns the resulting document, base64 encoded, in OcrResult.Text.
//
// uplFileType is the detected type of the input; a "TIFF" input is first
// converted to an intermediate pdf with the converter named in
// engineArgs.t2pConverter, falling back to the other converter when the first
// one fails. pdfsandwich is then run on the pdf with a timeout of
// configTimeOut seconds (NOTE: a value of 0 makes the command time out
// immediately). engineArgs.ocrType (case-insensitive) selects the result:
//
//	COMBINEDPDF   ocr layer stamped onto the input pdf with pdftk, optionally
//	              optimized with gs when engineArgs.ocrOptimize is set
//	OCRLAYERONLY  the pdf produced by pdfsandwich
//	TXT           the text extracted from the ocr layer with pdftotext
//
// Any other ocrType is rejected before an external tool is started.
//
// All temporary files of the request are deleted when the function returns,
// unless engineArgs.saveFiles is set or a processing step failed; in those
// cases the files are kept for debugging. On failure the returned OcrResult
// has Status "error" and the error is non-nil.
func (t SandwichEngine) processImageFile(inputFilename, uplFileType string, engineArgs *SandwichEngineArgs, configTimeOut uint) (OcrResult, error) {
	// if error flag is true, input files won't be deleted
	errorFlag := false
	filesToDelete := make([]string, 0, 5)

	logger := zerolog.New(os.Stdout).With().
		Str("component", "OCR_SANDWICH").
		Str("RequestID", engineArgs.requestID).Timestamp().Logger()

	// if command line argument save_files is set or any internal processing is failed the input file won't be deleted
	defer func() {
		if engineArgs.saveFiles || errorFlag {
			logger.Info().Interface("fileList", filesToDelete).
				Msg("All input files were not removed for debugging purposes due to flags or errors while processing")
			return
		}
		for _, element := range filesToDelete {
			logger.Info().Str("file_name", element).
				Bool("save_files_flag", engineArgs.saveFiles).
				Bool("errorFlag", errorFlag).
				Msg("deleting file " + element)
			if err := os.Remove(element); err != nil {
				logger.Warn().Err(err).Str("file_name", element).Msg("could not delete temporary file")
			}
		}
	}()

	defer timeTrack(time.Now(), "processing_time", "processing time", engineArgs.requestID)

	// fail marks the request as failed (which keeps the temporary files) and
	// builds the error result.
	fail := func(err error) (OcrResult, error) {
		errorFlag = true
		return OcrResult{Status: "error"}, err
	}

	// cmdError folds the captured output of a failed external command into
	// its error. The output is passed as an argument, never as format string.
	cmdError := func(output string, err error) error {
		if msg := strings.TrimSpace(output); msg != "" {
			return fmt.Errorf("%s: %w", msg, err)
		}
		return err
	}

	logger.Info().Interface("engineArgs", engineArgs).Msg("Engine arguments")

	originalInputfileName := inputFilename
	logger.Info().Str("file_name", inputFilename).Msg("input file name")
	filesToDelete = append(filesToDelete, inputFilename)

	// reject unsupported output formats before any expensive tool is started
	ocrType := strings.ToUpper(engineArgs.ocrType)
	switch ocrType {
	case "COMBINEDPDF", "OCRLAYERONLY", "TXT":
	default:
		err := fmt.Errorf("requested output format is not supported")
		logger.Error().Err(err).Caller().
			Str("ocr_type", engineArgs.ocrType).
			Msg("unsupported ocr_type requested")
		return fail(err)
	}

	if uplFileType == "TIFF" {
		alternativeConverter := ""
		switch engineArgs.t2pConverter {
		case "convert":
			alternativeConverter = "tiff2pdf"
			inputFilename = convertImageToPdf(inputFilename)
		case "tiff2pdf":
			alternativeConverter = "convert"
			inputFilename = tiff2Pdf(inputFilename)
		}
		// NOTE(review): with any other t2pConverter value no conversion takes
		// place and the TIFF itself is handed to pdfsandwich.

		/* if the first converter fails, we will automatically try the second one.
		   If the second one fails, we will break up processing and return an error to a caller */
		if inputFilename == "" {
			err := fmt.Errorf("can not convert input image to intermediate pdf, usually this is caused by a damaged input file")
			logger.Warn().Err(err).Caller().Msg("Error exec " + engineArgs.t2pConverter + " Try to switch the image converter to " + alternativeConverter)

			switch alternativeConverter {
			case "convert":
				inputFilename = convertImageToPdf(originalInputfileName)
			case "tiff2pdf":
				inputFilename = tiff2Pdf(originalInputfileName)
			}
			if inputFilename == "" {
				err := fmt.Errorf("entirely failed to convert the input image to intermediate pdf, usually this is caused by a damaged input file")
				logger.Error().Err(err).Caller().Msg("Error exec " + alternativeConverter)
				return fail(err)
			}
		}

		// the converters presumably write a new intermediate pdf; track it so
		// it is cleaned up together with the other temporary files
		if inputFilename != originalInputfileName {
			filesToDelete = append(filesToDelete, inputFilename)
		}
	}

	cmdArgs, ocrLayerFile := t.buildCmdLineArgs(inputFilename, engineArgs)
	// the ocr layer is a temporary file as well, also when it is the
	// delivered result: it is read before the deferred cleanup runs
	filesToDelete = append(filesToDelete, ocrLayerFile)

	logger.Info().Str("command", "pdfsandwich").Interface("cmdArgs", cmdArgs).
		Uint("command_timeout", configTimeOut).
		Msg("running external pdfsandwich command")

	extCommandTimeout := time.Duration(configTimeOut) * time.Second
	output, errSandwich := t.runExternalCmd("pdfsandwich", cmdArgs, extCommandTimeout)
	if errSandwich != nil {
		errSandwich = cmdError(output, errSandwich)
		logger.Error().Err(errSandwich).Caller().Msg("Error exec external command")
		return fail(errSandwich)
	}

	var fileToDeliver string
	switch ocrType {
	case "COMBINEDPDF":
		tmpOutCombinedPdf := inputFilename + "_comb.pdf"
		filesToDelete = append(filesToDelete, tmpOutCombinedPdf)

		// pdftk FILE_only_TEXT-LAYER.pdf multistamp FILE_ORIGINAL_IMAGE.pdf output FILE_OUTPUT_IMAGE_AND_TEXT_LAYER.pdf
		combinedArgs := []string{ocrLayerFile, "multistamp", inputFilename, "output", tmpOutCombinedPdf}
		logger.Info().Interface("combinedArgs", combinedArgs).
			Msg("Arguments for pdftk to combine pdf files")

		if outPdftk, errPdftk := exec.Command("pdftk", combinedArgs...).CombinedOutput(); errPdftk != nil {
			// return the pdftk error itself; the pdfsandwich error is nil here
			errPdftk = cmdError(string(outPdftk), errPdftk)
			logger.Error().Err(errPdftk).Caller().
				Str("file_name", tmpOutCombinedPdf).
				Msg("Error running command")
			return fail(errPdftk)
		}
		fileToDeliver = tmpOutCombinedPdf

		if engineArgs.ocrOptimize {
			logger.Info().Msg("optimizing was requested, performing selected operation")

			tmpOutCompressedPdf := inputFilename + "_compr.pdf"
			filesToDelete = append(filesToDelete, tmpOutCompressedPdf)

			compressedArgs := []string{
				"-sDEVICE=pdfwrite",
				"-dCompatibilityLevel=1.7",
				"-dPDFSETTINGS=/prepress",
				"-dNOPAUSE",
				"-dBATCH",
				"-dQUIET",
				"-sOutputFile=" + tmpOutCompressedPdf,
				tmpOutCombinedPdf,
			}
			// distinct keys: a repeated key would appear twice in the JSON log line
			logger.Info().Str("compressed_file_name", tmpOutCompressedPdf).
				Str("combined_file_name", tmpOutCombinedPdf).
				Interface("compressedArgs", compressedArgs).
				Msg("Arguments for gs to optimize the combined pdf file")

			if outGs, errGs := exec.Command("gs", compressedArgs...).CombinedOutput(); errGs != nil {
				// return the gs error itself; the pdfsandwich error is nil here
				errGs = cmdError(string(outGs), errGs)
				logger.Error().Err(errGs).Caller().
					Str("file_name", tmpOutCompressedPdf).
					Msg("Error running command")
				return fail(errGs)
			}
			fileToDeliver = tmpOutCompressedPdf
		}
	case "OCRLAYERONLY":
		fileToDeliver = ocrLayerFile
	case "TXT":
		// pdftotext will create %filename%.txt
		logger.Info().Msg("extracting text from ocr")
		textFile := strings.TrimSuffix(ocrLayerFile, filepath.Ext(ocrLayerFile)) + ".txt"
		filesToDelete = append(filesToDelete, textFile)

		if outPdfToText, errPdfToText := exec.Command("pdftotext", ocrLayerFile).CombinedOutput(); errPdfToText != nil {
			// without a text file there is nothing to deliver, stop here
			errPdfToText = cmdError(string(outPdfToText), errPdfToText)
			logger.Error().Caller().Err(errPdfToText).Msg("error exec pdftotext")
			return fail(errPdfToText)
		}
		fileToDeliver = textFile
	}

	logger.Info().Str("file_name", fileToDeliver).Msg("resulting file")

	outBytes, errRead := os.ReadFile(fileToDeliver)
	if errRead != nil {
		logger.Error().Caller().Err(errRead).Msg("Error getting data from result file")
		// a missing result is a processing failure too: keep the files
		return fail(errRead)
	}

	return OcrResult{
		Text:   base64.StdEncoding.EncodeToString(outBytes),
		Status: "done",
	}, nil
}