Skip to content

Commit

Permalink
fix: crash on tesseract during some ocr process
Browse files Browse the repository at this point in the history
  • Loading branch information
Fabien committed Oct 16, 2024
1 parent 51a96da commit ba20cd9
Show file tree
Hide file tree
Showing 5 changed files with 18 additions and 21 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,6 @@ public void startConsumer() {

private void receiveDocument() {
try {
if (!MemoryUtils.hasEnoughAvailableMemory()) {
log.warn("There is not currently enough memory to perform consumption ");
return;
}
queueMessageService.consume(QueueName.QUEUE_DOCUMENT_ANALYSIS,
documentAnalysisDelay,
documentAnalysisTimeout,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,6 @@ public void startConsumer() {

private void receiveFile() {
try {
if (!MemoryUtils.hasEnoughAvailableMemory()) {
log.warn("There is not currently enough memory to perform consumption ");
return;
}
queueMessageService.consume(QueueName.QUEUE_FILE_ANALYSIS,
0,
fileAnalysisTimeout,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,6 @@ public void startConsumer() {

private void receiveFile() {
try {
if (!MemoryUtils.hasEnoughAvailableMemory()) {
log.warn("There is not currently enough memory to perform consumption ");
return;
}
queueMessageService.consume(QueueName.QUEUE_FILE_MINIFY,
0,
fileMinifyTimeout,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import fr.dossierfacile.common.entity.ocr.ParsedFile;
import fr.dossierfacile.common.utils.FileUtility;
import fr.dossierfacile.process.file.service.parsers.tools.PageExtractorModel;
import fr.dossierfacile.process.file.util.MemoryUtils;
import lombok.extern.slf4j.Slf4j;
import net.sourceforge.tess4j.ITessAPI;
import net.sourceforge.tess4j.Tesseract;
Expand Down Expand Up @@ -35,6 +36,7 @@ void init() {
this.tesseract.setVariable("user_defined_dpi", "300");
}
}

private BufferedImage[] getImages(File file) throws IOException {
if ("pdf".equalsIgnoreCase(FilenameUtils.getExtension(file.getName()))) {
BufferedImage[] images = FileUtility.convertPdfToImage(file);
Expand Down Expand Up @@ -76,7 +78,12 @@ protected T parse(BufferedImage... images) {
// rectangle exceeds image size
return null;
}
extractedTexts.put(entry.getKey(), this.tesseract.doOCR(image, rect));
MemoryUtils.logAvailableMemory(250);
String text;
synchronized (this) {
text = this.tesseract.doOCR(image, rect);
}
extractedTexts.put(entry.getKey(), text);
}
T result = getResultFromExtraction(extractedTexts);
if (result != null) {
Expand Down Expand Up @@ -112,8 +119,11 @@ protected boolean modelMatches(PageExtractorModel model, BufferedImage image) th
if (image.getWidth() < (zone.rect().x + zone.rect().width)
|| image.getHeight() < (zone.rect().y + zone.rect().height))
return false;

String text = this.tesseract.doOCR(image, zone.rect());
MemoryUtils.logAvailableMemory(250);
String text;
synchronized (this) {
text = this.tesseract.doOCR(image, zone.rect());
}
log.debug("expected: " + zone.regexp() + " actual: " + text + "b=" + (text != null && text.trim().matches(zone.regexp())));
return text != null && text.trim().matches(zone.regexp());
} catch (Exception e) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,12 @@

@Slf4j
public class MemoryUtils {
// TODO currently we only log low availability
public static boolean hasEnoughAvailableMemory() {
public static int INT_1024X1024 = 1048576;

public static void logAvailableMemory(int thresholdInMB) {
Runtime runtime = Runtime.getRuntime();
// Arbitrarily chosen: 250MB is the minimum available memory required to perform consumption
if (runtime.maxMemory() - runtime.totalMemory() + runtime.freeMemory() < 262144000) {
log.warn("Memory usage: (Total=" + runtime.totalMemory() / 1024 + " MB , max=" + runtime.maxMemory() / 1024 + " MB , free=" + runtime.freeMemory() / 1024 + " MB , avail=" + (runtime.maxMemory() - runtime.totalMemory() + runtime.freeMemory()));
if (runtime.maxMemory() - runtime.totalMemory() + runtime.freeMemory() < (long) thresholdInMB * INT_1024X1024) {
log.warn("Memory usage: (Total=" + runtime.totalMemory() / INT_1024X1024 + " MB , max=" + runtime.maxMemory() / INT_1024X1024 + " MB , free=" + runtime.freeMemory() / INT_1024X1024 + " MB , avail=" + ((runtime.maxMemory() - runtime.totalMemory() + runtime.freeMemory()) / INT_1024X1024));
}
return true;
}
}

0 comments on commit ba20cd9

Please sign in to comment.