-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathwineocr.js
116 lines (91 loc) · 3.28 KB
/
wineocr.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
/*
psm: Set Tesseract to only run a subset of layout analysis
and assume a certain form of image. The options are:
0 = Orientation and script detection (OSD) only.
1 = Automatic page segmentation with OSD.
2 = Automatic page segmentation, but no OSD, or OCR.
3 = Fully automatic page segmentation, but no OSD. (Default)
4 = Assume a single column of text of variable sizes.
5 = Assume a single uniform block of vertically aligned text.
6 = Assume a single uniform block of text.
7 = Treat the image as a single text line.
8 = Treat the image as a single word.
9 = Treat the image as a single word in a circle.
10 = Treat the image as a single character.
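Default options (used when the corresponding command-line argument is omitted):
Recognize French text ('fra')
with fully automatic page segmentation, but no OSD (psm 3), in the bundled example image file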
*/
// Module dependencies, command-line inputs and the Tesseract binary location.
var fs = require('fs'),
_ = require("underscore"),
_s = require("underscore.string"),
tesseract = require('node-tesseract'),
// Command-line arguments with the first two process.argv entries dropped.
// NOTE(review): this variable is named `arguments`, which is also the name of
// the implicit function arguments object; such a declaration is rejected in
// strict mode -- confirm before adding 'use strict' or converting to ES modules.
arguments = process.argv.slice(2),
// First CLI argument, captured before the defaults at the bottom of the file
// are applied. Not referenced by the other code visible here (processOCR's
// parameter of the same name shadows it).
image_path = arguments[0],
// Passed to node-tesseract as the `binary` option in the entry-point code below.
tesseract_path = '/usr/local/bin/tesseract';
/**
 * Run Tesseract on a single image and store the result next to it.
 *
 * The path is processed only when it is a regular file whose extension is
 * accepted by hasValidExtension; anything else is skipped silently and
 * `callback` is NOT invoked. The recognized text -- or, on failure, the error
 * reported by tesseract.process -- is written to `<image_path>.ocr.txt` and
 * echoed to stdout.
 *
 * @param {string} image_path - Path of the image to recognize.
 * @param {Object} options - Options forwarded unchanged to tesseract.process.
 * @param {Function} [callback] - Called with no arguments once the result has
 *   been logged (the output file write may still be in flight at that point).
 * @throws {Error} Whatever fs.statSync throws when image_path cannot be stat'ed.
 */
var processOCR = function(image_path, options, callback) {
  var stats = fs.statSync(image_path);
  if (!stats.isFile() || !hasValidExtension(image_path)) {
    return;
  }

  tesseract.process(image_path, options, function(err, text) {
    // Declared locally so that concurrent OCR callbacks never share state
    // through accidental globals.
    var output_file = image_path + '.ocr.txt';
    var output = err ? err : text;

    console.log(_s.sprintf("%s, size: %s", image_path, getFilesizeInBytes(image_path, false)));

    // fs.writeFile requires a completion callback and string/Buffer data on
    // current Node versions, so stringify the payload and report write
    // failures instead of dropping them.
    fs.writeFile(output_file, String(output), function(writeErr) {
      if (writeErr) {
        console.error('Could not write ' + output_file + ': ' + writeErr.message);
      }
    });

    console.log('----------------------------------------------------');
    console.log(output);

    if (callback) {
      callback();
    }
  });
};
/**
 * Recursively visit every regular file below a directory.
 *
 * Entries are read synchronously; sub-directories are descended into
 * depth-first, and entries that are neither regular files nor directories
 * are ignored.
 *
 * @param {string} currentDirPath - Directory to scan.
 * @param {Function} callback - Invoked as callback(filePath, stat) for each
 *   regular file, where `stat` is the fs.Stats object of that file.
 */
var walk = function(currentDirPath, callback) {
  var fs = require('fs');
  var path = require('path');

  var names = fs.readdirSync(currentDirPath);
  for (var i = 0; i < names.length; i++) {
    var entryPath = path.join(currentDirPath, names[i]);
    var entryStat = fs.statSync(entryPath);

    if (entryStat.isDirectory()) {
      walk(entryPath, callback);
      continue;
    }
    if (entryStat.isFile()) {
      callback(entryPath, entryStat);
    }
  }
};
/**
 * Tell whether a path looks like a JPEG image, judging by its extension only.
 *
 * The comparison is case-insensitive, so `.jpg`, `.JPG` and `.Jpeg` are all
 * accepted.
 *
 * @param {string} path - File name or path to inspect.
 * @returns {boolean} true when the extension is `jpg` or `jpeg`.
 */
var hasValidExtension = function(path) {
  var valids = ['jpg', 'jpeg'];
  // Normalize case once instead of enumerating every spelling in the list.
  var ext = getExtension(path).toLowerCase();
  return valids.indexOf(ext) !== -1;
};
/**
 * Extract the extension (without the dot) from a file name or path.
 *
 * Only the last path segment is considered, so dots inside directory names
 * are ignored, and a name without any dot yields an empty string rather than
 * the name itself.
 *
 * @param {string} filename - File name or path.
 * @returns {string} Text after the last dot of the final path segment, or ''
 *   when that segment contains no dot.
 */
var getExtension = function(filename) {
  // Accept both separators so the result does not depend on the platform.
  var base = filename.split(/[\\\/]/).pop();
  var dot = base.lastIndexOf('.');
  return dot === -1 ? '' : base.slice(dot + 1);
};
/**
 * Return the size of a file as reported by fs.statSync.
 *
 * @param {string} filename - Path of the file to measure.
 * @param {boolean} [convertToMegaBytes=false] - When truthy, the byte count
 *   is divided by 1,000,000 before being returned.
 * @returns {number} Size in bytes, or in megabytes when requested.
 * @throws {Error} Whatever fs.statSync throws when the file cannot be stat'ed.
 */
var getFilesizeInBytes = function(filename, convertToMegaBytes) {
  var size = fs.statSync(filename).size;
  return convertToMegaBytes ? size / 1000000.0 : size;
};
// Entry point. Usage: node wineocr.js [image-or-directory] [psm] [language]
// Any argument that was omitted falls back to the defaults below.
arguments[0] = arguments[0] || __dirname + '/examples/etiquette_3.jpg';
arguments[1] = arguments[1] || '3'; // fully automatic page segmentation, but no OSD.
arguments[2] = arguments[2] || 'fra'; // chauvinism.

var options = {
  psm: arguments[1],
  l: arguments[2],
  binary: tesseract_path
};

var stats;
try {
  stats = fs.statSync(arguments[0]);
} catch (statErr) {
  // A missing or unreadable target is a usage error: report it in one line
  // and exit non-zero rather than dumping an uncaught stack trace.
  console.error('Cannot read ' + arguments[0] + ': ' + statErr.message);
  process.exit(1);
}

if (stats.isFile()) {
  processOCR(arguments[0], options);
} else if (stats.isDirectory()) {
  // Hand every file below the directory to processOCR, which itself skips
  // paths whose extension it does not accept.
  walk(arguments[0], function(filePath) {
    processOCR(filePath, options);
  });
}