From 6669ccd002d4e1147adfc2d3d34b3d336a79fc9e Mon Sep 17 00:00:00 2001 From: Balearica Date: Thu, 31 Aug 2023 22:40:47 -0700 Subject: [PATCH 01/16] Initial commit of v5 changes --- src/Tesseract.js | 8 +--- src/constants/config.js | 5 -- src/constants/defaultOptions.js | 12 +++++ src/createWorker.js | 32 ++++++++++--- src/index.d.ts | 7 +-- src/worker-script/browser/getCore.js | 15 ++++-- src/worker-script/index.js | 71 +++++++++++++++++++++------- src/worker-script/node/getCore.js | 15 ++++-- tests/FS.test.js | 2 +- tests/detect.test.js | 4 +- tests/recognize.test.js | 31 ++++++------ tests/scheduler.test.js | 5 +- 12 files changed, 140 insertions(+), 67 deletions(-) delete mode 100644 src/constants/config.js diff --git a/src/Tesseract.js b/src/Tesseract.js index 953967434..3506e90ae 100644 --- a/src/Tesseract.js +++ b/src/Tesseract.js @@ -1,9 +1,7 @@ const createWorker = require('./createWorker'); const recognize = async (image, langs, options) => { - const worker = await createWorker(options); - await worker.loadLanguage(langs); - await worker.initialize(langs); + const worker = await createWorker(langs, 1, options); return worker.recognize(image) .finally(async () => { await worker.terminate(); @@ -11,9 +9,7 @@ const recognize = async (image, langs, options) => { }; const detect = async (image, options) => { - const worker = await createWorker(options); - await worker.loadLanguage('osd'); - await worker.initialize('osd'); + const worker = await createWorker("osd", 0, options); return worker.detect(image) .finally(async () => { await worker.terminate(); diff --git a/src/constants/config.js b/src/constants/config.js deleted file mode 100644 index f61b06298..000000000 --- a/src/constants/config.js +++ /dev/null @@ -1,5 +0,0 @@ -const OEM = require('./OEM'); - -module.exports = { - defaultOEM: OEM.DEFAULT, -}; diff --git a/src/constants/defaultOptions.js b/src/constants/defaultOptions.js index e4254203b..d4b7def29 100644 --- a/src/constants/defaultOptions.js +++ b/src/constants/defaultOptions.js @@ -1,4 +1,16 @@ +const OEM = require('./OEM'); + module.exports = { + // If `oem = OEM.LSTM_ONLY` (the default), only the code and language data for the LSTM model is loaded. + // This significantly decreases network and memory use for LSTM-only users (the vast majority), + // and has no impact on Legacy-only users. + // For the small number of users that use both models, this increases network and memory use + // (as two sets of code and language data end up being downloaded). + // For these users, `oemLang` and `oemCore` should be set to `OEM.TESSERACT_LSTM_COMBINED`, + // which forces download of language data and core code (respectively) that support both models. + oemLang: OEM.DEFAULT, + oemCore: OEM.DEFAULT, + /* * default path for downloading *.traineddata */ diff --git a/src/createWorker.js b/src/createWorker.js index 8fbd426bf..6f0bb7846 100644 --- a/src/createWorker.js +++ b/src/createWorker.js @@ -3,7 +3,7 @@ const circularize = require('./utils/circularize'); const createJob = require('./createJob'); const { log } = require('./utils/log'); const getId = require('./utils/getId'); -const { defaultOEM } = require('./constants/config'); +const OEM = require('./constants/OEM'); const { defaultOptions, spawnWorker, @@ -15,7 +15,7 @@ const { let workerCounter = 0; -module.exports = async (_options = {}) => { +module.exports = async (langs = "eng", oem = OEM.LSTM_ONLY, _options = {}, config = {}) => { const id = getId('Worker', workerCounter); const { logger, @@ -69,7 +69,7 @@ module.exports = async (_options = {}) => { const loadInternal = (jobId) => ( startJob(createJob({ - id: jobId, action: 'load', payload: { options }, + id: jobId, action: 'load', payload: {options: { oem: options.oemCore, corePath: options.corePath, logging: options.logging }}, })) ); @@ -105,7 +105,11 @@ module.exports = async (_options = {}) => { })) ); - const loadLanguage = (langs = 'eng', jobId) => ( + const loadLanguage = () => ( + console.warn('`loadLanguage` is depreciated and should be removed from code (workers now come with language pre-loaded)') + ); + + const loadLanguageInternal = (langs = 'eng', jobId) => ( startJob(createJob({ id: jobId, action: 'loadLanguage', @@ -113,7 +117,11 @@ module.exports = async (_options = {}) => { })) ); - const initialize = (langs = 'eng', oem = defaultOEM, config, jobId) => ( + const initialize = () => ( + console.warn('`initialize` is depreciated and should be removed from code (workers now come pre-initialized)') + ); + + const initializeInternal = (langs = 'eng', oem = OEM.LSTM_ONLY, config, jobId) => ( startJob(createJob({ id: jobId, action: 'initialize', @@ -121,6 +129,13 @@ module.exports = async (_options = {}) => { })) ); + // TODO: If OEM is not specified, this should default to whatever it was before. + // In other words, if it was initialized with Legacy `reinitialize` should not change to LSTM. + const reinitialize = (langs = 'eng', oem = OEM.DEFAULT, config, jobId) => ( + loadLanguageInternal(langs, jobId) + .then(() => initializeInternal(langs, oem, config, jobId)) + ); + const setParameters = (params = {}, jobId) => ( startJob(createJob({ id: jobId, @@ -207,6 +222,7 @@ module.exports = async (_options = {}) => { FS, loadLanguage, initialize, + reinitialize, setParameters, recognize, getPDF, @@ -214,7 +230,11 @@ module.exports = async (_options = {}) => { terminate, }; - loadInternal().then(() => workerResResolve(resolveObj)).catch(() => {}); + loadInternal() + .then(() => loadLanguageInternal(langs)) + .then(() => initializeInternal(langs, oem, config)) + .then(() => workerResResolve(resolveObj)) + .catch(() => {}); return workerRes; }; diff --git a/src/index.d.ts b/src/index.d.ts index 58598e8d6..5cdb72f8c 100644 --- a/src/index.d.ts +++ b/src/index.d.ts @@ -1,6 +1,6 @@ declare namespace Tesseract { function createScheduler(): Scheduler - function createWorker(options?: Partial): Promise + function createWorker(langs?: string | Lang[], oem?: OEM, options?: Partial, config?: string | Partial): Promise function setLogging(logging: boolean): void function recognize(image: ImageLike, langs?: string, options?: Partial): Promise function detect(image: ImageLike, options?: Partial): any @@ -20,8 +20,7 @@ declare namespace Tesseract { readText(path: string, jobId?: string): Promise removeText(path: string, jobId?: string): Promise FS(method: string, args: any[], jobId?: string): Promise - loadLanguage(langs?: string | Lang[], jobId?: string): Promise - initialize(langs?: string | Lang[], oem?: OEM, config?: string | Partial, jobId?: string): Promise + reinitialize(langs?: string | Lang[], oem?: OEM, config?: string | Partial, jobId?: string): Promise setParameters(params: Partial, jobId?: string): Promise getImage(type: imageType): string recognize(image: ImageLike, options?: Partial, output?: Partial, jobId?: string): Promise @@ -61,6 +60,8 @@ declare namespace Tesseract { cacheMethod: string workerBlobURL: boolean gzip: boolean + oemLang: OEM + oemCore: OEM logger: (arg: LoggerMessage) => void, errorHandler: (arg: any) => void } diff --git a/src/worker-script/browser/getCore.js b/src/worker-script/browser/getCore.js index dad1e0966..eb7626397 100644 --- a/src/worker-script/browser/getCore.js +++ b/src/worker-script/browser/getCore.js @@ -1,7 +1,8 @@ const { simd } = require('wasm-feature-detect'); const { dependencies } = require('../../../package.json'); +const OEM = require('../../constants/OEM'); -module.exports = async (corePath, res) => { +module.exports = async (oem, corePath, res) => { if (typeof global.TesseractCore === 'undefined') { res.progress({ status: 'loading tesseract core', progress: 0 }); @@ -19,9 +20,17 @@ module.exports = async (corePath, res) => { } else { const simdSupport = await simd(); if (simdSupport) { - corePathImportFile = `${corePathImport.replace(/\/$/, '')}/tesseract-core-simd.wasm.js`; + if (oem === OEM.LSTM_ONLY) { + corePathImportFile = `${corePathImport.replace(/\/$/, '')}/tesseract-core-simd-lstm.wasm.js`; + } else { + corePathImportFile = `${corePathImport.replace(/\/$/, '')}/tesseract-core-simd.wasm.js`; + } } else { - corePathImportFile = `${corePathImport.replace(/\/$/, '')}/tesseract-core.wasm.js`; + if (oem === OEM.LSTM_ONLY) { + corePathImportFile = `${corePathImport.replace(/\/$/, '')}/tesseract-core-lstm.wasm.js`; + } else { + corePathImportFile = `${corePathImport.replace(/\/$/, '')}/tesseract-core.wasm.js`; + } } } diff --git a/src/worker-script/index.js b/src/worker-script/index.js index d5c3c672a..cf1aa1194 100644 --- a/src/worker-script/index.js +++ b/src/worker-script/index.js @@ -28,13 +28,14 @@ let api = null; let latestJob; let adapter = {}; let params = defaultParams; -let cachePathWorker; -let cacheMethodWorker; +let loadLanguageLangsWorker; +let loadLanguageOptionsWorker; +let dataFromCache = false; -const load = async ({ workerId, jobId, payload: { options: { corePath, logging } } }, res) => { +const load = async ({ workerId, jobId, payload: { options: { oem, corePath, logging } } }, res) => { setLogging(logging); if (!TessModule) { - const Core = await adapter.getCore(corePath, res); + const Core = await adapter.getCore(oem, corePath, res); res.progress({ workerId, status: 'initializing tesseract', progress: 0 }); @@ -76,9 +77,13 @@ const loadLanguage = async ({ }, }, res) => { - // Remember cache options for later, as cache may be deleted if `initialize` fails - cachePathWorker = cachePath; - cacheMethodWorker = cacheMethod; + // Remember options for later, as cache may be deleted if `initialize` fails + loadLanguageLangsWorker = langs; + loadLanguageOptionsWorker = {langPath: langPath, + dataPath: dataPath, + cachePath: cachePath, + cacheMethod: cacheMethod, + gzip: gzip}; const loadAndGunzipFile = async (_lang) => { const lang = typeof _lang === 'string' ? _lang : _lang.code; @@ -94,8 +99,9 @@ res) => { const _data = await readCache(`${cachePath || '.'}/${lang}.traineddata`); if (typeof _data !== 'undefined') { log(`[${workerId}]: Load ${lang}.traineddata from cache`); - res.progress({ workerId, status: 'loading language traineddata (from cache)', progress: 0.5 }); + if (res) res.progress({ workerId, status: 'loading language traineddata (from cache)', progress: 0.5 }); data = _data; + dataFromCache = true; } else { throw Error('Not found in cache'); } @@ -144,7 +150,7 @@ res) => { try { TessModule.FS.mkdir(dataPath); } catch (err) { - res.reject(err.toString()); + if (res) res.reject(err.toString()); } } TessModule.FS.writeFile(`${dataPath || '.'}/${lang}.traineddata`, data); @@ -158,16 +164,16 @@ res) => { log(err.toString()); } } - return Promise.resolve(); + return; }; - res.progress({ workerId, status: 'loading language traineddata', progress: 0 }); + if (res) res.progress({ workerId, status: 'loading language traineddata', progress: 0 }); try { await Promise.all((typeof langs === 'string' ? langs.split('+') : langs).map(loadAndGunzipFile)); - res.progress({ workerId, status: 'loaded language traineddata', progress: 1 }); - res.resolve(langs); + if (res) res.progress({ workerId, status: 'loaded language traineddata', progress: 1 }); + if (res) res.resolve(langs); } catch (err) { - res.reject(err.toString()); + if (res) res.reject(err.toString()); } }; @@ -230,18 +236,49 @@ const initialize = async ({ } api = new TessModule.TessBaseAPI(); - const status = api.Init(null, langs, oem); + let status = api.Init(null, langs, oem); if (status === -1) { + // Cache is deleted if initialization fails to avoid keeping bad data in cache // This assumes that initialization failing only occurs due to bad .traineddata, // this should be refined if other reasons for init failing are encountered. - if (['write', 'refresh', undefined].includes(cacheMethodWorker)) { + // The "if" condition skips this section if either (1) cache is disabled [so the issue + // is definitely unrelated to cached data] or (2) cache is set to read-only + // [so we do not have permission to make any changes]. + if (['write', 'refresh', undefined].includes(loadLanguageOptionsWorker.cacheMethod)) { const langsArr = langs.split('+'); - const delCachePromise = langsArr.map((lang) => adapter.deleteCache(`${cachePathWorker || '.'}/${lang}.traineddata`)); + const delCachePromise = langsArr.map((lang) => adapter.deleteCache(`${loadLanguageOptionsWorker.cachePath || '.'}/${lang}.traineddata`)); await Promise.all(delCachePromise); + + // Check for the case when (1) data was loaded from the cache and (2) the data does not support the requested OEM. + // In this case, loadLanguage is re-run and initialization is attempted a second time. + // This is because `loadLanguage` has no mechanism for checking whether the cached data supports the requested model, + // so this only becomes apparent when initialization fails. + + // Check for this error message: + // "Tesseract (legacy) engine requested, but components are not present in ./eng.traineddata!!"" + // The .wasm build of Tesseract saves this message in a separate file (in addition to the normal debug file location). + const debugStr = TessModule.FS.readFile('/debugDev.txt', { encoding: 'utf8', flags: 'a+' }); + if (dataFromCache && /components are not present/.test(debugStr)) { + log('Data from cache missing requested OEM model. Attempting to refresh cache with new language data.'); + // In this case, language data is re-loaded + await loadLanguage({workerId: workerId, payload: {langs: loadLanguageLangsWorker, options: loadLanguageOptionsWorker}}); + status = api.Init(null, langs, oem); + if (status === -1) { + log('Language data refresh failed.'); + const delCachePromise = langsArr.map((lang) => adapter.deleteCache(`${loadLanguageOptionsWorker.cachePath || '.'}/${lang}.traineddata`)); + await Promise.all(delCachePromise); + } else { + log('Language data refresh successful.'); + } + } } + } + + if (status === -1) { res.reject('initialization failed'); } + params = defaultParams; await setParameters({ payload: { params } }); res.progress({ diff --git a/src/worker-script/node/getCore.js b/src/worker-script/node/getCore.js index 03469dd7e..151606eea 100644 --- a/src/worker-script/node/getCore.js +++ b/src/worker-script/node/getCore.js @@ -1,18 +1,27 @@ const { simd } = require('wasm-feature-detect'); +const OEM = require('../../constants/OEM'); let TesseractCore = null; /* * getCore is a sync function to load and return * TesseractCore. */ -module.exports = async (_, res) => { +module.exports = async (oem, _, res) => { if (TesseractCore === null) { const simdSupport = await simd(); res.progress({ status: 'loading tesseract core', progress: 0 }); if (simdSupport) { - TesseractCore = require('tesseract.js-core/tesseract-core-simd'); + if (oem === OEM.LSTM_ONLY) { + TesseractCore = require('tesseract.js-core/tesseract-core-simd-lstm'); + } else { + TesseractCore = require('tesseract.js-core/tesseract-core-simd'); + } } else { - TesseractCore = require('tesseract.js-core/tesseract-core'); + if (oem === OEM.LSTM_ONLY) { + TesseractCore = require('tesseract.js-core/tesseract-core-lstm'); + } else { + TesseractCore = require('tesseract.js-core/tesseract-core'); + } } res.progress({ status: 'loaded tesseract core', progress: 1 }); } diff --git a/tests/FS.test.js b/tests/FS.test.js index 91c5bf02f..df82a84c2 100644 --- a/tests/FS.test.js +++ b/tests/FS.test.js @@ -3,7 +3,7 @@ const FS_WAIT = 500; let worker; before(async function cb() { this.timeout(0); - worker = await createWorker(OPTIONS); + worker = await createWorker("eng", 1, OPTIONS); }); describe('FS', async () => { diff --git a/tests/detect.test.js b/tests/detect.test.js index 535d432fd..b6f8026c5 100644 --- a/tests/detect.test.js +++ b/tests/detect.test.js @@ -2,7 +2,7 @@ const { createWorker } = Tesseract; let worker; before(async function cb() { this.timeout(0); - worker = await createWorker(OPTIONS); + worker = await createWorker("osd", 1, OPTIONS); }); describe('detect()', async () => { @@ -10,8 +10,6 @@ describe('detect()', async () => { [ { name: 'cosmic.png', ans: { script: 'Latin' } }, ].forEach(async ({ name, ans: { script } }) => { - await worker.loadLanguage('osd'); - await worker.initialize('osd'); const { data: { script: s } } = await worker.detect(`${IMAGE_PATH}/${name}`); expect(s).to.be(script); }); diff --git a/tests/recognize.test.js b/tests/recognize.test.js index 41a6f23b5..79831b318 100644 --- a/tests/recognize.test.js +++ b/tests/recognize.test.js @@ -2,15 +2,14 @@ const { createWorker, PSM } = Tesseract; let worker; before(async function cb() { this.timeout(0); - worker = await createWorker(OPTIONS); - await worker.loadLanguage('eng+chi_tra+osd'); + worker = await createWorker("eng+chi_tra+osd", 1, OPTIONS); }); describe('recognize()', () => { describe('should read bmp, jpg, png and pbm format images', () => { FORMATS.forEach(format => ( it(`support ${format} format`, async () => { - await worker.initialize('eng'); + await worker.reinitialize('eng'); const { data: { text } } = await worker.recognize(`${IMAGE_PATH}/simple.${format}`); expect(text).to.be(SIMPLE_TEXT); }).timeout(TIMEOUT) @@ -23,7 +22,7 @@ describe('recognize()', () => { { format: 'jpg', image: SIMPLE_JPG_BASE64, ans: SIMPLE_TEXT }, ].forEach(({ format, image, ans }) => ( it(`recongize ${format} in base64`, async () => { - await worker.initialize('eng'); + await worker.reinitialize('eng'); const { data: { text } } = await worker.recognize(image); expect(text).to.be(ans); }).timeout(TIMEOUT) @@ -37,7 +36,7 @@ describe('recognize()', () => { { name: 'simple-270.jpg', desc: 'simple', ans: SIMPLE_TEXT }, ].forEach(({ name, desc, ans }) => ( it(`recongize ${desc} image`, async () => { - await worker.initialize('eng'); + await worker.reinitialize('eng'); const { data: { text } } = await worker.recognize(`${IMAGE_PATH}/${name}`); expect(text).to.be(ans); }).timeout(TIMEOUT) @@ -62,7 +61,7 @@ describe('recognize()', () => { { name: 'chinese.png', lang: 'chi_tra', ans: CHINESE_TEXT }, ].forEach(({ name, lang, ans }) => ( it(`recongize ${lang}`, async () => { - await worker.initialize(lang); + await worker.reinitialize(lang); const { data: { text } } = await worker.recognize(`${IMAGE_PATH}/${name}`); expect(text).to.be(ans); }).timeout(TIMEOUT) @@ -76,7 +75,7 @@ describe('recognize()', () => { { name: 'testocr.png', desc: 'large', ans: TESTOCR_TEXT }, ].forEach(({ name, desc, ans }) => ( it(`recongize ${desc} image`, async () => { - await worker.initialize('eng'); + await worker.reinitialize('eng'); const { data: { text } } = await worker.recognize(`${IMAGE_PATH}/${name}`); expect(text).to.be(ans); }).timeout(TIMEOUT) @@ -92,7 +91,7 @@ describe('recognize()', () => { name, left, top, width, height, ans, }) => ( it(`recongize half ${name}`, async () => { - await worker.initialize('eng'); + await worker.reinitialize('eng'); const { data: { text } } = await worker.recognize( `${IMAGE_PATH}/${name}`, { @@ -108,7 +107,7 @@ describe('recognize()', () => { describe('should work with selected parameters', () => { it('support preserve_interword_spaces', async () => { - await worker.initialize('eng'); + await worker.reinitialize('eng'); await worker.setParameters({ preserve_interword_spaces: '1', }); @@ -117,7 +116,7 @@ describe('recognize()', () => { }).timeout(TIMEOUT); it('support tessedit_char_whitelist', async () => { - await worker.initialize('eng'); + await worker.reinitialize('eng'); await worker.setParameters({ tessedit_char_whitelist: 'Tess', }); @@ -132,7 +131,7 @@ describe('recognize()', () => { .map(name => ({ name, mode: PSM[name] })) .forEach(({ name, mode }) => ( it(`support PSM.${name} mode`, async () => { - await worker.initialize('eng'); + await worker.reinitialize('eng'); await worker.setParameters({ tessedit_pageseg_mode: mode, }); @@ -146,7 +145,7 @@ describe('recognize()', () => { FORMATS.forEach(format => ( it(`support ${format} format`, async () => { const buf = fs.readFileSync(path.join(__dirname, 'assets', 'images', `simple.${format}`)); - await worker.initialize('eng'); + await worker.reinitialize('eng'); const { data: { text } } = await worker.recognize(buf); expect(text).to.be(SIMPLE_TEXT); }).timeout(TIMEOUT) @@ -158,7 +157,7 @@ describe('recognize()', () => { it(`support ${format} format`, async () => { const imageDOM = document.createElement('img'); imageDOM.setAttribute('src', `${IMAGE_PATH}/simple.${format}`); - await worker.initialize('eng'); + await worker.reinitialize('eng'); const { data: { text } } = await worker.recognize(imageDOM); expect(text).to.be(SIMPLE_TEXT); }).timeout(TIMEOUT) @@ -170,7 +169,7 @@ describe('recognize()', () => { it(`support ${format} format`, async () => { const videoDOM = document.createElement('video'); videoDOM.setAttribute('poster', `${IMAGE_PATH}/simple.${format}`); - await worker.initialize('eng'); + await worker.reinitialize('eng'); const { data: { text } } = await worker.recognize(videoDOM); expect(text).to.be(SIMPLE_TEXT); }).timeout(TIMEOUT) @@ -202,7 +201,7 @@ describe('recognize()', () => { formats.forEach(format => ( it(`support ${format} format`, async () => { - await worker.initialize('eng'); + await worker.reinitialize('eng'); const { data: { text } } = await worker.recognize(canvasDOM); expect(text).to.be(SIMPLE_TEXT); }).timeout(TIMEOUT) @@ -234,7 +233,7 @@ describe('recognize()', () => { formats.forEach(format => ( it(`support ${format} format`, async () => { - await worker.initialize('eng'); + await worker.reinitialize('eng'); const { data: { text } } = await worker.recognize(offscreenCanvas); expect(text).to.be(SIMPLE_TEXT); }).timeout(TIMEOUT) diff --git a/tests/scheduler.test.js b/tests/scheduler.test.js index e4270c50b..b8f6567ae 100644 --- a/tests/scheduler.test.js +++ b/tests/scheduler.test.js @@ -7,10 +7,7 @@ before(async function cb() { const NUM_WORKERS = 5; console.log(`Initializing ${NUM_WORKERS} workers`); workers = await Promise.all(Array(NUM_WORKERS).fill(0).map(async () => { - const w = await createWorker(OPTIONS); - await w.loadLanguage('eng'); - await w.initialize('eng'); - return w; + return await createWorker("eng", 1, OPTIONS); })); console.log(`Initialized ${NUM_WORKERS} workers`); }); From f5b81b115d2d7fc49f21d7f1792777b3f57ec554 Mon Sep 17 00:00:00 2001 From: Balearica Date: Sun, 3 Sep 2023 21:31:22 -0700 Subject: [PATCH 02/16] Continued working on v5 changes --- src/Tesseract.js | 2 +- src/constants/defaultOptions.js | 4 +- src/createWorker.js | 56 ++++++++++++++++++------- src/worker-script/browser/getCore.js | 14 +++---- src/worker-script/index.js | 63 ++++++++++++++++++---------- src/worker-script/node/getCore.js | 14 +++---- 6 files changed, 98 insertions(+), 55 deletions(-) diff --git a/src/Tesseract.js b/src/Tesseract.js index 3506e90ae..50200254f 100644 --- a/src/Tesseract.js +++ b/src/Tesseract.js @@ -9,7 +9,7 @@ const recognize = async (image, langs, options) => { }; const detect = async (image, options) => { - const worker = await createWorker("osd", 0, options); + const worker = await createWorker('osd', 0, options); return worker.detect(image) .finally(async () => { await worker.terminate(); diff --git a/src/constants/defaultOptions.js b/src/constants/defaultOptions.js index d4b7def29..5acffd343 100644 --- a/src/constants/defaultOptions.js +++ b/src/constants/defaultOptions.js @@ -1,11 +1,11 @@ const OEM = require('./OEM'); module.exports = { - // If `oem = OEM.LSTM_ONLY` (the default), only the code and language data for the LSTM model is loaded. + // If `oem = OEM.LSTM_ONLY` (the default) only the code and langdata for LSTM is loaded. // This significantly decreases network and memory use for LSTM-only users (the vast majority), // and has no impact on Legacy-only users. // For the small number of users that use both models, this increases network and memory use - // (as two sets of code and language data end up being downloaded). + // (as two sets of code and language data end up being downloaded). // For these users, `oemLang` and `oemCore` should be set to `OEM.TESSERACT_LSTM_COMBINED`, // which forces download of language data and core code (respectively) that support both models. oemLang: OEM.DEFAULT, diff --git a/src/createWorker.js b/src/createWorker.js index 6f0bb7846..0b4b324e8 100644 --- a/src/createWorker.js +++ b/src/createWorker.js @@ -15,7 +15,7 @@ const { let workerCounter = 0; -module.exports = async (langs = "eng", oem = OEM.LSTM_ONLY, _options = {}, config = {}) => { +module.exports = async (langs = 'eng', oem = OEM.LSTM_ONLY, _options = {}, config = {}) => { const id = getId('Worker', workerCounter); const { logger, @@ -28,6 +28,12 @@ module.exports = async (langs = "eng", oem = OEM.LSTM_ONLY, _options = {}, confi const resolves = {}; const rejects = {}; + // Current langs, oem, and config file. + // Used if the user ever re-initializes the worker using `worker.reinitialize`. + const currentLangs = typeof langs === 'string' ? langs.split('+') : langs; + let currentOem = oem; + let currentConfig = config; + let workerResReject; let workerResResolve; const workerRes = new Promise((resolve, reject) => { @@ -69,7 +75,7 @@ module.exports = async (langs = "eng", oem = OEM.LSTM_ONLY, _options = {}, confi const loadInternal = (jobId) => ( startJob(createJob({ - id: jobId, action: 'load', payload: {options: { oem: options.oemCore, corePath: options.corePath, logging: options.logging }}, + id: jobId, action: 'load', payload: { options: { oem: options.oemCore, corePath: options.corePath, logging: options.logging } }, })) ); @@ -109,11 +115,11 @@ module.exports = async (langs = "eng", oem = OEM.LSTM_ONLY, _options = {}, confi console.warn('`loadLanguage` is depreciated and should be removed from code (workers now come with language pre-loaded)') ); - const loadLanguageInternal = (langs = 'eng', jobId) => ( + const loadLanguageInternal = (_langs, jobId) => ( startJob(createJob({ id: jobId, action: 'loadLanguage', - payload: { langs, options }, + payload: { langs: _langs, options }, })) ); @@ -121,20 +127,38 @@ module.exports = async (langs = "eng", oem = OEM.LSTM_ONLY, _options = {}, confi console.warn('`initialize` is depreciated and should be removed from code (workers now come pre-initialized)') ); - const initializeInternal = (langs = 'eng', oem = OEM.LSTM_ONLY, config, jobId) => ( + const initializeInternal = (_langs, _oem, _config, jobId) => ( startJob(createJob({ id: jobId, action: 'initialize', - payload: { langs, oem, config }, + payload: { langs: _langs, oem: _oem, config: _config }, })) ); - // TODO: If OEM is not specified, this should default to whatever it was before. - // In other words, if it was initialized with Legacy `reinitialize` should not change to LSTM. - const reinitialize = (langs = 'eng', oem = OEM.DEFAULT, config, jobId) => ( - loadLanguageInternal(langs, jobId) - .then(() => initializeInternal(langs, oem, config, jobId)) - ); + // TODO: + // (1) Add case where OEM is requested that current core does not support + // (this may just be error message). + // (2) Figure out how to download the appropriate traineddata for the OEM + const reinitialize = (langs = 'eng', oem, config, jobId) => { // eslint-disable-line + + const _oem = oem || currentOem; + currentOem = _oem; + + const _config = config || currentConfig; + currentConfig = _config; + + // Only load langs that are not already loaded. + // This logic fails if the user downloaded the LSTM-only English data for a language + // and then uses `worker.reinitialize` to switch to the Legacy engine. + // However, the correct data will still be downloaded after initialization fails + // and this can be avoided entirely + const langsArr = typeof langs === 'string' ? langs.split('+') : langs; + const _langs = langsArr.filter((x) => currentLangs.includes(x)); + currentLangs.push(_langs); + + return loadLanguageInternal(_langs, jobId) + .then(() => initializeInternal(_langs, _oem, _config, jobId)); + }; const setParameters = (params = {}, jobId) => ( startJob(createJob({ @@ -231,10 +255,10 @@ module.exports = async (langs = "eng", oem = OEM.LSTM_ONLY, _options = {}, confi }; loadInternal() - .then(() => loadLanguageInternal(langs)) - .then(() => initializeInternal(langs, oem, config)) - .then(() => workerResResolve(resolveObj)) - .catch(() => {}); + .then(() => loadLanguageInternal(langs)) + .then(() => initializeInternal(langs, oem, config)) + .then(() => workerResResolve(resolveObj)) + .catch(() => {}); return workerRes; }; diff --git a/src/worker-script/browser/getCore.js b/src/worker-script/browser/getCore.js index eb7626397..ce3d61b30 100644 --- a/src/worker-script/browser/getCore.js +++ b/src/worker-script/browser/getCore.js @@ -4,7 +4,9 @@ const OEM = require('../../constants/OEM'); module.exports = async (oem, corePath, res) => { if (typeof global.TesseractCore === 'undefined') { - res.progress({ status: 'loading tesseract core', progress: 0 }); + const statusText = 'loading tesseract core'; + + res.progress({ status: statusText, progress: 0 }); // If the user specifies a core path, we use that // Otherwise, default to CDN @@ -25,12 +27,10 @@ module.exports = async (oem, corePath, res) => { } else { corePathImportFile = `${corePathImport.replace(/\/$/, '')}/tesseract-core-simd.wasm.js`; } + } else if (oem === OEM.LSTM_ONLY) { + corePathImportFile = `${corePathImport.replace(/\/$/, '')}/tesseract-core-lstm.wasm.js`; } else { - if (oem === OEM.LSTM_ONLY) { - corePathImportFile = `${corePathImport.replace(/\/$/, '')}/tesseract-core-lstm.wasm.js`; - } else { - corePathImportFile = `${corePathImport.replace(/\/$/, '')}/tesseract-core.wasm.js`; - } + corePathImportFile = `${corePathImport.replace(/\/$/, '')}/tesseract-core.wasm.js`; } } @@ -45,7 +45,7 @@ module.exports = async (oem, corePath, res) => { } else if (typeof global.TesseractCore === 'undefined') { throw Error('Failed to load TesseractCore'); } - res.progress({ status: 'loading tesseract core', progress: 1 }); + res.progress({ status: statusText, progress: 1 }); } return global.TesseractCore; }; diff --git a/src/worker-script/index.js b/src/worker-script/index.js index cf1aa1194..0618cea6c 100644 --- a/src/worker-script/index.js +++ b/src/worker-script/index.js @@ -34,10 +34,13 @@ let dataFromCache = false; const load = async ({ workerId, jobId, payload: { options: { oem, corePath, logging } } }, res) => { setLogging(logging); + + const statusText = 'initializing tesseract'; + if (!TessModule) { const Core = await adapter.getCore(oem, corePath, res); - res.progress({ workerId, status: 'initializing tesseract', progress: 0 }); + res.progress({ workerId, status: statusText, progress: 0 }); Core({ TesseractProgress(percent) { @@ -50,7 +53,7 @@ const load = async ({ workerId, jobId, payload: { options: { oem, corePath, logg }, }).then((tessModule) => { TessModule = tessModule; - res.progress({ workerId, status: 'initialized tesseract', progress: 1 }); + res.progress({ workerId, status: statusText, progress: 1 }); res.resolve({ loaded: true }); }); } else { @@ -79,11 +82,18 @@ const loadLanguage = async ({ res) => { // Remember options for later, as cache may be deleted if `initialize` fails loadLanguageLangsWorker = langs; - loadLanguageOptionsWorker = {langPath: langPath, - dataPath: dataPath, - cachePath: cachePath, - cacheMethod: cacheMethod, - gzip: gzip}; + loadLanguageOptionsWorker = { + langPath, + dataPath, + cachePath, + cacheMethod, + gzip, + }; + + const statusText = 'loading language traineddata'; + + const langsArr = typeof langs === 'string' ? langs.split('+') : langs; + let progress = 0; const loadAndGunzipFile = async (_lang) => { const lang = typeof _lang === 'string' ? _lang : _lang.code; @@ -99,7 +109,6 @@ res) => { const _data = await readCache(`${cachePath || '.'}/${lang}.traineddata`); if (typeof _data !== 'undefined') { log(`[${workerId}]: Load ${lang}.traineddata from cache`); - if (res) res.progress({ workerId, status: 'loading language traineddata (from cache)', progress: 0.5 }); data = _data; dataFromCache = true; } else { @@ -138,6 +147,9 @@ res) => { } } + progress += 0.5 / langsArr.length; + if (res) res.progress({ workerId, status: statusText, progress }); + // Check for gzip magic numbers (1F and 8B in hex) const isGzip = (data[0] === 31 && data[1] === 139) || (data[1] === 31 && data[0] === 139); @@ -164,13 +176,16 @@ res) => { log(err.toString()); } } - return; + + progress += 0.5 / langsArr.length; + // Make sure last progress message is 1 (not 0.9999) + if (Math.round(progress * 100) === 100) progress = 1; + if (res) res.progress({ workerId, status: statusText, progress }); }; - if (res) res.progress({ workerId, status: 'loading language traineddata', progress: 0 }); + if (res) res.progress({ workerId, status: statusText, progress: 0 }); try { - await Promise.all((typeof langs === 'string' ? langs.split('+') : langs).map(loadAndGunzipFile)); - if (res) res.progress({ workerId, status: 'loaded language traineddata', progress: 1 }); + await Promise.all(langsArr.map(loadAndGunzipFile)); if (res) res.resolve(langs); } catch (err) { if (res) res.reject(err.toString()); @@ -214,9 +229,11 @@ const initialize = async ({ ? _langs : _langs.map((l) => ((typeof l === 'string') ? l : l.data)).join('+'); + const statusText = 'initializing api'; + try { res.progress({ - workerId, status: 'initializing api', progress: 0, + workerId, status: statusText, progress: 0, }); if (api !== null) { api.End(); @@ -238,7 +255,6 @@ const initialize = async ({ api = new TessModule.TessBaseAPI(); let status = api.Init(null, langs, oem); if (status === -1) { - // Cache is deleted if initialization fails to avoid keeping bad data in cache // This assumes that initialization failing only occurs due to bad .traineddata, // this should be refined if other reasons for init failing are encountered. @@ -250,24 +266,27 @@ const initialize = async ({ const delCachePromise = langsArr.map((lang) => adapter.deleteCache(`${loadLanguageOptionsWorker.cachePath || '.'}/${lang}.traineddata`)); await Promise.all(delCachePromise); - // Check for the case when (1) data was loaded from the cache and (2) the data does not support the requested OEM. + // Check for the case when (1) data was loaded from the cache and + // (2) the data does not support the requested OEM. // In this case, loadLanguage is re-run and initialization is attempted a second time. - // This is because `loadLanguage` has no mechanism for checking whether the cached data supports the requested model, - // so this only becomes apparent when initialization fails. + // This is because `loadLanguage` has no mechanism for checking whether the cached data + // supports the requested model, so this only becomes apparent when initialization fails. // Check for this error message: + // eslint-disable-next-line // "Tesseract (legacy) engine requested, but components are not present in ./eng.traineddata!!"" - // The .wasm build of Tesseract saves this message in a separate file (in addition to the normal debug file location). + // The .wasm build of Tesseract saves this message in a separate file + // (in addition to the normal debug file location). const debugStr = TessModule.FS.readFile('/debugDev.txt', { encoding: 'utf8', flags: 'a+' }); if (dataFromCache && /components are not present/.test(debugStr)) { log('Data from cache missing requested OEM model. Attempting to refresh cache with new language data.'); // In this case, language data is re-loaded - await loadLanguage({workerId: workerId, payload: {langs: loadLanguageLangsWorker, options: loadLanguageOptionsWorker}}); + await loadLanguage({ workerId, payload: { langs: loadLanguageLangsWorker, options: loadLanguageOptionsWorker } }); // eslint-disable-line max-len status = api.Init(null, langs, oem); if (status === -1) { log('Language data refresh failed.'); - const delCachePromise = langsArr.map((lang) => adapter.deleteCache(`${loadLanguageOptionsWorker.cachePath || '.'}/${lang}.traineddata`)); - await Promise.all(delCachePromise); + const delCachePromise2 = langsArr.map((lang) => adapter.deleteCache(`${loadLanguageOptionsWorker.cachePath || '.'}/${lang}.traineddata`)); + await Promise.all(delCachePromise2); } else { log('Language data refresh successful.'); } @@ -282,7 +301,7 @@ const initialize = async ({ params = defaultParams; await setParameters({ payload: { params } }); res.progress({ - workerId, status: 'initialized api', progress: 1, + workerId, status: statusText, progress: 1, }); res.resolve(); } catch (err) { diff --git a/src/worker-script/node/getCore.js b/src/worker-script/node/getCore.js index 151606eea..4ebe03563 100644 --- a/src/worker-script/node/getCore.js +++ b/src/worker-script/node/getCore.js @@ -8,22 +8,22 @@ let TesseractCore = null; */ module.exports = async (oem, _, res) => { if (TesseractCore === null) { + const statusText = 'loading tesseract core'; + const simdSupport = await simd(); - res.progress({ status: 'loading tesseract core', progress: 0 }); + res.progress({ status: statusText, progress: 0 }); if (simdSupport) { if (oem === OEM.LSTM_ONLY) { TesseractCore = require('tesseract.js-core/tesseract-core-simd-lstm'); } else { TesseractCore = require('tesseract.js-core/tesseract-core-simd'); } + } else if (oem === OEM.LSTM_ONLY) { + TesseractCore = require('tesseract.js-core/tesseract-core-lstm'); } else { - if (oem === OEM.LSTM_ONLY) { - TesseractCore = require('tesseract.js-core/tesseract-core-lstm'); - } else { - TesseractCore = require('tesseract.js-core/tesseract-core'); - } + TesseractCore = require('tesseract.js-core/tesseract-core'); } - res.progress({ status: 'loaded tesseract core', progress: 1 }); + res.progress({ status: statusText, progress: 1 }); } return TesseractCore; }; From f7a219bed304fbc2d211cad1dfbaebd92b61e64e Mon Sep 17 00:00:00 2001 From: Balearica Date: Mon, 4 Sep 2023 18:22:22 -0700 Subject: [PATCH 03/16] Updated to use LSTM-only data when possible --- src/constants/defaultOptions.js | 4 ---- src/createWorker.js | 15 +++++++++++---- src/worker-script/index.js | 7 +++++++ 3 files changed, 18 insertions(+), 8 deletions(-) diff --git a/src/constants/defaultOptions.js b/src/constants/defaultOptions.js index 5acffd343..4dce28c18 100644 --- a/src/constants/defaultOptions.js +++ b/src/constants/defaultOptions.js @@ -11,10 +11,6 @@ module.exports = { oemLang: OEM.DEFAULT, oemCore: OEM.DEFAULT, - /* - * default path for downloading *.traineddata - */ - langPath: 'https://tessdata.projectnaptha.com/4.0.0', /* * Use BlobURL for worker script by default * TODO: remove this option diff --git a/src/createWorker.js b/src/createWorker.js index 0b4b324e8..9ff3fbc73 100644 --- a/src/createWorker.js +++ b/src/createWorker.js @@ -115,13 +115,20 @@ module.exports = async (langs = 'eng', oem = OEM.LSTM_ONLY, _options = {}, confi console.warn('`loadLanguage` is depreciated and should be removed from code (workers now come with language pre-loaded)') ); - const loadLanguageInternal = (_langs, jobId) => ( - startJob(createJob({ + const loadLanguageInternal = (_langs, jobId) => { + return startJob(createJob({ id: jobId, action: 'loadLanguage', - payload: { langs: _langs, options }, + payload: { langs: _langs, options: { + langPath: options.langPath, + dataPath: options.dataPath, + cachePath: options.cachePath, + cacheMethod: options.cacheMethod, + gzip: options.gzip, + lstmOnly: !([OEM.TESSERACT_ONLY, OEM.TESSERACT_LSTM_COMBINED].includes(options.oemLang) || [OEM.TESSERACT_ONLY, OEM.TESSERACT_LSTM_COMBINED].includes(options.oemLang)), + } }, })) - ); + }; const initialize = () => ( console.warn('`initialize` is depreciated and should be removed from code (workers now come pre-initialized)') diff --git a/src/worker-script/index.js b/src/worker-script/index.js index 0618cea6c..30d2338a2 100644 --- a/src/worker-script/index.js +++ b/src/worker-script/index.js @@ -76,6 +76,7 @@ const loadLanguage = async ({ cachePath, cacheMethod, gzip = true, + lstmOnly, }, }, }, @@ -88,6 +89,7 @@ res) => { cachePath, cacheMethod, gzip, + lstmOnly }; const statusText = 'loading language traineddata'; @@ -103,6 +105,11 @@ res) => { let data = null; let newData = false; + // If `langPath` if not explicitly set by the user, the jsdelivr CDN is used. + // Data supporting the Legacy model is only included if `lstmOnly` is not true. + // This saves a significant amount of data for the majority of users that use LSTM only. + const langPath = langPath || (lstmOnly ? 'https://cdn.jsdelivr.net/npm/@tesseract.js-data/${lang}/4.0.0_best_int' : 'https://cdn.jsdelivr.net/npm/@tesseract.js-data/${lang}/4.0.0'); + // Check for existing .traineddata file in cache // This automatically fails if cacheMethod is set to 'refresh' or 'none' try { From 97712c8db26f69e7dfe6dbab603df94e490bf697 Mon Sep 17 00:00:00 2001 From: Balearica Date: Sun, 10 Sep 2023 22:27:36 -0700 Subject: [PATCH 04/16] Continued developing v5 --- src/constants/defaultOptions.js | 12 --------- src/createWorker.js | 38 +++++++++++++++------------- src/worker-script/browser/getCore.js | 4 +-- src/worker-script/index.js | 22 ++++++++-------- src/worker-script/node/getCore.js | 4 +-- tests/constants.js | 2 +- tests/detect.test.js | 2 +- 7 files changed, 37 insertions(+), 47 deletions(-) diff --git a/src/constants/defaultOptions.js b/src/constants/defaultOptions.js index 4dce28c18..af875f6c1 100644 --- a/src/constants/defaultOptions.js +++ b/src/constants/defaultOptions.js @@ -1,16 +1,4 @@ -const OEM = require('./OEM'); - module.exports = { - // If `oem = OEM.LSTM_ONLY` (the default) only the code and langdata for LSTM is loaded. - // This significantly decreases network and memory use for LSTM-only users (the vast majority), - // and has no impact on Legacy-only users. - // For the small number of users that use both models, this increases network and memory use - // (as two sets of code and language data end up being downloaded). - // For these users, `oemLang` and `oemCore` should be set to `OEM.TESSERACT_LSTM_COMBINED`, - // which forces download of language data and core code (respectively) that support both models. - oemLang: OEM.DEFAULT, - oemCore: OEM.DEFAULT, - /* * Use BlobURL for worker script by default * TODO: remove this option diff --git a/src/createWorker.js b/src/createWorker.js index 9ff3fbc73..a4497889d 100644 --- a/src/createWorker.js +++ b/src/createWorker.js @@ -15,7 +15,7 @@ const { let workerCounter = 0; -module.exports = async (langs = 'eng', oem = OEM.LSTM_ONLY, _options = {}, config = {}) => { +module.exports = async (langs = 'eng', oem = OEM.DEFAULT, _options = {}, config = {}) => { const id = getId('Worker', workerCounter); const { logger, @@ -33,6 +33,7 @@ module.exports = async (langs = 'eng', oem = OEM.LSTM_ONLY, _options = {}, confi const currentLangs = typeof langs === 'string' ? langs.split('+') : langs; let currentOem = oem; let currentConfig = config; + const lstmOnlyCore = [OEM.DEFAULT, OEM.LSTM_ONLY].includes(oem); let workerResReject; let workerResResolve; @@ -115,20 +116,21 @@ module.exports = async (langs = 'eng', oem = OEM.LSTM_ONLY, _options = {}, confi console.warn('`loadLanguage` is depreciated and should be removed from code (workers now come with language pre-loaded)') ); - const loadLanguageInternal = (_langs, jobId) => { - return startJob(createJob({ - id: jobId, - action: 'loadLanguage', - payload: { langs: _langs, options: { + const loadLanguageInternal = (_langs, jobId) => startJob(createJob({ + id: jobId, + action: 'loadLanguage', + payload: { + langs: _langs, + options: { langPath: options.langPath, dataPath: options.dataPath, cachePath: options.cachePath, cacheMethod: options.cacheMethod, gzip: options.gzip, - lstmOnly: !([OEM.TESSERACT_ONLY, OEM.TESSERACT_LSTM_COMBINED].includes(options.oemLang) || [OEM.TESSERACT_ONLY, OEM.TESSERACT_LSTM_COMBINED].includes(options.oemLang)), - } }, - })) - }; + lstmOnly: !([OEM.TESSERACT_ONLY, OEM.TESSERACT_LSTM_COMBINED].includes(currentOem) || [OEM.TESSERACT_ONLY, OEM.TESSERACT_LSTM_COMBINED].includes(options.oemLang)), // eslint-disable-line + }, + }, + })); const initialize = () => ( console.warn('`initialize` is depreciated and should be removed from code (workers now come pre-initialized)') @@ -142,12 +144,10 @@ module.exports = async (langs = 'eng', oem = OEM.LSTM_ONLY, _options = {}, confi })) ); - // TODO: - // (1) Add case where OEM is requested that current core does not support - // (this may just be error message). - // (2) Figure out how to download the appropriate traineddata for the OEM const reinitialize = (langs = 'eng', oem, config, jobId) => { // eslint-disable-line + if (lstmOnlyCore && [OEM.TESSERACT_ONLY, OEM.TESSERACT_LSTM_COMBINED].includes(oem)) throw Error('Legacy model requested but code missing.'); + const _oem = oem || currentOem; currentOem = _oem; @@ -194,13 +194,15 @@ module.exports = async (langs = 'eng', oem = OEM.LSTM_ONLY, _options = {}, confi })); }; - const detect = async (image, jobId) => ( - startJob(createJob({ + const detect = async (image, jobId) => { + if (lstmOnlyCore) throw Error('`worker.detect` requires Legacy model, which was not loaded.'); + + return startJob(createJob({ id: jobId, action: 'detect', payload: { image: await loadImage(image) }, - })) - ); + })); + }; const terminate = async () => { if (worker !== null) { diff --git a/src/worker-script/browser/getCore.js b/src/worker-script/browser/getCore.js index ce3d61b30..cb40d3cd2 100644 --- a/src/worker-script/browser/getCore.js +++ b/src/worker-script/browser/getCore.js @@ -22,12 +22,12 @@ module.exports = async (oem, corePath, res) => { } else { const simdSupport = await simd(); if (simdSupport) { - if (oem === OEM.LSTM_ONLY) { + if ([OEM.DEFAULT, OEM.LSTM_ONLY].includes(oem)) { corePathImportFile = `${corePathImport.replace(/\/$/, '')}/tesseract-core-simd-lstm.wasm.js`; } else { corePathImportFile = `${corePathImport.replace(/\/$/, '')}/tesseract-core-simd.wasm.js`; } - } else if (oem === OEM.LSTM_ONLY) { + } else if ([OEM.DEFAULT, OEM.LSTM_ONLY].includes(oem)) { corePathImportFile = `${corePathImport.replace(/\/$/, '')}/tesseract-core-lstm.wasm.js`; } else { corePathImportFile = `${corePathImport.replace(/\/$/, '')}/tesseract-core.wasm.js`; diff --git a/src/worker-script/index.js b/src/worker-script/index.js index 30d2338a2..ab388f8d9 100644 --- a/src/worker-script/index.js +++ b/src/worker-script/index.js @@ -89,7 +89,7 @@ res) => { cachePath, cacheMethod, gzip, - lstmOnly + lstmOnly, }; const statusText = 'loading language traineddata'; @@ -105,11 +105,6 @@ res) => { let data = null; let newData = false; - // If `langPath` if not explicitly set by the user, the jsdelivr CDN is used. - // Data supporting the Legacy model is only included if `lstmOnly` is not true. - // This saves a significant amount of data for the majority of users that use LSTM only. - const langPath = langPath || (lstmOnly ? 'https://cdn.jsdelivr.net/npm/@tesseract.js-data/${lang}/4.0.0_best_int' : 'https://cdn.jsdelivr.net/npm/@tesseract.js-data/${lang}/4.0.0'); - // Check for existing .traineddata file in cache // This automatically fails if cacheMethod is set to 'refresh' or 'none' try { @@ -128,14 +123,19 @@ res) => { if (typeof _lang === 'string') { let path = null; + // If `langPath` if not explicitly set by the user, the jsdelivr CDN is used. + // Data supporting the Legacy model is only included if `lstmOnly` is not true. + // This saves a significant amount of data for the majority of users that use LSTM only. + const langPathDownload = langPath || (lstmOnly ? `https://cdn.jsdelivr.net/npm/@tesseract.js-data/${lang}/4.0.0_best_int` : `https://cdn.jsdelivr.net/npm/@tesseract.js-data/${lang}/4.0.0`); + // For Node.js, langPath may be a URL or local file path // The is-url package is used to tell the difference // For the browser version, langPath is assumed to be a URL - if (env !== 'node' || isURL(langPath) || langPath.startsWith('moz-extension://') || langPath.startsWith('chrome-extension://') || langPath.startsWith('file://')) { /** When langPath is an URL */ - path = langPath.replace(/\/$/, ''); + if (env !== 'node' || isURL(langPathDownload) || langPathDownload.startsWith('moz-extension://') || langPathDownload.startsWith('chrome-extension://') || langPathDownload.startsWith('file://')) { /** When langPathDownload is an URL */ + path = langPathDownload.replace(/\/$/, ''); } - // langPath is a URL, fetch from server + // langPathDownload is a URL, fetch from server if (path !== null) { const fetchUrl = `${path}/${lang}.traineddata${gzip ? '.gz' : ''}`; const resp = await (env === 'webworker' ? fetch : adapter.fetch)(fetchUrl); @@ -144,10 +144,10 @@ res) => { } data = new Uint8Array(await resp.arrayBuffer()); - // langPath is a local file, read .traineddata from local filesystem + // langPathDownload is a local file, read .traineddata from local filesystem // (adapter.readCache is a generic file read function in Node.js version) } else { - data = await adapter.readCache(`${langPath}/${lang}.traineddata${gzip ? '.gz' : ''}`); + data = await adapter.readCache(`${langPathDownload}/${lang}.traineddata${gzip ? '.gz' : ''}`); } } else { data = _lang.data; // eslint-disable-line diff --git a/src/worker-script/node/getCore.js b/src/worker-script/node/getCore.js index 4ebe03563..b86662ce0 100644 --- a/src/worker-script/node/getCore.js +++ b/src/worker-script/node/getCore.js @@ -13,12 +13,12 @@ module.exports = async (oem, _, res) => { const simdSupport = await simd(); res.progress({ status: statusText, progress: 0 }); if (simdSupport) { - if (oem === OEM.LSTM_ONLY) { + if ([OEM.DEFAULT, OEM.LSTM_ONLY].includes(oem)) { TesseractCore = require('tesseract.js-core/tesseract-core-simd-lstm'); } else { TesseractCore = require('tesseract.js-core/tesseract-core-simd'); } - } else if (oem === OEM.LSTM_ONLY) { + } else if ([OEM.DEFAULT, OEM.LSTM_ONLY].includes(oem)) { TesseractCore = require('tesseract.js-core/tesseract-core-lstm'); } else { TesseractCore = require('tesseract.js-core/tesseract-core'); diff --git a/tests/constants.js b/tests/constants.js index cbe2dd2f8..a0726597c 100644 --- a/tests/constants.js +++ b/tests/constants.js @@ -13,7 +13,7 @@ const SIMPLE_TEXT_HALF = 'Tesse\n'; const COMSIC_TEXT = 'HellO World\nfrom beyond\nthe Cosmic Void\n'; const TESTOCR_TEXT = 'This is a lot of 12 point text to test the\nocr code and see if it works on all types\nof file format.\n\nThe quick brown dog jumped over the\nlazy fox. The quick brown dog jumped\nover the lazy fox. The quick brown dog\njumped over the lazy fox. The quick\nbrown dog jumped over the lazy fox.\n'; const CHINESE_TEXT = '繁 體 中 文 測 試\n'; -const BILL_SPACED_TEXT = 'FIRST CHEQUING\n\nLine of Credit 100,000.00 Rate 4.2000\n\nDate Description Number Debits Credits Balance\n31Jul2018 Balance Forward 99,878.08 -\n01Aug2018 Clearing Cheque 4987 36.07 99,914.15 -\n01Aug2018 Clearing Cheque 4986 60.93 99,975.08 -\n01Aug2018 Clearing Cheque 4982 800.04 100,775.12 EX\n01Aug2018 Clearing Cheque 4981 823.34 101,598.46 EX\n01Aug2018 Incoming Interac e-Transfer 1454 101,583.92 EX\n01Aug2018 Incoming Interac e-Transfer 400.00 101,183.92 EX\n01Aug2018 Assisted Deposit 3241450 68,769.42 -\n01Aug2018 Transfer out to loan 7 1,500.00 70,269.42 -\n02Aug2018 Clearing Cheque 4984 48.08 70,317.50 -\n02Aug2018 Clearing Cheque 4985 7051 70,388.01 -\n02Aug2018 Clearing Cheque 4992 500.00 70.888.01 -\n'; +const BILL_SPACED_TEXT = 'FIRST CHEQUING\n\nLine of Credit 100,000.00 Rate 4.2000\n\nDate Description Number Debits Credits Balance\n31Jul2018 Balance Forward 99,878.08 -\n01Aug2018 Clearing Cheque 4987 36.07 99,914.15 -\n01Aug2018 Clearing Cheque 4986 60.93 99,975.08 -\n01Aug2018 Clearing Cheque 4982 800.04 100,775.12 EX\n01Aug2018 Clearing Cheque 4981 823.34 101,598.46 EX\n01Aug2018 Incoming Interac e-Transfer 1454 101,583.92 EX\n01Aug2018 Incoming Interac e-Transfer 400.00 101,183.92 EX\n01Aug2018 Assisted Deposit 3241450 68,769.42 -\n01Aug2018 Transfer out to loan 7 1,500.00 70,269.42 -\n02Aug2018 Clearing Cheque 4984 48.08 70,317.50 -\n02Aug2018 Clearing Cheque 4985 7051 70,388.01 -\n02Aug2018 Clearing Cheque 4992 500.00 70,888.01 -\n'; const SIMPLE_WHITELIST_TEXT = 'Tesses\n'; const FORMATS = ['png', 'jpg', 'bmp', 'pbm', 'webp', 'gif']; const SIMPLE_PNG_BASE64 = ''; diff --git a/tests/detect.test.js b/tests/detect.test.js index b6f8026c5..97b1bbb43 100644 --- a/tests/detect.test.js +++ b/tests/detect.test.js @@ -2,7 +2,7 @@ const { createWorker } = Tesseract; let worker; before(async function cb() { this.timeout(0); - worker = await createWorker("osd", 1, OPTIONS); + worker = await createWorker("osd", 0, OPTIONS); }); describe('detect()', async () => { From 4467bc517e4e0ee2fa59a606f2cd84d09174f937 Mon Sep 17 00:00:00 2001 From: Balearica Date: Sat, 23 Sep 2023 23:52:08 -0700 Subject: [PATCH 05/16] Updated examples for v5 --- benchmarks/browser/auto-rotate-benchmark.html | 7 +- benchmarks/browser/speed-benchmark.html | 4 +- benchmarks/node/speed-benchmark.js | 2 - examples/browser/basic-efficient.html | 6 +- examples/browser/basic-scheduler.html | 4 +- examples/browser/demo.html | 160 ------------------ examples/browser/download-pdf.html | 2 +- examples/browser/image-processing.html | 6 +- examples/node/download-pdf.js | 2 - examples/node/image-processing.js | 2 - examples/node/recognize.js | 4 +- examples/node/scheduler.js | 28 ++- 12 files changed, 27 insertions(+), 200 deletions(-) delete mode 100644 examples/browser/demo.html diff --git a/benchmarks/browser/auto-rotate-benchmark.html b/benchmarks/browser/auto-rotate-benchmark.html index ac97ed125..d1e90ef39 100644 --- a/benchmarks/browser/auto-rotate-benchmark.html +++ b/benchmarks/browser/auto-rotate-benchmark.html @@ -37,15 +37,10 @@ const element = document.getElementById("imgRow"); - const worker = await Tesseract.createWorker({ + const worker = await Tesseract.createWorker('eng', 0, { // corePath: '/tesseract-core-simd.wasm.js', workerPath: "/dist/worker.dev.js" }); - await worker.loadLanguage('eng'); - await worker.initialize('eng'); - - await worker.initialize(); - const fileArr = ["../data/meditations.jpg", "../data/tyger.jpg", "../data/testocr.png"]; let timeTotal = 0; diff --git a/benchmarks/browser/speed-benchmark.html b/benchmarks/browser/speed-benchmark.html index e2e51fe3f..87c23eced 100644 --- a/benchmarks/browser/speed-benchmark.html +++ b/benchmarks/browser/speed-benchmark.html @@ -13,12 +13,10 @@ const { createWorker } = Tesseract; (async () => { - const worker = await createWorker({ + const worker = await createWorker("eng", 1, { // corePath: '/tesseract-core-simd.wasm.js', workerPath: "/dist/worker.dev.js" }); - await worker.loadLanguage('eng'); - await worker.initialize('eng'); // The performance.measureUserAgentSpecificMemory function only runs under specific circumstances for security reasons. // See: https://developer.mozilla.org/en-US/docs/Web/API/Performance/measureUserAgentSpecificMemory#security_requirements diff --git a/benchmarks/node/speed-benchmark.js b/benchmarks/node/speed-benchmark.js index 28d3fe725..8da5dab1c 100644 --- a/benchmarks/node/speed-benchmark.js +++ b/benchmarks/node/speed-benchmark.js @@ -4,8 +4,6 @@ const { createWorker } = require('../../'); (async () => { const worker = await createWorker(); - await worker.loadLanguage('eng'); - await worker.initialize('eng'); const fileArr = ["../data/meditations.jpg", "../data/tyger.jpg", "../data/testocr.png"]; let timeTotal = 0; for (let file of fileArr) { diff --git a/examples/browser/basic-efficient.html b/examples/browser/basic-efficient.html index 4d278f312..d86473448 100644 --- a/examples/browser/basic-efficient.html +++ b/examples/browser/basic-efficient.html @@ -10,16 +10,12 @@ // This is a basic example more efficient than "basic.html". // In this example we create a worker once, and this worker is re-used // every time the user uploads a new file. - - const worker = await Tesseract.createWorker({ + const worker = await Tesseract.createWorker("eng", 1, { corePath: '../../node_modules/tesseract.js-core', workerPath: "/dist/worker.dev.js", logger: function(m){console.log(m);} }); - await worker.loadLanguage('eng'); - await worker.initialize('eng'); - const recognize = async function(evt){ const files = evt.target.files; diff --git a/examples/browser/basic-scheduler.html b/examples/browser/basic-scheduler.html index 35b7e78f7..60be9a704 100644 --- a/examples/browser/basic-scheduler.html +++ b/examples/browser/basic-scheduler.html @@ -16,13 +16,11 @@ // Creates worker and adds to scheduler const workerGen = async () => { - const worker = await Tesseract.createWorker({ + const worker = await Tesseract.createWorker("eng", 1, { corePath: '../../node_modules/tesseract.js-core', workerPath: "/dist/worker.dev.js", logger: function(m){console.log(m);} }); - await worker.loadLanguage('eng'); - await worker.initialize('eng'); scheduler.addWorker(worker); } diff --git a/examples/browser/demo.html b/examples/browser/demo.html deleted file mode 100644 index 0712b0f47..000000000 --- a/examples/browser/demo.html +++ /dev/null @@ -1,160 +0,0 @@ - - - - - - - -
- - - diff --git a/examples/browser/download-pdf.html b/examples/browser/download-pdf.html index 7d821a302..bc99bded0 100644 --- a/examples/browser/download-pdf.html +++ b/examples/browser/download-pdf.html @@ -10,7 +10,7 @@ +