From 6ebe92fb5b1fcf2d65234e21f9e819f3532a4c75 Mon Sep 17 00:00:00 2001 From: Balearica Date: Wed, 27 Sep 2023 23:34:11 -0700 Subject: [PATCH] Update to v5 (#830) --- .gitignore | 4 +- README.md | 111 ++++--- benchmarks/browser/auto-rotate-benchmark.html | 11 +- benchmarks/browser/speed-benchmark.html | 24 +- benchmarks/node/speed-benchmark.js | 2 - docs/api.md | 270 ++++++++---------- docs/examples.md | 44 +-- docs/faq.md | 2 - docs/intro.md | 68 ----- docs/local-installation.md | 29 +- docs/performance.md | 29 +- docs/workers_vs_schedulers.md | 51 ++++ examples/browser/basic-efficient.html | 10 +- examples/browser/basic-scheduler.html | 8 +- examples/browser/basic.html | 26 -- examples/browser/demo.html | 160 ----------- examples/browser/download-pdf.html | 8 +- examples/browser/image-processing.html | 10 +- examples/node/detect.js | 13 - examples/node/download-pdf.js | 2 - examples/node/image-processing.js | 2 - examples/node/recognize.js | 4 +- examples/node/scheduler.js | 28 +- package-lock.json | 14 +- package.json | 4 +- scripts/server.js | 2 +- scripts/webpack.config.dev.js | 49 ---- src/Tesseract.js | 8 +- src/constants/config.js | 5 - src/constants/defaultOptions.js | 4 - src/createWorker.js | 86 ++++-- src/index.d.ts | 7 +- src/worker-script/browser/getCore.js | 16 +- src/worker-script/index.js | 117 ++++++-- src/worker-script/node/getCore.js | 17 +- src/worker/browser/defaultOptions.js | 10 +- tests/FS.test.html | 2 +- tests/FS.test.js | 2 +- tests/constants.js | 4 +- tests/detect.test.html | 2 +- tests/detect.test.js | 4 +- tests/recognize.test.html | 2 +- tests/recognize.test.js | 31 +- tests/scheduler.test.html | 2 +- tests/scheduler.test.js | 5 +- 45 files changed, 532 insertions(+), 777 deletions(-) delete mode 100644 docs/intro.md create mode 100644 docs/workers_vs_schedulers.md delete mode 100644 examples/browser/basic.html delete mode 100644 examples/browser/demo.html delete mode 100755 examples/node/detect.js delete mode 100644 
scripts/webpack.config.dev.js delete mode 100644 src/constants/config.js diff --git a/.gitignore b/.gitignore index 43e21fd85..5942f401a 100644 --- a/.gitignore +++ b/.gitignore @@ -1,8 +1,8 @@ .DS_Store node_modules/* yarn.lock -tesseract.dev.js -worker.dev.js +tesseract.min.js +worker.min.js *.traineddata *.traineddata.gz .nyc_output diff --git a/README.md b/README.md index dee50a29f..8493e83ea 100644 --- a/README.md +++ b/README.md @@ -31,82 +31,32 @@ Video Real-time Recognition Tesseract.js wraps a [webassembly port](https://github.com/naptha/tesseract.js-core) of the [Tesseract](https://github.com/tesseract-ocr/tesseract) OCR Engine. -It works in the browser using [webpack](https://webpack.js.org/) or plain script tags with a [CDN](#CDN) and on the server with [Node.js](https://nodejs.org/en/). +It works in the browser using [webpack](https://webpack.js.org/), ESM, or plain script tags with a [CDN](#CDN) and on the server with [Node.js](https://nodejs.org/en/). After you [install it](#installation), using it is as simple as: -```javascript -import Tesseract from 'tesseract.js'; - -Tesseract.recognize( - 'https://tesseract.projectnaptha.com/img/eng_bw.png', - 'eng', - { logger: m => console.log(m) } -).then(({ data: { text } }) => { - console.log(text); -}) -``` - -Or using workers (recommended for production use): - ```javascript import { createWorker } from 'tesseract.js'; -const worker = await createWorker({ - logger: m => console.log(m) -}); - (async () => { - await worker.loadLanguage('eng'); - await worker.initialize('eng'); - const { data: { text } } = await worker.recognize('https://tesseract.projectnaptha.com/img/eng_bw.png'); - console.log(text); + const worker = await createWorker('eng'); + const ret = await worker.recognize('https://tesseract.projectnaptha.com/img/eng_bw.png'); + console.log(ret.data.text); await worker.terminate(); })(); ``` +When recognizing multiple images, users should create a worker once, run `worker.recognize` for each image, 
and then run `worker.terminate()` once at the end (rather than running the above snippet for every image). -For a basic overview of the functions, including the pros/cons of different approaches, see the [intro](./docs/intro.md). [Check out the docs](#documentation) for a full explanation of the API. - -## Major changes in v4 -Version 4 includes many new features and bug fixes--see [this issue](https://github.com/naptha/tesseract.js/issues/662) for a full list. Several highlights are below. - -- Added rotation preprocessing options (including auto-rotate) for significantly better accuracy -- Processed images (rotated, grayscale, binary) can now be retrieved -- Improved support for parallel processing (schedulers) -- Breaking changes: - - `createWorker` is now async - - `getPDF` function replaced by `pdf` recognize option - -## Major changes in v3 -- Significantly faster performance - - Runtime reduction of 84% for Browser and 96% for Node.js when recognizing the [example images](./examples/data) -- Upgrade to Tesseract v5.1.0 (using emscripten 3.1.18) -- Added SIMD-enabled build for supported devices -- Added support: - - Node.js version 18 -- Removed support: - - ASM.js version, any other old versions of Tesseract.js-core (<3.0.0) - - Node.js versions 10 and 12 - -## Major changes in v2 -- Upgrade to tesseract v4.1.1 (using emscripten 1.39.10 upstream) -- Support multiple languages at the same time, eg: eng+chi\_tra for English and Traditional Chinese -- Supported image formats: png, jpg, bmp, pbm -- Support WebAssembly (fallback to ASM.js when browser doesn't support) -- Support Typescript - -Read a story about v2: Why I refactor tesseract.js v2?
- Check the support/1.x branch for version 1 ## Installation Tesseract.js works with a ` + + ``` -After including the script the `Tesseract` variable will be globally available. +After including the script the `Tesseract` variable will be globally available and a worker can be created using `Tesseract.createWorker`. +Alternatively, an ESM build (used with `import` syntax) can be found at `https://cdn.jsdelivr.net/npm/tesseract.js@5/dist/tesseract.esm.min.js`. ### Node.js @@ -122,16 +72,51 @@ npm install tesseract.js@3.0.3 yarn add tesseract.js@3.0.3 ``` - ## Documentation -* [Intro](./docs/intro.md) +* [Workers vs. Schedulers](./docs/workers_vs_schedulers.md) * [Examples](./docs/examples.md) -* [Image Format](./docs/image-format.md) +* [Supported Image Formats](./docs/image-format.md) * [API](./docs/api.md) * [Local Installation](./docs/local-installation.md) * [FAQ](./docs/faq.md) +## Major changes in v5 +Version 5 changes are documented in [this issue](https://github.com/naptha/tesseract.js/issues/820). Highlights are below. + + - Significantly smaller files by default (54% smaller for English, 73% smaller for Chinese) + - This results in a ~50% reduction in runtime for first-time users (who do not have the files cached yet) + - Significantly lower memory usage + - Compatible with iOS 17 (using default settings) + - Breaking changes: + - `createWorker` arguments changed + - Setting non-default language and OEM now happens in `createWorker` + - E.g. `createWorker("chi_sim", 1)` + - `worker.initialize` and `worker.loadLanguage` functions now do nothing and can be deleted from code + - See [this issue](https://github.com/naptha/tesseract.js/issues/820) for full list + +## Major changes in v4 +Version 4 includes many new features and bug fixes--see [this issue](https://github.com/naptha/tesseract.js/issues/662) for a full list. Several highlights are below. 
+ +- Added rotation preprocessing options (including auto-rotate) for significantly better accuracy +- Processed images (rotated, grayscale, binary) can now be retrieved +- Improved support for parallel processing (schedulers) +- Breaking changes: + - `createWorker` is now async + - `getPDF` function replaced by `pdf` recognize option + +## Major changes in v3 +- Significantly faster performance + - Runtime reduction of 84% for Browser and 96% for Node.js when recognizing the [example images](./examples/data) +- Upgrade to Tesseract v5.1.0 (using emscripten 3.1.18) +- Added SIMD-enabled build for supported devices +- Added support: + - Node.js version 18 +- Removed support: + - ASM.js version, any other old versions of Tesseract.js-core (<3.0.0) + - Node.js versions 10 and 12 + + ## Use tesseract.js the way you like! - Electron Version: https://github.com/Balearica/tesseract.js-electron @@ -167,7 +152,7 @@ npm start ``` The development server will be available at http://localhost:3000/examples/browser/demo.html in your favorite browser. -It will automatically rebuild `tesseract.dev.js` and `worker.dev.js` when you change files in the **src** folder. +It will automatically rebuild `tesseract.min.js` and `worker.min.js` when you change files in the **src** folder. ### Online Setup with a single Click diff --git a/benchmarks/browser/auto-rotate-benchmark.html b/benchmarks/browser/auto-rotate-benchmark.html index ac97ed125..dcc1f003f 100644 --- a/benchmarks/browser/auto-rotate-benchmark.html +++ b/benchmarks/browser/auto-rotate-benchmark.html @@ -1,7 +1,7 @@ - + diff --git a/examples/browser/download-pdf.html b/examples/browser/download-pdf.html index 7d821a302..0cff1555c 100644 --- a/examples/browser/download-pdf.html +++ b/examples/browser/download-pdf.html @@ -1,6 +1,6 @@ - +
@@ -10,17 +10,15 @@ +