diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 74544677..700c32cf 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1,26 +1,53 @@ -name: Node.js CI/CD +name: CI/CD on: pull_request: - branches: [ "*" ] jobs: - build: + node: + name: Node.js (${{ matrix.node-version }}) + + runs-on: ubuntu-latest + strategy: + matrix: + node-version: [18, 20, 22] + + steps: + - uses: actions/checkout@v4 + + - name: Setup Node.js + uses: actions/setup-node@v4 + with: + node-version: ${{ matrix.node-version }} + + - name: Setup Bun + uses: oven-sh/setup-bun@v2 + with: + bun-version: latest + + - name: Install Dependencies + run: bun install + + - name: Run Tests + run: bun run test:node + + bun: + name: Bun (${{ matrix.bun-version }}) runs-on: ubuntu-latest - outputs: - test-result: ${{ steps.test.outcome }} + strategy: + matrix: + bun-version: [latest] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - - name: Use Node.js 20.x # Updated for Node 20 - uses: actions/setup-node@v3 + - name: Setup Bun + uses: oven-sh/setup-bun@v2 with: - node-version: 20.x + bun-version: ${{ matrix.bun-version }} - - name: Run ci - run: npm ci + - name: Install Dependencies + run: bun install - - name: Run tests - id: test - run: npm test + - name: Run Tests + run: bun run test:bun diff --git a/base/display/api.js b/base/display/api.js index 8af545fe..8ddb278c 100755 --- a/base/display/api.js +++ b/base/display/api.js @@ -49,11 +49,11 @@ PDFJS.imageResourcesPath = PDFJS.imageResourcesPath === undefined ? /** * Disable the web worker and run all code on the main thread. This will happen * automatically if the browser doesn't support workers or sending typed arrays - * to workers. + * to workers. Default is true when running in a server environment like node or bun. * @var {Boolean} */ PDFJS.disableWorker = PDFJS.disableWorker === undefined ? 
- false : PDFJS.disableWorker; + typeof window === 'undefined' : PDFJS.disableWorker; /** * Path and filename of the worker file. Required when the worker is enabled in diff --git a/jest.config.json b/jest.config.json deleted file mode 100644 index 2457cddc..00000000 --- a/jest.config.json +++ /dev/null @@ -1,6 +0,0 @@ -{ - "testMatch": ["**/test/_test_.*"], - "testEnvironment": "node", - "bail": false, - "testFailureExitCode": 1 -} diff --git a/package.json b/package.json index 74e01dd6..cd868c0c 100644 --- a/package.json +++ b/package.json @@ -33,9 +33,11 @@ "module": "./dist/pdfparser.js", "typings": "./pdfparser.d.ts", "scripts": { - "pretest": "npm run build", - "test:jest": "jest --config ./jest.config.json", - "test": "jest --config ./jest.config.json && npm run parse-r && npm run parse-fd", + "pretest:node": "npm run build", + "test:node": "jest && npm run parse-r && npm run parse-fd", + "pretest:bun": "npm run build", + "test:bun": "bun test && npm run parse-r && npm run parse-fd", + "test": "bun test", "test:forms": "cd ./test && sh p2j.forms.sh", "test:misc": "cd ./test && sh p2j.one.sh misc . 
\"Expected: 14 success, 5 fail exception with stack trace\" ", "parse": "./bin/pdf2json.js -f ./test/pdf/fd/form/F1040.pdf -o ./test/target/fd/form", @@ -52,7 +54,7 @@ "build:rollup": "rollup -c ./rollup.config.js", "build:bundle-pdfjs-base": "node rollup/bundle-pdfjs-base.js", "build": "npm run build:bundle-pdfjs-base && npm run build:rollup", - "build:clean": "rm -rf node_modules && rm -f package-lock.json && npm i && npm run build" + "build:clean": "rm -rf node_modules && rm -f package-lock.json && npm install && npm run build" }, "engines": { "node": ">=20.18.0", diff --git a/readme.md b/readme.md index 3af884ff..2c140fa5 100644 --- a/readme.md +++ b/readme.md @@ -8,7 +8,7 @@ ![GitHub top language](https://img.shields.io/github/languages/top/modesty/pdf2json) ![GitHub last commit](https://img.shields.io/github/last-commit/modesty/pdf2json?color=red) -pdf2json is a [node.js](http://nodejs.org/) module converts binary PDF to JSON and text. Built with [pdf.js](https://github.com/mozilla/pdf.js/), it extracts text content and interactive form elements for server-side processing and command-line use. +pdf2json is a [node.js](https://nodejs.org/) module that converts binary PDF to JSON and text. Built with [pdf.js](https://github.com/mozilla/pdf.js/), it extracts text content and interactive form elements for server-side processing and command-line use. 
## Features @@ -20,45 +20,62 @@ pdf2json is a [node.js](http://nodejs.org/) module converts binary PDF to JSON a ## Install -> npm i pdf2json +You can install it using npm or bun: -Or, install it globally: +```bash +npm install pdf2json +bun add pdf2json +``` -> npm i pdf2json -g +If you want to use the `pdf2json` CLI, you can install it globally: -To update with latest version: +```bash +npm install pdf2json -g +bun install pdf2json -g +``` -> npm update pdf2json -g +## Usage -To Run in RESTful Web Service or as command line Utility +```javascript +import PDFParser from "pdf2json"; -- More details can be found at the bottom of this document. +const pdfParser = new PDFParser(); +``` -## Test +The module is tested with [Node.js](https://nodejs.org/) 18+ and [Bun](https://bun.sh/) 1+. -After install, run command line: +## Test -> npm test +You can run tests in Bun, or in Node.js using Jest. -`pretest` step builds bundles and source maps for both ES Module and CommonJS, output to `./dist` directory. The Jest test suit is defined in `./test/_test_.cjs` with commonJS, test run will also cover `parse-r` and `parse-fd` with ES Modules via command line. +```bash +bun run test:bun # runs in Bun +bun run test:node # runs in Node.js using Jest +``` -The default Jest test suits are essential tests for all PRs. But it only covers a portion of all testing PDFs, for more broader coverage, run: +The `pretest` script builds bundles and source maps for both ES Module and CommonJS, then outputs to `./dist` directory. The test suite is defined in `./test/p2j.test.js` with CommonJS, and will also cover `parse-r` and `parse-fd` with ES Modules via command line. -> npm run test:forms +The default test suites are essential tests for all PRs. 
But they only cover a portion of all testing PDFs; for broader coverage, run: -It'll scan and parse _260_ PDF AcroForm files under _*./test/pdf*_, runs with _*-s -t -c -m*_ command line options, generates primary output JSON, additional text content JSON, form fields JSON and merged text file for each PDF. It usually takes ~20s in my MacBook Pro to complete, check _*./test/target/*_ for outputs. +```bash +bun run test:forms +``` -_update on 4/27/2024_: parsing 260 PDFs by `npm run test:forms` on M2 Mac takes 7~8s +It'll scan and parse _260_ PDF AcroForm files under _*./test/pdf*_, runs with _*-s -t -c -m*_ command line options, generates primary output JSON, additional text content JSON, form fields JSON and merged text file for each PDF. It usually takes ~8s on my MacBook Pro to complete, check _*./test/target/*_ for outputs. -To run Jest test suits with commonJS bundle only +To run the test suite with CommonJS bundle only, run: -> npm run test:jest +```bash +bun run test +``` ### Test Exception Handlings After install, run command line: -> npm run test:misc +```bash +bun run test:misc +``` It'll scan and parse all PDF files under _*./test/pdf/misc*_, also runs with _*-s -t -c -m*_ command line options, generates primary output JSON, additional text content JSON, form fields JSON and merged text JSON file for 15 PDF fields, 12 are expected to success while the other three's exceptions are expected to catch with stack trace for: @@ -70,7 +87,9 @@ It'll scan and parse all PDF files under _*./test/pdf/misc*_, also runs with _*- After install, run command line: -> npm run parse-r +```bash +bun run parse-r +``` It scans 165 PDF files under _*./test/pdf/fd/form/*_, parses with [Stream API](https://nodejs.org/dist/latest-v14.x/docs/api/stream.html), then generates output to _*./test/target/fd/form/*_. 
diff --git a/rollup/bundle-pdfjs-base.js b/rollup/bundle-pdfjs-base.js index f6e983dd..7134eb08 100644 --- a/rollup/bundle-pdfjs-base.js +++ b/rollup/bundle-pdfjs-base.js @@ -66,7 +66,7 @@ fs.writeFileSync(path.join(__dirname, "../lib/pdfjs-code.js"), ` ${"import nodeUtil from 'util';import { Blob } from 'buffer';import { DOMParser } from '@xmldom/xmldom';import PDFAnno from './pdfanno.js';import Image from './pdfimage.js';import { createScratchCanvas } from './pdfcanvas.js';"} ${"export const PDFJS = {};"} - ${"const globalScope = { console };"} + ${"const globalScope = { console, PDFJS };"} ${_baseCode} `, { diff --git a/test/_test_.cjs b/test/p2j.test.js similarity index 100% rename from test/_test_.cjs rename to test/p2j.test.js