diff --git a/.gitignore b/.gitignore index 6ef748fb..0b361641 100644 --- a/.gitignore +++ b/.gitignore @@ -6,4 +6,6 @@ *.out node_modules/ target/ +.idea +.npmrc diff --git a/.travis.yml b/.travis.yml index 96c921b2..ca5f442d 100755 --- a/.travis.yml +++ b/.travis.yml @@ -1,3 +1,3 @@ language: node_js node_js: - - "4.5.0" + - "14.18.0" diff --git a/base/core/core.js b/base/core/core.js index 8ea87394..c25797da 100755 --- a/base/core/core.js +++ b/base/core/core.js @@ -170,13 +170,17 @@ var Page = (function PageClosure() { var opList = new OperatorList(handler, self.pageIndex); - - handler.send('StartRenderPage', { - transparency: partialEvaluator.hasBlendModes(self.resources), - pageIndex: self.pageIndex - }); - partialEvaluator.getOperatorList(contentStream, self.resources, opList); - pageListPromise.resolve(opList); + try { + handler.send('StartRenderPage', { + transparency: partialEvaluator.hasBlendModes(self.resources), + pageIndex: self.pageIndex + }); + partialEvaluator.getOperatorList(contentStream, self.resources, opList); + pageListPromise.resolve(opList); + } + catch(e) { + pageListPromise.reject(e); + } }); var annotationsPromise = pdfManager.ensure(this, 'annotations'); diff --git a/base/core/crypto.js b/base/core/crypto.js index 11f4902f..c46e9281 100755 --- a/base/core/crypto.js +++ b/base/core/crypto.js @@ -553,17 +553,17 @@ var CipherTransformFactory = (function CipherTransformFactoryClosure() { function CipherTransformFactory(dict, fileId, password) { var filter = dict.get('Filter'); if (!isName(filter) || filter.name != 'Standard') - error('unknown encryption method'); + error('Error: unknown encryption method'); this.dict = dict; var algorithm = dict.get('V'); if (!isInt(algorithm) || (algorithm != 1 && algorithm != 2 && algorithm != 4)) - error('unsupported encryption algorithm'); + error('Error: unsupported encryption algorithm'); this.algorithm = algorithm; var keyLength = dict.get('Length') || 40; if (!isInt(keyLength) || keyLength < 40 || 
(keyLength % 8) !== 0) - error('invalid key length'); + error('Error: invalid key length'); // prepare keys var ownerPassword = stringToBytes(dict.get('O')).subarray(0, 32); var userPassword = stringToBytes(dict.get('U')).subarray(0, 32); diff --git a/base/core/obj.js b/base/core/obj.js index 171ea100..15951e92 100755 --- a/base/core/obj.js +++ b/base/core/obj.js @@ -1001,6 +1001,7 @@ var XRef = (function XRefClosure() { throw e; } log('(while reading XRef): ' + e); + error(e); } if (recoveryMode) diff --git a/base/core/parser.js b/base/core/parser.js index bb2f1783..06f3e3e0 100755 --- a/base/core/parser.js +++ b/base/core/parser.js @@ -534,9 +534,8 @@ var Lexer = (function LexerClosure() { str += String.fromCharCode(ch); } } - if (str.length > 128) { - error('Warning: name token is longer than allowed by the spec: ' + - str.length); + if (str.length > 127) { + warn('Name token is longer than allowed by the spec: ' + str.length); } return new Name(str); }, diff --git a/base/display/api.js b/base/display/api.js index f1fd048f..fbe2285a 100755 --- a/base/display/api.js +++ b/base/display/api.js @@ -412,8 +412,13 @@ var PDFPageProxy = (function PDFPageProxyClosure() { return; } stats.time('Rendering'); - internalRenderTask.initalizeGraphics(transparency); - internalRenderTask.operatorListChanged(); + try {//MQZ. 
catch canvas drawing exceptions + internalRenderTask.initalizeGraphics(transparency); + internalRenderTask.operatorListChanged(); + } + catch(err) { + complete(err); + } }, function pageDisplayReadPromiseError(reason) { complete(reason); diff --git a/base/display/canvas.js b/base/display/canvas.js index 30fb0010..55149342 100755 --- a/base/display/canvas.js +++ b/base/display/canvas.js @@ -1036,7 +1036,7 @@ var CanvasGraphics = (function CanvasGraphicsClosure() { showText: function CanvasGraphics_showText(glyphs, skipTextSelection) { var ctx = this.ctx; var current = this.current; - var font = current.font; + var font = current.font || {}; var fontSize = current.fontSize; var fontSizeScale = current.fontSizeScale; var charSpacing = current.charSpacing; @@ -1095,19 +1095,20 @@ var CanvasGraphics = (function CanvasGraphicsClosure() { this.processingType3 = null; } else { ctx.save(); - var tx = 0; //MQZ Dec.04.2013 handles leading word spacing + var tx = 0; if (wordSpacing !== 0) { - var firstGlyph = _.find(glyphs, function(g) { return _.isObject(g);}); + var firstGlyph = glyphs.filter(g => g && ('fontChar' in g || 'unicode' in g))[0]; if (firstGlyph && (firstGlyph.fontChar === ' ' || firstGlyph.unicode === ' ')) { - if (_.find(glyphs, function(g) { return _.isObject(g) && g.unicode !== ' ';})) { - current.x += wordSpacing * fontSize * textHScale; - } + tx = wordSpacing * fontSize * textHScale; } } + current.x += tx this.applyTextTransforms(); + current.x -= tx + // MQZ-GYJ Apr.20.2017 handles leading word spacing over var lineWidth = current.lineWidth; var a1 = current.textMatrix[0], b1 = current.textMatrix[1]; @@ -1286,7 +1287,8 @@ var CanvasGraphics = (function CanvasGraphicsClosure() { } } else { - if (-e >= spaceWidth) { + //MQZ-GYJ. 
Apr.20.2017 split word when spacing is a positive number but very big + if (Math.abs(e) >= spaceWidth) { if (vertical) { current.y += spacingLength; } else { @@ -1533,6 +1535,7 @@ var CanvasGraphics = (function CanvasGraphicsClosure() { var depth = this.current.paintFormXObjectDepth; do { this.restore(); + this.current.paintFormXObjectDepth--; // some pdf don't close all restores inside object // closing those for them } while (this.current.paintFormXObjectDepth >= depth); @@ -1616,6 +1619,10 @@ var CanvasGraphics = (function CanvasGraphicsClosure() { }, endGroup: function CanvasGraphics_endGroup(group) { + //MQZ. make sure endGroup is always invoked after beginGroup + if (this.groupLevel == 0) + this.beginGroup(group); + this.groupLevel--; var groupCtx = this.ctx; this.ctx = this.groupStack.pop(); diff --git a/base/display/metadata.js b/base/display/metadata.js index 029ad77f..6f62537d 100755 --- a/base/display/metadata.js +++ b/base/display/metadata.js @@ -57,7 +57,7 @@ var Metadata = PDFJS.Metadata = (function MetadataClosure() { var doc = this.metaDocument; var rdf = doc.documentElement; - if (rdf.nodeName.toLowerCase() !== 'rdf:rdf') { // Wrapped in + if (rdf && rdf.nodeName.toLowerCase() !== 'rdf:rdf') { // Wrapped in rdf = rdf.firstChild; while (rdf && rdf.nodeName.toLowerCase() !== 'rdf:rdf') rdf = rdf.nextSibling; diff --git a/base/shared/annotation.js b/base/shared/annotation.js index f7a23a6d..01830797 100755 --- a/base/shared/annotation.js +++ b/base/shared/annotation.js @@ -667,7 +667,7 @@ var LinkAnnotation = (function LinkAnnotationClosure() { // Lets URLs beginning with 'www.' default to using the 'http://' protocol. 
function addDefaultProtocolToUrl(url) { - if (url.indexOf('www.') === 0) { + if (url && url.indexOf('www.') === 0) { return ('http://' + url); } return url; diff --git a/base/shared/util.js b/base/shared/util.js index a2a2de66..3b4ebdd5 100755 --- a/base/shared/util.js +++ b/base/shared/util.js @@ -1244,18 +1244,18 @@ nodeUtil.p2jinfo = info; nodeUtil.p2jwarn = warn; nodeUtil.p2jerror = error; nodeUtil.verbosity = function(verbo) { - if (!isNaN(verbo)) { + if (isNaN(verbo)) { + verbosity = WARNINGS; + } + else { if (verbo <= ERRORS) { - verbo = ERRORS; + verbosity = ERRORS; } else if (verbo >= INFOS) { - verbo = INFOS; + verbosity = INFOS; } - - verbosity = verbo; - } - else { - verbosity = ERRORS; + else + verbosity = verbo; } }; nodeUtil.verbosity(); diff --git a/bin/pdf2json b/bin/pdf2json index 9ccf33bc..8e9b6868 100755 --- a/bin/pdf2json +++ b/bin/pdf2json @@ -1,6 +1,4 @@ #!/usr/bin/env node -'use strict'; - -var P2JCMD = require('../lib/p2jcmd'); +const P2JCMD = require('../lib/p2jcmd'); new P2JCMD().start(); diff --git a/lib/p2jcmd.js b/lib/p2jcmd.js index 065c6453..1e32a33f 100644 --- a/lib/p2jcmd.js +++ b/lib/p2jcmd.js @@ -1,184 +1,160 @@ -'use strict'; - -let nodeUtil = require("util"), - stream = require('stream'), - fs = require('fs'), - path = require('path'), - _ = require('lodash'), - PDFParser = require("../pdfparser"), - pkInfo = require('../package.json'), - async = require("async"); +const nodeUtil = require("util"), + fs = require("fs"), + path = require("path"), + {ParserStream, StringifyStream} = require("./parserstream"), + pkInfo = require("../package.json"), + PDFParser = require("../pdfparser"); const _PRO_TIMER = `${pkInfo.name}@${pkInfo.version} [${pkInfo.homepage}]`; -let optimist = require('optimist') - .usage("\n" + _PRO_TIMER + "\n\nUsage: $0 -f|--file [-o|output_dir]") - .alias('v', 'version') - .describe('v', 'Display version.\n') - .alias('h', 'help') - .describe('h', 'Display brief help information.\n') - .alias('f', 'file') - 
.describe('f', '(required) Full path of input PDF file or a directory to scan for all PDF files. When specifying a PDF file name, it must end with .PDF, otherwise it would be treated as a input directory.\n') - .alias('o', 'output_dir') - .describe('o', '(optional) Full path of output directory, must already exist. Current JSON file in the output folder will be replaced when file name is same.\n') - .alias('s', 'silent') - .describe('s', '(optional) when specified, will only log errors, otherwise verbose.\n') - .alias('t', 'fieldTypes') - .describe('t', '(optional) when specified, will generate .fields.json that includes fields ids and types.\n') - .alias('c', 'content') - .describe('c', '(optional) when specified, will generate .content.txt that includes text content from PDF.\n') - .alias('m', 'merge') - .describe('m', '(optional) when specified, will generate .merged.json that includes auto-merged broken text blocks from PDF (Experimental).\n') - .alias('r', 'stream') - .describe('r', '(optional) when specified, will process and parse with buffer/object transform stream rather than file system (Experimental).\n'); - -const argv = optimist.argv; -const VERBOSITY_LEVEL = (_.has(argv, 's') ? 
0 : 5); - -const PROCESS_RAW_TEXT_CONTENT = _.has(argv, 'c'); -const PROCESS_FIELDS_CONTENT = _.has(argv, 't'); -const PROCESS_MERGE_BROKEN_TEXT_BLOCKS = _.has(argv, 'm'); -const PROCESS_WITH_STREAM = _.has(argv, 'r'); - -let PDF2JSONUtil = (function () { - - function StringifyStream(){ - stream.Transform.call(this); - - this._readableState.objectMode = false; - this._writableState.objectMode = true; - } - nodeUtil.inherits(StringifyStream, stream.Transform); - - StringifyStream.prototype._transform = function(obj, encoding, callback){ - this.push(JSON.stringify(obj)); - callback(); - }; - - let _continue = function(callback, err) { - if (err) - console.error(err); - if (nodeUtil.isFunction(callback)) - callback(err); - }; +const yargs = require('./p2jcmdarg') + .usage(`\n${_PRO_TIMER}\n\nUsage: ${pkInfo.name} -f|--file [-o|output_dir]`) + .alias('v', 'version', 'Display version.') + .alias('h', 'help', 'Display brief help information.') + .alias('f', 'file', '(required) Full path of input PDF file or a directory to scan for all PDF files.\n\t\t When specifying a PDF file name, it must end with .PDF, otherwise it would be treated as a input directory.') + .alias('o', 'output', '(optional) Full path of output directory, must already exist.\n\t\t Current JSON file in the output folder will be replaced when file name is same.') + .alias('s', 'silent', '(optional) when specified, will only log errors, otherwise verbose.') + .alias('t', 'fieldTypes', '(optional) when specified, will generate .fields.json that includes fields ids and types.') + .alias('c', 'content', '(optional) when specified, will generate .content.txt that includes text content from PDF.') + .alias('m', 'merge', '(optional) when specified, will generate .merged.json that includes auto-merged broken text blocks from PDF.') + .alias('r', 'stream', '(optional) when specified, will process and parse with buffer/object transform stream rather than file system.'); + +const argv = yargs.argv; +const 
ONLY_SHOW_VERSION = ('v' in argv); +const ONLY_SHOW_HELP = ('h' in argv); +const VERBOSITY_LEVEL = (('s' in argv) ? 0 : 5); +const HAS_INPUT_DIR_OR_FILE = ('f' in argv); + +const PROCESS_RAW_TEXT_CONTENT = ('c' in argv); +const PROCESS_FIELDS_CONTENT = ('t' in argv); +const PROCESS_MERGE_BROKEN_TEXT_BLOCKS = ('m' in argv); +const PROCESS_WITH_STREAM = ('r' in argv); + +const INPUT_DIR_OR_FILE = argv.f; + +class PDFProcessor { + inputDir = null; + inputFile = null; + inputPath = null; + + outputDir = null; + outputFile = null; + outputPath = null; + + pdfParser = null; + curCLI = null; - let _onPdfParserError = function(evtData, callback) { - this.curProcessor.failedCount++; - _continue.call(this, callback, "Parse Exception: " + evtData.parserError); - }; + // constructor + constructor(inputDir, inputFile, curCLI) { + // public, this instance copies + this.inputDir = path.normalize(inputDir); + this.inputFile = inputFile; + this.inputPath = path.join(this.inputDir, this.inputFile); - let _createOutputStream = function(outputPath, callback) { - let outputStream = fs.createWriteStream(outputPath); - outputStream.on('finish', () => { - callback(null, outputPath); - }); - outputStream.on('error', err => { - callback({"streamError": err}, outputPath); - }); + this.outputDir = path.normalize(argv.o || inputDir); + this.outputFile = null; + this.outputPath = null; - return outputStream; - }; + this.pdfParser = null; + this.curCLI = curCLI; + } - let _generateMergedTextBlocksStream = function(callback) { - let outputStream = _createOutputStream.call(this, this.outputPath.replace(".json", ".merged.json"), callback); + //private methods + #continue(callback, err) { + if (typeof callback === "function") + callback(err); + } + + #onPdfParserError(evtData, callback) { + this.curCLI.addResultCount(evtData.parserError); + this.#continue(callback, evtData.parserError); + } + + #generateMergedTextBlocksStream(callback) { + const outputStream = 
ParserStream.createOutputStream(this.outputPath.replace(".json", ".merged.json"), callback); this.pdfParser.getMergedTextBlocksStream().pipe(new StringifyStream()).pipe(outputStream); - }; + } - let _generateRawTextContentStream = function(callback) { - let outputStream = _createOutputStream.call(this, this.outputPath.replace(".json", ".content.txt"), callback); + #generateRawTextContentStream(callback) { + const outputStream = ParserStream.createOutputStream(this.outputPath.replace(".json", ".content.txt"), callback); this.pdfParser.getRawTextContentStream().pipe(outputStream); - }; + } - let _generateFieldsTypesStream = function(callback) { - let outputStream = _createOutputStream.call(this, this.outputPath.replace(".json", ".fields.json"), callback); + #generateFieldsTypesStream(callback) { + const outputStream = ParserStream.createOutputStream(this.outputPath.replace(".json", ".fields.json"), callback); this.pdfParser.getAllFieldsTypesStream().pipe(new StringifyStream()).pipe(outputStream); - }; + } - let _processAdditionalStreams = function(outputTasks, callback) { - if (PROCESS_FIELDS_CONTENT) {//needs to generate fields.json file - outputTasks.push(cbFunc => _generateFieldsTypesStream.call(this, cbFunc)); - } - if (PROCESS_RAW_TEXT_CONTENT) {//needs to generate content.txt file - outputTasks.push(cbFunc => _generateRawTextContentStream.call(this, cbFunc)); - } - if (PROCESS_MERGE_BROKEN_TEXT_BLOCKS) {//needs to generate json file with merged broken text blocks - outputTasks.push(cbFunc => _generateMergedTextBlocksStream.call(this, cbFunc)); - } - - if (outputTasks.length > 0) { - async.series(outputTasks, function (err, results) { - if (err) { - console.error("Additional streams Error: " + err); - } else { - console.log("Additional streams OK: \n", results); - } - _continue.call(this, callback); - }); - } - else { - _continue.call(this, callback); - } - }; + #processAdditionalStreams(callback) { + const outputTasks = []; + if (PROCESS_FIELDS_CONTENT) 
{//needs to generate fields.json file + outputTasks.push(cbFunc => this.#generateFieldsTypesStream(cbFunc)); + } + if (PROCESS_RAW_TEXT_CONTENT) {//needs to generate content.txt file + outputTasks.push(cbFunc => this.#generateRawTextContentStream(cbFunc)); + } + if (PROCESS_MERGE_BROKEN_TEXT_BLOCKS) {//needs to generate json file with merged broken text blocks + outputTasks.push(cbFunc => this.#generateMergedTextBlocksStream(cbFunc)); + } - let _onPrimarySuccess = function(callback) { - console.log("SUCCESS: [" + this.inputPath + "] => [" + this.outputPath + "]"); - this.curProcessor.successCount++; - _processAdditionalStreams.call(this, [], callback); - }; + let taskId = 0; + function sequenceTask() { + if (taskId < outputTasks.length) { + outputTasks[taskId]((err, ret) => { + this.curCLI.addStatusMsg(err, `[+]=> ${ret}`); + taskId++; + sequenceTask.call(this); + }); + } + else + this.#continue(callback); + } + sequenceTask.call(this); + } + + #onPrimarySuccess(callback) { + this.curCLI.addResultCount(); + this.#processAdditionalStreams(callback); + } - let _onPrimaryError = function(err, callback) { - console.error("Output Exception: [" + this.inputPath + "] => [" + this.outputPath + "]: " + err); - this.curProcessor.failedCount++; + #onPrimaryError(err, callback) { + this.curCLI.addResultCount(err); callback(err); - }; + } - let _parseOnePDFStream = function(callback) { + #parseOnePDFStream(callback) { this.pdfParser = new PDFParser(null, PROCESS_RAW_TEXT_CONTENT); - this.pdfParser.on("pdfParser_dataError", evtData => _onPdfParserError.call(this, evtData, callback)); + this.pdfParser.on("pdfParser_dataError", evtData => this.#onPdfParserError(evtData, callback)); - let outputStream = fs.createWriteStream(this.outputPath); - outputStream.on('finish', () => _onPrimarySuccess.call(this, callback)); - outputStream.on('error', err => _onPrimaryError.call(this, callback)); + const outputStream = fs.createWriteStream(this.outputPath); + outputStream.on('finish', () => 
this.#onPrimarySuccess(callback)); + outputStream.on('error', err => this.#onPrimaryError(err, callback)); - console.log("Transcoding " + this.inputFile + " to - " + this.outputPath); + nodeUtil.p2jinfo("Transcoding Stream " + this.inputFile + " to - " + this.outputPath); let inputStream = fs.createReadStream(this.inputPath, {bufferSize: 64 * 1024}); - inputStream.pipe(this.pdfParser).pipe(new StringifyStream()).pipe(outputStream); + inputStream.pipe(this.pdfParser.createParserStream()).pipe(new StringifyStream()).pipe(outputStream); }; - let _parseOnePDF = function(callback) { + #parseOnePDF(callback) { this.pdfParser = new PDFParser(null, PROCESS_RAW_TEXT_CONTENT); - this.pdfParser.on("pdfParser_dataError", evtData => _onPdfParserError.call(this, evtData, callback)); + this.pdfParser.on("pdfParser_dataError", evtData => this.#onPdfParserError(evtData, callback)); this.pdfParser.on("pdfParser_dataReady", evtData => { fs.writeFile(this.outputPath, JSON.stringify(evtData), err => { if(err) { - _onPrimaryError.call(this, callback); + this.#onPrimaryError(err, callback); } else { - _onPrimarySuccess.call(this, callback); + this.#onPrimarySuccess(callback); } }); }); - console.log("Transcoding " + this.inputFile + " to - " + this.outputPath); + nodeUtil.p2jinfo("Transcoding File " + this.inputFile + " to - " + this.outputPath); this.pdfParser.loadPDF(this.inputPath, VERBOSITY_LEVEL); - }; + } - // constructor - let cls = function (inputDir, inputFile, curProcessor) { - // public, this instance copies - this.inputDir = path.normalize(inputDir); - this.inputFile = inputFile; - this.inputPath = this.inputDir + path.sep + this.inputFile; - - this.outputDir = path.normalize(argv.o || inputDir); - this.outputFile = null; - this.outputPath = null; - - this.pdfParser = null; - this.curProcessor = curProcessor; - }; - - cls.prototype.validateParams = function() { + //public methods + validateParams() { let retVal = null; if (!fs.existsSync(this.inputDir)) @@ -189,18 +165,18 @@ 
let PDF2JSONUtil = (function () { retVal = "Input error: output directory doesn't exist - " + this.outputDir + "."; if (retVal != null) { - this.curProcessor.failedCount += 1; + this.curCLI.addResultCount(retVal); return retVal; } - let inExtName = path.extname(this.inputFile).toLowerCase(); + const inExtName = path.extname(this.inputFile).toLowerCase(); if (inExtName !== '.pdf') retVal = "Input error: input file name doesn't have pdf extention - " + this.inputFile + "."; else { this.outputFile = path.basename(this.inputPath, inExtName) + ".json"; this.outputPath = path.normalize(this.outputDir + "/" + this.outputFile); if (fs.existsSync(this.outputPath)) - console.log("\nOutput file will be replaced - " + this.outputPath); + nodeUtil.p2jwarn("Output file will be replaced - " + this.outputPath); else { let fod = fs.openSync(this.outputPath, "wx"); if (!fod) @@ -213,9 +189,9 @@ let PDF2JSONUtil = (function () { } return retVal; - }; + } - cls.prototype.destroy = function() { + destroy() { this.inputDir = null; this.inputFile = null; this.inputPath = null; @@ -226,62 +202,72 @@ let PDF2JSONUtil = (function () { this.pdfParser.destroy(); } this.pdfParser = null; - this.curProcessor = null; - }; + this.curCLI = null; + } - cls.prototype.processFile = function(callback) { + processFile(callback) { let validateMsg = this.validateParams(); if (!!validateMsg) { - _continue.call(this, callback, validateMsg); + this.#continue(callback, validateMsg); } else if (PROCESS_WITH_STREAM) { - _parseOnePDFStream.call(this, callback); + this.#parseOnePDFStream(callback); } else { - _parseOnePDF.call(this, callback); + this.#parseOnePDF(callback); } - }; + } + + getOutputFile = function() { + return path.join(this.outputDir, this.outputFile); + } +} - return cls; -})(); +class PDFCLI { + inputCount = 0; + successCount = 0; + failedCount = 0; + warningCount = 0; + statusMsgs = []; -let PDFProcessor = (function () { // constructor - let cls = function () { + constructor() { 
this.inputCount = 0; this.successCount = 0; this.failedCount = 0; this.warningCount = 0; + this.statusMsgs = []; this.p2j = null; - }; + } - cls.prototype.initialize = function(){ + initialize() { console.time(_PRO_TIMER); + nodeUtil.verbosity(VERBOSITY_LEVEL); let retVal = true; try { - if (_.has(argv, 'v')) { + if (ONLY_SHOW_VERSION) { console.log(pkInfo.version); retVal = false; } - else if (_.has(argv, 'h')) { - optimist.showHelp(); + else if (ONLY_SHOW_HELP) { + yargs.showHelp(); retVal = false; } - else if (!_.has(argv, 'f')) { - optimist.showHelp(); - console.log("-f is required to specify input directory or file."); + else if (!HAS_INPUT_DIR_OR_FILE) { + yargs.showHelp(); + console.error("-f is required to specify input directory or file."); retVal = false; } } catch(e) { - console.log("Exception: " + e.message); + console.error("Exception: " + e.message); retVal = false; } return retVal; - }; + } - cls.prototype.start = function(){ + start() { if (!this.initialize()) { console.timeEnd(_PRO_TIMER); return; @@ -290,8 +276,7 @@ let PDFProcessor = (function () { try { console.log("\n" + _PRO_TIMER); - let inputStatus = fs.statSync(argv.f); - + const inputStatus = fs.statSync(INPUT_DIR_OR_FILE); if (inputStatus.isFile()) { this.processOneFile(); } @@ -303,72 +288,83 @@ let PDFProcessor = (function () { console.error("Exception: " + e.message); console.timeEnd(_PRO_TIMER); } - }; - - cls.prototype.complete = function(err) { - let statusMsg = "\n%d input files\t%d success\t%d fail\t%d warning."; - console.log(statusMsg, this.inputCount, this.successCount, this.failedCount, this.warningCount); + } + complete() { + if (this.statusMsgs.length > 0) + console.log(this.statusMsgs); + console.log(`${this.inputCount} input files\t${this.successCount} success\t${this.failedCount} fail\t${this.warningCount} warning`); process.nextTick( () => { - console.timeEnd(_PRO_TIMER); - //let exitCode = (this.inputCount === this.successCount) ? 
0 : 1; - process.exit(0); + console.timeEnd(_PRO_TIMER); + // process.exit((this.inputCount === this.successCount) ? 0 : 1); }); - }; + } - cls.prototype.processOneFile = function () { - let inputDir = path.dirname(argv.f); - let inputFile = path.basename(argv.f); + processOneFile() { + const inputDir = path.dirname(INPUT_DIR_OR_FILE); + const inputFile = path.basename(INPUT_DIR_OR_FILE); this.inputCount = 1; - this.p2j = new PDF2JSONUtil(inputDir, inputFile, this); - this.p2j.processFile( data => this.complete(data) ); - }; + this.p2j = new PDFProcessor(inputDir, inputFile, this); + this.p2j.processFile( err => { + this.addStatusMsg(err, `${path.join(inputDir, inputFile)} => ${err ?? this.p2j.getOutputFile()}`); + this.complete(); + }); + } - cls.prototype.processFiles = function(inputDir, files) { - let fId = 0; - this.p2j = new PDF2JSONUtil(inputDir, files[fId], this); + processFiles(inputDir, files) { + let fId = 0; + this.p2j = new PDFProcessor(inputDir, files[fId], this); this.p2j.processFile( function processPDFFile(err) { - if (err) { - this.complete(err); + this.addStatusMsg(err, `${path.join(inputDir, files[fId])} => ${err ?? 
this.p2j.getOutputFile()}`); + + fId++; + if (fId >= this.inputCount) { + this.complete(); } else { - fId++; - if (fId >= this.inputCount) { - this.complete(null); + if (this.p2j) { + this.p2j.destroy(); + this.p2j = null; } - else { - if (this.p2j) { - this.p2j.destroy(); - this.p2j = null; - } - this.p2j = new PDF2JSONUtil(inputDir, files[fId], this); - this.p2j.processFile(processPDFFile.bind(this)); - } - } - }.bind(this)); - }; + this.p2j = new PDFProcessor(inputDir, files[fId], this); + this.p2j.processFile(processPDFFile.bind(this)); + } + }.bind(this) ); + } - cls.prototype.processOneDirectory = function () { - let inputDir = path.normalize(argv.f); + processOneDirectory() { + let inputDir = path.normalize(INPUT_DIR_OR_FILE); fs.readdir(inputDir, (err, files) => { - let _iChars = "!@#$%^&*()+=[]\\\';,/{}|\":<>?~`.-_ "; - let pdfFiles = files.filter( file => file.substr(-4).toLowerCase() === '.pdf' && _iChars.indexOf(file.substr(0,1)) < 0 ); - - this.inputCount = pdfFiles.length; - if (this.inputCount > 0) { - this.processFiles(inputDir, pdfFiles); + if (err) { + this.addStatusMsg(true, `[${inputDir}] - ${err.toString()}`); + this.complete(); } else { - console.log("No PDF files found. [" + inputDir + "]."); - this.complete(null); + const _iChars = "!@#$%^&*()+=[]\\\';,/{}|\":<>?~`.-_ "; + const pdfFiles = files.filter( file => file.substr(-4).toLowerCase() === '.pdf' && _iChars.indexOf(file.substr(0,1)) < 0 ); + + this.inputCount = pdfFiles.length; + if (this.inputCount > 0) { + this.processFiles(inputDir, pdfFiles); + } + else { + this.addStatusMsg(true, `[${inputDir}] - No PDF files found`); + this.complete(); + } } }); - }; + } + + addStatusMsg(error, oneMsg) { + this.statusMsgs.push(error ? `✗ Error - ${oneMsg}` : `✓ Success - ${oneMsg}`); + } - return cls; -})(); + addResultCount(error) { + (error ? 
this.failedCount++ : this.successCount++); + } +} -module.exports = PDFProcessor; +module.exports = PDFCLI; diff --git a/lib/p2jcmdarg.js b/lib/p2jcmdarg.js new file mode 100644 index 00000000..c7cc0084 --- /dev/null +++ b/lib/p2jcmdarg.js @@ -0,0 +1,136 @@ +class CLIArgParser { + args = []; + #aliases = {}; + + #usage = ""; + #argv = null; + + // constructor + constructor(args) { + if (Array.isArray(args)) + this.args = args; + } + + usage(usageMsg) { + this.#usage = usageMsg + '\n\nOptions:\n'; + return this; + } + + alias(key, name, description) { + this.#aliases[key] = {name, description}; + return this; + } + + showHelp() { + let helpMsg = this.#usage; + for (const [key, value] of Object.entries(this.#aliases)) { + helpMsg += `-${key},--${value.name}\t ${value.description}\n`; + } + console.log(helpMsg); + } + + get argv() { + return this.#argv ? this.#argv : this.#parseArgv(); + } + + static isNumber (x) { + if (typeof x === 'number') + return true; + if (/^0x[0-9a-f]+$/i.test(x)) + return true; + return /^[-+]?(?:\d+(?:\.\d*)?|\.\d+)(e[-+]?\d+)?$/.test(x); + } + + #setArg(key, val, argv) { + const value = CLIArgParser.isNumber(val) ? Number(val) : val; + this.#setKey(argv, key.split('.'), value); + + const aliasKey = (key in this.#aliases) ? 
[this.#aliases[key].name] : []; + if (aliasKey.length < 1) { + for (const [akey, avalue] of Object.entries(this.#aliases)) { + if (key === avalue.name) { + aliasKey.push(akey); + break; + } + } + } + aliasKey.forEach(x => this.#setKey(argv, x.split('.'), value)); + } + + #setKey(obj, keys, value) { + let o = obj; + for (let i = 0; i < keys.length-1; i++) { + let key = keys[i]; + if (key === '__proto__') return; + if (o[key] === undefined) o[key] = {}; + if (o[key] === Object.prototype || o[key] === Number.prototype + || o[key] === String.prototype) o[key] = {}; + if (o[key] === Array.prototype) o[key] = []; + o = o[key]; + } + + let key = keys[keys.length - 1]; + if (key === '__proto__') return; + if (o === Object.prototype || o === Number.prototype + || o === String.prototype) o = {}; + if (o === Array.prototype) o = []; + if (o[key] === undefined) { + o[key] = value; + } + else if (Array.isArray(o[key])) { + o[key].push(value); + } + else { + o[key] = [ o[key], value ]; + } + } + + #parseArgv() { + let aliases=this.#aliases, args = this.args; + let argv = {}; + + for (let i = 0; i < args.length; i++) { + let arg = args[i]; + + if (/^--.+/.test(arg)) { + let key = arg.match(/^--(.+)/)[1]; + let next = args[i + 1]; + if (next !== undefined && !/^-/.test(next)) { + this.#setArg(key, next, argv); + i++; + } + else if (/^(true|false)$/.test(next)) { + this.#setArg(key, next === 'true', argv); + i++; + } + else { + this.#setArg(key, true, argv); + } + } + else if (/^-[^-]+/.test(arg)) { + let key = arg.slice(-1)[0]; + if (key !== '-') { + if (args[i+1] && !/^(-|--)[^-]/.test(args[i+1])) { + this.#setArg(key, args[i+1], argv); + i++; + } + else if (args[i+1] && /^(true|false)$/.test(args[i+1])) { + this.#setArg(key, args[i+1] === 'true', argv); + i++; + } + else { + this.#setArg(key, true, argv); + } + } + } + else { + console.warn("Unknow CLI options:", arg); + } + } + + this.#argv = argv; + return argv; + } +} + +module.exports = new 
CLIArgParser(process.argv.slice(2)); \ No newline at end of file diff --git a/lib/parserstream.js b/lib/parserstream.js new file mode 100644 index 00000000..f54ef621 --- /dev/null +++ b/lib/parserstream.js @@ -0,0 +1,84 @@ +const {Transform, Readable} = require("stream"), + fs = require('fs'); + +class ParserStream extends Transform { + static createContentStream(jsonObj) { + const rStream = new Readable({objectMode: true}); + rStream.push(jsonObj); + rStream.push(null); + return rStream; + } + + static createOutputStream(outputPath, callback) { + const outputStream = fs.createWriteStream(outputPath); + outputStream.on('finish', () => { + callback(null, outputPath); + }); + outputStream.on('error', err => { + callback({"streamError": err}, outputPath); + }); + + return outputStream; + } + + #pdfParser = null; + #chunks = []; + #parsedData = {Pages:[]}; + #_flush_callback = null; + + constructor(pdfParser, options) { + super(options); + this.#pdfParser = pdfParser; + + this.#chunks = []; + + // this.#pdfParser.on("pdfParser_dataReady", evtData => { + // this.push(evtData); + // this.#_flush_callback(); + // this.emit('end', null); + // }); + this.#pdfParser.on("readable", meta => this.#parsedData = {...meta, Pages:[]}); + this.#pdfParser.on("data", page => { + if (!page) { + this.push(this.#parsedData); + this.#_flush_callback(); + } + else + this.#parsedData.Pages.push(page); + }); + } + + //implements transform stream + _transform(chunk, enc, callback) { + this.#chunks.push(Buffer.isBuffer(chunk) ? 
chunk : Buffer.from(chunk, enc)); + callback(); + } + + _flush(callback) { + this.#_flush_callback = callback; + this.#pdfParser.parseBuffer(Buffer.concat(this.#chunks)); + } + + _destroy() { + super.removeAllListeners(); + this.#pdfParser = null; + this.#chunks = []; + } +} + + +class StringifyStream extends Transform { + constructor(options) { + super(options); + + this._readableState.objectMode = false; + this._writableState.objectMode = true; + } + + _transform(obj, encoding, callback){ + this.push(JSON.stringify(obj)); + callback(); + } +} + +module.exports = {ParserStream, StringifyStream}; \ No newline at end of file diff --git a/lib/pdf.js b/lib/pdf.js index fe1c35ca..4dc131d0 100644 --- a/lib/pdf.js +++ b/lib/pdf.js @@ -1,499 +1,469 @@ -'use strict'; - -let nodeUtil = require("util"), - nodeEvents = require("events"), - fs = require('fs'), - _ = require('lodash'), - DOMParser = require('xmldom').DOMParser, - ImageData = require('./imagedata.js'), - PDFCanvas = require('./pdfcanvas.js'), - PDFUnit = require('./pdfunit.js'), - PDFField = require('./pdffield.js'), - PDFAnno = require('./pdfanno.js'), - Image = require('./pdfimage.js'), - pkInfo = require('../package.json'), - PDFFont = require('./pdffont'); +const nodeUtil = require("util"), + { EventEmitter } = require("events"), + { Blob } = require("buffer"), + fs = require("fs"), + DOMParser = require("@xmldom/xmldom").DOMParser, + PDFCanvas = require("./pdfcanvas"), + PDFUnit = require("./pdfunit"), + PDFField = require("./pdffield"), + PDFAnno = require("./pdfanno"), + Image = require("./pdfimage"), + pkInfo = require("../package.json"), + PDFFont = require("./pdffont"); const _pdfjsFiles = [ - 'shared/util.js', - 'shared/colorspace.js', - 'shared/pattern.js', - 'shared/function.js', - 'shared/annotation.js', - - 'core/core.js', - 'core/obj.js', - 'core/charsets.js', - 'core/crypto.js', - 'core/evaluator.js', - 'core/fonts.js', - 'core/font_renderer.js', - 'core/glyphlist.js', - 'core/image.js', - 
'core/metrics.js', - 'core/parser.js', - 'core/stream.js', - 'core/worker.js', - 'core/jpx.js', - 'core/jbig2.js', - 'core/bidi.js', - 'core/jpg.js', - 'core/chunked_stream.js', - 'core/pdf_manager.js', - 'core/cmap.js', - 'core/cidmaps.js', - - 'display/canvas.js', - 'display/font_loader.js', - 'display/metadata.js', - 'display/api.js' + "shared/util.js", + "shared/colorspace.js", + "shared/pattern.js", + "shared/function.js", + "shared/annotation.js", + + "core/core.js", + "core/obj.js", + "core/charsets.js", + "core/crypto.js", + "core/evaluator.js", + "core/fonts.js", + "core/font_renderer.js", + "core/glyphlist.js", + "core/image.js", + "core/metrics.js", + "core/parser.js", + "core/stream.js", + "core/worker.js", + "core/jpx.js", + "core/jbig2.js", + "core/bidi.js", + "core/jpg.js", + "core/chunked_stream.js", + "core/pdf_manager.js", + "core/cmap.js", + "core/cidmaps.js", + + "display/canvas.js", + "display/font_loader.js", + "display/metadata.js", + "display/api.js", ]; const _PARSER_SIG = `${pkInfo.name}@${pkInfo.version} [${pkInfo.homepage}]`; //////replacing HTML5 canvas with PDFCanvas (in-memory canvas) -function createScratchCanvas(width, height) { return new PDFCanvas({}, width, height); } +function createScratchCanvas(width, height) { + return new PDFCanvas({}, width, height); +} -let PDFJS = {}; -let globalScope = {console: console}; +const PDFJS = {}; +const globalScope = { console }; -let _basePath = __dirname + "/../base/"; -let _fileContent = ''; +const _basePath = __dirname + "/../base/"; +let _fileContent = ""; -_pdfjsFiles.forEach( (fieldName, idx, arr) => _fileContent += fs.readFileSync(_basePath + fieldName, 'utf8') ); +_pdfjsFiles.forEach( + (fieldName, idx, arr) => + (_fileContent += fs.readFileSync(_basePath + fieldName, "utf8")) +); eval(_fileContent); ////////////////////////////////start of helper classes -let PDFPageParser = (function () { - // private static - let _nextId = 1; - let _name = 'PDFPageParser'; - - let RenderingStates = 
{ - INITIAL: 0, - RUNNING: 1, - PAUSED: 2, - FINISHED: 3 - }; - - let _addField = function(field) { - if (!PDFField.isFormElement(field)) - return; - - let oneField = new PDFField(field, this.viewport, this.Fields, this.Boxsets); - oneField.processField(); - }; - - // constructor - let cls = function (pdfPage, id, scale, ptiParser) { - nodeEvents.EventEmitter.call(this); - // private - let _id = _nextId++; - - // public (every instance will have their own copy of these methods, needs to be lightweight) - this.get_id = () => _id; - this.get_name = () => _name + _id; - - // public, this instance copies - this.id = id; - this.pdfPage = pdfPage; - this.ptiParser = ptiParser; - - this.scale = scale || 1.0; - - //leave out the 2nd parameter in order to use page's default rotation (for both portrait and landscape form) - this.viewport = this.pdfPage.getViewport(this.scale); - - this.renderingState = RenderingStates.INITIAL; - - //form elements other than radio buttons and check boxes - this.Fields = []; - //form elements: radio buttons and check boxes - this.Boxsets = []; - - //public properties - Object.defineProperty(this, 'width', { - get:function () { - return PDFUnit.toFormX(this.viewport.width); - }, - enumerable:true - }); - - Object.defineProperty(this, 'height', { - get:function () { - return PDFUnit.toFormY(this.viewport.height); - }, - enumerable:true - }); - }; - // inherit from event emitter - nodeUtil.inherits(cls, nodeEvents.EventEmitter); +class PDFPageParser { + //static + static RenderingStates = { + INITIAL: 0, + RUNNING: 1, + PAUSED: 2, + FINISHED: 3, + }; + + //public + id = -1; + pdfPage = null; + ptiParser = null; + scale = 0; + viewport = null; + renderingState = -1; + + Fields = null; + Boxsets = null; + ctxCanvas = null; + + #_addField(field) { + if (!PDFField.isFormElement(field)) { + nodeUtil.p2jwarn("NOT valid form element", field); + return; + } + + const oneField = new PDFField( + field, + this.viewport, + this.Fields, + this.Boxsets + ); + 
oneField.processField(); + } + + // constructor + constructor(pdfPage, id, scale, ptiParser) { + // public, this instance copies + this.id = id; + this.pdfPage = pdfPage; + this.ptiParser = ptiParser; + + this.scale = scale || 1.0; + + //leave out the 2nd parameter in order to use page's default rotation (for both portrait and landscape form) + this.viewport = this.pdfPage.getViewport(this.scale); + + this.renderingState = PDFPageParser.RenderingStates.INITIAL; + + //form elements other than radio buttons and check boxes + this.Fields = []; + //form elements: radio buttons and check boxes + this.Boxsets = []; + this.ctxCanvas = {}; + } + + get width() { + return PDFUnit.toFormX(this.viewport.width); + } + get height() { + return PDFUnit.toFormY(this.viewport.height); + } + get HLines() { + return this.ctxCanvas.HLines; + } + get VLines() { + return this.ctxCanvas.VLines; + } + get Fills() { + return this.ctxCanvas.Fills; + } + get Texts() { + return this.ctxCanvas.Texts; + } + + destroy() { + this.pdfPage.destroy(); + this.pdfPage = null; + + this.ptiParser = null; + this.Fields = null; + this.Boxsets = null; + this.ctxCanvas = null; + } + + getPagePoint(x, y) { + return this.viewport.convertToPdfPoint(x, y); + } + + parsePage(callback, errorCallBack) { + if (this.renderingState !== PDFPageParser.RenderingStates.INITIAL) { + errorCallBack("Must be in new state before drawing"); + return; + } + + this.renderingState = PDFPageParser.RenderingStates.RUNNING; + + const canvas = createScratchCanvas(1, 1); + const ctx = canvas.getContext("2d"); + + function pageViewDrawCallback(error) { + this.renderingState = PDFPageParser.RenderingStates.FINISHED; + + if (error) { + console.error(error); + errorCallBack(`Error: Page ${this.id + 1}: ${error.message}`); + } else { + if (this.ptiParser) { + const extraFields = this.ptiParser.getFields(parseInt(this.id) + 1); + extraFields.forEach((field) => this.#_addField(field)); + } - cls.prototype.destroy = function() { - 
this.pdfPage.destroy(); - this.pdfPage = null; + this.ctxCanvas = ctx.canvas; + this.stats = this.pdfPage.stats; - this.ptiParser = null; - this.Fields = null; - this.Boxsets = null; - }; + nodeUtil.p2jinfo(`Success: Page ${this.id + 1}`); + callback(); + } + } - cls.prototype.getPagePoint = function(x, y) { - return this.viewport.convertToPdfPoint(x, y); + const renderContext = { + canvasContext: ctx, + viewport: this.viewport, }; - cls.prototype.parsePage = function(callback, errorCallBack) { - if (this.renderingState !== RenderingStates.INITIAL) - error('Must be in new state before drawing'); - - this.renderingState = RenderingStates.RUNNING; - - let canvas = createScratchCanvas(1, 1); - let ctx = canvas.getContext('2d'); - - function pageViewDrawCallback(error) { - this.renderingState = RenderingStates.FINISHED; - - if (error) { - let errMsg = 'An error occurred while rendering the page ' + (this.id + 1) + - ':\n' + error.message + - ':\n' + error.stack; - errorCallBack(errMsg); - } - else { - if (this.ptiParser) { - let extraFields = this.ptiParser.getFields(parseInt(this.id) + 1); - _.each(extraFields, _.bind(_addField, this)); - } - - _.extend(this, ctx.canvas); - this.stats = this.pdfPage.stats; - - nodeUtil.p2jinfo('page ' + (this.id + 1) + ' is rendered successfully.'); - callback(); - } - } - - let renderContext = { - canvasContext:ctx, - viewport:this.viewport - }; - - this.pdfPage.render(renderContext).then( - data => { - this.pdfPage.getAnnotations().then( - fields => { - _.each(fields, _.bind(_addField, this)); - pageViewDrawCallback.call(this, null); - }, - err => console.error("pdfPage.getAnnotations error:" + err)); - }, - err => pageViewDrawCallback.call(this, err) + this.pdfPage.render(renderContext).then( + (data) => { + this.pdfPage.getAnnotations().then( + (fields) => { + fields.forEach((field) => this.#_addField(field)); + pageViewDrawCallback.call(this, null); + }, + (err) => errorCallBack("pdfPage.getAnnotations error:" + err) ); - }; - - 
return cls; - -})(); + }, + (err) => pageViewDrawCallback.call(this, err) + ); + } +} ////////////////////////////////Start of Node.js Module -let PDFJSClass = (function () { - // private static - let _nextId = 1; - let _name = 'PDFJSClass'; - let _sufInfo = "_fieldInfo.xml"; - - let _getMetaDataString = function(metadata, key){ - let retVal = "unknown"; - if (metadata && metadata.has(key)) { - retVal = encodeURIComponent(metadata.get(key)); - } - return retVal; - }; - - let _getMetaDataInt = function(metadata, key){ - let retVal = _getMetaDataString(metadata, key); - retVal = parseInt(retVal); - if (retVal == null || isNaN(retVal)) - retVal = -1; - return retVal; +class PDFJSClass extends EventEmitter { + pdfDocument = null; + pages = null; + rawTextContents = null; + + needRawText = null; + + // constructor + constructor(needRawText) { + super(); + + // public, this instance copies + this.pdfDocument = null; + this.pages = []; + this.rawTextContents = []; + + this.needRawText = needRawText; + } + + raiseErrorEvent(errMsg) { + console.error(errMsg); + process.nextTick(() => this.emit("pdfjs_parseDataError", errMsg)); + // this.emit("error", errMsg); + return errMsg; + } + + raiseReadyEvent(data) { + process.nextTick(() => this.emit("pdfjs_parseDataReady", data)); + return data; + } + + parsePDFData(arrayBuffer, password) { + this.pdfDocument = null; + + const parameters = { password: password, data: arrayBuffer }; + PDFJS.getDocument(parameters).then( + (pdfDocument) => this.load(pdfDocument, 1), + (error) => this.raiseErrorEvent(error) + ); + } + + tryLoadFieldInfoXML(pdfFilePath) { + const _sufInfo = "_fieldInfo.xml"; + const fieldInfoXMLPath = pdfFilePath.replace(".pdf", _sufInfo); + if ( + fieldInfoXMLPath.indexOf(_sufInfo) < 1 || + !fs.existsSync(fieldInfoXMLPath) + ) { + return; + } + nodeUtil.p2jinfo("About to load fieldInfo XML : " + fieldInfoXMLPath); + + let PTIXmlParser = require("./ptixmlinject"); + this.ptiParser = new PTIXmlParser(); + 
this.ptiParser.parseXml(fieldInfoXMLPath, (err) => { + if (err) { + nodeUtil.p2jwarn("fieldInfo XML Error: " + JSON.stringify(err)); + this.ptiParser = null; + } else { + nodeUtil.p2jinfo("fieldInfo XML loaded."); + } + }); + } + + load(pdfDocument, scale) { + this.pdfDocument = pdfDocument; + + return this.loadMetaData().then( + () => this.loadPages(), + (error) => this.raiseErrorEvent("loadMetaData error: " + error) + ); + } + + loadMetaData() { + return this.pdfDocument.getMetadata().then( + (data) => { + this.documentInfo = data.info; + this.metadata = data.metadata?.metadata ?? {}; + this.parseMetaData(); + }, + (error) => this.raiseErrorEvent("pdfDocument.getMetadata error: " + error) + ); + } + + parseMetaData() { + const meta = { + Transcoder: _PARSER_SIG, + Meta: { ...this.documentInfo, Metadata: this.metadata }, }; + this.raiseReadyEvent(meta); + this.emit("readable", meta); + } + + loadPages() { + const pagesCount = this.pdfDocument.numPages; + const pagePromises = []; + for (let i = 1; i <= pagesCount; i++) + pagePromises.push(this.pdfDocument.getPage(i)); + + const pagesPromise = PDFJS.Promise.all(pagePromises); + + nodeUtil.p2jinfo("PDF loaded. 
pagesCount = " + pagesCount); + + return pagesPromise.then( + (promisedPages) => this.parsePage(promisedPages, 0, 1.5), + (error) => this.raiseErrorEvent("pagesPromise error: " + error) + ); + } + + parsePage(promisedPages, id, scale) { + nodeUtil.p2jinfo("start to parse page:" + (id + 1)); + + const pdfPage = promisedPages[id]; + const pageParser = new PDFPageParser(pdfPage, id, scale, this.ptiParser); + + function continueOnNextPage() { + nodeUtil.p2jinfo("complete parsing page:" + (id + 1)); + if (id === this.pdfDocument.numPages - 1) { + this.raiseReadyEvent({ Pages: this.pages }); + //v1.1.2: signal end of parsed data with null + process.nextTick(() => this.raiseReadyEvent(null)); + this.emit("data", null); + } else { + process.nextTick(() => this.parsePage(promisedPages, ++id, scale)); + } + } + + pageParser.parsePage( + (data) => { + const page = { + Width: pageParser.width, + Height: pageParser.height, + HLines: pageParser.HLines, + VLines: pageParser.VLines, + Fills: pageParser.Fills, + //needs to keep current default output format, text content will output to a separate file if '-c' command line argument is set + // Content:pdfPage.getTextContent(), + Texts: pageParser.Texts, + Fields: pageParser.Fields, + Boxsets: pageParser.Boxsets, + Images: pageParser.Images, + }; - // constructor - let cls = function (needRawText) { - nodeEvents.EventEmitter.call(this); - // private - let _id = _nextId++; - - // public (every instance will have their own copy of these methods, needs to be lightweight) - this.get_id = () => _id; - this.get_name = () => _name + _id; - - // public, this instance copies - this.pdfDocument = null; - this.pages = []; - this.pageWidth = 0; - this.rawTextContents = []; - - this.needRawText = needRawText; - }; - // inherit from event emitter - nodeUtil.inherits(cls, nodeEvents.EventEmitter); - - cls.prototype.raiseErrorEvent = function(errMsg) { - console.error(errMsg); - process.nextTick( () => this.emit("pdfjs_parseDataError", errMsg)); - 
return errMsg; - }; - - cls.prototype.raiseReadyEvent = function(data) { - process.nextTick( () => this.emit("pdfjs_parseDataReady", data) ); - return data; - }; - - - cls.prototype.parsePDFData = function(arrayBuffer) { - this.pdfDocument = null; + this.pages.push(page); + this.emit("data", page); - let parameters = {password: '', data: arrayBuffer}; - PDFJS.getDocument(parameters).then( - pdfDocument => this.load(pdfDocument, 1), - error => this.raiseErrorEvent("An error occurred while parsing the PDF: " + error) - ); - }; - - cls.prototype.tryLoadFieldInfoXML = function(pdfFilePath) { - let fieldInfoXMLPath = pdfFilePath.replace(".pdf", _sufInfo); - if ((fieldInfoXMLPath.indexOf(_sufInfo) < 1) || (!fs.existsSync(fieldInfoXMLPath))) { - return; + if (this.needRawText) { + pdfPage.getTextContent().then( + (textContent) => { + this.rawTextContents.push(textContent); + nodeUtil.p2jinfo("complete parsing raw text content:" + (id + 1)); + continueOnNextPage.call(this); + }, + (error) => + this.raiseErrorEvent("pdfPage.getTextContent error: " + error) + ); + } else { + continueOnNextPage.call(this); } - nodeUtil.p2jinfo("About to load fieldInfo XML : " + fieldInfoXMLPath); - - let PTIXmlParser = require('./ptixmlinject'); - this.ptiParser = new PTIXmlParser(); - this.ptiParser.parseXml(fieldInfoXMLPath, err => { - if (err) { - nodeUtil.p2jwarn("fieldInfo XML Error: " + JSON.stringify(err)); - this.ptiParser = null; - } - else { - nodeUtil.p2jinfo("fieldInfo XML loaded."); - } - }); - }; - - cls.prototype.load = function(pdfDocument, scale) { - this.pdfDocument = pdfDocument; - - return this.loadMetaData().then( - () => this.loadPages(), - error => this.raiseErrorEvent("loadMetaData error: " + error) - ); - }; - - cls.prototype.loadMetaData = function() { - return this.pdfDocument.getMetadata().then( - data => { - this.documentInfo = data.info; - this.metadata = data.metadata; - this.parseMetaData(); - }, - error => this.raiseErrorEvent("pdfDocument.getMetadata error: " 
+ error) - ); - }; - - cls.prototype.parseMetaData = function() { - let info = this.documentInfo; - let metadata = this.metadata; - - let pdfTile = ""; - if (metadata && metadata.has('dc:title')) { - pdfTile = metadata.get('dc:title'); + }, + (errMsg) => this.raiseErrorEvent(errMsg) + ); + } + + getRawTextContent() { + let retVal = ""; + if (!this.needRawText) return retVal; + + this.rawTextContents.forEach((textContent, index) => { + let prevText = null; + textContent.bidiTexts.forEach((textObj, idx) => { + if (prevText) { + if (Math.abs(textObj.y - prevText.y) <= 9) { + prevText.str += textObj.str; + } else { + retVal += prevText.str + "\r\n"; + prevText = textObj; + } + } else { + prevText = textObj; } - else if (info && info['Title']) - pdfTile = info['Title']; - - let formAttr = {AgencyId:"", Name: "", MC: false, Max: 1, Parent:""}; - if (metadata) { - formAttr.AgencyId = _getMetaDataString(metadata, 'pdfx:agencyid'); - if (formAttr.AgencyId != "unknown") - pdfTile = formAttr.AgencyId; - - formAttr.Name = _getMetaDataString(metadata, 'pdfx:name'); - formAttr.MC = _getMetaDataString(metadata, 'pdfx:mc') === 'true'; - formAttr.Max = _getMetaDataInt(metadata, 'pdfx:max'); - formAttr.Parent = _getMetaDataInt(metadata, 'pdfx:parent'); + }); + if (prevText) { + retVal += prevText.str; + } + retVal += + "\r\n----------------Page (" + index + ") Break----------------\r\n"; + }); + + return retVal; + } + + getAllFieldsTypes() { + return PDFField.getAllFieldsTypes({ Pages: this.pages || [] }); + } + + getMergedTextBlocksIfNeeded() { + for (let p = 0; p < this.pages.length; p++) { + let prevText = null; + let page = this.pages[p]; + + page.Texts.sort(PDFFont.compareBlockPos); + page.Texts = page.Texts.filter((t, j) => { + let isDup = j > 0 && PDFFont.areDuplicateBlocks(page.Texts[j - 1], t); + if (isDup) { + nodeUtil.p2jinfo( + "skipped: dup text block: " + decodeURIComponent(t.R[0].T) + ); } - - this.raiseReadyEvent({Transcoder: _PARSER_SIG, Agency:pdfTile, Id: 
formAttr}); - }; - - cls.prototype.loadPages = function() { - let pagesCount = this.pdfDocument.numPages; - let pagePromises = []; - for (let i = 1; i <= pagesCount; i++) - pagePromises.push(this.pdfDocument.getPage(i)); - - let pagesPromise = PDFJS.Promise.all(pagePromises); - - nodeUtil.p2jinfo("PDF loaded. pagesCount = " + pagesCount); - - return pagesPromise.then( - promisedPages => this.parsePage(promisedPages, 0, 1.5), - error => this.raiseErrorEvent("pagesPromise error: " + error) - ); - }; - - cls.prototype.parsePage = function(promisedPages, id, scale) { - nodeUtil.p2jinfo("start to parse page:" + (id+1)); - - let pdfPage = promisedPages[id]; - let pageParser = new PDFPageParser(pdfPage, id, scale, this.ptiParser); - - function continueOnNextPage() { - nodeUtil.p2jinfo("complete parsing page:" + (id+1)); - if (id === (this.pdfDocument.numPages - 1) ) { - this.raiseReadyEvent({Pages:this.pages, Width: this.pageWidth}); - - //v1.1.2: signal end of parsed data with null - process.nextTick(() => this.raiseReadyEvent(null)); - } - else { - process.nextTick(() => this.parsePage(promisedPages, ++id, scale)); - } + return !isDup; + }); + + for (let i = 0; i < page.Texts.length; i++) { + let text = page.Texts[i]; + + if (prevText) { + if ( + PDFFont.areAdjacentBlocks(prevText, text) && + PDFFont.haveSameStyle(prevText, text) + ) { + let preT = decodeURIComponent(prevText.R[0].T); + let curT = decodeURIComponent(text.R[0].T); + + prevText.R[0].T += text.R[0].T; + prevText.w += text.w; + text.merged = true; + + let mergedText = decodeURIComponent(prevText.R[0].T); + nodeUtil.p2jinfo( + `merged text block: ${preT} + ${curT} => ${mergedText}` + ); + prevText = null; //yeah, only merge two blocks for now + } else { + prevText = text; + } + } else { + prevText = text; } + } - pageParser.parsePage( - data => { - if (!this.pageWidth) //get PDF width - this.pageWidth = pageParser.width; - - let page = {Height: pageParser.height, - HLines: pageParser.HLines, - VLines: 
pageParser.VLines, - Fills:pageParser.Fills, - //needs to keep current default output format, text content will output to a separate file if '-c' command line argument is set - // Content:pdfPage.getTextContent(), - Texts: pageParser.Texts, - Fields: pageParser.Fields, - Boxsets: pageParser.Boxsets, - Images: pageParser.Images - }; - - this.pages.push(page); - - if (this.needRawText) { - pdfPage.getTextContent().then( - textContent => { - this.rawTextContents.push(textContent); - nodeUtil.p2jinfo("complete parsing raw text content:" + (id+1)); - continueOnNextPage.call(this); - }, - error => this.raiseErrorEvent("pdfPage.getTextContent error: " + error) - ); - } - else { - continueOnNextPage.call(this); - } - }, - errMsg => this.raiseErrorEvent("parsePage error:" + errMsg) - ); - }; + page.Texts = page.Texts.filter((t) => !t.merged); + } - cls.prototype.getRawTextContent = function() { - let retVal = ""; - if (!this.needRawText) - return retVal; - - _.each(this.rawTextContents, function(textContent, index) { - let prevText = null; - _.each(textContent.bidiTexts, function(textObj, idx) { - if (prevText) { - if (Math.abs(textObj.y - prevText.y) <= 9) { - prevText.str += textObj.str; - } - else { - retVal += prevText.str + "\r\n"; - prevText = textObj; - } - } - else { - prevText = textObj; - } - - }); - if (prevText) { - retVal += prevText.str; - } - retVal += "\r\n----------------Page (" + index + ") Break----------------\r\n"; - }); - - return retVal; - }; + return { Pages: this.pages }; + } - cls.prototype.getAllFieldsTypes = function() { - return PDFField.getAllFieldsTypes({Pages:this.pages || [], Width: this.pageWidth}); - }; - - cls.prototype.getMergedTextBlocksIfNeeded = function() { - for (let p = 0; p < this.pages.length; p++) { - let prevText = null; - let page = this.pages[p]; - - page.Texts.sort(PDFFont.compareBlockPos); - page.Texts = page.Texts.filter( (t, j) => { - let isDup = (j > 0) && PDFFont.areDuplicateBlocks(page.Texts[j-1], t); - if (isDup) { - 
nodeUtil.p2jinfo("skipped: dup text block: " + decodeURIComponent(t.R[0].T)); - } - return !isDup; - }); - - for (let i = 0; i < page.Texts.length; i++) { - let text = page.Texts[i]; - - if (prevText) { - if (PDFFont.areAdjacentBlocks(prevText, text) && PDFFont.haveSameStyle(prevText, text)) { - let preT = decodeURIComponent(prevText.R[0].T); - let curT = decodeURIComponent(text.R[0].T); - - prevText.R[0].T += text.R[0].T; - prevText.w += text.w; - text.merged = true; - - let mergedText = decodeURIComponent(prevText.R[0].T); - nodeUtil.p2jinfo(`merged text block: ${preT} + ${curT} => ${mergedText}`); - prevText = null; //yeah, only merge two blocks for now - } - else { - prevText = text; - } - } - else { - prevText = text; - } - } - - page.Texts = page.Texts.filter( t => !t.merged); - } - - return {Pages:this.pages, Width: this.pageWidth}; - }; - - cls.prototype.destroy = function() { - this.removeAllListeners(); - - if (this.pdfDocument) - this.pdfDocument.destroy(); - this.pdfDocument = null; - - this.pages = null; - this.rawTextContents = null; - }; + destroy() { + this.removeAllListeners(); - return cls; -})(); + if (this.pdfDocument) this.pdfDocument.destroy(); + this.pdfDocument = null; -module.exports = PDFJSClass; + this.pages = null; + this.rawTextContents = null; + } +} +module.exports = PDFJSClass; diff --git a/lib/pdfanno.js b/lib/pdfanno.js index 594561bc..b610278d 100644 --- a/lib/pdfanno.js +++ b/lib/pdfanno.js @@ -1,179 +1,157 @@ -'use strict'; - -let nodeUtil = require("util"), - _ = require("lodash"), - PDFUnit = require('./pdfunit.js'); - -let PDFAnno = (function PDFAnnoClosure() { - //BEGIN - MQZ 9/19/2012. 
Helper functions to parse acroForm elements - function setupRadioButton(annotation, item) { - let asName = ''; - //PDF Spec p.689: parent item's DV holds the item's value that is selected by default - let po = annotation.get('Parent'); - if (po) { - po.forEach(function(key, val){ - if (key === 'DV') { - asName = val.name || ''; - } - else if (key === 'TU') { - //radio buttons use the alternative text from the parent - item.alternativeText = val; - } else if( key == 'TM') { - item.alternativeID = val; - } - }); - } - - //PDF Spec p.606: get appearance dictionary - let ap = annotation.get('AP'); - //PDF Spec p.614 get normal appearance - let nVal = ap.get('N'); - //PDF Spec p.689 - nVal.forEach(function (key, value) { - if (key.toLowerCase() != "off") { - //value if selected - item.value = key; //export value - item.checked = (key === asName); //initial selection state +const nodeUtil = require("util"); + +//BEGIN - MQZ 9/19/2012. Helper functions to parse acroForm elements +function setupRadioButton(annotation, item) { + let asName = ''; + //PDF Spec p.689: parent item's DV holds the item's value that is selected by default + let po = annotation.get('Parent'); + if (po) { + po.forEach(function(key, val){ + if (key === 'DV') { + asName = val.name || ''; + } + else if (key === 'TU') { + //radio buttons use the alternative text from the parent + item.alternativeText = val; + } else if( key == 'TM') { + item.alternativeID = val; } }); - - if (!item.value) - item.value = "off"; } - function setupPushButton(annotation, item) { - //button label: PDF Spec p.640 - let mk = annotation.get('MK'); - item.value = mk.get('CA') || ''; - - //button action: url when mouse up: PDF Spec:p.642 - item.FL = ""; - let ap = annotation.get('A'); - if (ap) { - let sp = ap.get('S'); - item.FL = ap.get(sp.name); + //PDF Spec p.606: get appearance dictionary + let ap = annotation.get('AP'); + //PDF Spec p.614 get normal appearance + let nVal = ap.get('N'); + //PDF Spec p.689 + 
nVal.forEach(function (key, value) { + if (key.toLowerCase() != "off") { + //value if selected + item.value = key; //export value + item.checked = (key === asName); //initial selection state } - } + }); - function setupCheckBox(annotation, item) { - //PDF Spec p.606: get appearance dictionary - let ap = annotation.get('AP'); - //PDF Spec p.614 get normal appearance - let nVal = ap.get('N'); - - //PDF Spec p.689 - let i = 0; - nVal.forEach(function (key, value) { - i++; - if (i == 1) //initial selection state - item.value = key; - }); + if (!item.value) + item.value = "off"; +} + +function setupPushButton(annotation, item) { + //button label: PDF Spec p.640 + let mk = annotation.get('MK'); + if(mk) { + item.value = mk.get('CA') || ''; } - function setupDropDown(annotation, item) { - //PDF Spec p.688 - item.value = annotation.get('Opt') || []; + //button action: url when mouse up: PDF Spec:p.642 + item.FL = ""; + let ap = annotation.get('A'); + if (ap) { + let sp = ap.get('S'); + item.FL = ap.get(sp.name); + } +} + +function setupCheckBox(annotation, item) { + //PDF Spec p.606: get appearance dictionary + let ap = annotation.get('AP'); + //PDF Spec p.614 get normal appearance + let nVal = ap.get('N'); + + //PDF Spec p.689 + let i = 0; + nVal.forEach(function (key, value) { + i++; + if (i == 1) //initial selection state + item.value = key; + }); +} + +function setupDropDown(annotation, item) { + //PDF Spec p.688 + item.value = annotation.get('Opt') || []; +} + +function setupFieldAttributes(annotation, item) { + //MQZ. Jan.03.2013. additional-actions dictionary + //PDF Spec P.648. 8.5.2. Trigger Events + let aa = annotation.get('AA'); + if (!aa) { + return; } - function setupFieldAttributes(annotation, item) { - //MQZ. Jan.03.2013. additional-actions dictionary - //PDF Spec P.648. 8.5.2. 
Trigger Events - let aa = annotation.get('AA'); - if (!aa) { + //PDF Spec p.651 get format dictionary + let nVal = aa.get('F'); + if (!nVal) { + nVal = aa.get('K'); + if (!nVal) return; - } + } - //PDF Spec p.651 get format dictionary - let nVal = aa.get('F'); - if (!nVal) { - nVal = aa.get('K'); - if (!nVal) - return; + nVal.forEach(function (key, value) { + if (key === "JS") { + processFieldAttribute(value, item); } + }); +} - nVal.forEach(function (key, value) { - if (key === "JS") { - processFieldAttribute(value, item); - } - }); - } - - let AFSpecial_Format = ['zip', 'zip', 'phone', 'ssn', '']; +const AFSpecial_Format = ['zip', 'zip', 'phone', 'ssn', '']; // let AFNumber_Format = ['nDec', 'sepStyle', 'negStyle', 'currStyle', 'strCurrency', 'bCurrencyPrepend']; - //– nDec is the number of places after the decimal point; - //– sepStyle is an integer denoting whether to use a separator or not. If sepStyle=0, use commas. If sepStyle=1, do not separate. - //– negStyle is the formatting used for negative numbers: 0 = MinusBlack, 1 = Red, 2 = ParensBlack, 3 = ParensRed - //– currStyle is the currency style - not used - //- strCurrency is the currency symbol - //– bCurrencyPrepend +//– nDec is the number of places after the decimal point; +//– sepStyle is an integer denoting whether to use a separator or not. If sepStyle=0, use commas. If sepStyle=1, do not separate. 
+//– negStyle is the formatting used for negative numbers: 0 = MinusBlack, 1 = Red, 2 = ParensBlack, 3 = ParensRed +//– currStyle is the currency style - not used +//- strCurrency is the currency symbol +//– bCurrencyPrepend // let AFDate_FormatEx = ["m/d", "m/d/yy", "mm/dd/yy", "mm/yy", "d-mmm", "d-mmm-yy", "dd-mmm-yy", "yymm-dd", "mmm-yy", "mmmm-yy", "mmm d, yyyy", "mmmm d, yyyy", "m/d/yy h:MM tt", "m/d/yy HH:MM"]; - function processFieldAttribute(jsFuncName, item) { - if (item.hasOwnProperty('TName')) - return; +function processFieldAttribute(jsFuncName, item) { + if (item.hasOwnProperty('TName')) + return; - let vParts = jsFuncName.split('('); - if (vParts.length !== 2) - return; + if(!jsFuncName.split) + return; + + let vParts = jsFuncName.split('('); + if (vParts.length !== 2) + return; - let funcName = vParts[0]; - let funcParam = vParts[1].split(')')[0]; + let funcName = vParts[0]; + let funcParam = vParts[1].split(')')[0]; - switch (funcName) { - case 'AFSpecial_Format': - item.TName = AFSpecial_Format[Number(funcParam)]; - break; - case 'AFNumber_Format': + switch (funcName) { + case 'AFSpecial_Format': + item.TName = AFSpecial_Format[Number(funcParam)]; + break; + case 'AFNumber_Format': // nfs = funcParam.split(','); //set the Money fields to use the Number type with no decimal places after, no commas, and bCurrencyPrepend is set as true; (o use a negative sign (fits the PDF layout and our print formatting as well). 
// if (nfs[0] === '0' && nfs[1] === '1' && nfs[5]) // item.TName = 'money'; // else - item.TName = 'number'; - break; - case 'AFDate_FormatEx': - item.TName = 'date'; - item.MV = funcParam.replace(/^'+|^"+|'+$|"+$/g,''); //mask value - break; - case 'AFSpecial_KeystrokeEx': //special format: "arbitrary mask" - let maskValue = funcParam.replace(/^'+|^"+|'+$|"+$/g,''); //mask value - if ((!!maskValue) && maskValue.length > 0 && maskValue.length < 64) { - item.TName = 'mask'; //fixed length input - item.MV = maskValue; - } - break; - case 'AFPercent_Format': - item.TName = 'percent'; //funcParam => 2, 0, will specified how many decimal places - break; - } + item.TName = 'number'; + break; + case 'AFDate_FormatEx': + item.TName = 'date'; + item.MV = funcParam.replace(/^'+|^"+|'+$|"+$/g,''); //mask value + break; + case 'AFSpecial_KeystrokeEx': //special format: "arbitrary mask" + let maskValue = funcParam.replace(/^'+|^"+|'+$|"+$/g,''); //mask value + if ((!!maskValue) && maskValue.length > 0 && maskValue.length < 64) { + item.TName = 'mask'; //fixed length input + item.MV = maskValue; + } + break; + case 'AFPercent_Format': + item.TName = 'percent'; //funcParam => 2, 0, will specified how many decimal places + break; } +} - //END - MQZ 9/19/2012. Helper functions to parse acroForm elements - - // private static - let _nextId = 1; - let _name = 'PDFAnno'; - - // constructor - let cls = function (field, viewport, Fields, Boxsets) { - // private - let _id = _nextId++; - - // public (every instance will have their own copy of these methods, needs to be lightweight) - this.get_id = function () { - return _id; - }; - this.get_name = function () { - return _name + _id; - }; - }; +//END - MQZ 9/19/2012. 
Helper functions to parse acroForm elements - cls.prototype.clean = function () { - delete this.get_id; - delete this.get_name; - }; - - cls.processAnnotation = function (annotation, item) { +class PDFAnno { + static processAnnotation(annotation, item) { if (item.fieldType == 'Btn') { //PDF Spec p.675 if (item.fieldFlags & 32768) { setupRadioButton(annotation, item); @@ -191,10 +169,11 @@ let PDFAnno = (function PDFAnnoClosure() { else if (item.fieldType == 'Tx') { setupFieldAttributes(annotation, item); } - }; - - return cls; -})(); + else { + nodeUtil.p2jwarn("Unknown fieldType: ", item); + } + } +} module.exports = PDFAnno; diff --git a/lib/pdfcanvas.js b/lib/pdfcanvas.js index 8d2032d1..2e756d48 100644 --- a/lib/pdfcanvas.js +++ b/lib/pdfcanvas.js @@ -1,108 +1,120 @@ -'use strict'; -let nodeUtil = require("util"), - _ = require('lodash'), +const nodeUtil = require("util"), PDFLine = require('./pdfline'), PDFFill = require('./pdffill'), PDFFont = require('./pdffont'), ImageData = require('./imagedata'); -(function () { - // private static - let _nextId = 1; - let _name = 'PDFCanvas'; - - // alias some functions to make (compiled) code shorter - let m = Math; - let mr = m.round; - let ms = m.sin; - let mc = m.cos; - let abs = m.abs; - let sqrt = m.sqrt; - - // precompute "00" to "FF" - let dec2hex = []; - for (let i = 0; i < 16; i++) { - for (let j = 0; j < 16; j++) { - dec2hex[i * 16 + j] = i.toString(16) + j.toString(16); - } - } +// alias some functions to make (compiled) code shorter +const {round: mr, sin: ms, cos: mc, abs, sqrt} = Math; - function createMatrixIdentity() { - return [ - [1, 0, 0], - [0, 1, 0], - [0, 0, 1] - ]; +// precompute "00" to "FF" +const dec2hex = []; +for (let i = 0; i < 16; i++) { + for (let j = 0; j < 16; j++) { + dec2hex[i * 16 + j] = i.toString(16) + j.toString(16); } +} - function matrixMultiply(m1, m2) { - let result = createMatrixIdentity(); +function createMatrixIdentity() { + return [ + [1, 0, 0], + [0, 1, 0], + [0, 0, 1] + 
]; +} - for (let x = 0; x < 3; x++) { - for (let y = 0; y < 3; y++) { - let sum = 0; +function matrixMultiply(m1, m2) { + let result = createMatrixIdentity(); - for (let z = 0; z < 3; z++) { - sum += m1[x][z] * m2[z][y]; - } + for (let x = 0; x < 3; x++) { + for (let y = 0; y < 3; y++) { + let sum = 0; - result[x][y] = sum; - } - } - return result; - } - - function copyState(o1, o2) { - o2.fillStyle = o1.fillStyle; - o2.lineCap = o1.lineCap; - o2.lineJoin = o1.lineJoin; - o2.lineWidth = o1.lineWidth; - o2.miterLimit = o1.miterLimit; - o2.shadowBlur = o1.shadowBlur; - o2.shadowColor = o1.shadowColor; - o2.shadowOffsetX = o1.shadowOffsetX; - o2.shadowOffsetY = o1.shadowOffsetY; - o2.strokeStyle = o1.strokeStyle; - o2.globalAlpha = o1.globalAlpha; - o2.arcScaleX_ = o1.arcScaleX_; - o2.arcScaleY_ = o1.arcScaleY_; - o2.lineScale_ = o1.lineScale_; - o2.dashArray = o1.dashArray; - } - - function processStyle(styleString) { - let str, alpha = 1; - - styleString = String(styleString); - if (styleString.substring(0, 3) == 'rgb') { - let start = styleString.indexOf('(', 3); - let end = styleString.indexOf(')', start + 1); - let guts = styleString.substring(start + 1, end).split(','); - - str = '#'; - for (let i = 0; i < 3; i++) { - str += dec2hex[Number(guts[i])]; + for (let z = 0; z < 3; z++) { + sum += m1[x][z] * m2[z][y]; } - if (guts.length == 4 && styleString.substr(3, 1) == 'a') { - alpha = guts[3]; - } - } else { - str = styleString; + result[x][y] = sum; + } + } + return result; +} + +function copyState(o1, o2) { + o2.fillStyle = o1.fillStyle; + o2.lineCap = o1.lineCap; + o2.lineJoin = o1.lineJoin; + o2.lineWidth = o1.lineWidth; + o2.miterLimit = o1.miterLimit; + o2.shadowBlur = o1.shadowBlur; + o2.shadowColor = o1.shadowColor; + o2.shadowOffsetX = o1.shadowOffsetX; + o2.shadowOffsetY = o1.shadowOffsetY; + o2.strokeStyle = o1.strokeStyle; + o2.globalAlpha = o1.globalAlpha; + o2.arcScaleX_ = o1.arcScaleX_; + o2.arcScaleY_ = o1.arcScaleY_; + o2.lineScale_ = 
o1.lineScale_; + o2.dashArray = o1.dashArray; +} + +function processStyle(styleString) { + let str, alpha = 1; + + styleString = String(styleString); + if (styleString.substring(0, 3) == 'rgb') { + let start = styleString.indexOf('(', 3); + let end = styleString.indexOf(')', start + 1); + let guts = styleString.substring(start + 1, end).split(','); + + str = '#'; + for (let i = 0; i < 3; i++) { + str += dec2hex[Number(guts[i])]; } - return {color:str, alpha:alpha}; + if (guts.length == 4 && styleString.substr(3, 1) == 'a') { + alpha = guts[3]; + } + } else { + str = styleString; } - function processLineCap(lineCap) { - switch (lineCap) { - case 'butt': - return 'flat'; - case 'round': - return 'round'; - case 'square': - default: - return 'square'; + return {color:str, alpha:alpha}; +} + +function processLineCap(lineCap) { + switch (lineCap) { + case 'butt': + return 'flat'; + case 'round': + return 'round'; + case 'square': + default: + return 'square'; + } +} + +// Helper function that takes the already fixed cordinates. +function bezierCurveToHelper(self, cp1, cp2, p) { + self.currentPath_.push({ + type:'bezierCurveTo', + cp1x:cp1.x, + cp1y:cp1.y, + cp2x:cp2.x, + cp2y:cp2.y, + x:p.x, + y:p.y + }); + self.currentX_ = p.x; + self.currentY_ = p.y; +} + +function matrixIsFinite(m) { + for (let j = 0; j < 3; j++) { + for (let k = 0; k < 2; k++) { + if (!isFinite(m[j][k]) || isNaN(m[j][k])) { + return false; + } } } @@ -135,20 +147,56 @@ let nodeUtil = require("util"), images.push(image); } - /** - * This class implements CanvasRenderingContext2D interface as described by - * the WHATWG. - * @param {HTMLElement} surfaceElement The element that the 2D context should - * be associated with - */ - function CanvasRenderingContext2D_(canvasTarget, scaledWidth, scaledHeight) { - // private - let _id = _nextId++; +function setM(ctx, m, updateLineScale) { + if (!matrixIsFinite(m)) { + return; + } + ctx.m_ = m; + + if (updateLineScale) { + // Get the line scale. 
+ // Determinant of this.m_ means how much the area is enlarged by the + // transformation. So its square root can be used as a scale factor + // for width. + let det = m[0][0] * m[1][1] - m[0][1] * m[1][0]; + ctx.lineScale_ = sqrt(abs(det)); + } +} - // public (every instance will have their own copy of these methods, needs to be lightweight) - this.get_id = function() { return _id; }; - this.get_name = function() { return _name + _id; }; +class CanvasPattern_ { + constructor() { + } +} +// Gradient / Pattern Stubs +class CanvasGradient_ { + constructor(aType) { + this.type_ = aType; + this.x0_ = 0; + this.y0_ = 0; + this.r0_ = 0; + this.x1_ = 0; + this.y1_ = 0; + this.r1_ = 0; + this.colors_ = []; + } + addColorStop(aOffset, aColor) { + aColor = processStyle(aColor); + this.colors_.push({offset:aOffset, + color:aColor.color, + alpha:aColor.alpha}); + } +} + + +/** + * This class implements CanvasRenderingContext2D interface as described by + * the WHATWG. + * @param {HTMLElement} surfaceElement The element that the 2D context should + * be associated with + */ +class CanvasRenderingContext2D_ { + constructor(canvasTarget, scaledWidth, scaledHeight) { this.m_ = createMatrixIdentity(); this.mStack_ = []; @@ -166,13 +214,13 @@ let nodeUtil = require("util"), this.miterLimit = 1; this.globalAlpha = 1; - if (!_.has(canvasTarget, "HLines") || !_.isArray(canvasTarget.HLines)) + if (!("HLines" in canvasTarget) || !Array.isArray(canvasTarget.HLines)) canvasTarget.HLines = []; - if (!_.has(canvasTarget, "VLines") || !_.isArray(canvasTarget.VLines)) + if (!("VLines" in canvasTarget) || !Array.isArray(canvasTarget.VLines)) canvasTarget.VLines = []; - if (!_.has(canvasTarget, "Fills") || !_.isArray(canvasTarget.Fills)) + if (!("Fills" in canvasTarget) || !Array.isArray(canvasTarget.Fills)) canvasTarget.Fills = []; - if (!_.has(canvasTarget, "Texts") || !_.isArray(canvasTarget.Texts)) + if (!("Texts" in canvasTarget) || !Array.isArray(canvasTarget.Texts)) canvasTarget.Texts = 
[]; if (!_.has(canvasTarget, "Images") || !_.isArray(canvasTarget.Images)) canvasTarget.Images = []; @@ -190,42 +238,40 @@ let nodeUtil = require("util"), } //private helper methods - let _drawPDFLine = function(p1, p2, lineWidth, color) { - let dashedLine = _.isArray(this.dashArray) && (this.dashArray.length > 1); + #drawPDFLine(p1, p2, lineWidth, color) { + let dashedLine = Array.isArray(this.dashArray) && (this.dashArray.length > 1); let pL = new PDFLine(p1.x, p1.y, p2.x, p2.y, lineWidth, color, dashedLine); pL.processLine(this.canvas); - }; + } - let _drawPDFFill = function(cp, min, max, color) { + #drawPDFFill(cp, min, max, color) { let width = max.x - min.x; let height = max.y - min.y; let pF = new PDFFill(cp.x, cp.y, width, height, color); pF.processFill(this.canvas); - }; + } - let _needRemoveRect = function(x, y, w, h) { + #needRemoveRect(x, y, w, h) { let retVal = (Math.abs(w - Math.abs(h)) < 1 && w < 13); if (retVal) { nodeUtil.p2jinfo("Skipped: tiny rect: w=" + w + ", h=" + h); } return retVal; - }; - - let contextPrototype = CanvasRenderingContext2D_.prototype; + } - contextPrototype.getContext = function(ctxType) { + getContext(ctxType) { return (ctxType === "2d") ? this : null; - }; + } - contextPrototype.setLineDash = function(lineDash) { + setLineDash(lineDash) { this.dashArray = lineDash; - }; + } - contextPrototype.getLineDash= function() { + getLineDash() { return this.dashArray; - }; + } - contextPrototype.fillText = function(text, x, y, maxWidth, fontSize) { + fillText(text, x, y, maxWidth, fontSize) { if (!text || text.trim().length < 1) return; let p = this.getCoords_(x, y); @@ -236,73 +282,59 @@ let nodeUtil = require("util"), this.currentFont.processText(p, text, maxWidth, color, fontSize, this.canvas, this.m_); }; - contextPrototype.strokeText = function(text, x, y, maxWidth) { + strokeText(text, x, y, maxWidth) { //MQZ. 
10/23/2012, yeah, no hollow text for now this.fillText(text, x, y, maxWidth); - }; + } - contextPrototype.measureText = function(text) { + measureText(text) { console.warn("to be implemented: contextPrototype.measureText - ", text); let chars = text.length || 1; return {width: chars * (this.currentFont.spaceWidth || 5)}; - }; + } - contextPrototype.setFont = function(fontObj) { - if ((!!this.currentFont) && _.isFunction(this.currentFont.clean)) { + setFont(fontObj) { + if ((!!this.currentFont) && typeof(this.currentFont.clean) === "function") { this.currentFont.clean(); this.currentFont = null; } this.currentFont = new PDFFont(fontObj); - }; + } - contextPrototype.clearRect = function () { - }; + clearRect() { + console.warn("to be implemented: contextPrototype.clearRect"); + } - contextPrototype.beginPath = function () { + beginPath() { // TODO: Branch current matrix so that save/restore has no effect // as per safari docs. this.currentPath_ = []; - }; + } - contextPrototype.moveTo = function (aX, aY) { + moveTo(aX, aY) { let p = this.getCoords_(aX, aY); this.currentPath_.push({type:'moveTo', x:p.x, y:p.y}); this.currentX_ = p.x; this.currentY_ = p.y; - }; + } - contextPrototype.lineTo = function (aX, aY) { + lineTo(aX, aY) { let p = this.getCoords_(aX, aY); this.currentPath_.push({type:'lineTo', x:p.x, y:p.y}); this.currentX_ = p.x; this.currentY_ = p.y; - }; + } - contextPrototype.bezierCurveTo = function (aCP1x, aCP1y, aCP2x, aCP2y, aX, aY) { + bezierCurveTo(aCP1x, aCP1y, aCP2x, aCP2y, aX, aY) { let p = this.getCoords_(aX, aY); let cp1 = this.getCoords_(aCP1x, aCP1y); let cp2 = this.getCoords_(aCP2x, aCP2y); - bezierCurveTo(this, cp1, cp2, p); - }; - - // Helper function that takes the already fixed cordinates. 
- function bezierCurveTo(self, cp1, cp2, p) { - self.currentPath_.push({ - type:'bezierCurveTo', - cp1x:cp1.x, - cp1y:cp1.y, - cp2x:cp2.x, - cp2y:cp2.y, - x:p.x, - y:p.y - }); - self.currentX_ = p.x; - self.currentY_ = p.y; + bezierCurveToHelper(this, cp1, cp2, p); } - contextPrototype.quadraticCurveTo = function (aCPx, aCPy, aX, aY) { + quadraticCurveTo(aCPx, aCPy, aX, aY) { // the following is lifted almost directly from // http://developer.mozilla.org/en/docs/Canvas_tutorial:Drawing_shapes @@ -318,10 +350,10 @@ let nodeUtil = require("util"), y:cp1.y + (p.y - this.currentY_) / 3.0 }; - bezierCurveTo(this, cp1, cp2, p); - }; + bezierCurveToHelper(this, cp1, cp2, p); + } - contextPrototype.arc = function (aX, aY, aRadius, aStartAngle, aEndAngle, aClockwise) { + arc(aX, aY, aRadius, aStartAngle, aEndAngle, aClockwise) { let arcType = aClockwise ? 'at' : 'wa'; let xStart = aX + mc(aStartAngle) * aRadius; @@ -348,11 +380,10 @@ let nodeUtil = require("util"), yStart:pStart.y, xEnd:pEnd.x, yEnd:pEnd.y}); + } - }; - - contextPrototype.rect = function (aX, aY, aWidth, aHeight) { - if (_needRemoveRect.call(this, aX, aY, aWidth, aHeight)) { + rect(aX, aY, aWidth, aHeight) { + if (this.#needRemoveRect(aX, aY, aWidth, aHeight)) { return;//try to remove the rectangle behind radio buttons and checkboxes } @@ -361,10 +392,10 @@ let nodeUtil = require("util"), this.lineTo(aX + aWidth, aY + aHeight); this.lineTo(aX, aY + aHeight); this.closePath(); - }; + } - contextPrototype.strokeRect = function (aX, aY, aWidth, aHeight) { - if (_needRemoveRect.call(this, aX, aY, aWidth, aHeight)) { + strokeRect(aX, aY, aWidth, aHeight) { + if (this.#needRemoveRect(aX, aY, aWidth, aHeight)) { return;//try to remove the rectangle behind radio buttons and checkboxes } @@ -379,10 +410,10 @@ let nodeUtil = require("util"), this.stroke(); this.currentPath_ = oldPath; - }; + } - contextPrototype.fillRect = function (aX, aY, aWidth, aHeight) { - if (_needRemoveRect.call(this, aX, aY, aWidth, aHeight)) 
{ + fillRect(aX, aY, aWidth, aHeight) { + if (this.#needRemoveRect(aX, aY, aWidth, aHeight)) { return;//try to remove the rectangle behind radio buttons and checkboxes } @@ -397,18 +428,18 @@ let nodeUtil = require("util"), this.fill(); this.currentPath_ = oldPath; - }; + } - contextPrototype.createLinearGradient = function (aX0, aY0, aX1, aY1) { + createLinearGradient(aX0, aY0, aX1, aY1) { let gradient = new CanvasGradient_('gradient'); gradient.x0_ = aX0; gradient.y0_ = aY0; gradient.x1_ = aX1; gradient.y1_ = aY1; return gradient; - }; + } - contextPrototype.createRadialGradient = function (aX0, aY0, aR0, aX1, aY1, aR1) { + createRadialGradient(aX0, aY0, aR0, aX1, aY1, aR1) { let gradient = new CanvasGradient_('gradientradial'); gradient.x0_ = aX0; gradient.y0_ = aY0; @@ -417,9 +448,9 @@ let nodeUtil = require("util"), gradient.y1_ = aY1; gradient.r1_ = aR1; return gradient; - }; + } - contextPrototype.drawImage = function(image) { + drawImage(image, var_args) { if (image instanceof CanvasRenderingContext2D_) { image.canvas.Images.forEach(function(data) { addImage(data, this.canvas.Images); @@ -429,25 +460,25 @@ let nodeUtil = require("util"), } }; - contextPrototype.putImageData = function(image) { + #putImageData(image) { addImage(image, this.canvas.Images); }; - contextPrototype.createImageData = function(width, height) { + #createImageData(width, height) { return new ImageData(width, height); }; - contextPrototype.getImageData = function (x, y, w, h) { + getImageData(x, y, w, h) { //MQZ. 
returns empty data buffer for now return { width:w, height:h, data:new Uint8Array(w * h * 4) }; - }; + } - contextPrototype.stroke = function (aFill) { + stroke(aFill) { if (this.currentPath_.length < 2) { return; } @@ -469,14 +500,14 @@ let nodeUtil = require("util"), case 'lineTo': if (!aFill) { //lines if (i > 0) { - _drawPDFLine.call(this, this.currentPath_[i-1], p, lineWidth, color); + this.#drawPDFLine(this.currentPath_[i-1], p, lineWidth, color); } } break; case 'close': if (!aFill) { //lines if (i > 0) { - _drawPDFLine.call(this, this.currentPath_[i-1], this.currentPath_[0], lineWidth, color); + this.#drawPDFLine(this.currentPath_[i-1], this.currentPath_[0], lineWidth, color); } } p = null; @@ -506,70 +537,43 @@ let nodeUtil = require("util"), } if (aFill) { //fill - _drawPDFFill.call(this, min, min, max, color); + this.#drawPDFFill(min, min, max, color); } - }; + } - contextPrototype.fill = function () { + fill() { this.stroke(true); - }; + } - contextPrototype.closePath = function () { + closePath() { this.currentPath_.push({type:'close'}); - }; + } /** * @private */ - contextPrototype.getCoords_ = function (aX, aY) { + getCoords_ (aX, aY) { let m = this.m_; return { x: (aX * m[0][0] + aY * m[1][0] + m[2][0]), y: (aX * m[0][1] + aY * m[1][1] + m[2][1]) }; - }; + } - contextPrototype.save = function () { + save() { let o = {}; copyState(this, o); this.aStack_.push(o); this.mStack_.push(this.m_); this.m_ = matrixMultiply(createMatrixIdentity(), this.m_); - }; + } - contextPrototype.restore = function () { + restore() { copyState(this.aStack_.pop(), this); this.m_ = this.mStack_.pop(); - }; - - function matrixIsFinite(m) { - for (let j = 0; j < 3; j++) { - for (let k = 0; k < 2; k++) { - if (!isFinite(m[j][k]) || isNaN(m[j][k])) { - return false; - } - } - } - return true; - } - - function setM(ctx, m, updateLineScale) { - if (!matrixIsFinite(m)) { - return; - } - ctx.m_ = m; - - if (updateLineScale) { - // Get the line scale. 
- // Determinant of this.m_ means how much the area is enlarged by the - // transformation. So its square root can be used as a scale factor - // for width. - let det = m[0][0] * m[1][1] - m[0][1] * m[1][0]; - ctx.lineScale_ = sqrt(abs(det)); - } } - contextPrototype.translate = function (aX, aY) { + translate(aX, aY) { let m1 = [ [1, 0, 0], [0, 1, 0], @@ -577,9 +581,9 @@ let nodeUtil = require("util"), ]; setM(this, matrixMultiply(m1, this.m_), false); - }; + } - contextPrototype.rotate = function (aRot) { + rotate(aRot) { let c = mc(aRot); let s = ms(aRot); @@ -590,9 +594,9 @@ let nodeUtil = require("util"), ]; setM(this, matrixMultiply(m1, this.m_), false); - }; + } - contextPrototype.scale = function (aX, aY) { + scale(aX, aY) { this.arcScaleX_ *= aX; this.arcScaleY_ *= aY; let m1 = [ @@ -602,9 +606,9 @@ let nodeUtil = require("util"), ]; setM(this, matrixMultiply(m1, this.m_), true); - }; + } - contextPrototype.transform = function (m11, m12, m21, m22, dx, dy) { + transform(m11, m12, m21, m22, dx, dy) { let m1 = [ [m11, m12, 0], [m21, m22, 0], @@ -612,9 +616,9 @@ let nodeUtil = require("util"), ]; setM(this, matrixMultiply(m1, this.m_), true); - }; + } - contextPrototype.setTransform = function (m11, m12, m21, m22, dx, dy) { + setTransform(m11, m12, m21, m22, dx, dy) { let m = [ [m11, m12, 0], [m21, m22, 0], @@ -622,47 +626,24 @@ let nodeUtil = require("util"), ]; setM(this, m, true); - }; + } /******** STUBS ********/ - contextPrototype.clip = function () { + clip() { // TODO: Implement - }; + } - contextPrototype.arcTo = function () { + arcTo() { // TODO: Implement - }; - - contextPrototype.createPattern = function () { - return new CanvasPattern_; - }; - - // Gradient / Pattern Stubs - function CanvasGradient_(aType) { - this.type_ = aType; - this.x0_ = 0; - this.y0_ = 0; - this.r0_ = 0; - this.x1_ = 0; - this.y1_ = 0; - this.r1_ = 0; - this.colors_ = []; } - CanvasGradient_.prototype.addColorStop = function (aOffset, aColor) { - aColor = 
processStyle(aColor); - this.colors_.push({offset:aOffset, - color:aColor.color, - alpha:aColor.alpha}); - }; - - function CanvasPattern_() { + createPattern() { + return new CanvasPattern_(); } +} - // set up externs - module.exports = CanvasRenderingContext2D_; +// set up externs +module.exports = CanvasRenderingContext2D_; // CanvasRenderingContext2D = CanvasRenderingContext2D_; // CanvasGradient = CanvasGradient_; // CanvasPattern = CanvasPattern_; - -})(); diff --git a/lib/pdfconst.js b/lib/pdfconst.js new file mode 100644 index 00000000..e3cfef39 --- /dev/null +++ b/lib/pdfconst.js @@ -0,0 +1,117 @@ +const kColors = [ + '#000000', // 0 + '#ffffff', // 1 + '#4c4c4c', // 2 + '#808080', // 3 + '#999999', // 4 + '#c0c0c0', // 5 + '#cccccc', // 6 + '#e5e5e5', // 7 + '#f2f2f2', // 8 + '#008000', // 9 + '#00ff00', // 10 + '#bfffa0', // 11 + '#ffd629', // 12 + '#ff99cc', // 13 + '#004080', // 14 + '#9fc0e1', // 15 + '#5580ff', // 16 + '#a9c9fa', // 17 + '#ff0080', // 18 + '#800080', // 19 + '#ffbfff', // 20 + '#e45b21', // 21 + '#ffbfaa', // 22 + '#008080', // 23 + '#ff0000', // 24 + '#fdc59f', // 25 + '#808000', // 26 + '#bfbf00', // 27 + '#824100', // 28 + '#007256', // 29 + '#008000', // 30 + '#000080', // Last + 1 + '#008080', // Last + 2 + '#800080', // Last + 3 + '#ff0000', // Last + 4 + '#0000ff', // Last + 5 + '#008000' // Last + 6 +]; + +const kFontFaces = [ + "quicktype,arial,helvetica,sans-serif", // 00 - QuickType - sans-serif variable font + "quicktype condensed,arial narrow,arial,helvetica,sans-serif", // 01 - QuickType Condensed - thin sans-serif variable font + "quicktypepi,quicktypeiipi", // 02 - QuickType Pi + "quicktype mono,courier new,courier,monospace", // 03 - QuickType Mono - san-serif fixed font + "ocr-a,courier new,courier,monospace", // 04 - OCR-A - OCR readable san-serif fixed font + "ocr b mt,courier new,courier,monospace" // 05 - OCR-B MT - OCR readable san-serif fixed font + ]; + + const kFontStyles = [ + // Face Size Bold Italic 
StyleID(Comment) + // ----- ---- ---- ----- ----------------- + [0, 6, 0, 0], //00 + [0, 8, 0, 0], //01 + [0, 10, 0, 0], //02 + [0, 12, 0, 0], //03 + [0, 14, 0, 0], //04 + [0, 18, 0, 0], //05 + [0, 6, 1, 0], //06 + [0, 8, 1, 0], //07 + [0, 10, 1, 0], //08 + [0, 12, 1, 0], //09 + [0, 14, 1, 0], //10 + [0, 18, 1, 0], //11 + [0, 6, 0, 1], //12 + [0, 8, 0, 1], //13 + [0, 10, 0, 1], //14 + [0, 12, 0, 1], //15 + [0, 14, 0, 1], //16 + [0, 18, 0, 1], //17 + [0, 6, 1, 1], //18 + [0, 8, 1, 1], //19 + [0, 10, 1, 1], //20 + [0, 12, 1, 1], //21 + [0, 14, 1, 1], //22 + [0, 18, 1, 1], //23 + [1, 6, 0, 0], //24 + [1, 8, 0, 0], //25 + [1, 10, 0, 0], //26 + [1, 12, 0, 0], //27 + [1, 14, 0, 0], //28 + [1, 18, 0, 0], //29 + [1, 6, 1, 0], //30 + [1, 8, 1, 0], //31 + [1, 10, 1, 0], //32 + [1, 12, 1, 0], //33 + [1, 14, 1, 0], //34 + [1, 18, 1, 0], //35 + [1, 6, 0, 1], //36 + [1, 8, 0, 1], //37 + [1, 10, 0, 1], //38 + [1, 12, 0, 1], //39 + [1, 14, 0, 1], //40 + [1, 18, 0, 1], //41 + [2, 8, 0, 0], //42 + [2, 10, 0, 0], //43 + [2, 12, 0, 0], //44 + [2, 14, 0, 0], //45 + [2, 18, 0, 0], //46 + [3, 8, 0, 0], //47 + [3, 10, 0, 0], //48 + [3, 12, 0, 0], //49 + [4, 12, 0, 0], //50 + [0, 9, 0, 0], //51 + [0, 9, 1, 0], //52 + [0, 9, 0, 1], //53 + [0, 9, 1, 1], //54 + [1, 9, 0, 0], //55 + [1, 9, 1, 0], //56 + [1, 9, 1, 1], //57 + [4, 10, 0, 0], //58 + [5, 10, 0, 0], //59 + [5, 12, 0, 0] //60 +]; + + +module.exports = {kColors, kFontFaces, kFontStyles}; \ No newline at end of file diff --git a/lib/pdffield.js b/lib/pdffield.js index a7a56477..be56c76f 100644 --- a/lib/pdffield.js +++ b/lib/pdffield.js @@ -1,53 +1,14 @@ -'use strict'; +const nodeUtil = require("util"), + PDFUnit = require("./pdfunit"); -let nodeUtil = require("util"), - _ = require("lodash"), - PDFUnit = require('./pdfunit.js'); +const kFBANotOverridable = 0x00000400; // indicates the field is read only by the user +const kFBARequired = 0x00000010; // indicates the field is required +const kMinHeight = 20; -let PDFField = (function 
PDFFieldClosure() { - 'use strict'; - // private static - let _nextId = 1; - let _name = 'PDFField'; - let _tabIndex = 0; +class PDFField { + static tabIndex = 0; - let kFBANotOverridable = 0x00000400; // indicates the field is read only by the user - let kFBARequired = 0x00000010; // indicates the field is required - let kMinHeight = 20; - - // constructor - let cls = function (field, viewport, Fields, Boxsets) { - // private - let _id = _nextId++; - - // public (every instance will have their own copy of these methods, needs to be lightweight) - this.get_id = function() { return _id; }; - this.get_name = function() { return _name + _id; }; - - this.field = field; - this.viewport = viewport; - this.Fields = Fields; - this.Boxsets = Boxsets; - }; - - // Normalize rectangle rect=[x1, y1, x2, y2] so that (x1,y1) < (x2,y2) - // For coordinate systems whose origin lies in the bottom-left, this - // means normalization to (BL,TR) ordering. For systems with origin in the - // top-left, this means (TL,BR) ordering. 
- let _normalizeRect = function(rect) { - let r = rect.slice(0); // clone rect - if (rect[0] > rect[2]) { - r[0] = rect[2]; - r[2] = rect[0]; - } - if (rect[1] > rect[3]) { - r[1] = rect[3]; - r[3] = rect[1]; - } - return r; - }; - - cls.isWidgetSupported = function(field) { + static isWidgetSupported(field) { let retVal = false; switch(field.fieldType) { @@ -71,23 +32,48 @@ let PDFField = (function PDFFieldClosure() { } return retVal; - }; + } - cls.isFormElement = function(field) { + static isFormElement(field) { let retVal = false; switch(field.subtype) { - case 'Widget': retVal = cls.isWidgetSupported(field); break; + case 'Widget': retVal = PDFField.isWidgetSupported(field); break; default: nodeUtil.p2jwarn("Unsupported: field.type of " + field.subtype); break; } return retVal; - }; + } + + // constructor + constructor(field, viewport, Fields, Boxsets) { + this.field = field; + this.viewport = viewport; + this.Fields = Fields; + this.Boxsets = Boxsets; + } + + // Normalize rectangle rect=[x1, y1, x2, y2] so that (x1,y1) < (x2,y2) + // For coordinate systems whose origin lies in the bottom-left, this + // means normalization to (BL,TR) ordering. For systems with origin in the + // top-left, this means (TL,BR) ordering. 
+ static #normalizeRect(rect) { + const r = rect.slice(0); // clone rect + if (rect[0] > rect[2]) { + r[0] = rect[2]; + r[2] = rect[0]; + } + if (rect[1] > rect[3]) { + r[1] = rect[3]; + r[3] = rect[1]; + } + return r; + } - let _getFieldPosition = function(field) { + #getFieldPosition(field) { let viewPort = this.viewport; let fieldRect = viewPort.convertToViewportRectangle(field.rect); - let rect = _normalizeRect(fieldRect); + let rect = PDFField.#normalizeRect(fieldRect); let height = rect[3] - rect[1]; if (field.fieldType === 'Tx') { @@ -108,9 +94,9 @@ let PDFField = (function PDFFieldClosure() { w: PDFUnit.toFormX(rect[2] - rect[0]), h: PDFUnit.toFormY(height) }; - }; + } - let _getFieldBaseData = function(field) { + #getFieldBaseData(field) { let attributeMask = 0; //PDF Spec p.676 TABLE 8.70 Field flags common to all field types if (field.fieldFlags & 0x00000001) { @@ -134,17 +120,17 @@ let PDFField = (function PDFFieldClosure() { anData.TM = field.alternativeID; } - return _.extend(anData, _getFieldPosition.call(this, field)); - }; + return Object.assign(anData, this.#getFieldPosition(field)); + } - let _addAlpha = function(field) { - let anData = _.extend({ + #addAlpha(field) { + const anData = Object.assign({ style: 48, T: { Name: field.TName || "alpha", TypeInfo: {} } - }, _getFieldBaseData.call(this, field)); + }, this.#getFieldBaseData(field)); if (field.MV) { //field attributes: arbitrary mask value anData.MV = field.MV; @@ -154,48 +140,45 @@ let PDFField = (function PDFFieldClosure() { } this.Fields.push(anData); - }; + } - let _addCheckBox = function(box) { - let anData = _.extend({ + #addCheckBox(box) { + const anData = Object.assign({ style: 48, T: { Name: "box", TypeInfo: {} } - }, _getFieldBaseData.call(this, box)); + }, this.#getFieldBaseData(box)); this.Boxsets.push({boxes:[anData]}); - }; + } - let _addRadioButton = function(box) { - let anData = _.extend({ + #addRadioButton(box) { + const anData = Object.assign({ style: 48, T: { Name: "box", 
TypeInfo: {} } - }, _getFieldBaseData.call(this, box)); + }, this.#getFieldBaseData(box)); anData.id.Id = box.value; - if (_.has(box, 'checked')) { + if ('checked' in box) { anData.checked = box.checked; } - let rdGroup = _.find(this.Boxsets, function(boxset) { - return _.has(boxset, 'id') && _.has(boxset.id, 'Id') && (boxset.id.Id === box.fullName); - }); - - if ((!!rdGroup) && (_.has(rdGroup, 'boxes'))) { + const rdGroup = this.Boxsets.filter(boxset => ('id' in boxset) && ('Id' in boxset.id) && (boxset.id.Id === box.fullName))[0]; + if ((!!rdGroup) && ('boxes' in rdGroup)) { rdGroup.boxes.push(anData); } else { this.Boxsets.push({boxes:[anData], id: { Id: box.fullName, EN: 0}}); } - }; + } - let _addLinkButton = function(field) { - let anData = _.extend({ + #addLinkButton(field) { + const anData = Object.assign({ style: 48, T: { Name: "link" @@ -203,73 +186,76 @@ let PDFField = (function PDFFieldClosure() { FL: { form: {Id: field.FL} } - }, _getFieldBaseData.call(this, field)); + }, this.#getFieldBaseData(field)); this.Fields.push(anData); - }; + } - let _addSelect = function(field) { - let anData = _.extend({ + #addSelect(field) { + const anData = Object.assign({ style: 48, T: { Name: "alpha", TypeInfo: {} } - }, _getFieldBaseData.call(this, field)); + }, this.#getFieldBaseData(field)); anData.w -= 0.5; //adjust combobox width anData.PL = {V: [], D: []}; - _.each(field.value, function(ele, idx) { - anData.PL.D.push(ele[0]); - anData.PL.V.push(ele[1]); + field.value.forEach( (ele, idx) => { + if (Array.isArray(ele)) { + anData.PL.D.push(ele[0]); + anData.PL.V.push(ele[1]); + } else { + anData.PL.D.push(ele); + anData.PL.V.push(ele); + } }); - + + // add field value to the object + if (field.fieldValue) { + anData.V = field.fieldValue; + } this.Fields.push(anData); }; - // public (every instance will share the same method, but has no access to private fields defined in constructor) - cls.prototype.processField = function () { - - this.field.TI = _tabIndex++; + // 
public instance methods + processField() { + this.field.TI = PDFField.tabIndex++; switch(this.field.fieldType) { - case 'Tx': _addAlpha.call(this, this.field); break; - case 'Cb': _addCheckBox.call(this, this.field); break; - case 'Rd': _addRadioButton.call(this, this.field);break; - case 'Btn':_addLinkButton.call(this, this.field); break; - case 'Ch': _addSelect.call(this, this.field); break; + case 'Tx': this.#addAlpha(this.field); break; + case 'Cb': this.#addCheckBox(this.field); break; + case 'Rd': this.#addRadioButton(this.field);break; + case 'Btn':this.#addLinkButton(this.field); break; + case 'Ch': this.#addSelect(this.field); break; } this.clean(); - }; - - cls.prototype.clean = function() { - delete this.get_id; - delete this.get_name; + } + clean() { delete this.field; delete this.viewport; delete this.Fields; delete this.Boxsets; - }; + } //static public method to generate fieldsType object based on parser result - cls.getAllFieldsTypes = function(data) { - - function isFieldReadOnly(field) { + static getAllFieldsTypes(data) { + const isFieldReadOnly = field => { return (field.AM & kFBANotOverridable) ? 
true : false; - } + }; - function getFieldBase(field) { + const getFieldBase = field => { return {id: field.id.Id, type: field.T.Name, calc: isFieldReadOnly(field), value: field.V || ""}; - } + }; let retVal = []; - - _.each(data.Pages, function(page) { - _.each(page.Boxsets, function(boxsets) { + data.Pages.forEach( page => { + page.Boxsets.forEach( boxsets => { if (boxsets.boxes.length > 1) { //radio button - _.each(boxsets.boxes, function(box) { + boxsets.boxes.forEach( box => { retVal.push({id: boxsets.id.Id, type: "radio", calc: isFieldReadOnly(box), value: box.id.Id}); }); } @@ -278,15 +264,12 @@ let PDFField = (function PDFFieldClosure() { } }); - _.each(page.Fields, function(field){ - retVal.push(getFieldBase(field)); - }); + page.Fields.forEach(field => retVal.push(getFieldBase(field))); + }); return retVal; - }; - - return cls; -})(); + } +} module.exports = PDFField; diff --git a/lib/pdffill.js b/lib/pdffill.js index b6e19ca1..f144aa26 100644 --- a/lib/pdffill.js +++ b/lib/pdffill.js @@ -1,51 +1,37 @@ -'use strict'; -let nodeUtil = require("util"), - _ = require("lodash"), - PDFUnit = require('./pdfunit.js'); - -let PDFFill = (function PFPLineClosure() { - 'use strict'; - // private static - let _nextId = 1; - let _name = 'PDFFill'; +const nodeUtil = require("util"), + PDFUnit = require("./pdfunit"); +class PDFFill{ // constructor - let cls = function (x, y, width, height, color) { - // private - let _id = _nextId++; - - // public (every instance will have their own copy of these methods, needs to be lightweight) - this.get_id = function() { return _id; }; - this.get_name = function() { return _name + _id; }; - + constructor(x, y, width, height, color) { this.x = x; this.y = y; this.width = width; this.height = height; this.color = color; - }; + } - // public (every instance will share the same method, but has no access to private fields defined in constructor) - cls.prototype.processFill = function (targetData) { - let clrId = 
PDFUnit.findColorIndex(this.color); + processFill(targetData) { + //MQZ.07/29/2013: when color is not in color dictionary, set the original color (oc) + const clrId = PDFUnit.findColorIndex(this.color); + const colorObj = (clrId > 0 && clrId < PDFUnit.colorCount()) ? {clr: clrId} : {oc: this.color}; - let oneFill = {x:PDFUnit.toFormX(this.x), + const oneFill = {x:PDFUnit.toFormX(this.x), y:PDFUnit.toFormY(this.y), w:PDFUnit.toFormX(this.width), h:PDFUnit.toFormY(this.height), - clr: clrId}; + ...colorObj}; - //MQZ.07/29/2013: when color is not in color dictionary, set the original color (oc) - if (clrId < 0) { - oneFill = _.extend({oc: this.color}, oneFill); + + if (oneFill.w < 2 && oneFill.h < 2) { + nodeUtil.p2jinfo("Skipped: tiny fill: " + oneFill.w + " x " + oneFill.h); + return; //skip short thick lines, like PA SPP lines behinds checkbox } targetData.Fills.push(oneFill); - }; - - return cls; -})(); + } +} module.exports = PDFFill; diff --git a/lib/pdffont.js b/lib/pdffont.js index 99591048..0a43a55e 100644 --- a/lib/pdffont.js +++ b/lib/pdffont.js @@ -1,127 +1,41 @@ -'use strict'; - -let nodeUtil = require("util"), - _ = require("lodash"), - PDFUnit = require('./pdfunit.js'); - -let PDFFont = (function PFPFontClosure() { - // private static - let _nextId = 1; - let _name = 'PDFFont'; - - let _boldSubNames = ["bd", "bold", "demi", "black"]; - let _stdFonts = ["arial", "helvetica", "sans-serif ", "courier ","monospace ", "ocr "]; - - let _kFontFaces = [ - "quicktype,arial,helvetica,sans-serif", // 00 - QuickType - sans-serif variable font - "quicktype condensed,arial narrow,arial,helvetica,sans-serif", // 01 - QuickType Condensed - thin sans-serif variable font - "quicktypepi,quicktypeiipi", // 02 - QuickType Pi - "quicktype mono,courier new,courier,monospace", // 03 - QuickType Mono - san-serif fixed font - "ocr-a,courier new,courier,monospace", // 04 - OCR-A - OCR readable san-serif fixed font - "ocr b mt,courier new,courier,monospace" // 05 - OCR-B MT - OCR 
readable san-serif fixed font - ]; - - let _kFontStyles = [ - // Face Size Bold Italic StyleID(Comment) - // ----- ---- ---- ----- ----------------- - [0, 6, 0, 0], //00 - [0, 8, 0, 0], //01 - [0, 10, 0, 0], //02 - [0, 12, 0, 0], //03 - [0, 14, 0, 0], //04 - [0, 18, 0, 0], //05 - [0, 6, 1, 0], //06 - [0, 8, 1, 0], //07 - [0, 10, 1, 0], //08 - [0, 12, 1, 0], //09 - [0, 14, 1, 0], //10 - [0, 18, 1, 0], //11 - [0, 6, 0, 1], //12 - [0, 8, 0, 1], //13 - [0, 10, 0, 1], //14 - [0, 12, 0, 1], //15 - [0, 14, 0, 1], //16 - [0, 18, 0, 1], //17 - [0, 6, 1, 1], //18 - [0, 8, 1, 1], //19 - [0, 10, 1, 1], //20 - [0, 12, 1, 1], //21 - [0, 14, 1, 1], //22 - [0, 18, 1, 1], //23 - [1, 6, 0, 0], //24 - [1, 8, 0, 0], //25 - [1, 10, 0, 0], //26 - [1, 12, 0, 0], //27 - [1, 14, 0, 0], //28 - [1, 18, 0, 0], //29 - [1, 6, 1, 0], //30 - [1, 8, 1, 0], //31 - [1, 10, 1, 0], //32 - [1, 12, 1, 0], //33 - [1, 14, 1, 0], //34 - [1, 18, 1, 0], //35 - [1, 6, 0, 1], //36 - [1, 8, 0, 1], //37 - [1, 10, 0, 1], //38 - [1, 12, 0, 1], //39 - [1, 14, 0, 1], //40 - [1, 18, 0, 1], //41 - [2, 8, 0, 0], //42 - [2, 10, 0, 0], //43 - [2, 12, 0, 0], //44 - [2, 14, 0, 0], //45 - [2, 18, 0, 0], //46 - [3, 8, 0, 0], //47 - [3, 10, 0, 0], //48 - [3, 12, 0, 0], //49 - [4, 12, 0, 0], //50 - [0, 9, 0, 0], //51 - [0, 9, 1, 0], //52 - [0, 9, 0, 1], //53 - [0, 9, 1, 1], //54 - [1, 9, 0, 0], //55 - [1, 9, 1, 0], //56 - [1, 9, 1, 1], //57 - [4, 10, 0, 0], //58 - [5, 10, 0, 0], //59 - [5, 12, 0, 0] //60 - ]; +const nodeUtil = require("util"), + PDFUnit = require("./pdfunit"), + {kFontFaces, kFontStyles} = require("./pdfconst"); +const _boldSubNames = ["bd", "bold", "demi", "black"]; +const _stdFonts = ["arial", "helvetica", "sans-serif ", "courier ","monospace ", "ocr "]; +const DISTANCE_DELTA = 0.1; - // constructor - let cls = function (fontObj) { - // private - let _id = _nextId++; - - // public (every instance will have their own copy of these methods, needs to be lightweight) - this.get_id = function() { return _id; }; - 
this.get_name = function() { return _name + _id; }; - - this.fontObj = fontObj; - let typeName = (fontObj.name || fontObj.fallbackName); +class PDFFont { + #initTypeName() { + let typeName = (this.fontObj.name || this.fontObj.fallbackName); if (!typeName) { - typeName = _kFontFaces[0]; //default font family name + typeName = kFontFaces[0]; //default font family name } typeName = typeName.toLowerCase(); - this.typeName = typeName; + return typeName; + } + + #initSubType() { + let subType = this.typeName; + let bold = false; - let subType = typeName; - let nameArray = typeName.split('+'); - if (_.isArray(nameArray) && nameArray.length > 1) { + let nameArray = this.typeName.split('+'); + if (Array.isArray(nameArray) && nameArray.length > 1) { subType = nameArray[1].split("-"); - if (_.isArray(subType) && subType.length > 1) { - if (!this.bold) { - let subName = subType[1].toLowerCase(); - this.bold = _boldSubNames.indexOf(subName) >= 0; - } + if (Array.isArray(subType) && subType.length > 1) { + let subName = subType[1].toLowerCase(); + bold = _boldSubNames.indexOf(subName) >= 0; subType = subType[0]; } } - this.subType = subType; + return {subType, bold}; + } - this.isSymbol = typeName.indexOf("symbol") > 0 || _kFontFaces[2].indexOf(this.subType) >= 0; + #initSymbol() { + let isSymbol = this.typeName.indexOf("symbol") > 0 || kFontFaces[2].indexOf(this.subType) >= 0; if (this.fontObj.isSymbolicFont) { - let mFonts = _stdFonts.filter( (oneName) => (typeName.indexOf(oneName) >= 0) ); + let mFonts = _stdFonts.filter( (oneName) => (this.typeName.indexOf(oneName) >= 0) ); if (mFonts.length > 0) { this.fontObj.isSymbolicFont = false; //lots of Arial-based font is detected as symbol in VA forms (301, 76-c, etc.) 
reset the flag for now @@ -129,32 +43,45 @@ let PDFFont = (function PFPFontClosure() { } } else { - if (this.isSymbol) { + if (isSymbol) { this.fontObj.isSymbolicFont = true; //text pdf: va_ind_760c nodeUtil.p2jinfo("Reset: isSymbolicFont (true) for " + this.fontObj.name); } - } + } + return isSymbol; + } + + #initSpaceWidth() { + let spaceWidth = this.fontObj.spaceWidth; + if (!spaceWidth) { + var spaceId = Array.isArray(this.fontObj.toFontChar) ? this.fontObj.toFontChar.indexOf(32) : -1; + spaceWidth = (spaceId >= 0 && Array.isArray(this.fontObj.widths)) ? this.fontObj.widths[spaceId] : 250; + } + spaceWidth = PDFUnit.toFormX(spaceWidth) / 32; + return spaceWidth; + } - this.fontSize = 1; + // constructor + constructor(fontObj) { + this.fontObj = fontObj; - this.faceIdx = 0; - this.bold = false; - this.italic = false; + this.typeName = this.#initTypeName(); - this.fontStyleId = -1; + const {subType, bold} = this.#initSubType(); + this.subType = subType; + this.bold = bold; - this.spaceWidth = fontObj.spaceWidth; - if (!this.spaceWidth) { - var spaceId = Array.isArray(fontObj.toFontChar) ? fontObj.toFontChar.indexOf(32) : -1; - this.spaceWidth = (spaceId >= 0 && Array.isArray(fontObj.widths)) ? 
fontObj.widths[spaceId] : 250; - } - this.spaceWidth = PDFUnit.toFormX(this.spaceWidth) / 32; - }; + this.isSymbol = this.#initSymbol(); + this.spaceWidth = this.#initSpaceWidth(); - // public static - /** sort text blocks by y then x */ - const DISTANCE_DELTA = 0.1; - cls.compareBlockPos = function(t1, t2) { + this.fontSize = 1; + this.faceIdx = 0; + this.italic = false; + this.fontStyleId = -1; + } + + /** sort text blocks by y then x */ + static compareBlockPos(t1, t2) { if (t1.y < t2.y - DISTANCE_DELTA) { return -1; } @@ -167,9 +94,9 @@ let PDFFont = (function PFPFontClosure() { } } return 1; - }; + } - cls.haveSameStyle = function(t1, t2) { + static haveSameStyle(t1, t2) { let retVal = t1.R[0].S === t2.R[0].S; if (retVal && t1.R[0].S < 0) { for (let i = 0; i < t1.R[0].TS.length; i++) { @@ -184,52 +111,61 @@ let PDFFont = (function PFPFontClosure() { } return retVal; - }; + } - cls.getSpaceThreshHold = function(t1) { + static getSpaceThreshHold(t1) { return (PDFFont.getFontSize(t1)/12) * t1.sw; - }; + } - cls.areAdjacentBlocks = function(t1, t2) { - let isInSameLine = Math.abs(t1.y - t2.y) <= DISTANCE_DELTA; - let isDistanceSmallerThanASpace = ((t2.x - t1.x - t1.w) < cls.getSpaceThreshHold(t1)); + static areAdjacentBlocks(t1, t2) { + const isInSameLine = Math.abs(t1.y - t2.y) <= DISTANCE_DELTA; + const isDistanceSmallerThanASpace = ((t2.x - t1.x - t1.w) < PDFFont.getSpaceThreshHold(t1)); return isInSameLine && isDistanceSmallerThanASpace; - }; + } - cls.getFontSize = function(textBlock) { - let sId = textBlock.R[0].S; - return (sId < 0) ? textBlock.R[0].TS[1] : _kFontStyles[sId][1]; - }; + static getFontSize(textBlock) { + const sId = textBlock.R[0].S; + return (sId < 0) ? 
textBlock.R[0].TS[1] : kFontStyles[sId][1]; + } - cls.areDuplicateBlocks = function(t1, t2) { - return t1.x == t2.x && t1.y == t2.y && t1.R[0].T == t2.R[0].T && cls.haveSameStyle(t1, t2); - }; + static areDuplicateBlocks(t1, t2) { + return t1.x == t2.x && t1.y == t2.y && t1.R[0].T == t2.R[0].T && PDFFont.haveSameStyle(t1, t2); + } // private - let _setFaceIndex = function() { - let fontObj = this.fontObj; + #setFaceIndex() { + const fontObj = this.fontObj; this.bold = fontObj.bold; if (!this.bold) { this.bold = this.typeName.indexOf("bold") >= 0 || this.typeName.indexOf("black") >= 0; } this.italic = fontObj.italic; // fix https://github.com/modesty/pdf2json/issues/42 + // Extended the fix for https://github.com/modesty/pdf2json/issues/42 + if (!this.italic) { + this.italic = this.typeName.indexOf("italic") >= 0 || this.typeName.indexOf("oblique") >= 0; + } + // Added detection of hybrid dual bolditalic fonts + if (((!this.bold) || (!this.italic)) && (this.typeName.indexOf("boldobl") >= 0)) { + this.bold = true; + this.italic = true; + } let typeName = this.subType; if (fontObj.isSerifFont) { - if (_kFontFaces[1].indexOf(typeName) >= 0) + if (kFontFaces[1].indexOf(typeName) >= 0) this.faceIdx = 1; } - else if (_kFontFaces[2].indexOf(this.subType) >= 0) { + else if (kFontFaces[2].indexOf(this.subType) >= 0) { this.faceIdx = 2; } else if (fontObj.isMonospace) { this.faceIdx = 3; - if (_kFontFaces[4].indexOf(typeName) >= 0) + if (kFontFaces[4].indexOf(typeName) >= 0) this.faceIdx = 4; - else if (_kFontFaces[5].indexOf(typeName) >= 0) + else if (kFontFaces[5].indexOf(typeName) >= 0) this.faceIdx = 5; } else if (fontObj.isSymbolicFont) { @@ -242,10 +178,10 @@ let PDFFont = (function PFPFontClosure() { } // nodeUtil.p2jinfo"typeName = " + typeName + " => faceIdx = " + this.faceIdx); - }; + } - let _getFontStyleIndex = function(fontSize) { - _setFaceIndex.call(this); + #getFontStyleIndex(fontSize) { + this.#setFaceIndex(); //MQZ Feb.28.2013. 
Adjust bold text fontsize to work around word spacing issue this.fontSize = (this.bold && (fontSize > 12)) ? fontSize + 1 : fontSize; @@ -253,7 +189,7 @@ let PDFFont = (function PFPFontClosure() { let fsa = [this.faceIdx, this.fontSize, this.bold?1:0, this.italic?1:0]; let retVal = -1; - _kFontStyles.forEach(function(element, index, list){ + kFontStyles.forEach(function(element, index, list){ if (retVal === -1) { if (element[0] === fsa[0] && element[1] === fsa[1] && element[2] === fsa[2] && element[3] === fsa[3]) { @@ -263,9 +199,9 @@ let PDFFont = (function PFPFontClosure() { }); return retVal; - }; + } - let _processSymbolicFont = function(str) { + #processSymbolicFont(str) { let retVal = str; if (!str || str.length !== 1) @@ -297,9 +233,9 @@ let PDFFont = (function PFPFontClosure() { } return retVal; - }; + } - let _textRotationAngle = function (matrix2D) { + #textRotationAngle(matrix2D) { let retVal = 0; if (matrix2D[0][0] === 0 && matrix2D[1][1] === 0) { if (matrix2D[0][1] != 0 && matrix2D[1][0] != 0) { @@ -315,50 +251,50 @@ let PDFFont = (function PFPFontClosure() { } } return retVal; - }; + } - // public (every instance will share the same method, but has no access to private fields defined in constructor) - cls.prototype.processText = function (p, str, maxWidth, color, fontSize, targetData, matrix2D) { - let text = _processSymbolicFont.call(this, str); + // public instance methods + processText(p, str, maxWidth, color, fontSize, targetData, matrix2D) { + let text = this.#processSymbolicFont(str); if (!text) { return; } - this.fontStyleId = _getFontStyleIndex.call(this, fontSize); + this.fontStyleId = this.#getFontStyleIndex(fontSize); // when this.fontStyleId === -1, it means the text style doesn't match any entry in the dictionary // adding TS to better describe text style [fontFaceId, fontSize, 1/0 for bold, 1/0 for italic]; - let TS = [this.faceIdx, this.fontSize, this.bold?1:0, this.italic?1:0]; - - let clrId = PDFUnit.findColorIndex(color); + const TS 
= [this.faceIdx, this.fontSize, this.bold?1:0, this.italic?1:0]; + + const clrId = PDFUnit.findColorIndex(color); + const colorObj = (clrId > 0 && clrId < PDFUnit.colorCount()) ? {clr: clrId} : {oc: this.color}; + + let textRun = { + T: this.flash_encode(text), + S: this.fontStyleId, + TS: TS + }; + const rAngle = this.#textRotationAngle(matrix2D); + if (rAngle != 0) { + nodeUtil.p2jinfo(str + ": rotated " + rAngle + " degree."); + textRun = {...textRun, RA: rAngle}; + } let oneText = {x: PDFUnit.toFormX(p.x) - 0.25, y: PDFUnit.toFormY(p.y) - 0.75, w: PDFUnit.toFixedFloat(maxWidth), sw: this.spaceWidth, //font space width, use to merge adjacent text blocks - clr: clrId, A: "left", - R: [{ - T: this.flash_encode(text), - S: this.fontStyleId, - TS: TS - }] + R: [textRun] }; //MQZ.07/29/2013: when color is not in color dictionary, set the original color (oc) - if (clrId < 0) { - oneText = _.extend({oc: color}, oneText); - } + oneText = {...oneText, ...colorObj}; - let rAngle = _textRotationAngle.call(this, matrix2D); - if (rAngle != 0) { - nodeUtil.p2jinfo(str + ": rotated " + rAngle + " degree."); - _.extend(oneText.R[0], {RA: rAngle}); - } targetData.Texts.push(oneText); - }; + } - cls.prototype.flash_encode = function(str) { + flash_encode(str) { let retVal = encodeURIComponent(str); retVal = retVal.replace("%C2%96", "-"); retVal = retVal.replace("%C2%91", "%27"); @@ -371,15 +307,12 @@ let PDFFont = (function PFPFontClosure() { retVal = retVal.replace("%C2%9B", "%C2%BB"); return retVal; - }; + } - cls.prototype.clean = function() { + clean() { this.fontObj = null; delete this.fontObj; - }; - - return cls; -})(); + } +} module.exports = PDFFont; - diff --git a/lib/pdfimage.js b/lib/pdfimage.js index 317866a8..56b9a549 100644 --- a/lib/pdfimage.js +++ b/lib/pdfimage.js @@ -1,38 +1,35 @@ -'use strict'; -////////////////////////////////start of fake image -let PDFImage = (function() { - 'use strict'; - let _src = ''; - let _onload = null; +class PDFImage { + #_src = 
''; + #_onload = null; - this.__defineSetter__("onload", function(val) { - _onload = val; - }); + set onload(val) { + this.#_onload = typeof val === 'function' ? val : null; + } - this.__defineGetter__("onload", function() { - return _onload; - }); + get onload() { + return this.#_onload; + } - this.__defineSetter__("src", function(val) { - _src = val; - if (_onload) _onload(); - }); + set src(val) { + this.#_src = val; + if (this.#_onload) this.#_onload(); + } - this.__defineGetter__("src", function() { - return _src; - }); + get src() { + return this.#_src; + } - this.btoa = function(val) { + btoa(val) { if (typeof window === 'undefined') { - return (new Buffer(val, 'ascii')).toString('base64'); + return (new Buffer.from(val, 'ascii')).toString('base64'); } else if (typeof window.btoa === 'function') return window.btoa(val); return ""; - }; + } -}); +} module.exports = PDFImage; diff --git a/lib/pdfline.js b/lib/pdfline.js index c119bd6e..7278d7e2 100644 --- a/lib/pdfline.js +++ b/lib/pdfline.js @@ -1,23 +1,8 @@ -'use strict'; -let nodeUtil = require("util"), - _ = require("lodash"), - PDFUnit = require('./pdfunit.js'); - -let PDFLine = (function PFPLineClosure() { - 'use strict'; - // private static - let _nextId = 1; - let _name = 'PDFLine'; - - // constructor - let cls = function (x1, y1, x2, y2, lineWidth, color, dashed) { - // private - let _id = _nextId++; - - // public (every instance will have their own copy of these methods, needs to be lightweight) - this.get_id = function() { return _id; }; - this.get_name = function() { return _name + _id; }; +const nodeUtil = require("util"), + PDFUnit = require("./pdfunit"); +class PDFLine { + constructor(x1, y1, x2, y2, lineWidth, color, dashed) { this.x1 = x1; this.y1 = y1; this.x2 = x2; @@ -25,33 +10,28 @@ let PDFLine = (function PFPLineClosure() { this.lineWidth = lineWidth || 1.0; this.color = color; this.dashed = dashed; - }; + } - let _setStartPoint = function(oneLine, x, y) { + #setStartPoint(oneLine, x, y) 
{ oneLine.x = PDFUnit.toFormX(x); oneLine.y = PDFUnit.toFormY(y); - }; + } - // public (every instance will share the same method, but has no access to private fields defined in constructor) - cls.prototype.processLine = function (targetData) { - let xDelta = Math.abs(this.x2 - this.x1); - let yDelta = Math.abs(this.y2 - this.y1); - let minDelta = this.lineWidth; + processLine(targetData) { + const xDelta = Math.abs(this.x2 - this.x1); + const yDelta = Math.abs(this.y2 - this.y1); + const minDelta = this.lineWidth; let oneLine = {x:0, y:0, w: PDFUnit.toFixedFloat(this.lineWidth), l:0}; //MQZ Aug.28.2013, adding color support, using color dictionary and default to black - let clrId = PDFUnit.findColorIndex(this.color); - if (clrId < 0) { - oneLine = _.extend({oc: this.color}, oneLine); - } - else if (clrId > 0 && clrId < (PDFUnit.colorCount() - 1)) { - oneLine = _.extend({clr: clrId}, oneLine); - } + const clrId = PDFUnit.findColorIndex(this.color); + const colorObj = (clrId > 0 && clrId < PDFUnit.colorCount()) ? 
{clr: clrId} : {oc: this.color}; + oneLine = {...oneLine, ...colorObj}; //MQZ Aug.29 dashed line support if (this.dashed) { - oneLine = _.extend({dsh: 1}, oneLine); + oneLine = oneLine = {...oneLine, dsh: 1}; } if ((yDelta < this.lineWidth) && (xDelta > minDelta)) { //HLine @@ -62,9 +42,9 @@ let PDFLine = (function PFPLineClosure() { oneLine.l = PDFUnit.toFormX(xDelta); if (this.x1 > this.x2) - _setStartPoint.call(this, oneLine, this.x2, this.y2); + this.#setStartPoint(oneLine, this.x2, this.y2); else - _setStartPoint.call(this, oneLine, this.x1, this.y1); + this.#setStartPoint(oneLine, this.x1, this.y1); targetData.HLines.push(oneLine); } else if ((xDelta < this.lineWidth) && (yDelta > minDelta)) {//VLine @@ -75,15 +55,13 @@ let PDFLine = (function PFPLineClosure() { oneLine.l = PDFUnit.toFormY(yDelta); if (this.y1 > this.y2) - _setStartPoint.call(this, oneLine, this.x2, this.y2); + this.#setStartPoint(oneLine, this.x2, this.y2); else - _setStartPoint.call(this, oneLine, this.x1, this.y1); + this.#setStartPoint(oneLine, this.x1, this.y1); targetData.VLines.push(oneLine); } - }; - - return cls; -})(); + } +} module.exports = PDFLine; diff --git a/lib/pdfunit.js b/lib/pdfunit.js index 10890455..2b198f60 100644 --- a/lib/pdfunit.js +++ b/lib/pdfunit.js @@ -1,116 +1,57 @@ -'use strict'; -let nodeUtil = require("util"); - -let PDFUnit = (function PFPUnitClosure() { - 'use strict'; - // private static - let _nextId = 1; - let _name = 'PDFUnit'; - - let dpi = 96.0; - let gridXPerInch = 4.0; - let gridYPerInch = 4.0; - - let _pixelXPerGrid = dpi/gridXPerInch; - let _pixelYPerGrid = dpi/gridYPerInch; - let _pixelPerPoint = dpi/72; - - let kColors = [ - '#000000', // 0 - '#ffffff', // 1 - '#4c4c4c', // 2 - '#808080', // 3 - '#999999', // 4 - '#c0c0c0', // 5 - '#cccccc', // 6 - '#e5e5e5', // 7 - '#f2f2f2', // 8 - '#008000', // 9 - '#00ff00', // 10 - '#bfffa0', // 11 - '#ffd629', // 12 - '#ff99cc', // 13 - '#004080', // 14 - '#9fc0e1', // 15 - '#5580ff', // 16 - '#a9c9fa', // 
17 - '#ff0080', // 18 - '#800080', // 19 - '#ffbfff', // 20 - '#e45b21', // 21 - '#ffbfaa', // 22 - '#008080', // 23 - '#ff0000', // 24 - '#fdc59f', // 25 - '#808000', // 26 - '#bfbf00', // 27 - '#824100', // 28 - '#007256', // 29 - '#008000', // 30 - '#000080', // Last + 1 - '#008080', // Last + 2 - '#800080', // Last + 3 - '#ff0000', // Last + 4 - '#0000ff', // Last + 5 - '#008000', // Last + 6 - '#000000' // Last + 7 - ]; - - // constructor - let cls = function () { - // private - let _id = _nextId++; - - // public (every instance will have their own copy of these methods, needs to be lightweight) - this.get_id = function() { return _id; }; - this.get_name = function() { return _name + _id; }; - }; - - cls.toFixedFloat = function(fNum) { +const {kColors} = require("./pdfconst"); + +const dpi = 96.0; +const gridXPerInch = 4.0; +const gridYPerInch = 4.0; + +const _pixelXPerGrid = dpi/gridXPerInch; +const _pixelYPerGrid = dpi/gridYPerInch; +const _pixelPerPoint = dpi/72; + +class PDFUnit { + static toFixedFloat(fNum) { return parseFloat(fNum.toFixed(3)); - }; + } - cls.colorCount = function() { + static colorCount() { return kColors.length; - }; + } - cls.toPixelX = function(formX) { + static toPixelX(formX) { return Math.round(formX * _pixelXPerGrid); - }; + } - cls.toPixelY = function(formY) { + static toPixelY(formY) { return Math.round(formY * _pixelYPerGrid); - }; + } - cls.pointToPixel = function(point) {// Point unit (1/72 an inch) to pixel units + static pointToPixel(point) {// Point unit (1/72 an inch) to pixel units return point * _pixelPerPoint; - }; + } - cls.getColorByIndex = function(clrId) { + static getColorByIndex(clrId) { return kColors[clrId]; - }; + } - cls.toFormPoint = function(viewportX, viewportY) { + static toFormPoint(viewportX, viewportY) { return [(viewportX / _pixelXPerGrid), (viewportY / _pixelYPerGrid)]; - }; + } - cls.toFormX = function(viewportX) { - return cls.toFixedFloat(viewportX / _pixelXPerGrid); - }; + static 
toFormX(viewportX) { + return PDFUnit.toFixedFloat(viewportX / _pixelXPerGrid); + } - cls.toFormY = function(viewportY) { - return cls.toFixedFloat(viewportY / _pixelYPerGrid); - }; + static toFormY(viewportY) { + return PDFUnit.toFixedFloat(viewportY / _pixelYPerGrid); + } - cls.findColorIndex = function(color) { + static findColorIndex(color) { if (color.length === 4) color += "000"; //MQZ. 07/29/2013: if color is not in dictionary, just return -1. The caller (pdffont, pdffill) will set the actual color return kColors.indexOf(color); - }; - - return cls; -})(); + } +} module.exports = PDFUnit; diff --git a/lib/ptixmlinject.js b/lib/ptixmlinject.js index e2977f96..3e49a289 100644 --- a/lib/ptixmlinject.js +++ b/lib/ptixmlinject.js @@ -1,39 +1,26 @@ -'use strict'; +const fs = require("fs"), + DOMParser = require("@xmldom/xmldom").DOMParser; -var nodeUtil = require("util"), -nodeEvents = require("events"), -fs = require('fs'), -_ = require('lodash'), -DOMParser = require('xmldom').DOMParser, -PDFCanvas = require('./pdfcanvas.js'), -PDFUnit = require('./pdfunit.js'), -PDFField = require('./pdffield.js'), -PDFAnno = require('./pdfanno.js'), -Image = require('./pdfimage.js'), -pkInfo = require('../package.json'); - -var xmlData; - -var PTIXmlParser = (function () { - 'use strict'; - - var ptiPageArray = []; +class PTIXmlParser { + xmlData = null; + ptiPageArray = []; // constructor - var cls = function () { - }; + constructor() { + this.xmlData = null; + this.ptiPageArray = []; + } - cls.prototype.parseXml = function (filePath,callback) { - - fs.readFile(filePath, 'utf8', function (err,data) { + parseXml(filePath, callback) { + fs.readFile(filePath, 'utf8', (err, data) => { if (err) { callback(err); } else { - xmlData = data; + this.xmlData = data; var parser = new DOMParser(); - var dom = parser.parseFromString(xmlData); + var dom = parser.parseFromString(this.xmlData); var root = dom.documentElement; var xmlFields = root.getElementsByTagName("field"); @@ -73,19 
+60,18 @@ var PTIXmlParser = (function () { fields.push(item); - ptiPageArray[parseInt(page)]=fields; + this.ptiPageArray[parseInt(page)]=fields; } } callback(); }); - }; + } - cls.prototype.getFields = function(pageNum) { - return ptiPageArray[pageNum]; - }; - return cls; -})(); + getFields(pageNum) { + return this.ptiPageArray[pageNum]; + } +} module.exports = PTIXmlParser; diff --git a/package-lock.json b/package-lock.json new file mode 100644 index 00000000..0024257a --- /dev/null +++ b/package-lock.json @@ -0,0 +1,13 @@ +{ + "name": "pdf2json", + "version": "2.0.0", + "lockfileVersion": 1, + "requires": true, + "dependencies": { + "@xmldom/xmldom": { + "version": "0.7.5", + "resolved": "https://registry.npmjs.org/@xmldom/xmldom/-/xmldom-0.7.5.tgz", + "integrity": "sha512-V3BIhmY36fXZ1OtVcI9W+FxQqxVLsPKcNjWigIaa81dLC9IolJl5Mt4Cvhmr0flUnjSpTdrbMTSbXqYqV5dT6A==" + } + } +} diff --git a/package.json b/package.json index 291424c6..c0f7c380 100644 --- a/package.json +++ b/package.json @@ -1,7 +1,7 @@ { "name": "pdf2json", - "version": "1.1.8", - "description": "A PDF file parser that converts PDF binaries to text based JSON, powered by porting a fork of PDF.JS to Node.js", + "version": "2.0.0", + "description": "PDF file parser that converts PDF binaries to text based JSON, powered by porting a fork of PDF.JS to Node.js", "keywords": [ "pdf", "pdf parser", @@ -28,27 +28,31 @@ "main": "./pdfparser.js", "scripts": { "test": "cd ./test && sh p2j.forms.sh", - "test-misc": "node pdf2json.js -f ./test/pdf/misc/ -o ./test/target/misc/ -c -m" + "test-misc": "cd ./test && sh p2j.one.sh misc . 
\"Expected: 5 success, 2 exception with stack trace\" ", + "parse": "node --trace-deprecation --trace-warnings pdf2json.js -f ./test/pdf/fd/form/F1040.pdf -o ./test/target/fd/form", + "parse-s": "node --trace-deprecation --trace-warnings pdf2json.js -f ./test/pdf/fd/form/F1040.pdf -o ./test/target/fd/form -s", + "parse-t": "node --trace-deprecation --trace-warnings pdf2json.js -f ./test/pdf/fd/form/F1040.pdf -o ./test/target/fd/form -s -t", + "parse-c": "node --trace-deprecation --trace-warnings pdf2json.js -f ./test/pdf/fd/form/F1040.pdf -o ./test/target/fd/form -s -t -c", + "parse-m": "node --trace-deprecation --trace-warnings pdf2json.js -f ./test/pdf/fd/form/F1040.pdf -o ./test/target/fd/form -s -t -c -m", + "parse-r": "node --trace-deprecation --trace-warnings pdf2json.js -f ./test/pdf/fd/form -o ./test/target/fd/form -t -c -m -r", + "parse-242": "node --trace-deprecation --trace-warnings pdf2json.js -f ./test/pdf/misc/i242_testingWithTable.pdf -o ./test/target/misc", + "parse-e": "node --trace-deprecation --trace-warnings pdf2json.js -f ./test/pdf/misc/i43_encrypted.pdf -o ./test/target/misc", + "parse-e2": "node --trace-deprecation --trace-warnings pdf2json.js -f ./test/pdf/misc/i243_problem_file_anon.pdf -o ./test/target/misc", + "parse-e3": "node --trace-deprecation --trace-warnings pdf2json.js -f ./test/pdf/misc/i200_test.pdf -o ./test/target/misc" }, "engines": { - "node": ">=4.5" + "node": ">=14.18.0", + "npm": ">=6.14.15" }, "bin": { "pdf2json": "./bin/pdf2json" }, "dependencies": { - "xmldom": "^0.1.22", - "lodash": "^4.15.0", - "optimist": "^0.6.1", - "async": "^2.0.1" - }, - "devDependencies": { + "@xmldom/xmldom": "^0.7.5" }, + "devDependencies": {}, "bundledDependencies": [ - "xmldom", - "lodash", - "optimist", - "async" + "@xmldom/xmldom" ], "maintainers": [ { @@ -61,11 +65,6 @@ "bugs": { "url": "http://github.com/modesty/pdf2json/issues" }, - "licenses": [ - { - "type": "Apache v2", - "url": 
"https://github.com/modesty/pdf2json/blob/master/license.txt" - } - ], + "license": "Apache-2.0", "readme": "https://github.com/modesty/pdf2json/blob/master/readme.md" } diff --git a/pdf2json.js b/pdf2json.js index 540608ca..19939fbf 100644 --- a/pdf2json.js +++ b/pdf2json.js @@ -1,4 +1,2 @@ -'use strict'; - -var P2JCMD = require('./lib/p2jcmd'); +const P2JCMD = require('./lib/p2jcmd'); new P2JCMD().start(); diff --git a/pdfparser.js b/pdfparser.js index e0e84b08..340be777 100644 --- a/pdfparser.js +++ b/pdfparser.js @@ -1,183 +1,167 @@ -'use strict'; - -let fs = require('fs'), - stream = require('stream'), - nodeUtil = require("util"), - _ = require("lodash"), - async = require("async"), - PDFJS = require("./lib/pdf.js"); +const fs = require("fs"), + { readFile } = require("fs/promises"), + {EventEmitter} = require("events"), + nodeUtil = require("util"), + PDFJS = require("./lib/pdf"), + {ParserStream} = require("./lib/parserstream"), + {kColors, kFontFaces, kFontStyles} = require("./lib/pdfconst"); + + +class PDFParser extends EventEmitter { // inherit from event emitter + //public static + static get colorDict() {return kColors; } + static get fontFaceDict() { return kFontFaces; } + static get fontStyleDict() { return kFontStyles; } + + //private static + static #maxBinBufferCount = 10; + static #binBuffer = {}; + + //private + #password = ""; + + #context = null; // service context object, only used in Web Service project; null in command line + + #pdfFilePath = null; //current PDF file to load and parse, null means loading/parsing not started + #pdfFileMTime = null; // last time the current pdf was modified, used to recognize changes and ignore cache + #data = null; //if file read success, data is PDF content; if failed, data is "err" object + #PDFJS = null; //will be initialized in constructor + #processFieldInfoXML = false;//disable additional _fieldInfo.xml parsing and merging (do NOT set to true) + + // constructor + constructor(context, needRawText, 
password) { + //call constructor for super class + super(); + + // private + // service context object, only used in Web Service project; null in command line + this.#context = context; -let PDFParser = (function () { - // private static - let _nextId = 1; - let _name = 'PDFParser'; + this.#pdfFilePath = null; //current PDF file to load and parse, null means loading/parsing not started + this.#pdfFileMTime = null; // last time the current pdf was modified, used to recognize changes and ignore cache + this.#data = null; //if file read success, data is PDF content; if failed, data is "err" object + this.#processFieldInfoXML = false;//disable additional _fieldInfo.xml parsing and merging (do NOT set to true) - let _binBuffer = {}; - let _maxBinBufferCount = 10; + this.#PDFJS = new PDFJS(needRawText); + this.#password = password; + } + + //public getter + get data() { return this.#data; } + get binBufferKey() { return this.#pdfFilePath + this.#pdfFileMTime; } //private methods, needs to invoked by [funcName].call(this, ...) 
- let _onPDFJSParseDataReady = function(data) { + #onPDFJSParseDataReady(data) { if (!data) { //v1.1.2: data===null means end of parsed data nodeUtil.p2jinfo("PDF parsing completed."); - let output = {"formImage": this.data}; - this.emit("pdfParser_dataReady", output); - if (typeof this.flushCallback === 'function') { - this.push(output); - this.flushCallback(); - this.flushCallback = null; - } + this.emit("pdfParser_dataReady", this.#data); } else { - Object.assign(this.data, data); + this.#data = {...this.#data, ...data}; } - }; + } - let _onPDFJSParserDataError = function(data) { - this.data = null; - this.emit("pdfParser_dataError", {"parserError": data}); - }; + #onPDFJSParserDataError(err) { + this.#data = null; + this.emit("pdfParser_dataError", {"parserError": err}); + // this.emit("error", err); + } + + #startParsingPDF(buffer) { + this.#data = {}; - let _startParsingPDF = function(buffer) { - this.data = {}; + this.#PDFJS.on("pdfjs_parseDataReady", this.#onPDFJSParseDataReady.bind(this)); + this.#PDFJS.on("pdfjs_parseDataError", this.#onPDFJSParserDataError.bind(this)); - this.PDFJS.on("pdfjs_parseDataReady", _onPDFJSParseDataReady.bind(this)); - this.PDFJS.on("pdfjs_parseDataError", _onPDFJSParserDataError.bind(this)); + //v1.3.0 the following Readable Stream-like events are replacement for the top two custom events + this.#PDFJS.on("readable", meta => this.emit("readable", meta)); + this.#PDFJS.on("data", data => this.emit("data", data)); + this.#PDFJS.on("error", this.#onPDFJSParserDataError.bind(this)); - this.PDFJS.parsePDFData(buffer || _binBuffer[this.pdfFilePath]); - }; + this.#PDFJS.parsePDFData(buffer || PDFParser.#binBuffer[this.binBufferKey], this.#password); + } - let _processBinaryCache = function() { - if (_.has(_binBuffer, this.pdfFilePath)) { - _startParsingPDF.call(this); + #processBinaryCache() { + if (this.binBufferKey in PDFParser.#binBuffer) { + this.#startParsingPDF(); return true; } - let allKeys = _.keys(_binBuffer); - if 
(allKeys.length > _maxBinBufferCount) { - let idx = this.get_id() % _maxBinBufferCount; - let key = allKeys[idx]; - _binBuffer[key] = null; - delete _binBuffer[key]; + const allKeys = Object.keys(PDFParser.#binBuffer); + if (allKeys.length > PDFParser.#maxBinBufferCount) { + const idx = this.id % PDFParser.#maxBinBufferCount; + const key = allKeys[idx]; + PDFParser.#binBuffer[key] = null; + delete PDFParser.#binBuffer[key]; nodeUtil.p2jinfo("re-cycled cache for " + key); } return false; - }; - - let _processPDFContent = function(err, data) { - nodeUtil.p2jinfo("Load PDF file status:" + (!!err ? "Error!" : "Success!") ); - if (err) { - this.data = err; - this.emit("pdfParser_dataError", this); - } - else { - _binBuffer[this.pdfFilePath] = data; - _startParsingPDF.call(this); - } - }; - - let _createContentStream = function(jsonObj) { - let rStream = new stream.Readable({objectMode: true}); - rStream.push(jsonObj); - rStream.push(null); - return rStream; - }; - - // constructor - function PdfParser(context, needRawText) { - //call constructor for super class - stream.Transform.call(this, {objectMode: true, bufferSize: 64 * 1024}); - - // private - let _id = _nextId++; - - // public (every instance will have their own copy of these methods, needs to be lightweight) - this.get_id = () => _id; - this.get_name = () => _name + _id; - - // service context object, only used in Web Service project; null in command line - this.context = context; - - this.pdfFilePath = null; //current PDF file to load and parse, null means loading/parsing not started - this.data = null; //if file read success, data is PDF content; if failed, data is "err" object - this.PDFJS = new PDFJS(needRawText); - this.processFieldInfoXML = false;//disable additional _fieldInfo.xml parsing and merging - - this.chunks = []; - this.flushCallback = null; } - // inherit from event emitter - nodeUtil.inherits(PdfParser, stream.Transform); - - //implements transform stream - PdfParser.prototype._transform = 
function (chunk, enc, callback) { - this.chunks.push(Buffer.isBuffer(chunk) ? chunk : new Buffer(chunk, enc)); - callback(); - }; - - PdfParser.prototype._flush = function(callback) { - this.flushCallback = callback; - this.parseBuffer(Buffer.concat(this.chunks)); - }; - - PdfParser.prototype.fq = async.queue( (task, callback) => { - fs.readFile(task.path, callback); - }, 100); //public APIs - PdfParser.prototype.setVerbosity = function(verbosity) { - nodeUtil.verbosity(verbosity || 0); - }; + createParserStream() { + return new ParserStream(this, {objectMode: true, bufferSize: 64 * 1024}); + } - PdfParser.prototype.loadPDF = function(pdfFilePath, verbosity) { - this.setVerbosity(verbosity); + async loadPDF(pdfFilePath, verbosity) { + nodeUtil.verbosity(verbosity || 0); nodeUtil.p2jinfo("about to load PDF file " + pdfFilePath); - this.pdfFilePath = pdfFilePath; - if (this.processFieldInfoXML) { - this.PDFJS.tryLoadFieldInfoXML(pdfFilePath); - } - - if (_processBinaryCache.call(this)) - return; - - this.fq.push({path: pdfFilePath}, _processPDFContent.bind(this)); - }; + this.#pdfFilePath = pdfFilePath; + + try { + this.#pdfFileMTime = fs.statSync(pdfFilePath).mtimeMs; + if (this.#processFieldInfoXML) { + this.#PDFJS.tryLoadFieldInfoXML(pdfFilePath); + } + + if (this.#processBinaryCache()) + return; + + PDFParser.#binBuffer[this.binBufferKey] = await readFile(pdfFilePath); + nodeUtil.p2jinfo(`Load OK: ${pdfFilePath}`); + this.#startParsingPDF(); + } + catch(err) { + nodeUtil.p2jerror(`Load Failed: ${pdfFilePath} - ${err}`); + this.emit("pdfParser_dataError", err); + } + } // Introduce a way to directly process buffers without the need to write it to a temporary file - PdfParser.prototype.parseBuffer = function(pdfBuffer) { - _startParsingPDF.call(this, pdfBuffer); - }; + parseBuffer(pdfBuffer) { + this.#startParsingPDF(pdfBuffer); + } - PdfParser.prototype.getRawTextContent = function() { return this.PDFJS.getRawTextContent(); }; - 
PdfParser.prototype.getRawTextContentStream = function() { return _createContentStream(this.getRawTextContent()); }; + getRawTextContent() { return this.#PDFJS.getRawTextContent(); } + getRawTextContentStream() { return ParserStream.createContentStream(this.getRawTextContent()); } - PdfParser.prototype.getAllFieldsTypes = function() { return this.PDFJS.getAllFieldsTypes(); }; - PdfParser.prototype.getAllFieldsTypesStream = function() { return _createContentStream(this.getAllFieldsTypes()); }; + getAllFieldsTypes() { return this.#PDFJS.getAllFieldsTypes(); }; + getAllFieldsTypesStream() { return ParserStream.createContentStream(this.getAllFieldsTypes()); } - PdfParser.prototype.getMergedTextBlocksIfNeeded = function() { return {"formImage": this.PDFJS.getMergedTextBlocksIfNeeded()}; }; - PdfParser.prototype.getMergedTextBlocksStream = function() { return _createContentStream(this.getMergedTextBlocksIfNeeded()); }; + getMergedTextBlocksIfNeeded() { return this.#PDFJS.getMergedTextBlocksIfNeeded(); } + getMergedTextBlocksStream() { return ParserStream.createContentStream(this.getMergedTextBlocksIfNeeded()) } - PdfParser.prototype.destroy = function() { - this.removeAllListeners(); + destroy() { // invoked with stream transform process + super.removeAllListeners(); //context object will be set in Web Service project, but not in command line utility - if (this.context) { - this.context.destroy(); - this.context = null; + if (this.#context) { + this.#context.destroy(); + this.#context = null; } - this.pdfFilePath = null; - this.data = null; - this.chunks = null; - - this.PDFJS.destroy(); - this.PDFJS = null; - }; + this.#pdfFilePath = null; + this.#pdfFileMTime = null; + this.#data = null; + this.#processFieldInfoXML = false;//disable additional _fieldInfo.xml parsing and merging (do NOT set to true) - return PdfParser; -})(); + this.#PDFJS.destroy(); + this.#PDFJS = null; + } +} module.exports = PDFParser; diff --git a/readme.md b/readme.md index d68c99c6..d65e776c 
100644 --- a/readme.md +++ b/readme.md @@ -17,15 +17,43 @@ To update with latest version: To Run in RESTful Web Service or as Commandline Utility * More details can be found at the bottom of this document. +## Test + +After install, run command line: + +> npm run test + +It'll scan and parse *260* PDF AcroForm files under *_./test/pdf_*, runs with *_-s -t -c -m_* command line options, generates primary output JSON, additional text content JSON, form fields JSON and merged text JSON file for each PDF. It usually takes ~20s in my MacBook Pro to complete, check *_./test/target/_* for outputs. + +### Test Exception Handlings + +After install, run command line: + +> npm run test-misc + +It'll scan and parse all PDF files under *_./test/pdf/misc_*, also runs with *_-s -t -c -m_* command line options, generates primary output JSON, additional text content JSON, form fields JSON and merged text JSON file for 5 PDF files, while catching exceptions with stack trace for: + * _bad XRef entry_ for `pdf/misc/i200_test.pdf` + * _unsupported encryption algorithm_ for `pdf/misc/i43_encrypted.pdf` + * _Invalid XRef stream header_ for `pdf/misc/i243_problem_file_anon.pdf` + +### Test Streams +After install, run command line: + +> npm run parse-r + +It scans 165 PDF files under *_./test/pdf/fd/form_*, parses with [Stream API](https://nodejs.org/dist/latest-v14.x/docs/api/stream.html), then generates output to *_./test/target/fd/form_*. + +More test scripts with different commandline options can be found at *_package.json_*.
+ ## Code Example * Parse a PDF file then write to a JSON file: ````javascript - let fs = require('fs'), + const fs = require('fs'), PDFParser = require("pdf2json"); - let pdfParser = new PDFParser(); + const pdfParser = new PDFParser(); pdfParser.on("pdfParser_dataError", errData => console.error(errData.parserError) ); pdfParser.on("pdfParser_dataReady", pdfData => { @@ -45,17 +73,25 @@ Or, call directly with buffer: }) ```` +Or, use more granular page level parsing events (v2.0.0) + +````javascript + pdfParser.on("readable", meta => console.log("PDF Metadata", meta) ); + pdfParser.on("data", page => console.log(page ? "One page parsed" : "All pages parsed", page)); + pdfParser.on("error", err => console.error("Parser Error", err)); +```` + * Parse a PDF then write a .txt file (which only contains textual content of the PDF) ````javascript - let fs = require('fs'), + const fs = require('fs'), PDFParser = require("pdf2json"); - let pdfParser = new PDFParser(this,1); + const pdfParser = new PDFParser(this,1); pdfParser.on("pdfParser_dataError", errData => console.error(errData.parserError) ); pdfParser.on("pdfParser_dataReady", pdfData => { - fs.writeFile("./pdf2json/test/F1040EZ.content.txt", pdfParser.getRawTextContent()); + fs.writeFile("./pdf2json/test/F1040EZ.content.txt", pdfParser.getRawTextContent(), ()=>{console.log("Done.");}); }); pdfParser.loadPDF("./pdf2json/test/pdf/fd/form/F1040EZ.pdf"); @@ -64,14 +100,14 @@ Or, call directly with buffer: * Parse a PDF then write a fields.json file that only contains interactive forms' fields information: ````javascript - let fs = require('fs'), + const fs = require('fs'), PDFParser = require("pdf2json"); - let pdfParser = new PDFParser(); + const pdfParser = new PDFParser(); pdfParser.on("pdfParser_dataError", errData => console.error(errData.parserError) ); pdfParser.on("pdfParser_dataReady", pdfData => { - fs.writeFile("./pdf2json/test/F1040EZ.fields.json", JSON.stringify(pdfParser.getAllFieldsTypes())); +
fs.writeFile("./pdf2json/test/F1040EZ.fields.json", JSON.stringify(pdfParser.getAllFieldsTypes()), ()=>{console.log("Done.");}); }); pdfParser.loadPDF("./pdf2json/test/pdf/fd/form/F1040EZ.pdf"); @@ -80,14 +116,37 @@ Or, call directly with buffer: Alternatively, you can pipe input and output streams: (requires v1.1.4) ````javascript - let fs = require('fs'), + const fs = require('fs'), PDFParser = require("pdf2json"); - let inputStream = fs.createReadStream("./pdf2json/test/pdf/fd/form/F1040EZ.pdf", {bufferSize: 64 * 1024}); - let outputStream = fs.createWriteStream("./pdf2json/test/target/fd/form/F1040EZ.json"); + const inputStream = fs.createReadStream("./pdf2json/test/pdf/fd/form/F1040EZ.pdf", {bufferSize: 64 * 1024}); + const outputStream = fs.createWriteStream("./pdf2json/test/target/fd/form/F1040EZ.json"); inputStream.pipe(new PDFParser()).pipe(new StringifyStream()).pipe(outputStream); ```` + +With v2.0.0, last line above changes to +````javascript + inputStream.pipe(this.pdfParser.createParserStream()).pipe(new StringifyStream()).pipe(outputStream); +```` + +For additional output streams support: +````javascript + #generateMergedTextBlocksStream(callback) { + const outputStream = ParserStream.createOutputStream(this.outputPath.replace(".json", ".merged.json"), callback); + this.pdfParser.getMergedTextBlocksStream().pipe(new StringifyStream()).pipe(outputStream); + } + + #generateRawTextContentStream(callback) { + const outputStream = ParserStream.createOutputStream(this.outputPath.replace(".json", ".content.txt"), callback); + this.pdfParser.getRawTextContentStream().pipe(outputStream); + } + + #generateFieldsTypesStream(callback) { + const outputStream = ParserStream.createOutputStream(this.outputPath.replace(".json", ".fields.json"), callback); + this.pdfParser.getAllFieldsTypesStream().pipe(new StringifyStream()).pipe(outputStream); + } +```` See [p2jcmd.js](https://github.com/modesty/pdf2json/blob/master/lib/p2jcmd.js) for more details. 
@@ -97,12 +156,17 @@ See [p2jcmd.js](https://github.com/modesty/pdf2json/blob/master/lib/p2jcmd.js) f * pdfParser_dataError: will be raised when parsing failed * pdfParser_dataReady: when parsing succeeded +* alternative events: (v2.0.0) + * readable: first event dispatched after PDF file metadata is parsed and before processing any page + * data: one parsed page succeeded, null means last page has been processed, signals end of data stream + * error: exception or error occurred + * start to parse PDF file from specified file path asynchronously: ````javascript function loadPDF(pdfFilePath); ```` If failed, event "pdfParser_dataError" will be raised with error object: {"parserError": errObj}; -If success, event "pdfParser_dataReady" will be raised with output data object: {"formImage": parseOutput}, which can be saved as json file (in command line) or serialized to json when running in web service. +If success, event "pdfParser_dataReady" will be raised with output data object: {"formImage": parseOutput}, which can be saved as json file (in command line) or serialized to json when running in web service. __note__: "formImage" is removed from v2.0.0, see breaking changes for details. * Get all textual content from "pdfParser_dataReady" event handler: ````javascript @@ -120,9 +184,9 @@ returns an array of field objects. Current parsed data has four main sub objects to describe the PDF document. -* 'Agency': the main text identifier for the PDF document. If Id.AgencyId present, it'll be same, otherwise it'll be set as document title; * 'Transcoder': pdf2json version number -* 'Id': the XML meta data that embedded in PDF document +* 'Agency': the main text identifier for the PDF document.
If Id.AgencyId present, it'll be same, otherwise it'll be set as document title; (_deprecated since v2.0.0, see notes below_) +* 'Id': the XML meta data that embedded in PDF document (_deprecated since v2.0.0, see notes below_) * all forms attributes metadata are defined in "Custom" tab of "Document Properties" dialog in Acrobat Pro; * v0.1.22 added support for the following custom properties: * AgencyId: default "unknown"; @@ -130,14 +194,44 @@ Current parsed data has four main sub objects to describe the PDF document. * MC: default false; * Max: default -1; * Parent: parent name, default "unknown"; + * *_v2.0.0_*: 'Agency' and 'Id' are replaced with full metadata, example: for `./test/pdf/fd/form/F1040.pdf`, full metadata is: + ````json + Meta: { + PDFFormatVersion: '1.7', + IsAcroFormPresent: true, + IsXFAPresent: false, + Author: 'SE:W:CAR:MP', + Subject: 'U.S. Individual Income Tax Return', + Creator: 'Adobe Acrobat Pro 10.1.8', + Producer: 'Adobe Acrobat Pro 10.1.8', + CreationDate: "D:20131203133943-08'00'", + ModDate: "D:20140131180702-08'00'", + Metadata: { + 'xmp:modifydate': '2014-01-31T18:07:02-08:00', + 'xmp:createdate': '2013-12-03T13:39:43-08:00', + 'xmp:metadatadate': '2014-01-31T18:07:02-08:00', + 'xmp:creatortool': 'Adobe Acrobat Pro 10.1.8', + 'dc:format': 'application/pdf', + 'dc:description': 'U.S. Individual Income Tax Return', + 'dc:creator': 'SE:W:CAR:MP', + 'xmpmm:documentid': 'uuid:4d81e082-7ef2-4df7-b07b-8190e5d3eadf', + 'xmpmm:instanceid': 'uuid:7ea96d1c-3d2f-284a-a469-f0f284a093de', + 'pdf:producer': 'Adobe Acrobat Pro 10.1.8', + 'adhocwf:state': '1', + 'adhocwf:version': '1.1' + } + } + ```` * 'Pages': array of 'Page' object that describes each page in the PDF, including sizes, lines, fills and texts within the page. 
More info about 'Page' object can be found at 'Page Object Reference' section * 'Width': the PDF page width in page unit + ### Page object Reference Each page object within 'Pages' array describes page elements and attributes with 5 main fields: * 'Height': height of the page in page unit +* 'Width': width of the page in page unit, moved from root to page object in v2.0.0 * 'HLines': horizontal line array, each line has 'x', 'y' in relative coordinates for positioning, and 'w' for width, plus 'l' for length. Both width and length are in page unit * 'Vline': vertical line array, each line has 'x', 'y' in relative coordinates for positioning, and 'w' for width, plus 'l' for length. Both width and length are in page unit; * v0.4.3 added Line color support. Default is 'black', other wise set in 'clr' if found in color dictionary, or 'oc' field if not found in dictionary; @@ -153,6 +247,7 @@ Each page object within 'Pages' array describes page elements and attributes wit * 'R': an array of text run, each text run object has two main fields: * 'T': actual text * 'S': style index from style dictionary. More info about 'Style Dictionary' can be found at 'Dictionary Reference' section + * 'TS': [fontFaceId, fontSize, 1/0 for bold, 1/0 for italic] v0.4.5 added support when fields attributes information is defined in external xml file. pdf2json will always try load field attributes xml file based on file name convention (pdfFileName.pdf's field XML file must be named pdfFileName_fieldInfo.xml in the same directory). If found, fields info will be injected. @@ -163,8 +258,8 @@ This dictionary data contract design will allow the output just reference a dict It does require the client of the payload to have the same dictionary definition to make sense out of it when render the parser output on to screen. 
* Color Dictionary - - var kColors = [ +````javascript + const kColors = [ '#000000', // 0 '#ffffff', // 1 '#4c4c4c', // 2 @@ -204,11 +299,11 @@ It does require the client of the payload to have the same dictionary definition '#008000', // Last + 6 '#000000' // Last + 7 ]; - +```` * Style Dictionary: - - var _kFontFaces = [ +````javascript + const kFontFaces = [ "QuickType,Arial,Helvetica,sans-serif", // 00 - QuickType - sans-serif variable font "QuickType Condensed,Arial Narrow,Arial,Helvetica,sans-serif", // 01 - QuickType Condensed - thin sans-serif variable font "QuickTypePi", // 02 - QuickType Pi @@ -217,7 +312,7 @@ It does require the client of the payload to have the same dictionary definition "OCR B MT,Courier New,Courier,monospace" // 05 - OCR-B MT - OCR readable san-serif fixed font ]; - var _kFontStyles = [ + const kFontStyles = [ // Face Size Bold Italic StyleID(Comment) // ----- ---- ---- ----- ----------------- [0, 6, 0, 0], //00 @@ -282,7 +377,17 @@ It does require the client of the payload to have the same dictionary definition [5, 10, 0, 0], //59 [5, 12, 0, 0] //60 ]; - +```` +v2.0.0: to access these dictionaries programmatically, do either +````javascript + const {kColors, kFontFaces, kFontStyles} = require("./lib/pdfconst"); +```` +or via public static getters of PDFParser: +````javascript + console.dir(PDFParser.colorDict); + console.dir(PDFParser.fontFaceDict); + console.dir(PDFParser.fontStyleDict); +```` ## Interactive Forms Elements @@ -602,7 +707,7 @@ In order to run pdf.js in Node.js, we have to address those dependencies and als * pdf.js' global objects (like PDFJS and globalScope) need to be wrapped in a node module's scope * API Dependencies * XHR Level 2: I don't need XMLHttpRequest to load PDF asynchronously in node.js, so replaced it with node's fs (File System) to load PDF file based on request parameters; - * DOMParser: pdf.js instantiates DOMParser to parse XML based PDF meta data, I used xmldom node module to replace this browser
JS library dependency. xmldom can be found at https://github.com/jindw/xmldom; + * DOMParser: pdf.js instantiates DOMParser to parse XML based PDF meta data, I used xmldom node module to replace this browser JS library dependency. xmldom can be found at https://github.com/xmldom/xmldom; * Web Worker: pdf.js has "fake worker" code built in, not much works need to be done, only need to stay aware the parsing would occur in the same thread, not in background worker thread; * Canvas: in order to keep pdf.js code intact as much as possible, I decided to create a HTML5 Canvas API implementation in a node module. It's named as 'PDFCanvas' and has the same API as HTML5 Canvas does, so no change in pdf.js' canvas.js file, we just need to replace the browser's Canvas API with PDFCanvas. This way, when 2D context API invoked, PDFCanvas just write it to a JS object based on the json format above, rather than drawing graphics on html5 canvas; * Extend/Modify pdf.js @@ -769,10 +874,26 @@ In order to support this auto merging capability, text block objects have an add * v1.1.4 unified event data structure: **only when you handle these top level events, no change if you use commandline** * event "pdfParser_dataError": {"parserError": errObj} - * event "pdfParser_dataReady": {"formImage": parseOutput} + * event "pdfParser_dataReady": {"formImage": parseOutput} __note__: "formImage" is removed from v2.0.0, see breaking changes for details. * v1.0.8 fixed [issue 27](https://github.com/modesty/pdf2json/issues/27), it converts x coordinate with the same ratio as y, which is 24 (96/4), rather than 8.7 (96/11), please adjust client renderer accordingly when position all elements' x coordinate. +* v2.0.0 output data field, `Agency` and `Id` are replaced with `Meta`, JSON of the PDF's full metadata. (See above for details). Each page object also added `Width` property besides `Height`. + +**Major Refactoring** +* v2.0.0 has the major refactoring since 2015. 
Primary updates include: + * Full PDF metadata support (see page format and breaking changes for details) + * Simplify root properties, besides the addition of `Meta` as root property, unnecessary "formImage" is removed from v2.0.0, also `Width` is moved from root to each page object under `Pages`. + * Improved Stream support with test _`npm run parse-r`_, plus new events are added to PDF.js, including _`readable`_, _`data`_, _`end`_, _`error`_. These new Readable Stream like events can be optional replacement for custom events (_`pdfjs_parseDataReady`_, and _`pdfjs_parseDataError`_). It offers more granular data chunk flow control, like _`readable`_ with Meta, _`data`_ sequence for each PDF page result, instead of _`pdfjs_parseDataReady`_ combines all pages in one shot. See `./lib/parserstream.js` for more details + * Object with {clr:-1} (like HLines, VLines, Fills, etc.) is replaced with {oc: "#xxxxxx"}. If `clr` index value is valid, then `oc` (original color) field is removed. + * Greater performance, near ~20% improvements with PDFs under _test_ directory + * Better exception handling, fixes a few uncaught exception errors + * More test coverage, 4 more test scripts added, see _package.json_ for details + * Easier access to dictionaries, including color, font face and font style, see Dictionary reference section for details + * Refactor to ES6 class for major entry modules + * Dependencies removed: lodash, async and yargs + * Upgrade to Node v14.18.0 LTS + ### Install on Ubuntu * Make sure nodejs is installed. Detailed installation steps can be found at http://stackoverflow.com/a/16303380/433814. @@ -830,12 +951,3 @@ Licensed under the [Apache License Version 2.0](https://github.com/modesty/pdf2j ## Support I'm currently running this project in my spare time.
Thanks all for your [stars](https://github.com/modesty/pdf2json/stargazers) and [supports](https://www.paypal.com/cgi-bin/webscr?cmd=_donations&business=modestyZ%40gmail%2ecom&lc=GB&item_name=modesty%20zhang&item_number=git%40github%2ecom%3amodesty%2fpdf2json%2egit¤cy_code=USD&bn=PP%2dDonationsBF%3abtn_donate_SM%2egif%3aNonHosted). - - - - - - - - - diff --git a/test/p2j.forms.sh b/test/p2j.forms.sh index d647558f..286fa0c8 100755 --- a/test/p2j.forms.sh +++ b/test/p2j.forms.sh @@ -1,9 +1,22 @@ -#!/bin/bash +#!/usr/bin/env bash STARTTIME=$(date +%s) AGENCIES=("dc" "de" "ef" "fd" "nd" "or" "pa" "sc" "va") for i in "${AGENCIES[@]}" do - sh ./p2j.one.sh $i + sh ./p2j.one.sh $i form "Expected: NO Exception, All Parsed OK" done + +# Travis CI doesn't seem to support arrays in bash for testing. +# Reverting to a bunch of commands so that build button can be shown. +# sh ./p2j.one.sh dc +# sh ./p2j.one.sh de +# sh ./p2j.one.sh ef +# sh ./p2j.one.sh fd +# sh ./p2j.one.sh nd +# sh ./p2j.one.sh or +# sh ./p2j.one.sh pa +# sh ./p2j.one.sh sc +# sh ./p2j.one.sh va + ENDTIME=$(date +%s) echo "It takes $(($ENDTIME - $STARTTIME)) seconds to process all PDFs ..." 
diff --git a/test/p2j.one.sh b/test/p2j.one.sh index ef27706d..85e92e5c 100755 --- a/test/p2j.one.sh +++ b/test/p2j.one.sh @@ -1,22 +1,24 @@ -#!/bin/bash -IN_DIR_BASE=./pdf/ -OUT_DIR_BASE=./target/ -DATA_DIR_BASE=./data/ +#!/usr/bin/env bash +IN_DIR_BASE=./pdf +OUT_DIR_BASE=./target +DATA_DIR_BASE=./data PDF2JSON=../pdf2json.js AGENCY_NAME=$1 +FORM_BASE=$2 +EXPECTED_RESULT=$3 echo "-----------------------------------------------------" echo "Clean up existing $AGENCY_NAME JSON" echo "-----------------------------------------------------" -rm -rfv $OUT_DIR_BASE$AGENCY_NAME/ +rm -rfv $OUT_DIR_BASE/$AGENCY_NAME echo "-----------------------------------------------------" echo "Update $AGENCY_NAME PDF" echo "-----------------------------------------------------" -mkdir -p $OUT_DIR_BASE$AGENCY_NAME/form/ -node $PDF2JSON -f $IN_DIR_BASE$AGENCY_NAME/form/ -o $OUT_DIR_BASE$AGENCY_NAME/form/ -s -t -c -diff -rq $OUT_DIR_BASE$AGENCY_NAME/form/ $DATA_DIR_BASE$AGENCY_NAME/form/ +mkdir -p $OUT_DIR_BASE/$AGENCY_NAME/$FORM_BASE +node --trace-deprecation --trace-warnings $PDF2JSON -f $IN_DIR_BASE/$AGENCY_NAME/$FORM_BASE -o $OUT_DIR_BASE/$AGENCY_NAME/$FORM_BASE -s -t -c -m +# diff -rq $OUT_DIR_BASE$AGENCY_NAME/$FORM_BASE/ $DATA_DIR_BASE$AGENCY_NAME/$FORM_BASE/ echo "-----------------------------------------------------" -echo "All JSON and PDF are updated for $AGENCY_NAME" +echo "$IN_DIR_BASE/$AGENCY_NAME/$FORM_BASE : $EXPECTED_RESULT" echo "-----------------------------------------------------" diff --git a/test/pdf/misc/i200_test.pdf b/test/pdf/misc/i200_test.pdf new file mode 100644 index 00000000..bcf40e47 Binary files /dev/null and b/test/pdf/misc/i200_test.pdf differ diff --git a/test/pdf/misc/i221_tianjin_invoice.pdf b/test/pdf/misc/i221_tianjin_invoice.pdf new file mode 100644 index 00000000..adf695a2 Binary files /dev/null and b/test/pdf/misc/i221_tianjin_invoice.pdf differ diff --git a/test/pdf/misc/i242_testingWithTable.pdf b/test/pdf/misc/i242_testingWithTable.pdf new 
file mode 100644 index 00000000..b41278bf Binary files /dev/null and b/test/pdf/misc/i242_testingWithTable.pdf differ diff --git a/test/pdf/misc/i243_problem_file_anon.pdf b/test/pdf/misc/i243_problem_file_anon.pdf new file mode 100644 index 00000000..d4652dc3 Binary files /dev/null and b/test/pdf/misc/i243_problem_file_anon.pdf differ