Skip to content

Commit

Permalink
v1.1.4: complete transform stream
Browse files Browse the repository at this point in the history
  • Loading branch information
modestysn committed Mar 18, 2016
1 parent 7a18a6d commit cfe890e
Show file tree
Hide file tree
Showing 6 changed files with 175 additions and 51 deletions.
5 changes: 3 additions & 2 deletions base/core/worker.js
Original file line number Diff line number Diff line change
Expand Up @@ -375,8 +375,9 @@ var WorkerMessageHandler = PDFJS.WorkerMessageHandler = {
var start = Date.now();
page.extractTextContent().then(function(textContent) {
promise.resolve(textContent);
log('text indexing: page=%d - time=%dms', pageNum,
Date.now() - start);
//MQZ 03/17/2016 comment out log
//log('text indexing: page=%d - time=%dms', pageNum,
// Date.now() - start);
}, function (e) {
// Skip errored pages
promise.reject(e);
Expand Down
140 changes: 116 additions & 24 deletions lib/p2jcmd.js
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
'use strict';

let nodeUtil = require("util"),
stream = require('stream'),
fs = require('fs'),
path = require('path'),
_ = require('underscore'),
_ = require('underscore'),
PDFParser = require("../pdfparser"),
pkInfo = require('../package.json'),
async = require("async");
Expand All @@ -27,10 +28,17 @@ let optimist = require('optimist')
.alias('c', 'content')
.describe('c', '(optional) when specified, will generate .content.txt that includes text content from PDF.\n')
.alias('m', 'merge')
.describe('m', '(optional) when specified, will generate .merged.json that includes auto-merged broken text blocks from PDF (Experimental).\n');
.describe('m', '(optional) when specified, will generate .merged.json that includes auto-merged broken text blocks from PDF (Experimental).\n')
.alias('r', 'stream')
.describe('r', '(optional) when specified, will process and parse with buffer/object transform stream rather than file system (Experimental).\n');

const argv = optimist.argv;
const VERBOSITY_LEVEL = (_.has(argv, 's') ? 0 : 5);

let argv = optimist.argv;
const PROCESS_RAW_TEXT_CONTENT = _.has(argv, 'c');
const PROCESS_FIELDS_CONTENT = _.has(argv, 't');
const PROCESS_MERGE_BROKEN_TEXT_BLOCKS = _.has(argv, 'm');
const PROCESS_WITH_STREAM = _.has(argv, 'r');

let PDF2JSONUtil = (function () {

Expand Down Expand Up @@ -71,10 +79,8 @@ let PDF2JSONUtil = (function () {
};


let _writeOneJSON = function(data, callback) {
let pJSON = JSON.stringify({"formImage":data});

fs.writeFile(this.outputPath, pJSON, err => {
let _writeOneJSON = function(pJSON, callback) {
fs.writeFile(this.outputPath, JSON.stringify(pJSON), err => {
if(err) {
console.warn(this.inputFile + " => " + this.outputFile + " Exception: " + err);
this.curProcessor.failedCount++;
Expand All @@ -88,12 +94,11 @@ let PDF2JSONUtil = (function () {
};

let _writeOneJSONWithMergedTextBlocks = function(data, callback) {
data.Pages = this.pdfParser.getMergedTextBlocksIfNeeded();
let pJSON = JSON.stringify({"formImage":data});
let pJSON = this.pdfParser.getMergedTextBlocksIfNeeded();
let outputPath = this.outputPath.replace(".json", ".merged.json");
let contentFile = this.outputFile.replace(".json", ".merged.json");

fs.writeFile(outputPath, pJSON, err => {
fs.writeFile(outputPath, JSON.stringify(pJSON), err => {
if (err) {
console.warn(err);
} else {
Expand All @@ -104,20 +109,19 @@ let PDF2JSONUtil = (function () {
};

let _parseOnePDF = function(callback) {
let processRawTextContent = _.has(argv, 'c');
this.pdfParser = new PDFParser(null, processRawTextContent);
this.pdfParser = new PDFParser(null, PROCESS_RAW_TEXT_CONTENT);

this.pdfParser.on("pdfParser_dataReady", evtData => {
if ((!!evtData) && (!!evtData.data)) {
let outputTasks = [cbFunc => _writeOneJSON.call(this, evtData.data, cbFunc)];
if (_.has(argv, 't')) {//needs to generate fields.json file
outputTasks.push(cbFunc => _generateFieldsTypesFile.call(this, evtData.data, cbFunc));
if ((!!evtData) && (!!evtData.formImage)) {
let outputTasks = [cbFunc => _writeOneJSON.call(this, evtData, cbFunc)];
if (PROCESS_FIELDS_CONTENT) {//needs to generate fields.json file
outputTasks.push(cbFunc => _generateFieldsTypesFile.call(this, evtData, cbFunc));
}
if (processRawTextContent) {//needs to generate content.txt file
outputTasks.push(cbFunc => _generateRawTextContentFile.call(this, evtData.data, cbFunc));
if (PROCESS_RAW_TEXT_CONTENT) {//needs to generate content.txt file
outputTasks.push(cbFunc => _generateRawTextContentFile.call(this, evtData, cbFunc));
}
if (_.has(argv, 'm')) {//needs to generate json file with merged broken text blocks
outputTasks.push(cbFunc => _writeOneJSONWithMergedTextBlocks.call(this, evtData.data, cbFunc));
if (PROCESS_MERGE_BROKEN_TEXT_BLOCKS) {//needs to generate json file with merged broken text blocks
outputTasks.push(cbFunc => _writeOneJSONWithMergedTextBlocks.call(this, evtData, cbFunc));
}

async.series(outputTasks, function(err, results){
Expand All @@ -143,10 +147,95 @@ let PDF2JSONUtil = (function () {
});

console.log("\nTranscoding " + this.inputFile + " to - " + this.outputPath);
this.pdfParser.loadPDF(this.inputPath, (_.has(argv, 's') ? 0 : 5));
this.pdfParser.loadPDF(this.inputPath, VERBOSITY_LEVEL);
};

// constructor
/**
 * Transform stream that accepts JavaScript values on its writable side
 * (object mode) and emits their JSON text on its readable side (string/buffer
 * mode), so parsed output can be piped straight into a file stream.
 */
function StringifyStream(){
	// Declare the asymmetric modes through the documented constructor options
	// instead of patching the private _readableState/_writableState objects
	// after construction.
	stream.Transform.call(this, {
		writableObjectMode: true,
		readableObjectMode: false
	});
}
nodeUtil.inherits(StringifyStream, stream.Transform);

/**
 * Serializes one incoming value to JSON and forwards the text downstream.
 *
 * @param {*} obj - value written to the stream.
 * @param {string} encoding - unused; incoming chunks are objects.
 * @param {Function} cb - completion callback; receives the serialization
 *   error when JSON.stringify throws (e.g. on a circular structure).
 */
StringifyStream.prototype._transform = function(obj, encoding, cb){
	let json;
	try {
		json = JSON.stringify(obj);
	} catch (err) {
		// Report through the stream rather than throwing out of write().
		cb(err);
		return;
	}
	this.push(json);
	cb();
};

/**
 * Opens a file write stream for `outputPath` and wires its completion events
 * to a node-style callback.
 *
 * @param {string} outputPath - file to write.
 * @param {Function} callback - called as (null, outputPath) on 'finish', or as
 *   ({"streamError": err}, outputPath) on 'error'.
 * @returns {fs.WriteStream} the stream, ready to be used as a pipe target.
 */
let _createOutputStream = function(outputPath, callback) {
	const reportFinish = () => {
		callback(null, outputPath);
	};
	const reportError = err => {
		callback({"streamError": err}, outputPath);
	};

	return fs.createWriteStream(outputPath)
		.on('finish', reportFinish)
		.on('error', reportError);
};

/**
 * Streams the parser's merged-text-blocks output, JSON-stringified, into the
 * ".merged.json" sibling of this.outputPath.
 *
 * @param {Function} callback - passed to _createOutputStream, which invokes it
 *   when the file stream finishes or errors.
 */
let _generateMergedTextBlocksStream = function(callback) {
	const mergedPath = this.outputPath.replace(".json", ".merged.json");
	const fileSink = _createOutputStream.call(this, mergedPath, callback);
	const mergedSource = this.pdfParser.getMergedTextBlocksStream();

	mergedSource
		.pipe(new StringifyStream())
		.pipe(fileSink);
};

/**
 * Streams the parser's raw text content into the ".content.txt" sibling of
 * this.outputPath. No JSON stringification step is applied here.
 *
 * @param {Function} callback - passed to _createOutputStream, which invokes it
 *   when the file stream finishes or errors.
 */
let _generateRawTextContentStream = function(callback) {
	const contentPath = this.outputPath.replace(".json", ".content.txt");
	const fileSink = _createOutputStream.call(this, contentPath, callback);
	const textSource = this.pdfParser.getRawTextContentStream();

	textSource.pipe(fileSink);
};

/**
 * Streams the parser's field-types output, JSON-stringified, into the
 * ".fields.json" sibling of this.outputPath.
 *
 * @param {Function} callback - passed to _createOutputStream, which invokes it
 *   when the file stream finishes or errors.
 */
let _generateFieldsTypesStream = function(callback) {
	const fieldsPath = this.outputPath.replace(".json", ".fields.json");
	const fileSink = _createOutputStream.call(this, fieldsPath, callback);
	const fieldsSource = this.pdfParser.getAllFieldsTypesStream();

	fieldsSource
		.pipe(new StringifyStream())
		.pipe(fileSink);
};

/**
 * Stream-based counterpart of _parseOnePDF: pipes this.inputPath through the
 * PDFParser transform stream and a StringifyStream into this.outputPath.
 * After the primary output file has been flushed, the optional fields,
 * raw-text and merged-text outputs selected on the command line are written
 * one after another, and `callback` is then handed to _continue.
 *
 * @param {Function} callback - forwarded to _continue once processing of this
 *   file has ended, successfully or not.
 */
let _parseOnePDFStream = function(callback) {
	this.pdfParser = new PDFParser(null, PROCESS_RAW_TEXT_CONTENT);

	// A parser failure, an unreadable input or an unwritable output can each
	// end the run; this flag makes sure only the first outcome is counted and
	// that `callback` is forwarded exactly once.
	let settled = false;
	const fail = reason => {
		if (settled) {
			return;
		}
		settled = true;
		this.curProcessor.failedCount++;
		_continue.call(this, callback, "Exception: " + reason);
	};

	// The parser emits its failure payload as {"parserError": <cause>}.
	this.pdfParser.on("pdfParser_dataError", evtData => fail(evtData.parserError));

	let outputStream = fs.createWriteStream(this.outputPath);
	outputStream.on('error', fail);
	outputStream.on('finish', () => {
		if (settled) {
			return;
		}
		settled = true;

		console.log("Primary stream OK: [" + this.inputPath + "] => [" + this.outputPath + "]");
		this.curProcessor.successCount++;

		let outputTasks = [];
		if (PROCESS_FIELDS_CONTENT) {//needs to generate fields.json file
			outputTasks.push(cbFunc => _generateFieldsTypesStream.call(this, cbFunc));
		}
		if (PROCESS_RAW_TEXT_CONTENT) {//needs to generate content.txt file
			outputTasks.push(cbFunc => _generateRawTextContentStream.call(this, cbFunc));
		}
		if (PROCESS_MERGE_BROKEN_TEXT_BLOCKS) {//needs to generate json file with merged broken text blocks
			outputTasks.push(cbFunc => _generateMergedTextBlocksStream.call(this, cbFunc));
		}

		if (outputTasks.length > 0) {
			// Arrow function keeps `this` bound to the current instance for
			// the _continue call below.
			async.series(outputTasks, (err, results) => {
				if (err) {
					console.error("Additional streams Error: " + err);
				} else {
					console.log("Additional streams OK: \n", results);
				}
				_continue.call(this, callback);
			});
		}
		else {
			_continue.call(this, callback);
		}
	});

	console.log("\nTranscoding " + this.inputFile + " to - " + this.outputPath);
	// highWaterMark is the documented read-buffer size option for
	// fs.createReadStream; `bufferSize` is not a recognised option.
	let inputStream = fs.createReadStream(this.inputPath, {highWaterMark: 64 * 1024});
	inputStream.on('error', fail);
	inputStream.pipe(this.pdfParser).pipe(new StringifyStream()).pipe(outputStream);
};

// constructor
let cls = function (inputDir, inputFile, curProcessor) {
// public, this instance copies
this.inputDir = path.normalize(inputDir);
Expand Down Expand Up @@ -217,8 +306,11 @@ let PDF2JSONUtil = (function () {
if (!!validateMsg) {
_continue.call(this, callback, validateMsg);
}
else {
_parseOnePDF.call(this, callback);
else if (PROCESS_WITH_STREAM) {
_parseOnePDFStream.call(this, callback);
}
else {
_parseOnePDF.call(this, callback);
}
};

Expand Down
8 changes: 1 addition & 7 deletions lib/pdf.js
Original file line number Diff line number Diff line change
Expand Up @@ -456,12 +456,6 @@ let PDFJSClass = (function () {
let preT = decodeURIComponent(prevText.R[0].T);
let curT = decodeURIComponent(text.R[0].T);

//let distance = Math.abs(text.x - prevText.x - prevText.w);
//let textSize = PDFFont.getFontSize(prevText);
//let spaceWidth = text.sw;
//let threshHold = PDFFont.getSpaceThreshHold(prevText);
//console.log(`\ndistance=${distance}\tthreshHold=${threshHold}\ttextSize=${textSize}\tspaceWidth=${spaceWidth}`);

prevText.R[0].T += text.R[0].T;
prevText.w += text.w;
text.merged = true;
Expand All @@ -481,7 +475,7 @@ let PDFJSClass = (function () {
page.Texts = page.Texts.filter( t => !t.merged);
}

return this.pages;
return {Pages:this.pages, Width: this.pageWidth};
};

cls.prototype.destroy = function() {
Expand Down
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "pdf2json",
"version": "1.1.3",
"version": "1.1.4",
"description": "A PDF file parser that converts PDF binaries to text based JSON, powered by porting a fork of PDF.JS to Node.js",
"keywords": [
"pdf",
Expand Down
33 changes: 25 additions & 8 deletions pdfparser.js
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,10 @@ let PDFParser = (function () {
let _onPDFJSParseDataReady = function(data) {
if (!data) { //v1.1.2: data===null means end of parsed data
nodeUtil.p2jinfo("PDF parsing completed.");
this.emit("pdfParser_dataReady", this);
let output = {"formImage": this.data};
this.emit("pdfParser_dataReady", output);
if (typeof this.flushCallback === 'function') {
this.push(this);
this.push(output);
this.flushCallback();
this.flushCallback = null;
}
Expand All @@ -32,8 +33,8 @@ let PDFParser = (function () {
};

let _onPDFJSParserDataError = function(data) {
this.data = data;
this.emit("pdfParser_dataError", this);
this.data = null;
this.emit("pdfParser_dataError", {"parserError": data});
};

let _startParsingPDF = function(buffer) {
Expand Down Expand Up @@ -76,10 +77,17 @@ let PDFParser = (function () {
}
};

/**
 * Wraps a single value in an object-mode Readable stream: the value is queued
 * as the only chunk, immediately followed by the end-of-stream marker.
 *
 * @param {*} jsonObj - value to expose as the stream's single chunk.
 * @returns {stream.Readable} object-mode stream that yields jsonObj and ends.
 */
let _createContentStream = function(jsonObj) {
	const contentStream = new stream.Readable({objectMode: true});
	// Payload first, then null to signal EOF.
	[jsonObj, null].forEach(chunk => {
		contentStream.push(chunk);
	});
	return contentStream;
};

// constructor
function PdfParser(context, needRawText) {
//call constructor for super class
stream.Transform.call(this, {objectMode: true});
stream.Transform.call(this, {objectMode: true, bufferSize: 64 * 1024});

// private
let _id = _nextId++;
Expand Down Expand Up @@ -118,8 +126,12 @@ let PDFParser = (function () {
}, 100);

//public APIs
/**
 * Sets the logging verbosity through nodeUtil.verbosity.
 *
 * @param {number} [verbosity] - requested level; any falsy value (including
 *   an omitted argument) is passed on as 0.
 */
PdfParser.prototype.setVerbosity = function(verbosity) {
	const level = verbosity ? verbosity : 0;
	nodeUtil.verbosity(level);
};

PdfParser.prototype.loadPDF = function(pdfFilePath, verbosity) {
nodeUtil.verbosity(verbosity);
this.setVerbosity(verbosity);
nodeUtil.p2jinfo("about to load PDF file " + pdfFilePath);

this.pdfFilePath = pdfFilePath;
Expand All @@ -139,8 +151,13 @@ let PDFParser = (function () {
};

PdfParser.prototype.getRawTextContent = function() { return this.PDFJS.getRawTextContent(); };
PdfParser.prototype.getAllFieldsTypes = function() { this.PDFJS.getAllFieldsTypes(this.data); };
PdfParser.prototype.getMergedTextBlocksIfNeeded = function() { this.PDFJS.getMergedTextBlocksIfNeeded(); };
PdfParser.prototype.getRawTextContentStream = function() { return _createContentStream(this.getRawTextContent()); };

PdfParser.prototype.getAllFieldsTypes = function() { return this.PDFJS.getAllFieldsTypes(); };
PdfParser.prototype.getAllFieldsTypesStream = function() { return _createContentStream(this.getAllFieldsTypes()); };

PdfParser.prototype.getMergedTextBlocksIfNeeded = function() { return {"formImage": this.PDFJS.getMergedTextBlocksIfNeeded()}; };
PdfParser.prototype.getMergedTextBlocksStream = function() { return _createContentStream(this.getMergedTextBlocksIfNeeded()); };

PdfParser.prototype.destroy = function() {
this.removeAllListeners();
Expand Down
Loading

0 comments on commit cfe890e

Please sign in to comment.