Skip to content

Commit

Permalink
feat(pdf-import) : improve quality of import, remove pdf2json
Browse files Browse the repository at this point in the history
Signed-off-by: Dan Selman <[email protected]>
  • Loading branch information
dselman authored and jeromesimeon committed Sep 2, 2020
1 parent da2f5ce commit dfdb530
Show file tree
Hide file tree
Showing 8 changed files with 36,924 additions and 17,015 deletions.
22,536 changes: 16,685 additions & 5,851 deletions package-lock.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions packages/markdown-pdf/.eslintignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,4 @@ test/data
src/grammars
.travis
scripts
src/domstubs.js
7 changes: 4 additions & 3 deletions packages/markdown-pdf/jest.config.js
Original file line number Diff line number Diff line change
Expand Up @@ -183,9 +183,10 @@ module.exports = {
// transform: null,

// An array of regexp pattern strings that are matched against all source file paths, matched files will skip transformation
// transformIgnorePatterns: [
// "/node_modules/"
// ],
// transformIgnorePatterns : [
// 'src/pdf.js',
// 'src/pdf.worker.js'
// ]

// An array of regexp pattern strings that are matched against all modules before the module loader will automatically return a mock for them
// unmockedModulePathPatterns: undefined,
Expand Down
5 changes: 3 additions & 2 deletions packages/markdown-pdf/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@
"dependencies": {
"@accordproject/markdown-cicero": "0.12.7",
"@accordproject/markdown-common": "0.12.7",
"pdf2json": "1.2.0",
"pdfjs-dist": "^2.4.456",
"type-of": "^2.0.1",
"pdfmake": "0.1.66"
},
Expand All @@ -97,7 +97,8 @@
"!./out/**/*",
"!./lib/**/*",
"!./umd/**/*",
"!./bin/index.js"
"!./bin/index.js",
"!./src/domstubs.js"
],
"path": "header.txt",
"blocking": true,
Expand Down
233 changes: 90 additions & 143 deletions packages/markdown-pdf/src/PdfTransformer.js
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,11 @@

'use strict';

const PDFParser = require('pdf2json');
// HACK: a few hacks that let PDF.js be loaded into the global space rather than as a module.
require('./domstubs.js').setStubs(global);

let pdfjsLib = require('pdfjs-dist/es5/build/pdf.js');

const CiceroMarkTransformer = require('@accordproject/markdown-cicero').CiceroMarkTransformer;
const PdfPrinter = require('pdfmake');
const ToPdfMakeVisitor = require('./ToPdfMakeVisitor');
Expand Down Expand Up @@ -83,87 +87,81 @@ class PdfTransformer {
* @param {string} [format] result format, defaults to 'concerto'. Pass
* 'json' to return the JSON data.
* @param {object} [options] - the PDF parsing options
* @param {number} [options.paragraphVerticalOffset] - the vertical offset used to detect paragraphs (defaults to 1)
* @param {boolean} [options.preservePages] - whether to preserve page breaks (defaults to true)
* @param {number} [options.paragraphVerticalOffset] - the vertical offset used to detect
* paragraphs as a multiple of the line height (defaults to 2)
* @param {boolean} [options.preservePages] - whether to preserve PDF page breaks (defaults to true)
* @returns {promise} a Promise to the CiceroMark DOM
*/
async toCiceroMark(input, format = 'concerto', options = { paragraphVerticalOffset: 1, preservePages: true }) {
return new Promise( (resolve, reject) => {
const pdfParser = new PDFParser(null, false);
const errorCallback = (errData) => reject(`PDF parsing failed with error ${errData.parserError}`);
const conversionCallback = (pdfData) => {

const document = {
$class : 'org.accordproject.commonmark.Document',
xmlns : pdfData.formImage.Id.Name,
nodes : []
async toCiceroMark(input, format = 'concerto', options = { paragraphVerticalOffset: 2, preservePages: true }) {

let loadingTask = pdfjsLib.getDocument(input.buffer);

const doc = await loadingTask.promise;
let numPages = doc.numPages;
const metadata = await doc.getMetadata();

const pages = [];
for( let n=1; n <= numPages; n++) {
const page = await doc.getPage(n);
const content = await page.getTextContent({
normalizeWhitespace: true,
disableCombineTextItems: true,
});

let currentPara = null;
let lastY = 0;
const result = {
nodes: []
};

content.items.forEach( text => {
const tx = text.transform;
const textY = tx[5];
const height = text.height;
const newPara = Math.abs(lastY - textY) > (height * options.paragraphVerticalOffset);

if(!currentPara || newPara) {
currentPara = {
$class : 'org.accordproject.commonmark.Paragraph',
nodes : []
};
result.nodes.push(currentPara);
}

const textNode = {
$class : 'org.accordproject.commonmark.Text',
text : text.str.replace(/(?:\r\n|\r|\n)/g, ' ')
};

// pdfData = pdfParser.getMergedTextBlocksIfNeeded();

let currentPara = null;
pdfData.formImage.Pages.forEach(page => {
let lastY = 0;
page.Texts.forEach( text => {
if(!currentPara || Math.abs(lastY - text.y) > options.paragraphVerticalOffset) {
currentPara = {
$class : 'org.accordproject.commonmark.Paragraph',
nodes : []
};
document.nodes.push(currentPara);
}

text.R.forEach( run => {
let [/*fontFaceId*/, /*fontSize*/, bold, italic] = run.TS;
const textNode = {
$class : 'org.accordproject.commonmark.Text',
text : run.T ? decodeURIComponent(run.T) : ''
};
if(bold && !italic) {
const bold = {
$class : 'org.accordproject.commonmark.Strong',
nodes : [textNode]
};
PdfTransformer.pushNode(currentPara, bold, lastY, text.y);
}
else if(italic && !bold) {
const italic = {
$class : 'org.accordproject.commonmark.Emph',
nodes : [textNode]
};
PdfTransformer.pushNode(currentPara, italic, lastY, text.y);
}
else if(italic && bold) {
const boldItalic = {
$class : 'org.accordproject.commonmark.Strong',
nodes : [{
$class : 'org.accordproject.commonmark.Emph',
nodes : [textNode]
}]
};
PdfTransformer.pushNode(currentPara, boldItalic, lastY, text.y);
}
else {
PdfTransformer.pushNode(currentPara, textNode, lastY, text.y);
}
});
lastY = text.y;
});
currentPara.nodes.push(textNode);

if(options.preservePages) {
document.nodes.push( {
$class : 'org.accordproject.commonmark.ThematicBreak'
});
}
if(text.str.trim().length > 0) {
lastY = textY;
}
});

if(options.preservePages) {
result.nodes.push( {
$class : 'org.accordproject.commonmark.ThematicBreak'
});
resolve(document);
};
}

// trigger parsing
pdfParser.on('pdfParser_dataError', errorCallback);
pdfParser.on('pdfParser_dataReady', conversionCallback);
pdfParser.parseBuffer(input);
pages.push(result);
}

let merged = [];

pages.forEach( page => {
merged = merged.concat(page.nodes);
});

const document = {
$class : 'org.accordproject.commonmark.Document',
xmlns : metadata.Title ? metadata.Title : 'Unknown',
nodes : merged
};

return document;
}

/**
Expand Down Expand Up @@ -198,7 +196,8 @@ class PdfTransformer {

dd.pageSize = 'LETTER';
dd.pageOrientation = 'portrait',
dd.pageMargins = [ 80, 80, 80, 80 ];
// left, top, right, bottom
dd.pageMargins = [ 81, 72, 81, 72 ]; // units are points (72 per inch)

// allow overriding top-level options
Object.assign(dd, options);
Expand Down Expand Up @@ -235,39 +234,45 @@ class PdfTransformer {
const defaultStyles = {
Footer: {
alignment: 'left',
margin : [10, 10, 0, 0]
fontSize: 10,
// left, top, right, bottom
margin : [81, 36, 0, 0]
},
PageNumber: {
alignment: 'center',
margin : [0, 0, 0, 0]
fontSize: 10,
// left, top, right, bottom
margin : [0, -11, 0, 0]
},
Header: {
alignment: 'right',
margin : [0, 10, 10, 0]
fontSize: 10,
// left, top, right, bottom
margin : [0, 36, 81, 0]
},
heading_one: {
fontSize: 30,
fontSize: 25,
bold: true,
alignment: 'center'
},
heading_two: {
fontSize: 28,
fontSize: 20,
bold: true
},
heading_three: {
fontSize: 26,
fontSize: 16,
bold: true
},
heading_four: {
fontSize: 24,
fontSize: 15,
bold: true
},
heading_five: {
fontSize: 22,
fontSize: 14,
bold: true
},
heading_six: {
fontSize: 20,
fontSize: 13,
bold: true
},
Code: {
Expand All @@ -286,7 +291,7 @@ class PdfTransformer {
alignment: 'justify'
},
toc: {
fontSize: 30,
fontSize: 25,
bold: true,
alignment: 'center'
},
Expand All @@ -308,64 +313,6 @@ class PdfTransformer {
pdfDoc.pipe(outputStream);
pdfDoc.end();
}

/**
* Utility to get the last child of a node.
* @param {object} node a commonmark node
* @returns {object} the last child node, or null
*/
static getLastChildNode(node) {
return node.nodes.length > 0 ? node.nodes[node.nodes.length-1] : null;
}

/**
* Utility to merge text nodes. It recurses so that is can deal with
* bold, italic, bold+italic text.
* @param {object} srcNode a commonmark node
* @param {object} destNode a commonmark node
* @returns {object} the modified destination node, or null
*/
static mergeTextNode(srcNode, destNode) {
if(srcNode && destNode ) {
if( srcNode.$class === destNode.$class ) {
if(srcNode.$class === 'org.accordproject.commonmark.Text') {
destNode.text = destNode.text + srcNode.text;
return destNode;
}
else {
const srcChild = PdfTransformer.getLastChildNode(srcNode);
const destChild = PdfTransformer.getLastChildNode(destNode);
return PdfTransformer.mergeTextNode(srcChild, destChild);
}
}
}

return null;
}

/**
* Utility to merge adjacent text runs from a PDF
* @param {*} currentPara CommonMark paragraph node
* @param {*} node the current node
* @param {*} lastY the last Y offset position from PDF
* @param {*} textY the current Y offset position from PDF
*/
static pushNode(currentPara, node, lastY, textY) {
if(lastY !== textY) {
currentPara.nodes.push( {
$class : 'org.accordproject.commonmark.Softbreak'
});
currentPara.nodes.push(node);
}
else {
const lastNode = PdfTransformer.getLastChildNode(currentPara);
const merged = PdfTransformer.mergeTextNode(node, lastNode);

if(!merged) {
currentPara.nodes.push(node);
}
}
}
}

module.exports = PdfTransformer;
14 changes: 13 additions & 1 deletion packages/markdown-pdf/src/PdfTransformer.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
'use strict';

const fs = require('fs');
const path = require('path');

const PdfTransformer = require('./PdfTransformer');
const CiceroMarkTransformer = require('@accordproject/markdown-cicero').CiceroMarkTransformer;

Expand Down Expand Up @@ -134,11 +136,21 @@ describe('pdf import', () => {
// console.log(JSON.stringify(ciceroMarkDom, null, 4));
// }
expect(ciceroMarkDom).toMatchSnapshot(); // (1)
return saveCiceroMarkAsPdf(ciceroMarkDom, file + '-import'); // roundtrip for debug
return saveCiceroMarkAsPdf(ciceroMarkDom, file + '-roundtrip'); // roundtrip for debug
});
});
});

describe('pdf import 2', () => {
    // Imports a fixture PDF from test/data and pins the resulting CiceroMark DOM with a snapshot.
    it('converts Land_Sale_Contract to cicero mark', async () => {
        const pdfPath = path.join(__dirname, '/../test/data', 'Land_Sale_Contract.pdf');
        // a null encoding makes readFileSync return the raw Buffer
        const pdfBuffer = fs.readFileSync( pdfPath, null );
        const importedDom = await pdfTransformer.toCiceroMark(pdfBuffer, 'json');
        expect(importedDom).toMatchSnapshot(); // (1)
        // roundtrip for debug
        return saveCiceroMarkAsPdf(importedDom, 'Land_Sale_Contract-debug');
    });
});

describe('pdf generation', () => {
getMarkdownFiles().forEach(([file, markdownContent], i) => {
it(`converts ${file} to pdf`, async () => {
Expand Down
Loading

0 comments on commit dfdb530

Please sign in to comment.