diff --git a/README.md b/README.md index 2bf7cc7..ef9e175 100644 --- a/README.md +++ b/README.md @@ -168,7 +168,6 @@ npm run test - [dictionary-en-gb](https://ghub.io/dictionary-en-gb): English (United Kingdom) spelling dictionary in UTF-8 - [html-to-text](https://ghub.io/html-to-text): Advanced html to plain text converter - [nlcst-to-string](https://ghub.io/nlcst-to-string): Stringify NLCST -- [node-readability](https://ghub.io/node-readability): Turning any web page into a clean view. - [vfile-reporter-json](https://ghub.io/vfile-reporter-json): JSON reporter for virtual files @@ -186,3 +185,7 @@ npm run test ## License This project is licensed under the GNU GENERAL PUBLIC LICENSE Version 3 - see the [LICENSE](LICENSE) file for details + +## Notes + +Due to [node-readability](https://github.com/luin/readability) being stale, I have imported the relevant functions into this project and refactored it so it doesn't use [request](https://github.com/request/request) and therefore has no vulnerabilities. diff --git a/helpers.js b/helpers.js new file mode 100644 index 0000000..9f286a5 --- /dev/null +++ b/helpers.js @@ -0,0 +1,673 @@ +module.exports.capitalizeFirstLetter = function (string) { + return string.charAt(0).toUpperCase() + string.slice(1) +} + +module.exports.toTitleCase = function (str) { + return str.replace(/\w\S*/g, function (txt) { + return txt.charAt(0).toUpperCase() + txt.substr(1).toLowerCase() + }) +} + +// All of the regular expressions in use within readability. 
// Table of the regular expressions shared by the helpers in this module.
// Several patterns carry the `g` flag, so `.test()` / `.exec()` on them is
// stateful via `lastIndex`.
const regexps = {
  unlikelyCandidatesRe: /combx|modal|comment|disqus|foot|header|menu|meta|nav|rss|shoutbox|sidebar|sponsor|social|teaserlist|time|tweet|twitter/i,
  okMaybeItsACandidateRe: /and|article|body|column|main|story|entry|^post/im,
  positiveRe: /article|body|content|entry|hentry|page|pagination|post|section|chapter|description|main|blog|text/i,
  negativeRe: /combx|comment|contact|foot|footer|footnote|link|media|meta|promo|related|scroll|shoutbox|sponsor|utility|tags|widget/i,
  divToPElementsRe: /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
  // Two or more consecutive <br> tags (with any attributes), each optionally
  // followed by whitespace. The leading `<br[^>` is required: without it the
  // pattern only matches runs of `]` followed by `>`.
  replaceBrsRe: /(<br[^>]*>[ \n\r\t]*){2,}/gi,
  replaceFontsRe: /<(\/?)font[^>]*>/gi,
  trimRe: /^\s+|\s+$/g,
  normalizeRe: /\s{2,}/g,
  // One or more <br> / <br/> tags, each optionally followed by whitespace or
  // `&nbsp;` entities. Without the `<br\s*\/?>` anchor the pattern can match
  // the empty string at every position.
  killBreaksRe: /(<br\s*\/?>(\s|&nbsp;?)*){1,}/g,
  // NOTE(review): only plain `http://` URLs match here; confirm whether
  // `https://` embeds are expected to match as well.
  videoRe: /http:\/\/(www\.)?(youtube|vimeo|youku|tudou|56|yinyuetai)\.com/i,
  attributeRe: /blog|post|article/i
}

// `debug` is declared without a value, so when `dbg` is computed at module
// load it is always the no-op function rather than `console.log`.
let debug
const dbg = debug ? console.log : function () {}

// Module-level rule list; starts empty and is replaced by setCleanRules().
let cleanRules = []

/**
 * Replace the module-level clean rules.
 *
 * @param {*} rules value stored as-is (no copy, no validation)
 * @return void
 **/
module.exports.setCleanRules = function (rules) {
  cleanRules = rules
}

// Lead-in documentation for prepDocument (defined after this block):
// Prepare the HTML document for readability to scrape it.
// This includes things like stripping javascript, CSS, and handling terrible markup.
+ * + * @return void + **/ +module.exports.prepDocument = function (document) { + const frames = document.getElementsByTagName('frame') + if (frames.length > 0) { + let bestFrame = null + let bestFrameSize = 0 + + Array.prototype.slice.call(frames, 0).forEach(function (frame) { + const frameSize = frame.offsetWidth + frame.offsetHeight + let canAccessFrame = false + try { + if (frame.contentWindow.document.body) { + canAccessFrame = true + } + } catch (e) {} + + if (canAccessFrame && frameSize > bestFrameSize) { + bestFrame = frame + bestFrameSize = frameSize + } + }) + + if (bestFrame) { + const newBody = document.createElement('body') + newBody.innerHTML = bestFrame.contentWindow.document.body.innerHTML + newBody.style.overflow = 'scroll' + document.body = newBody + + const frameset = document.getElementsByTagName('frameset')[0] + if (frameset) { + frameset.parentNode.removeChild(frameset) + } + } + } + + // Strip out all