Skip to content

Commit

Permalink
Merge pull request #22 from fmacpro/development
Browse files Browse the repository at this point in the history
Development
  • Loading branch information
fmacpro authored Sep 5, 2019
2 parents a055d8c + 87ad27b commit 9c5ef77
Show file tree
Hide file tree
Showing 4 changed files with 3,808 additions and 5,956 deletions.
74 changes: 45 additions & 29 deletions index.js
Original file line number Diff line number Diff line change
Expand Up @@ -75,8 +75,13 @@ const articleParser = async function (options, socket) {

const page = await browser.newPage()

// Inject jQuery - https://stackoverflow.com/a/50598512
const jquery = await page.evaluate(() => window.fetch('https://cdnjs.cloudflare.com/ajax/libs/jquery/3.4.1/jquery.min.js').then((res) => res.text()))

const response = await page.goto(options.url)

await page.evaluate(jquery)

socket.emit('parse:status', 'Fetching ' + options.url)

// Evaluate status
Expand All @@ -96,6 +101,7 @@ const articleParser = async function (options, socket) {
const protocol = pathArray[0]
const host = pathArray[2]

article.host = host
article.baseurl = protocol + '//' + host

// Evaluate title
Expand All @@ -109,8 +115,6 @@ const articleParser = async function (options, socket) {
}

// Evaluate meta
await page.addScriptTag({ url: 'https://code.jquery.com/jquery-3.2.1.min.js' })

socket.emit('parse:status', 'Evaluating Meta Data')

const meta = await page.evaluate(() => {
Expand Down Expand Up @@ -157,11 +161,45 @@ const articleParser = async function (options, socket) {
// Body Content Identification
socket.emit('parse:status', 'Evaluating Content')

const content = await contentParser(html, options.readability)
if (typeof options.readability === 'undefined') {
options.readability = {}
}

const dom = new JSDOM(html)

await helpers.setCleanRules(options.readability.cleanRulers || [])
await helpers.prepDocument(dom.window.document)

// Turn relative links into absolute links
article.processed.html = await absolutify(content.content, article.baseurl)
article.title.text = content.title
// Title
article.title.text = await getTitle(dom.window.document)

let content = ''

// Content
if (article.host === 'twitter.com') { // Twitter Content
// Tweet
content = await page.evaluate(() => {
const j = window.$

j('.permalink-tweet-container .js-tweet-text-container .twitter-timeline-link').remove()

return j('.permalink-tweet-container .js-tweet-text-container').html()
})
} else if (article.host === 'www.youtube.com') { // Youtube Content
// Video Title
article.title.text = await page.evaluate(() => {
return window.ytInitialData.contents.twoColumnWatchNextResults.results.results.contents[0].videoPrimaryInfoRenderer.title.runs[0].text
})
// Video Description
content = await page.evaluate(() => {
return window.ytInitialData.contents.twoColumnWatchNextResults.results.results.contents[1].videoSecondaryInfoRenderer.description.runs[0].text
})
} else { // General Content
content = helpers.grabArticle(dom.window.document).innerHTML
}

// Turn relative links into absolute links & assign processed html
article.processed.html = await absolutify(content, article.baseurl)

// Get in article links
if (options.enabled.includes('links')) {
Expand Down Expand Up @@ -326,7 +364,7 @@ const getRawText = function (html, title, options) {
let rawText = htmlToText.fromString(html, options)

// Normalise
rawText = nlp(title + '\n\n' + rawText)
rawText = nlp(rawText)
rawText.normalize()
rawText = rawText.out('text')

Expand Down Expand Up @@ -401,22 +439,6 @@ const htmlCleaner = function (html, options) {
})
}

/**
 * Build a DOM from the supplied markup, run the shared clean-up helpers over
 * it, then extract the article title and body content.
 *
 * @param {string} html - Markup passed straight to the JSDOM constructor.
 * @param {Object} [options] - Settings object; only `cleanRulers` is read here.
 * @returns {Promise<Object>} Resolves to `{ title, content }`, holding whatever
 *   getTitle and getContent return for the parsed document.
 */
const contentParser = async function (html, options) {
  // Only a missing argument is defaulted; any other value is used as given.
  const settings = typeof options === 'undefined' ? {} : options

  const { document } = new JSDOM(html).window

  // Clean rules are installed before the document is prepared.
  await helpers.setCleanRules(settings.cleanRulers || [])
  await helpers.prepDocument(document)

  const content = await getContent(document)
  const title = await getTitle(document)

  return { title, content }
}

const keywordParser = function (html, options) {
return new Promise(function (resolve, reject) {
if (typeof options === 'undefined') {
Expand Down Expand Up @@ -483,12 +505,6 @@ const lighthouseAnalysis = async function (options, socket) {
return results.lhr
}

/**
 * Return the `innerHTML` of the node that helpers.grabArticle picks out of
 * the given document.
 *
 * @param {Object} document - Document handed through to helpers.grabArticle.
 * @returns {*} The `innerHTML` property of the grabbed node.
 */
const getContent = function (document) {
  return helpers.grabArticle(document).innerHTML
}

const getTitle = function (document) {
var title = findMetaTitle(document) || document.title
var betterTitle
Expand Down
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "horseman-article-parser",
"version": "0.6.0",
"version": "0.6.1",
"description": "Web Page Inspection Tool. Sentiment Analysis, Keyword Extraction, Named Entity Recognition & Spell Check",
"main": "index.js",
"scripts": {
Expand Down
3 changes: 2 additions & 1 deletion test.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,9 @@ const parser = require('./index.js')
const fs = require('fs')

const options = {
url: 'https://www.theguardian.com/politics/2018/sep/24/theresa-may-calls-for-immigration-based-on-skills-and-wealth',
url: 'https://en.wikipedia.org/wiki/Act_of_Parliament_(UK)',
enabled: ['lighthouse', 'screenshot', 'links', 'sentiment', 'entities', 'spelling', 'keywords']
//enabled: ['links', 'sentiment', 'entities', 'spelling', 'keywords']
}

parser.parseArticle(options)
Expand Down
Loading

0 comments on commit 9c5ef77

Please sign in to comment.