Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

v8.0.9 #390

Merged
merged 2 commits into from
May 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 0 additions & 9 deletions deno.json

This file was deleted.

4 changes: 2 additions & 2 deletions examples/browser-article-parser/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
"start": "node server"
},
"dependencies": {
"express": "^4.18.2",
"got": "^14.2.0"
"express": "latest",
"got": "latest"
}
}
4 changes: 2 additions & 2 deletions examples/bun-article-parser/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,10 @@
"start": "bun run index.ts"
},
"devDependencies": {
"bun-types": "^1.0.26"
"bun-types": "latest"
},
"dependencies": {
"@extractus/article-extractor": "latest",
"hono": "^4.0.1"
"hono": "latest"
}
}
2 changes: 1 addition & 1 deletion examples/deno-article-parser/deno.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
"version": "1.0.0",
"imports": {
"serve": "https://deno.land/std/http/server.ts",
"hono": "https://deno.land/x/hono@v3.11.2/mod.ts",
"hono": "https://deno.land/x/hono/mod.ts",
"article-extractor": "https://esm.sh/@extractus/article-extractor"
},
"tasks": {
Expand Down
2 changes: 1 addition & 1 deletion examples/node-article-parser/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,6 @@
},
"dependencies": {
"@extractus/article-extractor": "latest",
"express": "^4.18.2"
"express": "latest"
}
}
19 changes: 19 additions & 0 deletions examples/pupperteer/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# node-article-parser with Puppeteer

Install dependencies:

```bash
npm i

# or pnpm, yarn
```

Start server:

```bash
npm start
```

Open `http://localhost:3100/?url=https://client-side-rendering.pages.dev/lorem-ipsum` to see the result.

---
64 changes: 64 additions & 0 deletions examples/pupperteer/index.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
import puppeteer from 'puppeteer'
import express from 'express'
import { extractFromHtml } from '@extractus/article-extractor'

// Express application instance; the route and listener below are registered on it.
const app = express()

// Static service descriptor: returned as-is when no `url` query parameter is
// supplied, and attached to every extraction response.
const meta = {
  service: 'article-parser-pupperteer',
  lang: 'javascript',
  server: 'express',
  platform: 'node'
}

/**
 * Render a page in a puppeteer-launched browser and return its HTML.
 *
 * The URL is validated before any browser is started: only absolute `http:`
 * and `https:` URLs are accepted. Other schemes (e.g. `file:`) are rejected
 * so that a request-supplied URL cannot be used to read local resources.
 * NOTE(review): this does not block requests to internal/private hosts; add a
 * host allow-list if this server is exposed to untrusted clients.
 *
 * @param {string} url - address of the page to render
 * @returns {Promise<string|null>} HTML of the rendered page, or `null` when
 *   the URL is rejected or rendering fails (the error is logged)
 */
const loadHtml = async (url) => {
  let browser = null
  try {
    // `new URL` throws on anything that is not an absolute URL; the catch
    // below turns that into the usual `null` result.
    const { protocol } = new URL(url)
    if (protocol !== 'http:' && protocol !== 'https:') {
      throw new Error(`Unsupported URL protocol "${protocol}"`)
    }
    console.log('Initialize puppeteer engine')
    browser = await puppeteer.launch()
    const page = await browser.newPage()
    // 6e4 ms = 60 s navigation timeout
    await page.setDefaultNavigationTimeout(6e4)
    console.log(`Start rendering target page "${url}"`)
    await page.goto(url, {
      waitUntil: 'networkidle0',
    })
    console.log(`Load html content from target page ${url}`)
    const html = await page.content()
    return html
  } catch (err) {
    console.error(err)
    return null
  } finally {
    // Always release the browser process, including on failure.
    if (browser) {
      await browser.close()
    }
  }
}

/**
 * GET / — extract the article found at the `url` query parameter.
 *
 * Without `url`, responds with the service descriptor `meta`.
 * Otherwise responds with `{ error, message, data, meta }`:
 * `error` is 0 with the extraction result in `data`, or 1 with `data: null`
 * when the page could not be loaded or extraction threw.
 */
app.get('/', async (req, res) => {
  const url = req.query.url
  if (!url) {
    return res.json(meta)
  }
  try {
    const html = await loadHtml(url)
    // loadHtml signals failure by returning null; report that explicitly
    // instead of passing null on to the extractor.
    if (!html) {
      throw new Error(`could not load html content from "${url}"`)
    }
    const data = await extractFromHtml(html, url)
    return res.json({
      error: 0,
      message: 'article has been extracted successfully',
      data,
      meta,
    })
  } catch (err) {
    return res.json({
      error: 1,
      message: err.message,
      data: null,
      meta,
    })
  }
})

// Start the HTTP server on port 3100 and log the address once it is listening.
const announceServer = () => {
  console.log('Server is running at http://localhost:3100')
}

app.listen(3100, announceServer)
14 changes: 14 additions & 0 deletions examples/pupperteer/package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
{
"name": "node-pupperteer",
"version": "1.0.0",
"main": "index.js",
"type": "module",
"scripts": {
"start": "node index.js"
},
"dependencies": {
"@extractus/article-extractor": "latest",
"express": "latest",
"puppeteer": "latest"
}
}
4 changes: 2 additions & 2 deletions examples/tsnode-article-parser/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,10 @@
"start": "node dist/index.js"
},
"devDependencies": {
"typescript": "^5.3.3"
"typescript": "latest"
},
"dependencies": {
"@extractus/article-extractor": "latest",
"express": "^4.18.2"
"express": "latest"
}
}
11 changes: 6 additions & 5 deletions package.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"version": "8.0.8",
"version": "8.0.9",
"name": "@extractus/article-extractor",
"description": "To extract main article from given URL",
"homepage": "https://github.com/extractus/article-extractor",
Expand All @@ -11,15 +11,16 @@
"main": "./src/main.js",
"type": "module",
"imports": {
"cross-fetch": "./src/deno/cross-fetch.js"
"cross-fetch": "./src/deno/cross-fetch.js",
"linkedom": "https://deno.land/x/[email protected]/deno-dom-wasm.ts"
},
"browser": {
"cross-fetch": "./src/deno/cross-fetch.js",
"linkedom": "./src/browser/linkedom.js"
},
"types": "./index.d.ts",
"engines": {
"node": ">= 16"
"node": ">= 18"
},
"scripts": {
"lint": "eslint .",
Expand All @@ -38,8 +39,8 @@
},
"devDependencies": {
"@types/sanitize-html": "^2.11.0",
"eslint": "^9.1.1",
"globals": "^15.0.0",
"eslint": "^9.2.0",
"globals": "^15.1.0",
"https-proxy-agent": "^7.0.4",
"jest": "^29.7.0",
"nock": "^13.5.4"
Expand Down
8 changes: 4 additions & 4 deletions src/utils/parseFromHtml.js
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,8 @@ const summarize = (desc, txt, threshold, maxlen) => { // eslint-disable-line
}

export default async (inputHtml, inputUrl = '', parserOptions = {}) => {
const html = purify(inputHtml)
const meta = extractMetaData(html)
const pureHtml = purify(inputHtml)
const meta = extractMetaData(pureHtml)

let title = meta.title

Expand All @@ -57,7 +57,7 @@ export default async (inputHtml, inputUrl = '', parserOptions = {}) => {

// gather title
if (!title) {
title = extractTitleWithReadability(html, inputUrl)
title = extractTitleWithReadability(pureHtml, inputUrl)
}
if (!title) {
return null
Expand Down Expand Up @@ -95,7 +95,7 @@ export default async (inputHtml, inputUrl = '', parserOptions = {}) => {
}
)

const content = fns(html)
const content = fns(inputHtml)

if (!content) {
return null
Expand Down
Loading