Skip to content

Commit

Permalink
v8.0.8
Browse files Browse the repository at this point in the history
- Decode content using detected charset
- Update dependencies
  - Update eslint config

Related issues: #386, #320
  • Loading branch information
ndaidong committed Apr 26, 2024
1 parent d2d3834 commit d616100
Show file tree
Hide file tree
Showing 11 changed files with 171 additions and 151 deletions.
2 changes: 0 additions & 2 deletions .eslintignore

This file was deleted.

121 changes: 0 additions & 121 deletions .eslintrc.json

This file was deleted.

128 changes: 128 additions & 0 deletions eslint.config.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
// eslint.config.js

import eslintjs from '@eslint/js'
import globals from 'globals'

/**
 * ESLint flat configuration for the project.
 *
 * The exported array is evaluated in order:
 *   1. a global-ignores entry (paths ESLint must never lint),
 *   2. ESLint's built-in recommended rule set,
 *   3. the project's own language options and rules.
 */
export default [
  {
    // An object whose ONLY key is `ignores` acts as a global ignore list in
    // flat config. When `ignores` shares an object with `rules` it merely
    // limits that one object, so the recommended rules would still be applied
    // to these paths. The trailing slash makes the patterns match directories.
    ignores: [
      'node_modules/',
      'storage/',
    ],
  },
  eslintjs.configs.recommended,
  {
    languageOptions: {
      ecmaVersion: 'latest',
      sourceType: 'module',
      // Sources run in Node.js and in browsers; tests run under Jest.
      globals: {
        ...globals.node,
        ...globals.browser,
        ...globals.jest,
        Intl: 'readonly',
      },
    },
    // NOTE(review): most of the purely stylistic rules below are marked as
    // deprecated in ESLint core; they still work, but confirm before upgrading
    // to a major version that removes them.
    rules: {
      'arrow-spacing': ['error', { 'before': true, 'after': true }],
      'block-spacing': ['error', 'always'],
      'brace-style': ['error', '1tbs', { 'allowSingleLine': true }],
      'camelcase': ['error', {
        'allow': ['^UNSAFE_'],
        'properties': 'never',
        'ignoreGlobals': true,
      }],
      'comma-dangle': ['error', {
        'arrays': 'always-multiline',
        'objects': 'always-multiline',
        'imports': 'never',
        'exports': 'never',
        'functions': 'never',
      }],
      'comma-spacing': ['error', { 'before': false, 'after': true }],
      'eol-last': 'error',
      // `== null` stays allowed as the idiom for "null or undefined".
      'eqeqeq': ['error', 'always', { 'null': 'ignore' }],
      'func-call-spacing': ['error', 'never'],
      'indent': [
        'error',
        2,
        {
          'MemberExpression': 1,
          'FunctionDeclaration': {
            'body': 1,
            'parameters': 2,
          },
          'SwitchCase': 1,
          'ignoredNodes': ['TemplateLiteral > *'],
        },
      ],
      'key-spacing': ['error', { 'beforeColon': false, 'afterColon': true }],
      'keyword-spacing': ['error', { 'before': true, 'after': true }],
      'lines-between-class-members': ['error', 'always', { 'exceptAfterSingleLine': true }],
      'max-len': [
        'error',
        {
          'code': 120,
          'ignoreTrailingComments': true,
          'ignoreComments': true,
          'ignoreUrls': true,
        },
      ],
      'max-lines': [
        'error',
        {
          'max': 360,
          'skipBlankLines': true,
          'skipComments': false,
        },
      ],
      'max-lines-per-function': [
        'error',
        {
          'max': 150,
          'skipBlankLines': true,
        },
      ],
      'max-params': ['error', 3],
      'no-array-constructor': 'error',
      'no-mixed-spaces-and-tabs': 'error',
      'no-multi-spaces': 'error',
      'no-multi-str': 'error',
      'no-multiple-empty-lines': [
        'error',
        {
          'max': 1,
          'maxEOF': 0,
        },
      ],
      // Forbid `with` statements and the `in` operator.
      'no-restricted-syntax': [
        'error',
        'WithStatement',
        'BinaryExpression[operator=\'in\']',
      ],
      'no-trailing-spaces': 'error',
      'no-use-before-define': [
        'error',
        {
          'functions': true,
          'classes': true,
          'variables': false,
        },
      ],
      'no-var': 'warn',
      'object-curly-spacing': ['error', 'always'],
      'padded-blocks': [
        'error',
        {
          'blocks': 'never',
          'switches': 'never',
          'classes': 'never',
        },
      ],
      'quotes': ['error', 'single'],
      'space-before-blocks': ['error', 'always'],
      'space-before-function-paren': ['error', 'always'],
      'space-infix-ops': 'error',
      'space-unary-ops': ['error', { 'words': true, 'nonwords': false }],
      'space-in-parens': ['error', 'never'],
      'semi': ['error', 'never'],
    },
  },
]
15 changes: 6 additions & 9 deletions eval.js
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ import { readFileSync, writeFileSync, existsSync } from 'node:fs'
import { slugify } from 'bellajs'

import { isValid as isValidUrl } from './src/utils/linker.js'
import { extractFromHtml } from './src/main.js'
import { extract, extractFromHtml } from './src/main.js'

if (!existsSync('evaluation')) {
execSync('mkdir evaluation')
Expand All @@ -15,15 +15,12 @@ if (!existsSync('evaluation')) {
const extractFromUrl = async (url) => {
try {
console.time('extraction')
const res = await fetch(url)
const buffer = await res.arrayBuffer()
const decoder = new TextDecoder('iso-8859-1')
const html = decoder.decode(buffer)

const art = await extractFromHtml(html)
const art = await extract(url)
console.log(art)
const slug = slugify(art.title)
writeFileSync(`evaluation/${slug}.html`, art.content, 'utf8')
if (art) {
const slug = slugify(art.title)
writeFileSync(`evaluation/${slug}.html`, art.content, 'utf8')
}
console.timeEnd('extraction')
} catch (err) {
console.trace(err)
Expand Down
5 changes: 3 additions & 2 deletions package.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"version": "8.0.7",
"version": "8.0.8",
"name": "@extractus/article-extractor",
"description": "To extract main article from given URL",
"homepage": "https://github.com/extractus/article-extractor",
Expand Down Expand Up @@ -38,7 +38,8 @@
},
"devDependencies": {
"@types/sanitize-html": "^2.11.0",
"eslint": "^8.57.0",
"eslint": "^9.1.1",
"globals": "^15.0.0",
"https-proxy-agent": "^7.0.4",
"jest": "^29.7.0",
"nock": "^13.5.4"
Expand Down
10 changes: 7 additions & 3 deletions src/main.js
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import {

import retrieve from './utils/retrieve.js'
import parseFromHtml from './utils/parseFromHtml.js'
import { getCharset } from './utils/html.js'
import { isValid as isValidUrl } from './utils/linker.js'

export const extract = async (input, parserOptions = {}, fetchOptions = {}) => {
Expand All @@ -16,11 +17,14 @@ export const extract = async (input, parserOptions = {}, fetchOptions = {}) => {
if (!isValidUrl(input)) {
return parseFromHtml(input, null, parserOptions || {})
}
const html = await retrieve(input, fetchOptions)
if (!html) {
const buffer = await retrieve(input, fetchOptions)
const text = buffer ? Buffer.from(buffer).toString().trim() : ''
if (!text) {
return null
}

const charset = getCharset(text)
const decoder = new TextDecoder(charset)
const html = decoder.decode(buffer)
return parseFromHtml(html, input, parserOptions || {})
}

Expand Down
11 changes: 5 additions & 6 deletions src/utils/extractMetaData.js
Original file line number Diff line number Diff line change
Expand Up @@ -121,10 +121,10 @@ export default (html) => {
type: typeAttrs,
}

const document = new DOMParser().parseFromString(html, 'text/html')
entry.title = document.querySelector('head > title')?.innerText
const doc = new DOMParser().parseFromString(html, 'text/html')
entry.title = doc.querySelector('head > title')?.innerText

Array.from(document.getElementsByTagName('link')).forEach(node => {
Array.from(doc.getElementsByTagName('link')).forEach(node => {
const rel = node.getAttribute('rel')
const href = node.getAttribute('href')
if (rel && href) {
Expand All @@ -135,14 +135,13 @@ export default (html) => {
}
})

Array.from(document.getElementsByTagName('meta')).forEach(node => {
Array.from(doc.getElementsByTagName('meta')).forEach(node => {
const result = getMetaContentByNameOrProperty(node, attributeLists)
if (result) {
entry[result.key] = result.content
}
})

const entries = extractLdSchema(document, entry)

const entries = extractLdSchema(doc, entry)
return entries
}
11 changes: 11 additions & 0 deletions src/utils/html.js
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,17 @@ const stripMultispaces = (str) => {
return str.replace(WS_REGEXP, ' ').trim()
}

/**
 * Detect the character encoding declared inside an HTML document.
 *
 * Lookup order:
 *   1. `<meta charset="...">`
 *   2. the `charset` parameter of `<meta http-equiv="content-type" content="...">`
 *
 * The result is lower-cased and checked against `TextDecoder`, so the returned
 * value can always be passed to `new TextDecoder()` without throwing.
 *
 * @param {string} html - HTML source to inspect
 * @returns {string} lower-cased encoding label, or 'utf8' when nothing usable is declared
 */
export const getCharset = (html) => {
  const fallback = 'utf8'
  const doc = new DOMParser().parseFromString(html, 'text/html')

  let charset = doc.querySelector('meta[charset]')?.getAttribute('charset')?.trim() || ''
  if (!charset) {
    const content = doc.querySelector('meta[http-equiv="content-type"]')?.getAttribute('content') || ''
    // Locate the `charset` parameter wherever it sits in the content value,
    // regardless of letter case, surrounding spaces or optional quotes.
    charset = content.match(/charset\s*=\s*["']?([^"';\s]+)/i)?.[1] || ''
  }
  if (!charset) {
    return fallback
  }

  const label = charset.toLowerCase()
  try {
    // The constructor throws a RangeError for labels the runtime cannot decode;
    // fall back to UTF-8 instead of handing callers an unusable label.
    new TextDecoder(label)
  } catch {
    return fallback
  }
  return label
}

export const cleanify = (inputHtml) => {
const doc = new DOMParser().parseFromString(inputHtml, 'text/html')
const html = doc.documentElement.innerHTML
Expand Down
Loading

0 comments on commit d616100

Please sign in to comment.