v8.0.8

- Decode content using detected charset
- Update dependencies
- Update eslint config

Related issues: #386, #320
extractus · Apr 26, 2024 · d616100 · d616100
1 parent d2d3834
commit d616100
Show file tree

Hide file tree

Showing 11 changed files with 171 additions and 151 deletions.
diff --git a/.eslintignore b/.eslintignore
diff --git a/.eslintrc.json b/.eslintrc.json
diff --git a/eslint.config.js b/eslint.config.js
@@ -0,0 +1,128 @@
+// eslint.config.js
+
+import eslintjs from '@eslint/js'
+import globals from 'globals'
+
+export default [
+  eslintjs.configs.recommended,
+  {
+    languageOptions: {
+      ecmaVersion: 'latest',
+      sourceType: 'module',
+      globals: {
+        ...globals.node,
+        ...globals.browser,
+        ...globals.jest,
+        Intl: 'readonly',
+      },
+    },
+    ignores: [
+      'node_modules',
+      'storage',
+    ],
+    rules: {
+      'arrow-spacing': ['error', { 'before': true, 'after': true }],
+      'block-spacing': ['error', 'always'],
+      'brace-style': ['error', '1tbs', { 'allowSingleLine': true }],
+      'camelcase': ['error', {
+        'allow': ['^UNSAFE_'],
+        'properties': 'never',
+        'ignoreGlobals': true,
+      }],
+      'comma-dangle': ['error', {
+        'arrays': 'always-multiline',
+        'objects': 'always-multiline',
+        'imports': 'never',
+        'exports': 'never',
+        'functions': 'never',
+      }],
+      'comma-spacing': ['error', { 'before': false, 'after': true }],
+      'eol-last': 'error',
+      'eqeqeq': ['error', 'always', { 'null': 'ignore' }],
+      'func-call-spacing': ['error', 'never'],
+      'indent': [
+        'error',
+        2,
+        {
+          'MemberExpression': 1,
+          'FunctionDeclaration': {
+            'body': 1,
+            'parameters': 2,
+          },
+          'SwitchCase': 1,
+          'ignoredNodes': ['TemplateLiteral > *'],
+        },
+      ],
+      'key-spacing': ['error', { 'beforeColon': false, 'afterColon': true }],
+      'keyword-spacing': ['error', { 'before': true, 'after': true }],
+      'lines-between-class-members': ['error', 'always', { 'exceptAfterSingleLine': true }],
+      'max-len': [
+        'error',
+        {
+          'code': 120,
+          'ignoreTrailingComments': true,
+          'ignoreComments': true,
+          'ignoreUrls': true,
+        },
+      ],
+      'max-lines': [
+        'error',
+        {
+          'max': 360,
+          'skipBlankLines': true,
+          'skipComments': false,
+        },
+      ],
+      'max-lines-per-function': [
+        'error',
+        {
+          'max': 150,
+          'skipBlankLines': true,
+        },
+      ],
+      'max-params': ['error', 3],
+      'no-array-constructor': 'error',
+      'no-mixed-spaces-and-tabs': 'error',
+      'no-multi-spaces': 'error',
+      'no-multi-str': 'error',
+      'no-multiple-empty-lines': [
+        'error',
+        {
+          'max': 1,
+          'maxEOF': 0,
+        },
+      ],
+      'no-restricted-syntax': [
+        'error',
+        'WithStatement',
+        'BinaryExpression[operator=\'in\']',
+      ],
+      'no-trailing-spaces': 'error',
+      'no-use-before-define': [
+        'error',
+        {
+          'functions': true,
+          'classes': true,
+          'variables': false,
+        },
+      ],
+      'no-var': 'warn',
+      'object-curly-spacing': ['error', 'always'],
+      'padded-blocks': [
+        'error',
+        {
+          'blocks': 'never',
+          'switches': 'never',
+          'classes': 'never',
+        },
+      ],
+      'quotes': ['error', 'single'],
+      'space-before-blocks': ['error', 'always'],
+      'space-before-function-paren': ['error', 'always'],
+      'space-infix-ops': 'error',
+      'space-unary-ops': ['error', { 'words': true, 'nonwords': false }],
+      'space-in-parens': ['error', 'never'],
+      'semi': ['error', 'never'],
+    },
+  },
+]
diff --git a/eval.js b/eval.js
@@ -6,7 +6,7 @@ import { readFileSync, writeFileSync, existsSync } from 'node:fs'
 import { slugify } from 'bellajs'
 
 import { isValid as isValidUrl } from './src/utils/linker.js'
-import { extractFromHtml } from './src/main.js'
+import { extract, extractFromHtml } from './src/main.js'
 
 if (!existsSync('evaluation')) {
   execSync('mkdir evaluation')
@@ -15,15 +15,12 @@ if (!existsSync('evaluation')) {
 const extractFromUrl = async (url) => {
   try {
     console.time('extraction')
-    const res = await fetch(url)
-    const buffer = await res.arrayBuffer()
-    const decoder = new TextDecoder('iso-8859-1')
-    const html = decoder.decode(buffer)
-
-    const art = await extractFromHtml(html)
+    const art = await extract(url)
     console.log(art)
-    const slug = slugify(art.title)
-    writeFileSync(`evaluation/${slug}.html`, art.content, 'utf8')
+    if (art) {
+      const slug = slugify(art.title)
+      writeFileSync(`evaluation/${slug}.html`, art.content, 'utf8')
+    }
     console.timeEnd('extraction')
   } catch (err) {
     console.trace(err)

diff --git a/package.json b/package.json
@@ -1,5 +1,5 @@
 {
-  "version": "8.0.7",
+  "version": "8.0.8",
   "name": "@extractus/article-extractor",
   "description": "To extract main article from given URL",
   "homepage": "https://github.com/extractus/article-extractor",
@@ -38,7 +38,8 @@
   },
   "devDependencies": {
     "@types/sanitize-html": "^2.11.0",
-    "eslint": "^8.57.0",
+    "eslint": "^9.1.1",
+    "globals": "^15.0.0",
     "https-proxy-agent": "^7.0.4",
     "jest": "^29.7.0",
     "nock": "^13.5.4"

diff --git a/src/main.js b/src/main.js
@@ -6,6 +6,7 @@ import {
 
 import retrieve from './utils/retrieve.js'
 import parseFromHtml from './utils/parseFromHtml.js'
+import { getCharset } from './utils/html.js'
 import { isValid as isValidUrl } from './utils/linker.js'
 
 export const extract = async (input, parserOptions = {}, fetchOptions = {}) => {
@@ -16,11 +17,14 @@ export const extract = async (input, parserOptions = {}, fetchOptions = {}) => {
   if (!isValidUrl(input)) {
     return parseFromHtml(input, null, parserOptions || {})
   }
-  const html = await retrieve(input, fetchOptions)
-  if (!html) {
+  const buffer = await retrieve(input, fetchOptions)
+  const text = buffer ? Buffer.from(buffer).toString().trim() : ''
+  if (!text) {
     return null
   }
-
+  const charset = getCharset(text)
+  const decoder = new TextDecoder(charset)
+  const html = decoder.decode(buffer)
   return parseFromHtml(html, input, parserOptions || {})
 }
 

diff --git a/src/utils/extractMetaData.js b/src/utils/extractMetaData.js
@@ -121,10 +121,10 @@ export default (html) => {
     type: typeAttrs,
   }
 
-  const document = new DOMParser().parseFromString(html, 'text/html')
-  entry.title = document.querySelector('head > title')?.innerText
+  const doc = new DOMParser().parseFromString(html, 'text/html')
+  entry.title = doc.querySelector('head > title')?.innerText
 
-  Array.from(document.getElementsByTagName('link')).forEach(node => {
+  Array.from(doc.getElementsByTagName('link')).forEach(node => {
     const rel = node.getAttribute('rel')
     const href = node.getAttribute('href')
     if (rel && href) {
@@ -135,14 +135,13 @@ export default (html) => {
     }
   })
 
-  Array.from(document.getElementsByTagName('meta')).forEach(node => {
+  Array.from(doc.getElementsByTagName('meta')).forEach(node => {
     const result = getMetaContentByNameOrProperty(node, attributeLists)
     if (result) {
       entry[result.key] = result.content
     }
   })
 
-  const entries = extractLdSchema(document, entry)
-
+  const entries = extractLdSchema(doc, entry)
   return entries
 }
diff --git a/src/utils/html.js b/src/utils/html.js
@@ -28,6 +28,17 @@ const stripMultispaces = (str) => {
   return str.replace(WS_REGEXP, ' ').trim()
 }
 
+export const getCharset = (html) => {
+  const doc = new DOMParser().parseFromString(html, 'text/html')
+  const m = doc.querySelector('meta[charset]') || null
+  let charset = m ? m.getAttribute('charset') : ''
+  if (!charset) {
+    const h = doc.querySelector('meta[http-equiv="content-type"]') || null
+    charset = h ? h.getAttribute('content')?.split(';')[1]?.replace('charset=', '')?.trim() : ''
+  }
+  return charset?.toLowerCase() || 'utf8'
+}
+
 export const cleanify = (inputHtml) => {
   const doc = new DOMParser().parseFromString(inputHtml, 'text/html')
   const html = doc.documentElement.innerHTML