diff --git a/.eslintignore b/.eslintignore deleted file mode 100644 index f06235c4..00000000 --- a/.eslintignore +++ /dev/null @@ -1,2 +0,0 @@ -node_modules -dist diff --git a/.eslintrc.json b/.eslintrc.json deleted file mode 100644 index 6afcb283..00000000 --- a/.eslintrc.json +++ /dev/null @@ -1,121 +0,0 @@ -{ - "parserOptions": { - "ecmaVersion": "latest", - "sourceType": "module" - }, - "env": { - "es6": true, - "node": true, - "browser": true, - "jest": true - }, - "globals": { - "globalThis": true - }, - "plugins": [], - "overrides": [], - "extends": ["eslint:recommended"], - "rules": { - "arrow-spacing": ["error", { "before": true, "after": true }], - "block-spacing": ["error", "always"], - "brace-style": ["error", "1tbs", { "allowSingleLine": true }], - "camelcase": ["error", { - "allow": ["^UNSAFE_"], - "properties": "never", - "ignoreGlobals": true - }], - "comma-dangle": ["error", { - "arrays": "always-multiline", - "objects": "always-multiline", - "imports": "never", - "exports": "never", - "functions": "never" - }], - "comma-spacing": ["error", { "before": false, "after": true }], - "eol-last": "error", - "eqeqeq": ["error", "always", { "null": "ignore" }], - "func-call-spacing": ["error", "never"], - "indent": [ - "error", - 2, - { - "MemberExpression": 1, - "FunctionDeclaration": { - "body": 1, - "parameters": 2 - }, - "SwitchCase": 1 - } - ], - "key-spacing": ["error", { "beforeColon": false, "afterColon": true }], - "keyword-spacing": ["error", { "before": true, "after": true }], - "lines-between-class-members": ["error", "always", { "exceptAfterSingleLine": true }], - "max-len": [ - "error", - { - "code": 120, - "ignoreTrailingComments": true, - "ignoreComments": true, - "ignoreUrls": true - } - ], - "max-lines": [ - "error", - { - "max": 360, - "skipBlankLines": true, - "skipComments": false - } - ], - "max-lines-per-function": [ - "error", - { - "max": 150, - "skipBlankLines": true - } - ], - "max-params": ["error", 3], - "no-array-constructor": "error", - "no-mixed-spaces-and-tabs": "error", - "no-multi-spaces": "error", - "no-multi-str": "error", - "no-multiple-empty-lines": [ - "error", - { - "max": 1, - "maxEOF": 0 - } - ], - "no-restricted-syntax": [ - "error", - "WithStatement", - "BinaryExpression[operator='in']" - ], - "no-trailing-spaces": "error", - "no-use-before-define": [ - "error", - { - "functions": true, - "classes": true, - "variables": false - } - ], - "no-var": "warn", - "object-curly-spacing": ["error", "always"], - "padded-blocks": [ - "error", - { - "blocks": "never", - "switches": "never", - "classes": "never" - } - ], - "quotes": ["error", "single"], - "space-before-blocks": ["error", "always"], - "space-before-function-paren": ["error", "always"], - "space-infix-ops": "error", - "space-unary-ops": ["error", { "words": true, "nonwords": false }], - "space-in-parens": ["error", "never"], - "semi": ["error", "never"] - } -} diff --git a/.github/workflows/ci-test.yml b/.github/workflows/ci-test.yml index e4e11759..b54eddc5 100644 --- a/.github/workflows/ci-test.yml +++ b/.github/workflows/ci-test.yml @@ -12,7 +12,7 @@ jobs: strategy: matrix: - node_version: [18.x, 20.x, 21.x] + node_version: [18.x, 20.x, 22.x] steps: - uses: actions/checkout@v4 diff --git a/eslint.config.js b/eslint.config.js new file mode 100644 index 00000000..0d8699e3 --- /dev/null +++ b/eslint.config.js @@ -0,0 +1,128 @@ +// eslint.config.js + +import eslintjs from '@eslint/js' +import globals from 'globals' + +export default [ + eslintjs.configs.recommended, + { + languageOptions: { + ecmaVersion: 'latest', + sourceType: 'module', + globals: { + ...globals.node, + ...globals.browser, + ...globals.jest, + Intl: 'readonly', + }, + }, + ignores: [ + 'node_modules', + 'storage', + ], + rules: { + 'arrow-spacing': ['error', { 'before': true, 'after': true }], + 'block-spacing': ['error', 'always'], + 'brace-style': ['error', '1tbs', { 'allowSingleLine': true }], + 'camelcase': ['error', { + 'allow': ['^UNSAFE_'], + 'properties': 'never', + 'ignoreGlobals': true, + }], + 'comma-dangle': ['error', { + 'arrays': 'always-multiline', + 'objects': 'always-multiline', + 'imports': 'never', + 'exports': 'never', + 'functions': 'never', + }], + 'comma-spacing': ['error', { 'before': false, 'after': true }], + 'eol-last': 'error', + 'eqeqeq': ['error', 'always', { 'null': 'ignore' }], + 'func-call-spacing': ['error', 'never'], + 'indent': [ + 'error', + 2, + { + 'MemberExpression': 1, + 'FunctionDeclaration': { + 'body': 1, + 'parameters': 2, + }, + 'SwitchCase': 1, + 'ignoredNodes': ['TemplateLiteral > *'], + }, + ], + 'key-spacing': ['error', { 'beforeColon': false, 'afterColon': true }], + 'keyword-spacing': ['error', { 'before': true, 'after': true }], + 'lines-between-class-members': ['error', 'always', { 'exceptAfterSingleLine': true }], + 'max-len': [ + 'error', + { + 'code': 120, + 'ignoreTrailingComments': true, + 'ignoreComments': true, + 'ignoreUrls': true, + }, + ], + 'max-lines': [ + 'error', + { + 'max': 360, + 'skipBlankLines': true, + 'skipComments': false, + }, + ], + 'max-lines-per-function': [ + 'error', + { + 'max': 150, + 'skipBlankLines': true, + }, + ], + 'max-params': ['error', 3], + 'no-array-constructor': 'error', + 'no-mixed-spaces-and-tabs': 'error', + 'no-multi-spaces': 'error', + 'no-multi-str': 'error', + 'no-multiple-empty-lines': [ + 'error', + { + 'max': 1, + 'maxEOF': 0, + }, + ], + 'no-restricted-syntax': [ + 'error', + 'WithStatement', + 'BinaryExpression[operator=\'in\']', + ], + 'no-trailing-spaces': 'error', + 'no-use-before-define': [ + 'error', + { + 'functions': true, + 'classes': true, + 'variables': false, + }, + ], + 'no-var': 'warn', + 'object-curly-spacing': ['error', 'always'], + 'padded-blocks': [ + 'error', + { + 'blocks': 'never', + 'switches': 'never', + 'classes': 'never', + }, + ], + 'quotes': ['error', 'single'], + 'space-before-blocks': ['error', 'always'], + 'space-before-function-paren': ['error', 'always'], + 'space-infix-ops': 'error', + 'space-unary-ops': ['error', { 'words': true, 'nonwords': false }], + 'space-in-parens': ['error', 'never'], + 'semi': ['error', 'never'], + }, + }, +] diff --git a/eval.js b/eval.js index e790b2d0..b98d7c91 100644 --- a/eval.js +++ b/eval.js @@ -6,7 +6,7 @@ import { readFileSync, writeFileSync, existsSync } from 'node:fs' import { slugify } from 'bellajs' import { isValid as isValidUrl } from './src/utils/linker.js' -import { extractFromHtml } from './src/main.js' +import { extract, extractFromHtml } from './src/main.js' if (!existsSync('evaluation')) { execSync('mkdir evaluation') @@ -15,15 +15,12 @@ if (!existsSync('evaluation')) { const extractFromUrl = async (url) => { try { console.time('extraction') - const res = await fetch(url) - const buffer = await res.arrayBuffer() - const decoder = new TextDecoder('iso-8859-1') - const html = decoder.decode(buffer) - - const art = await extractFromHtml(html) + const art = await extract(url) console.log(art) - const slug = slugify(art.title) - writeFileSync(`evaluation/${slug}.html`, art.content, 'utf8') + if (art) { + const slug = slugify(art.title) + writeFileSync(`evaluation/${slug}.html`, art.content, 'utf8') + } console.timeEnd('extraction') } catch (err) { console.trace(err) diff --git a/package.json b/package.json index caa793a9..defbd965 100644 --- a/package.json +++ b/package.json @@ -1,5 +1,5 @@ { - "version": "8.0.7", + "version": "8.0.8", "name": "@extractus/article-extractor", "description": "To extract main article from given URL", "homepage": "https://github.com/extractus/article-extractor", @@ -38,7 +38,8 @@ }, "devDependencies": { "@types/sanitize-html": "^2.11.0", - "eslint": "^8.57.0", + "eslint": "^9.1.1", + "globals": "^15.0.0", "https-proxy-agent": "^7.0.4", "jest": "^29.7.0", "nock": "^13.5.4" diff --git a/src/main.js b/src/main.js index 1ffb8ea8..25b0b5bd 100644 --- a/src/main.js +++ b/src/main.js @@ -6,6 +6,7 @@ import { import retrieve from './utils/retrieve.js' import parseFromHtml from './utils/parseFromHtml.js' +import { getCharset } from './utils/html.js' import { isValid as isValidUrl } from './utils/linker.js' export const extract = async (input, parserOptions = {}, fetchOptions = {}) => { @@ -16,11 +17,14 @@ export const extract = async (input, parserOptions = {}, fetchOptions = {}) => { if (!isValidUrl(input)) { return parseFromHtml(input, null, parserOptions || {}) } - const html = await retrieve(input, fetchOptions) - if (!html) { + const buffer = await retrieve(input, fetchOptions) + const text = buffer ? Buffer.from(buffer).toString().trim() : '' + if (!text) { return null } - + const charset = getCharset(text) + const decoder = new TextDecoder(charset) + const html = decoder.decode(buffer) return parseFromHtml(html, input, parserOptions || {}) } diff --git a/src/utils/extractMetaData.js b/src/utils/extractMetaData.js index ffc4cd74..ede5f9b2 100644 --- a/src/utils/extractMetaData.js +++ b/src/utils/extractMetaData.js @@ -121,10 +121,10 @@ export default (html) => { type: typeAttrs, } - const document = new DOMParser().parseFromString(html, 'text/html') - entry.title = document.querySelector('head > title')?.innerText + const doc = new DOMParser().parseFromString(html, 'text/html') + entry.title = doc.querySelector('head > title')?.innerText - Array.from(document.getElementsByTagName('link')).forEach(node => { + Array.from(doc.getElementsByTagName('link')).forEach(node => { const rel = node.getAttribute('rel') const href = node.getAttribute('href') if (rel && href) { @@ -135,14 +135,13 @@ export default (html) => { } }) - Array.from(document.getElementsByTagName('meta')).forEach(node => { + Array.from(doc.getElementsByTagName('meta')).forEach(node => { const result = getMetaContentByNameOrProperty(node, attributeLists) if (result) { entry[result.key] = result.content } }) - const entries = extractLdSchema(document, entry) - + const entries = extractLdSchema(doc, entry) return entries } diff --git a/src/utils/html.js b/src/utils/html.js index 9c3248e8..b602344f 100644 --- a/src/utils/html.js +++ b/src/utils/html.js @@ -28,6 +28,17 @@ const stripMultispaces = (str) => { return str.replace(WS_REGEXP, ' ').trim() } +export const getCharset = (html) => { + const doc = new DOMParser().parseFromString(html, 'text/html') + const m = doc.querySelector('meta[charset]') || null + let charset = m ? m.getAttribute('charset') : '' + if (!charset) { + const h = doc.querySelector('meta[http-equiv="content-type"]') || null + charset = h ? h.getAttribute('content')?.split(';')[1]?.replace('charset=', '')?.trim() : '' + } + return charset?.toLowerCase() || 'utf8' +} + export const cleanify = (inputHtml) => { const doc = new DOMParser().parseFromString(inputHtml, 'text/html') const html = doc.documentElement.innerHTML diff --git a/src/utils/linker.js b/src/utils/linker.js index 9cbe837b..3c1a70f0 100644 --- a/src/utils/linker.js +++ b/src/utils/linker.js @@ -8,7 +8,7 @@ export const isValid = (url = '') => { try { const ourl = new URL(url) return ourl !== null && ourl.protocol.startsWith('http') - } catch (err) { + } catch { return false } } @@ -22,7 +22,7 @@ export const absolutify = (fullUrl = '', relativeUrl = '') => { try { const result = new URL(relativeUrl, fullUrl) return result.toString() - } catch (err) { + } catch { return '' } } @@ -96,7 +96,7 @@ export const purify = (url) => { }) return pureUrl.toString().replace(pureUrl.hash, '') - } catch (err) { + } catch { return null } } diff --git a/src/utils/retrieve.js b/src/utils/retrieve.js index d493d753..6922a752 100644 --- a/src/utils/retrieve.js +++ b/src/utils/retrieve.js @@ -31,6 +31,6 @@ export default async (url, options = {}) => { if (status >= 400) { throw new Error(`Request failed with error code ${status}`) } - const text = await res.text() - return text.trim() + const buffer = await res.arrayBuffer() + return buffer } diff --git a/src/utils/retrieve.test.js b/src/utils/retrieve.test.js index 646a2c2c..3e69236c 100644 --- a/src/utils/retrieve.test.js +++ b/src/utils/retrieve.test.js @@ -27,7 +27,8 @@ describe('test retrieve() method', () => { nock(baseUrl).get(path).reply(200, '
this is content
', { 'Content-Type': 'text/html', }) - const html = await retrieve(url) + const buffer = await retrieve(url) + const html = Buffer.from(buffer).toString() expect(html).toEqual('
this is content
') }) @@ -37,7 +38,8 @@ describe('test retrieve() method', () => { nock(baseUrl).get(path).reply(200, '\n\r\r\n\n
this is content
\n\r\r\n\n', { 'Content-Type': 'text/html', }) - const html = await retrieve(url) + const buffer = await retrieve(url) + const html = Buffer.from(buffer).toString().trim() expect(html).toEqual('
this is content
') }) @@ -53,11 +55,12 @@ describe('test retrieve() method', () => { 'Content-Type': 'text/html', }) - const html = await retrieve(url, { + const buffer = await retrieve(url, { proxy: { target: 'https://proxy-server.com/api/proxy?url=', }, }) + const html = Buffer.from(buffer).toString() expect(html).toEqual('
this is content
') nock.cleanAll() })