From 5ed7d021511e570ff12deeaeb65aff9d01a29ac4 Mon Sep 17 00:00:00 2001
From: Dong Nguyen
Date: Fri, 26 Apr 2024 14:40:58 +0700
Subject: [PATCH] v7.1.2 - Improve charset detection

---
 package.json          |  2 +-
 src/utils/retrieve.js | 25 ++++++++++++++++++++++++-
 2 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/package.json b/package.json
index b053f7e..a0ad1db 100755
--- a/package.json
+++ b/package.json
@@ -1,5 +1,5 @@
 {
-  "version": "7.1.1",
+  "version": "7.1.2",
   "name": "@extractus/feed-extractor",
   "description": "To read and normalize RSS/ATOM/JSON feed data",
   "homepage": "https://extractor-demos.pages.dev",
diff --git a/src/utils/retrieve.js b/src/utils/retrieve.js
index c17127d..2eee201 100755
--- a/src/utils/retrieve.js
+++ b/src/utils/retrieve.js
@@ -1,6 +1,7 @@
 // utils -> retrieve
 
 import fetch from 'cross-fetch'
+import { XMLParser } from 'fast-xml-parser'
 
 const profetch = async (url, options = {}) => {
   const { proxy = {}, signal = null } = options
@@ -15,6 +16,27 @@ const profetch = async (url, options = {}) => {
   return res
 }
 
+// Read the `encoding` attribute of the XML declaration on the first line of
+// `text`. Returns 'utf8' when there is no declaration, the line cannot be
+// parsed, or the declared label is one TextDecoder does not support.
+const getCharsetFromText = (text) => {
+  try {
+    // Rewrite `<?xml ... ?>` as a plain `<xml ... >` tag so that its
+    // attributes can be read from the `xml` key of the parsed object.
+    const firstLine = text.split('\n')[0].trim().replace('<?', '<').replace('?>', '>')
+    const parser = new XMLParser({
+      ignoreAttributes: false,
+    })
+    const { xml: root = {} } = parser.parse(firstLine)
+    const charset = root['@_encoding'] || 'utf8'
+    // TextDecoder throws a RangeError for unknown labels; probing here lets
+    // the catch below fall back to utf8 instead of failing the whole request.
+    return new TextDecoder(charset).encoding
+  } catch {
+    return 'utf8'
+  }
+}
+
 export default async (url, options = {}) => {
   const {
     headers = {
@@ -35,9 +57,10 @@ export default async (url, options = {}) => {
   const buffer = await res.arrayBuffer()
   const text = buffer ? Buffer.from(buffer).toString().trim() : ''
 
   if (/(\+|\/)(xml|html)/.test(contentType)) {
     const arr = contentType.split('charset=')
-    const charset = arr.length === 2 ? arr[1].trim() : 'utf8'
+    // Content-Type charset wins; otherwise use the document's own declaration.
+    const charset = arr.length === 2 ? arr[1].trim() : getCharsetFromText(text)
     const decoder = new TextDecoder(charset)
     const xml = decoder.decode(buffer)
     return { type: 'xml', text: xml.trim(), status, contentType }