Skip to content

Commit

Permalink
Merge pull request #110 from extractus/7.0.5
Browse files Browse the repository at this point in the history
v7.0.5
  • Loading branch information
ndaidong authored Aug 24, 2023
2 parents e228331 + 640381d commit 11e373c
Show file tree
Hide file tree
Showing 10 changed files with 324 additions and 15 deletions.
4 changes: 2 additions & 2 deletions .eslintrc.json
Original file line number Diff line number Diff line change
Expand Up @@ -62,15 +62,15 @@
"max-lines": [
"error",
{
"max": 460,
"max": 520,
"skipBlankLines": true,
"skipComments": false
}
],
"max-lines-per-function": [
"error",
{
"max": 150,
"max": 240,
"skipBlankLines": true
}
],
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,7 @@ URL of a valid feed source
Feed content must be accessible and conform to one of the following standards:

- [RSS Feed](https://www.rssboard.org/rss-specification)
- [RDF Feed](https://web.resource.org/rss/1.0/spec)
- [ATOM Feed](https://datatracker.ietf.org/doc/html/rfc5023)
- [JSON Feed](https://www.jsonfeed.org/version/1.1/)

Expand Down
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"version": "7.0.4",
"version": "7.0.5",
"name": "@extractus/feed-extractor",
"description": "To read and normalize RSS/ATOM/JSON feed data",
"homepage": "https://extractor-demos.pages.dev",
Expand Down
8 changes: 6 additions & 2 deletions src/main.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,11 @@
import { isValid as isValidUrl } from './utils/linker.js'

import retrieve from './utils/retrieve.js'
import { validate, xml2obj, isRSS, isAtom } from './utils/xmlparser.js'
import { validate, xml2obj, isRSS, isAtom, isRdf } from './utils/xmlparser.js'
import parseJsonFeed from './utils/parseJsonFeed.js'
import parseRssFeed from './utils/parseRssFeed.js'
import parseAtomFeed from './utils/parseAtomFeed.js'
import parseRdfFeed from './utils/parseRdfFeed.js'

const getopt = (options = {}) => {
const {
Expand Down Expand Up @@ -42,11 +43,14 @@ export const extractFromXml = (xml, options = {}) => {
const opts = getopt(options)

const data = xml2obj(xml, opts.xmlParserOptions)

return isRSS(data)
? parseRssFeed(data, opts)
: isAtom(data)
? parseAtomFeed(data, opts)
: null
: isRdf(data)
? parseRdfFeed(data, opts)
: null
}

export const extract = async (url, options = {}, fetchOptions = {}) => {
Expand Down
63 changes: 61 additions & 2 deletions src/main.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,30 @@ describe('test extract() standard feed', () => {
expect(validateProps(result.entries[0])).toBe(true)
})

// Verifies end-to-end RDF parsing over a mocked HTTP response, and that
// values returned by getExtraFeedFields / getExtraEntryFields are merged
// into the feed and entry objects respectively.
// NOTE(review): the mock path is '/atom' but the fixture is RDF — works
// because nock matches the URL only, though a '/rdf' path would read better.
test('extract rdf feed from Slashdot with extraFields', async () => {
const url = 'https://some-news-page.tld/atom'
const xml = readFileSync('test-data/rdf-standard.xml', 'utf8')
const { baseUrl, path } = parseUrl(url)
nock(baseUrl).get(path).reply(200, xml, {
'Content-Type': 'application/xml',
})
const result = await extract(url, {
// pull a Dublin Core field off the raw channel into the feed result
getExtraFeedFields: data => {
return {
subject: data['dc:subject'],
}
},
// pull dc:creator off each raw item into the entry result
getExtraEntryFields: data => {
return {
author: data['dc:creator'],
}
},
})
expect(hasProperty(result, 'subject')).toBe(true)
expect(hasProperty(result.entries[0], 'author')).toBe(true)
expect(validateProps(result.entries[0])).toBe(true)
})

test('extract atom feed which contains multi links', async () => {
const url = 'https://some-news-page.tld/atom/multilinks'
const xml = readFileSync('test-data/atom-multilinks.xml', 'utf8')
Expand Down Expand Up @@ -291,6 +315,22 @@ describe('test extract() without normalization', () => {
expect(hasProperty(result.item, 'guid')).toBe(true)
})

// With `normalization: false` the extractor returns the raw rdf:RDF object
// (via flatten), so namespaced properties like syn:updateBase, dc:rights and
// slash:department survive untouched, and items live under `item` (a sibling
// of `channel`, per the RDF/RSS 1.0 layout) rather than under `entries`.
test('extract rdf feed from Slashdot without normalization', async () => {
const url = 'https://some-news-page.tld/atom'
const xml = readFileSync('test-data/rdf-standard.xml', 'utf8')
const { baseUrl, path } = parseUrl(url)
nock(baseUrl).get(path).reply(200, xml, {
'Content-Type': 'application/xml',
})
const result = await extract(url, {
normalization: false,
})
expect(hasProperty(result.channel, 'syn:updateBase')).toBe(true)
expect(hasProperty(result.channel, 'dc:rights')).toBe(true)
expect(hasProperty(result, 'item')).toBe(true)
expect(hasProperty(result.item[0], 'slash:department')).toBe(true)
})

test('extract atom feed from Google', async () => {
const url = 'https://some-news-page.tld/atom'
const xml = readFileSync('test-data/atom-feed-standard-realworld.xml', 'utf8')
Expand Down Expand Up @@ -358,7 +398,7 @@ describe('test extract() without normalization', () => {
})

describe('test extract with `baseUrl` option', () => {
test('extract rss feed with xml', () => {
test('extract rss feed from file', () => {
const baseUrl = 'https://huggingface.co'
const xml = readFileSync('test-data/rss-feed-miss-base-url.xml', 'utf8')
const result = extractFromXml(xml, { baseUrl })
Expand All @@ -376,7 +416,26 @@ describe('test extract with `baseUrl` option', () => {
expect(result.entries[0].link).toBe(baseUrl + '/blog/intro-graphml')
})

test('extract rss feed with json', () => {
// Parses the RDF fixture directly with extractFromXml and checks that the
// normalized result carries all standard feed/entry attributes, and that a
// relative feed link is resolved against the provided `baseUrl`.
test('extract rdf feed from file', () => {
const baseUrl = 'https://slashdot.org'
const xml = readFileSync('test-data/rdf-standard.xml', 'utf8')
const result = extractFromXml(xml, { baseUrl })

feedAttrs.forEach((k) => {
expect(hasProperty(result, k)).toBe(true)
})

entryAttrs.forEach((k) => {
expect(hasProperty(result.entries[0], k)).toBe(true)
})

expect(validateProps(result.entries[0])).toBe(true)
expect(result.link).toBe(baseUrl + '/')
// entry links in the fixture are already absolute, so baseUrl is not applied
const firstItemLink = result.entries[0].link
expect(firstItemLink.startsWith('https://tech.slashdot.org/story/23/08/23/2238246/spacex-')).toBe(true)
})

test('extract json feed from file', () => {
const baseUrl = 'https://www.jsonfeed.org'
const json = readFileSync('test-data/json-feed-miss-base-url.json', 'utf8')
const result = extractFromJson(JSON.parse(json), { baseUrl })
Expand Down
8 changes: 5 additions & 3 deletions src/utils/parseAtomFeed.js
Original file line number Diff line number Diff line change
Expand Up @@ -98,8 +98,10 @@ const parseAtom = (data, options = {}) => {
getExtraFeedFields,
} = options

const feedData = data.feed

if (!normalization) {
return flatten(data.feed, baseUrl)
return flatten(feedData, baseUrl)
}

const {
Expand All @@ -111,9 +113,9 @@ const parseAtom = (data, options = {}) => {
language = '',
updated = '',
entry: item = [],
} = data.feed
} = feedData

const extraFields = getExtraFeedFields(data.feed)
const extraFields = getExtraFeedFields(feedData)

const items = isArray(item) ? item : [item]

Expand Down
129 changes: 129 additions & 0 deletions src/utils/parseRdfFeed.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
// parseRdfFeed.js

// specs: https://web.resource.org/rss/1.0/spec

import { isArray } from 'bellajs'

import {
getText,
toISODateString,
buildDescription,
getPureUrl,
getEntryId
} from './normalizer.js'

// Map one raw RDF <item> onto the normalized entry shape shared by all
// parsers (id / title / link / published / description, plus any extras
// supplied by the caller's getExtraEntryFields hook, which win on conflict).
const transform = (item, options) => {
  const {
    useISODateFormat,
    descriptionMaxLen,
    baseUrl,
    getExtraEntryFields,
  } = options

  const {
    guid = '',
    title = '',
    link = '',
    'dc:date': pubDate = '',
    description = '',
    'content:encoded': content = '',
  } = item

  // dc:date is the Dublin Core publication date used by RDF feeds
  const published = useISODateFormat ? toISODateString(pubDate) : pubDate
  const htmlContent = getText(description || content)

  const baseEntry = {
    id: getEntryId(guid, link, pubDate),
    title: getText(title),
    link: getPureUrl(link, guid, baseUrl),
    published,
    description: buildDescription(description || htmlContent, descriptionMaxLen),
  }

  // merge extras last so user-provided fields override the defaults
  return Object.assign({}, baseEntry, getExtraEntryFields(item))
}

// Lightly clean a raw RDF feed object for the `normalization: false` path:
// text-extract titles and absolutize links, but keep every other raw
// (namespaced) property intact. A single <item> stays an object; a list
// stays a list.
const flatten = (feed, baseUrl) => {
  const {
    title = '',
    link = '',
    item,
  } = feed

  // NOTE(review): if the feed has no <item> at all, `[item]` yields
  // [undefined] and the destructuring below would throw — assumes the
  // fixture/spec guarantees at least one item; confirm upstream.
  const items = isArray(item) ? item : [item]
  const entries = items.map((entry) => {
    const {
      id,
      title: entryTitle = '',
      link: entryLink = '',
    } = entry
    return {
      ...entry,
      title: getText(entryTitle),
      link: getPureUrl(entryLink, id, baseUrl),
    }
  })

  return {
    ...feed,
    title: getText(title),
    // Fix: pass an explicit empty id so baseUrl reaches the third
    // parameter of getPureUrl(url, id, baseUrl) — previously baseUrl was
    // passed in the id slot (cf. the 3-arg call in parseRdf) and relative
    // feed-level links were never resolved against baseUrl.
    link: getPureUrl(link, '', baseUrl),
    item: isArray(item) ? entries : entries[0],
  }
}

// Parse an RDF (RSS 1.0) document, e.g. Slashdot's feed.
// With `normalization: false` the raw rdf:RDF object is returned via
// flatten(); otherwise channel metadata and items are mapped onto the
// common normalized feed shape.
const parseRdf = (data, options = {}) => {
  const {
    normalization,
    baseUrl,
    getExtraFeedFields,
    useISODateFormat,
  } = options

  const feedData = data['rdf:RDF']

  if (!normalization) {
    return flatten(feedData, baseUrl)
  }

  const {
    title = '',
    link = '',
    description = '',
    generator = '',
    'dc:language': language = '',
    'dc:date': lastBuildDate = '',
  } = feedData.channel

  // In RDF, <item> elements are siblings of <channel>, not children of it
  const { item } = feedData

  const extraFields = getExtraFeedFields(feedData)

  const items = isArray(item) ? item : [item]

  const published = useISODateFormat
    ? toISODateString(lastBuildDate)
    : lastBuildDate

  return {
    title: getText(title),
    link: getPureUrl(link, '', baseUrl),
    description,
    language,
    generator,
    published,
    ...extraFields,
    entries: items.map((entry) => transform(entry, options)),
  }
}

// Public entry point: parse an RDF (RSS 1.0) feed object.
export default function parseRdfFeed (data, options = {}) {
  return parseRdf(data, options)
}
8 changes: 5 additions & 3 deletions src/utils/parseRssFeed.js
Original file line number Diff line number Diff line change
Expand Up @@ -103,8 +103,10 @@ const parseRss = (data, options = {}) => {
getExtraFeedFields,
} = options

const feedData = data.rss.channel

if (!normalization) {
return flatten(data.rss.channel, baseUrl)
return flatten(feedData, baseUrl)
}

const {
Expand All @@ -115,9 +117,9 @@ const parseRss = (data, options = {}) => {
language = '',
lastBuildDate = '',
item = [],
} = data.rss.channel
} = feedData

const extraFields = getExtraFeedFields(data.rss.channel)
const extraFields = getExtraFeedFields(feedData)

const items = isArray(item) ? item : [item]

Expand Down
8 changes: 6 additions & 2 deletions src/utils/xmlparser.js
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,19 @@ export const isAtom = (data = {}) => {
return hasProperty(data, 'feed') && hasProperty(data.feed, 'entry')
}

// An RDF (RSS 1.0) document has a top-level `rdf:RDF` element
// containing a `channel` element.
export const isRdf = (data = {}) => {
  if (!hasProperty(data, 'rdf:RDF')) {
    return false
  }
  return hasProperty(data['rdf:RDF'], 'channel')
}

// A feed payload is valid only when it is a non-empty string accepted by
// fast-xml-parser's validator. XMLValidator.validate returns an error
// object (not false) on invalid input, hence the strict `=== true` check.
export const validate = (xml) => {
  if (!isString(xml) || xml.length === 0) {
    return false
  }
  return XMLValidator.validate(xml) === true
}

export const xml2obj = (xml = '', extraOptions = {}) => {
const options = {
...extraOptions,
ignoreAttributes: false,
attributeNamePrefix: '@_',
ignoreAttributes: false,
...extraOptions,
}
const parser = new XMLParser(options)
const jsonObj = parser.parse(xml)
Expand Down
Loading

0 comments on commit 11e373c

Please sign in to comment.