Skip to content

Commit

Permalink
Merge pull request #110 from extractus/7.0.5
Browse files Browse the repository at this point in the history
v7.0.5
  • Loading branch information
ndaidong authored Aug 24, 2023
2 parents e228331 + 640381d commit 11e373c
Show file tree
Hide file tree
Showing 10 changed files with 324 additions and 15 deletions.
4 changes: 2 additions & 2 deletions .eslintrc.json
Original file line number Diff line number Diff line change
Expand Up @@ -62,15 +62,15 @@
"max-lines": [
"error",
{
"max": 460,
"max": 520,
"skipBlankLines": true,
"skipComments": false
}
],
"max-lines-per-function": [
"error",
{
"max": 150,
"max": 240,
"skipBlankLines": true
}
],
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,7 @@ URL of a valid feed source
Feed content must be accessible and conform to one of the following standards:

- [RSS Feed](https://www.rssboard.org/rss-specification)
- [RDF Feed](https://web.resource.org/rss/1.0/spec)
- [ATOM Feed](https://datatracker.ietf.org/doc/html/rfc5023)
- [JSON Feed](https://www.jsonfeed.org/version/1.1/)

Expand Down
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"version": "7.0.4",
"version": "7.0.5",
"name": "@extractus/feed-extractor",
"description": "To read and normalize RSS/ATOM/JSON feed data",
"homepage": "https://extractor-demos.pages.dev",
Expand Down
8 changes: 6 additions & 2 deletions src/main.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,11 @@
import { isValid as isValidUrl } from './utils/linker.js'

import retrieve from './utils/retrieve.js'
import { validate, xml2obj, isRSS, isAtom } from './utils/xmlparser.js'
import { validate, xml2obj, isRSS, isAtom, isRdf } from './utils/xmlparser.js'
import parseJsonFeed from './utils/parseJsonFeed.js'
import parseRssFeed from './utils/parseRssFeed.js'
import parseAtomFeed from './utils/parseAtomFeed.js'
import parseRdfFeed from './utils/parseRdfFeed.js'

const getopt = (options = {}) => {
const {
Expand Down Expand Up @@ -42,11 +43,14 @@ export const extractFromXml = (xml, options = {}) => {
const opts = getopt(options)

const data = xml2obj(xml, opts.xmlParserOptions)

return isRSS(data)
? parseRssFeed(data, opts)
: isAtom(data)
? parseAtomFeed(data, opts)
: null
: isRdf(data)
? parseRdfFeed(data, opts)
: null
}

export const extract = async (url, options = {}, fetchOptions = {}) => {
Expand Down
63 changes: 61 additions & 2 deletions src/main.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,30 @@ describe('test extract() standard feed', () => {
expect(validateProps(result.entries[0])).toBe(true)
})

// Verifies end-to-end RDF parsing over a mocked HTTP response, and that
// values returned by getExtraFeedFields / getExtraEntryFields are merged
// into the feed and entry objects respectively.
// NOTE(review): the mock path is '/atom' but the fixture is RDF — works
// because nock matches the URL only, though a '/rdf' path would read better.
test('extract rdf feed from Slashdot with extraFields', async () => {
const url = 'https://some-news-page.tld/atom'
const xml = readFileSync('test-data/rdf-standard.xml', 'utf8')
const { baseUrl, path } = parseUrl(url)
nock(baseUrl).get(path).reply(200, xml, {
'Content-Type': 'application/xml',
})
const result = await extract(url, {
// pull a Dublin Core field off the raw channel into the feed result
getExtraFeedFields: data => {
return {
subject: data['dc:subject'],
}
},
// pull dc:creator off each raw item into the entry result
getExtraEntryFields: data => {
return {
author: data['dc:creator'],
}
},
})
expect(hasProperty(result, 'subject')).toBe(true)
expect(hasProperty(result.entries[0], 'author')).toBe(true)
expect(validateProps(result.entries[0])).toBe(true)
})

test('extract atom feed which contains multi links', async () => {
const url = 'https://some-news-page.tld/atom/multilinks'
const xml = readFileSync('test-data/atom-multilinks.xml', 'utf8')
Expand Down Expand Up @@ -291,6 +315,22 @@ describe('test extract() without normalization', () => {
expect(hasProperty(result.item, 'guid')).toBe(true)
})

// With `normalization: false` the extractor returns the raw rdf:RDF object
// (via flatten), so namespaced properties like syn:updateBase, dc:rights and
// slash:department survive untouched, and items live under `item` (a sibling
// of `channel`, per the RDF/RSS 1.0 layout) rather than under `entries`.
test('extract rdf feed from Slashdot without normalization', async () => {
const url = 'https://some-news-page.tld/atom'
const xml = readFileSync('test-data/rdf-standard.xml', 'utf8')
const { baseUrl, path } = parseUrl(url)
nock(baseUrl).get(path).reply(200, xml, {
'Content-Type': 'application/xml',
})
const result = await extract(url, {
normalization: false,
})
expect(hasProperty(result.channel, 'syn:updateBase')).toBe(true)
expect(hasProperty(result.channel, 'dc:rights')).toBe(true)
expect(hasProperty(result, 'item')).toBe(true)
expect(hasProperty(result.item[0], 'slash:department')).toBe(true)
})

test('extract atom feed from Google', async () => {
const url = 'https://some-news-page.tld/atom'
const xml = readFileSync('test-data/atom-feed-standard-realworld.xml', 'utf8')
Expand Down Expand Up @@ -358,7 +398,7 @@ describe('test extract() without normalization', () => {
})

describe('test extract with `baseUrl` option', () => {
test('extract rss feed with xml', () => {
test('extract rss feed from file', () => {
const baseUrl = 'https://huggingface.co'
const xml = readFileSync('test-data/rss-feed-miss-base-url.xml', 'utf8')
const result = extractFromXml(xml, { baseUrl })
Expand All @@ -376,7 +416,26 @@ describe('test extract with `baseUrl` option', () => {
expect(result.entries[0].link).toBe(baseUrl + '/blog/intro-graphml')
})

test('extract rss feed with json', () => {
// Parses the RDF fixture directly with extractFromXml and checks that the
// normalized result carries all standard feed/entry attributes, and that a
// relative feed link is resolved against the provided `baseUrl`.
test('extract rdf feed from file', () => {
const baseUrl = 'https://slashdot.org'
const xml = readFileSync('test-data/rdf-standard.xml', 'utf8')
const result = extractFromXml(xml, { baseUrl })

feedAttrs.forEach((k) => {
expect(hasProperty(result, k)).toBe(true)
})

entryAttrs.forEach((k) => {
expect(hasProperty(result.entries[0], k)).toBe(true)
})

expect(validateProps(result.entries[0])).toBe(true)
expect(result.link).toBe(baseUrl + '/')
// entry links in the fixture are already absolute, so baseUrl is not applied
const firstItemLink = result.entries[0].link
expect(firstItemLink.startsWith('https://tech.slashdot.org/story/23/08/23/2238246/spacex-')).toBe(true)
})

test('extract json feed from file', () => {
const baseUrl = 'https://www.jsonfeed.org'
const json = readFileSync('test-data/json-feed-miss-base-url.json', 'utf8')
const result = extractFromJson(JSON.parse(json), { baseUrl })
Expand Down
8 changes: 5 additions & 3 deletions src/utils/parseAtomFeed.js
Original file line number Diff line number Diff line change
Expand Up @@ -98,8 +98,10 @@ const parseAtom = (data, options = {}) => {
getExtraFeedFields,
} = options

const feedData = data.feed

if (!normalization) {
return flatten(data.feed, baseUrl)
return flatten(feedData, baseUrl)
}

const {
Expand All @@ -111,9 +113,9 @@ const parseAtom = (data, options = {}) => {
language = '',
updated = '',
entry: item = [],
} = data.feed
} = feedData

const extraFields = getExtraFeedFields(data.feed)
const extraFields = getExtraFeedFields(feedData)

const items = isArray(item) ? item : [item]

Expand Down
129 changes: 129 additions & 0 deletions src/utils/parseRdfFeed.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
// parseRdfFeed.js

// specs: https://web.resource.org/rss/1.0/spec

import { isArray } from 'bellajs'

import {
getText,
toISODateString,
buildDescription,
getPureUrl,
getEntryId
} from './normalizer.js'

// Map one raw RDF <item> onto the normalized entry shape shared by all
// parsers (id / title / link / published / description, plus any extras
// supplied by the caller's getExtraEntryFields hook, which win on conflict).
const transform = (item, options) => {
  const {
    useISODateFormat,
    descriptionMaxLen,
    baseUrl,
    getExtraEntryFields,
  } = options

  const {
    guid = '',
    title = '',
    link = '',
    'dc:date': pubDate = '',
    description = '',
    'content:encoded': content = '',
  } = item

  // dc:date is the Dublin Core publication date used by RDF feeds
  const published = useISODateFormat ? toISODateString(pubDate) : pubDate
  const htmlContent = getText(description || content)

  const baseEntry = {
    id: getEntryId(guid, link, pubDate),
    title: getText(title),
    link: getPureUrl(link, guid, baseUrl),
    published,
    description: buildDescription(description || htmlContent, descriptionMaxLen),
  }

  // merge extras last so user-provided fields override the defaults
  return Object.assign({}, baseEntry, getExtraEntryFields(item))
}

// Lightly clean a raw RDF feed object for the `normalization: false` path:
// text-extract titles and absolutize links, but keep every other raw
// (namespaced) property intact. A single <item> stays an object; a list
// stays a list.
const flatten = (feed, baseUrl) => {
  const {
    title = '',
    link = '',
    item,
  } = feed

  // NOTE(review): if the feed has no <item> at all, `[item]` yields
  // [undefined] and the destructuring below would throw — assumes the
  // fixture/spec guarantees at least one item; confirm upstream.
  const items = isArray(item) ? item : [item]
  const entries = items.map((entry) => {
    const {
      id,
      title: entryTitle = '',
      link: entryLink = '',
    } = entry
    return {
      ...entry,
      title: getText(entryTitle),
      link: getPureUrl(entryLink, id, baseUrl),
    }
  })

  return {
    ...feed,
    title: getText(title),
    // Fix: pass an explicit empty id so baseUrl reaches the third
    // parameter of getPureUrl(url, id, baseUrl) — previously baseUrl was
    // passed in the id slot (cf. the 3-arg call in parseRdf) and relative
    // feed-level links were never resolved against baseUrl.
    link: getPureUrl(link, '', baseUrl),
    item: isArray(item) ? entries : entries[0],
  }
}

// Parse an RDF (RSS 1.0) document, e.g. Slashdot's feed.
// With `normalization: false` the raw rdf:RDF object is returned via
// flatten(); otherwise channel metadata and items are mapped onto the
// common normalized feed shape.
const parseRdf = (data, options = {}) => {
  const {
    normalization,
    baseUrl,
    getExtraFeedFields,
    useISODateFormat,
  } = options

  const feedData = data['rdf:RDF']

  if (!normalization) {
    return flatten(feedData, baseUrl)
  }

  const {
    title = '',
    link = '',
    description = '',
    generator = '',
    'dc:language': language = '',
    'dc:date': lastBuildDate = '',
  } = feedData.channel

  // In RDF, <item> elements are siblings of <channel>, not children of it
  const { item } = feedData

  const extraFields = getExtraFeedFields(feedData)

  const items = isArray(item) ? item : [item]

  const published = useISODateFormat
    ? toISODateString(lastBuildDate)
    : lastBuildDate

  return {
    title: getText(title),
    link: getPureUrl(link, '', baseUrl),
    description,
    language,
    generator,
    published,
    ...extraFields,
    entries: items.map((entry) => transform(entry, options)),
  }
}

// Public entry point: parse an RDF (RSS 1.0) feed object.
export default function parseRdfFeed (data, options = {}) {
  return parseRdf(data, options)
}
8 changes: 5 additions & 3 deletions src/utils/parseRssFeed.js
Original file line number Diff line number Diff line change
Expand Up @@ -103,8 +103,10 @@ const parseRss = (data, options = {}) => {
getExtraFeedFields,
} = options

const feedData = data.rss.channel

if (!normalization) {
return flatten(data.rss.channel, baseUrl)
return flatten(feedData, baseUrl)
}

const {
Expand All @@ -115,9 +117,9 @@ const parseRss = (data, options = {}) => {
language = '',
lastBuildDate = '',
item = [],
} = data.rss.channel
} = feedData

const extraFields = getExtraFeedFields(data.rss.channel)
const extraFields = getExtraFeedFields(feedData)

const items = isArray(item) ? item : [item]

Expand Down
8 changes: 6 additions & 2 deletions src/utils/xmlparser.js
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,19 @@ export const isAtom = (data = {}) => {
return hasProperty(data, 'feed') && hasProperty(data.feed, 'entry')
}

// An RDF (RSS 1.0) document has a top-level `rdf:RDF` element
// containing a `channel` element.
export const isRdf = (data = {}) => {
  if (!hasProperty(data, 'rdf:RDF')) {
    return false
  }
  return hasProperty(data['rdf:RDF'], 'channel')
}

// A feed payload is valid only when it is a non-empty string accepted by
// fast-xml-parser's validator. XMLValidator.validate returns an error
// object (not false) on invalid input, hence the strict `=== true` check.
export const validate = (xml) => {
  if (!isString(xml) || xml.length === 0) {
    return false
  }
  return XMLValidator.validate(xml) === true
}

export const xml2obj = (xml = '', extraOptions = {}) => {
const options = {
...extraOptions,
ignoreAttributes: false,
attributeNamePrefix: '@_',
ignoreAttributes: false,
...extraOptions,
}
const parser = new XMLParser(options)
const jsonObj = parser.parse(xml)
Expand Down
Loading

0 comments on commit 11e373c

Please sign in to comment.