From 80ef993cc58cd6e9d5ff569953a9507c904d0a9c Mon Sep 17 00:00:00 2001
From: Dong Nguyen
Date: Thu, 24 Aug 2023 17:15:29 +0700
Subject: [PATCH 1/3] Add RDF parser

To resolve issue #80
---
 .eslintrc.json             |   4 +-
 README.md                  |   1 +
 package.json               |   2 +-
 src/main.js                |   8 ++-
 src/main.test.js           |  63 +++++++++++++++++-
 src/utils/parseAtomFeed.js |   8 ++-
 src/utils/parseRdfFeed.js  | 130 +++++++++++++++++++++++++++++++++++++
 src/utils/parseRssFeed.js  |   8 ++-
 src/utils/xmlparser.js     |   4 ++
 test-data/rdf-standard.xml | 108 ++++++++++++++++++++++++++++++
 10 files changed, 323 insertions(+), 13 deletions(-)
 create mode 100644 src/utils/parseRdfFeed.js
 create mode 100644 test-data/rdf-standard.xml

diff --git a/.eslintrc.json b/.eslintrc.json
index d22d31c..42c1225 100644
--- a/.eslintrc.json
+++ b/.eslintrc.json
@@ -62,7 +62,7 @@
     "max-lines": [
       "error",
       {
-        "max": 460,
+        "max": 520,
         "skipBlankLines": true,
         "skipComments": false
       }
@@ -70,7 +70,7 @@
     "max-lines-per-function": [
       "error",
       {
-        "max": 150,
+        "max": 240,
         "skipBlankLines": true
       }
     ],
diff --git a/README.md b/README.md
index edd3059..7b4e668 100755
--- a/README.md
+++ b/README.md
@@ -114,6 +114,7 @@ URL of a valid feed source
 Feed content must be accessible and conform one of the following standards:

   - [RSS Feed](https://www.rssboard.org/rss-specification)
+  - [RDF Feed](https://web.resource.org/rss/1.0/spec)
   - [ATOM Feed](https://datatracker.ietf.org/doc/html/rfc5023)
   - [JSON Feed](https://www.jsonfeed.org/version/1.1/)

diff --git a/package.json b/package.json
index 550a2d0..2bf66fb 100755
--- a/package.json
+++ b/package.json
@@ -1,5 +1,5 @@
 {
-  "version": "7.0.4",
+  "version": "7.0.5",
   "name": "@extractus/feed-extractor",
   "description": "To read and normalize RSS/ATOM/JSON feed data",
   "homepage": "https://extractor-demos.pages.dev",
diff --git a/src/main.js b/src/main.js
index f657c34..e41ba9e 100755
--- a/src/main.js
+++ b/src/main.js
@@ -3,10 +3,11 @@
 import { isValid as isValidUrl } from './utils/linker.js'
 import retrieve from './utils/retrieve.js'
-import { validate, xml2obj, isRSS, isAtom } from './utils/xmlparser.js'
+import { validate, xml2obj, isRSS, isAtom, isRdf } from './utils/xmlparser.js'
 import parseJsonFeed from './utils/parseJsonFeed.js'
 import parseRssFeed from './utils/parseRssFeed.js'
 import parseAtomFeed from './utils/parseAtomFeed.js'
+import parseRdfFeed from './utils/parseRdfFeed.js'

 const getopt = (options = {}) => {
   const {
@@ -42,11 +43,14 @@
 export const extractFromXml = (xml, options = {}) => {
   const opts = getopt(options)
   const data = xml2obj(xml, opts.xmlParserOptions)
+
   return isRSS(data)
     ? parseRssFeed(data, opts)
     : isAtom(data)
       ? parseAtomFeed(data, opts)
-      : null
+      : isRdf(data)
+        ? parseRdfFeed(data, opts)
+        : null
 }

 export const extract = async (url, options = {}, fetchOptions = {}) => {
diff --git a/src/main.test.js b/src/main.test.js
index cc19476..36e7666 100644
--- a/src/main.test.js
+++ b/src/main.test.js
@@ -138,6 +138,30 @@ describe('test extract() standard feed', () => {
     expect(validateProps(result.entries[0])).toBe(true)
   })

+  test('extract rdf feed from Slashdot with extraFields', async () => {
+    const url = 'https://some-news-page.tld/atom'
+    const xml = readFileSync('test-data/rdf-standard.xml', 'utf8')
+    const { baseUrl, path } = parseUrl(url)
+    nock(baseUrl).get(path).reply(200, xml, {
+      'Content-Type': 'application/xml',
+    })
+    const result = await extract(url, {
+      getExtraFeedFields: data => {
+        return {
+          subject: data['dc:subject'],
+        }
+      },
+      getExtraEntryFields: data => {
+        return {
+          author: data['dc:creator'],
+        }
+      },
+    })
+    expect(hasProperty(result, 'subject')).toBe(true)
+    expect(hasProperty(result.entries[0], 'author')).toBe(true)
+    expect(validateProps(result.entries[0])).toBe(true)
+  })
+
   test('extract atom feed which contains multi links', async () => {
     const url = 'https://some-news-page.tld/atom/multilinks'
     const xml = readFileSync('test-data/atom-multilinks.xml', 'utf8')
@@ -291,6 +315,22 @@ describe('test extract() without normalization', () => {
     expect(hasProperty(result.item, 'guid')).toBe(true)
   })

+  test('extract rdf feed from Slashdot without normalization', async () => {
+    const url = 'https://some-news-page.tld/atom'
+    const xml = readFileSync('test-data/rdf-standard.xml', 'utf8')
+    const { baseUrl, path } = parseUrl(url)
+    nock(baseUrl).get(path).reply(200, xml, {
+      'Content-Type': 'application/xml',
+    })
+    const result = await extract(url, {
+      normalization: false,
+    })
+    expect(hasProperty(result.channel, 'syn:updateBase')).toBe(true)
+    expect(hasProperty(result.channel, 'dc:rights')).toBe(true)
+    expect(hasProperty(result, 'item')).toBe(true)
+    expect(hasProperty(result.item[0], 'slash:department')).toBe(true)
+  })
+
   test('extract atom feed from Google', async () => {
     const url = 'https://some-news-page.tld/atom'
     const xml = readFileSync('test-data/atom-feed-standard-realworld.xml', 'utf8')
@@ -358,7 +398,7 @@ describe('test extract() without normalization', () => {
 })

 describe('test extract with `baseUrl` option', () => {
-  test('extract rss feed with xml', () => {
+  test('extract rss feed from file', () => {
     const baseUrl = 'https://huggingface.co'
     const xml = readFileSync('test-data/rss-feed-miss-base-url.xml', 'utf8')
     const result = extractFromXml(xml, { baseUrl })
@@ -376,7 +416,26 @@ describe('test extract with `baseUrl` option', () => {
     expect(result.entries[0].link).toBe(baseUrl + '/blog/intro-graphml')
   })

-  test('extract rss feed with json', () => {
+  test('extract rdf feed from file', () => {
+    const baseUrl = 'https://slashdot.org'
+    const xml = readFileSync('test-data/rdf-standard.xml', 'utf8')
+    const result = extractFromXml(xml, { baseUrl })
+
+    feedAttrs.forEach((k) => {
+      expect(hasProperty(result, k)).toBe(true)
+    })
+
+    entryAttrs.forEach((k) => {
+      expect(hasProperty(result.entries[0], k)).toBe(true)
+    })
+
+    expect(validateProps(result.entries[0])).toBe(true)
+    expect(result.link).toBe(baseUrl + '/')
+    const firstItemLink = result.entries[0].link
+    expect(firstItemLink.startsWith('https://tech.slashdot.org/story/23/08/23/2238246/spacex-')).toBe(true)
+  })
+
+  test('extract json feed from file', () => {
     const baseUrl = 'https://www.jsonfeed.org'
     const json = readFileSync('test-data/json-feed-miss-base-url.json', 'utf8')
     const result = extractFromJson(JSON.parse(json), { baseUrl })
diff --git a/src/utils/parseAtomFeed.js b/src/utils/parseAtomFeed.js
index 7b14fe4..96b4784 100644
--- a/src/utils/parseAtomFeed.js
+++ b/src/utils/parseAtomFeed.js
@@ -98,8 +98,10 @@ const parseAtom = (data, options = {}) => {
     getExtraFeedFields,
   } = options

+  const feedData = data.feed
+
   if (!normalization) {
-    return flatten(data.feed, baseUrl)
+    return flatten(feedData, baseUrl)
   }

   const {
@@ -111,9 +113,9 @@ const parseAtom = (data, options = {}) => {
     language = '',
     updated = '',
     entry: item = [],
-  } = data.feed
+  } = feedData

-  const extraFields = getExtraFeedFields(data.feed)
+  const extraFields = getExtraFeedFields(feedData)

   const items = isArray(item) ? item : [item]

diff --git a/src/utils/parseRdfFeed.js b/src/utils/parseRdfFeed.js
new file mode 100644
index 0000000..876145f
--- /dev/null
+++ b/src/utils/parseRdfFeed.js
@@ -0,0 +1,130 @@
+// parseRdfFeed.js
+
+// specs: https://www.rssboard.org/rss-specification
+
+import { isArray, hasProperty } from 'bellajs'
+
+import {
+  getText,
+  toISODateString,
+  buildDescription,
+  getPureUrl,
+  getOptionalTags,
+  getEntryId
+} from './normalizer.js'
+
+const transform = (item, options) => {
+  const {
+    useISODateFormat,
+    descriptionMaxLen,
+    baseUrl,
+    getExtraEntryFields,
+  } = options
+
+  const {
+    guid = '',
+    title = '',
+    link = '',
+    'dc:date': pubDate = '',
+    description = '',
+    'content:encoded': content = '',
+  } = item
+
+  const published = useISODateFormat ? toISODateString(pubDate) : pubDate
+  const htmlContent = getText(description || content)
+  const entry = {
+    id: getEntryId(guid, link, pubDate),
+    title: getText(title),
+    link: getPureUrl(link, guid, baseUrl),
+    published,
+    description: buildDescription(description || htmlContent, descriptionMaxLen),
+  }
+
+  const extraFields = getExtraEntryFields(item)
+
+  return {
+    ...entry,
+    ...extraFields,
+  }
+}
+
+const flatten = (feed, baseUrl) => {
+  const {
+    title = '',
+    link = '',
+    item,
+  } = feed
+
+  const items = isArray(item) ? item : [item]
+  const entries = items.map((entry) => {
+    const {
+      id,
+      title = '',
+      link = '',
+    } = entry
+
+    const item = {
+      ...entry,
+      title: getText(title),
+      link: getPureUrl(link, id, baseUrl),
+    }
+
+    return item
+  })
+
+  const output = {
+    ...feed,
+    title: getText(title),
+    link: getPureUrl(link, baseUrl),
+    item: isArray(item) ? entries : entries[0],
+  }
+  return output
+}
+
+const parseRdf = (data, options = {}) => {
+  const {
+    normalization,
+    baseUrl,
+    getExtraFeedFields,
+  } = options
+
+  const feedData = data['rdf:RDF']
+
+  if (!normalization) {
+    return flatten(feedData, baseUrl)
+  }
+
+  const {
+    title = '',
+    link = '',
+    description = '',
+    generator = '',
+    'dc:language': language = '',
+    'dc:date': lastBuildDate = '',
+  } = feedData.channel
+
+  const { item } = feedData
+
+  const extraFields = getExtraFeedFields(feedData)
+
+  const items = isArray(item) ? item : [item]
+
+  const published = options.useISODateFormat ?
toISODateString(lastBuildDate) : lastBuildDate + + return { + title: getText(title), + link: getPureUrl(link, '', baseUrl), + description, + language, + generator, + published, + ...extraFields, + entries: items.map((item) => { + return transform(item, options) + }), + } +} + +export default (data, options = {}) => { + return parseRdf(data, options) +} diff --git a/src/utils/parseRssFeed.js b/src/utils/parseRssFeed.js index ac7892c..cbc793f 100644 --- a/src/utils/parseRssFeed.js +++ b/src/utils/parseRssFeed.js @@ -103,8 +103,10 @@ const parseRss = (data, options = {}) => { getExtraFeedFields, } = options + const feedData = data.rss.channel + if (!normalization) { - return flatten(data.rss.channel, baseUrl) + return flatten(feedData, baseUrl) } const { @@ -115,9 +117,9 @@ const parseRss = (data, options = {}) => { language = '', lastBuildDate = '', item = [], - } = data.rss.channel + } = feedData - const extraFields = getExtraFeedFields(data.rss.channel) + const extraFields = getExtraFeedFields(feedData) const items = isArray(item) ? item : [item] diff --git a/src/utils/xmlparser.js b/src/utils/xmlparser.js index cd1fd50..3cf7bb4 100755 --- a/src/utils/xmlparser.js +++ b/src/utils/xmlparser.js @@ -12,6 +12,10 @@ export const isAtom = (data = {}) => { return hasProperty(data, 'feed') && hasProperty(data.feed, 'entry') } +export const isRdf = (data = {}) => { + return hasProperty(data, 'rdf:RDF') && hasProperty(data['rdf:RDF'], 'channel') +} + export const validate = (xml) => { return (!isString(xml) || !xml.length) ? false : XMLValidator.validate(xml) === true } diff --git a/test-data/rdf-standard.xml b/test-data/rdf-standard.xml new file mode 100644 index 0000000..59a00e5 --- /dev/null +++ b/test-data/rdf-standard.xml @@ -0,0 +1,108 @@ + + + + Slashdot + https://slashdot.org/ + News for nerds, stuff that matters + en-us + Copyright 1997-2016, SlashdotMedia. All Rights Reserved. + 2023-08-24T09:29:07+00:00 + Dice + help@slashdot.org + Technology + 1970-01-01T00:00+00:00 + 1 + hourly + + + + + + + + + + + + Slashdot + https://a.fsdn.com/sd/topics/topicslashdot.gif + https://slashdot.org/ + + + SpaceX Working With Cloudflare To Speed Up Starlink Service + https://tech.slashdot.org/story/23/08/23/2238246/spacex-working-with-cloudflare-to-speed-up-starlink-service?utm_source=rss1.0mainlinkanon&utm_medium=feed + According to The Information (paywalled), SpaceX is working with Cloudlfare to boost the performance of its satellite internet service Starlink. Reuters reports: The two companies are working on a way to increase Starlink's network of mini data centers around the globe that could help it deliver faster network speeds to its customers, the report said. According to SpaceX's website, Starlink users typically have download speeds between 25 and 220 Mbps, with the "majority" over 100 Mbps. 
Upload speeds range between 5 and 20 Mbps.<p><div class="share_submission" style="position:relative;"> +<a class="slashpop" href="http://twitter.com/home?status=SpaceX+Working+With+Cloudflare+To+Speed+Up+Starlink+Service%3A+https%3A%2F%2Ftech.slashdot.org%2Fstory%2F23%2F08%2F23%2F2238246%2F%3Futm_source%3Dtwitter%26utm_medium%3Dtwitter"><img src="https://a.fsdn.com/sd/twitter_icon_large.png"></a> +<a class="slashpop" href="http://www.facebook.com/sharer.php?u=https%3A%2F%2Ftech.slashdot.org%2Fstory%2F23%2F08%2F23%2F2238246%2Fspacex-working-with-cloudflare-to-speed-up-starlink-service%3Futm_source%3Dslashdot%26utm_medium%3Dfacebook"><img src="https://a.fsdn.com/sd/facebook_icon_large.png"></a> + + + +</div></p><p><a href="https://tech.slashdot.org/story/23/08/23/2238246/spacex-working-with-cloudflare-to-speed-up-starlink-service?utm_source=rss1.0moreanon&amp;utm_medium=feed">Read more of this story</a> at Slashdot.</p><iframe src="https://slashdot.org/slashdot-it.pl?op=discuss&amp;id=23035684&amp;smallembed=1" style="height: 300px; width: 100%; border: none;"></iframe> + BeauHD + 2023-08-24T07:00:00+00:00 + internet + joining-forces + technology + 0,0,0,0,0,0,0 + + + Paralyzed Woman Able To 'Speak' Through Digital Avatar In World First + https://science.slashdot.org/story/23/08/23/2234246/paralyzed-woman-able-to-speak-through-digital-avatar-in-world-first?utm_source=rss1.0mainlinkanon&utm_medium=feed + An anonymous reader quotes a report from The Guardian: A severely paralyzed woman has been able to speak through an avatar using technology that translated her brain signals into speech and facial expressions. The latest technology uses tiny electrodes implanted on the surface of the brain to detect electrical activity in the part of the brain that controls speech and face movements. These signals are translated directly into a digital avatar's speech and facial expressions including smiling, frowning or surprise. The patient, a 47-year-old woman, Ann, has been severely paralyzed since suffering a brainstem stroke more than 18 years ago. She cannot speak or type and normally communicates using movement-tracking technology that allows her to slowly select letters at up to 14 words a minute. She hopes the avatar technology could enable her to work as a counsellor in future. + +The team implanted a paper-thin rectangle of 253 electrodes on to the surface of Ann's brain over a region critical for speech. The electrodes intercepted the brain signals that, if not for the stroke, would have controlled muscles in her tongue, jaw, larynx and face. After implantation, Ann worked with the team to train the system's AI algorithm to detect her unique brain signals for various speech sounds by repeating different phrases repeatedly. The computer learned 39 distinctive sounds and a Chat GPT-style language model was used to translate the signals into intelligible sentences. This was then used to control an avatar with a voice personalized to sound like Ann's voice before the injury, based on a recording of her speaking at her wedding. + +The technology was not perfect, decoding words incorrectly 28% of the time in a test run involving more than 500 phrases, and it generated brain-to-text at a rate of 78 words a minute, compared with the 110-150 words typically spoken in natural conversation. However, scientists said the latest advances in accuracy, speed and sophistication suggest the technology is now at a point of being practically useful for patients. 
A crucial next step is to create a wireless version of the BCI that could be implanted beneath the skull. The findings have been published in the journal Nature.<p><div class="share_submission" style="position:relative;"> +<a class="slashpop" href="http://twitter.com/home?status=Paralyzed+Woman+Able+To+'Speak'+Through+Digital+Avatar+In+World+First%3A+https%3A%2F%2Fscience.slashdot.org%2Fstory%2F23%2F08%2F23%2F2234246%2F%3Futm_source%3Dtwitter%26utm_medium%3Dtwitter"><img src="https://a.fsdn.com/sd/twitter_icon_large.png"></a> +<a class="slashpop" href="http://www.facebook.com/sharer.php?u=https%3A%2F%2Fscience.slashdot.org%2Fstory%2F23%2F08%2F23%2F2234246%2Fparalyzed-woman-able-to-speak-through-digital-avatar-in-world-first%3Futm_source%3Dslashdot%26utm_medium%3Dfacebook"><img src="https://a.fsdn.com/sd/facebook_icon_large.png"></a> + + + +</div></p><p><a href="https://science.slashdot.org/story/23/08/23/2234246/paralyzed-woman-able-to-speak-through-digital-avatar-in-world-first?utm_source=rss1.0moreanon&amp;utm_medium=feed">Read more of this story</a> at Slashdot.</p><iframe src="https://slashdot.org/slashdot-it.pl?op=discuss&amp;id=23035680&amp;smallembed=1" style="height: 300px; width: 100%; border: none;"></iframe> + BeauHD + 2023-08-24T03:30:00+00:00 + medicine + we're-at-a-tipping-point + science + 5 + 5,4,4,3,3,1,0 + + + Amazon Sues Online Stores Selling Pirated DVDs + https://yro.slashdot.org/story/23/08/23/2230246/amazon-sues-online-stores-selling-pirated-dvds?utm_source=rss1.0mainlinkanon&utm_medium=feed + Amazon has filed a lawsuit against a group of online stores that sell pirated DVDs of key titles such as "The Lord of the Rings: The Rings of Power" and "The Peripheral." TorrentFreak reports: In a complaint filed at a California federal court, Amazon accuses seven websites of selling pirated discs. These sites, including dvdshelf.com.au, dvds.trade, and dvdwholesale.co.uk, are presumably operated by the same group, using a variety of companies. For the public at large, it may not be immediately obvious that these discs are pirated. However, since Amazon doesn't produce or sell DVDs for these Prime Video series, there is no doubt that they are created from illicit sources. + +The piracy operation consists of at least seven websites and these all remain online today. According to Amazon, the sites ship to customers in the U.S. and abroad, twenty-four hours a day, seven days a week, resulting in mass copyright infringement. Before going to court, investigators conducted more than twenty test purchases of pirated DVDs. After these orders arrived, Amazon sent the discs to the Motion Picture Association which independently confirmed that they were all pirated. + +The complaint lists Yangchun Zhang as a key suspect. This person presumably resides in China and obtained the 'DVD Shelf' trademark in Australia. In addition, Zhang is also listed as the registrant of several of the domain names involved. The complaint accuses Zhang and the others of both copyright and trademark infringement. Through the lawsuit (PDF), Amazon hopes to recoup damages, which can run in the millions of dollars. 
Another key priority is to shut the sites down and Amazon asks the court for an injunction to stop all infringing activity.<p><div class="share_submission" style="position:relative;"> +<a class="slashpop" href="http://twitter.com/home?status=Amazon+Sues+Online+Stores+Selling+Pirated+DVDs%3A+https%3A%2F%2Fyro.slashdot.org%2Fstory%2F23%2F08%2F23%2F2230246%2F%3Futm_source%3Dtwitter%26utm_medium%3Dtwitter"><img src="https://a.fsdn.com/sd/twitter_icon_large.png"></a> +<a class="slashpop" href="http://www.facebook.com/sharer.php?u=https%3A%2F%2Fyro.slashdot.org%2Fstory%2F23%2F08%2F23%2F2230246%2Famazon-sues-online-stores-selling-pirated-dvds%3Futm_source%3Dslashdot%26utm_medium%3Dfacebook"><img src="https://a.fsdn.com/sd/facebook_icon_large.png"></a> + + + +</div></p><p><a href="https://yro.slashdot.org/story/23/08/23/2230246/amazon-sues-online-stores-selling-pirated-dvds?utm_source=rss1.0moreanon&amp;utm_medium=feed">Read more of this story</a> at Slashdot.</p><iframe src="https://slashdot.org/slashdot-it.pl?op=discuss&amp;id=23035666&amp;smallembed=1" style="height: 300px; width: 100%; border: none;"></iframe> + BeauHD + 2023-08-24T01:10:00+00:00 + piracy + cease-and-desist + yro + 28 + 28,28,25,22,5,3,1 + + + + Search Slashdot + Search Slashdot stories + query + https://slashdot.org/search.pl + + From 5a1cf4cc0caeaf817281c357cb16dee9b3752ec9 Mon Sep 17 00:00:00 2001 From: Dong Nguyen Date: Thu, 24 Aug 2023 17:22:25 +0700 Subject: [PATCH 2/3] Allow customize `attributeNamePrefix` Resolve issue #100 --- src/utils/xmlparser.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/utils/xmlparser.js b/src/utils/xmlparser.js index 3cf7bb4..bd5a82a 100755 --- a/src/utils/xmlparser.js +++ b/src/utils/xmlparser.js @@ -22,9 +22,9 @@ export const validate = (xml) => { export const xml2obj = (xml = '', extraOptions = {}) => { const options = { - ...extraOptions, - ignoreAttributes: false, attributeNamePrefix: '@_', + ignoreAttributes: false, + ...extraOptions, } const parser = new XMLParser(options) const jsonObj = parser.parse(xml) From 640381d8f8d5ff6ff60a09f4e41c3910227832b9 Mon Sep 17 00:00:00 2001 From: Dong Nguyen Date: Thu, 24 Aug 2023 17:25:55 +0700 Subject: [PATCH 3/3] Fix linting issue --- src/utils/parseRdfFeed.js | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/utils/parseRdfFeed.js b/src/utils/parseRdfFeed.js index 876145f..5c582b2 100644 --- a/src/utils/parseRdfFeed.js +++ b/src/utils/parseRdfFeed.js @@ -2,14 +2,13 @@ // specs: https://www.rssboard.org/rss-specification -import { isArray, hasProperty } from 'bellajs' +import { isArray } from 'bellajs' import { getText, toISODateString, buildDescription, getPureUrl, - getOptionalTags, getEntryId } from './normalizer.js'
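
Taken together, the first patch routes RDF (RSS 1.0) documents through the new parseRdfFeed() once isRdf() matches, so they come out of extract()/extractFromXml() in the same normalized shape as RSS and Atom feeds. Below is a minimal usage sketch against the bundled test-data/rdf-standard.xml fixture; the dc:subject/dc:creator mappings mirror the new test added in patch 1, and the values shown in the comments are only what that particular fixture would produce.

import { readFileSync } from 'fs'
import { extractFromXml } from '@extractus/feed-extractor'

// RDF/RSS 1.0 fixture added by this patch
const xml = readFileSync('test-data/rdf-standard.xml', 'utf8')

const feed = extractFromXml(xml, {
  baseUrl: 'https://slashdot.org',
  // copy namespaced Dublin Core fields into the normalized output
  getExtraFeedFields: (feedData) => ({ subject: feedData['dc:subject'] }),
  getExtraEntryFields: (entryData) => ({ author: entryData['dc:creator'] }),
})

console.log(feed.title)             // 'Slashdot'
console.log(feed.entries[0].author) // 'BeauHD'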
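
The second patch flips the spread order in xml2obj() so that anything passed through xmlParserOptions now overrides the built-in fast-xml-parser defaults instead of being clobbered by them. A small sketch of what that enables, reusing the xml string from the previous snippet; the '@' prefix is only an illustrative value.

const raw = extractFromXml(xml, {
  normalization: false,
  xmlParserOptions: {
    // with the reordered spreads this now takes effect;
    // before the patch it was always reset to the default '@_'
    attributeNamePrefix: '@',
  },
})

// attribute keys in the raw parsed object now start with '@' instead of '@_'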