Skip to content

Commit

Permalink
Merge pull request #280 from ndaidong/v7.x.x
Browse files Browse the repository at this point in the history
Change method to deal with `source` and `description`
  • Loading branch information
ndaidong authored Jul 19, 2022
2 parents c6394e6 + cf786d5 commit fc1e720
Show file tree
Hide file tree
Showing 14 changed files with 176 additions and 209 deletions.
118 changes: 59 additions & 59 deletions dist/article-parser.browser.js

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions dist/article-parser.browser.js.map

Large diffs are not rendered by default.

162 changes: 81 additions & 81 deletions dist/cjs/article-parser.js

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions dist/cjs/article-parser.js.map

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion dist/cjs/package.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"name": "article-parser-cjs",
"version": "7.0.0rc3",
"version": "7.0.0rc4",
"main": "./article-parser.js"
}
7 changes: 4 additions & 3 deletions package.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"version": "7.0.0rc3",
"version": "7.0.0rc4",
"name": "article-parser",
"description": "To extract main article from given URL",
"homepage": "https://ndaidong.github.io/article-parser-demo/",
Expand Down Expand Up @@ -37,7 +37,8 @@
"linkedom": "^0.14.12",
"sanitize-html": "^2.7.0",
"string-comparison": "^1.1.0",
"urlpattern-polyfill": "^5.0.3"
"tldts": "^5.7.84",
"urlpattern-polyfill": "^5.0.5"
},
"standard": {
"ignore": [
Expand All @@ -48,7 +49,7 @@
"@types/sanitize-html": "^2.6.2",
"cross-env": "^7.0.3",
"esbuild": "^0.14.49",
"jest": "^28.1.2",
"jest": "^28.1.3",
"nock": "^13.2.8",
"standard": "^17.0.0"
},
Expand Down
4 changes: 2 additions & 2 deletions src/config.js
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,8 @@ const htmlCrushOptions = {
// Default options that steer article extraction.
// Each key appears exactly once: a duplicated key in an object literal is
// silently overridden by the later entry, which hides the effective value.
const parserOptions = {
  wordsPerMinute: 300, // to estimate "time to read"
  urlsCompareAlgorithm: 'levenshtein', // to find the best url from list
  descriptionLengthThreshold: 210, // min num of chars required for description
  descriptionTruncateLen: 210, // max num of chars generated for description
  contentLengthThreshold: 200 // content must have at least 200 chars
}

Expand Down
5 changes: 3 additions & 2 deletions src/main.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ describe('test extract(bad url)', () => {
})

describe('test extract(regular article url)', () => {
const expDesc = "Navigation here Few can name a rational peach that isn't a conscientious goldfish! One cannot separate snakes from plucky pomegranates? Draped neatly on a hanger, the melons could be said to resemble knowledgeable pigs."
const cases = [
{
input: {
Expand Down Expand Up @@ -101,7 +102,7 @@ describe('test extract(regular article url)', () => {
validate: (result, expect) => {
expect(result).toBeTruthy()
expect(result.title).toEqual('Article title here')
expect(result.description).toEqual('Few words to summarize this article content')
expect(result.description).toEqual(expDesc)
}
}
]
Expand All @@ -124,6 +125,6 @@ describe('test extract(regular article url)', () => {
const result = await extract(html)
expect(result).toBeTruthy()
expect(result.title).toEqual('Article title here')
expect(result.description).toEqual('Few words to summarize this article content')
expect(result.description).toEqual(expDesc)
})
})
10 changes: 0 additions & 10 deletions src/utils/linker.js
Original file line number Diff line number Diff line change
Expand Up @@ -18,16 +18,6 @@ export const isValid = (url = '') => {
}
}

/**
 * Extract the hostname of a URL, without a leading "www." label.
 *
 * @param {string} url - absolute URL to parse
 * @returns {string} the hostname (no port, no leading "www."), or an empty
 *   string when `url` cannot be parsed by the URL constructor
 */
export const getHostname = (url) => {
  try {
    const { hostname } = new URL(url)
    // Anchor the match to the start of the host: a plain string replace
    // removes the first "www." found anywhere, which mangles hosts such as
    // "awww.example.com".
    return hostname.replace(/^www\./, '')
  } catch (err) {
    // Invalid input is reported and mapped to '' rather than thrown.
    logger.error(err)
    return ''
  }
}

export const absolutify = (fullUrl = '', relativeUrl = '') => {
try {
const result = new URL(relativeUrl, fullUrl)
Expand Down
36 changes: 0 additions & 36 deletions src/utils/linker.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ import { readFileSync } from 'fs'
import { isString } from 'bellajs'

import {
getHostname,
chooseBestUrl,
isValid as isValidUrl,
purify as purifyUrl,
Expand Down Expand Up @@ -180,41 +179,6 @@ describe('test absolutifyUrl()', () => {
})
})

// Table-driven checks for getHostname(): unparsable inputs yield '', valid
// URLs yield the hostname without the "www." prefix and without the port.
describe('test getHostname()', () => {
  const entries = [
    {
      url: '',
      expected: ''
    },
    {
      url: {},
      expected: ''
    },
    {
      url: 'https://www.some.where/article/abc-xyz',
      expected: 'some.where'
    },
    {
      url: 'https://www.alpha.some.where/blog/authors/article/abc-xyz',
      expected: 'alpha.some.where'
    },
    {
      url: 'https://10.1.1.5:1888/article/abc-xyz',
      expected: '10.1.1.5'
    }
  ]
  entries.forEach((entry) => {
    const {
      url,
      expected
    } = entry
    // The title names the function under test so a failure points at
    // getHostname rather than at absolutifyUrl.
    test(`getHostname("${url}") must return "${expected}"`, () => {
      const result = getHostname(url)
      expect(result).toEqual(expected)
    })
  })
})

describe('test chooseBestUrl()', () => {
test('test chooseBestUrl an actual case', () => {
const title = 'Google đã ra giá mua Fitbit'
Expand Down
22 changes: 16 additions & 6 deletions src/utils/parseFromHtml.js
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
// utils -> parseFromHtml

import { stripTags, truncate, unique, pipe } from 'bellajs'
import { getDomain } from 'tldts'

import { cleanify, cleanAndMinify as cleanAndMinifyHtml } from './html.js'

Expand All @@ -9,7 +10,6 @@ import {
purify as purifyUrl,
absolutify as absolutifyUrl,
normalize as normalizeUrls,
getHostname,
chooseBestUrl
} from './linker.js'

Expand All @@ -28,9 +28,20 @@ import logger from './logger.js'
import { getParserOptions } from '../config.js'

/**
 * Pick the description for an article.
 *
 * The meta description is preferred once its leading " - " separated segment
 * (e.g. a site-name prefix like "VNN News - ...") has been removed. When what
 * remains is not longer than `threshold`, a description is generated from the
 * article text instead.
 *
 * @param {string} desc - description taken from the page metadata
 * @param {string} txt - text to fall back on when `desc` is too short
 * @param {number} threshold - length the cleaned meta description must exceed
 * @param {number} maxlen - length passed to `truncate` for the fallback text
 * @returns {string} the chosen description
 */
const summarize = (desc, txt, threshold, maxlen) => {
  // Drop everything up to and including the first ' - '. The remaining
  // segments are joined with single spaces. Strings without the separator
  // are returned untouched.
  const removeFirstParts = (str) => {
    const [, ...rest] = str.split(' - ')
    return rest.length > 0 ? rest.join(' ') : str
  }

  const metadesc = removeFirstParts(desc)
  if (metadesc.length > threshold) {
    return metadesc
  }

  // Fallback: truncated text with newlines flattened to spaces, then the
  // same prefix clean-up as for the meta description.
  const extradesc = truncate(txt, maxlen).replace(/\n/g, ' ')
  return removeFirstParts(extradesc)
}

export default async (inputHtml, inputUrl = '') => {
Expand All @@ -46,7 +57,6 @@ export default async (inputHtml, inputUrl = '') => {
description: metaDesc,
image: metaImg,
author,
source,
published
} = meta

Expand Down Expand Up @@ -129,7 +139,7 @@ export default async (inputHtml, inputUrl = '') => {
image,
content,
author,
source: source || getHostname(bestUrl),
source: getDomain(bestUrl),
published,
ttr: getTimeToRead(textContent)
}
Expand Down
3 changes: 2 additions & 1 deletion src/utils/parseFromHtml.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import parseFromHtml from './parseFromHtml.js'
import { addTransformations } from './transformation.js'

describe('test parseFromHtml()', () => {
const expDesc = "Navigation here Few can name a rational peach that isn't a conscientious goldfish! One cannot separate snakes from plucky pomegranates? Draped neatly on a hanger, the melons could be said to resemble knowledgeable pigs."
const cases = [
{
input: {
Expand Down Expand Up @@ -66,7 +67,7 @@ describe('test parseFromHtml()', () => {
},
expectation: (result, expect) => {
expect(result.title).toEqual('Article title here')
expect(result.description).toEqual('Few words to summarize this article content')
expect(result.description).toEqual(expDesc)
expect(result.content).toEqual(expect.stringContaining('<a target="_blank" href="https://otherwhere.com/descriptions/rational-peach">'))
expect(result.content).toEqual(expect.stringContaining('<a target="_blank" href="https://somewhere.com/dict/watermelon">'))
}
Expand Down
2 changes: 1 addition & 1 deletion test-data/regular-article.html
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
<meta property="og:title" content="Article title here">
<meta property="og:type" content="article">
<meta property="og:url" content="https://somewhere.com/path/to/article-title-here">
<meta property="og:description" content="Few words to summarize this article content">
<meta property="og:description" content="Navigation here Few can name a rational peach that isn't a conscientious goldfish! One cannot separate snakes from plucky pomegranates? Draped neatly on a hanger, the melons could be said to resemble knowledgeable pigs.">
<meta property="og:image" content="https://somewhere.com/path/to/image.jpg">
<meta property="article:published_time" content="2021-12-15T10:00:00.000+07:00">
<meta property="article:modified_time" content="2021-12-16T09:00:00.000+07:00">
Expand Down
2 changes: 1 addition & 1 deletion test-data/vnn-article.html
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
<meta property="og:title" content="Article title here">
<meta property="og:type" content="article">
<meta property="og:url" content="https://vnn.vn/path/to/article-title-here">
<meta property="og:description" content="Few words to summarize this article content">
<meta property="og:description" content="VNN News - Few words to summarize this article content">
<meta property="og:image" content="https://somewhere.com/path/to/image.jpg">
<meta property="article:published_time" content="2021-12-15T10:00:00.000+07:00">
<meta property="article:modified_time" content="2021-12-16T09:00:00.000+07:00">
Expand Down

0 comments on commit fc1e720

Please sign in to comment.