Skip to content

Commit

Permalink
Update readme.
Browse files Browse the repository at this point in the history
  • Loading branch information
lewisdonovan committed Nov 1, 2024
1 parent 05edb88 commit ee3857b
Show file tree
Hide file tree
Showing 12 changed files with 168 additions and 76 deletions.
3 changes: 1 addition & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,9 @@ yarn add google-news-scraper
Simply import the package and pass a config object.
```javascript
import googleNewsScraper from 'google-news-scraper';

const articles = await googleNewsScraper({ searchTerm: "The Oscars" });

```
A minimum working example can be found in [this repo](https://github.com/lewisdonovan/gns-example).
Full documentation on the [config object](#config) can be found below.

## Output 📲
Expand Down
77 changes: 54 additions & 23 deletions dist/cjs/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -81,30 +81,61 @@ const getArticleType = (article) => {
return "";
};

// const getPrettyUrl = (uglyUrl: string, logger: winston.Logger): string | null => {
// const base64Match = uglyUrl.match(/\/read\/([A-Za-z0-9-_]+)/);
// if (!base64Match) {
// return null;
// }
// const base64String = base64Match[1];
// try {
// const decodedString = Buffer.from(base64String, "base64").toString("ascii");
// const urlPattern = /https?:\/\/[^\s"']+/g;
// const matches = decodedString.match(urlPattern) || [];
// const urls = matches.flatMap(match => {
// const splitUrls = match.split(/(?<!http:|https:)R(?![a-zA-Z0-9-_])|(?<!http:|https:)y(?![a-zA-Z0-9-_])/);
// return splitUrls.filter(url => {
// const cleanUrl = url.trim().replace(/[^\w\-\/:.]+$/, '').replace(/\\x[0-9A-Fa-f]{2}/g, '');
// return cleanUrl;
// });
// });
// const uniqueUrls = [...new Set(urls)];
// const finalUrl = uniqueUrls.length ? uniqueUrls[0] : uglyUrl;
// logger.info(finalUrl);
// return finalUrl;
// } catch (error) {
// logger.error(error);
// return null;
// }
// }
/**
 * Converts an opaque Google News "/read/<id>" URL into the publisher's
 * original article URL by reversing Google's double Base64 wrapping.
 *
 * @param {string} uglyUrl - A news.google.com article URL (".../read/<base64url-id>?...").
 * @param {object} [logger] - Optional winston-style logger; only `.error` is used.
 *   Callers may omit it (the scraper calls `getPrettyUrl(article.link)`).
 * @returns {string|null} The decoded publisher URL, or null when the URL has
 *   no "/read/" segment or decoding fails.
 */
const getPrettyUrl = (uglyUrl, logger) => {
    // Step 1: extract the URL-safe Base64 payload after "/read/"; the
    // character class stops at "?" so the query string is excluded.
    const base64Match = uglyUrl.match(/\/read\/([A-Za-z0-9-_]+)/);
    if (!base64Match) {
        return null;
    }
    try {
        let encodedPart = base64Match[1];
        // Step 2: drop the "CB" envelope prefix Google prepends to some ids.
        if (encodedPart.startsWith('CB')) {
            encodedPart = encodedPart.substring(2);
        }
        // Step 3: convert URL-safe Base64 to the standard alphabet.
        encodedPart = encodedPart.replace(/-/g, '+').replace(/_/g, '/');
        // Step 4: restore the "=" padding stripped from the URL form.
        encodedPart += '='.repeat((4 - (encodedPart.length % 4)) % 4);
        // Step 5: first decode yields a binary wrapper that embeds a second
        // Base64-URL string.
        const firstDecodedBytes = atob(encodedPart);
        // Step 6: collect the runs of Base64-URL characters — joined, they
        // form the inner payload.
        const innerRuns = firstDecodedBytes.match(/[A-Za-z0-9\-_]+/g);
        const secondEncodedPart = innerRuns ? innerRuns.join('') : '';
        // Step 7: normalise and re-pad the inner payload.
        let secondEncoded = secondEncodedPart.replace(/-/g, '+').replace(/_/g, '/');
        secondEncoded += '='.repeat((4 - (secondEncoded.length % 4)) % 4);
        // Step 8: second decode produces the publisher's own URL.
        return atob(secondEncoded);
    }
    catch (error) {
        // logger is optional — guard so a decode failure never crashes the
        // scrape when the caller passed only one argument.
        if (logger) {
            logger.error(error);
        }
        return null;
    }
};
Expand All @@ -114,7 +145,7 @@ const buildQueryString = (query) => {
if (Object.keys(query).length === 0)
return "";
// Build query string
// Example: { q: 'puppies', hl: 'en', gl: 'US' } => '?q=puppies&hl=en&gl=US'
// Example: { q: 'zapatos', gl: 'ES', ceid: "es:es" } => '?q=zapatos&gl=ES&ceid=ES:es'
const queryString = Object.keys(query).reduce((acc, key, index) => {
const prefix = index === 0 ? '?' : '&';
return `${acc}${prefix}${key}=${query[key]}`;
Expand Down Expand Up @@ -2778,7 +2809,7 @@ const googleNewsScraper = (userConfig) => __awaiter(void 0, void 0, void 0, func
const $ = cheerio__namespace.load(content);
const articles = $('article');
let results = [];
$(articles).each(function (i) {
$(articles).each(function () {
var _a, _b, _c, _d, _e, _f, _g, _h, _j;
const link = ((_c = (_b = (_a = $(this)) === null || _a === void 0 ? void 0 : _a.find('a[href^="./article"]')) === null || _b === void 0 ? void 0 : _b.attr('href')) === null || _c === void 0 ? void 0 : _c.replace('./', 'https://news.google.com/')) || ((_f = (_e = (_d = $(this)) === null || _d === void 0 ? void 0 : _d.find('a[href^="./read"]')) === null || _e === void 0 ? void 0 : _e.attr('href')) === null || _f === void 0 ? void 0 : _f.replace('./', 'https://news.google.com/')) || "";
const srcset = (_g = $(this).find('figure').find('img').attr('srcset')) === null || _g === void 0 ? void 0 : _g.split(' ');
Expand All @@ -2800,7 +2831,7 @@ const googleNewsScraper = (userConfig) => __awaiter(void 0, void 0, void 0, func
});
if (config.prettyURLs) {
results = yield Promise.all(results.map(article => {
const url = getPrettyUrl(article.link, logger);
const url = getPrettyUrl(article.link);
if (url) {
article.link = url;
}
Expand Down
2 changes: 1 addition & 1 deletion dist/cjs/min/index.min.js

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion dist/cjs/min/index.min.js.map

Large diffs are not rendered by default.

77 changes: 54 additions & 23 deletions dist/esm/index.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -60,30 +60,61 @@ const getArticleType = (article) => {
return "";
};

// const getPrettyUrl = (uglyUrl: string, logger: winston.Logger): string | null => {
// const base64Match = uglyUrl.match(/\/read\/([A-Za-z0-9-_]+)/);
// if (!base64Match) {
// return null;
// }
// const base64String = base64Match[1];
// try {
// const decodedString = Buffer.from(base64String, "base64").toString("ascii");
// const urlPattern = /https?:\/\/[^\s"']+/g;
// const matches = decodedString.match(urlPattern) || [];
// const urls = matches.flatMap(match => {
// const splitUrls = match.split(/(?<!http:|https:)R(?![a-zA-Z0-9-_])|(?<!http:|https:)y(?![a-zA-Z0-9-_])/);
// return splitUrls.filter(url => {
// const cleanUrl = url.trim().replace(/[^\w\-\/:.]+$/, '').replace(/\\x[0-9A-Fa-f]{2}/g, '');
// return cleanUrl;
// });
// });
// const uniqueUrls = [...new Set(urls)];
// const finalUrl = uniqueUrls.length ? uniqueUrls[0] : uglyUrl;
// logger.info(finalUrl);
// return finalUrl;
// } catch (error) {
// logger.error(error);
// return null;
// }
// }
/**
 * Converts an opaque Google News "/read/<id>" URL into the publisher's
 * original article URL by reversing Google's double Base64 wrapping.
 *
 * @param {string} uglyUrl - A news.google.com article URL (".../read/<base64url-id>?...").
 * @param {object} [logger] - Optional winston-style logger; only `.error` is used.
 *   Callers may omit it (the scraper calls `getPrettyUrl(article.link)`).
 * @returns {string|null} The decoded publisher URL, or null when the URL has
 *   no "/read/" segment or decoding fails.
 */
const getPrettyUrl = (uglyUrl, logger) => {
    // Step 1: extract the URL-safe Base64 payload after "/read/"; the
    // character class stops at "?" so the query string is excluded.
    const base64Match = uglyUrl.match(/\/read\/([A-Za-z0-9-_]+)/);
    if (!base64Match) {
        return null;
    }
    try {
        let encodedPart = base64Match[1];
        // Step 2: drop the "CB" envelope prefix Google prepends to some ids.
        if (encodedPart.startsWith('CB')) {
            encodedPart = encodedPart.substring(2);
        }
        // Step 3: convert URL-safe Base64 to the standard alphabet.
        encodedPart = encodedPart.replace(/-/g, '+').replace(/_/g, '/');
        // Step 4: restore the "=" padding stripped from the URL form.
        encodedPart += '='.repeat((4 - (encodedPart.length % 4)) % 4);
        // Step 5: first decode yields a binary wrapper that embeds a second
        // Base64-URL string.
        const firstDecodedBytes = atob(encodedPart);
        // Step 6: collect the runs of Base64-URL characters — joined, they
        // form the inner payload.
        const innerRuns = firstDecodedBytes.match(/[A-Za-z0-9\-_]+/g);
        const secondEncodedPart = innerRuns ? innerRuns.join('') : '';
        // Step 7: normalise and re-pad the inner payload.
        let secondEncoded = secondEncodedPart.replace(/-/g, '+').replace(/_/g, '/');
        secondEncoded += '='.repeat((4 - (secondEncoded.length % 4)) % 4);
        // Step 8: second decode produces the publisher's own URL.
        return atob(secondEncoded);
    }
    catch (error) {
        // logger is optional — guard so a decode failure never crashes the
        // scrape when the caller passed only one argument.
        if (logger) {
            logger.error(error);
        }
        return null;
    }
};
Expand All @@ -93,7 +124,7 @@ const buildQueryString = (query) => {
if (Object.keys(query).length === 0)
return "";
// Build query string
// Example: { q: 'puppies', hl: 'en', gl: 'US' } => '?q=puppies&hl=en&gl=US'
// Example: { q: 'zapatos', gl: 'ES', ceid: "es:es" } => '?q=zapatos&gl=ES&ceid=ES:es'
const queryString = Object.keys(query).reduce((acc, key, index) => {
const prefix = index === 0 ? '?' : '&';
return `${acc}${prefix}${key}=${query[key]}`;
Expand Down Expand Up @@ -2757,7 +2788,7 @@ const googleNewsScraper = (userConfig) => __awaiter(void 0, void 0, void 0, func
const $ = cheerio.load(content);
const articles = $('article');
let results = [];
$(articles).each(function (i) {
$(articles).each(function () {
var _a, _b, _c, _d, _e, _f, _g, _h, _j;
const link = ((_c = (_b = (_a = $(this)) === null || _a === void 0 ? void 0 : _a.find('a[href^="./article"]')) === null || _b === void 0 ? void 0 : _b.attr('href')) === null || _c === void 0 ? void 0 : _c.replace('./', 'https://news.google.com/')) || ((_f = (_e = (_d = $(this)) === null || _d === void 0 ? void 0 : _d.find('a[href^="./read"]')) === null || _e === void 0 ? void 0 : _e.attr('href')) === null || _f === void 0 ? void 0 : _f.replace('./', 'https://news.google.com/')) || "";
const srcset = (_g = $(this).find('figure').find('img').attr('srcset')) === null || _g === void 0 ? void 0 : _g.split(' ');
Expand All @@ -2779,7 +2810,7 @@ const googleNewsScraper = (userConfig) => __awaiter(void 0, void 0, void 0, func
});
if (config.prettyURLs) {
results = yield Promise.all(results.map(article => {
const url = getPrettyUrl(article.link, logger);
const url = getPrettyUrl(article.link);
if (url) {
article.link = url;
}
Expand Down
2 changes: 1 addition & 1 deletion dist/esm/index.mjs.map

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion dist/esm/min/index.min.mjs

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion dist/esm/min/index.min.mjs.map

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion dist/tsc/buildQueryString.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ const buildQueryString = (query) => {
if (Object.keys(query).length === 0)
return "";
// Build query string
// Example: { q: 'puppies', hl: 'en', gl: 'US' } => '?q=puppies&hl=en&gl=US'
// Example: { q: 'zapatos', gl: 'ES', ceid: "es:es" } => '?q=zapatos&gl=ES&ceid=ES:es'
const queryString = Object.keys(query).reduce((acc, key, index) => {
const prefix = index === 0 ? '?' : '&';
return `${acc}${prefix}${key}=${query[key]}`;
Expand Down
71 changes: 51 additions & 20 deletions dist/tsc/getPrettyUrl.js
Original file line number Diff line number Diff line change
@@ -1,27 +1,58 @@
// const getPrettyUrl = (uglyUrl: string, logger: winston.Logger): string | null => {
// const base64Match = uglyUrl.match(/\/read\/([A-Za-z0-9-_]+)/);
// if (!base64Match) {
// return null;
// }
// const base64String = base64Match[1];
// try {
// const decodedString = Buffer.from(base64String, "base64").toString("ascii");
// const urlPattern = /https?:\/\/[^\s"']+/g;
// const matches = decodedString.match(urlPattern) || [];
// const urls = matches.flatMap(match => {
// const splitUrls = match.split(/(?<!http:|https:)R(?![a-zA-Z0-9-_])|(?<!http:|https:)y(?![a-zA-Z0-9-_])/);
// return splitUrls.filter(url => {
// const cleanUrl = url.trim().replace(/[^\w\-\/:.]+$/, '').replace(/\\x[0-9A-Fa-f]{2}/g, '');
// return cleanUrl;
// });
// });
// const uniqueUrls = [...new Set(urls)];
// const finalUrl = uniqueUrls.length ? uniqueUrls[0] : uglyUrl;
// logger.info(finalUrl);
// return finalUrl;
// } catch (error) {
// logger.error(error);
// return null;
// }
// }
/**
 * Converts an opaque Google News "/read/<id>" URL into the publisher's
 * original article URL by reversing Google's double Base64 wrapping.
 *
 * @param {string} uglyUrl - A news.google.com article URL (".../read/<base64url-id>?...").
 * @param {object} [logger] - Optional winston-style logger; only `.error` is used.
 *   Callers may omit it (the scraper calls `getPrettyUrl(article.link)`).
 * @returns {string|null} The decoded publisher URL, or null when the URL has
 *   no "/read/" segment or decoding fails.
 */
const getPrettyUrl = (uglyUrl, logger) => {
    // Step 1: extract the URL-safe Base64 payload after "/read/"; the
    // character class stops at "?" so the query string is excluded.
    const base64Match = uglyUrl.match(/\/read\/([A-Za-z0-9-_]+)/);
    if (!base64Match) {
        return null;
    }
    try {
        let encodedPart = base64Match[1];
        // Step 2: drop the "CB" envelope prefix Google prepends to some ids.
        if (encodedPart.startsWith('CB')) {
            encodedPart = encodedPart.substring(2);
        }
        // Step 3: convert URL-safe Base64 to the standard alphabet.
        encodedPart = encodedPart.replace(/-/g, '+').replace(/_/g, '/');
        // Step 4: restore the "=" padding stripped from the URL form.
        encodedPart += '='.repeat((4 - (encodedPart.length % 4)) % 4);
        // Step 5: first decode yields a binary wrapper that embeds a second
        // Base64-URL string.
        const firstDecodedBytes = atob(encodedPart);
        // Step 6: collect the runs of Base64-URL characters — joined, they
        // form the inner payload.
        const innerRuns = firstDecodedBytes.match(/[A-Za-z0-9\-_]+/g);
        const secondEncodedPart = innerRuns ? innerRuns.join('') : '';
        // Step 7: normalise and re-pad the inner payload.
        let secondEncoded = secondEncodedPart.replace(/-/g, '+').replace(/_/g, '/');
        secondEncoded += '='.repeat((4 - (secondEncoded.length % 4)) % 4);
        // Step 8: second decode produces the publisher's own URL.
        return atob(secondEncoded);
    }
    catch (error) {
        // logger is optional — guard so a decode failure never crashes the
        // scrape when the caller passed only one argument.
        if (logger) {
            logger.error(error);
        }
        return null;
    }
};
Expand Down
2 changes: 1 addition & 1 deletion dist/tsc/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ const googleNewsScraper = (userConfig) => __awaiter(void 0, void 0, void 0, func
let results = [];
let i = 0;
const urlChecklist = [];
$(articles).each(function (i) {
$(articles).each(function () {
var _a, _b, _c, _d, _e, _f, _g, _h, _j;
const link = ((_c = (_b = (_a = $(this)) === null || _a === void 0 ? void 0 : _a.find('a[href^="./article"]')) === null || _b === void 0 ? void 0 : _b.attr('href')) === null || _c === void 0 ? void 0 : _c.replace('./', 'https://news.google.com/')) || ((_f = (_e = (_d = $(this)) === null || _d === void 0 ? void 0 : _d.find('a[href^="./read"]')) === null || _e === void 0 ? void 0 : _e.attr('href')) === null || _f === void 0 ? void 0 : _f.replace('./', 'https://news.google.com/')) || "";
link && urlChecklist.push(link);
Expand Down
2 changes: 1 addition & 1 deletion src/buildQueryString.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ const buildQueryString = ( query: QueryVars ) => {
if (Object.keys(query).length === 0) return "";

// Build query string
// Example: { q: 'puppies', hl: 'en', gl: 'US' } => '?q=puppies&hl=en&gl=US'
// Example: { q: 'zapatos', gl: 'ES', ceid: "es:es" } => '?q=zapatos&gl=ES&ceid=ES:es'
const queryString = Object.keys(query).reduce((acc, key, index) => {
const prefix = index === 0 ? '?' : '&'
return `${acc}${prefix}${key}=${query[key]}`
Expand Down

0 comments on commit ee3857b

Please sign in to comment.