Skip to content

Commit

Permalink
Update readme.
Browse files Browse the repository at this point in the history
  • Loading branch information
lewisdonovan committed Nov 1, 2024
1 parent 05edb88 commit ee3857b
Show file tree
Hide file tree
Showing 12 changed files with 168 additions and 76 deletions.
3 changes: 1 addition & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,9 @@ yarn add google-news-scraper
Simply import the package and pass a config object.
```javascript
import googleNewsScraper from 'google-news-scraper';

const articles = await googleNewsScraper({ searchTerm: "The Oscars" });

```
A minimum working example can be found in [this repo](https://github.com/lewisdonovan/gns-example).
Full documentation on the [config object](#config) can be found below.

## Output 📲
Expand Down
77 changes: 54 additions & 23 deletions dist/cjs/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -81,30 +81,61 @@ const getArticleType = (article) => {
return "";
};

// const getPrettyUrl = (uglyUrl: string, logger: winston.Logger): string | null => {
// const base64Match = uglyUrl.match(/\/read\/([A-Za-z0-9-_]+)/);
// if (!base64Match) {
// return null;
// }
// const base64String = base64Match[1];
// try {
// const decodedString = Buffer.from(base64String, "base64").toString("ascii");
// const urlPattern = /https?:\/\/[^\s"']+/g;
// const matches = decodedString.match(urlPattern) || [];
// const urls = matches.flatMap(match => {
// const splitUrls = match.split(/(?<!http:|https:)R(?![a-zA-Z0-9-_])|(?<!http:|https:)y(?![a-zA-Z0-9-_])/);
// return splitUrls.filter(url => {
// const cleanUrl = url.trim().replace(/[^\w\-\/:.]+$/, '').replace(/\\x[0-9A-Fa-f]{2}/g, '');
// return cleanUrl;
// });
// });
// const uniqueUrls = [...new Set(urls)];
// const finalUrl = uniqueUrls.length ? uniqueUrls[0] : uglyUrl;
// logger.info(finalUrl);
// return finalUrl;
// } catch (error) {
// logger.error(error);
// return null;
// }
// }
/**
 * Converts an opaque Google News "/read/<id>" URL into the publisher's
 * original article URL by reversing Google's double Base64 wrapping.
 *
 * @param {string} uglyUrl - A news.google.com article URL (".../read/<base64url-id>?...").
 * @param {object} [logger] - Optional winston-style logger; only `.error` is used.
 *   Callers may omit it (the scraper calls `getPrettyUrl(article.link)`).
 * @returns {string|null} The decoded publisher URL, or null when the URL has
 *   no "/read/" segment or decoding fails.
 */
const getPrettyUrl = (uglyUrl, logger) => {
    // Step 1: extract the URL-safe Base64 payload after "/read/"; the
    // character class stops at "?" so the query string is excluded.
    const base64Match = uglyUrl.match(/\/read\/([A-Za-z0-9-_]+)/);
    if (!base64Match) {
        return null;
    }
    try {
        let encodedPart = base64Match[1];
        // Step 2: drop the "CB" envelope prefix Google prepends to some ids.
        if (encodedPart.startsWith('CB')) {
            encodedPart = encodedPart.substring(2);
        }
        // Step 3: convert URL-safe Base64 to the standard alphabet.
        encodedPart = encodedPart.replace(/-/g, '+').replace(/_/g, '/');
        // Step 4: restore the "=" padding stripped from the URL form.
        encodedPart += '='.repeat((4 - (encodedPart.length % 4)) % 4);
        // Step 5: first decode yields a binary wrapper that embeds a second
        // Base64-URL string.
        const firstDecodedBytes = atob(encodedPart);
        // Step 6: collect the runs of Base64-URL characters — joined, they
        // form the inner payload.
        const innerRuns = firstDecodedBytes.match(/[A-Za-z0-9\-_]+/g);
        const secondEncodedPart = innerRuns ? innerRuns.join('') : '';
        // Step 7: normalise and re-pad the inner payload.
        let secondEncoded = secondEncodedPart.replace(/-/g, '+').replace(/_/g, '/');
        secondEncoded += '='.repeat((4 - (secondEncoded.length % 4)) % 4);
        // Step 8: second decode produces the publisher's own URL.
        return atob(secondEncoded);
    }
    catch (error) {
        // logger is optional — guard so a decode failure never crashes the
        // scrape when the caller passed only one argument.
        if (logger) {
            logger.error(error);
        }
        return null;
    }
};
Expand All @@ -114,7 +145,7 @@ const buildQueryString = (query) => {
if (Object.keys(query).length === 0)
return "";
// Build query string
// Example: { q: 'puppies', hl: 'en', gl: 'US' } => '?q=puppies&hl=en&gl=US'
// Example: { q: 'zapatos', gl: 'ES', ceid: "es:es" } => '?q=zapatos&gl=ES&ceid=ES:es'
const queryString = Object.keys(query).reduce((acc, key, index) => {
const prefix = index === 0 ? '?' : '&';
return `${acc}${prefix}${key}=${query[key]}`;
Expand Down Expand Up @@ -2778,7 +2809,7 @@ const googleNewsScraper = (userConfig) => __awaiter(void 0, void 0, void 0, func
const $ = cheerio__namespace.load(content);
const articles = $('article');
let results = [];
$(articles).each(function (i) {
$(articles).each(function () {
var _a, _b, _c, _d, _e, _f, _g, _h, _j;
const link = ((_c = (_b = (_a = $(this)) === null || _a === void 0 ? void 0 : _a.find('a[href^="./article"]')) === null || _b === void 0 ? void 0 : _b.attr('href')) === null || _c === void 0 ? void 0 : _c.replace('./', 'https://news.google.com/')) || ((_f = (_e = (_d = $(this)) === null || _d === void 0 ? void 0 : _d.find('a[href^="./read"]')) === null || _e === void 0 ? void 0 : _e.attr('href')) === null || _f === void 0 ? void 0 : _f.replace('./', 'https://news.google.com/')) || "";
const srcset = (_g = $(this).find('figure').find('img').attr('srcset')) === null || _g === void 0 ? void 0 : _g.split(' ');
Expand All @@ -2800,7 +2831,7 @@ const googleNewsScraper = (userConfig) => __awaiter(void 0, void 0, void 0, func
});
if (config.prettyURLs) {
results = yield Promise.all(results.map(article => {
const url = getPrettyUrl(article.link, logger);
const url = getPrettyUrl(article.link);
if (url) {
article.link = url;
}
Expand Down
2 changes: 1 addition & 1 deletion dist/cjs/min/index.min.js

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion dist/cjs/min/index.min.js.map

Large diffs are not rendered by default.

77 changes: 54 additions & 23 deletions dist/esm/index.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -60,30 +60,61 @@ const getArticleType = (article) => {
return "";
};

// const getPrettyUrl = (uglyUrl: string, logger: winston.Logger): string | null => {
// const base64Match = uglyUrl.match(/\/read\/([A-Za-z0-9-_]+)/);
// if (!base64Match) {
// return null;
// }
// const base64String = base64Match[1];
// try {
// const decodedString = Buffer.from(base64String, "base64").toString("ascii");
// const urlPattern = /https?:\/\/[^\s"']+/g;
// const matches = decodedString.match(urlPattern) || [];
// const urls = matches.flatMap(match => {
// const splitUrls = match.split(/(?<!http:|https:)R(?![a-zA-Z0-9-_])|(?<!http:|https:)y(?![a-zA-Z0-9-_])/);
// return splitUrls.filter(url => {
// const cleanUrl = url.trim().replace(/[^\w\-\/:.]+$/, '').replace(/\\x[0-9A-Fa-f]{2}/g, '');
// return cleanUrl;
// });
// });
// const uniqueUrls = [...new Set(urls)];
// const finalUrl = uniqueUrls.length ? uniqueUrls[0] : uglyUrl;
// logger.info(finalUrl);
// return finalUrl;
// } catch (error) {
// logger.error(error);
// return null;
// }
// }
/**
 * Converts an opaque Google News "/read/<id>" URL into the publisher's
 * original article URL by reversing Google's double Base64 wrapping.
 *
 * @param {string} uglyUrl - A news.google.com article URL (".../read/<base64url-id>?...").
 * @param {object} [logger] - Optional winston-style logger; only `.error` is used.
 *   Callers may omit it (the scraper calls `getPrettyUrl(article.link)`).
 * @returns {string|null} The decoded publisher URL, or null when the URL has
 *   no "/read/" segment or decoding fails.
 */
const getPrettyUrl = (uglyUrl, logger) => {
    // Step 1: extract the URL-safe Base64 payload after "/read/"; the
    // character class stops at "?" so the query string is excluded.
    const base64Match = uglyUrl.match(/\/read\/([A-Za-z0-9-_]+)/);
    if (!base64Match) {
        return null;
    }
    try {
        let encodedPart = base64Match[1];
        // Step 2: drop the "CB" envelope prefix Google prepends to some ids.
        if (encodedPart.startsWith('CB')) {
            encodedPart = encodedPart.substring(2);
        }
        // Step 3: convert URL-safe Base64 to the standard alphabet.
        encodedPart = encodedPart.replace(/-/g, '+').replace(/_/g, '/');
        // Step 4: restore the "=" padding stripped from the URL form.
        encodedPart += '='.repeat((4 - (encodedPart.length % 4)) % 4);
        // Step 5: first decode yields a binary wrapper that embeds a second
        // Base64-URL string.
        const firstDecodedBytes = atob(encodedPart);
        // Step 6: collect the runs of Base64-URL characters — joined, they
        // form the inner payload.
        const innerRuns = firstDecodedBytes.match(/[A-Za-z0-9\-_]+/g);
        const secondEncodedPart = innerRuns ? innerRuns.join('') : '';
        // Step 7: normalise and re-pad the inner payload.
        let secondEncoded = secondEncodedPart.replace(/-/g, '+').replace(/_/g, '/');
        secondEncoded += '='.repeat((4 - (secondEncoded.length % 4)) % 4);
        // Step 8: second decode produces the publisher's own URL.
        return atob(secondEncoded);
    }
    catch (error) {
        // logger is optional — guard so a decode failure never crashes the
        // scrape when the caller passed only one argument.
        if (logger) {
            logger.error(error);
        }
        return null;
    }
};
Expand All @@ -93,7 +124,7 @@ const buildQueryString = (query) => {
if (Object.keys(query).length === 0)
return "";
// Build query string
// Example: { q: 'puppies', hl: 'en', gl: 'US' } => '?q=puppies&hl=en&gl=US'
// Example: { q: 'zapatos', gl: 'ES', ceid: "es:es" } => '?q=zapatos&gl=ES&ceid=ES:es'
const queryString = Object.keys(query).reduce((acc, key, index) => {
const prefix = index === 0 ? '?' : '&';
return `${acc}${prefix}${key}=${query[key]}`;
Expand Down Expand Up @@ -2757,7 +2788,7 @@ const googleNewsScraper = (userConfig) => __awaiter(void 0, void 0, void 0, func
const $ = cheerio.load(content);
const articles = $('article');
let results = [];
$(articles).each(function (i) {
$(articles).each(function () {
var _a, _b, _c, _d, _e, _f, _g, _h, _j;
const link = ((_c = (_b = (_a = $(this)) === null || _a === void 0 ? void 0 : _a.find('a[href^="./article"]')) === null || _b === void 0 ? void 0 : _b.attr('href')) === null || _c === void 0 ? void 0 : _c.replace('./', 'https://news.google.com/')) || ((_f = (_e = (_d = $(this)) === null || _d === void 0 ? void 0 : _d.find('a[href^="./read"]')) === null || _e === void 0 ? void 0 : _e.attr('href')) === null || _f === void 0 ? void 0 : _f.replace('./', 'https://news.google.com/')) || "";
const srcset = (_g = $(this).find('figure').find('img').attr('srcset')) === null || _g === void 0 ? void 0 : _g.split(' ');
Expand All @@ -2779,7 +2810,7 @@ const googleNewsScraper = (userConfig) => __awaiter(void 0, void 0, void 0, func
});
if (config.prettyURLs) {
results = yield Promise.all(results.map(article => {
const url = getPrettyUrl(article.link, logger);
const url = getPrettyUrl(article.link);
if (url) {
article.link = url;
}
Expand Down
2 changes: 1 addition & 1 deletion dist/esm/index.mjs.map

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion dist/esm/min/index.min.mjs

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion dist/esm/min/index.min.mjs.map

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion dist/tsc/buildQueryString.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ const buildQueryString = (query) => {
if (Object.keys(query).length === 0)
return "";
// Build query string
// Example: { q: 'puppies', hl: 'en', gl: 'US' } => '?q=puppies&hl=en&gl=US'
// Example: { q: 'zapatos', gl: 'ES', ceid: "es:es" } => '?q=zapatos&gl=ES&ceid=ES:es'
const queryString = Object.keys(query).reduce((acc, key, index) => {
const prefix = index === 0 ? '?' : '&';
return `${acc}${prefix}${key}=${query[key]}`;
Expand Down
71 changes: 51 additions & 20 deletions dist/tsc/getPrettyUrl.js
Original file line number Diff line number Diff line change
@@ -1,27 +1,58 @@
// const getPrettyUrl = (uglyUrl: string, logger: winston.Logger): string | null => {
// const base64Match = uglyUrl.match(/\/read\/([A-Za-z0-9-_]+)/);
// if (!base64Match) {
// return null;
// }
// const base64String = base64Match[1];
// try {
// const decodedString = Buffer.from(base64String, "base64").toString("ascii");
// const urlPattern = /https?:\/\/[^\s"']+/g;
// const matches = decodedString.match(urlPattern) || [];
// const urls = matches.flatMap(match => {
// const splitUrls = match.split(/(?<!http:|https:)R(?![a-zA-Z0-9-_])|(?<!http:|https:)y(?![a-zA-Z0-9-_])/);
// return splitUrls.filter(url => {
// const cleanUrl = url.trim().replace(/[^\w\-\/:.]+$/, '').replace(/\\x[0-9A-Fa-f]{2}/g, '');
// return cleanUrl;
// });
// });
// const uniqueUrls = [...new Set(urls)];
// const finalUrl = uniqueUrls.length ? uniqueUrls[0] : uglyUrl;
// logger.info(finalUrl);
// return finalUrl;
// } catch (error) {
// logger.error(error);
// return null;
// }
// }
/**
 * Converts an opaque Google News "/read/<id>" URL into the publisher's
 * original article URL by reversing Google's double Base64 wrapping.
 *
 * @param {string} uglyUrl - A news.google.com article URL (".../read/<base64url-id>?...").
 * @param {object} [logger] - Optional winston-style logger; only `.error` is used.
 *   Callers may omit it (the scraper calls `getPrettyUrl(article.link)`).
 * @returns {string|null} The decoded publisher URL, or null when the URL has
 *   no "/read/" segment or decoding fails.
 */
const getPrettyUrl = (uglyUrl, logger) => {
    // Step 1: extract the URL-safe Base64 payload after "/read/"; the
    // character class stops at "?" so the query string is excluded.
    const base64Match = uglyUrl.match(/\/read\/([A-Za-z0-9-_]+)/);
    if (!base64Match) {
        return null;
    }
    try {
        let encodedPart = base64Match[1];
        // Step 2: drop the "CB" envelope prefix Google prepends to some ids.
        if (encodedPart.startsWith('CB')) {
            encodedPart = encodedPart.substring(2);
        }
        // Step 3: convert URL-safe Base64 to the standard alphabet.
        encodedPart = encodedPart.replace(/-/g, '+').replace(/_/g, '/');
        // Step 4: restore the "=" padding stripped from the URL form.
        encodedPart += '='.repeat((4 - (encodedPart.length % 4)) % 4);
        // Step 5: first decode yields a binary wrapper that embeds a second
        // Base64-URL string.
        const firstDecodedBytes = atob(encodedPart);
        // Step 6: collect the runs of Base64-URL characters — joined, they
        // form the inner payload.
        const innerRuns = firstDecodedBytes.match(/[A-Za-z0-9\-_]+/g);
        const secondEncodedPart = innerRuns ? innerRuns.join('') : '';
        // Step 7: normalise and re-pad the inner payload.
        let secondEncoded = secondEncodedPart.replace(/-/g, '+').replace(/_/g, '/');
        secondEncoded += '='.repeat((4 - (secondEncoded.length % 4)) % 4);
        // Step 8: second decode produces the publisher's own URL.
        return atob(secondEncoded);
    }
    catch (error) {
        // logger is optional — guard so a decode failure never crashes the
        // scrape when the caller passed only one argument.
        if (logger) {
            logger.error(error);
        }
        return null;
    }
};
Expand Down
2 changes: 1 addition & 1 deletion dist/tsc/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ const googleNewsScraper = (userConfig) => __awaiter(void 0, void 0, void 0, func
let results = [];
let i = 0;
const urlChecklist = [];
$(articles).each(function (i) {
$(articles).each(function () {
var _a, _b, _c, _d, _e, _f, _g, _h, _j;
const link = ((_c = (_b = (_a = $(this)) === null || _a === void 0 ? void 0 : _a.find('a[href^="./article"]')) === null || _b === void 0 ? void 0 : _b.attr('href')) === null || _c === void 0 ? void 0 : _c.replace('./', 'https://news.google.com/')) || ((_f = (_e = (_d = $(this)) === null || _d === void 0 ? void 0 : _d.find('a[href^="./read"]')) === null || _e === void 0 ? void 0 : _e.attr('href')) === null || _f === void 0 ? void 0 : _f.replace('./', 'https://news.google.com/')) || "";
link && urlChecklist.push(link);
Expand Down
2 changes: 1 addition & 1 deletion src/buildQueryString.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ const buildQueryString = ( query: QueryVars ) => {
if (Object.keys(query).length === 0) return "";

// Build query string
// Example: { q: 'puppies', hl: 'en', gl: 'US' } => '?q=puppies&hl=en&gl=US'
// Example: { q: 'zapatos', gl: 'ES', ceid: "es:es" } => '?q=zapatos&gl=ES&ceid=ES:es'
const queryString = Object.keys(query).reduce((acc, key, index) => {
const prefix = index === 0 ? '?' : '&'
return `${acc}${prefix}${key}=${query[key]}`
Expand Down

0 comments on commit ee3857b

Please sign in to comment.