shahednasser · iwaduarte · Sep 20, 2023 · Sep 21, 2023 · Sep 21, 2023 · Sep 21, 2023
diff --git a/.eslintrc.js b/.eslintrc.js
@@ -14,5 +14,6 @@ module.exports = {
     'no-console': 'off',
     'default-case': 'off',
     'no-prototype-builtins': 'off',
+    'linebreak-style': 0,
   },
 };
diff --git a/.gitattributes b/.gitattributes
@@ -0,0 +1 @@
+* text=auto
diff --git a/README.md b/README.md
@@ -167,7 +167,8 @@ Starting from version 1.2.3, you can now post local markdown files to the platfo
 For example:
 
 ```bash
-cross-post run /path/to/test.md -l
+# canonicalUrl is optional
+cross-post run /path/to/test.md -l [canonicalUrl]
 ```
 
 You can also use any of the previous options mentioned.

diff --git a/index.js b/index.js
@@ -9,7 +9,7 @@ program.usage('[command] [options]');
 program
   .command('run <url>')
   .description('Cross post a blog post')
-  .option('-l, --local', 'Use if the you want to directly post a local Markdown file. <url> in this case should be the path to the file')
+  .option('-l, --local [canonicalUrl]', 'For using a local Markdown file, <url> will be the path and <canonicalUrl> is optional')
   .option('-t, --title [title]', 'Title for the article')
   .option('-p, --platforms [platforms...]', `Platforms to post articles to. Allowed values are: ${allowedPlatforms.join(', ')}`)
   .option('-s, --selector [selector]', 'The selector to look for in the document in the URL supplied. By default, it will be article. '

diff --git a/src/commands/run.js b/src/commands/run.js
@@ -1,15 +1,12 @@
 const fs = require('fs');
 const path = require('path');
-const process = require('process');
 const Conf = require('conf');
 const got = require('got');
-const jsdom = require('jsdom');
+const { JSDOM } = require('jsdom');
 const htmlparser2 = require('htmlparser2');
 const { URLSearchParams } = require('url');
 const { marked } = require('marked');
 
-const { JSDOM } = jsdom;
-
 const TurndownService = require('turndown');
 const CLI = require('clui');
 
@@ -23,7 +20,7 @@ const {
   displaySuccess,
   isPlatformAllowed,
   platformNotAllowedMessage,
-  isDataURL,
+  isDataURL, getRemoteArticleDOM, findMainContentElements, formatMarkdownImages,
 } = require('../utils');
 const postToDev = require('./platforms/dev');
 const postToHashnode = require('./platforms/hashnode');
@@ -63,30 +60,27 @@ function search(type, node) {
 }
 
 /**
- * 
+ *
  * @param {*} url the string that has provided by the user
- * @returns 
+ * @returns
  */
 async function getImageForHashnode(url) {
-  try {
-    const response = await got(url);
-    let count = 0, imageUrl;
-    const parser = new htmlparser2.Parser({
-      onopentag: function(name, attribs) {
-        if (name === 'img' && attribs.src && attribs.src.includes('/_next/image')) {
-          count += 1;
-          if (count === 2) {
-            imageUrl = attribs.src;
-          }
+  const response = await got(url);
+  let count = 0;
+  let imageUrl;
+  const parser = new htmlparser2.Parser({
+    onopentag(name, attribs) {
+      if (name === 'img' && attribs.src && attribs.src.includes('/_next/image')) {
+        count += 1;
+        if (count === 2) {
+          imageUrl = attribs.src;
         }
-      },
-    });
-    parser.write(response.body);
-    parser.end();
-    return imageUrl;
-  } catch (error) {
-    //pass
-  }
+      }
+    },
+  });
+  parser.write(response.body);
+  parser.end();
+  return imageUrl;
 }
 
 /**
@@ -153,7 +147,7 @@ function postToPlatforms(title, markdown, url, image, p) {
 /**
  *
  * @param {string} url URL of the blog post
- * @param {object} param1 The parameters from the command line
+ * @param {object} options The parameters from the command line
  */
 async function run(url, options) {
   let {
@@ -183,12 +177,7 @@ async function run(url, options) {
   }
 
   // check if configurations exist for the platforms
-  const errorPlatform = chosenPlatforms.find((platform) => {
-    if (!configstore.get(platform)) {
-      return true;
-    }
-    return false;
-  });
+  const errorPlatform = chosenPlatforms.find((platform) => !configstore.get(platform));
 
   if (errorPlatform) {
     console.error(
@@ -282,18 +271,16 @@ async function run(url, options) {
             if (image) {
               image = image.getAttribute('src');
             }
+          } else if (url.includes('hashnode')) {
+            await getImageForHashnode(url).then((img) => {
+              const params = new URLSearchParams(img.split('?')[1]);
+              image = params.get('url');
+            });
           } else {
-            if (url.includes("hashnode")) {
-              await getImageForHashnode(url).then((img) => {
-                const params = new URLSearchParams(img.split('?')[1]);
-                image = params.get('url');
-              });
-            }else{
-              image = search('image')
-            }
+            image = search('image', articleNode);
           }
         }
-        // check if image is dataurl
+        // check if image is data-url
         if (image && isDataURL(image)) {
           const res = await uploadToCloudinary(image);
           image = res.url;
@@ -304,8 +291,16 @@ async function run(url, options) {
         }
       }
     }
+    // create links for images in files
+    const isLocalAPath = typeof local === 'string';
+
+    const articleDOM = isLocalAPath && await getRemoteArticleDOM(local);
+    const mainElement = isLocalAPath && findMainContentElements(articleDOM.window.document.body);
+    markdown = isLocalAPath ? formatMarkdownImages(markdown, mainElement, local) : markdown;
+    const newURL = local ? '' : url;
+    const canonicalURL = isLocalAPath ? local : newURL;
 
-    postToPlatforms(title, markdown, local ? '' : url, image, p);
+    postToPlatforms(title, markdown, canonicalURL, image, p);
   } else {
     handleError('No articles found in the URL.');
   }

diff --git a/src/utils.js b/src/utils.js
@@ -1,7 +1,185 @@
 const chalk = require('chalk');
+const { get } = require('axios');
+const { JSDOM } = require('jsdom');
 
 const allowedPlatforms = ['dev', 'hashnode', 'medium'];
 
+/**
+ * Replaces a leading 'http://' scheme with 'https://' in a given URL.
+ * URLs that start with anything else are returned unchanged. The scheme is
+ * matched case-insensitively, so 'HTTP://' is upgraded as well.
+ *
+ * @function
+ * @name enforceHTTPS
+ * @param {string|null|undefined} url - The URL to be converted to HTTPS.
+ * @returns {string|null} - The URL with 'https' scheme, or null if the input
+ * is null or undefined.
+ *
+ * @example
+ * const url = "http://example.com";
+ * const httpsUrl = enforceHTTPS(url);  // Output will be "https://example.com"
+ */
+const enforceHTTPS = (url) => url?.replace(/^http:\/\//i, 'https://') ?? null;
+
+/**
+ * Downloads the page at the given URL and parses the response data with JSDOM.
+ * The URL is passed through `enforceHTTPS` before the request is made.
+ *
+ * @async
+ * @function
+ * @name getRemoteArticleDOM
+ * @param {string} url - The URL of the remote article to fetch.
+ * @returns {Promise<JSDOM>} - A promise that resolves to a JSDOM object built
+ * from the response data.
+ */
+const getRemoteArticleDOM = async (url) => {
+  const secureUrl = enforceHTTPS(url);
+  const response = await get(secureUrl);
+  return new JSDOM(response.data);
+};
+
+/**
+ * Finds the nearest common ancestor of an array of HTML elements.
+ * An element counts as its own ancestor, so for a single element the element
+ * itself is returned.
+ *
+ * @function
+ * @name findNearestCommonAncestor
+ * @param {HTMLElement[]|null|undefined} elements - An array of HTML elements for
+ * which to find the nearest common ancestor.
+ * @returns {HTMLElement|null} - The nearest common ancestor element, or null
+ * if the input array is empty, null or undefined, or if the elements share no ancestor.
+ *
+ * @example
+ * const elem1 = document.getElementById('elem1');
+ * const elem2 = document.getElementById('elem2');
+ * const commonAncestor = findNearestCommonAncestor([elem1, elem2]);
+ *
+ * // commonAncestor will contain the nearest common ancestor HTMLElement or null.
+ */
+const findNearestCommonAncestor = (elements) => {
+  // `!elements?.length` covers null, undefined and the empty array alike
+  if (!elements?.length) {
+    return null;
+  }
+
+  // Collects the element itself followed by all of its ancestors, nearest first
+  const collectAncestors = (element) => {
+    const ancestors = new Set();
+    for (let node = element; node; node = node.parentElement) {
+      ancestors.add(node);
+    }
+    return ancestors;
+  };
+
+  const [firstChain, ...otherChains] = elements.map((element) => collectAncestors(element));
+
+  // A Set iterates in insertion order, so firstChain is ordered nearest-first and the
+  // first entry that every other chain also contains is the nearest common ancestor
+  const isShared = (ancestor) => otherChains.every((chain) => chain.has(ancestor));
+  const nearest = [...firstChain].find(isShared);
+
+  return nearest || null;
+};
+
+/**
+ * Scores the parents of text elements (`p`, `blockquote`, `h1`-`h6`) by the weighted
+ * text length of those children and returns up to 20 of the highest-scoring parents
+ * that have at least one `<p>` child.
+ *
+ * @function
+ * @name rankingTag
+ * @param {HTMLElement} document - The HTML jsdom element representing the root of the document.
+ * @returns {HTMLElement[]} - At most 20 parent elements, highest score first.
+ *
+ */
+const rankingTag = (document) => {
+  const HEADING_WEIGHT = 0.6;
+  const tagWeights = {
+    p: 0.8,
+    blockquote: 0.9,
+    h1: HEADING_WEIGHT,
+    h2: HEADING_WEIGHT,
+    h3: HEADING_WEIGHT,
+    h4: HEADING_WEIGHT,
+    h5: HEADING_WEIGHT,
+    h6: HEADING_WEIGHT,
+  };
+  // Tag names with a dash are skipped, both for the element and for its parent
+  const hasDash = (name) => name.includes('-');
+
+  const parentScores = new Map();
+  const parentsWithParagraph = new Set();
+  const candidates = document.querySelectorAll('p, blockquote, h1, h2, h3, h4, h5, h6');
+
+  Array.from(candidates).forEach((node) => {
+    const textLength = node.textContent.length;
+    const tag = node.tagName.toLowerCase();
+    const parent = node.parentElement;
+
+    if (hasDash(tag) || !parent) return;
+    if (hasDash(parent.tagName.toLowerCase())) return;
+
+    // Every child adds its weighted text length to the score of its parent
+    const gained = textLength * tagWeights[tag];
+    const previous = parentScores.has(parent) ? parentScores.get(parent) : 0;
+    parentScores.set(parent, previous + gained);
+
+    if (tag === 'p') {
+      parentsWithParagraph.add(parent);
+    }
+  });
+
+  return [...parentScores]
+    .filter(([parent]) => parentsWithParagraph.has(parent))
+    .sort((left, right) => right[1] - left[1])
+    .slice(0, 20)
+    .map(([parent]) => parent);
+};
+
+/**
+ * Returns the nearest common ancestor of the elements that `rankingTag` selects
+ * inside the given element.
+ *
+ * @function
+ * @name findMainContentElements
+ * @param {HTMLElement} document - The HTML jsdom element to search in.
+ * @returns {HTMLElement|null} - The common ancestor, or null when `rankingTag`
+ * selects no elements.
+ */
+const findMainContentElements = (document) => {
+  const rankedElements = rankingTag(document);
+  return findNearestCommonAncestor(rankedElements);
+};
+
+/**
+ * Replaces the image URLs in a Markdown string with the image URLs found in an
+ * HTML element. Images are paired by position: the n-th Markdown image receives the
+ * n-th usable image source of the element, and Markdown images without a counterpart
+ * keep their original URL.
+ *
+ * @function
+ * @name formatMarkdownImages
+ * @param {string} markdown - The Markdown text that needs to be formatted.
+ * @param {HTMLElement|null} element - The HTMLElement (from jsdom) where images will be
+ * extracted. When it is null or undefined the Markdown is returned unchanged.
+ * @param {string} url - The URL whose origin is used for setting the images absolute path
+ * @returns {string} - The formatted Markdown string.
+ * @throws {TypeError} - If `url` cannot be parsed as a URL.
+ *
+ * @example
+ * const markdown = "![Alt text](/path/to/image.jpg)";
+ * const { document } = new JSDOM('<img src="/imagefromElement.png">').window;
+ * const url = "https://example.com";
+ * formatMarkdownImages(markdown, document.body, url);
+ * // => '![Alt text](https://example.com/imagefromElement.png)'
+ */
+const formatMarkdownImages = (markdown, element, url) => {
+  // Without an element there are no image sources to take over
+  if (!element) {
+    return markdown;
+  }
+
+  const formattedUrl = new URL(url);
+  formattedUrl.pathname = '';
+  formattedUrl.search = '';
+  formattedUrl.hash = '';
+
+  const baseUrl = formattedUrl.toString();
+
+  const toAbsoluteUrl = (source) => {
+    if (source.startsWith('http://') || source.startsWith('https://')) {
+      return enforceHTTPS(source);
+    }
+    try {
+      // Resolving through the URL API avoids the double slash that plain concatenation
+      // gives for '/path' sources and keeps '//host/path' and 'data:' sources intact
+      return enforceHTTPS(new URL(source, baseUrl).toString());
+    } catch (error) {
+      // The source could not be resolved, fall back to plain concatenation
+      return enforceHTTPS(`${baseUrl}${source}`);
+    }
+  };
+
+  const imagesSrc = Array.from(element.querySelectorAll('img, picture')).map((node) => {
+    const tagName = node.tagName.toLowerCase();
+
+    if (tagName === 'img') return node.src ? toAbsoluteUrl(node.src) : null;
+    if (tagName === 'picture') {
+      // A <picture> may have no <source> child, or a <source> without srcset
+      const srcset = node.querySelector('source')?.srcset;
+      if (!srcset) return null;
+      // The last srcset candidate is used; its first token is the URL
+      const srcsetItems = srcset.split(',');
+      return toAbsoluteUrl(srcsetItems[srcsetItems.length - 1].trim().split(' ')[0]);
+    }
+    return null;
+  }).filter(Boolean);
+
+  if (url.includes('medium.com')) { imagesSrc.shift(); } // first image is always the profile image
+
+  const GRAB_IMAGES_MARKDOWN_REGEX = /!\[(.*?)]\((.*?)\)/g;
+  return markdown.replace(GRAB_IMAGES_MARKDOWN_REGEX, (match, p1, p2) => {
+    const newUrl = imagesSrc.shift() || p2;
+    return `![${p1}](${newUrl})`;
+  });
+};
+
 module.exports = {
   allowedPlatforms,
   displayError: chalk.bold.red,
@@ -20,4 +198,7 @@ module.exports = {
     return !!s.match(regex);
   },
   imagePlatform: 'cloudinary',
+  findMainContentElements,
+  getRemoteArticleDOM,
+  formatMarkdownImages,
 };