diff --git a/.eslintrc.js b/.eslintrc.js index 0186066..4db8381 100644 --- a/.eslintrc.js +++ b/.eslintrc.js @@ -14,5 +14,6 @@ module.exports = { 'no-console': 'off', 'default-case': 'off', 'no-prototype-builtins': 'off', + 'linebreak-style': 0, }, }; diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..2125666 --- /dev/null +++ b/.gitattributes @@ -0,0 +1 @@ +* text=auto \ No newline at end of file diff --git a/README.md b/README.md index e9ad00c..0111d14 100644 --- a/README.md +++ b/README.md @@ -167,9 +167,14 @@ Starting from version 1.2.3, you can now post local markdown files to the platfo For example: ```bash -cross-post run /path/to/test.md -l +# canonicalUrl is optional +cross-post run /path/to/test.md -l ``` +### What is a canonical URL ? +A canonical URL is the preferred version of a web page. It helps search engines understand which URL to index. Used to avoid duplicate content issues. +It is used if your post is already published elsewhere but you still need more reach. So when publishing to a new vendor you would add that info so the website can point to the original poster. + You can also use any of the previous options mentioned. #### Selector Configuration diff --git a/index.js b/index.js index 84555c7..eb03be1 100755 --- a/index.js +++ b/index.js @@ -9,7 +9,7 @@ program.usage('[command] [options]'); program .command('run ') .description('Cross post a blog post') - .option('-l, --local', 'Use if the you want to directly post a local Markdown file. in this case should be the path to the file') + .option('-l, --local [canonicalUrl]', 'For using a local Markdown file, will be the path and is optional') .option('-t, --title [title]', 'Title for the article') .option('-p, --platforms [platforms...]', `Platforms to post articles to. Allowed values are: ${allowedPlatforms.join(', ')}`) .option('-s, --selector [selector]', 'The selector to look for in the document in the URL supplied. By default, it will be article. ' diff --git a/src/commands/run.js b/src/commands/run.js index 924828e..5fbd1a6 100644 --- a/src/commands/run.js +++ b/src/commands/run.js @@ -1,15 +1,12 @@ const fs = require('fs'); const path = require('path'); -const process = require('process'); const Conf = require('conf'); const got = require('got'); -const jsdom = require('jsdom'); +const { JSDOM } = require('jsdom'); const htmlparser2 = require('htmlparser2'); const { URLSearchParams } = require('url'); const { marked } = require('marked'); -const { JSDOM } = jsdom; - const TurndownService = require('turndown'); const CLI = require('clui'); @@ -23,7 +20,7 @@ const { displaySuccess, isPlatformAllowed, platformNotAllowedMessage, - isDataURL, + isDataURL, getRemoteArticleDOM, findMainContentElements, formatMarkdownImages, } = require('../utils'); const postToDev = require('./platforms/dev'); const postToHashnode = require('./platforms/hashnode'); @@ -63,30 +60,27 @@ function search(type, node) { } /** - * + * * @param {*} url the string that has provided by the user - * @returns + * @returns */ async function getImageForHashnode(url) { - try { - const response = await got(url); - let count = 0, imageUrl; - const parser = new htmlparser2.Parser({ - onopentag: function(name, attribs) { - if (name === 'img' && attribs.src && attribs.src.includes('/_next/image')) { - count += 1; - if (count === 2) { - imageUrl = attribs.src; - } + const response = await got(url); + let count = 0; + let imageUrl; + const parser = new htmlparser2.Parser({ + onopentag(name, attribs) { + if (name === 'img' && attribs.src && attribs.src.includes('/_next/image')) { + count += 1; + if (count === 2) { + imageUrl = attribs.src; } - }, - }); - parser.write(response.body); - parser.end(); - return imageUrl; - } catch (error) { - //pass - } + } + }, + }); + parser.write(response.body); + parser.end(); + return imageUrl; } /** @@ -153,7 +147,7 @@ function postToPlatforms(title, markdown, url, image, p) { /** * * @param {string} url URL of the blog post - * @param {object} param1 The parameters from the command line + * @param {object} options The parameters from the command line */ async function run(url, options) { let { @@ -183,12 +177,7 @@ async function run(url, options) { } // check if configurations exist for the platforms - const errorPlatform = chosenPlatforms.find((platform) => { - if (!configstore.get(platform)) { - return true; - } - return false; - }); + const errorPlatform = chosenPlatforms.find((platform) => !configstore.get(platform)); if (errorPlatform) { console.error( @@ -282,18 +271,16 @@ async function run(url, options) { if (image) { image = image.getAttribute('src'); } + } else if (url.includes('hashnode')) { + await getImageForHashnode(url).then((img) => { + const params = new URLSearchParams(img.split('?')[1]); + image = params.get('url'); + }); } else { - if (url.includes("hashnode")) { - await getImageForHashnode(url).then((img) => { - const params = new URLSearchParams(img.split('?')[1]); - image = params.get('url'); - }); - }else{ - image = search('image') - } + image = search('image', articleNode); } } - // check if image is dataurl + // check if image is data-url if (image && isDataURL(image)) { const res = await uploadToCloudinary(image); image = res.url; @@ -304,8 +291,16 @@ async function run(url, options) { } } } + // create links for images in files + const isLocalAPath = typeof local === 'string'; + + const articleDOM = isLocalAPath && await getRemoteArticleDOM(local); + const mainElement = isLocalAPath && findMainContentElements(articleDOM.window.document.body); + markdown = isLocalAPath ? formatMarkdownImages(markdown, mainElement, local) : markdown; + const newURL = local ? '' : url; + const canonicalURL = isLocalAPath ? local : newURL; - postToPlatforms(title, markdown, local ? '' : url, image, p); + postToPlatforms(title, markdown, canonicalURL, image, p); } else { handleError('No articles found in the URL.'); } diff --git a/src/utils.js b/src/utils.js index e3e7e79..8bbf09c 100644 --- a/src/utils.js +++ b/src/utils.js @@ -1,7 +1,185 @@ const chalk = require('chalk'); +const { get } = require('axios'); +const { JSDOM } = require('jsdom'); const allowedPlatforms = ['dev', 'hashnode', 'medium']; +/** + * Replaces the 'http' scheme with 'https' in a given URL. + * + * @function + * @name enforceHTTPS + * @param {string} url - The URL to be converted to HTTPS. + * @returns {string|null} - The URL with 'https' scheme, or null if the input is null. + * + * @example + * const url = "http://example.com"; + * const httpsUrl = enforceHTTPS(url); // Output will be "https://example.com" + */ +const enforceHTTPS = (url) => url?.replace(/^(http:\/\/)/, 'https://'); + +/** + * Fetches the HTML content from a remote URL and returns it as a JSDOM object. + * + * @async + * @function + * @name getRemoteArticleDOM + * @param {string} url - The URL of the remote article to fetch. + * @returns {Promise} - A promise that resolves to a JSDOM object containing + * the HTML content of the remote article. + */ +const getRemoteArticleDOM = async (url) => { + const { data } = await get(enforceHTTPS(url)); + return new JSDOM(data); +}; + +/** + * Finds the nearest common ancestor of an array of HTML elements. + * + * @function + * @name findNearestCommonAncestor + * @param {HTMLElement[]} elements - An array of HTML elements for which to find + * the nearest common ancestor. + * @returns {HTMLElement|null} - The nearest common ancestor element, or null + * if the input array is empty or null. + * + * @example + * const elem1 = document.getElementById('elem1'); + * const elem2 = document.getElementById('elem2'); + * const commonAncestor = findNearestCommonAncestor([elem1, elem2]); + * + * // commonAncestor will contain the nearest common ancestor HTMLElement or null. + */ +const findNearestCommonAncestor = (elements) => { + if (elements?.length === 0) { + return null; + } + const findAncestors = (element, ancestorsSet) => { + if (element) { + ancestorsSet.add(element); + findAncestors(element.parentElement, ancestorsSet); + } + }; + const ancestorsList = elements.map((element) => { + const ancestors = new Set(); + findAncestors(element, ancestors); + return ancestors; + }); + + const commonAncestors = ancestorsList.reduce((acc, currSet) => acc + .filter((ancestor) => currSet.has(ancestor)), [...ancestorsList[0]]); + + return commonAncestors[0] || null; +}; + +/** + * Ranks HTML elements based on how many text density it has + * and returns the top 20 elements that contain a `

` tag. + * + * @function + * @name rankingTag + * @param {HTMLElement} document - The HTML jsdom element representing the root of the document. + * @returns {HTMLElement[]} - An array of the top 20 HTMLElements that contain a `

` tag. + * + */ +const rankingTag = (document) => { + const elements = document.querySelectorAll('p, blockquote, h1, h2, h3, h4, h5, h6'); + const scoreTag = { + p: 0.8, + blockquote: 0.9, + h1: 0.6, + h2: 0.6, + h3: 0.6, + h4: 0.6, + h5: 0.6, + h6: 0.6, + }; + + const { elementScores, elementHasPTag } = Array.from(elements).reduce( + (acc, element) => { + const textLength = element.textContent.length; + const tagName = element.tagName.toLowerCase(); + + if (tagName.includes('-')) { + return acc; + } + + const scoreMultiplier = scoreTag[tagName]; + const score = textLength * scoreMultiplier; + const { parentElement } = element; + + if (parentElement && !parentElement.tagName.toLowerCase().includes('-')) { + if (acc.elementScores.has(parentElement)) { + acc.elementScores.set(parentElement, acc.elementScores.get(parentElement) + score); + } else { + acc.elementScores.set(parentElement, score); + } + + if (tagName === 'p') { + acc.elementHasPTag.set(parentElement, true); + } + } + + return acc; + }, + { elementScores: new Map(), elementHasPTag: new Map() }, + ); + + return Array.from(elementScores.entries()) + .filter(([parentElement]) => elementHasPTag.has(parentElement)) + .sort(([, scoreA], [, scoreB]) => scoreB - scoreA) + .slice(0, 20) + .map(([element]) => element); +}; +const findMainContentElements = (document) => findNearestCommonAncestor(rankingTag(document)); + +/** + * Formats Markdown images within the provided Markdown string. + * + * @function + * @name formatMarkdownImages + * @param {string} markdown - The Markdown text that needs to be formatted. + * @param {HTMLElement} element - The HTMLElement (from jsdom) where images will be extracted. + * @param {string} url - The URL to be used for setting the images absolute path + * @returns {string} - The formatted Markdown string. + * + * @example + * const markdown = "![Alt text](/path/to/image.jpg)"; + * const element = new jsdom.window.HTMLElement('body'); + * const url = "https://example.com"; + * const result = '![Alt text](https://example.com/imagefromElement.png)' + */ +const formatMarkdownImages = (markdown, element, url) => { + const formattedUrl = new URL(url); + formattedUrl.pathname = ''; + formattedUrl.search = ''; + formattedUrl.hash = ''; + + const baseUrl = formattedUrl.toString(); + + const prefixUrl = (URL) => enforceHTTPS(!URL.startsWith('http://') && !URL.startsWith('https://') ? baseUrl + URL : URL); + + const imagesSrc = Array.from(element.querySelectorAll('img, picture')).map((HTMLImage) => { + const { src, tagName } = HTMLImage || {}; + + if (tagName.toLowerCase() === 'img') return src ? prefixUrl(src) : null; + if (tagName.toLowerCase() === 'picture') { + const { srcset } = HTMLImage.querySelector('source') || {}; + const srcsetItems = srcset.split(','); + if (srcset) return prefixUrl(srcsetItems[srcsetItems.length - 1].trim().split(' ')[0]); + } + return null; + }).filter(Boolean); + + if (url.includes('medium.com')) { imagesSrc.shift(); } // first image is always the profile image + + const GRAB_IMAGES_MARKDOWN_REGEX = /!\[(.*?)]\((.*?)\)/g; + return markdown.replace(GRAB_IMAGES_MARKDOWN_REGEX, (match, p1, p2) => { + const newUrl = imagesSrc.shift() || p2; + return `![${p1}](${newUrl})`; + }); +}; + module.exports = { allowedPlatforms, displayError: chalk.bold.red, @@ -20,4 +198,7 @@ module.exports = { return !!s.match(regex); }, imagePlatform: 'cloudinary', + findMainContentElements, + getRemoteArticleDOM, + formatMarkdownImages, };