From 59633e8b208ae4bafc68bd42ae7cfe4fdcc06ca9 Mon Sep 17 00:00:00 2001 From: pftom <1043269994@qq.com> Date: Sun, 2 Feb 2025 22:18:04 +0800 Subject: [PATCH 1/2] feat(utils): Remove base64 images from HTML content during markdown conversion - Add `removeBase64Images` function to strip base64 image tags - Update `getReadabilityHtml` and `getMarkdown` to clean base64 images before processing - Improve content sanitization during HTML to markdown conversion --- packages/utils/src/html2md.ts | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/packages/utils/src/html2md.ts b/packages/utils/src/html2md.ts index 5367c01ec..84331f4c5 100644 --- a/packages/utils/src/html2md.ts +++ b/packages/utils/src/html2md.ts @@ -38,10 +38,17 @@ export const removeUnusedHtmlNode = () => { return html; }; +// Function to remove base64 images from HTML content +const removeBase64Images = (html: string): string => { + // Remove img tags with base64 content + return html.replace(/]+src="data:image\/[^>]+"[^>]*>/g, ''); +}; + export const getReadabilityHtml = (node: Document | HTMLElement | DocumentFragment) => { try { const parsed = new Readability(node.cloneNode(true) as Document).parse(); - return parsed?.content; + // Clean base64 images before returning content + return parsed?.content ? removeBase64Images(parsed.content) : removeUnusedHtmlNode(); } catch (_err) { return removeUnusedHtmlNode(); } @@ -56,7 +63,7 @@ export const getReadabilityMarkdown = (element: Document | HTMLElement | Documen export const getMarkdown = (element: Document | HTMLElement | DocumentFragment) => { const div = document.createElement('div'); div.appendChild(element.cloneNode(true)); - const html = div.innerHTML; + const html = removeBase64Images(div.innerHTML); const md = convertHTMLToMarkdown('render', html); return md; }; From 045da0f73a80f48c294368fc1388cda32aaf9d56 Mon Sep 17 00:00:00 2001 From: pftom <1043269994@qq.com> Date: Sun, 2 Feb 2025 22:30:43 +0800 Subject: [PATCH 2/2] refactor(utils): Enhance HTML and Markdown sanitization with comprehensive cleaning utilities - Add `cleanHtml` function to preprocess HTML content - Implement `cleanMarkdown` function for post-processing markdown - Improve content sanitization by removing comments, scripts, empty tags, and base64 images - Update `removeUnusedHtmlNode`, `getReadabilityHtml`, and `getMarkdown` to use new cleaning methods - Enhance markdown cleaning with link, whitespace, and special character removal --- packages/utils/src/html2md.ts | 98 +++++++++++++++++++++++++++++------ 1 file changed, 82 insertions(+), 16 deletions(-) diff --git a/packages/utils/src/html2md.ts b/packages/utils/src/html2md.ts index 84331f4c5..bd7ae3ec0 100644 --- a/packages/utils/src/html2md.ts +++ b/packages/utils/src/html2md.ts @@ -2,6 +2,74 @@ import { parse } from 'node-html-parser'; import { Readability } from '@mozilla/readability'; import { convertHTMLToMarkdown } from './markdown'; +// HTML preprocessing utilities +const cleanHtml = (htmlContent: string): string => { + let cleanedHtml = htmlContent; + + // Remove all comments + cleanedHtml = cleanedHtml.replace(//g, ''); + + // Remove all script and style tags with their content + cleanedHtml = cleanedHtml.replace(/)<[^<]*)*<\/script>/gi, ''); + cleanedHtml = cleanedHtml.replace(/)<[^<]*)*<\/style>/gi, ''); + + // Remove base64 images + cleanedHtml = cleanedHtml.replace(/]+src="data:image\/[^>]+"[^>]*>/g, ''); + + // Remove empty tags except for specific ones we want to keep + const keepTags = ['img', 'br', 'hr']; + const emptyTagPattern = new RegExp( + `<(?!(?:${keepTags.join('|')})\b)[^>]+?>[\s\r\n]*]+?>`, + 'g', + ); + cleanedHtml = cleanedHtml.replace(emptyTagPattern, ''); + + // Clean up excessive whitespace + cleanedHtml = cleanedHtml.replace(/\s+/g, ' ').trim(); + + return cleanedHtml; +}; + +// Markdown postprocessing utilities +const cleanMarkdown = (markdownContent: string): string => { + let cleanedMarkdown = markdownContent; + + // Remove multiple consecutive empty lines + cleanedMarkdown = cleanedMarkdown.replace(/\n{3,}/g, '\n\n'); + + // Remove trailing spaces at the end of lines + cleanedMarkdown = cleanedMarkdown.replace(/[ \t]+$/gm, ''); + + // Clean up code blocks (ensure proper spacing) + cleanedMarkdown = cleanedMarkdown.replace(/```(\w*)\n\n/g, '```$1\n'); + + // Remove base64 image markdown + cleanedMarkdown = cleanedMarkdown.replace(/!\[[^\]]*\]\(data:image\/[^)]+\)/g, ''); + + // Remove javascript: links + cleanedMarkdown = cleanedMarkdown.replace(/\[[^\]]*\]\(javascript:[^)]*\)/g, ''); + cleanedMarkdown = cleanedMarkdown.replace(/\(javascript:[^)]*\)/g, ''); + + // Remove empty links + cleanedMarkdown = cleanedMarkdown.replace(/\[([^\]]*)\]\(\s*\)/g, '$1'); + cleanedMarkdown = cleanedMarkdown.replace(/!\[([^\]]*)\]\(\s*\)/g, ''); + + // Clean up excessive spaces around bold/italic markers + cleanedMarkdown = cleanedMarkdown.replace(/\*\s+(\S)/g, '*$1'); + cleanedMarkdown = cleanedMarkdown.replace(/(\S)\s+\*/g, '$1*'); + + // Remove zero-width spaces and other invisible characters + cleanedMarkdown = cleanedMarkdown.replace(/[\u200B\u200C\u200D\uFEFF]/g, ''); + + // Remove empty lines that only contain spaces or special characters + cleanedMarkdown = cleanedMarkdown.replace(/^\s*[\[\]\(\)\*\-\_\#\~\`]+\s*$/gm, ''); + + // Clean up multiple spaces between words + cleanedMarkdown = cleanedMarkdown.replace(/\s{2,}/g, ' '); + + return cleanedMarkdown.trim(); +}; + export const removeUnusedHtmlNode = () => { const $ = parse(document?.documentElement?.innerHTML); @@ -20,7 +88,13 @@ export const removeUnusedHtmlNode = () => { `; } for (const item of $.querySelectorAll('img')) { - item.setAttribute('all', 'unset'); + // Remove base64 images and keep only valid URL images + const src = item.getAttribute('src') || ''; + if (src.startsWith('data:')) { + item.remove(); + } else { + item.setAttribute('all', 'unset'); + } } for (const item of $.querySelectorAll('plasmo-csui')) { item.innerHTML = '
'; @@ -30,25 +104,17 @@ export const removeUnusedHtmlNode = () => { (node) => node.nodeType === Node.COMMENT_NODE, ); for (const item of commentNodes) { - item.textContent = 'comment'; + item.remove(); } const html = $.innerHTML; - - return html; -}; - -// Function to remove base64 images from HTML content -const removeBase64Images = (html: string): string => { - // Remove img tags with base64 content - return html.replace(/]+src="data:image\/[^>]+"[^>]*>/g, ''); + return cleanHtml(html); }; export const getReadabilityHtml = (node: Document | HTMLElement | DocumentFragment) => { try { const parsed = new Readability(node.cloneNode(true) as Document).parse(); - // Clean base64 images before returning content - return parsed?.content ? removeBase64Images(parsed.content) : removeUnusedHtmlNode(); + return parsed?.content ? cleanHtml(parsed.content) : removeUnusedHtmlNode(); } catch (_err) { return removeUnusedHtmlNode(); } @@ -57,20 +123,20 @@ export const getReadabilityHtml = (node: Document | HTMLElement | DocumentFragme export const getReadabilityMarkdown = (element: Document | HTMLElement | DocumentFragment) => { const html = getReadabilityHtml(element); const md = convertHTMLToMarkdown('render', html); - return md; + return cleanMarkdown(md); }; export const getMarkdown = (element: Document | HTMLElement | DocumentFragment) => { const div = document.createElement('div'); div.appendChild(element.cloneNode(true)); - const html = removeBase64Images(div.innerHTML); + const html = cleanHtml(div.innerHTML); const md = convertHTMLToMarkdown('render', html); - return md; + return cleanMarkdown(md); }; export function getSelectionNodesMarkdown() { const selection = window.getSelection(); - const range = selection.getRangeAt(0); + const range = selection?.getRangeAt(0); const text = selection?.toString(); const fragment = range.cloneRange().cloneContents();