From 59633e8b208ae4bafc68bd42ae7cfe4fdcc06ca9 Mon Sep 17 00:00:00 2001
From: pftom <1043269994@qq.com>
Date: Sun, 2 Feb 2025 22:18:04 +0800
Subject: [PATCH 1/2] feat(utils): Remove base64 images from HTML content
 during markdown conversion

- Add `removeBase64Images` function to strip base64 image tags
- Update `getReadabilityHtml` and `getMarkdown` to clean base64 images before processing
- Improve content sanitization during HTML to markdown conversion
---
 packages/utils/src/html2md.ts | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/packages/utils/src/html2md.ts b/packages/utils/src/html2md.ts
index 5367c01ec..84331f4c5 100644
--- a/packages/utils/src/html2md.ts
+++ b/packages/utils/src/html2md.ts
@@ -38,10 +38,17 @@ export const removeUnusedHtmlNode = () => {
   return html;
 };
 
+// Function to remove base64 images from HTML content
+const removeBase64Images = (html: string): string => {
+  // Remove img tags with base64 content
+  return html.replace(/<img[^>]+src="data:image\/[^>]+"[^>]*>/g, '');
+};
+
 export const getReadabilityHtml = (node: Document | HTMLElement | DocumentFragment) => {
   try {
     const parsed = new Readability(node.cloneNode(true) as Document).parse();
-    return parsed?.content;
+    // Clean base64 images before returning content
+    return parsed?.content ? removeBase64Images(parsed.content) : removeUnusedHtmlNode();
   } catch (_err) {
     return removeUnusedHtmlNode();
   }
@@ -56,7 +63,7 @@ export const getReadabilityMarkdown = (element: Document | HTMLElement | Documen
 export const getMarkdown = (element: Document | HTMLElement | DocumentFragment) => {
   const div = document.createElement('div');
   div.appendChild(element.cloneNode(true));
-  const html = div.innerHTML;
+  const html = removeBase64Images(div.innerHTML);
   const md = convertHTMLToMarkdown('render', html);
   return md;
 };

From 045da0f73a80f48c294368fc1388cda32aaf9d56 Mon Sep 17 00:00:00 2001
From: pftom <1043269994@qq.com>
Date: Sun, 2 Feb 2025 22:30:43 +0800
Subject: [PATCH 2/2] refactor(utils): Enhance HTML and Markdown sanitization
 with comprehensive cleaning utilities

- Add `cleanHtml` function to preprocess HTML content
- Implement `cleanMarkdown` function for post-processing markdown
- Improve content sanitization by removing comments, scripts, empty tags, and base64 images
- Update `removeUnusedHtmlNode`, `getReadabilityHtml`, and `getMarkdown` to use new cleaning methods
- Enhance markdown cleaning with link, whitespace, and special character removal
---
 packages/utils/src/html2md.ts | 98 +++++++++++++++++++++++++++++------
 1 file changed, 82 insertions(+), 16 deletions(-)

diff --git a/packages/utils/src/html2md.ts b/packages/utils/src/html2md.ts
index 84331f4c5..bd7ae3ec0 100644
--- a/packages/utils/src/html2md.ts
+++ b/packages/utils/src/html2md.ts
@@ -2,6 +2,74 @@ import { parse } from 'node-html-parser';
 import { Readability } from '@mozilla/readability';
 import { convertHTMLToMarkdown } from './markdown';
 
+/**
+ * Pre-pass applied to an HTML string before it is converted to markdown.
+ * Strips comments, <script>/<style> elements, data-URI images and whitespace-only elements,
+ * then collapses whitespace runs (outside <pre>) to a single space.
+ */
+const cleanHtml = (htmlContent: string): string => {
+  let cleanedHtml = htmlContent;
+
+  // Remove all comments
+  cleanedHtml = cleanedHtml.replace(/<!--[\s\S]*?-->/g, '');
+
+  // Remove all script and style tags with their content
+  cleanedHtml = cleanedHtml.replace(/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/gi, '');
+  cleanedHtml = cleanedHtml.replace(/<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>/gi, '');
+
+  // Remove base64 images
+  cleanedHtml = cleanedHtml.replace(/<img[^>]+src="data:image\/[^>]+"[^>]*>/g, '');
+
+  // Remove whitespace-only elements. This must be a regex literal: in a template string `\b`
+  // is a backspace and `\s` a plain "s". `\1` ties the closing tag to the opening one, so
+  // adjacent closing tags such as `</p></div>` are no longer matched and deleted.
+  cleanedHtml = cleanedHtml.replace(/<([a-z][\w-]*)\b[^>]*>\s*<\/\1\s*>/gi, '');
+
+  // Collapse whitespace, leaving <pre> blocks (odd indices of the split) untouched
+  const parts = cleanedHtml.split(/(<pre\b[\s\S]*?<\/pre>)/gi);
+  return parts.map((part, i) => (i % 2 ? part : part.replace(/\s+/g, ' '))).join('').trim();
+};
+
+/**
+ * Post-processes markdown produced by the HTML converter.
+ *
+ * Drops data-URI images, `javascript:` links, empty links/images, invisible characters and
+ * lines holding only stray markup, then tidies whitespace. Line breaks and leading
+ * indentation are kept so paragraphs, lists and fenced code blocks survive.
+ *
+ * @param markdownContent - Markdown string to clean.
+ * @returns The cleaned markdown, trimmed at both ends.
+ */
+const cleanMarkdown = (markdownContent: string): string =>
+  markdownContent
+    // Remove zero-width spaces and other invisible characters
+    .replace(/[\u200B\u200C\u200D\uFEFF]/g, '')
+    // Remove base64 image markdown
+    .replace(/!\[[^\]]*\]\(data:image\/[^)]+\)/g, '')
+    // Remove javascript: links
+    .replace(/\[[^\]]*\]\(javascript:[^)]*\)/g, '')
+    .replace(/\(javascript:[^)]*\)/g, '')
+    // Remove empty images before empty links: run the other way round, the link rule
+    // rewrites `![alt]()` to `!alt` and the image rule never gets to match
+    .replace(/!\[[^\]]*\]\(\s*\)/g, '')
+    .replace(/\[([^\]]*)\]\(\s*\)/g, '$1')
+    // Tighten `** text **` to `**text**`. A single `*` is left alone (it may be a bullet), and
+    // the opener must follow whitespace so a closing `**` is never mistaken for an opener.
+    .replace(/(^|[\s(])(\*{2,3})[ \t]+([^*\n]+?)[ \t]+\2(?![*\w])/gm, '$1$2$3$2')
+    // Blank out lines holding only stray markup characters, but keep code fences
+    // (3+ backticks or tildes) and thematic breaks (3+ of `-`, `*` or `_`)
+    .replace(/^[ \t]*(?!(?:`{3,}|~{3,}|-{3,}|\*{3,}|_{3,})[ \t]*$)[[\]()*\-_#~`]+[ \t]*$/gm, '')
+    // Collapse runs of spaces/tabs between words only; matching `\s` here would also
+    // swallow the newlines that separate paragraphs and list items
+    .replace(/(\S)[ \t]{2,}(?=\S)/g, '$1 ')
+    // Remove trailing spaces at the end of lines
+    .replace(/[ \t]+$/gm, '')
+    // Clean up code blocks (ensure proper spacing)
+    .replace(/```(\w*)\n\n/g, '```$1\n')
+    // Collapse runs of blank lines last, since the removals above can create new ones
+    .replace(/\n{3,}/g, '\n\n')
+    .trim();
+
 export const removeUnusedHtmlNode = () => {
   const $ = parse(document?.documentElement?.innerHTML);
 
@@ -20,7 +88,13 @@ export const removeUnusedHtmlNode = () => {
 </svg>`;
   }
   for (const item of $.querySelectorAll('img')) {
-    item.setAttribute('all', 'unset');
+    // Remove base64 images and keep only valid URL images
+    const src = item.getAttribute('src') || '';
+    if (src.startsWith('data:')) {
+      item.remove();
+    } else {
+      item.setAttribute('all', 'unset');
+    }
   }
   for (const item of $.querySelectorAll('plasmo-csui')) {
     item.innerHTML = '<div></div>';
@@ -30,25 +104,17 @@ export const removeUnusedHtmlNode = () => {
     (node) => node.nodeType === Node.COMMENT_NODE,
   );
   for (const item of commentNodes) {
-    item.textContent = 'comment';
+    item.remove();
   }
 
   const html = $.innerHTML;
-
-  return html;
-};
-
-// Function to remove base64 images from HTML content
-const removeBase64Images = (html: string): string => {
-  // Remove img tags with base64 content
-  return html.replace(/<img[^>]+src="data:image\/[^>]+"[^>]*>/g, '');
+  return cleanHtml(html);
 };
 
 export const getReadabilityHtml = (node: Document | HTMLElement | DocumentFragment) => {
   try {
     const parsed = new Readability(node.cloneNode(true) as Document).parse();
-    // Clean base64 images before returning content
-    return parsed?.content ? removeBase64Images(parsed.content) : removeUnusedHtmlNode();
+    return parsed?.content ? cleanHtml(parsed.content) : removeUnusedHtmlNode();
   } catch (_err) {
     return removeUnusedHtmlNode();
   }
@@ -57,20 +123,20 @@ export const getReadabilityHtml = (node: Document | HTMLElement | DocumentFragme
 export const getReadabilityMarkdown = (element: Document | HTMLElement | DocumentFragment) => {
   const html = getReadabilityHtml(element);
   const md = convertHTMLToMarkdown('render', html);
-  return md;
+  return cleanMarkdown(md);
 };
 
 export const getMarkdown = (element: Document | HTMLElement | DocumentFragment) => {
   const div = document.createElement('div');
   div.appendChild(element.cloneNode(true));
-  const html = removeBase64Images(div.innerHTML);
+  const html = cleanHtml(div.innerHTML);
   const md = convertHTMLToMarkdown('render', html);
-  return md;
+  return cleanMarkdown(md);
 };
 
 export function getSelectionNodesMarkdown() {
   const selection = window.getSelection();
-  const range = selection.getRangeAt(0);
+  const range = selection?.getRangeAt(0);
   const text = selection?.toString();
 
   const fragment = range.cloneRange().cloneContents();