Merge pull request #442 from refly-ai/feat/support-extension

Feat/support extension
refly-ai · Feb 2, 2025 · ec1c4f3 · ec1c4f3
2 parents 903f431 + 045da0f
commit ec1c4f3
Showing 1 changed file with 82 additions and 9 deletions.
diff --git a/packages/utils/src/html2md.ts b/packages/utils/src/html2md.ts
@@ -2,6 +2,74 @@ import { parse } from 'node-html-parser';
 import { Readability } from '@mozilla/readability';
 import { convertHTMLToMarkdown } from './markdown';
 
+// HTML preprocessing utilities
+const cleanHtml = (htmlContent: string): string => {
+  let cleanedHtml = htmlContent;
+
+  // Remove all comments
+  cleanedHtml = cleanedHtml.replace(/<!--[\s\S]*?-->/g, '');
+
+  // Remove all script and style tags with their content
+  cleanedHtml = cleanedHtml.replace(/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/gi, '');
+  cleanedHtml = cleanedHtml.replace(/<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>/gi, '');
+
+  // Remove base64 images
+  cleanedHtml = cleanedHtml.replace(/<img[^>]+src="data:image\/[^>]+"[^>]*>/g, '');
+
+  // Remove empty tags except for specific ones we want to keep
+  const keepTags = ['img', 'br', 'hr'];
+  const emptyTagPattern = new RegExp(
+    `<(?!(?:${keepTags.join('|')})\b)[^>]+?>[\s\r\n]*</[^>]+?>`,
+    'g',
+  );
+  cleanedHtml = cleanedHtml.replace(emptyTagPattern, '');
+
+  // Clean up excessive whitespace
+  cleanedHtml = cleanedHtml.replace(/\s+/g, ' ').trim();
+
+  return cleanedHtml;
+};
+
+// Markdown postprocessing utilities
+const cleanMarkdown = (markdownContent: string): string => {
+  let cleanedMarkdown = markdownContent;
+
+  // Remove multiple consecutive empty lines
+  cleanedMarkdown = cleanedMarkdown.replace(/\n{3,}/g, '\n\n');
+
+  // Remove trailing spaces at the end of lines
+  cleanedMarkdown = cleanedMarkdown.replace(/[ \t]+$/gm, '');
+
+  // Clean up code blocks (ensure proper spacing)
+  cleanedMarkdown = cleanedMarkdown.replace(/```(\w*)\n\n/g, '```$1\n');
+
+  // Remove base64 image markdown
+  cleanedMarkdown = cleanedMarkdown.replace(/!\[[^\]]*\]\(data:image\/[^)]+\)/g, '');
+
+  // Remove javascript: links
+  cleanedMarkdown = cleanedMarkdown.replace(/\[[^\]]*\]\(javascript:[^)]*\)/g, '');
+  cleanedMarkdown = cleanedMarkdown.replace(/\(javascript:[^)]*\)/g, '');
+
+  // Remove empty links
+  cleanedMarkdown = cleanedMarkdown.replace(/\[([^\]]*)\]\(\s*\)/g, '$1');
+  cleanedMarkdown = cleanedMarkdown.replace(/!\[([^\]]*)\]\(\s*\)/g, '');
+
+  // Clean up excessive spaces around bold/italic markers
+  cleanedMarkdown = cleanedMarkdown.replace(/\*\s+(\S)/g, '*$1');
+  cleanedMarkdown = cleanedMarkdown.replace(/(\S)\s+\*/g, '$1*');
+
+  // Remove zero-width spaces and other invisible characters
+  cleanedMarkdown = cleanedMarkdown.replace(/[\u200B\u200C\u200D\uFEFF]/g, '');
+
+  // Remove empty lines that only contain spaces or special characters
+  cleanedMarkdown = cleanedMarkdown.replace(/^\s*[\[\]\(\)\*\-\_\#\~\`]+\s*$/gm, '');
+
+  // Clean up multiple spaces between words
+  cleanedMarkdown = cleanedMarkdown.replace(/\s{2,}/g, ' ');
+
+  return cleanedMarkdown.trim();
+};
+
 export const removeUnusedHtmlNode = () => {
   const $ = parse(document?.documentElement?.innerHTML);
 
@@ -20,7 +88,13 @@ export const removeUnusedHtmlNode = () => {
 </svg>`;
   }
   for (const item of $.querySelectorAll('img')) {
-    item.setAttribute('all', 'unset');
+    // Remove base64 images and keep only valid URL images
+    const src = item.getAttribute('src') || '';
+    if (src.startsWith('data:')) {
+      item.remove();
+    } else {
+      item.setAttribute('all', 'unset');
+    }
   }
   for (const item of $.querySelectorAll('plasmo-csui')) {
     item.innerHTML = '<div></div>';
@@ -30,18 +104,17 @@ export const removeUnusedHtmlNode = () => {
     (node) => node.nodeType === Node.COMMENT_NODE,
   );
   for (const item of commentNodes) {
-    item.textContent = 'comment';
+    item.remove();
   }
 
   const html = $.innerHTML;
-
-  return html;
+  return cleanHtml(html);
 };
 
 export const getReadabilityHtml = (node: Document | HTMLElement | DocumentFragment) => {
   try {
     const parsed = new Readability(node.cloneNode(true) as Document).parse();
-    return parsed?.content;
+    return parsed?.content ? cleanHtml(parsed.content) : removeUnusedHtmlNode();
   } catch (_err) {
     return removeUnusedHtmlNode();
   }
@@ -50,20 +123,20 @@ export const getReadabilityHtml = (node: Document | HTMLElement | DocumentFragme
 export const getReadabilityMarkdown = (element: Document | HTMLElement | DocumentFragment) => {
   const html = getReadabilityHtml(element);
   const md = convertHTMLToMarkdown('render', html);
-  return md;
+  return cleanMarkdown(md);
 };
 
 export const getMarkdown = (element: Document | HTMLElement | DocumentFragment) => {
   const div = document.createElement('div');
   div.appendChild(element.cloneNode(true));
-  const html = div.innerHTML;
+  const html = cleanHtml(div.innerHTML);
   const md = convertHTMLToMarkdown('render', html);
-  return md;
+  return cleanMarkdown(md);
 };
 
 export function getSelectionNodesMarkdown() {
   const selection = window.getSelection();
-  const range = selection.getRangeAt(0);
+  const range = selection?.getRangeAt(0);
   const text = selection?.toString();
 
   const fragment = range.cloneRange().cloneContents();