diff --git a/packages/utils/src/html2md.ts b/packages/utils/src/html2md.ts index 5367c01ec..bd7ae3ec0 100644 --- a/packages/utils/src/html2md.ts +++ b/packages/utils/src/html2md.ts @@ -2,6 +2,74 @@ import { parse } from 'node-html-parser'; import { Readability } from '@mozilla/readability'; import { convertHTMLToMarkdown } from './markdown'; +// HTML preprocessing utilities +const cleanHtml = (htmlContent: string): string => { + let cleanedHtml = htmlContent; + + // Remove all comments + cleanedHtml = cleanedHtml.replace(//g, ''); + + // Remove all script and style tags with their content + cleanedHtml = cleanedHtml.replace(/)<[^<]*)*<\/script>/gi, ''); + cleanedHtml = cleanedHtml.replace(/)<[^<]*)*<\/style>/gi, ''); + + // Remove base64 images + cleanedHtml = cleanedHtml.replace(/]+src="data:image\/[^>]+"[^>]*>/g, ''); + + // Remove empty tags except for specific ones we want to keep + const keepTags = ['img', 'br', 'hr']; + const emptyTagPattern = new RegExp( + `<(?!(?:${keepTags.join('|')})\b)[^>]+?>[\s\r\n]*]+?>`, + 'g', + ); + cleanedHtml = cleanedHtml.replace(emptyTagPattern, ''); + + // Clean up excessive whitespace + cleanedHtml = cleanedHtml.replace(/\s+/g, ' ').trim(); + + return cleanedHtml; +}; + +// Markdown postprocessing utilities +const cleanMarkdown = (markdownContent: string): string => { + let cleanedMarkdown = markdownContent; + + // Remove multiple consecutive empty lines + cleanedMarkdown = cleanedMarkdown.replace(/\n{3,}/g, '\n\n'); + + // Remove trailing spaces at the end of lines + cleanedMarkdown = cleanedMarkdown.replace(/[ \t]+$/gm, ''); + + // Clean up code blocks (ensure proper spacing) + cleanedMarkdown = cleanedMarkdown.replace(/```(\w*)\n\n/g, '```$1\n'); + + // Remove base64 image markdown + cleanedMarkdown = cleanedMarkdown.replace(/!\[[^\]]*\]\(data:image\/[^)]+\)/g, ''); + + // Remove javascript: links + cleanedMarkdown = cleanedMarkdown.replace(/\[[^\]]*\]\(javascript:[^)]*\)/g, ''); + cleanedMarkdown = cleanedMarkdown.replace(/\(javascript:[^)]*\)/g, ''); + + // Remove empty links + cleanedMarkdown = cleanedMarkdown.replace(/\[([^\]]*)\]\(\s*\)/g, '$1'); + cleanedMarkdown = cleanedMarkdown.replace(/!\[([^\]]*)\]\(\s*\)/g, ''); + + // Clean up excessive spaces around bold/italic markers + cleanedMarkdown = cleanedMarkdown.replace(/\*\s+(\S)/g, '*$1'); + cleanedMarkdown = cleanedMarkdown.replace(/(\S)\s+\*/g, '$1*'); + + // Remove zero-width spaces and other invisible characters + cleanedMarkdown = cleanedMarkdown.replace(/[\u200B\u200C\u200D\uFEFF]/g, ''); + + // Remove empty lines that only contain spaces or special characters + cleanedMarkdown = cleanedMarkdown.replace(/^\s*[\[\]\(\)\*\-\_\#\~\`]+\s*$/gm, ''); + + // Clean up multiple spaces between words + cleanedMarkdown = cleanedMarkdown.replace(/\s{2,}/g, ' '); + + return cleanedMarkdown.trim(); +}; + export const removeUnusedHtmlNode = () => { const $ = parse(document?.documentElement?.innerHTML); @@ -20,7 +88,13 @@ export const removeUnusedHtmlNode = () => { `; } for (const item of $.querySelectorAll('img')) { - item.setAttribute('all', 'unset'); + // Remove base64 images and keep only valid URL images + const src = item.getAttribute('src') || ''; + if (src.startsWith('data:')) { + item.remove(); + } else { + item.setAttribute('all', 'unset'); + } } for (const item of $.querySelectorAll('plasmo-csui')) { item.innerHTML = '
'; @@ -30,18 +104,17 @@ export const removeUnusedHtmlNode = () => { (node) => node.nodeType === Node.COMMENT_NODE, ); for (const item of commentNodes) { - item.textContent = 'comment'; + item.remove(); } const html = $.innerHTML; - - return html; + return cleanHtml(html); }; export const getReadabilityHtml = (node: Document | HTMLElement | DocumentFragment) => { try { const parsed = new Readability(node.cloneNode(true) as Document).parse(); - return parsed?.content; + return parsed?.content ? cleanHtml(parsed.content) : removeUnusedHtmlNode(); } catch (_err) { return removeUnusedHtmlNode(); } @@ -50,20 +123,20 @@ export const getReadabilityHtml = (node: Document | HTMLElement | DocumentFragme export const getReadabilityMarkdown = (element: Document | HTMLElement | DocumentFragment) => { const html = getReadabilityHtml(element); const md = convertHTMLToMarkdown('render', html); - return md; + return cleanMarkdown(md); }; export const getMarkdown = (element: Document | HTMLElement | DocumentFragment) => { const div = document.createElement('div'); div.appendChild(element.cloneNode(true)); - const html = div.innerHTML; + const html = cleanHtml(div.innerHTML); const md = convertHTMLToMarkdown('render', html); - return md; + return cleanMarkdown(md); }; export function getSelectionNodesMarkdown() { const selection = window.getSelection(); - const range = selection.getRangeAt(0); + const range = selection?.getRangeAt(0); const text = selection?.toString(); const fragment = range.cloneRange().cloneContents();