From 59633e8b208ae4bafc68bd42ae7cfe4fdcc06ca9 Mon Sep 17 00:00:00 2001
From: pftom <1043269994@qq.com>
Date: Sun, 2 Feb 2025 22:18:04 +0800
Subject: [PATCH 1/2] feat(utils): Remove base64 images from HTML content
during markdown conversion
- Add `removeBase64Images` function to strip base64 image tags
- Update `getReadabilityHtml` and `getMarkdown` to clean base64 images before processing
- Improve content sanitization during HTML to markdown conversion
---
packages/utils/src/html2md.ts | 11 +++++++++--
1 file changed, 9 insertions(+), 2 deletions(-)
diff --git a/packages/utils/src/html2md.ts b/packages/utils/src/html2md.ts
index 5367c01ec..84331f4c5 100644
--- a/packages/utils/src/html2md.ts
+++ b/packages/utils/src/html2md.ts
@@ -38,10 +38,17 @@ export const removeUnusedHtmlNode = () => {
return html;
};
+// Function to remove base64 images from HTML content
+const removeBase64Images = (html: string): string => {
+ // Remove img tags with base64 content
+ return html.replace(/<img[^>]+src="data:image\/[^>]+"[^>]*>/g, '');
+};
+
export const getReadabilityHtml = (node: Document | HTMLElement | DocumentFragment) => {
try {
const parsed = new Readability(node.cloneNode(true) as Document).parse();
- return parsed?.content;
+ // Clean base64 images before returning content
+ return parsed?.content ? removeBase64Images(parsed.content) : removeUnusedHtmlNode();
} catch (_err) {
return removeUnusedHtmlNode();
}
@@ -56,7 +63,7 @@ export const getReadabilityMarkdown = (element: Document | HTMLElement | Documen
export const getMarkdown = (element: Document | HTMLElement | DocumentFragment) => {
const div = document.createElement('div');
div.appendChild(element.cloneNode(true));
- const html = div.innerHTML;
+ const html = removeBase64Images(div.innerHTML);
const md = convertHTMLToMarkdown('render', html);
return md;
};
From 045da0f73a80f48c294368fc1388cda32aaf9d56 Mon Sep 17 00:00:00 2001
From: pftom <1043269994@qq.com>
Date: Sun, 2 Feb 2025 22:30:43 +0800
Subject: [PATCH 2/2] refactor(utils): Enhance HTML and Markdown sanitization
with comprehensive cleaning utilities
- Add `cleanHtml` function to preprocess HTML content
- Implement `cleanMarkdown` function for post-processing markdown
- Improve content sanitization by removing comments, scripts, empty tags, and base64 images
- Update `removeUnusedHtmlNode`, `getReadabilityHtml`, and `getMarkdown` to use new cleaning methods
- Enhance markdown cleaning with link, whitespace, and special character removal
---
packages/utils/src/html2md.ts | 98 +++++++++++++++++++++++++++++------
1 file changed, 82 insertions(+), 16 deletions(-)
diff --git a/packages/utils/src/html2md.ts b/packages/utils/src/html2md.ts
index 84331f4c5..bd7ae3ec0 100644
--- a/packages/utils/src/html2md.ts
+++ b/packages/utils/src/html2md.ts
@@ -2,6 +2,74 @@ import { parse } from 'node-html-parser';
import { Readability } from '@mozilla/readability';
import { convertHTMLToMarkdown } from './markdown';
+// HTML preprocessing utilities
+const cleanHtml = (htmlContent: string): string => {
+ let cleanedHtml = htmlContent;
+
+ // Remove all comments
+ cleanedHtml = cleanedHtml.replace(/<!--[\s\S]*?-->/g, '');
+
+ // Remove all script and style tags with their content
+ cleanedHtml = cleanedHtml.replace(/