diff --git a/packages/utils/src/html2md.ts b/packages/utils/src/html2md.ts
index 5367c01ec..bd7ae3ec0 100644
--- a/packages/utils/src/html2md.ts
+++ b/packages/utils/src/html2md.ts
@@ -2,6 +2,74 @@ import { parse } from 'node-html-parser';
import { Readability } from '@mozilla/readability';
import { convertHTMLToMarkdown } from './markdown';
+// HTML preprocessing utilities
+const cleanHtml = (htmlContent: string): string => {
+ let cleanedHtml = htmlContent;
+
+ // Remove all comments
+ cleanedHtml = cleanedHtml.replace(//g, '');
+
+ // Remove all script and style tags with their content
+ cleanedHtml = cleanedHtml.replace(/