Skip to content

Commit d665782

Browse files
committed
perf(content-analyzer): ⚡️ Mitigate ReDoS risk in content analysis
- Limit input length to 100,000 characters
- Optimize regex patterns to reduce backtracking and ambiguity
- Avoid .*? and use [\s\S]{0,5000} to cap match length
- Use non-capturing groups and anchor tags more tightly
1 parent 4d4da2f commit d665782

File tree

2 files changed

+13
-5
lines changed

2 files changed

+13
-5
lines changed

src/classes/content-analyzer.ts

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -309,9 +309,17 @@ export class ContentAnalyzer {
309309
const lowercaseContent = content.toLowerCase();
310310

311311
// Analyze document structure
312-
const hasHeaders = /<h[1-6][^>]*>.*?<\/h[1-6]>/i.test(content);
313-
const hasLists = /<[ou]l[^>]*>.*?<\/[ou]l>/i.test(content);
314-
const hasTables = /<table[^>]*>.*?<\/table>/i.test(content);
312+
// Limit input length to mitigate ReDoS risk
313+
const SAFE_CONTENT_LENGTH = 100_000;
314+
const safeContent = content.length > SAFE_CONTENT_LENGTH ? content.slice(0, SAFE_CONTENT_LENGTH) : content;
315+
316+
// Optimized regex patterns to reduce backtracking and ambiguity
317+
// - Avoid .*? and use [\s\S]{0,5000} to cap match length
318+
// - Use non-capturing groups and anchor tags more tightly
319+
320+
const hasHeaders = /<h[1-6](?:\s[^>]*)?>[\s\S]{0,5000}?<\/h[1-6]>/i.test(safeContent);
321+
const hasLists = /<(?:ul|ol)(?:\s[^>]*)?>[\s\S]{0,5000}?<\/(?:ul|ol)>/i.test(safeContent);
322+
const hasTables = /<table(?:\s[^>]*)?>[\s\S]{0,5000}?<\/table>/i.test(safeContent);
315323

316324
if (hasHeaders) signals.add('structured');
317325
if (hasLists) signals.add('list');

src/classes/web.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,11 +52,11 @@
5252
* @license Apache-2.0
5353
*/
5454

55-
import fs from 'node:fs/promises';
56-
import path from 'node:path';
5755
import { Sema } from 'async-sema';
5856
import { LRUCache } from 'lru-cache';
5957
import natural from 'natural';
58+
import fs from 'node:fs/promises';
59+
import path from 'node:path';
6060
import { type Browser, chromium, type Page } from 'playwright';
6161
import { gemini_model, genAI, safetySettings } from '../constants/gemini-settings.js';
6262
import { ContentAnalyzer, PromptGenerator } from './content-analyzer.js';

0 commit comments

Comments
 (0)