|
| 1 | +use htmd::HtmlToMarkdown; |
| 2 | +use regex::Regex; |
| 3 | + |
| 4 | +/// Parses HTML content and converts it to Markdown |
| 5 | +/// |
| 6 | +/// This function replicates the behavior of the JavaScript parseMarkdown function: |
| 7 | +/// - Converts HTML to Markdown using htmd |
| 8 | +/// - Processes multi-line links by escaping newlines inside link content |
| 9 | +/// - Removes "Skip to Content" links |
| 10 | +/// - Returns empty string for empty/null input |
| 11 | +pub fn parse_markdown(html: &str) -> String { |
| 12 | + if html.is_empty() { |
| 13 | + return String::new(); |
| 14 | + } |
| 15 | + |
| 16 | + // Convert HTML to Markdown using htmd |
| 17 | + let markdown = match HtmlToMarkdown::new().convert(html) { |
| 18 | + Ok(md) => md, |
| 19 | + Err(_) => { |
| 20 | + // Return empty string if conversion fails |
| 21 | + return String::new(); |
| 22 | + } |
| 23 | + }; |
| 24 | + |
| 25 | + // Process the markdown content |
| 26 | + let processed_markdown = process_multiline_links(&markdown); |
| 27 | + remove_skip_to_content_links(&processed_markdown) |
| 28 | +} |
| 29 | + |
/// Backslash-escapes every newline that occurs inside link text.
///
/// Mirrors the JavaScript `processMultiLineLinks` helper. The scan keeps a
/// running count of unmatched `[` characters; while that count is above
/// zero the cursor is considered to be inside link text, and each `\n`
/// found there is emitted as `\` followed by `\n`. A `]` with no matching
/// `[` leaves the count at zero instead of going negative.
fn process_multiline_links(markdown_content: &str) -> String {
    let mut escaped = String::with_capacity(markdown_content.len());
    let mut depth: usize = 0;

    for c in markdown_content.chars() {
        // Update the bracket depth before deciding whether to escape, so the
        // bracket characters themselves are classified by the new depth.
        if c == '[' {
            depth += 1;
        } else if c == ']' {
            depth = depth.saturating_sub(1);
        }

        // Inside link text a newline gets a leading backslash; the character
        // itself is always copied through unchanged.
        if c == '\n' && depth > 0 {
            escaped.push('\\');
        }
        escaped.push(c);
    }

    escaped
}
| 62 | + |
| 63 | +/// Removes "Skip to Content" links from the markdown content |
| 64 | +/// |
| 65 | +/// This function replicates the JavaScript removeSkipToContentLinks function: |
| 66 | +/// - Removes [Skip to Content](#page) and [Skip to content](#skip) patterns |
| 67 | +/// - Case-insensitive matching |
| 68 | +fn remove_skip_to_content_links(markdown_content: &str) -> String { |
| 69 | + let re = Regex::new(r"(?i)\[Skip to Content\]\(#[^)]*\)").unwrap(); |
| 70 | + re.replace_all(markdown_content, "").to_string() |
| 71 | +} |
| 72 | + |
#[cfg(test)]
mod tests {
    use super::*;

    /// A single paragraph converts to its plain text.
    #[test]
    fn test_parse_markdown_simple() {
        let html = "<p>Hello, world!</p>";
        let result = parse_markdown(html);
        assert_eq!(result.trim(), "Hello, world!");
    }

    /// Inline emphasis and a bullet list survive the conversion.
    #[test]
    fn test_parse_markdown_complex() {
        let html =
            "<div><p>Hello <strong>bold</strong> world!</p><ul><li>List item</li></ul></div>";
        let result = parse_markdown(html);
        // NOTE(review): the exact spacing after the bullet marker depends on
        // htmd's list formatting options — confirm this literal against the
        // htmd version in use.
        assert_eq!(result.trim(), "Hello **bold** world!\n\n* List item");
    }

    /// Empty input short-circuits to an empty string.
    #[test]
    fn test_parse_markdown_empty() {
        let html = "";
        let result = parse_markdown(html);
        assert_eq!(result, "");
    }

    /// A newline inside `[...]` gains a leading backslash.
    #[test]
    fn test_process_multiline_links() {
        let markdown = "[Link\nwith newline](http://example.com)";
        let result = process_multiline_links(markdown);
        assert_eq!(result, "[Link\\\nwith newline](http://example.com)");
    }

    /// The skip link is deleted; the spaces on both sides of it remain, so
    /// the result contains two consecutive spaces.
    #[test]
    fn test_remove_skip_to_content_links() {
        let markdown = "Some content [Skip to Content](#page) more content";
        let result = remove_skip_to_content_links(markdown);
        assert_eq!(result, "Some content  more content");
    }

    /// Matching ignores the case of the link text; as above, both
    /// surrounding spaces are preserved.
    #[test]
    fn test_remove_skip_to_content_links_case_insensitive() {
        let markdown = "Some content [Skip to content](#skip) more content";
        let result = remove_skip_to_content_links(markdown);
        assert_eq!(result, "Some content  more content");
    }
}
0 commit comments