Commit f7d0d52

Merge pull request #23 from blocklessnetwork/feat/bless-crawl
feat: bless-crawler module
2 parents: 04280f6 + 1e9295f

File tree

7 files changed: +1340 -4 lines changed

Cargo.toml

Lines changed: 5 additions & 3 deletions
@@ -10,11 +10,13 @@ license = "MIT/Apache-2.0"
 repository = "https://github.com/blocklessnetwork/sdk-rust"
 
 [dependencies]
+htmd = { version = "0.2.2", default-features = false }
 json = { version = "0.12", default-features = false }
+kuchikiki = { version = "0.8", default-features = false }
+regex = { version = "1.11.1", default-features = false, features = ["unicode-case"] }
 serde = { version = "1.0", features = ["derive"], optional = true }
-
-[dev-dependencies]
-serde_json = "1.0"
+serde_json = { version = "1.0", default-features = false, features = ["alloc"] }
+url = { version = "2.5", default-features = false }
 
 [features]
 default = ["serde"]

README.md

Lines changed: 1 addition & 1 deletion
@@ -87,7 +87,7 @@ cargo build --release --target wasm32-wasip1 --example llm-mcp
 | [httpbin](./examples/httpbin.rs) | HTTP to query anything from httpbin |||
 | [llm](./examples/llm.rs) | LLM to chat with `Llama-3.1-8B-Instruct-q4f32_1-MLC` and `SmolLM2-1.7B-Instruct-q4f16_1-MLC` models |||
 | [llm-mcp](./examples/llm-mcp.rs) | LLM with MCP (Model Context Protocol) demonstrating tool integration using SSE endpoints |||
-
+| [web-scrape](./examples/web-scrape.rs) | Web Scraping to scrape content from a single URL with custom configuration overrides |||
 
 ## Testing
 
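Following the build pattern in the hunk context above, the new example can presumably be built for the same target:

cargo build --release --target wasm32-wasip1 --example web-scrape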
examples/web-scrape.rs

Lines changed: 93 additions & 0 deletions
@@ -0,0 +1,93 @@
use blockless_sdk::*;

/// This example demonstrates how to use the Blockless SDK to perform web scraping
/// using the BlessCrawl functionality.
///
/// It shows how to:
/// - Create a BlessCrawl instance with default configuration
/// - Scrape content from a single URL with custom configuration overrides
/// - Map links from a webpage to discover available URLs
/// - Handle errors and responses appropriately
fn main() {
    println!("=== Blockless Web Scraping SDK Example ===\n");

    example_scraping();
    example_mapping();
    example_crawling();
}

fn example_scraping() {
    println!("--- Example 1: Basic Web Scraping ---");

    let url = "https://example.com";
    println!("scraping: {}...", url);

    // First scrape with default config
    let response = BlessCrawl::default()
        .scrape(url, None)
        .expect("Failed to scrape");
    println!("response with default config: {:?}", response);
    println!();
    println!(
        "---------- markdown ----------\n{}\n------------------------------",
        response.data.content
    );
}

fn example_mapping() {
    println!("--- Example 2: Link Mapping/Discovery ---");

    let url = "https://example.com";
    println!("Mapping links from: {}", url);

    let options = MapOptions::new()
        .with_link_types(vec!["internal".to_string(), "external".to_string()])
        .with_base_url(url.to_string())
        .with_filter_extensions(vec![".html".to_string(), ".htm".to_string()]);

    let response = BlessCrawl::default()
        .map(url, Some(options))
        .expect("Failed to map");
    println!("response: {:?}", response);
    println!();
    println!(
        "------------ links ------------\n{:?}\n------------------------------",
        response.data.links
    );
    println!();
    println!(
        "------------ total links ------------\n{}\n------------------------------",
        response.data.total_links
    );
}

fn example_crawling() {
    println!("--- Example 3: Recursive Website Crawling ---");

    let url = "https://example.com";
    println!("Crawling website: {}", url);

    let options = CrawlOptions::new()
        .with_max_depth(2)
        .with_limit(10)
        .with_include_paths(vec!["/".to_string()])
        .with_exclude_paths(vec!["/admin/".to_string(), "/api/".to_string()])
        .with_follow_external(false)
        .with_delay_between_requests(1000)
        .with_parallel_requests(3);

    let response = BlessCrawl::default()
        .crawl(url, Some(options))
        .expect("Failed to crawl");
    println!("response: {:?}", response);
    println!();
    println!(
        "------------ pages ------------\n{:?}\n------------------------------",
        response.data.pages
    );
    println!();
    println!(
        "------------ total pages ------------\n{}\n------------------------------",
        response.data.total_pages
    );
}
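
The example unwraps every call with expect, which aborts the guest on the first failure. Below is a minimal sketch of a more forgiving variant, reusing the blockless_sdk::* import above and assuming only what the example already shows: scrape returns a Result whose error type implements Debug, and response.data.content holds the converted Markdown.

fn scrape_or_empty(url: &str) -> String {
    match BlessCrawl::default().scrape(url, None) {
        // On success, keep only the converted Markdown body.
        Ok(response) => response.data.content.to_string(),
        // On failure, log the error and fall back to an empty document.
        Err(err) => {
            eprintln!("scrape failed for {}: {:?}", url, err);
            String::new()
        }
    }
}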
Lines changed: 119 additions & 0 deletions
@@ -0,0 +1,119 @@
use htmd::HtmlToMarkdown;
use regex::Regex;

/// Parses HTML content and converts it to Markdown
///
/// This function replicates the behavior of the JavaScript parseMarkdown function:
/// - Converts HTML to Markdown using htmd
/// - Processes multi-line links by escaping newlines inside link content
/// - Removes "Skip to Content" links
/// - Returns empty string for empty/null input
pub fn parse_markdown(html: &str) -> String {
    if html.is_empty() {
        return String::new();
    }

    // Convert HTML to Markdown using htmd
    let markdown = match HtmlToMarkdown::new().convert(html) {
        Ok(md) => md,
        Err(_) => {
            // Return empty string if conversion fails
            return String::new();
        }
    };

    // Process the markdown content
    let processed_markdown = process_multiline_links(&markdown);
    remove_skip_to_content_links(&processed_markdown)
}

/// Processes multi-line links by escaping newlines inside link content
///
/// This function replicates the JavaScript processMultiLineLinks function:
/// - Tracks when we're inside link content (between [ and ])
/// - Escapes newlines with backslash when inside links
fn process_multiline_links(markdown_content: &str) -> String {
    let mut new_markdown_content = String::new();
    let mut link_open_count: usize = 0;

    for ch in markdown_content.chars() {
        match ch {
            '[' => {
                link_open_count += 1;
            }
            ']' => {
                link_open_count = link_open_count.saturating_sub(1);
            }
            _ => {}
        }

        let inside_link_content = link_open_count > 0;

        if inside_link_content && ch == '\n' {
            new_markdown_content.push('\\');
            new_markdown_content.push('\n');
        } else {
            new_markdown_content.push(ch);
        }
    }

    new_markdown_content
}

/// Removes "Skip to Content" links from the markdown content
///
/// This function replicates the JavaScript removeSkipToContentLinks function:
/// - Removes [Skip to Content](#page) and [Skip to content](#skip) patterns
/// - Case-insensitive matching
fn remove_skip_to_content_links(markdown_content: &str) -> String {
    let re = Regex::new(r"(?i)\[Skip to Content\]\(#[^)]*\)").unwrap();
    re.replace_all(markdown_content, "").to_string()
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_parse_markdown_simple() {
        let html = "<p>Hello, world!</p>";
        let result = parse_markdown(html);
        assert_eq!(result.trim(), "Hello, world!");
    }

    #[test]
    fn test_parse_markdown_complex() {
        let html =
            "<div><p>Hello <strong>bold</strong> world!</p><ul><li>List item</li></ul></div>";
        let result = parse_markdown(html);
        assert_eq!(result.trim(), "Hello **bold** world!\n\n* List item");
    }

    #[test]
    fn test_parse_markdown_empty() {
        let html = "";
        let result = parse_markdown(html);
        assert_eq!(result, "");
    }

    #[test]
    fn test_process_multiline_links() {
        let markdown = "[Link\nwith newline](http://example.com)";
        let result = process_multiline_links(markdown);
        assert_eq!(result, "[Link\\\nwith newline](http://example.com)");
    }

    #[test]
    fn test_remove_skip_to_content_links() {
        let markdown = "Some content [Skip to Content](#page) more content";
        let result = remove_skip_to_content_links(markdown);
        assert_eq!(result, "Some content more content");
    }

    #[test]
    fn test_remove_skip_to_content_links_case_insensitive() {
        let markdown = "Some content [Skip to content](#skip) more content";
        let result = remove_skip_to_content_links(markdown);
        assert_eq!(result, "Some content more content");
    }
}
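
A short usage sketch of the two post-processing helpers on plain Markdown input follows; they are private, so it would live inside this module alongside the tests above, and the expected strings follow directly from the code shown:

fn demo_post_processing() {
    let markdown = "[Skip to Content](#page)\n[A link\nsplit across lines](https://example.com)";
    // Newlines inside [...] get a trailing backslash escape.
    let escaped = process_multiline_links(markdown);
    assert_eq!(
        escaped,
        "[Skip to Content](#page)\n[A link\\\nsplit across lines](https://example.com)"
    );
    // The "Skip to Content" anchor link is then stripped entirely.
    let cleaned = remove_skip_to_content_links(&escaped);
    assert_eq!(cleaned, "\n[A link\\\nsplit across lines](https://example.com)");
}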
