
Commit da5012b

Merge pull request #24 from blocklessnetwork/feat/bless-crawl-refactor
bless-crawl module refactor
2 parents: f7d0d52 + 28708dd

1 file changed (+31, -181 lines)

src/bless_crawl/mod.rs

@@ -40,32 +40,6 @@ extern "C" {
         bytes_written: *mut usize,
     ) -> ExitCode;
 
-    /// Extract and return all discoverable links from webpage
-    #[allow(clippy::too_many_arguments)]
-    fn map(
-        h: *mut Handle,
-        url_ptr: *const u8,
-        url_len: usize,
-        options_ptr: *const u8,
-        options_len: usize,
-        result_ptr: *mut u8,
-        result_len: usize,
-        bytes_written: *mut usize,
-    ) -> ExitCode;
-
-    /// Recursively crawl website starting from given URL
-    #[allow(clippy::too_many_arguments)]
-    fn crawl(
-        h: *mut Handle,
-        url_ptr: *const u8,
-        url_len: usize,
-        options_ptr: *const u8,
-        options_len: usize,
-        result_ptr: *mut u8,
-        result_len: usize,
-        bytes_written: *mut usize,
-    ) -> ExitCode;
-
     /// Close and cleanup a web scraper instance
     fn close(h: Handle) -> ExitCode;
 }
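
Both removed imports shared the host-call convention still used by the remaining import whose tail is visible at the top of this hunk: the guest passes the URL and a JSON-encoded options blob as pointer/length pairs, hands the host a caller-allocated result buffer, and reads the actual payload size back through bytes_written. A minimal sketch of that convention, reusing the module's Handle and ExitCode types; the type alias and wrapper below are illustrative, not part of the crate:

type HostFn = unsafe extern "C" fn(
    *mut Handle,      // scraper instance handle
    *const u8, usize, // url pointer and length
    *const u8, usize, // options JSON pointer and length
    *mut u8, usize,   // caller-allocated result buffer pointer and length
    *mut usize,       // out-param: bytes the host actually wrote
) -> ExitCode;

/// Sketch: drive any import with the shared signature and recover the payload.
unsafe fn call_with_buffer(
    f: HostFn,
    handle: &mut Handle,
    url: &str,
    options_json: &[u8],
    buf_size: usize,
) -> Result<Vec<u8>, ExitCode> {
    let mut buf = vec![0u8; buf_size];
    let mut written: usize = 0;
    let code = unsafe {
        f(
            handle,
            url.as_ptr(),
            url.len(),
            options_json.as_ptr(),
            options_json.len(),
            buf.as_mut_ptr(),
            buf.len(),
            &mut written,
        )
    };
    if code != 0 {
        return Err(code);
    }
    buf.truncate(written); // keep only the bytes the host reported
    Ok(buf)
}
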
@@ -89,34 +63,6 @@ mod mock_ffi {
         1
     }
 
-    #[allow(clippy::too_many_arguments)]
-    pub unsafe fn map(
-        h: *mut Handle,
-        _url_ptr: *const u8,
-        _url_len: usize,
-        _options_ptr: *const u8,
-        _options_len: usize,
-        result_ptr: *mut u8,
-        result_len: usize,
-        bytes_written: *mut usize,
-    ) -> ExitCode {
-        1
-    }
-
-    #[allow(clippy::too_many_arguments)]
-    pub unsafe fn crawl(
-        h: *mut Handle,
-        _url_ptr: *const u8,
-        _url_len: usize,
-        _options_ptr: *const u8,
-        _options_len: usize,
-        result_ptr: *mut u8,
-        result_len: usize,
-        bytes_written: *mut usize,
-    ) -> ExitCode {
-        1
-    }
-
     pub unsafe fn close(_h: Handle) -> ExitCode {
         1
     }
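
The non-wasm mocks for map and crawl go with them. The surviving stubs still return exit code 1, so native builds compile and link while any actual host call fails fast. How the crate switches between ffi and mock_ffi is outside this diff; a cfg gate along these lines is the usual pattern (hypothetical, for illustration only):

// Hypothetical selection gate; the crate's real wiring is not shown in this diff.
#[cfg(target_arch = "wasm32")]
use ffi::close;
#[cfg(not(target_arch = "wasm32"))]
use mock_ffi::close;
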
@@ -525,56 +471,21 @@ impl BlessCrawl {
         url: &str,
         options: Option<MapOptions>,
     ) -> Result<Response<MapData>, WebScrapeErrorKind> {
-        let mut combined_options = serde_json::to_value(&self.config).unwrap();
-        if let Some(map_opts) = options {
-            combined_options["map_options"] = serde_json::to_value(map_opts).unwrap();
-        }
-        let options_json = serde_json::to_vec(&combined_options).unwrap();
-
-        let mut result_buf = vec![0u8; Self::MAX_MAP_BUFFER_SIZE];
-        let mut bytes_written: usize = 0;
-
-        let mut handle = self.inner;
-        let code = unsafe {
-            map(
-                &mut handle,
-                url.as_ptr(),
-                url.len(),
-                options_json.as_ptr(),
-                options_json.len(),
-                result_buf.as_mut_ptr(),
-                result_buf.len(),
-                &mut bytes_written,
-            )
-        };
-
-        if code != 0 {
-            return Err(code.into());
-        }
-
-        if bytes_written == 0 {
-            return Err(WebScrapeErrorKind::EmptyResponse);
-        }
-
-        if bytes_written > result_buf.len() {
-            return Err(WebScrapeErrorKind::MemoryError);
-        }
-
-        let result_bytes =
-            unsafe { std::slice::from_raw_parts(result_buf.as_ptr(), bytes_written) };
-
-        // deserialize the result to MapResponse
-        let map_response =
-            serde_json::from_slice::<Response<MapData>>(result_bytes).map_err(|e| {
-                eprintln!("error: {:?}", e);
-                WebScrapeErrorKind::ParseError
-            })?;
-
-        if let Some(error) = map_response.error {
-            return Err(WebScrapeErrorKind::RuntimeError(error));
-        }
-
-        Ok(map_response)
+        let _map_options = options.unwrap_or_default();
+
+        // let scrape_response = self.scrape(url, None)?;
+        // TODO: implement map by post-processing the scrape response or using fetch
+
+        Ok(Response {
+            success: true,
+            error: None,
+            data: MapData {
+                url: url.to_string(),
+                links: vec![],
+                total_links: 0,
+                timestamp: 0,
+            },
+        })
     }
 
     /// Recursively crawls a website with configurable depth and filtering.
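
The new map body is an explicit stub: the options are accepted and discarded, and an empty but well-formed MapData comes back with success: true. The commented-out self.scrape(url, None) points at the intended follow-up, deriving the link map from a scraped page. A sketch of that direction, assuming the scrape response exposes the page HTML as a string; the extractor is illustrative, and a real implementation would use an HTML parser and resolve relative URLs:

/// Sketch: collect href targets from raw HTML with a naive string scan.
fn links_from_html(html: &str) -> Vec<String> {
    let mut links = Vec::new();
    let mut rest = html;
    while let Some(i) = rest.find("href=\"") {
        rest = &rest[i + 6..]; // skip past `href="`
        match rest.find('"') {
            Some(end) => {
                links.push(rest[..end].to_string());
                rest = &rest[end..];
            }
            None => break, // unterminated attribute; stop scanning
        }
    }
    links
}

Inside map, that would plug in roughly as: scrape the page, run links_from_html over the returned content, and fill MapData's links and total_links from the result.
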
@@ -583,83 +494,22 @@ impl BlessCrawl {
         url: &str,
         options: Option<CrawlOptions>,
     ) -> Result<Response<CrawlData<ScrapeData>>, WebScrapeErrorKind> {
-        let mut combined_options = serde_json::to_value(&self.config).unwrap();
-        if let Some(crawl_opts) = options {
-            combined_options["crawl_options"] = serde_json::to_value(crawl_opts).unwrap();
-        }
-        let options_json = serde_json::to_vec(&combined_options).unwrap();
-
-        let mut result_buf = vec![0u8; Self::MAX_CRAWL_BUFFER_SIZE];
-        let mut bytes_written: usize = 0;
-
-        let mut handle = self.inner;
-        let code = unsafe {
-            crawl(
-                &mut handle,
-                url.as_ptr(),
-                url.len(),
-                options_json.as_ptr(),
-                options_json.len(),
-                result_buf.as_mut_ptr(),
-                result_buf.len(),
-                &mut bytes_written,
-            )
-        };
-
-        if code != 0 {
-            return Err(code.into());
-        }
-
-        if bytes_written == 0 {
-            return Err(WebScrapeErrorKind::EmptyResponse);
-        }
-
-        if bytes_written > result_buf.len() {
-            return Err(WebScrapeErrorKind::MemoryError);
-        }
-
-        let result_bytes =
-            unsafe { std::slice::from_raw_parts(result_buf.as_ptr(), bytes_written) };
-
-        // deserialize the result to CrawlResponse
-        let mut host_crawl_response = serde_json::from_slice::<Response<CrawlData<ScrapeData>>>(
-            result_bytes,
-        )
-        .map_err(|e| {
-            eprintln!("error: {:?}", e);
-            WebScrapeErrorKind::ParseError
-        })?;
-
-        if let Some(error) = host_crawl_response.error {
-            return Err(WebScrapeErrorKind::RuntimeError(error));
-        }
-
-        // post-process html
-        for page in host_crawl_response.data.pages.iter_mut() {
-            page.content = transform_html(TransformHtmlOptions {
-                html: page.content.clone(),
-                url: page.metadata.url.clone(),
-                include_tags: self.config.include_tags.clone().unwrap_or_default(),
-                exclude_tags: self.config.exclude_tags.clone().unwrap_or_default(),
-                only_main_content: self.config.only_main_content,
-            })
-            .map_err(|e| {
-                eprintln!("error: {:?}", e);
-                WebScrapeErrorKind::TransformError
-            })?;
-
-            // if the format is markdown, set the content to the markdown of the html
-            match self.config.format {
-                Format::Markdown => {
-                    page.content = parse_markdown(&page.content);
-                }
-                Format::Html => (), // no need to do anything
-                Format::Json => unimplemented!(),
-            }
-        }
-
-        // convert the host CrawlResponse to the user CrawlResponse
-        Ok(host_crawl_response)
+        let _crawl_options = options.unwrap_or_default();
+
+        // TODO: implement crawl by post-processing the scrape response or using fetch
+
+        Ok(Response {
+            success: true,
+            error: None,
+            data: CrawlData {
+                root_url: url.to_string(),
+                pages: vec![],
+                link_map: None,
+                depth_reached: 0,
+                total_pages: 0,
+                errors: vec![],
+            },
+        })
     }
 }

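crawl gets the same treatment: the FFI round-trip, the transform_html post-processing, and the per-format markdown conversion are all dropped in favor of an empty CrawlData until the new implementation lands. Per the TODO, one plausible shape is a breadth-first walk that repeatedly scrapes and extracts links; a sketch assuming a per-page scraper callback and a max_depth limit, both hypothetical here:

use std::collections::{HashSet, VecDeque};

/// Sketch: breadth-first crawl over scrape results, visiting each URL once.
/// `scrape_page` stands in for a call returning (html, discovered links).
fn crawl_bfs(
    root: &str,
    max_depth: usize,
    mut scrape_page: impl FnMut(&str) -> Option<(String, Vec<String>)>,
) -> Vec<(String, String)> {
    let mut seen: HashSet<String> = HashSet::new();
    let mut queue: VecDeque<(String, usize)> = VecDeque::new();
    let mut pages = Vec::new();

    seen.insert(root.to_string());
    queue.push_back((root.to_string(), 0));

    while let Some((url, depth)) = queue.pop_front() {
        // a real implementation would record failures in CrawlData::errors
        let Some((html, links)) = scrape_page(&url) else { continue };
        pages.push((url, html));
        if depth < max_depth {
            for link in links {
                if seen.insert(link.clone()) {
                    queue.push_back((link, depth + 1)); // enqueue unseen URLs one level deeper
                }
            }
        }
    }
    pages
}

The visited set keeps the walk safe on cyclic link graphs, and pages.len() plus the deepest dequeued depth map directly onto CrawlData's total_pages and depth_reached fields.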