@@ -40,32 +40,6 @@ extern "C" {
         bytes_written: *mut usize,
     ) -> ExitCode;
 
-    /// Extract and return all discoverable links from webpage
-    #[allow(clippy::too_many_arguments)]
-    fn map(
-        h: *mut Handle,
-        url_ptr: *const u8,
-        url_len: usize,
-        options_ptr: *const u8,
-        options_len: usize,
-        result_ptr: *mut u8,
-        result_len: usize,
-        bytes_written: *mut usize,
-    ) -> ExitCode;
-
-    /// Recursively crawl website starting from given URL
-    #[allow(clippy::too_many_arguments)]
-    fn crawl(
-        h: *mut Handle,
-        url_ptr: *const u8,
-        url_len: usize,
-        options_ptr: *const u8,
-        options_len: usize,
-        result_ptr: *mut u8,
-        result_len: usize,
-        bytes_written: *mut usize,
-    ) -> ExitCode;
-
     /// Close and cleanup a web scraper instance
     fn close(h: Handle) -> ExitCode;
 }
@@ -89,34 +63,6 @@ mod mock_ffi {
         1
     }
 
-    #[allow(clippy::too_many_arguments)]
-    pub unsafe fn map(
-        h: *mut Handle,
-        _url_ptr: *const u8,
-        _url_len: usize,
-        _options_ptr: *const u8,
-        _options_len: usize,
-        result_ptr: *mut u8,
-        result_len: usize,
-        bytes_written: *mut usize,
-    ) -> ExitCode {
-        1
-    }
-
-    #[allow(clippy::too_many_arguments)]
-    pub unsafe fn crawl(
-        h: *mut Handle,
-        _url_ptr: *const u8,
-        _url_len: usize,
-        _options_ptr: *const u8,
-        _options_len: usize,
-        result_ptr: *mut u8,
-        result_len: usize,
-        bytes_written: *mut usize,
-    ) -> ExitCode {
-        1
-    }
-
     pub unsafe fn close(_h: Handle) -> ExitCode {
         1
     }
@@ -525,56 +471,21 @@ impl BlessCrawl {
         url: &str,
         options: Option<MapOptions>,
     ) -> Result<Response<MapData>, WebScrapeErrorKind> {
-        let mut combined_options = serde_json::to_value(&self.config).unwrap();
-        if let Some(map_opts) = options {
-            combined_options["map_options"] = serde_json::to_value(map_opts).unwrap();
-        }
-        let options_json = serde_json::to_vec(&combined_options).unwrap();
-
-        let mut result_buf = vec![0u8; Self::MAX_MAP_BUFFER_SIZE];
-        let mut bytes_written: usize = 0;
-
-        let mut handle = self.inner;
-        let code = unsafe {
-            map(
-                &mut handle,
-                url.as_ptr(),
-                url.len(),
-                options_json.as_ptr(),
-                options_json.len(),
-                result_buf.as_mut_ptr(),
-                result_buf.len(),
-                &mut bytes_written,
-            )
-        };
-
-        if code != 0 {
-            return Err(code.into());
-        }
-
-        if bytes_written == 0 {
-            return Err(WebScrapeErrorKind::EmptyResponse);
-        }
-
-        if bytes_written > result_buf.len() {
-            return Err(WebScrapeErrorKind::MemoryError);
-        }
-
-        let result_bytes =
-            unsafe { std::slice::from_raw_parts(result_buf.as_ptr(), bytes_written) };
-
-        // deserialize the result to MapResponse
-        let map_response =
-            serde_json::from_slice::<Response<MapData>>(result_bytes).map_err(|e| {
-                eprintln!("error: {:?}", e);
-                WebScrapeErrorKind::ParseError
-            })?;
-
-        if let Some(error) = map_response.error {
-            return Err(WebScrapeErrorKind::RuntimeError(error));
-        }
-
-        Ok(map_response)
+        let _map_options = options.unwrap_or_default();
+
+        // let scrape_response = self.scrape(url, None)?;
+        // TODO: implement map by post-processing the scrape response or using fetch
+
+        Ok(Response {
+            success: true,
+            error: None,
+            data: MapData {
+                url: url.to_string(),
+                links: vec![],
+                total_links: 0,
+                timestamp: 0,
+            },
+        })
     }
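The commented-out `self.scrape(url, None)` hints at the intended direction: derive the link map from an ordinary scrape instead of a dedicated host call. Below is a minimal sketch of that approach, not the commit's actual implementation. It assumes `ScrapeData` exposes the fetched page in a `content` field (the field the removed crawl post-processing used), that the instance is configured with `Format::Html` so `content` is still HTML, and that `total_links` and `timestamp` are plain integers; `extract_links` is a hypothetical std-only helper, where a real version would parse the DOM and resolve relative URLs.

```rust
use std::time::{SystemTime, UNIX_EPOCH};

/// Hypothetical helper: naive href scan over raw HTML. A real
/// implementation would use an HTML parser and resolve relative URLs.
fn extract_links(html: &str) -> Vec<String> {
    let mut links = Vec::new();
    let mut rest = html;
    while let Some(start) = rest.find("href=\"") {
        rest = &rest[start + 6..];
        match rest.find('"') {
            Some(end) => {
                links.push(rest[..end].to_string());
                rest = &rest[end + 1..];
            }
            None => break,
        }
    }
    links
}

impl BlessCrawl {
    pub fn map(
        &self,
        url: &str,
        options: Option<MapOptions>,
    ) -> Result<Response<MapData>, WebScrapeErrorKind> {
        let _map_options = options.unwrap_or_default();

        // Reuse the existing scrape path instead of the removed host call.
        let scrape_response = self.scrape(url, None)?;
        let links = extract_links(&scrape_response.data.content);

        Ok(Response {
            success: true,
            error: None,
            data: MapData {
                url: url.to_string(),
                total_links: links.len(),
                links,
                // Seconds since the Unix epoch, as a stand-in timestamp.
                timestamp: SystemTime::now()
                    .duration_since(UNIX_EPOCH)
                    .map(|d| d.as_secs())
                    .unwrap_or(0),
            },
        })
    }
}
```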
 
     /// Recursively crawls a website with configurable depth and filtering.
@@ -583,83 +494,22 @@ impl BlessCrawl {
         url: &str,
         options: Option<CrawlOptions>,
     ) -> Result<Response<CrawlData<ScrapeData>>, WebScrapeErrorKind> {
-        let mut combined_options = serde_json::to_value(&self.config).unwrap();
-        if let Some(crawl_opts) = options {
-            combined_options["crawl_options"] = serde_json::to_value(crawl_opts).unwrap();
-        }
-        let options_json = serde_json::to_vec(&combined_options).unwrap();
-
-        let mut result_buf = vec![0u8; Self::MAX_CRAWL_BUFFER_SIZE];
-        let mut bytes_written: usize = 0;
-
-        let mut handle = self.inner;
-        let code = unsafe {
-            crawl(
-                &mut handle,
-                url.as_ptr(),
-                url.len(),
-                options_json.as_ptr(),
-                options_json.len(),
-                result_buf.as_mut_ptr(),
-                result_buf.len(),
-                &mut bytes_written,
-            )
-        };
-
-        if code != 0 {
-            return Err(code.into());
-        }
-
-        if bytes_written == 0 {
-            return Err(WebScrapeErrorKind::EmptyResponse);
-        }
-
-        if bytes_written > result_buf.len() {
-            return Err(WebScrapeErrorKind::MemoryError);
-        }
-
-        let result_bytes =
-            unsafe { std::slice::from_raw_parts(result_buf.as_ptr(), bytes_written) };
-
-        // deserialize the result to CrawlResponse
-        let mut host_crawl_response = serde_json::from_slice::<Response<CrawlData<ScrapeData>>>(
-            result_bytes,
-        )
-        .map_err(|e| {
-            eprintln!("error: {:?}", e);
-            WebScrapeErrorKind::ParseError
-        })?;
-
-        if let Some(error) = host_crawl_response.error {
-            return Err(WebScrapeErrorKind::RuntimeError(error));
-        }
-
-        // post-process html
-        for page in host_crawl_response.data.pages.iter_mut() {
-            page.content = transform_html(TransformHtmlOptions {
-                html: page.content.clone(),
-                url: page.metadata.url.clone(),
-                include_tags: self.config.include_tags.clone().unwrap_or_default(),
-                exclude_tags: self.config.exclude_tags.clone().unwrap_or_default(),
-                only_main_content: self.config.only_main_content,
-            })
-            .map_err(|e| {
-                eprintln!("error: {:?}", e);
-                WebScrapeErrorKind::TransformError
-            })?;
-
-            // if the format is markdown, set the content to the markdown of the html
-            match self.config.format {
-                Format::Markdown => {
-                    page.content = parse_markdown(&page.content);
-                }
-                Format::Html => (), // no need to do anything
-                Format::Json => unimplemented!(),
-            }
-        }
-
-        // convert the host CrawlResponse to the user CrawlResponse
-        Ok(host_crawl_response)
+        let _crawl_options = options.unwrap_or_default();
+
+        // TODO: implement crawl by post-processing the scrape response or using fetch
+
+        Ok(Response {
+            success: true,
+            error: None,
+            data: CrawlData {
+                root_url: url.to_string(),
+                pages: vec![],
+                link_map: None,
+                depth_reached: 0,
+                total_pages: 0,
+                errors: vec![],
+            },
+        })
     }
 }
 
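The crawl TODO can be sketched the same way: a breadth-first loop that drives `scrape` and feeds newly discovered links back through `map`. The sketch below is illustrative only; `MAX_DEPTH` and `MAX_PAGES` stand in for whatever limits `CrawlOptions` actually carries, `errors` is assumed to collect strings, the count fields are assumed to be `usize`, and `WebScrapeErrorKind` is assumed to derive `Debug`.

```rust
use std::collections::HashSet;

impl BlessCrawl {
    pub fn crawl(
        &self,
        url: &str,
        options: Option<CrawlOptions>,
    ) -> Result<Response<CrawlData<ScrapeData>>, WebScrapeErrorKind> {
        let _crawl_options = options.unwrap_or_default();

        // Stand-ins for limits that CrawlOptions would normally supply.
        const MAX_DEPTH: usize = 2;
        const MAX_PAGES: usize = 50;

        let mut visited: HashSet<String> = HashSet::new();
        let mut frontier = vec![url.to_string()];
        let mut pages = Vec::new();
        let mut errors: Vec<String> = Vec::new();
        let mut depth_reached = 0;

        for depth in 0..=MAX_DEPTH {
            let mut next_frontier = Vec::new();
            for page_url in frontier {
                if pages.len() >= MAX_PAGES || !visited.insert(page_url.clone()) {
                    continue; // respect the page budget, skip revisits
                }
                match self.scrape(&page_url, None) {
                    Ok(resp) => {
                        depth_reached = depth;
                        // Queue outbound links for the next depth level.
                        if let Ok(map_resp) = self.map(&page_url, None) {
                            next_frontier.extend(map_resp.data.links);
                        }
                        pages.push(resp.data);
                    }
                    Err(e) => errors.push(format!("{page_url}: {e:?}")),
                }
            }
            frontier = next_frontier;
        }

        Ok(Response {
            success: true,
            error: None,
            data: CrawlData {
                root_url: url.to_string(),
                total_pages: pages.len(),
                pages,
                link_map: None,
                depth_reached,
                errors,
            },
        })
    }
}
```

Keeping the loop guest-side means depth and page budgets are enforced without any new host ABI, which matches the direction of this commit (dropping the `map`/`crawl` FFI surface).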