feat: updated code
tikazyq committed Apr 12, 2024
1 parent 8fd5132 commit 5528f41
Showing 8 changed files with 147 additions and 25 deletions.
2 changes: 2 additions & 0 deletions crawlab_ai/cli/__init__.py
@@ -1,12 +1,14 @@
import argparse

from crawlab_ai.cli.gen_code import setup_gen_code_parser
from crawlab_ai.cli.config import setup_config_parser
from crawlab_ai.cli.crawl import setup_crawl_parser

parser = argparse.ArgumentParser(description="Web scraping tool")
subparsers = parser.add_subparsers(dest="command")

setup_crawl_parser(subparsers)
setup_gen_code_parser(subparsers)
setup_config_parser(subparsers)


7 changes: 5 additions & 2 deletions crawlab_ai/cli/crawl.py
@@ -38,5 +38,8 @@ def crawl_list(args):

def crawl_article(args):
    data = read_article(args.url)
-    with open(args.output, "w") as f:
-        f.write(json.dumps(data))
+    if args.output:
+        with open(args.output, "w") as f:
+            f.write(json.dumps(data))
+    else:
+        print(json.dumps(data, indent=2))
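For context (not part of the diff), a minimal sketch of the new behavior: with no output path, crawl_article now prints the article JSON to stdout. The Namespace below simulates parsed CLI args, since the flag names defined in setup_crawl_parser() are not shown in this commit, and a valid API token is assumed to be configured.

import argparse

from crawlab_ai.cli.crawl import crawl_article

# Simulated CLI args; output=None triggers the new print-to-stdout branch.
args = argparse.Namespace(url="https://www.36kr.com/p/2601845967059847", output=None)
crawl_article(args)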
33 changes: 33 additions & 0 deletions crawlab_ai/cli/gen_code.py
@@ -1,12 +1,45 @@
from crawlab_ai.code.article import get_code_article
from crawlab_ai.code.list import get_code_list
from crawlab_ai.utils.auth import get_token


def setup_gen_code_parser(subparsers):
    gen_code_parser = subparsers.add_parser(
        "gen_code", help="Generate crawler code for a webpage"
    )
    gen_code_parser.add_argument("url", help="URL to generate code for")
    gen_code_parser.add_argument(
        "-t",
        "--type",
        help="Type of the webpage to generate code for",
        default="list",
        choices=["article", "list"],
    )
    gen_code_parser.add_argument("-o", "--output", help="Output file path")
    gen_code_parser.set_defaults(func=gen_code)


def gen_code(args):
    get_token()
    if args.type == "list":
        gen_code_list(args)
    elif args.type == "article":
        gen_code_article(args)


def gen_code_list(args):
    code = get_code_list(args.url)
    if args.output:
        with open(args.output, "w") as f:
            f.write(code)
    else:
        print(code)


def gen_code_article(args):
    code = get_code_article(args.url)
    if args.output:
        with open(args.output, "w") as f:
            f.write(code)
    else:
        print(code)
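For reference (not part of the diff), a minimal sketch of exercising the new gen_code subcommand through the argparse wiring above. It drives the module-level parser from crawlab_ai/cli/__init__.py directly, since the installed console-script name is not shown in this commit, and assumes a valid API token because gen_code() calls get_token() first.

from crawlab_ai.cli import parser

# Parse a gen_code invocation and dispatch via the handler set by set_defaults(func=gen_code).
args = parser.parse_args(
    ["gen_code", "https://quotes.toscrape.com", "-t", "list", "-o", "spider.py"]
)
args.func(args)  # gen_code() -> gen_code_list() -> get_code_list(), then writes spider.py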
20 changes: 20 additions & 0 deletions crawlab_ai/code/article.py
@@ -0,0 +1,20 @@
import requests
from crawlab_ai.spider.article_spider import extract_article_rules
from crawlab_ai.utils.auth import get_auth_headers
from crawlab_ai.utils.env import get_api_endpoint


def get_code_article(url: str):
    rules = extract_article_rules(url)
    res = requests.post(
        url=get_api_endpoint() + "/code/article",
        headers=get_auth_headers(),
        json={"url": url, "rules": rules},
    )
    res.raise_for_status()
    data = res.json()
    return data["source_code"]


if __name__ == "__main__":
    print(get_code_article("https://www.36kr.com/p/2601845967059847"))
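A usage sketch for the new helper (not part of the diff): persist the generated article-spider source to disk. This assumes the API token and endpoint are already configured, since the request goes through get_auth_headers() and get_api_endpoint(); the output filename is an arbitrary example.

from crawlab_ai.code.article import get_code_article

# Generate article-spider code for a page and save it locally.
source = get_code_article("https://www.36kr.com/p/2601845967059847")
with open("article_spider_generated.py", "w") as f:
    f.write(source)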
22 changes: 22 additions & 0 deletions crawlab_ai/code/list.py
@@ -0,0 +1,22 @@
from typing import List

import requests
from crawlab_ai.spider.list_spider import extract_list_rules
from crawlab_ai.utils.auth import get_auth_headers
from crawlab_ai.utils.env import get_api_endpoint


def get_code_list(url: str, fields: List[str] | dict = None):
    rules = extract_list_rules(url, fields)
    res = requests.post(
        url=get_api_endpoint() + "/code/list",
        headers=get_auth_headers(),
        json={"url": url, "rules": rules},
    )
    res.raise_for_status()
    data = res.json()
    return data["source_code"]


if __name__ == "__main__":
    print(get_code_list("https://quotes.toscrape.com"))
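Likewise, a rough usage sketch for the list variant (not part of the diff). The optional fields argument is forwarded to extract_list_rules(), which presumably constrains the columns the generated crawler extracts; the field names below are illustrative.

from crawlab_ai.code.list import get_code_list

# "text" and "author" are example field names for quotes.toscrape.com.
source = get_code_list("https://quotes.toscrape.com", fields=["text", "author"])
print(source)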
2 changes: 2 additions & 0 deletions crawlab_ai/scrapy/list_spider.py
@@ -70,6 +70,8 @@ def _fetch_rules(self):
"fields": self.fields,
},
)
res.raise_for_status()

data = res.json()
self._list_element_css_selector = data["model_list"][0]["list_model"][
"list_element_css_selector"
53 changes: 39 additions & 14 deletions crawlab_ai/spider/article_spider.py
@@ -9,7 +9,7 @@
class ArticleSpider(BaseSpider):
    def __init__(self, url: str, get_html=None):
        super().__init__(url, get_html)
-        self.data: dict | None = None
+        self.rules: dict | None = None
        self.url = url
        self.get_html = get_html or self.crawl

@@ -22,32 +22,57 @@ def fetch_rules(self):
"url": self.url,
},
)
if res.status_code != 200:
raise Exception("Failed to fetch rules for URL: " + self.url)
res.raise_for_status()

self.data = res.json()
self.rules = res.json()
logger.info("Rules fetched successfully for URL: " + self.url)
logger.info("Title: %s", self.data.get("title"))
logger.info("Author: %s", self.data.get("author"))
logger.info("Publish date: %s", self.data.get("publish_date"))
logger.info("Content: %s", self.data.get("content")[:200])
logger.info("Title CSS selector: %s", self.data.get("title_css_selector"))
logger.info("Author CSS selector: %s", self.data.get("author_css_selector"))
logger.info("Title: %s", self.rules.get("title"))
logger.info("Author: %s", self.rules.get("author"))
logger.info("Publish date: %s", self.rules.get("publish_date"))
logger.info("Content: %s", self.rules.get("content")[:200])
logger.info("Title CSS selector: %s", self.rules.get("title_css_selector"))
logger.info("Author CSS selector: %s", self.rules.get("author_css_selector"))
logger.info(
"Publish date CSS selector: %s", self.data.get("publish_date_css_selector")
"Publish date CSS selector: %s", self.rules.get("publish_date_css_selector")
)
logger.info("Content CSS selector: %s", self.data.get("content_css_selector"))
logger.info("Content CSS selector: %s", self.rules.get("content_css_selector"))

def crawl(self):
logger.info("Crawling URL: " + self.url)
self.fetch_rules()
logger.info("Crawling completed for URL: " + self.url)


def read_article(url: str, get_html=None):
def read_article(url: str, get_html=None) -> dict:
"""
Read an article from a URL
Args:
url (str): URL of the article
get_html (function): Function to get HTML content
Returns:
dict: Article data
"""
spider = ArticleSpider(url, get_html)
spider.crawl()
return spider.data
return spider.rules


def extract_article_rules(url: str, get_html=None) -> dict:
"""
Extract article rules from a URL
Args:
url (str): URL of the article
get_html (function): Function to get HTML content
Returns:
dict: Article rules
"""
spider = ArticleSpider(url, get_html)
spider.fetch_rules()
return spider.rules


if __name__ == "__main__":
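To summarize the refactor (a sketch only, not part of the diff): both public helpers now return the spider's rules dict — read_article() via a full crawl(), extract_article_rules() via fetch_rules() alone, which is what the new crawlab_ai/code/article.py relies on. A configured API token is assumed.

from crawlab_ai.spider.article_spider import read_article, extract_article_rules

# Full read: extracted fields plus the CSS selectors logged in fetch_rules().
article = read_article("https://www.36kr.com/p/2601845967059847")
print(article.get("title"), article.get("title_css_selector"))

# Rules only, without the crawl() logging wrapper.
rules = extract_article_rules("https://www.36kr.com/p/2601845967059847")
print(rules.get("content_css_selector"))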
33 changes: 24 additions & 9 deletions crawlab_ai/spider/list_spider.py
@@ -35,7 +35,6 @@ def __init__(self, url: str, fields: List[dict] = None, get_html=None):
        self.url = url
        self.fields = fields
        self.data = []
-        self.fetch_rules()

    @property
    def list_element_css_selector(self):
@@ -59,9 +58,9 @@ def fetch_rules(self):
"fields": self.fields,
},
)
res.raise_for_status()

data = res.json()
if res.status_code != 200:
raise Exception("Failed to fetch rules for URL: " + self.url)
self.rules = data["model_list"][0]
logger.info("Rules fetched successfully for URL: " + self.url)
logger.info("List element CSS selector: " + self.list_element_css_selector)
@@ -72,6 +71,7 @@ def crawl(self):
        )

    def crawl(self):
+        self.fetch_rules()
        futures = []
        with ThreadPoolExecutor(max_workers=5) as executor:
            futures.append(executor.submit(self._fetch_data, self.url))
@@ -125,8 +125,12 @@ def _get_fields(fields: List[str] | dict = None) -> Optional[List[str] | List[di


def read_list(
-    url: str, fields: List[str] | dict = None, get_html=None, as_dataframe=True
-) -> DataFrame | List[dict]:
+    url: str,
+    fields: List[str] | dict = None,
+    get_html=None,
+    as_dataframe=True,
+    return_rules=False,
+) -> DataFrame | List[dict] | tuple[DataFrame | List[dict], dict]:
    """
    Reads a list of items from a webpage and returns a DataFrame.
@@ -135,18 +139,29 @@ def read_list(
        fields (List[str] | dict): A list of fields to be extracted from each list element.
        get_html (function): A function to fetch the HTML content of a webpage. Defaults to the requests library.
        as_dataframe (bool): Whether to return the extracted data as a DataFrame. Defaults to True.
+        return_rules (bool): Whether to return the rules used for extraction. Defaults to False.
    Returns:
-        DataFrame | List[dict]: A DataFrame containing the extracted data if as_dataframe is True, otherwise a list of
-        dictionaries.
+        DataFrame | List[dict] | tuple[DataFrame | List[dict], dict]: The extracted data as a DataFrame or a list of
+        dictionaries. If return_rules is True, a tuple containing the data and the rules used for extraction is
+        returned.
    """
    spider = ListSpider(url=url, fields=_get_fields(fields), get_html=get_html)
    spider.crawl()
    if as_dataframe:
-        return DataFrame(spider.data)
+        return_data = DataFrame(spider.data)
    else:
-        return spider.data
+        return_data = spider.data
+    if return_rules:
+        return return_data, spider.rules
+    return return_data


+def extract_list_rules(url: str, fields: List[str] | dict = None) -> dict:
+    spider = ListSpider(url=url, fields=_get_fields(fields))
+    spider.fetch_rules()
+    return spider.rules


if __name__ == "__main__":
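A short usage sketch for the new return_rules flag and the extract_list_rules() helper (not part of the diff; field names are illustrative, and a configured API token is assumed).

from crawlab_ai.spider.list_spider import read_list, extract_list_rules

# Crawl the list pages and also get back the extraction rules that were used.
df, rules = read_list(
    "https://quotes.toscrape.com",
    fields=["text", "author"],
    return_rules=True,
)
print(df.head())
print(rules)

# Or fetch the rules only, without crawling the list pages.
rules_only = extract_list_rules("https://quotes.toscrape.com")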
