feat: updated code
tikazyq committed Apr 12, 2024
1 parent 8fd5132 commit 5528f41
Showing 8 changed files with 147 additions and 25 deletions.
2 changes: 2 additions & 0 deletions crawlab_ai/cli/__init__.py
@@ -1,12 +1,14 @@
import argparse

from crawlab_ai.cli.gen_code import setup_gen_code_parser
from crawlab_ai.cli.config import setup_config_parser
from crawlab_ai.cli.crawl import setup_crawl_parser

parser = argparse.ArgumentParser(description="Web scraping tool")
subparsers = parser.add_subparsers(dest="command")

setup_crawl_parser(subparsers)
setup_gen_code_parser(subparsers)
setup_config_parser(subparsers)


7 changes: 5 additions & 2 deletions crawlab_ai/cli/crawl.py
@@ -38,5 +38,8 @@ def crawl_list(args):

def crawl_article(args):
    data = read_article(args.url)
-    with open(args.output, "w") as f:
-        f.write(json.dumps(data))
+    if args.output:
+        with open(args.output, "w") as f:
+            f.write(json.dumps(data))
+    else:
+        print(json.dumps(data, indent=2))
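For context (not part of the diff), a minimal sketch of the new behavior: with no output path, crawl_article now prints the article JSON to stdout. The Namespace below simulates parsed CLI args, since the flag names defined in setup_crawl_parser() are not shown in this commit, and a valid API token is assumed to be configured.

import argparse

from crawlab_ai.cli.crawl import crawl_article

# Simulated CLI args; output=None triggers the new print-to-stdout branch.
args = argparse.Namespace(url="https://www.36kr.com/p/2601845967059847", output=None)
crawl_article(args)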
33 changes: 33 additions & 0 deletions crawlab_ai/cli/gen_code.py
@@ -1,12 +1,45 @@
from crawlab_ai.code.article import get_code_article
from crawlab_ai.code.list import get_code_list
from crawlab_ai.utils.auth import get_token


def setup_gen_code_parser(subparsers):
    gen_code_parser = subparsers.add_parser(
        "gen_code", help="Generate crawler code for a webpage"
    )
    gen_code_parser.add_argument("url", help="URL to generate code for")
    gen_code_parser.add_argument(
        "-t",
        "--type",
        help="Type of the webpage to generate code for",
        default="list",
        choices=["article", "list"],
    )
    gen_code_parser.add_argument("-o", "--output", help="Output file path")
    gen_code_parser.set_defaults(func=gen_code)


def gen_code(args):
    get_token()
    if args.type == "list":
        gen_code_list(args)
    elif args.type == "article":
        gen_code_article(args)


def gen_code_list(args):
    code = get_code_list(args.url)
    if args.output:
        with open(args.output, "w") as f:
            f.write(code)
    else:
        print(code)


def gen_code_article(args):
    code = get_code_article(args.url)
    if args.output:
        with open(args.output, "w") as f:
            f.write(code)
    else:
        print(code)
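For reference (not part of the diff), a minimal sketch of exercising the new gen_code subcommand through the argparse wiring above. It drives the module-level parser from crawlab_ai/cli/__init__.py directly, since the installed console-script name is not shown in this commit, and assumes a valid API token because gen_code() calls get_token() first.

from crawlab_ai.cli import parser

# Parse a gen_code invocation and dispatch via the handler set by set_defaults(func=gen_code).
args = parser.parse_args(
    ["gen_code", "https://quotes.toscrape.com", "-t", "list", "-o", "spider.py"]
)
args.func(args)  # gen_code() -> gen_code_list() -> get_code_list(), then writes spider.py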
20 changes: 20 additions & 0 deletions crawlab_ai/code/article.py
@@ -0,0 +1,20 @@
import requests
from crawlab_ai.spider.article_spider import extract_article_rules
from crawlab_ai.utils.auth import get_auth_headers
from crawlab_ai.utils.env import get_api_endpoint


def get_code_article(url: str):
    rules = extract_article_rules(url)
    res = requests.post(
        url=get_api_endpoint() + "/code/article",
        headers=get_auth_headers(),
        json={"url": url, "rules": rules},
    )
    res.raise_for_status()
    data = res.json()
    return data["source_code"]


if __name__ == "__main__":
    print(get_code_article("https://www.36kr.com/p/2601845967059847"))
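A usage sketch for the new helper (not part of the diff): persist the generated article-spider source to disk. This assumes the API token and endpoint are already configured, since the request goes through get_auth_headers() and get_api_endpoint(); the output filename is an arbitrary example.

from crawlab_ai.code.article import get_code_article

# Generate article-spider code for a page and save it locally.
source = get_code_article("https://www.36kr.com/p/2601845967059847")
with open("article_spider_generated.py", "w") as f:
    f.write(source)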
22 changes: 22 additions & 0 deletions crawlab_ai/code/list.py
@@ -0,0 +1,22 @@
from typing import List

import requests
from crawlab_ai.spider.list_spider import extract_list_rules
from crawlab_ai.utils.auth import get_auth_headers
from crawlab_ai.utils.env import get_api_endpoint


def get_code_list(url: str, fields: List[str] | dict = None):
    rules = extract_list_rules(url, fields)
    res = requests.post(
        url=get_api_endpoint() + "/code/list",
        headers=get_auth_headers(),
        json={"url": url, "rules": rules},
    )
    res.raise_for_status()
    data = res.json()
    return data["source_code"]


if __name__ == "__main__":
    print(get_code_list("https://quotes.toscrape.com"))
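Likewise, a rough usage sketch for the list variant (not part of the diff). The optional fields argument is forwarded to extract_list_rules(), which presumably constrains the columns the generated crawler extracts; the field names below are illustrative.

from crawlab_ai.code.list import get_code_list

# "text" and "author" are example field names for quotes.toscrape.com.
source = get_code_list("https://quotes.toscrape.com", fields=["text", "author"])
print(source)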
2 changes: 2 additions & 0 deletions crawlab_ai/scrapy/list_spider.py
@@ -70,6 +70,8 @@ def _fetch_rules(self):
"fields": self.fields,
},
)
res.raise_for_status()

data = res.json()
self._list_element_css_selector = data["model_list"][0]["list_model"][
"list_element_css_selector"
53 changes: 39 additions & 14 deletions crawlab_ai/spider/article_spider.py
@@ -9,7 +9,7 @@
class ArticleSpider(BaseSpider):
    def __init__(self, url: str, get_html=None):
        super().__init__(url, get_html)
-        self.data: dict | None = None
+        self.rules: dict | None = None
        self.url = url
        self.get_html = get_html or self.crawl

@@ -22,32 +22,57 @@ def fetch_rules(self):
"url": self.url,
},
)
if res.status_code != 200:
raise Exception("Failed to fetch rules for URL: " + self.url)
res.raise_for_status()

self.data = res.json()
self.rules = res.json()
logger.info("Rules fetched successfully for URL: " + self.url)
logger.info("Title: %s", self.data.get("title"))
logger.info("Author: %s", self.data.get("author"))
logger.info("Publish date: %s", self.data.get("publish_date"))
logger.info("Content: %s", self.data.get("content")[:200])
logger.info("Title CSS selector: %s", self.data.get("title_css_selector"))
logger.info("Author CSS selector: %s", self.data.get("author_css_selector"))
logger.info("Title: %s", self.rules.get("title"))
logger.info("Author: %s", self.rules.get("author"))
logger.info("Publish date: %s", self.rules.get("publish_date"))
logger.info("Content: %s", self.rules.get("content")[:200])
logger.info("Title CSS selector: %s", self.rules.get("title_css_selector"))
logger.info("Author CSS selector: %s", self.rules.get("author_css_selector"))
logger.info(
"Publish date CSS selector: %s", self.data.get("publish_date_css_selector")
"Publish date CSS selector: %s", self.rules.get("publish_date_css_selector")
)
logger.info("Content CSS selector: %s", self.data.get("content_css_selector"))
logger.info("Content CSS selector: %s", self.rules.get("content_css_selector"))

def crawl(self):
logger.info("Crawling URL: " + self.url)
self.fetch_rules()
logger.info("Crawling completed for URL: " + self.url)


def read_article(url: str, get_html=None):
def read_article(url: str, get_html=None) -> dict:
"""
Read an article from a URL
Args:
url (str): URL of the article
get_html (function): Function to get HTML content
Returns:
dict: Article data
"""
spider = ArticleSpider(url, get_html)
spider.crawl()
return spider.data
return spider.rules


def extract_article_rules(url: str, get_html=None) -> dict:
"""
Extract article rules from a URL
Args:
url (str): URL of the article
get_html (function): Function to get HTML content
Returns:
dict: Article rules
"""
spider = ArticleSpider(url, get_html)
spider.fetch_rules()
return spider.rules


if __name__ == "__main__":
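To summarize the refactor (a sketch only, not part of the diff): both public helpers now return the spider's rules dict — read_article() via a full crawl(), extract_article_rules() via fetch_rules() alone, which is what the new crawlab_ai/code/article.py relies on. A configured API token is assumed.

from crawlab_ai.spider.article_spider import read_article, extract_article_rules

# Full read: extracted fields plus the CSS selectors logged in fetch_rules().
article = read_article("https://www.36kr.com/p/2601845967059847")
print(article.get("title"), article.get("title_css_selector"))

# Rules only, without the crawl() logging wrapper.
rules = extract_article_rules("https://www.36kr.com/p/2601845967059847")
print(rules.get("content_css_selector"))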
33 changes: 24 additions & 9 deletions crawlab_ai/spider/list_spider.py
@@ -35,7 +35,6 @@ def __init__(self, url: str, fields: List[dict] = None, get_html=None):
        self.url = url
        self.fields = fields
        self.data = []
-        self.fetch_rules()

    @property
    def list_element_css_selector(self):
@@ -59,9 +58,9 @@ def fetch_rules(self):
"fields": self.fields,
},
)
res.raise_for_status()

data = res.json()
if res.status_code != 200:
raise Exception("Failed to fetch rules for URL: " + self.url)
self.rules = data["model_list"][0]
logger.info("Rules fetched successfully for URL: " + self.url)
logger.info("List element CSS selector: " + self.list_element_css_selector)
@@ -72,6 +71,7 @@ def crawl(self):
        )

    def crawl(self):
+        self.fetch_rules()
        futures = []
        with ThreadPoolExecutor(max_workers=5) as executor:
            futures.append(executor.submit(self._fetch_data, self.url))
@@ -125,8 +125,12 @@ def _get_fields(fields: List[str] | dict = None) -> Optional[List[str] | List[di


def read_list(
-    url: str, fields: List[str] | dict = None, get_html=None, as_dataframe=True
-) -> DataFrame | List[dict]:
+    url: str,
+    fields: List[str] | dict = None,
+    get_html=None,
+    as_dataframe=True,
+    return_rules=False,
+) -> DataFrame | List[dict] | tuple[DataFrame | List[dict], dict]:
    """
    Reads a list of items from a webpage and returns a DataFrame.
@@ -135,18 +139,29 @@ def read_list(
        fields (List[str] | dict): A list of fields to be extracted from each list element.
        get_html (function): A function to fetch the HTML content of a webpage. Defaults to the requests library.
        as_dataframe (bool): Whether to return the extracted data as a DataFrame. Defaults to True.
+        return_rules (bool): Whether to return the rules used for extraction. Defaults to False.
    Returns:
-        DataFrame | List[dict]: A DataFrame containing the extracted data if as_dataframe is True, otherwise a list of
-        dictionaries.
+        DataFrame | List[dict] | tuple[DataFrame | List[dict], dict]: The extracted data as a DataFrame or a list of
+        dictionaries. If return_rules is True, a tuple containing the data and the rules used for extraction is
+        returned.
    """
    spider = ListSpider(url=url, fields=_get_fields(fields), get_html=get_html)
    spider.crawl()
    if as_dataframe:
-        return DataFrame(spider.data)
+        return_data = DataFrame(spider.data)
    else:
-        return spider.data
+        return_data = spider.data
+    if return_rules:
+        return return_data, spider.rules
+    return return_data


+def extract_list_rules(url: str, fields: List[str] | dict = None) -> dict:
+    spider = ListSpider(url=url, fields=_get_fields(fields))
+    spider.fetch_rules()
+    return spider.rules


if __name__ == "__main__":
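A short usage sketch for the new return_rules flag and the extract_list_rules() helper (not part of the diff; field names are illustrative, and a configured API token is assumed).

from crawlab_ai.spider.list_spider import read_list, extract_list_rules

# Crawl the list pages and also get back the extraction rules that were used.
df, rules = read_list(
    "https://quotes.toscrape.com",
    fields=["text", "author"],
    return_rules=True,
)
print(df.head())
print(rules)

# Or fetch the rules only, without crawling the list pages.
rules_only = extract_list_rules("https://quotes.toscrape.com")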
