-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing 8 changed files with 147 additions and 25 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,12 +1,45 @@ | ||
from crawlab_ai.code.article import get_code_article | ||
from crawlab_ai.code.list import get_code_list | ||
from crawlab_ai.utils.auth import get_token | ||
|
||
|
||
def setup_gen_code_parser(subparsers):
    """Register the ``gen_code`` sub-command and its arguments.

    Args:
        subparsers: Object exposing ``add_parser`` (presumably the value
            returned by ``argparse.ArgumentParser.add_subparsers()``).
    """
    parser = subparsers.add_parser(
        "gen_code",
        help="Generate crawler code for a webpage",
    )

    # Positional: the page to generate a crawler for.
    parser.add_argument("url", help="URL to generate code for")

    # Optional: which kind of page the URL points to (defaults to a list page).
    parser.add_argument(
        "-t",
        "--type",
        choices=["article", "list"],
        default="list",
        help="Type of the webpage to generate code for",
    )

    # Optional: where to write the generated code.
    parser.add_argument("-o", "--output", help="Output file path")

    # Expose the handler on the parsed namespace as ``args.func``.
    parser.set_defaults(func=gen_code)
|
||
|
||
def gen_code(args):
    """Fetch the auth token, then dispatch to the generator for ``args.type``.

    Args:
        args: Parsed CLI namespace; ``type`` is read here and the whole
            namespace is forwarded to the selected generator.

    Any ``args.type`` other than ``"list"`` or ``"article"`` is a no-op
    after the token call.
    """
    get_token()

    page_type = args.type
    if page_type == "list":
        gen_code_list(args)
        return
    if page_type == "article":
        gen_code_article(args)
|
||
|
||
def gen_code_list(args):
    """Generate crawler code for a list page and write or print it.

    Args:
        args: Parsed CLI namespace providing ``url`` and ``output``.
            When ``output`` is truthy the code is written to that path,
            otherwise it is printed to stdout.
    """
    code = get_code_list(args.url)
    if args.output:
        # Explicit UTF-8: generated code may contain non-ASCII text, and the
        # platform default encoding (e.g. cp1252 on Windows) could fail on it.
        with open(args.output, "w", encoding="utf-8") as f:
            f.write(code)
    else:
        print(code)
|
||
|
||
def gen_code_article(args):
    """Generate crawler code for an article page and write or print it.

    Args:
        args: Parsed CLI namespace providing ``url`` and ``output``.
            When ``output`` is truthy the code is written to that path,
            otherwise it is printed to stdout.
    """
    code = get_code_article(args.url)
    if args.output:
        # Explicit UTF-8: generated code may contain non-ASCII text, and the
        # platform default encoding (e.g. cp1252 on Windows) could fail on it.
        with open(args.output, "w", encoding="utf-8") as f:
            f.write(code)
    else:
        print(code)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
import requests | ||
from crawlab_ai.spider.article_spider import extract_article_rules | ||
from crawlab_ai.utils.auth import get_auth_headers | ||
from crawlab_ai.utils.env import get_api_endpoint | ||
|
||
|
||
def get_code_article(url: str, timeout: float | None = None):
    """Request generated crawler source code for an article page.

    Extracts article rules for ``url`` via ``extract_article_rules`` and
    POSTs them to the ``/code/article`` endpoint.

    Args:
        url: Address of the article page.
        timeout: Seconds to wait for the API before ``requests`` gives up.
            ``None`` (the default) waits indefinitely, matching the
            previous behaviour.

    Returns:
        The ``"source_code"`` entry of the JSON response.

    Raises:
        requests.HTTPError: If the API responds with an error status.
        KeyError: If the response JSON has no ``"source_code"`` key.
    """
    rules = extract_article_rules(url)
    res = requests.post(
        url=get_api_endpoint() + "/code/article",
        headers=get_auth_headers(),
        json={"url": url, "rules": rules},
        timeout=timeout,
    )
    # Surface 4xx/5xx responses before trying to read the payload.
    res.raise_for_status()
    data = res.json()
    return data["source_code"]
|
||
|
||
if __name__ == "__main__":
    # Manual smoke run: print the generated code for a sample article URL.
    sample_url = "https://www.36kr.com/p/2601845967059847"
    print(get_code_article(sample_url))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
from typing import List | ||
|
||
import requests | ||
from crawlab_ai.spider.list_spider import extract_list_rules | ||
from crawlab_ai.utils.auth import get_auth_headers | ||
from crawlab_ai.utils.env import get_api_endpoint | ||
|
||
|
||
def get_code_list(
    url: str,
    fields: List[str] | dict | None = None,
    timeout: float | None = None,
):
    """Request generated crawler source code for a list page.

    Extracts list rules for ``url`` via ``extract_list_rules`` and POSTs
    them to the ``/code/list`` endpoint.

    Args:
        url: Address of the list page.
        fields: Optional field specification forwarded unchanged to
            ``extract_list_rules``; ``None`` when not supplied.
        timeout: Seconds to wait for the API before ``requests`` gives up.
            ``None`` (the default) waits indefinitely, matching the
            previous behaviour.

    Returns:
        The ``"source_code"`` entry of the JSON response.

    Raises:
        requests.HTTPError: If the API responds with an error status.
        KeyError: If the response JSON has no ``"source_code"`` key.
    """
    rules = extract_list_rules(url, fields)
    res = requests.post(
        url=get_api_endpoint() + "/code/list",
        headers=get_auth_headers(),
        json={"url": url, "rules": rules},
        timeout=timeout,
    )
    # Surface 4xx/5xx responses before trying to read the payload.
    res.raise_for_status()
    data = res.json()
    return data["source_code"]
|
||
|
||
if __name__ == "__main__":
    # Manual smoke run: print the generated code for a sample list page.
    sample_url = "https://quotes.toscrape.com"
    print(get_code_list(sample_url))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters