-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Loading status checks…
initial commit
Showing
53 changed files
with
3,325 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
name: Publish Python Package

on:
  push:
    branches: [ main, test, develop ]
  pull_request:
    types:
      - opened

env:
  PACKAGE_NAME: crawlab-sdk

jobs:
  deploy:
    runs-on: ubuntu-latest
    outputs:
      is_new_version: ${{ steps.check_version.outputs.is_new_version }}
    steps:
      - uses: actions/checkout@v3

      - name: Set up Python
        # setup-python@v1 predates Python 3.12 support; v4 is the
        # generation matching checkout@v3.
        uses: actions/setup-python@v4
        with:
          python-version: '3.12'

      - name: Install dependencies
        id: install_dependencies
        run: |
          python -m pip install --upgrade pip
          pip install poetry
          poetry install

      - name: Check version
        id: check_version
        run: |
          version=$(poetry version -s)
          # A 404 from PyPI means this version has not been released yet.
          # `|| true` keeps the step green when grep finds no match; the
          # previous `| true` piped grep's output INTO true, so $res was
          # always empty and no version was ever detected as new.
          res=$(curl "https://pypi.org/project/${{ env.PACKAGE_NAME }}/${version}/" -i -s | grep 'HTTP/2 404' || true)
          if [[ $res =~ 404 ]]; then
            echo "is_new_version=true" >> $GITHUB_OUTPUT
          else
            echo "is_new_version=false" >> $GITHUB_OUTPUT
          fi

      - name: Build and publish
        id: publish
        if: ${{ always() && steps.check_version.outputs.is_new_version == 'true' }}
        env:
          POETRY_PYPI_TOKEN_PYPI: ${{ secrets.PYPI_TOKEN }}
        run: |
          poetry publish --build
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,27 @@ | ||
# crawlab-python-sdk | ||
# Crawlab Python SDK | ||
|
||
Python SDK for Crawlab | ||
|
||
## Installation | ||
|
||
```bash | ||
pip install crawlab-sdk | ||
``` | ||
|
||
## Development | ||
|
||
### Install dependencies | ||
|
||
```bash | ||
pip install -r requirements.txt | ||
``` | ||
|
||
### Compile gRPC | ||
|
||
```bash | ||
# Set the environment variable CRAWLAB_PROTO_PATH to the path of the gRPC proto files | ||
export CRAWLAB_PROTO_PATH=/path/to/grpc/proto/files | ||
|
||
# Compile gRPC to Python code | ||
./compile_grpc.sh | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
#!/bin/bash
# Compile Crawlab gRPC proto files into Python modules under ./crawlab/grpc.

# Without globstar, `**` behaves like `*` and only matches one directory
# level, silently skipping nested proto packages.
shopt -s globstar

# proto_path_root is the path to the directory containing the proto files
proto_path_root="${CRAWLAB_PROTO_PATH}"

# check if proto_path_root is empty
if [ -z "$proto_path_root" ]; then
  echo "Please set the CRAWLAB_PROTO_PATH environment variable to the path containing the proto files."
  exit 1
fi

# check if proto_path_root exists
if [ ! -d "$proto_path_root" ]; then
  echo "The directory specified by CRAWLAB_PROTO_PATH does not exist."
  exit 1
fi

# output_path is the path to the directory where the generated Python code will be saved
output_path=./crawlab/grpc

# Remove the output directory if it exists
if [ -d "${output_path}" ]; then
  rm -rf "${output_path}"
fi

# Create the output directory
mkdir -p "${output_path}"

# Generate the Python code from the proto files
# (the glob itself must stay unquoted so it expands)
python -m grpc_tools.protoc \
    -I "${proto_path_root}" \
    --python_out="${output_path}" \
    --grpc_python_out="${output_path}" \
    "${proto_path_root}"/**/*.proto

# Convert imports to absolute paths so the generated modules import each
# other through the crawlab.grpc package instead of the top level.
sed -i 's/from \([a-zA-Z0-9_]*\) import \([a-zA-Z0-9_]*\)_pb2/from crawlab.grpc.\1 import \2_pb2/g' "${output_path}"/**/*.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
"""Public API of the crawlab package."""

from crawlab.result import save_item, save_items
from crawlab.scrapy.pipelines import CrawlabPipeline

__all__ = [
    'save_item',
    'save_items',
    'CrawlabPipeline',
]
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
from crawlab.client import http_post | ||
from crawlab.config.config import config | ||
from crawlab.constants.upload import ( | ||
CLI_DEFAULT_CONFIG_KEY_USERNAME, | ||
CLI_DEFAULT_CONFIG_KEY_PASSWORD, | ||
CLI_DEFAULT_CONFIG_KEY_API_ADDRESS, | ||
CLI_DEFAULT_CONFIG_KEY_TOKEN, | ||
) | ||
|
||
|
||
def login(api_address: str, username: str, password: str):
    """Authenticate against a Crawlab server and persist the session.

    Posts the credentials to ``{api_address}/login``; on success stores the
    username, password, API address and returned token in the CLI config.
    On failure the error is printed and nothing is saved.

    Args:
        api_address: Base URL of the Crawlab API (e.g. "http://host:8000/api").
        username: Account user name.
        password: Account password.
    """
    url = f"{api_address}/login"
    try:
        res = http_post(
            url,
            {
                "username": username,
                "password": password,
            },
        )
        # Parse inside the try so a malformed response is reported rather
        # than raised out of the function.
        token = res.json().get("data")
    except Exception as e:
        print(e)
        return

    config.set(CLI_DEFAULT_CONFIG_KEY_USERNAME, username)
    config.set(CLI_DEFAULT_CONFIG_KEY_PASSWORD, password)
    config.set(CLI_DEFAULT_CONFIG_KEY_API_ADDRESS, api_address)
    config.set(CLI_DEFAULT_CONFIG_KEY_TOKEN, token)
    config.save()
    # Only report success once the token has actually been stored; the
    # original printed it before the response was even parsed.
    print("logged-in successfully")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,136 @@ | ||
import os | ||
import re | ||
import sys | ||
from typing import Optional | ||
|
||
from httpx import Response | ||
from rich.console import Console | ||
|
||
from crawlab.client import http_post | ||
from crawlab.constants.upload import ( | ||
CLI_DEFAULT_UPLOAD_SPIDER_MODE, | ||
CLI_DEFAULT_UPLOAD_SPIDER_CMD, | ||
CLI_DEFAULT_UPLOAD_IGNORE_PATTERNS, | ||
) | ||
|
||
console = Console() | ||
|
||
|
||
def create_spider(
    name: str,
    description: Optional[str] = None,
    mode: Optional[str] = None,
    priority: Optional[int] = None,
    cmd: Optional[str] = None,
    param: Optional[str] = None,
    col_name: Optional[str] = None,
) -> Response:
    """Create a spider on the Crawlab server via POST /spiders.

    Optional fields left as None are filled in: the results collection name
    is derived from the spider name, and mode/cmd fall back to the CLI
    default constants.

    Returns:
        The HTTP response from the server.
    """
    # Default results collection: "results_" + lower-cased,
    # underscore-joined spider name.
    if col_name is None:
        slug = "_".join(name.lower().split(" "))
        col_name = f"results_{slug}"

    payload = {
        "name": name,
        "description": description,
        "mode": mode if mode is not None else CLI_DEFAULT_UPLOAD_SPIDER_MODE,
        "priority": priority,
        "cmd": cmd if cmd is not None else CLI_DEFAULT_UPLOAD_SPIDER_CMD,
        "param": param,
        "col_name": col_name,
    }
    return http_post(url="/spiders", data=payload)
|
||
|
||
def upload_file(_id: str, file_path: str, target_path: str) -> Response:
    """Upload one local file into the file storage of spider *_id*.

    Args:
        _id: Spider id on the server.
        file_path: Local path of the file to upload.
        target_path: Destination path recorded on the server.

    Returns:
        The HTTP response from the server.
    """
    url = f"/spiders/{_id}/files/save"
    with open(file_path, "rb") as fp:
        return http_post(
            url=url,
            data={"path": target_path},
            files={"file": fp},
            headers={},
        )
|
||
|
||
def upload_dir(
    dir_path: str,
    create: bool = True,
    spider_id: str = None,
    name=None,
    description=None,
    mode=None,
    priority=None,
    cmd=None,
    param=None,
    col_name=None,
    exclude_path: list = None,
):
    """Upload every non-ignored file under *dir_path* to a Crawlab spider.

    When *create* is True a new spider is created first (the process exits
    if creation fails) and its id is used; otherwise *spider_id* must be
    supplied by the caller. Per-file results are printed and summarized.
    """
    # create spider
    if create:
        response = create_spider(
            name=name,
            description=description,
            mode=mode,
            priority=priority,
            cmd=cmd,
            param=param,
            col_name=col_name,
        )
        if response.status_code != 200:
            console.print(f"[red]create spider {name} failed[/red]")
            sys.exit(1)
        spider_id = response.json().get("data").get("_id")
        console.print(f"[green]created spider {name} (id: {spider_id})[/green]")

    # Without a spider id every upload below would target an invalid URL
    # (/spiders/None/files/save), so fail fast instead.
    if spider_id is None:
        console.print("[red]no spider id provided[/red]")
        sys.exit(1)

    # stats
    stats = {
        "success": 0,
        "error": 0,
    }

    # iterate all files in the directory
    for root, _dirs, files in os.walk(dir_path):
        for file_name in files:
            # file path
            file_path = os.path.join(root, file_name)

            # ignored file
            if is_ignored(file_path, exclude_path):
                continue

            # Target path: strip only the LEADING dir_path prefix. The
            # previous str.replace(dir_path, "") also removed later
            # occurrences of dir_path inside the relative part of the
            # path, corrupting the destination.
            if file_path.startswith(dir_path):
                target_path = file_path[len(dir_path):]
            else:
                target_path = file_path

            # upload file
            response = upload_file(spider_id, file_path, target_path)
            if response.status_code != 200:
                console.print(f"[red]failed to upload {file_path}[/red]")
                stats["error"] += 1
                continue
            console.print(f"[green]uploaded {file_path}[/green]")
            stats["success"] += 1

    # logging
    console.print(f"[green]uploaded spider {name}[/green]")
    console.print(f"[cyan]success: {stats['success']}[/cyan]")
    console.print(f"[cyan]failed: {stats['error']}[/cyan]")
|
||
|
||
def is_ignored(file_path: str, exclude_path_patterns: list = None) -> bool:
    """Return True when *file_path* matches any exclusion regex.

    Caller-supplied patterns are checked together with the CLI's built-in
    ignore patterns; matching uses ``re.search`` (match anywhere in path).
    """
    patterns = list(exclude_path_patterns or []) + CLI_DEFAULT_UPLOAD_IGNORE_PATTERNS
    return any(re.search(pattern, file_path) is not None for pattern in patterns)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
import os | ||
|
||
from grpc_interceptor_headers.header_manipulator_client_interceptor import header_adder_interceptor | ||
|
||
|
||
def _get_auth_token_env() -> str:
    """Read the gRPC auth token from the CRAWLAB_GRPC_AUTH_KEY env variable.

    Returns None when the variable is not set.
    """
    token = os.environ.get('CRAWLAB_GRPC_AUTH_KEY')
    return token
||
def get_auth_token_interceptor():
    """Build a gRPC client interceptor carrying the auth token.

    The token is read from the CRAWLAB_GRPC_AUTH_KEY environment variable
    and attached to calls as the 'authorization' header.
    """
    return header_adder_interceptor('authorization', _get_auth_token_env())
Oops, something went wrong.