Commit
initial commit
tikazyq committed Nov 20, 2024
1 parent 5850768 commit 2335d55
Showing 53 changed files with 3,325 additions and 2 deletions.
50 changes: 50 additions & 0 deletions .github/workflows/publish_python.yaml
@@ -0,0 +1,50 @@
name: Publish Python Package

on:
  push:
    branches: [ main, test, develop ]
  pull_request:
    types:
      - opened

env:
  PACKAGE_NAME: crawlab-sdk

jobs:
  deploy:
    runs-on: ubuntu-latest
    outputs:
      is_new_version: ${{ steps.check_version.outputs.is_new_version }}
    steps:
      - uses: actions/checkout@v3

      - name: Set up Python
        uses: actions/setup-python@v1
        with:
          python-version: '3.12'

      - name: Install dependencies
        id: install_dependencies
        run: |
          python -m pip install --upgrade pip
          pip install poetry
          poetry install

      - name: Check version
        id: check_version
        run: |
          version=`poetry version -s`
          # Query PyPI for this version; a 404 means it has not been published yet.
          res=`curl https://pypi.org/project/${{ env.PACKAGE_NAME }}/${version}/ -i -s | grep 'HTTP/2 404' || true`
          if [[ $res =~ 404 ]]; then
            echo "is_new_version=true" >> $GITHUB_OUTPUT
          else
            echo "is_new_version=false" >> $GITHUB_OUTPUT
          fi

      - name: Build and publish
        id: publish
        if: ${{ always() && steps.check_version.outputs.is_new_version == 'true' }}
        env:
          POETRY_PYPI_TOKEN_PYPI: ${{ secrets.PYPI_TOKEN }}
        run: |
          poetry publish --build
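
The check_version step gates publishing on whether the version declared in pyproject.toml already exists on PyPI. A minimal sketch of the same check in Python (not part of this commit; the package name and version are placeholders):

```python
# Sketch only: mirrors the workflow's curl/grep check against PyPI.
import urllib.error
import urllib.request


def is_new_version(package: str, version: str) -> bool:
    """Return True if `package==version` is not yet published on PyPI."""
    url = f"https://pypi.org/project/{package}/{version}/"
    try:
        urllib.request.urlopen(url)
        return False  # 200 OK: the version already exists
    except urllib.error.HTTPError as e:
        return e.code == 404  # 404: version not published yet


if __name__ == "__main__":
    print(is_new_version("crawlab-sdk", "0.0.1"))
```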
2 changes: 1 addition & 1 deletion .gitignore
@@ -159,4 +159,4 @@ cython_debug/
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
.idea/
27 changes: 26 additions & 1 deletion README.md
@@ -1,2 +1,27 @@
# crawlab-python-sdk
# Crawlab Python SDK

Python SDK for Crawlab

## Installation

```bash
pip install crawlab-sdk
```

## Development

### Install dependencies

```bash
pip install -r requirements.txt
```

### Compile gRPC

```bash
# Set the environment variable CRAWLAB_PROTO_PATH to the path of the gRPC proto files
export CRAWLAB_PROTO_PATH=/path/to/grpc/proto/files

# Compile gRPC to Python code
./compile_grpc.sh
```
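
Not part of the README in this commit, but given the top-level exports added in crawlab/__init__.py below (save_item, save_items), basic usage would look roughly like this; the item fields and exact signatures are assumptions for illustration:

```python
# Sketch only, assuming save_item/save_items accept plain dict items.
from crawlab import save_item, save_items

save_item({"title": "Example page", "url": "https://example.com"})
save_items([
    {"title": "Page 1", "url": "https://example.com/1"},
    {"title": "Page 2", "url": "https://example.com/2"},
])
```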
37 changes: 37 additions & 0 deletions compile_grpc.sh
@@ -0,0 +1,37 @@
#!/bin/bash

# proto_path_root is the path to the directory containing the proto files
proto_path_root=${CRAWLAB_PROTO_PATH}

# check if proto_path_root is empty
if [ -z "$proto_path_root" ]; then
  echo "Please set the CRAWLAB_PROTO_PATH environment variable to the path containing the proto files."
  exit 1
fi

# check if proto_path_root exists
if [ ! -d "$proto_path_root" ]; then
  echo "The directory specified by CRAWLAB_PROTO_PATH does not exist."
  exit 1
fi

# enable recursive ** globbing used below
shopt -s globstar

# output_path is the path to the directory where the generated Python code will be saved
output_path=./crawlab/grpc

# Remove the output directory if it exists
if [ -d ${output_path} ]; then
  rm -rf ${output_path}
fi

# Create the output directory
mkdir -p ${output_path}

# Generate the Python code from the proto files
python -m grpc_tools.protoc \
  -I ${proto_path_root} \
  --python_out=${output_path} \
  --grpc_python_out=${output_path} \
  ${proto_path_root}/**/*.proto

# Convert imports to absolute paths (GNU sed in-place edit)
sed -i 's/from \([a-zA-Z0-9_]*\) import \([a-zA-Z0-9_]*\)_pb2/from crawlab.grpc.\1 import \2_pb2/g' ${output_path}/**/*.py
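
The final sed command rewrites the relative *_pb2 imports in the generated modules into absolute crawlab.grpc imports. A hedged Python equivalent (not part of the commit), useful where GNU sed's -i flag is unavailable:

```python
# Sketch only: rewrite generated *_pb2 imports to absolute `crawlab.grpc.*`
# imports, mirroring the sed command in compile_grpc.sh.
import re
from pathlib import Path

OUTPUT_PATH = Path("./crawlab/grpc")
PATTERN = re.compile(r"from ([a-zA-Z0-9_]*) import ([a-zA-Z0-9_]*)_pb2")

for py_file in OUTPUT_PATH.rglob("*.py"):
    text = py_file.read_text()
    new_text = PATTERN.sub(r"from crawlab.grpc.\1 import \2_pb2", text)
    if new_text != text:
        py_file.write_text(new_text)
```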
8 changes: 8 additions & 0 deletions crawlab/__init__.py
@@ -0,0 +1,8 @@
__all__ = [
'save_item',
'save_items',
'CrawlabPipeline',
]

from crawlab.result import save_item, save_items
from crawlab.scrapy.pipelines import CrawlabPipeline
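
A hedged sketch of enabling the exported pipeline in a Scrapy project's settings.py, assuming CrawlabPipeline is a standard Scrapy item pipeline as its location in crawlab.scrapy.pipelines suggests (the priority value 300 is arbitrary):

```python
# settings.py of a Scrapy project — sketch only.
ITEM_PIPELINES = {
    "crawlab.scrapy.pipelines.CrawlabPipeline": 300,  # 300 is an arbitrary pipeline order
}
```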
Empty file added crawlab/actions/__init__.py
Empty file.
31 changes: 31 additions & 0 deletions crawlab/actions/login.py
@@ -0,0 +1,31 @@
from crawlab.client import http_post
from crawlab.config.config import config
from crawlab.constants.upload import (
    CLI_DEFAULT_CONFIG_KEY_USERNAME,
    CLI_DEFAULT_CONFIG_KEY_PASSWORD,
    CLI_DEFAULT_CONFIG_KEY_API_ADDRESS,
    CLI_DEFAULT_CONFIG_KEY_TOKEN,
)


def login(api_address: str, username: str, password: str):
    url = f"{api_address}/login"
    try:
        res = http_post(
            url,
            {
                "username": username,
                "password": password,
            },
        )
        print("logged-in successfully")
    except Exception as e:
        print(e)
        return

    token = res.json().get("data")
    config.set(CLI_DEFAULT_CONFIG_KEY_USERNAME, username)
    config.set(CLI_DEFAULT_CONFIG_KEY_PASSWORD, password)
    config.set(CLI_DEFAULT_CONFIG_KEY_API_ADDRESS, api_address)
    config.set(CLI_DEFAULT_CONFIG_KEY_TOKEN, token)
    config.save()
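
A minimal usage sketch (not in the commit); the API address and credentials are placeholders:

```python
# Sketch only: address and credentials below are placeholders.
from crawlab.actions.login import login

login(api_address="http://localhost:8080/api", username="admin", password="admin")
```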
136 changes: 136 additions & 0 deletions crawlab/actions/upload.py
@@ -0,0 +1,136 @@
import os
import re
import sys
from typing import Optional

from httpx import Response
from rich.console import Console

from crawlab.client import http_post
from crawlab.constants.upload import (
    CLI_DEFAULT_UPLOAD_SPIDER_MODE,
    CLI_DEFAULT_UPLOAD_SPIDER_CMD,
    CLI_DEFAULT_UPLOAD_IGNORE_PATTERNS,
)

console = Console()


def create_spider(
    name: str,
    description: Optional[str] = None,
    mode: Optional[str] = None,
    priority: Optional[int] = None,
    cmd: Optional[str] = None,
    param: Optional[str] = None,
    col_name: Optional[str] = None,
) -> Response:
    # results collection name
    if col_name is None:
        col_name = f'results_{"_".join(name.lower().split(" "))}'

    # mode
    if mode is None:
        mode = CLI_DEFAULT_UPLOAD_SPIDER_MODE

    # cmd
    if cmd is None:
        cmd = CLI_DEFAULT_UPLOAD_SPIDER_CMD

    # http post
    return http_post(
        url="/spiders",
        data={
            "name": name,
            "description": description,
            "mode": mode,
            "priority": priority,
            "cmd": cmd,
            "param": param,
            "col_name": col_name,
        },
    )


def upload_file(_id: str, file_path: str, target_path: str) -> Response:
    with open(file_path, "rb") as f:
        data = {
            "path": target_path,
        }
        files = {"file": f}

        url = f"/spiders/{_id}/files/save"
        return http_post(url=url, data=data, files=files, headers={})


def upload_dir(
    dir_path: str,
    create: bool = True,
    spider_id: str = None,
    name=None,
    description=None,
    mode=None,
    priority=None,
    cmd=None,
    param=None,
    col_name=None,
    exclude_path: list = None,
):
    # create spider
    if create:
        response = create_spider(
            name=name,
            description=description,
            mode=mode,
            priority=priority,
            cmd=cmd,
            param=param,
            col_name=col_name,
        )
        if response.status_code != 200:
            console.print(f"[red]create spider {name} failed[/red]")
            sys.exit(1)
        spider_id = response.json().get("data").get("_id")
        console.print(f"[green]created spider {name} (id: {spider_id})[/green]")

    # stats
    stats = {
        "success": 0,
        "error": 0,
    }

    # iterate all files in the directory
    for root, dirs, files in os.walk(dir_path):
        for file_name in files:
            # file path
            file_path = os.path.join(root, file_name)

            # ignored file
            if is_ignored(file_path, exclude_path):
                continue

            # target path
            target_path = file_path.replace(dir_path, "")

            # upload file
            response = upload_file(spider_id, file_path, target_path)
            if response.status_code != 200:
                console.print(f"[red]failed to upload {file_path}[/red]")
                stats["error"] += 1
                continue
            console.print(f"[green]uploaded {file_path}[/green]")
            stats["success"] += 1

    # logging
    console.print(f"[green]uploaded spider {name}[/green]")
    console.print(f"[cyan]success: {stats['success']}[/cyan]")
    console.print(f"[cyan]failed: {stats['error']}[/cyan]")


def is_ignored(file_path: str, exclude_path_patterns: list = None) -> bool:
    exclude_path_patterns = exclude_path_patterns or []
    ignore_patterns = exclude_path_patterns + CLI_DEFAULT_UPLOAD_IGNORE_PATTERNS
    for pat in ignore_patterns:
        if re.search(pat, file_path) is not None:
            return True
    return False
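
A hedged example of invoking the upload helper (not part of the commit; the directory, name, and exclude patterns are placeholders):

```python
# Sketch only: directory, spider name, and exclude patterns are placeholders.
from crawlab.actions.upload import upload_dir

upload_dir(
    dir_path="./my_spider",
    name="my_spider",
    description="Example spider",
    exclude_path=[r"\.venv", r"__pycache__"],
)
```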
14 changes: 14 additions & 0 deletions crawlab/auth_token.py
@@ -0,0 +1,14 @@
import os

from grpc_interceptor_headers.header_manipulator_client_interceptor import header_adder_interceptor


def _get_auth_token_env() -> str:
    return os.getenv('CRAWLAB_GRPC_AUTH_KEY')


def get_auth_token_interceptor():
    header_name = 'authorization'
    header_content = _get_auth_token_env()
    header_interceptor = header_adder_interceptor(header_name, header_content)
    return header_interceptor
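
A hedged sketch (not in the commit) of attaching this client-side interceptor to a gRPC channel with the standard grpc.intercept_channel API; the server address is a placeholder:

```python
# Sketch only: the gRPC server address below is a placeholder.
import grpc

from crawlab.auth_token import get_auth_token_interceptor

channel = grpc.insecure_channel("localhost:9666")
channel = grpc.intercept_channel(channel, get_auth_token_interceptor())
# `channel` now adds the `authorization` header (from CRAWLAB_GRPC_AUTH_KEY)
# to every outgoing RPC and can be passed to a generated stub.
```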