Commit
initial commit
tikazyq committed Nov 20, 2024
1 parent 5850768 commit 2335d55
Showing 53 changed files with 3,325 additions and 2 deletions.
50 changes: 50 additions & 0 deletions .github/workflows/publish_python.yaml
@@ -0,0 +1,50 @@
name: Publish Python Package

on:
  push:
    branches: [ main, test, develop ]
  pull_request:
    types:
      - opened

env:
  PACKAGE_NAME: crawlab-sdk

jobs:
  deploy:
    runs-on: ubuntu-latest
    outputs:
      is_new_version: ${{ steps.check_version.outputs.is_new_version }}
    steps:
      - uses: actions/checkout@v3

      - name: Set up Python
        uses: actions/setup-python@v1
        with:
          python-version: '3.12'

      - name: Install dependencies
        id: install_dependencies
        run: |
          python -m pip install --upgrade pip
          pip install poetry
          poetry install

      - name: Check version
        id: check_version
        run: |
          version=`poetry version -s`
          # Query PyPI for this version; a 404 means it has not been published yet.
          res=`curl https://pypi.org/project/${{ env.PACKAGE_NAME }}/${version}/ -i -s | grep 'HTTP/2 404' || true`
          if [[ $res =~ 404 ]]; then
            echo "is_new_version=true" >> $GITHUB_OUTPUT
          else
            echo "is_new_version=false" >> $GITHUB_OUTPUT
          fi

      - name: Build and publish
        id: publish
        if: ${{ always() && steps.check_version.outputs.is_new_version == 'true' }}
        env:
          POETRY_PYPI_TOKEN_PYPI: ${{ secrets.PYPI_TOKEN }}
        run: |
          poetry publish --build
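
The check_version step gates publishing on whether the version declared in pyproject.toml already exists on PyPI. A minimal sketch of the same check in Python (not part of this commit; the package name and version are placeholders):

```python
# Sketch only: mirrors the workflow's curl/grep check against PyPI.
import urllib.error
import urllib.request


def is_new_version(package: str, version: str) -> bool:
    """Return True if `package==version` is not yet published on PyPI."""
    url = f"https://pypi.org/project/{package}/{version}/"
    try:
        urllib.request.urlopen(url)
        return False  # 200 OK: the version already exists
    except urllib.error.HTTPError as e:
        return e.code == 404  # 404: version not published yet


if __name__ == "__main__":
    print(is_new_version("crawlab-sdk", "0.0.1"))
```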
2 changes: 1 addition & 1 deletion .gitignore
@@ -159,4 +159,4 @@ cython_debug/
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
.idea/
27 changes: 26 additions & 1 deletion README.md
@@ -1,2 +1,27 @@
# crawlab-python-sdk
# Crawlab Python SDK

Python SDK for Crawlab

## Installation

```bash
pip install crawlab-sdk
```

## Development

### Install dependencies

```bash
pip install -r requirements.txt
```

### Compile gRPC

```bash
# Set the environment variable CRAWLAB_PROTO_PATH to the path of the gRPC proto files
export CRAWLAB_PROTO_PATH=/path/to/grpc/proto/files

# Compile gRPC to Python code
./compile_grpc.sh
```
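
Not part of the README in this commit, but given the top-level exports added in crawlab/__init__.py below (save_item, save_items), basic usage would look roughly like this; the item fields and exact signatures are assumptions for illustration:

```python
# Sketch only, assuming save_item/save_items accept plain dict items.
from crawlab import save_item, save_items

save_item({"title": "Example page", "url": "https://example.com"})
save_items([
    {"title": "Page 1", "url": "https://example.com/1"},
    {"title": "Page 2", "url": "https://example.com/2"},
])
```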
37 changes: 37 additions & 0 deletions compile_grpc.sh
@@ -0,0 +1,37 @@
#!/bin/bash

# proto_path_root is the path to the directory containing the proto files
proto_path_root=${CRAWLAB_PROTO_PATH}

# check if proto_path_root is empty
if [ -z "$proto_path_root" ]; then
  echo "Please set the CRAWLAB_PROTO_PATH environment variable to the path containing the proto files."
  exit 1
fi

# check if proto_path_root exists
if [ ! -d "$proto_path_root" ]; then
  echo "The directory specified by CRAWLAB_PROTO_PATH does not exist."
  exit 1
fi

# enable recursive ** globbing used below
shopt -s globstar

# output_path is the path to the directory where the generated Python code will be saved
output_path=./crawlab/grpc

# Remove the output directory if it exists
if [ -d ${output_path} ]; then
  rm -rf ${output_path}
fi

# Create the output directory
mkdir -p ${output_path}

# Generate the Python code from the proto files
python -m grpc_tools.protoc \
  -I ${proto_path_root} \
  --python_out=${output_path} \
  --grpc_python_out=${output_path} \
  ${proto_path_root}/**/*.proto

# Convert imports to absolute paths (GNU sed in-place edit)
sed -i 's/from \([a-zA-Z0-9_]*\) import \([a-zA-Z0-9_]*\)_pb2/from crawlab.grpc.\1 import \2_pb2/g' ${output_path}/**/*.py
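
The final sed command rewrites the relative *_pb2 imports in the generated modules into absolute crawlab.grpc imports. A hedged Python equivalent (not part of the commit), useful where GNU sed's -i flag is unavailable:

```python
# Sketch only: rewrite generated *_pb2 imports to absolute `crawlab.grpc.*`
# imports, mirroring the sed command in compile_grpc.sh.
import re
from pathlib import Path

OUTPUT_PATH = Path("./crawlab/grpc")
PATTERN = re.compile(r"from ([a-zA-Z0-9_]*) import ([a-zA-Z0-9_]*)_pb2")

for py_file in OUTPUT_PATH.rglob("*.py"):
    text = py_file.read_text()
    new_text = PATTERN.sub(r"from crawlab.grpc.\1 import \2_pb2", text)
    if new_text != text:
        py_file.write_text(new_text)
```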
8 changes: 8 additions & 0 deletions crawlab/__init__.py
@@ -0,0 +1,8 @@
__all__ = [
'save_item',
'save_items',
'CrawlabPipeline',
]

from crawlab.result import save_item, save_items
from crawlab.scrapy.pipelines import CrawlabPipeline
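
A hedged sketch of enabling the exported pipeline in a Scrapy project's settings.py, assuming CrawlabPipeline is a standard Scrapy item pipeline as its location in crawlab.scrapy.pipelines suggests (the priority value 300 is arbitrary):

```python
# settings.py of a Scrapy project — sketch only.
ITEM_PIPELINES = {
    "crawlab.scrapy.pipelines.CrawlabPipeline": 300,  # 300 is an arbitrary pipeline order
}
```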
Empty file added crawlab/actions/__init__.py
Empty file.
31 changes: 31 additions & 0 deletions crawlab/actions/login.py
@@ -0,0 +1,31 @@
from crawlab.client import http_post
from crawlab.config.config import config
from crawlab.constants.upload import (
    CLI_DEFAULT_CONFIG_KEY_USERNAME,
    CLI_DEFAULT_CONFIG_KEY_PASSWORD,
    CLI_DEFAULT_CONFIG_KEY_API_ADDRESS,
    CLI_DEFAULT_CONFIG_KEY_TOKEN,
)


def login(api_address: str, username: str, password: str):
    url = f"{api_address}/login"
    try:
        res = http_post(
            url,
            {
                "username": username,
                "password": password,
            },
        )
        print("logged-in successfully")
    except Exception as e:
        print(e)
        return

    token = res.json().get("data")
    config.set(CLI_DEFAULT_CONFIG_KEY_USERNAME, username)
    config.set(CLI_DEFAULT_CONFIG_KEY_PASSWORD, password)
    config.set(CLI_DEFAULT_CONFIG_KEY_API_ADDRESS, api_address)
    config.set(CLI_DEFAULT_CONFIG_KEY_TOKEN, token)
    config.save()
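
A minimal usage sketch (not in the commit); the API address and credentials are placeholders:

```python
# Sketch only: address and credentials below are placeholders.
from crawlab.actions.login import login

login(api_address="http://localhost:8080/api", username="admin", password="admin")
```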
136 changes: 136 additions & 0 deletions crawlab/actions/upload.py
@@ -0,0 +1,136 @@
import os
import re
import sys
from typing import Optional

from httpx import Response
from rich.console import Console

from crawlab.client import http_post
from crawlab.constants.upload import (
    CLI_DEFAULT_UPLOAD_SPIDER_MODE,
    CLI_DEFAULT_UPLOAD_SPIDER_CMD,
    CLI_DEFAULT_UPLOAD_IGNORE_PATTERNS,
)

console = Console()


def create_spider(
    name: str,
    description: Optional[str] = None,
    mode: Optional[str] = None,
    priority: Optional[int] = None,
    cmd: Optional[str] = None,
    param: Optional[str] = None,
    col_name: Optional[str] = None,
) -> Response:
    # results collection name
    if col_name is None:
        col_name = f'results_{"_".join(name.lower().split(" "))}'

    # mode
    if mode is None:
        mode = CLI_DEFAULT_UPLOAD_SPIDER_MODE

    # cmd
    if cmd is None:
        cmd = CLI_DEFAULT_UPLOAD_SPIDER_CMD

    # http post
    return http_post(
        url="/spiders",
        data={
            "name": name,
            "description": description,
            "mode": mode,
            "priority": priority,
            "cmd": cmd,
            "param": param,
            "col_name": col_name,
        },
    )


def upload_file(_id: str, file_path: str, target_path: str) -> Response:
    with open(file_path, "rb") as f:
        data = {
            "path": target_path,
        }
        files = {"file": f}

        url = f"/spiders/{_id}/files/save"
        return http_post(url=url, data=data, files=files, headers={})


def upload_dir(
    dir_path: str,
    create: bool = True,
    spider_id: str = None,
    name=None,
    description=None,
    mode=None,
    priority=None,
    cmd=None,
    param=None,
    col_name=None,
    exclude_path: list = None,
):
    # create spider
    if create:
        response = create_spider(
            name=name,
            description=description,
            mode=mode,
            priority=priority,
            cmd=cmd,
            param=param,
            col_name=col_name,
        )
        if response.status_code != 200:
            console.print(f"[red]create spider {name} failed[/red]")
            sys.exit(1)
        spider_id = response.json().get("data").get("_id")
        console.print(f"[green]created spider {name} (id: {spider_id})[/green]")

    # stats
    stats = {
        "success": 0,
        "error": 0,
    }

    # iterate all files in the directory
    for root, dirs, files in os.walk(dir_path):
        for file_name in files:
            # file path
            file_path = os.path.join(root, file_name)

            # ignored file
            if is_ignored(file_path, exclude_path):
                continue

            # target path
            target_path = file_path.replace(dir_path, "")

            # upload file
            response = upload_file(spider_id, file_path, target_path)
            if response.status_code != 200:
                console.print(f"[red]failed to upload {file_path}[/red]")
                stats["error"] += 1
                continue
            console.print(f"[green]uploaded {file_path}[/green]")
            stats["success"] += 1

    # logging
    console.print(f"[green]uploaded spider {name}[/green]")
    console.print(f"[cyan]success: {stats['success']}[/cyan]")
    console.print(f"[cyan]failed: {stats['error']}[/cyan]")


def is_ignored(file_path: str, exclude_path_patterns: list = None) -> bool:
    exclude_path_patterns = exclude_path_patterns or []
    ignore_patterns = exclude_path_patterns + CLI_DEFAULT_UPLOAD_IGNORE_PATTERNS
    for pat in ignore_patterns:
        if re.search(pat, file_path) is not None:
            return True
    return False
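
A hedged example of invoking the upload helper (not part of the commit; the directory, name, and exclude patterns are placeholders):

```python
# Sketch only: directory, spider name, and exclude patterns are placeholders.
from crawlab.actions.upload import upload_dir

upload_dir(
    dir_path="./my_spider",
    name="my_spider",
    description="Example spider",
    exclude_path=[r"\.venv", r"__pycache__"],
)
```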
14 changes: 14 additions & 0 deletions crawlab/auth_token.py
@@ -0,0 +1,14 @@
import os

from grpc_interceptor_headers.header_manipulator_client_interceptor import header_adder_interceptor


def _get_auth_token_env() -> str:
    return os.getenv('CRAWLAB_GRPC_AUTH_KEY')


def get_auth_token_interceptor():
    header_name = 'authorization'
    header_content = _get_auth_token_env()
    header_interceptor = header_adder_interceptor(header_name, header_content)
    return header_interceptor
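
A hedged sketch (not in the commit) of attaching this client-side interceptor to a gRPC channel with the standard grpc.intercept_channel API; the server address is a placeholder:

```python
# Sketch only: the gRPC server address below is a placeholder.
import grpc

from crawlab.auth_token import get_auth_token_interceptor

channel = grpc.insecure_channel("localhost:9666")
channel = grpc.intercept_channel(channel, get_auth_token_interceptor())
# `channel` now adds the `authorization` header (from CRAWLAB_GRPC_AUTH_KEY)
# to every outgoing RPC and can be passed to a generated stub.
```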