33 changes: 33 additions & 0 deletions .github/workflows/ci.yml
@@ -54,6 +54,38 @@ jobs:
name: coverage-unit-data-${{ matrix.os }}-${{ matrix.python-version }}
path: .coverage.*
include-hidden-files: true
async-llm-patterns:
strategy:
fail-fast: false
matrix:
os: [ubuntu-latest, macos-latest, windows-latest]
python-version: [3.9, 3.11, 3.12]
name: async-llm-patterns (python${{ matrix.python-version }}.${{ matrix.os }})
runs-on: ${{ matrix.os }}
timeout-minutes: 15
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0 # fetch all tags and branches
- name: Install the latest version of uv
uses: astral-sh/setup-uv@v6
- name: Set up Python
uses: actions/setup-python@v5
with:
cache: pip
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: pipx install pdm && pipx install nox
- name: Run OpenLLM async tests
run: nox --session openllm-async-${{ matrix.python-version }}
- name: Disambiguate coverage filename
run: mv .coverage ".coverage.async-patterns.${{ matrix.os }}.${{ matrix.python-version }}"
- name: Upload coverage data
uses: actions/upload-artifact@v4
with:
name: coverage-async-patterns-data-${{ matrix.os }}-${{ matrix.python-version }}
path: .coverage.*
include-hidden-files: true
integrations:
name: framework-integration-tests
runs-on: ubuntu-latest
@@ -203,6 +235,7 @@ jobs:
- e2e-monitoring
- unit
- integrations
- async-llm-patterns
if: github.event_name == 'pull_request'
steps:
- uses: actions/checkout@v4
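The `nox --session openllm-async-${{ matrix.python-version }}` step works because the sessions added to noxfile.py (shown below) are parametrized over interpreters: nox appends the Python version to the session name. A simplified sketch of that coupling, assuming PYTHON_VERSIONS mirrors the CI matrix above (the constant itself is not visible in this diff); this is an illustration, not the full session from the diff:

import nox

# Assumption: PYTHON_VERSIONS in noxfile.py mirrors the CI matrix.
PYTHON_VERSIONS = ["3.9", "3.11", "3.12"]

@nox.session(name="openllm-async", python=PYTHON_VERSIONS)
def run_openllm_async_tests(session: nox.Session) -> None:
    # nox registers one session per interpreter: openllm-async-3.9,
    # openllm-async-3.11, openllm-async-3.12, which is what the CI step
    # `nox --session openllm-async-<python-version>` selects.
    session.run("pytest", "tests/unit/test_openllm_run.py")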
49 changes: 49 additions & 0 deletions noxfile.py
@@ -89,6 +89,55 @@ def run_e2e_monitoring_test(session: nox.Session):
session.run(*TEST_ARGS, test_folder)



@nox.session(name="openllm-async", python=PYTHON_VERSIONS)
def run_openllm_async_tests(session: nox.Session):
"""Run async tests for openllm.run functionality with ≥90% coverage."""
session.run("pdm", "sync", "-G", "testing", external=True)
session.install("pytest-asyncio>=0.21.1", "httpx")
session.run(
"pytest",
"tests/unit/test_openllm_run.py",
"--cov=bentoml.openllm",
"--cov-fail-under=90",
"--cov-report=term-missing",
"-v",
"--timeout=60", # 1 minute timeout as per design requirement
)


@nox.session(name="async-llm-patterns", python=PYTHON_VERSIONS)
def run_async_llm_pattern_tests(session: nox.Session):
"""Run lightweight async LLM pattern tests for CI compatibility."""
session.run("pdm", "sync", "-G", "testing", external=True)
session.install("pytest-asyncio>=0.21.1", "httpx")
session.run(
"pytest",
"tests/unit/test_async_llm_patterns.py",
"--cov=bentoml._internal.runner",
"--cov=bentoml._internal.service",
"--cov-fail-under=80",
"--cov-report=term-missing",
"-v",
"--timeout=60", # 1 minute timeout for lightweight tests
)


@nox.session(name="transformers-async-local", python=PYTHON_VERSIONS)
def run_transformers_async_tests_local(session: nox.Session):
"""Run full transformers async tests locally (not in CI due to resource constraints)."""
session.run("pdm", "sync", "-G", "testing", external=True)
session.install("pytest-asyncio>=0.21.1", "transformers", "torch", "tokenizers")
session.run(
"pytest",
"tests/integration/frameworks/test_transformers_async.py",
"--cov=bentoml._internal.frameworks.transformers",
"--cov-report=term-missing",
"-v",
"--timeout=300", # 5 minutes timeout for model loading
)


@nox.session(name="coverage")
def coverage_report(session: nox.Session):
session.run("pdm", "sync", "-G", "testing", external=True)
23 changes: 23 additions & 0 deletions src/bentoml/openllm/__init__.py
@@ -0,0 +1,23 @@
"""
OpenLLM integration module for BentoML.

This module provides high-level functions for running Large Language Models
with BentoML's serving infrastructure.
"""
# Note: This module provides the functionality exercised by the async unit tests specified in the design document

from .inference import batch_run_async
from .inference import clear_cache
from .inference import get_cache_stats
from .inference import run
from .inference import run_async
from .runner import LLMRunner

__all__ = [
"LLMRunner",
"run",
"run_async",
"batch_run_async",
"clear_cache",
"get_cache_stats",
]
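The exports above form the whole public surface added by this PR. A minimal usage sketch against it, assuming `mock=True` swaps in a lightweight stand-in model so no real weights are loaded (the mock behaviour lives in the `runner` module, which is not shown in this diff):

from bentoml.openllm import get_cache_stats, run

# One-shot synchronous inference against the mock backend.
result = run("Hello, world!", model="mock-gpt2", max_length=32, mock=True)
print(result)             # generation result dictionary
print(get_cache_stats())  # e.g. {"cached_runners": 1, "cache_keys": [...]}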
163 changes: 163 additions & 0 deletions src/bentoml/openllm/inference.py
@@ -0,0 +1,163 @@
"""
High-level inference functions for OpenLLM integration.

This module provides the main `run` functions exercised by the async unit tests
described in the original design document.
"""

import asyncio
from typing import Any
from typing import Dict
from typing import List

from .runner import LLMRunner

# Global runner cache for model reuse
_runner_cache: Dict[str, LLMRunner] = {}


def get_or_create_runner(model_name: str, mock: bool = False, **kwargs) -> LLMRunner:
"""
Get or create a runner for the specified model.

Args:
model_name: Name of the model
mock: Whether to use mock model for testing
**kwargs: Additional runner configuration

Returns:
LLMRunner instance for the model
"""
cache_key = f"{model_name}_{mock}_{hash(frozenset(kwargs.items()))}"

if cache_key not in _runner_cache:
runner = LLMRunner(model_name, mock=mock, **kwargs)
_runner_cache[cache_key] = runner

return _runner_cache[cache_key]


def run(
prompt: str,
model: str = "mock-gpt2",
max_length: int = 100,
temperature: float = 1.0,
mock: bool = False,
**kwargs,
) -> Dict[str, Any]:
"""
Synchronously run LLM inference on a prompt.

This is the main synchronous entry point exercised by the unit tests.

Args:
prompt: Input text prompt
model: Model name to use for inference
max_length: Maximum tokens to generate
temperature: Sampling temperature
mock: Whether to use mock model for testing
**kwargs: Additional generation parameters

Returns:
Generation result dictionary
"""
runner = get_or_create_runner(model, mock=mock, **kwargs)

# Run async method in sync context
try:
asyncio.get_running_loop()
except RuntimeError:
# No event loop running, create new one
return asyncio.run(
runner.generate_async(
prompt, max_length=max_length, temperature=temperature, **kwargs
)
)
else:
# Event loop already running, run in executor
import concurrent.futures

with concurrent.futures.ThreadPoolExecutor() as executor:
future = executor.submit(
asyncio.run,
runner.generate_async(
prompt, max_length=max_length, temperature=temperature, **kwargs
),
)
return future.result()
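The executor branch above exists because `run()` may be invoked from code that is already inside an event loop (for example an async request handler), where a nested `asyncio.run()` would raise `RuntimeError`. A hedged sketch of both call paths, using the mock backend; note that the second path still blocks the running loop while it waits, so `run_async()` is the better choice inside async code:

import asyncio
from bentoml.openllm import run

def from_sync_code():
    # No loop is running here, so run() takes the plain asyncio.run() branch.
    return run("ping", mock=True)

async def from_async_code():
    # A loop is already running here, so run() falls back to the
    # ThreadPoolExecutor branch instead of calling asyncio.run() directly.
    return run("ping", mock=True)

print(from_sync_code())
print(asyncio.run(from_async_code()))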


async def run_async(
prompt: str,
model: str = "mock-gpt2",
max_length: int = 100,
temperature: float = 1.0,
mock: bool = False,
**kwargs,
) -> Dict[str, Any]:
"""
Asynchronously run LLM inference on a prompt.

This is the main asynchronous entry point exercised by the unit tests.

Args:
prompt: Input text prompt
model: Model name to use for inference
max_length: Maximum tokens to generate
temperature: Sampling temperature
mock: Whether to use mock model for testing
**kwargs: Additional generation parameters

Returns:
Generation result dictionary
"""
runner = get_or_create_runner(model, mock=mock, **kwargs)

return await runner.generate_async(
prompt, max_length=max_length, temperature=temperature, **kwargs
)


async def batch_run_async(
prompts: List[str],
model: str = "mock-gpt2",
max_length: int = 100,
temperature: float = 1.0,
mock: bool = False,
**kwargs,
) -> List[Dict[str, Any]]:
"""
Asynchronously run LLM inference on multiple prompts.

This function processes multiple prompts concurrently.

Args:
prompts: List of input prompts
model: Model name to use for inference
max_length: Maximum tokens to generate per prompt
temperature: Sampling temperature
mock: Whether to use mock model for testing
**kwargs: Additional generation parameters

Returns:
List of generation results
"""
runner = get_or_create_runner(model, mock=mock, **kwargs)

return await runner.batch_generate_async(
prompts, max_length=max_length, temperature=temperature, **kwargs
)
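`batch_run_async` hands the whole batch to the runner in one awaitable; the same fan-out can also be written by hand with `asyncio.gather` over individual `run_async` calls. A short sketch, again under the mock-backend assumption:

import asyncio
from bentoml.openllm import batch_run_async, run_async

async def main():
    prompts = ["alpha", "beta", "gamma"]
    # Built-in batching: one awaitable for the whole batch.
    batched = await batch_run_async(prompts, mock=True)
    # Hand-rolled equivalent: fan out individual run_async() calls concurrently.
    gathered = await asyncio.gather(*(run_async(p, mock=True) for p in prompts))
    assert len(batched) == len(gathered) == len(prompts)

asyncio.run(main())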


def clear_cache():
"""Clear the runner cache."""
global _runner_cache
_runner_cache.clear()


def get_cache_stats() -> Dict[str, Any]:
"""Get cache statistics."""
return {
"cached_runners": len(_runner_cache),
"cache_keys": list(_runner_cache.keys()),
}
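The test modules the nox sessions point at (`tests/unit/test_openllm_run.py` and `tests/unit/test_async_llm_patterns.py`) are not part of this diff. For orientation, a hypothetical pytest-asyncio test against the public API added here might look like the sketch below; the test names and assertions are illustrative, not the actual suite:

import pytest

from bentoml.openllm import batch_run_async, clear_cache, get_cache_stats, run_async


@pytest.mark.asyncio
async def test_run_async_with_mock_backend():
    # mock=True is assumed to avoid loading real model weights.
    result = await run_async("hello", model="mock-gpt2", max_length=16, mock=True)
    assert isinstance(result, dict)


@pytest.mark.asyncio
async def test_batch_run_async_returns_one_result_per_prompt():
    prompts = ["a", "b", "c"]
    results = await batch_run_async(prompts, mock=True)
    assert len(results) == len(prompts)


def test_cache_round_trip():
    clear_cache()
    assert get_cache_stats()["cached_runners"] == 0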