pavanjava · srimon12 · May 30, 2026 · May 31, 2026 · May 31, 2026
diff --git a/README.md b/README.md
@@ -5,9 +5,9 @@
 [![PyPI version](https://img.shields.io/pypi/v/qql-cli?color=blue&label=PyPI)](https://pypi.org/project/qql-cli/)
 [![Python 3.12+](https://img.shields.io/pypi/pyversions/qql-cli)](https://pypi.org/project/qql-cli/)
 [![MIT License](https://img.shields.io/badge/license-MIT-green)](LICENSE)
-[![Tests](https://img.shields.io/badge/tests-549%20passing-brightgreen)](tests/)
+[![Tests](https://img.shields.io/badge/tests-633%20passing-brightgreen)](tests/)
 
-Write `INSERT`, `SELECT`, `SEARCH`, `SCROLL`, `RECOMMEND`, `UPDATE`, `DELETE`, and `CREATE COLLECTION` statements instead of Python SDK calls. Supports hybrid dense+sparse vector search, grouped search (GROUP BY), cross-encoder reranking, quantization (scalar, turbo, binary, product), SQL-style `WHERE` filters, script execution, and collection dump/restore.
+Write `INSERT`, `SELECT`, `SEARCH`, `SCROLL`, `RECOMMEND`, `UPDATE`, `DELETE`, and `CREATE COLLECTION` statements instead of Python SDK calls. Supports hybrid dense+sparse vector search, grouped search (GROUP BY), cross-encoder reranking, quantization (scalar, turbo, binary, product), SQL-style `WHERE` filters, script execution, collection dump/restore, async execution, and programmatic gRPC transport.
 
 ```
 qql> INSERT INTO COLLECTION notes VALUES {'text': 'Qdrant is a vector database', 'author': 'alice', 'year': 2024}
@@ -50,7 +50,7 @@ Your query string
 
 When you run `INSERT`, the `text` field is automatically converted into a dense vector using [Fastembed](https://github.com/qdrant/fastembed). In **hybrid mode** (`USING HYBRID`), a sparse BM25 vector is also generated alongside the dense vector, and searches use Qdrant's Reciprocal Rank Fusion (RRF) by default to merge the results of both retrieval methods. You can switch hybrid search to DBSF with `FUSION 'dbsf'`.
 
-QQL also exposes a **programmatic API** for use inside Python applications — no CLI required:
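+QQL also exposes a **programmatic API** for use inside Python applications — no CLI required. Use `Connection` for sync code and `AsyncConnection` for async apps (the `async with` example below must run inside a coroutine, for example one started with `asyncio.run()`):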
 
 ```python
 from qql import Connection
@@ -62,6 +62,14 @@ with Connection("http://localhost:6333") as conn:
         print(hit["score"], hit["payload"])
 ```
 
+```python
+from qql import AsyncConnection
+
+async with AsyncConnection("http://localhost:6333", prefer_grpc=True) as conn:
+    result = await conn.run_query("SHOW COLLECTIONS")
+    print(result.data)
+```
+
 ---
 
 ## Installation
@@ -104,7 +112,7 @@ Full documentation lives in the [`docs/`](docs/) folder and at **[pavanjava.gith
 | [WHERE Filters](docs/filters.md) | Full SQL-style filter operators |
 | [Collections & Quantization](docs/collections.md) | SHOW, CREATE, DROP, QUANTIZE (scalar/turbo/binary/product), CREATE INDEX, UPDATE VECTOR, UPDATE PAYLOAD |
 | [Scripts: EXECUTE / DUMP](docs/scripts.md) | Script files, collection backup/restore |
-| [Programmatic Usage](docs/programmatic.md) | Use QQL as a Python library via `Connection` or `run_query()` |
+| [Programmatic Usage](docs/programmatic.md) | Sync/async Python APIs and gRPC |
 | [Reference: Models / Config / Errors](docs/reference.md) | Embedding models, config file, error reference |
 
 ---
@@ -188,7 +196,7 @@ Tests do not require a running Qdrant instance — the Qdrant client is mocked.
 pytest tests/ -v
 ```
 
-Expected: **549 tests passing**.
+Expected: **633 tests passing**.
 
 ---
 

diff --git a/benchmark/qql_transport_benchmark.py b/benchmark/qql_transport_benchmark.py
@@ -0,0 +1,165 @@
+from __future__ import annotations
+
+import asyncio
+import os
+import time
+from dataclasses import dataclass
+
+from qql import AsyncConnection, Connection
+
+
+def _env_int(name: str, default: int, *, minimum: int) -> int:
+    """Read an integer setting from the environment and range-check it.
+
+    Args:
+        name: Environment variable to read.
+        default: Value used when the variable is unset.
+        minimum: Smallest accepted value.
+
+    Returns:
+        The parsed integer.
+
+    Raises:
+        ValueError: If the value is not an integer or is below ``minimum``.
+    """
+    raw = os.getenv(name)
+    value = default if raw is None else int(raw)
+    if value < minimum:
+        raise ValueError(f"{name} must be >= {minimum}, got {value}")
+    return value
+
+
+# Qdrant endpoint and optional API key; an empty secret is treated as unset.
+URL = os.getenv("QQL_BENCH_URL", "http://localhost:6333")
+SECRET = os.getenv("QQL_BENCH_SECRET") or None
+# ITERATIONS is used as a divisor when averages are computed, and CONCURRENCY
+# sizes an asyncio.Semaphore (a value of 0 would block every task forever),
+# so both must be at least 1. WARMUP may be 0 to skip the warm-up phase.
+ITERATIONS = _env_int("QQL_BENCH_ITERATIONS", 50, minimum=1)
+WARMUP = _env_int("QQL_BENCH_WARMUP", 5, minimum=0)
+CONCURRENCY = _env_int("QQL_BENCH_CONCURRENCY", 10, minimum=1)
+
+# Fixed corpus inserted into each benchmark collection before timing starts.
+DOCS = [
+    "Qdrant stores vectors and payloads for semantic search workloads",
+    "FastEmbed generates local dense embeddings for short text queries",
+    "gRPC can reduce transport overhead for high volume vector database calls",
+    "REST remains simple and reliable for operational database workflows",
+    "Async clients help Python applications keep network requests in flight",
+    "Local embedding models can dominate latency before the database is called",
+    "Hybrid search combines dense vectors with sparse lexical retrieval",
+    "Payload filters narrow search results by metadata fields and values",
+    "Collection topology determines named dense and sparse vector behavior",
+    "Benchmark results should separate setup cost from measured query latency",
+]
+# Text used by every measured SEARCH statement.
+QUERY_TEXT = "local embedding vector database transport benchmark"
+
+
+@dataclass(frozen=True)
+class Result:
+    """Immutable timing summary for one benchmark mode (one table row)."""
+
+    # Label shown in the "Mode" column.
+    mode: str
+    # Wall-clock time of the measured phase, in milliseconds.
+    total_ms: float
+    # Mean milliseconds per operation.
+    avg_ms: float
+    # Operations completed per second.
+    qps: float
+
+
+def search_query(collection: str) -> str:
+    """Return the QQL SEARCH statement measured by the benchmark.
+
+    The statement looks up ``QUERY_TEXT`` in ``collection`` and is capped at
+    five results.
+    """
+    statement = f"SEARCH {collection} SIMILAR TO '{QUERY_TEXT}' LIMIT 5"
+    return statement
+
+
+def insert_query(collection: str, idx: int, text: str) -> str:
+    """Return a QQL INSERT statement for one benchmark document.
+
+    Args:
+        collection: Target collection name.
+        idx: Value written to the payload's ``id`` field.
+        text: Document text written to the payload's ``text`` field.
+
+    Returns:
+        The INSERT statement as a single string.
+    """
+    # NOTE(review): ``text`` is interpolated verbatim between single quotes;
+    # presumably a value containing a quote would break the literal. Confirm
+    # the escaping rules before passing arbitrary text.
+    target = f"INSERT INTO COLLECTION {collection} "
+    payload = f"VALUES {{'id': {idx}, 'text': '{text}', 'kind': 'bench'}}"
+    return target + payload
+
+
+def ignore_drop(conn: Connection, collection: str) -> None:
+    """Drop ``collection`` through ``conn``, ignoring any failure.
+
+    Args:
+        conn: Open sync connection used to run the DROP statement.
+        collection: Name of the collection to drop.
+    """
+    statement = f"DROP COLLECTION {collection}"
+    try:
+        conn.run_query(statement)
+    except Exception:
+        # Deliberate best-effort: setup must continue even when the drop
+        # fails (presumably because the collection does not exist yet).
+        return
+
+
+async def ignore_drop_async(conn: AsyncConnection, collection: str) -> None:
+    """Drop ``collection`` through the async ``conn``, ignoring any failure.
+
+    Args:
+        conn: Open async connection used to run the DROP statement.
+        collection: Name of the collection to drop.
+    """
+    statement = f"DROP COLLECTION {collection}"
+    try:
+        await conn.run_query(statement)
+    except Exception:
+        # Deliberate best-effort: setup must continue even when the drop
+        # fails (presumably because the collection does not exist yet).
+        return
+
+
+def setup_sync(collection: str, *, prefer_grpc: bool) -> None:
+    """Reset ``collection`` and insert every entry of ``DOCS`` synchronously.
+
+    Args:
+        collection: Collection to drop (best-effort) and repopulate.
+        prefer_grpc: Forwarded to ``Connection`` to select the transport.
+    """
+    # Ids are 1-based and follow the order of DOCS.
+    statements = [
+        insert_query(collection, doc_id, doc)
+        for doc_id, doc in enumerate(DOCS, start=1)
+    ]
+    with Connection(URL, secret=SECRET, prefer_grpc=prefer_grpc) as client:
+        ignore_drop(client, collection)
+        for statement in statements:
+            client.run_query(statement)
+
+
+async def setup_async(collection: str, *, prefer_grpc: bool) -> None:
+    """Reset ``collection`` and insert every entry of ``DOCS`` asynchronously.
+
+    Args:
+        collection: Collection to drop (best-effort) and repopulate.
+        prefer_grpc: Forwarded to ``AsyncConnection`` to select the transport.
+    """
+    # Ids are 1-based and follow the order of DOCS.
+    statements = [
+        insert_query(collection, doc_id, doc)
+        for doc_id, doc in enumerate(DOCS, start=1)
+    ]
+    async with AsyncConnection(URL, secret=SECRET, prefer_grpc=prefer_grpc) as client:
+        await ignore_drop_async(client, collection)
+        # Inserts are awaited one at a time, in DOCS order.
+        for statement in statements:
+            await client.run_query(statement)
+
+
+def bench_sync(mode: str, collection: str, *, prefer_grpc: bool) -> Result:
+    """Time sequential SEARCH queries over a sync ``Connection``.
+
+    Args:
+        mode: Label stored in the returned ``Result``.
+        collection: Collection that is reset, seeded, and then searched.
+        prefer_grpc: Forwarded to ``Connection`` to select the transport.
+
+    Returns:
+        A ``Result`` with the total elapsed milliseconds, the mean
+        milliseconds per query, and the queries per second.
+    """
+    setup_sync(collection, prefer_grpc=prefer_grpc)
+    statement = search_query(collection)
+    with Connection(URL, secret=SECRET, prefer_grpc=prefer_grpc) as client:
+        # Untimed warm-up queries run before the clock starts.
+        for _ in range(WARMUP):
+            client.run_query(statement)
+        began = time.perf_counter()
+        for _ in range(ITERATIONS):
+            client.run_query(statement)
+        elapsed_ms = (time.perf_counter() - began) * 1000
+    mean_ms = elapsed_ms / ITERATIONS
+    per_second = ITERATIONS / (elapsed_ms / 1000)
+    return Result(mode, elapsed_ms, mean_ms, per_second)
+
+
+async def bench_async(mode: str, collection: str, *, prefer_grpc: bool) -> Result:
+    """Time sequential (one-at-a-time) SEARCH queries over ``AsyncConnection``.
+
+    Args:
+        mode: Label stored in the returned ``Result``.
+        collection: Collection that is reset, seeded, and then searched.
+        prefer_grpc: Forwarded to ``AsyncConnection`` to select the transport.
+
+    Returns:
+        A ``Result`` with the total elapsed milliseconds, the mean
+        milliseconds per query, and the queries per second.
+    """
+    await setup_async(collection, prefer_grpc=prefer_grpc)
+    statement = search_query(collection)
+    async with AsyncConnection(URL, secret=SECRET, prefer_grpc=prefer_grpc) as client:
+        # Untimed warm-up queries run before the clock starts.
+        for _ in range(WARMUP):
+            await client.run_query(statement)
+        began = time.perf_counter()
+        # Each query is awaited before the next starts, so only one request
+        # is in flight at a time.
+        for _ in range(ITERATIONS):
+            await client.run_query(statement)
+        elapsed_ms = (time.perf_counter() - began) * 1000
+    mean_ms = elapsed_ms / ITERATIONS
+    per_second = ITERATIONS / (elapsed_ms / 1000)
+    return Result(mode, elapsed_ms, mean_ms, per_second)
+
+
+async def bench_async_concurrent(
+    mode: str,
+    collection: str,
+    *,
+    prefer_grpc: bool,
+) -> Result:
+    """Time SEARCH queries issued concurrently over one ``AsyncConnection``.
+
+    At most ``CONCURRENCY`` queries are in flight at once; ``ITERATIONS``
+    queries are issued in total.
+
+    Args:
+        mode: Label stored in the returned ``Result``.
+        collection: Collection that is reset, seeded, and then searched.
+        prefer_grpc: Forwarded to ``AsyncConnection`` to select the transport.
+
+    Returns:
+        A ``Result`` with the total elapsed milliseconds, the elapsed time
+        divided by the number of queries, and the queries per second.
+    """
+    # Seed the collection here, as bench_sync/bench_async do, so this
+    # benchmark does not depend on an earlier run having populated it.
+    await setup_async(collection, prefer_grpc=prefer_grpc)
+    query = search_query(collection)
+    async with AsyncConnection(URL, secret=SECRET, prefer_grpc=prefer_grpc) as conn:
+        # Untimed warm-up queries run before the clock starts.
+        for _ in range(WARMUP):
+            await conn.run_query(query)
+        # The semaphore caps how many queries run at the same time.
+        sem = asyncio.Semaphore(CONCURRENCY)
+
+        async def one() -> None:
+            async with sem:
+                await conn.run_query(query)
+
+        start = time.perf_counter()
+        await asyncio.gather(*(one() for _ in range(ITERATIONS)))
+        total_ms = (time.perf_counter() - start) * 1000
+    return Result(mode, total_ms, total_ms / ITERATIONS, ITERATIONS / (total_ms / 1000))
+
+
+def print_table(title: str, results: list[Result]) -> None:
+    """Print ``results`` to stdout as a Markdown table under a heading.
+
+    Args:
+        title: Text of the level-3 heading printed above the table.
+        results: Rows to print, one per benchmark mode, in the given order.
+    """
+    preamble = (
+        f"\n### {title}\n",
+        "| Mode | Total ms | Avg ms/op | Ops/sec |",
+        "|---|---:|---:|---:|",
+    )
+    for heading_line in preamble:
+        print(heading_line)
+    # Numbers use thousands separators and two decimal places.
+    for res in results:
+        row = f"| {res.mode} | {res.total_ms:,.2f} | {res.avg_ms:,.2f} | {res.qps:,.2f} |"
+        print(row)
+
+
+async def main() -> None:
+    """Run every benchmark mode and print two Markdown result tables.
+
+    The single-flight runs execute in the order sync REST, async REST,
+    sync gRPC, async gRPC. The concurrent runs then use the same collection
+    names as the async single-flight runs.
+    """
+    banner = (
+        "QQL SEARCH benchmark",
+        f"URL: {URL}",
+        f"Workload: {ITERATIONS} measured SEARCH queries, {WARMUP} warmup",
+        "Embedding: local FastEmbed dense model, warmed before timing",
+    )
+    for line in banner:
+        print(line)
+
+    # The sync benchmarks are called directly from this coroutine, so they
+    # block the event loop while they run; no other task is scheduled then.
+    sync_rest = bench_sync("sync REST", "qql_bench_sync_rest", prefer_grpc=False)
+    async_rest = await bench_async(
+        "async REST", "qql_bench_async_rest", prefer_grpc=False
+    )
+    sync_grpc = bench_sync("sync gRPC", "qql_bench_sync_grpc", prefer_grpc=True)
+    async_grpc = await bench_async(
+        "async gRPC", "qql_bench_async_grpc", prefer_grpc=True
+    )
+    print_table(
+        "Single-flight latency",
+        [sync_rest, async_rest, sync_grpc, async_grpc],
+    )
+
+    rest_fanout = await bench_async_concurrent(
+        f"async REST x{CONCURRENCY}",
+        "qql_bench_async_rest",
+        prefer_grpc=False,
+    )
+    grpc_fanout = await bench_async_concurrent(
+        f"async gRPC x{CONCURRENCY}",
+        "qql_bench_async_grpc",
+        prefer_grpc=True,
+    )
+    print_table("Async concurrent throughput", [rest_fanout, grpc_fanout])
+
+
+# Script entry point: run the whole benchmark on a fresh event loop.
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/docs/programmatic.md b/docs/programmatic.md
@@ -11,10 +11,10 @@ QQL can be used as a Python library without the CLI.
 
 ## `Connection` — Primary API
 
-`Connection` is the recommended way to use QQL programmatically. It opens a
-single connection to Qdrant once and reuses it for every `run_query()` call —
-more efficient than the legacy `run_query()` function, which creates a new
-client on every invocation.
+`Connection` is the recommended sync API for using QQL programmatically. It
+opens a single connection to Qdrant once and reuses it for every `run_query()`
+call — more efficient than the legacy `run_query()` function, which creates a
+new client on every invocation.
 
 ### Basic usage
 
@@ -180,9 +180,55 @@ with Connection("http://localhost:6333") as conn:
 | `secret` | `str \| None` | `None` | API key; `None` for unauthenticated |
 | `default_model` | `str \| None` | `None` → `sentence-transformers/all-MiniLM-L6-v2` | Dense embedding model used when no `USING MODEL` clause is given |
 | `verify` | `bool \| str` | `True` | TLS verification setting; use `False` to skip verification or a CA bundle path for internal/self-signed certificates |
+| `prefer_grpc` | `bool` | `False` | Use Qdrant's gRPC transport when available |
+| `grpc_port` | `int` | `6334` | Qdrant gRPC port used when `prefer_grpc=True` |
 | `default_dense_vector_name` | `str` | `"dense"` | Dense vector name used when QQL creates a collection and no explicit `USING VECTOR` name is given |
 | `default_sparse_vector_name` | `str` | `"sparse"` | Sparse vector name used when QQL creates a hybrid collection and no explicit sparse vector name is given |
 
+### gRPC transport
+
+Pass `prefer_grpc=True` when your Qdrant deployment exposes the gRPC port:
+
+```python
+from qql import Connection
+
+with Connection("http://localhost:6333", prefer_grpc=True) as conn:
+    result = conn.run_query("SHOW COLLECTIONS")
+    print(result.data)
+```
+
+Use `grpc_port` when the deployment uses a non-default gRPC port.
+
+---
+
+## `AsyncConnection` — Async API
+
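+`AsyncConnection` mirrors the sync `Connection` API and uses Qdrant's
+`AsyncQdrantClient` under the hood. Because the examples below use
+`async with` and `await`, run them inside a coroutine (for example, one
+started with `asyncio.run()`).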
+
+```python
+from qql import AsyncConnection
+
+async with AsyncConnection("http://localhost:6333") as conn:
+    await conn.run_query(
+        "INSERT INTO COLLECTION notes VALUES {'text': 'hello async world'}"
+    )
+    result = await conn.run_query("SEARCH notes SIMILAR TO 'async' LIMIT 5")
+    for hit in result.data:
+        print(hit["score"], hit["payload"])
+```
+
+Async connections support the same `url`, `secret`, `default_model`, `verify`,
+`prefer_grpc`, and `grpc_port` parameters:
+
+```python
+async with AsyncConnection(
+    "http://localhost:6333",
+    prefer_grpc=True,
+) as conn:
+    result = await conn.run_query("SHOW COLLECTIONS")
+```
+
 ### Power-user: `executor` property
 
 For low-level access to the pipeline, use `conn.executor` directly:
@@ -225,8 +271,8 @@ for hit in result.data:
     print(hit["score"], hit["payload"])
 ```
 
-`run_query()` accepts the same `url`, `secret`, `default_model`, and `verify`
-parameters as `Connection.__init__()`.
+`run_query()` accepts the same `url`, `secret`, `default_model`, `verify`,
+`prefer_grpc`, and `grpc_port` parameters as `Connection.__init__()`.
 
 ---
 

diff --git a/src/qql/__init__.py b/src/qql/__init__.py
@@ -12,6 +12,8 @@
     QQLConfig,
     load_config,
 )
+from .async_connection import AsyncConnection
+from .async_executor import AsyncExecutor
 from .connection import Connection
 from .exceptions import QQLError, QQLRuntimeError, QQLSyntaxError
 from .executor import ExecutionResult, Executor
@@ -20,6 +22,8 @@
 
 __all__ = [
     "__version__",
+    "AsyncConnection",
+    "AsyncExecutor",
     "Connection",
     "DEFAULT_DENSE_VECTOR_NAME",
     "DEFAULT_MODEL",
@@ -43,6 +47,8 @@ def run_query(
     secret: str | None = None,
     default_model: str | None = None,
     verify: bool | str = True,
+    prefer_grpc: bool = False,
+    grpc_port: int = 6334,
 ) -> ExecutionResult:
     """One-shot convenience function kept for backward compatibility.
 
@@ -61,5 +67,7 @@ def run_query(
         secret=secret,
         default_model=default_model,
         verify=verify,
+        prefer_grpc=prefer_grpc,
+        grpc_port=grpc_port,
     ) as conn:
         return conn.run_query(query)