|
3 | 3 | import multiprocessing as mp
|
4 | 4 | import time
|
5 | 5 | from concurrent.futures import ThreadPoolExecutor
|
| 6 | +from copy import deepcopy |
6 | 7 |
|
7 | 8 | from vectordb_bench import config
|
8 | 9 | from vectordb_bench.backend.clients import api
|
| 10 | +from vectordb_bench.backend.clients.pgvector.pgvector import PgVector |
9 | 11 | from vectordb_bench.backend.dataset import DataSetIterator
|
10 | 12 | from vectordb_bench.backend.utils import time_it
|
11 | 13 |
|
@@ -33,17 +35,27 @@ def __init__(
|
33 | 35 | self.executing_futures = []
|
34 | 36 | self.sig_idx = 0
|
35 | 37 |
|
36 |
| - def send_insert_task(self, db: api.VectorDB, emb: list[list[float]], metadata: list[str], retry_idx: int = 0): |
37 |
| - _, error = db.insert_embeddings(emb, metadata) |
38 |
| - if error is not None: |
39 |
| - log.warning(f"Insert Failed, try_idx={retry_idx}, Exception: {error}") |
40 |
| - retry_idx += 1 |
41 |
| - if retry_idx <= config.MAX_INSERT_RETRY: |
42 |
| - time.sleep(retry_idx) |
43 |
| - self.send_insert_task(db, emb=emb, metadata=metadata, retry_idx=retry_idx) |
44 |
| - else: |
45 |
| - msg = f"Insert failed and retried more than {config.MAX_INSERT_RETRY} times" |
46 |
| - raise RuntimeError(msg) from None |
| 38 | + def send_insert_task(self, db: api.VectorDB, emb: list[list[float]], metadata: list[str]): |
| 39 | + def _insert_embeddings(db: api.VectorDB, emb: list[list[float]], metadata: list[str], retry_idx: int = 0): |
| 40 | + _, error = db.insert_embeddings(emb, metadata) |
| 41 | + if error is not None: |
| 42 | + log.warning(f"Insert Failed, try_idx={retry_idx}, Exception: {error}") |
| 43 | + retry_idx += 1 |
| 44 | + if retry_idx <= config.MAX_INSERT_RETRY: |
| 45 | + time.sleep(retry_idx) |
| 46 | + _insert_embeddings(db, emb=emb, metadata=metadata, retry_idx=retry_idx) |
| 47 | + else: |
| 48 | + msg = f"Insert failed and retried more than {config.MAX_INSERT_RETRY} times" |
| 49 | + raise RuntimeError(msg) from None |
| 50 | + |
| 51 | + if isinstance(db, PgVector): |
| 52 | + # pgvector is not thread-safe for concurrent insert, |
| 53 | + # so we need to copy the db object, make sure each thread has its own connection |
| 54 | + db_copy = deepcopy(db) |
| 55 | + with db_copy.init(): |
| 56 | + _insert_embeddings(db_copy, emb, metadata, retry_idx=0) |
| 57 | + else: |
| 58 | + _insert_embeddings(db, emb, metadata, retry_idx=0) |
47 | 59 |
|
48 | 60 | @time_it
|
49 | 61 | def run_with_rate(self, q: mp.Queue):
|
|
0 commit comments