1
1
#!/usr/bin/env python3
2
2
# github.com/deadbits/vector-embedding-api
3
3
# client.py
4
+ import os
5
+ import sys
4
6
import json
5
7
import argparse
6
8
import requests
9
+ from pydantic import BaseModel
10
+ from typing import List , Optional
11
+ from datetime import datetime
7
12
8
13
9
- def send_request (text , model_type = 'local' ):
14
def timestamp_str():
    """Return the current UTC time as an ISO 8601 string.

    The string carries no UTC offset suffix (e.g. ``2023-08-01T12:34:56.789012``),
    which is the same format the log lines in this script have always used.

    Returns:
        str: the current UTC wall-clock time in ISO 8601 format.
    """
    # Local import: the module only imports the `datetime` class at file level.
    from datetime import timezone

    # datetime.utcnow() is deprecated since Python 3.12. Take an aware "now"
    # in UTC and drop the tzinfo so the output format stays unchanged.
    return datetime.now(timezone.utc).replace(tzinfo=None).isoformat()
16
+
17
+
18
class Embedding(BaseModel):
    """Record pairing one input text with the embedding computed for it.

    Callers in this file build instances from keyword arguments and
    serialise them with ``.dict()`` before writing the JSON output.
    """

    # The text that was submitted for embedding.
    text: str = ''
    # The embedding vector returned for `text`.
    embedding: List[float] = []
    # Details about the request; callers in this file store the
    # 'status', 'elapsed' and 'model' values from the response here.
    metadata: Optional[dict] = {}
22
+
23
+
24
+ def send_request (text_batch , model_type = 'local' ):
10
25
url = 'http://127.0.0.1:5000/submit'
11
26
headers = {'Content-Type' : 'application/json' }
12
27
payload = {
13
- 'text' : text ,
28
+ 'text' : text_batch ,
14
29
'model' : model_type
15
30
}
16
31
@@ -20,14 +35,33 @@ def send_request(text, model_type='local'):
20
35
headers = headers ,
21
36
data = json .dumps (payload )
22
37
)
23
-
38
+
24
39
response .raise_for_status ()
25
40
return response .json ()
26
41
except requests .RequestException as err :
27
42
print (f'[error] exception sending http request: { err } ' )
28
43
return None
29
44
30
45
46
def process_batch(text_batch, model_type, embeddings_list, chunk_num, total_chunks):
    """Embed one batch of texts and append the results to ``embeddings_list``.

    Args:
        text_batch: list of strings sent to the API in a single request.
        model_type: model selector forwarded to ``send_request``.
        embeddings_list: output list, extended in place with one dict
            (an ``Embedding`` serialised via ``.dict()``) per embedded text.
        chunk_num: number of this batch, used only in the status message.
        total_chunks: total number of batches, used only in the status message.

    Returns:
        None. Nothing is appended when the request yields no result or the
        first response entry reports ``status == 'error'``.
    """
    print(f'[status] {timestamp_str()} - Processing chunk {chunk_num} of {total_chunks}')

    response = send_request(text_batch, model_type)
    if not response:
        # send_request returned None (or an empty payload); nothing to record.
        return

    # Only the first entry of the response is consulted for a batch.
    head = response[0]
    if head['status'] == 'error':
        print(f'[error] {timestamp_str()} - Received error: {head["message"]}')
        return

    vectors = head['embeddings']
    print(f'[status] {timestamp_str()} - Received embeddings: {len(vectors)}')

    # Pair each input text with its vector; zip stops at the shorter sequence.
    for text, vector in zip(text_batch, vectors):
        record = Embedding(
            text=text,
            embedding=vector,
            metadata={
                'status': head['status'],
                'elapsed': head['elapsed'],
                'model': head['model'],
            },
        )
        embeddings_list.append(record.dict())
63
+
64
+
31
65
if __name__ == '__main__' :
32
66
parser = argparse .ArgumentParser ()
33
67
@@ -40,7 +74,7 @@ def send_request(text, model_type='local'):
40
74
group .add_argument (
41
75
'-f' , '--file' ,
42
76
type = argparse .FileType ('r' ),
43
- help = 'text file to embed'
77
+ help = 'text file to embed (one text per line) '
44
78
)
45
79
46
80
parser .add_argument (
@@ -50,14 +84,65 @@ def send_request(text, model_type='local'):
50
84
default = 'local'
51
85
)
52
86
87
+ parser .add_argument (
88
+ '-o' , '--output' ,
89
+ help = 'output file' ,
90
+ default = 'embeddings.json'
91
+ )
92
+
53
93
# Script body: parse the CLI arguments, embed the input, and write the
# collected embeddings to the output file as a JSON list.
args = parser.parse_args()
model_type = args.model
output_file = args.output
embeddings_list = []

# Refuse to overwrite the results of an earlier run.
if os.path.exists(output_file):
    print(f'[error] {timestamp_str()} - Output file already exists')
    sys.exit(1)

if args.file:
    # argparse.FileType('r') has already opened the file (and would have
    # aborted on a missing path), so no separate existence check is needed.
    # Such a check would also wrongly reject "-" (stdin).
    print(f'[status] {timestamp_str()} - Processing file: {args.file.name}')

    chunk_size = 100

    # Read the input once: a second pass via seek(0) cannot work on a pipe,
    # and the `with` block makes sure the handle is closed.
    with args.file as handle:
        lines = [line.strip() for line in handle]

    # Ceiling division: a partial final batch still counts as a chunk.
    total_chunks = (len(lines) + chunk_size - 1) // chunk_size
    print(f'[info] {timestamp_str()} - Total chunks: {total_chunks}')

    batch_starts = range(0, len(lines), chunk_size)
    for chunk_num, start in enumerate(batch_starts, start=1):
        process_batch(
            lines[start:start + chunk_size],
            model_type,
            embeddings_list,
            chunk_num,
            total_chunks,
        )
else:
    print(f'[status] {timestamp_str()} - Processing text input')
    text = args.text
    result = send_request([text], model_type)
    if result:
        # NOTE(review): this branch reads res['embedding'] for each response
        # entry, while process_batch reads result[0]['embeddings'] - confirm
        # which key the server actually returns for a single text.
        for res in result:
            metadata = {
                'status': res['status'],
                'elapsed': res['elapsed'],
                'model': res['model'],
            }
            embedding = Embedding(text=text, embedding=res['embedding'], metadata=metadata)
            embeddings_list.append(embedding.dict())

print(f'[status] {timestamp_str()} - Saving embeddings to {output_file}')
try:
    with open(output_file, 'w') as f:
        json.dump(embeddings_list, f)
except Exception as err:
    # Top-level boundary: report the failure and exit non-zero.
    print(f'[error] {timestamp_str()} - exception saving embeddings: {err}')
    sys.exit(1)
else:
    # Report the path actually written, not a hard-coded default name.
    print(f'[status] {timestamp_str()} - Embeddings saved to {output_file}')
59
148
60
- model_type = args .model
61
- result = send_request (text , model_type )
62
- if result is not None :
63
- print (result )
0 commit comments