server.py
import os
import time
from subprocess import Popen, PIPE
from flask import Flask, render_template, request
from multiprocessing.pool import ThreadPool

app = Flask(__name__)
app.config["SECRET_KEY"] = "secret"

BASE_DIR = os.path.dirname(os.path.abspath(__file__))
LLAMA_DIR = os.path.join(BASE_DIR, "llama.cpp")

# Base command for the llama.cpp "main" binary: quantized 7B model, 8 threads,
# 128 tokens of output. A fresh process is launched per prompt because
# Popen.communicate() closes the pipes and can only be called once, so a single
# pre-warmed process cannot serve repeated requests.
LLAMA_CMD = [
    os.path.join(LLAMA_DIR, "main"),
    "-m", os.path.join(LLAMA_DIR, "models/7B/ggml-model-q4_0.bin"),
    "-t", "8", "-n", "128",
]

# Thread pool for running llama.cpp invocations.
pool = ThreadPool(processes=10)

# Cache responses per prompt for 10 seconds.
response_cache = {}
CACHE_TIMEOUT = 10


@app.route('/', methods=['GET', 'POST'])
def home():
    if request.method == 'POST':
        prompt = request.form.get("prompt", "")
        return render_template("index.html", processed_text=llama(prompt))
    return render_template("index.html")


def run_llama(prompt):
    """Run a single llama.cpp inference for the prompt and return raw stdout."""
    p = Popen(LLAMA_CMD + ["-p", prompt], stdin=PIPE, stdout=PIPE, stderr=PIPE)
    stdout, _stderr = p.communicate()
    return stdout


def llama(prompt):
    # Return the cached response if it is still fresh.
    if prompt in response_cache:
        if time.time() - response_cache[prompt]['timestamp'] < CACHE_TIMEOUT:
            return response_cache[prompt]['response']
        del response_cache[prompt]

    # Run the inference on the thread pool; get() blocks until it completes.
    async_result = pool.apply_async(run_llama, (prompt,))
    stdout = async_result.get()

    # Cache the response, joining output lines with "|" for display.
    response_cache[prompt] = {
        'response': "|".join(stdout.decode("utf-8").splitlines()),
        'timestamp': time.time(),
    }
    return response_cache[prompt]['response']


if __name__ == '__main__':
    app.run()
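
# Example usage (a sketch, assuming templates/index.html exists and renders a
# form whose text field is named "prompt", and that the llama.cpp "main"
# binary and quantized model are present at the paths configured above):
#
#   $ python server.py
#   $ curl -X POST -d "prompt=Tell me about llamas" http://127.0.0.1:5000/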