# main.py

import traceback
from flask import Flask, request, jsonify, abort, Response
from flask_cors import CORS
import traceback
import litellm
from util import handle_error
from litellm import completion 
import os, dotenv, time 
import json
dotenv.load_dotenv()

# TODO: set your keys in .env or here:
# os.environ["OPENAI_API_KEY"] = "" # set your openai key here
# os.environ["ANTHROPIC_API_KEY"] = "" # set your anthropic key here
# os.environ["TOGETHER_AI_API_KEY"] = "" # set your together ai key here
# see supported models / keys here: https://litellm.readthedocs.io/en/latest/supported/
######### ENVIRONMENT VARIABLES ##########
# Module-wide debug switch read by print_verbose(); when truthy, debug output is printed.
verbose = True

# litellm.caching_with_models = True # CACHING: with caching_with_models, cache keys are messages + model - to learn more: https://docs.litellm.ai/docs/caching/
######### PROMPT LOGGING ##########
# Set your promptlayer key in .env, or replace the "" fallback below - https://promptlayer.com/
# setdefault() keeps a key that is already present in the environment (for
# example one loaded from .env by dotenv.load_dotenv() earlier in this file)
# instead of overwriting it with an empty string.
os.environ.setdefault("PROMPTLAYER_API_KEY", "")

# set callbacks
# Register "promptlayer" as the litellm callback for successful calls.
litellm.success_callback = ["promptlayer"]
############ HELPER FUNCTIONS ###################################

def print_verbose(print_statement):
    """Print ``print_statement`` only when the module-level ``verbose`` flag is truthy."""
    if not verbose:
        return
    print(print_statement)

# Flask application object; the @app.route decorators in this file register handlers on it.
app = Flask(__name__)
# Apply flask_cors to the app with its default options (no per-route configuration is given here).
CORS(app)

@app.route('/')
def index():
    """Root route: reply with a fixed acknowledgement string and HTTP 200."""
    body, status = 'received!', 200
    return body, status

def data_generator(response):
    """Lazily turn each item of ``response`` into a server-sent-event frame.

    Every item is JSON-encoded and wrapped as ``data: <json>`` followed by a
    blank line; items are consumed one at a time as the caller iterates.
    """
    yield from ("data: " + json.dumps(piece) + "\n\n" for piece in response)

@app.route('/chat/completions', methods=["POST"])
def api_completion():
    """Handle POST /chat/completions by forwarding a code prompt to litellm.

    The JSON body must be an object containing ``prompt``. The prompt is
    wrapped in a fixed system/user message pair, the model is forced to
    Together AI's CodeLlama, and every remaining key of the body is passed
    through to ``completion`` as a keyword argument.

    Returns:
        * a ``text/event-stream`` response built from ``data_generator`` when
          the (normalised) ``stream`` flag is truthy;
        * otherwise the object returned by ``completion``, handed to Flask
          unchanged;
        * a JSON error with HTTP 400 when the body is not a JSON object;
        * whatever ``util.handle_error`` returns when validation or the
          completion call raises.
    """
    data = request.json
    if not isinstance(data, dict):
        # The parsed body may be None, a list, a string, ...; everything below
        # needs a dict, so reject it here instead of crashing on data.get().
        return jsonify({"error": "request body must be a JSON object"}), 400

    # Clients may send the flag as a string ("True", "true", "False", ...).
    # Normalise it to a real boolean so completion() and the branch below
    # agree on whether this request is streaming.
    stream_flag = data.get('stream')
    if isinstance(stream_flag, str):
        data['stream'] = stream_flag.strip().lower() == "true"

    try:
        if "prompt" not in data:
            raise ValueError("data needs to have prompt")
        # Always use Together AI's CodeLlama model; any client-supplied model is overridden.
        # https://api.together.xyz/playground/chat?model=togethercomputer%2FCodeLlama-34b-Instruct
        data["model"] = "togethercomputer/CodeLlama-34b-Instruct"
        # COMPLETION CALL
        system_prompt = "Only respond to questions about code. Say 'I don't know' to anything outside of that."
        data["messages"] = [
            {"role": "system", "content": system_prompt},
            # pop() removes "prompt" so it is not forwarded as a completion() kwarg.
            {"role": "user", "content": data.pop("prompt")},
        ]
        print_verbose(f"data: {data}")
        response = completion(**data)
    except Exception:
        # Log the traceback, then delegate building the error response to util.handle_error.
        print_verbose(f"Got Error api_completion(): {traceback.format_exc()}")
        return handle_error(data=data)

    if data.get('stream'):
        # Stream the chunks to the client as server-sent events.
        return Response(data_generator(response), mimetype='text/event-stream')
    return response

@app.route('/get_models', methods=["POST"])
def get_models():
    """Handle POST /get_models by returning ``litellm.model_list`` as JSON.

    Returns:
        The model list as a JSON response with HTTP 200, or
        ``{"error": <message>}`` with HTTP 500 when reading or serialising
        the list raises.
    """
    try:
        # Serialise inside the try block so encoding problems are reported by
        # the handler below instead of surfacing after the view has returned.
        return jsonify(litellm.model_list), 200
    except Exception as e:
        traceback.print_exc()
        # Report the failure with an error status rather than HTTP 200.
        return jsonify({"error": str(e)}), 500

if __name__ == "__main__":
    # Script entry point: serve the Flask app with waitress on all interfaces, port 4000.
    from waitress import serve

    serve(
        app,
        host="0.0.0.0",
        port=4000,
        threads=500,
    )