Merge pull request #3116 from oobabooga/dev
v1.1
oobabooga authored Jul 12, 2023
2 parents ad07839 + 2463d7c commit 6447b2e
Showing 43 changed files with 1,551 additions and 876 deletions.
5 changes: 3 additions & 2 deletions api-examples/api-example-chat-stream.py
@@ -23,7 +23,8 @@ async def run(user_input, history):
'history': history,
'mode': 'instruct', # Valid options: 'chat', 'chat-instruct', 'instruct'
'character': 'Example',
'instruction_template': 'Vicuna-v1.1',
'instruction_template': 'Vicuna-v1.1', # Will get autodetected if unset
# 'context_instruct': '', # Optional
'your_name': 'You',

'regenerate': False,
@@ -34,7 +35,7 @@ async def run(user_input, history):

# Generation params. If 'preset' is set to different than 'None', the values
# in presets/preset-name.yaml are used instead of the individual numbers.
'preset': 'None',
'preset': 'None',
'do_sample': True,
'temperature': 0.7,
'top_p': 0.1,
3 changes: 2 additions & 1 deletion api-examples/api-example-chat.py
@@ -17,7 +17,8 @@ def run(user_input, history):
'history': history,
'mode': 'instruct', # Valid options: 'chat', 'chat-instruct', 'instruct'
'character': 'Example',
'instruction_template': 'Vicuna-v1.1',
'instruction_template': 'Vicuna-v1.1', # Will get autodetected if unset
# 'context_instruct': '', # Optional
'your_name': 'You',

'regenerate': False,
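For context, both chat API examples now treat `instruction_template` as optional: leaving it out (or passing `None`) lets the server pick a template, and `context_instruct` can optionally override the template's default context. A minimal sketch of such a request against the blocking chat endpoint, assuming the default port and endpoint used by these example scripts:

```python
import requests

HOST = 'localhost:5000'  # default blocking-API port

# 'instruction_template' is omitted so the server autodetects it, and
# 'context_instruct' is left out to keep the template's own default context.
request = {
    'user_input': 'Say hello.',
    'history': {'internal': [], 'visible': []},
    'mode': 'instruct',
    'max_new_tokens': 64,
}

response = requests.post(f'http://{HOST}/api/v1/chat', json=request)
if response.status_code == 200:
    history = response.json()['results'][0]['history']
    print(history['visible'][-1][1])  # latest reply
```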
60 changes: 30 additions & 30 deletions api-examples/api-example-model.py
@@ -4,8 +4,9 @@

HOST = '0.0.0.0:5000'

def generate(prompt, tokens = 200):
request = { 'prompt': prompt, 'max_new_tokens': tokens }

def generate(prompt, tokens=200):
request = {'prompt': prompt, 'max_new_tokens': tokens}
response = requests.post(f'http://{HOST}/api/v1/generate', json=request)

if response.status_code == 200:
@@ -23,7 +24,7 @@ def print_basic_model_info(response):
print("Model: ", response['result']['model_name'])
print("Lora(s): ", response['result']['lora_names'])
for setting in basic_settings:
print(setting, "=", response['result']['shared.settings'][setting])
print(setting, "=", response['result']['shared.settings'][setting])


# model info
@@ -54,7 +55,7 @@ def guess_groupsize(model_name):
'action': 'load',
'model_name': model,
'args': {
'gptq_for_llama': False, # Use AutoGPTQ by default, set to True for gptq-for-llama
'loader': 'AutoGPTQ',

'bf16': False,
'load_in_8bit': False,
@@ -74,18 +75,18 @@ def guess_groupsize(model_name):
'rwkv_strategy': None,
'rwkv_cuda_on': False,

# b&b 4-bit
#'load_in_4bit': False,
#'compute_dtype': 'float16',
#'quant_type': 'nf4',
#'use_double_quant': False,

#"cpu": false,
#"auto_devices": false,
#"gpu_memory": null,
#"cpu_memory": null,
#"disk": false,
#"disk_cache_dir": "cache",
# b&b 4-bit
# 'load_in_4bit': False,
# 'compute_dtype': 'float16',
# 'quant_type': 'nf4',
# 'use_double_quant': False,

# "cpu": false,
# "auto_devices": false,
# "gpu_memory": null,
# "cpu_memory": null,
# "disk": false,
# "disk_cache_dir": "cache",
},
}

@@ -104,26 +105,25 @@ def guess_groupsize(model_name):
req['args']['load_in_8bit'] = True
elif '-hf' in model or 'fp16' in model:
if '7b' in model:
req['args']['bf16'] = True # for 24GB
req['args']['bf16'] = True # for 24GB
elif '13b' in model:
req['args']['load_in_8bit'] = True # for 24GB
req['args']['load_in_8bit'] = True # for 24GB
elif 'ggml' in model:
#req['args']['threads'] = 16
# req['args']['threads'] = 16
if '7b' in model:
req['args']['n_gpu_layers'] = 100
elif '13b' in model:
req['args']['n_gpu_layers'] = 100
elif '30b' in model or '33b' in model:
req['args']['n_gpu_layers'] = 59 # 24GB
req['args']['n_gpu_layers'] = 59 # 24GB
elif '65b' in model:
req['args']['n_gpu_layers'] = 42 # 24GB
req['args']['n_gpu_layers'] = 42 # 24GB
elif 'rwkv' in model:
req['args']['rwkv_cuda_on'] = True
if '14b' in model:
req['args']['rwkv_strategy'] = 'cuda f16i8' # 24GB
req['args']['rwkv_strategy'] = 'cuda f16i8' # 24GB
else:
req['args']['rwkv_strategy'] = 'cuda f16' # 24GB

req['args']['rwkv_strategy'] = 'cuda f16' # 24GB

return model_api(req)

@@ -134,25 +134,25 @@ def guess_groupsize(model_name):
resp = complex_model_load(model)

if 'error' in resp:
print (f"❌ {model} FAIL Error: {resp['error']['message']}")
print(f"❌ {model} FAIL Error: {resp['error']['message']}")
continue
else:
print_basic_model_info(resp)

ans = generate("0,1,1,2,3,5,8,13,", tokens=2)

if '21' in ans:
print (f"✅ {model} PASS ({ans})")
print(f"✅ {model} PASS ({ans})")
else:
print (f"❌ {model} FAIL ({ans})")
print(f"❌ {model} FAIL ({ans})")

except Exception as e:
print (f"❌ {model} FAIL Exception: {repr(e)}")
print(f"❌ {model} FAIL Exception: {repr(e)}")


# 0,1,1,2,3,5,8,13, is the fibonacci sequence, the next number is 21.
# Some results below.
""" $ ./model-api-example.py
""" $ ./model-api-example.py
Model: 4bit_gpt4-x-alpaca-13b-native-4bit-128g-cuda
Lora(s): []
truncation_length = 2048
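The substantive change in this file is that the backend is now selected by name via the new `loader` argument instead of the old `gptq_for_llama` boolean. A minimal load request sketched under that change (the endpoint matches the script's `model_api` helper; the model folder name is only illustrative):

```python
import requests

HOST = '0.0.0.0:5000'


def model_api(request):
    # Same wrapper the example script uses for the blocking model endpoint.
    response = requests.post(f'http://{HOST}/api/v1/model', json=request)
    return response.json()


req = {
    'action': 'load',
    'model_name': 'my-vicuna-13B-GPTQ',  # hypothetical model folder name
    'args': {
        'loader': 'AutoGPTQ',  # replaces the old 'gptq_for_llama' flag
        'wbits': 4,
        'groupsize': 128,
    },
}

print(model_api(req))
```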
2 changes: 1 addition & 1 deletion api-examples/api-example-stream.py
@@ -23,7 +23,7 @@ async def run(context):

# Generation params. If 'preset' is set to different than 'None', the values
# in presets/preset-name.yaml are used instead of the individual numbers.
'preset': 'None',
'preset': 'None',
'do_sample': True,
'temperature': 0.7,
'top_p': 0.1,
2 changes: 1 addition & 1 deletion api-examples/api-example.py
@@ -15,7 +15,7 @@ def run(prompt):

# Generation params. If 'preset' is set to different than 'None', the values
# in presets/preset-name.yaml are used instead of the individual numbers.
'preset': 'None',
'preset': 'None',
'do_sample': True,
'temperature': 0.7,
'top_p': 0.1,
10 changes: 5 additions & 5 deletions docker/docker-compose.yml
@@ -5,13 +5,13 @@ services:
context: .
args:
# specify which cuda version your card supports: https://developer.nvidia.com/cuda-gpus
TORCH_CUDA_ARCH_LIST: ${TORCH_CUDA_ARCH_LIST}
WEBUI_VERSION: ${WEBUI_VERSION}
TORCH_CUDA_ARCH_LIST: ${TORCH_CUDA_ARCH_LIST:-7.5}
WEBUI_VERSION: ${WEBUI_VERSION:-HEAD}
env_file: .env
ports:
- "${HOST_PORT}:${CONTAINER_PORT}"
- "${HOST_API_PORT}:${CONTAINER_API_PORT}"
- "${HOST_API_STREAM_PORT}:${CONTAINER_API_STREAM_PORT}"
- "${HOST_PORT:-7860}:${CONTAINER_PORT:-7860}"
- "${HOST_API_PORT:-5000}:${CONTAINER_API_PORT:-5000}"
- "${HOST_API_STREAM_PORT:-5005}:${CONTAINER_API_STREAM_PORT:-5005}"
stdin_open: true
tty: true
volumes:
6 changes: 4 additions & 2 deletions download-model.py
@@ -23,13 +23,15 @@


class ModelDownloader:
def __init__(self, max_retries = 5):
def __init__(self, max_retries=5):
self.s = requests.Session()
if max_retries:
self.s.mount('https://cdn-lfs.huggingface.co', HTTPAdapter(max_retries=max_retries))
self.s.mount('https://huggingface.co', HTTPAdapter(max_retries=max_retries))
if os.getenv('HF_USER') is not None and os.getenv('HF_PASS') is not None:
self.s.auth = (os.getenv('HF_USER'), os.getenv('HF_PASS'))
if os.getenv('HF_TOKEN') is not None:
self.s.headers = {'authorization': f'Bearer {os.getenv("HF_TOKEN")}'}

def sanitize_model_and_branch_names(self, model, branch):
if model[-1] == '/':
@@ -77,7 +79,7 @@ def get_download_links_from_huggingface(self, model, branch, text_only=False):
is_safetensors = re.match(".*\.safetensors", fname)
is_pt = re.match(".*\.pt", fname)
is_ggml = re.match(".*ggml.*\.bin", fname)
is_tokenizer = re.match("(tokenizer|ice).*\.model", fname)
is_tokenizer = re.match("(tokenizer|ice|spiece).*\.model", fname)
is_text = re.match(".*\.(txt|json|py|md)", fname) or is_tokenizer
if any((is_pytorch, is_safetensors, is_pt, is_ggml, is_tokenizer, is_text)):
if 'lfs' in dict[i]:
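Besides the new `spiece.model` tokenizer pattern, the downloader now accepts an `HF_TOKEN` environment variable and sends it as a Bearer header, in addition to the existing `HF_USER`/`HF_PASS` basic auth. A standalone sketch mirroring the new session setup:

```python
import os

import requests
from requests.adapters import HTTPAdapter

# Minimal sketch of how the downloader's session is now authenticated
# (mirrors the ModelDownloader.__init__ logic in this diff).
session = requests.Session()
session.mount('https://cdn-lfs.huggingface.co', HTTPAdapter(max_retries=5))
session.mount('https://huggingface.co', HTTPAdapter(max_retries=5))

if os.getenv('HF_USER') is not None and os.getenv('HF_PASS') is not None:
    session.auth = (os.getenv('HF_USER'), os.getenv('HF_PASS'))  # existing basic-auth path
if os.getenv('HF_TOKEN') is not None:
    session.headers = {'authorization': f'Bearer {os.getenv("HF_TOKEN")}'}  # new token path

# With HF_TOKEN exported, gated repos can then be fetched the usual way, e.g.:
#   HF_TOKEN=hf_xxx python download-model.py organization/model
print(session.headers)
```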
7 changes: 5 additions & 2 deletions extensions/api/util.py
@@ -59,7 +59,10 @@ def build_parameters(body, chat=False):

if chat:
character = body.get('character')
instruction_template = body.get('instruction_template')
instruction_template = body.get('instruction_template', shared.settings['instruction_template'])
if str(instruction_template) == "None":
instruction_template = "Vicuna-v1.1"

name1, name2, _, greeting, context, _ = load_character_memoized(character, str(body.get('your_name', shared.settings['name1'])), shared.settings['name2'], instruct=False)
name1_instruct, name2_instruct, _, _, context_instruct, turn_template = load_character_memoized(instruction_template, '', '', instruct=True)
generate_params.update({
@@ -72,7 +75,7 @@ def build_parameters(body, chat=False):
'greeting': greeting,
'name1_instruct': name1_instruct,
'name2_instruct': name2_instruct,
'context_instruct': context_instruct,
'context_instruct': body.get('context_instruct', context_instruct),
'turn_template': turn_template,
'chat-instruct_command': str(body.get('chat-instruct_command', shared.settings['chat-instruct_command'])),
'history': body.get('history', {'internal': [], 'visible': []})
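The net effect of this change is a three-step fallback for the chat API: an `instruction_template` given in the request body wins, otherwise the server-wide `shared.settings['instruction_template']` is used, and if that is also `None` the code falls back to `Vicuna-v1.1`; likewise, `context_instruct` from the body now overrides the template's own context. A standalone sketch of the resolution order (with a stand-in for `shared.settings`):

```python
# Stand-in for shared.settings; in the web UI this comes from the settings file.
shared_settings = {'instruction_template': None}


def resolve_instruction_template(body):
    # 1) explicit value in the request body
    # 2) the server-wide default from settings
    # 3) the hard-coded 'Vicuna-v1.1' fallback
    template = body.get('instruction_template', shared_settings['instruction_template'])
    if str(template) == 'None':
        template = 'Vicuna-v1.1'
    return template


print(resolve_instruction_template({}))                                  # Vicuna-v1.1
print(resolve_instruction_template({'instruction_template': 'Alpaca'}))  # Alpaca
```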
22 changes: 19 additions & 3 deletions extensions/elevenlabs_tts/script.py
@@ -6,17 +6,20 @@

from modules import chat, shared
from modules.utils import gradio
from modules.logging_colors import logger

params = {
'activate': True,
'api_key': None,
'selected_voice': 'None',
'autoplay': False,
'show_text': True,
'model': 'eleven_monolingual_v1',
}

voices = None
wav_idx = 0
LANG_MODELS = ['eleven_monolingual_v1', 'eleven_multilingual_v1']


def update_api_key(key):
@@ -108,7 +111,7 @@ def output_modifier(string):
output_file = Path(f'extensions/elevenlabs_tts/outputs/{wav_idx:06d}.mp3'.format(wav_idx))
print(f'Outputting audio to {str(output_file)}')
try:
audio = elevenlabs.generate(text=string, voice=params['selected_voice'], model="eleven_monolingual_v1")
audio = elevenlabs.generate(text=string, voice=params['selected_voice'], model=params['model'])
elevenlabs.save(audio, str(output_file))

autoplay = 'autoplay' if params['autoplay'] else ''
@@ -132,7 +135,12 @@ def ui():
global voices
if not voices:
voices = refresh_voices()
params['selected_voice'] = voices[0]
selected = params['selected_voice']
if selected == 'None':
params['selected_voice'] = voices[0]
elif selected not in voices:
logger.error(f'Selected voice {selected} not available, switching to {voices[0]}')
params['selected_voice'] = voices[0]

# Gradio elements
with gr.Row():
@@ -145,7 +153,14 @@ def ui():
refresh = gr.Button(value='Refresh')

with gr.Row():
api_key = gr.Textbox(placeholder="Enter your API key.", label='API Key')
if params['api_key']:
api_key = gr.Textbox(value=params['api_key'], label='API Key')
update_api_key(params['api_key'])
else:
api_key = gr.Textbox(placeholder="Enter your API key.", label='API Key')

with gr.Row():
model = gr.Dropdown(value=params['model'], choices=LANG_MODELS, label='Language model')

with gr.Row():
convert = gr.Button('Permanently replace audios with the message texts')
@@ -175,6 +190,7 @@ def ui():
activate.change(lambda x: params.update({'activate': x}), activate, None)
voice.change(lambda x: params.update({'selected_voice': x}), voice, None)
api_key.change(update_api_key, api_key, None)
model.change(lambda x: params.update({'model': x}), model, None)
# connect.click(check_valid_api, [], connection_status)
refresh.click(refresh_voices_dd, [], voice)
# Event functions to update the parameters in the backend
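The TTS extension now exposes the ElevenLabs model as a configurable `model` parameter (selectable from the new `Language model` dropdown) instead of hard-coding `eleven_monolingual_v1`, and a key stored in `params['api_key']` pre-fills the API Key box. A rough sketch of the resulting generation call, assuming the elevenlabs 0.2.x client this extension targets (the key and voice values are placeholders):

```python
import elevenlabs  # assumption: the 0.2.x client used by the extension

params = {
    'api_key': 'YOUR_API_KEY',          # placeholder
    'selected_voice': 'Rachel',         # placeholder; any name returned by elevenlabs.voices()
    'model': 'eleven_multilingual_v1',  # new: must be one of LANG_MODELS
}

# The extension registers the key through its update_api_key() helper;
# set_api_key() is assumed to be the equivalent call in the 0.2.x client.
elevenlabs.set_api_key(params['api_key'])

audio = elevenlabs.generate(text='Hello there.',
                            voice=params['selected_voice'],
                            model=params['model'])
elevenlabs.save(audio, 'output.mp3')
```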
2 changes: 2 additions & 0 deletions extensions/multimodal/README.md
@@ -38,6 +38,8 @@ As of now, the following multimodal pipelines are supported:
|[LLaVA 7B](https://github.com/haotian-liu/LLaVA)|`llava-7b`|[LLaVA 7B](https://huggingface.co/wojtab/llava-7b-v0-4bit-128g)|GPTQ 4-bit quant, old CUDA|built-in|
|[MiniGPT-4 7B](https://github.com/Vision-CAIR/MiniGPT-4)|`minigpt4-7b`|[Vicuna v0 7B](https://huggingface.co/TheBloke/vicuna-7B-GPTQ-4bit-128g)|GPTQ 4-bit quant, new format|[Wojtab/minigpt-4-pipeline](https://github.com/Wojtab/minigpt-4-pipeline)|
|[MiniGPT-4 13B](https://github.com/Vision-CAIR/MiniGPT-4)|`minigpt4-13b`|[Vicuna v0 13B](https://huggingface.co/anon8231489123/vicuna-13b-GPTQ-4bit-128g)|GPTQ 4-bit quant, old CUDA|[Wojtab/minigpt-4-pipeline](https://github.com/Wojtab/minigpt-4-pipeline)|
|[InstructBLIP 7B](https://github.com/salesforce/LAVIS/tree/main/projects/instructblip)|`instructblip-7b`|[Vicuna v1.1 7B](https://huggingface.co/TheBloke/vicuna-7B-1.1-GPTQ-4bit-128g)|GPTQ 4-bit quant|[kjerk/instructblip-pipeline](https://github.com/kjerk/instructblip-pipeline)|
|[InstructBLIP 13B](https://github.com/salesforce/LAVIS/tree/main/projects/instructblip)|`instructblip-13b`|[Vicuna v1.1 13B](https://huggingface.co/TheBloke/vicuna-13B-1.1-GPTQ-4bit-128g)|GPTQ 4-bit quant|[kjerk/instructblip-pipeline](https://github.com/kjerk/instructblip-pipeline)|

Some pipelines could support different LLMs but do note that while it might work, it isn't a supported configuration.

6 changes: 3 additions & 3 deletions extensions/ngrok/script.py
@@ -1,8 +1,8 @@
# Adds ngrok ingress, to use add `--extension ngrok` to the command line options
#
# Parameters can be customized in settings.json of webui, e.g.:
# Parameters can be customized in settings.json of webui, e.g.:
# {"ngrok": {"basic_auth":"user:password"} }
# or
# or
# {"ngrok": {"oauth_provider":"google", "oauth_allow_emails":["[email protected]"]} }
#
# See this example for full list of options: https://github.com/ngrok/ngrok-py/blob/main/examples/ngrok-connect-full.py
@@ -22,6 +22,7 @@
'session_metadata': 'text-generation-webui',
}


def ui():
settings = shared.settings.get("ngrok")
if settings:
@@ -33,4 +34,3 @@ def ui():
logging.info(f"Ingress established at: {tunnel.url()}")
except ModuleNotFoundError:
logging.error("===> ngrok library not found, please run `pip install -r extensions/ngrok/requirements.txt`")

3 changes: 1 addition & 2 deletions extensions/openai/README.md
@@ -218,12 +218,11 @@ but there are some exceptions.
| ✅❌ | langchain | https://github.com/hwchase17/langchain | OPENAI_API_BASE=http://127.0.0.1:5001/v1 even with a good 30B-4bit model the result is poor so far. It assumes zero shot python/json coding. Some model tailored prompt formatting improves results greatly. |
| ✅❌ | Auto-GPT | https://github.com/Significant-Gravitas/Auto-GPT | OPENAI_API_BASE=http://127.0.0.1:5001/v1 Same issues as langchain. Also assumes a 4k+ context |
| ✅❌ | babyagi | https://github.com/yoheinakajima/babyagi | OPENAI_API_BASE=http://127.0.0.1:5001/v1 |
| ❌ | guidance | https://github.com/microsoft/guidance | logit_bias and logprobs not yet supported |

## Future plans
* better error handling
* model changing, esp. something for swapping loras or embedding models
* consider switching to FastAPI + starlette for SSE (openai SSE seems non-standard)
* do something about rate limiting or locking requests for completions, most systems will only be able handle a single request at a time before OOM

## Bugs? Feedback? Comments? Pull requests?
