Merge pull request #3116 from oobabooga/dev
v1.1
oobabooga authored Jul 12, 2023
2 parents ad07839 + 2463d7c commit 6447b2e
Showing 43 changed files with 1,551 additions and 876 deletions.
5 changes: 3 additions & 2 deletions api-examples/api-example-chat-stream.py
@@ -23,7 +23,8 @@ async def run(user_input, history):
'history': history,
'mode': 'instruct', # Valid options: 'chat', 'chat-instruct', 'instruct'
'character': 'Example',
'instruction_template': 'Vicuna-v1.1',
'instruction_template': 'Vicuna-v1.1', # Will get autodetected if unset
# 'context_instruct': '', # Optional
'your_name': 'You',

'regenerate': False,
@@ -34,7 +35,7 @@ async def run(user_input, history):

# Generation params. If 'preset' is set to different than 'None', the values
# in presets/preset-name.yaml are used instead of the individual numbers.
'preset': 'None',
'preset': 'None',
'do_sample': True,
'temperature': 0.7,
'top_p': 0.1,
3 changes: 2 additions & 1 deletion api-examples/api-example-chat.py
@@ -17,7 +17,8 @@ def run(user_input, history):
'history': history,
'mode': 'instruct', # Valid options: 'chat', 'chat-instruct', 'instruct'
'character': 'Example',
'instruction_template': 'Vicuna-v1.1',
'instruction_template': 'Vicuna-v1.1', # Will get autodetected if unset
# 'context_instruct': '', # Optional
'your_name': 'You',

'regenerate': False,
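For context, both chat API examples now treat `instruction_template` as optional: leaving it out (or passing `None`) lets the server pick a template, and `context_instruct` can optionally override the template's default context. A minimal sketch of such a request against the blocking chat endpoint, assuming the default port and endpoint used by these example scripts:

```python
import requests

HOST = 'localhost:5000'  # default blocking-API port

# 'instruction_template' is omitted so the server autodetects it, and
# 'context_instruct' is left out to keep the template's own default context.
request = {
    'user_input': 'Say hello.',
    'history': {'internal': [], 'visible': []},
    'mode': 'instruct',
    'max_new_tokens': 64,
}

response = requests.post(f'http://{HOST}/api/v1/chat', json=request)
if response.status_code == 200:
    history = response.json()['results'][0]['history']
    print(history['visible'][-1][1])  # latest reply
```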
60 changes: 30 additions & 30 deletions api-examples/api-example-model.py
@@ -4,8 +4,9 @@

HOST = '0.0.0.0:5000'

def generate(prompt, tokens = 200):
request = { 'prompt': prompt, 'max_new_tokens': tokens }

def generate(prompt, tokens=200):
request = {'prompt': prompt, 'max_new_tokens': tokens}
response = requests.post(f'http://{HOST}/api/v1/generate', json=request)

if response.status_code == 200:
@@ -23,7 +24,7 @@ def print_basic_model_info(response):
print("Model: ", response['result']['model_name'])
print("Lora(s): ", response['result']['lora_names'])
for setting in basic_settings:
print(setting, "=", response['result']['shared.settings'][setting])
print(setting, "=", response['result']['shared.settings'][setting])


# model info
@@ -54,7 +55,7 @@ def guess_groupsize(model_name):
'action': 'load',
'model_name': model,
'args': {
'gptq_for_llama': False, # Use AutoGPTQ by default, set to True for gptq-for-llama
'loader': 'AutoGPTQ',

'bf16': False,
'load_in_8bit': False,
@@ -74,18 +75,18 @@ def guess_groupsize(model_name):
'rwkv_strategy': None,
'rwkv_cuda_on': False,

# b&b 4-bit
#'load_in_4bit': False,
#'compute_dtype': 'float16',
#'quant_type': 'nf4',
#'use_double_quant': False,

#"cpu": false,
#"auto_devices": false,
#"gpu_memory": null,
#"cpu_memory": null,
#"disk": false,
#"disk_cache_dir": "cache",
# b&b 4-bit
# 'load_in_4bit': False,
# 'compute_dtype': 'float16',
# 'quant_type': 'nf4',
# 'use_double_quant': False,

# "cpu": false,
# "auto_devices": false,
# "gpu_memory": null,
# "cpu_memory": null,
# "disk": false,
# "disk_cache_dir": "cache",
},
}

@@ -104,26 +105,25 @@ def guess_groupsize(model_name):
req['args']['load_in_8bit'] = True
elif '-hf' in model or 'fp16' in model:
if '7b' in model:
req['args']['bf16'] = True # for 24GB
req['args']['bf16'] = True # for 24GB
elif '13b' in model:
req['args']['load_in_8bit'] = True # for 24GB
req['args']['load_in_8bit'] = True # for 24GB
elif 'ggml' in model:
#req['args']['threads'] = 16
# req['args']['threads'] = 16
if '7b' in model:
req['args']['n_gpu_layers'] = 100
elif '13b' in model:
req['args']['n_gpu_layers'] = 100
elif '30b' in model or '33b' in model:
req['args']['n_gpu_layers'] = 59 # 24GB
req['args']['n_gpu_layers'] = 59 # 24GB
elif '65b' in model:
req['args']['n_gpu_layers'] = 42 # 24GB
req['args']['n_gpu_layers'] = 42 # 24GB
elif 'rwkv' in model:
req['args']['rwkv_cuda_on'] = True
if '14b' in model:
req['args']['rwkv_strategy'] = 'cuda f16i8' # 24GB
req['args']['rwkv_strategy'] = 'cuda f16i8' # 24GB
else:
req['args']['rwkv_strategy'] = 'cuda f16' # 24GB

req['args']['rwkv_strategy'] = 'cuda f16' # 24GB

return model_api(req)

@@ -134,25 +134,25 @@ def guess_groupsize(model_name):
resp = complex_model_load(model)

if 'error' in resp:
print (f"❌ {model} FAIL Error: {resp['error']['message']}")
print(f"❌ {model} FAIL Error: {resp['error']['message']}")
continue
else:
print_basic_model_info(resp)

ans = generate("0,1,1,2,3,5,8,13,", tokens=2)

if '21' in ans:
print (f"✅ {model} PASS ({ans})")
print(f"✅ {model} PASS ({ans})")
else:
print (f"❌ {model} FAIL ({ans})")
print(f"❌ {model} FAIL ({ans})")

except Exception as e:
print (f"❌ {model} FAIL Exception: {repr(e)}")
print(f"❌ {model} FAIL Exception: {repr(e)}")


# 0,1,1,2,3,5,8,13, is the fibonacci sequence, the next number is 21.
# Some results below.
""" $ ./model-api-example.py
""" $ ./model-api-example.py
Model: 4bit_gpt4-x-alpaca-13b-native-4bit-128g-cuda
Lora(s): []
truncation_length = 2048
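The substantive change in this file is that the backend is now selected by name via the new `loader` argument instead of the old `gptq_for_llama` boolean. A minimal load request sketched under that change (the endpoint matches the script's `model_api` helper; the model folder name is only illustrative):

```python
import requests

HOST = '0.0.0.0:5000'


def model_api(request):
    # Same wrapper the example script uses for the blocking model endpoint.
    response = requests.post(f'http://{HOST}/api/v1/model', json=request)
    return response.json()


req = {
    'action': 'load',
    'model_name': 'my-vicuna-13B-GPTQ',  # hypothetical model folder name
    'args': {
        'loader': 'AutoGPTQ',  # replaces the old 'gptq_for_llama' flag
        'wbits': 4,
        'groupsize': 128,
    },
}

print(model_api(req))
```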
2 changes: 1 addition & 1 deletion api-examples/api-example-stream.py
@@ -23,7 +23,7 @@ async def run(context):

# Generation params. If 'preset' is set to different than 'None', the values
# in presets/preset-name.yaml are used instead of the individual numbers.
'preset': 'None',
'preset': 'None',
'do_sample': True,
'temperature': 0.7,
'top_p': 0.1,
2 changes: 1 addition & 1 deletion api-examples/api-example.py
@@ -15,7 +15,7 @@ def run(prompt):

# Generation params. If 'preset' is set to different than 'None', the values
# in presets/preset-name.yaml are used instead of the individual numbers.
'preset': 'None',
'preset': 'None',
'do_sample': True,
'temperature': 0.7,
'top_p': 0.1,
10 changes: 5 additions & 5 deletions docker/docker-compose.yml
@@ -5,13 +5,13 @@ services:
context: .
args:
# specify which cuda version your card supports: https://developer.nvidia.com/cuda-gpus
TORCH_CUDA_ARCH_LIST: ${TORCH_CUDA_ARCH_LIST}
WEBUI_VERSION: ${WEBUI_VERSION}
TORCH_CUDA_ARCH_LIST: ${TORCH_CUDA_ARCH_LIST:-7.5}
WEBUI_VERSION: ${WEBUI_VERSION:-HEAD}
env_file: .env
ports:
- "${HOST_PORT}:${CONTAINER_PORT}"
- "${HOST_API_PORT}:${CONTAINER_API_PORT}"
- "${HOST_API_STREAM_PORT}:${CONTAINER_API_STREAM_PORT}"
- "${HOST_PORT:-7860}:${CONTAINER_PORT:-7860}"
- "${HOST_API_PORT:-5000}:${CONTAINER_API_PORT:-5000}"
- "${HOST_API_STREAM_PORT:-5005}:${CONTAINER_API_STREAM_PORT:-5005}"
stdin_open: true
tty: true
volumes:
6 changes: 4 additions & 2 deletions download-model.py
@@ -23,13 +23,15 @@


class ModelDownloader:
def __init__(self, max_retries = 5):
def __init__(self, max_retries=5):
self.s = requests.Session()
if max_retries:
self.s.mount('https://cdn-lfs.huggingface.co', HTTPAdapter(max_retries=max_retries))
self.s.mount('https://huggingface.co', HTTPAdapter(max_retries=max_retries))
if os.getenv('HF_USER') is not None and os.getenv('HF_PASS') is not None:
self.s.auth = (os.getenv('HF_USER'), os.getenv('HF_PASS'))
if os.getenv('HF_TOKEN') is not None:
self.s.headers = {'authorization': f'Bearer {os.getenv("HF_TOKEN")}'}

def sanitize_model_and_branch_names(self, model, branch):
if model[-1] == '/':
@@ -77,7 +79,7 @@ def get_download_links_from_huggingface(self, model, branch, text_only=False):
is_safetensors = re.match(".*\.safetensors", fname)
is_pt = re.match(".*\.pt", fname)
is_ggml = re.match(".*ggml.*\.bin", fname)
is_tokenizer = re.match("(tokenizer|ice).*\.model", fname)
is_tokenizer = re.match("(tokenizer|ice|spiece).*\.model", fname)
is_text = re.match(".*\.(txt|json|py|md)", fname) or is_tokenizer
if any((is_pytorch, is_safetensors, is_pt, is_ggml, is_tokenizer, is_text)):
if 'lfs' in dict[i]:
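Besides the new `spiece.model` tokenizer pattern, the downloader now accepts an `HF_TOKEN` environment variable and sends it as a Bearer header, in addition to the existing `HF_USER`/`HF_PASS` basic auth. A standalone sketch mirroring the new session setup:

```python
import os

import requests
from requests.adapters import HTTPAdapter

# Minimal sketch of how the downloader's session is now authenticated
# (mirrors the ModelDownloader.__init__ logic in this diff).
session = requests.Session()
session.mount('https://cdn-lfs.huggingface.co', HTTPAdapter(max_retries=5))
session.mount('https://huggingface.co', HTTPAdapter(max_retries=5))

if os.getenv('HF_USER') is not None and os.getenv('HF_PASS') is not None:
    session.auth = (os.getenv('HF_USER'), os.getenv('HF_PASS'))  # existing basic-auth path
if os.getenv('HF_TOKEN') is not None:
    session.headers = {'authorization': f'Bearer {os.getenv("HF_TOKEN")}'}  # new token path

# With HF_TOKEN exported, gated repos can then be fetched the usual way, e.g.:
#   HF_TOKEN=hf_xxx python download-model.py organization/model
print(session.headers)
```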
7 changes: 5 additions & 2 deletions extensions/api/util.py
@@ -59,7 +59,10 @@ def build_parameters(body, chat=False):

if chat:
character = body.get('character')
instruction_template = body.get('instruction_template')
instruction_template = body.get('instruction_template', shared.settings['instruction_template'])
if str(instruction_template) == "None":
instruction_template = "Vicuna-v1.1"

name1, name2, _, greeting, context, _ = load_character_memoized(character, str(body.get('your_name', shared.settings['name1'])), shared.settings['name2'], instruct=False)
name1_instruct, name2_instruct, _, _, context_instruct, turn_template = load_character_memoized(instruction_template, '', '', instruct=True)
generate_params.update({
@@ -72,7 +75,7 @@ def build_parameters(body, chat=False):
'greeting': greeting,
'name1_instruct': name1_instruct,
'name2_instruct': name2_instruct,
'context_instruct': context_instruct,
'context_instruct': body.get('context_instruct', context_instruct),
'turn_template': turn_template,
'chat-instruct_command': str(body.get('chat-instruct_command', shared.settings['chat-instruct_command'])),
'history': body.get('history', {'internal': [], 'visible': []})
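The net effect of this change is a three-step fallback for the chat API: an `instruction_template` given in the request body wins, otherwise the server-wide `shared.settings['instruction_template']` is used, and if that is also `None` the code falls back to `Vicuna-v1.1`; likewise, `context_instruct` from the body now overrides the template's own context. A standalone sketch of the resolution order (with a stand-in for `shared.settings`):

```python
# Stand-in for shared.settings; in the web UI this comes from the settings file.
shared_settings = {'instruction_template': None}


def resolve_instruction_template(body):
    # 1) explicit value in the request body
    # 2) the server-wide default from settings
    # 3) the hard-coded 'Vicuna-v1.1' fallback
    template = body.get('instruction_template', shared_settings['instruction_template'])
    if str(template) == 'None':
        template = 'Vicuna-v1.1'
    return template


print(resolve_instruction_template({}))                                  # Vicuna-v1.1
print(resolve_instruction_template({'instruction_template': 'Alpaca'}))  # Alpaca
```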
22 changes: 19 additions & 3 deletions extensions/elevenlabs_tts/script.py
@@ -6,17 +6,20 @@

from modules import chat, shared
from modules.utils import gradio
from modules.logging_colors import logger

params = {
'activate': True,
'api_key': None,
'selected_voice': 'None',
'autoplay': False,
'show_text': True,
'model': 'eleven_monolingual_v1',
}

voices = None
wav_idx = 0
LANG_MODELS = ['eleven_monolingual_v1', 'eleven_multilingual_v1']


def update_api_key(key):
@@ -108,7 +111,7 @@ def output_modifier(string):
output_file = Path(f'extensions/elevenlabs_tts/outputs/{wav_idx:06d}.mp3'.format(wav_idx))
print(f'Outputting audio to {str(output_file)}')
try:
audio = elevenlabs.generate(text=string, voice=params['selected_voice'], model="eleven_monolingual_v1")
audio = elevenlabs.generate(text=string, voice=params['selected_voice'], model=params['model'])
elevenlabs.save(audio, str(output_file))

autoplay = 'autoplay' if params['autoplay'] else ''
@@ -132,7 +135,12 @@ def ui():
global voices
if not voices:
voices = refresh_voices()
params['selected_voice'] = voices[0]
selected = params['selected_voice']
if selected == 'None':
params['selected_voice'] = voices[0]
elif selected not in voices:
logger.error(f'Selected voice {selected} not available, switching to {voices[0]}')
params['selected_voice'] = voices[0]

# Gradio elements
with gr.Row():
@@ -145,7 +153,14 @@ def ui():
refresh = gr.Button(value='Refresh')

with gr.Row():
api_key = gr.Textbox(placeholder="Enter your API key.", label='API Key')
if params['api_key']:
api_key = gr.Textbox(value=params['api_key'], label='API Key')
update_api_key(params['api_key'])
else:
api_key = gr.Textbox(placeholder="Enter your API key.", label='API Key')

with gr.Row():
model = gr.Dropdown(value=params['model'], choices=LANG_MODELS, label='Language model')

with gr.Row():
convert = gr.Button('Permanently replace audios with the message texts')
@@ -175,6 +190,7 @@ def ui():
activate.change(lambda x: params.update({'activate': x}), activate, None)
voice.change(lambda x: params.update({'selected_voice': x}), voice, None)
api_key.change(update_api_key, api_key, None)
model.change(lambda x: params.update({'model': x}), model, None)
# connect.click(check_valid_api, [], connection_status)
refresh.click(refresh_voices_dd, [], voice)
# Event functions to update the parameters in the backend
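The TTS extension now exposes the ElevenLabs model as a configurable `model` parameter (selectable from the new `Language model` dropdown) instead of hard-coding `eleven_monolingual_v1`, and a key stored in `params['api_key']` pre-fills the API Key box. A rough sketch of the resulting generation call, assuming the elevenlabs 0.2.x client this extension targets (the key and voice values are placeholders):

```python
import elevenlabs  # assumption: the 0.2.x client used by the extension

params = {
    'api_key': 'YOUR_API_KEY',          # placeholder
    'selected_voice': 'Rachel',         # placeholder; any name returned by elevenlabs.voices()
    'model': 'eleven_multilingual_v1',  # new: must be one of LANG_MODELS
}

# The extension registers the key through its update_api_key() helper;
# set_api_key() is assumed to be the equivalent call in the 0.2.x client.
elevenlabs.set_api_key(params['api_key'])

audio = elevenlabs.generate(text='Hello there.',
                            voice=params['selected_voice'],
                            model=params['model'])
elevenlabs.save(audio, 'output.mp3')
```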
2 changes: 2 additions & 0 deletions extensions/multimodal/README.md
@@ -38,6 +38,8 @@ As of now, the following multimodal pipelines are supported:
|[LLaVA 7B](https://github.com/haotian-liu/LLaVA)|`llava-7b`|[LLaVA 7B](https://huggingface.co/wojtab/llava-7b-v0-4bit-128g)|GPTQ 4-bit quant, old CUDA|built-in|
|[MiniGPT-4 7B](https://github.com/Vision-CAIR/MiniGPT-4)|`minigpt4-7b`|[Vicuna v0 7B](https://huggingface.co/TheBloke/vicuna-7B-GPTQ-4bit-128g)|GPTQ 4-bit quant, new format|[Wojtab/minigpt-4-pipeline](https://github.com/Wojtab/minigpt-4-pipeline)|
|[MiniGPT-4 13B](https://github.com/Vision-CAIR/MiniGPT-4)|`minigpt4-13b`|[Vicuna v0 13B](https://huggingface.co/anon8231489123/vicuna-13b-GPTQ-4bit-128g)|GPTQ 4-bit quant, old CUDA|[Wojtab/minigpt-4-pipeline](https://github.com/Wojtab/minigpt-4-pipeline)|
|[InstructBLIP 7B](https://github.com/salesforce/LAVIS/tree/main/projects/instructblip)|`instructblip-7b`|[Vicuna v1.1 7B](https://huggingface.co/TheBloke/vicuna-7B-1.1-GPTQ-4bit-128g)|GPTQ 4-bit quant|[kjerk/instructblip-pipeline](https://github.com/kjerk/instructblip-pipeline)|
|[InstructBLIP 13B](https://github.com/salesforce/LAVIS/tree/main/projects/instructblip)|`instructblip-13b`|[Vicuna v1.1 13B](https://huggingface.co/TheBloke/vicuna-13B-1.1-GPTQ-4bit-128g)|GPTQ 4-bit quant|[kjerk/instructblip-pipeline](https://github.com/kjerk/instructblip-pipeline)|

Some pipelines could support different LLMs but do note that while it might work, it isn't a supported configuration.

6 changes: 3 additions & 3 deletions extensions/ngrok/script.py
@@ -1,8 +1,8 @@
# Adds ngrok ingress, to use add `--extension ngrok` to the command line options
#
# Parameters can be customized in settings.json of webui, e.g.:
# Parameters can be customized in settings.json of webui, e.g.:
# {"ngrok": {"basic_auth":"user:password"} }
# or
# or
# {"ngrok": {"oauth_provider":"google", "oauth_allow_emails":["[email protected]"]} }
#
# See this example for full list of options: https://github.com/ngrok/ngrok-py/blob/main/examples/ngrok-connect-full.py
@@ -22,6 +22,7 @@
'session_metadata': 'text-generation-webui',
}


def ui():
settings = shared.settings.get("ngrok")
if settings:
@@ -33,4 +34,3 @@ def ui():
logging.info(f"Ingress established at: {tunnel.url()}")
except ModuleNotFoundError:
logging.error("===> ngrok library not found, please run `pip install -r extensions/ngrok/requirements.txt`")

3 changes: 1 addition & 2 deletions extensions/openai/README.md
@@ -218,12 +218,11 @@ but there are some exceptions.
| ✅❌ | langchain | https://github.com/hwchase17/langchain | OPENAI_API_BASE=http://127.0.0.1:5001/v1 even with a good 30B-4bit model the result is poor so far. It assumes zero shot python/json coding. Some model tailored prompt formatting improves results greatly. |
| ✅❌ | Auto-GPT | https://github.com/Significant-Gravitas/Auto-GPT | OPENAI_API_BASE=http://127.0.0.1:5001/v1 Same issues as langchain. Also assumes a 4k+ context |
| ✅❌ | babyagi | https://github.com/yoheinakajima/babyagi | OPENAI_API_BASE=http://127.0.0.1:5001/v1 |
| ❌ | guidance | https://github.com/microsoft/guidance | logit_bias and logprobs not yet supported |

## Future plans
* better error handling
* model changing, esp. something for swapping loras or embedding models
* consider switching to FastAPI + starlette for SSE (openai SSE seems non-standard)
* do something about rate limiting or locking requests for completions, most systems will only be able handle a single request at a time before OOM

## Bugs? Feedback? Comments? Pull requests?
