Merge pull request #5848 from oobabooga/dev
Merge dev branch
oobabooga committed Apr 12, 2024
2 parents 91a7370 + 597556c commit 26d822f
Showing 24 changed files with 172 additions and 132 deletions.
6 changes: 3 additions & 3 deletions download-model.py
@@ -21,7 +21,7 @@
from requests.adapters import HTTPAdapter
from tqdm.contrib.concurrent import thread_map

base = "https://huggingface.co"
base = os.environ.get("HF_ENDPOINT") or "https://huggingface.co"


class ModelDownloader:
@@ -112,12 +112,12 @@ def get_download_links_from_huggingface(self, model, branch, text_only=False, sp
sha256.append([fname, dict[i]['lfs']['oid']])

if is_text:
links.append(f"https://huggingface.co/{model}/resolve/{branch}/{fname}")
links.append(f"{base}/{model}/resolve/{branch}/{fname}")
classifications.append('text')
continue

if not text_only:
links.append(f"https://huggingface.co/{model}/resolve/{branch}/{fname}")
links.append(f"{base}/{model}/resolve/{branch}/{fname}")
if is_safetensors:
has_safetensors = True
classifications.append('safetensors')
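
The new base constant lets the downloader honor the HF_ENDPOINT environment variable, so every resolve link is built against a mirror when one is configured. A minimal standalone sketch of the resolution and link construction (the model, branch, and file names are invented for illustration):

import os

# HF_ENDPOINT (e.g. a regional mirror) takes precedence; otherwise fall back
# to the public Hugging Face endpoint.
base = os.environ.get("HF_ENDPOINT") or "https://huggingface.co"

model, branch, fname = "org/some-model", "main", "model.safetensors"
print(f"{base}/{model}/resolve/{branch}/{fname}")
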
4 changes: 3 additions & 1 deletion modules/callbacks.py
@@ -5,7 +5,7 @@

import torch
import transformers
from transformers import is_torch_xpu_available
from transformers import is_torch_npu_available, is_torch_xpu_available

import modules.shared as shared

@@ -99,5 +99,7 @@ def clear_torch_cache():
if not shared.args.cpu:
if is_torch_xpu_available():
torch.xpu.empty_cache()
elif is_torch_npu_available():
torch.npu.empty_cache()
else:
torch.cuda.empty_cache()
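
For reference, a standalone sketch of the dispatch pattern that clear_torch_cache() now follows; the function name below is hypothetical, the webui's --cpu check is omitted, and the NPU branch assumes the torch_npu package that provides torch.npu:

import torch
from transformers import is_torch_npu_available, is_torch_xpu_available

def clear_accelerator_cache():
    # Each backend mirrors torch.cuda's empty_cache() API.
    if is_torch_xpu_available():
        torch.xpu.empty_cache()     # Intel XPU
    elif is_torch_npu_available():
        torch.npu.empty_cache()     # Ascend NPU (needs torch_npu installed)
    elif torch.cuda.is_available():
        torch.cuda.empty_cache()    # NVIDIA, or AMD via ROCm
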
2 changes: 1 addition & 1 deletion modules/html_generator.py
@@ -139,7 +139,7 @@ def get_image_cache(path):
old_p.rename(p)

output_file = p
img.convert('RGB').save(output_file, format='PNG')
img.convert('RGBA').save(output_file, format='PNG')
image_cache[path] = [mtime, output_file.as_posix()]

return image_cache[path][1]
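
The one-character change matters because PNG supports an alpha channel: converting to 'RGB' flattens transparency in cached images, while 'RGBA' preserves it. A quick Pillow check (file names invented):

from PIL import Image

img = Image.new('RGBA', (2, 2), (255, 0, 0, 255))
img.putpixel((0, 0), (0, 0, 0, 0))               # one fully transparent pixel

img.convert('RGB').save('flattened.png', format='PNG')   # alpha dropped
img.convert('RGBA').save('preserved.png', format='PNG')  # alpha kept

print(Image.open('flattened.png').mode, Image.open('preserved.png').mode)  # RGB RGBA
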
22 changes: 12 additions & 10 deletions modules/llama_cpp_python_hijack.py
@@ -39,24 +39,26 @@ def eval_with_progress(self, tokens: Sequence[int]):
progress_bar = range(0, len(tokens), self.n_batch)

for i in progress_bar:
batch = tokens[i: min(len(tokens), i + self.n_batch)]
batch = tokens[i : min(len(tokens), i + self.n_batch)]
n_past = self.n_tokens
n_tokens = len(batch)
self._batch.set_batch(
batch=batch, n_past=n_past, logits_all=self.context_params.logits_all
)
self._ctx.decode(self._batch)
# Save tokens
self.input_ids[n_past: n_past + n_tokens] = batch
self.input_ids[n_past : n_past + n_tokens] = batch
# Save logits
rows = n_tokens
cols = self._n_vocab
offset = (
0 if self.context_params.logits_all else n_tokens - 1
) # NOTE: Only save the last token logits if logits_all is False
self.scores[n_past + offset: n_past + n_tokens, :].reshape(-1)[
:
] = self._ctx.get_logits()[offset * cols: rows * cols]
if self.context_params.logits_all:
rows = n_tokens
cols = self._n_vocab
logits = self._ctx.get_logits()[: rows * cols]
self.scores[n_past : n_past + n_tokens, :].reshape(-1)[: :] = logits
else:
rows = 1
cols = self._n_vocab
logits = self._ctx.get_logits()[: rows * cols]
self.scores[n_past + n_tokens - 1, :].reshape(-1)[: :] = logits
# Update n_tokens
self.n_tokens += n_tokens

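
The rewrite replaces the offset arithmetic with two explicit branches: with logits_all every evaluated token gets a row in the scores buffer, while without it llama.cpp returns only the last token's logits, so only that row is written. A standalone NumPy sketch of the two cases (all shapes invented):

import numpy as np

n_vocab, n_ctx = 8, 16
scores = np.zeros((n_ctx, n_vocab), dtype=np.float32)   # per-position logits buffer
n_past, n_tokens = 4, 3                                  # 3 new tokens after 4 cached ones

logits_all = False
if logits_all:
    returned = np.random.rand(n_tokens * n_vocab).astype(np.float32)   # one row per token
    scores[n_past:n_past + n_tokens, :].reshape(-1)[:] = returned
else:
    returned = np.random.rand(n_vocab).astype(np.float32)              # last token only
    scores[n_past + n_tokens - 1, :] = returned
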
2 changes: 1 addition & 1 deletion modules/llamacpp_hf.py
@@ -192,7 +192,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
if path.is_file():
model_file = path
else:
model_file = list(path.glob('*.gguf'))[0]
model_file = sorted(path.glob('*.gguf'))[0]

logger.info(f"llama.cpp weights detected: {model_file}\n")

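
sorted() is the substance of this change: Path.glob() yields files in filesystem order, which is not guaranteed to be stable, so list(...)[0] could pick a different .gguf between runs or machines. A small sketch with an invented folder name:

from pathlib import Path

path = Path('models/some-gguf-model')       # hypothetical model folder
ggufs = sorted(path.glob('*.gguf'))         # deterministic, lexicographic order
model_file = ggufs[0] if ggufs else None    # first shard, or None if nothing matched
print(model_file)
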
6 changes: 5 additions & 1 deletion modules/logits.py
@@ -1,5 +1,5 @@
import torch
from transformers import is_torch_xpu_available
from transformers import is_torch_npu_available, is_torch_xpu_available

from modules import sampler_hijack, shared
from modules.logging_colors import logger
@@ -34,6 +34,8 @@ def get_next_logits(prompt, state, use_samplers, previous, top_logits=25, return
if is_non_hf_exllamav2:
if is_torch_xpu_available():
tokens = shared.tokenizer.encode(prompt).to("xpu:0")
elif is_torch_npu_available():
tokens = shared.tokenizer.encode(prompt).to("npu:0")
else:
tokens = shared.tokenizer.encode(prompt).cuda()
scores = shared.model.get_logits(tokens)[-1][-1]
@@ -43,6 +45,8 @@ def get_next_logits(prompt, state, use_samplers, previous, top_logits=25, return
else:
if is_torch_xpu_available():
tokens = shared.tokenizer.encode(prompt, return_tensors='pt').to("xpu:0")
elif is_torch_npu_available():
tokens = shared.tokenizer.encode(prompt, return_tensors='pt').to("npu:0")
else:
tokens = shared.tokenizer.encode(prompt, return_tensors='pt').cuda()
output = shared.model(input_ids=tokens)
12 changes: 11 additions & 1 deletion modules/models.py
@@ -10,7 +10,11 @@
import torch
import transformers
from accelerate import infer_auto_device_map, init_empty_weights
from accelerate.utils import is_ccl_available, is_xpu_available
from accelerate.utils import (
is_ccl_available,
is_npu_available,
is_xpu_available
)
from transformers import (
AutoConfig,
AutoModel,
@@ -45,6 +49,9 @@
if is_xpu_available() and is_ccl_available():
torch.xpu.set_device(local_rank)
deepspeed.init_distributed(backend="ccl")
elif is_npu_available():
torch.npu.set_device(local_rank)
deepspeed.init_distributed(dist_backend="hccl")
else:
torch.cuda.set_device(local_rank)
deepspeed.init_distributed()
@@ -164,6 +171,9 @@ def huggingface_loader(model_name):
elif is_xpu_available():
device = torch.device("xpu")
model = model.to(device)
elif is_npu_available():
device = torch.device("npu")
model = model.to(device)
else:
model = model.cuda()

15 changes: 8 additions & 7 deletions modules/models_settings.py
@@ -56,12 +56,13 @@ def get_model_metadata(model):
model_file = list(path.glob('*.gguf'))[0]

metadata = metadata_gguf.load_metadata(model_file)
if 'llama.context_length' in metadata:
model_settings['n_ctx'] = metadata['llama.context_length']
if 'llama.rope.scale_linear' in metadata:
model_settings['compress_pos_emb'] = metadata['llama.rope.scale_linear']
if 'llama.rope.freq_base' in metadata:
model_settings['rope_freq_base'] = metadata['llama.rope.freq_base']
for k in metadata:
if k.endswith('context_length'):
model_settings['n_ctx'] = metadata[k]
elif k.endswith('rope.freq_base'):
model_settings['rope_freq_base'] = metadata[k]
elif k.endswith('rope.scale_linear'):
model_settings['compress_pos_emb'] = metadata[k]
if 'tokenizer.chat_template' in metadata:
template = metadata['tokenizer.chat_template']
eos_token = metadata['tokenizer.ggml.tokens'][metadata['tokenizer.ggml.eos_token_id']]
@@ -77,7 +78,7 @@ def get_model_metadata(model):
# Transformers metadata
if hf_metadata is not None:
metadata = json.loads(open(path, 'r', encoding='utf-8').read())
for k in ['max_position_embeddings', 'max_seq_len']:
for k in ['max_position_embeddings', 'model_max_length', 'max_seq_len']:
if k in metadata:
model_settings['truncation_length'] = metadata[k]
model_settings['max_seq_len'] = metadata[k]
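
Matching on key suffixes instead of hard-coded 'llama.' keys lets the metadata reader pick up context length and RoPE settings for any GGUF architecture prefix (llama., qwen2., and so on). A toy run of the loop with invented metadata values:

metadata = {
    'qwen2.context_length': 32768,
    'qwen2.rope.freq_base': 1000000.0,
}

model_settings = {}
for k in metadata:
    if k.endswith('context_length'):
        model_settings['n_ctx'] = metadata[k]
    elif k.endswith('rope.freq_base'):
        model_settings['rope_freq_base'] = metadata[k]
    elif k.endswith('rope.scale_linear'):
        model_settings['compress_pos_emb'] = metadata[k]

print(model_settings)   # {'n_ctx': 32768, 'rope_freq_base': 1000000.0}
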
2 changes: 1 addition & 1 deletion modules/shared.py
@@ -36,7 +36,7 @@
'chat_style': 'cai-chat',
'prompt-default': 'QA',
'prompt-notebook': 'QA',
'preset': 'simple-1',
'preset': 'min_p',
'max_new_tokens': 512,
'max_new_tokens_min': 1,
'max_new_tokens_max': 4096,
15 changes: 12 additions & 3 deletions modules/text_generation.py
@@ -10,7 +10,11 @@
import numpy as np
import torch
import transformers
from transformers import LogitsProcessorList, is_torch_xpu_available
from transformers import (
LogitsProcessorList,
is_torch_npu_available,
is_torch_xpu_available
)

import modules.shared as shared
from modules.cache_utils import process_llamacpp_cache
@@ -24,7 +28,7 @@
from modules.grammar.logits_process import GrammarConstrainedLogitsProcessor
from modules.html_generator import generate_basic_html
from modules.logging_colors import logger
from modules.models import clear_torch_cache, local_rank
from modules.models import clear_torch_cache


def generate_reply(*args, **kwargs):
@@ -131,12 +135,15 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt
if shared.model.__class__.__name__ in ['LlamaCppModel', 'Exllamav2Model'] or shared.args.cpu:
return input_ids
elif shared.args.deepspeed:
return input_ids.to(device=local_rank)
import deepspeed
return input_ids.to(deepspeed.get_accelerator().current_device_name())
elif torch.backends.mps.is_available():
device = torch.device('mps')
return input_ids.to(device)
elif is_torch_xpu_available():
return input_ids.to("xpu:0")
elif is_torch_npu_available():
return input_ids.to("npu:0")
else:
return input_ids.cuda()

@@ -213,6 +220,8 @@ def set_manual_seed(seed):
torch.cuda.manual_seed_all(seed)
elif is_torch_xpu_available():
torch.xpu.manual_seed_all(seed)
elif is_torch_npu_available():
torch.npu.manual_seed_all(seed)

return seed

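
With local_rank no longer imported from modules.models, the DeepSpeed branch asks DeepSpeed's accelerator abstraction for the current device name, which resolves to 'cuda:0', 'xpu:0', 'npu:0', and so on depending on the backend. A hedged sketch of that call, assuming a working DeepSpeed install and invented token ids:

import deepspeed
import torch

input_ids = torch.tensor([[1, 2, 3]])
device_name = deepspeed.get_accelerator().current_device_name()   # e.g. 'cuda:0'
input_ids = input_ids.to(device_name)
print(device_name, input_ids.device)
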
18 changes: 10 additions & 8 deletions modules/ui.py
@@ -233,14 +233,16 @@ def save_settings(state, preset, extensions_list, show_controls, theme_state):

# Save extension values in the UI
for extension_name in extensions_list:
extension = getattr(extensions, extension_name).script
if hasattr(extension, 'params'):
params = getattr(extension, 'params')
for param in params:
_id = f"{extension_name}-{param}"
# Only save if different from default value
if param not in shared.default_settings or params[param] != shared.default_settings[param]:
output[_id] = params[param]
extension = getattr(extensions, extension_name, None)
if extension:
extension = extension.script
if hasattr(extension, 'params'):
params = getattr(extension, 'params')
for param in params:
_id = f"{extension_name}-{param}"
# Only save if different from default value
if param not in shared.default_settings or params[param] != shared.default_settings[param]:
output[_id] = params[param]

# Do not save unchanged settings
for key in list(output.keys()):
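
The extra getattr(..., None) guard keeps save_settings() from raising AttributeError when a listed extension never loaded, for example because its import failed. A standalone illustration with a stand-in extensions namespace (names invented):

from types import SimpleNamespace

extensions = SimpleNamespace(
    gallery=SimpleNamespace(script=SimpleNamespace(params={'open': True}))
)

for extension_name in ['gallery', 'broken_extension']:
    extension = getattr(extensions, extension_name, None)   # None instead of AttributeError
    if extension:
        params = getattr(extension.script, 'params', {})
        print(extension_name, params)
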
11 changes: 10 additions & 1 deletion modules/ui_model_menu.py
@@ -8,7 +8,7 @@
import gradio as gr
import psutil
import torch
from transformers import is_torch_xpu_available
from transformers import is_torch_npu_available, is_torch_xpu_available

from modules import loaders, shared, ui, utils
from modules.logging_colors import logger
@@ -32,6 +32,9 @@ def create_ui():
if is_torch_xpu_available():
for i in range(torch.xpu.device_count()):
total_mem.append(math.floor(torch.xpu.get_device_properties(i).total_memory / (1024 * 1024)))
elif is_torch_npu_available():
for i in range(torch.npu.device_count()):
total_mem.append(math.floor(torch.npu.get_device_properties(i).total_memory / (1024 * 1024)))
else:
for i in range(torch.cuda.device_count()):
total_mem.append(math.floor(torch.cuda.get_device_properties(i).total_memory / (1024 * 1024)))
@@ -287,6 +290,12 @@ def download_model_wrapper(repo_id, specific_file, progress=gr.Progress(), retur

yield ("Getting the output folder")
output_folder = downloader.get_output_folder(model, branch, is_lora, is_llamacpp=is_llamacpp)

if output_folder == Path("models"):
output_folder = Path(shared.args.model_dir)
elif output_folder == Path("loras"):
output_folder = Path(shared.args.lora_dir)

if check:
progress(0.5)

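
The new block makes the download tab respect --model-dir and --lora-dir: when the downloader falls back to its default 'models' or 'loras' folders, they are swapped for the user-configured paths. A minimal sketch with invented values standing in for shared.args:

from pathlib import Path

model_dir, lora_dir = '/data/models', '/data/loras'

output_folder = Path('models')              # default returned by the downloader
if output_folder == Path('models'):
    output_folder = Path(model_dir)
elif output_folder == Path('loras'):
    output_folder = Path(lora_dir)

print(output_folder)                        # /data/models
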
2 changes: 1 addition & 1 deletion modules/ui_parameters.py
@@ -32,6 +32,7 @@ def create_ui(default_preset):
shared.gradio['top_p'] = gr.Slider(0.0, 1.0, value=generate_params['top_p'], step=0.01, label='top_p')
shared.gradio['top_k'] = gr.Slider(0, 200, value=generate_params['top_k'], step=1, label='top_k')
shared.gradio['typical_p'] = gr.Slider(0.0, 1.0, value=generate_params['typical_p'], step=0.01, label='typical_p')
shared.gradio['min_p'] = gr.Slider(0.0, 1.0, value=generate_params['min_p'], step=0.01, label='min_p')
shared.gradio['repetition_penalty'] = gr.Slider(1.0, 1.5, value=generate_params['repetition_penalty'], step=0.01, label='repetition_penalty')
shared.gradio['frequency_penalty'] = gr.Slider(0, 2, value=generate_params['frequency_penalty'], step=0.05, label='frequency_penalty')
shared.gradio['presence_penalty'] = gr.Slider(0, 2, value=generate_params['presence_penalty'], step=0.05, label='presence_penalty')
@@ -69,7 +70,6 @@ def create_ui(default_preset):

with gr.Row():
with gr.Column():
shared.gradio['min_p'] = gr.Slider(0.0, 1.0, value=generate_params['min_p'], step=0.01, label='min_p')
shared.gradio['tfs'] = gr.Slider(0.0, 1.0, value=generate_params['tfs'], step=0.01, label='tfs')
shared.gradio['top_a'] = gr.Slider(0.0, 1.0, value=generate_params['top_a'], step=0.01, label='top_a')
shared.gradio['smoothing_factor'] = gr.Slider(0.0, 10.0, value=generate_params['smoothing_factor'], step=0.01, label='smoothing_factor', info='Activates Quadratic Sampling.')
1 change: 1 addition & 0 deletions presets/min_p.yaml
@@ -0,0 +1 @@
min_p: 0.05
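
This one-line preset pairs with the default-preset change in modules/shared.py: generation now defaults to min_p sampling with min_p = 0.05, which keeps only tokens whose probability is at least 5% of the most likely token's probability. A compact sketch of that filter (not the webui's sampler implementation):

import torch

def min_p_filter(logits: torch.Tensor, min_p: float = 0.05) -> torch.Tensor:
    # Mask tokens whose probability falls below min_p times the top probability.
    probs = torch.softmax(logits, dim=-1)
    threshold = min_p * probs.max(dim=-1, keepdim=True).values
    return logits.masked_fill(probs < threshold, float('-inf'))

logits = torch.tensor([[4.0, 3.0, 1.0, -2.0]])
print(torch.softmax(min_p_filter(logits), dim=-1))   # low-probability tokens zeroed out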