Merge pull request #5848 from oobabooga/dev
Merge dev branch
oobabooga committed Apr 12, 2024
2 parents 91a7370 + 597556c commit 26d822f
Showing 24 changed files with 172 additions and 132 deletions.
6 changes: 3 additions & 3 deletions download-model.py
@@ -21,7 +21,7 @@
from requests.adapters import HTTPAdapter
from tqdm.contrib.concurrent import thread_map

base = "https://huggingface.co"
base = os.environ.get("HF_ENDPOINT") or "https://huggingface.co"


class ModelDownloader:
@@ -112,12 +112,12 @@ def get_download_links_from_huggingface(self, model, branch, text_only=False, sp
sha256.append([fname, dict[i]['lfs']['oid']])

if is_text:
links.append(f"https://huggingface.co/{model}/resolve/{branch}/{fname}")
links.append(f"{base}/{model}/resolve/{branch}/{fname}")
classifications.append('text')
continue

if not text_only:
links.append(f"https://huggingface.co/{model}/resolve/{branch}/{fname}")
links.append(f"{base}/{model}/resolve/{branch}/{fname}")
if is_safetensors:
has_safetensors = True
classifications.append('safetensors')
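
The new base constant lets the downloader honor the HF_ENDPOINT environment variable, so every resolve link is built against a mirror when one is configured. A minimal standalone sketch of the resolution and link construction (the model, branch, and file names are invented for illustration):

import os

# HF_ENDPOINT (e.g. a regional mirror) takes precedence; otherwise fall back
# to the public Hugging Face endpoint.
base = os.environ.get("HF_ENDPOINT") or "https://huggingface.co"

model, branch, fname = "org/some-model", "main", "model.safetensors"
print(f"{base}/{model}/resolve/{branch}/{fname}")
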
4 changes: 3 additions & 1 deletion modules/callbacks.py
@@ -5,7 +5,7 @@

import torch
import transformers
from transformers import is_torch_xpu_available
from transformers import is_torch_npu_available, is_torch_xpu_available

import modules.shared as shared

@@ -99,5 +99,7 @@ def clear_torch_cache():
if not shared.args.cpu:
if is_torch_xpu_available():
torch.xpu.empty_cache()
elif is_torch_npu_available():
torch.npu.empty_cache()
else:
torch.cuda.empty_cache()
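
For reference, a standalone sketch of the dispatch pattern that clear_torch_cache() now follows; the function name below is hypothetical, the webui's --cpu check is omitted, and the NPU branch assumes the torch_npu package that provides torch.npu:

import torch
from transformers import is_torch_npu_available, is_torch_xpu_available

def clear_accelerator_cache():
    # Each backend mirrors torch.cuda's empty_cache() API.
    if is_torch_xpu_available():
        torch.xpu.empty_cache()     # Intel XPU
    elif is_torch_npu_available():
        torch.npu.empty_cache()     # Ascend NPU (needs torch_npu installed)
    elif torch.cuda.is_available():
        torch.cuda.empty_cache()    # NVIDIA, or AMD via ROCm
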
2 changes: 1 addition & 1 deletion modules/html_generator.py
@@ -139,7 +139,7 @@ def get_image_cache(path):
old_p.rename(p)

output_file = p
img.convert('RGB').save(output_file, format='PNG')
img.convert('RGBA').save(output_file, format='PNG')
image_cache[path] = [mtime, output_file.as_posix()]

return image_cache[path][1]
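
The one-character change matters because PNG supports an alpha channel: converting to 'RGB' flattens transparency in cached images, while 'RGBA' preserves it. A quick Pillow check (file names invented):

from PIL import Image

img = Image.new('RGBA', (2, 2), (255, 0, 0, 255))
img.putpixel((0, 0), (0, 0, 0, 0))               # one fully transparent pixel

img.convert('RGB').save('flattened.png', format='PNG')   # alpha dropped
img.convert('RGBA').save('preserved.png', format='PNG')  # alpha kept

print(Image.open('flattened.png').mode, Image.open('preserved.png').mode)  # RGB RGBA
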
22 changes: 12 additions & 10 deletions modules/llama_cpp_python_hijack.py
@@ -39,24 +39,26 @@ def eval_with_progress(self, tokens: Sequence[int]):
progress_bar = range(0, len(tokens), self.n_batch)

for i in progress_bar:
batch = tokens[i: min(len(tokens), i + self.n_batch)]
batch = tokens[i : min(len(tokens), i + self.n_batch)]
n_past = self.n_tokens
n_tokens = len(batch)
self._batch.set_batch(
batch=batch, n_past=n_past, logits_all=self.context_params.logits_all
)
self._ctx.decode(self._batch)
# Save tokens
self.input_ids[n_past: n_past + n_tokens] = batch
self.input_ids[n_past : n_past + n_tokens] = batch
# Save logits
rows = n_tokens
cols = self._n_vocab
offset = (
0 if self.context_params.logits_all else n_tokens - 1
) # NOTE: Only save the last token logits if logits_all is False
self.scores[n_past + offset: n_past + n_tokens, :].reshape(-1)[
:
] = self._ctx.get_logits()[offset * cols: rows * cols]
if self.context_params.logits_all:
rows = n_tokens
cols = self._n_vocab
logits = self._ctx.get_logits()[: rows * cols]
self.scores[n_past : n_past + n_tokens, :].reshape(-1)[: :] = logits
else:
rows = 1
cols = self._n_vocab
logits = self._ctx.get_logits()[: rows * cols]
self.scores[n_past + n_tokens - 1, :].reshape(-1)[: :] = logits
# Update n_tokens
self.n_tokens += n_tokens

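
The rewrite replaces the offset arithmetic with two explicit branches: with logits_all every evaluated token gets a row in the scores buffer, while without it llama.cpp returns only the last token's logits, so only that row is written. A standalone NumPy sketch of the two cases (all shapes invented):

import numpy as np

n_vocab, n_ctx = 8, 16
scores = np.zeros((n_ctx, n_vocab), dtype=np.float32)   # per-position logits buffer
n_past, n_tokens = 4, 3                                  # 3 new tokens after 4 cached ones

logits_all = False
if logits_all:
    returned = np.random.rand(n_tokens * n_vocab).astype(np.float32)   # one row per token
    scores[n_past:n_past + n_tokens, :].reshape(-1)[:] = returned
else:
    returned = np.random.rand(n_vocab).astype(np.float32)              # last token only
    scores[n_past + n_tokens - 1, :] = returned
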
2 changes: 1 addition & 1 deletion modules/llamacpp_hf.py
@@ -192,7 +192,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
if path.is_file():
model_file = path
else:
model_file = list(path.glob('*.gguf'))[0]
model_file = sorted(path.glob('*.gguf'))[0]

logger.info(f"llama.cpp weights detected: {model_file}\n")

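
sorted() is the substance of this change: Path.glob() yields files in filesystem order, which is not guaranteed to be stable, so list(...)[0] could pick a different .gguf between runs or machines. A small sketch with an invented folder name:

from pathlib import Path

path = Path('models/some-gguf-model')       # hypothetical model folder
ggufs = sorted(path.glob('*.gguf'))         # deterministic, lexicographic order
model_file = ggufs[0] if ggufs else None    # first shard, or None if nothing matched
print(model_file)
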
6 changes: 5 additions & 1 deletion modules/logits.py
@@ -1,5 +1,5 @@
import torch
from transformers import is_torch_xpu_available
from transformers import is_torch_npu_available, is_torch_xpu_available

from modules import sampler_hijack, shared
from modules.logging_colors import logger
@@ -34,6 +34,8 @@ def get_next_logits(prompt, state, use_samplers, previous, top_logits=25, return
if is_non_hf_exllamav2:
if is_torch_xpu_available():
tokens = shared.tokenizer.encode(prompt).to("xpu:0")
elif is_torch_npu_available():
tokens = shared.tokenizer.encode(prompt).to("npu:0")
else:
tokens = shared.tokenizer.encode(prompt).cuda()
scores = shared.model.get_logits(tokens)[-1][-1]
@@ -43,6 +45,8 @@ def get_next_logits(prompt, state, use_samplers, previous, top_logits=25, return
else:
if is_torch_xpu_available():
tokens = shared.tokenizer.encode(prompt, return_tensors='pt').to("xpu:0")
elif is_torch_npu_available():
tokens = shared.tokenizer.encode(prompt, return_tensors='pt').to("npu:0")
else:
tokens = shared.tokenizer.encode(prompt, return_tensors='pt').cuda()
output = shared.model(input_ids=tokens)
12 changes: 11 additions & 1 deletion modules/models.py
@@ -10,7 +10,11 @@
import torch
import transformers
from accelerate import infer_auto_device_map, init_empty_weights
from accelerate.utils import is_ccl_available, is_xpu_available
from accelerate.utils import (
is_ccl_available,
is_npu_available,
is_xpu_available
)
from transformers import (
AutoConfig,
AutoModel,
@@ -45,6 +49,9 @@
if is_xpu_available() and is_ccl_available():
torch.xpu.set_device(local_rank)
deepspeed.init_distributed(backend="ccl")
elif is_npu_available():
torch.npu.set_device(local_rank)
deepspeed.init_distributed(dist_backend="hccl")
else:
torch.cuda.set_device(local_rank)
deepspeed.init_distributed()
@@ -164,6 +171,9 @@ def huggingface_loader(model_name):
elif is_xpu_available():
device = torch.device("xpu")
model = model.to(device)
elif is_npu_available():
device = torch.device("npu")
model = model.to(device)
else:
model = model.cuda()

15 changes: 8 additions & 7 deletions modules/models_settings.py
@@ -56,12 +56,13 @@ def get_model_metadata(model):
model_file = list(path.glob('*.gguf'))[0]

metadata = metadata_gguf.load_metadata(model_file)
if 'llama.context_length' in metadata:
model_settings['n_ctx'] = metadata['llama.context_length']
if 'llama.rope.scale_linear' in metadata:
model_settings['compress_pos_emb'] = metadata['llama.rope.scale_linear']
if 'llama.rope.freq_base' in metadata:
model_settings['rope_freq_base'] = metadata['llama.rope.freq_base']
for k in metadata:
if k.endswith('context_length'):
model_settings['n_ctx'] = metadata[k]
elif k.endswith('rope.freq_base'):
model_settings['rope_freq_base'] = metadata[k]
elif k.endswith('rope.scale_linear'):
model_settings['compress_pos_emb'] = metadata[k]
if 'tokenizer.chat_template' in metadata:
template = metadata['tokenizer.chat_template']
eos_token = metadata['tokenizer.ggml.tokens'][metadata['tokenizer.ggml.eos_token_id']]
@@ -77,7 +78,7 @@ def get_model_metadata(model):
# Transformers metadata
if hf_metadata is not None:
metadata = json.loads(open(path, 'r', encoding='utf-8').read())
for k in ['max_position_embeddings', 'max_seq_len']:
for k in ['max_position_embeddings', 'model_max_length', 'max_seq_len']:
if k in metadata:
model_settings['truncation_length'] = metadata[k]
model_settings['max_seq_len'] = metadata[k]
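
Matching on key suffixes instead of hard-coded 'llama.' keys lets the metadata reader pick up context length and RoPE settings for any GGUF architecture prefix (llama., qwen2., and so on). A toy run of the loop with invented metadata values:

metadata = {
    'qwen2.context_length': 32768,
    'qwen2.rope.freq_base': 1000000.0,
}

model_settings = {}
for k in metadata:
    if k.endswith('context_length'):
        model_settings['n_ctx'] = metadata[k]
    elif k.endswith('rope.freq_base'):
        model_settings['rope_freq_base'] = metadata[k]
    elif k.endswith('rope.scale_linear'):
        model_settings['compress_pos_emb'] = metadata[k]

print(model_settings)   # {'n_ctx': 32768, 'rope_freq_base': 1000000.0}
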
2 changes: 1 addition & 1 deletion modules/shared.py
@@ -36,7 +36,7 @@
'chat_style': 'cai-chat',
'prompt-default': 'QA',
'prompt-notebook': 'QA',
'preset': 'simple-1',
'preset': 'min_p',
'max_new_tokens': 512,
'max_new_tokens_min': 1,
'max_new_tokens_max': 4096,
15 changes: 12 additions & 3 deletions modules/text_generation.py
@@ -10,7 +10,11 @@
import numpy as np
import torch
import transformers
from transformers import LogitsProcessorList, is_torch_xpu_available
from transformers import (
LogitsProcessorList,
is_torch_npu_available,
is_torch_xpu_available
)

import modules.shared as shared
from modules.cache_utils import process_llamacpp_cache
@@ -24,7 +28,7 @@
from modules.grammar.logits_process import GrammarConstrainedLogitsProcessor
from modules.html_generator import generate_basic_html
from modules.logging_colors import logger
from modules.models import clear_torch_cache, local_rank
from modules.models import clear_torch_cache


def generate_reply(*args, **kwargs):
@@ -131,12 +135,15 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt
if shared.model.__class__.__name__ in ['LlamaCppModel', 'Exllamav2Model'] or shared.args.cpu:
return input_ids
elif shared.args.deepspeed:
return input_ids.to(device=local_rank)
import deepspeed
return input_ids.to(deepspeed.get_accelerator().current_device_name())
elif torch.backends.mps.is_available():
device = torch.device('mps')
return input_ids.to(device)
elif is_torch_xpu_available():
return input_ids.to("xpu:0")
elif is_torch_npu_available():
return input_ids.to("npu:0")
else:
return input_ids.cuda()

@@ -213,6 +220,8 @@ def set_manual_seed(seed):
torch.cuda.manual_seed_all(seed)
elif is_torch_xpu_available():
torch.xpu.manual_seed_all(seed)
elif is_torch_npu_available():
torch.npu.manual_seed_all(seed)

return seed

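
With local_rank no longer imported from modules.models, the DeepSpeed branch asks DeepSpeed's accelerator abstraction for the current device name, which resolves to 'cuda:0', 'xpu:0', 'npu:0', and so on depending on the backend. A hedged sketch of that call, assuming a working DeepSpeed install and invented token ids:

import deepspeed
import torch

input_ids = torch.tensor([[1, 2, 3]])
device_name = deepspeed.get_accelerator().current_device_name()   # e.g. 'cuda:0'
input_ids = input_ids.to(device_name)
print(device_name, input_ids.device)
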
18 changes: 10 additions & 8 deletions modules/ui.py
@@ -233,14 +233,16 @@ def save_settings(state, preset, extensions_list, show_controls, theme_state):

# Save extension values in the UI
for extension_name in extensions_list:
extension = getattr(extensions, extension_name).script
if hasattr(extension, 'params'):
params = getattr(extension, 'params')
for param in params:
_id = f"{extension_name}-{param}"
# Only save if different from default value
if param not in shared.default_settings or params[param] != shared.default_settings[param]:
output[_id] = params[param]
extension = getattr(extensions, extension_name, None)
if extension:
extension = extension.script
if hasattr(extension, 'params'):
params = getattr(extension, 'params')
for param in params:
_id = f"{extension_name}-{param}"
# Only save if different from default value
if param not in shared.default_settings or params[param] != shared.default_settings[param]:
output[_id] = params[param]

# Do not save unchanged settings
for key in list(output.keys()):
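
The extra getattr(..., None) guard keeps save_settings() from raising AttributeError when a listed extension never loaded, for example because its import failed. A standalone illustration with a stand-in extensions namespace (names invented):

from types import SimpleNamespace

extensions = SimpleNamespace(
    gallery=SimpleNamespace(script=SimpleNamespace(params={'open': True}))
)

for extension_name in ['gallery', 'broken_extension']:
    extension = getattr(extensions, extension_name, None)   # None instead of AttributeError
    if extension:
        params = getattr(extension.script, 'params', {})
        print(extension_name, params)
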
11 changes: 10 additions & 1 deletion modules/ui_model_menu.py
@@ -8,7 +8,7 @@
import gradio as gr
import psutil
import torch
from transformers import is_torch_xpu_available
from transformers import is_torch_npu_available, is_torch_xpu_available

from modules import loaders, shared, ui, utils
from modules.logging_colors import logger
@@ -32,6 +32,9 @@ def create_ui():
if is_torch_xpu_available():
for i in range(torch.xpu.device_count()):
total_mem.append(math.floor(torch.xpu.get_device_properties(i).total_memory / (1024 * 1024)))
elif is_torch_npu_available():
for i in range(torch.npu.device_count()):
total_mem.append(math.floor(torch.npu.get_device_properties(i).total_memory / (1024 * 1024)))
else:
for i in range(torch.cuda.device_count()):
total_mem.append(math.floor(torch.cuda.get_device_properties(i).total_memory / (1024 * 1024)))
@@ -287,6 +290,12 @@ def download_model_wrapper(repo_id, specific_file, progress=gr.Progress(), retur

yield ("Getting the output folder")
output_folder = downloader.get_output_folder(model, branch, is_lora, is_llamacpp=is_llamacpp)

if output_folder == Path("models"):
output_folder = Path(shared.args.model_dir)
elif output_folder == Path("loras"):
output_folder = Path(shared.args.lora_dir)

if check:
progress(0.5)

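
The new block makes the download tab respect --model-dir and --lora-dir: when the downloader falls back to its default 'models' or 'loras' folders, they are swapped for the user-configured paths. A minimal sketch with invented values standing in for shared.args:

from pathlib import Path

model_dir, lora_dir = '/data/models', '/data/loras'

output_folder = Path('models')              # default returned by the downloader
if output_folder == Path('models'):
    output_folder = Path(model_dir)
elif output_folder == Path('loras'):
    output_folder = Path(lora_dir)

print(output_folder)                        # /data/models
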
2 changes: 1 addition & 1 deletion modules/ui_parameters.py
@@ -32,6 +32,7 @@ def create_ui(default_preset):
shared.gradio['top_p'] = gr.Slider(0.0, 1.0, value=generate_params['top_p'], step=0.01, label='top_p')
shared.gradio['top_k'] = gr.Slider(0, 200, value=generate_params['top_k'], step=1, label='top_k')
shared.gradio['typical_p'] = gr.Slider(0.0, 1.0, value=generate_params['typical_p'], step=0.01, label='typical_p')
shared.gradio['min_p'] = gr.Slider(0.0, 1.0, value=generate_params['min_p'], step=0.01, label='min_p')
shared.gradio['repetition_penalty'] = gr.Slider(1.0, 1.5, value=generate_params['repetition_penalty'], step=0.01, label='repetition_penalty')
shared.gradio['frequency_penalty'] = gr.Slider(0, 2, value=generate_params['frequency_penalty'], step=0.05, label='frequency_penalty')
shared.gradio['presence_penalty'] = gr.Slider(0, 2, value=generate_params['presence_penalty'], step=0.05, label='presence_penalty')
@@ -69,7 +70,6 @@ def create_ui(default_preset):

with gr.Row():
with gr.Column():
shared.gradio['min_p'] = gr.Slider(0.0, 1.0, value=generate_params['min_p'], step=0.01, label='min_p')
shared.gradio['tfs'] = gr.Slider(0.0, 1.0, value=generate_params['tfs'], step=0.01, label='tfs')
shared.gradio['top_a'] = gr.Slider(0.0, 1.0, value=generate_params['top_a'], step=0.01, label='top_a')
shared.gradio['smoothing_factor'] = gr.Slider(0.0, 10.0, value=generate_params['smoothing_factor'], step=0.01, label='smoothing_factor', info='Activates Quadratic Sampling.')
1 change: 1 addition & 0 deletions presets/min_p.yaml
@@ -0,0 +1 @@
min_p: 0.05
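
This one-line preset pairs with the default-preset change in modules/shared.py: generation now defaults to min_p sampling with min_p = 0.05, which keeps only tokens whose probability is at least 5% of the most likely token's probability. A compact sketch of that filter (not the webui's sampler implementation):

import torch

def min_p_filter(logits: torch.Tensor, min_p: float = 0.05) -> torch.Tensor:
    # Mask tokens whose probability falls below min_p times the top probability.
    probs = torch.softmax(logits, dim=-1)
    threshold = min_p * probs.max(dim=-1, keepdim=True).values
    return logits.masked_fill(probs < threshold, float('-inf'))

logits = torch.tensor([[4.0, 3.0, 1.0, -2.0]])
print(torch.softmax(min_p_filter(logits), dim=-1))   # low-probability tokens zeroed out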