@@ -2,6 +2,7 @@
 from tokenizer import ExLlamaTokenizer
 from generator import ExLlamaGenerator
 import torch
+import torch.nn.functional as F
 import os, glob
 import cuda_ext

@@ -20,7 +21,6 @@

 config = ExLlamaConfig(model_config_path)               # create config from config.json
 config.model_path = model_path                          # supply path to model weights file
-config.max_input_len = 16

 model = ExLlama(config)                                 # create ExLlama instance and load the weights
 tokenizer = ExLlamaTokenizer(tokenizer_path)            # create tokenizer from tokenizer model file
@@ -31,10 +31,10 @@
 # Configure generator

 generator.settings.token_repetition_penalty_max = 1.15
-generator.settings.temperature = 0.75
+generator.settings.temperature = 0.95
 generator.settings.top_k = 40
-generator.settings.top_p = 0.65
-# generator.settings.typical = 0.5
+generator.settings.top_p = 0.75
+# generator.settings.typical = 0.95

 # Prompts to mix

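The settings above loosen the sampler slightly (higher temperature, wider nucleus). For reference, a minimal standalone sketch of what the top_p knob controls, in plain PyTorch; the function name and tensor values here are hypothetical, not part of the ExLlama API. Nucleus sampling keeps the smallest set of tokens whose cumulative probability reaches p, then renormalizes:

import torch

def top_p_filter(probs, p = 0.75):
    # Sort probabilities descending and keep tokens until their mass reaches p
    sorted_probs, order = torch.sort(probs, descending = True)
    cumulative = torch.cumsum(sorted_probs, dim = -1)
    keep = cumulative - sorted_probs < p          # include the token that crosses p
    filtered = torch.zeros_like(probs)
    filtered[order[keep]] = sorted_probs[keep]
    return filtered / filtered.sum()              # renormalize the kept mass

probs = torch.softmax(torch.randn(32000), dim = -1)   # made-up next-token distribution
next_token = torch.multinomial(top_p_filter(probs, p = 0.75), 1)

Raising p from 0.65 to 0.75 simply widens the kept candidate pool before sampling.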
@@ -46,28 +46,30 @@

 f2 = \
 """[INST] <<SYS>>
-You are a rude and obnoxious assistant. You hate everything and everyone.
 <</SYS>>
+You are a rude and obnoxious assistant. You hate everything and everyone.
 {prompt}[/INST]"""

+
 prompts = \
 [
     f1.replace("{prompt}", "Tell me about Homer Simpson"),
     f2.replace("{prompt}", "Tell me about Homer Simpson"),
 ]

-def mixed_generation(prompts, alpha, max_new_tokens):
+def generate_cfg(prompts, alpha, max_new_tokens):

     ids, mask = tokenizer.encode(prompts, return_mask = True)
     generator.gen_begin(ids, mask = mask)

     # Sampling loop

-    for i in range(max_new_tokens):
+    for _ in range(max_new_tokens):

         logits = model.forward(generator.sequence[:, -1:], cache, input_mask = mask)
         generator.apply_rep_penalty(logits)

+        logits = F.log_softmax(logits, dim = -1)
         logits_mixed = (1 - alpha) * logits[0] + alpha * logits[1]

         sampled_token, _ = generator.sample_current(logits_mixed)
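The substantive change in the loop above is the added log_softmax before blending: each batch row is normalized to log-probabilities, so the two prompts are mixed on a common scale. A minimal standalone sketch of that step in plain PyTorch, with random stand-in logits for the two rows that model.forward() returns (the vocabulary size is assumed for illustration):

import torch
import torch.nn.functional as F

torch.manual_seed(0)
logits = torch.randn(2, 32000)               # stand-in for [two prompts, vocab] logits
alpha = 0.7

log_probs = F.log_softmax(logits, dim = -1)  # normalize each row to log-probabilities
mixed = (1 - alpha) * log_probs[0] + alpha * log_probs[1]

# Renormalizing the blend gives p0(t)^(1-alpha) * p1(t)^alpha / Z over the next
# token t, i.e. a weighted geometric mean of the two distributions. Alpha
# outside [0, 1] extrapolates away from one prompt and toward the other, in the
# style of classifier-free guidance.
probs = torch.softmax(mixed, dim = -1)
next_token = torch.multinomial(probs, num_samples = 1)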
@@ -86,5 +88,5 @@ def mixed_generation(prompts, alpha, max_new_tokens):
     print(f"--------------------------------------")
     print(f"alpha = {alpha:.1f}")
     print(f"--------------------------------------")
-    output = mixed_generation(prompts, alpha, 200)
+    output = generate_cfg(prompts, alpha, 200)
     print(output[len(prompts[0]):].strip())
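For reference, a hypothetical one-off call using the renamed function and the prompts defined in this file (the file's own loop sweeps several alpha values instead): alpha = 0 reproduces the helpful template f1 exactly, alpha = 1 the rude template f2, and values in between blend the two personas.

output = generate_cfg(prompts, 0.5, 200)      # equal blend of the two prompts
print(output[len(prompts[0]):].strip())       # drop the echoed first prompt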