
Commit ac19f97

Pull upstream changes, fix conflict, bump version to 0.0.4
2 parents 82fa31f + 3e7c410

35 files changed: 1,395 additions & 512 deletions

README.md

Lines changed: 21 additions & 15 deletions
@@ -53,7 +53,7 @@ a minute to compile.
 
 Chatbot example:
 
-python test_chatbot.py -d <path_to_model_files> -un "Jeff" -p prompt_chatbort.txt
+python example_chatbot.py -d <path_to_model_files> -un "Jeff" -p prompt_chatbort.txt
 
 ## Web UI
 
@@ -120,11 +120,11 @@ docker run --gpus all -p 5000:5000 -v <path_to_model_files>:/app/model/ --rm -it
 |----------|------|-------|-----------------|----------------------|-----------|------------|---------|---------|------|
 | Llama | 7B | 128 | no | 2,048 t | 5,194 MB | 13,918 t/s | 173 t/s | 140 t/s | 6.45 |
 | Llama | 13B | 128 | no | 2,048 t | 9,127 MB | 7,507 t/s | 102 t/s | 86 t/s | 5.60 |
-| Llama | 30B | 128 | no | 2,048 t | 20,795 MB | 2,959 t/s | 47 t/s | 40 t/s | 4.60 |
-| Llama | 30B | 128 | yes | 2,048 t | 20,795 MB | 2,784 t/s | 45 t/s | 37 t/s | 4.55 |
-| Llama | 30B | 32 | yes | 1,550 t <sup>1</sup> | 21,486 MB | 2,636 t/s | 41 t/s | 37 t/s | 4.52 |
+| Llama | 33B | 128 | no | 2,048 t | 20,795 MB | 2,959 t/s | 47 t/s | 40 t/s | 4.60 |
+| Llama | 33B | 128 | yes | 2,048 t | 20,795 MB | 2,784 t/s | 45 t/s | 37 t/s | 4.55 |
+| Llama | 33B | 32 | yes | 1,550 t <sup>1</sup> | 21,486 MB | 2,636 t/s | 41 t/s | 37 t/s | 4.52 |
 | Koala | 13B | 128 | yes | 2,048 t | 9,127 MB | 5,529 t/s | 93 t/s | 79 t/s | 6.73 |
-| WizardLM | 30B | - | no <sup>2</sup> | 2,048 t | 20,199 MB | 2,313 t/s | 47 t/s | 40 t/s | 5.75 |
+| WizardLM | 33B | - | no <sup>2</sup> | 2,048 t | 20,199 MB | 2,313 t/s | 47 t/s | 40 t/s | 5.75 |
 
 <sup>1</sup> Can not achieve full sequence length without OoM (yet)
 <sup>2</sup> Not quite sure if this is act-order or not. Weights have no group index, at least
@@ -156,16 +156,16 @@ following benchmarks are from a 4090 + 3090-Ti with `-gs 17.2,24`:
 
 ### Testing long sequences
 
-The following tests were all done on **30B/65B, 4bit 128g** with various settings, just to test the max sequence length
+The following tests were all done on **33B/65B, 4bit 128g** with various settings, just to test the max sequence length
 and get a sense of what can be achieved with different or multiple GPUs right now. Llama goes incoherent generating
 past 2048 tokens anyway, but with some fine-tuning, who knows? Note that these tests were run a while ago and the
 speeds are no longer current.
 
 | | Size | Seq. len. | VRAM | Long seq. | Ind. |
 |------------------------|------|-----------|----------------------|-----------|--------|
-| 4090/24GB | 30B | 2,516 t | 22,145 MB | 1140 t/s | 28 t/s |
-| 4090/24GB + 3070Ti/8GB | 30B | 3,932 t | 22,055 MB + 7,377 MB | 840 t/s | 22 t/s |
-| A6000/48GB (headless) | 30B | 9,032 t | 46,863 MB | 645 t/s | 12 t/s |
+| 4090/24GB | 33B | 2,516 t | 22,145 MB | 1140 t/s | 28 t/s |
+| 4090/24GB + 3070Ti/8GB | 33B | 3,932 t | 22,055 MB + 7,377 MB | 840 t/s | 22 t/s |
+| A6000/48GB (headless) | 33B | 9,032 t | 46,863 MB | 645 t/s | 12 t/s |
 | A100/80GB (headless) | 65B | 9,520 t | 79,009 MB | 650 t/s | 9 t/s |
 
 ## Todo
@@ -197,18 +197,24 @@ for individual tokens, but benchmarks updated anyway. Closing in on 10k tokens/s
 rewrite at some point to make the client-side code less seizure-inducing. It has multibot mode, chat rewind and editing
 features, sessions, and more. I'm going to build it out with support for instruct prompting and such, in time.
 
-**2024-06-04**: Refactored a whole bunch to move more of the work into the extension, setting up for more tuning
+**2023-06-04**: Refactored a whole bunch to move more of the work into the extension, setting up for more tuning
 options to come soon and eventually auto tuning. Also optimized a little, for about a 5% speedup.
 
-**2024-06-06**: Some minor optimizations. Also it should now compile the extension more easily and run more seamlessly
+**2023-06-06**: Some minor optimizations. Also it should now compile the extension more easily and run more seamlessly
 on Windows.
 
-**2024-06-09**: Fused most of the self-attention step. More to come. Slight speedup already, but more importantly went
+**2023-06-09**: Fused most of the self-attention step. More to come. Slight speedup already, but more importantly went
 from 69% actual CPU utilization to 37%. This should do a lot to address the bottleneck on CPUs with lower
 single-threaded performance.
 
-**2024-06-10**: Docker support now! And some minor optimizations. Cleaned up the project a bit.
+**2023-06-10**: Docker support now! And some minor optimizations. Cleaned up the project a bit.
 
-**2024-06-11**: Added some concurrency a couple of places. It's only beneficial on the 4090, on small models where the
+**2023-06-11**: Added some concurrency a couple of places. It's only beneficial on the 4090, on small models where the
 cores are somewhat underutilized and the L2 cache can keep up. For the 3090 it's detrimental to performance, so it's
-disabled by default. YMMV. Use `-cs` to try it out.
+disabled by default. YMMV. Use `-cs` to try it out.
+
+**2023-06-17**: Fixed a nasty bug in the fused attention that was causing slightly incorrect cache states on 13B and
+33B models. You definitely want to update.
+
+**2023-06-18**: LoRA support now. Still needs a lot of testing and some optimization, and currently you can't stack
+multiple LoRAs during the same inference. There's also no support in the web UI yet.

cuda_ext.py

Lines changed: 159 additions & 0 deletions
@@ -0,0 +1,159 @@
# from abc import ABC
import torch
from torch.cuda.amp import custom_bwd, custom_fwd
from torch.utils.cpp_extension import load
import os
import sys
import platform

library_dir = os.path.dirname(os.path.abspath(__file__))
extension_name = "exllama_ext"
verbose = False

# another kludge to get things compiling in Windows
windows = os.name == "nt"
if windows:

    def find_msvc():

        # Check the standard Visual Studio 2017-2022 install locations, newest version first
        for msvc_dir in [a + "\\Microsoft Visual Studio\\" + b + "\\" + c + "\\VC\\Tools\\MSVC\\"
                         for b in ["2022", "2019", "2017"]
                         for a in [os.environ["ProgramW6432"], os.environ["ProgramFiles(x86)"]]
                         for c in ["BuildTools", "Community", "Professional", "Enterprise", "Preview"]
                         ]:
            if not os.path.exists(msvc_dir):
                continue
            versions = sorted(os.listdir(msvc_dir), reverse = True)
            for version in versions:
                compiler_dir = msvc_dir + version + "\\bin\\Hostx64\\x64"
                if os.path.exists(compiler_dir) and os.path.exists(compiler_dir + "\\cl.exe"):
                    return compiler_dir
        return None

    # If cl.exe isn't already on the path, inject the MSVC compiler directory
    import subprocess
    try:
        subprocess.check_output(["where", "cl"])
    except subprocess.CalledProcessError as e:
        cl_path = find_msvc()
        if cl_path:
            print("Injected compiler path:", cl_path)
            os.environ["path"] += ";" + cl_path
        else:
            print("Unable to find cl.exe; compilation will probably fail.")

exllama_ext = load(
    name = extension_name,
    sources = [
        os.path.join(library_dir, "exllama_ext/exllama_ext.cpp"),
        os.path.join(library_dir, "exllama_ext/cuda_buffers.cu"),
        os.path.join(library_dir, "exllama_ext/cuda_func/q4_matrix.cu"),
        os.path.join(library_dir, "exllama_ext/cuda_func/q4_matmul.cu"),
        os.path.join(library_dir, "exllama_ext/cuda_func/column_remap.cu"),
        os.path.join(library_dir, "exllama_ext/cuda_func/rms_norm.cu"),
        os.path.join(library_dir, "exllama_ext/cuda_func/rope.cu"),
        os.path.join(library_dir, "exllama_ext/cuda_func/half_matmul.cu"),
        os.path.join(library_dir, "exllama_ext/cuda_func/q4_attn.cu"),
        os.path.join(library_dir, "exllama_ext/cuda_func/q4_mlp.cu"),
        os.path.join(library_dir, "exllama_ext/cpu_func/rep_penalty.cpp")
    ],
    extra_include_paths = [os.path.join(library_dir, "exllama_ext")],
    verbose = verbose,
    extra_ldflags = ["cublas.lib"] if windows else [],
    extra_cuda_cflags = ["-lineinfo"] + (["-U__HIP_NO_HALF_CONVERSIONS__", "-O3"] if torch.version.hip else []),
    extra_cflags = ["-O3"]
    # extra_cflags = ["-ftime-report", "-DTORCH_USE_CUDA_DSA"]
)

# from exllama_ext import set_tuning_params
# from exllama_ext import prepare_buffers
from exllama_ext import make_q4
from exllama_ext import q4_matmul
from exllama_ext import q4_matmul_lora
from exllama_ext import half_matmul
from exllama_ext import half_matmul_cublas
# from exllama_ext import q4_mlp
from exllama_ext import rms_norm
from exllama_ext import rope_
from exllama_ext import rep_penalty


# Dummy tensor to pass instead of g_idx since there is no way to pass "None" to a C++ extension

none_tensor = torch.empty((1, 1), device = "meta")


# Construct Q4Matrix, return handle

def ext_make_q4(qweight, qzeros, scales, g_idx, device):

    return make_q4(qweight,
                   qzeros,
                   scales,
                   g_idx if g_idx is not None else none_tensor,
                   device)


# Matrix multiplication, returns x @ q4

def ext_q4_matmul(x, q4, q4_width, lora_A = None, lora_B = None):

    outshape = x.shape[:-1] + (q4_width,)
    x = x.view(-1, x.shape[-1])
    output = torch.empty((x.shape[0], q4_width), dtype = torch.float16, device = x.device)

    if lora_A is None:
        q4_matmul(x, q4, output)
    else:
        # Low-rank LoRA path: extra temp buffer holds x @ lora_A before projecting up with lora_B
        lora_temp = torch.empty((x.shape[0], lora_A.shape[1]), dtype = torch.float16, device = x.device)
        q4_matmul_lora(x, q4, output, lora_A, lora_B, lora_temp)

    return output.view(outshape)


# Matrix multiplication, returns x @ w, both half-precision tensors

def ext_half_matmul(x, w, cublas = False):

    outshape = x.shape[:-1] + (w.shape[1],)
    x = x.view(-1, x.shape[-1])

    if cublas:
        output = torch.empty((x.shape[0], w.shape[1]), dtype = torch.float16, device = x.device)
        half_matmul_cublas(x, w, output)
    else:
        output = torch.zeros((x.shape[0], w.shape[1]), dtype = torch.float16, device = x.device)
        half_matmul(x, w, output)

    return output.view(outshape)


# RoPE embeddings, in-place

def ext_rope_(x, sin, cos, past_len, num_heads, head_dim):

    rope_(x, sin, cos, past_len, num_heads, head_dim)


# RMS norm: x = x * w / sqrt(row_mean(x * x) + epsilon)

def ext_rms_norm(x, w, epsilon):

    outshape = x.shape
    x = x.view(-1, x.shape[-1])
    output = torch.empty_like(x)
    rms_norm(x, w, output, epsilon)

    return output.view(outshape)

# In-place variant of the above, overwrites x

def ext_rms_norm_(x, w, epsilon):

    outshape = x.shape
    x = x.view(-1, x.shape[-1])
    rms_norm(x, w, x, epsilon)


# Repetition penalty

def ext_rep_penalty_mask_cpu(vocab_size, sequence, penalty_max, sustain, decay):

    rep_mask = torch.empty(vocab_size, dtype = torch.float32)
    rep_penalty(sequence, rep_mask, penalty_max, sustain, decay)
    return rep_mask

datasets/download_datasets.py

Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
# import torch
# from tokenizer import ExLlamaTokenizer
from datasets import load_dataset
import os

# Download samples from HF datasets to run a GPTQ-for-LLaMa-equivalent benchmark

def download_hf(filename, dataset, subset, split, key, div):

    print(f"Downloading from {dataset}: {subset}, split: {split} ...")
    hf_dataset = load_dataset(dataset, subset, split = split)
    data = div.join(hf_dataset[key])

    with open(filename, "w") as f:
        f.write(data)

download_hf("wikitext2.txt", "wikitext", "wikitext-2-raw-v1", "test", "text", "\n\n")
download_hf("ptb.txt", "ptb_text_only", "penn_treebank", "validation", "sentence", "\n\n")
download_hf("ptb_new.txt", "ptb_text_only", "penn_treebank", "test", "sentence", " ")
File renamed without changes.

doc/TODO.md

Lines changed: 11 additions & 6 deletions
@@ -3,8 +3,7 @@
 - [x] Support for act-order models ~~(a bit slow for now)~~
 - [x] ~~Support for v1 models without groupsize~~ Nah.
 - [x] Test more models
-- [ ] Consider support for loading GGML models
-- [ ] Utility to scan and validate .safetensors files
+- [x] Consider support for loading GGML models (not feasible)
 - [x] Figure out if there are quantized models with irregular groupsize (there are some at least with no groupsize)
 
 ## GPU compatibility (etc.)
@@ -22,8 +21,9 @@
 
 ## Testing
 
-- [ ] Figure out an apples-to-apples way of comparing perplexity with other implementations
+- [x] Figure out an apples-to-apples way of comparing perplexity with other implementations
 - [ ] Compile charts of inference speed vs context length for variety of models, compare to other implementations
+- [ ] Test a bunch of LoRAs to make sure all combinations of rank and target layers work
 
 ## VRAM optimization
 
@@ -41,27 +41,30 @@
 - [x] ~~Build attention mask in CUDA rather than PyTorch~~
 - [x] ~~Disable attention mask when it isn't needed~~ (not possible with SDP)
 - [x] Figure out why inference appears to be CPU-bound (kernel launch overhead)
-- [ ] Reduce no. kernel launches to minimum (tail launch, fusion etc.)
+- [x] Reduce no. kernel launches to minimum (tail launch, fusion etc.)
 - [x] Measure PyTorch module overhead (negligible in eval mode)
 - [x] Examine if scaled_dot_product_attention is actually the best attention method for single tokens (it's not)
 - [ ] Implement attention in CUDA
 - [x] Rewrite at least the quantized matmul kernel. Should be a bunch of special cases to consider
 - [x] Experiment with concurrent streams where possible (fused MLP and QKV proj.)
+- [x] Faster low-rank matmul to speed up LoRAs
 
 ## Generation
 
 - [x] Memory-efficient beam search implementation
 - [ ] Optimized beam search
 - [ ] Multi-token censoring/de-censoring
 - [ ] Multi-token repetition penalties
-- [ ] (Multi) LoRA support
+- [x] (Multi) LoRA support
+- [ ] Allow stackable LoRAs
 - [x] Guided generation (chat with multiple bots at once, etc.)
 - [ ] Multiple chat modes with prompt templates (instruct, etc.)
+- [ ] Batched generation
 
 ## Interface
 
 - [x] Simple web interface?
-- [ ] API server
+- [ ] API server
 
 ## Web UI
 
@@ -71,9 +74,11 @@
 - [ ] Make it a little prettier
 - [ ] Test various edge cases
 - [ ] Better error handling
+- [ ] LoRA controls
 
 ## ??
 
+- [ ] FP8/FP16 overlays
 - [ ] Allow for backpropagation
 - [ ] LoRA training features
 - [ ] Soft prompt training

doc/model_compatibility.md

Lines changed: 3 additions & 0 deletions
@@ -9,16 +9,19 @@ As of **2023-05-24**, the following GPTQ models on HuggingFace all appear to be
 - Neko-Institute-of-Science/LLaMA-65B-4bit-32g
 - Neko-Institute-of-Science/LLaMA-65B-4bit-128g
 - reeducator/bluemoonrp-13b
+- reeducator/bluemoonrp-30b
 - TehVenom/Metharme-13b-4bit-GPTQ
 - TheBloke/airoboros-13B-GPTQ
 - TheBloke/gpt4-x-vicuna-13B-GPTQ
 - TheBloke/GPT4All-13B-snoozy-GPTQ
 - TheBloke/guanaco-33B-GPTQ
+- TheBloke/guanaco-65B-GPTQ
 - TheBloke/h2ogpt-oasst1-512-30B-GPTQ <sup>1</sup>
 - TheBloke/koala-13B-GPTQ-4bit-128g
 - TheBloke/Manticore-13B-GPTQ
 - TheBloke/medalpaca-13B-GPTQ-4bit
 - TheBloke/medalpaca-13B-GPTQ-4bit (compat version)
+- TheBloke/Nous-Hermes-13B-GPTQ
 - TheBloke/tulu-30B-GPTQ
 - TheBloke/vicuna-13B-1.1-GPTQ-4bit-128g
 - TheBloke/VicUnlocked-30B-LoRA-GPTQ

example_basic.py

Lines changed: 45 additions & 0 deletions
@@ -0,0 +1,45 @@
from model import ExLlama, ExLlamaCache, ExLlamaConfig
from tokenizer import ExLlamaTokenizer
from generator import ExLlamaGenerator
import os, glob

# Directory containing model, tokenizer, generator

model_directory = "/mnt/str/models/llama-13b-4bit-128g/"

# Locate files we need within that directory

tokenizer_path = os.path.join(model_directory, "tokenizer.model")
model_config_path = os.path.join(model_directory, "config.json")
st_pattern = os.path.join(model_directory, "*.safetensors")
model_path = glob.glob(st_pattern)[0]

# Create config, model, tokenizer and generator

config = ExLlamaConfig(model_config_path)               # create config from config.json
config.model_path = model_path                          # supply path to model weights file

model = ExLlama(config)                                 # create ExLlama instance and load the weights
tokenizer = ExLlamaTokenizer(tokenizer_path)            # create tokenizer from tokenizer model file

cache = ExLlamaCache(model)                             # create cache for inference
generator = ExLlamaGenerator(model, tokenizer, cache)   # create generator

# Configure generator

generator.disallow_tokens([tokenizer.eos_token_id])

generator.settings.token_repetition_penalty_max = 1.2
generator.settings.temperature = 0.95
generator.settings.top_p = 0.65
generator.settings.top_k = 100
generator.settings.typical = 0.5

# Produce a simple generation

prompt = "Once upon a time,"
print(prompt, end = "")

output = generator.generate_simple(prompt, max_new_tokens = 200)

print(output[len(prompt):])
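As a usage note, the same generator object can serve several prompts in a row. A small follow-on sketch, assuming (as the example implies) that generate_simple() starts a fresh generation on each call; the prompts and settings below are illustrative and not part of this commit:

# Follow-on sketch reusing the generator set up in example_basic.py
prompts = [
    "Once upon a time,",
    "The three most important things about quantized inference are",
]

for p in prompts:
    generator.settings.temperature = 0.7                  # settings can be adjusted between calls
    text = generator.generate_simple(p, max_new_tokens = 50)
    print("-----")
    print(text)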
File renamed without changes.
