Commit

Merge upstream changes, fix conflict
0cc4m committed Jun 27, 2023
2 parents ecfea95 + a01b25c commit 53fb462
Showing 38 changed files with 845 additions and 985 deletions.
5 changes: 3 additions & 2 deletions .env
@@ -1,4 +1,5 @@
PORT=5000
RUN_UID=1000 # set to 0 to run the service as root inside the container
APPLICATION_STATE_PATH=/data # path to the directory holding application state inside the container
MODEL_PATH=models/LLaMA-7B-4bit-128g # replace with the actual model path on the host
CONTAINER_MODEL_PATH=/app/model
SESSIONS_PATH=./exllama_sessions
SESSIONS_PATH=~/exllama_sessions # replace with the actual directory on the host where chat sessions should be stored
23 changes: 16 additions & 7 deletions Dockerfile
@@ -1,22 +1,31 @@
FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04 as build

ENV RUN_UID=1000
ARG RUN_UID="1000" \
APPLICATION_STATE_PATH="/data"
ENV RUN_UID=$RUN_UID \
APPLICATION_STATE_PATH=$APPLICATION_STATE_PATH \
CONTAINER_MODEL_PATH=$APPLICATION_STATE_PATH/model \
CONTAINER_SESSIONS_PATH=$APPLICATION_STATE_PATH/exllama_sessions

RUN apt-get update && \
DEBIAN_FRONTEND=noninteractive apt-get install -y ninja-build python3 python3-pip && \
rm -rf /var/lib/apt/lists/*

# Setup user which will run the service
RUN useradd -m -u $RUN_UID user
USER user
# Setup user which will run the service and create application state directory
RUN if [ ${RUN_UID} -ne 0 ] ; then useradd -m -u $RUN_UID user ; fi \
&& mkdir -p $APPLICATION_STATE_PATH \
&& mkdir -p $CONTAINER_MODEL_PATH \
&& mkdir -p $CONTAINER_SESSIONS_PATH \
&& chown -R $RUN_UID $APPLICATION_STATE_PATH
USER $RUN_UID

COPY --chown=user . /app
COPY --chown=$RUN_UID . /app

WORKDIR /app

# Create application state directory and install python packages
RUN pip install --upgrade pip setuptools wheel \
&& pip install -r requirements.txt \
&& pip install flask==2.3.2
&& pip install -r requirements-web.txt

USER root
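With `RUN_UID` and `APPLICATION_STATE_PATH` declared as build arguments, both can be overridden when the image is built, e.g. `docker build --build-arg RUN_UID=1001 --build-arg APPLICATION_STATE_PATH=/data -t exllama-web .` (the values here are only illustrative).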

44 changes: 21 additions & 23 deletions README.md
@@ -12,11 +12,16 @@ incompatibilities with older cards.

## Dependencies

* Python 3.9 or newer
* `torch` tested on 2.0.1 and 2.1.0 (nightly) with cu118
* `safetensors` 0.3.1
* `sentencepiece`
* `ninja`
* `flask` (only for the web UI)

Additionally, only for the web UI:

* `flask`
* `waitress`

## Linux/WSL prerequisites

@@ -30,7 +35,7 @@ To run on Windows (without WSL):
Studio 2022` IDE, or alternatively just the `Build Tools for Visual Studio 2022` package (make sure `Desktop
development with C++` is ticked in the installer), it doesn't really matter which.
2. Install the appropriate version of [PyTorch](https://pytorch.org/get-started/locally/), choosing one of the CUDA
versions. I am developing on the nightly build, but the stable version should also work.
versions. I am developing on the nightly build, but the stable version (2.0.1) should also work.
3. Install CUDA Toolkit, ([11.7](https://developer.nvidia.com/cuda-11-7-0-download-archive) and
[11.8](https://developer.nvidia.com/cuda-11-8-0-download-archive) both seem to work, just make sure to match PyTorch's
Compute Platform version).
@@ -40,7 +45,7 @@ Compute Platform version).

Install dependencies, clone repo and run benchmark:

pip install safetensors sentencepiece ninja
pip install -r requirements.txt

git clone https://github.com/turboderp/exllama
cd exllama
@@ -65,11 +70,12 @@ multibot mode:

To run it:

pip install flask
pip install -r requirements-web.txt

python webui/app.py -d <path_to_model_files>

Note that sessions are stored in `~/exllama_sessions/`.
Note that sessions are stored in `~/exllama_sessions/`. You can change the location of the sessions storage with `-sd`
if you want.
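
For example, combining the two flags, something like `python webui/app.py -d <path_to_model_files> -sd <path_to_session_dir>` should work (the `-sd` path is a placeholder of your choosing).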

## Docker
For security benefits and easier deployment, it is also possible to run the web UI in an isolated docker container. Note: the docker image currently only supports NVIDIA GPUs.
@@ -91,7 +97,13 @@ docker compose build
It is also possible to manually build the image:

```
docker build -t exllama-web
docker build -t exllama-web .
```

NOTE: by default, the service inside the docker container is run by a non-root user. Hence, the ownership of bind-mounted directories (`/data/model` and `/data/exllama_sessions` in the default `docker-compose.yml` file) is changed to this non-root user in the container entrypoint (`entrypoint.sh`). To disable this, set `RUN_UID=0` in the `.env` file if using `docker compose`, or the following command if you manually build the image:

```
docker build -t exllama-web --build-arg RUN_UID=0 .
```

### Run
@@ -109,7 +121,7 @@ The configuration can be viewed in `docker-compose.yml` and changed by creating
Run manually:

```
docker run --gpus all -p 5000:5000 -v <path_to_model_files>:/app/model/ --rm -it exllama-web --host 0.0.0.0:5000
docker run --gpus all -p 5000:5000 -v <path_to_model_dir>:/data/model/ -v <path_to_session_dir>:/data/exllama_sessions --rm -it exllama-web --host 0.0.0.0:5000
```


@@ -179,20 +191,6 @@ confirmed to be working right now.

## Recent updates

**2023-05-24**: Added fused rotary embeddings and some minor optimizations. 13% faster on 7B, 9% on 13B. Small
improvement on larger models. Added best-case scores to benchmark results and some clarification. For easier
comparisons to other implementations, or whatever.

**2023-05-27**: Better memory management in CUDA. Introduced auto switch between Torch's SDP backend and regular
matmul attention with some tweaks. Finished CUDA MLP. All in all about 10% faster with these updates.

**2023-05-29**: Web UI is _almost_ up and running. Having to learn JavaScript, and it turns out I hate JavaScript. But
ChatGPT is an incredible resource for learning new languages, I gotta say, so it's not as painful as it could have
been. Anyway, in the process of working with the UI I discovered I've been measuring prompt speed incorrectly. Either
Torch or CUDA or the GPU driver does some sort of caching or self-calibration or lazy initialization during the first
pass through the model, so subsequent passes are actually _way_ faster than what I've been recording. Doesn't do much
for individual tokens, but benchmarks updated anyway. Closing in on 10k tokens/second for 7B. (!)

**2023-06-02**: Web UI is now in a fairly working state. Expect it to be a little scuffed in places. There will be a
rewrite at some point to make the client-side code less seizure-inducing. It has multibot mode, chat rewind and editing
features, sessions, and more. I'm going to build it out with support for instruct prompting and such, in time.
@@ -216,5 +214,5 @@ disabled by default. YMMV. Use `-cs` to try it out.
**2023-06-17**: Fixed a nasty bug in the fused attention that was causing slightly incorrect cache states on 13B and
33B models. You definitely want to update.

**2023-06-18**: LoRA support now. Still needs a lot of testing and som optimization, and currently you can't stack
multiple LoRAs during the same inference. There's also no support in the web UI yet.
**2023-06-18**: LoRA support now. Still needs a lot of testing and some optimization, and currently you can't stack
multiple LoRAs during the same inference. There's also no support in the web UI yet.
16 changes: 12 additions & 4 deletions cuda_ext.py
@@ -30,14 +30,15 @@ def find_msvc():

import subprocess
try:
subprocess.check_output(["where", "cl"])
subprocess.check_output(["where", "/Q", "cl"])
except subprocess.CalledProcessError as e:
cl_path = find_msvc()
if cl_path:
print("Injected compiler path:", cl_path)
if verbose:
print("Injected compiler path:", cl_path)
os.environ["path"] += ";" + cl_path
else:
print("Unable to find cl.exe; compilation will probably fail.")
print("Unable to find cl.exe; compilation will probably fail.", file=sys.stderr)

exllama_ext = load(
name = extension_name,
@@ -56,7 +57,7 @@ def find_msvc():
],
extra_include_paths = [os.path.join(library_dir, "exllama_ext")],
verbose = verbose,
extra_ldflags = ["cublas.lib"] if windows else [],
extra_ldflags = (["cublas.lib"] + ([f"/LIBPATH:{os.path.join(sys.base_prefix, 'libs')}"] if sys.base_prefix != sys.prefix else [])) if windows else [],
extra_cuda_cflags = ["-lineinfo"] + (["-U__HIP_NO_HALF_CONVERSIONS__", "-O3"] if torch.version.hip else []),
extra_cflags = ["-O3"]
# extra_cflags = ["-ftime-report", "-DTORCH_USE_CUDA_DSA"]
@@ -73,6 +74,7 @@ def find_msvc():
from exllama_ext import rms_norm
from exllama_ext import rope_
from exllama_ext import rep_penalty
from exllama_ext import apply_rep_penalty


# Dummy tensor to pass instead of g_idx since there is no way to pass "None" to a C++ extension
@@ -157,3 +159,9 @@ def ext_rep_penalty_mask_cpu(vocab_size, sequence, penalty_max, sustain, decay):
rep_mask = torch.empty(vocab_size, dtype = torch.float32)
rep_penalty(sequence, rep_mask, penalty_max, sustain, decay)
return rep_mask


def ext_apply_rep_penalty_mask_cpu(sequence, penalty_max, sustain, decay, logits):

apply_rep_penalty(sequence, penalty_max, sustain, decay, logits)
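
The new `ext_apply_rep_penalty_mask_cpu` wrapper hands the logits tensor straight to the extension instead of returning a mask for the caller to combine. A minimal usage sketch is below; it assumes the compiled `exllama_ext` extension is available, and the tensor shapes, dtypes and penalty values are illustrative rather than taken from this commit:

```
import torch
import cuda_ext  # the module patched in this diff; needs the built extension

vocab_size = 32000
sequence = torch.tensor([[1, 5, 42, 5, 7]], dtype=torch.long)  # token ids generated so far
logits = torch.randn(vocab_size, dtype=torch.float32)          # raw next-token logits

# Existing helper: build a repetition-penalty mask on the CPU and leave it to
# the caller to combine the mask with the logits.
rep_mask = cuda_ext.ext_rep_penalty_mask_cpu(vocab_size, sequence, 1.15, 256, 128)

# New helper added here: the extension applies the penalty to the logits
# tensor that is passed in, so no separate mask has to be handled by the caller.
cuda_ext.ext_apply_rep_penalty_mask_cpu(sequence, 1.15, 256, 128, logits)
```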

7 changes: 0 additions & 7 deletions cuda_test/compile.sh

This file was deleted.
