
Commit 630800e

v1.3.4
1 parent 529d7c2

File tree

8 files changed: +86 -98 lines

Cargo.lock

Lines changed: 75 additions & 86 deletions
Some generated files are not rendered by default.

Cargo.toml

Lines changed: 1 addition & 1 deletion
@@ -8,7 +8,7 @@ members = [
 ]
 
 [workspace.package]
-version = "1.3.3"
+version = "1.3.4"
 edition = "2021"
 authors = ["Olivier Dehaene"]
 homepage = "https://github.com/huggingface/text-generation-inference"

docs/openapi.json

Lines changed: 1 addition & 1 deletion
@@ -10,7 +10,7 @@
       "name": "Apache 2.0",
       "url": "https://www.apache.org/licenses/LICENSE-2.0"
     },
-    "version": "1.3.3"
+    "version": "1.3.4"
   },
   "paths": {
     "/": {

integration-tests/pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "text-generation-integration-tests"
-version = "1.3.3"
+version = "1.3.4"
 description = "Text Generation Inference integration tests"
 authors = ["Nicolas Patry <[email protected]>"]
 

server/pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "text-generation-server"
-version = "1.3.3"
+version = "1.3.4"
 description = "Text Generation Inference Python gRPC Server"
 authors = ["Olivier Dehaene <[email protected]>"]
 

server/text_generation_server/utils/layers.py

Lines changed: 1 addition & 1 deletion
@@ -39,7 +39,7 @@
     V2 = False
     log_once(
         logger.warning,
-        "Disabling exllama v2 and using v1 instead because there are issues when sharding"
+        "Disabling exllama v2 and using v1 instead because there are issues when sharding",
     )
 
 if os.getenv("DISABLE_EXLLAMA") == "True":
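
The trailing comma is cosmetic, but the message documents real behavior: in a sharded deployment, exllama v2 is turned off in favor of v1. A hypothetical sketch of that gate follows; the EXLLAMA_VERSION and WORLD_SIZE environment variable names are assumptions inferred from the message, and the fallback itself is what the log text describes:

import os

# Hypothetical reconstruction of the gate around this warning; the env var
# names are assumptions, not shown in this hunk.
V2 = os.getenv("EXLLAMA_VERSION", "2") == "2"
if V2 and int(os.getenv("WORLD_SIZE", "1")) > 1:
    # More than one shard: fall back to the exllama v1 kernels.
    V2 = False
    print("Disabling exllama v2 and using v1 instead because there are issues when sharding")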

server/text_generation_server/utils/log.py

Lines changed: 1 addition & 1 deletion
@@ -2,5 +2,5 @@
 
 
 @lru_cache(10)
-def log_once(log, msg:str):
+def log_once(log, msg: str):
     log(msg)
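
The annotation being retouched here also explains the single-shot warnings elsewhere in this commit: functools.lru_cache memoizes log_once on its (log, msg) argument pair, so repeating the same message is a cache hit that never reaches the logger. A minimal sketch, using print as the sink:

from functools import lru_cache


@lru_cache(10)
def log_once(log, msg: str):
    # First call with a given (log, msg) pair runs the body; identical
    # repeat calls are answered from the cache and stay silent.
    log(msg)


log_once(print, "Using exllama kernels")  # printed
log_once(print, "Using exllama kernels")  # cache hit: nothing printed
log_once(print, "Disabling exllama v2")   # new message: printed

Note the cache holds only 10 entries, so an eleventh distinct message evicts the least recently used one, which could then be logged again later.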

server/text_generation_server/utils/weights.py

Lines changed: 5 additions & 6 deletions
@@ -215,7 +215,9 @@ def get_multi_weights_col(self, prefixes: List[str], quantize: str, dim: int):
             bits, groupsize, desc_act = self._get_gptq_params()
             from text_generation_server.utils.layers import HAS_EXLLAMA
 
-            use_exllama = bits == 4 and HAS_EXLLAMA and quantize == "gptq" and not desc_act
+            use_exllama = (
+                bits == 4 and HAS_EXLLAMA and quantize == "gptq" and not desc_act
+            )
             weight = (qweight, qzeros, scales, g_idx, bits, groupsize, use_exllama)
         else:
             w = [self.get_sharded(f"{p}.weight", dim=0) for p in prefixes]

@@ -281,14 +283,11 @@ def get_multi_weights_row(self, prefix: str, quantize: str):
                 if CAN_EXLLAMA:
                     log_once(
                         logger.warning,
-                        "Exllama GPTQ cuda kernels (which are faster) could have been used, but are not currently installed, try using BUILD_EXTENSIONS=True"
+                        "Exllama GPTQ cuda kernels (which are faster) could have been used, but are not currently installed, try using BUILD_EXTENSIONS=True",
                     )
                 use_exllama = False
             else:
-                log_once(
-                    logger.info,
-                    f"Using exllama kernels v{HAS_EXLLAMA}"
-                )
+                log_once(logger.info, f"Using exllama kernels v{HAS_EXLLAMA}")
 
             g_idx = self.get_sharded(f"{prefix}.g_idx", dim=0)
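
Beyond the comma fixes, the first hunk wraps the kernel-selection predicate in parentheses for readability. Pulled out as a standalone function, the rule reads as below; can_use_exllama is a hypothetical name for illustration, not part of the repository:

def can_use_exllama(bits: int, quantize: str, desc_act: bool, has_exllama: bool) -> bool:
    # Mirrors the reformatted predicate: exllama kernels are only taken for
    # 4-bit GPTQ weights without act-order (desc_act), and only when the
    # kernels are actually installed.
    return bits == 4 and has_exllama and quantize == "gptq" and not desc_act


assert can_use_exllama(4, "gptq", False, True)
assert not can_use_exllama(8, "gptq", False, True)  # 8-bit: no exllama
assert not can_use_exllama(4, "awq", False, True)   # non-GPTQ: no exllama
assert not can_use_exllama(4, "gptq", True, True)   # act-order: no exllama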

0 commit comments