Commit 96a982a

fix: better warmup error
1 parent f9910d1

File tree

1 file changed: +1 −1

server/text_generation_server/models/flash_causal_lm.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -670,7 +670,7 @@ def warmup(self, batch: FlashCausalLMBatch):
                 self.device,
             )
             _, batch = self.generate_token(batch)
-        except Exception as e:
+        except torch.cuda.OutOfMemoryError as e:
             raise RuntimeError(
                 f"Not enough memory to handle {len(batch.input_ids)} prefill tokens. "
                 f"You need to decrease `--max-batch-prefill-tokens`"
```
