[Misc] Allow passing logits_soft_cap for xformers backend (#11252)
Signed-off-by: Isotr0py <[email protected]>
Isotr0py authored Dec 17, 2024 · 1 parent 02222a0 · commit f9ecbb1
Showing 1 changed file with 3 additions and 5 deletions.
8 changes: 3 additions & 5 deletions vllm/attention/backends/xformers.py
```diff
@@ -17,9 +17,7 @@
     is_all_cross_attn_metadata_set, is_all_encoder_attn_metadata_set)
 from vllm.attention.ops.paged_attn import (PagedAttention,
                                            PagedAttentionMetadata)
-from vllm.logger import init_logger
-
-logger = init_logger(__name__)
+from vllm.utils import print_warning_once


 class XFormersBackend(AttentionBackend):
```
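This hunk swaps the module-level logger for `vllm.utils.print_warning_once`, which the second hunk below uses to downgrade the hard error into a one-time warning. As context, a warn-once helper of this kind is usually just a cached wrapper around a logger call; the sketch below illustrates that pattern under that assumption and is not this module's exact source.

```python
# Minimal sketch of a warn-once helper (assumed behavior of
# print_warning_once; not copied from vllm.utils).
import logging
from functools import lru_cache

logger = logging.getLogger(__name__)


@lru_cache(maxsize=None)
def print_warning_once(msg: str) -> None:
    # lru_cache keys on the message string, so each distinct warning
    # is emitted only once per process.
    logger.warning(msg)
```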
```diff
@@ -386,8 +384,8 @@ def __init__(
             raise ValueError(
                 "XFormers does not support block-sparse attention.")
         if logits_soft_cap is not None:
-            raise ValueError(
-                "XFormers does not support attention logits soft capping.")
+            print_warning_once("XFormers does not support logits soft cap. "
+                               "Outputs may be slightly off.")
         self.num_heads = num_heads
         self.head_size = head_size
         self.scale = float(scale)
```
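For reference, logits soft capping bounds the attention scores with a tanh before the softmax, which is why a backend that skips the step can produce slightly different outputs for models trained with a cap. The snippet below is only an illustration of that formula, not code from this commit; the function name and shapes are made up for the example.

```python
# Illustrative sketch of attention logits soft capping (not part of
# this commit): scores are squashed into [-cap, +cap] via tanh before
# the softmax. Kernels that ignore logits_soft_cap, such as the
# xformers path warned about above, omit this transform.
import torch


def soft_cap_logits(scores: torch.Tensor, logits_soft_cap: float) -> torch.Tensor:
    return logits_soft_cap * torch.tanh(scores / logits_soft_cap)


scores = torch.randn(2, 8, 16, 16) * 60.0        # raw Q·K^T / sqrt(d) scores
capped = torch.softmax(soft_cap_logits(scores, logits_soft_cap=50.0), dim=-1)
uncapped = torch.softmax(scores, dim=-1)          # what the warned path computes
```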
