[Core] Default to using per_token quantization for fp8 when cutlass is supported. (#8651)
Signed-off-by: mgoin <michael@neuralmagic.com>
Co-authored-by: Michael Goin <mgoin@redhat.com>
Co-authored-by: mgoin <michael@neuralmagic.com>
@@ -355,7 +355,8 @@ class Fp8LinearMethod(LinearMethodBase):
             input_scale=layer.input_scale,
             bias=bias,
             cutlass_fp8_supported=self.cutlass_fp8_supported,
-            use_per_token_if_dynamic=False)
+            # Default to using per_token quantization if cutlass is supported
+            use_per_token_if_dynamic=self.cutlass_fp8_supported)


 class Fp8MoEMethod(FusedMoEMethodBase):
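To illustrate why the commit flips the default, the sketch below contrasts per-tensor and per-token dynamic scaling. It is a hypothetical, dependency-free illustration, not vLLM's actual fp8 kernel: with one scale for the whole tensor, a single large-magnitude token forces every other token to be quantized coarsely, while per-token scaling lets each row keep its own dynamic range. The function names and the pure-Python layout are assumptions for clarity.

```python
# Hypothetical sketch (NOT vLLM's implementation): per-tensor vs per-token
# dynamic scale computation for fp8 e4m3, whose max finite value is 448.0.
FP8_E4M3_MAX = 448.0

def per_tensor_scale(x):
    # One scale for the entire activation tensor: the global max
    # magnitude dictates precision for every token.
    amax = max(abs(v) for row in x for v in row)
    return amax / FP8_E4M3_MAX

def per_token_scales(x):
    # One scale per row (token): a small-magnitude token is no longer
    # penalized by an unrelated outlier token elsewhere in the batch.
    return [max(abs(v) for v in row) / FP8_E4M3_MAX for row in x]

# Two tokens with very different magnitudes.
x = [[0.5, -0.25], [100.0, -50.0]]
print(per_tensor_scale(x))   # single scale, dominated by the large token
print(per_token_scales(x))   # each token gets its own, tighter scale
```

Per-token scaling costs an extra reduction per row, which is why the change is gated on cutlass fp8 support: the cutlass path can apply row-wise scales efficiently at the GEMM epilogue.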