[UX] Remove "quantization is not fully optimized yet" log (#25012)
Signed-off-by: mgoin <mgoin64@gmail.com>
This commit is contained in:
@ -1086,22 +1086,6 @@ class ModelConfig:
|
||||
|
||||
def _verify_quantization(self) -> None:
|
||||
supported_quantization = me_quant.QUANTIZATION_METHODS
|
||||
optimized_quantization_methods = [
|
||||
"fp8",
|
||||
"modelopt",
|
||||
"gptq_marlin_24",
|
||||
"gptq_marlin",
|
||||
"awq_marlin",
|
||||
"fbgemm_fp8",
|
||||
"compressed-tensors",
|
||||
"experts_int8",
|
||||
"quark",
|
||||
"modelopt_fp4",
|
||||
"bitblas",
|
||||
"gptq_bitblas",
|
||||
"inc",
|
||||
"petit_nvfp4",
|
||||
]
|
||||
if self.quantization is not None:
|
||||
self.quantization = cast(me_quant.QuantizationMethods,
|
||||
self.quantization)
|
||||
@ -1183,11 +1167,6 @@ class ModelConfig:
|
||||
f"be one of {supported_quantization}.")
|
||||
from vllm.platforms import current_platform
|
||||
current_platform.verify_quantization(self.quantization)
|
||||
if self.quantization not in optimized_quantization_methods:
|
||||
logger.warning(
|
||||
"%s quantization is not fully "
|
||||
"optimized yet. The speed can be slower than "
|
||||
"non-quantized models.", self.quantization)
|
||||
|
||||
def _verify_cuda_graph(self) -> None:
|
||||
# The `max_seq_len_to_capture` was incorrectly
|
||||
|
||||
Reference in New Issue
Block a user