diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py
index d3040e9738..9298d3b58d 100644
--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@@ -631,7 +631,7 @@ def main(args: argparse.Namespace):
     else:
         ensure_divisibility(intermediate_size, args.tp_size, "intermediate_size")
         shard_intermediate_size = 2 * intermediate_size // args.tp_size
-    dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
+    dtype = torch.float16 if current_platform.is_rocm() else config.dtype
     use_fp8_w8a8 = args.dtype == "fp8_w8a8"
     use_int8_w8a16 = args.dtype == "int8_w8a16"
     block_quant_shape = get_weight_block_size_safety(config)
diff --git a/benchmarks/kernels/benchmark_moe_permute_unpermute.py b/benchmarks/kernels/benchmark_moe_permute_unpermute.py
index 04d2205aa3..459eafa6d9 100644
--- a/benchmarks/kernels/benchmark_moe_permute_unpermute.py
+++ b/benchmarks/kernels/benchmark_moe_permute_unpermute.py
@@ -344,7 +344,7 @@ def main(args: argparse.Namespace):
     topk = config.num_experts_per_tok
     hidden_size = config.hidden_size
 
-    dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
+    dtype = torch.float16 if current_platform.is_rocm() else config.dtype
     use_fp8_w8a8 = args.dtype == "fp8_w8a8"
     use_int8_w8a16 = args.dtype == "int8_w8a16"
     use_customized_permute = args.use_customized_permute
diff --git a/docs/features/quantization/auto_round.md b/docs/features/quantization/auto_round.md
index ac766d5e29..9c14f362b6 100644
--- a/docs/features/quantization/auto_round.md
+++ b/docs/features/quantization/auto_round.md
@@ -58,7 +58,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
 from auto_round import AutoRound
 
 model_name = "Qwen/Qwen3-0.6B"
-model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto")
+model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto")
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 
 bits, group_size, sym = 4, 128, True
diff --git a/docs/features/quantization/fp8.md b/docs/features/quantization/fp8.md
index a54acdbb96..0c5111fb8a 100644
--- a/docs/features/quantization/fp8.md
+++ b/docs/features/quantization/fp8.md
@@ -43,7 +43,7 @@ MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
 model = AutoModelForCausalLM.from_pretrained(
     MODEL_ID,
     device_map="auto",
-    torch_dtype="auto",
+    dtype="auto",
 )
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
 ```
diff --git a/docs/features/quantization/int4.md b/docs/features/quantization/int4.md
index 5d8e06ffb5..035e7ea291 100644
--- a/docs/features/quantization/int4.md
+++ b/docs/features/quantization/int4.md
@@ -41,7 +41,7 @@ MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
 model = AutoModelForCausalLM.from_pretrained(
     MODEL_ID,
     device_map="auto",
-    torch_dtype="auto",
+    dtype="auto",
 )
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
 ```
diff --git a/docs/features/quantization/int8.md b/docs/features/quantization/int8.md
index ee1de21460..ec8a77f74f 100644
--- a/docs/features/quantization/int8.md
+++ b/docs/features/quantization/int8.md
@@ -46,7 +46,7 @@ MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
 model = AutoModelForCausalLM.from_pretrained(
     MODEL_ID,
     device_map="auto",
-    torch_dtype="auto",
+    dtype="auto",
 )
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
 ```
diff --git a/docs/features/quantization/quantized_kvcache.md b/docs/features/quantization/quantized_kvcache.md
index e0585a8845..56cf057678 100644
--- a/docs/features/quantization/quantized_kvcache.md
+++ b/docs/features/quantization/quantized_kvcache.md
@@ -82,7 +82,7 @@ Here's a complete example using `meta-llama/Llama-3.1-8B-Instruct` (most models
 
     # Select model and load it
     MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct"
-    model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto", torch_dtype="auto")
+    model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto", dtype="auto")
     tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
 
     # Select calibration dataset
diff --git a/docs/features/quantization/quark.md b/docs/features/quantization/quark.md
index f0cd20b733..385e3bbb87 100644
--- a/docs/features/quantization/quark.md
+++ b/docs/features/quantization/quark.md
@@ -50,7 +50,7 @@ to fetch model and tokenizer.
 model = AutoModelForCausalLM.from_pretrained(
     MODEL_ID,
     device_map="auto",
-    torch_dtype="auto",
+    dtype="auto",
 )
 
 model.eval()
diff --git a/docs/features/quantization/torchao.md b/docs/features/quantization/torchao.md
index 6932445997..b95b560882 100644
--- a/docs/features/quantization/torchao.md
+++ b/docs/features/quantization/torchao.md
@@ -27,7 +27,7 @@ You can quantize your own huggingface model with torchao, e.g. [transformers](ht
 quantization_config = TorchAoConfig(Int8WeightOnlyConfig())
 quantized_model = AutoModelForCausalLM.from_pretrained(
     model_name,
-    torch_dtype="auto",
+    dtype="auto",
     device_map="auto",
     quantization_config=quantization_config
 )
diff --git a/requirements/common.txt b/requirements/common.txt
index ec668e16d0..5e7769561c 100644
--- a/requirements/common.txt
+++ b/requirements/common.txt
@@ -7,7 +7,7 @@ requests >= 2.26.0
 tqdm
 blake3
 py-cpuinfo
-transformers >= 4.55.2
+transformers >= 4.56.0
 tokenizers >= 0.21.1 # Required for fast incremental detokenization.
 protobuf # Required by LlamaTokenizer.
 fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint.
diff --git a/tests/conftest.py b/tests/conftest.py
index 9126b3d668..369acb92cf 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -334,7 +334,7 @@ class HfRunner:
             trust_remote_code=trust_remote_code,
         )
         self.device = self.get_default_device()
-        self.dtype = torch_dtype = _get_and_verify_dtype(
+        self.dtype = dtype = _get_and_verify_dtype(
             self.model_name,
             self.config,
             dtype=dtype,
@@ -342,7 +342,7 @@ class HfRunner:
         )
 
         model_kwargs = model_kwargs if model_kwargs is not None else {}
-        model_kwargs.setdefault("torch_dtype", torch_dtype)
+        model_kwargs.setdefault("dtype", dtype)
 
         if is_sentence_transformer:
             # Lazy init required for AMD CI
@@ -388,7 +388,7 @@ class HfRunner:
         if not skip_tokenizer_init:
             self.tokenizer = AutoTokenizer.from_pretrained(
                 model_name,
-                torch_dtype=torch_dtype,
+                dtype=dtype,
                 trust_remote_code=trust_remote_code,
             )
 
@@ -398,7 +398,7 @@ class HfRunner:
 
         self.processor = AutoProcessor.from_pretrained(
             model_name,
-            torch_dtype=torch_dtype,
+            dtype=dtype,
             trust_remote_code=trust_remote_code,
         )
         if skip_tokenizer_init:
diff --git a/tests/models/multimodal/pooling/test_intern_vit.py b/tests/models/multimodal/pooling/test_intern_vit.py
index b474e85131..74e30c4307 100644
--- a/tests/models/multimodal/pooling/test_intern_vit.py
+++ b/tests/models/multimodal/pooling/test_intern_vit.py
@@ -38,7 +38,7 @@ def run_intern_vit_test(
     config.norm_type = "rms_norm"
 
     hf_model = AutoModel.from_pretrained(
-        model, torch_dtype=torch_dtype, trust_remote_code=True
+        model, dtype=torch_dtype, trust_remote_code=True
     ).to("cuda")
     hf_outputs_per_image = [
         hf_model(pixel_value.to("cuda")).last_hidden_state
diff --git a/tests/models/multimodal/pooling/test_radio.py b/tests/models/multimodal/pooling/test_radio.py
index 80f594021c..414e99a71e 100644
--- a/tests/models/multimodal/pooling/test_radio.py
+++ b/tests/models/multimodal/pooling/test_radio.py
@@ -45,7 +45,7 @@ def run_radio_test(
     hf_model = AutoModel.from_pretrained(
         model_id,
         config=config,
-        torch_dtype=torch_dtype,
+        dtype=torch_dtype,
         trust_remote_code=True,
     ).to("cuda")
     hf_model.eval()
diff --git a/vllm/benchmarks/throughput.py b/vllm/benchmarks/throughput.py
index 01c6824ac9..ad111a1ebd 100644
--- a/vllm/benchmarks/throughput.py
+++ b/vllm/benchmarks/throughput.py
@@ -251,7 +251,7 @@ def run_hf(
     disable_detokenize: bool = False,
 ) -> float:
     llm = AutoModelForCausalLM.from_pretrained(
-        model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code
+        model, dtype=torch.float16, trust_remote_code=trust_remote_code
     )
     if llm.config.model_type == "llama":
         # To enable padding in the HF backend.
diff --git a/vllm/config/model.py b/vllm/config/model.py
index 2be939eb65..6e5757ba03 100644
--- a/vllm/config/model.py
+++ b/vllm/config/model.py
@@ -1837,18 +1837,18 @@ def _find_dtype(
     *,
     revision: str | None,
 ):
-    # NOTE: getattr(config, "torch_dtype", torch.float32) is not correct
-    # because config.torch_dtype can be None.
-    config_dtype = getattr(config, "torch_dtype", None)
+    # NOTE: getattr(config, "dtype", torch.float32) is not correct
+    # because config.dtype can be None.
+    config_dtype = getattr(config, "dtype", None)
 
     # Fallbacks for multi-modal models if the root config
-    # does not define torch_dtype
+    # does not define dtype
     if config_dtype is None:
-        config_dtype = getattr(config.get_text_config(), "torch_dtype", None)
+        config_dtype = getattr(config.get_text_config(), "dtype", None)
     if config_dtype is None and hasattr(config, "vision_config"):
-        config_dtype = getattr(config.vision_config, "torch_dtype", None)
+        config_dtype = getattr(config.vision_config, "dtype", None)
     if config_dtype is None and hasattr(config, "encoder_config"):
-        config_dtype = getattr(config.encoder_config, "torch_dtype", None)
+        config_dtype = getattr(config.encoder_config, "dtype", None)
 
     # Try to read the dtype of the weights if they are in safetensors format
     if config_dtype is None:
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index e2db9d049a..5883b92acd 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -117,9 +117,8 @@ class LLM:
             execution with tensor parallelism.
         dtype: The data type for the model weights and activations. Currently,
             we support `float32`, `float16`, and `bfloat16`. If `auto`, we use
-            the `torch_dtype` attribute specified in the model config file.
-            However, if the `torch_dtype` in the config is `float32`, we will
-            use `float16` instead.
+            the `dtype` attribute of the Transformers model's config. However,
+            if the `dtype` in the config is `float32`, we will use `float16` instead.
         quantization: The method used to quantize the model weights. Currently,
             we support "awq", "gptq", and "fp8" (experimental). If None, we first
             check the `quantization_config` attribute in the
diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py
index ce5c050697..4ebfba65ac 100644
--- a/vllm/model_executor/model_loader/tensorizer.py
+++ b/vllm/model_executor/model_loader/tensorizer.py
@@ -518,7 +518,7 @@ def init_tensorizer_model(
 ) -> nn.Module:
     assert tensorizer_config.hf_config is not None
     model_args = tensorizer_config.hf_config
-    model_args.torch_dtype = tensorizer_config.dtype
+    model_args.dtype = tensorizer_config.dtype
     assert tensorizer_config.model_class is not None
     # TODO: Do we need to consider old-style model class?
     with meta_tensor_mode(), set_current_vllm_config(vllm_config, check_compile=True):
diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py
index 2ca761dd2b..6f7e18d78b 100644
--- a/vllm/model_executor/models/chameleon.py
+++ b/vllm/model_executor/models/chameleon.py
@@ -999,7 +999,7 @@ class ChameleonForConditionalGeneration(
             return []
         assert self.model.vqmodel is not None
         image_tokens = self.model.get_image_tokens(
-            image_input["data"].to(self.config.torch_dtype)
+            image_input["data"].to(self.config.dtype)
         )
         vision_embeddings = self.model.get_input_embeddings(image_tokens)
         return vision_embeddings
diff --git a/vllm/model_executor/models/ernie45_vl.py b/vllm/model_executor/models/ernie45_vl.py
index f40bd01dec..e5badc0a28 100644
--- a/vllm/model_executor/models/ernie45_vl.py
+++ b/vllm/model_executor/models/ernie45_vl.py
@@ -1089,7 +1089,7 @@ class Ernie4_5VLMultiModalProcessor(BaseMultiModalProcessor[Ernie4_5_VLProcessin
         pixel_values = (
             rescale_factor * pixel_values.to(torch.float32) - image_mean_tensor
         ) / image_std_tensor
-        pixel_values = pixel_values.to(hf_config.torch_dtype)
+        pixel_values = pixel_values.to(hf_config.dtype)
         return pixel_values
 
     def _call_hf_processor(
diff --git a/vllm/model_executor/models/glm4v.py b/vllm/model_executor/models/glm4v.py
index 1bad8b0405..a247ba55c5 100644
--- a/vllm/model_executor/models/glm4v.py
+++ b/vllm/model_executor/models/glm4v.py
@@ -615,7 +615,7 @@ class GLM4VForCausalLM(
         return None
 
     def _process_image_input(self, image_input: GLMVImagePixelInputs) -> torch.Tensor:
-        pixel_values = image_input["data"].to(dtype=self.config.torch_dtype)
+        pixel_values = image_input["data"].to(dtype=self.config.dtype)
 
         return self.transformer.vision(pixel_values)
 
diff --git a/vllm/model_executor/models/longcat_flash.py b/vllm/model_executor/models/longcat_flash.py
index 5d26e1c38e..5671347c00 100644
--- a/vllm/model_executor/models/longcat_flash.py
+++ b/vllm/model_executor/models/longcat_flash.py
@@ -114,7 +114,7 @@ class FlashConfig(PretrainedConfig):
         attention_dropout=0.0,
         mla_scale_q_lora=False,
         mla_scale_kv_lora=False,
-        torch_dtype="bfloat16",
+        dtype="bfloat16",
         params_dtype="bfloat16",
         router_dtype="float32",
         router_bias=False,
@@ -130,7 +130,7 @@ class FlashConfig(PretrainedConfig):
             bos_token_id=bos_token_id,
             eos_token_id=eos_token_id,
             tie_word_embeddings=tie_word_embeddings,
-            torch_dtype=torch_dtype,
+            dtype=dtype,
             params_dtype=params_dtype,
             router_dtype=router_dtype,
             topk_method=topk_method,
diff --git a/vllm/model_executor/models/nano_nemotron_vl.py b/vllm/model_executor/models/nano_nemotron_vl.py
index e874aaa0fc..77d77e7b9f 100644
--- a/vllm/model_executor/models/nano_nemotron_vl.py
+++ b/vllm/model_executor/models/nano_nemotron_vl.py
@@ -987,7 +987,7 @@ class NemotronH_Nano_VL_V2(
             prefix=maybe_prefix(prefix, "language_model"),
         )
         self.vision_model = self.get_vit_model_from_radio_config(config).to(
-            self.language_model.config.torch_dtype
+            self.language_model.config.dtype
         )
 
         # Construct the vision projection.
@@ -1008,7 +1008,7 @@ class NemotronH_Nano_VL_V2(
             ReLUSquaredActivation(),
             nn.Linear(vision_projection_hidden_size, llm_hidden_size, bias=False),
         )
-        self.mlp1 = self.mlp1.to(self.language_model.config.torch_dtype)
+        self.mlp1 = self.mlp1.to(self.language_model.config.dtype)
 
         self.config = config
         self.model_config = vllm_config.model_config
diff --git a/vllm/model_executor/models/qwen3_next.py b/vllm/model_executor/models/qwen3_next.py
index a29def57c4..ac038aa3a9 100644
--- a/vllm/model_executor/models/qwen3_next.py
+++ b/vllm/model_executor/models/qwen3_next.py
@@ -338,7 +338,7 @@ class Qwen3NextGatedDeltaNet(nn.Module, MambaBase):
            group_size=None,
            norm_before_gate=True,
            device=current_platform.current_device(),
-            dtype=config.torch_dtype,
+            dtype=config.dtype,
        )
 
        self.out_proj = RowParallelLinear(
@@ -847,7 +847,7 @@ class Qwen3NextDecoderLayer(nn.Module):
                1,
                1,
                config.hidden_size,
-                dtype=config.torch_dtype,
+                dtype=config.dtype,
            ),
        )
        self.ffn_layer_scale = torch.nn.Parameter(
@@ -855,7 +855,7 @@ class Qwen3NextDecoderLayer(nn.Module):
                1,
                1,
                config.hidden_size,
-                dtype=config.torch_dtype,
+                dtype=config.dtype,
            ),
        )
 
diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py
index 82f5410ece..a8709ea426 100644
--- a/vllm/model_executor/models/transformers.py
+++ b/vllm/model_executor/models/transformers.py
@@ -530,7 +530,7 @@ class TransformersBase(nn.Module, SupportsQuant, SupportsLoRA, SupportsPP):
         with init_on_device_without_buffers("meta"):
             self.model: PreTrainedModel = AutoModel.from_config(
                 self.config,
-                torch_dtype=self.model_config.dtype,
+                dtype=self.model_config.dtype,
                 trust_remote_code=self.model_config.trust_remote_code,
             )
 
diff --git a/vllm/model_executor/models/transformers_pooling.py b/vllm/model_executor/models/transformers_pooling.py
index 7ddeb403da..7063a72748 100644
--- a/vllm/model_executor/models/transformers_pooling.py
+++ b/vllm/model_executor/models/transformers_pooling.py
@@ -157,7 +157,7 @@ class TransformersForSequenceClassification(TransformersPoolingBase):
         with torch.device("meta"):
             seq_cls_model = AutoModelForSequenceClassification.from_config(
                 self.config,
-                torch_dtype=self.model_config.dtype,
+                dtype=self.model_config.dtype,
                 trust_remote_code=self.model_config.trust_remote_code,
             )
 
diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index 04c2bbb438..a6b9df7c14 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -500,8 +500,8 @@ class CudaPlatformBase(Platform):
         return supported
 
     @classmethod
-    def check_if_supports_dtype(cls, torch_dtype: torch.dtype):
-        if torch_dtype == torch.bfloat16:  # noqa: SIM102
+    def check_if_supports_dtype(cls, dtype: torch.dtype):
+        if dtype == torch.bfloat16:  # noqa: SIM102
             if not cls.has_device_capability(80):
                 capability = cls.get_device_capability()
                 gpu_name = cls.get_device_name()
diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py
index f08e62a4aa..f9f2cc4d34 100644
--- a/vllm/platforms/interface.py
+++ b/vllm/platforms/interface.py
@@ -563,7 +563,7 @@ class Platform:
         return False
 
     @classmethod
-    def check_if_supports_dtype(cls, torch_dtype: torch.dtype):
+    def check_if_supports_dtype(cls, dtype: torch.dtype):
         """
         Check if the dtype is supported by the current platform.
         """
diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py
index 8fa07b10d3..b25b968893 100644
--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -484,8 +484,8 @@ class RocmPlatform(Platform):
         return True
 
     @classmethod
-    def check_if_supports_dtype(cls, torch_dtype: torch.dtype):
-        if torch_dtype == torch.bfloat16:  # noqa: SIM102
+    def check_if_supports_dtype(cls, dtype: torch.dtype):
+        if dtype == torch.bfloat16:  # noqa: SIM102
             if not cls.has_device_capability(80):
                 capability = cls.get_device_capability()
                 gpu_name = cls.get_device_name()
diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py
index 4638e9fa30..5e109cccfe 100644
--- a/vllm/platforms/xpu.py
+++ b/vllm/platforms/xpu.py
@@ -236,8 +236,8 @@ class XPUPlatform(Platform):
         return torch.xpu.device_count()
 
     @classmethod
-    def check_if_supports_dtype(cls, torch_dtype: torch.dtype):
-        if torch_dtype == torch.bfloat16:  # noqa: SIM102
+    def check_if_supports_dtype(cls, dtype: torch.dtype):
+        if dtype == torch.bfloat16:  # noqa: SIM102
             device_name = cls.get_device_name().lower()
             # client gpu a770
             if device_name.count("a770") > 0:
diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py
index bb5d3a6880..ad0918a6ed 100644
--- a/vllm/utils/__init__.py
+++ b/vllm/utils/__init__.py
@@ -806,7 +806,7 @@ def create_kv_caches_with_random_flash(
 
     current_platform.seed_everything(seed)
 
-    torch_dtype = get_kv_cache_torch_dtype(cache_dtype, model_dtype)
+    dtype = get_kv_cache_torch_dtype(cache_dtype, model_dtype)
     generic_kv_cache_shape = (num_blocks, 2, block_size, num_heads, head_size)
     assert cache_layout in ("NHD", "HND")
     stride_order = (0, 1, 2, 3, 4) if cache_layout == "NHD" else (0, 1, 3, 2, 4)
@@ -819,7 +819,7 @@ def create_kv_caches_with_random_flash(
 
     for _ in range(num_layers):
         key_value_cache = torch.empty(
-            size=kv_cache_allocation_shape, dtype=torch_dtype, device=device
+            size=kv_cache_allocation_shape, dtype=dtype, device=device
         ).permute(*stride_order)
         if cache_dtype in ["auto", "half", "bfloat16", "float"]:
             key_value_cache.uniform_(-scale, scale)
@@ -851,14 +851,14 @@ def create_kv_caches_with_random(
 
     current_platform.seed_everything(seed)
 
-    torch_dtype = get_kv_cache_torch_dtype(cache_dtype, model_dtype)
+    dtype = get_kv_cache_torch_dtype(cache_dtype, model_dtype)
 
     scale = head_size**-0.5
-    x = 16 // torch.tensor([], dtype=torch_dtype).element_size()
+    x = 16 // torch.tensor([], dtype=dtype).element_size()
     key_cache_shape = (num_blocks, num_heads, head_size // x, block_size, x)
     key_caches: list[torch.Tensor] = []
     for _ in range(num_layers):
-        key_cache = torch.empty(size=key_cache_shape, dtype=torch_dtype, device=device)
+        key_cache = torch.empty(size=key_cache_shape, dtype=dtype, device=device)
         if cache_dtype in ["auto", "half", "bfloat16", "float"]:
             key_cache.uniform_(-scale, scale)
         elif cache_dtype == "fp8":
@@ -870,9 +870,7 @@ def create_kv_caches_with_random(
     value_cache_shape = (num_blocks, num_heads, head_size, block_size)
     value_caches: list[torch.Tensor] = []
     for _ in range(num_layers):
-        value_cache = torch.empty(
-            size=value_cache_shape, dtype=torch_dtype, device=device
-        )
+        value_cache = torch.empty(size=value_cache_shape, dtype=dtype, device=device)
         if cache_dtype in ["auto", "half", "bfloat16", "float"]:
             value_cache.uniform_(-scale, scale)
         elif cache_dtype == "fp8":