Compare commits
6 Commits: gpu-ids...dockerfile
| Author | SHA1 | Date |
|---|---|---|
| | 9e011d3954 | |
| | b24f0531e3 | |
| | f1fd89a9bf | |
| | 721dcb2ebc | |
| | 0204263598 | |
| | 4ac9c33f78 | |
@@ -171,6 +171,13 @@ if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA")
   list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}")
 endif()
 
+#
+# Set nvcc fatbin compression.
+#
+if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND VLLM_GPU_LANG STREQUAL "CUDA")
+  list(APPEND VLLM_GPU_FLAGS "-Xfatbin" "-compress-all" "-compress-mode=size")
+endif()
+
 
 #
 # Use FetchContent for C++ dependencies that are compiled as part of vLLM's build process.
@@ -393,7 +400,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require
   # CUDA 12.0 or later
   cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}")
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_ARCHS)
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND SCALED_MM_ARCHS)
     set(SRCS
       "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90.cu"
       "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu"
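Every remaining CMakeLists.txt hunk in this compare makes the same one-character fix: CMake's `VERSION_GREATER` is a strict comparison, so a toolkit at exactly the documented minimum (CUDA 12.0 here) failed the old gate even though the comment promises "CUDA 12.0 or later". A minimal sketch of the boundary case, using Python's `packaging.version` (an assumption, purely to mirror CMake's comparison semantics):

```python
# Strict ">" excludes the boundary version; ">=" includes it.
from packaging.version import Version

cuda = Version("12.0")
print(cuda > Version("12.0"))   # False: the old VERSION_GREATER gate skipped CUDA 12.0
print(cuda >= Version("12.0"))  # True:  the new VERSION_GREATER_EQUAL gate builds on it
```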
@@ -409,7 +416,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     list(APPEND SCALED_MM_3X_ARCHS "${SCALED_MM_ARCHS}")
     message(STATUS "Building scaled_mm_c3x_sm90 for archs: ${SCALED_MM_ARCHS}")
   else()
-    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_ARCHS)
+    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND SCALED_MM_ARCHS)
       message(STATUS "Not building scaled_mm_c3x_sm90 as CUDA Compiler version is "
                      "not >= 12.0, we recommend upgrading to CUDA 12.0 or "
                      "later if you intend on running FP8 quantized models on "
@@ -424,7 +431,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # The cutlass_scaled_mm kernels for Geforce Blackwell SM120 (c3x, i.e. CUTLASS 3.x) require
   # CUDA 12.8 or later
   cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0;12.0a" "${CUDA_ARCHS}")
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS)
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
     set(SRCS
       "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm120.cu"
       "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm120_fp8.cu"
@@ -438,7 +445,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     list(APPEND SCALED_MM_3X_ARCHS "${SCALED_MM_ARCHS}")
     message(STATUS "Building scaled_mm_c3x_sm120 for archs: ${SCALED_MM_ARCHS}")
   else()
-    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS)
+    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
       message(STATUS "Not building scaled_mm_c3x_sm120 as CUDA Compiler version is "
                      "not >= 12.8, we recommend upgrading to CUDA 12.8 or "
                      "later if you intend on running FP8 quantized models on "
@@ -453,7 +460,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # The cutlass_scaled_mm kernels for Blackwell SM100 (c3x, i.e. CUTLASS 3.x)
   # require CUDA 12.8 or later
   cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a" "${CUDA_ARCHS}")
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS)
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
     set(SRCS
       "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu"
       "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu"
@@ -468,7 +475,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     list(APPEND SCALED_MM_3X_ARCHS "${SCALED_MM_ARCHS}")
     message(STATUS "Building scaled_mm_c3x_sm100 for archs: ${SCALED_MM_ARCHS}")
   else()
-    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS)
+    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
       message(STATUS "Not building scaled_mm_c3x_sm100 as CUDA Compiler version is "
                      "not >= 12.8, we recommend upgrading to CUDA 12.8 or "
                      "later if you intend on running FP8 quantized models on "
@@ -511,7 +518,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # The 2:4 sparse kernels cutlass_scaled_sparse_mm and cutlass_compressor
   # require CUDA 12.2 or later (and only work on Hopper).
   cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}")
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_ARCHS)
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.2 AND SCALED_MM_ARCHS)
     set(SRCS "csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu")
     set_gencode_flags_for_srcs(
       SRCS "${SRCS}"
@@ -520,7 +527,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     list(APPEND VLLM_GPU_FLAGS "-DENABLE_SPARSE_SCALED_MM_C3X=1")
     message(STATUS "Building sparse_scaled_mm_c3x for archs: ${SCALED_MM_ARCHS}")
   else()
-    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_ARCHS)
+    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.2 AND SCALED_MM_ARCHS)
       message(STATUS "Not building sparse_scaled_mm_c3x kernels as CUDA Compiler version is "
                      "not >= 12.2, we recommend upgrading to CUDA 12.2 or later "
                      "if you intend on running FP8 sparse quantized models on Hopper.")
@@ -532,7 +539,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 
   # FP4 Archs and flags
   cuda_archs_loose_intersection(FP4_ARCHS "10.0a" "${CUDA_ARCHS}")
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND FP4_ARCHS)
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
     set(SRCS
       "csrc/quantization/fp4/nvfp4_quant_kernels.cu"
       "csrc/quantization/fp4/nvfp4_experts_quant.cu"
@@ -553,7 +560,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 
   # CUTLASS MLA Archs and flags
   cuda_archs_loose_intersection(MLA_ARCHS "10.0a" "${CUDA_ARCHS}")
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND MLA_ARCHS)
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND MLA_ARCHS)
     set(SRCS
       "csrc/attention/mla/cutlass_mla_kernels.cu")
     set_gencode_flags_for_srcs(
@@ -642,7 +649,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # The machete kernels only work on hopper and require CUDA 12.0 or later.
   # Only build Machete kernels if we are building for something compatible with sm90a
   cuda_archs_loose_intersection(MACHETE_ARCHS "9.0a" "${CUDA_ARCHS}")
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND MACHETE_ARCHS)
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND MACHETE_ARCHS)
     #
     # For the Machete kernels we automatically generate sources for various
     # preselected input type pairs and schedules.
@@ -694,7 +701,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 
     message(STATUS "Building Machete kernels for archs: ${MACHETE_ARCHS}")
   else()
-    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0
+    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0
         AND MACHETE_ARCHS)
       message(STATUS "Not building Machete kernels as CUDA Compiler version is "
                      "not >= 12.0, we recommend upgrading to CUDA 12.0 or "
@@ -19,20 +19,7 @@ llm = LLM(model="ibm-granite/granite-3.1-8b-instruct",
 To ensure that vLLM initializes CUDA correctly, you should avoid calling related functions (e.g. [torch.cuda.set_device][])
 before initializing vLLM. Otherwise, you may run into an error like `RuntimeError: Cannot re-initialize CUDA in forked subprocess`.
 
-To control which devices are used, you can either set the `CUDA_VISIBLE_DEVICES`
-environment variable, pass the `gpu_ids` parameter to the [LLM] constructor,
-or use the `--gpu-ids` option with `vllm serve`.
-
-```python
-from vllm import LLM
-
-# Use GPUs 0 and 2 for execution without setting CUDA_VISIBLE_DEVICES env var
-llm = LLM(
-    model="your-model",
-    gpu_ids=[0, 2],
-)
-```
+To control which devices are used, please instead set the `CUDA_VISIBLE_DEVICES` environment variable.
 
 !!! note
     With tensor parallelism enabled, each process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism).
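With `gpu_ids` and `--gpu-ids` gone on this branch, the environment variable is the remaining way to pin devices. A minimal sketch of the equivalent of the removed example, assuming a CUDA host (the model name is reused from the hunk header above):

```python
import os

# Must be set before vLLM (or anything else) initializes CUDA in this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "0,2"  # expose only GPUs 0 and 2

from vllm import LLM

llm = LLM(model="ibm-granite/granite-3.1-8b-instruct")
```

The same applies to the server: `CUDA_VISIBLE_DEVICES=0,2 vllm serve <model>`.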
@@ -103,25 +103,6 @@ def write_keyfile(keyfile_path: str):
         f.write(encryption_params.key)
 
 
-@pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed")
-def test_can_deserialize_s3(vllm_runner):
-    model_ref = "EleutherAI/pythia-1.4b"
-    tensorized_path = f"s3://tensorized/{model_ref}/fp16/model.tensors"
-
-    with vllm_runner(model_ref,
-                     load_format="tensorizer",
-                     model_loader_extra_config=TensorizerConfig(
-                         tensorizer_uri=tensorized_path,
-                         num_readers=1,
-                         s3_endpoint="object.ord1.coreweave.com",
-                     )) as loaded_hf_model:
-        deserialized_outputs = loaded_hf_model.generate(
-            prompts, sampling_params)  # noqa: E501
-
-        assert deserialized_outputs
-
-
 @pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed")
 def test_deserialized_encrypted_vllm_model_has_same_outputs(
         model_ref, vllm_runner, tmp_path, model_path):
@@ -1115,12 +1115,6 @@ def prepare_communication_buffer_for_model(model: torch.nn.Module):
     MoE all2all (DeepEP) usually allocate the communication buffer
     based on the model shape for optimal performance.
     """
-    orig = torch.cuda.current_device()
-    for d in range(8):
-        torch.cuda.set_device(d)
-        torch.zeros(1, device=f'cuda:{d}')
-    torch.cuda.set_device(orig)
-    print("pre-warmed all GPUs")
     if _TP is not None:
         _TP.prepare_communication_buffer_for_model(model)
     if _PP is not None:
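The deleted pre-warm loop hard-codes `range(8)`, so `torch.cuda.set_device(d)` raises on hosts with fewer than eight visible GPUs, and the stray `print` bypasses vLLM's logging; dropping it is the right call. For reference, a hypothetical device-count-aware variant (not part of this branch) would look like:

```python
import torch

def prewarm_all_gpus() -> None:
    """Force lazy CUDA context creation on every visible device."""
    if not torch.cuda.is_available():
        return
    orig = torch.cuda.current_device()
    for d in range(torch.cuda.device_count()):
        torch.cuda.set_device(d)
        torch.zeros(1, device=f"cuda:{d}")  # first allocation initializes the context
    torch.cuda.set_device(orig)
```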
@@ -1003,41 +1003,27 @@ class EngineArgs:
             override_attention_dtype=self.override_attention_dtype,
         )
 
-    def valid_tensorizer_config_provided(self) -> bool:
-        """
-        Checks if a parseable TensorizerConfig was passed to
-        self.model_loader_extra_config. It first checks if the config passed
-        is a dict or a TensorizerConfig object directly, and if the latter is
-        true (by checking that the object has TensorizerConfig's
-        .to_serializable() method), converts it in to a serializable dict
-        format
-        """
-        if self.model_loader_extra_config:
-            if hasattr(self.model_loader_extra_config, "to_serializable"):
-                self.model_loader_extra_config = (
-                    self.model_loader_extra_config.to_serializable())
-            for allowed_to_pass in ["tensorizer_uri", "tensorizer_dir"]:
-                try:
-                    self.model_loader_extra_config[allowed_to_pass]
-                    return False
-                except KeyError:
-                    pass
-        return True
+    def validate_tensorizer_args(self):
+        from vllm.model_executor.model_loader.tensorizer import (
+            TensorizerConfig)
+        for key in self.model_loader_extra_config:
+            if key in TensorizerConfig._fields:
+                self.model_loader_extra_config["tensorizer_config"][
+                    key] = self.model_loader_extra_config[key]
 
     def create_load_config(self) -> LoadConfig:
 
         if self.quantization == "bitsandbytes":
             self.load_format = "bitsandbytes"
 
-        if (self.load_format == "tensorizer"
-                and self.valid_tensorizer_config_provided()):
-            logger.info("Inferring Tensorizer args from %s", self.model)
-            self.model_loader_extra_config = {"tensorizer_dir": self.model}
-        else:
-            logger.info(
-                "Using Tensorizer args from --model-loader-extra-config. "
-                "Note that you can now simply pass the S3 directory in the "
-                "model tag instead of providing the JSON string.")
+        if self.load_format == "tensorizer":
+            if hasattr(self.model_loader_extra_config, "to_serializable"):
+                self.model_loader_extra_config = (
+                    self.model_loader_extra_config.to_serializable())
+            self.model_loader_extra_config["tensorizer_config"] = {}
+            self.model_loader_extra_config["tensorizer_config"][
+                "tensorizer_dir"] = self.model
+            self.validate_tensorizer_args()
 
         return LoadConfig(
             load_format=self.load_format,
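A sketch of the reshaping `create_load_config()` now performs for `--load-format tensorizer`: flat user-supplied Tensorizer fields get nested under a `"tensorizer_config"` key, with `tensorizer_dir` inferred from the model tag. The field names below are illustrative stand-ins for `TensorizerConfig._fields`, and the model tag is borrowed from the test hunk earlier in this compare:

```python
model = "s3://tensorized/EleutherAI/pythia-1.4b/fp16"
extra_config = {"num_readers": 1}  # a flat field as a user would pass it

extra_config["tensorizer_config"] = {"tensorizer_dir": model}
# validate_tensorizer_args() then copies recognized flat fields into the nest:
for key in list(extra_config):
    if key in ("tensorizer_uri", "tensorizer_dir", "num_readers"):
        extra_config["tensorizer_config"][key] = extra_config[key]

print(extra_config["tensorizer_config"])
# {'tensorizer_dir': 's3://tensorized/EleutherAI/pythia-1.4b/fp16', 'num_readers': 1}
```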
@@ -38,9 +38,6 @@ class ServeSubcommand(CLISubcommand):
 
     @staticmethod
     def cmd(args: argparse.Namespace) -> None:
-        # Allow overriding visible GPUs via --gpu-ids (comma-separated or single int)
-        if hasattr(args, 'gpu_ids') and args.gpu_ids is not None:
-            os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_ids
         # If model is specified in CLI (as positional arg), it takes precedence
         if hasattr(args, 'model_tag') and args.model_tag is not None:
             args.model = args.model_tag
@@ -101,13 +98,8 @@ class ServeSubcommand(CLISubcommand):
             help="Read CLI options from a config file. "
                  "Must be a YAML with the following options: "
                  "https://docs.vllm.ai/en/latest/configuration/serve_args.html")
 
         serve_parser = make_arg_parser(serve_parser)
-        serve_parser.add_argument(
-            "--gpu-ids",
-            type=str,
-            default=None,
-            help="Comma-separated GPU IDs or a single GPU ID to use for vLLM serve. "
-                 "Overrides CUDA_VISIBLE_DEVICES.")
         show_filtered_argument_or_group_from_help(serve_parser, ["serve"])
         serve_parser.epilog = VLLM_SUBCMD_PARSER_EPILOG
         return serve_parser
@@ -9,7 +9,6 @@ from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Optional, Union,
                     cast, overload)
 
 import cloudpickle
-import os
 import torch.nn as nn
 from pydantic import ValidationError
 from tqdm.auto import tqdm
@@ -76,9 +75,6 @@ class LLM:
         skip_tokenizer_init: If true, skip initialization of tokenizer and
             detokenizer. Expect valid prompt_token_ids and None for prompt
             from the input.
-        gpu_ids: A list of GPU device IDs or a comma-separated string to use
-            for vLLM execution. Overrides the CUDA_VISIBLE_DEVICES environment
-            variable.
         trust_remote_code: Trust remote code (e.g., from HuggingFace) when
             downloading the model and tokenizer.
         allowed_local_media_path: Allowing API requests to read local images
@@ -174,7 +170,6 @@ class LLM:
         tokenizer: Optional[str] = None,
         tokenizer_mode: TokenizerMode = "auto",
         skip_tokenizer_init: bool = False,
-        gpu_ids: Optional[Union[Sequence[int], str]] = None,
         trust_remote_code: bool = False,
         allowed_local_media_path: str = "",
         tensor_parallel_size: int = 1,
@@ -203,13 +198,6 @@ class LLM:
         if "disable_log_stats" not in kwargs:
             kwargs["disable_log_stats"] = True
 
-        # Allow specifying GPU device IDs without using CUDA_VISIBLE_DEVICES env var
-        if gpu_ids is not None:
-            # gpu_ids can be a sequence of ints or a string
-            os.environ["CUDA_VISIBLE_DEVICES"] = (
-                ",".join(map(str, gpu_ids)) if isinstance(gpu_ids, (list, tuple))
-                else str(gpu_ids)
-            )
         if "worker_cls" in kwargs:
             worker_cls = kwargs["worker_cls"]
             # if the worker_cls is not qualified string name,
@@ -223,9 +223,11 @@ class TensorizerConfig(MutableMapping):
             and re.search(r'%0\dd', self.tensorizer_uri) is not None
 
         if self.tensorizer_dir and self.tensorizer_uri:
-            raise ValueError(
-                "Either tensorizer_dir or tensorizer_uri must be provided, "
-                "not both.")
+            logger.warning_once(
+                "Provided both tensorizer_dir and tensorizer_uri. "
+                "Inferring tensorizer_dir from tensorizer_uri as the "
+                "latter takes precedence.")
+            self.tensorizer_dir = os.path.dirname(self.tensorizer_uri)
         if self.tensorizer_dir and self.lora_dir:
             raise ValueError(
                 "Only one of tensorizer_dir or lora_dir may be specified. "
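The new branch downgrades the hard error to a one-time warning and infers the directory from the URI. `os.path.dirname` happens to work on URI-like strings because it only splits on `/`; using the S3 path from the test hunk above:

```python
import os

uri = "s3://tensorized/EleutherAI/pythia-1.4b/fp16/model.tensors"
print(os.path.dirname(uri))  # s3://tensorized/EleutherAI/pythia-1.4b/fp16
```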
@@ -43,7 +43,7 @@ class TensorizerLoader(BaseModelLoader):
         else:
             validate_config(load_config.model_loader_extra_config)
             self.tensorizer_config = TensorizerConfig(
-                **load_config.model_loader_extra_config)
+                **load_config.model_loader_extra_config["tensorizer_config"])
 
     def _verify_config(self, model_config: ModelConfig,
                        parallel_config: ParallelConfig):