Compare commits
5 Commits
copilot/di
...
copilot/fi
| Author | SHA1 | Date | |
|---|---|---|---|
| 0b85cc9fd4 | |||
| 1c3d99d6a3 | |||
| 1eec2bf88b | |||
| 55812718ab | |||
| 79dff4ac72 |
1272
vllm/envs.py
1272
vllm/envs.py
File diff suppressed because it is too large
Load Diff
296
vllm/envs/__init__.py
Normal file
296
vllm/envs/__init__.py
Normal file
@ -0,0 +1,296 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
"""
|
||||
Environment variable management for vLLM.
|
||||
|
||||
This module provides access to all vLLM environment variables with lazy evaluation
|
||||
and type conversion. Environment variables are looked up from the actual OS
|
||||
environment using os.getenv() with appropriate type conversion and default values.
|
||||
|
||||
The module maintains backwards compatibility with the original vllm.envs interface
|
||||
while providing a cleaner separation between variable definitions and environment
|
||||
lookups.
|
||||
|
||||
Usage:
|
||||
import vllm.envs as envs
|
||||
|
||||
# Access environment variables
|
||||
device = envs.VLLM_TARGET_DEVICE # Returns string value
|
||||
port = envs.VLLM_PORT # Returns int or None
|
||||
|
||||
# Check if variable is explicitly set
|
||||
if envs.is_set("VLLM_USE_V1"):
|
||||
print("V1 is explicitly configured")
|
||||
|
||||
# Get all available variables
|
||||
all_vars = dir(envs)
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
import os
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, get_type_hints, Type, Union, get_origin, get_args, Optional
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from . import _variables
|
||||
from ._variables import __defaults as _env_defaults
|
||||
|
||||
if TYPE_CHECKING:
|
||||
# This way IDEs & type checkers get the declarations directly
|
||||
from ._variables import *
|
||||
|
||||
|
||||
def get_default_cache_root() -> str:
    """Return the base directory under which per-user cache data lives.

    ``XDG_CACHE_HOME`` is used when it is set to a non-empty value;
    otherwise the result is ``~/.cache`` with ``~`` expanded.

    Returns:
        The cache root as a string. The path is neither created nor resolved.
    """
    # The XDG Base Directory spec treats an empty variable like an unset one.
    # ``os.getenv(name, default)`` would return "" in that case, and callers
    # that join onto it would end up with a path relative to the CWD.
    return os.environ.get("XDG_CACHE_HOME") or os.path.join(
        os.path.expanduser("~"), ".cache")
|
||||
|
||||
|
||||
def get_default_config_root() -> str:
    """Return the base directory under which per-user config data lives.

    ``XDG_CONFIG_HOME`` is used when it is set to a non-empty value;
    otherwise the result is ``~/.config`` with ``~`` expanded.

    Returns:
        The config root as a string. The path is neither created nor
        resolved.
    """
    # The XDG Base Directory spec treats an empty variable like an unset one.
    # ``os.getenv(name, default)`` would return "" in that case, and callers
    # that join onto it would end up with a path relative to the CWD.
    return os.environ.get("XDG_CONFIG_HOME") or os.path.join(
        os.path.expanduser("~"), ".config")
|
||||
|
||||
|
||||
def _unwrap_optional(type_: Type) -> Type:
    """Strip the ``None`` member from an ``Optional[T]`` annotation.

    Annotations whose origin is not ``typing.Union`` are returned unchanged.

    Raises:
        ValueError: if ``type_`` is a Union that is not a two-member union
            containing ``None``.
    """
    if get_origin(type_) is Union:
        members = get_args(type_)
        none_type = type(None)
        if len(members) == 2 and none_type in members:
            first, second = members
            return second if first is none_type else first
        raise ValueError("Unions not currently supported")
    return type_
|
||||
|
||||
|
||||
def _get_vllm_port() -> Optional[int]:
    """Read ``VLLM_PORT`` from the environment as an integer.

    Returns:
        The port number, or ``None`` when ``VLLM_PORT`` is not set.

    Raises:
        ValueError: if the value is not an integer. A value that parses as a
            URI (``urlparse`` finds a scheme, e.g. ``tcp://10.0.0.1:8000``)
            gets a dedicated message pointing at the Kubernetes
            service-discovery warning in the docs.
    """
    # Single lookup: no gap between the presence check and the read.
    raw = os.environ.get("VLLM_PORT")
    if raw is None:
        return None

    try:
        return int(raw)
    except ValueError as err:
        if urlparse(raw).scheme:
            # ``from None``: the int() traceback adds nothing to this hint.
            raise ValueError(
                f"VLLM_PORT '{raw}' appears to be a URI. "
                "This may be caused by a Kubernetes service discovery issue, "
                "check the warning in: https://docs.vllm.ai/en/stable/serving/env_vars.html"
            ) from None
        raise ValueError(
            f"VLLM_PORT '{raw}' must be a valid integer") from err
|
||||
|
||||
|
||||
def _parse_list_value(value: str) -> list[str]:
    """Split a comma-separated string into its trimmed, non-blank items."""
    if not value:
        return []
    trimmed = map(str.strip, value.split(','))
    return [entry for entry in trimmed if entry]
|
||||
|
||||
|
||||
# Resolved type annotations of the names declared in ``_variables``, keyed by
# name. ``__getattr__`` looks a variable up here to decide how the raw string
# from the environment is converted.
_type_hints = get_type_hints(_variables)
|
||||
|
||||
|
||||
# Canonical casing applied to specific string variables when their value is
# read from the environment (declared defaults are returned untouched).
_STR_NORMALIZERS = {
    "VLLM_TARGET_DEVICE": str.lower,
    "VLLM_LOGGING_LEVEL": str.upper,
    "VLLM_ROCM_QUICK_REDUCE_QUANTIZATION": str.upper,
    "VLLM_MOE_ROUTING_SIMULATION_STRATEGY": str.lower,
}


def _resolved_dir(name: str, fallback: str) -> Path:
    """Read directory variable ``name`` (or ``fallback``), expand ``~``, resolve."""
    return Path(os.path.expanduser(os.getenv(name, fallback))).resolve()


def _convert_env_value(name: str, raw: str, var_type):
    """Convert ``raw``, the environment string of ``name``, to ``var_type``."""
    if var_type is str:
        normalize = _STR_NORMALIZERS.get(name)
        return raw if normalize is None else normalize(raw)

    if var_type is Path:
        return Path(os.path.abspath(os.path.expanduser(raw)))

    if var_type is bool:
        return raw.lower() in ("1", "true")

    if var_type in (int, float):
        try:
            return var_type(raw)
        except ValueError as err:
            # Name the offending variable; the bare int()/float() message
            # only shows the literal.
            raise ValueError(
                f"Environment variable {name}={raw!r} is not a valid "
                f"{var_type.__name__}") from err

    if get_origin(var_type) is list:
        return _parse_list_value(raw)

    raise ValueError(
        f"Unsupported type {var_type} for environment variable {name}")


def __getattr__(name: str):
    """Resolve ``<this module>.<name>`` lazily from the process environment.

    The environment is consulted on every access, so later changes to
    ``os.environ`` are visible.

    Lookup order:
      1. ``name`` must be declared in ``_variables``.
      2. Variables with bespoke logic (port validation, XDG-relative
         directories, values derived from other variables) are handled
         individually.
      3. Otherwise an unset variable yields its declared default and a set
         variable is converted according to its type annotation.

    Raises:
        AttributeError: if ``name`` is not a declared variable.
        ValueError: if the value cannot be converted to the declared type, or
            the declared type is not supported.
    """
    if name not in _env_defaults:
        # Quoted like Python's own message and like ``is_set``.
        raise AttributeError(
            f"module {__name__!r} has no attribute {name!r}")

    # --- variables with bespoke lookup logic -------------------------------
    if name == "VLLM_PORT":
        return _get_vllm_port()

    # Directories whose fallback hangs off the XDG config/cache root.
    if name == "VLLM_CONFIG_ROOT":
        return _resolved_dir(
            name, os.path.join(get_default_config_root(), "vllm"))
    if name == "VLLM_CACHE_ROOT":
        return _resolved_dir(
            name, os.path.join(get_default_cache_root(), "vllm"))
    if name == "VLLM_ASSETS_CACHE":
        return _resolved_dir(
            name, os.path.join(get_default_cache_root(), "vllm", "assets"))
    if name == "VLLM_XLA_CACHE_PATH":
        return _resolved_dir(
            name, os.path.join(get_default_cache_root(), "vllm", "xla_cache"))

    if name == "VLLM_RPC_BASE_PATH":
        # Unlike the directories above, no ``~`` expansion is applied here.
        return Path(os.getenv(name, tempfile.gettempdir())).resolve()

    if name == "VLLM_USE_PRECOMPILED":
        # A non-empty wheel location also counts as opting in.
        return (os.environ.get(name, "").strip().lower() in ("1", "true")
                or bool(os.environ.get("VLLM_PRECOMPILED_WHEEL_LOCATION")))

    if name == "VLLM_DO_NOT_TRACK":
        # DO_NOT_TRACK is consulted when VLLM_DO_NOT_TRACK is unset or
        # empty; only the exact value "1" enables it.
        return (os.environ.get(name) or os.environ.get("DO_NOT_TRACK")
                or "0") == "1"

    if name == "VLLM_DP_RANK_LOCAL":
        return int(os.getenv(name, os.getenv("VLLM_DP_RANK", "0")))

    if name == "VLLM_TPU_USING_PATHWAYS":
        # Derived from JAX_PLATFORMS; the variable itself is never read.
        return "proxy" in os.getenv("JAX_PLATFORMS", "").lower()

    if name == "VLLM_TORCH_PROFILER_DIR":
        raw = os.getenv(name)
        if raw is None:
            return None
        return Path(os.path.abspath(os.path.expanduser(raw)))

    # --- generic path: declared default or typed conversion ----------------
    raw = os.getenv(name)
    if raw is None:
        default = _env_defaults[name]
        # List defaults are module-level objects; return a copy so a caller
        # mutating the result cannot change what later lookups see.
        return list(default) if isinstance(default, list) else default

    return _convert_env_value(name, raw, _unwrap_optional(_type_hints[name]))
|
||||
|
||||
|
||||
def __dir__():
    """List the names of all declared environment variables."""
    return [*_env_defaults]
|
||||
|
||||
|
||||
def is_set(name: str) -> bool:
    """Report whether ``name`` is present in the process environment.

    Raises:
        AttributeError: if ``name`` is not a declared environment variable.
    """
    if name in _env_defaults:
        return name in os.environ
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
|
||||
|
||||
|
||||
def set_vllm_use_v1(use_v1: bool) -> None:
    """Write ``VLLM_USE_V1`` into ``os.environ`` as ``"1"`` or ``"0"``.

    Raises:
        ValueError: if ``VLLM_USE_V1`` is already present in the environment.
    """
    if not is_set("VLLM_USE_V1"):
        flag = "1" if use_v1 else "0"
        os.environ["VLLM_USE_V1"] = flag
        return

    raise ValueError(
        "Should not call set_vllm_use_v1() if VLLM_USE_V1 is set "
        "explicitly by the user. Please raise this as a Github "
        "Issue and explicitly set VLLM_USE_V1=0 or 1.")
|
||||
|
||||
|
||||
def compute_hash() -> str:
    """
    Compute hash of environment variables that affect computation graph.

    WARNING: Whenever a new key is added to the environment variables,
    ensure that it is included in the factors list if it affects the
    computation graph. For example, different values of
    VLLM_PP_LAYER_PARTITION will generate different computation graphs,
    so it is included in the factors list. The env vars that affect
    the choice of different kernels or attention backends should also
    be included in the factors list.

    Returns:
        Hex MD5 digest of ``str()`` of the list of current values of the
        variables below, in the order listed.
    """
    # The values of these envs may affect the computation graph.
    environment_variables_to_hash = [
        "VLLM_PP_LAYER_PARTITION",
        "VLLM_MLA_DISABLE",
        "VLLM_USE_TRITON_FLASH_ATTN",
        "VLLM_USE_TRITON_AWQ",
        "VLLM_DP_RANK",
        "VLLM_DP_SIZE",
        "VLLM_USE_STANDALONE_COMPILE",
        "VLLM_FUSED_MOE_CHUNK_SIZE",
        "VLLM_FLASHINFER_MOE_BACKEND",
        "VLLM_V1_USE_PREFILL_DECODE_ATTENTION",
        "VLLM_USE_AITER_UNIFIED_ATTENTION",
        "VLLM_ATTENTION_BACKEND",
        "VLLM_USE_FLASHINFER_SAMPLER",
        "VLLM_DISABLED_KERNELS",
        "VLLM_USE_DEEP_GEMM",
        "VLLM_USE_TRTLLM_FP4_GEMM",
        "VLLM_USE_FUSED_MOE_GROUPED_TOPK",
        "VLLM_USE_FLASHINFER_MOE_FP8",
        "VLLM_USE_FLASHINFER_MOE_FP4",
        "VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8",
        "VLLM_USE_FLASHINFER_MOE_MXFP4_BF16",
        "VLLM_USE_CUDNN_PREFILL",
        "VLLM_USE_TRTLLM_ATTENTION",
        "VLLM_ROCM_USE_AITER",
        "VLLM_ROCM_USE_AITER_PAGED_ATTN",
        "VLLM_ROCM_USE_AITER_LINEAR",
        "VLLM_ROCM_USE_AITER_MOE",
        "VLLM_ROCM_USE_AITER_RMSNORM",
        "VLLM_ROCM_USE_AITER_MLA",
        "VLLM_ROCM_USE_AITER_MHA",
        "VLLM_ROCM_USE_SKINNY_GEMM",
        "VLLM_ROCM_FP8_PADDING",
        "VLLM_ROCM_MOE_PADDING",
        "VLLM_ROCM_CUSTOM_PAGED_ATTN",
        "VLLM_ROCM_QUICK_REDUCE_QUANTIZATION",
        "VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16",
        "VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB",
    ]

    factors = []
    for key in environment_variables_to_hash:
        # if this goes out of sync with _env_defaults,
        # it's not a user error, it's a bug
        assert key in _env_defaults, \
            f"Please update environment_variables_to_hash in envs/__init__.py. Missing: {key}"
        # Resolve through this module's own lazy lookup. For a dotted
        # ``__name__``, ``__import__(__name__)`` returns the *top-level*
        # package, not this module, so ``getattr`` on its result would look
        # the variable up in the wrong place.
        factors.append(__getattr__(key))

    return hashlib.md5(str(factors).encode(),
                       usedforsecurity=False).hexdigest()
|
||||
341
vllm/envs/_variables.py
Normal file
341
vllm/envs/_variables.py
Normal file
@ -0,0 +1,341 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
"""
|
||||
Environment variable definitions with type annotations and default values.
|
||||
|
||||
This module defines all environment variables used by vLLM with their expected
|
||||
data types and default values. The supported data types are:
|
||||
- str: String values
|
||||
- int: Integer values
|
||||
- float: Floating point values
|
||||
- bool: Boolean values (typically parsed from "0"/"1" or "true"/"false")
|
||||
- Optional[T]: Optional values that can be None
|
||||
- list[str]: Lists of strings (typically comma-separated)
|
||||
|
||||
Each variable is defined with its type annotation and default value.
|
||||
The actual environment variable lookup and conversion is handled by the
|
||||
parent module (__init__.py).
|
||||
"""
|
||||
|
||||
import os
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
|
||||
# Environment variable definitions with type annotations and defaults
|
||||
# These match the TYPE_CHECKING section from the original envs.py
|
||||
|
||||
# Installation Time Environment Variables
# A VLLM_TARGET_DEVICE value read from the environment is lower-cased by the
# parent module.
VLLM_TARGET_DEVICE: str = "cuda"
MAX_JOBS: Optional[str] = None
NVCC_THREADS: Optional[str] = None
# The parent module also reports VLLM_USE_PRECOMPILED as true whenever
# VLLM_PRECOMPILED_WHEEL_LOCATION is set to a non-empty value.
VLLM_USE_PRECOMPILED: bool = False
VLLM_DOCKER_BUILD_CONTEXT: bool = False
VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL: bool = False
CMAKE_BUILD_TYPE: Optional[str] = None
VERBOSE: bool = False

# Configuration and cache paths
# The parent module computes both roots at lookup time: the variable itself
# if set, otherwise "vllm" under XDG_CONFIG_HOME / XDG_CACHE_HOME (or under
# ~/.config / ~/.cache). The literals below only declare the name, its type
# and the home-directory form of that fallback.
VLLM_CONFIG_ROOT: Path = Path.home() / ".config" / "vllm"
"""Root directory for vLLM configuration files.

Note that this not only affects how vllm finds its configuration files
during runtime, but also affects how vllm installs its configuration
files during **installation**.
"""

VLLM_CACHE_ROOT: Path = Path.home() / ".cache" / "vllm"
"""Root directory for vLLM cache files.

Defaults to `~/.cache/vllm` unless `XDG_CACHE_HOME` is set.
"""
|
||||
|
||||
# Runtime Environment Variables
VLLM_HOST_IP: str = ""
# Validated by dedicated logic in the parent module: must be an integer, and
# URI-looking values get a specific error message.
VLLM_PORT: Optional[int] = None
VLLM_RPC_BASE_PATH: Path = Path(tempfile.gettempdir())
"""Base path for RPC temporary files."""
VLLM_USE_MODELSCOPE: bool = False
VLLM_RINGBUFFER_WARNING_INTERVAL: int = 60
CUDA_HOME: Optional[str] = None
VLLM_NCCL_SO_PATH: Optional[Path] = None
"""Path to the NCCL shared object library.

When `VLLM_NCCL_SO_PATH` is not set, vllm will try to find the nccl
library file in the locations specified by `LD_LIBRARY_PATH`.
"""

# Declared as a plain string, not a Path: it names a set of search locations
# rather than one filesystem path. With a Path annotation the parent module
# would push the whole value through expanduser/abspath and return a Path
# object.
LD_LIBRARY_PATH: Optional[str] = None
"""Search locations for dynamic library loading.

When `VLLM_NCCL_SO_PATH` is not set, vllm will try to find the nccl
library file in the locations specified by `LD_LIBRARY_PATH`.
"""
|
||||
|
||||
# Attention and kernel settings
# Every name in this group except VLLM_FLASH_ATTN_VERSION is one of the
# factors hashed by compute_hash() in the parent module.
VLLM_USE_TRITON_FLASH_ATTN: bool = True
VLLM_V1_USE_PREFILL_DECODE_ATTENTION: bool = False
VLLM_USE_AITER_UNIFIED_ATTENTION: bool = False
VLLM_FLASH_ATTN_VERSION: Optional[int] = None
VLLM_ATTENTION_BACKEND: Optional[str] = None
VLLM_USE_FLASHINFER_SAMPLER: Optional[bool] = None

# Testing and debugging
VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE: bool = True
# Hashed by compute_hash().
VLLM_USE_STANDALONE_COMPILE: bool = True

# Distributed computing
LOCAL_RANK: int = 0
CUDA_VISIBLE_DEVICES: Optional[str] = None
VLLM_ENGINE_ITERATION_TIMEOUT_S: int = 60

# API and security
VLLM_API_KEY: Optional[str] = None
VLLM_DEBUG_LOG_API_SERVER_RESPONSE: bool = False

# S3 configuration
S3_ACCESS_KEY_ID: Optional[str] = None
S3_SECRET_ACCESS_KEY: Optional[str] = None
S3_ENDPOINT_URL: Optional[str] = None

# Usage statistics
VLLM_USAGE_STATS_SERVER: str = "https://stats.vllm.ai"
VLLM_NO_USAGE_STATS: bool = False
# The parent module falls back to DO_NOT_TRACK when VLLM_DO_NOT_TRACK is
# unset or empty; only the exact value "1" turns this on.
VLLM_DO_NOT_TRACK: bool = False
VLLM_USAGE_SOURCE: str = "production"

# Logging configuration
# Declared as int (not bool), so the value is parsed with int().
VLLM_CONFIGURE_LOGGING: int = 1
VLLM_LOGGING_CONFIG_PATH: Optional[Path] = None
"""Path to custom logging configuration file."""
# A value read from the environment is upper-cased by the parent module.
VLLM_LOGGING_LEVEL: str = "INFO"
VLLM_LOGGING_PREFIX: str = ""
VLLM_LOGITS_PROCESSOR_THREADS: Optional[int] = None
VLLM_LOG_STATS_INTERVAL: float = 10.0
VLLM_TRACE_FUNCTION: int = 0
|
||||
|
||||
# Pipeline and partitioning
# Hashed by compute_hash() in the parent module.
VLLM_PP_LAYER_PARTITION: Optional[str] = None

# CPU backend settings
VLLM_CPU_KVCACHE_SPACE: Optional[int] = None
"""(CPU backend only) KV cache space size in MB."""

VLLM_CPU_OMP_THREADS_BIND: str = "auto"
"""(CPU backend only) CPU core ids bound by OpenMP threads.

Examples: "0-31", "0,1,2", "0-31,33". CPU cores of different ranks are separated by '|'.
"""

VLLM_CPU_NUM_OF_RESERVED_CPU: Optional[int] = None
"""(CPU backend only) CPU cores not used by OMP threads.

Those CPU cores will not be used by OMP threads of a rank.
"""
VLLM_CPU_MOE_PREPACK: bool = True
VLLM_CPU_SGL_KERNEL: bool = False

# XLA settings
# Resolved at lookup time by the parent module: the variable itself if set,
# otherwise "vllm/xla_cache" under the XDG cache root.
VLLM_XLA_CACHE_PATH: Path = Path.home() / ".cache" / "vllm" / "xla_cache"
"""Path to the XLA persistent cache directory.

Only used for XLA devices such as TPUs.
"""
VLLM_XLA_CHECK_RECOMPILATION: bool = False
VLLM_XLA_USE_SPMD: bool = False

# MoE (Mixture of Experts) settings
# VLLM_FUSED_MOE_CHUNK_SIZE is hashed by compute_hash().
VLLM_FUSED_MOE_CHUNK_SIZE: int = 32768
VLLM_ENABLE_FUSED_MOE_ACTIVATION_CHUNKING: bool = True

# Ray distributed computing
VLLM_USE_RAY_SPMD_WORKER: bool = False
VLLM_USE_RAY_COMPILED_DAG: bool = False
VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE: str = "auto"
VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM: bool = False
VLLM_USE_RAY_WRAPPED_PP_COMM: bool = True
VLLM_WORKER_MULTIPROC_METHOD: str = "fork"

# Multimodal settings
# Resolved at lookup time by the parent module: the variable itself if set,
# otherwise "vllm/assets" under the XDG cache root.
VLLM_ASSETS_CACHE: Path = Path.home() / ".cache" / "vllm" / "assets"
"""Path to the cache for storing downloaded assets."""
VLLM_IMAGE_FETCH_TIMEOUT: int = 5
VLLM_VIDEO_FETCH_TIMEOUT: int = 30
VLLM_AUDIO_FETCH_TIMEOUT: int = 10
VLLM_MEDIA_LOADING_THREAD_COUNT: int = 8
VLLM_MAX_AUDIO_CLIP_FILESIZE_MB: int = 25
VLLM_VIDEO_LOADER_BACKEND: str = "opencv"
VLLM_MM_INPUT_CACHE_GIB: int = 4
|
||||
|
||||
# Engine and model settings
VLLM_KEEP_ALIVE_ON_ENGINE_DEATH: bool = False
VLLM_ALLOW_LONG_MAX_MODEL_LEN: bool = False
VLLM_TEST_FORCE_FP8_MARLIN: bool = False
VLLM_TEST_FORCE_LOAD_FORMAT: str = "dummy"

# Network and communication
VLLM_RPC_TIMEOUT: int = 10000  # ms
VLLM_HTTP_TIMEOUT_KEEP_ALIVE: int = 5  # seconds

# Plugin system
# When set, the parent module splits the value on commas, trims each item
# and drops empty items.
VLLM_PLUGINS: Optional[list[str]] = None
"""List of enabled plugins."""

VLLM_LORA_RESOLVER_CACHE_DIR: Optional[Path] = None
"""Directory for LoRA resolver cache."""

# Profiling
# NOTE(review): the parent module expands "~" and applies os.path.abspath to
# VLLM_TORCH_PROFILER_DIR, so a relative value is made absolute rather than
# rejected -- confirm whether the "must be an absolute path" wording below
# is still intended.
VLLM_TORCH_PROFILER_DIR: Optional[Path] = None
"""Directory for torch profiler output.

Both AsyncLLM's CPU traces as well as workers' traces (CPU & GPU) will be
saved under this directory. Note that it must be an absolute path.
"""
VLLM_TORCH_PROFILER_RECORD_SHAPES: bool = False
VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY: bool = False
VLLM_TORCH_PROFILER_WITH_STACK: bool = True
VLLM_TORCH_PROFILER_WITH_FLOPS: bool = False

# Quantization and kernels
# VLLM_USE_TRITON_AWQ and VLLM_DISABLED_KERNELS are hashed by compute_hash().
VLLM_USE_TRITON_AWQ: bool = False
VLLM_ALLOW_RUNTIME_LORA_UPDATING: bool = False
VLLM_SKIP_P2P_CHECK: bool = True
# Parsed as a comma-separated list when set. The default below is a single
# module-level list object.
VLLM_DISABLED_KERNELS: list[str] = []
|
||||
|
||||
# Version control
# The parent module's set_vllm_use_v1() writes this variable into os.environ
# and refuses to do so when it is already set.
VLLM_USE_V1: bool = True

# ROCm specific settings
# Every name in this group is one of the factors hashed by compute_hash() in
# the parent module.
VLLM_ROCM_USE_AITER: bool = False
VLLM_ROCM_USE_AITER_PAGED_ATTN: bool = False
VLLM_ROCM_USE_AITER_LINEAR: bool = True
VLLM_ROCM_USE_AITER_MOE: bool = True
VLLM_ROCM_USE_AITER_RMSNORM: bool = True
VLLM_ROCM_USE_AITER_MLA: bool = True
VLLM_ROCM_USE_AITER_MHA: bool = True
VLLM_ROCM_USE_SKINNY_GEMM: bool = True
VLLM_ROCM_FP8_PADDING: bool = True
VLLM_ROCM_MOE_PADDING: bool = True
VLLM_ROCM_CUSTOM_PAGED_ATTN: bool = True
# A value read from the environment is upper-cased by the parent module.
VLLM_ROCM_QUICK_REDUCE_QUANTIZATION: str = "NONE"
VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16: bool = True
VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB: Optional[int] = None

# V1 specific settings
VLLM_ENABLE_V1_MULTIPROCESSING: bool = True
VLLM_LOG_BATCHSIZE_INTERVAL: float = -1
VLLM_DISABLE_COMPILE_CACHE: bool = False

# Scale constants for FP8 KV Cache
Q_SCALE_CONSTANT: int = 200
K_SCALE_CONSTANT: int = 200
V_SCALE_CONSTANT: int = 100

# Development and debugging
VLLM_SERVER_DEV_MODE: bool = False
VLLM_V1_OUTPUT_PROC_CHUNK_SIZE: int = 128
# Hashed by compute_hash().
VLLM_MLA_DISABLE: bool = False

# Ray settings continued
VLLM_RAY_PER_WORKER_GPUS: float = 1.0
VLLM_RAY_BUNDLE_INDICES: str = ""

# CUDA settings
VLLM_CUDART_SO_PATH: Optional[Path] = None
"""Path to the CUDA runtime shared object library."""

# Data parallel settings
# VLLM_DP_RANK and VLLM_DP_SIZE are hashed by compute_hash().
VLLM_DP_RANK: int = 0
# Computed by the parent module: VLLM_DP_RANK_LOCAL if set, otherwise
# VLLM_DP_RANK, otherwise 0.
VLLM_DP_RANK_LOCAL: int = 0  # Will be computed dynamically
VLLM_DP_SIZE: int = 1
VLLM_DP_MASTER_IP: str = "127.0.0.1"
VLLM_DP_MASTER_PORT: int = 0
VLLM_MOE_DP_CHUNK_SIZE: int = 256
VLLM_RANDOMIZE_DP_DUMMY_INPUTS: bool = False
||||
|
||||
# CI and testing
VLLM_CI_USE_S3: bool = False

# Model redirection and quantization
VLLM_MODEL_REDIRECT_PATH: Optional[Path] = None
"""Path for model redirection."""
VLLM_MARLIN_USE_ATOMIC_ADD: bool = False
VLLM_MXFP4_USE_MARLIN: Optional[bool] = None

# Cache settings
VLLM_V0_USE_OUTLINES_CACHE: bool = False
VLLM_V1_USE_OUTLINES_CACHE: bool = False

# TPU settings
VLLM_TPU_BUCKET_PADDING_GAP: int = 0
VLLM_TPU_MOST_MODEL_LEN: Optional[int] = None
# Derived by the parent module: true when JAX_PLATFORMS contains "proxy"
# (case-insensitive). An environment variable of this name is not read.
VLLM_TPU_USING_PATHWAYS: bool = False

# DeepGemm settings
# VLLM_USE_DEEP_GEMM is hashed by compute_hash() in the parent module.
VLLM_USE_DEEP_GEMM: bool = False
VLLM_USE_DEEP_GEMM_E8M0: bool = True
VLLM_SKIP_DEEP_GEMM_WARMUP: bool = False

# FlashInfer settings
# Every name in this group is hashed by compute_hash().
VLLM_USE_FUSED_MOE_GROUPED_TOPK: bool = True
VLLM_USE_FLASHINFER_MOE_FP8: bool = False
VLLM_USE_FLASHINFER_MOE_FP4: bool = False
VLLM_FLASHINFER_MOE_BACKEND: str = "throughput"
VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8: bool = False
VLLM_USE_FLASHINFER_MOE_MXFP4_BF16: bool = False

# Additional settings
VLLM_XGRAMMAR_CACHE_MB: int = 512
VLLM_MSGPACK_ZERO_COPY_THRESHOLD: int = 256
VLLM_ALLOW_INSECURE_SERIALIZATION: bool = False

# NIXL settings
VLLM_NIXL_SIDE_CHANNEL_HOST: str = "localhost"
VLLM_NIXL_SIDE_CHANNEL_PORT: int = 5557
VLLM_NIXL_ABORT_REQUEST_TIMEOUT: int = 120

# Communication backends
VLLM_ALL2ALL_BACKEND: str = "naive"

# Expert parallel settings
VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: int = 163840
# A value read from the environment is lower-cased by the parent module.
VLLM_MOE_ROUTING_SIMULATION_STRATEGY: str = ""

# Tool and timeout settings
VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS: int = 1
VLLM_SLEEP_WHEN_IDLE: bool = False
VLLM_MQ_MAX_CHUNK_BYTES_MB: int = 16
VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS: int = 300

# Cache and memory settings
VLLM_KV_CACHE_LAYOUT: Optional[str] = None
VLLM_COMPUTE_NANS_IN_LOGITS: bool = False
VLLM_USE_NVFP4_CT_EMULATIONS: bool = False

# CUDA specific settings
# VLLM_USE_CUDNN_PREFILL, VLLM_USE_TRTLLM_ATTENTION and
# VLLM_USE_TRTLLM_FP4_GEMM are hashed by compute_hash().
VLLM_USE_CUDNN_PREFILL: bool = False
VLLM_USE_TRTLLM_ATTENTION: Optional[str] = None
VLLM_HAS_FLASHINFER_CUBIN: bool = False
VLLM_USE_TRTLLM_FP4_GEMM: bool = False
VLLM_ENABLE_CUDAGRAPH_GC: bool = False

# Network settings
VLLM_LOOPBACK_IP: str = ""
VLLM_PROCESS_NAME_PREFIX: str = "VLLM"

# Attention and cache management
VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE: bool = False
VLLM_ENABLE_RESPONSES_API_STORE: bool = False
VLLM_ALLREDUCE_USE_SYMM_MEM: bool = False

# Configuration folder
VLLM_TUNED_CONFIG_FOLDER: Optional[Path] = None
"""Allows vllm to find tuned config under customized folder."""
|
||||
|
||||
|
||||
# Snapshot of every declared variable and its default, keyed by name.
# A module global is included when its name is all upper-case, does not start
# with an underscore, and its value is not callable. The parent package
# imports this mapping to drive its lookups.
__defaults = {
    key: default
    for key, default in globals().items()
    if key.isupper() and not key.startswith('_') and not callable(default)
}
|
||||
Reference in New Issue
Block a user