diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 95a3866e6b..b4518866b4 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -55,11 +55,6 @@ repos: types_or: [python, pyi] require_serial: true additional_dependencies: [mypy==1.11.1, regex, types-cachetools, types-setuptools, types-PyYAML, types-requests, types-torch, pydantic] - - id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward - name: Run mypy for Python 3.9 - entry: python tools/pre_commit/mypy.py 1 "3.9" - <<: *mypy_common - stages: [manual] # Only run in CI - id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward name: Run mypy for Python 3.10 entry: python tools/pre_commit/mypy.py 1 "3.10" @@ -75,6 +70,11 @@ repos: entry: python tools/pre_commit/mypy.py 1 "3.12" <<: *mypy_common stages: [manual] # Only run in CI + - id: mypy-3.13 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward + name: Run mypy for Python 3.13 + entry: python tools/pre_commit/mypy.py 1 "3.13" + <<: *mypy_common + stages: [manual] # Only run in CI - id: shellcheck name: Lint shell scripts entry: tools/shellcheck.sh diff --git a/CMakeLists.txt b/CMakeLists.txt index 66967b655a..db72244e0e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -34,7 +34,7 @@ install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS) # Supported python versions. These versions will be searched in order, the # first match will be selected. These should be kept in sync with setup.py. # -set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12" "3.13") +set(PYTHON_SUPPORTED_VERSIONS "3.10" "3.11" "3.12" "3.13") # Supported AMD GPU architectures. set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151") diff --git a/benchmarks/multi_turn/benchmark_serving_multi_turn.py b/benchmarks/multi_turn/benchmark_serving_multi_turn.py index 66d85eaf51..233ed460fc 100644 --- a/benchmarks/multi_turn/benchmark_serving_multi_turn.py +++ b/benchmarks/multi_turn/benchmark_serving_multi_turn.py @@ -13,7 +13,7 @@ from datetime import datetime from enum import Enum from http import HTTPStatus from statistics import mean -from typing import NamedTuple, Optional, Union +from typing import NamedTuple, Union import aiohttp # type: ignore import numpy as np # type: ignore @@ -46,9 +46,9 @@ class ConversationSampling(str, Enum): class ClientArgs(NamedTuple): seed: int - max_num_requests: Optional[int] + max_num_requests: int | None skip_first_turn: bool - max_turns: Optional[int] + max_turns: int | None max_active_conversations: int verbose: bool print_content: bool @@ -109,9 +109,9 @@ class RequestStats(NamedTuple): class MetricStats: def __init__(self) -> None: - self.min: Optional[float] = None - self.max: Optional[float] = None - self.avg: Optional[float] = None + self.min: float | None = None + self.max: float | None = None + self.avg: float | None = None self.sum = 0.0 self.count = 0 @@ -143,7 +143,7 @@ class MovingAverage: self.index = 0 self.sum = 0.0 self.count = 0 - self.avg: Optional[float] = None + self.avg: float | None = None def update(self, new_value: float) -> None: if self.count < self.window_size: @@ -198,14 +198,6 @@ class DebugStats: self.logger.info("-" * 50) -# Must support Python 3.8, we can't use str.removeprefix(prefix) -# introduced in Python 3.9 -def remove_prefix(text: str, prefix: str) -> str: - if text.startswith(prefix): - return text[len(prefix) :] - return text - - def nanosec_to_millisec(value: float) -> float: return value / 1000000.0 @@ -220,8 +212,8 @@ async def send_request( chat_url: str, model: str, stream: bool = True, - min_tokens: Optional[int] = None, - max_tokens: Optional[int] = None, + min_tokens: int | None = None, + max_tokens: int | None = None, ) -> ServerResponse: payload = { "model": model, @@ -250,9 +242,9 @@ async def send_request( timeout = aiohttp.ClientTimeout(total=timeout_sec) valid_response = True - ttft: Optional[float] = None + ttft: float | None = None chunk_delay: list[int] = [] - latency: Optional[float] = None + latency: float | None = None first_chunk = "" generated_text = "" @@ -269,7 +261,7 @@ async def send_request( if not chunk_bytes: continue - chunk = remove_prefix(chunk_bytes.decode("utf-8"), "data: ") + chunk = chunk_bytes.decode("utf-8").removeprefix("data: ") if chunk == "[DONE]": # End of stream latency = time.perf_counter_ns() - start_time @@ -364,7 +356,7 @@ async def send_turn( req_args: RequestArgs, verbose: bool, verify_output: bool, -) -> Optional[RequestStats]: +) -> RequestStats | None: assert messages_to_use > 0 assert messages_to_use <= len(conversation_messages) @@ -769,7 +761,7 @@ def get_client_config( "Number of conversations must be equal or larger than the number of clients" ) - max_req_per_client: Optional[int] = None + max_req_per_client: int | None = None if args.max_num_requests is not None: # Max number of requests per client req_per_client = args.max_num_requests // args.num_clients @@ -1032,7 +1024,7 @@ def process_statistics( warmup_percentages: list[float], test_params: dict, verbose: bool, - gen_conv_args: Optional[GenConvArgs] = None, + gen_conv_args: GenConvArgs | None = None, excel_output: bool = False, ) -> None: if len(client_metrics) == 0: diff --git a/docker/Dockerfile.cpu b/docker/Dockerfile.cpu index 2ed02ff9e3..2aed1872ee 100644 --- a/docker/Dockerfile.cpu +++ b/docker/Dockerfile.cpu @@ -13,7 +13,7 @@ # vllm-dev: used for development # # Build arguments: -# PYTHON_VERSION=3.12 (default)|3.11|3.10|3.9 +# PYTHON_VERSION=3.13|3.12 (default)|3.11|3.10 # VLLM_CPU_DISABLE_AVX512=false (default)|true # VLLM_CPU_AVX512BF16=false (default)|true # VLLM_CPU_AVX512VNNI=false (default)|true diff --git a/docs/contributing/README.md b/docs/contributing/README.md index b0a95b3b3d..b52bdf7f02 100644 --- a/docs/contributing/README.md +++ b/docs/contributing/README.md @@ -54,7 +54,7 @@ For more details about installing from source and installing for other hardware, For an optimized workflow when iterating on C++/CUDA kernels, see the [Incremental Compilation Workflow](./incremental_build.md) for recommendations. !!! tip - vLLM is compatible with Python versions 3.9 to 3.12. However, vLLM's default [Dockerfile](gh-file:docker/Dockerfile) ships with Python 3.12 and tests in CI (except `mypy`) are run with Python 3.12. + vLLM is compatible with Python versions 3.10 to 3.13. However, vLLM's default [Dockerfile](gh-file:docker/Dockerfile) ships with Python 3.12 and tests in CI (except `mypy`) are run with Python 3.12. Therefore, we recommend developing with Python 3.12 to minimise the chance of your local environment clashing with our CI environment. @@ -83,7 +83,7 @@ vLLM's `pre-commit` hooks will now run automatically every time you commit. ```bash pre-commit run --hook-stage manual markdownlint - pre-commit run --hook-stage manual mypy-3.9 + pre-commit run --hook-stage manual mypy-3.10 ``` ### Documentation diff --git a/docs/getting_started/installation/cpu.md b/docs/getting_started/installation/cpu.md index f8b4f75308..f290836f94 100644 --- a/docs/getting_started/installation/cpu.md +++ b/docs/getting_started/installation/cpu.md @@ -20,7 +20,7 @@ vLLM is a Python library that supports the following CPU variants. Select your C ## Requirements -- Python: 3.9 -- 3.12 +- Python: 3.10 -- 3.13 === "Intel/AMD x86" diff --git a/docs/getting_started/installation/gpu.md b/docs/getting_started/installation/gpu.md index e688cefea0..45162b86e2 100644 --- a/docs/getting_started/installation/gpu.md +++ b/docs/getting_started/installation/gpu.md @@ -17,7 +17,7 @@ vLLM is a Python library that supports the following GPU variants. Select your G ## Requirements - OS: Linux -- Python: 3.9 -- 3.12 +- Python: 3.10 -- 3.13 !!! note vLLM does not support Windows natively. To run vLLM on Windows, you can use the Windows Subsystem for Linux (WSL) with a compatible Linux distribution, or use some community-maintained forks, e.g. [https://github.com/SystemPanic/vllm-windows](https://github.com/SystemPanic/vllm-windows). diff --git a/docs/getting_started/quickstart.md b/docs/getting_started/quickstart.md index 2af26626d2..49e1f6fac7 100644 --- a/docs/getting_started/quickstart.md +++ b/docs/getting_started/quickstart.md @@ -8,7 +8,7 @@ This guide will help you quickly get started with vLLM to perform: ## Prerequisites - OS: Linux -- Python: 3.9 -- 3.13 +- Python: 3.10 -- 3.13 ## Installation diff --git a/examples/online_serving/structured_outputs/pyproject.toml b/examples/online_serving/structured_outputs/pyproject.toml index 8f31405ff5..5e366ab0a0 100644 --- a/examples/online_serving/structured_outputs/pyproject.toml +++ b/examples/online_serving/structured_outputs/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "examples-online-structured-outputs" -requires-python = ">=3.9, <3.13" +requires-python = ">=3.10, <3.14" dependencies = ["openai==1.78.1", "pydantic==2.11.4"] version = "0.0.0" diff --git a/pyproject.toml b/pyproject.toml index 471eed98f9..49a7a0b8b1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,7 +20,6 @@ license-files = ["LICENSE"] readme = "README.md" description = "A high-throughput and memory-efficient inference and serving engine for LLMs" classifiers = [ - "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", @@ -31,7 +30,7 @@ classifiers = [ "Topic :: Scientific/Engineering :: Artificial Intelligence", "Topic :: Scientific/Engineering :: Information Analysis", ] -requires-python = ">=3.9,<3.14" +requires-python = ">=3.10,<3.14" dynamic = [ "version", "dependencies", "optional-dependencies"] [project.urls] @@ -79,12 +78,18 @@ ignore = [ "F405", "F403", # lambda expression assignment "E731", + # zip without `strict=` + "B905", # Loop control variable not used within loop body "B007", # f-string format "UP032", # Can remove once 3.10+ is the minimum Python version "UP007", + "UP027", + "UP035", + "UP038", + "UP045", ] [tool.ruff.format] diff --git a/requirements/cpu.txt b/requirements/cpu.txt index 2db6d87ee6..d53ab36493 100644 --- a/requirements/cpu.txt +++ b/requirements/cpu.txt @@ -1,8 +1,7 @@ # Common dependencies -r common.txt -numba == 0.60.0; python_version == '3.9' and platform_machine != "s390x" # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding -numba == 0.61.2; python_version > '3.9' and platform_machine != "s390x" +numba == 0.61.2; platform_machine != "s390x" # Required for N-gram speculative decoding # Dependencies for CPUs packaging>=24.2 diff --git a/requirements/cuda.txt b/requirements/cuda.txt index 3f8b8fca32..ed03247bcf 100644 --- a/requirements/cuda.txt +++ b/requirements/cuda.txt @@ -1,8 +1,7 @@ # Common dependencies -r common.txt -numba == 0.60.0; python_version == '3.9' # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding -numba == 0.61.2; python_version > '3.9' +numba == 0.61.2 # Required for N-gram speculative decoding # Dependencies for NVIDIA GPUs ray[cgraph]>=2.48.0 # Ray Compiled Graph, required for pipeline parallelism in V1. diff --git a/requirements/nightly_torch_test.txt b/requirements/nightly_torch_test.txt index 33f1bc04ea..dc1a9c0263 100644 --- a/requirements/nightly_torch_test.txt +++ b/requirements/nightly_torch_test.txt @@ -40,8 +40,7 @@ buildkite-test-collector==0.1.9 genai_perf==0.0.8 tritonclient==2.51.0 -numba == 0.60.0; python_version == '3.9' # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding -numba == 0.61.2; python_version > '3.9' +numba == 0.61.2 # Required for N-gram speculative decoding numpy runai-model-streamer[s3,gcs]==0.14.0 fastsafetensors>=0.1.10 diff --git a/requirements/rocm.txt b/requirements/rocm.txt index 9077085f26..d9743f0446 100644 --- a/requirements/rocm.txt +++ b/requirements/rocm.txt @@ -1,8 +1,7 @@ # Common dependencies -r common.txt -numba == 0.60.0; python_version == '3.9' # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding -numba == 0.61.2; python_version > '3.9' +numba == 0.61.2 # Required for N-gram speculative decoding # Dependencies for AMD GPUs datasets diff --git a/requirements/test.in b/requirements/test.in index ef21d6db5b..b62bf62e57 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -48,8 +48,7 @@ buildkite-test-collector==0.1.9 genai_perf==0.0.8 tritonclient==2.51.0 -numba == 0.60.0; python_version == '3.9' # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding -numba == 0.61.2; python_version > '3.9' +numba == 0.61.2 # Required for N-gram speculative decoding numpy runai-model-streamer[s3,gcs]==0.14.0 fastsafetensors>=0.1.10 diff --git a/requirements/xpu.txt b/requirements/xpu.txt index 74f5b05b23..5d52400e50 100644 --- a/requirements/xpu.txt +++ b/requirements/xpu.txt @@ -9,7 +9,7 @@ setuptools>=77.0.3,<80.0.0 wheel jinja2>=3.1.6 datasets # for benchmark scripts -numba == 0.60.0 # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding +numba == 0.61.2 # Required for N-gram speculative decoding nixl==0.3.0 # for PD disaggregation torch==2.8.0+xpu torchaudio diff --git a/tools/validate_config.py b/tools/validate_config.py index d779edabc8..fb6f0e6a92 100644 --- a/tools/validate_config.py +++ b/tools/validate_config.py @@ -8,6 +8,7 @@ and that each field has a docstring. import ast import inspect import sys +from itertools import pairwise import regex as re @@ -20,19 +21,6 @@ def get_attr_docs(cls_node: ast.ClassDef) -> dict[str, str]: https://davidism.com/mit-license/ """ - def pairwise(iterable): - """ - Manually implement https://docs.python.org/3/library/itertools.html#itertools.pairwise - - Can be removed when Python 3.9 support is dropped. - """ - iterator = iter(iterable) - a = next(iterator, None) - - for b in iterator: - yield a, b - a = b - out = {} # Consider each pair of nodes. diff --git a/vllm/config/utils.py b/vllm/config/utils.py index 889ebf45b1..ff2196f2d0 100644 --- a/vllm/config/utils.py +++ b/vllm/config/utils.py @@ -7,6 +7,7 @@ import inspect import textwrap from collections.abc import Iterable from dataclasses import MISSING, Field, field, fields, is_dataclass, replace +from itertools import pairwise from typing import TYPE_CHECKING, Any, Protocol, TypeVar import regex as re @@ -102,19 +103,6 @@ def get_attr_docs(cls: type[Any]) -> dict[str, str]: https://davidism.com/mit-license/ """ - def pairwise(iterable): - """ - Manually implement https://docs.python.org/3/library/itertools.html#itertools.pairwise - - Can be removed when Python 3.9 support is dropped. - """ - iterator = iter(iterable) - a = next(iterator, None) - - for b in iterator: - yield a, b - a = b - try: cls_node = ast.parse(textwrap.dedent(inspect.getsource(cls))).body[0] except (OSError, KeyError, TypeError): diff --git a/vllm/plugins/__init__.py b/vllm/plugins/__init__.py index 0c83d49c45..094bda3f93 100644 --- a/vllm/plugins/__init__.py +++ b/vllm/plugins/__init__.py @@ -15,12 +15,7 @@ plugins_loaded = False def load_plugins_by_group(group: str) -> dict[str, Callable[[], Any]]: - import sys - - if sys.version_info < (3, 10): - from importlib_metadata import entry_points - else: - from importlib.metadata import entry_points + from importlib.metadata import entry_points allowed_plugins = envs.VLLM_PLUGINS diff --git a/vllm/v1/sample/logits_processor/__init__.py b/vllm/v1/sample/logits_processor/__init__.py index e9935f72c1..06b9e4b12d 100644 --- a/vllm/v1/sample/logits_processor/__init__.py +++ b/vllm/v1/sample/logits_processor/__init__.py @@ -55,12 +55,7 @@ BUILTIN_LOGITS_PROCESSORS: list[type[LogitsProcessor]] = [ def _load_logitsprocs_plugins() -> list[type[LogitsProcessor]]: """Load all installed logit processor plugins""" - import sys - - if sys.version_info < (3, 10): - from importlib_metadata import entry_points - else: - from importlib.metadata import entry_points + from importlib.metadata import entry_points installed_logitsprocs_plugins = entry_points(group=LOGITSPROCS_GROUP) if len(installed_logitsprocs_plugins) == 0: