Compare commits

2 Commits

- ebfce922f9 full cg support
  Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
  2025-09-26 12:51:46 -07:00

- b761df963c [Doc]: improve CPU(x86) build-wheel-from-source section (#25617)
  Signed-off-by: Kosseila (CloudThrill) <klouddude@gmail.com>
  2025-09-26 10:26:33 -07:00
4 changed files with 106 additions and 17 deletions


@@ -582,9 +582,7 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
auto problem_shape = params.problem_shape;
auto local_split_kv = params.split_kv;
if (params.mainloop.ptr_seq != nullptr) {
auto seqlen = params.mainloop.ptr_seq[get<2>(blk_coord)];
if (seqlen == 0) continue;
get<1>(problem_shape) = seqlen;
get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)];
if (params.ptr_split_kv != nullptr) {
local_split_kv = params.ptr_split_kv[get<2>(blk_coord)];
}
@@ -609,9 +607,7 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
auto problem_shape = params.problem_shape;
auto local_split_kv = params.split_kv;
if (params.mainloop.ptr_seq != nullptr) {
auto seqlen = params.mainloop.ptr_seq[get<2>(blk_coord)];
if (seqlen == 0) continue;
get<1>(problem_shape) = seqlen;
get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)];
if (params.ptr_split_kv != nullptr) {
local_split_kv = params.ptr_split_kv[get<2>(blk_coord)];
}
@@ -640,9 +636,7 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
auto problem_shape = params.problem_shape;
auto local_split_kv = params.split_kv;
if (params.mainloop.ptr_seq != nullptr) {
auto seqlen = params.mainloop.ptr_seq[get<2>(blk_coord)];
if (seqlen == 0) continue;
get<1>(problem_shape) = seqlen;
get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)];
if (params.ptr_split_kv != nullptr) {
local_split_kv = params.ptr_split_kv[get<2>(blk_coord)];
}


@@ -20,7 +20,80 @@ vLLM supports basic model inferencing and serving on x86 CPU platform, with data
# --8<-- [end:pre-built-wheels]
# --8<-- [start:build-wheel-from-source]
--8<-- "docs/getting_started/installation/cpu/build.inc.md"
Install the recommended compiler. We recommend using `gcc/g++ >= 12.3.0` as the default compiler to avoid potential problems. For example, on Ubuntu 22.04, you can run:
```bash
sudo apt-get update -y
sudo apt-get install -y gcc-12 g++-12 libnuma-dev python3-dev
sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
```
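Optionally, verify that GCC 12 is now the default compiler:
```bash
gcc --version   # should report 12.x
g++ --version   # should report 12.x
```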
Clone the vLLM project:
```bash
git clone https://github.com/vllm-project/vllm.git vllm_source
cd vllm_source
```
Install the required dependencies:
```bash
uv pip install -r requirements/cpu-build.txt --torch-backend cpu
uv pip install -r requirements/cpu.txt --torch-backend cpu
```
??? console "pip"
```bash
pip install --upgrade pip
pip install -v -r requirements/cpu-build.txt --extra-index-url https://download.pytorch.org/whl/cpu
pip install -v -r requirements/cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
```
Build and install vLLM:
```bash
VLLM_TARGET_DEVICE=cpu uv pip install . --no-build-isolation
```
If you want to develop vLLM, install it in editable mode instead.
```bash
VLLM_TARGET_DEVICE=cpu uv pip install -e . --no-build-isolation
```
Optionally, build a portable wheel which you can then install elsewhere:
```bash
VLLM_TARGET_DEVICE=cpu uv build --wheel
```
```bash
uv pip install dist/*.whl
```
??? console "pip"
```bash
VLLM_TARGET_DEVICE=cpu python -m build --wheel --no-isolation
```
```bash
pip install dist/*.whl
```
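Optionally, verify that the installed wheel imports correctly:
```bash
python -c "import vllm; print(vllm.__version__)"
```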
!!! example "Troubleshooting"
- **NumPy ≥2.0 error**: Downgrade using `pip install "numpy<2.0"`.
- **CMake picks up CUDA**: Add `CMAKE_DISABLE_FIND_PACKAGE_CUDA=ON` to prevent CUDA detection during CPU builds, even if CUDA is installed (see the example after this list).
- **AMD**: Requires at least a 4th-gen processor (Zen 4/Genoa) or newer for [AVX512](https://www.phoronix.com/review/amd-zen4-avx512) support when running vLLM on CPU.
- If you receive an error such as: `Could not find a version that satisfies the requirement torch==X.Y.Z+cpu+cpu`, consider updating [pyproject.toml](https://github.com/vllm-project/vllm/blob/main/pyproject.toml) to help pip resolve the dependency.
```toml title="pyproject.toml"
[build-system]
requires = [
"cmake>=3.26.1",
...
"torch==X.Y.Z+cpu" # <-------
]
```
- If you are building vLLM from source rather than using the pre-built images, remember to set `LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD"` on x86 machines before running vLLM, as shown in the example below.
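Putting the CMake and `LD_PRELOAD` tips together, a minimal sketch of a CPU-only build and launch on an x86 machine that also has a CUDA toolkit installed. It assumes the build picks up extra CMake flags from the conventional `CMAKE_ARGS` environment variable (an assumption, not stated in this doc); the model name is only an example.
```bash
# Assumption: extra CMake flags can be passed via CMAKE_ARGS.
CMAKE_ARGS="-DCMAKE_DISABLE_FIND_PACKAGE_CUDA=ON" \
    VLLM_TARGET_DEVICE=cpu uv pip install . --no-build-isolation

# Preload tcmalloc before starting the server, as recommended above.
export LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD"
vllm serve Qwen/Qwen2.5-1.5B-Instruct  # example model
```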
# --8<-- [end:build-wheel-from-source]
# --8<-- [start:pre-built-images]
@@ -57,4 +130,4 @@ docker run --rm \
# --8<-- [end:build-image-from-source]
# --8<-- [start:extra-information]
# --8<-- [end:extra-information]
# --8<-- [end:extra-information]


@@ -1,16 +1,18 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Optional, Union
from typing import ClassVar, Optional, Union
import torch
from flashinfer.decode import trtllm_batch_decode_with_kv_cache_mla
from vllm.attention.backends.abstract import AttentionLayer, AttentionType
from vllm.logger import init_logger
from vllm.v1.attention.backends.utils import AttentionCGSupport
from vllm.v1.attention.backends.mla.common import (MLACommonBackend,
MLACommonImpl,
MLACommonMetadata)
MLACommonMetadata,
MLACommonMetadataBuilder)
logger = init_logger(__name__)
@@ -23,6 +25,10 @@ class FlashInferMLABackend(MLACommonBackend):
def get_name() -> str:
return "FLASHINFER_MLA"
@staticmethod
def get_builder_cls() -> type["FlashInferMLAMetadataBuilder"]:
return FlashInferMLAMetadataBuilder
@staticmethod
def get_impl_cls() -> type["FlashInferMLAImpl"]:
return FlashInferMLAImpl
@@ -34,6 +40,11 @@ g_fi_workspace = torch.zeros(
device="cuda",
)
class FlashInferMLAMetadataBuilder(MLACommonMetadataBuilder[MLACommonMetadata]):
cudagraph_support: ClassVar[
AttentionCGSupport] = AttentionCGSupport.UNIFORM_SINGLE_TOKEN_DECODE
pass
class FlashInferMLAImpl(MLACommonImpl[MLACommonMetadata]):


@@ -1,7 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Optional, Union
from typing import ClassVar, Optional, Union
import torch
@@ -13,9 +13,11 @@ from vllm.attention.ops.triton_flash_attention import triton_attention
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.triton_utils import HAS_TRITON
from vllm.v1.attention.backends.utils import AttentionCGSupport
from vllm.v1.attention.backends.mla.common import (MLACommonBackend,
MLACommonImpl,
MLACommonMetadata)
MLACommonMetadata,
MLACommonMetadataBuilder)
logger = init_logger(__name__)
@@ -24,12 +26,21 @@ class TritonMLABackend(MLACommonBackend):
@staticmethod
def get_name() -> str:
return "TRITON_MLA"
return "TRITON_MLA_VLLM_V1"
@staticmethod
def get_builder_cls() -> type["TritonMLAMetadataBuilder"]:
return TritonMLAMetadataBuilder
@staticmethod
def get_impl_cls() -> type["TritonMLAImpl"]:
return TritonMLAImpl
class TritonMLAMetadataBuilder(MLACommonMetadataBuilder[MLACommonMetadata]):
cudagraph_support: ClassVar[AttentionCGSupport] = \
AttentionCGSupport.UNIFORM_BATCH
pass
class TritonMLAImpl(MLACommonImpl[MLACommonMetadata]):
can_return_lse_for_decode: bool = True