full cg support

Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
[Doc]: improve CPU(x86) build-wheel-from-source section (#25617 )
2025-09-26 12:51:46 -07:00 · 2025-09-26 10:26:33 -07:00
4 changed files with 106 additions and 17 deletions
--- a/csrc/attention/mla/cutlass_sm100_mla/kernel/sm100_fmha_mla_tma_warpspecialized.hpp
+++ b/csrc/attention/mla/cutlass_sm100_mla/kernel/sm100_fmha_mla_tma_warpspecialized.hpp
@ -582,9 +582,7 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
        auto problem_shape = params.problem_shape;
 	auto local_split_kv = params.split_kv;
        if (params.mainloop.ptr_seq != nullptr) {
-          auto seqlen = params.mainloop.ptr_seq[get<2>(blk_coord)];
-          if (seqlen == 0) continue;
-          get<1>(problem_shape) = seqlen;
+          get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)];
 	  if (params.ptr_split_kv != nullptr) {
            local_split_kv = params.ptr_split_kv[get<2>(blk_coord)];
          }
@ -609,9 +607,7 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
 	  auto problem_shape = params.problem_shape;
 	  auto local_split_kv = params.split_kv;
          if (params.mainloop.ptr_seq != nullptr) {
-          auto seqlen = params.mainloop.ptr_seq[get<2>(blk_coord)];
-          if (seqlen == 0) continue;
-          get<1>(problem_shape) = seqlen;
+            get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)];
 	    if (params.ptr_split_kv != nullptr) {
              local_split_kv = params.ptr_split_kv[get<2>(blk_coord)];
            }
@ -640,9 +636,7 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
 	    auto problem_shape = params.problem_shape;
 	    auto local_split_kv = params.split_kv;
            if (params.mainloop.ptr_seq != nullptr) {
-            auto seqlen = params.mainloop.ptr_seq[get<2>(blk_coord)];
-            if (seqlen == 0) continue;
-            get<1>(problem_shape) = seqlen;
+              get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)];
 	      if (params.ptr_split_kv != nullptr) {
 	        local_split_kv = params.ptr_split_kv[get<2>(blk_coord)];
 	      }
--- a/docs/getting_started/installation/cpu/x86.inc.md
+++ b/docs/getting_started/installation/cpu/x86.inc.md
@ -20,7 +20,80 @@ vLLM supports basic model inferencing and serving on x86 CPU platform, with data
 # --8<-- [end:pre-built-wheels]
 # --8<-- [start:build-wheel-from-source]

--8<-- "docs/getting_started/installation/cpu/build.inc.md"
+Install the recommended compiler. We recommend using `gcc/g++ >= 12.3.0` as the default compiler to avoid potential problems. For example, on Ubuntu 22.04, you can run:
+
+```bash
+sudo apt-get update -y
+sudo apt-get install -y gcc-12 g++-12 libnuma-dev python3-dev
+sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
+```
+
+Clone the vLLM project:
+
+```bash
+git clone https://github.com/vllm-project/vllm.git vllm_source
+cd vllm_source
+```
+
+Install the required dependencies:
+
+```bash
+uv pip install -r requirements/cpu-build.txt --torch-backend cpu
+uv pip install -r requirements/cpu.txt --torch-backend cpu
+```
+
+??? console "pip"
+    ```bash
+    pip install --upgrade pip
+    pip install -v -r requirements/cpu-build.txt --extra-index-url https://download.pytorch.org/whl/cpu
+    pip install -v -r requirements/cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
+    ```
+
+Build and install vLLM:
+
+```bash
+VLLM_TARGET_DEVICE=cpu uv pip install . --no-build-isolation
+```
+
+If you want to develop vLLM, install it in editable mode instead.
+
+```bash
+VLLM_TARGET_DEVICE=cpu uv pip install -e . --no-build-isolation
+```
+
+Optionally, build a portable wheel which you can then install elsewhere:
+
+```bash
+VLLM_TARGET_DEVICE=cpu uv build --wheel
+```
+
+```bash
+uv pip install dist/*.whl
+```
+
+??? console "pip"
+    ```bash
+    VLLM_TARGET_DEVICE=cpu python -m build --wheel --no-isolation
+    ```
+
+    ```bash
+    pip install dist/*.whl
+    ```
+
+!!! example "Troubleshooting"
+    - **NumPy ≥2.0 error**: Downgrade using `pip install "numpy<2.0"`.
+    - **CMake picks up CUDA**: Add `CMAKE_DISABLE_FIND_PACKAGE_CUDA=ON` to prevent CUDA detection during CPU builds, even if CUDA is installed.
+    - `AMD` requires 4th gen processors (Zen 4/Genoa) or newer, which support [AVX512](https://www.phoronix.com/review/amd-zen4-avx512), to run vLLM on CPU.
+    - If you receive an error such as: `Could not find a version that satisfies the requirement torch==X.Y.Z+cpu+cpu`, consider updating [pyproject.toml](https://github.com/vllm-project/vllm/blob/main/pyproject.toml) to help pip resolve the dependency.
+    ```toml title="pyproject.toml"
+    [build-system]
+    requires = [
+      "cmake>=3.26.1",
+      ...
+      "torch==X.Y.Z+cpu"   # <-------
+    ]
+    ```
+    - If you are building vLLM from source and not using the pre-built images, remember to set `LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD"` on x86 machines before running vLLM.

 # --8<-- [end:build-wheel-from-source]
 # --8<-- [start:pre-built-images]
@ -57,4 +130,4 @@ docker run --rm \

 # --8<-- [end:build-image-from-source]
 # --8<-- [start:extra-information]
-# --8<-- [end:extra-information]
+# --8<-- [end:extra-information]
--- a/vllm/v1/attention/backends/mla/flashinfer_mla.py
+++ b/vllm/v1/attention/backends/mla/flashinfer_mla.py
@ -1,16 +1,18 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

-from typing import Optional, Union
+from typing import ClassVar, Optional, Union

 import torch
 from flashinfer.decode import trtllm_batch_decode_with_kv_cache_mla

 from vllm.attention.backends.abstract import AttentionLayer, AttentionType
 from vllm.logger import init_logger
+from vllm.v1.attention.backends.utils import AttentionCGSupport
 from vllm.v1.attention.backends.mla.common import (MLACommonBackend,
                                                   MLACommonImpl,
-                                                   MLACommonMetadata)
+                                                   MLACommonMetadata,
+                                                   MLACommonMetadataBuilder)

 logger = init_logger(__name__)

@ -23,6 +25,10 @@ class FlashInferMLABackend(MLACommonBackend):
    def get_name() -> str:
        return "FLASHINFER_MLA"

+    @staticmethod
+    def get_builder_cls() -> type["FlashInferMLAMetadataBuilder"]:
+        return FlashInferMLAMetadataBuilder
+
    @staticmethod
    def get_impl_cls() -> type["FlashInferMLAImpl"]:
        return FlashInferMLAImpl
@ -34,6 +40,11 @@ g_fi_workspace = torch.zeros(
    device="cuda",
 )

+class FlashInferMLAMetadataBuilder(MLACommonMetadataBuilder[MLACommonMetadata]):
+    cudagraph_support: ClassVar[
+        AttentionCGSupport] = AttentionCGSupport.UNIFORM_SINGLE_TOKEN_DECODE
+    pass
+

 class FlashInferMLAImpl(MLACommonImpl[MLACommonMetadata]):

--- a/vllm/v1/attention/backends/mla/triton_mla.py
+++ b/vllm/v1/attention/backends/mla/triton_mla.py
@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

-from typing import Optional, Union
+from typing import ClassVar, Optional, Union

 import torch

@ -13,9 +13,11 @@ from vllm.attention.ops.triton_flash_attention import triton_attention
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
 from vllm.triton_utils import HAS_TRITON
+from vllm.v1.attention.backends.utils import AttentionCGSupport
 from vllm.v1.attention.backends.mla.common import (MLACommonBackend,
                                                   MLACommonImpl,
-                                                   MLACommonMetadata)
+                                                   MLACommonMetadata,
+                                                   MLACommonMetadataBuilder)

 logger = init_logger(__name__)

@ -24,12 +26,21 @@ class TritonMLABackend(MLACommonBackend):

    @staticmethod
    def get_name() -> str:
-        return "TRITON_MLA"
+        return "TRITON_MLA_VLLM_V1"
+
+    @staticmethod
+    def get_builder_cls() -> type["TritonMLAMetadataBuilder"]:
+        return TritonMLAMetadataBuilder

    @staticmethod
    def get_impl_cls() -> type["TritonMLAImpl"]:
        return TritonMLAImpl
-
+    
+    
+class TritonMLAMetadataBuilder(MLACommonMetadataBuilder[MLACommonMetadata]):
+    cudagraph_support: ClassVar[AttentionCGSupport] = \
+        AttentionCGSupport.UNIFORM_BATCH
+    pass

 class TritonMLAImpl(MLACommonImpl[MLACommonMetadata]):
    can_return_lse_for_decode: bool = True
Author	SHA1	Message	Date
Lucas Wilkinson	ebfce922f9	full cg support Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>	2025-09-26 12:51:46 -07:00
Clouddude	b761df963c	[Doc]: improve CPU(x86) build-wheel-from-source section (#25617 ) Signed-off-by: Kosseila (CloudThrill) <klouddude@gmail.com>	2025-09-26 10:26:33 -07:00