potential hang fix

Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
2025-09-26 10:24:13 -07:00
2 changed files with 11 additions and 78 deletions
--- a/csrc/attention/mla/cutlass_sm100_mla/kernel/sm100_fmha_mla_tma_warpspecialized.hpp
+++ b/csrc/attention/mla/cutlass_sm100_mla/kernel/sm100_fmha_mla_tma_warpspecialized.hpp
@@ -582,7 +582,9 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
        auto problem_shape = params.problem_shape;
 	auto local_split_kv = params.split_kv;
        if (params.mainloop.ptr_seq != nullptr) {
-          get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)];
+          auto seqlen = params.mainloop.ptr_seq[get<2>(blk_coord)];
+          if (seqlen == 0) continue;
+          get<1>(problem_shape) = seqlen;
 	  if (params.ptr_split_kv != nullptr) {
            local_split_kv = params.ptr_split_kv[get<2>(blk_coord)];
          }
@@ -607,7 +609,9 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
 	  auto problem_shape = params.problem_shape;
 	  auto local_split_kv = params.split_kv;
          if (params.mainloop.ptr_seq != nullptr) {
-            get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)];
+          auto seqlen = params.mainloop.ptr_seq[get<2>(blk_coord)];
+          if (seqlen == 0) continue;
+          get<1>(problem_shape) = seqlen;
 	    if (params.ptr_split_kv != nullptr) {
              local_split_kv = params.ptr_split_kv[get<2>(blk_coord)];
            }
@@ -636,7 +640,9 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
 	    auto problem_shape = params.problem_shape;
 	    auto local_split_kv = params.split_kv;
            if (params.mainloop.ptr_seq != nullptr) {
-              get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)];
+            auto seqlen = params.mainloop.ptr_seq[get<2>(blk_coord)];
+            if (seqlen == 0) continue;
+            get<1>(problem_shape) = seqlen;
 	      if (params.ptr_split_kv != nullptr) {
 	        local_split_kv = params.ptr_split_kv[get<2>(blk_coord)];
 	      }
--- a/docs/getting_started/installation/cpu/x86.inc.md
+++ b/docs/getting_started/installation/cpu/x86.inc.md
@@ -20,80 +20,7 @@ vLLM supports basic model inferencing and serving on x86 CPU platform, with data
 # --8<-- [end:pre-built-wheels]
 # --8<-- [start:build-wheel-from-source]

-Install recommended compiler. We recommend to use `gcc/g++ >= 12.3.0` as the default compiler to avoid potential problems. For example, on Ubuntu 22.4, you can run:
-
-```bash
-sudo apt-get update -y
-sudo apt-get install -y gcc-12 g++-12 libnuma-dev python3-dev
-sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
-```
-
-Clone the vLLM project:
-
-```bash
-git clone https://github.com/vllm-project/vllm.git vllm_source
-cd vllm_source
-```
-
-Install the required dependencies:
-
-```bash
-uv pip install -r requirements/cpu-build.txt --torch-backend cpu
-uv pip install -r requirements/cpu.txt --torch-backend cpu
-```
-
-??? console "pip"
-    ```bash
-    pip install --upgrade pip
-    pip install -v -r requirements/cpu-build.txt --extra-index-url https://download.pytorch.org/whl/cpu
-    pip install -v -r requirements/cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
-    ```
-
-Build and install vLLM:
-
-```bash
-VLLM_TARGET_DEVICE=cpu uv pip install . --no-build-isolation
-```
-
-If you want to develop vLLM, install it in editable mode instead.
-
-```bash
-VLLM_TARGET_DEVICE=cpu uv pip install -e . --no-build-isolation
-```
-
-Optionally, build a portable wheel which you can then install elsewhere:
-
-```bash
-VLLM_TARGET_DEVICE=cpu uv build --wheel
-```
-
-```bash
-uv pip install dist/*.whl
-```
-
-??? console "pip"
-    ```bash
-    VLLM_TARGET_DEVICE=cpu python -m build --wheel --no-isolation
-    ```
-
-    ```bash
-    pip install dist/*.whl
-    ```
-
-!!! example "Troubleshooting"
-    - **NumPy ≥2.0 error**: Downgrade using `pip install "numpy<2.0"`.
-    - **CMake picks up CUDA**: Add `CMAKE_DISABLE_FIND_PACKAGE_CUDA=ON` to prevent CUDA detection during CPU builds, even if CUDA is installed.
-    - `AMD` requies at least 4th gen processors (Zen 4/Genoa) or higher to support [AVX512](https://www.phoronix.com/review/amd-zen4-avx512) to run vLLM on CPU.
-    - If you receive an error such as: `Could not find a version that satisfies the requirement torch==X.Y.Z+cpu+cpu`, consider updating [pyproject.toml](https://github.com/vllm-project/vllm/blob/main/pyproject.toml) to help pip resolve the dependency.
-    ```toml title="pyproject.toml"
-    [build-system]
-    requires = [
-      "cmake>=3.26.1",
-      ...
-      "torch==X.Y.Z+cpu"   # <-------
-    ]
-    ```
-    - If you are building vLLM from source and not using the pre-built images, remember to set `LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD"` on x86 machines before running vLLM.
+--8<-- "docs/getting_started/installation/cpu/build.inc.md"

 # --8<-- [end:build-wheel-from-source]
 # --8<-- [start:pre-built-images]
@@ -130,4 +57,4 @@ docker run --rm \

 # --8<-- [end:build-image-from-source]
 # --8<-- [start:extra-information]
-# --8<-- [end:extra-information]
+# --8<-- [end:extra-information]