iterate

Signed-off-by: ShriKode <shrikode@gmail.com>
initial
2025-06-28 22:21:17 +00:00 · 2025-06-27 20:18:15 +00:00 · 2025-06-27 09:04:39 -07:00 · 2025-06-27 09:01:28 -07:00 · 2025-06-27 09:42:22 -06:00 · 2025-06-27 09:00:42 -06:00
182 changed files with 10750 additions and 1809 deletions
--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
@ -159,6 +159,8 @@ run_and_track_test 14 "test_tpu_qkv_linear.py" \
    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_qkv_linear.py"
 run_and_track_test 15 "test_spmd_model_weight_loading.py" \
    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py"
+run_and_track_test 16 "test_kv_cache_update_kernel.py" \
+    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_kv_cache_update_kernel.py"

 # After all tests have been attempted, exit with the overall status.
 if [ "$overall_script_exit_code" -ne 0 ]; then
--- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
@ -28,4 +28,5 @@ docker run \
    sh -c '
    VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
    VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m -tp 2
+    VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
 '
--- a/.buildkite/scripts/tpu/docker_run_bm.sh
+++ b/.buildkite/scripts/tpu/docker_run_bm.sh
@ -68,7 +68,7 @@ docker run \

 echo "run script..."
 echo
-docker exec "$CONTAINER_NAME" /bin/bash -c ".buildkite/scripts/hardware_ci/run_bm.sh"
+docker exec "$CONTAINER_NAME" /bin/bash -c ".buildkite/scripts/tpu/run_bm.sh"

 echo "copy result back..."
 VLLM_LOG="$LOG_ROOT/$TEST_NAME"_vllm_log.txt
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@ -41,6 +41,16 @@ steps:
  # TODO: add `--strict` once warnings in docstrings are fixed
  - mkdocs build

+- label: Pytorch Nightly Dependency Override Check # 2min
+  # if this test fails, it means the nightly torch version is not compatible with some
+  # of the dependencies. Please check the error message and add the package to whitelist
+  # in /vllm/tools/generate_nightly_torch_test.py
+  soft_fail: true
+  source_file_dependencies:
+  - requirements/nightly_torch_test.txt
+  commands:
+  - bash standalone_tests/pytorch_nightly_dependency.sh
+
 - label: Async Engine, Inputs, Utils, Worker Test # 24min
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
@ -168,6 +178,23 @@ steps:
  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
  - popd

+- label: EPLB Algorithm Test
+  working_dir: "/vllm-workspace/tests"
+  source_file_dependencies:
+  - vllm/distributed/eplb
+  - tests/distributed/test_eplb_algo.py
+  commands:
+  - pytest -v -s distributed/test_eplb_algo.py
+
+- label: EPLB Execution Test # 5min
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 4
+  source_file_dependencies:
+  - vllm/distributed/eplb
+  - tests/distributed/test_eplb_execute.py
+  commands:
+  - pytest -v -s distributed/test_eplb_execute.py
+
 - label: Metrics, Tracing Test # 10min
  mirror_hardwares: [amdexperimental, amdproduction]
  num_gpus: 2
@ -615,13 +642,18 @@ steps:
  - vllm/executor/
  - vllm/model_executor/models/
  - tests/distributed/
+  - tests/examples/offline_inference/data_parallel.py
  commands:
  - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
    - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
+    - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
+    - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
    - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
    - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
  - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
    - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
+    - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
+    - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code

 - label: Distributed Tests (2 GPUs) # 40min
  mirror_hardwares: [amdexperimental]
@ -745,7 +777,7 @@ steps:
    - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt

 - label: Weight Loading Multiple GPU Test - Large Models # optional
-  mirror_hardwares: [amdexperimental] 
+  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  gpu: a100
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@ -18,6 +18,10 @@
 /vllm/entrypoints @aarnphm
 CMakeLists.txt @tlrmchlsmth

+# Any change to the VllmConfig changes can have a large user-facing impact,
+# so spam a lot of people
+/vllm/config.py @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor
+
 # vLLM V1
 /vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
 /vllm/v1/structured_output @mgoin @russellb @aarnphm
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -53,6 +53,11 @@ repos:
      files: ^requirements/test\.(in|txt)$
 - repo: local
  hooks:
+  - id: format-torch-nightly-test
+    name: reformat nightly_torch_test.txt to be in sync with test.in
+    language: python
+    entry: python tools/generate_nightly_torch_test.py
+    files: ^requirements/test\.(in|txt)$
  - id: mypy-local
    name: Run mypy for local Python installation
    entry: tools/mypy.sh 0 "local"
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -513,6 +513,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
      CUDA_ARCHS "${FP4_ARCHS}")
    list(APPEND VLLM_EXT_SRC "${SRCS}")
    list(APPEND VLLM_GPU_FLAGS "-DENABLE_NVFP4=1")
+    list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM100=1")
    message(STATUS "Building NVFP4 for archs: ${FP4_ARCHS}")
  else()
    message(STATUS "Not building NVFP4 as no compatible archs were found.")
@ -547,8 +548,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  # if it's possible to compile MoE kernels that use its output.
  cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a" "${CUDA_ARCHS}")
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
-    set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu"
-             "csrc/quantization/cutlass_w8a8/moe/moe_data.cu")
+    set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu")
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
      CUDA_ARCHS "${SCALED_MM_ARCHS}")
@ -566,6 +566,16 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    endif()
  endif()

+  # moe_data.cu is used by all CUTLASS MoE kernels.
+  cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0a" "${CUDA_ARCHS}")
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND CUTLASS_MOE_DATA_ARCHS)
+    set(SRCS "csrc/quantization/cutlass_w8a8/moe/moe_data.cu")
+    set_gencode_flags_for_srcs(
+      SRCS "${SRCS}"
+      CUDA_ARCHS "${CUTLASS_MOE_DATA_ARCHS}")
+    list(APPEND VLLM_EXT_SRC "${SRCS}")
+  endif() 
+
  #
  # Machete kernels

@ -638,6 +648,14 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 # if CUDA endif
 endif()

+if (VLLM_GPU_LANG STREQUAL "HIP")
+  # Add QuickReduce kernels
+  list(APPEND VLLM_EXT_SRC
+    "csrc/custom_quickreduce.cu"
+  )
+# if ROCM endif
+endif()
+
 message(STATUS "Enabling C extension.")
 define_gpu_extension_target(
  _C
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@ -4,7 +4,7 @@ This README guides you through running benchmark tests with the extensive
 datasets supported on vLLM. It’s a living document, updated as new features and datasets
 become available.

-## Dataset Overview
+**Dataset Overview**

 <table style="width:100%; border-collapse: collapse;">
  <thead>
@ -82,7 +82,10 @@ become available.
 **Note**: HuggingFace dataset's `dataset-name` should be set to `hf`

 ---
-## Example - Online Benchmark
+<details>
+<summary><b>🚀 Example - Online Benchmark</b></summary>
+
+<br/>

 First start serving your model

@ -130,7 +133,8 @@ P99 ITL (ms):                            8.39
 ==================================================
 ```

-### Custom Dataset
+**Custom Dataset**
+
 If the dataset you want to benchmark is not supported yet in vLLM, even then you can benchmark on it using `CustomDataset`. Your data needs to be in `.jsonl` format and needs to have "prompt" field per entry, e.g., data.jsonl

 ```
@ -162,7 +166,7 @@ python3 benchmarks/benchmark_serving.py --port 9001 --save-result --save-detaile

 You can skip applying chat template if your data already has it by using `--custom-skip-chat-template`.

-### VisionArena Benchmark for Vision Language Models
+**VisionArena Benchmark for Vision Language Models**

 ```bash
 # need a model with vision capability here
@ -180,7 +184,7 @@ python3 vllm/benchmarks/benchmark_serving.py \
  --num-prompts 1000
 ```

-### InstructCoder Benchmark with Speculative Decoding
+**InstructCoder Benchmark with Speculative Decoding**

 ``` bash
 VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
@ -197,7 +201,7 @@ python3 benchmarks/benchmark_serving.py \
    --num-prompts 2048
 ```

-### Other HuggingFaceDataset Examples
+**Other HuggingFaceDataset Examples**

 ```bash
 vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
@ -251,7 +255,7 @@ python3 vllm/benchmarks/benchmark_serving.py \
    --num-prompts 80
 ```

-### Running With Sampling Parameters
+**Running With Sampling Parameters**

 When using OpenAI-compatible backends such as `vllm`, optional sampling
 parameters can be specified. Example client command:
@ -269,8 +273,27 @@ python3 vllm/benchmarks/benchmark_serving.py \
  --num-prompts 10
 ```

---
-## Example - Offline Throughput Benchmark
+**Running With Ramp-Up Request Rate**
+
+The benchmark tool also supports ramping up the request rate over the
+duration of the benchmark run. This can be useful for stress testing the
+server or finding the maximum throughput that it can handle, given some latency budget.
+
+Two ramp-up strategies are supported:
+- `linear`: Increases the request rate linearly from a start value to an end value.
+- `exponential`: Increases the request rate exponentially.
+
+The following arguments can be used to control the ramp-up:
+- `--ramp-up-strategy`: The ramp-up strategy to use (`linear` or `exponential`).
+- `--ramp-up-start-rps`: The request rate at the beginning of the benchmark.
+- `--ramp-up-end-rps`: The request rate at the end of the benchmark.
+
+</details>
+
+<details>
+<summary><b>📈 Example - Offline Throughput Benchmark</b></summary>
+
+<br/>

 ```bash
 python3 vllm/benchmarks/benchmark_throughput.py \
@ -288,7 +311,7 @@ Total num prompt tokens:  5014
 Total num output tokens:  1500
 ```

-### VisionArena Benchmark for Vision Language Models
+**VisionArena Benchmark for Vision Language Models**

 ``` bash
 python3 vllm/benchmarks/benchmark_throughput.py \
@ -308,7 +331,7 @@ Total num prompt tokens:  14527
 Total num output tokens:  1280
 ```

-### InstructCoder Benchmark with Speculative Decoding
+**InstructCoder Benchmark with Speculative Decoding**

 ``` bash
 VLLM_WORKER_MULTIPROC_METHOD=spawn \
@ -332,7 +355,7 @@ Total num prompt tokens:  261136
 Total num output tokens:  204800
 ```

-### Other HuggingFaceDataset Examples
+**Other HuggingFaceDataset Examples**

 **`lmms-lab/LLaVA-OneVision-Data`**

@ -371,7 +394,7 @@ python3 benchmarks/benchmark_throughput.py \
  --num-prompts 10
 ```

-### Benchmark with LoRA Adapters
+**Benchmark with LoRA Adapters**

 ``` bash
 # download dataset
@ -388,18 +411,22 @@ python3 vllm/benchmarks/benchmark_throughput.py \
  --lora-path yard1/llama-2-7b-sql-lora-test
  ```

---
-## Example - Structured Output Benchmark
+</details>
+
+<details>
+<summary><b>🛠️ Example - Structured Output Benchmark</b></summary>
+
+<br/>

 Benchmark the performance of structured output generation (JSON, grammar, regex).

-### Server Setup
+**Server Setup**

 ```bash
 vllm serve NousResearch/Hermes-3-Llama-3.1-8B --disable-log-requests
 ```

-### JSON Schema Benchmark
+**JSON Schema Benchmark**

 ```bash
 python3 benchmarks/benchmark_serving_structured_output.py \
@ -411,7 +438,7 @@ python3 benchmarks/benchmark_serving_structured_output.py \
  --num-prompts 1000
 ```

-### Grammar-based Generation Benchmark
+**Grammar-based Generation Benchmark**

 ```bash
 python3 benchmarks/benchmark_serving_structured_output.py \
@ -423,7 +450,7 @@ python3 benchmarks/benchmark_serving_structured_output.py \
  --num-prompts 1000
 ```

-### Regex-based Generation Benchmark
+**Regex-based Generation Benchmark**

 ```bash
 python3 benchmarks/benchmark_serving_structured_output.py \
@ -434,7 +461,7 @@ python3 benchmarks/benchmark_serving_structured_output.py \
  --num-prompts 1000
 ```

-### Choice-based Generation Benchmark
+**Choice-based Generation Benchmark**

 ```bash
 python3 benchmarks/benchmark_serving_structured_output.py \
@ -445,7 +472,7 @@ python3 benchmarks/benchmark_serving_structured_output.py \
  --num-prompts 1000
 ```

-### XGrammar Benchmark Dataset
+**XGrammar Benchmark Dataset**

 ```bash
 python3 benchmarks/benchmark_serving_structured_output.py \
@ -456,12 +483,16 @@ python3 benchmarks/benchmark_serving_structured_output.py \
  --num-prompts 1000
 ```

---
-## Example - Long Document QA Throughput Benchmark
+</details>
+
+<details>
+<summary><b>📚 Example - Long Document QA Benchmark</b></summary>
+
+<br/>

 Benchmark the performance of long document question-answering with prefix caching.

-### Basic Long Document QA Test
+**Basic Long Document QA Test**

 ```bash
 python3 benchmarks/benchmark_long_document_qa_throughput.py \
@ -473,7 +504,7 @@ python3 benchmarks/benchmark_long_document_qa_throughput.py \
  --repeat-count 5
 ```

-### Different Repeat Modes
+**Different Repeat Modes**

 ```bash
 # Random mode (default) - shuffle prompts randomly
@ -504,12 +535,16 @@ python3 benchmarks/benchmark_long_document_qa_throughput.py \
  --repeat-mode interleave
 ```

---
-## Example - Prefix Caching Benchmark
+</details>
+
+<details>
+<summary><b>🗂️ Example - Prefix Caching Benchmark</b></summary>
+
+<br/>

 Benchmark the efficiency of automatic prefix caching.

-### Fixed Prompt with Prefix Caching
+**Fixed Prompt with Prefix Caching**

 ```bash
 python3 benchmarks/benchmark_prefix_caching.py \
@ -520,7 +555,7 @@ python3 benchmarks/benchmark_prefix_caching.py \
  --input-length-range 128:256
 ```

-### ShareGPT Dataset with Prefix Caching
+**ShareGPT Dataset with Prefix Caching**

 ```bash
 # download dataset
@ -535,12 +570,16 @@ python3 benchmarks/benchmark_prefix_caching.py \
  --input-length-range 128:256
 ```

---
-## Example - Request Prioritization Benchmark
+</details>
+
+<details>
+<summary><b>⚡ Example - Request Prioritization Benchmark</b></summary>
+
+<br/>

 Benchmark the performance of request prioritization in vLLM.

-### Basic Prioritization Test
+**Basic Prioritization Test**

 ```bash
 python3 benchmarks/benchmark_prioritization.py \
@ -551,7 +590,7 @@ python3 benchmarks/benchmark_prioritization.py \
  --scheduling-policy priority
 ```

-### Multiple Sequences per Prompt
+**Multiple Sequences per Prompt**

 ```bash
 python3 benchmarks/benchmark_prioritization.py \
@ -562,3 +601,5 @@ python3 benchmarks/benchmark_prioritization.py \
  --scheduling-policy priority \
  --n 2
 ```
+
+</details>
--- a/benchmarks/benchmark_dataset.py
+++ b/benchmarks/benchmark_dataset.py
@ -349,8 +349,9 @@ class RandomDataset(BenchmarkDataset):
            # [1650, 939, 486] -> ['Ġcall', 'sh', 'ere']
            # To avoid uncontrolled change of the prompt length,
            # the encoded sequence is truncated before being decode again.
+            total_input_len = prefix_len + int(input_lens[i])
            re_encoded_sequence = tokenizer.encode(prompt, add_special_tokens=False)[
-                : input_lens[i]
+                :total_input_len
            ]
            prompt = tokenizer.decode(re_encoded_sequence)
            total_input_len = len(re_encoded_sequence)
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@ -33,7 +33,7 @@ import warnings
 from collections.abc import AsyncGenerator, Iterable
 from dataclasses import dataclass
 from datetime import datetime
-from typing import Any, Optional
+from typing import Any, Literal, Optional

 import numpy as np
 from tqdm.asyncio import tqdm
@ -107,14 +107,42 @@ class BenchmarkMetrics:
    percentiles_e2el_ms: list[tuple[float, float]]


+def _get_current_request_rate(
+    ramp_up_strategy: Optional[Literal["linear", "exponential"]],
+    ramp_up_start_rps: Optional[int],
+    ramp_up_end_rps: Optional[int],
+    request_index: int,
+    total_requests: int,
+    request_rate: float,
+) -> float:
+    if (
+        ramp_up_strategy
+        and ramp_up_start_rps is not None
+        and ramp_up_end_rps is not None
+    ):
+        progress = request_index / max(total_requests - 1, 1)
+        if ramp_up_strategy == "linear":
+            increase = (ramp_up_end_rps - ramp_up_start_rps) * progress
+            return ramp_up_start_rps + increase
+        elif ramp_up_strategy == "exponential":
+            ratio = ramp_up_end_rps / ramp_up_start_rps
+            return ramp_up_start_rps * (ratio**progress)
+        else:
+            raise ValueError(f"Unknown ramp-up strategy: {ramp_up_strategy}")
+    return request_rate
+
+
 async def get_request(
    input_requests: list[SampleRequest],
    request_rate: float,
    burstiness: float = 1.0,
-) -> AsyncGenerator[SampleRequest, None]:
+    ramp_up_strategy: Optional[Literal["linear", "exponential"]] = None,
+    ramp_up_start_rps: Optional[int] = None,
+    ramp_up_end_rps: Optional[int] = None,
+) -> AsyncGenerator[tuple[SampleRequest, float], None]:
    """
    Asynchronously generates requests at a specified rate
-    with OPTIONAL burstiness.
+    with OPTIONAL burstiness and OPTIONAL ramp-up strategy.

    Args:
        input_requests:
@ -129,22 +157,44 @@ async def get_request(
            A lower burstiness value (0 < burstiness < 1) results
            in more bursty requests, while a higher burstiness value
            (burstiness > 1) results in a more uniform arrival of requests.
+         ramp_up_strategy (optional):
+            The ramp-up strategy. Can be "linear" or "exponential".
+            If None, uses constant request rate (specified by request_rate).
+        ramp_up_start_rps (optional):
+            The starting request rate for ramp-up.
+        ramp_up_end_rps (optional):
+            The ending request rate for ramp-up.
    """
-    input_requests: Iterable[SampleRequest] = iter(input_requests)
-
-    # Calculate scale parameter theta to maintain the desired request_rate.
    assert burstiness > 0, (
        f"A positive burstiness factor is expected, but given {burstiness}."
    )
-    theta = 1.0 / (request_rate * burstiness)
+    # Convert to list to get length for ramp-up calculations
+    if isinstance(input_requests, Iterable) and not isinstance(input_requests, list):
+        input_requests = list(input_requests)
+
+    total_requests = len(input_requests)
+    request_index = 0

    for request in input_requests:
-        yield request
+        current_request_rate = _get_current_request_rate(
+            ramp_up_strategy,
+            ramp_up_start_rps,
+            ramp_up_end_rps,
+            request_index,
+            total_requests,
+            request_rate,
+        )

-        if request_rate == float("inf"):
+        yield request, current_request_rate
+
+        request_index += 1
+
+        if current_request_rate == float("inf"):
            # If the request rate is infinity, then we don't need to wait.
            continue

+        theta = 1.0 / (current_request_rate * burstiness)
+
        # Sample the request interval from the gamma distribution.
        # If burstiness is 1, it follows exponential distribution.
        interval = np.random.gamma(shape=burstiness, scale=theta)
@ -290,6 +340,9 @@ async def benchmark(
    max_concurrency: Optional[int],
    lora_modules: Optional[Iterable[str]],
    extra_body: Optional[dict],
+    ramp_up_strategy: Optional[Literal["linear", "exponential"]] = None,
+    ramp_up_start_rps: Optional[int] = None,
+    ramp_up_end_rps: Optional[int] = None,
 ):
    if backend in ASYNC_REQUEST_FUNCS:
        request_func = ASYNC_REQUEST_FUNCS[backend]
@ -353,7 +406,15 @@ async def benchmark(

    distribution = "Poisson process" if burstiness == 1.0 else "Gamma distribution"

-    print(f"Traffic request rate: {request_rate}")
+    if ramp_up_strategy is not None:
+        print(
+            f"Traffic ramp-up strategy: {ramp_up_strategy}. Will increase "
+            f"RPS from {ramp_up_start_rps} to {ramp_up_end_rps} RPS over "
+            "the duration of the benchmark."
+        )
+    else:
+        print(f"Traffic request rate: {request_rate} RPS.")
+
    print(f"Burstiness factor: {burstiness} ({distribution})")
    print(f"Maximum request concurrency: {max_concurrency}")

@ -373,7 +434,34 @@ async def benchmark(

    benchmark_start_time = time.perf_counter()
    tasks: list[asyncio.Task] = []
-    async for request in get_request(input_requests, request_rate, burstiness):
+
+    rps_change_events = []
+    last_int_rps = -1
+    if ramp_up_strategy is not None and ramp_up_start_rps is not None:
+        last_int_rps = ramp_up_start_rps
+        rps_change_events.append(
+            {
+                "rps": last_int_rps,
+                "timestamp": datetime.now().isoformat(),
+            }
+        )
+
+    async for request, current_request_rate in get_request(
+        input_requests,
+        request_rate,
+        burstiness,
+        ramp_up_strategy,
+        ramp_up_start_rps,
+        ramp_up_end_rps,
+    ):
+        if ramp_up_strategy is not None:
+            current_int_rps = int(current_request_rate)
+            if current_int_rps > last_int_rps:
+                timestamp = datetime.now().isoformat()
+                for rps_val in range(last_int_rps + 1, current_int_rps + 1):
+                    rps_change_events.append({"rps": rps_val, "timestamp": timestamp})
+                last_int_rps = current_int_rps
+
        prompt, prompt_len, output_len, mm_content = (
            request.prompt,
            request.prompt_len,
@ -397,11 +485,8 @@ async def benchmark(
            ignore_eos=ignore_eos,
            extra_body=extra_body,
        )
-        tasks.append(
-            asyncio.create_task(
-                limited_request_func(request_func_input=request_func_input, pbar=pbar)
-            )
-        )
+        task = limited_request_func(request_func_input=request_func_input, pbar=pbar)
+        tasks.append(asyncio.create_task(task))
    outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks)

    if profile:
@ -477,6 +562,9 @@ async def benchmark(
        "errors": [output.error for output in outputs],
    }

+    if rps_change_events:
+        result["rps_change_events"] = rps_change_events
+
    def process_one_metric(
        # E.g., "ttft"
        metric_attribute_name: str,
@ -610,6 +698,26 @@ def main(args: argparse.Namespace):
    tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model
    tokenizer_mode = args.tokenizer_mode

+    # Validate ramp-up arguments
+    if args.ramp_up_strategy is not None:
+        if args.request_rate != float("inf"):
+            raise ValueError(
+                "When using ramp-up, do not specify --request-rate. "
+                "The request rate will be controlled by ramp-up parameters. "
+                "Please remove the --request-rate argument."
+            )
+        if args.ramp_up_start_rps is None or args.ramp_up_end_rps is None:
+            raise ValueError(
+                "When using --ramp-up-strategy, both --ramp-up-start-rps and "
+                "--ramp-up-end-rps must be specified"
+            )
+        if args.ramp_up_start_rps < 0 or args.ramp_up_end_rps < 0:
+            raise ValueError("Ramp-up start and end RPS must be non-negative")
+        if args.ramp_up_start_rps > args.ramp_up_end_rps:
+            raise ValueError("Ramp-up start RPS must be less than end RPS")
+        if args.ramp_up_strategy == "exponential" and args.ramp_up_start_rps == 0:
+            raise ValueError("For exponential ramp-up, the start RPS cannot be 0.")
+
    if args.base_url is not None:
        api_url = f"{args.base_url}{args.endpoint}"
        base_url = f"{args.base_url}"
@ -802,6 +910,9 @@ def main(args: argparse.Namespace):
            max_concurrency=args.max_concurrency,
            lora_modules=args.lora_modules,
            extra_body=sampling_params,
+            ramp_up_strategy=args.ramp_up_strategy,
+            ramp_up_start_rps=args.ramp_up_start_rps,
+            ramp_up_end_rps=args.ramp_up_end_rps,
        )
    )

@ -834,6 +945,11 @@ def main(args: argparse.Namespace):
        result_json["burstiness"] = args.burstiness
        result_json["max_concurrency"] = args.max_concurrency

+        if args.ramp_up_strategy is not None:
+            result_json["ramp_up_strategy"] = args.ramp_up_strategy
+            result_json["ramp_up_start_rps"] = args.ramp_up_start_rps
+            result_json["ramp_up_end_rps"] = args.ramp_up_end_rps
+
        # Merge with benchmark result
        result_json = {**result_json, **benchmark_result}

@ -859,7 +975,10 @@ def main(args: argparse.Namespace):
            if args.max_concurrency is not None
            else ""
        )
-        file_name = f"{backend}-{args.request_rate}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json"  # noqa
+        if args.ramp_up_strategy is not None:
+            file_name = f"{backend}-ramp-up-{args.ramp_up_strategy}-{args.ramp_up_start_rps}qps-{args.ramp_up_end_rps}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json"  # noqa
+        else:
+            file_name = f"{backend}-{args.request_rate}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json"  # noqa
        if args.result_filename:
            file_name = args.result_filename
        if args.result_dir:
@ -1225,6 +1344,31 @@ def create_argument_parser():
        "script chooses a LoRA module at random.",
    )

+    parser.add_argument(
+        "--ramp-up-strategy",
+        type=str,
+        default=None,
+        choices=["linear", "exponential"],
+        help="The ramp-up strategy. This would be used to "
+        "ramp up the request rate from initial RPS to final "
+        "RPS rate (specified by --ramp-up-start-rps and --ramp-up-end-rps). "
+        "over the duration of the benchmark.",
+    )
+    parser.add_argument(
+        "--ramp-up-start-rps",
+        type=int,
+        default=None,
+        help="The starting request rate for ramp-up (RPS). "
+        "Needs to be specified when --ramp-up-strategy is used.",
+    )
+    parser.add_argument(
+        "--ramp-up-end-rps",
+        type=int,
+        default=None,
+        help="The ending request rate for ramp-up (RPS). "
+        "Needs to be specified when --ramp-up-strategy is used.",
+    )
+
    return parser


--- a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
+++ b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
@ -19,7 +19,7 @@ from vllm import _custom_ops as ops
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
    w8a8_block_fp8_matmul,
 )
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils import FlexibleArgumentParser, cdiv

 DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
 DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
@ -117,14 +117,9 @@ def bench_fp8(
    scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
    scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)

-    def ceil_div(x: int, y: int) -> int:
-        return (x + y - 1) // y
-
-    block_scale_a = torch.rand(
-        (m, ceil_div(k, 128)), device="cuda", dtype=torch.float32
-    )
+    block_scale_a = torch.rand((m, cdiv(k, 128)), device="cuda", dtype=torch.float32)
    block_scale_b = torch.rand(
-        ceil_div(k, 128), ceil_div(n, 128), device="cuda", dtype=torch.float32
+        cdiv(k, 128), cdiv(n, 128), device="cuda", dtype=torch.float32
    )
    block_scale_a_M_major = block_scale_a.t().contiguous().t()
    block_scale_b_K_major = block_scale_b.t().contiguous().t()
--- a/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py
+++ b/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py
@ -85,12 +85,6 @@ def benchmark_shape(m: int,

    # === DeepGEMM Implementation ===
    def deepgemm_gemm():
-        # A quantization is inside the loop as it depends on activations
-        # A_deepgemm, A_scale_deepgemm = per_token_cast_to_fp8(A)
-        # A_deepgemm, A_scale_deepgemm = per_token_group_quant_fp8(
-        #     A, block_size[1])
-        # A_scale_aligned = get_col_major_tma_aligned_tensor(A_scale_deepgemm)
-        # C_deepgemm = torch.empty((m, n), device='cuda', dtype=torch.bfloat16)
        deep_gemm.gemm_fp8_fp8_bf16_nt((A_deepgemm, A_scale_deepgemm),
                                       (B_deepgemm, B_scale_deepgemm),
                                       C_deepgemm)
@ -98,8 +92,6 @@ def benchmark_shape(m: int,

    # === vLLM Triton Implementation ===
    def vllm_triton_gemm():
-        # A quantization is inside the loop as it depends on activations
-        # A_vllm, A_scale_vllm = per_token_group_quant_fp8(A, block_size[1])
        return w8a8_block_fp8_matmul(A_vllm,
                                     B_vllm,
                                     A_scale_vllm,
@ -109,9 +101,6 @@ def benchmark_shape(m: int,

    # === vLLM CUTLASS Implementation ===
    def vllm_cutlass_gemm():
-        # A quantization is inside the loop as it depends on activations
-        # A_vllm_cutlass, A_scale_vllm_cutlass = per_token_group_quant_fp8(
-        #     A, block_size[1], column_major_scales=True)
        return ops.cutlass_scaled_mm(A_vllm_cutlass,
                                     B_vllm.T,
                                     scale_a=A_scale_vllm_cutlass,
--- a/cmake/external_projects/vllm_flash_attn.cmake
+++ b/cmake/external_projects/vllm_flash_attn.cmake
@ -38,7 +38,7 @@ else()
  FetchContent_Declare(
          vllm-flash-attn
          GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
-          GIT_TAG 763ad155a1c826f71ff318f41edb1e4e5e376ddb
+          GIT_TAG 5f3644181c7a15345ce20bfc65af117d3601b524
          GIT_PROGRESS TRUE
          # Don't share the vllm-flash-attn build between build types
          BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
--- a/csrc/attention/mla/cutlass_mla_kernels.cu
+++ b/csrc/attention/mla/cutlass_mla_kernels.cu
@ -207,7 +207,7 @@ void cutlass_mla_decode_sm100a(torch::Tensor const& out,
              "page_table must be a 32-bit integer tensor");

  auto in_dtype = q_nope.dtype();
-  at::cuda::CUDAGuard device_guard{(char)q_nope.get_device()};
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(q_nope));
  const cudaStream_t stream =
      at::cuda::getCurrentCUDAStream(q_nope.get_device());
  if (in_dtype == at::ScalarType::Half) {
--- a/csrc/cpu/torch_bindings.cpp
+++ b/csrc/cpu/torch_bindings.cpp
@ -131,16 +131,19 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {

  // Quantization
 #ifdef __AVX512F__
+  at::Tag stride_tag = at::Tag::needs_fixed_stride_order;
  // Compute int8 quantized tensor for given scaling factor.
  ops.def(
      "static_scaled_int8_quant(Tensor! out, Tensor input, Tensor scale,"
-      "Tensor? azp) -> ()");
+      "Tensor? azp) -> ()",
+      {stride_tag});
  ops.impl("static_scaled_int8_quant", torch::kCPU, &static_scaled_int8_quant);

  // Compute int8 quantized tensor and scaling factor
  ops.def(
      "dynamic_scaled_int8_quant(Tensor! out, Tensor input, Tensor! scale, "
-      "Tensor!? azp) -> ()");
+      "Tensor!? azp) -> ()",
+      {stride_tag});
  ops.impl("dynamic_scaled_int8_quant", torch::kCPU,
           &dynamic_scaled_int8_quant);
  // W8A8 GEMM, supporting symmetric per-tensor or per-row/column
@ -148,7 +151,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  ops.def(
      "cutlass_scaled_mm(Tensor! out, Tensor a,"
      "                  Tensor b, Tensor a_scales,"
-      "                  Tensor b_scales, Tensor? bias) -> ()");
+      "                  Tensor b_scales, Tensor? bias) -> ()",
+      {stride_tag});
  ops.impl("cutlass_scaled_mm", torch::kCPU, &int8_scaled_mm);
  // w8a8 GEMM, supporting asymmetric per-tensor or per-row/column
  // quantization.
@ -156,7 +160,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
      "cutlass_scaled_mm_azp(Tensor! out, Tensor a,"
      "                  Tensor b, Tensor a_scales,"
      "                  Tensor b_scales, Tensor azp_adj,"
-      "                  Tensor? azp, Tensor? bias) -> ()");
+      "                  Tensor? azp, Tensor? bias) -> ()",
+      {stride_tag});
  ops.impl("cutlass_scaled_mm_azp", torch::kCPU, &int8_scaled_mm_azp);
 #elif defined(__powerpc64__)
  // Compute int8 quantized tensor for given scaling factor.
--- a/csrc/custom_quickreduce.cu
+++ b/csrc/custom_quickreduce.cu
@ -0,0 +1,114 @@
+#include <ATen/cuda/Exceptions.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <c10/cuda/CUDAStream.h>
+#include <torch/all.h>
+
+#ifdef USE_ROCM
+
+  #include "quickreduce/quick_reduce.h"
+
+quickreduce::fptr_t init_custom_qr(int64_t rank, int64_t world_size,
+                                   std::optional<int64_t> qr_max_size) {
+  if (world_size > 8)
+    throw std::invalid_argument("world size > 8 is not supported");
+  if (world_size == 6)
+    throw std::invalid_argument("world size == 6 is not supported");
+  if (world_size % 2 != 0)
+    throw std::invalid_argument("Odd num gpus is not supported for now");
+  if (rank < 0 || rank >= world_size)
+    throw std::invalid_argument("invalid rank passed in");
+  quickreduce::DeviceComms* fptr = new quickreduce::DeviceComms();
+  fptr->init(world_size, rank, qr_max_size);
+  return (quickreduce::fptr_t)fptr;
+}
+
+void qr_destroy(quickreduce::fptr_t _fa) {
+  if (_fa) {
+    auto fa = reinterpret_cast<quickreduce::DeviceComms*>(_fa);
+    fa->destroy();
+    delete fa;
+  }
+}
+
+torch::Tensor qr_get_handle(quickreduce::fptr_t _fa) {
+  auto fa = reinterpret_cast<quickreduce::DeviceComms*>(_fa);
+  hipIpcMemHandle_t handle = fa->get_handle();
+  auto options =
+      torch::TensorOptions().dtype(torch::kUInt8).device(torch::kCPU);
+  auto data_handle =
+      torch::empty({static_cast<int64_t>(sizeof(hipIpcMemHandle_t))}, options);
+  std::memcpy(data_handle.data_ptr(), &handle, sizeof(hipIpcMemHandle_t));
+  return data_handle;
+}
+
+void qr_open_handles(quickreduce::fptr_t _fa,
+                     const std::vector<torch::Tensor>& handles) {
+  auto fa = reinterpret_cast<quickreduce::DeviceComms*>(_fa);
+  std::vector<hipIpcMemHandle_t> ipc_handles;
+  ipc_handles.reserve(handles.size());
+  for (auto& handle : handles) {
+    // Ensure the tensor is on the same device as the current device.
+    hipIpcMemHandle_t ipc_handle;
+    std::memcpy(&ipc_handle, handle.data_ptr(), sizeof(hipIpcMemHandle_t));
+    ipc_handles.push_back(ipc_handle);
+  }
+  fa->open_ipc_handles(ipc_handles);
+}
+
+void qr_all_reduce(quickreduce::fptr_t _fa, torch::Tensor& inp,
+                   torch::Tensor& out, int64_t quant_level, bool cast_bf2half) {
+  auto fa = reinterpret_cast<quickreduce::DeviceComms*>(_fa);
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(inp));
+  auto stream = at::cuda::getCurrentHIPStreamMasqueradingAsCUDA();
+
+  TORCH_CHECK_EQ(inp.scalar_type(), out.scalar_type());
+  TORCH_CHECK_EQ(inp.numel(), out.numel());
+  TORCH_CHECK_LE(out.numel(), fa->kMaxProblemSize);
+  if (out.scalar_type() == at::ScalarType::Half) {
+    fa->allreduce<half, false>(reinterpret_cast<half*>(inp.data_ptr()),
+                               reinterpret_cast<half*>(out.data_ptr()),
+                               out.numel(), quant_level, stream);
+  } else if (out.scalar_type() == at::ScalarType::BFloat16) {
+    if (cast_bf2half) {
+      fa->allreduce<half, true>(reinterpret_cast<half*>(inp.data_ptr()),
+                                reinterpret_cast<half*>(out.data_ptr()),
+                                out.numel(), quant_level, stream);
+    } else {
+      fa->allreduce<quickreduce::nv_bfloat16, false>(
+          reinterpret_cast<quickreduce::nv_bfloat16*>(inp.data_ptr()),
+          reinterpret_cast<quickreduce::nv_bfloat16*>(out.data_ptr()),
+          out.numel(), quant_level, stream);
+    }
+  } else {
+    throw std::runtime_error(
+        "quick allreduce only supports float16 and bfloat16");
+  }
+}
+
+int64_t qr_max_size() {
+  // The default is 2GB (2,147,483,648 bytes)
+  return static_cast<int64_t>(std::numeric_limits<int32_t>::max()) + 1;
+}
+
+  #define INSTANTIATE_FOR_WORLDSIZE(T, Codec, cast_bf2half)       \
+    template struct quickreduce::AllReduceTwoshot<T, Codec<T, 2>, \
+                                                  cast_bf2half>;  \
+    template struct quickreduce::AllReduceTwoshot<T, Codec<T, 4>, \
+                                                  cast_bf2half>;  \
+    template struct quickreduce::AllReduceTwoshot<T, Codec<T, 8>, cast_bf2half>;
+
+INSTANTIATE_FOR_WORLDSIZE(quickreduce::nv_bfloat16, quickreduce::CodecFP, false)
+INSTANTIATE_FOR_WORLDSIZE(quickreduce::nv_bfloat16, quickreduce::CodecQ4, false)
+INSTANTIATE_FOR_WORLDSIZE(quickreduce::nv_bfloat16, quickreduce::CodecQ6, false)
+INSTANTIATE_FOR_WORLDSIZE(quickreduce::nv_bfloat16, quickreduce::CodecQ8, false)
+INSTANTIATE_FOR_WORLDSIZE(quickreduce::nv_bfloat16, quickreduce::CodecFP, true)
+INSTANTIATE_FOR_WORLDSIZE(quickreduce::nv_bfloat16, quickreduce::CodecQ4, true)
+INSTANTIATE_FOR_WORLDSIZE(quickreduce::nv_bfloat16, quickreduce::CodecQ6, true)
+INSTANTIATE_FOR_WORLDSIZE(quickreduce::nv_bfloat16, quickreduce::CodecQ8, true)
+
+INSTANTIATE_FOR_WORLDSIZE(half, quickreduce::CodecFP, false)
+INSTANTIATE_FOR_WORLDSIZE(half, quickreduce::CodecQ4, false)
+INSTANTIATE_FOR_WORLDSIZE(half, quickreduce::CodecQ6, false)
+INSTANTIATE_FOR_WORLDSIZE(half, quickreduce::CodecQ8, false)
+
+#endif  // USE_ROCM
--- a/csrc/mamba/causal_conv1d/causal_conv1d.cu
+++ b/csrc/mamba/causal_conv1d/causal_conv1d.cu
@ -185,9 +185,7 @@ void causal_conv1d_fwd(const at::Tensor &x, const at::Tensor &weight,
        params.conv_states_ptr = nullptr;
    }

-    // Otherwise the kernel will be launched from cuda:0 device
-    // Cast to char to avoid compiler warning about narrowing
-    at::cuda::CUDAGuard device_guard{(char)x.get_device()};
+    const at::cuda::OptionalCUDAGuard device_guard(device_of(x));
    auto stream = at::cuda::getCurrentCUDAStream().stream();
    DISPATCH_WTYPE_ITYPE_FLOAT_AND_HALF_AND_BF16(x.scalar_type(), "causal_conv1d_fwd", [&] {
            causal_conv1d_fwd_cuda<input_t, weight_t>(params, stream);
@ -278,9 +276,7 @@ void causal_conv1d_update(const at::Tensor &x,
        params.conv_state_indices_ptr = nullptr;
    }

-    // Otherwise the kernel will be launched from cuda:0 device
-    // Cast to char to avoid compiler warning about narrowing
-    at::cuda::CUDAGuard device_guard{(char)x.get_device()};
+    const at::cuda::OptionalCUDAGuard device_guard(device_of(x));
    auto stream = at::cuda::getCurrentCUDAStream().stream();
    DISPATCH_WTYPE_ITYPE_FLOAT_AND_HALF_AND_BF16(x.scalar_type(), "causal_conv1d_update", [&] {
            causal_conv1d_update_cuda<input_t, weight_t>(params, stream);
--- a/csrc/mamba/mamba_ssm/selective_scan_fwd.cu
+++ b/csrc/mamba/mamba_ssm/selective_scan_fwd.cu
@ -647,9 +647,7 @@ void selective_scan_fwd(const torch::Tensor &u, const torch::Tensor &delta,
                       );

    
-    // Otherwise the kernel will be launched from cuda:0 device
-    // Cast to char to avoid compiler warning about narrowing
-    at::cuda::CUDAGuard device_guard{(char)u.get_device()};
+    const at::cuda::OptionalCUDAGuard device_guard(device_of(u));
    auto stream = at::cuda::getCurrentCUDAStream().stream();
    DISPATCH_WTYPE_ITYPE_FLOAT_AND_HALF_AND_BF16(u.scalar_type(), "selective_scan_fwd", [&] {
        selective_scan_fwd_cuda<input_t, weight_t>(params, stream);
--- a/csrc/ops.h
+++ b/csrc/ops.h
@ -360,3 +360,14 @@ std::tuple<int64_t, torch::Tensor> allocate_shared_buffer_and_handle(
    int64_t size);
 int64_t open_mem_handle(torch::Tensor& mem_handle);
 void free_shared_buffer(int64_t buffer);
+
+#ifdef USE_ROCM
+fptr_t init_custom_qr(int64_t rank, int64_t world_size,
+                      std::optional<int64_t> qr_max_size = std::nullopt);
+void qr_destroy(fptr_t _fa);
+torch::Tensor qr_get_handle(fptr_t _fa);
+void qr_open_handles(fptr_t _fa, const std::vector<torch::Tensor>& handles);
+void qr_all_reduce(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out,
+                   int64_t quant_level, bool cast_bf2half = false);
+int64_t qr_max_size();
+#endif
--- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8_dispatch.cuh
+++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8_dispatch.cuh
@ -29,26 +29,12 @@ struct sm100_fp8_config_default {
 template <typename InType, typename OutType,
          template <typename, typename, typename> typename Epilogue>
 struct sm100_fp8_config_M256 {
-  // M in (128, 256]
+  // M in (64, 256]
  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
  using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
  using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
  using TileShape = Shape<_128, _128, _128>;
-  using ClusterShape = Shape<_2, _2, _1>;
-  using Cutlass3xGemm =
-      cutlass_3x_gemm_sm100<InType, OutType, Epilogue, TileShape, ClusterShape,
-                            KernelSchedule, EpilogueSchedule>;
-};
-
-template <typename InType, typename OutType,
-          template <typename, typename, typename> typename Epilogue>
-struct sm100_fp8_config_M128 {
-  // M in (64, 128]
-  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
-  using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
-  using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
-  using TileShape = Shape<_128, _128, _256>;
-  using ClusterShape = Shape<_2, _4, _1>;
+  using ClusterShape = Shape<_2, _1, _1>;
  using Cutlass3xGemm =
      cutlass_3x_gemm_sm100<InType, OutType, Epilogue, TileShape, ClusterShape,
                            KernelSchedule, EpilogueSchedule>;
@ -57,12 +43,26 @@ struct sm100_fp8_config_M128 {
 template <typename InType, typename OutType,
          template <typename, typename, typename> typename Epilogue>
 struct sm100_fp8_config_M64 {
-  // M in [1, 64]
+  // M in (16, 64]
  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
  using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
  using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
-  using TileShape = Shape<_64, _64, _256>;
-  using ClusterShape = Shape<_1, _8, _1>;
+  using TileShape = Shape<_64, _64, _128>;
+  using ClusterShape = Shape<_1, _1, _1>;
+  using Cutlass3xGemm =
+      cutlass_3x_gemm_sm100<InType, OutType, Epilogue, TileShape, ClusterShape,
+                            KernelSchedule, EpilogueSchedule>;
+};
+
+template <typename InType, typename OutType,
+          template <typename, typename, typename> typename Epilogue>
+struct sm100_fp8_config_M16 {
+  // M in [1, 16]
+  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
+  using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
+  using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
+  using TileShape = Shape<_64, _64, _128>;
+  using ClusterShape = Shape<_1, _4, _1>;
  using Cutlass3xGemm =
      cutlass_3x_gemm_sm100<InType, OutType, Epilogue, TileShape, ClusterShape,
                            KernelSchedule, EpilogueSchedule>;
@ -82,27 +82,27 @@ inline void cutlass_gemm_sm100_fp8_dispatch(torch::Tensor& out,
  using Cutlass3xGemmDefault =
      typename sm100_fp8_config_default<InType, OutType,
                                        Epilogue>::Cutlass3xGemm;
+  using Cutlass3xGemmM16 =
+      typename sm100_fp8_config_M16<InType, OutType, Epilogue>::Cutlass3xGemm;
  using Cutlass3xGemmM64 =
      typename sm100_fp8_config_M64<InType, OutType, Epilogue>::Cutlass3xGemm;
-  using Cutlass3xGemmM128 =
-      typename sm100_fp8_config_M128<InType, OutType, Epilogue>::Cutlass3xGemm;
  using Cutlass3xGemmM256 =
      typename sm100_fp8_config_M256<InType, OutType, Epilogue>::Cutlass3xGemm;

  uint32_t const m = a.size(0);
  uint32_t const mp2 =
-      std::max(static_cast<uint32_t>(64), next_pow_2(m));  // next power of 2
+      std::max(static_cast<uint32_t>(16), next_pow_2(m));  // next power of 2

-  if (mp2 <= 64) {
-    // m in [1, 64]
+  if (mp2 <= 16) {
+    // m in [1, 16]
+    return cutlass_gemm_caller<Cutlass3xGemmM16>(
+        out, a, b, std::forward<EpilogueArgs>(args)...);
+  } else if (mp2 <= 64) {
+    // m in (16, 64]
    return cutlass_gemm_caller<Cutlass3xGemmM64>(
        out, a, b, std::forward<EpilogueArgs>(args)...);
-  } else if (mp2 <= 128) {
-    // m in (64, 128]
-    return cutlass_gemm_caller<Cutlass3xGemmM128>(
-        out, a, b, std::forward<EpilogueArgs>(args)...);
  } else if (mp2 <= 256) {
-    // m in (128, 256]
+    // m in (64, 256]
    return cutlass_gemm_caller<Cutlass3xGemmM256>(
        out, a, b, std::forward<EpilogueArgs>(args)...);
  } else {
--- a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu
+++ b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu
@ -241,7 +241,7 @@ void get_cutlass_moe_mm_data(
  // mm to run it for.
  int32_t version_num = get_sm_version_num();
 #if (defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90) || \
-    (defined ENABLE_SCALED_MM_SM100 && ENABLE_SCALED_MM_SM90)
+    (defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100)
  get_cutlass_moe_mm_data_caller(topk_ids, expert_offsets, problem_sizes1,
                                 problem_sizes2, input_permutation,
                                 output_permutation, num_experts, n, k,
@ -252,7 +252,7 @@ void get_cutlass_moe_mm_data(
      false,
      "No compiled get_cutlass_moe_mm_data: no cutlass_scaled_mm kernel for "
      "CUDA device capability: ",
-      version_num, ". Required capability: 90");
+      version_num, ". Required capability: 90 or 100");
 }

 void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets,
@ -265,7 +265,8 @@ void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets,
  // This function currently gets compiled only if we have a valid cutlass moe
  // mm to run it for.
  int32_t version_num = get_sm_version_num();
-#if defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90
+#if (defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90) || \
+    (defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100)
  get_cutlass_pplx_moe_mm_data_caller(expert_offsets, problem_sizes1,
                                      problem_sizes2, expert_num_tokens,
                                      num_local_experts, padded_m, n, k);
@ -275,7 +276,7 @@ void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets,
      false,
      "No compiled get_cutlass_pplx_moe_mm_data: no cutlass_scaled_mm kernel "
      "for CUDA device capability: ",
-      version_num, ". Required capability: 90");
+      version_num, ". Required capability: 90 or 100");
 }

 void cutlass_scaled_mm_azp(torch::Tensor& c, torch::Tensor const& a,
--- a/csrc/quantization/fp4/nvfp4_experts_quant.cu
+++ b/csrc/quantization/fp4/nvfp4_experts_quant.cu
@ -561,7 +561,7 @@ void scaled_fp4_experts_quant_sm100a(
  TORCH_CHECK(output_scale.size(1) * 4 == padded_k);

  auto in_dtype = input.dtype();
-  at::cuda::CUDAGuard device_guard{(char)input.get_device()};
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
  const cudaStream_t stream =
      at::cuda::getCurrentCUDAStream(input.get_device());
  if (in_dtype == at::ScalarType::Half) {
@ -579,4 +579,4 @@ void scaled_fp4_experts_quant_sm100a(
  } else {
    TORCH_CHECK(false, "Expected input data type to be half or bfloat16");
  }
-}
+}
--- a/csrc/quantization/fp4/nvfp4_quant_kernels.cu
+++ b/csrc/quantization/fp4/nvfp4_quant_kernels.cu
@ -347,7 +347,7 @@ void scaled_fp4_quant_sm100a(torch::Tensor const& output,
  auto input_sf_ptr = static_cast<float const*>(input_sf.data_ptr());
  auto sf_out = static_cast<int32_t*>(output_sf.data_ptr());
  auto output_ptr = static_cast<int64_t*>(output.data_ptr());
-  at::cuda::CUDAGuard device_guard{(char)input.get_device()};
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
  auto stream = at::cuda::getCurrentCUDAStream(input.get_device());

  // We don't support e8m0 scales at this moment.
--- a/csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu
+++ b/csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu
@ -267,7 +267,7 @@ void cutlass_scaled_fp4_mm_sm100a(torch::Tensor& D, torch::Tensor const& A,
              B_sf.sizes()[1], ")");

  auto out_dtype = D.dtype();
-  at::cuda::CUDAGuard device_guard{(char)A.get_device()};
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(A));
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream(A.get_device());

  if (out_dtype == at::ScalarType::Half) {
--- a/csrc/quickreduce/base.h
+++ b/csrc/quickreduce/base.h
@ -0,0 +1,338 @@
+#pragma once
+
+#include <cstdint>
+#include <hip/hip_runtime.h>
+#include <hip/hip_fp16.h>
+#include <hip/hip_bf16.h>
+
+#define __quickreduce_device_inline__ __device__ __forceinline__
+#define __quickreduce_launch_bounds_two_shot__ __launch_bounds__(256, 4)
+#define __quickreduce_launch_bounds_one_shot__ __launch_bounds__(512, 4)
+
+namespace quickreduce {
+
+typedef __hip_bfloat16 nv_bfloat16;
+typedef __hip_bfloat162 nv_bfloat162;
+
+using int32x2_t = __attribute__((__vector_size__(2 * sizeof(int)))) int;
+using int32x4_t = __attribute__((__vector_size__(4 * sizeof(int)))) int;
+
+// Setup acquire-release semantics for vector memory reads (mubuf instruction)
+// as per architecture.
+#if defined(__gfx942__)
+// CDNA3: Scope bits sc0, sc1
+  #define MUBUF_ACQUIRE 16
+  #define MUBUF_RELEASE 16
+#elif (defined(__gfx908__) || defined(__gfx90a__))
+// CDNA1 and CDNA2 - glc bit
+  #define MUBUF_ACQUIRE 1
+  #define MUBUF_RELEASE 0
+#endif
+
+static constexpr int kNegOne = 0xBC00BC00;  // {-1, -1}, fp16x2_t
+
+// Number of atoms (4xf16x2_t) processed by a single thread
+static constexpr int kAtoms = 8;
+
+// We use a workgroup of 256 threads
+static constexpr int kBlockSize = 256;
+static constexpr int kAtomStride = kBlockSize;
+
+// Size and atom stride of source/destination data that the block will
+// process.
+// Workgroup scope = Tile = (256 threads x 8 atoms x 16B)
+static constexpr int kTileSize = kBlockSize * kAtoms * sizeof(int32x4_t);
+
+// Max number of blocks. 304 CUs on MI300
+static constexpr int kMaxNumBlocks = 304 * 4;
+
+// Standard CDNA wavefront size.
+static constexpr int kWavefront = 64;
+
+// 256 thread, 4 wavefronts.
+static dim3 constexpr kBlockTwoShot = {kWavefront, kBlockSize / kWavefront, 1};
+
+// Number of threads in a group for quantization
+// It corresponds to 32 F16 elements in quantization block
+static constexpr int kThreadGroupSize = 8;
+
+// Methods
+__quickreduce_device_inline__ __host__ unsigned long divceil(unsigned long x,
+                                                             unsigned long y) {
+  return ((x + y - 1) / y);
+}
+
+union BufferResource {
+  __quickreduce_device_inline__ constexpr BufferResource()
+      : config(0x00020000U) {}
+
+  __quickreduce_device_inline__ constexpr BufferResource(void* buffer_address,
+                                                         uint32_t buffer_size)
+      : address(buffer_address), range(buffer_size), config(0x00020000U) {}
+
+  int32x4_t descriptor;
+  struct {
+    void* address;  // 8B, out of which first 48b is address, and 16b is stride
+    // (unused)
+    uint32_t range;   // Byte range for the buffer resource
+    uint32_t config;  // Constant, DFMT=32b
+  };
+};
+
+__quickreduce_device_inline__ static int32x4_t buffer_load_dwordx4(
+    int32x4_t srsrc, int32_t voffset, int32_t soffset,
+    int32_t aux) __asm("llvm.amdgcn.raw.buffer.load.v4i32");
+
+__quickreduce_device_inline__ static void buffer_store_dwordx4(
+    int32x4_t data, int32x4_t srsrc, int32_t voffset, int32_t soffset,
+    int32_t aux) __asm("llvm.amdgcn.raw.buffer.store.v4i32");
+
+__quickreduce_device_inline__ static void set_fp16_ovfl(bool const value) {
+#if defined(__gfx942__)
+  if (value) {
+    asm volatile("s_setreg_imm32_b32 0xdc1, 1;" ::);
+  } else {
+    asm volatile("s_setreg_imm32_b32 0xdc1, 0;" ::);
+  }
+#endif
+}
+union bf162_int_union {
+  int i;
+  nv_bfloat162 bf2;
+};
+
+template <typename T>
+__quickreduce_device_inline__ void packed_assign_add(int32x4_t* A,
+                                                     int32x4_t* B);
+
+template <>
+__quickreduce_device_inline__ void packed_assign_add<half>(int32x4_t* A,
+                                                           int32x4_t* B) {
+  int32x4_t& tR_fragment = A[0];
+  int32x4_t& tA_fragment = B[0];
+
+  asm volatile("v_pk_add_f16 %0, %1, %2"
+               : "=v"(tR_fragment[0])
+               : "v"(tR_fragment[0]), "v"(tA_fragment[0]));
+  asm volatile("v_pk_add_f16 %0, %1, %2"
+               : "=v"(tR_fragment[1])
+               : "v"(tR_fragment[1]), "v"(tA_fragment[1]));
+  asm volatile("v_pk_add_f16 %0, %1, %2"
+               : "=v"(tR_fragment[2])
+               : "v"(tR_fragment[2]), "v"(tA_fragment[2]));
+  asm volatile("v_pk_add_f16 %0, %1, %2"
+               : "=v"(tR_fragment[3])
+               : "v"(tR_fragment[3]), "v"(tA_fragment[3]));
+}
+
+template <>
+__quickreduce_device_inline__ void packed_assign_add<nv_bfloat16>(
+    int32x4_t* A, int32x4_t* B) {
+  nv_bfloat162* tA = reinterpret_cast<nv_bfloat162*>(A);
+  nv_bfloat162* tB = reinterpret_cast<nv_bfloat162*>(B);
+#pragma unroll
+  for (int i = 0; i < 4; i++) {
+    tA[i] = __hadd2(tA[i], tB[i]);
+  }
+}
+
+template <typename T>
+__quickreduce_device_inline__ int packed_max(int a, int b);
+
+template <>
+__quickreduce_device_inline__ int packed_max<half>(int a, int b) {
+  int result;
+  asm volatile("v_pk_max_f16 %0, %1, %2" : "=v"(result) : "v"(a), "v"(b));
+  return result;
+}
+
+template <>
+__quickreduce_device_inline__ int packed_max<nv_bfloat16>(int a, int b) {
+  bf162_int_union A, B, R;
+  A.i = a;
+  B.i = b;
+  R.bf2 = __hmax2(A.bf2, B.bf2);
+  return R.i;
+}
+
+template <typename T>
+__quickreduce_device_inline__ int packed_min(int a, int b);
+
+template <>
+__quickreduce_device_inline__ int packed_min<half>(int a, int b) {
+  int result;
+  asm volatile("v_pk_min_f16 %0, %1, %2" : "=v"(result) : "v"(a), "v"(b));
+  return result;
+}
+
+template <>
+__quickreduce_device_inline__ int packed_min<nv_bfloat16>(int a, int b) {
+  bf162_int_union A, B, R;
+  A.i = a;
+  B.i = b;
+  R.bf2 = __hmin2(A.bf2, B.bf2);
+  return R.i;
+}
+
+template <typename T>
+__quickreduce_device_inline__ int packed_abs_max(int a, int b);
+
+template <>
+__quickreduce_device_inline__ int packed_abs_max<half>(int a, int b) {
+  half2 wmaxh2 = __builtin_bit_cast(half2, a);
+  half2 wminh2 = __builtin_bit_cast(half2, b);
+  half2 wblockmaxh2;
+
+  wblockmaxh2.x =
+      __hgt(__habs(wmaxh2.x), __habs(wminh2.x)) ? wmaxh2.x : wminh2.x;
+  wblockmaxh2.y =
+      __hgt(__habs(wmaxh2.y), __habs(wminh2.y)) ? wmaxh2.y : wminh2.y;
+  return __builtin_bit_cast(int, wblockmaxh2);
+}
+
+template <>
+__quickreduce_device_inline__ int packed_abs_max<nv_bfloat16>(int a, int b) {
+  bf162_int_union A, B, R;
+  A.i = a;
+  B.i = b;
+  R.bf2.x = __hgt(__habs(A.bf2.x), __habs(B.bf2.x)) ? A.bf2.x : B.bf2.x;
+  R.bf2.y = __hgt(__habs(A.bf2.y), __habs(B.bf2.y)) ? A.bf2.y : B.bf2.y;
+  return R.i;
+}
+
+template <typename T>
+__quickreduce_device_inline__ int packed_add(int a, int b);
+
+template <>
+__quickreduce_device_inline__ int packed_add<half>(int a, int b) {
+  int result;
+  asm volatile("v_pk_add_f16 %0, %1, %2" : "=v"(result) : "v"(a), "v"(b));
+  return result;
+}
+
+template <>
+__quickreduce_device_inline__ int packed_add<nv_bfloat16>(int a, int b) {
+  bf162_int_union A, B, R;
+  A.i = a;
+  B.i = b;
+  R.bf2 = __hadd2(A.bf2, B.bf2);
+  return R.i;
+}
+
+template <>
+__quickreduce_device_inline__ int packed_add<int16_t>(int a, int b) {
+  int result;
+  asm volatile("v_pk_add_i16 %0, %1, %2" : "=v"(result) : "v"(a), "v"(b));
+  return result;
+}
+
+template <typename T>
+__quickreduce_device_inline__ int packed_sub(int a, int b);
+
+template <>
+__quickreduce_device_inline__ int packed_sub<half>(int a, int b) {
+  int result;
+
+  // MI300 lacks packed fp16 sub instruction. So we do -1 * min + max
+  asm volatile("v_pk_fma_f16 %0, %1, %2 %3"
+               : "=v"(result)
+               : "v"(kNegOne), "v"(b), "v"(a));
+  return result;
+}
+
+template <>
+__quickreduce_device_inline__ int packed_sub<nv_bfloat16>(int a, int b) {
+  bf162_int_union A, B, R;
+  A.i = a;
+  B.i = b;
+  R.bf2 = __hsub2(A.bf2, B.bf2);
+  return R.i;
+}
+
+template <typename T>
+__quickreduce_device_inline__ int packed_mul(int a, int b);
+
+template <>
+__quickreduce_device_inline__ int packed_mul<half>(int a, int b) {
+  int result;
+  asm volatile("v_pk_mul_f16 %0, %1, %2" : "=v"(result) : "v"(a), "v"(b));
+  return result;
+}
+
+template <>
+__quickreduce_device_inline__ int packed_mul<nv_bfloat16>(int a, int b) {
+  nv_bfloat162* tA = reinterpret_cast<nv_bfloat162*>(&a);
+  nv_bfloat162* tB = reinterpret_cast<nv_bfloat162*>(&b);
+  nv_bfloat162 tR = __hmul2(*tA, *tB);
+  return *(reinterpret_cast<int*>(&tR));
+}
+
+template <typename T>
+__quickreduce_device_inline__ int packed_rcp(int a);
+
+template <>
+__quickreduce_device_inline__ int packed_rcp<half>(int a) {
+  return __builtin_bit_cast(int, h2rcp(__builtin_bit_cast(half2, a)));
+}
+
+template <>
+__quickreduce_device_inline__ int packed_rcp<nv_bfloat16>(int a) {
+  bf162_int_union A, R;
+  A.i = a;
+  R.bf2 = h2rcp(A.bf2);
+  return R.i;
+}
+
+// changes dtype
+__quickreduce_device_inline__ float T2float_cast(half a) {
+  return __half2float(a);
+}
+
+__quickreduce_device_inline__ float T2float_cast(nv_bfloat16 a) {
+  return __bfloat162float(a);
+}
+
+template <typename T>
+__quickreduce_device_inline__ int group_abs_max(int32x4_t atom) {
+  const int group_leader = (threadIdx.x / kThreadGroupSize) * kThreadGroupSize;
+
+  int wmax, wmin, wblockmax;
+  int a, b;
+  a = packed_max<T>(atom[0], atom[1]);
+  b = packed_max<T>(atom[2], atom[3]);
+
+  wmax = packed_max<T>(a, b);
+
+  a = packed_min<T>(atom[0], atom[1]);
+  b = packed_min<T>(atom[2], atom[3]);
+
+  wmin = packed_min<T>(a, b);
+
+  // Reduce the max among a group of threads
+  // Note: This is basically 2 blocks of values setup as the
+  // upper/lower halves of the f16x2_t
+  for (int i = 1; i < kThreadGroupSize; i <<= 1) {
+    int x = __shfl_down(wmax, i);
+    wmax = packed_max<T>(wmax, x);
+
+    int y = __shfl_down(wmin, i);
+    wmin = packed_min<T>(wmin, y);
+  }
+  wblockmax = packed_abs_max<T>(wmax, wmin);
+  // Share with the cohort
+  wblockmax = __shfl(wblockmax, group_leader);
+  return wblockmax;
+}
+
+__quickreduce_device_inline__ void set_sync_flag(uint32_t* flag_ptr,
+                                                 uint32_t flag) {
+  __atomic_store_n(flag_ptr, flag, __ATOMIC_RELEASE);
+}
+
+__quickreduce_device_inline__ void wait_sync_flag(uint32_t* flag_ptr,
+                                                  uint32_t flag) {
+  while (__atomic_load_n(flag_ptr, __ATOMIC_RELAXED) != flag) {
+  }
+}
+
+}  // namespace quickreduce
--- a/csrc/quickreduce/quick_reduce.h
+++ b/csrc/quickreduce/quick_reduce.h
@ -0,0 +1,196 @@
+#pragma once
+
+#include <vector>
+#include <hip/hip_runtime.h>
+#include "quick_reduce_impl.cuh"
+
+#define HIP_CHECK(err)                                                     \
+  do {                                                                     \
+    hipError_t err_ = (err);                                               \
+    if (err_ != hipSuccess) {                                              \
+      std::printf("HIP error %d at %s:%d. %s\n", err_, __FILE__, __LINE__, \
+                  hipGetErrorString(err_));                                \
+      throw std::runtime_error("HIP error");                               \
+    }                                                                      \
+  } while (0)
+
+namespace quickreduce {
+using fptr_t = int64_t;
+static_assert(sizeof(void*) == sizeof(fptr_t));
+
+template <typename AllReduceKernel, typename T>
+__global__ __quickreduce_launch_bounds_two_shot__ static void
+allreduce_prototype_twoshot(T const* A, T* B, uint32_t N, uint32_t num_blocks,
+                            int rank, uint8_t** dbuffer_list,
+                            uint32_t data_offset, uint32_t flag_color) {
+  int block = blockIdx.x;
+  int grid = gridDim.x;
+
+  while (block < num_blocks) {
+    AllReduceKernel::run(A, B, N, block, rank, dbuffer_list, data_offset,
+                         flag_color);
+    block += grid;
+    flag_color++;
+  }
+}
+
+#define TWOSHOT_DISPATCH(__codec)                                           \
+  if (world_size == 2) {                                                    \
+    using LineCodec = __codec<T, 2>;                                        \
+    using AllReduceKernel = AllReduceTwoshot<T, LineCodec, cast_bf2half>;   \
+    hipLaunchKernelGGL((allreduce_prototype_twoshot<AllReduceKernel, T>),   \
+                       dim3(grid), dim3(kBlockTwoShot), 0, stream, A, B, N, \
+                       num_blocks, rank, dbuffer_list, data_offset,         \
+                       flag_color);                                         \
+  } else if (world_size == 4) {                                             \
+    using LineCodec = __codec<T, 4>;                                        \
+    using AllReduceKernel = AllReduceTwoshot<T, LineCodec, cast_bf2half>;   \
+    hipLaunchKernelGGL((allreduce_prototype_twoshot<AllReduceKernel, T>),   \
+                       dim3(grid), dim3(kBlockTwoShot), 0, stream, A, B, N, \
+                       num_blocks, rank, dbuffer_list, data_offset,         \
+                       flag_color);                                         \
+  } else if (world_size == 8) {                                             \
+    using LineCodec = __codec<T, 8>;                                        \
+    using AllReduceKernel = AllReduceTwoshot<T, LineCodec, cast_bf2half>;   \
+    hipLaunchKernelGGL((allreduce_prototype_twoshot<AllReduceKernel, T>),   \
+                       dim3(grid), dim3(kBlockTwoShot), 0, stream, A, B, N, \
+                       num_blocks, rank, dbuffer_list, data_offset,         \
+                       flag_color);                                         \
+  }
+
+enum QuickReduceQuantLevel {
+  F16 = 0,
+  INT8 = 1,
+  INT6 = 2,
+  INT4 = 3,
+};
+
+struct DeviceComms {
+  // Max problem size is 2GB (in bytes) or half of uint32_t max value.
+  int64_t kMaxProblemSize =
+      static_cast<int64_t>(std::numeric_limits<int32_t>::max()) + 1;
+
+  // Max TP-8
+  static int constexpr kMaxWorldSize = 8;
+
+  bool initialized = false;
+  uint32_t flag_color = 1;
+  int world_size;
+  int rank;
+
+  uint8_t* dbuffer;
+  uint8_t** dbuffer_list;
+  hipIpcMemHandle_t buffer_ipc_handle;
+  std::vector<hipIpcMemHandle_t> all_buffer_ipc_handles;
+  std::vector<uint8_t*> buffer_list;
+  uint32_t data_offset;
+
+  DeviceComms() : initialized(false), world_size(1), rank(0) {}
+  ~DeviceComms() { destroy(); }
+
+  void init(int world_size, int rank,
+            std::optional<int64_t> max_problem_size = std::nullopt) {
+    destroy();
+    this->world_size = world_size;
+    this->rank = rank;
+    if (max_problem_size.has_value() && max_problem_size.value() > 0) {
+      this->kMaxProblemSize = max_problem_size.value();
+    }
+    // Allocate buffer size for worst case: F16 2-stage buffer.
+    uint32_t flags_buffer_size =
+        2 * world_size * kMaxNumBlocks * sizeof(uint32_t);
+    static int64_t data_buffer_size = 2 * this->kMaxProblemSize;
+    int64_t total_buffer_size = flags_buffer_size + data_buffer_size;
+    data_offset = flags_buffer_size;
+    HIP_CHECK(hipExtMallocWithFlags((void**)&dbuffer, total_buffer_size,
+                                    hipDeviceMallocUncached));
+
+    // Clear the flags buffer.
+    HIP_CHECK(hipMemset(dbuffer, 0, flags_buffer_size));
+
+    // Device-side list of IPC buffers.
+    buffer_list.resize(world_size);
+    HIP_CHECK(hipMalloc(&dbuffer_list, world_size * sizeof(uint8_t*)));
+
+    // Create IPC handles for rank's communication buffer.
+    all_buffer_ipc_handles.resize(world_size);
+    HIP_CHECK(hipIpcGetMemHandle(&buffer_ipc_handle, dbuffer));
+
+    initialized = true;
+  }
+  int get_world_size() { return world_size; }
+  int get_rank() { return rank; }
+  bool status() { return initialized; }
+  hipIpcMemHandle_t const get_handle() { return buffer_ipc_handle; }
+
+  void destroy() {
+    if (initialized) {
+      for (int i = 0; i < world_size; i++) {
+        if (i != rank) {
+          HIP_CHECK(hipIpcCloseMemHandle(dbuffer_list[i]));
+        }
+      }
+
+      HIP_CHECK(hipFree(dbuffer));
+      HIP_CHECK(hipFree(dbuffer_list));
+
+      initialized = false;
+    }
+  }
+
+  void open_ipc_handles(std::vector<hipIpcMemHandle_t> const& ipc_handles) {
+    assert(ipc_handles.size() == all_buffer_ipc_handles.size());
+    for (int i = 0; i < world_size; i++) {
+      all_buffer_ipc_handles[i] = ipc_handles[i];
+    }
+
+    // Open device memory access to the IPC communication buffers.
+    // Note: For our own rank, we do not need to open a handle.
+    for (int i = 0; i < world_size; i++) {
+      if (i != rank) {
+        HIP_CHECK(hipIpcOpenMemHandle((void**)&buffer_list[i],
+                                      all_buffer_ipc_handles[i],
+                                      hipIpcMemLazyEnablePeerAccess));
+      } else {
+        buffer_list[i] = dbuffer;
+      }
+    }
+
+    HIP_CHECK(hipMemcpy(dbuffer_list, buffer_list.data(),
+                        world_size * sizeof(uint8_t*), hipMemcpyHostToDevice));
+  }
+
+  template <typename T, bool cast_bf2half>
+  void allreduce(T const* A, T* B, uint32_t N, int quant_level,
+                 hipStream_t stream) {
+    if (world_size != 2 && world_size != 4 && world_size != 8) {
+      throw std::runtime_error("All Reduce not supported for world_size = " +
+                               std::to_string(world_size));
+    }
+
+    // Configuration.
+    uint32_t msg_size = N * sizeof(T);
+    uint32_t num_blocks = divceil(msg_size, kTileSize);
+    uint32_t grid = min(kMaxNumBlocks, num_blocks);
+    auto quant_level_ = static_cast<QuickReduceQuantLevel>(quant_level);
+    switch (quant_level_) {
+      case QuickReduceQuantLevel::INT8:
+        TWOSHOT_DISPATCH(CodecQ8)
+        break;
+      case QuickReduceQuantLevel::INT6:
+        TWOSHOT_DISPATCH(CodecQ6)
+        break;
+      case QuickReduceQuantLevel::INT4:
+        TWOSHOT_DISPATCH(CodecQ4)
+        break;
+      default:
+        TWOSHOT_DISPATCH(CodecFP)
+        break;
+    }
+    HIP_CHECK(cudaGetLastError());
+    // Rotate the flag color.
+    flag_color += divceil(N, grid);
+  }
+};
+
+}  // namespace quickreduce
--- a/csrc/quickreduce/quick_reduce_impl.cuh
+++ b/csrc/quickreduce/quick_reduce_impl.cuh
@ -0,0 +1,698 @@
+#pragma once
+
+#include <hip/hip_runtime.h>
+#include "base.h"
+
+namespace quickreduce {
+
+struct CodecBase {
+  const int thread;
+  const int rank;
+  const int group_leader;
+  __quickreduce_device_inline__ CodecBase(int thread, int rank)
+      : thread(thread),
+        rank(rank),
+        group_leader((threadIdx.x / kThreadGroupSize) * kThreadGroupSize) {
+    set_fp16_ovfl(true);
+  }
+};
+
+// Default full precision codec.
+template <typename T, int world_size>
+struct CodecFP : public CodecBase {
+  static constexpr int kWorldSize = world_size;
+  static constexpr int kRankAtoms = kAtoms / kWorldSize;
+
+  // Codec tile size process by this workgroup.
+  // Each thread processes atoms of f16x8_t (16B).
+  static constexpr int kRankTransmittedTileSize =
+      kBlockSize * kRankAtoms * sizeof(int32x4_t);
+  static_assert(kRankTransmittedTileSize % 16 == 0,
+                "kRankTransmittedTileSize must be 16B aligned.");
+
+  // Total tile size for the collective communication.
+  static constexpr int kTransmittedTileSize =
+      kRankTransmittedTileSize * kWorldSize;
+
+  __quickreduce_device_inline__ CodecFP(int thread, int rank)
+      : CodecBase(thread, rank) {}
+
+  __quickreduce_device_inline__ void send(int32x4_t* __restrict__ send_buffer,
+                                          const int32x4_t* __restrict__ data) {
+    for (int i = 0; i < kRankAtoms; i++) {
+      __builtin_nontemporal_store(data[i], send_buffer + thread);
+      send_buffer += kAtomStride;
+    }
+  }
+
+  __quickreduce_device_inline__ void recv(int32x4_t** __restrict__ recv_buffer,
+                                          int32x4_t* __restrict__ data) {
+    for (int i = 0; i < kRankAtoms; i++) {
+      data[i] = __builtin_nontemporal_load(*recv_buffer + thread);
+      *recv_buffer += kAtomStride;
+    }
+  }
+};
+
+// Int4 symmetric quantization codec.
+// We quantize the FP16 data to block-scaled Int4 in blocks of 4 *
+// kThreadGroupSize.
+template <typename T, int world_size>
+struct CodecQ4 : public CodecBase {
+  static constexpr int kWorldSize = world_size;
+
+  // Codec tile size process by this workgroup.
+  // Each threads processes a fragment of fp16x8_t (16B),
+  // into a int4x8_t (4B) and a fp16 scale shared among 32 values.
+  static constexpr int kRankAtoms = kAtoms / kWorldSize;
+  static constexpr int kRankTileStride = 1152;
+  static constexpr int kRankTileScaleOffset = 1024;
+  static constexpr int kRankTransmittedTileSize = kRankTileStride * kRankAtoms;
+  static_assert(kRankTransmittedTileSize % 16 == 0,
+                "kRankTransmittedTileSize must be 16B aligned.");
+
+  static constexpr int kRankBufferTileStride =
+      kRankTileStride / sizeof(int32x4_t);
+
+  // Total tile size for the collective communication.
+  static constexpr int kTransmittedTileSize =
+      kRankTransmittedTileSize * kWorldSize;
+
+  // Constants configuration
+
+  // {-1/8.0h, -1/8.0h}, f16x2_t
+  static constexpr int kScaleFactor =
+      std::is_same<T, half>::value ? 0xB000B000 : 0xBE00BE00;
+
+  // {1e-7, 1e-7}, f16x2_t
+  static constexpr int kScaleEpsilon =
+      std::is_same<T, half>::value ? 0x00010001 : 0x33D733D7;
+
+  // {-8, -8}, f16x2_t
+  static constexpr int kRangeMin =
+      std::is_same<T, half>::value ? 0xC800C800 : 0xC100C100;
+
+  // {+7, +7}, f16x2_t
+  static constexpr int kRangeMax =
+      std::is_same<T, half>::value ? 0x47004700 : 0x40E040E0;
+
+  // {+8, +8}, int16x2_t
+  static constexpr int kRangeBias = 0x00080008;
+
+  __quickreduce_device_inline__ CodecQ4(int thread, int rank)
+      : CodecBase(thread, rank) {}
+
+  __quickreduce_device_inline__ void send(int32x4_t* __restrict__ send_buffer,
+                                          const int32x4_t* __restrict__ data) {
+    for (int k = 0; k < kRankAtoms; k++) {
+      int32x4_t const atom = data[k];
+
+      // Compute the absolute maximum of the atom in the thread group
+      // In 2 blocks of values, upper/lower halves of the f16x2_t
+      int wblockmax = group_abs_max<T>(atom);
+
+      // Derive scales
+      int decoding_scale;
+      int encoding_scale;
+      decoding_scale = packed_mul<T>(wblockmax, kScaleFactor);
+      encoding_scale = packed_add<T>(decoding_scale, kScaleEpsilon);
+      encoding_scale = packed_rcp<T>(encoding_scale);
+
+      // Apply scales to get quantized values
+      int32x4_t w;
+      for (int i = 0; i < 4; i++) {
+        w[i] = packed_mul<T>(atom[i], encoding_scale);
+        w[i] = packed_max<T>(w[i], kRangeMin);
+        w[i] = packed_min<T>(w[i], kRangeMax);
+      }
+
+      // Convert from f16x2_t to uint16x2_t
+      int32x4_t q;
+      {
+        int16_t* qi = reinterpret_cast<int16_t*>(&q);
+        T* wh = reinterpret_cast<T*>(&w);
+        for (int i = 0; i < 8; i++) qi[i] = (int16_t)rintf(T2float_cast(wh[i]));
+
+        for (int i = 0; i < 4; i++) {
+          q[i] = packed_add<int16_t>(q[i], kRangeBias);
+        }
+      }
+
+      // Pack 8 x q4 into int32_t
+      int qw = q[0] | (q[1] << 4) | (q[2] << 8) | (q[3] << 12);
+
+      // Write quantized atom to send_buffer
+      // note: only the group leader stores the scale
+      uint8_t* atom_ptr =
+          reinterpret_cast<uint8_t*>(send_buffer + k * kRankBufferTileStride);
+      int32_t* qw_ptr = reinterpret_cast<int32_t*>(atom_ptr) + thread;
+      int* qs_ptr = reinterpret_cast<int*>(atom_ptr + kRankTileScaleOffset) +
+                    (thread / 8);
+
+      __builtin_nontemporal_store(qw, qw_ptr);
+      if (threadIdx.x == group_leader) {
+        __builtin_nontemporal_store(decoding_scale, qs_ptr);
+      }
+    }
+  }
+
+  __quickreduce_device_inline__ void recv(int32x4_t** __restrict__ recv_buffer,
+                                          int32x4_t* __restrict__ data) {
+    for (int k = 0; k < kRankAtoms; k++) {
+      // Directly read quantized atom from recv_buffer
+      uint8_t* atom_ptr = reinterpret_cast<uint8_t*>(*recv_buffer);
+      int32_t* qw_ptr = reinterpret_cast<int32_t*>(atom_ptr) + thread;
+      int* qs_ptr = reinterpret_cast<int*>(atom_ptr + kRankTileScaleOffset) +
+                    (thread / 8);
+
+      int32_t qw = __builtin_nontemporal_load(qw_ptr);
+      int qs = __builtin_nontemporal_load(qs_ptr);
+
+      *recv_buffer += kRankBufferTileStride;
+
+      // Unpack q4 into f16x8_t
+      int32x4_t w;
+      {
+        static constexpr uint kMask000F = 0x000F000F;
+        static constexpr uint kHalf2_1024 =
+            0x64006400;  // {1024.0, 1024.0}, fp16x2_t
+        static uint constexpr kHalf2_1032 =
+            0xE408E408;  // {-1032.0, -1032.0}, fp16x2_t
+
+        for (int i = 0; i < 4; i++) {
+          if constexpr (std::is_same<T, half>::value) {
+            int32_t q4 = ((qw >> (i * 4)) & kMask000F) | kHalf2_1024;
+            w[i] = packed_add<half>(q4, kHalf2_1032);
+          } else {
+            int32_t int16_2 = (qw >> (i * 4)) & kMask000F;
+            int16_t low = static_cast<int16_t>(int16_2 & 0xFFFF);
+            int16_t high = static_cast<int16_t>((int16_2 >> 16) & 0xFFFF);
+            nv_bfloat16 bf_low = __float2bfloat16(static_cast<float>(low));
+            nv_bfloat16 bf_high = __float2bfloat16(static_cast<float>(high));
+            nv_bfloat162 bf2 = __halves2bfloat162(bf_low, bf_high);
+            int32_t packed_bf16 = *reinterpret_cast<int32_t*>(&bf2);
+            w[i] = packed_add<nv_bfloat16>(packed_bf16, kRangeMin);
+          }
+        }
+      }
+
+      // Apply decoding scales
+      for (int i = 0; i < 4; i++) {
+        w[i] = packed_mul<T>(w[i], qs);
+      }
+
+      data[k] = w;
+    }
+  }
+};
+
+// Int6 symmetric quantization codec.
+// We quantize the FP16 data to block-scaled Int6 in blocks of 4 *
+// kThreadGroupSize.
+template <typename T, int world_size>
+struct CodecQ6 : public CodecBase {
+  static constexpr int kWorldSize = world_size;
+
+  // Codec tile size process by this workgroup.
+  // Each threads processes a fragment of fp16x8_t (16B),
+  // into a int6x8_t (4B + 2B) and a fp16 scale shared among 32 values.
+  static constexpr int kRankAtoms = kAtoms / kWorldSize;
+  static constexpr int kRankTileStride = 1664;
+  static constexpr int kRankTileQ2Offset = 1024;
+  static constexpr int kRankTileScaleOffset = 1536;
+  static constexpr int kRankTransmittedTileSize = kRankTileStride * kRankAtoms;
+  static_assert(kRankTransmittedTileSize % 16 == 0,
+                "kRankTransmittedTileSize must be 16B aligned.");
+
+  static constexpr int kRankBufferTileStride =
+      kRankTileStride / sizeof(int32x4_t);
+
+  // Total tile size for the collective communication.
+  static constexpr int kTransmittedTileSize =
+      kRankTransmittedTileSize * kWorldSize;
+
+  // Constants configuration
+
+  // {-1/32.0h, -1/32.0h}, fp16x2_t
+  static constexpr int kScaleFactor =
+      std::is_same<T, half>::value ? 0xA800A800 : 0xBD00BD00;
+
+  // {1e-7, 1e-7}, fp16x2_t
+  static constexpr int kScaleEpsilon =
+      std::is_same<T, half>::value ? 0x00010001 : 0x33D733D7;
+
+  // {-32, -32}, fp16x2_t
+  static constexpr int kRangeMin =
+      std::is_same<T, half>::value ? 0xD000D000 : 0xC200C200;
+
+  // {+31, +31}, fp16x2_t
+  static constexpr int kRangeMax =
+      std::is_same<T, half>::value ? 0x4FC04FC0 : 0x41F841F8;
+
+  // {+32, +32}, int16x2_t
+  static constexpr int kRangeBias = 0x00200020;
+
+  __quickreduce_device_inline__ CodecQ6(int thread, int rank)
+      : CodecBase(thread, rank) {}
+
+  __quickreduce_device_inline__ void send(int32x4_t* __restrict__ send_buffer,
+                                          const int32x4_t* __restrict__ data) {
+    for (int k = 0; k < kRankAtoms; k++) {
+      int32x4_t const atom = data[k];
+
+      // Compute the absolute maximum of the atom in the thread group
+      // In 2 blocks of values, upper/lower halves of the f16x2_t
+      int wblockmax = group_abs_max<T>(atom);
+
+      // Derive scales
+      int decoding_scale;
+      int encoding_scale;
+      decoding_scale = packed_mul<T>(wblockmax, kScaleFactor);
+      encoding_scale = packed_add<T>(decoding_scale, kScaleEpsilon);
+      encoding_scale = packed_rcp<T>(encoding_scale);
+
+      // Apply scales to get quantized values
+      int32x4_t w;
+      for (int i = 0; i < 4; i++) {
+        w[i] = packed_mul<T>(atom[i], encoding_scale);
+        w[i] = packed_max<T>(w[i], kRangeMin);
+        w[i] = packed_min<T>(w[i], kRangeMax);
+      }
+
+      // Convert from f16x2_t to uint16x2_t
+      int32x4_t q;
+      {
+        int16_t* qi = reinterpret_cast<int16_t*>(&q);
+        T* wh = reinterpret_cast<T*>(&w);
+        for (int i = 0; i < 8; i++) qi[i] = (int16_t)rintf(T2float_cast(wh[i]));
+
+        for (int i = 0; i < 4; i++) {
+          q[i] = packed_add<int16_t>(q[i], kRangeBias);
+        }
+      }
+
+      // Pack 8 x q6 into int32_t + int16_t
+      uint32_t q4w;
+      uint16_t q2w = 0;
+      q4w = (q[0] & 0x000F000F) | ((q[1] & 0x000F000F) << 4) |
+            ((q[2] & 0x000F000F) << 8) | ((q[3] & 0x000F000F) << 12);
+      {
+        int16_t* tw = reinterpret_cast<int16_t*>(&q);
+#pragma unroll
+        for (int i = 0; i < 8; i++) {
+          q2w |= (tw[i] >> 4) << (i * 2);
+        }
+      }
+      // Write quantized atom to send_buffer
+      // note: only the group leader stores the scale
+      uint8_t* atom_ptr =
+          reinterpret_cast<uint8_t*>(send_buffer + k * kRankBufferTileStride);
+      uint32_t* q4w_ptr = reinterpret_cast<uint32_t*>(atom_ptr) + thread;
+      uint16_t* q2w_ptr =
+          reinterpret_cast<uint16_t*>(atom_ptr + kRankTileQ2Offset) + thread;
+      int* qs_ptr = reinterpret_cast<int*>(atom_ptr + kRankTileScaleOffset) +
+                    (thread / 8);
+
+      __builtin_nontemporal_store(q4w, q4w_ptr);
+      __builtin_nontemporal_store(q2w, q2w_ptr);
+      if (threadIdx.x == group_leader) {
+        __builtin_nontemporal_store(decoding_scale, qs_ptr);
+      }
+    }
+  }
+
+  __quickreduce_device_inline__ void recv(int32x4_t** __restrict__ recv_buffer,
+                                          int32x4_t* __restrict__ data) {
+    for (int k = 0; k < kRankAtoms; k++) {
+      // Directly read quantized atom from recv_buffer
+      uint8_t* atom_ptr = reinterpret_cast<uint8_t*>(*recv_buffer);
+      uint32_t* q4w_ptr = reinterpret_cast<uint32_t*>(atom_ptr) + thread;
+      uint16_t* q2w_ptr =
+          reinterpret_cast<uint16_t*>(atom_ptr + kRankTileQ2Offset) + thread;
+      int* qs_ptr = reinterpret_cast<int*>(atom_ptr + kRankTileScaleOffset) +
+                    (thread / 8);
+
+      uint32_t q4w = __builtin_nontemporal_load(q4w_ptr);
+      uint16_t q2w = __builtin_nontemporal_load(q2w_ptr);
+      int qs = __builtin_nontemporal_load(qs_ptr);
+
+      *recv_buffer += kRankBufferTileStride;
+
+      // Unpack q6 into fp16x8_t
+      int32x4_t w;
+      {
+        static uint constexpr kMask000F = 0x000F000F;
+        static uint constexpr kHalf2_1024 =
+            0x64006400;  // {1024.0, 1024.0}, fp16x2_t
+        static uint constexpr kHalf2_1056 =
+            0xE420E420;  // {-1056.0, -1056.0}, fp16x2_t
+
+#pragma unroll
+        for (int i = 0; i < 4; i++) {
+          int32_t q4 = q4w & kMask000F;
+          int32_t q2 = (q2w & 0x3) | ((q2w & 0xC) << 14);
+          q4w >>= 4;
+          q2w >>= 4;
+          if constexpr (std::is_same<T, half>::value) {
+            int32_t q6 = q4 | (q2 << 4) | kHalf2_1024;
+            asm volatile("v_pk_add_f16 %0, %1, %2"
+                         : "=v"(w[i])
+                         : "v"(q6), "v"(kHalf2_1056));
+          } else {
+            int32_t int16_2 = q4 | (q2 << 4);
+            int16_t low = static_cast<int16_t>(int16_2 & 0xFFFF);
+            int16_t high = static_cast<int16_t>((int16_2 >> 16) & 0xFFFF);
+
+            nv_bfloat16 bf_low = __float2bfloat16(static_cast<float>(low));
+            nv_bfloat16 bf_high = __float2bfloat16(static_cast<float>(high));
+            nv_bfloat162 bf2 = __halves2bfloat162(bf_low, bf_high);
+            int32_t packed_bf16 = *reinterpret_cast<int32_t*>(&bf2);
+            w[i] = packed_add<nv_bfloat16>(packed_bf16, kRangeMin);
+          }
+        }
+      }
+
+      // Apply decoding scales
+      for (int i = 0; i < 4; i++) {
+        w[i] = packed_mul<T>(w[i], qs);
+      }
+
+      // That's pretty much it...
+      data[k] = w;
+    }
+  }
+};
+
+// Int8 symmetric quantization codec.
+// We quantize the FP16 data to block-scaled Int8 in blocks of 4 *
+// kThreadGroupSize.
+template <typename T, int world_size>
+struct CodecQ8 : public CodecBase {
+  static constexpr int kWorldSize = world_size;
+
+  // Codec tile size process by this workgroup.
+  // Each threads processes a fragment of f16x8_t (16B),
+  // into a int8x8_t (8B) and a f16 scale shared among 32 values.
+  static constexpr int kRankAtoms = kAtoms / kWorldSize;
+  static constexpr int kRankTileStride = 2176;
+  static constexpr int kRankTileScaleOffset = 2048;
+  static constexpr int kRankTransmittedTileSize = kRankTileStride * kRankAtoms;
+  static_assert(kRankTransmittedTileSize % 16 == 0,
+                "kRankTileSize must be 16B aligned.");
+
+  static constexpr int kRankBufferTileStride =
+      kRankTileStride / sizeof(int32x4_t);
+
+  // Total tile size for the collective communication.
+  static constexpr int kTransmittedTileSize =
+      kRankTransmittedTileSize * kWorldSize;
+
+  // Constants configuration
+
+  // {-1/128.0h, -1/128.0h}, f16x2_t
+  static constexpr int kScaleFactor =
+      std::is_same<T, half>::value ? 0xA000A000 : 0xBC00BC00;
+
+  // {1e-7, 1e-7}, f16x2_t
+  static constexpr int kScaleEpsilon =
+      std::is_same<T, half>::value ? 0x00010001 : 0x33D733D7;
+
+  // {-128, -128}, f16x2_t
+  static constexpr int kRangeMin =
+      std::is_same<T, half>::value ? 0xD800D800 : 0xC300C300;
+  // {+127, +127}, f16x2_t
+  static constexpr int kRangeMax =
+      std::is_same<T, half>::value ? 0x57F057F0 : 0x42FE42FE;
+
+  // {+128, +128}, int16x2_t
+  static constexpr int kRangeBias = 0x00800080;
+
+  __quickreduce_device_inline__ CodecQ8(int thread, int rank)
+      : CodecBase(thread, rank) {}
+
+  __quickreduce_device_inline__ void send(int32x4_t* __restrict__ send_buffer,
+                                          int32x4_t const* __restrict__ data) {
+    for (int k = 0; k < kRankAtoms; k++) {
+      int32x4_t const atom = data[k];
+      // Compute the absolute maximum of the atom in the thread group
+      // In 2 blocks of values, upper/lower halves of the f16x2_t
+      int wblockmax = group_abs_max<T>(atom);
+
+      // Derive scales
+      int decoding_scale;
+      int encoding_scale;
+      decoding_scale = packed_mul<T>(wblockmax, kScaleFactor);
+      encoding_scale = packed_add<T>(decoding_scale, kScaleEpsilon);
+      encoding_scale = packed_rcp<T>(encoding_scale);
+
+      // Apply scales to get quantized values
+      int32x4_t w;
+      for (int i = 0; i < 4; i++) {
+        w[i] = packed_mul<T>(atom[i], encoding_scale);
+        w[i] = packed_max<T>(w[i], kRangeMin);
+        w[i] = packed_min<T>(w[i], kRangeMax);
+      }
+
+      // Convert from f16x2_t to uint16x2_t
+      int32x4_t q;
+      {
+        int16_t* qi = reinterpret_cast<int16_t*>(&q);
+        T* wh = reinterpret_cast<T*>(&w);
+        for (int i = 0; i < 8; i++) qi[i] = (int16_t)rintf(T2float_cast(wh[i]));
+
+        for (int i = 0; i < 4; i++) {
+          q[i] = packed_add<int16_t>(q[i], kRangeBias);
+        }
+      }
+
+      // Pack 8 x q8 into int32x2_t
+      int32x2_t qw;
+      qw[0] = q[0] | (q[1] << 8);
+      qw[1] = q[2] | (q[3] << 8);
+
+      // Write quantized atom to send_buffer
+      // note: only the group leader stores the scale
+      uint8_t* atom_ptr =
+          reinterpret_cast<uint8_t*>(send_buffer + k * kRankBufferTileStride);
+      int32x2_t* qw_ptr = reinterpret_cast<int32x2_t*>(atom_ptr) + thread;
+      int* qs_ptr = reinterpret_cast<int*>(atom_ptr + kRankTileScaleOffset) +
+                    (thread / 8);
+
+      __builtin_nontemporal_store(qw, qw_ptr);
+      if (threadIdx.x == group_leader) {
+        __builtin_nontemporal_store(decoding_scale, qs_ptr);
+      }
+    }
+  }
+
+  __quickreduce_device_inline__ void recv(int32x4_t** __restrict__ recv_buffer,
+                                          int32x4_t* __restrict__ data) {
+    for (int k = 0; k < kRankAtoms; k++) {
+      // Directly read quantized atom from recv_buffer
+      uint8_t* atom_ptr = reinterpret_cast<uint8_t*>(*recv_buffer);
+      int32x2_t* qw_ptr = reinterpret_cast<int32x2_t*>(atom_ptr) + thread;
+      int* qs_ptr = reinterpret_cast<int*>(atom_ptr + kRankTileScaleOffset) +
+                    (thread / 8);
+
+      int32x2_t qw = __builtin_nontemporal_load(qw_ptr);
+      int qs = __builtin_nontemporal_load(qs_ptr);
+
+      *recv_buffer += kRankBufferTileStride;
+
+      // Unpack q8 into fp16x8_t
+      int32x4_t w;
+      {
+        static uint constexpr kMask00FF = 0x00FF00FF;
+
+        // {1024.0, 1024.0}, fp16x2_t
+        static uint constexpr kHalf2_1024 = 0x64006400;
+
+        // {-1152.0, -1152.0}, fp16x2_t
+        static uint constexpr kHalf2_1152 = 0xE480E480;
+
+#pragma unroll
+        for (int i = 0; i < 4; i++) {
+          if constexpr (std::is_same<T, half>::value) {
+            int32_t q8 =
+                ((qw[i / 2] >> ((i % 2) * 8)) & kMask00FF) | kHalf2_1024;
+            w[i] = packed_add<half>(q8, kHalf2_1152);
+          } else {
+            int32_t int16_2 = (qw[i / 2] >> ((i % 2) * 8)) & kMask00FF;
+            int16_t low = static_cast<int16_t>(int16_2 & 0xFFFF);
+            int16_t high = static_cast<int16_t>((int16_2 >> 16) & 0xFFFF);
+            nv_bfloat16 bf_low = __float2bfloat16(static_cast<float>(low));
+            nv_bfloat16 bf_high = __float2bfloat16(static_cast<float>(high));
+            nv_bfloat162 bf2 = __halves2bfloat162(bf_low, bf_high);
+            int32_t packed_bf16 = *reinterpret_cast<int32_t*>(&bf2);
+            w[i] = packed_add<nv_bfloat16>(packed_bf16, kRangeMin);
+          }
+        }
+      }
+
+      // Apply decoding scales
+      for (int i = 0; i < 4; i++) {
+        w[i] = packed_mul<T>(w[i], qs);
+      }
+
+      data[k] = w;
+    }
+  }
+};
+
+// Twoshot All Reduce
+template <typename T, class Codec, bool cast_bf2half>
+struct AllReduceTwoshot {
+  static_assert(sizeof(T) == 2);
+
+  static constexpr int kWorldSize = Codec::kWorldSize;
+
+  __device__ static void run(
+      T const* __restrict__ input, T* __restrict__ output,
+      uint32_t const N,                    // number of elements
+      int const block,                     // block index
+      int const rank,                      // rank index
+      uint8_t** __restrict__ buffer_list,  // communication buffers
+      uint32_t const data_offset,          // offset to start of the data buffer
+      uint32_t flag_color) {
+    // Topology
+    int thread = threadIdx.x + threadIdx.y * kWavefront;
+    uint8_t* rank_buffer = buffer_list[rank];
+    Codec codec(thread, rank);
+    int block_id = blockIdx.x;
+    int grid_size = gridDim.x;
+    // --------------------------------------------------------
+    // Read input into registers
+    int32x4_t tA[kAtoms];
+
+    BufferResource src_buffer(const_cast<T*>(input), N * sizeof(T));
+    uint32_t src_offset = block * kTileSize + thread * sizeof(int32x4_t);
+
+    for (int i = 0; i < kAtoms; i++) {
+      tA[i] = buffer_load_dwordx4(src_buffer.descriptor, src_offset, 0, 0);
+      src_offset += kAtomStride * sizeof(int32x4_t);
+      if constexpr (cast_bf2half) {
+        const nv_bfloat162* bf_buf =
+            reinterpret_cast<const nv_bfloat162*>(&tA[i]);
+        half2 half_buf[4];
+#pragma unroll
+        for (int j = 0; j < 4; ++j) {
+          float2 f = __bfloat1622float2(bf_buf[j]);
+          half_buf[j] = __float22half2_rn(f);
+        }
+        tA[i] = *reinterpret_cast<const int32x4_t*>(half_buf);
+      }
+    }
+
+    // --------------------------------------------------------
+    // Phase-1A: Write segment data into the communication buffer of the target
+    // rank responsible for this segment.
+    uint32_t comm_data0_offset =
+        data_offset + block_id * Codec::kTransmittedTileSize;
+    uint32_t comm_data1_offset =
+        grid_size * Codec::kTransmittedTileSize + comm_data0_offset;
+
+    uint32_t comm_flags0_offset = block_id * (kWorldSize * sizeof(uint32_t));
+    uint32_t comm_flags1_offset =
+        grid_size * (kWorldSize * sizeof(uint32_t)) + comm_flags0_offset;
+
+    for (int r = 0; r < kWorldSize; r++) {
+      int32x4_t* send_buffer =
+          reinterpret_cast<int32x4_t*>(buffer_list[r] + comm_data0_offset +
+                                       rank * Codec::kRankTransmittedTileSize);
+      codec.send(send_buffer, &tA[r * Codec::kRankAtoms]);
+    }
+
+    __syncthreads();
+    if (thread < kWorldSize) {
+      int r = thread;
+      uint32_t* flag_ptr = reinterpret_cast<uint32_t*>(
+          buffer_list[r] + comm_flags0_offset + rank * sizeof(uint32_t));
+      set_sync_flag(flag_ptr, flag_color);
+    }
+    // --------------------------------------------------------
+    // Phase-1B: Reduce the segment data from the communication buffers.
+    int32x4_t tR[Codec::kRankAtoms] = {};
+    {
+      // Read the data from the communication buffer.
+      int32x4_t* recv_buffer =
+          reinterpret_cast<int32x4_t*>(rank_buffer + comm_data0_offset);
+      uint32_t* flag_ptr =
+          reinterpret_cast<uint32_t*>(rank_buffer + comm_flags0_offset);
+
+      for (int r = 0; r < kWorldSize; r++) {
+        // Wait for the flags to be set.
+        if (thread == 0) {
+          wait_sync_flag(&flag_ptr[r], flag_color);
+        }
+        __syncthreads();
+
+        // note: we reuse tA as temp buffer here
+        codec.recv(&recv_buffer, tA);
+
+        for (int i = 0; i < Codec::kRankAtoms; i++) {
+          packed_assign_add<T>(&tR[i], &tA[i]);
+        }
+      }
+    }
+
+    // Phase-2: Write the reduced segment to every other rank
+    for (int r = 0; r < kWorldSize; r++) {
+      int32x4_t* send_buffer =
+          reinterpret_cast<int32x4_t*>(buffer_list[r] + comm_data1_offset +
+                                       rank * Codec::kRankTransmittedTileSize);
+      codec.send(send_buffer, tR);
+    }
+
+    __syncthreads();
+    if (thread < kWorldSize) {
+      int r = thread;
+      uint32_t* flag_ptr = reinterpret_cast<uint32_t*>(
+          buffer_list[r] + comm_flags1_offset + rank * sizeof(uint32_t));
+      set_sync_flag(flag_ptr, flag_color);
+    }
+
+    // Phase-2: Read the gather segments from the rank's communication buffer.
+    {
+      // Read the data from the communication buffer.
+      int32x4_t* recv_buffer =
+          reinterpret_cast<int32x4_t*>(rank_buffer + comm_data1_offset);
+      uint32_t* flag_ptr =
+          reinterpret_cast<uint32_t*>(rank_buffer + comm_flags1_offset);
+
+      for (int r = 0; r < kWorldSize; r++) {
+        // Wait for the flags to be set.
+        if (thread == 0) {
+          wait_sync_flag(&flag_ptr[r], flag_color);
+        }
+        __syncthreads();
+
+        // Gather all reduced and final rank segments into tA.
+        codec.recv(&recv_buffer, &tA[r * Codec::kRankAtoms]);
+      }
+    }
+
+    // --------------------------------------------------------
+    // Write the result to output.
+    BufferResource dst_buffer(output, N * sizeof(T));
+    uint32_t dst_offset = block * kTileSize + thread * sizeof(int32x4_t);
+
+    for (int i = 0; i < kAtoms; i++) {
+      if constexpr (cast_bf2half) {
+        const half2* half_buf = reinterpret_cast<const half2*>(&tA[i]);
+        nv_bfloat162 bf16_buf[4];
+#pragma unroll
+        for (int j = 0; j < 4; ++j) {
+          float2 f = __half22float2(half_buf[j]);
+          bf16_buf[j] = __float22bfloat162_rn(f);
+        }
+        buffer_store_dwordx4(*reinterpret_cast<const int32x4_t*>(bf16_buf),
+                             dst_buffer.descriptor, dst_offset, 0, 0);
+      } else {
+        buffer_store_dwordx4(tA[i], dst_buffer.descriptor, dst_offset, 0, 0);
+      }
+      dst_offset += kAtomStride * sizeof(int32x4_t);
+    }
+  }
+};
+
+}  // namespace quickreduce
--- a/csrc/rocm/attention.cu
+++ b/csrc/rocm/attention.cu
@ -1598,7 +1598,6 @@ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel(
  const int warpid = threadIdx.x / WARP_SIZE;
  const int laneid = threadIdx.x % WARP_SIZE;
  const int lane2id = laneid % 2;
-  const int lane4id = laneid % 4;
  const int lane16id = laneid % 16;
  const int rowid = laneid / 16;

@ -1745,7 +1744,6 @@ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel(
    const cache_t* k_ptr2 = k_ptr + kblock_number * kv_block_stride;
    const int klocal_token_idx =
        TOKENS_PER_WARP * warpid + token_depth * 16 + lane16id;
-    const int kglobal_token_idx = partition_start_token_idx + klocal_token_idx;
    const int kphysical_block_offset = klocal_token_idx % BLOCK_SIZE;
    const cache_t* k_ptr3 = k_ptr2 + kphysical_block_offset * KX;

@ -2368,7 +2366,6 @@ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel(
  const int warpid = threadIdx.x / WARP_SIZE;
  const int laneid = threadIdx.x % WARP_SIZE;
  const int lane2id = laneid % 2;
-  const int lane4id = laneid % 4;
  const int lane16id = laneid % 16;
  const int rowid = laneid / 16;

@ -2514,7 +2511,6 @@ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel(
    const cache_t* k_ptr2 = k_ptr + kblock_number * kv_block_stride;
    const int klocal_token_idx =
        TOKENS_PER_WARP * warpid + token_depth * 16 + lane16id;
-    const int kglobal_token_idx = partition_start_token_idx + klocal_token_idx;
    const int kphysical_block_offset = klocal_token_idx % BLOCK_SIZE;
    const cache_t* k_ptr3 = k_ptr2 + kphysical_block_offset * KX;

--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
@ -725,6 +725,24 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _custom_ar), custom_ar) {
  custom_ar.impl("open_mem_handle", torch::kCPU, &open_mem_handle);

  custom_ar.def("free_shared_buffer", &free_shared_buffer);
+#ifdef USE_ROCM
+  // Quick Reduce all-reduce kernels
+  custom_ar.def(
+      "qr_all_reduce(int fa, Tensor inp, Tensor out, int quant_level, bool "
+      "cast_bf2half) -> ()");
+  custom_ar.impl("qr_all_reduce", torch::kCUDA, &qr_all_reduce);
+
+  custom_ar.def("init_custom_qr", &init_custom_qr);
+  custom_ar.def("qr_destroy", &qr_destroy);
+
+  custom_ar.def("qr_get_handle", &qr_get_handle);
+
+  custom_ar.def("qr_open_handles(int _fa, Tensor[](b!) handles) -> ()");
+  custom_ar.impl("qr_open_handles", torch::kCPU, &qr_open_handles);
+
+  // Max input size in bytes
+  custom_ar.def("qr_max_size", &qr_max_size);
+#endif
 }

 REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@ -6,30 +6,106 @@
 # docs/assets/contributing/dockerfile-stages-dependency.png

 ARG CUDA_VERSION=12.8.1
+ARG PYTHON_VERSION=3.12
+
+# By parameterizing the base images, we allow third-party to use their own
+# base images. One use case is hermetic builds with base images stored in
+# private registries that use a different repository naming conventions.
+#
+# Example:
+# docker build --build-arg BUILD_BASE_IMAGE=registry.acme.org/mirror/nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
+ARG BUILD_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
+ARG FINAL_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04
+
+# By parameterizing the Deadsnakes repository URL, we allow third-party to use
+# their own mirror. When doing so, we don't benefit from the transparent
+# installation of the GPG key of the PPA, as done by add-apt-repository, so we
+# also need a URL for the GPG key.
+ARG DEADSNAKES_MIRROR_URL
+ARG DEADSNAKES_GPGKEY_URL
+
+# The PyPA get-pip.py script is a self contained script+zip file, that provides
+# both the installer script and the pip base85-encoded zip archive. This allows
+# bootstrapping pip in environment where a dsitribution package does not exist.
+#
+# By parameterizing the URL for get-pip.py installation script, we allow
+# third-party to use their own copy of the script stored in a private mirror.
+# We set the default value to the PyPA owned get-pip.py script.
+#
+# Reference: https://pip.pypa.io/en/stable/installation/#get-pip-py
+ARG GET_PIP_URL="https://bootstrap.pypa.io/get-pip.py"
+
+# PIP supports fetching the packages from custom indexes, allowing third-party
+# to host the packages in private mirrors. The PIP_INDEX_URL and
+# PIP_EXTRA_INDEX_URL are standard PIP environment variables to override the
+# default indexes. By letting them empty by default, PIP will use its default
+# indexes if the build process doesn't override the indexes.
+#
+# Uv uses different variables. We set them by default to the same values as
+# PIP, but they can be overridden.
+ARG PIP_INDEX_URL
+ARG PIP_EXTRA_INDEX_URL
+ARG UV_INDEX_URL=${PIP_INDEX_URL}
+ARG UV_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
+
+# PyTorch provides its own indexes for standard and nightly builds
+ARG PYTORCH_CUDA_INDEX_BASE_URL=https://download.pytorch.org/whl
+ARG PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL=https://download.pytorch.org/whl/nightly
+
+# PIP supports multiple authentication schemes, including keyring
+# By parameterizing the PIP_KEYRING_PROVIDER variable and setting it to
+# disabled by default, we allow third-party to use keyring authentication for
+# their private Python indexes, while not changing the default behavior which
+# is no authentication.
+#
+# Reference: https://pip.pypa.io/en/stable/topics/authentication/#keyring-support
+ARG PIP_KEYRING_PROVIDER=disabled
+ARG UV_KEYRING_PROVIDER=${PIP_KEYRING_PROVIDER}
+
 #################### BASE BUILD IMAGE ####################
 # prepare basic build environment
-FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS base
-ARG CUDA_VERSION=12.8.1
-ARG PYTHON_VERSION=3.12
+FROM ${BUILD_BASE_IMAGE} AS base
+ARG CUDA_VERSION
+ARG PYTHON_VERSION
 ARG TARGETPLATFORM
 ENV DEBIAN_FRONTEND=noninteractive

+ARG DEADSNAKES_MIRROR_URL
+ARG DEADSNAKES_GPGKEY_URL
+ARG GET_PIP_URL
+
 # Install Python and other dependencies
 RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
    && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
    && apt-get update -y \
    && apt-get install -y ccache software-properties-common git curl sudo \
-    && for i in 1 2 3; do \
-        add-apt-repository -y ppa:deadsnakes/ppa && break || \
-        { echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \
-    done \
+    && if [ ! -z ${DEADSNAKES_MIRROR_URL} ] ; then \
+        if [ ! -z "${DEADSNAKES_GPGKEY_URL}" ] ; then \
+            mkdir -p -m 0755 /etc/apt/keyrings ; \
+            curl -L ${DEADSNAKES_GPGKEY_URL} | gpg --dearmor > /etc/apt/keyrings/deadsnakes.gpg ; \
+            sudo chmod 644 /etc/apt/keyrings/deadsnakes.gpg ; \
+            echo "deb [signed-by=/etc/apt/keyrings/deadsnakes.gpg] ${DEADSNAKES_MIRROR_URL} $(lsb_release -cs) main" > /etc/apt/sources.list.d/deadsnakes.list ; \
+        fi ; \
+    else \
+        for i in 1 2 3; do \
+            add-apt-repository -y ppa:deadsnakes/ppa && break || \
+            { echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \
+        done ; \
+    fi \
    && apt-get update -y \
    && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
    && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
    && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
-    && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
+    && curl -sS ${GET_PIP_URL} | python${PYTHON_VERSION} \
    && python3 --version && python3 -m pip --version
+
+ARG PIP_INDEX_URL UV_INDEX_URL
+ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
+ARG PYTORCH_CUDA_INDEX_BASE_URL
+ARG PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL
+ARG PIP_KEYRING_PROVIDER UV_KEYRING_PROVIDER
+
 # Install uv for faster pip installs
 RUN --mount=type=cache,target=/root/.cache/uv \
    python3 -m pip install uv
@ -63,21 +139,25 @@ WORKDIR /workspace
 # after this step
 RUN --mount=type=cache,target=/root/.cache/uv \
    if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
-        uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu128 "torch==2.8.0.dev20250318+cu128" "torchvision==0.22.0.dev20250319";  \
-        uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu128 --pre pytorch_triton==3.3.0+gitab727c40; \
+        uv pip install --system \
+            --index-url ${PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
+            "torch==2.8.0.dev20250318+cu128" "torchvision==0.22.0.dev20250319";  \
+        uv pip install --system \
+            --index-url ${PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
+            --pre pytorch_triton==3.3.0+gitab727c40; \
    fi

 COPY requirements/common.txt requirements/common.txt
 COPY requirements/cuda.txt requirements/cuda.txt
 RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system -r requirements/cuda.txt \
-    --extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
+    --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')

 # cuda arch list used by torch
 # can be useful for both `dev` and `test`
 # explicitly set the list to avoid issues with torch 2.2
 # see https://github.com/pytorch/pytorch/pull/123243
-ARG torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0 10.0+PTX'
+ARG torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0 10.0 12.0'
 ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
 # Override the arch list for flash-attn to reduce the binary size
 ARG vllm_fa_cmake_gpu_arches='80-real;90-real'
@ -88,6 +168,10 @@ ENV VLLM_FA_CMAKE_GPU_ARCHES=${vllm_fa_cmake_gpu_arches}
 FROM base AS build
 ARG TARGETPLATFORM

+ARG PIP_INDEX_URL UV_INDEX_URL
+ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
+ARG PYTORCH_CUDA_INDEX_BASE_URL
+
 # install build dependencies
 COPY requirements/build.txt requirements/build.txt

@ -98,7 +182,7 @@ ENV UV_INDEX_STRATEGY="unsafe-best-match"

 RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system -r requirements/build.txt \
-    --extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
+    --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')

 COPY . .
 ARG GIT_REPO_CHECK=0
@ -113,6 +197,8 @@ ARG nvcc_threads=8
 ENV NVCC_THREADS=$nvcc_threads

 ARG USE_SCCACHE
+ARG SCCACHE_DOWNLOAD_URL=https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-x86_64-unknown-linux-musl.tar.gz
+ARG SCCACHE_ENDPOINT
 ARG SCCACHE_BUCKET_NAME=vllm-build-sccache
 ARG SCCACHE_REGION_NAME=us-west-2
 ARG SCCACHE_S3_NO_CREDENTIALS=0
@ -121,10 +207,11 @@ RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,source=.git,target=.git \
    if [ "$USE_SCCACHE" = "1" ]; then \
        echo "Installing sccache..." \
-        && curl -L -o sccache.tar.gz https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-x86_64-unknown-linux-musl.tar.gz \
+        && curl -L -o sccache.tar.gz ${SCCACHE_DOWNLOAD_URL} \
        && tar -xzf sccache.tar.gz \
        && sudo mv sccache-v0.8.1-x86_64-unknown-linux-musl/sccache /usr/bin/sccache \
        && rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \
+        && if [ ! -z ${SCCACHE_ENDPOINT} ] ; then export SCCACHE_ENDPOINT=${SCCACHE_ENDPOINT} ; fi \
        && export SCCACHE_BUCKET=${SCCACHE_BUCKET_NAME} \
        && export SCCACHE_REGION=${SCCACHE_REGION_NAME} \
        && export SCCACHE_S3_NO_CREDENTIALS=${SCCACHE_S3_NO_CREDENTIALS} \
@ -162,6 +249,10 @@ RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \
 #################### DEV IMAGE ####################
 FROM base as dev

+ARG PIP_INDEX_URL UV_INDEX_URL
+ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
+ARG PYTORCH_CUDA_INDEX_BASE_URL
+
 # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
 # Reference: https://github.com/astral-sh/uv/pull/1694
 ENV UV_HTTP_TIMEOUT=500
@ -176,21 +267,25 @@ COPY requirements/test.txt requirements/test.txt
 COPY requirements/dev.txt requirements/dev.txt
 RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system -r requirements/dev.txt \
-    --extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
+    --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
 #################### DEV IMAGE ####################

 #################### vLLM installation IMAGE ####################
 # image with vLLM installed
 # TODO: Restore to base image after FlashInfer AOT wheel fixed
-FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS vllm-base
-ARG CUDA_VERSION=12.8.1
-ARG PYTHON_VERSION=3.12
+FROM ${FINAL_BASE_IMAGE} AS vllm-base
+ARG CUDA_VERSION
+ARG PYTHON_VERSION
 WORKDIR /vllm-workspace
 ENV DEBIAN_FRONTEND=noninteractive
 ARG TARGETPLATFORM

 SHELL ["/bin/bash", "-c"]

+ARG DEADSNAKES_MIRROR_URL
+ARG DEADSNAKES_GPGKEY_URL
+ARG GET_PIP_URL
+
 RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
    echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment

@ -200,17 +295,33 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
    && apt-get update -y \
    && apt-get install -y ccache software-properties-common git curl wget sudo vim python3-pip \
    && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
-    && for i in 1 2 3; do \
-        add-apt-repository -y ppa:deadsnakes/ppa && break || \
-        { echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \
-    done \
+    && if [ ! -z ${DEADSNAKES_MIRROR_URL} ] ; then \
+        if [ ! -z "${DEADSNAKES_GPGKEY_URL}" ] ; then \
+            mkdir -p -m 0755 /etc/apt/keyrings ; \
+            curl -L ${DEADSNAKES_GPGKEY_URL} | gpg --dearmor > /etc/apt/keyrings/deadsnakes.gpg ; \
+            sudo chmod 644 /etc/apt/keyrings/deadsnakes.gpg ; \
+            echo "deb [signed-by=/etc/apt/keyrings/deadsnakes.gpg] ${DEADSNAKES_MIRROR_URL} $(lsb_release -cs) main" > /etc/apt/sources.list.d/deadsnakes.list ; \
+        fi ; \
+    else \
+        for i in 1 2 3; do \
+            add-apt-repository -y ppa:deadsnakes/ppa && break || \
+            { echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \
+        done ; \
+    fi \
    && apt-get update -y \
    && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \
    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
    && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
    && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
-    && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
+    && curl -sS ${GET_PIP_URL} | python${PYTHON_VERSION} \
    && python3 --version && python3 -m pip --version
+
+ARG PIP_INDEX_URL UV_INDEX_URL
+ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
+ARG PYTORCH_CUDA_INDEX_BASE_URL
+ARG PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL
+ARG PIP_KEYRING_PROVIDER UV_KEYRING_PROVIDER
+
 # Install uv for faster pip installs
 RUN --mount=type=cache,target=/root/.cache/uv \
    python3 -m pip install uv
@ -232,19 +343,23 @@ RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
 # after this step
 RUN --mount=type=cache,target=/root/.cache/uv \
    if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
-        uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu128 "torch==2.8.0.dev20250318+cu128" "torchvision==0.22.0.dev20250319";  \
-        uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu128 --pre pytorch_triton==3.3.0+gitab727c40; \
+        uv pip install --system \
+            --index-url ${PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
+            "torch==2.8.0.dev20250318+cu128" "torchvision==0.22.0.dev20250319" ; \
+        uv pip install --system \
+            --index-url ${PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
+            --pre pytorch_triton==3.3.0+gitab727c40 ; \
    fi

 # Install vllm wheel first, so that torch etc will be installed.
 RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
    --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system dist/*.whl --verbose \
-    --extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
+        --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')

 # If we need to build FlashInfer wheel before its release:
 # $ # Note we remove 7.0 from the arch list compared to the list below, since FlashInfer only supports sm75+
-# $ export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0a 10.0a'
+# $ export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0a 10.0a 12.0'
 # $ git clone https://github.com/flashinfer-ai/flashinfer.git --recursive
 # $ cd flashinfer
 # $ git checkout v0.2.6.post1
@ -254,15 +369,20 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
 # -rw-rw-r-- 1 mgoin mgoin 205M Jun  9 18:03 flashinfer_python-0.2.6.post1-cp39-abi3-linux_x86_64.whl
 # $ # upload the wheel to a public location, e.g. https://wheels.vllm.ai/flashinfer/v0.2.6.post1/flashinfer_python-0.2.6.post1-cp39-abi3-linux_x86_64.whl

+# Allow specifying a version, Git revision or local .whl file
+ARG FLASHINFER_CUDA128_INDEX_URL="https://download.pytorch.org/whl/cu128/flashinfer"
+ARG FLASHINFER_CUDA128_WHEEL="flashinfer_python-0.2.6.post1%2Bcu128torch2.7-cp39-abi3-linux_x86_64.whl"
+ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
+ARG FLASHINFER_GIT_REF="v0.2.6.post1"
 RUN --mount=type=cache,target=/root/.cache/uv \
 . /etc/environment && \
 if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \
    # FlashInfer already has a wheel for PyTorch 2.7.0 and CUDA 12.8. This is enough for CI use
    if [[ "$CUDA_VERSION" == 12.8* ]]; then \
-        uv pip install --system https://download.pytorch.org/whl/cu128/flashinfer/flashinfer_python-0.2.6.post1%2Bcu128torch2.7-cp39-abi3-linux_x86_64.whl; \
+        uv pip install --system ${FLASHINFER_CUDA128_INDEX_URL}/${FLASHINFER_CUDA128_WHEEL} ; \
    else \
-        export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0a 10.0a' && \
-        git clone https://github.com/flashinfer-ai/flashinfer.git --single-branch --branch v0.2.6.post1 --recursive && \
+        export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0a 10.0a 12.0' && \
+        git clone ${FLASHINFER_GIT_REPO} --single-branch --branch ${FLASHINFER_GIT_REF} --recursive && \
        # Needed to build AOT kernels
        (cd flashinfer && \
            python3 -m flashinfer.aot && \
@ -286,7 +406,7 @@ uv pip list
 COPY requirements/build.txt requirements/build.txt
 RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system -r requirements/build.txt \
-    --extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
+        --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')

 #################### vLLM installation IMAGE ####################

@ -297,6 +417,11 @@ FROM vllm-base AS test

 ADD . /vllm-workspace/

+ARG PYTHON_VERSION
+
+ARG PIP_INDEX_URL UV_INDEX_URL
+ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
+
 # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
 # Reference: https://github.com/astral-sh/uv/pull/1694
 ENV UV_HTTP_TIMEOUT=500
@ -307,7 +432,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.4"

 # install development dependencies (for testing)
-RUN --mount=type=cache,target=/root/.cache/uv \   
+RUN --mount=type=cache,target=/root/.cache/uv \
    CUDA_MAJOR="${CUDA_VERSION%%.*}"; \
    if [ "$CUDA_MAJOR" -ge 12 ]; then \
        uv pip install --system -r requirements/dev.txt; \
@ -323,7 +448,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 ENV HF_HUB_ENABLE_HF_TRANSFER 1

 # Copy in the v1 package for testing (it isn't distributed yet)
-COPY vllm/v1 /usr/local/lib/python3.12/dist-packages/vllm/v1
+COPY vllm/v1 /usr/local/lib/python${PYTHON_VERSION}/dist-packages/vllm/v1

 # doc requires source code
 # we hide them inside `test_docs/` , so that this source code
@ -340,6 +465,9 @@ RUN mv mkdocs.yaml test_docs/
 FROM vllm-base AS vllm-openai-base
 ARG TARGETPLATFORM

+ARG PIP_INDEX_URL UV_INDEX_URL
+ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
+
 # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
 # Reference: https://github.com/astral-sh/uv/pull/1694
 ENV UV_HTTP_TIMEOUT=500
--- a/docker/Dockerfile.cpu
+++ b/docker/Dockerfile.cpu
@ -66,7 +66,7 @@ ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
 WORKDIR /workspace/vllm

 RUN --mount=type=cache,target=/root/.cache/uv \
-    --mount=type=bind,src=requirements/build.txt,target=requirements/build.txt \
+    --mount=type=bind,src=requirements/cpu-build.txt,target=requirements/build.txt \
    uv pip install -r requirements/build.txt

 COPY . .
@ -79,6 +79,22 @@ RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,source=.git,target=.git \
    VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel 

+######################### TEST DEPS #########################
+FROM base AS vllm-test-deps
+
+WORKDIR /workspace/vllm
+
+RUN --mount=type=bind,src=requirements/test.in,target=requirements/test.in \
+    cp requirements/test.in requirements/cpu-test.in && \
+    sed -i '/mamba_ssm/d' requirements/cpu-test.in && \
+    sed -i 's/torch==.*/torch==2.6.0/g' requirements/cpu-test.in && \
+    sed -i 's/torchaudio.*/torchaudio/g' requirements/cpu-test.in && \
+    sed -i 's/torchvision.*/torchvision/g' requirements/cpu-test.in && \
+    uv pip compile requirements/cpu-test.in -o requirements/cpu-test.txt --index-strategy unsafe-best-match --torch-backend cpu
+
+RUN --mount=type=cache,target=/root/.cache/uv \
+    uv pip install -r requirements/cpu-test.txt 
+
 ######################### DEV IMAGE #########################
 FROM vllm-build AS vllm-dev

@ -97,28 +113,19 @@ RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,source=.git,target=.git \
    VLLM_TARGET_DEVICE=cpu python3 setup.py develop 

+COPY --from=vllm-test-deps /workspace/vllm/requirements/cpu-test.txt requirements/test.txt
+
 RUN --mount=type=cache,target=/root/.cache/uv \
-    --mount=type=bind,src=requirements/test.in,target=requirements/test.in \
-    cp requirements/test.in requirements/test-cpu.in && \
-    sed -i '/mamba_ssm/d' requirements/test-cpu.in && \
-    uv pip compile requirements/test-cpu.in -o requirements/test.txt && \
    uv pip install -r requirements/dev.txt && \
    pre-commit install --hook-type pre-commit --hook-type commit-msg

 ENTRYPOINT ["bash"]

 ######################### TEST IMAGE #########################
-FROM base AS vllm-test
+FROM vllm-test-deps AS vllm-test

 WORKDIR /workspace/

-RUN --mount=type=cache,target=/root/.cache/uv \
-    --mount=type=bind,src=requirements/test.in,target=requirements/test.in \
-    cp requirements/test.in requirements/test-cpu.in && \
-    sed -i '/mamba_ssm/d' requirements/test-cpu.in && \
-    uv pip compile requirements/test-cpu.in -o requirements/cpu-test.txt && \
-    uv pip install -r requirements/cpu-test.txt
-
 RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,from=vllm-build,src=/workspace/vllm/dist,target=dist \
    uv pip install dist/*.whl
--- a/docker/Dockerfile.xpu
+++ b/docker/Dockerfile.xpu
@ -35,6 +35,7 @@ RUN --mount=type=bind,source=.git,target=.git \
    if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh; fi

 ENV VLLM_TARGET_DEVICE=xpu
+ENV VLLM_WORKER_MULTIPROC_METHOD=spawn

 RUN --mount=type=cache,target=/root/.cache/pip \
    --mount=type=bind,source=.git,target=.git \
--- a/docs/.nav.yml
+++ b/docs/.nav.yml
@ -48,7 +48,12 @@ nav:
    - General:
      - glob: contributing/*
        flatten_single_child_sections: true
-    - Model Implementation: contributing/model
+    - Model Implementation: 
+      - contributing/model/README.md
+      - contributing/model/basic.md
+      - contributing/model/registration.md
+      - contributing/model/tests.md
+      - contributing/model/multimodal.md
    - Design Documents:
      - V0: design
      - V1: design/v1
--- a/docs/README.md
+++ b/docs/README.md
@ -40,7 +40,7 @@ vLLM is flexible and easy to use with:
 - OpenAI-compatible API server
 - Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs, Gaudi® accelerators and GPUs, IBM Power CPUs, TPU, and AWS Trainium and Inferentia Accelerators.
 - Prefix caching support
- Multi-lora support
+- Multi-LoRA support

 For more information, check out the following:

--- a/docs/contributing/README.md
+++ b/docs/contributing/README.md
@ -29,6 +29,8 @@ See <gh-file:LICENSE>.
 Depending on the kind of development you'd like to do (e.g. Python, CUDA), you can choose to build vLLM with or without compilation.
 Check out the [building from source][build-from-source] documentation for details.

+For an optimized workflow when iterating on C++/CUDA kernels, see the [Incremental Compilation Workflow](./incremental_build.md) for recommendations.
+
 ### Building the docs with MkDocs

 #### Introduction to MkDocs
@ -149,6 +151,14 @@ the terms of the DCO.

 Using `-s` with `git commit` will automatically add this header.

+!!! tip
+    You can enable automatic sign-off via your IDE:
+  
+    - **PyCharm**: Click on the `Show Commit Options` icon to the right of the `Commit and Push...` button in the `Commit` window.
+      It will bring up a `git` window where you can modify the `Author` and enable `Sign-off commit`.
+    - **VSCode**: Open the [Settings editor](https://code.visualstudio.com/docs/configure/settings)
+      and enable the `Git: Always Sign Off` (`git.alwaysSignOff`) field.
+
 ### PR Title and Classification

 Only specific types of PRs will be reviewed. The PR title is prefixed
@ -188,6 +198,7 @@ The PR needs to meet the following code quality standards:

 ### Adding or Changing Kernels

+When actively developing or modifying kernels, using the [Incremental Compilation Workflow](./incremental_build.md) is highly recommended for faster build times.
 Each custom kernel needs a schema and one or more implementations to be registered with PyTorch.

 - Make sure custom ops are registered following PyTorch guidelines:
--- a/docs/contributing/incremental_build.md
+++ b/docs/contributing/incremental_build.md
@ -0,0 +1,138 @@
+# Incremental Compilation Workflow
+
+When working on vLLM's C++/CUDA kernels located in the `csrc/` directory, recompiling the entire project with `uv pip install -e .` for every change can be time-consuming. An incremental compilation workflow using CMake allows for faster iteration by only recompiling the necessary components after an initial setup. This guide details how to set up and use such a workflow, which complements your editable Python installation.
+
+## Prerequisites
+
+Before setting up the incremental build:
+
+1. **vLLM Editable Install:** Ensure you have vLLM installed from source in an editable mode. Using pre-compiled wheels for the initial editable setup can be faster, as the CMake workflow will handle subsequent kernel recompilations.
+
+    ```console
+    uv venv --python 3.12 --seed
+    source .venv/bin/activate
+    VLLM_USE_PRECOMPILED=1 uv pip install -U -e . --torch-backend=auto
+    ```
+
+2. **CUDA Toolkit:** Verify that the NVIDIA CUDA Toolkit is correctly installed and `nvcc` is accessible in your `PATH`. CMake relies on `nvcc` to compile CUDA code. You can typically find `nvcc` in `$CUDA_HOME/bin/nvcc` or by running `which nvcc`. If you encounter issues, refer to the [official CUDA Toolkit installation guides](https://developer.nvidia.com/cuda-toolkit-archive) and vLLM's main [GPU installation documentation](../getting_started/installation/gpu/cuda.inc.md#troubleshooting) for troubleshooting. The `CMAKE_CUDA_COMPILER` variable in your `CMakeUserPresets.json` should also point to your `nvcc` binary.
+
+3. **Build Tools:** It is highly recommended to install `ccache` for fast rebuilds by caching compilation results (e.g., `sudo apt install ccache` or `conda install ccache`). Also, ensure the core build dependencies like `cmake` and `ninja` are installed. These are installable through `requirements/build.txt` or your system's package manager.
+
+    ```console
+    uv pip install -r requirements/build.txt --torch-backend=auto
+    ```
+
+## Setting up the CMake Build Environment
+
+The incremental build process is managed through CMake. You can configure your build settings using a `CMakeUserPresets.json` file at the root of the vLLM repository.
+
+### Generate `CMakeUserPresets.json` using the helper script
+
+To simplify the setup, vLLM provides a helper script that attempts to auto-detect your system's configuration (like CUDA path, Python environment, and CPU cores) and generates the `CMakeUserPresets.json` file for you.
+
+**Run the script:**
+
+Navigate to the root of your vLLM clone and execute the following command:
+
+```console
+python tools/generate_cmake_presets.py
+```
+
+The script will prompt you if it cannot automatically determine certain paths (e.g., `nvcc` or a specific Python executable for your vLLM development environment). Follow the on-screen prompts. If an existing `CMakeUserPresets.json` is found, the script will ask for confirmation before overwriting it.
+
+After running the script, a `CMakeUserPresets.json` file will be created in the root of your vLLM repository.
+
+### Example `CMakeUserPresets.json`
+
+Below is an example of what the generated `CMakeUserPresets.json` might look like. The script will tailor these values based on your system and any input you provide.
+
+```json
+{
+    "version": 6,
+    "cmakeMinimumRequired": {
+        "major": 3,
+        "minor": 26,
+        "patch": 1
+    },
+    "configurePresets": [
+        {
+            "name": "release",
+            "generator": "Ninja",
+            "binaryDir": "${sourceDir}/cmake-build-release",
+            "cacheVariables": {
+                "CMAKE_CUDA_COMPILER": "/usr/local/cuda/bin/nvcc",
+                "CMAKE_C_COMPILER_LAUNCHER": "ccache",
+                "CMAKE_CXX_COMPILER_LAUNCHER": "ccache",
+                "CMAKE_CUDA_COMPILER_LAUNCHER": "ccache",
+                "CMAKE_BUILD_TYPE": "Release",
+                "VLLM_PYTHON_EXECUTABLE": "/home/user/venvs/vllm/bin/python",
+                "CMAKE_INSTALL_PREFIX": "${sourceDir}",
+                "CMAKE_CUDA_FLAGS": "",
+                "NVCC_THREADS": "4",
+                "CMAKE_JOB_POOLS": "compile=32"
+            }
+        }
+    ],
+    "buildPresets": [
+        {
+            "name": "release",
+            "configurePreset": "release",
+            "jobs": 32
+        }
+    ]
+}
+```
+
+**What do the various configurations mean?**
+- `CMAKE_CUDA_COMPILER`: Path to your `nvcc` binary. The script attempts to find this automatically.
+- `CMAKE_C_COMPILER_LAUNCHER`, `CMAKE_CXX_COMPILER_LAUNCHER`, `CMAKE_CUDA_COMPILER_LAUNCHER`: Setting these to `ccache` (or `sccache`) significantly speeds up rebuilds by caching compilation results. Ensure `ccache` is installed (e.g., `sudo apt install ccache` or `conda install ccache`). The script sets these by default.
+- `VLLM_PYTHON_EXECUTABLE`: Path to the Python executable in your vLLM development environment. The script will prompt for this, defaulting to the current Python environment if suitable.
+- `CMAKE_INSTALL_PREFIX: "${sourceDir}"`: Specifies that the compiled components should be installed back into your vLLM source directory. This is crucial for the editable install, as it makes the newly built kernels immediately available to your Python environment.
+- `CMAKE_JOB_POOLS` and `jobs` in build presets: Control the parallelism of the build. The script sets these based on the number of CPU cores detected on your system.
+- `binaryDir`: Specifies where the build artifacts will be stored (e.g., `cmake-build-release`).
+
+## Building and Installing with CMake
+
+Once your `CMakeUserPresets.json` is configured:
+
+1. **Initialize the CMake build environment:**
+   This step configures the build system according to your chosen preset (e.g., `release`) and creates the build directory at `binaryDir`
+
+   ```console
+   cmake --preset release
+   ```
+
+2. **Build and install the vLLM components:**
+   This command compiles the code and installs the resulting binaries into your vLLM source directory, making them available to your editable Python installation.
+
+   ```console
+   cmake --build --preset release --target install
+   ```
+
+3. **Make changes and repeat!**
+    Now you start using your editable install of vLLM, testing and making changes as needed. If you need to build again to update based on changes, simply run the CMake command again to build only the affected files.
+
+    ```console
+    cmake --build --preset release --target install
+    ```
+
+## Verifying the Build
+
+After a successful build, you will find a populated build directory (e.g., `cmake-build-release/` if you used the `release` preset and the example configuration).
+
+```console
+> ls cmake-build-release/
+bin             cmake_install.cmake      _deps                                machete_generation.log
+build.ninja     CPackConfig.cmake        detect_cuda_compute_capabilities.cu  marlin_generation.log
+_C.abi3.so      CPackSourceConfig.cmake  detect_cuda_version.cc               _moe_C.abi3.so
+CMakeCache.txt  ctest                    _flashmla_C.abi3.so                  moe_marlin_generation.log
+CMakeFiles      cumem_allocator.abi3.so  install_local_manifest.txt           vllm-flash-attn
+```
+
+The `cmake --build ... --target install` command copies the compiled shared libraries (like `_C.abi3.so`, `_moe_C.abi3.so`, etc.) into the appropriate `vllm` package directory within your source tree. This updates your editable installation with the newly compiled kernels.
+
+## Additional Tips
+
+- **Adjust Parallelism:** Fine-tune the `CMAKE_JOB_POOLS` in `configurePresets` and `jobs` in `buildPresets` in your `CMakeUserPresets.json`. Too many jobs can overload systems with limited RAM or CPU cores, leading to slower builds or system instability. Too few won't fully utilize available resources.
+- **Clean Builds When Necessary:** If you encounter persistent or strange build errors, especially after significant changes or switching branches, consider removing the CMake build directory (e.g., `rm -rf cmake-build-release`) and re-running the `cmake --preset` and `cmake --build` commands.
+- **Specific Target Builds:** For even faster iterations when working on a specific module, you can sometimes build a specific target instead of the full `install` target, though `install` ensures all necessary components are updated in your Python environment. Refer to CMake documentation for more advanced target management.
--- a/docs/contributing/model/README.md
+++ b/docs/contributing/model/README.md
@ -1,21 +1,23 @@
 ---
-title: Adding a New Model
+title: Summary
 ---
 [](){ #new-model }

-This section provides more information on how to integrate a [PyTorch](https://pytorch.org/) model into vLLM.
+!!! important
+    Many decoder language models can now be automatically loaded using the [Transformers backend][transformers-backend] without having to implement them in vLLM. See if `vllm serve <model>` works first!

-Contents:
+vLLM models are specialized [PyTorch](https://pytorch.org/) models that take advantage of various [features][compatibility-matrix] to optimize their performance.

- [Basic](basic.md)
- [Registration](registration.md)
- [Tests](tests.md)
- [Multimodal](multimodal.md)
+The complexity of integrating a model into vLLM depends heavily on the model's architecture.
+The process is considerably straightforward if the model shares a similar architecture with an existing model in vLLM.
+However, this can be more complex for models that include new operators (e.g., a new attention mechanism).

-!!! note
-    The complexity of adding a new model depends heavily on the model's architecture.
-    The process is considerably straightforward if the model shares a similar architecture with an existing model in vLLM.
-    However, for models that include new operators (e.g., a new attention mechanism), the process can be a bit more complex.
+Read through these pages for a step-by-step guide:
+
+- [Basic Model](basic.md)
+- [Registering a Model](registration.md)
+- [Unit Testing](tests.md)
+- [Multi-Modal Support](multimodal.md)

 !!! tip
    If you are encountering issues while integrating your model into vLLM, feel free to open a [GitHub issue](https://github.com/vllm-project/vllm/issues)
--- a/docs/contributing/model/basic.md
+++ b/docs/contributing/model/basic.md
@ -1,5 +1,5 @@
 ---
-title: Implementing a Basic Model
+title: Basic Model
 ---
 [](){ #new-model-basic }

--- a/docs/contributing/model/registration.md
+++ b/docs/contributing/model/registration.md
@ -1,5 +1,5 @@
 ---
-title: Registering a Model to vLLM
+title: Registering a Model
 ---
 [](){ #new-model-registration }

--- a/docs/contributing/model/tests.md
+++ b/docs/contributing/model/tests.md
@ -1,5 +1,5 @@
 ---
-title: Writing Unit Tests
+title: Unit Testing
 ---
 [](){ #new-model-tests }

--- a/docs/deployment/frameworks/helm.md
+++ b/docs/deployment/frameworks/helm.md
@ -5,9 +5,9 @@ title: Helm

 A Helm chart to deploy vLLM for Kubernetes

-Helm is a package manager for Kubernetes. It will help you to deploy vLLM on k8s and automate the deployment of vLLM Kubernetes applications. With Helm, you can deploy the same framework architecture with different configurations to multiple namespaces by overriding variable values.
+Helm is a package manager for Kubernetes. It helps automate the deployment of vLLM applications on Kubernetes. With Helm, you can deploy the same framework architecture with different configurations to multiple namespaces by overriding variable values.

-This guide will walk you through the process of deploying vLLM with Helm, including the necessary prerequisites, steps for helm installation and documentation on architecture and values file.
+This guide will walk you through the process of deploying vLLM with Helm, including the necessary prerequisites, steps for Helm installation and documentation on architecture and values file.

 ## Prerequisites

@ -16,17 +16,23 @@ Before you begin, ensure that you have the following:
 - A running Kubernetes cluster
 - NVIDIA Kubernetes Device Plugin (`k8s-device-plugin`): This can be found at [https://github.com/NVIDIA/k8s-device-plugin](https://github.com/NVIDIA/k8s-device-plugin)
 - Available GPU resources in your cluster
- S3 with the model which will be deployed
+- An S3 with the model which will be deployed

 ## Installing the chart

 To install the chart with the release name `test-vllm`:

 ```bash
-helm upgrade --install --create-namespace --namespace=ns-vllm test-vllm . -f values.yaml --set secrets.s3endpoint=$ACCESS_POINT --set secrets.s3bucketname=$BUCKET --set secrets.s3accesskeyid=$ACCESS_KEY --set secrets.s3accesskey=$SECRET_KEY
+helm upgrade --install --create-namespace \
+  --namespace=ns-vllm test-vllm . \
+  -f values.yaml \
+  --set secrets.s3endpoint=$ACCESS_POINT \
+  --set secrets.s3bucketname=$BUCKET \
+  --set secrets.s3accesskeyid=$ACCESS_KEY \
+  --set secrets.s3accesskey=$SECRET_KEY
 ```

-## Uninstalling the Chart
+## Uninstalling the chart

 To uninstall the `test-vllm` deployment:

@ -39,57 +45,59 @@ chart **including persistent volumes** and deletes the release.

 ## Architecture

-![](../../assets/deployment/architecture_helm_deployment.png)
+![helm deployment architecture](../../assets/deployment/architecture_helm_deployment.png)

 ## Values

-| Key                                        | Type    | Default                                                                                                                                                  | Description                                                                                                                               |
-|--------------------------------------------|---------|----------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------|
-| autoscaling                                | object  | {"enabled":false,"maxReplicas":100,"minReplicas":1,"targetCPUUtilizationPercentage":80}                                                                  | Autoscaling configuration                                                                                                                 |
-| autoscaling.enabled                        | bool    | false                                                                                                                                                    | Enable autoscaling                                                                                                                        |
-| autoscaling.maxReplicas                    | int     | 100                                                                                                                                                      | Maximum replicas                                                                                                                          |
-| autoscaling.minReplicas                    | int     | 1                                                                                                                                                        | Minimum replicas                                                                                                                          |
-| autoscaling.targetCPUUtilizationPercentage | int     | 80                                                                                                                                                       | Target CPU utilization for autoscaling                                                                                                    |
-| configs                                    | object  | {}                                                                                                                                                       | Configmap                                                                                                                                 |
-| containerPort                              | int     | 8000                                                                                                                                                     | Container port                                                                                                                            |
-| customObjects                              | list    | []                                                                                                                                                       | Custom Objects configuration                                                                                                              |
-| deploymentStrategy                         | object  | {}                                                                                                                                                       | Deployment strategy configuration                                                                                                         |
-| externalConfigs                            | list    | []                                                                                                                                                       | External configuration                                                                                                                    |
-| extraContainers                            | list    | []                                                                                                                                                       | Additional containers configuration                                                                                                       |
-| extraInit                                  | object  | {"pvcStorage":"1Gi","s3modelpath":"relative_s3_model_path/opt-125m", "awsEc2MetadataDisabled": true}                                                     | Additional configuration for the init container                                                                                           |
-| extraInit.pvcStorage                       | string  | "50Gi"                                                                                                                                                   | Storage size of the s3                                                                                                                    |
-| extraInit.s3modelpath                      | string  | "relative_s3_model_path/opt-125m"                                                                                                                        | Path of the model on the s3 which hosts model weights and config files                                                                    |
-| extraInit.awsEc2MetadataDisabled           | boolean | true                                                                                                                                                     | Disables the use of the Amazon EC2 instance metadata service                                                                              |
-| extraPorts                                 | list    | []                                                                                                                                                       | Additional ports configuration                                                                                                            |
-| gpuModels                                  | list    | ["TYPE_GPU_USED"]                                                                                                                                        | Type of gpu used                                                                                                                          |
-| image                                      | object  | {"command":["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"],"repository":"vllm/vllm-openai","tag":"latest"} | Image configuration                                                                                                                       |
-| image.command                              | list    | ["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"]                                                            | Container launch command                                                                                                                  |
-| image.repository                           | string  | "vllm/vllm-openai"                                                                                                                                       | Image repository                                                                                                                          |
-| image.tag                                  | string  | "latest"                                                                                                                                                 | Image tag                                                                                                                                 |
-| livenessProbe                              | object  | {"failureThreshold":3,"httpGet":{"path":"/health","port":8000},"initialDelaySeconds":15,"periodSeconds":10}                                              | Liveness probe configuration                                                                                                              |
-| livenessProbe.failureThreshold             | int     | 3                                                                                                                                                        | Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not alive |
-| livenessProbe.httpGet                      | object  | {"path":"/health","port":8000}                                                                                                                           | Configuration of the Kubelet http request on the server                                                                                   |
-| livenessProbe.httpGet.path                 | string  | "/health"                                                                                                                                                | Path to access on the HTTP server                                                                                                         |
-| livenessProbe.httpGet.port                 | int     | 8000                                                                                                                                                     | Name or number of the port to access on the container, on which the server is listening                                                   |
-| livenessProbe.initialDelaySeconds          | int     | 15                                                                                                                                                       | Number of seconds after the container has started before liveness probe is initiated                                                      |
-| livenessProbe.periodSeconds                | int     | 10                                                                                                                                                       | How often (in seconds) to perform the liveness probe                                                                                      |
-| maxUnavailablePodDisruptionBudget          | string  | ""                                                                                                                                                       | Disruption Budget Configuration                                                                                                           |
-| readinessProbe                             | object  | {"failureThreshold":3,"httpGet":{"path":"/health","port":8000},"initialDelaySeconds":5,"periodSeconds":5}                                                | Readiness probe configuration                                                                                                             |
-| readinessProbe.failureThreshold            | int     | 3                                                                                                                                                        | Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not ready |
-| readinessProbe.httpGet                     | object  | {"path":"/health","port":8000}                                                                                                                           | Configuration of the Kubelet http request on the server                                                                                   |
-| readinessProbe.httpGet.path                | string  | "/health"                                                                                                                                                | Path to access on the HTTP server                                                                                                         |
-| readinessProbe.httpGet.port                | int     | 8000                                                                                                                                                     | Name or number of the port to access on the container, on which the server is listening                                                   |
-| readinessProbe.initialDelaySeconds         | int     | 5                                                                                                                                                        | Number of seconds after the container has started before readiness probe is initiated                                                     |
-| readinessProbe.periodSeconds               | int     | 5                                                                                                                                                        | How often (in seconds) to perform the readiness probe                                                                                     |
-| replicaCount                               | int     | 1                                                                                                                                                        | Number of replicas                                                                                                                        |
-| resources                                  | object  | {"limits":{"cpu":4,"memory":"16Gi","nvidia.com/gpu":1},"requests":{"cpu":4,"memory":"16Gi","nvidia.com/gpu":1}}                                          | Resource configuration                                                                                                                    |
-| resources.limits."nvidia.com/gpu"          | int     | 1                                                                                                                                                        | Number of gpus used                                                                                                                       |
-| resources.limits.cpu                       | int     | 4                                                                                                                                                        | Number of CPUs                                                                                                                            |
-| resources.limits.memory                    | string  | "16Gi"                                                                                                                                                   | CPU memory configuration                                                                                                                  |
-| resources.requests."nvidia.com/gpu"        | int     | 1                                                                                                                                                        | Number of gpus used                                                                                                                       |
-| resources.requests.cpu                     | int     | 4                                                                                                                                                        | Number of CPUs                                                                                                                            |
-| resources.requests.memory                  | string  | "16Gi"                                                                                                                                                   | CPU memory configuration                                                                                                                  |
-| secrets                                    | object  | {}                                                                                                                                                       | Secrets configuration                                                                                                                     |
-| serviceName                                | string  | Service name                                                                                                                                             |                                                                                                                                           |
-| servicePort                                | int     | 80                                                                                                                                                       | Service port                                                                                                                              |
-| labels.environment                         | string  | test                                                                                                                                                     | Environment name                                                                                                                          |
+The following table describes configurable parameters of the chart in `values.yaml`:
+
+| Key | Type | Default | Description |
+|-----|------|---------|-------------|
+| autoscaling | object | {"enabled":false,"maxReplicas":100,"minReplicas":1,"targetCPUUtilizationPercentage":80} | Autoscaling configuration |
+| autoscaling.enabled | bool | false | Enable autoscaling |
+| autoscaling.maxReplicas | int | 100 | Maximum replicas |
+| autoscaling.minReplicas | int | 1 | Minimum replicas |
+| autoscaling.targetCPUUtilizationPercentage | int | 80 | Target CPU utilization for autoscaling |
+| configs | object | {} | Configmap |
+| containerPort | int | 8000 | Container port |
+| customObjects | list | [] | Custom Objects configuration |
+| deploymentStrategy | object | {} | Deployment strategy configuration |
+| externalConfigs | list | [] | External configuration |
+| extraContainers | list | [] | Additional containers configuration |
+| extraInit | object | {"pvcStorage":"1Gi","s3modelpath":"relative_s3_model_path/opt-125m", "awsEc2MetadataDisabled": true} | Additional configuration for the init container |
+| extraInit.pvcStorage | string | "1Gi" | Storage size of the s3 |
+| extraInit.s3modelpath | string | "relative_s3_model_path/opt-125m" | Path of the model on the s3 which hosts model weights and config files |
+| extraInit.awsEc2MetadataDisabled | boolean | true | Disables the use of the Amazon EC2 instance metadata service |
+| extraPorts | list | [] | Additional ports configuration |
+| gpuModels | list | ["TYPE_GPU_USED"] | Type of gpu used |
+| image | object | {"command":["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"],"repository":"vllm/vllm-openai","tag":"latest"} | Image configuration |
+| image.command | list | ["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"] | Container launch command |
+| image.repository | string | "vllm/vllm-openai" | Image repository |
+| image.tag | string | "latest" | Image tag |
+| livenessProbe | object | {"failureThreshold":3,"httpGet":{"path":"/health","port":8000},"initialDelaySeconds":15,"periodSeconds":10} | Liveness probe configuration |
+| livenessProbe.failureThreshold | int | 3 | Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not alive |
+| livenessProbe.httpGet | object | {"path":"/health","port":8000} | Configuration of the kubelet http request on the server |
+| livenessProbe.httpGet.path | string | "/health" | Path to access on the HTTP server |
+| livenessProbe.httpGet.port | int | 8000 | Name or number of the port to access on the container, on which the server is listening |
+| livenessProbe.initialDelaySeconds | int | 15 | Number of seconds after the container has started before liveness probe is initiated |
+| livenessProbe.periodSeconds | int | 10 | How often (in seconds) to perform the liveness probe |
+| maxUnavailablePodDisruptionBudget | string | "" | Disruption Budget Configuration |
+| readinessProbe | object | {"failureThreshold":3,"httpGet":{"path":"/health","port":8000},"initialDelaySeconds":5,"periodSeconds":5} | Readiness probe configuration |
+| readinessProbe.failureThreshold | int | 3 | Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not ready |
+| readinessProbe.httpGet | object | {"path":"/health","port":8000} | Configuration of the kubelet http request on the server |
+| readinessProbe.httpGet.path | string | "/health" | Path to access on the HTTP server |
+| readinessProbe.httpGet.port | int | 8000 | Name or number of the port to access on the container, on which the server is listening |
+| readinessProbe.initialDelaySeconds | int | 5 | Number of seconds after the container has started before readiness probe is initiated |
+| readinessProbe.periodSeconds | int | 5 | How often (in seconds) to perform the readiness probe |
+| replicaCount | int | 1 | Number of replicas |
+| resources | object | {"limits":{"cpu":4,"memory":"16Gi","nvidia.com/gpu":1},"requests":{"cpu":4,"memory":"16Gi","nvidia.com/gpu":1}} | Resource configuration |
+| resources.limits."nvidia.com/gpu" | int | 1 | Number of GPUs used |
+| resources.limits.cpu | int | 4 | Number of CPUs |
+| resources.limits.memory | string | "16Gi" | CPU memory configuration |
+| resources.requests."nvidia.com/gpu" | int | 1 | Number of GPUs used |
+| resources.requests.cpu | int | 4 | Number of CPUs |
+| resources.requests.memory | string | "16Gi" | CPU memory configuration |
+| secrets | object | {} | Secrets configuration |
+| serviceName | string | "" | Service name |
+| servicePort | int | 80 | Service port |
+| labels.environment | string | test | Environment name |
--- a/docs/getting_started/installation/gpu/cuda.inc.md
+++ b/docs/getting_started/installation/gpu/cuda.inc.md
@ -151,6 +151,9 @@ pip install -e .
    [sccache](https://github.com/mozilla/sccache) works similarly to `ccache`, but has the capability to utilize caching in remote storage environments.
    The following environment variables can be set to configure the vLLM `sccache` remote: `SCCACHE_BUCKET=vllm-build-sccache SCCACHE_REGION=us-west-2 SCCACHE_S3_NO_CREDENTIALS=1`. We also recommend setting `SCCACHE_IDLE_TIMEOUT=0`.

+!!! note "Faster Kernel Development"
+    For frequent C++/CUDA kernel changes, after the initial `pip install -e .` setup, consider using the [Incremental Compilation Workflow](../../contributing/incremental_build.md) for significantly faster rebuilds of only the modified kernel code.
+
 ##### Use an existing PyTorch installation

 There are scenarios where the PyTorch dependency cannot be easily installed via pip, e.g.:
--- a/docs/getting_started/installation/gpu/xpu.inc.md
+++ b/docs/getting_started/installation/gpu/xpu.inc.md
@ -22,7 +22,7 @@ Currently, there are no pre-built XPU wheels.
 # --8<-- [end:pre-built-wheels]
 # --8<-- [start:build-wheel-from-source]

- First, install required driver and Intel OneAPI 2025.0 or later.
+- First, install required [driver](https://dgpu-docs.intel.com/driver/installation.html#installing-gpu-drivers) and [Intel OneAPI](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) 2025.0 or later.
 - Second, install Python packages for vLLM XPU backend building:

 ```bash
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@ -336,6 +336,7 @@ Specified using `--task generate`.
 | `GemmaForCausalLM`                                | Gemma                                               | `google/gemma-2b`, `google/gemma-1.1-2b-it`, etc.                                                                                                                            | ✅︎                     | ✅︎                          | ✅︎                     |
 | `Gemma2ForCausalLM`                               | Gemma 2                                             | `google/gemma-2-9b`, `google/gemma-2-27b`, etc.                                                                                                                              | ✅︎                     | ✅︎                          | ✅︎                     |
 | `Gemma3ForCausalLM`                               | Gemma 3                                             | `google/gemma-3-1b-it`, etc.                                                                                                                                                 | ✅︎                     | ✅︎                          | ✅︎                     |
+| `Gemma3nForConditionalGeneration`                  | Gemma 3n                                             | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc.                                                                                                                                                 |                      |                           | ✅︎                     |
 | `GlmForCausalLM`                                  | GLM-4                                               | `THUDM/glm-4-9b-chat-hf`, etc.                                                                                                                                               | ✅︎                     | ✅︎                          | ✅︎                     |
 | `Glm4ForCausalLM`                                 | GLM-4-0414                                          | `THUDM/GLM-4-32B-0414`, etc.                                                                                                                                                 | ✅︎                     | ✅︎                          | ✅︎                     |
 | `GPT2LMHeadModel`                                 | GPT-2                                               | `gpt2`, `gpt2-xl`, etc.                                                                                                                                                      |                        | ✅︎                          | ✅︎                     |
@ -392,6 +393,9 @@ Specified using `--task generate`.
 !!! note
    Currently, the ROCm version of vLLM supports Mistral and Mixtral only for context lengths up to 4096.

+!!! note
+    Only text inputs are currently supported for `Gemma3nForConditionalGeneration`. To use this model, please upgrade Hugging Face Transformers to version 4.53.0.
+
 ### Pooling Models

 See [this page](./pooling_models.md) for more information on how to use pooling models.
@ -427,7 +431,7 @@ Specified using `--task embed`.
    See [relevant issue on HF Transformers](https://github.com/huggingface/transformers/issues/34882).

 !!! note
-    `jinaai/jina-embeddings-v3` supports multiple tasks through lora, while vllm temporarily only supports text-matching tasks by merging lora weights.
+    `jinaai/jina-embeddings-v3` supports multiple tasks through LoRA, while vllm temporarily only supports text-matching tasks by merging LoRA weights.

 !!! note
    The second-generation GTE model (mGTE-TRM) is named `NewModel`. The name `NewModel` is too generic, you should set `--hf-overrides '{"architectures": ["GteNewModel"]}'` to specify the use of the `GteNewModel` architecture.
--- a/docs/serving/openai_compatible_server.md
+++ b/docs/serving/openai_compatible_server.md
@ -57,6 +57,8 @@ We currently support the following OpenAI APIs:
    - Only applicable to [embedding models](../models/pooling_models.md) (`--task embed`).
 - [Transcriptions API][transcriptions-api] (`/v1/audio/transcriptions`)
    - Only applicable to Automatic Speech Recognition (ASR) models (OpenAI Whisper) (`--task generate`).
+- [Translation API][translations-api] (`/v1/audio/translations`)
+    - Only applicable to Automatic Speech Recognition (ASR) models (OpenAI Whisper) (`--task generate`).

 In addition, we have the following custom APIs:

@ -144,11 +146,6 @@ completion = client.chat.completions.create(
 Only `X-Request-Id` HTTP request header is supported for now. It can be enabled
 with `--enable-request-id-headers`.

-> Note that enablement of the headers can impact performance significantly at high QPS
-> rates. We recommend implementing HTTP headers at the router level (e.g. via Istio),
-> rather than within the vLLM layer for this reason.
-> See [this PR](https://github.com/vllm-project/vllm/pull/11529) for more details.
-
 ??? Code

    ```python
@ -374,6 +371,34 @@ The following extra parameters are supported:
    ```python
    --8<-- "vllm/entrypoints/openai/protocol.py:transcription-extra-params"
    ```
+  
+[](){ #translations-api }
+
+### Translations API
+
+Our Translation API is compatible with [OpenAI's Translations API](https://platform.openai.com/docs/api-reference/audio/createTranslation);
+you can use the [official OpenAI Python client](https://github.com/openai/openai-python) to interact with it.
+Whisper models can translate audio from one of the 55 non-English supported languages into English.
+Please mind that the popular `openai/whisper-large-v3-turbo` model does not support translating.
+
+!!! note
+    To use the Translation API, please install with extra audio dependencies using `pip install vllm[audio]`.
+
+Code example: <gh-file:examples/online_serving/openai_translation_client.py>
+
+#### Extra Parameters
+
+The following [sampling parameters][sampling-params] are supported.
+
+```python
+--8<-- "vllm/entrypoints/openai/protocol.py:translation-sampling-params"
+```
+
+The following extra parameters are supported:
+
+```python
+--8<-- "vllm/entrypoints/openai/protocol.py:translation-extra-params"
+```

 [](){ #tokenizer-api }

--- a/examples/offline_inference/spec_decode.py
+++ b/examples/offline_inference/spec_decode.py
@ -39,6 +39,9 @@ def parse_args():
    parser.add_argument("--top-k", type=int, default=-1)
    parser.add_argument("--print-output", action="store_true")
    parser.add_argument("--output-len", type=int, default=256)
+    parser.add_argument("--model-dir", type=str, default=None)
+    parser.add_argument("--eagle-dir", type=str, default=None)
+    parser.add_argument("--max-model-len", type=int, default=2048)
    return parser.parse_args()


@ -46,9 +49,10 @@ def main():
    args = parse_args()
    args.endpoint_type = "openai-chat"

-    model_dir = "meta-llama/Llama-3.1-8B-Instruct"
+    model_dir = args.model_dir
+    if args.model_dir is None:
+        model_dir = "meta-llama/Llama-3.1-8B-Instruct"
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
-    max_model_len = 2048

    prompts = get_samples(args, tokenizer)
    # add_special_tokens is False to avoid adding bos twice when using chat templates
@ -57,16 +61,18 @@ def main():
    ]

    if args.method == "eagle" or args.method == "eagle3":
-        if args.method == "eagle":
+        eagle_dir = args.eagle_dir
+        if args.method == "eagle" and eagle_dir is None:
            eagle_dir = "yuhuili/EAGLE-LLaMA3.1-Instruct-8B"
-        elif args.method == "eagle3":
+
+        elif args.method == "eagle3" and eagle_dir is None:
            eagle_dir = "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B"
        speculative_config = {
            "method": args.method,
            "model": eagle_dir,
            "num_speculative_tokens": args.num_spec_tokens,
            "draft_tensor_parallel_size": args.draft_tp,
-            "max_model_len": max_model_len,
+            "max_model_len": args.max_model_len,
        }
    elif args.method == "ngram":
        speculative_config = {
@ -74,7 +80,7 @@ def main():
            "num_speculative_tokens": args.num_spec_tokens,
            "prompt_lookup_max": args.prompt_lookup_max,
            "prompt_lookup_min": args.prompt_lookup_min,
-            "max_model_len": max_model_len,
+            "max_model_len": args.max_model_len,
        }
    else:
        raise ValueError(f"unknown method: {args.method}")
@ -86,7 +92,7 @@ def main():
        enable_chunked_prefill=args.enable_chunked_prefill,
        max_num_batched_tokens=args.max_num_batched_tokens,
        enforce_eager=args.enforce_eager,
-        max_model_len=max_model_len,
+        max_model_len=args.max_model_len,
        max_num_seqs=args.max_num_seqs,
        gpu_memory_utilization=0.8,
        speculative_config=speculative_config,
--- a/examples/online_serving/openai_transcription_client.py
+++ b/examples/online_serving/openai_transcription_client.py
@ -26,23 +26,12 @@ from openai import OpenAI

 from vllm.assets.audio import AudioAsset

-mary_had_lamb = AudioAsset("mary_had_lamb").get_local_path()
-winning_call = AudioAsset("winning_call").get_local_path()

-# Modify OpenAI's API key and API base to use vLLM's API server.
-openai_api_key = "EMPTY"
-openai_api_base = "http://localhost:8000/v1"
-client = OpenAI(
-    api_key=openai_api_key,
-    base_url=openai_api_base,
-)
-
-
-def sync_openai():
+def sync_openai(audio_path: str, client: OpenAI):
    """
    Perform synchronous transcription using OpenAI-compatible API.
    """
-    with open(str(mary_had_lamb), "rb") as f:
+    with open(audio_path, "rb") as f:
        transcription = client.audio.transcriptions.create(
            file=f,
            model="openai/whisper-large-v3",
@ -58,8 +47,7 @@ def sync_openai():
        print("transcription result:", transcription.text)


-# OpenAI Transcription API client does not support streaming.
-async def stream_openai_response():
+async def stream_openai_response(audio_path: str, base_url: str, api_key: str):
    """
    Perform streaming transcription using vLLM's raw HTTP streaming API.
    """
@ -68,11 +56,12 @@ async def stream_openai_response():
        "stream": True,
        "model": "openai/whisper-large-v3",
    }
-    url = openai_api_base + "/audio/transcriptions"
-    headers = {"Authorization": f"Bearer {openai_api_key}"}
+    url = base_url + "/audio/transcriptions"
+    headers = {"Authorization": f"Bearer {api_key}"}
    print("transcription result:", end=" ")
+    # OpenAI Transcription API client does not support streaming.
    async with httpx.AsyncClient() as client:
-        with open(str(winning_call), "rb") as f:
+        with open(audio_path, "rb") as f:
            async with client.stream(
                "POST", url, files={"file": f}, data=data, headers=headers
            ) as response:
@ -93,10 +82,20 @@ async def stream_openai_response():


 def main():
-    sync_openai()
+    mary_had_lamb = str(AudioAsset("mary_had_lamb").get_local_path())
+    winning_call = str(AudioAsset("winning_call").get_local_path())

+    # Modify OpenAI's API key and API base to use vLLM's API server.
+    openai_api_key = "EMPTY"
+    openai_api_base = "http://localhost:8000/v1"
+    client = OpenAI(
+        api_key=openai_api_key,
+        base_url=openai_api_base,
+    )
+
+    sync_openai(mary_had_lamb, client)
    # Run the asynchronous function
-    asyncio.run(stream_openai_response())
+    asyncio.run(stream_openai_response(winning_call, openai_api_base, openai_api_key))


 if __name__ == "__main__":
--- a/examples/online_serving/openai_translation_client.py
+++ b/examples/online_serving/openai_translation_client.py
@ -0,0 +1,75 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import asyncio
+import json
+
+import httpx
+from openai import OpenAI
+
+from vllm.assets.audio import AudioAsset
+
+
+def sync_openai(audio_path: str, client: OpenAI):
+    with open(audio_path, "rb") as f:
+        translation = client.audio.translations.create(
+            file=f,
+            model="openai/whisper-large-v3",
+            response_format="json",
+            temperature=0.0,
+            # Additional params not provided by OpenAI API.
+            extra_body=dict(
+                language="it",
+                seed=4419,
+                repetition_penalty=1.3,
+            ),
+        )
+        print("translation result:", translation.text)
+
+
+async def stream_openai_response(audio_path: str, base_url: str, api_key: str):
+    data = {
+        "language": "it",
+        "stream": True,
+        "model": "openai/whisper-large-v3",
+    }
+    url = base_url + "/audio/translations"
+    headers = {"Authorization": f"Bearer {api_key}"}
+    print("translation result:", end=" ")
+    # OpenAI translation API client does not support streaming.
+    async with httpx.AsyncClient() as client:
+        with open(audio_path, "rb") as f:
+            async with client.stream(
+                "POST", url, files={"file": f}, data=data, headers=headers
+            ) as response:
+                async for line in response.aiter_lines():
+                    # Each line is a JSON object prefixed with 'data: '
+                    if line:
+                        if line.startswith("data: "):
+                            line = line[len("data: ") :]
+                        # Last chunk, stream ends
+                        if line.strip() == "[DONE]":
+                            break
+                        # Parse the JSON response
+                        chunk = json.loads(line)
+                        # Extract and print the content
+                        content = chunk["choices"][0].get("delta", {}).get("content")
+                        print(content, end="")
+
+
+def main():
+    foscolo = str(AudioAsset("azacinto_foscolo").get_local_path())
+
+    # Modify OpenAI's API key and API base to use vLLM's API server.
+    openai_api_key = "EMPTY"
+    openai_api_base = "http://localhost:8000/v1"
+    client = OpenAI(
+        api_key=openai_api_key,
+        base_url=openai_api_base,
+    )
+    sync_openai(foscolo, client)
+    # Run the asynchronous function
+    asyncio.run(stream_openai_response(foscolo, openai_api_base, openai_api_key))
+
+
+if __name__ == "__main__":
+    main()
--- a/requirements/common.txt
+++ b/requirements/common.txt
@ -37,10 +37,11 @@ pyyaml
 six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
 setuptools>=77.0.3,<80; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
 einops # Required for Qwen2-VL.
-compressed-tensors == 0.10.1 # required for compressed-tensors
+compressed-tensors == 0.10.2 # required for compressed-tensors
 depyf==0.18.0 # required for profiling and debugging with compilation config
 cloudpickle # allows pickling lambda functions in model_executor/models/registry.py
 watchfiles # required for http server to monitor the updates of TLS files
 python-json-logger # Used by logging as per examples/others/logging_configuration.md
 scipy # Required for phi-4-multimodal-instruct
 ninja # Required for xgrammar, rocm, tpu, xpu
+pybase64 # fast base64 implementation
--- a/requirements/cpu-build.txt
+++ b/requirements/cpu-build.txt
@ -0,0 +1,12 @@
+# Temporarily used for x86 CPU backend to avoid performance regression of torch>2.6.0+cpu,
+# see https://github.com/pytorch/pytorch/pull/151218
+cmake>=3.26.1
+ninja
+packaging>=24.2
+setuptools>=77.0.3,<80.0.0
+setuptools-scm>=8
+--extra-index-url https://download.pytorch.org/whl/cpu
+torch==2.6.0+cpu
+wheel
+jinja2>=3.1.6
+regex
--- a/requirements/cpu.txt
+++ b/requirements/cpu.txt
@ -8,7 +8,7 @@ numba == 0.61.2; python_version > '3.9'
 packaging>=24.2
 setuptools>=77.0.3,<80.0.0
 --extra-index-url https://download.pytorch.org/whl/cpu
-torch==2.7.0+cpu; platform_machine == "x86_64"
+torch==2.6.0+cpu; platform_machine == "x86_64" # torch>2.6.0+cpu has performance regression on x86 platform, see https://github.com/pytorch/pytorch/pull/151218
 torch==2.7.0; platform_system == "Darwin"
 torch==2.7.0; platform_machine == "ppc64le" or platform_machine == "aarch64"

@ -23,6 +23,7 @@ datasets # for benchmark scripts

 # Intel Extension for PyTorch, only for x86_64 CPUs
 intel-openmp==2024.2.1; platform_machine == "x86_64"
-intel_extension_for_pytorch==2.7.0; platform_machine == "x86_64"
+intel_extension_for_pytorch==2.6.0; platform_machine == "x86_64" # torch>2.6.0+cpu has performance regression on x86 platform, see https://github.com/pytorch/pytorch/pull/151218
 py-libnuma; platform_system != "Darwin"
 psutil; platform_system != "Darwin"
+triton==3.2.0; platform_machine == "x86_64" # Triton is required for torch 2.6+cpu, as it is imported in torch.compile.
--- a/requirements/nightly_torch_test.txt
+++ b/requirements/nightly_torch_test.txt
@ -1,47 +1,50 @@
-# Dependency that able to run entrypoints test
-# pytest and its extensions
+# testing
 pytest
-pytest-asyncio
+tensorizer>=2.9.0
 pytest-forked
-pytest-mock
+pytest-asyncio
 pytest-rerunfailures
 pytest-shard
 pytest-timeout

-librosa # required by audio tests in entrypoints/openai
-sentence-transformers # required for embedding tests
-transformers==4.52.4
-transformers_stream_generator # required for qwen-vl test
-numba == 0.61.2; python_version > '3.9'
 # testing utils
-boto3
-botocore
-datasets
-ray >= 2.10.0
+backoff # required for phi4mm test
+blobfile # required for kimi-vl test
+einops # required for MPT, qwen-vl and Mamba
+httpx
+librosa # required for audio tests
+vocos # required for minicpmo_26 test
 peft
+pqdm
+ray[cgraph,default]>=2.43.0, !=2.44.* # Ray Compiled Graph, required by pipeline parallelism tests
+sentence-transformers # required for embedding tests
+soundfile # required for audio tests
+jiwer # required for audio tests
+timm # required for internvl test
+transformers_stream_generator # required for qwen-vl test
+matplotlib # required for qwen-vl test
+mistral_common[opencv] >= 1.6.2 # required for pixtral test
+num2words # required for smolvlm test
+opencv-python-headless >= 4.11.0 # required for video test
+datamodel_code_generator # required for minicpm3 test
+lm-eval[api]==0.4.8 # required for model evaluation test
+mteb>=1.38.11, <2 # required for mteb test
+transformers==4.52.4
+tokenizers==0.21.1
+huggingface-hub[hf_xet]>=0.30.0  # Required for Xet downloads.
+schemathesis>=3.39.15 # Required for openai schema test.
+# quantization
+bitsandbytes>=0.45.3
+buildkite-test-collector==0.1.9
+
+
+genai_perf==0.0.8
+tritonclient==2.51.0
+
+numba == 0.60.0; python_version == '3.9' # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding
+numba == 0.61.2; python_version > '3.9'
+numpy
 runai-model-streamer==0.11.0
 runai-model-streamer-s3==0.11.0
-tensorizer>=2.9.0
-lm-eval==0.4.8
-buildkite-test-collector==0.1.9
-lm-eval[api]==0.4.8 # required for model evaluation test
-
-# required for quantization test
-bitsandbytes>=0.45.3
-
-# required for minicpmo_26 test
-vector_quantize_pytorch
-vocos
-
-# required for Basic Models Test
-blobfile # required for kimi-vl test
-matplotlib # required for qwen-vl test
-
-# required for  Multi-Modal Models Test (Standard)
-num2words # required for smolvlm test
-pqdm
-timm # required for internvl test
-mistral-common==1.6.2
-
-schemathesis==3.39.15  # Required for openai schema test.
-mteb>=1.38.11, <2 # required for mteb test
+fastsafetensors>=0.1.10
+pydantic>=2.10 # 2.9 leads to error on python 3.10
--- a/requirements/test.in
+++ b/requirements/test.in
@ -42,6 +42,7 @@ schemathesis>=3.39.15 # Required for openai schema test.
 bitsandbytes>=0.45.3
 buildkite-test-collector==0.1.9

+
 genai_perf==0.0.8
 tritonclient==2.51.0

@ -51,4 +52,4 @@ numpy
 runai-model-streamer==0.11.0
 runai-model-streamer-s3==0.11.0
 fastsafetensors>=0.1.10
-pydantic>=2.10 # 2.9 leads to error on python 3.10
+pydantic>=2.10 # 2.9 leads to error on python 3.10
--- a/requirements/xpu.txt
+++ b/requirements/xpu.txt
@ -9,6 +9,7 @@ setuptools>=77.0.3,<80.0.0
 wheel
 jinja2>=3.1.6
 datasets # for benchmark scripts
+numba == 0.60.0 # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding

 torch==2.7.0+xpu
 torchaudio
--- a/tests/compile/test_config.py
+++ b/tests/compile/test_config.py
@ -5,6 +5,15 @@ import pytest
 import vllm
 from vllm.compilation.counter import compilation_counter
 from vllm.config import VllmConfig
+from vllm.utils import _is_torch_equal_or_newer
+
+
+def test_version():
+    assert _is_torch_equal_or_newer('2.8.0.dev20250624+cu128', '2.8.0.dev')
+    assert _is_torch_equal_or_newer('2.8.0a0+gitc82a174', '2.8.0.dev')
+    assert _is_torch_equal_or_newer('2.8.0', '2.8.0.dev')
+    assert _is_torch_equal_or_newer('2.8.1', '2.8.0.dev')
+    assert not _is_torch_equal_or_newer('2.7.1', '2.8.0.dev')


 def test_use_cudagraphs_dynamic(monkeypatch):
--- a/tests/config/test_config_generation.py
+++ b/tests/config/test_config_generation.py
@ -0,0 +1,38 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import pytest
+
+from vllm.engine.arg_utils import EngineArgs
+from vllm.model_executor.layers.quantization.quark.utils import deep_compare
+
+
+def test_cuda_empty_vs_unset_configs(monkeypatch: pytest.MonkeyPatch):
+    """Test that configs created with normal (untouched) CUDA_VISIBLE_DEVICES
+    and CUDA_VISIBLE_DEVICES="" are equivalent. This ensures consistent
+    behavior regardless of whether GPU visibility is disabled via empty string
+    or left in its normal state.
+    """
+
+    def create_config():
+        engine_args = EngineArgs(model="deepseek-ai/DeepSeek-V2-Lite",
+                                 trust_remote_code=True)
+        return engine_args.create_engine_config()
+
+    # Create config with CUDA_VISIBLE_DEVICES set normally
+    normal_config = create_config()
+
+    # Create config with CUDA_VISIBLE_DEVICES=""
+    with monkeypatch.context() as m:
+        m.setenv("CUDA_VISIBLE_DEVICES", "")
+        empty_config = create_config()
+
+    normal_config_dict = vars(normal_config)
+    empty_config_dict = vars(empty_config)
+
+    # Remove instance_id before comparison as it's expected to be different
+    normal_config_dict.pop("instance_id", None)
+    empty_config_dict.pop("instance_id", None)
+
+    assert deep_compare(normal_config_dict, empty_config_dict), (
+        "Configs with normal CUDA_VISIBLE_DEVICES and CUDA_VISIBLE_DEVICES=\"\""
+        " should be equivalent")
--- a/tests/distributed/test_eplb_algo.py
+++ b/tests/distributed/test_eplb_algo.py
@ -0,0 +1,292 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import pytest
+import torch
+
+from vllm.distributed.eplb.rebalance_algo import rebalance_experts
+
+
+def test_basic_rebalance():
+    """Test basic rebalancing functionality"""
+    # Example from https://github.com/deepseek-ai/eplb
+    weight = torch.tensor([
+        [90, 132, 40, 61, 104, 165, 39, 4, 73, 56, 183, 86],
+        [20, 107, 104, 64, 19, 197, 187, 157, 172, 86, 16, 27],
+    ])
+
+    num_layers = weight.shape[0]
+    num_replicas = 16
+    num_groups = 4
+    num_nodes = 2
+    num_gpus = 8
+
+    phy2log, log2phy, logcnt = rebalance_experts(weight, num_replicas,
+                                                 num_groups, num_nodes,
+                                                 num_gpus)
+
+    # Verify output shapes
+    assert phy2log.shape == (
+        2,
+        16,
+    ), f"Expected `phy2log` shape (2, 16), got {phy2log.shape}"
+    assert (log2phy.shape[0] == 2
+            ), f"Expected `log2phy` first dimension 2, got {log2phy.shape[0]}"
+    assert (
+        log2phy.shape[1] == 12
+    ), f"Expected `log2phy` second dimension 12, got {log2phy.shape[1]}"
+    assert logcnt.shape == (
+        2,
+        12,
+    ), f"Expected `logcnt` shape (2, 12), got {logcnt.shape}"
+
+    # Verify physical to logical expert mapping range is correct
+    assert torch.all(phy2log >= 0) and torch.all(
+        phy2log < 12), "Physical to logical mapping should be in range [0, 12)"
+
+    # Verify expert count reasonableness
+    assert torch.all(
+        logcnt >= 1), "Each logical expert should have at least 1 replica"
+    assert (
+        torch.sum(logcnt, dim=1).sum() == num_replicas *
+        num_layers), f"Total replicas should be {num_replicas * num_layers}"
+
+    # Verify expected output
+    expected_phy2log = torch.tensor([
+        [5, 6, 5, 7, 8, 4, 3, 4, 10, 9, 10, 2, 0, 1, 11, 1],
+        [7, 10, 6, 8, 6, 11, 8, 9, 2, 4, 5, 1, 5, 0, 3, 1],
+    ])
+    assert torch.all(phy2log == expected_phy2log)
+
+    expected_logcnt = torch.tensor([[1, 2, 1, 1, 2, 2, 1, 1, 1, 1, 2, 1],
+                                    [1, 2, 1, 1, 1, 2, 2, 1, 2, 1, 1, 1]])
+    assert torch.all(logcnt == expected_logcnt)
+
+
+def test_single_gpu_case():
+    """Test single GPU case"""
+    weight = torch.tensor([[10, 20, 30, 40]])
+    num_replicas = 4
+    num_groups = 1
+    num_nodes = 1
+    num_gpus = 1
+
+    phy2log, log2phy, logcnt = rebalance_experts(weight, num_replicas,
+                                                 num_groups, num_nodes,
+                                                 num_gpus)
+
+    # Verify shapes
+    assert phy2log.shape == (1, 4)
+    assert log2phy.shape[0] == 1
+    assert log2phy.shape[1] == 4
+    assert logcnt.shape == (1, 4)
+
+    # Verify all logical experts are mapped
+    assert set(phy2log[0].tolist()) == {0, 1, 2, 3}
+
+
+def test_equal_weights():
+    """Test case with equal weights"""
+    weight = torch.tensor([[50, 50, 50, 50, 50, 50, 50, 50]])
+    num_replicas = 8
+    num_groups = 2
+    num_nodes = 2
+    num_gpus = 4
+
+    phy2log, log2phy, logcnt = rebalance_experts(weight, num_replicas,
+                                                 num_groups, num_nodes,
+                                                 num_gpus)
+
+    # Verify shapes
+    assert phy2log.shape == (1, 8)
+    assert logcnt.shape == (1, 8)
+
+    # With equal weights, each expert should have exactly one replica
+    assert torch.all(
+        logcnt == 1
+    ), "With equal weights and no replication, " \
+       "each expert should have exactly 1 replica"
+
+
+def test_extreme_weight_imbalance():
+    """Test extreme weight imbalance case"""
+    weight = torch.tensor([[1000, 1, 1, 1, 1, 1, 1, 1]])
+    num_replicas = 12
+    num_groups = 2
+    num_nodes = 2
+    num_gpus = 4
+
+    phy2log, log2phy, logcnt = rebalance_experts(weight, num_replicas,
+                                                 num_groups, num_nodes,
+                                                 num_gpus)
+
+    # Verify shapes
+    assert phy2log.shape == (1, 12)
+    assert logcnt.shape == (1, 8)
+
+    # Expert with highest weight (index 0) should have more replicas
+    assert (
+        logcnt[0, 0]
+        > logcnt[0, 1]), "Expert with highest weight should have more replicas"
+
+
+def test_multiple_layers():
+    """Test multiple layers case"""
+    weight = torch.tensor([
+        [10, 20, 30, 40, 50, 60],  # First layer
+        [60, 50, 40, 30, 20, 10],  # Second layer (opposite weight pattern)
+        [25, 25, 25, 25, 25, 25],  # Third layer (equal weights)
+    ])
+    num_replicas = 8
+    num_groups = 2
+    num_nodes = 2
+    num_gpus = 4
+
+    phy2log, log2phy, logcnt = rebalance_experts(weight, num_replicas,
+                                                 num_groups, num_nodes,
+                                                 num_gpus)
+
+    # Verify shapes
+    assert phy2log.shape == (3, 8)
+    assert logcnt.shape == (3, 6)
+
+    # Verify expert allocation is reasonable for each layer
+    for layer in range(3):
+        assert torch.all(phy2log[layer] >= 0) and torch.all(
+            phy2log[layer] < 6
+        ), f"Layer {layer} physical to logical mapping" \
+            "should be in range [0, 6)"
+        assert (torch.sum(logcnt[layer]) == num_replicas
+                ), f"Layer {layer} total replicas should be {num_replicas}"
+
+
+def test_parameter_validation():
+    """Test parameter validation"""
+    weight = torch.tensor([[10, 20, 30, 40]])
+
+    # Test non-divisible case - this should handle normally without throwing
+    # errors because the function will fall back to global load balancing
+    # strategy
+    phy2log, log2phy, logcnt = rebalance_experts(weight, 8, 3, 2, 4)
+    assert phy2log.shape == (1, 8)
+    assert logcnt.shape == (1, 4)
+
+    # Test cases that will actually cause errors:
+    # num_physical_experts not divisible by num_gpus
+    with pytest.raises(AssertionError):
+        rebalance_experts(weight, 7, 2, 2, 4)  # 7 not divisible by 4
+
+
+def test_small_scale_hierarchical():
+    """Test small-scale hierarchical load balancing"""
+    weight = torch.tensor([
+        [100, 50, 200, 75, 150, 25, 300, 80],  # 8 experts
+    ])
+    num_replicas = 12
+    num_groups = 4  # 4 groups, 2 experts each
+    num_nodes = 2  # 2 nodes
+    num_gpus = 4  # 4 GPUs
+
+    phy2log, log2phy, logcnt = rebalance_experts(weight, num_replicas,
+                                                 num_groups, num_nodes,
+                                                 num_gpus)
+
+    # Verify basic constraints
+    assert phy2log.shape == (1, 12)
+    assert logcnt.shape == (1, 8)
+    assert torch.sum(logcnt) == num_replicas
+    assert torch.all(logcnt >= 1)
+
+    # Expert with highest weight should have more replicas
+    max_weight_expert = torch.argmax(weight[0])
+    assert (logcnt[0, max_weight_expert]
+            >= 2), "Highest weight expert should have multiple replicas"
+
+
+def test_global_load_balance_fallback():
+    """Test global load balancing fallback case"""
+    # When num_groups % num_nodes != 0, should fall back to global load
+    # balancing
+    weight = torch.tensor([[10, 20, 30, 40, 50, 60]])
+    num_replicas = 8
+    num_groups = 3  # Cannot be divided evenly by num_nodes=2
+    num_nodes = 2
+    num_gpus = 4
+
+    phy2log, log2phy, logcnt = rebalance_experts(weight, num_replicas,
+                                                 num_groups, num_nodes,
+                                                 num_gpus)
+
+    # Should work normally, just using global load balancing strategy
+    assert phy2log.shape == (1, 8)
+    assert logcnt.shape == (1, 6)
+    assert torch.sum(logcnt) == num_replicas
+
+
+@pytest.mark.parametrize("device", ["cpu", "cuda"])
+def test_device_compatibility(device):
+    """Test device compatibility"""
+    if device == "cuda" and not torch.cuda.is_available():
+        pytest.skip("CUDA not available")
+
+    weight = torch.tensor([[10, 20, 30, 40]], device=device)
+    num_replicas = 6
+    num_groups = 2
+    num_nodes = 1
+    num_gpus = 2
+
+    phy2log, log2phy, logcnt = rebalance_experts(weight, num_replicas,
+                                                 num_groups, num_nodes,
+                                                 num_gpus)
+
+    # Function will convert to CPU internally, but should handle different
+    # device inputs normally
+    assert phy2log.shape == (1, 6)
+    assert logcnt.shape == (1, 4)
+
+
+def test_additional_cases():
+    """Test more edge cases and different parameter combinations"""
+
+    # Test case 1: Large-scale distributed setup
+    weight1 = torch.tensor(
+        [[50, 100, 75, 120, 90, 60, 80, 110, 40, 70, 95, 85, 65, 55, 45, 35]])
+    phy2log1, log2phy1, logcnt1 = rebalance_experts(weight1, 24, 8, 4, 8)
+
+    assert phy2log1.shape == (1, 24)
+    assert logcnt1.shape == (1, 16)
+    assert torch.sum(logcnt1) == 24
+
+    # Test case 2: Different weight distributions
+    weight2 = torch.tensor([
+        [200, 150, 100, 50, 25, 12],  # Decreasing weights
+        [12, 25, 50, 100, 150, 200],  # Increasing weights
+    ])
+    phy2log2, log2phy2, logcnt2 = rebalance_experts(weight2, 10, 3, 1, 2)
+
+    assert phy2log2.shape == (2, 10)
+    assert logcnt2.shape == (2, 6)
+
+    # Verify high-weight experts have more replicas
+    for layer in range(2):
+        max_weight_idx = torch.argmax(weight2[layer])
+        assert logcnt2[layer, max_weight_idx] >= 2
+
+
+if __name__ == "__main__":
+    weight = torch.tensor([
+        [90, 132, 40, 61, 104, 165, 39, 4, 73, 56, 183, 86],
+        [20, 107, 104, 64, 19, 197, 187, 157, 172, 86, 16, 27],
+    ])
+
+    num_replicas = 16
+    num_groups = 4
+    num_nodes = 2
+    num_gpus = 8
+
+    phy2log, log2phy, logcnt = rebalance_experts(weight, num_replicas,
+                                                 num_groups, num_nodes,
+                                                 num_gpus)
+    print(phy2log)
+
+    test_basic_rebalance()
--- a/tests/distributed/test_eplb_execute.py
+++ b/tests/distributed/test_eplb_execute.py
@ -0,0 +1,504 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import multiprocessing
+import os
+import random
+
+import pytest
+import torch
+import torch.distributed
+
+from vllm.distributed.eplb.rebalance_execute import (
+    rearrange_expert_weights_inplace)
+from vllm.distributed.parallel_state import (ensure_model_parallel_initialized,
+                                             get_tp_group,
+                                             init_distributed_environment)
+from vllm.utils import update_environment_variables
+
+
+def distributed_run(fn, world_size):
+    number_of_processes = world_size
+    processes: list[multiprocessing.Process] = []
+    for i in range(number_of_processes):
+        env: dict[str, str] = {}
+        env['RANK'] = str(i)
+        env['LOCAL_RANK'] = str(i)
+        env['WORLD_SIZE'] = str(number_of_processes)
+        env['LOCAL_WORLD_SIZE'] = str(number_of_processes)
+        env['MASTER_ADDR'] = 'localhost'
+        env['MASTER_PORT'] = '12345'
+        p = multiprocessing.Process(target=fn, args=(env, ))
+        processes.append(p)
+        p.start()
+
+    for p in processes:
+        p.join()
+
+    for p in processes:
+        assert p.exitcode == 0
+
+
+def worker_fn_wrapper(fn):
+    # `multiprocessing.Process` cannot accept environment variables directly
+    # so we need to pass the environment variables as arguments
+    # and update the environment variables in the function
+    def wrapped_fn(env):
+        update_environment_variables(env)
+        local_rank = os.environ['LOCAL_RANK']
+        device = torch.device(f"cuda:{local_rank}")
+        torch.cuda.set_device(device)
+        init_distributed_environment()
+
+        # Ensure each worker process has the same random seed
+        random.seed(42)
+        torch.manual_seed(42)
+
+        fn()
+
+    return wrapped_fn
+
+
+def create_expert_indices_with_redundancy(
+        num_layers: int,
+        num_logical_experts: int,
+        total_physical_experts: int,
+        redundancy_config: list[int],  # redundancy for each logical expert
+) -> torch.Tensor:
+    """
+    Create expert indices with redundancy.
+    
+    Args:
+        num_layers: number of layers
+        num_logical_experts: number of logical experts
+        total_physical_experts: total number of physical experts
+        redundancy_config: redundancy for each logical expert
+    
+    Returns:
+        indices: Shape (num_layers, total_physical_experts)
+    """
+    assert sum(redundancy_config) == total_physical_experts
+    assert len(redundancy_config) == num_logical_experts
+
+    indices = torch.zeros(num_layers, total_physical_experts, dtype=torch.long)
+
+    for layer in range(num_layers):
+        physical_pos = 0
+        for logical_expert_id, redundancy in enumerate(redundancy_config):
+            for _ in range(redundancy):
+                indices[layer, physical_pos] = logical_expert_id
+                physical_pos += 1
+
+    # Shuffle the indices at dim 1
+    for layer in range(num_layers):
+        indices[layer] = indices[layer][torch.randperm(indices.shape[1])]
+
+    return indices
+
+
+def create_expert_weights(
+    num_layers: int,
+    num_local_experts: int,
+    hidden_sizes: list[int],
+    rank: int,
+    device: torch.device,
+    physical_to_logical_mapping: torch.Tensor,
+) -> list[list[torch.Tensor]]:
+    """
+    Create fake expert weights tensor for testing.
+    
+    Use `arange` to generate predictable weights values, based on logical
+    expert ID.
+    All replicas of the same logical expert should have the same weights.
+    
+    Args:
+        physical_to_logical_mapping: Shape (num_layers, num_local_experts)
+            mapping[layer, physical_pos] = logical_expert_id
+    """
+    expert_weights = []
+
+    for layer in range(num_layers):
+        layer_weights = []
+        for weight_idx, hidden_size in enumerate(hidden_sizes):
+            weight_tensor = torch.zeros(num_local_experts,
+                                        hidden_size,
+                                        device=device,
+                                        dtype=torch.float32)
+
+            for local_expert in range(num_local_experts):
+                # Get the logical expert ID for this physical expert
+                global_pos = rank * num_local_experts + local_expert
+                logical_expert_id = physical_to_logical_mapping[
+                    layer, global_pos].item()
+
+                # Generate weights based on logical expert ID
+                # (so that all replicas of the same logical expert have the
+                # same weights)
+                base_value = (logical_expert_id * 1000 + layer * 100 +
+                              weight_idx * 10)
+                weight_tensor[local_expert] = torch.arange(base_value,
+                                                           base_value +
+                                                           hidden_size,
+                                                           device=device,
+                                                           dtype=torch.float32)
+
+            layer_weights.append(weight_tensor)
+        expert_weights.append(layer_weights)
+
+    return expert_weights
+
+
+def create_redundancy_config(
+    num_logical_experts: int,
+    num_physical_experts: int,
+) -> list[int]:
+    """Create a redundancy configuration."""
+    redundancy_config = [1] * num_logical_experts
+    remaining = num_physical_experts - num_logical_experts
+    # Randomly assign the remaining physical experts to the logical experts
+    for _ in range(remaining):
+        redundancy_config[random.choice(range(num_logical_experts))] += 1
+    return redundancy_config
+
+
+def verify_expert_weights_after_shuffle(
+    expert_weights: list[list[torch.Tensor]],
+    new_indices: torch.Tensor,
+    hidden_sizes: list[int],
+    ep_rank: int,
+    num_local_experts: int,
+):
+    """Verify the weights after shuffling are correct."""
+    num_layers = len(expert_weights)
+
+    for layer in range(num_layers):
+        for weight_idx, hidden_size in enumerate(hidden_sizes):
+            weight_tensor = expert_weights[layer][weight_idx]
+
+            for local_expert in range(num_local_experts):
+                # Calculate the global expert ID for this local expert
+                global_pos = ep_rank * num_local_experts + local_expert
+                expected_logical_expert = new_indices[layer, global_pos].item()
+
+                # Check if the weights are correct
+                actual_weights = weight_tensor[local_expert]
+                expected_base = (expected_logical_expert * 1000 + layer * 100 +
+                                 weight_idx * 10)
+                expected_weights = torch.arange(expected_base,
+                                                expected_base + hidden_size,
+                                                device=actual_weights.device,
+                                                dtype=actual_weights.dtype)
+
+                torch.testing.assert_close(
+                    actual_weights,
+                    expected_weights,
+                    msg=f"Layer {layer}, weight {weight_idx},"
+                    f"local expert {local_expert}: "
+                    f"weights do not match. "
+                    f"Expected logical expert {expected_logical_expert}")
+
+
+def verify_redundant_experts_have_same_weights(
+    expert_weights: list[list[torch.Tensor]],
+    indices: torch.Tensor,
+    hidden_sizes: list[int],
+    world_size: int,
+    num_local_experts: int,
+):
+    """
+    Verify that all replicas of the same logical expert have the same weights.
+    """
+    num_layers = len(expert_weights)
+    total_physical_experts = world_size * num_local_experts
+
+    for layer in range(num_layers):
+        # Collect weights for all physical experts for each weight matrix
+        all_weights: list[torch.Tensor] = []
+
+        for weight_idx, hidden_size in enumerate(hidden_sizes):
+            # Create tensor to store all expert weights
+            # Shape: [total_physical_experts, hidden_size]
+            gathered_weights = torch.zeros(
+                total_physical_experts,
+                hidden_size,
+                device=expert_weights[layer][weight_idx].device,
+                dtype=expert_weights[layer][weight_idx].dtype)
+
+            # Use all_gather to collect expert weights from current node
+            # expert_weights[layer][weight_idx] shape:
+            # [num_local_experts, hidden_size]
+            local_weights = expert_weights[layer][
+                weight_idx]  # [num_local_experts, hidden_size]
+
+            # Split tensor along dim 0 into a list for all_gather
+            gathered_weights_list = torch.chunk(gathered_weights,
+                                                world_size,
+                                                dim=0)
+
+            torch.distributed.all_gather(
+                # Output list: each element corresponds to one rank's weights
+                list(gathered_weights_list),
+                local_weights  # Input: current rank's local weights
+            )
+
+            all_weights.append(gathered_weights)
+
+        # Verify that all replicas of the same logical expert have the same
+        # weights
+        logical_expert_weights: dict[int, dict[int, torch.Tensor]] = {}
+
+        for physical_pos in range(total_physical_experts):
+            logical_expert_id = int(indices[layer, physical_pos].item())
+
+            if logical_expert_id not in logical_expert_weights:
+                # First time encountering this logical expert, save its weights
+                logical_expert_weights[logical_expert_id] = {
+                    weight_idx: all_weights[weight_idx][physical_pos]
+                    for weight_idx in range(len(hidden_sizes))
+                }
+            else:
+                # Verify that current physical expert's weights match the
+                # previously saved logical expert weights
+                for weight_idx in range(len(hidden_sizes)):
+                    torch.testing.assert_close(
+                        all_weights[weight_idx][physical_pos],
+                        logical_expert_weights[logical_expert_id][weight_idx],
+                        msg=f"Layer {layer}, weight {weight_idx},"
+                        f"logical expert {logical_expert_id}: "
+                        f"Physical expert {physical_pos} has different weights"
+                        f"than expected")
+
+
+@pytest.mark.parametrize(
+    "world_size,num_layers,num_local_experts,num_logical_experts",
+    [
+        # 2 GPU, 2 experts per GPU
+        # 3 logical experts, 4 physical experts, 1 redundant experts
+        (2, 1, 2, 3),
+        # 2 GPU, 3 experts per GPU
+        # 4 logical experts, 6 physical experts, 2 redundant experts
+        (2, 2, 3, 4),
+        # 2 GPU, 8 experts per GPU
+        # 16 logical experts, 16 physical experts, 0 redundant experts
+        (2, 4, 8, 16),
+        # 4 GPU, 2 experts per GPU
+        # 6 logical experts, 8 physical experts, 2 redundant experts
+        (4, 1, 2, 6),
+        # 4 GPU, 2 experts per GPU
+        # 5 logical experts, 8 physical experts, 3 redundant experts
+        (4, 2, 2, 5),
+        # 4 GPU, 8 experts per GPU
+        # 16 logical experts, 32 physical experts, 16 redundant experts
+        (4, 8, 8, 16),
+    ])
+def test_rearrange_expert_weights_with_redundancy(world_size, num_layers,
+                                                  num_local_experts,
+                                                  num_logical_experts):
+    """Test the functionality of rearranging expert weights with redundancy."""
+
+    if torch.cuda.device_count() < world_size:
+        pytest.skip(f"Need at least {world_size} GPUs to run the test")
+
+    @worker_fn_wrapper
+    def worker_fn():
+        # Initialize model parallel (using tensor parallel as an entrypoint
+        # to expert parallel)
+        ensure_model_parallel_initialized(
+            tensor_model_parallel_size=world_size,
+            pipeline_model_parallel_size=1)
+
+        ep_group = get_tp_group().cpu_group
+        ep_rank = torch.distributed.get_rank()
+        device = torch.device(f"cuda:{ep_rank}")
+
+        # Test parameters
+        total_physical_experts = world_size * num_local_experts
+        hidden_sizes = [32, 64]  # Two different weight matrices
+
+        # Create old expert indices (with redundancy)
+        redundancy_config = create_redundancy_config(num_logical_experts,
+                                                     total_physical_experts)
+
+        old_indices = create_expert_indices_with_redundancy(
+            num_layers,
+            num_logical_experts,
+            total_physical_experts,
+            redundancy_config,
+        )
+
+        # Create new expert indices (with redundancy)
+        new_redundancy_config = create_redundancy_config(
+            num_logical_experts, total_physical_experts)
+        new_indices = create_expert_indices_with_redundancy(
+            num_layers,
+            num_logical_experts,
+            total_physical_experts,
+            new_redundancy_config,
+        )
+
+        # Create expert weights
+        expert_weights = create_expert_weights(num_layers, num_local_experts,
+                                               hidden_sizes, ep_rank, device,
+                                               old_indices)
+
+        # Execute weight rearrangement
+        rearrange_expert_weights_inplace(
+            old_indices,
+            new_indices,
+            expert_weights,
+            ep_group,
+            is_profile=False,
+        )
+
+        # Verify the rearrangement result
+        verify_expert_weights_after_shuffle(
+            expert_weights,
+            new_indices,
+            hidden_sizes,
+            ep_rank,
+            num_local_experts,
+        )
+
+        verify_redundant_experts_have_same_weights(
+            expert_weights,
+            new_indices,
+            hidden_sizes,
+            world_size,
+            num_local_experts,
+        )
+
+    distributed_run(worker_fn, world_size)
+
+
+@pytest.mark.parametrize("world_size", [2, 4])
+def test_rearrange_expert_weights_no_change(world_size):
+    """
+    Test that when the indices do not change, the weights should remain
+    unchanged.
+    """
+
+    if torch.cuda.device_count() < world_size:
+        pytest.skip(f"Need at least {world_size} GPUs to run the test")
+
+    @worker_fn_wrapper
+    def worker_fn():
+        ensure_model_parallel_initialized(
+            tensor_model_parallel_size=world_size,
+            pipeline_model_parallel_size=1)
+
+        ep_group = get_tp_group().cpu_group
+        ep_rank = torch.distributed.get_rank()
+        device = torch.device(f"cuda:{ep_rank}")
+
+        num_layers = 2
+        num_local_experts = 2
+        total_physical_experts = world_size * num_local_experts
+        num_logical_experts = total_physical_experts // 2  # Some redundancy
+        hidden_sizes = [32, 64]
+
+        # Create redundancy configuration
+        redundancy_config = [2] * num_logical_experts
+
+        # Same indices - no change
+        indices = create_expert_indices_with_redundancy(
+            num_layers, num_logical_experts, total_physical_experts,
+            redundancy_config)
+
+        expert_weights = create_expert_weights(num_layers, num_local_experts,
+                                               hidden_sizes, ep_rank, device,
+                                               indices)
+
+        # Save original weights
+        original_weights = []
+        for layer_weights in expert_weights:
+            layer_copy = []
+            for weight in layer_weights:
+                layer_copy.append(weight.clone())
+            original_weights.append(layer_copy)
+
+        # Execute rearrangement (should be no change)
+        rearrange_expert_weights_inplace(
+            indices,
+            indices,  # Same indices
+            expert_weights,
+            ep_group,
+            is_profile=False)
+
+        # Verify that the weights have not changed
+        for layer in range(num_layers):
+            for weight_idx in range(len(hidden_sizes)):
+                torch.testing.assert_close(
+                    expert_weights[layer][weight_idx],
+                    original_weights[layer][weight_idx],
+                    msg=f"Layer {layer}, weight {weight_idx} should remain "
+                    f"unchanged")
+
+    distributed_run(worker_fn, world_size)
+
+
+@pytest.mark.parametrize("world_size", [2, 4])
+def test_rearrange_expert_weights_profile_mode(world_size):
+    """Test profile mode (should not copy actual weights)"""
+
+    if torch.cuda.device_count() < world_size:
+        pytest.skip(f"Need at least {world_size} GPUs to run the test")
+
+    @worker_fn_wrapper
+    def worker_fn():
+        ensure_model_parallel_initialized(
+            tensor_model_parallel_size=world_size,
+            pipeline_model_parallel_size=1)
+
+        ep_group = get_tp_group().cpu_group
+        ep_rank = torch.distributed.get_rank()
+        device = torch.device(f"cuda:{ep_rank}")
+
+        num_layers = 1
+        num_local_experts = 2
+        total_physical_experts = world_size * num_local_experts
+        num_logical_experts = total_physical_experts // 2
+        hidden_sizes = [32]
+
+        # Create different index distributions
+        old_redundancy = create_redundancy_config(num_logical_experts,
+                                                  total_physical_experts)
+        new_redundancy = create_redundancy_config(num_logical_experts,
+                                                  total_physical_experts)
+
+        old_indices = create_expert_indices_with_redundancy(
+            num_layers, num_logical_experts, total_physical_experts,
+            old_redundancy)
+        new_indices = create_expert_indices_with_redundancy(
+            num_layers, num_logical_experts, total_physical_experts,
+            new_redundancy)
+
+        expert_weights = create_expert_weights(num_layers, num_local_experts,
+                                               hidden_sizes, ep_rank, device,
+                                               old_indices)
+
+        # Save original weights
+        original_weights = []
+        for layer_weights in expert_weights:
+            layer_copy = []
+            for weight in layer_weights:
+                layer_copy.append(weight.clone())
+            original_weights.append(layer_copy)
+
+        # Execute profile mode rearrangement
+        rearrange_expert_weights_inplace(
+            old_indices,
+            new_indices,
+            expert_weights,
+            ep_group,
+            is_profile=True  # Profile mode
+        )
+
+        # In profile mode, the weights should remain unchanged
+        for layer in range(num_layers):
+            for weight_idx in range(len(hidden_sizes)):
+                torch.testing.assert_close(
+                    expert_weights[layer][weight_idx],
+                    original_weights[layer][weight_idx],
+                    msg="In profile mode, the weights should remain unchanged")
+
+    distributed_run(worker_fn, world_size)
--- a/tests/distributed/test_node_count.py
+++ b/tests/distributed/test_node_count.py
@ -0,0 +1,43 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import os
+
+import torch.distributed as dist
+
+from vllm.distributed.parallel_state import _node_count
+from vllm.distributed.utils import StatelessProcessGroup
+from vllm.utils import get_ip, get_open_port
+
+if __name__ == "__main__":
+    dist.init_process_group(backend="gloo")
+
+    rank = dist.get_rank()
+    world_size = dist.get_world_size()
+
+    if rank == 0:
+        port = get_open_port()
+        ip = get_ip()
+        dist.broadcast_object_list([ip, port], src=0)
+    else:
+        recv = [None, None]
+        dist.broadcast_object_list(recv, src=0)
+        ip, port = recv
+
+    stateless_pg = StatelessProcessGroup.create(ip, port, rank, world_size)
+
+    for pg in [dist.group.WORLD, stateless_pg]:
+        test_result = _node_count(pg)
+
+        # Expected node count based on environment variable)
+        expected = int(os.environ.get("NUM_NODES", "1"))
+
+        assert test_result == expected, \
+            f"Expected {expected} nodes, got {test_result}"
+
+        if pg == dist.group.WORLD:
+            print(f"Node count test passed! Got {test_result} nodes "
+                  f"when using torch distributed!")
+        else:
+            print(f"Node count test passed! Got {test_result} nodes "
+                  f"when using StatelessProcessGroup!")
--- a/tests/distributed/test_quick_all_reduce.py
+++ b/tests/distributed/test_quick_all_reduce.py
@ -0,0 +1,138 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import random
+
+import pytest
+import ray
+import torch
+import torch.distributed as dist
+
+from vllm.distributed.communication_op import (  # noqa
+    tensor_model_parallel_all_reduce)
+from vllm.distributed.parallel_state import (get_tensor_model_parallel_group,
+                                             get_tp_group, graph_capture)
+from vllm.platforms import current_platform
+
+from ..utils import (ensure_model_parallel_initialized,
+                     init_test_distributed_environment, multi_process_parallel)
+
+torch.manual_seed(42)
+random.seed(44)
+# Size over 8MB is sufficient for custom quick allreduce.
+test_sizes = [
+    random.randint(8 * 1024 * 1024, 10 * 1024 * 1024) for _ in range(8)
+]
+for i, v in enumerate(test_sizes):
+    test_sizes[i] -= v % 8
+
+
+@ray.remote(num_gpus=1, max_calls=1)
+def graph_quickreduce(
+    monkeypatch: pytest.MonkeyPatch,
+    tp_size,
+    pp_size,
+    rank,
+    distributed_init_port,
+):
+    with monkeypatch.context() as m:
+        m.delenv("CUDA_VISIBLE_DEVICES", raising=False)
+        device = torch.device(f"cuda:{rank}")
+        torch.cuda.set_device(device)
+        init_test_distributed_environment(tp_size, pp_size, rank,
+                                          distributed_init_port)
+        ensure_model_parallel_initialized(tp_size, pp_size)
+        group = get_tensor_model_parallel_group().device_group
+
+        # A small all_reduce for warmup.
+        # this is needed because device communicators might be created lazily
+        # (e.g. NCCL). This will ensure that the communicator is initialized
+        # before any communication happens, so that this group can be used for
+        # graph capture immediately.
+        data = torch.zeros(1)
+        data = data.to(device=device)
+        torch.distributed.all_reduce(data, group=group)
+        torch.cuda.synchronize()
+        del data
+
+        # we use the first group to communicate once
+        # and the second group to communicate twice
+        # and so on
+        # this is used to demonstrate that each group can
+        # communicate independently
+        num_communication = rank // tp_size + 1
+
+        for sz in test_sizes:
+            for dtype in [torch.float16, torch.bfloat16]:
+                with graph_capture(device=device) as graph_capture_context:
+                    inp1 = torch.randint(1,
+                                         23, (sz, ),
+                                         dtype=dtype,
+                                         device=torch.cuda.current_device())
+                    inp2 = torch.randint(-23,
+                                         1, (sz, ),
+                                         dtype=dtype,
+                                         device=torch.cuda.current_device())
+                    torch.cuda.synchronize()
+                    graph = torch.cuda.CUDAGraph()
+                    with torch.cuda.graph(graph,
+                                          stream=graph_capture_context.stream):
+                        for _ in range(num_communication):
+                            out1 = tensor_model_parallel_all_reduce(inp1)
+                            dist.all_reduce(inp1, group=group)
+                            out2 = tensor_model_parallel_all_reduce(inp2)
+                            dist.all_reduce(inp2, group=group)
+                graph.replay()
+                torch.testing.assert_close(out1, inp1, atol=2.5, rtol=0.1)
+                torch.testing.assert_close(out2, inp2, atol=2.5, rtol=0.1)
+
+
+@ray.remote(num_gpus=1, max_calls=1)
+def eager_quickreduce(
+    monkeypatch: pytest.MonkeyPatch,
+    tp_size,
+    pp_size,
+    rank,
+    distributed_init_port,
+):
+    with monkeypatch.context() as m:
+        m.delenv("CUDA_VISIBLE_DEVICES", raising=False)
+        device = torch.device(f"cuda:{rank}")
+        torch.cuda.set_device(device)
+
+        init_test_distributed_environment(tp_size, pp_size, rank,
+                                          distributed_init_port)
+
+        # Size over 8MB is sufficient for custom quick allreduce.
+        sz = 16 * 1024 * 1024
+        fa = get_tp_group().device_communicator.qr_comm
+        inp = torch.tensor([1.0 * ((i) % 23) for i in range(sz)],
+                           dtype=torch.float16,
+                           device=device)
+        out = fa.quick_all_reduce(inp)
+        torch.testing.assert_close(out, inp * tp_size, atol=2.5, rtol=0.1)
+
+        inp = torch.tensor([1.0 * ((i) % 23) for i in range(sz)],
+                           dtype=torch.bfloat16,
+                           device=device)
+        out = fa.quick_all_reduce(inp)
+        torch.testing.assert_close(out, inp * tp_size, atol=2.5, rtol=0.1)
+
+
+@pytest.mark.skipif(not current_platform.is_rocm(),
+                    reason="only test quick allreduce for rocm")
+@pytest.mark.parametrize("quant_mode", ["FP", "INT8", "INT6", "INT4"])
+@pytest.mark.parametrize("tp_size", [2])
+@pytest.mark.parametrize("pipeline_parallel_size", [1, 2])
+@pytest.mark.parametrize("test_target", [graph_quickreduce, eager_quickreduce])
+def test_custom_quick_allreduce(monkeypatch: pytest.MonkeyPatch, tp_size,
+                                pipeline_parallel_size, test_target,
+                                quant_mode):
+    world_size = tp_size * pipeline_parallel_size
+    if world_size > torch.cuda.device_count():
+        pytest.skip("Not enough GPUs to run the test.")
+
+    monkeypatch.setenv("VLLM_ROCM_QUICK_REDUCE_QUANTIZATION", quant_mode)
+
+    multi_process_parallel(monkeypatch, tp_size, pipeline_parallel_size,
+                           test_target)
--- a/tests/entrypoints/openai/test_optional_middleware.py
+++ b/tests/entrypoints/openai/test_optional_middleware.py
@ -0,0 +1,116 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Tests for middleware that's off by default and can be toggled through
+server arguments, mainly --api-key and --enable-request-id-headers.
+"""
+
+from http import HTTPStatus
+
+import pytest
+import requests
+
+from ...utils import RemoteOpenAIServer
+
+# Use a small embeddings model for faster startup and smaller memory footprint.
+# Since we are not testing any chat functionality,
+# using a chat capable model is overkill.
+MODEL_NAME = "intfloat/multilingual-e5-small"
+
+
+@pytest.fixture(scope="module")
+def server(request: pytest.FixtureRequest):
+    passed_params = []
+    if hasattr(request, "param"):
+        passed_params = request.param
+    if isinstance(passed_params, str):
+        passed_params = [passed_params]
+
+    args = [
+        "--task",
+        "embed",
+        # use half precision for speed and memory savings in CI environment
+        "--dtype",
+        "float16",
+        "--max-model-len",
+        "512",
+        "--enforce-eager",
+        "--max-num-seqs",
+        "2",
+        *passed_params
+    ]
+    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+        yield remote_server
+
+
+@pytest.mark.asyncio
+async def test_no_api_token(server: RemoteOpenAIServer):
+    response = requests.get(server.url_for("v1/models"))
+    assert response.status_code == HTTPStatus.OK
+
+
+@pytest.mark.asyncio
+async def test_no_request_id_header(server: RemoteOpenAIServer):
+    response = requests.get(server.url_for("health"))
+    assert "X-Request-Id" not in response.headers
+
+
+@pytest.mark.parametrize(
+    "server",
+    [["--api-key", "test"]],
+    indirect=True,
+)
+@pytest.mark.asyncio
+async def test_missing_api_token(server: RemoteOpenAIServer):
+    response = requests.get(server.url_for("v1/models"))
+    assert response.status_code == HTTPStatus.UNAUTHORIZED
+
+
+@pytest.mark.parametrize(
+    "server",
+    [["--api-key", "test"]],
+    indirect=True,
+)
+@pytest.mark.asyncio
+async def test_passed_api_token(server: RemoteOpenAIServer):
+    response = requests.get(server.url_for("v1/models"),
+                            headers={"Authorization": "Bearer test"})
+    assert response.status_code == HTTPStatus.OK
+
+
+@pytest.mark.parametrize(
+    "server",
+    [["--api-key", "test"]],
+    indirect=True,
+)
+@pytest.mark.asyncio
+async def test_not_v1_api_token(server: RemoteOpenAIServer):
+    # Authorization check is skipped for any paths that
+    # don't start with /v1 (e.g. /v1/chat/completions).
+    response = requests.get(server.url_for("health"))
+    assert response.status_code == HTTPStatus.OK
+
+
+@pytest.mark.parametrize(
+    "server",
+    ["--enable-request-id-headers"],
+    indirect=True,
+)
+@pytest.mark.asyncio
+async def test_enable_request_id_header(server: RemoteOpenAIServer):
+    response = requests.get(server.url_for("health"))
+    assert "X-Request-Id" in response.headers
+    assert len(response.headers.get("X-Request-Id", "")) == 32
+
+
+@pytest.mark.parametrize(
+    "server",
+    ["--enable-request-id-headers"],
+    indirect=True,
+)
+@pytest.mark.asyncio
+async def test_custom_request_id_header(server: RemoteOpenAIServer):
+    response = requests.get(server.url_for("health"),
+                            headers={"X-Request-Id": "Custom"})
+    assert "X-Request-Id" in response.headers
+    assert response.headers.get("X-Request-Id") == "Custom"
--- a/tests/entrypoints/openai/test_transcription_validation.py
+++ b/tests/entrypoints/openai/test_transcription_validation.py
@ -82,6 +82,8 @@ async def test_long_audio_request(mary_had_lamb):

    mary_had_lamb.seek(0)
    audio, sr = librosa.load(mary_had_lamb)
+    # Add small silence after each audio for repeatability in the split process
+    audio = np.pad(audio, (0, 1600))
    repeated_audio = np.tile(audio, 10)
    # Repeated audio to buffer
    buffer = io.BytesIO()
--- a/tests/entrypoints/openai/test_translation_validation.py
+++ b/tests/entrypoints/openai/test_translation_validation.py
@ -0,0 +1,172 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import io
+# imports for guided decoding tests
+import json
+from unittest.mock import patch
+
+import librosa
+import numpy as np
+import pytest
+import soundfile as sf
+from openai._base_client import AsyncAPIClient
+
+from vllm.assets.audio import AudioAsset
+
+from ...utils import RemoteOpenAIServer
+
+
+@pytest.fixture
+def foscolo():
+    # Test translation it->en
+    path = AudioAsset('azacinto_foscolo').get_local_path()
+    with open(str(path), "rb") as f:
+        yield f
+
+
+# NOTE: (NickLucche) the large-v3-turbo model was not trained on translation!
+@pytest.mark.asyncio
+async def test_basic_audio(foscolo):
+    model_name = "openai/whisper-small"
+    server_args = ["--enforce-eager"]
+    with RemoteOpenAIServer(model_name, server_args) as remote_server:
+        client = remote_server.get_async_client()
+        translation = await client.audio.translations.create(
+            model=model_name,
+            file=foscolo,
+            response_format="text",
+            # TODO remove once language detection is implemented
+            extra_body=dict(language="it"),
+            temperature=0.0)
+        out = json.loads(translation)['text'].strip()
+        assert "Nor will I ever touch the sacred" in out
+
+
+@pytest.mark.asyncio
+async def test_audio_prompt(foscolo):
+    model_name = "openai/whisper-small"
+    server_args = ["--enforce-eager"]
+    # Condition whisper on starting text
+    prompt = "Nor have I ever"
+    with RemoteOpenAIServer(model_name, server_args) as remote_server:
+        client = remote_server.get_async_client()
+        transcription = await client.audio.translations.create(
+            model=model_name,
+            file=foscolo,
+            prompt=prompt,
+            extra_body=dict(language="it"),
+            response_format="text",
+            temperature=0.0)
+        out = json.loads(transcription)['text']
+        assert "Nor will I ever touch the sacred" not in out
+        assert prompt not in out
+
+
+@pytest.mark.asyncio
+async def test_non_asr_model(foscolo):
+    # text to text model
+    model_name = "JackFram/llama-68m"
+    server_args = ["--enforce-eager"]
+    with RemoteOpenAIServer(model_name, server_args) as remote_server:
+        client = remote_server.get_async_client()
+        res = await client.audio.translations.create(model=model_name,
+                                                     file=foscolo,
+                                                     temperature=0.0)
+        assert res.code == 400 and not res.text
+        assert res.message == "The model does not support Translations API"
+
+
+@pytest.mark.asyncio
+async def test_streaming_response(foscolo):
+    model_name = "openai/whisper-small"
+    server_args = ["--enforce-eager"]
+    translation = ""
+    with RemoteOpenAIServer(model_name, server_args) as remote_server:
+        client = remote_server.get_async_client()
+        res_no_stream = await client.audio.translations.create(
+            model=model_name,
+            file=foscolo,
+            response_format="json",
+            extra_body=dict(language="it"),
+            temperature=0.0)
+        # Unfortunately this only works when the openai client is patched
+        # to use streaming mode, not exposed in the translation api.
+        original_post = AsyncAPIClient.post
+
+        async def post_with_stream(*args, **kwargs):
+            kwargs['stream'] = True
+            return await original_post(*args, **kwargs)
+
+        with patch.object(AsyncAPIClient, "post", new=post_with_stream):
+            client = remote_server.get_async_client()
+            res = await client.audio.translations.create(model=model_name,
+                                                         file=foscolo,
+                                                         temperature=0.0,
+                                                         extra_body=dict(
+                                                             stream=True,
+                                                             language="it"))
+            # Reconstruct from chunks and validate
+            async for chunk in res:
+                # just a chunk
+                text = chunk.choices[0]['delta']['content']
+                translation += text
+
+        assert translation == res_no_stream.text
+
+
+@pytest.mark.asyncio
+async def test_stream_options(foscolo):
+    model_name = "openai/whisper-small"
+    server_args = ["--enforce-eager"]
+    with RemoteOpenAIServer(model_name, server_args) as remote_server:
+        original_post = AsyncAPIClient.post
+
+        async def post_with_stream(*args, **kwargs):
+            kwargs['stream'] = True
+            return await original_post(*args, **kwargs)
+
+        with patch.object(AsyncAPIClient, "post", new=post_with_stream):
+            client = remote_server.get_async_client()
+            res = await client.audio.translations.create(
+                model=model_name,
+                file=foscolo,
+                temperature=0.0,
+                extra_body=dict(language="it",
+                                stream=True,
+                                stream_include_usage=True,
+                                stream_continuous_usage_stats=True))
+            final = False
+            continuous = True
+            async for chunk in res:
+                if not len(chunk.choices):
+                    # final usage sent
+                    final = True
+                else:
+                    continuous = continuous and hasattr(chunk, 'usage')
+            assert final and continuous
+
+
+@pytest.mark.asyncio
+async def test_long_audio_request(foscolo):
+    model_name = "openai/whisper-small"
+    server_args = ["--enforce-eager"]
+
+    foscolo.seek(0)
+    audio, sr = librosa.load(foscolo)
+    repeated_audio = np.tile(audio, 2)
+    # Repeated audio to buffer
+    buffer = io.BytesIO()
+    sf.write(buffer, repeated_audio, sr, format='WAV')
+    buffer.seek(0)
+    with RemoteOpenAIServer(model_name, server_args) as remote_server:
+        client = remote_server.get_async_client()
+        translation = await client.audio.translations.create(
+            model=model_name,
+            file=buffer,
+            extra_body=dict(language="it"),
+            response_format="text",
+            temperature=0.0)
+        out = json.loads(translation)['text'].strip().lower()
+        # TODO investigate higher model uncertainty in for longer translations.
+        assert out.count("nor will i ever") == 2
--- a/tests/kernels/attention/test_mla_decode_cpu.py
+++ b/tests/kernels/attention/test_mla_decode_cpu.py
@ -7,10 +7,7 @@ from torch import Tensor

 import vllm._custom_ops as ops
 from vllm.platforms import current_platform
-
-
-def cdiv(a, b):
-    return (a + b - 1) // b
+from vllm.utils import cdiv


 def ref_mla(
--- a/tests/kernels/attention/test_triton_decode_attention.py
+++ b/tests/kernels/attention/test_triton_decode_attention.py
@ -5,10 +5,7 @@ import pytest
 import torch

 from vllm.attention.ops.triton_decode_attention import decode_attention_fwd
-
-
-def cdiv(a, b):
-    return (a + b - 1) // b
+from vllm.utils import cdiv


@pytest.mark.parametrize("B", [3, 5])
--- a/tests/kernels/moe/test_cutlass_moe.py
+++ b/tests/kernels/moe/test_cutlass_moe.py
@ -29,7 +29,10 @@ MNK_FACTORS = [
    (224, 1024, 1536),
    (224, 3072, 1024),
    (224, 3072, 1536),
-    (1024 * 128, 1024, 1024),
+    (32768, 1024, 1024),
+    # These sizes trigger wrong answers.
+    #(7232, 2048, 5120),
+    #(40000, 2048, 5120),
 ]

 vllm_config = VllmConfig(parallel_config=ParallelConfig(
@ -232,8 +235,10 @@ def test_cutlass_moe_8_bit_no_graph(
    topk: int,
    per_act_token: bool,
    per_out_ch: bool,
+    monkeypatch,
 ):
    current_platform.seed_everything(7)
+    monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", "8192")
    with set_current_vllm_config(vllm_config):
        mt = MOETensors8Bit.make_moe_tensors_8bit(m, k, n, e, per_act_token,
                                                  per_out_ch)
@ -274,8 +279,10 @@ def test_cutlass_moe_8_bit_cuda_graph(
    topk: int,
    per_act_token: bool,
    per_out_ch: bool,
+    monkeypatch,
 ):
    current_platform.seed_everything(7)
+    monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", "8192")
    with set_current_vllm_config(vllm_config):
        dtype = torch.half

@ -329,8 +336,10 @@ def test_cutlass_moe_8_bit_EP(
    per_act_token: bool,
    per_out_channel: bool,
    ep_size: int,
+    monkeypatch,
 ):
    current_platform.seed_everything(7)
+    monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", "8192")
    with set_current_vllm_config(vllm_config):
        mt = MOETensors8Bit.make_moe_tensors_8bit(m, k, n, e, per_act_token,
                                                  per_out_channel)
--- a/tests/kernels/moe/test_deepep_deepgemm_moe.py
+++ b/tests/kernels/moe/test_deepep_deepgemm_moe.py
@ -22,7 +22,7 @@ from vllm.model_executor.layers.quantization.utils.fp8_utils import (
    per_token_group_quant_fp8)
 from vllm.platforms import current_platform

-from .deepep_utils import ProcessGroupInfo, parallel_launch
+from .utils import ProcessGroupInfo, parallel_launch

 has_deep_ep = importlib.util.find_spec("deep_ep") is not None

--- a/tests/kernels/moe/test_deepep_moe.py
+++ b/tests/kernels/moe/test_deepep_moe.py
@ -23,7 +23,7 @@ from vllm.model_executor.layers.quantization.utils.fp8_utils import (
    per_token_group_quant_fp8)
 from vllm.platforms import current_platform

-from .deepep_utils import ProcessGroupInfo, parallel_launch
+from .utils import ProcessGroupInfo, parallel_launch

 has_deep_ep = importlib.util.find_spec("deep_ep") is not None

--- a/tests/kernels/moe/test_moe.py
+++ b/tests/kernels/moe/test_moe.py
@ -4,6 +4,9 @@

 Run `pytest tests/kernels/test_moe.py`.
 """
+import functools
+from typing import Callable, Optional, Union
+
 import pytest
 import torch
 from torch.nn import Parameter
@ -14,6 +17,7 @@ from transformers.models.mixtral.modeling_mixtral import MixtralSparseMoeBlock
 import vllm.model_executor.layers.fused_moe  # noqa
 from tests.kernels.utils import opcheck, stack_and_dev, torch_moe
 from vllm.config import VllmConfig, set_current_vllm_config
+from vllm.forward_context import set_forward_context
 from vllm.model_executor.layers.fused_moe import fused_moe
 from vllm.model_executor.layers.fused_moe.fused_moe import (
    fused_topk, modular_triton_fused_moe)
@ -40,7 +44,76 @@ vllm_config.scheduler_config.max_num_seqs = 128
 vllm_config.scheduler_config.max_model_len = 8192


-@pytest.mark.parametrize("m", [1, 33, 64, 222, 1024 * 128])
+def run_moe_test(
+    baseline: Union[Callable, torch.Tensor],
+    moe_fn: Callable,
+    a: torch.Tensor,
+    w1: torch.Tensor,
+    w2: torch.Tensor,
+    score: torch.Tensor,
+    topk: int,
+    global_num_experts: int = -1,
+    expert_map: Optional[torch.Tensor] = None,
+    padding: bool = False,
+    use_compile: bool = False,
+    use_cudagraph: bool = False,
+    atol: float = 2e-2,
+    rtol: float = 0,
+) -> torch.Tensor:
+    if isinstance(baseline, torch.Tensor):
+        baseline_output = baseline
+    else:
+        baseline_output = baseline(a,
+                                   w1,
+                                   w2,
+                                   score,
+                                   topk,
+                                   global_num_experts=global_num_experts,
+                                   expert_map=expert_map)
+
+    # Pad the weight if moe padding is enabled
+    if padding:
+        w1 = F.pad(w1, (0, 128), "constant", 0)[..., 0:-128]
+        w2 = F.pad(w2, (0, 128), "constant", 0)[..., 0:-128]
+
+    if use_compile:
+        moe_fn = torch.compile(moe_fn, backend="inductor", fullgraph=True)
+        torch._dynamo.mark_dynamic(a, 0)
+        torch._dynamo.mark_dynamic(score, 0)
+
+    test_output = moe_fn(a,
+                         w1,
+                         w2,
+                         score,
+                         topk,
+                         global_num_experts=global_num_experts,
+                         expert_map=expert_map)
+
+    if use_cudagraph:
+        test_output.fill_(0)
+        stream = torch.cuda.Stream()
+        graph = torch.cuda.CUDAGraph()
+        with torch.cuda.graph(graph, stream=stream):
+            test_output = moe_fn(a,
+                                 w1,
+                                 w2,
+                                 score,
+                                 topk,
+                                 global_num_experts=global_num_experts,
+                                 expert_map=expert_map)
+        torch.cuda.synchronize()
+        graph.replay()
+        torch.cuda.synchronize()
+
+    torch.testing.assert_close(test_output,
+                               baseline_output,
+                               atol=atol,
+                               rtol=rtol)
+
+    return baseline_output
+
+
+@pytest.mark.parametrize("m", [1, 33, 64, 222, 32768, 40000])
@pytest.mark.parametrize("n", [128, 1024, 2048])
@pytest.mark.parametrize("k", [128, 511, 1024])
@pytest.mark.parametrize("e", NUM_EXPERTS)
@ -48,6 +121,7 @@ vllm_config.scheduler_config.max_model_len = 8192
@pytest.mark.parametrize("ep_size", EP_SIZE)
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
@pytest.mark.parametrize("padding", [True, False])
+@pytest.mark.parametrize("chunk_size", [8192])
 def test_fused_moe(
    m: int,
    n: int,
@ -57,7 +131,17 @@ def test_fused_moe(
    ep_size: int,
    dtype: torch.dtype,
    padding: bool,
+    chunk_size: int,
+    monkeypatch,
 ):
+    current_platform.seed_everything(7)
+
+    monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", str(chunk_size))
+
+    #
+    # Setup test data
+    #
+
    a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
    w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10
    w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10
@ -77,58 +161,70 @@ def test_fused_moe(
    else:
        e_map = None

-    m_fused_moe = modular_triton_fused_moe(use_fp8_w8a8=False,
-                                           use_int8_w8a8=False,
-                                           use_int8_w8a16=False,
-                                           use_int4_w4a16=False,
-                                           per_channel_quant=False,
-                                           block_shape=None)
+    #
+    # Setup test functions
+    #
+
+    m_fused_moe_fn = modular_triton_fused_moe(use_fp8_w8a8=False,
+                                              use_int8_w8a8=False,
+                                              use_int8_w8a16=False,
+                                              use_int4_w4a16=False,
+                                              per_channel_quant=False,
+                                              block_shape=None)
+
+    def m_fused_moe(
+        a: torch.Tensor,
+        w1: torch.Tensor,
+        w2: torch.Tensor,
+        score: torch.Tensor,
+        topk: int,
+        global_num_experts: int = -1,
+        expert_map: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        topk_weights, topk_ids, _ = fused_topk(a, score, topk, False)
+        return m_fused_moe_fn(a,
+                              w1,
+                              w2,
+                              topk_weights,
+                              topk_ids,
+                              global_num_experts=global_num_experts,
+                              expert_map=expert_map)
+
+    fused_moe_fn = functools.partial(fused_moe, renormalize=False)
+
+    #
+    # Run tests
+    #
+    runner = functools.partial(
+        run_moe_test,
+        a=a,
+        w1=w1,
+        w2=w2,
+        score=score,
+        topk=topk,
+        global_num_experts=e,
+        expert_map=e_map,
+        padding=padding,
+    )
+
+    # Note: for now use_compile will error out if the problem size is
+    # large enough to trigger chunking. I'm leaving the flag and
+    # setup code in case we are able to revisit this later.
+    use_compile = False
+
+    use_cudagraph = (n >= 1024 and k >= 1024
+                     and current_platform.is_cuda_alike())

    with set_current_vllm_config(vllm_config):
-        torch_output = torch_moe(a, w1, w2, score, topk, e_map)
-        iterative_output = iterative_moe(a,
-                                         w1,
-                                         w2,
-                                         score,
-                                         topk,
-                                         global_num_experts=e,
-                                         expert_map=e_map,
-                                         renormalize=False)
-
-        # Pad the weight if moe padding is enabled
-        if padding:
-            w1 = F.pad(w1, (0, 128), "constant", 0)[..., 0:-128]
-            torch.cuda.empty_cache()
-            w2 = F.pad(w2, (0, 128), "constant", 0)[..., 0:-128]
-            torch.cuda.empty_cache()
-
-        triton_output = fused_moe(a,
-                                  w1,
-                                  w2,
-                                  score,
-                                  topk,
-                                  global_num_experts=e,
-                                  expert_map=e_map,
-                                  renormalize=False)
-
-        topk_weights, topk_ids, _ = fused_topk(a, score, topk, False)
-        m_triton_output = m_fused_moe(a,
-                                      w1,
-                                      w2,
-                                      topk_weights,
-                                      topk_ids,
-                                      global_num_experts=e,
-                                      expert_map=e_map)
-
-    torch.testing.assert_close(triton_output, torch_output, atol=2e-2, rtol=0)
-    torch.testing.assert_close(m_triton_output,
-                               torch_output,
-                               atol=2e-2,
-                               rtol=0)
-    torch.testing.assert_close(iterative_output,
-                               torch_output,
-                               atol=2e-2,
-                               rtol=0)
+        baseline_output = runner(torch_moe, iterative_moe)
+        runner(baseline_output,
+               fused_moe_fn,
+               use_compile=use_compile,
+               use_cudagraph=use_cudagraph)
+        runner(baseline_output,
+               m_fused_moe,
+               use_compile=use_compile,
+               use_cudagraph=use_cudagraph)


@pytest.mark.parametrize("m", [1, 32, 222])
@ -238,7 +334,12 @@ def test_fused_moe_wn16(m: int, n: int, k: int, e: int, topk: int,
                                  w1_zp=w1_qzeros if has_zp else None,
                                  w2_zp=w2_qzeros if has_zp else None,
                                  block_shape=[0, group_size])
-        torch_output = torch_moe(a, w1_ref, w2_ref, score, topk, e_map)
+        torch_output = torch_moe(a,
+                                 w1_ref,
+                                 w2_ref,
+                                 score,
+                                 topk,
+                                 expert_map=e_map)

    torch.testing.assert_close(triton_output, torch_output, atol=2e-2, rtol=0)

@ -265,45 +366,51 @@ def test_mixtral_moe(dtype: torch.dtype, padding: bool, use_rocm_aiter: bool,
            pytest.skip("AITER ROCm test skip for float32")

    # Instantiate our and huggingface's MoE blocks
-    config = MixtralConfig()
-    hf_moe = MixtralSparseMoeBlock(config).to(dtype).to("cuda")
-    vllm_moe = MixtralMoE(
-        num_experts=config.num_local_experts,
-        top_k=config.num_experts_per_tok,
-        hidden_size=config.hidden_size,
-        intermediate_size=config.intermediate_size,
-        params_dtype=dtype,
-        tp_size=1,
-        dp_size=1,
-    ).cuda()
+    vllm_config.compilation_config.static_forward_context = dict()
+    with (set_current_vllm_config(vllm_config),
+          set_forward_context(None, vllm_config)):
+        config = MixtralConfig()
+        hf_moe = MixtralSparseMoeBlock(config).to(dtype).to("cuda")
+        vllm_moe = MixtralMoE(
+            num_experts=config.num_local_experts,
+            top_k=config.num_experts_per_tok,
+            hidden_size=config.hidden_size,
+            intermediate_size=config.intermediate_size,
+            params_dtype=dtype,
+            tp_size=1,
+            dp_size=1,
+        ).cuda()

-    # Load the weights
-    vllm_moe.gate.weight.data[:] = hf_moe.gate.weight.data
-    for i in range(config.num_local_experts):
-        weights = (hf_moe.experts[i].w1.weight.data,
-                   hf_moe.experts[i].w3.weight.data)
-        vllm_moe.experts.w13_weight[i][:] = torch.cat(weights, dim=0)
-        vllm_moe.experts.w2_weight[i][:] = hf_moe.experts[i].w2.weight.data
+        # Load the weights
+        vllm_moe.gate.weight.data[:] = hf_moe.gate.weight.data
+        for i in range(config.num_local_experts):
+            weights = (hf_moe.experts[i].w1.weight.data,
+                       hf_moe.experts[i].w3.weight.data)
+            vllm_moe.experts.w13_weight[i][:] = torch.cat(weights, dim=0)
+            vllm_moe.experts.w2_weight[i][:] = hf_moe.experts[i].w2.weight.data

-    # Generate input batch of dimensions [batch_size, seq_len, hidden_dim]
-    hf_inputs = torch.randn((1, 64, config.hidden_size)).to(dtype).to("cuda")
-    # vLLM uses 1D query [num_tokens, hidden_dim]
-    vllm_inputs = hf_inputs.flatten(0, 1)
+        # Generate input batch of dimensions [batch_size, seq_len, hidden_dim]
+        hf_inputs = torch.randn(
+            (1, 64, config.hidden_size)).to(dtype).to("cuda")
+        # vLLM uses 1D query [num_tokens, hidden_dim]
+        vllm_inputs = hf_inputs.flatten(0, 1)

-    # Pad the weight if moe padding is enabled
-    if padding:
-        vllm_moe.experts.w13_weight = Parameter(F.pad(
-            vllm_moe.experts.w13_weight, (0, 128), "constant", 0)[..., 0:-128],
-                                                requires_grad=False)
-        torch.cuda.empty_cache()
-        vllm_moe.experts.w2_weight = Parameter(F.pad(
-            vllm_moe.experts.w2_weight, (0, 128), "constant", 0)[..., 0:-128],
-                                               requires_grad=False)
-        torch.cuda.empty_cache()
+        # Pad the weight if moe padding is enabled
+        if padding:
+            vllm_moe.experts.w13_weight = Parameter(F.pad(
+                vllm_moe.experts.w13_weight, (0, 128), "constant", 0)[...,
+                                                                      0:-128],
+                                                    requires_grad=False)
+            torch.cuda.empty_cache()
+            vllm_moe.experts.w2_weight = Parameter(F.pad(
+                vllm_moe.experts.w2_weight, (0, 128), "constant", 0)[...,
+                                                                     0:-128],
+                                                   requires_grad=False)
+            torch.cuda.empty_cache()

-    # Run forward passes for both MoE blocks
-    hf_states, _ = hf_moe.forward(hf_inputs)
-    vllm_states = vllm_moe.forward(vllm_inputs)
+        # Run forward passes for both MoE blocks
+        hf_states, _ = hf_moe.forward(hf_inputs)
+        vllm_states = vllm_moe.forward(vllm_inputs)

    mixtral_moe_tol = {
        torch.float32: 1e-3,
@ -546,7 +653,12 @@ def test_fused_marlin_moe(
    topk_weights, topk_ids, _ = fused_topk(a, score, topk, False)

    with set_current_vllm_config(vllm_config):
-        torch_output = torch_moe(a, w_ref1, w_ref2, score, topk, e_map)
+        torch_output = torch_moe(a,
+                                 w_ref1,
+                                 w_ref2,
+                                 score,
+                                 topk,
+                                 expert_map=e_map)

    marlin_output = torch.ops.vllm.fused_marlin_moe(
        a,
--- a/tests/kernels/moe/test_nvfp4_moe.py
+++ b/tests/kernels/moe/test_nvfp4_moe.py
@ -136,7 +136,7 @@ def test_cutlass_fp4_moe_no_graph(m: int, n: int, k: int, e: int, topk: int,
                                                  device=w2.device,
                                                  block_size=quant_blocksize)

-        torch_output = torch_moe(a_in_dtype, w1_d, w2_d, score, topk, None)
+        torch_output = torch_moe(a_in_dtype, w1_d, w2_d, score, topk)

        torch.testing.assert_close(torch_output,
                                   cutlass_output,
--- a/tests/kernels/moe/test_pplx_cutlass_moe.py
+++ b/tests/kernels/moe/test_pplx_cutlass_moe.py
@ -6,16 +6,16 @@ from typing import Optional
 import pytest
 import torch

+from tests.kernels.utils import torch_experts
 from vllm import _custom_ops as ops
 from vllm.config import VllmConfig, set_current_vllm_config
-from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.fused_moe.cutlass_moe import CutlassExpertsFp8
 from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk
 from vllm.model_executor.layers.fused_moe.modular_kernel import (
    FusedMoEModularKernel)
 from vllm.platforms import current_platform

-from .deepep_utils import ProcessGroupInfo, parallel_launch
+from .utils import ProcessGroupInfo, parallel_launch

 try:
    from pplx_kernels import AllToAll
@ -164,22 +164,6 @@ vllm_config.scheduler_config.max_num_seqs = 128
 vllm_config.scheduler_config.max_model_len = 8192


-def torch_moe2(a, w1, w2, topk_weight, topk_ids):
-    M, K = a.shape
-    topk = topk_ids.shape[1]
-    a = a.view(M, -1, K).repeat(1, topk, 1).reshape(-1, K)
-    out = torch.zeros(M * topk, w2.shape[1], dtype=a.dtype, device=a.device)
-    num_experts = w1.shape[0]
-    for i in range(num_experts):
-        mask = (topk_ids == i).view(-1)
-        if mask.sum():
-            out[mask] = SiluAndMul()(
-                a[mask] @ w1[i].transpose(0, 1)) @ w2[i].transpose(0, 1)
-
-    return (out.view(M, -1, w2.shape[1]) *
-            topk_weight.view(M, -1, 1).to(out.dtype)).sum(dim=1)
-
-
 def _pplx_moe(
    pgi: ProcessGroupInfo,
    dp_size: int,
@ -210,8 +194,8 @@ def _pplx_moe(
        group_name = cpu_group.group_name

    with set_current_vllm_config(vllm_config):
-        torch_output = torch_moe2(a_full, w1_full, w2_full, topk_weights,
-                                  topk_ids)
+        torch_output = torch_experts(a_full, w1_full, w2_full, topk_weights,
+                                     topk_ids)
        pplx_output = pplx_cutlass_moe(pgi, dp_size, a, w1, w2, w1_scale,
                                       w2_scale, topk_weights, topk_ids,
                                       a1_scale, out_dtype, per_act_token,
--- a/tests/kernels/moe/test_pplx_moe.py
+++ b/tests/kernels/moe/test_pplx_moe.py
@ -18,8 +18,8 @@ try:
 except ImportError:
    has_pplx = False

+from tests.kernels.utils import torch_experts
 from vllm.config import VllmConfig, set_current_vllm_config
-from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.fused_moe import override_config
 from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
    BatchedExperts, BatchedPrepareAndFinalize, BatchedTritonExperts)
@ -29,7 +29,7 @@ from vllm.model_executor.layers.fused_moe.modular_kernel import (
    FusedMoEModularKernel)
 from vllm.platforms import current_platform

-from .deepep_utils import ProcessGroupInfo, parallel_launch
+from .utils import ProcessGroupInfo, parallel_launch

 requires_pplx = pytest.mark.skipif(
    not has_pplx,
@ -163,29 +163,6 @@ def batched_moe(
    return fused_experts(a, w1, w2, topk_weight, topk_ids, num_experts)


-# Note: same as torch_moe but with fused_topk factored out.
-def torch_moe2(
-    a: torch.Tensor,
-    w1: torch.Tensor,
-    w2: torch.Tensor,
-    topk_weight: torch.Tensor,
-    topk_ids: torch.Tensor,
-) -> torch.Tensor:
-    M, K = a.shape
-    topk = topk_ids.shape[1]
-    a = a.view(M, -1, K).repeat(1, topk, 1).reshape(-1, K)
-    out = torch.zeros(M * topk, w2.shape[1], dtype=a.dtype, device=a.device)
-    num_experts = w1.shape[0]
-    for i in range(num_experts):
-        mask = (topk_ids == i).view(-1)
-        if mask.sum():
-            out[mask] = SiluAndMul()(
-                a[mask] @ w1[i].transpose(0, 1)) @ w2[i].transpose(0, 1)
-
-    return (out.view(M, -1, w2.shape[1]) *
-            topk_weight.view(M, -1, 1).to(out.dtype)).sum(dim=1)
-
-
@pytest.mark.parametrize("m", [1, 33, 64, 222])
@pytest.mark.parametrize("n", [128, 1024, 2048])
@pytest.mark.parametrize("k", [128, 512, 1024])
@ -209,7 +186,7 @@ def test_fused_moe_batched_experts(

    with set_current_vllm_config(vllm_config):
        topk_weight, topk_ids, _ = fused_topk(a, score, topk, False)
-        baseline_output = torch_moe2(a, w1, w2, topk_weight, topk_ids)
+        baseline_output = torch_experts(a, w1, w2, topk_weight, topk_ids)
        torch_output = torch_batched_moe(a, w1, w2, topk_weight, topk_ids)
        batched_output = batched_moe(a, w1, w2, topk_weight, topk_ids)

@ -409,7 +386,7 @@ def pplx_moe(
    w2: torch.Tensor,
    topk_weight: torch.Tensor,
    topk_ids: torch.Tensor,
-    use_compile: bool = True,
+    use_compile: bool = False,
    use_cudagraphs: bool = True,
 ) -> torch.Tensor:
    from vllm.model_executor.layers.fused_moe.pplx_prepare_finalize import (
@ -470,10 +447,16 @@ def pplx_moe(
    w1_chunk = chunk_by_rank(w1, rank, world_size).to(device)
    w2_chunk = chunk_by_rank(w2, rank, world_size).to(device)

+    # Note: for now use_compile will error out if the problem size is
+    # large enough to trigger chunking. I'm leaving the flag and
+    # setup code in case we are able to revisit this later.
    if use_compile:
        _fused_experts = torch.compile(fused_experts,
                                       backend='inductor',
                                       fullgraph=True)
+        torch._dynamo.mark_dynamic(a_chunk, 0)
+        torch._dynamo.mark_dynamic(chunk_topk_weight, 0)
+        torch._dynamo.mark_dynamic(chunk_topk_ids, 0)
    else:
        _fused_experts = fused_experts

@ -576,7 +559,7 @@ def _pplx_moe(

    with set_current_vllm_config(vllm_config), override_config(moe_config):
        topk_weight, topk_ids, _ = fused_topk(a, score, topk, False)
-        torch_output = torch_moe2(a, w1, w2, topk_weight, topk_ids)
+        torch_output = torch_experts(a, w1, w2, topk_weight, topk_ids)
        pplx_output = pplx_moe(group_name, pgi.rank, pgi.world_size, dp_size,
                               a, w1, w2, topk_weight, topk_ids)
        # TODO (bnell): fix + re-enable
--- a/tests/kernels/moe/deepep_utils.py
+++ b/tests/kernels/moe/deepep_utils.py
@ -4,6 +4,7 @@ DeepEP test utilities
 """
 import dataclasses
 import importlib
+import os
 import traceback
 from typing import Callable, Optional

@ -13,6 +14,8 @@ from torch.multiprocessing import (
    spawn)  # pyright: ignore[reportPrivateImportUsage]
 from typing_extensions import Concatenate, ParamSpec

+from vllm.model_executor.layers.fused_moe.utils import find_free_port
+
 has_deep_ep = importlib.util.find_spec("deep_ep") is not None
 if has_deep_ep:
    from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import (  # noqa: E501
@ -92,7 +95,7 @@ def parallel_launch(
            world_size,
            world_size,
            0,
-            "tcp://localhost:29500",
+            f"tcp://{os.getenv('LOCALHOST', 'localhost')}:{find_free_port()}",
            worker,
        ) + args,
        nprocs=world_size,
--- a/tests/kernels/quantization/test_block_fp8.py
+++ b/tests/kernels/quantization/test_block_fp8.py
@ -403,19 +403,24 @@ def deep_gemm_w8a8_block_fp8_moe(M, K, a, w1, w2, w1_s, w2_s, score, topk,
    itertools.product(M_moe_dg, N_moe, K_moe, E, TOP_KS, SEEDS))
@pytest.mark.skipif(not dg_available, reason="DeepGemm kernels not available.")
@torch.inference_mode()
-def test_w8a8_block_fp8_deep_gemm_fused_moe(M, N, K, E, topk, seed):
-
-    block_m = deep_gemm.get_m_alignment_for_contiguous_layout()
-    block_size = [block_m, block_m]
-    dtype = torch.bfloat16
-
+def test_w8a8_block_fp8_deep_gemm_fused_moe(M, N, K, E, topk, seed,
+                                            monkeypatch):
    if topk > E:
        pytest.skip(f"Skipping test: topk={topk} > E={E}")

    if not _valid_deep_gemm_shape(M, N, K):
        pytest.skip(f"Skipping test: invalid size m={M}, n={N}, k={K}")

+    chunk_size = 1024
+
    torch.manual_seed(seed)
+
+    monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", str(chunk_size))
+
+    block_m = deep_gemm.get_m_alignment_for_contiguous_layout()
+    block_size = [block_m, block_m]
+    dtype = torch.bfloat16
+
    fp8_info = torch.finfo(torch.float8_e4m3fn)
    fp8_max, fp8_min = fp8_info.max, fp8_info.min

@ -451,6 +456,14 @@ def test_w8a8_block_fp8_deep_gemm_fused_moe(M, N, K, E, topk, seed):
        w1[i], w1_s[i] = per_block_cast_to_fp8(w1_bf16[i])
        w2[i], w2_s[i] = per_block_cast_to_fp8(w2_bf16[i])

+    # Note: for now use_compile will error out if the problem size is
+    # large enough to trigger chunking. I'm leaving the flag and
+    # setup code in case we are able to revisit this later.
+    use_compile = False
+
+    use_cudagraph = (chunk_size < M and N >= 1024 and K >= 1024
+                     and current_platform.is_cuda_alike())
+
    # Set the context to avoid lots of warning spam.
    with set_current_vllm_config(vllm_config):
        if M >= 128:
@ -463,7 +476,29 @@ def test_w8a8_block_fp8_deep_gemm_fused_moe(M, N, K, E, topk, seed):
        topk_weights, topk_ids, token_expert_indices = fused_topk(
            a, score.float(), topk, False)

-        out = deep_gemm_moe_fp8(a, w1, w2, w1_s, w2_s, topk_weights, topk_ids)
+        if use_compile:
+            deep_gemm_moe_fp8_fn = torch.compile(deep_gemm_moe_fp8,
+                                                 backend="inductor",
+                                                 fullgraph=True)
+            torch._dynamo.mark_dynamic(a, 0)
+            torch._dynamo.mark_dynamic(topk_weights, 0)
+            torch._dynamo.mark_dynamic(topk_ids, 0)
+        else:
+            deep_gemm_moe_fp8_fn = deep_gemm_moe_fp8
+
+        out = deep_gemm_moe_fp8_fn(a, w1, w2, w1_s, w2_s, topk_weights,
+                                   topk_ids)
+
+        if use_cudagraph:
+            out.fill_(0)
+            stream = torch.cuda.Stream()
+            graph = torch.cuda.CUDAGraph()
+            with torch.cuda.graph(graph, stream=stream):
+                out = deep_gemm_moe_fp8_fn(a, w1, w2, w1_s, w2_s, topk_weights,
+                                           topk_ids)
+            torch.cuda.synchronize()
+            graph.replay()
+            torch.cuda.synchronize()

    #print(f"{out.sum()=}")
    #print(f"{ref_out.sum()=}")
--- a/tests/kernels/utils.py
+++ b/tests/kernels/utils.py
@ -1054,12 +1054,21 @@ def compute_max_diff(output, output_ref):
        torch.abs(output_ref))


-def torch_moe(a, w1, w2, score, topk, expert_map):
+def torch_experts(a: torch.Tensor,
+                  w1: torch.Tensor,
+                  w2: torch.Tensor,
+                  topk_weight: torch.Tensor,
+                  topk_ids: torch.Tensor,
+                  global_num_experts: int = -1,
+                  expert_map: Optional[torch.Tensor] = None) -> torch.Tensor:
+    assert (global_num_experts == -1
+            or (global_num_experts == w1.shape[0] and expert_map is None)
+            or (expert_map is not None
+                and global_num_experts == expert_map.shape[0]))
+    topk = topk_ids.shape[1]
    B, D = a.shape
    a = a.view(B, -1, D).repeat(1, topk, 1).reshape(-1, D)
    out = torch.zeros(B * topk, w2.shape[1], dtype=a.dtype, device=a.device)
-    score = torch.softmax(score, dim=-1, dtype=torch.float32)
-    topk_weight, topk_ids = torch.topk(score, topk)
    topk_weight = topk_weight.view(-1)
    topk_ids = topk_ids.view(-1)
    if expert_map is not None:
@ -1073,6 +1082,19 @@ def torch_moe(a, w1, w2, score, topk, expert_map):
            topk_weight.view(B, -1, 1).to(out.dtype)).sum(dim=1)


+def torch_moe(a: torch.Tensor,
+              w1: torch.Tensor,
+              w2: torch.Tensor,
+              score: torch.Tensor,
+              topk: int,
+              global_num_experts: int = -1,
+              expert_map: Optional[torch.Tensor] = None) -> torch.Tensor:
+    score = torch.softmax(score, dim=-1, dtype=torch.float32)
+    topk_weight, topk_ids = torch.topk(score, topk)
+    return torch_experts(a, w1, w2, topk_weight, topk_ids, global_num_experts,
+                         expert_map)
+
+
 def torch_moe_single(a, w, score, topk):
    B, D = a.shape
    a = a.view(B, -1, D).repeat(1, topk, 1).reshape(-1, D)
--- a/tests/model_executor/test_enabled_custom_ops.py
+++ b/tests/model_executor/test_enabled_custom_ops.py
@ -28,42 +28,49 @@ class Relu3(ReLUSquaredActivation):


@pytest.mark.parametrize(
-    "env, torch_level, ops_enabled, default_on",
+    "env, torch_level, use_inductor, ops_enabled, default_on",
    [
        # Default values based on compile level
-        ("", 0, [True] * 4, True),
-        ("", 1, [True] * 4, True),
-        ("", 2, [True] * 4, True),  # All by default
-        ("", 3, [False] * 4, False),
-        ("", 4, [False] * 4, False),  # None by default
+        # - All by default (no Inductor compilation)
+        ("", 0, False, [True] * 4, True),
+        ("", 1, True, [True] * 4, True),
+        ("", 2, False, [True] * 4, True),
+        # - None by default (with Inductor)
+        ("", 3, True, [False] * 4, False),
+        ("", 4, True, [False] * 4, False),
+        # - All by default (without Inductor)
+        ("", 3, False, [True] * 4, True),
+        ("", 4, False, [True] * 4, True),
        # Explicitly enabling/disabling
        #
        # Default: all
        #
        # All but SiluAndMul
-        ("+rms_norm,-silu_and_mul", 0, [1, 0, 1, 1], True),
+        ("+rms_norm,-silu_and_mul", 0, True, [1, 0, 1, 1], True),
        # Only ReLU3
-        ("none,-rms_norm,+relu3", 0, [0, 0, 0, 1], False),
+        ("none,-rms_norm,+relu3", 1, False, [0, 0, 0, 1], False),
        # All but SiluAndMul
-        ("all,-silu_and_mul", 1, [1, 0, 1, 1], True),
+        ("all,-silu_and_mul", 2, True, [1, 0, 1, 1], True),
        # All but ReLU3 (even if ReLU2 is on)
-        ("-relu3,relu2", 1, [1, 1, 1, 0], True),
-        # GeluAndMul and SiluAndMul
-        ("none,-relu3,+gelu_and_mul,+silu_and_mul", 2, [0, 1, 1, 0], False),
+        ("-relu3,relu2", 3, False, [1, 1, 1, 0], True),
+        # RMSNorm and SiluAndMul
+        ("none,-relu3,+rms_norm,+silu_and_mul", 4, False, [1, 1, 0, 0], False),
        # All but RMSNorm
-        ("-rms_norm", 2, [0, 1, 1, 1], True),
+        ("-rms_norm", 3, False, [0, 1, 1, 1], True),
        #
        # Default: none
        #
        # Only ReLU3
-        ("-silu_and_mul,+relu3", 3, [0, 0, 0, 1], False),
+        ("-silu_and_mul,+relu3", 3, True, [0, 0, 0, 1], False),
        # All but RMSNorm
-        ("all,-rms_norm", 4, [0, 1, 1, 1], True),
+        ("all,-rms_norm", 4, True, [0, 1, 1, 1], True),
    ])
-def test_enabled_ops(env: str, torch_level: int, ops_enabled: list[int],
-                     default_on: bool):
-    vllm_config = VllmConfig(compilation_config=CompilationConfig(
-        level=torch_level, custom_ops=env.split(",")))
+def test_enabled_ops(env: str, torch_level: int, use_inductor: bool,
+                     ops_enabled: list[int], default_on: bool):
+    vllm_config = VllmConfig(
+        compilation_config=CompilationConfig(use_inductor=bool(use_inductor),
+                                             level=torch_level,
+                                             custom_ops=env.split(",")))
    with set_current_vllm_config(vllm_config):
        assert CustomOp.default_on() == default_on

--- a/tests/models/language/generation/test_gemma.py
+++ b/tests/models/language/generation/test_gemma.py
@ -7,14 +7,21 @@ MODELS = ["google/gemma-2b", "google/gemma-2-2b", "google/gemma-3-4b-it"]


@pytest.mark.parametrize("model", MODELS)
-def test_dummy_loader(vllm_runner, model: str) -> None:
-    with vllm_runner(
-            model,
-            load_format="dummy",
-    ) as llm:
-        normalizers = llm.collective_rpc(lambda self: self.worker.model_runner.
-                                         model.model.normalizer.cpu().item())
-        assert np.allclose(
-            normalizers,
-            llm.llm_engine.model_config.hf_config.hidden_size**0.5,
-            rtol=1e-3)
+def test_dummy_loader(vllm_runner, monkeypatch, model: str) -> None:
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
+        with vllm_runner(
+                model,
+                load_format="dummy",
+        ) as llm:
+            if model == "google/gemma-3-4b-it":
+                normalizers = llm.model.collective_rpc(
+                    lambda self: self.model_runner.model.language_model.model.
+                    normalizer.cpu().item())
+                config = llm.model.llm_engine.model_config.hf_config.text_config
+            else:
+                normalizers = llm.model.collective_rpc(
+                    lambda self: self.model_runner.model.model.normalizer.cpu(
+                    ).item())
+                config = llm.model.llm_engine.model_config.hf_config
+            assert np.allclose(normalizers, config.hidden_size**0.5, rtol=2e-3)
--- a/tests/models/language/generation/test_mistral.py
+++ b/tests/models/language/generation/test_mistral.py
@ -10,6 +10,7 @@ import pytest
 from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import (
    MistralToolCall, MistralToolParser)
 from vllm.sampling_params import GuidedDecodingParams, SamplingParams
+from vllm.transformers_utils.tokenizer import MistralTokenizer

 from ...utils import check_logprobs_close

@ -318,3 +319,53 @@ def test_mistral_guided_decoding(
                                schema=SAMPLE_JSON_SCHEMA)
        except jsonschema.exceptions.ValidationError:
            pytest.fail("Generated response is not valid with JSON schema")
+
+
+def test_mistral_function_call_nested_json():
+    """Ensure that the function-name regex captures the entire outer-most
+    JSON block, including nested braces."""
+
+    # Create a minimal stub tokenizer that provides the few attributes the
+    # parser accesses (`version` and `get_vocab`).
+    class _StubMistralTokenizer(MistralTokenizer):
+        version = 11  # Satisfy the version check
+
+        def __init__(self):
+            pass
+
+        @staticmethod
+        def get_vocab():
+            # Provide the special TOOL_CALLS token expected by the parser.
+            return {"[TOOL_CALLS]": 0}
+
+    tokenizer = _StubMistralTokenizer()
+    parser = MistralToolParser(tokenizer)
+
+    # Craft a model output featuring nested JSON inside the arguments.
+    args_dict = {
+        "city": "Dallas",
+        "state": "TX",
+        "unit": "fahrenheit",
+        "sub_dict": {
+            "foo": "bar",
+            "inner": {
+                "x": 1,
+                "y": 2
+            }
+        },
+    }
+
+    model_output = (
+        f"{parser.bot_token}get_current_weather{json.dumps(args_dict)}")
+
+    parsed = parser.extract_tool_calls(model_output, None)
+
+    # Assertions: the tool call is detected and the full nested JSON is parsed
+    # without truncation.
+    assert parsed.tools_called
+
+    assert MistralToolCall.is_valid_id(parsed.tool_calls[0].id)
+    assert parsed.tool_calls[0].function.name == "get_current_weather"
+    assert json.loads(parsed.tool_calls[0].function.arguments) == args_dict
+    # No additional content outside the tool call should be returned.
+    assert parsed.content is None
--- a/tests/models/language/pooling/mteb_utils.py
+++ b/tests/models/language/pooling/mteb_utils.py
@ -43,7 +43,7 @@ class VllmMtebEncoder(mteb.Encoder):
        # issues by randomizing the order.
        r = self.rng.permutation(len(sentences))
        sentences = [sentences[i] for i in r]
-        outputs = self.model.encode(sentences, use_tqdm=False)
+        outputs = self.model.embed(sentences, use_tqdm=False)
        embeds = np.array(outputs)
        embeds = embeds[np.argsort(r)]
        return embeds
@ -250,16 +250,19 @@ def mteb_test_rerank_models(hf_runner,
    with vllm_runner(model_info.name,
                     task="score",
                     max_model_len=None,
+                     max_num_seqs=8,
                     **vllm_extra_kwargs) as vllm_model:

+        model_config = vllm_model.model.llm_engine.model_config
+
        if model_info.architecture:
-            assert (model_info.architecture
-                    in vllm_model.model.llm_engine.model_config.architectures)
+            assert (model_info.architecture in model_config.architectures)
+        assert model_config.hf_config.num_labels == 1

        vllm_main_score = run_mteb_rerank(VllmMtebEncoder(vllm_model),
                                          tasks=MTEB_RERANK_TASKS,
                                          languages=MTEB_RERANK_LANGS)
-        vllm_dtype = vllm_model.model.llm_engine.model_config.dtype
+        vllm_dtype = model_config.dtype

    with hf_runner(model_info.name, is_cross_encoder=True,
                   dtype="float32") as hf_model:
--- a/tests/models/multimodal/generation/test_common.py
+++ b/tests/models/multimodal/generation/test_common.py
@ -107,6 +107,8 @@ VLM_TEST_SETTINGS = {
            ),
            limit_mm_per_prompt={"image": 4},
        )],
+        # TODO: Revert to "auto" when CPU backend can use torch > 2.6
+        dtype="bfloat16" if current_platform.is_cpu() else "auto",
        marks=[pytest.mark.core_model, pytest.mark.cpu_model],
    ),
    "paligemma": VLMTestInfo(
--- a/tests/models/multimodal/generation/vlm_utils/builders.py
+++ b/tests/models/multimodal/generation/vlm_utils/builders.py
@ -203,6 +203,9 @@ def build_embedding_inputs_from_test_info(

    images = [asset.pil_image for asset in image_assets]
    embeds = test_info.convert_assets_to_embeddings(image_assets)
+    if test_info.dtype != "auto":
+        dtype = getattr(torch, test_info.dtype)  # type: ignore
+        embeds = [e.to(dtype=dtype) for e in embeds]
    assert len(images) == len(model_prompts)

    inputs = build_single_image_inputs(images, model_prompts, size_wrapper)
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@ -164,6 +164,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
    "GemmaForCausalLM": _HfExamplesInfo("google/gemma-1.1-2b-it"),
    "Gemma2ForCausalLM": _HfExamplesInfo("google/gemma-2-9b"),
    "Gemma3ForCausalLM": _HfExamplesInfo("google/gemma-3-1b-it"),
+    "Gemma3nForConditionalGeneration": _HfExamplesInfo("google/gemma-3n-E2B-it",    # noqa: E501
+                                          min_transformers_version="4.53"),
    "GlmForCausalLM": _HfExamplesInfo("THUDM/glm-4-9b-chat-hf"),
    "Glm4ForCausalLM": _HfExamplesInfo("THUDM/GLM-4-9B-0414"),
    "GPT2LMHeadModel": _HfExamplesInfo("openai-community/gpt2",
--- a/tests/models/test_initialization.py
+++ b/tests/models/test_initialization.py
@ -31,12 +31,20 @@ def test_can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch):

        text_config = hf_config.get_text_config()

+        # Ensure at least 2 expert per group
+        # Since `grouped_topk` assums top-2
+        num_experts = getattr(text_config, 'n_group', 1) * 2
+
        text_config.update({
            "num_layers": 1,
            "num_hidden_layers": 1,
-            "num_experts": 2,
+            "num_experts": num_experts,
            "num_experts_per_tok": 2,
-            "num_local_experts": 2,
+            "num_local_experts": num_experts,
+            # Otherwise there will not be any expert layers
+            "first_k_dense_replace": 0,
+            # To avoid OOM on DeepSeek-V3
+            "n_routed_experts": num_experts,
        })

        if hasattr(hf_config, "vision_config"):
--- a/tests/models/test_oot_registration.py
+++ b/tests/models/test_oot_registration.py
@ -53,7 +53,9 @@ def test_oot_registration_embedding(
    with monkeypatch.context() as m:
        m.setenv("VLLM_PLUGINS", "register_dummy_model")
        prompts = ["Hello, my name is", "The text does not matter"]
-        llm = LLM(model=dummy_gemma2_embedding_path, load_format="dummy")
+        llm = LLM(model=dummy_gemma2_embedding_path,
+                  load_format="dummy",
+                  max_model_len=2048)
        outputs = llm.embed(prompts)

        for output in outputs:
--- a/tests/neuron/1_core/test_prefix_prefill.py
+++ b/tests/neuron/1_core/test_prefix_prefill.py
@ -7,6 +7,8 @@ import pytest
 import torch
 import torch.nn.functional as F

+from vllm.utils import cdiv
+

 class BlockDiagonalCausalFromBottomRightMask:

@ -398,11 +400,8 @@ def test_contexted_kv_attention(
        assert (large_tile_size >= B_P_SIZE
                ), f"Expect {large_tile_size=} to be larger than {B_P_SIZE=}"

-        def ceil_div(a, b):
-            return (a + b - 1) // b
-
        def pad_to_multiple(a, b):
-            return ceil_div(a, b) * b
+            return cdiv(a, b) * b

        def pad_to_next_power_of_2(a):
            assert a > 0
@ -411,7 +410,7 @@ def test_contexted_kv_attention(
        # calculate input shapes
        max_num_queries = pad_to_next_power_of_2(sum(query_lens))
        context_lens = torch.tensor(seq_lens) - torch.tensor(query_lens)
-        num_active_blocks = ceil_div(context_lens, block_size).sum().item()
+        num_active_blocks = cdiv(context_lens, block_size).sum().item()
        num_active_blocks = pad_to_multiple(num_active_blocks,
                                            large_tile_size // block_size)
        context_kv_len = num_active_blocks * block_size
--- a/tests/standalone_tests/pytorch_nightly_dependency.sh
+++ b/tests/standalone_tests/pytorch_nightly_dependency.sh
@ -0,0 +1,42 @@
+#!/bin/sh
+# This script tests if the nightly torch packages are not overridden by the dependencies
+
+set -e
+set -x
+
+cd /vllm-workspace/
+
+rm -rf .venv
+
+uv venv .venv
+
+source .venv/bin/activate
+
+# check the environment
+uv pip freeze
+
+echo ">>> Installing nightly torch packages"
+uv pip install --quiet torch torchvision torchaudio --pre --extra-index-url https://download.pytorch.org/whl/nightly/cu128
+
+echo ">>> Capturing torch-related versions before requirements install"
+uv pip freeze | grep -E '^torch|^torchvision|^torchaudio' | sort > before.txt
+echo "Before:"
+cat before.txt
+
+echo ">>> Installing requirements/nightly_torch_test.txt"
+uv pip install --quiet -r requirements/nightly_torch_test.txt
+
+echo ">>> Capturing torch-related versions after requirements install"
+uv pip freeze | grep -E '^torch|^torchvision|^torchaudio' | sort > after.txt
+echo "After:"
+cat after.txt
+
+echo ">>> Comparing versions"
+if diff before.txt after.txt; then
+  echo "torch version not overridden."
+else
+  echo "torch version overridden by nightly_torch_test.txt, \
+  if the dependency is not triggered by the pytroch nightly test,\
+  please add the dependency to the list 'white_list'  in tools/generate_nightly_torch_test.py"
+  exit 1
+fi
--- a/tests/v1/engine/test_engine_core_client.py
+++ b/tests/v1/engine/test_engine_core_client.py
@ -8,8 +8,10 @@ import time
 import uuid
 from threading import Thread
 from typing import Optional
+from unittest.mock import MagicMock

 import pytest
+import torch
 from transformers import AutoTokenizer

 from tests.utils import multi_gpu_test
@ -517,3 +519,72 @@ def test_startup_failure(monkeypatch: pytest.MonkeyPatch):
        )

    assert "Engine core initialization failed" in str(e_info.value)
+
+
+@create_new_process_for_each_test()
+def test_engine_core_proc_instantiation_cuda_empty(
+        monkeypatch: pytest.MonkeyPatch):
+    """
+    Test that EngineCoreProc can be instantiated when CUDA_VISIBLE_DEVICES
+    is empty. This ensures the engine frontend does not need access to GPUs.
+    """
+
+    from vllm.v1.engine.core import EngineCoreProc
+    from vllm.v1.executor.abstract import Executor
+
+    # Create a simple mock executor instead of a complex custom class
+    mock_executor_class = MagicMock(spec=Executor)
+
+    def create_mock_executor(vllm_config):
+        mock_executor = MagicMock()
+
+        # Only implement the methods that are actually called during init
+        from vllm.v1.kv_cache_interface import FullAttentionSpec
+        mock_spec = FullAttentionSpec(block_size=16,
+                                      num_kv_heads=1,
+                                      head_size=64,
+                                      dtype=torch.float16,
+                                      use_mla=False)
+
+        mock_executor.get_kv_cache_specs.return_value = [{
+            "default": mock_spec
+        }]
+        mock_executor.determine_available_memory.return_value = [
+            1024 * 1024 * 1024
+        ]
+        mock_executor.initialize_from_config.return_value = None
+        mock_executor.max_concurrent_batches = 1
+
+        return mock_executor
+
+    mock_executor_class.side_effect = create_mock_executor
+
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_USE_V1", "1")
+        m.setenv("CUDA_VISIBLE_DEVICES", "")  # No CUDA devices
+
+        from vllm.v1.utils import EngineZmqAddresses
+
+        def mock_startup_handshake(self, handshake_socket, on_head_node,
+                                   parallel_config):
+            return EngineZmqAddresses(inputs=["tcp://127.0.0.1:5555"],
+                                      outputs=["tcp://127.0.0.1:5556"],
+                                      coordinator_input=None,
+                                      coordinator_output=None)
+
+        # Background processes are not important here
+        m.setattr(EngineCoreProc, "startup_handshake", mock_startup_handshake)
+
+        vllm_config = EngineArgs(
+            model="deepseek-ai/DeepSeek-V2-Lite",
+            trust_remote_code=True).create_engine_config()
+        engine_core_proc = EngineCoreProc(
+            vllm_config=vllm_config,
+            on_head_node=True,
+            handshake_address="tcp://127.0.0.1:12345",
+            executor_class=mock_executor_class,
+            log_stats=False,
+            engine_index=0,
+        )
+
+        engine_core_proc.shutdown()
--- a/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py
+++ b/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py
@ -196,8 +196,7 @@ async def stream_service_response(client_info: dict, endpoint: str,
            yield chunk


-@app.post("/v1/completions")
-async def handle_completions(request: Request):
+async def _handle_completions(api: str, request: Request):
    try:
        req_data = await request.json()
        request_id = str(uuid.uuid4())
@ -206,9 +205,8 @@ async def handle_completions(request: Request):
        prefill_client_info = get_next_client(request.app, 'prefill')

        # Send request to prefill service
-        response = await send_request_to_service(prefill_client_info,
-                                                 "/completions", req_data,
-                                                 request_id)
+        response = await send_request_to_service(prefill_client_info, api,
+                                                 req_data, request_id)

        # Extract the needed fields
        response_json = response.json()
@ -224,7 +222,7 @@ async def handle_completions(request: Request):
        # Stream response from decode service
        async def generate_stream():
            async for chunk in stream_service_response(decode_client_info,
-                                                       "/completions",
+                                                       api,
                                                       req_data,
                                                       request_id=request_id):
                yield chunk
@ -237,12 +235,22 @@ async def handle_completions(request: Request):
        import traceback
        exc_info = sys.exc_info()
        print("Error occurred in disagg prefill proxy server"
-              " - completions endpoint")
+              f" - {api} endpoint")
        print(e)
        print("".join(traceback.format_exception(*exc_info)))
        raise


+@app.post("/v1/completions")
+async def handle_completions(request: Request):
+    return await _handle_completions("/completions", request)
+
+
+@app.post("/v1/chat/completions")
+async def handle_chat_completions(request: Request):
+    return await _handle_completions("/chat/completions", request)
+
+
@app.get("/healthcheck")
 async def healthcheck():
    """Simple endpoint to check if the server is running."""
--- a/tests/v1/kv_connector/unit/test_nixl_connector.py
+++ b/tests/v1/kv_connector/unit/test_nixl_connector.py
@ -9,11 +9,6 @@ from unittest.mock import patch

 import pytest

-try:
-    from nixl._api import nixl_agent as NixlWrapper
-except ImportError:
-    NixlWrapper = None
-
 from vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector import (
    KVConnectorRole, NixlAgentMetadata, NixlConnector, NixlConnectorMetadata,
    NixlConnectorWorker)
@ -92,7 +87,8 @@ def test_prompt_less_than_block_size():
 class FakeNixlWrapper:
    """Mock implementation of NixlWrapper for testing.
    
-    We don't inherit from NixlWrapper because NixlWrapper could be None.
+    We don't inherit from nixl._api.nixl_agent because nixl may not be
+    installed.
    """

    AGENT_METADATA = b"fake_agent_metadata"
@ -167,7 +163,8 @@ class FakeNixlConnectorWorker(NixlConnectorWorker):
        super().__init__(*args, **kwargs)
        self._hand_shake_latency = hand_shake_latency

-    def _nixl_handshake(self, host: str, port: int):
+    def _nixl_handshake(self, host: str, port: int,
+                        remote_tp_size: int) -> dict[int, str]:
        # Mimic slow _nixl_handshake, as well as bypass zmq communication.
        time.sleep(self._hand_shake_latency)
        # These should've been done in register_kv_caches(), called by
@ -177,70 +174,200 @@ class FakeNixlConnectorWorker(NixlConnectorWorker):
        self.num_blocks = 1
        self.dst_num_blocks[self.engine_id] = self.num_blocks

-        self.add_remote_agent(
+        remote_agent_name = self.add_remote_agent(
            NixlAgentMetadata(
                engine_id=self.REMOTE_ENGINE_ID,
                agent_metadata=FakeNixlWrapper.AGENT_METADATA,
                kv_caches_base_addr=[0],
                num_blocks=1,
-                tp_size=1,
                block_len=self.block_len,
                attn_backend_name=self.backend_name,
-            ))
+            ),
+            remote_tp_size=remote_tp_size)
+        return {0: remote_agent_name}


-@pytest.mark.skipif(NixlWrapper is None, reason="nixl not installed")
-@patch(
-    "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.NixlWrapper",
-    FakeNixlWrapper)
-def test_multi_xfer_one_engine(
-    # dist_init is a fixture that initializes the distributed environment.
-    dist_init):
-    """Test case where multiple xfers are initiated to the same engine.
-    
-    This test triggers the connector to load remote KV for the same
-    `request_id`. The transfer is not done immediately due to
-    `set_cycles_before_xfer_done`, so there is a state where there are multiple
-    transfer states for the same `request_id`, and `get_finished` should handle
-    it correctly (wait for all transfers to be done).
-    """
-    vllm_config = create_vllm_config()
+class TestNixlHandshake:

-    request_id = "req_id"
+    @patch(
+        "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.NixlWrapper",
+        FakeNixlWrapper)
+    def test_multi_xfer_one_engine(
+        self,
+        # dist_init is a fixture that initializes the distributed environment.
+        dist_init):
+        """Test case where multiple xfers are initiated to the same engine.
+        
+        This test triggers the connector to load remote KV for the same
+        `request_id`. The transfer is not done immediately due to
+        `set_cycles_before_xfer_done`, so there is a state where there are
+        multiple transfer states for the same `request_id`, and `get_finished`
+        should handle it correctly (wait for all transfers to be done).
+        """
+        vllm_config = create_vllm_config()

-    # Test worker role in decode server.
-    connector = NixlConnector(vllm_config, KVConnectorRole.WORKER)
-    connector.connector_worker = FakeNixlConnectorWorker(vllm_config,
-                                                         connector.engine_id,
-                                                         hand_shake_latency=0)
-    assert isinstance(connector.connector_worker.nixl_wrapper, FakeNixlWrapper)
-    connector.connector_worker.nixl_wrapper.set_cycles_before_xfer_done(3)
-    for i in range(4):
+        request_id = "req_id"
+
+        # Test worker role in decode server.
+        connector = NixlConnector(vllm_config, KVConnectorRole.WORKER)
+        connector.connector_worker = FakeNixlConnectorWorker(
+            vllm_config, connector.engine_id, hand_shake_latency=0)
+        assert isinstance(connector.connector_worker.nixl_wrapper,
+                          FakeNixlWrapper)
+        connector.connector_worker.nixl_wrapper.set_cycles_before_xfer_done(3)
+        num_xfers = 4
+        while True:
+            # For the same request_id, initiate multiple xfers across different
+            # round of `execute_model` calls.
+            metadata = NixlConnectorMetadata()
+            if num_xfers > 0:
+                num_xfers -= 1
+                metadata.add_new_req(
+                    request_id=request_id,
+                    local_block_ids=[
+                        num_xfers + 1, num_xfers + 2, num_xfers + 3
+                    ],
+                    kv_transfer_params={
+                        "remote_block_ids":
+                        [num_xfers + 4, num_xfers + 5, num_xfers + 6],
+                        "remote_engine_id":
+                        FakeNixlConnectorWorker.REMOTE_ENGINE_ID,
+                        "remote_host":
+                        "localhost",
+                        "remote_port":
+                        1234,
+                        "remote_tp_size":
+                        1,
+                    })
+            connector.bind_connector_metadata(metadata)
+
+            # Mimic maybe_setup_kv_connector in gpu_model_runner.
+            dummy_ctx = ForwardContext(
+                no_compile_layers={},
+                attn_metadata={},
+                virtual_engine=0,
+            )
+            _before_load = time.perf_counter()
+            connector.start_load_kv(dummy_ctx)
+            _after_load = time.perf_counter()
+            assert _after_load - _before_load < 0.1, "start_load_kv took " \
+                f"{_after_load - _before_load} seconds"
+
+            # Mimic get_finished_kv_transfers in gpu_model_runner.
+            _, done_recving = connector.get_finished(finished_req_ids=set())
+            if len(done_recving) > 0:
+                assert request_id in done_recving
+                break
+
+            connector.clear_connector_metadata()
+
+    @patch(
+        "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.NixlWrapper",
+        FakeNixlWrapper)
+    @pytest.mark.parametrize("decode_tp_size, prefill_tp_size", [
+        (1, 1),
+        (2, 1),
+        (4, 2),
+        (4, 4),
+    ])
+    def test_async_load_kv(
+            self,
+            # Fixture that initializes the distributed environment.
+            dist_init,
+            # Simulate consumer-producer TP sizes.
+            decode_tp_size,
+            prefill_tp_size):
+        """Test that NixlConnector's start_load_kv should be non-blocking."""
+
+        vllm_config = create_vllm_config()
+        vllm_config.parallel_config.tensor_parallel_size = decode_tp_size
+
+        # Test worker role in decode server.
+        connector = NixlConnector(vllm_config, KVConnectorRole.WORKER)
+        connector.connector_worker = FakeNixlConnectorWorker(
+            vllm_config, connector.engine_id)
        metadata = NixlConnectorMetadata()
-        metadata.add_new_req(request_id=request_id,
-                             local_block_ids=[i + 1, i + 2, i + 3],
+        metadata.add_new_req(request_id="id",
+                             local_block_ids=[1, 2, 3],
                             kv_transfer_params={
-                                 "remote_block_ids": [i + 4, i + 5, i + 6],
+                                 "remote_block_ids": [4, 5, 6],
                                 "remote_engine_id":
                                 FakeNixlConnectorWorker.REMOTE_ENGINE_ID,
                                 "remote_host": "localhost",
                                 "remote_port": 1234,
+                                 "remote_tp_size": prefill_tp_size,
                             })
        connector.bind_connector_metadata(metadata)

-        dummy_ctx = ForwardContext(
-            no_compile_layers={},
-            attn_metadata={},
-            virtual_engine=0,
-        )
-        _before_load = time.perf_counter()
-        connector.start_load_kv(dummy_ctx)
-        _after_load = time.perf_counter()
-        assert _after_load - _before_load < 0.1, "start_load_kv took " \
-            f"{_after_load - _before_load} seconds"
+        timeout = 2.5
+        start = time.perf_counter()
+        while time.perf_counter() - start < timeout:
+            dummy_ctx = ForwardContext(
+                no_compile_layers={},
+                attn_metadata={},
+                virtual_engine=0,
+            )
+            _before_load = time.perf_counter()
+            connector.start_load_kv(dummy_ctx)
+            _after_load = time.perf_counter()
+            assert _after_load - _before_load < 0.1, "start_load_kv took " \
+                f"{_after_load - _before_load} seconds"
+            time.sleep(0.5)  # backoff for the async handshake to complete.
+            connector.bind_connector_metadata(NixlConnectorMetadata())
+            _, done_recving = connector.get_finished(finished_req_ids=set())
+            if len(done_recving) > 0:
+                return
+        raise TimeoutError("Took too long to complete async handshake.")

-    while True:
-        _, done_recving = connector.get_finished(finished_req_ids=set())
-        if len(done_recving) > 0:
-            assert request_id in done_recving
-            break
+    @patch(
+        "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.NixlWrapper",
+        FakeNixlWrapper)
+    def test_concurrent_load_kv(
+        self,
+        # dist_init is a fixture that initializes the distributed environment.
+        dist_init):
+        """Test that multiple start_load_kv calls should occur concurrently."""
+
+        vllm_config = create_vllm_config()
+
+        # Test worker role in decode server.
+        connector = NixlConnector(vllm_config, KVConnectorRole.WORKER)
+        connector.connector_worker = FakeNixlConnectorWorker(
+            vllm_config, connector.engine_id)
+        metadata = NixlConnectorMetadata()
+        total_reqs = 5
+        for i in range(total_reqs):
+            metadata.add_new_req(request_id=f"id_{i}",
+                                 local_block_ids=[1, 2, 3],
+                                 kv_transfer_params={
+                                     "remote_block_ids": [4, 5, 6],
+                                     "remote_engine_id":
+                                     FakeNixlConnectorWorker.REMOTE_ENGINE_ID,
+                                     "remote_host": "localhost",
+                                     "remote_port": 1234,
+                                     "remote_tp_size": 1,
+                                 })
+        connector.bind_connector_metadata(metadata)
+
+        timeout = 2.5 * total_reqs
+        cnt_finished_reqs = 0
+        start = time.perf_counter()
+        while time.perf_counter() - start < timeout:
+            dummy_ctx = ForwardContext(
+                no_compile_layers={},
+                attn_metadata={},
+                virtual_engine=0,
+            )
+            _before_load = time.perf_counter()
+            connector.start_load_kv(dummy_ctx)
+            _after_load = time.perf_counter()
+            assert _after_load - _before_load < 0.1, "start_load_kv took " \
+                f"{_after_load - _before_load} seconds"
+            time.sleep(0.5)  # backoff for the async handshake to complete.
+            connector.bind_connector_metadata(NixlConnectorMetadata())
+            _, done_recving = connector.get_finished(finished_req_ids=set())
+            if len(done_recving) > 0:
+                cnt_finished_reqs += len(done_recving)
+                if cnt_finished_reqs == total_reqs:
+                    return
+        raise TimeoutError("Took too long to complete async handshake.")
--- a/tests/v1/test_oracle.py
+++ b/tests/v1/test_oracle.py
@ -74,12 +74,6 @@ def test_unsupported_configs(monkeypatch):
                disable_async_output_proc=True,
            ).create_engine_config()

-        with pytest.raises(NotImplementedError):
-            AsyncEngineArgs(
-                model=MODEL,
-                scheduling_policy="priority",
-            ).create_engine_config()
-
        with pytest.raises(NotImplementedError):
            AsyncEngineArgs(
                model=MODEL,
--- a/tests/v1/tpu/test_kv_cache_update_kernel.py
+++ b/tests/v1/tpu/test_kv_cache_update_kernel.py
@ -0,0 +1,71 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import numpy as np
+import pytest
+import torch
+import torch_xla
+
+import vllm.v1.attention.backends.pallas  # noqa: F401
+from vllm.platforms import current_platform
+
+
+@pytest.mark.skipif(not current_platform.is_tpu(),
+                    reason="This is a test for TPU only")
+@pytest.mark.parametrize("page_size", [32, 33])
+@pytest.mark.parametrize("combined_kv_head_num", [2, 16])
+@pytest.mark.parametrize("head_dim", [128, 256])
+@pytest.mark.parametrize("num_slices_per_block", [4, 8])
+def test_kv_cache_update_kernel(page_size: int, combined_kv_head_num: int,
+                                head_dim: int, num_slices_per_block: int):
+    page_num = 1000
+    padded_num_tokens = 128
+    kv_cache_cpu = torch.zeros(
+        (page_num * page_size, combined_kv_head_num, head_dim),
+        dtype=torch.bfloat16,
+        device="cpu")
+    kv_cache_xla = kv_cache_cpu.to(torch_xla.device())
+    new_kv_cpu = torch.randn(
+        (padded_num_tokens, combined_kv_head_num, head_dim),
+        dtype=torch.bfloat16,
+        device="cpu")
+    new_kv_xla = new_kv_cpu.to(torch_xla.device())
+    slice_lens = np.array([7, page_size, page_size, 1, 1, 1, 9],
+                          dtype=np.int32)
+    kv_cache_start_indices = np.array([
+        page_size * 2 - 7, page_size * 2, page_size * 3, page_size * 4 + 6,
+        page_size * 5 + 7, page_size * 6 + 8, page_size * 15 + 3
+    ],
+                                      dtype=np.int32)
+    new_kv_cache_indices = np.concatenate(
+        [np.array([0], dtype=np.int32),
+         np.cumsum(slice_lens[:-1])])
+    slot_mapping = np.stack(
+        [kv_cache_start_indices, new_kv_cache_indices, slice_lens], axis=1)
+    padded_size = (slot_mapping.shape[0] + num_slices_per_block -
+                   1) // num_slices_per_block * num_slices_per_block
+    slot_mapping = np.pad(slot_mapping,
+                          [[0, padded_size - slot_mapping.shape[0]], [0, 0]],
+                          constant_values=0)
+    slot_mapping = np.transpose(slot_mapping)
+    slot_mapping_cpu = torch.tensor(slot_mapping,
+                                    device="cpu",
+                                    dtype=torch.int32)
+    slot_mapping_xla = slot_mapping_cpu.to(torch_xla.device())
+    torch_xla.sync()
+
+    torch.ops.xla.dynamo_set_buffer_donor_(kv_cache_xla, True)
+    new_kv_cache_xla = torch.ops.xla.kv_cache_update_op(
+        new_kv_xla, slot_mapping_xla, kv_cache_xla, page_size,
+        num_slices_per_block)
+    kv_cache_xla.copy_(new_kv_cache_xla)
+    torch_xla.sync()
+
+    for ni, ci, sl in zip(new_kv_cache_indices, kv_cache_start_indices,
+                          slice_lens):
+        kv_cache_cpu[ci:ci + sl, :, :] = new_kv_cpu[ni:ni + sl, :, :]
+
+    assert torch.allclose(kv_cache_xla.cpu(),
+                          kv_cache_cpu,
+                          atol=1e-4,
+                          rtol=1e-4)
--- a/tests/v1/tpu/test_pallas.py
+++ b/tests/v1/tpu/test_pallas.py
@ -47,7 +47,7 @@ def test_ragged_paged_attention():
    key = torch.zeros(num_tokens, num_kv_heads * head_size)
    value = torch.zeros(num_tokens, num_kv_heads * head_size)
    kv_cache = torch.zeros(num_blocks, block_size, num_kv_heads * 2, head_size)
-    slot_mapping = torch.zeros(num_tokens, dtype=torch.int64)
+    slot_mapping = torch.zeros((3, num_tokens), dtype=torch.int64)
    max_num_reqs = 8
    max_num_blocks_per_req = 8
    block_tables = torch.zeros((max_num_reqs, max_num_blocks_per_req),
@ -65,6 +65,7 @@ def test_ragged_paged_attention():
        context_lens=context_lens,
        query_start_loc=query_start_loc,
        num_seqs=num_seqs,
+        num_slices_per_kv_cache_update_block=8,
    )

    with patch("torch.ops.xla.ragged_paged_attention"
--- a/tests/v1/tpu/worker/test_tpu_model_runner.py
+++ b/tests/v1/tpu/worker/test_tpu_model_runner.py
@ -587,3 +587,17 @@ def test_init_kv_cache_with_kv_sharing_valid():
    assert len(kv_cache_config.kv_cache_groups[0].layer_names) == 2
    assert kv_cache_config.kv_cache_groups[0].layer_names[0] == layer_0
    assert kv_cache_config.kv_cache_groups[0].layer_names[1] == layer_1
+
+
+def test_most_model_len(monkeypatch: pytest.MonkeyPatch):
+    monkeypatch.setenv("VLLM_TPU_MOST_MODEL_LEN", "2048")
+    vllm_config = get_vllm_config()
+    vllm_config.model_config.max_model_len = 32000
+    vllm_config.scheduler_config.max_num_seqs = 1200
+    model_runner = get_model_runner(vllm_config)
+
+    # verify model runner will adjust num_reqs to avoid SMEM OOM.
+    assert model_runner.num_reqs_most_model_len == 1200
+    # num_page_per_req = 32k // 128
+    # num_reqs = 1024 ** 2 // 2 // num_page_per_req // 4 = 524
+    assert model_runner.num_reqs_max_model_len == 524
--- a/tools/generate_cmake_presets.py
+++ b/tools/generate_cmake_presets.py
@ -0,0 +1,169 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import json
+import multiprocessing
+import os
+import sys
+from shutil import which
+
+try:
+    # Try to get CUDA_HOME from PyTorch installation, which is the
+    # most reliable source of truth for vLLM's build.
+    from torch.utils.cpp_extension import CUDA_HOME
+except ImportError:
+    print("Warning: PyTorch not found. "
+          "Falling back to CUDA_HOME environment variable.")
+    CUDA_HOME = os.environ.get("CUDA_HOME")
+
+
+def get_python_executable():
+    """Get the current Python executable, which is used to run this script."""
+    return sys.executable
+
+
+def get_cpu_cores():
+    """Get the number of CPU cores."""
+    return multiprocessing.cpu_count()
+
+
+def generate_presets(output_path="CMakeUserPresets.json"):
+    """Generates the CMakeUserPresets.json file."""
+
+    print("Attempting to detect your system configuration...")
+
+    # Detect NVCC
+    nvcc_path = None
+    if CUDA_HOME:
+        prospective_path = os.path.join(CUDA_HOME, "bin", "nvcc")
+        if os.path.exists(prospective_path):
+            nvcc_path = prospective_path
+            print("Found nvcc via torch.utils.cpp_extension.CUDA_HOME: "
+                  f"{nvcc_path}")
+
+    if not nvcc_path:
+        nvcc_path = which("nvcc")
+        if nvcc_path:
+            print(f"Found nvcc in PATH: {nvcc_path}")
+
+    if not nvcc_path:
+        nvcc_path_input = input(
+            "Could not automatically find 'nvcc'. Please provide the full "
+            "path to nvcc (e.g., /usr/local/cuda/bin/nvcc): ")
+        nvcc_path = nvcc_path_input.strip()
+    print(f"Using NVCC path: {nvcc_path}")
+
+    # Detect Python executable
+    python_executable = get_python_executable()
+    if python_executable:
+        print(f"Found Python via sys.executable: {python_executable}")
+    else:
+        python_executable_prompt = (
+            "Could not automatically find Python executable. Please provide "
+            "the full path to your Python executable for vLLM development "
+            "(typically from your virtual environment, e.g., "
+            "/home/user/venvs/vllm/bin/python): ")
+        python_executable = input(python_executable_prompt).strip()
+        if not python_executable:
+            raise ValueError(
+                "Could not determine Python executable. Please provide it "
+                "manually.")
+
+    print(f"Using Python executable: {python_executable}")
+
+    # Get CPU cores
+    cpu_cores = get_cpu_cores()
+    nvcc_threads = min(4, cpu_cores)
+    cmake_jobs = max(1, cpu_cores // nvcc_threads)
+    print(f"Detected {cpu_cores} CPU cores. "
+          f"Setting NVCC_THREADS={nvcc_threads} and CMake jobs={cmake_jobs}.")
+
+    # Get vLLM project root (assuming this script is in vllm/tools/)
+    project_root = os.path.abspath(
+        os.path.join(os.path.dirname(__file__), ".."))
+    print(f"VLLM project root detected as: {project_root}")
+
+    # Ensure python_executable path is absolute or resolvable
+    if not os.path.isabs(python_executable) and which(python_executable):
+        python_executable = os.path.abspath(which(python_executable))
+    elif not os.path.isabs(python_executable):
+        print(f"Warning: Python executable '{python_executable}' is not an "
+              "absolute path and not found in PATH. CMake might not find it.")
+
+    cache_variables = {
+        "CMAKE_CUDA_COMPILER": nvcc_path,
+        "CMAKE_BUILD_TYPE": "Release",
+        "VLLM_PYTHON_EXECUTABLE": python_executable,
+        "CMAKE_INSTALL_PREFIX": "${sourceDir}",
+        "CMAKE_CUDA_FLAGS": "",
+        "NVCC_THREADS": str(nvcc_threads),
+    }
+
+    # Detect compiler cache
+    if which("sccache"):
+        print("Using sccache for compiler caching.")
+        for launcher in ("C", "CXX", "CUDA", "HIP"):
+            cache_variables[f"CMAKE_{launcher}_COMPILER_LAUNCHER"] = "sccache"
+    elif which("ccache"):
+        print("Using ccache for compiler caching.")
+        for launcher in ("C", "CXX", "CUDA", "HIP"):
+            cache_variables[f"CMAKE_{launcher}_COMPILER_LAUNCHER"] = "ccache"
+    else:
+        print("No compiler cache ('ccache' or 'sccache') found.")
+
+    configure_preset = {
+        "name": "release",
+        "binaryDir": "${sourceDir}/cmake-build-release",
+        "cacheVariables": cache_variables,
+    }
+    if which("ninja"):
+        print("Using Ninja generator.")
+        configure_preset["generator"] = "Ninja"
+        cache_variables["CMAKE_JOB_POOLS"] = f"compile={cmake_jobs}"
+    else:
+        print("Ninja not found, using default generator. "
+              "Build may be slower.")
+
+    presets = {
+        "version":
+        6,
+        # Keep in sync with CMakeLists.txt and requirements/build.txt
+        "cmakeMinimumRequired": {
+            "major": 3,
+            "minor": 26,
+            "patch": 1
+        },
+        "configurePresets": [configure_preset],
+        "buildPresets": [{
+            "name": "release",
+            "configurePreset": "release",
+            "jobs": cmake_jobs,
+        }],
+    }
+
+    output_file_path = os.path.join(project_root, output_path)
+
+    if os.path.exists(output_file_path):
+        overwrite = input(
+            f"'{output_file_path}' already exists. Overwrite? (y/N): ").strip(
+            ).lower()
+        if overwrite != 'y':
+            print("Generation cancelled.")
+            return
+
+    try:
+        with open(output_file_path, "w") as f:
+            json.dump(presets, f, indent=4)
+        print(f"Successfully generated '{output_file_path}'")
+        print("\nTo use this preset:")
+        print(
+            f"1. Ensure you are in the vLLM root directory: cd {project_root}")
+        print("2. Initialize CMake: cmake --preset release")
+        print("3. Build+install: cmake --build --preset release "
+              "--target install")
+
+    except OSError as e:
+        print(f"Error writing file: {e}")
+
+
+if __name__ == "__main__":
+    generate_presets()
--- a/tools/generate_nightly_torch_test.py
+++ b/tools/generate_nightly_torch_test.py
@ -0,0 +1,34 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Generates specialized requirements files for nightly PyTorch testing.
+
+This script reads the main test requirements input file (`requirements/test.in`)
+and splits its content into two files:
+1.  `requirements/nightly_torch_test.txt`: Contains dependencies
+except PyTorch-related.
+2.  `torch_nightly_test.txt`: Contains only PyTorch-related packages.
+"""
+
+input_file = "requirements/test.in"
+output_file = "requirements/nightly_torch_test.txt"
+
+# white list of packages that are not compatible with PyTorch nightly directly
+# with pip install. Please add your package to this list if it is not compatible
+# or make the dependency test fails.
+white_list = ["torch", "torchaudio", "torchvision", "mamba_ssm"]
+
+with open(input_file) as f:
+    lines = f.readlines()
+
+skip_next = False
+
+for line in lines:
+    if skip_next:
+        if line.startswith((" ", "\t")) or line.strip() == "":
+            continue
+        skip_next = False
+
+    if any(k in line.lower() for k in white_list):
+        skip_next = True
+        continue
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@ -5,7 +5,6 @@ import contextlib
 from typing import TYPE_CHECKING, Optional, Union

 import torch
-import torch.library

 import vllm.envs as envs
 from vllm.logger import init_logger
@ -1276,7 +1275,7 @@ def scaled_fp8_quant(
            torch.ops._C.dynamic_scaled_fp8_quant(output, input, scale)
    else:
        # num_token_padding not implemented for this case
-        assert (scale.numel() == 1 or num_token_padding is None)
+        assert (scale.numel() == 1 and num_token_padding is None)
        torch.ops._C.static_scaled_fp8_quant(output, input, scale)

    return output, scale
@ -1749,6 +1748,38 @@ def free_shared_buffer(ptr: int) -> None:
    torch.ops._C_custom_ar.free_shared_buffer(ptr)


+# quick all reduce
+def init_custom_qr(rank: int,
+                   world_size: int,
+                   qr_max_size: Optional[int] = None) -> int:
+    return torch.ops._C_custom_ar.init_custom_qr(rank, world_size, qr_max_size)
+
+
+def qr_destroy(fa: int) -> None:
+    torch.ops._C_custom_ar.qr_destroy(fa)
+
+
+def qr_all_reduce(fa: int,
+                  inp: torch.Tensor,
+                  out: torch.Tensor,
+                  quant_level: int,
+                  cast_bf2half: bool = False) -> None:
+    torch.ops._C_custom_ar.qr_all_reduce(fa, inp, out, quant_level,
+                                         cast_bf2half)
+
+
+def qr_get_handle(fa: int) -> torch.Tensor:
+    return torch.ops._C_custom_ar.qr_get_handle(fa)
+
+
+def qr_open_handles(fa: int, handles: list[torch.Tensor]) -> None:
+    return torch.ops._C_custom_ar.qr_open_handles(fa, handles)
+
+
+def qr_max_size() -> int:
+    return torch.ops._C_custom_ar.qr_max_size()
+
+
 def get_flash_mla_metadata(
    cache_seqlens: torch.Tensor,
    num_heads_per_head_k: int,
--- a/vllm/_ipex_ops.py
+++ b/vllm/_ipex_ops.py
@ -228,6 +228,111 @@ class ipex_ops:
        ipex.llm.modules.PagedAttention.reshape_and_cache(
            key, value, key_cache, value_cache, slot_mapping)

+    @staticmethod
+    def reshape_and_cache_flash(
+        key: torch.Tensor,
+        value: torch.Tensor,
+        key_cache: torch.Tensor,
+        value_cache: torch.Tensor,
+        slot_mapping: torch.Tensor,
+        kv_cache_dtype: str,
+        k_scale: Optional[torch.Tensor] = None,
+        v_scale: Optional[torch.Tensor] = None,
+        k_scale_float: float = 1.0,
+        v_scale_float: float = 1.0,
+    ) -> None:
+        assert kv_cache_dtype == "auto"
+        # TODO: support FP8 kv cache.
+        ipex.llm.modules.PagedAttention.reshape_and_cache_flash(
+            key, value, key_cache, value_cache, slot_mapping)
+
+    @staticmethod
+    def flash_attn_varlen_func(
+        out: torch.Tensor,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        cu_seqlens_q: torch.Tensor,
+        seqused_k: torch.Tensor,  # we don't support this in ipex kernel
+        max_seqlen_q: int,
+        max_seqlen_k: int,
+        softmax_scale: float,
+        causal: bool,
+        block_table: torch.Tensor,
+        alibi_slopes: Optional[torch.Tensor],
+        window_size: Optional[list[int]] = None,
+        softcap: Optional[float] = 0.0,
+        cu_seqlens_k: Optional[torch.Tensor] = None,
+        # The following parameters are not used in ipex kernel currently,
+        # we keep API compatible to CUDA's.
+        scheduler_metadata=None,
+        fa_version: int = 2,
+        q_descale=None,
+        k_descale=None,
+        v_descale=None,
+    ):
+        if cu_seqlens_k is None:
+            # cu_seqlens_k is not used in ipex kernel.
+            cu_seqlens_k = torch.cumsum(seqused_k, dim=0)
+            cu_seqlens_k = torch.cat([
+                torch.tensor([0], device=seqused_k.device, dtype=torch.int32),
+                cu_seqlens_k
+            ]).to(torch.int32)
+
+        real_window_size: tuple[int, int]
+        if window_size is None:
+            real_window_size = (-1, -1)
+        else:
+            assert len(window_size) == 2
+            real_window_size = (window_size[0], window_size[1])
+        return ipex.llm.modules.PagedAttention.flash_attn_varlen_func(
+            out,
+            q.contiguous(),
+            k,
+            v,
+            cu_seqlens_q,
+            cu_seqlens_k,
+            max_seqlen_q,
+            max_seqlen_k,
+            softmax_scale,
+            causal,
+            block_table,
+            alibi_slopes,
+            softcap=softcap,
+            window_size_left=real_window_size[0],
+            window_size_right=real_window_size[1],
+            k_scale=1.0,
+            v_scale=1.0,
+        )
+
+    @staticmethod
+    def get_scheduler_metadata(
+            batch_size,
+            max_seqlen_q,
+            max_seqlen_k,
+            num_heads_q,
+            num_heads_kv,
+            headdim,
+            cache_seqlens: torch.Tensor,
+            qkv_dtype=torch.bfloat16,
+            headdim_v=None,
+            cu_seqlens_q: Optional[torch.Tensor] = None,
+            cu_seqlens_k_new: Optional[torch.Tensor] = None,
+            cache_leftpad: Optional[torch.Tensor] = None,
+            page_size: Optional[int] = None,
+            max_seqlen_k_new=0,
+            causal=False,
+            window_size=(-1, -1),  # -1 means infinite context window
+            has_softcap=False,
+            num_splits=0,  # Can be tuned for speed
+            pack_gqa=None,  # Can be tuned for speed
+            sm_margin=0,  # Can be tuned if some SMs are used for communication
+    ) -> None:
+        logger.warning_once(
+            "get_scheduler_metadata is not implemented for ipex_ops, "
+            "returning None.")
+        return None
+
    @staticmethod
    def copy_blocks(key_caches: list[torch.Tensor],
                    value_caches: list[torch.Tensor],
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@ -306,12 +306,16 @@ class MultiHeadAttention(nn.Module):
                                        block_size=16,
                                        is_attention_free=False)
        backend = backend_name_to_enum(attn_backend.get_name())
-        if backend in {_Backend.FLASH_ATTN, _Backend.FLASH_ATTN_VLLM_V1}:
-            backend = _Backend.XFORMERS
+        if current_platform.is_rocm():
+            # currently, only torch_sdpa is supported on rocm
+            self.attn_backend = _Backend.TORCH_SDPA
+        else:
+            if backend in {_Backend.FLASH_ATTN, _Backend.FLASH_ATTN_VLLM_V1}:
+                backend = _Backend.XFORMERS

-        self.attn_backend = backend if backend in {
-            _Backend.TORCH_SDPA, _Backend.XFORMERS, _Backend.PALLAS_VLLM_V1
-        } else _Backend.TORCH_SDPA
+            self.attn_backend = backend if backend in {
+                _Backend.TORCH_SDPA, _Backend.XFORMERS, _Backend.PALLAS_VLLM_V1
+            } else _Backend.TORCH_SDPA

    def forward(
        self,
--- a/vllm/attention/ops/nki_flash_attn.py
+++ b/vllm/attention/ops/nki_flash_attn.py
@ -8,9 +8,7 @@ import torch
 from neuronxcc import nki
 from neuronxcc.nki.language import par_dim

-
-def ceil_div(a, b):
-    return (a + b - 1) // b
+from vllm.utils import cdiv


 def is_power_of_2(x):
@ -35,11 +33,10 @@ def load_block_tables(block_tables_hbm, num_tiles, num_blocks_per_tile):
        (num_tiles, num_blocks_per_tile))

    block_tables_sbuf = nl.zeros(
-        (ceil_div(num_tiles,
-                  B_P_SIZE), par_dim(B_P_SIZE), num_blocks_per_tile),
+        (cdiv(num_tiles, B_P_SIZE), par_dim(B_P_SIZE), num_blocks_per_tile),
        dtype=nl.int32,
    )
-    for i in nl.affine_range(ceil_div(num_tiles, B_P_SIZE)):
+    for i in nl.affine_range(cdiv(num_tiles, B_P_SIZE)):
        i_p = nl.arange(B_P_SIZE)[:, None]
        i_f = nl.arange(num_blocks_per_tile)[None, :]
        block_tables_sbuf[i, i_p, i_f] = nl.load(
@ -83,7 +80,7 @@ def transform_block_tables_for_indirect_load(
    assert is_power_of_2(
        num_blocks_per_tile), f"{num_blocks_per_tile=} is not power of 2"

-    num_loads = ceil_div(num_blocks_per_tile, B_P_SIZE)
+    num_loads = cdiv(num_blocks_per_tile, B_P_SIZE)
    block_tables_transposed = nl.ndarray(
        (
            num_loads,
@ -165,7 +162,7 @@ def load_kv_tile_from_cache(
           equivalent to (par_dim(B_P_SIZE), seqlen_kv // B_P_SIZE * B_D_SIZE)
    """
    # load key cache
-    num_loads = ceil_div(num_blocks_per_large_tile, B_P_SIZE)
+    num_loads = cdiv(num_blocks_per_large_tile, B_P_SIZE)
    for load_idx in nl.affine_range(num_loads):
        i_p = nl.arange(B_P_SIZE)[:, None]
        i_f = nl.arange(tiled_block_size * B_D_SIZE)[None, :]
@ -605,7 +602,7 @@ def flash_paged_attention(
    )

    for large_k_tile_idx in nl.sequential_range(0, num_large_k_tile):
-        num_loads = ceil_div(num_blocks_per_large_tile, B_P_SIZE)
+        num_loads = cdiv(num_blocks_per_large_tile, B_P_SIZE)
        cur_k_tile = nl.ndarray(
            (par_dim(B_D_SIZE), LARGE_TILE_SZ),
            dtype=kernel_dtype,
--- a/vllm/attention/ops/pallas_kv_cache_update.py
+++ b/vllm/attention/ops/pallas_kv_cache_update.py
@ -0,0 +1,117 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import functools
+
+import jax
+from jax.experimental import pallas as pl
+from jax.experimental.pallas import tpu as pltpu
+
+
+def _kv_cache_update_kernel(
+    # Prefetch
+    slices_ref,  # [3, num_slices], list of (kv_cache_start, new_kv_start,
+    # slice_len)
+    # Input
+    new_kv_hbm_ref,  # [num_tokens, num_combined_kv_heads, head_dim]
+    kv_cache_hbm_ref,  # [total_num_pages * page_size, num_combined_kv_heads,
+    # head_dim]
+    # Output
+    _,  # [total_num_pages * page_size, num_combined_kv_heads, head_dim]
+    # Scratch
+    scratch,  # [num_slices_per_block, page_size, num_combined_kv_heads,
+    # head_dim]
+    sem,
+):
+    async_copies = []
+    block_idx = pl.program_id(0)
+    num_slices_per_block = scratch.shape[0]
+
+    # Copy from new_kv_hbm_ref to scratch
+    for i in range(num_slices_per_block):
+        offset_i = i + block_idx * num_slices_per_block
+        new_kv_start = slices_ref[1, offset_i]
+        length = slices_ref[2, offset_i]
+        async_copy = pltpu.make_async_copy(
+            new_kv_hbm_ref.at[pl.ds(new_kv_start, length), ...],
+            scratch.at[i, pl.ds(0, length), ...],
+            sem,
+        )
+        async_copy.start()
+        async_copies.append(async_copy)
+
+    for async_copy in async_copies:
+        async_copy.wait()
+
+    # Copy from scratch to kv_cache_hbm_ref
+    async_copies.clear()
+    for i in range(num_slices_per_block):
+        offset_i = i + block_idx * num_slices_per_block
+        kv_cache_start = slices_ref[0, offset_i]
+        length = slices_ref[2, offset_i]
+        async_copy = pltpu.make_async_copy(
+            scratch.at[i, pl.ds(0, length), ...],
+            kv_cache_hbm_ref.at[pl.ds(kv_cache_start, length), ...],
+            sem,
+        )
+        async_copy.start()
+        async_copies.append(async_copy)
+    for async_copy in async_copies:
+        async_copy.wait()
+
+
+@functools.partial(
+    jax.jit,
+    static_argnames=["page_size", "num_slices_per_block"],
+)
+def kv_cache_update(
+    new_kv: jax.Array,  # [total_num_token, num_combined_kv_heads, head_dim]
+    slices: jax.
+    Array,  # [3, slices], list of (kv_cache_start, new_kv_start, slice_len)
+    kv_cache: jax.
+    Array,  # [total_num_pages * page_size, num_combined_kv_heads, head_dim]
+    *,
+    page_size: int = 32,
+    num_slices_per_block: int = 8,
+):
+    assert slices.shape[1] % num_slices_per_block == 0
+    _, num_combined_kv_heads, head_dim = new_kv.shape
+    assert kv_cache.shape[1] == num_combined_kv_heads
+    assert kv_cache.shape[2] == head_dim
+    assert head_dim % 128 == 0
+    # TODO: Add dynamic check to make sure that the all the slice lengths are
+    # smaller or equal to page_size
+
+    in_specs = [
+        pl.BlockSpec(memory_space=pltpu.TPUMemorySpace.ANY),
+        pl.BlockSpec(memory_space=pltpu.TPUMemorySpace.ANY),
+    ]
+
+    out_specs = [pl.BlockSpec(memory_space=pltpu.TPUMemorySpace.ANY)]
+    out_shape = [jax.ShapeDtypeStruct(kv_cache.shape, dtype=kv_cache.dtype)]
+
+    scalar_prefetches = [slices]
+    scratch = pltpu.VMEM(
+        (num_slices_per_block, page_size, num_combined_kv_heads, head_dim),
+        new_kv.dtype,
+    )
+
+    scratch_shapes = [
+        scratch,
+        pltpu.SemaphoreType.DMA,
+    ]
+
+    kernel = pl.pallas_call(
+        _kv_cache_update_kernel,
+        grid_spec=pltpu.PrefetchScalarGridSpec(
+            num_scalar_prefetch=len(scalar_prefetches),
+            in_specs=in_specs,
+            out_specs=out_specs,
+            grid=(slices.shape[1] // num_slices_per_block, ),
+            scratch_shapes=scratch_shapes,
+        ),
+        out_shape=out_shape,
+        input_output_aliases={len(scalar_prefetches) + 1: 0},
+    )
+
+    return kernel(*scalar_prefetches, new_kv, kv_cache)[0]
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
ShriKode	b801bf30d7	iterate Signed-off-by: ShriKode <shrikode@gmail.com>	2025-06-28 22:21:17 +00:00
ShriKode	bfd63b1b10	initial Signed-off-by: ShriKode <shrikode@gmail.com>	2025-06-27 20:18:15 +00:00
Fabien Dupont	3c545c0c3b	[CI/Build] Allow hermetic builds (#18064 ) Signed-off-by: Fabien Dupont <fdupont@redhat.com> Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com> Signed-off-by: Fabien Dupont <fabiendupont@pm.me> Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com> Co-authored-by: Elias Levy <eliaslevy@google.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>	2025-06-27 09:04:39 -07:00
Tyler Michael Smith	e8c3bd2cd1	[Bugfix] Fix some narrowing conversion warnings (#20141 ) Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>	2025-06-27 09:01:28 -07:00
bnellnm	c6c983053d	[Bugfix] Mark 'hidden_states' as mutable in moe_forward registration. (#20152 ) Signed-off-by: Bill Nell <bnell@redhat.com>	2025-06-27 09:42:22 -06:00
Luka Govedič	aafabaa0d5	[Fix][torch.compile] Enable custom ops by default when Inductor off (#20102 ) Signed-off-by: luka <luka@neuralmagic.com>	2025-06-27 09:00:42 -06:00
Hosang	94a55c7681	[Fix][ROCm] Remove unused variables to fix build error on GFX11/12 (#19891 ) Signed-off-by: Hosang Yoon <hosang.yoon@amd.com>	2025-06-27 07:14:44 -07:00
Ilya Lavrenov	aa0dc77ef5	[Perf] Improved perf for resolve_chat_template_content_format (#20065 ) Signed-off-by: Ilya Lavrenov <ilya.lavrenov@cerebras.net>	2025-06-27 09:16:41 +00:00
Michael Goin	4ab3ac285e	[Bugfix] Fix flaky failure when getting DP ports (#20151 ) Signed-off-by: mgoin <mgoin64@gmail.com>	2025-06-27 15:30:53 +08:00
Robert Shaw	d1c956dc0f	Gemma3n (Text-only) (#20134 ) Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com> Signed-off-by: Roger Wang <hey@rogerw.me> Co-authored-by: Roger Wang <hey@rogerw.me>	2025-06-27 07:16:26 +00:00
Chendi.Xue	dec197e3e5	Quick Fix by adding conditional import for flash_attn_varlen_func in flash_attn (#20143 ) Signed-off-by: Chendi.Xue <chendi.xue@intel.com>	2025-06-27 05:48:13 +00:00
Yazan Sharaya	6e244ae091	[Perf][Frontend] eliminate api_key and x_request_id headers middleware overhead (#19946 ) Signed-off-by: Yazan-Sharaya <yazan.sharaya.yes@gmail.com>	2025-06-27 00:44:14 -04:00
wang.yuqi	cd4cfee689	[Model][1/N] Automatic conversion of CrossEncoding model (#20012 ) Signed-off-by: wang.yuqi <noooop@126.com>	2025-06-26 21:10:04 -07:00
Thomas Parnell	e110930680	[Fix] Fix gemma CI test failing on main (#20124 ) Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>	2025-06-26 21:06:59 -07:00
Yang Wang	8b64c895c0	[CI] Sync test dependency with test.in for torch nightly (#19632 ) Signed-off-by: Yang Wang <elainewy@meta.com> Signed-off-by: Yida Wu <yidawu@alumni.cmu.edu> Signed-off-by: Nick Hill <nhill@redhat.com> Co-authored-by: Concurrensee <yida.wu@amd.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com> Co-authored-by: Nick Hill <nhill@redhat.com>	2025-06-26 20:55:25 -07:00
li haoyang	0740e29b66	[Feature] add quick all reduce (#19744 ) Signed-off-by: ilmarkov <imarkov@redhat.com> Signed-off-by: Haoyang Li <Haoyang.Li@amd.com> Co-authored-by: ilmarkov <imarkov@redhat.com>	2025-06-26 20:54:24 -07:00
Michael Goin	44d2e6af63	[Bugfix] Build moe_data for both sm100 and sm90 (#20086 ) Signed-off-by: mgoin <mgoin64@gmail.com>	2025-06-26 20:50:12 -07:00
Ilya Markov	2d7779f888	[Perf] SM100 FP8 GEMM Optimizations after cutlass_profiler (#20071 ) Signed-off-by: ilmarkov <imarkov@redhat.com> Co-authored-by: ilmarkov <imarkov@redhat.com>	2025-06-26 20:50:09 -07:00
Dipika Sikka	a57d57fa72	[Quantization] Bump to use latest `compressed-tensors` (#20033 ) Signed-off-by: Dipika <dipikasikka1@gmail.com> Co-authored-by: Kyle Sayers <kylesayrs@gmail.com>	2025-06-26 20:50:06 -07:00
Michael Goin	71799fd005	[CI Failure] Fix OOM with test_oot_registration_embedding (#20144 ) Signed-off-by: mgoin <mgoin64@gmail.com>	2025-06-27 11:21:04 +08:00
Bowen Wang	e9fd658a73	[Feature] Expert Parallelism Load Balancer (EPLB) (#18343 ) Signed-off-by: Bowen Wang <abmfy@icloud.com>	2025-06-26 15:30:21 -07:00
Kyle Yu	07b8fae219	[Doc] correct LoRA capitalization (#20135 ) Signed-off-by: kyolebu <kyu@redhat.com>	2025-06-26 15:22:12 -07:00
Wentao Ye	562308816c	[Refactor] Rename commnication utils (#20091 ) Signed-off-by: yewentao256 <zhyanwentao@126.com>	2025-06-26 22:19:32 +00:00
Chengji Yao	04e1642e32	[TPU] add kv cache update kernel (#19928 ) Signed-off-by: Chengji Yao <chengjiyao@google.com>	2025-06-26 10:01:37 -07:00
Kunshang Ji	b69781f107	[Hardware][Intel GPU] Add v1 Intel GPU support with Flash attention backend. (#19560 ) Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>	2025-06-26 09:27:18 -07:00
Tyler Michael Smith	0bceac9810	Spam folks if config.py changes (#20131 ) Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>	2025-06-26 08:19:46 -07:00
Cyrus Leung	34878a0b48	[Doc] Rename page titles (#20130 ) Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>	2025-06-26 08:18:49 -07:00
Cyrus Leung	6393b03986	[Doc] Auto sign-off for VSCode (#20132 ) Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>	2025-06-26 08:18:36 -07:00
wang.yuqi	0907d507bf	[Doc] Automatically signed-off by PyCharm (#20120 ) Signed-off-by: wang.yuqi <noooop@126.com>	2025-06-26 14:34:17 +00:00
Wentao Ye	c894c5dc1f	[Bug Fix] Fix address/port already in use error for deep_ep test (#20094 ) Signed-off-by: yewentao256 <zhyanwentao@126.com>	2025-06-26 22:33:13 +08:00
Michael Goin	1f5d178e9c	Revert "[Bugfix] default set cuda_graph_sizes to max_num_seqs for v1 engine" (#20128 )	2025-06-26 07:32:22 -07:00
TJian	27c065df50	[Bugfix][V1][ROCm] Fix AITER Flash Attention Backend (Fix API Break and Local Attention Logic: affecting Llama4) (#19904 ) Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>	2025-06-26 12:42:31 +00:00
Michael Yao	84c260caeb	[Docs] Improve frameworks/helm.md (#20113 ) Signed-off-by: windsonsea <haifeng.yao@daocloud.io>	2025-06-26 10:41:51 +00:00
Reid	167aca45cb	[Misc] Use collapsible blocks for benchmark examples. (#20017 ) Signed-off-by: reidliu41 <reid201711@gmail.com> Co-authored-by: reidliu41 <reid201711@gmail.com>	2025-06-26 03:35:16 -07:00
Li, Jiang	0567c8249f	[CPU] Fix torch version in x86 CPU backend (#19258 ) Signed-off-by: jiang1.li <jiang1.li@intel.com>	2025-06-26 03:34:47 -07:00
Wentao Ye	d188913d99	[Refactor] Remove unused library (#20099 ) Signed-off-by: yewentao256 <zhyanwentao@126.com>	2025-06-26 09:16:10 +00:00
Cyrus Leung	1d7c29f5fe	[Doc] Update docs for New Model Implementation (#20115 ) Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>	2025-06-26 00:47:06 -07:00
Seiji Eicher	65397e40f5	[Bugfix] Allow `CUDA_VISIBLE_DEVICES=''` in `Platform.device_id_to_physical_device_id` (#18979 ) Signed-off-by: Seiji Eicher <seiji@anyscale.com>	2025-06-26 00:01:57 -07:00
Ekagra Ranjan	9502c38138	[Benchmark][Bug] Fix multiple bugs in bench and add args to spec_decode offline (#20083 )	2025-06-25 22:06:27 -07:00
Nicolò Lucchesi	2582683566	[PD] Skip `tp_size` exchange with rank0 (#19413 ) Signed-off-by: NickLucche <nlucches@redhat.com>	2025-06-25 20:04:39 -07:00
Michael Goin	754b00edb3	[Bugfix] Fix Mistral tool-parser regex for nested JSON (#20093 ) Signed-off-by: mgoin <mgoin64@gmail.com>	2025-06-26 01:01:17 +00:00
Michael Goin	296ce95d8e	[CI] Add SM120 to the Dockerfile (#19794 ) Signed-off-by: mgoin <mgoin64@gmail.com>	2025-06-25 16:23:56 -07:00
Chenyaaang	2d7620c3eb	[TPU] Add TPU specific var VLLM_TPU_MOST_MODEL_LEN (#19919 ) Signed-off-by: Chenyaaang <chenyangli@google.com>	2025-06-25 15:51:02 -07:00
Nick Hill	55c65ab495	[P/D] Avoid stranding blocks in P when aborted in D's waiting queue (#19223 ) Signed-off-by: Nick Hill <nhill@redhat.com>	2025-06-25 15:19:44 -07:00
Chengji Yao	2cc2069970	[TPU][Bugfix] fix kv cache padding (#20048 ) Signed-off-by: Chengji Yao <chengjiyao@google.com>	2025-06-25 21:24:10 +00:00
zhrrr	9f0608fc16	[Bugfix] default set cuda_graph_sizes to max_num_seqs for v1 engine (#20062 ) Signed-off-by: izhuhaoran <izhuhaoran@qq.com>	2025-06-25 21:03:17 +00:00
QiliangCui	4e0db57fff	Fix the path to the testing script. (#20082 ) Signed-off-by: Qiliang Cui <derrhein@gmail.com>	2025-06-25 20:48:17 +00:00
Nick Hill	c40692bf9a	[Misc] Add parallel state `node_count` function (#20045 ) Signed-off-by: Nick Hill <nhill@redhat.com>	2025-06-25 13:38:53 -07:00
lkchen	4734704b30	[PD] let toy proxy handle /chat/completions (#19730 ) Signed-off-by: Linkun <github@lkchen.net>	2025-06-25 15:17:45 -04:00
Eldar Kurtić	8b8c209e35	static_scaled_fp8_quant should not run when scale.numel is not 1 (#20076 )	2025-06-25 15:08:03 -04:00
lsz05	23a04e0895	[Fix] Support cls pooling in ModernBertPooler (#20067 ) Signed-off-by: shengzhe.li <shengzhe.li@sbintuitions.co.jp>	2025-06-25 15:07:45 -04:00
Dipika Sikka	02c97d9a92	[Quantization] Add compressed-tensors emulations support for NVFP4 (#19879 ) Signed-off-by: Dipika Sikka <dipikasikka1@gmail.com> Signed-off-by: Dipika <dipikasikka1@gmail.com>	2025-06-25 14:28:19 -04:00
Nicolò Lucchesi	e795d723ed	[Frontend] Add `/v1/audio/translations` OpenAI API endpoint (#19615 ) Signed-off-by: Roger Wang <ywang@roblox.com> Signed-off-by: NickLucche <nlucches@redhat.com> Co-authored-by: Roger Wang <ywang@roblox.com>	2025-06-25 17:54:14 +00:00
cjackal	8359f4c8d8	[V1][Speculative Decoding] Fix DeepSeek MTP (#20022 ) Signed-off-by: cjackal <44624812+cjackal@users.noreply.github.com>	2025-06-25 08:41:02 -07:00
Michael Goin	bf5181583f	[Doc] Guide for Incremental Compilation Workflow (#19109 )	2025-06-25 22:06:46 +09:00
Reid	c53fec1fcb	[doc] add reference link for Intel XPU (#20064 ) Signed-off-by: reidliu41 <reid201711@gmail.com> Co-authored-by: reidliu41 <reid201711@gmail.com>	2025-06-25 12:24:07 +00:00
Lucas Wilkinson	0f9e7354f5	[BugFix] Fix full-cuda-graph illegal memory access in FA3 (#20057 ) Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>	2025-06-25 08:39:04 +00:00
Aaron Pham	ba7ba35cda	[Chore] debloat some initial logs (#19438 ) Signed-off-by: Aaron Pham <contact@aarnphm.xyz>	2025-06-25 06:36:22 +00:00
bnellnm	015fab8c2f	[Kernels][Bugfix] Use torch op for all kernels in FusedMoE forward. Add additional testing for cudagraphs. (#19717 ) Signed-off-by: Bill Nell <bnell@redhat.com>	2025-06-24 23:22:58 -07:00
Max Wittig	f59fc60fb3	[Feat][CLI] enforce-include-usage (#19695 ) Signed-off-by: Max Wittig <max.wittig@siemens.com>	2025-06-25 01:43:04 -04:00
Wentao Ye	879f69bed3	[Refactor] Remove duplicate `ceil_div` (#20023 ) Signed-off-by: yewentao256 <zhyanwentao@126.com>	2025-06-25 05:19:09 +00:00
David Xia	7108934142	[Frontend] speed up import time of vllm.config (#18036 ) Signed-off-by: David Xia <david@davidxia.com>	2025-06-25 00:41:11 -04:00
h-avsha	3443aaf8dd	Move to a faster base64 implementation (#19984 ) Signed-off-by: h-avsha <avshalom.manevich@hcompany.ai>	2025-06-24 20:33:51 -07:00
Isotr0py	2273ec322c	Revert "Fix(models/siglip): Add compatibility for Gemma models quantized by llm-compressor" (#20030 )	2025-06-25 11:23:29 +08:00
Wentao Ye	a6c4b87fbc	Revert "[Feature] Integrate new deepgemm (#19820 )" (#20049 ) Signed-off-by: yewentao256 <zhyanwentao@126.com>	2025-06-24 19:45:22 -07:00
Brayden Zhong	1afa9948f5	[Llama4] Update `attn_temperature_tuning` (#19997 ) Signed-off-by: Brayden Zhong <b8zhong@uwaterloo.ca>	2025-06-24 22:42:53 -04:00
Eli Uriegas	0d06b533a0	cmake: Update vllm_flash_attn for vllm_kernels (#20032 ) Signed-off-by: Eli Uriegas <eliuriegas@meta.com>	2025-06-24 22:44:10 +00:00
Boyuan Feng	c01d1c5aba	use .dev for version comparison with pytorch nightly release (#20031 ) Signed-off-by: Boyuan Feng <boyuan@meta.com>	2025-06-24 21:52:16 +00:00
Brayden Zhong	ead369845d	[Easy] Remove submodule added in #19463 (#20039 ) Signed-off-by: Brayden Zhong <b8zhong@uwaterloo.ca>	2025-06-24 13:23:15 -07:00
Wentao Ye	c6e3bba8e6	[Feature] Integrate new deepgemm (#19820 ) Signed-off-by: yewentao256 <zhyanwentao@126.com>	2025-06-24 12:51:56 -07:00
lkchen	91f7d9d0b6	[P/D] Asynchronously do _nixl_handshake (#19836 ) Signed-off-by: Linkun Chen <github@lkchen.net> Signed-off-by: Nick Hill <nhill@redhat.com> Co-authored-by: Nick Hill <nhill@redhat.com>	2025-06-24 12:46:10 -07:00
Nick Hill	8619e7158c	[BugFix] Fix multi-node offline data parallel (#19937 ) Signed-off-by: Nick Hill <nhill@redhat.com>	2025-06-24 12:45:20 -07:00
d.transposed	c635c5f744	[Misc][Benchmarking] Add variable request-rate ("ramp-up") to the benchmarking client. (#19423 ) Signed-off-by: dtransposed <damian@damian-ml-machine.europe-west3-b.c.jetbrains-grazie.internal> Co-authored-by: dtransposed <damian@damian-ml-machine.europe-west3-b.c.jetbrains-grazie.internal> Co-authored-by: Roger Wang <hey@rogerw.me>	2025-06-24 18:41:49 +00:00
Lucas Wilkinson	a045b7e89a	[Perf] Improve/Fix-regression for FA3 in High QPS regimes (#19463 ) Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>	2025-06-24 13:09:01 -04:00
amit	981eeca41a	[Fix][V1] Remove --scheduling-policy oracle (#20010 ) Signed-off-by: amit <amit.man@gmail.com>	2025-06-24 09:52:15 -07:00