opt

Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
fix
2025-08-15 14:24:50 -07:00 · 2025-08-15 14:00:26 -07:00 · 2025-08-15 19:10:01 +00:00 · 2025-08-15 11:51:50 -07:00 · 2025-08-15 18:47:56 +00:00 · 2025-08-15 14:46:00 -04:00
368 changed files with 13627 additions and 9058 deletions
--- a/.buildkite/nightly-benchmarks/README.md
+++ b/.buildkite/nightly-benchmarks/README.md
@ -7,7 +7,7 @@ This directory contains two sets of benchmark for vllm.
 - Performance benchmark: benchmark vllm's performance under various workload, for **developers** to gain clarity on whether their PR improves/degrades vllm's performance
 - Nightly benchmark: compare vllm's performance against alternatives (tgi, trt-llm and lmdeploy), for **the public** to know when to choose vllm.

-See [vLLM performance dashboard](https://perf.vllm.ai) for the latest performance benchmark results and [vLLM GitHub README](https://github.com/vllm-project/vllm/blob/main/README.md) for latest nightly benchmark results.
+See [vLLM performance dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm) for the latest performance benchmark results and [vLLM GitHub README](https://github.com/vllm-project/vllm/blob/main/README.md) for latest nightly benchmark results.

 ## Performance benchmark quick overview

@ -138,28 +138,20 @@ The raw benchmarking results (in the format of json files) are in the `Artifacts

 The `compare-json-results.py` helps to compare benchmark results JSON files converted using `convert-results-json-to-markdown.py`.
 When run, benchmark script generates results under `benchmark/results` folder, along with the `benchmark_results.md` and `benchmark_results.json`.
-`compare-json-results.py` compares two `benchmark_results.json` files and provides performance ratio e.g. for Output Tput, Median TTFT and Median TPOT.
+`compare-json-results.py` compares two `benchmark_results.json` files and provides performance ratio e.g. for Output Tput, Median TTFT and Median TPOT.  
+If only one benchmark_results.json is passed, `compare-json-results.py` compares different TP and PP configurations in the benchmark_results.json instead.

-Here is an example using the script to compare result_a and result_b without detail test name.
-`python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json --ignore_test_name`
-
-|    | results_a/benchmark_results.json | results_b/benchmark_results.json | perf_ratio        |
-|----|----------------------------------------|----------------------------------------|----------|
-| 0  | 142.633982                             | 156.526018                             | 1.097396 |
-| 1  | 241.620334                             | 294.018783                             | 1.216863 |
-| 2  | 218.298905                             | 262.664916                             | 1.203235 |
-| 3  | 242.743860                             | 299.816190                             | 1.235113 |
-
-Here is an example using the script to compare result_a and result_b with detail test name.
+Here is an example using the script to compare result_a and result_b with Model, Dataset name, input/output lenght, max concurrency and qps.
 `python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json`

-|   | results_a/benchmark_results.json_name | results_a/benchmark_results.json | results_b/benchmark_results.json_name | results_b/benchmark_results.json | perf_ratio        |
-|---|---------------------------------------------|----------------------------------------|---------------------------------------------|----------------------------------------|----------|
-| 0 | serving_llama8B_tp1_sharegpt_qps_1          | 142.633982                             | serving_llama8B_tp1_sharegpt_qps_1          | 156.526018                             | 1.097396 |
-| 1 | serving_llama8B_tp1_sharegpt_qps_16         | 241.620334                             | serving_llama8B_tp1_sharegpt_qps_16         | 294.018783                             | 1.216863 |
-| 2 | serving_llama8B_tp1_sharegpt_qps_4          | 218.298905                             | serving_llama8B_tp1_sharegpt_qps_4          | 262.664916                             | 1.203235 |
-| 3 | serving_llama8B_tp1_sharegpt_qps_inf        | 242.743860                             | serving_llama8B_tp1_sharegpt_qps_inf        | 299.816190                             | 1.235113 |
-| 4 | serving_llama8B_tp2_random_1024_128_qps_1   | 96.613390                              | serving_llama8B_tp4_random_1024_128_qps_1   | 108.404853                             | 1.122048 |
+|   | Model | Dataset Name | Input Len | Output Len | # of max concurrency | qps  | results_a/benchmark_results.json | results_b/benchmark_results.json | perf_ratio        |
+|----|---------------------------------------|--------|-----|-----|------|-----|-----------|----------|----------|
+| 0  | meta-llama/Meta-Llama-3.1-8B-Instruct | random | 128 | 128 | 1000 | 1 | 142.633982                             | 156.526018                             | 1.097396 |
+| 1  | meta-llama/Meta-Llama-3.1-8B-Instruct | random | 128 | 128 | 1000 | inf| 241.620334                             | 294.018783                             | 1.216863 |
+
+A comparison diagram will be generated below the table.
+Here is an example to compare between 96c/results_gnr_96c_091_tp2pp3 and 128c/results_gnr_128c_091_tp2pp3
+<img width="1886" height="828" alt="image" src="https://github.com/user-attachments/assets/c02a43ef-25d0-4fd6-90e5-2169a28682dd" />

 ## Nightly test details

--- a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py
+++ b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py
@ -1,24 +1,38 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import argparse
+import json
+import os

 import pandas as pd


 def compare_data_columns(
-    files, name_column, data_column, drop_column, ignore_test_name=False
+    files, name_column, data_column, info_cols, drop_column, debug=False
 ):
    print("\ncompare_data_column: " + data_column)
    frames = []
+    raw_data_cols = []
    compare_frames = []
    for file in files:
        data_df = pd.read_json(file)
        serving_df = data_df.dropna(subset=[drop_column], ignore_index=True)
-        if ignore_test_name is False:
+        # Show all info columns in the first couple columns
+        if not frames:
+            for col in info_cols:
+                if col not in serving_df.columns:
+                    print(f"Skipping missing column: {col}")
+                    continue
+                frames.append(serving_df[col])
+        # only show test name under debug mode
+        if debug is True:
            serving_df = serving_df.rename(columns={name_column: file + "_name"})
            frames.append(serving_df[file + "_name"])
+
+        file = "/".join(file.split("/")[:-1])
        serving_df = serving_df.rename(columns={data_column: file})
        frames.append(serving_df[file])
+        raw_data_cols.append(file)
        compare_frames.append(serving_df[file])
        if len(compare_frames) >= 2:
            # Compare numbers among two files
@ -27,7 +41,68 @@ def compare_data_columns(
            compare_frames.pop(1)

    concat_df = pd.concat(frames, axis=1)
-    return concat_df
+    print(raw_data_cols)
+    return concat_df, raw_data_cols
+
+
+def split_json_by_tp_pp(
+    input_file: str = "benchmark_results.json", output_root: str = "."
+) -> list[str]:
+    """
+    Split a benchmark JSON into separate folders by (TP Size, PP Size).
+
+    Creates: <output_root>/tp{TP}_pp{PP}/benchmark_results.json
+    Returns: list of file paths written.
+    """
+    # Load JSON data into DataFrame
+    with open(input_file, encoding="utf-8") as f:
+        data = json.load(f)
+
+    # If the JSON is a dict with a list under common keys, use that list
+    if isinstance(data, dict):
+        for key in ("results", "serving_results", "benchmarks", "data"):
+            if isinstance(data.get(key), list):
+                data = data[key]
+                break
+
+    df = pd.DataFrame(data)
+
+    # Handle alias column names
+    rename_map = {
+        "tp_size": "TP Size",
+        "tensor_parallel_size": "TP Size",
+        "pp_size": "PP Size",
+        "pipeline_parallel_size": "PP Size",
+    }
+    df.rename(
+        columns={k: v for k, v in rename_map.items() if k in df.columns}, inplace=True
+    )
+
+    # Ensure TP/PP columns exist (default to 1 if missing)
+    if "TP Size" not in df.columns:
+        df["TP Size"] = 1
+    if "PP Size" not in df.columns:
+        df["PP Size"] = 1
+
+    # make sure TP/PP are numeric ints with no NaN
+    df["TP Size"] = (
+        pd.to_numeric(df.get("TP Size", 1), errors="coerce").fillna(1).astype(int)
+    )
+    df["PP Size"] = (
+        pd.to_numeric(df.get("PP Size", 1), errors="coerce").fillna(1).astype(int)
+    )
+
+    # Split into separate folders
+    saved_paths: list[str] = []
+    for (tp, pp), group_df in df.groupby(["TP Size", "PP Size"], dropna=False):
+        folder_name = os.path.join(output_root, f"tp{int(tp)}_pp{int(pp)}")
+        os.makedirs(folder_name, exist_ok=True)
+        filepath = os.path.join(folder_name, "benchmark_results.json")
+        group_df.to_json(filepath, orient="records", indent=2, force_ascii=False)
+        print(f"Saved: {filepath}")
+        saved_paths.append(filepath)
+
+    return saved_paths


 if __name__ == "__main__":
@ -36,31 +111,105 @@ if __name__ == "__main__":
        "-f", "--file", action="append", type=str, help="input file name"
    )
    parser.add_argument(
-        "--ignore_test_name", action="store_true", help="ignore_test_name or not"
+        "--debug", action="store_true", help="show all information for debugging"
+    )
+    parser.add_argument(
+        "--plot",
+        action=argparse.BooleanOptionalAction,
+        default=True,
+        help="plot perf diagrams or not --no-plot --plot",
+    )
+    parser.add_argument(
+        "-x",
+        "--xaxis",
+        type=str,
+        default="# of max concurrency.",
+        help="column name to use as X Axis in comparision graph",
    )
    args = parser.parse_args()
-    files = args.file
-    print("comparing : " + ", ".join(files))

    drop_column = "P99"
    name_column = "Test name"
+    info_cols = [
+        "Model",
+        "Dataset Name",
+        "Input Len",
+        "Output Len",
+        "TP Size",
+        "PP Size",
+        "# of max concurrency.",
+        "qps",
+    ]
    data_cols_to_compare = ["Output Tput (tok/s)", "Median TTFT (ms)", "Median"]
    html_msgs_for_data_cols = [
        "Compare Output Tokens /n",
        "Median TTFT /n",
        "Median TPOT /n",
    ]
-    ignore_test_name = args.ignore_test_name
+
+    if len(args.file) == 1:
+        files = split_json_by_tp_pp(args.file[0], output_root="splits")
+        info_cols = [c for c in info_cols if c not in ("TP Size", "PP Size")]
+    else:
+        files = args.file
+    print("comparing : " + ", ".join(files))
+    debug = args.debug
+    plot = args.plot
+    # For Plot feature, assign y axis from one of info_cols
+    y_axis_index = info_cols.index(args.xaxis) if args.xaxis in info_cols else 6
    with open("perf_comparison.html", "w") as text_file:
        for i in range(len(data_cols_to_compare)):
-            output_df = compare_data_columns(
+            output_df, raw_data_cols = compare_data_columns(
                files,
                name_column,
                data_cols_to_compare[i],
+                info_cols,
                drop_column,
-                ignore_test_name=ignore_test_name,
+                debug=debug,
            )
-            print(output_df)
-            html = output_df.to_html()
-            text_file.write(html_msgs_for_data_cols[i])
-            text_file.write(html)
+
+            # For Plot feature, insert y axis from one of info_cols
+            raw_data_cols.insert(0, info_cols[y_axis_index])
+
+            filtered_info_cols = info_cols[:-2]
+            existing_group_cols = [
+                c for c in filtered_info_cols if c in output_df.columns
+            ]
+            if not existing_group_cols:
+                raise ValueError(
+                    f"No valid group-by columns  "
+                    f"Expected subset: {filtered_info_cols}, "
+                    f"but DataFrame has: {list(output_df.columns)}"
+                )
+
+            output_df_sorted = output_df.sort_values(by=existing_group_cols)
+            output_groups = output_df_sorted.groupby(existing_group_cols, dropna=False)
+            for name, group in output_groups:
+                html = group.to_html()
+                text_file.write(html_msgs_for_data_cols[i])
+                text_file.write(html)
+
+                if plot is True:
+                    import pandas as pd
+                    import plotly.express as px
+
+                    df = group[raw_data_cols]
+                    df_sorted = df.sort_values(by=info_cols[y_axis_index])
+                    # Melt DataFrame for plotting
+                    df_melted = df_sorted.melt(
+                        id_vars=info_cols[y_axis_index],
+                        var_name="Configuration",
+                        value_name=data_cols_to_compare[i],
+                    )
+                    title = data_cols_to_compare[i] + " vs " + info_cols[y_axis_index]
+                    # Create Plotly line chart
+                    fig = px.line(
+                        df_melted,
+                        x=info_cols[y_axis_index],
+                        y=data_cols_to_compare[i],
+                        color="Configuration",
+                        title=title,
+                        markers=True,
+                    )
+                    # Export to HTML
+                    text_file.write(fig.to_html(full_html=True, include_plotlyjs="cdn"))
--- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
+++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
@ -1,17 +1,19 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

+import argparse
 import json
 import os
+import re
+import shlex
 from importlib import util
 from pathlib import Path
+from typing import Any

 import pandas as pd
 import psutil
 from tabulate import tabulate

-results_folder = Path("results/")
-
 # latency results and the keys that will be printed into markdown
 latency_results = []
 latency_column_mapping = {
@ -42,14 +44,22 @@ throughput_results_column_mapping = {
 serving_results = []
 serving_column_mapping = {
    "test_name": "Test name",
+    "model_id": "Model",
+    "dataset_name": "Dataset Name",
+    "input_len": "Input Len",
+    "output_len": "Output Len",
+    "tp_size": "TP Size",
+    "pp_size": "PP Size",
+    "dtype": "dtype",
    "gpu_type": "GPU",
    "completed": "# of req.",
+    "qps": "qps",
    "max_concurrency": "# of max concurrency.",
    "request_throughput": "Tput (req/s)",
    "total_token_throughput": "Total Token Tput (tok/s)",
    "output_throughput": "Output Tput (tok/s)",
-    "total_input_tokens": "Total input tokens",
-    "total_output_tokens": "Total output tokens",
+    # "total_input_tokens": "Total input tokens",
+    # "total_output_tokens": "Total output tokens",
    "mean_ttft_ms": "Mean TTFT (ms)",
    "median_ttft_ms": "Median TTFT (ms)",
    "p99_ttft_ms": "P99 TTFT (ms)",
@ -94,7 +104,104 @@ def get_size_with_unit(bytes, suffix="B"):
        bytes /= factor


+def _coerce(val: str) -> Any:
+    """Best-effort type coercion from string to Python types."""
+    low = val.lower()
+    if low == "null":
+        return None
+    if low == "true":
+        return True
+    if low == "false":
+        return False
+    # integers
+    if re.fullmatch(r"[+-]?\d+", val):
+        try:
+            return int(val)
+        except ValueError:
+            pass
+    # floats (keep 'inf'/'-inf'/'nan' as strings)
+    if re.fullmatch(r"[+-]?\d*\.\d+", val):
+        try:
+            return float(val)
+        except ValueError:
+            pass
+    return val
+
+
+def parse_client_command(cmd: str) -> dict[str, Any]:
+    """Parse the client_command shell string into {executable, script, args}."""
+    toks = shlex.split(cmd)
+    if len(toks) < 2:
+        raise ValueError("client_command must include an executable and a script")
+    executable, script = toks[0], toks[1]
+    args: dict[str, Any] = {}
+
+    i = 2
+    while i < len(toks):
+        t = toks[i]
+        if t.startswith("--"):
+            # --key=value or --key (value) or boolean flag
+            if "=" in t:
+                key, val = t.split("=", 1)
+                if key == "--metadata":
+                    md = {}
+                    if val:
+                        if "=" in val:
+                            k, v = val.split("=", 1)
+                            md[k] = _coerce(v)
+                        else:
+                            md[val] = True
+                    args[key] = md
+                else:
+                    args[key] = _coerce(val)
+                i += 1
+                continue
+
+            key = t
+
+            # Special: consume metadata k=v pairs until next --flag
+            if key == "--metadata":
+                i += 1
+                md = {}
+                while i < len(toks) and not toks[i].startswith("--"):
+                    pair = toks[i]
+                    if "=" in pair:
+                        k, v = pair.split("=", 1)
+                        md[k] = _coerce(v)
+                    else:
+                        md[pair] = True
+                    i += 1
+                args[key] = md
+                continue
+
+            # Standard: check if next token is a value (not a flag)
+            if i + 1 < len(toks) and not toks[i + 1].startswith("--"):
+                args[key] = _coerce(toks[i + 1])
+                i += 2
+            else:
+                # lone flag -> True
+                args[key] = True
+                i += 1
+        else:
+            # unexpected positional; skip
+            i += 1
+
+    return {"executable": executable, "script": script, "args": args}
+
+
 if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "-r",
+        "--result",
+        type=str,
+        default="results",
+        help="Folder name for benchmark output results.",
+    )
+    args = parser.parse_args()
+    results_folder = Path(args.result)
+    if not results_folder.exists():
+        raise FileNotFoundError(f"results folder does not exist: {results_folder}")
    # collect results
    for test_file in results_folder.glob("*.json"):
        with open(test_file) as f:
@ -102,7 +209,6 @@ if __name__ == "__main__":

        if "serving" in str(test_file):
            # this result is generated via `vllm bench serve` command
-
            # attach the benchmarking command to raw_result
            try:
                with open(test_file.with_suffix(".commands")) as f:
@ -110,12 +216,44 @@ if __name__ == "__main__":
            except OSError as e:
                print(e)
                continue
+            # Parse Server Command Arg
+            out: dict[str, Any] = {
+                "server_command": parse_client_command(command["server_command"])
+            }
+            parse_args = [
+                "--tensor-parallel-size",
+                "--pipeline-parallel-size",
+                "--dtype",
+            ]
+            col_mapping = ["tp_size", "pp_size", "dtype"]
+            for index, arg in enumerate(parse_args):
+                if arg in out["server_command"]["args"]:
+                    raw_result.update(
+                        {col_mapping[index]: out["server_command"]["args"][arg]}
+                    )

+            # Parse Client Command Arg
+            out: dict[str, Any] = {
+                "client_command": parse_client_command(command["client_command"])
+            }
+            parse_args = [
+                "--dataset-name",
+                "--random-input-len",
+                "--random-output-len",
+                "--request-rate",
+            ]
+            col_mapping = ["dataset_name", "input_len", "output_len", "qps"]
+
+            for index, arg in enumerate(parse_args):
+                if arg in out["client_command"]["args"]:
+                    raw_result.update(
+                        {col_mapping[index]: out["client_command"]["args"][arg]}
+                    )
+            # Add Server, Client command
            raw_result.update(command)

            # update the test name of this result
            raw_result.update({"test_name": test_file.stem})
-
            # add the result to raw_result
            serving_results.append(raw_result)
            continue
@ -205,7 +343,10 @@ if __name__ == "__main__":
            columns=latency_column_mapping
        )
    if not serving_results.empty:
-        serving_results = serving_results[list(serving_column_mapping.keys())].rename(
+        valid_columns = [
+            col for col in serving_column_mapping if col in serving_results.columns
+        ]
+        serving_results = serving_results[valid_columns].rename(
            columns=serving_column_mapping
        )
    if not throughput_results.empty:
@ -245,7 +386,9 @@ if __name__ == "__main__":
    )

    # document the result
-    with open(results_folder / "benchmark_results.md", "w") as f:
+    md_file = "benchmark_results.md"
+    json_file = "benchmark_results.json"
+    with open(results_folder / md_file, "w") as f:
        results = read_markdown(
            "../.buildkite/nightly-benchmarks/"
            + "performance-benchmarks-descriptions.md"
@ -260,7 +403,7 @@ if __name__ == "__main__":
        f.write(results)

    # document benchmarking results in json
-    with open(results_folder / "benchmark_results.json", "w") as f:
+    with open(results_folder / json_file, "w") as f:
        results = (
            latency_results.to_dict(orient="records")
            + throughput_results.to_dict(orient="records")
--- a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
@ -194,9 +194,11 @@ run_latency_tests() {

    # check if there is enough GPU to run the test
    tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size')
-    if [ "$ON_CPU" == "1" ];then
-      if [[ $numa_count -lt $tp ]]; then
-        echo "Required tensor-parallel-size $tp but only $numa_count NUMA nodes found. Skip testcase $test_name."
+    if [ "$ON_CPU" == "1" ]; then
+      pp=$(echo "$latency_params" | jq -r '.pipeline_parallel_size')
+      world_size=$(($tp*$pp))
+      if [[ $numa_count -lt $world_size  && -z "${REMOTE_HOST}" ]]; then
+        echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
        continue
      fi
    else
@ -261,9 +263,11 @@ run_throughput_tests() {

    # check if there is enough GPU to run the test
    tp=$(echo "$throughput_params" | jq -r '.tensor_parallel_size')
-    if [ "$ON_CPU" == "1" ];then
-      if [[ $numa_count -lt $tp ]]; then
-        echo "Required tensor-parallel-size $tp but only $numa_count NUMA nodes found. Skip testcase $test_name."
+    if [ "$ON_CPU" == "1" ]; then
+      pp=$(echo "$throughput_params" | jq -r '.pipeline_parallel_size')
+      world_size=$(($tp*$pp))
+      if [[ $numa_count -lt $world_size  && -z "${REMOTE_HOST}" ]]; then
+        echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
        continue
      fi
    else
@ -329,12 +333,21 @@ run_serving_tests() {
    qps_list=$(echo "$params" | jq -r '.qps_list')
    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
    echo "Running over qps list $qps_list"
+    max_concurrency_list=$(echo "$params" | jq -r '.max_concurrency_list')
+    if [[ -z "$max_concurrency_list" || "$max_concurrency_list" == "null" ]]; then
+        num_prompts=$(echo "$client_params" | jq -r '.num_prompts')
+        max_concurrency_list="[$num_prompts]"
+    fi
+    max_concurrency_list=$(echo "$max_concurrency_list" | jq -r '.[] | @sh')
+    echo "Running over max concurrency list $max_concurrency_list"

    # check if there is enough resources to run the test
    tp=$(echo "$server_params" | jq -r '.tensor_parallel_size')
-    if [ "$ON_CPU" == "1" ];then
-      if [[ $numa_count -lt $tp ]]; then
-        echo "Required tensor-parallel-size $tp but only $numa_count NUMA nodes found. Skip testcase $test_name."
+    if [ "$ON_CPU" == "1" ]; then
+      pp=$(echo "$server_params" | jq -r '.pipeline_parallel_size')
+      world_size=$(($tp*$pp))
+      if [[ $numa_count -lt $world_size  && -z "${REMOTE_HOST}" ]]; then
+        echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
        continue
      fi
    else
@ -390,35 +403,39 @@ run_serving_tests() {
        echo "now qps is $qps"
      fi

-      new_test_name=$test_name"_qps_"$qps
+      # iterate over different max_concurrency
+      for max_concurrency in $max_concurrency_list; do
+        new_test_name=$test_name"_qps_"$qps"_concurrency_"$max_concurrency
+        echo " new test name $new_test_name"
+        # pass the tensor parallel size to the client so that it can be displayed
+        # on the benchmark dashboard
+        client_command="vllm bench serve \
+          --save-result \
+          --result-dir $RESULTS_FOLDER \
+          --result-filename ${new_test_name}.json \
+          --request-rate $qps \
+          --max-concurrency $max_concurrency \
+          --metadata "tensor_parallel_size=$tp" \
+          $client_args $client_remote_args "

-      # pass the tensor parallel size to the client so that it can be displayed
-      # on the benchmark dashboard
-      client_command="vllm bench serve \
-        --save-result \
-        --result-dir $RESULTS_FOLDER \
-        --result-filename ${new_test_name}.json \
-        --request-rate $qps \
-        --metadata "tensor_parallel_size=$tp" \
-        $client_args $client_remote_args "
+        echo "Running test case $test_name with qps $qps"
+        echo "Client command: $client_command"

-      echo "Running test case $test_name with qps $qps"
-      echo "Client command: $client_command"
+        bash -c "$client_command"

-      bash -c "$client_command"
-
-      # record the benchmarking commands
-      jq_output=$(jq -n \
-        --arg server "$server_command" \
-        --arg client "$client_command" \
-        --arg gpu "$gpu_type" \
-        '{
-          server_command: $server,
-          client_command: $client,
-          gpu_type: $gpu
-        }')
-      echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
+        # record the benchmarking commands
+        jq_output=$(jq -n \
+          --arg server "$server_command" \
+          --arg client "$client_command" \
+          --arg gpu "$gpu_type" \
+          '{
+            server_command: $server,
+            client_command: $client,
+            gpu_type: $gpu
+          }')
+        echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"

+      done
    done

    # clean up
--- a/.buildkite/nightly-benchmarks/tests/genai-perf-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/genai-perf-tests.json
@ -12,7 +12,6 @@
        "vllm_server_parameters": {
            "disable_log_stats": "",
            "gpu_memory_utilization": 0.9,
-            "num_scheduler_steps": 10,
            "max_num_seqs": 512,
            "dtype": "bfloat16"
        },
--- a/.buildkite/nightly-benchmarks/tests/latency-tests-cpu.json
+++ b/.buildkite/nightly-benchmarks/tests/latency-tests-cpu.json
@ -6,7 +6,7 @@
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 1,
            "load_format": "dummy",
            "num_iters_warmup": 5,
@ -20,7 +20,7 @@
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 4,
            "load_format": "dummy",
            "num_iters_warmup": 5,
--- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
@ -36,7 +36,6 @@
        "vllm_server_parameters": {
            "disable_log_stats": "",
            "gpu_memory_utilization": 0.9,
-            "num_scheduler_steps": 10,
            "max_num_seqs": 512,
            "dtype": "bfloat16"
        },
@ -90,7 +89,6 @@
        "vllm_server_parameters": {
            "disable_log_stats": "",
            "gpu_memory_utilization": 0.9,
-            "num_scheduler_steps": 10,
            "max_num_seqs": 512,
            "dtype": "bfloat16"
        },
@ -144,7 +142,6 @@
        "vllm_server_parameters": {
            "disable_log_stats": "",
            "gpu_memory_utilization": 0.9,
-            "num_scheduler_steps": 10,
            "max_num_seqs": 512,
            "dtype": "bfloat16"
        },
@ -195,7 +192,6 @@
        "vllm_server_parameters": {
            "disable_log_stats": "",
            "gpu_memory_utilization": 0.9,
-            "num_scheduler_steps": 10,
            "max_num_seqs": 512,
            "dtype": "bfloat16"
        },
@ -248,7 +244,6 @@
        "vllm_server_parameters": {
            "disable_log_stats": "",
            "gpu_memory_utilization": 0.9,
-            "num_scheduler_steps": 10,
            "max_num_seqs": 512,
            "dtype": "bfloat16"
        },
@ -301,7 +296,6 @@
        "vllm_server_parameters": {
            "disable_log_stats": "",
            "gpu_memory_utilization": 0.9,
-            "num_scheduler_steps": 10,
            "max_num_seqs": 512,
            "dtype": "bfloat16"
        },
--- a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json
+++ b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json
@ -1,7 +1,8 @@
 [
    {
        "test_name": "serving_llama8B_tp1_sharegpt",
-        "qps_list": [1, 4, 16, "inf"],
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@ -10,7 +11,7 @@
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 1,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
@ -23,17 +24,17 @@
            "load_format": "dummy"
        },
        "client_parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-	    "max_concurrency": 60,
            "num_prompts": 200
        }
    },
    {
        "test_name": "serving_llama8B_tp2_sharegpt",
-        "qps_list": [1, 4, 16, "inf"],
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@ -42,7 +43,7 @@
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 2,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
@ -55,17 +56,17 @@
            "load_format": "dummy"
        },
        "client_parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-	    "max_concurrency": 60,
            "num_prompts": 200
        }
    },
    {
        "test_name": "serving_llama8B_tp4_sharegpt",
-        "qps_list": [1, 4, 16, "inf"],
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@ -74,7 +75,7 @@
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 4,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
@ -87,17 +88,17 @@
            "load_format": "dummy"
        },
        "client_parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-	    "max_concurrency": 60,
            "num_prompts": 200
        }
    },
    {
        "test_name": "serving_llama8B_tp1_random_128_128",
-        "qps_list": [1, 4, 16, "inf"],
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@ -106,7 +107,7 @@
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 1,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
@ -120,19 +121,19 @@
            "load_format": "dummy"
        },
        "client_parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "random",
 	    "random-input-len": 128,
 	    "random-output-len": 128,
 	    "ignore-eos": "",
-	    "max_concurrency": 1000,
            "num_prompts": 1000
        }
    },
    {
        "test_name": "serving_llama8B_tp2_random_128_128",
-        "qps_list": [1, 4, 16, "inf"],
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@ -141,7 +142,7 @@
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 2,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
@ -155,19 +156,19 @@
            "load_format": "dummy"
        },
        "client_parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "random",
 	    "random-input-len": 128,
 	    "random-output-len": 128,
 	    "ignore-eos": "",
-	    "max_concurrency": 1000,
            "num_prompts": 1000
        }
    },
    {
        "test_name": "serving_llama8B_tp4_random_128_128",
-        "qps_list": [1, 4, 16, "inf"],
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@ -176,7 +177,7 @@
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 4,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
@ -190,13 +191,11 @@
            "load_format": "dummy"
        },
        "client_parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "random",
 	    "random-input-len": 128,
 	    "random-output-len": 128,
-	    "ignore-eos": "",
-	    "max_concurrency": 1000,
            "num_prompts": 1000
        }
    }
--- a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json
+++ b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json
@ -1,7 +1,8 @@
 [
    {
        "test_name": "serving_llama8B_pp1_sharegpt",
-        "qps_list": [1, 4, 16, "inf"],
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@ -10,7 +11,7 @@
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "pipeline_parallel_size": 1,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
@ -23,17 +24,17 @@
            "load_format": "dummy"
        },
        "client_parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-	    "max_concurrency": 60,
            "num_prompts": 200
        }
    },
    {
        "test_name": "serving_llama8B_pp3_sharegpt",
-        "qps_list": [1, 4, 16, "inf"],
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@ -42,7 +43,7 @@
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "pipeline_parallel_size": 3,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
@ -55,17 +56,17 @@
            "load_format": "dummy"
        },
        "client_parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-	    "max_concurrency": 60,
            "num_prompts": 200
        }
    },
    {
-        "test_name": "serving_llama8B_tp2pp6_sharegpt",
-        "qps_list": [1, 4, 16, "inf"],
+        "test_name": "serving_llama8B_tp2pp3_sharegpt",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@ -74,7 +75,7 @@
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 2,
            "pipeline_parallel_size": 3,
 	    "dtype": "bfloat16",
@ -88,17 +89,17 @@
            "load_format": "dummy"
        },
        "client_parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-	    "max_concurrency": 60,
            "num_prompts": 200
        }
    },
    {
        "test_name": "serving_llama8B_pp1_random_128_128",
-        "qps_list": [1, 4, 16, "inf"],
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@ -107,7 +108,7 @@
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "pipeline_parallel_size": 1,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
@ -121,28 +122,28 @@
            "load_format": "dummy"
        },
        "client_parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "random",
 	    "random-input-len": 128,
 	    "random-output-len": 128,
 	    "ignore-eos": "",
-	    "max_concurrency": 1000,
            "num_prompts": 1000
        }
    },
    {
        "test_name": "serving_llama8B_pp3_random_128_128",
-        "qps_list": [1, 4, 16, "inf"],
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-	    "VLLM_CPU_SGL_KERNEL:": 1,
+	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "pipeline_parallel_size": 3,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
@ -156,19 +157,19 @@
            "load_format": "dummy"
        },
        "client_parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "random",
 	    "random-input-len": 128,
 	    "random-output-len": 128,
 	    "ignore-eos": "",
-	    "max_concurrency": 1000,
            "num_prompts": 1000
        }
    },
    {
        "test_name": "serving_llama8B_tp2pp3_random_128_128",
-        "qps_list": [1, 4, 16, "inf"],
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@ -177,7 +178,7 @@
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 2,
            "pipeline_parallel_size": 3,
 	    "dtype": "bfloat16",
@ -192,13 +193,12 @@
            "load_format": "dummy"
        },
        "client_parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "random",
 	    "random-input-len": 128,
 	    "random-output-len": 128,
 	    "ignore-eos": "",
-	    "max_concurrency": 1000,
            "num_prompts": 1000
        }
    }
--- a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json
+++ b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json
@ -2,6 +2,7 @@
    {
        "test_name": "serving_llama8B_tp1_sharegpt",
        "qps_list": [1, 4, 16, "inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@ -10,7 +11,7 @@
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 1,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
@ -23,17 +24,17 @@
            "load_format": "dummy"
        },
        "client_parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-	    "max_concurrency": 60,
            "num_prompts": 200
        }
    },
    {
        "test_name": "serving_llama8B_tp2_sharegpt",
        "qps_list": [1, 4, 16, "inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@ -42,7 +43,7 @@
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 2,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
@ -55,17 +56,17 @@
            "load_format": "dummy"
        },
        "client_parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-	    "max_concurrency": 60,
            "num_prompts": 200
        }
    },
    {
        "test_name": "serving_llama8B_tp4_sharegpt",
        "qps_list": [1, 4, 16, "inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@ -74,7 +75,7 @@
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 4,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
@ -87,17 +88,17 @@
            "load_format": "dummy"
        },
        "client_parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-	    "max_concurrency": 60,
            "num_prompts": 200
        }
    },
    {
        "test_name": "serving_llama8B_tp4_random_1024_128",
        "qps_list": [1, 4, 16, "inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@ -106,7 +107,7 @@
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 4,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
@ -120,19 +121,19 @@
            "load_format": "dummy"
        },
        "client_parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "random",
 	    "random-input-len": 1024,
 	    "random-output-len": 128,
 	    "ignore-eos": "",
-	    "max_concurrency": 100,
            "num_prompts": 100
        }
    },
    {
        "test_name": "serving_llama8B_pp6_random_1024_128",
        "qps_list": [1, 4, 16, "inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@ -141,7 +142,7 @@
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "pipeline_parallel_size": 6,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
@ -155,13 +156,12 @@
            "load_format": "dummy"
        },
        "client_parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "random",
 	    "random-input-len": 1024,
 	    "random-output-len": 128,
 	    "ignore-eos": "",
-	    "max_concurrency": 100,
            "num_prompts": 100
        }
    }
--- a/.buildkite/nightly-benchmarks/tests/throughput-tests-cpu.json
+++ b/.buildkite/nightly-benchmarks/tests/throughput-tests-cpu.json
@ -6,7 +6,7 @@
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 1,
            "load_format": "dummy",
            "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
@ -21,7 +21,7 @@
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "parameters": {
-            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 4,
            "load_format": "dummy",
            "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
@ -128,7 +128,7 @@ run_and_track_test() {

 # --- Actual Test Execution ---
 run_and_track_test 1 "test_struct_output_generate.py" \
-    "HF_HUB_DISABLE_XET=1 python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
+    "python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
 run_and_track_test 2 "test_moe_pallas.py" \
    "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
 run_and_track_test 3 "test_lora.py" \
--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
@ -134,7 +134,7 @@ run_and_track_test 1 "test_compilation.py" \
 run_and_track_test 2 "test_basic.py" \
    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_basic.py"
 run_and_track_test 3 "test_accuracy.py::test_lm_eval_accuracy_v1_engine" \
-    "HF_HUB_DISABLE_XET=1 python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine"
+    "python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine"
 run_and_track_test 4 "test_quantization_accuracy.py" \
    "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py"
 run_and_track_test 5 "examples/offline_inference/tpu.py" \
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@ -31,16 +31,6 @@
 steps:
 ##### fast check tests  #####

- label: Documentation Build # 2min
-  mirror_hardwares: [amdexperimental]
-  working_dir: "/vllm-workspace/test_docs"
-  fast_check: true
-  no_gpu: True
-  commands:
-  - pip install -r ../requirements/docs.txt
-  # TODO: add `--strict` once warnings in docstrings are fixed
-  - mkdocs build
-
 - label: Pytorch Nightly Dependency Override Check # 2min
  # if this test fails, it means the nightly torch version is not compatible with some
  # of the dependencies. Please check the error message and add the package to whitelist
@ -67,7 +57,6 @@ steps:
  - python3 standalone_tests/lazy_imports.py
  - pytest -v -s mq_llm_engine # MQLLMEngine
  - pytest -v -s async_engine # AsyncLLMEngine
-  - NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py
  - pytest -v -s test_inputs.py
  - pytest -v -s test_outputs.py
  - pytest -v -s multimodal
@ -410,6 +399,7 @@ steps:
 - label: Kernels MoE Test %N
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
+  - csrc/quantization/cutlass_w8a8/moe/
  - csrc/moe/
  - tests/kernels/moe
  - vllm/model_executor/layers/fused_moe/
@ -670,6 +660,7 @@ steps:
    - pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
    - pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
    - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
+    - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
    - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
    # Fusion
    - pytest -v -s tests/compile/test_fusion_all_reduce.py
@ -773,27 +764,6 @@ steps:
  - pytest -v -s models/test_oot_registration.py # it needs a clean process
  - pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins

- label: Multi-step Tests (4 GPUs) # 36min
-  mirror_hardwares: [amdexperimental]
-  working_dir: "/vllm-workspace/tests"
-  num_gpus: 4
-  source_file_dependencies:
-  - vllm/model_executor/layers/sampler.py
-  - vllm/sequence.py
-  - vllm/worker/worker_base.py
-  - vllm/worker/worker.py
-  - vllm/worker/multi_step_worker.py
-  - vllm/worker/model_runner_base.py
-  - vllm/worker/model_runner.py
-  - vllm/worker/multi_step_model_runner.py
-  - vllm/engine
-  - tests/multi_step
-  commands:
-  # this test is quite flaky
-  # TODO: investigate and fix.
-  # - pytest -v -s multi_step/test_correctness_async_llm.py
-  - pytest -v -s multi_step/test_correctness_llm.py
-
 - label: Pipeline Parallelism Test # 45min
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@ -9,7 +9,7 @@
 /vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
-/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth
+/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256
 /vllm/multimodal @DarkLight1337 @ywang96
 /vllm/vllm_flash_attn @LucasWilkinson
 /vllm/lora @jeejeelee
@ -20,7 +20,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson

 # Any change to the VllmConfig changes can have a large user-facing impact,
 # so spam a lot of people
-/vllm/config @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor
+/vllm/config @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg

 # vLLM V1
 /vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
@ -34,16 +34,15 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 /tests/distributed/test_pipeline_parallel.py @youkaichao
 /tests/distributed/test_same_node.py @youkaichao
 /tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm
-/tests/kernels @tlrmchlsmth @WoosukKwon
+/tests/kernels @tlrmchlsmth @WoosukKwon @yewentao256
 /tests/models @DarkLight1337 @ywang96
-/tests/multi_step @alexm-redhat @comaniac
 /tests/multimodal @DarkLight1337 @ywang96
 /tests/prefix_caching @comaniac @KuntaiDu
-/tests/quantization @mgoin @robertgshaw2-redhat
+/tests/quantization @mgoin @robertgshaw2-redhat @yewentao256
 /tests/test_inputs.py @DarkLight1337 @ywang96
 /tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
 /tests/v1/structured_output @mgoin @russellb @aarnphm
-/tests/weight_loading @mgoin @youkaichao
+/tests/weight_loading @mgoin @youkaichao @yewentao256
 /tests/lora @jeejeelee

 # Docs
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@ -1,11 +1,5 @@
-# Essential Elements of an Effective PR Description Checklist
-
- [ ] The purpose of the PR, such as "Fix some issue (link existing issues this PR will resolve)".
- [ ] The test plan, such as providing test command.
- [ ] The test results, such as pasting the results comparison before and after, or e2e results
- [ ] (Optional) The necessary documentation update, such as updating `supported_models.md` and `examples` for a new model.
-
-PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS ABOVE HAVE BEEN CONSIDERED.
+<!-- markdownlint-disable -->
+PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS (AT THE BOTTOM) HAVE BEEN CONSIDERED.

 ## Purpose

@ -15,4 +9,14 @@ PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS ABOVE HAVE B

 ## (Optional) Documentation Update

+---
+<details>
+<summary> Essential Elements of an Effective PR Description Checklist </summary>
+
+- [ ] The purpose of the PR, such as "Fix some issue (link existing issues this PR will resolve)".
+- [ ] The test plan, such as providing test command.
+- [ ] The test results, such as pasting the results comparison before and after, or e2e results
+- [ ] (Optional) The necessary documentation update, such as updating `supported_models.md` and `examples` for a new model.
+</details>
+
 **BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing>** (anything written below this line will be removed by GitHub Actions)
--- a/.github/scripts/cleanup_pr_body.sh
+++ b/.github/scripts/cleanup_pr_body.sh
@ -15,11 +15,11 @@ NEW=/tmp/new_pr_body.txt
 gh pr view --json body --template "{{.body}}" "${PR_NUMBER}" > "${OLD}"
 cp "${OLD}" "${NEW}"

-# Remove "FIX #xxxx (*link existing issues this PR will resolve*)"
-sed -i '/FIX #xxxx.*$/d' "${NEW}"
+# Remove markdown comments (like the <!-- markdownlint-disable --> at the start)
+sed -i '/<!--.*-->$/d' "${NEW}"

-# Remove "FILL IN THE PR DESCRIPTION HERE"
-sed -i '/FILL IN THE PR DESCRIPTION HERE/d' "${NEW}"
+# Remove "PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS (AT THE BOTTOM) HAVE BEEN CONSIDERED."
+sed -i '/PLEASE FILL IN THE PR DESCRIPTION HERE.*$/d' "${NEW}"

 # Remove all lines after and including "**BEFORE SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE**"
 sed -i '/\*\*BEFORE SUBMITTING, PLEASE READ.*\*\*/,$d' "${NEW}"
--- a/.gitignore
+++ b/.gitignore
@ -150,7 +150,8 @@ venv.bak/
 # mkdocs documentation
 /site
 docs/argparse
-docs/examples
+docs/examples/*
+!docs/examples/README.md

 # mypy
 .mypy_cache/
@ -206,3 +207,6 @@ shellcheck*/

 # Ignore moe/marlin_moe gen code
 csrc/moe/marlin_moe_wna16/kernel_*
+
+# Ignore ep_kernels_workspace folder
+ep_kernels_workspace/
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -249,7 +249,6 @@ set(VLLM_EXT_SRC
  "csrc/quantization/gguf/gguf_kernel.cu"
  "csrc/quantization/activation_kernels.cu"
  "csrc/cuda_utils_kernels.cu"
-  "csrc/prepare_inputs/advance_step.cu"
  "csrc/custom_all_reduce.cu"
  "csrc/torch_bindings.cpp")

@ -351,6 +350,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    set_gencode_flags_for_srcs(
      SRCS "${MARLIN_TEMPLATE_KERNEL_SRC}"
      CUDA_ARCHS "${MARLIN_ARCHS}")
+    if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
+      set_source_files_properties(${MARLIN_TEMPLATE_KERNEL_SRC}
+        PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
+    endif()

    list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC})

@ -364,7 +367,12 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    set_gencode_flags_for_srcs(
      SRCS "${MARLIN_SRCS}"
      CUDA_ARCHS "${MARLIN_ARCHS}")
+    if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
+      set_source_files_properties("csrc/quantization/gptq_marlin/gptq_marlin.cu"
+        PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
+    endif()
    list(APPEND VLLM_EXT_SRC "${MARLIN_SRCS}")
+
    message(STATUS "Building Marlin kernels for archs: ${MARLIN_ARCHS}")
  else()
    message(STATUS "Not building Marlin kernels as no compatible archs found"
@ -854,6 +862,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    set_gencode_flags_for_srcs(
      SRCS "${MOE_WNAA16_MARLIN_SRC}"
      CUDA_ARCHS "${MARLIN_MOE_ARCHS}")
+    if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
+      set_source_files_properties(${MOE_WNAA16_MARLIN_SRC}
+        PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
+    endif()

    list(APPEND VLLM_MOE_EXT_SRC ${MOE_WNAA16_MARLIN_SRC})

--- a/README.md
+++ b/README.md
@ -162,7 +162,7 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs
 ## Contact Us

 <!-- --8<-- [start:contact-us] -->
- For technical questions and feature requests, please use GitHub [Issues](https://github.com/vllm-project/vllm/issues) or [Discussions](https://github.com/vllm-project/vllm/discussions)
+- For technical questions and feature requests, please use GitHub [Issues](https://github.com/vllm-project/vllm/issues)
 - For discussing with fellow users, please use the [vLLM Forum](https://discuss.vllm.ai)
 - For coordinating contributions and development, please use [Slack](https://slack.vllm.ai)
 - For security disclosures, please use GitHub's [Security Advisories](https://github.com/vllm-project/vllm/security/advisories) feature
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@ -22,6 +22,17 @@ become available.
      <td style="text-align: center;">✅</td>
      <td><code>wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json</code></td>
    </tr>
+    <tr>
+      <td><strong>ShareGPT4V (Image)</strong></td>
+      <td style="text-align: center;">✅</td>
+      <td style="text-align: center;">✅</td>
+      <td>
+        <code>wget https://huggingface.co/datasets/Lin-Chen/ShareGPT4V/blob/main/sharegpt4v_instruct_gpt4-vision_cap100k.json</code>
+        <br>
+        <div>Note that the images need to be downloaded separately. For example, to download COCO's 2017 Train images:</div>
+        <code>wget http://images.cocodataset.org/zips/train2017.zip</code>
+      </td>
+    </tr>
    <tr>
      <td><strong>BurstGPT</strong></td>
      <td style="text-align: center;">✅</td>
@ -616,3 +627,41 @@ python3 benchmarks/benchmark_prioritization.py \
 ```

 </details>
+
+## 👁️ Example - Multi-Modal Benchmark
+
+<details>
+<summary>Show more</summary>
+
+<br/>
+
+Benchmark the performance of multi-modal requests in vLLM.
+
+### Images (ShareGPT4V)
+
+Start vLLM:
+
+```bash
+python -m vllm.entrypoints.openai.api_server \
+  --model Qwen/Qwen2.5-VL-7B-Instruct \
+  --dtype bfloat16 \
+  --limit-mm-per-prompt '{"image": 1}' \
+  --allowed-local-media-path /path/to/sharegpt4v/images
+```
+
+Send requests with images:
+
+```bash
+python benchmarks/benchmark_serving.py \
+  --backend openai-chat \
+  --model Qwen/Qwen2.5-VL-7B-Instruct \
+  --dataset-name sharegpt \
+  --dataset-path /path/to/ShareGPT4V/sharegpt4v_instruct_gpt4-vision_cap100k.json \
+  --num-prompts 100 \
+  --save-result \
+  --result-dir ~/vllm_benchmark_results \
+  --save-detailed \
+  --endpoint /v1/chat/completion
+```
+
+</details>
--- a/benchmarks/benchmark_block_pool.py
+++ b/benchmarks/benchmark_block_pool.py
@ -0,0 +1,74 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import gc
+
+from tabulate import tabulate
+
+from benchmark_utils import TimeCollector
+from vllm.utils import FlexibleArgumentParser
+from vllm.v1.core.block_pool import BlockPool
+
+
+def main(args):
+    rows = []
+    for allocate_block in args.allocate_blocks:
+        # Enforce a GC collect ahead to minimize the impact among runs
+        gc.collect()
+        block_pool = BlockPool(num_gpu_blocks=args.num_gpu_blocks, enable_caching=True)
+
+        get_blocks_times = TimeCollector(TimeCollector.US)
+        free_blocks_times = TimeCollector(TimeCollector.US)
+        for _ in range(args.num_iteration):
+            with get_blocks_times:
+                blocks = block_pool.get_new_blocks(allocate_block)
+            with free_blocks_times:
+                block_pool.free_blocks(blocks)
+
+        rows.append(
+            [get_blocks_times.cnt, args.num_gpu_blocks, allocate_block]
+            + get_blocks_times.dump_avg_max()
+            + free_blocks_times.dump_avg_max()
+        )
+
+    print(
+        tabulate(
+            rows,
+            headers=[
+                "Iterations",
+                "Total\nBlocks",
+                "Allocated\nBlocks",
+                "Get Blocks\nAvg (us)",
+                "Get Blocks\nMax (us)",
+                "Free Blocks\nAvg (us)",
+                "Free Blocks\nMax (us)",
+            ],
+            tablefmt="grid",
+            floatfmt=".3f",
+        )
+    )
+
+
+def invoke_main() -> None:
+    parser = FlexibleArgumentParser(
+        description="Benchmark the performance of BlockPool for KV Cache."
+    )
+    parser.add_argument("--num-gpu-blocks", type=int, default=100000)
+    parser.add_argument(
+        "--num-iteration",
+        type=int,
+        default=1000,
+        help="Number of iterations to run to stablize final data readings",
+    )
+    parser.add_argument(
+        "--allocate-blocks",
+        type=int,
+        nargs="*",
+        default=[10, 50, 100, 500, 1000],
+        help="Number of blocks to allocate",
+    )
+    args = parser.parse_args()
+    main(args)
+
+
+if __name__ == "__main__":
+    invoke_main()  # pragma: no cover
--- a/benchmarks/benchmark_dataset.py
+++ b/benchmarks/benchmark_dataset.py
@ -430,14 +430,20 @@ class ShareGPTDataset(BenchmarkDataset):
                skip_min_output_len_check=output_len is not None,
            ):
                continue
+            # TODO: Also support ShareGPT4Video.
+            if image_path := entry.get("image"):
+                mm_content = process_image(image_path)
+            else:
+                mm_content = None
            if enable_multimodal_chat:
-                prompt = self.apply_multimodal_chat_transformation(prompt, None)
+                prompt = self.apply_multimodal_chat_transformation(prompt, mm_content)
            samples.append(
                SampleRequest(
                    prompt=prompt,
                    prompt_len=prompt_len,
                    expected_output_len=new_output_len,
                    lora_request=lora_request,
+                    multi_modal_data=mm_content,
                )
            )
        self.maybe_oversample_requests(samples, num_requests)
--- a/benchmarks/benchmark_ngram_proposer.py
+++ b/benchmarks/benchmark_ngram_proposer.py
@ -0,0 +1,112 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import gc
+
+import numpy as np
+from tabulate import tabulate
+
+from benchmark_utils import TimeCollector
+from vllm.config import ModelConfig, SpeculativeConfig, VllmConfig
+from vllm.utils import FlexibleArgumentParser
+from vllm.v1.spec_decode.ngram_proposer import NgramProposer
+
+
+def main(args):
+    rows = []
+    for max_ngram in args.max_ngram:
+        collector = TimeCollector(TimeCollector.US)
+
+        model_config = ModelConfig(
+            model="facebook/opt-125m",
+            task="generate",
+            max_model_len=args.num_token + args.num_spec_token,
+            tokenizer="facebook/opt-125m",
+            tokenizer_mode="auto",
+            dtype="auto",
+            seed=None,
+            trust_remote_code=False,
+        )
+        proposer = NgramProposer(
+            vllm_config=VllmConfig(
+                model_config=model_config,
+                speculative_config=SpeculativeConfig(
+                    prompt_lookup_min=args.min_ngram,
+                    prompt_lookup_max=max_ngram,
+                    num_speculative_tokens=args.num_spec_token,
+                    method="ngram",
+                ),
+            )
+        )
+
+        # Warm up
+        proposer.propose(np.random.randint(0, 20, (args.num_token,)))
+
+        gc.collect()
+        for _ in range(args.num_iteration):
+            tokens = np.random.randint(0, 20, (args.num_req, args.num_token))
+            with collector:
+                for i in range(args.num_req):
+                    proposer.propose(tokens[i, :])
+        rows.append(
+            [args.num_req, args.num_token, args.min_ngram, max_ngram]
+            + collector.dump_avg_max()
+        )
+
+    print(
+        tabulate(
+            rows,
+            headers=[
+                "# Request",
+                "# Token",
+                "Min Ngram",
+                "Max Ngram",
+                "Avg (us)",
+                "Max (us)",
+            ],
+            tablefmt="grid",
+            floatfmt=".3f",
+        )
+    )
+
+
+def invoke_main() -> None:
+    parser = FlexibleArgumentParser(
+        description="Benchmark the performance of N-gram speculative decode drafting"
+    )
+    parser.add_argument(
+        "--num-iteration",
+        type=int,
+        default=100,
+        help="Number of iterations to run to stablize final data readings",
+    )
+    parser.add_argument(
+        "--num-req", type=int, default=128, help="Number of requests in the batch"
+    )
+    parser.add_argument(
+        "--num-token", type=int, default=1500, help="Number of tokens for each request"
+    )
+    parser.add_argument(
+        "--min-ngram",
+        type=int,
+        default=3,
+        help="Minimum n-gram to match",
+    )
+    parser.add_argument(
+        "--max-ngram",
+        type=int,
+        nargs="*",
+        default=[5, 7, 10, 15, 20],
+        help="Maximum n-gram to match",
+    )
+    parser.add_argument(
+        "--num-spec-token",
+        type=int,
+        default=3,
+        help="Number of speculative tokens to generate",
+    )
+    args = parser.parse_args()
+    main(args)
+
+
+if __name__ == "__main__":
+    invoke_main()  # pragma: no cover
--- a/benchmarks/benchmark_utils.py
+++ b/benchmarks/benchmark_utils.py
@ -1,11 +1,12 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
 import argparse
 import json
 import math
 import os
-from typing import Any
+import time
+from types import TracebackType
+from typing import Any, Optional, Union


 def convert_to_pytorch_benchmark_format(
@ -72,3 +73,53 @@ def write_to_json(filename: str, records: list) -> None:
            cls=InfEncoder,
            default=lambda o: f"<{type(o).__name__} object is not JSON serializable>",
        )
+
+
+# Collect time and generate time metrics
+#
+# Example Usage:
+#   collector = TimeCollector(TimeCollector.US)
+#   for _ in range(total_iteration):
+#      with collector:
+#          ...
+#   collector.dump_avg_max()
+class TimeCollector:
+    NS: int = 1
+    US: int = NS * 1000
+    MS: int = US * 1000
+    S: int = MS * 1000
+
+    def __init__(self, scale: int) -> None:
+        self.cnt: int = 0
+        self._sum: int = 0
+        self._max: Optional[int] = None
+        self.scale = scale
+        self.start_time: int = time.monotonic_ns()
+
+    def collect(self, v: int) -> None:
+        self.cnt += 1
+        self._sum += v
+        if self._max is None:
+            self._max = v
+        else:
+            self._max = max(self._max, v)
+
+    def avg(self) -> Union[float, str]:
+        return self._sum * 1.0 / self.cnt / self.scale if self.cnt > 0 else "N/A"
+
+    def max(self) -> Union[float, str]:
+        return self._max / self.scale if self._max else "N/A"
+
+    def dump_avg_max(self) -> list[Union[float, str]]:
+        return [self.avg(), self.max()]
+
+    def __enter__(self) -> None:
+        self.start_time = time.monotonic_ns()
+
+    def __exit__(
+        self,
+        exc_type: Optional[type[BaseException]],
+        exc_value: Optional[BaseException],
+        exc_traceback: Optional[TracebackType],
+    ) -> None:
+        self.collect(time.monotonic_ns() - self.start_time)
--- a/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py
+++ b/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py
@ -1,63 +1,199 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

+import argparse
+import asyncio
+import logging
 import os

 import aiohttp
-from quart import Quart, make_response, request
+from quart import Quart, Response, make_response, request
+from rate_limiter import RateLimiter
+from request_queue import RequestQueue

-AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)
-
-app = Quart(__name__)
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)


-async def forward_request(url, data):
-    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
+def parse_args():
+    """parse command line arguments"""
+    parser = argparse.ArgumentParser(description="vLLM P/D disaggregation proxy server")
+
+    # Add args
+    parser.add_argument(
+        "--timeout",
+        type=float,
+        default=300,
+        help="Timeout for backend service requests in seconds (default: 300)",
+    )
+    parser.add_argument(
+        "--max-concurrent",
+        type=int,
+        default=100,
+        help="Maximum concurrent requests to backend services (default: 100)",
+    )
+    parser.add_argument(
+        "--queue-size",
+        type=int,
+        default=500,
+        help="Maximum number of requests in the queue (default: 500)",
+    )
+    parser.add_argument(
+        "--rate-limit",
+        type=int,
+        default=40,
+        help="Maximum requests per second (default: 40)",
+    )
+    parser.add_argument(
+        "--port",
+        type=int,
+        default=8000,
+        help="Port to run the server on (default: 8000)",
+    )
+    parser.add_argument(
+        "--prefill-url",
+        type=str,
+        default="http://localhost:8100/v1/completions",
+        help="Prefill service endpoint URL",
+    )
+    parser.add_argument(
+        "--decode-url",
+        type=str,
+        default="http://localhost:8200/v1/completions",
+        help="Decode service endpoint URL",
+    )
+
+    return parser.parse_args()
+
+
+def main():
+    """parse command line arguments"""
+    args = parse_args()
+
+    # Initialize configuration using command line parameters
+    AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=args.timeout)
+    MAX_CONCURRENT_REQUESTS = args.max_concurrent
+    REQUEST_QUEUE_SIZE = args.queue_size
+    RATE_LIMIT = args.rate_limit
+    PREFILL_SERVICE_URL = args.prefill_url
+    DECODE_SERVICE_URL = args.decode_url
+    PORT = args.port
+
+    app = Quart(__name__)
+
+    # Initialize the rate limiter and request queue
+    rate_limiter = RateLimiter(RATE_LIMIT)
+    request_queue = RequestQueue(MAX_CONCURRENT_REQUESTS, REQUEST_QUEUE_SIZE)
+
+    # Attach the configuration object to the application instance
+    app.config.update(
+        {
+            "AIOHTTP_TIMEOUT": AIOHTTP_TIMEOUT,
+            "rate_limiter": rate_limiter,
+            "request_queue": request_queue,
+            "PREFILL_SERVICE_URL": PREFILL_SERVICE_URL,
+            "DECODE_SERVICE_URL": DECODE_SERVICE_URL,
+        }
+    )
+
+    # Start queue processing on app startup
+    @app.before_serving
+    async def startup():
+        """Start request processing task when app starts serving"""
+        asyncio.create_task(request_queue.process())
+
+    async def forward_request(url, data):
+        """Forward request to backend service with rate limiting and error handling"""
        headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
-        async with session.post(url=url, json=data, headers=headers) as response:
-            if response.status == 200:
-                # if response.headers.get('Transfer-Encoding') == 'chunked':
-                if True:
-                    async for chunk_bytes in response.content.iter_chunked(1024):
-                        yield chunk_bytes
-                else:
-                    content = await response.read()
-                    yield content

-
-@app.route("/v1/completions", methods=["POST"])
-async def handle_request():
-    try:
-        original_request_data = await request.get_json()
-
-        prefill_request = original_request_data.copy()
-        # change max_tokens = 1 to let it only do prefill
-        prefill_request["max_tokens"] = 1
-
-        # finish prefill
-        async for _ in forward_request(
-            "http://localhost:8100/v1/completions", prefill_request
+        # Use rate limiter as context manager
+        async with (
+            rate_limiter,
+            aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session,
        ):
-            continue
+            try:
+                async with session.post(
+                    url=url, json=data, headers=headers
+                ) as response:
+                    if response.status == 200:
+                        # Stream response chunks
+                        async for chunk_bytes in response.content.iter_chunked(1024):
+                            yield chunk_bytes
+                    else:
+                        # Handle backend service errors
+                        error_text = await response.text()
+                        logger.error(
+                            "Backend service error: %s - %s",
+                            response.status,
+                            error_text,
+                        )
+                        yield b'{"error": "Backend service error"}'
+            except aiohttp.ClientError as e:
+                # Handle connection errors
+                logger.error("Connection error to %s: %s", url, str(e))
+                yield b'{"error": "Service unavailable"}'
+            except asyncio.TimeoutError:
+                # Handle timeout errors
+                logger.error("Timeout connecting to %s", url)
+                yield b'{"error": "Service timeout"}'

-        # return decode
-        generator = forward_request(
-            "http://localhost:8200/v1/completions", original_request_data
-        )
-        response = await make_response(generator)
-        response.timeout = None
+    async def process_request():
+        """Process a single request through prefill and decode stages"""
+        try:
+            original_request_data = await request.get_json()

-        return response
+            # Create prefill request (max_tokens=1)
+            prefill_request = original_request_data.copy()
+            prefill_request["max_tokens"] = 1

-    except Exception as e:
-        import sys
-        import traceback
+            # Execute prefill stage
+            async for _ in forward_request(PREFILL_SERVICE_URL, prefill_request):
+                continue

-        exc_info = sys.exc_info()
-        print("Error occurred in disagg prefill proxy server")
-        print(e)
-        print("".join(traceback.format_exception(*exc_info)))
+            # Execute decode stage and stream response
+            generator = forward_request(DECODE_SERVICE_URL, original_request_data)
+            response = await make_response(generator)
+            response.timeout = None  # Disable timeout for streaming response
+            return response
+
+        except Exception:
+            logger.exception("Error processing request")
+            return Response(
+                response=b'{"error": "Internal server error"}',
+                status=500,
+                content_type="application/json",
+            )
+
+    @app.route("/v1/completions", methods=["POST"])
+    async def handle_request():
+        """Handle incoming API requests with concurrency and rate limiting"""
+        # Create task for request processing
+        task = asyncio.create_task(process_request())
+
+        # Enqueue request or reject if queue is full
+        if not await request_queue.enqueue(task):
+            return Response(
+                response=b'{"error": "Server busy, try again later"}',
+                status=503,
+                content_type="application/json",
+            )
+
+        try:
+            # Return the response from the processing task
+            return await task
+        except asyncio.CancelledError:
+            # Handle task cancellation (timeout or queue full)
+            logger.warning("Request cancelled due to timeout or queue full")
+            return Response(
+                response=b'{"error": "Request cancelled"}',
+                status=503,
+                content_type="application/json",
+            )
+
+    # Start the Quart server with host can be set to 0.0.0.0
+    app.run(port=PORT)


 if __name__ == "__main__":
-    app.run(port=8000)
+    main()
--- a/benchmarks/disagg_benchmarks/rate_limiter.py
+++ b/benchmarks/disagg_benchmarks/rate_limiter.py
@ -0,0 +1,45 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import asyncio
+import time
+
+
+class RateLimiter:
+    """Token bucket rate limiter implementation"""
+
+    def __init__(self, rate_limit):
+        self.rate_limit = rate_limit  # Requests per second
+        self.num_available_tokens = rate_limit  # Available tokens
+        self.last_refill = time.monotonic()  # Last token refill time
+        self.lock = asyncio.Lock()  # Synchronization lock
+
+    async def acquire(self):
+        """Acquire a token from the rate limiter"""
+        while True:
+            async with self.lock:
+                current_time = time.monotonic()
+                elapsed = current_time - self.last_refill
+
+                # Refill num_available_tokens if more than 1 second has passed
+                if elapsed > 1.0:
+                    self.num_available_tokens = self.rate_limit
+                    self.last_refill = current_time
+
+                # Check if num_available_tokens are available
+                if self.num_available_tokens > 0:
+                    self.num_available_tokens -= 1
+                    return True
+
+                # Calculate wait time if no num_available_tokens available
+                wait_time = 1.0 - elapsed
+            await asyncio.sleep(wait_time)
+
+    async def __aenter__(self):
+        """Enter async context manager - acquire token"""
+        await self.acquire()
+        return self
+
+    async def __aexit__(self, exc_type, exc_value, traceback):
+        """Exit async context manager - no cleanup needed"""
+        pass
--- a/benchmarks/disagg_benchmarks/request_queue.py
+++ b/benchmarks/disagg_benchmarks/request_queue.py
@ -0,0 +1,39 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import asyncio
+from collections import deque
+
+
+class RequestQueue:
+    """Request queue manager with concurrency control"""
+
+    def __init__(self, max_concurrent, max_queue_size):
+        # Maximum concurrent requests
+        self.max_concurrent = max_concurrent
+        self.max_queue_size = max_queue_size  # Maximum queue size
+        # Concurrency control
+        self.semaphore = asyncio.Semaphore(max_concurrent)
+        self.queue = deque()  # Request queue
+        self.queue_size = 0  # Current queue size
+        self.lock = asyncio.Lock()  # Sync queue Lock
+
+    async def enqueue(self, task):
+        """Add a request task to the queue"""
+        async with self.lock:
+            if self.queue_size >= self.max_queue_size:
+                return False
+
+            self.queue.append(task)
+            self.queue_size += 1
+            return True
+
+    async def process(self):
+        """Process queued requests using semaphore for concurrency control"""
+        while True:
+            if self.queue:
+                async with self.semaphore, self.lock:
+                    task = self.queue.popleft()
+                    self.queue_size -= 1
+                    await task
+            await asyncio.sleep(0.01)  # Yield control to event loop
--- a/benchmarks/kernels/benchmark_machete.py
+++ b/benchmarks/kernels/benchmark_machete.py
@ -236,6 +236,7 @@ def marlin_create_bench_fn(bt: BenchmarkTensors) -> Callable:
            a=bt.a,
            c=None,
            b_q_weight=w_q,
+            b_bias=None,
            b_scales=w_s,
            global_scale=None,
            b_zeros=w_zp,
--- a/benchmarks/kv_cache/benchmark_block_pool.py
+++ b/benchmarks/kv_cache/benchmark_block_pool.py
@ -1,108 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import gc
-import time
-from typing import Optional
-
-from tabulate import tabulate
-
-from vllm.utils import FlexibleArgumentParser
-from vllm.v1.core.block_pool import BlockPool
-
-
-class Metric:
-    def __init__(self) -> None:
-        self.cnt: int = 0
-        self.sum_v: int = 0
-        self.max_v: Optional[int] = None
-
-    def update(self, v: int) -> None:
-        self.cnt += 1
-        self.sum_v += v
-        if self.max_v is None:
-            self.max_v = v
-        else:
-            self.max_v = max(self.max_v, v)
-
-    def avg_v(self) -> float:
-        return self.sum_v * 1.0 / self.cnt
-
-
-def main(args):
-    rows = []
-    for allocate_block in args.allocate_blocks:
-        # Enforce a GC collect ahead to minimize the impact among runs
-        gc.collect()
-        block_pool = BlockPool(num_gpu_blocks=args.num_gpu_blocks, enable_caching=True)
-
-        get_blocks_metric: Metric = Metric()
-        free_blocks_metric: Metric = Metric()
-        for _ in range(args.num_iteration):
-            t1 = time.monotonic_ns()
-            blocks = block_pool.get_new_blocks(allocate_block)
-            t2 = time.monotonic_ns()
-            block_pool.free_blocks(blocks)
-            t3 = time.monotonic_ns()
-            get_blocks_metric.update(t2 - t1)
-            free_blocks_metric.update(t3 - t2)
-
-        if get_blocks_metric.max_v is not None and free_blocks_metric.max_v is not None:
-            rows.append(
-                [
-                    get_blocks_metric.cnt,
-                    args.num_gpu_blocks,
-                    allocate_block,
-                    get_blocks_metric.avg_v() / 1000000,
-                    get_blocks_metric.max_v / 1000000.0,
-                    free_blocks_metric.avg_v() / 1000000,
-                    free_blocks_metric.max_v / 1000000.0,
-                ]
-            )
-        else:
-            print(
-                "No valid metrics found."
-                f" {get_blocks_metric.max_v=} {free_blocks_metric.max_v=}"
-            )
-
-    print(
-        tabulate(
-            rows,
-            headers=[
-                "Iterations",
-                "Total\nBlocks",
-                "Allocated\nBlocks",
-                "Get Blocks\nAvg (ms)",
-                "Get Blocks\nMax (ms)",
-                "Free Blocks\nAvg (ms)",
-                "Free Blocks\nMax (ms)",
-            ],
-            tablefmt="grid",
-            floatfmt=".6f",
-        )
-    )
-
-
-def invoke_main() -> None:
-    parser = FlexibleArgumentParser(
-        description="Benchmark the performance of BlockPool for KV Cache."
-    )
-    parser.add_argument("--num-gpu-blocks", type=int, default=100000)
-    parser.add_argument(
-        "--num-iteration",
-        type=int,
-        default=1000,
-        help="Number of iterations to run to stablize final data readings",
-    )
-    parser.add_argument(
-        "--allocate-blocks",
-        type=int,
-        nargs="*",
-        default=[10, 50, 100, 500, 1000],
-        help="Number of blocks to allocate",
-    )
-    args = parser.parse_args()
-    main(args)
-
-
-if __name__ == "__main__":
-    invoke_main()  # pragma: no cover
--- a/benchmarks/multi_turn/bench_utils.py
+++ b/benchmarks/multi_turn/bench_utils.py
@ -4,7 +4,7 @@ import logging
 from enum import Enum


-class Color(str, Enum):
+class Color(Enum):
    RED = "\033[91m"
    GREEN = "\033[92m"
    BLUE = "\033[94m"
@ -13,6 +13,9 @@ class Color(str, Enum):
    YELLOW = "\033[93m"
    RESET = "\033[0m"

+    def __str__(self):
+        return self.value
+

 TEXT_SEPARATOR = "-" * 100

--- a/csrc/core/scalar_type.hpp
+++ b/csrc/core/scalar_type.hpp
@ -321,6 +321,8 @@ static inline constexpr auto kFE3M2f =
    ScalarType::float_(3, 2, true, ScalarType::NAN_NONE);
 static inline constexpr auto kFE4M3fn =
    ScalarType::float_(4, 3, true, ScalarType::NAN_EXTD_RANGE_MAX_MIN);
+static inline constexpr auto kFE8M0fnu =
+    ScalarType(8, 0, false, 0, true, ScalarType::NAN_EXTD_RANGE_MAX_MIN);
 static inline constexpr auto kFE5M2 = ScalarType::float_IEEE754(5, 2);
 static inline constexpr auto kFE8M7 = ScalarType::float_IEEE754(8, 7);
 static inline constexpr auto kFE5M10 = ScalarType::float_IEEE754(5, 10);
--- a/csrc/moe/marlin_moe_wna16/generate_kernels.py
+++ b/csrc/moe/marlin_moe_wna16/generate_kernels.py
@ -20,6 +20,7 @@ namespace MARLIN_NAMESPACE_NAME {
 TEMPLATE = ("template __global__ void Marlin<"
            "{{scalar_t}}, "
            "{{w_type_id}}, "
+            "{{s_type_id}}, "
            "{{threads}}, "
            "{{thread_m_blocks}}, "
            "{{thread_n_blocks}}, "
@ -77,6 +78,7 @@ def generate_new_kernels():
            if scalar_type == "vllm::kFE4M3fn" and group_blocks not in [-1, 8]:
                continue
            # nvfp4 only supports group_size == 16
+            # mxfp4 only supports group_size == 32
            if scalar_type == "vllm::kFE2M1f" and group_blocks not in [1, 2]:
                continue
            # other quantization methods don't support group_size = 16
@ -89,9 +91,22 @@ def generate_new_kernels():

            c_dtype = "half" if dtype == "fp16" else "nv_bfloat16"

+            if scalar_type == "vllm::kFE2M1f" and group_blocks == 1:
+                s_type = "vllm::kFE4M3fn"
+            elif scalar_type == "vllm::kFE2M1f" and group_blocks == 2:
+                s_type = "vllm::kFE8M0fnu"
+                if dtype == "fp16":
+                    # we cannot safely dequantize e8m0 to fp16, so skip this
+                    continue
+            elif dtype == "fp16":
+                s_type = "vllm::kFloat16"
+            elif dtype == "bf16":
+                s_type = "vllm::kBFloat16"
+
            template_str = jinja2.Template(TEMPLATE).render(
                scalar_t=c_dtype,
                w_type_id=scalar_type + ".id()",
+                s_type_id=s_type + ".id()",
                threads=threads,
                thread_m_blocks=max(m_blocks, 1),
                thread_n_blocks=n_blocks,
--- a/csrc/moe/marlin_moe_wna16/kernel.h
+++ b/csrc/moe/marlin_moe_wna16/kernel.h
@ -7,23 +7,25 @@
 #include "quantization/gptq_marlin/marlin_dtypes.cuh"
 #include "core/scalar_type.hpp"

-#define MARLIN_KERNEL_PARAMS                                          \
-  const int4 *__restrict__ A, const int4 *__restrict__ B,             \
-      int4 *__restrict__ C, int4 *__restrict__ C_tmp,                 \
-      const int4 *__restrict__ scales_ptr,                            \
-      const uint16_t *__restrict__ scale2_ptr,                        \
-      const int4 *__restrict__ zp_ptr, const int *__restrict__ g_idx, \
-      const int32_t *__restrict__ sorted_token_ids_ptr,               \
-      const int32_t *__restrict__ expert_ids_ptr,                     \
-      const int32_t *__restrict__ num_tokens_past_padded_ptr,         \
-      const float *__restrict__ topk_weights_ptr, int top_k,          \
-      bool mul_topk_weights, bool is_ep, int num_groups, int prob_m,  \
-      int prob_n, int prob_k, int *locks, bool use_atomic_add,        \
+#define MARLIN_KERNEL_PARAMS                                                  \
+  const int4 *__restrict__ A, const int4 *__restrict__ B,                     \
+      int4 *__restrict__ C, int4 *__restrict__ C_tmp,                         \
+      const int4 *__restrict__ b_bias_ptr,                                    \
+      const int4 *__restrict__ scales_ptr,                                    \
+      const uint16_t *__restrict__ scale2_ptr,                                \
+      const int4 *__restrict__ zp_ptr, const int *__restrict__ g_idx,         \
+      const int32_t *__restrict__ sorted_token_ids_ptr,                       \
+      const int32_t *__restrict__ expert_ids_ptr,                             \
+      const int32_t *__restrict__ num_tokens_past_padded_ptr,                 \
+      const float *__restrict__ topk_weights_ptr, int top_k,                  \
+      bool mul_topk_weights, bool is_ep, int num_groups, int prob_m,          \
+      int prob_n, int prob_k, int *locks, bool has_bias, bool use_atomic_add, \
      bool use_fp32_reduce, int max_shared_mem

 namespace MARLIN_NAMESPACE_NAME {
 template <typename scalar_t,  // compute dtype, half or nv_float16
          const vllm::ScalarTypeId w_type_id,  // weight ScalarType id
+          const vllm::ScalarTypeId s_type_id,  // weight scale ScalarType id
          const int threads,          // number of threads in a threadblock
          const int thread_m_blocks,  // number of 16x16 blocks in the m
                                      // dimension (batchsize) of the
--- a/csrc/moe/marlin_moe_wna16/marlin_template.h
+++ b/csrc/moe/marlin_moe_wna16/marlin_template.h
@ -280,6 +280,7 @@ __device__ inline void wait_negative_and_add(int* lock) {

 template <typename scalar_t,  // compute dtype, half or nv_float16
          const vllm::ScalarTypeId w_type_id,  // weight ScalarType id
+          const vllm::ScalarTypeId s_type_id,  // weight scale ScalarType id
          const int threads,          // number of threads in a threadblock
          const int thread_m_blocks,  // number of 16x16 blocks in the m
                                      // dimension (batchsize) of the
@ -299,6 +300,7 @@ __global__ void Marlin(
    const int4* __restrict__ B,  // 4bit quantized weight matrix of shape kxn
    int4* __restrict__ C,        // fp16 output buffer of shape mxn
    int4* __restrict__ C_tmp,    // fp32 tmp output buffer (for reduce)
+    const int4* __restrict__ b_bias_ptr,
    const int4* __restrict__ scales_ptr,  // fp16 quantization scales of shape
                                          // (k/groupsize)xn
    const uint16_t* __restrict__ scale2_ptr,  // fp16 global scale (for nvfp4
@ -318,8 +320,9 @@ __global__ void Marlin(
    int prob_n,             // output dimension n
    int prob_k,             // reduction dimension k
    int* locks,             // extra global storage for barrier synchronization
-    bool use_atomic_add,    // whether to use atomic add to reduce
-    bool use_fp32_reduce,   // whether to use fp32 global reduce
+    bool has_bias,
+    bool use_atomic_add,   // whether to use atomic add to reduce
+    bool use_fp32_reduce,  // whether to use fp32 global reduce
    int max_shared_mem) {
  // Each threadblock processes one "stripe" of the B matrix with (roughly) the
  // same size, which might involve multiple column "slices" (of width 16 *
@ -342,12 +345,23 @@ __global__ void Marlin(

  extern __shared__ int4 sh[];
  static constexpr auto w_type = vllm::ScalarType::from_id(w_type_id);
+  static constexpr auto s_type = vllm::ScalarType::from_id(s_type_id);
+  if constexpr (w_type == vllm::kFE2M1f) {
+    static_assert(s_type == vllm::kFE4M3fn && group_blocks == 1 ||
+                  s_type == vllm::kFE8M0fnu && group_blocks == 2);
+  } else if constexpr (std::is_same<scalar_t, nv_bfloat16>::value) {
+    static_assert(s_type == vllm::kBFloat16);
+  } else if constexpr (std::is_same<scalar_t, half>::value) {
+    static_assert(s_type == vllm::kFloat16);
+  }
+
  constexpr bool has_zp = w_type == vllm::kU4 || w_type == vllm::kU8;
  constexpr bool is_int_type = w_type == vllm::kU4 || w_type == vllm::kU8 ||
                               w_type == vllm::kU4B8 || w_type == vllm::kU8B128;
  // see comments of dequant.h for more details
  constexpr bool dequant_skip_flop =
-      !is_int_type ||
+      w_type == vllm::kFE4M3fn ||
+      w_type == vllm::kFE2M1f && s_type == vllm::kFE4M3fn ||
      has_zp && !is_zp_float && !std::is_same<scalar_t, nv_bfloat16>::value ||
      has_zp && !is_zp_float && !(w_type == vllm::kU8);

@ -365,6 +379,7 @@ __global__ void Marlin(
  const int zp_expert_stride =
      is_zp_float ? prob_n * prob_k / group_size / 8
                  : prob_n * prob_k / group_size / (pack_factor * 4);
+  const int b_bias_expert_stride = prob_n / 8;

  // parallel: num valid moe blocks
  int num_tokens_past_padded = num_tokens_past_padded_ptr[0];
@ -475,7 +490,7 @@ __global__ void Marlin(
        for (int i = 0; i < 4; i++) {
          int idx = tid4 * 4 + i;
          idx = idx < block_num_valid_tokens ? idx : 0;
-          if constexpr (w_type == vllm::kFE2M1f) {
+          if constexpr (w_type == vllm::kFE2M1f && s_type == vllm::kFE4M3fn) {
            sh_block_topk_weights[idx] = __hmul2(
                global_scale, Dtype::num2num2(Dtype::float2num(
                                  topk_weights_ptr[sh_block_sorted_ids[idx]])));
@ -513,7 +528,7 @@ __global__ void Marlin(
      expert_id = expert_ids_ptr[block_id];
    }

-    if constexpr (w_type == vllm::kFE2M1f) {
+    if constexpr (w_type == vllm::kFE2M1f && s_type == vllm::kFE4M3fn) {
      uint16_t val = scale2_ptr[expert_id];
      global_scale = Dtype::num2num2(*reinterpret_cast<scalar_t*>(&val));
    }
@ -526,6 +541,9 @@ __global__ void Marlin(
    if constexpr (has_act_order) {
      g_idx += (expert_id - old_expert_id) * prob_k;
    }
+    if (has_bias) {
+      b_bias_ptr += (expert_id - old_expert_id) * b_bias_expert_stride;
+    }

    read_moe_block_data(block_id);
  };
@ -721,7 +739,7 @@ __global__ void Marlin(

    s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) +
              (threadIdx.x % 32) / 4;
-    s_sh_rd = s_sh_rd * 2 + warp_row % 2;
+    s_sh_rd = s_sh_rd * 2 + (warp_row / group_blocks) % 2;

  } else if constexpr (group_blocks != -1)
    s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) +
@ -734,6 +752,18 @@ __global__ void Marlin(
    s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) +
              (threadIdx.x % 32) % 4;

+  int bias_sh_rd;
+  if constexpr (m_block_size_8) {
+    bias_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) +
+                 (threadIdx.x % 32) / 8;
+  } else {
+    bias_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) +
+                 (threadIdx.x % 32) % 4;
+  }
+
+  int bias_sh_wr = threadIdx.x;
+  int bias_gl_rd = (thread_n_blocks * 16 / 8) * slice_col + threadIdx.x;
+
  // Zero-points have the same read layout as the scales
  // (without column-wise case)
  constexpr int num_col_threads = 8;
@ -793,7 +823,19 @@ __global__ void Marlin(
  constexpr int sh_b_size = stages * b_sh_stage;
  int4* sh_b = sh_new;
  int4* sh_red = sh_new;
-  int4* sh_g_idx = sh_b + (sh_red_size > sh_b_size ? sh_red_size : sh_b_size);
+
+  constexpr int sh_size_b_red_min =
+      (sh_red_size < sh_b_size ? sh_red_size : sh_b_size);
+  constexpr int sh_size_b_red_max =
+      (sh_red_size > sh_b_size ? sh_red_size : sh_b_size);
+  constexpr int sh_bias_size = (thread_n_blocks * 16 / 8);
+  constexpr int sh_b_red_bias_size =
+      sh_size_b_red_max > (sh_size_b_red_min + sh_bias_size)
+          ? sh_size_b_red_max
+          : (sh_size_b_red_min + sh_bias_size);
+
+  int4* sh_bias = sh_new + sh_size_b_red_min;
+  int4* sh_g_idx = sh_new + sh_b_red_bias_size;
  int4* sh_zp = sh_g_idx + (stages * g_idx_stage);
  constexpr int sh_s_size = has_act_order ? (act_s_max_num_groups * s_sh_stride)
                                          : (stages * s_sh_stage);
@ -803,9 +845,9 @@ __global__ void Marlin(
  static_assert(thread_m_blocks * 16 * thread_n_blocks * 16 / 8 <=
                stages * b_sh_stage);
  int4* sh_a = sh_s + sh_s_size;
-  constexpr int shm_size_used =
-      moe_block_size + stages * (g_idx_stage + zp_sh_stage) + sh_s_size +
-      (sh_red_size > sh_b_size ? sh_red_size : sh_b_size);
+  constexpr int shm_size_used = moe_block_size +
+                                stages * (g_idx_stage + zp_sh_stage) +
+                                sh_s_size + sh_b_red_bias_size;

  // all remaining shared memory is used to cache A (input)
  // sh_a_max_row is at least ` stages * 16 * thread_m_blocks `
@ -816,7 +858,8 @@ __global__ void Marlin(
  FragA frag_a[2][thread_m_blocks];
  I4 frag_b_quant[2][b_thread_vecs];
  FragC frag_c[thread_m_blocks][4][2];
-  FragS frag_s[2][4];                    // No act-order
+  FragS frag_s[2][4];  // No act-order
+  FragS frag_bias[2][4];
  FragS act_frag_s[2][4][4];             // For act-order
  int frag_qzp[2][num_ints_per_thread];  // Zero-points
  FragZP frag_zp;                        // Zero-points in fp16
@ -1065,10 +1108,15 @@ __global__ void Marlin(
          if constexpr (w_type_id != vllm::kFE2M1f.id()) {
            reinterpret_cast<int4*>(&frag_s[k % 2])[0] =
                sh_s_stage[s_sh_rd + cur_group_id * s_sh_stride];
-          } else {
+          } else if constexpr (group_blocks == 1 || thread_k_blocks > 4) {
            reinterpret_cast<int2*>(&frag_s[k % 2])[0] =
                reinterpret_cast<int2*>(
                    sh_s_stage)[s_sh_rd + cur_group_id * (2 * s_sh_stride)];
+          } else {
+            reinterpret_cast<int2*>(&frag_s[k % 2])[0] =
+                reinterpret_cast<int2*>(
+                    sh_s_stage)[s_sh_rd + cur_group_id * (2 * s_sh_stride) +
+                                k % 2];
          }
        }
      }
@ -1281,9 +1329,9 @@ __global__ void Marlin(
      int s_quant_0 = reinterpret_cast<int*>(frag_s[k2])[0];
      int s_quant_1 = reinterpret_cast<int*>(frag_s[k2])[1];

-      dequant_fp8_scales<scalar_t2>(s_quant_0,
-                                    reinterpret_cast<scalar_t2*>(&frag_s[k2]));
-      dequant_fp8_scales<scalar_t2>(
+      dequant_fp8_scales<scalar_t2, s_type_id>(
+          s_quant_0, reinterpret_cast<scalar_t2*>(&frag_s[k2]));
+      dequant_fp8_scales<scalar_t2, s_type_id>(
          s_quant_1, reinterpret_cast<scalar_t2*>(&frag_s[k2]) + 2);
    }

@ -1566,7 +1614,7 @@ __global__ void Marlin(
  // Write out the reduce final result in the correct layout. We only actually
  // reshuffle matrix fragments in this step, the reduction above is performed
  // in fragment layout.
-  auto write_result = [&]() {
+  auto write_result = [&](bool last) {
    int c_gl_stride = prob_n / 8;
    constexpr int c_sh_stride = 2 * thread_n_blocks + 1;
    int c_gl_wr_delta = c_gl_stride * (threads / (2 * thread_n_blocks));
@ -1592,7 +1640,7 @@ __global__ void Marlin(

    // We first reorder in shared memory to guarantee the most efficient final
    // global write patterns
-    auto write = [&](int idx, float c0, float c1, FragS& s) {
+    auto write = [&](int idx, float c0, float c1, FragS& s, FragS& b_bias) {
      scalar_t2 res =
          Dtype::nums2num2(Dtype::float2num(c0), Dtype::float2num(c1));

@ -1601,14 +1649,27 @@ __global__ void Marlin(
      if constexpr (!has_act_order && group_blocks == -1 &&
                    w_type.size_bits() == 4 &&
                    (has_zp && dequant_skip_flop || !has_zp)) {
-        res = __hmul2(res, s[0]);
+        scalar_t2 tmp_scale = s[0];
+        if constexpr (m_block_size_8) {
+          tmp_scale = Dtype::num2num2(
+              reinterpret_cast<scalar_t*>(&s[0])[(threadIdx.x % 8) / 4]);
+        }
+        res = __hmul2(res, tmp_scale);
      }

-      if constexpr (w_type == vllm::kFE2M1f) {
+      if constexpr (w_type == vllm::kFE2M1f && s_type == vllm::kFE4M3fn) {
        if (!mul_topk_weights) {
          res = __hmul2(res, global_scale);
        }
      }
+      if (has_bias && last) {
+        scalar_t2 tmp_bias = b_bias[0];
+        if constexpr (m_block_size_8) {
+          tmp_bias = Dtype::num2num2(
+              reinterpret_cast<scalar_t*>(&b_bias[0])[(threadIdx.x % 8) / 4]);
+        }
+        res = __hadd2(res, tmp_bias);
+      }

      if constexpr (m_block_size_8) {
        ((scalar_t*)sh_red)[idx] = res.x;
@ -1626,19 +1687,25 @@ __global__ void Marlin(
          if constexpr (m_block_size_8) {
            int wr = c_sh_wr + 16 * j;
            write(wr, frag_c[i][j][0][0], frag_c[i][j][0][1],
-                  frag_s[j / 2][2 * (j % 2) + 0]);
+                  frag_s[j / 2][2 * (j % 2) + 0],
+                  frag_bias[j / 2][2 * (j % 2) + 0]);
            write(wr + 8, frag_c[i][j][0][2], frag_c[i][j][0][3],
-                  frag_s[j / 2][2 * (j % 2) + 1]);
+                  frag_s[j / 2][2 * (j % 2) + 1],
+                  frag_bias[j / 2][2 * (j % 2) + 1]);
          } else {
            int wr = c_sh_wr + 8 * j;
            write(wr + (4 * c_sh_stride) * 0 + 0, frag_c[i][j][0][0],
-                  frag_c[i][j][0][1], frag_s[j / 2][2 * (j % 2) + 0]);
+                  frag_c[i][j][0][1], frag_s[j / 2][2 * (j % 2) + 0],
+                  frag_bias[j / 2][2 * (j % 2) + 0]);
            write(wr + (4 * c_sh_stride) * 8 + 0, frag_c[i][j][0][2],
-                  frag_c[i][j][0][3], frag_s[j / 2][2 * (j % 2) + 0]);
+                  frag_c[i][j][0][3], frag_s[j / 2][2 * (j % 2) + 0],
+                  frag_bias[j / 2][2 * (j % 2) + 0]);
            write(wr + (4 * c_sh_stride) * 0 + 4, frag_c[i][j][1][0],
-                  frag_c[i][j][1][1], frag_s[j / 2][2 * (j % 2) + 1]);
+                  frag_c[i][j][1][1], frag_s[j / 2][2 * (j % 2) + 1],
+                  frag_bias[j / 2][2 * (j % 2) + 1]);
            write(wr + (4 * c_sh_stride) * 8 + 4, frag_c[i][j][1][2],
-                  frag_c[i][j][1][3], frag_s[j / 2][2 * (j % 2) + 1]);
+                  frag_c[i][j][1][3], frag_s[j / 2][2 * (j % 2) + 1],
+                  frag_bias[j / 2][2 * (j % 2) + 1]);
          }
        }
        c_sh_wr += 16 * (4 * c_sh_stride);
@ -1805,6 +1872,14 @@ __global__ void Marlin(
      }

      thread_block_reduce();
+
+      if (has_bias && last) {
+        __syncthreads();
+        cp_async4_pred(&sh_bias[bias_sh_wr], &b_bias_ptr[bias_gl_rd],
+                       threadIdx.x < 16 * thread_n_blocks / 8);
+        cp_async_fence();
+      }
+
      if constexpr (!has_act_order && group_blocks == -1 &&
                    (has_zp && dequant_skip_flop || !has_zp)) {
        if (w_type.size_bits() == 8 || (last || use_atomic_add)) {
@ -1867,11 +1942,20 @@ __global__ void Marlin(
        }
        barrier_release(&locks[locks_off], last);
      }
+
+      if (has_bias && last) {
+        cp_async_wait<0>();
+        __syncthreads();
+        reinterpret_cast<int4*>(&frag_bias)[0] = sh_bias[bias_sh_rd];
+        reinterpret_cast<int4*>(&frag_bias)[1] = sh_bias[bias_sh_rd + 4];
+        __syncthreads();
+      }
+
      if (use_atomic_add && slice_count > 1 && slice_idx != 0)
        wait_negative_and_add(&locks[locks_off]);
      if (last || use_atomic_add)
        // only the last block in a slice actually writes the result
-        write_result();
+        write_result(last);
      int old_slice_row = slice_row;
      slice_row = 0;
      slice_col_par++;
@ -1904,6 +1988,7 @@ __global__ void Marlin(
          for (int i = 0; i < b_sh_wr_iters; i++) B_ptr[i] -= b_gl_stride;
        }

+        bias_gl_rd = (thread_n_blocks * 16 / 8) * slice_col + threadIdx.x;
        // Update slice k/n for scales loading
        if constexpr (has_act_order) {
          slice_k_start = tb_k * slice_row;
--- a/csrc/moe/marlin_moe_wna16/ops.cu
+++ b/csrc/moe/marlin_moe_wna16/ops.cu
@ -51,8 +51,9 @@ __global__ void permute_cols_kernel(
 }  // namespace marlin

 torch::Tensor moe_wna16_marlin_gemm(
-    torch::Tensor& a, std::optional<torch::Tensor> const& c_or_none,
-    torch::Tensor& b_q_weight, torch::Tensor& b_scales,
+    torch::Tensor& a, std::optional<torch::Tensor> c_or_none,
+    torch::Tensor& b_q_weight,
+    std::optional<torch::Tensor> const& b_bias_or_none, torch::Tensor& b_scales,
    std::optional<torch::Tensor> const& b_zeros_or_none,
    std::optional<torch::Tensor> const& g_idx_or_none,
    std::optional<torch::Tensor> const& perm_or_none, torch::Tensor& workspace,
@ -212,7 +213,7 @@ int get_kernel_cache_size(thread_config_t const& th_config, bool m_block_size_8,
  // Get B size
  int tb_k = th_config.thread_k;
  int tb_n = th_config.thread_n;
-  int tb_m = thread_m_blocks * (m_block_size_8 ? 8 : 16);
+  int tb_m = thread_m_blocks * 16;

  // shm size for block_sorted_ids/rd_block_sorted_ids/block_topk_weights
  // both of them requires tb_m * 4 bytes (tb_m * int32 or tb_m * float32)
@ -220,6 +221,11 @@ int get_kernel_cache_size(thread_config_t const& th_config, bool m_block_size_8,
  int sh_a_size = pipe_stages * (tb_m * tb_k) * 2;
  int sh_b_size = pipe_stages * (tb_k * tb_n / pack_factor) * 4;
  int sh_red_size = tb_m * (tb_n + 8) * 2;
+  int sh_bias_size = tb_n * 2;
+  int tmp_size =
+      (sh_b_size > sh_red_size ? sh_red_size : sh_b_size) + sh_bias_size;
+  tmp_size = max(max(sh_b_size, sh_red_size), tmp_size);
+
  int sh_s_size =
      get_scales_cache_size(th_config, prob_m, prob_n, prob_k, num_bits,
                            group_size, has_act_order, is_k_full);
@ -234,8 +240,8 @@ int get_kernel_cache_size(thread_config_t const& th_config, bool m_block_size_8,
      sh_zp_size = sh_s_size / 2;
  }

-  int total_size = max(sh_b_size, sh_red_size) + sh_a_size + sh_s_size +
-                   sh_zp_size + sh_g_idx_size + sh_block_meta_size;
+  int total_size = tmp_size + sh_a_size + sh_s_size + sh_zp_size +
+                   sh_g_idx_size + sh_block_meta_size;

  return total_size;
 }
@ -270,20 +276,25 @@ bool is_valid_config(thread_config_t const& th_config, bool m_block_size_8,
  int cache_size = get_kernel_cache_size(
      th_config, m_block_size_8, thread_m_blocks, prob_m, prob_n, prob_k,
      num_bits, group_size, has_act_order, is_k_full, has_zp, is_zp_float);
-  return cache_size <= max_shared_mem;
+  return cache_size + 512 <= max_shared_mem;
 }

-  #define _GET_IF(W_TYPE, THREAD_M_BLOCKS, THREAD_N_BLOCKS, THREAD_K_BLOCKS, \
-                  M_BLOCK_SIZE_8, GROUP_BLOCKS, NUM_THREADS, IS_ZP_FLOAT)    \
-    else if (q_type == W_TYPE && thread_m_blocks == THREAD_M_BLOCKS &&       \
-             thread_n_blocks == THREAD_N_BLOCKS &&                           \
-             thread_k_blocks == THREAD_K_BLOCKS &&                           \
-             m_block_size_8 == M_BLOCK_SIZE_8 &&                             \
-             group_blocks == GROUP_BLOCKS && num_threads == NUM_THREADS &&   \
-             is_zp_float == IS_ZP_FLOAT) {                                   \
-      kernel = Marlin<scalar_t, W_TYPE.id(), NUM_THREADS, THREAD_M_BLOCKS,   \
-                      THREAD_N_BLOCKS, THREAD_K_BLOCKS, M_BLOCK_SIZE_8,      \
-                      pipe_stages, GROUP_BLOCKS, IS_ZP_FLOAT>;               \
+  #define _GET_IF(W_TYPE, THREAD_M_BLOCKS, THREAD_N_BLOCKS, THREAD_K_BLOCKS,   \
+                  M_BLOCK_SIZE_8, GROUP_BLOCKS, NUM_THREADS, IS_ZP_FLOAT)      \
+    else if (q_type == W_TYPE && thread_m_blocks == THREAD_M_BLOCKS &&         \
+             thread_n_blocks == THREAD_N_BLOCKS &&                             \
+             thread_k_blocks == THREAD_K_BLOCKS &&                             \
+             m_block_size_8 == M_BLOCK_SIZE_8 &&                               \
+             group_blocks == GROUP_BLOCKS && num_threads == NUM_THREADS &&     \
+             is_zp_float == IS_ZP_FLOAT) {                                     \
+      constexpr auto S_TYPE =                                                  \
+          W_TYPE == vllm::kFE2M1f                                              \
+              ? (GROUP_BLOCKS == 1 ? vllm::kFE4M3fn : vllm::kFE8M0fnu)         \
+              : (std::is_same<scalar_t, half>::value ? vllm::kFloat16          \
+                                                     : vllm::kBFloat16);       \
+      kernel = Marlin<scalar_t, W_TYPE.id(), S_TYPE.id(), NUM_THREADS,         \
+                      THREAD_M_BLOCKS, THREAD_N_BLOCKS, THREAD_K_BLOCKS,       \
+                      M_BLOCK_SIZE_8, pipe_stages, GROUP_BLOCKS, IS_ZP_FLOAT>; \
    }

  // COMMON: cases for (group_blocks in [-1, 2, 4, 8] and is_zp_float == false)
@ -335,31 +346,45 @@ bool is_valid_config(thread_config_t const& th_config, bool m_block_size_8,
    _GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS, false)  \
    _GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS, false) \
    _GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS, false)  \
-                                                                          \
    _GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS, false) \
    _GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS, false)

-  #define FP4_GET_IF_M1(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS)        \
-    _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, 1, NUM_THREADS, false) \
-    _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 1, NUM_THREADS, false)
-
-  #define FP4_GET_IF_M234(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS)       \
-    _GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 1, NUM_THREADS, false) \
-    _GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 1, NUM_THREADS, false) \
-    _GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 1, NUM_THREADS, false)
-
-  #define FP4_GET_IF(W_TYPE)            \
-    FP4_GET_IF_M1(W_TYPE, 8, 8, 256)    \
-    FP4_GET_IF_M1(W_TYPE, 8, 4, 128)    \
-    FP4_GET_IF_M234(W_TYPE, 16, 4, 256) \
-    FP4_GET_IF_M234(W_TYPE, 8, 4, 128)
-
  #define BIGGROUP_GET_IF(W_TYPE)            \
    BIGGROUP_GET_IF_M1(W_TYPE, 8, 8, 256)    \
    BIGGROUP_GET_IF_M1(W_TYPE, 8, 4, 128)    \
    BIGGROUP_GET_IF_M234(W_TYPE, 16, 4, 256) \
    BIGGROUP_GET_IF_M234(W_TYPE, 8, 4, 128)

+  #define NVFP4_GET_IF_M1(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS)      \
+    _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, 1, NUM_THREADS, false) \
+    _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 1, NUM_THREADS, false)
+
+  #define NVFP4_GET_IF_M234(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS)     \
+    _GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 1, NUM_THREADS, false) \
+    _GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 1, NUM_THREADS, false) \
+    _GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 1, NUM_THREADS, false)
+
+  #define NVFP4_GET_IF(W_TYPE)            \
+    NVFP4_GET_IF_M1(W_TYPE, 8, 8, 256)    \
+    NVFP4_GET_IF_M1(W_TYPE, 8, 4, 128)    \
+    NVFP4_GET_IF_M234(W_TYPE, 16, 4, 256) \
+    NVFP4_GET_IF_M234(W_TYPE, 8, 4, 128)
+
+  #define MXFP4_GET_IF_M1(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS)      \
+    _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, 2, NUM_THREADS, false) \
+    _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS, false)
+
+  #define MXFP4_GET_IF_M234(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS)     \
+    _GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS, false) \
+    _GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS, false) \
+    _GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS, false)
+
+  #define MXFP4_GET_IF(W_TYPE)            \
+    MXFP4_GET_IF_M1(W_TYPE, 8, 8, 256)    \
+    MXFP4_GET_IF_M1(W_TYPE, 8, 4, 128)    \
+    MXFP4_GET_IF_M234(W_TYPE, 16, 4, 256) \
+    MXFP4_GET_IF_M234(W_TYPE, 8, 4, 128)
+
  // We currently have 4-bit models only with group_blocks == 4
  #define FZP_GET_IF_M1(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS)       \
    _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, 4, NUM_THREADS, true) \
@ -408,12 +433,17 @@ MarlinFuncPtr get_marlin_kernel(const vllm::ScalarType q_type,
  COMMON_GET_IF(vllm::kU4B8)
  COMMON_GET_IF(vllm::kU8B128)

-  BIGGROUP_GET_IF(vllm::kFE4M3fn)
+  NVFP4_GET_IF(vllm::kFE2M1f)

-  FP4_GET_IF(vllm::kFE2M1f)
+  BIGGROUP_GET_IF(vllm::kFE4M3fn)

  ACT_GET_IF(vllm::kU4B8)
  ACT_GET_IF(vllm::kU8B128)
+  if (std::is_same<scalar_t, nv_bfloat16>::value) {
+    if (false) {
+    }
+    MXFP4_GET_IF(vllm::kFE2M1f)
+  }

  return kernel;
 }
@ -482,16 +512,16 @@ exec_config_t determine_exec_config(const vllm::ScalarType& q_type, int prob_m,
 }

 template <typename scalar_t>
-void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* s,
-               void* s2, void* zp, void* g_idx, void* perm, void* a_tmp,
-               void* sorted_token_ids, void* expert_ids,
+void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias,
+               void* s, void* s2, void* zp, void* g_idx, void* perm,
+               void* a_tmp, void* sorted_token_ids, void* expert_ids,
               void* num_tokens_past_padded, void* topk_weights,
               int moe_block_size, int top_k, bool mul_topk_weights, bool is_ep,
               int prob_m, int prob_n, int prob_k, void* workspace,
-               vllm::ScalarType const& q_type, bool has_act_order,
-               bool is_k_full, bool has_zp, int num_groups, int group_size,
-               int dev, cudaStream_t stream, int thread_k, int thread_n,
-               int sms, bool use_atomic_add, bool use_fp32_reduce,
+               vllm::ScalarType const& q_type, bool has_bias,
+               bool has_act_order, bool is_k_full, bool has_zp, int num_groups,
+               int group_size, int dev, cudaStream_t stream, int thread_k,
+               int thread_n, int sms, bool use_atomic_add, bool use_fp32_reduce,
               bool is_zp_float) {
  int thread_m_blocks = div_ceil(moe_block_size, 16);
  bool m_block_size_8 = moe_block_size == 8;
@ -538,6 +568,7 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* s,
  const int4* B_ptr = (const int4*)B;
  int4* C_ptr = (int4*)C;
  int4* C_tmp_ptr = (int4*)C_tmp;
+  const int4* bias_ptr = (const int4*)b_bias;
  const int4* s_ptr = (const int4*)s;
  const uint16_t* s2_ptr = (const uint16_t*)s2;
  const int4* zp_ptr = (const int4*)zp;
@ -648,10 +679,10 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* s,
  // avoid ">>>" being formatted to "> > >"
  // clang-format off
  kernel<<<blocks, num_threads, max_shared_mem, stream>>>(
-      A_ptr, B_ptr, C_ptr, C_tmp_ptr, s_ptr, s2_ptr, zp_ptr, g_idx_ptr,
+      A_ptr, B_ptr, C_ptr, C_tmp_ptr, bias_ptr, s_ptr, s2_ptr, zp_ptr, g_idx_ptr,
      sorted_token_ids_ptr, expert_ids_ptr, num_tokens_past_padded_ptr,
      topk_weights_ptr, top_k, mul_topk_weights, is_ep, num_groups, prob_m,
-      prob_n, prob_k, locks, use_atomic_add, use_fp32_reduce, max_shared_mem);
+      prob_n, prob_k, locks, has_bias, use_atomic_add, use_fp32_reduce, max_shared_mem);
  // clang-format on
 }

@ -659,7 +690,8 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* s,

 torch::Tensor moe_wna16_marlin_gemm(
    torch::Tensor& a, std::optional<torch::Tensor> const& c_or_none,
-    torch::Tensor& b_q_weight, torch::Tensor& b_scales,
+    torch::Tensor& b_q_weight,
+    std::optional<torch::Tensor> const& b_bias_or_none, torch::Tensor& b_scales,
    std::optional<torch::Tensor> const& global_scale_or_none,
    std::optional<torch::Tensor> const& b_zeros_or_none,
    std::optional<torch::Tensor> const& g_idx_or_none,
@ -766,7 +798,6 @@ torch::Tensor moe_wna16_marlin_gemm(
  num_groups = b_scales.size(1);

  torch::Tensor g_idx, perm, a_tmp;
-  ;
  if (g_idx_or_none.has_value() && perm_or_none.has_value()) {
    g_idx = g_idx_or_none.value();
    perm = perm_or_none.value();
@ -815,12 +846,24 @@ torch::Tensor moe_wna16_marlin_gemm(
  torch::Tensor global_scale;
  if (global_scale_or_none.has_value()) {
    global_scale = global_scale_or_none.value();
-    TORCH_CHECK(b_q_type == vllm::kFE2M1f,
-                "global_scale can only be used for float4_e2m1f.");
+    TORCH_CHECK(b_q_type == vllm::kFE2M1f && group_size == 16,
+                "global_scale can only be used for nvfp4 format.");
  } else {
    global_scale = torch::empty({0}, options);
-    TORCH_CHECK(!(b_q_type == vllm::kFE2M1f),
-                "the global_scale parameter must be passed for float4_e2m1f.");
+    TORCH_CHECK(!(b_q_type == vllm::kFE2M1f && group_size == 16),
+                "the global_scale parameter must be passed for nvfp4 format.");
+  }
+
+  bool has_bias = b_bias_or_none.has_value();
+  torch::Tensor b_bias;
+  if (has_bias) {
+    b_bias = b_bias_or_none.value();
+    TORCH_CHECK(b_bias.device().is_cuda(), "b_bias is not on GPU");
+    TORCH_CHECK(b_bias.is_contiguous(), "b_bias is not contiguous");
+    TORCH_CHECK(b_bias.size(1) == size_n, "b_bias.size(0) != size_n");
+    TORCH_CHECK(b_bias.stride(1) == 1, "b_bias.stride(1) != 1");
+  } else {
+    b_bias = torch::empty({0}, options);
  }

  torch::Tensor b_zeros;
@ -832,7 +875,6 @@ torch::Tensor moe_wna16_marlin_gemm(
    b_zeros = torch::empty({0}, options);
  }
  bool has_zp = b_zeros.size(-1) > 0;
-
  if (has_zp) {
    TORCH_CHECK(
        b_q_type == vllm::kU4 || b_q_type == vllm::kU8,
@ -890,41 +932,58 @@ torch::Tensor moe_wna16_marlin_gemm(
  if (a.scalar_type() == at::ScalarType::Half) {
    void* scales_ptr;
    if (b_q_type == vllm::kFE2M1f) {
-      scales_ptr = b_scales.data_ptr<at::Float8_e4m3fn>();
+      if (group_size == 16)
+        scales_ptr = b_scales.data_ptr<at::Float8_e4m3fn>();
+      else if (group_size == 32)
+        scales_ptr = b_scales.data_ptr<at::Float8_e8m0fnu>();
+      else
+        TORCH_CHECK(false,
+                    "float4_e2m1f only supports group_size == 16 (NVFP4) ",
+                    "and group_size == 32 (MXFP4)");
    } else {
      scales_ptr = b_scales.data_ptr<at::Half>();
    }

    MARLIN_NAMESPACE_NAME::marlin_mm<half>(
        a.data_ptr<at::Half>(), b_q_weight.data_ptr(), c.data_ptr<at::Half>(),
-        c_tmp.data_ptr<float>(), scales_ptr, global_scale.data_ptr<at::Half>(),
-        b_zeros.data_ptr(), g_idx.data_ptr(), perm.data_ptr(),
-        a_tmp.data_ptr<at::Half>(), sorted_token_ids.data_ptr(),
-        expert_ids.data_ptr(), num_tokens_past_padded.data_ptr(),
-        topk_weights.data_ptr(), moe_block_size, top_k, mul_topk_weights, is_ep,
-        size_m, size_n, size_k, workspace.data_ptr(), b_q_type, has_act_order,
-        is_k_full, has_zp, num_groups, group_size, dev,
+        c_tmp.data_ptr<float>(), b_bias.data_ptr<at::Half>(), scales_ptr,
+        global_scale.data_ptr<at::Half>(), b_zeros.data_ptr(), g_idx.data_ptr(),
+        perm.data_ptr(), a_tmp.data_ptr<at::Half>(),
+        sorted_token_ids.data_ptr(), expert_ids.data_ptr(),
+        num_tokens_past_padded.data_ptr(), topk_weights.data_ptr(),
+        moe_block_size, top_k, mul_topk_weights, is_ep, size_m, size_n, size_k,
+        workspace.data_ptr(), b_q_type, has_bias, has_act_order, is_k_full,
+        has_zp, num_groups, group_size, dev,
        at::cuda::getCurrentCUDAStream(dev), thread_k, thread_n, sms,
        use_atomic_add, use_fp32_reduce, is_zp_float);
  } else if (a.scalar_type() == at::ScalarType::BFloat16) {
    void* scales_ptr;
    if (b_q_type == vllm::kFE2M1f) {
-      scales_ptr = b_scales.data_ptr<at::Float8_e4m3fn>();
+      if (group_size == 16)
+        scales_ptr = b_scales.data_ptr<at::Float8_e4m3fn>();
+      else if (group_size == 32)
+        scales_ptr = b_scales.data_ptr<at::Float8_e8m0fnu>();
+      else
+        TORCH_CHECK(false,
+                    "float4_e2m1f only supports group_size == 16 (NVFP4) ",
+                    "and group_size == 32 (MXFP4)");
    } else {
      scales_ptr = b_scales.data_ptr<at::BFloat16>();
    }

    MARLIN_NAMESPACE_NAME::marlin_mm<nv_bfloat16>(
        a.data_ptr<at::BFloat16>(), b_q_weight.data_ptr(),
-        c.data_ptr<at::BFloat16>(), c_tmp.data_ptr<float>(), scales_ptr,
+        c.data_ptr<at::BFloat16>(), c_tmp.data_ptr<float>(),
+        b_bias.data_ptr<at::BFloat16>(), scales_ptr,
        global_scale.data_ptr<at::BFloat16>(), b_zeros.data_ptr(),
        g_idx.data_ptr(), perm.data_ptr(), a_tmp.data_ptr<at::BFloat16>(),
        sorted_token_ids.data_ptr(), expert_ids.data_ptr(),
        num_tokens_past_padded.data_ptr(), topk_weights.data_ptr(),
        moe_block_size, top_k, mul_topk_weights, is_ep, size_m, size_n, size_k,
-        workspace.data_ptr(), b_q_type, has_act_order, is_k_full, has_zp,
-        num_groups, group_size, dev, at::cuda::getCurrentCUDAStream(dev),
-        thread_k, thread_n, sms, use_atomic_add, use_fp32_reduce, is_zp_float);
+        workspace.data_ptr(), b_q_type, has_bias, has_act_order, is_k_full,
+        has_zp, num_groups, group_size, dev,
+        at::cuda::getCurrentCUDAStream(dev), thread_k, thread_n, sms,
+        use_atomic_add, use_fp32_reduce, is_zp_float);
  } else {
    TORCH_CHECK(false,
                "moe_wna16_marlin_gemm only supports bfloat16 and float16");
--- a/csrc/moe/topk_softmax_kernels.cu
+++ b/csrc/moe/topk_softmax_kernels.cu
@ -188,7 +188,9 @@ __launch_bounds__(TPB) __global__ void moeTopK(
  It fuses the softmax, max and argmax into a single kernel.

  Limitations:
-  1) This implementation is intended for when the number of experts is a small power of 2.
+  1) This implementation is optimized for when the number of experts is a small power of 2.
+     Additionally it also supports when number of experts is multiple of 64 which is still
+     faster than the computing softmax and topK separately (only tested on CUDA yet).
  2) This implementation assumes k is small, but will work for any k.
 */

@ -198,8 +200,6 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__
        int* source_rows, const int k, const int start_expert, const int end_expert)
 {
    // We begin by enforcing compile time assertions and setting up compile time constants.
-    static_assert(VPT == (VPT & -VPT), "VPT must be power of 2");
-    static_assert(NUM_EXPERTS == (NUM_EXPERTS & -NUM_EXPERTS), "NUM_EXPERTS must be power of 2");
    static_assert(BYTES_PER_LDG == (BYTES_PER_LDG & -BYTES_PER_LDG), "BYTES_PER_LDG must be power of 2");
    static_assert(BYTES_PER_LDG <= 16, "BYTES_PER_LDG must be leq 16");

@ -407,12 +407,10 @@ struct TopkConstants
 };
 } // namespace detail

-template <int EXPERTS, int WARPS_PER_TB, int WARP_SIZE_PARAM, typename IndType>
+template <int EXPERTS, int WARPS_PER_TB, int WARP_SIZE_PARAM, int MAX_BYTES_PER_LDG, typename IndType>
 void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, float* output, IndType* indices,
    int* source_row, const int num_rows, const int k, const int start_expert, const int end_expert, cudaStream_t stream)
 {
-    static constexpr std::size_t MAX_BYTES_PER_LDG = 16;
-
    static constexpr int BYTES_PER_LDG = MIN(MAX_BYTES_PER_LDG, sizeof(float) * EXPERTS);
    using Constants = detail::TopkConstants<EXPERTS, BYTES_PER_LDG, WARP_SIZE_PARAM>;
    static constexpr int VPT = Constants::VPT;
@ -425,21 +423,27 @@ void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, f
        input, finished, output, num_rows, indices, source_row, k, start_expert, end_expert);
 }

-#define LAUNCH_SOFTMAX(NUM_EXPERTS, WARPS_PER_TB)                                \
-    switch (warpSize) {                                                          \
-        case 32:                                                                 \
-            topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, 32>(      \
-                gating_output, nullptr, topk_weights, topk_indices,              \
-                token_expert_indices, num_tokens, topk, 0, num_experts, stream); \
-            break;                                                               \
-        case 64:                                                                 \
-            topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, 64>(      \
-                gating_output, nullptr, topk_weights, topk_indices,              \
-                token_expert_indices, num_tokens, topk, 0, num_experts, stream); \
-            break;                                                               \
-        default:                                                                 \
-            TORCH_CHECK(false, "Unsupported warp size: ", warpSize);             \
+#ifndef USE_ROCM
+#define LAUNCH_SOFTMAX(NUM_EXPERTS, WARPS_PER_TB, MAX_BYTES)                          \
+    static_assert(WARP_SIZE == 32,                                                    \
+                  "Unsupported warp size. Only 32 is supported for CUDA");            \
+    topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, WARP_SIZE, MAX_BYTES>( \
+        gating_output, nullptr, topk_weights, topk_indices,                           \
+        token_expert_indices, num_tokens, topk, 0, num_experts, stream);
+#else
+#define LAUNCH_SOFTMAX(NUM_EXPERTS, WARPS_PER_TB, MAX_BYTES)                             \
+    if (WARP_SIZE == 64) {                                                               \
+        topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, 64, MAX_BYTES>(       \
+            gating_output, nullptr, topk_weights, topk_indices,                          \
+            token_expert_indices, num_tokens, topk, 0, num_experts, stream);             \
+    } else if (WARP_SIZE == 32) {                                                        \
+        topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, 32, MAX_BYTES>(       \
+            gating_output, nullptr, topk_weights, topk_indices,                          \
+            token_expert_indices, num_tokens, topk, 0, num_experts, stream);             \
+    } else {                                                                             \
+        assert(false && "Unsupported warp size. Only 32 and 64 are supported for ROCm"); \
    }
+#endif

 template <typename IndType>
 void topkGatingSoftmaxKernelLauncher(
@ -453,38 +457,64 @@ void topkGatingSoftmaxKernelLauncher(
    const int topk,
    cudaStream_t stream) {
    static constexpr int WARPS_PER_TB = 4;
-    auto warpSize = WARP_SIZE;
+    static constexpr int BYTES_PER_LDG_POWER_OF_2 = 16;
+#ifndef USE_ROCM
+    static constexpr int BYTES_PER_LDG_MULTIPLE_64 = 8;
+#endif
    switch (num_experts) {
        case 1:
-            LAUNCH_SOFTMAX(1, WARPS_PER_TB);
+            LAUNCH_SOFTMAX(1, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
            break;
        case 2:
-            LAUNCH_SOFTMAX(2, WARPS_PER_TB);
+            LAUNCH_SOFTMAX(2, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
            break;
        case 4:
-            LAUNCH_SOFTMAX(4, WARPS_PER_TB);
+            LAUNCH_SOFTMAX(4, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
            break;
        case 8:
-            LAUNCH_SOFTMAX(8, WARPS_PER_TB);
+            LAUNCH_SOFTMAX(8, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
            break;
        case 16:
-            LAUNCH_SOFTMAX(16, WARPS_PER_TB);
+            LAUNCH_SOFTMAX(16, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
            break;
        case 32:
-            LAUNCH_SOFTMAX(32, WARPS_PER_TB);
+            LAUNCH_SOFTMAX(32, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
            break;
        case 64:
-            LAUNCH_SOFTMAX(64, WARPS_PER_TB);
+            LAUNCH_SOFTMAX(64, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
            break;
        case 128:
-            LAUNCH_SOFTMAX(128, WARPS_PER_TB);
+            LAUNCH_SOFTMAX(128, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
            break;
        case 256:
-            LAUNCH_SOFTMAX(256, WARPS_PER_TB);
+            LAUNCH_SOFTMAX(256, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
            break;
+        case 512:
+            LAUNCH_SOFTMAX(512, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
+            break;
+        // (CUDA only) support multiples of 64 when num_experts is not power of 2.
+        // ROCm uses WARP_SIZE 64 so 8 bytes loading won't fit for some of num_experts,
+        // alternatively we can test 4 bytes loading and enable it in future.
+#ifndef USE_ROCM
+        case 192:
+            LAUNCH_SOFTMAX(192, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64);
+            break;
+        case 320:
+            LAUNCH_SOFTMAX(320, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64);
+            break;
+        case 384:
+            LAUNCH_SOFTMAX(384, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64);
+            break;
+        case 448:
+            LAUNCH_SOFTMAX(448, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64);
+            break;
+        case 576:
+            LAUNCH_SOFTMAX(576, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64);
+            break;
+#endif
        default: {
            TORCH_CHECK(softmax_workspace != nullptr,
-                "softmax_workspace must be provided for num_experts that are not a power of 2.");
+                "softmax_workspace must be provided for num_experts that are not a power of 2 or multiple of 64.");
            static constexpr int TPB = 256;
            moeSoftmax<TPB><<<num_tokens, TPB, 0, stream>>>(
                gating_output, nullptr, softmax_workspace, num_experts);
--- a/csrc/moe/torch_bindings.cpp
+++ b/csrc/moe/torch_bindings.cpp
@ -35,7 +35,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {

  m.def(
      "moe_wna16_marlin_gemm(Tensor! a, Tensor? c_or_none,"
-      "Tensor! b_q_weight, Tensor! b_scales, Tensor? global_scale, Tensor? "
+      "Tensor! b_q_weight, Tensor? b_bias_or_none,"
+      "Tensor! b_scales, Tensor? global_scale, Tensor? "
      "b_zeros_or_none,"
      "Tensor? g_idx_or_none, Tensor? perm_or_none, Tensor! workspace,"
      "Tensor sorted_token_ids,"
--- a/csrc/ops.h
+++ b/csrc/ops.h
@ -145,22 +145,6 @@ void gelu_fast(torch::Tensor& out, torch::Tensor& input);

 void gelu_quick(torch::Tensor& out, torch::Tensor& input);

-void advance_step_flashattn(int64_t num_seqs, int64_t num_queries,
-                            int64_t block_size, torch::Tensor& input_tokens,
-                            torch::Tensor& sampled_token_ids,
-                            torch::Tensor& input_positions,
-                            torch::Tensor& seq_lens,
-                            torch::Tensor& slot_mapping,
-                            torch::Tensor& block_tables);
-
-void advance_step_flashinfer(
-    int64_t num_seqs, int64_t num_queries, int64_t block_size,
-    torch::Tensor& input_tokens, torch::Tensor& sampled_token_ids,
-    torch::Tensor& input_positions, torch::Tensor& seq_lens,
-    torch::Tensor& slot_mapping, torch::Tensor& block_tables,
-    torch::Tensor& paged_kv_indices, torch::Tensor& paged_kv_indptr,
-    torch::Tensor& paged_kv_last_page_len, torch::Tensor& block_table_bounds);
-
 void cutlass_mla_decode(torch::Tensor const& out, torch::Tensor const& q_nope,
                        torch::Tensor const& q_pe,
                        torch::Tensor const& kv_c_and_k_pe_cache,
--- a/csrc/prepare_inputs/advance_step.cu
+++ b/csrc/prepare_inputs/advance_step.cu
@ -1,336 +0,0 @@
-/*
- * The goal of this GPU kernel is to advance input tensors on the GPU directly
- * PR: https://github.com/vllm-project/vllm/pull/6338
- * Current restrictions:
- *     1. Specialized for DraftModelRunner
- *     2. Supports flash_attn only
- */
-
-#include "advance_step.cuh"
-
-namespace prepare_inputs {
-
-//
-template <int const num_threads>
-__global__ void advance_step_flashattn_kernel(
-    int num_seqs, int num_queries, int block_size, long* input_tokens_ptr,
-    long const* sampled_token_ids_ptr, long* input_positions_ptr,
-    int* seq_lens_ptr, long* slot_mapping_ptr, int const* block_tables_ptr,
-    int64_t const block_tables_stride) {
-  int const n_pad = num_seqs - num_queries;
-  if (n_pad && blockIdx.x == 0) {
-    // Handle cuda graph padding
-    int const offset = num_queries;
-    for (int i = threadIdx.x; i < n_pad; i += blockDim.x) {
-      input_tokens_ptr[offset + i] = 0;
-      input_positions_ptr[offset + i] = 0;
-      slot_mapping_ptr[offset + i] = -1;
-    }
-  }
-
-  int num_query_blocks = div_ceil(num_queries, num_threads);
-
-  if (blockIdx.x >= num_query_blocks) {
-    return;
-  }
-
-  int cur_query_id = blockIdx.x * num_threads + threadIdx.x;
-
-  if (cur_query_id >= num_queries) {
-    return;
-  }
-
-  // Update input_tokens
-  input_tokens_ptr[cur_query_id] = sampled_token_ids_ptr[cur_query_id];
-
-  int seq_len = seq_lens_ptr[cur_query_id];
-  int next_seq_len = seq_len + 1;
-  int next_input_pos = next_seq_len - 1;
-
-  // Update seq_lens
-  seq_lens_ptr[cur_query_id] = next_seq_len;
-  // Update input_positions
-  input_positions_ptr[cur_query_id] = next_input_pos;
-
-  int const* seq_block_tables_ptr =
-      block_tables_ptr + block_tables_stride * cur_query_id;
-
-  int block_index = next_input_pos / block_size;
-  int block_offset = next_input_pos % block_size;
-
-  int slot_num = seq_block_tables_ptr[block_index] * block_size + block_offset;
-  // Update slot_mapping
-  slot_mapping_ptr[cur_query_id] = slot_num;
-}
-
-inline void verify_tensor(std::string const& name, torch::Tensor const& t,
-                          int64_t const size_0, int64_t const size_1,
-                          c10::ScalarType const type) {
-  bool size_0_cond = true;
-  if (size_0 != -1) {
-    size_0_cond = t.size(0) == size_0;
-  }
-
-  bool size_1_cond = true;
-  if (size_1 != -1) {
-    size_1_cond = t.size(1) == size_1;
-  }
-
-  bool is_contiguous = t.is_contiguous();
-  bool same_type = t.dtype() == type;
-
-  bool pass = size_0_cond && size_1_cond && is_contiguous && same_type;
-  if (!pass) {
-    TORCH_CHECK(false, "tensor: name = ", name, ", shape = ", t.sizes(),
-                " is_cont = ", t.is_contiguous(), ", type = ", t.dtype(),
-                " is not as expected: shape = [", size_0, ", ", size_1,
-                "], type = ", type);
-  }
-}
-
-/// each thread processes a block per query
-__global__ void advance_step_flashinfer_kernel(
-    int num_threads, int num_seqs, int num_queries, int block_size,
-    long* input_tokens_ptr, long const* sampled_token_ids_ptr,
-    long* input_positions_ptr, int* seq_lens_ptr, long* slot_mapping_ptr,
-    int const* block_tables_ptr, int64_t const block_tables_stride,
-    int* paged_kv_last_page_len_ptr, int* block_table_bound_ptr) {
-  int const n_pad = num_seqs - num_queries;
-  if (n_pad && blockIdx.x == 0) {
-    // Handle cuda graph padding
-    int const offset = num_queries;
-    for (int i = threadIdx.x; i < n_pad; i += blockDim.x) {
-      input_tokens_ptr[offset + i] = 0;
-      input_positions_ptr[offset + i] = 0;
-      slot_mapping_ptr[offset + i] = -1;
-    }
-  }
-  int num_query_blocks = div_ceil(num_queries, num_threads);
-
-  if (blockIdx.x < num_query_blocks) {
-    int cur_query_id = blockIdx.x * num_threads + threadIdx.x;
-
-    if (cur_query_id < num_queries) {
-      // Update input_tokens
-      input_tokens_ptr[cur_query_id] = sampled_token_ids_ptr[cur_query_id];
-
-      int seq_len = seq_lens_ptr[cur_query_id];
-      int next_seq_len = seq_len + 1;
-      int next_input_pos = next_seq_len - 1;
-
-      // Update seq_lens
-      seq_lens_ptr[cur_query_id] = next_seq_len;
-      // Update input_positions
-      input_positions_ptr[cur_query_id] = next_input_pos;
-
-      int const* seq_block_tables_ptr =
-          block_tables_ptr + block_tables_stride * cur_query_id;
-
-      int block_index = next_input_pos / block_size;
-      int block_offset = next_input_pos % block_size;
-
-      // Update paged_kv_last_page_len
-      paged_kv_last_page_len_ptr[cur_query_id] = block_offset + 1;
-
-      int slot_num =
-          seq_block_tables_ptr[block_index] * block_size + block_offset;
-      // Update slot_mapping
-      slot_mapping_ptr[cur_query_id] = slot_num;
-      block_table_bound_ptr[cur_query_id] = div_ceil(next_seq_len, block_size);
-    }
-  }
-}
-
-__global__ void advance_step_flashinfer_indptr_kernel(
-    int num_threads, int num_seqs, int num_queries, int* paged_kv_indptr_ptr,
-    int* block_table_bound_ptr) {
-  int idx = blockIdx.x * num_threads + threadIdx.x;
-  // Update paged_kv_indptr
-  if (idx == 0) {
-    paged_kv_indptr_ptr[idx] = 0;
-  }
-  if (idx < num_queries) {
-    int sum = 0;
-    for (int i = 0; i <= idx; ++i) {
-      sum += block_table_bound_ptr[i];
-    }
-    paged_kv_indptr_ptr[idx + 1] = sum;
-  }
-}
-
-__global__ void advance_step_flashinfer_indices_kernel(
-    int num_seqs, int num_queries, int const* block_tables_ptr,
-    int64_t const max_num_blocks_per_seq, int* paged_kv_indices_ptr,
-    int* paged_kv_indptr_ptr, int* block_table_bound_ptr) {
-  // note: max_num_blocks_per_seq = block_tables.stride(0)
-  int tid = blockIdx.x * blockDim.x + threadIdx.x;
-
-  // when cuda graphs are enabled, paged_kv_indptr tensor
-  // has to be updated for the padded queries
-  // tid represents a query# for paged_kv_indptr tensor
-  if (num_queries < tid && tid <= num_seqs) {
-    paged_kv_indptr_ptr[tid] = paged_kv_indptr_ptr[num_queries];
-  }
-
-  // each thread processes a block_ptr in block_tables
-  // block_tables shape: [num_queries, max_num_blocks_per_seq]
-  // paged_kv_indices is flattened block_tables.
-  for (int idx = tid; idx < (num_seqs * max_num_blocks_per_seq);
-       idx += (gridDim.x * blockDim.x)) {
-    // block_tables-row = paged_kv_indptr[queryNum]
-    int queryNum = idx / max_num_blocks_per_seq;
-    int col = idx % max_num_blocks_per_seq;
-    if (queryNum < num_queries && col < block_table_bound_ptr[queryNum]) {
-      int indices_arr_idx = paged_kv_indptr_ptr[queryNum] + col;
-      int block_tables_idx = queryNum * max_num_blocks_per_seq + col;
-      paged_kv_indices_ptr[indices_arr_idx] =
-          block_tables_ptr[block_tables_idx];
-    }
-  }
-}
-
-void advance_step_flashattn(int num_seqs, int num_queries, int block_size,
-                            torch::Tensor& input_tokens,       // type: long
-                            torch::Tensor& sampled_token_ids,  // type: long
-                            torch::Tensor& input_positions,    // type: long
-                            torch::Tensor& seq_lens,           // type: int
-                            torch::Tensor& slot_mapping,       // type: long
-                            torch::Tensor& block_tables) {     // type: int
-
-  if (logging) {
-    printf("advance_step_flashattn:\n");
-    printf("  num_seqs = %d\n", num_seqs);
-    printf("  num_queries = %d\n", num_queries);
-    printf("  block_size = %d\n", block_size);
-  }
-  // Verify all tensors
-  verify_tensor("input_tokens", input_tokens, num_seqs, -1, at::kLong);
-  verify_tensor("sampled_token_ids", sampled_token_ids, num_queries, 1,
-                at::kLong);
-  verify_tensor("input_positions", input_positions, num_seqs, -1, at::kLong);
-  verify_tensor("seq_lens", seq_lens, num_seqs, -1, at::kInt);
-  verify_tensor("slot_mapping", slot_mapping, num_seqs, -1, at::kLong);
-  verify_tensor("block_tables", block_tables, num_seqs, -1, at::kInt);
-
-  int dev = sampled_token_ids.get_device();
-  cudaStream_t stream = at::cuda::getCurrentCUDAStream(dev);
-
-  int blocks;
-  cudaDeviceGetAttribute(&blocks, cudaDevAttrMultiProcessorCount, dev);
-
-  advance_step_flashattn_kernel<max_threads>
-      <<<blocks, max_threads, 0, stream>>>(
-          num_seqs, num_queries, block_size,
-          reinterpret_cast<long*>(input_tokens.data_ptr()),
-          reinterpret_cast<long const*>(sampled_token_ids.data_ptr()),
-          reinterpret_cast<long*>(input_positions.data_ptr()),
-          reinterpret_cast<int*>(seq_lens.data_ptr()),
-          reinterpret_cast<long*>(slot_mapping.data_ptr()),
-          reinterpret_cast<int const*>(block_tables.data_ptr()),
-          block_tables.stride(0));
-}
-
-void advance_step_flashinfer(
-    int num_seqs, int num_queries, int block_size,
-    torch::Tensor& input_tokens,            // type: long
-    torch::Tensor& sampled_token_ids,       // type: long
-    torch::Tensor& input_positions,         // type: long
-    torch::Tensor& seq_lens,                // type: int
-    torch::Tensor& slot_mapping,            // type: long
-    torch::Tensor& block_tables,            // type: int
-    torch::Tensor& paged_kv_indices,        // type: int
-    torch::Tensor& paged_kv_indptr,         // type: int
-    torch::Tensor& paged_kv_last_page_len,  // type: int
-    torch::Tensor& block_table_bound) {     // type: int
-
-  if (logging) {
-    printf("advance_step_flashinfer:\n");
-    printf("  num_seqs = %d\n", num_seqs);
-    printf("  num_queries = %d\n", num_queries);
-    printf("  block_size = %d\n", block_size);
-    printf("  block_tables.stride(0) = %zu\n", block_tables.stride(0));
-  }
-  // Verify all tensors
-  verify_tensor("input_tokens", input_tokens, num_seqs, -1, at::kLong);
-  // verify_tensor("sampled_token_ids", sampled_token_ids, num_queries, 1,
-  //               at::kLong);
-  verify_tensor("input_positions", input_positions, num_seqs, -1, at::kLong);
-  verify_tensor("seq_lens", seq_lens, num_seqs, -1, at::kInt);
-  verify_tensor("slot_mapping", slot_mapping, num_seqs, -1, at::kLong);
-  verify_tensor("block_tables", block_tables, num_seqs, -1, at::kInt);
-
-  verify_tensor("paged_kv_indices", paged_kv_indices, -1, -1, at::kInt);
-  verify_tensor("paged_kv_indptr", paged_kv_indptr, num_seqs + 1, -1, at::kInt);
-  verify_tensor("paged_kv_last_page_len", paged_kv_last_page_len, num_seqs, -1,
-                at::kInt);
-
-  verify_tensor("block_table_bound", block_table_bound, num_seqs, -1, at::kInt);
-
-  int dev = sampled_token_ids.get_device();
-  cudaStream_t stream = at::cuda::getCurrentCUDAStream(dev);
-
-  int blocks;
-  int threads;
-  cudaDeviceGetAttribute(&blocks, cudaDevAttrMultiProcessorCount, dev);
-  cudaDeviceGetAttribute(&threads, cudaDevAttrMaxThreadsPerBlock, dev);
-
-  TORCH_CHECK((blocks * threads > num_queries),
-              "multi-step: not enough threads to map to num_queries = ",
-              num_queries, " block_tables.stride(0) = ", block_tables.stride(0),
-              " blocks = ", blocks, " max_threads = ", threads);
-  if (logging) {
-    printf("launching kernels with %d blocks and %d threads\n", blocks,
-           threads);
-  }
-  advance_step_flashinfer_kernel<<<blocks, threads, 0, stream>>>(
-      threads, num_seqs, num_queries, block_size,
-      reinterpret_cast<long*>(input_tokens.data_ptr()),
-      reinterpret_cast<long const*>(sampled_token_ids.data_ptr()),
-      reinterpret_cast<long*>(input_positions.data_ptr()),
-      reinterpret_cast<int*>(seq_lens.data_ptr()),
-      reinterpret_cast<long*>(slot_mapping.data_ptr()),
-      reinterpret_cast<int const*>(block_tables.data_ptr()),
-      block_tables.stride(0),
-      reinterpret_cast<int*>(paged_kv_last_page_len.data_ptr()),
-      reinterpret_cast<int*>(block_table_bound.data_ptr()));
-
-  advance_step_flashinfer_indptr_kernel<<<blocks, threads, 0, stream>>>(
-      threads, num_seqs, num_queries,
-      reinterpret_cast<int*>(paged_kv_indptr.data_ptr()),
-      reinterpret_cast<int*>(block_table_bound.data_ptr()));
-
-  advance_step_flashinfer_indices_kernel<<<blocks, threads, 0, stream>>>(
-      num_seqs, num_queries,
-      reinterpret_cast<int const*>(block_tables.data_ptr()),
-      block_tables.stride(0),
-      reinterpret_cast<int*>(paged_kv_indices.data_ptr()),
-      reinterpret_cast<int*>(paged_kv_indptr.data_ptr()),
-      reinterpret_cast<int*>(block_table_bound.data_ptr()));
-}
-
-}  // namespace prepare_inputs
-
-void advance_step_flashattn(int64_t num_seqs, int64_t num_queries,
-                            int64_t block_size, torch::Tensor& input_tokens,
-                            torch::Tensor& sampled_token_ids,
-                            torch::Tensor& input_positions,
-                            torch::Tensor& seq_lens,
-                            torch::Tensor& slot_mapping,
-                            torch::Tensor& block_tables) {
-  prepare_inputs::advance_step_flashattn(
-      num_seqs, num_queries, block_size, input_tokens, sampled_token_ids,
-      input_positions, seq_lens, slot_mapping, block_tables);
-}
-
-void advance_step_flashinfer(
-    int64_t num_seqs, int64_t num_queries, int64_t block_size,
-    torch::Tensor& input_tokens, torch::Tensor& sampled_token_ids,
-    torch::Tensor& input_positions, torch::Tensor& seq_lens,
-    torch::Tensor& slot_mapping, torch::Tensor& block_tables,
-    torch::Tensor& paged_kv_indices, torch::Tensor& paged_kv_indptr,
-    torch::Tensor& paged_kv_last_page_len, torch::Tensor& block_table_bound) {
-  prepare_inputs::advance_step_flashinfer(
-      num_seqs, num_queries, block_size, input_tokens, sampled_token_ids,
-      input_positions, seq_lens, slot_mapping, block_tables, paged_kv_indices,
-      paged_kv_indptr, paged_kv_last_page_len, block_table_bound);
-}
--- a/csrc/prepare_inputs/advance_step.cuh
+++ b/csrc/prepare_inputs/advance_step.cuh
@ -1,19 +0,0 @@
-#pragma once
-
-#include <torch/all.h>
-
-#include <ATen/cuda/CUDAContext.h>
-#include <c10/cuda/CUDAGuard.h>
-#include <cuda.h>
-#include <cuda_fp16.h>
-#include <cuda_runtime.h>
-#include <iostream>
-
-namespace prepare_inputs {
-
-static constexpr int max_threads = 256;
-static constexpr bool logging = false;
-
-constexpr int div_ceil(int a, int b) { return (a + b - 1) / b; }
-
-}  // namespace prepare_inputs
--- a/csrc/quantization/gptq_marlin/dequant.h
+++ b/csrc/quantization/gptq_marlin/dequant.h
@ -470,11 +470,12 @@ __device__ inline void dequant<nv_bfloat162, vllm::kFE2M1f.id(), false>(
  frag_b[0] = __hmul2(frag_b[0], bias_reg);
 }

-template <typename scalar_t2>
+template <typename scalar_t2, vllm::ScalarTypeId s_type_id>
 __device__ inline void dequant_fp8_scales(int q, scalar_t2* frag_b);

 template <>
-__device__ inline void dequant_fp8_scales<half2>(int q, half2* frag_b) {
+__device__ inline void dequant_fp8_scales<half2, vllm::kFE4M3fn.id()>(
+    int q, half2* frag_b) {
  int Out1 = (q & 0xFF00FF00) >> 1;
  ;
  q <<= 8;
@ -486,8 +487,8 @@ __device__ inline void dequant_fp8_scales<half2>(int q, half2* frag_b) {
 };

 template <>
-__device__ inline void dequant_fp8_scales<nv_bfloat162>(int q,
-                                                        nv_bfloat162* frag_b) {
+__device__ inline void dequant_fp8_scales<nv_bfloat162, vllm::kFE4M3fn.id()>(
+    int q, nv_bfloat162* frag_b) {
  constexpr int FP8_EXPONENT = 4, BF16_EXPONENT = 8;
  constexpr int RIGHT_SHIFT = BF16_EXPONENT - FP8_EXPONENT;
  constexpr int MASK = 0x7F007F00;
@ -502,6 +503,20 @@ __device__ inline void dequant_fp8_scales<nv_bfloat162>(int q,
  frag_b[0] = *reinterpret_cast<const nv_bfloat162*>(&Out2);
 }

+template <>
+__device__ inline void dequant_fp8_scales<nv_bfloat162, vllm::kFE8M0fnu.id()>(
+    int q, nv_bfloat162* frag_b) {
+  // In this conversion, 2 ** -127 in FP8E8M0 would become 0 in BF16,
+  // but we assume that such a extreme value would not occur in real models.
+  int Out1 = (q & 0xFF00FF00) >> 1;
+  q <<= 7;
+  int Out2 = q & 0x7F807F80;
+
+  // Note: reverse indexing is intentional because weights are permuted
+  frag_b[1] = *reinterpret_cast<const nv_bfloat162*>(&Out1);
+  frag_b[0] = *reinterpret_cast<const nv_bfloat162*>(&Out2);
+}
+
 #endif

 }  // namespace MARLIN_NAMESPACE_NAME
--- a/csrc/quantization/gptq_marlin/generate_kernels.py
+++ b/csrc/quantization/gptq_marlin/generate_kernels.py
@ -20,6 +20,7 @@ namespace MARLIN_NAMESPACE_NAME {
 TEMPLATE = ("template __global__ void Marlin<"
            "{{scalar_t}}, "
            "{{w_type_id}}, "
+            "{{s_type_id}}, "
            "{{threads}}, "
            "{{thread_m_blocks}}, "
            "{{thread_n_blocks}}, "
@ -78,7 +79,8 @@ def generate_new_kernels():
            if scalar_type == "vllm::kFE4M3fn" and group_blocks not in [-1, 8]:
                continue
            # nvfp4 only supports group_size == 16
-            if scalar_type == "vllm::kFE2M1f" and group_blocks != 1:
+            # mxfp4 only supports group_size == 32
+            if scalar_type == "vllm::kFE2M1f" and group_blocks not in [1, 2]:
                continue
            # other quantization methods don't support group_size = 16
            if scalar_type != "vllm::kFE2M1f" and group_blocks == 1:
@ -97,10 +99,23 @@ def generate_new_kernels():
                # 4bit quantization and fp16
                is_zp_float_list.append(True)

+            if scalar_type == "vllm::kFE2M1f" and group_blocks == 1:
+                s_type = "vllm::kFE4M3fn"
+            elif scalar_type == "vllm::kFE2M1f" and group_blocks == 2:
+                s_type = "vllm::kFE8M0fnu"
+                if dtype == "fp16":
+                    # we cannot safely dequantize e8m0 to fp16, so skip this
+                    continue
+            elif dtype == "fp16":
+                s_type = "vllm::kFloat16"
+            elif dtype == "bf16":
+                s_type = "vllm::kBFloat16"
+
            for is_zp_float in is_zp_float_list:
                template_str = jinja2.Template(TEMPLATE).render(
                    scalar_t=c_dtype,
                    w_type_id=scalar_type + ".id()",
+                    s_type_id=s_type + ".id()",
                    threads=threads,
                    thread_m_blocks=max(m_blocks, 1),
                    thread_n_blocks=n_blocks,
--- a/csrc/quantization/gptq_marlin/gptq_marlin.cu
+++ b/csrc/quantization/gptq_marlin/gptq_marlin.cu
@ -48,7 +48,8 @@ __global__ void permute_cols_kernel(int4 const* __restrict__ a_int4_ptr,

 torch::Tensor gptq_marlin_gemm(
    torch::Tensor& a, std::optional<torch::Tensor> c_or_none,
-    torch::Tensor& b_q_weight, torch::Tensor& b_scales,
+    torch::Tensor& b_q_weight,
+    std::optional<torch::Tensor> const& b_bias_or_none, torch::Tensor& b_scales,
    std::optional<torch::Tensor> const& b_zeros_or_none,
    std::optional<torch::Tensor> const& g_idx_or_none,
    std::optional<torch::Tensor> const& perm_or_none, torch::Tensor& workspace,
@ -187,7 +188,12 @@ int get_kernel_cache_size(thread_config_t const& th_config, int thread_m_blocks,
  int tb_m = thread_m_blocks * 16;
  int sh_a_size = pipe_stages * (tb_m * tb_k) * 2;
  int sh_b_size = pipe_stages * (tb_k * tb_n / pack_factor) * 4;
-  int sh_red_size = tb_m * (tb_n + 8);
+  int sh_red_size = tb_m * (tb_n + 8) * 2;
+  int sh_bias_size = tb_n * 2;
+  int tmp_size =
+      (sh_b_size > sh_red_size ? sh_red_size : sh_b_size) + sh_bias_size;
+  tmp_size = max(max(sh_b_size, sh_red_size), tmp_size);
+
  int sh_s_size =
      get_scales_cache_size(th_config, prob_m, prob_n, prob_k, num_bits,
                            group_size, has_act_order, is_k_full);
@ -202,8 +208,8 @@ int get_kernel_cache_size(thread_config_t const& th_config, int thread_m_blocks,
      sh_zp_size = sh_s_size / 2;
  }

-  int total_size = max(sh_b_size, sh_red_size) + sh_a_size + sh_s_size +
-                   sh_zp_size + sh_g_idx_size;
+  int total_size =
+      tmp_size + sh_a_size + sh_s_size + sh_zp_size + sh_g_idx_size;

  return total_size;
 }
@ -237,20 +243,25 @@ bool is_valid_config(thread_config_t const& th_config, int thread_m_blocks,
  int cache_size = get_kernel_cache_size(
      th_config, thread_m_blocks, prob_m, prob_n, prob_k, num_bits, group_size,
      has_act_order, is_k_full, has_zp, is_zp_float);
-  return cache_size <= max_shared_mem;
+  return cache_size + 512 <= max_shared_mem;
 }

-  #define _GET_IF(W_TYPE, THREAD_M_BLOCKS, THREAD_N_BLOCKS, THREAD_K_BLOCKS, \
-                  M_BLOCK_SIZE_8, GROUP_BLOCKS, NUM_THREADS, IS_ZP_FLOAT)    \
-    else if (q_type == W_TYPE && thread_m_blocks == THREAD_M_BLOCKS &&       \
-             thread_n_blocks == THREAD_N_BLOCKS &&                           \
-             thread_k_blocks == THREAD_K_BLOCKS &&                           \
-             m_block_size_8 == M_BLOCK_SIZE_8 &&                             \
-             group_blocks == GROUP_BLOCKS && num_threads == NUM_THREADS &&   \
-             is_zp_float == IS_ZP_FLOAT) {                                   \
-      kernel = Marlin<scalar_t, W_TYPE.id(), NUM_THREADS, THREAD_M_BLOCKS,   \
-                      THREAD_N_BLOCKS, THREAD_K_BLOCKS, M_BLOCK_SIZE_8,      \
-                      pipe_stages, GROUP_BLOCKS, IS_ZP_FLOAT>;               \
+  #define _GET_IF(W_TYPE, THREAD_M_BLOCKS, THREAD_N_BLOCKS, THREAD_K_BLOCKS,   \
+                  M_BLOCK_SIZE_8, GROUP_BLOCKS, NUM_THREADS, IS_ZP_FLOAT)      \
+    else if (q_type == W_TYPE && thread_m_blocks == THREAD_M_BLOCKS &&         \
+             thread_n_blocks == THREAD_N_BLOCKS &&                             \
+             thread_k_blocks == THREAD_K_BLOCKS &&                             \
+             m_block_size_8 == M_BLOCK_SIZE_8 &&                               \
+             group_blocks == GROUP_BLOCKS && num_threads == NUM_THREADS &&     \
+             is_zp_float == IS_ZP_FLOAT) {                                     \
+      constexpr auto S_TYPE =                                                  \
+          W_TYPE == vllm::kFE2M1f                                              \
+              ? (GROUP_BLOCKS == 1 ? vllm::kFE4M3fn : vllm::kFE8M0fnu)         \
+              : (std::is_same<scalar_t, half>::value ? vllm::kFloat16          \
+                                                     : vllm::kBFloat16);       \
+      kernel = Marlin<scalar_t, W_TYPE.id(), S_TYPE.id(), NUM_THREADS,         \
+                      THREAD_M_BLOCKS, THREAD_N_BLOCKS, THREAD_K_BLOCKS,       \
+                      M_BLOCK_SIZE_8, pipe_stages, GROUP_BLOCKS, IS_ZP_FLOAT>; \
    }

  // COMMON: cases for (group_blocks in [-1, 2, 4, 8] and is_zp_float == false)
@ -315,22 +326,39 @@ bool is_valid_config(thread_config_t const& th_config, int thread_m_blocks,
    BIGGROUP_GET_IF_M234(W_TYPE, 8, 4, 128)  \
    BIGGROUP_GET_IF_M234(W_TYPE, 4, 8, 128)

-  #define FP4_GET_IF_M1(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS)        \
+  #define NVFP4_GET_IF_M1(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS)      \
    _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, 1, NUM_THREADS, false) \
    _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 1, NUM_THREADS, false)

-  #define FP4_GET_IF_M234(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS)       \
+  #define NVFP4_GET_IF_M234(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS)     \
    _GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 1, NUM_THREADS, false) \
    _GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 1, NUM_THREADS, false) \
    _GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 1, NUM_THREADS, false)

-  #define FP4_GET_IF(W_TYPE)            \
-    FP4_GET_IF_M1(W_TYPE, 8, 8, 256)    \
-    FP4_GET_IF_M1(W_TYPE, 8, 4, 128)    \
-    FP4_GET_IF_M1(W_TYPE, 4, 8, 128)    \
-    FP4_GET_IF_M234(W_TYPE, 16, 4, 256) \
-    FP4_GET_IF_M234(W_TYPE, 8, 4, 128)  \
-    FP4_GET_IF_M234(W_TYPE, 4, 8, 128)
+  #define NVFP4_GET_IF(W_TYPE)            \
+    NVFP4_GET_IF_M1(W_TYPE, 8, 8, 256)    \
+    NVFP4_GET_IF_M1(W_TYPE, 8, 4, 128)    \
+    NVFP4_GET_IF_M1(W_TYPE, 4, 8, 128)    \
+    NVFP4_GET_IF_M234(W_TYPE, 16, 4, 256) \
+    NVFP4_GET_IF_M234(W_TYPE, 8, 4, 128)  \
+    NVFP4_GET_IF_M234(W_TYPE, 4, 8, 128)
+
+  #define MXFP4_GET_IF_M1(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS)      \
+    _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, 2, NUM_THREADS, false) \
+    _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS, false)
+
+  #define MXFP4_GET_IF_M234(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS)     \
+    _GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS, false) \
+    _GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS, false) \
+    _GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS, false)
+
+  #define MXFP4_GET_IF(W_TYPE)            \
+    MXFP4_GET_IF_M1(W_TYPE, 8, 8, 256)    \
+    MXFP4_GET_IF_M1(W_TYPE, 8, 4, 128)    \
+    MXFP4_GET_IF_M1(W_TYPE, 4, 8, 128)    \
+    MXFP4_GET_IF_M234(W_TYPE, 16, 4, 256) \
+    MXFP4_GET_IF_M234(W_TYPE, 8, 4, 128)  \
+    MXFP4_GET_IF_M234(W_TYPE, 4, 8, 128)

  // We currently have 4-bit models only with group_blocks == 4
  #define FZP_GET_IF_M1(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS)       \
@ -384,7 +412,7 @@ MarlinFuncPtr get_marlin_kernel(const vllm::ScalarType q_type,
  COMMON_GET_IF(vllm::kU4B8)
  COMMON_GET_IF(vllm::kU8B128)

-  FP4_GET_IF(vllm::kFE2M1f)
+  NVFP4_GET_IF(vllm::kFE2M1f)

  BIGGROUP_GET_IF(vllm::kFE4M3fn)

@ -396,6 +424,11 @@ MarlinFuncPtr get_marlin_kernel(const vllm::ScalarType q_type,
    }
    FZP_GET_IF(vllm::kU4)
  }
+  if (std::is_same<scalar_t, nv_bfloat16>::value) {
+    if (false) {
+    }
+    MXFP4_GET_IF(vllm::kFE2M1f)
+  }

  return kernel;
 }
@ -453,12 +486,12 @@ exec_config_t determine_exec_config(const vllm::ScalarType& q_type, int prob_m,
 }

 template <typename scalar_t>
-void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* s,
-               void* s2, void* zp, void* g_idx, void* perm, void* a_tmp,
-               int prob_m, int prob_n, int prob_k, int lda, void* workspace,
-               vllm::ScalarType const& q_type, bool has_act_order,
-               bool is_k_full, bool has_zp, int num_groups, int group_size,
-               int dev, cudaStream_t stream, int thread_k_init,
+void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias,
+               void* s, void* s2, void* zp, void* g_idx, void* perm,
+               void* a_tmp, int prob_m, int prob_n, int prob_k, int lda,
+               void* workspace, vllm::ScalarType const& q_type, bool has_bias,
+               bool has_act_order, bool is_k_full, bool has_zp, int num_groups,
+               int group_size, int dev, cudaStream_t stream, int thread_k_init,
               int thread_n_init, int sms, bool use_atomic_add,
               bool use_fp32_reduce, bool is_zp_float) {
  if (has_zp) {
@ -503,6 +536,7 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* s,
  const int4* B_ptr = (const int4*)B;
  int4* C_ptr = (int4*)C;
  int4* C_tmp_ptr = (int4*)C_tmp;
+  const int4* bias_ptr = (const int4*)b_bias;
  const int4* s_ptr = (const int4*)s;
  const uint16_t* s2_ptr = (const uint16_t*)s2;
  const int4* zp_ptr = (const int4*)zp;
@ -623,8 +657,9 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* s,
    // avoid ">>>" being formatted to "> > >"
    // clang-format off
    kernel<<<blocks, num_threads, max_shared_mem_new, stream>>>(
-        A_ptr, B_ptr, C_ptr, C_tmp_ptr, s_ptr, s2_ptr, zp_ptr, g_idx_ptr, num_groups,
-        prob_m_split, prob_n, prob_k, lda, locks, part_use_atomic_add,
+        A_ptr, B_ptr, C_ptr, C_tmp_ptr, bias_ptr, s_ptr, s2_ptr, zp_ptr,
+        g_idx_ptr, num_groups,
+        prob_m_split, prob_n, prob_k, lda, locks, has_bias, part_use_atomic_add,
        use_fp32_reduce, max_shared_mem_new);
    // clang-format on

@ -638,7 +673,8 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* s,

 torch::Tensor gptq_marlin_gemm(
    torch::Tensor& a, std::optional<torch::Tensor> c_or_none,
-    torch::Tensor& b_q_weight, torch::Tensor& b_scales,
+    torch::Tensor& b_q_weight,
+    std::optional<torch::Tensor> const& b_bias_or_none, torch::Tensor& b_scales,
    std::optional<torch::Tensor> const& global_scale_or_none,
    std::optional<torch::Tensor> const& b_zeros_or_none,
    std::optional<torch::Tensor> const& g_idx_or_none,
@ -785,12 +821,24 @@ torch::Tensor gptq_marlin_gemm(
  torch::Tensor global_scale;
  if (global_scale_or_none.has_value()) {
    global_scale = global_scale_or_none.value();
-    TORCH_CHECK(b_q_type == vllm::kFE2M1f,
-                "global_scale can only be used for float4_e2m1f.");
+    TORCH_CHECK(b_q_type == vllm::kFE2M1f && group_size == 16,
+                "global_scale can only be used for nvfp4 format.");
  } else {
    global_scale = torch::empty({0}, options);
-    TORCH_CHECK(!(b_q_type == vllm::kFE2M1f),
-                "the global_scale parameter must be passed for float4_e2m1f.");
+    TORCH_CHECK(!(b_q_type == vllm::kFE2M1f && group_size == 16),
+                "the global_scale parameter must be passed for nvfp4 format.");
+  }
+
+  bool has_bias = b_bias_or_none.has_value();
+  torch::Tensor b_bias;
+  if (has_bias) {
+    b_bias = b_bias_or_none.value();
+    TORCH_CHECK(b_bias.device().is_cuda(), "b_bias is not on GPU");
+    TORCH_CHECK(b_bias.is_contiguous(), "b_bias is not contiguous");
+    TORCH_CHECK(b_bias.size(0) == size_n, "b_bias.size(0) != size_n");
+    TORCH_CHECK(b_bias.stride(0) == 1, "b_bias.stride(0) != 1");
+  } else {
+    b_bias = torch::empty({0}, options);
  }

  torch::Tensor b_zeros;
@ -857,34 +905,50 @@ torch::Tensor gptq_marlin_gemm(
  if (a.scalar_type() == at::ScalarType::Half) {
    void* scales_ptr;
    if (b_q_type == vllm::kFE2M1f) {
-      scales_ptr = b_scales.data_ptr<at::Float8_e4m3fn>();
+      if (group_size == 16)
+        scales_ptr = b_scales.data_ptr<at::Float8_e4m3fn>();
+      else if (group_size == 32)
+        scales_ptr = b_scales.data_ptr<at::Float8_e8m0fnu>();
+      else
+        TORCH_CHECK(false,
+                    "float4_e2m1f only supports group_size == 16 (NVFP4) ",
+                    "and group_size == 32 (MXFP4)");
    } else {
      scales_ptr = b_scales.data_ptr<at::Half>();
    }

    marlin::marlin_mm<half>(
        a.data_ptr<at::Half>(), b_q_weight.data_ptr(), c.data_ptr<at::Half>(),
-        c_tmp.data_ptr<float>(), scales_ptr, global_scale.data_ptr<at::Half>(),
-        b_zeros.data_ptr(), g_idx.data_ptr(), perm.data_ptr(),
-        a_tmp.data_ptr<at::Half>(), size_m, size_n, size_k, a.stride(0),
-        workspace.data_ptr(), b_q_type, has_act_order, is_k_full, has_zp,
-        num_groups, group_size, dev, at::cuda::getCurrentCUDAStream(dev),
-        thread_k, thread_n, sms, use_atomic_add, use_fp32_reduce, is_zp_float);
+        c_tmp.data_ptr<float>(), b_bias.data_ptr<at::Half>(), scales_ptr,
+        global_scale.data_ptr<at::Half>(), b_zeros.data_ptr(), g_idx.data_ptr(),
+        perm.data_ptr(), a_tmp.data_ptr<at::Half>(), size_m, size_n, size_k,
+        a.stride(0), workspace.data_ptr(), b_q_type, has_bias, has_act_order,
+        is_k_full, has_zp, num_groups, group_size, dev,
+        at::cuda::getCurrentCUDAStream(dev), thread_k, thread_n, sms,
+        use_atomic_add, use_fp32_reduce, is_zp_float);
  } else if (a.scalar_type() == at::ScalarType::BFloat16) {
    void* scales_ptr;
    if (b_q_type == vllm::kFE2M1f) {
-      scales_ptr = b_scales.data_ptr<at::Float8_e4m3fn>();
+      if (group_size == 16)
+        scales_ptr = b_scales.data_ptr<at::Float8_e4m3fn>();
+      else if (group_size == 32)
+        scales_ptr = b_scales.data_ptr<at::Float8_e8m0fnu>();
+      else
+        TORCH_CHECK(false,
+                    "float4_e2m1f only supports group_size == 16 (NVFP4) ",
+                    "and group_size == 32 (MXFP4)");
    } else {
      scales_ptr = b_scales.data_ptr<at::BFloat16>();
    }

    marlin::marlin_mm<nv_bfloat16>(
        a.data_ptr<at::BFloat16>(), b_q_weight.data_ptr(),
-        c.data_ptr<at::BFloat16>(), c_tmp.data_ptr<float>(), scales_ptr,
+        c.data_ptr<at::BFloat16>(), c_tmp.data_ptr<float>(),
+        b_bias.data_ptr<at::BFloat16>(), scales_ptr,
        global_scale.data_ptr<at::BFloat16>(), b_zeros.data_ptr(),
        g_idx.data_ptr(), perm.data_ptr(), a_tmp.data_ptr<at::BFloat16>(),
        size_m, size_n, size_k, a.stride(0), workspace.data_ptr(), b_q_type,
-        has_act_order, is_k_full, has_zp, num_groups, group_size, dev,
+        has_bias, has_act_order, is_k_full, has_zp, num_groups, group_size, dev,
        at::cuda::getCurrentCUDAStream(dev), thread_k, thread_n, sms,
        use_atomic_add, use_fp32_reduce, is_zp_float);
  } else {
--- a/csrc/quantization/gptq_marlin/kernel.h
+++ b/csrc/quantization/gptq_marlin/kernel.h
@ -10,15 +10,18 @@
 #define MARLIN_KERNEL_PARAMS                                                   \
  const int4 *__restrict__ A, const int4 *__restrict__ B,                      \
      int4 *__restrict__ C, int4 *__restrict__ C_tmp,                          \
+      const int4 *__restrict__ b_bias_ptr,                                     \
      const int4 *__restrict__ scales_ptr,                                     \
      const uint16_t *__restrict__ scale2_ptr,                                 \
      const int4 *__restrict__ zp_ptr, const int *__restrict__ g_idx,          \
      int num_groups, int prob_m, int prob_n, int prob_k, int lda, int *locks, \
-      bool use_atomic_add, bool use_fp32_reduce, int max_shared_mem
+      bool has_bias, bool use_atomic_add, bool use_fp32_reduce,                \
+      int max_shared_mem

 namespace MARLIN_NAMESPACE_NAME {
 template <typename scalar_t,  // compute dtype, half or nv_float16
          const vllm::ScalarTypeId w_type_id,  // weight ScalarType id
+          const vllm::ScalarTypeId s_type_id,  // weight ScalarType id
          const int threads,          // number of threads in a threadblock
          const int thread_m_blocks,  // number of 16x16 blocks in the m
                                      // dimension (batchsize) of the
--- a/csrc/quantization/gptq_marlin/marlin_template.h
+++ b/csrc/quantization/gptq_marlin/marlin_template.h
@ -39,6 +39,7 @@ namespace MARLIN_NAMESPACE_NAME {

 template <typename scalar_t,  // compute dtype, half or nv_float16
          const vllm::ScalarTypeId w_type_id,  // weight ScalarType id
+          const vllm::ScalarTypeId s_type_id,  // weight scale ScalarType id
          const int threads,          // number of threads in a threadblock
          const int thread_m_blocks,  // number of 16x16 blocks in the m
                                      // dimension (batchsize) of the
@ -271,6 +272,7 @@ __device__ inline void wait_negative_and_add(int* lock) {

 template <typename scalar_t,  // compute dtype, half or nv_float16
          const vllm::ScalarTypeId w_type_id,  // weight ScalarType id
+          const vllm::ScalarTypeId s_type_id,  // weight scale ScalarType id
          const int threads,          // number of threads in a threadblock
          const int thread_m_blocks,  // number of 16x16 blocks in the m
                                      // dimension (batchsize) of the
@ -290,6 +292,7 @@ __global__ void Marlin(
    const int4* __restrict__ B,  // 4bit quantized weight matrix of shape kxn
    int4* __restrict__ C,        // fp16 output buffer of shape mxn
    int4* __restrict__ C_tmp,    // fp32 tmp output buffer (for reduce)
+    const int4* __restrict__ b_bias_ptr,
    const int4* __restrict__ scales_ptr,  // fp16 quantization scales of shape
                                          // (k/groupsize)xn
    const uint16_t* __restrict__ scale2_ptr,  // fp16 global scale (for nvfp4
@ -297,12 +300,13 @@ __global__ void Marlin(
    const int4* __restrict__ zp_ptr,  // 4bit packed zero-points of shape
                                      // (k/groupsize)x(n/pack_factor)
    const int* __restrict__ g_idx,    // int32 group indices of shape k
-    int num_groups,        // number of scale groups per output channel
-    int prob_m,            // batch dimension m
-    int prob_n,            // output dimension n
-    int prob_k,            // reduction dimension k
-    int lda,               // A.stride(0), equal to prob_k is A is contiguous
-    int* locks,            // extra global storage for barrier synchronization
+    int num_groups,  // number of scale groups per output channel
+    int prob_m,      // batch dimension m
+    int prob_n,      // output dimension n
+    int prob_k,      // reduction dimension k
+    int lda,         // A.stride(0), equal to prob_k is A is contiguous
+    int* locks,      // extra global storage for barrier synchronization
+    bool has_bias,
    bool use_atomic_add,   // whether to use atomic add to reduce
    bool use_fp32_reduce,  // whether to use fp32 global reduce
    int max_shared_mem) {
@ -326,18 +330,29 @@ __global__ void Marlin(
  using FragZP = typename ScalarType<scalar_t>::FragZP;

  static constexpr auto w_type = vllm::ScalarType::from_id(w_type_id);
+  static constexpr auto s_type = vllm::ScalarType::from_id(s_type_id);
+  if constexpr (w_type == vllm::kFE2M1f) {
+    static_assert(s_type == vllm::kFE4M3fn && group_blocks == 1 ||
+                  s_type == vllm::kFE8M0fnu && group_blocks == 2);
+  } else if constexpr (std::is_same<scalar_t, nv_bfloat16>::value) {
+    static_assert(s_type == vllm::kBFloat16);
+  } else if constexpr (std::is_same<scalar_t, half>::value) {
+    static_assert(s_type == vllm::kFloat16);
+  }
+
  constexpr bool has_zp = w_type == vllm::kU4 || w_type == vllm::kU8;
  constexpr bool is_int_type = w_type == vllm::kU4 || w_type == vllm::kU8 ||
                               w_type == vllm::kU4B8 || w_type == vllm::kU8B128;
  // see comments of dequant.h for more details
  constexpr bool dequant_skip_flop =
-      !is_int_type ||
+      w_type == vllm::kFE4M3fn ||
+      w_type == vllm::kFE2M1f && s_type == vllm::kFE4M3fn ||
      has_zp && !is_zp_float && !std::is_same<scalar_t, nv_bfloat16>::value ||
      has_zp && !is_zp_float && !(w_type == vllm::kU8);

  scalar_t2 global_scale;
-
-  if constexpr (w_type == vllm::kFE2M1f) {
+  if constexpr (w_type == vllm::kFE2M1f && s_type == vllm::kFE4M3fn) {
+    // NVFP4 format requires global scale
    uint16_t val = scale2_ptr[0];
    global_scale = Dtype::num2num2(*reinterpret_cast<scalar_t*>(&val));
  }
@ -589,7 +604,7 @@ __global__ void Marlin(

    s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) +
              (threadIdx.x % 32) / 4;
-    s_sh_rd = s_sh_rd * 2 + warp_row % 2;
+    s_sh_rd = s_sh_rd * 2 + (warp_row / group_blocks) % 2;

  } else if constexpr (group_blocks != -1)
    s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) +
@ -602,6 +617,18 @@ __global__ void Marlin(
    s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) +
              (threadIdx.x % 32) % 4;

+  int bias_sh_rd;
+  if constexpr (m_block_size_8) {
+    bias_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) +
+                 (threadIdx.x % 32) / 8;
+  } else {
+    bias_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) +
+                 (threadIdx.x % 32) % 4;
+  }
+
+  int bias_sh_wr = threadIdx.x;
+  int bias_gl_rd = (thread_n_blocks * 16 / 8) * slice_col + threadIdx.x;
+
  // Zero-points have the same read layout as the scales
  // (without column-wise case)
  constexpr int num_col_threads = 8;
@ -670,7 +697,19 @@ __global__ void Marlin(
  constexpr int sh_b_size = stages * b_sh_stage;
  int4* sh_b = sh;
  int4* sh_red = sh;
-  int4* sh_g_idx = sh_b + (sh_red_size > sh_b_size ? sh_red_size : sh_b_size);
+
+  constexpr int sh_size_b_red_min =
+      (sh_red_size < sh_b_size ? sh_red_size : sh_b_size);
+  constexpr int sh_size_b_red_max =
+      (sh_red_size > sh_b_size ? sh_red_size : sh_b_size);
+  constexpr int sh_bias_size = (thread_n_blocks * 16 / 8);
+  constexpr int sh_b_red_bias_size =
+      sh_size_b_red_max > (sh_size_b_red_min + sh_bias_size)
+          ? sh_size_b_red_max
+          : (sh_size_b_red_min + sh_bias_size);
+
+  int4* sh_bias = sh + sh_size_b_red_min;
+  int4* sh_g_idx = sh + sh_b_red_bias_size;
  int4* sh_zp = sh_g_idx + (stages * g_idx_stage);
  constexpr int sh_s_size = has_act_order ? (act_s_max_num_groups * s_sh_stride)
                                          : (stages * s_sh_stage);
@ -680,15 +719,13 @@ __global__ void Marlin(
  static_assert(thread_m_blocks * 16 * thread_n_blocks * 16 / 8 <=
                stages * b_sh_stage);
  int4* sh_a = sh_s + sh_s_size;
-  // constexpr int shm_size_used =
-  //     stages * (g_idx_stage + zp_sh_stage) + sh_s_size +
-  //     (sh_red_size > sh_b_size ? sh_red_size : sh_b_size);

  // Register storage for double buffer of shared memory reads.
  FragA frag_a[2][thread_m_blocks];
  I4 frag_b_quant[2][b_thread_vecs];
  FragC frag_c[thread_m_blocks][4][2];
-  FragS frag_s[2][4];                    // No act-order
+  FragS frag_s[2][4];  // No act-order
+  FragS frag_bias[2][4];
  FragS act_frag_s[2][4][4];             // For act-order
  int frag_qzp[2][num_ints_per_thread];  // Zero-points
  FragZP frag_zp;                        // Zero-points in fp16
@ -923,10 +960,15 @@ __global__ void Marlin(
          if constexpr (w_type_id != vllm::kFE2M1f.id()) {
            reinterpret_cast<int4*>(&frag_s[k % 2])[0] =
                sh_s_stage[s_sh_rd + cur_group_id * s_sh_stride];
-          } else {
+          } else if constexpr (group_blocks == 1 || thread_k_blocks > 4) {
            reinterpret_cast<int2*>(&frag_s[k % 2])[0] =
                reinterpret_cast<int2*>(
                    sh_s_stage)[s_sh_rd + cur_group_id * (2 * s_sh_stride)];
+          } else {
+            reinterpret_cast<int2*>(&frag_s[k % 2])[0] =
+                reinterpret_cast<int2*>(
+                    sh_s_stage)[s_sh_rd + cur_group_id * (2 * s_sh_stride) +
+                                k % 2];
          }
        }
      }
@ -1139,9 +1181,9 @@ __global__ void Marlin(
      int s_quant_0 = reinterpret_cast<int*>(frag_s[k2])[0];
      int s_quant_1 = reinterpret_cast<int*>(frag_s[k2])[1];

-      dequant_fp8_scales<scalar_t2>(s_quant_0,
-                                    reinterpret_cast<scalar_t2*>(&frag_s[k2]));
-      dequant_fp8_scales<scalar_t2>(
+      dequant_fp8_scales<scalar_t2, s_type_id>(
+          s_quant_0, reinterpret_cast<scalar_t2*>(&frag_s[k2]));
+      dequant_fp8_scales<scalar_t2, s_type_id>(
          s_quant_1, reinterpret_cast<scalar_t2*>(&frag_s[k2]) + 2);
    }

@ -1411,7 +1453,7 @@ __global__ void Marlin(
  // Write out the reduce final result in the correct layout. We only actually
  // reshuffle matrix fragments in this step, the reduction above is performed
  // in fragment layout.
-  auto write_result = [&]() {
+  auto write_result = [&](bool last) {
    int c_gl_stride = prob_n / 8;
    constexpr int c_sh_stride = 2 * thread_n_blocks + 1;
    int c_gl_wr_delta = c_gl_stride * (threads / (2 * thread_n_blocks));
@ -1438,7 +1480,7 @@ __global__ void Marlin(
    int c_gl_wr_end = c_gl_stride * prob_m;
    // We first reorder in shared memory to guarantee the most efficient final
    // global write patterns
-    auto write = [&](int idx, float c0, float c1, FragS& s) {
+    auto write = [&](int idx, float c0, float c1, FragS& s, FragS& b_bias) {
      scalar_t2 res =
          Dtype::nums2num2(Dtype::float2num(c0), Dtype::float2num(c1));

@ -1447,12 +1489,25 @@ __global__ void Marlin(
      if constexpr (!has_act_order && group_blocks == -1 &&
                    w_type.size_bits() == 4 &&
                    (has_zp && dequant_skip_flop || !has_zp)) {
-        res = __hmul2(res, s[0]);
+        scalar_t2 tmp_scale = s[0];
+        if constexpr (m_block_size_8) {
+          tmp_scale = Dtype::num2num2(
+              reinterpret_cast<scalar_t*>(&s[0])[(threadIdx.x % 8) / 4]);
+        }
+        res = __hmul2(res, tmp_scale);
      }

-      if constexpr (w_type == vllm::kFE2M1f) {
+      if constexpr (w_type == vllm::kFE2M1f && s_type == vllm::kFE4M3fn) {
        res = __hmul2(res, global_scale);
      }
+      if (has_bias && last) {
+        scalar_t2 tmp_bias = b_bias[0];
+        if constexpr (m_block_size_8) {
+          tmp_bias = Dtype::num2num2(
+              reinterpret_cast<scalar_t*>(&b_bias[0])[(threadIdx.x % 8) / 4]);
+        }
+        res = __hadd2(res, tmp_bias);
+      }

      if constexpr (m_block_size_8) {
        ((scalar_t*)sh_red)[idx] = res.x;
@ -1470,19 +1525,25 @@ __global__ void Marlin(
          if constexpr (m_block_size_8) {
            int wr = c_sh_wr + 16 * j;
            write(wr, frag_c[i][j][0][0], frag_c[i][j][0][1],
-                  frag_s[j / 2][2 * (j % 2) + 0]);
+                  frag_s[j / 2][2 * (j % 2) + 0],
+                  frag_bias[j / 2][2 * (j % 2) + 0]);
            write(wr + 8, frag_c[i][j][0][2], frag_c[i][j][0][3],
-                  frag_s[j / 2][2 * (j % 2) + 1]);
+                  frag_s[j / 2][2 * (j % 2) + 1],
+                  frag_bias[j / 2][2 * (j % 2) + 1]);
          } else {
            int wr = c_sh_wr + 8 * j;
            write(wr + (4 * c_sh_stride) * 0 + 0, frag_c[i][j][0][0],
-                  frag_c[i][j][0][1], frag_s[j / 2][2 * (j % 2) + 0]);
+                  frag_c[i][j][0][1], frag_s[j / 2][2 * (j % 2) + 0],
+                  frag_bias[j / 2][2 * (j % 2) + 0]);
            write(wr + (4 * c_sh_stride) * 8 + 0, frag_c[i][j][0][2],
-                  frag_c[i][j][0][3], frag_s[j / 2][2 * (j % 2) + 0]);
+                  frag_c[i][j][0][3], frag_s[j / 2][2 * (j % 2) + 0],
+                  frag_bias[j / 2][2 * (j % 2) + 0]);
            write(wr + (4 * c_sh_stride) * 0 + 4, frag_c[i][j][1][0],
-                  frag_c[i][j][1][1], frag_s[j / 2][2 * (j % 2) + 1]);
+                  frag_c[i][j][1][1], frag_s[j / 2][2 * (j % 2) + 1],
+                  frag_bias[j / 2][2 * (j % 2) + 1]);
            write(wr + (4 * c_sh_stride) * 8 + 4, frag_c[i][j][1][2],
-                  frag_c[i][j][1][3], frag_s[j / 2][2 * (j % 2) + 1]);
+                  frag_c[i][j][1][3], frag_s[j / 2][2 * (j % 2) + 1],
+                  frag_bias[j / 2][2 * (j % 2) + 1]);
          }
        }
        c_sh_wr += 16 * (4 * c_sh_stride);
@ -1622,6 +1683,14 @@ __global__ void Marlin(
      }

      thread_block_reduce();
+
+      if (has_bias && last) {
+        __syncthreads();
+        cp_async4_pred(&sh_bias[bias_sh_wr], &b_bias_ptr[bias_gl_rd],
+                       threadIdx.x < 16 * thread_n_blocks / 8);
+        cp_async_fence();
+      }
+
      if constexpr (!has_act_order && group_blocks == -1 &&
                    (has_zp && dequant_skip_flop || !has_zp)) {
        if (w_type.size_bits() == 8 || (last || use_atomic_add)) {
@ -1684,11 +1753,20 @@ __global__ void Marlin(
        }
        barrier_release(&locks[locks_off], last);
      }
+
+      if (has_bias && last) {
+        cp_async_wait<0>();
+        __syncthreads();
+        reinterpret_cast<int4*>(&frag_bias)[0] = sh_bias[bias_sh_rd];
+        reinterpret_cast<int4*>(&frag_bias)[1] = sh_bias[bias_sh_rd + 4];
+        __syncthreads();
+      }
+
      if (use_atomic_add && slice_count > 1 && slice_idx != 0)
        wait_negative_and_add(&locks[locks_off]);
      if (last || use_atomic_add)
        // only the last block in a slice actually writes the result
-        write_result();
+        write_result(last);
      slice_row = 0;
      slice_col_par++;
      slice_col++;
@ -1706,6 +1784,7 @@ __global__ void Marlin(
          for (int i = 0; i < b_sh_wr_iters; i++) B_ptr[i] -= b_gl_stride;
        }

+        bias_gl_rd = (thread_n_blocks * 16 / 8) * slice_col + threadIdx.x;
        // Update slice k/n for scales loading
        if constexpr (has_act_order) {
          slice_k_start = tb_k * slice_row;
--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
@ -142,25 +142,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  ops.def("gelu_quick(Tensor! out, Tensor input) -> ()");
  ops.impl("gelu_quick", torch::kCUDA, &gelu_quick);

-  // prepare_inputs advance_step
-  ops.def(
-      "advance_step_flashattn(int num_seqs, int num_queries, int block_size, "
-      "Tensor! input_tokens, Tensor sampled_token_ids, "
-      "Tensor! input_positions, Tensor! seq_lens, Tensor! slot_mapping, "
-      "Tensor block_tables) -> ()");
-  ops.impl("advance_step_flashattn", torch::kCUDA, &advance_step_flashattn);
-
-  ops.def(
-      "advance_step_flashinfer("
-      "    int num_seqs, int num_queries, int block_size,"
-      "    Tensor! input_tokens, Tensor sampled_token_ids,"
-      "    Tensor! input_positions, Tensor! seq_lens, Tensor! slot_mapping,"
-      "    Tensor block_tables, Tensor! paged_kv_indices,"
-      "    Tensor! paged_kv_indptr, Tensor! paged_kv_last_page_len,"
-      "    Tensor! block_table_bounds"
-      ") -> ()");
-  ops.impl("advance_step_flashinfer", torch::kCUDA, &advance_step_flashinfer);
-
  // Layernorm
  // Apply Root Mean Square (RMS) Normalization to the input tensor.
  ops.def(
@ -326,6 +307,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  // gptq_marlin Optimized Quantized GEMM for GPTQ.
  ops.def(
      "gptq_marlin_gemm(Tensor a, Tensor? c_or_none, Tensor b_q_weight, "
+      "Tensor? b_bias_or_none,"
      "Tensor b_scales, Tensor? global_scale, Tensor? b_zeros_or_none, Tensor? "
      "g_idx_or_none, Tensor? perm_or_none, Tensor workspace, int b_q_type, "
      "SymInt size_m, SymInt size_n, SymInt size_k, bool is_k_full, "
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@ -387,7 +387,7 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
 ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
 # Keep this in sync with https://github.com/vllm-project/vllm/blob/main/requirements/cuda.txt
 # We use `--force-reinstall --no-deps` to avoid issues with the existing FlashInfer wheel.
-ARG FLASHINFER_GIT_REF="v0.2.10"
+ARG FLASHINFER_GIT_REF="v0.2.11"
 RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH'
  . /etc/environment
    git clone --depth 1 --recursive --shallow-submodules \
@ -432,7 +432,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \

 # Install DeepGEMM from source
 ARG DEEPGEMM_GIT_REPO="https://github.com/deepseek-ai/DeepGEMM.git"
-ARG DEEPGEMM_GIT_REF="187656694f7f69e3e7975617a68bc3387680a7e1"
+ARG DEEPGEMM_GIT_REF="7b6b5563b9d4c1ae07ffbce7f78ad3ac9204827c"
 RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH'
  . /etc/environment
    CUDA_MAJOR="${CUDA_VERSION%%.*}"
@ -497,14 +497,11 @@ ENV HF_HUB_ENABLE_HF_TRANSFER 1
 # Copy in the v1 package for testing (it isn't distributed yet)
 COPY vllm/v1 /usr/local/lib/python${PYTHON_VERSION}/dist-packages/vllm/v1

-# doc requires source code
-# we hide them inside `test_docs/` , so that this source code
+# Source code is used in the `python_only_compile.sh` test
+# We hide it inside `src/` so that this source code
 # will not be imported by other tests
-RUN mkdir test_docs
-RUN mv docs test_docs/
-RUN cp -r examples test_docs/
-RUN mv vllm test_docs/
-RUN mv mkdocs.yaml test_docs/
+RUN mkdir src
+RUN mv vllm src/vllm
 #################### TEST IMAGE ####################

 #################### OPENAI API SERVER ####################
--- a/docs/.nav.yml
+++ b/docs/.nav.yml
@ -1,25 +1,17 @@
 nav:
-  - Home:
-    - vLLM: README.md
+  - Home: README.md
+  - User Guide:
+    - usage/README.md
    - Getting Started:
      - getting_started/quickstart.md
      - getting_started/installation
    - Examples:
+      - examples/README.md
      - Offline Inference: examples/offline_inference
      - Online Serving: examples/online_serving
      - Others: examples/others
-    - Quick Links:
-      - User Guide: usage/README.md
-      - Developer Guide: contributing/README.md
-      - API Reference: api/README.md
-      - CLI Reference: cli/README.md
-    - Timeline:
-      - Roadmap: https://roadmap.vllm.ai
-      - Releases: https://github.com/vllm-project/vllm/releases
-  - User Guide:
-    - Summary: usage/README.md
-    - usage/v1_guide.md
    - General:
+      - usage/v1_guide.md
      - usage/*
    - Inference and Serving:
      - serving/offline_inference.md
@ -32,7 +24,7 @@ nav:
      - deployment/integrations
    - Training: training
    - Configuration:
-      - Summary: configuration/README.md
+      - configuration/README.md
      - configuration/*
    - Models:
      - models/supported_models.md
@ -45,7 +37,7 @@ nav:
      - features/*
      - features/quantization
  - Developer Guide:
-    - Summary: contributing/README.md
+    - contributing/README.md
    - General:
      - glob: contributing/*
        flatten_single_child_sections: true
--- a/docs/README.md
+++ b/docs/README.md
@ -1,3 +1,9 @@
+---
+hide:
+  - navigation
+  - toc
+---
+
 # Welcome to vLLM

 <figure markdown="span">
@ -21,6 +27,17 @@ vLLM is a fast and easy-to-use library for LLM inference and serving.

 Originally developed in the [Sky Computing Lab](https://sky.cs.berkeley.edu) at UC Berkeley, vLLM has evolved into a community-driven project with contributions from both academia and industry.

+Where to get started with vLLM depends on the type of user. If you are looking to:
+
+- Run open-source models on vLLM, we recommend starting with the [Quickstart Guide](./getting_started/quickstart.md)
+- Build applications with vLLM, we recommend starting with the [User Guide](./usage)
+- Build vLLM, we recommend starting with [Developer Guide](./contributing)
+
+For information about the development of vLLM, see:
+
+- [Roadmap](https://roadmap.vllm.ai)
+- [Releases](https://github.com/vllm-project/vllm/releases)
+
 vLLM is fast with:

 - State-of-the-art serving throughput
--- a/docs/contributing/benchmarks.md
+++ b/docs/contributing/benchmarks.md
@ -11,7 +11,7 @@ vLLM contains two sets of benchmarks:

 The performance benchmarks are used for development to confirm whether new changes improve performance under various workloads. They are triggered on every commit with both the `perf-benchmarks` and `ready` labels, and when a PR is merged into vLLM.

-The latest performance results are hosted on the public [vLLM Performance Dashboard](https://perf.vllm.ai).
+The latest performance results are hosted on the public [vLLM Performance Dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm).

 More information on the performance benchmarks and their parameters can be found [here](gh-file:.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md).

--- a/docs/design/fused_moe_modular_kernel.md
+++ b/docs/design/fused_moe_modular_kernel.md
@ -175,11 +175,19 @@ implementations that input `FusedMoEActivationFormat.Standard` support chunking

 ### FusedMoEModularKernel Initialization

-`FusedMoEMethodBase` class has 2 methods that are collectively responsible in creating the `FusedMoEModularKernel` object. They are,
+`FusedMoEMethodBase` class has 3 methods that are collectively responsible in creating the `FusedMoEModularKernel` object. They are,

+* maybe_make_prepare_finalize,
 * select_gemm_impl, and
 * init_prepare_finalize

+#### maybe_make_prepare_finalize
+
+The `maybe_make_prepare_finalize` method is responsbile for constructing an instance of `FusedMoEPrepareAndFinalize` when appropriate based on the current all2all backend, e.g. when EP + DP is enabled.  The base class method currently constructs all the `FusedMoEPrepareAndFinalize` objects for the EP+DP case.  Derived classes can override this method to construct prepare/finalize objects for different scenarios, e.g. `ModelOptNvFp4FusedMoE` can construct a `FlashInferCutlassMoEPrepareAndFinalize` for the EP+TP case.
+Please refer to the implementations in,
+
+* `ModelOptNvFp4FusedMoE`
+
 #### select_gemm_impl

 The `select_gemm_impl` method is undefined in the base class. It is the responsibility of the derived class to implement a method that constructs a valid/appropriate `FusedMoEPermuteExpertsUnpermute` object.
--- a/docs/examples/README.md
+++ b/docs/examples/README.md
@ -0,0 +1,7 @@
+# Examples
+
+vLLM's examples are split into three categories:
+
+- If you are using vLLM from within Python code, see [Offline Inference](./offline_inference/)
+- If you are using vLLM from an HTTP application or client, see [Online Serving](./online_serving/)
+- For examples of using some of vLLM's advanced features (e.g. LMCache or Tensorizer) which are not specific to either of the above use cases, see [Others](./others/)
--- a/docs/features/lora.md
+++ b/docs/features/lora.md
@ -351,3 +351,22 @@ vllm serve ibm-granite/granite-speech-3.3-2b \
 ```

 Note: Default multimodal LoRAs are currently only available for `.generate` and chat completions.
+
+## Using Tips
+
+### Configuring `max_lora_rank`
+
+The `--max-lora-rank` parameter controls the maximum rank allowed for LoRA adapters. This setting affects memory allocation and performance:
+
+- **Set it to the maximum rank** among all LoRA adapters you plan to use
+- **Avoid setting it too high** - using a value much larger than needed wastes memory and can cause performance issues
+
+For example, if your LoRA adapters have ranks [16, 32, 64], use `--max-lora-rank 64` rather than 256
+
+```bash
+# Good: matches actual maximum rank
+vllm serve model --enable-lora --max-lora-rank 64
+
+# Bad: unnecessarily high, wastes memory
+vllm serve model --enable-lora --max-lora-rank 256
+```
--- a/docs/features/spec_decode.md
+++ b/docs/features/spec_decode.md
@ -203,6 +203,7 @@ an [EAGLE (Extrapolation Algorithm for Greater Language-model Efficiency)](https
            "model": "yuhuili/EAGLE-LLaMA3-Instruct-8B",
            "draft_tensor_parallel_size": 1,
            "num_speculative_tokens": 2,
+            "method": "eagle",
        },
    )

@ -231,6 +232,9 @@ A few important things to consider when using the EAGLE based draft models:
   reported in the reference implementation [here](https://github.com/SafeAILab/EAGLE). This issue is under
   investigation and tracked here: <gh-issue:9565>.

+4. When using EAGLE-3 based draft model, option "method" must be set to "eagle3".
+   That is, to specify `"method": "eagle3"` in `speculative_config`.
+
 A variety of EAGLE draft models are available on the Hugging Face hub:

 | Base Model                                                           | EAGLE on Hugging Face                     | # EAGLE Parameters |
--- a/docs/getting_started/installation/README.md
+++ b/docs/getting_started/installation/README.md
@ -14,3 +14,16 @@ vLLM supports the following hardware platforms:
 - [Google TPU](google_tpu.md)
 - [Intel Gaudi](intel_gaudi.md)
 - [AWS Neuron](aws_neuron.md)
+
+## Hardware Plugins
+
+The backends below live **outside** the main `vllm` repository and follow the
+[Hardware-Pluggable RFC](../../design/plugin_system.md).
+
+| Accelerator | PyPI / package | Repository |
+|-------------|----------------|------------|
+| Ascend NPU | `vllm-ascend` | <https://github.com/vllm-project/vllm-ascend> |
+| Intel Gaudi (HPU) | N/A, install from source | <https://github.com/vllm-project/vllm-gaudi> |
+| MetaX MACA GPU | N/A, install from source | <https://github.com/MetaX-MACA/vLLM-metax> |
+| Rebellions ATOM / REBEL NPU | `vllm-rbln` | <https://github.com/rebellions-sw/vllm-rbln> |
+| IBM Spyre AIU | `vllm-spyre` | <https://github.com/vllm-project/vllm-spyre> |
--- a/docs/getting_started/installation/cpu/x86.inc.md
+++ b/docs/getting_started/installation/cpu/x86.inc.md
@ -6,7 +6,7 @@ vLLM supports basic model inferencing and serving on x86 CPU platform, with data
 # --8<-- [start:requirements]

 - OS: Linux
- CPU flags: `avx512f`, `avx512_bf16` (Optional), `avx512_vnni` (Optional)
+- CPU flags: `avx512f` (Recommended), `avx512_bf16` (Optional), `avx512_vnni` (Optional)

 !!! tip
    Use `lscpu` to check the CPU flags.
@ -28,7 +28,7 @@ vLLM supports basic model inferencing and serving on x86 CPU platform, with data
 [https://gallery.ecr.aws/q9t5s3a7/vllm-cpu-release-repo](https://gallery.ecr.aws/q9t5s3a7/vllm-cpu-release-repo)

 !!! warning
-    If deploying the pre-built images on machines only contain `avx512f`, `Illegal instruction` error may be raised. It is recommended to build images for these machines with `--build-arg VLLM_CPU_AVX512BF16=false` and `--build-arg VLLM_CPU_AVX512VNNI=false`.
+    If deploying the pre-built images on machines without `avx512f`, `avx512_bf16`, or `avx512_vnni` support, an `Illegal instruction` error may be raised. It is recommended to build images for these machines with the appropriate build arguments (e.g., `--build-arg VLLM_CPU_DISABLE_AVX512=true`, `--build-arg VLLM_CPU_AVX512BF16=false`, or `--build-arg VLLM_CPU_AVX512VNNI=false`) to disable unsupported features. Please note that without `avx512f`, AVX2 will be used and this version is not recommended because it only has basic feature support.

 # --8<-- [end:pre-built-images]
 # --8<-- [start:build-image-from-source]
@ -37,6 +37,7 @@ vLLM supports basic model inferencing and serving on x86 CPU platform, with data
 docker build -f docker/Dockerfile.cpu \
        --build-arg VLLM_CPU_AVX512BF16=false (default)|true \
        --build-arg VLLM_CPU_AVX512VNNI=false (default)|true \
+        --build-arg VLLM_CPU_DISABLE_AVX512=false (default)|true \ 
        --tag vllm-cpu-env \
        --target vllm-openai .

--- a/docs/mkdocs/stylesheets/extra.css
+++ b/docs/mkdocs/stylesheets/extra.css
@ -23,6 +23,13 @@ a:not(:has(svg)):not(.md-icon):not(.autorefs-external) {
    }
 }

+a[href*="localhost"]::after,
+a[href*="127.0.0.1"]::after,
+a[href*="org.readthedocs.build"]::after,
+a[href*="docs.vllm.ai"]::after {
+    display: none !important;
+}
+
 /* Light mode: darker section titles */
 body[data-md-color-scheme="default"] .md-nav__item--section > label.md-nav__link .md-ellipsis {
  color: rgba(0, 0, 0, 0.7) !important;
--- a/docs/models/extensions/fastsafetensor.md
+++ b/docs/models/extensions/fastsafetensor.md
@ -2,4 +2,5 @@ Loading Model weights with fastsafetensors
 ===================================================================

 Using fastsafetensors library enables loading model weights to GPU memory by leveraging GPU direct storage. See [their GitHub repository](https://github.com/foundation-model-stack/fastsafetensors) for more details.
-For enabling this feature, set the environment variable ``USE_FASTSAFETENSOR`` to ``true``
+
+To enable this feature, use the ``--load-format fastsafetensors`` command-line argument
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@ -331,7 +331,7 @@ th {
 | `BloomForCausalLM` | BLOOM, BLOOMZ, BLOOMChat | `bigscience/bloom`, `bigscience/bloomz`, etc. | | ✅︎ | |
 | `BartForConditionalGeneration` | BART | `facebook/bart-base`, `facebook/bart-large-cnn`, etc. | | | |
 | `ChatGLMModel`, `ChatGLMForConditionalGeneration` | ChatGLM | `zai-org/chatglm2-6b`, `zai-org/chatglm3-6b`, `ShieldLM-6B-chatglm3`, etc. | ✅︎ | ✅︎ | ✅︎ |
-| `CohereForCausalLM`, `Cohere2ForCausalLM` | Command-R | `CohereForAI/c4ai-command-r-v01`, `CohereForAI/c4ai-command-r7b-12-2024`, etc. | ✅︎ | ✅︎ | ✅︎ |
+| `CohereForCausalLM`, `Cohere2ForCausalLM` | Command-R | `CohereLabs/c4ai-command-r-v01`, `CohereLabs/c4ai-command-r7b-12-2024`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `DbrxForCausalLM` | DBRX | `databricks/dbrx-base`, `databricks/dbrx-instruct`, etc. | | ✅︎ | ✅︎ |
 | `DeciLMForCausalLM` | DeciLM | `nvidia/Llama-3_3-Nemotron-Super-49B-v1`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `DeepseekForCausalLM` | DeepSeek | `deepseek-ai/deepseek-llm-67b-base`, `deepseek-ai/deepseek-llm-7b-chat`, etc. | | ✅︎ | ✅︎ |
@ -409,6 +409,12 @@ th {
 | `MiniMaxText01ForCausalLM` | MiniMax-Text | `MiniMaxAI/MiniMax-Text-01`, etc. | | | ✅︎ |
 | `Zamba2ForCausalLM` | Zamba2 | `Zyphra/Zamba2-7B-instruct`, `Zyphra/Zamba2-2.7B-instruct`, `Zyphra/Zamba2-1.2B-instruct`, etc. | | | ✅︎ |

+Some models are supported only via the [Transformers backend](#transformers). The purpose of the table below is to acknowledge models which we officially support in this way. The logs will say that the Transformers backend is being used, and you will see no warning that this is fallback behaviour. This means that, if you have issues with any of the models listed below, please [make an issue](https://github.com/vllm-project/vllm/issues/new/choose) and we'll do our best to fix it!
+
+| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | [V1](gh-issue:8779) |
+|--------------|--------|-------------------|----------------------|---------------------------|---------------------|
+| `SmolLM3ForCausalLM` | SmolLM3 | `HuggingFaceTB/SmolLM3-3B` | ✅︎ | ✅︎ | ✅︎ |
+
 !!! note
    Currently, the ROCm version of vLLM supports Mistral and Mixtral only for context lengths up to 4096.

@ -601,6 +607,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
 | `AyaVisionForConditionalGeneration` | Aya Vision | T + I<sup>+</sup> | `CohereForAI/aya-vision-8b`, `CohereForAI/aya-vision-32b`, etc. | | ✅︎ | ✅︎ |
 | `Blip2ForConditionalGeneration` | BLIP-2 | T + I<sup>E</sup> | `Salesforce/blip2-opt-2.7b`, `Salesforce/blip2-opt-6.7b`, etc. | | ✅︎ | ✅︎ |
 | `ChameleonForConditionalGeneration` | Chameleon | T + I | `facebook/chameleon-7b`, etc. | | ✅︎ | ✅︎ |
+| `Cohere2VisionForConditionalGeneration` | Command A Vision | T + I<sup>+</sup> | `CohereLabs/command-a-vision-07-2025`, etc. | | ✅︎ | ✅︎ |
 | `DeepseekVLV2ForCausalLM`<sup>^</sup> | DeepSeek-VL2 | T + I<sup>+</sup> | `deepseek-ai/deepseek-vl2-tiny`, `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2`, etc. | | ✅︎ | ✅︎ |
 | `Florence2ForConditionalGeneration` | Florence-2 | T + I | `microsoft/Florence-2-base`, `microsoft/Florence-2-large`, etc. | | | |
 | `FuyuForCausalLM` | Fuyu | T + I | `adept/fuyu-8b`, etc. | | ✅︎ | ✅︎ |
--- a/docs/usage/README.md
+++ b/docs/usage/README.md
@ -1,6 +1,8 @@
 # Using vLLM

-vLLM supports the following usage patterns:
+First, vLLM must be [installed](../getting_started/installation) for your chosen device in either a Python or Docker environment.
+
+Then, vLLM supports the following usage patterns:

 - [Inference and Serving](../serving/offline_inference.md): Run a single instance of a model.
 - [Deployment](../deployment/docker.md): Scale up model instances for production.
--- a/docs/usage/troubleshooting.md
+++ b/docs/usage/troubleshooting.md
@ -35,6 +35,7 @@ You can check if this is happening by trying the old defaults with `--generation
 If other strategies don't solve the problem, it's likely that the vLLM instance is stuck somewhere. You can use the following environment variables to help debug the issue:

 - `export VLLM_LOGGING_LEVEL=DEBUG` to turn on more logging.
+- `export VLLM_LOG_STATS_INTERVAL=1.` to get log statistics more frequently for tracking running queue, waiting queue and cache hit states.
 - `export CUDA_LAUNCH_BLOCKING=1` to identify which CUDA kernel is causing the problem.
 - `export NCCL_DEBUG=TRACE` to turn on more logging for NCCL.
 - `export VLLM_TRACE_FUNCTION=1` to record all function calls for inspection in the log files to tell which function crashes or hangs. Do not use this flag unless absolutely needed for debugging, it will cause significant delays in startup time.
--- a/docs/usage/v1_guide.md
+++ b/docs/usage/v1_guide.md
@ -63,6 +63,7 @@ based on assigned priority, with FCFS as a tie-breaker), configurable via the
 |------------|-----------------------------------------------|
 | **NVIDIA** | <nobr>🚀</nobr>                               |
 | **AMD**    | <nobr>🟢</nobr>                               |
+| **INTEL GPU**    | <nobr>🟢</nobr>                               |
 | **TPU**    | <nobr>🟢</nobr>                               |
 | **CPU**    | <nobr>🟢 (x86\_64/aarch64) 🟡 (MacOS) </nobr> |

@ -72,6 +73,7 @@ based on assigned priority, with FCFS as a tie-breaker), configurable via the

    - [vllm-ascend](https://github.com/vllm-project/vllm-ascend)
    - [vllm-spyre](https://github.com/vllm-project/vllm-spyre)
+    - [vllm-gaudi](https://github.com/vllm-project/vllm-gaudi)
    - [vllm-openvino](https://github.com/vllm-project/vllm-openvino)

    Please check their corresponding repositories for more details.
--- a/examples/offline_inference/data_parallel.py
+++ b/examples/offline_inference/data_parallel.py
@ -70,12 +70,27 @@ def parse_args():
        default=64,
        help=("Maximum number of sequences to be processed in a single iteration."),
    )
+    parser.add_argument(
+        "--max-model-len",
+        type=int,
+        help=("Maximum number of tokens to be processed in a single iteration."),
+    )
+    parser.add_argument(
+        "--timeout",
+        type=int,
+        default=300,
+        help=("Number of seconds before unresponsive process is killed."),
+    )
    parser.add_argument(
        "--gpu-memory-utilization",
        type=float,
        default=0.8,
        help=("Fraction of GPU memory vLLM is allowed to allocate (0.0, 1.0]."),
    )
+    parser.add_argument(
+        "--quantization",
+        type=str,
+    )
    return parser.parse_args()


@ -90,7 +105,9 @@ def main(
    enforce_eager,
    trust_remote_code,
    max_num_seqs,
+    max_model_len,
    gpu_memory_utilization,
+    quantization,
 ):
    os.environ["VLLM_DP_RANK"] = str(global_dp_rank)
    os.environ["VLLM_DP_RANK_LOCAL"] = str(local_dp_rank)
@ -142,7 +159,9 @@ def main(
        enable_expert_parallel=True,
        trust_remote_code=trust_remote_code,
        max_num_seqs=max_num_seqs,
+        max_model_len=max_model_len,
        gpu_memory_utilization=gpu_memory_utilization,
+        quantization=quantization,
    )
    outputs = llm.generate(prompts, sampling_params)
    # Print the outputs.
@ -198,14 +217,16 @@ if __name__ == "__main__":
                args.enforce_eager,
                args.trust_remote_code,
                args.max_num_seqs,
+                args.max_model_len,
                args.gpu_memory_utilization,
+                args.quantization,
            ),
        )
        proc.start()
        procs.append(proc)
    exit_code = 0
    for proc in procs:
-        proc.join(timeout=300)
+        proc.join(timeout=args.timeout)
        if proc.exitcode is None:
            print(f"Killing process {proc.pid} that didn't stop within 5 minutes.")
            proc.kill()
--- a/examples/offline_inference/structured_outputs.py
+++ b/examples/offline_inference/structured_outputs.py
@ -15,6 +15,8 @@ from pydantic import BaseModel
 from vllm import LLM, SamplingParams
 from vllm.sampling_params import GuidedDecodingParams

+MAX_TOKENS = 50
+
 # Guided decoding by Choice (list of possible options)
 guided_decoding_params_choice = GuidedDecodingParams(choice=["Positive", "Negative"])
 sampling_params_choice = SamplingParams(guided_decoding=guided_decoding_params_choice)
@ -23,7 +25,9 @@ prompt_choice = "Classify this sentiment: vLLM is wonderful!"
 # Guided decoding by Regex
 guided_decoding_params_regex = GuidedDecodingParams(regex=r"\w+@\w+\.com\n")
 sampling_params_regex = SamplingParams(
-    guided_decoding=guided_decoding_params_regex, stop=["\n"]
+    guided_decoding=guided_decoding_params_regex,
+    stop=["\n"],
+    max_tokens=MAX_TOKENS,
 )
 prompt_regex = (
    "Generate an email address for Alan Turing, who works in Enigma."
@ -48,7 +52,10 @@ class CarDescription(BaseModel):

 json_schema = CarDescription.model_json_schema()
 guided_decoding_params_json = GuidedDecodingParams(json=json_schema)
-sampling_params_json = SamplingParams(guided_decoding=guided_decoding_params_json)
+sampling_params_json = SamplingParams(
+    guided_decoding=guided_decoding_params_json,
+    max_tokens=MAX_TOKENS,
+)
 prompt_json = (
    "Generate a JSON with the brand, model and car_type of"
    "the most iconic car from the 90's"
@ -64,7 +71,10 @@ condition ::= column "= " number
 number ::= "1 " | "2 "
 """
 guided_decoding_params_grammar = GuidedDecodingParams(grammar=simplified_sql_grammar)
-sampling_params_grammar = SamplingParams(guided_decoding=guided_decoding_params_grammar)
+sampling_params_grammar = SamplingParams(
+    guided_decoding=guided_decoding_params_grammar,
+    max_tokens=MAX_TOKENS,
+)
 prompt_grammar = (
    "Generate an SQL query to show the 'username' and 'email'from the 'users' table."
 )
--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@ -126,6 +126,29 @@ def run_chameleon(questions: list[str], modality: str) -> ModelRequestData:
    )


+def run_command_a_vision(questions: list[str], modality: str) -> ModelRequestData:
+    assert modality == "image"
+
+    model_name = "CohereLabs/command-a-vision-07-2025"
+
+    engine_args = EngineArgs(
+        model=model_name,
+        max_model_len=32768,
+        tensor_parallel_size=4,
+        limit_mm_per_prompt={modality: 1},
+    )
+
+    prompts = [
+        f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|><|IMG_PATCH|>{question}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"
+        for question in questions
+    ]
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )
+
+
 # Deepseek-VL2
 def run_deepseek_vl2(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"
@ -1417,6 +1440,7 @@ model_example_map = {
    "aya_vision": run_aya_vision,
    "blip-2": run_blip2,
    "chameleon": run_chameleon,
+    "command_a_vision": run_command_a_vision,
    "deepseek_vl_v2": run_deepseek_vl2,
    "florence2": run_florence2,
    "fuyu": run_fuyu,
--- a/examples/offline_inference/vision_language_multi_image.py
+++ b/examples/offline_inference/vision_language_multi_image.py
@ -107,6 +107,42 @@ def load_aya_vision(question: str, image_urls: list[str]) -> ModelRequestData:
    )


+def load_command_a_vision(question: str, image_urls: list[str]) -> ModelRequestData:
+    model_name = "CohereLabs/command-a-vision-07-2025"
+
+    # NOTE: This model is 122B parameters and requires tensor parallelism
+    # Recommended to use tp=4 on H100 GPUs
+    engine_args = EngineArgs(
+        model=model_name,
+        max_model_len=32768,
+        tensor_parallel_size=4,
+        limit_mm_per_prompt={"image": len(image_urls)},
+    )
+
+    placeholders = [{"type": "image", "image": url} for url in image_urls]
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                *placeholders,
+                {"type": "text", "text": question},
+            ],
+        }
+    ]
+
+    processor = AutoProcessor.from_pretrained(model_name)
+
+    prompt = processor.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompt,
+        image_data=[fetch_image(url) for url in image_urls],
+    )
+
+
 def load_deepseek_vl2(question: str, image_urls: list[str]) -> ModelRequestData:
    model_name = "deepseek-ai/deepseek-vl2-tiny"

@ -1031,6 +1067,7 @@ def load_tarsier2(question: str, image_urls: list[str]) -> ModelRequestData:
 model_example_map = {
    "aria": load_aria,
    "aya_vision": load_aya_vision,
+    "command_a_vision": load_command_a_vision,
    "deepseek_vl_v2": load_deepseek_vl2,
    "gemma3": load_gemma3,
    "h2ovl_chat": load_h2ovl,
--- a/examples/online_serving/openai_embedding_long_text/README.md
+++ b/examples/online_serving/openai_embedding_long_text/README.md
@ -0,0 +1,186 @@
+# Long Text Embedding with Chunked Processing
+
+This directory contains examples for using vLLM's **chunked processing** feature to handle long text embedding that exceeds the model's maximum context length.
+
+## 🚀 Quick Start
+
+### Start the Server
+
+Use the provided script to start a vLLM server with chunked processing enabled:
+
+```bash
+# Basic usage (supports very long texts up to ~3M tokens)
+./service.sh
+
+# Custom configuration with different models
+MODEL_NAME="jinaai/jina-embeddings-v3" \
+MAX_EMBED_LEN=1048576 \
+./service.sh
+
+# For extremely long documents
+MODEL_NAME="intfloat/multilingual-e5-large" \
+MAX_EMBED_LEN=3072000 \
+./service.sh
+```
+
+### Test Long Text Embedding
+
+Run the comprehensive test client:
+
+```bash
+python client.py
+```
+
+## 📁 Files
+
+| File | Description |
+|------|-------------|
+| `service.sh` | Server startup script with chunked processing enabled |
+| `client.py` | Comprehensive test client for long text embedding |
+
+## ⚙️ Configuration
+
+### Server Configuration
+
+The key parameters for chunked processing are in the `--override-pooler-config`:
+
+```json
+{
+  "pooling_type": "auto",
+  "normalize": true,
+  "enable_chunked_processing": true,
+  "max_embed_len": 3072000
+}
+```
+
+!!! note
+    `pooling_type` sets the model's own pooling strategy for processing within each chunk. The cross-chunk aggregation automatically uses MEAN strategy when input exceeds the model's native maximum length.
+
+#### Chunked Processing Behavior
+
+Chunked processing uses **MEAN aggregation** for cross-chunk combination when input exceeds the model's native maximum length:
+
+| Component | Behavior | Description |
+|-----------|----------|-------------|
+| **Within chunks** | Model's native pooling | Uses the model's configured pooling strategy |
+| **Cross-chunk aggregation** | Always MEAN | Weighted averaging based on chunk token counts |
+| **Performance** | Optimal | All chunks processed for complete semantic coverage |
+
+### Environment Variables
+
+| Variable | Default | Description |
+|----------|---------|-------------|
+| `MODEL_NAME` | `intfloat/multilingual-e5-large` | Embedding model to use (supports multiple models) |
+| `PORT` | `31090` | Server port |
+| `GPU_COUNT` | `1` | Number of GPUs to use |
+| `MAX_EMBED_LEN` | `3072000` | Maximum embedding input length (supports very long documents) |
+| `POOLING_TYPE` | `auto` | Model's native pooling type: `auto`, `MEAN`, `CLS`, `LAST` (only affects within-chunk pooling, not cross-chunk aggregation) |
+| `API_KEY` | `EMPTY` | API key for authentication |
+
+## 🔧 How It Works
+
+1. **Enhanced Input Validation**: `max_embed_len` allows accepting inputs longer than `max_model_len` without environment variables
+2. **Smart Chunking**: Text is split based on `max_position_embeddings` to maintain semantic integrity
+3. **Unified Processing**: All chunks processed separately through the model using its configured pooling strategy
+4. **MEAN Aggregation**: When input exceeds model's native length, results combined using token count-based weighted averaging across all chunks
+5. **Consistent Output**: Final embeddings maintain the same dimensionality as standard processing
+
+### Input Length Handling
+
+- **Within max_embed_len**: Input is accepted and processed (up to 3M+ tokens)
+- **Exceeds max_position_embeddings**: Chunked processing is automatically triggered
+- **Exceeds max_embed_len**: Input is rejected with clear error message
+- **No environment variables required**: Works without `VLLM_ALLOW_LONG_MAX_MODEL_LEN`
+
+### Extreme Long Text Support
+
+With `MAX_EMBED_LEN=3072000`, you can process:
+
+- **Academic papers**: Full research papers with references
+- **Legal documents**: Complete contracts and legal texts  
+- **Books**: Entire chapters or small books
+- **Code repositories**: Large codebases and documentation
+
+## 📊 Performance Characteristics
+
+### Chunked Processing Performance
+
+| Aspect | Behavior | Performance |
+|--------|----------|-------------|
+| **Chunk Processing** | All chunks processed with native pooling | Consistent with input length |
+| **Cross-chunk Aggregation** | MEAN weighted averaging | Minimal overhead |
+| **Memory Usage** | Proportional to number of chunks | Moderate, scalable |
+| **Semantic Quality** | Complete text coverage | Optimal for long documents |
+
+## 🧪 Test Cases
+
+The test client demonstrates:
+
+- ✅ **Short text**: Normal processing (baseline)
+- ✅ **Medium text**: Single chunk processing
+- ✅ **Long text**: Multi-chunk processing with aggregation
+- ✅ **Very long text**: Many chunks processing
+- ✅ **Extreme long text**: Document-level processing (100K+ tokens)
+- ✅ **Batch processing**: Mixed-length inputs in one request
+- ✅ **Consistency**: Reproducible results across runs
+
+## 🐛 Troubleshooting
+
+### Common Issues
+
+1. **Chunked processing not enabled**:
+
+   ```log
+   ValueError: This model's maximum position embeddings length is 4096 tokens...
+   ```
+
+   **Solution**: Ensure `enable_chunked_processing: true` in pooler config
+
+2. **Input exceeds max_embed_len**:
+
+   ```log
+   ValueError: This model's maximum embedding input length is 3072000 tokens...
+   ```
+
+   **Solution**: Increase `max_embed_len` in pooler config or reduce input length
+
+3. **Memory errors**:
+  
+   ```log
+   RuntimeError: CUDA out of memory
+   ```
+  
+   **Solution**: Reduce chunk size by adjusting model's `max_position_embeddings` or use fewer GPUs
+
+4. **Slow processing**:
+   **Expected**: Long text takes more time due to multiple inference calls
+
+### Debug Information
+
+Server logs show chunked processing activity:
+
+```log
+INFO: Input length 150000 exceeds max_position_embeddings 4096, will use chunked processing
+INFO: Split input of 150000 tokens into 37 chunks (max_chunk_size: 4096)
+```
+
+## 🤝 Contributing
+
+To extend chunked processing support to other embedding models:
+
+1. Check model compatibility with the pooling architecture
+2. Test with various text lengths
+3. Validate embedding quality compared to single-chunk processing
+4. Submit PR with test cases and documentation updates
+
+## 🆕 Enhanced Features
+
+### max_embed_len Parameter
+
+The new `max_embed_len` parameter provides:
+
+- **Simplified Configuration**: No need for `VLLM_ALLOW_LONG_MAX_MODEL_LEN` environment variable
+- **Flexible Input Validation**: Accept inputs longer than `max_model_len` up to `max_embed_len`
+- **Extreme Length Support**: Process documents with millions of tokens
+- **Clear Error Messages**: Better feedback when inputs exceed limits
+- **Backward Compatibility**: Existing configurations continue to work
--- a/examples/online_serving/openai_embedding_long_text/client.py
+++ b/examples/online_serving/openai_embedding_long_text/client.py
@ -0,0 +1,366 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+"""
+Example script demonstrating long text embedding with chunked processing in vLLM.
+
+This example shows how to use vLLM's chunked processing feature to handle text
+inputs that exceed the model's maximum token length. The feature automatically
+splits long text into chunks and handles different pooling types optimally.
+
+Prerequisites:
+1. Start vLLM server with chunked processing enabled:
+   
+   # MEAN pooling (processes all chunks, recommended for complete coverage)
+   vllm serve intfloat/multilingual-e5-large \
+     --override-pooler-config \
+      '{"pooling_type": "MEAN", "normalize": true, ' \
+      '"enable_chunked_processing": true, "max_embed_len": 3072000}' \
+     --served-model-name multilingual-e5-large \
+     --trust-remote-code \
+     --port 31090 \
+     --api-key your-api-key
+
+   # OR CLS pooling (native CLS within chunks, MEAN aggregation across chunks)
+   vllm serve BAAI/bge-large-en-v1.5 \
+     --override-pooler-config \
+      '{"pooling_type": "CLS", "normalize": true, ' \
+      '"enable_chunked_processing": true, "max_embed_len": 1048576}' \
+     --served-model-name bge-large-en-v1.5 \
+     --trust-remote-code \
+     --port 31090 \
+     --api-key your-api-key
+
+2. Install required dependencies:
+   pip install openai requests
+"""
+
+import time
+
+import numpy as np
+from openai import OpenAI
+
+# Configuration
+API_KEY = "your-api-key"  # Replace with your actual API key
+BASE_URL = "http://localhost:31090/v1"
+MODEL_NAME = "multilingual-e5-large"
+
+
+def generate_long_text(base_text: str, repeat_count: int) -> str:
+    """Generate long text by repeating base text."""
+    return base_text * repeat_count
+
+
+def test_embedding_with_different_lengths():
+    """Test embedding generation with different text lengths."""
+    client = OpenAI(api_key=API_KEY, base_url=BASE_URL)
+
+    # Test cases with different text lengths
+    test_cases = [
+        {
+            "name": "Short Text",
+            "text": "Hello, this is a short text for embedding.",
+            "expected_chunks": 1,
+        },
+        {
+            "name": "Medium Text",
+            "text": generate_long_text(
+                "This is a medium-length text that should fit within the "
+                "model's context window. " * 20,
+                2,
+            ),
+            "expected_chunks": 1,
+        },
+        {
+            "name": "Long Text (2 chunks)",
+            "text": generate_long_text(
+                "This is a very long text that will exceed the model's "
+                "maximum context length and trigger chunked processing. " * 50,
+                5,
+            ),
+            "expected_chunks": 2,
+        },
+        {
+            "name": "Very Long Text (3+ chunks)",
+            "text": generate_long_text(
+                "This text is extremely long and will definitely "
+                "require multiple chunks for processing. " * 100,
+                10,
+            ),
+            "expected_chunks": 3,
+        },
+    ]
+
+    print("🧪 Testing vLLM Long Text Embedding with Chunked Processing")
+    print("=" * 70)
+
+    for i, test_case in enumerate(test_cases, 1):
+        print(f"\n📝 Test {i}: {test_case['name']}")
+        print(f"Text length: {len(test_case['text'])} characters")
+
+        try:
+            start_time = time.time()
+
+            response = client.embeddings.create(
+                input=test_case["text"], model=MODEL_NAME, encoding_format="float"
+            )
+
+            end_time = time.time()
+            processing_time = end_time - start_time
+
+            # Extract embedding data
+            embedding = response.data[0].embedding
+            embedding_dim = len(embedding)
+
+            print("✅ Success!")
+            print(f"   - Embedding dimension: {embedding_dim}")
+            print(f"   - Processing time: {processing_time:.2f}s")
+            print(f"   - Expected chunks: ~{test_case['expected_chunks']}")
+            print(f"   - First 5 values: {embedding[:5]}")
+
+        except Exception as e:
+            print(f"❌ Failed: {str(e)}")
+
+
+def test_batch_embedding():
+    """Test batch embedding with mixed-length inputs."""
+    client = OpenAI(api_key=API_KEY, base_url=BASE_URL)
+
+    print("\n🔄 Testing Batch Embedding with Mixed Lengths")
+    print("=" * 50)
+
+    # Mix of short and long texts
+    batch_inputs = [
+        "Short text 1",
+        generate_long_text("Medium length text that fits in one chunk. " * 20, 1),
+        "Another short text",
+        generate_long_text("Long text requiring chunked processing. " * 100, 5),
+    ]
+
+    try:
+        start_time = time.time()
+
+        response = client.embeddings.create(
+            input=batch_inputs, model=MODEL_NAME, encoding_format="float"
+        )
+
+        end_time = time.time()
+        processing_time = end_time - start_time
+
+        print("✅ Batch processing successful!")
+        print(f"   - Number of inputs: {len(batch_inputs)}")
+        print(f"   - Number of embeddings: {len(response.data)}")
+        print(f"   - Total processing time: {processing_time:.2f}s")
+        print(
+            f"   - Average time per input: {processing_time / len(batch_inputs):.2f}s"
+        )
+
+        for i, data in enumerate(response.data):
+            input_length = len(batch_inputs[i])
+            embedding_dim = len(data.embedding)
+            print(
+                f"   - Input {i + 1}: {input_length} chars → {embedding_dim}D embedding"
+            )
+
+    except Exception as e:
+        print(f"❌ Batch processing failed: {str(e)}")
+
+
+def test_multiple_long_texts_batch():
+    """Test batch processing with multiple long texts to verify chunk ID uniqueness."""
+    client = OpenAI(api_key=API_KEY, base_url=BASE_URL)
+
+    print("\n🔧 Testing Multiple Long Texts in Batch (Chunk ID Fix Verification)")
+    print("=" * 70)
+
+    # Create multiple distinct long texts that will all require chunking
+    # Note: All pooling types now use MEAN aggregation across chunks:
+    # - Native pooling (MEAN/CLS/LAST) is used within each chunk
+    # - MEAN aggregation combines results across all chunks
+    # - Full semantic coverage for all pooling types
+    long_texts = [
+        generate_long_text(
+            "First long document about artificial intelligence and machine learning. "
+            * 80,
+            6,
+        ),
+        generate_long_text(
+            "Second long document about natural language processing and transformers. "
+            * 80,
+            6,
+        ),
+        generate_long_text(
+            "Third long document about computer vision and neural networks. " * 80, 6
+        ),
+    ]
+
+    # Add some short texts to mix things up
+    batch_inputs = [
+        "Short text before long texts",
+        long_texts[0],
+        "Short text between long texts",
+        long_texts[1],
+        long_texts[2],
+        "Short text after long texts",
+    ]
+
+    print("📊 Batch composition:")
+    for i, text in enumerate(batch_inputs):
+        length = len(text)
+        text_type = "Long (will be chunked)" if length > 5000 else "Short"
+        print(f"   - Input {i + 1}: {length} chars ({text_type})")
+
+    try:
+        start_time = time.time()
+
+        response = client.embeddings.create(
+            input=batch_inputs, model=MODEL_NAME, encoding_format="float"
+        )
+
+        end_time = time.time()
+        processing_time = end_time - start_time
+
+        print("\n✅ Multiple long texts batch processing successful!")
+        print(f"   - Number of inputs: {len(batch_inputs)}")
+        print(f"   - Number of embeddings returned: {len(response.data)}")
+        print(f"   - Total processing time: {processing_time:.2f}s")
+
+        # Verify each embedding is different (no incorrect aggregation)
+        embeddings = [data.embedding for data in response.data]
+
+        if len(embeddings) >= 3:
+            import numpy as np
+
+            # Compare embeddings of the long texts (indices 1, 3, 4)
+            long_embeddings = [
+                np.array(embeddings[1]),  # First long text
+                np.array(embeddings[3]),  # Second long text
+                np.array(embeddings[4]),  # Third long text
+            ]
+
+            print("\n🔍 Verifying embedding uniqueness:")
+            for i in range(len(long_embeddings)):
+                for j in range(i + 1, len(long_embeddings)):
+                    cosine_sim = np.dot(long_embeddings[i], long_embeddings[j]) / (
+                        np.linalg.norm(long_embeddings[i])
+                        * np.linalg.norm(long_embeddings[j])
+                    )
+                    print(
+                        f"   - Similarity between long text {i + 1} and {j + 1}: "
+                        f"{cosine_sim:.4f}"
+                    )
+
+                    if (
+                        cosine_sim < 0.9
+                    ):  # Different content should have lower similarity
+                        print("     ✅ Good: Embeddings are appropriately different")
+                    else:
+                        print(
+                            "     ⚠️ High similarity - may indicate chunk "
+                            "aggregation issue"
+                        )
+
+        print("\n📋 Per-input results:")
+        for i, data in enumerate(response.data):
+            input_length = len(batch_inputs[i])
+            embedding_dim = len(data.embedding)
+            embedding_norm = np.linalg.norm(data.embedding)
+            print(
+                f"   - Input {i + 1}: {input_length} chars → {embedding_dim}D "
+                f"embedding (norm: {embedding_norm:.4f})"
+            )
+
+        print(
+            "\n✅ This test verifies the fix for chunk ID collisions in "
+            "batch processing"
+        )
+        print("   - Before fix: Multiple long texts would have conflicting chunk IDs")
+        print("   - After fix: Each prompt's chunks have unique IDs with prompt index")
+
+    except Exception as e:
+        print(f"❌ Multiple long texts batch test failed: {str(e)}")
+        print("   This might indicate the chunk ID collision bug is present!")
+
+
+def test_embedding_consistency():
+    """Test that chunked processing produces consistent results."""
+    client = OpenAI(api_key=API_KEY, base_url=BASE_URL)
+
+    print("\n🔍 Testing Embedding Consistency")
+    print("=" * 40)
+
+    # Use the same long text multiple times
+    long_text = generate_long_text(
+        "Consistency test text for chunked processing validation. " * 50, 3
+    )
+
+    embeddings = []
+
+    try:
+        for i in range(3):
+            response = client.embeddings.create(
+                input=long_text, model=MODEL_NAME, encoding_format="float"
+            )
+            embeddings.append(response.data[0].embedding)
+            print(f"   - Generated embedding {i + 1}")
+
+        # Check consistency (embeddings should be identical)
+        if len(embeddings) >= 2:
+            # Calculate similarity between first two embeddings
+
+            emb1 = np.array(embeddings[0])
+            emb2 = np.array(embeddings[1])
+
+            # Cosine similarity
+            cosine_sim = np.dot(emb1, emb2) / (
+                np.linalg.norm(emb1) * np.linalg.norm(emb2)
+            )
+
+            print("✅ Consistency test completed!")
+            print(f"   - Cosine similarity between runs: {cosine_sim:.6f}")
+            print("   - Expected: ~1.0 (identical embeddings)")
+
+            if cosine_sim > 0.999:
+                print("   - ✅ High consistency achieved!")
+            else:
+                print("   - ⚠️ Consistency may vary due to numerical precision")
+
+    except Exception as e:
+        print(f"❌ Consistency test failed: {str(e)}")
+
+
+def main():
+    """Main function to run all tests."""
+    print("🚀 vLLM Long Text Embedding Client")
+    print(f"📡 Connecting to: {BASE_URL}")
+    print(f"🤖 Model: {MODEL_NAME}")
+    masked_key = "*" * (len(API_KEY) - 4) + API_KEY[-4:] if len(API_KEY) > 4 else "****"
+    print(f"🔑 API Key: {masked_key}")
+
+    # Run all test cases
+    test_embedding_with_different_lengths()
+    test_batch_embedding()
+    test_multiple_long_texts_batch()
+    test_embedding_consistency()
+
+    print("\n" + "=" * 70)
+    print("🎉 All tests completed!")
+    print("\n💡 Key Features Demonstrated:")
+    print("   - ✅ Automatic chunked processing for long text")
+    print("   - ✅ Seamless handling of mixed-length batches")
+    print("   - ✅ Multiple long texts in single batch (chunk ID fix)")
+    print("   - ✅ Unified chunked processing:")
+    print("     • Native pooling used within each chunk")
+    print("     • MEAN aggregation across all chunks")
+    print("     • Complete semantic coverage for all pooling types")
+    print("   - ✅ Consistent embedding generation")
+    print("   - ✅ Backward compatibility with short text")
+    print("\n📚 For more information, see:")
+    print(
+        "   - Documentation: https://docs.vllm.ai/en/latest/models/pooling_models.html"
+    )
+    print("   - Chunked Processing Guide: openai_embedding_long_text.md")
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/online_serving/openai_embedding_long_text/service.sh
+++ b/examples/online_serving/openai_embedding_long_text/service.sh
@ -0,0 +1,137 @@
+#!/bin/bash
+
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# vLLM Embedding Server with Enhanced Chunked Processing
+# This script starts a vLLM server with chunked processing enabled for long text embedding.
+# Now supports proper pooling type validation and model-specific configurations.
+
+set -euo pipefail
+
+# Configuration
+MODEL_NAME=${MODEL_NAME:-"intfloat/multilingual-e5-large"}
+MODEL_CODE=${MODEL_CODE:-"multilingual-e5-large"}
+
+PORT=${PORT:-31090}
+GPU_COUNT=${GPU_COUNT:-1}
+MAX_EMBED_LEN=${MAX_EMBED_LEN:-3072000}
+API_KEY=${API_KEY:-"your-api-key"}
+
+# Enhanced pooling configuration with model-specific defaults
+POOLING_TYPE=${POOLING_TYPE:-"auto"}  # auto, MEAN, CLS, LAST
+export VLLM_ENABLE_CHUNKED_PROCESSING=true
+export CUDA_VISIBLE_DEVICES=2,3,4,5
+# export VLLM_ATTENTION_BACKEND=XFORMERS
+
+echo "🚀 Starting vLLM Embedding Server with Enhanced Chunked Processing"
+echo "=================================================================="
+
+# Environment variables for optimization
+export VLLM_WORKER_MULTIPROC_METHOD=spawn
+
+# Function to determine optimal pooling type for known models
+get_optimal_pooling_type() {
+    local model="$1"
+    case "$model" in
+        *"e5-"* | *"multilingual-e5"*)
+            echo "MEAN"  # E5 series native pooling
+            ;;
+        *"bge-"*)
+            echo "CLS"   # BGE series native pooling
+            ;;
+        *"gte-"*)
+            echo "LAST"  # GTE series native pooling
+            ;;
+        *"sentence-t5"* | *"st5"*)
+            echo "MEAN"  # Sentence-T5 native pooling
+            ;;
+        *"jina-embeddings"*)
+            echo "MEAN"  # Jina embeddings native pooling
+            ;;
+        *"Qwen"*"Embedding"*)
+            echo "LAST"  # Qwen embeddings native pooling
+            ;;
+        *)
+            echo "MEAN"  # Default native pooling for unknown models
+            ;;
+    esac
+}
+
+# Auto-detect pooling type if not explicitly set
+if [ "$POOLING_TYPE" = "auto" ]; then
+    POOLING_TYPE=$(get_optimal_pooling_type "$MODEL_NAME")
+    echo "🔍 Auto-detected pooling type: $POOLING_TYPE for model $MODEL_NAME"
+fi
+
+# Display configuration
+echo "📋 Configuration:"
+echo "   - Model: $MODEL_NAME"
+echo "   - Port: $PORT"
+echo "   - GPU Count: $GPU_COUNT"
+echo "   - Enhanced Chunked Processing: ${VLLM_ENABLE_CHUNKED_PROCESSING}"
+echo "   - Max Embed Length: ${MAX_EMBED_LEN} tokens"
+echo "   - Native Pooling Type: $POOLING_TYPE + Normalization"
+echo "   - Cross-chunk Aggregation: MEAN (automatic)"
+echo ""
+
+# Validate GPU availability
+if command -v nvidia-smi &> /dev/null; then
+    gpu_count=$(nvidia-smi --list-gpus | wc -l)
+    echo "🖥️  Available GPUs: $gpu_count"
+    if [ "$GPU_COUNT" -gt "$gpu_count" ]; then
+        echo "⚠️  Warning: Requested $GPU_COUNT GPUs but only $gpu_count available"
+        echo "   Adjusting to use $gpu_count GPUs"
+        GPU_COUNT=$gpu_count
+    fi
+else
+    echo "⚠️  Warning: nvidia-smi not found. GPU detection skipped."
+fi
+
+# Chunked processing uses unified MEAN aggregation
+echo "ℹ️  Chunked Processing: Using $POOLING_TYPE pooling within chunks, MEAN aggregation across chunks"
+echo "   - All chunks processed for complete semantic coverage"
+echo "   - Weighted averaging based on chunk token counts"
+
+echo ""
+echo "🔧 Starting server with enhanced chunked processing configuration..."
+
+# Build pooler config JSON
+POOLER_CONFIG="{\"pooling_type\": \"$POOLING_TYPE\", \"normalize\": true, \"enable_chunked_processing\": ${VLLM_ENABLE_CHUNKED_PROCESSING}, \"max_embed_len\": ${MAX_EMBED_LEN}}"
+
+# Start vLLM server with enhanced chunked processing
+vllm serve "$MODEL_NAME" \
+  --tensor-parallel-size "$GPU_COUNT" \
+  --enforce-eager \
+  --override-pooler-config "$POOLER_CONFIG" \
+  --served-model-name ${MODEL_CODE} \
+  --api-key "$API_KEY" \
+  --trust-remote-code \
+  --port "$PORT" \
+  --host 0.0.0.0
+
+echo ""
+echo "✅ vLLM Embedding Server started successfully!"
+echo ""
+echo "📡 Server Information:"
+echo "   - Base URL: http://localhost:$PORT"
+echo "   - Model Code: ${MODEL_CODE}"
+echo "   - API Key: $API_KEY"
+echo "   - Native Pooling: $POOLING_TYPE | Cross-chunk: MEAN"
+echo ""
+echo "🧪 Test the server with:"
+echo "   python examples/online_serving/openai_embedding_long_text_client.py"
+echo ""
+echo "📚 Enhanced features enabled:"
+echo "   ✅ Intelligent native pooling type detection"
+echo "   ✅ Unified MEAN aggregation for chunked processing"
+echo "   ✅ Model-specific native pooling optimization"
+echo "   ✅ Enhanced max embedding length (${MAX_EMBED_LEN} tokens)"
+echo "   ✅ Complete semantic coverage for all pooling types"
+echo "   ✅ OpenAI-compatible API"
+echo "   ✅ GPU acceleration"
+echo ""
+echo "🔧 Advanced usage:"
+echo "   - Set POOLING_TYPE=MEAN|CLS|LAST to override auto-detection"
+echo "   - Set MAX_EMBED_LEN to adjust maximum input length"
+echo "   - All pooling types use MEAN aggregation across chunks" 
--- a/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_vllm_launcher.sh
+++ b/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_vllm_launcher.sh
@ -15,6 +15,14 @@ else
    MODEL=$2
 fi

+# The prefillers and decoders in LMCache use the same hash seed for all chunk keys.
+# This seed must be aligned so that decoders can identify and retrieve KV cache
+# entries stored by prefillers.
+#
+# WARNING: Using a fixed hash seed is insecure and makes the application vulnerable to
+# denial-of-service attacks. In a production environment, this should be set to a
+# secure random value. This is set to a fixed value for demonstration purposes only.
+export PYTHONHASHSEED=${VLLM_PYTHON_HASH_SEED:-123}

 if [[ $1 == "prefiller" ]]; then
    # Prefiller listens on port 8100
--- a/mkdocs.yaml
+++ b/mkdocs.yaml
@ -34,13 +34,14 @@ theme:
    - content.action.edit
    - content.code.copy
    - content.tabs.link
+    - navigation.instant
+    - navigation.instant.progress
    - navigation.tracking
    - navigation.tabs
    - navigation.tabs.sticky
    - navigation.sections
-    - navigation.prune
-    - navigation.top
    - navigation.indexes
+    - navigation.top
    - search.highlight
    - search.share
    - toc.follow
--- a/setup.py
+++ b/setup.py
@ -684,7 +684,7 @@ setup(
                  "mistral_common[audio]"],  # Required for audio processing
        "video": [],  # Kept for backwards compatibility
        # FlashInfer should be updated together with the Dockerfile
-        "flashinfer": ["flashinfer-python==0.2.10"],
+        "flashinfer": ["flashinfer-python==0.2.11"],
    },
    cmdclass=cmdclass,
    package_data=package_data,
--- a/tests/async_engine/test_async_llm_engine.py
+++ b/tests/async_engine/test_async_llm_engine.py
@ -1,409 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import asyncio
-import os
-import uuid
-from asyncio import CancelledError
-from copy import copy
-from dataclasses import dataclass, field
-from typing import Any, Optional
-
-import pytest
-import pytest_asyncio
-import torch
-
-from vllm import SamplingParams
-from vllm.config import ParallelConfig
-from vllm.distributed import cleanup_dist_env_and_memory
-from vllm.engine.async_llm_engine import AsyncEngineArgs, AsyncLLMEngine
-from vllm.outputs import RequestOutput as RealRequestOutput
-from vllm.sampling_params import RequestOutputKind
-
-from ..utils import wait_for_gpu_memory_to_clear
-
-
-@dataclass
-class RequestOutput:
-    request_id: int
-    finished: bool = False
-
-
-@dataclass
-class MockModelConfig:
-    use_async_output_proc = True
-    media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)
-
-
-class MockEngine:
-
-    def __init__(self):
-        self.step_calls = 0
-        self.add_request_calls = 0
-        self.abort_request_calls = 0
-        self.request_id = None
-        # Ugly, remove dependency when possible
-        self.parallel_config = ParallelConfig()
-        self.model_config = MockModelConfig()
-
-    async def step_async(self, virtual_engine):
-        # PP size is 1, ignore virtual engine
-        self.step_calls += 1
-        return [RequestOutput(
-            request_id=self.request_id)] if self.request_id else []
-
-    async def process_model_inputs_async(self, *args, **kwargs):
-        pass
-
-    async def stop_remote_worker_execution_loop_async(self):
-        pass
-
-    def generate(self, request_id):
-        self.request_id = request_id
-
-    def stop_generating(self):
-        self.request_id = None
-
-    def add_request(self, **kwargs):
-        del kwargs  # Unused
-        self.add_request_calls += 1
-        print(f'Request calls: {self.add_request_calls}')
-
-    async def add_request_async(self, **kwargs):
-        self.add_request_calls += 1
-        return
-
-    def abort_request(self, request_id):
-        del request_id  # Unused
-        self.abort_request_calls += 1
-
-    def has_unfinished_requests(self):
-        return self.request_id is not None
-
-    def has_unfinished_requests_for_virtual_engine(self, virtual_engine):
-        return self.request_id is not None
-
-
-class MockAsyncLLMEngine(AsyncLLMEngine):
-    _engine_class = MockEngine
-
-
-@pytest.mark.asyncio
-async def test_new_requests_event():
-    params = SamplingParams()
-
-    engine = MockAsyncLLMEngine()
-    engine.start_background_loop()
-    await asyncio.sleep(0.01)
-    assert engine.engine.step_calls == 0
-
-    await engine.add_request("1", "", params)
-    await asyncio.sleep(0.01)
-    assert engine.engine.add_request_calls == 1
-    assert engine.engine.step_calls == 1
-
-    await engine.add_request("2", "", params)
-    engine.engine.generate("2")
-    await asyncio.sleep(0)
-    await asyncio.sleep(0)
-    await asyncio.sleep(0)
-    assert engine.engine.add_request_calls == 2
-    assert engine.engine.step_calls >= 2
-    await asyncio.sleep(0.001)
-    assert engine.engine.step_calls >= 3
-    engine.engine.stop_generating()
-    await asyncio.sleep(0.001)
-    old_step_calls = engine.engine.step_calls
-    await asyncio.sleep(0.001)
-    assert engine.engine.step_calls == old_step_calls
-
-    await engine.add_request("3", "", params)
-    await asyncio.sleep(0.01)
-    assert engine.engine.add_request_calls == 3
-    assert engine.engine.step_calls == old_step_calls + 1
-    await asyncio.sleep(0.01)
-    assert engine.engine.add_request_calls == 3
-    assert engine.engine.step_calls == old_step_calls + 1
-
-    engine = MockAsyncLLMEngine()
-    assert engine.get_model_config() is not None
-    assert engine.get_tokenizer() is not None
-    assert engine.get_decoding_config() is not None
-
-
-def start_engine():
-    wait_for_gpu_memory_to_clear(
-        devices=list(range(torch.cuda.device_count())),
-        threshold_bytes=2 * 2**30,
-        timeout_s=60,
-    )
-
-    num_scheduler_steps = int(os.getenv("NUM_SCHEDULER_STEPS", "1"))
-    print(f"Starting engine with num_scheduler_steps={num_scheduler_steps}")
-
-    return AsyncLLMEngine.from_engine_args(
-        AsyncEngineArgs(model="facebook/opt-125m",
-                        enforce_eager=True,
-                        num_scheduler_steps=num_scheduler_steps))
-
-
-def uid() -> str:
-    return str(uuid.uuid4())
-
-
-@pytest_asyncio.fixture(scope="module")
-async def async_engine():
-    # We cannot use monkeypatch since this is a module
-    # scoped fixture and monkeypatch is function scoped.
-    previous_value = os.getenv("VLLM_USE_V1", None)
-    os.environ["VLLM_USE_V1"] = "0"
-    engine = await asyncio.get_event_loop().run_in_executor(executor=None,
-                                                            func=start_engine)
-    try:
-        yield engine
-    finally:
-        engine.shutdown_background_loop()
-        del engine
-        await asyncio.sleep(0.1)
-        cleanup_dist_env_and_memory()
-
-        if previous_value:
-            os.environ["VLLM_USE_V1"] = previous_value
-        else:
-            del os.environ["VLLM_USE_V1"]
-
-
-@pytest.fixture()
-def should_do_global_cleanup_after_test(request) -> bool:
-    # So we can share the async engine fixture between these tests
-    return False
-
-
-@pytest.mark.asyncio(scope="module")
-@pytest.mark.parametrize("stop", [None, ["a stop string"]])
-async def test_asyncio_run(async_engine, stop):
-
-    scheduler_config = await async_engine.get_scheduler_config()
-    num_scheduler_steps = scheduler_config.num_scheduler_steps
-
-    async def run(prompt: str):
-        sampling_params = SamplingParams(
-            temperature=0,
-            max_tokens=32,
-            min_tokens=32,
-            stop=stop,
-        )
-
-        output_count = 0
-        final_output = None
-        async for output in async_engine.generate(prompt,
-                                                  sampling_params,
-                                                  request_id=uid()):
-            output_count += 1
-            final_output = output
-        return final_output, output_count
-
-    results = await asyncio.gather(
-        run("test0"),
-        run("test0"),
-    )
-    assert len(results) == 2
-    first, second = results
-
-    # remove nondeterministic fields for comparison
-    first[0].metrics = None
-    second[0].metrics = None
-    first[0].request_id = None
-    second[0].request_id = None
-
-    assert str(first) == str(second)
-
-    output_count = results[0][1]
-    if num_scheduler_steps == 1:
-        assert output_count == 32
-    else:
-        assert 1 < output_count < 32
-
-
-@pytest.mark.asyncio(scope="module")
-@pytest.mark.parametrize("stop", [None, ["a stop string"]])
-async def test_output_kinds(async_engine, stop):
-    """Test that output_kind works as expected and that
-    results are equivalent across different kinds."""
-
-    scheduler_config = await async_engine.get_scheduler_config()
-    num_scheduler_steps = scheduler_config.num_scheduler_steps
-
-    sampling_params = SamplingParams(
-        temperature=0,
-        max_tokens=32,
-        min_tokens=32,
-        stop=stop,
-    )
-
-    async def run(prompt: str, kind: RequestOutputKind):
-        params = copy(sampling_params)
-        params.output_kind = kind
-
-        output_count = 0
-        final_output = None
-        async for output in async_engine.generate(prompt,
-                                                  params,
-                                                  request_id=uid()):
-            output_count += 1
-            final_output = output
-
-        assert final_output is not None
-        assert final_output.finished
-
-        return (final_output.prompt_token_ids,
-                final_output.outputs[0].token_ids,
-                final_output.outputs[0].text, output_count)
-
-    async def run_deltas(prompt: str):
-        params = copy(sampling_params)
-        params.output_kind = RequestOutputKind.DELTA
-
-        prompt_tokens = None
-        output_tokens: list[int] = []
-        output_text = ""
-        output_count = 0
-        final_output = None
-        async for output in async_engine.generate(prompt,
-                                                  params,
-                                                  request_id=uid()):
-            token_ids = output.outputs[0].token_ids
-            text = output.outputs[0].text
-            final_output = output
-
-            # Ensure we get prompt ids iff we haven't yet received output tokens
-            if output_tokens:
-                assert 1 <= len(token_ids) <= num_scheduler_steps
-                assert stop or text
-                assert not output.prompt_token_ids
-            else:
-                assert output.prompt_token_ids
-                prompt_tokens = output.prompt_token_ids
-
-            output_tokens.extend(token_ids)
-            output_text += text
-
-            output_count += 1
-
-        assert final_output is not None
-        assert final_output.finished
-
-        return prompt_tokens, output_tokens, output_text, output_count
-
-    results = await asyncio.gather(
-        run("common input prompt", RequestOutputKind.CUMULATIVE),
-        run("common input prompt", RequestOutputKind.FINAL_ONLY),
-        run_deltas("common input prompt"))
-
-    # Make sure outputs are the same
-    prompt_set = set(tuple(prompt_ids) for prompt_ids, _, _, _ in results)
-    assert len(prompt_set) == 1
-
-    text_set = set(text for _, _, text, _ in results)
-    assert len(text_set) == 1
-
-    tokens_set = set(tuple(ids) for _, ids, _, _ in results)
-    assert len(tokens_set) == 1
-
-    cumulative, final, deltas = results
-
-    # output message counts
-    assert cumulative[3] == deltas[3]
-
-    if num_scheduler_steps == 1:
-        assert cumulative[3] == 32
-    else:
-        assert 1 < cumulative[3] < 32
-
-    assert final[3] == 1
-
-
-@pytest.mark.asyncio(scope="module")
-@pytest.mark.parametrize("stop", [None, ["a stop string"]])
-async def test_cancellation(async_engine, stop):
-    scheduler_config = await async_engine.get_scheduler_config()
-    num_scheduler_steps = scheduler_config.num_scheduler_steps
-
-    sampling_params = SamplingParams(
-        temperature=0,
-        min_tokens=13,
-        max_tokens=13,
-        stop=stop,
-    )
-
-    stop_at = 5 if num_scheduler_steps == 1 else 1
-
-    request_id = uid()
-
-    i = 0
-    with pytest.raises(CancelledError):
-        async for output in async_engine.generate("test2",
-                                                  sampling_params,
-                                                  request_id=request_id):
-            assert not output.finished
-            i += 1
-            if i == stop_at:
-                await async_engine.abort(request_id)
-
-    assert i == stop_at
-
-
-@pytest.mark.asyncio(scope="module")
-@pytest.mark.parametrize("stop", [None, ["a stop string"]])
-async def test_delayed_generator(async_engine, stop):
-    scheduler_config = await async_engine.get_scheduler_config()
-
-    if scheduler_config.num_scheduler_steps != 1:
-        pytest.skip("no need to test this one with multistep")
-
-    sampling_params = SamplingParams(
-        temperature=0,
-        min_tokens=10,
-        max_tokens=10,
-        stop=stop,
-    )
-
-    stream = async_engine.generate("test3", sampling_params, request_id=uid())
-    i = 0
-    final_output: Optional[RealRequestOutput] = None
-    async for output in stream:
-        final_output = output
-        if i == 0:
-            # wait for generation to complete before consuming
-            # the remaining messages
-            await asyncio.sleep(1)
-        if i < 9:
-            assert not output.finished
-        i += 1
-
-    assert i == 10
-    assert final_output is not None
-    assert len(final_output.outputs[0].token_ids) == 10
-    assert final_output.finished
-
-
-@pytest.mark.asyncio(scope="module")
-async def test_invalid_argument(async_engine):
-    scheduler_config = await async_engine.get_scheduler_config()
-
-    if scheduler_config.num_scheduler_steps != 1:
-        pytest.skip("no need to test this one with multistep")
-
-    sampling_params = SamplingParams(
-        temperature=0,
-        min_tokens=10,
-        max_tokens=10,
-    )
-
-    # Targeting specific DP rank only supported in v1 multi-instance DP
-    with pytest.raises(ValueError):
-        async for _ in async_engine.generate("test",
-                                             sampling_params,
-                                             request_id=uid(),
-                                             data_parallel_rank=0):
-            pass
--- a/tests/compile/piecewise/test_full_cudagraph.py
+++ b/tests/compile/piecewise/test_full_cudagraph.py
@ -3,7 +3,8 @@
 import contextlib
 import os
 import weakref
-from contextlib import ExitStack
+from dataclasses import dataclass
+from typing import Optional

 import pytest

@ -32,27 +33,130 @@ def temporary_environ(env_vars):
                os.environ[k] = v


+@dataclass
+class BackendConfig:
+    name: str
+    env_vars: dict
+    comp_config: dict
+    specific_gpu_arch: Optional[tuple] = None
+
+
+# Define all backend configurations of full cudagraph to be tested
+backend_configs = {
+    # FA3 on Hopper
+    "FA3":
+    BackendConfig(name="FA3",
+                  env_vars={"VLLM_FLASH_ATTN_VERSION": "3"},
+                  comp_config={
+                      "cudagraph_mode": "FULL",
+                  },
+                  specific_gpu_arch=(9, 0)),
+    # FlashMLA on Hopper
+    "FlashMLA":
+    BackendConfig(name="FlashMLA",
+                  env_vars={
+                      "VLLM_ATTENTION_BACKEND": "FLASHMLA",
+                  },
+                  comp_config={
+                      "cudagraph_mode": "FULL_AND_PIECEWISE",
+                  },
+                  specific_gpu_arch=(9, 0)),
+    # Cutlass MLA on Blackwell
+    "CutlassMLA":
+    BackendConfig(
+        name="CutlassMLA",
+        env_vars={
+            "VLLM_USE_V1": "1",
+            "VLLM_ATTENTION_BACKEND": "CUTLASS_MLA",
+            "FORCE_NUM_KV_SPLITS":
+            "1",  # TODO: remove this when hang issue is fixed
+        },
+        comp_config={
+            "cudagraph_mode": "FULL_AND_PIECEWISE",
+            "cudagraph_capture_sizes": [16, 32, 64, 128, 256, 512],
+        },
+        specific_gpu_arch=(10, 0)),
+    # FA2
+    "FA2":
+    BackendConfig(name="FA2",
+                  env_vars={"VLLM_FLASH_ATTN_VERSION": "2"},
+                  comp_config={
+                      "cudagraph_mode": "FULL",
+                  }),
+    # Triton Attention
+    "TritonAttn":
+    BackendConfig(name="TritonAttn",
+                  env_vars={"VLLM_ATTENTION_BACKEND": "TRITON_ATTN_VLLM_V1"},
+                  comp_config={
+                      "cudagraph_mode": "FULL",
+                  }),
+    # FlashInfer
+    "FlashInfer":
+    BackendConfig(name="FlashInfer",
+                  env_vars={"VLLM_ATTENTION_BACKEND": "FLASHINFER"},
+                  comp_config={
+                      "cudagraph_mode": "FULL_AND_PIECEWISE",
+                  }),
+}
+
+test_params_full_cudagraph = []
+
+# deepseek-ai/DeepSeek-V2-Lite with MLA
+MLA_backends = ["FlashMLA", "CutlassMLA"]
+for mla_backend in MLA_backends:
+    test_params_full_cudagraph.append(
+        pytest.param(
+            ("deepseek-ai/DeepSeek-V2-Lite", backend_configs[mla_backend])))
+
+# Qwen/Qwen2-1.5B-Instruct with other backends
+other_backend_configs = [
+    backend_configs[c] for c in backend_configs if c not in MLA_backends
+]
+for backend_config in other_backend_configs:
+    test_params_full_cudagraph.append(
+        pytest.param(("Qwen/Qwen2-1.5B-Instruct", backend_config)))
+
+
@pytest.fixture(scope="class")
 def llm_pair(request):
-    model = request.param
+    model, backend_config = request.param

-    with temporary_environ({
-            "VLLM_USE_V1": "1",
-            "VLLM_FLASH_ATTN_VERSION": "3"
-    }):
+    # Dynamically skip test if GPU capability is not met
+    if backend_config.specific_gpu_arch and backend_config.specific_gpu_arch\
+        != current_platform.get_device_capability():
+        if backend_config.specific_gpu_arch == (9, 0):
+            pytest.skip("Only Hopper GPUs support FA3 and FlashMLA")
+        elif backend_config.specific_gpu_arch == (10, 0):
+            pytest.skip("Only Blackwell GPUs support Cutlass MLA")
+
+    env_vars = {
+        "VLLM_USE_V1": "1",
+        # Force native sampler to avoid potential nondeterminism in FlashInfer
+        # when per-request generators are not used in V1.
+        "VLLM_USE_FLASHINFER_SAMPLER": "0",
+        **backend_config.env_vars,
+    }
+    with temporary_environ(env_vars):
        full = LLM(
            model=model,
-            gpu_memory_utilization=0.45,
+            gpu_memory_utilization=0.43,
            trust_remote_code=True,
            max_model_len=1024,
-            compilation_config=CompilationConfig(full_cuda_graph=True),
+            max_num_seqs=128,
+            compilation_config=\
+                CompilationConfig(**backend_config.comp_config),
+            generation_config="vllm",
+            seed=42,
        )
        piecewise = LLM(
            model=model,
-            gpu_memory_utilization=0.45,
+            gpu_memory_utilization=0.43,
            trust_remote_code=True,
            max_model_len=1024,
-            compilation_config=CompilationConfig(),
+            max_num_seqs=128,
+            compilation_config=CompilationConfig(cudagraph_mode="PIECEWISE"),
+            generation_config="vllm",
+            seed=42,
        )

    # PyTest caches the fixture values so we use weakref.proxy to enable GC
@ -66,16 +170,7 @@ def llm_pair(request):
    )


-@pytest.mark.parametrize(
-    "llm_pair",
-    [
-        # Model names for the llm_pair fixture
-        "deepseek-ai/DeepSeek-V2-Lite",
-        "Qwen/Qwen2-1.5B-Instruct"
-    ],
-    indirect=True)
-@pytest.mark.skipif(current_platform.get_device_capability() != (9, 0),
-                    reason="Only Hopper GPUs support FA3 and FlashMLA")
+@pytest.mark.parametrize("llm_pair", test_params_full_cudagraph, indirect=True)
 class TestFullCUDAGraph:
    """
    Use a class such that an llm pair is constructed once for all
@ -104,12 +199,14 @@ class TestFullCUDAGraph:
        full cudagraph compilation works for padded cases too.
        """

-        piecewise_llm, full_cudagraph_llm = llm_pair
+        full_cudagraph_llm, piecewise_llm = llm_pair

-        prompts = ["Hello, my name is"] * batch_size
+        prompts = ["the quick brown fox"] * batch_size
+        # Use purely greedy decoding to avoid top-p truncation sensitivity
+        # that can amplify tiny numeric differences across runtimes.
        sampling_params = SamplingParams(temperature=0.0,
                                         max_tokens=max_tokens,
-                                         top_p=0.95)
+                                         top_p=1.0)

        piecewise_responses = piecewise_llm.generate(prompts, sampling_params)
        full_responses = full_cudagraph_llm.generate(prompts, sampling_params)
@ -117,42 +214,16 @@ class TestFullCUDAGraph:
        # Check that all responses are the same
        for piecewise_res, full_res in zip(piecewise_responses,
                                           full_responses):
-            assert piecewise_res.outputs[0].text == full_res.outputs[0].text
-
-
-@pytest.mark.parametrize(
-    "model, supported",
-    [
-        ("Qwen/Qwen2-1.5B-Instruct", True),
-        # MLA does not support capturing CUDA Graphs with size > max_num_seqs
-        ("deepseek-ai/DeepSeek-V2-Lite", False),
-    ])
-@pytest.mark.skipif(current_platform.get_device_capability() != (9, 0),
-                    reason="Only Hopper GPUs support FA3 and FlashMLA")
-def test_lower_max_num_seqs(model, supported):
-    with temporary_environ({
-            "VLLM_USE_V1": "1",
-            "VLLM_FLASH_ATTN_VERSION": "3"
-    }), ExitStack() as stack:
-        if not supported:
-            stack.enter_context(pytest.raises(RuntimeError))
-
-        llm = LLM(model=model,
-                  max_num_seqs=256,
-                  trust_remote_code=True,
-                  max_model_len=1024,
-                  compilation_config=CompilationConfig(
-                      full_cuda_graph=True,
-                      cudagraph_capture_sizes=[64, 256, 512]))
-        llm.generate(["Hello, my name is"] * 10)
+            assert piecewise_res.outputs[0].text.lower() == \
+                full_res.outputs[0].text.lower()


@pytest.mark.skipif(not current_platform.is_cuda(), reason="Skip if not cuda")
 def test_full_cudagraph_with_invalid_backend():
    with temporary_environ({
            "VLLM_USE_V1": "1",
-            "VLLM_FLASH_ATTN_VERSION":
-            "2"  #FA2 not supported with full_cuda_graph
+            "VLLM_ATTENTION_BACKEND": "FLEX_ATTENTION"
+            # Flex_Attention is not supported with full cuda graph
    }), pytest.raises(RuntimeError):
        LLM(model="Qwen/Qwen2-1.5B-Instruct",
-            compilation_config=CompilationConfig(full_cuda_graph=True))
+            compilation_config=CompilationConfig(cudagraph_mode="FULL"))
--- a/tests/compile/piecewise/test_simple.py
+++ b/tests/compile/piecewise/test_simple.py
@ -11,10 +11,10 @@ from torch.library import Library

 from vllm.compilation.counter import compilation_counter
 from vllm.compilation.decorators import support_torch_compile
-from vllm.config import (CompilationConfig, CompilationLevel, VllmConfig,
-                         set_current_vllm_config)
+from vllm.config import (CompilationConfig, CompilationLevel, CUDAGraphMode,
+                         VllmConfig, set_current_vllm_config)
 from vllm.envs import VLLM_USE_V1
-from vllm.forward_context import set_forward_context
+from vllm.forward_context import BatchDescriptor, set_forward_context
 from vllm.utils import direct_register_custom_op

 global_counter = 0
@ -101,16 +101,33 @@ def test_simple_piecewise_compile(use_inductor):
            num_backend_compilations=3,  # num_piecewise_capturable_graphs_seen
            num_cudagraph_captured=
            6,  # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
-    ), set_forward_context({}, vllm_config=vllm_config):
-
+    ), set_forward_context(None,
+                           vllm_config=vllm_config):  # background context
+        # warm up with background context
        model(inputs)

-        model(torch.randn(2).cuda())
-        model(torch.randn(1).cuda())
+        # capturing/replaying should under context of cudagraph dispatching
+        with set_forward_context(
+                None,
+                vllm_config=vllm_config,
+                cudagraph_runtime_mode=CUDAGraphMode.PIECEWISE,
+                batch_descriptor=BatchDescriptor(num_tokens=2, )):
+            model(torch.randn(2).cuda())
+        with set_forward_context(
+                None,
+                vllm_config=vllm_config,
+                cudagraph_runtime_mode=CUDAGraphMode.PIECEWISE,
+                batch_descriptor=BatchDescriptor(num_tokens=1, )):
+            model(torch.randn(1).cuda())

        input = torch.zeros(2).cuda()
        global global_counter
        global_counter = 0
-        output = model(input)
+        with set_forward_context(
+                None,
+                vllm_config=vllm_config,
+                cudagraph_runtime_mode=CUDAGraphMode.PIECEWISE,
+                batch_descriptor=BatchDescriptor(num_tokens=2, )):
+            output = model(input)
        assert global_counter == 2
        assert torch.allclose(output.cpu(), torch.tensor([3., 1.]))
--- a/tests/compile/piecewise/test_toy_llama.py
+++ b/tests/compile/piecewise/test_toy_llama.py
@ -18,9 +18,9 @@ from torch.library import Library

 from vllm.compilation.counter import compilation_counter
 from vllm.compilation.decorators import support_torch_compile
-from vllm.config import (CompilationConfig, CompilationLevel, VllmConfig,
-                         set_current_vllm_config)
-from vllm.forward_context import set_forward_context
+from vllm.config import (CompilationConfig, CompilationLevel, CUDAGraphMode,
+                         VllmConfig, set_current_vllm_config)
+from vllm.forward_context import BatchDescriptor, set_forward_context
 from vllm.utils import direct_register_custom_op

 # create a library to hold the custom op
@ -276,9 +276,11 @@ def run_model(llama_config,
        )
        if split_attn:
            compilation_config.splitting_ops = ["silly.attention"]
+        cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE
    else:
        compilation_config = CompilationConfig(
            level=CompilationLevel.NO_COMPILATION, )
+        cudagraph_runtime_mode = CUDAGraphMode.NONE

    vllm_config = VllmConfig(compilation_config=compilation_config,
                             additional_config=llama_config)
@ -287,17 +289,37 @@ def run_model(llama_config,
                           vllm_config=vllm_config,
                           prefix="").eval().cuda()

-    with set_forward_context({}, vllm_config=vllm_config):
+    with set_forward_context({},
+                             vllm_config=vllm_config):  # background context
        B = 16  # max batch size
        input_ids = torch.randint(0, llama_config.vocab_size, (B, )).cuda()
        positions = torch.arange(B).cuda()

+        # warmup for the model with cudagraph_mode NONE
        model(input_ids, positions)
-        model(input_ids[:2], positions[:2])
-        model(input_ids[:1], positions[:1])
+
+        # simulate cudagraphs capturing
+        with set_forward_context({},
+                                 vllm_config=vllm_config,
+                                 cudagraph_runtime_mode=cudagraph_runtime_mode,
+                                 batch_descriptor=BatchDescriptor(
+                                     num_tokens=2, )):
+            model(input_ids[:2], positions[:2])
+        with set_forward_context({},
+                                 vllm_config=vllm_config,
+                                 cudagraph_runtime_mode=cudagraph_runtime_mode,
+                                 batch_descriptor=BatchDescriptor(
+                                     num_tokens=1, )):
+            model(input_ids[:1], positions[:1])

        input_ids[:2].zero_()
-        output = model(input_ids[:2], positions[:2])
+        # simulate cudagraphs replay
+        with set_forward_context({},
+                                 vllm_config=vllm_config,
+                                 cudagraph_runtime_mode=cudagraph_runtime_mode,
+                                 batch_descriptor=BatchDescriptor(
+                                     num_tokens=2, )):
+            output = model(input_ids[:2], positions[:2])

        output = output.cpu()

--- a/tests/config/test_config.yaml
+++ b/tests/config/test_config.yaml
@ -2,4 +2,3 @@ port: 12312
 served_model_name: mymodel
 tensor_parallel_size: 2
 trust_remote_code: true
-multi_step_stream_outputs: false
--- a/tests/config/test_config_with_model.yaml
+++ b/tests/config/test_config_with_model.yaml
@ -4,4 +4,3 @@ port: 12312
 served_model_name: mymodel
 tensor_parallel_size: 2
 trust_remote_code: true
-multi_step_stream_outputs: false
--- a/tests/core/test_chunked_prefill_scheduler.py
+++ b/tests/core/test_chunked_prefill_scheduler.py
@ -644,11 +644,9 @@ def test_chunked_prefill_preempt():
    assert out.num_batched_tokens == max_num_batched_tokens


-@pytest.mark.parametrize("num_scheduler_steps", [1, 5])
-def test_chunked_prefill_spec_prefill(num_scheduler_steps):
+def test_chunked_prefill_spec_prefill():
    """Verify that the num_lookahead_slots is set appropriately for an all"""
-    """prefill batch depending on whether multi-step scheduling is enabled"""
-    """or not"""
+    """prefill batch."""
    block_size = 4
    max_seqs = 30
    max_model_len = 200
@ -661,7 +659,6 @@ def test_chunked_prefill_spec_prefill(num_scheduler_steps):
        max_model_len,
        enable_chunked_prefill=True,
        num_lookahead_slots=num_lookahead_slots,
-        num_scheduler_steps=num_scheduler_steps,
    )
    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
    cache_config.num_cpu_blocks = 16
@ -679,8 +676,7 @@ def test_chunked_prefill_spec_prefill(num_scheduler_steps):
    assert out.num_prefill_groups == 1
    assert out.num_batched_tokens == max_num_batched_tokens
    print(out.num_lookahead_slots)
-    assert out.num_lookahead_slots == (0 if (num_scheduler_steps == 1) else
-                                       num_lookahead_slots)
+    assert out.num_lookahead_slots == 0


 def test_chunked_prefill_max_seqs():
--- a/tests/core/test_num_computed_tokens_update.py
+++ b/tests/core/test_num_computed_tokens_update.py
@ -6,7 +6,6 @@ import pytest
 from tests.conftest import VllmRunner
 from tests.core.utils import create_dummy_prompt
 from vllm.engine.llm_engine import LLMEngine
-from vllm.platforms import current_platform
 from vllm.sequence import SequenceGroup

 MODEL = "JackFram/llama-160m"
@ -17,32 +16,19 @@ def add_seq_group_to_engine(engine: LLMEngine, seq_group: SequenceGroup):
    scheduler.add_seq_group(seq_group)


-@pytest.mark.parametrize("num_scheduler_steps", [1, 8])
@pytest.mark.parametrize("enable_chunked_prefill", [False, True])
@pytest.mark.parametrize("enforce_eager", [False, True])
-def test_num_computed_tokens_update(num_scheduler_steps: int,
-                                    enable_chunked_prefill: bool,
+def test_num_computed_tokens_update(enable_chunked_prefill: bool,
                                    enforce_eager: bool):

-    is_multi_step = num_scheduler_steps > 1
-    is_multi_step_chunked_prefill = is_multi_step and enable_chunked_prefill
-
-    if is_multi_step_chunked_prefill and current_platform.is_rocm():
-        pytest.skip("Multi-step with Chunked-Prefill does not support "
-                    "rocm_flash_attn backend")
-
    # Make a vllm engine
    runner = VllmRunner(model_name=MODEL,
                        gpu_memory_utilization=0.7,
-                        num_scheduler_steps=num_scheduler_steps,
                        enable_chunked_prefill=enable_chunked_prefill,
                        enforce_eager=enforce_eager)
    engine: LLMEngine = runner.llm.llm_engine

-    # In multi-step + chunked-prefill there is no separate single prompt step.
-    # What is scheduled will run for num_scheduler_steps always.
-    num_prompt_steps = num_scheduler_steps \
-        if is_multi_step_chunked_prefill else 1
+    num_prompt_steps = 1

    num_output_tokens_list = [4, 8, 12, 15, 16, 17]

@ -73,10 +59,8 @@ def test_num_computed_tokens_update(num_scheduler_steps: int,
                # Test correctness of num_computed_tokens after the decode steps
                assert seq.data.get_num_computed_tokens(
                ) == prompt_num_computed_tokens + decode_step_counter
-                for _ in range(num_scheduler_steps):
-                    # decode step
-                    engine.step()
-                    decode_step_counter += 1
+                engine.step()
+                decode_step_counter += 1

        # Test correctness of num_computed_tokens after the sequence finish.
        assert seq.data.get_num_computed_tokens(
--- a/tests/engine/test_multi_step_output_processor.py
+++ b/tests/engine/test_multi_step_output_processor.py
@ -1,274 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import random
-from unittest.mock import MagicMock
-
-import pytest
-from transformers import PreTrainedTokenizer
-
-from vllm.core.scheduler import Scheduler
-from vllm.engine.output_processor.multi_step import MultiStepOutputProcessor
-from vllm.engine.output_processor.stop_checker import StopChecker
-from vllm.sampling_params import SamplingParams
-from vllm.sequence import (CompletionSequenceGroupOutput, Logprob,
-                           SequenceOutput, SequenceStatus)
-from vllm.transformers_utils.detokenizer import Detokenizer
-from vllm.utils import Counter
-
-from ..core.utils import create_seq_group
-
-
-@pytest.mark.parametrize("seq_output_len", [128])
-@pytest.mark.parametrize("num_new_tokens", [1, 12])
-@pytest.mark.skip_global_cleanup
-def test_appends_token_ids(num_new_tokens: int, seq_output_len: int):
-    """Verify multi-step decoding appends token ids correctly.
-
-    We append token ids and verify all the token ids were appended correctly.
-    Note that ignore_eos=True.
-    """
-    detokenizer = MagicMock(spec=Detokenizer)
-    scheduler = MagicMock(spec=Scheduler)
-    stop_checker = MagicMock(spec=StopChecker)
-    seq_counter = Counter()
-
-    output_processor = MultiStepOutputProcessor(
-        detokenizer=detokenizer,
-        scheduler=[scheduler],
-        seq_counter=seq_counter,
-        get_tokenizer_for_seq=lambda _: mock_tokenizer(),
-        stop_checker=stop_checker,
-    )
-
-    seq_group = create_seq_group(
-        seq_prompt_len=1024,
-        seq_output_lens=[seq_output_len],
-        sampling_params=SamplingParams(max_tokens=seq_output_len +
-                                       num_new_tokens,
-                                       ignore_eos=True),
-    )
-
-    seq = seq_group.get_seqs()[0]
-    seq.status = SequenceStatus.RUNNING
-
-    new_token_ids = list(range(num_new_tokens))
-
-    outputs = [
-        CompletionSequenceGroupOutput(
-            samples=[
-                SequenceOutput(
-                    parent_seq_id=seq.seq_id,
-                    output_token=output_token,
-                    logprobs={output_token: Logprob(0.0)},
-                )
-            ],
-            prompt_logprobs=None,
-        ) for output_token in new_token_ids
-    ]
-
-    assert seq.get_token_ids()[-len(new_token_ids):] != new_token_ids
-    output_processor.process_outputs(seq_group, outputs)
-    assert seq.get_token_ids()[-len(new_token_ids):] == new_token_ids
-
-
-@pytest.mark.parametrize("seq_prompt_len", [1024])
-@pytest.mark.parametrize("seq_output_len", [128])
-@pytest.mark.parametrize("num_new_tokens", [5, 6, 7, 8])
-@pytest.mark.parametrize("max_tokens", [128 + 3])
-@pytest.mark.skip_global_cleanup
-def test_respects_max_tokens(num_new_tokens: int, seq_prompt_len: int,
-                             seq_output_len: int, max_tokens: int):
-    """Verify tokens after max_tokens are dropped and not appended to the
-    sequence.
-    """
-    detokenizer = MagicMock(spec=Detokenizer)
-    scheduler = MagicMock(spec=Scheduler)
-    stop_checker = MagicMock(spec=StopChecker)
-    seq_counter = Counter()
-
-    output_processor = MultiStepOutputProcessor(
-        detokenizer=detokenizer,
-        scheduler=[scheduler],
-        seq_counter=seq_counter,
-        get_tokenizer_for_seq=lambda _: mock_tokenizer(),
-        stop_checker=stop_checker,
-    )
-
-    seq_group = create_seq_group(
-        seq_prompt_len=seq_prompt_len,
-        seq_output_lens=[seq_output_len],
-        sampling_params=SamplingParams(max_tokens=max_tokens, ),
-    )
-
-    seq = seq_group.get_seqs()[0]
-    seq.status = SequenceStatus.RUNNING
-
-    new_token_ids = list(range(num_new_tokens))
-
-    outputs = [
-        CompletionSequenceGroupOutput(
-            samples=[
-                SequenceOutput(
-                    parent_seq_id=seq.seq_id,
-                    output_token=output_token,
-                    logprobs={output_token: Logprob(0.0)},
-                )
-            ],
-            prompt_logprobs=None,
-        ) for output_token in new_token_ids
-    ]
-
-    assert seq.get_len() == seq_prompt_len + seq_output_len
-    output_processor.process_outputs(seq_group, outputs)
-
-    # Expect the processed sequence to not go over max tokens in len.
-    assert seq.get_len() == seq_prompt_len + max_tokens
-
-    # Expect the correct tokens were appended.
-    expected_appended_tokens = new_token_ids[:max_tokens - seq_output_len]
-    assert seq.get_token_ids(
-    )[-len(expected_appended_tokens):] == expected_appended_tokens
-
-
-@pytest.mark.parametrize("seq_prompt_len", [1024])
-@pytest.mark.parametrize("seq_output_len", [128])
-@pytest.mark.parametrize("num_new_tokens", [12])
-@pytest.mark.parametrize("seed", list(range(6)))
-@pytest.mark.skip_global_cleanup
-def test_respects_eos_token_id(num_new_tokens: int, seq_prompt_len: int,
-                               seq_output_len: int, seed: int):
-    """Verify the eos token id is included in the sequence, but subsequent
-    tokens are dropped (not appended to sequence).
-    """
-    random.seed(seed)
-    detokenizer = MagicMock(spec=Detokenizer)
-    scheduler = MagicMock(spec=Scheduler)
-    stop_checker = MagicMock(spec=StopChecker)
-    seq_counter = Counter()
-
-    eos_token_id = 100
-
-    output_processor = MultiStepOutputProcessor(
-        detokenizer=detokenizer,
-        scheduler=[scheduler],
-        seq_counter=seq_counter,
-        get_tokenizer_for_seq=lambda _: mock_tokenizer(eos_token_id),
-        stop_checker=stop_checker,
-    )
-
-    seq_group = create_seq_group(
-        seq_prompt_len=seq_prompt_len,
-        seq_output_lens=[seq_output_len],
-        sampling_params=SamplingParams(
-            # Ensure enough space.
-            max_tokens=seq_output_len + num_new_tokens, ),
-    )
-
-    seq = seq_group.get_seqs()[0]
-    seq.status = SequenceStatus.RUNNING
-
-    new_token_ids = list(range(num_new_tokens))
-    assert eos_token_id not in new_token_ids
-    eos_index = random.randint(0, len(new_token_ids) - 1)
-    new_token_ids[eos_index] = eos_token_id
-
-    outputs = [
-        CompletionSequenceGroupOutput(
-            samples=[
-                SequenceOutput(
-                    parent_seq_id=seq.seq_id,
-                    output_token=output_token,
-                    logprobs={output_token: Logprob(0.0)},
-                )
-            ],
-            prompt_logprobs=None,
-        ) for output_token in new_token_ids
-    ]
-
-    assert seq.get_len() == seq_prompt_len + seq_output_len
-    output_processor.process_outputs(seq_group, outputs)
-
-    # Expect the processed sequence to not go beyond provided eos.
-    assert seq.get_len() == seq_prompt_len + seq_output_len + (eos_index + 1)
-
-    # Expect the correct tokens were appended.
-    expected_appended_tokens = new_token_ids[:eos_index + 1]
-    assert seq.get_token_ids(
-    )[-len(expected_appended_tokens):] == expected_appended_tokens
-
-
-@pytest.mark.parametrize("seq_prompt_len", [1024])
-@pytest.mark.parametrize("seq_output_len", [128])
-@pytest.mark.parametrize("num_new_tokens", [12])
-@pytest.mark.parametrize("seed", list(range(6)))
-@pytest.mark.skip_global_cleanup
-def test_ignores_eos_token_id(num_new_tokens: int, seq_prompt_len: int,
-                              seq_output_len: int, seed: int):
-    """When sampling parameters dictate that we should ignore the eos token id,
-    ensure all token ids are appended even if the eos token id is emitted.
-    """
-    random.seed(seed)
-    detokenizer = MagicMock(spec=Detokenizer)
-    scheduler = MagicMock(spec=Scheduler)
-    stop_checker = MagicMock(spec=StopChecker)
-    seq_counter = Counter()
-
-    eos_token_id = 100
-
-    output_processor = MultiStepOutputProcessor(
-        detokenizer=detokenizer,
-        scheduler=[scheduler],
-        seq_counter=seq_counter,
-        get_tokenizer_for_seq=lambda _: mock_tokenizer(eos_token_id),
-        stop_checker=stop_checker,
-    )
-
-    seq_group = create_seq_group(
-        seq_prompt_len=seq_prompt_len,
-        seq_output_lens=[seq_output_len],
-        sampling_params=SamplingParams(
-            # Ensure enough space.
-            max_tokens=seq_output_len + num_new_tokens,
-            ignore_eos=True,
-        ),
-    )
-
-    seq = seq_group.get_seqs()[0]
-    seq.status = SequenceStatus.RUNNING
-
-    new_token_ids = list(range(num_new_tokens))
-    assert eos_token_id not in new_token_ids
-    eos_index = random.randint(0, len(new_token_ids) - 1)
-    new_token_ids[eos_index] = eos_token_id
-
-    outputs = [
-        CompletionSequenceGroupOutput(
-            samples=[
-                SequenceOutput(
-                    parent_seq_id=seq.seq_id,
-                    output_token=output_token,
-                    logprobs={output_token: Logprob(0.0)},
-                )
-            ],
-            prompt_logprobs=None,
-        ) for output_token in new_token_ids
-    ]
-
-    assert seq.get_len() == seq_prompt_len + seq_output_len
-    output_processor.process_outputs(seq_group, outputs)
-
-    # Expect the processed sequence to go beyond eos.
-    assert seq.get_len() == seq_prompt_len + seq_output_len + num_new_tokens
-
-    # Expect the correct tokens were appended.
-    expected_appended_tokens = new_token_ids[:seq_output_len + num_new_tokens -
-                                             seq_output_len]
-    assert seq.get_token_ids(
-    )[-len(expected_appended_tokens):] == expected_appended_tokens
-
-
-def mock_tokenizer(eos_token_id=1000):
-    tokenizer = MagicMock(spec=PreTrainedTokenizer)
-    tokenizer.eos_token_id = eos_token_id
-    return tokenizer
--- a/tests/entrypoints/llm/test_accuracy.py
+++ b/tests/entrypoints/llm/test_accuracy.py
@ -96,9 +96,6 @@ def test_lm_eval_accuracy_v1_engine_fp8_kv_cache(
        more_args = None
        if current_platform.is_tpu():
            # Limit compilation time for TPU V1
-
-            # xet doesn't work well for Qwen/Qwen3-1.7B
-            m.setenv("HF_HUB_DISABLE_XET", "1")
            more_args = "max_model_len=2048,max_num_seqs=128,kv_cache_dtype=fp8"

            # Add TP test (if provided)
--- a/tests/entrypoints/openai/correctness/test_lmeval.py
+++ b/tests/entrypoints/openai/correctness/test_lmeval.py
@ -26,15 +26,12 @@ DEFAULT_ARGS = ["--max-model-len", "4096"]
 MORE_ARGS_LIST = [
    [],  # Default
    ["--enable-chunked-prefill"],  # Chunked
-    ["--num-scheduler-steps", "8"],  # MS
-    ["--num-scheduler-steps", "8", "--multi-step-stream-outputs"]  # MS+Stream
 ]
 MAX_WAIT_SECONDS = None

 if current_platform.is_tpu():
    MORE_ARGS_LIST = [
        [],  # Default
-        # ["--num-scheduler-steps", "8"], # Multi-step << currently fails
    ]
    MAX_WAIT_SECONDS = 600

--- a/tests/entrypoints/openai/test_audio.py
+++ b/tests/entrypoints/openai/test_audio.py
@ -23,6 +23,8 @@ MAXIMUM_AUDIOS = 2
@pytest.fixture(scope="module")
 def server():
    args = [
+        "--dtype",
+        "float32",
        "--max-model-len",
        "2048",
        "--max-num-seqs",
--- a/tests/entrypoints/openai/test_default_mm_loras.py
+++ b/tests/entrypoints/openai/test_default_mm_loras.py
@ -24,18 +24,7 @@ ACTIVE_MM_LORA_RESPONSE = "Spoken text: The first words I spoke in the original


@pytest.fixture(scope="module")
-def monkeypatch_module():
-    from _pytest.monkeypatch import MonkeyPatch
-    mpatch = MonkeyPatch()
-    yield mpatch
-    mpatch.undo()
-
-
-@pytest.fixture(scope="module", params=[False, True])
-def multimodal_server(request, monkeypatch_module):  # noqa: F811
-
-    use_v1 = request.param
-    monkeypatch_module.setenv('VLLM_USE_V1', '1' if use_v1 else '0')
+def multimodal_server():  # noqa: F811

    args = [
        # use half precision for speed and memory savings in CI environment
--- a/tests/entrypoints/openai/test_embedding_long_text.py
+++ b/tests/entrypoints/openai/test_embedding_long_text.py
@ -0,0 +1,441 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Test cases for long text embedding with automatic chunking mechanism.
+
+This test suite validates vLLM's automatic chunking functionality for handling
+text inputs that exceed the model's maximum token length, specifically targeting
+the intfloat/multilingual-e5-small model (max token length: 512).
+"""
+
+import random
+
+import openai
+import pytest
+import pytest_asyncio
+
+from vllm.entrypoints.openai.protocol import EmbeddingResponse
+
+from ...utils import RemoteOpenAIServer
+
+
+def _generate_random_text(word_count: int) -> str:
+    """Generate random text with approximately the specified word count."""
+    # Common English words with focus on verbs and nouns for realistic text
+    common_words = [
+        # Essential articles and pronouns (minimal)
+        "the",
+        "and",
+        "you",
+        "they",
+        "this",
+        "that",
+        "these",
+        "those",
+
+        # Action verbs
+        "create",
+        "build",
+        "develop",
+        "design",
+        "implement",
+        "execute",
+        "analyze",
+        "process",
+        "generate",
+        "calculate",
+        "evaluate",
+        "optimize",
+        "transform",
+        "integrate",
+        "configure",
+        "deploy",
+        "monitor",
+        "manage",
+        "discover",
+        "explore",
+        "investigate",
+        "research",
+        "study",
+        "examine",
+        "improve",
+        "enhance",
+        "upgrade",
+        "modify",
+        "update",
+        "maintain",
+        "solve",
+        "resolve",
+        "handle",
+        "address",
+        "tackle",
+        "overcome",
+        "communicate",
+        "collaborate",
+        "coordinate",
+        "organize",
+        "plan",
+        "achieve",
+        "accomplish",
+        "complete",
+        "finish",
+        "deliver",
+        "provide",
+
+        # Technology and science nouns
+        "system",
+        "application",
+        "software",
+        "hardware",
+        "network",
+        "database",
+        "algorithm",
+        "model",
+        "framework",
+        "platform",
+        "interface",
+        "protocol",
+        "architecture",
+        "infrastructure",
+        "component",
+        "module",
+        "service",
+        "technology",
+        "innovation",
+        "solution",
+        "methodology",
+        "approach",
+        "artificial",
+        "intelligence",
+        "machine",
+        "learning",
+        "neural",
+        "network",
+        "computer",
+        "processor",
+        "memory",
+        "storage",
+        "computation",
+        "data",
+        "information",
+        "knowledge",
+        "insight",
+        "pattern",
+        "trend",
+        "analysis",
+        "research",
+        "development",
+        "engineering",
+        "science",
+        "mathematics",
+        "statistics",
+        "probability",
+        "optimization",
+        "performance",
+        "efficiency",
+
+        # General nouns
+        "project",
+        "team",
+        "organization",
+        "company",
+        "business",
+        "industry",
+        "market",
+        "customer",
+        "user",
+        "client",
+        "product",
+        "feature",
+        "function",
+        "requirement",
+        "specification",
+        "documentation",
+        "report",
+        "result",
+        "outcome",
+        "impact",
+        "benefit",
+        "advantage",
+        "challenge",
+        "problem",
+        "opportunity",
+        "strategy",
+        "goal",
+        "objective",
+        "target",
+        "milestone",
+        "process",
+        "procedure",
+        "workflow",
+        "pipeline",
+        "operation",
+        "task",
+        "activity",
+        "event",
+        "session",
+        "meeting",
+        "discussion",
+        "decision"
+    ]
+
+    words = []
+    for _ in range(word_count):
+        words.append(random.choice(common_words))
+
+    # Add some punctuation for more realistic text
+    text = " ".join(words)
+    # Add periods every 10-20 words
+    words_list = text.split()
+    result = []
+    for i, word in enumerate(words_list):
+        result.append(word)
+        if ((i + 1) % random.randint(10, 20) == 0 and i < len(words_list) - 1):
+            result[-1] += "."
+
+    return " ".join(result)
+
+
+MODEL_NAME = "intfloat/multilingual-e5-small"
+DTYPE = "bfloat16"
+
+# Test text: Generate text with approximately 1500 words to exceed 1024 tokens
+LONG_TEXT_1500_WORDS = _generate_random_text(1500)
+
+# Test text: Generate text with approximately 2500 words to exceed 2048 tokens
+LONG_TEXT_2500_WORDS = _generate_random_text(2500)
+
+
+@pytest.fixture(scope="module")
+def server_with_chunked_processing():
+    """Start server with automatic chunking processing enabled."""
+    args = [
+        "--runner",
+        "pooling",
+        "--dtype",
+        DTYPE,
+        "--enforce-eager",
+        "--max-model-len",
+        "512",  # Set smaller max_model_len to trigger chunking mechanism
+        '--override-pooler-config',
+        ('{"pooling_type": "MEAN", "normalize": true, '
+         '"enable_chunked_processing": true, "max_embed_len": 10000}'),
+        "--gpu-memory-utilization",
+        "0.8",
+    ]
+
+    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+        yield remote_server
+
+
+@pytest_asyncio.fixture
+async def client_with_chunked_processing(server_with_chunked_processing):
+    """Create async client with chunking processing support."""
+    async with server_with_chunked_processing.get_async_client(
+    ) as async_client:
+        yield async_client
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_long_text_embedding_1500_chars(
+        client_with_chunked_processing: openai.AsyncOpenAI, model_name: str):
+    """Test embedding processing for ~1500 character long text 
+    (~1028 tokens, exceeding 512 token limit)."""
+
+    # Verify text length
+    # Verify text has sufficient word count (approximately 1500 words)
+    word_count = len(LONG_TEXT_1500_WORDS.split())
+    assert word_count >= 1400, (
+        f"Test text word count insufficient: {word_count} words")
+
+    # Send embedding request
+    embedding_response = await client_with_chunked_processing.embeddings.create(
+        model=model_name,
+        input=[LONG_TEXT_1500_WORDS],
+        encoding_format="float",
+    )
+
+    # Verify response structure
+    embeddings = EmbeddingResponse.model_validate(
+        embedding_response.model_dump(mode="json"))
+
+    assert embeddings.id is not None
+    assert len(embeddings.data) == 1
+    assert len(embeddings.data[0].embedding
+               ) == 384  # multilingual-e5-small embedding dimension
+    assert embeddings.usage.completion_tokens == 0
+    # Due to chunked processing, token count should
+    # reflect actual processed tokens
+    # With ~1500 words, we expect roughly
+    # 1024+ tokens (exceeding 512 token limit)
+    # Should exceed single chunk limit of 512
+    assert embeddings.usage.prompt_tokens > 800
+    assert embeddings.usage.total_tokens == embeddings.usage.prompt_tokens
+
+    # Verify embedding vector validity
+    embedding_vector = embeddings.data[0].embedding
+    assert all(
+        isinstance(x, float)
+        for x in embedding_vector), "Embedding vector should contain floats"
+    assert not all(
+        x == 0
+        for x in embedding_vector), "Embedding vector should not be all zeros"
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_long_text_embedding_2500_chars(
+        client_with_chunked_processing: openai.AsyncOpenAI, model_name: str):
+    """Test embedding processing for ~2500 character long text
+    (~2048 tokens, requiring multiple chunks)."""
+
+    # Verify text length
+    # Verify text has sufficient word count (approximately 2500 words)
+    word_count = len(LONG_TEXT_2500_WORDS.split())
+    assert word_count >= 2300, (
+        f"Test text word count insufficient: {word_count} words")
+
+    # Send embedding request
+    embedding_response = await client_with_chunked_processing.embeddings.create(
+        model=model_name,
+        input=[LONG_TEXT_2500_WORDS],
+        encoding_format="float",
+    )
+
+    # Verify response structure
+    embeddings = EmbeddingResponse.model_validate(
+        embedding_response.model_dump(mode="json"))
+
+    assert embeddings.id is not None
+    assert len(embeddings.data) == 1
+    assert len(embeddings.data[0].embedding
+               ) == 384  # multilingual-e5-small embedding dimension
+    assert embeddings.usage.completion_tokens == 0
+    # Due to chunked processing, token count should
+    # reflect actual processed tokens
+    # With ~2500 words, we expect
+    # roughly 2048+ tokens (requiring multiple chunks)
+    # Should require multiple chunks for processing
+    assert embeddings.usage.prompt_tokens > 1500
+    assert embeddings.usage.total_tokens == embeddings.usage.prompt_tokens
+
+    # Verify embedding vector validity
+    embedding_vector = embeddings.data[0].embedding
+    assert all(
+        isinstance(x, float)
+        for x in embedding_vector), "Embedding vector should contain floats"
+    assert not all(
+        x == 0
+        for x in embedding_vector), "Embedding vector should not be all zeros"
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_batch_long_text_embedding(
+        client_with_chunked_processing: openai.AsyncOpenAI, model_name: str):
+    """Test batch long text embedding processing."""
+
+    input_texts = [
+        LONG_TEXT_1500_WORDS,
+        LONG_TEXT_2500_WORDS,
+        "This is a short text test.",  # Short text for comparison
+    ]
+
+    # Send batch embedding request
+    embedding_response = await client_with_chunked_processing.embeddings.create(
+        model=model_name,
+        input=input_texts,
+        encoding_format="float",
+    )
+
+    # Verify response structure
+    embeddings = EmbeddingResponse.model_validate(
+        embedding_response.model_dump(mode="json"))
+
+    assert embeddings.id is not None
+    assert len(embeddings.data) == 3  # Three input texts
+
+    # Verify each embedding dimension
+    for i, embedding_data in enumerate(embeddings.data):
+        assert len(embedding_data.embedding) == 384
+        assert embedding_data.index == i
+
+        # Verify embedding vector validity
+        embedding_vector = embedding_data.embedding
+        assert all(isinstance(x, float) for x in embedding_vector)
+        assert not all(x == 0 for x in embedding_vector)
+
+    # Verify token usage
+    assert embeddings.usage.completion_tokens == 0
+    # Total token count should be very substantial
+    assert embeddings.usage.prompt_tokens > 1000
+    assert embeddings.usage.total_tokens == embeddings.usage.prompt_tokens
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_chunked_vs_normal_consistency(
+        client_with_chunked_processing: openai.AsyncOpenAI, model_name: str):
+    """Test consistency between chunked and
+    normal processing (using short text)."""
+
+    # Use a short text within the 512 token limit
+    short_text = ("Artificial intelligence technology is changing our world, "
+                  "bringing unprecedented opportunities and challenges.")
+
+    # Send embedding request
+    embedding_response = await client_with_chunked_processing.embeddings.create(
+        model=model_name,
+        input=[short_text],
+        encoding_format="float",
+    )
+
+    # Verify response structure
+    embeddings = EmbeddingResponse.model_validate(
+        embedding_response.model_dump(mode="json"))
+
+    assert embeddings.id is not None
+    assert len(embeddings.data) == 1
+    assert len(embeddings.data[0].embedding) == 384
+    assert embeddings.usage.completion_tokens == 0
+    # Short text should not require chunked processing
+    assert embeddings.usage.prompt_tokens < 512
+    assert embeddings.usage.total_tokens == embeddings.usage.prompt_tokens
+
+    # 验证embedding向量的有效性
+    embedding_vector = embeddings.data[0].embedding
+    assert all(isinstance(x, float) for x in embedding_vector)
+    assert not all(x == 0 for x in embedding_vector)
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_chunked_processing_response_format(
+        client_with_chunked_processing: openai.AsyncOpenAI, model_name: str):
+    """Test response format and structure during chunked processing."""
+
+    # Test with long text to trigger chunking
+    embedding_response = await client_with_chunked_processing.embeddings.create(
+        model=model_name,
+        input=[LONG_TEXT_1500_WORDS],
+        encoding_format="float",
+    )
+
+    # Verify response structure
+    embeddings = EmbeddingResponse.model_validate(
+        embedding_response.model_dump(mode="json"))
+
+    assert embeddings.id is not None
+    assert len(embeddings.data) == 1
+    assert embeddings.data[0].object == "embedding"
+    assert embeddings.data[0].index == 0
+
+    # Verify embedding vector properties
+    embedding_vector = embeddings.data[0].embedding
+    import math
+    vector_norm = math.sqrt(sum(x * x for x in embedding_vector))
+    # Check that the vector is normalized
+    # (default behavior for most embedding models)
+    assert 0.8 < vector_norm < 1.2, (
+        f"Vector norm should be reasonable, actual: {vector_norm}")
--- a/tests/entrypoints/openai/test_openai_schema.py
+++ b/tests/entrypoints/openai/test_openai_schema.py
@ -54,38 +54,54 @@ def before_generate_case(context: schemathesis.hooks.HookContext, strategy):
    op = context.operation
    assert op is not None

-    def no_file_type(case: schemathesis.models.Case):
+    def no_invalid_types(case: schemathesis.models.Case):
        """
-        This filter skips test cases for the `POST /tokenize` endpoint where the
-        HTTP request body uses `"type": "file"` in any message's content.
-        We expect these cases to fail because that type isn't implemented here
-        https://github.com/vllm-project/vllm/blob/0b34593017953051b3225b1483ce0f4670e3eb0e/vllm/entrypoints/chat_utils.py#L1038-L1095
+        This filter skips test cases with invalid data that schemathesis
+        incorrectly generates due to permissive schema configurations.
+        
+        1. Skips `POST /tokenize` endpoint cases with `"type": "file"` in 
+           message content, which isn't implemented.
+        
+        2. Skips tool_calls with `"type": "custom"` which schemathesis 
+           incorrectly generates instead of the valid `"type": "function"`.

        Example test cases that are skipped:
        curl -X POST -H 'Content-Type: application/json' \
-            -d '{"messages": [{"role": "assistant"}, {"content": [{"file": {}, "type": "file"}], "role": "user"}]}' \
+            -d '{"messages": [{"content": [{"file": {}, "type": "file"}], "role": "user"}]}' \
            http://localhost:8000/tokenize

        curl -X POST -H 'Content-Type: application/json' \
-            -d '{"messages": [{"content": [{"file": {}, "type": "file"}], "role": "user"}]}' \
-            http://localhost:8000/tokenize
+            -d '{"messages": [{"role": "assistant", "tool_calls": [{"custom": {"input": "", "name": ""}, "id": "", "type": "custom"}]}]}' \
+            http://localhost:8000/v1/chat/completions
        """  # noqa: E501
-        if (op.method.lower() == "post" and op.path == "/tokenize"
-                and hasattr(case, "body") and isinstance(case.body, dict)
+        if (hasattr(case, "body") and isinstance(case.body, dict)
                and "messages" in case.body
                and isinstance(case.body["messages"], list)
                and len(case.body["messages"]) > 0):
+
            for message in case.body["messages"]:
                if not isinstance(message, dict):
                    continue
-                content = message.get("content", [])
-                if not isinstance(content, list) or len(content) == 0:
-                    continue
-                if any(item.get("type") == "file" for item in content):
-                    return False
+
+                # Check for invalid file type in tokenize endpoint
+                if op.method.lower() == "post" and op.path == "/tokenize":
+                    content = message.get("content", [])
+                    if (isinstance(content, list) and len(content) > 0 and any(
+                            item.get("type") == "file" for item in content)):
+                        return False
+
+                # Check for invalid tool_calls with non-function types
+                tool_calls = message.get("tool_calls", [])
+                if isinstance(tool_calls, list):
+                    for tool_call in tool_calls:
+                        if isinstance(tool_call, dict):
+                            if tool_call.get("type") != "function":
+                                return False
+                            if "custom" in tool_call:
+                                return False
        return True

-    return strategy.filter(no_file_type)
+    return strategy.filter(no_invalid_types)


@schema.parametrize()
--- a/tests/entrypoints/openai/test_response_api_with_harmony.py
+++ b/tests/entrypoints/openai/test_response_api_with_harmony.py
@ -0,0 +1,624 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import json
+import time
+
+import pytest
+import pytest_asyncio
+import requests
+from openai import BadRequestError, NotFoundError, OpenAI
+
+from ...utils import RemoteOpenAIServer
+
+pytest.skip(allow_module_level=True, reason="gpt-oss can't run on CI yet.")
+
+MODEL_NAME = "openai/gpt-oss-20b"
+DTYPE = "bfloat16"
+
+
+@pytest.fixture(scope="module")
+def server():
+    args = ["--enforce-eager", "--tool-server", "demo"]
+
+    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+        yield remote_server
+
+
+@pytest_asyncio.fixture
+async def client(server):
+    async with server.get_async_client() as async_client:
+        yield async_client
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_basic(client: OpenAI, model_name: str):
+    response = await client.responses.create(
+        model=model_name,
+        input="What is 13 * 24?",
+    )
+    assert response is not None
+    print("response: ", response)
+    assert response.status == "completed"
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_basic_with_instructions(client: OpenAI, model_name: str):
+    response = await client.responses.create(
+        model=model_name,
+        input="What is 13 * 24?",
+        instructions="Respond in Korean.",
+    )
+    assert response is not None
+    assert response.status == "completed"
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_basic_with_reasoning_effort(client: OpenAI, model_name: str):
+    response = await client.responses.create(
+        model=model_name,
+        input="What is the capital of South Korea?",
+        reasoning={"effort": "low"},
+    )
+    assert response is not None
+    assert response.status == "completed"
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_chat(client: OpenAI, model_name: str):
+    response = await client.responses.create(
+        model=model_name,
+        input=[
+            {
+                "role": "system",
+                "content": "Respond in Korean."
+            },
+            {
+                "role": "user",
+                "content": "Hello!"
+            },
+            {
+                "role": "assistant",
+                "content": "Hello! How can I help you today?"
+            },
+            {
+                "role": "user",
+                "content": "What is 13 * 24? Explain your answer."
+            },
+        ],
+    )
+    assert response is not None
+    assert response.status == "completed"
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_chat_with_input_type(client: OpenAI, model_name: str):
+    response = await client.responses.create(
+        model=model_name,
+        input=[
+            {
+                "role": "user",
+                "content": [{
+                    "type": "input_text",
+                    "text": "What is 13*24?"
+                }],
+            },
+        ],
+    )
+    assert response is not None
+    assert response.status == "completed"
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_structured_output(client: OpenAI, model_name: str):
+    response = await client.responses.create(
+        model=model_name,
+        input=[
+            {
+                "role": "system",
+                "content": "Extract the event information."
+            },
+            {
+                "role": "user",
+                "content":
+                "Alice and Bob are going to a science fair on Friday.",
+            },
+        ],
+        text={
+            "format": {
+                "type": "json_schema",
+                "name": "calendar_event",
+                "schema": {
+                    "type": "object",
+                    "properties": {
+                        "name": {
+                            "type": "string"
+                        },
+                        "date": {
+                            "type": "string"
+                        },
+                        "participants": {
+                            "type": "array",
+                            "items": {
+                                "type": "string"
+                            }
+                        },
+                    },
+                    "required": ["name", "date", "participants"],
+                    "additionalProperties": False,
+                },
+                "description": "A calendar event.",
+                "strict": True,
+            }
+        },
+    )
+    assert response is not None
+    assert response.status == "completed"
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_structured_output_with_parse(client: OpenAI, model_name: str):
+    from pydantic import BaseModel
+
+    class CalendarEvent(BaseModel):
+        name: str
+        date: str
+        participants: list[str]
+
+    response = await client.responses.parse(
+        model=model_name,
+        input="Alice and Bob are going to a science fair on Friday",
+        instructions="Extract the event information",
+        text_format=CalendarEvent,
+    )
+    assert response is not None
+    assert response.status == "completed"
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_store(client: OpenAI, model_name: str):
+    for store in [True, False]:
+        response = await client.responses.create(
+            model=model_name,
+            input="What is 13 * 24?",
+            store=store,
+        )
+        assert response is not None
+
+        try:
+            _retrieved_response = await client.responses.retrieve(response.id)
+            is_not_found = False
+        except NotFoundError:
+            is_not_found = True
+
+        assert is_not_found == (not store)
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_background(client: OpenAI, model_name: str):
+    response = await client.responses.create(
+        model=model_name,
+        input="What is 13 * 24?",
+        background=True,
+    )
+    assert response is not None
+
+    retries = 0
+    max_retries = 30
+    while retries < max_retries:
+        response = await client.responses.retrieve(response.id)
+        if response.status == "completed":
+            break
+        time.sleep(1)
+        retries += 1
+
+    assert response.status == "completed"
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_background_cancel(client: OpenAI, model_name: str):
+    response = await client.responses.create(
+        model=model_name,
+        input="Write a long story about a cat.",
+        background=True,
+    )
+    assert response is not None
+    time.sleep(1)
+
+    cancelled_response = await client.responses.cancel(response.id)
+    assert cancelled_response is not None
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_stateful_multi_turn(client: OpenAI, model_name: str):
+    response1 = await client.responses.create(
+        model=model_name,
+        input="What is 13 * 24?",
+    )
+    assert response1 is not None
+    assert response1.status == "completed"
+
+    response2 = await client.responses.create(
+        model=model_name,
+        input="What if I increase both numbers by 1?",
+        previous_response_id=response1.id,
+    )
+    assert response2 is not None
+    assert response2.status == "completed"
+
+    response3 = await client.responses.create(
+        model=model_name,
+        input="Divide the result by 2.",
+        previous_response_id=response2.id,
+    )
+    assert response3 is not None
+    assert response3.status == "completed"
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_streaming(client: OpenAI, model_name: str):
+    prompts = [
+        "tell me a story about a cat in 20 words",
+        "What is 13 * 24? Use python to calculate the result.",
+        "When did Jensen found NVIDIA? Search it and answer the year only.",
+    ]
+
+    for prompt in prompts:
+        response = await client.responses.create(
+            model=model_name,
+            input=prompt,
+            reasoning={"effort": "low"},
+            tools=[
+                {
+                    "type": "web_search_preview"
+                },
+                {
+                    "type": "code_interpreter",
+                    "container": {
+                        "type": "auto"
+                    }
+                },
+            ],
+            stream=True,
+        )
+
+        events = []
+        current_event_mode = None
+        async for event in response:
+            if current_event_mode != event.type:
+                current_event_mode = event.type
+                print(f"\n[{event.type}] ", end="", flush=True)
+
+            if "text.delta" in event.type:
+                print(event.delta, end="", flush=True)
+            elif "reasoning_text.delta" in event.type:
+                print(f"{event.delta}", end="", flush=True)
+            elif "response.code_interpreter_call_code.done" in event.type:
+                print(f"Code: {event.code}", end="", flush=True)
+            elif ("response.output_item.added" in event.type
+                  and event.item.type == "web_search_call"):
+                print(f"Web search: {event.item.action}", end="", flush=True)
+            events.append(event)
+
+        assert len(events) > 0
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_web_search(client: OpenAI, model_name: str):
+    response = await client.responses.create(
+        model=model_name,
+        input="Who is the president of South Korea as of now?",
+        tools=[{
+            "type": "web_search_preview"
+        }],
+    )
+    assert response is not None
+    assert response.status == "completed"
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_code_interpreter(client: OpenAI, model_name: str):
+    response = await client.responses.create(
+        model=model_name,
+        input="Multiply 64548*15151 using builtin python interpreter.",
+        tools=[{
+            "type": "code_interpreter",
+            "container": {
+                "type": "auto"
+            }
+        }],
+    )
+    assert response is not None
+    assert response.status == "completed"
+
+
+def get_weather(latitude, longitude):
+    response = requests.get(
+        f"https://api.open-meteo.com/v1/forecast?latitude={latitude}&longitude={longitude}&current=temperature_2m,wind_speed_10m&hourly=temperature_2m,relative_humidity_2m,wind_speed_10m"  # noqa
+    )
+    data = response.json()
+    return data["current"]["temperature_2m"]
+
+
+def get_place_to_travel():
+    return "Paris"
+
+
+def call_function(name, args):
+    if name == "get_weather":
+        return get_weather(**args)
+    elif name == "get_place_to_travel":
+        return get_place_to_travel()
+    else:
+        raise ValueError(f"Unknown function: {name}")
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_function_calling(client: OpenAI, model_name: str):
+    tools = [{
+        "type": "function",
+        "name": "get_weather",
+        "description":
+        "Get current temperature for provided coordinates in celsius.",  # noqa
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "latitude": {
+                    "type": "number"
+                },
+                "longitude": {
+                    "type": "number"
+                },
+            },
+            "required": ["latitude", "longitude"],
+            "additionalProperties": False,
+        },
+        "strict": True,
+    }]
+
+    response = await client.responses.create(
+        model=model_name,
+        input="What's the weather like in Paris today?",
+        tools=tools,
+    )
+    assert response is not None
+    assert response.status == "completed"
+    assert len(response.output) == 2
+    assert response.output[0].type == "reasoning"
+    assert response.output[1].type == "function_call"
+
+    tool_call = response.output[1]
+    name = tool_call.name
+    args = json.loads(tool_call.arguments)
+
+    result = call_function(name, args)
+
+    response_2 = await client.responses.create(
+        model=model_name,
+        input=[{
+            "type": "function_call_output",
+            "call_id": tool_call.call_id,
+            "output": str(result),
+        }],
+        tools=tools,
+        previous_response_id=response.id,
+    )
+    assert response_2 is not None
+    assert response_2.status == "completed"
+    assert response_2.output_text is not None
+
+    # NOTE: chain-of-thought should be removed.
+    response_3 = await client.responses.create(
+        model=model_name,
+        input="What's the weather like in Paris today?",
+        tools=tools,
+        previous_response_id=response_2.id,
+    )
+    assert response_3 is not None
+    assert response_3.status == "completed"
+    assert response_3.output_text is not None
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_function_calling_multi_turn(client: OpenAI, model_name: str):
+    tools = [
+        {
+            "type": "function",
+            "name": "get_place_to_travel",
+            "description": "Get a random place to travel",
+            "parameters": {
+                "type": "object",
+                "properties": {},
+                "required": [],
+                "additionalProperties": False,
+            },
+            "strict": True,
+        },
+        {
+            "type": "function",
+            "name": "get_weather",
+            "description":
+            "Get current temperature for provided coordinates in celsius.",  # noqa
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "latitude": {
+                        "type": "number"
+                    },
+                    "longitude": {
+                        "type": "number"
+                    },
+                },
+                "required": ["latitude", "longitude"],
+                "additionalProperties": False,
+            },
+            "strict": True,
+        },
+    ]
+
+    response = await client.responses.create(
+        model=model_name,
+        input=
+        "Help me plan a trip to a random place. And tell me the weather there.",
+        tools=tools,
+    )
+    assert response is not None
+    assert response.status == "completed"
+    assert len(response.output) == 2
+    assert response.output[0].type == "reasoning"
+    assert response.output[1].type == "function_call"
+
+    tool_call = response.output[1]
+    name = tool_call.name
+    args = json.loads(tool_call.arguments)
+
+    result = call_function(name, args)
+
+    response_2 = await client.responses.create(
+        model=model_name,
+        input=[{
+            "type": "function_call_output",
+            "call_id": tool_call.call_id,
+            "output": str(result),
+        }],
+        tools=tools,
+        previous_response_id=response.id,
+    )
+    assert response_2 is not None
+    assert response_2.status == "completed"
+    assert len(response_2.output) == 2
+    assert response_2.output[0].type == "reasoning"
+    assert response_2.output[1].type == "function_call"
+
+    tool_call = response_2.output[1]
+    name = tool_call.name
+    args = json.loads(tool_call.arguments)
+
+    result = call_function(name, args)
+
+    response_3 = await client.responses.create(
+        model=model_name,
+        input=[{
+            "type": "function_call_output",
+            "call_id": tool_call.call_id,
+            "output": str(result),
+        }],
+        tools=tools,
+        previous_response_id=response_2.id,
+    )
+    assert response_3 is not None
+    assert response_3.status == "completed"
+    assert response_3.output_text is not None
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_function_calling_required(client: OpenAI, model_name: str):
+    tools = [{
+        "type": "function",
+        "name": "get_weather",
+        "description":
+        "Get current temperature for provided coordinates in celsius.",  # noqa
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "latitude": {
+                    "type": "number"
+                },
+                "longitude": {
+                    "type": "number"
+                },
+            },
+            "required": ["latitude", "longitude"],
+            "additionalProperties": False,
+        },
+        "strict": True,
+    }]
+
+    with pytest.raises(BadRequestError):
+        await client.responses.create(
+            model=model_name,
+            input="What's the weather like in Paris today?",
+            tools=tools,
+            tool_choice="required",
+        )
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_function_calling_full_history(client: OpenAI, model_name: str):
+    tools = [{
+        "type": "function",
+        "name": "get_weather",
+        "description":
+        "Get current temperature for provided coordinates in celsius.",  # noqa
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "latitude": {
+                    "type": "number"
+                },
+                "longitude": {
+                    "type": "number"
+                },
+            },
+            "required": ["latitude", "longitude"],
+            "additionalProperties": False,
+        },
+        "strict": True,
+    }]
+
+    input_messages = [{
+        "role": "user",
+        "content": "What's the weather like in Paris today?"
+    }]
+
+    response = await client.responses.create(
+        model=model_name,
+        input=input_messages,
+        tools=tools,
+    )
+
+    assert response is not None
+    assert response.status == "completed"
+
+    tool_call = response.output[-1]
+    name = tool_call.name
+    args = json.loads(tool_call.arguments)
+
+    result = call_function(name, args)
+
+    input_messages.extend(
+        response.output)  # append model's function call message
+    input_messages.append(
+        {  # append result message
+            "type": "function_call_output",
+            "call_id": tool_call.call_id,
+            "output": str(result),
+        }
+    )
+
+    response_2 = await client.responses.create(
+        model=model_name,
+        input=input_messages,
+        tools=tools,
+    )
+    assert response_2 is not None
+    assert response_2.status == "completed"
+    assert response_2.output_text is not None
--- a/tests/entrypoints/openai/test_transcription_validation.py
+++ b/tests/entrypoints/openai/test_transcription_validation.py
@ -80,9 +80,6 @@ async def test_bad_requests(mary_had_lamb):
 async def test_long_audio_request(mary_had_lamb, model_name):
    server_args = ["--enforce-eager"]

-    if model_name.startswith("openai"):
-        return
-
    mary_had_lamb.seek(0)
    audio, sr = librosa.load(mary_had_lamb)
    # Add small silence after each audio for repeatability in the split process
--- a/tests/kernels/attention/test_aiter_flash_attn.py
+++ b/tests/kernels/attention/test_aiter_flash_attn.py
@ -9,10 +9,10 @@ import torch
 import vllm.v1.attention.backends.rocm_aiter_fa  # noqa: F401
 from vllm.platforms import current_platform

-NUM_HEADS = [(4, 4), (8, 2), (16, 2)]
+NUM_HEADS = [(4, 4), (8, 2)]
 HEAD_SIZES = [128, 256]
-BLOCK_SIZES = [16, 32]
-DTYPES = [torch.float16, torch.bfloat16]
+BLOCK_SIZES = [16]
+DTYPES = [torch.bfloat16]
 QDTYPES = [None]
 # one value large enough to test overflow in index calculation.
 # one value small enough to test the schema op check
--- a/tests/kernels/attention/test_attention.py
+++ b/tests/kernels/attention/test_attention.py
@ -29,17 +29,14 @@ MAX_SEQ_LEN = get_max_shared_memory_bytes() // FLOAT32_BYTES - 512
 NUM_BLOCKS = 4321  # Arbitrary values for testing
 PARTITION_SIZE = 512
 PARTITION_SIZE_ROCM = 256
-# flshattF and tritonflashattF supported: {torch.float16, torch.bfloat16}
-DTYPES = [
-    torch.half, torch.bfloat16, torch.float
-] if not current_platform.is_rocm() else [torch.half, torch.bfloat16]
+DTYPES = [torch.bfloat16]
 NUM_GEN_SEQS = [7]  # Arbitrary values for testing
 NUM_PREFILL_SEQS = [3]  # Arbitrary values for testing
 NUM_HEADS = [(40, 40), (64, 8)]  # Arbitrary values for testing

 # This should be sync with get_supported_head_sizes() in
 # vllm.attention.ops.paged_attn.PagedAttention
-HEAD_SIZES = [32, 64, 80, 96, 112, 120, 128, 192, 256]
+HEAD_SIZES = [32, 80, 128, 256]

 BLOCK_SIZES = [16, 32]
 USE_ALIBI = [False, True]
--- a/tests/kernels/attention/test_cache.py
+++ b/tests/kernels/attention/test_cache.py
@ -11,11 +11,11 @@ from vllm import _custom_ops as ops
 from vllm.platforms import current_platform

 COPYING_DIRECTION = [('cuda', 'cpu'), ('cuda', 'cuda'), ('cpu', 'cuda')]
-DTYPES = [torch.half, torch.bfloat16, torch.float]
+DTYPES = [torch.bfloat16, torch.float]
 NUM_TOKENS = [42]  # Arbitrary values for testing
 NUM_LAYERS = [1]  # Arbitrary values for testing
 NUM_HEADS = [8]  # Arbitrary values for testing
-HEAD_SIZES = [64, 80, 120, 256]
+HEAD_SIZES = [64, 80, 256]
 BLOCK_SIZES = [8, 16, 32]
 CACHE_LAYOUTS = ["NHD", "HND"]

--- a/tests/kernels/attention/test_flash_attn.py
+++ b/tests/kernels/attention/test_flash_attn.py
@ -12,14 +12,16 @@ from vllm.vllm_flash_attn import (fa_version_unsupported_reason,
                                  flash_attn_with_kvcache,
                                  is_fa_version_supported)

-NUM_HEADS = [(4, 4), (8, 2), (16, 2)]
+NUM_HEADS = [(4, 4), (8, 2)]
 HEAD_SIZES = [128, 256]
-BLOCK_SIZES = [16, 32]
-DTYPES = [torch.float16, torch.bfloat16]
+BLOCK_SIZES = [16]
+DTYPES = [torch.bfloat16]
 QDTYPES = [None, torch.float8_e4m3fn]
 # one value large enough to test overflow in index calculation.
 # one value small enough to test the schema op check
 NUM_BLOCKS = [32768, 2048]
+SOFT_CAPS = [None, 50.0]
+SLIDING_WINDOWS = [None, 256]


 def ref_paged_attn(
@ -83,9 +85,9 @@ def ref_paged_attn(
@pytest.mark.parametrize("head_size", HEAD_SIZES)
@pytest.mark.parametrize("block_size", BLOCK_SIZES)
@pytest.mark.parametrize("dtype", DTYPES)
-@pytest.mark.parametrize("soft_cap", [None, 10.0, 50.0])
+@pytest.mark.parametrize("soft_cap", SOFT_CAPS)
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
-@pytest.mark.parametrize("sliding_window", [None, 256])
+@pytest.mark.parametrize("sliding_window", SLIDING_WINDOWS)
@pytest.mark.parametrize("fa_version", [2, 3])
@pytest.mark.parametrize("q_dtype", QDTYPES)
@torch.inference_mode()
@ -198,9 +200,9 @@ def test_flash_attn_with_paged_kv(
@pytest.mark.parametrize("num_heads", NUM_HEADS)
@pytest.mark.parametrize("head_size", HEAD_SIZES)
@pytest.mark.parametrize("block_size", BLOCK_SIZES)
-@pytest.mark.parametrize("sliding_window", [None, 256])
+@pytest.mark.parametrize("sliding_window", SLIDING_WINDOWS)
@pytest.mark.parametrize("dtype", DTYPES)
-@pytest.mark.parametrize("soft_cap", [None, 10.0, 50.0])
+@pytest.mark.parametrize("soft_cap", SOFT_CAPS)
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
@pytest.mark.parametrize("fa_version", [2, 3])
@pytest.mark.parametrize("q_dtype", QDTYPES)
--- a/tests/kernels/attention/test_flashinfer.py
+++ b/tests/kernels/attention/test_flashinfer.py
@ -9,11 +9,13 @@ import torch

 from vllm.platforms import current_platform

-NUM_HEADS = [(16, 16), (32, 8), (64, 8), (6, 1)]
+NUM_HEADS = [(32, 8), (6, 1)]
 HEAD_SIZES = [128, 256]
 BLOCK_SIZES = [16, 32]
-DTYPES = [torch.float16, torch.bfloat16]
+DTYPES = [torch.bfloat16]
 NUM_BLOCKS = 32768  # Large enough to test overflow in index calculation.
+SOFT_CAPS = [None, 30.0]
+SLIDING_WINDOWS = [None, 64]


 def ref_paged_attn(
@ -76,8 +78,8 @@ def ref_paged_attn(
@pytest.mark.parametrize("head_size", HEAD_SIZES)
@pytest.mark.parametrize("block_size", BLOCK_SIZES)
@pytest.mark.parametrize("dtype", DTYPES)
-@pytest.mark.parametrize("soft_cap", [None, 30.0, 50.0])
-@pytest.mark.parametrize("sliding_window", [None, 64])
+@pytest.mark.parametrize("soft_cap", SOFT_CAPS)
+@pytest.mark.parametrize("sliding_window", SLIDING_WINDOWS)
@torch.inference_mode
 def test_flashinfer_decode_with_paged_kv(
    kv_lens: list[int],
@ -173,8 +175,8 @@ def test_flashinfer_decode_with_paged_kv(
@pytest.mark.parametrize("head_size", HEAD_SIZES)
@pytest.mark.parametrize("block_size", BLOCK_SIZES)
@pytest.mark.parametrize("dtype", DTYPES)
-@pytest.mark.parametrize("soft_cap", [None, 30.0, 50.0])
-@pytest.mark.parametrize("sliding_window", [None, 64])
+@pytest.mark.parametrize("soft_cap", SOFT_CAPS)
+@pytest.mark.parametrize("sliding_window", SLIDING_WINDOWS)
@torch.inference_mode
 def test_flashinfer_prefill_with_paged_kv(
    seq_lens: list[tuple[int, int]],
@ -278,11 +280,11 @@ def test_flashinfer_prefill_with_paged_kv(


@pytest.mark.parametrize("seq_lens", [[(1, 132), (5, 18)]])
-@pytest.mark.parametrize("num_heads", [(32, 8), (6, 1)])
+@pytest.mark.parametrize("num_heads", NUM_HEADS)
@pytest.mark.parametrize("head_size", HEAD_SIZES)
@pytest.mark.parametrize("block_size", BLOCK_SIZES)
@pytest.mark.parametrize("dtype", DTYPES)
-@pytest.mark.parametrize("soft_cap", [None, 30.0, 50.0])
+@pytest.mark.parametrize("soft_cap", SOFT_CAPS)
 def test_flashinfer_prefill_with_paged_fp8_kv(
        seq_lens: list[tuple[int, int]], num_heads: tuple[int, int],
        head_size: int, dtype: torch.dtype, block_size: int,
@ -385,11 +387,12 @@ def test_flashinfer_prefill_with_paged_fp8_kv(


@pytest.mark.parametrize("kv_lens", [[1328, 18, 463], [1, 54, 293, 70]])
-@pytest.mark.parametrize("num_heads", [(32, 8), (64, 8), (6, 1)])
+@pytest.mark.parametrize("num_heads", NUM_HEADS)
@pytest.mark.parametrize("head_size", HEAD_SIZES)
@pytest.mark.parametrize("block_size", BLOCK_SIZES)
@pytest.mark.parametrize("dtype", DTYPES)
-@pytest.mark.parametrize("soft_cap", [None, 30.0, 50.0])
+@pytest.mark.parametrize("soft_cap", SOFT_CAPS)
+@pytest.mark.skip(reason="TODO: fix the accuracy issue")
@torch.inference_mode
 def test_flashinfer_decode_with_paged_fp8_kv(
    kv_lens: list[int],
@ -399,7 +402,6 @@ def test_flashinfer_decode_with_paged_fp8_kv(
    block_size: int,
    soft_cap: Optional[float],
 ) -> None:
-    pytest.skip("TODO: fix the accuracy issue")
    # test doesn't work for num_heads = (16,16)
    torch.set_default_device("cuda")
    current_platform.seed_everything(0)
--- a/tests/kernels/attention/test_flashinfer_trtllm_attention.py
+++ b/tests/kernels/attention/test_flashinfer_trtllm_attention.py
@ -20,11 +20,11 @@ FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
 MAX_Q_LEN = 1024
 MAX_KV_LEN = 4096
 BATCH_SIZES = [4, 12]
-NUM_HEADS = [(64, 8), (16, 16), (40, 8), (32, 8)]
+NUM_HEADS = [(16, 16), (40, 8)]
 HEAD_SIZES = [128]
-BLOCK_SIZES = [16, 32]
+BLOCK_SIZES = [16]
 KV_LAYOUTS = ["HND"]
-DTYPES = [torch.float16, torch.bfloat16]
+DTYPES = [torch.bfloat16]
 KV_CACHE_DTYPES = [None, current_platform.fp8_dtype()]
 NUM_BLOCKS = 32768  # Large enough to test overflow in index calculation.
 SOFT_CAPS = [None, 50.0]
--- a/tests/kernels/attention/test_flashmla.py
+++ b/tests/kernels/attention/test_flashmla.py
@ -35,11 +35,10 @@ FLASH_MLA_UNSUPPORTED_REASON = is_flashmla_supported()[1] \
@pytest.mark.parametrize("block_size", [64])
@pytest.mark.parametrize("causal", [True])
@pytest.mark.parametrize("varlen", [False, True])
+@pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16])
@torch.inference_mode()
 def test_flash_mla(b, s_q, mean_sk, h_q, h_kv, d, dv, block_size, causal,
-                   varlen):
-    # TODO: parametrize using pytest
-    dtype = torch.bfloat16
+                   varlen, dtype):
    device = torch.device("cuda:0")
    torch.set_default_dtype(dtype)
    torch.set_default_device(device)
@ -48,7 +47,7 @@ def test_flash_mla(b, s_q, mean_sk, h_q, h_kv, d, dv, block_size, causal,
    random.seed(0)

    print(f"{b=}, {s_q=}, {mean_sk=}, {h_q=}, {h_kv=}, "
-          f"{d=}, {dv=}, {causal=}, {varlen=}")
+          f"{d=}, {dv=}, {causal=}, {varlen=}, {dtype=}")

    cache_seqlens = torch.full((b, ), mean_sk, dtype=torch.int32)
    if varlen:
--- a/tests/kernels/attention/test_prefix_prefill.py
+++ b/tests/kernels/attention/test_prefix_prefill.py
@ -19,13 +19,13 @@ from vllm.platforms import current_platform
 from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE

 NUM_HEADS = [64]
-NUM_QUERIES_PER_KV = [1, 8, 64]
-HEAD_SIZES = [128, 96, 24]
+NUM_QUERIES_PER_KV = [1, 64]
+HEAD_SIZES = [24, 128]
 DTYPES = [torch.float16]
 CUDA_DEVICES = [
    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
 ]
-SLIDING_WINDOW = [0, 16, 64, 128, 256, 512, 2048]
+SLIDING_WINDOW = [0, 16, 2048]
 KV_CACHE_DTYPES = ["auto", "fp8", "fp8_e5m2"]

 OPS = [chunked_prefill_paged_decode, context_attention_fwd]
--- a/tests/kernels/attention/test_triton_unified_attention.py
+++ b/tests/kernels/attention/test_triton_unified_attention.py
@ -9,11 +9,11 @@ import torch
 from vllm.attention.ops.triton_unified_attention import unified_attention
 from vllm.platforms import current_platform

-NUM_HEADS = [(4, 4), (8, 2), (16, 2)]
+NUM_HEADS = [(4, 4), (8, 2)]
 HEAD_SIZES = [128, 256]
-BLOCK_SIZES = [16, 32]
+BLOCK_SIZES = [16]

-DTYPES = [torch.float16, torch.bfloat16]
+DTYPES = [torch.bfloat16]
 QDTYPES = [None, torch.float8_e4m3fn] if not current_platform.is_rocm() else [
    None, torch.float8_e4m3fnuz
 ]
@ -85,7 +85,7 @@ def ref_paged_attn(
@pytest.mark.parametrize("block_size", BLOCK_SIZES)
@pytest.mark.parametrize("sliding_window", [None, 256])
@pytest.mark.parametrize("dtype", DTYPES)
-@pytest.mark.parametrize("soft_cap", [None, 10.0, 50.0])
+@pytest.mark.parametrize("soft_cap", [None, 50.0])
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
@pytest.mark.parametrize("q_dtype", QDTYPES)
@torch.inference_mode()
--- a/tests/kernels/mamba/test_mamba_ssm_ssd.py
+++ b/tests/kernels/mamba/test_mamba_ssm_ssd.py
@ -9,7 +9,7 @@ from einops import rearrange, repeat
 from vllm.model_executor.layers.mamba.ops.ssd_combined import (
    mamba_chunk_scan_combined)
 from vllm.platforms import current_platform
-from vllm.v1.attention.backends.mamba_attn import (
+from vllm.v1.attention.backends.mamba2_attn import (
    _query_start_loc_to_chunk_indices_offsets)

 # Added by the IBM Team, 2024
@ -187,7 +187,7 @@ def generate_continuous_batched_examples(example_lens_by_batch,
                         [torch.float32, torch.float16, torch.bfloat16])
@pytest.mark.parametrize("n_heads", [3, 4, 11, 16, 32])
@pytest.mark.parametrize("d_head", [5, 8, 19, 32, 128])
-@pytest.mark.parametrize("seq_len_chunk_size", [(119, 17), (128, 32)])
+@pytest.mark.parametrize("seq_len_chunk_size", [(112, 16), (128, 32)])
 def test_mamba_chunk_scan_single_example(d_head, n_heads, seq_len_chunk_size,
                                         itype):

@ -253,15 +253,15 @@ def test_mamba_chunk_scan_single_example(d_head, n_heads, seq_len_chunk_size,
            (8, 8, 16, 32, 16),
        ]),  # mode examples with varied lengths

-        # odd chunk_size
-        (64, 29, 2, [(11, 4), (13, 23), (19, 22),
-                     (21, 15)]),  # irregular sizes
-
        # large-ish chunk_size (256)
        (64, 256, 1, [(5, ), (1, ), (1, ),
                      (1, )]),  # irregular sizes with small sequences
        (64, 256, 2, [(5, 30), (1, 2), (1, 2),
                      (1, 2)]),  # irregular sizes with small sequences
+
+        # we also need to test some large seqlen
+        # to catch errors with init states decay
+        (768, 128, 2, [(138, 225), (138, 225)]),
    ])
 def test_mamba_chunk_scan_cont_batch(d_head, n_heads, seq_len_chunk_size_cases,
                                     itype):
@ -271,10 +271,9 @@ def test_mamba_chunk_scan_cont_batch(d_head, n_heads, seq_len_chunk_size_cases,

    seqlen, chunk_size, num_examples, cases = seq_len_chunk_size_cases

-    # TODO: the irregular chunk size cases have some issues and require higher
-    # tolerance. This is to be invesigated
-    if chunk_size not in {8, 256}:
-        atol, rtol = 5e-1, 5e-1
+    # This test can have larger error for longer sequences
+    if seqlen > 256:
+        atol, rtol = 1e-2, 5e-3
    else:
        atol, rtol = 5e-3, 5e-3

--- a/Show More
+++ b/Show More