Compare commits

...

444 Commits

40464dbf34 rename
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-10-29 15:19:17 -07:00
981cc5fdbf Merge branch 'main' into wentao-batch-invariance-dp 2025-10-29 15:18:33 -07:00
d6704dd099 Fix MiniMax-M2 rmsnorm precision and remove useless code (#27627)
Signed-off-by: xuebi <xuebi@minimaxi.com>
Co-authored-by: xuebi <xuebi@minimaxi.com>
2025-10-29 21:01:05 +08:00
ecca3fee76 [Frontend] Add vllm bench sweep to CLI (#27639)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-29 05:59:48 -07:00
9a0d2f0d92 [CI/Build] Skip cpu offloading test on AMD (#27690)
Signed-off-by: zhewenli <zhewenli@meta.com>
2025-10-29 12:55:51 +00:00
ad3ec89532 [VLM] Add Qwen3-VL generation test (#25185)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Signed-off-by: Roger Wang <hey@rogerw.io>
Co-authored-by: Roger Wang <hey@rogerw.io>
2025-10-29 12:19:37 +00:00
3481e40743 [chore] Remove models weight on S3 logic (#27725)
Signed-off-by: kevin <kevin@anyscale.com>
2025-10-29 10:29:49 +00:00
5e72216d17 Feature/video support in random mm dataset (#25963)
Signed-off-by: Eugene Khvedchenia <ekhvedchenia@nvidia.com>
Signed-off-by: Eugene Khvedchenya <ekhvedchenia@nvidia.com>
Co-authored-by: Roger Wang <hey@rogerw.io>
2025-10-29 18:24:52 +08:00
1a33aacf82 [Misc] Raise error for missing video metadata in MultiModalDataParser (#27664)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Signed-off-by: Isotr0py <2037008807@qq.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-10-29 10:06:42 +00:00
7ba6aa8f56 [Fix] import get_kv_cache_torch_dtype error in LMCacheConnector integration (#27670)
Signed-off-by: KevinCheung2259 <2651309292@qq.com>
2025-10-29 10:03:54 +00:00
ab2eb27b74 [Frontend] [gpt-oss] Mcp type bug (#27689)
Signed-off-by: Jialin Ouyang <Jialin.Ouyang@gmail.com>
Signed-off-by: Alec Solder <alecs@fb.com>
Signed-off-by: Ye (Charlotte) Qi <yeq@meta.com>
Co-authored-by: Jialin Ouyang <Jialin.Ouyang@gmail.com>
Co-authored-by: Alec Solder <alecs@fb.com>
Co-authored-by: Ye (Charlotte) Qi <yeq@meta.com>
2025-10-29 10:01:32 +00:00
3c7fefdeba [Frontend] [gpt-oss] Tool json call parsing error retry (#27675)
Signed-off-by: Jialin Ouyang <Jialin.Ouyang@gmail.com>
Signed-off-by: Alec Solder <alecs@fb.com>
Signed-off-by: Ye (Charlotte) Qi <yeq@meta.com>
Co-authored-by: Jialin Ouyang <Jialin.Ouyang@gmail.com>
Co-authored-by: Alec Solder <alecs@fb.com>
Co-authored-by: Ye (Charlotte) Qi <yeq@meta.com>
2025-10-29 09:42:44 +00:00
1891cf605a [Bugfix] Fix modular kernel tests (#27707)
Signed-off-by: Bill Nell <bnell@redhat.com>
2025-10-29 16:14:33 +08:00
8df98c2161 [perf] Enable concurrent execution of "shared_experts" and "selected_experts" in qwen3-next (#27578)
Signed-off-by: zjy0516 <riverclouds.zhu@qq.com>
2025-10-29 08:12:54 +00:00
4fb8771cc0 [CI/Build] Move pre-commit only scripts to tools/pre_commit (#27657)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-29 08:04:33 +00:00
413ef7a3b4 [Speculators] Move tests + fix integration (#27308)
Signed-off-by: Dipika Sikka <dipikasikka1@gmail.com>
Signed-off-by: Rahul Tuli <rtuli@redhat.com>
Signed-off-by: rahul-tuli <rtuli@redhat.com>
Co-authored-by: Rahul Tuli <rtuli@redhat.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
2025-10-29 00:54:21 -07:00
8b62495076 [Bugfix] Fix non-contiguous tensor error in rocm_unquantized_gemm_impl (#27605)
Signed-off-by: zhewenli <zhewenli@meta.com>
2025-10-29 00:00:15 -07:00
83fd49b1fc [CI/Build][Bugfix]Fix Quantized Models Test on AMD (#27712)
Signed-off-by: zhewenli <zhewenli@meta.com>
2025-10-29 06:27:30 +00:00
a4a4f0f617 [KV Connector] Update lmcache connector with latest compatibility (#27681)
Signed-off-by: Samuel Shen <slshen@uchicago.edu>
Co-authored-by: Samuel Shen <slshen@uchicago.edu>
2025-10-29 05:38:37 +00:00
0d8161b075 [Model] Fix Qwen3VL and Qwen3Omni after torch.compile changes (#27705)
Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>
Signed-off-by: Roger Wang <hey@rogerw.io>
Co-authored-by: Roger Wang <hey@rogerw.io>
2025-10-29 05:28:20 +00:00
d2c33c397a [NIXL][XPU] update name of nixl wheel (#27631)
Signed-off-by: zhenwei-intel <zhenwei.liu@intel.com>
2025-10-29 12:43:29 +08:00
f6d5f5888c [Build] Revert triton_kernels requirements (#27659) 2025-10-28 21:07:09 -07:00
9007bf57e6 Revert "Install pre-built xformers-0.0.32.post2 built with pt-2.9.0" (#27714) 2025-10-28 20:58:01 -07:00
f257544709 Install pre-built xformers-0.0.32.post2 built with pt-2.9.0 (#27598)
Signed-off-by: Huy Do <huydhn@gmail.com>
Co-authored-by: Roger Wang <hey@rogerw.io>
2025-10-28 19:39:15 -07:00
0b51c9bd8b [Core] Early return in SlidingWindowManager.remove_skipped_blocks (#27673)
Signed-off-by: Jialin Ouyang <Jialin.Ouyang@gmail.com>
2025-10-29 01:32:33 +00:00
d3ab240f39 [Bug] Fix deepep low latency use nvlink by default (#27677)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-10-28 23:53:12 +00:00
94666612a9 [Misc][qwen2_5_vl][torch.compile] Enable supports_torch_compile on generic nn.Module and demonstrate speedup on Qwen Vision model (#23207)
Signed-off-by: Lucas Kabela <lucaskabela@meta.com>
Signed-off-by: Lucas Kabela <lucasakabela@gmail.com>
2025-10-28 22:36:43 +00:00
4fe5895361 [AsyncScheduling] Make async overlap work with logprobs (#27615)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-10-28 22:35:54 +00:00
b53a65fa46 update using skip if server is not up
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-10-28 14:58:07 -07:00
111faf1118 [Core] Scheduler: Publish connector events after output (#25875)
Signed-off-by: Or Ozeri <oro@il.ibm.com>
2025-10-28 21:01:33 +00:00
4f2a8d9d7f Merge branch 'main' into wentao-batch-invariance-dp
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-10-28 14:00:34 -07:00
6afc28a9ba [Test] Batch Invariant: Unit test using parameterized backend (#27478)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-10-28 13:51:35 -07:00
141e6a0505 [Misc] Make reorder batch also separate extends (#27367)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
2025-10-28 10:55:10 -07:00
130aa8cbcf Add load pattern configuration guide to benchmarks (#26886)
Signed-off-by: Matvei Pashkovskii <mpashkov@amd.com>
Signed-off-by: Matvei Pashkovskii <matvei.pashkovskii@amd.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-28 10:49:15 -07:00
e3d8186666 [compile] Add fallback path to AOT compile when serialization fails. (#27350)
Signed-off-by: zhxchen17 <zhxchen17@fb.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2025-10-28 12:54:26 -04:00
f5710ef02a [Misc] Make LayerBlockType a Literal instead of Enum (#27658)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-28 16:23:35 +00:00
a8c02fb5bf [Bugfix][CI] Fix v1 attention backend tests and add CI coverage (#26597)
Signed-off-by: Mohammad Miadh Angkad <MAngkad.BSDSBA2027@aim.edu>
Signed-off-by: Mohammad Miadh Angkad <mangkad.bsdsba2027@aim.edu>
Co-authored-by: Ye (Charlotte) Qi <yeq@meta.com>
2025-10-28 11:42:05 -04:00
02af36df36 [Bugfix] Fix allocation & free logic of SingleWriterShmRingBuffer (#27117)
Signed-off-by: Kero Liang <kerorek@outlook.com>
Signed-off-by: Roger Wang <hey@rogerw.io>
Co-authored-by: donglu <donglu@cohere.com>
Co-authored-by: Roger Wang <hey@rogerw.io>
2025-10-28 15:01:24 +00:00
e88bdd60d9 [FLA] Introduce Kimi Delta Attention (KDA) to vLLM (#27654)
Signed-off-by: lizhiyuan <lizhiyuan@moonshot.cn>
2025-10-28 22:56:28 +08:00
05e034f085 [nit]: Fix import for the lmcache integration (#27600)
Signed-off-by: Samuel Shen <slshen@uchicago.edu>
Co-authored-by: Samuel Shen <slshen@uchicago.edu>
2025-10-28 14:40:55 +00:00
936643a868 [BugFix] Also consider RAY_EXPERIMENTAL_NOSET_* when storing compilation cache (#27294)
Signed-off-by: Hollow Man <hollowman@opensuse.org>
2025-10-28 10:22:28 -04:00
b186149e8e [Bugfix][Frontend] validate arg priority in frontend LLM class before add request (#27596)
Signed-off-by: Junpu Fan <junpufan@gmail.com>
2025-10-28 14:02:43 +00:00
2abbd351ef [Core] Enable async scheduling for external_launcher mode (#27394)
Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
Co-authored-by: Zhuohan Li <zhuohan123@gmail.com>
2025-10-28 13:52:47 +00:00
446912d1cb fix: allow HuggingFace standard chat template params via **kwargs (#27622)
Signed-off-by: wangln19 <wanglinian@dev.wanglinian.msh-dev.svc.cluster.local>
Signed-off-by: wangln19 <96399074+wangln19@users.noreply.github.com>
Co-authored-by: wangln19 <wanglinian@dev.wanglinian.msh-dev.svc.cluster.local>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-10-28 21:12:34 +08:00
a00d6254e9 [compile] Disable dynamo guards check for AOT compilation. (#27288)
Signed-off-by: zhxchen17 <zhxchen17@fb.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2025-10-28 12:58:12 +00:00
05181cc57f [Hybrid] Add mamba_block_size to Engine Args (#27289)
Signed-off-by: asafg <39553475+Josephasafg@users.noreply.github.com>
2025-10-28 12:54:24 +00:00
259504e147 [compile] Add enable_prompt_embeds to compile hash. (#27285)
Signed-off-by: zhxchen17 <zhxchen17@fb.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2025-10-28 20:46:03 +08:00
0484b64248 [Bug] Fix shape issue for eplb expert weights (#27589)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2025-10-28 20:44:05 +08:00
f58d9b6404 [Misc] Separate out utils.counter and move utils.Device to engine (#27588)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-28 12:20:46 +00:00
44b5ce956d [Bugfix] In LongRoPE, decide short vs long based on max_model_len (#27431)
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
2025-10-28 12:00:56 +00:00
7a865f2325 [V0 Deprecation] Remove vestigial V0 logits_processors.py file (#27601)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-10-28 19:17:45 +08:00
2fa90bda27 Fix a parsing robustness issue in KimiK2ToolParser that causes IndexError (#27565)
Signed-off-by: wangln19 <wanglinian@dev.wanglinian.msh-dev.svc.cluster.local>
Co-authored-by: wangln19 <wanglinian@dev.wanglinian.msh-dev.svc.cluster.local>
2025-10-28 11:11:50 +00:00
0291fbf65c [CI/Build] Fix amd model executor test (#27612)
Signed-off-by: zhewenli <zhewenli@meta.com>
2025-10-28 08:58:11 +00:00
b46e4a06f1 [Core][Bookkeeping Optimization] Update against numpy view of is_token_ids tensor (#27618)
Signed-off-by: Jialin Ouyang <Jialin.Ouyang@gmail.com>
2025-10-28 08:13:10 +00:00
d34f5fe939 [Bugfix][CPU] Fallback oneDNN linear to torch linear to fix half gemm support on legacy platforms (#27526)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-10-27 23:25:44 -07:00
bdb01a38fe [Hardware][AMD][Model] Triton MoE tuning configs for GLM-4.6 for MI300X (#27323)
Signed-off-by: minatoaquaMK2 <jiacheng.yue@foxmail.com>
2025-10-27 22:58:06 -07:00
5b3c35a68e [ROCm] [Doc] Update ROCm installation docs (#27327)
Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
2025-10-28 13:00:50 +08:00
61fbfe5274 [Bugfix] fixed inconsistent finish_reason handling between V0 and V1 engines (#27555)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-10-28 02:18:08 +00:00
255e34ca50 [Stability fix] turn off HMA allocator when connector is set (#27592)
Signed-off-by: KuntaiDu <kuntai@uchicago.edu>
Signed-off-by: Kuntai Du <kuntai@uchicago.edu>
2025-10-27 18:32:23 -07:00
a8d2e326ec [Bugfix][CI] Fix config resolving logic with remote models (#27610) 2025-10-28 00:48:32 +00:00
53a56e658b [gpt-oss][2/N] Support input_messages in responsesRequest (#26962)
Signed-off-by: Andrew Xia <axia@fb.com>
Co-authored-by: Andrew Xia <axia@fb.com>
2025-10-27 23:15:49 +00:00
69f064062b Code quality improvements: version update, type annotation enhancement, and enum usage simplification (#27581)
Signed-off-by: Bradley <bradley.b.pitt@gmail.com>
2025-10-27 17:50:22 +00:00
921e78f4bb [ROCm] Update AITER branch for ROCm base docker (#27586)
Signed-off-by: Micah Williamson <micah.williamson@amd.com>
2025-10-27 17:22:33 +00:00
b2f24cd6b7 add todo
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-10-27 09:52:17 -07:00
bc955355f8 Merge branch 'main' into wentao-batch-invariance-dp 2025-10-27 09:32:27 -07:00
6ebffafbb6 [Misc] Clean up more utils (#27567)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-27 15:30:38 +00:00
3b96f85c36 [Chore]: Stream tokens vs characters in tool call parser tests (#26513)
Signed-off-by: Ben Browning <bbrownin@redhat.com>
2025-10-27 23:06:25 +08:00
23ad820553 fixing mm placeholder replacement issue with gemma3 (#27538)
Signed-off-by: tingtingtang1992 <streamttt@gmail.com>
2025-10-27 14:34:01 +00:00
5d3be3ba4c [Bugfix][LoRA][FusedMoE] Select MxFP4 Backend based on LoRA Enablement (#27487)
Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
2025-10-27 07:32:50 -07:00
4f882be4a0 [Model] Siglip2 Model Support (#27566)
Signed-off-by: piood <2477084691@qq.com>
2025-10-27 06:57:37 -07:00
9273754222 [Hybrid] Added supports_mamba_prefix_caching Protocol (#27339)
Signed-off-by: asafg <39553475+Josephasafg@users.noreply.github.com>
2025-10-27 13:05:20 +00:00
f4e8154076 [Kernel] Enable moe LoRA kernel support FP16 (#27468)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-10-27 19:48:37 +08:00
a663f6ae64 [cpu][perf] Fix low CPU utilization with VLLM_CPU_OMP_THREADS_BIND on AArch64 (#27415)
Signed-off-by: Fadi Arafeh <fadi.arafeh@arm.com>
2025-10-27 11:14:55 +00:00
a4fc21895e [Bugfix] Fixed when return_token_ids=False, the first event still contains prompt_token_ids. (#27561)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-10-27 11:06:43 +00:00
a3e8611da5 [Bugfix] Limit the default value of max_model_len when it is not specified by users (#27556)
Signed-off-by: shen-shanshan <467638484@qq.com>
2025-10-27 10:16:20 +00:00
7c2bdb83dc [Misc] Clean up utils (#27552)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-27 09:05:40 +00:00
9932ed6a83 [Kernel] Adding split_K implementation for fused_moe_lora (#27291)
Signed-off-by: Danielle Robinson <dmmaddix@amazon.com>
Signed-off-by: Danielle Robinson <dcmaddix@gmail.com>
Co-authored-by: Danielle Robinson <dmmaddix@amazon.com>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
2025-10-27 02:05:24 -07:00
2d631d28c6 [Doc] Slight improvement to M2 and beyond (#27554)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Roger Wang <hey@rogerw.io>
2025-10-27 09:02:10 +00:00
b368382964 [Model] Deprecate merge_by_field_config=False (#27551)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-27 16:43:00 +08:00
a806c14cc7 [Performance][LoRA] add context varying params to 'do_not_specialize' in fused moe lora (#27445)
Signed-off-by: gnovack <gnovack@amazon.com>
2025-10-27 06:31:55 +00:00
181bf5bbde [Docs] remove the incorrect enable_reasoning parameter (#27550)
Signed-off-by: zxw <1020938856@qq.com>
2025-10-26 23:17:19 -07:00
cbd5e07a51 [Model] Use merge_by_field_config for MM models (Qwen series) (#27546)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-27 05:38:05 +00:00
63b22e0dbb [Model][Bugfix] fix ernie45 moe 300B SharedFusedMoE output tuple (#27316)
Signed-off-by: wangyafeng <wangyafeng@baidu.com>
2025-10-26 20:53:31 -07:00
5980604c44 Fix MiniMax-M2 copyright (#27537)
Signed-off-by: xuebi <xuebi@minimaxi.com>
Co-authored-by: xuebi <xuebi@minimaxi.com>
2025-10-27 03:29:51 +00:00
361a7463d3 fix m2 test (#27536)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-10-27 01:04:36 +08:00
720af6ab79 [Model][MiniMax-M2] Support MiniMax-M2 Model (#27535)
Signed-off-by: xuebi <xuebi@minimaxi.com>
Co-authored-by: xuebi <xuebi@minimaxi.com>
2025-10-27 00:59:11 +08:00
55cba4a05c [CI/Build] Update causal-conv1d installation (#27529)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-26 22:14:22 +08:00
c7abff2990 Revert "[CI/Build] Use CPU for mm processing test on CI (#27522)" (#27531)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-26 04:44:27 -07:00
71b1c8b667 [Chore]: Extract math and argparse utilities to separate modules (#27188)
Signed-off-by: Yeshwanth Surya <yeshsurya@gmail.com>
Signed-off-by: Yeshwanth N <yeshsurya@gmail.com>
Signed-off-by: yeshsurya <yeshsurya@gmail.com>
2025-10-26 04:03:32 -07:00
8fb7b2fab9 [Doc] Fix links to GH projects (#27530)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-26 17:55:51 +08:00
be7b55a83d [Doc] Remove Molmo warning (#27527)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-26 16:22:52 +08:00
315b860abe [bugfix] fix empty prompts for async-engine mode in benchmark throughput (#27494)
Signed-off-by: Lucia Fang <fanglu@fb.com>
2025-10-26 08:16:35 +00:00
87c41c26ad [Bugfix] Fix processor initialization for model from modelscope instead of HF (#27461)
Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-10-26 07:44:31 +00:00
65d2cf9511 [BUGFIX][ROCM] ViT FlashAttention on ROCm (no GFX9) and contiguous on qwen3vl ROCm TORCH_SDPA (#27190)
Signed-off-by: JartX <sagformas@epdcenter.es>
Co-authored-by: tjtanaa <tunjian.tan@embeddedllm.com>
2025-10-26 15:08:52 +08:00
d63cd9ff10 [CI/Build] Use CPU for mm processing test on CI (#27522)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-10-26 13:09:18 +08:00
66a168a197 [CI/Build] Refactor processing tests (#27470)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-10-25 16:14:30 +00:00
a99564ac5b [Attention] Add missing kv cache scale setup (#27490)
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
2025-10-25 00:12:49 -07:00
4c5f632165 [Misc] Simplify max tokens in multimodal registry (#27500)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-24 23:56:01 -07:00
b853540388 [Core][Hybrid allocator + kv connector 1/n] Enable hybrid allocator + KV cache connector (#25712)
Signed-off-by: KuntaiDu <kuntai@uchicago.edu>
Signed-off-by: Kuntai Du <kuntai@uchicago.edu>
2025-10-24 23:34:18 -07:00
56ed7609a9 Revert "[Misc] Remove use of CUDA_VISIBLE_DEVICES for device selectio… (#27502) 2025-10-25 05:31:43 +00:00
29c9cb8007 [CI] Add tests for cudagraph (#27391)
Signed-off-by: zjy0516 <riverclouds.zhu@qq.com>
2025-10-25 02:37:33 +00:00
83f478bb19 [KVConnector] Migrate the LMCache integration code to be vLLM native (#25542)
Signed-off-by: ApostaC <yihua98@uchicago.edu>
2025-10-25 00:23:53 +00:00
269c4db0a4 [Misc][DP] Guard mxfp4 implementation selection (#27484)
Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
2025-10-24 23:29:24 +00:00
52efc34ebf [Log] Optimize Startup Log (#26740)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-10-24 19:27:04 -04:00
d95d0f4b98 [Distributed] Basic set of configuration for large EP deployment on GB200 (#27328)
Signed-off-by: Pengchao Wang <wpc@fb.com>
Co-authored-by: Lu Fang <30275821+houseroad@users.noreply.github.com>
2025-10-24 14:16:44 -07:00
0402428200 [Perf][Async Scheduling] Remove CPU->GPU sync in dummy_run (#27455)
Signed-off-by: Lehua Ding <lehuading@tencent.com>
2025-10-24 20:45:36 +00:00
17af6aa0da [Document] Add ms-swift library to rlhf.md (#27469) 2025-10-24 20:31:50 +00:00
fc168c33f3 [CI/Build] Fix test_torch_utils in AMD CI (#27317)
Signed-off-by: zhewenli <zhewenli@meta.com>
2025-10-24 12:26:00 -07:00
acc78aeb88 [Bugfix] Fix interns1-vit qk norm code path (#27480)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-10-24 17:43:45 +00:00
0f67d4d962 [Attention] Add MLA prefill backend: trtllm_ragged_attention_deepseek (#26397)
Signed-off-by: Ming Yang <minos.future@gmail.com>
2025-10-24 10:24:08 -07:00
7e1d697b56 [Bugfix] Fix MultiConnector stats reconstruction across process boundaries (#27366)
Signed-off-by: Kourosh Hakhamaneshi <Kourosh@anyscale.com>
Co-authored-by: Nicolò Lucchesi <nlucches@redhat.com>
2025-10-24 17:08:05 +00:00
699d62e6cf [NIXL][BUGFIX] delay done_recving queue cleanup to bottom of get_finished (#27297)
Signed-off-by: Chendi Xue <chendi.xue@intel.com>
2025-10-24 17:01:41 +00:00
cd390b609d [compile] Turn standalone_compile back on (#27460)
Signed-off-by: Richard Zou <zou3519@gmail.com>
2025-10-24 16:30:27 +00:00
2080b05099 [cpu][fix] Fix onednn_mm crash on consecutive matmuls with same M,K,N and different dtype (#27472)
Signed-off-by: Fadi Arafeh <fadi.arafeh@arm.com>
2025-10-24 15:57:48 +00:00
6454afec90 [Doc] Fix minor issues in docs/design/metrics.md (#27436)
Signed-off-by: Lifan Shen <lifans@meta.com>
2025-10-24 05:40:54 -07:00
41a62564a7 Fix test named tool use (#27458)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-10-24 20:27:45 +08:00
284cc92275 [MISC] cudagraph_capture_sizes related improvements (#26016)
Signed-off-by: fhl <2410591650@qq.com>
Signed-off-by: fhl2000 <63384265+fhl2000@users.noreply.github.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-24 05:11:05 -07:00
435be10db9 Fix AArch64 CPU Docker pipeline (#27331)
Signed-off-by: Ioana Ghiban <ioana.ghiban@arm.com>
2025-10-24 05:11:01 -07:00
b7030d962b [Benchmark] Enable benchmark to run with encoding_format="bytes" (#27467)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-24 11:16:50 +00:00
3567816932 [Refactor] move tool parsing logic from protocol.py to the tool parser (#27383)
Co-authored-by: Aaron Pham <contact@aarnphm.xyz>
2025-10-24 09:53:23 +00:00
e0ef8a2920 [BugFix] Fix torchrun DP with LLM class (#27395)
Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
2025-10-24 08:11:37 +00:00
42efe609ba [MM][Bugfix] Replace PatchEmbed's conv3d to linear layer (#27418)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: Roger Wang <hey@rogerw.io>
2025-10-24 07:32:47 +00:00
88d3141ec6 [Docs] remove v1 column for embedding models (#27446)
Signed-off-by: piood <2477084691@qq.com>
Signed-off-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-10-23 23:55:03 -07:00
09a6a49eaf [Misc] Avoid "PyTorch non-writable tensors" warning in RayPPCommunicator (#27443)
Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
2025-10-24 14:53:09 +08:00
074475541a [Bugfix] Fix Pydantic union resolution for ResponseFunctionToolCall in Responses API (#26706)
Signed-off-by: Shai Trinczer <strinczer@icloud.com>
Co-authored-by: Chauncey <chaunceyjiang@gmail.com>
Co-authored-by: Ye (Charlotte) Qi <yeq@meta.com>
2025-10-23 22:53:42 -07:00
d4c574c39f [Chore] remove structural tags logging lines (#27451) 2025-10-24 05:35:45 +00:00
c528b9006a Fix EventPublisherFactory logic for disabled KV cache events (#27419)
Signed-off-by: Bradley <bradley.b.pitt@gmail.com>
2025-10-24 05:00:01 +00:00
85fee74b33 [Bugfix][CI] Move resolving cudagraph_mode before initializing attn_metadata_builder (#27427)
Signed-off-by: fhl2000 <63384265+fhl2000@users.noreply.github.com>
2025-10-23 20:31:14 -07:00
8dbe0c527f [Misc] Add TPU usage report when using tpu_inference. (#27423)
Signed-off-by: Hongmin Fan <fanhongmin@google.com>
2025-10-23 20:29:37 -07:00
5cc6bddb6e [Kernel] Add GPTQv2 format support for low-bit or asymmetric quantization, by adapting gptq_gemm (#26092) 2025-10-23 23:26:13 -04:00
1f9460c4c1 Fix pooling adapters for Transformers backend (#27338)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-23 20:23:55 -07:00
70022ffc00 Granite 4.0 quark quantization support (#26944)
Signed-off-by: Xiao YU <Xiao.YU@xilinx.com>
Signed-off-by: Xiao Yu <xiao.yu.dc@outlook.com>
Co-authored-by: Xiao YU <Xiao.YU@xilinx.com>
2025-10-24 02:14:03 +00:00
f417746ad7 [Hardware][POWERPC] Disable oneDNN path in vllm/model_executor/layers/utils.py for Powerpc (#27422)
Signed-off-by: Akash Kaothalkar <akash.kaothalkar@ibm.com>
Co-authored-by: Akash Kaothalkar <akash.kaothalkar@ibm.com>
2025-10-23 21:21:36 +00:00
0552cfb195 [Model] Siglip Embedding Support (#27324)
Signed-off-by: piood <2477084691@qq.com>
2025-10-23 20:19:48 +00:00
51dd14ac2b [Bugfix][DP] Fix creating too many DP Placement Groups (#26880)
Signed-off-by: Kebe <mail@kebe7jun.com>
Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
Co-authored-by: Rui Qiao <ruisearch42@gmail.com>
2025-10-23 20:16:51 +00:00
dbfbf9f324 [Attention] Fix FlashMLA metadata builder arguments for q_len > 1 (#27368)
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
2025-10-23 15:58:15 -04:00
ca76486a16 [Chore] Separate out vllm.utils.platform_utils.py (#27374)
Signed-off-by: Jonathan <chenleejonathan@gmail.com>
2025-10-23 19:08:06 +00:00
a9f55dc588 [Misc] Add triton_kernels dependency (#27370)
Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
2025-10-23 12:04:14 -07:00
81d5bb765a [Bugfix] Fix AWQ marlin layer skipping (#27416)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-10-23 18:30:28 +00:00
0825197bee [Bugfix][ROCm][DeepSeek] Fix for forward_hip in rope for DeepSeek (#27373)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2025-10-23 17:43:53 +00:00
9ef3d5b875 [Bugfix] Fix dp_chunking enablement logic in FusedMoE layer (#27220)
Signed-off-by: Alexander Matveev <amatveev@redhat.com>
2025-10-24 00:03:14 +08:00
295c7f0267 Mirroring the test definitions (2025-10-22) (#27362)
Signed-off-by: Alexei V. Ivanov <alexei.ivanov@amd.com>
2025-10-24 00:02:26 +08:00
cf31e136ff flashmla + dp support
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-10-23 08:12:49 -07:00
3fa2c12185 [Frontend][4/N] Improve all pooling task | Add plugin pooling task (#26973)
Signed-off-by: wang.yuqi <noooop@126.com>
Signed-off-by: Christian Pinto <christian.pinto@ibm.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Christian Pinto <christian.pinto@ibm.com>
2025-10-23 14:46:18 +00:00
fe2016de2d [CI/Build] Remove unnecessary flags from test registry (#27353)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-23 14:42:40 +00:00
237cf6d32a [Misc] Remove use of CUDA_VISIBLE_DEVICES for device selection (fix DP slow startup time &c) (#26709)
Signed-off-by: ilmarkov <markovilya197@gmail.com>
Co-authored-by: Tyler Michael Smith <tlrmchlsmth@gmail.com>
2025-10-23 20:58:39 +08:00
faee3ccdc2 [Feature] Pydantic validation for speculative.py (#27156)
Signed-off-by: Navya Srivastava <navya.srivastava1707@gmail.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-23 12:19:33 +00:00
570c3e1cd4 [Bugfix] Honor --mm_encoder_attn_backend when used (#27124)
Co-authored-by: Bradley D <4551889+bradleyhd@users.noreply.github.com>
Co-authored-by: Roger Wang <hey@rogerw.io>
2025-10-23 20:09:52 +08:00
3a4255c7c4 Run mypy on the lowest supported Python version instead of system Python (#27048)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-23 05:07:44 -07:00
61089465a6 [Model] Add MoE support for NemotronH (#25863)
Signed-off-by: Tomer Asida <57313761+tomeras91@users.noreply.github.com>
2025-10-23 10:27:23 +00:00
88afa11010 [Metrics] [KVConnector] Add connector prefix cache hit rate stats (#26245)
Signed-off-by: tovam <tovam@pliops.com>
2025-10-23 12:21:08 +02:00
d00ce29d89 [CI] Reorganize entrypoints tests (#27403)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-10-23 10:10:06 +00:00
3b7bdf983b add SLA information into comparison graph for vLLM Benchmark Suite (#25525)
Signed-off-by: Tsai, Louie <louie.tsai@intel.com>
Signed-off-by: louie-tsai <louie.tsai@intel.com>
Signed-off-by: Louie Tsai <louie.tsai@intel.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-10-23 08:04:59 +00:00
50b788a17a [CI/Build] Fix AMD CI: test_cpu_gpu.py (#27388)
Signed-off-by: zhewenli <zhewenli@meta.com>
2025-10-23 07:55:00 +00:00
fc059c7061 [Bugfix] Fix args settings for guided decoding args (#27375)
Signed-off-by: Lucia Fang <fanglu@fb.com>
2025-10-23 07:34:06 +00:00
bfb240cc49 [CI/Build] Fix Prithvi plugin test (#27393)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-23 07:30:44 +00:00
e255d92990 [Chore] Remove duplicate has_ functions in vllm.utils (#27372)
Signed-off-by: Jonathan <chenleejonathan@gmail.com>
2025-10-23 06:11:59 +00:00
3729ed00ba [Model] Add num_cached_tokens for PoolingRequestOutput (#27378)
Signed-off-by: wang.yuqi <noooop@126.com>
2025-10-23 14:03:42 +08:00
6644796bf4 [V1][spec decode] return logprobs for spec decoding (#26060)
Signed-off-by: Giancarlo Delfin <gdelfin@meta.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
2025-10-22 22:59:59 -07:00
ff93cc8c84 [CORE] Support Prefix Caching with Prompt Embeds (#27219)
Signed-off-by: Andrew Sansom <andrew@protopia.ai>
2025-10-22 22:18:07 -07:00
243ed7d32e [Bugfix][Core] running queue index leakage exception (#26754)
Signed-off-by: CLFutureX <chenyongqyl@163.com>
2025-10-22 21:40:12 -07:00
7e0941055f [Bugfix] Fix incorrect kv cache metrics in grafana.json (#27133)
Signed-off-by: Fangping Shi <fangping_shi@apple.com>
Co-authored-by: Fangping Shi <fangping_shi@apple.com>
2025-10-22 20:58:36 -07:00
6738e4a093 [Bugfix] Fix SLA tuner initialization (#27355)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-22 20:43:04 -07:00
2566dca2a9 [Bugfix] Fix deepseek-ocr multi-image inference and add merge_by_field_config=True with tensor schema support (#27361)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-10-22 17:15:38 -07:00
b4fda58a2d [MLA] Bump FlashMLA (#27354)
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
2025-10-22 15:48:37 -07:00
c2168c50cc batch invariance dp
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-10-22 13:40:16 -07:00
a0003b56b0 [Chore] Separate out system utilities from vllm.utils (#27201)
Signed-off-by: dongbo910220 <1275604947@qq.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2025-10-22 20:25:25 +00:00
5beacce2ea [BugFix] bugfix for Flash Attention MLA with full cuda graph IMA following pr-25490 (#27128)
Signed-off-by: qqma <qqma@amazon.com>
Co-authored-by: qqma <qqma@amazon.com>
2025-10-22 19:36:39 +00:00
8669c69afa [Feature] publisher default set zmq in kv_event config (#26915)
Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2025-10-22 19:19:33 +00:00
1651003c35 [Prefix Cache] Use LoRA name for consistent KV-cache block hashing (#27211)
Signed-off-by: Sage Ahrac <sagiahrak@gmail.com>
2025-10-22 18:13:03 +00:00
1cb8c6c5fe [Doc] Fix numbering sequence in prefix caching (#27357)
Signed-off-by: William Song <jinwook@umich.edu>
2025-10-22 17:35:47 +00:00
e05a6754a8 [Model] Revert PR #26715: Restore custom PaliGemma and Gemma3-MM impl… (#27309)
Signed-off-by: Luciano Martins <lucianommartins@users.noreply.github.com>
Co-authored-by: Luciano Martins <lucianommartins@users.noreply.github.com>
2025-10-22 10:05:34 -07:00
084a9dae80 [Bugfix] Disable FlexAttention direct block mask building for encoder-only models (#27344)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-10-22 16:39:08 +00:00
c9461e05a4 Support Anthropic API /v1/messages Endpoint (#22627)
Signed-off-by: liuli <ll407707@alibaba-inc.com>
Co-authored-by: liuli <ll407707@alibaba-inc.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2025-10-22 09:13:18 -07:00
4dfdb821c8 [P/D] Dynamic kv_output_aggregator collect size (#26734)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-10-22 18:07:58 +02:00
58fab50d82 [Frontend] Require flag for loading text and image embeds (#27204)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-22 15:52:02 +00:00
db6f28d898 [Bugfix] Fix HF format InternVL large variants video processing (#27330)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-10-22 08:39:23 -07:00
14e2f1231e [Bugfix] Make get_mrope_input_positions instance methods (#27342)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-22 08:38:34 -07:00
7c4767f1eb [NIXL] use Host buffer to support TP_ratio > 1 for XPU (#27140)
Signed-off-by: Chendi Xue <chendi.xue@intel.com>
Signed-off-by: Chendi.Xue <chendi.xue@intel.com>
Co-authored-by: Nicolò Lucchesi <nicolo.lucchesi@gmail.com>
2025-10-22 15:28:13 +00:00
9771e0b432 [Bugfix] Add missing 'is_internal_router' attribute to FusedMoEWithLoRA (#27351)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-10-22 08:19:12 -07:00
980de31ca0 [bugfix] remove unused parameters to reduce unnecessary vram usage (#26789)
Signed-off-by: Reinforce-II <fate@eastal.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
2025-10-22 08:16:09 -07:00
1c160841ea [Bug] Fix DeepSeek-V2.5-1210-FP8 issue (#27267)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-10-22 11:00:10 -04:00
4ca13a8667 [NIXL] Terminate handshake listener thread in shutdown (#26404)
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
2025-10-22 16:59:53 +02:00
675aa2ec64 [Model] Upstream Deepseek-OCR model (#27247)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Signed-off-by: Roger Wang <hey@rogerw.io>
Co-authored-by: Roger Wang <hey@rogerw.io>
2025-10-22 07:59:15 -07:00
3ae082c373 [Chore] Separate out optional dependency checks from vllm.utils (#27207)
Signed-off-by: dongbo910220 <1275604947@qq.com>
Signed-off-by: dongbo910220 <32610838+dongbo910220@users.noreply.github.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2025-10-22 10:44:21 -04:00
49c00fe304 Mirroring changes in test-pipeline.yaml into test-amd.yaml (#27242)
Signed-off-by: Alexei V. Ivanov <alexei.ivanov@amd.com>
2025-10-22 09:59:45 -04:00
141d3b9fc5 [docs] Update v1 metrics design doc (#27332)
Signed-off-by: Simon Mo <simon.mo@hey.com>
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
Signed-off-by: atalhens <sneh.lata@nutanix.com>
Co-authored-by: Simon Mo <simon.mo@hey.com>
Co-authored-by: atalhens <sneh.lata@nutanix.com>
2025-10-22 06:29:15 -07:00
abf3db40ef [Core] Handle MoE LoRA edge cases (#27335)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-10-22 13:14:33 +00:00
8e4ca4d14e Bugfix - pass 'max_num_tokens_padded' into 'moe_lora_align_block_size' (#27311)
Signed-off-by: gnovack <gnovack@amazon.com>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
2025-10-22 12:23:57 +00:00
1a0f4defb7 [Log] Add Warning for LLM(data_parallel_size=k) single-process DP Usage (#27282)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-10-22 12:12:21 +00:00
843af7f7fc [Bugfix][CPU] Disable dual stream execution for experts on CPU (#27320)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
2025-10-22 11:02:27 +00:00
1f633b8632 [Frontend][3/N] Improve all pooling task | Support binary embedding response (#27066)
Signed-off-by: wang.yuqi <noooop@126.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-10-22 18:38:57 +08:00
a4c29e6e82 fixed reasoning streaming with tool_choice="required" (#24108)
Signed-off-by: CNE Pierre FICHEPOIL <pierre-1.fichepoil@gendarmerie.interieur.gouv.fr>
Signed-off-by: ExtReMLapin <3909752+ExtReMLapin@users.noreply.github.com>
Co-authored-by: CNE Pierre FICHEPOIL <pierre-1.fichepoil@gendarmerie.interieur.gouv.fr>
Co-authored-by: Chauncey <chaunceyjiang@gmail.com>
2025-10-22 09:42:55 +00:00
8f18feb191 Remove last level references not removed in #26355 (#27260)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-22 09:18:17 +00:00
ed540d6d4c Update release pipeline for PyTorch 2.9.0 (#27303)
Signed-off-by: Huy Do <huydhn@gmail.com>
2025-10-22 09:18:01 +00:00
f6027b2855 [1/N][Platform] Cleanup useless function (#26982)
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-10-22 09:04:57 +00:00
ab3e80042e [torch.compile] Enable silu_mul_fp8_quant fusion without custom ops enabled (#27146)
Signed-off-by: zjy0516 <riverclouds.zhu@qq.com>
2025-10-22 00:22:39 -04:00
ceacedc1f9 [Benchmark] Add plot utility for parameter sweep (#27168)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-21 20:30:03 -07:00
bfa59be8f1 [CI] Nixl integration tests DP-EP (#27199)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-10-22 11:17:48 +08:00
265ecb05fb [DOC] [ROCm] Add ROCm quickstart guide (#26505)
Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
2025-10-22 03:10:48 +00:00
09a7e6f617 [Deepseek v3.2] Remove extra logics in indexer (#26465)
Signed-off-by: Siyuan Fu <siyuanf@nvidia.com>
Signed-off-by: Daniel Campora <961215+dcampora@users.noreply.github.com>
Signed-off-by: Lain <siyuanf@nvidia.com>
Co-authored-by: Daniel Campora <961215+dcampora@users.noreply.github.com>
2025-10-21 23:34:03 +00:00
6c2eef5a5d [P/D] KVConnector for decode benchmarking (#25986)
Signed-off-by: Tyler Michael Smith <tlrmchlsmth@gmail.com>
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
2025-10-21 16:30:47 -07:00
19748806f0 [Bugfix] skip cuda graph for drafter when running with eager (#26821)
Signed-off-by: Benjamin Chislett <bchislett@nvidia.com>
2025-10-21 15:39:09 -07:00
4a8a567e16 Updated xgrammar backend to not deny supported string formats (#27253)
Signed-off-by: CNE Pierre FICHEPOIL <pierre-1.fichepoil@gendarmerie.interieur.gouv.fr>
Signed-off-by: ExtReMLapin <3909752+ExtReMLapin@users.noreply.github.com>
Co-authored-by: CNE Pierre FICHEPOIL <pierre-1.fichepoil@gendarmerie.interieur.gouv.fr>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-10-21 22:25:23 +00:00
344a0017c0 [Performance] Dual stream execution of "shared_experts" and "selected_experts" inside FusedMoE (#26440)
Signed-off-by: Alexander Matveev <amatveev@redhat.com>
2025-10-21 21:38:29 +00:00
becb7de40b Update PyTorch to 2.9.0+cu129 (#24994)
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
2025-10-21 17:20:18 -04:00
250fb1b8ea [Bugfix] fixes the decoding metadata of dense mla's fp8 kvcache. (#27144)
Signed-off-by: Tao He <linzhu.ht@alibaba-inc.com>
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
Co-authored-by: Lucas Wilkinson <lwilkins@redhat.com>
2025-10-21 18:27:03 +00:00
647214f3d5 [V0 Deprecation] Remove V0 executors (#27142)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-10-21 11:09:37 -07:00
ddeec11ba9 [Bugfix][P/D] Reduce num_threads used by nixl ucx backend (#27196)
Signed-off-by: David Whyte-Gray <40244437+dagrayvid@users.noreply.github.com>
2025-10-21 13:41:52 -04:00
86ed77022d [Feature] Batch Invariant for R1 TP 8 on Blackwell (#27229)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-10-21 10:25:55 -07:00
aa1356ec53 [ROCm] Update Triton, Torch, and AITER branches for ROCm base Dockerfile (#27206)
Signed-off-by: Micah Williamson <micah.williamson@amd.com>
2025-10-21 12:01:23 -04:00
ecc3c0940a Add @pavanimajety to .github/codeowners for Flashinfer, ModelOpt related code (#27213)
Signed-off-by: Pavani Majety <pmajety@nvidia.com>
2025-10-21 22:59:53 +08:00
ba09652de2 [ROCM] Enable CompressedTensorsWNA16 (#27187)
Signed-off-by: JartX <sagformas@epdcenter.es>
2025-10-21 10:43:23 -04:00
bd66b8529b [CI] Install pre-release version of apache-tvm-ffi for flashinfer (#27262)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-21 14:23:56 +00:00
6c728f7771 [Chore] Separate out NCCL utilities from vllm.utils (#27197)
Signed-off-by: dongbo910220 <1275604947@qq.com>
2025-10-21 06:18:23 -07:00
80e9452984 [Deepseek v3.2] Optimize top_k_per_row (#26763)
Signed-off-by: Daniel Campora <961215+dcampora@users.noreply.github.com>
2025-10-21 08:30:07 +00:00
c3a2c6ac5f [MM][Core] Decouple ViT backend from LM backend (#27061)
Signed-off-by: Roger Wang <hey@rogerw.io>
2025-10-21 00:30:10 -07:00
72f431e709 [Nixl] Minor refactor to handshake related metadata (#26410)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-10-21 09:07:47 +02:00
be4445072c [Fix][Spec Decode] Fix llama4 draft loading with different quantization (#27136)
Signed-off-by: linzebing <linzebing1995@gmail.com>
2025-10-20 23:19:00 -07:00
f381cf2302 [Bugfix] Fix broken MTP weight loading for FP8 KV Scales (#27227)
Signed-off-by: Benjamin Chislett <bchislett@nvidia.com>
2025-10-20 22:51:44 -07:00
5ff5d94e77 [Bugfix] Fix gpt-oss w4a8 DP/EP on B200 (#26729)
Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2025-10-21 01:51:14 -04:00
f95da13c3d [ModelOpt] Load w13/w2_input_scale for all experts, nvfp4 (#26135)
Signed-off-by: Shu Wang <shuw@nvidia.com>
Signed-off-by: Shu Wang. <shuw@nvidia.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2025-10-21 01:50:31 -04:00
aef368aa08 [BugFix] GPT-OSS Attention DP + MoE TP weight loading issue (#24032)
Signed-off-by: Po-Han Huang <pohanh@nvidia.com>
2025-10-21 04:03:47 +00:00
5f6cbf60d6 [Feature][Kernel]FusedMoE LoRA (#21229)
Signed-off-by: wuchen <cntryroa@gmail.com>
Signed-off-by: banjuede <lmklhc@163.com>
Signed-off-by: Chen Wu <cntryroa@gmail.com>
Signed-off-by: Danielle Robinson <dmmaddix@amazon.com>
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Signed-off-by: bk-201 <joy25810@foxmail.com>
Co-authored-by: wuchen <wuchen@zetyun.com>
Co-authored-by: Nathan Van Gheem <vangheem@gmail.com>
Co-authored-by: banjuede <lmklhc@163.com>
Co-authored-by: Danielle Robinson <dmmaddix@amazon.com>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: bk-201 <joy25810@foxmail.com>
2025-10-21 03:01:37 +00:00
3ada34f9cb [Frontend] Enforce tokenize=False when applying chat template (#27205)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-10-21 02:57:34 +00:00
0eb8f2b880 create is_in_the_same_node on cpu (#26832)
Co-authored-by: Lunwen He <lunwenh@meta.com>
2025-10-21 02:04:14 +00:00
163965d183 [cpu] Dispatch un-quantized linear to oneDNN/ACL by default for AArch64 (#27183)
Signed-off-by: Fadi Arafeh <fadi.arafeh@arm.com>
Co-authored-by: Michael Yang <Michael.Yang@arm.com>
2025-10-21 02:02:58 +00:00
a03cf9bc70 [V0 Deprecation] Remove V0 metrics code (#27215)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-10-21 02:02:10 +00:00
352c0c8a28 [Quantization] Automatically infer AWQ modules_to_not_convert field (#26909)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-10-21 01:49:28 +00:00
bfe0b4bd2a [ez] add uv lock to gitignore (#27212)
Signed-off-by: Andrew Xia <axia@fb.com>
Co-authored-by: Andrew Xia <axia@fb.com>
2025-10-21 00:37:44 +00:00
58fbbcb2f5 [ROCm] enable some tests in entrypoints test groups on AMD (#26725)
Signed-off-by: Yida <yida.wu@amd.com>
2025-10-21 00:37:16 +00:00
87778d5f00 [Feature][Quantization] auto_round support for mixed bits quantization (#23812)
Signed-off-by: n1ck-guo <heng.guo@intel.com>
Signed-off-by: Heng Guo <heng.guo@intel.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-10-20 22:23:30 +00:00
f9e7ad5400 [Bugfix][CI] Fix Distributed Tests (4 GPUs) async_sched+ray test (#27195)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-10-20 16:34:54 +00:00
4d0f266113 [Kernel][Model] Tune fused_moe Triton configs for Qwen3-30B A3/A3B on H100 (FP8/BF16) (#26268)
Signed-off-by: Shivam <shivampr.dev@gmail.com>
2025-10-20 07:48:01 -07:00
e93ff6c8b9 Nemotron Nano V2 VL + EVS Video Support (#27107)
Signed-off-by: Eugene Khvedchenia <ekhvedchenia@nvidia.com>
Signed-off-by: Natan Bagrov <nbagrov@nvidia.com>
Signed-off-by: Roger Wang <hey@rogerw.io>
Co-authored-by: Natan Bagrov <nbagrov@nvidia.com>
Co-authored-by: Roger Wang <hey@rogerw.io>
2025-10-20 22:19:11 +08:00
1c691f4a71 AArch64 CPU Docker pipeline (#26931) 2025-10-20 07:09:40 -04:00
9fce7bee74 [Kernel] Accelerate solve_tril with TMA (#26746)
Signed-off-by: zjy0516 <riverclouds.zhu@qq.com>
2025-10-20 05:39:02 +00:00
b63f2143f8 [LoRA] LoRA cuda graph specialization (#25914)
Signed-off-by: Andy Lo <andy@mistral.ai>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
2025-10-20 04:21:09 +00:00
f32bf7582e [Model][VLM] Support Bee-8B Model (#27012)
Signed-off-by: uyzhang <yi.zhang.4096@gmail.com>
Signed-off-by: Yi Zhang <zhangyi970819@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Roger Wang <hey@rogerw.io>
2025-10-20 02:31:26 +00:00
8a81d776ce Fix typo in ValueError message: use kv_role instead of kv_disagg_role (#27166)
Signed-off-by: Yongtao Huang <yongtaoh2022@gmail.com>
2025-10-19 19:47:19 +00:00
f6fdacd82c [Bugfix] Fix error with penalties when speculative decoding and structural output are enabled (#26586)
Signed-off-by: southfreebird <yvorott@gmail.com>
2025-10-19 19:24:46 +00:00
d31f7844f8 [Misc] Move utils to avoid conflicts with stdlib, and move tests (#27169)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-19 05:20:55 -07:00
7a6c8c3fa1 [Chore] Separate out vllm.utils.network_utils (#27164)
Signed-off-by: iAmir97 <Amir.balwel@embeddedllm.com>
Co-authored-by: iAmir97 <Amir.balwel@embeddedllm.com>
2025-10-19 03:06:32 -07:00
221bf72577 output type conversion fix (#27159) 2025-10-19 08:10:07 +00:00
b3aba04e5a [Benchmark] Convenience script for multiple parameter combinations (#27085)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-18 23:57:01 -07:00
8a297115e2 [Chore] Separate out hashing utilities from vllm.utils (#27151)
Signed-off-by: dongbo910220 <1275604947@qq.com>
2025-10-19 11:09:38 +08:00
191eed0bb9 [BugFix] Fix lazy imports involving outlines_core (#27158)
Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
2025-10-19 02:35:32 +00:00
fb860670da [Minor] Remove unused env variable (#27161) 2025-10-18 18:48:35 -07:00
83e760c57d [V1][Metrics][Plugin] Add plugin support for custom StatLoggerBase implementations (#22456)
Signed-off-by: tovam <tovam@pliops.com>
2025-10-18 15:12:46 -07:00
c2bba69065 [BugFix] Disable fp8 kv-cache by default for DeepSeek V3.2 (#27121)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
Signed-off-by: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-10-18 22:05:23 +00:00
e133d6d218 [BugFix] fix graph partition signature (#27139)
Signed-off-by: Boyuan Feng <boyuan@meta.com>
2025-10-18 17:34:36 -04:00
a1946c9f61 [Chore] Separate out profiling utilities from vllm.utils (#27150)
Signed-off-by: dongbo910220 <1275604947@qq.com>
2025-10-18 19:12:01 +00:00
9f020f4f31 [BugFix] Fix failing gemma-3-1b-it test: test_lm_eval_accuracy_v1_engine[google/gemma-3-1b-it] (#27111)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
2025-10-18 12:44:39 -06:00
3b45075206 [Minor] Add some clarifying comments to recent changes (#27130)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-10-18 09:52:45 -07:00
168e578efc Fix incorrect string formatting in barrier timeout exceptions (#27149)
Signed-off-by: Yongtao Huang <yongtaoh2022@gmail.com>
2025-10-18 09:51:57 -07:00
6ac5e06f7c [Chore] Clean up pytorch helper functions in vllm.utils (#26908)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Signed-off-by: isotr0py <2037008807@qq.com>
2025-10-18 09:48:22 -07:00
5c2acb270a [Models][QwenVL] Remove unnecessary .contiguous() calls (#27106)
Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>
2025-10-18 07:05:05 -07:00
b26b70bec4 [Misc] Refactor get_kv_cache_spec into AttentionLayerBase (#26587)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-10-18 13:51:21 +00:00
ab4be40fc5 [fix][cpu] fix prefill attention in CPU attention backend (#27035)
Signed-off-by: Fadi Arafeh <fadi.arafeh@arm.com>
2025-10-18 13:30:21 +00:00
245e4f2c01 [Feature] Batch Invariant: Support DeepGEMM and Blackwell (#27127)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-10-18 09:28:05 -04:00
1d165d6d85 [Chore] Separate out vllm.utils.mem_utils (#27143)
Signed-off-by: iAmir97 <Amir.balwel@embeddedllm.com>
Signed-off-by: iAmir97 <71513472+iAmir97@users.noreply.github.com>
Co-authored-by: iAmir97 <Amir.balwel@embeddedllm.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-10-18 10:06:59 +00:00
83004020fd [Test] Add test for /health endpoint on engine failure (#26074)
Signed-off-by: dongbo910220 <1275604947@qq.com>
2025-10-18 09:59:05 +00:00
12e21701e7 [DOC][FEATURES][CPU] update cpu feature for v1 (#27135)
Signed-off-by: Chendi Xue <chendi.xue@intel.com>
2025-10-18 01:10:45 -07:00
30a33b92ee [Misc] Rev DeepEP (#27122)
Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
2025-10-18 14:54:29 +08:00
7c572544e4 [GPT-OSS] Structure_Tag support for gpt-oss tool-call in cot (#25515)
Signed-off-by: Hanchenli <lihanc2002@gmail.com>
Signed-off-by: Hanchenli <61769611+Hanchenli@users.noreply.github.com>
Signed-off-by: Wei Wei <wwei6@meta.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Wei Wei <wwei6@meta.com>
Co-authored-by: Wei Wei <weiweinpu@gmail.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2025-10-17 21:55:54 -07:00
c312320764 [CI/Build] tests(v1): feed Triton attention the (num_blocks, 2, …) KV cache layout in backend-correctness tests (#26663)
Signed-off-by: Huamin Li <3ericli@gmail.com>
Co-authored-by: Ye (Charlotte) Qi <yeq@meta.com>
2025-10-17 21:11:26 -07:00
c981f0ea78 [Perf] Add H100 fused MoE config (#25398)
Signed-off-by: zitian.zhao <zitian.zhao@tencentmusic.com>
2025-10-18 02:21:27 +00:00
6367bde739 [BugFix][Core] Fix error when enable async-scheduling in multi-node env (#25887)
Signed-off-by: Lehua Ding <lehuading@tencent.com>
Signed-off-by: Lehua Ding <lehuading@qq.com>
Co-authored-by: Benjamin Chislett <chislett.ben@gmail.com>
2025-10-17 22:16:18 +00:00
f50cc221ea [Test] Make test_failure more stable for batch invariance (#27054) 2025-10-17 16:59:08 -04:00
acedc74b1a [V1][Spec Decode] Fix greedy temperature detection after sampler refactor (#27077)
Signed-off-by: Pradyun Ramadorai <pradyunr@amazon.com>
Co-authored-by: Pradyun Ramadorai <pradyunr@amazon.com>
2025-10-17 13:27:47 -07:00
d29483b58a [Minor] Remove unnecessary error message (#27115)
Signed-off-by: Zhuohan Li <zhuohan123@gmail.com>
2025-10-17 20:02:12 +00:00
950cf9e58e [Bugfix] Use PIECEWISE cudagraphs on Blackwell if max_model_len > 131072 (#27114)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-10-17 19:47:18 +00:00
3125d79950 [Chore] Remove unused PolyNorm layer (#27110)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-10-17 19:03:43 +00:00
e33ee23ee3 [Bugfix] [AITER] [ROCm] Fix Quark MoE Quant Config and AITER Fused MoE quant type logic (#27029)
Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
2025-10-17 12:51:10 -06:00
b10c64c834 [ROCm][Bugfix][Model] Fix illegal memory access when running qwen3_moe models with rms_norm (Qwen3-235B-A22B, Qwen3-30B-A3B, etc.) (#26192)
Signed-off-by: Randall Smith <ransmith@amd.com>
Signed-off-by: Randall Smith <Randall.Smith@amd.com>
Signed-off-by: rasmith <Randall.Smith@amd.com>
Co-authored-by: Randall Smith <ransmith@amd.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
2025-10-17 14:17:18 -04:00
0925b28a8e [ROCM] MoE fp4 CK kernel (#26545)
Signed-off-by: Aleksandr Malyshev <maleksan@amd.com>
Co-authored-by: Aleksandr Malyshev <maleksan@amd.com>
2025-10-17 14:06:33 -04:00
99722d5f0e [CI] Remove forbidden slash (#27112)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-10-17 09:38:00 -07:00
4c91a28e30 [bugfix] Qwen3-VL: fix incorrect video timestamp calculations when do_sample_frames=True (#27104)
Co-authored-by: 松灵 <wpf272043@alibaba-inc.com>
2025-10-17 16:26:33 +00:00
b038d9c40c [Data-parallel] Allow DP>1 for world_size > num_gpus on node (8) (#26367)
Signed-off-by: Patrick von Platen <patrick.v.platen@gmail.com>
Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Rui Qiao <ruisearch42@gmail.com>
2025-10-17 08:24:42 -07:00
2ba60ec7fe [CI] Nixl integration tests (#27010)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-10-17 07:13:31 -07:00
bd7157a071 [torch.compile] Enable attention and allreduce fusion without custom ops enabled (#24604)
Signed-off-by: Luka Govedič <lgovedic@redhat.com>
Signed-off-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
2025-10-17 08:10:23 -06:00
be429d0cfd Fix incorrect docstring for stop_profile() method (#27101)
Signed-off-by: Yongtao Huang <yongtaoh2022@gmail.com>
2025-10-17 06:30:23 -07:00
c253745eb8 [Hardware][AMD][Model] Triton MoE tuning configs for GLM-4.5 for MI350 and MI355 (#25586)
Signed-off-by: Reima Karhila <reima.karhila@amd.com>
Signed-off-by: xaguilar <Xavier.AguilarFruto@amd.com>
Co-authored-by: xaguilar <Xavier.AguilarFruto@amd.com>
2025-10-17 04:56:12 -07:00
daec4d2624 [Model]Improve Qwen3VLMoeForConditionalGeneration packed_modules_mapping (#27096)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-10-17 04:47:00 -07:00
6c9fdbf725 [Docs] Replace rst style double-backtick with md single-backtick (#27091)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-17 02:47:34 -07:00
483ea64611 [Docs] Replace all explicit anchors with real links (#27087)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-17 02:22:06 -07:00
e20eba753b [VLM][Refactor] Remove useless func get_input_positions in MRotaryEmbedding (#27088)
Signed-off-by: MengqingCao <cmq0113@163.com>
2025-10-17 02:00:30 -07:00
bbc1b29665 Update troubleshooting.md and remind VLLM_TRACE_FUNCTION usage (#27069)
Signed-off-by: cong-meta <prowindy@hotmail.com>
2025-10-17 01:53:06 -07:00
acb1bfa601 [CI] fix docs build failed (#27082)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-10-17 07:53:40 +00:00
75c7ad9918 [Kernel][Performance] Fuse float cast and renormalize to topk softmax kernel (#26717)
Signed-off-by: zhuhaoran <zhuhaoran.zhr@alibaba-inc.com>
Signed-off-by: izhuhaoran <izhuhaoran@qq.com>
2025-10-17 07:30:35 +00:00
5550ff9c25 [CI/Build] Update compressed tensor test path to fix CPU CI (#27068)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
2025-10-16 22:34:56 -07:00
3aeb19a39e [Model] Add support for LightOnOCR (#26916)
Signed-off-by: Said Taghadouini <taghadouinisaid@gmail.com>
Signed-off-by: Said Taghadouini <84044788+staghado@users.noreply.github.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-10-17 05:05:24 +00:00
8c017b3490 [Model] Always use Transformers backend for PaliGemma and Gemma3-MM (#26715)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-17 05:03:35 +00:00
9c2c2287a0 [CI/Build] Update Llama4 eval yaml (#27070)
Signed-off-by: zhewenli <zhewenli@meta.com>
2025-10-17 04:59:47 +00:00
fec2b341ad [Kernel] Lazy import FlashInfer (#26977) 2025-10-17 04:48:18 +00:00
87bc0c492f [Bugfix] Fix ReplicatedLinearWithLoRA (#27065)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-10-17 04:43:16 +00:00
fe3b9372ad [Core] Change execute_model_with_error_logging() to be a ctx manager (#27060)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-10-17 11:45:32 +08:00
bde9e2272a [Bugfix][Qwen] fixes the weights dtype in qwen3_next: it is actually a bfloat16 (#27030)
Signed-off-by: Tao He <linzhu.ht@alibaba-inc.com>
2025-10-17 03:37:52 +00:00
08405609cc disable graph partition in custom op (#26952)
Signed-off-by: Boyuan Feng <boyuan@meta.com>
Signed-off-by: Boyuan Feng <fby.1994@gmail.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
2025-10-17 11:08:47 +08:00
ab81379ea6 [Perf] Exploit out-of-band buffers in shm_broadcast (#26961)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-10-16 20:08:03 -07:00
4ffd6e8942 [Docs] Reduce custom syntax used in docs (#27009)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-16 20:05:34 -07:00
965c5f4914 vllm bench serve shows num of failed requests (#26478)
Signed-off-by: Tomas Ruiz <tomas.ruiz.te@gmail.com>
2025-10-16 19:55:09 -07:00
4d055ef465 Remove unused imports (#26972)
Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>
2025-10-16 19:51:17 -07:00
17c540a993 [torch.compile] fix simple inductor graph partition test (#27050)
Signed-off-by: Boyuan Feng <boyuan@meta.com>
2025-10-16 21:09:36 -04:00
4d4d6bad19 [Chore] Separate out vllm.utils.importlib (#27022)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-17 00:48:59 +00:00
11ae016bd7 [torch.compile] Passing only necessary compilation config to inductor pass config (#27041)
Signed-off-by: Lu Fang <fanglu@fb.com>
Co-authored-by: Lucia (Lu) Fang <fanglu@meta.com>
2025-10-17 00:01:52 +00:00
41d3071918 [NVIDIA] [Perf] Update to leverage flashinfer trtllm FP4 MOE throughput kernel (#26714)
Signed-off-by: jiahanc <173873397+jiahanc@users.noreply.github.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2025-10-16 16:20:25 -07:00
fb5e10d3fb Refactor Transformers backend to use mixins (#26906)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-16 21:50:39 +00:00
b2f78cbad4 [small][batch invariance] Rename the env and internal flags to simplify usage (#26855)
Signed-off-by: Bram Wasti <bwasti@meta.com>
2025-10-16 21:40:25 +00:00
23583ee28c [Bug] Add Assertion for random-input-len / random-output-len (#26834)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-10-16 21:36:39 +00:00
01c977e96d [CI] Prune Quantization Tests and skip compilation (#27038)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-10-16 17:26:35 -04:00
b3dda72c23 [Feature] Migrate DeepGEMM API from get_m_alignment_for_contiguous_layout to get_mk_alignment_for_contiguous_layout (#26935)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
Signed-off-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-10-16 16:46:48 -04:00
fb0571b077 [GPTOSS][DP/EP][Marlin] Enable GPTOSS Batched DP/EP using Marlin kernels (#25997)
Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
2025-10-16 12:53:11 -07:00
2ed8b6b3d0 [Bug] Fix batch invariant test has to is (#27032)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-10-16 19:45:14 +00:00
013abde6ef Adding Warmup to Benchmark Serving (#26943)
Signed-off-by: Kimbo Chen <chentenghung@gmail.com>
2025-10-16 12:44:32 -07:00
a5464dcf92 [Compressed Tensors] Always clone output for compile robustness (#26849)
Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2025-10-16 19:29:59 +00:00
ac3ed5a815 Support block size of 256 used by Intel HPU (#26883)
Signed-off-by: mandy-li <mandy.j.li@intel.com>
2025-10-16 15:10:57 -04:00
e6ba2000ae [gpt-oss][1/N] EZ: refactor serving_responses for modularity (#26948)
Signed-off-by: Andrew Xia <axia@meta.com>
2025-10-16 18:44:06 +00:00
aa255ff55a Support set in the CLI generation (#27031)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-16 18:07:18 +00:00
7bb736d00e Fix Qwen2.5 VL image grid docstring (#27033)
Signed-off-by: zitian zhao <zitian.zhao@tencentmusic.com>
2025-10-16 09:57:36 -07:00
9f4e30904b [Model] Fix Qwen3VL mm mapping (#27027)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-10-16 09:45:59 -07:00
5afd3276df [Feature] Add process_weights_after_loading to AttentionImpl (#26870)
Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io>
2025-10-16 08:02:30 -07:00
43721bc67f [CI] Replace large models with tiny alternatives in tests (#24057)
Signed-off-by: Tahsin Tunan <tahsintunan@gmail.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-16 15:51:27 +01:00
02d709a6f1 [docs] standardize Hugging Face env var to HF_TOKEN (deprecates HUGGING_FACE_HUB_TOKEN) (#27020)
Signed-off-by: Kay Yan <kay.yan@daocloud.io>
2025-10-16 15:31:02 +01:00
4a510ab487 [NIXL] Improve request_finished() debug logs (#25665)
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
2025-10-16 15:55:17 +02:00
314fa8abbf [Attention] Tune CUTLASS MLA num_splits (#26846)
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
2025-10-16 06:36:09 -07:00
334535b6fb [Benchmark] Show E2EL by default for pooling models (#27014)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-16 12:47:09 +00:00
dcbb3f1871 [Bugfix] Correct LayerNorm epsilon parameter in modernbert.py (#27008)
Signed-off-by: bogdanm <152898065+bogdan01m@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-10-16 12:27:44 +00:00
00417f4e44 [MISC] fix import violations for re and triton modules (#26654)
Signed-off-by: Sungjae Lee <33976427+llsj14@users.noreply.github.com>
Co-authored-by: Mengqing Cao <cmq0113@163.com>
2025-10-16 03:38:27 -07:00
ed344f4116 Cleanup code after Python 3.10 upgrade (#26520)
Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>
2025-10-16 03:38:23 -07:00
e51928793e [Model][Bugfix] fix ernie45 vl run failed from shared experts optimization (#26885)
Signed-off-by: wangyafeng <wangyafeng@baidu.com>
2025-10-16 03:37:35 -07:00
d2740fafbf [Chore] Separate out vllm.utils.collections (#26990)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-16 08:35:35 +00:00
17838e50ef [Benchmark] Use truncation by default for pooling benchmarks (#26992)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-16 16:02:39 +08:00
44c8555621 [CI/Build] Fix AMD import failures in CI (#26841)
Signed-off-by: zhewenli <zhewenli@meta.com>
2025-10-16 07:28:20 +00:00
f7d318de2b [Hardware][CPU][PowerPC]Disable torch.compile() in toptopk sampling (#26987)
Signed-off-by: Akash Kaothalkar <akash.kaothalkar@ibm.com>
Co-authored-by: Akash Kaothalkar <akash.kaothalkar@ibm.com>
2025-10-15 22:36:59 -07:00
76f0d05bc6 [CI/Build] Update expected beam search output for Phi3V (#26978)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-16 05:12:44 +00:00
7d8975de84 Deepseek-v3 Batch Invariant on 8xH100 (#26609)
Signed-off-by: Bram Wasti <bwasti@meta.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
2025-10-15 22:06:02 -07:00
785d8b6410 [PERF] Qwen3-next MTP speedup (change bool mask indexing to index_select / index_copy to reduce d2h) (#26437)
Signed-off-by: Vadim Gimpelson <vadim.gimpelson@gmail.com>
2025-10-16 12:18:31 +08:00
f6cdc9a02f [Chore] Rename utils submodules (#26920)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-16 03:58:13 +00:00
509cdc0370 [DOC][XPU]update feature parity with Intel GPU (#26954)
Signed-off-by: Chendi Xue <Chendi.Xue@intel.com>
Signed-off-by: Chendi Xue <chendi.xue@intel.com>
2025-10-15 20:07:10 -07:00
9b6504c307 [BugFix] Work around graph partition x torch.compile cache issue (#26956)
Signed-off-by: Richard Zou <zou3519@gmail.com>
2025-10-15 20:06:11 -07:00
e19b16dde6 [bugfix] Fix SP + PP without specifying compile size (#26955)
Signed-off-by: angelayi <yiangela7@gmail.com>
2025-10-15 20:05:33 -07:00
582f2c6be7 [BUG] Allow runai_streamer_sharded in config check (#26958)
Signed-off-by: ahao-anyscale <ahao@anyscale.com>
2025-10-15 20:05:14 -07:00
f8a0acbdbe [CI] Enable Blackwell Llama4 MoE tests (#26731)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-10-15 21:02:57 -06:00
1317034379 [ROCm][FEAT] Fuse DeepSeek shared experts into AITER fused_moe ops (#24097)
Signed-off-by: chenjun <junchen2@amd.com>
Signed-off-by: kliuae <kuanfu.liu@embeddedllm.com>
Co-authored-by: valarLip <103567126+valarLip@users.noreply.github.com>
Co-authored-by: TJian <tunjian.tan@embeddedllm.com>
2025-10-16 10:41:34 +08:00
0ecc553ee6 [Bugfix] reasoning_parser parameter handling in run_batch.py (#26225)
Signed-off-by: inc-jeong <inc.jeong@navercorp.com>
Signed-off-by: InChang Jeong <inc.jeong@navercorp.com>
Co-authored-by: USER <user@AL02367916.local>
2025-10-16 10:24:05 +08:00
f96bc3649c [Qwen3-Next] Add tuned MoE config for Qwen3-Next FP8 on H100 tp2 (#26887)
Signed-off-by: Felix Zhu <felixzhu555@gmail.com>
2025-10-15 18:55:05 -07:00
938c43ea7f [ci] Adjusting AMD test composition 2025-10-14 (#26852)
Signed-off-by: Alexei V. Ivanov <alexei.ivanov@amd.com>
2025-10-15 23:52:13 +00:00
0a9ef0cfce Move query quantization to attention layer for Flashinfer & Triton. (#26534)
Signed-off-by: adabeyta <aabeyta@redhat.com>
Signed-off-by: Adrian Abeyta <aabeyta@redhat.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
2025-10-15 19:01:38 -04:00
e5b438a247 [Bug] Temporally Disable VLLM_ALLREDUCE_USE_SYMM_MEM by Default (#26925)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-10-15 16:18:50 -04:00
0b99f5d302 support flashinfer_fp4 moe for 5090 gpu (#26669)
Signed-off-by: XiaobingSuper <xiaobingzhangupc@gmail.com>
Signed-off-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2025-10-15 15:06:47 -04:00
1f491aa0c8 Vectorize RMS norm variance using vectorize_read_with_alignment (#26234)
Signed-off-by: Benji Beck <benjibeck@meta.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
2025-10-15 11:54:41 -07:00
de92d916fe [NVIDIA] Add support for cudnn fp4 gemm via flashinfer (#26107)
Signed-off-by: kaixih <kaixih@nvidia.com>
Signed-off-by: mgoin <mgoin64@gmail.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
2025-10-15 13:53:00 -04:00
a1063628a4 [Chore] Clean up CODEOWNERS (#26923)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-10-15 10:52:54 -07:00
d796375258 [ModelOpt] Remove NVFP4 MoE K%16==0 constraint (#26891)
Signed-off-by: XiaobingSuper <xiaobingzhangupc@gmail.com>
2025-10-15 13:06:17 -04:00
14f8456344 [Feature]: Use pydantic validation in observability.py config (#26637)
Signed-off-by: Samuel Wu <cernunnos1710@gmail.com>
Signed-off-by: Sam/Samuel <57896620+cern1710@users.noreply.github.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-15 16:44:03 +00:00
4794c2bd92 Olmo 3 tool parser and tests (#26143)
Signed-off-by: Pradeep Dasigi <pradeepd@allenai.org>
2025-10-15 16:36:12 +00:00
d3cbaa08dc Lower sevarity of log when model info cache misses due to exception (#26917)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-15 09:01:09 -07:00
828523ad8e [Chore] Separate out vllm.utils.async_utils (#26913)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-15 15:33:00 +00:00
136a17fe6e [Chore] Separate out vllm.utils.func (#26904)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-15 13:03:58 +00:00
f57438338d [BugFix] Patch inductor memory plan logic (#26878)
Signed-off-by: Boyuan Feng <boyuan@meta.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-15 12:51:45 +00:00
5d598680e3 chore: remove unused marker (#26890)
Signed-off-by: Max Wittig <max.wittig@siemens.com>
2025-10-15 05:40:33 -07:00
8f4b313c37 [Misc] rename torch_dtype to dtype (#26695)
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-10-15 12:11:48 +00:00
f93e348010 [Misc] Remove isort and yapf ignores (#26888)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-15 12:09:03 +00:00
f54f85129e [Model][2/N] Improve all pooling task | Support multi-vector retrieval (#25370)
Signed-off-by: wang.yuqi <noooop@126.com>
2025-10-15 11:14:41 +00:00
d4d1a6024f [Lora]Load tuned multi-lora kernel configs from json files (#26319)
Signed-off-by: li2haipeng <44383182+li2haipeng@users.noreply.github.com>
Signed-off-by: Haipeng Li <li2haipeng@gmail.com>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
2025-10-15 09:45:14 +00:00
db1764e4e0 [Platform] allow platform to init dp group (#22243)
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-10-15 02:32:17 -07:00
7f83b4ee8e [Easy] Get rid of unnecessary paraenthesis in kv_cache_manager (#26842)
Signed-off-by: Jialin Ouyang <Jialin.Ouyang@gmail.com>
2025-10-15 09:17:43 +00:00
5c3bae1a6a [Fix] Remove divisibility requirement between num_kv_heads and tp_size in bailing_moe (#26876)
Signed-off-by: vito.yy <vito.yy@antgroup.com>
2025-10-15 16:44:04 +08:00
5210dc3940 [Misc] Update TritonLanguagePlaceholder to have attributes that are used by Flash Linear Attention ops. (#26853)
Co-authored-by: Xudong Ma <mxd@meta.com>
2025-10-15 08:37:49 +00:00
650b51f9f9 [doc] add Context Parallel Deployment doc (#26877)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-10-15 16:33:52 +08:00
6256697997 [Doc] ruff format remaining Python examples (#26795)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-15 01:25:49 -07:00
71557a5f7c [CI] Fix mypy for vllm/executor (#26845)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-10-15 01:23:33 -07:00
f3c378ffa7 [CI/Build] Add Qwen2.5-VL-7B-Instruct ChartQA Accuracy Tests in CI (#21810)
Signed-off-by: Ye (Charlotte) Qi <yeq@meta.com>
Signed-off-by: zhewenli <zhewenli@meta.com>
Co-authored-by: Ye (Charlotte) Qi <yeq@meta.com>
Co-authored-by: Ye (Charlotte) Qi <ye.charlotte.qi@gmail.com>
2025-10-15 08:09:56 +00:00
f5ed68ef63 [Deepseek-V3.2][Kernel] Integrate cuda indexer k cache gather (#26456)
Signed-off-by: Yongye Zhu <zyy1102000@gmail.com>
2025-10-15 16:05:01 +08:00
efdef57b1f [bugfix] Lazy import cv2 (#26869)
Signed-off-by: angelayi <yiangela7@gmail.com>
2025-10-15 07:47:50 +00:00
b8a4572157 [Misc] Use helper function to generate dummy messages in OpenAI MM tests (#26875)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-15 07:17:37 +00:00
302ef403a2 [DSA][MLA] Tiny refactor on DeepSeek to make it reusable for different backends (#26656)
Signed-off-by: MengqingCao <cmq0113@163.com>
2025-10-15 00:16:44 -07:00
8865da157b [Bugfix][Multi Modal] Fix incorrect Molmo token processing (#26873)
Signed-off-by: sanghol <sanghol@allenai.org>
2025-10-15 07:13:59 +00:00
f0862eae43 [Graph Partition] pass tests for decorator (#26831)
Signed-off-by: Boyuan Feng <boyuan@meta.com>
2025-10-15 06:39:48 +00:00
8c851f6d04 [Bugfix] Fix qwen3-omni audio truncation issue (#26815)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-10-15 05:38:36 +00:00
7cfa420f49 [BugFix] Patch inductor partitioning logic (#26735)
Signed-off-by: angelayi <yiangela7@gmail.com>
2025-10-15 05:04:32 +00:00
a27b288e4a [Feature] default --extra-body param to disable thinking in vllm bench serve (#26784)
Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io>
2025-10-15 04:23:44 +00:00
e471d7ca7e [CI/Build][Bugfix] fix qutlass cmake error when set QUTLASS_SRC_DIR (#26773)
Signed-off-by: izhuhaoran <izhuhaoran@qq.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2025-10-15 04:09:44 +00:00
c43ca8259e [Docs] Move build.inc into arm.inc (#26862)
Signed-off-by: windsonsea <haifeng.yao@daocloud.io>
2025-10-14 20:35:08 -07:00
85a65e7f51 [Model] Add DeepSeek-V3.1 reasoning parser (split from PR #24972) (#25589)
Signed-off-by: taohui <taohui3@gmail.com>
Signed-off-by: Tao Hui <taohui3@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Chauncey <chaunceyjiang@gmail.com>
2025-10-15 11:09:52 +08:00
a2986b3e33 [Bugfix] Fixes prefix-repetition benchmark script (#26828)
Signed-off-by: Kourosh Hakhamaneshi <Kourosh@anyscale.com>
2025-10-15 02:54:43 +00:00
96b9aa5aa0 [Frontend][torch.compile] CompilationConfig Overhaul (#20283): name change compilation level to compilation mode, deprecation compilation level (#26355)
Signed-off-by: morrison-turnansky <mturnans@redhat.com>
Signed-off-by: Morrison Turnansky <mturnans@redhat.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
2025-10-15 02:51:16 +00:00
e66d787bce Disable FlashInfer sampler by default (#26859)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-10-15 02:35:18 +00:00
bfad142e25 [BUGFIX][NIXL] quick fix for 'assert self.connector_worker is not None' in get_kv_connector_stats (#26851)
Signed-off-by: Chendi Xue <chendi.xue@intel.com>
2025-10-15 02:33:25 +00:00
9354660036 [Bugfix]fix Qwen3 xml tool parser (#26345)
Signed-off-by: Zhikaiiii <1658973216@qq.com>
2025-10-15 09:50:30 +08:00
07ca70af8d [Core][Easy] Use envs.__getattr__ for all Unify to environment variable access (#26810)
Signed-off-by: Jialin Ouyang <Jialin.Ouyang@gmail.com>
2025-10-15 01:41:18 +00:00
2dcd12d357 [torch.compile] Fix tests for torch==2.9 inductor partition (#26116)
Signed-off-by: ProExpertProg <lgovedic@redhat.com>
Signed-off-by: Luka Govedič <lgovedic@redhat.com>
2025-10-14 19:55:02 -04:00
579d2e5458 [WideEP][P/D] Add usage stats for DP+EP and KV Connector (#26836)
Signed-off-by: Tyler Michael Smith <tlrmchlsmth@gmail.com>
2025-10-14 23:51:54 +00:00
0512c04aee [frontend][gptoss] Add per turn stats into Harmony Context (#25061)
Signed-off-by: lacora <hyelacora@gmail.com>
Co-authored-by: Ye Hu <yehu@fb.com>
2025-10-14 16:48:13 -07:00
7e0ef4084a [CI Failure] Fix torchao dep failure for Quantization Test (#26824)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-10-14 16:41:43 -07:00
4aed506b65 [Core] Streamline some structured output related code (#26737)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-10-14 23:27:44 +00:00
a86b4c58e8 remove attn output view kernel (#26680)
Signed-off-by: Boyuan Feng <boyuan@meta.com>
Signed-off-by: Boyuan Feng <fby.1994@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-10-14 22:53:10 +00:00
ff4810ba73 [Minor] Group async_scheduling related fields in model runner init (#26736)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-10-14 14:46:37 -07:00
9d6964926e fix: response_format for completion (#23212)
Signed-off-by: Nan2018 <qinnanjoshua@gmail.com>
2025-10-14 21:23:22 +00:00
0e65818910 Added MoE configs for llama 4, H200 device with tp=4/8 tuning (#26837)
Signed-off-by: Dhruvil Bhatt <bhattdbh@amazon.com>
2025-10-14 14:21:03 -07:00
380f17527c [Perf] Cache vllm.env.__getattr__ result to avoid recomputation (#26146)
Signed-off-by: Jialin Ouyang <Jialin.Ouyang@gmail.com>
2025-10-14 17:03:21 -04:00
b92ab3deda Notice for deprecation of AutoAWQ (#26820)
Signed-off-by: HDCharles <39544797+HDCharles@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-10-14 13:39:59 -07:00
acaa2c0a4a [Core] Reuse empty block lists whenever possible in KVCacheBlocks to mitigate GC costs (#24964)
Signed-off-by: Jialin Ouyang <Jialin.Ouyang@gmail.com>
2025-10-14 12:58:43 -07:00
82af928c41 [Attention][Spec Decode] FlashMLA spec decode support (#26541)
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
2025-10-14 19:38:20 +00:00
87efc681db llama4_vision_rope: add HIP override to accept (q, k) and avoid (positions, q, k) mismatch (#26790)
Signed-off-by: Huamin Li <3ericli@gmail.com>
2025-10-14 11:54:12 -07:00
c3a722fcb2 [CI Failure] Fix tests with missing TinyLlama-1.1B-Chat-v1.0-FP8-e2e (#26816)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-10-14 18:38:59 +00:00
aba48f7db1 [Kernel][MoE] Add MoE tunings for GLM 4.6-FP8 and GLM 4.5 Air on NVidia B200 (#26818) 2025-10-14 11:20:39 -07:00
04b5f9802d [CI] Raise VLLM_MAX_SIZE_MB to 500 due to failing Build wheel - CUDA 12.9 (#26722)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-10-14 10:52:05 -07:00
efc8f7d814 Update coveragerc and add codecov.yml for path fixes (#26435)
Signed-off-by: Reza Barazesh <rezabarazesh@meta.com>
2025-10-14 09:45:06 -07:00
6d87a2838c [Config] Remove Unused Environment Variable VLLM_DISABLE_PAD_FOR_CUDAGRAPH (#26743)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-10-14 11:47:49 -04:00
e6cdbd6792 Revert "[issues template] Encourage the author implement their own ideas" (#26814) 2025-10-14 08:37:34 -07:00
df850c4912 [Feature][Responses API] Stream Function Call - harmony (#24317)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-10-14 08:31:43 -07:00
720394de43 [KVConnector][Metrics] Aggregate scheduler-side KVConnectorStats (#26046)
Signed-off-by: Qier Li <kevin44036@gmail.com>
2025-10-14 14:38:07 +00:00
88a49745af [issues template] Encourage the author implement their own ideas (#26671)
Signed-off-by: wang.yuqi <noooop@126.com>
2025-10-14 22:32:36 +08:00
ca683a2a72 use combo kernel to fuse qk-norm and qk-rope (#26682)
Signed-off-by: Boyuan Feng <boyuan@meta.com>
2025-10-14 09:40:59 -04:00
e9f1b8c9e9 Adjusted the model order of the model registration file (#26798)
Signed-off-by: 汪志鹏 <wangzhipeng628@gmail.com>
2025-10-14 13:26:11 +00:00
ea97940d6c [DCP] Support Decode Context Parallel (DCP) for GQA with FlashAttention (#24864)
Signed-off-by: yuanyongjie.yyj <yuanyongjie.yyj@antgroup.com>
Signed-off-by: FENP <32334296+FENP@users.noreply.github.com>
Signed-off-by: Jaya Yuan <yuanyongjie.yyj@antgroup.com>
2025-10-14 13:07:50 +00:00
fdd32750f0 [CI/Build] Cleanup LoRA test (#26752)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-10-14 12:06:35 +00:00
c715ba3735 [Feature] Change vllm.py with pydantic validation (#26726)
Signed-off-by: Vladislav <vladislav.bronzov@gmail.com>
Signed-off-by: Vladislav Bronzov <58587565+VladOS95-cyber@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-14 12:00:54 +00:00
9c4cb68339 [Chore] Remove SupportsV0Only interface and update supported models docs (#26783)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-14 04:55:10 -07:00
780eb03d9b [CI] Fix test_tool_id_kimi_k2 (#26787)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-10-14 10:27:07 +00:00
ef9676a1f1 [Doc] ruff format some Python examples (#26767)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-14 03:21:53 -07:00
70b1b330e1 Don't allow typos to fix by default (#26785)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-14 03:05:15 -07:00
d1d063a588 [Chore] Use max_transformers_version for Qwen-VL test (#26792)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-14 03:03:46 -07:00
7e6edb1469 [NIXL][HeteroTP] Enable KV transfer from HND prefill to NHD decode (#26556)
Signed-off-by: Chendi Xue <chendi.xue@intel.com>
2025-10-14 09:46:05 +00:00
74704d4553 [Model] Use merge_by_field_config for MM models (O-P) (#26776)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-14 09:42:45 +00:00
d2f816d6ff [Bugfix] Standardize merging multimodal embeddings (#26771)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-14 09:36:21 +00:00
577d498212 [Plugin] Make plugin group clear (#26757)
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-10-14 07:49:59 +00:00
fd85c9f426 [Bugfix][FE]: Always include usage with --enable-force-include-usage (#20983)
Signed-off-by: Max Wittig <max.wittig@siemens.com>
Signed-off-by: Antoine Auger <antoineauger@users.noreply.github.com>
Co-authored-by: Antoine Auger <antoineauger@users.noreply.github.com>
2025-10-14 09:17:39 +02:00
d32c611f45 [CI/Build] Use 127.0.0.1 instead of localhost in utils (#26750)
Signed-off-by: Ye (Charlotte) Qi <yeq@meta.com>
2025-10-14 07:04:00 +00:00
01ad27faff [Model][Bugfix]fix ernie45 load failed due to ernie45 eplb code (#26684)
Signed-off-by: wangyafeng <wangyafeng@baidu.com>
2025-10-14 06:55:23 +00:00
481545b397 scheduler.py: Update the name of the default scheduler. (#26758)
Signed-off-by: Ryan Li <ryanli@ryanli.org>
2025-10-14 06:52:21 +00:00
d3cc8427c0 [ci] Adding the test-amd.yaml for test definitions for the AMD backend. (alternative PR) (#26718)
Signed-off-by: Alexei V. Ivanov <alexei.ivanov@amd.com>
2025-10-13 23:10:23 -07:00
4821ac1b4d [CI] [ROCm] Automate CC list for ROCm related issue (#26753)
Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
2025-10-14 13:57:26 +08:00
4497c8f821 Fix lora tests failure in TPU CI due to the removal of LoRA bias (#26723)
Signed-off-by: Xiongfei Wei <isaacwxf23@gmail.com>
2025-10-14 13:04:23 +08:00
2e36cdbe2b [Docs] Add a start tag to build.inc.md (#26747)
Signed-off-by: windsonsea <haifeng.yao@daocloud.io>
2025-10-13 21:51:55 -07:00
fe3edb4cf0 Add support for the /rerank endpoint in vllm bench serve (#26602)
Signed-off-by: Max de Bayser <mbayser@br.ibm.com>
2025-10-14 04:25:43 +00:00
29350922c6 [Feature][Quantization] auto_round format add support for regex (#24024)
Signed-off-by: n1ck-guo <heng.guo@intel.com>
Signed-off-by: Heng Guo <heng.guo@intel.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-10-14 03:03:16 +00:00
8ae169286f [torch.compile] Unwrap fused_marlin_moe custom op (#26739)
Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
2025-10-14 02:22:16 +00:00
8a0af6a561 [build][torch.compile] upgrade depyf version (#26702)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-10-14 10:12:09 +08:00
cfded80793 [Easy] Fix env type check errors from VLLM_DEBUG_LOG_API_SERVER_RESPONSE (#26742)
Signed-off-by: Jialin Ouyang <Jialin.Ouyang@gmail.com>
2025-10-14 01:46:44 +00:00
b59dd19b55 [compile] Enable sequence parallelism for full cuda graph without specifying compile sizes (#26681)
Signed-off-by: angelayi <yiangela7@gmail.com>
2025-10-13 18:15:34 -07:00
3e051bda82 [UX] Replace VLLM_ALL2ALL_BACKEND with --all2all-backend (#26732)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-10-13 18:12:52 -07:00
1076 changed files with 51476 additions and 19243 deletions

View File

@ -5,11 +5,11 @@ import os
import sys
import zipfile
# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 450 MiB
# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 500 MiB
# Note that we have 800 MiB quota, please use it wisely.
# See https://github.com/pypi/support/issues/6326 .
# Please also sync the value with the one in Dockerfile.
VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 450))
VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 500))
def print_top_10_largest_files(zip_file):
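For orientation, a minimal sketch of how such a size limit is typically enforced against a built wheel; the helper below is hypothetical and only illustrates the env-var default shown above, not the actual script body:

# Hedged sketch: compare a built wheel's size against VLLM_MAX_SIZE_MB (assumed behavior).
import os
import sys

VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 500))

def check_wheel_size(wheel_path: str) -> int:
    size_mb = os.path.getsize(wheel_path) / (1024 * 1024)
    if size_mb > VLLM_MAX_SIZE_MB:
        print(f"{wheel_path} is {size_mb:.1f} MiB, exceeding the {VLLM_MAX_SIZE_MB} MiB limit")
        return 1
    return 0

if __name__ == "__main__":
    sys.exit(check_wheel_size(sys.argv[1]))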

View File

@ -0,0 +1,12 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1
model_name: "HandH1998/QQQ-Llama-3-8b-g128"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.419
- name: "exact_match,flexible-extract"
value: 0.416
limit: 1000
num_fewshot: 5
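A hedged sketch of how a baseline file like this is consumed by the CI accuracy check. The file name and the measured numbers are hypothetical, PyYAML is assumed to be installed, and the comparison is an isclose-style relative check using the RTOL = 0.08 visible in the harness diff further below; the harness's exact comparison may differ.

import yaml  # requires PyYAML

RTOL = 0.08  # relative tolerance used by the lm-eval CI test (see the harness diff below)

with open("QQQ-Llama-3-8b-g128.yaml") as f:  # hypothetical path to the baseline above
    cfg = yaml.safe_load(f)

measured = {  # hypothetical results returned by lm_eval for this model
    "exact_match,strict-match": 0.43,
    "exact_match,flexible-extract": 0.42,
}

for task in cfg["tasks"]:
    for metric in task["metrics"]:
        expected = metric["value"]
        got = measured[metric["name"]]
        # relative comparison against the recorded baseline value
        assert abs(got - expected) <= RTOL * expected, (metric["name"], got, expected)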

View File

@ -0,0 +1,12 @@
# For hf script, without -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -l 100 -t 8
model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
backend: "vllm-vlm"
tasks:
- name: "chartqa"
metrics:
- name: "relaxed_accuracy,none"
# TODO(zhewenl): model card is 0.90, but the actual score is 0.80.
value: 0.80
limit: 100
num_fewshot: 0

View File

@ -0,0 +1,10 @@
# For hf script, without -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -l 250 -t 8 -f 5
model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
tasks:
- name: "mmlu_pro"
metrics:
- name: "exact_match,custom-extract"
value: 0.80
limit: 250 # will run on 250 * 14 subjects = 3500 samples
num_fewshot: 5

View File

@ -1,4 +1,5 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -b auto -l 1319 -f 5 -t 1
# For vllm script, with -t option (tensor parallel size)
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -l 1319 -t 1
model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
tasks:
- name: "gsm8k"

View File

@ -0,0 +1,12 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m Qwen/Qwen2.5-VL-7B-Instruct -l 2500 -t 1
model_name: "Qwen/Qwen2.5-VL-7B-Instruct"
backend: "vllm-vlm"
tasks:
- name: "chartqa"
metrics:
- name: "relaxed_accuracy,none"
value: 0.855
limit: 2500
num_fewshot: 0

View File

@ -0,0 +1 @@
Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml

View File

@ -0,0 +1 @@
Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml

View File

@ -0,0 +1 @@
Qwen2.5-VL-7B-Instruct.yaml

View File

@ -0,0 +1,44 @@
#!/bin/bash
# We can use this script to compute baseline accuracy on chartqa for vllm.
#
# Make sure you have lm-eval-harness installed:
# pip install lm-eval==0.4.9
usage() {
echo
echo "Runs lm eval harness on ChartQA using multimodal vllm."
echo "This pathway is intended to be used to create baselines for "
echo "our correctness tests in vllm's CI."
echo
echo "usage: ${0} <options>"
echo
echo " -m - huggingface stub or local directory of the model"
echo " -l - limit number of samples to run"
echo " -t - tensor parallel size to run at"
echo
}
while getopts "m:l:t:" OPT; do
case ${OPT} in
m )
MODEL="$OPTARG"
;;
l )
LIMIT="$OPTARG"
;;
t )
TP_SIZE="$OPTARG"
;;
\? )
usage
exit 1
;;
esac
done
lm_eval --model vllm-vlm \
--model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE" \
--tasks chartqa \
--batch_size auto \
--apply_chat_template \
--limit $LIMIT
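The same baseline can also be driven through lm-eval's Python API; a minimal sketch assuming lm-eval==0.4.9 is installed, with the model and limit taken from the Qwen2.5-VL ChartQA config earlier in this diff:

import lm_eval

results = lm_eval.simple_evaluate(
    model="vllm-vlm",
    model_args="pretrained=Qwen/Qwen2.5-VL-7B-Instruct,tensor_parallel_size=1",
    tasks=["chartqa"],
    batch_size="auto",
    apply_chat_template=True,  # mirrors --apply_chat_template in the script above
    limit=2500,
)
print(results["results"]["chartqa"])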

View File

View File

@ -0,0 +1,50 @@
#!/bin/bash
# We can use this script to compute baseline accuracy on MMLUPRO for vllm.
# We use this for fp8, which HF does not support.
#
# Make sure you have lm-eval-harness installed:
# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
usage() {
echo
echo "Runs lm eval harness on MMLU Pro using huggingface transformers."
echo "This pathway is intended to be used to create baselines for "
echo "our automated nm-test-accuracy workflow"
echo
echo "usage: ${0} <options>"
echo
echo " -m - huggingface stub or local directory of the model"
echo " -l - limit number of samples to run"
echo " -f - number of fewshot samples to use"
echo " -t - tensor parallel size to run at"
echo
}
while getopts "m:b:l:f:t:" OPT; do
case ${OPT} in
m )
MODEL="$OPTARG"
;;
b )
BATCH_SIZE="$OPTARG"
;;
l )
LIMIT="$OPTARG"
;;
f )
FEWSHOT="$OPTARG"
;;
t )
TP_SIZE="$OPTARG"
;;
\? )
usage
exit 1
;;
esac
done
lm_eval --model vllm \
--model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,add_bos_token=true,trust_remote_code=true,max_model_len=4096" \
--tasks mmlu_pro --num_fewshot "$FEWSHOT" --limit "$LIMIT" \
--batch_size auto

View File

@ -19,21 +19,27 @@ RTOL = 0.08
def launch_lm_eval(eval_config, tp_size):
trust_remote_code = eval_config.get("trust_remote_code", False)
max_model_len = eval_config.get("max_model_len", 4096)
batch_size = eval_config.get("batch_size", "auto")
backend = eval_config.get("backend", "vllm")
model_args = (
f"pretrained={eval_config['model_name']},"
f"tensor_parallel_size={tp_size},"
f"enforce_eager=true,"
f"add_bos_token=true,"
f"trust_remote_code={trust_remote_code},"
f"max_model_len={max_model_len}"
f"max_model_len={max_model_len},"
)
results = lm_eval.simple_evaluate(
model="vllm",
model=backend,
model_args=model_args,
tasks=[task["name"] for task in eval_config["tasks"]],
num_fewshot=eval_config["num_fewshot"],
limit=eval_config["limit"],
batch_size="auto",
# TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed to help
# text models. However, this is regressing measured strict-match for
# existing text models in CI, so only apply it for mm.
apply_chat_template=backend == "vllm-vlm",
batch_size=batch_size,
)
return results
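A minimal config sketch exercising the two new knobs, backend and batch_size; the values are copied from the Qwen2.5-VL ChartQA yaml above, and launch_lm_eval refers to the function defined in this diff (vLLM and lm-eval must be installed for it to actually run):

eval_config = {
    "model_name": "Qwen/Qwen2.5-VL-7B-Instruct",
    "backend": "vllm-vlm",   # new key: defaults to "vllm" when omitted
    "batch_size": "auto",    # new key: previously hard-coded to "auto"
    "tasks": [{"name": "chartqa",
               "metrics": [{"name": "relaxed_accuracy,none", "value": 0.855}]}],
    "num_fewshot": 0,
    "limit": 2500,
}
results = launch_lm_eval(eval_config, tp_size=1)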

View File

@ -7,6 +7,7 @@ from importlib import util
import pandas as pd
pd.options.display.float_format = "{:.2f}".format
plotly_found = util.find_spec("plotly.express") is not None
@ -109,7 +110,10 @@ def compare_data_columns(
if len(compare_frames) >= 2:
base = compare_frames[0]
current = compare_frames[-1]
ratio = current / base
if "P99" in data_column or "Median" in data_column:
ratio = base / current # for latency
else:
ratio = current / base
ratio = ratio.mask(base == 0) # avoid inf when baseline is 0
ratio.name = f"Ratio 1 vs {len(compare_frames)}"
frames.append(ratio)
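The flip matters because throughput is better when higher while TTFT/TPOT latency is better when lower; dividing base by current for latency columns keeps "ratio greater than 1 means the newer run improved" true for both kinds of metric. A small sketch of the rule in isolation, with hypothetical numbers:

import pandas as pd

base = pd.Series({"Output Tput (tok/s)": 100.0, "P99 TTFT (ms)": 300.0})
current = pd.Series({"Output Tput (tok/s)": 120.0, "P99 TTFT (ms)": 250.0})

ratios = {}
for col in base.index:
    if "P99" in col or "Median" in col:   # latency column: lower is better, so invert
        ratios[col] = base[col] / current[col]
    else:                                  # throughput column: higher is better
        ratios[col] = current[col] / base[col]
print(ratios)  # both ratios are 1.2 here, so the newer run improved on both metrics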
@ -199,6 +203,71 @@ def split_json_by_tp_pp(
return saved_paths
def _add_limit_line(fig, y_value, label):
# Visible dashed line + annotation
fig.add_hline(
y=y_value,
line_dash="dash",
line_color="red" if "ttft" in label.lower() else "blue",
annotation_text=f"{label}: {y_value} ms",
annotation_position="top left",
)
# Optional: add a legend item (as a transparent helper trace)
if plot and plotly_found:
import plotly.graph_objects as go
fig.add_trace(
go.Scatter(
x=[None],
y=[None],
mode="lines",
line=dict(
dash="dash", color="red" if "ttft" in label.lower() else "blue"
),
name=f"{label}",
)
)
def _find_concurrency_col(df: pd.DataFrame) -> str:
for c in [
"# of max concurrency.",
"# of max concurrency",
"Max Concurrency",
"max_concurrency",
"Concurrency",
]:
if c in df.columns:
return c
# Fallback: guess an integer-like column (harmless if unused)
for c in df.columns:
if df[c].dtype.kind in "iu" and df[c].nunique() > 1 and df[c].min() >= 1:
return c
return "# of max concurrency."
def _highlight_threshold(
df: pd.DataFrame, threshold: float
) -> "pd.io.formats.style.Styler":
"""Highlight numeric per-configuration columns with value <= threshold."""
conc_col = _find_concurrency_col(df)
key_cols = [
c
for c in ["Model", "Dataset Name", "Input Len", "Output Len", conc_col]
if c in df.columns
]
conf_cols = [
c for c in df.columns if c not in key_cols and not str(c).startswith("Ratio")
]
conf_cols = [c for c in conf_cols if pd.api.types.is_numeric_dtype(df[c])]
return df.style.map(
lambda v: "background-color:#e6ffe6;font-weight:bold;"
if pd.notna(v) and v <= threshold
else "",
subset=conf_cols,
)
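A usage sketch for the new helper, assuming the _highlight_threshold defined above is in scope (it relies on pandas Styler.map, i.e. pandas 2.1 or newer); the toy frame and the 3000 ms threshold are hypothetical:

import pandas as pd

toy = pd.DataFrame({
    "Model": ["llama-8B", "llama-8B"],
    "# of max concurrency.": [32, 64],
    "config_a P99 TTFT (ms)": [2100.0, 2900.0],
    "config_b P99 TTFT (ms)": [3400.0, 5200.0],
})
styler = _highlight_threshold(toy, threshold=3000.0)
html = styler.to_html(table_attributes='border="1" class="dataframe"')
# only the config_a values (at or below 3000 ms) receive the green, bold highlight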
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
@ -220,6 +289,26 @@ if __name__ == "__main__":
default="# of max concurrency.",
help="column name to use as X Axis in comparison graph",
)
parser.add_argument(
"-l",
"--latency",
type=str,
default="p99",
help="take median|p99 for latency like TTFT/TPOT",
)
parser.add_argument(
"--ttft-max-ms",
type=float,
default=3000.0,
help="Reference limit for TTFT plots (ms)",
)
parser.add_argument(
"--tpot-max-ms",
type=float,
default=100.0,
help="Reference limit for TPOT plots (ms)",
)
args = parser.parse_args()
drop_column = "P99"
@ -234,12 +323,22 @@ if __name__ == "__main__":
"# of max concurrency.",
"qps",
]
data_cols_to_compare = ["Output Tput (tok/s)", "Median TTFT (ms)", "Median"]
html_msgs_for_data_cols = [
"Compare Output Tokens /n",
"Median TTFT /n",
"Median TPOT /n",
]
if "median" in args.latency:
data_cols_to_compare = ["Output Tput (tok/s)", "Median TTFT (ms)", "Median"]
html_msgs_for_data_cols = [
"Compare Output Tokens /n",
"Median TTFT /n",
"Median TPOT /n",
]
drop_column = "P99"
elif "p99" in args.latency:
data_cols_to_compare = ["Output Tput (tok/s)", "P99 TTFT (ms)", "P99"]
html_msgs_for_data_cols = [
"Compare Output Tokens /n",
"P99 TTFT /n",
"P99 TPOT /n",
]
if len(args.file) == 1:
files = split_json_by_tp_pp(args.file[0], output_root="splits")
@ -275,33 +374,83 @@ if __name__ == "__main__":
f"Expected subset: {filtered_info_cols}, "
f"but DataFrame has: {list(output_df.columns)}"
)
output_df_sorted = output_df.sort_values(by=existing_group_cols)
# output_df_sorted = output_df.sort_values(by=existing_group_cols)
output_df_sorted = output_df.sort_values(by=args.xaxis)
output_groups = output_df_sorted.groupby(existing_group_cols, dropna=False)
for name, group in output_groups:
html = group.to_html()
group_name = (
",".join(map(str, name)).replace(",", "_").replace("/", "-")
)
group_html_name = "perf_comparison_" + group_name + ".html"
metric_name = str(data_cols_to_compare[i]).lower()
if "tok/s" in metric_name:
html = group.to_html()
elif "ttft" in metric_name:
styler = _highlight_threshold(group, args.ttft_max_ms).format(
{c: "{:.2f}" for c in group.select_dtypes("number").columns},
na_rep="",
)
html = styler.to_html(
table_attributes='border="1" class="dataframe"'
)
elif (
"tpot" in metric_name
or "median" in metric_name
or "p99" in metric_name
):
styler = _highlight_threshold(group, args.tpot_max_ms).format(
{c: "{:.2f}" for c in group.select_dtypes("number").columns},
na_rep="",
)
html = styler.to_html(
table_attributes='border="1" class="dataframe"'
)
text_file.write(html_msgs_for_data_cols[i])
text_file.write(html)
with open(group_html_name, "a+") as sub_text_file:
sub_text_file.write(html_msgs_for_data_cols[i])
sub_text_file.write(html)
if plot and plotly_found:
import plotly.express as px
if plot and plotly_found:
import plotly.express as px
df = group[raw_data_cols]
df_sorted = df.sort_values(by=info_cols[y_axis_index])
# Melt DataFrame for plotting
df_melted = df_sorted.melt(
id_vars=info_cols[y_axis_index],
var_name="Configuration",
value_name=data_cols_to_compare[i],
)
title = data_cols_to_compare[i] + " vs " + info_cols[y_axis_index]
# Create Plotly line chart
fig = px.line(
df_melted,
x=info_cols[y_axis_index],
y=data_cols_to_compare[i],
color="Configuration",
title=title,
markers=True,
)
# Export to HTML
text_file.write(fig.to_html(full_html=True, include_plotlyjs="cdn"))
df = group[raw_data_cols]
df_sorted = df.sort_values(by=info_cols[y_axis_index])
# Melt DataFrame for plotting
df_melted = df_sorted.melt(
id_vars=info_cols[y_axis_index],
var_name="Configuration",
value_name=data_cols_to_compare[i],
)
title = (
data_cols_to_compare[i] + " vs " + info_cols[y_axis_index]
)
# Create Plotly line chart
fig = px.line(
df_melted,
x=info_cols[y_axis_index],
y=data_cols_to_compare[i],
color="Configuration",
title=title,
markers=True,
)
# ---- Add threshold lines based on metric name ----
if "ttft" in metric_name:
_add_limit_line(fig, args.ttft_max_ms, "TTFT limit")
elif (
"tpot" in metric_name
or "median" in metric_name
or "p99" in metric_name
):
_add_limit_line(fig, args.tpot_max_ms, "TPOT limit")
# Export to HTML
text_file.write(
fig.to_html(full_html=True, include_plotlyjs="cdn")
)
sub_text_file.write(
fig.to_html(full_html=True, include_plotlyjs="cdn")
)

View File

@ -63,9 +63,11 @@ serving_column_mapping = {
"mean_ttft_ms": "Mean TTFT (ms)",
"median_ttft_ms": "Median TTFT (ms)",
"p99_ttft_ms": "P99 TTFT (ms)",
"std_ttft_ms": "STD TTFT (ms)",
"mean_tpot_ms": "Mean TPOT (ms)",
"median_tpot_ms": "Median",
"p99_tpot_ms": "P99",
"std_tpot_ms": "STD TPOT (ms)",
"mean_itl_ms": "Mean ITL (ms)",
"median_itl_ms": "Median ITL (ms)",
"p99_itl_ms": "P99 ITL (ms)",
@ -368,7 +370,7 @@ if __name__ == "__main__":
# The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...",
# we want to turn it into "8xGPUTYPE"
df["GPU"] = df["GPU"].apply(
lambda x: f"{len(x.splitlines())}x{x.splitlines()[0]}"
lambda x: "{}x{}".format(len(x.split("\n")), x.split("\n")[0])
)
# get markdown tables

View File

@ -471,6 +471,11 @@ main() {
mkdir -p $RESULTS_FOLDER
QUICK_BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
# dump vllm info via vllm collect-env
env_output=$(vllm collect-env)
echo "$env_output" >"$RESULTS_FOLDER/vllm_env.txt"
# benchmarking
run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}"
run_latency_tests $QUICK_BENCHMARK_ROOT/tests/"${LATENCY_JSON:-latency-tests$ARCH.json}"
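A rough Python equivalent of the new env-dump step, assuming the vllm CLI is on PATH and the results directory already exists (the path stands in for $RESULTS_FOLDER):

import pathlib
import subprocess

# capture `vllm collect-env` output and persist it alongside the benchmark results
env_output = subprocess.run(
    ["vllm", "collect-env"], capture_output=True, text=True, check=True
).stdout
pathlib.Path("results/vllm_env.txt").write_text(env_output)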

View File

@ -1,28 +1,24 @@
[
{
"test_name": "latency_llama8B_tp1",
"test_name": "latency_llama8B_tp2",
"environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"load_format": "dummy",
"num_iters_warmup": 5,
"num_iters": 15
}
},
{
"test_name": "latency_llama8B_tp4",
"environment_variables": {
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 4,
"load_format": "dummy",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"num_iters_warmup": 5,
"num_iters": 15
}
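For reference, the benchmark runner turns each "parameters" object into CLI flags; a hedged sketch of that conversion with a hypothetical helper, where empty-string values such as "enforce_eager": "" become bare boolean flags:

def params_to_cli(params: dict) -> str:
    args = []
    for key, value in params.items():
        flag = "--" + key.replace("_", "-")
        if value == "":            # e.g. "enforce_eager": "" -> bare flag
            args.append(flag)
        else:
            args.append(f"{flag} {value}")
    return " ".join(args)

params = {
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "tensor_parallel_size": 2,
    "dtype": "bfloat16",
    "enforce_eager": "",
    "num_iters_warmup": 5,
    "num_iters": 15,
}
print("vllm bench latency " + params_to_cli(params))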

View File

@ -95,6 +95,38 @@
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_bf16_tp4_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 4,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_bf16_tp2pp3_sharegpt",
"qps_list": ["inf"],
@ -233,6 +265,41 @@
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_bf16_tp4_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 4,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_bf16_tp2pp3_random_128_128",
"qps_list": ["inf"],
@ -365,6 +432,38 @@
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_int8_tp4_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 4,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_int8_tp2pp3_sharegpt",
"qps_list": ["inf"],
@ -503,6 +602,41 @@
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_int8_tp4_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 4,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_int8_tp2pp3_random_128_128",
"qps_list": ["inf"],
@ -638,6 +772,39 @@
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_int4_tp4_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"quantization": "awq",
"tensor_parallel_size": 4,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_int4_tp2pp3_sharegpt",
"qps_list": ["inf"],
@ -780,6 +947,42 @@
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_int4_tp4_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"quantization": "awq",
"tensor_parallel_size": 4,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_int4_tp2pp3_random_128_128",
"qps_list": ["inf"],

View File

@ -2,7 +2,7 @@
{
"test_name": "serving_llama8B_tp1_sharegpt",
"qps_list": [1, 4, 16, "inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"max_concurrency_list": [32],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@ -28,13 +28,13 @@
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
"num_prompts": 32
}
},
{
"test_name": "serving_llama8B_tp2_sharegpt",
"qps_list": [1, 4, 16, "inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"max_concurrency_list": [32],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@ -60,13 +60,13 @@
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
"num_prompts": 32
}
},
{
"test_name": "serving_llama8B_tp4_sharegpt",
"test_name": "serving_llama8B_tp1_random_128_128",
"qps_list": [1, 4, 16, "inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"max_concurrency_list": [32],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@ -76,39 +76,7 @@
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 4,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_tp4_random_1024_128",
"qps_list": [1, 4, 16, "inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 4,
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
@ -124,16 +92,16 @@
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 1024,
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 100
"num_prompts": 32
}
},
{
"test_name": "serving_llama8B_pp6_random_1024_128",
"test_name": "serving_llama8B_tp2_random_128_128",
"qps_list": [1, 4, 16, "inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"max_concurrency_list": [32],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@ -143,7 +111,7 @@
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"pipeline_parallel_size": 6,
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
@ -159,10 +127,150 @@
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 1024,
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 100
"num_prompts": 32
}
},
{
"test_name": "serving_llama8B_tp1_random_128_2048",
"qps_list": [1, 4, 16, "inf"],
"max_concurrency_list": [32],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 2048,
"ignore-eos": "",
"num_prompts": 32
}
},
{
"test_name": "serving_llama8B_tp2_random_128_2048",
"qps_list": [1, 4, 16, "inf"],
"max_concurrency_list": [32],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 2048,
"ignore-eos": "",
"num_prompts": 32
}
},
{
"test_name": "serving_llama8B_tp1_random_2048_128",
"qps_list": [1, 4, 16, "inf"],
"max_concurrency_list": [32],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 32
}
},
{
"test_name": "serving_llama8B_tp2_random_2048_128",
"qps_list": [1, 4, 16, "inf"],
"max_concurrency_list": [32],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 32
}
}
]
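Each serving entry pairs a qps_list with a max_concurrency_list; assuming the runner sweeps their cross product (which the trimmed lists suggest, though this sketch does not verify the harness behavior), the per-test run count is easy to read off:

import itertools

qps_list = [1, 4, 16, "inf"]
max_concurrency_list = [32]

combos = list(itertools.product(qps_list, max_concurrency_list))
print(len(combos), combos)  # 4 runs per test with the trimmed lists above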

View File

@ -1,29 +1,24 @@
[
{
"test_name": "throughput_llama8B_tp1",
"test_name": "throughput_llama8B_tp2",
"environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"load_format": "dummy",
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200,
"backend": "vllm"
}
},
{
"test_name": "throughput_llama8B_tp4",
"environment_variables": {
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 4,
"load_format": "dummy",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200,
"backend": "vllm"

View File

@ -1,5 +1,5 @@
steps:
# aarch64 + CUDA builds. PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9
# aarch64 + CUDA builds
- label: "Build arm64 wheel - CUDA 12.9"
depends_on: ~
id: build-wheel-arm64-cuda-12-9
@ -15,6 +15,21 @@ steps:
env:
DOCKER_BUILDKIT: "1"
# aarch64 build
- label: "Build arm64 CPU wheel"
depends_on: ~
id: build-wheel-arm64-cpu
agents:
queue: arm64_cpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh"
env:
DOCKER_BUILDKIT: "1"
# x86 + CUDA builds
- label: "Build wheel - CUDA 12.8"
depends_on: ~
id: build-wheel-cuda-12-8
@ -28,20 +43,6 @@ steps:
env:
DOCKER_BUILDKIT: "1"
- label: "Build wheel - CUDA 12.6"
depends_on: ~
id: build-wheel-cuda-12-6
agents:
queue: cpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.6.3 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh"
env:
DOCKER_BUILDKIT: "1"
# x86 + CUDA builds
- label: "Build wheel - CUDA 12.9"
depends_on: ~
id: build-wheel-cuda-12-9
@ -55,6 +56,20 @@ steps:
env:
DOCKER_BUILDKIT: "1"
- label: "Build wheel - CUDA 13.0"
depends_on: ~
id: build-wheel-cuda-13-0
agents:
queue: cpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh"
env:
DOCKER_BUILDKIT: "1"
# Build release images (12.9)
- label: "Build release image (x86)"
depends_on: ~
id: build-release-image-x86
@ -62,13 +77,12 @@ steps:
queue: cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
# re-tag to default image tag and push, just in case arm64 build fails
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
# PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9
- label: "Build release image (arm64)"
depends_on: ~
id: build-release-image-arm64
@ -142,6 +156,22 @@ steps:
env:
DOCKER_BUILDKIT: "1"
- block: "Build arm64 CPU release image"
key: block-arm64-cpu-release-image-build
depends_on: ~
- label: "Build and publish arm64 CPU release image"
depends_on: block-arm64-cpu-release-image-build
agents:
queue: arm64_cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest"
- "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
env:
DOCKER_BUILDKIT: "1"
- label: "Build and publish nightly multi-arch image to DockerHub"
depends_on:
- create-multi-arch-manifest

View File

@ -70,7 +70,7 @@ function cpu_tests() {
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pytest -x -s -v \
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]"
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs"
# Note: disable it until supports V1
# Run AWQ test

View File

@ -58,33 +58,25 @@ python3 .buildkite/generate_index.py --wheel "$normal_wheel"
aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
if [[ $normal_wheel == *"cu126"* ]]; then
# if $normal_wheel matches cu126, do not upload the index.html
echo "Skipping index files for cu126 wheels"
elif [[ $normal_wheel == *"cu128"* ]]; then
# if $normal_wheel matches cu128, do not upload the index.html
echo "Skipping index files for cu128 wheels"
else
if [[ $normal_wheel == *"cu129"* ]]; then
# only upload index.html for cu129 wheels (default wheels) as it
# is available on both x86 and arm64
aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
else
echo "Skipping index files for non-cu129 wheels"
fi
# generate index for nightly
aws s3 cp "$wheel" "s3://vllm-wheels/nightly/"
aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
if [[ $normal_wheel == *"cu126"* ]]; then
# if $normal_wheel matches cu126, do not upload the index.html
echo "Skipping index files for cu126 wheels"
elif [[ $normal_wheel == *"cu128"* ]]; then
# if $normal_wheel matches cu128, do not upload the index.html
echo "Skipping index files for cu128 wheels"
else
if [[ $normal_wheel == *"cu129"* ]]; then
# only upload index.html for cu129 wheels (default wheels) as it
# is available on both x86 and arm64
aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
else
echo "Skipping index files for non-cu129 wheels"
fi
aws s3 cp "$wheel" "s3://vllm-wheels/$version/"

1328
.buildkite/test-amd.yaml Normal file

File diff suppressed because it is too large.

View File

@ -38,7 +38,7 @@ steps:
- label: Pytorch Nightly Dependency Override Check # 2min
# if this test fails, it means the nightly torch version is not compatible with some
# of the dependencies. Please check the error message and add the package to whitelist
# in /vllm/tools/generate_nightly_torch_test.py
# in /vllm/tools/pre_commit/generate_nightly_torch_test.py
soft_fail: true
source_file_dependencies:
- requirements/nightly_torch_test.txt
@ -172,6 +172,8 @@ steps:
- tests/v1/engine/test_engine_core_client.py
- tests/distributed/test_symm_mem_allreduce.py
commands:
# https://github.com/NVIDIA/nccl/issues/1838
- export NCCL_CUMEM_HOST_ENABLE=0
# test with torchrun tp=2 and external_dp=2
- torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
# test with torchrun tp=2 and pp=2
@ -311,6 +313,15 @@ steps:
- pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
- label: V1 Test attention (H100) # 10min
timeout_in_minutes: 30
gpu: h100
source_file_dependencies:
- vllm/v1/attention
- tests/v1/attention
commands:
- pytest -v -s v1/attention
- label: V1 Test others (CPU) # 5 mins
source_file_dependencies:
- vllm/
@ -349,7 +360,8 @@ steps:
- python3 offline_inference/basic/embed.py
- python3 offline_inference/basic/score.py
- python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
- python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
# https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
- python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
- label: Platform Tests (CUDA) # 4min
timeout_in_minutes: 15
@ -384,7 +396,12 @@ steps:
--num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
--ignore=lora/test_chatglm3_tp.py \
--ignore=lora/test_llama_tp.py \
--ignore=lora/test_llm_with_multi_loras.py
--ignore=lora/test_llm_with_multi_loras.py \
--ignore=lora/test_olmoe_tp.py \
--ignore=lora/test_deepseekv2_tp.py \
--ignore=lora/test_gptoss.py \
--ignore=lora/test_qwen3moe_tp.py
parallelism: 4
- label: PyTorch Compilation Unit Tests # 15min
@ -416,8 +433,8 @@ steps:
- pytest -v -s compile/test_basic_correctness.py
- pytest -v -s compile/piecewise/
- label: PyTorch Fullgraph Test # 20min
timeout_in_minutes: 30
- label: PyTorch Fullgraph Test # 22min
timeout_in_minutes: 35
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
@ -425,6 +442,19 @@ steps:
- tests/compile
commands:
- pytest -v -s compile/test_full_graph.py
- pytest -v -s compile/test_fusions_e2e.py
- label: Cudagraph test
timeout_in_minutes: 20
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- tests/v1/cudagraph
- vllm/v1/cudagraph_dispatcher.py
- vllm/config/compilation.py
- vllm/compilation
commands:
- pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py
- pytest -v -s v1/cudagraph/test_cudagraph_mode.py
- label: Kernels Core Operation Test # 48min
timeout_in_minutes: 75
@ -468,6 +498,8 @@ steps:
- tests/kernels/moe
- vllm/model_executor/layers/fused_moe/
- vllm/distributed/device_communicators/
- vllm/envs.py
- vllm/config
commands:
- pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
parallelism: 2
@ -527,8 +559,9 @@ steps:
# since torchao nightly is only compatible with torch nightly currently
# https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
# we can only upgrade after this is resolved
- pip install --pre torchao==0.13.0.dev20250814 --index-url https://download.pytorch.org/whl/nightly/cu128
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/
# TODO(jerryzh168): resolve the above comment
- uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
- label: LM Eval Small Models # 53min
timeout_in_minutes: 75
@ -677,8 +710,10 @@ steps:
- vllm/
- tests/models/language/generation
commands:
# Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
- pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
# Install fast path packages for testing against transformers
# Note: also needed to run plamo2 model in vLLM
- uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
- uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
- pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
- label: Language Models Test (PPL)
@ -733,6 +768,16 @@ steps:
- pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
- label: Multi-Modal Accuracy Eval (Small Models) # 50min
timeout_in_minutes: 70
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies:
- vllm/multimodal/
- vllm/inputs/
- vllm/v1/core/
commands:
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
- label: Multi-Modal Models Test (Extended) 1
mirror_hardwares: [amdexperimental]
optional: true
@ -796,8 +841,8 @@ steps:
# Whisper needs spawn method to avoid deadlock
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
- label: Blackwell Test # 38 min
timeout_in_minutes: 60
- label: Blackwell Test # 21 min
timeout_in_minutes: 30
working_dir: "/vllm-workspace/"
gpu: b200
# optional: true
@ -810,8 +855,6 @@ steps:
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
- vllm/v1/attention/backends/flashinfer.py
- vllm/compilation/fusion.py
- vllm/compilation/fusion_attn.py
commands:
- nvidia-smi
- python3 examples/offline_inference/basic/chat.py
@ -828,15 +871,32 @@ steps:
- pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
- pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
- pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
# Fusion
- pytest -v -s tests/compile/test_fusion_all_reduce.py
- pytest -v -s tests/compile/test_fusion_attn.py::test_attention_quant_pattern
- pytest -v -s tests/kernels/moe/test_flashinfer.py
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
- pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
- pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
- pytest -v -s tests/kernels/moe/test_flashinfer.py
- label: Blackwell Fusion Tests # 30 min
timeout_in_minutes: 40
working_dir: "/vllm-workspace/"
gpu: b200
source_file_dependencies:
- csrc/quantization/fp4/
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
- vllm/v1/attention/backends/flashinfer.py
- vllm/compilation/
# can affect pattern matching
- vllm/model_executor/layers/layernorm.py
- vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py
commands:
- nvidia-smi
- pytest -v -s tests/compile/test_fusion_attn.py
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
# this runner has 2 GPUs available even though num_gpus=2 is not set
- pytest -v -s tests/compile/test_fusion_all_reduce.py
- pytest -v -s tests/compile/test_fusions_e2e.py
- label: Blackwell GPT-OSS Eval
timeout_in_minutes: 60
@ -943,6 +1003,8 @@ steps:
- tests/v1/shutdown
- tests/v1/worker/test_worker_memory_snapshot.py
commands:
# https://github.com/NVIDIA/nccl/issues/1838
- export NCCL_CUMEM_HOST_ENABLE=0
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
- DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
@ -950,6 +1012,7 @@ steps:
- pytest -v -s ./compile/test_basic_correctness.py
- pytest -v -s ./compile/test_wrapper.py
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
- VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
- pytest -v -s distributed/test_sequence_parallel.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
- pytest -v -s v1/worker/test_worker_memory_snapshot.py
@ -993,6 +1056,11 @@ steps:
- pytest -v -s plugins_tests/test_io_processor_plugins.py
- pip uninstall prithvi_io_processor_plugin -y
# end io_processor plugins test
# begin stat_logger plugins test
- pip install -e ./plugins/vllm_add_dummy_stat_logger
- pytest -v -s plugins_tests/test_stats_logger_plugins.py
- pip uninstall dummy_stat_logger -y
# end stat_logger plugins test
# other tests continue here:
- pytest -v -s plugins_tests/test_scheduler_plugins.py
- pip install -e ./plugins/vllm_add_dummy_model
@ -1032,6 +1100,7 @@ steps:
- pytest -v -s -x lora/test_chatglm3_tp.py
- pytest -v -s -x lora/test_llama_tp.py
- pytest -v -s -x lora/test_llm_with_multi_loras.py
- pytest -v -s -x lora/test_olmoe_tp.py
- label: Weight Loading Multiple GPU Test # 33min
@ -1057,6 +1126,17 @@ steps:
- tests/weight_loading
commands:
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
- label: NixlConnector PD accuracy tests (Distributed) # 30min
timeout_in_minutes: 30
working_dir: "/vllm-workspace/tests"
num_gpus: 4
source_file_dependencies:
- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
- tests/v1/kv_connector/nixl_integration/
commands:
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
- bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh
##### multi gpus test #####
@ -1089,7 +1169,7 @@ steps:
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
##### H200 test #####
- label: Distrubted Tests (H200) # optional
- label: Distributed Tests (H200) # optional
gpu: h200
optional: true
working_dir: "/vllm-workspace/"
@ -1097,6 +1177,8 @@ steps:
commands:
- pytest -v -s tests/compile/test_async_tp.py
- pytest -v -s tests/compile/test_sequence_parallelism.py
- pytest -v -s tests/compile/test_fusion_all_reduce.py
- pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
- pytest -v -s tests/distributed/test_context_parallel.py
- CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048

View File

@ -1,5 +1,10 @@
[run]
source = vllm
# Track the installed vllm package (this is what actually gets imported during tests)
# Use wildcard pattern to match the installed location
source =
vllm
*/dist-packages/vllm
*/site-packages/vllm
omit =
*/tests/*
*/test_*
@ -12,6 +17,16 @@ omit =
*/benchmarks/*
*/docs/*
[paths]
# Map all possible vllm locations to a canonical "vllm" path
# This ensures coverage.combine properly merges data from different test runs
source =
vllm
/vllm-workspace/src/vllm
/vllm-workspace/vllm
*/site-packages/vllm
*/dist-packages/vllm
[report]
exclude_lines =
pragma: no cover
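
A minimal sketch of how this configuration is typically exercised when merging per-run coverage data (standard coverage.py API; the output filename is an assumption):

import coverage

# Coverage() reads .coveragerc automatically, including the [paths] mapping above.
cov = coverage.Coverage()
cov.combine()        # merge .coverage.* data files from different test runs
cov.save()
cov.xml_report(outfile="coverage.xml")  # paths are remapped onto the canonical vllm/ tree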

14
.github/CODEOWNERS vendored
View File

@ -5,10 +5,8 @@
/vllm/attention @LucasWilkinson
/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
/vllm/model_executor/layers/fused_moe @mgoin
/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @NickLucche
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256
/vllm/model_executor/layers/fused_moe @mgoin @pavanimajety
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety
/vllm/model_executor/layers/mamba @tdoublep
/vllm/model_executor/model_loader @22quinn
/vllm/multimodal @DarkLight1337 @ywang96 @NickLucche
@ -26,9 +24,9 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
/vllm/config/cache.py @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg @heheda12345
# vLLM V1
/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
/vllm/v1/attention @LucasWilkinson
/vllm/v1/attention/backends/flashinfer.py @mgoin
/vllm/v1/attention/backends/mla @pavanimajety
/vllm/v1/attention/backends/flashinfer.py @mgoin @pavanimajety
/vllm/v1/attention/backends/triton_attn.py @tdoublep
/vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC
/vllm/v1/sample @22quinn @houseroad @njhill
@ -47,7 +45,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
/tests/kernels @mgoin @tlrmchlsmth @WoosukKwon @yewentao256
/tests/models @DarkLight1337 @ywang96
/tests/multimodal @DarkLight1337 @ywang96 @NickLucche
/tests/quantization @mgoin @robertgshaw2-redhat @yewentao256
/tests/quantization @mgoin @robertgshaw2-redhat @yewentao256 @pavanimajety
/tests/test_inputs.py @DarkLight1337 @ywang96
/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
/tests/v1/structured_output @mgoin @russellb @aarnphm
@ -60,7 +58,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
/tests/v1/offloading @ApostaC
# Transformers backend
/vllm/model_executor/models/transformers.py @hmellor
/vllm/model_executor/models/transformers @hmellor
/tests/models/test_transformers.py @hmellor
# Docs

View File

@ -13,6 +13,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Label issues based on keywords
id: label-step
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with:
script: |
@ -42,7 +43,6 @@ jobs:
searchIn: "body"
},
],
// Substring search - matches anywhere in text (partial matches)
substrings: [
{
@ -89,14 +89,12 @@ jobs:
term: "hip_",
searchIn: "both"
},
// ROCm tools and libraries
{
term: "hipify",
searchIn: "both"
},
],
// Regex patterns - for complex pattern matching
regexPatterns: [
{
@ -107,13 +105,17 @@ jobs:
}
],
},
// Add more label configurations here as needed
// example: {
// keywords: [...],
// substrings: [...],
// regexPatterns: [...]
// },
};
// Helper function to create regex based on search type
function createSearchRegex(term, type) {
// Escape special regex characters in the term
const escapedTerm = term.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
switch (type) {
case 'keyword':
// Word boundary search - matches whole words only
@ -125,16 +127,13 @@ jobs:
throw new Error(`Unknown search type: ${type}`);
}
}
// Helper function to find matching terms in text with line information
function findMatchingTermsWithLines(text, searchTerms = [], searchType = 'keyword', searchLocation = '') {
const matches = [];
const lines = text.split('\n');
for (const termConfig of searchTerms) {
let regex;
let term, searchIn, pattern, description, flags;
// Handle different input formats (string or object)
if (typeof termConfig === 'string') {
term = termConfig;
@ -146,21 +145,17 @@ jobs:
description = termConfig.description;
flags = termConfig.flags;
}
// Skip if this term shouldn't be searched in the current location
if (searchIn !== 'both' && searchIn !== searchLocation) {
continue;
}
// Create appropriate regex
if (searchType === 'regex') {
regex = new RegExp(pattern, flags || "gi");
} else {
regex = createSearchRegex(term, searchType);
}
const termMatches = [];
// Check each line for matches
lines.forEach((line, lineIndex) => {
const lineMatches = line.match(regex);
@ -175,15 +170,14 @@ jobs:
originalTerm: term || pattern,
description: description,
// Show context around the match in the line
context: line.length > 100 ?
line.substring(Math.max(0, line.toLowerCase().indexOf(match.toLowerCase()) - 30),
line.toLowerCase().indexOf(match.toLowerCase()) + match.length + 30) + '...'
context: line.length > 100 ?
line.substring(Math.max(0, line.toLowerCase().indexOf(match.toLowerCase()) - 30),
line.toLowerCase().indexOf(match.toLowerCase()) + match.length + 30) + '...'
: line.trim()
});
});
}
});
if (termMatches.length > 0) {
matches.push({
term: term || (description || pattern),
@ -196,64 +190,48 @@ jobs:
});
}
}
return matches;
}
// Helper function to check if label should be added
async function processLabel(labelName, config) {
const body = context.payload.issue.body || "";
const title = context.payload.issue.title || "";
core.notice(`Processing label: ${labelName}`);
core.notice(`Issue Title: "${title}"`);
core.notice(`Issue Body length: ${body.length} characters`);
let shouldAddLabel = false;
let allMatches = [];
let reason = '';
const keywords = config.keywords || [];
const substrings = config.substrings || [];
const regexPatterns = config.regexPatterns || [];
core.notice(`Searching with ${keywords.length} keywords, ${substrings.length} substrings, and ${regexPatterns.length} regex patterns`);
// Search in title
if (title.trim()) {
core.notice(`Searching in title: "${title}"`);
const titleKeywordMatches = findMatchingTermsWithLines(title, keywords, 'keyword', 'title');
const titleSubstringMatches = findMatchingTermsWithLines(title, substrings, 'substring', 'title');
const titleRegexMatches = findMatchingTermsWithLines(title, regexPatterns, 'regex', 'title');
allMatches.push(...titleKeywordMatches, ...titleSubstringMatches, ...titleRegexMatches);
}
// Search in body
if (body.trim()) {
core.notice(`Searching in body (${body.length} characters)`);
const bodyKeywordMatches = findMatchingTermsWithLines(body, keywords, 'keyword', 'body');
const bodySubstringMatches = findMatchingTermsWithLines(body, substrings, 'substring', 'body');
const bodyRegexMatches = findMatchingTermsWithLines(body, regexPatterns, 'regex', 'body');
allMatches.push(...bodyKeywordMatches, ...bodySubstringMatches, ...bodyRegexMatches);
}
if (allMatches.length > 0) {
core.notice(`Found ${allMatches.length} matching term(s):`);
for (const termMatch of allMatches) {
const locationText = termMatch.searchLocation === 'title' ? 'title' : 'body';
const searchInText = termMatch.searchIn === 'both' ? 'both' : termMatch.searchIn;
if (termMatch.searchType === 'regex') {
core.notice(` 📍 Regex: "${termMatch.term}" (pattern: ${termMatch.pattern}) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`);
} else {
core.notice(` 📍 Term: "${termMatch.term}" (${termMatch.searchType} search) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`);
}
// Show details for each match
termMatch.matches.forEach((match, index) => {
core.notice(` ${index + 1}. Line ${match.lineNumber} in ${match.searchLocation}: "${match.match}" [${match.searchType}]`);
@ -266,7 +244,6 @@ jobs:
}
});
}
shouldAddLabel = true;
const totalMatches = allMatches.reduce((sum, t) => sum + t.count, 0);
const titleMatches = allMatches.filter(t => t.searchLocation === 'title').reduce((sum, t) => sum + t.count, 0);
@ -274,13 +251,10 @@ jobs:
const keywordMatches = allMatches.filter(t => t.searchType === 'keyword').reduce((sum, t) => sum + t.count, 0);
const substringMatches = allMatches.filter(t => t.searchType === 'substring').reduce((sum, t) => sum + t.count, 0);
const regexMatches = allMatches.filter(t => t.searchType === 'regex').reduce((sum, t) => sum + t.count, 0);
reason = `Found ${totalMatches} total matches (${titleMatches} in title, ${bodyMatches} in body) - ${keywordMatches} keyword matches, ${substringMatches} substring matches, ${regexMatches} regex matches`;
}
core.notice(`Final decision: ${shouldAddLabel ? 'ADD LABEL' : 'DO NOT ADD LABEL'}`);
core.notice(`Reason: ${reason || 'No matching terms found'}`);
if (shouldAddLabel) {
const existingLabels = context.payload.issue.labels.map(l => l.name);
if (!existingLabels.includes(labelName)) {
@ -296,14 +270,92 @@ jobs:
core.notice(`Label "${labelName}" already present.`);
return false;
}
core.notice(`No matching terms found for label "${labelName}".`);
return false;
}
// Process all configured labels
const processLabels = Object.entries(labelConfig)
.map(([labelName, config]) => processLabel(labelName, config));
const labelsAdded = await Promise.all(processLabels);
const numLabelsAdded = labelsAdded.reduce((x, y) => x + y, 0);
core.notice(`Processing complete. ${numLabelsAdded} label(s) added.`);
const labelsAddedResults = await Promise.all(
Object.entries(labelConfig).map(([labelName, config]) =>
processLabel(labelName, config).then(added => ({ labelName, added }))
)
);
const numLabelsAdded = labelsAddedResults.filter(r => r.added).length;
core.notice(`Processing complete. ${numLabelsAdded} label(s) added.`);
// Return which labels were added for the next step
const addedLabels = labelsAddedResults.filter(r => r.added).map(r => r.labelName);
core.setOutput('labels_added', JSON.stringify(addedLabels));
return addedLabels;
- name: CC users for labeled issues
if: steps.label-step.outputs.labels_added != '[]'
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with:
script: |
// Configuration: Map labels to GitHub users to CC
// You can add multiple users per label, and multiple label configurations
const ccConfig = {
rocm: {
users: ['hongxiayang', 'tjtanaa', 'vllmellm'], // Add more users as needed: ['user1', 'user2', 'user3']
message: 'CC {users} for ROCm-related issue' // {users} will be replaced with @mentions
},
// Add more label -> user mappings here
// Example:
// cuda: {
// users: ['user1', 'user2'],
// message: 'CC {users} for CUDA-related issue'
// },
// performance: {
// users: ['perfexpert'],
// message: 'CC {users} for performance issue'
// },
};
const labelsAdded = JSON.parse('${{ steps.label-step.outputs.labels_added }}');
core.notice(`Labels added: ${labelsAdded.join(', ')}`);
// Get existing comments to check for already mentioned users
const comments = await github.rest.issues.listComments({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: context.issue.number,
});
const issueBody = context.payload.issue.body || '';
const allExistingText = issueBody + '\n' + comments.data.map(c => c.body).join('\n');
// Process each label that was added
for (const label of labelsAdded) {
if (ccConfig[label]) {
const config = ccConfig[label];
const usersToMention = [];
// Check which users haven't been mentioned yet
for (const user of config.users) {
const mentionPattern = new RegExp(`@${user}\\b`, 'i');
if (!mentionPattern.test(allExistingText)) {
usersToMention.push(user);
} else {
core.notice(`@${user} already mentioned for label "${label}", skipping`);
}
}
// Post comment if there are users to mention
if (usersToMention.length > 0) {
const mentions = usersToMention.map(u => `@${u}`).join(' ');
const message = config.message.replace('{users}', mentions);
await github.rest.issues.createComment({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: context.issue.number,
body: message
});
core.notice(`CC comment added for label "${label}": ${mentions}`);
} else {
core.notice(`All users for label "${label}" already mentioned, skipping comment`);
}
}
}

3
.gitignore vendored
View File

@ -94,6 +94,9 @@ ipython_config.py
# generated files
**/generated/**
# uv
uv.lock
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:

View File

@ -4,7 +4,6 @@ MD013: false
MD024:
siblings_only: true
MD033: false
MD042: false
MD045: false
MD046: false
MD051: false

View File

@ -16,6 +16,7 @@ repos:
rev: v1.38.1
hooks:
- id: typos
args: [--force-exclude]
- repo: https://github.com/pre-commit/mirrors-clang-format
rev: v21.1.2
hooks:
@ -37,18 +38,18 @@ repos:
rev: 0.9.1
hooks:
- id: pip-compile
args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128, --python-platform, x86_64-manylinux_2_28]
args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu129, --python-platform, x86_64-manylinux_2_28]
files: ^requirements/test\.(in|txt)$
- repo: local
hooks:
- id: format-torch-nightly-test
name: reformat nightly_torch_test.txt to be in sync with test.in
language: python
entry: python tools/generate_nightly_torch_test.py
entry: python tools/pre_commit/generate_nightly_torch_test.py
files: ^requirements/test\.(in|txt)$
- id: mypy-local
name: Run mypy for local Python installation
entry: python tools/pre_commit/mypy.py 0 "local"
name: Run mypy locally for lowest supported Python version
entry: python tools/pre_commit/mypy.py 0 "3.10"
stages: [pre-commit] # Don't run in CI
<<: &mypy_common
language: python
@ -77,12 +78,12 @@ repos:
stages: [manual] # Only run in CI
- id: shellcheck
name: Lint shell scripts
entry: tools/shellcheck.sh
entry: tools/pre_commit/shellcheck.sh
language: script
types: [shell]
- id: png-lint
name: Lint PNG exports from excalidraw
entry: tools/png-lint.sh
entry: tools/pre_commit/png-lint.sh
language: script
types: [png]
- id: signoff-commit
@ -99,12 +100,12 @@ repos:
stages: [commit-msg]
- id: check-spdx-header
name: Check SPDX headers
entry: python tools/check_spdx_header.py
entry: python tools/pre_commit/check_spdx_header.py
language: python
types: [python]
- id: check-root-lazy-imports
name: Check root lazy imports
entry: python tools/check_init_lazy_imports.py
entry: python tools/pre_commit/check_init_lazy_imports.py
language: python
types: [python]
- id: check-filenames
@ -118,11 +119,11 @@ repos:
pass_filenames: false
- id: update-dockerfile-graph
name: Update Dockerfile dependency graph
entry: tools/update-dockerfile-graph.sh
entry: tools/pre_commit/update-dockerfile-graph.sh
language: script
- id: enforce-import-regex-instead-of-re
name: Enforce import regex as re
entry: python tools/enforce_regex_import.py
entry: python tools/pre_commit/enforce_regex_import.py
language: python
types: [python]
pass_filenames: false
@ -130,7 +131,7 @@ repos:
# forbid directly import triton
- id: forbid-direct-triton-import
name: "Forbid direct 'import triton'"
entry: python tools/check_triton_import.py
entry: python tools/pre_commit/check_triton_import.py
language: python
types: [python]
pass_filenames: false
@ -143,7 +144,7 @@ repos:
additional_dependencies: [regex]
- id: validate-config
name: Validate configuration has default values and that each field has a docstring
entry: python tools/validate_config.py
entry: python tools/pre_commit/validate_config.py
language: python
additional_dependencies: [regex]
# Keep `suggestion` last

View File

@ -49,8 +49,8 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1
# requirements.txt files and should be kept consistent. The ROCm torch
# versions are derived from docker/Dockerfile.rocm
#
set(TORCH_SUPPORTED_VERSION_CUDA "2.8.0")
set(TORCH_SUPPORTED_VERSION_ROCM "2.8.0")
set(TORCH_SUPPORTED_VERSION_CUDA "2.9.0")
set(TORCH_SUPPORTED_VERSION_ROCM "2.9.0")
#
# Try to find python package with an executable that exactly matches
@ -883,6 +883,7 @@ target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1)
set(VLLM_MOE_EXT_SRC
"csrc/moe/torch_bindings.cpp"
"csrc/moe/moe_align_sum_kernels.cu"
"csrc/moe/moe_lora_align_sum_kernels.cu"
"csrc/moe/topk_softmax_kernels.cu")
if(VLLM_GPU_LANG STREQUAL "CUDA")

View File

@ -5,7 +5,7 @@ import gc
from benchmark_utils import TimeCollector
from tabulate import tabulate
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.v1.core.block_pool import BlockPool

View File

@ -46,7 +46,7 @@ import time
from vllm import LLM, SamplingParams
from vllm.engine.arg_utils import EngineArgs
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser
def test_long_document_qa(llm=None, sampling_params=None, prompts=None):

View File

@ -19,7 +19,7 @@ from vllm.config import (
VllmConfig,
)
from vllm.platforms import current_platform
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.v1.spec_decode.ngram_proposer import NgramProposer
from vllm.v1.worker.gpu_input_batch import InputBatch
from vllm.v1.worker.gpu_model_runner import GPUModelRunner

View File

@ -37,7 +37,7 @@ from transformers import PreTrainedTokenizerBase
from vllm import LLM, SamplingParams
from vllm.engine.arg_utils import EngineArgs
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser
try:
from vllm.transformers_utils.tokenizer import get_tokenizer

View File

@ -11,7 +11,7 @@ import time
from transformers import AutoTokenizer, PreTrainedTokenizerBase
from vllm.engine.arg_utils import EngineArgs
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser
# Select an equi-probable random priority

View File

@ -31,6 +31,7 @@ import time
import uuid
import warnings
from collections.abc import AsyncGenerator
from contextlib import nullcontext
from dataclasses import dataclass
import datasets
@ -50,7 +51,7 @@ except ImportError:
from backend_request_func import get_tokenizer
try:
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser
except ImportError:
from argparse import ArgumentParser as FlexibleArgumentParser
@ -501,15 +502,9 @@ async def benchmark(
pbar = None if disable_tqdm else tqdm(total=len(input_requests))
# This can be used once the minimum Python version is 3.10 or higher,
# and it will simplify the code in limited_request_func.
# semaphore = (asyncio.Semaphore(max_concurrency)
# if max_concurrency else contextlib.nullcontext())
semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None
semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else nullcontext()
async def limited_request_func(request_func_input, pbar):
if semaphore is None:
return await request_func(request_func_input=request_func_input, pbar=pbar)
async with semaphore:
return await request_func(request_func_input=request_func_input, pbar=pbar)

View File

@ -15,7 +15,7 @@ from utils import make_rand_sparse_tensors
from weight_shapes import WEIGHT_SHAPES
from vllm import _custom_ops as ops
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser
DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]

View File

@ -18,7 +18,8 @@ from vllm import _custom_ops as ops
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
w8a8_triton_block_scaled_mm,
)
from vllm.utils import FlexibleArgumentParser, cdiv
from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.utils.math_utils import cdiv
DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]

View File

@ -10,7 +10,8 @@ import torch
from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
from vllm.triton_utils import triton
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
def with_triton_mode(fn):

View File

@ -10,7 +10,8 @@ import vllm.model_executor.layers.activation # noqa F401
from vllm.model_executor.custom_op import CustomOp
from vllm.platforms import current_platform
from vllm.triton_utils import triton
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
batch_size_range = [1, 16, 32, 64, 128]
seq_len_range = [1, 16, 64, 128, 256, 512, 1024, 2048, 4096]

View File

@ -28,7 +28,7 @@ except ImportError as e:
from bitblas import Matmul, MatmulConfig, auto_detect_nvidia_target
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser
parser = FlexibleArgumentParser(
description="Benchmark BitBLAS int4 on a specific target."

View File

@ -20,7 +20,7 @@ from vllm.model_executor.layers.fused_moe.config import (
from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp4
from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
from vllm.scalar_type import scalar_types
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser
WEIGHT_SHAPES_MOE = {
"nvidia/DeepSeek-R1-FP4": [

View File

@ -14,7 +14,7 @@ from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_confi
from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8
from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
from vllm.platforms import current_platform
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser
# Weight shapes for different models: [num_experts, topk, hidden_size,
# intermediate_size]

View File

@ -39,7 +39,7 @@ from vllm.distributed.device_communicators.pynccl_allocator import (
)
from vllm.distributed.device_communicators.symm_mem import SymmMemCommunicator
from vllm.logger import init_logger
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser
logger = init_logger(__name__)

View File

@ -13,7 +13,7 @@ from vllm.model_executor.layers.fused_moe.fused_moe import (
fused_experts,
fused_topk,
)
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser
DEFAULT_MODELS = [
"nm-testing/Mixtral-8x7B-Instruct-v0.1",

View File

@ -7,7 +7,8 @@ import torch
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.platforms import current_platform
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
@torch.inference_mode()

View File

@ -25,7 +25,7 @@ if HAS_TRITON:
from vllm.lora.ops.triton_ops import LoRAKernelMeta, lora_expand, lora_shrink
from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser
DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
DEFAULT_TP_SIZES = [1]

View File

@ -33,7 +33,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
quantize_weights,
)
from vllm.scalar_type import ScalarType, scalar_types
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser
DEFAULT_MODELS = ["meta-llama/Llama-3-8b", "meta-llama/Llama-2-70b-hf"]
DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512, 1024]

View File

@ -44,7 +44,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
sort_weights,
)
from vllm.scalar_type import ScalarType, scalar_types
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser
DEFAULT_MODELS = ["meta-llama/Llama-2-7b-hf/TP1"]
DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192]

View File

@ -22,7 +22,7 @@ from vllm.model_executor.layers.fused_moe.fused_moe import *
from vllm.platforms import current_platform
from vllm.transformers_utils.config import get_config
from vllm.triton_utils import triton
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser
FP8_DTYPE = current_platform.fp8_dtype()
@ -631,7 +631,7 @@ def main(args: argparse.Namespace):
else:
ensure_divisibility(intermediate_size, args.tp_size, "intermediate_size")
shard_intermediate_size = 2 * intermediate_size // args.tp_size
dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
dtype = torch.float16 if current_platform.is_rocm() else config.dtype
use_fp8_w8a8 = args.dtype == "fp8_w8a8"
use_int8_w8a16 = args.dtype == "int8_w8a16"
block_quant_shape = get_weight_block_size_safety(config)

View File

@ -17,7 +17,7 @@ from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import (
)
from vllm.model_executor.layers.fused_moe.utils import _fp8_quantize
from vllm.platforms import current_platform
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser
FP8_DTYPE = current_platform.fp8_dtype()
@ -344,7 +344,7 @@ def main(args: argparse.Namespace):
topk = config.num_experts_per_tok
hidden_size = config.hidden_size
dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
dtype = torch.float16 if current_platform.is_rocm() else config.dtype
use_fp8_w8a8 = args.dtype == "fp8_w8a8"
use_int8_w8a16 = args.dtype == "int8_w8a16"
use_customized_permute = args.use_customized_permute

View File

@ -39,7 +39,7 @@ import torch
from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.platforms import current_platform
from vllm.transformers_utils.config import get_config
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

View File

@ -9,9 +9,9 @@ import torch
from vllm import _custom_ops as ops
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.utils import (
from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.utils.torch_utils import (
STR_DTYPE_TO_TORCH_DTYPE,
FlexibleArgumentParser,
create_kv_caches_with_random,
)

View File

@ -1,155 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import itertools
import torch
from vllm import _custom_ops as vllm_ops
from vllm.triton_utils import triton
def polynorm_naive(
x: torch.Tensor,
weight: torch.Tensor,
bias: torch.Tensor,
eps: float = 1e-6,
):
orig_shape = x.shape
x = x.view(-1, x.shape[-1])
def norm(x, eps: float):
return x / torch.sqrt(x.pow(2).mean(-1, keepdim=True) + eps)
x = x.float()
return (
(
weight[0] * norm(x**3, eps)
+ weight[1] * norm(x**2, eps)
+ weight[2] * norm(x, eps)
+ bias
)
.to(weight.dtype)
.view(orig_shape)
)
def polynorm_vllm(
x: torch.Tensor,
weight: torch.Tensor,
bias: torch.Tensor,
eps: float = 1e-6,
):
orig_shape = x.shape
x = x.view(-1, x.shape[-1])
out = torch.empty_like(x)
vllm_ops.poly_norm(out, x, weight, bias, eps)
output = out
output = output.view(orig_shape)
return output
def calculate_diff(batch_size, seq_len, hidden_dim):
dtype = torch.bfloat16
x = torch.randn(batch_size, seq_len, hidden_dim, dtype=dtype, device="cuda")
weight = torch.ones(3, dtype=dtype, device="cuda")
bias = torch.ones(1, dtype=dtype, device="cuda")
output_naive = polynorm_naive(x, weight, bias)
output_vllm = polynorm_vllm(x, weight, bias)
if torch.allclose(output_naive, output_vllm, atol=1e-2, rtol=1e-2):
print("✅ All implementations match")
else:
print("❌ Implementations differ")
batch_size_range = [2**i for i in range(0, 7, 2)]
seq_length_range = [2**i for i in range(6, 11, 1)]
dim_range = [2048, 4096]
configs = list(itertools.product(dim_range, batch_size_range, seq_length_range))
def get_benchmark():
@triton.testing.perf_report(
triton.testing.Benchmark(
x_names=["dim", "batch_size", "seq_len"],
x_vals=[list(_) for _ in configs],
line_arg="provider",
line_vals=["naive", "vllm"],
line_names=["Naive", "vLLM"],
styles=[("blue", "-"), ("red", "-")],
ylabel="us",
plot_name="polynorm-perf",
args={},
)
)
def benchmark(dim, batch_size, seq_len, provider):
dtype = torch.bfloat16
hidden_dim = dim * 4
x = torch.randn(batch_size, seq_len, hidden_dim, dtype=dtype, device="cuda")
weight = torch.ones(3, dtype=dtype, device="cuda")
bias = torch.ones(1, dtype=dtype, device="cuda")
quantiles = [0.5, 0.2, 0.8]
if provider == "naive":
ms, min_ms, max_ms = triton.testing.do_bench(
lambda: polynorm_naive(x, weight, bias),
quantiles=quantiles,
)
else:
ms, min_ms, max_ms = triton.testing.do_bench(
lambda: polynorm_vllm(x, weight, bias),
quantiles=quantiles,
)
return 1000 * ms, 1000 * max_ms, 1000 * min_ms
return benchmark
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser()
parser.add_argument(
"--batch-size",
type=int,
default=4,
help="Batch size",
)
parser.add_argument(
"--seq-len",
type=int,
default=128,
help="Sequence length",
)
parser.add_argument(
"--hidden-dim",
type=int,
default=8192,
help="Intermediate size of MLP",
)
parser.add_argument(
"--save-path",
type=str,
default="./configs/polnorm/",
help="Path to save polnorm benchmark results",
)
args = parser.parse_args()
# Run correctness test
calculate_diff(
batch_size=args.batch_size,
seq_len=args.seq_len,
hidden_dim=args.hidden_dim,
)
benchmark = get_benchmark()
# Run performance benchmark
benchmark.run(print_data=True, save_path=args.save_path)

View File

@ -7,7 +7,8 @@ import torch
from vllm import _custom_ops as ops
from vllm.platforms import current_platform
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
@torch.inference_mode()

View File

@ -9,9 +9,9 @@ from tabulate import tabulate
from vllm import _custom_ops as ops
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.utils import (
from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.utils.torch_utils import (
STR_DTYPE_TO_TORCH_DTYPE,
FlexibleArgumentParser,
create_kv_caches_with_random,
)

View File

@ -12,9 +12,9 @@ from vllm.attention.ops.triton_reshape_and_cache_flash import (
)
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.utils import (
from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.utils.torch_utils import (
STR_DTYPE_TO_TORCH_DTYPE,
FlexibleArgumentParser,
create_kv_caches_with_random_flash,
)

View File

@ -8,7 +8,7 @@ import torch
from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding, get_rope
from vllm.platforms import current_platform
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser
def benchmark_rope_kernels_multi_lora(

View File

@ -8,7 +8,7 @@ from datetime import datetime
import flashinfer
import torch
from vllm.utils import round_up
from vllm.utils.math_utils import round_up
FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
FP8_DTYPE = torch.float8_e4m3fn

View File

@ -8,7 +8,7 @@ from datetime import datetime
import flashinfer
import torch
from vllm.utils import round_up
from vllm.utils.math_utils import round_up
FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
FP8_DTYPE = torch.float8_e4m3fn

View File

@ -18,7 +18,7 @@ from vllm.model_executor.layers.quantization.utils.fp8_utils import (
)
from vllm.platforms import current_platform
from vllm.triton_utils import triton
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser
mp.set_start_method("spawn", force=True)

View File

@ -11,7 +11,7 @@ import regex as re
import seaborn as sns
from torch.utils.benchmark import Measurement as TMeasurement
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser
if __name__ == "__main__":
parser = FlexibleArgumentParser(

View File

@ -1251,7 +1251,7 @@ async def main() -> None:
default=None,
help="The model name used in the API. "
"If not specified, the model name will be the "
"same as the ``--model`` argument. ",
"same as the `--model` argument. ",
)
parser.add_argument(

View File

@ -5,7 +5,7 @@ import cProfile
import pstats
from vllm import LLM, SamplingParams
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser
# A very long prompt, total number of tokens is about 15k.
LONG_PROMPT = ["You are an expert in large language models, aren't you?"] * 1000

View File

@ -188,16 +188,60 @@ else()
message(FATAL_ERROR "vLLM CPU backend requires AVX512, AVX2, Power9+ ISA, S390X ISA, ARMv8 or RISC-V support.")
endif()
#
# Build oneDNN for W8A8 GEMM kernels (only for x86-AVX512 /ARM platforms)
# Flag to enable ACL kernels for AARCH64 platforms
if (VLLM_BUILD_ACL STREQUAL "ON")
set(USE_ACL ON)
else()
set(USE_ACL OFF)
endif()
# Build oneDNN for GEMM kernels (only for x86-AVX512 /ARM platforms)
if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND) OR POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND)
# Fetch and build Arm Compute Library (ACL) as oneDNN's backend for AArch64
# TODO [fadara01]: remove this once ACL can be fetched and built automatically as a dependency of oneDNN
if(ASIMD_FOUND)
if(DEFINED ENV{ACL_ROOT_DIR} AND IS_DIRECTORY "$ENV{ACL_ROOT_DIR}")
message(STATUS "Using ACL from specified source directory: $ENV{ACL_ROOT_DIR}")
else()
message(STATUS "Downloading Arm Compute Library (ACL) from GitHub")
FetchContent_Populate(arm_compute
SUBBUILD_DIR "${FETCHCONTENT_BASE_DIR}/arm_compute-subbuild"
SOURCE_DIR "${FETCHCONTENT_BASE_DIR}/arm_compute-src"
GIT_REPOSITORY https://github.com/ARM-software/ComputeLibrary.git
GIT_TAG v52.2.0
GIT_SHALLOW TRUE
GIT_PROGRESS TRUE
)
set(ENV{ACL_ROOT_DIR} "${arm_compute_SOURCE_DIR}")
endif()
# Build ACL with scons
include(ProcessorCount)
ProcessorCount(_NPROC)
set(_scons_cmd
scons -j${_NPROC}
Werror=0 debug=0 neon=1 examples=0 embed_kernels=0 os=linux
arch=armv8.2-a build=native benchmark_examples=0 fixed_format_kernels=1
multi_isa=1 openmp=1 cppthreads=0
)
# locate PyTorch's libgomp (e.g. site-packages/torch.libs/libgomp-947d5fa1.so.1.0.0)
# and create a local shim dir with it
include("${CMAKE_CURRENT_LIST_DIR}/utils.cmake")
vllm_prepare_torch_gomp_shim(VLLM_TORCH_GOMP_SHIM_DIR)
if(NOT VLLM_TORCH_GOMP_SHIM_DIR STREQUAL "")
list(APPEND _scons_cmd extra_link_flags=-L${VLLM_TORCH_GOMP_SHIM_DIR})
endif()
execute_process(
COMMAND ${_scons_cmd}
WORKING_DIRECTORY "$ENV{ACL_ROOT_DIR}"
RESULT_VARIABLE _acl_rc
)
if(NOT _acl_rc EQUAL 0)
message(FATAL_ERROR "ACL SCons build failed (exit ${_acl_rc}).")
endif()
set(ONEDNN_AARCH64_USE_ACL "ON")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,-rpath,$ENV{ACL_ROOT_DIR}/build/")
add_compile_definitions(VLLM_USE_ACL)
endif()
set(FETCHCONTENT_SOURCE_DIR_ONEDNN "$ENV{FETCHCONTENT_SOURCE_DIR_ONEDNN}" CACHE PATH "Path to a local oneDNN source directory.")
if(FETCHCONTENT_SOURCE_DIR_ONEDNN)
@ -217,16 +261,6 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
)
endif()
if(USE_ACL)
find_library(ARM_COMPUTE_LIBRARY NAMES arm_compute PATHS $ENV{ACL_ROOT_DIR}/build/)
if(NOT ARM_COMPUTE_LIBRARY)
message(FATAL_ERROR "Could not find ARM Compute Library: please set ACL_ROOT_DIR")
endif()
set(ONEDNN_AARCH64_USE_ACL "ON")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,-rpath,$ENV{ACL_ROOT_DIR}/build/")
add_compile_definitions(VLLM_USE_ACL)
endif()
set(ONEDNN_LIBRARY_TYPE "STATIC")
set(ONEDNN_BUILD_DOC "OFF")
set(ONEDNN_BUILD_EXAMPLES "OFF")

View File

@ -19,7 +19,7 @@ else()
FetchContent_Declare(
flashmla
GIT_REPOSITORY https://github.com/vllm-project/FlashMLA
GIT_TAG 5f65b85703c7ed75fda01e06495077caad207c3f
GIT_TAG 46d64a8ebef03fa50b4ae74937276a5c940e3f95
GIT_PROGRESS TRUE
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
@ -66,6 +66,7 @@ if(FLASH_MLA_ARCHS)
${flashmla_SOURCE_DIR}/csrc/extension/torch_api.cpp
${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/pybind.cpp
${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/flash_fwd_mla_fp8_sm90.cu
${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/flash_fwd_mla_metadata.cu
)
set(FlashMLA_INCLUDES

View File

@ -22,10 +22,10 @@ else()
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
)
FetchContent_Populate(qutlass)
set(qutlass_SOURCE_DIR "${qutlass_SOURCE_DIR}")
endif()
FetchContent_Populate(qutlass)
if(NOT qutlass_SOURCE_DIR)
message(FATAL_ERROR "[QUTLASS] source directory could not be resolved.")
endif()

View File

@ -38,7 +38,7 @@ else()
FetchContent_Declare(
vllm-flash-attn
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
GIT_TAG 8f468e7da54a8e2f98abfa7c38636aac91c0cba1
GIT_TAG a893712401d70362fbb299cd9c4b3476e8e9ed54
GIT_PROGRESS TRUE
# Don't share the vllm-flash-attn build between build types
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn

View File

@ -129,6 +129,44 @@ function (get_torch_gpu_compiler_flags OUT_GPU_FLAGS GPU_LANG)
set(${OUT_GPU_FLAGS} ${GPU_FLAGS} PARENT_SCOPE)
endfunction()
# Find libgomp that gets shipped with PyTorch wheel and create a shim dir with:
# libgomp.so -> libgomp-<hash>.so...
# libgomp.so.1 -> libgomp-<hash>.so...
# OUTPUT: TORCH_GOMP_SHIM_DIR ("" if not found)
function(vllm_prepare_torch_gomp_shim TORCH_GOMP_SHIM_DIR)
set(${TORCH_GOMP_SHIM_DIR} "" PARENT_SCOPE)
# Use run_python to locate vendored libgomp; never throw on failure.
run_python(_VLLM_TORCH_GOMP_PATH
"
import os, glob
try:
import torch
torch_pkg = os.path.dirname(torch.__file__)
site_root = os.path.dirname(torch_pkg)
torch_libs = os.path.join(site_root, 'torch.libs')
print(glob.glob(os.path.join(torch_libs, 'libgomp-*.so*'))[0])
except:
print('')
"
"failed to probe torch.libs for libgomp")
if(_VLLM_TORCH_GOMP_PATH STREQUAL "" OR NOT EXISTS "${_VLLM_TORCH_GOMP_PATH}")
return()
endif()
# Create shim under the build tree
set(_shim "${CMAKE_BINARY_DIR}/gomp_shim")
file(MAKE_DIRECTORY "${_shim}")
execute_process(COMMAND ${CMAKE_COMMAND} -E rm -f "${_shim}/libgomp.so")
execute_process(COMMAND ${CMAKE_COMMAND} -E rm -f "${_shim}/libgomp.so.1")
execute_process(COMMAND ${CMAKE_COMMAND} -E create_symlink "${_VLLM_TORCH_GOMP_PATH}" "${_shim}/libgomp.so")
execute_process(COMMAND ${CMAKE_COMMAND} -E create_symlink "${_VLLM_TORCH_GOMP_PATH}" "${_shim}/libgomp.so.1")
set(${TORCH_GOMP_SHIM_DIR} "${_shim}" PARENT_SCOPE)
endfunction()
# Macro for converting a `gencode` version number to a cmake version number.
macro(string_to_ver OUT_VER IN_STR)
string(REGEX REPLACE "\([0-9]+\)\([0-9]\)" "\\1.\\2" ${OUT_VER} ${IN_STR})

12
codecov.yml Normal file
View File

@ -0,0 +1,12 @@
codecov:
require_ci_to_pass: false
fixes:
# Map source code paths to repository root paths
# Wildcards match any Python version (python3.*)
- "/vllm-workspace/src/vllm/::vllm/"
- "/vllm-workspace/vllm/::vllm/"
- "/usr/local/lib/python3.*/dist-packages/vllm/::vllm/"
- "/usr/local/lib/python3.*/site-packages/vllm/::vllm/"
- "/usr/lib/python3.*/dist-packages/vllm/::vllm/"
- "/usr/lib/python3.*/site-packages/vllm/::vllm/"

View File

@ -125,32 +125,37 @@ public:
}
static void set_split_kv (KernelArguments& args) {
// printf("set_split_kv start");
if (args.split_kv >= 1) return;
auto [H, K, D, B] = args.problem_shape;
// std::cout << H << " " << K << " " << D << " " << B << "\n";
int sm_count = args.hw_info.sm_count;
// printf(" sm_count = %d\n", sm_count);
int max_splits = ceil_div(K, 128);
max_splits = min(16, max_splits);
float seq_length_k = static_cast<float>(K) / 1024.0f;
int max_splits = 1;
// TODO: This avoids a hang when the batch size larger than 1 and
// there is more than 1 kv_splits.
// Discuss with NVIDIA how this can be fixed.
if (B > 1) {
max_splits = min(1, max_splits);
if (B <= 4 && seq_length_k >= 16) {
max_splits = 16;
}
// printf(" max_splits = %d\n", max_splits);
else if (B <= 8 && seq_length_k >= 4) {
max_splits = 8;
}
else if ((B <= 16 && seq_length_k >= 8) ||
(B == 48 && seq_length_k >= 32)) {
max_splits = 4;
}
else if ((B <= 32 && seq_length_k >= 16) ||
(B == 96 && seq_length_k >= 16)) {
max_splits = 2;
}
else {
max_splits = 1;
}
// Wave-aware scheduling: ensure integer number of waves in K dimension
int sms_per_batch = max(1, sm_count / B);
// printf(" sms_per_batch = %d\n", sms_per_batch);
int split_heur = min(max_splits, sms_per_batch);
int waves = ceil_div(B * split_heur, sm_count);
int k_waves = ceil_div(max_splits, split_heur);
int split_wave_aware = ceil_div(max_splits, k_waves);
args.split_kv = split_wave_aware;
// printf(" args.split_kv = %d\n", args.split_kv);
}
/// Determines whether the GEMM can execute the given problem.
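For reference, the selection logic above can be exercised on the host. A minimal standalone sketch of the same heuristic (illustrative only; the batch size, context length, and SM count below are made-up example values, and the unused `waves` term is omitted):

#include <algorithm>
#include <cstdio>

// Host-side sketch of the split-KV heuristic above (illustrative only).
static int pick_split_kv(int B, int K, int sm_count) {
  const float seq_length_k = static_cast<float>(K) / 1024.0f;
  int max_splits = 1;
  if (B <= 4 && seq_length_k >= 16) max_splits = 16;
  else if (B <= 8 && seq_length_k >= 4) max_splits = 8;
  else if ((B <= 16 && seq_length_k >= 8) || (B == 48 && seq_length_k >= 32)) max_splits = 4;
  else if ((B <= 32 && seq_length_k >= 16) || (B == 96 && seq_length_k >= 16)) max_splits = 2;

  // Wave-aware scheduling: keep an integer number of waves in the K dimension.
  const int sms_per_batch = std::max(1, sm_count / B);
  const int split_heur = std::min(max_splits, sms_per_batch);
  const int k_waves = (max_splits + split_heur - 1) / split_heur;
  return (max_splits + k_waves - 1) / k_waves;
}

int main() {
  // Example: batch 4, 32K context, on a hypothetical 132-SM device.
  std::printf("split_kv = %d\n", pick_split_kv(4, 32 * 1024, 132));
}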

View File

@ -5,11 +5,11 @@
namespace vllm {
// vllm_kernel_override_batch_invariant(); returns true
// if env VLLM_KERNEL_OVERRIDE_BATCH_INVARIANT=1
inline bool vllm_kernel_override_batch_invariant() {
// vllm_is_batch_invariant(); returns true
// if env VLLM_BATCH_INVARIANT=1
inline bool vllm_is_batch_invariant() {
static bool cached = []() {
std::string env_key = "VLLM_KERNEL_OVERRIDE_BATCH_INVARIANT";
std::string env_key = "VLLM_BATCH_INVARIANT";
const char* val = std::getenv(env_key.c_str());
return (val && std::atoi(val) != 0) ? 1 : 0;
}();
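Call sites gate the wide-vector launches on this flag (see the batch_invariant_launch checks further down). A minimal usage sketch, assuming the in-tree core/batch_invariant.hpp header; launch_vectorized/launch_scalar are hypothetical stand-ins for the real launch macros:

#include <cstdio>
#include "core/batch_invariant.hpp"  // provides vllm::vllm_is_batch_invariant()

// Hypothetical stand-ins for the real launch macros.
static void launch_vectorized() { std::puts("vectorized (width 8) launch"); }
static void launch_scalar()     { std::puts("scalar (width 0) launch"); }

void launch_rms_norm_like(bool ptrs_are_aligned, int hidden_size) {
  // With VLLM_BATCH_INVARIANT=1, fall back to the non-vectorized path,
  // mirroring the !batch_invariant_launch checks in the kernels below.
  if (ptrs_are_aligned && hidden_size % 8 == 0 &&
      !vllm::vllm_is_batch_invariant()) {
    launch_vectorized();
  } else {
    launch_scalar();
  }
}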

View File

@ -187,7 +187,8 @@ template <>
struct hash<MatMulPrimitiveHandler::ClassMatmulCacheKey> {
size_t operator()(
const MatMulPrimitiveHandler::ClassMatmulCacheKey& val) const {
return hash<dnnl_dim_t>()(val.b_n_size) ^ hash<dnnl_dim_t>()(val.b_k_size);
return hash<dnnl_dim_t>()(val.b_n_size) ^ hash<dnnl_dim_t>()(val.b_k_size) ^
hash<int>()(static_cast<int>(val.b_type));
}
};
@ -216,7 +217,8 @@ bool operator==(const W8A8MatMulPrimitiveHandler::MSizeCacheKey& l,
bool operator==(const MatMulPrimitiveHandler::ClassMatmulCacheKey& l,
const MatMulPrimitiveHandler::ClassMatmulCacheKey& r) {
return l.b_n_size == r.b_n_size && l.b_k_size == r.b_k_size;
return l.b_n_size == r.b_n_size && l.b_k_size == r.b_k_size &&
l.b_type == r.b_type;
}
bool operator==(const MatMulPrimitiveHandler::MSizeCacheKey& l,
@ -493,8 +495,10 @@ void MatMulPrimitiveHandler::execute(ExecArgs& args) {
dnnl::matmul MatMulPrimitiveHandler::get_matmul_cache(
const MSizeCacheKey& key) {
if (m_size_cache_.get() == nullptr) {
ClassMatmulCacheKey key = {.b_n_size = b_n_size_, .b_k_size = b_k_size_};
m_size_cache_ = get_matul_class_primitive_cache(key, primitive_cache_size_);
ClassMatmulCacheKey class_key = {
.b_n_size = b_n_size_, .b_k_size = b_k_size_, .b_type = b_type_};
m_size_cache_ =
get_matul_class_primitive_cache(class_key, primitive_cache_size_);
}
return m_size_cache_->get_or_create(key, [&]() {
dnnl::matmul::primitive_desc desc = this->create_primitive_desc(key, false);

View File

@ -199,6 +199,7 @@ class MatMulPrimitiveHandler : public DNNLMatMulPrimitiveHandler {
struct ClassMatmulCacheKey {
dnnl_dim_t b_n_size;
dnnl_dim_t b_k_size;
dnnl::memory::data_type b_type;
friend bool operator==(const ClassMatmulCacheKey& l,
const ClassMatmulCacheKey& r);
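Folding b_type into both the hash and the equality operator is what keeps weights of different dtypes but identical N/K shapes from colliding in the class-level primitive cache. A standalone illustration of the same keying pattern (a plain int stands in for dnnl::memory::data_type so it builds without oneDNN):

#include <cstdint>
#include <cstdio>
#include <functional>
#include <unordered_map>

// Simplified stand-in for ClassMatmulCacheKey.
struct Key {
  int64_t b_n_size;
  int64_t b_k_size;
  int b_type;  // placeholder for dnnl::memory::data_type
  bool operator==(const Key& o) const {
    return b_n_size == o.b_n_size && b_k_size == o.b_k_size && b_type == o.b_type;
  }
};

struct KeyHash {
  size_t operator()(const Key& k) const {
    return std::hash<int64_t>()(k.b_n_size) ^ std::hash<int64_t>()(k.b_k_size) ^
           std::hash<int>()(k.b_type);
  }
};

int main() {
  std::unordered_map<Key, const char*, KeyHash> cache;
  cache[{4096, 4096, /*bf16=*/1}] = "bf16 primitive";
  cache[{4096, 4096, /*fp16=*/2}] = "fp16 primitive";
  // Without b_type in the key, the second insert would overwrite the first.
  std::printf("%zu distinct entries\n", cache.size());
}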

View File

@ -2,6 +2,7 @@
#include "dispatch_utils.h"
#include "cub_helpers.h"
#include "core/batch_invariant.hpp"
#include "quantization/vectorization_utils.cuh"
#include <torch/cuda.h>
#include <c10/cuda/CUDAGuard.h>
@ -18,11 +19,22 @@ __global__ void rms_norm_kernel(
const float epsilon, const int num_tokens, const int hidden_size) {
__shared__ float s_variance;
float variance = 0.0f;
const scalar_t* input_row = input + blockIdx.x * input_stride;
for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
const float x = (float)input[blockIdx.x * input_stride + idx];
constexpr int VEC_SIZE = 8;
auto vec_op = [&variance](const vec_n_t<scalar_t, VEC_SIZE>& vec) {
#pragma unroll
for (int i = 0; i < VEC_SIZE; ++i) {
float x = static_cast<float>(vec.val[i]);
variance += x * x;
}
};
auto scalar_op = [&variance](const scalar_t& val) {
float x = static_cast<float>(val);
variance += x * x;
}
};
vllm::vectorize_read_with_alignment<VEC_SIZE>(
input_row, hidden_size, threadIdx.x, blockDim.x, vec_op, scalar_op);
using BlockReduce = cub::BlockReduce<float, 1024>;
__shared__ typename BlockReduce::TempStorage reduceStore;
@ -136,211 +148,6 @@ fused_add_rms_norm_kernel(
}
}
/* Function specialization in the case of FP16/BF16 tensors.
Additional optimizations we can make in this case are
packed and vectorized operations, which help with the
memory latency bottleneck.
_f16VecPN struct extends _f16Vec to add operations specifically required for
polynomial normalization (poly norm).
The original _f16Vec does not include the sum-of-powers computation or
in-place polynomial normalization logic. */
template <typename scalar_t, int width>
struct alignas(16) _f16VecPN : _f16Vec<scalar_t, width> {
using Base = _f16Vec<scalar_t, width>;
using Converter = typename Base::Converter;
using T1 = typename Base::T1;
using T2 = typename Base::T2;
using Base::data;
__device__ auto sum_pows() const {
float s2 = 0.0f, s4 = 0.0f, s6 = 0.0f;
#pragma unroll
for (int i = 0; i < width; i += 2) {
float2 z = Converter::convert(T2{data[i], data[i + 1]});
float x2 = z.x * z.x;
float x4 = x2 * x2;
float x6 = x4 * x2;
float y2 = z.y * z.y;
float y4 = y2 * y2;
float y6 = y4 * y2;
s2 += x2 + y2;
s4 += x4 + y4;
s6 += x6 + y6;
}
return std::make_tuple(s2, s4, s6);
}
__device__ void poly_norm_inplace(const float w2_inv_std,
const float w1_inv_std2,
const float w0_inv_std3, const float bias) {
#pragma unroll
for (int i = 0; i < width; i += 2) {
float2 z = Converter::convert(T2{data[i], data[i + 1]});
float x2 = z.x * z.x;
float x3 = x2 * z.x;
z.x = w2_inv_std * z.x + w1_inv_std2 * x2 + w0_inv_std3 * x3 + bias;
float y2 = z.y * z.y;
float y3 = y2 * z.y;
z.y = w2_inv_std * z.y + w1_inv_std2 * y2 + w0_inv_std3 * y3 + bias;
auto out = Converter::convert(z);
data[i] = out.x;
data[i + 1] = out.y;
}
}
};
template <typename scalar_t, int width>
__global__ std::enable_if_t<(width > 0) && _typeConvert<scalar_t>::exists>
poly_norm_kernel(scalar_t* __restrict__ out, // [..., hidden_size]
const scalar_t* __restrict__ input, // [..., hidden_size]
const scalar_t* __restrict__ weight, // [3]
const scalar_t* __restrict__ bias, // [1]
const float epsilon, const int hidden_size) {
// Sanity checks on our vector struct and type-punned pointer arithmetic
static_assert(std::is_pod_v<_f16VecPN<scalar_t, width>>);
static_assert(sizeof(_f16VecPN<scalar_t, width>) == sizeof(scalar_t) * width);
/* These and the argument pointers are all declared `restrict` as they are
not aliased in practice. Argument pointers should not be dereferenced
in this kernel as that would be undefined behavior */
auto* __restrict__ input_v =
reinterpret_cast<const _f16VecPN<scalar_t, width>*>(input);
const int vec_hidden_size = hidden_size / width;
float variance = 0.0f;
float variance2 = 0.0f;
float variance3 = 0.0f;
for (int idx = threadIdx.x; idx < vec_hidden_size; idx += blockDim.x) {
int id = blockIdx.x * vec_hidden_size + idx;
_f16VecPN<scalar_t, width> temp = input_v[id];
auto [x2, x4, x6] = temp.sum_pows();
variance += x2;
variance2 += x4;
variance3 += x6;
}
float3 thread_variances = make_float3(variance, variance2, variance3);
struct SumOp {
__device__ float3 operator()(const float3& a, const float3& b) const {
return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
}
};
using BlockReduce = cub::BlockReduce<float3, 1024>;
__shared__ typename BlockReduce::TempStorage reduceStore;
float3 block_variances =
BlockReduce(reduceStore).Reduce(thread_variances, SumOp{}, blockDim.x);
variance = block_variances.x;
variance2 = block_variances.y;
variance3 = block_variances.z;
__shared__ float s_w2_inv_std;
__shared__ float s_w1_inv_std2;
__shared__ float s_w0_inv_std3;
__shared__ float s_bias;
if (threadIdx.x == 0) {
float w0 = (float)weight[0];
float w1 = (float)weight[1];
float w2 = (float)weight[2];
s_bias = (float)bias[0];
s_w2_inv_std = w2 * rsqrtf(variance / hidden_size + epsilon);
s_w1_inv_std2 = w1 * rsqrtf(variance2 / hidden_size + epsilon);
s_w0_inv_std3 = w0 * rsqrtf(variance3 / hidden_size + epsilon);
}
__syncthreads();
auto* __restrict__ out_v = reinterpret_cast<_f16VecPN<scalar_t, width>*>(out);
for (int idx = threadIdx.x; idx < vec_hidden_size; idx += blockDim.x) {
int id = blockIdx.x * vec_hidden_size + idx;
_f16VecPN<scalar_t, width> temp = input_v[id];
temp.poly_norm_inplace(s_w2_inv_std, s_w1_inv_std2, s_w0_inv_std3, s_bias);
out_v[id] = temp;
}
}
/* Generic poly_norm_kernel
The width field is not used here but necessary for other specializations.
*/
template <typename scalar_t, int width>
__global__ std::enable_if_t<(width == 0) || !_typeConvert<scalar_t>::exists>
poly_norm_kernel(scalar_t* __restrict__ out, // [..., hidden_size]
const scalar_t* __restrict__ input, // [..., hidden_size]
const scalar_t* __restrict__ weight, // [3]
const scalar_t* __restrict__ bias, // [1]
const float epsilon, const int hidden_size) {
float variance = 0.0f;
float variance2 = 0.0f;
float variance3 = 0.0f;
for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
float x = (float)input[blockIdx.x * hidden_size + idx];
float x2 = x * x;
float x4 = x2 * x2;
float x6 = x4 * x2;
variance += x2;
variance2 += x4;
variance3 += x6;
}
float3 thread_variances = make_float3(variance, variance2, variance3);
struct SumOp {
__device__ float3 operator()(const float3& a, const float3& b) const {
return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
}
};
using BlockReduce = cub::BlockReduce<float3, 1024>;
__shared__ typename BlockReduce::TempStorage reduceStore;
float3 block_variances =
BlockReduce(reduceStore).Reduce(thread_variances, SumOp{}, blockDim.x);
variance = block_variances.x;
variance2 = block_variances.y;
variance3 = block_variances.z;
__shared__ float s_w2_inv_std;
__shared__ float s_w1_inv_std2;
__shared__ float s_w0_inv_std3;
__shared__ float s_bias;
if (threadIdx.x == 0) {
float w0 = (float)weight[0];
float w1 = (float)weight[1];
float w2 = (float)weight[2];
s_bias = (float)bias[0];
s_w2_inv_std = w2 * rsqrtf(variance / hidden_size + epsilon);
s_w1_inv_std2 = w1 * rsqrtf(variance2 / hidden_size + epsilon);
s_w0_inv_std3 = w0 * rsqrtf(variance3 / hidden_size + epsilon);
}
__syncthreads();
for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
float x = (float)input[blockIdx.x * hidden_size + idx];
float x2 = x * x;
float x3 = x2 * x;
out[blockIdx.x * hidden_size + idx] =
(scalar_t)(x * s_w2_inv_std + x2 * s_w1_inv_std2 + x3 * s_w0_inv_std3 +
s_bias);
}
}
} // namespace vllm
void rms_norm(torch::Tensor& out, // [..., hidden_size]
@ -352,18 +159,26 @@ void rms_norm(torch::Tensor& out, // [..., hidden_size]
TORCH_CHECK(weight.is_contiguous());
int hidden_size = input.size(-1);
int num_tokens = input.numel() / hidden_size;
int64_t input_stride = input.stride(-2);
// We cannot just use `input.stride(-2)` if the tensor is not row-major.
// Instead, we use a 2d view to get the second-innermost stride.
// That way the dimensions (except the last one) can be arbitrarily permuted.
torch::Tensor input_view = input.view({-1, hidden_size});
int num_tokens = input_view.numel() / hidden_size;
int64_t input_stride = input_view.stride(-2);
dim3 grid(num_tokens);
dim3 block(std::min(hidden_size, 1024));
const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
const at::cuda::OptionalCUDAGuard device_guard(device_of(input_view));
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "rms_norm_kernel", [&] {
vllm::rms_norm_kernel<scalar_t><<<grid, block, 0, stream>>>(
out.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(), input_stride,
weight.data_ptr<scalar_t>(), epsilon, num_tokens, hidden_size);
});
VLLM_DISPATCH_FLOATING_TYPES(
input_view.scalar_type(), "rms_norm_kernel", [&] {
vllm::rms_norm_kernel<scalar_t><<<grid, block, 0, stream>>>(
out.data_ptr<scalar_t>(), input_view.data_ptr<scalar_t>(),
input_stride, weight.data_ptr<scalar_t>(), epsilon, num_tokens,
hidden_size);
});
}
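The flattening above can be sanity-checked from host code. A minimal libtorch sketch (CPU tensors, illustrative shapes) showing how num_tokens and the second-innermost stride are read off the 2-D view:

#include <torch/torch.h>
#include <iostream>

int main() {
  // Arbitrary leading dimensions, hidden_size last.
  const int64_t hidden_size = 64;
  torch::Tensor input = torch::randn({2, 3, hidden_size});

  // Same idea as the host code above: flatten everything but the last dim.
  torch::Tensor input_view = input.view({-1, hidden_size});
  const int64_t num_tokens = input_view.numel() / hidden_size;
  const int64_t input_stride = input_view.stride(-2);

  std::cout << "num_tokens=" << num_tokens        // 6
            << " input_stride=" << input_stride   // 64 for a contiguous tensor
            << std::endl;
}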
#define LAUNCH_FUSED_ADD_RMS_NORM(width) \
@ -380,6 +195,8 @@ void fused_add_rms_norm(torch::Tensor& input, // [..., hidden_size]
torch::Tensor& residual, // [..., hidden_size]
torch::Tensor& weight, // [hidden_size]
double epsilon) {
TORCH_CHECK(weight.scalar_type() == input.scalar_type());
TORCH_CHECK(input.scalar_type() == residual.scalar_type());
TORCH_CHECK(residual.is_contiguous());
TORCH_CHECK(weight.is_contiguous());
int hidden_size = input.size(-1);
@ -414,7 +231,7 @@ void fused_add_rms_norm(torch::Tensor& input, // [..., hidden_size]
wt_ptr % req_alignment_bytes == 0;
bool offsets_are_multiple_of_vector_width =
hidden_size % vector_width == 0 && input_stride % vector_width == 0;
bool batch_invariant_launch = vllm::vllm_kernel_override_batch_invariant();
bool batch_invariant_launch = vllm::vllm_is_batch_invariant();
if (ptrs_are_aligned && offsets_are_multiple_of_vector_width &&
!batch_invariant_launch) {
LAUNCH_FUSED_ADD_RMS_NORM(8);
@ -422,50 +239,3 @@ void fused_add_rms_norm(torch::Tensor& input, // [..., hidden_size]
LAUNCH_FUSED_ADD_RMS_NORM(0);
}
}
#define LAUNCH_FUSED_POLY_NORM(width) \
VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "poly_norm_kernel", [&] { \
vllm::poly_norm_kernel<scalar_t, width><<<grid, block, 0, stream>>>( \
out.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(), \
weight.data_ptr<scalar_t>(), bias.data_ptr<scalar_t>(), epsilon, \
hidden_size); \
});
void poly_norm(torch::Tensor& out, // [..., hidden_size]
torch::Tensor& input, // [..., hidden_size]
torch::Tensor& weight, // [3]
torch::Tensor& bias, // [1]
double epsilon) {
TORCH_CHECK(out.is_contiguous());
TORCH_CHECK(input.is_contiguous());
TORCH_CHECK(out.data_ptr() != input.data_ptr());
int hidden_size = input.size(-1);
int num_tokens = input.numel() / hidden_size;
dim3 grid(num_tokens);
/* This kernel is memory-latency bound in many scenarios.
When num_tokens is large, a smaller block size allows
for increased block occupancy on CUs and better latency
hiding on global mem ops. */
const int max_block_size = (num_tokens < 256) ? 1024 : 256;
dim3 block(std::min(hidden_size, max_block_size));
const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
/*If the tensor types are FP16/BF16, try to use the optimized kernel
with packed + vectorized ops.
Max optimization is achieved with a width-8 vector of FP16/BF16s
since we can load at most 128 bits at once in a global memory op.
However, this requires each tensor's data to be aligned to 16
bytes.
*/
auto inp_ptr = reinterpret_cast<std::uintptr_t>(input.data_ptr());
auto out_ptr = reinterpret_cast<std::uintptr_t>(out.data_ptr());
bool ptrs_are_aligned = inp_ptr % 16 == 0 && out_ptr % 16 == 0;
bool batch_invariant_launch = vllm::vllm_kernel_override_batch_invariant();
if (ptrs_are_aligned && hidden_size % 8 == 0 && !batch_invariant_launch) {
LAUNCH_FUSED_POLY_NORM(8);
} else {
LAUNCH_FUSED_POLY_NORM(0);
}
}

View File

@ -10,6 +10,7 @@
#include "dispatch_utils.h"
#include "cub_helpers.h"
#include "core/batch_invariant.hpp"
#include "quantization/vectorization_utils.cuh"
#include <torch/cuda.h>
#include <c10/cuda/CUDAGuard.h>
@ -28,10 +29,22 @@ __global__ void rms_norm_static_fp8_quant_kernel(
__shared__ float s_variance;
float variance = 0.0f;
for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
const float x = (float)input[blockIdx.x * input_stride + idx];
const scalar_t* input_row = input + blockIdx.x * input_stride;
constexpr int VEC_SIZE = 8;
auto vec_op = [&variance](const vec_n_t<scalar_t, VEC_SIZE>& vec) {
#pragma unroll
for (int i = 0; i < VEC_SIZE; ++i) {
float x = static_cast<float>(vec.val[i]);
variance += x * x;
}
};
auto scalar_op = [&variance](const scalar_t& val) {
float x = static_cast<float>(val);
variance += x * x;
}
};
vllm::vectorize_read_with_alignment<VEC_SIZE>(
input_row, hidden_size, threadIdx.x, blockDim.x, vec_op, scalar_op);
using BlockReduce = cub::BlockReduce<float, 1024>;
__shared__ typename BlockReduce::TempStorage reduceStore;
@ -216,6 +229,8 @@ void fused_add_rms_norm_static_fp8_quant(
double epsilon) {
TORCH_CHECK(out.is_contiguous());
TORCH_CHECK(residual.is_contiguous());
TORCH_CHECK(residual.scalar_type() == input.scalar_type());
TORCH_CHECK(weight.scalar_type() == input.scalar_type());
int hidden_size = input.size(-1);
int input_stride = input.stride(-2);
int num_tokens = input.numel() / hidden_size;
@ -241,7 +256,7 @@ void fused_add_rms_norm_static_fp8_quant(
auto wt_ptr = reinterpret_cast<std::uintptr_t>(weight.data_ptr());
bool ptrs_are_aligned =
inp_ptr % 16 == 0 && res_ptr % 16 == 0 && wt_ptr % 16 == 0;
bool batch_invariant_launch = vllm::vllm_kernel_override_batch_invariant();
bool batch_invariant_launch = vllm::vllm_is_batch_invariant();
if (ptrs_are_aligned && hidden_size % 8 == 0 && input_stride % 8 == 0 &&
!batch_invariant_launch) {
LAUNCH_FUSED_ADD_RMS_NORM(8);
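vectorize_read_with_alignment splits each row into aligned VEC_SIZE-wide reads plus a scalar tail. A CPU-side analogue of that accumulation pattern (plain C++, no CUDA; the width of 8 mirrors the kernel's choice):

#include <cstdio>
#include <vector>

// CPU analogue of the vec_op / scalar_op split used above: the bulk of the
// row is consumed in fixed-width chunks, the ragged tail element-by-element.
float sum_of_squares(const float* row, int hidden_size) {
  constexpr int VEC_SIZE = 8;
  float acc = 0.0f;
  int i = 0;
  for (; i + VEC_SIZE <= hidden_size; i += VEC_SIZE) {   // "vec_op"
    for (int j = 0; j < VEC_SIZE; ++j) {
      const float x = row[i + j];
      acc += x * x;
    }
  }
  for (; i < hidden_size; ++i) {                         // "scalar_op" tail
    acc += row[i] * row[i];
  }
  return acc;
}

int main() {
  std::vector<float> row(37, 0.5f);  // deliberately not a multiple of 8
  std::printf("variance numerator = %f\n", sum_of_squares(row.data(), 37));
}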

View File

@ -8,12 +8,77 @@
#include "../cuda_compat.h"
#include "../dispatch_utils.h"
#include "core/math.hpp"
#define CEILDIV(x, y) (((x) + (y) - 1) / (y))
namespace vllm {
namespace moe {
namespace batched_moe_align_block_size {
// Note num_threads needs to be 1024 for BlockScan Reduction in the kernel.
static constexpr int32_t num_threads = 1024;
static constexpr int32_t num_blocks = 1;
__global__ void batched_moe_align_block_size_kernel(
int32_t const num_batches, int32_t const max_tokens_per_batch,
int32_t const block_size, int32_t const* __restrict__ batch_num_tokens,
int32_t* __restrict__ sorted_ids, int32_t* __restrict__ block_ids,
int32_t* __restrict__ num_tokens_post_pad) {
// TODO(varun): This is a naive implementation. Could be optimized.
size_t const batch_id = threadIdx.x;
size_t const stride = blockDim.x * gridDim.x;
int32_t const num_blocks_per_batch =
CEILDIV(max_tokens_per_batch, block_size);
int32_t const sorted_ids_size =
num_blocks_per_batch * num_batches * block_size;
int32_t const block_ids_size = sorted_ids_size / block_size;
int32_t const SENTINEL =
num_batches * max_tokens_per_batch; // To denote invalid entries.
// Initialize sorted_ids
for (size_t i = threadIdx.x; i < sorted_ids_size; i += stride) {
sorted_ids[i] = SENTINEL;
}
// Initialize expert_ids with -1
for (size_t i = threadIdx.x; i < block_ids_size; i += stride) {
block_ids[i] = -1;
}
int32_t b_num_tokens = 0;
if (batch_id < num_batches) {
b_num_tokens = batch_num_tokens[batch_id];
}
int32_t const ceil_b_num_tokens =
CEILDIV(b_num_tokens, block_size) * block_size;
// Compute an exclusive prefix sum over the block-aligned token counts per batch
using BlockScan = cub::BlockScan<int32_t, 1024>;
__shared__ typename BlockScan::TempStorage temp_storage;
int cumsum_val;
BlockScan(temp_storage).ExclusiveSum(ceil_b_num_tokens, cumsum_val);
__syncthreads();
bool const is_last_batch = batch_id == (num_batches - 1);
if (is_last_batch) {
*num_tokens_post_pad = cumsum_val + ceil_b_num_tokens;
}
if (batch_id < num_batches) {
int32_t const batch_offset = batch_id * max_tokens_per_batch;
for (size_t i = 0; i < b_num_tokens; ++i) {
sorted_ids[cumsum_val + i] = batch_offset + i;
}
int32_t const block_start = cumsum_val / block_size;
int32_t const num_blocks = ceil_b_num_tokens / block_size;
for (size_t i = 0; i < num_blocks; ++i) {
block_ids[block_start + i] = batch_id;
}
}
}
} // namespace batched_moe_align_block_size
template <typename scalar_t>
__global__ void moe_align_block_size_kernel(
const scalar_t* __restrict__ topk_ids,
@ -280,6 +345,33 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
});
}
void batched_moe_align_block_size(int64_t max_tokens_per_batch,
int64_t block_size,
torch::Tensor const& batch_num_tokens,
torch::Tensor sorted_ids,
torch::Tensor batch_ids,
torch::Tensor num_tokens_post_pad) {
namespace batched_kernel = vllm::moe::batched_moe_align_block_size;
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
int32_t const B = batch_num_tokens.size(0);
int32_t const num_blocks_per_batch =
round_to_next_multiple_of(max_tokens_per_batch, block_size) / block_size;
int32_t const num_blocks = num_blocks_per_batch * B;
int64_t const sorted_ids_size = num_blocks * block_size;
TORCH_CHECK(sorted_ids.size(0) == sorted_ids_size);
TORCH_CHECK(batch_ids.size(0) == sorted_ids_size / block_size);
TORCH_CHECK(num_tokens_post_pad.size(0) == 1);
TORCH_CHECK(B <= batched_kernel::num_threads);
batched_kernel::batched_moe_align_block_size_kernel<<<
batched_kernel::num_blocks, batched_kernel::num_threads, 0, stream>>>(
B, max_tokens_per_batch, block_size, batch_num_tokens.data_ptr<int32_t>(),
sorted_ids.data_ptr<int32_t>(), batch_ids.data_ptr<int32_t>(),
num_tokens_post_pad.data_ptr<int32_t>());
}
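A host reference of the alignment performed by the kernel makes the tensor-size checks above easier to follow. A minimal sketch (plain C++, made-up batch sizes) mirroring the padding and exclusive prefix sum:

#include <cstdio>
#include <vector>

// Host reference of the batched alignment above (illustrative only).
void batched_align_reference(int max_tokens_per_batch, int block_size,
                             const std::vector<int>& batch_num_tokens) {
  const int num_batches = static_cast<int>(batch_num_tokens.size());
  const int blocks_per_batch =
      (max_tokens_per_batch + block_size - 1) / block_size;
  const int sorted_ids_size = blocks_per_batch * num_batches * block_size;
  const int sentinel = num_batches * max_tokens_per_batch;  // invalid entry

  std::vector<int> sorted_ids(sorted_ids_size, sentinel);
  std::vector<int> block_ids(sorted_ids_size / block_size, -1);

  int cumsum = 0;  // exclusive prefix sum of block-aligned token counts
  for (int b = 0; b < num_batches; ++b) {
    const int n = batch_num_tokens[b];
    const int aligned = (n + block_size - 1) / block_size * block_size;
    for (int i = 0; i < n; ++i) sorted_ids[cumsum + i] = b * max_tokens_per_batch + i;
    for (int i = 0; i < aligned / block_size; ++i) block_ids[cumsum / block_size + i] = b;
    cumsum += aligned;
  }
  std::printf("num_tokens_post_pad = %d\n", cumsum);
}

int main() {
  // Example: 3 batches, capacity 8 tokens each, block size 4 (made-up values).
  batched_align_reference(/*max_tokens_per_batch=*/8, /*block_size=*/4, {5, 0, 8});
}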
void moe_sum(torch::Tensor& input, // [num_tokens, topk, hidden_size]
torch::Tensor& output) // [num_tokens, hidden_size]
{

View File

@ -0,0 +1,169 @@
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <torch/all.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <ATen/ATen.h>
#include <ATen/cuda/Atomic.cuh>
#include "../cuda_compat.h"
#include "../dispatch_utils.h"
#include "core/math.hpp"
namespace {
__device__ __forceinline__ int32_t index(int32_t total_col, int32_t row,
int32_t col) {
return row * total_col + col;
}
} // namespace
// TODO: Refactor common parts with moe_align_sum_kernels
template <typename scalar_t, typename token_cnts_t>
__global__ void moe_lora_align_sum_kernel(
scalar_t* __restrict__ topk_ids, int32_t* token_lora_mapping,
int64_t block_size, int num_experts, int max_loras, size_t numel,
int max_num_tokens_padded, int max_num_m_blocks,
int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids,
int topk_num, int32_t* total_tokens_post_pad) {
const size_t tokens_per_thread = div_ceil(numel, blockDim.x);
const size_t start_idx = threadIdx.x * tokens_per_thread;
int lora_id = blockIdx.x;
extern __shared__ int32_t shared_mem[];
int32_t* cumsum = shared_mem;
token_cnts_t* tokens_cnts = (token_cnts_t*)(shared_mem + num_experts + 1);
// Initialize sorted_token_ids with numel
for (size_t it = threadIdx.x; it < max_num_tokens_padded; it += blockDim.x) {
sorted_token_ids[lora_id * max_num_tokens_padded + it] = numel;
}
// Initialize expert_ids with -1
for (size_t it = threadIdx.x; it < max_num_m_blocks; it += blockDim.x) {
expert_ids[lora_id * max_num_m_blocks + it] = -1;
}
// Initialize total_tokens_post_pad with 0
if (threadIdx.x == 0) {
total_tokens_post_pad[lora_id] = 0;
}
for (int i = 0; i < num_experts; ++i) {
tokens_cnts[index(num_experts, threadIdx.x + 1, i)] = 0;
}
for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) {
int mask = token_lora_mapping[i / topk_num] == lora_id;
int idx = index(num_experts, threadIdx.x + 1, topk_ids[i]);
tokens_cnts[idx] += mask;
}
__syncthreads();
// For each expert we accumulate the token counts from the different threads.
if (threadIdx.x < num_experts) {
tokens_cnts[index(num_experts, 0, threadIdx.x)] = 0;
for (int i = 1; i <= blockDim.x; ++i) {
tokens_cnts[index(num_experts, i, threadIdx.x)] +=
tokens_cnts[index(num_experts, i - 1, threadIdx.x)];
}
}
__syncthreads();
// We accumulate the token counts of all experts in thread 0.
if (threadIdx.x == 0) {
cumsum[0] = 0;
for (int i = 1; i <= num_experts; ++i) {
cumsum[i] = cumsum[i - 1] +
div_ceil(tokens_cnts[index(num_experts, blockDim.x, i - 1)],
block_size) *
block_size;
}
total_tokens_post_pad[lora_id] = static_cast<int32_t>(cumsum[num_experts]);
}
__syncthreads();
/**
* For each expert, each thread processes the tokens of the corresponding
* blocks and stores the corresponding expert_id for each block.
*/
if (threadIdx.x < num_experts) {
for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1];
i += block_size) {
expert_ids[index(max_num_m_blocks, lora_id, i / block_size)] =
threadIdx.x;
}
}
for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) {
int32_t expert_id = topk_ids[i];
/** The cumsum[expert_id] stores the starting index of the tokens that the
* expert with expert_id needs to process, and
* tokens_cnts[threadIdx.x][expert_id] stores the indices of the tokens
* processed by the expert with expert_id within the current thread's token
* shard.
*/
int32_t rank_post_pad =
tokens_cnts[index(num_experts, threadIdx.x, expert_id)] +
cumsum[expert_id];
int mask = (int)token_lora_mapping[i / topk_num] == lora_id;
atomicAdd(
&sorted_token_ids[index(max_num_tokens_padded, lora_id, rank_post_pad)],
(i - numel) * mask);
tokens_cnts[index(num_experts, threadIdx.x, expert_id)] += mask;
}
}
void moe_lora_align_block_size(torch::Tensor topk_ids,
torch::Tensor token_lora_mapping,
int64_t num_experts, int64_t block_size,
int64_t max_loras, int64_t max_num_tokens_padded,
int64_t max_num_m_blocks,
torch::Tensor sorted_token_ids,
torch::Tensor expert_ids,
torch::Tensor num_tokens_post_pad) {
const int topk_num = topk_ids.size(1);
TORCH_CHECK(block_size > 0, "block_size should be greater than 0. ");
int device_max_shared_mem;
auto dev = topk_ids.get_device();
cudaDeviceGetAttribute(&device_max_shared_mem,
cudaDevAttrMaxSharedMemoryPerBlockOptin, dev);
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
const int32_t num_thread = max((int32_t)num_experts, 128); // WARP_SIZE,
TORCH_CHECK(num_thread <= 1024,
"num_thread must be less than 1024, "
"and fallback is not implemented yet.");
const int32_t shared_mem = (num_thread + 1) * num_experts * sizeof(int32_t) +
(num_experts + 1) * sizeof(int32_t);
if (shared_mem > device_max_shared_mem) {
TORCH_CHECK(false,
"Shared memory usage exceeds device limit, and global memory "
"fallback is not implemented yet.");
}
VLLM_DISPATCH_INTEGRAL_TYPES(
topk_ids.scalar_type(), "moe_lora_align_sum_kernel", [&] {
dim3 blockDim(num_thread);
auto kernel = moe_lora_align_sum_kernel<scalar_t, int32_t>;
AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(
(void*)kernel, shared_mem));
kernel<<<max_loras, blockDim, shared_mem, stream>>>(
topk_ids.data_ptr<scalar_t>(),
token_lora_mapping.data_ptr<int32_t>(), block_size, num_experts,
max_loras, topk_ids.numel(), max_num_tokens_padded,
max_num_m_blocks, sorted_token_ids.data_ptr<int32_t>(),
expert_ids.data_ptr<int32_t>(), topk_num,
num_tokens_post_pad.data_ptr<int32_t>());
});
}
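Per LoRA adapter, the kernel builds the same expert histogram and block-aligned offsets as moe_align_block_size, but only over tokens mapped to that adapter. A small host sketch of the masked counting (plain C++, hypothetical token/expert assignments):

#include <cstdio>
#include <vector>

// Host sketch: per-LoRA, per-expert token counts and block-aligned offsets.
// topk_ids is [num_tokens * topk], token_lora_mapping is [num_tokens].
void lora_expert_offsets(const std::vector<int>& topk_ids,
                         const std::vector<int>& token_lora_mapping,
                         int topk, int num_experts, int block_size, int lora_id) {
  std::vector<int> counts(num_experts, 0);
  for (size_t i = 0; i < topk_ids.size(); ++i) {
    // Only count (token, expert) pairs whose token belongs to this adapter.
    if (token_lora_mapping[i / topk] == lora_id) counts[topk_ids[i]]++;
  }
  std::vector<int> cumsum(num_experts + 1, 0);
  for (int e = 0; e < num_experts; ++e) {
    const int aligned = (counts[e] + block_size - 1) / block_size * block_size;
    cumsum[e + 1] = cumsum[e] + aligned;
  }
  std::printf("lora %d: total_tokens_post_pad = %d\n", lora_id, cumsum[num_experts]);
}

int main() {
  // 4 tokens, topk=2, 4 experts, block_size=4; tokens 0 and 2 use LoRA 0.
  lora_expert_offsets({0, 1, 2, 3, 0, 2, 1, 1}, {0, 1, 0, 1},
                      /*topk=*/2, /*num_experts=*/4, /*block_size=*/4, /*lora_id=*/0);
}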

View File

@ -4,7 +4,7 @@
void topk_softmax(torch::Tensor& topk_weights, torch::Tensor& topk_indices,
torch::Tensor& token_expert_indices,
torch::Tensor& gating_output);
torch::Tensor& gating_output, bool renormalize);
void moe_sum(torch::Tensor& input, torch::Tensor& output);
@ -12,6 +12,22 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
int64_t block_size, torch::Tensor sorted_token_ids,
torch::Tensor experts_ids,
torch::Tensor num_tokens_post_pad);
void batched_moe_align_block_size(int64_t max_tokens_per_batch,
int64_t block_size,
torch::Tensor const& expert_num_tokens,
torch::Tensor sorted_ids,
torch::Tensor expert_ids,
torch::Tensor num_tokens_post_pad);
void moe_lora_align_block_size(torch::Tensor topk_ids,
torch::Tensor token_lora_mapping,
int64_t num_experts, int64_t block_size,
int64_t max_loras, int64_t max_num_tokens_padded,
int64_t max_num_m_blocks,
torch::Tensor sorted_token_ids,
torch::Tensor expert_ids,
torch::Tensor num_tokens_post_pad);
#ifndef USE_ROCM
torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output,
torch::Tensor b_qweight, torch::Tensor b_scales,

View File

@ -16,12 +16,23 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <type_traits>
#include <torch/all.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include "../cuda_compat.h"
#include "../cub_helpers.h"
#ifndef USE_ROCM
#include <cuda_bf16.h>
#include <cuda_fp16.h>
#else
#include <hip/hip_bf16.h>
#include <hip/hip_fp16.h>
typedef __hip_bfloat16 __nv_bfloat16;
typedef __hip_bfloat162 __nv_bfloat162;
#endif
#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define MIN(a, b) ((a) < (b) ? (a) : (b))
@ -36,16 +47,27 @@ template <
/// Alignment requirement in bytes
int Alignment = sizeof(T) * N
>
class alignas(Alignment) AlignedArray {
float data[N];
struct alignas(Alignment) AlignedArray {
T data[N];
};
template <typename T>
__device__ __forceinline__ float toFloat(T value) {
if constexpr (std::is_same_v<T, float>) {
return value;
} else if constexpr (std::is_same_v<T, __nv_bfloat16>) {
return __bfloat162float(value);
} else if constexpr (std::is_same_v<T, __half>) {
return __half2float(value);
}
}
// ====================== Softmax things ===============================
// We have our own implementation of softmax here so we can support transposing the output
// in the softmax kernel when we extend this module to support expert-choice routing.
template <int TPB>
template <int TPB, typename InputType>
__launch_bounds__(TPB) __global__
void moeSoftmax(const float* input, const bool* finished, float* output, const int num_cols)
void moeSoftmax(const InputType* input, const bool* finished, float* output, const int num_cols)
{
using BlockReduce = cub::BlockReduce<float, TPB>;
__shared__ typename BlockReduce::TempStorage tmpStorage;
@ -66,7 +88,8 @@ __launch_bounds__(TPB) __global__
for (int ii = threadIdx.x; ii < num_cols; ii += TPB)
{
const int idx = thread_row_offset + ii;
threadData = max(static_cast<float>(input[idx]), threadData);
const float val = toFloat(input[idx]);
threadData = max(val, threadData);
}
const float maxElem = BlockReduce(tmpStorage).Reduce(threadData, CubMaxOp());
@ -81,7 +104,8 @@ __launch_bounds__(TPB) __global__
for (int ii = threadIdx.x; ii < num_cols; ii += TPB)
{
const int idx = thread_row_offset + ii;
threadData += exp((static_cast<float>(input[idx]) - float_max));
const float val = toFloat(input[idx]);
threadData += expf(val - float_max);
}
const auto Z = BlockReduce(tmpStorage).Reduce(threadData, CubAddOp());
@ -95,8 +119,9 @@ __launch_bounds__(TPB) __global__
for (int ii = threadIdx.x; ii < num_cols; ii += TPB)
{
const int idx = thread_row_offset + ii;
const float val = exp((static_cast<float>(input[idx]) - float_max)) * normalizing_factor;
output[idx] = val;
const float val = toFloat(input[idx]);
const float softmax_val = expf(val - float_max) * normalizing_factor;
output[idx] = softmax_val;
}
}
@ -110,7 +135,8 @@ __launch_bounds__(TPB) __global__ void moeTopK(
const int num_experts,
const int k,
const int start_expert,
const int end_expert)
const int end_expert,
const bool renormalize)
{
using cub_kvp = cub::KeyValuePair<int, float>;
@ -125,6 +151,7 @@ __launch_bounds__(TPB) __global__ void moeTopK(
const bool row_is_active = finished ? !finished[block_row] : true;
const int thread_read_offset = blockIdx.x * num_experts;
float selected_sum = 0.f;
for (int k_idx = 0; k_idx < k; ++k_idx)
{
thread_kvp.key = 0;
@ -163,9 +190,23 @@ __launch_bounds__(TPB) __global__ void moeTopK(
indices[idx] = should_process_row ? (expert - start_expert) : num_experts;
assert(indices[idx] >= 0);
source_rows[idx] = k_idx * num_rows + block_row;
if (renormalize) {
selected_sum += result_kvp.value;
}
}
__syncthreads();
}
// Renormalize the k weights for this row to sum to 1, if requested.
if (renormalize) {
if (threadIdx.x == 0) {
const float denom = selected_sum > 0.f ? selected_sum : 1.f;
for (int k_idx = 0; k_idx < k; ++k_idx) {
const int idx = k * block_row + k_idx;
output[idx] = output[idx] / denom;
}
}
}
}
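The renormalize path rescales the k selected weights of a row so they sum to 1, guarding against a zero denominator. A standalone sketch of that step (plain C++):

#include <cstdio>
#include <vector>

// Rescale the selected top-k weights of one row so they sum to 1,
// matching the renormalize branch above (denominator guarded against 0).
void renormalize_row(std::vector<float>& topk_weights) {
  float selected_sum = 0.f;
  for (float w : topk_weights) selected_sum += w;
  const float denom = selected_sum > 0.f ? selected_sum : 1.f;
  for (float& w : topk_weights) w /= denom;
}

int main() {
  std::vector<float> w = {0.30f, 0.20f, 0.10f};  // softmax mass of the chosen experts
  renormalize_row(w);
  std::printf("%.3f %.3f %.3f\n", w[0], w[1], w[2]);  // 0.500 0.333 0.167
}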
// ====================== TopK softmax things ===============================
@ -184,21 +225,30 @@ __launch_bounds__(TPB) __global__ void moeTopK(
2) This implementation assumes k is small, but will work for any k.
*/
template <int VPT, int NUM_EXPERTS, int WARPS_PER_CTA, int BYTES_PER_LDG, int WARP_SIZE_PARAM, typename IndType>
template <int VPT, int NUM_EXPERTS, int WARPS_PER_CTA, int BYTES_PER_LDG, int WARP_SIZE_PARAM, typename IndType, typename InputType = float>
__launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__
void topkGatingSoftmax(const float* input, const bool* finished, float* output, const int num_rows, IndType* indices,
int* source_rows, const int k, const int start_expert, const int end_expert)
void topkGatingSoftmax(const InputType* input, const bool* finished, float* output, const int num_rows, IndType* indices,
int* source_rows, const int k, const int start_expert, const int end_expert, const bool renormalize)
{
static_assert(std::is_same_v<InputType, float> || std::is_same_v<InputType, __nv_bfloat16> ||
std::is_same_v<InputType, __half>,
"InputType must be float, __nv_bfloat16, or __half");
// We begin by enforcing compile time assertions and setting up compile time constants.
static_assert(BYTES_PER_LDG == (BYTES_PER_LDG & -BYTES_PER_LDG), "BYTES_PER_LDG must be power of 2");
static_assert(BYTES_PER_LDG <= 16, "BYTES_PER_LDG must be leq 16");
// Number of bytes each thread pulls in per load
static constexpr int ELTS_PER_LDG = BYTES_PER_LDG / sizeof(float);
static constexpr int ELTS_PER_LDG = BYTES_PER_LDG / sizeof(InputType);
static constexpr int ELTS_PER_ROW = NUM_EXPERTS;
static constexpr int THREADS_PER_ROW = ELTS_PER_ROW / VPT;
static constexpr int LDG_PER_THREAD = VPT / ELTS_PER_LDG;
if constexpr (std::is_same_v<InputType, __nv_bfloat16> || std::is_same_v<InputType, __half>) {
static_assert(ELTS_PER_LDG == 1 || ELTS_PER_LDG % 2 == 0,
"ELTS_PER_LDG must be 1 or even for 16-bit conversion");
}
// Restrictions based on previous section.
static_assert(VPT % ELTS_PER_LDG == 0, "The elements per thread must be a multiple of the elements per ldg");
static_assert(WARP_SIZE_PARAM % THREADS_PER_ROW == 0, "The threads per row must cleanly divide the threads per warp");
@ -236,27 +286,71 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__
// We finally start setting up the read pointers for each thread. First, each thread jumps to the start of the
// row it will read.
const float* thread_row_ptr = input + thread_row * ELTS_PER_ROW;
const InputType* thread_row_ptr = input + thread_row * ELTS_PER_ROW;
// Now, we compute the group each thread belong to in order to determine the first column to start loads.
const int thread_group_idx = threadIdx.x % THREADS_PER_ROW;
const int first_elt_read_by_thread = thread_group_idx * ELTS_PER_LDG;
const float* thread_read_ptr = thread_row_ptr + first_elt_read_by_thread;
// Determine the pointer type to use to read in the data depending on the BYTES_PER_LDG template param. In theory,
// this can support all powers of 2 up to 16.
// NOTE(woosuk): The original implementation uses CUTLASS aligned array here.
// We defined our own aligned array and use it here to avoid the dependency on CUTLASS.
using AccessType = AlignedArray<float, ELTS_PER_LDG>;
const InputType* thread_read_ptr = thread_row_ptr + first_elt_read_by_thread;
// Finally, we pull in the data from global mem
float row_chunk[VPT];
AccessType* row_chunk_vec_ptr = reinterpret_cast<AccessType*>(&row_chunk);
const AccessType* vec_thread_read_ptr = reinterpret_cast<const AccessType*>(thread_read_ptr);
// NOTE(zhuhaoran): dispatch loading on the input type; BF16/FP16 values are converted to float
if constexpr (std::is_same_v<InputType, float>) {
using VecType = AlignedArray<float, ELTS_PER_LDG>;
VecType* row_chunk_vec_ptr = reinterpret_cast<VecType*>(&row_chunk);
const VecType* vec_thread_read_ptr = reinterpret_cast<const VecType*>(thread_read_ptr);
#pragma unroll
for (int ii = 0; ii < LDG_PER_THREAD; ++ii)
{
row_chunk_vec_ptr[ii] = vec_thread_read_ptr[ii * THREADS_PER_ROW];
for (int ii = 0; ii < LDG_PER_THREAD; ++ii) {
row_chunk_vec_ptr[ii] = vec_thread_read_ptr[ii * THREADS_PER_ROW];
}
} else if constexpr (std::is_same_v<InputType, __nv_bfloat16>) {
if constexpr (ELTS_PER_LDG >= 2) {
using VecType = AlignedArray<__nv_bfloat16, ELTS_PER_LDG>;
float2* row_chunk_f2 = reinterpret_cast<float2*>(row_chunk);
const VecType* vec_thread_read_ptr = reinterpret_cast<const VecType*>(thread_read_ptr);
#pragma unroll
for (int ii = 0; ii < LDG_PER_THREAD; ++ii) {
VecType vec = vec_thread_read_ptr[ii * THREADS_PER_ROW];
int base_idx_f2 = ii * ELTS_PER_LDG / 2;
#pragma unroll
for (int jj = 0; jj < ELTS_PER_LDG / 2; ++jj) {
row_chunk_f2[base_idx_f2 + jj] = __bfloat1622float2(
*reinterpret_cast<const __nv_bfloat162*>(vec.data + jj * 2)
);
}
}
} else { // ELTS_PER_LDG == 1
#pragma unroll
for (int ii = 0; ii < LDG_PER_THREAD; ++ii) {
const __nv_bfloat16* scalar_ptr = thread_read_ptr + ii * THREADS_PER_ROW;
row_chunk[ii] = __bfloat162float(*scalar_ptr);
}
}
} else if constexpr (std::is_same_v<InputType, __half>) {
if constexpr (ELTS_PER_LDG >= 2) {
using VecType = AlignedArray<__half, ELTS_PER_LDG>;
float2* row_chunk_f2 = reinterpret_cast<float2*>(row_chunk);
const VecType* vec_thread_read_ptr = reinterpret_cast<const VecType*>(thread_read_ptr);
#pragma unroll
for (int ii = 0; ii < LDG_PER_THREAD; ++ii) {
VecType vec = vec_thread_read_ptr[ii * THREADS_PER_ROW];
int base_idx_f2 = ii * ELTS_PER_LDG / 2;
#pragma unroll
for (int jj = 0; jj < ELTS_PER_LDG / 2; ++jj) {
row_chunk_f2[base_idx_f2 + jj] = __half22float2(
*reinterpret_cast<const __half2*>(vec.data + jj * 2)
);
}
}
} else { // ELTS_PER_LDG == 1
#pragma unroll
for (int ii = 0; ii < LDG_PER_THREAD; ++ii) {
const __half* scalar_ptr = thread_read_ptr + ii * THREADS_PER_ROW;
row_chunk[ii] = __half2float(*scalar_ptr);
}
}
}
// First, we perform a max reduce within the thread. We can do the max in fp16 safely (I think) and just
@ -310,6 +404,7 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__
int start_col = first_elt_read_by_thread;
static constexpr int COLS_PER_GROUP_LDG = ELTS_PER_LDG * THREADS_PER_ROW;
float selected_sum = 0.f;
for (int k_idx = 0; k_idx < k; ++k_idx)
{
// First, each thread does the local argmax
@ -363,6 +458,9 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__
output[idx] = max_val;
indices[idx] = should_process_row ? (expert - start_expert) : NUM_EXPERTS;
source_rows[idx] = k_idx * num_rows + thread_row;
if (renormalize) {
selected_sum += max_val;
}
}
// Finally, we clear the value in the thread with the current max if there is another iteration to run.
@ -380,15 +478,28 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__
}
}
}
// Renormalize the k weights for this row to sum to 1, if requested.
if (renormalize) {
if (thread_group_idx == 0)
{
const float denom = selected_sum > 0.f ? selected_sum : 1.f;
for (int k_idx = 0; k_idx < k; ++k_idx)
{
const int idx = k * thread_row + k_idx;
output[idx] = output[idx] / denom;
}
}
}
}
namespace detail
{
// Constructs some constants needed to partition the work across threads at compile time.
template <int EXPERTS, int BYTES_PER_LDG, int WARP_SIZE_PARAM>
template <int EXPERTS, int BYTES_PER_LDG, int WARP_SIZE_PARAM, typename InputType>
struct TopkConstants
{
static constexpr int ELTS_PER_LDG = BYTES_PER_LDG / sizeof(float);
static constexpr int ELTS_PER_LDG = BYTES_PER_LDG / sizeof(InputType);
static_assert(EXPERTS / (ELTS_PER_LDG * WARP_SIZE_PARAM) == 0 || EXPERTS % (ELTS_PER_LDG * WARP_SIZE_PARAM) == 0, "");
static constexpr int VECs_PER_THREAD = MAX(1, EXPERTS / (ELTS_PER_LDG * WARP_SIZE_PARAM));
static constexpr int VPT = VECs_PER_THREAD * ELTS_PER_LDG;
@ -397,20 +508,21 @@ struct TopkConstants
};
} // namespace detail
template <int EXPERTS, int WARPS_PER_TB, int WARP_SIZE_PARAM, int MAX_BYTES_PER_LDG, typename IndType>
void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, float* output, IndType* indices,
int* source_row, const int num_rows, const int k, const int start_expert, const int end_expert, cudaStream_t stream)
template <int EXPERTS, int WARPS_PER_TB, int WARP_SIZE_PARAM, int MAX_BYTES_PER_LDG, typename IndType, typename InputType>
void topkGatingSoftmaxLauncherHelper(const InputType* input, const bool* finished, float* output, IndType* indices,
int* source_row, const int num_rows, const int k, const int start_expert, const int end_expert, const bool renormalize,
cudaStream_t stream)
{
static constexpr int BYTES_PER_LDG = MIN(MAX_BYTES_PER_LDG, sizeof(float) * EXPERTS);
using Constants = detail::TopkConstants<EXPERTS, BYTES_PER_LDG, WARP_SIZE_PARAM>;
static constexpr int BYTES_PER_LDG = MIN(MAX_BYTES_PER_LDG, sizeof(InputType) * EXPERTS);
using Constants = detail::TopkConstants<EXPERTS, BYTES_PER_LDG, WARP_SIZE_PARAM, InputType>;
static constexpr int VPT = Constants::VPT;
static constexpr int ROWS_PER_WARP = Constants::ROWS_PER_WARP;
const int num_warps = (num_rows + ROWS_PER_WARP - 1) / ROWS_PER_WARP;
const int num_blocks = (num_warps + WARPS_PER_TB - 1) / WARPS_PER_TB;
dim3 block_dim(WARP_SIZE_PARAM, WARPS_PER_TB);
topkGatingSoftmax<VPT, EXPERTS, WARPS_PER_TB, BYTES_PER_LDG, WARP_SIZE_PARAM><<<num_blocks, block_dim, 0, stream>>>(
input, finished, output, num_rows, indices, source_row, k, start_expert, end_expert);
topkGatingSoftmax<VPT, EXPERTS, WARPS_PER_TB, BYTES_PER_LDG, WARP_SIZE_PARAM, IndType, InputType><<<num_blocks, block_dim, 0, stream>>>(
input, finished, output, num_rows, indices, source_row, k, start_expert, end_expert, renormalize);
}
#ifndef USE_ROCM
@ -418,26 +530,26 @@ void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, f
static_assert(WARP_SIZE == 32, \
"Unsupported warp size. Only 32 is supported for CUDA"); \
topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, WARP_SIZE, MAX_BYTES>( \
gating_output, nullptr, topk_weights, topk_indices, \
token_expert_indices, num_tokens, topk, 0, num_experts, stream);
gating_output, nullptr, topk_weights, topk_indices, token_expert_indices, \
num_tokens, topk, 0, num_experts, renormalize, stream);
#else
#define LAUNCH_SOFTMAX(NUM_EXPERTS, WARPS_PER_TB, MAX_BYTES) \
if (WARP_SIZE == 64) { \
topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, 64, MAX_BYTES>( \
gating_output, nullptr, topk_weights, topk_indices, \
token_expert_indices, num_tokens, topk, 0, num_experts, stream); \
gating_output, nullptr, topk_weights, topk_indices, token_expert_indices, \
num_tokens, topk, 0, num_experts, renormalize, stream); \
} else if (WARP_SIZE == 32) { \
topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, 32, MAX_BYTES>( \
gating_output, nullptr, topk_weights, topk_indices, \
token_expert_indices, num_tokens, topk, 0, num_experts, stream); \
gating_output, nullptr, topk_weights, topk_indices, token_expert_indices, \
num_tokens, topk, 0, num_experts, renormalize, stream); \
} else { \
assert(false && "Unsupported warp size. Only 32 and 64 are supported for ROCm"); \
}
#endif
template <typename IndType>
template <typename IndType, typename InputType>
void topkGatingSoftmaxKernelLauncher(
const float* gating_output,
const InputType* gating_output,
float* topk_weights,
IndType* topk_indices,
int* token_expert_indices,
@ -445,11 +557,15 @@ void topkGatingSoftmaxKernelLauncher(
const int num_tokens,
const int num_experts,
const int topk,
const bool renormalize,
cudaStream_t stream) {
static constexpr int WARPS_PER_TB = 4;
static constexpr int BYTES_PER_LDG_POWER_OF_2 = 16;
#ifndef USE_ROCM
static constexpr int BYTES_PER_LDG_MULTIPLE_64 = 8;
// For bfloat16/half dtypes, we need 4-byte loads to make sure num_experts
// elements can be loaded by a warp
static constexpr int BYTES_PER_LDG_MULTIPLE_64 =
(std::is_same_v<InputType, __nv_bfloat16> || std::is_same_v<InputType, __half>) ? 4 : 8;
#endif
switch (num_experts) {
case 1:
@ -506,11 +622,11 @@ void topkGatingSoftmaxKernelLauncher(
TORCH_CHECK(softmax_workspace != nullptr,
"softmax_workspace must be provided for num_experts that are not a power of 2 or multiple of 64.");
static constexpr int TPB = 256;
moeSoftmax<TPB><<<num_tokens, TPB, 0, stream>>>(
moeSoftmax<TPB, InputType><<<num_tokens, TPB, 0, stream>>>(
gating_output, nullptr, softmax_workspace, num_experts);
moeTopK<TPB><<<num_tokens, TPB, 0, stream>>>(
softmax_workspace, nullptr, topk_weights, topk_indices, token_expert_indices,
num_experts, topk, 0, num_experts);
num_experts, topk, 0, num_experts, renormalize);
}
}
}
@ -518,11 +634,50 @@ void topkGatingSoftmaxKernelLauncher(
} // namespace moe
} // namespace vllm
template<typename ComputeType>
void dispatch_topk_softmax_launch(
torch::Tensor& gating_output,
torch::Tensor& topk_weights,
torch::Tensor& topk_indices,
torch::Tensor& token_expert_indices,
torch::Tensor& softmax_workspace,
int num_tokens, int num_experts, int topk, bool renormalize, cudaStream_t stream)
{
if (topk_indices.scalar_type() == at::ScalarType::Int) {
vllm::moe::topkGatingSoftmaxKernelLauncher<int, ComputeType>(
reinterpret_cast<const ComputeType*>(gating_output.data_ptr()),
topk_weights.data_ptr<float>(),
topk_indices.data_ptr<int>(),
token_expert_indices.data_ptr<int>(),
softmax_workspace.data_ptr<float>(),
num_tokens, num_experts, topk, renormalize, stream);
} else if (topk_indices.scalar_type() == at::ScalarType::UInt32) {
vllm::moe::topkGatingSoftmaxKernelLauncher<uint32_t, ComputeType>(
reinterpret_cast<const ComputeType*>(gating_output.data_ptr()),
topk_weights.data_ptr<float>(),
topk_indices.data_ptr<uint32_t>(),
token_expert_indices.data_ptr<int>(),
softmax_workspace.data_ptr<float>(),
num_tokens, num_experts, topk, renormalize, stream);
} else {
TORCH_CHECK(topk_indices.scalar_type() == at::ScalarType::Long);
vllm::moe::topkGatingSoftmaxKernelLauncher<int64_t, ComputeType>(
reinterpret_cast<const ComputeType*>(gating_output.data_ptr()),
topk_weights.data_ptr<float>(),
topk_indices.data_ptr<int64_t>(),
token_expert_indices.data_ptr<int>(),
softmax_workspace.data_ptr<float>(),
num_tokens, num_experts, topk, renormalize, stream);
}
}
void topk_softmax(
torch::Tensor& topk_weights, // [num_tokens, topk]
torch::Tensor& topk_indices, // [num_tokens, topk]
torch::Tensor& token_expert_indices, // [num_tokens, topk]
torch::Tensor& gating_output) // [num_tokens, num_experts]
torch::Tensor& gating_output, // [num_tokens, num_experts]
bool renormalize)
{
const int num_experts = gating_output.size(-1);
const auto num_tokens = gating_output.numel() / num_experts;
@ -534,45 +689,19 @@ void topk_softmax(
const at::cuda::OptionalCUDAGuard device_guard(device_of(gating_output));
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
torch::Tensor softmax_workspace = torch::empty({workspace_size}, gating_output.options());
const auto workspace_options = gating_output.options().dtype(at::ScalarType::Float);
torch::Tensor softmax_workspace = torch::empty({workspace_size}, workspace_options);
if(topk_indices.scalar_type() == at::ScalarType::Int)
{
vllm::moe::topkGatingSoftmaxKernelLauncher(
gating_output.data_ptr<float>(),
topk_weights.data_ptr<float>(),
topk_indices.data_ptr<int>(),
token_expert_indices.data_ptr<int>(),
softmax_workspace.data_ptr<float>(),
num_tokens,
num_experts,
topk,
stream);
}
else if (topk_indices.scalar_type() == at::ScalarType::UInt32)
{
vllm::moe::topkGatingSoftmaxKernelLauncher(
gating_output.data_ptr<float>(),
topk_weights.data_ptr<float>(),
topk_indices.data_ptr<uint32_t>(),
token_expert_indices.data_ptr<int>(),
softmax_workspace.data_ptr<float>(),
num_tokens,
num_experts,
topk,
stream);
}
else {
TORCH_CHECK(topk_indices.scalar_type() == at::ScalarType::Long);
vllm::moe::topkGatingSoftmaxKernelLauncher(
gating_output.data_ptr<float>(),
topk_weights.data_ptr<float>(),
topk_indices.data_ptr<int64_t>(),
token_expert_indices.data_ptr<int>(),
softmax_workspace.data_ptr<float>(),
num_tokens,
num_experts,
topk,
stream);
if (gating_output.scalar_type() == at::ScalarType::Float) {
dispatch_topk_softmax_launch<float>(gating_output, topk_weights, topk_indices,
token_expert_indices, softmax_workspace, num_tokens, num_experts, topk, renormalize, stream);
} else if (gating_output.scalar_type() == at::ScalarType::Half) {
dispatch_topk_softmax_launch<__half>(gating_output, topk_weights, topk_indices,
token_expert_indices, softmax_workspace, num_tokens, num_experts, topk, renormalize, stream);
} else if (gating_output.scalar_type() == at::ScalarType::BFloat16) {
dispatch_topk_softmax_launch<__nv_bfloat16>(gating_output, topk_weights, topk_indices,
token_expert_indices, softmax_workspace, num_tokens, num_experts, topk, renormalize, stream);
} else {
TORCH_CHECK(false, "Unsupported gating_output data type: ", gating_output.scalar_type());
}
}

View File

@ -5,7 +5,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
// Apply topk softmax to the gating outputs.
m.def(
"topk_softmax(Tensor! topk_weights, Tensor! topk_indices, Tensor! "
"token_expert_indices, Tensor gating_output) -> ()");
"token_expert_indices, Tensor gating_output, bool renormalize) -> ()");
m.impl("topk_softmax", torch::kCUDA, &topk_softmax);
// Calculate the result of moe by summing up the partial results
@ -22,6 +22,31 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
" Tensor! num_tokens_post_pad) -> ()");
m.impl("moe_align_block_size", torch::kCUDA, &moe_align_block_size);
// Aligning the number of tokens to be processed by each expert such
// that it is divisible by the block size, but for the batched case.
m.def(
"batched_moe_align_block_size(int max_tokens_per_batch,"
" int block_size, Tensor expert_num_tokens,"
" Tensor! sorted_token_ids,"
" Tensor! experts_ids,"
" Tensor! num_tokens_post_pad) -> ()");
m.impl("batched_moe_align_block_size", torch::kCUDA,
&batched_moe_align_block_size);
// Aligning the number of tokens to be processed by each expert such
// that it is divisible by the block size.
m.def(
"moe_lora_align_block_size(Tensor topk_ids,"
" Tensor token_lora_mapping,"
" int num_experts,"
" int block_size, int max_loras, "
" int max_num_tokens_padded, "
" int max_num_m_blocks, "
" Tensor !sorted_token_ids,"
" Tensor !experts_ids,"
" Tensor !num_tokens_post_pad) -> () ");
m.impl("moe_lora_align_block_size", torch::kCUDA, &moe_lora_align_block_size);
#ifndef USE_ROCM
m.def(
"moe_wna16_gemm(Tensor input, Tensor! output, Tensor b_qweight, "

View File

@ -92,9 +92,6 @@ void rms_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight,
void fused_add_rms_norm(torch::Tensor& input, torch::Tensor& residual,
torch::Tensor& weight, double epsilon);
void poly_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight,
torch::Tensor& bias, double epsilon);
void apply_repetition_penalties_(torch::Tensor& logits,
const torch::Tensor& prompt_mask,
const torch::Tensor& output_mask,
@ -102,8 +99,11 @@ void apply_repetition_penalties_(torch::Tensor& logits,
void top_k_per_row(const torch::Tensor& logits, const torch::Tensor& rowStarts,
const torch::Tensor& rowEnds, torch::Tensor& indices,
torch::Tensor& values, int64_t numRows, int64_t stride0,
int64_t stride1);
int64_t numRows, int64_t stride0, int64_t stride1);
void top_k_per_row_decode(const torch::Tensor& logits, int64_t next_n,
const torch::Tensor& seq_lens, torch::Tensor& indices,
int64_t numRows, int64_t stride0, int64_t stride1);
void rms_norm_static_fp8_quant(torch::Tensor& out, torch::Tensor& input,
torch::Tensor& weight, torch::Tensor& scale,
@ -307,7 +307,7 @@ void dynamic_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input,
torch::Tensor gptq_gemm(torch::Tensor a, torch::Tensor b_q_weight,
torch::Tensor b_gptq_qzeros,
torch::Tensor b_gptq_scales, torch::Tensor b_g_idx,
bool use_exllama, int64_t bit);
bool use_exllama, bool use_v2_format, int64_t bit);
void gptq_shuffle(torch::Tensor q_weight, torch::Tensor q_perm, int64_t bit);
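The new use_v2_format flag is threaded down to the GPTQ kernels below, where it controls the zero-point offset: GPTQv1 checkpoints store qzeros shifted down by one, so the kernels add 1 back, while GPTQv2 stores them unshifted. A minimal dequantization sketch of the difference (plain C++, illustrative values):

#include <cstdio>

// Dequantize one 4-bit GPTQ weight. GPTQv1 checkpoints store qzeros shifted
// down by one, so the kernel adds the offset back; GPTQv2 stores them as-is.
float dequant_gptq(int q, int stored_zero, float scale, bool use_v2_format) {
  const int zero_offset = use_v2_format ? 0 : 1;
  return scale * static_cast<float>(q - (stored_zero + zero_offset));
}

int main() {
  // Hypothetical values: quantized weight 10, stored zero 7, scale 0.05.
  std::printf("v1: %.3f\n", dequant_gptq(10, 7, 0.05f, /*use_v2_format=*/false));  // 0.100
  std::printf("v2: %.3f\n", dequant_gptq(10, 7, 0.05f, /*use_v2_format=*/true));   // 0.150
}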

View File

@ -145,7 +145,11 @@ void rms_norm_dynamic_per_token_quant(
if (scale_ub.has_value()) {
TORCH_CHECK(out.dtype() == kFp8Type);
}
TORCH_CHECK(weight.dtype() == input.dtype());
TORCH_CHECK(scales.dtype() == torch::kFloat32);
if (residual) {
TORCH_CHECK(residual->scalar_type() == input.scalar_type());
}
VLLM_DISPATCH_FLOATING_TYPES(
input.scalar_type(), "rms_norm_dynamic_per_token_quant_dispatch", [&] {

View File

@ -185,7 +185,7 @@ typedef void (*fp_gemm_half_q_half_gptq_kernel)(const half*, const uint32_t*,
const uint32_t*, const half*,
half*, const int, const int,
const int, const int,
const int*);
const bool, const int*);
template <bool first_block, int m_count>
__global__ void gemm_half_q_half_gptq_4bit_kernel(
@ -193,12 +193,15 @@ __global__ void gemm_half_q_half_gptq_4bit_kernel(
const uint32_t* __restrict__ b_gptq_qzeros,
const half* __restrict__ b_gptq_scales, half* __restrict__ c,
const int size_m, const int size_n, const int size_k, const int groups,
const int* __restrict__ b_q_perm) {
const bool use_v2_format, const int* __restrict__ b_q_perm) {
MatrixView_half a_(a, size_m, size_k);
MatrixView_half_rw c_(c, size_m, size_n);
MatrixView_q4_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);
// GPTQv2 and GPTQv1 handle zero points differently
int zero_offset = use_v2_format ? 0 : 1;
auto t = threadIdx.x;
// Block
@ -256,10 +259,10 @@ __global__ void gemm_half_q_half_gptq_4bit_kernel(
half2 y1y16[4][2];
b_gptq_qzeros_.item4(zeros, group, n);
b_gptq_scales_.item4_f(scales, group, n);
dequant_4bit_8_prep_zero(zeros[0] + 1, z1z16[0], y1y16[0]);
dequant_4bit_8_prep_zero(zeros[1] + 1, z1z16[1], y1y16[1]);
dequant_4bit_8_prep_zero(zeros[2] + 1, z1z16[2], y1y16[2]);
dequant_4bit_8_prep_zero(zeros[3] + 1, z1z16[3], y1y16[3]);
dequant_4bit_8_prep_zero(zeros[0] + zero_offset, z1z16[0], y1y16[0]);
dequant_4bit_8_prep_zero(zeros[1] + zero_offset, z1z16[1], y1y16[1]);
dequant_4bit_8_prep_zero(zeros[2] + zero_offset, z1z16[2], y1y16[2]);
dequant_4bit_8_prep_zero(zeros[3] + zero_offset, z1z16[3], y1y16[3]);
// Column result
float block_c[m_count][4] = {};
@ -272,10 +275,10 @@ __global__ void gemm_half_q_half_gptq_4bit_kernel(
nextgroup += groupsize;
b_gptq_qzeros_.item4(zeros, group, n);
b_gptq_scales_.item4_f(scales, group, n);
dequant_4bit_8_prep_zero(zeros[0] + 1, z1z16[0], y1y16[0]);
dequant_4bit_8_prep_zero(zeros[1] + 1, z1z16[1], y1y16[1]);
dequant_4bit_8_prep_zero(zeros[2] + 1, z1z16[2], y1y16[2]);
dequant_4bit_8_prep_zero(zeros[3] + 1, z1z16[3], y1y16[3]);
dequant_4bit_8_prep_zero(zeros[0] + zero_offset, z1z16[0], y1y16[0]);
dequant_4bit_8_prep_zero(zeros[1] + zero_offset, z1z16[1], y1y16[1]);
dequant_4bit_8_prep_zero(zeros[2] + zero_offset, z1z16[2], y1y16[2]);
dequant_4bit_8_prep_zero(zeros[3] + zero_offset, z1z16[3], y1y16[3]);
}
#pragma unroll
@ -329,12 +332,15 @@ __global__ void gemm_half_q_half_gptq_2bit_kernel(
const uint32_t* __restrict__ b_gptq_qzeros,
const half* __restrict__ b_gptq_scales, half* __restrict__ c,
const int size_m, const int size_n, const int size_k, const int groups,
const int* __restrict__ b_q_perm) {
const bool use_v2_format, const int* __restrict__ b_q_perm) {
MatrixView_half a_(a, size_m, size_k);
MatrixView_half_rw c_(c, size_m, size_n);
MatrixView_q2_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);
// GPTQv2 and GPTQv1 handle zero points differently
int zero_offset = use_v2_format ? 0 : 1;
auto t = threadIdx.x;
// Block
@ -409,10 +415,10 @@ __global__ void gemm_half_q_half_gptq_2bit_kernel(
int4 load_int4 = *b_ptr4;
half2 dq[4][8];
dequant_2bit_16(load_int4.x, dq[0], size_n, zeros[0] + 1);
dequant_2bit_16(load_int4.y, dq[1], size_n, zeros[1] + 1);
dequant_2bit_16(load_int4.z, dq[2], size_n, zeros[2] + 1);
dequant_2bit_16(load_int4.w, dq[3], size_n, zeros[3] + 1);
dequant_2bit_16(load_int4.x, dq[0], size_n, zeros[0] + zero_offset);
dequant_2bit_16(load_int4.y, dq[1], size_n, zeros[1] + zero_offset);
dequant_2bit_16(load_int4.z, dq[2], size_n, zeros[2] + zero_offset);
dequant_2bit_16(load_int4.w, dq[3], size_n, zeros[3] + zero_offset);
#pragma unroll
for (int m = 0; m < m_count; m++) {
@ -448,12 +454,15 @@ __global__ void gemm_half_q_half_gptq_3bit_kernel(
const uint32_t* __restrict__ b_gptq_qzeros,
const half* __restrict__ b_gptq_scales, half* __restrict__ c,
const int size_m, const int size_n, const int size_k, const int groups,
const int* __restrict__ b_q_perm) {
const bool use_v2_format, const int* __restrict__ b_q_perm) {
MatrixView_half a_(a, size_m, size_k);
MatrixView_half_rw c_(c, size_m, size_n);
MatrixView_q3_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);
// GPTQv2 and GPTQv1 handle zero points differently
int zero_offset = use_v2_format ? 0 : 1;
auto t = threadIdx.x;
// Block
@ -534,13 +543,13 @@ __global__ void gemm_half_q_half_gptq_3bit_kernel(
half2 dq[4][16];
dequant_3bit_32(load_int4[0].x, load_int4[1].x, load_int4[2].x, dq[0],
size_n, zeros[0] + 1);
size_n, zeros[0] + zero_offset);
dequant_3bit_32(load_int4[0].y, load_int4[1].y, load_int4[2].y, dq[1],
size_n, zeros[1] + 1);
size_n, zeros[1] + zero_offset);
dequant_3bit_32(load_int4[0].z, load_int4[1].z, load_int4[2].z, dq[2],
size_n, zeros[2] + 1);
size_n, zeros[2] + zero_offset);
dequant_3bit_32(load_int4[0].w, load_int4[1].w, load_int4[2].w, dq[3],
size_n, zeros[3] + 1);
size_n, zeros[3] + zero_offset);
#pragma unroll
for (int m = 0; m < m_count; m++) {
@ -574,12 +583,15 @@ __global__ void gemm_half_q_half_gptq_8bit_kernel(
const uint32_t* __restrict__ b_gptq_qzeros,
const half* __restrict__ b_gptq_scales, half* __restrict__ c,
const int size_m, const int size_n, const int size_k, const int groups,
const int* __restrict__ b_q_perm) {
const bool use_v2_format, const int* __restrict__ b_q_perm) {
MatrixView_half a_(a, size_m, size_k);
MatrixView_half_rw c_(c, size_m, size_n);
MatrixView_q8_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);
// GPTQv2 and GPTQv1 handle zero points differently
int zero_offset = use_v2_format ? 0 : 1;
auto t = threadIdx.x;
// Block
@ -658,13 +670,13 @@ __global__ void gemm_half_q_half_gptq_8bit_kernel(
half2 dq[4][4];
dequant_8bit_8(load_int4[0].x, load_int4[1].x, dq[0], size_n,
zeros[0] + 1);
zeros[0] + zero_offset);
dequant_8bit_8(load_int4[0].y, load_int4[1].y, dq[1], size_n,
zeros[1] + 1);
zeros[1] + zero_offset);
dequant_8bit_8(load_int4[0].z, load_int4[1].z, dq[2], size_n,
zeros[2] + 1);
zeros[2] + zero_offset);
dequant_8bit_8(load_int4[0].w, load_int4[1].w, dq[3], size_n,
zeros[3] + 1);
zeros[3] + zero_offset);
for (int m = 0; m < m_count; m++) {
block_c[m][0] =
@ -730,7 +742,8 @@ void gemm_half_q_half_cuda_part(const half* a, const uint32_t* b_q_weight,
const uint32_t* b_gptq_qzeros,
const half* b_gptq_scales, const int* b_q_perm,
half* c, int size_m, int size_n, int size_k,
int m_count, int groups, int bit) {
int m_count, int groups, bool use_v2_format,
int bit) {
dim3 blockDim, gridDim;
blockDim.x = BLOCK_KN_SIZE;
blockDim.y = 1;
@ -743,20 +756,23 @@ void gemm_half_q_half_cuda_part(const half* a, const uint32_t* b_q_weight,
pick_gemm_half_q_half_gptq_kernel(true, m_count, bit);
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
kernel<<<gridDim, blockDim, 0, stream>>>(a, b_q_weight, b_gptq_qzeros,
b_gptq_scales, c, size_m, size_n,
size_k, groups, b_q_perm);
kernel<<<gridDim, blockDim, 0, stream>>>(
a, b_q_weight, b_gptq_qzeros, b_gptq_scales, c, size_m, size_n, size_k,
groups, use_v2_format, b_q_perm);
}
__global__ void reconstruct_exllama_8bit_kernel(
const uint32_t* __restrict__ b_q_weight, const int* __restrict__ b_q_perm,
const uint32_t* __restrict__ b_gptq_qzeros,
const half* __restrict__ b_gptq_scales, const int size_k, const int size_n,
const int groups, half* __restrict__ b) {
const int groups, const bool use_v2_format, half* __restrict__ b) {
MatrixView_half_rw b_(b, size_k, size_n);
MatrixView_q8_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);
// GPTQv2 and GPTQv1 handle zero points differently
int zero_offset = use_v2_format ? 0 : 1;
auto offset_k = BLOCK_KN_SIZE * blockIdx.y;
auto offset_n = BLOCK_KN_SIZE * blockIdx.x * 4;
@ -812,13 +828,13 @@ __global__ void reconstruct_exllama_8bit_kernel(
half2 dq[4][4];
dequant_8bit_8(load_int4[0].x, load_int4[1].x, dq[0], size_n,
zeros[0] + 1);
zeros[0] + zero_offset);
dequant_8bit_8(load_int4[0].y, load_int4[1].y, dq[1], size_n,
zeros[1] + 1);
zeros[1] + zero_offset);
dequant_8bit_8(load_int4[0].z, load_int4[1].z, dq[2], size_n,
zeros[2] + 1);
zeros[2] + zero_offset);
dequant_8bit_8(load_int4[0].w, load_int4[1].w, dq[3], size_n,
zeros[3] + 1);
zeros[3] + zero_offset);
// half* dqh = (half*)dq;
if (b_q_perm) {
@ -849,11 +865,14 @@ __global__ void reconstruct_exllama_4bit_kernel(
const uint32_t* __restrict__ b_q_weight, const int* __restrict__ b_q_perm,
const uint32_t* __restrict__ b_gptq_qzeros,
const half* __restrict__ b_gptq_scales, const int size_k, const int size_n,
const int groups, half* __restrict__ b) {
const int groups, const bool use_v2_format, half* __restrict__ b) {
MatrixView_half_rw b_(b, size_k, size_n);
MatrixView_q4_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);
// GPTQv2 and GPTQv1 handle zero points differently
int zero_offset = use_v2_format ? 0 : 1;
auto offset_k = BLOCK_KN_SIZE * blockIdx.y;
auto offset_n = BLOCK_KN_SIZE * blockIdx.x * 4;
@ -888,10 +907,10 @@ __global__ void reconstruct_exllama_4bit_kernel(
half2 y1y16[4][2];
b_gptq_qzeros_.item4(zeros, group, n);
b_gptq_scales_.item4_h2(scales, group, n);
dequant_4bit_8_prep_zero(zeros[0] + 1, z1z16[0], y1y16[0]);
dequant_4bit_8_prep_zero(zeros[1] + 1, z1z16[1], y1y16[1]);
dequant_4bit_8_prep_zero(zeros[2] + 1, z1z16[2], y1y16[2]);
dequant_4bit_8_prep_zero(zeros[3] + 1, z1z16[3], y1y16[3]);
dequant_4bit_8_prep_zero(zeros[0] + zero_offset, z1z16[0], y1y16[0]);
dequant_4bit_8_prep_zero(zeros[1] + zero_offset, z1z16[1], y1y16[1]);
dequant_4bit_8_prep_zero(zeros[2] + zero_offset, z1z16[2], y1y16[2]);
dequant_4bit_8_prep_zero(zeros[3] + zero_offset, z1z16[3], y1y16[3]);
__syncthreads();
@ -904,10 +923,10 @@ __global__ void reconstruct_exllama_4bit_kernel(
nextgroup += groupsize;
b_gptq_qzeros_.item4(zeros, group, n);
b_gptq_scales_.item4_h2(scales, group, n);
dequant_4bit_8_prep_zero(zeros[0] + 1, z1z16[0], y1y16[0]);
dequant_4bit_8_prep_zero(zeros[1] + 1, z1z16[1], y1y16[1]);
dequant_4bit_8_prep_zero(zeros[2] + 1, z1z16[2], y1y16[2]);
dequant_4bit_8_prep_zero(zeros[3] + 1, z1z16[3], y1y16[3]);
dequant_4bit_8_prep_zero(zeros[0] + zero_offset, z1z16[0], y1y16[0]);
dequant_4bit_8_prep_zero(zeros[1] + zero_offset, z1z16[1], y1y16[1]);
dequant_4bit_8_prep_zero(zeros[2] + zero_offset, z1z16[2], y1y16[2]);
dequant_4bit_8_prep_zero(zeros[3] + zero_offset, z1z16[3], y1y16[3]);
}
for (int p = 0; p < 4; p++) {
@ -954,11 +973,14 @@ __global__ void reconstruct_exllama_3bit_kernel(
const uint32_t* __restrict__ b_q_weight, const int* __restrict__ b_q_perm,
const uint32_t* __restrict__ b_gptq_qzeros,
const half* __restrict__ b_gptq_scales, const int size_k, const int size_n,
const int groups, half* __restrict__ b) {
const int groups, const bool use_v2_format, half* __restrict__ b) {
MatrixView_half_rw b_(b, size_k, size_n);
MatrixView_q3_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);
// GPTQv2 and GPTQv1 handle zero points differently
int zero_offset = use_v2_format ? 0 : 1;
auto offset_k = BLOCK_KN_SIZE * blockIdx.y;
auto offset_n = BLOCK_KN_SIZE * blockIdx.x * 4;
@ -1016,13 +1038,13 @@ __global__ void reconstruct_exllama_3bit_kernel(
half2 dq[4][16];
dequant_3bit_32(load_int4[0].x, load_int4[1].x, load_int4[2].x, dq[0],
size_n, zeros[0] + 1);
size_n, zeros[0] + zero_offset);
dequant_3bit_32(load_int4[0].y, load_int4[1].y, load_int4[2].y, dq[1],
size_n, zeros[1] + 1);
size_n, zeros[1] + zero_offset);
dequant_3bit_32(load_int4[0].z, load_int4[1].z, load_int4[2].z, dq[2],
size_n, zeros[2] + 1);
size_n, zeros[2] + zero_offset);
dequant_3bit_32(load_int4[0].w, load_int4[1].w, load_int4[2].w, dq[3],
size_n, zeros[3] + 1);
size_n, zeros[3] + zero_offset);
if (b_q_perm) {
for (int j = 0; j < 16; j++) {
@ -1052,11 +1074,14 @@ __global__ void reconstruct_exllama_2bit_kernel(
const uint32_t* __restrict__ b_q_weight, const int* __restrict__ b_q_perm,
const uint32_t* __restrict__ b_gptq_qzeros,
const half* __restrict__ b_gptq_scales, const int size_k, const int size_n,
const int groups, half* __restrict__ b) {
const int groups, const bool use_v2_format, half* __restrict__ b) {
MatrixView_half_rw b_(b, size_k, size_n);
MatrixView_q2_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);
// GPTQv2 and GPTQv1 handle zero points differently
int zero_offset = use_v2_format ? 0 : 1;
auto offset_k = BLOCK_KN_SIZE * blockIdx.y;
auto offset_n = BLOCK_KN_SIZE * blockIdx.x * 4;
@ -1108,10 +1133,10 @@ __global__ void reconstruct_exllama_2bit_kernel(
int4 load_int4 = *b_ptr4;
half2 dq[4][8];
dequant_2bit_16(load_int4.x, dq[0], size_n, zeros[0] + 1);
dequant_2bit_16(load_int4.y, dq[1], size_n, zeros[1] + 1);
dequant_2bit_16(load_int4.z, dq[2], size_n, zeros[2] + 1);
dequant_2bit_16(load_int4.w, dq[3], size_n, zeros[3] + 1);
dequant_2bit_16(load_int4.x, dq[0], size_n, zeros[0] + zero_offset);
dequant_2bit_16(load_int4.y, dq[1], size_n, zeros[1] + zero_offset);
dequant_2bit_16(load_int4.z, dq[2], size_n, zeros[2] + zero_offset);
dequant_2bit_16(load_int4.w, dq[3], size_n, zeros[3] + zero_offset);
b_ptr += size_n;
// half* dqh = (half*)dq;
@ -1143,7 +1168,7 @@ void reconstruct_exllama(const uint32_t* b_q_weight,
const uint32_t* b_gptq_qzeros,
const half* b_gptq_scales, const int* b_q_perm,
half* out, int height, int width, int groups,
int bit) {
bool use_v2_format, int bit) {
dim3 blockDim, gridDim;
blockDim.x = BLOCK_KN_SIZE;
blockDim.y = 1;
@ -1162,14 +1187,14 @@ void reconstruct_exllama(const uint32_t* b_q_weight,
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
reconstruct_exllama_kernel<<<gridDim, blockDim, 0, stream>>>(
b_q_weight, b_q_perm, b_gptq_qzeros, b_gptq_scales, height, width, groups,
out);
use_v2_format, out);
}
__global__ void gemm_half_q_half_alt_4bit_kernel(
const half2* __restrict__ vec, const uint32_t* __restrict__ mat,
half* __restrict__ mul, const half* __restrict__ scales,
const uint32_t* __restrict__ zeros, const int* __restrict__ g_idx,
int batch, int height, int width) {
int batch, int height, int width, bool use_v2_format) {
int zero_width = width / 8;
int vec_height = height * 4;
const int blockwidth2 = BLOCK_KN_SIZE / 2;
@ -1179,6 +1204,9 @@ __global__ void gemm_half_q_half_alt_4bit_kernel(
int h_end = min(BLOCK_KN_SIZE / 8, height - h) * 4;
auto w = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x;
// GPTQv2 and GPTQv1 handle zero points differently
int zero_offset = use_v2_format ? 0 : 1;
__shared__ half2 blockvec[BLOCK_M_SIZE_MAX][blockwidth2];
if (threadIdx.x < h_end) {
for (int m = 0; m < b_end; ++m) {
@ -1223,10 +1251,11 @@ __global__ void gemm_half_q_half_alt_4bit_kernel(
half2 zero = __halves2half2(
__hmul(scale_f,
__int2half_rn(-((zeros[g * zero_width + z_w] >> z_mod) & 0xF) -
1)),
__hmul(scale_f2,
__int2half_rn(
-((zeros[g2 * zero_width + z_w] >> z_mod) & 0xF) - 1)));
zero_offset)),
__hmul(
scale_f2,
__int2half_rn(-((zeros[g2 * zero_width + z_w] >> z_mod) & 0xF) -
zero_offset)));
scales_tmp[tmp_k] = scale;
zeros_tmp[tmp_k] = zero;
}
@ -1268,7 +1297,7 @@ __global__ void gemm_half_q_half_alt_8bit_kernel(
const half2* __restrict__ vec, const uint32_t* __restrict__ mat,
half* __restrict__ mul, const half* __restrict__ scales,
const uint32_t* __restrict__ zeros, const int* __restrict__ g_idx,
int batch, int height, int width) {
int batch, int height, int width, bool use_v2_format) {
int zero_width = width / 4;
int vec_height = height * 2;
const int blockwidth2 = BLOCK_KN_SIZE / 2;
@ -1278,6 +1307,9 @@ __global__ void gemm_half_q_half_alt_8bit_kernel(
int h_end = min(BLOCK_KN_SIZE / 4, height - h) * 2;
auto w = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x;
// GPTQv2 and GPTQv1 handle zero points differently
int zero_offset = use_v2_format ? 0 : 1;
__shared__ half2 blockvec[BLOCK_M_SIZE_MAX][blockwidth2];
if (threadIdx.x < h_end) {
for (int m = 0; m < b_end; ++m) {
@ -1312,12 +1344,13 @@ __global__ void gemm_half_q_half_alt_8bit_kernel(
half scale_f2 = scales[g2 * width + w];
half2 scale = __halves2half2(scale_f, scale_f2);
half2 zero = __halves2half2(
__hmul(scale_f,
__int2half_rn(
-((zeros[g * zero_width + z_w] >> z_mod) & 0xff) - 1)),
__hmul(scale_f2,
__int2half_rn(
-((zeros[g2 * zero_width + z_w] >> z_mod) & 0xff) - 1)));
__hmul(scale_f, __int2half_rn(
-((zeros[g * zero_width + z_w] >> z_mod) & 0xff) -
zero_offset)),
__hmul(
scale_f2,
__int2half_rn(-((zeros[g2 * zero_width + z_w] >> z_mod) & 0xff) -
zero_offset)));
scales_tmp[tmp_k] = scale;
zeros_tmp[tmp_k] = zero;
}
@ -1355,7 +1388,7 @@ void gemm_half_q_half_alt(const half* a, const uint32_t* b_q_weight,
const uint32_t* b_gptq_qzeros,
const half* b_gptq_scales, const int* b_g_idx,
half* c, int size_m, int size_n, int size_k,
int bit) {
bool use_v2_format, int bit) {
dim3 blockDim, gridDim;
blockDim.x = BLOCK_KN_SIZE;
blockDim.y = 1;
@ -1372,17 +1405,15 @@ void gemm_half_q_half_alt(const half* a, const uint32_t* b_q_weight,
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
kernel<<<gridDim, blockDim, 0, stream>>>(
(const half2*)a, b_q_weight, c, b_gptq_scales, b_gptq_qzeros, b_g_idx,
size_m, size_k / 32 * bit, size_n);
size_m, size_k / 32 * bit, size_n, use_v2_format);
}
template <class T, int bit>
__global__ void reconstruct_gptq_kernel(const uint32_t* __restrict__ w,
const half* __restrict__ w_scales,
const uint32_t* __restrict__ w_zeros,
const int* __restrict__ g_idx,
const int height, const int width,
const int group,
half* __restrict__ out) {
__global__ void reconstruct_gptq_kernel(
const uint32_t* __restrict__ w, const half* __restrict__ w_scales,
const uint32_t* __restrict__ w_zeros, const int* __restrict__ g_idx,
const int height, const int width, const int group,
const bool use_v2_format, half* __restrict__ out) {
// Start of block
auto column = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x;
@ -1395,6 +1426,9 @@ __global__ void reconstruct_gptq_kernel(const uint32_t* __restrict__ w,
MatrixView_half w_scales_(w_scales, group, width);
T w_zeros_(w_zeros, group, width);
// GPTQv2 and GPTQv1 handle zero points differently
int zero_offset = use_v2_format ? 0 : 1;
uint32_t w_read = w[blockIdx.y * width + column];
half* out_ptr = out_.item_ptr(row, column);
@ -1402,7 +1436,7 @@ __global__ void reconstruct_gptq_kernel(const uint32_t* __restrict__ w,
for (int s = 0; s < 32; s += bit) {
int group = g_idx[row + s / bit];
half w_scale = w_scales_.item(group, column);
uint32_t w_zero = w_zeros_.item(group, column) + 1;
uint32_t w_zero = w_zeros_.item(group, column) + zero_offset;
half w_item =
__hmul(__int2half_rn((int)((w_read >> s) & ((1 << bit) - 1)) - w_zero),
w_scale);
@ -1415,7 +1449,7 @@ __global__ void reconstruct_gptq_3bit_kernel(
const uint32_t* __restrict__ w, const half* __restrict__ w_scales,
const uint32_t* __restrict__ w_zeros, const int* __restrict__ g_idx,
const int height, const int width, const int group,
half* __restrict__ out) {
const bool use_v2_format, half* __restrict__ out) {
// Start of block
auto column = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x;
auto row = blockIdx.y * 32;
@ -1427,6 +1461,9 @@ __global__ void reconstruct_gptq_3bit_kernel(
MatrixView_half w_scales_(w_scales, group, width);
MatrixView_q3_row w_zeros_(w_zeros, group, width);
// GPTQv2 and GPTQv1 handle zero points differently
int zero_offset = use_v2_format ? 0 : 1;
uint32_t w1 = w[(blockIdx.y * 3) * width + column];
uint32_t w2 = w[(blockIdx.y * 3 + 1) * width + column];
uint32_t w3 = w[(blockIdx.y * 3 + 2) * width + column];
@ -1436,7 +1473,7 @@ __global__ void reconstruct_gptq_3bit_kernel(
for (int i = 0; i < 32; i += 1) {
int group = g_idx[row + i];
half w_scale = w_scales_.item(group, column);
uint32_t w_zero = w_zeros_.item(group, column) + 1;
uint32_t w_zero = w_zeros_.item(group, column) + zero_offset;
int w_item;
if (i == 10) {
w_item = (w1 >> 30) | ((w2 << 2) & 0x4);
@ -1456,7 +1493,8 @@ __global__ void reconstruct_gptq_3bit_kernel(
void reconstruct_gptq(const uint32_t* b_q_weight, const uint32_t* b_gptq_qzeros,
const half* b_gptq_scales, const int* b_g_idx, half* out,
int height, int width, int groups, int bit) {
int height, int width, int groups, bool use_v2_format,
int bit) {
dim3 blockDim, gridDim;
blockDim.x = BLOCK_KN_SIZE;
blockDim.y = 1;
@ -1476,7 +1514,7 @@ void reconstruct_gptq(const uint32_t* b_q_weight, const uint32_t* b_gptq_qzeros,
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
kernel<<<gridDim, blockDim, 0, stream>>>(b_q_weight, b_gptq_scales,
b_gptq_qzeros, b_g_idx, height,
width, groups, out);
width, groups, use_v2_format, out);
}
void gemm_half_q_half_cuda(cublasHandle_t cublas_handle, const half* a,
@ -1484,7 +1522,8 @@ void gemm_half_q_half_cuda(cublasHandle_t cublas_handle, const half* a,
const uint32_t* b_gptq_qzeros,
const half* b_gptq_scales, const int* b_g_idx,
half* c, half* temp_dq, int size_m, int size_n,
int size_k, int groups, bool use_exllama, int bit) {
int size_k, int groups, bool use_exllama,
bool use_v2_format, int bit) {
bool use_reconstruct;
if (use_exllama) {
use_reconstruct = ((bit == 8 && size_m > MAX_Q_GEMM_ROWS_8BIT) ||
@ -1498,10 +1537,10 @@ void gemm_half_q_half_cuda(cublasHandle_t cublas_handle, const half* a,
// Reconstruct FP16 matrix, then cuBLAS
if (use_exllama) {
reconstruct_exllama(b_q_weight, b_gptq_qzeros, b_gptq_scales, b_g_idx,
temp_dq, size_k, size_n, groups, bit);
temp_dq, size_k, size_n, groups, use_v2_format, bit);
} else {
reconstruct_gptq(b_q_weight, b_gptq_qzeros, b_gptq_scales, b_g_idx,
temp_dq, size_k, size_n, groups, bit);
temp_dq, size_k, size_n, groups, use_v2_format, bit);
}
const half alpha = __float2half(1.0f);
@ -1517,18 +1556,18 @@ void gemm_half_q_half_cuda(cublasHandle_t cublas_handle, const half* a,
if (max_chunks) {
gemm_half_q_half_cuda_part(a, b_q_weight, b_gptq_qzeros, b_gptq_scales,
b_g_idx, c, last_chunk, size_n, size_k,
BLOCK_M_SIZE_MAX, groups, bit);
BLOCK_M_SIZE_MAX, groups, use_v2_format, bit);
}
if (last_chunk_size) {
gemm_half_q_half_cuda_part(a + last_chunk * size_k, b_q_weight,
b_gptq_qzeros, b_gptq_scales, b_g_idx,
c + last_chunk * size_n, last_chunk_size,
size_n, size_k, last_chunk_size, groups, bit);
gemm_half_q_half_cuda_part(
a + last_chunk * size_k, b_q_weight, b_gptq_qzeros, b_gptq_scales,
b_g_idx, c + last_chunk * size_n, last_chunk_size, size_n, size_k,
last_chunk_size, groups, use_v2_format, bit);
}
} else {
gemm_half_q_half_alt(a, b_q_weight, b_gptq_qzeros, b_gptq_scales, b_g_idx,
c, size_m, size_n, size_k, bit);
c, size_m, size_n, size_k, use_v2_format, bit);
}
}
@ -1815,7 +1854,7 @@ void shuffle_exllama_weight(uint32_t* q_weight, int* q_perm, int height,
torch::Tensor gptq_gemm(torch::Tensor a, torch::Tensor b_q_weight,
torch::Tensor b_gptq_qzeros,
torch::Tensor b_gptq_scales, torch::Tensor b_g_idx,
bool use_exllama, int64_t bit) {
bool use_exllama, bool use_v2_format, int64_t bit) {
const at::cuda::OptionalCUDAGuard device_guard(device_of(a));
auto options = torch::TensorOptions().dtype(a.dtype()).device(a.device());
at::Tensor c = torch::empty({a.size(0), b_q_weight.size(1)}, options);
@ -1833,7 +1872,7 @@ torch::Tensor gptq_gemm(torch::Tensor a, torch::Tensor b_q_weight,
c.size(1), // n
a.size(1), // k
b_gptq_qzeros.size(0), // group number
use_exllama, bit);
use_exllama, use_v2_format, bit);
return c;
}
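Taken together, this file's changes thread one use_v2_format flag from gptq_gemm down to every dequant call: the GPTQv1 path keeps the historical + 1 zero-point correction, while the GPTQv2 path applies none. A scalar sketch of that shared rule (illustrative only — this helper is not part of the diff, and the real kernels operate on packed half2 data):

// Scalar sketch of the zero-point convention selected by use_v2_format,
// assuming the usual reading that v1 checkpoints store qzeros biased by one.
static inline float dequant_scalar(int q, int zero, float scale,
                                   bool use_v2_format) {
  const int zero_offset = use_v2_format ? 0 : 1;  // same rule as the kernels
  return static_cast<float>(q - (zero + zero_offset)) * scale;
}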

View File

@ -54,15 +54,10 @@ static inline __device__ uint16_t extractBinIdx(float x) {
return 511 - (tmp.u16 >> 7);
}
template <int kNumThreadsPerBlock = 512>
static __global__ void topKPerRow(const float* logits, const int* rowStarts,
const int* rowEnds, int* outIndices,
float* outLogits, int stride0, int stride1) {
// The number of bins in the histogram.
static constexpr int kNumBins = 512;
// The top-k width.
static constexpr int kTopK = 2048;
template <int kNumThreadsPerBlock = 512, int kNumBins = 512, int kTopK = 2048>
__device__ void topKPerRowJob(const float* logits, const int rowStart,
const int rowEnd, const int rowIdx,
int* outIndices, int stride0, int stride1) {
// The number of elements per thread for the final top-k sort.
static constexpr int kNumTopKItemsPerThread = kTopK / kNumThreadsPerBlock;
// The class to sort the elements during the final top-k sort.
@ -103,17 +98,11 @@ static __global__ void topKPerRow(const float* logits, const int* rowStarts,
__shared__ int smemHistogram[kNumBins];
// Shared memory to store the selected indices.
__shared__ int smemIndices[kTopK];
// Shared memory to store the selected logits.
__shared__ float smemLogits[kTopK];
// Shared memory to store the threshold bin.
__shared__ int smemThresholdBinIdx[1];
// Shared memory counter to register the candidates for the final phase.
__shared__ int smemFinalDstIdx[1];
// The row computed by this block.
int rowIdx = blockIdx.x;
// The range of logits within the row.
int rowStart = rowStarts[rowIdx], rowEnd = rowEnds[rowIdx];
// The length of the row.
int rowLen = rowEnd - rowStart;
@ -124,13 +113,10 @@ static __global__ void topKPerRow(const float* logits, const int* rowStarts,
rowIt += kNumThreadsPerBlock) {
int idx = rowStart + rowIt;
outIndices[rowIdx * kTopK + rowIt] = idx - rowStart;
outLogits[rowIdx * kTopK + rowIt] =
logits[rowIdx * stride0 + idx * stride1];
}
for (int rowIt = rowLen + threadIdx.x; rowIt < kTopK;
rowIt += kNumThreadsPerBlock) {
outIndices[rowIdx * kTopK + rowIt] = -1;
outLogits[rowIdx * kTopK + rowIt] = -FLT_MAX;
}
return;
}
@ -201,7 +187,6 @@ static __global__ void topKPerRow(const float* logits, const int* rowStarts,
uint16_t idx = extractBinIdx(logit);
if (idx < thresholdBinIdx) {
int dstIdx = atomicAdd(&smemHistogram[idx], 1);
smemLogits[dstIdx] = logit;
smemIndices[dstIdx] = rowIt;
} else if (idx == thresholdBinIdx) {
int dstIdx = atomicAdd(&smemFinalDstIdx[0], 1);
@ -250,7 +235,6 @@ static __global__ void topKPerRow(const float* logits, const int* rowStarts,
int srcIdx = ii * kNumThreadsPerBlock + threadIdx.x;
int dstIdx = baseIdx + srcIdx;
if (dstIdx < kTopK) {
smemLogits[dstIdx] = finalLogits[ii];
smemIndices[dstIdx] = finalIndices[ii];
}
}
@ -258,31 +242,58 @@ static __global__ void topKPerRow(const float* logits, const int* rowStarts,
// Make sure the data is in shared memory.
__syncthreads();
// The topK logits.
float topKLogits[kNumTopKItemsPerThread];
// The topK indices.
int topKIndices[kNumTopKItemsPerThread];
// Load from shared memory.
#pragma unroll
for (int ii = 0; ii < kNumTopKItemsPerThread; ++ii) {
topKLogits[ii] = smemLogits[ii * kNumThreadsPerBlock + threadIdx.x];
topKIndices[ii] = smemIndices[ii * kNumThreadsPerBlock + threadIdx.x];
}
// Sort the elements.
TopKSort(smemFinal.topKSort)
.SortDescendingBlockedToStriped(topKLogits, topKIndices);
// Store to global memory.
#pragma unroll
for (int ii = 0; ii < kNumTopKItemsPerThread; ++ii) {
int offset = rowIdx * kTopK + ii * kNumThreadsPerBlock + threadIdx.x;
outIndices[offset] = topKIndices[ii] - rowStart;
outLogits[offset] = topKLogits[ii];
outIndices[offset] =
smemIndices[ii * kNumThreadsPerBlock + threadIdx.x] - rowStart;
}
}
template <int kNumThreadsPerBlock = 512>
static __global__ void topKPerRow(const float* logits, const int* rowStarts,
const int* rowEnds, int* outIndices,
int stride0, int stride1) {
// The number of bins in the histogram.
static constexpr int kNumBins = 512;
// The top-k width.
static constexpr int kTopK = 2048;
// The row computed by this block.
int rowIdx = blockIdx.x;
// The range of logits within the row.
int rowStart = rowStarts[rowIdx];
int rowEnd = rowEnds[rowIdx];
topKPerRowJob<kNumThreadsPerBlock, kNumBins, kTopK>(
logits, rowStart, rowEnd, rowIdx, outIndices, stride0, stride1);
}
template <int kNumThreadsPerBlock = 512>
static __global__ void topKPerRowDecode(const float* logits, const int* seqLens,
int* outIndices, int stride0,
int stride1, int next_n) {
// The number of bins in the histogram.
static constexpr int kNumBins = 512;
// The top-k width.
static constexpr int kTopK = 2048;
// The row computed by this block.
int rowIdx = blockIdx.x;
// The range of logits within the row.
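// Each sequence owns next_n consecutive rows: row rowIdx reads the length of
// sequence rowIdx / next_n and covers positions [0, seq_len - next_n +
// (rowIdx % next_n) + 1), one more logit for each successive row of the
// same sequence.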
int rowStart = 0;
int seq_len = seqLens[rowIdx / next_n];
int rowEnd = seq_len - next_n + (rowIdx % next_n) + 1;
topKPerRowJob<kNumThreadsPerBlock, kNumBins, kTopK>(
logits, rowStart, rowEnd, rowIdx, outIndices, stride0, stride1);
}
} // namespace vllm
void apply_repetition_penalties_(
@ -326,10 +337,23 @@ void apply_repetition_penalties_(
});
}
void top_k_per_row_decode(const torch::Tensor& logits, int64_t next_n,
const torch::Tensor& seqLens, torch::Tensor& indices,
int64_t numRows, int64_t stride0, int64_t stride1) {
// Compute the results on the device.
constexpr int kNumThreadsPerBlock = 512;
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
vllm::topKPerRowDecode<kNumThreadsPerBlock>
<<<numRows, kNumThreadsPerBlock, 0, stream>>>(
logits.data_ptr<float>(), seqLens.data_ptr<int>(),
indices.data_ptr<int>(), static_cast<int>(stride0),
static_cast<int>(stride1), static_cast<int>(next_n));
}
void top_k_per_row(const torch::Tensor& logits, const torch::Tensor& rowStarts,
const torch::Tensor& rowEnds, torch::Tensor& indices,
torch::Tensor& values, int64_t numRows, int64_t stride0,
int64_t stride1) {
int64_t numRows, int64_t stride0, int64_t stride1) {
// Compute the results on the device.
constexpr int kNumThreadsPerBlock = 512;
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
@ -338,6 +362,5 @@ void top_k_per_row(const torch::Tensor& logits, const torch::Tensor& rowStarts,
<<<numRows, kNumThreadsPerBlock, 0, stream>>>(
logits.data_ptr<float>(), rowStarts.data_ptr<int>(),
rowEnds.data_ptr<int>(), indices.data_ptr<int>(),
values.data_ptr<float>(), static_cast<int>(stride0),
static_cast<int>(stride1));
static_cast<int>(stride0), static_cast<int>(stride1));
}
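A minimal host-side usage sketch of the new decode entry point. Assumptions not stated in the diff: logits is a contiguous [num_rows, vocab] float32 CUDA tensor, seq_lens is an int32 tensor on the same device, the declaration of top_k_per_row_decode is in scope, and each output row gets 2048 slots to match the kernel's kTopK.

#include <torch/torch.h>

torch::Tensor topk_decode_indices(const torch::Tensor& logits,
                                  const torch::Tensor& seq_lens,
                                  int64_t next_n) {
  const int64_t num_rows = logits.size(0);
  // One CUDA block per row; 2048 indices per row (kTopK in the kernel).
  auto indices = torch::empty({num_rows, 2048},
                              logits.options().dtype(torch::kInt32));
  top_k_per_row_decode(logits, next_n, seq_lens, indices, num_rows,
                       logits.stride(0), logits.stride(1));
  return indices;
}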

View File

@ -175,12 +175,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
"float epsilon) -> ()");
ops.impl("fused_add_rms_norm", torch::kCUDA, &fused_add_rms_norm);
// Polynomial Normalization.
ops.def(
"poly_norm(Tensor! out, Tensor input, Tensor weight, Tensor bias, float "
"epsilon) -> ()");
ops.impl("poly_norm", torch::kCUDA, &poly_norm);
// Apply repetition penalties to logits in-place
ops.def(
"apply_repetition_penalties_(Tensor! logits, Tensor prompt_mask, "
@ -191,10 +185,16 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
// Optimized top-k per row operation
ops.def(
"top_k_per_row(Tensor logits, Tensor rowStarts, Tensor rowEnds, "
"Tensor! indices, Tensor! values, int numRows, int stride0, "
"Tensor! indices, int numRows, int stride0, "
"int stride1) -> ()");
ops.impl("top_k_per_row", torch::kCUDA, &top_k_per_row);
ops.def(
"top_k_per_row_decode(Tensor logits, int next_n, "
"Tensor seq_lens, Tensor! indices, int numRows, "
"int stride0, int stride1) -> ()");
ops.impl("top_k_per_row_decode", torch::kCUDA, &top_k_per_row_decode);
// Layernorm-quant
// Apply Root Mean Square (RMS) Normalization to the input tensor.
ops.def(
@ -557,7 +557,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
// to prevent the meta function registry.
ops.def(
"gptq_gemm(Tensor a, Tensor b_q_weight, Tensor b_gptq_qzeros, "
"Tensor b_gptq_scales, Tensor b_g_idx, bool use_exllama, int bit) "
"Tensor b_gptq_scales, Tensor b_g_idx, bool use_exllama, bool "
"use_v2_format, int bit) "
"-> Tensor",
{stride_tag});
ops.impl("gptq_gemm", torch::kCUDA, &gptq_gemm);

View File

@ -5,7 +5,7 @@
# docs/contributing/dockerfile/dockerfile.md and
# docs/assets/contributing/dockerfile-stages-dependency.png
ARG CUDA_VERSION=12.8.1
ARG CUDA_VERSION=12.9.1
ARG PYTHON_VERSION=3.12
# By parameterizing the base images, we allow third-party to use their own
@ -132,7 +132,9 @@ WORKDIR /workspace
COPY requirements/common.txt requirements/common.txt
COPY requirements/cuda.txt requirements/cuda.txt
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --python /opt/venv/bin/python3 -r requirements/cuda.txt \
# TODO: remove apache-tvm-ffi once FlashInfer is fixed https://github.com/flashinfer-ai/flashinfer/issues/1962
uv pip install --python /opt/venv/bin/python3 --pre apache-tvm-ffi==0.1.0b15 \
&& uv pip install --python /opt/venv/bin/python3 -r requirements/cuda.txt \
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
# cuda arch list used by torch
@ -229,7 +231,7 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
# Check the size of the wheel if RUN_WHEEL_CHECK is true
COPY .buildkite/check-wheel-size.py check-wheel-size.py
# sync the default value with .buildkite/check-wheel-size.py
ARG VLLM_MAX_SIZE_MB=450
ARG VLLM_MAX_SIZE_MB=500
ENV VLLM_MAX_SIZE_MB=$VLLM_MAX_SIZE_MB
ARG RUN_WHEEL_CHECK=true
RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \
@ -273,6 +275,7 @@ WORKDIR /vllm-workspace
ENV DEBIAN_FRONTEND=noninteractive
ARG TARGETPLATFORM
# TODO (huydhn): There is no prebuilt gdrcopy package on 12.9 at the moment
ARG GDRCOPY_CUDA_VERSION=12.8
# Keep in line with FINAL_BASE_IMAGE
ARG GDRCOPY_OS_VERSION=Ubuntu22_04
@ -353,14 +356,23 @@ RUN --mount=type=cache,target=/root/.cache/uv \
# Install vllm wheel first, so that torch etc will be installed.
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
--mount=type=cache,target=/root/.cache/uv \
uv pip install --system dist/*.whl --verbose \
# TODO: remove apache-tvm-ffi once FlashInfer is fixed https://github.com/flashinfer-ai/flashinfer/issues/1962
uv pip install --system --pre apache-tvm-ffi==0.1.0b15 \
&& uv pip install --system dist/*.whl --verbose \
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
# TODO (huydhn): Remove this once xformers is released for 2.9.0
RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH'
. /etc/environment
export TORCH_CUDA_ARCH_LIST='7.5 8.0+PTX 9.0a'
uv pip install --system --no-build-isolation "git+https://github.com/facebookresearch/xformers@v0.0.32.post2"
BASH
# Install FlashInfer pre-compiled kernel cache and binaries
# https://docs.flashinfer.ai/installation.html
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system flashinfer-cubin==0.4.0 \
&& uv pip install --system flashinfer-jit-cache==0.4.0 \
uv pip install --system flashinfer-cubin==0.4.1 \
&& uv pip install --system flashinfer-jit-cache==0.4.1 \
--extra-index-url https://flashinfer.ai/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
&& flashinfer show-config
@ -422,6 +434,7 @@ ARG PYTHON_VERSION
ARG PIP_INDEX_URL UV_INDEX_URL
ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
ARG PYTORCH_CUDA_INDEX_BASE_URL
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694
@ -434,7 +447,8 @@ ENV UV_LINK_MODE=copy
RUN --mount=type=cache,target=/root/.cache/uv \
CUDA_MAJOR="${CUDA_VERSION%%.*}"; \
if [ "$CUDA_MAJOR" -ge 12 ]; then \
uv pip install --system -r requirements/dev.txt; \
uv pip install --system -r requirements/dev.txt \
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.'); \
fi
# install development dependencies (for testing)

View File

@ -31,7 +31,7 @@ ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
--mount=type=cache,target=/var/lib/apt,sharing=locked \
apt-get update -y \
&& apt-get install -y --no-install-recommends ccache git curl wget ca-certificates \
&& apt-get install -y --no-install-recommends sudo ccache git curl wget ca-certificates \
gcc-12 g++-12 libtcmalloc-minimal4 libnuma-dev ffmpeg libsm6 libxext6 libgl1 jq lsof \
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 \
&& curl -LsSf https://astral.sh/uv/install.sh | sh
@ -79,6 +79,9 @@ RUN echo 'ulimit -c 0' >> ~/.bashrc
######################### BUILD IMAGE #########################
FROM base AS vllm-build
ARG max_jobs=32
ENV MAX_JOBS=${max_jobs}
ARG GIT_REPO_CHECK=0
# Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ...
ARG VLLM_CPU_DISABLE_AVX512=0
@ -104,16 +107,20 @@ RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=cache,target=/root/.cache/ccache \
--mount=type=cache,target=/workspace/vllm/.deps,sharing=locked \
--mount=type=bind,source=.git,target=.git \
VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel
VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38
######################### TEST DEPS #########################
FROM base AS vllm-test-deps
WORKDIR /workspace/vllm
# TODO: Update to 2.9.0 when there is a new build for intel_extension_for_pytorch for that version
RUN --mount=type=bind,src=requirements/test.in,target=requirements/test.in \
cp requirements/test.in requirements/cpu-test.in && \
sed -i '/mamba_ssm/d' requirements/cpu-test.in && \
sed -i 's/^torch==.*/torch==2.8.0/g' requirements/cpu-test.in && \
sed -i 's/torchaudio.*/torchaudio/g' requirements/cpu-test.in && \
sed -i 's/torchvision.*/torchvision/g' requirements/cpu-test.in && \
uv pip compile requirements/cpu-test.in -o requirements/cpu-test.txt --index-strategy unsafe-best-match --torch-backend cpu
RUN --mount=type=cache,target=/root/.cache/uv \

View File

@ -246,7 +246,7 @@ RUN pip install setuptools==75.6.0 packaging==23.2 ninja==1.11.1.3 build==1.2.2.
# build flashinfer for torch nightly from source around 10 mins
# release version: v0.4.0
# release version: v0.4.1
# todo(elainewy): cache flashinfer build result for faster build
ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/ccache \
@ -254,7 +254,7 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
echo "git clone flashinfer..." \
&& git clone --recursive https://github.com/flashinfer-ai/flashinfer.git \
&& cd flashinfer \
&& git checkout v0.4.0 \
&& git checkout v0.4.1 \
&& git submodule update --init --recursive \
&& echo "finish git clone flashinfer..." \
&& rm -rf build \

View File

@ -12,7 +12,7 @@ ENV PYTORCH_ROCM_ARCH=${ARG_PYTORCH_ROCM_ARCH:-${PYTORCH_ROCM_ARCH}}
RUN apt-get update -q -y && apt-get install -q -y \
sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev \
apt-transport-https ca-certificates wget curl
# Remove sccache
# Remove sccache
RUN python3 -m pip install --upgrade pip
RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)"
ARG COMMON_WORKDIR

View File

@ -1,13 +1,13 @@
ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:7.0-complete
ARG TRITON_BRANCH="f9e5bf54"
ARG TRITON_BRANCH="57c693b6"
ARG TRITON_REPO="https://github.com/ROCm/triton.git"
ARG PYTORCH_BRANCH="b2fb6885"
ARG PYTORCH_BRANCH="1c57644d"
ARG PYTORCH_VISION_BRANCH="v0.23.0"
ARG PYTORCH_REPO="https://github.com/ROCm/pytorch.git"
ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
ARG FA_BRANCH="0e60e394"
ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git"
ARG AITER_BRANCH="2ab9f4cd"
ARG AITER_BRANCH="9716b1b8"
ARG AITER_REPO="https://github.com/ROCm/aiter.git"
FROM ${BASE_IMAGE} AS base

View File

@ -20,8 +20,6 @@ API documentation for vLLM's configuration classes.
- [vllm.config.CompilationConfig][]
- [vllm.config.VllmConfig][]
[](){ #offline-inference-api }
## Offline Inference
LLM Class.
@ -45,18 +43,14 @@ Engine classes for offline and online inference.
Inference parameters for vLLM APIs.
[](){ #sampling-params }
- [vllm.SamplingParams][]
- [vllm.PoolingParams][]
[](){ #multi-modality }
## Multi-Modality
vLLM provides experimental support for multi-modal models through the [vllm.multimodal][] package.
Multi-modal inputs can be passed alongside text and token prompts to [supported models][supported-mm-models]
Multi-modal inputs can be passed alongside text and token prompts to [supported models](../models/supported_models.md#list-of-multimodal-language-models)
via the `multi_modal_data` field in [vllm.inputs.PromptType][].
Looking to add your own multi-modal model? Please follow the instructions listed [here](../contributing/model/multimodal.md).

Binary files not shown: one image replaced (119 KiB before and after) and one image added (577 KiB).

View File

@ -5,4 +5,4 @@ nav:
- complete.md
- run-batch.md
- vllm bench:
- bench/*.md
- bench/**/*.md

View File

@ -0,0 +1,9 @@
# vllm bench sweep plot
## JSON CLI Arguments
--8<-- "docs/cli/json_tip.inc.md"
## Options
--8<-- "docs/argparse/bench_sweep_plot.md"

View File

@ -0,0 +1,9 @@
# vllm bench sweep serve
## JSON CLI Arguments
--8<-- "docs/cli/json_tip.inc.md"
## Options
--8<-- "docs/argparse/bench_sweep_serve.md"

View File

@ -0,0 +1,9 @@
# vllm bench sweep serve_sla
## JSON CLI Arguments
--8<-- "docs/cli/json_tip.inc.md"
## Options
--8<-- "docs/argparse/bench_sweep_serve_sla.md"

Some files were not shown because too many files have changed in this diff.