Fix use_ep

Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
[Misc] Human-readable max-model-len cli arg (#16181)
2025-04-07 19:56:41 +00:00 · 2025-04-07 14:40:58 -04:00 · 2025-04-07 18:30:06 +00:00 · 2025-04-07 13:54:36 -04:00 · 2025-04-07 23:15:58 +08:00 · 2025-04-07 08:06:27 -07:00
320 changed files with 15964 additions and 3216 deletions
--- a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
@ -10,15 +10,24 @@ set -x
 set -o pipefail

 check_gpus() {
-  # check the number of GPUs and GPU type.
-  declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
+  if command -v nvidia-smi; then  # NVIDIA tooling is on PATH
+    # Count GPUs as the number of lines printed by `nvidia-smi --list-gpus`.
+    declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
+  elif command -v amd-smi; then  # otherwise fall back to AMD tooling
+    declare -g gpu_count=$(amd-smi list | grep 'GPU' | wc -l)  # count output lines containing 'GPU'
+  fi  # when neither tool is found, gpu_count is not assigned here
+
  if [[ $gpu_count -gt 0 ]]; then
    echo "GPU found."
  else
    echo "Need at least 1 GPU to run benchmarking."
    exit 1
  fi
-  declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
+  if command -v nvidia-smi; then  # same tool detection as for the GPU count
+    declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')  # 2nd field of each name line
+  elif command -v amd-smi; then
+    declare -g gpu_type=$(amd-smi static -g 0 -a | grep 'MARKET_NAME' | awk '{print $2}')  # 2nd field of the MARKET_NAME line(s) for GPU 0
+  fi
  echo "GPU type is $gpu_type"
 }

@ -90,9 +99,15 @@ kill_gpu_processes() {


  # wait until GPU memory usage smaller than 1GB
-  while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
-    sleep 1
-  done
+  if command -v nvidia-smi; then
+    while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
+      sleep 1
+    done
+  elif command -v amd-smi; then
+    while [ "$(amd-smi metric -g 0 | grep 'USED_VRAM' | awk '{print $2}')" -ge 1000 ]; do
+      sleep 1
+    done
+  fi

  # remove vllm config file
  rm -rf ~/.config/vllm
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@ -3,10 +3,10 @@ steps:
    agents:
      queue: cpu_queue_postmerge
    commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag vllm-ci:build-image --target build --progress plain ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-      - "bash .buildkite/upload-wheels.sh"
+      - "bash .buildkite/scripts/upload-wheels.sh"
    env:
      DOCKER_BUILDKIT: "1"

@ -14,10 +14,10 @@ steps:
    agents:
      queue: cpu_queue_postmerge
    commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-      - "bash .buildkite/upload-wheels.sh"
+      - "bash .buildkite/scripts/upload-wheels.sh"
    env:
      DOCKER_BUILDKIT: "1"

@ -31,10 +31,10 @@ steps:
    agents:
      queue: cpu_queue_postmerge
    commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-      - "bash .buildkite/upload-wheels.sh"
+      - "bash .buildkite/scripts/upload-wheels.sh"
    env:
      DOCKER_BUILDKIT: "1"

@ -48,7 +48,7 @@ steps:
      queue: cpu_queue_postmerge
    commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"

  - label: "Build and publish TPU release image"
@ -57,7 +57,7 @@ steps:
    agents:
      queue: tpu_queue_postmerge
    commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f Dockerfile.tpu ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f docker/Dockerfile.tpu ."
      - "docker push vllm/vllm-tpu:nightly"
      - "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT"
    plugins:
@ -82,7 +82,7 @@ steps:
      queue: cpu_queue_postmerge
    commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f Dockerfile.cpu ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
      - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
    env:
      DOCKER_BUILDKIT: "1"
--- a/.buildkite/scripts/hardware_ci/run-amd-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh
@ -105,19 +105,33 @@ fi
 if [[ $commands == *" entrypoints/openai "* ]]; then
  commands=${commands//" entrypoints/openai "/" entrypoints/openai \
  --ignore=entrypoints/openai/test_audio.py \
-  --ignore=entrypoints/openai/test_chat.py \
  --ignore=entrypoints/openai/test_shutdown.py \
  --ignore=entrypoints/openai/test_completion.py \
  --ignore=entrypoints/openai/test_sleep.py \
  --ignore=entrypoints/openai/test_models.py \
+  --ignore=entrypoints/openai/test_lora_adapters.py \
+  --ignore=entrypoints/openai/test_return_tokens_as_ids.py \
+  --ignore=entrypoints/openai/test_root_path.py \
+  --ignore=entrypoints/openai/test_tokenization.py \
  --ignore=entrypoints/openai/test_prompt_validation.py "}
 fi

 #ignore certain Entrypoints/llm tests
-if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then
-  commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "}
+if [[ $commands == *" entrypoints/llm "* ]]; then
+  commands=${commands//" entrypoints/llm "/" entrypoints/llm \
+  --ignore=entrypoints/llm/test_chat.py \
+  --ignore=entrypoints/llm/test_accuracy.py \
+  --ignore=entrypoints/llm/test_init.py \
+  --ignore=entrypoints/llm/test_generate_multiple_loras.py \
+  --ignore=entrypoints/llm/test_prompt_validation.py "}
 fi

+#Obsolete currently
+##ignore certain Entrypoints/llm tests
+#if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then
+#  commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "}
+#fi
+
 # --ignore=entrypoints/openai/test_encoder_decoder.py \
 # --ignore=entrypoints/openai/test_embedding.py \
 # --ignore=entrypoints/openai/test_oot_registration.py
--- a/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
@ -10,5 +10,5 @@ trap remove_docker_container EXIT
 remove_docker_container

 # Try building the docker image
-docker build -t cpu-test -f Dockerfile.ppc64le .
+docker build -t cpu-test -f docker/Dockerfile.ppc64le .

--- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
@ -18,8 +18,8 @@ trap remove_docker_container EXIT
 remove_docker_container

 # Try building the docker image
-numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$BUILDKITE_BUILD_NUMBER" --target vllm-test -f Dockerfile.cpu .
-numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 --target vllm-test -f Dockerfile.cpu .
+numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$BUILDKITE_BUILD_NUMBER" --target vllm-test -f docker/Dockerfile.cpu .
+numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 --target vllm-test -f docker/Dockerfile.cpu .

 # Run the image, setting --shm-size=4g for tensor parallel.
 docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE"  \
--- a/.buildkite/scripts/hardware_ci/run-gh200-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-gh200-test.sh
@ -9,6 +9,7 @@ python3 use_existing_torch.py

 # Try building the docker image
 DOCKER_BUILDKIT=1 docker build . \
+  --file docker/Dockerfile \
  --target vllm-openai \
  --platform "linux/arm64" \
  -t gh200-test \
--- a/.buildkite/scripts/hardware_ci/run-hpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-hpu-test.sh
@ -5,7 +5,7 @@
 set -ex

 # Try building the docker image
-docker build -t hpu-test-env -f Dockerfile.hpu .
+docker build -t hpu-test-env -f docker/Dockerfile.hpu .

 # Setup cleanup
 # certain versions of HPU software stack have a bug that can
--- a/.buildkite/scripts/hardware_ci/run-neuron-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-neuron-test.sh
@ -35,7 +35,7 @@ else
    date "+%s" > /tmp/neuron-docker-build-timestamp
 fi

-docker build -t "${image_name}" -f Dockerfile.neuron .
+docker build -t "${image_name}" -f docker/Dockerfile.neuron .

 # Setup cleanup
 remove_docker_container() {
--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
@ -1,9 +1,9 @@
 #!/bin/bash

-set -e
+set -xue

 # Build the docker image.
-docker build -f Dockerfile.tpu -t vllm-tpu .
+docker build -f docker/Dockerfile.tpu -t vllm-tpu .

 # Set up cleanup.
 remove_docker_container() { docker rm -f tpu-test || true; }
@ -21,6 +21,8 @@ docker run --privileged --net host --shm-size=16G -it \
    && python3 -m pip install lm_eval[api]==0.4.4 \
    && export VLLM_USE_V1=1 \
    && export VLLM_XLA_CHECK_RECOMPILATION=1 \
+    && echo TEST_0 \
+    && pytest -v -s /workspace/vllm/tests/v1/tpu/test_perf.py \
    && echo TEST_1 \
    && pytest -v -s /workspace/vllm/tests/tpu/test_compilation.py \
    && echo TEST_2 \
@ -34,7 +36,11 @@ docker run --privileged --net host --shm-size=16G -it \
    && echo TEST_6 \
    && pytest -s -v /workspace/vllm/tests/v1/tpu/worker/test_tpu_model_runner.py \
    && echo TEST_7 \
-    && pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py" \
+    && pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py \
+    && echo TEST_8 \
+    && pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py \
+    && echo TEST_9 \
+    && pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py" \


 # TODO: This test fails because it uses RANDOM_SEED sampling
--- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
@ -8,7 +8,7 @@ image_name="xpu/vllm-ci:${BUILDKITE_COMMIT}"
 container_name="xpu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"

 # Try building the docker image
-docker build -t ${image_name} -f Dockerfile.xpu .
+docker build -t ${image_name} -f docker/Dockerfile.xpu .

 # Setup cleanup
 remove_docker_container() { 
--- a/.buildkite/scripts/run-benchmarks.sh
+++ b/.buildkite/scripts/run-benchmarks.sh
@ -5,8 +5,8 @@
 set -ex
 set -o pipefail

-# cd into parent directory of this file
-cd "$(dirname "${BASH_SOURCE[0]}")/.."
+# cd two levels up from this script's directory
+cd "$(dirname "${BASH_SOURCE[0]}")/../.."

 (which wget && which curl) || (apt-get update && apt-get install -y wget curl)

--- a/.buildkite/scripts/run-multi-node-test.sh
+++ b/.buildkite/scripts/run-multi-node-test.sh
@ -3,7 +3,7 @@
 set -euox pipefail

 if [[ $# -lt 4 ]]; then
-    echo "Usage: .buildkite/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN"
+    echo "Usage: .buildkite/scripts/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN"
    exit 1
 fi

--- a/.buildkite/scripts/upload-wheels.sh
+++ b/.buildkite/scripts/upload-wheels.sh
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@ -104,7 +104,7 @@ steps:
 - label: Entrypoints Test # 40min
  working_dir: "/vllm-workspace/tests"
  fast_check: true
-  mirror_hardwares: [amd]
+  #mirror_hardwares: [amd]
  source_file_dependencies:
  - vllm/
  - tests/entrypoints/llm
@ -155,6 +155,7 @@ steps:
  - popd

 - label: Metrics, Tracing Test # 10min
+  mirror_hardwares: [amd]
  num_gpus: 2
  source_file_dependencies:
  - vllm/
@ -173,7 +174,7 @@ steps:
 #####  1 GPU test  #####

 - label: Regression Test # 5min
-  mirror_hardwares: [amd]
+  #mirror_hardwares: [amd]
  source_file_dependencies:
  - vllm/
  - tests/test_regression
@ -204,7 +205,6 @@ steps:
  commands:
    # split the test to avoid interference
    - pytest -v -s v1/core
-    - pytest -v -s v1/entrypoints
    - pytest -v -s v1/engine
    - pytest -v -s v1/entrypoints
    - pytest -v -s v1/sample
@ -285,11 +285,11 @@ steps:
    - pytest -v -s spec_decode/e2e/test_eagle_correctness.py

 - label: LoRA Test %N # 15min each
-  mirror_hardwares: [amd]
+  #mirror_hardwares: [amd]
  source_file_dependencies:
  - vllm/lora
  - tests/lora
-  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_minicpmv_tp.py  --ignore=lora/test_transfomers_model.py
+  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py
  parallelism: 4

 - label: PyTorch Fullgraph Smoke Test # 9min
@ -311,7 +311,7 @@ steps:
  - pytest -v -s compile/test_full_graph.py

 - label: Kernels Test %N # 1h each
-  mirror_hardwares: [amd]
+  # mirror_hardwares: [amd]
  source_file_dependencies:
  - csrc/
  - vllm/attention
@ -321,7 +321,7 @@ steps:
  parallelism: 4

 - label: Tensorizer Test # 11min
-  mirror_hardwares: [amd]
+  # mirror_hardwares: [amd]
  soft_fail: true
  source_file_dependencies:
  - vllm/model_executor/model_loader
@ -337,7 +337,7 @@ steps:
  source_file_dependencies:
  - benchmarks/
  commands:
-  - bash run-benchmarks.sh
+  - bash scripts/run-benchmarks.sh

 - label: Quantization Test # 33min
  source_file_dependencies:
@ -372,7 +372,7 @@ steps:

 - label: OpenAI-Compatible Tool Use # 20 min
  fast_check: false
-  mirror_hardwares: [ amd ]
+  #mirror_hardwares: [ amd ]
  source_file_dependencies:
    - vllm/
    - tests/tool_use
@ -389,7 +389,8 @@ steps:
    - pytest -v -s models/test_transformers.py
    - pytest -v -s models/test_registry.py
    # V1 Test: https://github.com/vllm-project/vllm/issues/14531
-    - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py
+    - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4'
+    - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'llama4'

 - label: Language Models Test (Standard) # 32min
  #mirror_hardwares: [amd]
@ -464,6 +465,7 @@ steps:

 # This test is used only in PR development phase to test individual models and should never run on main
 - label: Custom Models Test
+  mirror_hardwares: [amd]
  optional: true
  commands:
    - echo 'Testing custom models...'
@ -475,6 +477,7 @@ steps:
 #####  multi gpus test  #####

 - label: Distributed Comm Ops Test # 7min
+  mirror_hardwares: [amd]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  source_file_dependencies:
@ -602,8 +605,6 @@ steps:
    # requires multi-GPU testing for validation.
    - pytest -v -s -x lora/test_chatglm3_tp.py
    - pytest -v -s -x lora/test_llama_tp.py
-    - pytest -v -s -x lora/test_minicpmv_tp.py
-    - pytest -v -s -x lora/test_transfomers_model.py


 - label: Weight Loading Multiple GPU Test  # 33min
--- a/.github/mergify.yml
+++ b/.github/mergify.yml
@ -19,7 +19,7 @@ pull_request_rules:
      - files~=\.buildkite/
      - files~=^cmake/
      - files=CMakeLists.txt
-      - files~=^Dockerfile
+      - files~=^docker/Dockerfile
      - files~=^requirements.*\.txt
      - files=setup.py
  actions:
--- a/.github/workflows/lint-and-deploy.yaml
+++ b/.github/workflows/lint-and-deploy.yaml
@ -50,7 +50,7 @@ jobs:
        uses: helm/kind-action@a1b0e391336a6ee6713a0583f8c6240d70863de3 # v1.12.0

      - name: Build the Docker image vllm cpu
-        run: docker buildx build -f Dockerfile.cpu -t vllm-cpu-env .
+        run: docker buildx build -f docker/Dockerfile.cpu -t vllm-cpu-env .

      - name: Configuration of docker images, network and namespace for the kind cluster
        run: |
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -1,3 +1,6 @@
+default_install_hook_types:
+  - pre-commit
+  - commit-msg
 default_stages:
  - pre-commit # Run locally
  - manual # Run in CI
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -44,7 +44,7 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1
 #
 # Note: the CUDA torch version is derived from pyproject.toml and various
 # requirements.txt files and should be kept consistent.  The ROCm torch
-# versions are derived from Dockerfile.rocm
+# versions are derived from docker/Dockerfile.rocm
 #
 set(TORCH_SUPPORTED_VERSION_CUDA "2.6.0")
 set(TORCH_SUPPORTED_VERSION_ROCM "2.6.0")
@ -242,6 +242,7 @@ set(VLLM_EXT_SRC
  "csrc/quantization/gguf/gguf_kernel.cu"
  "csrc/cuda_utils_kernels.cu"
  "csrc/prepare_inputs/advance_step.cu"
+  "csrc/custom_all_reduce.cu"
  "csrc/torch_bindings.cpp")

 if(VLLM_GPU_LANG STREQUAL "CUDA")
@ -283,7 +284,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    "csrc/mamba/causal_conv1d/causal_conv1d.cu"
    "csrc/quantization/aqlm/gemm_kernels.cu"
    "csrc/quantization/awq/gemm_kernels.cu"
-    "csrc/custom_all_reduce.cu"
    "csrc/permute_cols.cu"
    "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
    "csrc/quantization/fp4/nvfp4_quant_entry.cu"
--- a/README.md
+++ b/README.md
@ -15,14 +15,12 @@ Easy, fast, and cheap LLM serving for everyone

 ---

-[2025/03] We are collaborating with Ollama to host an [Inference Night](https://lu.ma/vllm-ollama) at Y Combinator in San Francisco on Thursday, March 27, at 6 PM. Discuss all things inference local or data center!
-
 [2025/04] We're hosting our first-ever *vLLM Asia Developer Day* in Singapore on *April 3rd*! This is a full-day event (9 AM - 9 PM SGT) in partnership with SGInnovate, AMD, and Embedded LLM. Meet the vLLM team and learn about LLM inference for RL, MI300X, and more! [Register Now](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)

 ---

 *Latest News* 🔥
-
+- [2025/03] We hosted [vLLM x Ollama Inference Night](https://lu.ma/vllm-ollama)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/16T2PDD1YwRnZ4Tu8Q5r6n53c5Lr5c73UV9Vd2_eBo4U/edit?usp=sharing).
 - [2025/03] We hosted [the first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit?usp=sharing).
 - [2025/03] We hosted [the East Coast vLLM Meetup](https://lu.ma/7mu4k4xx)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0).
 - [2025/02] We hosted [the ninth vLLM meetup](https://lu.ma/h7g3kuj9) with Meta! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing) and AMD [here](https://drive.google.com/file/d/1Zk5qEJIkTmlQ2eQcXQZlljAx3m9s7nwn/view?usp=sharing). The slides from Meta will not be posted.
@ -103,7 +101,7 @@ Visit our [documentation](https://docs.vllm.ai/en/latest/) to learn more.
 ## Contributing

 We welcome and value any contributions and collaborations.
-Please check out [CONTRIBUTING.md](./CONTRIBUTING.md) for how to get involved.
+Please check out [Contributing to vLLM](https://docs.vllm.ai/en/stable/contributing/overview.html) for how to get involved.

 ## Sponsors

@ -126,6 +124,7 @@ Compute Resources:
 - Databricks
 - DeepInfra
 - Google Cloud
+- Intel
 - Lambda Lab
 - Nebius
 - Novita AI
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@ -51,6 +51,12 @@ become available.
      <td style="text-align: center;">✅</td>
      <td style="text-align: center;">✅</td>
      <td><code>likaixin/InstructCoder</code></td>
+    </tr>
+    <tr>
+      <td><strong>HuggingFace-AIMO</strong></td>
+      <td style="text-align: center;">✅</td>
+      <td style="text-align: center;">✅</td>
+      <td><code>AI-MO/aimo-validation-aime</code> , <code>AI-MO/NuminaMath-1.5</code>, <code>AI-MO/NuminaMath-CoT</code></td>
    </tr>
    <tr>
      <td><strong>HuggingFace-Other</strong></td>
@ -187,6 +193,35 @@ python3 vllm/benchmarks/benchmark_serving.py \
  --num-prompts 10
 ```

+**`AI-MO/aimo-validation-aime`**
+
+``` bash
+python3 vllm/benchmarks/benchmark_serving.py \
+    --model Qwen/QwQ-32B \
+    --dataset-name hf \
+    --dataset-path AI-MO/aimo-validation-aime \
+    --num-prompts 10 \
+    --seed 42
+```
+
+### Running With Sampling Parameters
+
+When using OpenAI-compatible backends such as `vllm`, optional sampling
+parameters can be specified. Example client command:
+
+```bash
+python3 vllm/benchmarks/benchmark_serving.py \
+  --backend vllm \
+  --model NousResearch/Hermes-3-Llama-3.1-8B \
+  --endpoint /v1/completions \
+  --dataset-name sharegpt \
+  --dataset-path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
+  --top-k 10 \
+  --top-p 0.9 \
+  --temperature 0.5 \
+  --num-prompts 10
+```
+
 ---
 ## Example - Offline Throughput Benchmark

@ -278,6 +313,18 @@ python3 vllm/benchmarks/benchmark_throughput.py \
  --num-prompts 10
 ```

+**`AI-MO/aimo-validation-aime`**
+
+```bash
+python3 vllm/benchmarks/benchmark_throughput.py \
+  --model Qwen/QwQ-32B \
+  --backend vllm \
+  --dataset-name hf \
+  --dataset-path AI-MO/aimo-validation-aime \
+  --hf-split train \
+  --num-prompts 10
+```
+
 ### Benchmark with LoRA Adapters

 ``` bash
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@ -219,7 +219,15 @@ async def async_request_deepspeed_mii(
                if response.status == 200:
                    parsed_resp = await response.json()
                    output.latency = time.perf_counter() - st
-                    output.generated_text = parsed_resp["text"][0]
+                    # Mark the request successful only when a known response
+                    # shape was parsed; an unconditional success flag after
+                    # this chain would overwrite the failure set in `else`.
+                    if "choices" in parsed_resp:
+                        output.generated_text = parsed_resp["choices"][0][
+                            "text"]
+                        output.success = True
+                    elif "text" in parsed_resp:
+                        output.generated_text = parsed_resp["text"][0]
+                        output.success = True
+                    else:
+                        output.error = ("Unexpected response format: "
+                                        "neither 'choices' nor 'text' found")
+                        output.success = False
-                    output.success = True
                else:
                    output.error = response.reason or ""
@ -489,3 +497,9 @@ ASYNC_REQUEST_FUNCS = {
    "scalellm": async_request_openai_completions,
    "sglang": async_request_openai_completions,
 }
+
+OPENAI_COMPATIBLE_BACKENDS = [  # backend names served by the OpenAI funcs
+    k for k, v in ASYNC_REQUEST_FUNCS.items()
+    if v in (async_request_openai_completions,
+             async_request_openai_chat_completions)
+]
--- a/benchmarks/benchmark_dataset.py
+++ b/benchmarks/benchmark_dataset.py
@ -582,15 +582,6 @@ class HuggingFaceDataset(BenchmarkDataset):
    ) -> None:
        super().__init__(dataset_path=dataset_path, **kwargs)

-        # Validate dataset path
-        if self.SUPPORTED_DATASET_PATHS and \
-            self.dataset_path not in self.SUPPORTED_DATASET_PATHS:
-            raise ValueError(
-                f"{self.__class__.__name__} "
-                f"only supports: {', '.join(self.SUPPORTED_DATASET_PATHS)}. "
-                "Please consider contributing if you would "
-                "like to add support for additional dataset formats.")
-
        self.dataset_split = dataset_split
        self.dataset_subset = dataset_subset
        self.load_data()
@ -761,3 +752,52 @@ class InstructCoderDataset(HuggingFaceDataset):
                ))
        self.maybe_oversample_requests(sampled_requests, num_requests)
        return sampled_requests
+
+
+# -----------------------------------------------------------------------------
+# AIMO Dataset Implementation
+# -----------------------------------------------------------------------------
+
+
+class AIMODataset(HuggingFaceDataset):
+    """
+    Dataset class for an AIMO reasoning dataset ('problem'/'solution' items).
+    """
+    SUPPORTED_DATASET_PATHS = {
+        "AI-MO/aimo-validation-aime", "AI-MO/NuminaMath-1.5",
+        "AI-MO/NuminaMath-CoT"
+    }
+
+    def sample(self,
+               tokenizer: PreTrainedTokenizerBase,
+               num_requests: int,
+               output_len: Optional[int] = None,
+               **kwargs) -> list:
+        sampled_requests = []
+        dynamic_output = output_len is None  # no fixed length requested
+
+        for item in self.data:
+            if len(sampled_requests) >= num_requests:
+                break  # enough requests collected
+            prompt, completion = item['problem'], item["solution"]
+
+            prompt_ids = tokenizer(prompt).input_ids
+            completion_ids = tokenizer(completion).input_ids
+            prompt_len = len(prompt_ids)
+            completion_len = len(completion_ids)
+            output_len = completion_len if dynamic_output else output_len
+            assert isinstance(output_len, int) and output_len > 0
+            if dynamic_output and not is_valid_sequence(prompt_len,
+                                                        completion_len,
+                                                        max_prompt_len=2048,
+                                                        max_total_len=32000):
+                continue  # only filtered when output length is dynamic
+            sampled_requests.append(
+                SampleRequest(
+                    prompt=prompt,
+                    prompt_len=prompt_len,
+                    expected_output_len=output_len,
+                    multi_modal_data=None,
+                ))
+        self.maybe_oversample_requests(sampled_requests, num_requests)
+        return sampled_requests
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@ -34,7 +34,8 @@ from datetime import datetime
 from typing import Any, Optional

 import numpy as np
-from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,
+from backend_request_func import (ASYNC_REQUEST_FUNCS,
+                                  OPENAI_COMPATIBLE_BACKENDS, RequestFuncInput,
                                  RequestFuncOutput)
 from tqdm.asyncio import tqdm
 from transformers import PreTrainedTokenizerBase
@ -49,7 +50,8 @@ try:
 except ImportError:
    from argparse import ArgumentParser as FlexibleArgumentParser

-from benchmark_dataset import (BurstGPTDataset, ConversationDataset,
+from benchmark_dataset import (AIMODataset, BurstGPTDataset,
+                               ConversationDataset, HuggingFaceDataset,
                               InstructCoderDataset, RandomDataset,
                               SampleRequest, ShareGPTDataset, SonnetDataset,
                               VisionArenaDataset)
@ -259,6 +261,7 @@ async def benchmark(
    goodput_config_dict: dict[str, float],
    max_concurrency: Optional[int],
    lora_modules: Optional[Iterable[str]],
+    extra_body: Optional[dict],
 ):
    if backend in ASYNC_REQUEST_FUNCS:
        request_func = ASYNC_REQUEST_FUNCS[backend]
@ -286,6 +289,7 @@ async def benchmark(
        logprobs=logprobs,
        multi_modal_content=test_mm_content,
        ignore_eos=ignore_eos,
+        extra_body=extra_body,
    )

    test_output = await request_func(request_func_input=test_input)
@ -312,7 +316,8 @@ async def benchmark(
                                         output_len=test_output_len,
                                         logprobs=logprobs,
                                         multi_modal_content=test_mm_content,
-                                         ignore_eos=ignore_eos)
+                                         ignore_eos=ignore_eos,
+                                         extra_body=extra_body)
        profile_output = await request_func(request_func_input=profile_input)
        if profile_output.success:
            print("Profiler started")
@ -362,7 +367,8 @@ async def benchmark(
                                              output_len=output_len,
                                              logprobs=logprobs,
                                              multi_modal_content=mm_content,
-                                              ignore_eos=ignore_eos)
+                                              ignore_eos=ignore_eos,
+                                              extra_body=extra_body)
        tasks.append(
            asyncio.create_task(
                limited_request_func(request_func_input=request_func_input,
@ -595,14 +601,28 @@ def main(args: argparse.Namespace):
            args.hf_split = "train"
        elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS:
            dataset_class = ConversationDataset
+        elif args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS:
+            dataset_class = AIMODataset
+            args.hf_split = "train"
+        else:
+            supported_datasets = set([
+                dataset_name for cls in HuggingFaceDataset.__subclasses__()
+                for dataset_name in cls.SUPPORTED_DATASET_PATHS
+            ])
+            raise ValueError(
+                f"Unsupported dataset path: {args.dataset_path}. "
+                "Huggingface dataset only supports dataset_path"
+                f" from one of following: {supported_datasets}. "
+                "Please consider contributing if you would "
+                "like to add support for additional dataset formats.")
        input_requests = dataset_class(
            dataset_path=args.dataset_path,
            dataset_subset=args.hf_subset,
            dataset_split=args.hf_split,
+            random_seed=args.seed,
        ).sample(
            num_requests=args.num_prompts,
            tokenizer=tokenizer,
-            random_seed=args.seed,
            output_len=args.hf_output_len,
        )

@ -637,6 +657,26 @@ def main(args: argparse.Namespace):
            raise ValueError(f"Unknown dataset: {args.dataset_name}") from err
    goodput_config_dict = check_goodput_args(args)

+    # Collect the sampling parameters.
+    sampling_params = {
+        k: v
+        for k, v in {
+            "top_p": args.top_p,
+            "top_k": args.top_k,
+            "min_p": args.min_p,
+            "temperature": args.temperature
+        }.items() if v is not None
+    }
+
+    # Sampling parameters are only supported by openai-compatible backend.
+    if sampling_params and args.backend not in OPENAI_COMPATIBLE_BACKENDS:
+        raise ValueError(
+            "Sampling parameters are only supported by openai-compatible "
+            "backends.")
+
+    if "temperature" not in sampling_params:
+        sampling_params["temperature"] = 0.0  # Default to greedy decoding.
+
    # Avoid GC processing "static" data - reduce pause times.
    gc.collect()
    gc.freeze()
@ -663,6 +703,7 @@ def main(args: argparse.Namespace):
            goodput_config_dict=goodput_config_dict,
            max_concurrency=args.max_concurrency,
            lora_modules=args.lora_modules,
+            extra_body=sampling_params,
        ))

    # Save config and results to json
@ -985,6 +1026,33 @@ if __name__ == "__main__":
        "from the sampled HF dataset.",
    )

+    sampling_group = parser.add_argument_group("sampling parameters")
+    sampling_group.add_argument(
+        "--top-p",
+        type=float,
+        default=None,
+        help="Top-p sampling parameter. Only has effect on openai-compatible "
+        "backends.")
+    sampling_group.add_argument(
+        "--top-k",
+        type=int,
+        default=None,
+        help="Top-k sampling parameter. Only has effect on openai-compatible "
+        "backends.")
+    sampling_group.add_argument(
+        "--min-p",
+        type=float,
+        default=None,
+        help="Min-p sampling parameter. Only has effect on openai-compatible "
+        "backends.")
+    sampling_group.add_argument(
+        "--temperature",
+        type=float,
+        default=None,
+        help="Temperature sampling parameter. Only has effect on "
+        "openai-compatible backends. If not specified, default to greedy "
+        "decoding (i.e. temperature==0.0).")
+
    parser.add_argument(
        '--tokenizer-mode',
        type=str,
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@ -11,10 +11,10 @@ from typing import Any, Optional, Union

 import torch
 import uvloop
-from benchmark_dataset import (BurstGPTDataset, ConversationDataset,
-                               InstructCoderDataset, RandomDataset,
-                               SampleRequest, ShareGPTDataset, SonnetDataset,
-                               VisionArenaDataset)
+from benchmark_dataset import (AIMODataset, BurstGPTDataset,
+                               ConversationDataset, InstructCoderDataset,
+                               RandomDataset, SampleRequest, ShareGPTDataset,
+                               SonnetDataset, VisionArenaDataset)
 from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
 from tqdm import tqdm
 from transformers import (AutoModelForCausalLM, AutoTokenizer,
@ -332,7 +332,10 @@ def get_requests(args, tokenizer):
            common_kwargs['dataset_subset'] = args.hf_subset
            common_kwargs['dataset_split'] = args.hf_split
            sample_kwargs["enable_multimodal_chat"] = True
-
+        elif args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS:
+            dataset_cls = AIMODataset
+            common_kwargs['dataset_subset'] = None
+            common_kwargs['dataset_split'] = "train"
    else:
        raise ValueError(f"Unknown dataset name: {args.dataset_name}")
    # Remove None values
@ -467,12 +470,13 @@ def validate_args(args):
                since --dataset-name is not 'hf'.",
                      stacklevel=2)
    elif args.dataset_name == "hf":
-        if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS:
-            assert args.backend == "vllm-chat", "VisionArenaDataset needs to use vllm-chat as the backend."  #noqa: E501
-        elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS:
-            assert args.backend == "vllm", "InstructCoder dataset needs to use vllm as the backend."  #noqa: E501
-        elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS:
-            assert args.backend == "vllm-chat", "ConversationDataset needs to use vllm-chat as the backend."  #noqa: E501
+        if args.dataset_path in (
+                VisionArenaDataset.SUPPORTED_DATASET_PATHS.keys()
+                | ConversationDataset.SUPPORTED_DATASET_PATHS):
+            assert args.backend == "vllm-chat", f"{args.dataset_path} needs to use vllm-chat as the backend."  #noqa: E501
+        elif args.dataset_path in (InstructCoderDataset.SUPPORTED_DATASET_PATHS
+                                   | AIMODataset.SUPPORTED_DATASET_PATHS):
+            assert args.backend == "vllm", f"{args.dataset_path} needs to use vllm as the backend."  #noqa: E501
        else:
            raise ValueError(
                f"{args.dataset_path} is not supported by hf dataset.")
--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@ -30,19 +30,18 @@ class BenchmarkConfig(TypedDict):
    num_stages: int


-def benchmark_config(
-    config: BenchmarkConfig,
-    num_tokens: int,
-    num_experts: int,
-    shard_intermediate_size: int,
-    hidden_size: int,
-    topk: int,
-    dtype: torch.dtype,
-    use_fp8_w8a8: bool,
-    use_int8_w8a16: bool,
-    num_iters: int = 100,
-    block_quant_shape: List[int] = None,
-) -> float:
+def benchmark_config(config: BenchmarkConfig,
+                     num_tokens: int,
+                     num_experts: int,
+                     shard_intermediate_size: int,
+                     hidden_size: int,
+                     topk: int,
+                     dtype: torch.dtype,
+                     use_fp8_w8a8: bool,
+                     use_int8_w8a16: bool,
+                     num_iters: int = 100,
+                     block_quant_shape: List[int] = None,
+                     use_deep_gemm: bool = False) -> float:
    init_dtype = torch.float16 if use_fp8_w8a8 else dtype
    x = torch.randn(num_tokens, hidden_size, dtype=dtype)
    if use_int8_w8a16:
@ -115,22 +114,41 @@ def benchmark_config(
    def run():
        from vllm.model_executor.layers.fused_moe import override_config
        with override_config(config):
-            fused_moe(
-                x,
-                w1,
-                w2,
-                input_gating,
-                topk,
-                renormalize=True,
-                inplace=True,
-                use_fp8_w8a8=use_fp8_w8a8,
-                use_int8_w8a16=use_int8_w8a16,
-                w1_scale=w1_scale,
-                w2_scale=w2_scale,
-                a1_scale=a1_scale,
-                a2_scale=a2_scale,
-                block_shape=block_quant_shape,
-            )
+            if use_deep_gemm:
+                topk_weights, topk_ids = fused_topk(x, input_gating, topk,
+                                                    False)
+                return fused_experts(
+                    x,
+                    w1,
+                    w2,
+                    topk_weights,
+                    topk_ids,
+                    inplace=True,
+                    use_fp8_w8a8=use_fp8_w8a8,
+                    w1_scale=w1_scale,
+                    w2_scale=w2_scale,
+                    a1_scale=a1_scale,
+                    a2_scale=a2_scale,
+                    block_shape=block_quant_shape,
+                    allow_deep_gemm=True,
+                )
+            else:
+                fused_moe(
+                    x,
+                    w1,
+                    w2,
+                    input_gating,
+                    topk,
+                    renormalize=True,
+                    inplace=True,
+                    use_fp8_w8a8=use_fp8_w8a8,
+                    use_int8_w8a16=use_int8_w8a16,
+                    w1_scale=w1_scale,
+                    w2_scale=w2_scale,
+                    a1_scale=a1_scale,
+                    a2_scale=a2_scale,
+                    block_shape=block_quant_shape,
+                )

    # JIT compilation & warmup
    run()
@ -366,6 +384,7 @@ class BenchmarkWorker:
        use_fp8_w8a8: bool,
        use_int8_w8a16: bool,
        block_quant_shape: List[int] = None,
+        use_deep_gemm: bool = False,
    ) -> tuple[dict[str, int], float]:
        current_platform.seed_everything(self.seed)
        dtype_str = get_config_dtype_str(dtype,
@ -396,7 +415,8 @@ class BenchmarkWorker:
                                       use_fp8_w8a8,
                                       use_int8_w8a16,
                                       num_iters=100,
-                                       block_quant_shape=block_quant_shape)
+                                       block_quant_shape=block_quant_shape,
+                                       use_deep_gemm=use_deep_gemm)
        return config, kernel_time

    def tune(
@ -411,6 +431,7 @@ class BenchmarkWorker:
        use_int8_w8a16: bool,
        search_space: list[dict[str, int]],
        block_quant_shape: list[int],
+        use_deep_gemm: bool,
    ) -> dict[str, int]:
        best_config = None
        best_time = float("inf")
@ -436,7 +457,8 @@ class BenchmarkWorker:
                        use_fp8_w8a8,
                        use_int8_w8a16,
                        num_iters=20,
-                        block_quant_shape=block_quant_shape)
+                        block_quant_shape=block_quant_shape,
+                        use_deep_gemm=use_deep_gemm)
                except triton.runtime.autotuner.OutOfResources:
                    # Some configurations may be invalid and fail to compile.
                    continue
@ -531,6 +553,9 @@ def main(args: argparse.Namespace):
        intermediate_size = config.moe_intermediate_size
        shard_intermediate_size = 2 * intermediate_size // args.tp_size
    else:
+        if not hasattr(config, "hidden_size"):
+            # Support for llama4
+            config = config.text_config
        # Default: Mixtral.
        E = config.num_local_experts
        topk = config.num_experts_per_tok
@ -550,6 +575,8 @@ def main(args: argparse.Namespace):
    else:
        batch_sizes = [args.batch_size]

+    use_deep_gemm = bool(args.use_deep_gemm)
+
    ray.init()
    num_gpus = int(ray.available_resources()["GPU"])
    workers = [BenchmarkWorker.remote(args.seed) for _ in range(num_gpus)]
@ -572,10 +599,10 @@ def main(args: argparse.Namespace):

        start = time.time()
        configs = _distribute(
-            "tune",
-            [(batch_size, E, shard_intermediate_size, hidden_size, topk, dtype,
-              use_fp8_w8a8, use_int8_w8a16, search_space, block_quant_shape)
-             for batch_size in batch_sizes])
+            "tune", [(batch_size, E, shard_intermediate_size, hidden_size,
+                      topk, dtype, use_fp8_w8a8, use_int8_w8a16, search_space,
+                      block_quant_shape, use_deep_gemm)
+                     for batch_size in batch_sizes])
        best_configs = {
            M: sort_config(config)
            for M, config in zip(batch_sizes, configs)
@ -589,7 +616,7 @@ def main(args: argparse.Namespace):
        outputs = _distribute(
            "benchmark",
            [(batch_size, E, shard_intermediate_size, hidden_size, topk, dtype,
-              use_fp8_w8a8, use_int8_w8a16, block_quant_shape)
+              use_fp8_w8a8, use_int8_w8a16, block_quant_shape, use_deep_gemm)
             for batch_size in batch_sizes])

        for batch_size, (config, kernel_time) in zip(batch_sizes, outputs):
@ -611,6 +638,7 @@ if __name__ == "__main__":
                        type=str,
                        choices=["auto", "fp8_w8a8", "int8_w8a16"],
                        default="auto")
+    parser.add_argument("--use-deep-gemm", action="store_true")
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--batch-size", type=int, required=False)
    parser.add_argument("--tune", action="store_true")
--- a/cmake/cpu_extension.cmake
+++ b/cmake/cpu_extension.cmake
@ -33,8 +33,6 @@ endif()

 if(MACOSX_FOUND)
    list(APPEND CXX_COMPILE_FLAGS
-        "-Xpreprocessor"
-        "-fopenmp"
        "-DVLLM_CPU_EXTENSION")
 else()
    list(APPEND CXX_COMPILE_FLAGS
@ -197,6 +195,7 @@ set(VLLM_EXT_SRC
 if (AVX512_FOUND AND NOT AVX512_DISABLED)
    set(VLLM_EXT_SRC
        "csrc/cpu/quant.cpp"
+        "csrc/cpu/shm.cpp"
        ${VLLM_EXT_SRC})
 endif()

--- a/collect_env.py
+++ b/collect_env.py
@ -482,16 +482,28 @@ def get_pip_packages(run_lambda, patterns=None):
    if patterns is None:
        patterns = DEFAULT_PIP_PATTERNS

-    # People generally have `pip` as `pip` or `pip3`
-    # But here it is invoked as `python -mpip`
-    def run_with_pip(pip):
-        out = run_and_read_all(run_lambda, pip + ["list", "--format=freeze"])
+    def run_with_pip():
+        """Return the `pip list --format=freeze` lines that match `patterns`.
+
+        Uses `<python> -m pip` when pip is importable by this interpreter and
+        falls back to `uv pip` when the `UV` environment variable is set.
+
+        Raises:
+            RuntimeError: if neither pip nor uv can be used.
+        """
+        try:
+            import importlib.util
+            # Probe for pip without importing it.
+            pip_spec = importlib.util.find_spec('pip')
+            pip_available = pip_spec is not None
+        except ImportError:
+            pip_available = False
+
+        if pip_available:
+            cmd = [sys.executable, '-mpip', 'list', '--format=freeze']
+        elif os.environ.get("UV") is not None:
+            print("uv is set")
+            cmd = ["uv", "pip", "list", "--format=freeze"]
+        else:
+            raise RuntimeError("Could not collect pip list output (pip or uv module not available)")
+
+        out = run_and_read_all(run_lambda, cmd)
+        # Keep only the packages whose line contains one of the patterns.
        return "\n".join(line for line in out.splitlines()
                         if any(name in line for name in patterns))

    pip_version = 'pip3' if sys.version[0] == '3' else 'pip'
-    out = run_with_pip([sys.executable, '-mpip'])
-
+    out = run_with_pip()
    return pip_version, out


--- a/csrc/cpu/cpu_types_x86.hpp
+++ b/csrc/cpu/cpu_types_x86.hpp
@ -78,9 +78,14 @@ struct FP16Vec16 : public Vec<FP16Vec16> {

  __m256i reg;

+  // normal load
  explicit FP16Vec16(const void* ptr)
      : reg((__m256i)_mm256_loadu_si256((__m256i*)ptr)) {}

+  // non-temporal load
+  explicit FP16Vec16(bool, void* ptr)
+      : reg(_mm256_stream_load_si256((__m256i*)ptr)) {}
+
  explicit FP16Vec16(const FP32Vec16&);

  void save(void* ptr) const { *reinterpret_cast<__m256i*>(ptr) = reg; }
@ -110,9 +115,14 @@ struct BF16Vec16 : public Vec<BF16Vec16> {

  __m256i reg;

+  // normal load
  explicit BF16Vec16(const void* ptr)
      : reg((__m256i)_mm256_loadu_si256((__m256i*)ptr)) {}

+  // non-temporal load
+  explicit BF16Vec16(bool, void* ptr)
+      : reg(_mm256_stream_load_si256((__m256i*)ptr)) {}
+
  explicit BF16Vec16(const FP32Vec16&);

  void save(void* ptr) const { *reinterpret_cast<__m256i*>(ptr) = reg; }
@ -313,8 +323,13 @@ struct FP32Vec16 : public Vec<FP32Vec16> {

  explicit FP32Vec16() : reg(_mm512_set1_ps(0.0)) {}

+  // normal load
  explicit FP32Vec16(const float* ptr) : reg(_mm512_loadu_ps(ptr)) {}

+  // non-temporal load
+  explicit FP32Vec16(bool, void* ptr)
+      : reg((__m512)_mm512_stream_load_si512(ptr)) {}
+
  explicit FP32Vec16(__m512 data) : reg(data) {}

  explicit FP32Vec16(const FP32Vec4& data)
@ -547,6 +562,33 @@ struct INT8Vec16 : public Vec<INT8Vec16> {
    _mm_mask_storeu_epi8(ptr, mask, reg);
  }
 };
+
+// 64 int8 lanes held in a single 512-bit register.
+struct INT8Vec64 : public Vec<INT8Vec64> {
+  constexpr static int VEC_ELEM_NUM = 64;
+  union AliasReg {
+    __m512i reg;
+    int8_t values[VEC_ELEM_NUM];
+  };
+
+  __m512i reg;
+
+  // normal load
+  explicit INT8Vec64(void* ptr) : reg(_mm512_loadu_epi8(ptr)) {}
+
+  // non-temporal load; the bool argument is unused and only selects this
+  // overload.
+  // NOTE(review): the streaming-load intrinsic is documented as requiring a
+  // 64-byte aligned address - confirm every caller guarantees that.
+  explicit INT8Vec64(bool, void* ptr) : reg(_mm512_stream_load_si512(ptr)) {}
+
+  // Stores all 64 lanes.
+  void save(void* ptr) const { _mm512_storeu_epi8(ptr, reg); }
+
+  // Stores only the first elem_num lanes. elem_num must be in [1, 64]: the
+  // mask is built by shifting a 64-bit value right by (64 - elem_num).
+  void save(int8_t* ptr, const int elem_num) const {
+    constexpr uint64_t M = 0xFFFFFFFFFFFFFFFF;
+    __mmask64 mask = _cvtu64_mask64(M >> (64 - elem_num));
+    _mm512_mask_storeu_epi8(ptr, mask, reg);
+  }
+
+  // non-temporal save
+  void nt_save(int8_t* ptr) { _mm512_stream_si512((__m512i*)ptr, reg); }
+};
 #endif

 template <typename T>
@ -657,6 +699,22 @@ inline BF16Vec16::BF16Vec16(const FP32Vec16& v) {

 inline void prefetch(const void* addr) { _mm_prefetch(addr, _MM_HINT_T1); }

+#ifdef __AVX512F__
+// Streaming (non-temporal) stores of a whole vector register to ptr, one
+// overload per vector type.
+// NOTE(review): streaming stores are documented as requiring an address
+// aligned to the register width - confirm against the callers.
+inline void non_temporal_save(FP16Vec16& vec, void* ptr) {
+  _mm256_stream_si256((__m256i*)ptr, vec.reg);
+}
+inline void non_temporal_save(BF16Vec32& vec, void* ptr) {
+  _mm512_stream_si512((__m512i*)ptr, vec.reg);
+}
+inline void non_temporal_save(BF16Vec16& vec, void* ptr) {
+  _mm256_stream_si256((__m256i*)ptr, vec.reg);
+}
+inline void non_temporal_save(FP32Vec16& vec, void* ptr) {
+  _mm512_stream_ps((float*)ptr, vec.reg);
+}
+#endif
+
+// Issues a full memory fence (mfence).
+inline void mem_barrier() { _mm_mfence(); }
 };  // namespace vec_op

 #endif
--- a/csrc/cpu/shm.cpp
+++ b/csrc/cpu/shm.cpp
@ -0,0 +1,781 @@
+#include "cpu/cpu_types.hpp"
+
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+namespace {
+// Upper bound on the ranks of one group; sizes the per-rank arrays below.
+#define MAX_SHM_RANK_NUM 8
+// Cap applied to the torch thread count when SHMManager picks its thread
+// count.
+#define MAX_THREAD_NUM 12
+// Size of the staging buffer each thread owns inside a rank's segment.
+#define PER_THREAD_SHM_BUFFER_BYTES (4 * 1024 * 1024)
+// shm_cc_loop partitions the payload between threads in units of this many
+// bytes.
+#define MIN_THREAD_PROCESS_SIZE (8 * 1024)
+// NOTE(review): not referenced in this part of the file; presumably a limit
+// for a send/recv path defined further down - confirm.
+#define MAX_P2P_SEND_TENSOR_NUM 8
+
+// Maps a tensor element type to the vec_op vector type the kernels use for
+// it. The primary template yields void, i.e. "no vector type available".
+template <typename scalar_t>
+struct KernelVecType {
+  using scalar_vec_t = void;
+};
+
+// float -> 16 x fp32
+template <>
+struct KernelVecType<float> {
+  using scalar_vec_t = vec_op::FP32Vec16;
+};
+
+// bfloat16 -> 16 x bf16
+template <>
+struct KernelVecType<c10::BFloat16> {
+  using scalar_vec_t = vec_op::BF16Vec16;
+};
+
+// half -> 16 x fp16
+template <>
+struct KernelVecType<c10::Half> {
+  using scalar_vec_t = vec_op::FP16Vec16;
+};
+
+// Handshake state a thread publishes to its peers. ThreadSHMContext::barrier
+// advances it in the order DONE -> THREAD_READY -> SHM_DATA_READY -> DONE.
+enum class ThreadSHMStat : char { THREAD_READY = 0, SHM_DATA_READY, DONE };
+
+// Synchronization record for one worker thread of one rank. SHMManager
+// placement-constructs an array of these at the start of the rank's shared
+// segment, so the same object is read and written by the peer ranks.
+struct ThreadSHMContext {
+  // One status slot per rank. A rank publishes its state by writing slot
+  // [its own rank] in the context of every rank in the group (see
+  // set_thread_stat); this thread spins on its own copy of the slots.
+  volatile ThreadSHMStat thread_stats[MAX_SHM_RANK_NUM];
+  int thread_id;
+  int thread_num;
+  int rank;
+  int group_size;
+  // Number of spin iterations spent waiting; bookkeeping only.
+  size_t _spinning_count;
+  // Rank visiting order, rotated so that entry 0 is this rank.
+  int swizzled_ranks[MAX_SHM_RANK_NUM];
+  // Per-rank pointer to the staging buffer of the thread with this thread_id.
+  void* thread_shm_ptrs[MAX_SHM_RANK_NUM];
+  // Per-rank pointer to the context of the thread with this thread_id.
+  ThreadSHMContext* shm_contexts[MAX_SHM_RANK_NUM];
+
+  // Initializes every slot to DONE / nullptr and registers this context and
+  // its buffer under its own rank.
+  // NOTE(review): only the upper bound of group_size is checked; a value of 0
+  // would divide by zero in the modulo below.
+  ThreadSHMContext(const int thread_id, const int thread_num, const int rank,
+                   const int group_size, void* thread_shm_ptr)
+      : thread_id(thread_id),
+        thread_num(thread_num),
+        rank(rank),
+        group_size(group_size),
+        _spinning_count(0) {
+    // Both the object and its buffer are required to sit on 64-byte
+    // boundaries.
+    static_assert(sizeof(ThreadSHMContext) % 64 == 0);
+    TORCH_CHECK(group_size <= MAX_SHM_RANK_NUM);
+    TORCH_CHECK((size_t)this % 64 == 0);
+    TORCH_CHECK((size_t)thread_shm_ptr % 64 == 0);
+    for (int i = 0; i < MAX_SHM_RANK_NUM; ++i) {
+      shm_contexts[i] = nullptr;
+      thread_shm_ptrs[i] = nullptr;
+      swizzled_ranks[i] = (i + rank) % group_size;
+      thread_stats[i] = ThreadSHMStat::DONE;
+    }
+    set_context(rank, this, thread_shm_ptr);
+  }
+
+  // Records the context and buffer of the matching thread on `rank`. The
+  // peer must use the same thread count and thread id.
+  // NOTE(review): rank is only checked against the upper bound, not for
+  // being non-negative.
+  void set_context(int rank, ThreadSHMContext* ptr, void* thread_shm_ptr) {
+    TORCH_CHECK(rank < MAX_SHM_RANK_NUM);
+    TORCH_CHECK(ptr);
+    TORCH_CHECK(thread_shm_ptr);
+    TORCH_CHECK_EQ(ptr->thread_num, thread_num);
+    TORCH_CHECK_EQ(ptr->thread_id, thread_id);
+    shm_contexts[rank] = ptr;
+    thread_shm_ptrs[rank] = thread_shm_ptr;
+  }
+
+  // Returns the staging buffer registered for `rank`, viewed as T*.
+  template <typename T>
+  T* get_thread_shm_ptr(int rank) {
+    return reinterpret_cast<T*>(thread_shm_ptrs[rank]);
+  }
+
+  // idx-th rank in this thread's visiting order (idx 0 is the local rank).
+  int get_swizzled_rank(int idx) { return swizzled_ranks[idx]; }
+
+  // Spins until no rank's slot still holds prev_stat, then issues a fence.
+  void wait_for_all(ThreadSHMStat prev_stat) {
+    for (int idx = 0; idx < group_size; ++idx) {
+      int rank = get_swizzled_rank(idx);
+      while (thread_stats[rank] == prev_stat) {
+        ++_spinning_count;
+        _mm_pause();
+      }
+    }
+    vec_op::mem_barrier();
+  }
+
+  // Spins until the slot of `rank` leaves prev_stat, then issues a fence.
+  void wait_for_one(int rank, ThreadSHMStat prev_stat) {
+    while (thread_stats[rank] == prev_stat) {
+      ++_spinning_count;
+      _mm_pause();
+    }
+    vec_op::mem_barrier();
+  }
+
+  // Publishes `stat` as this rank's state in the context of every rank in
+  // the group, the local one included.
+  void set_thread_stat(ThreadSHMStat stat) {
+    for (int idx = 0; idx < group_size; ++idx) {
+      int rank = get_swizzled_rank(idx);
+      shm_contexts[rank]->thread_stats[this->rank] = stat;
+    }
+  }
+
+  // Writes `stat` into slot [target_rank] of the context of every rank in
+  // the group.
+  void set_thread_stat(int target_rank, ThreadSHMStat stat) {
+    for (int idx = 0; idx < group_size; ++idx) {
+      int rank = get_swizzled_rank(idx);
+      shm_contexts[rank]->thread_stats[target_rank] = stat;
+    }
+  }
+
+  // barrier for all ranks in the group, used for all2all ops
+  // DONE -> THREAD_READY -> SHM_DATA_READY -> DONE -> ...
+  // Publishes next_stat, then waits until no rank is still in the state
+  // that precedes it in the cycle.
+  void barrier(ThreadSHMStat next_stat) {
+    if (next_stat == ThreadSHMStat::THREAD_READY) {
+      set_thread_stat(ThreadSHMStat::THREAD_READY);
+      wait_for_all(ThreadSHMStat::DONE);
+    } else if (next_stat == ThreadSHMStat::SHM_DATA_READY) {
+      set_thread_stat(ThreadSHMStat::SHM_DATA_READY);
+      wait_for_all(ThreadSHMStat::THREAD_READY);
+    } else if (next_stat == ThreadSHMStat::DONE) {
+      set_thread_stat(ThreadSHMStat::DONE);
+      wait_for_all(ThreadSHMStat::SHM_DATA_READY);
+    } else {
+      TORCH_CHECK(false, "Invalid next_stat to barrier.");
+    }
+  }
+
+  // Human-readable dump of the scalar fields, the rank visiting order and
+  // the ranks of the registered peer contexts.
+  std::string to_string() const {
+    std::stringstream ss;
+    ss << "SHMContext:";
+    ss << "\nrank: " << rank;
+    ss << "\ngroup_size: " << group_size;
+    ss << "\nthread_num: " << thread_num;
+    ss << "\nthread_id: " << thread_id;
+
+    ss << "\nshm_ctx_stat_loop_seq: [";
+    for (int i = 0; i < group_size; ++i) {
+      ss << swizzled_ranks[i] << ", ";
+    }
+    ss << "]";
+
+    ss << "\nshm_contexts: [";
+    for (int i = 0; i < group_size; ++i) {
+      if (shm_contexts[i]) {
+        ss << shm_contexts[i]->rank << ", ";
+      }
+    }
+    ss << "]";
+
+    return ss.str();
+  }
+};
+
+// Creates this rank's POSIX shared-memory segment, maps the segments of the
+// other ranks, and links the per-thread contexts of all ranks together.
+//
+// Segment layout: an array of _thread_num ThreadSHMContext objects (rounded
+// up to 64 bytes) followed by _thread_num staging buffers of
+// PER_THREAD_SHM_BUFFER_BYTES each.
+class SHMManager {
+ public:
+  // Creates and maps the segment named "<name>_<rank>" and placement-
+  // constructs one ThreadSHMContext per worker thread at its start.
+  explicit SHMManager(const std::string& name, const int rank,
+                      const int group_size)
+      : _rank(rank),
+        _group_size(group_size),
+        _thread_num(std::min(torch::get_num_threads(), MAX_THREAD_NUM)),
+        _shm_names({""}),
+        _shared_mem_ptrs({nullptr}),
+        _shm_ctx(nullptr) {
+    // Validate before `rank` is used as an index into the fixed-size arrays.
+    TORCH_CHECK(group_size > 0 && group_size <= MAX_SHM_RANK_NUM);
+    TORCH_CHECK(rank >= 0 && rank < MAX_SHM_RANK_NUM);
+
+    _shm_names[rank] = get_shm_name(name, rank);
+    _shared_mem_ptrs[rank] = init_shm(rank);
+    _shm_ctx = reinterpret_cast<ThreadSHMContext*>(_shared_mem_ptrs[rank]);
+
+    for (int i = 0; i < _thread_num; ++i) {
+      new (_shm_ctx + i)
+          ThreadSHMContext(i, _thread_num, _rank, _group_size,
+                           compute_thread_shm_ptr(_shm_ctx, i));
+    }
+  }
+
+  // Opens and maps the segment of every other rank and registers the peer
+  // contexts/buffers with the local per-thread contexts. The peer segments
+  // must already exist, and join may run only once per manager.
+  void join(const std::string& name) {
+    for (int rank_idx = 0; rank_idx < _group_size; ++rank_idx) {
+      if (rank_idx != _rank) {
+        TORCH_CHECK(_shm_names[rank_idx].empty());
+        TORCH_CHECK(_shared_mem_ptrs[rank_idx] == nullptr);
+        _shm_names[rank_idx] = get_shm_name(name, rank_idx);
+        _shared_mem_ptrs[rank_idx] = init_shm(rank_idx);
+        ThreadSHMContext* target_ctx =
+            reinterpret_cast<ThreadSHMContext*>(_shared_mem_ptrs[rank_idx]);
+        for (int thread_idx = 0; thread_idx < _thread_num; ++thread_idx) {
+          _shm_ctx[thread_idx].set_context(
+              rank_idx, target_ctx + thread_idx,
+              compute_thread_shm_ptr(target_ctx, thread_idx));
+        }
+      }
+    }
+  }
+
+  ~SHMManager() { destroy_shm(); }
+
+  // First element of the local per-thread context array.
+  ThreadSHMContext* get_shm_ctx() const { return _shm_ctx; }
+
+  // Segment name used for `rank`: "<name>_<rank>".
+  static std::string get_shm_name(const std::string& name, int rank) {
+    return name + "_" + std::to_string(rank);
+  }
+
+  // Constructs a manager, stores it in the process-wide registry and returns
+  // its index as an opaque handle.
+  static int64_t create_singleton_instance(const std::string& name,
+                                           const int group_size,
+                                           const int rank) {
+    std::lock_guard<std::mutex> guard(SingletonInstancesLock);
+    SingletonInstances.emplace_back(
+        std::make_unique<SHMManager>(name, rank, group_size));
+    return static_cast<int64_t>(SingletonInstances.size() - 1);
+  }
+
+  // Looks up a manager by the handle create_singleton_instance returned.
+  // An out-of-range handle is reported instead of indexing past the vector.
+  static SHMManager* get_singleton_instance(int64_t handle) {
+    TORCH_CHECK(
+        handle >= 0 &&
+            static_cast<size_t>(handle) < SingletonInstances.size(),
+        "Invalid SHMManager handle: " + std::to_string(handle));
+    return SingletonInstances[handle].get();
+  }
+
+ protected:
+  static std::vector<std::unique_ptr<SHMManager>> SingletonInstances;
+  static std::mutex SingletonInstancesLock;
+
+ private:
+  // Rounds num up to the next multiple of 64.
+  static size_t round_to_alignment(size_t num) {
+    return ((num + 63) / 64) * 64;
+  }
+
+  // Address of the staging buffer of thread `thread_id` inside the segment
+  // that starts at `ctx`: the buffers follow the rounded context array.
+  int8_t* compute_thread_shm_ptr(ThreadSHMContext* ctx, int thread_id) {
+    int8_t* thread_shm_ptr =
+        reinterpret_cast<int8_t*>(ctx) +
+        round_to_alignment(_thread_num * sizeof(ThreadSHMContext));
+    return thread_shm_ptr +
+           thread_id * round_to_alignment(PER_THREAD_SHM_BUFFER_BYTES);
+  }
+
+  // Total byte size of one rank's segment (context array plus all buffers).
+  size_t compute_shm_size() {
+    const size_t rounded_rank_buffer_size =
+        round_to_alignment(PER_THREAD_SHM_BUFFER_BYTES) * _thread_num;
+    const size_t rounded_thread_shm_ctx_size =
+        round_to_alignment(_thread_num * sizeof(ThreadSHMContext));
+    const size_t shm_size =
+        rounded_thread_shm_ctx_size + rounded_rank_buffer_size;
+    return shm_size;
+  }
+
+  // Maps the segment of target_rank and returns the mapping. The local
+  // rank's segment is created exclusively and sized; peer segments are
+  // opened as they are. The descriptor is closed once the mapping exists.
+  void* init_shm(int target_rank) {
+    const std::string& shm_name = _shm_names[target_rank];
+    const bool is_creator = (_rank == target_rank);
+    const size_t shm_size = compute_shm_size();
+
+    // Reports a failed step. errno is captured first because the cleanup
+    // calls may overwrite it. A still-open descriptor is closed, and a
+    // segment this rank has just created is unlinked so that it does not
+    // outlive the failure.
+    auto fail = [&](const char* step, int open_fd) {
+      const int saved_errno = errno;
+      if (open_fd != -1) {
+        close(open_fd);
+        if (is_creator) shm_unlink(shm_name.c_str());
+      }
+      TORCH_CHECK(false, std::string(step) +
+                             " in SHMManager failed. errno: " +
+                             std::to_string(saved_errno));
+    };
+
+    int fd = -1;
+    if (is_creator) {
+      fd = shm_open(shm_name.c_str(), O_CREAT | O_EXCL | O_RDWR,
+                    S_IRUSR | S_IWUSR);
+      // Nothing to clean up here: with O_EXCL a failure means the object
+      // was not created by this call.
+      if (fd == -1) fail("create shm", -1);
+
+      if (ftruncate(fd, shm_size) == -1) fail("ftruncate", fd);
+    } else {
+      fd = shm_open(shm_name.c_str(), O_RDWR, S_IRUSR | S_IWUSR);
+
+      if (fd == -1) fail("open shm", -1);
+    }
+
+    void* shm_ptr = mmap(nullptr, shm_size, PROT_READ | PROT_WRITE,
+                         MAP_SHARED | MAP_POPULATE, fd, 0);
+
+    if (shm_ptr == MAP_FAILED) fail("mmap", fd);
+
+    if (close(fd) != 0) fail("close", -1);
+
+    TORCH_CHECK((size_t)shm_ptr % 64 == 0);
+
+    return shm_ptr;
+  }
+
+  // Unmaps every mapped segment and unlinks every segment name this manager
+  // knows about (its own and the joined peers').
+  void destroy_shm() {
+    for (int i = 0; i < MAX_SHM_RANK_NUM; ++i) {
+      if (_shared_mem_ptrs[i] != nullptr) {
+        munmap(_shared_mem_ptrs[i], compute_shm_size());
+      }
+
+      if (!_shm_names[i].empty()) {
+        shm_unlink(_shm_names[i].c_str());
+      }
+    }
+  }
+
+  int _rank;
+  int _group_size;
+  int _thread_num;
+  std::array<std::string, MAX_SHM_RANK_NUM> _shm_names;
+  std::array<void*, MAX_SHM_RANK_NUM> _shared_mem_ptrs;
+  ThreadSHMContext* _shm_ctx;
+};
+
+namespace shm_cc_ops {
+// Splits elem_num elements between the ctx->thread_num worker threads and
+// calls inner_func(thread_ctx, offset, count) for every chunk.
+//
+// Each thread gets a contiguous range whose length is a multiple of
+// MIN_THREAD_PROCESS_SIZE bytes, and walks it in pieces of at most
+// PER_THREAD_SHM_BUFFER_BYTES so that one piece fits the thread's staging
+// buffer. Threads whose range starts at or beyond elem_num never call
+// inner_func.
+template <typename scalar_t, typename F>
+void shm_cc_loop(ThreadSHMContext* ctx, int64_t elem_num, F&& inner_func) {
+  int thread_num = ctx->thread_num;
+  int64_t total_bytes = elem_num * sizeof(scalar_t);
+  // Number of MIN_THREAD_PROCESS_SIZE units, rounded up.
+  int64_t total_units_num =
+      (total_bytes + MIN_THREAD_PROCESS_SIZE - 1) / MIN_THREAD_PROCESS_SIZE;
+  int64_t per_thread_units_num =
+      (total_units_num + thread_num - 1) / thread_num;
+  int64_t per_unit_elem_num = MIN_THREAD_PROCESS_SIZE / sizeof(scalar_t);
+  // Largest element count one staging buffer can hold.
+  int64_t max_per_thread_iteration_elem_num =
+      PER_THREAD_SHM_BUFFER_BYTES / sizeof(scalar_t);
+  int64_t per_thread_elem_num = per_unit_elem_num * per_thread_units_num;
+
+  // schedule(static, 1): iteration i is handed to OpenMP thread i in
+  // round-robin order.
+  // NOTE(review): the callbacks in this file spin on cross-rank barriers, so
+  // this presumably relies on all thread_num iterations running
+  // concurrently - confirm the OpenMP team is at least thread_num wide.
+#pragma omp parallel for schedule(static, 1)
+  for (int i = 0; i < thread_num; ++i) {
+    int64_t offset = i * per_thread_elem_num;
+    int64_t end = std::min(elem_num, offset + per_thread_elem_num);
+    // Non-positive when this thread has no data; the loop below is skipped.
+    int64_t curr_elem_num =
+        std::min(max_per_thread_iteration_elem_num, end - offset);
+    ThreadSHMContext* thread_ctx = ctx + i;
+
+    while (curr_elem_num > 0) {
+      inner_func(thread_ctx, offset, curr_elem_num);
+
+      offset += max_per_thread_iteration_elem_num;
+      curr_elem_num = std::min(max_per_thread_iteration_elem_num, end - offset);
+    }
+  }
+}
+};  // namespace shm_cc_ops
+
+namespace shm_cc_ops {
+
+// Copies `bytes` bytes from the shared-memory buffer `src` to `dst` using
+// non-temporal 64-byte loads and regular stores.
+// The tail (bytes % 64) still loads a full 64 bytes from src and stores only
+// the remaining bytes through a masked save.
+// NOTE(review): src is presumably a 64-byte aligned staging buffer large
+// enough for the full-width tail load - confirm at the call sites.
+void memcpy_from_shm(void* dst, void* src, const int64_t bytes) {
+  const int64_t aligned_bytes = ((bytes >> 6) << 6);  // 64 bytes aligned
+  int64_t i = 0;
+#pragma GCC unroll 4
+  for (; i < aligned_bytes; i += 64) {
+    vec_op::INT8Vec64 data(
+        true, (int8_t*)src + i);  // stream loading shm to avoid caching
+    data.save((int8_t*)dst + i);
+  }
+  if (aligned_bytes < bytes) {
+    vec_op::INT8Vec64 data(true, (int8_t*)src + aligned_bytes);
+    data.save((int8_t*)dst + aligned_bytes, bytes - aligned_bytes);
+  }
+}
+
+// Copies `bytes` bytes from private memory `src` into the shared-memory
+// buffer `dst` with non-temporal 64-byte stores.
+//
+// dst is written in whole 64-byte blocks, so up to 63 bytes past dst + bytes
+// are overwritten (with zeros); dst must be 64-byte aligned and large enough
+// for the rounded-up size. src is only read inside [src, src + bytes): the
+// last partial block uses a masked load so that a source ending at an
+// allocation boundary is never over-read.
+void memcpy_to_shm(void* dst, void* src, const int64_t bytes) {
+  const int64_t aligned_bytes = ((bytes >> 6) << 6);  // 64 bytes aligned
+  int64_t i = 0;
+#pragma GCC unroll 4
+  for (; i < aligned_bytes; i += 64) {
+    vec_op::INT8Vec64 data((int8_t*)src + i);
+    data.nt_save((int8_t*)dst + i);
+  }
+  if (i < bytes) {
+    // 1..63 bytes left: read exactly those lanes, zero-fill the rest.
+    const __mmask64 tail_mask =
+        _cvtu64_mask64(0xFFFFFFFFFFFFFFFFULL >> (64 - (bytes - i)));
+    const __m512i tail = _mm512_maskz_loadu_epi8(tail_mask, (int8_t*)src + i);
+    _mm512_stream_si512((__m512i*)((int8_t*)dst + i), tail);
+  }
+}
+
+// Copies `bytes` bytes from `src` to `dst` in 64-byte vector blocks with
+// regular (cached) loads and stores.
+// The final partial block is read and written through the same lane mask,
+// so neither buffer is touched beyond `bytes`.
+void memcpy(void* dst, void* src, const int64_t bytes) {
+  const int64_t aligned_bytes = ((bytes >> 6) << 6);  // 64 bytes aligned
+  int64_t i = 0;
+#pragma GCC unroll 4
+  for (; i < aligned_bytes; i += 64) {
+    vec_op::INT8Vec64 data((int8_t*)src + i);
+    data.save((int8_t*)dst + i);
+  }
+  if (aligned_bytes < bytes) {
+    // 1..63 bytes left: masked load avoids reading past src + bytes.
+    const __mmask64 tail_mask = _cvtu64_mask64(
+        0xFFFFFFFFFFFFFFFFULL >> (64 - (bytes - aligned_bytes)));
+    const __m512i tail =
+        _mm512_maskz_loadu_epi8(tail_mask, (int8_t*)src + aligned_bytes);
+    _mm512_mask_storeu_epi8((int8_t*)dst + aligned_bytes, tail_mask, tail);
+  }
+}
+
+// In-place sum all-reduce of `data` (elem_num elements) across RANKS ranks.
+//
+// For every chunk handed out by shm_cc_loop each thread:
+//   1. waits until all ranks reached THREAD_READY,
+//   2. copies its chunk into its own staging buffer,
+//   3. waits until all ranks reached SHM_DATA_READY,
+//   4. adds the matching chunk of every other rank to its local data,
+//      accumulating in fp32, and writes the result back to `data`,
+//   5. waits until all ranks reached DONE.
+// RANKS must equal ctx->group_size (see the dispatch in shm_allreduce_sum).
+template <typename scalar_t, int RANKS>
+void all_reduce_sum_impl(ThreadSHMContext* ctx, scalar_t* data,
+                         size_t elem_num) {
+  CPU_KERNEL_GUARD_IN(all_reduce_sum_impl)
+  using vec_t = typename KernelVecType<scalar_t>::scalar_vec_t;
+  constexpr int64_t vec_elem_num = vec_t::get_elem_num();
+  // NOTE(review): worldsize is not used below.
+  const int worldsize = ctx->group_size;
+
+  shm_cc_ops::shm_cc_loop<scalar_t>(
+      ctx, elem_num,
+      [&](ThreadSHMContext* thread_ctx, int64_t data_offset,
+          int64_t data_elem_num) {
+        int rank = thread_ctx->rank;
+        scalar_t* thread_shm_ptr =
+            thread_ctx->get_thread_shm_ptr<scalar_t>(rank);
+        scalar_t* thread_data_ptr = data + data_offset;
+        // Despite its name this is the chunk size in bytes.
+        int64_t thread_data_elem_num = data_elem_num * sizeof(scalar_t);
+
+        // Staging buffers of the other RANKS - 1 ranks (swizzled index 0 is
+        // the local rank and is skipped).
+        scalar_t* remote_data_ptrs[RANKS - 1];
+        vec_op::unroll_loop<int, RANKS - 1>([&](int idx) {
+          remote_data_ptrs[idx] = thread_ctx->get_thread_shm_ptr<scalar_t>(
+              thread_ctx->get_swizzled_rank(idx + 1));
+        });
+
+        thread_ctx->barrier(ThreadSHMStat::THREAD_READY);
+
+        shm_cc_ops::memcpy_to_shm(thread_shm_ptr, thread_data_ptr,
+                                  thread_data_elem_num);
+
+        thread_ctx->barrier(ThreadSHMStat::SHM_DATA_READY);
+
+        // Full vectors first, then one masked tail step.
+        int64_t aligned_data_elem_num =
+            (data_elem_num / vec_elem_num) * vec_elem_num;
+        int64_t i = 0;
+#pragma GCC unroll 4
+        for (; i < aligned_data_elem_num; i += vec_elem_num) {
+          vec_t local_data(thread_data_ptr + i);  // load from cache
+          vec_op::FP32Vec16 local_data_fp32(local_data);
+          vec_op::unroll_loop<int, RANKS - 1>([&](int idx) {
+            vec_t remote_data(
+                true, remote_data_ptrs[idx] + i);  // stream load from shm
+            vec_op::FP32Vec16 remote_data_fp32(remote_data);
+            local_data_fp32 = local_data_fp32 + remote_data_fp32;  // sum reduce
+          });
+          vec_t reduced_data(local_data_fp32);
+          reduced_data.save(thread_data_ptr + i);
+        }
+
+        if (i < data_elem_num) {
+          // NOTE(review): the tail still loads a full vector from
+          // thread_data_ptr + i; only the store is limited to the remaining
+          // elements. Confirm `data` is readable past its last element.
+          vec_t local_data(thread_data_ptr + i);  // load from cache
+          vec_op::FP32Vec16 local_data_fp32(local_data);
+          vec_op::unroll_loop<int, RANKS - 1>([&](int idx) {
+            vec_t remote_data(
+                true, remote_data_ptrs[idx] + i);  // stream load from shm
+            vec_op::FP32Vec16 remote_data_fp32(remote_data);
+            local_data_fp32 = local_data_fp32 + remote_data_fp32;  // sum reduce
+          });
+          vec_t reduced_data(local_data_fp32);
+          reduced_data.save(thread_data_ptr + i,
+                            data_elem_num - aligned_data_elem_num);
+        }
+
+        thread_ctx->barrier(ThreadSHMStat::DONE);
+      });
+
+  return;
+}
+};  // namespace shm_cc_ops
+
+// Out-of-class definitions of SHMManager's static data members; both start
+// out empty/default-constructed. The class itself is declared outside this
+// view -- presumably the vector is the handle-indexed instance registry and
+// the mutex guards it (confirm against the class definition).
+std::vector<std::unique_ptr<SHMManager>> SHMManager::SingletonInstances = {};
+std::mutex SHMManager::SingletonInstancesLock = {};
+
+// Runtime-to-compile-time dispatch for the shared-memory sum all-reduce:
+// selects the all_reduce_sum_impl instantiation whose rank count equals
+// ctx->group_size. Only groups of 2, 3, 4 or 8 ranks have an instantiation;
+// any other group size fails the TORCH_CHECK in the final branch.
+//
+//   ctx      - shared-memory context of the calling rank
+//   data     - buffer that is reduced in place
+//   elem_num - number of scalar_t elements in `data`
+template <typename scalar_t>
+void shm_allreduce_sum(ThreadSHMContext* ctx, scalar_t* data, size_t elem_num) {
+  const auto ranks = ctx->group_size;
+  if (ranks == 2) {
+    shm_cc_ops::all_reduce_sum_impl<scalar_t, 2>(ctx, data, elem_num);
+  } else if (ranks == 3) {
+    shm_cc_ops::all_reduce_sum_impl<scalar_t, 3>(ctx, data, elem_num);
+  } else if (ranks == 4) {
+    shm_cc_ops::all_reduce_sum_impl<scalar_t, 4>(ctx, data, elem_num);
+  } else if (ranks == 8) {
+    shm_cc_ops::all_reduce_sum_impl<scalar_t, 8>(ctx, data, elem_num);
+  } else {
+    TORCH_CHECK(false,
+                "Invalid world size: " + std::to_string(ctx->group_size));
+  }
+}
+
+// Gathers `elem_num` elements from every rank into the `outputs` buffers of
+// rank `dst`, using each rank's shared-memory buffer as the staging area.
+//
+//   ctx      - shared-memory context of the calling rank
+//   data     - this rank's contribution (elem_num elements)
+//   elem_num - number of scalar_t elements contributed by each rank
+//   outputs  - per-rank destination pointers, indexed by rank; only
+//              dereferenced when the calling rank equals `dst`
+//   dst      - rank that receives the gathered data; must be < group_size
+//
+// The work is driven by shm_cc_ops::shm_cc_loop, which invokes the callback
+// with a context, an element offset and an element count (its chunking
+// policy is defined outside this view).
+template <typename scalar_t>
+void shm_gather_impl(ThreadSHMContext* ctx, scalar_t* data, size_t elem_num,
+                     scalar_t** outputs, const int dst) {
+  CPU_KERNEL_GUARD_IN(shm_gather_impl)
+  const int worldsize = ctx->group_size;
+  TORCH_CHECK_LT(dst, worldsize);
+  shm_cc_ops::shm_cc_loop<scalar_t>(
+      ctx, elem_num,
+      [&](ThreadSHMContext* thread_ctx, int64_t data_offset,
+          int64_t data_elem_num) {
+        int rank = thread_ctx->rank;
+        // This rank's own staging buffer in shared memory.
+        scalar_t* thread_shm_ptr =
+            thread_ctx->get_thread_shm_ptr<scalar_t>(rank);
+
+        // Sync point 1: reached before anyone writes its staging buffer.
+        thread_ctx->barrier(ThreadSHMStat::THREAD_READY);
+
+        // Publish the current chunk of local data (size given in bytes).
+        shm_cc_ops::memcpy_to_shm(thread_shm_ptr, data + data_offset,
+                                  data_elem_num * sizeof(scalar_t));
+
+        // Sync point 2: reached after the staging copy above.
+        thread_ctx->barrier(ThreadSHMStat::SHM_DATA_READY);
+
+        if (rank == dst) {
+          // The destination copies its own chunk straight from `data`
+          // rather than reading it back from shared memory.
+          shm_cc_ops::memcpy(outputs[rank] + data_offset, data + data_offset,
+                             data_elem_num * sizeof(scalar_t));
+          // Pull the chunk of each remaining rank out of that rank's staging
+          // buffer; get_swizzled_rank(i) chooses the visiting order.
+          for (int i = 1; i < worldsize; ++i) {
+            int src_rank = thread_ctx->get_swizzled_rank(i);
+            scalar_t* src_ptr =
+                thread_ctx->get_thread_shm_ptr<scalar_t>(src_rank);  // shm
+            scalar_t* dst_ptr = outputs[src_rank] + data_offset;
+            shm_cc_ops::memcpy_from_shm(dst_ptr, src_ptr,
+                                        data_elem_num * sizeof(scalar_t));
+          }
+        }
+
+        // Sync point 3: reached after the destination finished reading.
+        thread_ctx->barrier(ThreadSHMStat::DONE);
+      });
+
+  return;
+}
+
+// A raw view of a memory region: a start address plus the number of bytes
+// available from that address.
+struct MemPiece {
+  void* ptr;     // first byte of the region
+  int64_t size;  // byte count reachable from ptr
+
+  // Returns the start address typed as a pointer to Elem.
+  template <typename Elem>
+  Elem* data_ptr() {
+    return static_cast<Elem*>(ptr);
+  }
+};
+
+// Fixed-size descriptor of a list of tensors exchanged through shared memory.
+// It records, per tensor, the byte size and dtype, plus (locally only) the
+// data pointer, and exposes the list as one flat byte range through
+// get_data().
+//
+// The object is constructed with placement new inside tensor storage and is
+// copied byte-wise between ranks (see the send/recv implementations), so its
+// size is constrained by the static_asserts in the constructor. The private
+// tensor_ptrs are only filled by bind_tensor_list() in the local process.
+struct TensorListMeta {
+  int64_t tensor_bytes[MAX_P2P_SEND_TENSOR_NUM];  // nbytes of each tensor
+  torch::ScalarType tensor_types[MAX_P2P_SEND_TENSOR_NUM];  // dtype of each
+  int64_t tensor_num;   // number of valid entries in the arrays
+  int64_t total_bytes;  // sum of tensor_bytes over the valid entries
+
+  // Creates an empty, unbound descriptor: zero sizes, null pointers and
+  // Undefined dtypes.
+  TensorListMeta() : tensor_num(0), total_bytes(0) {
+    // Size must be a multiple of 64 bytes (the _padding member below exists
+    // to satisfy this).
+    static_assert(sizeof(TensorListMeta) % 64 == 0);
+    static_assert(sizeof(TensorListMeta) <
+                  MIN_THREAD_PROCESS_SIZE);  // Ensures the metadata is always
+                                             // handled by thread 0
+    for (int i = 0; i < MAX_P2P_SEND_TENSOR_NUM; ++i) {
+      tensor_bytes[i] = 0;
+      tensor_ptrs[i] = nullptr;
+      tensor_types[i] = torch::ScalarType::Undefined;
+    }
+  }
+
+  // For send and recv
+  // Records size, dtype and data pointer of every tensor in `tensor_list`.
+  // Fails if the descriptor was already bound (detected through
+  // tensor_types[0]), if the list has more than MAX_P2P_SEND_TENSOR_NUM
+  // entries, or if any tensor is not contiguous.
+  void bind_tensor_list(std::vector<torch::Tensor>& tensor_list) {
+    TORCH_CHECK(tensor_types[0] == torch::ScalarType::Undefined,
+                "Re-bind TensorListMeta is not allowed.")
+    TORCH_CHECK_LE(tensor_list.size(), MAX_P2P_SEND_TENSOR_NUM);
+    tensor_num = tensor_list.size();
+    int64_t bytes_sum = 0;
+    for (int i = 0; i < tensor_list.size(); ++i) {
+      torch::Tensor& t = tensor_list[i];
+      TORCH_CHECK(t.is_contiguous());
+      tensor_bytes[i] = t.nbytes();
+      tensor_types[i] = t.scalar_type();
+      tensor_ptrs[i] = t.data_ptr();
+      bytes_sum += t.nbytes();
+    }
+    total_bytes = bytes_sum;
+  }
+
+  // For recv
+  // Allocates one uninitialized 1-D CPU tensor per recorded entry, with the
+  // recorded dtype and bytes / element-size elements. Original shapes are
+  // not recorded, so the results are always flat. The tensors are not bound
+  // to this descriptor.
+  std::vector<torch::Tensor> generate_tensor_list() {
+    std::vector<torch::Tensor> tensor_list;
+    tensor_list.reserve(tensor_num);
+
+    for (int i = 0; i < tensor_num; ++i) {
+      int64_t bytes = tensor_bytes[i];
+      auto type = tensor_types[i];
+      int64_t elem_bytes = torch::elementSize(type);
+
+      // The recorded byte size must be a whole number of elements.
+      TORCH_CHECK_EQ(bytes % elem_bytes, 0);
+      int64_t elem_num = bytes / elem_bytes;
+      auto options = torch::TensorOptions().dtype(type).device(torch::kCPU);
+      tensor_list.emplace_back(torch::empty({elem_num}, options));
+    }
+    return tensor_list;
+  }
+
+  // Maps `offset`, a byte offset into the concatenation of all bound
+  // tensors, to the tensor containing it. Returns the address of that byte
+  // and the number of bytes left in that tensor, or {nullptr, 0} when the
+  // offset lies at or beyond the end of the last tensor.
+  MemPiece get_data(int64_t offset) {
+    for (int i = 0; i < tensor_num; ++i) {
+      if (offset < tensor_bytes[i]) {
+        return {reinterpret_cast<int8_t*>(tensor_ptrs[i]) + offset,
+                tensor_bytes[i] - offset};
+      }
+      // Not in tensor i: make the offset relative to the next tensor.
+      offset -= tensor_bytes[i];
+    }
+    return {nullptr, 0};
+  }
+
+ private:
+  // Data pointers of the bound tensors; written by bind_tensor_list() and
+  // read by get_data().
+  void* tensor_ptrs[MAX_P2P_SEND_TENSOR_NUM];
+  // Explicit tail padding for the size static_asserts in the constructor.
+  int8_t _padding[40];
+};
+
+// Sends `tensor_list` through this rank's shared-memory buffer.
+//
+// A TensorListMeta is constructed inside an extra int8 tensor that is placed
+// at the front of the list, and the descriptor is then bound to the whole
+// list including that tensor. The transferred byte stream therefore begins
+// with the metadata itself, followed by the payload of each tensor.
+//
+//   ctx         - shared-memory context of the sending rank
+//   tensor_list - tensors to send; each must be contiguous (enforced by
+//                 TensorListMeta::bind_tensor_list)
+void shm_send_tensor_list_impl(ThreadSHMContext* ctx,
+                               const std::vector<torch::Tensor>& tensor_list) {
+  CPU_KERNEL_GUARD_IN(shm_send_tensor_list_impl)
+  std::vector<torch::Tensor> tensor_list_with_metadata;
+  tensor_list_with_metadata.reserve(1 + tensor_list.size());
+
+  // Slot 0: raw storage that will hold the TensorListMeta object.
+  auto options = torch::TensorOptions().dtype(torch::kInt8).device(torch::kCPU);
+  tensor_list_with_metadata.emplace_back(
+      torch::empty({sizeof(TensorListMeta)}, options));
+  tensor_list_with_metadata.insert(tensor_list_with_metadata.end(),
+                                   tensor_list.begin(), tensor_list.end());
+
+  torch::Tensor& metadata_tensor = tensor_list_with_metadata[0];
+  TORCH_CHECK_EQ(metadata_tensor.nbytes(), sizeof(TensorListMeta));
+
+  // Construct the descriptor in place inside the tensor storage, then
+  // describe every tensor of the combined list (the metadata tensor too).
+  TensorListMeta* metadata = new (metadata_tensor.data_ptr()) TensorListMeta();
+  metadata->bind_tensor_list(tensor_list_with_metadata);
+
+  shm_cc_ops::shm_cc_loop<int8_t>(
+      ctx, metadata->total_bytes,
+      [&](ThreadSHMContext* thread_ctx, int64_t data_offset,
+          int64_t data_elem_num) {
+        int rank = thread_ctx->rank;
+        // Wait until the receiver sets the stat to DONE
+        // NOTE(review): wait_for_one is given SHM_DATA_READY here; it
+        // presumably blocks while the stat still equals that value --
+        // confirm against its definition.
+        thread_ctx->wait_for_one(rank, ThreadSHMStat::SHM_DATA_READY);
+
+        // Copy this chunk of the flat byte stream into the shm buffer. A
+        // chunk may span several tensors, so it is copied fragment by
+        // fragment, each fragment clipped to what is left of the chunk.
+        int64_t curr_shm_offset = 0;
+        while (curr_shm_offset < data_elem_num) {
+          MemPiece frag = metadata->get_data(data_offset + curr_shm_offset);
+          frag.size = std::min(frag.size, data_elem_num - curr_shm_offset);
+          shm_cc_ops::memcpy(
+              thread_ctx->get_thread_shm_ptr<int8_t>(rank) + curr_shm_offset,
+              frag.ptr, frag.size);
+          curr_shm_offset += frag.size;
+        }
+
+        // Mark the chunk as available.
+        thread_ctx->set_thread_stat(rank, ThreadSHMStat::SHM_DATA_READY);
+      });
+}
+
+// Receives a tensor list from rank `src` through that rank's shared-memory
+// buffer.
+//
+// The TensorListMeta at the start of the sender's buffer is copied first and
+// used to allocate matching flat CPU tensors. The full byte stream (metadata
+// tensor first, then the payload) is then copied into them. The leading
+// metadata tensor is dropped from the returned list.
+//
+//   ctx - shared-memory context of the receiving rank
+//   src - rank whose shared-memory buffer is read
+std::vector<torch::Tensor> shm_recv_tensor_list_impl(ThreadSHMContext* ctx,
+                                                     int64_t src) {
+  CPU_KERNEL_GUARD_IN(shm_recv_tensor_list_impl)
+  auto options = torch::TensorOptions().dtype(torch::kInt8).device(torch::kCPU);
+  torch::Tensor metadata_tensor =
+      torch::empty({sizeof(TensorListMeta)}, options);
+
+  // Wait until the sender sets the stat of thread 0 to SHM_DATA_READY
+  // NOTE(review): wait_for_one is given DONE here; it presumably blocks
+  // while the stat still equals that value -- confirm against its definition.
+  ctx->wait_for_one(src, ThreadSHMStat::DONE);
+  // Take a private copy of the sender's descriptor.
+  shm_cc_ops::memcpy(metadata_tensor.data_ptr(),
+                     ctx->get_thread_shm_ptr<void>(src),
+                     sizeof(TensorListMeta));
+  TensorListMeta* src_metadata =
+      reinterpret_cast<TensorListMeta*>(metadata_tensor.data_ptr());
+  // Allocate local tensors with the sizes and dtypes the sender recorded.
+  std::vector<torch::Tensor> tensor_list_with_metadata =
+      src_metadata->generate_tensor_list();
+
+  // Bind a fresh local descriptor to the new tensors so get_data() resolves
+  // to local addresses, and verify that it matches the sender's layout.
+  TensorListMeta metadata;
+  metadata.bind_tensor_list(tensor_list_with_metadata);
+  TORCH_CHECK_EQ(metadata.tensor_num, src_metadata->tensor_num);
+  TORCH_CHECK_EQ(metadata.total_bytes, src_metadata->total_bytes);
+
+  shm_cc_ops::shm_cc_loop<int8_t>(
+      ctx, metadata.total_bytes,
+      [&](ThreadSHMContext* thread_ctx, int64_t data_offset,
+          int64_t data_elem_num) {
+        // Wait until the sender sets the stat to SHM_DATA_READY
+        thread_ctx->wait_for_one(src, ThreadSHMStat::DONE);
+        // Scatter this chunk of the flat byte stream into the destination
+        // tensors, fragment by fragment.
+        int64_t curr_shm_offset = 0;
+        while (curr_shm_offset < data_elem_num) {
+          MemPiece frag = metadata.get_data(data_offset + curr_shm_offset);
+          frag.size = std::min(frag.size, data_elem_num - curr_shm_offset);
+          shm_cc_ops::memcpy(
+              frag.ptr,
+              thread_ctx->get_thread_shm_ptr<int8_t>(src) + curr_shm_offset,
+              frag.size);
+          curr_shm_offset += frag.size;
+        }
+
+        // Mark the sender's buffer as consumed.
+        thread_ctx->set_thread_stat(src, ThreadSHMStat::DONE);
+      });
+
+  // Entry 0 is the metadata tensor; return only the payload tensors.
+  std::vector<torch::Tensor> tensor_list;
+  tensor_list.reserve(metadata.tensor_num - 1);
+  tensor_list.insert(tensor_list.begin(), tensor_list_with_metadata.begin() + 1,
+                     tensor_list_with_metadata.end());
+
+  return tensor_list;
+}
+}  // namespace
+
+// Op entry point for the shared-memory gather.
+//
+//   handle  - handle identifying the SHMManager instance to use
+//   data    - this rank's contribution; must be contiguous
+//   outputs - optional list of destination tensors (at most
+//             MAX_SHM_RANK_NUM); when absent, a null pointer table is
+//             forwarded to shm_gather_impl
+//   dst     - rank that receives the gathered data
+void shm_gather(int64_t handle, torch::Tensor& data,
+                const std::optional<std::vector<torch::Tensor>>& outputs,
+                int64_t dst) {
+  TORCH_CHECK(data.is_contiguous())
+  VLLM_DISPATCH_FLOATING_TYPES(data.scalar_type(), "shm_gather_impl", [&] {
+    CPU_KERNEL_GUARD_IN(shm_gather_impl)
+
+    // Destination pointer table; stays all-null and unused when no output
+    // list was supplied.
+    scalar_t* rank_outputs[MAX_SHM_RANK_NUM] = {nullptr};
+    scalar_t** rank_outputs_arg = nullptr;
+    if (outputs.has_value()) {
+      TORCH_CHECK_LE(outputs->size(), MAX_SHM_RANK_NUM);
+      int slot = 0;
+      for (const torch::Tensor& out_tensor : *outputs) {
+        rank_outputs[slot++] = out_tensor.data_ptr<scalar_t>();
+      }
+      rank_outputs_arg = rank_outputs;
+    }
+
+    auto ctx = SHMManager::get_singleton_instance(handle)->get_shm_ctx();
+    shm_gather_impl(ctx, data.data_ptr<scalar_t>(), data.numel(),
+                    rank_outputs_arg, dst);
+
+    CPU_KERNEL_GUARD_OUT(shm_gather_impl)
+  });
+}
+
+// Op entry point for the shared-memory all-gather.
+//
+// `output` is treated as group_size consecutive slices of data.numel()
+// elements; slice i receives the contribution of rank i. It is implemented
+// as a gather in which every rank passes itself as the destination.
+//
+//   handle - handle identifying the SHMManager instance to use
+//   data   - this rank's contribution; must be contiguous and non-empty
+//   output - destination; must be contiguous and hold exactly
+//            group_size * data.numel() elements
+void shm_all_gather(int64_t handle, const torch::Tensor& data,
+                    torch::Tensor& output) {
+  TORCH_CHECK(data.is_contiguous())
+  TORCH_CHECK(output.is_contiguous())
+
+  const int64_t input_elem_num = data.numel();
+  const int64_t output_elem_num = output.numel();
+  // An empty input would make the modulo and division below divide by zero,
+  // so reject it with a regular error instead.
+  TORCH_CHECK(input_elem_num > 0,
+              "shm_all_gather: input tensor must not be empty");
+  TORCH_CHECK_EQ(output_elem_num % input_elem_num, 0);
+  const int world_size = output_elem_num / input_elem_num;
+
+  VLLM_DISPATCH_FLOATING_TYPES(data.scalar_type(), "shm_all_gather_impl", [&] {
+    CPU_KERNEL_GUARD_IN(shm_all_gather_impl)
+    auto ctx = SHMManager::get_singleton_instance(handle)->get_shm_ctx();
+    // The slice count implied by the tensor sizes must match the group.
+    TORCH_CHECK_EQ(ctx->group_size, world_size);
+
+    // Slice i of `output` is the destination for rank i.
+    scalar_t* output_ptrs[MAX_SHM_RANK_NUM] = {nullptr};
+    for (int i = 0; i < world_size; ++i) {
+      output_ptrs[i] = output.data_ptr<scalar_t>() + i * input_elem_num;
+    }
+    shm_gather_impl(ctx, data.data_ptr<scalar_t>(), data.numel(), output_ptrs,
+                    ctx->rank);
+    CPU_KERNEL_GUARD_OUT(shm_all_gather_impl)
+  });
+}
+
+// Op entry point for the shared-memory sum all-reduce; `data` must be
+// contiguous and is reduced in place by shm_allreduce_sum.
+void shm_allreduce(int64_t handle, torch::Tensor& data) {
+  TORCH_CHECK(data.is_contiguous())
+  VLLM_DISPATCH_FLOATING_TYPES(data.scalar_type(), "shm_allreduce_sum", [&] {
+    CPU_KERNEL_GUARD_IN(shm_allreduce_sum)
+    auto ctx = SHMManager::get_singleton_instance(handle)->get_shm_ctx();
+    scalar_t* payload = data.data_ptr<scalar_t>();
+    const auto payload_elems = data.numel();
+    shm_allreduce_sum(ctx, payload, payload_elems);
+    CPU_KERNEL_GUARD_OUT(shm_allreduce_sum)
+  });
+}
+
+// Op entry point for sending a tensor list over shared memory. `dst` is part
+// of the op signature but is not read by this function.
+void shm_send_tensor_list(int64_t handle,
+                          const std::vector<torch::Tensor>& tensor_list,
+                          int64_t dst) {
+  CPU_KERNEL_GUARD_IN(shm_send_tensor_list)
+  auto ctx = SHMManager::get_singleton_instance(handle)->get_shm_ctx();
+  shm_send_tensor_list_impl(ctx, tensor_list);
+  CPU_KERNEL_GUARD_OUT(shm_send_tensor_list)
+}
+
+// Op entry point for receiving a tensor list from rank `src` over shared
+// memory; returns the tensors produced by shm_recv_tensor_list_impl.
+std::vector<torch::Tensor> shm_recv_tensor_list(int64_t handle, int64_t src) {
+  CPU_KERNEL_GUARD_IN(shm_recv_tensor_list)
+  auto ctx = SHMManager::get_singleton_instance(handle)->get_shm_ctx();
+  auto received = shm_recv_tensor_list_impl(ctx, src);
+  CPU_KERNEL_GUARD_OUT(shm_recv_tensor_list)
+  return received;
+}
+
+// Creates an SHMManager instance for (name, group_size, rank) and returns
+// the handle that the other shm_* ops take as their first argument.
+int64_t init_shm_manager(const std::string& name, const int64_t group_size,
+                         const int64_t rank) {
+  const int64_t manager_handle =
+      SHMManager::create_singleton_instance(name, group_size, rank);
+  return manager_handle;
+}
+
+// Joins the SHMManager identified by `handle` to the shared-memory object
+// called `name` and returns the textual form of the resulting context.
+// Fails the check if no manager is registered under `handle`.
+std::string join_shm_manager(int64_t handle, const std::string& name) {
+  auto shm_manager = SHMManager::get_singleton_instance(handle);
+  TORCH_CHECK(shm_manager);
+  shm_manager->join(name);
+  auto joined_ctx = shm_manager->get_shm_ctx();
+  return joined_ctx->to_string();
+}
--- a/csrc/cpu/torch_bindings.cpp
+++ b/csrc/cpu/torch_bindings.cpp
@ -22,6 +22,26 @@ void mla_decode_kvcache(torch::Tensor& out, torch::Tensor& query,
                        torch::Tensor& kv_cache, double scale,
                        torch::Tensor& block_tables, torch::Tensor& seq_lens);

+int64_t init_shm_manager(const std::string& name, const int64_t group_size,
+                         const int64_t rank);
+
+std::string join_shm_manager(int64_t handle, const std::string& name);
+
+void shm_allreduce(int64_t handle, torch::Tensor& data);
+
+void shm_gather(int64_t handle, torch::Tensor& data,
+                const std::optional<std::vector<torch::Tensor>>& outputs,
+                int64_t dst);
+
+void shm_all_gather(int64_t handle, const torch::Tensor& data,
+                    torch::Tensor& output);
+
+void shm_send_tensor_list(int64_t handle,
+                          const std::vector<torch::Tensor>& tensor_list,
+                          int64_t dst);
+
+std::vector<torch::Tensor> shm_recv_tensor_list(int64_t handle, int64_t src);
+
 TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  // vLLM custom ops

@ -131,6 +151,29 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
      "                  Tensor? azp, Tensor? bias) -> ()");
  ops.impl("cutlass_scaled_mm_azp", torch::kCPU, &int8_scaled_mm_azp);
 #endif
+
+// SHM CCL
+#ifdef __AVX512F__
+  ops.def("init_shm_manager(str name, int group_size, int rank) -> int",
+          &init_shm_manager);
+  ops.def("join_shm_manager(int handle, str name) -> str", &join_shm_manager);
+  ops.def("shm_allreduce(int handle, Tensor! data) -> ()");
+  ops.impl("shm_allreduce", torch::kCPU, &shm_allreduce);
+  ops.def(
+      "shm_gather(int handle, Tensor data, Tensor[](a!)? outputs, int dst) -> "
+      "()");
+  ops.impl("shm_gather", torch::kCPU, &shm_gather);
+  ops.def(
+      "shm_all_gather(int handle, Tensor data, Tensor! output) -> "
+      "()");
+  ops.impl("shm_all_gather", torch::kCPU, &shm_all_gather);
+  ops.def(
+      "shm_send_tensor_list(int handle, Tensor[](a) tensor_list, int dst) -> "
+      "()");
+  ops.impl("shm_send_tensor_list", torch::kCPU, &shm_send_tensor_list);
+  ops.def("shm_recv_tensor_list(int handle, int src) -> Tensor[](a)",
+          &shm_recv_tensor_list);
+#endif
 }

 TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
--- a/csrc/cpu/utils.cpp
+++ b/csrc/cpu/utils.cpp
@ -18,7 +18,7 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) {

 #ifndef VLLM_NUMA_DISABLED
 std::string init_cpu_threads_env(const std::string& cpu_ids) {
-  bitmask* omp_cpu_mask = numa_parse_cpustring(cpu_ids.c_str());
+  bitmask* omp_cpu_mask = numa_parse_cpustring_all(cpu_ids.c_str());
  TORCH_CHECK(omp_cpu_mask->size > 0);
  std::vector<int> omp_cpu_ids;
  omp_cpu_ids.reserve(omp_cpu_mask->size);
--- a/csrc/custom_all_reduce.cu
+++ b/csrc/custom_all_reduce.cu
@ -12,7 +12,7 @@ static_assert(sizeof(void*) == sizeof(fptr_t));

 fptr_t init_custom_ar(const std::vector<fptr_t>& fake_ipc_ptrs,
                      torch::Tensor& rank_data, int64_t rank,
-                      bool full_nvlink) {
+                      bool fully_connected) {
  int world_size = fake_ipc_ptrs.size();
  if (world_size > 8)
    throw std::invalid_argument("world size > 8 is not supported");
@ -27,7 +27,7 @@ fptr_t init_custom_ar(const std::vector<fptr_t>& fake_ipc_ptrs,
  }
  return (fptr_t) new vllm::CustomAllreduce(ipc_ptrs, rank_data.data_ptr(),
                                            rank_data.numel(), rank, world_size,
-                                            full_nvlink);
+                                            fully_connected);
 }

 /**
@ -142,3 +142,48 @@ void register_graph_buffers(fptr_t _fa,
  bytes.reserve(handles.size());
  fa->register_graph_buffers(bytes, offsets);
 }
+
+// Allocates a zero-filled device buffer of `size` bytes on the current
+// device and creates an IPC memory handle for it.
+//
+// Returns a tuple of:
+//   - the device address of the buffer, encoded as fptr_t
+//   - a CPU uint8 tensor of sizeof(cudaIpcMemHandle_t) bytes holding the
+//     handle (to be consumed by open_mem_handle)
+//
+// The buffer is not released here; free_shared_buffer frees such a pointer.
+std::tuple<fptr_t, torch::Tensor> allocate_shared_buffer_and_handle(
+    int64_t size) {
+  auto device_index = c10::cuda::current_device();
+  at::DeviceGuard device_guard(at::Device(at::DeviceType::CUDA, device_index));
+  void* buffer;
+  // Switch this thread to relaxed stream-capture mode around the allocation.
+  // Per the CUDA runtime API the call swaps modes and stores the previous
+  // one back into `mode`, so the second call below restores it.
+  cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
+  auto stream = c10::cuda::getCurrentCUDAStream().stream();
+  AT_CUDA_CHECK(cudaThreadExchangeStreamCaptureMode(&mode));
+
+  // Allocate buffer
+#if defined(USE_ROCM)
+  // data buffers need to be "uncached" for signal on MI200
+  AT_CUDA_CHECK(
+      hipExtMallocWithFlags((void**)&buffer, size, hipDeviceMallocUncached));
+#else
+  AT_CUDA_CHECK(cudaMalloc((void**)&buffer, size));
+#endif
+  // Zero the buffer on the current stream and wait for that to complete
+  // before the capture mode is swapped back.
+  AT_CUDA_CHECK(cudaMemsetAsync(buffer, 0, size, stream));
+  AT_CUDA_CHECK(cudaStreamSynchronize(stream));
+  AT_CUDA_CHECK(cudaThreadExchangeStreamCaptureMode(&mode));
+
+  // Create IPC memhandle for the allocated buffer.
+  // Will use it in open_mem_handle.
+  auto options =
+      torch::TensorOptions().dtype(torch::kUInt8).device(torch::kCPU);
+  auto handle =
+      torch::empty({static_cast<int64_t>(sizeof(cudaIpcMemHandle_t))}, options);
+  // The handle struct is written directly into the tensor's storage.
+  AT_CUDA_CHECK(
+      cudaIpcGetMemHandle((cudaIpcMemHandle_t*)handle.data_ptr(), buffer));
+
+  return std::make_tuple(reinterpret_cast<fptr_t>(buffer), handle);
+}
+
+// Opens an IPC memory handle and returns the mapped device address encoded
+// as fptr_t.
+//
+//   mem_handle - CPU tensor whose bytes are a cudaIpcMemHandle_t, as
+//                produced by allocate_shared_buffer_and_handle
+//
+// Raises if the tensor cannot be read as a handle on the host, or if the
+// runtime fails to open the handle.
+fptr_t open_mem_handle(torch::Tensor& mem_handle) {
+  // The handle bytes are dereferenced on the host below, so the tensor must
+  // live in CPU memory and contain one complete, densely packed handle.
+  TORCH_CHECK(mem_handle.is_cpu(),
+              "open_mem_handle: mem_handle must be a CPU tensor");
+  TORCH_CHECK(mem_handle.is_contiguous() &&
+                  mem_handle.nbytes() >= sizeof(cudaIpcMemHandle_t),
+              "open_mem_handle: mem_handle must be contiguous and hold at "
+              "least sizeof(cudaIpcMemHandle_t) bytes");
+  void* ipc_ptr;
+  AT_CUDA_CHECK(cudaIpcOpenMemHandle(
+      (void**)&ipc_ptr, *((const cudaIpcMemHandle_t*)mem_handle.data_ptr()),
+      cudaIpcMemLazyEnablePeerAccess));
+  return reinterpret_cast<fptr_t>(ipc_ptr);
+}
+
+// Frees a device buffer given its address encoded as fptr_t.
+void free_shared_buffer(fptr_t buffer) {
+  void* device_mem = reinterpret_cast<void*>(buffer);
+  AT_CUDA_CHECK(cudaFree(device_mem));
+}
--- a/csrc/custom_all_reduce.cuh
+++ b/csrc/custom_all_reduce.cuh
@ -5,6 +5,10 @@
 #include <cuda_fp16.h>
 #include <cuda_runtime.h>

+#if defined(USE_ROCM)
+typedef __hip_bfloat16 nv_bfloat16;
+#endif
+
 #include <iostream>
 #include <array>
 #include <limits>
@ -12,6 +16,7 @@
 #include <unordered_map>
 #include <vector>

+namespace vllm {
 #define CUDACHECK(cmd)                                              \
  do {                                                              \
    cudaError_t e = cmd;                                            \
@ -22,24 +27,37 @@
    }                                                               \
  } while (0)

-namespace vllm {
-
+// Maximal number of blocks in allreduce kernel.
 constexpr int kMaxBlocks = 36;
+
+// Platform-specific defaults.
+//  - defaultBlockLimit: default number of blocks in the allreduce kernel.
+//  - rangeStartAddrAttr: pointer attribute used to query the start address
+//    of the allocation that contains a pointer.
+// Both are const so that they have internal linkage: this is a header, and a
+// non-const namespace-scope variable defined here would be defined again in
+// every translation unit that includes it.
+#ifndef USE_ROCM
+const int defaultBlockLimit = 36;
+const CUpointer_attribute rangeStartAddrAttr =
+    CU_POINTER_ATTRIBUTE_RANGE_START_ADDR;
+#else
+const int defaultBlockLimit = 16;
+const hipPointer_attribute rangeStartAddrAttr =
+    HIP_POINTER_ATTRIBUTE_RANGE_START_ADDR;
+#endif
+
 // Counter may overflow, but it's fine since unsigned int overflow is
 // well-defined behavior.
 using FlagType = uint32_t;
+
+// Two sets of peer counters are needed for two syncs: starting and ending an
+// operation. The reason is that it's possible for a peer GPU block to arrive
+// at the second sync point while the current GPU block hasn't passed the
+// first sync point. Thus, the peer GPU may write counter+1 while the current
+// GPU is busy waiting for counter. We use separate start and end counter
+// arrays to avoid this possibility.
 struct Signal {
-  alignas(128) FlagType self_counter[kMaxBlocks][8];
-  // Two sets of peer counters are needed for two syncs. The reason is that
-  // it's possible for peer GPU block to arrive at the second sync point while
-  // the current GPU block haven't passed the first sync point. Thus, peer GPU
-  // may write counter+1 while current GPU is busy waiting for counter. We use
-  // alternating counter array to avoid this possibility.
-  alignas(128) FlagType peer_counter[2][kMaxBlocks][8];
+  alignas(128) FlagType start[kMaxBlocks][8];
+  alignas(128) FlagType end[kMaxBlocks][8];
+  alignas(128) FlagType _flag[kMaxBlocks];  // incremental flags for each rank
 };

 struct __align__(16) RankData {
-  const void* __restrict__ ptrs[8];
+  const void* ptrs[8];
 };

 struct __align__(16) RankSignals {
@ -134,27 +152,29 @@ DINLINE O downcast(array_t<float, O::size> val) {
  }
 }

+#if !defined(USE_ROCM)
+
 static DINLINE void st_flag_release(FlagType* flag_addr, FlagType flag) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
+  #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
  asm volatile("st.release.sys.global.u32 [%1], %0;" ::"r"(flag),
               "l"(flag_addr));
-#else
+  #else
  asm volatile("membar.sys; st.volatile.global.u32 [%1], %0;" ::"r"(flag),
               "l"(flag_addr));
-#endif
+  #endif
 }

 static DINLINE FlagType ld_flag_acquire(FlagType* flag_addr) {
  FlagType flag;
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
+  #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
  asm volatile("ld.acquire.sys.global.u32 %0, [%1];"
               : "=r"(flag)
               : "l"(flag_addr));
-#else
+  #else
  asm volatile("ld.volatile.global.u32 %0, [%1]; membar.gl;"
               : "=r"(flag)
               : "l"(flag_addr));
-#endif
+  #endif
  return flag;
 }

@ -170,37 +190,99 @@ static DINLINE FlagType ld_flag_volatile(FlagType* flag_addr) {
  return flag;
 }

-// is_start: whether this is the very first synchronization barrier.
-// need_fence: whether a memory fence is needed. If true, a release-acquire
-// semantic is used to enforce memory access order before and after this
-// barrier.
-template <int ngpus, bool is_start, bool need_fence = false>
-DINLINE void multi_gpu_barrier(const RankSignals& sg, Signal* self_sg,
-                               int rank) {
-  if constexpr (!is_start) __syncthreads();
-  static_assert(
-      !(is_start && need_fence));  // Start barrier shouldn't need fence.
+// This function is meant to be used as the first synchronization in the all
+// reduce kernel. Thus, it doesn't need to make any visibility guarantees for
+// prior memory accesses. Note: volatile writes will not be reordered against
+// other volatile writes.
+template <int ngpus>
+DINLINE void barrier_at_start(const RankSignals& sg, Signal* self_sg,
+                              int rank) {
+  uint32_t flag = self_sg->_flag[blockIdx.x] + 1;
  if (threadIdx.x < ngpus) {
-    // Increment the counter. Technically we only need one counter, but we use
-    // multiple per block to eliminate the need to share the counter via smem.
-    auto val = self_sg->self_counter[blockIdx.x][threadIdx.x] += 1;
+    auto peer_counter_ptr = &sg.signals[threadIdx.x]->start[blockIdx.x][rank];
+    auto self_counter_ptr = &self_sg->start[blockIdx.x][threadIdx.x];
+    // Write the expected counter value to peer and wait for correct value
+    // from peer.
+    st_flag_volatile(peer_counter_ptr, flag);
+    while (ld_flag_volatile(self_counter_ptr) != flag);
+  }
+  __syncthreads();
+  // use one thread to update flag
+  if (threadIdx.x == 0) self_sg->_flag[blockIdx.x] = flag;
+}
+
+// This function is meant to be used as the second or the final
+// synchronization barrier in the all reduce kernel. If it's the final
+// synchronization barrier, we don't need to make any visibility guarantees
+// for prior memory accesses.
+template <int ngpus, bool final_sync = false>
+DINLINE void barrier_at_end(const RankSignals& sg, Signal* self_sg, int rank) {
+  __syncthreads();
+  uint32_t flag = self_sg->_flag[blockIdx.x] + 1;
+  if (threadIdx.x < ngpus) {
+    auto peer_counter_ptr = &sg.signals[threadIdx.x]->end[blockIdx.x][rank];
+    auto self_counter_ptr = &self_sg->end[blockIdx.x][threadIdx.x];
    // Write the expected counter value to peer and wait for correct value from
    // peer.
-    auto peer_counter_ptr =
-        &sg.signals[threadIdx.x]->peer_counter[val % 2][blockIdx.x][rank];
-    auto self_counter_ptr =
-        &self_sg->peer_counter[val % 2][blockIdx.x][threadIdx.x];
-    if constexpr (need_fence) {
-      st_flag_release(peer_counter_ptr, val);
-      while (ld_flag_acquire(self_counter_ptr) != val);
+    if constexpr (!final_sync) {
+      st_flag_release(peer_counter_ptr, flag);
+      while (ld_flag_acquire(self_counter_ptr) != flag);
    } else {
-      st_flag_volatile(peer_counter_ptr, val);
-      while (ld_flag_volatile(self_counter_ptr) != val);
+      st_flag_volatile(peer_counter_ptr, flag);
+      while (ld_flag_volatile(self_counter_ptr) != flag);
    }
  }
-  if constexpr (is_start || need_fence) __syncthreads();
+  if constexpr (!final_sync) __syncthreads();
+
+  // use one thread to update flag
+  if (threadIdx.x == 0) self_sg->_flag[blockIdx.x] = flag;
 }

+#else
+
+// ROCm variant of the opening barrier of the allreduce kernel.
+//
+// Each block keeps a counter in self_sg->_flag[blockIdx.x]; this barrier
+// uses counter + 1 as its flag value. The first `ngpus` threads of the block
+// each handle one peer: thread t writes the flag into slot
+// start[blockIdx.x][rank] of peer t's Signal, then spins on its own slot
+// start[blockIdx.x][t] until peer t's write becomes visible. All accesses
+// use relaxed ordering.
+//
+//   sg      - Signal pointers of all ranks
+//   self_sg - this rank's Signal
+//   rank    - index of this rank
+//
+// NOTE(review): the wait loop compares with `<` rather than `!=`. FlagType
+// is a 32-bit unsigned counter, so after it wraps around a stale large value
+// would already satisfy the exit condition -- confirm whether wraparound
+// needs handling here.
+template <int ngpus>
+DINLINE void barrier_at_start(const RankSignals& sg, Signal* self_sg,
+                              int rank) {
+  // Read by every thread before thread 0 updates the counter below.
+  uint32_t flag = self_sg->_flag[blockIdx.x] + 1;
+  if (threadIdx.x < ngpus) {
+    // simultaneously write to the corresponding flag of all ranks.
+    // Latency = 1 p2p write
+    __scoped_atomic_store_n(&sg.signals[threadIdx.x]->start[blockIdx.x][rank],
+                            flag, __ATOMIC_RELAXED, __MEMORY_SCOPE_SYSTEM);
+    // wait until the peer handled by this thread has written a value >= flag
+    while (__scoped_atomic_load_n(&self_sg->start[blockIdx.x][threadIdx.x],
+                                  __ATOMIC_RELAXED,
+                                  __MEMORY_SCOPE_DEVICE) < flag);
+  }
+  // Hold the rest of the block until the polling threads are done.
+  __syncthreads();
+  // use one thread to update flag
+  if (threadIdx.x == 0) self_sg->_flag[blockIdx.x] = flag;
+}
+
+// ROCm variant of the second/closing barrier of the allreduce kernel.
+//
+// It follows the same handshake as barrier_at_start but uses the `end`
+// counters, and the block is synchronized before signalling so that every
+// thread has finished its preceding work.
+//
+//   final_sync - when false, the store uses release and the load acquire
+//                ordering, and the block is synchronized again after the
+//                wait; when true, both are relaxed and the trailing block
+//                sync is skipped
+//
+// NOTE(review): as in barrier_at_start, the wait loop uses `<` on a 32-bit
+// unsigned counter -- confirm the behavior on wraparound.
+template <int ngpus, bool final_sync = false>
+DINLINE void barrier_at_end(const RankSignals& sg, Signal* self_sg, int rank) {
+  // Make sure the whole block is done before peers are notified.
+  __syncthreads();
+  uint32_t flag = self_sg->_flag[blockIdx.x] + 1;
+  if (threadIdx.x < ngpus) {
+    // simultaneously write to the corresponding flag of all ranks.
+    // Latency = 1 p2p write
+    __scoped_atomic_store_n(&sg.signals[threadIdx.x]->end[blockIdx.x][rank],
+                            flag,
+                            final_sync ? __ATOMIC_RELAXED : __ATOMIC_RELEASE,
+                            __MEMORY_SCOPE_SYSTEM);
+    // wait until the peer handled by this thread has written a value >= flag
+    while (
+        __scoped_atomic_load_n(&self_sg->end[blockIdx.x][threadIdx.x],
+                               final_sync ? __ATOMIC_RELAXED : __ATOMIC_ACQUIRE,
+                               __MEMORY_SCOPE_DEVICE) < flag);
+  }
+  if constexpr (!final_sync) __syncthreads();
+  // use one thread to update flag
+  if (threadIdx.x == 0) self_sg->_flag[blockIdx.x] = flag;
+}
+
+#endif
+
 template <typename P, int ngpus, typename A>
 DINLINE P packed_reduce(const P* ptrs[], int idx) {
  A tmp = upcast(ptrs[0][idx]);
@ -220,13 +302,13 @@ __global__ void __launch_bounds__(512, 1)
  // note: we don't reorder the address so the accumulation order is the same
  // for all ranks, ensuring bitwise identical results
  auto dp = *_dp;
-  multi_gpu_barrier<ngpus, true>(sg, self_sg, rank);
+  barrier_at_start<ngpus>(sg, self_sg, rank);
  // do the actual reduction
  for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size;
       idx += gridDim.x * blockDim.x) {
    ((P*)result)[idx] = packed_reduce<P, ngpus, A>((const P**)&dp.ptrs[0], idx);
  }
-  multi_gpu_barrier<ngpus, false>(sg, self_sg, rank);
+  barrier_at_end<ngpus, true>(sg, self_sg, rank);
 }

 template <typename P>
@ -255,18 +337,20 @@ __global__ void __launch_bounds__(512, 1)
    tmps[i] = get_tmp_buf<P>(sg.signals[target]);
  }
  auto tmp_out = tmps[0];
-  multi_gpu_barrier<ngpus, true>(sg, self_sg, rank);
+  barrier_at_start<ngpus>(sg, self_sg, rank);
+
  // stage 1: reduce scatter
  for (int idx = start + tid; idx < end; idx += stride) {
    tmp_out[idx - start] = packed_reduce<P, ngpus, A>(ptrs, idx);
  }
-  multi_gpu_barrier<ngpus, false, true>(sg, self_sg, rank);
+  barrier_at_end<ngpus>(sg, self_sg, rank);

  // stage 2: allgather. Note: it's important to match the tid between
  // the two stages, because visibility across devices is only guaranteed
  // between threads that have the same tid. If thread i computes the sum of
-  // start + i in the first stage, then thread i also gathers start + i from all
-  // ranks.
+  // start + i in the first stage, then thread i also gathers start + i from
+  // all ranks.
+
  for (int idx = tid; idx < largest_part; idx += stride) {
 #pragma unroll
    for (int i = 0; i < ngpus; i++) {
@ -287,21 +371,22 @@ class CustomAllreduce {
 public:
  int rank_;
  int world_size_;
-  bool full_nvlink_;
+  // Full NVLink or xGMI connection between GPUs.
+  bool fully_connected_;

  RankSignals sg_;
-  // Stores an map from a pointer to its peer pointters from all ranks.
+  // Stores a map from a pointer to its peer pointers from all ranks.
  std::unordered_map<void*, RankData*> buffers_;
  Signal* self_sg_;

  // Stores rank data from all ranks. This is mainly for cuda graph purposes.
  // For cuda graph to work, all kernel arguments must be fixed during graph
-  // capture time. However, the peer pointers are not known during graph capture
-  // time. Therefore, during capture, we increment the rank data pointer and use
-  // that as the argument to the kernel. The kernel arguments are stored in
-  // graph_unreg_buffers_. The actual peer pointers will be filled in at the
-  // memory pointed to by the pointers in graph_unreg_buffers_ when
-  // the IPC handles are exchanged between ranks.
+  // capture time. However, the peer pointers are not known during graph
+  // capture time. Therefore, during capture, we increment the rank data
+  // pointer and use that as the argument to the kernel. The kernel arguments
+  // are stored in graph_unreg_buffers_. The actual peer pointers will be
+  // filled in at the memory pointed to by the pointers in
+  // graph_unreg_buffers_ when the IPC handles are exchanged between ranks.
  //
  // The overall process looks like this:
  // 1. Graph capture.
@ -319,17 +404,18 @@ class CustomAllreduce {
   * Signals are an array of ipc-enabled buffers from all ranks.
   * For each of the buffer, the layout is as follows:
   * | -- sizeof(Signal) -- | ------ a few MB ----- |
-   * The first section is for allreduce synchronization, and the second section
-   * is for storing the intermediate results required by some allreduce algos.
+   * The first section is for allreduce synchronization, and the second
+   * section is for storing the intermediate results required by some
+   * allreduce algos.
   *
   * Note: this class does not own any device memory. Any required buffers
   * are passed in from the constructor.
   */
  CustomAllreduce(Signal** signals, void* rank_data, size_t rank_data_sz,
-                  int rank, int world_size, bool full_nvlink = true)
+                  int rank, int world_size, bool fully_connected = true)
      : rank_(rank),
        world_size_(world_size),
-        full_nvlink_(full_nvlink),
+        fully_connected_(fully_connected),
        self_sg_(signals[rank]),
        d_rank_data_base_(reinterpret_cast<RankData*>(rank_data)),
        d_rank_data_end_(d_rank_data_base_ + rank_data_sz / sizeof(RankData)) {
@ -361,8 +447,7 @@ class CustomAllreduce {
      void* base_ptr;
      // note: must share the base address of each allocation, or we get wrong
      // address
-      if (cuPointerGetAttribute(&base_ptr,
-                                CU_POINTER_ATTRIBUTE_RANGE_START_ADDR,
+      if (cuPointerGetAttribute(&base_ptr, rangeStartAddrAttr,
                                (CUdeviceptr)ptr) != CUDA_SUCCESS)
        throw std::runtime_error("failed to get pointer attr");
      CUDACHECK(cudaIpcGetMemHandle(
@ -396,11 +481,11 @@ class CustomAllreduce {

  // Note: when registering graph buffers, we intentionally choose to not
  // deduplicate the addresses. That means if the allocator reuses some
-  // addresses, they will be registered again. This is to account for the remote
-  // possibility of different allocation patterns between ranks. For example,
-  // rank 1 may get the same input address for the second allreduce, but rank 2
-  // got a different address. IPC handles have internal reference counting
-  // mechanism so overhead should be small.
+  // addresses, they will be registered again. This is to account for the
+  // remote possibility of different allocation patterns between ranks. For
+  // example, rank 1 may get the same input address for the second allreduce,
+  // but rank 2 got a different address. IPC handles have internal reference
+  // counting mechanism so overhead should be small.
  void register_graph_buffers(
      const std::vector<std::string>& handles,
      const std::vector<std::vector<int64_t>>& offsets) {
@ -431,15 +516,15 @@ class CustomAllreduce {
  /**
   * Performs allreduce, assuming input has already been registered.
   *
-   * Block and grid default configs are results after careful grid search. Using
-   * 36 blocks give the best or close to the best runtime on the devices I
-   * tried: A100, A10, A30, T4, V100. You'll notice that NCCL kernels also only
-   * take a small amount of SMs. Not quite sure the underlying reason, but my
-   * guess is that too many SMs will cause contention on NVLink bus.
+   * Block and grid default configs are results after careful grid search.
+   * Using 36 blocks gives the best or close to the best runtime on the devices
+   * I tried: A100, A10, A30, T4, V100. You'll notice that NCCL kernels also
+   * only take a small amount of SMs. Not quite sure the underlying reason,
+   * but my guess is that too many SMs will cause contention on NVLink bus.
   */
  template <typename T>
  void allreduce(cudaStream_t stream, T* input, T* output, int size,
-                 int threads = 512, int block_limit = 36) {
+                 int threads = 512, int block_limit = defaultBlockLimit) {
    auto d = packed_t<T>::P::size;
    if (size % d != 0)
      throw std::runtime_error(
@ -473,13 +558,11 @@ class CustomAllreduce {
 #define KL(ngpus, name)                                                       \
  name<T, ngpus><<<blocks, threads, 0, stream>>>(ptrs, sg_, self_sg_, output, \
                                                 rank_, size);
-    // TODO(hanzhi713): Threshold is different for A100 and H100.
-    // Add per device threshold.
 #define REDUCE_CASE(ngpus)                            \
  case ngpus: {                                       \
    if (world_size_ == 2) {                           \
      KL(ngpus, cross_device_reduce_1stage);          \
-    } else if (full_nvlink_) {                        \
+    } else if (fully_connected_) {                    \
      if ((world_size_ <= 4 && bytes < 512 * 1024) || \
          (world_size_ <= 8 && bytes < 256 * 1024)) { \
        KL(ngpus, cross_device_reduce_1stage);        \
@ -497,7 +580,8 @@ class CustomAllreduce {
      REDUCE_CASE(8)
      default:
        throw std::runtime_error(
-            "custom allreduce only supports num gpus in (2,4,6,8). Actual num "
+            "custom allreduce only supports num gpus in (2,4,6,8). Actual "
+            "num "
            "gpus = " +
            std::to_string(world_size_));
    }
@ -511,10 +595,11 @@ class CustomAllreduce {
    }
  }
 };
+
 /**
- * To inspect PTX/SASS, copy paste this header file to compiler explorer and add
- a template instantiation:
+ * To inspect PTX/SASS, copy paste this header file to compiler explorer and
+ add a template instantiation:
 * template void vllm::CustomAllreduce::allreduce<half>(cudaStream_t, half *,
 half *, int, int, int);
 */
-}  // namespace vllm
+}  // namespace vllm
--- a/csrc/custom_all_reduce_test.cu
+++ b/csrc/custom_all_reduce_test.cu
@ -1,9 +1,9 @@
 /**
 * This is a standalone test for custom allreduce.
 * To compile, make sure you have MPI and NCCL installed in your system.
- * export MPI_HOME=xxx
+ * export MPI_HOME=XXX
 * nvcc -O2 -arch=native -std=c++17 custom_all_reduce_test.cu -o
- * custom_all_reduce_test -lnccl -I${MPI_HOME} -lmpi
+ * custom_all_reduce_test -lnccl -I${MPI_HOME}/include -lmpi
 *
 * Warning: this C++ test is not designed to be very readable and was used
 * during the rapid prototyping process.
@ -22,7 +22,15 @@
 #include "cuda_profiler_api.h"
 #include "custom_all_reduce.cuh"
 #include "mpi.h"
-#include "nccl.h"
+#ifdef USE_ROCM
+  #include <hip/hip_bf16.h>
+typedef __hip_bfloat16 nv_bfloat16;
+  #include "rccl/rccl.h"
+  #include "custom_all_reduce_hip.cuh"
+#else
+  #include "nccl.h"
+  #include "custom_all_reduce.cuh"
+#endif

 #define MPICHECK(cmd)                                                  \
  do {                                                                 \
@ -43,16 +51,29 @@
    }                                                               \
  } while (0)

+#ifdef USE_ROCM
 __global__ void dummy_kernel() {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
+  for (int i = 0; i < 100; i++) {
+    uint64_t start = wall_clock64();
+    uint64_t cycles_elapsed;
+    do {
+      cycles_elapsed = wall_clock64() - start;
+    } while (cycles_elapsed < 100);
+  }
  for (int i = 0; i < 100; i++) __nanosleep(1000000);  // 100ms
+}
 #else
+__global__ void dummy_kernel() {
+  #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
+  for (int i = 0; i < 100; i++) __nanosleep(1000000);  // 100ms
+  #else
  for (int i = 0; i < 100; i++) {
    long long int start = clock64();
    while (clock64() - start < 150000000);  // approximately 98.4ms on P40
  }
-#endif
+  #endif
 }
+#endif

 template <typename T>
 __global__ void set_data(T* data, int size, int myRank) {
@ -121,8 +142,14 @@ void run(int myRank, int nRanks, ncclComm_t& comm, int threads, int block_limit,
   * registration, they are allocated and registered together in the test for
   * convenience.
   */
+#ifdef USE_ROCM
+  CUDACHECK(hipExtMallocWithFlags(
+      (void**)&buffer, 2 * data_size * sizeof(T) + sizeof(vllm::Signal),
+      hipDeviceMallocUncached));
+#else
  CUDACHECK(
      cudaMalloc(&buffer, 2 * data_size * sizeof(T) + sizeof(vllm::Signal)));
+#endif
  CUDACHECK(
      cudaMemset(buffer, 0, 2 * data_size * sizeof(T) + sizeof(vllm::Signal)));
  CUDACHECK(cudaMalloc(&self_data_copy, data_size * sizeof(T)));
@ -311,13 +338,18 @@ int main(int argc, char** argv) {

  bool performance_test = true;
  cudaProfilerStart();
-  // Uncomment to scan through different block size configs.
-  // for (int threads : {256, 512, 1024}) {
-  //   for (int block_limit = 16; block_limit < 112; block_limit += 4) {
-  //     run<half>(myRank, nRanks, comm, threads, block_limit, 1024 * 1024,
-  //     performance_test);
-  //   }
-  // }
+// Uncomment to scan through different block size configs.
+// for (int threads : {256, 512, 1024}) {
+//   for (int block_limit = 16; block_limit < 112; block_limit += 4) {
+//     run<half>(myRank, nRanks, comm, threads, block_limit, 1024 * 1024,
+//     performance_test);
+//   }
+// }
+  // Block limit passed to run() below: 16 on ROCm builds, 36 otherwise.
+#ifdef USE_ROCM
+  const int block_limit = 16;
+#else
+  const int block_limit = 36;
+#endif
  // Scan through different sizes to test performance.
  for (int sz = 512; sz <= (8 << 20); sz *= 2) {
-    run<half>(myRank, nRanks, comm, 512, 36, sz + 8 * 47, performance_test);
+    run<half>(myRank, nRanks, comm, 512, block_limit, sz + 8 * 47,
+              performance_test);
@ -326,4 +358,4 @@ int main(int argc, char** argv) {
  cudaProfilerStop();
  MPICHECK(MPI_Finalize());
  return EXIT_SUCCESS;
-}
+}
--- a/csrc/ops.h
+++ b/csrc/ops.h
@ -145,7 +145,8 @@ torch::Tensor permute_cols(torch::Tensor const& A, torch::Tensor const& perm);
 #endif

 torch::Tensor ggml_dequantize(torch::Tensor W, int64_t type, int64_t m,
-                              int64_t n);
+                              int64_t n,
+                              std::optional<at::ScalarType> const& dtype);

 torch::Tensor ggml_mul_mat_vec_a8(torch::Tensor W, torch::Tensor X,
                                  int64_t type, int64_t row);
@ -267,10 +268,10 @@ void causal_conv1d_fwd(const at::Tensor& x, const at::Tensor& weight,
                       const std::optional<at::Tensor>& has_initial_state,
                       bool silu_activation, int64_t pad_slot_id);

-#ifndef USE_ROCM
 using fptr_t = int64_t;
 fptr_t init_custom_ar(const std::vector<int64_t>& fake_ipc_ptrs,
-                      torch::Tensor& rank_data, int64_t rank, bool full_nvlink);
+                      torch::Tensor& rank_data, int64_t rank,
+                      bool fully_connected);
 void all_reduce(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out,
                fptr_t reg_buffer, int64_t reg_buffer_sz_bytes);
 void dispose(fptr_t _fa);
@ -281,4 +282,7 @@ get_graph_buffer_ipc_meta(fptr_t _fa);
 void register_graph_buffers(fptr_t _fa,
                            const std::vector<std::vector<int64_t>>& handles,
                            const std::vector<std::vector<int64_t>>& offsets);
-#endif
+std::tuple<int64_t, torch::Tensor> allocate_shared_buffer_and_handle(
+    int64_t size);
+int64_t open_mem_handle(torch::Tensor& mem_handle);
+void free_shared_buffer(int64_t buffer);
--- a/csrc/quantization/gguf/dequantize.cuh
+++ b/csrc/quantization/gguf/dequantize.cuh
@ -94,8 +94,8 @@ static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __
    dfloat2 v;
    dequantize_kernel(vx, ib, iqs, v);

-    y[iybs + iqs + 0]        = v.x;
-    y[iybs + iqs + y_offset] = v.y;
+    y[iybs + iqs + 0]        = convert_from_half<dst_t>(v.x);
+    y[iybs + iqs + y_offset] = convert_from_half<dst_t>(v.y);
 }

 template<typename dst_t>
@ -114,10 +114,10 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t

    half dall = __low2half(x[i].dm);
    half dmin = __high2half(x[i].dm);
-    y[l+ 0] = __hsub(__hmul(dall, __int2half_rn((x[i].scales[is+0] & 0xF) * ((q >> 0) & 3))), __hmul(dmin,  __int2half_rn(x[i].scales[is+0] >> 4)));
-    y[l+32] = __hsub(__hmul(dall, __int2half_rn((x[i].scales[is+2] & 0xF) * ((q >> 2) & 3))), __hmul(dmin,  __int2half_rn(x[i].scales[is+2] >> 4)));
-    y[l+64] = __hsub(__hmul(dall, __int2half_rn((x[i].scales[is+4] & 0xF) * ((q >> 4) & 3))), __hmul(dmin,  __int2half_rn(x[i].scales[is+4] >> 4)));
-    y[l+96] = __hsub(__hmul(dall, __int2half_rn((x[i].scales[is+6] & 0xF) * ((q >> 6) & 3))), __hmul(dmin,  __int2half_rn(x[i].scales[is+6] >> 4)));
+    y[l+ 0] = convert_from_half<dst_t>(__hsub(__hmul(dall, __int2half_rn((x[i].scales[is+0] & 0xF) * ((q >> 0) & 3))), __hmul(dmin,  __int2half_rn(x[i].scales[is+0] >> 4))));
+    y[l+32] = convert_from_half<dst_t>(__hsub(__hmul(dall, __int2half_rn((x[i].scales[is+2] & 0xF) * ((q >> 2) & 3))), __hmul(dmin,  __int2half_rn(x[i].scales[is+2] >> 4))));
+    y[l+64] = convert_from_half<dst_t>(__hsub(__hmul(dall, __int2half_rn((x[i].scales[is+4] & 0xF) * ((q >> 4) & 3))), __hmul(dmin,  __int2half_rn(x[i].scales[is+4] >> 4))));
+    y[l+96] = convert_from_half<dst_t>(__hsub(__hmul(dall, __int2half_rn((x[i].scales[is+6] & 0xF) * ((q >> 6) & 3))), __hmul(dmin,  __int2half_rn(x[i].scales[is+6] >> 4))));
 }

 template<typename dst_t>
@ -148,7 +148,9 @@ static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, dst_t
    const uint8_t * q = x[i].qs + 32*n;
    const uint8_t * hm = x[i].hmask;

-    for (int l = l0; l < l0+4; ++l) y[l] = __hmul(dl,  __int2half_rn((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4)));
+    for (int l = l0; l < l0+4; ++l) {
+        y[l] = convert_from_half<dst_t>(__hmul(dl,  __int2half_rn((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4))));
+    }
 }

 static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
@ -188,8 +190,8 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t
    const half d2 = __hmul(dall, __int2half_rn(sc));
    const half m2 = __hmul(dmin, __int2half_rn(m));
    for (int l = 0; l < n; ++l) {
-        y[l + 0] = __hsub(__hmul(d1, __int2half_rn(q[l] & 0xF)), m1);
-        y[l +32] = __hsub(__hmul(d2,  __int2half_rn(q[l] >> 4)), m2);
+        y[l + 0] = convert_from_half<dst_t>(__hsub(__hmul(d1, __int2half_rn(q[l] & 0xF)), m1));
+        y[l +32] = convert_from_half<dst_t>(__hsub(__hmul(d2,  __int2half_rn(q[l] >> 4)), m2));
    }
 }

@ -220,11 +222,11 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, dst_t
    const half d2 = __hmul(dall, __int2half_rn(sc)); const half m2 = __hmul(dmin, __int2half_rn(m));

    uint8_t   hm  = 1 << (2*il);
-    y[ 0] = __hsub(__hmul(d1, __int2half_rn((ql[0] & 0xF) + (qh[0] & hm ? 16 : 0))), m1);
-    y[ 1] = __hsub(__hmul(d1, __int2half_rn((ql[1] & 0xF) + (qh[1] & hm ? 16 : 0))), m1);
+    y[ 0] = convert_from_half<dst_t>(__hsub(__hmul(d1, __int2half_rn((ql[0] & 0xF) + (qh[0] & hm ? 16 : 0))), m1));
+    y[ 1] = convert_from_half<dst_t>(__hsub(__hmul(d1, __int2half_rn((ql[1] & 0xF) + (qh[1] & hm ? 16 : 0))), m1));
    hm <<= 1;
-    y[32] = __hsub(__hmul(d2, __int2half_rn((ql[0] >>  4) + (qh[0] & hm ? 16 : 0))), m2);
-    y[33] = __hsub(__hmul(d2, __int2half_rn((ql[1] >>  4) + (qh[1] & hm ? 16 : 0))), m2);
+    y[32] = convert_from_half<dst_t>(__hsub(__hmul(d2, __int2half_rn((ql[0] >>  4) + (qh[0] & hm ? 16 : 0))), m2));
+    y[33] = convert_from_half<dst_t>(__hsub(__hmul(d2, __int2half_rn((ql[1] >>  4) + (qh[1] & hm ? 16 : 0))), m2));
 }

 template<typename dst_t>
@ -247,10 +249,10 @@ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t
    const uint8_t   qh = x[i].qh[32*ip + il];
    const int8_t  * sc = x[i].scales + is;

-    y[ 0] = __hmul(d, __int2half_rn(sc[0] * ((int8_t)((ql[ 0] & 0xF) | (((qh >> 0) & 3) << 4)) - 32)));
-    y[32] = __hmul(d, __int2half_rn(sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32)));
-    y[64] = __hmul(d, __int2half_rn(sc[4] * ((int8_t)((ql[ 0]  >> 4) | (((qh >> 4) & 3) << 4)) - 32)));
-    y[96] = __hmul(d, __int2half_rn(sc[6] * ((int8_t)((ql[32]  >> 4) | (((qh >> 6) & 3) << 4)) - 32)));
+    y[ 0] = convert_from_half<dst_t>(__hmul(d, __int2half_rn(sc[0] * ((int8_t)((ql[ 0] & 0xF) | (((qh >> 0) & 3) << 4)) - 32))));
+    y[32] = convert_from_half<dst_t>(__hmul(d, __int2half_rn(sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32))));
+    y[64] = convert_from_half<dst_t>(__hmul(d, __int2half_rn(sc[4] * ((int8_t)((ql[ 0]  >> 4) | (((qh >> 4) & 3) << 4)) - 32))));
+    y[96] = convert_from_half<dst_t>(__hmul(d, __int2half_rn(sc[6] * ((int8_t)((ql[32]  >> 4) | (((qh >> 6) & 3) << 4)) - 32))));
 }

 template<typename dst_t>
@ -269,7 +271,7 @@ static __global__ void dequantize_block_iq2_xxs(const void * __restrict__ vx, ds
    const uint32_t aux32 = q2[2] | (q2[3] << 16);
    const float d = __half2float(x[i].d) * (0.5f + (aux32 >> 28)) * 0.25f;
    const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*il) & 127];
-    for (int j = 0; j < 8; ++j) y[j] = __float2half(d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f));
+    for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
 }

 template<typename dst_t>
@ -286,7 +288,7 @@ static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst
    const uint8_t  * grid = (const uint8_t *)(iq2xs_grid + (q2[il] & 511));
    const float d = __half2float(x[i].d) * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
    const uint8_t signs = ksigns_iq2xs[q2[il] >> 9];
-    for (int j = 0; j < 8; ++j) y[j] = __float2half(d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f));
+    for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);

 }

@ -303,7 +305,7 @@ static __global__ void dequantize_block_iq2_s(const void * __restrict__ vx, dst_
    const uint8_t * grid = (const uint8_t *)(iq2s_grid + (x[i].qs[4*ib+il] | ((x[i].qh[ib] << (8-2*il)) & 0x300)));
    const float d = __half2float(x[i].d) * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
    const uint8_t signs = x[i].qs[QK_K/8+4*ib+il];
-    for (int j = 0; j < 8; ++j) y[j] = __float2half(d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f));
+    for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
 }

 template<typename dst_t>
@ -324,8 +326,8 @@ static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, ds
    const float d = __half2float(x[i].d) * (0.5f + (aux32 >> 28)) * 0.5f;
    const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*il) & 127];
    for (int j = 0; j < 4; ++j) {
-        y[j+0] = __float2half(d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f));
-        y[j+4] = __float2half(d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f));
+        y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
+        y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
    }
 }

@ -345,8 +347,8 @@ static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_
    const float d = __half2float(x[i].d) * (0.5f + ((x[i].scales[ib/2] >> 4*(ib%2)) & 0xf)) * 0.5f;
    const uint8_t signs = x[i].signs[4*ib + il];
    for (int j = 0; j < 4; ++j) {
-        y[j+0] = __float2half(d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f));
-        y[j+4] = __float2half(d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f));
+        y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
+        y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
    }
 }

@ -367,7 +369,7 @@ static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_
    grid32[1] = (grid32[0] >> 4) & 0x0f0f0f0f;
    grid32[0] &= 0x0f0f0f0f;
    for (int j = 0; j < 8; ++j) {
-        y[j] = __float2half(d * (q[j] + delta));
+        y[j] = d * (q[j] + delta);
    }
 }

@ -392,7 +394,7 @@ static __global__ void dequantize_block_iq1_m(const void * __restrict__ vx, dst_
    grid32[1] = (grid32[0] >> 4) & 0x0f0f0f0f;
    grid32[0] &= 0x0f0f0f0f;
    for (int j = 0; j < 8; ++j) {
-        y[j] = __float2half(d * (q[j] + delta));
+        y[j] = d * (q[j] + delta);
    }
 }

@ -409,8 +411,8 @@ static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst
    const uint8_t  * q4 = x[ib].qs + 4*il;
    const float d = __half2float(x[ib].d);
    for (int j = 0; j < 4; ++j) {
-        y[j+ 0] = __float2half(d * kvalues_iq4nl[q4[j] & 0xf]);
-        y[j+16] = __float2half(d * kvalues_iq4nl[q4[j] >>  4]);
+        y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf];
+        y[j+16] = d * kvalues_iq4nl[q4[j] >>  4];
    }

 }
@ -427,8 +429,8 @@ static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst
    const uint8_t  * q4 = x[i].qs + 16*ib + 4*il;
    const float d = __half2float(x[i].d) * ((((x[i].scales_l[ib/2] >> 4*(ib%2)) & 0xf) | (((x[i].scales_h >> 2*ib) & 3) << 4)) - 32);
    for (int j = 0; j < 4; ++j) {
-        y[j+ 0] = __float2half(d * kvalues_iq4nl[q4[j] & 0xf]);
-        y[j+16] = __float2half(d * kvalues_iq4nl[q4[j] >>  4]);
+        y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf];
+        y[j+16] = d * kvalues_iq4nl[q4[j] >>  4];
    }
 }

@ -522,7 +524,8 @@ static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int k,
    dequantize_block_iq4_xs<<<nb, 32, 0, stream>>>(vx, y);
 }

-static to_fp16_cuda_t ggml_get_to_fp16_cuda(int64_t type) {
+template<typename dst_t>
+static to_cuda_ggml_t<dst_t> ggml_get_to_cuda(int64_t type) {
    switch (type) {
        case 2:
            return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
--- a/csrc/quantization/gguf/ggml-common.h
+++ b/csrc/quantization/gguf/ggml-common.h
@ -1063,7 +1063,8 @@ static const __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -
 typedef half dfloat; // dequantize float
 typedef half2 dfloat2;
 typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v);
-typedef void (*to_fp16_cuda_t)(const void * __restrict__ x, dfloat * __restrict__ y, int k, cudaStream_t stream);
+template<typename dst_t>
+using to_cuda_ggml_t = void (*)(const void * __restrict__ x, dst_t * __restrict__ y, int k, cudaStream_t stream);
 typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs);
 typedef void (*allocate_tiles_cuda_t)(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc);
 typedef void (*load_tiles_cuda_t)(
@ -1075,6 +1076,25 @@ typedef float (*vec_dot_q_mul_mat_cuda_t)(

 // Utility function

+template<typename dst_t>
+static __device__ __forceinline__ dst_t convert_from_half(half val) {
+    return val;
+}
+
+template<>
+__device__ __forceinline__ c10::BFloat16 convert_from_half<c10::BFloat16>(half val) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
+    return __float2bfloat16(__half2float(val));
+#else
+    return __half2float(val);
+#endif  // defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
+}
+
+template<>
+__device__ __forceinline__ float convert_from_half<float>(half val) {
+    return __half2float(val);
+}
+
 #if defined(USE_ROCM)

 #ifndef __has_builtin
--- a/csrc/quantization/gguf/gguf_kernel.cu
+++ b/csrc/quantization/gguf/gguf_kernel.cu
@ -71,14 +71,19 @@ static void quantize_row_q8_1_cuda(const scalar_t* x, void* vy, const int kx,
 }

 torch::Tensor ggml_dequantize(torch::Tensor W,  // quant weight
-                              int64_t type, int64_t m, int64_t n) {
+                              int64_t type, int64_t m, int64_t n,
+                              std::optional<at::ScalarType> const& dtype) {
  const at::cuda::OptionalCUDAGuard device_guard(device_of(W));
-  auto options =
-      torch::TensorOptions().dtype(torch::kFloat16).device(W.device());
+  auto dtype_ = dtype.value_or(torch::kFloat16);
+  auto options = torch::TensorOptions().dtype(dtype_).device(W.device());
  at::Tensor DW = torch::empty({m, n}, options);
  cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
-  const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(type);
-  to_fp16_cuda((void*)W.data_ptr(), (half*)DW.data_ptr(), m * n, stream);
+
+  VLLM_DISPATCH_FLOATING_TYPES(DW.scalar_type(), "ggml_dequantize", [&] {
+    auto to_cuda = ggml_get_to_cuda<scalar_t>(type);
+    to_cuda((void*)W.data_ptr(), (scalar_t*)DW.data_ptr(), m * n, stream);
+  });
+
  return DW;
 }

--- a/csrc/quantization/gptq_marlin/gptq_marlin.cu
+++ b/csrc/quantization/gptq_marlin/gptq_marlin.cu
@ -1785,7 +1785,7 @@ __global__ void Marlin(
            <<<blocks, NUM_THREADS, max_shared_mem, stream>>>(                 \
                A_ptr, B_ptr, C_ptr, C_tmp_ptr, s_ptr, zp_ptr, g_idx_ptr,      \
                num_groups, prob_m, prob_n, prob_k, lda, locks,                \
-                use_atomic_add, use_fp32_reduce);                              \
+                part_use_atomic_add, use_fp32_reduce);                         \
      }                                                                        \
    }

@ -2215,6 +2215,10 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* s,
      thread_m_blocks = exec_cfg.max_m_blocks;
    }

+    // atomic-add reduction has better performance only when m * n is small
+    bool part_use_atomic_add =
+        use_atomic_add && div_ceil(prob_m, 64) * prob_n <= 2048;
+
    if (false) {
    }
    GPTQ_CALL_IF(vllm::kU4B8, 16, 4, 256)
--- a/csrc/rocm/attention.cu
+++ b/csrc/rocm/attention.cu
@ -272,6 +272,7 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel(
    const float scale,    
    const int* __restrict__ block_tables,   // [num_seqs, max_num_blocks_per_seq]
    const int* __restrict__ context_lens,   // [num_seqs]
+    const int* __restrict__ query_start_loc_ptr,   // [num_seqs + 1]
    const int max_num_blocks_per_seq,
    const float* __restrict__ alibi_slopes, // [num_heads]
    const int q_stride,
@ -291,6 +292,13 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel(
  const int rowid = laneid / 16;

  const auto seq_idx = blockIdx.x;
+  // NOTE: queries with sequence len > 1 are prefills and are taken care of by
+  // another kernel.
+  if (query_start_loc_ptr != nullptr &&
+      (query_start_loc_ptr[seq_idx + 1] - query_start_loc_ptr[seq_idx]) != 1) {
+    return;
+  }
+
  const auto partition_idx = blockIdx.y;

  constexpr int T_PAR_SIZE = 256;  // token partition size set to 256
@ -377,9 +385,10 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel(
  // fetch Q in shared across warps and then write to registers
  const int local_qhead_idx = 4 * warpid + rowid;
  const int global_qhead_idx = wg_start_head_idx + local_qhead_idx;
-  const int64_t seq_idx64 = static_cast<int64_t>(seq_idx);
+  const int64_t query_start_off = static_cast<int64_t>(
+      query_start_loc_ptr ? query_start_loc_ptr[seq_idx] : seq_idx);
  const scalar_t* q_ptr =
-      q + seq_idx64 * q_stride + global_qhead_idx * HEAD_SIZE;
+      q + query_start_off * q_stride + global_qhead_idx * HEAD_SIZE;

  const int qhead_element = lane16id * CONTIGUOUS_SCALAR_ELEMS_16B;
  if ((local_qhead_idx < GQA_RATIO) && (qhead_element < HEAD_SIZE)) {
@ -777,6 +786,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel(
    const float scale,
    const int* __restrict__ block_tables,   // [num_seqs, max_num_blocks_per_seq]
    const int* __restrict__ context_lens,   // [num_seqs]
+    const int* __restrict__ query_start_loc_ptr,   // [num_seqs + 1]
    const int max_num_blocks_per_seq,
    const float* __restrict__ alibi_slopes, // [num_heads]
    const int q_stride,
@ -794,6 +804,12 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel(
  const int lane4id = laneid % 4;

  const auto seq_idx = blockIdx.x;
+  // NOTE: queries with sequence len > 1 are prefills and are taken care of by
+  // another kernel.
+  if (query_start_loc_ptr != nullptr &&
+      (query_start_loc_ptr[seq_idx + 1] - query_start_loc_ptr[seq_idx] != 1)) {
+    return;
+  }
  const auto partition_idx = blockIdx.y;
  const auto partition_size = blockDim.x;
  const auto max_num_partitions = gridDim.y;
@ -882,9 +898,11 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel(
    }

    // fetch q elements
-    // every 4 lanes fetch 8 elems, so warp fetches 8*16 = 128 elems
+    // every 4 lanes fetch 8 elems, so warp fetches 8*16 = 128 elems
+    const int64_t query_start_off = static_cast<int64_t>(
+        query_start_loc_ptr ? query_start_loc_ptr[seq_idx] : seq_idx);
    const scalar_t* q_ptr =
-        q + seq_idx * q_stride + wg_start_head_idx * HEAD_SIZE;
+        q + query_start_off * q_stride + wg_start_head_idx * HEAD_SIZE;
    const _B16x8* q_ptrh8 = reinterpret_cast<const _B16x8*>(q_ptr);
    const int qhead_elemh8 = laneid / 4;

@ -1267,10 +1285,19 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
    const scalar_t* __restrict__ tmp_out,  // [num_seqs, num_heads,
                                           // max_num_partitions, head_size]
    const int* __restrict__ context_lens,  // [num_seqs]
+    const int* __restrict__ query_start_loc_ptr,  // [num_seqs + 1]
    const int max_num_partitions) {
  const auto num_heads = gridDim.x;
  const auto head_idx = blockIdx.x;
  const auto seq_idx = blockIdx.y;
+
+  // NOTE: queries with sequence len > 1 are prefills and are taken care of by
+  // another kernel.
+  if (query_start_loc_ptr != nullptr &&
+      (query_start_loc_ptr[seq_idx + 1] - query_start_loc_ptr[seq_idx] != 1)) {
+    return;
+  }
+
  const int context_len = context_lens[seq_idx];
  const int num_partitions = DIVIDE_ROUND_UP(context_len, PARTITION_SIZE);
  [[maybe_unused]] constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
@ -1439,7 +1466,9 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
      __fdividef(1.0f, shared_global_exp_sum + 1e-6f);
  acc *= inv_global_exp_sum;

-  OUTT* out_ptr = out + static_cast<int64_t>(seq_idx) * num_heads * HEAD_SIZE +
+  const int64_t query_start_off = static_cast<int64_t>(
+      query_start_loc_ptr ? query_start_loc_ptr[seq_idx] : seq_idx);
+  OUTT* out_ptr = out + query_start_off * num_heads * HEAD_SIZE +
                  static_cast<int64_t>(head_idx) * HEAD_SIZE;
  if constexpr (std::is_same<OUTT, bit8_t>::value) {
    out_ptr[threadIdx.x] =
@ -1466,6 +1495,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma16_kernel(
    const float scale,
    const int* __restrict__ block_tables,    // [num_seqs, max_num_blocks_per_seq]
    const int* __restrict__ context_lens,    // [num_seqs]
+    const int* __restrict__ query_start_loc_ptr,  // [num_seqs + 1]
    const int max_num_blocks_per_seq,
    const float* __restrict__ alibi_slopes,  // [num_heads]
    const int q_stride,
@ -1492,6 +1522,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel(
    const float scale,
    const int* __restrict__ block_tables,    // [num_seqs, max_num_blocks_per_seq]
    const int* __restrict__ context_lens,    // [num_seqs]
+    const int* __restrict__ query_start_loc_ptr,  // [num_seqs + 1]
    const int max_num_blocks_per_seq,
    const float* __restrict__ alibi_slopes,  // [num_heads]
    const int q_stride,
@ -1515,6 +1546,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
    const float* __restrict__ max_logits,  // [num_seqs, num_heads, max_num_partitions]
    const scalar_t* __restrict__ tmp_out,  // [num_seqs, num_heads, max_num_partitions, head_size]
    const int* __restrict__ context_lens,  // [num_seqs]
+    const int* __restrict__ query_start_loc_ptr,  // [num_seqs + 1]
    const int max_num_partitions) {
  UNREACHABLE_CODE
 }
@ -1522,34 +1554,34 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(

 #endif  // defined(__HIP__MI300_MI250__) TODO: Add NAVI support

-#define LAUNCH_CUSTOM_ATTENTION_MFMA16(GQA_RATIO)                             \
-  paged_attention_ll4mi_QKV_mfma16_kernel<T, KVT, KV_DTYPE, OUTT, BLOCK_SIZE, \
-                                          HEAD_SIZE, NTHR, ALIBI_ENABLED,     \
-                                          GQA_RATIO>                          \
-      <<<grid, block, 0, stream>>>(                                           \
-          query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, scale,     \
-          block_tables_ptr, context_lens_ptr, max_num_blocks_per_seq,         \
-          alibi_slopes_ptr, q_stride, kv_block_stride, kv_head_stride,        \
-          exp_sums_ptr, max_logits_ptr, tmp_out_ptr, out_ptr, max_ctx_blocks, \
-          k_scale_ptr, v_scale_ptr);
+#define LAUNCH_CUSTOM_ATTENTION_MFMA16(GQA_RATIO)                              \
+  paged_attention_ll4mi_QKV_mfma16_kernel<T, KVT, KV_DTYPE, OUTT, BLOCK_SIZE,  \
+                                          HEAD_SIZE, NTHR, ALIBI_ENABLED,      \
+                                          GQA_RATIO>                           \
+      <<<grid, block, 0, stream>>>(                                            \
+          query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, scale,      \
+          block_tables_ptr, context_lens_ptr, query_start_loc_ptr,             \
+          max_num_blocks_per_seq, alibi_slopes_ptr, q_stride, kv_block_stride, \
+          kv_head_stride, exp_sums_ptr, max_logits_ptr, tmp_out_ptr, out_ptr,  \
+          max_ctx_blocks, k_scale_ptr, v_scale_ptr);

-#define LAUNCH_CUSTOM_ATTENTION_MFMA4(GQA_RATIO)                              \
-  paged_attention_ll4mi_QKV_mfma4_kernel<T, KVT, KV_DTYPE, OUTT, BLOCK_SIZE,  \
-                                         HEAD_SIZE, NTHR, ALIBI_ENABLED,      \
-                                         GQA_RATIO>                           \
-      <<<grid, block, 0, stream>>>(                                           \
-          query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, scale,     \
-          block_tables_ptr, context_lens_ptr, max_num_blocks_per_seq,         \
-          alibi_slopes_ptr, q_stride, kv_block_stride, kv_head_stride,        \
-          exp_sums_ptr, max_logits_ptr, tmp_out_ptr, out_ptr, max_ctx_blocks, \
-          k_scale_ptr, v_scale_ptr);
+#define LAUNCH_CUSTOM_ATTENTION_MFMA4(GQA_RATIO)                               \
+  paged_attention_ll4mi_QKV_mfma4_kernel<T, KVT, KV_DTYPE, OUTT, BLOCK_SIZE,   \
+                                         HEAD_SIZE, NTHR, ALIBI_ENABLED,       \
+                                         GQA_RATIO>                            \
+      <<<grid, block, 0, stream>>>(                                            \
+          query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, scale,      \
+          block_tables_ptr, context_lens_ptr, query_start_loc_ptr,             \
+          max_num_blocks_per_seq, alibi_slopes_ptr, q_stride, kv_block_stride, \
+          kv_head_stride, exp_sums_ptr, max_logits_ptr, tmp_out_ptr, out_ptr,  \
+          max_ctx_blocks, k_scale_ptr, v_scale_ptr);

 #define LAUNCH_CUSTOM_REDUCTION(NPAR_LOOPS)                          \
  paged_attention_ll4mi_reduce_kernel<T, OUTT, HEAD_SIZE, HEAD_SIZE, \
                                      PARTITION_SIZE, NPAR_LOOPS>    \
      <<<reduce_grid, reduce_block, 0, stream>>>(                    \
          out_ptr, exp_sums_ptr, max_logits_ptr, tmp_out_ptr,        \
-          context_lens_ptr, max_num_partitions);
+          context_lens_ptr, query_start_loc_ptr, max_num_partitions);

 template <typename T, typename KVT, vllm::Fp8KVCacheDataType KV_DTYPE,
          int BLOCK_SIZE, int HEAD_SIZE, typename OUTT, int PARTITION_SIZE_OLD,
@ -1559,9 +1591,10 @@ void paged_attention_custom_launcher(
    torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
    torch::Tensor& value_cache, const int num_kv_heads, float scale,
    torch::Tensor& block_tables, torch::Tensor& context_lens,
-    int max_context_len, const std::optional<torch::Tensor>& alibi_slopes,
-    torch::Tensor& k_scale, torch::Tensor& v_scale) {
-  int num_seqs = query.size(0);
+    const std::optional<torch::Tensor>& query_start_loc, int max_context_len,
+    const std::optional<torch::Tensor>& alibi_slopes, torch::Tensor& k_scale,
+    torch::Tensor& v_scale) {
+  int num_seqs = block_tables.size(0);
  int num_heads = query.size(1);
  int head_size = query.size(2);
  int max_num_blocks_per_seq = block_tables.size(1);
@ -1569,6 +1602,13 @@ void paged_attention_custom_launcher(
  int kv_block_stride = key_cache.stride(0);
  int kv_head_stride = key_cache.stride(1);

+  // NOTE: query_start_loc is optional; for V0 decode it should not be used.
+  // If batch contains mix of prefills and decode, prefills should be skipped.
+  const int* query_start_loc_ptr =
+      query_start_loc
+          ? reinterpret_cast<const int*>(query_start_loc.value().data_ptr())
+          : nullptr;
+
  // NOTE: alibi_slopes is optional.
  const float* alibi_slopes_ptr =
      alibi_slopes
@ -1700,8 +1740,8 @@ void paged_attention_custom_launcher(
  paged_attention_custom_launcher<T, KVT, KV_DTYPE, BLK_SIZE, HEAD_SIZE, T, \
                                  PSIZE, ALIBI_ENABLED>(                    \
      out, exp_sums, max_logits, tmp_out, query, key_cache, value_cache,    \
-      num_kv_heads, scale, block_tables, context_lens, max_context_len,     \
-      alibi_slopes, k_scale, v_scale);
+      num_kv_heads, scale, block_tables, context_lens, query_start_loc,     \
+      max_context_len, alibi_slopes, k_scale, v_scale);

 #define CALL_CUSTOM_LAUNCHER_ALIBI(T, KVT, KV_DTYPE, BLK_SIZE, HEAD_SIZE,      \
                                   PSIZE)                                      \
@ -1750,6 +1790,7 @@ void paged_attention(
    double scale,
    torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq]
    torch::Tensor& context_lens, // [num_seqs]
+    const std::optional<torch::Tensor>& query_start_loc, // [num_seqs]
    int64_t block_size, int64_t max_context_len,
    const std::optional<torch::Tensor>& alibi_slopes,
    const std::string& kv_cache_dtype, torch::Tensor& k_scale,
--- a/csrc/rocm/ops.h
+++ b/csrc/rocm/ops.h
@ -7,8 +7,9 @@ void paged_attention(torch::Tensor& out, torch::Tensor& exp_sums,
                     torch::Tensor& query, torch::Tensor& key_cache,
                     torch::Tensor& value_cache, int64_t num_kv_heads,
                     double scale, torch::Tensor& block_tables,
-                     torch::Tensor& context_lens, int64_t block_size,
-                     int64_t max_context_len,
+                     torch::Tensor& context_lens,
+                     const std::optional<torch::Tensor>& query_start_loc,
+                     int64_t block_size, int64_t max_context_len,
                     const std::optional<torch::Tensor>& alibi_slopes,
                     const std::string& kv_cache_dtype, torch::Tensor& k_scale,
                     torch::Tensor& v_scale);
--- a/csrc/rocm/torch_bindings.cpp
+++ b/csrc/rocm/torch_bindings.cpp
@ -23,7 +23,9 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, rocm_ops) {
      "                Tensor query, Tensor key_cache,"
      "                Tensor value_cache, int num_kv_heads,"
      "                float scale, Tensor block_tables,"
-      "                Tensor context_lens, int block_size,"
+      "                Tensor context_lens,"
+      "                Tensor? query_start_loc,"
+      "                int block_size,"
      "                int max_context_len,"
      "                Tensor? alibi_slopes,"
      "                str kv_cache_dtype,"
--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
@ -295,7 +295,9 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
 #endif

  // Dequantization for GGML.
-  ops.def("ggml_dequantize(Tensor W, int type, SymInt m, SymInt n) -> Tensor");
+  ops.def(
+      "ggml_dequantize(Tensor W, int type, SymInt m, SymInt n, ScalarType? "
+      "dtype) -> Tensor");
  ops.impl("ggml_dequantize", torch::kCUDA, &ggml_dequantize);

  // mmvq kernel for GGML.
@ -614,12 +616,11 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cuda_utils), cuda_utils) {
                  &get_max_shared_memory_per_block_device_attribute);
 }

-#ifndef USE_ROCM
 TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _custom_ar), custom_ar) {
  // Custom all-reduce kernels
  custom_ar.def(
      "init_custom_ar(int[] ipc_tensors, Tensor rank_data, "
-      "int rank, bool full_nvlink) -> int");
+      "int rank, bool fully_connected) -> int");
  custom_ar.impl("init_custom_ar", torch::kCUDA, &init_custom_ar);
  custom_ar.def(
      "all_reduce(int fa, Tensor inp, Tensor! out, int reg_buffer, "
@ -632,7 +633,13 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _custom_ar), custom_ar) {
  custom_ar.def("register_buffer", &register_buffer);
  custom_ar.def("get_graph_buffer_ipc_meta", &get_graph_buffer_ipc_meta);
  custom_ar.def("register_graph_buffers", &register_graph_buffers);
+
+  custom_ar.def("allocate_shared_buffer_and_handle",
+                &allocate_shared_buffer_and_handle);
+  custom_ar.def("open_mem_handle(Tensor mem_handle) -> int", &open_mem_handle);
+  custom_ar.impl("open_mem_handle", torch::kCPU, &open_mem_handle);
+
+  custom_ar.def("free_shared_buffer", &free_shared_buffer);
 }
-#endif

 REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
--- a/docker/Dockerfile.arm
+++ b/docker/Dockerfile.arm
--- a/docker/Dockerfile.cpu
+++ b/docker/Dockerfile.cpu
--- a/docker/Dockerfile.hpu
+++ b/docker/Dockerfile.hpu
--- a/docker/Dockerfile.neuron
+++ b/docker/Dockerfile.neuron
--- a/docker/Dockerfile.ppc64le
+++ b/docker/Dockerfile.ppc64le
@ -38,7 +38,7 @@ RUN microdnf install -y openssl-devel dnf \
    && ln -sf /usr/lib64/libatomic.so.1 /usr/lib64/libatomic.so \
    && python${PYTHON_VERSION} -m venv ${VIRTUAL_ENV} \
    && python -m pip install -U pip uv \
-    && uv pip install wheel build "setuptools<70" setuptools_scm setuptools_rust meson-python cmake ninja cython scikit_build_core scikit_build \
+    && uv pip install wheel build "setuptools<70" setuptools_scm setuptools_rust meson-python 'cmake<4' ninja cython scikit_build_core scikit_build \
    && curl -sL https://ftp2.osuosl.org/pub/ppc64el/openblas/latest/Openblas_${OPENBLAS_VERSION}_ppc64le.tar.gz | tar xvf - -C /usr/local \
    && curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y \
    && cd /tmp && touch control
@ -238,7 +238,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
    && python -m pip install -U pip uv --no-cache \
    && curl -sL https://ftp2.osuosl.org/pub/ppc64el/openblas/latest/Openblas_${OPENBLAS_VERSION}_ppc64le.tar.gz | tar xvf - -C /usr/local \
    && make -C /numactl install \
-    && uv pip install cmake \
+    && uv pip install 'cmake<4' \
    && cmake --install /lapack/build \
    && uv pip uninstall cmake

--- a/docker/Dockerfile.rocm
+++ b/docker/Dockerfile.rocm
--- a/docker/Dockerfile.rocm_base
+++ b/docker/Dockerfile.rocm_base
@ -1,18 +1,18 @@
 ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:6.3.1-complete
-ARG HIPBLASLT_BRANCH="4d40e36"
+ARG HIPBLASLT_BRANCH="db8e93b4"
 ARG HIPBLAS_COMMON_BRANCH="7c1566b"
 ARG LEGACY_HIPBLASLT_OPTION=
 ARG RCCL_BRANCH="648a58d"
 ARG RCCL_REPO="https://github.com/ROCm/rccl"
 ARG TRITON_BRANCH="e5be006"
 ARG TRITON_REPO="https://github.com/triton-lang/triton.git"
-ARG PYTORCH_BRANCH="3a585126"
-ARG PYTORCH_VISION_BRANCH="v0.19.1"
+ARG PYTORCH_BRANCH="295f2ed4"
+ARG PYTORCH_VISION_BRANCH="v0.21.0"
 ARG PYTORCH_REPO="https://github.com/pytorch/pytorch.git"
 ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
-ARG FA_BRANCH="b7d29fb"
-ARG FA_REPO="https://github.com/ROCm/flash-attention.git"
-ARG AITER_BRANCH="21d47a9"
+ARG FA_BRANCH="1a7f4dfa"
+ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git"
+ARG AITER_BRANCH="8970b25b"
 ARG AITER_REPO="https://github.com/ROCm/aiter.git"

 FROM ${BASE_IMAGE} AS base
@ -20,7 +20,7 @@ FROM ${BASE_IMAGE} AS base
 ENV PATH=/opt/rocm/llvm/bin:$PATH
 ENV ROCM_PATH=/opt/rocm
 ENV LD_LIBRARY_PATH=/opt/rocm/lib:/usr/local/lib:
-ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942
+ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942;gfx1100;gfx1101;gfx1200;gfx1201
 ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}

 ARG PYTHON_VERSION=3.12
@ -31,7 +31,7 @@ ENV DEBIAN_FRONTEND=noninteractive

 # Install Python and other dependencies
 RUN apt-get update -y \
-    && apt-get install -y software-properties-common git curl sudo vim less \
+    && apt-get install -y software-properties-common git curl sudo vim less libgfortran5 \
    && add-apt-repository ppa:deadsnakes/ppa \
    && apt-get update -y \
    && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
@ -42,7 +42,7 @@ RUN apt-get update -y \
    && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
    && python3 --version && python3 -m pip --version

-RUN pip install -U packaging cmake ninja wheel setuptools pybind11 Cython
+RUN pip install -U packaging 'cmake<4' ninja wheel setuptools pybind11 Cython

 FROM base AS build_hipblaslt
 ARG HIPBLASLT_BRANCH
@ -60,7 +60,8 @@ RUN cd hipBLAS-common \
 RUN git clone https://github.com/ROCm/hipBLASLt
 RUN cd hipBLASLt \
    && git checkout ${HIPBLASLT_BRANCH} \
-    && ./install.sh -d --architecture ${PYTORCH_ROCM_ARCH} ${LEGACY_HIPBLASLT_OPTION} \
+    && apt-get install -y llvm-dev \
+    && ./install.sh -dc --architecture ${PYTORCH_ROCM_ARCH} ${LEGACY_HIPBLASLT_OPTION} \
    && cd build/release \
    && make package
 RUN mkdir -p /app/install && cp /app/hipBLASLt/build/release/*.deb /app/hipBLAS-common/build/*.deb /app/install
@ -110,11 +111,24 @@ RUN git clone ${FA_REPO}
 RUN cd flash-attention \
    && git checkout ${FA_BRANCH} \
    && git submodule update --init \
-    && MAX_JOBS=64 GPU_ARCHS=${PYTORCH_ROCM_ARCH} python3 setup.py bdist_wheel --dist-dir=dist
+    && GPU_ARCHS=$(echo ${PYTORCH_ROCM_ARCH} | sed -e 's/;gfx1[0-9]\{3\}//g') python3 setup.py bdist_wheel --dist-dir=dist
 RUN mkdir -p /app/install && cp /app/pytorch/dist/*.whl /app/install \
    && cp /app/vision/dist/*.whl /app/install \
    && cp /app/flash-attention/dist/*.whl /app/install

+FROM base AS build_aiter
+ARG AITER_BRANCH
+ARG AITER_REPO
+RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
+    pip install /install/*.whl
+RUN git clone --recursive ${AITER_REPO}
+RUN cd aiter \
+    && git checkout ${AITER_BRANCH} \
+    && git submodule update --init --recursive \
+    && pip install -r requirements.txt
+RUN pip install pyyaml && cd aiter && PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py bdist_wheel --dist-dir=dist && ls /app/aiter/dist/*.whl
+RUN mkdir -p /app/install && cp /app/aiter/dist/*.whl /app/install
+
 FROM base AS final
 RUN --mount=type=bind,from=build_hipblaslt,src=/app/install/,target=/install \
    dpkg -i /install/*deb \
@ -130,19 +144,12 @@ RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \
    pip install /install/*.whl
 RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
    pip install /install/*.whl
-
-ARG AITER_REPO
-ARG AITER_BRANCH
-RUN git clone --recursive ${AITER_REPO}
-RUN cd aiter \
-    && git checkout ${AITER_BRANCH} \
-    && git submodule update --init --recursive \
-    && pip install -r requirements.txt \
-    && PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py develop && pip show aiter 
+RUN --mount=type=bind,from=build_aiter,src=/app/install/,target=/install \
+    pip install /install/*.whl

 ARG BASE_IMAGE
-ARG HIPBLASLT_BRANCH
 ARG HIPBLAS_COMMON_BRANCH
+ARG HIPBLASLT_BRANCH
 ARG LEGACY_HIPBLASLT_OPTION
 ARG RCCL_BRANCH
 ARG RCCL_REPO
@ -154,6 +161,8 @@ ARG PYTORCH_REPO
 ARG PYTORCH_VISION_REPO
 ARG FA_BRANCH
 ARG FA_REPO
+ARG AITER_BRANCH
+ARG AITER_REPO
 RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
    && echo "HIPBLAS_COMMON_BRANCH: ${HIPBLAS_COMMON_BRANCH}" >> /app/versions.txt \
    && echo "HIPBLASLT_BRANCH: ${HIPBLASLT_BRANCH}" >> /app/versions.txt \
@ -167,6 +176,5 @@ RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
    && echo "PYTORCH_REPO: ${PYTORCH_REPO}" >> /app/versions.txt \
    && echo "PYTORCH_VISION_REPO: ${PYTORCH_VISION_REPO}" >> /app/versions.txt \
    && echo "FA_BRANCH: ${FA_BRANCH}" >> /app/versions.txt \
-    && echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt \
    && echo "AITER_BRANCH: ${AITER_BRANCH}" >> /app/versions.txt \
    && echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt
--- a/docker/Dockerfile.s390x
+++ b/docker/Dockerfile.s390x
--- a/docker/Dockerfile.tpu
+++ b/docker/Dockerfile.tpu
--- a/docker/Dockerfile.xpu
+++ b/docker/Dockerfile.xpu
--- a/docs/source/community/meetups.md
+++ b/docs/source/community/meetups.md
@ -4,6 +4,8 @@

 We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below:

+- [vLLM x Ollama Inference Night](https://lu.ma/vllm-ollama), March 27th 2025. [[Slides]](https://docs.google.com/presentation/d/16T2PDD1YwRnZ4Tu8Q5r6n53c5Lr5c73UV9Vd2_eBo4U/edit?usp=sharing).
+- [The first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg), March 16th 2025. [[Slides]](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit?usp=sharing).
 - [The East Coast vLLM Meetup](https://lu.ma/7mu4k4xx), March 11th 2025. [[Slides]](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0)
 - [The ninth vLLM meetup](https://lu.ma/h7g3kuj9), with Meta, February 27th 2025. [[Slides]](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing)
 - [The eighth vLLM meetup](https://lu.ma/zep56hui), with Google Cloud, January 22nd 2025. [[Slides]](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing)
--- a/docs/source/community/sponsors.md
+++ b/docs/source/community/sponsors.md
@ -22,6 +22,7 @@ Compute Resources:
 - Databricks
 - DeepInfra
 - Google Cloud
+- Intel
 - Lambda Lab
 - Nebius
 - Novita AI
--- a/docs/source/contributing/dockerfile/dockerfile.md
+++ b/docs/source/contributing/dockerfile/dockerfile.md
@ -1,6 +1,6 @@
 # Dockerfile

-We provide a <gh-file:Dockerfile> to construct the image for running an OpenAI compatible server with vLLM.
+We provide a <gh-file:docker/Dockerfile> to construct the image for running an OpenAI compatible server with vLLM.
 More information about deploying with Docker can be found [here](#deployment-docker).

 Below is a visual representation of the multi-stage Dockerfile. The build graph contains the following nodes:
@ -28,7 +28,7 @@ The edges of the build graph represent:
  > Commands to regenerate the build graph (make sure to run it **from the \`root\` directory of the vLLM repository** where the dockerfile is present):
  >
  > ```bash
-  > dockerfilegraph -o png --legend --dpi 200 --max-label-length 50 --filename Dockerfile
+  > dockerfilegraph -o png --legend --dpi 200 --max-label-length 50 --filename docker/Dockerfile
  > ```
  >
  > or in case you want to run it directly with the docker image:
@ -43,7 +43,7 @@ The edges of the build graph represent:
  >    --output png \
  >    --dpi 200 \
  >    --max-label-length 50 \
-  >    --filename Dockerfile \
+  >    --filename docker/Dockerfile \
  >    --legend
  > ```
  >
--- a/docs/source/contributing/overview.md
+++ b/docs/source/contributing/overview.md
@ -45,7 +45,7 @@ pytest tests/
 ```

 :::{tip}
-Since the <gh-file:Dockerfile> ships with Python 3.12, all tests in CI (except `mypy`) are run with Python 3.12.
+Since the <gh-file:docker/Dockerfile> ships with Python 3.12, all tests in CI (except `mypy`) are run with Python 3.12.

 Therefore, we recommend developing with Python 3.12 to minimise the chance of your local environment clashing with our CI environment.
 :::
--- a/docs/source/deployment/docker.md
+++ b/docs/source/deployment/docker.md
@ -34,11 +34,11 @@ If you need to use those dependencies (having accepted the license terms),
 create a custom Dockerfile on top of the base image with an extra layer that installs them:

 ```Dockerfile
-FROM vllm/vllm-openai:v0.8.2
+FROM vllm/vllm-openai:v0.8.3

-# e.g. install the `audio` and `video` optional dependencies
+# e.g. install the `audio` optional dependencies
 # NOTE: Make sure the version of vLLM matches the base image!
-RUN uv pip install --system vllm[audio,video]==0.8.2
+RUN uv pip install --system vllm[audio]==0.8.3
 ```

 :::
@ -61,11 +61,11 @@ RUN uv pip install --system git+https://github.com/huggingface/transformers.git

 ## Building vLLM's Docker Image from Source

-You can build and run vLLM from source via the provided <gh-file:Dockerfile>. To build vLLM:
+You can build and run vLLM from source via the provided <gh-file:docker/Dockerfile>. To build vLLM:

 ```console
 # optionally specifies: --build-arg max_jobs=8 --build-arg nvcc_threads=2
-DOCKER_BUILDKIT=1 docker build . --target vllm-openai --tag vllm/vllm-openai
+DOCKER_BUILDKIT=1 docker build . --target vllm-openai --tag vllm/vllm-openai --file docker/Dockerfile
 ```

 :::{note}
@ -92,6 +92,7 @@ Keep an eye on memory usage with parallel jobs as it can be substantial (see exa
 # Example of building on Nvidia GH200 server. (Memory usage: ~15GB, Build time: ~1475s / ~25 min, Image size: 6.93GB)
 $ python3 use_existing_torch.py
 $ DOCKER_BUILDKIT=1 docker build . \
+  --file docker/Dockerfile \
  --target vllm-openai \
  --platform "linux/arm64" \
  -t vllm/vllm-gh200-openai:latest \
--- a/docs/source/deployment/k8s.md
+++ b/docs/source/deployment/k8s.md
@ -46,6 +46,7 @@ metadata:
 type: Opaque
 data:
  token: $(HF_TOKEN)
+EOF
 ```

 Next, start the vLLM server as a Kubernetes Deployment and Service:
--- a/docs/source/deployment/nginx.md
+++ b/docs/source/deployment/nginx.md
@ -69,14 +69,14 @@ server {

 ```console
 cd $vllm_root
-docker build -f Dockerfile . --tag vllm
+docker build -f docker/Dockerfile . --tag vllm
 ```

 If you are behind proxy, you can pass the proxy settings to the docker build command as shown below:

 ```console
 cd $vllm_root
-docker build -f Dockerfile . --tag vllm --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy
+docker build -f docker/Dockerfile . --tag vllm --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy
 ```

 (nginxloadbalancer-nginx-docker-network)=
--- a/docs/source/design/mm_processing.md
+++ b/docs/source/design/mm_processing.md
@ -8,7 +8,7 @@ Here are the main features of {class}`~vllm.multimodal.processing.BaseMultiModal

 ## Prompt Update Detection

-One of the main responsibilies of HF processor is to update the prompt with placeholder tokens. For example:
+One of the main responsibilities of HF processor is to update the prompt with placeholder tokens. For example:

 - Insert feature placeholder tokens (e.g. `<image><image>...<image>`, the number of which equals to the feature size) at the start of the string.
 - Replace existing input placeholder tokens (e.g. `<image>` for a single image) with feature placeholder tokens (e.g. `<image><image>...<image>`, the number of which equals to the feature size).
--- a/docs/source/design/v1/torch_compile.md
+++ b/docs/source/design/v1/torch_compile.md
@ -126,7 +126,7 @@ Unfortunately, because auto-tuning takes quite a long time (from seconds to minu

 ## Cudagraph Capture

-vLLM's V1 architecture uses piecewise cudagraph. The full computation graph is split as mentioned above, and we only capture the cudagraph for the piece of graph between attention operations (including the first graph before any attention operation, and the last graph after all the attention operation). This is based on a common observation: computation between attentions are usually token-wise and easy to deal with for cudagraph; while the attention operation is non-trival to be cudagraph compatible. Thus, by running the attention operation in eager mode while the rest operations in cudagraph, we keep the flexibility of the attention operation.
+vLLM's V1 architecture uses piecewise cudagraph. The full computation graph is split as mentioned above, and we only capture the cudagraph for the piece of graph between attention operations (including the first graph before any attention operation, and the last graph after all the attention operation). This is based on a common observation: computation between attentions are usually token-wise and easy to deal with for cudagraph; while the attention operation is non-trivial to be cudagraph compatible. Thus, by running the attention operation in eager mode while the rest operations in cudagraph, we keep the flexibility of the attention operation.

 The piecewise cudagraph also has fine-grained memory management. The purpose is to only exclude the attention kernel from cudagraph, while keeping all the rest modules and the memory allocation operations in the cudagraph. This is why the attention operation in V1 has the output tensor as the input of the attention.

--- a/docs/source/features/quantization/bnb.md
+++ b/docs/source/features/quantization/bnb.md
@ -19,17 +19,20 @@ And usually, these repositories have a config.json file that includes a quantiza

 ## Read quantized checkpoint

+For pre-quantized checkpoints, vLLM will try to infer the quantization method from the config file, so you don't need to explicitly specify the quantization argument.
+
 ```python
 from vllm import LLM
 import torch
 # unsloth/tinyllama-bnb-4bit is a pre-quantized checkpoint.
 model_id = "unsloth/tinyllama-bnb-4bit"
-llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, \
-quantization="bitsandbytes")
+llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True)
 ```

 ## Inflight quantization: load as 4bit quantization

+For inflight 4bit quantization with BitsAndBytes, you need to explicitly specify the quantization argument.
+
 ```python
 from vllm import LLM
 import torch
@ -40,7 +43,7 @@ quantization="bitsandbytes")

 ## OpenAI Compatible Server

-Append the following to your 4bit model arguments:
+Append the following to your model arguments for 4bit inflight quantization:

 ```console
 --quantization bitsandbytes
--- a/docs/source/features/quantization/gguf.md
+++ b/docs/source/features/quantization/gguf.md
@ -29,7 +29,7 @@ vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf --tokenizer TinyLlama/TinyLlam
 We recommend using the tokenizer from base model instead of GGUF model. Because the tokenizer conversion from GGUF is time-consuming and unstable, especially for some models with large vocab size.
 :::

-GGUF assumes that huggingface can convert the metadata to a config file. In case huggingface doesn't support your model you can manually create a config and pass it as hf-confing-path
+GGUF assumes that huggingface can convert the metadata to a config file. In case huggingface doesn't support your model you can manually create a config and pass it as hf-config-path

 ```console
 # If you model is not supported by huggingface you can manually provide a huggingface compatible config path
--- a/docs/source/features/quantization/index.md
+++ b/docs/source/features/quantization/index.md
@ -16,5 +16,6 @@ gptqmodel
 int4
 int8
 fp8
+quark
 quantized_kvcache
 :::
--- a/docs/source/features/quantization/quark.md
+++ b/docs/source/features/quantization/quark.md
@ -0,0 +1,217 @@
+(quark)=
+
+# AMD QUARK
+
+Quantization can effectively reduce memory and bandwidth usage, accelerate computation and improve
+throughput with minimal accuracy loss. vLLM can leverage [Quark](https://quark.docs.amd.com/latest/),
+the flexible and powerful quantization toolkit, to produce performant quantized models to run on AMD GPUs. Quark has specialized support for quantizing large language models with weight,
+activation and kv-cache quantization and cutting-edge quantization algorithms like
+AWQ, GPTQ, Rotation and SmoothQuant.
+
+## Quark Installation
+
+Before quantizing models, you need to install Quark. The latest release of Quark can be installed with pip:
+
+```console
+pip install amd-quark
+```
+
+You can refer to [Quark installation guide](https://quark.docs.amd.com/latest/install.html)
+for more installation details.
+
+## Quantization Process
+
+After installing Quark, we will use an example to illustrate how to use Quark.  
+The Quark quantization process consists of the following 5 steps:
+
+1. Load the model
+2. Prepare the calibration dataloader
+3. Set the quantization configuration
+4. Quantize the model and export
+5. Evaluation in vLLM
+
+### 1. Load the Model
+
+Quark uses [Transformers](https://huggingface.co/docs/transformers/en/index)
+to fetch model and tokenizer.
+
+```python
+from transformers import AutoTokenizer, AutoModelForCausalLM
+
+MODEL_ID = "meta-llama/Llama-2-70b-chat-hf"
+MAX_SEQ_LEN = 512
+
+model = AutoModelForCausalLM.from_pretrained(
+    MODEL_ID, device_map="auto", torch_dtype="auto",
+)
+model.eval()
+
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, model_max_length=MAX_SEQ_LEN)
+tokenizer.pad_token = tokenizer.eos_token
+```
+
+### 2. Prepare the Calibration Dataloader
+
+Quark uses the [PyTorch Dataloader](https://pytorch.org/tutorials/beginner/basics/data_tutorial.html)
+to load calibration data. For more details about how to use calibration datasets efficiently, please refer
+to [Adding Calibration Datasets](https://quark.docs.amd.com/latest/pytorch/calibration_datasets.html).
+
+```python
+from datasets import load_dataset
+from torch.utils.data import DataLoader
+
+BATCH_SIZE = 1
+NUM_CALIBRATION_DATA = 512
+
+# Load the dataset and get calibration data.
+dataset = load_dataset("mit-han-lab/pile-val-backup", split="validation")
+text_data = dataset["text"][:NUM_CALIBRATION_DATA]
+
+tokenized_outputs = tokenizer(text_data, return_tensors="pt",
+    padding=True, truncation=True, max_length=MAX_SEQ_LEN)
+calib_dataloader = DataLoader(tokenized_outputs['input_ids'],
+    batch_size=BATCH_SIZE, drop_last=True)
+```
+
+### 3. Set the Quantization Configuration
+
+We need to set the quantization configuration; you can check
+[quark config guide](https://quark.docs.amd.com/latest/pytorch/user_guide_config_description.html)
+for further details. Here we use FP8 per-tensor quantization on weight, activation,
+kv-cache and the quantization algorithm is AutoSmoothQuant.
+
+:::{note}
+Note the quantization algorithm needs a JSON config file and the config file is located in
+[Quark Pytorch examples](https://quark.docs.amd.com/latest/pytorch/pytorch_examples.html),
+under the directory `examples/torch/language_modeling/llm_ptq/models`. For example,
+AutoSmoothQuant config file for Llama is
+`examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json`.
+:::
+
+```python
+from quark.torch.quantization import (Config, QuantizationConfig,
+                                     FP8E4M3PerTensorSpec,
+                                     load_quant_algo_config_from_file)
+
+# Define fp8/per-tensor/static spec.
+FP8_PER_TENSOR_SPEC = FP8E4M3PerTensorSpec(observer_method="min_max",
+    is_dynamic=False).to_quantization_spec()
+
+# Define global quantization config, input tensors and weight apply FP8_PER_TENSOR_SPEC.
+global_quant_config = QuantizationConfig(input_tensors=FP8_PER_TENSOR_SPEC,
+    weight=FP8_PER_TENSOR_SPEC)
+
+# Define quantization config for kv-cache layers, output tensors apply FP8_PER_TENSOR_SPEC.
+KV_CACHE_SPEC = FP8_PER_TENSOR_SPEC
+kv_cache_layer_names_for_llama = ["*k_proj", "*v_proj"]
+kv_cache_quant_config = {name :
+    QuantizationConfig(input_tensors=global_quant_config.input_tensors,
+                       weight=global_quant_config.weight,
+                       output_tensors=KV_CACHE_SPEC)
+    for name in kv_cache_layer_names_for_llama}
+layer_quant_config = kv_cache_quant_config.copy()
+
+# Define algorithm config by config file.
+LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE = (
+    'examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json')
+algo_config = load_quant_algo_config_from_file(LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE)
+
+EXCLUDE_LAYERS = ["lm_head"]
+quant_config = Config(
+    global_quant_config=global_quant_config,
+    layer_quant_config=layer_quant_config,
+    kv_cache_quant_config=kv_cache_quant_config,
+    exclude=EXCLUDE_LAYERS,
+    algo_config=algo_config)
+```
+
+### 4. Quantize the Model and Export
+
+Then we can apply the quantization. After quantizing, we need to freeze the
+quantized model first before exporting. Note that we need to export the model in the
+HuggingFace `safetensors` format; you can refer to
+[HuggingFace format exporting](https://quark.docs.amd.com/latest/pytorch/export/quark_export_hf.html)
+for more exporting format details.
+
+```python
+import torch
+from quark.torch import ModelQuantizer, ModelExporter
+from quark.torch.export import ExporterConfig, JsonExporterConfig
+
+# Apply quantization.
+quantizer = ModelQuantizer(quant_config)
+quant_model = quantizer.quantize_model(model, calib_dataloader)
+
+# Freeze quantized model to export.
+freezed_model = quantizer.freeze(model)
+
+# Define export config.
+LLAMA_KV_CACHE_GROUP = ["*k_proj", "*v_proj"]
+export_config = ExporterConfig(json_export_config=JsonExporterConfig())
+export_config.json_export_config.kv_cache_group = LLAMA_KV_CACHE_GROUP
+
+EXPORT_DIR = MODEL_ID.split("/")[1] + "-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant"
+exporter = ModelExporter(config=export_config, export_dir=EXPORT_DIR)
+with torch.no_grad():
+    exporter.export_safetensors_model(freezed_model,
+        quant_config=quant_config, tokenizer=tokenizer)
+```
+
+### 5. Evaluation in vLLM
+
+Now, you can load and run the Quark quantized model directly through the LLM entrypoint:
+
+```python
+from vllm import LLM, SamplingParams
+
+# Sample prompts.
+prompts = [
+    "Hello, my name is",
+    "The president of the United States is",
+    "The capital of France is",
+    "The future of AI is",
+]
+# Create a sampling params object.
+sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+
+# Create an LLM.
+llm = LLM(model="Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant",
+          kv_cache_dtype='fp8',quantization='quark')
+# Generate texts from the prompts. The output is a list of RequestOutput objects
+# that contain the prompt, generated text, and other information.
+outputs = llm.generate(prompts, sampling_params)
+# Print the outputs.
+print("\nGenerated Outputs:\n" + "-" * 60)
+for output in outputs:
+    prompt = output.prompt
+    generated_text = output.outputs[0].text
+    print(f"Prompt:    {prompt!r}")
+    print(f"Output:    {generated_text!r}")
+    print("-" * 60)
+```
+
+Or, you can use `lm_eval` to evaluate accuracy:
+
+```console
+$ lm_eval --model vllm \
+  --model_args pretrained=Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant,kv_cache_dtype='fp8',quantization='quark' \
+  --tasks gsm8k
+```
+
+## Quark Quantization Script
+In addition to the Python API example above, Quark also offers a
+[quantization script](https://quark.docs.amd.com/latest/pytorch/example_quark_torch_llm_ptq.html)
+to quantize large language models more conveniently. It supports quantizing models with a variety
+of different quantization schemes and optimization algorithms. It can export the quantized model
+and run evaluation tasks on the fly. With the script, the example above can be:
+
+```console
+python3 quantize_quark.py --model_dir meta-llama/Llama-2-70b-chat-hf \
+                          --output_dir /path/to/output \
+                          --quant_scheme w_fp8_a_fp8 \
+                          --kv_cache_dtype fp8 \
+                          --quant_algo autosmoothquant \
+                          --num_calib_data 512 \
+                          --model_export hf_format \
+                          --tasks gsm8k
+```
--- a/docs/source/features/tool_calling.md
+++ b/docs/source/features/tool_calling.md
@ -1,6 +1,6 @@
 # Tool Calling

-vLLM currently supports named function calling, as well as the `auto` and `none` options for the `tool_choice` field in the chat completion API. The `tool_choice` option `required` is **not yet supported** but [on the roadmap](gh-issue:13002).
+vLLM currently supports named function calling, as well as the `auto`, `required` (as of `vllm>=0.8.3`) and `none` options for the `tool_choice` field in the chat completion API.

 ## Quickstart

@ -91,6 +91,12 @@ For best results, we recommend ensuring that the expected output format / schema
 To use a named function, you need to define the functions in the `tools` parameter of the chat completion request, and
 specify the `name` of one of the tools in the `tool_choice` parameter of the chat completion request.

+## Required Function Calling
+
+vLLM supports the `tool_choice='required'` option in the chat completion API. Similar to the named function calling, it also uses guided decoding, so this is enabled by default and will work with any supported model. The required guided decoding features (JSON schema with `anyOf`) are currently only supported in the V0 engine with the guided decoding backend `outlines`. However, support for alternative decoding backends is on the [roadmap](https://docs.vllm.ai/en/latest/getting_started/v1_user_guide.html#feature-model) for the V1 engine.
+
+When `tool_choice='required'` is set, the model is guaranteed to generate one or more tool calls based on the specified tool list in the `tools` parameter. The number of tool calls depends on the user's query. The output format strictly follows the schema defined in the `tools` parameter.
+
 ## Automatic Function Calling

 To enable this feature, you should set the following flags:
--- a/docs/source/generate_examples.py
+++ b/docs/source/generate_examples.py
@ -17,6 +17,7 @@ def fix_case(text: str) -> str:
        "cli": "CLI",
        "cpu": "CPU",
        "llm": "LLM",
+        "mae": "MAE",
        "tpu": "TPU",
        "aqlm": "AQLM",
        "gguf": "GGUF",
@ -24,6 +25,7 @@ def fix_case(text: str) -> str:
        "rlhf": "RLHF",
        "vllm": "vLLM",
        "openai": "OpenAI",
+        "lmcache": "LMCache",
        "multilora": "MultiLoRA",
        "mlpspeculator": "MLPSpeculator",
        r"fp\d+": lambda x: x.group(0).upper(),  # e.g. fp16, fp32
--- a/docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md
+++ b/docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md
@ -86,7 +86,7 @@ Currently, there are no pre-built Intel Gaudi images.
 ### Build image from source

 ```console
-docker build -f Dockerfile.hpu -t vllm-hpu-env  .
+docker build -f docker/Dockerfile.hpu -t vllm-hpu-env  .
 docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --rm vllm-hpu-env
 ```

--- a/docs/source/getting_started/installation/ai_accelerator/neuron.inc.md
+++ b/docs/source/getting_started/installation/ai_accelerator/neuron.inc.md
@ -132,7 +132,7 @@ Currently, there are no pre-built Neuron images.

 See <project:#deployment-docker-build-image-from-source> for instructions on building the Docker image.

-Make sure to use <gh-file:Dockerfile.neuron> in place of the default Dockerfile.
+Make sure to use <gh-file:docker/Dockerfile.neuron> in place of the default Dockerfile.

 ## Extra information

--- a/docs/source/getting_started/installation/ai_accelerator/tpu.inc.md
+++ b/docs/source/getting_started/installation/ai_accelerator/tpu.inc.md
@ -169,10 +169,10 @@ See <project:#deployment-docker-pre-built-image> for instructions on using the o

 ### Build image from source

-You can use <gh-file:Dockerfile.tpu> to build a Docker image with TPU support.
+You can use <gh-file:docker/Dockerfile.tpu> to build a Docker image with TPU support.

 ```console
-docker build -f Dockerfile.tpu -t vllm-tpu .
+docker build -f docker/Dockerfile.tpu -t vllm-tpu .
 ```

 Run the Docker image with the following command:
--- a/docs/source/getting_started/installation/cpu.md
+++ b/docs/source/getting_started/installation/cpu.md
@ -177,7 +177,7 @@ Currently, there are no pre-built CPU wheels.
 ### Build image from source

 ```console
-$ docker build -f Dockerfile.cpu --tag vllm-cpu-env --target vllm-openai .
+$ docker build -f docker/Dockerfile.cpu --tag vllm-cpu-env --target vllm-openai .

 # Launching OpenAI server 
 $ docker run --rm \
@ -193,11 +193,11 @@ $ docker run --rm \
 ```

 ::::{tip}
-For ARM or Apple silicon, use `Dockerfile.arm`
+For ARM or Apple silicon, use `docker/Dockerfile.arm`
 ::::

 ::::{tip}
-For IBM Z (s390x), use `Dockerfile.s390x` and in `docker run` use flag `--dtype float`
+For IBM Z (s390x), use `docker/Dockerfile.s390x` and in `docker run` use flag `--dtype float`
 ::::

 ## Supported features
@ -272,12 +272,14 @@ $ python examples/offline_inference/basic/basic.py

 - Decouple the HTTP serving components from the inference components. In a GPU backend configuration, the HTTP serving and tokenization tasks operate on the CPU, while inference runs on the GPU, which typically does not pose a problem. However, in a CPU-based setup, the HTTP serving and tokenization can cause significant context switching and reduced cache efficiency. Therefore, it is strongly recommended to segregate these two components for improved performance.

- On CPU based setup with NUMA enabled, the memory access performance may be largely impacted by the [topology](https://github.com/intel/intel-extension-for-pytorch/blob/main/docs/tutorials/performance_tuning/tuning_guide.inc.md#non-uniform-memory-access-numa). For NUMA architecture, two optimizations are to recommended: Tensor Parallel or Data Parallel.
+- On CPU based setup with NUMA enabled, the memory access performance may be largely impacted by the [topology](https://github.com/intel/intel-extension-for-pytorch/blob/main/docs/tutorials/performance_tuning/tuning_guide.inc.md#non-uniform-memory-access-numa). For NUMA architecture, Tensor Parallel is an option for better performance.

-  - Using Tensor Parallel for a latency constraints deployment: following GPU backend design, a Megatron-LM's parallel algorithm will be used to shard the model, based on the number of NUMA nodes (e.g. TP = 2 for a two NUMA node system). With [TP feature on CPU](gh-pr:6125) merged, Tensor Parallel is supported for serving and offline inferencing. In general each NUMA node is treated as one GPU card. Below is the example script to enable Tensor Parallel = 2 for serving:
+  - Tensor Parallel is supported for serving and offline inferencing. In general each NUMA node is treated as one GPU card. Below is the example script to enable Tensor Parallel = 2 for serving:

    ```console
    VLLM_CPU_KVCACHE_SPACE=40 VLLM_CPU_OMP_THREADS_BIND="0-31|32-63" vllm serve meta-llama/Llama-2-7b-chat-hf -tp=2 --distributed-executor-backend mp
    ```

-  - Using Data Parallel for maximum throughput: to launch an LLM serving endpoint on each NUMA node along with one additional load balancer to dispatch the requests to those endpoints. Common solutions like [Nginx](#nginxloadbalancer) or HAProxy are recommended. Anyscale Ray project provides the feature on LLM [serving](https://docs.ray.io/en/latest/serve/index.html). Here is the example to setup a scalable LLM serving with [Ray Serve](https://github.com/intel/llm-on-ray/blob/main/docs/setup.inc.md).
+  - For each thread id list in `VLLM_CPU_OMP_THREADS_BIND`, users should guarantee threads in the list belong to the same NUMA node.
+
+  - Meanwhile, users should also take care of the memory capacity of each NUMA node. The memory usage of each TP rank is the sum of `weight shard size` and `VLLM_CPU_KVCACHE_SPACE`; if it exceeds the capacity of a single NUMA node, the TP worker will be killed due to out-of-memory.
--- a/docs/source/getting_started/installation/gpu/rocm.inc.md
+++ b/docs/source/getting_started/installation/gpu/rocm.inc.md
@ -31,7 +31,7 @@ Currently, there are no pre-built ROCm wheels.
    ```console
    # Install PyTorch
    $ pip uninstall torch -y
-    $ pip install --no-cache-dir --pre torch --index-url https://download.pytorch.org/whl/rocm6.3
+    $ pip install --no-cache-dir --pre torch --index-url https://download.pytorch.org/whl/nightly/rocm6.3
    ```

 1. Install [Triton flash attention for ROCm](https://github.com/ROCm/triton)
@ -123,7 +123,7 @@ Building the Docker image from source is the recommended way to use vLLM with RO

 #### (Optional) Build an image with ROCm software stack

-Build a docker image from <gh-file:Dockerfile.rocm_base> which setup ROCm software stack needed by the vLLM.
+Build a docker image from <gh-file:docker/Dockerfile.rocm_base> which sets up the ROCm software stack needed by vLLM.
 **This step is optional as this rocm_base image is usually prebuilt and store at [Docker Hub](https://hub.docker.com/r/rocm/vllm-dev) under tag `rocm/vllm-dev:base` to speed up user experience.**
 If you choose to build this rocm_base image yourself, the steps are as follows.

@ -140,12 +140,12 @@ It is important that the user kicks off the docker build using buildkit. Either
 To build vllm on ROCm 6.3 for MI200 and MI300 series, you can use the default:

 ```console
-DOCKER_BUILDKIT=1 docker build -f Dockerfile.rocm_base -t rocm/vllm-dev:base .
+DOCKER_BUILDKIT=1 docker build -f docker/Dockerfile.rocm_base -t rocm/vllm-dev:base .
 ```

 #### Build an image with vLLM

-First, build a docker image from <gh-file:Dockerfile.rocm> and launch a docker container from the image.
+First, build a docker image from <gh-file:docker/Dockerfile.rocm> and launch a docker container from the image.
 It is important that the user kicks off the docker build using buildkit. Either the user put `DOCKER_BUILDKIT=1` as environment variable when calling docker build command, or the user needs to setup buildkit in the docker daemon configuration /etc/docker/daemon.json as follows and restart the daemon:

 ```console
@ -156,10 +156,10 @@ It is important that the user kicks off the docker build using buildkit. Either
 }
 ```

-<gh-file:Dockerfile.rocm> uses ROCm 6.3 by default, but also supports ROCm 5.7, 6.0, 6.1, and 6.2, in older vLLM branches.
+<gh-file:docker/Dockerfile.rocm> uses ROCm 6.3 by default, but also supports ROCm 5.7, 6.0, 6.1, and 6.2, in older vLLM branches.
 It provides flexibility to customize the build of docker image using the following arguments:

- `BASE_IMAGE`: specifies the base image used when running `docker build`. The default value `rocm/vllm-dev:base` is an image published and maintained by AMD. It is being built using <gh-file:Dockerfile.rocm_base>
+- `BASE_IMAGE`: specifies the base image used when running `docker build`. The default value `rocm/vllm-dev:base` is an image published and maintained by AMD. It is being built using <gh-file:docker/Dockerfile.rocm_base>
 - `USE_CYTHON`: An option to run cython compilation on a subset of python files upon docker build
 - `BUILD_RPD`: Include RocmProfileData profiling tool in the image
 - `ARG_PYTORCH_ROCM_ARCH`: Allows to override the gfx architecture values from the base docker image
@ -169,13 +169,13 @@ Their values can be passed in when running `docker build` with `--build-arg` opt
 To build vllm on ROCm 6.3 for MI200 and MI300 series, you can use the default:

 ```console
-DOCKER_BUILDKIT=1 docker build -f Dockerfile.rocm -t vllm-rocm .
+DOCKER_BUILDKIT=1 docker build -f docker/Dockerfile.rocm -t vllm-rocm .
 ```

 To build vllm on ROCm 6.3 for Radeon RX7900 series (gfx1100), you should pick the alternative base image:

 ```console
-DOCKER_BUILDKIT=1 docker build --build-arg BASE_IMAGE="rocm/vllm-dev:navi_base" -f Dockerfile.rocm -t vllm-rocm .
+DOCKER_BUILDKIT=1 docker build --build-arg BASE_IMAGE="rocm/vllm-dev:navi_base" -f docker/Dockerfile.rocm -t vllm-rocm .
 ```

 To run the above docker image `vllm-rocm`, use the below command:
--- a/docs/source/getting_started/installation/gpu/xpu.inc.md
+++ b/docs/source/getting_started/installation/gpu/xpu.inc.md
@ -54,7 +54,7 @@ Currently, there are no pre-built XPU images.
 ### Build image from source

 ```console
-$ docker build -f Dockerfile.xpu -t vllm-xpu-env --shm-size=4g .
+$ docker build -f docker/Dockerfile.xpu -t vllm-xpu-env --shm-size=4g .
 $ docker run -it \
             --rm \
             --network=host \
--- a/docs/source/getting_started/quickstart.md
+++ b/docs/source/getting_started/quickstart.md
@ -208,5 +208,5 @@ Currently, vLLM supports multiple backends for efficient Attention computation a
 If desired, you can also manually set the backend of your choice by configuring the environment variable `VLLM_ATTENTION_BACKEND` to one of the following options: `FLASH_ATTN`, `FLASHINFER` or `XFORMERS`.

 ```{attention}
-There are no pre-built vllm wheels containing Flash Infer, so you must install it in your environment first. Refer to the [Flash Infer official docs](https://docs.flashinfer.ai/) or see [Dockerfile](https://github.com/vllm-project/vllm/blob/main/Dockerfile) for instructions on how to install it.
+There are no pre-built vllm wheels containing Flash Infer, so you must install it in your environment first. Refer to the [Flash Infer official docs](https://docs.flashinfer.ai/) or see <gh-file:docker/Dockerfile> for instructions on how to install it.
 ```
--- a/docs/source/getting_started/v1_user_guide.md
+++ b/docs/source/getting_started/v1_user_guide.md
@ -156,10 +156,3 @@ vLLM V1 is currently optimized for decoder-only transformers. Models requiring
  cross-attention between separate encoder and decoder are not yet supported (e.g., `BartForConditionalGeneration`, `MllamaForConditionalGeneration`).

 For a complete list of supported models, see the [list of supported models](https://docs.vllm.ai/en/latest/models/supported_models.html).
-
-## Frequently Asked Questions
-
-**I'm using vLLM V1 and I'm getting CUDA OOM errors. What should I do?**
-The default `max_num_seqs` has been raised from `256` in V0 to `1024` in V1. If you encounter CUDA OOM only when using V1 engine, try setting a lower value of `max_num_seqs` or `gpu_memory_utilization`.
-
-On the other hand, if you get an error about insufficient memory for the cache blocks, you should increase `gpu_memory_utilization` as this indicates that your GPU has sufficient memory but you're not allocating enough to vLLM for KV cache blocks.
--- a/docs/source/index.md
+++ b/docs/source/index.md
@ -77,9 +77,9 @@ getting_started/v1_user_guide
 :caption: Models
 :maxdepth: 1

+models/supported_models
 models/generative_models
 models/pooling_models
-models/supported_models
 models/extensions/index
 :::

--- a/docs/source/models/supported_models.md
+++ b/docs/source/models/supported_models.md
@ -1,58 +1,31 @@
 (supported-models)=

-# List of Supported Models
+# Supported Models

-vLLM supports generative and pooling models across various tasks.
+vLLM supports [generative](generative-models) and [pooling](pooling-models) models across various tasks.
 If a model supports more than one task, you can set the task via the `--task` argument.

 For each task, we list the model architectures that have been implemented in vLLM.
 Alongside each architecture, we include some popular models that use it.

-## Loading a Model
+## Model Implementation

-### HuggingFace Hub
+### vLLM

-By default, vLLM loads models from [HuggingFace (HF) Hub](https://huggingface.co/models).
+If vLLM natively supports a model, its implementation can be found in <gh-file:vllm/model_executor/models>.

-To determine whether a given model is natively supported, you can check the `config.json` file inside the HF repository.
-If the `"architectures"` field contains a model architecture listed below, then it should be natively supported.
+These models are what we list in <project:#supported-text-models> and <project:#supported-mm-models>.

-Models do not _need_ to be natively supported to be used in vLLM.
-The <project:#transformers-fallback> enables you to run models directly using their Transformers implementation (or even remote code on the Hugging Face Model Hub!).
+(transformers-backend)=

-:::{tip}
-The easiest way to check if your model is really supported at runtime is to run the program below:
+### Transformers
+
+vLLM also supports model implementations that are available in Transformers. This does not currently work for all models, but most decoder language models are supported, and vision language model support is planned!
+
+To check if the modeling backend is Transformers, you can simply do this:

 ```python
 from vllm import LLM
-
-# For generative models (task=generate) only
-llm = LLM(model=..., task="generate")  # Name or path of your model
-output = llm.generate("Hello, my name is")
-print(output)
-
-# For pooling models (task={embed,classify,reward,score}) only
-llm = LLM(model=..., task="embed")  # Name or path of your model
-output = llm.encode("Hello, my name is")
-print(output)
-```
-
-If vLLM successfully returns text (for generative models) or hidden states (for pooling models), it indicates that your model is supported.
-:::
-
-Otherwise, please refer to [Adding a New Model](#new-model) for instructions on how to implement your model in vLLM.
-Alternatively, you can [open an issue on GitHub](https://github.com/vllm-project/vllm/issues/new/choose) to request vLLM support.
-
-(transformers-fallback)=
-
-### Transformers fallback
-
-vLLM can fallback to model implementations that are available in Transformers. This does not work for all models for now, but most decoder language models are supported, and vision language model support is planned!
-
-To check if the backend is Transformers, you can simply do this:
-
-```python 
-from vllm import LLM
 llm = LLM(model=..., task="generate")  # Name or path of your model
 llm.apply_model(lambda model: print(type(model)))
 ```
@ -69,27 +42,26 @@ vLLM may not fully optimise the Transformers implementation so you may see degra

 #### Supported features

-The Transformers fallback explicitly supports the following features:
+The Transformers modeling backend explicitly supports the following features:

 - <project:#quantization-index> (except GGUF)
 - <project:#lora-adapter>
 - <project:#distributed-serving>

-#### Remote code
+#### Remote Code

-Earlier we mentioned that the Transformers fallback enables you to run remote code models directly in vLLM.
-If you are interested in this feature, this section is for you!
+If your model is supported natively by neither vLLM nor Transformers, you can still run it in vLLM!

 Simply set `trust_remote_code=True` and vLLM will run any model on the Model Hub that is compatible with Transformers.
 Provided that the model writer implements their model in a compatible way, this means that you can run new models before they are officially supported in Transformers or vLLM!

-```python 
+```python
 from vllm import LLM
 llm = LLM(model=..., task="generate", trust_remote_code=True)  # Name or path of your model
 llm.apply_model(lambda model: print(model.__class__))
 ```

-To make your model compatible with the Transformers fallback, it needs:
+To make your model compatible with the Transformers backend, it needs:

 ```{code-block} python
 :caption: modeling_my_model.py
@ -121,7 +93,9 @@ Here is what happens in the background:
 2. `MyModel` Python class is loaded from the `auto_map`, and we check that the model `_supports_attention_backend`.
 3. The `TransformersForCausalLM` backend is used. See <gh-file:vllm/model_executor/models/transformers.py>, which leverage `self.config._attn_implementation = "vllm"`, thus the need to use `ALL_ATTENTION_FUNCTION`.

-To make your model compatible with tensor parallel, it needs:
+That's it!
+
+For your model to be compatible with vLLM's tensor parallel and/or pipeline parallel features, you must add `base_model_tp_plan` and/or `base_model_pp_plan` to your model's config class:

 ```{code-block} python
 :caption: configuration_my_model.py
@ -130,20 +104,65 @@ from transformers import PretrainedConfig

 class MyConfig(PretrainedConfig):
  base_model_tp_plan = {
-    "layers.*.self_attn.q_proj": "colwise",
-    ...
+    "layers.*.self_attn.k_proj": "colwise",
+    "layers.*.self_attn.v_proj": "colwise",
+    "layers.*.self_attn.o_proj": "rowwise",
+    "layers.*.mlp.gate_proj": "colwise",
+    "layers.*.mlp.up_proj": "colwise",
+    "layers.*.mlp.down_proj": "rowwise",
+  }
+  base_model_pp_plan = {
+    "embed_tokens": (["input_ids"], ["inputs_embeds"]),
+    "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
+    "norm": (["hidden_states"], ["hidden_states"]),
  }
 ```

+- `base_model_tp_plan` is a `dict` that maps fully qualified layer name patterns to tensor parallel styles (currently only `"colwise"` and `"rowwise"` are supported).
+- `base_model_pp_plan` is a `dict` that maps direct child layer names to `tuple`s of `list`s of `str`s:
+  * You only need to do this for layers which are not present on all pipeline stages
+  * vLLM assumes that there will be only one `nn.ModuleList`, which is distributed across the pipeline stages
+  * The `list` in the first element of the `tuple` contains the names of the input arguments
+  * The `list` in the last element of the `tuple` contains the names of the variables the layer outputs to in your modeling code
+
+## Loading a Model
+
+### Hugging Face Hub
+
+By default, vLLM loads models from [Hugging Face (HF) Hub](https://huggingface.co/models).
+
+To determine whether a given model is natively supported, you can check the `config.json` file inside the HF repository.
+If the `"architectures"` field contains a model architecture listed below, then it should be natively supported.
+
+Models do not _need_ to be natively supported to be used in vLLM.
+The [Transformers backend](#transformers-backend) enables you to run models directly using their Transformers implementation (or even remote code on the Hugging Face Model Hub!).
+
 :::{tip}
-`base_model_tp_plan` is a `dict` that maps fully qualified layer name patterns to tensor parallel styles (currently only `"colwise"` and `"rowwise"` are supported).
+The easiest way to check if your model is really supported at runtime is to run the program below:
+
+```python
+from vllm import LLM
+
+# For generative models (task=generate) only
+llm = LLM(model=..., task="generate")  # Name or path of your model
+output = llm.generate("Hello, my name is")
+print(output)
+
+# For pooling models (task={embed,classify,reward,score}) only
+llm = LLM(model=..., task="embed")  # Name or path of your model
+output = llm.encode("Hello, my name is")
+print(output)
+```
+
+If vLLM successfully returns text (for generative models) or hidden states (for pooling models), it indicates that your model is supported.
 :::

-That's it!
+Otherwise, please refer to [Adding a New Model](#new-model) for instructions on how to implement your model in vLLM.
+Alternatively, you can [open an issue on GitHub](https://github.com/vllm-project/vllm/issues/new/choose) to request vLLM support.

 ### ModelScope

-To use models from [ModelScope](https://www.modelscope.cn) instead of HuggingFace Hub, set an environment variable:
+To use models from [ModelScope](https://www.modelscope.cn) instead of Hugging Face Hub, set an environment variable:

 ```shell
 export VLLM_USE_MODELSCOPE=True
@ -165,6 +184,8 @@ output = llm.encode("Hello, my name is")
 print(output)
 ```

+(supported-text-models)=
+
 ## List of Text-only Language Models

 ### Generative Models
@ -197,6 +218,11 @@ See [this page](#generative-models) for more information on how to use generativ
  * `baichuan-inc/Baichuan2-13B-Chat`, `baichuan-inc/Baichuan-7B`, etc.
  * ✅︎
  * ✅︎
+- * `BambaForCausalLM`
+  * Bamba
+  * `ibm-ai-platform/Bamba-9B-fp8`, `ibm-ai-platform/Bamba-9B`
+  *
+  *
 - * `BloomForCausalLM`
  * BLOOM, BLOOMZ, BLOOMChat
  * `bigscience/bloom`, `bigscience/bloomz`, etc.
@ -207,9 +233,9 @@ See [this page](#generative-models) for more information on how to use generativ
  * `facebook/bart-base`, `facebook/bart-large-cnn`, etc.
  *
  *
- * `ChatGLMModel`
+- * `ChatGLMModel`, `ChatGLMForConditionalGeneration`
  * ChatGLM
-  * `THUDM/chatglm2-6b`, `THUDM/chatglm3-6b`, etc.
+  * `THUDM/chatglm2-6b`, `THUDM/chatglm3-6b`, `ShieldLM-6B-chatglm3`, etc.
  * ✅︎
  * ✅︎
 - * `CohereForCausalLM`, `Cohere2ForCausalLM`
@ -452,6 +478,16 @@ See [this page](#generative-models) for more information on how to use generativ
  * `Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc.
  *
  * ✅︎
+- * `Qwen3ForCausalLM`
+  * Qwen3
+  * `Qwen/Qwen3-8B`, etc.
+  * ✅︎
+  * ✅︎
+- * `Qwen3MoeForCausalLM`
+  * Qwen3MoE
+  * `Qwen/Qwen3-MoE-15B-A2B`, etc.
+  * ✅︎
+  * ✅︎
 - * `StableLmForCausalLM`
  * StableLM
  * `stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc.
@ -482,6 +518,11 @@ See [this page](#generative-models) for more information on how to use generativ
  * `xverse/XVERSE-7B-Chat`, `xverse/XVERSE-13B-Chat`, `xverse/XVERSE-65B-Chat`, etc.
  * ✅︎
  * ✅︎
+- * `MiniMaxText01ForCausalLM`
+  * MiniMax-Text
+  * `MiniMaxAI/MiniMax-Text-01`, etc.
+  *
+  * ✅︎
 - * `Zamba2ForCausalLM`
  * Zamba2
  * `Zyphra/Zamba2-7B-instruct`, `Zyphra/Zamba2-2.7B-instruct`, `Zyphra/Zamba2-1.2B-instruct`, etc.
@ -545,7 +586,7 @@ you should explicitly specify the task type to ensure that the model is used in
  *
 - * `XLMRobertaModel`
  * XLM-RoBERTa-based
-  * `intfloat/multilingual-e5-large`, etc.
+  * `intfloat/multilingual-e5-large`, `jinaai/jina-reranker-v2-base-multilingual`, etc.
  *
  *
 :::
@ -732,6 +773,13 @@ See [this page](#generative-models) for more information on how to use generativ
  *
  * ✅︎
  * ✅︎
+- * `AyaVisionForConditionalGeneration`
+  * Aya Vision
+  * T + I<sup>+</sup>
+  * `CohereForAI/aya-vision-8b`, `CohereForAI/aya-vision-32b`, etc.
+  *
+  * ✅︎
+  * ✅︎
 - * `Blip2ForConditionalGeneration`
  * BLIP-2
  * T + I<sup>E</sup>
@ -802,6 +850,13 @@ See [this page](#generative-models) for more information on how to use generativ
  *
  * ✅︎
  * ✅︎
+- * `Llama4ForConditionalGeneration`
+  * Llama-4-17B-Omni-Instruct
+  * T + I<sup>+</sup>
+  * `meta-llama/Llama-4-Scout-17B-16E-Instruct`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct`, etc.
+  *
+  * ✅︎
+  * ✅︎
 - * `LlavaForConditionalGeneration`
  * LLaVA-1.5
  * T + I<sup>E+</sup>
@ -844,6 +899,13 @@ See [this page](#generative-models) for more information on how to use generativ
  * ✅︎
  * ✅︎
  * ✅︎
+- * `Mistral3ForConditionalGeneration`
+  * Mistral3
+  * T + I<sup>+</sup>
+  * `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, etc.
+  *
+  * ✅︎
+  * ✅︎
 - * `MllamaForConditionalGeneration`
  * Llama 3.2
  * T + I<sup>+</sup>
@ -1066,7 +1128,7 @@ At vLLM, we are committed to facilitating the integration and support of third-p
 2. **Best-Effort Consistency**: While we aim to maintain a level of consistency between the models implemented in vLLM and other frameworks like transformers, complete alignment is not always feasible. Factors like acceleration techniques and the use of low-precision computations can introduce discrepancies. Our commitment is to ensure that the implemented models are functional and produce sensible results.

    :::{tip}
-    When comparing the output of `model.generate` from HuggingFace Transformers with the output of `llm.generate` from vLLM, note that the former reads the model's generation config file (i.e., [generation_config.json](https://github.com/huggingface/transformers/blob/19dabe96362803fb0a9ae7073d03533966598b17/src/transformers/generation/utils.py#L1945)) and applies the default parameters for generation, while the latter only uses the parameters passed to the function. Ensure all sampling parameters are identical when comparing outputs.
+    When comparing the output of `model.generate` from Hugging Face Transformers with the output of `llm.generate` from vLLM, note that the former reads the model's generation config file (i.e., [generation_config.json](https://github.com/huggingface/transformers/blob/19dabe96362803fb0a9ae7073d03533966598b17/src/transformers/generation/utils.py#L1945)) and applies the default parameters for generation, while the latter only uses the parameters passed to the function. Ensure all sampling parameters are identical when comparing outputs.
    :::

 3. **Issue Resolution and Model Updates**: Users are encouraged to report any bugs or issues they encounter with third-party models. Proposed fixes should be submitted via PRs, with a clear explanation of the problem and the rationale behind the proposed solution. If a fix for one model impacts another, we rely on the community to highlight and address these cross-model dependencies. Note: for bugfix PRs, it is good etiquette to inform the original author to seek their feedback.
@ -1083,5 +1145,5 @@ We have the following levels of testing for models:

 1. **Strict Consistency**: We compare the output of the model with the output of the model in the HuggingFace Transformers library under greedy decoding. This is the most stringent test. Please refer to [models tests](https://github.com/vllm-project/vllm/blob/main/tests/models) for the models that have passed this test.
 2. **Output Sensibility**: We check if the output of the model is sensible and coherent, by measuring the perplexity of the output and checking for any obvious errors. This is a less stringent test.
-3. **Runtime Functionality**: We check if the model can be loaded and run without errors. This is the least stringent test. Please refer to [functionality tests](gh-dir:tests) and [examples](gh-dir:main/examples) for the models that have passed this test.
+3. **Runtime Functionality**: We check if the model can be loaded and run without errors. This is the least stringent test. Please refer to [functionality tests](gh-dir:tests) and [examples](gh-dir:examples) for the models that have passed this test.
 4. **Community Feedback**: We rely on the community to provide feedback on the models. If a model is broken or not working as expected, we encourage users to raise issues to report it or open pull requests to fix it. The rest of the models fall under this category.
--- a/docs/source/serving/openai_compatible_server.md
+++ b/docs/source/serving/openai_compatible_server.md
@ -188,6 +188,7 @@ For example:
 ```yaml
 # config.yaml

+model: meta-llama/Llama-3.1-8B-Instruct
 host: "127.0.0.1"
 port: 6379
 uvicorn-log-level: "info"
@ -196,12 +197,13 @@ uvicorn-log-level: "info"
 To use the above config file:

 ```bash
-vllm serve SOME_MODEL --config config.yaml
+vllm serve --config config.yaml
 ```

 :::{note}
 In case an argument is supplied simultaneously using command line and the config file, the value from the command line will take precedence.
 The order of priorities is `command line > config file values > defaults`.
+For example, in `vllm serve SOME_MODEL --config config.yaml`, `SOME_MODEL` takes precedence over `model` in the config file.
 :::

 ## API Reference
--- a/docs/source/serving/usage_stats.md
+++ b/docs/source/serving/usage_stats.md
@ -1,6 +1,8 @@
 # Usage Stats Collection

-vLLM collects anonymous usage data by default to help the engineering team better understand which hardware and model configurations are widely used. This data allows them to prioritize their efforts on the most common workloads. The collected data is transparent, does not contain any sensitive information, and will be publicly released for the community's benefit.
+vLLM collects anonymous usage data by default to help the engineering team better understand which hardware and model configurations are widely used. This data allows them to prioritize their efforts on the most common workloads. The collected data is transparent and does not contain any sensitive information.
+
+A subset of the data, after cleaning and aggregation, will be publicly released for the community's benefit. For example, you can see the 2024 usage report [here](https://2024.vllm.ai).

 ## What data is collected?

--- a/examples/offline_inference/audio_language.py
+++ b/examples/offline_inference/audio_language.py
@ -47,7 +47,7 @@ def run_minicpmo(question: str, audio_count: int) -> ModelRequestData:
        model=model_name,
        trust_remote_code=True,
        max_model_len=4096,
-        max_num_seqs=5,
+        max_num_seqs=2,
        limit_mm_per_prompt={"audio": audio_count},
    )

--- a/examples/offline_inference/eagle.py
+++ b/examples/offline_inference/eagle.py
@ -7,89 +7,102 @@ from transformers import AutoTokenizer

 from vllm import LLM, SamplingParams

-parser = argparse.ArgumentParser()

-parser.add_argument(
-    "--dataset",
-    type=str,
-    default="./examples/data/gsm8k.jsonl",
-    help="downloaded from the eagle repo " \
-    "https://github.com/SafeAILab/EAGLE/blob/main/eagle/data/"
-)
-parser.add_argument("--max_num_seqs", type=int, default=8)
-parser.add_argument("--num_prompts", type=int, default=80)
-parser.add_argument("--num_spec_tokens", type=int, default=2)
-parser.add_argument("--tp", type=int, default=1)
-parser.add_argument("--draft_tp", type=int, default=1)
-parser.add_argument("--enforce_eager", action='store_true')
-parser.add_argument("--enable_chunked_prefill", action='store_true')
-parser.add_argument("--max_num_batched_tokens", type=int, default=2048)
-parser.add_argument("--temp", type=float, default=0)
+def load_prompts(dataset_path, num_prompts):
+    if os.path.exists(dataset_path):
+        prompts = []
+        try:
+            with open(dataset_path) as f:
+                for line in f:
+                    data = json.loads(line)
+                    prompts.append(data["turns"][0])
+        except Exception as e:
+            print(f"Error reading dataset: {e}")
+            return []
+    else:
+        prompts = [
+            "The future of AI is", "The president of the United States is"
+        ]

-args = parser.parse_args()
+    return prompts[:num_prompts]

-print(args)

-model_dir = "meta-llama/Meta-Llama-3-8B-Instruct"
-eagle_dir = "abhigoyal/EAGLE-LLaMA3-Instruct-8B-vllm"
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--dataset",
+        type=str,
+        default="./examples/data/gsm8k.jsonl",
+        help="downloaded from the eagle repo " \
+        "https://github.com/SafeAILab/EAGLE/blob/main/eagle/data/"
+    )
+    parser.add_argument("--max_num_seqs", type=int, default=8)
+    parser.add_argument("--num_prompts", type=int, default=80)
+    parser.add_argument("--num_spec_tokens", type=int, default=2)
+    parser.add_argument("--tp", type=int, default=1)
+    parser.add_argument("--draft_tp", type=int, default=1)
+    parser.add_argument("--enforce_eager", action='store_true')
+    parser.add_argument("--enable_chunked_prefill", action='store_true')
+    parser.add_argument("--max_num_batched_tokens", type=int, default=2048)
+    parser.add_argument("--temp", type=float, default=0)
+    args = parser.parse_args()

-max_model_len = 2048
+    model_dir = "meta-llama/Meta-Llama-3-8B-Instruct"
+    eagle_dir = "abhigoyal/EAGLE-LLaMA3-Instruct-8B-vllm"

-tokenizer = AutoTokenizer.from_pretrained(model_dir)
+    max_model_len = 2048

-if os.path.exists(args.dataset):
-    prompts = []
-    num_prompts = args.num_prompts
-    with open(args.dataset) as f:
-        for line in f:
-            data = json.loads(line)
-            prompts.append(data["turns"][0])
-else:
-    prompts = ["The future of AI is", "The president of the United States is"]
+    tokenizer = AutoTokenizer.from_pretrained(model_dir)

-prompts = prompts[:args.num_prompts]
-num_prompts = len(prompts)
+    prompts = load_prompts(args.dataset, args.num_prompts)

-prompt_ids = [
-    tokenizer.apply_chat_template([{
-        "role": "user",
-        "content": prompt
-    }],
-                                  add_generation_prompt=True)
-    for prompt in prompts
-]
+    prompt_ids = [
+        tokenizer.apply_chat_template([{
+            "role": "user",
+            "content": prompt
+        }],
+                                      add_generation_prompt=True)
+        for prompt in prompts
+    ]

-llm = LLM(
-    model=model_dir,
-    trust_remote_code=True,
-    tensor_parallel_size=args.tp,
-    enable_chunked_prefill=args.enable_chunked_prefill,
-    max_num_batched_tokens=args.max_num_batched_tokens,
-    enforce_eager=args.enforce_eager,
-    max_model_len=max_model_len,
-    max_num_seqs=args.max_num_seqs,
-    gpu_memory_utilization=0.8,
-    speculative_config={
-        "model": eagle_dir,
-        "num_speculative_tokens": args.num_spec_tokens,
-        "draft_tensor_parallel_size": args.draft_tp,
-        "max_model_len": max_model_len,
-    },
-    disable_log_stats=False,
-)
+    llm = LLM(
+        model=model_dir,
+        trust_remote_code=True,
+        tensor_parallel_size=args.tp,
+        enable_chunked_prefill=args.enable_chunked_prefill,
+        max_num_batched_tokens=args.max_num_batched_tokens,
+        enforce_eager=args.enforce_eager,
+        max_model_len=max_model_len,
+        max_num_seqs=args.max_num_seqs,
+        gpu_memory_utilization=0.8,
+        speculative_config={
+            "model": eagle_dir,
+            "num_speculative_tokens": args.num_spec_tokens,
+            "draft_tensor_parallel_size": args.draft_tp,
+            "max_model_len": max_model_len,
+        },
+        disable_log_stats=False,
+    )

-sampling_params = SamplingParams(temperature=args.temp, max_tokens=256)
+    sampling_params = SamplingParams(temperature=args.temp, max_tokens=256)

-outputs = llm.generate(prompt_token_ids=prompt_ids,
-                       sampling_params=sampling_params)
+    outputs = llm.generate(prompt_token_ids=prompt_ids,
+                           sampling_params=sampling_params)

-# calculate the average number of accepted tokens per forward pass, +1 is
-# to account for the token from the target model that's always going to be
-# accepted
-acceptance_counts = [0] * (args.num_spec_tokens + 1)
-for output in outputs:
-    for step, count in enumerate(output.metrics.spec_token_acceptance_counts):
-        acceptance_counts[step] += count
+    # calculate the average number of accepted tokens per forward pass, +1 is
+    # to account for the token from the target model that's always going to be
+    # accepted
+    acceptance_counts = [0] * (args.num_spec_tokens + 1)
+    for output in outputs:
+        for step, count in enumerate(
+                output.metrics.spec_token_acceptance_counts):
+            acceptance_counts[step] += count

-print(f"mean acceptance length: \
-    {sum(acceptance_counts) / acceptance_counts[0]:.2f}")
+    print("-" * 50)
+    print(f"mean acceptance length: \
+        {sum(acceptance_counts) / acceptance_counts[0]:.2f}")
+    print("-" * 50)
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/offline_inference/encoder_decoder.py
+++ b/examples/offline_inference/encoder_decoder.py
@ -75,8 +75,6 @@ prompts = [
    enc_dec_prompt1, enc_dec_prompt2, enc_dec_prompt3
 ] + zipped_prompt_list

-print(prompts)
-
 # Create a sampling params object.
 sampling_params = SamplingParams(
    temperature=0,
@ -91,10 +89,13 @@ sampling_params = SamplingParams(
 outputs = llm.generate(prompts, sampling_params)

 # Print the outputs.
-for output in outputs:
+print("-" * 50)
+for i, output in enumerate(outputs):
    prompt = output.prompt
    encoder_prompt = output.encoder_prompt
    generated_text = output.outputs[0].text
-    print(f"Encoder prompt: {encoder_prompt!r}, "
-          f"Decoder prompt: {prompt!r}, "
+    print(f"Output {i+1}:")
+    print(f"Encoder prompt: {encoder_prompt!r}\n"
+          f"Decoder prompt: {prompt!r}\n"
          f"Generated text: {generated_text!r}")
+    print("-" * 50)
--- a/examples/offline_inference/llm_engine_example.py
+++ b/examples/offline_inference/llm_engine_example.py
@ -1,5 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
-
+"""
+This file demonstrates using the `LLMEngine`
+for processing prompts with various sampling parameters.
+"""
 import argparse

 from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams
@ -26,6 +29,7 @@ def process_requests(engine: LLMEngine,
    """Continuously process a list of prompts and handle the outputs."""
    request_id = 0

+    print('-' * 50)
    while test_prompts or engine.has_unfinished_requests():
        if test_prompts:
            prompt, sampling_params = test_prompts.pop(0)
@ -37,6 +41,7 @@ def process_requests(engine: LLMEngine,
        for request_output in request_outputs:
            if request_output.finished:
                print(request_output)
+                print('-' * 50)


 def initialize_engine(args: argparse.Namespace) -> LLMEngine:
--- a/examples/offline_inference/load_sharded_state.py
+++ b/examples/offline_inference/load_sharded_state.py
@ -0,0 +1,93 @@
+# SPDX-License-Identifier: Apache-2.0
+"""
+Validates the loading of a model saved with the sharded_state format.
+This script demonstrates how to load a model that was previously saved
+using save_sharded_state.py and validates it by running inference.
+Example usage:
+(You first need to save a model in the sharded_state format.)
+
+python save_sharded_state.py \
+    --model /path/to/load \
+    --quantization deepspeedfp \
+    --tensor-parallel-size 8 \
+    --output /path/to/save/sharded/model
+
+python load_sharded_state.py \
+    --model /path/to/saved/sharded/model \
+    --load-format sharded_state \
+    --quantization deepspeedfp \
+    --tensor-parallel-size 8 \
+    --prompt "Hello, my name is" \
+    --max-tokens 50
+"""
+
+import dataclasses
+
+from vllm import LLM, EngineArgs, SamplingParams
+from vllm.utils import FlexibleArgumentParser
+
+
+def parse_args():
+    parser = FlexibleArgumentParser()
+    # Add engine arguments
+    EngineArgs.add_cli_args(parser)
+
+    # Override default load_format for clarity
+    parser.set_defaults(load_format="sharded_state")
+
+    # Add validation arguments
+    parser.add_argument("--prompt",
+                        type=str,
+                        default="Hello, world!",
+                        help="Prompt for validation")
+    parser.add_argument("--max-tokens",
+                        type=int,
+                        default=100,
+                        help="Maximum number of tokens to generate")
+    parser.add_argument("--temperature",
+                        type=float,
+                        default=0.7,
+                        help="Sampling temperature")
+    parser.add_argument("--top-p",
+                        type=float,
+                        default=1.0,
+                        help="Top-p sampling parameter")
+
+    return parser.parse_args()
+
+
+def main():
+    args = parse_args()
+    engine_args = EngineArgs.from_cli_args(args)
+
+    print(f"Loading model from {engine_args.model} "
+          f"using format {engine_args.load_format}")
+    print(f"Tensor parallel size: {engine_args.tensor_parallel_size}")
+
+    # Load the model using engine args
+    llm = LLM(**dataclasses.asdict(engine_args))
+
+    # Prepare sampling parameters
+    sampling_params = SamplingParams(
+        temperature=args.temperature,
+        top_p=args.top_p,
+        max_tokens=args.max_tokens,
+    )
+
+    print("\nRunning inference:")
+    print(f"Prompt: {args.prompt}")
+
+    # Generate completion
+    outputs = llm.generate(args.prompt, sampling_params)
+
+    # Display generated text
+    print("\nGenerated outputs:")
+    for output in outputs:
+        generated_text = output.outputs[0].text
+        print("-" * 50)
+        print(f"Full output: {args.prompt}{generated_text}")
+        print("-" * 50)
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/offline_inference/mistral-small.py
+++ b/examples/offline_inference/mistral-small.py
@ -13,9 +13,14 @@ from vllm.sampling_params import SamplingParams
 # - Server:
 #
 # ```bash
+# # Mistral format
 # vllm serve mistralai/Mistral-Small-3.1-24B-Instruct-2503 \
 #   --tokenizer-mode mistral --config-format mistral --load-format mistral \
 #   --limit-mm-per-prompt 'image=4' --max-model-len 16384
+#
+# # HF format
+# vllm serve mistralai/Mistral-Small-3.1-24B-Instruct-2503 \
+#   --limit-mm-per-prompt 'image=4' --max-model-len 16384
 # ```
 #
 # - Client:
@ -44,19 +49,22 @@ from vllm.sampling_params import SamplingParams
 #     python demo.py simple
 #     python demo.py advanced

+# Lower max_model_len and/or max_num_seqs on low-VRAM GPUs.
+# These scripts have been tested on 2x L40 GPUs
+

 def run_simple_demo(args: argparse.Namespace):
    model_name = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
    sampling_params = SamplingParams(max_tokens=8192)

-    # Lower max_model_len and/or max_num_seqs on low-VRAM GPUs.
    llm = LLM(
        model=model_name,
-        tokenizer_mode="mistral",
-        config_format="mistral",
-        load_format="mistral",
+        tokenizer_mode="mistral" if args.format == "mistral" else "auto",
+        config_format="mistral" if args.format == "mistral" else "auto",
+        load_format="mistral" if args.format == "mistral" else "auto",
        max_model_len=4096,
        max_num_seqs=2,
+        tensor_parallel_size=2,
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )

@ -88,17 +96,18 @@ def run_simple_demo(args: argparse.Namespace):

 def run_advanced_demo(args: argparse.Namespace):
    model_name = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
-    max_img_per_msg = 5
+    max_img_per_msg = 3
    max_tokens_per_img = 4096

    sampling_params = SamplingParams(max_tokens=8192, temperature=0.7)
    llm = LLM(
        model=model_name,
-        tokenizer_mode="mistral",
-        config_format="mistral",
-        load_format="mistral",
+        tokenizer_mode="mistral" if args.format == "mistral" else "auto",
+        config_format="mistral" if args.format == "mistral" else "auto",
+        load_format="mistral" if args.format == "mistral" else "auto",
        limit_mm_per_prompt={"image": max_img_per_msg},
        max_model_len=max_img_per_msg * max_tokens_per_img,
+        tensor_parallel_size=2,
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )

@ -166,6 +175,11 @@ def main():
        help="Specify the demo mode: 'simple' or 'advanced'",
    )

+    parser.add_argument('--format',
+                        choices=["mistral", "hf"],
+                        default="mistral",
+                        help='Specify the format of the model to load.')
+
    parser.add_argument(
        '--disable-mm-preprocessor-cache',
        action='store_true',
--- a/examples/offline_inference/mlpspeculator.py
+++ b/examples/offline_inference/mlpspeculator.py
@ -1,4 +1,11 @@
 # SPDX-License-Identifier: Apache-2.0
+"""
+This file demonstrates the usage of text generation with an LLM model,
+comparing the performance with and without speculative decoding.
+
+Note that this example does not support `v1` yet, so run it with V1 disabled:
+VLLM_USE_V1=0 python examples/offline_inference/mlpspeculator.py
+"""

 import gc
 import time
@ -7,7 +14,7 @@ from vllm import LLM, SamplingParams


 def time_generation(llm: LLM, prompts: list[str],
-                    sampling_params: SamplingParams):
+                    sampling_params: SamplingParams, title: str):
    # Generate texts from the prompts. The output is a list of RequestOutput
    # objects that contain the prompt, generated text, and other information.
    # Warmup first
@ -16,11 +23,15 @@ def time_generation(llm: LLM, prompts: list[str],
    start = time.time()
    outputs = llm.generate(prompts, sampling_params)
    end = time.time()
-    print((end - start) / sum([len(o.outputs[0].token_ids) for o in outputs]))
+    print("-" * 50)
+    print(title)
+    print("time: ",
+          (end - start) / sum(len(o.outputs[0].token_ids) for o in outputs))
    # Print the outputs.
    for output in outputs:
        generated_text = output.outputs[0].text
        print(f"text: {generated_text!r}")
+        print("-" * 50)


 if __name__ == "__main__":
@ -41,8 +52,7 @@ if __name__ == "__main__":
    # Create an LLM without spec decoding
    llm = LLM(model="meta-llama/Llama-2-13b-chat-hf")

-    print("Without speculation")
-    time_generation(llm, prompts, sampling_params)
+    time_generation(llm, prompts, sampling_params, "Without speculation")

    del llm
    gc.collect()
@ -55,5 +65,4 @@ if __name__ == "__main__":
        },
    )

-    print("With speculation")
-    time_generation(llm, prompts, sampling_params)
+    time_generation(llm, prompts, sampling_params, "With speculation")
--- a/examples/offline_inference/save_sharded_state.py
+++ b/examples/offline_inference/save_sharded_state.py
@ -57,10 +57,25 @@ def main(args):
    # Prepare output directory
    Path(args.output).mkdir(exist_ok=True)
    # Dump worker states to output directory
-    model_executor = llm.llm_engine.model_executor
-    model_executor.save_sharded_state(path=args.output,
-                                      pattern=args.file_pattern,
-                                      max_size=args.max_file_size)
+
+    # Check which engine version is being used
+    is_v1_engine = hasattr(llm.llm_engine, "engine_core")
+
+    if is_v1_engine:
+        # For V1 engine, we need to use engine_core.save_sharded_state
+        print("Using V1 engine save path")
+        llm.llm_engine.engine_core.save_sharded_state(
+            path=args.output,
+            pattern=args.file_pattern,
+            max_size=args.max_file_size)
+    else:
+        # For V0 engine
+        print("Using V0 engine save path")
+        model_executor = llm.llm_engine.model_executor
+        model_executor.save_sharded_state(path=args.output,
+                                          pattern=args.file_pattern,
+                                          max_size=args.max_file_size)
+
    # Copy metadata files to output directory
    for file in os.listdir(model_path):
        if os.path.splitext(file)[1] not in (".bin", ".pt", ".safetensors"):
--- a/examples/offline_inference/torchrun_example.py
+++ b/examples/offline_inference/torchrun_example.py
@ -23,10 +23,14 @@ sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

 # Use `distributed_executor_backend="external_launcher"` so that
 # this llm engine/instance only creates one worker.
+# It is important to set an explicit seed to make sure that
+# all ranks have the same random seed, so that sampling can be
+# deterministic across ranks.
 llm = LLM(
    model="facebook/opt-125m",
    tensor_parallel_size=2,
    distributed_executor_backend="external_launcher",
+    seed=0,
 )

 outputs = llm.generate(prompts, sampling_params)
--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@ -60,6 +60,28 @@ def run_aria(questions: list[str], modality: str) -> ModelRequestData:
    )


+# Aya Vision
+def run_aya_vision(questions: list[str], modality: str) -> ModelRequestData:
+    assert modality == "image"
+    model_name = "CohereForAI/aya-vision-8b"
+
+    engine_args = EngineArgs(
+        model=model_name,
+        max_model_len=2048,
+        max_num_seqs=2,
+        mm_processor_kwargs={"crop_to_patches": True},
+        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+    )
+    prompts = [
+        f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|><image>{question}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"
+        for question in questions
+    ]
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )
+
+
 # BLIP-2
 def run_blip2(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"
@ -498,6 +520,29 @@ def run_minicpmv(questions: list[str], modality: str) -> ModelRequestData:
    return run_minicpmv_base(questions, modality, "openbmb/MiniCPM-V-2_6")


+# Mistral-3 HF-format
+def run_mistral3(questions: list[str], modality: str) -> ModelRequestData:
+    assert modality == "image"
+
+    model_name = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
+
+    # NOTE: Need L40 (or equivalent) to avoid OOM
+    engine_args = EngineArgs(
+        model=model_name,
+        max_model_len=8192,
+        max_num_seqs=2,
+        tensor_parallel_size=2,
+        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+    )
+
+    prompts = [f"<s>[INST]{question}\n[IMG][/INST]" for question in questions]
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )
+
+
 # LLama 3.2
 def run_mllama(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"
@ -537,6 +582,42 @@ def run_mllama(questions: list[str], modality: str) -> ModelRequestData:
    )


+def run_llama4(questions: list[str], modality: str):
+    assert modality == "image"
+
+    model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
+
+    engine_args = EngineArgs(
+        model=model_name,
+        max_model_len=8192,
+        max_num_seqs=4,
+        tensor_parallel_size=8,
+        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+        gpu_memory_utilization=0.4,
+    )
+
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    messages = [[{
+        "role":
+        "user",
+        "content": [{
+            "type": "image"
+        }, {
+            "type": "text",
+            "text": f"{question}"
+        }]
+    }] for question in questions]
+    prompts = tokenizer.apply_chat_template(messages,
+                                            add_generation_prompt=True,
+                                            tokenize=False)
+    stop_token_ids = None
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+        stop_token_ids=stop_token_ids,
+    )
+
+
 # Molmo
 def run_molmo(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"
@ -842,6 +923,7 @@ def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData:

 model_example_map = {
    "aria": run_aria,
+    "aya_vision": run_aya_vision,
    "blip-2": run_blip2,
    "chameleon": run_chameleon,
    "deepseek_vl_v2": run_deepseek_vl2,
@ -859,7 +941,9 @@ model_example_map = {
    "mantis": run_mantis,
    "minicpmo": run_minicpmo,
    "minicpmv": run_minicpmv,
+    "mistral3": run_mistral3,
    "mllama": run_mllama,
+    "llama4": run_llama4,
    "molmo": run_molmo,
    "NVLM_D": run_nvlm_d,
    "paligemma": run_paligemma,
--- a/examples/offline_inference/vision_language_multi_image.py
+++ b/examples/offline_inference/vision_language_multi_image.py
@ -61,6 +61,41 @@ def load_aria(question: str, image_urls: list[str]) -> ModelRequestData:
    )


+def load_aya_vision(question: str, image_urls: list[str]) -> ModelRequestData:
+    model_name = "CohereForAI/aya-vision-8b"
+
+    engine_args = EngineArgs(
+        model=model_name,
+        max_num_seqs=2,
+        limit_mm_per_prompt={"image": len(image_urls)},
+    )
+
+    placeholders = [{"type": "image", "image": url} for url in image_urls]
+    messages = [{
+        "role":
+        "user",
+        "content": [
+            *placeholders,
+            {
+                "type": "text",
+                "text": question
+            },
+        ],
+    }]
+
+    processor = AutoProcessor.from_pretrained(model_name)
+
+    prompt = processor.apply_chat_template(messages,
+                                           tokenize=False,
+                                           add_generation_prompt=True)
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompt,
+        image_data=[fetch_image(url) for url in image_urls],
+    )
+
+
 def load_deepseek_vl2(question: str,
                      image_urls: list[str]) -> ModelRequestData:
    model_name = "deepseek-ai/deepseek-vl2-tiny"
@ -218,6 +253,65 @@ def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData:
    )


+def load_llama4(question: str, image_urls: list[str]) -> ModelRequestData:
+    # Build the engine arguments, the chat-templated prompt and the fetched
+    # images for a multi-image request to Llama-4-Scout-17B-16E-Instruct.
+    model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
+
+    # NOTE(review): tensor_parallel_size=8 presumably assumes 8 GPUs are
+    # available -- confirm and adjust for the target machine.
+    engine_args = EngineArgs(
+        model=model_name,
+        max_model_len=8192,
+        max_num_seqs=4,
+        tensor_parallel_size=8,
+        # Allow as many images per prompt as URLs were supplied.
+        limit_mm_per_prompt={"image": len(image_urls)},
+    )
+
+    # A single user turn: one image entry per URL, followed by the question.
+    placeholders = [{"type": "image", "image": url} for url in image_urls]
+    messages = [{
+        "role":
+        "user",
+        "content": [
+            *placeholders,
+            {
+                "type": "text",
+                "text": question
+            },
+        ],
+    }]
+
+    processor = AutoProcessor.from_pretrained(model_name)
+
+    # tokenize=False asks for the rendered prompt as text rather than token
+    # ids; add_generation_prompt=True is passed through to the template.
+    prompt = processor.apply_chat_template(messages,
+                                           tokenize=False,
+                                           add_generation_prompt=True)
+
+    # The images are fetched here and handed over alongside the prompt.
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompt,
+        image_data=[fetch_image(url) for url in image_urls],
+    )
+
+
+def load_mistral3(question: str, image_urls: list[str]) -> ModelRequestData:
+    # Build the engine arguments, a hand-written prompt and the fetched images
+    # for a multi-image request to Mistral-Small-3.1-24B-Instruct-2503.
+    model_name = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
+
+    # Adjust this as necessary to fit in GPU
+    engine_args = EngineArgs(
+        model=model_name,
+        max_model_len=8192,
+        max_num_seqs=2,
+        tensor_parallel_size=2,
+        # Allow as many images per prompt as URLs were supplied.
+        limit_mm_per_prompt={"image": len(image_urls)},
+    )
+
+    # The prompt is assembled directly instead of through a chat template:
+    # the question, a newline, then one "[IMG]" marker per image URL, all
+    # wrapped in "<s>[INST]" ... "[/INST]".
+    placeholders = "[IMG]" * len(image_urls)
+    prompt = f"<s>[INST]{question}\n{placeholders}[/INST]"
+
+    # The images are fetched here and handed over alongside the prompt.
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompt,
+        image_data=[fetch_image(url) for url in image_urls],
+    )
+
+
 def load_mllama(question: str, image_urls: list[str]) -> ModelRequestData:
    model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"

@ -504,11 +598,14 @@ def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData:

 model_example_map = {
    "aria": load_aria,
+    "aya_vision": load_aya_vision,
    "deepseek_vl_v2": load_deepseek_vl2,
    "gemma3": load_gemma3,
    "h2ovl_chat": load_h2ovl,
    "idefics3": load_idefics3,
    "internvl_chat": load_internvl,
+    "llama4": load_llama4,
+    "mistral3": load_mistral3,
    "mllama": load_mllama,
    "NVLM_D": load_nvlm_d,
    "phi3_v": load_phi3v,
--- a/examples/online_serving/openai_chat_completion_client_with_tools_required.py
+++ b/examples/online_serving/openai_chat_completion_client_with_tools_required.py
@ -0,0 +1,136 @@
+# SPDX-License-Identifier: Apache-2.0
+"""
+To run this example, start the vLLM server on the V0 engine
+with the outlines guided-decoding backend:
+
+```bash
+VLLM_USE_V1=0 vllm serve unsloth/Llama-3.2-1B-Instruct \
+    --guided-decoding-backend outlines
+```
+
+This example demonstrates how to generate chat completions 
+using the OpenAI Python client library.
+"""
+
+from openai import OpenAI
+
+# Modify OpenAI's API key and API base to use vLLM's API server.
+openai_api_key = "EMPTY"
+openai_api_base = "http://localhost:8000/v1"
+
+client = OpenAI(
+    # defaults to os.environ.get("OPENAI_API_KEY")
+    api_key=openai_api_key,
+    base_url=openai_api_base,
+)
+
+# Every request below uses the first model the server lists.
+models = client.models.list()
+model = models.data[0].id
+
+# Two function tools are offered to the model: one for the current weather
+# and one for a multi-day forecast. In both schemas every declared property
+# is also listed under "required".
+tools = [
+    {
+        "type": "function",
+        "function": {
+            "name": "get_current_weather",
+            "description": "Get the current weather in a given location",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "city": {
+                        "type":
+                        "string",
+                        "description":
+                        "The city to find the weather for"
+                        ", e.g. 'San Francisco'",
+                    },
+                    "state": {
+                        "type":
+                        "string",
+                        "description":
+                        "the two-letter abbreviation for the state that the "
+                        "city is in, e.g. 'CA' which would mean 'California'",
+                    },
+                    "unit": {
+                        "type": "string",
+                        "description": "The unit to fetch the temperature in",
+                        "enum": ["celsius", "fahrenheit"],
+                    },
+                },
+                "required": ["city", "state", "unit"],
+            },
+        },
+    },
+    {
+        "type": "function",
+        "function": {
+            "name": "get_forecast",
+            "description": "Get the weather forecast for a given location",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "city": {
+                        "type":
+                        "string",
+                        "description":
+                        "The city to get the forecast for, e.g. 'New York'",
+                    },
+                    "state": {
+                        "type":
+                        "string",
+                        "description":
+                        "The two-letter abbreviation for the state, e.g. 'NY'",
+                    },
+                    "days": {
+                        "type":
+                        "integer",
+                        "description":
+                        "Number of days to get the forecast for (1-7)",
+                    },
+                    "unit": {
+                        "type": "string",
+                        "description": "The unit to fetch the temperature in",
+                        "enum": ["celsius", "fahrenheit"],
+                    },
+                },
+                "required": ["city", "state", "days", "unit"],
+            },
+        },
+    },
+]
+
+# Conversation history sent with both requests. The final user turn asks for
+# the current weather and a forecast, so both tools above are relevant.
+messages = [
+    {
+        "role": "user",
+        "content": "Hi! How are you doing today?"
+    },
+    {
+        "role": "assistant",
+        "content": "I'm doing well! How can I help you?"
+    },
+    # The question is split with adjacent string literals rather than a
+    # backslash continuation inside the string: a continuation would keep the
+    # next source line's indentation as spaces inside the prompt.
+    {
+        "role":
+        "user",
+        "content":
+        "Can you tell me what the current weather is in Dallas "
+        "and the forecast for the next 5 days, in fahrenheit?",
+    },
+]
+
+# First request: streamed. tool_choice="required" is the OpenAI API setting
+# under which the model must call at least one tool.
+chat_completion = client.chat.completions.create(
+    messages=messages,
+    model=model,
+    tools=tools,
+    tool_choice="required",
+    stream=True  # Enable streaming response
+)
+
+# Print the tool-call deltas of each chunk as it arrives; chunks that have no
+# choices or no tool-call delta are skipped.
+for chunk in chat_completion:
+    if chunk.choices and chunk.choices[0].delta.tool_calls:
+        print(chunk.choices[0].delta.tool_calls)
+
+# Second request: same inputs without streaming, so the tool calls are read
+# from the first choice's message in one piece.
+chat_completion = client.chat.completions.create(messages=messages,
+                                                 model=model,
+                                                 tools=tools,
+                                                 tool_choice="required")
+
+print(chat_completion.choices[0].message.tool_calls)
--- a/examples/template_florence2.jinja
+++ b/examples/template_florence2.jinja
@ -0,0 +1,7 @@
+{#- Emit the content of every user and assistant message in order; whitespace control strips everything around it, and messages with other roles are skipped. -#}
+{%- for message in messages -%}
+    {%- if message['role'] == 'user' -%}
+        {{- message['content'] -}}
+    {%- elif message['role'] == 'assistant' -%}
+        {{- message['content'] -}}
+    {%- endif -%}
+{%- endfor -%}
--- a/examples/tool_chat_template_llama3.2_pythonic.jinja
+++ b/examples/tool_chat_template_llama3.2_pythonic.jinja
@ -76,7 +76,7 @@
            {{- tool_call.name + '(' -}}
            {%- for param in tool_call.arguments %}
                {{- param + '=' -}}
-                {{- "%sr" | format(tool_call.arguments[param]) -}}
+                {{- "%s" | format(tool_call.arguments[param]) -}}
                {% if not loop.last %}, {% endif %}
            {%- endfor %}
            {{- ')' -}}
--- a/examples/tool_chat_template_phi4_mini.jinja
+++ b/examples/tool_chat_template_phi4_mini.jinja
@ -0,0 +1,60 @@
+{#-
+    Tool-calling chat template (Phi-4-mini, per the file name).
+    With `messages`: emits a <|system|> section when `system_message` or
+    `tools` is set, holding the system message, the fixed function-calling
+    instructions (emitted even when only `system_message` is set) and each
+    tool as indented JSON. Then one <|role|> ... <|end|> segment is emitted
+    per non-system message, followed by <|assistant|>.
+    A message with role "tools" has its content wrapped as a "result" JSON
+    object; a message without content but with tool_calls is rendered as
+    comma-separated name/arguments JSON objects.
+    Without `messages`: falls back to the `system_message` and `prompt`
+    variables. In both cases `response` is emitted last, followed by <|user|>
+    when it is set.
+    NOTE(review): the instruction text contains typos ("chose", "Do to
+    blindly copy", "if the type if number"). It is runtime prompt text and
+    presumably mirrors the wording the model expects; confirm before editing.
+-#}
+{%- if messages %}
+    {%- if system_message or tools %}
+<|system|>
+
+{%- if system_message %}
+{{ system_message }}
+{%- endif %}
+In addition to plain text responses, you can chose to call one or more of the provided functions.
+
+Use the following rule to decide when to call a function:
+  * if the response can be generated from your internal knowledge (e.g., as in the case of queries like "What is the capital of Poland?"), do so
+  * if you need external information that can be obtained by calling one or more of the provided functions, generate a function calls
+
+If you decide to call functions:
+  * prefix function calls with functools marker (no closing marker required)
+  * all function calls should be generated in a single JSON list formatted as functools[{"name": [function name], "arguments": [function arguments as JSON]}, ...]
+  * follow the provided JSON schema. Do not hallucinate arguments or values. Do to blindly copy values from the provided samples
+  * respect the argument type formatting. E.g., if the type if number and format is float, write value 7 as 7.0
+  * make sure you pick the right functions that match the user intent
+
+
+{%- if tools %}
+        {%- for t in tools %}
+            {{- t | tojson(indent=4) }}
+            {{- "\n\n" }}
+        {%- endfor %}
+{%- endif %}<|end|>
+    {%- endif %}
+
+    {%- for message in messages %}
+        {%- if message.role != "system" %}
+<|{{ message.role }}|>
+            {%- if message.content and message.role == "tools" %}
+{"result": {{ message.content }}}
+            {%- elif message.content %}
+{{ message.content }}
+            {%- elif message.tool_calls %}
+                {%- for call in message.tool_calls %}
+{"name": "{{ call.function.name }}", "arguments": {{ call.function.arguments }}}
+                    {%- if not loop.last %},{% endif %}
+                {%- endfor %}
+            {%- endif %}<|end|>
+        {%- endif %}
+    {%- endfor %}<|assistant|>
+
+{%- else %}
+    {%- if system_message %}
+<|system|>
+
+{{ system_message }}<|end|>
+    {%- endif %}
+    {%- if prompt %}
+<|user|>
+
+{{ prompt }}<|end|>
+    {%- endif %}<|assistant|>
+
+{%- endif %}
+{{ response }}
+{%- if response %}<|user|>{% endif %}
--- a/examples/tool_chat_template_toolace.jinja
+++ b/examples/tool_chat_template_toolace.jinja
@ -44,7 +44,7 @@
            {{- tool_call.name + '(' -}}
            {%- for param in tool_call.arguments %}
                {{- param + '=' -}}
-                {{- "%sr" | format(tool_call.arguments[param]) -}}
+                {{- "%s" | format(tool_call.arguments[param]) -}}
                {% if not loop.last %}, {% endif %}
            {%- endfor %}
            {{- ')' -}}
--- a/format.sh
+++ b/format.sh
@ -1,6 +1,6 @@
 #!/bin/bash

-echo "vLLM linting system has been moved from format.sh to pre-commit hook."
+echo "vLLM linting system has been moved from format.sh to pre-commit hooks."
 echo "Please run 'pip install -r requirements/lint.txt', followed by"
-echo "'pre-commit install --hook-type pre-commit --hook-type commit-msg' to install the pre-commit hook."
-echo "Then linters will run automatically before each commit."
+echo "'pre-commit install' to install the pre-commit hooks."
+echo "Then linters will run automatically before each commit."
--- a/pyproject.toml
+++ b/pyproject.toml
@ -30,7 +30,7 @@ classifiers = [
    "Topic :: Scientific/Engineering :: Artificial Intelligence",
    "Topic :: Scientific/Engineering :: Information Analysis",
 ]
-requires-python = ">=3.9"
+requires-python = ">=3.9,<3.13"
 dynamic = [ "version", "dependencies", "optional-dependencies"]

 [project.urls]
--- a/python_only_dev.py
+++ b/python_only_dev.py
@ -1,16 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-
-msg = """Old style python only build (without compilation) is deprecated, please check https://docs.vllm.ai/en/latest/getting_started/installation.html#python-only-build-without-compilation for the new way to do python only build (without compilation).
-
-TL;DR:
-
-VLLM_USE_PRECOMPILED=1 pip install -e .
-
-or
-
-export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch
-export VLLM_PRECOMPILED_WHEEL_LOCATION=https://wheels.vllm.ai/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
-pip install -e .
-""" # noqa
-
-print(msg)
--- a/Show More
+++ b/Show More