updated

Signed-off-by: Robert Shaw <robshaw@redhat.com>
updated
2025-03-28 02:26:20 +00:00 · 2025-03-28 02:17:42 +00:00 · 2025-03-28 01:54:01 +00:00 · 2025-03-27 23:51:36 +00:00
458 changed files with 5645 additions and 21936 deletions
--- a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
@ -10,24 +10,15 @@ set -x
 set -o pipefail

 check_gpus() {
-  if command -v nvidia-smi; then
-    # check the number of GPUs and GPU type.
-    declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
-  elif command -v amd-smi; then
-    declare -g gpu_count=$(amd-smi list | grep 'GPU' | wc -l)
-  fi
-
+  # check the number of GPUs and GPU type.
+  declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
  if [[ $gpu_count -gt 0 ]]; then
    echo "GPU found."
  else
    echo "Need at least 1 GPU to run benchmarking."
    exit 1
  fi
-  if command -v nvidia-smi; then
-    declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
-  elif command -v amd-smi; then
-    declare -g gpu_type=$(amd-smi static -g 0 -a | grep 'MARKET_NAME' | awk '{print $2}')
-  fi
+  declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
  echo "GPU type is $gpu_type"
 }

@ -99,15 +90,9 @@ kill_gpu_processes() {


  # wait until GPU memory usage smaller than 1GB
-  if command -v nvidia-smi; then
-    while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
-      sleep 1
-    done
-  elif command -v amd-smi; then
-    while [ "$(amd-smi metric -g 0 | grep 'USED_VRAM' | awk '{print $2}')" -ge 1000 ]; do
-      sleep 1
-    done
-  fi
+  while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
+    sleep 1
+  done

  # remove vllm config file
  rm -rf ~/.config/vllm
--- a/.buildkite/nightly-benchmarks/tests/serving-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/serving-tests.json
@ -63,12 +63,10 @@
            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
            "disable_log_requests": "", 
            "tensor_parallel_size": 4,
-            "swap_space": 16,
-            "speculative_config": {
-                "model": "turboderp/Qwama-0.5B-Instruct",
-                "num_speculative_tokens": 4,
-                "draft_tensor_parallel_size": 1
-            }
+            "swap_space": 16, 
+            "speculative_model": "turboderp/Qwama-0.5B-Instruct",
+            "num_speculative_tokens": 4,
+            "speculative_draft_tensor_parallel_size": 1
        },
        "client_parameters": {
            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@ -3,10 +3,10 @@ steps:
    agents:
      queue: cpu_queue_postmerge
    commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag vllm-ci:build-image --target build --progress plain ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-      - "bash .buildkite/scripts/upload-wheels.sh"
+      - "bash .buildkite/upload-wheels.sh"
    env:
      DOCKER_BUILDKIT: "1"

@ -14,10 +14,10 @@ steps:
    agents:
      queue: cpu_queue_postmerge
    commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-      - "bash .buildkite/scripts/upload-wheels.sh"
+      - "bash .buildkite/upload-wheels.sh"
    env:
      DOCKER_BUILDKIT: "1"

@ -31,10 +31,10 @@ steps:
    agents:
      queue: cpu_queue_postmerge
    commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-      - "bash .buildkite/scripts/upload-wheels.sh"
+      - "bash .buildkite/upload-wheels.sh"
    env:
      DOCKER_BUILDKIT: "1"

@ -48,7 +48,7 @@ steps:
      queue: cpu_queue_postmerge
    commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain ."
      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"

  - label: "Build and publish TPU release image"
@ -57,7 +57,7 @@ steps:
    agents:
      queue: tpu_queue_postmerge
    commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f docker/Dockerfile.tpu ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f Dockerfile.tpu ."
      - "docker push vllm/vllm-tpu:nightly"
      - "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT"
    plugins:
@ -82,7 +82,7 @@ steps:
      queue: cpu_queue_postmerge
    commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain -f Dockerfile.cpu ."
      - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
    env:
      DOCKER_BUILDKIT: "1"
--- a/.buildkite/scripts/hardware_ci/run-amd-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh
@ -105,33 +105,19 @@ fi
 if [[ $commands == *" entrypoints/openai "* ]]; then
  commands=${commands//" entrypoints/openai "/" entrypoints/openai \
  --ignore=entrypoints/openai/test_audio.py \
+  --ignore=entrypoints/openai/test_chat.py \
  --ignore=entrypoints/openai/test_shutdown.py \
  --ignore=entrypoints/openai/test_completion.py \
  --ignore=entrypoints/openai/test_sleep.py \
  --ignore=entrypoints/openai/test_models.py \
-  --ignore=entrypoints/openai/test_lora_adapters.py \
-  --ignore=entrypoints/openai/test_return_tokens_as_ids.py \
-  --ignore=entrypoints/openai/test_root_path.py \
-  --ignore=entrypoints/openai/test_tokenization.py \
  --ignore=entrypoints/openai/test_prompt_validation.py "}
 fi

 #ignore certain Entrypoints/llm tests
-if [[ $commands == *" entrypoints/llm "* ]]; then
-  commands=${commands//" entrypoints/llm "/" entrypoints/llm \
-  --ignore=entrypoints/llm/test_chat.py \
-  --ignore=entrypoints/llm/test_accuracy.py \
-  --ignore=entrypoints/llm/test_init.py \
-  --ignore=entrypoints/llm/test_generate_multiple_loras.py \
-  --ignore=entrypoints/llm/test_prompt_validation.py "}
+if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then
+  commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "}
 fi

-#Obsolete currently
-##ignore certain Entrypoints/llm tests
-#if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then
-#  commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "}
-#fi
-
 # --ignore=entrypoints/openai/test_encoder_decoder.py \
 # --ignore=entrypoints/openai/test_embedding.py \
 # --ignore=entrypoints/openai/test_oot_registration.py
--- a/.buildkite/scripts/run-benchmarks.sh
+++ b/.buildkite/scripts/run-benchmarks.sh
@ -5,8 +5,8 @@
 set -ex
 set -o pipefail

-# cd 2 levels into the working directory
-cd "$(dirname "${BASH_SOURCE[0]}")/../.."
+# cd into the parent of the directory containing this file
+cd "$(dirname "${BASH_SOURCE[0]}")/.."

 (which wget && which curl) || (apt-get update && apt-get install -y wget curl)

--- a/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
@ -10,5 +10,5 @@ trap remove_docker_container EXIT
 remove_docker_container

 # Try building the docker image
-docker build -t cpu-test -f docker/Dockerfile.ppc64le .
+docker build -t cpu-test -f Dockerfile.ppc64le .

--- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
@ -8,19 +8,15 @@ set -ex
 CORE_RANGE=${CORE_RANGE:-48-95}
 NUMA_NODE=${NUMA_NODE:-1}

+# Try building the docker image
+numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build -t cpu-test-"$BUILDKITE_BUILD_NUMBER" -f Dockerfile.cpu .
+numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 -f Dockerfile.cpu .
+
 # Setup cleanup
-remove_docker_container() { 
-    set -e; 
-    docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true; 
-    docker image rm cpu-test-"$BUILDKITE_BUILD_NUMBER" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 || true; 
-}
+remove_docker_container() { set -e; docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true; }
 trap remove_docker_container EXIT
 remove_docker_container

-# Try building the docker image
-numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$BUILDKITE_BUILD_NUMBER" --target vllm-test -f docker/Dockerfile.cpu .
-numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
-
 # Run the image, setting --shm-size=4g for tensor parallel.
 docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE"  \
 --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"
@ -40,6 +36,8 @@ function cpu_tests() {
  # Run basic model test
  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
    set -e
+    pip install -r vllm/requirements/test.txt
+    pip install -r vllm/requirements/cpu.txt
    pytest -v -s tests/kernels/test_cache.py -m cpu_model
    pytest -v -s tests/kernels/test_mla_decode_cpu.py -m cpu_model
    pytest -v -s tests/models/decoder_only/language -m cpu_model
--- a/.buildkite/scripts/hardware_ci/run-gh200-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-gh200-test.sh
@ -9,7 +9,6 @@ python3 use_existing_torch.py

 # Try building the docker image
 DOCKER_BUILDKIT=1 docker build . \
-  --file docker/Dockerfile \
  --target vllm-openai \
  --platform "linux/arm64" \
  -t gh200-test \
--- a/.buildkite/scripts/hardware_ci/run-hpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-hpu-test.sh
@ -5,7 +5,7 @@
 set -ex

 # Try building the docker image
-docker build -t hpu-test-env -f docker/Dockerfile.hpu .
+docker build -t hpu-test-env -f Dockerfile.hpu .

 # Setup cleanup
 # certain versions of HPU software stack have a bug that can
--- a/.buildkite/scripts/run-multi-node-test.sh
+++ b/.buildkite/scripts/run-multi-node-test.sh
@ -3,7 +3,7 @@
 set -euox pipefail

 if [[ $# -lt 4 ]]; then
-    echo "Usage: .buildkite/scripts/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN"
+    echo "Usage: .buildkite/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN"
    exit 1
 fi

--- a/.buildkite/scripts/hardware_ci/run-neuron-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-neuron-test.sh
@ -35,7 +35,7 @@ else
    date "+%s" > /tmp/neuron-docker-build-timestamp
 fi

-docker build -t "${image_name}" -f docker/Dockerfile.neuron .
+docker build -t "${image_name}" -f Dockerfile.neuron .

 # Setup cleanup
 remove_docker_container() {
--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
@ -1,9 +1,9 @@
 #!/bin/bash

-set -xue
+set -e

 # Build the docker image.
-docker build -f docker/Dockerfile.tpu -t vllm-tpu .
+docker build -f Dockerfile.tpu -t vllm-tpu .

 # Set up cleanup.
 remove_docker_container() { docker rm -f tpu-test || true; }
@ -21,8 +21,6 @@ docker run --privileged --net host --shm-size=16G -it \
    && python3 -m pip install lm_eval[api]==0.4.4 \
    && export VLLM_USE_V1=1 \
    && export VLLM_XLA_CHECK_RECOMPILATION=1 \
-    && echo TEST_0 \
-    && pytest -v -s /workspace/vllm/tests/v1/tpu/test_perf.py \
    && echo TEST_1 \
    && pytest -v -s /workspace/vllm/tests/tpu/test_compilation.py \
    && echo TEST_2 \
@ -34,14 +32,11 @@ docker run --privileged --net host --shm-size=16G -it \
    && echo TEST_5 \
    && python3 /workspace/vllm/examples/offline_inference/tpu.py \
    && echo TEST_6 \
-    && pytest -s -v /workspace/vllm/tests/v1/tpu/worker/test_tpu_model_runner.py \
+    && pytest -s -v /workspace/vllm/tests/tpu/worker/test_tpu_model_runner.py \
    && echo TEST_7 \
-    && pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py \
-    && echo TEST_8 \
-    && pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py \
-    && echo TEST_9 \
-    && pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py" \
+    && pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py" \


 # TODO: This test fails because it uses RANDOM_SEED sampling
 # && VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
+
--- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
@ -8,7 +8,7 @@ image_name="xpu/vllm-ci:${BUILDKITE_COMMIT}"
 container_name="xpu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"

 # Try building the docker image
-docker build -t ${image_name} -f docker/Dockerfile.xpu .
+docker build -t ${image_name} -f Dockerfile.xpu .

 # Setup cleanup
 remove_docker_container() { 
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@ -104,7 +104,7 @@ steps:
 - label: Entrypoints Test # 40min
  working_dir: "/vllm-workspace/tests"
  fast_check: true
-  #mirror_hardwares: [amd]
+  mirror_hardwares: [amd]
  source_file_dependencies:
  - vllm/
  - tests/entrypoints/llm
@ -150,12 +150,11 @@ steps:
  # TODO: create a dedicated test section for multi-GPU example tests
  # when we have multiple distributed example tests
  - pushd ../examples/offline_inference
-  - python3 rlhf.py
-  - RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
+  - VLLM_ENABLE_V1_MULTIPROCESSING=0 python3 rlhf.py
+  - VLLM_ENABLE_V1_MULTIPROCESSING=0 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
  - popd

 - label: Metrics, Tracing Test # 10min
-  mirror_hardwares: [amd]
  num_gpus: 2
  source_file_dependencies:
  - vllm/
@ -174,7 +173,7 @@ steps:
 #####  1 GPU test  #####

 - label: Regression Test # 5min
-  #mirror_hardwares: [amd]
+  mirror_hardwares: [amd]
  source_file_dependencies:
  - vllm/
  - tests/test_regression
@ -205,6 +204,7 @@ steps:
  commands:
    # split the test to avoid interference
    - pytest -v -s v1/core
+    - pytest -v -s v1/entrypoints
    - pytest -v -s v1/engine
    - pytest -v -s v1/entrypoints
    - pytest -v -s v1/sample
@ -285,11 +285,11 @@ steps:
    - pytest -v -s spec_decode/e2e/test_eagle_correctness.py

 - label: LoRA Test %N # 15min each
-  #mirror_hardwares: [amd]
+  mirror_hardwares: [amd]
  source_file_dependencies:
  - vllm/lora
  - tests/lora
-  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py
+  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_minicpmv_tp.py  --ignore=lora/test_transfomers_model.py
  parallelism: 4

 - label: PyTorch Fullgraph Smoke Test # 9min
@ -311,7 +311,7 @@ steps:
  - pytest -v -s compile/test_full_graph.py

 - label: Kernels Test %N # 1h each
-  # mirror_hardwares: [amd]
+  mirror_hardwares: [amd]
  source_file_dependencies:
  - csrc/
  - vllm/attention
@ -321,7 +321,7 @@ steps:
  parallelism: 4

 - label: Tensorizer Test # 11min
-  # mirror_hardwares: [amd]
+  mirror_hardwares: [amd]
  soft_fail: true
  source_file_dependencies:
  - vllm/model_executor/model_loader
@ -337,7 +337,7 @@ steps:
  source_file_dependencies:
  - benchmarks/
  commands:
-  - bash scripts/run-benchmarks.sh
+  - bash run-benchmarks.sh

 - label: Quantization Test # 33min
  source_file_dependencies:
@ -372,7 +372,7 @@ steps:

 - label: OpenAI-Compatible Tool Use # 20 min
  fast_check: false
-  #mirror_hardwares: [ amd ]
+  mirror_hardwares: [ amd ]
  source_file_dependencies:
    - vllm/
    - tests/tool_use
@ -389,8 +389,7 @@ steps:
    - pytest -v -s models/test_transformers.py
    - pytest -v -s models/test_registry.py
    # V1 Test: https://github.com/vllm-project/vllm/issues/14531
-    - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4'
-    - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'llama4'
+    - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py

 - label: Language Models Test (Standard) # 32min
  #mirror_hardwares: [amd]
@ -432,7 +431,6 @@ steps:
    - pytest -v -s models/encoder_decoder/audio_language -m core_model
    - pytest -v -s models/encoder_decoder/language -m core_model
    - pytest -v -s models/encoder_decoder/vision_language -m core_model
-    - pytest -v -s models/decoder_only/vision_language/test_interleaved.py

 - label: Multi-Modal Models Test (Extended) 1 # 48m
  optional: true
@ -465,7 +463,6 @@ steps:

 # This test is used only in PR development phase to test individual models and should never run on main
 - label: Custom Models Test
-  mirror_hardwares: [amd]
  optional: true
  commands:
    - echo 'Testing custom models...'
@ -477,7 +474,6 @@ steps:
 #####  multi gpus test  #####

 - label: Distributed Comm Ops Test # 7min
-  mirror_hardwares: [amd]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  source_file_dependencies:
@ -524,7 +520,7 @@ steps:
  - vllm/v1/engine/
  commands:
  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
-  - pytest -v -s entrypoints/llm/test_collective_rpc.py
+  - VLLM_ENABLE_V1_MULTIPROCESSING=0 pytest -v -s entrypoints/llm/test_collective_rpc.py
  - pytest -v -s ./compile/test_basic_correctness.py
  - pytest -v -s ./compile/test_wrapper.py
  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
@ -605,6 +601,8 @@ steps:
    # requires multi-GPU testing for validation.
    - pytest -v -s -x lora/test_chatglm3_tp.py
    - pytest -v -s -x lora/test_llama_tp.py
+    - pytest -v -s -x lora/test_minicpmv_tp.py
+    - pytest -v -s -x lora/test_transfomers_model.py


 - label: Weight Loading Multiple GPU Test  # 33min
--- a/.buildkite/scripts/upload-wheels.sh
+++ b/.buildkite/scripts/upload-wheels.sh
--- a/.github/mergify.yml
+++ b/.github/mergify.yml
@ -19,7 +19,7 @@ pull_request_rules:
      - files~=\.buildkite/
      - files~=^cmake/
      - files=CMakeLists.txt
-      - files~=^docker/Dockerfile
+      - files~=^Dockerfile
      - files~=^requirements.*\.txt
      - files=setup.py
  actions:
--- a/.github/workflows/lint-and-deploy.yaml
+++ b/.github/workflows/lint-and-deploy.yaml
@ -50,7 +50,7 @@ jobs:
        uses: helm/kind-action@a1b0e391336a6ee6713a0583f8c6240d70863de3 # v1.12.0

      - name: Build the Docker image vllm cpu
-        run: docker buildx build -f docker/Dockerfile.cpu -t vllm-cpu-env .
+        run: docker buildx build -f Dockerfile.cpu -t vllm-cpu-env .

      - name: Configuration of docker images, network and namespace for the kind cluster
        run: |
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -1,6 +1,3 @@
-default_install_hook_types:
-  - pre-commit
-  - commit-msg
 default_stages:
  - pre-commit # Run locally
  - manual # Run in CI
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -34,7 +34,7 @@ set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12")
 set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0")

 # Supported AMD GPU architectures.
-set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201")
+set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101")

 #
 # Supported/expected torch versions for CUDA/ROCm.
@ -44,7 +44,7 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1
 #
 # Note: the CUDA torch version is derived from pyproject.toml and various
 # requirements.txt files and should be kept consistent.  The ROCm torch
-# versions are derived from docker/Dockerfile.rocm
+# versions are derived from Dockerfile.rocm
 #
 set(TORCH_SUPPORTED_VERSION_CUDA "2.6.0")
 set(TORCH_SUPPORTED_VERSION_ROCM "2.6.0")
@ -234,7 +234,6 @@ set(VLLM_EXT_SRC
  "csrc/activation_kernels.cu"
  "csrc/layernorm_kernels.cu"
  "csrc/layernorm_quant_kernels.cu"
-  "csrc/cuda_view.cu"
  "csrc/quantization/gptq/q_gemm.cu"
  "csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
  "csrc/quantization/fp8/common.cu"
@ -242,7 +241,6 @@ set(VLLM_EXT_SRC
  "csrc/quantization/gguf/gguf_kernel.cu"
  "csrc/cuda_utils_kernels.cu"
  "csrc/prepare_inputs/advance_step.cu"
-  "csrc/custom_all_reduce.cu"
  "csrc/torch_bindings.cpp")

 if(VLLM_GPU_LANG STREQUAL "CUDA")
@ -284,6 +282,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    "csrc/mamba/causal_conv1d/causal_conv1d.cu"
    "csrc/quantization/aqlm/gemm_kernels.cu"
    "csrc/quantization/awq/gemm_kernels.cu"
+    "csrc/custom_all_reduce.cu"
    "csrc/permute_cols.cu"
    "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
    "csrc/quantization/fp4/nvfp4_quant_entry.cu"
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
--- a/docker/Dockerfile.arm
+++ b/docker/Dockerfile.arm
--- a/Dockerfile.cpu
+++ b/Dockerfile.cpu
@ -0,0 +1,69 @@
+# This vLLM Dockerfile is used to construct an image that can build and run vLLM on the x86 CPU platform.
+
+FROM ubuntu:22.04 AS cpu-test-1
+
+ENV CCACHE_DIR=/root/.cache/ccache
+
+ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache
+
+RUN --mount=type=cache,target=/var/cache/apt \
+    apt-get update -y \
+    && apt-get install -y curl ccache git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \
+    && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
+    && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
+
+# https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html
+# intel-openmp provides additional performance improvement vs. openmp
+# tcmalloc provides better memory allocation efficiency, e.g., holding memory in caches to speed up access to commonly used objects.
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install intel-openmp==2025.0.1
+
+ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/lib/libiomp5.so"
+
+RUN echo 'ulimit -c 0' >> ~/.bashrc
+
+RUN pip install intel_extension_for_pytorch==2.6.0
+
+WORKDIR /workspace
+
+ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
+ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
+RUN --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=bind,src=requirements/build.txt,target=requirements/build.txt \
+    pip install --upgrade pip && \
+    pip install -r requirements/build.txt
+
+FROM cpu-test-1 AS build
+
+WORKDIR /workspace/vllm
+
+RUN --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=bind,src=requirements/common.txt,target=requirements/common.txt \
+    --mount=type=bind,src=requirements/cpu.txt,target=requirements/cpu.txt \
+    pip install -v -r requirements/cpu.txt
+
+COPY . .
+ARG GIT_REPO_CHECK=0
+RUN --mount=type=bind,source=.git,target=.git \
+    if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
+
+# Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ...
+ARG VLLM_CPU_DISABLE_AVX512
+ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
+
+RUN --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=cache,target=/root/.cache/ccache \
+    --mount=type=bind,source=.git,target=.git \
+    VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \
+    pip install dist/*.whl && \
+    rm -rf dist
+
+WORKDIR /workspace/
+
+RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
+
+# install development dependencies (for testing)
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install -e tests/vllm_test_utils
+
+ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
--- a/docker/Dockerfile.hpu
+++ b/docker/Dockerfile.hpu
--- a/docker/Dockerfile.neuron
+++ b/docker/Dockerfile.neuron
--- a/docker/Dockerfile.ppc64le
+++ b/docker/Dockerfile.ppc64le
@ -38,7 +38,7 @@ RUN microdnf install -y openssl-devel dnf \
    && ln -sf /usr/lib64/libatomic.so.1 /usr/lib64/libatomic.so \
    && python${PYTHON_VERSION} -m venv ${VIRTUAL_ENV} \
    && python -m pip install -U pip uv \
-    && uv pip install wheel build "setuptools<70" setuptools_scm setuptools_rust meson-python 'cmake<4' ninja cython scikit_build_core scikit_build \
+    && uv pip install wheel build "setuptools<70" setuptools_scm setuptools_rust meson-python cmake ninja cython scikit_build_core scikit_build \
    && curl -sL https://ftp2.osuosl.org/pub/ppc64el/openblas/latest/Openblas_${OPENBLAS_VERSION}_ppc64le.tar.gz | tar xvf - -C /usr/local \
    && curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y \
    && cd /tmp && touch control
@ -238,7 +238,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
    && python -m pip install -U pip uv --no-cache \
    && curl -sL https://ftp2.osuosl.org/pub/ppc64el/openblas/latest/Openblas_${OPENBLAS_VERSION}_ppc64le.tar.gz | tar xvf - -C /usr/local \
    && make -C /numactl install \
-    && uv pip install 'cmake<4' \
+    && uv pip install cmake \
    && cmake --install /lapack/build \
    && uv pip uninstall cmake

--- a/docker/Dockerfile.rocm
+++ b/docker/Dockerfile.rocm
--- a/docker/Dockerfile.rocm_base
+++ b/docker/Dockerfile.rocm_base
@ -1,18 +1,18 @@
 ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:6.3.1-complete
-ARG HIPBLASLT_BRANCH="db8e93b4"
+ARG HIPBLASLT_BRANCH="4d40e36"
 ARG HIPBLAS_COMMON_BRANCH="7c1566b"
 ARG LEGACY_HIPBLASLT_OPTION=
 ARG RCCL_BRANCH="648a58d"
 ARG RCCL_REPO="https://github.com/ROCm/rccl"
 ARG TRITON_BRANCH="e5be006"
 ARG TRITON_REPO="https://github.com/triton-lang/triton.git"
-ARG PYTORCH_BRANCH="295f2ed4"
-ARG PYTORCH_VISION_BRANCH="v0.21.0"
+ARG PYTORCH_BRANCH="3a585126"
+ARG PYTORCH_VISION_BRANCH="v0.19.1"
 ARG PYTORCH_REPO="https://github.com/pytorch/pytorch.git"
 ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
-ARG FA_BRANCH="1a7f4dfa"
-ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git"
-ARG AITER_BRANCH="8970b25b"
+ARG FA_BRANCH="b7d29fb"
+ARG FA_REPO="https://github.com/ROCm/flash-attention.git"
+ARG AITER_BRANCH="21d47a9"
 ARG AITER_REPO="https://github.com/ROCm/aiter.git"

 FROM ${BASE_IMAGE} AS base
@ -20,7 +20,7 @@ FROM ${BASE_IMAGE} AS base
 ENV PATH=/opt/rocm/llvm/bin:$PATH
 ENV ROCM_PATH=/opt/rocm
 ENV LD_LIBRARY_PATH=/opt/rocm/lib:/usr/local/lib:
-ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942;gfx1100;gfx1101;gfx1200;gfx1201
+ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942
 ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}

 ARG PYTHON_VERSION=3.12
@ -31,7 +31,7 @@ ENV DEBIAN_FRONTEND=noninteractive

 # Install Python and other dependencies
 RUN apt-get update -y \
-    && apt-get install -y software-properties-common git curl sudo vim less libgfortran5 \
+    && apt-get install -y software-properties-common git curl sudo vim less \
    && add-apt-repository ppa:deadsnakes/ppa \
    && apt-get update -y \
    && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
@ -42,7 +42,7 @@ RUN apt-get update -y \
    && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
    && python3 --version && python3 -m pip --version

-RUN pip install -U packaging 'cmake<4' ninja wheel setuptools pybind11 Cython
+RUN pip install -U packaging cmake ninja wheel setuptools pybind11 Cython

 FROM base AS build_hipblaslt
 ARG HIPBLASLT_BRANCH
@ -60,8 +60,7 @@ RUN cd hipBLAS-common \
 RUN git clone https://github.com/ROCm/hipBLASLt
 RUN cd hipBLASLt \
    && git checkout ${HIPBLASLT_BRANCH} \
-    && apt-get install -y llvm-dev \
-    && ./install.sh -dc --architecture ${PYTORCH_ROCM_ARCH} ${LEGACY_HIPBLASLT_OPTION} \
+    && ./install.sh -d --architecture ${PYTORCH_ROCM_ARCH} ${LEGACY_HIPBLASLT_OPTION} \
    && cd build/release \
    && make package
 RUN mkdir -p /app/install && cp /app/hipBLASLt/build/release/*.deb /app/hipBLAS-common/build/*.deb /app/install
@ -111,24 +110,11 @@ RUN git clone ${FA_REPO}
 RUN cd flash-attention \
    && git checkout ${FA_BRANCH} \
    && git submodule update --init \
-    && GPU_ARCHS=$(echo ${PYTORCH_ROCM_ARCH} | sed -e 's/;gfx1[0-9]\{3\}//g') python3 setup.py bdist_wheel --dist-dir=dist
+    && MAX_JOBS=64 GPU_ARCHS=${PYTORCH_ROCM_ARCH} python3 setup.py bdist_wheel --dist-dir=dist
 RUN mkdir -p /app/install && cp /app/pytorch/dist/*.whl /app/install \
    && cp /app/vision/dist/*.whl /app/install \
    && cp /app/flash-attention/dist/*.whl /app/install

-FROM base AS build_aiter
-ARG AITER_BRANCH
-ARG AITER_REPO
-RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
-    pip install /install/*.whl
-RUN git clone --recursive ${AITER_REPO}
-RUN cd aiter \
-    && git checkout ${AITER_BRANCH} \
-    && git submodule update --init --recursive \
-    && pip install -r requirements.txt
-RUN pip install pyyaml && cd aiter && PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py bdist_wheel --dist-dir=dist && ls /app/aiter/dist/*.whl
-RUN mkdir -p /app/install && cp /app/aiter/dist/*.whl /app/install
-
 FROM base AS final
 RUN --mount=type=bind,from=build_hipblaslt,src=/app/install/,target=/install \
    dpkg -i /install/*deb \
@ -144,12 +130,19 @@ RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \
    pip install /install/*.whl
 RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
    pip install /install/*.whl
-RUN --mount=type=bind,from=build_aiter,src=/app/install/,target=/install \
-    pip install /install/*.whl
+
+ARG AITER_REPO
+ARG AITER_BRANCH
+RUN git clone --recursive ${AITER_REPO}
+RUN cd aiter \
+    && git checkout ${AITER_BRANCH} \
+    && git submodule update --init --recursive \
+    && pip install -r requirements.txt \
+    && PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py develop && pip show aiter 

 ARG BASE_IMAGE
-ARG HIPBLAS_COMMON_BRANCH
 ARG HIPBLASLT_BRANCH
+ARG HIPBLAS_COMMON_BRANCH
 ARG LEGACY_HIPBLASLT_OPTION
 ARG RCCL_BRANCH
 ARG RCCL_REPO
@ -161,8 +154,6 @@ ARG PYTORCH_REPO
 ARG PYTORCH_VISION_REPO
 ARG FA_BRANCH
 ARG FA_REPO
-ARG AITER_BRANCH
-ARG AITER_REPO
 RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
    && echo "HIPBLAS_COMMON_BRANCH: ${HIPBLAS_COMMON_BRANCH}" >> /app/versions.txt \
    && echo "HIPBLASLT_BRANCH: ${HIPBLASLT_BRANCH}" >> /app/versions.txt \
@ -176,5 +167,6 @@ RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
    && echo "PYTORCH_REPO: ${PYTORCH_REPO}" >> /app/versions.txt \
    && echo "PYTORCH_VISION_REPO: ${PYTORCH_VISION_REPO}" >> /app/versions.txt \
    && echo "FA_BRANCH: ${FA_BRANCH}" >> /app/versions.txt \
+    && echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt \
    && echo "AITER_BRANCH: ${AITER_BRANCH}" >> /app/versions.txt \
    && echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt
--- a/docker/Dockerfile.s390x
+++ b/docker/Dockerfile.s390x
--- a/docker/Dockerfile.tpu
+++ b/docker/Dockerfile.tpu
--- a/docker/Dockerfile.xpu
+++ b/docker/Dockerfile.xpu
--- a/README.md
+++ b/README.md
@ -15,12 +15,14 @@ Easy, fast, and cheap LLM serving for everyone

 ---

+[2025/03] We are collaborating with Ollama to host an [Inference Night](https://lu.ma/vllm-ollama) at Y Combinator in San Francisco on Thursday, March 27, at 6 PM. Discuss all things inference, local or data center!
+
 [2025/04] We're hosting our first-ever *vLLM Asia Developer Day* in Singapore on *April 3rd*! This is a full-day event (9 AM - 9 PM SGT) in partnership with SGInnovate, AMD, and Embedded LLM. Meet the vLLM team and learn about LLM inference for RL, MI300X, and more! [Register Now](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)

 ---

 *Latest News* 🔥
- [2025/03] We hosted [vLLM x Ollama Inference Night](https://lu.ma/vllm-ollama)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/16T2PDD1YwRnZ4Tu8Q5r6n53c5Lr5c73UV9Vd2_eBo4U/edit?usp=sharing).
+
 - [2025/03] We hosted [the first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit?usp=sharing).
 - [2025/03] We hosted [the East Coast vLLM Meetup](https://lu.ma/7mu4k4xx)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0).
 - [2025/02] We hosted [the ninth vLLM meetup](https://lu.ma/h7g3kuj9) with Meta! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing) and AMD [here](https://drive.google.com/file/d/1Zk5qEJIkTmlQ2eQcXQZlljAx3m9s7nwn/view?usp=sharing). The slides from Meta will not be posted.
@ -101,7 +103,7 @@ Visit our [documentation](https://docs.vllm.ai/en/latest/) to learn more.
 ## Contributing

 We welcome and value any contributions and collaborations.
-Please check out [Contributing to vLLM](https://docs.vllm.ai/en/stable/contributing/overview.html) for how to get involved.
+Please check out [CONTRIBUTING.md](./CONTRIBUTING.md) for how to get involved.

 ## Sponsors

@ -124,7 +126,6 @@ Compute Resources:
 - Databricks
 - DeepInfra
 - Google Cloud
- Intel
 - Lambda Lab
 - Nebius
 - Novita AI
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@ -41,39 +41,29 @@ become available.
      <td><code>synthetic</code></td>
    </tr>
    <tr>
-      <td><strong>HuggingFace-VisionArena</strong></td>
-      <td style="text-align: center;">✅</td>
-      <td style="text-align: center;">✅</td>
-      <td><code>lmarena-ai/VisionArena-Chat</code></td>
+      <td><strong>HuggingFace</strong></td>
+      <td style="text-align: center;">🟡</td>
+      <td style="text-align: center;">🟡</td>
+      <td>Specify your dataset path on HuggingFace</td>
    </tr>
    <tr>
-      <td><strong>HuggingFace-InstructCoder</strong></td>
+      <td><strong>VisionArena</strong></td>
      <td style="text-align: center;">✅</td>
      <td style="text-align: center;">✅</td>
-      <td><code>likaixin/InstructCoder</code></td>
-    </tr>
-      <tr>
-      <td><strong>HuggingFace-AIMO</strong></td>
-      <td style="text-align: center;">✅</td>
-      <td style="text-align: center;">✅</td>
-      <td><code>AI-MO/aimo-validation-aime</code> , <code>AI-MO/NuminaMath-1.5</code>, <code>AI-MO/NuminaMath-CoT</code></td>
-    </tr>
-    <tr>
-      <td><strong>HuggingFace-Other</strong></td>
-      <td style="text-align: center;">✅</td>
-      <td style="text-align: center;">✅</td>
-      <td><code>lmms-lab/LLaVA-OneVision-Data</code>, <code>Aeala/ShareGPT_Vicuna_unfiltered</code></td>
+      <td><code>lmarena-ai/vision-arena-bench-v0.1</code> (a HuggingFace dataset)</td>
    </tr>
  </tbody>
 </table>

 ✅: supported

-🟡: Partial support
-
 🚧: to be supported

-**Note**: HuggingFace dataset's `dataset-name` should be set to `hf`
+🟡: Partial support. Currently, HuggingFaceDataset only supports dataset formats
+similar to `lmms-lab/LLaVA-OneVision-Data` and `Aeala/ShareGPT_Vicuna_unfiltered`.
+If you need support for other dataset formats, please consider contributing.
+
+**Note**: VisionArena’s `dataset-name` should be set to `hf`

 ---
 ## Example - Online Benchmark
@ -81,7 +71,8 @@ become available.
 First start serving your model

 ```bash
-vllm serve NousResearch/Hermes-3-Llama-3.1-8B --disable-log-requests
+MODEL_NAME="NousResearch/Hermes-3-Llama-3.1-8B"
+vllm serve ${MODEL_NAME} --disable-log-requests
 ```

 Then run the benchmarking script
@ -89,13 +80,12 @@ Then run the benchmarking script
 ```bash
 # download dataset
 # wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
-python3 vllm/benchmarks/benchmark_serving.py \
-  --backend vllm \
-  --model NousResearch/Hermes-3-Llama-3.1-8B \
-  --endpoint /v1/completions \
-  --dataset-name sharegpt \
-  --dataset-path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
-  --num-prompts 10
+MODEL_NAME="NousResearch/Hermes-3-Llama-3.1-8B"
+NUM_PROMPTS=10
+BACKEND="vllm"
+DATASET_NAME="sharegpt"
+DATASET_PATH="<your data path>/ShareGPT_V3_unfiltered_cleaned_split.json"
+python3 vllm/benchmarks/benchmark_serving.py --backend "${BACKEND}" --model "${MODEL_NAME}" --endpoint /v1/completions --dataset-name "${DATASET_NAME}" --dataset-path "${DATASET_PATH}" --num-prompts "${NUM_PROMPTS}"
 ```

 If successful, you will see the following output
@ -132,105 +122,88 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
 ```

 ```bash
+MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct"
+NUM_PROMPTS=10
+BACKEND="openai-chat"
+DATASET_NAME="hf"
+DATASET_PATH="lmarena-ai/vision-arena-bench-v0.1"
+DATASET_SPLIT='train'
+
 python3 vllm/benchmarks/benchmark_serving.py \
-  --backend openai-chat \
-  --model Qwen/Qwen2-VL-7B-Instruct \
-  --endpoint /v1/chat/completions \
-  --dataset-name hf \
-  --dataset-path lmarena-ai/VisionArena-Chat \
-  --hf-split train \
-  --num-prompts 1000
+  --backend "${BACKEND}" \
+  --model "${MODEL_NAME}" \
+  --endpoint "/v1/chat/completions" \
+  --dataset-name "${DATASET_NAME}" \
+  --dataset-path "${DATASET_PATH}" \
+  --hf-split "${DATASET_SPLIT}" \
+  --num-prompts "${NUM_PROMPTS}"
 ```

-### InstructCoder Benchmark with Speculative Decoding
+### HuggingFaceDataset Examples

-``` bash
-VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
-    --speculative-model "[ngram]" \
-    --ngram_prompt_lookup_min 2 \
-    --ngram-prompt-lookup-max 5 \
-    --num_speculative_tokens 5
-```
-
-``` bash
-python3 benchmarks/benchmark_serving.py \
-    --model meta-llama/Meta-Llama-3-8B-Instruct \
-    --dataset-name hf \
-    --dataset-path likaixin/InstructCoder \
-    --num-prompts 2048
-```
-
-### Other HuggingFaceDataset Examples
+Currently, HuggingFaceDataset only supports dataset formats
+similar to `lmms-lab/LLaVA-OneVision-Data` and `Aeala/ShareGPT_Vicuna_unfiltered`. If you need support for other dataset
+formats, please consider contributing.

 ```bash
+# need a model with vision capability here
 vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
 ```

 **`lmms-lab/LLaVA-OneVision-Data`**

 ```bash
+MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct"
+NUM_PROMPTS=10
+BACKEND="openai-chat"
+DATASET_NAME="hf"
+DATASET_PATH="lmms-lab/LLaVA-OneVision-Data"
+DATASET_SPLIT='train'
+DATASET_SUBSET='chart2text(cauldron)'
 python3 vllm/benchmarks/benchmark_serving.py \
-  --backend openai-chat \
-  --model Qwen/Qwen2-VL-7B-Instruct \
-  --endpoint /v1/chat/completions \
-  --dataset-name hf \
-  --dataset-path lmms-lab/LLaVA-OneVision-Data \
-  --hf-split train \
-  --hf-subset "chart2text(cauldron)" \
-  --num-prompts 10
+  --backend "${BACKEND}" \
+  --model "${MODEL_NAME}" \
+  --endpoint "/v1/chat/completions" \
+  --dataset-name "${DATASET_NAME}" \
+  --dataset-path "${DATASET_PATH}" \
+  --hf-split "${DATASET_SPLIT}" \
+  --num-prompts "${NUM_PROMPTS}" \
+  --hf-subset "${DATASET_SUBSET}"
 ```

 **`Aeala/ShareGPT_Vicuna_unfiltered`**

 ```bash
+MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct"
+NUM_PROMPTS=10
+BACKEND="openai-chat"
+DATASET_NAME="hf"
+DATASET_PATH="Aeala/ShareGPT_Vicuna_unfiltered"
+DATASET_SPLIT='train'
 python3 vllm/benchmarks/benchmark_serving.py \
-  --backend openai-chat \
-  --model Qwen/Qwen2-VL-7B-Instruct \
-  --endpoint /v1/chat/completions \
-  --dataset-name hf \
-  --dataset-path Aeala/ShareGPT_Vicuna_unfiltered \
-  --hf-split train \
-  --num-prompts 10
-```
-
-**`AI-MO/aimo-validation-aime`**
-
-``` bash
-python3 vllm/benchmarks/benchmark_serving.py \
-    --model Qwen/QwQ-32B \
-    --dataset-name hf \
-    --dataset-path AI-MO/aimo-validation-aime \
-    --num-prompts 10 \
-    --seed 42
-```
-
-### Running With Sampling Parameters
-
-When using OpenAI-compatible backends such as `vllm`, optional sampling
-parameters can be specified. Example client command:
-
-```bash
-python3 vllm/benchmarks/benchmark_serving.py \
-  --backend vllm \
-  --model NousResearch/Hermes-3-Llama-3.1-8B \
-  --endpoint /v1/completions \
-  --dataset-name sharegpt \
-  --dataset-path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
-  --top-k 10 \
-  --top-p 0.9 \
-  --temperature 0.5 \
-  --num-prompts 10
+  --backend "${BACKEND}" \
+  --model "${MODEL_NAME}" \
+  --endpoint "/v1/chat/completions" \
+  --dataset-name "${DATASET_NAME}" \
+  --dataset-path "${DATASET_PATH}" \
+  --hf-split "${DATASET_SPLIT}" \
+  --num-prompts "${NUM_PROMPTS}"
 ```

 ---
 ## Example - Offline Throughput Benchmark

 ```bash
+MODEL_NAME="NousResearch/Hermes-3-Llama-3.1-8B"
+NUM_PROMPTS=10
+DATASET_NAME="sonnet"
+DATASET_PATH="vllm/benchmarks/sonnet.txt"
+
 python3 vllm/benchmarks/benchmark_throughput.py \
-  --model NousResearch/Hermes-3-Llama-3.1-8B \
-  --dataset-name sonnet \
-  --dataset-path vllm/benchmarks/sonnet.txt \
-  --num-prompts 10
+  --model "${MODEL_NAME}" \
+  --dataset-name "${DATASET_NAME}" \
+  --dataset-path "${DATASET_PATH}" \
+  --num-prompts "${NUM_PROMPTS}"
 ```

 If successful, you will see the following output
@ -244,13 +217,19 @@ Total num output tokens:  1500
 ### VisionArena Benchmark for Vision Language Models

 ``` bash
+MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct"
+NUM_PROMPTS=10
+DATASET_NAME="hf"
+DATASET_PATH="lmarena-ai/vision-arena-bench-v0.1"
+DATASET_SPLIT="train"
+
 python3 vllm/benchmarks/benchmark_throughput.py \
-  --model Qwen/Qwen2-VL-7B-Instruct \
-  --backend vllm-chat \
-  --dataset-name hf \
-  --dataset-path lmarena-ai/VisionArena-Chat \
-  --num-prompts 1000 \
-  --hf-split train
+  --model "${MODEL_NAME}" \
+  --backend "vllm-chat" \
+  --dataset-name "${DATASET_NAME}" \
+  --dataset-path "${DATASET_PATH}" \
+  --num-prompts "${NUM_PROMPTS}" \
+  --hf-split "${DATASET_SPLIT}"
 ```

 The `num prompt tokens` now includes image token counts
@ -261,83 +240,29 @@ Total num prompt tokens:  14527
 Total num output tokens:  1280
 ```

-### InstructCoder Benchmark with Speculative Decoding
-
-``` bash
-VLLM_WORKER_MULTIPROC_METHOD=spawn \
-VLLM_USE_V1=1 \
-python3 vllm/benchmarks/benchmark_throughput.py \
-    --dataset-name=hf \
-    --dataset-path=likaixin/InstructCoder \
-    --model=meta-llama/Meta-Llama-3-8B-Instruct \
-    --input-len=1000 \
-    --output-len=100 \
-    --num-prompts=2048 \
-    --async-engine \
-    --speculative-model="[ngram]" \
-    --ngram_prompt_lookup_min=2 \
-    --ngram-prompt-lookup-max=5 \
-    --num_speculative_tokens=5
-```
-
-```
-Throughput: 104.77 requests/s, 23836.22 total tokens/s, 10477.10 output tokens/s
-Total num prompt tokens:  261136
-Total num output tokens:  204800
-```
-
-### Other HuggingFaceDataset Examples
-
-**`lmms-lab/LLaVA-OneVision-Data`**
-
-```bash
-python3 vllm/benchmarks/benchmark_throughput.py \
-  --model Qwen/Qwen2-VL-7B-Instruct \
-  --backend vllm-chat \
-  --dataset-name hf \
-  --dataset-path lmms-lab/LLaVA-OneVision-Data \
-  --hf-split train \
-  --hf-subset "chart2text(cauldron)" \
-  --num-prompts 10
-```
-
-**`Aeala/ShareGPT_Vicuna_unfiltered`**
-
-```bash
-python3 vllm/benchmarks/benchmark_throughput.py \
-  --model Qwen/Qwen2-VL-7B-Instruct \
-  --backend vllm-chat \
-  --dataset-name hf \
-  --dataset-path Aeala/ShareGPT_Vicuna_unfiltered \
-  --hf-split train \
-  --num-prompts 10
-```
-
-**`AI-MO/aimo-validation-aime`**
-
-```bash
-python3 benchmarks/benchmark_throughput.py \
-  --model Qwen/QwQ-32B \
-  --backend vllm \
-  --dataset-name hf \
-  --dataset-path AI-MO/aimo-validation-aime \
-  --hf-split train \
-  --num-prompts 10
-```
-
 ### Benchmark with LoRA Adapters

 ``` bash
 # download dataset
 # wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
+MODEL_NAME="meta-llama/Llama-2-7b-hf"
+BACKEND="vllm"
+DATASET_NAME="sharegpt"
+DATASET_PATH="<your data path>/ShareGPT_V3_unfiltered_cleaned_split.json"
+NUM_PROMPTS=10
+MAX_LORAS=2
+MAX_LORA_RANK=8
+ENABLE_LORA="--enable-lora"
+LORA_PATH="yard1/llama-2-7b-sql-lora-test"
+
 python3 vllm/benchmarks/benchmark_throughput.py \
-  --model meta-llama/Llama-2-7b-hf \
-  --backend vllm \
-  --dataset_path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
-  --dataset_name sharegpt \
-  --num-prompts 10 \
-  --max-loras 2 \
-  --max-lora-rank 8 \
-  --enable-lora \
-  --lora-path yard1/llama-2-7b-sql-lora-test
+  --model "${MODEL_NAME}" \
+  --backend "${BACKEND}" \
+  --dataset_path "${DATASET_PATH}" \
+  --dataset_name "${DATASET_NAME}" \
+  --num-prompts "${NUM_PROMPTS}" \
+  --max-loras "${MAX_LORAS}" \
+  --max-lora-rank "${MAX_LORA_RANK}" \
+  ${ENABLE_LORA} \
+  --lora-path "${LORA_PATH}"
  ```
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@ -219,15 +219,7 @@ async def async_request_deepspeed_mii(
                if response.status == 200:
                    parsed_resp = await response.json()
                    output.latency = time.perf_counter() - st
-                    if "choices" in parsed_resp:
-                        output.generated_text = parsed_resp["choices"][0][
-                            "text"]
-                    elif "text" in parsed_resp:
-                        output.generated_text = parsed_resp["text"][0]
-                    else:
-                        output.error = ("Unexpected response format: "
-                                        "neither 'choices' nor 'text' found")
-                        output.success = False
+                    output.generated_text = parsed_resp["text"][0]
                    output.success = True
                else:
                    output.error = response.reason or ""
@ -497,9 +489,3 @@ ASYNC_REQUEST_FUNCS = {
    "scalellm": async_request_openai_completions,
    "sglang": async_request_openai_completions,
 }
-
-OPENAI_COMPATIBLE_BACKENDS = [
-    k for k, v in ASYNC_REQUEST_FUNCS.items()
-    if v in (async_request_openai_completions,
-             async_request_openai_chat_completions)
-]
--- a/benchmarks/benchmark_dataset.py
+++ b/benchmarks/benchmark_dataset.py
@ -23,8 +23,7 @@ from abc import ABC, abstractmethod
 from collections.abc import Mapping
 from dataclasses import dataclass
 from functools import cache
-from io import BytesIO
-from typing import Any, Callable, Optional, Union
+from typing import Any, Optional, Union

 import numpy as np
 import pandas as pd
@ -240,24 +239,21 @@ def process_image(image: Any) -> Mapping[str, Any]:
    """
    Process a single image input and return a multimedia content dictionary.

-    Supports three input types:
+    For a PIL.Image.Image input:
+      - Converts the image to RGB.
+      - Saves the image as a JPEG in-memory.
+      - Encodes the JPEG data as a base64 string.
+      - Returns a dictionary with the image as a base64 data URL.

-    1. Dictionary with raw image bytes: - Expects a dict with a 'bytes' key
-       containing raw image data.  - Loads the bytes as a PIL.Image.Image.
-
-    2. PIL.Image.Image input: - Converts the image to RGB.  - Saves the image as
-       a JPEG in memory.  - Encodes the JPEG data as a base64 string.  - Returns
-       a dictionary with the image as a base64 data URL.
-
-    3. String input: - Treats the string as a URL or local file path.  -
-       Prepends "file://" if the string doesn't start with "http://" or
-       "file://".  - Returns a dictionary with the image URL.
+    For a string input:
+      - Treats the string as a URL or file path.
+      - Prepends "file://" if the string doesn't start with "http://" or
+        "file://".
+      - Returns a dictionary with the image URL.

    Raises:
-        ValueError: If the input is not a supported type.
+      ValueError: If the input is neither a PIL.Image.Image nor a string.
    """
-    if isinstance(image, dict) and 'bytes' in image:
-        image = Image.open(BytesIO(image['bytes']))
    if isinstance(image, Image.Image):
        image = image.convert("RGB")
        with io.BytesIO() as image_data:
@ -276,8 +272,8 @@ def process_image(image: Any) -> Mapping[str, Any]:
            ("http://", "file://")) else f"file://{image}")
        return {"type": "image_url", "image_url": {"url": image_url}}

-    raise ValueError(f"Invalid image input {image}. Must be a PIL.Image.Image"
-                     " or str or dictionary with raw image bytes.")
+    raise ValueError(
+        f"Invalid image input {image}. Must be a PIL.Image.Image or str.")


 # -----------------------------------------------------------------------------
@ -566,47 +562,48 @@ class BurstGPTDataset(BenchmarkDataset):


 # -----------------------------------------------------------------------------
-# HuggingFace Dataset Base Implementation
+# HuggingFace Dataset Implementation
 # -----------------------------------------------------------------------------
-class HuggingFaceDataset(BenchmarkDataset):
-    """Base class for datasets hosted on HuggingFace."""

-    SUPPORTED_DATASET_PATHS: Union[set[str], dict[str, Callable]] = set()
+
+class HuggingFaceDataset(BenchmarkDataset):
+    """
+    Dataset class for processing a HuggingFace dataset with conversation data
+    and optional images.
+    """

    def __init__(
        self,
-        dataset_path: str,
        dataset_split: str,
        dataset_subset: Optional[str] = None,
        **kwargs,
    ) -> None:
-        super().__init__(dataset_path=dataset_path, **kwargs)
-
+        super().__init__(**kwargs)
        self.dataset_split = dataset_split
        self.dataset_subset = dataset_subset
+
        self.load_data()

    def load_data(self) -> None:
-        """Load data from HuggingFace datasets."""
+        if not self.dataset_path:
+            raise ValueError("dataset_path must be provided for loading data.")
+
        self.data = load_dataset(
            self.dataset_path,
            name=self.dataset_subset,
            split=self.dataset_split,
            streaming=True,
        )
-        self.data = self.data.shuffle(seed=self.random_seed)
-
-
-# -----------------------------------------------------------------------------
-# Conversation Dataset Implementation
-# -----------------------------------------------------------------------------
-
-
-class ConversationDataset(HuggingFaceDataset):
-    """Dataset for conversation data with multimodal support."""
-    SUPPORTED_DATASET_PATHS = {
-        'lmms-lab/LLaVA-OneVision-Data', 'Aeala/ShareGPT_Vicuna_unfiltered'
-    }
+        if self.data.features is None or "conversations" \
+            not in self.data.features:
+            raise ValueError(
+                "HuggingFaceDataset currently only supports datasets with "
+                "a 'conversations' column like lmms-lab/LLaVA-OneVision-Data. "
+                "Please consider contributing if you would like to add "
+                "support for additional dataset formats.")
+        # Shuffle, then keep only examples with at least 2 conversations.
+        self.data = self.data.shuffle(seed=self.random_seed).filter(
+            lambda x: len(x["conversations"]) >= 2)

    def sample(self,
               tokenizer: PreTrainedTokenizerBase,
@ -614,13 +611,10 @@ class ConversationDataset(HuggingFaceDataset):
               output_len: Optional[int] = None,
               enable_multimodal_chat: bool = False,
               **kwargs) -> list:
-        # Filter examples with at least 2 conversations
-        filtered_data = self.data.filter(
-            lambda x: len(x["conversations"]) >= 2)
        sampled_requests = []
        dynamic_output = output_len is None

-        for item in filtered_data:
+        for item in self.data:
            if len(sampled_requests) >= num_requests:
                break
            conv = item["conversations"]
@ -665,12 +659,29 @@ class VisionArenaDataset(HuggingFaceDataset):
    """

    DEFAULT_OUTPUT_LEN = 128
-    SUPPORTED_DATASET_PATHS = {
-        "lmarena-ai/VisionArena-Chat":
-        lambda x: x["conversation"][0][0]["content"],
-        "lmarena-ai/vision-arena-bench-v0.1":
-        lambda x: x["turns"][0][0]["content"]
-    }
+    VISION_ARENA_DATASET_PATH = "lmarena-ai/vision-arena-bench-v0.1"
+
+    def __init__(
+        self,
+        **kwargs,
+    ) -> None:
+        super().__init__(**kwargs)
+        if self.dataset_path != self.VISION_ARENA_DATASET_PATH:
+            raise ValueError(f"Only support Vision Arena dataset.\
+                    This data path {self.dataset_path} is not valid.")
+        if self.dataset_subset is None and self.dataset_split != "train":
+            raise ValueError("Dataset split must be 'train'.")
+
+        self.load_data()
+
+    def load_data(self) -> None:
+        dataset = load_dataset(
+            self.dataset_path,
+            name=self.dataset_subset,
+            split=self.dataset_split,
+            streaming=True,
+        )
+        self.data = dataset.shuffle(seed=self.random_seed)

    def sample(
        self,
@ -686,11 +697,7 @@ class VisionArenaDataset(HuggingFaceDataset):
        for item in self.data:
            if len(sampled_requests) >= num_requests:
                break
-            parser_fn = self.SUPPORTED_DATASET_PATHS.get(self.dataset_path)
-            if parser_fn is None:
-                raise ValueError(
-                    f"Unsupported dataset path: {self.dataset_path}")
-            prompt = parser_fn(item)
+            prompt = item["turns"][0][0]["content"]
            mm_content = process_image(item["images"][0])
            prompt_len = len(tokenizer(prompt).input_ids)
            if enable_multimodal_chat:
@ -708,96 +715,3 @@ class VisionArenaDataset(HuggingFaceDataset):
                ))
        self.maybe_oversample_requests(sampled_requests, num_requests)
        return sampled_requests
-
-
-# -----------------------------------------------------------------------------
-# Instruct Coder Dataset Implementation
-# -----------------------------------------------------------------------------
-
-
-class InstructCoderDataset(HuggingFaceDataset):
-    """
-    InstructCoder Dataset.
-    https://huggingface.co/datasets/likaixin/InstructCoder
-
-    InstructCoder is the dataset designed for general code editing.  It consists
-    of 114,239 instruction-input-output triplets, and covers multiple distinct
-    code editing scenario.
-    """
-
-    DEFAULT_OUTPUT_LEN = 200  # this is the average default output length
-    SUPPORTED_DATASET_PATHS = {
-        "likaixin/InstructCoder",
-    }
-
-    def sample(self,
-               tokenizer: PreTrainedTokenizerBase,
-               num_requests: int,
-               output_len: Optional[int] = None,
-               enable_multimodal_chat: bool = False,
-               **kwargs) -> list:
-        output_len = (output_len
-                      if output_len is not None else self.DEFAULT_OUTPUT_LEN)
-        sampled_requests = []
-        for item in self.data:
-            if len(sampled_requests) >= num_requests:
-                break
-            prompt = f"{item['instruction']}:\n{item['input']}"
-            prompt_len = len(tokenizer(prompt).input_ids)
-            sampled_requests.append(
-                SampleRequest(
-                    prompt=prompt,
-                    prompt_len=prompt_len,
-                    expected_output_len=output_len,
-                ))
-        self.maybe_oversample_requests(sampled_requests, num_requests)
-        return sampled_requests
-
-
-# -----------------------------------------------------------------------------
-# AIMO Dataset Implementation
-# -----------------------------------------------------------------------------
-
-
-class AIMODataset(HuggingFaceDataset):
-    """
-    Dataset class for processing a AIMO dataset with reasoning questions.
-    """
-    SUPPORTED_DATASET_PATHS = {
-        "AI-MO/aimo-validation-aime", "AI-MO/NuminaMath-1.5",
-        "AI-MO/NuminaMath-CoT"
-    }
-
-    def sample(self,
-               tokenizer: PreTrainedTokenizerBase,
-               num_requests: int,
-               output_len: Optional[int] = None,
-               **kwargs) -> list:
-        sampled_requests = []
-        dynamic_output = output_len is None
-
-        for item in self.data:
-            if len(sampled_requests) >= num_requests:
-                break
-            prompt, completion = item['problem'], item["solution"]
-
-            prompt_ids = tokenizer(prompt).input_ids
-            completion_ids = tokenizer(completion).input_ids
-            prompt_len = len(prompt_ids)
-            completion_len = len(completion_ids)
-            output_len = completion_len if dynamic_output else output_len
-            assert isinstance(output_len, int) and output_len > 0
-            if dynamic_output and not is_valid_sequence(prompt_len,
-                                                        completion_len,
-                                                        max_prompt_len=2048,
-                                                        max_total_len=32000):
-                continue
-            sampled_requests.append(
-                SampleRequest(
-                    prompt=prompt,
-                    prompt_len=prompt_len,
-                    expected_output_len=output_len,
-                    multi_modal_data=None,
-                ))
-        self.maybe_oversample_requests(sampled_requests, num_requests)
-        return sampled_requests
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@ -7,6 +7,9 @@ On the server side, run one of the following commands:
        --swap-space 16 \
        --disable-log-requests

+    (TGI backend)
+    ./launch_tgi_server.sh <your_model> <max_batch_total_tokens>
+
 On the client side, run:
    python benchmarks/benchmark_serving.py \
        --backend <backend> \
@ -34,8 +37,7 @@ from datetime import datetime
 from typing import Any, Optional

 import numpy as np
-from backend_request_func import (ASYNC_REQUEST_FUNCS,
-                                  OPENAI_COMPATIBLE_BACKENDS, RequestFuncInput,
+from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,
                                  RequestFuncOutput)
 from tqdm.asyncio import tqdm
 from transformers import PreTrainedTokenizerBase
@ -50,11 +52,9 @@ try:
 except ImportError:
    from argparse import ArgumentParser as FlexibleArgumentParser

-from benchmark_dataset import (AIMODataset, BurstGPTDataset,
-                               ConversationDataset, HuggingFaceDataset,
-                               InstructCoderDataset, RandomDataset,
-                               SampleRequest, ShareGPTDataset, SonnetDataset,
-                               VisionArenaDataset)
+from benchmark_dataset import (BurstGPTDataset, HuggingFaceDataset,
+                               RandomDataset, SampleRequest, ShareGPTDataset,
+                               SonnetDataset, VisionArenaDataset)
 from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json

 MILLISECONDS_TO_SECONDS_CONVERSION = 1000
@ -261,7 +261,6 @@ async def benchmark(
    goodput_config_dict: dict[str, float],
    max_concurrency: Optional[int],
    lora_modules: Optional[Iterable[str]],
-    extra_body: Optional[dict],
 ):
    if backend in ASYNC_REQUEST_FUNCS:
        request_func = ASYNC_REQUEST_FUNCS[backend]
@ -289,7 +288,6 @@ async def benchmark(
        logprobs=logprobs,
        multi_modal_content=test_mm_content,
        ignore_eos=ignore_eos,
-        extra_body=extra_body,
    )

    test_output = await request_func(request_func_input=test_input)
@ -316,8 +314,7 @@ async def benchmark(
                                         output_len=test_output_len,
                                         logprobs=logprobs,
                                         multi_modal_content=test_mm_content,
-                                         ignore_eos=ignore_eos,
-                                         extra_body=extra_body)
+                                         ignore_eos=ignore_eos)
        profile_output = await request_func(request_func_input=profile_input)
        if profile_output.success:
            print("Profiler started")
@ -367,8 +364,7 @@ async def benchmark(
                                              output_len=output_len,
                                              logprobs=logprobs,
                                              multi_modal_content=mm_content,
-                                              ignore_eos=ignore_eos,
-                                              extra_body=extra_body)
+                                              ignore_eos=ignore_eos)
        tasks.append(
            asyncio.create_task(
                limited_request_func(request_func_input=request_func_input,
@ -590,39 +586,19 @@ def main(args: argparse.Namespace):
                                            return_prompt_formatted=True)

    elif args.dataset_name == "hf":
-        # all following datasets are implemented from the
-        # HuggingFaceDataset base class
-        if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS:
-            dataset_class = VisionArenaDataset
-            args.hf_split = "train"
-            args.hf_subset = None
-        elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS:
-            dataset_class = InstructCoderDataset
-            args.hf_split = "train"
-        elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS:
-            dataset_class = ConversationDataset
-        elif args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS:
-            dataset_class = AIMODataset
-            args.hf_split = "train"
-        else:
-            supported_datasets = set([
-                dataset_name for cls in HuggingFaceDataset.__subclasses__()
-                for dataset_name in cls.SUPPORTED_DATASET_PATHS
-            ])
-            raise ValueError(
-                f"Unsupported dataset path: {args.dataset_path}. "
-                "Huggingface dataset only supports dataset_path"
-                f" from one of following: {supported_datasets}. "
-                "Please consider contributing if you would "
-                "like to add support for additional dataset formats.")
+        # Choose between VisionArenaDataset
+        # and HuggingFaceDataset based on provided parameters.
+        dataset_class = (VisionArenaDataset if args.dataset_path
+                         == VisionArenaDataset.VISION_ARENA_DATASET_PATH
+                         and args.hf_subset is None else HuggingFaceDataset)
        input_requests = dataset_class(
            dataset_path=args.dataset_path,
            dataset_subset=args.hf_subset,
            dataset_split=args.hf_split,
-            random_seed=args.seed,
        ).sample(
            num_requests=args.num_prompts,
            tokenizer=tokenizer,
+            random_seed=args.seed,
            output_len=args.hf_output_len,
        )

@ -657,26 +633,6 @@ def main(args: argparse.Namespace):
            raise ValueError(f"Unknown dataset: {args.dataset_name}") from err
    goodput_config_dict = check_goodput_args(args)

-    # Collect the sampling parameters.
-    sampling_params = {
-        k: v
-        for k, v in {
-            "top_p": args.top_p,
-            "top_k": args.top_k,
-            "min_p": args.min_p,
-            "temperature": args.temperature
-        }.items() if v is not None
-    }
-
-    # Sampling parameters are only supported by openai-compatible backend.
-    if sampling_params and args.backend not in OPENAI_COMPATIBLE_BACKENDS:
-        raise ValueError(
-            "Sampling parameters are only supported by openai-compatible "
-            "backends.")
-
-    if "temperature" not in sampling_params:
-        sampling_params["temperature"] = 0.0  # Default to greedy decoding.
-
    # Avoid GC processing "static" data - reduce pause times.
    gc.collect()
    gc.freeze()
@ -703,7 +659,6 @@ def main(args: argparse.Namespace):
            goodput_config_dict=goodput_config_dict,
            max_concurrency=args.max_concurrency,
            lora_modules=args.lora_modules,
-            extra_body=sampling_params,
        ))

    # Save config and results to json
@ -1026,33 +981,6 @@ if __name__ == "__main__":
        "from the sampled HF dataset.",
    )

-    sampling_group = parser.add_argument_group("sampling parameters")
-    sampling_group.add_argument(
-        "--top-p",
-        type=float,
-        default=None,
-        help="Top-p sampling parameter. Only has effect on openai-compatible "
-        "backends.")
-    sampling_group.add_argument(
-        "--top-k",
-        type=int,
-        default=None,
-        help="Top-k sampling parameter. Only has effect on openai-compatible "
-        "backends.")
-    sampling_group.add_argument(
-        "--min-p",
-        type=float,
-        default=None,
-        help="Min-p sampling parameter. Only has effect on openai-compatible "
-        "backends.")
-    sampling_group.add_argument(
-        "--temperature",
-        type=float,
-        default=None,
-        help="Temperature sampling parameter. Only has effect on "
-        "openai-compatible backends. If not specified, default to greedy "
-        "decoding (i.e. temperature==0.0).")
-
    parser.add_argument(
        '--tokenizer-mode',
        type=str,
--- a/benchmarks/benchmark_serving_structured_output.py
+++ b/benchmarks/benchmark_serving_structured_output.py
@ -5,6 +5,9 @@ On the server side, run one of the following commands:
    (vLLM OpenAI API server)
    vllm serve <your_model> --disable-log-requests

+    (TGI backend)
+    ./launch_tgi_server.sh <your_model> <max_batch_total_tokens>
+
 On the client side, run:
    python benchmarks/benchmark_serving_structured_output.py \
        --backend <backend> \
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@ -11,8 +11,7 @@ from typing import Any, Optional, Union

 import torch
 import uvloop
-from benchmark_dataset import (AIMODataset, BurstGPTDataset,
-                               ConversationDataset, InstructCoderDataset,
+from benchmark_dataset import (BurstGPTDataset, HuggingFaceDataset,
                               RandomDataset, SampleRequest, ShareGPTDataset,
                               SonnetDataset, VisionArenaDataset)
 from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
@ -301,7 +300,6 @@ def get_requests(args, tokenizer):
        "input_len": args.input_len,
        "output_len": args.output_len,
    }
-
    if args.dataset_path is None or args.dataset_name == "random":
        sample_kwargs["range_ratio"] = args.random_range_ratio
        sample_kwargs["prefix_len"] = args.prefix_len
@ -319,23 +317,18 @@ def get_requests(args, tokenizer):
    elif args.dataset_name == "burstgpt":
        dataset_cls = BurstGPTDataset
    elif args.dataset_name == "hf":
-        if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS:
-            dataset_cls = VisionArenaDataset
-            common_kwargs['dataset_subset'] = None
-            common_kwargs['dataset_split'] = "train"
-            sample_kwargs["enable_multimodal_chat"] = True
-        elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS:
-            dataset_cls = InstructCoderDataset
-            common_kwargs['dataset_split'] = "train"
-        elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS:
-            dataset_cls = ConversationDataset
-            common_kwargs['dataset_subset'] = args.hf_subset
-            common_kwargs['dataset_split'] = args.hf_split
-            sample_kwargs["enable_multimodal_chat"] = True
-        elif args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS:
-            dataset_cls = AIMODataset
-            common_kwargs['dataset_subset'] = None
-            common_kwargs['dataset_split'] = "train"
+        if args.backend != "vllm-chat":
+            raise ValueError(
+                "hf datasets only are supported by vllm-chat backend")
+        # Choose between VisionArenaDataset and HuggingFaceDataset based on
+        # provided parameters.
+        dataset_cls = (VisionArenaDataset if args.dataset_path
+                       == VisionArenaDataset.VISION_ARENA_DATASET_PATH
+                       and args.hf_subset is None else HuggingFaceDataset)
+        common_kwargs['dataset_subset'] = args.hf_subset
+        common_kwargs['dataset_split'] = args.hf_split
+        sample_kwargs["enable_multimodal_chat"] = True
+
    else:
        raise ValueError(f"Unknown dataset name: {args.dataset_name}")
    # Remove None values
@ -469,17 +462,9 @@ def validate_args(args):
        warnings.warn("--hf-subset and --hf-split will be ignored \
                since --dataset-name is not 'hf'.",
                      stacklevel=2)
-    elif args.dataset_name == "hf":
-        if args.dataset_path in (
-                VisionArenaDataset.SUPPORTED_DATASET_PATHS.keys()
-                | ConversationDataset.SUPPORTED_DATASET_PATHS):
-            assert args.backend == "vllm-chat", f"{args.dataset_path} needs to use vllm-chat as the backend."  #noqa: E501
-        elif args.dataset_path in (InstructCoderDataset.SUPPORTED_DATASET_PATHS
-                                   | AIMODataset.SUPPORTED_DATASET_PATHS):
-            assert args.backend == "vllm", f"{args.dataset_path} needs to use vllm as the backend."  #noqa: E501
-        else:
-            raise ValueError(
-                f"{args.dataset_path} is not supported by hf dataset.")
+    elif args.dataset_name == "hf" and args.backend != "vllm-chat":
+        raise ValueError(
+            "When --dataset-name is 'hf', backend must be 'vllm-chat'")

    # --random-range-ratio: only used when dataset_name is 'random'
    if args.dataset_name != 'random' and args.random_range_ratio is not None:
--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@ -30,18 +30,19 @@ class BenchmarkConfig(TypedDict):
    num_stages: int


-def benchmark_config(config: BenchmarkConfig,
-                     num_tokens: int,
-                     num_experts: int,
-                     shard_intermediate_size: int,
-                     hidden_size: int,
-                     topk: int,
-                     dtype: torch.dtype,
-                     use_fp8_w8a8: bool,
-                     use_int8_w8a16: bool,
-                     num_iters: int = 100,
-                     block_quant_shape: List[int] = None,
-                     use_deep_gemm: bool = False) -> float:
+def benchmark_config(
+    config: BenchmarkConfig,
+    num_tokens: int,
+    num_experts: int,
+    shard_intermediate_size: int,
+    hidden_size: int,
+    topk: int,
+    dtype: torch.dtype,
+    use_fp8_w8a8: bool,
+    use_int8_w8a16: bool,
+    num_iters: int = 100,
+    block_quant_shape: List[int] = None,
+) -> float:
    init_dtype = torch.float16 if use_fp8_w8a8 else dtype
    x = torch.randn(num_tokens, hidden_size, dtype=dtype)
    if use_int8_w8a16:
@ -114,41 +115,22 @@ def benchmark_config(config: BenchmarkConfig,
    def run():
        from vllm.model_executor.layers.fused_moe import override_config
        with override_config(config):
-            if use_deep_gemm:
-                topk_weights, topk_ids = fused_topk(x, input_gating, topk,
-                                                    False)
-                return fused_experts(
-                    x,
-                    w1,
-                    w2,
-                    topk_weights,
-                    topk_ids,
-                    inplace=True,
-                    use_fp8_w8a8=use_fp8_w8a8,
-                    w1_scale=w1_scale,
-                    w2_scale=w2_scale,
-                    a1_scale=a1_scale,
-                    a2_scale=a2_scale,
-                    block_shape=block_quant_shape,
-                    allow_deep_gemm=True,
-                )
-            else:
-                fused_moe(
-                    x,
-                    w1,
-                    w2,
-                    input_gating,
-                    topk,
-                    renormalize=True,
-                    inplace=True,
-                    use_fp8_w8a8=use_fp8_w8a8,
-                    use_int8_w8a16=use_int8_w8a16,
-                    w1_scale=w1_scale,
-                    w2_scale=w2_scale,
-                    a1_scale=a1_scale,
-                    a2_scale=a2_scale,
-                    block_shape=block_quant_shape,
-                )
+            fused_moe(
+                x,
+                w1,
+                w2,
+                input_gating,
+                topk,
+                renormalize=True,
+                inplace=True,
+                use_fp8_w8a8=use_fp8_w8a8,
+                use_int8_w8a16=use_int8_w8a16,
+                w1_scale=w1_scale,
+                w2_scale=w2_scale,
+                a1_scale=a1_scale,
+                a2_scale=a2_scale,
+                block_shape=block_quant_shape,
+            )

    # JIT compilation & warmup
    run()
@ -384,7 +366,6 @@ class BenchmarkWorker:
        use_fp8_w8a8: bool,
        use_int8_w8a16: bool,
        block_quant_shape: List[int] = None,
-        use_deep_gemm: bool = False,
    ) -> tuple[dict[str, int], float]:
        current_platform.seed_everything(self.seed)
        dtype_str = get_config_dtype_str(dtype,
@ -415,8 +396,7 @@ class BenchmarkWorker:
                                       use_fp8_w8a8,
                                       use_int8_w8a16,
                                       num_iters=100,
-                                       block_quant_shape=block_quant_shape,
-                                       use_deep_gemm=use_deep_gemm)
+                                       block_quant_shape=block_quant_shape)
        return config, kernel_time

    def tune(
@ -431,7 +411,6 @@ class BenchmarkWorker:
        use_int8_w8a16: bool,
        search_space: list[dict[str, int]],
        block_quant_shape: list[int],
-        use_deep_gemm: bool,
    ) -> dict[str, int]:
        best_config = None
        best_time = float("inf")
@ -457,8 +436,7 @@ class BenchmarkWorker:
                        use_fp8_w8a8,
                        use_int8_w8a16,
                        num_iters=20,
-                        block_quant_shape=block_quant_shape,
-                        use_deep_gemm=use_deep_gemm)
+                        block_quant_shape=block_quant_shape)
                except triton.runtime.autotuner.OutOfResources:
                    # Some configurations may be invalid and fail to compile.
                    continue
@ -553,9 +531,6 @@ def main(args: argparse.Namespace):
        intermediate_size = config.moe_intermediate_size
        shard_intermediate_size = 2 * intermediate_size // args.tp_size
    else:
-        if not hasattr(config, "hidden_size"):
-            # Support for llama4
-            config = config.text_config
        # Default: Mixtral.
        E = config.num_local_experts
        topk = config.num_experts_per_tok
@ -575,8 +550,6 @@ def main(args: argparse.Namespace):
    else:
        batch_sizes = [args.batch_size]

-    use_deep_gemm = bool(args.use_deep_gemm)
-
    ray.init()
    num_gpus = int(ray.available_resources()["GPU"])
    workers = [BenchmarkWorker.remote(args.seed) for _ in range(num_gpus)]
@ -599,10 +572,10 @@ def main(args: argparse.Namespace):

        start = time.time()
        configs = _distribute(
-            "tune", [(batch_size, E, shard_intermediate_size, hidden_size,
-                      topk, dtype, use_fp8_w8a8, use_int8_w8a16, search_space,
-                      block_quant_shape, use_deep_gemm)
-                     for batch_size in batch_sizes])
+            "tune",
+            [(batch_size, E, shard_intermediate_size, hidden_size, topk, dtype,
+              use_fp8_w8a8, use_int8_w8a16, search_space, block_quant_shape)
+             for batch_size in batch_sizes])
        best_configs = {
            M: sort_config(config)
            for M, config in zip(batch_sizes, configs)
@ -616,7 +589,7 @@ def main(args: argparse.Namespace):
        outputs = _distribute(
            "benchmark",
            [(batch_size, E, shard_intermediate_size, hidden_size, topk, dtype,
-              use_fp8_w8a8, use_int8_w8a16, block_quant_shape, use_deep_gemm)
+              use_fp8_w8a8, use_int8_w8a16, block_quant_shape)
             for batch_size in batch_sizes])

        for batch_size, (config, kernel_time) in zip(batch_sizes, outputs):
@ -638,7 +611,6 @@ if __name__ == "__main__":
                        type=str,
                        choices=["auto", "fp8_w8a8", "int8_w8a16"],
                        default="auto")
-    parser.add_argument("--use-deep-gemm", action="store_true")
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--batch-size", type=int, required=False)
    parser.add_argument("--tune", action="store_true")
--- a/benchmarks/launch_tgi_server.sh
+++ b/benchmarks/launch_tgi_server.sh
@ -0,0 +1,16 @@
+#!/bin/bash
+
+PORT=8000
+MODEL=$1
+TOKENS=$2
+
+docker run -e "HF_TOKEN=$HF_TOKEN" --gpus all --shm-size 1g -p $PORT:80 \
+           -v "$PWD/data:/data" \
+           ghcr.io/huggingface/text-generation-inference:2.2.0 \
+           --model-id "$MODEL" \
+           --sharded false  \
+           --max-input-length 1024 \
+           --max-total-tokens 2048 \
+           --max-best-of 5 \
+           --max-concurrent-requests 5000 \
+           --max-batch-total-tokens "$TOKENS"
--- a/cmake/cpu_extension.cmake
+++ b/cmake/cpu_extension.cmake
@ -33,6 +33,8 @@ endif()

 if(MACOSX_FOUND)
    list(APPEND CXX_COMPILE_FLAGS
+        "-Xpreprocessor"
+        "-fopenmp"
        "-DVLLM_CPU_EXTENSION")
 else()
    list(APPEND CXX_COMPILE_FLAGS
@ -195,7 +197,6 @@ set(VLLM_EXT_SRC
 if (AVX512_FOUND AND NOT AVX512_DISABLED)
    set(VLLM_EXT_SRC
        "csrc/cpu/quant.cpp"
-        "csrc/cpu/shm.cpp"
        ${VLLM_EXT_SRC})
 endif()

--- a/collect_env.py
+++ b/collect_env.py
@ -482,28 +482,16 @@ def get_pip_packages(run_lambda, patterns=None):
    if patterns is None:
        patterns = DEFAULT_PIP_PATTERNS

-    def run_with_pip():
-        try:
-            import importlib.util
-            pip_spec = importlib.util.find_spec('pip')
-            pip_available = pip_spec is not None
-        except ImportError:
-            pip_available = False
-
-        if pip_available:
-            cmd = [sys.executable, '-mpip', 'list', '--format=freeze']
-        elif os.environ.get("UV") is not None:
-            print("uv is set")
-            cmd = ["uv", "pip", "list", "--format=freeze"]
-        else:
-            raise RuntimeError("Could not collect pip list output (pip or uv module not available)")
-
-        out = run_and_read_all(run_lambda, cmd)
+    # People generally have `pip` as `pip` or `pip3`
+    # But here it is invoked as `python -mpip`
+    def run_with_pip(pip):
+        out = run_and_read_all(run_lambda, pip + ["list", "--format=freeze"])
        return "\n".join(line for line in out.splitlines()
                         if any(name in line for name in patterns))

    pip_version = 'pip3' if sys.version[0] == '3' else 'pip'
-    out = run_with_pip()
+    out = run_with_pip([sys.executable, '-mpip'])
+
    return pip_version, out


--- a/csrc/cpu/cpu_types_x86.hpp
+++ b/csrc/cpu/cpu_types_x86.hpp
@ -78,14 +78,9 @@ struct FP16Vec16 : public Vec<FP16Vec16> {

  __m256i reg;

-  // normal load
  explicit FP16Vec16(const void* ptr)
      : reg((__m256i)_mm256_loadu_si256((__m256i*)ptr)) {}

-  // non-temproal load
-  explicit FP16Vec16(bool, void* ptr)
-      : reg(_mm256_stream_load_si256((__m256i*)ptr)) {}
-
  explicit FP16Vec16(const FP32Vec16&);

  void save(void* ptr) const { *reinterpret_cast<__m256i*>(ptr) = reg; }
@ -115,14 +110,9 @@ struct BF16Vec16 : public Vec<BF16Vec16> {

  __m256i reg;

-  // normal load
  explicit BF16Vec16(const void* ptr)
      : reg((__m256i)_mm256_loadu_si256((__m256i*)ptr)) {}

-  // non-temproal load
-  explicit BF16Vec16(bool, void* ptr)
-      : reg(_mm256_stream_load_si256((__m256i*)ptr)) {}
-
  explicit BF16Vec16(const FP32Vec16&);

  void save(void* ptr) const { *reinterpret_cast<__m256i*>(ptr) = reg; }
@ -323,13 +313,8 @@ struct FP32Vec16 : public Vec<FP32Vec16> {

  explicit FP32Vec16() : reg(_mm512_set1_ps(0.0)) {}

-  // normal load
  explicit FP32Vec16(const float* ptr) : reg(_mm512_loadu_ps(ptr)) {}

-  // non-temproal load
-  explicit FP32Vec16(bool, void* ptr)
-      : reg((__m512)_mm512_stream_load_si512(ptr)) {}
-
  explicit FP32Vec16(__m512 data) : reg(data) {}

  explicit FP32Vec16(const FP32Vec4& data)
@ -562,33 +547,6 @@ struct INT8Vec16 : public Vec<INT8Vec16> {
    _mm_mask_storeu_epi8(ptr, mask, reg);
  }
 };
-
-struct INT8Vec64 : public Vec<INT8Vec64> {
-  constexpr static int VEC_ELEM_NUM = 64;
-  union AliasReg {
-    __m512i reg;
-    int8_t values[VEC_ELEM_NUM];
-  };
-
-  __m512i reg;
-
-  // normal load
-  explicit INT8Vec64(void* ptr) : reg(_mm512_loadu_epi8(ptr)) {}
-
-  // non-temproal load
-  explicit INT8Vec64(bool, void* ptr) : reg(_mm512_stream_load_si512(ptr)) {}
-
-  void save(void* ptr) const { _mm512_storeu_epi8(ptr, reg); }
-
-  void save(int8_t* ptr, const int elem_num) const {
-    constexpr uint64_t M = 0xFFFFFFFFFFFFFFFF;
-    __mmask64 mask = _cvtu64_mask64(M >> (64 - elem_num));
-    _mm512_mask_storeu_epi8(ptr, mask, reg);
-  }
-
-  // non-temproal save
-  void nt_save(int8_t* ptr) { _mm512_stream_si512((__m512i*)ptr, reg); }
-};
 #endif

 template <typename T>
@ -699,22 +657,6 @@ inline BF16Vec16::BF16Vec16(const FP32Vec16& v) {

 inline void prefetch(const void* addr) { _mm_prefetch(addr, _MM_HINT_T1); }

-#ifdef __AVX512F__
-inline void non_temporal_save(FP16Vec16& vec, void* ptr) {
-  _mm256_stream_si256((__m256i*)ptr, vec.reg);
-}
-inline void non_temporal_save(BF16Vec32& vec, void* ptr) {
-  _mm512_stream_si512((__m512i*)ptr, vec.reg);
-}
-inline void non_temporal_save(BF16Vec16& vec, void* ptr) {
-  _mm256_stream_si256((__m256i*)ptr, vec.reg);
-}
-inline void non_temporal_save(FP32Vec16& vec, void* ptr) {
-  _mm512_stream_ps((float*)ptr, vec.reg);
-}
-#endif
-
-inline void mem_barrier() { _mm_mfence(); }
 };  // namespace vec_op

 #endif
--- a/csrc/cpu/shm.cpp
+++ b/csrc/cpu/shm.cpp
@ -1,781 +0,0 @@
-#include "cpu/cpu_types.hpp"
-
-#include <fcntl.h>
-#include <sys/mman.h>
-#include <sys/stat.h>
-#include <unistd.h>
-
-namespace {
-#define MAX_SHM_RANK_NUM 8
-#define MAX_THREAD_NUM 12
-#define PER_THREAD_SHM_BUFFER_BYTES (4 * 1024 * 1024)
-#define MIN_THREAD_PROCESS_SIZE (8 * 1024)
-#define MAX_P2P_SEND_TENSOR_NUM 8
-
-template <typename scalar_t>
-struct KernelVecType {
-  using scalar_vec_t = void;
-};
-
-template <>
-struct KernelVecType<float> {
-  using scalar_vec_t = vec_op::FP32Vec16;
-};
-
-template <>
-struct KernelVecType<c10::BFloat16> {
-  using scalar_vec_t = vec_op::BF16Vec16;
-};
-
-template <>
-struct KernelVecType<c10::Half> {
-  using scalar_vec_t = vec_op::FP16Vec16;
-};
-
-enum class ThreadSHMStat : char { THREAD_READY = 0, SHM_DATA_READY, DONE };
-
-struct ThreadSHMContext {
-  volatile ThreadSHMStat thread_stats[MAX_SHM_RANK_NUM];
-  int thread_id;
-  int thread_num;
-  int rank;
-  int group_size;
-  size_t _spinning_count;
-  int swizzled_ranks[MAX_SHM_RANK_NUM];
-  void* thread_shm_ptrs[MAX_SHM_RANK_NUM];
-  ThreadSHMContext* shm_contexts[MAX_SHM_RANK_NUM];
-
-  ThreadSHMContext(const int thread_id, const int thread_num, const int rank,
-                   const int group_size, void* thread_shm_ptr)
-      : thread_id(thread_id),
-        thread_num(thread_num),
-        rank(rank),
-        group_size(group_size),
-        _spinning_count(0) {
-    static_assert(sizeof(ThreadSHMContext) % 64 == 0);
-    TORCH_CHECK(group_size <= MAX_SHM_RANK_NUM);
-    TORCH_CHECK((size_t)this % 64 == 0);
-    TORCH_CHECK((size_t)thread_shm_ptr % 64 == 0);
-    for (int i = 0; i < MAX_SHM_RANK_NUM; ++i) {
-      shm_contexts[i] = nullptr;
-      thread_shm_ptrs[i] = nullptr;
-      swizzled_ranks[i] = (i + rank) % group_size;
-      thread_stats[i] = ThreadSHMStat::DONE;
-    }
-    set_context(rank, this, thread_shm_ptr);
-  }
-
-  void set_context(int rank, ThreadSHMContext* ptr, void* thread_shm_ptr) {
-    TORCH_CHECK(rank < MAX_SHM_RANK_NUM);
-    TORCH_CHECK(ptr);
-    TORCH_CHECK(thread_shm_ptr);
-    TORCH_CHECK_EQ(ptr->thread_num, thread_num);
-    TORCH_CHECK_EQ(ptr->thread_id, thread_id);
-    shm_contexts[rank] = ptr;
-    thread_shm_ptrs[rank] = thread_shm_ptr;
-  }
-
-  template <typename T>
-  T* get_thread_shm_ptr(int rank) {
-    return reinterpret_cast<T*>(thread_shm_ptrs[rank]);
-  }
-
-  int get_swizzled_rank(int idx) { return swizzled_ranks[idx]; }
-
-  void wait_for_all(ThreadSHMStat prev_stat) {
-    for (int idx = 0; idx < group_size; ++idx) {
-      int rank = get_swizzled_rank(idx);
-      while (thread_stats[rank] == prev_stat) {
-        ++_spinning_count;
-        _mm_pause();
-      }
-    }
-    vec_op::mem_barrier();
-  }
-
-  void wait_for_one(int rank, ThreadSHMStat prev_stat) {
-    while (thread_stats[rank] == prev_stat) {
-      ++_spinning_count;
-      _mm_pause();
-    }
-    vec_op::mem_barrier();
-  }
-
-  void set_thread_stat(ThreadSHMStat stat) {
-    for (int idx = 0; idx < group_size; ++idx) {
-      int rank = get_swizzled_rank(idx);
-      shm_contexts[rank]->thread_stats[this->rank] = stat;
-    }
-  }
-
-  void set_thread_stat(int target_rank, ThreadSHMStat stat) {
-    for (int idx = 0; idx < group_size; ++idx) {
-      int rank = get_swizzled_rank(idx);
-      shm_contexts[rank]->thread_stats[target_rank] = stat;
-    }
-  }
-
-  // barrier for all ranks in the group, used for all2all ops
-  // DONE -> THREAD_READY -> SHM_DATA_READY -> DONE -> ...
-  void barrier(ThreadSHMStat next_stat) {
-    if (next_stat == ThreadSHMStat::THREAD_READY) {
-      set_thread_stat(ThreadSHMStat::THREAD_READY);
-      wait_for_all(ThreadSHMStat::DONE);
-    } else if (next_stat == ThreadSHMStat::SHM_DATA_READY) {
-      set_thread_stat(ThreadSHMStat::SHM_DATA_READY);
-      wait_for_all(ThreadSHMStat::THREAD_READY);
-    } else if (next_stat == ThreadSHMStat::DONE) {
-      set_thread_stat(ThreadSHMStat::DONE);
-      wait_for_all(ThreadSHMStat::SHM_DATA_READY);
-    } else {
-      TORCH_CHECK(false, "Invalid next_stat to barrier.");
-    }
-  }
-
-  std::string to_string() const {
-    std::stringstream ss;
-    ss << "SHMContext:";
-    ss << "\nrank: " << rank;
-    ss << "\ngroup_size: " << group_size;
-    ss << "\nthread_num: " << thread_num;
-    ss << "\nthread_id: " << thread_id;
-
-    ss << "\nshm_ctx_stat_loop_seq: [";
-    for (int i = 0; i < group_size; ++i) {
-      ss << swizzled_ranks[i] << ", ";
-    }
-    ss << "]";
-
-    ss << "\nshm_contexts: [";
-    for (int i = 0; i < group_size; ++i) {
-      if (shm_contexts[i]) {
-        ss << shm_contexts[i]->rank << ", ";
-      }
-    }
-    ss << "]";
-
-    return ss.str();
-  }
-};
-
-class SHMManager {
- public:
-  explicit SHMManager(const std::string& name, const int rank,
-                      const int group_size)
-      : _rank(rank),
-        _group_size(group_size),
-        _thread_num(std::min(torch::get_num_threads(), MAX_THREAD_NUM)),
-        _shm_names({""}),
-        _shared_mem_ptrs({nullptr}),
-        _shm_ctx(nullptr) {
-    _shm_names[rank] = get_shm_name(name, rank);
-    _shared_mem_ptrs[rank] = init_shm(rank);
-    _shm_ctx = reinterpret_cast<ThreadSHMContext*>(_shared_mem_ptrs[rank]);
-
-    for (int i = 0; i < _thread_num; ++i) {
-      ThreadSHMContext* ctx = new (_shm_ctx + i)
-          ThreadSHMContext(i, _thread_num, _rank, _group_size,
-                           compute_thread_shm_ptr(_shm_ctx, i));
-    }
-  }
-
-  void join(const std::string& name) {
-    for (int rank_idx = 0; rank_idx < _group_size; ++rank_idx) {
-      if (rank_idx != _rank) {
-        TORCH_CHECK(_shm_names[rank_idx].empty());
-        TORCH_CHECK(_shared_mem_ptrs[rank_idx] == nullptr);
-        _shm_names[rank_idx] = get_shm_name(name, rank_idx);
-        _shared_mem_ptrs[rank_idx] = init_shm(rank_idx);
-        ThreadSHMContext* target_ctx =
-            reinterpret_cast<ThreadSHMContext*>(_shared_mem_ptrs[rank_idx]);
-        for (int thread_idx = 0; thread_idx < _thread_num; ++thread_idx) {
-          _shm_ctx[thread_idx].set_context(
-              rank_idx, target_ctx + thread_idx,
-              compute_thread_shm_ptr(target_ctx, thread_idx));
-        }
-      }
-    }
-  }
-
-  ~SHMManager() { destroy_shm(); }
-
-  ThreadSHMContext* get_shm_ctx() const { return _shm_ctx; }
-
-  static std::string get_shm_name(const std::string& name, int rank) {
-    return name + "_" + std::to_string(rank);
-  }
-
-  static int64_t create_singleton_instance(const std::string& name,
-                                           const int group_size,
-                                           const int rank) {
-    std::lock_guard<std::mutex> guard(SingletonInstancesLock);
-    SingletonInstances.emplace_back(
-        std::make_unique<SHMManager>(name, rank, group_size));
-    return static_cast<int64_t>(SingletonInstances.size() - 1);
-  }
-
-  static SHMManager* get_singleton_instance(int64_t handle) {
-    return SingletonInstances[handle].get();
-  }
-
- protected:
-  static std::vector<std::unique_ptr<SHMManager>> SingletonInstances;
-  static std::mutex SingletonInstancesLock;
-
- private:
-  static size_t round_to_alignment(size_t num) {
-    return ((num + 63) / 64) * 64;
-  }
-
-  int8_t* compute_thread_shm_ptr(ThreadSHMContext* ctx, int thread_id) {
-    int8_t* thread_shm_ptr =
-        reinterpret_cast<int8_t*>(ctx) +
-        round_to_alignment(_thread_num * sizeof(ThreadSHMContext));
-    return thread_shm_ptr +
-           thread_id * round_to_alignment(PER_THREAD_SHM_BUFFER_BYTES);
-  }
-
-  size_t compute_shm_size() {
-    const size_t rounded_rank_buffer_size =
-        round_to_alignment(PER_THREAD_SHM_BUFFER_BYTES) * _thread_num;
-    const size_t rounded_thread_shm_ctx_size =
-        round_to_alignment(_thread_num * sizeof(ThreadSHMContext));
-    const size_t shm_size =
-        rounded_thread_shm_ctx_size + rounded_rank_buffer_size;
-    return shm_size;
-  }
-
-  void* init_shm(int target_rank) {
-    const std::string& shm_name = _shm_names[target_rank];
-    const int local_rank = _rank;
-    const size_t shm_size = compute_shm_size();
-
-    int fd = -1;
-    if (local_rank == target_rank) {
-      fd = shm_open(shm_name.c_str(), O_CREAT | O_EXCL | O_RDWR,
-                    S_IRUSR | S_IWUSR);
-
-      if (fd == -1)
-        TORCH_CHECK(false, "create shm in SHMManager failed. errno: " +
-                               std::to_string(errno));
-
-      if (ftruncate(fd, shm_size) == -1)
-        TORCH_CHECK(false, "ftruncate in SHMManager failed. errno: " +
-                               std::to_string(errno));
-    } else {
-      fd = shm_open(shm_name.c_str(), O_RDWR, S_IRUSR | S_IWUSR);
-
-      if (fd == -1)
-        TORCH_CHECK(false, "open shm in SHMManager failed. errno: " +
-                               std::to_string(errno));
-    }
-
-    void* shm_ptr = mmap(nullptr, shm_size, PROT_READ | PROT_WRITE,
-                         MAP_SHARED | MAP_POPULATE, fd, 0);
-
-    if (shm_ptr == MAP_FAILED) {
-      TORCH_CHECK(false,
-                  "mmap in SHMManager failed. errno: " + std::to_string(errno));
-    }
-
-    if (close(fd) != 0) {
-      TORCH_CHECK(
-          false, "close in SHMManager failed. errno: " + std::to_string(errno));
-    }
-
-    TORCH_CHECK((size_t)shm_ptr % 64 == 0);
-
-    return shm_ptr;
-  }
-
-  void destroy_shm() {
-    std::stringstream ss;
-    ss << "local rank " << _rank << ": [";
-    for (int thread_id = 0; thread_id < _thread_num; ++thread_id) {
-      ss << _shm_ctx[thread_id]._spinning_count << ", ";
-    }
-    ss << "]\n";
-
-    for (int i = 0; i < MAX_SHM_RANK_NUM; ++i) {
-      if (_shared_mem_ptrs[i] != nullptr) {
-        munmap(_shared_mem_ptrs[i], compute_shm_size());
-      }
-
-      if (!_shm_names[i].empty()) {
-        shm_unlink(_shm_names[i].c_str());
-      }
-    }
-  }
-
-  int _rank;
-  int _group_size;
-  int _thread_num;
-  std::array<std::string, MAX_SHM_RANK_NUM> _shm_names;
-  std::array<void*, MAX_SHM_RANK_NUM> _shared_mem_ptrs;
-  ThreadSHMContext* _shm_ctx;
-};
-
-namespace shm_cc_ops {
-template <typename scalar_t, typename F>
-void shm_cc_loop(ThreadSHMContext* ctx, int64_t elem_num, F&& inner_func) {
-  int thread_num = ctx->thread_num;
-  int64_t total_bytes = elem_num * sizeof(scalar_t);
-  int64_t total_units_num =
-      (total_bytes + MIN_THREAD_PROCESS_SIZE - 1) / MIN_THREAD_PROCESS_SIZE;
-  int64_t per_thread_units_num =
-      (total_units_num + thread_num - 1) / thread_num;
-  int64_t per_unit_elem_num = MIN_THREAD_PROCESS_SIZE / sizeof(scalar_t);
-  int64_t max_per_thread_iteration_elem_num =
-      PER_THREAD_SHM_BUFFER_BYTES / sizeof(scalar_t);
-  int64_t per_thread_elem_num = per_unit_elem_num * per_thread_units_num;
-
-#pragma omp parallel for schedule(static, 1)
-  for (int i = 0; i < thread_num; ++i) {
-    int64_t offset = i * per_thread_elem_num;
-    int64_t end = std::min(elem_num, offset + per_thread_elem_num);
-    int64_t curr_elem_num =
-        std::min(max_per_thread_iteration_elem_num, end - offset);
-    ThreadSHMContext* thread_ctx = ctx + i;
-
-    while (curr_elem_num > 0) {
-      inner_func(thread_ctx, offset, curr_elem_num);
-
-      offset += max_per_thread_iteration_elem_num;
-      curr_elem_num = std::min(max_per_thread_iteration_elem_num, end - offset);
-    }
-  }
-}
-};  // namespace shm_cc_ops
-
-namespace shm_cc_ops {
-
-void memcpy_from_shm(void* dst, void* src, const int64_t bytes) {
-  const int64_t aligned_bytes = ((bytes >> 6) << 6);  // 64 bytes aligned
-  int64_t i = 0;
-#pragma GCC unroll 4
-  for (; i < aligned_bytes; i += 64) {
-    vec_op::INT8Vec64 data(
-        true, (int8_t*)src + i);  // stream loading shm to avoid caching
-    data.save((int8_t*)dst + i);
-  }
-  if (aligned_bytes < bytes) {
-    vec_op::INT8Vec64 data(true, (int8_t*)src + aligned_bytes);
-    data.save((int8_t*)dst + aligned_bytes, bytes - aligned_bytes);
-  }
-}
-
-void memcpy_to_shm(void* dst, void* src, const int64_t bytes) {
-#pragma GCC unroll 4
-  for (int64_t i = 0; i < bytes; i += 64) {
-    vec_op::INT8Vec64 data((int8_t*)src + i);
-    data.nt_save((int8_t*)dst + i);
-  }
-}
-
-void memcpy(void* dst, void* src, const int64_t bytes) {
-  const int64_t aligned_bytes = ((bytes >> 6) << 6);  // 64 bytes aligned
-  int64_t i = 0;
-#pragma GCC unroll 4
-  for (; i < aligned_bytes; i += 64) {
-    vec_op::INT8Vec64 data((int8_t*)src + i);
-    data.save((int8_t*)dst + i);
-  }
-  if (aligned_bytes < bytes) {
-    vec_op::INT8Vec64 data((int8_t*)src + aligned_bytes);
-    data.save((int8_t*)dst + aligned_bytes, bytes - aligned_bytes);
-  }
-}
-
-template <typename scalar_t, int RANKS>
-void all_reduce_sum_impl(ThreadSHMContext* ctx, scalar_t* data,
-                         size_t elem_num) {
-  CPU_KERNEL_GUARD_IN(all_reduce_sum_impl)
-  using vec_t = typename KernelVecType<scalar_t>::scalar_vec_t;
-  constexpr int64_t vec_elem_num = vec_t::get_elem_num();
-  const int worldsize = ctx->group_size;
-
-  shm_cc_ops::shm_cc_loop<scalar_t>(
-      ctx, elem_num,
-      [&](ThreadSHMContext* thread_ctx, int64_t data_offset,
-          int64_t data_elem_num) {
-        int rank = thread_ctx->rank;
-        scalar_t* thread_shm_ptr =
-            thread_ctx->get_thread_shm_ptr<scalar_t>(rank);
-        scalar_t* thread_data_ptr = data + data_offset;
-        int64_t thread_data_elem_num = data_elem_num * sizeof(scalar_t);
-
-        scalar_t* remote_data_ptrs[RANKS - 1];
-        vec_op::unroll_loop<int, RANKS - 1>([&](int idx) {
-          remote_data_ptrs[idx] = thread_ctx->get_thread_shm_ptr<scalar_t>(
-              thread_ctx->get_swizzled_rank(idx + 1));
-        });
-
-        thread_ctx->barrier(ThreadSHMStat::THREAD_READY);
-
-        shm_cc_ops::memcpy_to_shm(thread_shm_ptr, thread_data_ptr,
-                                  thread_data_elem_num);
-
-        thread_ctx->barrier(ThreadSHMStat::SHM_DATA_READY);
-
-        int64_t aligned_data_elem_num =
-            (data_elem_num / vec_elem_num) * vec_elem_num;
-        int64_t i = 0;
-#pragma GCC unroll 4
-        for (; i < aligned_data_elem_num; i += vec_elem_num) {
-          vec_t local_data(thread_data_ptr + i);  // load from cache
-          vec_op::FP32Vec16 local_data_fp32(local_data);
-          vec_op::unroll_loop<int, RANKS - 1>([&](int idx) {
-            vec_t remote_data(
-                true, remote_data_ptrs[idx] + i);  // stream load from shm
-            vec_op::FP32Vec16 remote_data_fp32(remote_data);
-            local_data_fp32 = local_data_fp32 + remote_data_fp32;  // sum reduce
-          });
-          vec_t reduced_data(local_data_fp32);
-          reduced_data.save(thread_data_ptr + i);
-        }
-
-        if (i < data_elem_num) {
-          vec_t local_data(thread_data_ptr + i);  // load from cache
-          vec_op::FP32Vec16 local_data_fp32(local_data);
-          vec_op::unroll_loop<int, RANKS - 1>([&](int idx) {
-            vec_t remote_data(
-                true, remote_data_ptrs[idx] + i);  // stream load from shm
-            vec_op::FP32Vec16 remote_data_fp32(remote_data);
-            local_data_fp32 = local_data_fp32 + remote_data_fp32;  // sum reduce
-          });
-          vec_t reduced_data(local_data_fp32);
-          reduced_data.save(thread_data_ptr + i,
-                            data_elem_num - aligned_data_elem_num);
-        }
-
-        thread_ctx->barrier(ThreadSHMStat::DONE);
-      });
-
-  return;
-}
-};  // namespace shm_cc_ops
-
-std::vector<std::unique_ptr<SHMManager>> SHMManager::SingletonInstances = {};
-std::mutex SHMManager::SingletonInstancesLock = {};
-
-template <typename scalar_t>
-void shm_allreduce_sum(ThreadSHMContext* ctx, scalar_t* data, size_t elem_num) {
-  switch (ctx->group_size) {
-    case 2:
-      shm_cc_ops::all_reduce_sum_impl<scalar_t, 2>(ctx, data, elem_num);
-      break;
-    case 3:
-      shm_cc_ops::all_reduce_sum_impl<scalar_t, 3>(ctx, data, elem_num);
-      break;
-    case 4:
-      shm_cc_ops::all_reduce_sum_impl<scalar_t, 4>(ctx, data, elem_num);
-      break;
-    case 8:
-      shm_cc_ops::all_reduce_sum_impl<scalar_t, 8>(ctx, data, elem_num);
-      break;
-    default:
-      TORCH_CHECK(false,
-                  "Invalid world size: " + std::to_string(ctx->group_size));
-  }
-}
-
-template <typename scalar_t>
-void shm_gather_impl(ThreadSHMContext* ctx, scalar_t* data, size_t elem_num,
-                     scalar_t** outputs, const int dst) {
-  CPU_KERNEL_GUARD_IN(shm_gather_impl)
-  const int worldsize = ctx->group_size;
-  TORCH_CHECK_LT(dst, worldsize);
-  shm_cc_ops::shm_cc_loop<scalar_t>(
-      ctx, elem_num,
-      [&](ThreadSHMContext* thread_ctx, int64_t data_offset,
-          int64_t data_elem_num) {
-        int rank = thread_ctx->rank;
-        scalar_t* thread_shm_ptr =
-            thread_ctx->get_thread_shm_ptr<scalar_t>(rank);
-
-        thread_ctx->barrier(ThreadSHMStat::THREAD_READY);
-
-        shm_cc_ops::memcpy_to_shm(thread_shm_ptr, data + data_offset,
-                                  data_elem_num * sizeof(scalar_t));
-
-        thread_ctx->barrier(ThreadSHMStat::SHM_DATA_READY);
-
-        if (rank == dst) {
-          shm_cc_ops::memcpy(outputs[rank] + data_offset, data + data_offset,
-                             data_elem_num * sizeof(scalar_t));
-          for (int i = 1; i < worldsize; ++i) {
-            int src_rank = thread_ctx->get_swizzled_rank(i);
-            scalar_t* src_ptr =
-                thread_ctx->get_thread_shm_ptr<scalar_t>(src_rank);  // shm
-            scalar_t* dst_ptr = outputs[src_rank] + data_offset;
-            shm_cc_ops::memcpy_from_shm(dst_ptr, src_ptr,
-                                        data_elem_num * sizeof(scalar_t));
-          }
-        }
-
-        thread_ctx->barrier(ThreadSHMStat::DONE);
-      });
-
-  return;
-}
-
-struct MemPiece {
-  void* ptr;
-  int64_t size;
-
-  template <typename T>
-  T* data_ptr() {
-    return reinterpret_cast<T*>(ptr);
-  }
-};
-
-struct TensorListMeta {
-  int64_t tensor_bytes[MAX_P2P_SEND_TENSOR_NUM];
-  torch::ScalarType tensor_types[MAX_P2P_SEND_TENSOR_NUM];
-  int64_t tensor_num;
-  int64_t total_bytes;
-
-  TensorListMeta() : tensor_num(0), total_bytes(0) {
-    static_assert(sizeof(TensorListMeta) % 64 == 0);
-    static_assert(sizeof(TensorListMeta) <
-                  MIN_THREAD_PROCESS_SIZE);  // To ensure the metadata always
-                                             // hold by the thread 0
-    for (int i = 0; i < MAX_P2P_SEND_TENSOR_NUM; ++i) {
-      tensor_bytes[i] = 0;
-      tensor_ptrs[i] = nullptr;
-      tensor_types[i] = torch::ScalarType::Undefined;
-    }
-  }
-
-  // For send and recv
-  void bind_tensor_list(std::vector<torch::Tensor>& tensor_list) {
-    TORCH_CHECK(tensor_types[0] == torch::ScalarType::Undefined,
-                "Re-bind TensorListMeta is not allowed.")
-    TORCH_CHECK_LE(tensor_list.size(), MAX_P2P_SEND_TENSOR_NUM);
-    tensor_num = tensor_list.size();
-    int64_t bytes_sum = 0;
-    for (int i = 0; i < tensor_list.size(); ++i) {
-      torch::Tensor& t = tensor_list[i];
-      TORCH_CHECK(t.is_contiguous());
-      tensor_bytes[i] = t.nbytes();
-      tensor_types[i] = t.scalar_type();
-      tensor_ptrs[i] = t.data_ptr();
-      bytes_sum += t.nbytes();
-    }
-    total_bytes = bytes_sum;
-  }
-
-  // For recv
-  std::vector<torch::Tensor> generate_tensor_list() {
-    std::vector<torch::Tensor> tensor_list;
-    tensor_list.reserve(tensor_num);
-
-    for (int i = 0; i < tensor_num; ++i) {
-      int64_t bytes = tensor_bytes[i];
-      auto type = tensor_types[i];
-      int64_t elem_bytes = torch::elementSize(type);
-
-      TORCH_CHECK_EQ(bytes % elem_bytes, 0);
-      int64_t elem_num = bytes / elem_bytes;
-      auto options = torch::TensorOptions().dtype(type).device(torch::kCPU);
-      tensor_list.emplace_back(torch::empty({elem_num}, options));
-    }
-    return tensor_list;
-  }
-
-  MemPiece get_data(int64_t offset) {
-    for (int i = 0; i < tensor_num; ++i) {
-      if (offset < tensor_bytes[i]) {
-        return {reinterpret_cast<int8_t*>(tensor_ptrs[i]) + offset,
-                tensor_bytes[i] - offset};
-      }
-      offset -= tensor_bytes[i];
-    }
-    return {nullptr, 0};
-  }
-
- private:
-  void* tensor_ptrs[MAX_P2P_SEND_TENSOR_NUM];
-  int8_t _padding[40];
-};
-
-void shm_send_tensor_list_impl(ThreadSHMContext* ctx,
-                               const std::vector<torch::Tensor>& tensor_list) {
-  CPU_KERNEL_GUARD_IN(shm_send_tensor_list_impl)
-  std::vector<torch::Tensor> tensor_list_with_metadata;
-  tensor_list_with_metadata.reserve(1 + tensor_list.size());
-
-  auto options = torch::TensorOptions().dtype(torch::kInt8).device(torch::kCPU);
-  tensor_list_with_metadata.emplace_back(
-      torch::empty({sizeof(TensorListMeta)}, options));
-  tensor_list_with_metadata.insert(tensor_list_with_metadata.end(),
-                                   tensor_list.begin(), tensor_list.end());
-
-  torch::Tensor& metadata_tensor = tensor_list_with_metadata[0];
-  TORCH_CHECK_EQ(metadata_tensor.nbytes(), sizeof(TensorListMeta));
-
-  TensorListMeta* metadata = new (metadata_tensor.data_ptr()) TensorListMeta();
-  metadata->bind_tensor_list(tensor_list_with_metadata);
-
-  shm_cc_ops::shm_cc_loop<int8_t>(
-      ctx, metadata->total_bytes,
-      [&](ThreadSHMContext* thread_ctx, int64_t data_offset,
-          int64_t data_elem_num) {
-        int rank = thread_ctx->rank;
-        // Wait until the receiver set the stat to DONE
-        thread_ctx->wait_for_one(rank, ThreadSHMStat::SHM_DATA_READY);
-
-        int64_t curr_shm_offset = 0;
-        while (curr_shm_offset < data_elem_num) {
-          MemPiece frag = metadata->get_data(data_offset + curr_shm_offset);
-          frag.size = std::min(frag.size, data_elem_num - curr_shm_offset);
-          shm_cc_ops::memcpy(
-              thread_ctx->get_thread_shm_ptr<int8_t>(rank) + curr_shm_offset,
-              frag.ptr, frag.size);
-          curr_shm_offset += frag.size;
-        }
-
-        thread_ctx->set_thread_stat(rank, ThreadSHMStat::SHM_DATA_READY);
-      });
-}
-
-std::vector<torch::Tensor> shm_recv_tensor_list_impl(ThreadSHMContext* ctx,
-                                                     int64_t src) {
-  CPU_KERNEL_GUARD_IN(shm_recv_tensor_list_impl)
-  auto options = torch::TensorOptions().dtype(torch::kInt8).device(torch::kCPU);
-  torch::Tensor metadata_tensor =
-      torch::empty({sizeof(TensorListMeta)}, options);
-
-  // Wait until the sender set the stat of the thread 0 to SHM_DATA_READY
-  ctx->wait_for_one(src, ThreadSHMStat::DONE);
-  shm_cc_ops::memcpy(metadata_tensor.data_ptr(),
-                     ctx->get_thread_shm_ptr<void>(src),
-                     sizeof(TensorListMeta));
-  TensorListMeta* src_metadata =
-      reinterpret_cast<TensorListMeta*>(metadata_tensor.data_ptr());
-  std::vector<torch::Tensor> tensor_list_with_metadata =
-      src_metadata->generate_tensor_list();
-
-  TensorListMeta metadata;
-  metadata.bind_tensor_list(tensor_list_with_metadata);
-  TORCH_CHECK_EQ(metadata.tensor_num, src_metadata->tensor_num);
-  TORCH_CHECK_EQ(metadata.total_bytes, src_metadata->total_bytes);
-
-  shm_cc_ops::shm_cc_loop<int8_t>(
-      ctx, metadata.total_bytes,
-      [&](ThreadSHMContext* thread_ctx, int64_t data_offset,
-          int64_t data_elem_num) {
-        // Wait until the sender set the stat to SHM_DATA_READY
-        thread_ctx->wait_for_one(src, ThreadSHMStat::DONE);
-        int64_t curr_shm_offset = 0;
-        while (curr_shm_offset < data_elem_num) {
-          MemPiece frag = metadata.get_data(data_offset + curr_shm_offset);
-          frag.size = std::min(frag.size, data_elem_num - curr_shm_offset);
-          shm_cc_ops::memcpy(
-              frag.ptr,
-              thread_ctx->get_thread_shm_ptr<int8_t>(src) + curr_shm_offset,
-              frag.size);
-          curr_shm_offset += frag.size;
-        }
-
-        thread_ctx->set_thread_stat(src, ThreadSHMStat::DONE);
-      });
-
-  std::vector<torch::Tensor> tensor_list;
-  tensor_list.reserve(metadata.tensor_num - 1);
-  tensor_list.insert(tensor_list.begin(), tensor_list_with_metadata.begin() + 1,
-                     tensor_list_with_metadata.end());
-
-  return tensor_list;
-}
-}  // namespace
-
-void shm_gather(int64_t handle, torch::Tensor& data,
-                const std::optional<std::vector<torch::Tensor>>& outputs,
-                int64_t dst) {
-  TORCH_CHECK(data.is_contiguous())
-  VLLM_DISPATCH_FLOATING_TYPES(data.scalar_type(), "shm_gather_impl", [&] {
-    CPU_KERNEL_GUARD_IN(shm_gather_impl)
-
-    if (outputs.has_value()) {
-      TORCH_CHECK_LE(outputs->size(), MAX_SHM_RANK_NUM);
-      scalar_t* output_ptrs[MAX_SHM_RANK_NUM] = {nullptr};
-      for (int i = 0; i < outputs->size(); ++i) {
-        output_ptrs[i] = outputs->at(i).data_ptr<scalar_t>();
-      }
-      shm_gather_impl(SHMManager::get_singleton_instance(handle)->get_shm_ctx(),
-                      data.data_ptr<scalar_t>(), data.numel(), output_ptrs,
-                      dst);
-    } else {
-      shm_gather_impl(SHMManager::get_singleton_instance(handle)->get_shm_ctx(),
-                      data.data_ptr<scalar_t>(), data.numel(), (scalar_t**)(0),
-                      dst);
-    }
-
-    CPU_KERNEL_GUARD_OUT(shm_gather_impl)
-  });
-}
-
-void shm_all_gather(int64_t handle, const torch::Tensor& data,
-                    torch::Tensor& output) {
-  TORCH_CHECK(data.is_contiguous())
-  TORCH_CHECK(output.is_contiguous())
-
-  const int64_t input_elem_num = data.numel();
-  const int64_t output_elem_num = output.numel();
-  TORCH_CHECK_EQ(output_elem_num % input_elem_num, 0);
-  const int world_size = output_elem_num / input_elem_num;
-
-  VLLM_DISPATCH_FLOATING_TYPES(data.scalar_type(), "shm_all_gather_impl", [&] {
-    CPU_KERNEL_GUARD_IN(shm_all_gather_impl)
-    auto ctx = SHMManager::get_singleton_instance(handle)->get_shm_ctx();
-    TORCH_CHECK_EQ(ctx->group_size, world_size);
-
-    scalar_t* output_ptrs[MAX_SHM_RANK_NUM] = {nullptr};
-    for (int i = 0; i < world_size; ++i) {
-      output_ptrs[i] = output.data_ptr<scalar_t>() + i * input_elem_num;
-    }
-    shm_gather_impl(ctx, data.data_ptr<scalar_t>(), data.numel(), output_ptrs,
-                    ctx->rank);
-    CPU_KERNEL_GUARD_OUT(shm_all_gather_impl)
-  });
-}
-
-void shm_allreduce(int64_t handle, torch::Tensor& data) {
-  TORCH_CHECK(data.is_contiguous())
-  VLLM_DISPATCH_FLOATING_TYPES(data.scalar_type(), "shm_allreduce_sum", [&] {
-    CPU_KERNEL_GUARD_IN(shm_allreduce_sum)
-    shm_allreduce_sum(SHMManager::get_singleton_instance(handle)->get_shm_ctx(),
-                      data.data_ptr<scalar_t>(), data.numel());
-    CPU_KERNEL_GUARD_OUT(shm_allreduce_sum)
-  });
-}
-
-void shm_send_tensor_list(int64_t handle,
-                          const std::vector<torch::Tensor>& tensor_list,
-                          int64_t dst) {
-  CPU_KERNEL_GUARD_IN(shm_send_tensor_list)
-  shm_send_tensor_list_impl(
-      SHMManager::get_singleton_instance(handle)->get_shm_ctx(), tensor_list);
-  CPU_KERNEL_GUARD_OUT(shm_send_tensor_list)
-}
-
-std::vector<torch::Tensor> shm_recv_tensor_list(int64_t handle, int64_t src) {
-  CPU_KERNEL_GUARD_IN(shm_recv_tensor_list)
-  auto tensor_list = shm_recv_tensor_list_impl(
-      SHMManager::get_singleton_instance(handle)->get_shm_ctx(), src);
-  CPU_KERNEL_GUARD_OUT(shm_recv_tensor_list)
-  return tensor_list;
-}
-
-int64_t init_shm_manager(const std::string& name, const int64_t group_size,
-                         const int64_t rank) {
-  return SHMManager::create_singleton_instance(name, group_size, rank);
-}
-
-std::string join_shm_manager(int64_t handle, const std::string& name) {
-  auto shm_manager = SHMManager::get_singleton_instance(handle);
-  TORCH_CHECK(shm_manager);
-  shm_manager->join(name);
-  return shm_manager->get_shm_ctx()->to_string();
-}
--- a/csrc/cpu/torch_bindings.cpp
+++ b/csrc/cpu/torch_bindings.cpp
@@ -22,26 +22,6 @@ void mla_decode_kvcache(torch::Tensor& out, torch::Tensor& query,
                        torch::Tensor& kv_cache, double scale,
                        torch::Tensor& block_tables, torch::Tensor& seq_lens);

-int64_t init_shm_manager(const std::string& name, const int64_t group_size,
-                         const int64_t rank);
-
-std::string join_shm_manager(int64_t handle, const std::string& name);
-
-void shm_allreduce(int64_t handle, torch::Tensor& data);
-
-void shm_gather(int64_t handle, torch::Tensor& data,
-                const std::optional<std::vector<torch::Tensor>>& outputs,
-                int64_t dst);
-
-void shm_all_gather(int64_t handle, const torch::Tensor& data,
-                    torch::Tensor& output);
-
-void shm_send_tensor_list(int64_t handle,
-                          const std::vector<torch::Tensor>& tensor_list,
-                          int64_t dst);
-
-std::vector<torch::Tensor> shm_recv_tensor_list(int64_t handle, int64_t src);
-
 TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  // vLLM custom ops

@@ -151,29 +131,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
      "                  Tensor? azp, Tensor? bias) -> ()");
  ops.impl("cutlass_scaled_mm_azp", torch::kCPU, &int8_scaled_mm_azp);
 #endif
-
-// SHM CCL
-#ifdef __AVX512F__
-  ops.def("init_shm_manager(str name, int group_size, int rank) -> int",
-          &init_shm_manager);
-  ops.def("join_shm_manager(int handle, str name) -> str", &join_shm_manager);
-  ops.def("shm_allreduce(int handle, Tensor! data) -> ()");
-  ops.impl("shm_allreduce", torch::kCPU, &shm_allreduce);
-  ops.def(
-      "shm_gather(int handle, Tensor data, Tensor[](a!)? outputs, int dst) -> "
-      "()");
-  ops.impl("shm_gather", torch::kCPU, &shm_gather);
-  ops.def(
-      "shm_all_gather(int handle, Tensor data, Tensor! output) -> "
-      "()");
-  ops.impl("shm_all_gather", torch::kCPU, &shm_all_gather);
-  ops.def(
-      "shm_send_tensor_list(int handle, Tensor[](a) tensor_list, int dst) -> "
-      "()");
-  ops.impl("shm_send_tensor_list", torch::kCPU, &shm_send_tensor_list);
-  ops.def("shm_recv_tensor_list(int handle, int src) -> Tensor[](a)",
-          &shm_recv_tensor_list);
-#endif
 }

 TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
--- a/csrc/cpu/utils.cpp
+++ b/csrc/cpu/utils.cpp
@@ -18,7 +18,7 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) {

 #ifndef VLLM_NUMA_DISABLED
 std::string init_cpu_threads_env(const std::string& cpu_ids) {
-  bitmask* omp_cpu_mask = numa_parse_cpustring_all(cpu_ids.c_str());
+  bitmask* omp_cpu_mask = numa_parse_cpustring(cpu_ids.c_str());
  TORCH_CHECK(omp_cpu_mask->size > 0);
  std::vector<int> omp_cpu_ids;
  omp_cpu_ids.reserve(omp_cpu_mask->size);
--- a/csrc/cuda_view.cu
+++ b/csrc/cuda_view.cu
@@ -1,39 +0,0 @@
-#include <torch/all.h>
-#include <torch/cuda.h>
-#include <cuda_runtime.h>
-
-// This function assumes that `cpu_tensor` is a CPU tensor allocated with pinned
-// memory, and that UVA (Unified Virtual Addressing) is enabled.
-torch::Tensor get_cuda_view_from_cpu_tensor(torch::Tensor& cpu_tensor) {
-  TORCH_CHECK(cpu_tensor.device().is_cpu(), "Input tensor must be on CPU");
-
-  // Get raw host pointer from CPU tensor
-  void* host_ptr = cpu_tensor.data_ptr();
-
-  // Get a device pointer corresponding to the pinned host memory
-  void* device_ptr = nullptr;
-  cudaError_t err = cudaHostGetDevicePointer(&device_ptr, host_ptr, 0);
-  TORCH_CHECK(err == cudaSuccess,
-              "cudaHostGetDevicePointer failed: ", cudaGetErrorString(err));
-
-  // We'll use the same sizes, strides, and dtype as the CPU tensor.
-  // TODO: check if layout is respected.
-  auto sizes = cpu_tensor.sizes();
-  auto strides = cpu_tensor.strides();
-  auto options = cpu_tensor.options().device(torch::kCUDA);
-
-  // from_blob signature: from_blob(void *data, IntArrayRef sizes, ..., Deleter,
-  // const TensorOptions &) Provide a no-op deleter. The CPU tensor holds the
-  // memory, so we don't free it here.
-  auto deleter = [](void*) {
-    // no-op, since the memory is owned by the original CPU tensor
-  };
-
-  torch::Tensor cuda_tensor =
-      torch::from_blob(device_ptr, sizes, strides, deleter, options);
-
-  TORCH_CHECK(cuda_tensor.device().is_cuda(),
-              "Resulting tensor is not on CUDA device");
-
-  return cuda_tensor;
-}
--- a/csrc/custom_all_reduce.cu
+++ b/csrc/custom_all_reduce.cu
@@ -12,7 +12,7 @@ static_assert(sizeof(void*) == sizeof(fptr_t));

 fptr_t init_custom_ar(const std::vector<fptr_t>& fake_ipc_ptrs,
                      torch::Tensor& rank_data, int64_t rank,
-                      bool fully_connected) {
+                      bool full_nvlink) {
  int world_size = fake_ipc_ptrs.size();
  if (world_size > 8)
    throw std::invalid_argument("world size > 8 is not supported");
@@ -27,7 +27,7 @@ fptr_t init_custom_ar(const std::vector<fptr_t>& fake_ipc_ptrs,
  }
  return (fptr_t) new vllm::CustomAllreduce(ipc_ptrs, rank_data.data_ptr(),
                                            rank_data.numel(), rank, world_size,
-                                            fully_connected);
+                                            full_nvlink);
 }

 /**
@@ -142,48 +142,3 @@ void register_graph_buffers(fptr_t _fa,
  bytes.reserve(handles.size());
  fa->register_graph_buffers(bytes, offsets);
 }
-
-std::tuple<fptr_t, torch::Tensor> allocate_shared_buffer_and_handle(
-    int64_t size) {
-  auto device_index = c10::cuda::current_device();
-  at::DeviceGuard device_guard(at::Device(at::DeviceType::CUDA, device_index));
-  void* buffer;
-  cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
-  auto stream = c10::cuda::getCurrentCUDAStream().stream();
-  AT_CUDA_CHECK(cudaThreadExchangeStreamCaptureMode(&mode));
-
-  // Allocate buffer
-#if defined(USE_ROCM)
-  // data buffers need to be "uncached" for signal on MI200
-  AT_CUDA_CHECK(
-      hipExtMallocWithFlags((void**)&buffer, size, hipDeviceMallocUncached));
-#else
-  AT_CUDA_CHECK(cudaMalloc((void**)&buffer, size));
-#endif
-  AT_CUDA_CHECK(cudaMemsetAsync(buffer, 0, size, stream));
-  AT_CUDA_CHECK(cudaStreamSynchronize(stream));
-  AT_CUDA_CHECK(cudaThreadExchangeStreamCaptureMode(&mode));
-
-  // Create IPC memhandle for the allocated buffer.
-  // Will use it in open_mem_handle.
-  auto options =
-      torch::TensorOptions().dtype(torch::kUInt8).device(torch::kCPU);
-  auto handle =
-      torch::empty({static_cast<int64_t>(sizeof(cudaIpcMemHandle_t))}, options);
-  AT_CUDA_CHECK(
-      cudaIpcGetMemHandle((cudaIpcMemHandle_t*)handle.data_ptr(), buffer));
-
-  return std::make_tuple(reinterpret_cast<fptr_t>(buffer), handle);
-}
-
-fptr_t open_mem_handle(torch::Tensor& mem_handle) {
-  void* ipc_ptr;
-  AT_CUDA_CHECK(cudaIpcOpenMemHandle(
-      (void**)&ipc_ptr, *((const cudaIpcMemHandle_t*)mem_handle.data_ptr()),
-      cudaIpcMemLazyEnablePeerAccess));
-  return reinterpret_cast<fptr_t>(ipc_ptr);
-}
-
-void free_shared_buffer(fptr_t buffer) {
-  AT_CUDA_CHECK(cudaFree(reinterpret_cast<void*>(buffer)));
-}
--- a/csrc/custom_all_reduce.cuh
+++ b/csrc/custom_all_reduce.cuh
@@ -5,10 +5,6 @@
 #include <cuda_fp16.h>
 #include <cuda_runtime.h>

-#if defined(USE_ROCM)
-typedef __hip_bfloat16 nv_bfloat16;
-#endif
-
 #include <iostream>
 #include <array>
 #include <limits>
@@ -16,7 +12,6 @@ typedef __hip_bfloat16 nv_bfloat16;
 #include <unordered_map>
 #include <vector>

-namespace vllm {
 #define CUDACHECK(cmd)                                              \
  do {                                                              \
    cudaError_t e = cmd;                                            \
@@ -27,37 +22,24 @@ namespace vllm {
    }                                                               \
  } while (0)

-// Maximal number of blocks in allreduce kernel.
+namespace vllm {
+
 constexpr int kMaxBlocks = 36;
-
-// Default number of blocks in allreduce kernel.
-#ifndef USE_ROCM
-const int defaultBlockLimit = 36;
-CUpointer_attribute rangeStartAddrAttr = CU_POINTER_ATTRIBUTE_RANGE_START_ADDR;
-#else
-const int defaultBlockLimit = 16;
-hipPointer_attribute rangeStartAddrAttr =
-    HIP_POINTER_ATTRIBUTE_RANGE_START_ADDR;
-#endif
-
 // Counter may overflow, but it's fine since unsigned int overflow is
 // well-defined behavior.
 using FlagType = uint32_t;
-
-// Two sets of peer counters are needed for two syncs: starting and ending an
-// operation. The reason is that it's possible for peer GPU block to arrive at
-// the second sync point while the current GPU block haven't passed the first
-// sync point. Thus, peer GPU may write counter+1 while current GPU is busy
-// waiting for counter. We use alternating counter array to avoid this
-// possibility.
 struct Signal {
-  alignas(128) FlagType start[kMaxBlocks][8];
-  alignas(128) FlagType end[kMaxBlocks][8];
-  alignas(128) FlagType _flag[kMaxBlocks];  // incremental flags for each rank
+  alignas(128) FlagType self_counter[kMaxBlocks][8];
+  // Two sets of peer counters are needed for two syncs. The reason is that
+  // it's possible for a peer GPU block to arrive at the second sync point
+  // while the current GPU block hasn't passed the first sync point. Thus, the
+  // peer GPU may write counter+1 while the current GPU is busy waiting for
+  // counter. We use alternating counter arrays to avoid this possibility.
+  alignas(128) FlagType peer_counter[2][kMaxBlocks][8];
 };

 struct __align__(16) RankData {
-  const void* ptrs[8];
+  const void* __restrict__ ptrs[8];
 };

 struct __align__(16) RankSignals {
@@ -152,29 +134,27 @@ DINLINE O downcast(array_t<float, O::size> val) {
  }
 }

-#if !defined(USE_ROCM)
-
 static DINLINE void st_flag_release(FlagType* flag_addr, FlagType flag) {
-  #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
  asm volatile("st.release.sys.global.u32 [%1], %0;" ::"r"(flag),
               "l"(flag_addr));
-  #else
+#else
  asm volatile("membar.sys; st.volatile.global.u32 [%1], %0;" ::"r"(flag),
               "l"(flag_addr));
-  #endif
+#endif
 }

 static DINLINE FlagType ld_flag_acquire(FlagType* flag_addr) {
  FlagType flag;
-  #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
  asm volatile("ld.acquire.sys.global.u32 %0, [%1];"
               : "=r"(flag)
               : "l"(flag_addr));
-  #else
+#else
  asm volatile("ld.volatile.global.u32 %0, [%1]; membar.gl;"
               : "=r"(flag)
               : "l"(flag_addr));
-  #endif
+#endif
  return flag;
 }

@@ -190,99 +170,37 @@ static DINLINE FlagType ld_flag_volatile(FlagType* flag_addr) {
  return flag;
 }

-// This function is meant to be used as the first synchronization in the all
-// reduce kernel. Thus, it doesn't need to make any visibility guarantees for
-// prior memory accesses. Note: volatile writes will not be reordered against
-// other volatile writes.
-template <int ngpus>
-DINLINE void barrier_at_start(const RankSignals& sg, Signal* self_sg,
-                              int rank) {
-  uint32_t flag = self_sg->_flag[blockIdx.x] + 1;
+// is_start: whether this is the very first synchronization barrier.
+// need_fence: whether a memory fence is needed. If true, a release-acquire
+// semantic is used to enforce memory access order before and after this
+// barrier.
+template <int ngpus, bool is_start, bool need_fence = false>
+DINLINE void multi_gpu_barrier(const RankSignals& sg, Signal* self_sg,
+                               int rank) {
+  if constexpr (!is_start) __syncthreads();
+  static_assert(
+      !(is_start && need_fence));  // Start barrier shouldn't need fence.
  if (threadIdx.x < ngpus) {
-    auto peer_counter_ptr = &sg.signals[threadIdx.x]->start[blockIdx.x][rank];
-    auto self_counter_ptr = &self_sg->start[blockIdx.x][threadIdx.x];
-    // Write the expected counter value to peer and wait for correct value
-    // from peer.
-    st_flag_volatile(peer_counter_ptr, flag);
-    while (ld_flag_volatile(self_counter_ptr) != flag);
-  }
-  __syncthreads();
-  // use one thread to update flag
-  if (threadIdx.x == 0) self_sg->_flag[blockIdx.x] = flag;
-}
-
-// This function is meant to be used as the second or the final
-// synchronization barrier in the all reduce kernel. If it's the final
-// synchronization barrier, we don't need to make any visibility guarantees
-// for prior memory accesses.
-template <int ngpus, bool final_sync = false>
-DINLINE void barrier_at_end(const RankSignals& sg, Signal* self_sg, int rank) {
-  __syncthreads();
-  uint32_t flag = self_sg->_flag[blockIdx.x] + 1;
-  if (threadIdx.x < ngpus) {
-    auto peer_counter_ptr = &sg.signals[threadIdx.x]->end[blockIdx.x][rank];
-    auto self_counter_ptr = &self_sg->end[blockIdx.x][threadIdx.x];
+    // Increment the counter. Technically we only need one counter, but we use
+    // multiple per block to eliminate the need to share the counter via smem.
+    auto val = self_sg->self_counter[blockIdx.x][threadIdx.x] += 1;
    // Write the expected counter value to peer and wait for correct value from
    // peer.
-    if constexpr (!final_sync) {
-      st_flag_release(peer_counter_ptr, flag);
-      while (ld_flag_acquire(self_counter_ptr) != flag);
+    auto peer_counter_ptr =
+        &sg.signals[threadIdx.x]->peer_counter[val % 2][blockIdx.x][rank];
+    auto self_counter_ptr =
+        &self_sg->peer_counter[val % 2][blockIdx.x][threadIdx.x];
+    if constexpr (need_fence) {
+      st_flag_release(peer_counter_ptr, val);
+      while (ld_flag_acquire(self_counter_ptr) != val);
    } else {
-      st_flag_volatile(peer_counter_ptr, flag);
-      while (ld_flag_volatile(self_counter_ptr) != flag);
+      st_flag_volatile(peer_counter_ptr, val);
+      while (ld_flag_volatile(self_counter_ptr) != val);
    }
  }
-  if constexpr (!final_sync) __syncthreads();
-
-  // use one thread to update flag
-  if (threadIdx.x == 0) self_sg->_flag[blockIdx.x] = flag;
+  if constexpr (is_start || need_fence) __syncthreads();
 }

-#else
-
-template <int ngpus>
-DINLINE void barrier_at_start(const RankSignals& sg, Signal* self_sg,
-                              int rank) {
-  uint32_t flag = self_sg->_flag[blockIdx.x] + 1;
-  if (threadIdx.x < ngpus) {
-    // simultaneously write to the corresponding flag of all ranks.
-    // Latency = 1 p2p write
-    __scoped_atomic_store_n(&sg.signals[threadIdx.x]->start[blockIdx.x][rank],
-                            flag, __ATOMIC_RELAXED, __MEMORY_SCOPE_SYSTEM);
-    // wait until we got true from all ranks
-    while (__scoped_atomic_load_n(&self_sg->start[blockIdx.x][threadIdx.x],
-                                  __ATOMIC_RELAXED,
-                                  __MEMORY_SCOPE_DEVICE) < flag);
-  }
-  __syncthreads();
-  // use one thread to update flag
-  if (threadIdx.x == 0) self_sg->_flag[blockIdx.x] = flag;
-}
-
-template <int ngpus, bool final_sync = false>
-DINLINE void barrier_at_end(const RankSignals& sg, Signal* self_sg, int rank) {
-  __syncthreads();
-  uint32_t flag = self_sg->_flag[blockIdx.x] + 1;
-  if (threadIdx.x < ngpus) {
-    // simultaneously write to the corresponding flag of all ranks.
-    // Latency = 1 p2p write
-    __scoped_atomic_store_n(&sg.signals[threadIdx.x]->end[blockIdx.x][rank],
-                            flag,
-                            final_sync ? __ATOMIC_RELAXED : __ATOMIC_RELEASE,
-                            __MEMORY_SCOPE_SYSTEM);
-    // wait until we got true from all ranks
-    while (
-        __scoped_atomic_load_n(&self_sg->end[blockIdx.x][threadIdx.x],
-                               final_sync ? __ATOMIC_RELAXED : __ATOMIC_ACQUIRE,
-                               __MEMORY_SCOPE_DEVICE) < flag);
-  }
-  if constexpr (!final_sync) __syncthreads();
-  // use one thread to update flag
-  if (threadIdx.x == 0) self_sg->_flag[blockIdx.x] = flag;
-}
-
-#endif
-
 template <typename P, int ngpus, typename A>
 DINLINE P packed_reduce(const P* ptrs[], int idx) {
  A tmp = upcast(ptrs[0][idx]);
@ -302,13 +220,13 @@ __global__ void __launch_bounds__(512, 1)
  // note: we don't reorder the address so the accumulation order is the same
  // for all ranks, ensuring bitwise identical results
  auto dp = *_dp;
-  barrier_at_start<ngpus>(sg, self_sg, rank);
+  multi_gpu_barrier<ngpus, true>(sg, self_sg, rank);
  // do the actual reduction
  for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size;
       idx += gridDim.x * blockDim.x) {
    ((P*)result)[idx] = packed_reduce<P, ngpus, A>((const P**)&dp.ptrs[0], idx);
  }
-  barrier_at_end<ngpus, true>(sg, self_sg, rank);
+  multi_gpu_barrier<ngpus, false>(sg, self_sg, rank);
 }

 template <typename P>
@ -337,20 +255,18 @@ __global__ void __launch_bounds__(512, 1)
    tmps[i] = get_tmp_buf<P>(sg.signals[target]);
  }
  auto tmp_out = tmps[0];
-  barrier_at_start<ngpus>(sg, self_sg, rank);
-
+  multi_gpu_barrier<ngpus, true>(sg, self_sg, rank);
  // stage 1: reduce scatter
  for (int idx = start + tid; idx < end; idx += stride) {
    tmp_out[idx - start] = packed_reduce<P, ngpus, A>(ptrs, idx);
  }
-  barrier_at_end<ngpus>(sg, self_sg, rank);
+  multi_gpu_barrier<ngpus, false, true>(sg, self_sg, rank);

  // stage 2: allgather. Note: it's important to match the tid between
  // the two stages, because visibility across devices is only guaranteed
  // between threads that have the same tid. If thread i computes the sum of
-  // start + i in the first stage, then thread i also gathers start + i from
-  // all ranks.
-
+  // start + i in the first stage, then thread i also gathers start + i from all
+  // ranks.
  for (int idx = tid; idx < largest_part; idx += stride) {
 #pragma unroll
    for (int i = 0; i < ngpus; i++) {
@ -371,22 +287,21 @@ class CustomAllreduce {
 public:
  int rank_;
  int world_size_;
-  // Full NVLink or xGMI connection between GPUs.
-  bool fully_connected_;
+  bool full_nvlink_;

  RankSignals sg_;
-  // Stores an map from a pointer to its peer pointers from all ranks.
+  // Stores a map from a pointer to its peer pointers from all ranks.
  std::unordered_map<void*, RankData*> buffers_;
  Signal* self_sg_;

  // Stores rank data from all ranks. This is mainly for cuda graph purposes.
  // For cuda graph to work, all kernel arguments must be fixed during graph
-  // capture time. However, the peer pointers are not known during graph
-  // capture time. Therefore, during capture, we increment the rank data
-  // pointer and use that as the argument to the kernel. The kernel arguments
-  // are stored in graph_unreg_buffers_. The actual peer pointers will be
-  // filled in at the memory pointed to by the pointers in
-  // graph_unreg_buffers_ when the IPC handles are exchanged between ranks.
+  // capture time. However, the peer pointers are not known during graph capture
+  // time. Therefore, during capture, we increment the rank data pointer and use
+  // that as the argument to the kernel. The kernel arguments are stored in
+  // graph_unreg_buffers_. The actual peer pointers will be filled in at the
+  // memory pointed to by the pointers in graph_unreg_buffers_ when
+  // the IPC handles are exchanged between ranks.
  //
  // The overall process looks like this:
  // 1. Graph capture.
@ -404,18 +319,17 @@ class CustomAllreduce {
   * Signals are an array of ipc-enabled buffers from all ranks.
   * For each of the buffer, the layout is as follows:
   * | -- sizeof(Signal) -- | ------ a few MB ----- |
-   * The first section is for allreduce synchronization, and the second
-   * section is for storing the intermediate results required by some
-   * allreduce algos.
+   * The first section is for allreduce synchronization, and the second section
+   * is for storing the intermediate results required by some allreduce algos.
   *
   * Note: this class does not own any device memory. Any required buffers
   * are passed in from the constructor.
   */
  CustomAllreduce(Signal** signals, void* rank_data, size_t rank_data_sz,
-                  int rank, int world_size, bool fully_connected = true)
+                  int rank, int world_size, bool full_nvlink = true)
      : rank_(rank),
        world_size_(world_size),
-        fully_connected_(fully_connected),
+        full_nvlink_(full_nvlink),
        self_sg_(signals[rank]),
        d_rank_data_base_(reinterpret_cast<RankData*>(rank_data)),
        d_rank_data_end_(d_rank_data_base_ + rank_data_sz / sizeof(RankData)) {
@ -447,7 +361,8 @@ class CustomAllreduce {
      void* base_ptr;
      // note: must share the base address of each allocation, or we get wrong
      // address
-      if (cuPointerGetAttribute(&base_ptr, rangeStartAddrAttr,
+      if (cuPointerGetAttribute(&base_ptr,
+                                CU_POINTER_ATTRIBUTE_RANGE_START_ADDR,
                                (CUdeviceptr)ptr) != CUDA_SUCCESS)
        throw std::runtime_error("failed to get pointer attr");
      CUDACHECK(cudaIpcGetMemHandle(
@ -481,11 +396,11 @@ class CustomAllreduce {

  // Note: when registering graph buffers, we intentionally choose to not
  // deduplicate the addresses. That means if the allocator reuses some
-  // addresses, they will be registered again. This is to account for the
-  // remote possibility of different allocation patterns between ranks. For
-  // example, rank 1 may get the same input address for the second allreduce,
-  // but rank 2 got a different address. IPC handles have internal reference
-  // counting mechanism so overhead should be small.
+  // addresses, they will be registered again. This is to account for the remote
+  // possibility of different allocation patterns between ranks. For example,
+  // rank 1 may get the same input address for the second allreduce, but rank 2
+  // got a different address. IPC handles have internal reference counting
+  // mechanism so overhead should be small.
  void register_graph_buffers(
      const std::vector<std::string>& handles,
      const std::vector<std::vector<int64_t>>& offsets) {
@ -516,15 +431,15 @@ class CustomAllreduce {
  /**
   * Performs allreduce, assuming input has already been registered.
   *
-   * Block and grid default configs are results after careful grid search.
-   * Using 36 blocks give the best or close to the best runtime on the devices
-   * I tried: A100, A10, A30, T4, V100. You'll notice that NCCL kernels also
-   * only take a small amount of SMs. Not quite sure the underlying reason,
-   * but my guess is that too many SMs will cause contention on NVLink bus.
+   * Block and grid default configs are results after careful grid search. Using
+   * 36 blocks gives the best or close to the best runtime on the devices I
+   * tried: A100, A10, A30, T4, V100. You'll notice that NCCL kernels also only
+   * take a small amount of SMs. Not quite sure the underlying reason, but my
+   * guess is that too many SMs will cause contention on NVLink bus.
   */
  template <typename T>
  void allreduce(cudaStream_t stream, T* input, T* output, int size,
-                 int threads = 512, int block_limit = defaultBlockLimit) {
+                 int threads = 512, int block_limit = 36) {
    auto d = packed_t<T>::P::size;
    if (size % d != 0)
      throw std::runtime_error(
@ -558,11 +473,13 @@ class CustomAllreduce {
 #define KL(ngpus, name)                                                       \
  name<T, ngpus><<<blocks, threads, 0, stream>>>(ptrs, sg_, self_sg_, output, \
                                                 rank_, size);
+    // TODO(hanzhi713): Threshold is different for A100 and H100.
+    // Add per device threshold.
 #define REDUCE_CASE(ngpus)                            \
  case ngpus: {                                       \
    if (world_size_ == 2) {                           \
      KL(ngpus, cross_device_reduce_1stage);          \
-    } else if (fully_connected_) {                    \
+    } else if (full_nvlink_) {                        \
      if ((world_size_ <= 4 && bytes < 512 * 1024) || \
          (world_size_ <= 8 && bytes < 256 * 1024)) { \
        KL(ngpus, cross_device_reduce_1stage);        \
@ -580,8 +497,7 @@ class CustomAllreduce {
      REDUCE_CASE(8)
      default:
        throw std::runtime_error(
-            "custom allreduce only supports num gpus in (2,4,6,8). Actual "
-            "num "
+            "custom allreduce only supports num gpus in (2,4,6,8). Actual num "
            "gpus = " +
            std::to_string(world_size_));
    }
@ -595,11 +511,10 @@ class CustomAllreduce {
    }
  }
 };
-
 /**
- * To inspect PTX/SASS, copy paste this header file to compiler explorer and
- add a template instantiation:
+ * To inspect PTX/SASS, copy paste this header file to compiler explorer and add
+ a template instantiation:
 * template void vllm::CustomAllreduce::allreduce<half>(cudaStream_t, half *,
 half *, int, int, int);
 */
-}  // namespace vllm
+}  // namespace vllm
--- a/csrc/custom_all_reduce_test.cu
+++ b/csrc/custom_all_reduce_test.cu
@ -1,9 +1,9 @@
 /**
 * This is a standalone test for custom allreduce.
 * To compile, make sure you have MPI and NCCL installed in your system.
- * export MPI_HOME=XXX
+ * export MPI_HOME=xxx
 * nvcc -O2 -arch=native -std=c++17 custom_all_reduce_test.cu -o
- * custom_all_reduce_test -lnccl -I${MPI_HOME}/include -lmpi
+ * custom_all_reduce_test -lnccl -I${MPI_HOME} -lmpi
 *
 * Warning: this C++ test is not designed to be very readable and was used
 * during the rapid prototyping process.
@ -22,15 +22,7 @@
 #include "cuda_profiler_api.h"
 #include "custom_all_reduce.cuh"
 #include "mpi.h"
-#ifdef USE_ROCM
-  #include <hip/hip_bf16.h>
-typedef __hip_bfloat16 nv_bfloat16;
-  #include "rccl/rccl.h"
-  #include "custom_all_reduce_hip.cuh"
-#else
-  #include "nccl.h"
-  #include "custom_all_reduce.cuh"
-#endif
+#include "nccl.h"

 #define MPICHECK(cmd)                                                  \
  do {                                                                 \
@ -51,29 +43,16 @@ typedef __hip_bfloat16 nv_bfloat16;
    }                                                               \
  } while (0)

-#ifdef USE_ROCM
 __global__ void dummy_kernel() {
-  for (int i = 0; i < 100; i++) {
-    uint64_t start = wall_clock64();
-    uint64_t cycles_elapsed;
-    do {
-      cycles_elapsed = wall_clock64() - start;
-    } while (cycles_elapsed < 100);
-  }
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
  for (int i = 0; i < 100; i++) __nanosleep(1000000);  // 100ms
-}
 #else
-__global__ void dummy_kernel() {
-  #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
-  for (int i = 0; i < 100; i++) __nanosleep(1000000);  // 100ms
-  #else
  for (int i = 0; i < 100; i++) {
    long long int start = clock64();
    while (clock64() - start < 150000000);  // approximately 98.4ms on P40
  }
-  #endif
-}
 #endif
+}

 template <typename T>
 __global__ void set_data(T* data, int size, int myRank) {
@ -142,14 +121,8 @@ void run(int myRank, int nRanks, ncclComm_t& comm, int threads, int block_limit,
   * registration, they are allocated and registered together in the test for
   * convenience.
   */
-#ifdef USE_ROCM
-  CUDACHECK(hipExtMallocWithFlags(
-      (void**)&buffer, 2 * data_size * sizeof(T) + sizeof(vllm::Signal),
-      hipDeviceMallocUncached));
-#else
  CUDACHECK(
      cudaMalloc(&buffer, 2 * data_size * sizeof(T) + sizeof(vllm::Signal)));
-#endif
  CUDACHECK(
      cudaMemset(buffer, 0, 2 * data_size * sizeof(T) + sizeof(vllm::Signal)));
  CUDACHECK(cudaMalloc(&self_data_copy, data_size * sizeof(T)));
@ -338,18 +311,13 @@ int main(int argc, char** argv) {

  bool performance_test = true;
  cudaProfilerStart();
-// Uncomment to scan through different block size configs.
-// for (int threads : {256, 512, 1024}) {
-//   for (int block_limit = 16; block_limit < 112; block_limit += 4) {
-//     run<half>(myRank, nRanks, comm, threads, block_limit, 1024 * 1024,
-//     performance_test);
-//   }
-// }
-#ifdef USE_ROCM
-  const int block_limit = 16;
-#else
-  const int block_limit = 36;
-#endif
+  // Uncomment to scan through different block size configs.
+  // for (int threads : {256, 512, 1024}) {
+  //   for (int block_limit = 16; block_limit < 112; block_limit += 4) {
+  //     run<half>(myRank, nRanks, comm, threads, block_limit, 1024 * 1024,
+  //     performance_test);
+  //   }
+  // }
  // Scan through different sizes to test performance.
  for (int sz = 512; sz <= (8 << 20); sz *= 2) {
    run<half>(myRank, nRanks, comm, 512, 36, sz + 8 * 47, performance_test);
@ -358,4 +326,4 @@ int main(int argc, char** argv) {
  cudaProfilerStop();
  MPICHECK(MPI_Finalize());
  return EXIT_SUCCESS;
-}
+}
--- a/csrc/ops.h
+++ b/csrc/ops.h
@ -119,8 +119,6 @@ void advance_step_flashinfer(
    torch::Tensor& paged_kv_indices, torch::Tensor& paged_kv_indptr,
    torch::Tensor& paged_kv_last_page_len, torch::Tensor& block_table_bounds);

-torch::Tensor get_cuda_view_from_cpu_tensor(torch::Tensor& cpu_tensor);
-
 #ifndef USE_ROCM
 torch::Tensor aqlm_gemm(const torch::Tensor& input, const torch::Tensor& codes,
                        const torch::Tensor& codebooks,
@ -145,8 +143,7 @@ torch::Tensor permute_cols(torch::Tensor const& A, torch::Tensor const& perm);
 #endif

 torch::Tensor ggml_dequantize(torch::Tensor W, int64_t type, int64_t m,
-                              int64_t n,
-                              std::optional<at::ScalarType> const& dtype);
+                              int64_t n);

 torch::Tensor ggml_mul_mat_vec_a8(torch::Tensor W, torch::Tensor X,
                                  int64_t type, int64_t row);
@ -268,10 +265,10 @@ void causal_conv1d_fwd(const at::Tensor& x, const at::Tensor& weight,
                       const std::optional<at::Tensor>& has_initial_state,
                       bool silu_activation, int64_t pad_slot_id);

+#ifndef USE_ROCM
 using fptr_t = int64_t;
 fptr_t init_custom_ar(const std::vector<int64_t>& fake_ipc_ptrs,
-                      torch::Tensor& rank_data, int64_t rank,
-                      bool fully_connected);
+                      torch::Tensor& rank_data, int64_t rank, bool full_nvlink);
 void all_reduce(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out,
                fptr_t reg_buffer, int64_t reg_buffer_sz_bytes);
 void dispose(fptr_t _fa);
@ -282,7 +279,4 @@ get_graph_buffer_ipc_meta(fptr_t _fa);
 void register_graph_buffers(fptr_t _fa,
                            const std::vector<std::vector<int64_t>>& handles,
                            const std::vector<std::vector<int64_t>>& offsets);
-std::tuple<int64_t, torch::Tensor> allocate_shared_buffer_and_handle(
-    int64_t size);
-int64_t open_mem_handle(torch::Tensor& mem_handle);
-void free_shared_buffer(int64_t buffer);
+#endif
--- a/csrc/quantization/fp8/common.cu
+++ b/csrc/quantization/fp8/common.cu
@ -30,6 +30,9 @@ __global__ void dynamic_per_token_scaled_fp8_quant_kernel(
    fp8_type* __restrict__ out, float* __restrict__ scale,
    scalar_t const* __restrict__ input, float const* __restrict__ scale_ub,
    const int hidden_size) {
+  float const min_scaling_factor =
+      1.0f / (fp8_e4m3_adjusted_max_v<fp8_type> * 512.f);
+
  int const tid = threadIdx.x;
  int const token_idx = blockIdx.x;

@ -64,8 +67,8 @@ __global__ void dynamic_per_token_scaled_fp8_quant_kernel(
      token_scale = block_absmax_val_maybe;
    }
    // token scale computation
-    token_scale = max(token_scale / quant_type_max_v<fp8_type>,
-                      min_scaling_factor<fp8_type>::val());
+    token_scale = max(token_scale / fp8_e4m3_adjusted_max_v<fp8_type>,
+                      min_scaling_factor);
    scale[token_idx] = token_scale;
  }
  __syncthreads();
--- a/csrc/quantization/fp8/common.cuh
+++ b/csrc/quantization/fp8/common.cuh
@ -1,12 +1,20 @@
 #pragma once

 #include "quantization/vectorization.cuh"
-#include "quantization/utils.cuh"

 #include <cmath>
+#include <c10/core/ScalarType.h>

-#ifdef USE_ROCM
+#ifndef USE_ROCM
+  #include <c10/util/Float8_e4m3fn.h>
+  #define MAYBE_HOST_DEVICE C10_HOST_DEVICE
+#else
+  #include <ATen/hip/HIPContext.h>
+  #include <c10/util/Float8_e4m3fn.h>
+  #include <c10/util/Float8_e4m3fnuz.h>
  #include "amd/quant_utils.cuh"
+  // ROCm doesn't seem to need C10_HOST_DEVICE for static constexpr
+  #define MAYBE_HOST_DEVICE
 #endif

 // Determines the preferred FP8 type for the current platform.
@ -23,6 +31,29 @@ static bool is_fp8_ocp() {
 #endif
 }

+template <typename T>
+struct fp8_e4m3_adjusted_max;
+
+template <>
+struct fp8_e4m3_adjusted_max<c10::Float8_e4m3fn> {
+  static constexpr c10::Float8_e4m3fn val() {
+    return std::numeric_limits<c10::Float8_e4m3fn>::max();
+  }
+};
+
+// Using the default max value from pytorch (240.0 0x7F) will cause accuracy
+// issues when running dynamic quantization. Here use 224.0 0x7E for rocm.
+template <>
+struct fp8_e4m3_adjusted_max<c10::Float8_e4m3fnuz> {
+  static constexpr c10::Float8_e4m3fnuz val() {
+    return c10::Float8_e4m3fnuz(0x7E, c10::Float8_e4m3fnuz::from_bits());
+  }
+};
+
+template <typename T>
+MAYBE_HOST_DEVICE static constexpr T fp8_e4m3_adjusted_max_v =
+    fp8_e4m3_adjusted_max<T>::val();
+
 namespace vllm {

 __device__ __forceinline__ float atomicMaxFloat(float* addr, float value) {
@ -45,8 +76,8 @@ __device__ __forceinline__ fp8_type scaled_fp8_conversion(float const val,
    x = val / scale;
  }

-  float r =
-      fmax(-quant_type_max_v<fp8_type>, fmin(x, quant_type_max_v<fp8_type>));
+  float r = fmax(-fp8_e4m3_adjusted_max_v<fp8_type>,
+                 fmin(x, fp8_e4m3_adjusted_max_v<fp8_type>));
 #ifndef USE_ROCM
  return static_cast<fp8_type>(r);
 #else
@ -92,7 +123,7 @@ __global__ void segmented_max_reduction(float* __restrict__ scale,
  // Finally, since cache[0] contains the maximum for this thread block,
  // atomically write the max to the target location
  if (threadIdx.x == 0) {
-    atomicMaxFloat(scale, cache[0] / quant_type_max_v<fp8_type>);
+    atomicMaxFloat(scale, cache[0] / fp8_e4m3_adjusted_max_v<fp8_type>);
  }
 }

--- a/csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu
+++ b/csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu
@ -14,7 +14,8 @@ __device__ void rms_norm_dynamic_per_token_quant_vec(
    float* __restrict__ scales,           // [num_tokens]
    scalar_t const* __restrict__ input,   // [..., hidden_size]
    scalar_t const* __restrict__ weight,  // [hidden_size]
-    float const* scale_ub, float const var_epsilon, int32_t const hidden_size,
+    float const* scale_ub, float const var_epsilon,
+    float const min_scaling_factor, int32_t const hidden_size,
    scalar_t* __restrict__ residual = nullptr) {
  float rms = 0.0f;
  float token_scale = 0.0f;
@ -26,8 +27,8 @@ __device__ void rms_norm_dynamic_per_token_quant_vec(
  // Compute scale
  vllm::vectorized::compute_dynamic_per_token_scales<scalar_t, scalar_out_t,
                                                     has_residual>(
-      &token_scale, scales, input, weight, rms, scale_ub, hidden_size,
-      residual);
+      &token_scale, scales, input, weight, rms, scale_ub, min_scaling_factor,
+      hidden_size, residual);

  // RMS Norm + Quant
  if constexpr (std::is_same_v<scalar_out_t, int8_t>) {
@ -49,7 +50,8 @@ __global__ void rms_norm_dynamic_per_token_quant_kernel(
    float* __restrict__ scales,           // [num_tokens]
    scalar_t const* __restrict__ input,   // [..., hidden_size]
    scalar_t const* __restrict__ weight,  // [hidden_size]
-    float const* scale_ub, float const var_epsilon, int32_t const hidden_size,
+    float const* scale_ub, float const var_epsilon,
+    float const min_scaling_factor, int32_t const hidden_size,
    scalar_t* __restrict__ residual = nullptr) {
  // For vectorization, token_input and token_output pointers need to be
  // aligned at 8-byte and 4-byte addresses respectively.
@ -58,8 +60,8 @@ __global__ void rms_norm_dynamic_per_token_quant_kernel(
  if (can_vectorize) {
    return rms_norm_dynamic_per_token_quant_vec<scalar_t, scalar_out_t,
                                                has_residual>(
-        out, scales, input, weight, scale_ub, var_epsilon, hidden_size,
-        residual);
+        out, scales, input, weight, scale_ub, var_epsilon, min_scaling_factor,
+        hidden_size, residual);
  }

  float rms = 0.0f;
@ -70,8 +72,8 @@ __global__ void rms_norm_dynamic_per_token_quant_kernel(
                                            var_epsilon, residual);
  // Compute Scale
  vllm::compute_dynamic_per_token_scales<scalar_t, scalar_out_t, has_residual>(
-      &token_scale, scales, input, weight, rms, scale_ub, hidden_size,
-      residual);
+      &token_scale, scales, input, weight, rms, scale_ub, min_scaling_factor,
+      hidden_size, residual);

  // RMS Norm + Quant
  if constexpr (std::is_same_v<scalar_out_t, int8_t>) {
@ -103,6 +105,11 @@ void rms_norm_dynamic_per_token_quant_dispatch(
  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();

+  const float min_scaling_factor =
+      out.dtype() == torch::kInt8
+          ? std::numeric_limits<float>::epsilon()
+          : 1.0f / (std::numeric_limits<c10::Float8_e4m3fn>::max() * 512.f);
+
  if (residual.has_value()) {
    VLLM_DISPATCH_QUANT_TYPES(
        out.scalar_type(), "rms_norm_dynamic_per_token_quant_kernel", [&] {
@ -112,7 +119,8 @@ void rms_norm_dynamic_per_token_quant_dispatch(
                  out.data_ptr<scalar_t>(), scales.data_ptr<float>(),
                  input.data_ptr<scalar_in_t>(), weight.data_ptr<scalar_in_t>(),
                  scale_ub.has_value() ? scale_ub->data_ptr<float>() : nullptr,
-                  var_epsilon, hidden_size, residual->data_ptr<scalar_in_t>());
+                  var_epsilon, min_scaling_factor, hidden_size,
+                  residual->data_ptr<scalar_in_t>());
        });

  } else {
@ -124,7 +132,7 @@ void rms_norm_dynamic_per_token_quant_dispatch(
                  out.data_ptr<scalar_t>(), scales.data_ptr<float>(),
                  input.data_ptr<scalar_in_t>(), weight.data_ptr<scalar_in_t>(),
                  scale_ub.has_value() ? scale_ub->data_ptr<float>() : nullptr,
-                  var_epsilon, hidden_size, nullptr);
+                  var_epsilon, min_scaling_factor, hidden_size, nullptr);
        });
  }
 }
--- a/csrc/quantization/fused_kernels/layernorm_utils.cuh
+++ b/csrc/quantization/fused_kernels/layernorm_utils.cuh
@ -5,7 +5,6 @@
 */

 #include "quantization/vectorization.cuh"
-#include "quantization/utils.cuh"
 #include "quant_conversions.cuh"

 #ifndef USE_ROCM
@ -52,11 +51,11 @@ __device__ void compute_dynamic_per_token_scales(
    float* __restrict__ token_scale, float* __restrict__ all_token_scales,
    scalar_t const* __restrict__ input, scalar_t const* __restrict__ weight,
    float const rms, float const* __restrict__ scale_ub,
-    int32_t const hidden_size,
+    float const min_scaling_factor, int32_t const hidden_size,
    scalar_t const* __restrict__ residual = nullptr) {
  int64_t const token_offset = blockIdx.x * static_cast<int64_t>(hidden_size);
  ;
-  constexpr scalar_out_t qmax{quant_type_max_v<scalar_out_t>};
+  constexpr scalar_out_t qmax{std::numeric_limits<scalar_out_t>::max()};

  float block_absmax_val_maybe = 0.0f;
  for (auto i = threadIdx.x; i < hidden_size; i += blockDim.x) {
@ -84,7 +83,7 @@ __device__ void compute_dynamic_per_token_scales(
      scale = block_absmax_val_maybe;
    }
    // token scale computation
-    scale = max(scale / qmax, min_scaling_factor<scalar_out_t>::val());
+    scale = max(scale / qmax, min_scaling_factor);
    s_token_scale = scale;                 // Shared memory store
    all_token_scales[blockIdx.x] = scale;  // Global output store
  }
@ -185,7 +184,7 @@ __device__ void compute_dynamic_per_token_scales(
    float* __restrict__ token_scale, float* __restrict__ all_token_scales,
    scalar_t const* __restrict__ input, scalar_t const* __restrict__ weight,
    float const rms, float const* __restrict__ scale_ub,
-    int32_t const hidden_size,
+    float const min_scaling_factor, int32_t const hidden_size,
    scalar_t const* __restrict__ residual = nullptr) {
  int64_t const token_offset = blockIdx.x * static_cast<int64_t>(hidden_size);
  ;
@ -201,7 +200,7 @@ __device__ void compute_dynamic_per_token_scales(
        reinterpret_cast<vec4_t<scalar_t> const*>(&residual[token_offset]);
  }

-  constexpr scalar_out_t qmax{quant_type_max_v<scalar_out_t>};
+  constexpr scalar_out_t qmax{std::numeric_limits<scalar_out_t>::max()};

  int32_t const num_vec_elems = hidden_size >> 2;
  float block_absmax_val_maybe = 0.0f;
@ -249,7 +248,7 @@ __device__ void compute_dynamic_per_token_scales(
      scale = block_absmax_val_maybe;
    }
    // token scale computation
-    scale = max(scale / qmax, min_scaling_factor<scalar_out_t>::val());
+    scale = max(scale / qmax, min_scaling_factor);
    s_token_scale = scale;                 // shared memory store
    all_token_scales[blockIdx.x] = scale;  // global output store
  }
--- a/csrc/quantization/fused_kernels/quant_conversions.cuh
+++ b/csrc/quantization/fused_kernels/quant_conversions.cuh
@ -33,8 +33,8 @@ static __device__ __forceinline__ int8_t float_to_int8_rn(float const x) {

 template <typename fp8_type>
 static __device__ __forceinline__ fp8_type float_to_fp8(float const x) {
-  float const r =
-      fmax(-quant_type_max_v<fp8_type>, fmin(x, quant_type_max_v<fp8_type>));
+  float const r = fmax(-fp8_e4m3_adjusted_max_v<fp8_type>,
+                       fmin(x, fp8_e4m3_adjusted_max_v<fp8_type>));
  return static_cast<fp8_type>(r);
 }

--- a/csrc/quantization/gguf/dequantize.cuh
+++ b/csrc/quantization/gguf/dequantize.cuh
@ -94,8 +94,8 @@ static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __
    dfloat2 v;
    dequantize_kernel(vx, ib, iqs, v);

-    y[iybs + iqs + 0]        = convert_from_half<dst_t>(v.x);
-    y[iybs + iqs + y_offset] = convert_from_half<dst_t>(v.y);
+    y[iybs + iqs + 0]        = v.x;
+    y[iybs + iqs + y_offset] = v.y;
 }

 template<typename dst_t>
@ -114,10 +114,10 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t

    half dall = __low2half(x[i].dm);
    half dmin = __high2half(x[i].dm);
-    y[l+ 0] = convert_from_half<dst_t>(__hsub(__hmul(dall, __int2half_rn((x[i].scales[is+0] & 0xF) * ((q >> 0) & 3))), __hmul(dmin,  __int2half_rn(x[i].scales[is+0] >> 4))));
-    y[l+32] = convert_from_half<dst_t>(__hsub(__hmul(dall, __int2half_rn((x[i].scales[is+2] & 0xF) * ((q >> 2) & 3))), __hmul(dmin,  __int2half_rn(x[i].scales[is+2] >> 4))));
-    y[l+64] = convert_from_half<dst_t>(__hsub(__hmul(dall, __int2half_rn((x[i].scales[is+4] & 0xF) * ((q >> 4) & 3))), __hmul(dmin,  __int2half_rn(x[i].scales[is+4] >> 4))));
-    y[l+96] = convert_from_half<dst_t>(__hsub(__hmul(dall, __int2half_rn((x[i].scales[is+6] & 0xF) * ((q >> 6) & 3))), __hmul(dmin,  __int2half_rn(x[i].scales[is+6] >> 4))));
+    y[l+ 0] = __hsub(__hmul(dall, __int2half_rn((x[i].scales[is+0] & 0xF) * ((q >> 0) & 3))), __hmul(dmin,  __int2half_rn(x[i].scales[is+0] >> 4)));
+    y[l+32] = __hsub(__hmul(dall, __int2half_rn((x[i].scales[is+2] & 0xF) * ((q >> 2) & 3))), __hmul(dmin,  __int2half_rn(x[i].scales[is+2] >> 4)));
+    y[l+64] = __hsub(__hmul(dall, __int2half_rn((x[i].scales[is+4] & 0xF) * ((q >> 4) & 3))), __hmul(dmin,  __int2half_rn(x[i].scales[is+4] >> 4)));
+    y[l+96] = __hsub(__hmul(dall, __int2half_rn((x[i].scales[is+6] & 0xF) * ((q >> 6) & 3))), __hmul(dmin,  __int2half_rn(x[i].scales[is+6] >> 4)));
 }

 template<typename dst_t>
@ -148,9 +148,7 @@ static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, dst_t
    const uint8_t * q = x[i].qs + 32*n;
    const uint8_t * hm = x[i].hmask;

-    for (int l = l0; l < l0+4; ++l) {
-        y[l] = convert_from_half<dst_t>(__hmul(dl,  __int2half_rn((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4))));
-    }
+    for (int l = l0; l < l0+4; ++l) y[l] = __hmul(dl,  __int2half_rn((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4)));
 }

 static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
@ -190,8 +188,8 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t
    const half d2 = __hmul(dall, __int2half_rn(sc));
    const half m2 = __hmul(dmin, __int2half_rn(m));
    for (int l = 0; l < n; ++l) {
-        y[l + 0] = convert_from_half<dst_t>(__hsub(__hmul(d1, __int2half_rn(q[l] & 0xF)), m1));
-        y[l +32] = convert_from_half<dst_t>(__hsub(__hmul(d2,  __int2half_rn(q[l] >> 4)), m2));
+        y[l + 0] = __hsub(__hmul(d1, __int2half_rn(q[l] & 0xF)), m1);
+        y[l +32] = __hsub(__hmul(d2,  __int2half_rn(q[l] >> 4)), m2);
    }
 }

@ -222,11 +220,11 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, dst_t
    const half d2 = __hmul(dall, __int2half_rn(sc)); const half m2 = __hmul(dmin, __int2half_rn(m));

    uint8_t   hm  = 1 << (2*il);
-    y[ 0] = convert_from_half<dst_t>(__hsub(__hmul(d1, __int2half_rn((ql[0] & 0xF) + (qh[0] & hm ? 16 : 0))), m1));
-    y[ 1] = convert_from_half<dst_t>(__hsub(__hmul(d1, __int2half_rn((ql[1] & 0xF) + (qh[1] & hm ? 16 : 0))), m1));
+    y[ 0] = __hsub(__hmul(d1, __int2half_rn((ql[0] & 0xF) + (qh[0] & hm ? 16 : 0))), m1);
+    y[ 1] = __hsub(__hmul(d1, __int2half_rn((ql[1] & 0xF) + (qh[1] & hm ? 16 : 0))), m1);
    hm <<= 1;
-    y[32] = convert_from_half<dst_t>(__hsub(__hmul(d2, __int2half_rn((ql[0] >>  4) + (qh[0] & hm ? 16 : 0))), m2));
-    y[33] = convert_from_half<dst_t>(__hsub(__hmul(d2, __int2half_rn((ql[1] >>  4) + (qh[1] & hm ? 16 : 0))), m2));
+    y[32] = __hsub(__hmul(d2, __int2half_rn((ql[0] >>  4) + (qh[0] & hm ? 16 : 0))), m2);
+    y[33] = __hsub(__hmul(d2, __int2half_rn((ql[1] >>  4) + (qh[1] & hm ? 16 : 0))), m2);
 }

 template<typename dst_t>
@ -249,10 +247,10 @@ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t
    const uint8_t   qh = x[i].qh[32*ip + il];
    const int8_t  * sc = x[i].scales + is;

-    y[ 0] = convert_from_half<dst_t>(__hmul(d, __int2half_rn(sc[0] * ((int8_t)((ql[ 0] & 0xF) | (((qh >> 0) & 3) << 4)) - 32))));
-    y[32] = convert_from_half<dst_t>(__hmul(d, __int2half_rn(sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32))));
-    y[64] = convert_from_half<dst_t>(__hmul(d, __int2half_rn(sc[4] * ((int8_t)((ql[ 0]  >> 4) | (((qh >> 4) & 3) << 4)) - 32))));
-    y[96] = convert_from_half<dst_t>(__hmul(d, __int2half_rn(sc[6] * ((int8_t)((ql[32]  >> 4) | (((qh >> 6) & 3) << 4)) - 32))));
+    y[ 0] = __hmul(d, __int2half_rn(sc[0] * ((int8_t)((ql[ 0] & 0xF) | (((qh >> 0) & 3) << 4)) - 32)));
+    y[32] = __hmul(d, __int2half_rn(sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32)));
+    y[64] = __hmul(d, __int2half_rn(sc[4] * ((int8_t)((ql[ 0]  >> 4) | (((qh >> 4) & 3) << 4)) - 32)));
+    y[96] = __hmul(d, __int2half_rn(sc[6] * ((int8_t)((ql[32]  >> 4) | (((qh >> 6) & 3) << 4)) - 32)));
 }

 template<typename dst_t>
@ -271,7 +269,7 @@ static __global__ void dequantize_block_iq2_xxs(const void * __restrict__ vx, ds
    const uint32_t aux32 = q2[2] | (q2[3] << 16);
    const float d = __half2float(x[i].d) * (0.5f + (aux32 >> 28)) * 0.25f;
    const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*il) & 127];
-    for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
+    for (int j = 0; j < 8; ++j) y[j] = __float2half(d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f));
 }

 template<typename dst_t>
@ -288,7 +286,7 @@ static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst
    const uint8_t  * grid = (const uint8_t *)(iq2xs_grid + (q2[il] & 511));
    const float d = __half2float(x[i].d) * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
    const uint8_t signs = ksigns_iq2xs[q2[il] >> 9];
-    for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
+    for (int j = 0; j < 8; ++j) y[j] = __float2half(d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f));

 }

@ -305,7 +303,7 @@ static __global__ void dequantize_block_iq2_s(const void * __restrict__ vx, dst_
    const uint8_t * grid = (const uint8_t *)(iq2s_grid + (x[i].qs[4*ib+il] | ((x[i].qh[ib] << (8-2*il)) & 0x300)));
    const float d = __half2float(x[i].d) * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
    const uint8_t signs = x[i].qs[QK_K/8+4*ib+il];
-    for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
+    for (int j = 0; j < 8; ++j) y[j] = __float2half(d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f));
 }

 template<typename dst_t>
@ -326,8 +324,8 @@ static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, ds
    const float d = __half2float(x[i].d) * (0.5f + (aux32 >> 28)) * 0.5f;
    const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*il) & 127];
    for (int j = 0; j < 4; ++j) {
-        y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
-        y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
+        y[j+0] = __float2half(d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f));
+        y[j+4] = __float2half(d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f));
    }
 }

@ -347,8 +345,8 @@ static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_
    const float d = __half2float(x[i].d) * (0.5f + ((x[i].scales[ib/2] >> 4*(ib%2)) & 0xf)) * 0.5f;
    const uint8_t signs = x[i].signs[4*ib + il];
    for (int j = 0; j < 4; ++j) {
-        y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
-        y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
+        y[j+0] = __float2half(d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f));
+        y[j+4] = __float2half(d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f));
    }
 }

@ -369,7 +367,7 @@ static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_
    grid32[1] = (grid32[0] >> 4) & 0x0f0f0f0f;
    grid32[0] &= 0x0f0f0f0f;
    for (int j = 0; j < 8; ++j) {
-        y[j] = d * (q[j] + delta);
+        y[j] = __float2half(d * (q[j] + delta));
    }
 }

@ -394,7 +392,7 @@ static __global__ void dequantize_block_iq1_m(const void * __restrict__ vx, dst_
    grid32[1] = (grid32[0] >> 4) & 0x0f0f0f0f;
    grid32[0] &= 0x0f0f0f0f;
    for (int j = 0; j < 8; ++j) {
-        y[j] = d * (q[j] + delta);
+        y[j] = __float2half(d * (q[j] + delta));
    }
 }

@ -411,8 +409,8 @@ static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst
    const uint8_t  * q4 = x[ib].qs + 4*il;
    const float d = __half2float(x[ib].d);
    for (int j = 0; j < 4; ++j) {
-        y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf];
-        y[j+16] = d * kvalues_iq4nl[q4[j] >>  4];
+        y[j+ 0] = __float2half(d * kvalues_iq4nl[q4[j] & 0xf]);
+        y[j+16] = __float2half(d * kvalues_iq4nl[q4[j] >>  4]);
    }

 }
@ -429,8 +427,8 @@ static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst
    const uint8_t  * q4 = x[i].qs + 16*ib + 4*il;
    const float d = __half2float(x[i].d) * ((((x[i].scales_l[ib/2] >> 4*(ib%2)) & 0xf) | (((x[i].scales_h >> 2*ib) & 3) << 4)) - 32);
    for (int j = 0; j < 4; ++j) {
-        y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf];
-        y[j+16] = d * kvalues_iq4nl[q4[j] >>  4];
+        y[j+ 0] = __float2half(d * kvalues_iq4nl[q4[j] & 0xf]);
+        y[j+16] = __float2half(d * kvalues_iq4nl[q4[j] >>  4]);
    }
 }

@ -524,8 +522,7 @@ static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int k,
    dequantize_block_iq4_xs<<<nb, 32, 0, stream>>>(vx, y);
 }

-template<typename dst_t>
-static to_cuda_ggml_t<dst_t> ggml_get_to_cuda(int64_t type) {
+static to_fp16_cuda_t ggml_get_to_fp16_cuda(int64_t type) {
    switch (type) {
        case 2:
            return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
--- a/csrc/quantization/gguf/ggml-common.h
+++ b/csrc/quantization/gguf/ggml-common.h
@ -1063,8 +1063,7 @@ static const __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -
 typedef half dfloat; // dequantize float
 typedef half2 dfloat2;
 typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v);
-template<typename dst_t>
-using to_cuda_ggml_t = void (*)(const void * __restrict__ x, dst_t * __restrict__ y, int k, cudaStream_t stream);
+typedef void (*to_fp16_cuda_t)(const void * __restrict__ x, dfloat * __restrict__ y, int k, cudaStream_t stream);
 typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs);
 typedef void (*allocate_tiles_cuda_t)(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc);
 typedef void (*load_tiles_cuda_t)(
@ -1076,25 +1075,6 @@ typedef float (*vec_dot_q_mul_mat_cuda_t)(

 // Utility function

-template<typename dst_t>
-static __device__ __forceinline__ dst_t convert_from_half(half val) {
-    return val;
-}
-
-template<>
-__device__ __forceinline__ c10::BFloat16 convert_from_half<c10::BFloat16>(half val) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
-    return __float2bfloat16(__half2float(val));
-#else
-    return __half2float(val);
-#endif  // defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
-}
-
-template<>
-__device__ __forceinline__ float convert_from_half<float>(half val) {
-    return __half2float(val);
-}
-
 #if defined(USE_ROCM)

 #ifndef __has_builtin
--- a/csrc/quantization/gguf/gguf_kernel.cu
+++ b/csrc/quantization/gguf/gguf_kernel.cu
@ -71,19 +71,14 @@ static void quantize_row_q8_1_cuda(const scalar_t* x, void* vy, const int kx,
 }

 torch::Tensor ggml_dequantize(torch::Tensor W,  // quant weight
-                              int64_t type, int64_t m, int64_t n,
-                              std::optional<at::ScalarType> const& dtype) {
+                              int64_t type, int64_t m, int64_t n) {
  const at::cuda::OptionalCUDAGuard device_guard(device_of(W));
-  auto dtype_ = dtype.value_or(torch::kFloat16);
-  auto options = torch::TensorOptions().dtype(dtype_).device(W.device());
+  auto options =
+      torch::TensorOptions().dtype(torch::kFloat16).device(W.device());
  at::Tensor DW = torch::empty({m, n}, options);
  cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
-
-  VLLM_DISPATCH_FLOATING_TYPES(DW.scalar_type(), "ggml_dequantize", [&] {
-    auto to_cuda = ggml_get_to_cuda<scalar_t>(type);
-    to_cuda((void*)W.data_ptr(), (scalar_t*)DW.data_ptr(), m * n, stream);
-  });
-
+  const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(type);
+  to_fp16_cuda((void*)W.data_ptr(), (half*)DW.data_ptr(), m * n, stream);
  return DW;
 }

--- a/csrc/quantization/gptq_marlin/gptq_marlin.cu
+++ b/csrc/quantization/gptq_marlin/gptq_marlin.cu
@ -1785,7 +1785,7 @@ __global__ void Marlin(
            <<<blocks, NUM_THREADS, max_shared_mem, stream>>>(                 \
                A_ptr, B_ptr, C_ptr, C_tmp_ptr, s_ptr, zp_ptr, g_idx_ptr,      \
                num_groups, prob_m, prob_n, prob_k, lda, locks,                \
-                part_use_atomic_add, use_fp32_reduce);                         \
+                use_atomic_add, use_fp32_reduce);                              \
      }                                                                        \
    }

@ -2215,10 +2215,6 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* s,
      thread_m_blocks = exec_cfg.max_m_blocks;
    }

-    // atomic add reduce have better performance only when m * n is small
-    bool part_use_atomic_add =
-        use_atomic_add && div_ceil(prob_m, 64) * prob_n <= 2048;
-
    if (false) {
    }
    GPTQ_CALL_IF(vllm::kU4B8, 16, 4, 256)
--- a/csrc/quantization/utils.cuh
+++ b/csrc/quantization/utils.cuh
@ -1,59 +0,0 @@
-#pragma once
-
-/**
- * Quantization utilities including:
- *   Adjusted maximum values for qtypes.
- *   Minimum scaling factors for qtypes.
- */
-
-#include <cmath>
-#include <torch/types.h>
-
-#ifndef USE_ROCM
-  #include <c10/util/Float8_e4m3fn.h>
-  #define MAYBE_HOST_DEVICE C10_HOST_DEVICE
-#else
-  #include <ATen/hip/HIPContext.h>
-  #include <c10/util/Float8_e4m3fn.h>
-  #include <c10/util/Float8_e4m3fnuz.h>
-  // ROCm doesn't seem to need C10_HOST_DEVICE for static constexpr
-  #define MAYBE_HOST_DEVICE
-#endif
-
-template <typename T,
-          typename = std::enable_if_t<std::is_same_v<T, c10::Float8_e4m3fn> ||
-                                      std::is_same_v<T, c10::Float8_e4m3fnuz> ||
-                                      std::is_same_v<T, int8_t>>>
-struct quant_type_max {
-  static constexpr T val() { return std::numeric_limits<T>::max(); }
-};
-
-// Using the default max value from pytorch (240.0 0x7F) will cause accuracy
-// issues when running dynamic quantization. Here use 224.0 0x7E for rocm.
-template <>
-struct quant_type_max<c10::Float8_e4m3fnuz> {
-  static constexpr c10::Float8_e4m3fnuz val() {
-    return c10::Float8_e4m3fnuz(0x7E, c10::Float8_e4m3fnuz::from_bits());
-  }
-};
-
-template <typename T>
-MAYBE_HOST_DEVICE static constexpr T quant_type_max_v =
-    quant_type_max<T>::val();
-
-template <typename T,
-          typename = std::enable_if_t<std::is_same_v<T, c10::Float8_e4m3fn> ||
-                                      std::is_same_v<T, c10::Float8_e4m3fnuz> ||
-                                      std::is_same_v<T, int8_t>>>
-struct min_scaling_factor {
-  C10_DEVICE C10_ALWAYS_INLINE static float val() {
-    return 1.0f / (quant_type_max_v<T> * 512.0f);
-  }
-};
-
-template <>
-struct min_scaling_factor<int8_t> {
-  C10_DEVICE C10_ALWAYS_INLINE static float val() {
-    return std::numeric_limits<float>::epsilon();
-  }
-};
--- a/csrc/rocm/attention.cu
+++ b/csrc/rocm/attention.cu
@ -272,7 +272,6 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel(
    const float scale,    
    const int* __restrict__ block_tables,   // [num_seqs, max_num_blocks_per_seq]
    const int* __restrict__ context_lens,   // [num_seqs]
-    const int* __restrict__ query_start_loc_ptr,   // [num_seqs]
    const int max_num_blocks_per_seq,
    const float* __restrict__ alibi_slopes, // [num_heads]
    const int q_stride,
@ -292,13 +291,6 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel(
  const int rowid = laneid / 16;

  const auto seq_idx = blockIdx.x;
-  // NOTE queries with sequence len > 1 are prefills and taken care by another
-  // kernel.
-  if (query_start_loc_ptr != nullptr &&
-      (query_start_loc_ptr[seq_idx + 1] - query_start_loc_ptr[seq_idx]) != 1) {
-    return;
-  }
-
  const auto partition_idx = blockIdx.y;

  constexpr int T_PAR_SIZE = 256;  // token partition size set to 256
@ -385,10 +377,9 @@ __launch_bounds__(NUM_THREADS, 5) void paged_attention_ll4mi_QKV_mfma16_kernel(
  // fetch Q in shared across warps and then write to registers
  const int local_qhead_idx = 4 * warpid + rowid;
  const int global_qhead_idx = wg_start_head_idx + local_qhead_idx;
-  const int64_t query_start_off = static_cast<int64_t>(
-      query_start_loc_ptr ? query_start_loc_ptr[seq_idx] : seq_idx);
+  const int64_t seq_idx64 = static_cast<int64_t>(seq_idx);
  const scalar_t* q_ptr =
-      q + query_start_off * q_stride + global_qhead_idx * HEAD_SIZE;
+      q + seq_idx64 * q_stride + global_qhead_idx * HEAD_SIZE;

  const int qhead_element = lane16id * CONTIGUOUS_SCALAR_ELEMS_16B;
  if ((local_qhead_idx < GQA_RATIO) && (qhead_element < HEAD_SIZE)) {
@ -786,7 +777,6 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel(
    const float scale,
    const int* __restrict__ block_tables,   // [num_seqs, max_num_blocks_per_seq]
    const int* __restrict__ context_lens,   // [num_seqs]
-    const int* __restrict__ query_start_loc_ptr,   // [num_seqs]
    const int max_num_blocks_per_seq,
    const float* __restrict__ alibi_slopes, // [num_heads]
    const int q_stride,
@ -804,12 +794,6 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel(
  const int lane4id = laneid % 4;

  const auto seq_idx = blockIdx.x;
-  // NOTE queries with sequence len > 1 are prefills and taken care by another
-  // kernel.
-  if (query_start_loc_ptr != nullptr &&
-      (query_start_loc_ptr[seq_idx + 1] - query_start_loc_ptr[seq_idx] != 1)) {
-    return;
-  }
  const auto partition_idx = blockIdx.y;
  const auto partition_size = blockDim.x;
  const auto max_num_partitions = gridDim.y;
@ -898,11 +882,9 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel(
    }

    // fetch q elements
-    // every 4 lanes fetch 8 elems, so warp fetches 8*16 = 128 elemsc
-    const int64_t query_start_off = static_cast<int64_t>(
-        query_start_loc_ptr ? query_start_loc_ptr[seq_idx] : seq_idx);
+    // every 4 lanes fetch 8 elems, so warp fetches 8*16 = 128 elems
    const scalar_t* q_ptr =
-        q + query_start_off * q_stride + wg_start_head_idx * HEAD_SIZE;
+        q + seq_idx * q_stride + wg_start_head_idx * HEAD_SIZE;
    const _B16x8* q_ptrh8 = reinterpret_cast<const _B16x8*>(q_ptr);
    const int qhead_elemh8 = laneid / 4;

@ -1285,19 +1267,10 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
    const scalar_t* __restrict__ tmp_out,  // [num_seqs, num_heads,
                                           // max_num_partitions, head_size]
    const int* __restrict__ context_lens,  // [num_seqs]
-    const int* __restrict__ query_start_loc_ptr,  // [num_seqs]
    const int max_num_partitions) {
  const auto num_heads = gridDim.x;
  const auto head_idx = blockIdx.x;
  const auto seq_idx = blockIdx.y;
-
-  // NOTE queries with sequence len > 1 are prefills and taken care by another
-  // kernel.
-  if (query_start_loc_ptr != nullptr &&
-      (query_start_loc_ptr[seq_idx + 1] - query_start_loc_ptr[seq_idx] != 1)) {
-    return;
-  }
-
  const int context_len = context_lens[seq_idx];
  const int num_partitions = DIVIDE_ROUND_UP(context_len, PARTITION_SIZE);
  [[maybe_unused]] constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
@ -1466,9 +1439,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
      __fdividef(1.0f, shared_global_exp_sum + 1e-6f);
  acc *= inv_global_exp_sum;

-  const int64_t query_start_off = static_cast<int64_t>(
-      query_start_loc_ptr ? query_start_loc_ptr[seq_idx] : seq_idx);
-  OUTT* out_ptr = out + query_start_off * num_heads * HEAD_SIZE +
+  OUTT* out_ptr = out + static_cast<int64_t>(seq_idx) * num_heads * HEAD_SIZE +
                  static_cast<int64_t>(head_idx) * HEAD_SIZE;
  if constexpr (std::is_same<OUTT, bit8_t>::value) {
    out_ptr[threadIdx.x] =
@ -1495,7 +1466,6 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma16_kernel(
    const float scale,
    const int* __restrict__ block_tables,    // [num_seqs, max_num_blocks_per_seq]
    const int* __restrict__ context_lens,    // [num_seqs]
-    const int* __restrict__ query_start_loc_ptr,  // [num_seqs]
    const int max_num_blocks_per_seq,
    const float* __restrict__ alibi_slopes,  // [num_heads]
    const int q_stride,
@ -1522,7 +1492,6 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel(
    const float scale,
    const int* __restrict__ block_tables,    // [num_seqs, max_num_blocks_per_seq]
    const int* __restrict__ context_lens,    // [num_seqs]
-    const int* __restrict__ query_start_loc_ptr,  // [num_seqs]
    const int max_num_blocks_per_seq,
    const float* __restrict__ alibi_slopes,  // [num_heads]
    const int q_stride,
@ -1546,7 +1515,6 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
    const float* __restrict__ max_logits,  // [num_seqs, num_heads, max_num_partitions]
    const scalar_t* __restrict__ tmp_out,  // [num_seqs, num_heads, max_num_partitions, head_size]
    const int* __restrict__ context_lens,  // [num_seqs]
-    const int* __restrict__ query_start_loc_ptr,  // [num_seqs]
    const int max_num_partitions) {
  UNREACHABLE_CODE
 }
@ -1554,34 +1522,34 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(

 #endif  // defined(__HIP__MI300_MI250__) TODO: Add NAVI support

-#define LAUNCH_CUSTOM_ATTENTION_MFMA16(GQA_RATIO)                              \
-  paged_attention_ll4mi_QKV_mfma16_kernel<T, KVT, KV_DTYPE, OUTT, BLOCK_SIZE,  \
-                                          HEAD_SIZE, NTHR, ALIBI_ENABLED,      \
-                                          GQA_RATIO>                           \
-      <<<grid, block, 0, stream>>>(                                            \
-          query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, scale,      \
-          block_tables_ptr, context_lens_ptr, query_start_loc_ptr,             \
-          max_num_blocks_per_seq, alibi_slopes_ptr, q_stride, kv_block_stride, \
-          kv_head_stride, exp_sums_ptr, max_logits_ptr, tmp_out_ptr, out_ptr,  \
-          max_ctx_blocks, k_scale_ptr, v_scale_ptr);
+#define LAUNCH_CUSTOM_ATTENTION_MFMA16(GQA_RATIO)                             \
+  paged_attention_ll4mi_QKV_mfma16_kernel<T, KVT, KV_DTYPE, OUTT, BLOCK_SIZE, \
+                                          HEAD_SIZE, NTHR, ALIBI_ENABLED,     \
+                                          GQA_RATIO>                          \
+      <<<grid, block, 0, stream>>>(                                           \
+          query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, scale,     \
+          block_tables_ptr, context_lens_ptr, max_num_blocks_per_seq,         \
+          alibi_slopes_ptr, q_stride, kv_block_stride, kv_head_stride,        \
+          exp_sums_ptr, max_logits_ptr, tmp_out_ptr, out_ptr, max_ctx_blocks, \
+          k_scale_ptr, v_scale_ptr);

-#define LAUNCH_CUSTOM_ATTENTION_MFMA4(GQA_RATIO)                               \
-  paged_attention_ll4mi_QKV_mfma4_kernel<T, KVT, KV_DTYPE, OUTT, BLOCK_SIZE,   \
-                                         HEAD_SIZE, NTHR, ALIBI_ENABLED,       \
-                                         GQA_RATIO>                            \
-      <<<grid, block, 0, stream>>>(                                            \
-          query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, scale,      \
-          block_tables_ptr, context_lens_ptr, query_start_loc_ptr,             \
-          max_num_blocks_per_seq, alibi_slopes_ptr, q_stride, kv_block_stride, \
-          kv_head_stride, exp_sums_ptr, max_logits_ptr, tmp_out_ptr, out_ptr,  \
-          max_ctx_blocks, k_scale_ptr, v_scale_ptr);
+#define LAUNCH_CUSTOM_ATTENTION_MFMA4(GQA_RATIO)                              \
+  paged_attention_ll4mi_QKV_mfma4_kernel<T, KVT, KV_DTYPE, OUTT, BLOCK_SIZE,  \
+                                         HEAD_SIZE, NTHR, ALIBI_ENABLED,      \
+                                         GQA_RATIO>                           \
+      <<<grid, block, 0, stream>>>(                                           \
+          query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, scale,     \
+          block_tables_ptr, context_lens_ptr, max_num_blocks_per_seq,         \
+          alibi_slopes_ptr, q_stride, kv_block_stride, kv_head_stride,        \
+          exp_sums_ptr, max_logits_ptr, tmp_out_ptr, out_ptr, max_ctx_blocks, \
+          k_scale_ptr, v_scale_ptr);

 #define LAUNCH_CUSTOM_REDUCTION(NPAR_LOOPS)                          \
  paged_attention_ll4mi_reduce_kernel<T, OUTT, HEAD_SIZE, HEAD_SIZE, \
                                      PARTITION_SIZE, NPAR_LOOPS>    \
      <<<reduce_grid, reduce_block, 0, stream>>>(                    \
          out_ptr, exp_sums_ptr, max_logits_ptr, tmp_out_ptr,        \
-          context_lens_ptr, query_start_loc_ptr, max_num_partitions);
+          context_lens_ptr, max_num_partitions);

 template <typename T, typename KVT, vllm::Fp8KVCacheDataType KV_DTYPE,
          int BLOCK_SIZE, int HEAD_SIZE, typename OUTT, int PARTITION_SIZE_OLD,
@ -1591,10 +1559,9 @@ void paged_attention_custom_launcher(
    torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
    torch::Tensor& value_cache, const int num_kv_heads, float scale,
    torch::Tensor& block_tables, torch::Tensor& context_lens,
-    const std::optional<torch::Tensor>& query_start_loc, int max_context_len,
-    const std::optional<torch::Tensor>& alibi_slopes, torch::Tensor& k_scale,
-    torch::Tensor& v_scale) {
-  int num_seqs = block_tables.size(0);
+    int max_context_len, const std::optional<torch::Tensor>& alibi_slopes,
+    torch::Tensor& k_scale, torch::Tensor& v_scale) {
+  int num_seqs = query.size(0);
  int num_heads = query.size(1);
  int head_size = query.size(2);
  int max_num_blocks_per_seq = block_tables.size(1);
@ -1602,13 +1569,6 @@ void paged_attention_custom_launcher(
  int kv_block_stride = key_cache.stride(0);
  int kv_head_stride = key_cache.stride(1);

-  // NOTE: query start location is optional for V0 decode should not be used.
-  // If batch contains mix of prefills and decode, prefills should be skipped.
-  const int* query_start_loc_ptr =
-      query_start_loc
-          ? reinterpret_cast<const int*>(query_start_loc.value().data_ptr())
-          : nullptr;
-
  // NOTE: alibi_slopes is optional.
  const float* alibi_slopes_ptr =
      alibi_slopes
@ -1740,8 +1700,8 @@ void paged_attention_custom_launcher(
  paged_attention_custom_launcher<T, KVT, KV_DTYPE, BLK_SIZE, HEAD_SIZE, T, \
                                  PSIZE, ALIBI_ENABLED>(                    \
      out, exp_sums, max_logits, tmp_out, query, key_cache, value_cache,    \
-      num_kv_heads, scale, block_tables, context_lens, query_start_loc,     \
-      max_context_len, alibi_slopes, k_scale, v_scale);
+      num_kv_heads, scale, block_tables, context_lens, max_context_len,     \
+      alibi_slopes, k_scale, v_scale);

 #define CALL_CUSTOM_LAUNCHER_ALIBI(T, KVT, KV_DTYPE, BLK_SIZE, HEAD_SIZE,      \
                                   PSIZE)                                      \
@ -1790,7 +1750,6 @@ void paged_attention(
    double scale,
    torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq]
    torch::Tensor& context_lens, // [num_seqs]
-    const std::optional<torch::Tensor>& query_start_loc, // [num_seqs]
    int64_t block_size, int64_t max_context_len,
    const std::optional<torch::Tensor>& alibi_slopes,
    const std::string& kv_cache_dtype, torch::Tensor& k_scale,
--- a/csrc/rocm/ops.h
+++ b/csrc/rocm/ops.h
@ -7,9 +7,8 @@ void paged_attention(torch::Tensor& out, torch::Tensor& exp_sums,
                     torch::Tensor& query, torch::Tensor& key_cache,
                     torch::Tensor& value_cache, int64_t num_kv_heads,
                     double scale, torch::Tensor& block_tables,
-                     torch::Tensor& context_lens,
-                     const std::optional<torch::Tensor>& query_start_loc,
-                     int64_t block_size, int64_t max_context_len,
+                     torch::Tensor& context_lens, int64_t block_size,
+                     int64_t max_context_len,
                     const std::optional<torch::Tensor>& alibi_slopes,
                     const std::string& kv_cache_dtype, torch::Tensor& k_scale,
                     torch::Tensor& v_scale);
--- a/csrc/rocm/torch_bindings.cpp
+++ b/csrc/rocm/torch_bindings.cpp
@ -23,9 +23,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, rocm_ops) {
      "                Tensor query, Tensor key_cache,"
      "                Tensor value_cache, int num_kv_heads,"
      "                float scale, Tensor block_tables,"
-      "                Tensor context_lens,"
-      "                Tensor? query_start_loc,"
-      "                int block_size,"
+      "                Tensor context_lens, int block_size,"
      "                int max_context_len,"
      "                Tensor? alibi_slopes,"
      "                str kv_cache_dtype,"
--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
@ -31,10 +31,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  ops.def("weak_ref_tensor(Tensor input) -> Tensor");
  ops.impl("weak_ref_tensor", torch::kCUDA, &weak_ref_tensor);

-  ops.def("get_cuda_view_from_cpu_tensor(Tensor cpu_tensor) -> Tensor");
-  ops.impl("get_cuda_view_from_cpu_tensor", torch::kCPU,
-           &get_cuda_view_from_cpu_tensor);
-
  // Attention ops
  // Compute the attention between an input query and the cached
  // keys/values using PagedAttention.
@ -295,9 +291,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
 #endif

  // Dequantization for GGML.
-  ops.def(
-      "ggml_dequantize(Tensor W, int type, SymInt m, SymInt n, ScalarType? "
-      "dtype) -> Tensor");
+  ops.def("ggml_dequantize(Tensor W, int type, SymInt m, SymInt n) -> Tensor");
  ops.impl("ggml_dequantize", torch::kCUDA, &ggml_dequantize);

  // mmvq kernel for GGML.
@ -616,11 +610,12 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cuda_utils), cuda_utils) {
                  &get_max_shared_memory_per_block_device_attribute);
 }

+#ifndef USE_ROCM
 TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _custom_ar), custom_ar) {
  // Custom all-reduce kernels
  custom_ar.def(
      "init_custom_ar(int[] ipc_tensors, Tensor rank_data, "
-      "int rank, bool fully_connected) -> int");
+      "int rank, bool full_nvlink) -> int");
  custom_ar.impl("init_custom_ar", torch::kCUDA, &init_custom_ar);
  custom_ar.def(
      "all_reduce(int fa, Tensor inp, Tensor! out, int reg_buffer, "
@ -633,13 +628,7 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _custom_ar), custom_ar) {
  custom_ar.def("register_buffer", &register_buffer);
  custom_ar.def("get_graph_buffer_ipc_meta", &get_graph_buffer_ipc_meta);
  custom_ar.def("register_graph_buffers", &register_graph_buffers);
-
-  custom_ar.def("allocate_shared_buffer_and_handle",
-                &allocate_shared_buffer_and_handle);
-  custom_ar.def("open_mem_handle(Tensor mem_handle) -> int", &open_mem_handle);
-  custom_ar.impl("open_mem_handle", torch::kCPU, &open_mem_handle);
-
-  custom_ar.def("free_shared_buffer", &free_shared_buffer);
 }
+#endif

 REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
--- a/docker/Dockerfile.cpu
+++ b/docker/Dockerfile.cpu
@ -1,138 +0,0 @@
-# This vLLM Dockerfile is used to construct image that can build and run vLLM on x86 CPU platform.
-#
-# Build targets:
-#   vllm-openai (default): used for serving deployment
-#   vllm-test: used for CI tests
-#   vllm-dev: used for development
-#
-# Build arguments:
-#   PYTHON_VERSION=3.12 (default)|3.11|3.10|3.9
-#   VLLM_CPU_DISABLE_AVX512=false (default)|true
-#
-
-######################### BASE IMAGE #########################
-FROM ubuntu:22.04 AS base
-
-WORKDIR /workspace/
-
-ARG PYTHON_VERSION=3.12
-ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
-
-# Install minimal dependencies and uv
-RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
-    --mount=type=cache,target=/var/lib/apt,sharing=locked \
-    apt-get update -y \
-    && apt-get install -y --no-install-recommends ccache git curl wget ca-certificates \
-        gcc-12 g++-12 libtcmalloc-minimal4 libnuma-dev ffmpeg libsm6 libxext6 libgl1 \
-    && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 \
-    && curl -LsSf https://astral.sh/uv/install.sh | sh
-
-ENV CCACHE_DIR=/root/.cache/ccache
-ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache
-
-ENV PATH="/root/.local/bin:$PATH"
-ENV VIRTUAL_ENV="/opt/venv"
-RUN uv venv --python ${PYTHON_VERSION} --seed ${VIRTUAL_ENV}
-ENV PATH="$VIRTUAL_ENV/bin:$PATH"
-
-ENV UV_HTTP_TIMEOUT=500
-
-# Install Python dependencies 
-ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
-ENV UV_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
-ENV UV_INDEX_STRATEGY="unsafe-best-match"
-ENV UV_LINK_MODE="copy"
-RUN --mount=type=cache,target=/root/.cache/uv \
-    --mount=type=bind,src=requirements/common.txt,target=requirements/common.txt \
-    --mount=type=bind,src=requirements/cpu.txt,target=requirements/cpu.txt \
-    uv pip install --upgrade pip && \
-    uv pip install -r requirements/cpu.txt
-
-RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install intel-openmp==2024.2.1 intel_extension_for_pytorch==2.6.0
-
-ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/opt/venv/lib/libiomp5.so:$LD_PRELOAD"
-
-RUN echo 'ulimit -c 0' >> ~/.bashrc
-
-######################### BUILD IMAGE #########################
-FROM base AS vllm-build
-
-ARG GIT_REPO_CHECK=0
-# Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ...
-ARG VLLM_CPU_DISABLE_AVX512
-ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
-
-WORKDIR /workspace/vllm
-
-RUN --mount=type=cache,target=/root/.cache/uv \
-    --mount=type=bind,src=requirements/build.txt,target=requirements/build.txt \
-    uv pip install -r requirements/build.txt
-
-COPY . .
-RUN --mount=type=bind,source=.git,target=.git \
-    if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
-
-RUN --mount=type=cache,target=/root/.cache/uv \
-    --mount=type=cache,target=/root/.cache/ccache \
-    --mount=type=bind,source=.git,target=.git \
-    VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel 
-
-######################### DEV IMAGE #########################
-FROM vllm-build AS vllm-dev
-
-WORKDIR /workspace/vllm
-
-RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
-    --mount=type=cache,target=/var/lib/apt,sharing=locked \
-    apt-get install -y --no-install-recommends vim numactl
-
-# install development dependencies (for testing)
-RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install -e tests/vllm_test_utils 
-
-RUN --mount=type=cache,target=/root/.cache/uv \
-    --mount=type=cache,target=/root/.cache/ccache \
-    --mount=type=bind,source=.git,target=.git \
-    VLLM_TARGET_DEVICE=cpu python3 setup.py develop 
-
-RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install -r requirements/dev.txt && \
-    pre-commit install --hook-type pre-commit --hook-type commit-msg
-
-ENTRYPOINT ["bash"]
-
-######################### TEST IMAGE #########################
-FROM base AS vllm-test
-
-WORKDIR /workspace/
-
-RUN --mount=type=cache,target=/root/.cache/uv \
-    --mount=type=bind,src=requirements/test.txt,target=requirements/test.txt \
-    uv pip install -r requirements/test.txt
-
-RUN --mount=type=cache,target=/root/.cache/uv \
-    --mount=type=bind,from=vllm-build,src=/workspace/vllm/dist,target=dist \
-    uv pip install dist/*.whl
-
-ADD ./tests/ ./tests/
-ADD ./examples/ ./examples/
-ADD ./benchmarks/ ./benchmarks/
-
-# install development dependencies (for testing)
-RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install -e tests/vllm_test_utils 
-
-ENTRYPOINT ["bash"]
-
-######################### RELEASE IMAGE #########################
-FROM base AS vllm-openai
-
-WORKDIR /workspace/
-
-RUN --mount=type=cache,target=/root/.cache/uv \
-    --mount=type=cache,target=/root/.cache/ccache \
-    --mount=type=bind,from=vllm-build,src=/workspace/vllm/dist,target=dist \
-    uv pip install dist/*.whl
-
-ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
--- a/docs/README.md
+++ b/docs/README.md
@ -2,42 +2,19 @@

 ## Build the docs

- Make sure in `docs` directory
-
-```bash
-cd docs
-```
-
- Install the dependencies:
-
 ```bash
+# Install dependencies.
 pip install -r ../requirements/docs.txt
-```

- Clean the previous build (optional but recommended):
-
-```bash
+# Build the docs.
 make clean
-```
-
- Generate the HTML documentation:
-
-```bash
 make html
 ```

 ## Open the docs with your browser

- Serve the documentation locally:
-
 ```bash
 python -m http.server -d build/html/
 ```

-This will start a local server at http://localhost:8000. You can now open your browser and view the documentation.
-
-If port 8000 is already in use, you can specify a different port, for example:
-
-```bash
-python -m http.server 3000 -d build/html/
-```
+Launch your browser and open localhost:8000.
--- a/docs/source/assets/design/v1/prefix_caching/example-time-1.png
+++ b/docs/source/assets/design/v1/prefix_caching/example-time-1.png
--- a/docs/source/assets/design/v1/prefix_caching/example-time-3.png
+++ b/docs/source/assets/design/v1/prefix_caching/example-time-3.png
--- a/docs/source/assets/design/v1/prefix_caching/example-time-4.png
+++ b/docs/source/assets/design/v1/prefix_caching/example-time-4.png
--- a/docs/source/assets/design/v1/prefix_caching/example-time-5.png
+++ b/docs/source/assets/design/v1/prefix_caching/example-time-5.png
--- a/docs/source/assets/design/v1/prefix_caching/example-time-6.png
+++ b/docs/source/assets/design/v1/prefix_caching/example-time-6.png
--- a/docs/source/assets/design/v1/prefix_caching/example-time-7.png
+++ b/docs/source/assets/design/v1/prefix_caching/example-time-7.png
--- a/docs/source/community/meetups.md
+++ b/docs/source/community/meetups.md
@ -4,8 +4,6 @@

 We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below:

- [vLLM x Ollama Inference Night](https://lu.ma/vllm-ollama), March 27th 2025. [[Slides]](https://docs.google.com/presentation/d/16T2PDD1YwRnZ4Tu8Q5r6n53c5Lr5c73UV9Vd2_eBo4U/edit?usp=sharing).
- [The first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg), March 16th 2025. [[Slides]](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit?usp=sharing).
 - [The East Coast vLLM Meetup](https://lu.ma/7mu4k4xx), March 11th 2025. [[Slides]](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0)
 - [The ninth vLLM meetup](https://lu.ma/h7g3kuj9), with Meta, February 27th 2025. [[Slides]](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing)
 - [The eighth vLLM meetup](https://lu.ma/zep56hui), with Google Cloud, January 22nd 2025. [[Slides]](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing)
--- a/docs/source/community/sponsors.md
+++ b/docs/source/community/sponsors.md
@ -22,7 +22,6 @@ Compute Resources:
 - Databricks
 - DeepInfra
 - Google Cloud
- Intel
 - Lambda Lab
 - Nebius
 - Novita AI
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@ -104,7 +104,7 @@ myst_url_schemes = {
        "classes": ["github"],
    },
    "gh-project": {
-        "url": "https://github.com/orgs/vllm-project/projects/{{path}}",
+        "url": "https://github.com/vllm-project/projects/{{path}}",
        "title": "Project #{{path}}",
        "classes": ["github"],
    },
--- a/docs/source/contributing/dockerfile/dockerfile.md
+++ b/docs/source/contributing/dockerfile/dockerfile.md
@ -1,6 +1,6 @@
 # Dockerfile

-We provide a <gh-file:docker/Dockerfile> to construct the image for running an OpenAI compatible server with vLLM.
+We provide a <gh-file:Dockerfile> to construct the image for running an OpenAI compatible server with vLLM.
 More information about deploying with Docker can be found [here](#deployment-docker).

 Below is a visual representation of the multi-stage Dockerfile. The build graph contains the following nodes:
@ -28,7 +28,7 @@ The edges of the build graph represent:
  > Commands to regenerate the build graph (make sure to run it **from the \`root\` directory of the vLLM repository** where the dockerfile is present):
  >
  > ```bash
-  > dockerfilegraph -o png --legend --dpi 200 --max-label-length 50 --filename docker/Dockerfile
+  > dockerfilegraph -o png --legend --dpi 200 --max-label-length 50 --filename Dockerfile
  > ```
  >
  > or in case you want to run it directly with the docker image:
@ -43,7 +43,7 @@ The edges of the build graph represent:
  >    --output png \
  >    --dpi 200 \
  >    --max-label-length 50 \
-  >    --filename docker/Dockerfile \
+  >    --filename Dockerfile \
  >    --legend
  > ```
  >
--- a/docs/source/contributing/overview.md
+++ b/docs/source/contributing/overview.md
@ -44,12 +44,6 @@ pre-commit run --all-files
 pytest tests/
 ```

-:::{tip}
-Since the <gh-file:docker/Dockerfile> ships with Python 3.12, all tests in CI (except `mypy`) are run with Python 3.12.
-
-Therefore, we recommend developing with Python 3.12 to minimise the chance of your local environment clashing with our CI environment.
-:::
-
 :::{note}
 Currently, the repository is not fully checked by `mypy`.
 :::
--- a/docs/source/deployment/docker.md
+++ b/docs/source/deployment/docker.md
@ -34,11 +34,11 @@ If you need to use those dependencies (having accepted the license terms),
 create a custom Dockerfile on top of the base image with an extra layer that installs them:

 ```Dockerfile
-FROM vllm/vllm-openai:v0.8.3
+FROM vllm/vllm-openai:v0.8.2

-# e.g. install the `audio` optional dependencies
+# e.g. install the `audio` and `video` optional dependencies
 # NOTE: Make sure the version of vLLM matches the base image!
-RUN uv pip install --system vllm[audio]==0.8.3
+RUN uv pip install --system vllm[audio,video]==0.8.2
 ```

 :::
@ -61,11 +61,11 @@ RUN uv pip install --system git+https://github.com/huggingface/transformers.git

 ## Building vLLM's Docker Image from Source

-You can build and run vLLM from source via the provided <gh-file:docker/Dockerfile>. To build vLLM:
+You can build and run vLLM from source via the provided <gh-file:Dockerfile>. To build vLLM:

 ```console
 # optionally specifies: --build-arg max_jobs=8 --build-arg nvcc_threads=2
-DOCKER_BUILDKIT=1 docker build . --target vllm-openai --tag vllm/vllm-openai --file docker/Dockerfile
+DOCKER_BUILDKIT=1 docker build . --target vllm-openai --tag vllm/vllm-openai
 ```

 :::{note}
@ -92,7 +92,6 @@ Keep an eye on memory usage with parallel jobs as it can be substantial (see exa
 # Example of building on Nvidia GH200 server. (Memory usage: ~15GB, Build time: ~1475s / ~25 min, Image size: 6.93GB)
 $ python3 use_existing_torch.py
 $ DOCKER_BUILDKIT=1 docker build . \
-  --file docker/Dockerfile \
  --target vllm-openai \
  --platform "linux/arm64" \
  -t vllm/vllm-gh200-openai:latest \
--- a/docs/source/deployment/k8s.md
+++ b/docs/source/deployment/k8s.md
@ -46,7 +46,6 @@ metadata:
 type: Opaque
 data:
  token: $(HF_TOKEN)
-EOF
 ```

 Next, start the vLLM server as a Kubernetes Deployment and Service:
--- a/docs/source/deployment/nginx.md
+++ b/docs/source/deployment/nginx.md
@ -69,14 +69,14 @@ server {

 ```console
 cd $vllm_root
-docker build -f docker/Dockerfile . --tag vllm
+docker build -f Dockerfile . --tag vllm
 ```

 If you are behind proxy, you can pass the proxy settings to the docker build command as shown below:

 ```console
 cd $vllm_root
-docker build -f docker/Dockerfile . --tag vllm --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy
+docker build -f Dockerfile . --tag vllm --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy
 ```

 (nginxloadbalancer-nginx-docker-network)=
--- a/docs/source/design/mm_processing.md
+++ b/docs/source/design/mm_processing.md
@ -8,7 +8,7 @@ Here are the main features of {class}`~vllm.multimodal.processing.BaseMultiModal

 ## Prompt Update Detection

-One of the main responsibilities of HF processor is to update the prompt with placeholder tokens. For example:
+One of the main responsibilities of HF processor is to update the prompt with placeholder tokens. For example:

 - Insert feature placeholder tokens (e.g. `<image><image>...<image>`, the number of which equals to the feature size) at the start of the string.
 - Replace existing input placeholder tokens (e.g. `<image>` for a single image) with feature placeholder tokens (e.g. `<image><image>...<image>`, the number of which equals to the feature size).
--- a/docs/source/design/multiprocessing.md
+++ b/docs/source/design/multiprocessing.md
@ -24,7 +24,7 @@ This document describes how vLLM deals with these challenges.
 [Python multiprocessing methods](https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods) include:

 - `spawn` - spawn a new Python process. This will be the default as of Python
-  3.14. In macOS, this is already the default.
+  3.14.

 - `fork` - Use `os.fork()` to fork the Python interpreter. This is the default
  in Python versions prior to 3.14.
@ -34,7 +34,7 @@ This document describes how vLLM deals with these challenges.
 ### Tradeoffs

 `fork` is the fastest method, but is incompatible with dependencies that use
-threads. If you are under macOS, using `fork` may cause the process to crash.
+threads.

 `spawn` is more compatible with dependencies, but can be problematic when vLLM
 is used as a library. If the consuming code does not use a `__main__` guard (`if
--- a/docs/source/design/v1/torch_compile.md
+++ b/docs/source/design/v1/torch_compile.md
@ -126,7 +126,7 @@ Unfortunately, because auto-tuning takes quite a long time (from seconds to minu

 ## Cudagraph Capture

-vLLM's V1 architecture uses piecewise cudagraph. The full computation graph is split as mentioned above, and we only capture the cudagraph for the piece of graph between attention operations (including the first graph before any attention operation, and the last graph after all the attention operation). This is based on a common observation: computation between attentions are usually token-wise and easy to deal with for cudagraph; while the attention operation is non-trivial to be cudagraph compatible. Thus, by running the attention operation in eager mode while the rest operations in cudagraph, we keep the flexibility of the attention operation.
+vLLM's V1 architecture uses piecewise cudagraph. The full computation graph is split as mentioned above, and we only capture the cudagraph for the piece of graph between attention operations (including the first graph before any attention operation, and the last graph after all the attention operation). This is based on a common observation: computation between attentions are usually token-wise and easy to deal with for cudagraph; while the attention operation is non-trivial to be cudagraph compatible. Thus, by running the attention operation in eager mode while the rest operations in cudagraph, we keep the flexibility of the attention operation.

 The piecewise cudagraph also has fine-grained memory management. The purpose is to only exclude the attention kernel from cudagraph, while keeping all the rest modules and the memory allocation operations in the cudagraph. This is why the attention operation in V1 has the output tensor as the input of the attention.

--- a/docs/source/features/quantization/bnb.md
+++ b/docs/source/features/quantization/bnb.md
@ -19,20 +19,17 @@ And usually, these repositories have a config.json file that includes a quantiza

 ## Read quantized checkpoint

-For pre-quantized checkpoints, vLLM will try to infer the quantization method from the config file, so you don't need to explicitly specify the quantization argument.
-
 ```python
 from vllm import LLM
 import torch
 # unsloth/tinyllama-bnb-4bit is a pre-quantized checkpoint.
 model_id = "unsloth/tinyllama-bnb-4bit"
-llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True)
+llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, \
+quantization="bitsandbytes")
 ```

 ## Inflight quantization: load as 4bit quantization

-For inflight 4bit quantization with BitsAndBytes, you need to explicitly specify the quantization argument.
-
 ```python
 from vllm import LLM
 import torch
@ -43,7 +40,7 @@ quantization="bitsandbytes")

 ## OpenAI Compatible Server

-Append the following to your model arguments for 4bit inflight quantization:
+Append the following to your 4bit model arguments:

 ```console
 --quantization bitsandbytes
--- a/docs/source/features/quantization/gguf.md
+++ b/docs/source/features/quantization/gguf.md
@ -29,7 +29,7 @@ vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf --tokenizer TinyLlama/TinyLlam
 We recommend using the tokenizer from base model instead of GGUF model. Because the tokenizer conversion from GGUF is time-consuming and unstable, especially for some models with large vocab size.
 :::

-GGUF assumes that huggingface can convert the metadata to a config file. In case huggingface doesn't support your model you can manually create a config and pass it as hf-config-path
+GGUF assumes that huggingface can convert the metadata to a config file. In case huggingface doesn't support your model you can manually create a config and pass it as hf-config-path

 ```console
 # If you model is not supported by huggingface you can manually provide a huggingface compatible config path
--- a/docs/source/features/quantization/index.md
+++ b/docs/source/features/quantization/index.md
@ -16,6 +16,5 @@ gptqmodel
 int4
 int8
 fp8
-quark
 quantized_kvcache
 :::
--- a/docs/source/features/quantization/quark.md
+++ b/docs/source/features/quantization/quark.md
@ -1,217 +0,0 @@
-(quark)=
-
-# AMD QUARK
-
-Quantization can effectively reduce memory and bandwidth usage, accelerate computation and improve
-throughput while with minimal accuracy loss. vLLM can leverage [Quark](https://quark.docs.amd.com/latest/),
-the flexible and powerful quantization toolkit, to produce performant quantized models to run on AMD GPUs. Quark has specialized support for quantizing large language models with weight,
-activation and kv-cache quantization and cutting-edge quantization algorithms like
-AWQ, GPTQ, Rotation and SmoothQuant.
-
-## Quark Installation
-
-Before quantizing models, you need to install Quark. The latest release of Quark can be installed with pip:
-
-```console
-pip install amd-quark
-```
-
-You can refer to [Quark installation guide](https://quark.docs.amd.com/latest/install.html)
-for more installation details.
-
-## Quantization Process
-
-After installing Quark, we will use an example to illustrate how to use Quark.  
-The Quark quantization process can be listed for 5 steps as below:
-
-1. Load the model
-2. Prepare the calibration dataloader
-3. Set the quantization configuration
-4. Quantize the model and export
-5. Evaluation in vLLM
-
-### 1. Load the Model
-
-Quark uses [Transformers](https://huggingface.co/docs/transformers/en/index)
-to fetch model and tokenizer.
-
-```python
-from transformers import AutoTokenizer, AutoModelForCausalLM
-
-MODEL_ID = "meta-llama/Llama-2-70b-chat-hf"
-MAX_SEQ_LEN = 512
-
-model = AutoModelForCausalLM.from_pretrained(
-    MODEL_ID, device_map="auto", torch_dtype="auto",
-)
-model.eval()
-
-tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, model_max_length=MAX_SEQ_LEN)
-tokenizer.pad_token = tokenizer.eos_token
-```
-
-### 2. Prepare the Calibration Dataloader
-
-Quark uses the [PyTorch Dataloader](https://pytorch.org/tutorials/beginner/basics/data_tutorial.html)
-to load calibration data. For more details about how to use calibration datasets efficiently, please refer
-to [Adding Calibration Datasets](https://quark.docs.amd.com/latest/pytorch/calibration_datasets.html).
-
-```python
-from datasets import load_dataset
-from torch.utils.data import DataLoader
-
-BATCH_SIZE = 1
-NUM_CALIBRATION_DATA = 512
-
-# Load the dataset and get calibration data.
-dataset = load_dataset("mit-han-lab/pile-val-backup", split="validation")
-text_data = dataset["text"][:NUM_CALIBRATION_DATA]
-
-tokenized_outputs = tokenizer(text_data, return_tensors="pt",
-    padding=True, truncation=True, max_length=MAX_SEQ_LEN)
-calib_dataloader = DataLoader(tokenized_outputs['input_ids'],
-    batch_size=BATCH_SIZE, drop_last=True)
-```
-
-### 3. Set the Quantization Configuration
-
-We need to set the quantization configuration, you can check
-[quark config guide](https://quark.docs.amd.com/latest/pytorch/user_guide_config_description.html)
-for further details. Here we use FP8 per-tensor quantization on weight, activation,
-kv-cache and the quantization algorithm is AutoSmoothQuant.
-
-:::{note}
-Note the quantization algorithm needs a JSON config file and the config file is located in
-[Quark Pytorch examples](https://quark.docs.amd.com/latest/pytorch/pytorch_examples.html),
-under the directory `examples/torch/language_modeling/llm_ptq/models`. For example,
-AutoSmoothQuant config file for Llama is
-`examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json`.
-:::
-
-```python
-from quark.torch.quantization import (Config, QuantizationConfig,
-                                     FP8E4M3PerTensorSpec,
-                                     load_quant_algo_config_from_file)
-
-# Define fp8/per-tensor/static spec.
-FP8_PER_TENSOR_SPEC = FP8E4M3PerTensorSpec(observer_method="min_max",
-    is_dynamic=False).to_quantization_spec()
-
-# Define global quantization config, input tensors and weight apply FP8_PER_TENSOR_SPEC.
-global_quant_config = QuantizationConfig(input_tensors=FP8_PER_TENSOR_SPEC,
-    weight=FP8_PER_TENSOR_SPEC)
-
-# Define quantization config for kv-cache layers, output tensors apply FP8_PER_TENSOR_SPEC.
-KV_CACHE_SPEC = FP8_PER_TENSOR_SPEC
-kv_cache_layer_names_for_llama = ["*k_proj", "*v_proj"]
-kv_cache_quant_config = {name :
-    QuantizationConfig(input_tensors=global_quant_config.input_tensors,
-                       weight=global_quant_config.weight,
-                       output_tensors=KV_CACHE_SPEC)
-    for name in kv_cache_layer_names_for_llama}
-layer_quant_config = kv_cache_quant_config.copy()
-
-# Define algorithm config by config file.
-LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE =
-    'examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json'
-algo_config = load_quant_algo_config_from_file(LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE)
-
-EXCLUDE_LAYERS = ["lm_head"]
-quant_config = Config(
-    global_quant_config=global_quant_config,
-    layer_quant_config=layer_quant_config,
-    kv_cache_quant_config=kv_cache_quant_config,
-    exclude=EXCLUDE_LAYERS,
-    algo_config=algo_config)
-```
-
-### 4. Quantize the Model and Export
-
-Then we can apply the quantization. After quantizing, we need to freeze the
-quantized model first before exporting. Note that we need to export model with format of
-HuggingFace `safetensors`, you can refer to
-[HuggingFace format exporting](https://quark.docs.amd.com/latest/pytorch/export/quark_export_hf.html)
-for more exporting format details.
-
-```python
-import torch
-from quark.torch import ModelQuantizer, ModelExporter
-from quark.torch.export import ExporterConfig, JsonExporterConfig
-
-# Apply quantization.
-quantizer = ModelQuantizer(quant_config)
-quant_model = quantizer.quantize_model(model, calib_dataloader)
-
-# Freeze quantized model to export.
-freezed_model = quantizer.freeze(model)
-
-# Define export config.
-LLAMA_KV_CACHE_GROUP = ["*k_proj", "*v_proj"]
-export_config = ExporterConfig(json_export_config=JsonExporterConfig())
-export_config.json_export_config.kv_cache_group = LLAMA_KV_CACHE_GROUP
-
-EXPORT_DIR = MODEL_ID.split("/")[1] + "-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant"
-exporter = ModelExporter(config=export_config, export_dir=EXPORT_DIR)
-with torch.no_grad():
-    exporter.export_safetensors_model(freezed_model,
-        quant_config=quant_config, tokenizer=tokenizer)
-```
-
-### 5. Evaluation in vLLM
-
-Now, you can load and run the Quark quantized model directly through the LLM entrypoint:
-
-```python
-from vllm import LLM, SamplingParams
-
-# Sample prompts.
-prompts = [
-    "Hello, my name is",
-    "The president of the United States is",
-    "The capital of France is",
-    "The future of AI is",
-]
-# Create a sampling params object.
-sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
-
-# Create an LLM.
-llm = LLM(model="Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant",
-          kv_cache_dtype='fp8',quantization='quark')
-# Generate texts from the prompts. The output is a list of RequestOutput objects
-# that contain the prompt, generated text, and other information.
-outputs = llm.generate(prompts, sampling_params)
-# Print the outputs.
-print("\nGenerated Outputs:\n" + "-" * 60)
-for output in outputs:
-    prompt = output.prompt
-    generated_text = output.outputs[0].text
-    print(f"Prompt:    {prompt!r}")
-    print(f"Output:    {generated_text!r}")
-    print("-" * 60)
-```
-
-Or, you can use `lm_eval` to evaluate accuracy:
-
-```console
-$ lm_eval --model vllm \
-  --model_args pretrained=Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant,kv_cache_dtype='fp8',quantization='quark' \
-  --tasks gsm8k
-```
-
-## Quark Quantization Script
-In addition to the example of Python API above, Quark also offers a
-[quantization script](https://quark.docs.amd.com/latest/pytorch/example_quark_torch_llm_ptq.html)
-to quantize large language models more conveniently. It supports quantizing models with variety
-of different quantization schemes and optimization algorithms. It can export the quantized model
-and run evaluation tasks on the fly. With the script, the example above can be:
-
-```console
-python3 quantize_quark.py --model_dir meta-llama/Llama-2-70b-chat-hf \
-                          --output_dir /path/to/output \
-                          --quant_scheme w_fp8_a_fp8 \
-                          --kv_cache_dtype fp8 \
-                          --quant_algo autosmoothquant \
-                          --num_calib_data 512 \
-                          --model_export hf_format \
-                          --tasks gsm8k
-```
--- a/docs/source/features/reasoning_outputs.md
+++ b/docs/source/features/reasoning_outputs.md
@ -136,14 +136,7 @@ Remember to check whether the `reasoning_content` exists in the response before

 ## Structured output

-The reasoning content is also available in the structured output. The structured output engine like `xgrammar` will use the reasoning content to generate structured output. It is only supported in v0 engine now.
-
-```bash
-VLLM_USE_V1=0 vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
-    --enable-reasoning --reasoning-parser deepseek_r1
-```
-
-Please note that the `VLLM_USE_V1` environment variable must be set to `0` to use the v0 engine.
+The reasoning content is also available in the structured output. The structured output engine like `xgrammar` will use the reasoning content to generate structured output.

 ```python
 from openai import OpenAI
--- a/docs/source/features/spec_decode.md
+++ b/docs/source/features/spec_decode.md
@ -52,7 +52,7 @@ python -m vllm.entrypoints.openai.api_server --host 0.0.0.0 --port 8000 --model
 ```

 :::{warning}
-Note: Please use `--speculative_config` to set all configurations related to speculative decoding. The previous method of specifying the model through `--speculative_model` and adding related parameters (e.g., `--num_speculative_tokens`) separately has been deprecated now.
+Note: Please use `--speculative_config` to set all configurations related to speculative decoding. The previous method of specifying the model through `--speculative_model` and adding related parameters (e.g., `--num_speculative_tokens`) separately will be deprecated in the next release.
 :::

 Then use a client:
--- a/docs/source/features/tool_calling.md
+++ b/docs/source/features/tool_calling.md
@ -1,6 +1,6 @@
 # Tool Calling

-vLLM currently supports named function calling, as well as the `auto`, `required` (as of `vllm>=0.8.3`) and `none` options for the `tool_choice` field in the chat completion API.
+vLLM currently supports named function calling, as well as the `auto` and `none` options for the `tool_choice` field in the chat completion API. The `tool_choice` option `required` is **not yet supported** but [on the roadmap](gh-issue:13002).

 ## Quickstart

@ -91,12 +91,6 @@ For best results, we recommend ensuring that the expected output format / schema
 To use a named function, you need to define the functions in the `tools` parameter of the chat completion request, and
 specify the `name` of one of the tools in the `tool_choice` parameter of the chat completion request.

-## Required Function Calling
-
-vLLM supports the `tool_choice='required'` option in the chat completion API. Similar to the named function calling, it also uses guided decoding, so this is enabled by default and will work with any supported model. The required guided decoding features (JSON schema with `anyOf`) are currently only supported in the V0 engine with the guided decoding backend `outlines`. However, support for alternative decoding backends are on the [roadmap](https://docs.vllm.ai/en/latest/getting_started/v1_user_guide.html#feature-model) for the V1 engine.
-
-When tool_choice='required' is set, the model is guaranteed to generate one or more tool calls based on the specified tool list in the `tools` parameter. The number of tool calls depends on the user's query. The output format strictly follows the schema defined in the `tools` parameter.
-
 ## Automatic Function Calling

 To enable this feature, you should set the following flags:
--- a/docs/source/generate_examples.py
+++ b/docs/source/generate_examples.py
@ -17,7 +17,6 @@ def fix_case(text: str) -> str:
        "cli": "CLI",
        "cpu": "CPU",
        "llm": "LLM",
-        "mae": "MAE",
        "tpu": "TPU",
        "aqlm": "AQLM",
        "gguf": "GGUF",
@ -25,7 +24,6 @@ def fix_case(text: str) -> str:
        "rlhf": "RLHF",
        "vllm": "vLLM",
        "openai": "OpenAI",
-        "lmcache": "LMCache",
        "multilora": "MultiLoRA",
        "mlpspeculator": "MLPSpeculator",
        r"fp\d+": lambda x: x.group(0).upper(),  # e.g. fp16, fp32
--- a/docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md
+++ b/docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md
@ -86,7 +86,7 @@ Currently, there are no pre-built Intel Gaudi images.
 ### Build image from source

 ```console
-docker build -f docker/Dockerfile.hpu -t vllm-hpu-env  .
+docker build -f Dockerfile.hpu -t vllm-hpu-env  .
 docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --rm vllm-hpu-env
 ```

--- a/docs/source/getting_started/installation/ai_accelerator/neuron.inc.md
+++ b/docs/source/getting_started/installation/ai_accelerator/neuron.inc.md
@ -132,7 +132,7 @@ Currently, there are no pre-built Neuron images.

 See <project:#deployment-docker-build-image-from-source> for instructions on building the Docker image.

-Make sure to use <gh-file:docker/Dockerfile.neuron> in place of the default Dockerfile.
+Make sure to use <gh-file:Dockerfile.neuron> in place of the default Dockerfile.

 ## Extra information

--- a/docs/source/getting_started/installation/ai_accelerator/tpu.inc.md
+++ b/docs/source/getting_started/installation/ai_accelerator/tpu.inc.md
@ -169,10 +169,10 @@ See <project:#deployment-docker-pre-built-image> for instructions on using the o

 ### Build image from source

-You can use <gh-file:docker/Dockerfile.tpu> to build a Docker image with TPU support.
+You can use <gh-file:Dockerfile.tpu> to build a Docker image with TPU support.

 ```console
-docker build -f docker/Dockerfile.tpu -t vllm-tpu .
+docker build -f Dockerfile.tpu -t vllm-tpu .
 ```

 Run the Docker image with the following command:
--- a/docs/source/getting_started/installation/cpu.md
+++ b/docs/source/getting_started/installation/cpu.md
@ -159,45 +159,26 @@ Currently, there are no pre-built CPU wheels.

 ### Pre-built images

-:::::{tab-set}
-:sync-group: device
-
-::::{tab-item} Intel/AMD x86
-:sync: x86
-
-:::{include} cpu/x86.inc.md
-:start-after: "### Pre-built images"
-:end-before: "### Build image from source"
-:::
-
-::::
-
-:::::
+Currently, there are no pre-built CPU images.

 ### Build image from source

 ```console
-$ docker build -f docker/Dockerfile.cpu --tag vllm-cpu-env --target vllm-openai .
-
-# Launching OpenAI server 
-$ docker run --rm \
-             --privileged=true \
-             --shm-size=4g \
-             -p 8000:8000 \
-             -e VLLM_CPU_KVCACHE_SPACE=<KV cache space> \
-             -e VLLM_CPU_OMP_THREADS_BIND=<CPU cores for inference> \
-             vllm-cpu-env \
-             --model=meta-llama/Llama-3.2-1B-Instruct \
-             --dtype=bfloat16 \
-             other vLLM OpenAI server arguments
+$ docker build -f Dockerfile.cpu -t vllm-cpu-env --shm-size=4g .
+$ docker run -it \
+             --rm \
+             --network=host \
+             --cpuset-cpus=<cpu-id-list, optional> \
+             --cpuset-mems=<memory-node, optional> \
+             vllm-cpu-env
 ```

 ::::{tip}
-For ARM or Apple silicon, use `docker/Dockerfile.arm`
+For ARM or Apple silicon, use `Dockerfile.arm`
 ::::

 ::::{tip}
-For IBM Z (s390x), use `docker/Dockerfile.s390x` and in `docker run` use flag `--dtype float`
+For IBM Z (s390x), use `Dockerfile.s390x` and in `docker run` use flag `--dtype float`
 ::::

 ## Supported features
@ -272,14 +253,12 @@ $ python examples/offline_inference/basic/basic.py

 - Decouple the HTTP serving components from the inference components. In a GPU backend configuration, the HTTP serving and tokenization tasks operate on the CPU, while inference runs on the GPU, which typically does not pose a problem. However, in a CPU-based setup, the HTTP serving and tokenization can cause significant context switching and reduced cache efficiency. Therefore, it is strongly recommended to segregate these two components for improved performance.

- On CPU based setup with NUMA enabled, the memory access performance may be largely impacted by the [topology](https://github.com/intel/intel-extension-for-pytorch/blob/main/docs/tutorials/performance_tuning/tuning_guide.inc.md#non-uniform-memory-access-numa). For NUMA architecture, Tensor Parallel is a option for better performance.
+- On CPU based setup with NUMA enabled, the memory access performance may be largely impacted by the [topology](https://github.com/intel/intel-extension-for-pytorch/blob/main/docs/tutorials/performance_tuning/tuning_guide.inc.md#non-uniform-memory-access-numa). For NUMA architecture, two optimizations are recommended: Tensor Parallel or Data Parallel.

-  - Tensor Parallel is supported for serving and offline inferencing. In general each NUMA node is treated as one GPU card. Below is the example script to enable Tensor Parallel = 2 for serving:
+  - Using Tensor Parallel for latency-constrained deployments: following the GPU backend design, Megatron-LM's parallel algorithm will be used to shard the model, based on the number of NUMA nodes (e.g. TP = 2 for a two NUMA node system). With [TP feature on CPU](gh-pr:6125) merged, Tensor Parallel is supported for serving and offline inferencing. In general each NUMA node is treated as one GPU card. Below is the example script to enable Tensor Parallel = 2 for serving:

    ```console
    VLLM_CPU_KVCACHE_SPACE=40 VLLM_CPU_OMP_THREADS_BIND="0-31|32-63" vllm serve meta-llama/Llama-2-7b-chat-hf -tp=2 --distributed-executor-backend mp
    ```

-  - For each thread id list in `VLLM_CPU_OMP_THREADS_BIND`, users should guarantee threads in the list belong to a same NUMA node.
-
-  - Meanwhile, users should also take care of memory capacity of each NUMA node. The memory usage of each TP rank is the sum of `weight shard size` and `VLLM_CPU_KVCACHE_SPACE`, if it exceeds the capacity of a single NUMA node, TP worker will be killed due to out-of-memory.
+  - Using Data Parallel for maximum throughput: launch an LLM serving endpoint on each NUMA node along with one additional load balancer to dispatch the requests to those endpoints. Common solutions like [Nginx](#nginxloadbalancer) or HAProxy are recommended. Anyscale Ray project provides the feature on LLM [serving](https://docs.ray.io/en/latest/serve/index.html). Here is the example to set up a scalable LLM serving with [Ray Serve](https://github.com/intel/llm-on-ray/blob/main/docs/setup.inc.md).
--- a/docs/source/getting_started/installation/cpu/x86.inc.md
+++ b/docs/source/getting_started/installation/cpu/x86.inc.md
@ -34,8 +34,6 @@ There are no pre-built wheels or images for this device, so you must build vLLM

 ### Pre-built images

-See [https://gallery.ecr.aws/q9t5s3a7/vllm-cpu-release-repo](https://gallery.ecr.aws/q9t5s3a7/vllm-cpu-release-repo)
-
 ### Build image from source

 ## Extra information
--- a/docs/source/getting_started/installation/gpu/rocm.inc.md
+++ b/docs/source/getting_started/installation/gpu/rocm.inc.md
@ -8,7 +8,7 @@ There are no pre-built wheels for this device, so you must either use the pre-bu

 ## Requirements

- GPU: MI200s (gfx90a), MI300 (gfx942), Radeon RX 7900 series (gfx1100/1101), Radeon RX 9000 series (gfx1200/1201)
+- GPU: MI200s (gfx90a), MI300 (gfx942), Radeon RX 7900 series (gfx1100)
 - ROCm 6.3

 ## Set up using Python
@ -31,7 +31,7 @@ Currently, there are no pre-built ROCm wheels.
    ```console
    # Install PyTorch
    $ pip uninstall torch -y
-    $ pip install --no-cache-dir --pre torch --index-url https://download.pytorch.org/whl/nightly/rocm6.3
+    $ pip install --no-cache-dir --pre torch --index-url https://download.pytorch.org/whl/rocm6.3
    ```

 1. Install [Triton flash attention for ROCm](https://github.com/ROCm/triton)
@ -123,7 +123,7 @@ Building the Docker image from source is the recommended way to use vLLM with RO

 #### (Optional) Build an image with ROCm software stack

-Build a docker image from <gh-file:docker/Dockerfile.rocm_base>, which sets up the ROCm software stack needed by vLLM.
+Build a docker image from <gh-file:Dockerfile.rocm_base>, which sets up the ROCm software stack needed by vLLM.
 **This step is optional, as this rocm_base image is usually prebuilt and stored on [Docker Hub](https://hub.docker.com/r/rocm/vllm-dev) under tag `rocm/vllm-dev:base` to speed up the user experience.**
 If you choose to build this rocm_base image yourself, the steps are as follows.

@ -140,12 +140,12 @@ It is important that the user kicks off the docker build using buildkit. Either
 To build vllm on ROCm 6.3 for MI200 and MI300 series, you can use the default:

 ```console
-DOCKER_BUILDKIT=1 docker build -f docker/Dockerfile.rocm_base -t rocm/vllm-dev:base .
+DOCKER_BUILDKIT=1 docker build -f Dockerfile.rocm_base -t rocm/vllm-dev:base .
 ```

 #### Build an image with vLLM

-First, build a docker image from <gh-file:docker/Dockerfile.rocm> and launch a docker container from the image.
+First, build a docker image from <gh-file:Dockerfile.rocm> and launch a docker container from the image.
 It is important that the user kicks off the docker build using buildkit. Either the user sets `DOCKER_BUILDKIT=1` as an environment variable when calling the docker build command, or the user needs to set up buildkit in the docker daemon configuration /etc/docker/daemon.json as follows and restart the daemon:

 ```console
@ -156,10 +156,10 @@ It is important that the user kicks off the docker build using buildkit. Either
 }
 ```

-<gh-file:docker/Dockerfile.rocm> uses ROCm 6.3 by default, but also supports ROCm 5.7, 6.0, 6.1, and 6.2, in older vLLM branches.
+<gh-file:Dockerfile.rocm> uses ROCm 6.3 by default, but also supports ROCm 5.7, 6.0, 6.1, and 6.2, in older vLLM branches.
 It provides flexibility to customize the build of docker image using the following arguments:

- `BASE_IMAGE`: specifies the base image used when running `docker build`. The default value `rocm/vllm-dev:base` is an image published and maintained by AMD. It is being built using <gh-file:docker/Dockerfile.rocm_base>
+- `BASE_IMAGE`: specifies the base image used when running `docker build`. The default value `rocm/vllm-dev:base` is an image published and maintained by AMD. It is being built using <gh-file:Dockerfile.rocm_base>
 - `USE_CYTHON`: An option to run cython compilation on a subset of python files upon docker build
 - `BUILD_RPD`: Include RocmProfileData profiling tool in the image
 - `ARG_PYTORCH_ROCM_ARCH`: Allows overriding the gfx architecture values from the base docker image
@ -169,13 +169,13 @@ Their values can be passed in when running `docker build` with `--build-arg` opt
 To build vllm on ROCm 6.3 for MI200 and MI300 series, you can use the default:

 ```console
-DOCKER_BUILDKIT=1 docker build -f docker/Dockerfile.rocm -t vllm-rocm .
+DOCKER_BUILDKIT=1 docker build -f Dockerfile.rocm -t vllm-rocm .
 ```

 To build vllm on ROCm 6.3 for Radeon RX7900 series (gfx1100), you should pick the alternative base image:

 ```console
-DOCKER_BUILDKIT=1 docker build --build-arg BASE_IMAGE="rocm/vllm-dev:navi_base" -f docker/Dockerfile.rocm -t vllm-rocm .
+DOCKER_BUILDKIT=1 docker build --build-arg BASE_IMAGE="rocm/vllm-dev:navi_base" -f Dockerfile.rocm -t vllm-rocm .
 ```

 To run the above docker image `vllm-rocm`, use the below command:
--- a/docs/source/getting_started/installation/gpu/xpu.inc.md
+++ b/docs/source/getting_started/installation/gpu/xpu.inc.md
@ -54,7 +54,7 @@ Currently, there are no pre-built XPU images.
 ### Build image from source

 ```console
-$ docker build -f docker/Dockerfile.xpu -t vllm-xpu-env --shm-size=4g .
+$ docker build -f Dockerfile.xpu -t vllm-xpu-env --shm-size=4g .
 $ docker run -it \
             --rm \
             --network=host \
--- a/docs/source/getting_started/installation/python_env_setup.inc.md
+++ b/docs/source/getting_started/installation/python_env_setup.inc.md
@ -1,4 +1,4 @@
-You can create a new Python environment using [conda](https://docs.conda.io/projects/conda/en/stable/user-guide/getting-started.html):
+You can create a new Python environment using `conda`:

 ```console
 # (Recommended) Create a new conda environment.
--- a/docs/source/getting_started/quickstart.md
+++ b/docs/source/getting_started/quickstart.md
@ -208,5 +208,5 @@ Currently, vLLM supports multiple backends for efficient Attention computation a
 If desired, you can also manually set the backend of your choice by configuring the environment variable `VLLM_ATTENTION_BACKEND` to one of the following options: `FLASH_ATTN`, `FLASHINFER` or `XFORMERS`.

 ```{attention}
-There are no pre-built vllm wheels containing Flash Infer, so you must install it in your environment first. Refer to the [Flash Infer official docs](https://docs.flashinfer.ai/) or see <gh-file:docker/Dockerfile> for instructions on how to install it.
+There are no pre-built vllm wheels containing Flash Infer, so you must install it in your environment first. Refer to the [Flash Infer official docs](https://docs.flashinfer.ai/) or see [Dockerfile](https://github.com/vllm-project/vllm/blob/main/Dockerfile) for instructions on how to install it.
 ```
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Robert Shaw	4c42267293	updated Signed-off-by: Robert Shaw <robshaw@redhat.com>	2025-03-28 02:26:20 +00:00
Robert Shaw	24f68342b4	updated Signed-off-by: Robert Shaw <robshaw@redhat.com>	2025-03-28 02:17:42 +00:00
Robert Shaw	c5d963835b	updated Signed-off-by: Robert Shaw <robshaw@redhat.com>	2025-03-28 01:54:01 +00:00
rshaw@neuralmagic.com	b313220727	updates Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>	2025-03-27 23:51:36 +00:00