format

Fix hashing logic for non-full blocks
2024-06-02 00:02:54 +00:00 · 2024-06-02 00:01:30 +00:00
250 changed files with 3020 additions and 9959 deletions
--- a/.buildkite/nightly-benchmarks/kickoff-pipeline.sh
+++ b/.buildkite/nightly-benchmarks/kickoff-pipeline.sh
@ -1,26 +0,0 @@
-#!/usr/bin/env bash
-
-set -euo pipefail
-
-# Install system packages
-apt update
-apt install -y curl jq
-
-# Install minijinja for templating
-curl -sSfL https://github.com/mitsuhiko/minijinja/releases/latest/download/minijinja-cli-installer.sh | sh
-source $HOME/.cargo/env
-
-# If BUILDKITE_PULL_REQUEST != "false", then we check the PR labels using curl and jq
-if [ "$BUILDKITE_PULL_REQUEST" != "false" ]; then
-  PR_LABELS=$(curl -s "https://api.github.com/repos/vllm-project/vllm/pulls/$BUILDKITE_PULL_REQUEST" | jq -r '.labels[].name')
-
-  if [[ $PR_LABELS == *"perf-benchmarks"* ]]; then
-    echo "This PR has the 'perf-benchmarks' label. Proceeding with the nightly benchmarks."
-  else
-    echo "This PR does not have the 'perf-benchmarks' label. Skipping the nightly benchmarks."
-    exit 0
-  fi
-fi
-
-# Upload sample.yaml
-buildkite-agent pipeline upload .buildkite/nightly-benchmarks/sample.yaml
--- a/.buildkite/nightly-benchmarks/sample.yaml
+++ b/.buildkite/nightly-benchmarks/sample.yaml
@ -1,39 +0,0 @@
-steps:
-  # NOTE(simon): You can create separate blocks for different jobs
-  - label: "A100: NVIDIA SMI"
-    agents:
-      queue: A100
-    plugins:
-    - kubernetes:
-        podSpec:
-          containers:
-          # - image: us-central1-docker.pkg.dev/vllm-405802/vllm-ci-test-repo/vllm-test:$BUILDKITE_COMMIT
-          # TODO(simon): check latest main branch or use the PR image.
-          - image: us-central1-docker.pkg.dev/vllm-405802/vllm-ci-test-repo/vllm-test:45c35f0d58f4508bf43bd6af1d3d0d0ec0c915e6
-            command:
-            - bash -c 'nvidia-smi && nvidia-smi topo -m && pwd && ls'
-            resources:
-              limits:
-                nvidia.com/gpu: 8
-            volumeMounts:
-            - name: devshm
-              mountPath: /dev/shm
-          nodeSelector:
-            nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
-          volumes:
-          - name: devshm
-            emptyDir:
-              medium: Memory
-  # TODO(simon): bring H100 online
-  # - label: "H100: NVIDIA SMI"
-  #   agents:
-  #     queue: H100
-  #   plugins:
-  #   - docker#v5.11.0:
-  #       image: us-central1-docker.pkg.dev/vllm-405802/vllm-ci-test-repo/vllm-test:45c35f0d58f4508bf43bd6af1d3d0d0ec0c915e6
-  #       command:
-  #       - bash -c 'nvidia-smi && nvidia-smi topo -m'
-  #       propagate-environment: true
-  #       ipc: host
-  #       gpus: all
-
--- a/.buildkite/run-benchmarks.sh
+++ b/.buildkite/run-benchmarks.sh
@ -50,16 +50,16 @@ echo "### Serving Benchmarks" >> benchmark_results.md
 sed -n '1p' benchmark_serving.txt >> benchmark_results.md # first line
 echo "" >> benchmark_results.md
 echo '```' >> benchmark_results.md
-tail -n 24 benchmark_serving.txt >> benchmark_results.md # last 24 lines
+tail -n 20 benchmark_serving.txt >> benchmark_results.md # last 20 lines
 echo '```' >> benchmark_results.md

 # if the agent binary is not found, skip uploading the results, exit 0
-if [ ! -f /usr/bin/buildkite-agent ]; then
+if [ ! -f /workspace/buildkite-agent ]; then
    exit 0
 fi

 # upload the results to buildkite
-buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md
+/workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md

 # exit with the exit code of the benchmarks
 if [ $bench_latency_exit_code -ne 0 ]; then
@ -75,4 +75,4 @@ if [ $bench_serving_exit_code -ne 0 ]; then
 fi

 rm ShareGPT_V3_unfiltered_cleaned_split.json
-buildkite-agent artifact upload "*.json"
+/workspace/buildkite-agent artifact upload "*.json"
--- a/.buildkite/run-cpu-test.sh
+++ b/.buildkite/run-cpu-test.sh
@ -10,15 +10,5 @@ remove_docker_container() { docker rm -f cpu-test || true; }
 trap remove_docker_container EXIT
 remove_docker_container

-# Run the image
-docker run -itd -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 --cpuset-mems=1 --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test cpu-test
-
-# offline inference
-docker exec cpu-test bash -c "python3 examples/offline_inference.py"
-
-# Run basic model test
-docker exec cpu-test bash -c "cd tests;
-  pip install pytest Pillow protobuf
-  bash ../.buildkite/download-images.sh
-  cd ../
-  pytest -v -s tests/models --ignore=tests/models/test_llava.py --ignore=tests/models/test_embedding.py --ignore=tests/models/test_registry.py"
+# Run the image and launch offline inference
+docker run --network host --env VLLM_CPU_KVCACHE_SPACE=1 --name cpu-test cpu-test python3 vllm/examples/offline_inference.py
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@ -37,7 +37,6 @@ steps:
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  commands:
-  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py
  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
@ -46,8 +45,7 @@ steps:
  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
-  - pytest -v -s spec_decode/e2e/test_integration_dist.py
-  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
+  - pytest -v -s spec_decode/e2e/test_integration_dist.py 

 - label: Distributed Tests (Multiple Groups)
  #mirror_hardwares: [amd]
@ -64,6 +62,7 @@ steps:
  mirror_hardwares: [amd]

  commands:
+  - pytest -v -s test_inputs.py
  - pytest -v -s entrypoints -m llm
  - pytest -v -s entrypoints -m openai

@ -80,13 +79,6 @@ steps:
    - python3 llava_example.py
    - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors

- label: Inputs Test
-  #mirror_hardwares: [amd]
-  commands:
-    - bash ../.buildkite/download-images.sh
-    - pytest -v -s test_inputs.py
-    - pytest -v -s multimodal
-
 - label: Kernels Test %N
  #mirror_hardwares: [amd]
  command: pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
@ -95,13 +87,14 @@ steps:
 - label: Models Test
  #mirror_hardwares: [amd]
  commands:
-    - pytest -v -s models -m \"not llava\"
+    - bash ../.buildkite/download-images.sh
+    - pytest -v -s models --ignore=models/test_llava.py

 - label: Llava Test
  mirror_hardwares: [amd]
  commands:
    - bash ../.buildkite/download-images.sh
-    - pytest -v -s models -m llava
+    - pytest -v -s models/test_llava.py

 - label: Prefix Caching Test
  mirror_hardwares: [amd]
@ -125,10 +118,7 @@ steps:

 - label: Speculative decoding tests
  #mirror_hardwares: [amd]
-  commands:
-    # See https://github.com/vllm-project/vllm/issues/5152
-    - export VLLM_ATTENTION_BACKEND=XFORMERS
-    - pytest -v -s spec_decode
+  command: pytest -v -s spec_decode

 - label: LoRA Test %N
  #mirror_hardwares: [amd]
@ -140,7 +130,14 @@ steps:
  num_gpus: 4
  # This test runs llama 13B, so it is required to run on 4 GPUs.
  commands:
-    - pytest -v -s -x lora/test_long_context.py
+    # Temporarily run this way because we cannot clean up GPU mem usage
+    # for multi GPU tests.
+    # TODO(sang): Fix it.
+    - pytest -v -s lora/test_long_context.py::test_rotary_emb_replaced
+    - pytest -v -s lora/test_long_context.py::test_batched_rope_kernel
+    - pytest -v -s lora/test_long_context.py::test_self_consistency
+    - pytest -v -s lora/test_long_context.py::test_quality
+    - pytest -v -s lora/test_long_context.py::test_max_len

 - label: Tensorizer Test
  #mirror_hardwares: [amd]
--- a/.buildkite/test-template-aws.j2
+++ b/.buildkite/test-template-aws.j2
@ -1,64 +0,0 @@
-{% set docker_image = "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT" %}
-{% set default_working_dir = "/vllm-workspace/tests" %}
-
-steps:
-  - label: ":docker: build image"
-    agents:
-      queue: cpu_queue
-    commands:
-      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "docker build --build-arg max_jobs=16 --tag {{ docker_image }} --target test --progress plain ."
-      - "docker push {{ docker_image }}"
-    env:
-      DOCKER_BUILDKIT: "1"
-    retry:
-      automatic:
-        - exit_status: -1  # Agent was lost
-          limit: 5
-        - exit_status: -10  # Agent was lost
-          limit: 5
-  - wait
-
-  {% for step in steps %}
-  - label: "{{ step.label }}"
-    agents:
-      {% if step.label == "Documentation Build" %}
-      queue: small_cpu_queue
-      {% elif step.no_gpu %}
-      queue: cpu_queue
-      {% elif step.num_gpus == 2 or step.num_gpus == 4 %}
-      queue: gpu_4_queue
-      {% else %}
-      queue: gpu_1_queue
-      {% endif %}
-    soft_fail: true
-    {% if step.parallelism %}
-    parallelism: {{ step.parallelism }}
-    {% endif %}
-    retry:
-      automatic:
-        - exit_status: -1  # Agent was lost
-          limit: 5
-        - exit_status: -10  # Agent was lost
-          limit: 5
-    plugins:
-      - docker#v5.2.0:
-          image: {{ docker_image }}
-          always-pull: true
-          propagate-environment: true
-          {% if not step.no_gpu %}
-          gpus: all
-          {% endif %}
-          {% if step.label == "Benchmarks" %}
-          mount-buildkite-agent: true
-          {% endif %}
-          command: ["bash", "-c", "cd {{ (step.working_dir or default_working_dir) | safe  }} && {{ step.command  or (step.commands | join(' && ')) | safe }}"]
-          environment:
-            - VLLM_USAGE_SOURCE=ci-test
-            - HF_TOKEN
-            {% if step.label == "Speculative decoding tests" %}
-            - VLLM_ATTENTION_BACKEND=XFORMERS
-            {% endif %}
-          volumes:
-            - /dev/shm:/dev/shm
-  {% endfor %}
--- a/.buildkite/test-template.j2
+++ b/.buildkite/test-template.j2
@ -4,7 +4,7 @@

 steps:
  - label: ":docker: build image"
-    commands:
+    commands: 
      - "docker build --build-arg max_jobs=16 --tag {{ docker_image }} --target test --progress plain ."
      - "docker push {{ docker_image }}"
    env:
@ -28,7 +28,6 @@ steps:
        command: bash .buildkite/run-amd-test.sh "cd {{ (step.working_dir or default_working_dir) | safe  }} ; {{ step.command  or (step.commands | join(" ; ")) | safe }}"
        env:
          DOCKER_BUILDKIT: "1"
-        soft_fail: true
    {% endif %}
    {% endfor %}

@ -37,12 +36,10 @@ steps:
    agents:
      queue: neuron
    command: bash .buildkite/run-neuron-test.sh
-    soft_fail: false
+    soft_fail: true

  - label: "Intel Test"
    depends_on: ~
-    agents:
-      queue: intel
    command: bash .buildkite/run-cpu-test.sh

  {% for step in steps %}
--- a/.github/workflows/mypy.yaml
+++ b/.github/workflows/mypy.yaml
@ -37,7 +37,6 @@ jobs:
        mypy vllm/distributed --config-file pyproject.toml
        mypy vllm/entrypoints --config-file pyproject.toml
        mypy vllm/executor --config-file pyproject.toml
-        mypy vllm/multimodal --config-file pyproject.toml
        mypy vllm/usage --config-file pyproject.toml
        mypy vllm/*.py --config-file pyproject.toml
        mypy vllm/transformers_utils --config-file pyproject.toml
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -66,6 +66,19 @@ endif()
 #
 find_package(Torch REQUIRED)

+#
+# Normally `torch.utils.cpp_extension.CUDAExtension` would add
+# `libtorch_python.so` for linking against an extension. Torch's cmake
+# configuration does not include this library (presumably since the cmake
+# config is used for standalone C++ binaries that link against torch).
+# The `libtorch_python.so` library defines some of the glue code between
+# torch/python via pybind and is required by VLLM extensions for this
+# reason. So, add it manually with `find_library` using torch's
+# installed library path.
+#
+find_library(torch_python_LIBRARY torch_python PATHS
+  "${TORCH_INSTALL_PREFIX}/lib")
+
 #
 # Forward the non-CUDA device extensions to external CMake scripts.
 #
@ -158,7 +171,7 @@ set(VLLM_EXT_SRC
  "csrc/quantization/fp8/common.cu"
  "csrc/cuda_utils_kernels.cu"
  "csrc/moe_align_block_size_kernels.cu"
-  "csrc/torch_bindings.cpp")
+  "csrc/pybind.cpp")

 if(VLLM_GPU_LANG STREQUAL "CUDA")
  include(FetchContent)
@ -205,7 +218,6 @@ define_gpu_extension_target(
  COMPILE_FLAGS ${VLLM_GPU_FLAGS}
  ARCHITECTURES ${VLLM_GPU_ARCHES}
  INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR};${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}
-  USE_SABI 3
  WITH_SOABI)

 #
@ -213,7 +225,7 @@ define_gpu_extension_target(
 #

 set(VLLM_MOE_EXT_SRC
-  "csrc/moe/torch_bindings.cpp"
+  "csrc/moe/moe_ops.cpp"
  "csrc/moe/topk_softmax_kernels.cu")

 define_gpu_extension_target(
@ -223,7 +235,6 @@ define_gpu_extension_target(
  SOURCES ${VLLM_MOE_EXT_SRC}
  COMPILE_FLAGS ${VLLM_GPU_FLAGS}
  ARCHITECTURES ${VLLM_GPU_ARCHES}
-  USE_SABI 3
  WITH_SOABI)

 #
@ -238,7 +249,7 @@ set(VLLM_PUNICA_EXT_SRC
  "csrc/punica/bgmv/bgmv_fp32_bf16_bf16.cu"
  "csrc/punica/bgmv/bgmv_fp32_fp16_fp16.cu"
  "csrc/punica/punica_ops.cu"
-  "csrc/punica/torch_bindings.cpp")
+  "csrc/punica/punica_pybind.cpp")

 #
 # Copy GPU compilation flags+update for punica
@ -275,7 +286,6 @@ if (VLLM_PUNICA_GPU_ARCHES)
    SOURCES ${VLLM_PUNICA_EXT_SRC}
    COMPILE_FLAGS ${VLLM_PUNICA_GPU_FLAGS}
    ARCHITECTURES ${VLLM_PUNICA_GPU_ARCHES}
-    USE_SABI 3
    WITH_SOABI)
 else()
  message(WARNING "Unable to create _punica_C target because none of the "
@ -301,9 +311,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP")
  message(STATUS "Enabling C extension.")
  add_dependencies(default _C)

-  message(STATUS "Enabling moe extension.")
-  add_dependencies(default _moe_C)
-
  # Enable punica if -DVLLM_INSTALL_PUNICA_KERNELS=ON or
  # VLLM_INSTALL_PUNICA_KERNELS is set in the environment and
  # there are supported target arches.
@ -313,3 +320,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP")
    add_dependencies(default _punica_C)
  endif()
 endif()
+
+if(VLLM_GPU_LANG STREQUAL "CUDA")
+  message(STATUS "Enabling moe extension.")
+  add_dependencies(default _moe_C)
+endif()
--- a/Dockerfile.cpu
+++ b/Dockerfile.cpu
@ -1,15 +1,13 @@
 # This vLLM Dockerfile is used to construct image that can build and run vLLM on x86 CPU platform.

-FROM ubuntu:22.04 AS cpu-test-1
+FROM ubuntu:22.04

 RUN apt-get update  -y \
    && apt-get install -y git wget vim numactl gcc-12 g++-12 python3 python3-pip \
    && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12

 RUN pip install --upgrade pip \
-    && pip install wheel packaging ninja "setuptools>=49.4.0" numpy
-
-FROM cpu-test-1 AS build
+    && pip install wheel packaging ninja setuptools>=49.4.0 numpy

 COPY ./ /workspace/vllm

@ -21,6 +19,4 @@ RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install

 WORKDIR /workspace/

-RUN ln -s /workspace/vllm/tests  && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
-
 CMD ["/bin/bash"]
--- a/Dockerfile.neuron
+++ b/Dockerfile.neuron
@ -28,7 +28,7 @@ COPY ./requirements-neuron.txt /app/vllm/requirements-neuron.txt
 RUN cd /app/vllm \
    && python3 -m pip install -U -r requirements-neuron.txt

-ENV VLLM_TARGET_DEVICE neuron
+ENV VLLM_BUILD_WITH_NEURON 1
 RUN cd /app/vllm \
    && pip install -e . \
    && cd ..
--- a/Dockerfile.rocm
+++ b/Dockerfile.rocm
@ -106,9 +106,8 @@ RUN --mount=type=cache,target=/root/.cache/pip \
    pip install -U -r requirements-rocm.txt \
    && patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h ./rocm_patch/rocm_bf16.patch \
    && python3 setup.py install \
-    && cp build/lib.linux-x86_64-cpython-39/vllm/_C.abi3.so vllm/ \
-    && cp build/lib.linux-x86_64-cpython-39/vllm/_punica_C.abi3.so vllm/ \
-    && cp build/lib.linux-x86_64-cpython-39/vllm/_moe_C.abi3.so vllm/ \
+    && cp build/lib.linux-x86_64-cpython-39/vllm/_C.cpython-39-x86_64-linux-gnu.so vllm/ \
+    && cp build/lib.linux-x86_64-cpython-39/vllm/_punica_C.cpython-39-x86_64-linux-gnu.so vllm/ \
    && cd ..


--- a/README.md
+++ b/README.md
@ -16,13 +16,6 @@ Easy, fast, and cheap LLM serving for everyone

 ---

-**Ray Summit CPF is Open (June 4th to June 20th)!**
-
-There will be a track for vLLM at the Ray Summit (09/30-10/02, SF) this year!
-If you have cool projects related to vLLM or LLM inference, we would love to see your proposals.
-This will be a great chance for everyone in the community to get together and learn.
-Please submit your proposal [here](https://raysummit.anyscale.com/flow/anyscale/raysummit2024/landing/page/eventsite)
-
 **The Fourth vLLM Bay Area Meetup (June 11th 5:30pm-8pm PT)**

 We are thrilled to announce our fourth vLLM Meetup!
@ -114,7 +107,6 @@ vLLM is a community project. Our compute resources for development and testing a
 - Replicate
 - Roblox
 - RunPod
- Sequoia Capital
 - Trainy
 - UC Berkeley
 - UC San Diego
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@ -36,8 +36,7 @@ def main(args: argparse.Namespace):
              enable_chunked_prefill=args.enable_chunked_prefill,
              download_dir=args.download_dir,
              block_size=args.block_size,
-              gpu_memory_utilization=args.gpu_memory_utilization,
-              distributed_executor_backend=args.distributed_executor_backend)
+              gpu_memory_utilization=args.gpu_memory_utilization)

    sampling_params = SamplingParams(
        n=args.n,
@ -222,12 +221,5 @@ if __name__ == '__main__':
                        help='the fraction of GPU memory to be used for '
                        'the model executor, which can range from 0 to 1.'
                        'If unspecified, will use the default value of 0.9.')
-    parser.add_argument(
-        '--distributed-executor-backend',
-        choices=['ray', 'mp'],
-        default=None,
-        help='Backend to use for distributed serving. When more than 1 GPU '
-        'is used, will be automatically set to "ray" if installed '
-        'or "mp" (multiprocessing) otherwise.')
    args = parser.parse_args()
    main(args)
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@ -56,9 +56,6 @@ class BenchmarkMetrics:
    mean_tpot_ms: float
    median_tpot_ms: float
    p99_tpot_ms: float
-    mean_itl_ms: float
-    median_itl_ms: float
-    p99_itl_ms: float


 def sample_sharegpt_requests(
@ -203,24 +200,16 @@ def calculate_metrics(
    actual_output_lens = []
    total_input = 0
    completed = 0
-    itls = []
    tpots = []
    ttfts = []
    for i in range(len(outputs)):
        if outputs[i].success:
-            # We use the tokenizer to count the number of output tokens for all
-            # serving backends instead of looking at len(outputs[i].itl) since
-            # multiple output tokens may be bundled together
-            # Note: this may inflate the output token count slightly
-            output_len = len(
-                tokenizer(outputs[i].generated_text,
-                          add_special_tokens=False).input_ids)
+            output_len = len(tokenizer(outputs[i].generated_text).input_ids)
            actual_output_lens.append(output_len)
            total_input += input_requests[i][1]
            if output_len > 1:
                tpots.append(
                    (outputs[i].latency - outputs[i].ttft) / (output_len - 1))
-            itls += outputs[i].itl
            ttfts.append(outputs[i].ttft)
            completed += 1
        else:
@ -245,9 +234,6 @@ def calculate_metrics(
        mean_tpot_ms=np.mean(tpots or 0) * 1000,
        median_tpot_ms=np.median(tpots or 0) * 1000,
        p99_tpot_ms=np.percentile(tpots or 0, 99) * 1000,
-        mean_itl_ms=np.mean(itls or 0) * 1000,
-        median_itl_ms=np.median(itls or 0) * 1000,
-        p99_itl_ms=np.percentile(itls or 0, 99) * 1000,
    )

    return metrics, actual_output_lens
@ -347,10 +333,6 @@ async def benchmark(
    print("{:<40} {:<10.2f}".format("Median TPOT (ms):",
                                    metrics.median_tpot_ms))
    print("{:<40} {:<10.2f}".format("P99 TPOT (ms):", metrics.p99_tpot_ms))
-    print("{s:{c}^{n}}".format(s='Inter-token Latency', n=50, c='-'))
-    print("{:<40} {:<10.2f}".format("Mean ITL (ms):", metrics.mean_itl_ms))
-    print("{:<40} {:<10.2f}".format("Median ITL (ms):", metrics.median_itl_ms))
-    print("{:<40} {:<10.2f}".format("P99 ITL (ms):", metrics.p99_itl_ms))
    print("=" * 50)

    result = {
@ -367,9 +349,6 @@ async def benchmark(
        "mean_tpot_ms": metrics.mean_tpot_ms,
        "median_tpot_ms": metrics.median_tpot_ms,
        "p99_tpot_ms": metrics.p99_tpot_ms,
-        "mean_itl_ms": metrics.mean_itl_ms,
-        "median_itl_ms": metrics.median_itl_ms,
-        "p99_itl_ms": metrics.p99_itl_ms,
        "input_lens": [output.prompt_len for output in outputs],
        "output_lens": actual_output_lens,
        "ttfts": [output.ttft for output in outputs],
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@ -78,7 +78,6 @@ def run_vllm(
    enable_prefix_caching: bool,
    enable_chunked_prefill: bool,
    max_num_batched_tokens: int,
-    distributed_executor_backend: Optional[str],
    gpu_memory_utilization: float = 0.9,
    download_dir: Optional[str] = None,
 ) -> float:
@ -101,7 +100,6 @@ def run_vllm(
        download_dir=download_dir,
        enable_chunked_prefill=enable_chunked_prefill,
        max_num_batched_tokens=max_num_batched_tokens,
-        distributed_executor_backend=distributed_executor_backend,
    )

    # Add the requests to the engine.
@ -227,8 +225,8 @@ def main(args: argparse.Namespace):
            args.enforce_eager, args.kv_cache_dtype,
            args.quantization_param_path, args.device,
            args.enable_prefix_caching, args.enable_chunked_prefill,
-            args.max_num_batched_tokens, args.distributed_executor_backend,
-            args.gpu_memory_utilization, args.download_dir)
+            args.max_num_batched_tokens, args.gpu_memory_utilization,
+            args.download_dir)
    elif args.backend == "hf":
        assert args.tensor_parallel_size == 1
        elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
@ -370,13 +368,6 @@ if __name__ == "__main__":
        type=str,
        default=None,
        help='Path to save the throughput results in JSON format.')
-    parser.add_argument(
-        '--distributed-executor-backend',
-        choices=['ray', 'mp'],
-        default=None,
-        help='Backend to use for distributed serving. When more than 1 GPU '
-        'is used, will be automatically set to "ray" if installed '
-        'or "mp" (multiprocessing) otherwise.')
    args = parser.parse_args()
    if args.tokenizer is None:
        args.tokenizer = args.model
--- a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
+++ b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
@ -1,352 +0,0 @@
-import argparse
-import copy
-import itertools
-import pickle as pkl
-import time
-from typing import Callable, Iterable, List, Tuple
-
-import torch
-import torch.utils.benchmark as TBenchmark
-from torch.utils.benchmark import Measurement as TMeasurement
-from weight_shapes import WEIGHT_SHAPES
-
-from vllm import _custom_ops as ops
-
-DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())[1:]
-DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
-DEFAULT_TP_SIZES = [1]
-
-# helpers
-
-
-def to_fp8(tensor: torch.tensor) -> torch.tensor:
-    finfo = torch.finfo(torch.float8_e4m3fn)
-    return torch.round(tensor.clamp(
-        min=finfo.min, max=finfo.max)).to(dtype=torch.float8_e4m3fn)
-
-
-def to_int8(tensor: torch.tensor) -> torch.tensor:
-    return torch.round(tensor.clamp(min=-128, max=127)).to(dtype=torch.int8)
-
-
-def make_rand_tensors(dtype: torch.dtype, m: int, n: int,
-                      k: int) -> Tuple[torch.tensor, torch.tensor]:
-
-    a = torch.randn((m, k), device='cuda') * 5
-    b = torch.randn((n, k), device='cuda').t() * 5
-
-    if dtype == torch.int8:
-        return to_int8(a), to_int8(b)
-    if dtype == torch.float8_e4m3fn:
-        return to_fp8(a), to_fp8(b)
-
-    raise ValueError("unsupported dtype")
-
-
-# impl
-
-
-def pytorch_i8_impl(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor,
-                    scale_b: torch.tensor,
-                    out_dtype: torch.dtype) -> torch.tensor:
-    return torch.mm(a, b)
-
-
-def pytorch_fp8_impl(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor,
-                     scale_b: torch.tensor,
-                     out_dtype: torch.dtype) -> torch.tensor:
-    return torch._scaled_mm(a,
-                            b,
-                            scale_a=scale_a,
-                            scale_b=scale_b,
-                            out_dtype=out_dtype)
-
-
-def pytorch_fp8_impl_fast_accum(a: torch.tensor, b: torch.tensor,
-                                scale_a: torch.tensor, scale_b: torch.tensor,
-                                out_dtype: torch.dtype) -> torch.tensor:
-    return torch._scaled_mm(a,
-                            b,
-                            scale_a=scale_a,
-                            scale_b=scale_b,
-                            out_dtype=out_dtype,
-                            use_fast_accum=True)
-
-
-def cutlass_impl(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor,
-                 scale_b: torch.tensor,
-                 out_dtype: torch.dtype) -> torch.tensor:
-    return ops.cutlass_scaled_mm_dq(a,
-                                    b,
-                                    scale_a,
-                                    scale_b,
-                                    out_dtype=out_dtype)
-
-
-# bench
-def bench_fn(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor,
-             scale_b: torch.tensor, out_dtype: torch.dtype, label: str,
-             sub_label: str, fn: Callable, description: str) -> TMeasurement:
-
-    min_run_time = 1
-
-    globals = {
-        "a": a,
-        "b": b,
-        "scale_a": scale_a,
-        "scale_b": scale_b,
-        "out_dtype": out_dtype,
-        "fn": fn,
-    }
-    return TBenchmark.Timer(
-        stmt="fn(a, b, scale_a, scale_b, out_dtype)",
-        globals=globals,
-        label=label,
-        sub_label=sub_label,
-        description=description,
-    ).blocked_autorange(min_run_time=min_run_time)
-
-
-def bench_int8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
-               sub_label: str) -> Iterable[TMeasurement]:
-    assert dtype == torch.int8
-    a, b = make_rand_tensors(torch.int8, m, n, k)
-    scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
-    scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
-
-    timers = []
-    # pytorch impl
-    timers.append(
-        bench_fn(a.to(dtype=torch.bfloat16, device="cuda"),
-                 b.to(dtype=torch.bfloat16, device="cuda"), scale_a, scale_b,
-                 torch.bfloat16, label, sub_label, pytorch_i8_impl,
-                 "pytorch_bf16_bf16_bf16_matmul-no-scales"))
-
-    # cutlass impl
-    timers.append(
-        bench_fn(a, b, scale_a.to(device="cpu"), scale_b.to(device="cpu"),
-                 torch.bfloat16, label, sub_label, cutlass_impl,
-                 "cutlass_i8_i8_bf16_scaled_mm"))
-
-    return timers
-
-
-def bench_fp8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
-              sub_label: str) -> Iterable[TMeasurement]:
-    assert dtype == torch.float8_e4m3fn
-    a, b = make_rand_tensors(torch.float8_e4m3fn, m, n, k)
-    scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
-    scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
-
-    timers = []
-
-    # pytorch impl: bf16 output, without fp8 fast accum
-    timers.append(
-        bench_fn(a, b, scale_a, scale_b, torch.bfloat16, label, sub_label,
-                 pytorch_fp8_impl, "pytorch_fp8_fp8_bf16_scaled_mm"))
-
-    # pytorch impl: bf16 output, with fp8 fast accum
-    timers.append(
-        bench_fn(a, b, scale_a, scale_b, torch.bfloat16, label, sub_label,
-                 pytorch_fp8_impl_fast_accum,
-                 "pytorch_fp8_fp8_bf16_scaled_mm_fast_accum"))
-
-    # pytorch impl: fp16 output, without fp8 fast accum
-    timers.append(
-        bench_fn(a, b, scale_a, scale_b, torch.float16, label, sub_label,
-                 pytorch_fp8_impl, "pytorch_fp8_fp8_fp16_scaled_mm"))
-
-    # pytorch impl: fp16 output, with fp8 fast accum
-    timers.append(
-        bench_fn(a, b, scale_a, scale_b, torch.float16, label, sub_label,
-                 pytorch_fp8_impl_fast_accum,
-                 "pytorch_fp8_fp8_fp16_scaled_mm_fast_accum"))
-
-    # cutlass impl: bf16 output
-    timers.append(
-        bench_fn(a, b, scale_a.to(device="cpu"), scale_b.to(device="cpu"),
-                 torch.bfloat16, label, sub_label, cutlass_impl,
-                 "cutlass_fp8_fp8_bf16_scaled_mm"))
-    # cutlass impl: fp16 output
-    timers.append(
-        bench_fn(a, b, scale_a.to(device="cpu"), scale_b.to(device="cpu"),
-                 torch.float16, label, sub_label, cutlass_impl,
-                 "cutlass_fp8_fp8_fp16_scaled_mm"))
-    return timers
-
-
-def bench(dtype: torch.dtype, m: int, k: int, n: int, label: str,
-          sub_label: str) -> Iterable[TMeasurement]:
-    if dtype == torch.int8:
-        return bench_int8(dtype, m, k, n, label, sub_label)
-    if dtype == torch.float8_e4m3fn:
-        return bench_fp8(dtype, m, k, n, label, sub_label)
-    raise ValueError("unsupported type")
-
-
-# runner
-def print_timers(timers: Iterable[TMeasurement]):
-    compare = TBenchmark.Compare(timers)
-    compare.print()
-
-
-def run(dtype: torch.dtype,
-        MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]:
-
-    results = []
-    for m, k, n in MKNs:
-        timers = bench(dtype, m, k, n, f"scaled-{dtype}-gemm",
-                       f"MKN=({m}x{k}x{n})")
-        print_timers(timers)
-        results.extend(timers)
-
-    return results
-
-
-# output makers
-def make_output(data: Iterable[TMeasurement],
-                MKNs: Iterable[Tuple[int, int, int]],
-                base_description: str,
-                timestamp=None):
-
-    print(f"== All Results {base_description} ====")
-    print_timers(data)
-
-    # pickle all the results
-    timestamp = int(time.time()) if timestamp is None else timestamp
-    with open(f"{base_description}-{timestamp}.pkl", "wb") as f:
-        pkl.dump(data, f)
-
-
-# argparse runners
-
-
-def run_square_bench(args):
-    dim_sizes = list(
-        range(args.dim_start, args.dim_end + 1, args.dim_increment))
-    MKNs = list(zip(dim_sizes, dim_sizes, dim_sizes))
-    data = run(args.dtype, MKNs)
-
-    make_output(data, MKNs, f"square_bench-{args.dtype}")
-
-
-def run_range_bench(args):
-    dim_sizes = list(range(args.dim_start, args.dim_end, args.dim_increment))
-    n = len(dim_sizes)
-    Ms = [args.m_constant] * n if args.m_constant is not None else dim_sizes
-    Ks = [args.k_constant] * n if args.k_constant is not None else dim_sizes
-    Ns = [args.n_constant] * n if args.n_constant is not None else dim_sizes
-    MKNs = list(zip(Ms, Ks, Ns))
-    data = run(args.dtype, MKNs)
-
-    make_output(data, MKNs, f"range_bench-{args.dtype}")
-
-
-def run_model_bench(args):
-
-    print("Benchmarking models:")
-    for i, model in enumerate(args.models):
-        print(f"[{i}]  {model}")
-
-    def model_shapes(model_name: str, tp_size: int) -> List[Tuple[int, int]]:
-        KNs = []
-        for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model_name]):
-            KN[tp_split_dim] = KN[tp_split_dim] // tp_size
-            KNs.append(KN)
-        return KNs
-
-    model_bench_data = []
-    models_tps = list(itertools.product(args.models, args.tp_sizes))
-    for model, tp_size in models_tps:
-        Ms = args.batch_sizes
-        KNs = model_shapes(model, tp_size)
-        MKNs = []
-        for m in Ms:
-            for k, n in KNs:
-                MKNs.append((m, k, n))
-
-        data = run(args.dtype, MKNs)
-        model_bench_data.append(data)
-
-    # Print all results
-    for data, model_tp in zip(model_bench_data, models_tps):
-        model, tp_size = model_tp
-        print(f"== Results {args.dtype} {model}-TP{tp_size} ====")
-        print_timers(data)
-
-    timestamp = int(time.time())
-
-    all_data = []
-    for d in model_bench_data:
-        all_data.extend(d)
-    # pickle all data
-    with open(f"model_bench-{args.dtype}-{timestamp}.pkl", "wb") as f:
-        pkl.dump(all_data, f)
-
-
-if __name__ == '__main__':
-
-    def to_torch_dtype(dt):
-        if dt == "int8":
-            return torch.int8
-        if dt == "fp8":
-            return torch.float8_e4m3fn
-        raise ValueError("unsupported dtype")
-
-    parser = argparse.ArgumentParser(
-        description="""
-Benchmark Cutlass GEMM.
-
-    To run square GEMMs:
-        python3 ./benchmarks/cutlass_benchmarks/w8a8_benchmarks.py --dtype fp8 square_bench --dim-start 128 --dim-end 512 --dim-increment 64
-    
-    To run constant N and K and sweep M:
-        python3 ./benchmarks/cutlass_benchmarks/w8a8_benchmarks.py --dtype fp8 range_bench --dim-start 128 --dim-end 512 --dim-increment 64 --n-constant 16384 --k-constant 16384
-    
-    To run dimensions from a model:
-        python3 ./benchmarks/cutlass_benchmarks/w8a8_benchmarks.py --dtype fp8 model_bench --models meta-llama/Llama-2-7b-hf --batch-sizes 16 --tp-sizes 1
-    
-    Output:
-        - a .pkl file, that is a list of raw torch.benchmark.utils.Measurements for the pytorch and cutlass implementations for the various GEMMs.
-            """,  # noqa: E501
-        formatter_class=argparse.RawTextHelpFormatter)
-
-    parser.add_argument("--dtype",
-                        type=to_torch_dtype,
-                        required=True,
-                        help="Available options are ['int8', 'fp8']")
-    subparsers = parser.add_subparsers(dest="cmd")
-
-    square_parser = subparsers.add_parser("square_bench")
-    square_parser.add_argument("--dim-start", type=int, required=True)
-    square_parser.add_argument("--dim-end", type=int, required=True)
-    square_parser.add_argument("--dim-increment", type=int, required=True)
-    square_parser.set_defaults(func=run_square_bench)
-
-    range_parser = subparsers.add_parser("range_bench")
-    range_parser.add_argument("--dim-start", type=int, required=True)
-    range_parser.add_argument("--dim-end", type=int, required=True)
-    range_parser.add_argument("--dim-increment", type=int, required=True)
-    range_parser.add_argument("--m-constant", type=int, default=None)
-    range_parser.add_argument("--n-constant", type=int, default=None)
-    range_parser.add_argument("--k-constant", type=int, default=None)
-    range_parser.set_defaults(func=run_range_bench)
-
-    model_parser = subparsers.add_parser("model_bench")
-    model_parser.add_argument("--models",
-                              nargs="+",
-                              type=str,
-                              default=DEFAULT_MODELS,
-                              choices=WEIGHT_SHAPES.keys())
-    model_parser.add_argument("--tp-sizes",
-                              nargs="+",
-                              type=int,
-                              default=DEFAULT_TP_SIZES)
-    model_parser.add_argument("--batch-sizes",
-                              nargs="+",
-                              type=int,
-                              default=DEFAULT_BATCH_SIZES)
-    model_parser.set_defaults(func=run_model_bench)
-
-    args = parser.parse_args()
-    args.func(args)
--- a/benchmarks/cutlass_benchmarks/weight_shapes.py
+++ b/benchmarks/cutlass_benchmarks/weight_shapes.py
@ -1,37 +0,0 @@
-# Weight Shapes are in the format
-# ([K, N], TP_SPLIT_DIM)
-# Example:
-#  A shape of ([14336, 4096], 0) indicates the following GEMM shape,
-#   - TP1 : K = 14336, N = 4096
-#   - TP2 : K = 7168, N = 4096
-#  A shape of ([4096, 6144], 1) indicates the following GEMM shape,
-#   - TP1 : K = 4096, N = 6144
-#   - TP4 : K = 4096, N = 1536
-
-# TP1 shapes
-WEIGHT_SHAPES = {
-    "mistralai/Mistral-7B-v0.1": [
-        ([4096, 6144], 1),
-        ([4096, 4096], 0),
-        ([4096, 28672], 1),
-        ([14336, 4096], 0),
-    ],
-    "meta-llama/Llama-2-7b-hf": [
-        ([4096, 12288], 1),
-        ([4096, 4096], 0),
-        ([4096, 22016], 1),
-        ([11008, 4096], 0),
-    ],
-    "meta-llama/Llama-2-13b-hf": [
-        ([5120, 15360], 1),
-        ([5120, 5120], 0),
-        ([5120, 27648], 1),
-        ([13824, 5120], 0),
-    ],
-    "meta-llama/Llama-2-70b-hf": [
-        ([8192, 10240], 1),
-        ([8192, 8192], 0),
-        ([8192, 57344], 1),
-        ([28672, 8192], 0),
-    ],
-}
--- a/benchmarks/kernels/benchmark_mixtral_moe.py
+++ b/benchmarks/kernels/benchmark_mixtral_moe.py
@ -0,0 +1,239 @@
+import argparse
+import json
+import os
+import sys
+
+import torch
+import torch.nn.functional as F
+import triton
+from tqdm import tqdm
+
+from vllm.model_executor.layers.fused_moe import (fused_moe,
+                                                  get_config_file_name)
+
+
+def main(model, tp_size, gpu, dtype: str):
+    os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu)
+    method = fused_moe
+    for bs in [
+            1, 2, 4, 8, 16, 24, 32, 48, 64, 96, 128, 256, 512, 1024, 1536,
+            2048, 3072, 4096
+    ]:
+        run_grid(bs,
+                 model=model,
+                 method=method,
+                 gpu=gpu,
+                 tp_size=tp_size,
+                 dtype=dtype)
+
+
+def run_grid(bs, model, method, gpu, tp_size, dtype: str):
+    if model == '8x7B':
+        d_model = 4096
+        model_intermediate_size = 14336
+        num_layers = 32
+    elif model == '8x22B':
+        d_model = 6144
+        model_intermediate_size = 16384
+        num_layers = 56
+    else:
+        raise ValueError(f'Unsupported Mixtral model {model}')
+    num_total_experts = 8
+    top_k = 2
+    # tp_size = 2
+    num_calls = 100
+
+    num_warmup_trials = 1
+    num_trials = 1
+
+    configs = []
+
+    for block_size_n in [32, 64, 128, 256]:
+        for block_size_m in [16, 32, 64, 128, 256]:
+            for block_size_k in [64, 128, 256]:
+                for group_size_m in [1, 16, 32, 64]:
+                    for num_warps in [4, 8]:
+                        for num_stages in [2, 3, 4, 5]:
+                            configs.append({
+                                "BLOCK_SIZE_M": block_size_m,
+                                "BLOCK_SIZE_N": block_size_n,
+                                "BLOCK_SIZE_K": block_size_k,
+                                "GROUP_SIZE_M": group_size_m,
+                                "num_warps": num_warps,
+                                "num_stages": num_stages,
+                            })
+
+    best_config = None
+    best_time_us = 1e20
+
+    print(f'{tp_size=} {bs=}')
+
+    for config in tqdm(configs):
+        # warmup
+        try:
+            for _ in range(num_warmup_trials):
+                run_timing(
+                    num_calls=num_calls,
+                    bs=bs,
+                    d_model=d_model,
+                    num_total_experts=num_total_experts,
+                    top_k=top_k,
+                    tp_size=tp_size,
+                    model_intermediate_size=model_intermediate_size,
+                    method=method,
+                    config=config,
+                    dtype=dtype,
+                )
+        except triton.runtime.autotuner.OutOfResources:
+            continue
+
+        # trial
+        for _ in range(num_trials):
+            kernel_dur_ms = run_timing(
+                num_calls=num_calls,
+                bs=bs,
+                d_model=d_model,
+                num_total_experts=num_total_experts,
+                top_k=top_k,
+                tp_size=tp_size,
+                model_intermediate_size=model_intermediate_size,
+                method=method,
+                config=config,
+                dtype=dtype,
+            )
+
+            kernel_dur_us = 1000 * kernel_dur_ms
+            model_dur_ms = kernel_dur_ms * num_layers
+
+            if kernel_dur_us < best_time_us:
+                best_config = config
+                best_time_us = kernel_dur_us
+
+                tqdm.write(
+                    f'{kernel_dur_us=:.1f} {model_dur_ms=:.1f}'
+                    f' {bs=} {tp_size=} {top_k=} {num_total_experts=} '
+                    f'{d_model=} {model_intermediate_size=} {num_layers=}')
+
+    print("best_time_us", best_time_us)
+    print("best_config", best_config)
+
+    # holds Dict[str, Dict[str, int]]
+    filename = get_config_file_name(num_total_experts,
+                                    model_intermediate_size // tp_size,
+                                    "float8" if dtype == "float8" else None)
+    print(f"writing config to file {filename}")
+    existing_content = {}
+    if os.path.exists(filename):
+        with open(filename, "r") as f:
+            existing_content = json.load(f)
+    existing_content[str(bs)] = best_config
+    with open(filename, "w") as f:
+        json.dump(existing_content, f, indent=4)
+        f.write("\n")
+
+
+def run_timing(num_calls: int, bs: int, d_model: int, num_total_experts: int,
+               top_k: int, tp_size: int, model_intermediate_size: int, method,
+               config, dtype: str) -> float:
+    shard_intermediate_size = model_intermediate_size // tp_size
+
+    hidden_states = torch.rand(
+        (bs, d_model),
+        device="cuda:0",
+        dtype=torch.float16,
+    )
+
+    w1 = torch.rand(
+        (num_total_experts, 2 * shard_intermediate_size, d_model),
+        device=hidden_states.device,
+        dtype=hidden_states.dtype,
+    )
+
+    w2 = torch.rand(
+        (num_total_experts, d_model, shard_intermediate_size),
+        device=hidden_states.device,
+        dtype=hidden_states.dtype,
+    )
+
+    w1_scale = None
+    w2_scale = None
+    a1_scale = None
+    a2_scale = None
+
+    if dtype == "float8":
+        w1 = w1.to(torch.float8_e4m3fn)
+        w2 = w2.to(torch.float8_e4m3fn)
+        w1_scale = torch.ones(num_total_experts,
+                              device=hidden_states.device,
+                              dtype=torch.float32)
+        w2_scale = torch.ones(num_total_experts,
+                              device=hidden_states.device,
+                              dtype=torch.float32)
+        a1_scale = torch.ones(1,
+                              device=hidden_states.device,
+                              dtype=torch.float32)
+        a2_scale = torch.ones(1,
+                              device=hidden_states.device,
+                              dtype=torch.float32)
+
+    gating_output = F.softmax(torch.rand(
+        (num_calls, bs, num_total_experts),
+        device=hidden_states.device,
+        dtype=torch.float32,
+    ),
+                              dim=-1)
+
+    start_event = torch.cuda.Event(enable_timing=True)
+    end_event = torch.cuda.Event(enable_timing=True)
+
+    start_event.record()
+    for i in range(num_calls):
+        hidden_states = method(
+            hidden_states=hidden_states,
+            w1=w1,
+            w2=w2,
+            w1_scale=w1_scale,
+            w2_scale=w2_scale,
+            a1_scale=a1_scale,
+            a2_scale=a2_scale,
+            gating_output=gating_output[i],
+            topk=2,
+            renormalize=True,
+            inplace=True,
+            override_config=config,
+            use_fp8=dtype == "float8",
+        )
+    end_event.record()
+    end_event.synchronize()
+
+    dur_ms = start_event.elapsed_time(end_event) / num_calls
+    return dur_ms
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        prog='benchmark_mixtral_moe',
+        description='Benchmark and tune the fused_moe kernel',
+    )
+    parser.add_argument(
+        '--dtype',
+        type=str,
+        default='auto',
+        choices=['float8', 'float16'],
+        help='Data type used for fused_moe kernel computations',
+    )
+    parser.add_argument('--model',
+                        type=str,
+                        default='8x7B',
+                        choices=['8x7B', '8x22B'],
+                        help='The Mixtral model to benchmark')
+    parser.add_argument('--tp-size',
+                        type=int,
+                        default=2,
+                        help='Tensor paralleli size')
+    parser.add_argument('--gpu',
+                        type=int,
+                        default=0,
+                        help="GPU ID for benchmarking")
+    args = parser.parse_args()
+    sys.exit(main(args.model, args.tp_size, args.gpu, args.dtype))
--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@ -1,322 +0,0 @@
-import argparse
-import time
-from datetime import datetime
-from typing import Any, Dict, List, Tuple
-
-import ray
-import torch
-import triton
-from ray.experimental.tqdm_ray import tqdm
-from transformers import AutoConfig
-
-from vllm.model_executor.layers.fused_moe.fused_moe import *
-
-
-def benchmark_config(
-    config: Dict[str, int],
-    num_tokens: int,
-    num_experts: int,
-    shard_intermediate_size: int,
-    hidden_size: int,
-    topk: int,
-    dtype: torch.dtype,
-    use_fp8: bool,
-    num_iters: int = 100,
-) -> float:
-    init_dtype = torch.float16 if use_fp8 else dtype
-    x = torch.randn(num_tokens, hidden_size, dtype=dtype)
-    w1 = torch.randn(num_experts,
-                     shard_intermediate_size,
-                     hidden_size,
-                     dtype=init_dtype)
-    w2 = torch.randn(num_experts,
-                     hidden_size,
-                     shard_intermediate_size // 2,
-                     dtype=init_dtype)
-    gating_output = torch.randn(num_iters,
-                                num_tokens,
-                                num_experts,
-                                dtype=torch.float32)
-
-    w1_scale = None
-    w2_scale = None
-    a1_scale = None
-    a2_scale = None
-    if use_fp8:
-        w1_scale = torch.randn(num_experts, dtype=torch.float32)
-        w2_scale = torch.randn(num_experts, dtype=torch.float32)
-        a1_scale = torch.randn(1, dtype=torch.float32)
-        a2_scale = torch.randn(1, dtype=torch.float32)
-
-        w1 = w1.to(torch.float8_e4m3fn)
-        w2 = w2.to(torch.float8_e4m3fn)
-
-    input_gating = torch.empty(num_tokens, num_experts, dtype=torch.float32)
-
-    def prepare(i: int):
-        input_gating.copy_(gating_output[i])
-
-    def run():
-        fused_moe(
-            x,
-            w1,
-            w2,
-            input_gating,
-            topk,
-            renormalize=True,
-            inplace=True,
-            override_config=config,
-            use_fp8=use_fp8,
-            w1_scale=w1_scale,
-            w2_scale=w2_scale,
-            a1_scale=a1_scale,
-            a2_scale=a2_scale,
-        )
-
-    # JIT compilation & warmup
-    run()
-    torch.cuda.synchronize()
-
-    # Capture 10 invocations with CUDA graph
-    graph = torch.cuda.CUDAGraph()
-    with torch.cuda.graph(graph):
-        for _ in range(10):
-            run()
-    torch.cuda.synchronize()
-
-    # Warmup
-    for _ in range(5):
-        graph.replay()
-    torch.cuda.synchronize()
-
-    start_event = torch.cuda.Event(enable_timing=True)
-    end_event = torch.cuda.Event(enable_timing=True)
-
-    latencies = []
-    for i in range(num_iters):
-        prepare(i)
-        torch.cuda.synchronize()
-
-        start_event.record()
-        graph.replay()
-        end_event.record()
-        end_event.synchronize()
-        latencies.append(start_event.elapsed_time(end_event))
-    avg = sum(latencies) / (num_iters * 10) * 1000  # us
-    graph.reset()
-    return avg
-
-
-def get_configs_compute_bound() -> List[Dict[str, int]]:
-    # Reduced search space for faster tuning.
-    # TODO(woosuk): Increase the search space and use a performance model to
-    # prune the search space.
-    configs = []
-    for num_stages in [2, 3, 4, 5]:
-        for block_m in [16, 32, 64, 128, 256]:
-            for block_k in [64, 128, 256]:
-                for block_n in [32, 64, 128, 256]:
-                    for num_warps in [4, 8]:
-                        for group_size in [1, 16, 32, 64]:
-                            configs.append({
-                                "BLOCK_SIZE_M": block_m,
-                                "BLOCK_SIZE_N": block_n,
-                                "BLOCK_SIZE_K": block_k,
-                                "GROUP_SIZE_M": group_size,
-                                "num_warps": num_warps,
-                                "num_stages": num_stages,
-                            })
-    return configs
-
-
-@ray.remote(num_gpus=1)
-class BenchmarkWorker:
-
-    def __init__(self, seed: int) -> None:
-        torch.set_default_device("cuda")
-        torch.cuda.manual_seed_all(seed)
-        self.seed = seed
-
-    def benchmark(
-        self,
-        num_tokens: int,
-        num_experts: int,
-        shard_intermediate_size: int,
-        hidden_size: int,
-        topk: int,
-        dtype: torch.dtype,
-        use_fp8: bool,
-    ) -> Tuple[Dict[str, int], float]:
-        torch.cuda.manual_seed_all(self.seed)
-
-        dtype_str = "float8" if use_fp8 else None
-        # NOTE(woosuk): The current naming convention uses w2.shape[2], which
-        # is the intermediate size after silu_and_mul.
-        op_config = get_moe_configs(num_experts, shard_intermediate_size // 2,
-                                    dtype_str)
-        if op_config is None:
-            config = get_default_config(num_tokens, num_experts,
-                                        shard_intermediate_size, hidden_size,
-                                        topk, dtype_str)
-        else:
-            config = op_config[min(op_config.keys(),
-                                   key=lambda x: abs(x - num_tokens))]
-        kernel_time = benchmark_config(config, num_tokens, num_experts,
-                                       shard_intermediate_size, hidden_size,
-                                       topk, dtype, use_fp8)
-        return config, kernel_time
-
-    def tune(
-        self,
-        num_tokens: int,
-        num_experts: int,
-        shard_intermediate_size: int,
-        hidden_size: int,
-        topk: int,
-        dtype: torch.dtype,
-        use_fp8: bool,
-        search_space: List[Dict[str, int]],
-    ) -> Dict[str, int]:
-        best_config = None
-        best_time = float("inf")
-        for config in tqdm(search_space):
-            try:
-                kernel_time = benchmark_config(config,
-                                               num_tokens,
-                                               num_experts,
-                                               shard_intermediate_size,
-                                               hidden_size,
-                                               topk,
-                                               dtype,
-                                               use_fp8,
-                                               num_iters=10)
-            except triton.runtime.autotuner.OutOfResources:
-                # Some configurations may be invalid and fail to compile.
-                continue
-
-            if kernel_time < best_time:
-                best_time = kernel_time
-                best_config = config
-        now = datetime.now()
-        print(f"{now.ctime()}] Completed tuning for batch_size={num_tokens}")
-        return best_config
-
-
-def sort_config(config: Dict[str, int]) -> Dict[str, int]:
-    return {
-        "BLOCK_SIZE_M": config["BLOCK_SIZE_M"],
-        "BLOCK_SIZE_N": config["BLOCK_SIZE_N"],
-        "BLOCK_SIZE_K": config["BLOCK_SIZE_K"],
-        "GROUP_SIZE_M": config["GROUP_SIZE_M"],
-        "num_warps": config["num_warps"],
-        "num_stages": config["num_stages"],
-    }
-
-
-def save_configs(
-    configs: Dict[int, Dict[str, int]],
-    num_experts: int,
-    shard_intermediate_size: int,
-    hidden_size: int,
-    topk: int,
-    dtype: torch.dtype,
-    use_fp8: bool,
-) -> None:
-    dtype_str = "float8" if use_fp8 else None
-    # NOTE(woosuk): The current naming convention uses w2.shape[2], which
-    # is the intermediate size after silu_and_mul.
-    filename = get_config_file_name(num_experts, shard_intermediate_size // 2,
-                                    dtype_str)
-    print(f"Writing best config to {filename}...")
-    with open(filename, "w") as f:
-        json.dump(configs, f, indent=4)
-        f.write("\n")
-
-
-def main(args: argparse.Namespace):
-    print(args)
-
-    config = AutoConfig.from_pretrained(args.model)
-    if config.architectures[0] == "DbrxForCausalLM":
-        E = config.ffn_config.moe_num_experts
-        topk = config.ffn_config.moe_top_k
-        intermediate_size = config.ffn_config.ffn_hidden_size
-        shard_intermediate_size = 2 * intermediate_size // args.tp_size
-    else:
-        # Default: Mixtral.
-        E = config.num_local_experts
-        topk = config.num_experts_per_tok
-        intermediate_size = config.intermediate_size
-        shard_intermediate_size = 2 * intermediate_size // args.tp_size
-
-    hidden_size = config.hidden_size
-    dtype = config.torch_dtype
-    use_fp8 = args.dtype == "fp8"
-
-    if args.batch_size is None:
-        batch_sizes = [
-            1, 2, 4, 8, 16, 24, 32, 48, 64, 96, 128, 256, 512, 1024, 1536,
-            2048, 3072, 4096
-        ]
-    else:
-        batch_sizes = [args.batch_size]
-
-    ray.init()
-    num_gpus = int(ray.available_resources()["GPU"])
-    workers = [BenchmarkWorker.remote(args.seed) for _ in range(num_gpus)]
-
-    def _distribute(method: str, inputs: List[Any]) -> List[Any]:
-        outputs = []
-        worker_idx = 0
-        for input_args in inputs:
-            worker = workers[worker_idx]
-            worker_method = getattr(worker, method)
-            output = worker_method.remote(*input_args)
-            outputs.append(output)
-            worker_idx = (worker_idx + 1) % num_gpus
-        return ray.get(outputs)
-
-    if args.tune:
-        search_space = get_configs_compute_bound()
-        print(f"Start tuning over {len(search_space)} configurations...")
-
-        start = time.time()
-        configs = _distribute(
-            "tune", [(batch_size, E, shard_intermediate_size, hidden_size,
-                      topk, dtype, use_fp8, search_space)
-                     for batch_size in batch_sizes])
-        best_configs = {
-            M: sort_config(config)
-            for M, config in zip(batch_sizes, configs)
-        }
-        save_configs(best_configs, E, shard_intermediate_size, hidden_size,
-                     topk, dtype, use_fp8)
-        end = time.time()
-        print(f"Tuning took {end - start:.2f} seconds")
-    else:
-        outputs = _distribute("benchmark",
-                              [(batch_size, E, shard_intermediate_size,
-                                hidden_size, topk, dtype, use_fp8)
-                               for batch_size in batch_sizes])
-
-        for batch_size, (config, kernel_time) in zip(batch_sizes, outputs):
-            print(f"Batch size: {batch_size}, config: {config}")
-            print(f"Kernel time: {kernel_time:.2f} us")
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--model",
-                        type=str,
-                        default="mistralai/Mixtral-8x7B-Instruct-v0.1")
-    parser.add_argument("--tp-size", "-tp", type=int, default=2)
-    parser.add_argument("--dtype",
-                        type=str,
-                        choices=["auto", "fp8"],
-                        default="auto")
-    parser.add_argument("--seed", type=int, default=0)
-    parser.add_argument("--batch-size", type=int, required=False)
-    parser.add_argument("--tune", action="store_true")
-    args = parser.parse_args()
-
-    main(args)
--- a/cmake/cpu_extension.cmake
+++ b/cmake/cpu_extension.cmake
@ -12,7 +12,7 @@ include_directories("${CMAKE_SOURCE_DIR}/csrc")
 #
 # Check the compile flags
 #
-list(APPEND CXX_COMPILE_FLAGS
+list(APPEND CXX_COMPILE_FLAGS 
    "-fopenmp"
    "-DVLLM_CPU_EXTENSION")

@ -44,8 +44,8 @@ if (AVX512_FOUND)

    find_isa(${CPUINFO} "avx512_bf16" AVX512BF16_FOUND)
    if (AVX512BF16_FOUND OR ENABLE_AVX512BF16)
-        if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND
-            CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12.3)
+        if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND 
+            CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12.3) 
            list(APPEND CXX_COMPILE_FLAGS "-mavx512bf16")
        else()
            message(WARNING "Disable AVX512-BF16 ISA support, requires gcc/g++ >= 12.3")
@ -73,7 +73,7 @@ set(VLLM_EXT_SRC
    "csrc/cpu/cache.cpp"
    "csrc/cpu/layernorm.cpp"
    "csrc/cpu/pos_encoding.cpp"
-    "csrc/cpu/torch_bindings.cpp")
+    "csrc/cpu/pybind.cpp")

 define_gpu_extension_target(
    _C
@ -81,10 +81,10 @@ define_gpu_extension_target(
    LANGUAGE CXX
    SOURCES ${VLLM_EXT_SRC}
    COMPILE_FLAGS ${CXX_COMPILE_FLAGS}
-    USE_SABI 3
-    WITH_SOABI
+    WITH_SOABI 
 )

 add_custom_target(default)
 message(STATUS "Enabling C extension.")
 add_dependencies(default _C)
+
--- a/cmake/utils.cmake
+++ b/cmake/utils.cmake
@ -5,7 +5,7 @@
 macro (find_python_from_executable EXECUTABLE SUPPORTED_VERSIONS)
  file(REAL_PATH ${EXECUTABLE} EXECUTABLE)
  set(Python_EXECUTABLE ${EXECUTABLE})
-  find_package(Python COMPONENTS Interpreter Development.Module Development.SABIModule)
+  find_package(Python COMPONENTS Interpreter Development.Module)
  if (NOT Python_FOUND)
    message(FATAL_ERROR "Unable to find python matching: ${EXECUTABLE}.")
  endif()
@ -294,7 +294,6 @@ endmacro()
 # INCLUDE_DIRECTORIES <dirs> - Extra include directories.
 # LIBRARIES <libraries>      - Extra link libraries.
 # WITH_SOABI                 - Generate library with python SOABI suffix name.
-# USE_SABI <version>         - Use python stable api <version>
 #
 # Note: optimization level/debug info is set via cmake build type.
 #
@ -302,7 +301,7 @@ function (define_gpu_extension_target GPU_MOD_NAME)
  cmake_parse_arguments(PARSE_ARGV 1
    GPU
    "WITH_SOABI"
-    "DESTINATION;LANGUAGE;USE_SABI"
+    "DESTINATION;LANGUAGE"
    "SOURCES;ARCHITECTURES;COMPILE_FLAGS;INCLUDE_DIRECTORIES;LIBRARIES")

  # Add hipify preprocessing step when building with HIP/ROCm.
@ -316,11 +315,7 @@ function (define_gpu_extension_target GPU_MOD_NAME)
    set(GPU_WITH_SOABI)
  endif()

-  if (GPU_USE_SABI)
-    Python_add_library(${GPU_MOD_NAME} MODULE USE_SABI ${GPU_USE_SABI} ${GPU_WITH_SOABI} "${GPU_SOURCES}")
-  else()
-    Python_add_library(${GPU_MOD_NAME} MODULE ${GPU_WITH_SOABI} "${GPU_SOURCES}")
-  endif()
+  Python_add_library(${GPU_MOD_NAME} MODULE "${GPU_SOURCES}" ${GPU_WITH_SOABI})

  if (GPU_LANGUAGE STREQUAL "HIP")
    # Make this target dependent on the hipify preprocessor step.
--- a/collect_env.py
+++ b/collect_env.py
@ -64,7 +64,6 @@ DEFAULT_CONDA_PATTERNS = {
    "triton",
    "optree",
    "nccl",
-    "transformers",
 }

 DEFAULT_PIP_PATTERNS = {
@ -76,7 +75,6 @@ DEFAULT_PIP_PATTERNS = {
    "optree",
    "onnx",
    "nccl",
-    "transformers",
 }


@ -603,11 +601,6 @@ Versions of relevant libraries:
 {conda_packages}
 """.strip()

-# both the above code and the following code use `strip()` to
-# remove leading/trailing whitespaces, so we need to add a newline
-# in between to separate the two sections
-env_info_fmt += "\n"
-
 env_info_fmt += """
 ROCM Version: {rocm_version}
 Neuron SDK Version: {neuron_sdk_version}
--- a/csrc/activation_kernels.cu
+++ b/csrc/activation_kernels.cu
@ -1,5 +1,5 @@
 #include <ATen/cuda/CUDAContext.h>
-#include <torch/all.h>
+#include <torch/extension.h>
 #include <c10/cuda/CUDAGuard.h>

 #include <cmath>
--- a/csrc/attention/attention_kernels.cu
+++ b/csrc/attention/attention_kernels.cu
@ -17,7 +17,7 @@
 * limitations under the License.
 */

-#include <torch/all.h>
+#include <torch/extension.h>
 #include <ATen/cuda/CUDAContext.h>
 #include <c10/cuda/CUDAGuard.h>
 #include <algorithm>
@ -808,17 +808,16 @@ void paged_attention_v1(
    torch::Tensor&
        key_cache,  // [num_blocks, num_heads, head_size/x, block_size, x]
    torch::Tensor&
-        value_cache,       // [num_blocks, num_heads, head_size, block_size]
-    int64_t num_kv_heads,  // [num_heads]
-    double scale,
+        value_cache,   // [num_blocks, num_heads, head_size, block_size]
+    int num_kv_heads,  // [num_heads]
+    float scale,
    torch::Tensor& block_tables,  // [num_seqs, max_num_blocks_per_seq]
    torch::Tensor& seq_lens,      // [num_seqs]
-    int64_t block_size, int64_t max_seq_len,
+    int block_size, int max_seq_len,
    const c10::optional<torch::Tensor>& alibi_slopes,
-    const std::string& kv_cache_dtype, double kv_scale, const int64_t tp_rank,
-    const int64_t blocksparse_local_blocks,
-    const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
-    const int64_t blocksparse_head_sliding_step) {
+    const std::string& kv_cache_dtype, float kv_scale, const int tp_rank,
+    const int blocksparse_local_blocks, const int blocksparse_vert_stride,
+    const int blocksparse_block_size, const int blocksparse_head_sliding_step) {
  const bool is_block_sparse = (blocksparse_vert_stride > 1);

  DISPATCH_BY_KV_CACHE_DTYPE(query.dtype(), kv_cache_dtype,
@ -973,17 +972,16 @@ void paged_attention_v2(
    torch::Tensor&
        key_cache,  // [num_blocks, num_heads, head_size/x, block_size, x]
    torch::Tensor&
-        value_cache,       // [num_blocks, num_heads, head_size, block_size]
-    int64_t num_kv_heads,  // [num_heads]
-    double scale,
+        value_cache,   // [num_blocks, num_heads, head_size, block_size]
+    int num_kv_heads,  // [num_heads]
+    float scale,
    torch::Tensor& block_tables,  // [num_seqs, max_num_blocks_per_seq]
    torch::Tensor& seq_lens,      // [num_seqs]
-    int64_t block_size, int64_t max_seq_len,
+    int block_size, int max_seq_len,
    const c10::optional<torch::Tensor>& alibi_slopes,
-    const std::string& kv_cache_dtype, double kv_scale, const int64_t tp_rank,
-    const int64_t blocksparse_local_blocks,
-    const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
-    const int64_t blocksparse_head_sliding_step) {
+    const std::string& kv_cache_dtype, float kv_scale, const int tp_rank,
+    const int blocksparse_local_blocks, const int blocksparse_vert_stride,
+    const int blocksparse_block_size, const int blocksparse_head_sliding_step) {
  const bool is_block_sparse = (blocksparse_vert_stride > 1);
  DISPATCH_BY_KV_CACHE_DTYPE(query.dtype(), kv_cache_dtype,
                             CALL_V2_LAUNCHER_BLOCK_SIZE)
@ -992,4 +990,4 @@ void paged_attention_v2(
 #undef WARP_SIZE
 #undef MAX
 #undef MIN
-#undef DIVIDE_ROUND_UP
+#undef DIVIDE_ROUND_UP
--- a/csrc/cache.h
+++ b/csrc/cache.h
@ -1,6 +1,6 @@
 #pragma once

-#include <torch/all.h>
+#include <torch/extension.h>

 #include <map>
 #include <vector>
@ -8,18 +8,14 @@
 void swap_blocks(torch::Tensor& src, torch::Tensor& dst,
                 const torch::Tensor& block_mapping);

-// Note: the key_caches and value_caches vectors are constant but
-// not the Tensors they contain. The vectors need to be const refs
-// in order to satisfy pytorch's C++ operator registration code.
-void copy_blocks(std::vector<torch::Tensor> const& key_caches,
-                 std::vector<torch::Tensor> const& value_caches,
+void copy_blocks(std::vector<torch::Tensor>& key_caches,
+                 std::vector<torch::Tensor>& value_caches,
                 const torch::Tensor& block_mapping);

 void reshape_and_cache(torch::Tensor& key, torch::Tensor& value,
                       torch::Tensor& key_cache, torch::Tensor& value_cache,
                       torch::Tensor& slot_mapping,
-                       const std::string& kv_cache_dtype,
-                       const double kv_scale);
+                       const std::string& kv_cache_dtype, const float kv_scale);

 void reshape_and_cache_flash(torch::Tensor& key, torch::Tensor& value,
                             torch::Tensor& key_cache,
@ -29,4 +25,4 @@ void reshape_and_cache_flash(torch::Tensor& key, torch::Tensor& value,

 // Just for unittest
 void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache,
-                 const double scale, const std::string& kv_cache_dtype);
+                 const float scale, const std::string& kv_cache_dtype);
--- a/csrc/cache_kernels.cu
+++ b/csrc/cache_kernels.cu
@ -1,4 +1,4 @@
-#include <torch/all.h>
+#include <torch/extension.h>
 #include <ATen/cuda/CUDAContext.h>
 #include <c10/cuda/CUDAGuard.h>

@ -95,11 +95,8 @@ __global__ void copy_blocks_kernel(int64_t* key_cache_ptrs,

 }  // namespace vllm

-// Note: the key_caches and value_caches vectors are constant but
-// not the Tensors they contain. The vectors need to be const refs
-// in order to satisfy pytorch's C++ operator registration code.
-void copy_blocks(std::vector<torch::Tensor> const& key_caches,
-                 std::vector<torch::Tensor> const& value_caches,
+void copy_blocks(std::vector<torch::Tensor>& key_caches,
+                 std::vector<torch::Tensor>& value_caches,
                 const torch::Tensor& block_mapping) {
  int num_layers = key_caches.size();
  TORCH_CHECK(num_layers == value_caches.size());
@ -258,7 +255,7 @@ void reshape_and_cache(
    torch::Tensor&
        value_cache,  // [num_blocks, num_heads, head_size, block_size]
    torch::Tensor& slot_mapping,  // [num_tokens]
-    const std::string& kv_cache_dtype, const double kv_scale) {
+    const std::string& kv_cache_dtype, const float kv_scale) {
  int num_tokens = key.size(0);
  int num_heads = key.size(1);
  int head_size = key.size(2);
@ -337,7 +334,7 @@ __global__ void convert_fp8_kernel(const Tin* __restrict__ src_cache,

 // Only for testing.
 void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache,
-                 const double kv_scale, const std::string& kv_cache_dtype) {
+                 const float kv_scale, const std::string& kv_cache_dtype) {
  torch::Device src_device = src_cache.device();
  torch::Device dst_device = dst_cache.device();
  TORCH_CHECK(src_device.is_cuda(), "src must be on a GPU")
--- a/csrc/cpu/attention.cpp
+++ b/csrc/cpu/attention.cpp
@ -420,13 +420,12 @@ void paged_attention_v1_impl_launcher(

 void paged_attention_v1(
    torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache,
-    torch::Tensor& value_cache, int64_t num_kv_heads, double scale,
-    torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size,
-    int64_t max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes,
-    const std::string& kv_cache_dtype, double kv_scale, const int64_t tp_rank,
-    const int64_t blocksparse_local_blocks,
-    const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
-    const int64_t blocksparse_head_sliding_step) {
+    torch::Tensor& value_cache, int num_kv_heads, float scale,
+    torch::Tensor& block_tables, torch::Tensor& seq_lens, int block_size,
+    int max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes,
+    const std::string& kv_cache_dtype, float kv_scale, const int tp_rank,
+    const int blocksparse_local_blocks, const int blocksparse_vert_stride,
+    const int blocksparse_block_size, const int blocksparse_head_sliding_step) {
  TORCH_CHECK(kv_scale == 1.0f);
  TORCH_CHECK(blocksparse_vert_stride <= 1,
              "CPU backend does not support blocksparse attention yet.");
@ -739,13 +738,12 @@ void paged_attention_v2_impl_launcher(
 void paged_attention_v2(
    torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits,
    torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
-    torch::Tensor& value_cache, int64_t num_kv_heads, double scale,
-    torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size,
-    int64_t max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes,
-    const std::string& kv_cache_dtype, double kv_scale, const int64_t tp_rank,
-    const int64_t blocksparse_local_blocks,
-    const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
-    const int64_t blocksparse_head_sliding_step) {
+    torch::Tensor& value_cache, int num_kv_heads, float scale,
+    torch::Tensor& block_tables, torch::Tensor& seq_lens, int block_size,
+    int max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes,
+    const std::string& kv_cache_dtype, float kv_scale, const int tp_rank,
+    const int blocksparse_local_blocks, const int blocksparse_vert_stride,
+    const int blocksparse_block_size, const int blocksparse_head_sliding_step) {
  TORCH_CHECK(kv_scale == 1.0f);
  TORCH_CHECK(blocksparse_vert_stride <= 1,
              "CPU backend does not support blocksparse attention yet.");
--- a/csrc/cpu/cache.cpp
+++ b/csrc/cpu/cache.cpp
@ -5,8 +5,8 @@

 namespace {
 template <typename scalar_t>
-void copy_blocks_cpu_impl(std::vector<torch::Tensor> const& key_caches,
-                          std::vector<torch::Tensor> const& value_caches,
+void copy_blocks_cpu_impl(std::vector<torch::Tensor>& key_caches,
+                          std::vector<torch::Tensor>& value_caches,
                          const torch::Tensor& mapping_pairs,
                          const int element_num_per_block,
                          const int layer_num) {
@ -82,11 +82,8 @@ void reshape_and_cache_cpu_impl(
 }
 };  // namespace

-// Note: the key_caches and value_caches vectors are constant but
-// not the Tensors they contain. The vectors need to be const refs
-// in order to satisfy pytorch's C++ operator registration code.
-void copy_blocks(std::vector<torch::Tensor> const& key_caches,
-                 std::vector<torch::Tensor> const& value_caches,
+void copy_blocks(std::vector<torch::Tensor>& key_caches,
+                 std::vector<torch::Tensor>& value_caches,
                 const torch::Tensor& block_mapping) {
  unsigned num_layers = key_caches.size();
  TORCH_CHECK(num_layers == value_caches.size());
@ -107,7 +104,7 @@ void copy_blocks(std::vector<torch::Tensor> const& key_caches,
 void reshape_and_cache(torch::Tensor& key, torch::Tensor& value,
                       torch::Tensor& key_cache, torch::Tensor& value_cache,
                       torch::Tensor& slot_mapping,
-                       const std::string& kv_cache_dtype, double kv_scale) {
+                       const std::string& kv_cache_dtype, float kv_scale) {
  TORCH_CHECK(kv_scale == 1.0f);

  int num_tokens = key.size(0);
--- a/csrc/cpu/cpu_types.hpp
+++ b/csrc/cpu/cpu_types.hpp
@ -3,7 +3,7 @@
 #define CPU_TYPES_HPP

 #include <immintrin.h>
-#include <torch/all.h>
+#include <torch/extension.h>

 namespace vec_op {

--- a/csrc/cpu/layernorm.cpp
+++ b/csrc/cpu/layernorm.cpp
@ -88,7 +88,7 @@ void fused_add_rms_norm_impl(scalar_t* __restrict__ input,
 }  // namespace

 void rms_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight,
-              double epsilon) {
+              float epsilon) {
  int hidden_size = input.size(-1);
  int num_tokens = input.numel() / hidden_size;

@ -102,7 +102,7 @@ void rms_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight,
 }

 void fused_add_rms_norm(torch::Tensor& input, torch::Tensor& residual,
-                        torch::Tensor& weight, double epsilon) {
+                        torch::Tensor& weight, float epsilon) {
  int hidden_size = input.size(-1);
  int num_tokens = input.numel() / hidden_size;

--- a/csrc/cpu/pos_encoding.cpp
+++ b/csrc/cpu/pos_encoding.cpp
@ -21,57 +21,7 @@ void rotary_embedding_impl(
  constexpr int VEC_ELEM_NUM = scalar_vec_t::get_elem_num();

  const int embed_dim = rot_dim / 2;
-  bool flag = (embed_dim % VEC_ELEM_NUM == 0);
-  const int loop_upper = flag ? embed_dim : embed_dim - VEC_ELEM_NUM;
-
-  auto compute_loop = [&](const int64_t token_head, const scalar_t* cache_ptr,
-                          scalar_t* qk) {
-    int j = 0;
-    for (; j < loop_upper; j += VEC_ELEM_NUM) {
-      const int rot_offset = j;
-      const int x_index = rot_offset;
-      const int y_index = embed_dim + rot_offset;
-
-      const int64_t out_x = token_head + x_index;
-      const int64_t out_y = token_head + y_index;
-
-      const scalar_vec_t cos(cache_ptr + x_index);
-      const scalar_vec_t sin(cache_ptr + y_index);
-
-      const scalar_vec_t q_x(qk + out_x);
-      const scalar_vec_t q_y(qk + out_y);
-
-      vec_op::FP32Vec8 fp32_cos(cos);
-      vec_op::FP32Vec8 fp32_sin(sin);
-
-      vec_op::FP32Vec8 fp32_q_x(q_x);
-      vec_op::FP32Vec8 fp32_q_y(q_y);
-
-      auto out1 = fp32_q_x * fp32_cos - fp32_q_y * fp32_sin;
-      scalar_vec_t(out1).save(qk + out_x);
-
-      auto out2 = fp32_q_y * fp32_cos + fp32_q_x * fp32_sin;
-      scalar_vec_t(out2).save(qk + out_y);
-    }
-    if (!flag) {
-      for (; j < embed_dim; ++j) {
-        const int x_index = j;
-        const int y_index = embed_dim + j;
-
-        const int64_t out_x = token_head + x_index;
-        const int64_t out_y = token_head + y_index;
-
-        const float fp32_cos = cache_ptr[x_index];
-        const float fp32_sin = cache_ptr[y_index];
-
-        const float fp32_q_x = qk[out_x];
-        const float fp32_q_y = qk[out_y];
-
-        qk[out_x] = fp32_q_x * fp32_cos - fp32_q_y * fp32_sin;
-        qk[out_y] = fp32_q_y * fp32_cos + fp32_q_x * fp32_sin;
-      }
-    }
-  };
+  TORCH_CHECK(embed_dim % VEC_ELEM_NUM == 0);

 #pragma omp parallel for
  for (int token_idx = 0; token_idx < num_tokens; ++token_idx) {
@ -82,13 +32,62 @@ void rotary_embedding_impl(
      const int head_idx = i;
      const int64_t token_head =
          token_idx * query_stride + head_idx * head_size;
-      compute_loop(token_head, cache_ptr, query);
+      for (int j = 0; j < embed_dim; j += VEC_ELEM_NUM) {
+        const int rot_offset = j;
+        const int x_index = rot_offset;
+        const int y_index = embed_dim + rot_offset;
+
+        const int64_t out_x = token_head + x_index;
+        const int64_t out_y = token_head + y_index;
+
+        const scalar_vec_t cos(cache_ptr + x_index);
+        const scalar_vec_t sin(cache_ptr + y_index);
+
+        const scalar_vec_t q_x(query + out_x);
+        const scalar_vec_t q_y(query + out_y);
+
+        vec_op::FP32Vec8 fp32_cos(cos);
+        vec_op::FP32Vec8 fp32_sin(sin);
+
+        vec_op::FP32Vec8 fp32_q_x(q_x);
+        vec_op::FP32Vec8 fp32_q_y(q_y);
+
+        auto out1 = fp32_q_x * fp32_cos - fp32_q_y * fp32_sin;
+        scalar_vec_t(out1).save(query + out_x);
+
+        auto out2 = fp32_q_y * fp32_cos + fp32_q_x * fp32_sin;
+        scalar_vec_t(out2).save(query + out_y);
+      }
    }

    for (int i = 0; i < num_kv_heads; ++i) {
      const int head_idx = i;
      const int64_t token_head = token_idx * key_stride + head_idx * head_size;
-      compute_loop(token_head, cache_ptr, key);
+      for (int j = 0; j < embed_dim; j += VEC_ELEM_NUM) {
+        const int rot_offset = j;
+        const int x_index = rot_offset;
+        const int y_index = embed_dim + rot_offset;
+
+        const int64_t out_x = token_head + x_index;
+        const int64_t out_y = token_head + y_index;
+
+        const scalar_vec_t cos(cache_ptr + x_index);
+        const scalar_vec_t sin(cache_ptr + y_index);
+
+        const scalar_vec_t k_x(key + out_x);
+        const scalar_vec_t k_y(key + out_y);
+
+        vec_op::FP32Vec8 fp32_cos(cos);
+        vec_op::FP32Vec8 fp32_sin(sin);
+
+        vec_op::FP32Vec8 fp32_k_x(k_x);
+        vec_op::FP32Vec8 fp32_k_y(k_y);
+
+        auto out1 = fp32_k_x * fp32_cos - fp32_k_y * fp32_sin;
+        scalar_vec_t(out1).save(key + out_x);
+        auto out2 = fp32_k_y * fp32_cos + fp32_k_x * fp32_sin;
+        scalar_vec_t(out2).save(key + out_y);
+      }
    }
  }
 }
@ -168,7 +167,7 @@ void rotary_embedding_gptj_impl(
 };  // namespace

 void rotary_embedding(torch::Tensor& positions, torch::Tensor& query,
-                      torch::Tensor& key, int64_t head_size,
+                      torch::Tensor& key, int head_size,
                      torch::Tensor& cos_sin_cache, bool is_neox) {
  int num_tokens = query.numel() / query.size(-1);
  int rot_dim = cos_sin_cache.size(1);
--- a/csrc/cpu/pybind.cpp
+++ b/csrc/cpu/pybind.cpp
@ -0,0 +1,44 @@
+#include "cache.h"
+#include "cuda_utils.h"
+#include "ops.h"
+#include <torch/extension.h>
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  // vLLM custom ops
+  pybind11::module ops = m.def_submodule("ops", "vLLM custom operators");
+
+  // Attention ops
+  ops.def("paged_attention_v1", &paged_attention_v1,
+          "Compute the attention between an input query and the cached "
+          "keys/values using PagedAttention.");
+  ops.def("paged_attention_v2", &paged_attention_v2, "PagedAttention V2.");
+
+  // Activation ops
+  ops.def("silu_and_mul", &silu_and_mul, "Activation function used in SwiGLU.");
+  ops.def("gelu_and_mul", &gelu_and_mul,
+          "Activation function used in GeGLU with `none` approximation.");
+  ops.def("gelu_tanh_and_mul", &gelu_tanh_and_mul,
+          "Activation function used in GeGLU with `tanh` approximation.");
+  ops.def("gelu_new", &gelu_new, "GELU implementation used in GPT-2.");
+  ops.def("gelu_fast", &gelu_fast, "Approximate GELU implementation.");
+
+  // Layernorm
+  ops.def("rms_norm", &rms_norm,
+          "Apply Root Mean Square (RMS) Normalization to the input tensor.");
+
+  ops.def("fused_add_rms_norm", &fused_add_rms_norm,
+          "In-place fused Add and RMS Normalization");
+
+  // Rotary embedding
+  ops.def("rotary_embedding", &rotary_embedding,
+          "Apply GPT-NeoX or GPT-J style rotary embedding to query and key");
+
+  // Cache ops
+  pybind11::module cache_ops = m.def_submodule("cache_ops", "vLLM cache ops");
+  cache_ops.def("swap_blocks", &swap_blocks,
+                "Swap in (out) the cache blocks from src to dst");
+  cache_ops.def("copy_blocks", &copy_blocks,
+                "Copy the cache blocks from src to dst");
+  cache_ops.def("reshape_and_cache", &reshape_and_cache,
+                "Reshape the key and value tensors and cache them");
+}
--- a/csrc/cpu/torch_bindings.cpp
+++ b/csrc/cpu/torch_bindings.cpp
@ -1,106 +0,0 @@
-#include "cache.h"
-#include "ops.h"
-#include "registration.h"
-
-#include <torch/library.h>
-
-TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
-  // vLLM custom ops
-
-  // Attention ops
-  // Compute the attention between an input query and the cached keys/values
-  // using PagedAttention.
-  ops.def(
-      "paged_attention_v1("
-      "    Tensor! out, Tensor query, Tensor key_cache,"
-      "    Tensor value_cache, int num_kv_heads, float scale,"
-      "    Tensor block_tables, Tensor seq_lens, int block_size,"
-      "    int max_seq_len, Tensor? alibi_slopes,"
-      "    str kv_cache_dtype, float kv_scale, int tp_rank,"
-      "    int blocksparse_local_blocks,"
-      "    int blocksparse_vert_stride, int blocksparse_block_size,"
-      "    int blocksparse_head_sliding_step) -> ()");
-  ops.impl("paged_attention_v1", torch::kCPU, &paged_attention_v1);
-
-  // PagedAttention V2.
-  ops.def(
-      "paged_attention_v2("
-      "    Tensor! out, Tensor exp_sums, Tensor max_logits,"
-      "    Tensor tmp_out, Tensor query, Tensor key_cache,"
-      "    Tensor value_cache, int num_kv_heads, float scale,"
-      "    Tensor block_tables, Tensor seq_lens, int block_size,"
-      "    int max_seq_len, Tensor? alibi_slopes,"
-      "    str kv_cache_dtype, float kv_scale, int tp_rank,"
-      "    int blocksparse_local_blocks,"
-      "    int blocksparse_vert_stride, int blocksparse_block_size,"
-      "    int blocksparse_head_sliding_step) -> ()");
-  ops.impl("paged_attention_v2", torch::kCPU, &paged_attention_v2);
-
-  // Activation ops
-
-  // Activation function used in SwiGLU.
-  ops.def("silu_and_mul(Tensor! out, Tensor input) -> ()");
-  ops.impl("silu_and_mul", torch::kCPU, &silu_and_mul);
-
-  // Activation function used in GeGLU with `none` approximation.
-  ops.def("gelu_and_mul(Tensor! out, Tensor input) -> ()");
-  ops.impl("gelu_and_mul", torch::kCPU, &gelu_and_mul);
-
-  // Activation function used in GeGLU with `tanh` approximation.
-  ops.def("gelu_tanh_and_mul(Tensor! out, Tensor input) -> ()");
-  ops.impl("gelu_tanh_and_mul", torch::kCPU, &gelu_tanh_and_mul);
-
-  // GELU implementation used in GPT-2.
-  ops.def("gelu_new(Tensor! out, Tensor input) -> ()");
-  ops.impl("gelu_new", torch::kCPU, &gelu_new);
-
-  // Approximate GELU implementation.
-  ops.def("gelu_fast(Tensor! out, Tensor input) -> ()");
-  ops.impl("gelu_fast", torch::kCPU, &gelu_fast);
-
-  // Layernorm
-  // Apply Root Mean Square (RMS) Normalization to the input tensor.
-  ops.def(
-      "rms_norm(Tensor! out, Tensor input, Tensor weight, float epsilon) -> "
-      "()");
-  ops.impl("rms_norm", torch::kCPU, &rms_norm);
-
-  // In-place fused Add and RMS Normalization.
-  ops.def(
-      "fused_add_rms_norm(Tensor! input, Tensor! residual, Tensor weight, "
-      "float epsilon) -> ()");
-  ops.impl("fused_add_rms_norm", torch::kCPU, &fused_add_rms_norm);
-
-  // Rotary embedding
-  // Apply GPT-NeoX or GPT-J style rotary embedding to query and key.
-  ops.def(
-      "rotary_embedding(Tensor positions, Tensor! query,"
-      "                 Tensor! key, int head_size,"
-      "                 Tensor cos_sin_cache, bool is_neox) -> ()");
-  ops.impl("rotary_embedding", torch::kCPU, &rotary_embedding);
-}
-
-TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
-  // Cache ops
-  // Swap in (out) the cache blocks from src to dst.
-  cache_ops.def(
-      "swap_blocks(Tensor src, Tensor! dst, Tensor block_mapping) -> ()");
-  cache_ops.impl("swap_blocks", torch::kCPU, &swap_blocks);
-
-  // Copy the cache blocks from src to dst.
-  cache_ops.def(
-      "copy_blocks(Tensor[]! key_caches, Tensor[]! value_caches, Tensor "
-      "block_mapping) -> ()");
-  cache_ops.impl("copy_blocks", torch::kCPU, &copy_blocks);
-
-  // Reshape the key and value tensors and cache them.
-  cache_ops.def(
-      "reshape_and_cache(Tensor key, Tensor value,"
-      "                  Tensor! key_cache, Tensor! value_cache,"
-      "                  Tensor slot_mapping,"
-      "                  str kv_cache_dtype,"
-      "                  float kv_scale) -> ()");
-  cache_ops.impl("reshape_and_cache", torch::kCPU, &reshape_and_cache);
-}
-
-REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
--- a/csrc/cuda_compat.h
+++ b/csrc/cuda_compat.h
@ -19,12 +19,8 @@
 #ifndef USE_ROCM
  #define VLLM_SHFL_XOR_SYNC(var, lane_mask) \
    __shfl_xor_sync(uint32_t(-1), var, lane_mask)
-  #define VLLM_SHFL_XOR_SYNC_WIDTH(var, lane_mask, width) \
-    __shfl_xor_sync(uint32_t(-1), var, lane_mask, width)
 #else
  #define VLLM_SHFL_XOR_SYNC(var, lane_mask) __shfl_xor(var, lane_mask)
-  #define VLLM_SHFL_XOR_SYNC_WIDTH(var, lane_mask, width) \
-    __shfl_xor(var, lane_mask, width)
 #endif

 #ifndef USE_ROCM
--- a/csrc/cuda_utils.h
+++ b/csrc/cuda_utils.h
@ -1,5 +1,7 @@
 #pragma once

-int64_t get_device_attribute(int64_t attribute, int64_t device_id);
+#include <torch/extension.h>

-int64_t get_max_shared_memory_per_block_device_attribute(int64_t device_id);
+int get_device_attribute(int attribute, int device_id);
+
+int get_max_shared_memory_per_block_device_attribute(int device_id);
--- a/csrc/cuda_utils_kernels.cu
+++ b/csrc/cuda_utils_kernels.cu
@ -2,7 +2,7 @@
  #include <hip/hip_runtime.h>
  #include <hip/hip_runtime_api.h>
 #endif
-int64_t get_device_attribute(int64_t attribute, int64_t device_id) {
+int get_device_attribute(int attribute, int device_id) {
  int device, value;
  if (device_id < 0) {
    cudaGetDevice(&device);
@ -14,8 +14,8 @@ int64_t get_device_attribute(int64_t attribute, int64_t device_id) {
  return value;
 }

-int64_t get_max_shared_memory_per_block_device_attribute(int64_t device_id) {
-  int64_t attribute;
+int get_max_shared_memory_per_block_device_attribute(int device_id) {
+  int attribute;
  // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html
  // cudaDevAttrMaxSharedMemoryPerBlockOptin = 97 if not is_hip() else 74

--- a/csrc/custom_all_reduce.cu
+++ b/csrc/custom_all_reduce.cu
@ -1,17 +1,17 @@
 #include <ATen/cuda/Exceptions.h>
 #include <c10/cuda/CUDAGuard.h>
 #include <c10/cuda/CUDAStream.h>
-#include <torch/all.h>
+#include <torch/extension.h>

 #include "custom_all_reduce.cuh"

-// fake pointer type, must match fptr_t type in ops.h
-using fptr_t = int64_t;
+// fake pointer type
+using fptr_t = uint64_t;
 static_assert(sizeof(void*) == sizeof(fptr_t));

 fptr_t init_custom_ar(torch::Tensor& meta, torch::Tensor& rank_data,
                      const std::vector<std::string>& handles,
-                      const std::vector<int64_t>& offsets, int64_t rank,
+                      const std::vector<int64_t>& offsets, int rank,
                      bool full_nvlink) {
  int world_size = offsets.size();
  if (world_size > 8)
@ -55,7 +55,7 @@ bool _is_weak_contiguous(torch::Tensor& t) {
          t.numel() * t.element_size());
 }

-bool should_custom_ar(torch::Tensor& inp, int64_t max_size, int64_t world_size,
+bool should_custom_ar(torch::Tensor& inp, int max_size, int world_size,
                      bool full_nvlink) {
  auto inp_size = inp.numel() * inp.element_size();
  // custom allreduce requires input byte size to be multiples of 16
@ -125,7 +125,7 @@ void dispose(fptr_t _fa) {
  delete fa;
 }

-int64_t meta_size() { return sizeof(vllm::Signal); }
+int meta_size() { return sizeof(vllm::Signal); }

 void register_buffer(fptr_t _fa, torch::Tensor& t,
                     const std::vector<std::string>& handles,
@ -134,16 +134,10 @@ void register_buffer(fptr_t _fa, torch::Tensor& t,
  fa->register_buffer(handles, offsets, t.data_ptr());
 }

-std::tuple<torch::Tensor, std::vector<int64_t>> get_graph_buffer_ipc_meta(
+std::pair<std::vector<uint8_t>, std::vector<int64_t>> get_graph_buffer_ipc_meta(
    fptr_t _fa) {
  auto fa = reinterpret_cast<vllm::CustomAllreduce*>(_fa);
-  auto [handle_bytes, offsets] = fa->get_graph_buffer_ipc_meta();
-  auto options =
-      torch::TensorOptions().dtype(torch::kUInt8).device(torch::kCPU);
-  auto handles =
-      torch::empty({static_cast<int64_t>(handle_bytes.size())}, options);
-  std::memcpy(handles.data_ptr(), handle_bytes.data(), handle_bytes.size());
-  return {handles, std::move(offsets)};
+  return fa->get_graph_buffer_ipc_meta();
 }

 void register_graph_buffers(fptr_t _fa, const std::vector<std::string>& handles,
--- a/csrc/dispatch_utils.h
+++ b/csrc/dispatch_utils.h
@ -4,7 +4,7 @@
 */
 #pragma once

-#include <torch/all.h>
+#include <torch/extension.h>

 #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...)         \
  AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
--- a/csrc/layernorm_kernels.cu
+++ b/csrc/layernorm_kernels.cu
@ -1,4 +1,4 @@
-#include <torch/all.h>
+#include <torch/extension.h>
 #include <ATen/cuda/CUDAContext.h>
 #include <c10/cuda/CUDAGuard.h>

@ -291,7 +291,7 @@ fused_add_rms_norm_kernel(
 void rms_norm(torch::Tensor& out,     // [..., hidden_size]
              torch::Tensor& input,   // [..., hidden_size]
              torch::Tensor& weight,  // [hidden_size]
-              double epsilon) {
+              float epsilon) {
  int hidden_size = input.size(-1);
  int num_tokens = input.numel() / hidden_size;

@ -319,7 +319,7 @@ void rms_norm(torch::Tensor& out,     // [..., hidden_size]
 void fused_add_rms_norm(torch::Tensor& input,     // [..., hidden_size]
                        torch::Tensor& residual,  // [..., hidden_size]
                        torch::Tensor& weight,    // [hidden_size]
-                        double epsilon) {
+                        float epsilon) {
  int hidden_size = input.size(-1);
  int num_tokens = input.numel() / hidden_size;

--- a/csrc/moe/moe_ops.cpp
+++ b/csrc/moe/moe_ops.cpp
@ -0,0 +1,8 @@
+#include "moe_ops.h"
+
+#include <torch/extension.h>
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  m.def("topk_softmax", &topk_softmax,
+        "Apply topk softmax to the gating outputs.");
+}
--- a/csrc/moe/moe_ops.h
+++ b/csrc/moe/moe_ops.h
@ -1,6 +1,6 @@
 #pragma once

-#include <torch/all.h>
+#include <torch/extension.h>

 void topk_softmax(torch::Tensor& topk_weights, torch::Tensor& topk_indices,
                  torch::Tensor& token_expert_indices,
--- a/csrc/moe/topk_softmax_kernels.cu
+++ b/csrc/moe/topk_softmax_kernels.cu
@ -16,25 +16,18 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
-#include <torch/all.h>
+#include <torch/extension.h>
 #include <ATen/cuda/CUDAContext.h>
 #include <c10/cuda/CUDAGuard.h>
-#include "../cuda_compat.h"

-#ifndef USE_ROCM
-    #include <cub/util_type.cuh>
-    #include <cub/cub.cuh>
-#else
-    #include <hipcub/util_type.hpp>
-    #include <hipcub/hipcub.hpp>
-#endif
-
-#define MAX(a, b) ((a) > (b) ? (a) : (b))
-#define MIN(a, b) ((a) < (b) ? (a) : (b))
+#include <cub/cub.cuh>
+#include <cub/util_type.cuh>

 namespace vllm {
 namespace moe {

+static constexpr int WARP_SIZE = 32;
+
 /// Aligned array type
 template <
    typename T,
@ -272,7 +265,7 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE) __global__
 #pragma unroll
    for (int mask = THREADS_PER_ROW / 2; mask > 0; mask /= 2)
    {
-        thread_max = max(thread_max, VLLM_SHFL_XOR_SYNC_WIDTH(thread_max, mask, THREADS_PER_ROW));
+        thread_max = max(thread_max, __shfl_xor_sync(0xFFFFFFFF, thread_max, mask, THREADS_PER_ROW));
    }

    // From this point, thread max in all the threads have the max within the row.
@ -289,7 +282,7 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE) __global__
 #pragma unroll
    for (int mask = THREADS_PER_ROW / 2; mask > 0; mask /= 2)
    {
-        row_sum += VLLM_SHFL_XOR_SYNC_WIDTH(row_sum, mask, THREADS_PER_ROW);
+        row_sum += __shfl_xor_sync(0xFFFFFFFF, row_sum, mask, THREADS_PER_ROW);
    }

    // From this point, all threads have the max and the sum for their rows in the thread_max and thread_sum variables
@ -339,8 +332,8 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE) __global__
 #pragma unroll
        for (int mask = THREADS_PER_ROW / 2; mask > 0; mask /= 2)
        {
-            float other_max = VLLM_SHFL_XOR_SYNC_WIDTH(max_val, mask, THREADS_PER_ROW);
-            int other_expert = VLLM_SHFL_XOR_SYNC_WIDTH(expert, mask, THREADS_PER_ROW);
+            float other_max = __shfl_xor_sync(0xFFFFFFFF, max_val, mask, THREADS_PER_ROW);
+            int other_expert = __shfl_xor_sync(0xFFFFFFFF, expert, mask, THREADS_PER_ROW);

            // We want lower indices to "win" in every thread so we break ties this way
            if (other_max > max_val || (other_max == max_val && other_expert < expert))
@ -390,7 +383,7 @@ struct TopkConstants
 {
    static constexpr int ELTS_PER_LDG = BYTES_PER_LDG / sizeof(float);
    static_assert(EXPERTS / (ELTS_PER_LDG * WARP_SIZE) == 0 || EXPERTS % (ELTS_PER_LDG * WARP_SIZE) == 0, "");
-    static constexpr int VECs_PER_THREAD = MAX(1, EXPERTS / (ELTS_PER_LDG * WARP_SIZE));
+    static constexpr int VECs_PER_THREAD = std::max(1, EXPERTS / (ELTS_PER_LDG * WARP_SIZE));
    static constexpr int VPT = VECs_PER_THREAD * ELTS_PER_LDG;
    static constexpr int THREADS_PER_ROW = EXPERTS / VPT;
    static constexpr int ROWS_PER_WARP = WARP_SIZE / THREADS_PER_ROW;
@ -403,7 +396,7 @@ void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, f
 {
    static constexpr std::size_t MAX_BYTES_PER_LDG = 16;

-    static constexpr int BYTES_PER_LDG = MIN(MAX_BYTES_PER_LDG, sizeof(float) * EXPERTS);
+    static constexpr int BYTES_PER_LDG = std::min(MAX_BYTES_PER_LDG, sizeof(float) * EXPERTS);
    using Constants = detail::TopkConstants<EXPERTS, BYTES_PER_LDG>;
    static constexpr int VPT = Constants::VPT;
    static constexpr int ROWS_PER_WARP = Constants::ROWS_PER_WARP;
--- a/csrc/moe/torch_bindings.cpp
+++ b/csrc/moe/torch_bindings.cpp
@ -1,12 +0,0 @@
-#include "registration.h"
-#include "moe_ops.h"
-
-TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
-  // Apply topk softmax to the gating outputs.
-  m.def(
-      "topk_softmax(Tensor! topk_weights, Tensor! topk_indices, Tensor! "
-      "token_expert_indices, Tensor gating_output) -> ()");
-  m.impl("topk_softmax", torch::kCUDA, &topk_softmax);
-}
-
-REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
--- a/csrc/moe_align_block_size_kernels.cu
+++ b/csrc/moe_align_block_size_kernels.cu
@ -1,4 +1,4 @@
-#include <torch/all.h>
+#include <torch/extension.h>
 #include <ATen/cuda/CUDAContext.h>

 #include <ATen/ATen.h>
@ -108,8 +108,8 @@ __global__ void moe_align_block_size_kernel(scalar_t* __restrict__ topk_ids,
 }
 }  // namespace vllm

-void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
-                          int64_t block_size, torch::Tensor sorted_token_ids,
+void moe_align_block_size(torch::Tensor topk_ids, int num_experts,
+                          int block_size, torch::Tensor sorted_token_ids,
                          torch::Tensor experts_ids,
                          torch::Tensor num_tokens_post_pad) {
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
--- a/csrc/ops.h
+++ b/csrc/ops.h
@ -1,42 +1,40 @@
 #pragma once

-#include <torch/library.h>
+#include <torch/extension.h>

 void paged_attention_v1(
    torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache,
-    torch::Tensor& value_cache, int64_t num_kv_heads, double scale,
-    torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size,
-    int64_t max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes,
-    const std::string& kv_cache_dtype, double kv_scale, const int64_t tp_rank,
-    const int64_t blocksparse_local_blocks,
-    const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
-    const int64_t blocksparse_head_sliding_step);
+    torch::Tensor& value_cache, int num_kv_heads, float scale,
+    torch::Tensor& block_tables, torch::Tensor& seq_lens, int block_size,
+    int max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes,
+    const std::string& kv_cache_dtype, float kv_scale, const int tp_rank,
+    const int blocksparse_local_blocks, const int blocksparse_vert_stride,
+    const int blocksparse_block_size, const int blocksparse_head_sliding_step);

 void paged_attention_v2(
    torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits,
    torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
-    torch::Tensor& value_cache, int64_t num_kv_heads, double scale,
-    torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size,
-    int64_t max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes,
-    const std::string& kv_cache_dtype, double kv_scale, const int64_t tp_rank,
-    const int64_t blocksparse_local_blocks,
-    const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
-    const int64_t blocksparse_head_sliding_step);
+    torch::Tensor& value_cache, int num_kv_heads, float scale,
+    torch::Tensor& block_tables, torch::Tensor& seq_lens, int block_size,
+    int max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes,
+    const std::string& kv_cache_dtype, float kv_scale, const int tp_rank,
+    const int blocksparse_local_blocks, const int blocksparse_vert_stride,
+    const int blocksparse_block_size, const int blocksparse_head_sliding_step);

 void rms_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight,
-              double epsilon);
+              float epsilon);

 void fused_add_rms_norm(torch::Tensor& input, torch::Tensor& residual,
-                        torch::Tensor& weight, double epsilon);
+                        torch::Tensor& weight, float epsilon);

 void rotary_embedding(torch::Tensor& positions, torch::Tensor& query,
-                      torch::Tensor& key, int64_t head_size,
+                      torch::Tensor& key, int head_size,
                      torch::Tensor& cos_sin_cache, bool is_neox);

 void batched_rotary_embedding(torch::Tensor& positions, torch::Tensor& query,
-                              torch::Tensor& key, int64_t head_size,
+                              torch::Tensor& key, int head_size,
                              torch::Tensor& cos_sin_cache, bool is_neox,
-                              int64_t rot_dim,
+                              int rot_dim,
                              torch::Tensor& cos_sin_cache_offsets);

 void silu_and_mul(torch::Tensor& out, torch::Tensor& input);
@ -62,12 +60,12 @@ torch::Tensor aqlm_dequant(const torch::Tensor& codes,

 torch::Tensor awq_gemm(torch::Tensor _in_feats, torch::Tensor _kernel,
                       torch::Tensor _scaling_factors, torch::Tensor _zeros,
-                       int64_t split_k_iters);
+                       int split_k_iters);

 torch::Tensor awq_dequantize(torch::Tensor _kernel,
                             torch::Tensor _scaling_factors,
-                             torch::Tensor _zeros, int64_t split_k_iters,
-                             int64_t thx, int64_t thy);
+                             torch::Tensor _zeros, int split_k_iters, int thx,
+                             int thy);

 torch::Tensor marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
                          torch::Tensor& b_scales, torch::Tensor& workspace,
@ -90,17 +88,14 @@ torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm,
                                 int64_t size_k, int64_t size_n,
                                 int64_t num_bits);

-void cutlass_scaled_mm_dq(torch::Tensor& out, torch::Tensor const& a,
-                          torch::Tensor const& b, torch::Tensor const& a_scales,
-                          torch::Tensor const& b_scales);
+int cutlass_scaled_mm_dq(torch::Tensor& out, torch::Tensor const& a,
+                         torch::Tensor const& b, torch::Tensor const& a_scales,
+                         torch::Tensor const& b_scales);

 #endif

-void static_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input,
-                              torch::Tensor const& scale);
-
-void dynamic_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input,
-                               torch::Tensor& scales);
+void static_scaled_int8_quant(torch::Tensor& out, torch::Tensor& input,
+                              float scale);

 void squeezellm_gemm(torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
                     torch::Tensor lookup_table);
@ -108,9 +103,9 @@ void squeezellm_gemm(torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
 torch::Tensor gptq_gemm(torch::Tensor a, torch::Tensor b_q_weight,
                        torch::Tensor b_gptq_qzeros,
                        torch::Tensor b_gptq_scales, torch::Tensor b_g_idx,
-                        bool use_exllama, int64_t bit);
+                        bool use_exllama, int bit);

-void gptq_shuffle(torch::Tensor q_weight, torch::Tensor q_perm, int64_t bit);
+void gptq_shuffle(torch::Tensor q_weight, torch::Tensor q_perm, int bit);

 void static_scaled_fp8_quant(torch::Tensor& out, torch::Tensor& input,
                             torch::Tensor& scale);
@ -118,28 +113,28 @@ void static_scaled_fp8_quant(torch::Tensor& out, torch::Tensor& input,
 void dynamic_scaled_fp8_quant(torch::Tensor& out, torch::Tensor& input,
                              torch::Tensor& scale);

-void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
-                          int64_t block_size, torch::Tensor sorted_token_ids,
+void moe_align_block_size(torch::Tensor topk_ids, int num_experts,
+                          int block_size, torch::Tensor sorted_token_ids,
                          torch::Tensor experts_ids,
                          torch::Tensor num_tokens_post_pad);

 #ifndef USE_ROCM
-using fptr_t = int64_t;
+using fptr_t = uint64_t;
 fptr_t init_custom_ar(torch::Tensor& meta, torch::Tensor& rank_data,
                      const std::vector<std::string>& handles,
-                      const std::vector<int64_t>& offsets, int64_t rank,
+                      const std::vector<int64_t>& offsets, int rank,
                      bool full_nvlink);
-bool should_custom_ar(torch::Tensor& inp, int64_t max_size, int64_t world_size,
+bool should_custom_ar(torch::Tensor& inp, int max_size, int world_size,
                      bool full_nvlink);
 void all_reduce_reg(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out);
 void all_reduce_unreg(fptr_t _fa, torch::Tensor& inp, torch::Tensor& reg_buffer,
                      torch::Tensor& out);
 void dispose(fptr_t _fa);
-int64_t meta_size();
+int meta_size();
 void register_buffer(fptr_t _fa, torch::Tensor& t,
                     const std::vector<std::string>& handles,
                     const std::vector<int64_t>& offsets);
-std::tuple<torch::Tensor, std::vector<int64_t>> get_graph_buffer_ipc_meta(
+std::pair<std::vector<uint8_t>, std::vector<int64_t>> get_graph_buffer_ipc_meta(
    fptr_t _fa);
 void register_graph_buffers(fptr_t _fa, const std::vector<std::string>& handles,
                            const std::vector<std::vector<int64_t>>& offsets);
--- a/csrc/pos_encoding_kernels.cu
+++ b/csrc/pos_encoding_kernels.cu
@ -1,4 +1,4 @@
-#include <torch/all.h>
+#include <torch/extension.h>
 #include <ATen/cuda/CUDAContext.h>
 #include <c10/cuda/CUDAGuard.h>

@ -127,7 +127,7 @@ void rotary_embedding(
                           // [num_tokens, num_heads * head_size]
    torch::Tensor& key,    // [batch_size, seq_len, num_kv_heads * head_size] or
                           // [num_tokens, num_kv_heads * head_size]
-    int64_t head_size,
+    int head_size,
    torch::Tensor& cos_sin_cache,  // [max_position, rot_dim]
    bool is_neox) {
  int64_t num_tokens = query.numel() / query.size(-1);
@ -138,7 +138,7 @@ void rotary_embedding(
  int64_t key_stride = key.stride(-2);

  dim3 grid(num_tokens);
-  dim3 block(std::min<int64_t>(num_heads * rot_dim / 2, 512));
+  dim3 block(std::min(num_heads * rot_dim / 2, 512));
  const at::cuda::OptionalCUDAGuard device_guard(device_of(query));
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  VLLM_DISPATCH_FLOATING_TYPES(query.scalar_type(), "rotary_embedding", [&] {
@ -168,9 +168,9 @@ void batched_rotary_embedding(
                           // [num_tokens, num_heads * head_size]
    torch::Tensor& key,    // [batch_size, seq_len, num_kv_heads * head_size] or
                           // [num_tokens, num_kv_heads * head_size]
-    int64_t head_size,
+    int head_size,
    torch::Tensor& cos_sin_cache,  // [max_position, rot_dim]
-    bool is_neox, int64_t rot_dim,
+    bool is_neox, int rot_dim,
    torch::Tensor& cos_sin_cache_offsets  // [num_tokens]
 ) {
  int64_t num_tokens = cos_sin_cache_offsets.size(0);
@ -180,7 +180,7 @@ void batched_rotary_embedding(
  int64_t key_stride = key.stride(-2);

  dim3 grid(num_tokens);
-  dim3 block(std::min<int64_t>(num_heads * rot_dim / 2, 512));
+  dim3 block(std::min(num_heads * rot_dim / 2, 512));
  const at::cuda::OptionalCUDAGuard device_guard(device_of(query));
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  VLLM_DISPATCH_FLOATING_TYPES(query.scalar_type(), "rotary_embedding", [&] {
--- a/csrc/punica/punica_ops.cu
+++ b/csrc/punica/punica_ops.cu
@ -1,4 +1,4 @@
-#include <torch/all.h>
+#include <torch/extension.h>
 #include <c10/cuda/CUDAGuard.h>
 #include <cstdint>

@ -88,7 +88,7 @@ inline bool launch_bgmv_kernel(out_T *Y, const in_T *X, const W_T *W,
 }

 void dispatch_bgmv(torch::Tensor y, torch::Tensor x, torch::Tensor w,
-                   torch::Tensor indicies, int64_t layer_idx, double scale) {
+                   torch::Tensor indicies, int64_t layer_idx, float scale) {
  CHECK_INPUT(y);
  CHECK_INPUT(x);
  CHECK_INPUT(w);
@ -320,7 +320,7 @@ void dispatch_bgmv(torch::Tensor y, torch::Tensor x, torch::Tensor w,

 void dispatch_bgmv_low_level(torch::Tensor y, torch::Tensor x, torch::Tensor w,
                             torch::Tensor indicies, int64_t layer_idx,
-                             double scale, int64_t h_in, int64_t h_out,
+                             float scale, int64_t h_in, int64_t h_out,
                             int64_t y_offset) {
  CHECK_INPUT(y);
  CHECK_INPUT(x);
--- a/csrc/punica/punica_ops.h
+++ b/csrc/punica/punica_ops.h
@ -1,11 +1,11 @@
 #pragma once

-#include <torch/all.h>
+#include <torch/extension.h>

 void dispatch_bgmv(torch::Tensor y, torch::Tensor x, torch::Tensor w,
-                   torch::Tensor indicies, int64_t layer_idx, double scale);
+                   torch::Tensor indicies, int64_t layer_idx, float scale);

 void dispatch_bgmv_low_level(torch::Tensor y, torch::Tensor x, torch::Tensor w,
                             torch::Tensor indicies, int64_t layer_idx,
-                             double scale, int64_t h_in, int64_t h_out,
+                             float scale, int64_t h_in, int64_t h_out,
                             int64_t y_offset);
--- a/csrc/punica/punica_pybind.cpp
+++ b/csrc/punica/punica_pybind.cpp
@ -0,0 +1,13 @@
+#include <torch/extension.h>
+
+#include "punica_ops.h"
+
+//====== pybind ======
+
+#define DEFINE_pybind(name) m.def(#name, &name, #name);
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  m.def("dispatch_bgmv", &dispatch_bgmv, "dispatch_bgmv");
+  m.def("dispatch_bgmv_low_level", &dispatch_bgmv_low_level,
+        "dispatch_bgmv_low_level");
+}
--- a/csrc/punica/torch_bindings.cpp
+++ b/csrc/punica/torch_bindings.cpp
@ -1,18 +0,0 @@
-#include "registration.h"
-#include "punica_ops.h"
-
-TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
-  m.def(
-      "dispatch_bgmv(Tensor! y, Tensor x, Tensor w, Tensor indicies, int "
-      "layer_idx, float scale) -> ()");
-  m.impl("dispatch_bgmv", torch::kCUDA, &dispatch_bgmv);
-
-  m.def(
-      "dispatch_bgmv_low_level(Tensor! y, Tensor x, Tensor w,"
-      "Tensor indicies, int layer_idx,"
-      "float scale, int h_in, int h_out,"
-      "int y_offset) -> ()");
-  m.impl("dispatch_bgmv_low_level", torch::kCUDA, &dispatch_bgmv_low_level);
-}
-
-REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
--- a/csrc/pybind.cpp
+++ b/csrc/pybind.cpp
@ -0,0 +1,111 @@
+#include "cache.h"
+#include "cuda_utils.h"
+#include "ops.h"
+#include <torch/extension.h>
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  // vLLM custom ops
+  pybind11::module ops = m.def_submodule("ops", "vLLM custom operators");
+
+  // Attention ops
+  ops.def("paged_attention_v1", &paged_attention_v1,
+          "Compute the attention between an input query and the cached "
+          "keys/values using PagedAttention.");
+  ops.def("paged_attention_v2", &paged_attention_v2, "PagedAttention V2.");
+
+  // Activation ops
+  ops.def("silu_and_mul", &silu_and_mul, "Activation function used in SwiGLU.");
+  ops.def("gelu_and_mul", &gelu_and_mul,
+          "Activation function used in GeGLU with `none` approximation.");
+  ops.def("gelu_tanh_and_mul", &gelu_tanh_and_mul,
+          "Activation function used in GeGLU with `tanh` approximation.");
+  ops.def("gelu_new", &gelu_new, "GELU implementation used in GPT-2.");
+  ops.def("gelu_fast", &gelu_fast, "Approximate GELU implementation.");
+
+  // Layernorm
+  ops.def("rms_norm", &rms_norm,
+          "Apply Root Mean Square (RMS) Normalization to the input tensor.");
+
+  ops.def("fused_add_rms_norm", &fused_add_rms_norm,
+          "In-place fused Add and RMS Normalization");
+
+  // Rotary embedding
+  ops.def("rotary_embedding", &rotary_embedding,
+          "Apply GPT-NeoX or GPT-J style rotary embedding to query and key");
+
+  ops.def("batched_rotary_embedding", &batched_rotary_embedding,
+          "Apply GPT-NeoX or GPT-J style rotary embedding to query and key "
+          "(supports multiple loras)");
+
+// Quantization ops
+#ifndef USE_ROCM
+  ops.def("aqlm_gemm", &aqlm_gemm, "Quantized GEMM for AQLM");
+  ops.def("aqlm_dequant", &aqlm_dequant, "Decompression method for AQLM");
+  ops.def("awq_gemm", &awq_gemm, "Quantized GEMM for AWQ");
+  ops.def("marlin_gemm", &marlin_gemm,
+          "Marlin (Dense) Optimized Quantized GEMM for GPTQ");
+  ops.def("gptq_marlin_24_gemm", &gptq_marlin_24_gemm,
+          "Marlin_24 (Sparse) Optimized Quantized GEMM for GPTQ");
+  ops.def("gptq_marlin_gemm", &gptq_marlin_gemm,
+          "gptq_marlin Optimized Quantized GEMM for GPTQ");
+  ops.def("gptq_marlin_repack", &gptq_marlin_repack,
+          "gptq_marlin repack from GPTQ");
+  ops.def("awq_dequantize", &awq_dequantize, "Dequantization for AWQ");
+  ops.def("cutlass_scaled_mm_dq", &cutlass_scaled_mm_dq,
+          "CUTLASS w8a8 GEMM, supporting symmetric per-tensor or "
+          "per-row/column quantization.");
+#endif
+
+  ops.def("gptq_gemm", &gptq_gemm, "Quantized GEMM for GPTQ");
+  ops.def("gptq_shuffle", &gptq_shuffle, "Post processing for GPTQ");
+  ops.def("squeezellm_gemm", &squeezellm_gemm, "Quantized GEMM for SqueezeLLM");
+  ops.def("static_scaled_fp8_quant", &static_scaled_fp8_quant,
+          "Compute FP8 quantized tensor for given scaling factor");
+  ops.def("dynamic_scaled_fp8_quant", &dynamic_scaled_fp8_quant,
+          "Compute FP8 quantized tensor and scaling factor");
+  ops.def("moe_align_block_size", &moe_align_block_size,
+          "Aligning the number of tokens to be processed by each expert such "
+          "that it is divisible by the block size.");
+
+  ops.def("static_scaled_int8_quant", &static_scaled_int8_quant,
+          "Compute int8 quantized tensor for given scaling factor");
+
+  // Cache ops
+  pybind11::module cache_ops = m.def_submodule("cache_ops", "vLLM cache ops");
+  cache_ops.def("swap_blocks", &swap_blocks,
+                "Swap in (out) the cache blocks from src to dst");
+  cache_ops.def("copy_blocks", &copy_blocks,
+                "Copy the cache blocks from src to dst");
+  cache_ops.def("reshape_and_cache", &reshape_and_cache,
+                "Reshape the key and value tensors and cache them");
+  cache_ops.def("reshape_and_cache_flash", &reshape_and_cache_flash,
+                "Reshape the key and value tensors and cache them");
+  cache_ops.def("convert_fp8", &convert_fp8,
+                "Convert the key and value cache to fp8 data type");
+
+  // Cuda utils
+  pybind11::module cuda_utils =
+      m.def_submodule("cuda_utils", "vLLM cuda utils");
+  cuda_utils.def("get_device_attribute", &get_device_attribute,
+                 "Gets the specified device attribute.");
+
+  cuda_utils.def("get_max_shared_memory_per_block_device_attribute",
+                 &get_max_shared_memory_per_block_device_attribute,
+                 "Gets the maximum shared memory per block device attribute.");
+
+#ifndef USE_ROCM
+  // Custom all-reduce kernels
+  pybind11::module custom_ar = m.def_submodule("custom_ar", "custom allreduce");
+  custom_ar.def("init_custom_ar", &init_custom_ar, "init_custom_ar");
+  custom_ar.def("should_custom_ar", &should_custom_ar, "should_custom_ar");
+  custom_ar.def("all_reduce_reg", &all_reduce_reg, "all_reduce_reg");
+  custom_ar.def("all_reduce_unreg", &all_reduce_unreg, "all_reduce_unreg");
+  custom_ar.def("dispose", &dispose, "dispose");
+  custom_ar.def("meta_size", &meta_size, "meta_size");
+  custom_ar.def("register_buffer", &register_buffer, "register_buffer");
+  custom_ar.def("get_graph_buffer_ipc_meta", &get_graph_buffer_ipc_meta,
+                "get_graph_buffer_ipc_meta");
+  custom_ar.def("register_graph_buffers", &register_graph_buffers,
+                "register_graph_buffers");
+#endif
+}
--- a/csrc/quantization/aqlm/gemm_kernels.cu
+++ b/csrc/quantization/aqlm/gemm_kernels.cu
@ -18,7 +18,7 @@
 #include <cuda.h>
 #include <cuda_fp16.h>
 #include <cuda_runtime.h>
-#include <torch/all.h>
+#include <torch/extension.h>
 #include <c10/cuda/CUDAStream.h>
 #include <c10/cuda/CUDAGuard.h>

--- a/csrc/quantization/awq/gemm_kernels.cu
+++ b/csrc/quantization/awq/gemm_kernels.cu
@ -7,7 +7,7 @@ Shang and Dang, Xingyu and Han, Song}, journal={arXiv}, year={2023}
 }
 */

-#include <torch/all.h>
+#include <torch/extension.h>
 #include <c10/cuda/CUDAGuard.h>

 #include "dequantize.cuh"
@ -435,8 +435,8 @@ __global__ void __launch_bounds__(64)

 torch::Tensor awq_dequantize(torch::Tensor _kernel,
                             torch::Tensor _scaling_factors,
-                             torch::Tensor _zeros, int64_t split_k_iters,
-                             int64_t thx, int64_t thy) {
+                             torch::Tensor _zeros, int split_k_iters, int thx,
+                             int thy) {
  int in_c = _kernel.size(0);
  int qout_c = _kernel.size(1);
  int out_c = qout_c * 8;
@ -491,7 +491,7 @@ torch::Tensor awq_dequantize(torch::Tensor _kernel,

 torch::Tensor awq_gemm(torch::Tensor _in_feats, torch::Tensor _kernel,
                       torch::Tensor _scaling_factors, torch::Tensor _zeros,
-                       int64_t split_k_iters) {
+                       int split_k_iters) {
  int num_in_feats = _in_feats.size(0);
  int num_in_channels = _in_feats.size(1);
  const at::cuda::OptionalCUDAGuard device_guard(device_of(_in_feats));
--- a/csrc/quantization/compressed_tensors/int8_quant_kernels.cu
+++ b/csrc/quantization/compressed_tensors/int8_quant_kernels.cu
@ -1,9 +1,8 @@
 #include <ATen/cuda/CUDAContext.h>
-#include <torch/all.h>
+#include <torch/extension.h>
 #include <cmath>

 #include "../../dispatch_utils.h"
-#include "../../reduction_utils.cuh"

 static inline __device__ int8_t float_to_int8_rn(float x) {
 #ifdef USE_ROCM
@ -28,88 +27,33 @@ namespace vllm {

 template <typename scalar_t, typename scale_type>
 __global__ void static_scaled_int8_quant_kernel(
-    scalar_t const* __restrict__ input, int8_t* __restrict__ out,
-    scale_type const* scale_ptr, const int hidden_size) {
-  int const tid = threadIdx.x;
-  int const token_idx = blockIdx.x;
-  scale_type const scale = *scale_ptr;
+    const scalar_t* __restrict__ input, int8_t* __restrict__ out,
+    scale_type scale, const int hidden_size) {
+  const int tid = threadIdx.x;
+  const int token_idx = blockIdx.x;

  for (int i = tid; i < hidden_size; i += blockDim.x) {
-    out[token_idx * hidden_size + i] = float_to_int8_rn(
-        static_cast<float>(input[token_idx * hidden_size + i]) / scale);
+    out[token_idx * hidden_size + i] =
+        float_to_int8_rn(((float)input[token_idx * hidden_size + i]) / scale);
  }
 }
-
-template <typename scalar_t, typename scale_type>
-__global__ void dynamic_scaled_int8_quant_kernel(
-    scalar_t const* __restrict__ input, int8_t* __restrict__ out,
-    scale_type* scale, const int hidden_size) {
-  int const tid = threadIdx.x;
-  int const token_idx = blockIdx.x;
-  float absmax_val = 0.0f;
-  float const zero = 0.0f;
-
-  for (int i = tid; i < hidden_size; i += blockDim.x) {
-    float val = static_cast<float>(input[token_idx * hidden_size + i]);
-    val = val > zero ? val : -val;
-    absmax_val = val > absmax_val ? val : absmax_val;
-  }
-
-  float const block_absmax_val_maybe = blockReduceMax(absmax_val);
-  __shared__ float block_absmax_val;
-  if (tid == 0) {
-    block_absmax_val = block_absmax_val_maybe;
-    scale[token_idx] = block_absmax_val / 127.0f;
-  }
-  __syncthreads();
-
-  float const tmp_scale = 127.0f / block_absmax_val;
-  for (int i = tid; i < hidden_size; i += blockDim.x) {
-    out[token_idx * hidden_size + i] = float_to_int8_rn(
-        static_cast<float>(input[token_idx * hidden_size + i]) * tmp_scale);
-  }
-}
-
 }  // namespace vllm

-void static_scaled_int8_quant(torch::Tensor& out,          // [..., hidden_size]
-                              torch::Tensor const& input,  // [..., hidden_size]
-                              torch::Tensor const& scale) {
+void static_scaled_int8_quant(torch::Tensor& out,    // [..., hidden_size]
+                              torch::Tensor& input,  // [..., hidden_size]
+                              float scale) {
  TORCH_CHECK(input.is_contiguous());
  TORCH_CHECK(out.is_contiguous());
-  TORCH_CHECK(scale.numel() == 1);
-
-  int const hidden_size = input.size(-1);
-  int const num_tokens = input.numel() / hidden_size;
-  dim3 const grid(num_tokens);
-  dim3 const block(std::min(hidden_size, 1024));
+  int hidden_size = input.size(-1);
+  int num_tokens = input.numel() / hidden_size;
+  dim3 grid(num_tokens);
+  dim3 block(std::min(hidden_size, 1024));
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  VLLM_DISPATCH_FLOATING_TYPES(
      input.scalar_type(), "static_scaled_int8_quant_kernel", [&] {
        vllm::static_scaled_int8_quant_kernel<scalar_t, float>
            <<<grid, block, 0, stream>>>(input.data_ptr<scalar_t>(),
-                                         out.data_ptr<int8_t>(),
-                                         scale.data_ptr<float>(), hidden_size);
-      });
-}
-
-void dynamic_scaled_int8_quant(
-    torch::Tensor& out,          // [..., hidden_size]
-    torch::Tensor const& input,  // [..., hidden_size]
-    torch::Tensor& scales) {
-  TORCH_CHECK(input.is_contiguous());
-  TORCH_CHECK(out.is_contiguous());
-
-  int const hidden_size = input.size(-1);
-  int const num_tokens = input.numel() / hidden_size;
-  dim3 const grid(num_tokens);
-  dim3 const block(std::min(hidden_size, 1024));
-  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-  VLLM_DISPATCH_FLOATING_TYPES(
-      input.scalar_type(), "dynamic_scaled_int8_quant_kernel", [&] {
-        vllm::dynamic_scaled_int8_quant_kernel<scalar_t, float>
-            <<<grid, block, 0, stream>>>(input.data_ptr<scalar_t>(),
-                                         out.data_ptr<int8_t>(),
-                                         scales.data_ptr<float>(), hidden_size);
+                                         out.data_ptr<int8_t>(), scale,
+                                         hidden_size);
      });
 }
--- a/csrc/quantization/cutlass_w8a8/scaled_mm_dq_c2x.cu
+++ b/csrc/quantization/cutlass_w8a8/scaled_mm_dq_c2x.cu
@ -1,5 +1,5 @@
 #include <stddef.h>
-#include <torch/all.h>
+#include <torch/extension.h>

 #include <ATen/cuda/CUDAContext.h>

@ -48,44 +48,9 @@ using namespace cute;

 namespace {

-// Wrappers for the GEMM kernel that is used to guard against compilation on
-// architectures that will never use the kernel. The purpose of this is to
-// reduce the size of the compiled binary.
-// __CUDA_ARCH__ is not defined in host code, so this lets us smuggle the ifdef
-// into code that will be executed on the device where it is defined.
-template <typename Kernel>
-struct enable_sm75_to_sm80 : Kernel {
-  template <typename... Args>
-  CUTLASS_DEVICE static void invoke(Args&&... args) {
-#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 750 && __CUDA_ARCH__ < 800
-    Kernel::invoke(std::forward<Args>(args)...);
-#endif
-  }
-};
-
-template <typename Kernel>
-struct enable_sm80_to_sm89 : Kernel {
-  template <typename... Args>
-  CUTLASS_DEVICE static void invoke(Args&&... args) {
-#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 800 && __CUDA_ARCH__ < 890
-    Kernel::invoke(std::forward<Args>(args)...);
-#endif
-  }
-};
-
-template <typename Kernel>
-struct enable_sm89_to_sm90 : Kernel {
-  template <typename... Args>
-  CUTLASS_DEVICE static void invoke(Args&&... args) {
-#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 890 && __CUDA_ARCH__ < 900
-    Kernel::invoke(std::forward<Args>(args)...);
-#endif
-  }
-};
-
-template <typename Arch, template <typename> typename ArchGuard,
-          typename ElementAB_, typename ElementD_, typename TileShape,
-          typename WarpShape, typename InstructionShape, int32_t MainLoopStages>
+template <typename Arch, typename ElementAB_, typename ElementD_,
+          typename TileShape, typename WarpShape, typename InstructionShape,
+          int32_t MainLoopStages>
 struct cutlass_2x_gemm {
  using ElementAB = ElementAB_;
  using ElementD = ElementD_;
@ -136,7 +101,7 @@ struct cutlass_2x_gemm {
  using RowMajor = typename cutlass::layout::RowMajor;
  using ColumnMajor = typename cutlass::layout::ColumnMajor;
  using KernelType = 
-    ArchGuard<typename cutlass::gemm::kernel::DefaultGemmWithVisitor<
+    typename cutlass::gemm::kernel::DefaultGemmWithVisitor<
      ElementAB, RowMajor, cutlass::ComplexTransform::kNone, 16, 
      ElementAB, ColumnMajor, cutlass::ComplexTransform::kNone, 16, 
      float, cutlass::layout::RowMajor, 4,
@ -147,7 +112,7 @@ struct cutlass_2x_gemm {
      cutlass::gemm::threadblock::ThreadblockSwizzleStreamK,
      MainLoopStages, Operator,
      1 /* epilogue stages */
-      >::GemmKernel>;
+      >::GemmKernel;
  // clang-format on

  using Op = cutlass::gemm::device::GemmUniversalAdapter<KernelType>;
@ -243,16 +208,16 @@ void cutlass_scaled_mm_dq_sm75(torch::Tensor& out, torch::Tensor const& a,
  using InstructionShape = typename cutlass::gemm::GemmShape<8, 8, 16>;

  if (out.dtype() == torch::kBFloat16) {
-    return cutlass_scaled_mm_dq_dispatcher<cutlass_2x_gemm<
-        cutlass::arch::Sm75, enable_sm75_to_sm80, int8_t, cutlass::bfloat16_t,
-        TileShape, WarpShape, InstructionShape, 2>>(out, a, b, a_scales,
-                                                    b_scales);
+    return cutlass_scaled_mm_dq_dispatcher<
+        cutlass_2x_gemm<cutlass::arch::Sm75, int8_t, cutlass::bfloat16_t,
+                        TileShape, WarpShape, InstructionShape, 2>>(
+        out, a, b, a_scales, b_scales);
  } else {
    TORCH_CHECK(out.dtype() == torch::kFloat16);
-    return cutlass_scaled_mm_dq_dispatcher<cutlass_2x_gemm<
-        cutlass::arch::Sm75, enable_sm75_to_sm80, int8_t, cutlass::half_t,
-        TileShape, WarpShape, InstructionShape, 2>>(out, a, b, a_scales,
-                                                    b_scales);
+    return cutlass_scaled_mm_dq_dispatcher<
+        cutlass_2x_gemm<cutlass::arch::Sm75, int8_t, cutlass::half_t, TileShape,
+                        WarpShape, InstructionShape, 2>>(out, a, b, a_scales,
+                                                         b_scales);
  }
 }

@ -270,16 +235,16 @@ void cutlass_scaled_mm_dq_sm80(torch::Tensor& out, torch::Tensor const& a,
  using InstructionShape = typename cutlass::gemm::GemmShape<16, 8, 32>;

  if (out.dtype() == torch::kBFloat16) {
-    return cutlass_scaled_mm_dq_dispatcher<cutlass_2x_gemm<
-        cutlass::arch::Sm80, enable_sm80_to_sm89, int8_t, cutlass::bfloat16_t,
-        TileShape, WarpShape, InstructionShape, 5>>(out, a, b, a_scales,
-                                                    b_scales);
+    return cutlass_scaled_mm_dq_dispatcher<
+        cutlass_2x_gemm<cutlass::arch::Sm80, int8_t, cutlass::bfloat16_t,
+                        TileShape, WarpShape, InstructionShape, 5>>(
+        out, a, b, a_scales, b_scales);
  } else {
    TORCH_CHECK(out.dtype() == torch::kFloat16);
-    return cutlass_scaled_mm_dq_dispatcher<cutlass_2x_gemm<
-        cutlass::arch::Sm80, enable_sm80_to_sm89, int8_t, cutlass::half_t,
-        TileShape, WarpShape, InstructionShape, 5>>(out, a, b, a_scales,
-                                                    b_scales);
+    return cutlass_scaled_mm_dq_dispatcher<
+        cutlass_2x_gemm<cutlass::arch::Sm80, int8_t, cutlass::half_t, TileShape,
+                        WarpShape, InstructionShape, 5>>(out, a, b, a_scales,
+                                                         b_scales);
  }
 }

@ -298,16 +263,16 @@ void cutlass_scaled_mm_dq_sm89(torch::Tensor& out, torch::Tensor const& a,
    TORCH_CHECK(b.dtype() == torch::kInt8);

    if (out.dtype() == torch::kBFloat16) {
-      return cutlass_scaled_mm_dq_dispatcher<cutlass_2x_gemm<
-          cutlass::arch::Sm89, enable_sm89_to_sm90, int8_t, cutlass::bfloat16_t,
-          TileShape, WarpShape, InstructionShape, 5>>(out, a, b, a_scales,
-                                                      b_scales);
+      return cutlass_scaled_mm_dq_dispatcher<
+          cutlass_2x_gemm<cutlass::arch::Sm89, int8_t, cutlass::bfloat16_t,
+                          TileShape, WarpShape, InstructionShape, 5>>(
+          out, a, b, a_scales, b_scales);
    } else {
      assert(out.dtype() == torch::kFloat16);
-      return cutlass_scaled_mm_dq_dispatcher<cutlass_2x_gemm<
-          cutlass::arch::Sm89, enable_sm89_to_sm90, int8_t, cutlass::half_t,
-          TileShape, WarpShape, InstructionShape, 5>>(out, a, b, a_scales,
-                                                      b_scales);
+      return cutlass_scaled_mm_dq_dispatcher<
+          cutlass_2x_gemm<cutlass::arch::Sm89, int8_t, cutlass::half_t,
+                          TileShape, WarpShape, InstructionShape, 5>>(
+          out, a, b, a_scales, b_scales);
    }
  } else {
    TORCH_CHECK(a.dtype() == torch::kFloat8_e4m3fn);
@ -315,15 +280,15 @@ void cutlass_scaled_mm_dq_sm89(torch::Tensor& out, torch::Tensor const& a,

    if (out.dtype() == torch::kBFloat16) {
      return cutlass_scaled_mm_dq_dispatcher<cutlass_2x_gemm<
-          cutlass::arch::Sm89, enable_sm89_to_sm90, cutlass::float_e4m3_t,
-          cutlass::bfloat16_t, TileShape, WarpShape, InstructionShape, 5>>(
-          out, a, b, a_scales, b_scales);
+          cutlass::arch::Sm89, cutlass::float_e4m3_t, cutlass::bfloat16_t,
+          TileShape, WarpShape, InstructionShape, 5>>(out, a, b, a_scales,
+                                                      b_scales);
    } else {
      TORCH_CHECK(out.dtype() == torch::kFloat16);
      return cutlass_scaled_mm_dq_dispatcher<cutlass_2x_gemm<
-          cutlass::arch::Sm89, enable_sm89_to_sm90, cutlass::float_e4m3_t,
-          cutlass::half_t, TileShape, WarpShape, InstructionShape, 5>>(
-          out, a, b, a_scales, b_scales);
+          cutlass::arch::Sm89, cutlass::float_e4m3_t, cutlass::half_t,
+          TileShape, WarpShape, InstructionShape, 5>>(out, a, b, a_scales,
+                                                      b_scales);
    }
  }
 }
--- a/csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu
+++ b/csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu
@ -4,7 +4,7 @@

 #if defined CUDA_VERSION && CUDA_VERSION >= 12000

-#include <torch/all.h>
+#include <torch/extension.h>

 #include <ATen/cuda/CUDAContext.h>

@ -51,26 +51,6 @@ using namespace cute;

 namespace {

-uint32_t next_pow_2(uint32_t const num) {
-  if (num <= 1) return num;
-  return 1 << (CHAR_BIT * sizeof(num) - __builtin_clz(num - 1));
-}
-
-// A wrapper for the GEMM kernel that is used to guard against compilation on
-// architectures that will never use the kernel. The purpose of this is to
-// reduce the size of the compiled binary.
-// __CUDA_ARCH__ is not defined in host code, so this lets us smuggle the ifdef
-// into code that will be executed on the device where it is defined.
-template <typename Kernel>
-struct enable_sm90_or_later : Kernel {
-  template <typename... Args>
-  CUTLASS_DEVICE void operator()(Args&&... args) {
-  #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 900
-    Kernel::operator()(std::forward<Args>(args)...);
-  #endif
-  }
-};
-
 template <typename ElementAB_, typename ElementD_, typename TileShape,
          typename ClusterShape, typename KernelSchedule,
          typename EpilogueSchedule>
@ -141,9 +121,9 @@ struct cutlass_3x_gemm {
          KernelSchedule>::CollectiveOp;
  // clang-format on

-  using KernelType = enable_sm90_or_later<cutlass::gemm::kernel::GemmUniversal<
+  using KernelType = cutlass::gemm::kernel::GemmUniversal<
      cute::Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue,
-      cutlass::gemm::PersistentScheduler>>;
+      cutlass::gemm::PersistentScheduler>;

  struct GemmKernel : public KernelType {};
 };
@ -208,89 +188,8 @@ void cutlass_scaled_mm_dq_dispatcher(torch::Tensor& out, torch::Tensor const& a,
  cutlass::Status status = gemm_op.run(args, workspace.get(), stream);
  CUTLASS_CHECK(status);
 }
-
-template <typename InType, typename OutType, int32_t M>
-struct sm90_fp8_config {
-  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
-  using KernelSchedule =
-      cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum;
-  using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
-  using TileShape = Shape<_128, _128, _128>;
-  using ClusterShape = Shape<_2, _1, _1>;
-
-  using Cutlass3xGemm =
-      cutlass_3x_gemm<InType, OutType, TileShape, ClusterShape, KernelSchedule,
-                      EpilogueSchedule>;
-};
-
-template <typename InType, typename OutType>
-struct sm90_fp8_config<InType, OutType, 128> {
-  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
-  using KernelSchedule =
-      cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum;
-  using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
-  using TileShape = Shape<_64, _128, _128>;
-  using ClusterShape = Shape<_2, _1, _1>;
-
-  using Cutlass3xGemm =
-      cutlass_3x_gemm<InType, OutType, TileShape, ClusterShape, KernelSchedule,
-                      EpilogueSchedule>;
-};
-
-template <typename InType, typename OutType>
-struct sm90_fp8_config<InType, OutType, 64> {
-  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
-  using KernelSchedule =
-      cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum;
-  using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
-  using TileShape = Shape<_64, _64, _128>;
-  using ClusterShape = Shape<_1, _8, _1>;
-
-  using Cutlass3xGemm =
-      cutlass_3x_gemm<InType, OutType, TileShape, ClusterShape, KernelSchedule,
-                      EpilogueSchedule>;
-};
-
 }  // namespace

-template <typename InType, typename OutType>
-void cutlass_scaled_mm_dq_sm90_fp8_dispatch(torch::Tensor& out,
-                                            torch::Tensor const& a,
-                                            torch::Tensor const& b,
-                                            torch::Tensor const& a_scales,
-                                            torch::Tensor const& b_scales) {
-  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
-  TORCH_CHECK(a.dtype() == torch::kFloat8_e4m3fn);
-  TORCH_CHECK(b.dtype() == torch::kFloat8_e4m3fn);
-  TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
-  TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
-
-  using Cutlass3xGemmDefault =
-      typename sm90_fp8_config<InType, OutType, 0>::Cutlass3xGemm;
-  using Cutlass3xGemmM64 =
-      typename sm90_fp8_config<InType, OutType, 64>::Cutlass3xGemm;
-  using Cutlass3xGemmM128 =
-      typename sm90_fp8_config<InType, OutType, 128>::Cutlass3xGemm;
-
-  uint32_t const m = a.size(0);
-  uint32_t const mp2 =
-      std::max(static_cast<uint32_t>(64), next_pow_2(m));  // next power of 2
-
-  if (mp2 <= 64) {
-    // m in [1, 64]
-    return cutlass_scaled_mm_dq_dispatcher<Cutlass3xGemmM64>(
-        out, a, b, a_scales, b_scales);
-  } else if (mp2 <= 128) {
-    // m in (64, 128]
-    return cutlass_scaled_mm_dq_dispatcher<Cutlass3xGemmM128>(
-        out, a, b, a_scales, b_scales);
-  } else {
-    // m in (128, inf)
-    return cutlass_scaled_mm_dq_dispatcher<Cutlass3xGemmDefault>(
-        out, a, b, a_scales, b_scales);
-  }
-}
-
 void cutlass_scaled_mm_dq_sm90(torch::Tensor& out, torch::Tensor const& a,
                               torch::Tensor const& b,
                               torch::Tensor const& a_scales,
@ -324,14 +223,24 @@ void cutlass_scaled_mm_dq_sm90(torch::Tensor& out, torch::Tensor const& a,
    TORCH_CHECK(a.dtype() == torch::kFloat8_e4m3fn);
    TORCH_CHECK(b.dtype() == torch::kFloat8_e4m3fn);

+    using TileShape = Shape<_128, _128, _128>;
+    using ClusterShape = Shape<_1, _2, _1>;
+    using KernelSchedule =
+        typename cutlass::gemm::KernelCpAsyncWarpSpecializedCooperative;
+    using EpilogueSchedule =
+        typename cutlass::epilogue::TmaWarpSpecializedCooperative;
+
    if (out.dtype() == torch::kBFloat16) {
-      return cutlass_scaled_mm_dq_sm90_fp8_dispatch<cutlass::float_e4m3_t,
-                                                    cutlass::bfloat16_t>(
+      return cutlass_scaled_mm_dq_dispatcher<
+          cutlass_3x_gemm<cutlass::float_e4m3_t, cutlass::bfloat16_t, TileShape,
+                          ClusterShape, KernelSchedule, EpilogueSchedule>>(
          out, a, b, a_scales, b_scales);
    } else {
      TORCH_CHECK(out.dtype() == torch::kFloat16);
-      return cutlass_scaled_mm_dq_sm90_fp8_dispatch<cutlass::float_e4m3_t,
-                                                    cutlass::half_t>(
+
+      return cutlass_scaled_mm_dq_dispatcher<
+          cutlass_3x_gemm<cutlass::float_e4m3_t, cutlass::half_t, TileShape,
+                          ClusterShape, KernelSchedule, EpilogueSchedule>>(
          out, a, b, a_scales, b_scales);
    }
  }
--- a/csrc/quantization/cutlass_w8a8/scaled_mm_dq_entry.cu
+++ b/csrc/quantization/cutlass_w8a8/scaled_mm_dq_entry.cu
@ -1,7 +1,7 @@
 #include <cudaTypedefs.h>

 #include <c10/cuda/CUDAGuard.h>
-#include <torch/all.h>
+#include <torch/extension.h>

 void cutlass_scaled_mm_dq_sm75(torch::Tensor& c, torch::Tensor const& a,
                               torch::Tensor const& b,
--- a/csrc/quantization/fp8/common.cu
+++ b/csrc/quantization/fp8/common.cu
@ -1,5 +1,5 @@
 #include <ATen/cuda/CUDAContext.h>
-#include <torch/all.h>
+#include <torch/extension.h>
 #include <c10/cuda/CUDAGuard.h>

 #include <cmath>
--- a/csrc/quantization/gptq/q_gemm.cu
+++ b/csrc/quantization/gptq/q_gemm.cu
@ -6,7 +6,7 @@ https://github.com/qwopqwop200/GPTQ-for-LLaMa
 #include <cstdint>
 #include <cstdio>

-#include <torch/all.h>
+#include <torch/extension.h>
 #include <c10/cuda/CUDAGuard.h>
 #include <ATen/cuda/CUDAContext.h>
 #include <cuda_runtime.h>
@ -1823,7 +1823,7 @@ void shuffle_exllama_weight(uint32_t* q_weight, int* q_perm, int height,
 torch::Tensor gptq_gemm(torch::Tensor a, torch::Tensor b_q_weight,
                        torch::Tensor b_gptq_qzeros,
                        torch::Tensor b_gptq_scales, torch::Tensor b_g_idx,
-                        bool use_exllama, int64_t bit) {
+                        bool use_exllama, int bit) {
  const at::cuda::OptionalCUDAGuard device_guard(device_of(a));
  auto options = torch::TensorOptions().dtype(a.dtype()).device(a.device());
  at::Tensor c = torch::empty({a.size(0), b_q_weight.size(1)}, options);
@ -1845,7 +1845,7 @@ torch::Tensor gptq_gemm(torch::Tensor a, torch::Tensor b_q_weight,
  return c;
 }

-void gptq_shuffle(torch::Tensor q_weight, torch::Tensor q_perm, int64_t bit) {
+void gptq_shuffle(torch::Tensor q_weight, torch::Tensor q_perm, int bit) {
  const at::cuda::OptionalCUDAGuard device_guard(device_of(q_weight));
  vllm::gptq::shuffle_exllama_weight(
      (uint32_t*)q_weight.data_ptr(),
--- a/csrc/quantization/gptq_marlin/gptq_marlin.cu
+++ b/csrc/quantization/gptq_marlin/gptq_marlin.cu
@ -1867,4 +1867,4 @@ torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
  return c;
 }

-#endif
+#endif
--- a/csrc/quantization/gptq_marlin/gptq_marlin.cuh
+++ b/csrc/quantization/gptq_marlin/gptq_marlin.cuh
@ -1,6 +1,6 @@
 #pragma once

-#include <torch/all.h>
+#include <torch/extension.h>

 #include <ATen/cuda/CUDAContext.h>
 #include <c10/cuda/CUDAGuard.h>
--- a/csrc/quantization/marlin/dense/marlin_cuda_kernel.cu
+++ b/csrc/quantization/marlin/dense/marlin_cuda_kernel.cu
@ -15,7 +15,7 @@
 * limitations under the License.
 */

-#include <torch/all.h>
+#include <torch/extension.h>

 #include <ATen/cuda/CUDAContext.h>
 #include <c10/cuda/CUDAGuard.h>
--- a/csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu
+++ b/csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu
@ -16,7 +16,7 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
-#include <torch/all.h>
+#include <torch/extension.h>

 #include <ATen/cuda/CUDAContext.h>
 #include <c10/cuda/CUDAGuard.h>
--- a/csrc/quantization/squeezellm/quant_cuda_kernel.cu
+++ b/csrc/quantization/squeezellm/quant_cuda_kernel.cu
@ -1,4 +1,5 @@
 #include <torch/all.h>
+#include <torch/python.h>
 #include <cuda.h>
 #include <cuda_runtime.h>
 #include <cuda_fp16.h>
--- a/csrc/reduction_utils.cuh
+++ b/csrc/reduction_utils.cuh
@ -21,47 +21,29 @@
 #include "cuda_compat.h"

 namespace vllm {
-
-namespace detail {
-
-template <typename T>
-__inline__ __device__ T _max(T a, T b) {
-  return max(a, b);
+template <typename T, int numLanes = WARP_SIZE>
+__inline__ __device__ T warpReduceSum(T val) {
+  static_assert(numLanes > 0 && (numLanes & (numLanes - 1)) == 0,
+                "numLanes is not a positive power of 2!");
+  static_assert(numLanes <= WARP_SIZE);
+#pragma unroll
+  for (int mask = numLanes >> 1; mask > 0; mask >>= 1)
+    val += VLLM_SHFL_XOR_SYNC(val, mask);
+  return val;
 }

-template <typename T>
-__inline__ __device__ T _sum(T a, T b) {
-  return a + b;
-}
-
-}  // namespace detail
-
-template <typename T>
-using ReduceFnType = T (*)(T, T);
-
 // Helper function to return the next largest power of 2
 static constexpr int _nextPow2(unsigned int num) {
  if (num <= 1) return num;
  return 1 << (CHAR_BIT * sizeof(num) - __builtin_clz(num - 1));
 }

-template <typename T, int numLanes = WARP_SIZE>
-__inline__ __device__ T warpReduce(T val, ReduceFnType<T> fn) {
-  static_assert(numLanes > 0 && (numLanes & (numLanes - 1)) == 0,
-                "numLanes is not a positive power of 2!");
-  static_assert(numLanes <= WARP_SIZE);
-#pragma unroll
-  for (int mask = numLanes >> 1; mask > 0; mask >>= 1)
-    val = fn(val, VLLM_SHFL_XOR_SYNC(val, mask));
-
-  return val;
-}
-
+/* Calculate the sum of all elements in a block */
 template <typename T, int maxBlockSize = 1024>
-__inline__ __device__ T blockReduce(T val, ReduceFnType<T> fn) {
+__inline__ __device__ T blockReduceSum(T val) {
  static_assert(maxBlockSize <= 1024);
  if constexpr (maxBlockSize > WARP_SIZE) {
-    val = warpReduce<T>(val, fn);
+    val = warpReduceSum<T>(val);
    // Calculates max number of lanes that need to participate in the last
    // warpReduce
    constexpr int maxActiveLanes = (maxBlockSize + WARP_SIZE - 1) / WARP_SIZE;
@ -74,22 +56,12 @@ __inline__ __device__ T blockReduce(T val, ReduceFnType<T> fn) {

    val = (threadIdx.x < blockDim.x / float(WARP_SIZE)) ? shared[lane]
                                                        : (T)(0.0f);
-    val = warpReduce<T, _nextPow2(maxActiveLanes)>(val, fn);
+    val = warpReduceSum<T, _nextPow2(maxActiveLanes)>(val);
  } else {
    // A single warpReduce is equal to blockReduce
-    val = warpReduce<T, _nextPow2(maxBlockSize)>(val, fn);
+    val = warpReduceSum<T, _nextPow2(maxBlockSize)>(val);
  }
  return val;
 }

-template <typename T, int maxBlockSize = 1024>
-__inline__ __device__ T blockReduceMax(T val) {
-  return blockReduce<T, maxBlockSize>(val, detail::_max<T>);
-}
-
-template <typename T, int maxBlockSize = 1024>
-__inline__ __device__ T blockReduceSum(T val) {
-  return blockReduce<T, maxBlockSize>(val, detail::_sum<T>);
-}
-
 }  // namespace vllm
--- a/csrc/registration.h
+++ b/csrc/registration.h
@ -1,22 +0,0 @@
-#pragma once
-
-#include <Python.h>
-
-#define _CONCAT(A, B) A##B
-#define CONCAT(A, B) _CONCAT(A, B)
-
-#define _STRINGIFY(A) #A
-#define STRINGIFY(A) _STRINGIFY(A)
-
-// A version of the TORCH_LIBRARY macro that expands the NAME, i.e. so NAME
-// could be a macro instead of a literal token.
-#define TORCH_LIBRARY_EXPAND(NAME, MODULE) TORCH_LIBRARY(NAME, MODULE)
-
-// REGISTER_EXTENSION allows the shared library to be loaded and initialized
-// via python's import statement.
-#define REGISTER_EXTENSION(NAME)                                               \
-  PyMODINIT_FUNC CONCAT(PyInit_, NAME)() {                                     \
-    static struct PyModuleDef module = {PyModuleDef_HEAD_INIT,                 \
-                                        STRINGIFY(NAME), nullptr, 0, nullptr}; \
-    return PyModule_Create(&module);                                           \
-  }
--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
@ -1,283 +0,0 @@
-#include "cache.h"
-#include "cuda_utils.h"
-#include "ops.h"
-#include "registration.h"
-
-#include <torch/library.h>
-
-// Note on op signatures:
-// The X_meta signatures are for the meta functions corresponding to op X.
-// They must be kept in sync with the signature for X. Generally, only
-// functions that return Tensors require a meta function.
-//
-// See the following links for detailed docs on op registration and function
-// schemas.
-// https://docs.google.com/document/d/1_W62p8WJOQQUzPsJYa7s701JXt0qf2OfLub2sbkHOaU/edit#heading=h.ptttacy8y1u9
-// https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/README.md#annotations
-
-TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
-  // vLLM custom ops
-
-  // Attention ops
-  // Compute the attention between an input query and the cached
-  // keys/values using PagedAttention.
-  ops.def(
-      "paged_attention_v1("
-      "    Tensor! out, Tensor query, Tensor key_cache,"
-      "    Tensor value_cache, int num_kv_heads, float scale,"
-      "    Tensor block_tables, Tensor seq_lens, int block_size,"
-      "    int max_seq_len, Tensor? alibi_slopes,"
-      "    str kv_cache_dtype, float kv_scale, int tp_rank,"
-      "    int blocksparse_local_blocks,"
-      "    int blocksparse_vert_stride, int blocksparse_block_size,"
-      "    int blocksparse_head_sliding_step) -> ()");
-  ops.impl("paged_attention_v1", torch::kCUDA, &paged_attention_v1);
-
-  // PagedAttention V2.
-  ops.def(
-      "paged_attention_v2("
-      "    Tensor! out, Tensor exp_sums, Tensor max_logits,"
-      "    Tensor tmp_out, Tensor query, Tensor key_cache,"
-      "    Tensor value_cache, int num_kv_heads, float scale,"
-      "    Tensor block_tables, Tensor seq_lens, int block_size,"
-      "    int max_seq_len, Tensor? alibi_slopes,"
-      "    str kv_cache_dtype, float kv_scale, int tp_rank,"
-      "    int blocksparse_local_blocks,"
-      "    int blocksparse_vert_stride, int blocksparse_block_size,"
-      "    int blocksparse_head_sliding_step) -> ()");
-  ops.impl("paged_attention_v2", torch::kCUDA, &paged_attention_v2);
-
-  // Activation ops
-  // Activation function used in SwiGLU.
-  ops.def("silu_and_mul(Tensor! out, Tensor input) -> ()");
-  ops.impl("silu_and_mul", torch::kCUDA, &silu_and_mul);
-
-  // Activation function used in GeGLU with `none` approximation.
-  ops.def("gelu_and_mul(Tensor! out, Tensor input) -> ()");
-  ops.impl("gelu_and_mul", torch::kCUDA, &gelu_and_mul);
-
-  // Activation function used in GeGLU with `tanh` approximation.
-  ops.def("gelu_tanh_and_mul(Tensor! out, Tensor input) -> ()");
-  ops.impl("gelu_tanh_and_mul", torch::kCUDA, &gelu_tanh_and_mul);
-
-  // GELU implementation used in GPT-2.
-  ops.def("gelu_new(Tensor! out, Tensor input) -> ()");
-  ops.impl("gelu_new", torch::kCUDA, &gelu_new);
-
-  // Approximate GELU implementation.
-  ops.def("gelu_fast(Tensor! out, Tensor input) -> ()");
-  ops.impl("gelu_fast", torch::kCUDA, &gelu_fast);
-
-  // Layernorm
-  // Apply Root Mean Square (RMS) Normalization to the input tensor.
-  ops.def(
-      "rms_norm(Tensor! out, Tensor input, Tensor weight, float epsilon) -> "
-      "()");
-  ops.impl("rms_norm", torch::kCUDA, &rms_norm);
-
-  // In-place fused Add and RMS Normalization.
-  ops.def(
-      "fused_add_rms_norm(Tensor! input, Tensor! residual, Tensor weight, "
-      "float epsilon) -> ()");
-  ops.impl("fused_add_rms_norm", torch::kCUDA, &fused_add_rms_norm);
-
-  // Rotary embedding
-  // Apply GPT-NeoX or GPT-J style rotary embedding to query and key.
-  ops.def(
-      "rotary_embedding(Tensor positions, Tensor! query,"
-      "                 Tensor! key, int head_size,"
-      "                 Tensor cos_sin_cache, bool is_neox) -> ()");
-  ops.impl("rotary_embedding", torch::kCUDA, &rotary_embedding);
-
-  // Apply GPT-NeoX or GPT-J style rotary embedding to query and key
-  // (supports multiple loras).
-  ops.def(
-      "batched_rotary_embedding(Tensor positions, Tensor! query,"
-      "                         Tensor! key, int head_size,"
-      "                         Tensor cos_sin_cache, bool is_neox,"
-      "                         int rot_dim,"
-      "                         Tensor cos_sin_cache_offsets) -> ()");
-  ops.impl("batched_rotary_embedding", torch::kCUDA, &batched_rotary_embedding);
-
-  // Quantization ops
-#ifndef USE_ROCM
-  // Quantized GEMM for AQLM.
-  ops.def("aqlm_gemm", &aqlm_gemm);
-  ops.impl("aqlm_gemm", torch::kCUDA, &aqlm_gemm);
-
-  // Decompression method for AQLM.
-  ops.def("aqlm_dequant", &aqlm_dequant);
-  ops.impl("aqlm_dequant", torch::kCUDA, &aqlm_dequant);
-
-  // Quantized GEMM for AWQ.
-  ops.def("awq_gemm", &awq_gemm);
-  ops.impl("awq_gemm", torch::kCUDA, &awq_gemm);
-
-  // Dequantization for AWQ.
-  ops.def("awq_dequantize", &awq_dequantize);
-  ops.impl("awq_dequantize", torch::kCUDA, &awq_dequantize);
-
-  // Marlin (Dense) Optimized Quantized GEMM for GPTQ.
-  ops.def("marlin_gemm", &marlin_gemm);
-  ops.impl("marlin_gemm", torch::kCUDA, &marlin_gemm);
-
-  // Marlin_24 (Sparse) Optimized Quantized GEMM for GPTQ.
-  ops.def("gptq_marlin_24_gemm", &gptq_marlin_24_gemm);
-  ops.impl("gptq_marlin_24_gemm", torch::kCUDA, &gptq_marlin_24_gemm);
-
-  // gptq_marlin Optimized Quantized GEMM for GPTQ.
-  ops.def("gptq_marlin_gemm", &gptq_marlin_gemm);
-  ops.impl("gptq_marlin_gemm", torch::kCUDA, &gptq_marlin_gemm);
-
-  // gptq_marlin repack from GPTQ.
-  ops.def("gptq_marlin_repack", &gptq_marlin_repack);
-  ops.impl("gptq_marlin_repack", torch::kCUDA, &gptq_marlin_repack);
-
-  // CUTLASS w8a8 GEMM, supporting symmetric per-tensor or per-row/column
-  // quantization.
-  ops.def(
-      "cutlass_scaled_mm_dq(Tensor! out, Tensor a,"
-      "                     Tensor b, Tensor a_scales,"
-      "                     Tensor b_scales) -> ()");
-  ops.impl("cutlass_scaled_mm_dq", torch::kCUDA, &cutlass_scaled_mm_dq);
-#endif
-
-  // Quantized GEMM for GPTQ.
-  ops.def("gptq_gemm", &gptq_gemm);
-  ops.impl("gptq_gemm", torch::kCUDA, &gptq_gemm);
-
-  // Post processing for GPTQ.
-  ops.def("gptq_shuffle(Tensor! q_weight, Tensor q_perm, int bit) -> ()");
-  ops.impl("gptq_shuffle", torch::kCUDA, &gptq_shuffle);
-
-  // Quantized GEMM for SqueezeLLM.
-  ops.def(
-      "squeezellm_gemm(Tensor vec, Tensor mat, Tensor! mul, Tensor "
-      "lookup_table) -> ()");
-  ops.impl("squeezellm_gemm", torch::kCUDA, &squeezellm_gemm);
-
-  // Compute FP8 quantized tensor for given scaling factor.
-  ops.def(
-      "static_scaled_fp8_quant(Tensor! out, Tensor input, Tensor scale) -> ()");
-  ops.impl("static_scaled_fp8_quant", torch::kCUDA, &static_scaled_fp8_quant);
-
-  // Compute FP8 quantized tensor and scaling factor.
-  ops.def(
-      "dynamic_scaled_fp8_quant(Tensor! out, Tensor input, Tensor! scale) -> "
-      "()");
-  ops.impl("dynamic_scaled_fp8_quant", torch::kCUDA, &dynamic_scaled_fp8_quant);
-
-  // Aligning the number of tokens to be processed by each expert such
-  // that it is divisible by the block size.
-  ops.def(
-      "moe_align_block_size(Tensor topk_ids, int num_experts,"
-      "                     int block_size, Tensor! sorted_token_ids,"
-      "                     Tensor! experts_ids,"
-      "                     Tensor! num_tokens_post_pad) -> ()");
-  ops.impl("moe_align_block_size", torch::kCUDA, &moe_align_block_size);
-
-  // Compute int8 quantized tensor for given scaling factor.
-  ops.def(
-      "static_scaled_int8_quant(Tensor! out, Tensor input, Tensor scale) -> "
-      "()");
-  ops.impl("static_scaled_int8_quant", torch::kCUDA, &static_scaled_int8_quant);
-
-  // Compute int8 quantized tensor and scaling factor
-  ops.def(
-      "dynamic_scaled_int8_quant(Tensor! out, Tensor input, Tensor! scale) -> "
-      "()");
-  ops.impl("dynamic_scaled_int8_quant", torch::kCUDA,
-           &dynamic_scaled_int8_quant);
-}
-
-TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
-  // Cache ops
-  // Swap in (out) the cache blocks from src to dst.
-  cache_ops.def(
-      "swap_blocks(Tensor src, Tensor! dst, Tensor block_mapping) -> ()");
-  cache_ops.impl("swap_blocks", torch::kCUDA, &swap_blocks);
-
-  // Copy the cache blocks from src to dst.
-  cache_ops.def(
-      "copy_blocks(Tensor[]! key_caches, Tensor[]! value_caches, Tensor "
-      "block_mapping) -> ()");
-  cache_ops.impl("copy_blocks", torch::kCUDA, &copy_blocks);
-
-  // Reshape the key and value tensors and cache them.
-  cache_ops.def(
-      "reshape_and_cache(Tensor key, Tensor value,"
-      "                  Tensor! key_cache, Tensor! value_cache,"
-      "                  Tensor slot_mapping,"
-      "                  str kv_cache_dtype,"
-      "                  float kv_scale) -> ()");
-  cache_ops.impl("reshape_and_cache", torch::kCUDA, &reshape_and_cache);
-
-  // Reshape the key and value tensors and cache them.
-  cache_ops.def(
-      "reshape_and_cache_flash(Tensor key, Tensor value,"
-      "                        Tensor! key_cache,"
-      "                        Tensor! value_cache,"
-      "                        Tensor slot_mapping,"
-      "                        str kv_cache_dtype) -> ()");
-  cache_ops.impl("reshape_and_cache_flash", torch::kCUDA,
-                 &reshape_and_cache_flash);
-
-  // Convert the key and value cache to fp8 data type.
-  cache_ops.def(
-      "convert_fp8(Tensor! dst_cache, Tensor src_cache, float scale, str "
-      "kv_cache_dtype) -> ()");
-  cache_ops.impl("convert_fp8", torch::kCUDA, &convert_fp8);
-}
-
-TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cuda_utils), cuda_utils) {
-  // Cuda utils
-
-  // Gets the specified device attribute.
-  cuda_utils.def("get_device_attribute", &get_device_attribute);
-  cuda_utils.impl("get_device_attribute", torch::kCUDA, &get_device_attribute);
-
-  // Gets the maximum shared memory per block device attribute.
-  cuda_utils.def("get_max_shared_memory_per_block_device_attribute",
-                 &get_max_shared_memory_per_block_device_attribute);
-  cuda_utils.impl("get_max_shared_memory_per_block_device_attribute",
-                  torch::kCUDA,
-                  &get_max_shared_memory_per_block_device_attribute);
-}
-
-#ifndef USE_ROCM
-TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _custom_ar), custom_ar) {
-  // Custom all-reduce kernels
-  custom_ar.def("init_custom_ar", &init_custom_ar);
-  custom_ar.impl("init_custom_ar", torch::kCUDA, &init_custom_ar);
-
-  custom_ar.def("should_custom_ar", &should_custom_ar);
-  custom_ar.impl("should_custom_ar", torch::kCUDA, &should_custom_ar);
-
-  custom_ar.def("all_reduce_reg(int fa, Tensor inp, Tensor! out) -> ()");
-  custom_ar.impl("all_reduce_reg", torch::kCUDA, &all_reduce_reg);
-
-  custom_ar.def(
-      "all_reduce_unreg(int fa, Tensor inp, Tensor reg_buffer, Tensor! out) -> "
-      "()");
-  custom_ar.impl("all_reduce_unreg", torch::kCUDA, &all_reduce_unreg);
-
-  custom_ar.def("dispose", &dispose);
-  custom_ar.impl("dispose", torch::kCPU, &dispose);
-
-  custom_ar.def("meta_size", &meta_size);
-  custom_ar.impl("meta_size", torch::kCPU, &meta_size);
-
-  custom_ar.def("register_buffer", &register_buffer);
-  custom_ar.impl("register_buffer", torch::kCUDA, &register_buffer);
-
-  custom_ar.def("get_graph_buffer_ipc_meta", &get_graph_buffer_ipc_meta);
-  custom_ar.impl("get_graph_buffer_ipc_meta", torch::kCPU,
-                 &get_graph_buffer_ipc_meta);
-
-  custom_ar.def("register_graph_buffers", &register_graph_buffers);
-  custom_ar.impl("register_graph_buffers", torch::kCPU,
-                 &register_graph_buffers);
-}
-#endif
-
-REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
--- a/docs/source/automatic_prefix_caching/apc.rst
+++ b/docs/source/automatic_prefix_caching/apc.rst
@ -1,110 +0,0 @@
-.. _apc:
-
-Introduction
-============
-
-What is Automatic Prefix Caching
--------------------------------
-
-Automatic Prefix Caching (APC in short) caches the KV cache of existing queries, so that a new query can directly reuse the KV cache if it shares the same prefix with one of the existing queries, allowing the new query to skip the computation of the shared part.
-
-
-.. note::
-
-   Technical details on how vLLM implements APC are on the next page.
-
-
-
-Enabling APC in vLLM
--------------------
-
-Set ``enable_prefix_caching=True`` in vLLM engine to enable APC. Here is an example:
-
-.. code-block:: python
-
-    import time
-    from vllm import LLM, SamplingParams
-
-
-    # A prompt containing a large markdown table. The table is randomly generated by GPT-4.
-    LONG_PROMPT = "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as follows.\n# Table\n" + """
-    | ID  | Name          | Age | Occupation    | Country       | Email                  | Phone Number   | Address                       |
-    |-----|---------------|-----|---------------|---------------|------------------------|----------------|------------------------------|
-    | 1   | John Doe      | 29  | Engineer      | USA           | john.doe@example.com   | 555-1234       | 123 Elm St, Springfield, IL  |
-    | 2   | Jane Smith    | 34  | Doctor        | Canada        | jane.smith@example.com | 555-5678       | 456 Oak St, Toronto, ON      |
-    | 3   | Alice Johnson | 27  | Teacher       | UK            | alice.j@example.com    | 555-8765       | 789 Pine St, London, UK      |
-    | 4   | Bob Brown     | 45  | Artist        | Australia     | bob.b@example.com      | 555-4321       | 321 Maple St, Sydney, NSW    |
-    | 5   | Carol White   | 31  | Scientist     | New Zealand   | carol.w@example.com    | 555-6789       | 654 Birch St, Wellington, NZ |
-    | 6   | Dave Green    | 28  | Lawyer        | Ireland       | dave.g@example.com     | 555-3456       | 987 Cedar St, Dublin, IE     |
-    | 7   | Emma Black    | 40  | Musician      | USA           | emma.b@example.com     | 555-1111       | 246 Ash St, New York, NY     |
-    | 8   | Frank Blue    | 37  | Chef          | Canada        | frank.b@example.com    | 555-2222       | 135 Spruce St, Vancouver, BC |
-    | 9   | Grace Yellow  | 50  | Engineer      | UK            | grace.y@example.com    | 555-3333       | 864 Fir St, Manchester, UK   |
-    | 10  | Henry Violet  | 32  | Artist        | Australia     | henry.v@example.com    | 555-4444       | 753 Willow St, Melbourne, VIC|
-    | 11  | Irene Orange  | 26  | Scientist     | New Zealand   | irene.o@example.com    | 555-5555       | 912 Poplar St, Auckland, NZ  |
-    | 12  | Jack Indigo   | 38  | Teacher       | Ireland       | jack.i@example.com     | 555-6666       | 159 Elm St, Cork, IE         |
-    | 13  | Karen Red     | 41  | Lawyer        | USA           | karen.r@example.com    | 555-7777       | 357 Cedar St, Boston, MA     |
-    | 14  | Leo Brown     | 30  | Chef          | Canada        | leo.b@example.com      | 555-8888       | 246 Oak St, Calgary, AB      |
-    | 15  | Mia Green     | 33  | Musician      | UK            | mia.g@example.com      | 555-9999       | 975 Pine St, Edinburgh, UK   |
-    | 16  | Noah Yellow   | 29  | Doctor        | Australia     | noah.y@example.com     | 555-0000       | 864 Birch St, Brisbane, QLD  |
-    | 17  | Olivia Blue   | 35  | Engineer      | New Zealand   | olivia.b@example.com   | 555-1212       | 753 Maple St, Hamilton, NZ   |
-    | 18  | Peter Black   | 42  | Artist        | Ireland       | peter.b@example.com    | 555-3434       | 912 Fir St, Limerick, IE     |
-    | 19  | Quinn White   | 28  | Scientist     | USA           | quinn.w@example.com    | 555-5656       | 159 Willow St, Seattle, WA   |
-    | 20  | Rachel Red    | 31  | Teacher       | Canada        | rachel.r@example.com   | 555-7878       | 357 Poplar St, Ottawa, ON    |
-    | 21  | Steve Green   | 44  | Lawyer        | UK            | steve.g@example.com    | 555-9090       | 753 Elm St, Birmingham, UK   |
-    | 22  | Tina Blue     | 36  | Musician      | Australia     | tina.b@example.com     | 555-1213       | 864 Cedar St, Perth, WA      |
-    | 23  | Umar Black    | 39  | Chef          | New Zealand   | umar.b@example.com     | 555-3435       | 975 Spruce St, Christchurch, NZ|
-    | 24  | Victor Yellow | 43  | Engineer      | Ireland       | victor.y@example.com   | 555-5657       | 246 Willow St, Galway, IE    |
-    | 25  | Wendy Orange  | 27  | Artist        | USA           | wendy.o@example.com    | 555-7879       | 135 Elm St, Denver, CO       |
-    | 26  | Xavier Green  | 34  | Scientist     | Canada        | xavier.g@example.com   | 555-9091       | 357 Oak St, Montreal, QC     |
-    | 27  | Yara Red      | 41  | Teacher       | UK            | yara.r@example.com     | 555-1214       | 975 Pine St, Leeds, UK       |
-    | 28  | Zack Blue     | 30  | Lawyer        | Australia     | zack.b@example.com     | 555-3436       | 135 Birch St, Adelaide, SA   |
-    | 29  | Amy White     | 33  | Musician      | New Zealand   | amy.w@example.com      | 555-5658       | 159 Maple St, Wellington, NZ |
-    | 30  | Ben Black     | 38  | Chef          | Ireland       | ben.b@example.com      | 555-7870       | 246 Fir St, Waterford, IE    |
-    """
-
-
-    def get_generation_time(llm, sampling_params, prompts):
-        # time the generation
-        start_time = time.time()
-        output = llm.generate(prompts, sampling_params=sampling_params)
-        end_time = time.time()
-        # print the output and generation time
-        print(f"Output: {output[0].outputs[0].text}")
-        print(f"Generation time: {end_time - start_time} seconds.")
-
-
-    # set enable_prefix_caching=True to enable APC
-    llm = LLM(
-        model='lmsys/longchat-13b-16k',
-        enable_prefix_caching=True
-    )
-
-    sampling_params = SamplingParams(temperature=0, max_tokens=100)
-
-    # Querying the age of John Doe
-    get_generation_time(
-        llm,
-        sampling_params,
-        LONG_PROMPT + "Question: what is the age of John Doe? Your answer: The age of John Doe is ",
-    )
-
-    # Querying the age of Zack Blue
-    # This query will be faster since vllm avoids computing the KV cache of LONG_PROMPT again.
-    get_generation_time(
-        llm,
-        sampling_params,
-        LONG_PROMPT + "Question: what is the age of Zack Blue? Your answer: The age of Zack Blue is ",
-    )
-
-Example workloads
-----------------
-
-We describe two example workloads where APC can provide a huge performance benefit:
-
-- Long document query, where the user repeatedly queries the same long document (e.g. software manual or annual report) with different queries. In this case, instead of processing the long document again and again, APC allows vLLM to process this long document *only once*, and all future requests can avoid recomputing this long document by reusing its KV cache. This allows vLLM to serve future requests with much higher throughput and much lower latency.
-- Multi-round conversation, where the user may chat with the application multiple times in the same chatting session. In this case, instead of processing the whole chatting history again and again, APC allows vLLM to reuse the processing results of the chat history across all future rounds of conversation, allowing vLLM to serve future requests with much higher throughput and much lower latency.
-
-
-Limits
------
-APC in general does not reduce the performance of vLLM. With that being said, APC only reduces the time of processing the queries (the prefilling phase) and does not reduce the time of generating new tokens (the decoding phase). So APC does not bring a performance gain when vLLM spends most of the time generating answers to the queries (e.g. when the length of the answer is long), or when new queries do not share the same prefix with any of the existing queries (so that the computation cannot be reused).
--- a/docs/source/automatic_prefix_caching/details.md
+++ b/docs/source/automatic_prefix_caching/details.md
@ -1,43 +0,0 @@
-# Implementation
-
-The core idea of PagedAttention is to partition the KV cache of each request into KV Blocks. Each block contains the attention keys and values for a fixed number of tokens. The PagedAttention algorithm allows these blocks to be stored in non-contiguous physical memory so that we can eliminate memory fragmentation by allocating the memory on demand.
-
-To automatically cache the KV cache, we utilize the following key observation: Each KV block can be uniquely identified by the tokens within the block and the tokens in the prefix before the block.
-
-```
-                    Block 1                  Block 2                  Block 3
-         [A gentle breeze stirred] [the leaves as children] [laughed in the distance]
-Block 1: |<--- block tokens ---->|
-Block 2: |<------- prefix ------>| |<--- block tokens --->|
-Block 3: |<------------------ prefix -------------------->| |<--- block tokens ---->|
-```
-
-
-In the example above, the KV cache in the first block can be uniquely identified with the tokens “A gentle breeze stirred”. The third block can be uniquely identified with the tokens in the block “laughed in the distance”, along with the prefix tokens “A gentle breeze stirred the leaves as children”. Therefore, we can build the following one-to-one mapping:
-
-```
-hash(prefix tokens + block tokens) <--> KV Block
-```
-
-With this mapping, we can add another indirection in vLLM’s KV cache management. Previously, each sequence in vLLM maintained a mapping from their logical KV blocks to physical blocks. To achieve automatic caching of KV blocks, we map the logical KV blocks to their hash value and maintain a global hash table of all the physical blocks. In this way, all the KV blocks sharing the same hash value (e.g., shared prefix blocks across two requests) can be mapped to the same physical block and share the memory space.
-
-
-This design achieves automatic prefix caching without the need to maintain a tree structure among the KV blocks. More specifically, all of the blocks are independent of each other and can be allocated and freed individually, which enables us to manage the KV cache like an ordinary cache in an operating system.
-
-
-# Generalized Caching Policy
-
-Keeping all the KV blocks in a hash table enables vLLM to cache KV blocks from earlier requests to save memory and accelerate the computation of future requests. For example, if a new request shares the system prompt with the previous request, the KV cache of the shared prompt can directly be used for the new request without recomputation. However, the total KV cache space is limited and we have to decide which KV blocks to keep or evict when the cache is full.
-
-Managing KV cache with a hash table allows us to implement flexible caching policies. As an example, in current vLLM, we implement the following eviction policy:
-
-* When there are no free blocks left, we will evict a KV block whose reference count (i.e., the number of current requests using the block) equals 0.
-* If there are multiple blocks with a reference count equal to 0, we prioritize evicting the least recently used (LRU) block.
-* If there are multiple blocks whose last access times are the same, we prioritize the eviction of the block that is at the end of the longest prefix (i.e., has the maximum number of blocks before it).
-
-Note that this eviction policy effectively implements the same policy as [RadixAttention](https://lmsys.org/blog/2024-01-17-sglang/) when applied to models with full attention, which prioritizes evicting leaf nodes in the prefix tree that have a reference count of zero and are least recently used.
-
-However, the hash-based KV cache management gives us the flexibility to handle more complicated serving scenarios and implement more complicated eviction policies beyond the policy above:
-
- Multi-LoRA serving. When serving requests for multiple LoRA adapters, we can simply let the hash of each KV block also include the LoRA ID the request is querying for to enable caching for all adapters. In this way, we can jointly manage the KV blocks for different adapters, which simplifies the system implementation and improves the global cache hit rate and efficiency.
- Multi-modal models. When the user input includes more than just discrete tokens, we can use different hashing methods to handle the caching of inputs of different modalities. For example, perceptual hashing for images to cache similar input images.
--- a/docs/source/community/sponsors.md
+++ b/docs/source/community/sponsors.md
@ -18,9 +18,8 @@ vLLM is a community project. Our compute resources for development and testing a
 - Replicate
 - Roblox
 - RunPod
- Sequoia Capital
 - Trainy
 - UC Berkeley
 - UC San Diego

-We also have an official fundraising venue through [OpenCollective](https://opencollective.com/vllm). We plan to use the fund to support the development, maintenance, and adoption of vLLM.
+We also have an official fundraising venue through [OpenCollective](https://opencollective.com/vllm). We plan to use the fund to support the development, maintenance, and adoption of vLLM.
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@ -90,9 +90,7 @@ autodoc_mock_imports = [
    "sentencepiece",
    "vllm.cuda_utils",
    "vllm._C",
-    "PIL",
    "numpy",
-    'triton',
    "tqdm",
    "tensorizer",
 ]
@ -118,13 +116,12 @@ class MockedClassDocumenter(autodoc.ClassDocumenter):
 autodoc.ClassDocumenter = MockedClassDocumenter

 intersphinx_mapping = {
-    "python": ("https://docs.python.org/3", None),
-    "typing_extensions":
-    ("https://typing-extensions.readthedocs.io/en/latest", None),
-    "pillow": ("https://pillow.readthedocs.io/en/stable", None),
-    "numpy": ("https://numpy.org/doc/stable", None),
-    "torch": ("https://pytorch.org/docs/stable", None),
-    "psutil": ("https://psutil.readthedocs.io/en/stable", None),
+    'python': ('https://docs.python.org/3', None),
+    'typing_extensions':
+    ('https://typing-extensions.readthedocs.io/en/latest', None),
+    'numpy': ('https://numpy.org/doc/stable', None),
+    'torch': ('https://pytorch.org/docs/stable', None),
+    'psutil': ('https://psutil.readthedocs.io/en/stable', None),
 }

 autodoc_preserve_defaults = True
--- a/docs/source/dev/multimodal/multimodal_index.rst
+++ b/docs/source/dev/multimodal/multimodal_index.rst
@ -1,51 +0,0 @@
-Multi-Modality
-==============
-
-.. currentmodule:: vllm.multimodal
-    
-vLLM provides experimental support for multi-modal models through the :mod:`vllm.multimodal` package.
-
-:class:`vllm.inputs.PromptStrictInputs` accepts an additional attribute ``multi_modal_data``
-which allows you to pass in multi-modal input alongside text and token prompts.
-
-By default, vLLM models do not support multi-modal inputs. To enable multi-modal support for a model,
-you must decorate the model class with :meth:`MULTIMODAL_REGISTRY.register_dummy_data <MultiModalRegistry.register_dummy_data>`,
-as well as :meth:`MULTIMODAL_REGISTRY.register_input <MultiModalRegistry.register_input>` for each modality type to support.
-
-.. contents::
-   :local:
-   :backlinks: none
-
-Module Contents
-+++++++++++++++
-
-.. automodule:: vllm.multimodal
-
-Registry
--------
-
-.. data:: vllm.multimodal.MULTIMODAL_REGISTRY
-
-    The global :class:`MultiModalRegistry` which is used by model runners.
-
-.. autoclass:: vllm.multimodal.MultiModalRegistry
-    :members:
-    :show-inheritance:
-
-Base Classes
------------
-
-.. autoclass:: vllm.multimodal.MultiModalData
-    :members:
-    :show-inheritance:
-
-.. autoclass:: vllm.multimodal.MultiModalPlugin
-    :members:
-    :show-inheritance:
-
-Image Classes
-------------
-
-.. automodule:: vllm.multimodal.image
-    :members:
-    :show-inheritance:
--- a/docs/source/getting_started/cpu-installation.rst
+++ b/docs/source/getting_started/cpu-installation.rst
@ -54,7 +54,7 @@ Build from source
 .. code-block:: console

    $ pip install --upgrade pip
-    $ pip install wheel packaging ninja "setuptools>=49.4.0" numpy
+    $ pip install wheel packaging ninja "setuptools>=49.4.0" numpy
    $ pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu

 - Finally, build and install vLLM CPU backend: 
--- a/docs/source/getting_started/debugging.rst
+++ b/docs/source/getting_started/debugging.rst
@ -1,42 +0,0 @@
-.. _debugging:
-
-Debugging Tips
-===============
-
-Debugging hang/crash issues
---------------------------
-
-When a vLLM instance hangs or crashes, it is very difficult to debug the issue. But wait a minute, it is also possible that vLLM is doing something that indeed takes a long time:
-
- Downloading a model: do you have the model already downloaded in your disk? If not, vLLM will download the model from the internet, which can take a long time. Be sure to check the internet connection. It would be better to download the model first using `huggingface cli <https://huggingface.co/docs/huggingface_hub/en/guides/cli>`_ and then use the local path to the model. This way, you can isolate the issue.
- Loading the model from disk: if the model is large, it can take a long time to load the model from disk. Please take care of the location you store the model. Some clusters have shared filesystems across nodes, e.g. distributed filesystem or network filesystem, which can be slow. It would be better to store the model in a local disk. In addition, please also watch the CPU memory usage. When the model is too large, it might take much CPU memory, which can slow down the operating system because it needs to frequently swap memory between the disk and the memory.
- Tensor parallel inference: if the model is too large to fit in a single GPU, you might want to use tensor parallelism to split the model across multiple GPUs. In that case, every process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism). You can convert the model checkpoint to a sharded checkpoint using `the provided script <https://docs.vllm.ai/en/latest/getting_started/examples/save_sharded_state.html>`_ . The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism.
-
-If you have already taken care of the above issues, and the vLLM instance still hangs, with CPU and GPU utilization at near zero, it is likely that the vLLM instance is stuck somewhere. Here are some tips to help debug the issue:
-
- Set the environment variable ``export VLLM_LOGGING_LEVEL=DEBUG`` to turn on more logging.
- Set the environment variable ``export CUDA_LAUNCH_BLOCKING=1`` to know exactly which CUDA kernel is causing the trouble.
- Set the environment variable ``export NCCL_DEBUG=TRACE`` to turn on more logging for NCCL.
- Set the environment variable ``export VLLM_TRACE_FUNCTION=1`` . All the function calls in vLLM will be recorded. Inspect these log files, and tell which function crashes or hangs. **Note: it will generate a lot of logs and slow down the system. Only use it for debugging purposes.**
-
-With more logging, hopefully you can find the root cause of the issue.
-
-Here are some common issues that can cause hangs:
-
- The network setup is incorrect. The vLLM instance cannot get the correct IP address. You can find the log such as ``DEBUG 06-10 21:32:17 parallel_state.py:88] world_size=8 rank=0 local_rank=0 distributed_init_method=tcp://xxx.xxx.xxx.xxx:54641 backend=nccl``. The IP address should be the correct one. If not, override the IP address by setting the environment variable ``export VLLM_HOST_IP=your_ip_address``.
- Hardware/driver setup is incorrect. GPU communication cannot be established. You can run a sanity check script below to see if the GPU communication is working correctly.
-
-.. code-block:: python
-
-    # save it as `test.py`, and run it with `NCCL_DEBUG=TRACE torchrun --nproc-per-node=8 test.py`
-    # adjust `--nproc-per-node` to the number of GPUs you want to use.
-    import torch
-    import torch.distributed as dist
-    dist.init_process_group(backend="nccl")
-    data = torch.FloatTensor([1,] * 128).to(f"cuda:{dist.get_rank()}")
-    dist.all_reduce(data, op=dist.ReduceOp.SUM)
-    torch.cuda.synchronize()
-    value = data.mean().item()
-    assert value == dist.get_world_size()
-
-If the problem persists, feel free to open an `issue <https://github.com/vllm-project/vllm/issues/new/choose>`_ on GitHub, with a detailed description of the issue, your environment, and the logs.
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@ -66,7 +66,6 @@ Documentation
   getting_started/neuron-installation
   getting_started/cpu-installation
   getting_started/quickstart
-   getting_started/debugging
   getting_started/examples/examples_index

 .. toctree::
@ -89,8 +88,6 @@ Documentation
   models/adding_model
   models/engine_args
   models/lora
-   models/vlm
-   models/spec_decode
   models/performance

 .. toctree::
@ -98,29 +95,21 @@ Documentation
   :caption: Quantization

   quantization/auto_awq
-   quantization/fp8
   quantization/fp8_e5m2_kvcache
   quantization/fp8_e4m3_kvcache

 .. toctree::
-   :maxdepth: 1
-   :caption: Automatic Prefix Caching
-
-   automatic_prefix_caching/apc
-   automatic_prefix_caching/details
-
-.. toctree::
+   :maxdepth: 2
   :caption: Developer Documentation
-
+   
   dev/sampling_params
   dev/offline_inference/offline_index
   dev/engine/engine_index
   dev/kernel/paged_attention
-   dev/multimodal/multimodal_index
   dev/dockerfile/dockerfile

 .. toctree::
-   :maxdepth: 1
+   :maxdepth: 2
   :caption: Community

   community/meetups
--- a/docs/source/models/spec_decode.rst
+++ b/docs/source/models/spec_decode.rst
@ -1,77 +0,0 @@
-.. _spec_decode:
-
-Speculative decoding in vLLM
-============================
-
-.. warning::
-    Please note that speculative decoding in vLLM is not yet optimized and does
-    not usually yield inter-token latency reductions for all prompt datasets or sampling parameters. The work
-    to optimize it is ongoing and can be followed in `this issue. <https://github.com/vllm-project/vllm/issues/4630>`_
-
-This document shows how to use `Speculative Decoding <https://x.com/karpathy/status/1697318534555336961>`_ with vLLM.
-Speculative decoding is a technique which improves inter-token latency in memory-bound LLM inference.
-
-Speculating with a draft model
------------------------------
-
-The following code configures vLLM to use speculative decoding with a draft model, speculating 5 tokens at a time.
-
-.. code-block:: python
-
-    from vllm import LLM, SamplingParams
-    
-    prompts = [
-        "The future of AI is",
-    ]
-    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
-    
-    llm = LLM(
-        model="facebook/opt-6.7b",
-        tensor_parallel_size=1,
-        speculative_model="facebook/opt-125m",
-        num_speculative_tokens=5,
-        use_v2_block_manager=True,
-    )
-    outputs = llm.generate(prompts, sampling_params)
-    
-    for output in outputs:
-        prompt = output.prompt
-        generated_text = output.outputs[0].text
-        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-
-Speculating by matching n-grams in the prompt
---------------------------------------------
-
-The following code configures vLLM to use speculative decoding where proposals are generated by
-matching n-grams in the prompt. For more information read `this thread. <https://x.com/joao_gante/status/1747322413006643259>`_
-
-.. code-block:: python
-
-    from vllm import LLM, SamplingParams
-    
-    prompts = [
-        "The future of AI is",
-    ]
-    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
-    
-    llm = LLM(
-        model="facebook/opt-6.7b",
-        tensor_parallel_size=1,
-        speculative_model="[ngram]",
-        num_speculative_tokens=5,
-        ngram_prompt_lookup_max=4,
-        use_v2_block_manager=True,
-    )
-    outputs = llm.generate(prompts, sampling_params)
-    
-    for output in outputs:
-        prompt = output.prompt
-        generated_text = output.outputs[0].text
-        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-
-Resources for vLLM contributors
-------------------------------
-* `A Hacker's Guide to Speculative Decoding in vLLM <https://www.youtube.com/watch?v=9wNAgpX6z_4>`_
-* `What is Lookahead Scheduling in vLLM? <https://docs.google.com/document/d/1Z9TvqzzBPnh5WHcRwjvK2UEeFeq5zMZb5mFE8jR0HCs/edit#heading=h.1fjfb0donq5a>`_
-* `Information on batch expansion. <https://docs.google.com/document/d/1T-JaS2T1NRfdP51qzqpyakoCXxSXTtORppiwaj5asxA/edit#heading=h.kk7dq05lc6q8>`_
-* `Dynamic speculative decoding <https://github.com/vllm-project/vllm/issues/4565>`_
--- a/docs/source/models/supported_models.rst
+++ b/docs/source/models/supported_models.rst
@ -87,14 +87,6 @@ Alongside each architecture, we include some popular models that use it.
    - LLaMA, Llama 2, Meta Llama 3, Vicuna, Alpaca, Yi
    - :code:`meta-llama/Meta-Llama-3-8B-Instruct`, :code:`meta-llama/Meta-Llama-3-70B-Instruct`, :code:`meta-llama/Llama-2-13b-hf`, :code:`meta-llama/Llama-2-70b-hf`, :code:`openlm-research/open_llama_13b`, :code:`lmsys/vicuna-13b-v1.3`, :code:`01-ai/Yi-6B`, :code:`01-ai/Yi-34B`, etc.
    - ✅︎
-  * - :code:`LlavaForConditionalGeneration`
-    - LLaVA-1.5
-    - :code:`llava-hf/llava-1.5-7b-hf`, :code:`llava-hf/llava-1.5-13b-hf`, etc.
-    -
-  * - :code:`LlavaNextForConditionalGeneration`
-    - LLaVA-NeXT
-    - :code:`llava-hf/llava-v1.6-mistral-7b-hf`, :code:`llava-hf/llava-v1.6-vicuna-7b-hf`, etc.
-    -
  * - :code:`MiniCPMForCausalLM`
    - MiniCPM
    - :code:`openbmb/MiniCPM-2B-sft-bf16`, :code:`openbmb/MiniCPM-2B-dpo-bf16`, etc.
--- a/docs/source/models/vlm.rst
+++ b/docs/source/models/vlm.rst
@ -1,130 +0,0 @@
-.. _vlm:
-
-Using VLMs
-==========
-
-vLLM provides experimental support for Vision Language Models (VLMs). This document shows you how to run and serve these models using vLLM.
-
-Engine Arguments
----------------
-
-The following :ref:`engine arguments <engine_args>` are specific to VLMs:
-
-.. argparse::
-    :module: vllm.engine.arg_utils
-    :func: _vlm_engine_args_parser
-    :prog: -m vllm.entrypoints.openai.api_server
-    :nodefaultconst:
-
-.. important::
-    Currently, the support for vision language models on vLLM has the following limitations:
-
-    * Only single image input is supported per text prompt.
-    * Dynamic ``image_input_shape`` is not supported: the input image will be resized to the static ``image_input_shape``. This means model output might not exactly match the HuggingFace implementation.
-
-    We are continuously improving user & developer experience for VLMs. Please raise an issue on GitHub if you have any feedback or feature requests.
-
-Offline Batched Inference
-------------------------
-
-To initialize a VLM, the aforementioned arguments must be passed to the ``LLM`` class for instantiating the engine.
-
-.. code-block:: python
-
-    llm = LLM(
-        model="llava-hf/llava-1.5-7b-hf",
-        image_input_type="pixel_values",
-        image_token_id=32000,
-        image_input_shape="1,3,336,336",
-        image_feature_size=576,
-    )
-
-To pass an image to the model, note the following in :class:`vllm.inputs.PromptStrictInputs`:
-
-* ``prompt``: The prompt should have a number of ``<image>`` tokens equal to ``image_feature_size``.
-* ``multi_modal_data``: This should be an instance of :class:`~vllm.multimodal.image.ImagePixelData` or :class:`~vllm.multimodal.image.ImageFeatureData`.
-
-.. code-block:: python
-
-    prompt = "<image>" * 576 + (
-        "\nUSER: What is the content of this image?\nASSISTANT:")
-
-    # Load the image using PIL.Image
-    image = ...
-
-    outputs = llm.generate({
-        "prompt": prompt,
-        "multi_modal_data": ImagePixelData(image),
-    })
-
-    for o in outputs:
-        generated_text = o.outputs[0].text
-        print(generated_text)
-
-A code example can be found in `examples/llava_example.py <https://github.com/vllm-project/vllm/blob/main/examples/llava_example.py>`_.
-
-Online OpenAI Vision API Compatible Inference
----------------------------------------------
-
-You can serve vision language models with vLLM's HTTP server that is compatible with `OpenAI Vision API <https://platform.openai.com/docs/guides/vision>`_.
-
-.. note::
-    Currently, vLLM supports only **single** ``image_url`` input per ``messages``. Support for multi-image inputs will be
-    added in the future.
-
-Below is an example on how to launch the same ``llava-hf/llava-1.5-7b-hf`` with vLLM API server.
-
-.. important::
-    Since OpenAI Vision API is based on `Chat <https://platform.openai.com/docs/api-reference/chat>`_ API, a chat template 
-    is **required** to launch the API server if the model's tokenizer does not come with one. In this example, we use the 
-    HuggingFace Llava chat template that you can find in the example folder `here <https://github.com/vllm-project/vllm/blob/main/examples/template_llava.jinja>`_.
-
-.. code-block:: bash
-
-    python -m vllm.entrypoints.openai.api_server \
-        --model llava-hf/llava-1.5-7b-hf \
-        --image-input-type pixel_values \
-        --image-token-id 32000 \
-        --image-input-shape 1,3,336,336 \
-        --image-feature-size 576 \
-        --chat-template template_llava.jinja
-
-To consume the server, you can use the OpenAI client like in the example below:
-
-.. code-block:: python
-
-    from openai import OpenAI
-    openai_api_key = "EMPTY"
-    openai_api_base = "http://localhost:8000/v1"
-    client = OpenAI(
-        api_key=openai_api_key,
-        base_url=openai_api_base,
-    )
-    chat_response = client.chat.completions.create(
-        model="llava-hf/llava-1.5-7b-hf",
-        messages=[{
-            "role": "user",
-            "content": [
-                {"type": "text", "text": "What's in this image?"},
-                {
-                    "type": "image_url",
-                    "image_url": {
-                        "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
-                    },
-                },
-            ],
-        }],
-    )
-    print("Chat response:", chat_response)
-
-.. note::
-
-    By default, the timeout for fetching images through http url is ``5`` seconds. You can override this by setting the environment variable:
-
-    .. code-block:: shell
-
-        export VLLM_IMAGE_FETCH_TIMEOUT=<timeout>
-
-.. note::
-    The prompt formatting with the image token ``<image>`` is not needed when serving VLMs with the API server since the prompt will be 
-    processed automatically by the server.
--- a/docs/source/quantization/fp8.rst
+++ b/docs/source/quantization/fp8.rst
@ -1,206 +0,0 @@
-.. _fp8:
-
-FP8
-==================
-
-vLLM supports FP8 (8-bit floating point) computation using hardware acceleration on GPUs such as Nvidia H100 and AMD MI300x. Currently, only Hopper and Ada Lovelace GPUs are supported. Quantization of models with FP8 allows for a 2x reduction in model memory requirements and up to a 1.6x improvement in throughput with minimal impact on accuracy.
-
-Please visit the HF collection of `quantized FP8 checkpoints of popular LLMs ready to use with vLLM <https://huggingface.co/collections/neuralmagic/fp8-llms-for-vllm-666742ed2b78b7ac8df13127>`_.
-
-The FP8 types typically supported in hardware have two distinct representations, each useful in different scenarios:
-
- **E4M3**: Consists of 1 sign bit, 4 exponent bits, and 3 bits of mantissa. It can store values up to +/-448 and ``nan``.
- **E5M2**: Consists of 1 sign bit, 5 exponent bits, and 2 bits of mantissa. It can store values up to +/-57344, +/- ``inf``, and ``nan``. The tradeoff for the increased dynamic range is lower precision of the stored values.
-
-Quick Start with Online Dynamic Quantization
--------------------------------------------
-
-Dynamic quantization of an original precision BF16/FP16 model to FP8 can be achieved with vLLM without any calibration data required. You can enable the feature by specifying ``--quantization="fp8"`` in the command line or setting ``quantization="fp8"`` in the LLM constructor.
-
-In this mode, all Linear modules (except for the final ``lm_head``) have their weights quantized down to FP8_E4M3 precision with a per-tensor scale. Activations have their minimum and maximum values calculated during each forward pass to provide a dynamic per-tensor scale for high accuracy. As a result, latency improvements are limited in this mode.
-
-.. code-block:: python
-
-    from vllm import LLM
-    model = LLM("facebook/opt-125m", quantization="fp8")
-    # INFO 06-10 17:55:42 model_runner.py:157] Loading model weights took 0.1550 GB
-    result = model.generate("Hello, my name is")
-
-.. warning::
-
-    Currently, we load the model at original precision before quantizing down to 8-bits, so you need enough memory to load the whole model.
-
-Offline Quantization
--------------------
-
-For offline quantization to FP8, please install the `AutoFP8 library <https://github.com/neuralmagic/autofp8>`_.
-
-.. code-block:: bash
-
-    git clone https://github.com/neuralmagic/AutoFP8.git
-    pip install -e AutoFP8
-
-This package introduces the ``AutoFP8ForCausalLM`` and ``BaseQuantizeConfig`` objects for managing how your model will be compressed.
-
-Offline Quantization with Dynamic Activation Scaling Factors
------------------------------------------------------------
-
-You can use AutoFP8 to produce checkpoints with their weights quantized to FP8 ahead of time and let vLLM handle calculating dynamic scales for the activations at runtime for maximum accuracy. You can enable this with the ``activation_scheme="dynamic"`` argument.
-
-.. warning::
-
-    Please note that although this mode doesn't give you better performance, it reduces memory footprint compared to online quantization.
-
-.. code-block:: python
-
-    from auto_fp8 import AutoFP8ForCausalLM, BaseQuantizeConfig
-
-    pretrained_model_dir = "meta-llama/Meta-Llama-3-8B-Instruct"
-    quantized_model_dir = "Meta-Llama-3-8B-Instruct-FP8-Dynamic"
-
-    # Define quantization config with dynamic activation scales
-    quantize_config = BaseQuantizeConfig(quant_method="fp8", activation_scheme="dynamic")
-    # For dynamic activation scales, there is no need for calibration examples
-    examples = []
-
-    # Load the model, quantize, and save checkpoint
-    model = AutoFP8ForCausalLM.from_pretrained(pretrained_model_dir, quantize_config)
-    model.quantize(examples)
-    model.save_quantized(quantized_model_dir)
-
-In the output of the above script, you should be able to see the quantized Linear modules (FP8DynamicLinear) replaced in the model definition. 
-Note that the ``lm_head`` Linear module at the end is currently skipped by default.
-
-.. code-block:: text
-
-    LlamaForCausalLM(
-      (model): LlamaModel(
-        (embed_tokens): Embedding(128256, 4096)
-        (layers): ModuleList(
-          (0-31): 32 x LlamaDecoderLayer(
-            (self_attn): LlamaSdpaAttention(
-              (q_proj): FP8DynamicLinear()
-              (k_proj): FP8DynamicLinear()
-              (v_proj): FP8DynamicLinear()
-              (o_proj): FP8DynamicLinear()
-              (rotary_emb): LlamaRotaryEmbedding()
-            )
-            (mlp): LlamaMLP(
-              (gate_proj): FP8DynamicLinear()
-              (up_proj): FP8DynamicLinear()
-              (down_proj): FP8DynamicLinear()
-              (act_fn): SiLU()
-            )
-            (input_layernorm): LlamaRMSNorm()
-            (post_attention_layernorm): LlamaRMSNorm()
-          )
-        )
-        (norm): LlamaRMSNorm()
-      )
-      (lm_head): Linear(in_features=4096, out_features=128256, bias=False)
-    )
-    Saving the model to Meta-Llama-3-8B-Instruct-FP8-Dynamic
-
-Your model checkpoint with quantized weights should be available at ``Meta-Llama-3-8B-Instruct-FP8-Dynamic/``.
-We can see that the weights are smaller than the original BF16 precision.
-
-.. code-block:: bash
-
-    ls -lh Meta-Llama-3-8B-Instruct-FP8-Dynamic/
-    total 8.5G
-    -rw-rw-r-- 1 user user  869 Jun  7 14:43 config.json
-    -rw-rw-r-- 1 user user  194 Jun  7 14:43 generation_config.json
-    -rw-rw-r-- 1 user user 4.7G Jun  7 14:43 model-00001-of-00002.safetensors
-    -rw-rw-r-- 1 user user 3.9G Jun  7 14:43 model-00002-of-00002.safetensors
-    -rw-rw-r-- 1 user user  43K Jun  7 14:43 model.safetensors.index.json
-    -rw-rw-r-- 1 user user  296 Jun  7 14:43 special_tokens_map.json
-    -rw-rw-r-- 1 user user  50K Jun  7 14:43 tokenizer_config.json
-    -rw-rw-r-- 1 user user 8.7M Jun  7 14:43 tokenizer.json
-
-Finally, you can load the quantized model checkpoint directly in vLLM.
-
-.. code-block:: python
-
-    from vllm import LLM
-    model = LLM(model="Meta-Llama-3-8B-Instruct-FP8-Dynamic/")
-    # INFO 06-10 21:15:41 model_runner.py:159] Loading model weights took 8.4596 GB
-    result = model.generate("Hello, my name is")
-
-Offline Quantization with Static Activation Scaling Factors
-----------------------------------------------------------
-
-For the best inference performance, you can use AutoFP8 with calibration data to produce per-tensor static scales for both the weights and activations by enabling the ``activation_scheme="static"`` argument.
-
-.. code-block:: python
-
-    from datasets import load_dataset
-    from transformers import AutoTokenizer
-    from auto_fp8 import AutoFP8ForCausalLM, BaseQuantizeConfig
-
-    pretrained_model_dir = "meta-llama/Meta-Llama-3-8B-Instruct"
-    quantized_model_dir = "Meta-Llama-3-8B-Instruct-FP8"
-
-    tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True)
-    tokenizer.pad_token = tokenizer.eos_token
-
-    # Load and tokenize 512 dataset samples for calibration of activation scales
-    ds = load_dataset("mgoin/ultrachat_2k", split="train_sft").select(range(512))
-    examples = [tokenizer.apply_chat_template(batch["messages"], tokenize=False) for batch in ds]
-    examples = tokenizer(examples, padding=True, truncation=True, return_tensors="pt").to("cuda")
-
-    # Define quantization config with static activation scales
-    quantize_config = BaseQuantizeConfig(quant_method="fp8", activation_scheme="static")
-
-    # Load the model, quantize, and save checkpoint
-    model = AutoFP8ForCausalLM.from_pretrained(pretrained_model_dir, quantize_config)
-    model.quantize(examples)
-    model.save_quantized(quantized_model_dir)
-
-Your model checkpoint with quantized weights and activations should be available at ``Meta-Llama-3-8B-Instruct-FP8/``.
-Finally, you can load the quantized model checkpoint directly in vLLM.
-
-.. code-block:: python
-
-    from vllm import LLM
-    model = LLM(model="Meta-Llama-3-8B-Instruct-FP8/")
-    # INFO 06-10 21:15:41 model_runner.py:159] Loading model weights took 8.4596 GB
-    result = model.generate("Hello, my name is")
-
-FP8 checkpoint structure explanation
-----------------------------------------------------------
-
-Here we detail the structure for the FP8 checkpoints.
-
-The following is necessary to be present in the model's ``config.json``:
-
-.. code-block:: text
-
-    "quantization_config": {
-        "quant_method": "fp8",
-        "activation_scheme": "static" or "dynamic"
-    }
-
-
-Each quantized layer in the state_dict will have these tensors:
-
-* If the config has ``"activation_scheme": "static"``:
-
-.. code-block:: text
-
-    model.layers.0.mlp.down_proj.weight              < F8_E4M3
-    model.layers.0.mlp.down_proj.input_scale         < F32
-    model.layers.0.mlp.down_proj.weight_scale        < F32
-
-* If the config has ``"activation_scheme": "dynamic"``:
-
-.. code-block:: text
-
-    model.layers.0.mlp.down_proj.weight              < F8_E4M3
-    model.layers.0.mlp.down_proj.weight_scale        < F32
-
-
-Additionally, there can be `FP8 kv-cache scaling factors <https://github.com/vllm-project/vllm/pull/4893>`_ contained within quantized checkpoints specified through the ``.kv_scale`` parameter present on the Attention Module, such as:
-
-.. code-block:: text
-
-    model.layers.0.self_attn.kv_scale	             < F32
--- a/docs/source/serving/distributed_serving.rst
+++ b/docs/source/serving/distributed_serving.rst
@ -3,9 +3,11 @@
 Distributed Inference and Serving
 =================================

-vLLM supports distributed tensor-parallel inference and serving. Currently, we support `Megatron-LM's tensor parallel algorithm <https://arxiv.org/pdf/1909.08053.pdf>`_. We manage the distributed runtime with either `Ray <https://github.com/ray-project/ray>`_ or python native multiprocessing. Multiprocessing can be used when deploying on a single node, multi-node inferencing currently requires Ray.
+vLLM supports distributed tensor-parallel inference and serving. Currently, we support `Megatron-LM's tensor parallel algorithm <https://arxiv.org/pdf/1909.08053.pdf>`_. We manage the distributed runtime with `Ray <https://github.com/ray-project/ray>`_. To run distributed inference, install Ray with:

-Multiprocessing will be used by default when not running in a Ray placement group and if there are sufficient GPUs available on the same node for the configured :code:`tensor_parallel_size`, otherwise Ray will be used. This default can be overridden via the :code:`LLM` class :code:`distributed-executor-backend` argument or :code:`--distributed-executor-backend` API server argument. Set it to :code:`mp` for multiprocessing or :code:`ray` for Ray. It's not required for Ray to be installed for the multiprocessing case.
+.. code-block:: console
+
+    $ pip install ray

 To run multi-GPU inference with the :code:`LLM` class, set the :code:`tensor_parallel_size` argument to the number of GPUs you want to use. For example, to run inference on 4 GPUs:

@ -23,12 +25,10 @@ To run multi-GPU serving, pass in the :code:`--tensor-parallel-size` argument wh
    $     --model facebook/opt-13b \
    $     --tensor-parallel-size 4

-To scale vLLM beyond a single machine, install and start a `Ray runtime <https://docs.ray.io/en/latest/ray-core/starting-ray.html>`_ via CLI before running vLLM:
+To scale vLLM beyond a single machine, start a `Ray runtime <https://docs.ray.io/en/latest/ray-core/starting-ray.html>`_ via CLI before running vLLM:

 .. code-block:: console

-    $ pip install ray
-
    $ # On head node
    $ ray start --head

--- a/docs/source/serving/openai_compatible_server.md
+++ b/docs/source/serving/openai_compatible_server.md
@ -30,8 +30,6 @@ Please see the [OpenAI API Reference](https://platform.openai.com/docs/api-refer
 - Chat: `tools`, and `tool_choice`.
 - Completions: `suffix`.

-vLLM also provides experimental support for OpenAI Vision API compatible inference. See more details in [Using VLMs](../models/vlm.rst).
-
 ## Extra Parameters
 vLLM supports a set of parameters that are not part of the OpenAI API.
 In order to use them, you can pass them as extra parameters in the OpenAI client.
@ -111,15 +109,4 @@ directory [here](https://github.com/vllm-project/vllm/tree/main/examples/)
 :module: vllm.entrypoints.openai.cli_args
 :func: make_arg_parser
 :prog: -m vllm.entrypoints.openai.api_server
-```
-
-## Tool calling in the chat completion API
-vLLM supports only named function calling in the chat completion API. The `tool_choice` options `auto` and `required` are **not yet supported** but on the roadmap.
-
-To use a named function you need to define the function in the `tools` parameter and call it in the `tool_choice` parameter. 
-
-It is the callers responsibility to prompt the model with the tool information, vLLM will not automatically manipulate the prompt. **This may change in the future.**
-
-vLLM will use guided decoding to ensure the response matches the tool parameter object defined by the JSON schema in the `tools` parameter.
-
-Please refer to the OpenAI API reference documentation for more information.
+```
--- a/examples/llava_example.py
+++ b/examples/llava_example.py
@ -3,36 +3,33 @@ import os
 import subprocess

 import torch
-from PIL import Image

 from vllm import LLM
-from vllm.multimodal.image import ImageFeatureData, ImagePixelData
+from vllm.sequence import MultiModalData

 # The assets are located at `s3://air-example-data-2/vllm_opensource_llava/`.
-# You can use `.buildkite/download-images.sh` to download them


-def run_llava_pixel_values(*, disable_image_processor: bool = False):
+def run_llava_pixel_values():
    llm = LLM(
        model="llava-hf/llava-1.5-7b-hf",
        image_input_type="pixel_values",
        image_token_id=32000,
        image_input_shape="1,3,336,336",
        image_feature_size=576,
-        disable_image_processor=disable_image_processor,
    )

    prompt = "<image>" * 576 + (
        "\nUSER: What is the content of this image?\nASSISTANT:")

-    if disable_image_processor:
-        image = torch.load("images/stop_sign_pixel_values.pt")
-    else:
-        image = Image.open("images/stop_sign.jpg")
+    # This should be provided by another online or offline component.
+    image = torch.load("images/stop_sign_pixel_values.pt")

    outputs = llm.generate({
-        "prompt": prompt,
-        "multi_modal_data": ImagePixelData(image),
+        "prompt":
+        prompt,
+        "multi_modal_data":
+        MultiModalData(type=MultiModalData.Type.IMAGE, data=image),
    })

    for o in outputs:
@ -52,13 +49,15 @@ def run_llava_image_features():
    prompt = "<image>" * 576 + (
        "\nUSER: What is the content of this image?\nASSISTANT:")

-    image: torch.Tensor = torch.load("images/stop_sign_image_features.pt")
+    # This should be provided by another online or offline component.
+    image = torch.load("images/stop_sign_image_features.pt")

    outputs = llm.generate({
-        "prompt": prompt,
-        "multi_modal_data": ImageFeatureData(image),
+        "prompt":
+        prompt,
+        "multi_modal_data":
+        MultiModalData(type=MultiModalData.Type.IMAGE, data=image),
    })
-
    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)
--- a/examples/lora_with_quantization_inference.py
+++ b/examples/lora_with_quantization_inference.py
@ -1,140 +0,0 @@
-"""
-This example shows how to use LoRA with different quantization techniques
-for offline inference.
-
-Requires HuggingFace credentials for access.
-"""
-
-import gc
-from typing import List, Optional, Tuple
-
-import torch
-from huggingface_hub import snapshot_download
-
-from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams
-from vllm.lora.request import LoRARequest
-
-
-def create_test_prompts(
-        lora_path: str
-) -> List[Tuple[str, SamplingParams, Optional[LoRARequest]]]:
-    return [
-        # this is an example of using quantization without LoRA
-        ("My name is",
-         SamplingParams(temperature=0.0,
-                        logprobs=1,
-                        prompt_logprobs=1,
-                        max_tokens=128), None),
-        # the next three examples use quantization with LoRA
-        ("my name is",
-         SamplingParams(temperature=0.0,
-                        logprobs=1,
-                        prompt_logprobs=1,
-                        max_tokens=128),
-         LoRARequest("lora-test-1", 1, lora_path)),
-        ("The capital of USA is",
-         SamplingParams(temperature=0.0,
-                        logprobs=1,
-                        prompt_logprobs=1,
-                        max_tokens=128),
-         LoRARequest("lora-test-2", 1, lora_path)),
-        ("The capital of France is",
-         SamplingParams(temperature=0.0,
-                        logprobs=1,
-                        prompt_logprobs=1,
-                        max_tokens=128),
-         LoRARequest("lora-test-3", 1, lora_path)),
-    ]
-
-
-def process_requests(engine: LLMEngine,
-                     test_prompts: List[Tuple[str, SamplingParams,
-                                              Optional[LoRARequest]]]):
-    """Continuously process a list of prompts and handle the outputs."""
-    request_id = 0
-
-    while test_prompts or engine.has_unfinished_requests():
-        if test_prompts:
-            prompt, sampling_params, lora_request = test_prompts.pop(0)
-            engine.add_request(str(request_id),
-                               prompt,
-                               sampling_params,
-                               lora_request=lora_request)
-            request_id += 1
-
-        request_outputs: List[RequestOutput] = engine.step()
-        for request_output in request_outputs:
-            if request_output.finished:
-                print("----------------------------------------------------")
-                print(f"Prompt: {request_output.prompt}")
-                print(f"Output: {request_output.outputs[0].text}")
-
-
-def initialize_engine(model: str, quantization: str,
-                      lora_repo: Optional[str]) -> LLMEngine:
-    """Initialize the LLMEngine."""
-
-    if quantization == "bitsandbytes":
-        # QLoRA (https://arxiv.org/abs/2305.14314) is a quantization technique.
-        # It quantizes the model when loading, with some config info from the
-        # LoRA adapter repo. So need to set the parameter of load_format and
-        # qlora_adapter_name_or_path as below.
-        engine_args = EngineArgs(
-            model=model,
-            quantization=quantization,
-            qlora_adapter_name_or_path=lora_repo,
-            load_format="bitsandbytes",
-            enable_lora=True,
-            max_lora_rank=64,
-            # set it only in GPUs of limited memory
-            enforce_eager=True)
-    else:
-        engine_args = EngineArgs(
-            model=model,
-            quantization=quantization,
-            enable_lora=True,
-            max_loras=4,
-            # set it only in GPUs of limited memory
-            enforce_eager=True)
-    return LLMEngine.from_engine_args(engine_args)
-
-
-def main():
-    """Main function that sets up and runs the prompt processing."""
-
-    test_configs = [{
-        "name": "qlora_inference_example",
-        'model': "huggyllama/llama-7b",
-        'quantization': "bitsandbytes",
-        'lora_repo': 'timdettmers/qlora-flan-7b'
-    }, {
-        "name": "AWQ_inference_with_lora_example",
-        'model': 'TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ',
-        'quantization': "awq",
-        'lora_repo': 'jashing/tinyllama-colorist-lora'
-    }, {
-        "name": "GPTQ_inference_with_lora_example",
-        'model': 'TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ',
-        'quantization': "gptq",
-        'lora_repo': 'jashing/tinyllama-colorist-lora'
-    }]
-
-    for test_config in test_configs:
-        print(
-            f"~~~~~~~~~~~~~~~~ Running: {test_config['name']} ~~~~~~~~~~~~~~~~"
-        )
-        engine = initialize_engine(test_config['model'],
-                                   test_config['quantization'],
-                                   test_config['lora_repo'])
-        lora_path = snapshot_download(repo_id=test_config['lora_repo'])
-        test_prompts = create_test_prompts(lora_path)
-        process_requests(engine, test_prompts)
-
-        # Clean up the GPU memory for the next test
-        del engine
-        gc.collect()
-        torch.cuda.empty_cache()
-
-
-if __name__ == '__main__':
-    main()
--- a/examples/offline_inference_with_prefix.py
+++ b/examples/offline_inference_with_prefix.py
@ -1,8 +1,5 @@
-from time import time
-
 from vllm import LLM, SamplingParams

-# Common prefix.
 prefix = (
    "You are an expert school principal, skilled in effectively managing "
    "faculty and staff. Draft 10-15 questions for a potential first grade "
@ -21,62 +18,36 @@ prompts = [
    "The capital of France is",
    "The future of AI is",
 ]
-
-generating_prompts = [prefix + prompt for prompt in prompts]
-
 # Create a sampling params object.
 sampling_params = SamplingParams(temperature=0.0)

 # Create an LLM.
-regular_llm = LLM(model="facebook/opt-125m", gpu_memory_utilization=0.4)
+llm = LLM(model="facebook/opt-125m", enable_prefix_caching=True)

-prefix_cached_llm = LLM(model="facebook/opt-125m",
-                        enable_prefix_caching=True,
-                        gpu_memory_utilization=0.4)
-print("Results without `enable_prefix_caching`")
+generating_prompts = [prefix + prompt for prompt in prompts]

 # Generate texts from the prompts. The output is a list of RequestOutput objects
 # that contain the prompt, generated text, and other information.
-start_time_regular = time()
-outputs = regular_llm.generate(generating_prompts, sampling_params)
-duration_regular = time() - start_time_regular
-
-regular_generated_texts = []
+outputs = llm.generate(generating_prompts, sampling_params)
 # Print the outputs.
 for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
-    regular_generated_texts.append(generated_text)
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

 print("-" * 80)

-# Warmup so that the shared prompt's KV cache is computed.
-prefix_cached_llm.generate(generating_prompts[0], sampling_params)
+# The llm.generate call will batch all prompts and send the batch at once
+# if resources allow. The prefix will only be cached after the first batch
+# is processed, so we need to call generate once to calculate the prefix
+# and cache it.
+outputs = llm.generate(generating_prompts[0], sampling_params)

-# Generate with prefix caching.
-start_time_cached = time()
-outputs = prefix_cached_llm.generate(generating_prompts, sampling_params)
-duration_cached = time() - start_time_cached
+# Subsequent batches can leverage the cached prefix
+outputs = llm.generate(generating_prompts, sampling_params)

-print("Results with `enable_prefix_caching`")
-
-cached_generated_texts = []
-# Print the outputs. You should see the same outputs as before.
+# Print the outputs. You should see the same outputs as before
 for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
-    cached_generated_texts.append(generated_text)
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-
-print("-" * 80)
-
-# Compare the results and display the speedup
-generated_same = all([
-    regular_generated_texts[i] == cached_generated_texts[i]
-    for i in range(len(prompts))
-])
-print(f"Generated answers are the same: {generated_same}")
-
-speedup = round(duration_regular / duration_cached, 2)
-print(f"Speed up of cached generation compared to the regular is: {speedup}")
--- a/examples/template_llava.jinja
+++ b/examples/template_llava.jinja
@ -1,23 +0,0 @@
-{%- if messages[0]['role'] == 'system' -%}
-    {%- set system_message = messages[0]['content'] -%}
-    {%- set messages = messages[1:] -%}
-{%- else -%}
-    {% set system_message = '' -%}
-{%- endif -%}
-
-{{ bos_token + system_message }}
-{%- for message in messages -%}
-    {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}
-        {{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}
-    {%- endif -%}
-
-    {%- if message['role'] == 'user' -%}
-        {{ 'USER: ' + message['content'] + '\n' }}
-    {%- elif message['role'] == 'assistant' -%}
-        {{ 'ASSISTANT: ' + message['content'] + eos_token + '\n' }}
-    {%- endif -%}
-{%- endfor -%}
-
-{%- if add_generation_prompt -%}
-    {{ 'ASSISTANT:' }}
-{% endif %}
--- a/format.sh
+++ b/format.sh
@ -101,7 +101,6 @@ mypy vllm/core --config-file pyproject.toml
 mypy vllm/distributed --config-file pyproject.toml
 mypy vllm/entrypoints --config-file pyproject.toml
 mypy vllm/executor --config-file pyproject.toml
-mypy vllm/multimodal --config-file pyproject.toml
 mypy vllm/usage --config-file pyproject.toml
 mypy vllm/*.py --config-file pyproject.toml
 mypy vllm/transformers_utils --config-file pyproject.toml
@ -118,7 +117,7 @@ mypy vllm/model_executor --config-file pyproject.toml
 # https://github.com/codespell-project/codespell/issues/1915
 # Avoiding the "./" prefix and using "/**" globs for directories appears to solve the problem
 CODESPELL_EXCLUDES=(
-    '--skip' 'tests/prompts/**,./benchmarks/sonnet.txt,*tests/lora/data/**,build/**'
+    '--skip' 'tests/prompts/**,./benchmarks/sonnet.txt,tests/lora/data/**,build/**'
 )

 # check spelling of specified files
--- a/pyproject.toml
+++ b/pyproject.toml
@ -71,5 +71,4 @@ markers = [
    "skip_global_cleanup",
    "llm: run tests for vLLM API only",
    "openai: run tests for OpenAI API only",
-    "llava: run tests for LLaVA models only",
 ]
--- a/requirements-common.txt
+++ b/requirements-common.txt
@ -12,11 +12,10 @@ aiohttp
 openai
 uvicorn[standard]
 pydantic >= 2.0  # Required for OpenAI server.
-pillow  # Required for image processing
 prometheus_client >= 0.18.0
 prometheus-fastapi-instrumentator >= 7.0.0
 tiktoken >= 0.6.0  # Required for DBRX tokenizer
 lm-format-enforcer == 0.10.1
-outlines >= 0.0.43 # Requires torch >= 2.1.0
+outlines == 0.0.34 # Requires torch >= 2.1.0
 typing_extensions
 filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4
--- a/requirements-cuda.txt
+++ b/requirements-cuda.txt
@ -6,4 +6,4 @@ ray >= 2.9
 nvidia-ml-py # for pynvml package
 torch == 2.3.0
 xformers == 0.0.26.post1  # Requires PyTorch 2.3.0
-vllm-flash-attn == 2.5.9  # Requires PyTorch 2.3.0
+vllm-flash-attn == 2.5.8.post2  # Requires PyTorch 2.3.0
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@ -33,5 +33,5 @@ sentence-transformers # required for embedding
 # Benchmarking
 aiohttp

-# quantization
-bitsandbytes==0.42.0
+# Multimodal
+pillow
--- a/setup.py
+++ b/setup.py
@ -60,7 +60,7 @@ def remove_prefix(text, prefix):
 class CMakeExtension(Extension):

    def __init__(self, name: str, cmake_lists_dir: str = '.', **kwa) -> None:
-        super().__init__(name, sources=[], py_limited_api=True, **kwa)
+        super().__init__(name, sources=[], **kwa)
        self.cmake_lists_dir = os.path.abspath(cmake_lists_dir)


@ -222,7 +222,7 @@ def _is_neuron() -> bool:
        subprocess.run(["neuron-ls"], capture_output=True, check=True)
    except (FileNotFoundError, PermissionError, subprocess.CalledProcessError):
        torch_neuronx_installed = False
-    return torch_neuronx_installed or VLLM_TARGET_DEVICE == "neuron"
+    return torch_neuronx_installed or envs.VLLM_BUILD_WITH_NEURON


 def _is_cpu() -> bool:
@ -382,7 +382,7 @@ def get_requirements() -> List[str]:

 ext_modules = []

-if _is_cuda() or _is_hip():
+if _is_cuda():
    ext_modules.append(CMakeExtension(name="vllm._moe_C"))

 if not _is_neuron():
--- a/tests/async_engine/test_openapi_server_ray.py
+++ b/tests/async_engine/test_openapi_server_ray.py
@ -55,8 +55,9 @@ async def test_single_completion(server, client: openai.AsyncOpenAI):
                                                 temperature=0.0)

    assert completion.id is not None
-    assert len(completion.choices) == 1
-    assert len(completion.choices[0].text) >= 5
+    assert completion.choices is not None and len(completion.choices) == 1
+    assert completion.choices[0].text is not None and len(
+        completion.choices[0].text) >= 5
    assert completion.choices[0].finish_reason == "length"
    assert completion.usage == openai.types.CompletionUsage(
        completion_tokens=5, prompt_tokens=6, total_tokens=11)
@ -68,7 +69,8 @@ async def test_single_completion(server, client: openai.AsyncOpenAI):
        max_tokens=5,
        temperature=0.0,
    )
-    assert len(completion.choices[0].text) >= 5
+    assert completion.choices[0].text is not None and len(
+        completion.choices[0].text) >= 5


@pytest.mark.asyncio
@ -88,14 +90,15 @@ async def test_single_chat_session(server, client: openai.AsyncOpenAI):
                                                           logprobs=True,
                                                           top_logprobs=5)
    assert chat_completion.id is not None
-    assert len(chat_completion.choices) == 1
-
-    choice = chat_completion.choices[0]
-    assert choice.finish_reason == "length"
-    assert chat_completion.usage == openai.types.CompletionUsage(
-        completion_tokens=10, prompt_tokens=13, total_tokens=23)
-
-    message = choice.message
+    assert chat_completion.choices is not None and len(
+        chat_completion.choices) == 1
+    assert chat_completion.choices[0].message is not None
+    assert chat_completion.choices[0].logprobs is not None
+    assert chat_completion.choices[0].logprobs.content[
+        0].top_logprobs is not None
+    assert len(
+        chat_completion.choices[0].logprobs.content[0].top_logprobs) == 5
+    message = chat_completion.choices[0].message
    assert message.content is not None and len(message.content) >= 10
    assert message.role == "assistant"
    messages.append({"role": "assistant", "content": message.content})
--- a/tests/basic_correctness/test_basic_correctness.py
+++ b/tests/basic_correctness/test_basic_correctness.py
@ -43,14 +43,16 @@ def test_models(
    if backend_by_env_var == "FLASHINFER" and enforce_eager is False:
        pytest.skip("Skipping non-eager test for FlashInferBackend.")

-    with hf_runner(model, dtype=dtype) as hf_model:
-        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+    hf_model = hf_runner(model, dtype=dtype)
+    hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+    del hf_model

-    with vllm_runner(model,
-                     dtype=dtype,
-                     enforce_eager=enforce_eager,
-                     gpu_memory_utilization=0.7) as vllm_model:
-        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+    vllm_model = vllm_runner(model,
+                             dtype=dtype,
+                             enforce_eager=enforce_eager,
+                             gpu_memory_utilization=0.7)
+    vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+    del vllm_model

    for i in range(len(example_prompts)):
        hf_output_ids, hf_output_str = hf_outputs[i]
--- a/tests/basic_correctness/test_chunked_prefill.py
+++ b/tests/basic_correctness/test_chunked_prefill.py
@ -40,19 +40,21 @@ def test_models(
        enable_chunked_prefill = True
        max_num_batched_tokens = chunked_prefill_token_size

-    with hf_runner(model, dtype=dtype) as hf_model:
-        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+    hf_model = hf_runner(model, dtype=dtype)
+    hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+    del hf_model

-    with vllm_runner(
-            model,
-            dtype=dtype,
-            max_num_batched_tokens=max_num_batched_tokens,
-            enable_chunked_prefill=enable_chunked_prefill,
-            tensor_parallel_size=tensor_parallel_size,
-            enforce_eager=enforce_eager,
-            max_num_seqs=max_num_seqs,
-    ) as vllm_model:
-        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+    vllm_model = vllm_runner(
+        model,
+        dtype=dtype,
+        max_num_batched_tokens=max_num_batched_tokens,
+        enable_chunked_prefill=enable_chunked_prefill,
+        tensor_parallel_size=tensor_parallel_size,
+        enforce_eager=enforce_eager,
+        max_num_seqs=max_num_seqs,
+    )
+    vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+    del vllm_model

    for i in range(len(example_prompts)):
        hf_output_ids, hf_output_str = hf_outputs[i]
--- a/tests/basic_correctness/test_preemption.py
+++ b/tests/basic_correctness/test_preemption.py
@ -43,19 +43,21 @@ def test_chunked_prefill_recompute(
        enable_chunked_prefill = True
        max_num_batched_tokens = chunked_prefill_token_size

-    with hf_runner(model, dtype=dtype) as hf_model:
-        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+    hf_model = hf_runner(model, dtype=dtype)
+    hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+    del hf_model

-    with vllm_runner(
-            model,
-            dtype=dtype,
-            max_num_batched_tokens=max_num_batched_tokens,
-            enable_chunked_prefill=enable_chunked_prefill,
-            max_num_seqs=max_num_seqs,
-    ) as vllm_model:
-        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-        assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
-                ARTIFICIAL_PREEMPTION_MAX_CNT)
+    vllm_model = vllm_runner(
+        model,
+        dtype=dtype,
+        max_num_batched_tokens=max_num_batched_tokens,
+        enable_chunked_prefill=enable_chunked_prefill,
+        max_num_seqs=max_num_seqs,
+    )
+    vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+    assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
+            ARTIFICIAL_PREEMPTION_MAX_CNT)
+    del vllm_model

    for i in range(len(example_prompts)):
        hf_output_ids, hf_output_str = hf_outputs[i]
@ -80,19 +82,21 @@ def test_preemption(
 ) -> None:
    """By default, recompute preemption is enabled"""

-    with hf_runner(model, dtype=dtype) as hf_model:
-        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+    hf_model = hf_runner(model, dtype=dtype)
+    hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+    del hf_model

-    with vllm_runner(
-            model,
-            dtype=dtype,
-            disable_log_stats=False,
-    ) as vllm_model:
-        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-        assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
-                ARTIFICIAL_PREEMPTION_MAX_CNT)
-        total_preemption = (
-            vllm_model.model.llm_engine.scheduler.num_cumulative_preemption)
+    vllm_model = vllm_runner(
+        model,
+        dtype=dtype,
+        disable_log_stats=False,
+    )
+    vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+    assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
+            ARTIFICIAL_PREEMPTION_MAX_CNT)
+    total_preemption = (
+        vllm_model.model.llm_engine.scheduler.num_cumulative_preemption)
+    del vllm_model

    for i in range(len(example_prompts)):
        hf_output_ids, hf_output_str = hf_outputs[i]
@ -133,22 +137,24 @@ def test_swap(
 ) -> None:
    """Use beam search enables swapping."""
    example_prompts = example_prompts[:1]
-    with hf_runner(model, dtype=dtype) as hf_model:
-        hf_outputs = hf_model.generate_beam_search(example_prompts, beam_width,
-                                                   max_tokens)
+    hf_model = hf_runner(model, dtype=dtype)
+    hf_outputs = hf_model.generate_beam_search(example_prompts, beam_width,
+                                               max_tokens)
+    del hf_model

-    with vllm_runner(
-            model,
-            dtype=dtype,
-            swap_space=10,
-            disable_log_stats=False,
-    ) as vllm_model:
-        vllm_outputs = vllm_model.generate_beam_search(example_prompts,
-                                                       beam_width, max_tokens)
-        assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
-                ARTIFICIAL_PREEMPTION_MAX_CNT)
-        total_preemption = (
-            vllm_model.model.llm_engine.scheduler.num_cumulative_preemption)
+    vllm_model = vllm_runner(
+        model,
+        dtype=dtype,
+        swap_space=10,
+        disable_log_stats=False,
+    )
+    vllm_outputs = vllm_model.generate_beam_search(example_prompts, beam_width,
+                                                   max_tokens)
+    assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
+            ARTIFICIAL_PREEMPTION_MAX_CNT)
+    total_preemption = (
+        vllm_model.model.llm_engine.scheduler.num_cumulative_preemption)
+    del vllm_model

    for i in range(len(example_prompts)):
        hf_output_ids, _ = hf_outputs[i]
@ -193,28 +199,28 @@ def test_swap_infeasible(
    decode_blocks = max_tokens // BLOCK_SIZE
    example_prompts = example_prompts[:1]

-    with vllm_runner(
-            model,
-            dtype=dtype,
-            swap_space=10,
-            block_size=BLOCK_SIZE,
-            # Since beam search have more than 1 sequence, prefill +
-            # decode blocks are not enough to finish.
-            num_gpu_blocks_override=prefill_blocks + decode_blocks,
-            max_model_len=(prefill_blocks + decode_blocks) * BLOCK_SIZE,
-    ) as vllm_model:
-        sampling_params = SamplingParams(n=beam_width,
-                                         use_beam_search=True,
-                                         temperature=0.0,
-                                         max_tokens=max_tokens,
-                                         ignore_eos=True)
-        req_outputs = vllm_model.model.generate(
-            example_prompts,
-            sampling_params=sampling_params,
-        )
-        assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
-                ARTIFICIAL_PREEMPTION_MAX_CNT)
-
+    vllm_model = vllm_runner(
+        model,
+        dtype=dtype,
+        swap_space=10,
+        block_size=BLOCK_SIZE,
+        # Since beam search have more than 1 sequence, prefill + decode blocks
+        # are not enough to finish.
+        num_gpu_blocks_override=prefill_blocks + decode_blocks,
+        max_model_len=(prefill_blocks + decode_blocks) * BLOCK_SIZE,
+    )
+    sampling_params = SamplingParams(n=beam_width,
+                                     use_beam_search=True,
+                                     temperature=0.0,
+                                     max_tokens=max_tokens,
+                                     ignore_eos=True)
+    req_outputs = vllm_model.model.generate(
+        example_prompts,
+        sampling_params=sampling_params,
+    )
+    assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
+            ARTIFICIAL_PREEMPTION_MAX_CNT)
+    del vllm_model
    # Verify the request is ignored and not hang.
    assert req_outputs[0].outputs[0].finish_reason == "length"

@ -233,26 +239,25 @@ def test_preemption_infeasible(
    BLOCK_SIZE = 16
    prefill_blocks = 2
    decode_blocks = max_tokens // BLOCK_SIZE
-    with vllm_runner(
-            model,
-            dtype=dtype,
-            block_size=BLOCK_SIZE,
-            # Not enough gpu blocks to complete a single sequence.
-            # preemption should happen, and the sequence should be
-            # ignored instead of hanging forever.
-            num_gpu_blocks_override=prefill_blocks + decode_blocks // 2,
-            max_model_len=((prefill_blocks + decode_blocks // 2) * BLOCK_SIZE),
-    ) as vllm_model:
-        sampling_params = SamplingParams(max_tokens=max_tokens,
-                                         ignore_eos=True)
-        req_outputs = vllm_model.model.generate(
-            example_prompts,
-            sampling_params=sampling_params,
-        )
-
-        assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
-                ARTIFICIAL_PREEMPTION_MAX_CNT)
+    vllm_model = vllm_runner(
+        model,
+        dtype=dtype,
+        block_size=BLOCK_SIZE,
+        # Not enough gpu blocks to complete a single sequence.
+        # preemption should happen, and the sequence should be
+        # ignored instead of hanging forever.
+        num_gpu_blocks_override=prefill_blocks + decode_blocks // 2,
+        max_model_len=((prefill_blocks + decode_blocks // 2) * BLOCK_SIZE),
+    )
+    sampling_params = SamplingParams(max_tokens=max_tokens, ignore_eos=True)
+    req_outputs = vllm_model.model.generate(
+        example_prompts,
+        sampling_params=sampling_params,
+    )

+    assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
+            ARTIFICIAL_PREEMPTION_MAX_CNT)
+    del vllm_model
    # Verify the request is ignored and not hang.
    for req_output in req_outputs:
        outputs = req_output.outputs
--- a/tests/conftest.py
+++ b/tests/conftest.py
@ -1,27 +1,20 @@
 import contextlib
 import gc
 import os
-import subprocess
-import sys
-from typing import Any, Dict, List, Optional, Tuple, TypeVar
+from typing import Any, Dict, List, Optional, Tuple

 import pytest
 import torch
-import torch.nn as nn
-import torch.nn.functional as F
 from PIL import Image
-from transformers import (AutoModelForCausalLM, AutoModelForVision2Seq,
-                          AutoProcessor, AutoTokenizer, BatchEncoding)
+from transformers import (AutoModelForCausalLM, AutoProcessor, AutoTokenizer,
+                          LlavaConfig, LlavaForConditionalGeneration)

 from vllm import LLM, SamplingParams
 from vllm.config import TokenizerPoolConfig, VisionLanguageConfig
 from vllm.distributed import destroy_model_parallel
-from vllm.inputs import TextPrompt
+from vllm.inputs import PromptInputs
 from vllm.logger import init_logger
-from vllm.multimodal import MultiModalData
-from vllm.multimodal.image import ImageFeatureData, ImagePixelData
-from vllm.sequence import SampleLogprobs
-from vllm.utils import is_cpu
+from vllm.sequence import MultiModalData

 logger = init_logger(__name__)

@ -30,20 +23,24 @@ _TEST_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "example.txt")]
 _LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "summary.txt")]

 # Multi modal related
-# You can use `.buildkite/download-images.sh` to download the assets
-PIXEL_VALUES_FILES = [
+_PIXEL_VALUES_FILES = [
    os.path.join(_TEST_DIR, "images", filename) for filename in
    ["stop_sign_pixel_values.pt", "cherry_blossom_pixel_values.pt"]
 ]
-IMAGE_FEATURES_FILES = [
+_IMAGE_FEATURES_FILES = [
    os.path.join(_TEST_DIR, "images", filename) for filename in
    ["stop_sign_image_features.pt", "cherry_blossom_image_features.pt"]
 ]
-IMAGE_FILES = [
+_IMAGE_FILES = [
    os.path.join(_TEST_DIR, "images", filename)
    for filename in ["stop_sign.jpg", "cherry_blossom.jpg"]
 ]
-assert len(PIXEL_VALUES_FILES) == len(IMAGE_FEATURES_FILES) == len(IMAGE_FILES)
+_IMAGE_PROMPTS = [
+    "<image>\nUSER: What's the content of the image?\nASSISTANT:",
+    "<image>\nUSER: What is the season?\nASSISTANT:"
+]
+assert len(_PIXEL_VALUES_FILES) == len(_IMAGE_FEATURES_FILES) == len(
+    _IMAGE_FILES) == len(_IMAGE_PROMPTS)


 def _read_prompts(filename: str) -> List[str]:
@ -57,8 +54,7 @@ def cleanup():
    with contextlib.suppress(AssertionError):
        torch.distributed.destroy_process_group()
    gc.collect()
-    if not is_cpu():
-        torch.cuda.empty_cache()
+    torch.cuda.empty_cache()


@pytest.fixture()
@ -81,29 +77,37 @@ def cleanup_fixture(should_do_global_cleanup_after_test: bool):
        cleanup()


+@pytest.fixture(scope="session")
+def hf_image_prompts() -> List[str]:
+    return _IMAGE_PROMPTS
+
+
@pytest.fixture(scope="session")
 def hf_images() -> List[Image.Image]:
-    return [Image.open(filename) for filename in IMAGE_FILES]
+    return [Image.open(filename) for filename in _IMAGE_FILES]


@pytest.fixture()
-def vllm_images(request) -> List[MultiModalData]:
+def vllm_images(request) -> "torch.Tensor":
    vision_language_config = request.getfixturevalue("model_and_config")[1]
+    all_images = []
    if vision_language_config.image_input_type == (
            VisionLanguageConfig.ImageInputType.IMAGE_FEATURES):
-        return [
-            ImageFeatureData(torch.load(filename))
-            for filename in IMAGE_FEATURES_FILES
-        ]
+        filenames = _IMAGE_FEATURES_FILES
    else:
-        return [
-            ImagePixelData(Image.open(filename)) for filename in IMAGE_FILES
-        ]
+        filenames = _PIXEL_VALUES_FILES
+    for filename in filenames:
+        all_images.append(torch.load(filename))
+    return torch.concat(all_images, dim=0)


@pytest.fixture()
-def vllm_image_tensors(request) -> List[torch.Tensor]:
-    return [torch.load(filename) for filename in PIXEL_VALUES_FILES]
+def vllm_image_prompts(request) -> List[str]:
+    vision_language_config = request.getfixturevalue("model_and_config")[1]
+    return [
+        "<image>" * (vision_language_config.image_feature_size - 1) + p
+        for p in _IMAGE_PROMPTS
+    ]


@pytest.fixture
@ -128,50 +132,38 @@ _STR_DTYPE_TO_TORCH_DTYPE = {
    "float": torch.float,
 }

-_T = TypeVar("_T", nn.Module, torch.Tensor, BatchEncoding)
+AutoModelForCausalLM.register(LlavaConfig, LlavaForConditionalGeneration)
+
+_EMBEDDING_MODELS = [
+    "intfloat/e5-mistral-7b-instruct",
+]


 class HfRunner:

-    def wrap_device(self, input: _T) -> _T:
-        if not is_cpu():
-            return input.to("cuda")
-        else:
-            return input.to("cpu")
-
    def __init__(
        self,
        model_name: str,
        dtype: str = "half",
-        *,
-        is_embedding_model: bool = False,
-        is_vision_model: bool = False,
    ) -> None:
        assert dtype in _STR_DTYPE_TO_TORCH_DTYPE
        torch_dtype = _STR_DTYPE_TO_TORCH_DTYPE[dtype]

        self.model_name = model_name

-        if is_embedding_model:
+        if model_name in _EMBEDDING_MODELS:
            # Lazy init required for AMD CI
            from sentence_transformers import SentenceTransformer
-            self.model = self.wrap_device(
-                SentenceTransformer(
-                    model_name,
-                    device="cpu",
-                ).to(dtype=torch_dtype))
+            self.model = SentenceTransformer(
+                model_name,
+                device="cpu",
+            ).to(dtype=torch_dtype).cuda()
        else:
-            if is_vision_model:
-                auto_cls = AutoModelForVision2Seq
-            else:
-                auto_cls = AutoModelForCausalLM
-
-            self.model = self.wrap_device(
-                auto_cls.from_pretrained(
-                    model_name,
-                    torch_dtype=torch_dtype,
-                    trust_remote_code=True,
-                ))
+            self.model = AutoModelForCausalLM.from_pretrained(
+                model_name,
+                torch_dtype=torch_dtype,
+                trust_remote_code=True,
+            ).cuda()

        self.tokenizer = AutoTokenizer.from_pretrained(
            model_name,
@ -196,11 +188,10 @@ class HfRunner:
        prompts: List[str],
        images: Optional[List[Image.Image]] = None,
        **kwargs,
-    ) -> List[Tuple[List[List[int]], List[str]]]:
+    ) -> List[Tuple[List[int], str]]:
+        outputs: List[Tuple[List[int], str]] = []
        if images:
            assert len(prompts) == len(images)
-
-        outputs: List[Tuple[List[List[int]], List[str]]] = []
        for i, prompt in enumerate(prompts):
            processor_kwargs: Dict[str, Any] = {
                "text": prompt,
@ -210,13 +201,17 @@ class HfRunner:
                processor_kwargs["images"] = images[i]

            inputs = self.processor(**processor_kwargs)
+            inputs = {
+                key: value.cuda() if value is not None else None
+                for key, value in inputs.items()
+            }

            output_ids = self.model.generate(
-                **self.wrap_device(inputs),
+                **inputs,
                use_cache=True,
                **kwargs,
            )
-            output_str = self.processor.batch_decode(
+            output_str = self.tokenizer.batch_decode(
                output_ids,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=False,
@ -229,22 +224,23 @@ class HfRunner:
        self,
        prompts: List[str],
        max_tokens: int,
-        images: Optional[List[Image.Image]] = None,
+        images: Optional["torch.Tensor"] = None,
    ) -> List[Tuple[List[int], str]]:
        outputs = self.generate(prompts,
                                do_sample=False,
                                max_new_tokens=max_tokens,
                                images=images)
-
-        return [(output_ids[0], output_str[0])
-                for output_ids, output_str in outputs]
+        for i in range(len(outputs)):
+            output_ids, output_str = outputs[i]
+            outputs[i] = (output_ids[0], output_str[0])
+        return outputs

    def generate_beam_search(
        self,
        prompts: List[str],
        beam_width: int,
        max_tokens: int,
-    ) -> List[Tuple[List[List[int]], List[str]]]:
+    ) -> List[Tuple[List[int], str]]:
        outputs = self.generate(prompts,
                                do_sample=False,
                                max_new_tokens=max_tokens,
@ -269,7 +265,7 @@ class HfRunner:
        for prompt in prompts:
            input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids
            output = self.model.generate(
-                self.wrap_device(input_ids),
+                input_ids.cuda(),
                use_cache=True,
                do_sample=False,
                max_new_tokens=max_tokens,
@ -286,7 +282,9 @@ class HfRunner:
                if self.model.get_output_embeddings().bias is not None:
                    logits += self.model.get_output_embeddings(
                    ).bias.unsqueeze(0)
-                logprobs = F.log_softmax(logits, dim=-1, dtype=torch.float32)
+                logprobs = torch.nn.functional.log_softmax(logits,
+                                                           dim=-1,
+                                                           dtype=torch.float32)
                seq_logprobs.append(logprobs)
            all_logprobs.append(seq_logprobs)
        return all_logprobs
@ -296,15 +294,15 @@ class HfRunner:
        prompts: List[str],
        max_tokens: int,
        num_logprobs: int,
-    ) -> List[Tuple[List[int], str, List[Dict[int, float]]]]:
-        all_logprobs: List[List[Dict[int, float]]] = []
-        all_output_ids: List[List[int]] = []
-        all_output_strs: List[str] = []
+    ) -> List[Tuple[List[int], str]]:
+        all_logprobs = []
+        all_output_ids = []
+        all_output_strs = []

        for prompt in prompts:
            input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids
            output = self.model.generate(
-                self.wrap_device(input_ids),
+                input_ids.cuda(),
                use_cache=True,
                do_sample=False,
                max_new_tokens=max_tokens,
@ -312,7 +310,7 @@ class HfRunner:
                return_dict_in_generate=True,
            )

-            seq_logprobs: List[torch.Tensor] = []
+            seq_logprobs = []
            for _, hidden_states in enumerate(output.hidden_states):
                last_hidden_states = hidden_states[-1][0]
                logits = torch.matmul(
@ -323,11 +321,13 @@ class HfRunner:
                           None) is not None:
                    logits += self.model.get_output_embeddings(
                    ).bias.unsqueeze(0)
-                logprobs = F.log_softmax(logits, dim=-1, dtype=torch.float32)
+                logprobs = torch.nn.functional.log_softmax(logits,
+                                                           dim=-1,
+                                                           dtype=torch.float32)
                seq_logprobs.append(logprobs)

            # convert to dict
-            seq_logprobs_lst: List[Dict[int, float]] = []
+            seq_logprobs_lst = []
            for tok_idx, tok_logprobs in enumerate(seq_logprobs):
                # drop prompt logprobs
                if tok_idx == 0:
@ -354,10 +354,7 @@ class HfRunner:
    def encode(self, prompts: List[str]) -> List[List[torch.Tensor]]:
        return self.model.encode(prompts)

-    def __enter__(self):
-        return self
-
-    def __exit__(self, exc_type, exc_value, traceback):
+    def __del__(self):
        del self.model
        cleanup()

@ -375,13 +372,13 @@ class VllmRunner:
        tokenizer_name: Optional[str] = None,
        # Use smaller max model length, otherwise bigger model cannot run due
        # to kv cache size limit.
-        max_model_len: int = 1024,
+        max_model_len=1024,
        dtype: str = "half",
        disable_log_stats: bool = True,
        tensor_parallel_size: int = 1,
        block_size: int = 16,
        enable_chunked_prefill: bool = False,
-        swap_space: int = 4,
+        swap_space=4,
        **kwargs,
    ) -> None:
        self.model = LLM(
@ -402,25 +399,32 @@ class VllmRunner:
        self,
        prompts: List[str],
        sampling_params: SamplingParams,
-        images: Optional[List[MultiModalData]] = None,
-    ) -> List[Tuple[List[List[int]], List[str]]]:
+        images: Optional["torch.Tensor"] = None,
+    ) -> List[Tuple[List[int], str]]:
        if images is not None:
-            assert len(prompts) == len(images)
+            assert len(prompts) == images.shape[0]

-        inputs = [TextPrompt(prompt=prompt) for prompt in prompts]
-        if images is not None:
-            for i, image in enumerate(images):
-                inputs[i]["multi_modal_data"] = image
+        prompt_inputs: List[PromptInputs] = []
+        for i, prompt in enumerate(prompts):
+            image = None if images is None else images[i:i + 1]
+            mm_data = None if image is None else MultiModalData(
+                type=MultiModalData.Type.IMAGE,
+                data=image,
+            )

-        req_outputs = self.model.generate(inputs,
+            prompt_inputs.append({
+                "prompt": prompt,
+                "multi_modal_data": mm_data,
+            })
+
+        req_outputs = self.model.generate(prompt_inputs,
                                          sampling_params=sampling_params)
-
-        outputs: List[Tuple[List[List[int]], List[str]]] = []
+        outputs = []
        for req_output in req_outputs:
            prompt_str = req_output.prompt
            prompt_ids = req_output.prompt_token_ids
-            req_sample_output_ids: List[List[int]] = []
-            req_sample_output_strs: List[str] = []
+            req_sample_output_ids = []
+            req_sample_output_strs = []
            for sample in req_output.outputs:
                output_str = sample.text
                output_ids = sample.token_ids
@ -433,12 +437,12 @@ class VllmRunner:
        self,
        prompts: List[str],
        sampling_params: SamplingParams,
-    ) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]:
+    ) -> List[Tuple[List[int], str]]:
        assert sampling_params.logprobs is not None

        req_outputs = self.model.generate(prompts,
                                          sampling_params=sampling_params)
-        outputs: List[Tuple[List[int], str, Optional[SampleLogprobs]]] = []
+        outputs = []
        for req_output in req_outputs:
            for sample in req_output.outputs:
                output_str = sample.text
@ -451,7 +455,7 @@ class VllmRunner:
        self,
        prompts: List[str],
        max_tokens: int,
-        images: Optional[List[MultiModalData]] = None,
+        images: Optional[torch.Tensor] = None,
    ) -> List[Tuple[List[int], str]]:
        greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens)
        outputs = self.generate(prompts, greedy_params, images=images)
@ -463,7 +467,7 @@ class VllmRunner:
        prompts: List[str],
        max_tokens: int,
        num_logprobs: int,
-    ) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]:
+    ) -> List[Tuple[List[int], str]]:
        greedy_logprobs_params = SamplingParams(temperature=0.0,
                                                max_tokens=max_tokens,
                                                logprobs=num_logprobs)
@ -477,7 +481,7 @@ class VllmRunner:
        prompts: List[str],
        beam_width: int,
        max_tokens: int,
-    ) -> List[Tuple[List[List[int]], List[str]]]:
+    ) -> List[Tuple[List[int], str]]:
        beam_search_params = SamplingParams(n=beam_width,
                                            use_beam_search=True,
                                            temperature=0.0,
@ -493,10 +497,7 @@ class VllmRunner:
            outputs.append(embedding)
        return outputs

-    def __enter__(self):
-        return self
-
-    def __exit__(self, exc_type, exc_value, traceback):
+    def __del__(self):
        del self.model
        cleanup()

@ -530,22 +531,3 @@ def caplog_vllm(temporary_enable_log_propagate, caplog):
    # To capture vllm log, we should enable propagate=True temporarily
    # because caplog depends on logs propagated to the root logger.
    yield caplog
-
-
-@pytest.fixture(scope="session")
-def num_gpus_available():
-    """Get number of GPUs without initializing the CUDA context
-    in current process."""
-
-    try:
-        out = subprocess.run([
-            sys.executable, "-c",
-            "import torch; print(torch.cuda.device_count())"
-        ],
-                             capture_output=True,
-                             check=True,
-                             text=True)
-    except subprocess.CalledProcessError as e:
-        logger.warning("Failed to get number of GPUs.", exc_info=e)
-        return 0
-    return int(out.stdout.strip())
--- a/tests/core/block/e2e/test_correctness.py
+++ b/tests/core/block/e2e/test_correctness.py
@ -24,13 +24,7 @@ from .conftest import get_token_ids_from_llm_generator
@pytest.mark.parametrize("baseline_llm_kwargs", [{
    "use_v2_block_manager": False
 }])
-@pytest.mark.parametrize("test_llm_kwargs", [{
-    "use_v2_block_manager": True,
-    "preemption_mode": "swap"
-}, {
-    "use_v2_block_manager": True,
-    "preemption_mode": "recompute"
-}])
+@pytest.mark.parametrize("test_llm_kwargs", [{"use_v2_block_manager": True}])
@pytest.mark.parametrize("batch_size", [10])
@pytest.mark.parametrize("seed", [1])
 def test_v1_v2_greedy_equality_with_preemption(baseline_llm_generator,
@ -101,13 +95,7 @@ def test_v1_v2_greedy_equality_with_preemption(baseline_llm_generator,
@pytest.mark.parametrize("baseline_llm_kwargs", [{
    "use_v2_block_manager": False
 }])
-@pytest.mark.parametrize("test_llm_kwargs", [{
-    "use_v2_block_manager": True,
-    "preemption_mode": "swap"
-}, {
-    "use_v2_block_manager": True,
-    "preemption_mode": "recompute"
-}])
+@pytest.mark.parametrize("test_llm_kwargs", [{"use_v2_block_manager": True}])
@pytest.mark.parametrize("batch_size", [10])
@pytest.mark.parametrize("seed", [1])
 def test_v1_v2_greedy_equality_with_cow(baseline_llm_generator,
@ -191,18 +179,11 @@ def test_v1_v2_greedy_equality_with_cow(baseline_llm_generator,
 }])
@pytest.mark.parametrize(
    "test_llm_kwargs",
-    [
-        {
-            # We run one test with block_size < lookahead_slots, one test with
-            # block_size > lookahead_slots
-            "num_lookahead_slots": 10,
-            "preemption_mode": "swap",
-        },
-        {
-            "num_lookahead_slots": 10,
-            "preemption_mode": "recompute",
-        }
-    ])
+    [{
+        # We run one test with block_size < lookahead_slots, one test with
+        # block_size > lookahead_slots
+        "num_lookahead_slots": 10,
+    }])
@pytest.mark.parametrize("batch_size", [4])
@pytest.mark.parametrize("seed", [1])
 def test_lookahead_greedy_equality_with_preemption(baseline_llm_generator,
@ -341,13 +322,7 @@ def test_chunked_prefill_block_manager_v2(baseline_llm_generator,
@pytest.mark.parametrize("baseline_llm_kwargs", [{
    "use_v2_block_manager": False
 }])
-@pytest.mark.parametrize("test_llm_kwargs", [{
-    "use_v2_block_manager": True,
-    "preemption_mode": "swap"
-}, {
-    "use_v2_block_manager": True,
-    "preemption_mode": "recompute"
-}])
+@pytest.mark.parametrize("test_llm_kwargs", [{"use_v2_block_manager": True}])
@pytest.mark.parametrize("batch_size", [10])
@pytest.mark.parametrize("seed", [1])
 def test_v1_v2_greedy_equality_prefix_caching_enabled_with_preemption(
@ -422,13 +397,7 @@ def test_v1_v2_greedy_equality_prefix_caching_enabled_with_preemption(
@pytest.mark.parametrize("baseline_llm_kwargs", [{
    "enable_prefix_caching": False
 }])
-@pytest.mark.parametrize("test_llm_kwargs", [{
-    "enable_prefix_caching": True,
-    "preemption_mode": "swap"
-}, {
-    "enable_prefix_caching": True,
-    "preemption_mode": "recompute"
-}])
+@pytest.mark.parametrize("test_llm_kwargs", [{"enable_prefix_caching": True}])
@pytest.mark.parametrize("batch_size", [10])
@pytest.mark.parametrize("seed", [1])
 def test_auto_prefix_caching_with_preemption(baseline_llm_generator,
--- a/tests/core/block/test_block_manager_v2.py
+++ b/tests/core/block/test_block_manager_v2.py
@ -7,8 +7,7 @@ from vllm.core.interfaces import AllocStatus
 from vllm.sequence import Logprob, SequenceStatus
 from vllm.utils import chunk_list

-from ..utils import (create_dummy_prompt, create_seq_group,
-                     create_seq_group_encoder_decoder)
+from ..utils import create_seq_group, create_seq_group_encoder_decoder


@pytest.mark.parametrize("block_size", [16])
@ -256,61 +255,6 @@ def test_append_slots(block_size, prompt_len, num_slots_to_append,
    assert num_consumed_blocks == expected_consumed_blocks


-@pytest.mark.parametrize("block_size", [8])
-@pytest.mark.parametrize("num_cpu_blocks", [4])
-@pytest.mark.parametrize("num_gpu_blocks", [4])
-@pytest.mark.parametrize("num_lookahead_slots", [0, 2, 10])
-@pytest.mark.parametrize("enable_caching", [False, True])
-def test_swap(block_size, num_cpu_blocks, num_gpu_blocks, num_lookahead_slots,
-              enable_caching):
-    """Verify blocks number on src/desc device is correct after swapping in/out
-        sequence group (not missing or extra blocks).
-    """
-    block_manager = BlockSpaceManagerV2(block_size,
-                                        num_cpu_blocks,
-                                        num_gpu_blocks,
-                                        watermark=0,
-                                        enable_caching=enable_caching)
-    prompt, seq_group = create_dummy_prompt("1", prompt_length=block_size - 1)
-    prompt.status = SequenceStatus.WAITING
-    block_manager.allocate(seq_group)
-    # Emulate a forward pass by appending a single token.
-    # The block manager then knows how many unprocessed
-    # tokens will be written in the next forward pass.
-    token_id = 0
-    prompt.status = SequenceStatus.RUNNING
-    prompt.append_token_id(token_id, {token_id: Logprob(0.0)})
-
-    # Swap seq group from GPU -> CPU.
-    gpu_blocks = block_manager.get_block_table(prompt)
-    assert block_manager.can_swap_out(seq_group)
-    before_cpu_blocks = block_manager.get_num_free_cpu_blocks()
-    before_gpu_blocks = block_manager.get_num_free_gpu_blocks()
-    mapping = block_manager.swap_out(seq_group)
-    mapping_keys = [key for key, _ in mapping]
-    assert mapping_keys == gpu_blocks
-    after_cpu_blocks = block_manager.get_num_free_cpu_blocks()
-    after_gpu_blocks = block_manager.get_num_free_gpu_blocks()
-    assert before_cpu_blocks == after_cpu_blocks + len(gpu_blocks)
-    assert before_gpu_blocks + len(gpu_blocks) == after_gpu_blocks
-    prompt.status = SequenceStatus.SWAPPED
-
-    # Swap seq group from CPU -> GPU.
-    assert block_manager.can_swap_in(seq_group, num_lookahead_slots)
-    before_cpu_blocks = block_manager.get_num_free_cpu_blocks()
-    before_gpu_blocks = block_manager.get_num_free_gpu_blocks()
-    mapping = block_manager.swap_in(seq_group)
-    cpu_blocks = block_manager.get_block_table(prompt)
-    mapping_keys = [key for key, _ in mapping]
-    assert mapping_keys == [cpu_blocks[0]]
-    after_cpu_blocks = block_manager.get_num_free_cpu_blocks()
-    after_gpu_blocks = block_manager.get_num_free_gpu_blocks()
-    assert before_gpu_blocks == after_gpu_blocks + len(cpu_blocks)
-
-
-# TODO(cade/kaiyang): add comprehensive tests for swapping at allocator level.
-
-
@pytest.mark.parametrize("block_size", [8, 16])
@pytest.mark.parametrize("prompt_len", [10, 300, 1000])
@pytest.mark.parametrize("num_slots_to_append", [50])
--- a/tests/distributed/test_basic_distributed_correctness.py
+++ b/tests/distributed/test_basic_distributed_correctness.py
@ -42,16 +42,18 @@ def test_models(
    backend_by_env_var = os.getenv(VLLM_ATTENTION_BACKEND)
    enforce_eager = backend_by_env_var == "FLASHINFER"

-    with hf_runner(model, dtype=dtype) as hf_model:
-        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+    hf_model = hf_runner(model, dtype=dtype)
+    hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+    del hf_model

-    with vllm_runner(model,
-                     dtype=dtype,
-                     tensor_parallel_size=2,
-                     enforce_eager=enforce_eager,
-                     distributed_executor_backend=distributed_executor_backend
-                     ) as vllm_model:
-        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+    vllm_model = vllm_runner(
+        model,
+        dtype=dtype,
+        tensor_parallel_size=2,
+        enforce_eager=enforce_eager,
+        distributed_executor_backend=distributed_executor_backend)
+    vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+    del vllm_model

    for i in range(len(example_prompts)):
        hf_output_ids, hf_output_str = hf_outputs[i]
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Zhuohan Li	1936d7bab0	format	2024-06-02 00:02:54 +00:00
Zhuohan Li	996cf2de5c	Fix hashing logic for non-full blocks	2024-06-02 00:01:30 +00:00