[Misc] Support quantization of MllamaForCausalLM (#8822)

[Doc] Update doc for Transformers 4.45 (#8817)
[Model] Add support for the multi-modal Llama 3.2 model (#8811)
2024-09-25 14:46:22 -07:00 · 2024-09-25 13:29:48 -07:00 · 2024-09-25 13:29:32 -07:00 · 2024-09-25 10:36:26 -07:00 · 2024-09-25 09:43:36 -07:00 · 2024-09-25 09:37:41 -07:00
385 changed files with 20070 additions and 7214 deletions
--- a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
+++ b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
@ -8,8 +8,7 @@ steps:
          containers:
          - image: badouralix/curl-jq
            command:
-            - sh
-            - .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
+            - sh .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
  - wait
  - label: "A100"
    agents:
--- a/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh
+++ b/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh
@ -2,9 +2,11 @@
 TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-test-repo:pull" | jq -r .token)
 URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT"

+TIMEOUT_SECONDS=10
+
 retries=0
 while [ $retries -lt 1000 ]; do
-    if [ $(curl -s -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" $URL) -eq 200 ]; then
+    if [ $(curl -s --max-time $TIMEOUT_SECONDS -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" $URL) -eq 200 ]; then
        exit 0
    fi

--- a/.buildkite/run-amd-test.sh
+++ b/.buildkite/run-amd-test.sh
@ -83,6 +83,7 @@ if [[ $commands == *" kernels "* ]]; then
  --ignore=kernels/test_encoder_decoder_attn.py \
  --ignore=kernels/test_flash_attn.py \
  --ignore=kernels/test_flashinfer.py \
+  --ignore=kernels/test_gguf.py \
  --ignore=kernels/test_int8_quant.py \
  --ignore=kernels/test_machete_gemm.py \
  --ignore=kernels/test_mamba_ssm.py \
@ -93,6 +94,16 @@ if [[ $commands == *" kernels "* ]]; then
  --ignore=kernels/test_sampler.py"
 fi

+# Ignore certain entrypoints tests
+if [[ $commands == *" entrypoints/openai "* ]]; then
+  commands=${commands//" entrypoints/openai "/" entrypoints/openai \
+  --ignore=entrypoints/openai/test_accuracy.py \
+  --ignore=entrypoints/openai/test_audio.py \
+  --ignore=entrypoints/openai/test_encoder_decoder.py \
+  --ignore=entrypoints/openai/test_embedding.py \
+  --ignore=entrypoints/openai/test_oot_registration.py "}
+fi
+
 PARALLEL_JOB_COUNT=8
 # Check whether the command contains a shard flag; if so, run all shards in parallel because the host has 8 GPUs.
 if [[ $commands == *"--shard-id="* ]]; then
--- a/.buildkite/run-cpu-test.sh
+++ b/.buildkite/run-cpu-test.sh
@ -22,7 +22,7 @@ docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py"

 # Run basic model test
 docker exec cpu-test bash -c "
-  pip install pytest matplotlib einops transformers_stream_generator
+  pip install pytest matplotlib einops transformers_stream_generator datamodel_code_generator
  pytest -v -s tests/models/decoder_only/language \
    --ignore=tests/models/test_fp8.py \
    --ignore=tests/models/decoder_only/language/test_jamba.py \
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@ -43,13 +43,15 @@ steps:
  fast_check: true
  source_file_dependencies:
  - vllm/
+  - tests/mq_llm_engine
  - tests/async_engine
  - tests/test_inputs
  - tests/multimodal
  - tests/test_utils
  - tests/worker
  commands:
-  - pytest -v -s async_engine # Async Engine
+  - pytest -v -s mq_llm_engine # MQLLMEngine
+  - pytest -v -s async_engine # AsyncLLMEngine
  - NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py
  - pytest -v -s test_inputs.py
  - pytest -v -s multimodal
@ -68,7 +70,7 @@ steps:
  - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
  - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
  - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
-  
+
 - label: Core Test # 10min
  mirror_hardwares: [amd]
  fast_check: true
@ -82,14 +84,17 @@ steps:
 - label: Entrypoints Test # 20min
  working_dir: "/vllm-workspace/tests"
  fast_check: true
-  #mirror_hardwares: [amd]
+  mirror_hardwares: [amd]
  source_file_dependencies:
  - vllm/
  commands:
  - pip install -e ./plugins/vllm_add_dummy_model
  - pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@a4987bba6e9e9b3f22bd3a6c1ecf0abd04fd5622#egg=lm_eval[api]
-  - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py
+  - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py
  - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
+  - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
+  - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
+  - pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
  - pytest -v -s entrypoints/openai
  - pytest -v -s entrypoints/test_chat_utils.py
  - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
@ -163,13 +168,6 @@ steps:
    - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
    - python3 offline_inference_encoder_decoder.py

- label: torch compile integration test
-  source_file_dependencies:
-  - vllm/
-  commands:
-    - pytest -v -s ./compile/test_full_graph.py
-    - pytest -v -s ./compile/test_wrapper.py
-
 - label: Prefix Caching Test # 7min
  #mirror_hardwares: [amd]
  source_file_dependencies:
@ -212,6 +210,21 @@ steps:
  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py
  parallelism: 4

+- label: "PyTorch Fullgraph Smoke Test"
+  fast_check: true
+  source_file_dependencies:
+  - vllm/
+  - tests/compile
+  commands:
+  - pytest -v -s compile/test_full_graph_smoke.py
+
+- label: "PyTorch Fullgraph Test"
+  source_file_dependencies:
+  - vllm/
+  - tests/compile
+  commands:
+  - pytest -v -s compile/test_full_graph.py
+
 - label: Kernels Test %N # 30min each
  mirror_hardwares: [amd]
  source_file_dependencies:
@ -259,6 +272,13 @@ steps:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - bash ./run-tests.sh -c configs/models-small.txt -t 1

+- label: Encoder Decoder tests # 5min
+  source_file_dependencies:
+  - vllm/
+  - tests/encoder_decoder
+  commands:
+    - pytest -v -s encoder_decoder
+
 - label: OpenAI-Compatible Tool Use # 20 min
  fast_check: false
  mirror_hardwares: [ amd ]
@ -348,7 +368,10 @@ steps:
  - vllm/executor/
  - vllm/model_executor/models/
  - tests/distributed/
+  - vllm/compilation
  commands:
+  - pytest -v -s ./compile/test_full_graph_multi_gpu.py
+  - pytest -v -s ./compile/test_wrapper.py
  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep -q 'Same node test passed'
  - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m distributed_2_gpus
  # Avoid importing model tests that cause CUDA reinitialization error
--- a/.github/workflows/ruff.yml
+++ b/.github/workflows/ruff.yml
@ -25,10 +25,10 @@ jobs:
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
-        pip install ruff==0.1.5 codespell==2.3.0 tomli==2.0.1 isort==5.13.2
+        pip install -r requirements-lint.txt
    - name: Analysing the code with ruff
      run: |
-        ruff .
+        ruff check .
    - name: Spelling check with codespell
      run: |
        codespell --toml pyproject.toml
--- a/.github/workflows/scripts/build.sh
+++ b/.github/workflows/scripts/build.sh
@ -15,5 +15,6 @@ $python_executable -m pip install -r requirements-cuda.txt
 export MAX_JOBS=1
 # Make sure release wheels are built for the following architectures
 export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
+export VLLM_FA_CMAKE_GPU_ARCHES="80-real;90-real"
 # Build
 $python_executable setup.py bdist_wheel --dist-dir=dist
--- a/.gitignore
+++ b/.gitignore
@ -1,5 +1,8 @@
-# vllm commit id, generated by setup.py
-vllm/commit_id.py
+# version file generated by setuptools-scm
+/vllm/_version.py
+
+# vllm-flash-attn built from source
+vllm/vllm_flash_attn/

 # Byte-compiled / optimized / DLL files
 __pycache__/
@ -12,6 +15,8 @@ __pycache__/
 # Distribution / packaging
 .Python
 build/
+cmake-build-*/
+CMakeUserPresets.json
 develop-eggs/
 dist/
 downloads/
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -1,5 +1,16 @@
 cmake_minimum_required(VERSION 3.26)

+# When building directly using CMake, make sure you run the install step
+# (it places the .so files in the correct location).
+#
+# Example:
+# mkdir build && cd build
+# cmake -G Ninja -DVLLM_PYTHON_EXECUTABLE=`which python3` -DCMAKE_INSTALL_PREFIX=.. ..
+# cmake --build . --target install
+#
+# If you want to only build one target, make sure to install it manually:
+# cmake --build . --target _C
+# cmake --install . --component _C
 project(vllm_extensions LANGUAGES CXX)

 # CUDA by default, can be overridden by using -DVLLM_TARGET_DEVICE=... (used by setup.py)
@ -13,6 +24,9 @@ include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)
 # Suppress potential warnings about unused manually-specified variables
 set(ignoreMe "${VLLM_PYTHON_PATH}")

+# Prevent installation of dependencies (cutlass) by default.
+install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
+
 #
 # Supported python versions.  These versions will be searched in order, the
 # first match will be selected.  These should be kept in sync with setup.py.
@ -70,19 +84,6 @@ endif()
 find_package(Torch REQUIRED)

 #
-# Add the `default` target which detects which extensions should be
-# built based on platform/architecture.  This is the same logic that
-# setup.py uses to select which extensions should be built and should
-# be kept in sync.
-#
-# The `default` target makes direct use of cmake easier since knowledge
-# of which extensions are supported has been factored in, e.g.
-#
-# mkdir build && cd build
-# cmake -G Ninja -DVLLM_PYTHON_EXECUTABLE=`which python3` -DCMAKE_LIBRARY_OUTPUT_DIRECTORY=../vllm ..
-# cmake --build . --target default
-#
-add_custom_target(default)
 message(STATUS "Enabling core extension.")

 # Define _core_C extension
@ -100,8 +101,6 @@ define_gpu_extension_target(
  USE_SABI 3
  WITH_SOABI)

-add_dependencies(default _core_C)
-
 #
 # Forward the non-CUDA device extensions to external CMake scripts.
 #
@ -167,6 +166,8 @@ if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA")
  list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}")
 endif()

+include(FetchContent)
+
 #
 # Define other extension targets
 #
@ -190,8 +191,11 @@ set(VLLM_EXT_SRC
  "csrc/torch_bindings.cpp")

 if(VLLM_GPU_LANG STREQUAL "CUDA")
-  include(FetchContent)
  SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
+
+  # Set CUTLASS_REVISION manually -- its revision detection doesn't work in this case.
+  set(CUTLASS_REVISION "v3.5.1" CACHE STRING "CUTLASS revision to use")
+
  FetchContent_Declare(
        cutlass
        GIT_REPOSITORY https://github.com/nvidia/cutlass.git
@ -219,6 +223,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    "csrc/quantization/gguf/gguf_kernel.cu"
    "csrc/quantization/fp8/fp8_marlin.cu"
    "csrc/custom_all_reduce.cu"
+    "csrc/permute_cols.cu"
    "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
    "csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu"
    "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu")
@ -283,6 +288,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    csrc/quantization/machete/machete_pytorch.cu)
 endif()

+message(STATUS "Enabling C extension.")
 define_gpu_extension_target(
  _C
  DESTINATION vllm
@ -310,9 +316,15 @@ set(VLLM_MOE_EXT_SRC

 if(VLLM_GPU_LANG STREQUAL "CUDA")
  list(APPEND VLLM_MOE_EXT_SRC
+      "csrc/moe/marlin_kernels/marlin_moe_kernel.h"
+      "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.h"
+      "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.cu"
+      "csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.h"
+      "csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.cu"
      "csrc/moe/marlin_moe_ops.cu")
 endif()

+message(STATUS "Enabling moe extension.")
 define_gpu_extension_target(
  _moe_C
  DESTINATION vllm
@ -323,13 +335,85 @@ define_gpu_extension_target(
  USE_SABI 3
  WITH_SOABI)

+if(VLLM_GPU_LANG STREQUAL "HIP")
+  #
+  # _rocm_C extension
+  #
+  set(VLLM_ROCM_EXT_SRC
+    "csrc/rocm/torch_bindings.cpp"
+    "csrc/rocm/attention.cu")

-
-if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP")
-  message(STATUS "Enabling C extension.")
-  add_dependencies(default _C)
-
-  message(STATUS "Enabling moe extension.")
-  add_dependencies(default _moe_C)
-
+  define_gpu_extension_target(
+    _rocm_C
+    DESTINATION vllm
+    LANGUAGE ${VLLM_GPU_LANG}
+    SOURCES ${VLLM_ROCM_EXT_SRC}
+    COMPILE_FLAGS ${VLLM_GPU_FLAGS}
+    ARCHITECTURES ${VLLM_GPU_ARCHES}
+    USE_SABI 3
+    WITH_SOABI)
 endif()
+
+# vllm-flash-attn currently only supported on CUDA
+if (NOT VLLM_TARGET_DEVICE STREQUAL "cuda")
+  return()
+endif ()
+
+#
+# Build vLLM flash attention from source
+#
+# IMPORTANT: This has to be the last thing we do, because vllm-flash-attn uses the same macros/functions as vLLM.
+# Because functions all belong to the global scope, vllm-flash-attn's functions overwrite vLLM's.
+# They should be identical but if they aren't, this is a massive footgun.
+#
+# The vllm-flash-attn install rules are nested under vllm to make sure the library gets installed in the correct place.
+# To only install vllm-flash-attn, use --component vllm_flash_attn_c.
+# If no component is specified, vllm-flash-attn is still installed.
+
+# If VLLM_FLASH_ATTN_SRC_DIR is set, vllm-flash-attn is installed from that directory instead of downloading.
+# This is to enable local development of vllm-flash-attn within vLLM.
+# It can be set as an environment variable or passed as a cmake argument.
+# The environment variable takes precedence.
+if (DEFINED ENV{VLLM_FLASH_ATTN_SRC_DIR})
+  set(VLLM_FLASH_ATTN_SRC_DIR $ENV{VLLM_FLASH_ATTN_SRC_DIR})
+endif()
+
+if(VLLM_FLASH_ATTN_SRC_DIR)
+  FetchContent_Declare(vllm-flash-attn SOURCE_DIR ${VLLM_FLASH_ATTN_SRC_DIR})
+else()
+  FetchContent_Declare(
+          vllm-flash-attn
+          GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
+          GIT_TAG 013f0c4fc47e6574060879d9734c1df8c5c273bd
+          GIT_PROGRESS TRUE
+  )
+endif()
+
+# Set the parent build flag so that the vllm-flash-attn library does not redo compile flag and arch initialization.
+set(VLLM_PARENT_BUILD ON)
+
+# Ensure the vllm/vllm_flash_attn directory exists before installation
+install(CODE "file(MAKE_DIRECTORY \"\${CMAKE_INSTALL_PREFIX}/vllm/vllm_flash_attn\")" COMPONENT vllm_flash_attn_c)
+
+# Make sure vllm-flash-attn install rules are nested under vllm/
+install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY FALSE)" COMPONENT vllm_flash_attn_c)
+install(CODE "set(OLD_CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}\")" COMPONENT vllm_flash_attn_c)
+install(CODE "set(CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}/vllm/\")" COMPONENT vllm_flash_attn_c)
+
+# Fetch the vllm-flash-attn library
+FetchContent_MakeAvailable(vllm-flash-attn)
+message(STATUS "vllm-flash-attn is available at ${vllm-flash-attn_SOURCE_DIR}")
+
+# Restore the install prefix
+install(CODE "set(CMAKE_INSTALL_PREFIX \"\${OLD_CMAKE_INSTALL_PREFIX}\")" COMPONENT vllm_flash_attn_c)
+install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" COMPONENT vllm_flash_attn_c)
+
+# Copy over the vllm-flash-attn python files
+install(
+        DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
+        DESTINATION vllm/vllm_flash_attn
+        COMPONENT vllm_flash_attn_c
+        FILES_MATCHING PATTERN "*.py"
+)
+
+# Nothing after vllm-flash-attn, see comment about macros above
--- a/16
+++ b/16
@ -48,6 +48,9 @@ RUN --mount=type=cache,target=/root/.cache/pip \
 # see https://github.com/pytorch/pytorch/pull/123243
 ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
 ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
+# Override the arch list for flash-attn to reduce the binary size
+ARG vllm_fa_cmake_gpu_arches='80-real;90-real'
+ENV VLLM_FA_CMAKE_GPU_ARCHES=${vllm_fa_cmake_gpu_arches}
 #################### BASE BUILD IMAGE ####################

 #################### WHEEL BUILD IMAGE ####################
@ -76,14 +79,13 @@ ENV MAX_JOBS=${max_jobs}
 ARG nvcc_threads=8
 ENV NVCC_THREADS=$nvcc_threads

-ARG buildkite_commit
-ENV BUILDKITE_COMMIT=${buildkite_commit}
-
 ARG USE_SCCACHE
 ARG SCCACHE_BUCKET_NAME=vllm-build-sccache
 ARG SCCACHE_REGION_NAME=us-west-2
+ARG SCCACHE_S3_NO_CREDENTIALS=0
 # if USE_SCCACHE is set, use sccache to speed up compilation
 RUN --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=bind,source=.git,target=.git \
    if [ "$USE_SCCACHE" = "1" ]; then \
        echo "Installing sccache..." \
        && curl -L -o sccache.tar.gz https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-x86_64-unknown-linux-musl.tar.gz \
@ -92,6 +94,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
        && rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \
        && export SCCACHE_BUCKET=${SCCACHE_BUCKET_NAME} \
        && export SCCACHE_REGION=${SCCACHE_REGION_NAME} \
+        && export SCCACHE_S3_NO_CREDENTIALS=${SCCACHE_S3_NO_CREDENTIALS} \
        && export SCCACHE_IDLE_TIMEOUT=0 \
        && export CMAKE_BUILD_TYPE=Release \
        && sccache --show-stats \
@ -102,6 +105,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
 ENV CCACHE_DIR=/root/.cache/ccache
 RUN --mount=type=cache,target=/root/.cache/ccache \
    --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=bind,source=.git,target=.git  \
    if [ "$USE_SCCACHE" != "1" ]; then \
        python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
    fi
@ -180,10 +184,6 @@ FROM vllm-base AS test
 ADD . /vllm-workspace/

 # install development dependencies (for testing)
-# A newer setuptools is required for installing some test dependencies from source that do not publish python 3.12 wheels
-# This installation must complete before the test dependencies are collected and installed.
-RUN --mount=type=cache,target=/root/.cache/pip \
-    python3 -m pip install "setuptools>=74.1.1"
 RUN --mount=type=cache,target=/root/.cache/pip \
    python3 -m pip install -r requirements-dev.txt

@ -202,7 +202,7 @@ FROM vllm-base AS vllm-openai

 # install additional dependencies for openai api server
 RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install accelerate hf_transfer 'modelscope!=1.15.0'
+    pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.44.0' timm==0.9.10

 ENV VLLM_USAGE_SOURCE production-docker-image

--- a/Dockerfile.cpu
+++ b/Dockerfile.cpu
@ -24,6 +24,8 @@ RUN echo 'ulimit -c 0' >> ~/.bashrc

 RUN pip install https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/cpu/intel_extension_for_pytorch-2.4.0%2Bgitfbaa4bc-cp310-cp310-linux_x86_64.whl

+WORKDIR /workspace
+
 ENV PIP_EXTRA_INDEX_URL=https://download.pytorch.org/whl/cpu
 RUN --mount=type=cache,target=/root/.cache/pip \
    --mount=type=bind,src=requirements-build.txt,target=requirements-build.txt \
@ -60,8 +62,10 @@ ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}

 RUN --mount=type=cache,target=/root/.cache/pip \
    --mount=type=cache,target=/root/.cache/ccache \
+    --mount=type=bind,source=.git,target=.git \
    VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \
-    pip install dist/*.whl
+    pip install dist/*.whl && \
+    rm -rf dist

 WORKDIR /workspace/

--- a/Dockerfile.neuron
+++ b/Dockerfile.neuron
@ -1,14 +1,17 @@
 # default base image
-ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.1.2-neuronx-py310-sdk2.19.1-ubuntu20.04"
+ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.1.2-neuronx-py310-sdk2.20.0-ubuntu20.04"

 FROM $BASE_IMAGE

 RUN echo "Base image is $BASE_IMAGE"

 # Install some basic utilities
-RUN apt-get update \
-    && apt-get install python3 python3-pip -y \
-    && apt-get install -y ffmpeg libsm6 libxext6 libgl1 
+RUN apt-get update && \
+    apt-get install -y \
+        git \
+        python3 \
+        python3-pip \
+        ffmpeg libsm6 libxext6 libgl1

 ### Mount Point ###
 # When launching the container, mount the code directory to /app
@ -20,19 +23,19 @@ RUN python3 -m pip install --upgrade pip
 RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas
 RUN python3 -m pip install sentencepiece transformers==4.36.2 -U
 RUN python3 -m pip install transformers-neuronx --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
-RUN python3 -m pip install --pre neuronx-cc==2.12.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
+RUN python3 -m pip install --pre neuronx-cc==2.15.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U

-COPY ./vllm /app/vllm/vllm
-COPY ./setup.py /app/vllm/setup.py
-COPY ./requirements-common.txt /app/vllm/requirements-common.txt
-COPY ./requirements-neuron.txt /app/vllm/requirements-neuron.txt
+COPY . /app/vllm

 RUN cd /app/vllm \
-    && python3 -m pip install -U -r requirements-neuron.txt
+    && python3 -m pip install -U \
+        'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \
+        -r requirements-neuron.txt

 ENV VLLM_TARGET_DEVICE neuron
-RUN cd /app/vllm \
-    && pip install -e . \
+RUN --mount=type=bind,source=.git,target=.git \
+    cd /app/vllm \
+    && pip install --no-build-isolation -v -e . \
    && cd ..

 CMD ["/bin/bash"]
--- a/Dockerfile.openvino
+++ b/Dockerfile.openvino
@ -4,8 +4,9 @@
 FROM ubuntu:22.04 AS dev

 RUN apt-get update -y && \
-    apt-get install -y python3-pip git && \
-    apt-get install -y ffmpeg libsm6 libxext6 libgl1 
+    apt-get install -y \
+        git python3-pip \
+        ffmpeg libsm6 libxext6 libgl1
 WORKDIR /workspace

 # copy requirements
--- a/Dockerfile.ppc64le
+++ b/Dockerfile.ppc64le
@ -16,9 +16,15 @@ COPY ./ /workspace/vllm
 WORKDIR /workspace/vllm

 # These packages will be in rocketce eventually
-RUN pip install -v cmake xformers torch==2.3.1 uvloop==0.20.0 -r requirements-cpu.txt --prefer-binary --extra-index-url https://repo.fury.io/mgiessing
+RUN --mount=type=cache,target=/root/.cache/pip  \
+    pip install -v --prefer-binary --extra-index-url https://repo.fury.io/mgiessing \
+        'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \
+        torch==2.3.1 \
+        -r requirements-cpu.txt \
+        xformers uvloop==0.20.0

-RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install
+RUN --mount=type=bind,source=.git,target=.git \
+    VLLM_TARGET_DEVICE=cpu python3 setup.py install

 WORKDIR /workspace/

--- a/Dockerfile.rocm
+++ b/Dockerfile.rocm
@ -1,5 +1,5 @@
-# Default ROCm 6.1 base image
-ARG BASE_IMAGE="rocm/pytorch:rocm6.1.2_ubuntu20.04_py3.9_pytorch_staging"
+# Default ROCm 6.2 base image
+ARG BASE_IMAGE="rocm/pytorch:rocm6.2_ubuntu20.04_py3.9_pytorch_release_2.3.0"

 # Default ROCm ARCHes to build vLLM for.
 ARG PYTORCH_ROCM_ARCH="gfx908;gfx90a;gfx942;gfx1100"
@ -7,18 +7,12 @@ ARG PYTORCH_ROCM_ARCH="gfx908;gfx90a;gfx942;gfx1100"
 # Whether to install CK-based flash-attention
 # If 0, will not install flash-attention
 ARG BUILD_FA="1"
-# If `TRY_FA_WHEEL=1`, we will try installing flash-attention from `FA_WHEEL_URL`
-# If this succeeds, we use the downloaded wheel and skip building flash-attention.
-# Otherwise, ROCm flash-attention from `FA_BRANCH` will be built for the
-# architectures specified in `FA_GFX_ARCHS`
-ARG TRY_FA_WHEEL="1"
-ARG FA_WHEEL_URL="https://github.com/ROCm/flash-attention/releases/download/v2.5.9post1-cktile-vllm/flash_attn-2.5.9.post1-cp39-cp39-linux_x86_64.whl"
 ARG FA_GFX_ARCHS="gfx90a;gfx942"
-ARG FA_BRANCH="23a2b1c2"
+ARG FA_BRANCH="3cea2fb"

 # Whether to build triton on rocm
 ARG BUILD_TRITON="1"
-ARG TRITON_BRANCH="e0fc12c"
+ARG TRITON_BRANCH="e192dba"

 ### Base image build stage
 FROM $BASE_IMAGE AS base
@ -50,14 +44,17 @@ RUN python3 -m pip install --upgrade pip
 # Remove sccache so it doesn't interfere with ccache
 # TODO: implement sccache support across components
 RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)"
-# Install torch == 2.5.0 on ROCm
-RUN case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \
-        *"rocm-6.1"*) \
+
+# Install torch == 2.6.0 on ROCm
+RUN --mount=type=cache,target=/root/.cache/pip \
+    case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \
+        *"rocm-6.2"*) \
            python3 -m pip uninstall -y torch torchvision \
-            && python3 -m pip install --no-cache-dir --pre \
-                torch==2.5.0.dev20240726 \
-                torchvision==0.20.0.dev20240726 \
-               --index-url https://download.pytorch.org/whl/nightly/rocm6.1;; \
+            && python3 -m pip install --pre \
+                torch==2.6.0.dev20240918 \
+                'setuptools-scm>=8' \
+                torchvision==0.20.0.dev20240918 \
+                --extra-index-url https://download.pytorch.org/whl/nightly/rocm6.2;; \
        *) ;; esac

 ENV LLVM_SYMBOLIZER_PATH=/opt/rocm/llvm/bin/llvm-symbolizer
@ -79,25 +76,18 @@ RUN cd /opt/rocm/share/amd_smi \
 ### Flash-Attention wheel build stage
 FROM base AS build_fa
 ARG BUILD_FA
-ARG TRY_FA_WHEEL
-ARG FA_WHEEL_URL
 ARG FA_GFX_ARCHS
 ARG FA_BRANCH
 # Build ROCm flash-attention wheel if `BUILD_FA = 1`
 RUN --mount=type=cache,target=${CCACHE_DIR} \
    if [ "$BUILD_FA" = "1" ]; then \
-        if [ "${TRY_FA_WHEEL}" = "1" ] && python3 -m pip install "${FA_WHEEL_URL}"; then \
-            # If a suitable wheel exists, we download it instead of building FA
-            mkdir -p /install && wget -N "${FA_WHEEL_URL}" -P /install; \
-        else \
-            mkdir -p libs \
-            && cd libs \
-            && git clone https://github.com/ROCm/flash-attention.git \
-            && cd flash-attention \
-            && git checkout "${FA_BRANCH}" \
-            && git submodule update --init \
-            && GPU_ARCHS="${FA_GFX_ARCHS}" python3 setup.py bdist_wheel --dist-dir=/install; \
-        fi; \
+        mkdir -p libs \
+        && cd libs \
+        && git clone https://github.com/ROCm/flash-attention.git \
+        && cd flash-attention \
+        && git checkout "${FA_BRANCH}" \
+        && git submodule update --init \
+        && GPU_ARCHS="${FA_GFX_ARCHS}" python3 setup.py bdist_wheel --dist-dir=/install; \
    # Create an empty directory otherwise as later build stages expect one
    else mkdir -p /install; \
    fi
@ -112,6 +102,7 @@ RUN --mount=type=cache,target=${CCACHE_DIR} \
    if [ "$BUILD_TRITON" = "1" ]; then \
    mkdir -p libs \
    && cd libs \
+    && python3 -m pip install ninja cmake wheel pybind11 \
    && git clone https://github.com/OpenAI/triton.git \
    && cd triton \
    && git checkout "${TRITON_BRANCH}" \
@ -129,7 +120,7 @@ COPY . .

 # Package upgrades for useful functionality or to avoid dependency issues
 RUN --mount=type=cache,target=/root/.cache/pip \
-    python3 -m pip install --upgrade numba scipy huggingface-hub[cli]
+    python3 -m pip install --upgrade numba scipy huggingface-hub[cli] pytest-shard


 # Workaround for ray >= 2.10.0
@ -138,15 +129,9 @@ ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
 ENV TOKENIZERS_PARALLELISM=false

 RUN --mount=type=cache,target=${CCACHE_DIR} \
+    --mount=type=bind,source=.git,target=.git \
    --mount=type=cache,target=/root/.cache/pip \
    python3 -m pip install -Ur requirements-rocm.txt \
-    && case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \
-        *"rocm-6.1"*) \
-            # Bring in upgrades to HIP graph earlier than ROCm 6.2 for vLLM
-            wget -N https://github.com/ROCm/vllm/raw/fa78403/rocm_patch/libamdhip64.so.6 -P /opt/rocm/lib \
-            # Prevent interference if torch bundles its own HIP runtime
-            && rm -f "$(python3 -c 'import torch; print(torch.__path__[0])')"/lib/libamdhip64.so* || true;; \
-        *) ;; esac \
    && python3 setup.py clean --all \
    && python3 setup.py develop

--- a/Dockerfile.tpu
+++ b/Dockerfile.tpu
@ -5,16 +5,25 @@ FROM $BASE_IMAGE
 WORKDIR /workspace

 # Install some basic utilities
-RUN apt-get update && apt-get install -y ffmpeg libsm6 libxext6 libgl1 
+RUN apt-get update && apt-get install -y \
+    git \
+    ffmpeg libsm6 libxext6 libgl1

 # Install the TPU and Pallas dependencies.
-RUN python3 -m pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html
-RUN python3 -m pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
+RUN --mount=type=cache,target=/root/.cache/pip \
+    python3 -m pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html
+RUN --mount=type=cache,target=/root/.cache/pip \
+    python3 -m pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html

 # Build vLLM.
 COPY . /workspace/vllm
 ENV VLLM_TARGET_DEVICE="tpu"
-RUN cd /workspace/vllm && python3 -m pip install -r requirements-tpu.txt
+RUN --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=bind,source=.git,target=.git \
+     cd /workspace/vllm && \
+    python3 -m pip install \
+        'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \
+        -r requirements-tpu.txt
 RUN cd /workspace/vllm && python3 setup.py develop

 CMD ["/bin/bash"]
--- a/Dockerfile.xpu
+++ b/Dockerfile.xpu
@ -1,21 +1,26 @@
-FROM intel/oneapi-basekit:2024.1.0-devel-ubuntu20.04
+FROM intel/oneapi-basekit:2024.2.1-0-devel-ubuntu22.04

 RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
    echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
    chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
-    rm /etc/apt/sources.list.d/intel-graphics.list && \
    wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
    echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
    chmod 644 /usr/share/keyrings/intel-graphics.gpg

-RUN apt-get update  -y \
-&& apt-get install -y curl libicu70 lsb-release git wget vim numactl python3 python3-pip ffmpeg libsm6 libxext6 libgl1 
+RUN apt-get update  -y && \
+    apt-get install -y curl libicu70 lsb-release git wget vim numactl python3 python3-pip ffmpeg libsm6 libxext6 libgl1
+
 COPY ./ /workspace/vllm

 WORKDIR /workspace/vllm

-RUN pip install -v -r requirements-xpu.txt
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install -v --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ \
+        'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \
+        -r requirements-xpu.txt

-RUN VLLM_TARGET_DEVICE=xpu python3 setup.py install
+RUN --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=bind,source=.git,target=.git \
+    VLLM_TARGET_DEVICE=xpu python3 setup.py install

 CMD ["/bin/bash"]
--- a/SECURITY.md
+++ b/SECURITY.md
@ -0,0 +1,12 @@
+# Security Policy
+
+## Reporting a Vulnerability
+
+If you believe you have found a security vulnerability in vLLM, we encourage you to let us know right away. 
+We will investigate all legitimate reports and do our best to quickly fix the problem.
+
+Please report security issues using https://github.com/vllm-project/vllm/security/advisories/new
+
+---
+Please see PyTorch Security for more information on how to securely interact with models: https://github.com/pytorch/pytorch/blob/main/SECURITY.md
+This document mostly references the recommendation from PyTorch, thank you! 
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@ -25,6 +25,7 @@ class RequestFuncInput:
    best_of: int = 1
    use_beam_search: bool = False
    logprobs: Optional[int] = None
+    multi_modal_content: Optional[dict] = None


@dataclass
@ -312,12 +313,15 @@ async def async_request_openai_chat_completions(

    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
        assert not request_func_input.use_beam_search
+        content = [{"type": "text", "text": request_func_input.prompt}]
+        if request_func_input.multi_modal_content:
+            content.append(request_func_input.multi_modal_content)
        payload = {
            "model": request_func_input.model,
            "messages": [
                {
                    "role": "user",
-                    "content": request_func_input.prompt,
+                    "content": content
                },
            ],
            "temperature": 0.0,
--- a/benchmarks/benchmark_prioritization.py
+++ b/benchmarks/benchmark_prioritization.py
@ -0,0 +1,295 @@
+"""Benchmark offline prioritization."""
+import argparse
+import json
+import random
+import time
+from typing import List, Optional, Tuple
+
+from transformers import AutoTokenizer, PreTrainedTokenizerBase
+
+from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
+
+
+def sample_requests(
+    dataset_path: str,
+    num_requests: int,
+    tokenizer: PreTrainedTokenizerBase,
+    fixed_output_len: Optional[int],
+) -> List[Tuple[str, int, int]]:
+    if fixed_output_len is not None and fixed_output_len < 4:
+        raise ValueError("output_len too small")
+
+    # Load the dataset.
+    with open(dataset_path) as f:
+        dataset = json.load(f)
+    # Filter out the conversations with less than 2 turns.
+    dataset = [data for data in dataset if len(data["conversations"]) >= 2]
+    # Only keep the first two turns of each conversation.
+    dataset = [(data["conversations"][0]["value"],
+                data["conversations"][1]["value"]) for data in dataset]
+
+    # Shuffle the dataset.
+    random.shuffle(dataset)
+
+    # Filter out sequences that are too long or too short
+    filtered_dataset: List[Tuple[str, int, int]] = []
+    for i in range(len(dataset)):
+        if len(filtered_dataset) == num_requests:
+            break
+
+        # Tokenize the prompts and completions.
+        prompt = dataset[i][0]
+        prompt_token_ids = tokenizer(prompt).input_ids
+        completion = dataset[i][1]
+        completion_token_ids = tokenizer(completion).input_ids
+        prompt_len = len(prompt_token_ids)
+        output_len = len(completion_token_ids
+                         ) if fixed_output_len is None else fixed_output_len
+        if prompt_len < 4 or output_len < 4:
+            # Prune too short sequences.
+            continue
+        if prompt_len > 1024 or prompt_len + output_len > 2048:
+            # Prune too long sequences.
+            continue
+
+        #Select a equi-probable random priority
+        priority = 0 if random.random() < 0.5 else 1
+
+        filtered_dataset.append((prompt, prompt_len, output_len, priority))
+
+    return filtered_dataset
+
+
+def run_vllm(
+    requests: List[Tuple[str, int, int, int]],
+    model: str,
+    tokenizer: str,
+    quantization: Optional[str],
+    tensor_parallel_size: int,
+    seed: int,
+    n: int,
+    use_beam_search: bool,
+    trust_remote_code: bool,
+    dtype: str,
+    max_model_len: Optional[int],
+    enforce_eager: bool,
+    kv_cache_dtype: str,
+    quantization_param_path: Optional[str],
+    device: str,
+    enable_prefix_caching: bool,
+    enable_chunked_prefill: bool,
+    max_num_batched_tokens: int,
+    gpu_memory_utilization: float = 0.9,
+    download_dir: Optional[str] = None,
+) -> float:
+    """Run all requests through an offline vLLM engine and time generation.
+
+    Each request is a (prompt, prompt_len, output_len, priority) tuple; the
+    prompt length is not used here. The remaining arguments are forwarded to
+    the `LLM` constructor or to `SamplingParams`.
+
+    Returns:
+        Wall-clock seconds spent inside `llm.generate` (engine construction
+        is not included in the measurement).
+    """
+    from vllm import LLM, SamplingParams
+    llm = LLM(
+        model=model,
+        tokenizer=tokenizer,
+        quantization=quantization,
+        tensor_parallel_size=tensor_parallel_size,
+        seed=seed,
+        trust_remote_code=trust_remote_code,
+        dtype=dtype,
+        max_model_len=max_model_len,
+        gpu_memory_utilization=gpu_memory_utilization,
+        enforce_eager=enforce_eager,
+        kv_cache_dtype=kv_cache_dtype,
+        quantization_param_path=quantization_param_path,
+        device=device,
+        enable_prefix_caching=enable_prefix_caching,
+        download_dir=download_dir,
+        enable_chunked_prefill=enable_chunked_prefill,
+        max_num_batched_tokens=max_num_batched_tokens,
+        disable_log_stats=False,
+    )
+
+    # Add the requests to the engine.
+    # Three parallel lists, index-aligned: prompt i uses sampling_params[i]
+    # and priority[i].
+    prompts = []
+    sampling_params = []
+    priority = []
+    for prompt, _, output_len, _priority in requests:
+        prompts.append(prompt)
+        priority.append(_priority)
+        sampling_params.append(
+            SamplingParams(
+                n=n,
+                temperature=0.0 if use_beam_search else 1.0,
+                top_p=1.0,
+                use_beam_search=use_beam_search,
+                # ignore_eos with max_tokens=output_len requests exactly
+                # output_len generated tokens per sequence.
+                ignore_eos=True,
+                max_tokens=output_len,
+            ))
+
+    # Only the generate call is timed.
+    start = time.perf_counter()
+    llm.generate(prompts, sampling_params, priority=priority, use_tqdm=True)
+    end = time.perf_counter()
+    return end - start
+
+
+def main(args: argparse.Namespace):
+    print(args)
+    random.seed(args.seed)
+
+    # Sample the requests.
+    tokenizer = AutoTokenizer.from_pretrained(
+        args.tokenizer, trust_remote_code=args.trust_remote_code)
+    if args.dataset is None:
+        # Synthesize a prompt with the given input length.
+        prompt = "hi" * (args.input_len - 1)
+        requests = [(prompt, args.input_len, args.output_len)
+                    for _ in range(args.num_prompts)]
+    else:
+        requests = sample_requests(args.dataset, args.num_prompts, tokenizer,
+                                   args.output_len)
+
+    if args.backend == "vllm":
+        elapsed_time = run_vllm(
+            requests, args.model, args.tokenizer, args.quantization,
+            args.tensor_parallel_size, args.seed, args.n, args.use_beam_search,
+            args.trust_remote_code, args.dtype, args.max_model_len,
+            args.enforce_eager, args.kv_cache_dtype,
+            args.quantization_param_path, args.device,
+            args.enable_prefix_caching, args.enable_chunked_prefill,
+            args.max_num_batched_tokens, args.gpu_memory_utilization,
+            args.download_dir)
+    else:
+        raise ValueError(f"Unknown backend: {args.backend}")
+    total_num_tokens = sum(prompt_len + output_len
+                           for _, prompt_len, output_len, priority in requests)
+    print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
+          f"{total_num_tokens / elapsed_time:.2f} tokens/s")
+
+    # Output JSON results if specified
+    if args.output_json:
+        results = {
+            "elapsed_time": elapsed_time,
+            "num_requests": len(requests),
+            "total_num_tokens": total_num_tokens,
+            "requests_per_second": len(requests) / elapsed_time,
+            "tokens_per_second": total_num_tokens / elapsed_time,
+        }
+        with open(args.output_json, "w") as f:
+            json.dump(results, f, indent=4)
+
+
+# Command-line entry point: declare the benchmark options, validate the
+# dataset/length combination, then hand the parsed namespace to main().
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Benchmark the throughput.")
+    # NOTE(review): main() only implements the "vllm" backend; any other
+    # accepted choice ends in its "Unknown backend" ValueError.
+    parser.add_argument("--backend",
+                        type=str,
+                        choices=["vllm", "hf", "mii"],
+                        default="vllm")
+    parser.add_argument("--dataset",
+                        type=str,
+                        default=None,
+                        help="Path to the dataset.")
+    parser.add_argument("--input-len",
+                        type=int,
+                        default=None,
+                        help="Input prompt length for each request")
+    parser.add_argument("--output-len",
+                        type=int,
+                        default=None,
+                        help="Output length for each request. Overrides the "
+                        "output length from the dataset.")
+    parser.add_argument("--model", type=str, default="facebook/opt-125m")
+    parser.add_argument("--tokenizer", type=str, default=None)
+    parser.add_argument('--quantization',
+                        '-q',
+                        choices=[*QUANTIZATION_METHODS, None],
+                        default=None)
+    parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1)
+    parser.add_argument("--n",
+                        type=int,
+                        default=1,
+                        help="Number of generated sequences per prompt.")
+    parser.add_argument("--use-beam-search", action="store_true")
+    parser.add_argument("--num-prompts",
+                        type=int,
+                        default=200,
+                        help="Number of prompts to process.")
+    parser.add_argument("--seed", type=int, default=0)
+    parser.add_argument('--trust-remote-code',
+                        action='store_true',
+                        help='trust remote code from huggingface')
+    parser.add_argument(
+        '--max-model-len',
+        type=int,
+        default=None,
+        help='Maximum length of a sequence (including prompt and output). '
+        'If None, will be derived from the model.')
+    parser.add_argument(
+        '--dtype',
+        type=str,
+        default='auto',
+        choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'],
+        help='data type for model weights and activations. '
+        'The "auto" option will use FP16 precision '
+        'for FP32 and FP16 models, and BF16 precision '
+        'for BF16 models.')
+    parser.add_argument('--gpu-memory-utilization',
+                        type=float,
+                        default=0.9,
+                        help='the fraction of GPU memory to be used for '
+                        'the model executor, which can range from 0 to 1.'
+                        'If unspecified, will use the default value of 0.9.')
+    parser.add_argument("--enforce-eager",
+                        action="store_true",
+                        help="enforce eager execution")
+    parser.add_argument(
+        '--kv-cache-dtype',
+        type=str,
+        choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'],
+        default="auto",
+        help='Data type for kv cache storage. If "auto", will use model '
+        'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. '
+        'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)')
+    parser.add_argument(
+        '--quantization-param-path',
+        type=str,
+        default=None,
+        help='Path to the JSON file containing the KV cache scaling factors. '
+        'This should generally be supplied, when KV cache dtype is FP8. '
+        'Otherwise, KV cache scaling factors default to 1.0, which may cause '
+        'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
+        'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
+        'instead supported for common inference criteria.')
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="cuda",
+        choices=["cuda", "cpu"],
+        help='device type for vLLM execution, supporting CUDA and CPU.')
+    parser.add_argument(
+        "--enable-prefix-caching",
+        action='store_true',
+        help="enable automatic prefix caching for vLLM backend.")
+    parser.add_argument("--enable-chunked-prefill",
+                        action='store_true',
+                        help="enable chunked prefill for vLLM backend.")
+    parser.add_argument('--max-num-batched-tokens',
+                        type=int,
+                        default=None,
+                        help='maximum number of batched tokens per '
+                        'iteration')
+    parser.add_argument('--download-dir',
+                        type=str,
+                        default=None,
+                        help='directory to download and load the weights, '
+                        'default to the default cache dir of huggingface')
+    parser.add_argument(
+        '--output-json',
+        type=str,
+        default=None,
+        help='Path to save the throughput results in JSON format.')
+
+    args = parser.parse_args()
+    # The tokenizer defaults to the model's own tokenizer.
+    if args.tokenizer is None:
+        args.tokenizer = args.model
+    # Without a dataset, prompts are synthesized, so both lengths are
+    # required. With a dataset, prompt lengths come from the data and
+    # --input-len must not be given (--output-len may still override).
+    if args.dataset is None:
+        assert args.input_len is not None
+        assert args.output_len is not None
+    else:
+        assert args.input_len is None
+
+    main(args)
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@ -24,6 +24,8 @@ On the client side, run:
 """
 import argparse
 import asyncio
+import base64
+import io
 import json
 import os
 import random
@ -31,11 +33,13 @@ import time
 import warnings
 from dataclasses import dataclass
 from datetime import datetime
-from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple
+from typing import Any, AsyncGenerator, Collection, Dict, List, Optional, Tuple

 import numpy as np
 from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,
                                  RequestFuncOutput)
+from datasets import load_dataset
+from PIL.Image import Image
 from tqdm.asyncio import tqdm
 from transformers import PreTrainedTokenizerBase

@ -84,7 +88,7 @@ def sample_sharegpt_requests(
    num_requests: int,
    tokenizer: PreTrainedTokenizerBase,
    fixed_output_len: Optional[int] = None,
-) -> List[Tuple[str, int, int]]:
+) -> List[Tuple[str, int, int, None]]:
    if fixed_output_len is not None and fixed_output_len < 4:
        raise ValueError("output_len too small")
    # Load the dataset.
@ -119,7 +123,7 @@ def sample_sharegpt_requests(
        if prompt_len > 1024 or prompt_len + output_len > 2048:
            # Prune too long sequences.
            continue
-        filtered_dataset.append((prompt, prompt_len, output_len))
+        filtered_dataset.append((prompt, prompt_len, output_len, None))

    return filtered_dataset

@ -131,7 +135,7 @@ def sample_sonnet_requests(
    output_len: int,
    prefix_len: int,
    tokenizer: PreTrainedTokenizerBase,
-) -> List[Tuple[str, str, int, int]]:
+) -> List[Tuple[str, str, int, int, None]]:
    assert (
        input_len > prefix_len
    ), "'args.sonnet-input-len' must be greater than 'args.prefix-input-len'."
@ -189,7 +193,65 @@ def sample_sonnet_requests(
            message, add_generation_prompt=True, tokenize=False)
        prompt_len = len(tokenizer(prompt_formatted).input_ids)
        sampled_requests.append(
-            (prompt, prompt_formatted, prompt_len, output_len))
+            (prompt, prompt_formatted, prompt_len, output_len, None))
+
+    return sampled_requests
+
+
+def sample_hf_requests(
+    dataset_path: str,
+    dataset_subset: str,
+    dataset_split: str,
+    num_requests: int,
+    tokenizer: PreTrainedTokenizerBase,
+    fixed_output_len: Optional[int] = None,
+) -> List[Tuple[str, int, int, Optional[Dict[str, Collection[str]]]]]:
+    """Sample requests from a streamed dataset loaded via `load_dataset`.
+
+    The dataset must expose a "conversations" column. The first two turns of
+    each conversation are used as prompt and completion. If a sample has an
+    "image" field holding a PIL image, it is re-encoded as a base64 JPEG
+    data URL wrapped in an "image_url" content dict.
+
+    Args:
+        dataset_path: Passed to `load_dataset` as the dataset path.
+        dataset_subset: Passed to `load_dataset` as `name`.
+        dataset_split: Passed to `load_dataset` as `split`.
+        num_requests: Maximum number of requests to return.
+        tokenizer: Tokenizer used to measure prompt/completion lengths.
+        fixed_output_len: If not None, used as the output length of every
+            request instead of the tokenized completion length.
+
+    Returns:
+        A list of (prompt, prompt_len, output_len, mm_content) tuples, where
+        mm_content is None for samples without an image.
+    """
+    dataset = load_dataset(dataset_path,
+                           name=dataset_subset,
+                           split=dataset_split,
+                           streaming=True)
+    # NOTE(review): `.features` is read on the object returned above; this
+    # presumably requires a concrete split to be specified - confirm.
+    assert "conversations" in dataset.features, (
+        "HF Dataset must have 'conversations' column.")
+    filtered_dataset = dataset.shuffle().filter(
+        lambda x: len(x["conversations"]) >= 2)
+    sampled_requests: List[Tuple[str, int, int,
+                                 Optional[Dict[str, Collection[str]]]]] = []
+    for data in filtered_dataset:
+        if len(sampled_requests) == num_requests:
+            break
+
+        # Tokenize the prompts and completions.
+        prompt = data["conversations"][0]["value"]
+        prompt_token_ids = tokenizer(prompt).input_ids
+        completion = data["conversations"][1]["value"]
+        completion_token_ids = tokenizer(completion).input_ids
+        prompt_len = len(prompt_token_ids)
+        output_len = len(completion_token_ids
+                         ) if fixed_output_len is None else fixed_output_len
+        if prompt_len < 4 or output_len < 4:
+            # Prune too short sequences.
+            continue
+        if prompt_len > 1024 or prompt_len + output_len > 2048:
+            # Prune too long sequences.
+            continue
+
+        if "image" in data and isinstance(data["image"], Image):
+            image: Image = data["image"]
+            # Convert to RGB before saving as JPEG.
+            image = image.convert("RGB")
+            image_data = io.BytesIO()
+            image.save(image_data, format='JPEG')
+            image_base64 = base64.b64encode(
+                image_data.getvalue()).decode("utf-8")
+            # Image content part carrying the picture as an inline data URL.
+            mm_content = {
+                "type": "image_url",
+                "image_url": {
+                    "url": f"data:image/jpeg;base64,{image_base64}"
+                },
+            }
+        else:
+            mm_content = None
+
+        sampled_requests.append((prompt, prompt_len, output_len, mm_content))

    return sampled_requests

@ -223,8 +285,8 @@ def sample_random_requests(
                                  [(offsets[i] + i + j) % tokenizer.vocab_size
                                   for j in range(input_lens[i])])

-        input_requests.append(
-            (prompt, int(prefix_len + input_lens[i]), int(output_lens[i])))
+        input_requests.append((prompt, int(prefix_len + input_lens[i]),
+                               int(output_lens[i]), None))

    return input_requests

@ -343,7 +405,12 @@ async def benchmark(
        raise ValueError(f"Unknown backend: {backend}")

    print("Starting initial single prompt test run...")
-    test_prompt, test_prompt_len, test_output_len = input_requests[0]
+    test_prompt, test_prompt_len, test_output_len, test_mm_content = (
+        input_requests[0])
+    if backend != "openai-chat" and test_mm_content is not None:
+        # multi-modal benchmark is only available on OpenAI Chat backend.
+        raise ValueError(
+            "Multi-modal content is only supported on 'openai-chat' backend.")
    test_input = RequestFuncInput(
        model=model_id,
        prompt=test_prompt,
@ -353,6 +420,7 @@ async def benchmark(
        logprobs=logprobs,
        best_of=best_of,
        use_beam_search=use_beam_search,
+        multi_modal_content=test_mm_content,
    )
    test_output = await request_func(request_func_input=test_input)
    if not test_output.success:
@ -373,6 +441,7 @@ async def benchmark(
            logprobs=logprobs,
            best_of=best_of,
            use_beam_search=use_beam_search,
+            multi_modal_content=test_mm_content,
        )
        profile_output = await request_func(request_func_input=profile_input)
        if profile_output.success:
@ -385,7 +454,7 @@ async def benchmark(
    benchmark_start_time = time.perf_counter()
    tasks: List[asyncio.Task] = []
    async for request in get_request(input_requests, request_rate):
-        prompt, prompt_len, output_len = request
+        prompt, prompt_len, output_len, mm_content = request
        request_func_input = RequestFuncInput(
            model=model_id,
            prompt=prompt,
@ -395,6 +464,7 @@ async def benchmark(
            logprobs=logprobs,
            best_of=best_of,
            use_beam_search=use_beam_search,
+            multi_modal_content=mm_content,
        )
        tasks.append(
            asyncio.create_task(
@ -556,9 +626,9 @@ def main(args: argparse.Namespace):
                prefix_len=args.sonnet_prefix_len,
                tokenizer=tokenizer,
            )
-            input_requests = [(prompt, prompt_len, output_len)
+            input_requests = [(prompt, prompt_len, output_len, None)
                              for prompt, prompt_formatted, prompt_len,
-                              output_len in input_requests]
+                              output_len, _ in input_requests]
        else:
            assert (
                tokenizer.chat_template or tokenizer.default_chat_template
@ -571,9 +641,19 @@ def main(args: argparse.Namespace):
                prefix_len=args.sonnet_prefix_len,
                tokenizer=tokenizer,
            )
-            input_requests = [(prompt_formatted, prompt_len, output_len)
+            input_requests = [(prompt_formatted, prompt_len, output_len, None)
                              for prompt, prompt_formatted, prompt_len,
-                              output_len in input_requests]
+                              output_len, _ in input_requests]
+
+    elif args.dataset_name == "hf":
+        input_requests = sample_hf_requests(
+            dataset_path=args.dataset_path,
+            dataset_subset=args.hf_subset,
+            dataset_split=args.hf_split,
+            num_requests=args.num_prompts,
+            tokenizer=tokenizer,
+            fixed_output_len=args.hf_output_len,
+        )

    elif args.dataset_name == "random":
        input_requests = sample_random_requests(
@ -685,13 +765,14 @@ if __name__ == "__main__":
        "--dataset-name",
        type=str,
        default="sharegpt",
-        choices=["sharegpt", "sonnet", "random"],
+        choices=["sharegpt", "sonnet", "random", "hf"],
        help="Name of the dataset to benchmark on.",
    )
    parser.add_argument("--dataset-path",
                        type=str,
                        default=None,
-                        help="Path to the dataset.")
+                        help="Path to the sharegpt/sonnet dataset. "
+                        "Or the huggingface dataset ID if using HF dataset.")
    parser.add_argument(
        "--model",
        type=str,
@ -718,26 +799,6 @@ if __name__ == "__main__":
        default=1000,
        help="Number of prompts to process.",
    )
-    parser.add_argument(
-        "--sharegpt-output-len",
-        type=int,
-        default=None,
-        help="Output length for each request. Overrides the output length "
-        "from the ShareGPT dataset.")
-    parser.add_argument(
-        "--sonnet-input-len",
-        type=int,
-        default=550,
-        help=
-        "Number of input tokens per request, used only for sonnet dataset.",
-    )
-    parser.add_argument(
-        "--sonnet-output-len",
-        type=int,
-        default=150,
-        help=
-        "Number of output tokens per request, used only for sonnet dataset.",
-    )
    parser.add_argument(
        "--logprobs",
        type=int,
@ -748,42 +809,6 @@ if __name__ == "__main__":
              "logprob is returned for each token; or (2) if beam search "
              "is enabled 1 logprob per token is computed"),
    )
-    parser.add_argument(
-        "--sonnet-prefix-len",
-        type=int,
-        default=200,
-        help=
-        "Number of prefix tokens per request, used only for sonnet dataset.",
-    )
-    parser.add_argument(
-        "--random-input-len",
-        type=int,
-        default=1024,
-        help=
-        "Number of input tokens per request, used only for random sampling.",
-    )
-    parser.add_argument(
-        "--random-output-len",
-        type=int,
-        default=128,
-        help=
-        "Number of output tokens per request, used only for random sampling.",
-    )
-    parser.add_argument(
-        "--random-range-ratio",
-        type=float,
-        default=1.0,
-        help="Range of sampled ratio of input/output length, "
-        "used only for random sampling.",
-    )
-    parser.add_argument(
-        "--random-prefix-len",
-        type=int,
-        default=0,
-        help="Number of fixed prefix tokens before random "
-        " context. The length range of context in a random "
-        " request is [random-prefix-len, "
-        " random-prefix-len + random-prefix-len * random-range-ratio).")
    parser.add_argument(
        "--request-rate",
        type=float,
@ -857,5 +882,85 @@ if __name__ == "__main__":
        "Use \"--percentile-metrics\" to select metrics.",
    )

+    # group for dataset specific arguments
+    sonnet_group = parser.add_argument_group("sonnet dataset options")
+    sonnet_group.add_argument(
+        "--sonnet-input-len",
+        type=int,
+        default=550,
+        help=
+        "Number of input tokens per request, used only for sonnet dataset.",
+    )
+    sonnet_group.add_argument(
+        "--sonnet-output-len",
+        type=int,
+        default=150,
+        help=
+        "Number of output tokens per request, used only for sonnet dataset.",
+    )
+    sonnet_group.add_argument(
+        "--sonnet-prefix-len",
+        type=int,
+        default=200,
+        help=
+        "Number of prefix tokens per request, used only for sonnet dataset.",
+    )
+
+    sharegpt_group = parser.add_argument_group("sharegpt dataset options")
+    sharegpt_group.add_argument(
+        "--sharegpt-output-len",
+        type=int,
+        default=None,
+        help="Output length for each request. Overrides the output length "
+        "from the ShareGPT dataset.")
+
+    random_group = parser.add_argument_group("random dataset options")
+    random_group.add_argument(
+        "--random-input-len",
+        type=int,
+        default=1024,
+        help=
+        "Number of input tokens per request, used only for random sampling.",
+    )
+    random_group.add_argument(
+        "--random-output-len",
+        type=int,
+        default=128,
+        help=
+        "Number of output tokens per request, used only for random sampling.",
+    )
+    random_group.add_argument(
+        "--random-range-ratio",
+        type=float,
+        default=1.0,
+        help="Range of sampled ratio of input/output length, "
+        "used only for random sampling.",
+    )
+    random_group.add_argument(
+        "--random-prefix-len",
+        type=int,
+        default=0,
+        help="Number of fixed prefix tokens before random "
+        " context. The length range of context in a random "
+        " request is [random-prefix-len, "
+        " random-prefix-len + random-prefix-len * random-range-ratio).")
+
+    hf_group = parser.add_argument_group("hf dataset options")
+    hf_group.add_argument("--hf-subset",
+                          type=str,
+                          default=None,
+                          help="Subset of the HF dataset.")
+    hf_group.add_argument("--hf-split",
+                          type=str,
+                          default=None,
+                          help="Split of the HF dataset.")
+    hf_group.add_argument(
+        "--hf-output-len",
+        type=int,
+        default=None,
+        help="Output length for each request. Overrides the output lengths "
+        "from the sampled HF dataset.",
+    )
+
    args = parser.parse_args()
-    main(args)
+    main(args)
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@ -90,6 +90,7 @@ def run_vllm(
    download_dir: Optional[str] = None,
    load_format: str = EngineArgs.load_format,
    disable_async_output_proc: bool = False,
+    use_new_beam_search_impl: bool = False,
 ) -> float:
    from vllm import LLM, SamplingParams
    llm = LLM(
@ -132,9 +133,23 @@ def run_vllm(
                max_tokens=output_len,
            ))

-    start = time.perf_counter()
-    llm.generate(prompts, sampling_params, use_tqdm=True)
-    end = time.perf_counter()
+    if not use_new_beam_search_impl:
+        start = time.perf_counter()
+        llm.generate(prompts, sampling_params, use_tqdm=True)
+        end = time.perf_counter()
+    else:
+        assert use_beam_search
+        prompts = [prompt for prompt, _, _ in requests]
+        # output_len should be the same for all requests.
+        output_len = requests[0][2]
+        for prompt, input_len, _output_len in requests:
+            assert _output_len == output_len
+        start = time.perf_counter()
+        llm.beam_search(prompts,
+                        beam_width=n,
+                        max_tokens=output_len,
+                        ignore_eos=True)
+        end = time.perf_counter()
    return end - start


@ -191,7 +206,6 @@ async def run_vllm_async(
        use_v2_block_manager=use_v2_block_manager,
        disable_async_output_proc=disable_async_output_proc,
        worker_use_ray=False,
-        engine_use_ray=False,
        disable_log_requests=True,
    )

@ -337,7 +351,7 @@ def main(args: argparse.Namespace):
            run_args.append(args.disable_frontend_multiprocessing)
            elapsed_time = uvloop.run(run_vllm_async(*run_args))
        else:
-            elapsed_time = run_vllm(*run_args)
+            elapsed_time = run_vllm(*run_args, args.use_new_beam_search_impl)
    elif args.backend == "hf":
        assert args.tensor_parallel_size == 1
        elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
@ -397,6 +411,7 @@ if __name__ == "__main__":
                        default=1,
                        help="Number of generated sequences per prompt.")
    parser.add_argument("--use-beam-search", action="store_true")
+    parser.add_argument("--use-new-beam-search-impl", action="store_true")
    parser.add_argument("--num-prompts",
                        type=int,
                        default=1000,
--- a/benchmarks/kernels/benchmark_layernorm.py
+++ b/benchmarks/kernels/benchmark_layernorm.py
@ -1,10 +1,10 @@
-import random
 import time

 import torch

 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
+from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser,
+                        seed_everything)


@torch.inference_mode()
@ -16,10 +16,7 @@ def main(num_tokens: int,
         do_profile: bool = False,
         num_warmup_iters: int = 5,
         num_iters: int = 100) -> None:
-    random.seed(seed)
-    torch.random.manual_seed(seed)
-    if torch.cuda.is_available():
-        torch.cuda.manual_seed(seed)
+    seed_everything(seed)
    torch.set_default_device("cuda")

    layer = RMSNorm(hidden_size).to(dtype=dtype)
--- a/benchmarks/kernels/benchmark_machete.py
+++ b/benchmarks/kernels/benchmark_machete.py
@ -4,8 +4,10 @@ import itertools
 import math
 import pickle as pkl
 import time
-from typing import Callable, Iterable, List, Tuple
+from itertools import product
+from typing import Callable, Iterable, List, Optional, Tuple

+import pandas as pd
 import torch
 import torch.utils.benchmark as TBenchmark
 from torch.utils.benchmark import Measurement as TMeasurement
@ -84,6 +86,10 @@ def loop_over_weights(
        fn(a, w_ref, w_q, w_s)


+_SWEEP_SCHEDULES_RESULTS: Optional[pd.DataFrame] = None
+_SWEEP_SCHEDULES_RESULTS_CSV: Optional[str] = None
+
+
 def bench(atype: torch.dtype,
          wtype: ScalarType,
          group_size: int,
@ -94,6 +100,8 @@ def bench(atype: torch.dtype,
          sub_label: str,
          benchmark_marlinv1: bool = True,
          sweep_schedules: bool = True) -> Iterable[TMeasurement]:
+    global _SWEEP_SCHEDULES_RESULTS
+
    a, weights = make_bench_tensors(atype, wtype, group_size, m, n, k)
    sub_label += f", L={len(weights)}"

@ -163,6 +171,11 @@ def bench(atype: torch.dtype,
        best_schedule = None
        schedules = ops.machete_supported_schedules(wtype)
        for schedule in reversed(schedules):
+            schedule_M = int(schedule.split("_")[0].split("x")[1])
+
+            # Prune known bad schedules
+            if schedule_M >= 2 * max(m, 16) or schedule_M < m // 4:
+                continue

            def run(a, _, w_q, w_s, schedule=schedule):
                ops.machete_gemm(a,
@ -175,6 +188,20 @@ def bench(atype: torch.dtype,
            res = bench_fn(label, sub_label, "machete_best",
                           lambda: loop_over_weights(a, weights_machete, run))

+            results_row = {
+                "M": m,
+                "K": k,
+                "N": n,
+                "group_size": group_size,
+                "schedule": schedule,
+                "median": res.median,
+            }
+            if _SWEEP_SCHEDULES_RESULTS is None:
+                _SWEEP_SCHEDULES_RESULTS = pd.DataFrame(
+                    columns=results_row.keys())
+            _SWEEP_SCHEDULES_RESULTS.\
+                loc[len(_SWEEP_SCHEDULES_RESULTS)] = results_row
+
            print(f"  {res.median:5.5} ", schedule)
            if not best or res.median < best.median:
                best = res
@ -235,18 +262,22 @@ def run_square_bench(args):
    dim_sizes = list(
        range(args.dim_start, args.dim_end + 1, args.dim_increment))
    MKNs = list(zip(dim_sizes, dim_sizes, dim_sizes))
+
    data = run(args.dtype, args.sweep_schedules, MKNs)

    make_output(data, MKNs, f"square_bench-{args.dtype}")


 def run_range_bench(args):
+    """Benchmark every (M, K, N) combination drawn from three ranges.
+
+    ``args.dim_start``, ``args.dim_end`` and ``args.dim_increment`` are each
+    a comma separated "M,K,N" string of three integers. The end values are
+    inclusive. The Cartesian product of the three ranges is benchmarked and
+    the results are handed to ``make_output``.
+    """
-    dim_sizes = list(range(args.dim_start, args.dim_end, args.dim_increment))
-    n = len(dim_sizes)
-    Ms = [args.m_constant] * n if args.m_constant is not None else dim_sizes
-    Ks = [args.k_constant] * n if args.k_constant is not None else dim_sizes
-    Ns = [args.n_constant] * n if args.n_constant is not None else dim_sizes
-    MKNs = list(zip(Ms, Ks, Ns))
+    m_start, k_start, n_start = [int(x) for x in args.dim_start.split(",")]
+    m_end, k_end, n_end = [int(x) for x in args.dim_end.split(",")]
+    m_increment, k_increment, n_increment = \
+        [int(x) for x in args.dim_increment.split(",")]
+    Ms = list(range(m_start, m_end + 1, m_increment))
+    Ks = list(range(k_start, k_end + 1, k_increment))
+    Ns = list(range(n_start, n_end + 1, n_increment))
+    MKNs = list(product(Ms, Ks, Ns))
+
    data = run(args.dtype, args.sweep_schedules, MKNs)

    make_output(data, MKNs, f"range_bench-{args.dtype}")
@ -333,6 +364,9 @@ Benchmark Machete GEMM.
        action="store_true",
        help="Run a sweep over all supported schedules",
    )
+    parser.add_argument("--sweep-csv-out",
+                        help="CSV to store sweep results",
+                        default="sch_sweep_results.csv")
    subparsers = parser.add_subparsers(dest="cmd", required=True)

    square_parser = subparsers.add_parser("square_bench")
@ -342,12 +376,21 @@ Benchmark Machete GEMM.
    square_parser.set_defaults(func=run_square_bench)

    range_parser = subparsers.add_parser("range_bench")
-    range_parser.add_argument("--dim-start", type=int, required=True)
-    range_parser.add_argument("--dim-end", type=int, required=True)
-    range_parser.add_argument("--dim-increment", type=int, required=True)
-    range_parser.add_argument("--m-constant", type=int, default=None)
-    range_parser.add_argument("--n-constant", type=int, default=None)
-    range_parser.add_argument("--k-constant", type=int, default=None)
+    # Each option takes one "M,K,N" triple, e.g. --dim-start 128,16384,16384;
+    # run_range_bench splits the string on "," and converts each part to int.
+    range_parser.add_argument(
+        "--dim-start",
+        type=str,
+        required=True,
+        help="Start value for M,K,N as comma separated list")
+    range_parser.add_argument(
+        "--dim-end",
+        type=str,
+        required=True,
+        help="End value (inclusive) for M,K,N as comma separated list")
+    range_parser.add_argument(
+        "--dim-increment",
+        type=str,
+        required=True,
+        help="Increment value for M,K,N as comma separated list")
    range_parser.set_defaults(func=run_range_bench)

    model_parser = subparsers.add_parser("model_bench")
@ -369,4 +412,9 @@ Benchmark Machete GEMM.
    model_parser.set_defaults(func=run_model_bench)

    args = parser.parse_args()
+
+    _SWEEP_SCHEDULES_RESULTS_CSV = args.sweep_csv_out
    args.func(args)
+
+    if _SWEEP_SCHEDULES_RESULTS is not None:
+        _SWEEP_SCHEDULES_RESULTS.to_csv(_SWEEP_SCHEDULES_RESULTS_CSV)
--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@ -10,7 +10,7 @@ from ray.experimental.tqdm_ray import tqdm
 from transformers import AutoConfig

 from vllm.model_executor.layers.fused_moe.fused_moe import *
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils import FlexibleArgumentParser, seed_everything


 class BenchmarkConfig(TypedDict):
@ -166,7 +166,7 @@ class BenchmarkWorker:

    def __init__(self, seed: int) -> None:
        torch.set_default_device("cuda")
-        torch.cuda.manual_seed_all(seed)
+        seed_everything(seed)
        self.seed = seed

    def benchmark(
@ -180,7 +180,7 @@ class BenchmarkWorker:
        use_fp8_w8a8: bool,
        use_int8_w8a16: bool,
    ) -> Tuple[Dict[str, int], float]:
-        torch.cuda.manual_seed_all(self.seed)
+        seed_everything(self.seed)
        dtype_str = get_config_dtype_str(dtype,
                                         use_int8_w8a16=use_int8_w8a16,
                                         use_fp8_w8a8=use_fp8_w8a8)
--- a/benchmarks/kernels/benchmark_paged_attention.py
+++ b/benchmarks/kernels/benchmark_paged_attention.py
@ -6,7 +6,7 @@ import torch

 from vllm import _custom_ops as ops
 from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser,
-                        create_kv_caches_with_random)
+                        create_kv_caches_with_random, seed_everything)

 NUM_BLOCKS = 1024
 PARTITION_SIZE = 512
@ -28,10 +28,7 @@ def main(
    device: str = "cuda",
    kv_cache_dtype: Optional[str] = None,
 ) -> None:
-    random.seed(seed)
-    torch.random.manual_seed(seed)
-    if torch.cuda.is_available():
-        torch.cuda.manual_seed(seed)
+    seed_everything(seed)

    scale = float(1.0 / (head_size**0.5))
    query = torch.empty(num_seqs,
--- a/benchmarks/kernels/benchmark_quant.py
+++ b/benchmarks/kernels/benchmark_quant.py
@ -1,10 +1,10 @@
-import random
 import time

 import torch

 from vllm import _custom_ops as ops
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
+from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser,
+                        seed_everything)


@torch.inference_mode()
@ -17,10 +17,7 @@ def main(num_tokens: int,
         do_profile: bool = False,
         num_warmup_iters: int = 5,
         num_iters: int = 100) -> None:
-    random.seed(seed)
-    torch.random.manual_seed(seed)
-    if torch.cuda.is_available():
-        torch.cuda.manual_seed(seed)
+    seed_everything(seed)
    torch.set_default_device("cuda")

    x = torch.randn(num_tokens, hidden_size, dtype=dtype)
--- a/benchmarks/kernels/benchmark_rope.py
+++ b/benchmarks/kernels/benchmark_rope.py
@ -6,7 +6,7 @@ import torch

 from vllm.model_executor.layers.rotary_embedding import (RotaryEmbedding,
                                                         get_rope)
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils import FlexibleArgumentParser, seed_everything


 def benchmark_rope_kernels_multi_lora(
@ -22,9 +22,7 @@ def benchmark_rope_kernels_multi_lora(
    max_position: int = 8192,
    base: int = 10000,
 ) -> None:
-    torch.random.manual_seed(seed)
-    if torch.cuda.is_available():
-        torch.cuda.manual_seed(seed)
+    seed_everything(seed)
    torch.set_default_device(device)
    if rotary_dim is None:
        rotary_dim = head_size
--- a/benchmarks/kernels/graph_machete_bench.py
+++ b/benchmarks/kernels/graph_machete_bench.py
@ -45,8 +45,7 @@ if __name__ == "__main__":
    rows = int(math.ceil(len(results) / 2))
    fig, axs = plt.subplots(rows, 2, figsize=(12, 5 * rows))
    axs = axs.flatten()
-    axs_idx = 0
-    for shape, data in results.items():
+    for axs_idx, (shape, data) in enumerate(results.items()):
        plt.sca(axs[axs_idx])
        df = pd.DataFrame(data)
        sns.lineplot(data=df,
@ -59,6 +58,5 @@ if __name__ == "__main__":
                     palette="Dark2")
        plt.title(f"Shape: {shape}")
        plt.ylabel("time (median, s)")
-        axs_idx += 1
    plt.tight_layout()
    plt.savefig("graph_machete_bench.pdf")
--- a/benchmarks/kernels/requirements.txt
+++ b/benchmarks/kernels/requirements.txt
@ -0,0 +1 @@
+pandas
--- a/cmake/cpu_extension.cmake
+++ b/cmake/cpu_extension.cmake
@ -120,4 +120,3 @@ define_gpu_extension_target(
 )

 message(STATUS "Enabling C extension.")
-add_dependencies(default _C)
--- a/cmake/utils.cmake
+++ b/cmake/utils.cmake
@ -350,18 +350,19 @@ function (define_gpu_extension_target GPU_MOD_NAME)
  target_include_directories(${GPU_MOD_NAME} PRIVATE csrc
    ${GPU_INCLUDE_DIRECTORIES})

-  # TODO: is torch_python_LIBRARY needed?
-  target_link_libraries(${GPU_MOD_NAME} PRIVATE torch ${torch_python_LIBRARY}
-    ${GPU_LIBRARIES})
+  target_link_libraries(${GPU_MOD_NAME} PRIVATE torch ${GPU_LIBRARIES})

  # Don't use `TORCH_LIBRARIES` for CUDA since it pulls in a bunch of
  # dependencies that are not necessary and may not be installed.
  if (GPU_LANGUAGE STREQUAL "CUDA")
+    if ("${CUDA_CUDA_LIB}" STREQUAL "")
+      set(CUDA_CUDA_LIB "${CUDA_CUDA_LIBRARY}")
+    endif()
    target_link_libraries(${GPU_MOD_NAME} PRIVATE ${CUDA_CUDA_LIB}
      ${CUDA_LIBRARIES})
  else()
    target_link_libraries(${GPU_MOD_NAME} PRIVATE ${TORCH_LIBRARIES})
  endif()

-  install(TARGETS ${GPU_MOD_NAME} LIBRARY DESTINATION ${GPU_DESTINATION})
+  install(TARGETS ${GPU_MOD_NAME} LIBRARY DESTINATION ${GPU_DESTINATION} COMPONENT ${GPU_MOD_NAME})
 endfunction()
--- a/collect_env.py
+++ b/collect_env.py
@ -285,9 +285,14 @@ def summarize_vllm_build_flags():


 def get_gpu_topo(run_lambda):
+    """Return the GPU topology report, or None if it cannot be obtained.
+
+    Only attempted when ``get_platform()`` is 'linux'. ``nvidia-smi topo -m``
+    is tried first; if that yields None, ``rocm-smi --showtopo`` is tried.
+    On other platforms the result is always None.
+    """
+    output = None
+
    if get_platform() == 'linux':
-        return run_and_read_all(run_lambda, 'nvidia-smi topo -m')
-    return None
+        output = run_and_read_all(run_lambda, 'nvidia-smi topo -m')
+        if output is None:
+            output = run_and_read_all(run_lambda, 'rocm-smi --showtopo')
+
+    return output


 # example outputs of CPU infos
--- a/csrc/cpu/quant.cpp
+++ b/csrc/cpu/quant.cpp
@ -257,11 +257,13 @@ void int8_scaled_mm(torch::Tensor& c,               // [M, OC], row-major
 // static-per-tensor quantization.
 void static_scaled_int8_quant(torch::Tensor& out,          // [..., hidden_size]
                              const torch::Tensor& input,  // [..., hidden_size]
-                              const torch::Tensor& scale) {
+                              const torch::Tensor& scale,
+                              c10::optional<torch::Tensor> const& azp) {
  CPU_KERNEL_GUARD_IN(static_scaled_int8_quant)
  TORCH_CHECK(input.is_contiguous());
  TORCH_CHECK(out.is_contiguous());
  TORCH_CHECK(scale.numel() == 1);
+  TORCH_CHECK(!azp.has_value(), "Zero point is not supported on CPU.");

  const int hidden_size = input.size(-1);
  const int num_tokens = input.numel() / hidden_size;
@ -277,11 +279,12 @@ void static_scaled_int8_quant(torch::Tensor& out,          // [..., hidden_size]
 void dynamic_scaled_int8_quant(
    torch::Tensor& out,          // [..., hidden_size]
    const torch::Tensor& input,  // [..., hidden_size]
-    torch::Tensor& scale         // [..., 1]
-) {
+    torch::Tensor& scale,        // [..., 1]
+    c10::optional<torch::Tensor> const& azp) {
  CPU_KERNEL_GUARD_IN(dynamic_scaled_int8_quant)
  TORCH_CHECK(input.is_contiguous());
  TORCH_CHECK(out.is_contiguous());
+  TORCH_CHECK(!azp.has_value(), "Zero point is not supported on CPU.");

  int const hidden_size = input.size(-1);
  int const num_tokens = input.numel() / hidden_size;
--- a/csrc/cpu/torch_bindings.cpp
+++ b/csrc/cpu/torch_bindings.cpp
@ -94,13 +94,14 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
 #ifdef __AVX512F__
  // Compute int8 quantized tensor for given scaling factor.
  ops.def(
-      "static_scaled_int8_quant(Tensor! out, Tensor input, Tensor scale) -> "
-      "()");
+      "static_scaled_int8_quant(Tensor! out, Tensor input, Tensor scale,"
+      "Tensor? azp) -> ()");
  ops.impl("static_scaled_int8_quant", torch::kCPU, &static_scaled_int8_quant);
+
  // Compute int8 quantized tensor and scaling factor
  ops.def(
-      "dynamic_scaled_int8_quant(Tensor! out, Tensor input, Tensor! scale) -> "
-      "()");
+      "dynamic_scaled_int8_quant(Tensor! out, Tensor input, Tensor! scale, "
+      "Tensor!? azp) -> ()");
  ops.impl("dynamic_scaled_int8_quant", torch::kCPU,
           &dynamic_scaled_int8_quant);
  // W8A8 GEMM, supporting symmetric per-tensor or per-row/column
--- a/csrc/custom_all_reduce.cu
+++ b/csrc/custom_all_reduce.cu
@ -55,18 +55,6 @@ bool _is_weak_contiguous(torch::Tensor& t) {
          t.numel() * t.element_size());
 }

-bool should_custom_ar(torch::Tensor& inp, int64_t max_size, int64_t world_size,
-                      bool full_nvlink) {
-  auto inp_size = inp.numel() * inp.element_size();
-  // custom allreduce requires input byte size to be multiples of 16
-  if (inp_size % 16 != 0) return false;
-  if (!_is_weak_contiguous(inp)) return false;
-  if (world_size == 2 || full_nvlink) return inp_size <= max_size;
-  // for 4 or more non NVLink-capable GPUs, custom allreduce provides little
-  // performance improvement over NCCL.
-  return false;
-}
-
 void _all_reduce(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out,
                 cudaStream_t stream) {
  auto fa = reinterpret_cast<vllm::CustomAllreduce*>(_fa);
--- a/csrc/custom_all_reduce.cuh
+++ b/csrc/custom_all_reduce.cuh
@ -6,6 +6,7 @@
 #include <cuda_runtime.h>

 #include <iostream>
+#include <array>
 #include <limits>
 #include <map>
 #include <unordered_map>
@ -23,17 +24,23 @@

 namespace vllm {

-constexpr int kMaxBlocks = 64;
-// note: we don't want to use atomics for signals because peer atomics are no
-// supported on PCIe links
+constexpr int kMaxBlocks = 36;
+// Counter may overflow, but it's fine since unsigned int overflow is
+// well-defined behavior.
+using FlagType = uint32_t;
 struct Signal {
-  alignas(128) uint32_t start[kMaxBlocks][8];
-  alignas(128) uint32_t end[kMaxBlocks][8];
+  // Barrier counter kept by the owning rank: multi_gpu_barrier increments
+  // self_counter[block][thread] once per barrier and uses the new value as
+  // the flag it exchanges with its peers.
+  alignas(128) FlagType self_counter[kMaxBlocks][8];
+  // Two sets of peer counters are needed for two syncs. The reason is that
+  // it's possible for a peer GPU block to arrive at the second sync point
+  // while the current GPU block hasn't passed the first sync point. Thus, the
+  // peer GPU may write counter+1 while the current GPU is busy waiting for
+  // counter. We use alternating counter arrays to avoid this possibility.
+  alignas(128) FlagType peer_counter[2][kMaxBlocks][8];
 };

 struct __align__(16) RankData { const void* __restrict__ ptrs[8]; };

-struct __align__(16) RankSignals { volatile Signal* signals[8]; };
+struct __align__(16) RankSignals { Signal* signals[8]; };

 // like std::array, but aligned
 template <typename T, int sz>
@ -123,47 +130,71 @@ DINLINE O downcast(array_t<float, O::size> val) {
  }
 }

-// This function is meant to be used as the first synchronization in the all
-// reduce kernel. Thus, it doesn't need to make any visibility guarantees for
-// prior memory accesses. Note: volatile writes will not be reordered against
-// other volatile writes.
-template <int ngpus>
-DINLINE void start_sync(const RankSignals& sg, volatile Signal* self_sg,
-                        int rank) {
-  if (threadIdx.x < ngpus) {
-    // reset flag for next time
-    self_sg->end[blockIdx.x][threadIdx.x] = 0;
-    // simultaneously write to the corresponding flag of all ranks.
-    // Latency = 1 p2p write
-    sg.signals[threadIdx.x]->start[blockIdx.x][rank] = 1;
-    // wait until we got true from all ranks
-    while (!self_sg->start[blockIdx.x][threadIdx.x]);
-  }
-  __syncthreads();
+// Stores `flag` to `flag_addr` with release ordering at system scope
+// (st.release.sys) when compiled for sm_70 or newer. For older architectures
+// a membar.sys is issued first, followed by a volatile store.
+static DINLINE void st_flag_release(FlagType* flag_addr, FlagType flag) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
+  asm volatile("st.release.sys.global.u32 [%1], %0;" ::"r"(flag),
+               "l"(flag_addr));
+#else
+  asm volatile("membar.sys; st.volatile.global.u32 [%1], %0;" ::"r"(flag),
+               "l"(flag_addr));
+#endif
 }

-// This function is meant to be used as the second or the final synchronization
-// barrier in the all reduce kernel. If it's the final synchronization barrier,
-// we don't need to make any visibility guarantees for prior memory accesses.
-template <int ngpus, bool final_sync = false>
-DINLINE void end_sync(const RankSignals& sg, volatile Signal* self_sg,
-                      int rank) {
-  __syncthreads();
-  // eliminate the case that prior writes are not visible after signals become
-  // visible. Note that I did not managed to make this happen through a lot of
-  // testing. Might be the case that hardware provides stronger guarantee than
-  // the memory model.
-  if constexpr (!final_sync) __threadfence_system();
+// Loads and returns the flag at `flag_addr` with acquire ordering at system
+// scope (ld.acquire.sys) when compiled for sm_70 or newer. For older
+// architectures a volatile load is followed by membar.gl.
+// NOTE(review): the fallback fences with membar.gl whereas st_flag_release
+// uses membar.sys -- confirm the narrower scope is intended for flags that
+// are written by peer GPUs.
+static DINLINE FlagType ld_flag_acquire(FlagType* flag_addr) {
+  FlagType flag;
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
+  asm volatile("ld.acquire.sys.global.u32 %0, [%1];"
+               : "=r"(flag)
+               : "l"(flag_addr));
+#else
+  asm volatile("ld.volatile.global.u32 %0, [%1]; membar.gl;"
+               : "=r"(flag)
+               : "l"(flag_addr));
+#endif
+  return flag;
+}
+
+// Stores `flag` to `flag_addr` with a volatile global store; no fence is
+// issued. Used by multi_gpu_barrier when need_fence is false.
+static DINLINE void st_flag_volatile(FlagType* flag_addr, FlagType flag) {
+  asm volatile("st.volatile.global.u32 [%1], %0;" ::"r"(flag), "l"(flag_addr));
+}
+
+// Loads and returns the flag at `flag_addr` with a volatile global load; no
+// fence is issued. Used by multi_gpu_barrier when need_fence is false.
+static DINLINE FlagType ld_flag_volatile(FlagType* flag_addr) {
+  FlagType flag;
+  asm volatile("ld.volatile.global.u32 %0, [%1];"
+               : "=r"(flag)
+               : "l"(flag_addr));
+  return flag;
+}
+
+// Per-block barrier across `ngpus` ranks. In every block, thread i (for
+// i < ngpus) bumps its own counter, writes the new value into rank i's
+// peer_counter slot for this rank, and spins until rank i has written the
+// same value into this rank's slot.
+// is_start: whether this is the very first synchronization barrier.
+// need_fence: whether a memory fence is needed. If true, a release-acquire
+// semantic is used to enforce memory access order before and after this
+// barrier.
+template <int ngpus, bool is_start, bool need_fence = false>
+DINLINE void multi_gpu_barrier(const RankSignals& sg, Signal* self_sg,
+                               int rank) {
+  // For any barrier other than the first, synchronize the whole block before
+  // the signalling threads start the handshake.
+  if constexpr (!is_start) __syncthreads();
+  static_assert(
+      !(is_start && need_fence));  // Start barrier shouldn't need fence.
  if (threadIdx.x < ngpus) {
-    // reset flag for next time
-    self_sg->start[blockIdx.x][threadIdx.x] = 0;
-    // simultaneously write to the corresponding flag of all ranks.
-    // Latency = 1 p2p write
-    sg.signals[threadIdx.x]->end[blockIdx.x][rank] = 1;
-    // wait until we got true from all ranks
-    while (!self_sg->end[blockIdx.x][threadIdx.x]);
+    // Increment the counter. Technically we only need one counter, but we use
+    // multiple per block to eliminate the need to share the counter via smem.
+    auto val = self_sg->self_counter[blockIdx.x][threadIdx.x] += 1;
+    // Write the expected counter value to peer and wait for correct value from
+    // peer.
+    // `val % 2` selects which of the two alternating peer_counter sets this
+    // barrier uses.
+    auto peer_counter_ptr =
+        &sg.signals[threadIdx.x]->peer_counter[val % 2][blockIdx.x][rank];
+    auto self_counter_ptr =
+        &self_sg->peer_counter[val % 2][blockIdx.x][threadIdx.x];
+    if constexpr (need_fence) {
+      st_flag_release(peer_counter_ptr, val);
+      while (ld_flag_acquire(self_counter_ptr) != val);
+    } else {
+      st_flag_volatile(peer_counter_ptr, val);
+      while (ld_flag_volatile(self_counter_ptr) != val);
+    }
  }
-  if constexpr (!final_sync) __syncthreads();
+  // Hold the rest of the block until the signalling threads are done. This is
+  // skipped only for a non-start barrier that does not request a fence.
+  if constexpr (is_start || need_fence) __syncthreads();
 }

 template <typename P, int ngpus, typename A>
@ -178,33 +209,31 @@ DINLINE P packed_reduce(const P* ptrs[], int idx) {

+// Single-stage all-reduce kernel: after a start barrier, every thread walks
+// the index range [0, size) with a stride of gridDim.x * blockDim.x and
+// writes packed_reduce over all ranks' pointers (dp.ptrs) into `result`.
+// `size` is counted in packed elements of type P, since `result` is indexed
+// as P*. A second barrier (without fence) follows the reduction.
 template <typename T, int ngpus>
 __global__ void __launch_bounds__(512, 1)
-    cross_device_reduce_1stage(RankData* _dp, RankSignals sg,
-                               volatile Signal* self_sg, T* __restrict__ result,
-                               int rank, int size) {
+    cross_device_reduce_1stage(RankData* _dp, RankSignals sg, Signal* self_sg,
+                               T* __restrict__ result, int rank, int size) {
  using P = typename packed_t<T>::P;
  using A = typename packed_t<T>::A;
  // note: we don't reorder the address so the accumulation order is the same
  // for all ranks, ensuring bitwise identical results
  auto dp = *_dp;
-  start_sync<ngpus>(sg, self_sg, rank);
+  multi_gpu_barrier<ngpus, true>(sg, self_sg, rank);
  // do the actual reduction
  for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size;
       idx += gridDim.x * blockDim.x) {
    ((P*)result)[idx] = packed_reduce<P, ngpus, A>((const P**)&dp.ptrs[0], idx);
  }
-  end_sync<ngpus, true>(sg, self_sg, rank);
+  multi_gpu_barrier<ngpus, false>(sg, self_sg, rank);
 }

+// Returns the address immediately after the Signal object that `sg` points
+// to, reinterpreted as P*.
 template <typename P>
-DINLINE P* get_tmp_buf(volatile Signal* sg) {
+DINLINE P* get_tmp_buf(Signal* sg) {
  return (P*)(((Signal*)sg) + 1);
 }

 template <typename T, int ngpus>
 __global__ void __launch_bounds__(512, 1)
-    cross_device_reduce_2stage(RankData* _dp, RankSignals sg,
-                               volatile Signal* self_sg, T* __restrict__ result,
-                               int rank, int size) {
+    cross_device_reduce_2stage(RankData* _dp, RankSignals sg, Signal* self_sg,
+                               T* __restrict__ result, int rank, int size) {
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  int stride = gridDim.x * blockDim.x;
  using P = typename packed_t<T>::P;
@ -222,12 +251,12 @@ __global__ void __launch_bounds__(512, 1)
    tmps[i] = get_tmp_buf<P>(sg.signals[target]);
  }
  auto tmp_out = tmps[0];
-  start_sync<ngpus>(sg, self_sg, rank);
+  multi_gpu_barrier<ngpus, true>(sg, self_sg, rank);
  // stage 1: reduce scatter
  for (int idx = start + tid; idx < end; idx += stride) {
    tmp_out[idx - start] = packed_reduce<P, ngpus, A>(ptrs, idx);
  }
-  end_sync<ngpus>(sg, self_sg, rank);
+  multi_gpu_barrier<ngpus, false, true>(sg, self_sg, rank);

  // stage 2: allgather. Note: it's important to match the tid between
  // the two stages, because visibility across devices is only guaranteed
@ -437,6 +466,8 @@ class CustomAllreduce {
 #define KL(ngpus, name)                                                       \
  name<T, ngpus><<<blocks, threads, 0, stream>>>(ptrs, sg_, self_sg_, output, \
                                                 rank_, size);
+    // TODO(hanzhi713): Threshold is different for A100 and H100.
+    // Add per device threshold.
 #define REDUCE_CASE(ngpus)                            \
  case ngpus: {                                       \
    if (world_size_ == 2) {                           \
--- a/csrc/custom_all_reduce_test.cu
+++ b/csrc/custom_all_reduce_test.cu
@ -1,15 +1,15 @@
 /**
 * This is a standalone test for custom allreduce.
 * To compile, make sure you have MPI and NCCL installed in your system.
- * export MPI_HOME=XXX
+ * export MPI_HOME=xxx
 * nvcc -O2 -arch=native -std=c++17 custom_all_reduce_test.cu -o
- * custom_all_reduce_test -lnccl -I${MPI_HOME}/include -lmpi
+ * custom_all_reduce_test -lnccl -I${MPI_HOME} -lmpi
 *
 * Warning: this C++ test is not designed to be very readable and was used
 * during the rapid prototyping process.
 *
 * To run:
- * mpirun -np 8 ./custom_all_reduce_test
+ * mpirun --allow-run-as-root -np 8 ./custom_all_reduce_test
 */
 #include <cuda.h>
 #include <curand_kernel.h>
@ -44,7 +44,14 @@
  } while (0)

+// Kernel that does no useful work and only delays: it sleeps via __nanosleep
+// when compiled for sm_70 or newer, and busy-waits on clock64() otherwise.
 __global__ void dummy_kernel() {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
  for (int i = 0; i < 100; i++) __nanosleep(1000000);  // 100ms
+#else
+  // NOTE(review): the "98.4ms" figure below appears to describe a single
+  // 150M-cycle spin. With 100 iterations this fallback would run roughly 100x
+  // longer than the __nanosleep path above -- confirm that is intended.
+  for (int i = 0; i < 100; i++) {
+    long long int start = clock64();
+    while (clock64() - start < 150000000);  // approximately 98.4ms on P40
+  }
+#endif
 }

 template <typename T>
@ -302,15 +309,19 @@ int main(int argc, char** argv) {

  bool performance_test = true;
  cudaProfilerStart();
-  // for (int threads : {256, 512}) {
+  // Uncomment to scan through different block size configs.
+  // for (int threads : {256, 512, 1024}) {
  //   for (int block_limit = 16; block_limit < 112; block_limit += 4) {
-  //     run<half>(myRank, nRanks, comm, threads, block_limit, 4096 * 1024);
+  //     run<half>(myRank, nRanks, comm, threads, block_limit, 1024 * 1024,
+  //     performance_test);
  //   }
  // }
+  // Scan through different sizes to test performance.
  for (int sz = 512; sz <= (8 << 20); sz *= 2) {
    run<half>(myRank, nRanks, comm, 512, 36, sz + 8 * 47, performance_test);
  }

  cudaProfilerStop();
+  MPICHECK(MPI_Finalize());
  return EXIT_SUCCESS;
 }
--- a/csrc/cutlass_extensions/torch_utils.hpp
+++ b/csrc/cutlass_extensions/torch_utils.hpp
@ -68,7 +68,13 @@ static inline auto make_cute_layout(torch::Tensor const& tensor,
                        name, ".stride(", idx, ") to be ", StrideEle::value);
            return StrideEle{};
          } else {
-            return tensor.stride(idx);
+            if (tensor.size(idx) == 1) {
+              // use 0 stride for dim with size 1, this is easier for
+              // cute/cutlass to optimize (helps the TMA code flatten dims)
+              return StrideEle{0};
+            } else {
+              return tensor.stride(idx);
+            }
          }
        } else {
          // Extra strides are assumed to be 0 or 1
--- a/csrc/mamba/causal_conv1d/causal_conv1d.cu
+++ b/csrc/mamba/causal_conv1d/causal_conv1d.cu
@ -198,7 +198,8 @@ causal_conv1d_update(const at::Tensor &x,
                     const at::Tensor &conv_state,
                     const at::Tensor &weight,
                     const c10::optional<at::Tensor> &bias_,
-                     bool silu_activation) {
+                     bool silu_activation,
+                     const c10::optional<at::Tensor> &conv_state_indices_) {
    auto input_type = x.scalar_type();
    auto weight_type = weight.scalar_type();
    TORCH_CHECK(input_type == at::ScalarType::Float || input_type == at::ScalarType::Half || input_type == at::ScalarType::BFloat16);
@ -216,7 +217,6 @@ causal_conv1d_update(const at::Tensor &x,
    const int width = weight.size(-1);

    CHECK_SHAPE(x, batch_size, dim);
-    CHECK_SHAPE(conv_state, batch_size, dim, width);
    CHECK_SHAPE(weight, dim, width);

    TORCH_CHECK(width >= 2 && width <= 4, "causal_conv1d only supports width between 2 and 4");
@ -241,6 +241,22 @@ causal_conv1d_update(const at::Tensor &x,
    params.conv_state_c_stride = conv_state.stride(1);
    params.conv_state_l_stride = conv_state.stride(2);

+    if (conv_state_indices_.has_value()) {
+        // Gather mode: conv_state_indices must be an int32 CUDA tensor with
+        // unit stride and one entry per batch element. conv_state may then
+        // hold any number of entries along dim 0.
+        auto conv_state_indices = conv_state_indices_.value();
+        TORCH_CHECK(conv_state_indices.scalar_type() == torch::kInt32);
+        TORCH_CHECK(conv_state_indices.is_cuda());
+        TORCH_CHECK(conv_state_indices.stride(0) == 1);
+        CHECK_SHAPE(conv_state_indices, batch_size);
+
+        int conv_state_entries = conv_state.size(0);
+        CHECK_SHAPE(conv_state, conv_state_entries, dim, width);
+
+        params.conv_state_indices_ptr = conv_state_indices.data_ptr<int32_t>();
+    } else {
+        // No indices: conv_state must have exactly one entry per batch element.
+        CHECK_SHAPE(conv_state, batch_size, dim, width);
+        params.conv_state_indices_ptr = nullptr;
+    }
+
+
    // Otherwise the kernel will be launched from cuda:0 device
    // Cast to char to avoid compiler warning about narrowing
    at::cuda::CUDAGuard device_guard{(char)x.get_device()};
@ -646,8 +662,16 @@ void causal_conv1d_update_kernel(ConvParamsBase params) {
    const int channel_id = blockIdx.y * kNThreads + tidx;
    input_t *x = reinterpret_cast<input_t *>(params.x_ptr) + batch_id * params.x_batch_stride
        + channel_id * params.x_c_stride;
-    input_t *conv_state = reinterpret_cast<input_t *>(params.conv_state_ptr) + batch_id * params.conv_state_batch_stride
+
+    // If params.conv_state_indices_ptr is set, then the conv state is gathered from the conv state tensor
+    // along the batch axis. Otherwise, the conv state coordinate is the same as the batch id.
+    const int conv_state_batch_coord = params.conv_state_indices_ptr == nullptr
+        ? batch_id
+        : params.conv_state_indices_ptr[batch_id];
+    input_t *conv_state = reinterpret_cast<input_t *>(params.conv_state_ptr) 
+        + conv_state_batch_coord * params.conv_state_batch_stride
        + channel_id * params.conv_state_c_stride;
+
    weight_t *weight = reinterpret_cast<weight_t *>(params.weight_ptr) + channel_id * params.weight_c_stride;
    input_t *out = reinterpret_cast<input_t *>(params.out_ptr) + batch_id * params.out_batch_stride
        + channel_id * params.out_c_stride;
--- a/csrc/mamba/causal_conv1d/causal_conv1d.h
+++ b/csrc/mamba/causal_conv1d/causal_conv1d.h
@ -36,6 +36,10 @@ struct ConvParamsBase {

    void *__restrict__ conv_state_ptr;

+    // For the continuous batching case. Makes it so that the mamba state for 
+    // the current batch doesn't need to be a contiguous tensor.
+    int32_t *__restrict__ conv_state_indices_ptr;
+
    void *__restrict__ seq_idx_ptr;

    // No __restrict__ since initial_states could be the same as final_states.
--- a/csrc/mamba/mamba_ssm/selective_scan_fwd.cu
+++ b/csrc/mamba/mamba_ssm/selective_scan_fwd.cu
@ -586,7 +586,7 @@ selective_scan_fwd(const torch::Tensor &u, const torch::Tensor &delta,
    DISPATCH_WTYPE_ITYPE_FLOAT_AND_HALF_AND_BF16(u.scalar_type(), "selective_scan_fwd", [&] {
        selective_scan_fwd_cuda<input_t, weight_t>(params, stream);
    });
-    std::vector<at::Tensor> result = {out, x.value()};
+    std::vector<at::Tensor> result = {out};
    if (has_z) { result.push_back(out_z); }
    return result;
 }
--- a/csrc/moe/marlin_kernels/marlin_moe_kernel.h
+++ b/csrc/moe/marlin_kernels/marlin_moe_kernel.h
--- a/csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.cu
+++ b/csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.cu
@ -0,0 +1,29 @@
+#include "marlin_moe_kernel_ku4b8.h"
+
+namespace marlin_moe {
+
+// We return bool so we can create these different kernel calls as a sequence
+// of if-elseif's.
+bool call_marlin_moe_kernel_ku4b8(
+    vllm::ScalarType const& q_type, int thread_n_blocks, int thread_k_blocks,
+    bool has_act_order, int group_blocks, int num_threads, int blocks,
+    int max_shared_mem, cudaStream_t stream, const int4* A_ptr,
+    const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr,
+    const float* topk_weights_ptr, const int4* s_ptr, const int* g_idx_ptr,
+    int* expert_offsets_ptr, int num_groups, int expert_idx, int num_experts,
+    int topk, int prob_m, int prob_n, int prob_k, int tot_m, int* locks,
+    bool replicate_input, bool apply_weights, int m_block, int max_par,
+    int cfg_max_m_blocks) {
+  // The empty `if (false)` opens the chain so each GPTQ_CALL_IF_MOE line can
+  // continue it. Presumably the macro (defined elsewhere) expands to one
+  // `else if` branch per kernel configuration -- TODO confirm.
+  if (false) {
+  }
+  GPTQ_CALL_IF_MOE(vllm::kU4B8, 16, 4, 256)
+  GPTQ_CALL_IF_MOE(vllm::kU4B8, 8, 8, 256)
+  GPTQ_CALL_IF_MOE(vllm::kU4B8, 8, 4, 128)
+  GPTQ_CALL_IF_MOE(vllm::kU4B8, 4, 8, 128)
+  else {
+    // None of the branches above was taken.
+    return false;
+  }
+  return true;
+}
+
+}  // namespace marlin_moe
--- a/csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.h
+++ b/csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.h
@ -0,0 +1,20 @@
+#pragma once
+
+#include "marlin_moe_kernel.h"
+
+namespace marlin_moe {
+
+// We return bool so we can create these different kernel calls as a sequence
+// of if-elseif's.
+// Returns false when no branch of the implementation's if/else-if chain is
+// taken, true otherwise.
+bool call_marlin_moe_kernel_ku4b8(
+    vllm::ScalarType const& q_type, int thread_n_blocks, int thread_k_blocks,
+    bool has_act_order, int group_blocks, int num_threads, int blocks,
+    int max_shared_mem, cudaStream_t stream, const int4* A_ptr,
+    const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr,
+    const float* topk_weights_ptr, const int4* s_ptr, const int* g_idx_ptr,
+    int* expert_offsets_ptr, int num_groups, int expert_idx, int num_experts,
+    int topk, int prob_m, int prob_n, int prob_k, int tot_m, int* locks,
+    bool replicate_input, bool apply_weights, int m_block, int max_par,
+    int cfg_max_m_blocks);
+
+}  // namespace marlin_moe
--- a/csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.cu
+++ b/csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.cu
@ -0,0 +1,29 @@
+#include "marlin_moe_kernel_ku8b128.h"
+
+namespace marlin_moe {
+
+// We return bool so we can create these different kernel calls as a sequence
+// of if-elseif's.
+// Returns true when one of the vllm::kU8B128 configurations listed in the body
+// was selected and false when none of them applied.
+bool call_marlin_moe_kernel_ku8b128(
+    vllm::ScalarType const& q_type, int thread_n_blocks, int thread_k_blocks,
+    bool has_act_order, int group_blocks, int num_threads, int blocks,
+    int max_shared_mem, cudaStream_t stream, const int4* A_ptr,
+    const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr,
+    const float* topk_weights_ptr, const int4* s_ptr, const int* g_idx_ptr,
+    int* expert_offsets_ptr, int num_groups, int expert_idx, int num_experts,
+    int topk, int prob_m, int prob_n, int prob_k, int tot_m, int* locks,
+    bool replicate_input, bool apply_weights, int m_block, int max_par,
+    int cfg_max_m_blocks) {
+  // Deliberately empty leading `if`: it gives the macro lines below something
+  // to chain onto. Each GPTQ_CALL_IF_MOE presumably expands to one or more
+  // `else if (...) { <launch> }` branches -- the macro is defined outside this
+  // file, confirm against its definition.
+  if (false) {
+  }
+  // Candidate configurations for the vllm::kU8B128 weight type.
+  // NOTE(review): the numeric arguments look like
+  // (thread_n_blocks, thread_k_blocks, num_threads) -- confirm in the macro.
+  GPTQ_CALL_IF_MOE(vllm::kU8B128, 16, 4, 256)
+  GPTQ_CALL_IF_MOE(vllm::kU8B128, 8, 8, 256)
+  GPTQ_CALL_IF_MOE(vllm::kU8B128, 8, 4, 128)
+  GPTQ_CALL_IF_MOE(vllm::kU8B128, 4, 8, 128)
+  else {
+    // No branch of the chain above was taken.
+    return false;
+  }
+  // Some branch of the chain above was taken.
+  return true;
+}
+
+}  // namespace marlin_moe
--- a/csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.h
+++ b/csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.h
@ -0,0 +1,18 @@
+#pragma once
+
+#include "marlin_moe_kernel.h"
+
+namespace marlin_moe {
+
+// Dispatcher for the kernel configurations instantiated for the
+// vllm::kU8B128 weight type.
+// We return bool so we can create these different kernel calls as a sequence
+// of if-elseif's: true means one of this dispatcher's configurations was
+// selected, false means none of them applied.
+bool call_marlin_moe_kernel_ku8b128(
+    vllm::ScalarType const& q_type, int thread_n_blocks, int thread_k_blocks,
+    bool has_act_order, int group_blocks, int num_threads, int blocks,
+    int max_shared_mem, cudaStream_t stream, const int4* A_ptr,
+    const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr,
+    const float* topk_weights_ptr, const int4* s_ptr, const int* g_idx_ptr,
+    int* expert_offsets_ptr, int num_groups, int expert_idx, int num_experts,
+    int topk, int prob_m, int prob_n, int prob_k, int tot_m, int* locks,
+    bool replicate_input, bool apply_weights, int m_block, int max_par,
+    int cfg_max_m_blocks);
+
+}  // namespace marlin_moe
--- a/csrc/moe/marlin_moe_ops.cu
+++ b/csrc/moe/marlin_moe_ops.cu
--- a/csrc/moe/marlin_moe_ops.h
+++ b/csrc/moe/marlin_moe_ops.h
@ -2,11 +2,14 @@

 #include <torch/all.h>

+#include "core/scalar_type.hpp"
+
 torch::Tensor marlin_gemm_moe(
    const torch::Tensor& a, const torch::Tensor& b_q_weights,
    const torch::Tensor& sorted_ids, const torch::Tensor& topk_weights,
    const torch::Tensor& topk_ids, const torch::Tensor& b_scales,
    const torch::Tensor& g_idx, const torch::Tensor& perm,
-    torch::Tensor& workspace, int64_t size_m, int64_t size_n, int64_t size_k,
-    bool is_k_full, int64_t num_experts, int64_t topk, int64_t moe_block_size,
+    torch::Tensor& workspace, vllm::ScalarTypeTorchPtr const& b_q_type,
+    int64_t size_m, int64_t size_n, int64_t size_k, bool is_k_full,
+    int64_t num_experts, int64_t topk, int64_t moe_block_size,
    bool replicate_input, bool apply_weights);
--- a/csrc/moe/torch_bindings.cpp
+++ b/csrc/moe/torch_bindings.cpp
@ -13,9 +13,11 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
  m.def(
      "marlin_gemm_moe(Tensor! a, Tensor! b_q_weights, Tensor! sorted_ids, "
      "Tensor! topk_weights, Tensor! topk_ids, Tensor! b_scales, Tensor! "
-      "g_idx, Tensor! perm, Tensor! workspace, int size_m, int size_n, int "
-      "size_k, bool is_k_full, int num_experts, int topk, int moe_block_size, "
-      "bool replicate_input, bool apply_weights) -> Tensor");
+      "g_idx, Tensor! perm, Tensor! workspace, "
+      "__torch__.torch.classes._core_C.ScalarType b_q_type, int size_m, "
+      "int size_n, int size_k, bool is_k_full, int num_experts, int topk, "
+      "int moe_block_size, bool replicate_input, bool apply_weights)"
+      " -> Tensor");
  m.impl("marlin_gemm_moe", torch::kCUDA, &marlin_gemm_moe);
 #endif
 }
--- a/csrc/ops.h
+++ b/csrc/ops.h
@ -113,6 +113,8 @@ torch::Tensor prepack_B(torch::Tensor const& B,

 };  // namespace machete

+torch::Tensor permute_cols(torch::Tensor const& A, torch::Tensor const& perm);
+
 torch::Tensor gptq_marlin_24_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
                                  torch::Tensor& b_meta,
                                  torch::Tensor& b_scales,
@ -184,10 +186,12 @@ torch::Tensor marlin_qqq_gemm(torch::Tensor const& a,
 #endif

 void static_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input,
-                              torch::Tensor const& scale);
+                              torch::Tensor const& scale,
+                              c10::optional<torch::Tensor> const& azp);

 void dynamic_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input,
-                               torch::Tensor& scales);
+                               torch::Tensor& scales,
+                               c10::optional<torch::Tensor> const& azp);

 torch::Tensor gptq_gemm(torch::Tensor a, torch::Tensor b_q_weight,
                        torch::Tensor b_gptq_qzeros,
@ -220,11 +224,10 @@ std::vector<torch::Tensor> selective_scan_fwd(
    const c10::optional<torch::Tensor>& index_,
    const c10::optional<torch::Tensor>& x);

-at::Tensor causal_conv1d_update(const at::Tensor& x,
-                                const at::Tensor& conv_state,
-                                const at::Tensor& weight,
-                                const c10::optional<at::Tensor>& bias_,
-                                bool silu_activation);
+at::Tensor causal_conv1d_update(
+    const at::Tensor& x, const at::Tensor& conv_state, const at::Tensor& weight,
+    const c10::optional<at::Tensor>& bias, bool silu_activation,
+    const c10::optional<at::Tensor>& conv_state_indices);

 at::Tensor causal_conv1d_fwd(const at::Tensor& x, const at::Tensor& weight,
                             const c10::optional<at::Tensor>& bias_,
@ -239,8 +242,6 @@ fptr_t init_custom_ar(torch::Tensor& meta, torch::Tensor& rank_data,
                      const std::vector<std::string>& handles,
                      const std::vector<int64_t>& offsets, int64_t rank,
                      bool full_nvlink);
-bool should_custom_ar(torch::Tensor& inp, int64_t max_size, int64_t world_size,
-                      bool full_nvlink);
 void all_reduce_reg(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out);
 void all_reduce_unreg(fptr_t _fa, torch::Tensor& inp, torch::Tensor& reg_buffer,
                      torch::Tensor& out);
--- a/csrc/permute_cols.cu
+++ b/csrc/permute_cols.cu
@ -0,0 +1,88 @@
+#include <torch/all.h>
+
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+
+#include <cuda_fp16.h>
+
+// Threads per block used to launch permute_cols_kernel. The kernel also uses
+// this value as its column stride, so launch size and constant must agree.
+static constexpr int default_threads = 256;
+// Integer division of a by b, rounded up (intended for a >= 0 and b > 0).
+static constexpr int div_ceil(int a, int b) { return (a + b - 1) / b; }
+
+// Column permutation kernel: for every row r of the [size_m, size_k] matrix
+// "a" it writes out[r][k] = a[r][perm[k]] for all k in [0, size_k).
+// Rows are handed out in chunks of "block_rows" consecutive rows, one chunk
+// per thread block. Within a row, thread t handles columns t,
+// t + default_threads, ... so the kernel expects default_threads threads per
+// block.
+// Currently only supports 16bit types (since we permute half types)
+__global__ void permute_cols_kernel(int4 const* __restrict__ a_int4_ptr,
+                                    int const* __restrict__ perm_int_ptr,
+                                    int4* __restrict__ out_int4_ptr, int size_m,
+                                    int size_k, int block_rows) {
+  // Half-open row range [first_row, last_row) owned by this thread block,
+  // clipped to the matrix. A block that starts past the last row ends up with
+  // an empty range and does nothing.
+  int const first_row = block_rows * blockIdx.x;
+  int last_row = first_row + block_rows;
+  if (last_row > size_m) {
+    last_row = size_m;
+  }
+
+  // Length of one row expressed in int4 (16-byte) units.
+  int const int4_per_row = size_k * sizeof(half) / 16;
+
+  // Columns are walked in tiles of default_threads: "full_tiles" complete
+  // tiles followed by one partial tile holding "tail_cols" columns.
+  int const full_tiles = size_k / default_threads;
+  int const tail_cols = size_k % default_threads;
+
+  for (int row = first_row; row < last_row; ++row) {
+    int const row_offset = row * int4_per_row;
+    half const* src_row =
+        reinterpret_cast<half const*>(a_int4_ptr + row_offset);
+    half* dst_row = reinterpret_cast<half*>(out_int4_ptr + row_offset);
+
+    // Column handled by this thread in the current tile.
+    int col = threadIdx.x;
+
+    for (int tile = 0; tile < full_tiles; ++tile) {
+      dst_row[col] = src_row[perm_int_ptr[col]];
+      col += default_threads;
+    }
+
+    // Partial last tile: only the first tail_cols threads own a column.
+    if (threadIdx.x < tail_cols) {
+      dst_row[col] = src_row[perm_int_ptr[col]];
+    }
+  }
+}
+
+// More efficient version of A[..., perm]
+//  taken from gptq_marlin.cu
+//
+// Returns a new tensor D with the shape and dtype of A such that
+// D[..., k] = A[..., perm[k]].
+//  - A:    contiguous CUDA tensor of a 16-bit float type (half / bfloat16)
+//          whose last dimension is a multiple of 8.
+//  - perm: contiguous int32 tensor on the same device as A holding exactly
+//          A.size(-1) entries. Each entry must lie in [0, A.size(-1)); the
+//          values themselves are not validated here.
+torch::Tensor permute_cols(torch::Tensor const& A, torch::Tensor const& perm) {
+  TORCH_CHECK(A.is_cuda(), "A must be a CUDA tensor");
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(A));
+  auto dev = A.get_device();
+  auto stream = at::cuda::getCurrentCUDAStream(dev);
+
+  TORCH_CHECK(A.scalar_type() == at::kHalf || A.scalar_type() == at::kBFloat16,
+              "Currently only 16bit types are supported");
+  TORCH_CHECK(A.is_contiguous(), "A must be contiguous");
+  TORCH_CHECK(A.size(-1) % 8 == 0,
+              "A columns must be a multiple of 8 (128bits)");
+
+  // The kernel reads perm as a dense int32 array on A's device and indexes it
+  // with every column index of A. Enforce that up front instead of reading
+  // out of bounds or from the wrong memory space.
+  TORCH_CHECK(perm.device() == A.device(),
+              "perm must be on the same device as A");
+  TORCH_CHECK(perm.scalar_type() == at::kInt, "perm must be an int32 tensor");
+  TORCH_CHECK(perm.is_contiguous(), "perm must be contiguous");
+  TORCH_CHECK(perm.numel() == A.size(-1),
+              "perm must have exactly one entry per column of A");
+
+  torch::Tensor D = torch::empty_like(A);
+  // Nothing to permute. Returning here also avoids the ambiguous
+  // view({-1, 0}) that a zero-width A would otherwise hit below.
+  if (A.numel() == 0) {
+    return D;
+  }
+  auto A_2d = A.view({-1, A.size(-1)});
+
+  // One thread block per SM; each block takes block_rows consecutive rows.
+  // Check the query so a failure cannot leave the grid size uninitialised.
+  int sms = 0;
+  TORCH_CHECK(cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount,
+                                     dev) == cudaSuccess &&
+                  sms > 0,
+              "Failed to query the multiprocessor count of the device");
+  int block_rows = div_ceil(A_2d.size(0), sms);
+  permute_cols_kernel<<<sms, default_threads, 0, stream>>>(
+      reinterpret_cast<int4 const*>(A_2d.const_data_ptr()),
+      perm.const_data_ptr<int>(), reinterpret_cast<int4*>(D.mutable_data_ptr()),
+      A_2d.size(0), A_2d.size(1), block_rows);
+  return D;
+}
--- a/csrc/quantization/compressed_tensors/int8_quant_kernels.cu
+++ b/csrc/quantization/compressed_tensors/int8_quant_kernels.cu
@ -14,12 +14,17 @@

 static inline __device__ int8_t float_to_int8_rn(float x) {
 #ifdef USE_ROCM
-  static const float i8_min =
+  static constexpr auto i8_min =
      static_cast<float>(std::numeric_limits<int8_t>::min());
-  static const float i8_max =
+  static constexpr auto i8_max =
      static_cast<float>(std::numeric_limits<int8_t>::max());
-  // round
+
+  // To match the rounding mode of CUDA, we use nearbyint.
+  // It uses the current rounding mode, which is always FE_TONEAREST on HIP.
+  // If that changes in the future, we may need to set the rounding mode
+  // explicitly, either at runtime or compile time.
  float dst = std::nearbyint(x);
+
  // saturate
  dst = std::clamp(dst, i8_min, i8_max);
  return static_cast<int8_t>(dst);
@ -31,6 +36,59 @@ static inline __device__ int8_t float_to_int8_rn(float x) {
 #endif
 }

+// Converts a float to int32, rounding to the nearest integer and saturating
+// to [INT32_MIN, INT32_MAX] instead of overflowing. NaN converts to 0 on both
+// the ROCm and the CUDA path.
+static inline __device__ int32_t float_to_int32_rn(float x) {
+#ifdef USE_ROCM
+  // int32_max is not exactly representable as float.
+  // Therefore, we need to be careful and manually return int32_max on overflow.
+  // For symmetry, we also do the same for int32_min, even though it is exactly
+  // representable as float and the conversion should be exact.
+  static constexpr auto i32_min = std::numeric_limits<int32_t>::min();
+  static constexpr auto i32_min_f = static_cast<float>(i32_min);
+  static constexpr auto i32_max = std::numeric_limits<int32_t>::max();
+  static constexpr auto i32_max_f = static_cast<float>(i32_max);
+
+  // To match the rounding mode of CUDA, we use nearbyint.
+  // It uses the current rounding mode, which is always FE_TONEAREST on HIP.
+  // If that changes in the future, we may need to set the rounding mode
+  // explicitly, either at runtime or compile time.
+  float dst = std::nearbyint(x);
+
+  // NaN compares false against both bounds below, and casting NaN to an
+  // integer is undefined behaviour. Return 0, which is what the saturating
+  // PTX conversion used on the CUDA path produces for NaN inputs.
+  if (std::isnan(dst)) {
+    return 0;
+  }
+
+  // saturate on the higher end.
+  if (dst >= i32_max_f) {
+    return i32_max;
+  }
+  // saturate on the lower end.
+  if (dst <= i32_min_f) {
+    return i32_min;
+  }
+
+  return static_cast<int32_t>(dst);
+#else
+  // CUDA path: a single PTX convert does the rounding (.rni) and the
+  // saturation (.sat).
+  uint32_t dst;
+  asm volatile("cvt.rni.sat.s32.f32 %0, %1;" : "=r"(dst) : "f"(x));
+  // The register holds the signed result; reinterpret the bits as int32.
+  return reinterpret_cast<const int32_t&>(dst);
+#endif
+}
+
+// Narrows a 32-bit integer to int8, saturating to the int8 range instead of
+// wrapping around.
+static inline __device__ int8_t int32_to_int8(int32_t x) {
+#ifdef USE_ROCM
+  // Clamp by hand in 32-bit, then narrow.
+  using i8_limits = std::numeric_limits<int8_t>;
+  static constexpr int32_t lower = static_cast<int32_t>(i8_limits::min());
+  static constexpr int32_t upper = static_cast<int32_t>(i8_limits::max());
+
+  return static_cast<int8_t>(std::clamp(x, lower, upper));
+#else
+  // CUDA path: the saturating PTX convert does the clamping.
+  uint32_t narrowed;
+  asm volatile("cvt.sat.s8.s32 %0, %1;" : "=r"(narrowed) : "r"(x));
+  // Read the result back through an int8 view of the 32-bit register.
+  return reinterpret_cast<const int8_t&>(narrowed);
+#endif
+}
+
 namespace vllm {

 template <typename scalar_t, typename scale_type>
@ -47,6 +105,23 @@ __global__ void static_scaled_int8_quant_kernel(
  }
 }

+// Static asymmetric int8 quantization:
+//   out = saturate_int8(round(input / scale) + azp)
+// using one scalar scale and one scalar zero point (azp) read from scale_ptr
+// and azp_ptr. Each thread block handles the token selected by blockIdx.x and
+// its threads stride over the hidden_size elements of that token.
+template <typename scalar_t, typename scale_type, typename azp_type>
+__global__ void static_scaled_int8_azp_quant_kernel(
+    scalar_t const* __restrict__ input, int8_t* __restrict__ out,
+    scale_type const* scale_ptr, azp_type const* azp_ptr,
+    const int hidden_size) {
+  int const tid = threadIdx.x;
+  // 64-bit on purpose: token_idx * hidden_size overflows a 32-bit int once
+  // the tensor has more than INT_MAX elements.
+  int64_t const token_idx = blockIdx.x;
+  int64_t const row_offset = token_idx * hidden_size;
+  scale_type const scale = *scale_ptr;
+  azp_type const azp = *azp_ptr;
+
+  for (int i = tid; i < hidden_size; i += blockDim.x) {
+    auto const val = static_cast<float>(input[row_offset + i]);
+    // Round and saturate to int32 first, add the zero point, then saturate
+    // the sum to int8.
+    auto const quant_val = int32_to_int8(float_to_int32_rn(val / scale) + azp);
+    out[row_offset + i] = quant_val;
+  }
+}
+
 template <typename scalar_t, typename scale_type>
 __global__ void dynamic_scaled_int8_quant_kernel(
    scalar_t const* __restrict__ input, int8_t* __restrict__ out,
@ -80,14 +155,68 @@ __global__ void dynamic_scaled_int8_quant_kernel(
  }
 }

+// Dynamic (per-token) asymmetric int8 quantization. For the token selected by
+// blockIdx.x the block:
+//   1. finds the min and max of the token's hidden_size values,
+//   2. derives scale = (max - min) / 255 and azp = round(-128 - min / scale),
+//      storing them in scale[token] and azp[token],
+//   3. writes out = saturate_int8(round(input / scale) + azp).
+// The block-wide reduction is instantiated for 1024 threads and is told the
+// real thread count (blockDim.x) at run time, so blockDim.x must not exceed
+// 1024.
+// NOTE(review): a token whose values are all equal yields scale == 0 and a
+// division by zero below -- confirm whether callers can hit that case.
+template <typename scalar_t, typename scale_type, typename azp_type>
+__global__ void dynamic_scaled_int8_azp_quant_kernel(
+    scalar_t const* __restrict__ input, int8_t* __restrict__ out,
+    scale_type* scale, azp_type* azp, const int hidden_size) {
+  // 64-bit on purpose: token_idx * hidden_size overflows a 32-bit int once
+  // the tensor has more than INT_MAX elements.
+  int64_t const token_idx = blockIdx.x;
+  int64_t const row_offset = token_idx * hidden_size;
+
+  // Scan for the min and max value for this token.
+  // The max accumulator must start at lowest() (the most negative finite
+  // float). numeric_limits<float>::min() is the smallest *positive* value and
+  // would report a wrong maximum for a token whose values are all negative.
+  float max_val = std::numeric_limits<float>::lowest();
+  float min_val = std::numeric_limits<float>::max();
+  for (int i = threadIdx.x; i < hidden_size; i += blockDim.x) {
+    auto val = static_cast<float>(input[row_offset + i]);
+    max_val = std::max(max_val, val);
+    min_val = std::min(min_val, val);
+  }
+
+  // Reduce the max and min values across the block
+  using BlockReduce = cub::BlockReduce<float, 1024>;
+  __shared__ typename BlockReduce::TempStorage reduceStorage;
+  max_val = BlockReduce(reduceStorage).Reduce(max_val, cub::Max{}, blockDim.x);
+  __syncthreads();  // Make sure min doesn't mess with max shared memory
+  min_val = BlockReduce(reduceStorage).Reduce(min_val, cub::Min{}, blockDim.x);
+
+  __shared__ scale_type scale_sh;
+  __shared__ azp_type azp_sh;
+
+  // Compute the scale and zero point and store them, only on the first thread
+  if (threadIdx.x == 0) {
+    float const scale_val = (max_val - min_val) / 255.0f;
+    // Use rounding to even (same as torch.round)
+    auto const azp_float = std::nearbyint(-128.0f - min_val / scale_val);
+    auto const azp_val = static_cast<azp_type>(azp_float);
+
+    // Store the scale and azp into shared and global
+    scale[token_idx] = scale_sh = scale_val;
+    azp[token_idx] = azp_sh = azp_val;
+  }
+
+  // Wait for the scale and azp to be computed
+  __syncthreads();
+
+  float const scale_val = scale_sh;
+  azp_type const azp_val = azp_sh;
+
+  // Quantize the values
+  for (int i = threadIdx.x; i < hidden_size; i += blockDim.x) {
+    auto const val = static_cast<float>(input[row_offset + i]);
+    auto const quant_val =
+        int32_to_int8(float_to_int32_rn(val / scale_val) + azp_val);
+    out[row_offset + i] = quant_val;
+  }
+}
+
 }  // namespace vllm

 void static_scaled_int8_quant(torch::Tensor& out,          // [..., hidden_size]
                              torch::Tensor const& input,  // [..., hidden_size]
-                              torch::Tensor const& scale) {
+                              torch::Tensor const& scale,
+                              c10::optional<torch::Tensor> const& azp) {
  TORCH_CHECK(input.is_contiguous());
  TORCH_CHECK(out.is_contiguous());
  TORCH_CHECK(scale.numel() == 1);
+  TORCH_CHECK(!azp || azp->numel() == 1);

  int const hidden_size = input.size(-1);
  int const num_tokens = input.numel() / hidden_size;
@ -96,19 +225,29 @@ void static_scaled_int8_quant(torch::Tensor& out,          // [..., hidden_size]
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  VLLM_DISPATCH_FLOATING_TYPES(
      input.scalar_type(), "static_scaled_int8_quant_kernel", [&] {
-        vllm::static_scaled_int8_quant_kernel<scalar_t, float>
-            <<<grid, block, 0, stream>>>(input.data_ptr<scalar_t>(),
-                                         out.data_ptr<int8_t>(),
-                                         scale.data_ptr<float>(), hidden_size);
+        if (!azp) {
+          vllm::static_scaled_int8_quant_kernel<scalar_t, float>
+              <<<grid, block, 0, stream>>>(
+                  input.data_ptr<scalar_t>(), out.data_ptr<int8_t>(),
+                  scale.data_ptr<float>(), hidden_size);
+        } else {
+          vllm::static_scaled_int8_azp_quant_kernel<scalar_t, float, int32_t>
+              <<<grid, block, 0, stream>>>(
+                  input.data_ptr<scalar_t>(), out.data_ptr<int8_t>(),
+                  scale.data_ptr<float>(), azp->data_ptr<int32_t>(),
+                  hidden_size);
+        }
      });
 }

 void dynamic_scaled_int8_quant(
    torch::Tensor& out,          // [..., hidden_size]
    torch::Tensor const& input,  // [..., hidden_size]
-    torch::Tensor& scales) {
+    torch::Tensor& scales, c10::optional<torch::Tensor> const& azp) {
  TORCH_CHECK(input.is_contiguous());
  TORCH_CHECK(out.is_contiguous());
+  TORCH_CHECK(scales.is_contiguous());
+  TORCH_CHECK(!azp || azp->is_contiguous());

  int const hidden_size = input.size(-1);
  int const num_tokens = input.numel() / hidden_size;
@ -117,9 +256,17 @@ void dynamic_scaled_int8_quant(
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  VLLM_DISPATCH_FLOATING_TYPES(
      input.scalar_type(), "dynamic_scaled_int8_quant_kernel", [&] {
-        vllm::dynamic_scaled_int8_quant_kernel<scalar_t, float>
-            <<<grid, block, 0, stream>>>(input.data_ptr<scalar_t>(),
-                                         out.data_ptr<int8_t>(),
-                                         scales.data_ptr<float>(), hidden_size);
+        if (!azp) {
+          vllm::dynamic_scaled_int8_quant_kernel<scalar_t, float>
+              <<<grid, block, 0, stream>>>(
+                  input.data_ptr<scalar_t>(), out.data_ptr<int8_t>(),
+                  scales.data_ptr<float>(), hidden_size);
+        } else {
+          vllm::dynamic_scaled_int8_azp_quant_kernel<scalar_t, float, int32_t>
+              <<<grid, block, 0, stream>>>(
+                  input.data_ptr<scalar_t>(), out.data_ptr<int8_t>(),
+                  scales.data_ptr<float>(), azp->data_ptr<int32_t>(),
+                  hidden_size);
+        }
      });
 }
--- a/csrc/quantization/gguf/dequantize.cuh
+++ b/csrc/quantization/gguf/dequantize.cuh
@ -353,18 +353,47 @@ static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_
 template<typename dst_t>
 static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {

-    const int i   = blockIdx.x;
+    const int64_t i   = blockIdx.x;
    const block_iq1_s * x = (const block_iq1_s  *) vx;

-    const int tid = threadIdx.x;
-    const int il = tid/8; // 0...3
-    const int ib = tid%8; // 0...7
+    const int64_t tid = threadIdx.x;
+    const int64_t il = tid/8; // 0...3
+    const int64_t ib = tid%8; // 0...7
    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
-    const int i8 = 4*ib+il;
-    uint8_t h = x[i].scales[i8/2] >> 4*(i8%2);
-    const int8_t * grid = (const int8_t *)(iq1s_grid + (x[i].qs[i8] | ((h & 8) << 5)));
-    const float d = __half2float(x[i].d) * (2*(h & 7) + 1);
-    for (int j = 0; j < 8; ++j) y[j] = __float2half(d * grid[j]);
+    const float delta = x[i].qh[ib] & 0x8000 ? -1 - IQ1S_DELTA : -1 + IQ1S_DELTA;
+    const float d = __half2float(x[i].d) * (2*((x[i].qh[ib] >> 12) & 7) + 1);
+    uint32_t grid32[2]; const int8_t * q = (const int8_t *)grid32;
+    grid32[0] = iq1s_grid_gpu[x[i].qs[4*ib+il] | (((x[i].qh[ib] >> 3*il) & 7) << 8)];
+    grid32[1] = (grid32[0] >> 4) & 0x0f0f0f0f;
+    grid32[0] &= 0x0f0f0f0f;
+    for (int j = 0; j < 8; ++j) {
+        y[j] = __float2half(d * (q[j] + delta));
+    }
+}
+
+// Dequantizes IQ1_M data. Thread block i handles quantized block x[i] and
+// writes to yy starting at offset i*QK_K. Each thread writes 8 consecutive
+// outputs at offset 32*ib + 8*il within that span, where il = tid/8 and
+// ib = tid%8 (the launcher in this file uses 32 threads per block).
+template<typename dst_t>
+static __global__ void dequantize_block_iq1_m(const void * __restrict__ vx, dst_t * __restrict__ yy) {
+
+    const int64_t i   = blockIdx.x;
+    const block_iq1_m * x = (const block_iq1_m  *) vx;
+
+    const int64_t tid = threadIdx.x;
+    const int64_t il = tid/8; // 0...3
+    const int64_t ib = tid%8; // 0...7
+    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
+    // View the scale bytes as 16-bit words.
+    const uint16_t * sc = (const uint16_t *)x[i].scales;
+    iq1m_scale_t scale;
+    // Rebuild a 16-bit pattern from the top 4 bits of each of the four words
+    // sc[0..3]; it is read back as a half through scale.f16 below.
+    scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
+    const int64_t ib16 = 2*ib + il/2; // sc[ib16/4] >> 3*(ib16%4) -> sc[ib/2] >> 3*((2*ib+il/2)%4);
+    // Scale for this thread's 8 values: the half scale times the odd factor
+    // 2*s + 1, where s is the 3-bit field selected by ib16.
+    const float d = __half2float(scale.f16) * (2*((sc[ib16/4] >> 3*(ib16%4)) & 0x7) + 1);
+    // Bit 3 (even il) or bit 7 (odd il) of the qh byte picks which of the two
+    // offsets around -1 is added to every grid value.
+    const float delta = x[i].qh[2*ib+il/2] & (0x08 << 4*(il%2)) ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA;
+    // q views the two 32-bit words below as 8 consecutive int8 values.
+    uint32_t grid32[2]; const int8_t * q = (const int8_t *)grid32;
+    // Grid index: 8 low bits from qs plus 3 high bits from the matching nibble
+    // of qh.
+    grid32[0] = iq1s_grid_gpu[x[i].qs[4*ib+il] | (((x[i].qh[2*ib+il/2] >> 4*(il%2)) & 7) << 8)];
+    // Split the packed nibbles: the high nibble of each byte goes to
+    // grid32[1], the low nibble stays in grid32[0].
+    grid32[1] = (grid32[0] >> 4) & 0x0f0f0f0f;
+    grid32[0] &= 0x0f0f0f0f;
+    // NOTE(review): the result is always produced via __float2half,
+    // whatever dst_t is.
+    for (int j = 0; j < 8; ++j) {
+        y[j] = __float2half(d * (q[j] + delta));
+    }
 }

 template<typename dst_t>
@ -475,6 +504,12 @@ static void dequantize_row_iq1_s_cuda(const void * vx, dst_t * y, const int k, c
    dequantize_block_iq1_s<<<nb, 32, 0, stream>>>(vx, y);
 }

+// Host-side launcher for dequantize_block_iq1_m: one thread block of 32
+// threads per QK_K elements of the k-element row (k / QK_K rounds down).
+template<typename dst_t>
+static void dequantize_row_iq1_m_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+    constexpr int threads_per_block = 32;
+    const int num_blocks = k / QK_K;
+    dequantize_block_iq1_m<<<num_blocks, threads_per_block, 0, stream>>>(vx, y);
+}
+
 template<typename dst_t>
 static void dequantize_row_iq4_nl_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
    const int nb = (k + QK_K - 1) / QK_K;
@ -525,6 +560,8 @@ static to_fp16_cuda_t ggml_get_to_fp16_cuda(int64_t type) {
            return dequantize_row_iq2_s_cuda;
        case 23:
            return dequantize_row_iq4_xs_cuda;
+        case 29:
+            return dequantize_row_iq1_m_cuda;
        default:
            return nullptr;
    }
--- a/csrc/quantization/gguf/ggml-common.h
+++ b/csrc/quantization/gguf/ggml-common.h
@ -149,14 +149,30 @@ typedef struct {
    uint8_t scales[IQ3S_N_SCALE];
 } block_iq3_s;

+// 1.5625 bpw
 #define QR1_S 8
 #define QI1_S (QK_K / (4*QR1_S))
 typedef struct {
    half d;
-    uint8_t qs[QK_K/8];
-    uint8_t scales[QK_K/16];
+    uint8_t  qs[QK_K/8];
+    uint16_t qh[QK_K/32];
 } block_iq1_s;

+// 1.75 bpw
+#define QR1_M 8
+#define QI1_M (QK_K / (4*QR1_M))
+// IQ1_M quantized block. Unlike block_iq1_s it has no separate `half d`
+// member: dequantize_block_iq1_m rebuilds a 16-bit half scale from the top
+// 4 bits of each of the first four 16-bit words of `scales`.
+typedef struct {
+    uint8_t  qs[QK_K/8];      // grid index, low 8 bits
+    uint8_t  qh[QK_K/16];     // grid index, high 3 bits + grid shift bit (for two groups of 8)
+    uint8_t  scales[QK_K/32]; // 3-bit block scales (4-bit if QK_K == 64)
+} block_iq1_m;
+
+// Used by IQ1_M quants
+// Lets a 16-bit pattern assembled from packed scale bits be reinterpreted as
+// a half: dequantize_block_iq1_m writes .u16 and then reads .f16.
+typedef union {
+    half f16;
+    uint16_t  u16;
+} iq1m_scale_t;
+
 #define QK4_NL 32
 #define QR4_NL 2
 #define QI4_NL (QK4_NL / (4*QR4_NL))
@ -733,135 +749,265 @@ static const __device__ uint32_t iq3xs_grid[512] = {
    0x3e240c1c, 0x3e241404, 0x3e242c04, 0x3e2c1414, 0x3e2c2414, 0x3e340414, 0x3e341c0c, 0x3e3e0404,
 };

-static const __device__ uint64_t iq1s_grid[512] = {
-    0xffffffffffff0101, 0xffffffffff01ff00, 0xffffffffff010100, 0xffffffff00000000,
-    0xffffffff01ff00ff, 0xffffffff01ff0001, 0xffffffff0101ffff, 0xffffffff0101ff01,
-    0xffffff00ff000000, 0xffffff000000ff00, 0xffffff00000000ff, 0xffffff0000000100,
-    0xffffff0000010000, 0xffffff0001000000, 0xffffff01ffff00ff, 0xffffff01ff01ff00,
-    0xffffff01ff010100, 0xffffff0100000001, 0xffffff0101ffff00, 0xffffff0101ff0101,
-    0xffffff0101010100, 0xffff00ffff00ff01, 0xffff00ffff0000ff, 0xffff00ff00ff0100,
-    0xffff00ff0100ff00, 0xffff00ff010001ff, 0xffff0000ff0101ff, 0xffff000000ffff00,
-    0xffff000000000000, 0xffff00000001ff01, 0xffff000001000101, 0xffff0000010100ff,
-    0xffff0001ffff0100, 0xffff00010000ff00, 0xffff000100010101, 0xffff000101000000,
-    0xffff01ffffff0000, 0xffff01ffff01ffff, 0xffff01ffff010100, 0xffff01ff00000000,
-    0xffff01ff01ffffff, 0xffff01ff01ff0001, 0xffff01ff0101ffff, 0xffff01ff01010001,
-    0xffff0100ffffff01, 0xffff01000000ffff, 0xffff010000000100, 0xffff010001ff01ff,
-    0xffff010001000000, 0xffff0101ff000000, 0xffff0101000101ff, 0xffff010101ffff01,
-    0xffff01010101ff00, 0xff00ffffff000000, 0xff00ffff00ffff00, 0xff00ffff00000001,
-    0xff00ffff000001ff, 0xff00ffff01010000, 0xff00ff00ffff0000, 0xff00ff00ff00ff00,
-    0xff00ff00ff0000ff, 0xff00ff00ff000100, 0xff00ff00ff010001, 0xff00ff0000ff0001,
-    0xff00ff000000ffff, 0xff00ff0000000000, 0xff00ff000001ff00, 0xff00ff0000010100,
-    0xff00ff0001ff0000, 0xff00ff000100ff00, 0xff00ff0001000100, 0xff00ff01ff000000,
-    0xff00ff0100ff0000, 0xff00ff01000001ff, 0xff00ff0101010001, 0xff0000ff00000000,
-    0xff0000ff0001ff00, 0xff0000ff00010100, 0xff000000ffff0101, 0xff000000ff000000,
-    0xff000000ff01ff00, 0xff00000000ff0000, 0xff0000000000ff00, 0xff000000000000ff,
-    0xff00000000000000, 0xff00000000000001, 0xff00000000000100, 0xff0000000001ffff,
-    0xff00000000010000, 0xff00000001000000, 0xff00000001010100, 0xff000001ff00ff01,
-    0xff000001ff0100ff, 0xff00000100000000, 0xff0000010001ff00, 0xff00000101ff0100,
-    0xff0000010100ff00, 0xff0001ff00ff00ff, 0xff0001ff00000101, 0xff0001ff000100ff,
-    0xff0001ff01000000, 0xff000100ff0001ff, 0xff0001000000ff01, 0xff00010000000000,
-    0xff00010000010001, 0xff00010000010100, 0xff00010001ffff00, 0xff00010001ff0101,
-    0xff00010001010000, 0xff000101ffffffff, 0xff000101ff000101, 0xff00010101ff00ff,
-    0xff00010101000001, 0xff000101010100ff, 0xff01ffffff000101, 0xff01ffffff01ffff,
-    0xff01ffffff01ff01, 0xff01ffffff0101ff, 0xff01ffff00000000, 0xff01ffff01ff0001,
-    0xff01ffff0101ff01, 0xff01ff00ff000000, 0xff01ff0000ff0100, 0xff01ff000000ff01,
-    0xff01ff0000010000, 0xff01ff00010000ff, 0xff01ff01ff01ff00, 0xff01ff0100000101,
-    0xff0100ffffff0000, 0xff0100ffff010000, 0xff0100ff01ff00ff, 0xff0100ff01000100,
-    0xff0100ff010100ff, 0xff010000ffffff01, 0xff01000000000000, 0xff0100000101ff00,
-    0xff010001ffff00ff, 0xff010001ff000100, 0xff01000100ffff00, 0xff01000100010001,
-    0xff01000101ff0001, 0xff010001010001ff, 0xff0101ffffffffff, 0xff0101ffff01ffff,
-    0xff0101ffff010101, 0xff0101ff0000ff00, 0xff0101ff01010001, 0xff010100ff000000,
-    0xff010100ff01ff01, 0xff01010000ff0001, 0xff01010000000100, 0xff01010001000000,
-    0xff0101010100ffff, 0x00ffffff0000ff01, 0x00ffffff000000ff, 0x00ffffff00000100,
-    0x00ffffff00010000, 0x00ffff00ffff0001, 0x00ffff00ff0000ff, 0x00ffff00ff000100,
-    0x00ffff0000000000, 0x00ffff0001000100, 0x00ffff0001010001, 0x00ffff01ff00ff01,
-    0x00ffff0100ff0100, 0x00ffff010000ff00, 0x00ffff01000100ff, 0x00ffff0101ff00ff,
-    0x00ffff010101ff00, 0x00ff00ffffffffff, 0x00ff00ffffff01ff, 0x00ff00ffff000101,
-    0x00ff00ff00000000, 0x00ff00ff000101ff, 0x00ff00ff01010101, 0x00ff0000ff000000,
-    0x00ff0000ff01ffff, 0x00ff000000ff0000, 0x00ff00000000ff00, 0x00ff0000000000ff,
-    0x00ff000000000000, 0x00ff000000000001, 0x00ff000000000100, 0x00ff000000010000,
-    0x00ff000001ffff01, 0x00ff000001000000, 0x00ff0001ff000101, 0x00ff000100ffffff,
-    0x00ff000100000000, 0x00ff0001010001ff, 0x00ff01ffff000000, 0x00ff01ff0001ff00,
-    0x00ff01ff01ff0100, 0x00ff0100ff01ff01, 0x00ff010000ff00ff, 0x00ff010000ff0101,
-    0x00ff010000000000, 0x00ff010000010101, 0x00ff01000100ff00, 0x00ff010001010000,
-    0x00ff0101ffffff00, 0x00ff01010000ff01, 0x00ff010100000100, 0x00ff010101ff0000,
-    0x0000ffffffff0100, 0x0000ffffff00ff00, 0x0000ffffff0000ff, 0x0000ffffff010000,
-    0x0000ffff00000000, 0x0000ffff00010101, 0x0000ffff01ffff01, 0x0000ffff01000100,
-    0x0000ff00ff000000, 0x0000ff00ff01ff00, 0x0000ff00ff0101ff, 0x0000ff0000ff0000,
-    0x0000ff000000ff00, 0x0000ff00000000ff, 0x0000ff0000000000, 0x0000ff0000000001,
-    0x0000ff0000000100, 0x0000ff0000010000, 0x0000ff0001ffffff, 0x0000ff0001ff01ff,
-    0x0000ff0001000000, 0x0000ff000101ffff, 0x0000ff01ffff0101, 0x0000ff01ff010000,
-    0x0000ff0100000000, 0x0000ff0101000101, 0x000000ffffff0001, 0x000000ffff000000,
-    0x000000ff00ff0000, 0x000000ff0000ff00, 0x000000ff000000ff, 0x000000ff00000000,
-    0x000000ff00000001, 0x000000ff00000100, 0x000000ff00010000, 0x000000ff01000000,
-    0x000000ff0101ff00, 0x00000000ffff0000, 0x00000000ff00ff00, 0x00000000ff0000ff,
-    0x00000000ff000000, 0x00000000ff000001, 0x00000000ff000100, 0x00000000ff010000,
-    0x0000000000ffff00, 0x0000000000ff00ff, 0x0000000000ff0000, 0x0000000000ff0001,
-    0x0000000000ff0100, 0x000000000000ffff, 0x000000000000ff00, 0x000000000000ff01,
-    0x00000000000000ff, 0x0000000000000001, 0x00000000000001ff, 0x0000000000000100,
-    0x0000000000000101, 0x000000000001ff00, 0x00000000000100ff, 0x0000000000010000,
-    0x0000000000010001, 0x0000000000010100, 0x0000000001ff0000, 0x000000000100ff00,
-    0x00000000010000ff, 0x0000000001000000, 0x0000000001000001, 0x0000000001000100,
-    0x0000000001010000, 0x00000001ffff01ff, 0x00000001ff000000, 0x0000000100ff0000,
-    0x000000010000ff00, 0x00000001000000ff, 0x0000000100000000, 0x0000000100000001,
-    0x0000000100000100, 0x0000000100010000, 0x0000000101000000, 0x000001ffff00ff00,
-    0x000001ffff010001, 0x000001ffff0101ff, 0x000001ff00ffff01, 0x000001ff0000ffff,
-    0x000001ff00000000, 0x000001ff010000ff, 0x000001ff01010100, 0x00000100ffff0100,
-    0x00000100ff000000, 0x0000010000ff0000, 0x000001000000ff00, 0x00000100000000ff,
-    0x0000010000000000, 0x0000010000000001, 0x0000010000000100, 0x0000010000010000,
-    0x0000010001000000, 0x000001000101ff01, 0x00000101ffff0001, 0x00000101ff01ffff,
-    0x0000010100000000, 0x0000010101010100, 0x0001ffffff000000, 0x0001ffff00ffffff,
-    0x0001ffff00000100, 0x0001ffff0001ff00, 0x0001ffff01000000, 0x0001ff00ffffff00,
-    0x0001ff00ffff01ff, 0x0001ff00ff010000, 0x0001ff0000000000, 0x0001ff0000010001,
-    0x0001ff0001ff0000, 0x0001ff0001010100, 0x0001ff01ff0000ff, 0x0001ff01ff000001,
-    0x0001ff0100ffffff, 0x0001ff010001ffff, 0x0001ff01000101ff, 0x0001ff010100ff01,
-    0x000100ffff00ffff, 0x000100ffff00ff01, 0x000100ffff000100, 0x000100ff00000000,
-    0x000100ff000101ff, 0x000100ff01ff0101, 0x000100ff0100ffff, 0x000100ff01010101,
-    0x00010000ff000000, 0x00010000ff010100, 0x0001000000ff0000, 0x000100000000ff00,
-    0x00010000000000ff, 0x0001000000000000, 0x0001000000000001, 0x0001000000000100,
-    0x0001000000010000, 0x0001000001ffff01, 0x0001000001000000, 0x0001000100ff0101,
-    0x0001000100000000, 0x00010001010100ff, 0x000101ffffff01ff, 0x000101ffffff0101,
-    0x000101ff00010000, 0x000101ff01ff0000, 0x000101ff0100ff01, 0x00010100ffff0000,
-    0x0001010000000000, 0x000101000001ffff, 0x0001010000010101, 0x00010100010001ff,
-    0x00010101ff00ff00, 0x00010101ff010001, 0x0001010100ffffff, 0x0001010100ff01ff,
-    0x00010101000101ff, 0x0001010101ff0000, 0x000101010100ff01, 0x0001010101000101,
-    0x01ffffffffff0101, 0x01ffffffff01ffff, 0x01ffffffff01ff01, 0x01ffffffff0101ff,
-    0x01ffffffff010101, 0x01ffffff00000000, 0x01ffffff01ff01ff, 0x01ffffff01000101,
-    0x01ffffff0101ff01, 0x01ffffff010100ff, 0x01ffff000000ff00, 0x01ffff0000000001,
-    0x01ffff00000001ff, 0x01ffff0000010000, 0x01ffff0001ff0000, 0x01ffff01ffffffff,
-    0x01ffff01ffff01ff, 0x01ffff01ff000000, 0x01ffff01ff01ffff, 0x01ffff01ff0101ff,
-    0x01ffff010100ffff, 0x01ff00ffffff0000, 0x01ff00ffff010000, 0x01ff00ff00ffff01,
-    0x01ff0000ff0000ff, 0x01ff000000000000, 0x01ff00000001ff01, 0x01ff000001ffffff,
-    0x01ff000001010100, 0x01ff0001ffffff01, 0x01ff0001ff010001, 0x01ff000101ff0100,
-    0x01ff000101000001, 0x01ff0001010100ff, 0x01ff01ffff00ffff, 0x01ff01ff00010001,
-    0x01ff01ff01000000, 0x01ff01ff010101ff, 0x01ff0100ff000001, 0x01ff010000ffff00,
-    0x01ff010000000100, 0x01ff010001ff01ff, 0x01ff01000101ffff, 0x01ff0101ffff00ff,
-    0x01ff0101ffff0101, 0x01ff0101ff0101ff, 0x01ff010100010000, 0x0100ffff00ff00ff,
-    0x0100ffff00ff0001, 0x0100ffff00000100, 0x0100ffff0100ff00, 0x0100ff00ffff0000,
-    0x0100ff00ff00ffff, 0x0100ff00ff00ff01, 0x0100ff00ff000100, 0x0100ff00ff010000,
-    0x0100ff0000000000, 0x0100ff00000100ff, 0x0100ff0001ff0101, 0x0100ff0001010101,
-    0x0100ff0100ff00ff, 0x0100ff0100ff0001, 0x0100ff0100000100, 0x0100ff0100010001,
-    0x0100ff0101000000, 0x010000ffff00ff00, 0x010000ff0000ffff, 0x010000ff00000000,
-    0x010000ff010001ff, 0x010000ff01010001, 0x01000000ffffff00, 0x01000000ffff0101,
-    0x01000000ff000000, 0x01000000ff0100ff, 0x01000000ff010101, 0x0100000000ff0000,
-    0x010000000000ff00, 0x01000000000000ff, 0x0100000000000000, 0x0100000000000001,
-    0x0100000000000100, 0x0100000000010000, 0x0100000001000000, 0x0100000100000000,
-    0x01000001000101ff, 0x0100000101ffff01, 0x010001ffff000101, 0x010001ff00ff0100,
-    0x010001ff0000ff00, 0x010001ff000100ff, 0x010001ff01ffffff, 0x01000100ffff0000,
-    0x01000100ff0001ff, 0x0100010000000000, 0x010001000001ff00, 0x0100010001ff0000,
-    0x01000100010000ff, 0x0100010001000101, 0x01000101ff00ff01, 0x0100010100ff0100,
-    0x010001010000ffff, 0x0100010101010001, 0x0101ffffffff0101, 0x0101ffffff0001ff,
-    0x0101ffffff01ffff, 0x0101ffffff010101, 0x0101ffff00000000, 0x0101ffff0101ffff,
-    0x0101ffff010101ff, 0x0101ff00ff000000, 0x0101ff0000ff0100, 0x0101ff000000ff00,
-    0x0101ff0000010000, 0x0101ff00010000ff, 0x0101ff0001000001, 0x0101ff01ff010101,
-    0x0101ff0100000000, 0x0101ff010101ff00, 0x010100ffffff0000, 0x010100ffff010000,
-    0x010100ff00ff01ff, 0x010100ff000000ff, 0x010100ff00000101, 0x010100ff01ffff00,
-    0x01010000ffffff01, 0x01010000ff000100, 0x01010000ff01ff01, 0x0101000000000000,
-    0x01010000000100ff, 0x010100000101ff01, 0x01010001ffff0000, 0x01010001ff00ffff,
-    0x01010001ff010000, 0x0101000101ffffff, 0x0101000101ff01ff, 0x0101000101010101,
-    0x010101ffff01ffff, 0x010101ff00000000, 0x010101ff0001ff01, 0x010101ff0101ffff,
-    0x010101ff010101ff, 0x01010100ffffffff, 0x01010100ff000001, 0x010101000000ff00,
-    0x0101010001010000, 0x0101010100ff0001, 0x010101010001ff01, 0x010101010101ffff,
+#define IQ1S_DELTA 0.125f  // NOTE(review): presumably the offset added to IQ1_S grid values during dequantization -- confirm against the kernels
+#define IQ1M_DELTA 0.125f  // same value as IQ1S_DELTA; presumably the IQ1_M counterpart -- confirm
+static const __device__ uint64_t iq1s_grid_gpu[2048] = {
+    0x00000000, 0x00000002, 0x00000101, 0x00000200, 0x00000202, 0x00010001, 0x00010101, 0x00020000,
+    0x00020002, 0x00020200, 0x00020202, 0x01000101, 0x01010001, 0x01010100, 0x01010102, 0x01020101,
+    0x02000000, 0x02000002, 0x02000200, 0x02000202, 0x02010101, 0x02020000, 0x02020002, 0x02020200,
+    0x02020202, 0x00000110, 0x00000111, 0x00010011, 0x00010110, 0x00010112, 0x00010211, 0x00010212,
+    0x00020111, 0x01000011, 0x01000112, 0x01000211, 0x01010012, 0x01010111, 0x01010212, 0x01020011,
+    0x01020110, 0x01020112, 0x01020210, 0x02000111, 0x02010011, 0x02010110, 0x02010112, 0x02020111,
+    0x00000020, 0x00000022, 0x00000220, 0x00000222, 0x00010121, 0x00020020, 0x00020022, 0x00020220,
+    0x00020222, 0x01000121, 0x01010021, 0x01010221, 0x01020120, 0x01020221, 0x02000020, 0x02000022,
+    0x02000220, 0x02000222, 0x02010021, 0x02010121, 0x02010221, 0x02020020, 0x02020022, 0x02020220,
+    0x02020222, 0x00011001, 0x00011100, 0x00011102, 0x00021101, 0x01001001, 0x01001201, 0x01011101,
+    0x01011202, 0x01021100, 0x01021101, 0x02011001, 0x02011201, 0x02021101, 0x00001011, 0x00001110,
+    0x00001111, 0x00001112, 0x00011111, 0x00011210, 0x00011212, 0x00021211, 0x01001010, 0x01001111,
+    0x01001212, 0x01011010, 0x01011011, 0x01011110, 0x01011111, 0x01011112, 0x01011211, 0x01021010,
+    0x01021012, 0x01021111, 0x01021210, 0x01021212, 0x02001011, 0x02011011, 0x02011111, 0x02011210,
+    0x02011212, 0x02021011, 0x02021110, 0x02021111, 0x02021112, 0x02021211, 0x00011120, 0x00011221,
+    0x01001021, 0x01001120, 0x01011020, 0x01011022, 0x01011121, 0x01011220, 0x01021020, 0x01021021,
+    0x01021122, 0x01021221, 0x02001121, 0x02011021, 0x02011120, 0x02011221, 0x00002000, 0x00002002,
+    0x00002200, 0x00002202, 0x00012101, 0x00022000, 0x00022002, 0x00022200, 0x00022202, 0x01002101,
+    0x01012001, 0x01012102, 0x01022101, 0x02002000, 0x02002002, 0x02002200, 0x02002202, 0x02012101,
+    0x02022000, 0x02022002, 0x02022200, 0x02022202, 0x00002111, 0x00012011, 0x00012110, 0x00012211,
+    0x00022110, 0x00022111, 0x01002011, 0x01012010, 0x01012011, 0x01012111, 0x01022011, 0x01022110,
+    0x01022211, 0x02012011, 0x02012110, 0x02012112, 0x02012211, 0x02022111, 0x00002020, 0x00002022,
+    0x00002220, 0x00002222, 0x00012121, 0x00022020, 0x00022022, 0x00022220, 0x00022222, 0x01002121,
+    0x01012021, 0x01012221, 0x01022021, 0x01022121, 0x02002020, 0x02002022, 0x02002121, 0x02002220,
+    0x02002222, 0x02012121, 0x02022020, 0x02022022, 0x02022220, 0x02022222, 0x00110000, 0x00110001,
+    0x00110100, 0x00110201, 0x00120100, 0x00120101, 0x01100001, 0x01100100, 0x01110000, 0x01110101,
+    0x01110200, 0x01120001, 0x01120100, 0x01120101, 0x01120201, 0x02110001, 0x02110100, 0x02110102,
+    0x02120001, 0x02120101, 0x00100011, 0x00100110, 0x00100112, 0x00100211, 0x00110010, 0x00110012,
+    0x00110111, 0x00110210, 0x00120011, 0x00120110, 0x00120211, 0x01100111, 0x01100212, 0x01110010,
+    0x01110011, 0x01110012, 0x01110110, 0x01110111, 0x01110112, 0x01110211, 0x01120010, 0x01120111,
+    0x02100110, 0x02110012, 0x02110111, 0x02120011, 0x02120110, 0x00110021, 0x00110120, 0x00110122,
+    0x00120121, 0x01100020, 0x01100122, 0x01100221, 0x01110022, 0x01110121, 0x01110220, 0x01110222,
+    0x01120120, 0x01120122, 0x02100121, 0x02110021, 0x02110120, 0x02110122, 0x02120121, 0x00101001,
+    0x00101102, 0x00101201, 0x00111100, 0x00111101, 0x00111200, 0x00111201, 0x00121001, 0x00121102,
+    0x01101001, 0x01101101, 0x01101102, 0x01101200, 0x01101202, 0x01111001, 0x01111100, 0x01111101,
+    0x01111102, 0x01111201, 0x01121002, 0x01121101, 0x01121200, 0x02101100, 0x02101201, 0x02111000,
+    0x02111100, 0x02111101, 0x02111200, 0x02111201, 0x02111202, 0x02121001, 0x02121100, 0x02121101,
+    0x02121201, 0x00101012, 0x00101111, 0x00101212, 0x00111011, 0x00111110, 0x00111111, 0x00111112,
+    0x00111211, 0x00121010, 0x00121012, 0x00121111, 0x00121210, 0x00121212, 0x01101011, 0x01101110,
+    0x01101111, 0x01101112, 0x01111011, 0x01111012, 0x01111110, 0x01111111, 0x01111112, 0x01111211,
+    0x01111212, 0x01121011, 0x01121110, 0x01121111, 0x01121112, 0x01121211, 0x02101010, 0x02101012,
+    0x02101110, 0x02101111, 0x02101210, 0x02101212, 0x02111010, 0x02111011, 0x02111110, 0x02111111,
+    0x02111112, 0x02111211, 0x02111212, 0x02121010, 0x02121012, 0x02121111, 0x00101021, 0x00101120,
+    0x00101121, 0x00101122, 0x00111121, 0x00111122, 0x00111220, 0x00111222, 0x00121021, 0x00121122,
+    0x01101020, 0x01101022, 0x01101120, 0x01101121, 0x01101220, 0x01101222, 0x01111021, 0x01111121,
+    0x01111122, 0x01111220, 0x01111221, 0x01121021, 0x01121120, 0x01121121, 0x01121220, 0x01121221,
+    0x01121222, 0x02101122, 0x02101222, 0x02111022, 0x02111121, 0x02121120, 0x02121221, 0x00112001,
+    0x00112102, 0x00122101, 0x01102001, 0x01102100, 0x01102102, 0x01102201, 0x01112000, 0x01112101,
+    0x01112200, 0x01112202, 0x01122000, 0x01122001, 0x01122100, 0x01122102, 0x01122201, 0x02102101,
+    0x02112001, 0x02112100, 0x02122101, 0x00112010, 0x00112012, 0x00112111, 0x00112212, 0x00122011,
+    0x00122111, 0x01102012, 0x01102110, 0x01102111, 0x01102210, 0x01112011, 0x01112110, 0x01112111,
+    0x01112112, 0x01112211, 0x01112212, 0x01122010, 0x01122111, 0x01122212, 0x02102211, 0x02112011,
+    0x02112012, 0x02112111, 0x02112210, 0x02122011, 0x02122112, 0x02122211, 0x00102221, 0x00112122,
+    0x00122120, 0x00122122, 0x01102120, 0x01102122, 0x01102221, 0x01112020, 0x01112022, 0x01112121,
+    0x01112220, 0x01122021, 0x01122122, 0x01122221, 0x02102121, 0x02112021, 0x02112122, 0x02112222,
+    0x00200000, 0x00200002, 0x00200200, 0x00200202, 0x00210101, 0x00220000, 0x00220002, 0x00220101,
+    0x00220200, 0x00220202, 0x01200101, 0x01210001, 0x01210201, 0x01220001, 0x01220101, 0x02200000,
+    0x02200002, 0x02200200, 0x02200202, 0x02210101, 0x02220000, 0x02220002, 0x02220101, 0x02220200,
+    0x02220202, 0x00200111, 0x00210011, 0x00210110, 0x00210211, 0x00220111, 0x01200012, 0x01200110,
+    0x01200211, 0x01210111, 0x01210210, 0x01210212, 0x01220011, 0x01220110, 0x01220111, 0x01220112,
+    0x02200111, 0x02210010, 0x02210112, 0x02210211, 0x02220111, 0x00200021, 0x00200220, 0x00200222,
+    0x00210021, 0x00210121, 0x00220020, 0x00220022, 0x00220220, 0x00220222, 0x01200121, 0x01210021,
+    0x01210122, 0x01210221, 0x01220121, 0x02200021, 0x02200220, 0x02200222, 0x02210021, 0x02210121,
+    0x02220020, 0x02220022, 0x02220220, 0x02220222, 0x00201101, 0x00211100, 0x00211102, 0x00211201,
+    0x00221101, 0x01201100, 0x01201101, 0x01201102, 0x01201201, 0x01211002, 0x01211101, 0x01211200,
+    0x01211202, 0x01221102, 0x02201101, 0x02211001, 0x02211100, 0x02211201, 0x02221001, 0x02221101,
+    0x00201211, 0x00211111, 0x00221011, 0x00221211, 0x01201010, 0x01201111, 0x01201210, 0x01211011,
+    0x01211110, 0x01211111, 0x01211211, 0x01221012, 0x01221111, 0x01221210, 0x02201211, 0x02211010,
+    0x02211110, 0x02211111, 0x02211210, 0x02211212, 0x02221011, 0x02221110, 0x02221112, 0x02221211,
+    0x00201121, 0x00211020, 0x00211022, 0x00211221, 0x00221121, 0x01201021, 0x01201221, 0x01211121,
+    0x01221020, 0x01221021, 0x01221221, 0x02201120, 0x02201122, 0x02211020, 0x02211222, 0x00202000,
+    0x00202002, 0x00202200, 0x00202202, 0x00212101, 0x00222000, 0x00222002, 0x00222200, 0x00222202,
+    0x01202101, 0x01212001, 0x01212100, 0x01222101, 0x02202000, 0x02202002, 0x02202200, 0x02202202,
+    0x02222000, 0x02222002, 0x02222200, 0x02222202, 0x00202211, 0x00212011, 0x00212110, 0x00212211,
+    0x00222111, 0x01202112, 0x01202211, 0x01212012, 0x01212111, 0x01222011, 0x01222110, 0x01222112,
+    0x01222211, 0x02202111, 0x02212010, 0x02212112, 0x02212211, 0x02222110, 0x02222111, 0x00202020,
+    0x00202022, 0x00202220, 0x00202222, 0x00222020, 0x00222022, 0x00222220, 0x00222222, 0x01202121,
+    0x01212021, 0x01212122, 0x01212221, 0x01222121, 0x02202020, 0x02202022, 0x02202220, 0x02202222,
+    0x02212121, 0x02222020, 0x02222022, 0x02222220, 0x02222222, 0x10000101, 0x10010001, 0x10010102,
+    0x10020101, 0x11000201, 0x11010002, 0x11010101, 0x11010200, 0x11010202, 0x11020001, 0x11020100,
+    0x11020102, 0x12010100, 0x12010201, 0x12020001, 0x12020102, 0x10000010, 0x10000011, 0x10000110,
+    0x10000112, 0x10000211, 0x10010012, 0x10010111, 0x10010112, 0x10010210, 0x10010212, 0x10020011,
+    0x10020112, 0x10020211, 0x11000111, 0x11000210, 0x11000212, 0x11010011, 0x11010110, 0x11010111,
+    0x11010112, 0x11010211, 0x11010212, 0x11020111, 0x11020210, 0x11020212, 0x12000011, 0x12000110,
+    0x12000112, 0x12010010, 0x12010012, 0x12010111, 0x12020010, 0x12020011, 0x12020012, 0x10000121,
+    0x10010021, 0x10010120, 0x10010122, 0x10020121, 0x11000021, 0x11010022, 0x11010121, 0x11010222,
+    0x11020120, 0x11020221, 0x12000221, 0x12010120, 0x12020121, 0x10001001, 0x10011101, 0x10011201,
+    0x10021201, 0x11001101, 0x11001200, 0x11001202, 0x11011001, 0x11011100, 0x11011101, 0x11011102,
+    0x11021001, 0x11021002, 0x11021101, 0x11021200, 0x11021202, 0x12001001, 0x12001102, 0x12001201,
+    0x12011000, 0x12011002, 0x12011101, 0x12021000, 0x12021001, 0x12021201, 0x10001011, 0x10001012,
+    0x10001111, 0x10001212, 0x10011011, 0x10011110, 0x10011111, 0x10011112, 0x10011211, 0x10021010,
+    0x10021111, 0x10021212, 0x11001011, 0x11001110, 0x11001111, 0x11001112, 0x11001211, 0x11011010,
+    0x11011011, 0x11011110, 0x11011111, 0x11011112, 0x11011210, 0x11011211, 0x11021011, 0x11021110,
+    0x11021111, 0x11021112, 0x11021211, 0x12001012, 0x12001110, 0x12001111, 0x12001210, 0x12011011,
+    0x12011110, 0x12011111, 0x12011112, 0x12011211, 0x12011212, 0x12021111, 0x12021210, 0x12021212,
+    0x10001021, 0x10001121, 0x10001221, 0x10011120, 0x10011121, 0x10011220, 0x10011222, 0x10021021,
+    0x10021120, 0x10021221, 0x11001020, 0x11001022, 0x11001121, 0x11001220, 0x11011020, 0x11011021,
+    0x11011022, 0x11011121, 0x11011122, 0x11011221, 0x11021022, 0x11021121, 0x11021220, 0x12001021,
+    0x12001121, 0x12001222, 0x12011120, 0x12011121, 0x12021021, 0x12021120, 0x12021122, 0x10002101,
+    0x10012001, 0x10012101, 0x10012202, 0x10022101, 0x11002002, 0x11002201, 0x11012000, 0x11012101,
+    0x11012200, 0x11022001, 0x11022100, 0x11022102, 0x11022201, 0x12002101, 0x12012001, 0x12012100,
+    0x12012102, 0x12012201, 0x12022101, 0x10002011, 0x10002111, 0x10002112, 0x10002212, 0x10012010,
+    0x10012110, 0x10012111, 0x10012210, 0x10022011, 0x10022110, 0x10022112, 0x11002010, 0x11002111,
+    0x11002212, 0x11012011, 0x11012012, 0x11012110, 0x11012111, 0x11012112, 0x11012211, 0x11022010,
+    0x11022012, 0x11022111, 0x11022112, 0x11022212, 0x12002112, 0x12002211, 0x12012012, 0x12012111,
+    0x12012112, 0x12012210, 0x12022011, 0x12022110, 0x12022112, 0x12022211, 0x10012122, 0x11002120,
+    0x11002122, 0x11002221, 0x11012121, 0x11012220, 0x11012222, 0x11022120, 0x11022221, 0x12012120,
+    0x12022121, 0x10100001, 0x10100100, 0x10100101, 0x10100102, 0x10100201, 0x10110002, 0x10110101,
+    0x10110202, 0x10120001, 0x10120100, 0x10120201, 0x11100000, 0x11100101, 0x11100200, 0x11110001,
+    0x11110100, 0x11110101, 0x11110102, 0x11110201, 0x11120101, 0x11120200, 0x12100102, 0x12100201,
+    0x12110101, 0x12110200, 0x12120000, 0x12120001, 0x12120102, 0x12120201, 0x10100111, 0x10100210,
+    0x10100211, 0x10100212, 0x10110011, 0x10110110, 0x10110111, 0x10110112, 0x10110210, 0x10110211,
+    0x10120010, 0x10120111, 0x10120112, 0x10120210, 0x10120212, 0x11100011, 0x11100110, 0x11100111,
+    0x11100112, 0x11100211, 0x11110010, 0x11110011, 0x11110012, 0x11110110, 0x11110111, 0x11110112,
+    0x11110210, 0x11110211, 0x11110212, 0x11120011, 0x11120110, 0x11120111, 0x11120112, 0x11120211,
+    0x12100012, 0x12100111, 0x12110011, 0x12110110, 0x12110111, 0x12110112, 0x12110211, 0x12120010,
+    0x12120111, 0x12120212, 0x10100021, 0x10100122, 0x10110022, 0x10110121, 0x10110222, 0x10120021,
+    0x10120120, 0x11100022, 0x11100121, 0x11100222, 0x11110021, 0x11110120, 0x11110121, 0x11110122,
+    0x11110221, 0x11120022, 0x11120121, 0x12100121, 0x12110020, 0x12110022, 0x12110121, 0x12110221,
+    0x12110222, 0x12120120, 0x10101100, 0x10101101, 0x10111001, 0x10111100, 0x10111101, 0x10111102,
+    0x10111200, 0x10111201, 0x10121001, 0x10121101, 0x10121200, 0x10121202, 0x11101001, 0x11101100,
+    0x11101101, 0x11101102, 0x11101201, 0x11101202, 0x11111000, 0x11111001, 0x11111100, 0x11111101,
+    0x11111102, 0x11111200, 0x11111201, 0x11111202, 0x11121001, 0x11121002, 0x11121100, 0x11121101,
+    0x11121102, 0x11121201, 0x12101000, 0x12101200, 0x12101202, 0x12111001, 0x12111100, 0x12111101,
+    0x12111102, 0x12111201, 0x12121001, 0x12121100, 0x12121101, 0x12121202, 0x10101011, 0x10101012,
+    0x10101110, 0x10101111, 0x10101112, 0x10101211, 0x10111010, 0x10111011, 0x10111012, 0x10111110,
+    0x10111111, 0x10111112, 0x10111211, 0x10111212, 0x10121011, 0x10121110, 0x10121111, 0x10121112,
+    0x10121211, 0x11101010, 0x11101011, 0x11101012, 0x11101110, 0x11101111, 0x11101112, 0x11101210,
+    0x11101211, 0x11111010, 0x11111011, 0x11111012, 0x11111110, 0x11111111, 0x11111112, 0x11111210,
+    0x11111211, 0x11111212, 0x11121010, 0x11121011, 0x11121110, 0x11121111, 0x11121112, 0x11121210,
+    0x11121211, 0x11121212, 0x12101011, 0x12101110, 0x12101111, 0x12101211, 0x12101212, 0x12111010,
+    0x12111011, 0x12111110, 0x12111111, 0x12111112, 0x12111210, 0x12111211, 0x12121011, 0x12121110,
+    0x12121111, 0x12121112, 0x12121211, 0x10101020, 0x10101021, 0x10101022, 0x10101120, 0x10101122,
+    0x10101220, 0x10101221, 0x10111021, 0x10111120, 0x10111121, 0x10111220, 0x10111221, 0x10121020,
+    0x10121021, 0x10121022, 0x10121120, 0x10121121, 0x10121122, 0x10121220, 0x10121221, 0x11101021,
+    0x11101121, 0x11101122, 0x11101220, 0x11101221, 0x11101222, 0x11111020, 0x11111021, 0x11111022,
+    0x11111120, 0x11111121, 0x11111122, 0x11111220, 0x11111221, 0x11111222, 0x11121021, 0x11121120,
+    0x11121121, 0x11121221, 0x12101022, 0x12101121, 0x12101122, 0x12101220, 0x12101221, 0x12101222,
+    0x12111021, 0x12111121, 0x12111222, 0x12121022, 0x12121121, 0x12121122, 0x12121220, 0x12121221,
+    0x10102100, 0x10102101, 0x10102102, 0x10102201, 0x10112000, 0x10112101, 0x10112200, 0x10122001,
+    0x10122202, 0x11102101, 0x11102200, 0x11102202, 0x11112001, 0x11112100, 0x11112101, 0x11112102,
+    0x11112200, 0x11112201, 0x11122000, 0x11122002, 0x11122100, 0x11122101, 0x12102002, 0x12102201,
+    0x12112000, 0x12112002, 0x12112101, 0x12112200, 0x12122001, 0x12122201, 0x10102011, 0x10102012,
+    0x10102111, 0x10102212, 0x10112011, 0x10112110, 0x10112111, 0x10112112, 0x10112211, 0x10122111,
+    0x11102011, 0x11102110, 0x11102111, 0x11102112, 0x11102211, 0x11112010, 0x11112011, 0x11112012,
+    0x11112110, 0x11112111, 0x11112112, 0x11112210, 0x11112211, 0x11112212, 0x11122011, 0x11122110,
+    0x11122111, 0x11122112, 0x11122211, 0x12102011, 0x12102111, 0x12102211, 0x12112011, 0x12112110,
+    0x12112111, 0x12112112, 0x12112210, 0x12112211, 0x12122111, 0x10102120, 0x10102220, 0x10112121,
+    0x10112222, 0x10122020, 0x10122121, 0x10122122, 0x10122221, 0x11102121, 0x11102220, 0x11102221,
+    0x11112021, 0x11112121, 0x11112122, 0x11112220, 0x11112221, 0x11122022, 0x11122121, 0x11122220,
+    0x11122222, 0x12102021, 0x12102222, 0x12112022, 0x12112121, 0x12112122, 0x12112220, 0x12112222,
+    0x12122021, 0x10200101, 0x10210100, 0x10210102, 0x10210201, 0x10220101, 0x11200100, 0x11210000,
+    0x11210101, 0x11210102, 0x11210200, 0x11210202, 0x11220001, 0x11220100, 0x11220102, 0x11220201,
+    0x12200001, 0x12210102, 0x12220101, 0x10200011, 0x10200110, 0x10200112, 0x10200211, 0x10210012,
+    0x10210111, 0x10220011, 0x10220012, 0x10220112, 0x10220211, 0x11200111, 0x11200211, 0x11210011,
+    0x11210111, 0x11210112, 0x11210211, 0x11220111, 0x11220112, 0x11220212, 0x12200110, 0x12200212,
+    0x12210012, 0x12210111, 0x12220011, 0x12220112, 0x12220211, 0x10210021, 0x10210122, 0x10210221,
+    0x11200020, 0x11200021, 0x11200122, 0x11210121, 0x11210122, 0x11210220, 0x11220020, 0x12200121,
+    0x12210021, 0x12210122, 0x12220121, 0x10211001, 0x10211002, 0x10211101, 0x10211102, 0x10211202,
+    0x10221001, 0x10221102, 0x10221201, 0x11201000, 0x11201002, 0x11201101, 0x11201200, 0x11201202,
+    0x11211001, 0x11211100, 0x11211101, 0x11211102, 0x11211201, 0x11211202, 0x11221000, 0x11221002,
+    0x11221101, 0x12201100, 0x12201101, 0x12201201, 0x12211000, 0x12211002, 0x12211100, 0x12211101,
+    0x12211102, 0x12211200, 0x12211202, 0x12221001, 0x12221100, 0x12221201, 0x10201111, 0x10201210,
+    0x10201212, 0x10211011, 0x10211111, 0x10211112, 0x10211211, 0x11201110, 0x11201111, 0x11201112,
+    0x11201211, 0x11211010, 0x11211011, 0x11211110, 0x11211111, 0x11211112, 0x11211211, 0x11221011,
+    0x11221110, 0x11221111, 0x11221112, 0x11221211, 0x12201112, 0x12201211, 0x12201212, 0x12211011,
+    0x12211111, 0x12211112, 0x12211211, 0x12211212, 0x12221012, 0x12221111, 0x12221112, 0x12221210,
+    0x10201022, 0x10201221, 0x10211121, 0x10221020, 0x10221122, 0x10221220, 0x10221221, 0x11201020,
+    0x11201121, 0x11201220, 0x11201222, 0x11211021, 0x11211120, 0x11211121, 0x11211122, 0x11211220,
+    0x11211222, 0x11221020, 0x11221121, 0x11221220, 0x12201020, 0x12201022, 0x12201121, 0x12201222,
+    0x12211120, 0x12211122, 0x12211220, 0x12211221, 0x12221020, 0x12221120, 0x12221122, 0x12221222,
+    0x10212102, 0x10212201, 0x10222101, 0x11202001, 0x11212002, 0x11212101, 0x11212202, 0x11222001,
+    0x11222201, 0x12202101, 0x12212001, 0x12212200, 0x12222102, 0x10202011, 0x10202110, 0x10212010,
+    0x10212111, 0x10222011, 0x10222110, 0x10222112, 0x10222211, 0x11202010, 0x11202011, 0x11202111,
+    0x11202112, 0x11202210, 0x11212011, 0x11212110, 0x11212111, 0x11212112, 0x11212211, 0x11222010,
+    0x11222111, 0x11222212, 0x12202012, 0x12202110, 0x12202212, 0x12212111, 0x12222011, 0x12222110,
+    0x12222111, 0x12222211, 0x10212021, 0x10212122, 0x10212220, 0x11202021, 0x11202120, 0x11202221,
+    0x11212020, 0x11212121, 0x11212220, 0x11212222, 0x11222120, 0x11222121, 0x11222221, 0x12202122,
+    0x12212120, 0x12212220, 0x12212222, 0x12222122, 0x20000000, 0x20000002, 0x20000200, 0x20000202,
+    0x20020000, 0x20020002, 0x20020200, 0x20020202, 0x21000101, 0x21010000, 0x21010001, 0x21010100,
+    0x21010102, 0x21010201, 0x21020101, 0x22000000, 0x22000002, 0x22000200, 0x22000202, 0x22010101,
+    0x22020000, 0x22020002, 0x22020200, 0x22020202, 0x20000111, 0x20010011, 0x20010110, 0x20010112,
+    0x20010211, 0x20020111, 0x21000011, 0x21000110, 0x21000211, 0x21010010, 0x21010012, 0x21010111,
+    0x21010112, 0x21010210, 0x21010211, 0x21020110, 0x21020112, 0x21020211, 0x22000111, 0x22000211,
+    0x22010110, 0x22010112, 0x22010211, 0x22020111, 0x20000020, 0x20000022, 0x20000220, 0x20000222,
+    0x20010121, 0x20020020, 0x20020022, 0x20020220, 0x20020222, 0x21010021, 0x21010120, 0x21010221,
+    0x21020121, 0x22000020, 0x22000022, 0x22000220, 0x22000222, 0x22010121, 0x22020020, 0x22020022,
+    0x22020220, 0x22020222, 0x20011100, 0x20011201, 0x21001001, 0x21001100, 0x21011001, 0x21011101,
+    0x21011202, 0x21021001, 0x21021100, 0x21021201, 0x22011100, 0x22011201, 0x20001011, 0x20001211,
+    0x20011012, 0x20011111, 0x20011212, 0x20021112, 0x20021211, 0x21001010, 0x21001011, 0x21001111,
+    0x21001210, 0x21011011, 0x21011110, 0x21011111, 0x21011112, 0x21011211, 0x21011212, 0x21021111,
+    0x21021112, 0x21021210, 0x21021212, 0x22001011, 0x22001110, 0x22001112, 0x22001211, 0x22011010,
+    0x22011012, 0x22011111, 0x22011210, 0x22021112, 0x20011021, 0x20011122, 0x20011221, 0x20021121,
+    0x21001021, 0x21001120, 0x21001221, 0x21001222, 0x21011020, 0x21011121, 0x21011221, 0x21011222,
+    0x21021021, 0x21021122, 0x21021222, 0x22001121, 0x22011021, 0x22011222, 0x22021120, 0x20002000,
+    0x20002002, 0x20002200, 0x20002202, 0x20012101, 0x20022000, 0x20022002, 0x20022200, 0x20022202,
+    0x21002001, 0x21002101, 0x21012001, 0x21012100, 0x21012201, 0x21022101, 0x21022201, 0x22002000,
+    0x22002002, 0x22002200, 0x22002202, 0x22012101, 0x22022000, 0x22022002, 0x22022200, 0x22022202,
+    0x20002111, 0x20002112, 0x20012011, 0x20012110, 0x20012112, 0x20022111, 0x21002011, 0x21002110,
+    0x21002112, 0x21002211, 0x21012010, 0x21012012, 0x21012111, 0x21012212, 0x21022011, 0x21022110,
+    0x22002111, 0x22012112, 0x22012211, 0x22022111, 0x20002020, 0x20002022, 0x20002220, 0x20002222,
+    0x20012121, 0x20022020, 0x20022022, 0x20022220, 0x20022222, 0x21002121, 0x21012021, 0x21012120,
+    0x21012122, 0x22002020, 0x22002022, 0x22002220, 0x22002222, 0x22012121, 0x22022020, 0x22022022,
+    0x22022220, 0x22022222, 0x20100101, 0x20110001, 0x20110102, 0x20110200, 0x20110201, 0x20120101,
+    0x21100001, 0x21100102, 0x21100201, 0x21110101, 0x21110200, 0x21110202, 0x21120201, 0x21120202,
+    0x22100101, 0x22110001, 0x22110100, 0x22110102, 0x22110201, 0x22120101, 0x20100011, 0x20100110,
+    0x20100112, 0x20100211, 0x20110010, 0x20110111, 0x20110210, 0x20110212, 0x20120011, 0x20120110,
+    0x20120112, 0x20120211, 0x21100010, 0x21100111, 0x21110010, 0x21110011, 0x21110110, 0x21110111,
+    0x21110112, 0x21110211, 0x21120012, 0x21120111, 0x22100110, 0x22100112, 0x22110012, 0x22110111,
+    0x22110210, 0x22120011, 0x22120110, 0x22120112, 0x22120211, 0x20100121, 0x20110021, 0x20110120,
+    0x20110221, 0x20120121, 0x21100120, 0x21100122, 0x21100221, 0x21110020, 0x21110022, 0x21110121,
+    0x21110220, 0x21120122, 0x21120221, 0x22100121, 0x22110120, 0x22110122, 0x22120221, 0x20101001,
+    0x20101100, 0x20101102, 0x20111000, 0x20111101, 0x20111200, 0x20121102, 0x21101000, 0x21101202,
+    0x21111001, 0x21111100, 0x21111101, 0x21111102, 0x21111200, 0x21111201, 0x21121000, 0x21121001,
+    0x21121002, 0x21121101, 0x22101100, 0x22101102, 0x22111002, 0x22111100, 0x22111101, 0x22111200,
+    0x22121001, 0x22121201, 0x20101010, 0x20101111, 0x20101210, 0x20101212, 0x20111010, 0x20111011,
+    0x20111110, 0x20111111, 0x20111112, 0x20111211, 0x20121011, 0x20121111, 0x20121211, 0x20121212,
+    0x21101011, 0x21101110, 0x21101111, 0x21101112, 0x21101211, 0x21111010, 0x21111011, 0x21111012,
+    0x21111110, 0x21111111, 0x21111112, 0x21111210, 0x21111211, 0x21111212, 0x21121011, 0x21121110,
+    0x21121111, 0x21121112, 0x21121211, 0x22101011, 0x22101111, 0x22101210, 0x22111011, 0x22111012,
+    0x22111110, 0x22111111, 0x22111112, 0x22111211, 0x22111212, 0x22121010, 0x22121012, 0x22121111,
+    0x22121210, 0x22121212, 0x20101021, 0x20101120, 0x20111020, 0x20111121, 0x20111221, 0x20121020,
+    0x20121122, 0x20121221, 0x21101121, 0x21101220, 0x21101221, 0x21111021, 0x21111022, 0x21111121,
+    0x21111122, 0x21111221, 0x21121121, 0x21121220, 0x22101022, 0x22101120, 0x22101221, 0x22101222,
+    0x22111022, 0x22111120, 0x22111121, 0x22121120, 0x22121122, 0x22121221, 0x20102101, 0x20112102,
+    0x20112201, 0x20122101, 0x21102001, 0x21102102, 0x21112000, 0x21112002, 0x21112101, 0x21112102,
+    0x21112202, 0x21122100, 0x21122101, 0x22102101, 0x22112001, 0x22112102, 0x22112201, 0x22122101,
+    0x20102110, 0x20102112, 0x20102211, 0x20112010, 0x20112012, 0x20112111, 0x20112210, 0x20112212,
+    0x20122010, 0x20122011, 0x20122110, 0x20122112, 0x21102010, 0x21102012, 0x21102111, 0x21102210,
+    0x21102212, 0x21112011, 0x21112110, 0x21112111, 0x21112112, 0x21112211, 0x21122012, 0x21122111,
+    0x21122112, 0x21122212, 0x22102011, 0x22102110, 0x22112010, 0x22112012, 0x22112111, 0x22112212,
+    0x22122011, 0x22122112, 0x20102121, 0x20112121, 0x20122121, 0x21102120, 0x21102122, 0x21102221,
+    0x21112020, 0x21112121, 0x21112220, 0x21122021, 0x22102121, 0x22112021, 0x22112120, 0x22112121,
+    0x22112122, 0x20200000, 0x20200002, 0x20200200, 0x20200202, 0x20210101, 0x20220000, 0x20220002,
+    0x20220200, 0x20220202, 0x21200101, 0x21210001, 0x21210100, 0x21210102, 0x21210201, 0x22200000,
+    0x22200002, 0x22200200, 0x22200202, 0x22210101, 0x22220000, 0x22220002, 0x22220200, 0x22220202,
+    0x20200111, 0x20200211, 0x20210011, 0x20210110, 0x20210112, 0x20210211, 0x20210212, 0x21200112,
+    0x21200211, 0x21210011, 0x21210111, 0x21210210, 0x21210212, 0x21220011, 0x21220110, 0x22200111,
+    0x22210010, 0x22210012, 0x22210112, 0x22210211, 0x20200022, 0x20200220, 0x20200222, 0x20210020,
+    0x20210221, 0x20220022, 0x20220220, 0x20220222, 0x21200121, 0x21210021, 0x21210122, 0x21210221,
+    0x21220121, 0x22200020, 0x22200022, 0x22200220, 0x22200222, 0x22210121, 0x22220020, 0x22220022,
+    0x22220220, 0x22220222, 0x20211201, 0x20221101, 0x21201001, 0x21201100, 0x21211000, 0x21211100,
+    0x21211101, 0x21211200, 0x21211202, 0x21221001, 0x21221101, 0x21221102, 0x21221200, 0x21221201,
+    0x22201101, 0x20201112, 0x20201211, 0x20211010, 0x20211012, 0x20211111, 0x20211210, 0x20221112,
+    0x20221211, 0x21201012, 0x21201111, 0x21211011, 0x21211110, 0x21211111, 0x21211112, 0x21211211,
+    0x21221111, 0x21221212, 0x22201011, 0x22201110, 0x22201111, 0x22201112, 0x22201211, 0x22211012,
+    0x22211111, 0x22211210, 0x20201121, 0x20211021, 0x20211122, 0x20211222, 0x20221021, 0x20221121,
+    0x21201120, 0x21201122, 0x21201222, 0x21211022, 0x21211121, 0x21211122, 0x21211220, 0x21221020,
+    0x21221022, 0x22201122, 0x22211020, 0x22211121, 0x22211122, 0x22211221, 0x22221021, 0x22221120,
+    0x22221122, 0x20202000, 0x20202002, 0x20202200, 0x20202202, 0x20222000, 0x20222002, 0x20222200,
+    0x20222202, 0x21212001, 0x21212100, 0x21212102, 0x21212201, 0x22202000, 0x22202002, 0x22202200,
+    0x22202202, 0x22212101, 0x22222000, 0x22222002, 0x22222200, 0x22222202, 0x20202111, 0x20212110,
+    0x20212211, 0x20222011, 0x20222111, 0x21202011, 0x21212010, 0x21212111, 0x21212212, 0x21222011,
+    0x21222112, 0x21222211, 0x22212010, 0x22212112, 0x20202020, 0x20202022, 0x20202220, 0x20202222,
+    0x20222020, 0x20222022, 0x20222220, 0x20222222, 0x21212021, 0x21212120, 0x21212122, 0x22202020,
+    0x22202022, 0x22202220, 0x22202222, 0x22212121, 0x22222020, 0x22222022, 0x22222220, 0x22222222,
 };

 static const __device__ uint8_t ksigns_iq2xs[128] = {
--- a/csrc/quantization/gguf/gguf_kernel.cu
+++ b/csrc/quantization/gguf/gguf_kernel.cu
@ -166,6 +166,11 @@ torch::Tensor ggml_mul_mat_vec_a8(torch::Tensor W,  // quant weight
                                   (void*)quant_X.data_ptr(),
                                   (half*)Y.data_ptr(), col, row, stream);
      break;
+    case 29:
+      mul_mat_vec_iq1_m_q8_1_cuda((void*)W.data_ptr(),
+                                  (void*)quant_X.data_ptr(),
+                                  (half*)Y.data_ptr(), col, row, stream);
+      break;
  }
  return Y;
 }
--- a/csrc/quantization/gguf/mmvq.cuh
+++ b/csrc/quantization/gguf/mmvq.cuh
@ -157,6 +157,14 @@ static void mul_mat_vec_iq1_s_q8_1_cuda(const void * vx, const void * vy, half *
        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
 }

+// Host-side launcher for the IQ1_M x Q8_1 matrix-vector kernel.
+// Instantiates mul_mat_vec_q with block_iq1_m / vec_dot_iq1_m_q8_1 and
+// enqueues it on `stream`.
+//   vx, vy       - opaque input buffers, forwarded unchanged to the kernel
+//   dst          - half-precision output buffer
+//   ncols, nrows - forwarded to the kernel; nrows also sizes the launch grid
+static void mul_mat_vec_iq1_m_q8_1_cuda(const void * vx, const void * vy, half * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    // Ceil-divide so a trailing partial group of rows still gets a block.
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    // The grid is one-dimensional: the block count goes in the x component.
+    const dim3 block_nums(block_num_y, 1, 1);
+    // Each block is WARP_SIZE threads wide (x) by GGML_CUDA_MMV_Y deep (y).
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK_K, QI1_M, block_iq1_m, 1, vec_dot_iq1_m_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
 static void mul_mat_vec_iq4_nl_q8_1_cuda(const void * vx, const void * vy, half * dst, const int ncols, const int nrows, cudaStream_t stream) {
    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
    const dim3 block_nums(block_num_y, 1, 1);
--- a/csrc/quantization/gguf/vecdotq.cuh
+++ b/csrc/quantization/gguf/vecdotq.cuh
@ -1,5 +1,18 @@
 // copied and adapted from https://github.com/ggerganov/llama.cpp/blob/b2899/ggml-cuda/vecdotq.cuh
 // and https://github.com/ggerganov/llama.cpp/blob/b2899/ggml-cuda/mmq.cu
+// Returns the i32-th 32-bit word of `x`, assembled from two 16-bit reads so
+// that `x` only has to be 2-byte aligned. Element 2*i32 supplies the low
+// 16 bits and element 2*i32 + 1 the high 16 bits.
+static __device__ __forceinline__ int get_int_b2(const void * x, const int & i32) {
+    const uint16_t * x16 = (const uint16_t *) x; // assume at least 2 byte alignment
+
+    int x32  = x16[2*i32 + 0] <<  0;
+    x32     |= x16[2*i32 + 1] << 16;
+
+    return x32;
+}
+
+// Returns the i32-th 32-bit word of `x` with a single int-sized read.
+// Use get_int_b2 instead when `x` is only guaranteed 2-byte alignment.
+static __device__ __forceinline__ int get_int_b4(const void * x, const int & i32) {
+    return ((const int *) x)[i32]; // assume at least 4 byte alignment
+}
+
 static __device__ __forceinline__ int get_int_from_int8(const int8_t * x8, const int & i32) {
    const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
    int x32 = 0;
@ -1661,24 +1674,76 @@ static __device__ __forceinline__ float vec_dot_iq1_s_q8_1(
 #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610
    const block_iq1_s * bq1 = (const block_iq1_s *) vbq;

-    const int ib32 = iqs;
-    int sumi1 = 0, sumi2 = 0, sumi3 = 0, sumi4 = 0;
-    const uint8_t h1 = bq1->scales[2*ib32+0];
-    const uint8_t h2 = bq1->scales[2*ib32+1];
-    const int * q8 = (const int *)bq8_1[ib32].qs;
-    const int * grid1 = (const int *)(iq1s_grid + (bq1->qs[4*ib32+0] | ((h1 & 0x08) << 5)));
-    const int * grid2 = (const int *)(iq1s_grid + (bq1->qs[4*ib32+1] | ((h1 & 0x80) << 1)));
-    const int * grid3 = (const int *)(iq1s_grid + (bq1->qs[4*ib32+2] | ((h2 & 0x08) << 5)));
-    const int * grid4 = (const int *)(iq1s_grid + (bq1->qs[4*ib32+3] | ((h2 & 0x80) << 1)));
-    for (int j = 0; j < 2; ++j) {
-        sumi1 = __dp4a(q8[j+0], grid1[j], sumi1);
-        sumi2 = __dp4a(q8[j+2], grid2[j], sumi2);
-        sumi3 = __dp4a(q8[j+4], grid3[j], sumi3);
-        sumi4 = __dp4a(q8[j+6], grid4[j], sumi4);
+    const int       qs_packed = get_int_b2(bq1->qs, iqs);
+    const uint8_t * qs        = (const uint8_t *) &qs_packed;
+
+    const int qh = bq1->qh[iqs];
+
+    int sumi = 0;
+#pragma unroll
+    for (int l0 = 0; l0 < 8; l0 += 2) {
+        const int grid = iq1s_grid_gpu[qs[l0/2] | (((qh >> 3*(l0/2)) & 0x07) << 8)];
+
+        const int grid0 = (grid >> 0) & 0x0F0F0F0F;
+        const int grid1 = (grid >> 4) & 0x0F0F0F0F;
+
+        const int u0 = get_int_b4(bq8_1[iqs].qs, l0 + 0);
+        const int u1 = get_int_b4(bq8_1[iqs].qs, l0 + 1);
+
+        sumi = __dp4a(grid0, u0, sumi);
+        sumi = __dp4a(grid1, u1, sumi);
    }
-    const float d = __half2float(bq1->d) * __low2float(bq8_1[ib32].ds);
-    return d * (sumi1 * (2*(h1 & 7) + 1) + sumi2 * (2*((h1 >> 4) & 7) + 1) +
-                sumi3 * (2*(h2 & 7) + 1) + sumi4 * (2*((h2 >> 4) & 7) + 1));
+
+    const float  d1q   = __half2float(bq1->d) * (((qh >> 11) & 0x0E) + 1);
+    const float  delta = -1.0f + IQ1S_DELTA - (qh & 0x8000) * (2.0f*IQ1S_DELTA/0x8000);
+    const float2 ds    = __half22float2(bq8_1[iqs].ds);
+    return d1q * (ds.x*sumi + ds.y*delta);
+#endif
+}
+
+// Partial dot product between an IQ1_M quantized block (`vbq`) and Q8_1 data
+// (`bq8_1`). `iqs` selects which 32-bit word of bq1->qs, which bq8_1 entry and
+// which qh / scale fields this call consumes.
+// Returns the scaled partial sum as a float.
+// NOTE(review): the body is compiled only for __CUDA_ARCH__ >= 610; when the
+// guard is false the function has no return statement - confirm that it is
+// never instantiated for older architectures.
+static __device__ __forceinline__ float vec_dot_iq1_m_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
+#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610
+
+    const block_iq1_m * bq1 = (const block_iq1_m *) vbq;
+
+    // Fetch the four qs bytes for this call with one 32-bit load, then view
+    // them individually; each loop iteration below consumes one byte.
+    const int       qs_packed = get_int_b4(bq1->qs, iqs);
+    const uint8_t * qs        = (const uint8_t *) &qs_packed;
+
+    // Slot 0 accumulates iterations l0 = 0, 2; slot 1 accumulates l0 = 4, 6.
+    // Each slot is later weighted by its own sub-scale (sc0 / sc1).
+    int   sumi[2] = {0};
+    float sumf[2] = {0.0f};
+#pragma unroll
+    for (int l0 = 0; l0 < 8; l0 += 2) {
+        // Two iterations share one qh entry: shift by 0 or 4 so that this
+        // iteration's 4 bits end up in the low nibble.
+        const int qhl = bq1->qh[2*iqs + l0/4] >> (4 * ((l0/2) % 2));
+
+        // 11-bit lookup index: 8 bits from qs plus the low 3 bits of qhl.
+        const int grid = iq1s_grid_gpu[qs[l0/2] | ((qhl & 0x07) << 8)];
+
+        // Split every byte of the table entry into its low and high nibble.
+        const int grid0 = (grid >> 0) & 0x0F0F0F0F;
+        const int grid1 = (grid >> 4) & 0x0F0F0F0F;
+
+        // Two consecutive 32-bit words of the Q8_1 payload.
+        const int u0 = get_int_b4(bq8_1[iqs].qs, l0 + 0);
+        const int u1 = get_int_b4(bq8_1[iqs].qs, l0 + 1);
+
+        sumi[l0/4] = __dp4a(grid0, u0, sumi[l0/4]);
+        sumi[l0/4] = __dp4a(grid1, u1, sumi[l0/4]);
+
+        // Bit 3 of qhl picks the offset: -1 + IQ1M_DELTA when clear,
+        // -1 - IQ1M_DELTA when set.
+        const float delta = -1.0f + IQ1M_DELTA - (qhl & 0x08) * (2.0f*IQ1M_DELTA/0x08);
+        // dp4a against 0x01010101 adds up the four bytes of a word, so sumy is
+        // the plain sum of the eight Q8_1 values used in this iteration.
+        int sumy = 0;
+        sumy = __dp4a(u0, 0x01010101, sumy);
+        sumy = __dp4a(u1, 0x01010101, sumy);
+        sumf[l0/4] += delta*sumy;
+    }
+
+    const uint16_t * sc = (const uint16_t *) bq1->scales;
+
+    // Rebuild a 16-bit value from the top 4 bits of each of the four scale
+    // words (sc[0] -> bits 0-3, sc[1] -> 4-7, sc[2] -> 8-11, sc[3] -> 12-15)
+    // and read it back as an f16.
+    // NOTE(review): presumably iq1m_scale_t is a u16/f16 union declared
+    // elsewhere - confirm.
+    iq1m_scale_t scale;
+    scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00F0) | ((sc[2] >> 4) & 0x0F00) | (sc[3] & 0xF000);
+    const float d = __half2float(scale.f16) * __low2float(bq8_1[iqs].ds);
+
+    // sc[iqs/2] holds two 3-bit sub-scales for this call: bits 0-5 when iqs is
+    // even, bits 6-11 when it is odd. Each maps to an odd weight in 1..15.
+    const int tmp = sc[iqs/2] >> (6*(iqs%2));
+    const int sc0 = 2*((tmp >> 0) & 0x07) + 1;
+    const int sc1 = 2*((tmp >> 3) & 0x07) + 1;
+    return d * ((sumi[0] + sumf[0]) * sc0 + (sumi[1] + sumf[1]) * sc1);
 #endif
 }

--- a/csrc/quantization/machete/generate.py
+++ b/csrc/quantization/machete/generate.py
@ -157,7 +157,7 @@ TmaMI = MixedInputKernelScheduleType.TmaWarpSpecializedCooperativeMixedInput
 TmaCoop = EpilogueScheduleType.TmaWarpSpecializedCooperative


-@dataclass
+@dataclass(frozen=True)
 class ScheduleConfig:
    tile_shape_mn: Tuple[int, int]
    cluster_shape_mnk: Tuple[int, int, int]
@ -328,56 +328,137 @@ def generate():
    # about how this works
    SCRIPT_DIR = os.path.dirname(__file__)

-    schedules = [
-        ScheduleConfig(
-            tile_shape_mn=tile_shape_mn,
-            cluster_shape_mnk=cluster_shape_mnk,
-            kernel_schedule=kernel_schedule,
-            epilogue_schedule=epilogue_schedule,
-            tile_scheduler=tile_scheduler,
-        ) for tile_shape_mn, cluster_shape_mnk in (
-            ((128, 16), (1, 1, 1)),
-            ((128, 32), (1, 1, 1)),
-            ((128, 64), (1, 1, 1)),
-            ((128, 128), (1, 1, 1)),
-        ) for kernel_schedule in (TmaMI, ) for epilogue_schedule in (TmaCoop, )
-        for tile_scheduler in (TileSchedulerType.StreamK, )
-    ]
+    schedule_common_params = dict(
+        kernel_schedule=TmaMI,
+        epilogue_schedule=TmaCoop,
+        tile_scheduler=TileSchedulerType.StreamK,
+    )

    # For now we use the same heuristic for all types
+    # Heuristic is currently tuned for H100s
    default_heuristic = [
-        ("M > 64",
-         ScheduleConfig(
-             tile_shape_mn=(128, 128),
-             cluster_shape_mnk=(1, 1, 1),
-             kernel_schedule=TmaMI,
-             epilogue_schedule=TmaCoop,
-             tile_scheduler=TileSchedulerType.StreamK,
-         )),
-        ("M > 32",
-         ScheduleConfig(
-             tile_shape_mn=(128, 64),
-             cluster_shape_mnk=(1, 1, 1),
-             kernel_schedule=TmaMI,
-             epilogue_schedule=TmaCoop,
-             tile_scheduler=TileSchedulerType.StreamK,
-         )),
-        ("M > 16",
-         ScheduleConfig(
-             tile_shape_mn=(128, 32),
-             cluster_shape_mnk=(1, 1, 1),
-             kernel_schedule=TmaMI,
-             epilogue_schedule=TmaCoop,
-             tile_scheduler=TileSchedulerType.StreamK,
-         )),
-        (None,
-         ScheduleConfig(tile_shape_mn=(128, 16),
-                        cluster_shape_mnk=(1, 1, 1),
-                        kernel_schedule=TmaMI,
-                        epilogue_schedule=TmaCoop,
-                        tile_scheduler=TileSchedulerType.StreamK))
+        #### M = 257+
+        (
+            "M > 256 && K <= 16384 && N <= 4096",
+            ScheduleConfig(
+                tile_shape_mn=(128, 128),
+                cluster_shape_mnk=(2, 1, 1),
+                **schedule_common_params  # type: ignore
+            )),
+        (
+            "M > 256",
+            ScheduleConfig(
+                tile_shape_mn=(128, 256),
+                cluster_shape_mnk=(2, 1, 1),
+                **schedule_common_params  # type: ignore
+            )),
+        #### M = 129-256
+        (
+            "M > 128 && K <= 4096 && N <= 4096",
+            ScheduleConfig(
+                tile_shape_mn=(128, 64),
+                cluster_shape_mnk=(2, 1, 1),
+                **schedule_common_params  # type: ignore
+            )),
+        (
+            "M > 128 && K <= 8192 && N <= 8192",
+            ScheduleConfig(
+                tile_shape_mn=(128, 128),
+                cluster_shape_mnk=(2, 1, 1),
+                **schedule_common_params  # type: ignore
+            )),
+        (
+            "M > 128",
+            ScheduleConfig(
+                tile_shape_mn=(128, 256),
+                cluster_shape_mnk=(2, 1, 1),
+                **schedule_common_params  # type: ignore
+            )),
+        #### M = 65-128
+        # NOTE(review): 4069 in the two conditions below looks like a
+        # transposition of 4096 (every other threshold in this table is a
+        # multiple of 1024). As written, K == 4096 or N == 4096 skips both
+        # entries - confirm against the tuning data before changing.
+        (
+            "M > 64 && K <= 4069 && N <= 4069",
+            ScheduleConfig(
+                tile_shape_mn=(128, 32),
+                cluster_shape_mnk=(2, 1, 1),
+                **schedule_common_params  # type: ignore
+            )),
+        (
+            "M > 64 && K <= 4069 && N <= 8192",
+            ScheduleConfig(
+                tile_shape_mn=(128, 64),
+                cluster_shape_mnk=(2, 1, 1),
+                **schedule_common_params  # type: ignore
+            )),
+        (
+            "M > 64 && K >= 8192 && N >= 12288",
+            ScheduleConfig(
+                tile_shape_mn=(256, 128),
+                cluster_shape_mnk=(2, 1, 1),
+                **schedule_common_params  # type: ignore
+            )),
+        (
+            "M > 64",
+            ScheduleConfig(
+                tile_shape_mn=(128, 128),
+                cluster_shape_mnk=(2, 1, 1),
+                **schedule_common_params  # type: ignore
+            )),
+        #### M = 33-64
+        (
+            "M > 32 && K <= 6144 && N <= 6144",
+            ScheduleConfig(
+                tile_shape_mn=(128, 16),
+                cluster_shape_mnk=(1, 1, 1),
+                **schedule_common_params  # type: ignore
+            )),
+        (
+            "M > 32 && K >= 16384 && N >= 12288",
+            ScheduleConfig(
+                tile_shape_mn=(256, 64),
+                cluster_shape_mnk=(2, 1, 1),
+                **schedule_common_params  # type: ignore
+            )),
+        (
+            "M > 32",
+            ScheduleConfig(
+                tile_shape_mn=(128, 64),
+                cluster_shape_mnk=(2, 1, 1),
+                **schedule_common_params  # type: ignore
+            )),
+        #### M = 17-32
+        (
+            "M > 16 && K <= 12288 && N <= 8192",
+            ScheduleConfig(
+                tile_shape_mn=(128, 32),
+                cluster_shape_mnk=(2, 1, 1),
+                **schedule_common_params  # type: ignore
+            )),
+        (
+            "M > 16",
+            ScheduleConfig(
+                tile_shape_mn=(256, 32),
+                cluster_shape_mnk=(2, 1, 1),
+                **schedule_common_params  # type: ignore
+            )),
+        #### M = 1-16
+        (
+            "N >= 26624",
+            ScheduleConfig(
+                tile_shape_mn=(256, 16),
+                cluster_shape_mnk=(1, 1, 1),
+                **schedule_common_params  # type: ignore
+            )),
+        (
+            None,
+            ScheduleConfig(
+                tile_shape_mn=(128, 16),
+                cluster_shape_mnk=(1, 1, 1),
+                **schedule_common_params  # type: ignore
+            )),
    ]

+    schedules = list(set([x[1] for x in default_heuristic]))
+
    impl_configs = []

    GPTQ_kernel_type_configs = list(
--- a/csrc/quantization/machete/machete_mm_kernel.cuh
+++ b/csrc/quantization/machete/machete_mm_kernel.cuh
@ -152,7 +152,8 @@ struct MacheteKernelTemplate {

    int M = size<0>(layout_A), N = size<1>(layout_D), K = size<1>(layout_A);

-    int const group_size = maybe_group_size.value_or(K);
+    int const group_size =
+        maybe_group_size == -1 ? K : maybe_group_size.value_or(K);
    int const scale_k = (K + group_size - 1) / group_size;

    TORCH_CHECK(size<0>(layout_A) == M && size<1>(layout_A) == K);
--- a/csrc/quantization/machete/machete_mm_launcher.cuh
+++ b/csrc/quantization/machete/machete_mm_launcher.cuh
@ -71,7 +71,7 @@ torch::Tensor run_impl(PyTorchArguments args) {
  auto arguments = MacheteKernel::create_arguments(
      stream, A_ptr, layout_A, B_ptr, D_ptr, layout_D, C_ptr, layout_C, S_ptr,
      layout_S, Z_ptr, layout_Z, args.alpha.value_or(1), args.beta.value_or(0),
-      args.group_size.value_or(K));
+      args.group_size);
  TORCH_CHECK(MacheteKernel::can_implement(arguments),
              "Machete kernel cannot be run with these arguments");

--- a/csrc/quantization/machete/machete_prepack_launcher.cuh
+++ b/csrc/quantization/machete/machete_prepack_launcher.cuh
@ -53,7 +53,7 @@ torch::Tensor prepack_impl(torch::Tensor const B) {
  // clang-format on

  // Allocate output
-  torch::Tensor D = torch::empty_like(B);
+  torch::Tensor D = torch::empty_like(B, {}, at::MemoryFormat::Contiguous);

  prepack_B<PrepackedLayoutB>(stream, B_ptr, layout_Bt,
                              static_cast<ElementB*>(D.mutable_data_ptr()));
--- a/csrc/rocm/attention.cu
+++ b/csrc/rocm/attention.cu
--- a/csrc/rocm/ops.h
+++ b/csrc/rocm/ops.h
@ -0,0 +1,14 @@
+#pragma once
+
+#include <torch/all.h>
+
+// ROCm PagedAttention entry point: computes the attention between an input
+// query and the cached keys/values. Bound to the `paged_attention` custom op
+// in csrc/rocm/torch_bindings.cpp; the parameter list here must stay in sync
+// with the schema string registered there. In that schema only `out` is
+// annotated as mutated, and `alibi_slopes` is the only optional tensor.
+void paged_attention(torch::Tensor& out, torch::Tensor& exp_sums,
+                     torch::Tensor& max_logits, torch::Tensor& tmp_out,
+                     torch::Tensor& query, torch::Tensor& key_cache,
+                     torch::Tensor& value_cache, int64_t num_kv_heads,
+                     double scale, torch::Tensor& block_tables,
+                     torch::Tensor& context_lens, int64_t block_size,
+                     int64_t max_context_len,
+                     const c10::optional<torch::Tensor>& alibi_slopes,
+                     const std::string& kv_cache_dtype, double k_scale,
+                     double v_scale);
--- a/csrc/rocm/torch_bindings.cpp
+++ b/csrc/rocm/torch_bindings.cpp
@ -0,0 +1,34 @@
+#include "core/registration.h"
+#include "rocm/ops.h"
+
+// Note on op signatures:
+// The X_meta signatures are for the meta functions corresponding to op X.
+// They must be kept in sync with the signature for X. Generally, only
+// functions that return Tensors require a meta function.
+//
+// See the following links for detailed docs on op registration and function
+// schemas.
+// https://docs.google.com/document/d/1_W62p8WJOQQUzPsJYa7s701JXt0qf2OfLub2sbkHOaU/edit#heading=h.ptttacy8y1u9
+// https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/README.md#annotations
+
+TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, rocm_ops) {
+  // vLLM custom ops for rocm
+
+  // Custom attention op
+  // Compute the attention between an input query and the cached
+  // keys/values using PagedAttention.
+  // The schema below must match the C++ declaration in rocm/ops.h
+  // (int -> int64_t, float -> double, Tensor? -> c10::optional<Tensor>).
+  // NOTE(review): only `out` is annotated as mutated (`Tensor!`). exp_sums,
+  // max_logits and tmp_out look like scratch outputs that the kernel may
+  // also write - confirm, and annotate them with `!` if so.
+  rocm_ops.def(
+      "paged_attention(Tensor! out, Tensor exp_sums,"
+      "                Tensor max_logits, Tensor tmp_out,"
+      "                Tensor query, Tensor key_cache,"
+      "                Tensor value_cache, int num_kv_heads,"
+      "                float scale, Tensor block_tables,"
+      "                Tensor context_lens, int block_size,"
+      "                int max_context_len,"
+      "                Tensor? alibi_slopes,"
+      "                str kv_cache_dtype,"
+      "                float k_scale, float v_scale) -> ()");
+  // Dispatch key is kCUDA for this ROCm-only library.
+  rocm_ops.impl("paged_attention", torch::kCUDA, &paged_attention);
+}
+
+REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
@ -192,6 +192,9 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
      "-> Tensor");
  ops.impl("machete_prepack_B", torch::kCUDA, &machete::prepack_B);

+  ops.def("permute_cols(Tensor A, Tensor perm) -> Tensor");
+  ops.impl("permute_cols", torch::kCUDA, &permute_cols);
+
  // gptq_marlin Optimized Quantized GEMM for GPTQ.
  ops.def(
      "gptq_marlin_gemm(Tensor a, Tensor b_q_weight, Tensor b_scales, "
@ -272,15 +275,16 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
      "Tensor! A, Tensor! B, Tensor! C,"
      "Tensor? D_, Tensor? z_, Tensor? delta_bias_,"
      "bool delta_softplus,"
-      "Tensor? index_, Tensor(a! -> *)? x) -> Tensor(a)[]");
+      "Tensor? index_, Tensor!? x) -> Tensor[]");
  ops.impl("selective_scan_fwd", torch::kCUDA, &selective_scan_fwd);

  ops.def(
      "causal_conv1d_update(Tensor! x,"
      "Tensor! conv_state,"
      "Tensor! weight,"
-      "Tensor? bias_,"
-      "bool silu_activation) -> Tensor");
+      "Tensor? bias,"
+      "bool silu_activation,"
+      "Tensor? conv_state_indices) -> Tensor");
  ops.impl("causal_conv1d_update", torch::kCUDA, &causal_conv1d_update);

  ops.def(
@ -288,7 +292,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
      "Tensor? bias_,"
      "Tensor? seq_idx_,"
      "Tensor? initial_states_,"
-      "Tensor? final_states_out_,"
+      "Tensor!? final_states_out_,"
      "bool silu_activation) -> Tensor");
  ops.impl("causal_conv1d_fwd", torch::kCUDA, &causal_conv1d_fwd);
 #endif
@ -336,14 +340,14 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {

  // Compute int8 quantized tensor for given scaling factor.
  ops.def(
-      "static_scaled_int8_quant(Tensor! out, Tensor input, Tensor scale) -> "
-      "()");
+      "static_scaled_int8_quant(Tensor! out, Tensor input, Tensor scale,"
+      "Tensor? azp) -> ()");
  ops.impl("static_scaled_int8_quant", torch::kCUDA, &static_scaled_int8_quant);

  // Compute int8 quantized tensor and scaling factor
  ops.def(
-      "dynamic_scaled_int8_quant(Tensor! out, Tensor input, Tensor! scale) -> "
-      "()");
+      "dynamic_scaled_int8_quant(Tensor! out, Tensor input, Tensor! scale, "
+      "Tensor!? azp) -> ()");
  ops.impl("dynamic_scaled_int8_quant", torch::kCUDA,
           &dynamic_scaled_int8_quant);
 }
@ -411,11 +415,6 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _custom_ar), custom_ar) {
      "bool full_nvlink) -> int");
  custom_ar.impl("init_custom_ar", torch::kCUDA, &init_custom_ar);

-  custom_ar.def(
-      "should_custom_ar(Tensor inp, int max_size, int world_size, "
-      "bool full_nvlink) -> bool");
-  custom_ar.impl("should_custom_ar", torch::kCUDA, &should_custom_ar);
-
  custom_ar.def("all_reduce_reg(int fa, Tensor inp, Tensor! out) -> ()");
  custom_ar.impl("all_reduce_reg", torch::kCUDA, &all_reduce_reg);

--- a/docs/source/dev/profiling/profiling_index.rst
+++ b/docs/source/dev/profiling/profiling_index.rst
@ -21,8 +21,8 @@ Traces can be visualized using https://ui.perfetto.dev/.
 .. tip::

   To stop the profiler - it flushes out all the profile trace files to the directory. This takes time, for example for about 100 requests worth of data for a llama 70b, it takes about 10 minutes to flush out on a H100.
-   Set the env variable VLLM_RPC_GET_DATA_TIMEOUT_MS to a big number before you start the server. Say something like 30 minutes.
-   ``export VLLM_RPC_GET_DATA_TIMEOUT_MS=1800000``
+   Set the env variable VLLM_RPC_TIMEOUT to a big number before you start the server. Say something like 30 minutes.
+   ``export VLLM_RPC_TIMEOUT=1800000``
  
 Example commands and usage:
 ===========================
--- a/docs/source/getting_started/amd-installation.rst
+++ b/docs/source/getting_started/amd-installation.rst
@ -3,15 +3,17 @@
 Installation with ROCm
 ======================

-vLLM supports AMD GPUs with ROCm 6.1.
+vLLM supports AMD GPUs with ROCm 6.2.

 Requirements
 ------------

 * OS: Linux
-* Python: 3.8 -- 3.11
+* Python: 3.9 -- 3.12
 * GPU: MI200s (gfx90a), MI300 (gfx942), Radeon RX 7900 series (gfx1100)
-* ROCm 6.1
+* ROCm 6.2
+
+Note: PyTorch 2.5+/ROCm 6.2 dropped support for Python 3.8.

 Installation options:

@ -26,8 +28,18 @@ Option 1: Build from source with docker (recommended)
 You can build and install vLLM from source.

 First, build a docker image from `Dockerfile.rocm <https://github.com/vllm-project/vllm/blob/main/Dockerfile.rocm>`_ and launch a docker container from the image.
+It is important to run the docker build with BuildKit enabled. Either set DOCKER_BUILDKIT=1 as an environment variable when calling the docker build command, or enable BuildKit in the docker daemon configuration /etc/docker/daemon.json as follows and restart the daemon:

-`Dockerfile.rocm <https://github.com/vllm-project/vllm/blob/main/Dockerfile.rocm>`_ uses ROCm 6.1 by default, but also supports ROCm 5.7 and 6.0 in older vLLM branches.
+.. code-block:: json
+    
+    {
+        "features": {
+            "buildkit": true
+        }
+    }
+
+
+`Dockerfile.rocm <https://github.com/vllm-project/vllm/blob/main/Dockerfile.rocm>`_ uses ROCm 6.2 by default, but also supports ROCm 5.7, 6.0 and 6.1 in older vLLM branches.
 It provides flexibility to customize the build of docker image using the following arguments:

 * `BASE_IMAGE`: specifies the base image used when running ``docker build``, specifically the PyTorch on ROCm base image.
@ -39,13 +51,13 @@ It provides flexibility to customize the build of docker image using the followi
 Their values can be passed in when running ``docker build`` with ``--build-arg`` options.


-To build vllm on ROCm 6.1 for MI200 and MI300 series, you can use the default:
+To build vllm on ROCm 6.2 for MI200 and MI300 series, you can use the default:

 .. code-block:: console

    $ DOCKER_BUILDKIT=1 docker build -f Dockerfile.rocm -t vllm-rocm .

-To build vllm on ROCm 6.1 for Radeon RX7900 series (gfx1100), you should specify ``BUILD_FA`` as below:
+To build vllm on ROCm 6.2 for Radeon RX7900 series (gfx1100), you should specify ``BUILD_FA`` as below:

 .. code-block:: console

@ -79,37 +91,55 @@ Option 2: Build from source

 - `ROCm <https://rocm.docs.amd.com/en/latest/deploy/linux/index.html>`_
 - `PyTorch <https://pytorch.org/>`_
- `hipBLAS <https://rocm.docs.amd.com/projects/hipBLAS/en/latest/install.html>`_

-For installing PyTorch, you can start from a fresh docker image, e.g, `rocm/pytorch:rocm6.1.2_ubuntu20.04_py3.9_pytorch_staging`, `rocm/pytorch-nightly`.
+For installing PyTorch, you can start from a fresh docker image, e.g, `rocm/pytorch:rocm6.2_ubuntu20.04_py3.9_pytorch_release_2.3.0`, `rocm/pytorch-nightly`.

-Alternatively, you can install PyTorch using PyTorch wheels. You can check PyTorch installation guild in PyTorch `Getting Started <https://pytorch.org/get-started/locally/>`_
+Alternatively, you can install PyTorch using PyTorch wheels. You can check PyTorch installation guide in PyTorch `Getting Started <https://pytorch.org/get-started/locally/>`_


 1. Install `Triton flash attention for ROCm <https://github.com/ROCm/triton>`_

 Install ROCm's Triton flash attention (the default triton-mlir branch) following the instructions from `ROCm/triton <https://github.com/ROCm/triton/blob/triton-mlir/README.md>`_

+    .. code-block:: console
+
+        $ python3 -m pip install ninja cmake wheel pybind11
+        $ pip uninstall -y triton 
+        $ git clone https://github.com/OpenAI/triton.git
+        $ cd triton
+        $ git checkout e192dba
+        $ cd python
+        $ pip3 install .
+        $ cd ../..
+
+.. note::
+    - If you see an HTTP error while downloading packages during the triton build, please try again, as the error is intermittent.
+
+
 2. Optionally, if you choose to use CK flash attention, you can install `flash attention for ROCm <https://github.com/ROCm/flash-attention/tree/ck_tile>`_

+
 Install ROCm's flash attention (v2.5.9.post1) following the instructions from `ROCm/flash-attention <https://github.com/ROCm/flash-attention/tree/ck_tile#amd-gpurocm-support>`_
 Alternatively, wheels intended for vLLM use can be accessed under the releases.

+For example, for ROCm 6.2, suppose your gfx arch is `gfx90a`.
+To find your gfx architecture, run `rocminfo |grep gfx`.
+
+    .. code-block:: console
+
+        $ git clone https://github.com/ROCm/flash-attention.git
+        $ cd flash-attention
+        $ git checkout 3cea2fb
+        $ git submodule update --init
+        $ GPU_ARCHS="gfx90a" python3 setup.py install
+        $ cd ..
+
 .. note::
    - You might need to downgrade the "ninja" version to 1.10 it is not used when compiling flash-attention-2 (e.g. `pip install ninja==1.10.2.4`)

 3. Build vLLM.

-.. code-block:: console
-
-    $ cd vllm
-    $ pip install -U -r requirements-rocm.txt
-    $ python setup.py develop # This may take 5-10 minutes. Currently, `pip install .`` does not work for ROCm installation
-
-
-.. tip::
-
-    For example, vLLM v0.5.3 on ROCM 6.1 can be built with the following steps:
+    For example, vLLM on ROCm 6.2 can be built with the following steps:

    .. code-block:: console

@ -117,7 +147,7 @@ Alternatively, wheels intended for vLLM use can be accessed under the releases.

        $ # Install PyTorch
        $ pip uninstall torch -y
-        $ pip install --no-cache-dir --pre torch==2.5.0.dev20240726 --index-url https://download.pytorch.org/whl/nightly/rocm6.1
+        $ pip install --no-cache-dir --pre torch==2.6.0.dev20240918 --index-url https://download.pytorch.org/whl/nightly/rocm6.2

        $ # Build & install AMD SMI
        $ pip install /opt/rocm/share/amd_smi
@ -127,15 +157,14 @@ Alternatively, wheels intended for vLLM use can be accessed under the releases.
        $ pip install "numpy<2"
        $ pip install -r requirements-rocm.txt

-        $ # Apply the patch to ROCM 6.1 (requires root permission)
-        $ wget -N https://github.com/ROCm/vllm/raw/fa78403/rocm_patch/libamdhip64.so.6 -P /opt/rocm/lib
-        $ rm -f "$(python3 -c 'import torch; print(torch.__path__[0])')"/lib/libamdhip64.so*
-
        $ # Build vLLM for MI210/MI250/MI300.
        $ export PYTORCH_ROCM_ARCH="gfx90a;gfx942"
        $ python3 setup.py develop


+    This may take 5-10 minutes. Currently, :code:`pip install .` does not work for ROCm installation.
+
+
 .. tip::

    - Triton flash attention is used by default. For benchmarking purposes, it is recommended to run a warm up step before collecting perf numbers.
--- a/docs/source/getting_started/cpu-installation.rst
+++ b/docs/source/getting_started/cpu-installation.rst
@ -56,7 +56,7 @@ Build from source
 .. code-block:: console

    $ pip install --upgrade pip
-    $ pip install wheel packaging ninja "setuptools>=49.4.0" numpy
+    $ pip install "cmake>=3.26" wheel packaging ninja "setuptools-scm>=8" numpy
    $ pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu

 - Third, build and install oneDNN library from source:
--- a/docs/source/getting_started/debugging.rst
+++ b/docs/source/getting_started/debugging.rst
@ -98,6 +98,13 @@ Here are some common issues that can cause hangs:

    If the script runs successfully, you should see the message ``sanity check is successful!``.

+    Note that a multi-node environment is more complicated than a single-node one. If you see errors such as ``torch.distributed.DistNetworkError``, it is likely that the network/DNS setup is incorrect. In that case, you can manually assign the node rank and specify the IP via command line arguments:
+
+    - In the first node, run ``NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --node-rank 0 --master_addr $MASTER_ADDR test.py``.
+    - In the second node, run ``NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --node-rank 1 --master_addr $MASTER_ADDR test.py``.
+
+    Adjust ``--nproc-per-node``, ``--nnodes``, and ``--node-rank`` according to your setup. The difference is that you need to execute different commands (with different ``--node-rank``) on different nodes.
+
 If the problem persists, feel free to `open an issue on GitHub <https://github.com/vllm-project/vllm/issues/new/choose>`_, with a detailed description of the issue, your environment, and the logs.

 Some known issues:
--- a/docs/source/getting_started/installation.rst
+++ b/docs/source/getting_started/installation.rst
@ -72,6 +72,29 @@ You can also build and install vLLM from source:
    $ cd vllm
    $ pip install -e .  # This may take 5-10 minutes.

+.. note::
+
+    This will uninstall any existing PyTorch and install the version required by vLLM. If you want to use an existing PyTorch installation instead, a few changes to the procedure are needed:
+
+    .. code-block:: console
+
+        $ git clone https://github.com/vllm-project/vllm.git
+        $ cd vllm
+        $ python use_existing_torch.py
+        $ pip install -r requirements-build.txt
+        $ pip install -e . --no-build-isolation
+
+    The differences are:
+
+    - ``python use_existing_torch.py``: This script will remove all the PyTorch versions in the requirements files, so that the existing PyTorch installation will be used.
+    - ``pip install -r requirements-build.txt``: You need to manually install the requirements for building vLLM.
+    - ``pip install -e . --no-build-isolation``: You need to disable build isolation, so that the build system can use the existing PyTorch installation.
+
+    This is especially useful when the PyTorch dependency cannot be easily installed via pip, e.g.:
+
+    - build vLLM with PyTorch nightly or a custom PyTorch build.
+    - build vLLM on aarch64 with CUDA (GH200), where the PyTorch wheels are not available on PyPI. Currently, only PyTorch nightly has wheels for aarch64 with CUDA. You can run ``pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu124`` to install PyTorch nightly, and then build vLLM on top of it.
+
 .. note::

    vLLM can fully run only on Linux, but you can still build it on other systems (for example, macOS). This build is only for development purposes, allowing for imports and a more convenient dev environment. The binaries will not be compiled and not work on non-Linux systems. You can create such a build with the following commands:
@ -95,6 +118,8 @@ You can also build and install vLLM from source:
        $ export MAX_JOBS=6
        $ pip install -e .

+    This is especially useful when you are building on less powerful machines. For example, when you use WSL, it only `gives you half of the memory by default <https://learn.microsoft.com/en-us/windows/wsl/wsl-config>`_, and you'd better use ``export MAX_JOBS=1`` to avoid compiling multiple files simultaneously and running out of memory. The side effect is that the build process will be much slower. If you only touch the Python code, slow compilation is okay, as you are building in an editable mode: you can just change the code and run the Python script without any re-compilation or re-installation.
+
 .. tip::
    If you have trouble building vLLM, we recommend using the NVIDIA PyTorch Docker image.

--- a/docs/source/getting_started/neuron-installation.rst
+++ b/docs/source/getting_started/neuron-installation.rst
@ -3,8 +3,8 @@
 Installation with Neuron
 ========================

-vLLM 0.3.3 onwards supports model inferencing and serving on AWS Trainium/Inferentia with Neuron SDK.
-At the moment Paged Attention is not supported in Neuron SDK, but naive continuous batching is supported in transformers-neuronx.
+vLLM 0.3.3 onwards supports model inferencing and serving on AWS Trainium/Inferentia with Neuron SDK with continuous batching.
+Paged Attention and Chunked Prefill are currently in development and will be available soon.
 Data types currently supported in Neuron SDK are FP16 and BF16.

 Requirements
--- a/docs/source/getting_started/xpu-installation.rst
+++ b/docs/source/getting_started/xpu-installation.rst
@ -17,8 +17,8 @@ Requirements
 ------------

 * OS: Linux
-* Supported Hardware: Intel Data Center GPU (Intel ARC GPU WIP)
-* OneAPI requirements: oneAPI 2024.1 
+* Supported Hardware: Intel Data Center GPU, Intel ARC GPU
+* OneAPI requirements: oneAPI 2024.2 

 .. _xpu_backend_quick_start_dockerfile:

@ -40,7 +40,7 @@ Quick start using Dockerfile
 Build from source
 -----------------

- First, install required driver and intel OneAPI 2024.1 or later.
+- First, install required driver and intel OneAPI 2024.2 or later.

 - Second, install Python packages for vLLM XPU backend building:

--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@ -43,7 +43,7 @@ vLLM is flexible and easy to use with:
 * Tensor parallelism and pipeline parallelism support for distributed inference
 * Streaming outputs
 * OpenAI-compatible API server
-* Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, TPU, and AWS Neuron.
+* Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, TPU, and AWS Trainium and Inferentia Accelerators.
 * Prefix caching support
 * Multi-lora support

@ -107,6 +107,7 @@ Documentation
   quantization/supported_hardware
   quantization/auto_awq
   quantization/bnb
+   quantization/gguf
   quantization/int8
   quantization/fp8
   quantization/fp8_e5m2_kvcache
--- a/docs/source/models/lora.rst
+++ b/docs/source/models/lora.rst
@ -159,3 +159,67 @@ Example request to unload a LoRA adapter:
    -d '{
        "lora_name": "sql_adapter"
    }'
+
+
+New format for `--lora-modules`
+-------------------------------
+
+In the previous version, users would provide LoRA modules via the following format, either as a key-value pair or in JSON format. For example:
+
+.. code-block:: bash
+
+    --lora-modules sql-lora=$HOME/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/
+
+This would only include the `name` and `path` for each LoRA module, but did not provide a way to specify a `base_model_name`.
+Now, you can specify a base_model_name alongside the name and path using JSON format. For example:
+
+.. code-block:: bash
+
+    --lora-modules '{"name": "sql-lora", "path": "/path/to/lora", "base_model_name": "meta-llama/Llama-2-7b"}'
+
+For backward compatibility, you can still use the old key-value format (name=path), but the `base_model_name` will remain unspecified in that case.
+
+
+LoRA model lineage in model card
+--------------------------------
+
+The new format of `--lora-modules` is mainly to support the display of parent model information in the model card. Here is how the response from the models endpoint reflects this:
+
+- The `parent` field of LoRA model `sql-lora` now links to its base model `meta-llama/Llama-2-7b-hf`. This correctly reflects the hierarchical relationship between the base model and the LoRA adapter.
+- The `root` field points to the artifact location of the LoRA adapter.
+
+.. code-block:: bash
+
+    $ curl http://localhost:8000/v1/models
+
+    {
+        "object": "list",
+        "data": [
+            {
+            "id": "meta-llama/Llama-2-7b-hf",
+            "object": "model",
+            "created": 1715644056,
+            "owned_by": "vllm",
+            "root": "~/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-hf/snapshots/01c7f73d771dfac7d292323805ebc428287df4f9/",
+            "parent": null,
+            "permission": [
+                {
+                .....
+                }
+            ]
+            },
+            {
+            "id": "sql-lora",
+            "object": "model",
+            "created": 1715644056,
+            "owned_by": "vllm",
+            "root": "~/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/",
+            "parent": "meta-llama/Llama-2-7b-hf",
+            "permission": [
+                {
+                ....
+                }
+            ]
+            }
+        ]
+    }
--- a/docs/source/models/supported_models.rst
+++ b/docs/source/models/supported_models.rst
@ -107,6 +107,10 @@ Decoder-only Language Models
    - MiniCPM
    - :code:`openbmb/MiniCPM-2B-sft-bf16`, :code:`openbmb/MiniCPM-2B-dpo-bf16`, etc.
    -
+  * - :code:`MiniCPM3ForCausalLM`
+    - MiniCPM3
+    - :code:`openbmb/MiniCPM3-4B`, etc.
+    -
  * - :code:`MistralForCausalLM`
    - Mistral, Mistral-Instruct
    - :code:`mistralai/Mistral-7B-v0.1`, :code:`mistralai/Mistral-7B-Instruct-v0.1`, etc.
@ -123,6 +127,10 @@ Decoder-only Language Models
    - Nemotron-3, Nemotron-4, Minitron
    - :code:`nvidia/Minitron-8B-Base`, :code:`mgoin/Nemotron-4-340B-Base-hf-FP8`, etc.
    - ✅︎
+  * - :code:`OLMoEForCausalLM`
+    - OLMoE
+    - :code:`allenai/OLMoE-1B-7B-0924`, :code:`allenai/OLMoE-1B-7B-0924-Instruct`, etc.
+    -
  * - :code:`OLMoForCausalLM`
    - OLMo
    - :code:`allenai/OLMo-1B-hf`, :code:`allenai/OLMo-7B-hf`, etc.
@ -175,6 +183,10 @@ Decoder-only Language Models
    - Starcoder2
    - :code:`bigcode/starcoder2-3b`, :code:`bigcode/starcoder2-7b`, :code:`bigcode/starcoder2-15b`, etc.
    -
+  * - :code:`SolarForCausalLM`
+    - Solar Pro
+    - :code:`upstage/solar-pro-preview-instruct`, etc.
+    -
  * - :code:`XverseForCausalLM`
    - Xverse
    - :code:`xverse/XVERSE-7B-Chat`, :code:`xverse/XVERSE-13B-Chat`, :code:`xverse/XVERSE-65B-Chat`, etc.
@ -230,13 +242,23 @@ Multimodal Language Models
  * - :code:`LlavaNextVideoForConditionalGeneration`
    - LLaVA-NeXT-Video
    - Video
-    - :code:`llava-hf/LLaVA-NeXT-Video-7B-hf`, etc. (see note)
+    - :code:`llava-hf/LLaVA-NeXT-Video-7B-hf`, etc.
+    -
+  * - :code:`LlavaOnevisionForConditionalGeneration`
+    - LLaVA-Onevision
+    - Image\ :sup:`+` / Video
+    - :code:`llava-hf/llava-onevision-qwen2-7b-ov-hf`, :code:`llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc.
    -
  * - :code:`MiniCPMV`
    - MiniCPM-V
    - Image\ :sup:`+`
    - :code:`openbmb/MiniCPM-V-2` (see note), :code:`openbmb/MiniCPM-Llama3-V-2_5`, :code:`openbmb/MiniCPM-V-2_6`, etc.
    -
+  * - :code:`MllamaForConditionalGeneration`
+    - Llama 3.2
+    - Image
+    - :code:`meta-llama/Llama-3.2-90B-Vision-Instruct`, :code:`meta-llama/Llama-3.2-11B-Vision`, etc.
+    -
  * - :code:`PaliGemmaForConditionalGeneration`
    - PaliGemma
    - Image\ :sup:`E`
@ -276,7 +298,7 @@ Multimodal Language Models
  For more details, please see: https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630

 .. note::
-  For :code:`LLaVA-NeXT-Video` and :code:`Qwen2-VL`, the latest release of :code:`huggingface/transformers` doesn't work yet, so we need to use a developer version (:code:`21fac7abba2a37fae86106f87fcf9974fd1e3830`) for now.
+  For :code:`Qwen2-VL`, the latest release of :code:`huggingface/transformers` doesn't work yet, so we need to use a developer version (:code:`21fac7abba2a37fae86106f87fcf9974fd1e3830`) for now.
  This can be installed by running the following command: 

  .. code-block:: bash
--- a/docs/source/quantization/bnb.rst
+++ b/docs/source/quantization/bnb.rst
@ -11,7 +11,7 @@ Below are the steps to utilize BitsAndBytes with vLLM.

 .. code-block:: console

-    $ pip install bitsandbytes>=0.42.0
+    $ pip install "bitsandbytes>=0.44.0"

 vLLM reads the model's config file and supports both in-flight quantization and pre-quantized checkpoint.

--- a/docs/source/quantization/gguf.rst
+++ b/docs/source/quantization/gguf.rst
@ -0,0 +1,73 @@
+.. _gguf:
+
+GGUF
+==================
+
+.. warning::
+
+   Please note that GGUF support in vLLM is highly experimental and under-optimized at the moment, and it might be incompatible with other features. Currently, you can use GGUF as a way to reduce memory footprint. If you encounter any issues, please report them to the vLLM team.
+
+.. warning::
+
+   Currently, vLLM only supports loading single-file GGUF models. If you have a multi-file GGUF model, you can use the `gguf-split <https://github.com/ggerganov/llama.cpp/pull/6135>`_ tool to merge it into a single-file model.
+
+To run a GGUF model with vLLM, you can download and use the local GGUF model from `TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF <https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF>`_ with the following command:
+
+.. code-block:: console
+
+   $ wget https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf
+   $ # We recommend using the tokenizer from the base model to avoid the slow and error-prone tokenizer conversion.
+   $ vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0
+
+You can also add ``--tensor-parallel-size 2`` to enable tensor parallelism inference with 2 GPUs:
+
+.. code-block:: console
+
+   $ # We recommend using the tokenizer from the base model to avoid the slow and error-prone tokenizer conversion.
+   $ vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 --tensor-parallel-size 2
+
+.. warning::
+
+   We recommend using the tokenizer from the base model instead of the one in the GGUF model, because the tokenizer conversion from GGUF is time-consuming and unstable, especially for models with a large vocabulary size.
+
+You can also use the GGUF model directly through the LLM entrypoint:
+
+.. code-block:: python
+
+   from vllm import LLM, SamplingParams
+
+   # In this script, we demonstrate how to pass input to the chat method:
+   conversation = [
+      {
+         "role": "system",
+         "content": "You are a helpful assistant"
+      },
+      {
+         "role": "user",
+         "content": "Hello"
+      },
+      {
+         "role": "assistant",
+         "content": "Hello! How can I assist you today?"
+      },
+      {
+         "role": "user",
+         "content": "Write an essay about the importance of higher education.",
+      },
+   ]
+
+   # Create a sampling params object.
+   sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+
+   # Create an LLM.
+   llm = LLM(model="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
+            tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
+   # Generate texts from the prompts. The output is a list of RequestOutput objects
+   # that contain the prompt, generated text, and other information.
+   outputs = llm.chat(conversation, sampling_params)
+
+   # Print the outputs.
+   for output in outputs:
+      prompt = output.prompt
+      generated_text = output.outputs[0].text
+      print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
--- a/examples/lora_with_quantization_inference.py
+++ b/examples/lora_with_quantization_inference.py
@ -79,23 +79,17 @@ def initialize_engine(model: str, quantization: str,
        # It quantizes the model when loading, with some config info from the
        # LoRA adapter repo. So need to set the parameter of load_format and
        # qlora_adapter_name_or_path as below.
-        engine_args = EngineArgs(
-            model=model,
-            quantization=quantization,
-            qlora_adapter_name_or_path=lora_repo,
-            load_format="bitsandbytes",
-            enable_lora=True,
-            max_lora_rank=64,
-            # set it only in GPUs of limited memory
-            enforce_eager=True)
+        engine_args = EngineArgs(model=model,
+                                 quantization=quantization,
+                                 qlora_adapter_name_or_path=lora_repo,
+                                 load_format="bitsandbytes",
+                                 enable_lora=True,
+                                 max_lora_rank=64)
    else:
-        engine_args = EngineArgs(
-            model=model,
-            quantization=quantization,
-            enable_lora=True,
-            max_loras=4,
-            # set it only in GPUs of limited memory
-            enforce_eager=True)
+        engine_args = EngineArgs(model=model,
+                                 quantization=quantization,
+                                 enable_lora=True,
+                                 max_loras=4)
    return LLMEngine.from_engine_args(engine_args)


--- a/examples/offline_chat_with_tools.py
+++ b/examples/offline_chat_with_tools.py
@ -0,0 +1,138 @@
+# ruff: noqa
+import json
+import random
+import string
+
+from vllm import LLM
+from vllm.sampling_params import SamplingParams
+
+# This script is an offline demo for function calling
+#
+# If you want to run a server/client setup, please follow this code:
+#
+# - Server:
+#
+# ```bash
+# vllm serve mistralai/Mistral-7B-Instruct-v0.3 --tokenizer-mode mistral --load-format mistral --config-format mistral
+# ```
+#
+# - Client:
+#
+# ```bash
+# curl --location 'http://<your-node-url>:8000/v1/chat/completions' \
+# --header 'Content-Type: application/json' \
+# --header 'Authorization: Bearer token' \
+# --data '{
+#     "model": "mistralai/Mistral-7B-Instruct-v0.3",
+#     "messages": [
+#       {
+#         "role": "user",
+#         "content": [
+#             {"type" : "text", "text": "Describe this image in detail please."},
+#             {"type": "image_url", "image_url": {"url": "https://s3.amazonaws.com/cms.ipressroom.com/338/files/201808/5b894ee1a138352221103195_A680%7Ejogging-edit/A680%7Ejogging-edit_hero.jpg"}},
+#             {"type" : "text", "text": "and this one as well. Answer in French."},
+#             {"type": "image_url", "image_url": {"url": "https://www.wolframcloud.com/obj/resourcesystem/images/a0e/a0ee3983-46c6-4c92-b85d-059044639928/6af8cfb971db031b.png"}}
+#         ]
+#       }
+#     ]
+#   }'
+# ```
+#
+# Usage:
+#     python offline_chat_with_tools.py
+
+model_name = "mistralai/Mistral-7B-Instruct-v0.3"
+# or switch to "mistralai/Mistral-Nemo-Instruct-2407"
+# or "mistralai/Mistral-Large-Instruct-2407"
+# or any other mistral model with function calling ability
+
+sampling_params = SamplingParams(max_tokens=8192, temperature=0.0)
+llm = LLM(model=model_name,
+          tokenizer_mode="mistral",
+          config_format="mistral",
+          load_format="mistral")
+
+
+def generate_random_id(length=9):
+    """Return a random alphanumeric string of ``length`` characters.
+
+    Characters are drawn from ASCII letters and digits with the ``random``
+    module (not a cryptographic source).
+    """
+    characters = string.ascii_letters + string.digits
+    random_id = ''.join(random.choice(characters) for _ in range(length))
+    return random_id
+
+
+# simulate an API that can be called
+def get_current_weather(city: str, state: str, unit: 'str'):
+    """Return a canned weather report for ``city``, ``state`` in ``unit``.
+
+    No lookup is performed: the temperature and conditions are hard-coded,
+    and the arguments are only interpolated into the returned sentence.
+    """
+    return (f"The weather in {city}, {state} is 85 degrees {unit}. It is "
+            "partly cloudly, with highs in the 90's.")
+
+
+tool_funtions = {"get_current_weather": get_current_weather}
+
+tools = [{
+    "type": "function",
+    "function": {
+        "name": "get_current_weather",
+        "description": "Get the current weather in a given location",
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "city": {
+                    "type":
+                    "string",
+                    "description":
+                    "The city to find the weather for, e.g. 'San Francisco'"
+                },
+                "state": {
+                    "type":
+                    "string",
+                    "description":
+                    "the two-letter abbreviation for the state that the city is"
+                    " in, e.g. 'CA' which would mean 'California'"
+                },
+                "unit": {
+                    "type": "string",
+                    "description": "The unit to fetch the temperature in",
+                    "enum": ["celsius", "fahrenheit"]
+                }
+            },
+            "required": ["city", "state", "unit"]
+        }
+    }
+}]
+
+messages = [{
+    "role":
+    "user",
+    "content":
+    "Can you tell me what the temperate will be in Dallas, in fahrenheit?"
+}]
+
+outputs = llm.chat(messages, sampling_params=sampling_params, tools=tools)
+output = outputs[0].outputs[0].text.strip()
+
+# append the assistant message
+messages.append({
+    "role": "assistant",
+    "content": output,
+})
+
+# let's now actually parse and execute the model's output simulating an API call by using the
+# above defined function
+tool_calls = json.loads(output)
+tool_answers = [
+    tool_funtions[call['name']](**call['arguments']) for call in tool_calls
+]
+
+# append the answer as a tool message and let the LLM give you an answer
+messages.append({
+    "role": "tool",
+    "content": "\n\n".join(tool_answers),
+    "tool_call_id": generate_random_id(),
+})
+
+outputs = llm.chat(messages, sampling_params, tools=tools)
+
+print(outputs[0].outputs[0].text.strip())
+# yields
+#   'The weather in Dallas, TX is 85 degrees fahrenheit. '
+#   'It is partly cloudly, with highs in the 90's.'
--- a/examples/offline_inference_chat.py
+++ b/examples/offline_inference_chat.py
@ -39,6 +39,33 @@ outputs = llm.chat(conversation,
                   use_tqdm=False)
 print_outputs(outputs)

+# You can run batch inference with llm.chat API
+conversation = [
+    {
+        "role": "system",
+        "content": "You are a helpful assistant"
+    },
+    {
+        "role": "user",
+        "content": "Hello"
+    },
+    {
+        "role": "assistant",
+        "content": "Hello! How can I assist you today?"
+    },
+    {
+        "role": "user",
+        "content": "Write an essay about the importance of higher education.",
+    },
+]
+conversations = [conversation for _ in range(10)]
+
+# We turn on tqdm progress bar to verify it's indeed running batch inference
+outputs = llm.chat(messages=conversations,
+                   sampling_params=sampling_params,
+                   use_tqdm=True)
+print_outputs(outputs)
+
 # A chat template can be optionally supplied.
 # If not, the model will use its default chat template.

--- a/examples/offline_inference_vision_language.py
+++ b/examples/offline_inference_vision_language.py
@ -14,7 +14,8 @@ from vllm.utils import FlexibleArgumentParser


 # LLaVA-1.5
-def run_llava(question):
+def run_llava(question, modality):
+    assert modality == "image"

    prompt = f"USER: <image>\n{question}\nASSISTANT:"

@ -24,7 +25,8 @@ def run_llava(question):


 # LLaVA-1.6/LLaVA-NeXT
-def run_llava_next(question):
+def run_llava_next(question, modality):
+    assert modality == "image"

    prompt = f"[INST] <image>\n{question} [/INST]"
    llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf", max_model_len=8192)
@ -34,15 +36,35 @@ def run_llava_next(question):

 # LlaVA-NeXT-Video
 # Currently only support for video input
-def run_llava_next_video(question):
+def run_llava_next_video(question, modality):
+    assert modality == "video"
+
    prompt = f"USER: <video>\n{question} ASSISTANT:"
    llm = LLM(model="llava-hf/LLaVA-NeXT-Video-7B-hf", max_model_len=8192)
    stop_token_ids = None
    return llm, prompt, stop_token_ids


+# LLaVA-OneVision
+def run_llava_onevision(question, modality):
+
+    if modality == "video":
+        prompt = f"<|im_start|>user <video>\n{question}<|im_end|> \
+        <|im_start|>assistant\n"
+
+    elif modality == "image":
+        prompt = f"<|im_start|>user <image>\n{question}<|im_end|> \
+        <|im_start|>assistant\n"
+
+    llm = LLM(model="llava-hf/llava-onevision-qwen2-7b-ov-hf",
+              max_model_len=32768)
+    stop_token_ids = None
+    return llm, prompt, stop_token_ids
+
+
 # Fuyu
-def run_fuyu(question):
+def run_fuyu(question, modality):
+    assert modality == "image"

    prompt = f"{question}\n"
    llm = LLM(model="adept/fuyu-8b")
@ -51,7 +73,8 @@ def run_fuyu(question):


 # Phi-3-Vision
-def run_phi3v(question):
+def run_phi3v(question, modality):
+    assert modality == "image"

    prompt = f"<|user|>\n<|image_1|>\n{question}<|end|>\n<|assistant|>\n"  # noqa: E501
    # Note: The default setting of max_num_seqs (256) and
@ -60,17 +83,32 @@ def run_phi3v(question):

    # In this example, we override max_num_seqs to 5 while
    # keeping the original context length of 128k.
+
+    # num_crops is an override kwarg to the multimodal image processor;
+    # For some models, e.g., Phi-3.5-vision-instruct, it is recommended
+    # to use 16 for single frame scenarios, and 4 for multi-frame.
+    #
+    # Generally speaking, a larger value for num_crops results in more
+    # tokens per image instance, because it may scale the image more in
+    # the image preprocessing. Some references in the model docs and the
+    # formula for image tokens after the preprocessing
+    # transform can be found below.
+    #
+    # https://huggingface.co/microsoft/Phi-3.5-vision-instruct#loading-the-model-locally
+    # https://huggingface.co/microsoft/Phi-3.5-vision-instruct/blob/main/processing_phi3_v.py#L194
    llm = LLM(
        model="microsoft/Phi-3-vision-128k-instruct",
        trust_remote_code=True,
        max_num_seqs=5,
+        mm_processor_kwargs={"num_crops": 16},
    )
    stop_token_ids = None
    return llm, prompt, stop_token_ids


 # PaliGemma
-def run_paligemma(question):
+def run_paligemma(question, modality):
+    assert modality == "image"

    # PaliGemma has special prompt format for VQA
    prompt = "caption en"
@ -80,7 +118,8 @@ def run_paligemma(question):


 # Chameleon
-def run_chameleon(question):
+def run_chameleon(question, modality):
+    assert modality == "image"

    prompt = f"{question}<image>"
    llm = LLM(model="facebook/chameleon-7b")
@ -89,7 +128,8 @@ def run_chameleon(question):


 # MiniCPM-V
-def run_minicpmv(question):
+def run_minicpmv(question, modality):
+    assert modality == "image"

    # 2.0
    # The official repo doesn't work yet, so we need to use a fork for now
@ -129,7 +169,9 @@ def run_minicpmv(question):


 # InternVL
-def run_internvl(question):
+def run_internvl(question, modality):
+    assert modality == "image"
+
    model_name = "OpenGVLab/InternVL2-2B"

    llm = LLM(
@ -155,7 +197,8 @@ def run_internvl(question):


 # BLIP-2
-def run_blip2(question):
+def run_blip2(question, modality):
+    assert modality == "image"

    # BLIP-2 prompt format is inaccurate on HuggingFace model repository.
    # See https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262 #noqa
@ -166,7 +209,8 @@ def run_blip2(question):


 # Qwen
-def run_qwen_vl(question):
+def run_qwen_vl(question, modality):
+    assert modality == "image"

    llm = LLM(
        model="Qwen/Qwen-VL",
@ -180,7 +224,9 @@ def run_qwen_vl(question):


 # Qwen2-VL
-def run_qwen2_vl(question):
+def run_qwen2_vl(question, modality):
+    assert modality == "image"
+
    model_name = "Qwen/Qwen2-VL-7B-Instruct"

    llm = LLM(
@ -196,10 +242,34 @@ def run_qwen2_vl(question):
    return llm, prompt, stop_token_ids


+# LLama
+def run_mllama(question, modality):
+    """Build the Llama 3.2 Vision engine and prompt for one image question.
+
+    Only the ``"image"`` modality is accepted (enforced by an assert).
+    Returns a ``(llm, prompt, stop_token_ids)`` tuple; ``stop_token_ids``
+    is always ``None`` here.
+    """
+    assert modality == "image"
+
+    model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"
+
+    # Note: The default setting of max_num_seqs (256) and
+    # max_model_len (131072) for this model may cause OOM.
+    # You may lower either to run this example on lower-end GPUs.
+
+    # The configuration below has been confirmed to launch on a
+    # single H100 GPU.
+    llm = LLM(
+        model=model_name,
+        max_num_seqs=16,
+        enforce_eager=True,
+    )
+
+    prompt = f"<|image|><|begin_of_text|>{question}"
+    stop_token_ids = None
+    return llm, prompt, stop_token_ids
+
+
 model_example_map = {
    "llava": run_llava,
    "llava-next": run_llava_next,
    "llava-next-video": run_llava_next_video,
+    "llava-onevision": run_llava_onevision,
    "fuyu": run_fuyu,
    "phi3_v": run_phi3v,
    "paligemma": run_paligemma,
@ -209,6 +279,7 @@ model_example_map = {
    "internvl_chat": run_internvl,
    "qwen_vl": run_qwen_vl,
    "qwen2_vl": run_qwen2_vl,
+    "mllama": run_mllama,
 }


@ -255,7 +326,7 @@ def main(args):
    data = mm_input["data"]
    question = mm_input["question"]

-    llm, prompt, stop_token_ids = model_example_map[model](question)
+    llm, prompt, stop_token_ids = model_example_map[model](question, modality)

    # We set temperature to 0.2 so that outputs can be different
    # even when all prompts are identical when running batch inference.
@ -306,6 +377,7 @@ if __name__ == "__main__":
    parser.add_argument('--modality',
                        type=str,
                        default="image",
+                        choices=['image', 'video'],
                        help='Modality of the input.')
    parser.add_argument('--num-frames',
                        type=int,
--- a/examples/offline_inference_vision_language_multi_image.py
+++ b/examples/offline_inference_vision_language_multi_image.py
@ -4,8 +4,9 @@ multi-image input on vision language models, using the chat template defined
 by the model.
 """
 from argparse import Namespace
-from typing import List
+from typing import List, NamedTuple, Optional

+from PIL.Image import Image
 from transformers import AutoProcessor, AutoTokenizer

 from vllm import LLM, SamplingParams
@ -19,7 +20,15 @@ IMAGE_URLS = [
 ]


-def load_qwenvl_chat(question: str, image_urls: List[str]):
+class ModelRequestData(NamedTuple):
+    """Everything a model loader returns for one multi-image request."""
+    # Engine instance constructed for the selected model.
+    llm: LLM
+    # Fully formatted prompt text, including image placeholders.
+    prompt: str
+    # Token IDs that stop generation, or None when the loader sets none.
+    # Loaders fill this from tokenizer.convert_tokens_to_ids(...).
+    stop_token_ids: Optional[List[int]]
+    # Images fetched from the request's URLs.
+    image_data: List[Image]
+    # Chat template override, or None when the loader supplies none.
+    chat_template: Optional[str]
+
+
+def load_qwenvl_chat(question: str, image_urls: List[str]) -> ModelRequestData:
    model_name = "Qwen/Qwen-VL-Chat"
    llm = LLM(
        model=model_name,
@ -48,24 +57,50 @@ def load_qwenvl_chat(question: str, image_urls: List[str]):

    stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>"]
    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
-    return llm, prompt, stop_token_ids, None, chat_template
+    return ModelRequestData(
+        llm=llm,
+        prompt=prompt,
+        stop_token_ids=stop_token_ids,
+        image_data=[fetch_image(url) for url in image_urls],
+        chat_template=chat_template,
+    )


-def load_phi3v(question: str, image_urls: List[str]):
+def load_phi3v(question: str, image_urls: List[str]) -> ModelRequestData:
+    # num_crops is an override kwarg to the multimodal image processor;
+    # For some models, e.g., Phi-3.5-vision-instruct, it is recommended
+    # to use 16 for single frame scenarios, and 4 for multi-frame.
+    #
+    # Generally speaking, a larger value for num_crops results in more
+    # tokens per image instance, because it may scale the image more in
+    # the image preprocessing. Some references in the model docs and the
+    # formula for image tokens after the preprocessing
+    # transform can be found below.
+    #
+    # https://huggingface.co/microsoft/Phi-3.5-vision-instruct#loading-the-model-locally
+    # https://huggingface.co/microsoft/Phi-3.5-vision-instruct/blob/main/processing_phi3_v.py#L194
    llm = LLM(
        model="microsoft/Phi-3.5-vision-instruct",
        trust_remote_code=True,
        max_model_len=4096,
        limit_mm_per_prompt={"image": len(image_urls)},
+        mm_processor_kwargs={"num_crops": 4},
    )
    placeholders = "\n".join(f"<|image_{i}|>"
                             for i, _ in enumerate(image_urls, start=1))
    prompt = f"<|user|>\n{placeholders}\n{question}<|end|>\n<|assistant|>\n"
    stop_token_ids = None
-    return llm, prompt, stop_token_ids, None, None
+
+    return ModelRequestData(
+        llm=llm,
+        prompt=prompt,
+        stop_token_ids=stop_token_ids,
+        image_data=[fetch_image(url) for url in image_urls],
+        chat_template=None,
+    )


-def load_internvl(question: str, image_urls: List[str]):
+def load_internvl(question: str, image_urls: List[str]) -> ModelRequestData:
    model_name = "OpenGVLab/InternVL2-2B"

    llm = LLM(
@ -93,10 +128,16 @@ def load_internvl(question: str, image_urls: List[str]):
    stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"]
    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]

-    return llm, prompt, stop_token_ids, None, None
+    return ModelRequestData(
+        llm=llm,
+        prompt=prompt,
+        stop_token_ids=stop_token_ids,
+        image_data=[fetch_image(url) for url in image_urls],
+        chat_template=None,
+    )


-def load_qwen2_vl(question, image_urls: List[str]):
+def load_qwen2_vl(question, image_urls: List[str]) -> ModelRequestData:
    try:
        from qwen_vl_utils import process_vision_info
    except ModuleNotFoundError:
@ -143,7 +184,13 @@ def load_qwen2_vl(question, image_urls: List[str]):
    else:
        image_data, _ = process_vision_info(messages)

-    return llm, prompt, stop_token_ids, image_data, None
+    return ModelRequestData(
+        llm=llm,
+        prompt=prompt,
+        stop_token_ids=stop_token_ids,
+        image_data=image_data,
+        chat_template=None,
+    )


 model_example_map = {
@ -155,20 +202,17 @@ model_example_map = {


 def run_generate(model, question: str, image_urls: List[str]):
-    llm, prompt, stop_token_ids, image_data, _ = model_example_map[model](
-        question, image_urls)
-    if image_data is None:
-        image_data = [fetch_image(url) for url in image_urls]
+    req_data = model_example_map[model](question, image_urls)

    sampling_params = SamplingParams(temperature=0.0,
                                     max_tokens=128,
-                                     stop_token_ids=stop_token_ids)
+                                     stop_token_ids=req_data.stop_token_ids)

-    outputs = llm.generate(
+    outputs = req_data.llm.generate(
        {
-            "prompt": prompt,
+            "prompt": req_data.prompt,
            "multi_modal_data": {
-                "image": image_data
+                "image": req_data.image_data
            },
        },
        sampling_params=sampling_params)
@ -179,13 +223,12 @@ def run_generate(model, question: str, image_urls: List[str]):


 def run_chat(model: str, question: str, image_urls: List[str]):
-    llm, _, stop_token_ids, _, chat_template = model_example_map[model](
-        question, image_urls)
+    req_data = model_example_map[model](question, image_urls)

    sampling_params = SamplingParams(temperature=0.0,
                                     max_tokens=128,
-                                     stop_token_ids=stop_token_ids)
-    outputs = llm.chat(
+                                     stop_token_ids=req_data.stop_token_ids)
+    outputs = req_data.llm.chat(
        [{
            "role":
            "user",
@ -203,7 +246,7 @@ def run_chat(model: str, question: str, image_urls: List[str]):
            ],
        }],
        sampling_params=sampling_params,
-        chat_template=chat_template,
+        chat_template=req_data.chat_template,
    )

    for o in outputs:
--- a/examples/openai_vision_api_client.py
+++ b/examples/openai_vision_api_client.py
@ -38,7 +38,7 @@ chat_completion_from_url = client.chat.completions.create(
        "content": [
            {
                "type": "text",
-                "text": "What’s in this image?"
+                "text": "What's in this image?"
            },
            {
                "type": "image_url",
@ -75,7 +75,7 @@ chat_completion_from_base64 = client.chat.completions.create(
        "content": [
            {
                "type": "text",
-                "text": "What’s in this image?"
+                "text": "What's in this image?"
            },
            {
                "type": "image_url",
--- a/format.sh
+++ b/format.sh
@ -159,7 +159,7 @@ echo 'vLLM codespell: Done'

 # Lint specified files
 lint() {
-    ruff "$@"
+    ruff check "$@"
 }

 # Lint files that differ from main branch. Ignores dirs that are not slated
@ -175,7 +175,7 @@ lint_changed() {

    if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &>/dev/null; then
        git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs \
-             ruff
+             ruff check
    fi

 }
--- a/pyproject.toml
+++ b/pyproject.toml
@ -4,7 +4,8 @@ requires = [
    "cmake>=3.26",
    "ninja",
    "packaging",
-    "setuptools >= 49.4.0",
+    "setuptools>=61",
+    "setuptools-scm>=8.0",
    "torch == 2.4.0",
    "wheel",
    "jinja2",
@ -19,6 +20,10 @@ exclude = [
    "examples/fp8/quantizer/quantize.py"
 ]

+[tool.ruff.lint.per-file-ignores]
+"vllm/version.py" = ["F401"]
+"vllm/_version.py" = ["ALL"]
+
 [tool.ruff.lint]
 select = [
    # pycodestyle
@ -42,6 +47,8 @@ ignore = [
    "E731",
    # Loop control variable not used within loop body
    "B007",
+    # f-string format
+    "UP032",
 ]

 [tool.mypy]
--- a/requirements-build.txt
+++ b/requirements-build.txt
@ -2,7 +2,8 @@
 cmake>=3.26
 ninja
 packaging
-setuptools>=49.4.0
+setuptools>=61
+setuptools-scm>=8
 torch==2.4.0
 wheel
 jinja2
--- a/requirements-common.txt
+++ b/requirements-common.txt
@ -4,7 +4,7 @@ numpy < 2.0.0
 requests
 tqdm
 py-cpuinfo
-transformers >= 4.43.2  # Required for Chameleon and Llama 3.1 hotfox.
+transformers >= 4.45.0  # Required for Llama 3.2.
 tokenizers >= 0.19.1  # Required for Llama 3.
 protobuf # Required by LlamaTokenizer.
 fastapi < 0.113.0; python_version < '3.9'
@ -18,15 +18,16 @@ prometheus_client >= 0.18.0
 prometheus-fastapi-instrumentator >= 7.0.0
 tiktoken >= 0.6.0  # Required for DBRX tokenizer
 lm-format-enforcer == 0.10.6
-outlines >= 0.0.43, < 0.1 # Requires torch >= 2.1.0
+outlines >= 0.0.43, < 0.1
 typing_extensions >= 4.10
 filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4
 partial-json-parser # used for parsing partial JSON outputs
 pyzmq
 msgspec
-gguf == 0.9.1
+gguf == 0.10.0
 importlib_metadata
-mistral_common >= 1.4.0
+mistral_common >= 1.4.3
 pyyaml
 six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
+setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
 einops # Required for Qwen2-VL.
--- a/requirements-cuda.txt
+++ b/requirements-cuda.txt
@ -8,4 +8,3 @@ torch == 2.4.0
 # These must be updated alongside torch
 torchvision == 0.19   # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
 xformers == 0.0.27.post2; platform_system == 'Linux' and platform_machine == 'x86_64'  # Requires PyTorch 2.4.0
-vllm-flash-attn == 2.6.1; platform_system == 'Linux' and platform_machine == 'x86_64'  # Requires PyTorch 2.4.0
--- a/requirements-lint.txt
+++ b/requirements-lint.txt
@ -2,7 +2,7 @@
 yapf==0.32.0
 toml==0.10.2
 tomli==2.0.1
-ruff==0.1.5
+ruff==0.6.5
 codespell==2.3.0
 isort==5.13.2
 clang-format==18.1.5
--- a/requirements-neuron.txt
+++ b/requirements-neuron.txt
@ -2,6 +2,6 @@
 -r requirements-common.txt

 # Dependencies for Neuron devices
-transformers-neuronx >= 0.9.0
-torch-neuronx >= 2.1.0
+transformers-neuronx >= 0.12.0
+torch-neuronx >= 2.1.2
 neuronx-cc
--- a/requirements-test.txt
+++ b/requirements-test.txt
@ -14,13 +14,14 @@ librosa # required for audio test
 opencv-python # required for video test
 peft
 requests
-ray[adag]>=2.35
+ray[adag]==2.35
 sentence-transformers # required for embedding
 soundfile # required for audio test
 compressed-tensors==0.4.0 # required for compressed-tensors
 timm # required for internvl test
 transformers_stream_generator # required for qwen-vl test
 matplotlib # required for qwen-vl test
+datamodel_code_generator # required for minicpm3 test

 # TODO: Add this after fully implementing llava(mantis)
 # git+https://github.com/TIGER-AI-Lab/Mantis.git # required for llava(mantis) test
@ -29,5 +30,5 @@ matplotlib # required for qwen-vl test
 aiohttp

 # quantization
-bitsandbytes==0.42.0
+bitsandbytes>=0.44.0
 buildkite-test-collector==0.1.8
--- a/requirements-xpu.txt
+++ b/requirements-xpu.txt
@ -3,9 +3,10 @@

 setuptools < 70.0.0 # IPEX's torch have some dependency. to be removed.

-torch @ https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/xpu/torch-2.1.0.post1%2Bcxx11.abi-cp310-cp310-linux_x86_64.whl
-intel_extension_for_pytorch @ https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/xpu/intel_extension_for_pytorch-2.1.30a0-cp310-cp310-linux_x86_64.whl
-oneccl_bind_pt @ https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_stable/xpu/oneccl_bind_pt-2.1.200%2Bxpu-cp310-cp310-linux_x86_64.whl
-
-triton @ https://github.com/intel/intel-xpu-backend-for-triton/releases/download/v2.1.0/triton-2.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
+ray >= 2.9
+# Following pkgs retrieved from https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
+torch == 2.3.1+cxx11.abi
+intel-extension-for-pytorch == 2.3.110+xpu
+oneccl_bind_pt == 2.3.100+xpu

+triton-xpu == 3.0.0b2
--- a/setup.py
+++ b/setup.py
@ -5,7 +5,7 @@ import os
 import re
 import subprocess
 import sys
-import warnings
+from pathlib import Path
 from shutil import which
 from typing import Dict, List

@ -13,6 +13,7 @@ import torch
 from packaging.version import Version, parse
 from setuptools import Extension, find_packages, setup
 from setuptools.command.build_ext import build_ext
+from setuptools_scm import get_version
 from torch.utils.cpp_extension import CUDA_HOME


@ -27,34 +28,6 @@ def load_module_from_path(module_name, path):
 ROOT_DIR = os.path.dirname(__file__)
 logger = logging.getLogger(__name__)

-
-def embed_commit_hash():
-    try:
-        if "BUILDKITE_COMMIT" in os.environ:
-            # ci build
-            commit_id = os.environ["BUILDKITE_COMMIT"]
-        else:
-            commit_id = subprocess.check_output(["git", "rev-parse", "HEAD"],
-                                                encoding="utf-8").strip()
-
-        commit_contents = f'__commit__ = "{commit_id}"\n'
-
-        version_file = os.path.join(ROOT_DIR, "vllm", "commit_id.py")
-        with open(version_file, "w", encoding="utf-8") as f:
-            f.write(commit_contents)
-
-    except subprocess.CalledProcessError as e:
-        warnings.warn(f"Failed to get commit hash:\n{e}",
-                      RuntimeWarning,
-                      stacklevel=2)
-    except Exception as e:
-        warnings.warn(f"Failed to embed commit hash:\n{e}",
-                      RuntimeWarning,
-                      stacklevel=2)
-
-
-embed_commit_hash()
-
 # cannot import envs directly because it depends on vllm,
 #  which is not installed yet
 envs = load_module_from_path('envs', os.path.join(ROOT_DIR, 'vllm', 'envs.py'))
@ -152,15 +125,8 @@ class cmake_build_ext(build_ext):
        default_cfg = "Debug" if self.debug else "RelWithDebInfo"
        cfg = envs.CMAKE_BUILD_TYPE or default_cfg

-        # where .so files will be written, should be the same for all extensions
-        # that use the same CMakeLists.txt.
-        outdir = os.path.abspath(
-            os.path.dirname(self.get_ext_fullpath(ext.name)))
-
        cmake_args = [
            '-DCMAKE_BUILD_TYPE={}'.format(cfg),
-            '-DCMAKE_LIBRARY_OUTPUT_DIRECTORY={}'.format(outdir),
-            '-DCMAKE_ARCHIVE_OUTPUT_DIRECTORY={}'.format(self.build_temp),
            '-DVLLM_TARGET_DEVICE={}'.format(VLLM_TARGET_DEVICE),
        ]

@ -224,10 +190,12 @@ class cmake_build_ext(build_ext):
            os.makedirs(self.build_temp)

        targets = []
+        target_name = lambda s: remove_prefix(remove_prefix(s, "vllm."),
+                                              "vllm_flash_attn.")
        # Build all the extensions
        for ext in self.extensions:
            self.configure(ext)
-            targets.append(remove_prefix(ext.name, "vllm."))
+            targets.append(target_name(ext.name))

        num_jobs, _ = self.compute_num_jobs()

@ -240,6 +208,43 @@ class cmake_build_ext(build_ext):

        subprocess.check_call(["cmake", *build_args], cwd=self.build_temp)

+        # Install the libraries
+        for ext in self.extensions:
+            # Install the extension into the proper location
+            outdir = Path(self.get_ext_fullpath(ext.name)).parent.absolute()
+
+            # Skip if the install directory is the same as the build directory
+            if outdir == self.build_temp:
+                continue
+
+            # CMake appends the extension prefix to the install path,
+            # and outdir already contains that prefix, so we need to remove it.
+            prefix = outdir
+            for i in range(ext.name.count('.')):
+                prefix = prefix.parent
+
+            # prefix here should actually be the same for all components
+            install_args = [
+                "cmake", "--install", ".", "--prefix", prefix, "--component",
+                target_name(ext.name)
+            ]
+            subprocess.check_call(install_args, cwd=self.build_temp)
+
+    def run(self):
+        """Run the stock build_ext, then copy the flash-attn .py files."""
+        super().run()
+
+        # Copy vllm/vllm_flash_attn/*.py out of self.build_lib into the
+        # current directory's vllm/vllm_flash_attn so that the files can
+        # be included in the editable build.
+        import glob
+        pattern = os.path.join(self.build_lib, "vllm", "vllm_flash_attn",
+                               "*.py")
+        for src in glob.glob(pattern):
+            dst = os.path.join("vllm/vllm_flash_attn", os.path.basename(src))
+            print(f"Copying {src} to {dst}")
+            self.copy_file(src, dst)
+

 def _no_device() -> bool:
    return VLLM_TARGET_DEVICE == "empty"
@ -348,50 +353,43 @@ def get_path(*filepath) -> str:
    return os.path.join(ROOT_DIR, *filepath)


-def find_version(filepath: str) -> str:
-    """Extract version information from the given filepath.
-
-    Adapted from https://github.com/ray-project/ray/blob/0b190ee1160eeca9796bc091e07eaebf4c85b511/python/setup.py
-    """
-    with open(filepath) as fp:
-        version_match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]",
-                                  fp.read(), re.M)
-        if version_match:
-            return version_match.group(1)
-        raise RuntimeError("Unable to find version string.")
-
-
 def get_vllm_version() -> str:
-    version = find_version(get_path("vllm", "version.py"))
+    version = get_version(
+        write_to="vllm/_version.py",  # TODO: move this to pyproject.toml
+    )
+
+    sep = "+" if "+" not in version else "."  # dev versions might contain +

    if _no_device():
        if envs.VLLM_TARGET_DEVICE == "empty":
-            version += "+empty"
+            version += f"{sep}empty"
    elif _is_cuda():
        cuda_version = str(get_nvcc_cuda_version())
        if cuda_version != MAIN_CUDA_VERSION:
            cuda_version_str = cuda_version.replace(".", "")[:3]
-            version += f"+cu{cuda_version_str}"
+            # skip this for source tarball, required for pypi
+            if "sdist" not in sys.argv:
+                version += f"{sep}cu{cuda_version_str}"
    elif _is_hip():
        # Get the HIP version
        hipcc_version = get_hipcc_rocm_version()
        if hipcc_version != MAIN_CUDA_VERSION:
            rocm_version_str = hipcc_version.replace(".", "")[:3]
-            version += f"+rocm{rocm_version_str}"
+            version += f"{sep}rocm{rocm_version_str}"
    elif _is_neuron():
        # Get the Neuron version
        neuron_version = str(get_neuronxcc_version())
        if neuron_version != MAIN_CUDA_VERSION:
            neuron_version_str = neuron_version.replace(".", "")[:3]
-            version += f"+neuron{neuron_version_str}"
+            version += f"{sep}neuron{neuron_version_str}"
    elif _is_openvino():
-        version += "+openvino"
+        version += f"{sep}openvino"
    elif _is_tpu():
-        version += "+tpu"
+        version += f"{sep}tpu"
    elif _is_cpu():
-        version += "+cpu"
+        version += f"{sep}cpu"
    elif _is_xpu():
-        version += "+xpu"
+        version += f"{sep}xpu"
    else:
        raise RuntimeError("Unknown runtime environment")

@ -462,6 +460,13 @@ if _build_core_ext():
 if _is_cuda() or _is_hip():
    ext_modules.append(CMakeExtension(name="vllm._moe_C"))

+if _is_hip():
+    ext_modules.append(CMakeExtension(name="vllm._rocm_C"))
+
+if _is_cuda():
+    ext_modules.append(
+        CMakeExtension(name="vllm.vllm_flash_attn.vllm_flash_attn_c"))
+
 if _build_custom_ops():
    ext_modules.append(CMakeExtension(name="vllm._C"))

--- a/tests/async_engine/test_async_llm_engine.py
+++ b/tests/async_engine/test_async_llm_engine.py
@ -26,6 +26,11 @@ class RequestOutput:
    finished: bool = False


+@dataclass
+class MockModelConfig:
+    use_async_output_proc: bool = True  # annotated so @dataclass sees a field
+
+
 class MockEngine:

    def __init__(self):
@ -35,6 +40,7 @@ class MockEngine:
        self.request_id = None
        # Ugly, remove dependency when possible
        self.parallel_config = ParallelConfig(1, 1, False)
+        self.model_config = MockModelConfig()

    async def step_async(self, virtual_engine):
        # PP size is 1, ignore virtual engine
@ -80,7 +86,7 @@ class MockAsyncLLMEngine(AsyncLLMEngine):

 @pytest.mark.asyncio
 async def test_new_requests_event():
-    engine = MockAsyncLLMEngine(worker_use_ray=False)
+    engine = MockAsyncLLMEngine()
    engine.start_background_loop()
    await asyncio.sleep(0.01)
    assert engine.engine.step_calls == 0
@ -113,7 +119,7 @@ async def test_new_requests_event():
    assert engine.engine.add_request_calls == 3
    assert engine.engine.step_calls == old_step_calls + 1

-    engine = MockAsyncLLMEngine(worker_use_ray=True)
+    engine = MockAsyncLLMEngine()
    assert engine.get_model_config() is not None
    assert engine.get_tokenizer() is not None
    assert engine.get_decoding_config() is not None
--- a/tests/async_engine/test_openapi_server.py
+++ b/tests/async_engine/test_openapi_server.py
@ -1,106 +0,0 @@
-import openai  # use the official client for correctness check
-import pytest
-import pytest_asyncio
-
-from ..utils import VLLM_PATH, RemoteOpenAIServer
-
-# any model with a chat template should work here
-MODEL_NAME = "facebook/opt-125m"
-chatml_jinja_path = VLLM_PATH / "examples/template_chatml.jinja"
-assert chatml_jinja_path.exists()
-
-
-@pytest.fixture(scope="module")
-def server():
-    args = [
-        # use half precision for speed and memory savings in CI environment
-        "--dtype",
-        "float16",
-        "--max-model-len",
-        "2048",
-        "--enforce-eager",
-        "--chat-template",
-        str(chatml_jinja_path),
-    ]
-
-    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
-        yield remote_server
-
-
-@pytest_asyncio.fixture
-async def client(server):
-    async with server.get_async_client() as async_client:
-        yield async_client
-
-
-@pytest.mark.asyncio
-async def test_check_models(client: openai.AsyncOpenAI):
-    models = await client.models.list()
-    models = models.data
-    served_model = models[0]
-    assert served_model.id == MODEL_NAME
-    assert all(model.root == MODEL_NAME for model in models)
-
-
-@pytest.mark.asyncio
-async def test_single_completion(client: openai.AsyncOpenAI):
-    completion = await client.completions.create(model=MODEL_NAME,
-                                                 prompt="Hello, my name is",
-                                                 max_tokens=5,
-                                                 temperature=0.0)
-
-    assert completion.id is not None
-    assert len(completion.choices) == 1
-    assert len(completion.choices[0].text) >= 5
-    assert completion.choices[0].finish_reason == "length"
-    assert completion.usage == openai.types.CompletionUsage(
-        completion_tokens=5, prompt_tokens=6, total_tokens=11)
-
-    # test using token IDs
-    completion = await client.completions.create(
-        model=MODEL_NAME,
-        prompt=[0, 0, 0, 0, 0],
-        max_tokens=5,
-        temperature=0.0,
-    )
-    assert len(completion.choices[0].text) >= 5
-
-
-@pytest.mark.asyncio
-async def test_single_chat_session(client: openai.AsyncOpenAI):
-    messages = [{
-        "role": "system",
-        "content": "you are a helpful assistant"
-    }, {
-        "role": "user",
-        "content": "what is 1+1?"
-    }]
-
-    # test single completion
-    chat_completion = await client.chat.completions.create(model=MODEL_NAME,
-                                                           messages=messages,
-                                                           max_tokens=10,
-                                                           logprobs=True,
-                                                           top_logprobs=5)
-    assert chat_completion.id is not None
-    assert len(chat_completion.choices) == 1
-
-    choice = chat_completion.choices[0]
-    assert choice.finish_reason == "length"
-    assert chat_completion.usage == openai.types.CompletionUsage(
-        completion_tokens=10, prompt_tokens=55, total_tokens=65)
-
-    message = choice.message
-    assert message.content is not None and len(message.content) >= 10
-    assert message.role == "assistant"
-    messages.append({"role": "assistant", "content": message.content})
-
-    # test multi-turn dialogue
-    messages.append({"role": "user", "content": "express your result in json"})
-    chat_completion = await client.chat.completions.create(
-        model=MODEL_NAME,
-        messages=messages,
-        max_tokens=10,
-    )
-    message = chat_completion.choices[0].message
-    assert message.content is not None and len(message.content) >= 0
--- a/tests/entrypoints/openai/rpc/__init__.py
+++ b/tests/entrypoints/openai/rpc/__init__.py
--- a/tests/compile/test_full_graph.py
+++ b/tests/compile/test_full_graph.py
@ -1,22 +1,13 @@
-import os
-
 import pytest

+from vllm.compilation.backends import vllm_backend

-@pytest.mark.parametrize("model", ["meta-llama/Meta-Llama-3-8B"])
-def test_full_graph(model):
-    # make sure these models can be captured in full graph mode
-    os.environ["VLLM_TEST_DYNAMO_GRAPH_CAPTURE"] = "1"
+from .utils import TEST_MODELS, check_full_graph_support

-    from vllm import LLM, SamplingParams
-    prompts = [
-        "Hello, my name is",
-        "The president of the United States is",
-        "The capital of France is",
-        "The future of AI is",
-    ]
-    sampling_params = SamplingParams(temperature=0)
-    llm = LLM(model="meta-llama/Meta-Llama-3-8B",
-              enforce_eager=True,
-              load_format="dummy")
-    llm.generate(prompts, sampling_params)
+
+@pytest.mark.parametrize("model_info", TEST_MODELS)
+@pytest.mark.parametrize("backend", ["eager", vllm_backend])
+def test_full_graph(model_info, backend):
+    # model_info is indexed as [0] = model, [1] = model kwargs.
+    model, model_kwargs = model_info[0], model_info[1]
+    check_full_graph_support(model, model_kwargs, backend, tp_size=1)
--- a/tests/compile/test_full_graph_multi_gpu.py
+++ b/tests/compile/test_full_graph_multi_gpu.py
@ -0,0 +1,22 @@
+import pytest
+
+from vllm.compilation.backends import vllm_backend
+from vllm.utils import cuda_device_count_stateless
+
+from ..utils import fork_new_process_for_each_test
+from .utils import TEST_MODELS_SMOKE, check_full_graph_support
+
+
+@pytest.mark.parametrize("model_info", TEST_MODELS_SMOKE)
+@pytest.mark.parametrize("tp_size", [2])
+@pytest.mark.parametrize("backend", ["eager", vllm_backend])
+@fork_new_process_for_each_test
+def test_full_graph_multi_gpu(model_info, tp_size, backend):
+    # model_info is indexed as [0] = model, [1] = model kwargs.
+    model, model_kwargs = model_info[0], model_info[1]
+
+    available = cuda_device_count_stateless()
+    if available < tp_size:
+        pytest.skip("Not enough CUDA devices for the test.")
+
+    check_full_graph_support(model, model_kwargs, backend, tp_size=tp_size)
--- a/tests/compile/test_full_graph_smoke.py
+++ b/tests/compile/test_full_graph_smoke.py
@ -0,0 +1,13 @@
+import pytest
+
+from vllm.compilation.backends import vllm_backend
+
+from .utils import TEST_MODELS_SMOKE, check_full_graph_support
+
+
+@pytest.mark.parametrize("model_info", TEST_MODELS_SMOKE)
+@pytest.mark.parametrize("backend", ["eager", vllm_backend])
+def test_full_graph(model_info, backend):
+    model = model_info[0]
+    model_kwargs = model_info[1]
+    check_full_graph_support(model, model_kwargs, backend, tp_size=1)
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Michael Goin	7193774b1f	[Misc] Support quantization of MllamaForCausalLM (#8822 )	2024-09-25 14:46:22 -07:00
Roger Wang	e2c6e0a829	[Doc] Update doc for Transformers 4.45 (#8817 )	2024-09-25 13:29:48 -07:00
Chen Zhang	770ec6024f	[Model] Add support for the multi-modal Llama 3.2 model (#8811 ) Co-authored-by: simon-mo <xmo@berkeley.edu> Co-authored-by: Chang Su <chang.s.su@oracle.com> Co-authored-by: Simon Mo <simon.mo@hey.com> Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com> Co-authored-by: Roger Wang <ywang@roblox.com>	2024-09-25 13:29:32 -07:00
Simon Mo	4f1ba0844b	Revert "rename PromptInputs and inputs with backward compatibility (#8760 ) (#8810 )	2024-09-25 10:36:26 -07:00
Michael Goin	873edda6cf	[Misc] Support FP8 MoE for compressed-tensors (#8588 )	2024-09-25 09:43:36 -07:00
科英	64840dfae4	[Frontend] MQLLMEngine supports profiling. (#8761 )	2024-09-25 09:37:41 -07:00
Cyrus Leung	28e1299e60	rename PromptInputs and inputs with backward compatibility (#8760 )	2024-09-25 09:36:47 -07:00
DefTruth	0c4d2ad5e6	[VLM][Bugfix] internvl with num_scheduler_steps > 1 (#8614 )	2024-09-25 09:35:53 -07:00
Jee Jee Li	c6f2485c82	[[Misc]] Add extra deps for openai server image (#8792 )	2024-09-25 09:35:23 -07:00
bnellnm	300da09177	[Kernel] Fullgraph and opcheck tests (#8479 )	2024-09-25 08:35:52 -06:00
Hongxia Yang	1c046447a6	[CI/Build][Bugfix][Doc][ROCm] CI fix and doc update after ROCm 6.2 upgrade (#8777 )	2024-09-25 22:26:37 +08:00
Woo-Yeon Lee	8fae5ed7f6	[Misc] Fix minor typo in scheduler (#8765 )	2024-09-25 00:53:03 -07:00
David Newman	3368c3ab36	[Bugfix] Ray 2.9.x doesn't expose available_resources_per_node (#8767 ) Signed-off-by: darthhexx <darthhexx@gmail.com>	2024-09-25 00:52:26 -07:00
Adam Tilghman	1ac3de09cd	[Frontend] OpenAI server: propagate usage accounting to FastAPI middleware layer (#8672 )	2024-09-25 07:49:26 +00:00
sohamparikh	3e073e66f1	[Bugfix] load fc bias from config for eagle (#8790 )	2024-09-24 23:16:30 -07:00
Isotr0py	c23953675f	[Hardware][CPU] Enable mrope and support Qwen2-VL on CPU backend (#8770 )	2024-09-24 23:16:11 -07:00
zifeitong	e3dd0692fa	[BugFix] Propagate 'trust_remote_code' setting in internvl and minicpmv (#8250 )	2024-09-25 05:53:43 +00:00
sroy745	fc3afc20df	Fix tests in test_chunked_prefill_scheduler which fail with BlockManager V2 (#8752 )	2024-09-24 21:26:36 -07:00
sasha0552	b4522474a3	[Bugfix][Kernel] Implement acquire/release polyfill for Pascal (#8776 )	2024-09-24 21:26:33 -07:00
sroy745	ee777d9c30	Fix test_schedule_swapped_simple in test_scheduler.py (#8780 )	2024-09-24 21:26:18 -07:00
Joe Runde	6e0c9d6bd0	[Bugfix] Use heartbeats instead of health checks (#8583 )	2024-09-24 20:37:38 -07:00
Archit Patke	6da1ab6b41	[Core] Adding Priority Scheduling (#5958 )	2024-09-24 19:50:50 -07:00
Travis Johnson	01b6f9e1f0	[Core][Bugfix] Support prompt_logprobs returned with speculative decoding (#8047 ) Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com>	2024-09-24 17:29:56 -07:00
Jee Jee Li	13f9f7a3d0	[[Misc]Upgrade bitsandbytes to the latest version 0.44.0 (#8768 )	2024-09-24 17:08:55 -07:00
youkaichao	1e7d5c01f5	[misc] soft drop beam search (#8763 )	2024-09-24 15:48:39 -07:00
Daniele	2467b642dd	[CI/Build] fix setuptools-scm usage (#8771 )	2024-09-24 12:38:12 -07:00
Lucas Wilkinson	72fc97a0f1	[Bugfix] Fix torch dynamo fixes caused by `replace_parameters` (#8748 )	2024-09-24 14:33:21 -04:00
Andy	2529d09b5a	[Frontend] Batch inference for llm.chat() API (#8648 ) Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com> Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk> Co-authored-by: Roger Wang <ywang@roblox.com> Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com>	2024-09-24 09:44:11 -07:00
ElizaWszola	a928ded995	[Kernel] Split Marlin MoE kernels into multiple files (#8661 ) Co-authored-by: mgoin <michael@neuralmagic.com>	2024-09-24 09:31:42 -07:00
Hanzhi Zhou	cc4325b66a	[Bugfix] Fix potentially unsafe custom allreduce synchronization (#8558 )	2024-09-24 01:08:14 -07:00
Alex Brooks	8ff7ced996	[Model] Expose Phi3v num_crops as a mm_processor_kwarg (#8658 ) Signed-off-by: Alex-Brooks <Alex.Brooks@ibm.com> Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com> Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>	2024-09-24 07:36:46 +00:00
Peter Salas	3f06bae907	[Core][Model] Support loading weights by ID within models (#7931 )	2024-09-24 07:14:15 +00:00
Cody Yu	b8747e8a7c	[MISC] Skip dumping inputs when unpicklable (#8744 )	2024-09-24 06:10:03 +00:00
Simon Mo	3185fb0cca	Revert "[Core] Rename `PromptInputs` to `PromptType`, and `inputs` to `prompt`" (#8750 )	2024-09-24 05:45:20 +00:00
youkaichao	0250dd68c5	re-implement beam search on top of vllm core (#8726 ) Co-authored-by: Brendan Wong <bjwpokemon@gmail.com>	2024-09-23 22:08:12 -07:00
sroy745	88577ac928	Fix tests in test_scheduler.py that fail with BlockManager V2 (#8728 )	2024-09-24 04:43:13 +00:00
Hongxia Yang	530821d00c	[Hardware][AMD] ROCm6.2 upgrade (#8674 )	2024-09-23 18:52:39 -07:00
Alexander Matveev	1a2aef3e59	Add output streaming support to multi-step + async while ensuring RequestOutput obj reuse (#8335 )	2024-09-23 15:38:04 -07:00
jiqing-feng	5f7bb58427	Fix typical acceptance sampler with correct recovered token ids (#8562 )	2024-09-23 12:32:27 -07:00
Russell Bryant	b05f5c9238	[Core] Allow IPv6 in VLLM_HOST_IP with zmq (#8575 ) Signed-off-by: Russell Bryant <rbryant@redhat.com>	2024-09-23 12:15:41 -07:00
Jee Jee Li	9b0e3ec970	[Kernel][LoRA] Add assertion for punica sgmv kernels (#7585 )	2024-09-23 18:57:42 +00:00
Lucas Wilkinson	86e9c8df29	[Kernel] (2/N) Machete - Integrate into CompressedTensorsWNA16 and GPTQMarlin (#7701 ) Co-authored-by: mgoin <michael@neuralmagic.com> Co-authored-by: Divakar Verma <137818590+divakar-amd@users.noreply.github.com> Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>	2024-09-23 13:46:26 -04:00
Daniele	ee5f34b1c2	[CI/Build] use setuptools-scm to set __version__ (#4738 ) Co-authored-by: youkaichao <youkaichao@126.com>	2024-09-23 09:44:26 -07:00
Jani Monoses	f2bd246c17	[VLM] Fix paligemma, fuyu and persimmon with transformers 4.45 : use config.text_config.vocab_size (#8707 )	2024-09-23 14:43:09 +00:00
Yanyi Liu	a79e522984	[Model] Support pp for qwen2-vl (#8696 )	2024-09-23 13:46:59 +00:00
Li, Jiang	3e83c12b5c	[Bugfix][CPU] fix missing input intermediate_tensors in the cpu_model_runner (#8733 )	2024-09-23 13:15:16 +00:00
Isotr0py	e551ca1555	[Hardware][CPU] Refactor CPU model runner (#8729 )	2024-09-23 20:12:20 +08:00
Alex Brooks	9b8c8ba119	[Core][Frontend] Support Passing Multimodal Processor Kwargs (#8657 ) Signed-off-by: Alex-Brooks <Alex.Brooks@ibm.com>	2024-09-23 07:44:48 +00:00
Yan Ma	d23679eb99	[Bugfix] fix docker build for xpu (#8652 )	2024-09-22 22:54:18 -07:00
Luka Govedič	57a0702e63	[Bugfix] Fix CPU CMake build (#8723 ) Co-authored-by: Yuan <yuan.zhou@intel.com>	2024-09-22 20:40:46 -07:00
Tyler Michael Smith	3dda7c2250	[Bugfix] Avoid some bogus messages RE CUTLASS's revision when building (#8702 )	2024-09-22 22:24:59 -04:00
youkaichao	92ba7e7477	[misc] upgrade mistral-common (#8715 )	2024-09-22 15:41:59 -07:00
youkaichao	d4a2ac8302	[build] enable existing pytorch (for GH200, aarch64, nightly) (#8713 )	2024-09-22 12:47:54 -07:00
Lily Liu	c6bd70d772	[SpecDec][Misc] Cleanup, remove bonus token logic. (#8701 )	2024-09-22 12:34:14 -07:00
litianjian	5b59532760	[Model][VLM] Add LLaVA-Onevision model support (#8486 ) Co-authored-by: litianjian <litianjian@bytedance.com> Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com> Co-authored-by: Roger Wang <ywang@roblox.com> Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>	2024-09-22 10:51:44 -07:00
Huazhong Ji	ca2b628b3c	[MISC] rename CudaMemoryProfiler to DeviceMemoryProfiler (#8703 )	2024-09-22 10:44:09 -07:00
Alex Brooks	8ca5051b9a	[Misc] Use NamedTuple in Multi-image example (#8705 ) Signed-off-by: Alex-Brooks <Alex.Brooks@ibm.com>	2024-09-22 20:56:20 +08:00
Cyrus Leung	06ed2815e2	[Model] Refactor BLIP/BLIP-2 to support composite model loading (#8407 )	2024-09-22 12:24:21 +00:00
youkaichao	0e40ac9b7b	[ci][build] fix vllm-flash-attn (#8699 )	2024-09-21 23:24:58 -07:00
Isotr0py	13d88d4137	[Bugfix] Refactor composite weight loading logic (#8656 )	2024-09-22 04:33:27 +00:00
Tyler Michael Smith	d66ac62854	[Kernel][Bugfix] Delete some more useless code in marlin_moe_ops.cu (#8643 )	2024-09-21 23:45:02 +00:00
Divakar Verma	9dc7c6c7f3	[dbrx] refactor dbrx experts to extend FusedMoe class (#8518 )	2024-09-21 15:09:39 -06:00
rasmith	ec4aaad812	[Kernel][Triton][AMD] Remove tl.atomic_add from awq_gemm_kernel, 2-5x speedup MI300, minor improvement for MI250 (#8646 )	2024-09-21 09:20:54 +00:00
Andy Dai	4dfdf43196	[Doc] Fix typo in AMD installation guide (#8689 )	2024-09-21 00:24:12 -07:00
Cyrus Leung	5e85f4f82a	[VLM] Use `SequenceData.from_token_counts` to create dummy data (#8687 )	2024-09-20 23:28:56 -07:00
Luka Govedič	71c60491f2	[Kernel] Build flash-attn from source (#8245 )	2024-09-20 23:27:10 -07:00
youkaichao	0faab90eb0	[beam search] add output for manually checking the correctness (#8684 )	2024-09-20 19:55:33 -07:00
Cyrus Leung	0455c46ed4	[Core] Factor out common code in `SequenceData` and `Sequence` (#8675 )	2024-09-21 02:30:39 +00:00
Kunshang Ji	d4bf085ad0	[MISC] add support custom_op check (#8557 ) Co-authored-by: youkaichao <youkaichao@126.com>	2024-09-20 19:03:55 -07:00
Cyrus Leung	0057894ef7	[Core] Rename `PromptInputs` and `inputs`(#8673 )	2024-09-20 19:00:54 -07:00
zyddnys	0f961b3ce9	[Bugfix] Fix incorrect llava next feature size calculation (#8496 )	2024-09-20 22:48:32 +00:00
omrishiv	7f9c8902e3	[Hardware][AWS] update neuron to 2.20 (#8676 ) Signed-off-by: omrishiv <327609+omrishiv@users.noreply.github.com>	2024-09-20 15:19:44 -07:00
omrishiv	7c8566aa4f	[Doc] neuron documentation update (#8671 ) Signed-off-by: omrishiv <327609+omrishiv@users.noreply.github.com>	2024-09-20 15:04:37 -07:00
Patrick von Platen	b4e4eda92e	[Bugfix][Core] Fix tekken edge case for mistral tokenizer (#8640 )	2024-09-20 14:33:03 -07:00
Pastel！	2874bac618	[Bugfix] Config got an unexpected keyword argument 'engine' (#8556 )	2024-09-20 14:00:45 -07:00
Cyrus Leung	035fa895ec	[Misc] Show AMD GPU topology in `collect_env.py` (#8649 )	2024-09-20 13:52:19 -07:00
saumya-saran	b28298f2f4	[Bugfix] Validate SamplingParam n is an int (#8548 )	2024-09-20 12:46:02 -07:00
Alexey Kondratiev(AMD)	2940afa04e	[CI/Build] Removing entrypoints/openai/test_embedding.py test from ROCm build (#8670 )	2024-09-20 10:27:44 -07:00
Niklas Muennighoff	3b63de9353	[Model] Add OLMoE (#7922 )	2024-09-20 09:31:41 -07:00
Jiaxin Shan	260d40b5ea	[Core] Support Lora lineage and base model metadata management (#6315 )	2024-09-20 06:20:56 +00:00
William Lin	9e5ec35b1f	[bugfix] [AMD] add multi-step advance_step to ROCmFlashAttentionMetadata (#8474 )	2024-09-19 20:49:54 -07:00
Amit Garg	18ae428a0d	[Bugfix] Fix Phi3.5 mini and MoE LoRA inference (#8571 )	2024-09-20 08:54:02 +08:00
bnellnm	de6f90a13d	[Misc] guard against change in cuda library name (#8609 )	2024-09-20 06:36:30 +08:00
Alexey Kondratiev(AMD)	6cb748e190	[CI/Build] Re-enabling Entrypoints tests on ROCm, excluding ones that fail (#8551 )	2024-09-19 13:06:32 -07:00
Simon Mo	9e99407e3c	Create SECURITY.md (#8642 )	2024-09-19 12:16:28 -07:00
Isotr0py	ea4647b7d7	[Doc] Add documentation for GGUF quantization (#8618 )	2024-09-19 13:15:55 -06:00
盏一	e42c634acb	[Core] simplify logits resort in _apply_top_k_top_p (#8619 )	2024-09-19 18:28:25 +00:00
Charlie Fu	9cc373f390	[Kernel][Amd] Add fp8 kv cache support for rocm custom paged attention (#8577 )	2024-09-19 17:37:57 +00:00
Nick Hill	76515f303b	[Frontend] Use MQLLMEngine for embeddings models too (#8584 )	2024-09-19 12:51:06 -04:00
Kunshang Ji	855c8ae2c9	[MISC] remove engine_use_ray in benchmark_throughput.py (#8615 )	2024-09-18 22:33:20 -07:00
Kuntai Du	c52ec5f034	[Bugfix] fixing sonnet benchmark bug in benchmark_serving.py (#8616 )	2024-09-19 05:24:24 +00:00
Roger Wang	02c9afa2d0	Revert "[Misc][Bugfix] Disable guided decoding for mistral tokenizer" (#8593 )	2024-09-19 04:14:28 +00:00
sroy745	3118f63385	[Bugfix] [Encoder-Decoder] Bugfix for encoder specific metadata construction during decode of encoder-decoder models. (#8545 )	2024-09-19 02:24:15 +00:00
Tyler Michael Smith	4c34ce8916	[Kernel] Remove marlin moe templating on thread_m_blocks (#8573 ) Co-authored-by: lwilkinson@neuralmagic.com	2024-09-19 01:42:49 +00:00
Joe Runde	0d47bf3bf4	[Bugfix] add `dead_error` property to engine client (#8574 ) Signed-off-by: Joe Runde <Joseph.Runde@ibm.com>	2024-09-18 22:10:01 +00:00
Nick Hill	d9cd78eb71	[BugFix] Nonzero exit code if MQLLMEngine startup fails (#8572 )	2024-09-18 20:17:55 +00:00
Tyler Michael Smith	db9120cded	[Kernel] Change interface to Mamba selective_state_update for continuous batching (#8039 )	2024-09-18 20:05:06 +00:00
Gregory Shtrasberg	b3195bc9e4	[AMD][ROCm]Quantization methods on ROCm; Fix _scaled_mm call (#8380 ) Co-authored-by: Alexei-V-Ivanov-AMD <156011006+Alexei-V-Ivanov-AMD@users.noreply.github.com> Co-authored-by: Michael Goin <michael@neuralmagic.com>	2024-09-18 10:41:08 -07:00
Geun, Lim	e18749ff09	[Model] Support Solar Model (#8386 ) Co-authored-by: Michael Goin <michael@neuralmagic.com>	2024-09-18 11:04:00 -06:00
Russell Bryant	d65798f78c	[Core] zmq: bind only to 127.0.0.1 for local-only usage (#8543 ) Signed-off-by: Russell Bryant <rbryant@redhat.com>	2024-09-18 16:10:27 +00:00
afeldman-nm	a8c1d161a7	[Core] Prompt logprobs support in Multi-step (#8199 )	2024-09-18 08:38:43 -07:00
Alexander Matveev	7c7714d856	[Core][Bugfix][Perf] Introduce `MQLLMEngine` to avoid `asyncio` OH (#8157 ) Co-authored-by: Nick Hill <nickhill@us.ibm.com> Co-authored-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com> Co-authored-by: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com> Co-authored-by: Simon Mo <simon.mo@hey.com>	2024-09-18 13:56:58 +00:00
Aaron Pham	9d104b5beb	[CI/Build] Update Ruff version (#8469 ) Signed-off-by: Aaron Pham <contact@aarnphm.xyz> Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>	2024-09-18 11:00:56 +00:00
Cyrus Leung	6ffa3f314c	[CI/Build] Avoid CUDA initialization (#8534 )	2024-09-18 10:38:11 +00:00
Jiaxin Shan	e351572900	[Misc] Add argument to disable FastAPI docs (#8554 )	2024-09-18 09:51:59 +00:00
Daniele	95965d31b6	[CI/Build] fix Dockerfile.cpu on podman (#8540 )	2024-09-18 10:49:53 +08:00
Tyler Michael Smith	8110e44529	[Kernel] Change interface to Mamba causal_conv1d_update for continuous batching (#8012 )	2024-09-17 23:44:27 +00:00
Alexey Kondratiev(AMD)	09deb4721f	[CI/Build] Excluding kernels/test_gguf.py from ROCm (#8520 )	2024-09-17 16:40:29 -07:00
youkaichao	fa0c114fad	[doc] improve installation doc (#8550 ) Co-authored-by: Andy Dai <76841985+Imss27@users.noreply.github.com>	2024-09-17 16:24:06 -07:00
Joe Runde	98f9713399	[Bugfix] Fix TP > 1 for new granite (#8544 ) Signed-off-by: Joe Runde <Joseph.Runde@ibm.com>	2024-09-17 23:17:08 +00:00
Nick Hill	56c3de018c	[Misc] Don't dump contents of kvcache tensors on errors (#8527 )	2024-09-17 12:24:29 -07:00
Patrick von Platen	a54ed80249	[Model] Add mistral function calling format to all models loaded with "mistral" format (#8515 ) Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>	2024-09-17 17:50:37 +00:00
chenqianfzh	9855b99502	[Feature][kernel] tensor parallelism with bitsandbytes quantization (#8434 )	2024-09-17 08:09:12 -07:00
sroy745	1009e93c5d	[Encoder decoder] Add cuda graph support during decoding for encoder-decoder models (#7631 )	2024-09-17 07:35:01 -07:00
Isotr0py	1b6de8352b	[Benchmark] Support sample from HF datasets and image input for benchmark_serving (#8495 )	2024-09-17 07:34:27 +00:00
Rui Qiao	cbdb252259	[Misc] Limit to ray[adag] 2.35 to avoid backward incompatible change (#8509 ) Signed-off-by: Rui Qiao <ruisearch42@gmail.com>	2024-09-17 00:06:26 -07:00
youkaichao	99aa4eddaf	[torch.compile] register allreduce operations as custom ops (#8526 )	2024-09-16 22:57:57 -07:00
Roger Wang	ee2bceaaa6	[Misc][Bugfix] Disable guided decoding for mistral tokenizer (#8521 )	2024-09-16 22:22:45 -07:00
Alex Brooks	1c1bb388e0	[Frontend] Improve Nullable kv Arg Parsing (#8525 ) Signed-off-by: Alex-Brooks <Alex.Brooks@ibm.com>	2024-09-17 04:17:32 +00:00
Simon Mo	546034b466	[refactor] remove triton based sampler (#8524 )	2024-09-16 20:04:48 -07:00
Joe Runde	cca61642e0	[Bugfix] Fix 3.12 builds on main (#8510 ) Signed-off-by: Joe Runde <Joseph.Runde@ibm.com>	2024-09-17 00:01:45 +00:00
Simon Mo	5ce45eb54d	[misc] small qol fixes for release process (#8517 )	2024-09-16 15:11:27 -07:00
Simon Mo	5478c4b41f	[perf bench] set timeout to debug hanging (#8516 )	2024-09-16 14:30:02 -07:00
Kevin Lin	47f5e03b5b	[Bugfix] Bind api server port before starting engine (#8491 )	2024-09-16 13:56:28 -07:00
youkaichao	2759a43a26	[doc] update doc on testing and debugging (#8514 )	2024-09-16 12:10:23 -07:00
Luka Govedič	5d73ae49d6	[Kernel] AQ AZP 3/4: Asymmetric quantization kernels (#7270 )	2024-09-16 11:52:40 -07:00
sasha0552	781e3b9a42	[Bugfix][Kernel] Fix build for sm_60 in GGUF kernel (#8506 )	2024-09-16 12:15:57 -06:00
Nick Hill	acd5511b6d	[BugFix] Fix clean shutdown issues (#8492 )	2024-09-16 09:33:46 -07:00
lewtun	837c1968f9	[Frontend] Expose revision arg in OpenAI server (#8501 )	2024-09-16 15:55:26 +00:00
ElizaWszola	a091e2da3e	[Kernel] Enable 8-bit weights in Fused Marlin MoE (#8032 ) Co-authored-by: Dipika <dipikasikka1@gmail.com>	2024-09-16 09:47:19 -06:00
Isotr0py	fc990f9795	[Bugfix][Kernel] Add `IQ1_M` quantization implementation to GGUF kernel (#8357 )	2024-09-15 16:51:44 -06:00
Chris	3724d5f6b5	[Bugfix][Model] Fix Python 3.8 compatibility in Pixtral model by updating type annotations (#8490 )	2024-09-15 04:20:05 +00:00
Woosuk Kwon	50e9ec41fc	[TPU] Implement multi-step scheduling (#8489 )	2024-09-14 16:58:31 -07:00
youkaichao	47790f3e32	[torch.compile] add a flag to disable custom op (#8488 )	2024-09-14 13:07:16 -07:00
youkaichao	a36e070dad	[torch.compile] fix functionalization (#8480 )	2024-09-14 09:46:04 -07:00
ywfang	8a0cf1ddc3	[Model] support minicpm3 (#8297 ) Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>	2024-09-14 14:50:26 +00:00
Charlie Fu	1ef0d2efd0	[Kernel][Hardware][Amd]Custom paged attention kernel for rocm (#8310 )	2024-09-13 17:01:11 -07:00
Kunshang Ji	851725202a	[Hardware][intel GPU] bump up ipex version to 2.3 (#8365 ) Co-authored-by: Yan Ma <yan.ma@intel.com>	2024-09-13 16:54:34 -07:00