Compare commits

..

67 Commits

Author SHA1 Message Date
6de0982dd0 added
Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
2025-04-06 14:07:43 +00:00
45fa7f9b8e updated
Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
2025-04-05 13:43:03 +00:00
0408efc6d0 [Misc] Improve error message for incorrect pynvml (#12809)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-02-06 15:23:50 +08:00
449d1bce02 [Misc] Remove duplicated DeepSeek V2/V3 model definition (#12793) 2025-02-05 23:16:20 -08:00
1a6fcad4c9 Improve TransformersModel UX (#12785) 2025-02-05 22:24:57 -08:00
56534cd577 [Bugfix] Fix the test_ultravox.py's license (#12806)
Signed-off-by: Lu Fang <lufang@fb.com>
2025-02-06 13:25:54 +08:00
d88506dda4 [Model] LoRA Support for Ultravox model (#11253) 2025-02-05 19:54:13 -08:00
9cdea30b4f [Misc][Easy] Remove the space from the file name 2025-02-05 19:23:35 -08:00
76abd0c881 [Bugfix] Better FP8 supported defaults 2025-02-05 19:22:19 -08:00
5b19b93082 [ROCm][Kernel] Using the correct warp_size value 2025-02-05 19:15:08 -08:00
75404d041b [VLM] Update compatibility with transformers 4.49 2025-02-05 19:09:45 -08:00
bf3b79efb8 [VLM] Qwen2.5-VL 2025-02-05 13:31:38 -08:00
9a5b1554b4 [Docs] Drop duplicate [source] links 2025-02-05 13:30:50 -08:00
a4ce74c14a [VLM] Use shared field to pass token ids to model 2025-02-05 13:30:46 -08:00
3b2005e1db Add: Support for Sparse24Bitmask Compressed Models 2025-02-05 13:30:43 -08:00
af8486de49 [Hardware][Intel-Gaudi] Enable FusedSDPA support for Intel Gaudi (HPU) 2025-02-05 13:29:45 -08:00
4c3aac51e1 Merging PR #12536
Merged via CLI script
2025-02-05 13:24:26 -08:00
bc1bdecebf [core][distributed] exact ray placement control (#12732)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-02-06 02:03:19 +08:00
022bcc701a [Bugfix] Fix 'ModuleNotFoundError: No module named 'intel_extension_for_pytorch'' for --tensor-parallel-size more than 1 (#12546) 2025-02-04 23:11:02 -08:00
c53dc466b1 [Doc] Remove performance warning for auto_awq.md (#12743) 2025-02-04 22:43:11 -08:00
3d09e592a8 [V1][Misc] Shorten FinishReason enum and use constant strings (#12760) 2025-02-04 22:43:02 -08:00
fcf2e3d7fc [Bugfix] Fix OpenVINO model runner (#12750) 2025-02-04 22:42:46 -08:00
58b218d7ae [Doc] Update PR Reminder with link to Developer Slack (#12748) 2025-02-04 22:42:09 -08:00
7ff7a638b6 [Model][Quant] Fix GLM, Fix fused module mappings for quantization (#12634)
Signed-off-by: mgoin <michael@neuralmagic.com>
Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
Co-authored-by: mgoin <michael@neuralmagic.com>
2025-02-05 05:32:06 +00:00
686006a220 [Misc] Bump the compressed-tensors version (#12736) 2025-02-04 20:44:48 -08:00
98fd089fc9 [VLM] Add MLA with pure RoPE support for deepseek-vl2 models (#12729) 2025-02-04 20:44:26 -08:00
249824c3bf Refactor Linear handling in TransformersModel (#12727)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-02-05 04:31:12 +00:00
64862d106e [ROCM][AMD][TRITON] Halving warps number for fw_prefill to reduce spilling (#12713)
Signed-off-by: Aleksandr Malyshev <maleksan@amd.com>
Co-authored-by: Aleksandr Malyshev <maleksan@amd.com>
2025-02-05 03:58:22 +00:00
b3a0d01e45 [Core] add and implement VLLM_LOGITS_PROCESSOR_THREADS (#12368)
Signed-off-by: Aviv Keshet <akeshet@scaledcognition.com>
2025-02-04 18:46:26 -08:00
75e94309e8 [Perf] Mem align KV caches for CUDA devices (MLA perf improvement) (#12676)
Signed-off-by: simon-mo <xmo@berkeley.edu>
Signed-off-by: Lucas Wilkinson <lcwilkins@redhat.com>
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
Co-authored-by: simon-mo <xmo@berkeley.edu>
2025-02-04 18:22:24 -08:00
233df6f5c4 [V1][Metrics] Add request_success_total counter, labelled with finish reason (#12579)
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
2025-02-04 19:46:54 -05:00
18016a5e62 [Bugfix] Fix CI failures for InternVL and Mantis models (#12728)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-02-04 23:54:23 +08:00
649550f27e [Build] update requirements of no-device for plugin usage (#12630)
Signed-off-by: Sophie du Couédic <sop@zurich.ibm.com>
2025-02-04 21:19:12 +08:00
62467a834a Avoid unnecessary multi-modal input data copy when len(batch) == 1 (#12722)
Signed-off-by: imkero <kerorek@outlook.com>
2025-02-04 21:03:19 +08:00
6469038b14 [Bugfix] Fix loading of fine-tuned models based on Phi-3-Small (#12689)
Signed-off-by: Michael Greenbaum <mgreenbaum@microsoft.com>
Co-authored-by: Michael Greenbaum <mgreenbaum@microsoft.com>
2025-02-04 20:58:48 +08:00
815079de8e [VLM] merged multimodal processor and V1 support for idefics3 (#12660)
Signed-off-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-02-04 20:00:51 +08:00
18a88fcccc [V1] Remove scheduling constraint on partial requests (#12674)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-02-04 02:43:58 -08:00
d1ca7df84d [VLM] Merged multi-modal processor for InternVL-based models (#12553)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Isotr0py <2037008807@qq.com>
2025-02-04 16:44:52 +08:00
96b23621c1 [Misc] Add BNB quantization for Whisper (#12381)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-02-04 16:27:36 +08:00
c36ac98d01 [AMD][ROCm] Enable DeepSeek model on ROCm (#12662)
Signed-off-by: Hongxia Yang <hongxia.yang@amd.com>
Co-authored-by: Matthew Wong <Matthew.Wong2@amd.com>
2025-02-04 08:24:11 +00:00
4896d0c2dd [Quant] Fix use_mla TypeError and support loading pure-sparsity Compressed Tensors configs (#12711) 2025-02-03 23:27:11 -08:00
bb392af434 [Doc] Replace ibm-fms with ibm-ai-platform (#12709)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2025-02-04 07:05:04 +00:00
5d98d56089 Support Pixtral-Large HF by using llava multimodal_projector_bias config (#12710)
Signed-off-by: mgoin <michael@neuralmagic.com>
2025-02-04 11:55:46 +08:00
73b35cca7f [Core] Improve hash collision avoidance in prefix caching (#12621)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-02-03 16:28:20 -08:00
5095e96606 [V1] Revert uncache_blocks and support recaching full blocks (#12415)
Signed-off-by: Cody Yu <hao.yu.cody@gmail.com>
2025-02-03 15:04:53 -08:00
cf58b9c4ca [MISC] Remove model input dumping when exception (#12582)
Signed-off-by: Cody Yu <hao.yu.cody@gmail.com>
2025-02-03 13:34:16 -08:00
4797dad3ec [Model] Add Deepseek V3 fp8_w8a8 configs for B200 (#12707) 2025-02-03 13:30:39 -08:00
6dd5e52823 Squelch MLA warning for Compressed-Tensors Models (#12704)
Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
2025-02-03 13:29:56 -08:00
c11de33dad [Bugfix][Kernel] Fix per-token/per-channel quantization for Hopper scaled mm (#12696)
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
2025-02-03 13:04:59 -08:00
33e0602e59 [Misc] Fix improper placement of SPDX header in scripts (#12694)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-02-03 11:16:59 -08:00
a1a2aaadb9 [Model]: Add transformers backend support (#11330)
# Adds support for `transformers` as a backend

Following https://github.com/huggingface/transformers/pull/35235, a bunch of models should already be supported, and we are ramping up support for more.

Thanks @Isotr0py for the TP support, and @hmellor for his help as well!
This includes:
- `trust_remote_code=True` support: any model on the Hub that implements attention the correct way can be natively supported (see the quick check sketched below)
- tensor parallel support
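A quick usage sketch (model name illustrative) of checking which backend was picked, mirroring the check added to the docs in this PR:

```python
from vllm import LLM

# Hypothetical model name; any supported Hub model works here.
llm = LLM(model="meta-llama/Llama-3.2-1B", task="generate")

# Prints TransformersModel when the transformers fallback is in use.
llm.apply_model(lambda model: print(model.__class__))
```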

---------

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Signed-off-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Isotr0py <41363108+Isotr0py@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-02-03 21:30:38 +08:00
1298a400e8 [ci/build] fix gh200 test (#12681)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-02-03 15:59:49 +08:00
ad4a9dc817 [cuda] manually import the correct pynvml module (#12679)
fixes problems like https://github.com/vllm-project/vllm/pull/12635 and
https://github.com/vllm-project/vllm/pull/12636 and
https://github.com/vllm-project/vllm/pull/12565

---------

Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-02-03 15:58:21 +08:00
b9986454fe Fix for attention layers to remain unquantized during moe_wn16 quant (#12570)
Fix for AWQ quant loading of the new R1 model.

The new optimized MoE kernels for a large number of experts, `moe_wn16`, use AWQ quant, which requires the attention layers to stay in 16-bit.

The current merge has broken this, and `get_quant_method` must return None for attention layers for it to work correctly again.

---------

Signed-off-by: Srikanth Srinivas <srikanth@astrum.ai>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Signed-off-by: Beim <beim2015@outlook.com>
Signed-off-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com>
Signed-off-by: mgoin <michael@neuralmagic.com>
Signed-off-by: npanpaliya <nishidha.panpaliya@partner.ibm.com>
Signed-off-by: Aleksandr Malyshev <maleksan@amd.com>
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
Signed-off-by: simon-mo <xmo@berkeley.edu>
Signed-off-by: Cody Yu <hao.yu.cody@gmail.com>
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
Signed-off-by: Ryan N <ryan.nguyen@centml.ai>
Signed-off-by: Brian Dellabetta <bdellabe@redhat.com>
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Signed-off-by: Rahul Tuli <rahul@neuralmagic.com>
Signed-off-by: Russell Bryant <rbryant@redhat.com>
Signed-off-by: simon-mo <simon.mo@hey.com>
Signed-off-by: Vicente Herrera <vicenteherrera@vicenteherrera.com>
Signed-off-by: Jinzhen Lin <linjinzhen@hotmail.com>
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Signed-off-by: Shawn Du <shawnd200@outlook.com>
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
Signed-off-by: youkaichao <youkaichao@gmail.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Beim <805908499@qq.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
Co-authored-by: mgoin <michael@neuralmagic.com>
Co-authored-by: simon-mo <xmo@berkeley.edu>
Co-authored-by: Nishidha <nishidha.panpaliya@partner.ibm.com>
Co-authored-by: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
Co-authored-by: Aleksandr Malyshev <164964928+maleksan85@users.noreply.github.com>
Co-authored-by: Aleksandr Malyshev <maleksan@amd.com>
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Co-authored-by: simon-mo <simon.mo@hey.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Zhuohan Li <zhuohan123@gmail.com>
Co-authored-by: Tyler Michael Smith <tysmith@redhat.com>
Co-authored-by: Alexander Matveev <59768536+alexm-neuralmagic@users.noreply.github.com>
Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com>
Co-authored-by: Cody Yu <hao.yu.cody@gmail.com>
Co-authored-by: Chen Zhang <zhangch99@outlook.com>
Co-authored-by: Kevin H. Luu <kevin@anyscale.com>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
Co-authored-by: Ryan Nguyen <96593302+xpbowler@users.noreply.github.com>
Co-authored-by: Brian Dellabetta <brian-dellabetta@users.noreply.github.com>
Co-authored-by: fade_away <1028552010@qq.com>
Co-authored-by: weilong.yu <weilong.yu@shopee.com>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Eldar Kurtic <eldarkurtic314@gmail.com>
Co-authored-by: Rahul Tuli <rahul@neuralmagic.com>
Co-authored-by: Russell Bryant <rbryant@redhat.com>
Co-authored-by: Vicente Herrera <vicenteherrera@vicenteherrera.com>
Co-authored-by: Jinzhen Lin <linjinzhen@hotmail.com>
Co-authored-by: Shawn Du <shawnd200@outlook.com>
Co-authored-by: Kunshang Ji <kunshang.ji@intel.com>
Co-authored-by: youkaichao <youkaichao@gmail.com>
2025-02-03 13:46:19 +08:00
c5932e5dac Properly check if all fused layers are in the list of targets (#12666)
Thanks @kylesayrs for catching this!
2025-02-03 13:42:18 +08:00
20579c0fae make sure mistral_common not imported for non-mistral models (#12669)
When people use DeepSeek models, they find that they need to resolve a cv2 version conflict; see https://zhuanlan.zhihu.com/p/21064432691 .

I added the check and made all imports of `cv2` lazy, as sketched below.
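A minimal sketch of the lazy-import pattern (function name hypothetical): `cv2` is only imported inside the code path that actually needs it, so non-Mistral models never touch the dependency.

```python
def _read_image_bgr(path: str):
    # Deferred import: only image code paths that really need OpenCV pay
    # the cv2 dependency cost (and any version conflict that comes with it).
    import cv2
    return cv2.imread(path)
```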

---------

Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-02-03 13:40:25 +08:00
95460fc513 [Kernel] port sgl moe_align_block_size kernels (#12574)
sgl_moe_align_block_size is based on: ded9fcd09a

moe_align_block_size is based on: ba5112ff69

Signed-off-by: Yang Chen <yangche@fb.com>
2025-02-03 13:09:50 +08:00
326fcc8b9f [Doc] Deprecate Discord (#12668) 2025-02-02 19:19:56 -08:00
e64330910b [doc][misc] clarify VLLM_HOST_IP for multi-node inference (#12667)
As more and more people are trying DeepSeek models with multi-node inference, https://github.com/vllm-project/vllm/issues/7815 becomes more frequent. Let's give a clear message to users.

Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-02-03 09:32:18 +08:00
e489ad7a21 [Misc] Add SPDX-License-Identifier headers to python source files (#12628)
- **Add SPDX license headers to python source files**
- **Check for SPDX headers using pre-commit**

commit 9d7ef44c3cfb72ca4c32e1c677d99259d10d4745
Author: Russell Bryant <rbryant@redhat.com>
Date:   Fri Jan 31 14:18:24 2025 -0500

    Add SPDX license headers to python source files
    
    This commit adds SPDX license headers to python source files as
    recommended to the project by the Linux Foundation. These headers
    provide a concise way that is both human and machine readable for
    communicating license information for each source file. It helps
    avoid any ambiguity about the license of the code and can also be
    easily used by tools to help manage license compliance.

    The Linux Foundation runs license scans against the codebase to help
    ensure we are in compliance with the licenses of the code we use,
    including dependencies. Having these headers in place helps that
    tool do its job.

    More information can be found on the SPDX site:

    - https://spdx.dev/learn/handling-license-info/

    Signed-off-by: Russell Bryant <rbryant@redhat.com>

commit 5a1cf1cb3b80759131c73f6a9dddebccac039dea
Author: Russell Bryant <rbryant@redhat.com>
Date:   Fri Jan 31 14:36:32 2025 -0500

    Check for SPDX headers using pre-commit
    
    Signed-off-by: Russell Bryant <rbryant@redhat.com>
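A hedged sketch of what such a header check could look like (the actual `tools/check_spdx_header.py` added here may differ):

```python
import sys

SPDX_HEADER = "# SPDX-License-Identifier: Apache-2.0"


def has_spdx_header(path: str) -> bool:
    # Accept the tag within the first few lines so a shebang or encoding
    # comment may precede it.
    with open(path, encoding="utf-8") as f:
        first_lines = [f.readline() for _ in range(3)]
    return any(line.strip() == SPDX_HEADER for line in first_lines)


if __name__ == "__main__":
    missing = [p for p in sys.argv[1:] if not has_spdx_header(p)]
    for path in missing:
        print(f"missing SPDX header: {path}")
    sys.exit(1 if missing else 0)
```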

---------

Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-02-02 11:58:18 -08:00
f256ebe4df [Hardware][Intel GPU] add XPU bf16 support (#12392)
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
2025-02-02 10:17:26 +00:00
f8ece6e17f [Core][v1] Unify allocating slots in prefill and decode in KV cache manager (#12608)
As mentioned in RFC https://github.com/vllm-project/vllm/issues/12254, this PR completes the task of combining allocate_slots and append_slots.

There should be no functionality change, except that decode now also raises an exception when num_tokens is zero (as prefill already did), and the unit test case is changed accordingly. A simplified sketch of the unified entry point follows.
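A hedged, greatly simplified sketch (not vLLM's actual `KVCacheManager` API) of a single allocation path shared by prefill and decode:

```python
import math


class KVCacheManager:
    """Simplified stand-in; the real class also tracks prefix-cache hits."""

    def __init__(self, num_blocks: int, block_size: int) -> None:
        self.block_size = block_size
        self.free_blocks = list(range(num_blocks))
        self.req_blocks: dict[str, list[int]] = {}

    def allocate_slots(self, request_id: str, total_tokens: int) -> list[int]:
        # One code path for both prefill and decode; both now raise on
        # zero-token requests.
        if total_tokens == 0:
            raise ValueError("num_tokens must be greater than 0")
        blocks = self.req_blocks.setdefault(request_id, [])
        needed = math.ceil(total_tokens / self.block_size) - len(blocks)
        blocks.extend(self.free_blocks.pop() for _ in range(max(needed, 0)))
        return blocks
```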

@comaniac @rickyyx @WoosukKwon @youkaichao @heheda12345 @simon-mo

---------

Signed-off-by: Shawn Du <shawnd200@outlook.com>
2025-02-02 16:40:58 +08:00
abfcdcdf27 [V1][Minor] Avoid frequently creating ConstantList (#12653)
A small optimization to avoid creating a new `ConstantList` every time `request.kv_block_hashes` is used.
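A hedged sketch of the pattern (class and attribute names illustrative): build the read-only view once and reuse it, instead of wrapping the list in a new `ConstantList` on every access.

```python
class ConstantList:
    """Minimal read-only wrapper standing in for vLLM's ConstantList."""

    def __init__(self, items):
        self._items = items

    def __getitem__(self, i):
        return self._items[i]

    def __len__(self):
        return len(self._items)


class Request:
    def __init__(self, kv_block_hashes: list):
        self._kv_block_hashes = kv_block_hashes
        # Built once; the view holds a reference, so later appends to the
        # underlying list remain visible through it.
        self._kv_block_hashes_view = ConstantList(self._kv_block_hashes)

    @property
    def kv_block_hashes(self) -> "ConstantList":
        return self._kv_block_hashes_view
```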

Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-02-01 23:43:20 -08:00
e497f33491 [Core] Silence unnecessary deprecation warnings (#12620)
I noticed during testing that I was getting a lot of these deprecation warnings about `lora_local_path`:

```
DeprecationWarning: The 'lora_local_path' attribute is deprecated
     and will be removed in a future version.
     Please use 'lora_path' instead.
```

The check used for emitting this warning was always True, even when the parameter was not actually specified, because the attribute is always present in `__struct_fields__`. We should be checking for a non-None value instead, as sketched below.
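A minimal sketch of the corrected check (helper name hypothetical; not the exact vLLM code): warn only when the deprecated field was actually supplied.

```python
import warnings


def maybe_warn_deprecated_lora_path(request) -> None:
    # The field always exists in __struct_fields__, so presence alone is not
    # enough; only warn when it carries a real value.
    if getattr(request, "lora_local_path", None) is not None:
        warnings.warn(
            "The 'lora_local_path' attribute is deprecated "
            "and will be removed in a future version. "
            "Please use 'lora_path' instead.",
            DeprecationWarning,
            stacklevel=2,
        )
```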

Signed-off-by: Russell Bryant <rbryant@redhat.com>

Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-02-02 15:35:50 +08:00
baaa2b24da [Bugfix] fix moe_wna16 get_quant_method (#12648)
Fix https://github.com/vllm-project/vllm/issues/12647
The `get_quant_method` of `moe_wna16` always returned the MoE method, the GPTQ-based linear method, or the AWQ-based linear method, even when the target module was an attention layer; it should return None in that case so attention stays unquantized (see the sketch below).


baeded2569/vllm/attention/layer.py (L86-L92)
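A hedged sketch of the intended dispatch (simplified; not the exact vLLM signature):

```python
def get_quant_method(layer_type: str):
    """Simplified stand-in for moe_wna16's get_quant_method."""
    if layer_type == "attention":
        return None  # leave attention layers unquantized (16-bit)
    if layer_type == "fused_moe":
        return "moe_wna16_method"          # placeholder for the MoE method
    return "gptq_or_awq_linear_method"     # placeholder for the linear method
```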

Signed-off-by: Jinzhen Lin <linjinzhen@hotmail.com>
2025-02-02 15:29:56 +08:00
b4e5c03306 doc: fixing minor typo in readme.md (#12643)
Word "evolved" was mistyped

Signed-off-by: Vicente Herrera <vicenteherrera@vicenteherrera.com>

---------

Signed-off-by: Vicente Herrera <vicenteherrera@vicenteherrera.com>
2025-02-01 17:17:29 +00:00
3194039c0e Apply torch.compile to fused_moe/grouped_topk (#12637) 2025-02-01 16:16:19 +00:00
1102 changed files with 13005 additions and 4168 deletions

View File

@ -1,12 +1,14 @@
# SPDX-License-Identifier: Apache-2.0
import os
import sys
import zipfile
# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 300 MiB
# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 400 MiB
# Note that we have 400 MiB quota, please use it wisely.
# See https://github.com/pypi/support/issues/3792 .
# Please also sync the value with the one in Dockerfile.
VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 300))
VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 400))
def print_top_10_largest_files(zip_file):

View File

@ -1,3 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
import argparse
import os

View File

@ -0,0 +1,11 @@
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM -b "auto" -t 2
model_name: "nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.6353
- name: "exact_match,flexible-extract"
value: 0.637
limit: null
num_fewshot: null

View File

@ -1,3 +1,4 @@
# SPDX-License-Identifier: Apache-2.0
"""
LM eval harness on model to compare vs HF baseline computed offline.
Configs are found in configs/$MODEL.yaml

View File

@ -1,3 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
import json
import os
from pathlib import Path

View File

@ -1,3 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
import argparse
from transformers import AutoTokenizer

View File

@ -1,3 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
import argparse
import json
from pathlib import Path

View File

@ -1,3 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
from lmdeploy.serve.openai.api_client import APIClient
api_client = APIClient("http://localhost:8000")

View File

@ -1,3 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
import datetime
import json
import os

View File

@ -23,6 +23,6 @@ trap remove_docker_container EXIT
remove_docker_container
# Run the image and test offline inference
docker run --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
python3 examples/offline_inference/basic.py
docker run -e HF_TOKEN -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
python3 examples/offline_inference/cli.py --model meta-llama/Llama-3.2-1B
'

View File

@ -50,9 +50,9 @@ steps:
- tests/multimodal
- tests/test_utils
- tests/worker
- tests/standalone_tests/lazy_torch_compile.py
- tests/standalone_tests/lazy_imports.py
commands:
- python3 standalone_tests/lazy_torch_compile.py
- python3 standalone_tests/lazy_imports.py
- pytest -v -s mq_llm_engine # MQLLMEngine
- pytest -v -s async_engine # AsyncLLMEngine
- NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py
@ -128,6 +128,7 @@ steps:
- tests/spec_decode/e2e/test_integration_dist_tp4
- tests/compile
- examples/offline_inference/rlhf.py
- examples/offline_inference/ray_placement.py
commands:
- pytest -v -s distributed/test_utils.py
- pytest -v -s compile/test_basic_correctness.py
@ -136,6 +137,7 @@ steps:
# TODO: create a dedicated test section for multi-GPU example tests
# when we have multiple distributed example tests
- python3 ../examples/offline_inference/rlhf.py
- RAY_DEDUP_LOGS=0 python3 ../examples/offline_inference/ray_placement.py
- label: Metrics, Tracing Test # 10min
num_gpus: 2
@ -349,6 +351,7 @@ steps:
- vllm/
- tests/models
commands:
- pytest -v -s models/test_transformers.py
- pytest -v -s models/test_registry.py
- pytest -v -s models/test_initialization.py
@ -485,6 +488,7 @@ steps:
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
- TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
# Avoid importing model tests that cause CUDA reinitialization error
- pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
- pytest models/encoder_decoder/language/test_bart.py -v -s -m 'distributed(num_gpus=2)'
- pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)'
- pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)'

View File

@ -30,15 +30,6 @@ body:
</details>
validations:
required: true
- type: textarea
attributes:
label: Model Input Dumps
description: |
If you are facing crashing due to illegal memory access or other issues with model execution, vLLM may dump the problematic input of the model. In this case, you will see the message `Error in model execution (input dumped to /tmp/err_xxx.pkl)`. If you see this message, please zip the file (because GitHub doesn't support .pkl file format) and upload it here. This will help us to reproduce the issue and facilitate the debugging process.
placeholder: |
Upload the dumped input file.
validations:
required: false
- type: textarea
attributes:
label: 🐛 Describe the bug

View File

@ -2,7 +2,6 @@ name: PR Reminder Comment Bot
on:
pull_request_target:
types: [opened]
jobs:
pr_reminder:
runs-on: ubuntu-latest
@ -15,7 +14,12 @@ jobs:
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: context.issue.number,
body: '👋 Hi! Thank you for contributing to the vLLM project.\n Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which starts running only a small and essential subset of CI tests to quickly catch errors. You can run other CI tests on top of those by going to your `fastcheck` build on Buildkite UI (linked in the PR checks section) and unblock them. If you do not have permission to unblock, ping `simon-mo` or `khluu` to add you in our Buildkite org. \n\nOnce the PR is approved and ready to go, your PR reviewer(s) can run CI to test the changes comprehensively before merging.\n\n To run CI, PR reviewers can do one of these:\n- Add `ready` label to the PR\n- Enable auto-merge.\n\n🚀'
body: '👋 Hi! Thank you for contributing to the vLLM project.\n\n' +
'💬 Join our developer Slack at https://slack.vllm.ai to discuss your PR in #pr-reviews, coordinate on features in #feat- channels, or join special interest groups in #sig- channels.\n\n' +
'Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which starts running only a small and essential subset of CI tests to quickly catch errors. You can run other CI tests on top of those by going to your `fastcheck` build on Buildkite UI (linked in the PR checks section) and unblock them. If you do not have permission to unblock, ping `simon-mo` or `khluu` to add you in our Buildkite org.\n\n' +
'Once the PR is approved and ready to go, your PR reviewer(s) can run CI to test the changes comprehensively before merging.\n\n' +
'To run CI, PR reviewers can either: Add `ready` label to the PR or enable auto-merge.\n\n' +
'🚀'
})
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

View File

@ -97,10 +97,14 @@ repos:
language: system
verbose: true
stages: [commit-msg]
- id: check-spdx-header
name: Check SPDX headers
entry: python tools/check_spdx_header.py
language: python
types: [python]
- id: suggestion
name: Suggestion
entry: bash -c 'echo "To bypass pre-commit hooks, add --no-verify to git commit."'
language: system
verbose: true
pass_filenames: false

View File

@ -61,7 +61,7 @@ representative at an online or offline/IRL event.
Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported to the community leaders responsible for enforcement in the #code-of-conduct
channel in the [vLLM Discord](https://discord.com/invite/jz7wjKhh6g).
channel in the [vLLM Slack](https://slack.vllm.ai).
All complaints will be reviewed and investigated promptly and fairly.
All community leaders are obligated to respect the privacy and security of the

View File

@ -127,7 +127,7 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
# Check the size of the wheel if RUN_WHEEL_CHECK is true
COPY .buildkite/check-wheel-size.py check-wheel-size.py
# sync the default value with .buildkite/check-wheel-size.py
ARG VLLM_MAX_SIZE_MB=300
ARG VLLM_MAX_SIZE_MB=400
ENV VLLM_MAX_SIZE_MB=$VLLM_MAX_SIZE_MB
ARG RUN_WHEEL_CHECK=true
RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \

View File

@ -10,7 +10,7 @@ Easy, fast, and cheap LLM serving for everyone
</h3>
<p align="center">
| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://discord.gg/jz7wjKhh6g"><b>Discord</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
</p>
---
@ -36,7 +36,7 @@ Easy, fast, and cheap LLM serving for everyone
## About
vLLM is a fast and easy-to-use library for LLM inference and serving.
Originally developed in the [Sky Computing Lab](https://sky.cs.berkeley.edu) at UC Berkeley, vLLM has evloved into a community-driven project with contributions from both academia and industry.
Originally developed in the [Sky Computing Lab](https://sky.cs.berkeley.edu) at UC Berkeley, vLLM has evolved into a community-driven project with contributions from both academia and industry.
vLLM is fast with:
@ -139,8 +139,7 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs
## Contact Us
* For technical questions and feature requests, please use Github issues or discussions.
* For discussing with fellow users, please use Discord.
* For coordinating contributions and development, please use Slack.
* For discussing with fellow users and coordinating contributions and development, please use Slack.
* For security disclosures, please use Github's security advisory feature.
* For collaborations and partnerships, please contact us at vllm-questions AT lists.berkeley.edu.

View File

@ -1,3 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
import json
import os
import sys

View File

@ -1,3 +1,4 @@
# SPDX-License-Identifier: Apache-2.0
"""Benchmark guided decoding throughput."""
import argparse
import dataclasses

View File

@ -1,3 +1,4 @@
# SPDX-License-Identifier: Apache-2.0
"""Benchmark the latency of processing a single batch of requests."""
import argparse
import dataclasses

View File

@ -1,3 +1,4 @@
# SPDX-License-Identifier: Apache-2.0
"""
Offline benchmark to test the long document QA throughput.

View File

@ -1,3 +1,4 @@
# SPDX-License-Identifier: Apache-2.0
"""
Benchmark the efficiency of prefix caching.

View File

@ -1,3 +1,4 @@
# SPDX-License-Identifier: Apache-2.0
"""Benchmark offline prioritization."""
import argparse
import dataclasses

View File

@ -1,3 +1,4 @@
# SPDX-License-Identifier: Apache-2.0
r"""Benchmark online serving throughput.
On the server side, run one of the following commands:

View File

@ -1,3 +1,4 @@
# SPDX-License-Identifier: Apache-2.0
r"""Benchmark online serving throughput with guided decoding.
On the server side, run one of the following commands:

View File

@ -1,3 +1,4 @@
# SPDX-License-Identifier: Apache-2.0
"""Benchmark offline inference throughput."""
import argparse
import dataclasses

View File

@ -1,3 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
import argparse
import copy
import itertools

View File

@ -1,3 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# Cutlass bench utils
from typing import Iterable, Tuple

View File

@ -1,3 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
import argparse
import copy
import itertools

View File

@ -1,3 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# Weight Shapes are in the format
# ([K, N], TP_SPLIT_DIM)
# Example:

View File

@ -1,3 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
import os
import aiohttp

View File

@ -1,3 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
import asyncio
import itertools

View File

@ -1,3 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
import json
import matplotlib.pyplot as plt

View File

@ -1,3 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
import pickle as pkl
import time
from dataclasses import dataclass

View File

@ -1,3 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
import os
import sys
from typing import Optional

View File

@ -1,3 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
import time
import torch

View File

@ -1,3 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
import argparse
import copy
import json

View File

@ -1,3 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
import argparse
import copy
import itertools

View File

@ -1,3 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
from typing import List
import torch

View File

@ -1,3 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
import argparse
import time
from datetime import datetime

View File

@ -1,3 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
import random
import time
from typing import List, Optional

View File

@ -1,3 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
import time
import torch

View File

@ -1,3 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
import itertools
from typing import Optional, Tuple, Union

View File

@ -1,3 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
from itertools import accumulate
from typing import List, Optional

View File

@ -1,3 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
WEIGHT_SHAPES = {
"ideal": [[4 * 256 * 32, 256 * 32]],
"mistralai/Mistral-7B-v0.1/TP1": [

View File

@ -1,3 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
import math
import pickle
import re

View File

@ -1,3 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
import dataclasses
from typing import Any, Callable, Iterable, Optional

View File

@ -1,3 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# Weight Shapes are in the format
# ([K, N], TP_SPLIT_DIM)
# Example:

View File

@ -1,3 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
import cProfile
import pstats

View File

@ -1,4 +1,5 @@
#!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0
#
# A command line tool for running pytorch's hipify preprocessor on CUDA

View File

@ -1,3 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# ruff: noqa
# code borrowed from https://github.com/pytorch/pytorch/blob/main/torch/utils/collect_env.py

View File

@ -15,6 +15,9 @@ void copy_blocks(std::vector<torch::Tensor> const& key_caches,
std::vector<torch::Tensor> const& value_caches,
const torch::Tensor& block_mapping);
void copy_blocks_mla(std::vector<torch::Tensor> const& kv_caches,
const torch::Tensor& block_mapping);
void reshape_and_cache(torch::Tensor& key, torch::Tensor& value,
torch::Tensor& key_cache, torch::Tensor& value_cache,
torch::Tensor& slot_mapping,

View File

@ -46,7 +46,10 @@ void swap_blocks(torch::Tensor& src, torch::Tensor& dst,
char* src_ptr = static_cast<char*>(src.data_ptr());
char* dst_ptr = static_cast<char*>(dst.data_ptr());
const int64_t block_size_in_bytes = src.element_size() * src[0].numel();
// We use the stride instead of numel in case the cache is padded for memory
// alignment reasons, we assume the blocks data (inclusive of any padding)
// is contiguous in memory
const int64_t block_size_in_bytes = src.element_size() * src.stride(0);
const at::cuda::OptionalCUDAGuard device_guard(
src_device.is_cuda() ? src_device : dst_device);
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
@ -93,6 +96,24 @@ __global__ void copy_blocks_kernel(int64_t* key_cache_ptrs,
}
}
// Kernel for MLA, which works on a single joint kv_cache
// Grid: (num_layers, num_pairs)
template <typename scalar_t>
__global__ void copy_blocks_mla_kernel(
int64_t* cache_ptrs, const int64_t* __restrict__ block_mapping,
const int mem_footprint_per_block) {
const int layer_idx = blockIdx.x;
const int pair_idx = blockIdx.y;
scalar_t* cache = reinterpret_cast<scalar_t*>(cache_ptrs[layer_idx]);
int64_t src_block = block_mapping[2 * pair_idx];
int64_t dst_block = block_mapping[2 * pair_idx + 1];
int64_t src_offset = src_block * mem_footprint_per_block;
int64_t dst_offset = dst_block * mem_footprint_per_block;
for (int i = threadIdx.x; i < mem_footprint_per_block; i += blockDim.x) {
cache[dst_offset + i] = cache[src_offset + i];
}
}
} // namespace vllm
// Note: the key_caches and value_caches vectors are constant but
@ -147,6 +168,42 @@ void copy_blocks(std::vector<torch::Tensor> const& key_caches,
}));
}
// copy blocks kernel for MLA (assumes a joint KV-cache)
void copy_blocks_mla(std::vector<torch::Tensor> const& kv_caches,
const torch::Tensor& block_mapping) {
int num_layers = kv_caches.size();
if (num_layers == 0) {
return;
}
torch::Device cache_device = kv_caches[0].device();
TORCH_CHECK(cache_device.is_cuda(), "kv_cache must be on CUDA");
std::vector<int64_t> cache_ptrs(num_layers);
for (int layer_idx = 0; layer_idx < num_layers; ++layer_idx) {
cache_ptrs[layer_idx] =
reinterpret_cast<int64_t>(kv_caches[layer_idx].data_ptr());
}
torch::Tensor cache_ptrs_tensor =
torch::from_blob(cache_ptrs.data(), {num_layers}, torch::kInt64)
.to(cache_device);
int num_pairs = block_mapping.size(0);
// We use the stride instead of numel in case the cache is padded for memory
// alignment reasons, we assume the blocks data (inclusive of any padding)
// is contiguous in memory
int mem_footprint_per_block = kv_caches[0].stride(0);
dim3 grid(num_layers, num_pairs);
dim3 block(std::min(1024, mem_footprint_per_block));
const at::cuda::OptionalCUDAGuard device_guard(cache_device);
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
VLLM_DISPATCH_FLOATING_AND_BYTE_TYPES(
kv_caches[0].scalar_type(), "copy_blocks_mla_kernel", ([&] {
vllm::copy_blocks_mla_kernel<scalar_t><<<grid, block, 0, stream>>>(
cache_ptrs_tensor.data_ptr<int64_t>(),
block_mapping.data_ptr<int64_t>(), mem_footprint_per_block);
}));
}
namespace vllm {
template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
@ -254,6 +311,7 @@ __global__ void concat_and_cache_mla_kernel(
// + pe_dim)]
const int64_t* __restrict__ slot_mapping, // [num_tokens]
const int block_stride, //
const int entry_stride, //
const int kv_c_stride, //
const int k_pe_stride, //
const int kv_lora_rank, //
@ -274,9 +332,8 @@ __global__ void concat_and_cache_mla_kernel(
int src_stride, int dst_stride, int size, int offset) {
for (int i = threadIdx.x; i < size; i += blockDim.x) {
const int64_t src_idx = token_idx * src_stride + i;
const int64_t dst_idx = block_idx * block_stride +
block_offset * (kv_lora_rank + pe_dim) + i +
offset;
const int64_t dst_idx =
block_idx * block_stride + block_offset * entry_stride + i + offset;
if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
dst[dst_idx] = src[src_idx];
} else {
@ -391,14 +448,14 @@ void reshape_and_cache_flash(
// KV_T is the stored data type of kv-cache.
// CACHE_T is the data type of key and value tensors.
// KV_DTYPE is the real data type of kv-cache.
#define CALL_CONCAT_AND_CACHE_MLA(KV_T, CACHE_T, KV_DTYPE) \
vllm::concat_and_cache_mla_kernel<KV_T, CACHE_T, KV_DTYPE> \
<<<grid, block, 0, stream>>>( \
reinterpret_cast<KV_T*>(kv_c.data_ptr()), \
reinterpret_cast<KV_T*>(k_pe.data_ptr()), \
reinterpret_cast<CACHE_T*>(kv_cache.data_ptr()), \
slot_mapping.data_ptr<int64_t>(), block_stride, kv_c_stride, \
k_pe_stride, kv_lora_rank, pe_dim, block_size, \
#define CALL_CONCAT_AND_CACHE_MLA(KV_T, CACHE_T, KV_DTYPE) \
vllm::concat_and_cache_mla_kernel<KV_T, CACHE_T, KV_DTYPE> \
<<<grid, block, 0, stream>>>( \
reinterpret_cast<KV_T*>(kv_c.data_ptr()), \
reinterpret_cast<KV_T*>(k_pe.data_ptr()), \
reinterpret_cast<CACHE_T*>(kv_cache.data_ptr()), \
slot_mapping.data_ptr<int64_t>(), block_stride, entry_stride, \
kv_c_stride, k_pe_stride, kv_lora_rank, pe_dim, block_size, \
reinterpret_cast<const float*>(scale.data_ptr()));
void concat_and_cache_mla(
@ -428,6 +485,7 @@ void concat_and_cache_mla(
int kv_c_stride = kv_c.stride(0);
int k_pe_stride = k_pe.stride(0);
int block_stride = kv_cache.stride(0);
int entry_stride = kv_cache.stride(1);
dim3 grid(num_tokens);
dim3 block(std::min(kv_lora_rank, 512));

View File

@ -1,3 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
import enum
from typing import Dict, Union

View File

@ -197,6 +197,72 @@ __global__ void moe_align_block_size_global_mem_kernel(
}
}
// taken from
// https://github.com/sgl-project/sglang/commit/ded9fcd09a43d5e7d5bb31a2bc3e9fc21bf65d2a
template <typename scalar_t>
__global__ void sgl_moe_align_block_size_kernel(
scalar_t* __restrict__ topk_ids, int32_t* sorted_token_ids,
int32_t* expert_ids, int32_t* total_tokens_post_pad, int32_t num_experts,
int32_t block_size, size_t numel, int32_t* cumsum) {
__shared__ int32_t shared_counts[32][8];
__shared__ int32_t local_offsets[256];
const int warp_id = threadIdx.x / 32;
const int lane_id = threadIdx.x % 32;
const int experts_per_warp = 8;
const int my_expert_start = warp_id * experts_per_warp;
for (int i = 0; i < experts_per_warp; ++i) {
if (my_expert_start + i < num_experts) {
shared_counts[warp_id][i] = 0;
}
}
const size_t tokens_per_thread = CEILDIV(numel, blockDim.x);
const size_t start_idx = threadIdx.x * tokens_per_thread;
for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) {
int expert_id = topk_ids[i];
int warp_idx = expert_id / experts_per_warp;
int expert_offset = expert_id % experts_per_warp;
atomicAdd(&shared_counts[warp_idx][expert_offset], 1);
}
__syncthreads();
if (threadIdx.x == 0) {
cumsum[0] = 0;
for (int i = 1; i <= num_experts; ++i) {
int expert_count = 0;
int warp_idx = (i - 1) / experts_per_warp;
int expert_offset = (i - 1) % experts_per_warp;
expert_count = shared_counts[warp_idx][expert_offset];
cumsum[i] =
cumsum[i - 1] + CEILDIV(expert_count, block_size) * block_size;
}
*total_tokens_post_pad = cumsum[num_experts];
}
__syncthreads();
if (threadIdx.x < num_experts) {
for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1];
i += block_size) {
expert_ids[i / block_size] = threadIdx.x;
}
local_offsets[threadIdx.x] = cumsum[threadIdx.x];
}
__syncthreads();
for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) {
int32_t expert_id = topk_ids[i];
int32_t rank_post_pad = atomicAdd(&local_offsets[expert_id], 1);
sorted_token_ids[rank_post_pad] = i;
}
}
template <typename scalar_t, int TOPK>
__global__ void moe_sum_kernel(
scalar_t* __restrict__ out, // [..., d]
@ -305,6 +371,32 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
}
}
void sgl_moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
int64_t block_size,
torch::Tensor sorted_token_ids,
torch::Tensor experts_ids,
torch::Tensor num_tokens_post_pad) {
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
VLLM_DISPATCH_INTEGRAL_TYPES(
topk_ids.scalar_type(), "sgl_moe_align_block_size_kernel", [&] {
// calc needed amount of shared mem for `tokens_cnts` and `cumsum`
// tensors
auto options_int =
torch::TensorOptions().dtype(torch::kInt).device(topk_ids.device());
// torch::Tensor token_cnts_buffer =
// torch::empty({(num_experts + 1) * num_experts}, options_int);
torch::Tensor cumsum_buffer =
torch::empty({num_experts + 1}, options_int);
auto kernel = vllm::moe::sgl_moe_align_block_size_kernel<scalar_t>;
kernel<<<1, 1024, 0, stream>>>(
topk_ids.data_ptr<scalar_t>(), sorted_token_ids.data_ptr<int32_t>(),
experts_ids.data_ptr<int32_t>(),
num_tokens_post_pad.data_ptr<int32_t>(), num_experts, block_size,
topk_ids.numel(), cumsum_buffer.data_ptr<int32_t>());
});
}
void moe_sum(torch::Tensor& input, // [num_tokens, topk, hidden_size]
torch::Tensor& output) // [num_tokens, hidden_size]
{

View File

@ -12,3 +12,9 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
int64_t block_size, torch::Tensor sorted_token_ids,
torch::Tensor experts_ids,
torch::Tensor num_tokens_post_pad);
void sgl_moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
int64_t block_size,
torch::Tensor sorted_token_ids,
torch::Tensor experts_ids,
torch::Tensor num_tokens_post_pad);

View File

@ -22,6 +22,15 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
" Tensor! num_tokens_post_pad) -> ()");
m.impl("moe_align_block_size", torch::kCUDA, &moe_align_block_size);
// temporarily adapted from
// https://github.com/sgl-project/sglang/commit/ded9fcd09a43d5e7d5bb31a2bc3e9fc21bf65d2a
m.def(
"sgl_moe_align_block_size(Tensor topk_ids, int num_experts,"
" int block_size, Tensor! sorted_token_ids,"
" Tensor! experts_ids,"
" Tensor! num_tokens_post_pad) -> ()");
m.impl("sgl_moe_align_block_size", torch::kCUDA, &sgl_moe_align_block_size);
#ifndef USE_ROCM
m.def(
"marlin_gemm_moe(Tensor! a, Tensor! b_q_weights, Tensor! sorted_ids, "

View File

@ -16,29 +16,11 @@ void cutlass_scaled_mm_sm90(torch::Tensor& c, torch::Tensor const& a,
TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
using GroupShape = std::array<int64_t, 2>;
int M = a.size(0), N = b.size(1), K = a.size(1);
GroupShape a_scale_group_shape = [&, &s = a_scales]() -> GroupShape {
if (s.numel() == 1) return {M, K}; // tensor-wise
if (s.dim() == 2)
return {ceil_div(a.size(0), s.size(0)), ceil_div(a.size(1), s.size(1))};
TORCH_CHECK(false, "Unsupported scale shape for scale_a");
}();
GroupShape b_scale_group_shape = [&, &s = b_scales]() -> GroupShape {
if (s.numel() == 1) return {K, N}; // tensor-wise
if (s.dim() == 2)
return {ceil_div(b.size(0), s.size(0)), ceil_div(b.size(1), s.size(1))};
TORCH_CHECK(false, "Unsupported scale shape for scale_b");
}();
if ((a_scale_group_shape == GroupShape{M, K} ||
a_scale_group_shape == GroupShape{1, K}) &&
(b_scale_group_shape == GroupShape{K, N} ||
b_scale_group_shape == GroupShape{K, 1})) {
// "standard per-tensor/per-token/per-channel" scaling
if ((a_scales.numel() == 1 || a_scales.numel() == a.size(0)) &&
(b_scales.numel() == 1 || b_scales.numel() == b.size(1))) {
// Standard per-tensor/per-token/per-channel scaling
TORCH_CHECK(a_scales.is_contiguous() && b_scales.is_contiguous());
if (a.dtype() == torch::kFloat8_e4m3fn) {
vllm::cutlass_scaled_mm_sm90_fp8(c, a, b, a_scales, b_scales, bias);
@ -46,25 +28,32 @@ void cutlass_scaled_mm_sm90(torch::Tensor& c, torch::Tensor const& a,
TORCH_CHECK(a.dtype() == torch::kInt8);
vllm::cutlass_scaled_mm_sm90_int8(c, a, b, a_scales, b_scales, bias);
}
} else if (a_scale_group_shape == GroupShape{1, 128} &&
b_scale_group_shape == GroupShape{128, 128}) {
} else {
using GroupShape = std::array<int64_t, 2>;
auto make_group_shape = [](torch::Tensor const& x,
torch::Tensor const& s) -> GroupShape {
TORCH_CHECK(s.dim() == 2, "cutlass_scaled_mm group scales must be 2D");
return {ceil_div(x.size(0), s.size(0)), ceil_div(x.size(1), s.size(1))};
};
GroupShape a_scale_group_shape = make_group_shape(a, a_scales);
GroupShape b_scale_group_shape = make_group_shape(b, b_scales);
// 1x128 per-token group scales for activations
// 128x128 blockwise scales for weights
TORCH_CHECK(a.dtype() == torch::kFloat8_e4m3fn &&
b.dtype() == torch::kFloat8_e4m3fn,
"Currently only FP8 is supported for A group shape 1x128 and "
"B group shape 128x128");
TORCH_CHECK((a_scale_group_shape == GroupShape{1, 128} &&
b_scale_group_shape == GroupShape{128, 128} &&
a.dtype() == torch::kFloat8_e4m3fn &&
b.dtype() == torch::kFloat8_e4m3fn),
"cutlass_scaled_mm only supports datatype float8_e4m3fn.\n"
"a_scale_group_shape must be [1, 128]. Got: [",
a_scale_group_shape[0], ", ", a_scale_group_shape[1],
"]\n"
"b_scale_group_shape must be [128, 128]. Got: [",
b_scale_group_shape[0], ", ", b_scale_group_shape[1], "]");
TORCH_CHECK(!bias, "Bias not yet supported blockwise scaled_mm");
vllm::cutlass_scaled_mm_blockwise_sm90_fp8(c, a, b, a_scales, b_scales);
} else {
TORCH_CHECK(false,
"Unsupported scale group shapes for CUTLASS 3.x GEMM.\n "
"a_scale_group_shape must be [1, 128], got: [",
a_scale_group_shape[0], ", ", a_scale_group_shape[1],
"]\n"
"b_scale_group_shape must be [128, 128], got: [",
b_scale_group_shape[0], ", ", b_scale_group_shape[1], "]");
}
}

View File

@ -1,3 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
import itertools
import math
import os

View File

@ -450,6 +450,10 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
"Tensor block_mapping) -> ()");
cache_ops.impl("copy_blocks", torch::kCUDA, &copy_blocks);
cache_ops.def(
"copy_blocks_mla(Tensor(a!)[] kv_caches, Tensor block_mapping) -> ()");
cache_ops.impl("copy_blocks_mla", torch::kCUDA, &copy_blocks_mla);
// Reshape the key and value tensors and cache them.
cache_ops.def(
"reshape_and_cache(Tensor key, Tensor value,"

View File

@ -1,3 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
@ -35,7 +37,6 @@ author = 'the vLLM Team'
# ones.
extensions = [
"sphinx.ext.napoleon",
"sphinx.ext.viewcode",
"sphinx.ext.linkcode",
"sphinx.ext.intersphinx",
"sphinx_copybutton",

View File

@ -250,7 +250,11 @@ def get_max_image_tokens(self) -> int:
And thus, we can override the method as:
```python
def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
def get_mm_max_tokens_per_item(
self,
seq_len: int,
mm_counts: Mapping[str, int],
) -> Mapping[str, int]:
return {"image": self.get_max_image_tokens()}
```

View File

@ -2,12 +2,6 @@
# AutoAWQ
:::{warning}
Please note that AWQ support in vLLM is under-optimized at the moment. We would recommend using the unquantized version of the model for better
accuracy and higher throughput. Currently, you can use AWQ as a way to reduce memory footprint. As of now, it is more suitable for low latency
inference with small number of concurrent requests. vLLM's AWQ implementation have lower throughput than unquantized version.
:::
To create a new 4-bit quantized model, you can leverage [AutoAWQ](https://github.com/casper-hansen/AutoAWQ).
Quantizing reduces the model's precision from FP16 to INT4 which effectively reduces the file size by ~70%.
The main benefits are lower latency and memory usage.

View File

@ -131,7 +131,7 @@ sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
llm = LLM(
model="meta-llama/Meta-Llama-3.1-70B-Instruct",
tensor_parallel_size=4,
speculative_model="ibm-fms/llama3-70b-accelerator",
speculative_model="ibm-ai-platform/llama3-70b-accelerator",
speculative_draft_tensor_parallel_size=1,
)
outputs = llm.generate(prompts, sampling_params)
@ -149,11 +149,11 @@ limitation will be fixed in a future release.
A variety of speculative models of this type are available on HF hub:
- [llama-13b-accelerator](https://huggingface.co/ibm-fms/llama-13b-accelerator)
- [llama3-8b-accelerator](https://huggingface.co/ibm-fms/llama3-8b-accelerator)
- [codellama-34b-accelerator](https://huggingface.co/ibm-fms/codellama-34b-accelerator)
- [llama2-70b-accelerator](https://huggingface.co/ibm-fms/llama2-70b-accelerator)
- [llama3-70b-accelerator](https://huggingface.co/ibm-fms/llama3-70b-accelerator)
- [llama-13b-accelerator](https://huggingface.co/ibm-ai-platform/llama-13b-accelerator)
- [llama3-8b-accelerator](https://huggingface.co/ibm-ai-platform/llama3-8b-accelerator)
- [codellama-34b-accelerator](https://huggingface.co/ibm-ai-platform/codellama-34b-accelerator)
- [llama2-70b-accelerator](https://huggingface.co/ibm-ai-platform/llama2-70b-accelerator)
- [llama3-70b-accelerator](https://huggingface.co/ibm-ai-platform/llama3-70b-accelerator)
- [granite-3b-code-instruct-accelerator](https://huggingface.co/ibm-granite/granite-3b-code-instruct-accelerator)
- [granite-8b-code-instruct-accelerator](https://huggingface.co/ibm-granite/granite-8b-code-instruct-accelerator)
- [granite-7b-instruct-accelerator](https://huggingface.co/ibm-granite/granite-7b-instruct-accelerator)

View File

@ -1,3 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
import itertools
import re
from dataclasses import dataclass, field

View File

@ -36,7 +36,7 @@ VLLM_TARGET_DEVICE=xpu python setup.py install
:::{note}
- FP16 is the default data type in the current XPU backend. The BF16 data
type will be supported in the future.
type is supported on Intel Data Center GPU, not supported on Intel Arc GPU yet.
:::
## Set up using Docker

View File

@ -40,6 +40,82 @@ If vLLM successfully returns text (for generative models) or hidden states (for
Otherwise, please refer to [Adding a New Model](#new-model) for instructions on how to implement your model in vLLM.
Alternatively, you can [open an issue on GitHub](https://github.com/vllm-project/vllm/issues/new/choose) to request vLLM support.
### Transformers fallback
After the merge of <gh-pr:11330>, `vllm` can fall back to models that are available in `transformers`. This does not work for all models for now, but most decoder language models are supported, and vision language model support is planned!
To check if the backend is `transformers`, you can simply do this:
```python
from vllm import LLM
llm = LLM(model=..., task="generate") # Name or path of your model
llm.apply_model(lambda model: print(model.__class__))
```
If it is `TransformersModel` then it means it's based on `transformers`!
#### Supported features
##### LoRA and quantization
Neither is supported yet! Make sure to open an issue and we'll work on this together with the `transformers` team!
Usually, `transformers` models load adapter weights via the `load_adapter` API, which depends on PEFT. We need to work a bit to either use this API (for now this would result in some weights not being marked as loaded) or replace modules accordingly.
Hints as to how this would look:
```python
class TransformersModel(nn.Module, SupportsLoRA):
    def __init__(self, *, vllm_config, **kwargs):
        ...
        self.model.load_adapter(
            vllm_config.load_config.model_loader_extra_config["qlora_adapter_name_or_path"])
```
The blocker is that you need to specify the supported LoRA layers, when we would ideally want to load whatever is inside the checkpoint!
##### Remote code
This fallback also means that any model on the Hub that can be used in `transformers` with `trust_remote_code=True` and that correctly implements attention can be used in production!
```python
from vllm import LLM
llm = LLM(model=..., task="generate", trust_remote_code=True) # Name or path of your model
llm.apply_model(lambda model: print(model.__class__))
```
A model just needs the following two things:
```python
from transformers import PreTrainedModel
from torch import nn


class MyAttention(nn.Module):

    def forward(self, hidden_states, **kwargs):  # <- kwargs are required
        ...
        attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            **kwargs,
        )
        ...


class MyModel(PreTrainedModel):
    _supports_attention_backend = True
```
Here is what happens in the background:
1. The config is loaded
2. The `MyModel` Python class is loaded from the `auto_map`, and we check that the model `_supports_attention_backend`.
3. The `TransformersModel` backend is used. See `/model_executors/models/transformers`, which leverages `self.config._attn_implementation = "vllm"`, hence the need to use `ALL_ATTENTION_FUNCTIONS`.
That's it!
### ModelScope
To use models from [ModelScope](https://www.modelscope.cn) instead of HuggingFace Hub, set an environment variable:
@ -650,14 +726,14 @@ See [this page](#generative-models) for more information on how to use generativ
* `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc.
*
* ✅︎
*
* \*
- * `Idefics3ForConditionalGeneration`
* Idefics3
* T + I
* `HuggingFaceM4/Idefics3-8B-Llama3` etc.
* ✅︎
*
*
* ✅︎
- * `InternVLChatModel`
* InternVL 2.5, Mono-InternVL, InternVL 2.0
* T + I<sup>E+</sup>
@ -723,7 +799,7 @@ See [this page](#generative-models) for more information on how to use generativ
* ✅︎
- * `NVLM_D_Model`
* NVLM-D 1.0
* T + I<sup>E+</sup>
* T + I<sup>+</sup>
* `nvidia/NVLM-D-72B`, etc.
*
* ✅︎
@ -770,11 +846,18 @@ See [this page](#generative-models) for more information on how to use generativ
* ✅︎
* ✅︎
* ✅︎
- * `Qwen2_5_VLForConditionalGeneration`
* Qwen2.5-VL
* T + I<sup>E+</sup> + V<sup>E+</sup>
* `Qwen/Qwen2.5-VL-3B-Instruct`, `Qwen/Qwen2.5-VL-72B-Instruct`, etc.
*
* ✅︎
* ✅︎
- * `UltravoxModel`
* Ultravox
* T + A<sup>E+</sup>
* `fixie-ai/ultravox-v0_3`
*
* ✅︎
* ✅︎
* ✅︎
:::
@ -783,7 +866,11 @@ See [this page](#generative-models) for more information on how to use generativ
<sup>+</sup> Multiple items can be inputted per text prompt for this modality.
:::{note}
To use `DeepSeek-VL2` series models, you have to pass `--hf_overrides '{"architectures": ["DeepseekVLV2ForCausalLM"]}'` when running vLLM.
To use DeepSeek-VL2 series models, you have to pass `--hf_overrides '{"architectures": ["DeepseekVLV2ForCausalLM"]}'` when running vLLM.
:::
:::{note}
H2O-VL series models will be available in V1 once we support backends other than FlashAttention.
:::
:::{note}
@ -796,8 +883,11 @@ For more details, please see: <gh-pr:4087#issuecomment-2250397630>
:::
:::{note}
The chat template for Pixtral-HF is incorrect (see [discussion](https://huggingface.co/mistral-community/pixtral-12b/discussions/22)).
A corrected version is available at <gh-file:examples/template_pixtral_hf.jinja>.
`mistral-community/pixtral-12b` does not support V1 yet.
:::
:::{note}
To use Qwen2.5-VL series models, you have to install Huggingface `transformers` library from source via `pip install git+https://github.com/huggingface/transformers`.
:::
### Pooling Models

View File

@ -60,7 +60,8 @@ bash run_cluster.sh \
vllm/vllm-openai \
ip_of_head_node \
--head \
/path/to/the/huggingface/home/in/this/node
/path/to/the/huggingface/home/in/this/node \
-e VLLM_HOST_IP=ip_of_this_node
```
On the rest of the worker nodes, run the following command:
@ -70,10 +71,11 @@ bash run_cluster.sh \
vllm/vllm-openai \
ip_of_head_node \
--worker \
/path/to/the/huggingface/home/in/this/node
/path/to/the/huggingface/home/in/this/node \
-e VLLM_HOST_IP=ip_of_this_node
```
Then you get a ray cluster of containers. Note that you need to keep the shells running these commands alive to hold the cluster. Any shell disconnect will terminate the cluster. In addition, please note that the argument `ip_of_head_node` should be the IP address of the head node, which is accessible by all the worker nodes. A common misunderstanding is to use the IP address of the worker node, which is not correct.
Then you get a ray cluster of containers. Note that you need to keep the shells running these commands alive to hold the cluster. Any shell disconnect will terminate the cluster. In addition, please note that the argument `ip_of_head_node` should be the IP address of the head node, which is accessible by all the worker nodes. The IP addresses of each worker node should be specified in the `VLLM_HOST_IP` environment variable, and should be different for each worker node. Please check the network configuration of your cluster to make sure the nodes can communicate with each other through the specified IP addresses.
Then, on any node, use `docker exec -it node /bin/bash` to enter the container, execute `ray status` to check the status of the Ray cluster. You should see the right number of nodes and GPUs.
@ -103,3 +105,7 @@ Please make sure you downloaded the model to all the nodes (with the same path),
When you use huggingface repo id to refer to the model, you should append your huggingface token to the `run_cluster.sh` script, e.g. `-e HF_TOKEN=`. The recommended way is to download the model first, and then use the path to refer to the model.
:::
:::{warning}
If you keep receiving the error message `Error: No available node types can fulfill resource request` but you have enough GPUs in the cluster, chances are your nodes have multiple IP addresses and vLLM cannot find the right one, especially when you are using multi-node inference. Please make sure vLLM and ray use the same IP address. You can set the `VLLM_HOST_IP` environment variable to the right IP address in the `run_cluster.sh` script (different for each node!), and check `ray status` to see the IP address used by Ray. See <gh-issue:7815> for more information.
:::
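On a node with several network interfaces, one way to find a sensible value for `VLLM_HOST_IP` is to ask the kernel which local address it would use to reach the head node. This is only a hedged helper sketch; the head-node address below is a placeholder:

```python
import socket


def routable_ip(head_node_ip: str, port: int = 6379) -> str:
    # Connecting a UDP socket sends no packets; it only makes the kernel
    # pick the local interface that routes to the head node.
    with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
        s.connect((head_node_ip, port))
        return s.getsockname()[0]


# Pass the printed address via `-e VLLM_HOST_IP=...` in run_cluster.sh.
print(routable_ip("192.0.2.10"))
```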

View File

@@ -1,3 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
from vllm import LLM, SamplingParams
from vllm.utils import FlexibleArgumentParser

View File

@@ -1,3 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
from vllm import LLM, SamplingParams
# Sample prompts.

View File

@@ -1,3 +1,4 @@
# SPDX-License-Identifier: Apache-2.0
"""
This example shows how to use vLLM for running offline inference
with the correct prompt format on audio language models.

View File

@@ -1,3 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
from vllm import LLM, SamplingParams
# Sample prompts.

View File

@@ -1,3 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
from vllm import LLM
# Sample prompts.

View File

@@ -1,3 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
from vllm import LLM, SamplingParams
llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct")

View File

@@ -1,3 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# ruff: noqa
import json
import random

View File

@@ -1,3 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
from vllm import LLM
# Sample prompts.

View File

@@ -1,3 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
from dataclasses import asdict
from vllm import LLM, SamplingParams

View File

@@ -1,3 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
from vllm import LLM, SamplingParams
# Sample prompts.

View File

@@ -1,3 +1,4 @@
# SPDX-License-Identifier: Apache-2.0
"""
This example shows how to use Ray Data for running offline batch inference
distributively on a multi-nodes cluster.

View File

@@ -1,3 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
from vllm import LLM
# Sample prompts.

View File

@@ -1,3 +1,4 @@
# SPDX-License-Identifier: Apache-2.0
'''
Demonstrate prompting of text-to-text
encoder/decoder models, specifically BART

View File

@@ -1,3 +1,4 @@
# SPDX-License-Identifier: Apache-2.0
'''
Demonstrate prompting of text-to-text
encoder/decoder models, specifically Florence-2

View File

@@ -1,3 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
from huggingface_hub import hf_hub_download
from vllm import LLM, SamplingParams

View File

@@ -1,3 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
import argparse
from typing import List, Tuple

View File

@@ -1,3 +1,4 @@
# SPDX-License-Identifier: Apache-2.0
"""
This example shows how to use LoRA with different quantization techniques
for offline inference.

View File

@@ -1,3 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
import gc
import time
from typing import List
@@ -49,7 +51,7 @@ if __name__ == "__main__":
    # Create an LLM with spec decoding
    llm = LLM(
        model="meta-llama/Llama-2-13b-chat-hf",
        speculative_model="ibm-fms/llama-13b-accelerator",
        speculative_model="ibm-ai-platform/llama-13b-accelerator",
    )

    print("With speculation")

View File

@@ -1,3 +1,4 @@
# SPDX-License-Identifier: Apache-2.0
"""
This example shows how to use the multi-LoRA functionality
for offline inference.

View File

@@ -1,3 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
from vllm import LLM, SamplingParams
# Sample prompts.

View File

@@ -1,3 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
import os
from vllm import LLM, SamplingParams

View File

@@ -1,3 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# ruff: noqa
import argparse

View File

@@ -1,3 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
from vllm import LLM, SamplingParams
from vllm.distributed import cleanup_dist_env_and_memory

View File

@@ -1,3 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
import inspect
import json
import os

View File

@@ -1,3 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
import argparse
import dataclasses
import os

View File

@@ -0,0 +1,121 @@
# SPDX-License-Identifier: Apache-2.0
"""
a simple demonstration to show how to control
the placement of the vLLM workers with Ray.
The key is to set VLLM_RAY_PER_WORKER_GPUS and
VLLM_RAY_BUNDLE_INDICES properly.
"""
import os
import ray
from ray.util.placement_group import placement_group
from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
from vllm import LLM
from vllm.worker.worker import Worker
class MyWorker(Worker):

    def report_device_id(self) -> str:
        from vllm.platforms import current_platform
        return current_platform.get_device_uuid(self.device.index)


class MyLLM(LLM):

    def __init__(self, *args, bundle_indices: list, **kwargs):
        # a hack to make the script work.
        # stop ray from manipulating CUDA_VISIBLE_DEVICES
        # at the top-level
        del os.environ["CUDA_VISIBLE_DEVICES"]
        # every worker will use 0.4 GPU, so that we can schedule
        # 2 instances on the same GPUs.
        os.environ["VLLM_RAY_PER_WORKER_GPUS"] = "0.4"
        os.environ["VLLM_RAY_BUNDLE_INDICES"] = ",".join(
            map(str, bundle_indices))
        print(f"creating LLM with bundle_indices={bundle_indices}")
        super().__init__(*args, **kwargs)


class RayTrainingActor:

    def report_device_id(self) -> str:
        # the argument for get_device_uuid is the index
        # of the GPU in the visible devices.
        # ray will set CUDA_VISIBLE_DEVICES to the assigned GPUs
        from vllm.platforms import current_platform
        return current_platform.get_device_uuid(0)


# ray manages 4 GPUs
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
ray.init()

# we want to co-locate vLLM instance and the training actor
# on the same set of GPUs.
# the placement plan is as follows:
# GPU 0 and 1: training actor 0, 1, and vLLM instance 0 (with TP=2)
# GPU 2 and 3: training actor 2, 3, and vLLM instance 1 (with TP=2)
pg = placement_group([{"GPU": 1, "CPU": 0}] * 4)
ray.get(pg.ready())
print(f"placement group has bundles {pg.bundle_specs=}")

training_actors = []
training_actor_device_ids = []
inference_engines = []
inference_engine_device_ids = []

for bundle_index in [0, 1, 2, 3]:
    training_actor = ray.remote(
        num_cpus=0,
        num_gpus=0.4,
        scheduling_strategy=PlacementGroupSchedulingStrategy(
            placement_group=pg,
            placement_group_capture_child_tasks=True,
            placement_group_bundle_index=bundle_index,
        ),
    )(RayTrainingActor).remote()
    training_actors.append(training_actor)
    device_id = ray.get(training_actor.report_device_id.remote())
    print(f"training actor {bundle_index} is on {device_id}")
    training_actor_device_ids.append(device_id)

for (i, bundle_indices) in enumerate([[0, 1], [2, 3]]):
    # IMPORTANT: when creating vLLM instances, we need to
    # make sure there are no GPU activities on the target GPUs,
    # otherwise, they will interfere with the vLLM memory profiling,
    # and cause unexpected behaviors.
    llm = ray.remote(
        num_cpus=0,
        num_gpus=0,
        scheduling_strategy=PlacementGroupSchedulingStrategy(
            placement_group=pg,
            placement_group_capture_child_tasks=True,
        ),
    )(MyLLM).remote(
        model="facebook/opt-125m",
        enforce_eager=True,
        worker_cls=MyWorker,
        tensor_parallel_size=2,
        distributed_executor_backend="ray",
        gpu_memory_utilization=0.4,
        bundle_indices=bundle_indices,
    )
    inference_engines.append(llm)
    # don't call any method on the inference engine here,
    # otherwise it will block until the vLLM instance is created.

for i, llm in enumerate(inference_engines):
    inference_engine_device_ids.append(
        ray.get(llm.collective_rpc.remote("report_device_id", args=tuple())))
    print(f"inference engine {i} is on {inference_engine_device_ids[-1]}")

# check the placement
# the first two training actors should be
# on the same GPUs as the first inference engine
assert training_actor_device_ids[:2] == inference_engine_device_ids[0]
# the last two training actors should be
# on the same GPUs as the second inference engine
assert training_actor_device_ids[2:] == inference_engine_device_ids[1]

View File

@@ -1,3 +1,4 @@
# SPDX-License-Identifier: Apache-2.0
"""
a simple demonstration of RLHF with vLLM, inspired by
the OpenRLHF framework https://github.com/OpenRLHF/OpenRLHF .

View File

@@ -1,3 +1,4 @@
# SPDX-License-Identifier: Apache-2.0
"""
Saves each worker's model state dict directly to a checkpoint, which enables a
fast load path for large tensor-parallel models where each worker only needs to

View File

@@ -1,3 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
from vllm import LLM
# Sample prompts.

View File

@@ -1,3 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
import os
import time

View File

@@ -1,3 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
from enum import Enum
from pydantic import BaseModel

View File

@@ -1,3 +1,4 @@
# SPDX-License-Identifier: Apache-2.0
"""
experimental support for tensor-parallel inference with torchrun,
see https://github.com/vllm-project/vllm/issues/11400 for

Some files were not shown because too many files have changed in this diff.