Compare commits: v0.9.1rc1...mergify/ho
1 commit

| Author | SHA1 | Date |
|---|---|---|
| | ca15f0afe6 | |

.github/mergify.yml (vendored): 357 changes
@@ -1,196 +1,181 @@

```yaml
pull_request_rules:
- name: label-documentation
  description: Automatically apply documentation label
  conditions:
    - or:
      - files~=^[^/]+\.md$
      - files~=^docs/
      - files~=^examples/
  actions:
    label:
      add:
        - documentation

- name: label-ci-build
  description: Automatically apply ci/build label
  conditions:
    - or:
      - files~=^\.github/
      - files~=\.buildkite/
      - files~=^cmake/
      - files=CMakeLists.txt
      - files~=^docker/Dockerfile
      - files~=^requirements.*\.txt
      - files=setup.py
  actions:
    label:
      add:
        - ci/build

- name: label-frontend
  description: Automatically apply frontend label
  conditions:
    - files~=^vllm/entrypoints/
  actions:
    label:
      add:
        - frontend

- name: label-llama
  description: Automatically apply llama label
  conditions:
    - or:
      - files~=^examples/.*llama.*\.py
      - files~=^tests/.*llama.*\.py
      - files~=^vllm/entrypoints/openai/tool_parsers/llama.*\.py
      - files~=^vllm/model_executor/models/.*llama.*\.py
      - files~=^vllm/transformers_utils/configs/.*llama.*\.py
  actions:
    label:
      add:
        - llama

- name: label-multi-modality
  description: Automatically apply multi-modality label
  conditions:
    - or:
      - files~=^vllm/multimodal/
      - files~=^tests/multimodal/
      - files~=^tests/models/multimodal/
      - files~=^tests/models/*/audio_language/
      - files~=^tests/models/*/vision_language/
      - files=tests/models/test_vision.py
  actions:
    label:
      add:
        - multi-modality

- name: label-structured-output
  description: Automatically apply structured-output label
  conditions:
    - or:
      - files~=^benchmarks/structured_schemas/
      - files=benchmarks/benchmark_serving_structured_output.py
      - files=benchmarks/run_structured_output_benchmark.sh
      - files=docs/features/structured_outputs.md
      - files=examples/offline_inference/structured_outputs.py
      - files=examples/online_serving/openai_chat_completion_structured_outputs.py
      - files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
      - files~=^vllm/model_executor/guided_decoding/
      - files=tests/model_executor/test_guided_processors.py
      - files=tests/entrypoints/llm/test_guided_generate.py
      - files~=^tests/v1/structured_output/
      - files=tests/v1/entrypoints/llm/test_guided_generate.py
      - files~=^vllm/v1/structured_output/
  actions:
    label:
      add:
        - structured-output

- name: label-speculative-decoding
  description: Automatically apply speculative-decoding label
  conditions:
    - or:
      - files~=^vllm/spec_decode/
      - files=vllm/model_executor/layers/spec_decode_base_sampler.py
      - files~=^tests/spec_decode/
  actions:
    label:
      add:
        - speculative-decoding

- name: label-v1
  description: Automatically apply v1 label
  conditions:
    - or:
      - files~=^vllm/v1/
      - files~=^tests/v1/
  actions:
    label:
      add:
        - v1

- name: label-tpu
  description: Automatically apply tpu label
  # Keep this list in sync with `label-tpu-remove` conditions
  conditions:
    - or:
      - files~=tpu.py
      - files~=_tpu
      - files~=tpu_
      - files~=/tpu/
      - files~=pallas
  actions:
    label:
      add:
        - tpu

- name: label-tpu-remove
  description: Automatically remove tpu label
  # Keep this list in sync with `label-tpu` conditions
  conditions:
    - and:
      - -files~=tpu.py
      - -files~=_tpu
      - -files~=tpu_
      - -files~=/tpu/
      - -files~=pallas
  actions:
    label:
      remove:
        - tpu

- name: label-tool-calling
  description: Automatically add tool-calling label
  conditions:
    - or:
      - files~=^tests/tool_use/
      - files~=^tests/mistral_tool_use/
      - files~=^tests/entrypoints/openai/tool_parsers/
      - files=tests/entrypoints/openai/test_chat_with_tool_reasoning.py
      - files~=^vllm/entrypoints/openai/tool_parsers/
      - files=docs/features/tool_calling.md
      - files~=^examples/tool_chat_*
      - files=examples/offline_inference/chat_with_tools.py
      - files=examples/online_serving/openai_chat_completion_client_with_tools_required.py
      - files=examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py
      - files=examples/online_serving/openai_chat_completion_client_with_tools.py
  actions:
    label:
      add:
        - tool-calling

- name: ping author on conflicts and add 'needs-rebase' label
  conditions:
    - conflict
    - -closed
  actions:
    label:
      add:
        - needs-rebase
    comment:
      message: |
        This pull request has merge conflicts that must be resolved before it can be
        merged. Please rebase the PR, @{{author}}.

        https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/syncing-a-fork

- name: assign reviewer for tensorizer changes
  conditions:
    - files~=^vllm/model_executor/model_loader/tensorizer.py
    - files~=^vllm/model_executor/model_loader/tensorizer_loader.py
    - files~=^tests/entrypoints/openai/test_tensorizer_entrypoint.py
    - files~=^tests/tensorizer_loader/
  actions:
    assign:
      users:
        - "sangstar"

- name: remove 'needs-rebase' label when conflict is resolved
  conditions:
    - -conflict
    - -closed
  actions:
    label:
      remove:
        - needs-rebase
```
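A note on the rule syntax above: `files~=` is a regular-expression match against each file path the pull request touches, while `files=` is an exact path match. A quick way to sanity-check a pattern locally is plain Python `re` (a sketch; Mergify's own matching engine may differ in details):

```python
import re

# Patterns copied from the label-documentation rule above.
DOC_PATTERNS = [r"^[^/]+\.md$", r"^docs/", r"^examples/"]

def matches_documentation_rule(changed_files: list[str]) -> bool:
    # The rule's `or:` block fires if any changed file matches any pattern.
    return any(
        re.search(pattern, path)
        for pattern in DOC_PATTERNS
        for path in changed_files
    )

print(matches_documentation_rule(["README.md"]))       # True
print(matches_documentation_rule(["docs/index.md"]))   # True
print(matches_documentation_rule(["vllm/config.py"]))  # False
```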
```diff
@@ -308,7 +308,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # Keep building Marlin for 9.0 as there are some group sizes and shapes that
   # are not supported by Machete yet.
   # 9.0 for latest bf16 atomicAdd PTX
-  cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.7;9.0+PTX" "${CUDA_ARCHS}")
+  cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;9.0+PTX" "${CUDA_ARCHS}")
   if (MARLIN_ARCHS)

   #
@@ -454,7 +454,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     # kernels for the remaining archs that are not already built for 3x.
     # (Build 8.9 for FP8)
     cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
-      "7.5;8.0;8.7;8.9+PTX" "${CUDA_ARCHS}")
+      "7.5;8.0;8.9+PTX" "${CUDA_ARCHS}")
     # subtract out the archs that are already built for 3x
     list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
     if (SCALED_MM_2X_ARCHS)
@@ -684,7 +684,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")

   list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}")
   # 9.0 for latest bf16 atomicAdd PTX
-  cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.7;9.0+PTX" "${CUDA_ARCHS}")
+  cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;9.0+PTX" "${CUDA_ARCHS}")
   if (MARLIN_MOE_ARCHS)

   #
```
```diff
@@ -144,7 +144,7 @@ As a result, we will have the following components when the KV cache manager is

 **Running request:** Workflow for the scheduler to schedule a running request with KV cache block allocation:

-1. The scheduler calls `kv_cache_manager.allocate_slots()`. It does the following steps:
+1. The scheduler calls `kv_cache_manager.append_slots()`. It does the following steps:
     1. Compute the number of new required blocks, and return if there are no sufficient blocks to allocate.
     2. Allocate new blocks by popping the heads of the free queue. If the head block is a cached block, this also “evicts” the block so that no other requests can reuse it anymore from now on.
     3. Append token IDs to the slots in existing blocks as well as the new blocks. If a block is full, we add it to the Cache Block to cache it.
```
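The three numbered steps in that doc passage read directly as code. A minimal sketch of the flow (hypothetical names and structure, not vLLM's actual `KVCacheManager`):

```python
from collections import deque
from dataclasses import dataclass, field

@dataclass
class Request:
    num_tokens: int = 0
    blocks: list = field(default_factory=list)

class ToyKVCacheManager:
    def __init__(self, num_blocks: int, block_size: int):
        self.block_size = block_size
        self.free_queue = deque(range(num_blocks))  # free block IDs, LRU at head
        self.cached_blocks = {}                     # block_id -> prefix hash

    def allocate_slots(self, req: Request, num_new_tokens: int):
        # Step 1: compute the number of new required blocks; return None
        # (i.e. schedule nothing) if there are not enough free blocks.
        total = req.num_tokens + num_new_tokens
        needed = -(-total // self.block_size) - len(req.blocks)  # ceil division
        if needed > len(self.free_queue):
            return None
        # Step 2: pop block IDs off the head of the free queue; popping a
        # still-cached block "evicts" it so no other request can reuse it.
        new_blocks = [self.free_queue.popleft() for _ in range(needed)]
        for block_id in new_blocks:
            self.cached_blocks.pop(block_id, None)
        # Step 3: append token IDs into existing and new blocks (full
        # blocks would then be hashed and cached for prefix reuse).
        req.blocks.extend(new_blocks)
        req.num_tokens = total
        return new_blocks

mgr = ToyKVCacheManager(num_blocks=8, block_size=16)
req = Request()
print(mgr.allocate_slots(req, 40))  # needs ceil(40/16) = 3 blocks -> [0, 1, 2]
```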
```diff
@@ -31,7 +31,6 @@ refer to the [PyTorch Security
 Guide](https://github.com/pytorch/pytorch/security/policy#using-distributed-features).

 Key points from the PyTorch security guide:

 - PyTorch Distributed features are intended for internal communication only
 - They are not built for use in untrusted environments or networks
 - No authorization protocol is included for performance reasons
```
```diff
@@ -37,7 +37,7 @@ pyyaml
 six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
 setuptools>=77.0.3,<80; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
 einops # Required for Qwen2-VL.
-compressed-tensors == 0.10.1 # required for compressed-tensors
+compressed-tensors == 0.10.0 # required for compressed-tensors
 depyf==0.18.0 # required for profiling and debugging with compilation config
 cloudpickle # allows pickling lambda functions in model_executor/models/registry.py
 watchfiles # required for http server to monitor the updates of TLS files
```
```diff
@@ -8,7 +8,6 @@ import uvicorn
 from fastapi.responses import JSONResponse, Response

 import vllm.entrypoints.api_server
-import vllm.envs as envs
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.utils import FlexibleArgumentParser
@@ -47,8 +46,9 @@ if __name__ == "__main__":
     engine_args = AsyncEngineArgs.from_cli_args(args)
     engine = AsyncLLMEngineWithStats.from_engine_args(engine_args)
     vllm.entrypoints.api_server.engine = engine
-    uvicorn.run(app,
-                host=args.host,
-                port=args.port,
-                log_level="debug",
-                timeout_keep_alive=envs.VLLM_HTTP_TIMEOUT_KEEP_ALIVE)
+    uvicorn.run(
+        app,
+        host=args.host,
+        port=args.port,
+        log_level="debug",
+        timeout_keep_alive=vllm.entrypoints.api_server.TIMEOUT_KEEP_ALIVE)
```
```diff
@@ -25,12 +25,6 @@ TOKEN_IDS = [
 ]


-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines):
-    """We can run both engines for this test."""
-    pass
-
-
 @pytest.fixture(scope="module")
 def llm():
     # pytest caches the fixture so we use weakref.proxy to
@@ -110,19 +104,3 @@ def test_multiple_sampling_params(llm: LLM):
     # sampling_params is None, default params should be applied
     outputs = llm.generate(PROMPTS, sampling_params=None)
     assert len(PROMPTS) == len(outputs)
-
-
-def test_max_model_len():
-    max_model_len = 20
-    llm = LLM(
-        model=MODEL_NAME,
-        max_model_len=max_model_len,
-        gpu_memory_utilization=0.10,
-        enforce_eager=True,  # reduce test time
-    )
-    sampling_params = SamplingParams(max_tokens=max_model_len + 10)
-    outputs = llm.generate(PROMPTS, sampling_params)
-    for output in outputs:
-        num_total_tokens = len(output.prompt_token_ids) + len(
-            output.outputs[0].token_ids)
-        assert num_total_tokens == max_model_len
```
```diff
@@ -183,34 +183,6 @@ def test_env(
             assert backend.get_name() == expected


-@pytest.mark.parametrize("device", ["cpu", "cuda"])
-@pytest.mark.parametrize("use_v1", [True, False])
-def test_fp32_fallback(
-    device: str,
-    use_v1: bool,
-    monkeypatch: pytest.MonkeyPatch,
-):
-    """Test attention backend selection with fp32."""
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
-
-        if device == "cpu":
-            with patch("vllm.attention.selector.current_platform",
-                       CpuPlatform()):
-                backend = get_attn_backend(16, torch.float32, torch.float32,
-                                           16, False)
-            assert (backend.get_name() == "TORCH_SDPA_VLLM_V1"
-                    if use_v1 else "TORCH_SDPA")
-
-        elif device == "cuda":
-            with patch("vllm.attention.selector.current_platform",
-                       CudaPlatform()):
-                backend = get_attn_backend(16, torch.float32, torch.float32,
-                                           16, False)
-            assert (backend.get_name() == "FLEX_ATTENTION"
-                    if use_v1 else "XFORMERS")
-
-
 def test_flash_attn(monkeypatch: pytest.MonkeyPatch):
     """Test FlashAttn validation."""
     # TODO: When testing for v1, pipe in `use_v1` as an argument to
```
```diff
@@ -274,7 +274,7 @@ def pplx_prepare_finalize(pgi: ProcessGroupInfo, dp_size: int, a: torch.Tensor,
     chunk_topk_weight = chunk_by_rank(topk_weight, rank, world_size).to(device)
     chunk_topk_ids = chunk_by_rank(topk_ids, rank, world_size).to(device)

-    b_a, b_a_scale, expert_num_tokens, _, _ = prepare_finalize.prepare(
+    b_a, b_a_scale, expert_num_tokens = prepare_finalize.prepare(
         a_chunk,
         None,
         None,
```
```diff
@@ -164,6 +164,11 @@ def mixtral_lora_files():
     return snapshot_download(repo_id="SangBinCho/mixtral-lora")


+@pytest.fixture(scope="session")
+def gemma_lora_files():
+    return snapshot_download(repo_id="wskwon/gemma-7b-test-lora")
+
+
 @pytest.fixture(scope="session")
 def chatglm3_lora_files():
     return snapshot_download(repo_id="jeeejeee/chatglm3-text2sql-spider")
```
```diff
@@ -4,6 +4,9 @@ import subprocess
 import sys
 from typing import Union

+import pytest
+import ray
+
 import vllm
 from vllm import LLM
 from vllm.lora.request import LoRARequest
@@ -118,6 +121,37 @@ def test_llama_lora(sql_lora_files):
     generate_and_test(llm, sql_lora_files)


+# Skipping for v1 as v1 doesn't have a good way to expose the num_gpu_blocks
+# used by the engine yet.
+@pytest.mark.skip_v1
+@create_new_process_for_each_test()
+def test_llama_lora_warmup(sql_lora_files):
+    """Test that the LLM initialization works with a warmup LORA path and
+    is more conservative"""
+
+    @ray.remote(num_gpus=1)
+    def get_num_gpu_blocks_lora():
+        llm = vllm.LLM(MODEL_PATH, enable_lora=True, max_num_seqs=16)
+        num_gpu_blocks_lora_warmup = llm.llm_engine.cache_config.num_gpu_blocks
+        return num_gpu_blocks_lora_warmup
+
+    @ray.remote(num_gpus=1)
+    def get_num_gpu_blocks_no_lora():
+        llm = vllm.LLM(MODEL_PATH, max_num_seqs=16)
+        num_gpu_blocks_no_lora_warmup = (
+            llm.llm_engine.cache_config.num_gpu_blocks)
+        return num_gpu_blocks_no_lora_warmup
+
+    num_gpu_blocks_lora_warmup = ray.get(get_num_gpu_blocks_lora.remote())
+    num_gpu_blocks_no_lora_warmup = ray.get(
+        get_num_gpu_blocks_no_lora.remote())
+    assert num_gpu_blocks_lora_warmup < num_gpu_blocks_no_lora_warmup, (
+        "The warmup with lora should be more "
+        "conservative than without lora, therefore the number of "
+        "memory blocks for the KV cache should be "
+        "less when using lora than when not using lora")
+
+
 @multi_gpu_test(num_gpus=4)
 @create_new_process_for_each_test()
 def test_llama_lora_tp4(sql_lora_files):
```
```diff
@@ -15,6 +15,13 @@ MODEL_PATH = "meta-llama/Llama-2-7b-hf"
 LORA_MODULE_PATH = "yard1/llama-2-7b-sql-lora-test"
 LORA_RANK = 8

+# @pytest.fixture(autouse=True)
+# def v1(run_with_both_engines_lora):
+#     # Simple autouse wrapper to run both engines for each test
+#     # This can be promoted up to conftest.py to run for every
+#     # test in a package
+#     pass
+

 def make_lora_request(lora_id: int):
     return LoRARequest(lora_name=f"{lora_id}",
```
```diff
@@ -11,6 +11,14 @@ MODEL_PATH = "microsoft/phi-2"
 PROMPT_TEMPLATE = "### Instruct: {sql_prompt}\n\n### Context: {context}\n\n### Output:"  # noqa: E501


+@pytest.fixture(autouse=True)
+def v1(run_with_both_engines_lora):
+    # Simple autouse wrapper to run both engines for each test
+    # This can be promoted up to conftest.py to run for every
+    # test in a package
+    pass
+
+
 def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
     prompts = [
         PROMPT_TEMPLATE.format(
@@ -51,7 +59,7 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:

 # Skipping for V1 for now as we are hitting,
 # "Head size 80 is not supported by FlashAttention." error.
-@pytest.mark.skip(reason="Head size 80 is not supported by FlashAttention")
+@pytest.mark.skip_v1
 def test_phi2_lora(phi2_lora_files):
     # We enable enforce_eager=True here to reduce VRAM usage for lora-test CI,
     # Otherwise, the lora-test will fail due to CUDA OOM.
```
```diff
@@ -16,8 +16,6 @@ from vllm.lora.request import LoRARequest
 from vllm.v1.worker.gpu_worker import Worker as V1Worker
 from vllm.worker.worker import Worker

-NUM_LORAS = 16
-

 @patch.dict(os.environ, {"RANK": "0"})
 def test_worker_apply_lora(sql_lora_files):
@@ -60,12 +58,12 @@ def test_worker_apply_lora(sql_lora_files):
         device_config=DeviceConfig("cuda"),
         cache_config=CacheConfig(
             block_size=16,
             gpu_memory_utilization=1.0,
             swap_space=0,
             cache_dtype="auto",
         ),
-        lora_config=LoRAConfig(max_lora_rank=8,
-                               max_cpu_loras=NUM_LORAS,
-                               max_loras=NUM_LORAS),
+        lora_config=LoRAConfig(max_lora_rank=8, max_cpu_loras=32,
+                               max_loras=32),
     )
     worker = worker_cls(
         vllm_config=vllm_config,
@@ -80,9 +78,9 @@ def test_worker_apply_lora(sql_lora_files):
     set_active_loras(worker, [])
     assert worker.list_loras() == set()

+    n_loras = 32
     lora_requests = [
-        LoRARequest(str(i + 1), i + 1, sql_lora_files)
-        for i in range(NUM_LORAS)
+        LoRARequest(str(i + 1), i + 1, sql_lora_files) for i in range(n_loras)
     ]

     set_active_loras(worker, lora_requests)
@@ -91,12 +89,12 @@ def test_worker_apply_lora(sql_lora_files):
         for lora_request in lora_requests
     }

-    for i in range(NUM_LORAS):
+    for i in range(32):
         random.seed(i)
         iter_lora_requests = random.choices(lora_requests,
-                                            k=random.randint(1, NUM_LORAS))
+                                            k=random.randint(1, n_loras))
         random.shuffle(iter_lora_requests)
-        iter_lora_requests = iter_lora_requests[:-random.randint(0, NUM_LORAS)]
+        iter_lora_requests = iter_lora_requests[:-random.randint(0, n_loras)]
         set_active_loras(worker, lora_requests)
         assert worker.list_loras().issuperset(
             {lora_request.lora_int_id
```
```diff
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import unittest.mock as mock

 import pytest
@@ -16,8 +17,24 @@ from vllm.v1.worker.tpu_model_runner import (
     TPUModelRunner, _get_padded_num_reqs_with_upper_limit,
     _get_padded_token_len, _get_req_paddings, _get_token_paddings)

+# Mock torch_xla module since it may not be available in the test environments
+torch_xla_patcher = mock.patch.dict(
+    "sys.modules", {
+        "torch_xla": mock.MagicMock(),
+        "torch_xla.core.xla_model": mock.MagicMock(),
+        "torch_xla.runtime": mock.MagicMock(),
+    })
+torch_xla_patcher.start()

-def get_vllm_config():
+# Mock the PallasAttentionBackend
+pallas_attention_backend_patcher = mock.patch(
+    "vllm.v1.worker.tpu_model_runner.PallasAttentionBackend", )
+pallas_attention_backend_patcher.start()
+
+
+@pytest.fixture
+def model_runner():
+    # Patchers have already been started at module level.
     scheduler_config = SchedulerConfig(
         max_num_seqs=10,
         max_num_batched_tokens=512,
@@ -43,19 +60,18 @@ def get_vllm_config():
         cache_config=cache_config,
         scheduler_config=scheduler_config,
     )
-    return vllm_config
-
-
-def get_model_runner(vllm_config):
-    device = "xla:0"  # Mocking TPU device
-    with mock.patch("vllm.v1.worker.tpu_model_runner.torch"), \
-            mock.patch("vllm.v1.worker.tpu_model_runner.xm"), \
-            mock.patch("vllm.v1.worker.tpu_model_runner.xr"):
-        return TPUModelRunner(vllm_config, device)
-
-
-@pytest.fixture
-def model_runner():
-    vllm_config = get_vllm_config()
-    return get_model_runner(vllm_config)
+    device = "xla:0"  # Mocking TPU device
+    return TPUModelRunner(vllm_config, device)
+
+
+@pytest.fixture(autouse=True, scope="session")
+def cleanup_patches():
+    yield
+    torch_xla_patcher.stop()
+    pallas_attention_backend_patcher.stop()


 def _schedule_new_request(*req_ids: str) -> SchedulerOutput:
@@ -354,14 +370,12 @@ def test_get_req_paddings():
     assert _get_req_paddings(8, 36) == [8, 16, 32, 36]


-def test_init_kv_cache_with_kv_sharing_invalid_target_layer_order(
-        model_runner):
+@pytest.mark.skip(reason="Test is broken on TPU when it's added.")
+def test_init_kv_cache_with_kv_sharing_invalid_target_layer_order():
     layer_0 = "model.layers.0.self_attn.attn"
     layer_1 = "model.layers.1.self_attn.attn"
     error_msg = f"{layer_1} must come before the current layer"
-    vllm_config = model_runner.vllm_config
-    with pytest.raises(ValueError, match=error_msg), \
-            set_current_vllm_config(vllm_config):
+    with pytest.raises(ValueError, match=error_msg):
         fwd_context = {
             # initialization below will fail because target layer is invalid;
             # the target layer needs to come before layer 1
@@ -385,14 +399,13 @@ def test_init_kv_cache_with_kv_sharing_invalid_target_layer_order(
     assert fwd_context is not None


-def test_init_kv_cache_with_kv_sharing_target_layer_not_exist(model_runner):
+@pytest.mark.skip(reason="Test is broken on TPU when it's added.")
+def test_init_kv_cache_with_kv_sharing_target_layer_not_exist():
     layer_0 = "model.layers.0.self_attn.attn"
     layer_1 = "model.layers.1.self_attn.attn"
     invalid_layer = "model.layers.0.cross_attn.attn"
     error_msg = f"{invalid_layer} is not a valid Attention layer in the model"
-    vllm_config = model_runner.vllm_config
-    with pytest.raises(ValueError, match=error_msg), \
-            set_current_vllm_config(vllm_config):
+    with pytest.raises(ValueError, match=error_msg):
         fwd_context = {
             layer_0:
             Attention(
@@ -415,13 +428,12 @@ def test_init_kv_cache_with_kv_sharing_target_layer_not_exist(model_runner):
     assert fwd_context is not None


-def test_init_kv_cache_with_kv_sharing_target_same_as_current(model_runner):
+@pytest.mark.skip(reason="Test is broken on TPU when it's added.")
+def test_init_kv_cache_with_kv_sharing_target_same_as_current():
     layer_0 = "model.layers.0.self_attn.attn"
     layer_1 = "model.layers.1.self_attn.attn"
     error_msg = f"{layer_1} cannot be the same as the current layer"
-    vllm_config = model_runner.vllm_config
-    with pytest.raises(ValueError, match=error_msg), \
-            set_current_vllm_config(vllm_config):
+    with pytest.raises(ValueError, match=error_msg):
         fwd_context = {
             # initialization below will fail because target layer is invalid;
             # the target layer needs to come before layer 1
@@ -445,10 +457,11 @@ def test_init_kv_cache_with_kv_sharing_target_same_as_current(model_runner):
     assert fwd_context is not None


-def test_init_kv_cache_without_kv_sharing():
+@pytest.mark.skip(reason="Test is broken on TPU when it's added.")
+def test_init_kv_cache_without_kv_sharing(model_runner):
     layer_0 = "model.layers.0.self_attn.attn"
     layer_1 = "model.layers.1.self_attn.attn"
-    vllm_config = get_vllm_config()
+    vllm_config = model_runner.vllm_config
     with set_current_vllm_config(vllm_config):
         fwd_context = {
             layer_0:
@@ -469,38 +482,33 @@ def test_init_kv_cache_without_kv_sharing():
         # suppress var not used error
         assert fwd_context is not None
     # Set high context length to test max context length estimation
-    vllm_config.model_config.max_model_len = 1_000_000
+    vllm_config.model_config.max_model_len = 3_000_000
     vllm_ctx = vllm_config.compilation_config.static_forward_context
-    model_runner = get_model_runner(vllm_config)
     kv_cache_spec = model_runner.get_kv_cache_spec()
     assert len(kv_cache_spec) == 2
     assert len(model_runner.shared_kv_cache_layers) == 0

     available_memory = 20 * GiB_bytes
-    # page size for each layer KV can be calculated as
-    # 2 (non-MLA) * 8 (num_heads) * 128 (head_dim)
-    # * 2 (bfloat16, kv_cache dtype) * 128 (block_size) = 512KB
-    num_expected_blocks = 20480  # 20GB / 512KB / 2 (num layers)
+    # page size for layer 0's kv_cache_spec is 32KB
+    num_expected_blocks = 327680  # 20GB / 32KB / 2 (num layers)
     kv_cache_config = get_kv_cache_config(vllm_config, kv_cache_spec,
                                           available_memory)
     assert kv_cache_config.num_blocks == num_expected_blocks
-    assert len(kv_cache_config.kv_cache_tensors) == 2
-    assert kv_cache_config.kv_cache_tensors[0].size == available_memory // 2
-    assert kv_cache_config.kv_cache_tensors[1].size == available_memory // 2
+    assert len(kv_cache_config.tensors) == 2
+    assert kv_cache_config.tensors[layer_0].size == available_memory // 2
+    assert kv_cache_config.tensors[layer_1].size == available_memory // 2

     max_context_len =\
         estimate_max_model_len(vllm_config, kv_cache_spec, 5 * GiB_bytes)
     # max context len with KV sharing should be 2x as large as without
     # max_context_len = available_memory / (page_size / block_size) / num_caches
-    # max_context_len = 5GB / (512KB / 128) / 2 = 655360
-    assert max_context_len == 655360
+    assert max_context_len == 1310720

     # important: override tensor size to prevent large mem alloc during test
-    # this will only allocate 2 block worth of memory (2 * 512kb)
+    # this will only allocate 2 block worth of memory (2 * 32kb)
     kv_cache_config.num_blocks = 1
-    for kv_cache_tensor in kv_cache_config.kv_cache_tensors:
-        kv_cache_tensor.size = (
-            kv_cache_spec[kv_cache_tensor.shared_by[0]].page_size_bytes)
+    for layer in kv_cache_config.tensors:
+        kv_cache_config.tensors[layer].size =\
+            kv_cache_spec[layer].page_size_bytes

     model_runner.initialize_kv_cache(kv_cache_config)
@@ -516,10 +524,11 @@ def test_init_kv_cache_without_kv_sharing():
     assert kv_cache_config.kv_cache_groups[0].layer_names[1] == layer_1


-def test_init_kv_cache_with_kv_sharing_valid():
+@pytest.mark.skip(reason="Test is broken on TPU when it's added.")
+def test_init_kv_cache_with_kv_sharing_valid(model_runner):
     layer_0 = "model.layers.0.self_attn.attn"
     layer_1 = "model.layers.1.self_attn.attn"
-    vllm_config = get_vllm_config()
+    vllm_config = model_runner.vllm_config
     with set_current_vllm_config(vllm_config):
         fwd_context = {
             layer_0:
@@ -543,34 +552,33 @@ def test_init_kv_cache_with_kv_sharing_valid():
     # Set high context length to test max context length estimation
     vllm_config.model_config.max_model_len = 3_000_000
     vllm_ctx = vllm_config.compilation_config.static_forward_context
-    model_runner = get_model_runner(vllm_config)
     kv_cache_spec = model_runner.get_kv_cache_spec()
     assert len(kv_cache_spec) == 1
     assert layer_0 in kv_cache_spec
     assert model_runner.shared_kv_cache_layers[layer_1] == layer_0

     available_memory = 20 * GiB_bytes
-    # page size for layer 0's kv_cache_spec is 512KB
+    # page size for layer 0's kv_cache_spec is 32KB
     # with KV sharing, we can allocate (available_mem//page_size//1) blocks
     # which is twice as many as without KV sharing
-    num_expected_blocks = 2 * 20480  # 20GB / 512KB
+    num_expected_blocks = 655360  # 20GB / 32KB
     kv_cache_config = get_kv_cache_config(vllm_config, kv_cache_spec,
                                           available_memory)
     assert kv_cache_config.num_blocks == num_expected_blocks
-    assert len(kv_cache_config.kv_cache_tensors) == 1
+    assert len(kv_cache_config.tensors) == 1
     # Each layer now has twice the available memory for KV cache
     # compared to no KV sharing
-    assert kv_cache_config.kv_cache_tensors[0].size == available_memory
+    assert kv_cache_config.tensors[layer_0].size == available_memory

     max_context_len =\
         estimate_max_model_len(vllm_config, kv_cache_spec, 5 * GiB_bytes)
     # max context len with KV sharing should be 2x as large as without
-    assert max_context_len == (2 * 655360)
+    assert max_context_len == 2 * 1310720

     # important: override tensor size to prevent large mem alloc during test
-    # this will only allocate 1 block worth of memory (512kb)
+    # this will only allocate 1 block worth of memory (32kb)
     kv_cache_config.num_blocks = 1
-    kv_cache_config.kv_cache_tensors[0].size =\
+    kv_cache_config.tensors[layer_0].size =\
         kv_cache_spec[layer_0].page_size_bytes

     model_runner.initialize_kv_cache(kv_cache_config)
```
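The expected-block constants asserted in these tests follow directly from the stated page sizes. A worked check of the 512 KB-page numbers, using the values given in the test comments above:

```python
GiB = 1 << 30
available_memory = 20 * GiB
# 2 (K and V) * 8 heads * 128 head_dim * 2 bytes (bf16) * 128 block_size = 512 KB
page_size = 512 * 1024
num_layers = 2

# Without KV sharing, the two layers split the memory budget evenly:
print(available_memory // page_size // num_layers)  # 20480 blocks

# With KV sharing, layer 1 reuses layer 0's cache, so a single layer
# gets the whole budget and the block count doubles:
print(available_memory // page_size)                # 40960 == 2 * 20480
```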
```diff
@@ -4,7 +4,6 @@
 import random

 import pytest
-import torch

 from vllm.attention import Attention
 from vllm.config import (CacheConfig, ModelConfig, ParallelConfig,
@@ -400,7 +399,6 @@ def test_load_model_weights_inplace(dist_init, model_runner, model_runner_2):


 def test_init_kv_cache_with_kv_sharing_invalid_target_layer_order():
-    torch.set_default_dtype(torch.float16)
     layer_0 = "model.layers.0.self_attn.attn"
     layer_1 = "model.layers.1.self_attn.attn"
     error_msg = f"{layer_1} must come before the current layer"
@@ -429,7 +427,6 @@ def test_init_kv_cache_with_kv_sharing_invalid_target_layer_order():


 def test_init_kv_cache_with_kv_sharing_target_layer_not_exist():
-    torch.set_default_dtype(torch.float16)
     layer_0 = "model.layers.0.self_attn.attn"
     layer_1 = "model.layers.1.self_attn.attn"
     invalid_layer = "model.layers.0.cross_attn.attn"
@@ -458,7 +455,6 @@ def test_init_kv_cache_with_kv_sharing_target_layer_not_exist():


 def test_init_kv_cache_with_kv_sharing_target_same_as_current():
-    torch.set_default_dtype(torch.float16)
     layer_0 = "model.layers.0.self_attn.attn"
     layer_1 = "model.layers.1.self_attn.attn"
     error_msg = f"{layer_1} cannot be the same as the current layer"
@@ -487,7 +483,6 @@ def test_init_kv_cache_with_kv_sharing_target_same_as_current():


 def test_init_kv_cache_without_kv_sharing():
-    torch.set_default_dtype(torch.float16)
     layer_0 = "model.layers.0.self_attn.attn"
     layer_1 = "model.layers.1.self_attn.attn"
     vllm_config = get_vllm_config()
@@ -555,7 +550,6 @@ def test_init_kv_cache_without_kv_sharing():


 def test_init_kv_cache_with_kv_sharing_valid():
-    torch.set_default_dtype(torch.float16)
     layer_0 = "model.layers.0.self_attn.attn"
     layer_1 = "model.layers.1.self_attn.attn"
     vllm_config = get_vllm_config()
```
```diff
@@ -4497,13 +4497,13 @@ class VllmConfig:
             # warning message here and will log it later.
             if not (current_platform.is_cuda() or current_platform.is_rocm()):
                 # Hybrid KV cache manager is not supported on non-GPU platforms.
-                self.scheduler_config.disable_hybrid_kv_cache_manager = True
+                self.disable_hybrid_kv_cache_manager = True
             if self.kv_transfer_config is not None:
                 # Hybrid KV cache manager is not compatible with KV transfer.
-                self.scheduler_config.disable_hybrid_kv_cache_manager = True
+                self.disable_hybrid_kv_cache_manager = True
             if self.kv_events_config is not None:
                 # Hybrid KV cache manager is not compatible with KV events.
-                self.scheduler_config.disable_hybrid_kv_cache_manager = True
+                self.disable_hybrid_kv_cache_manager = True

     def update_sizes_for_sequence_parallelism(self,
                                               possible_sizes: list) -> list:
```
```diff
@@ -233,11 +233,16 @@ class DeepEPLLAll2AllManager(DeepEPAll2AllManagerBase):
         # Defaults for internode and intranode are taken from DeepEP tests.
         num_nvl_bytes = 1024 * 1024 * 1024
         num_qps_per_rank = num_local_experts
-        num_rdma_bytes = deep_ep.Buffer.get_low_latency_rdma_size_hint(
-            num_max_dispatch_tokens_per_rank=max_num_tokens_per_dp_rank,
-            hidden=token_hidden_size,
-            num_ranks=num_ep_ranks,
-            num_experts=num_global_experts)
+        num_rdma_bytes = None
+
+        if self.internode:
+            num_rdma_bytes = 1024 * 1024 * 1024
+        else:
+            num_rdma_bytes = deep_ep.Buffer.get_low_latency_rdma_size_hint(
+                num_max_dispatch_tokens_per_rank=max_num_tokens_per_dp_rank,
+                hidden=token_hidden_size,
+                num_ranks=num_ep_ranks,
+                num_experts=num_global_experts)
+
+        assert num_rdma_bytes is not None
         return dict(group=self.cpu_group,
```
```diff
@@ -1337,6 +1337,13 @@ class EngineArgs:
                                recommend_to_remove=False)
             return False

+        # Only Fp16 and Bf16 dtypes since we only support FA.
+        V1_SUPPORTED_DTYPES = [torch.bfloat16, torch.float16]
+        if model_config.dtype not in V1_SUPPORTED_DTYPES:
+            _raise_or_fallback(feature_name=f"--dtype {model_config.dtype}",
+                               recommend_to_remove=False)
+            return False
+
         # No Embedding Models so far.
         if model_config.task not in ["generate"]:
             _raise_or_fallback(feature_name=f"--task {model_config.task}",
```
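The guard added above is a plain dtype allowlist in front of the V1 engine; in isolation the logic is just the following (a generic sketch, not vLLM's `_raise_or_fallback` machinery):

```python
import torch

V1_SUPPORTED_DTYPES = [torch.bfloat16, torch.float16]

def v1_supports_dtype(dtype: torch.dtype) -> bool:
    # FlashAttention only ships fp16/bf16 kernels, so other dtypes
    # (fp32, fp8, ...) must fall back to the V0 engine instead.
    return dtype in V1_SUPPORTED_DTYPES

print(v1_supports_dtype(torch.float16))  # True
print(v1_supports_dtype(torch.float32))  # False
```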
```diff
@@ -82,7 +82,7 @@ class StopChecker:
             return

         # Check if the sequence has reached max_model_len.
-        if seq.get_len() >= self._get_max_model_len(lora_req):
+        if seq.get_len() > self._get_max_model_len(lora_req):
             seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED
             return
```
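The operator choice here is an off-by-one boundary: `>=` caps a sequence the moment it reaches `max_model_len`, while `>` lets it grow one token past the limit before stopping. A toy illustration (hypothetical values, not vLLM code):

```python
def finished_length_capped(seq_len: int, max_model_len: int,
                           inclusive: bool) -> bool:
    # inclusive=True mirrors the `>=` check; False mirrors `>`.
    return seq_len >= max_model_len if inclusive else seq_len > max_model_len

MAX_MODEL_LEN = 20
print(finished_length_capped(20, MAX_MODEL_LEN, inclusive=True))   # True: capped at the limit
print(finished_length_capped(20, MAX_MODEL_LEN, inclusive=False))  # False: one extra step slips through
```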
```diff
@@ -17,7 +17,6 @@ from typing import Any, Optional
 from fastapi import FastAPI, Request
 from fastapi.responses import JSONResponse, Response, StreamingResponse

-import vllm.envs as envs
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.entrypoints.launcher import serve_http
@@ -30,6 +29,7 @@ from vllm.version import __version__ as VLLM_VERSION

 logger = init_logger("vllm.entrypoints.api_server")

+TIMEOUT_KEEP_ALIVE = 5  # seconds.
 app = FastAPI()
 engine = None

@@ -134,7 +134,7 @@ async def run_server(args: Namespace,
         host=args.host,
         port=args.port,
         log_level=args.log_level,
-        timeout_keep_alive=envs.VLLM_HTTP_TIMEOUT_KEEP_ALIVE,
+        timeout_keep_alive=TIMEOUT_KEEP_ALIVE,
         ssl_keyfile=args.ssl_keyfile,
         ssl_certfile=args.ssl_certfile,
         ssl_ca_certs=args.ssl_ca_certs,
@@ -103,6 +103,8 @@ from vllm.utils import (Device, FlexibleArgumentParser, get_open_zmq_ipc_path,
 from vllm.v1.metrics.prometheus import get_prometheus_registry
 from vllm.version import __version__ as VLLM_VERSION

+TIMEOUT_KEEP_ALIVE = 5  # seconds
+
 prometheus_multiproc_dir: tempfile.TemporaryDirectory

 # Cannot use __name__ (https://github.com/vllm-project/vllm/pull/4765)
@@ -1358,7 +1360,7 @@ async def run_server_worker(listen_address,
         # NOTE: When the 'disable_uvicorn_access_log' value is True,
         # no access log will be output.
         access_log=not args.disable_uvicorn_access_log,
-        timeout_keep_alive=envs.VLLM_HTTP_TIMEOUT_KEEP_ALIVE,
+        timeout_keep_alive=TIMEOUT_KEEP_ALIVE,
         ssl_keyfile=args.ssl_keyfile,
         ssl_certfile=args.ssl_certfile,
         ssl_ca_certs=args.ssl_ca_certs,
```
vllm/envs.py: 10 changes

```diff
@@ -71,7 +71,6 @@ if TYPE_CHECKING:
     VERBOSE: bool = False
     VLLM_ALLOW_LONG_MAX_MODEL_LEN: bool = False
     VLLM_RPC_TIMEOUT: int = 10000  # ms
-    VLLM_HTTP_TIMEOUT_KEEP_ALIVE: int = 5  # seconds
     VLLM_PLUGINS: Optional[list[str]] = None
     VLLM_LORA_RESOLVER_CACHE_DIR: Optional[str] = None
     VLLM_TORCH_PROFILER_DIR: Optional[str] = None
@@ -111,7 +110,6 @@ if TYPE_CHECKING:
     VLLM_DP_SIZE: int = 1
     VLLM_DP_MASTER_IP: str = ""
     VLLM_DP_MASTER_PORT: int = 0
-    VLLM_RANDOMIZE_DP_DUMMY_INPUTS: bool = False
     VLLM_MARLIN_USE_ATOMIC_ADD: bool = False
     VLLM_V0_USE_OUTLINES_CACHE: bool = False
     VLLM_TPU_BUCKET_PADDING_GAP: int = 0
@@ -558,10 +556,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
     "VLLM_RPC_TIMEOUT":
     lambda: int(os.getenv("VLLM_RPC_TIMEOUT", "10000")),

-    # Timeout in seconds for keeping HTTP connections alive in API server
-    "VLLM_HTTP_TIMEOUT_KEEP_ALIVE":
-    lambda: int(os.environ.get("VLLM_HTTP_TIMEOUT_KEEP_ALIVE", "5")),
-
     # a list of plugin names to load, separated by commas.
     # if this is not set, it means all plugins will be loaded
     # if this is set to an empty string, no plugins will be loaded
@@ -767,10 +761,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
     "VLLM_DP_MASTER_PORT":
     lambda: int(os.getenv("VLLM_DP_MASTER_PORT", "0")),

-    # Randomize inputs during dummy runs when using Data Parallel
-    "VLLM_RANDOMIZE_DP_DUMMY_INPUTS":
-    lambda: os.environ.get("VLLM_RANDOMIZE_DP_DUMMY_INPUTS", "0") == "1",
-
     # Whether to use S3 path for model loading in CI via RunAI Streamer
     "VLLM_CI_USE_S3":
     lambda: os.environ.get("VLLM_CI_USE_S3", "0") == "1",
```

```diff
@@ -80,13 +80,11 @@ class DeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
         topk: int,
         num_experts: int,
     ) -> tuple[int, int, torch.dtype]:
-
         block_m = self.block_shape[0]
         M_sum = (M * topk) + num_experts * (block_m - 1)
         M_sum = round_up(M_sum, block_m)
         workspace1 = M_sum * max(N * 2, K)
-        workspace2 = M_sum * max(N, K)
-
+        workspace2 = M_sum * N
         return (workspace1, workspace2, a.dtype)

     def apply(
@@ -137,31 +135,26 @@ class DeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):

         # Note: M_sum is different than the pre-permuted shape of a1q.
         M_sum = a1q.size(0)

-        mm1_out = _resize_cache(workspace13, (M_sum, N))
-        act_out = _resize_cache(workspace2, (M_sum, N // 2))
-        quant_out = _resize_cache(workspace13.view(dtype=torch.float8_e4m3fn),
-                                  (M_sum, N // 2))
-        mm2_out = _resize_cache(workspace2, (M_sum, K))
-        out = _resize_cache(workspace13, (inv_perm.size(0), K))
+        workspace1 = _resize_cache(workspace13, (M_sum, N))
+        workspace2 = _resize_cache(workspace2, (M_sum, N // 2))
+        workspace3 = _resize_cache(workspace13, (M_sum, K))

         dg.m_grouped_gemm_fp8_fp8_bf16_nt_contiguous(
-            (a1q, a1q_scale), (w1, w1_scale), mm1_out, expert_ids)
+            (a1q, a1q_scale), (w1, w1_scale), workspace1, expert_ids)

-        self.activation(activation, act_out, mm1_out.view(-1, N))
+        self.activation(activation, workspace2, workspace1.view(-1, N))

         a2q_scale: Optional[torch.Tensor] = None
-        a2q, a2q_scale = per_token_group_quant_fp8(act_out,
+        a2q, a2q_scale = per_token_group_quant_fp8(workspace2,
                                                    self.block_shape[1],
-                                                   column_major_scales=True,
-                                                   out_q=quant_out)
+                                                   column_major_scales=True)

         dg.m_grouped_gemm_fp8_fp8_bf16_nt_contiguous(
-            (a2q, a2q_scale), (w2, w2_scale), mm2_out, expert_ids)
+            (a2q, a2q_scale), (w2, w2_scale), workspace3, expert_ids)

-        torch.index_select(mm2_out, 0, inv_perm, out=out)
+        workspace3 = workspace3[inv_perm, ...]

-        return out
+        return workspace3


 def deep_gemm_moe_fp8(
```

```diff
@@ -5,7 +5,6 @@ import deep_ep
 import torch

 import vllm.model_executor.layers.fused_moe.modular_kernel as mk
-from vllm import _custom_ops as ops
 from vllm.model_executor.layers.fused_moe.utils import (
     moe_kernel_quantize_input)
@@ -194,23 +193,20 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
                            apply_router_weight_on_input: bool,
                            output_dtype: torch.dtype):

-        hidden_dim = fused_expert_output.size(-1)
         if fused_expert_output.ndim == 2:
+            hidden_dim = fused_expert_output.size(-1)
             fused_expert_output = fused_expert_output.view(
                 num_tokens, -1, hidden_dim)

         if not apply_router_weight_on_input:
             # The DeepEP combine kernels don't do the topk weight
             # multiplication. We multiply the weights locally.
-            m_x_topk = fused_expert_output.size(0)
-            fused_expert_output.mul_(topk_weights.view(m_x_topk, -1, 1))
+            fused_expert_output = fused_expert_output.to(torch.float32)
+            fused_expert_output = fused_expert_output * topk_weights.view(
+                fused_expert_output.size(0), -1, 1)
+            fused_expert_output = fused_expert_output.to(output_dtype)

-        out = torch.empty((num_tokens, hidden_dim),
-                          device=fused_expert_output.device,
-                          dtype=output_dtype)
-        ops.moe_sum(fused_expert_output, out)
-
-        return out
+        return fused_expert_output.sum(dim=1).to(output_dtype)

     def finalize(self, output: torch.Tensor, fused_expert_output: torch.Tensor,
                  topk_weights: torch.Tensor, topk_ids: torch.Tensor,
```

```diff
@@ -18,7 +18,7 @@ def _moe_permute(
     expert_map: Optional[torch.Tensor],
     block_m: int,
 ) -> tuple[torch.Tensor, Optional[torch.Tensor], torch.Tensor, torch.Tensor,
-           torch.Tensor]:
+           Optional[torch.Tensor]]:
     """
     Determine the sorted_token_ids, expert_ids for the given problem size.
     Permute the hidden states and scales according to `sorted_token_ids`.
```

```diff
@@ -234,13 +234,8 @@ def _per_token_group_quant_fp8(
     row = g_id // groups_per_row
     row_g_id = g_id % groups_per_row

-    # Ensure offset calculations use int64 to prevent overflow
-    y_ptr_offset = (row.to(tl.int64) * y_row_stride) + (row_g_id.to(tl.int64) *
-                                                        group_size)
-    y_ptr += y_ptr_offset
-
-    y_q_ptr_offset = g_id.to(tl.int64) * group_size
-    y_q_ptr += y_q_ptr_offset
+    y_ptr += (row * y_row_stride) + (row_g_id * group_size)
+    y_q_ptr += g_id * group_size
     y_s_ptr += g_id

     cols = tl.arange(0, BLOCK)  # N <= BLOCK
@@ -287,23 +282,15 @@ def _per_token_group_quant_fp8_colmajor(
     row = g_id // groups_per_row
     row_g_id = g_id % groups_per_row

-    # Ensure offset calculations use int64 to prevent overflow
-    y_ptr_offset = (row.to(tl.int64) * y_row_stride) + (row_g_id.to(tl.int64) *
-                                                        group_size)
-    y_ptr += y_ptr_offset
-
-    y_q_ptr_offset = g_id.to(tl.int64) * group_size
-    y_q_ptr += y_q_ptr_offset
+    y_ptr += (row * y_row_stride) + (row_g_id * group_size)
+    y_q_ptr += g_id * group_size

     # Convert g_id the flattened block coordinate to 2D so we can index
     # into the output y_scales matrix
     blocks_per_row = y_num_columns // group_size
     scale_col = g_id % blocks_per_row
     scale_row = g_id // blocks_per_row
-    # Ensure offset calculation uses int64 for y_s_ptr
-    y_s_ptr_offset = (scale_col.to(tl.int64) * y_s_col_stride) + scale_row.to(
-        tl.int64)
-    y_s_ptr += y_s_ptr_offset
+    y_s_ptr += scale_col * y_s_col_stride + scale_row

     cols = tl.arange(0, BLOCK)  # group_size <= BLOCK
     mask = cols < group_size
@@ -324,7 +311,6 @@ def per_token_group_quant_fp8(
     eps: float = 1e-10,
     dtype: Optional[torch.dtype] = None,
     column_major_scales: bool = False,
-    out_q: Optional[torch.Tensor] = None,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     """Function to perform per-token-group quantization on an input tensor `x`.
     It converts the tensor values into signed float8 values and returns the
@@ -335,8 +321,6 @@ def per_token_group_quant_fp8(
         eps: The minimum to avoid dividing zero.
         dtype: The dype of output tensor. Note that only `torch.float8_e4m3fn`
             is supported for now.
-        column_major_scales: Outputs scales in column major.
-        out_q: Optional output tensor. If not provided, function will create.
     Returns:
         tuple[torch.Tensor, torch.Tensor]: The quantized tensor and the
             scaling factor for quantization.
@@ -351,11 +335,7 @@ def per_token_group_quant_fp8(
     fp8_min = finfo.min
     fp8_max = finfo.max

-    assert out_q is None or out_q.shape == x.shape
-    x_q = out_q
-    if x_q is None:
-        x_q = torch.empty_like(x, device=x.device, dtype=dtype)
-
+    x_q = torch.empty_like(x, device=x.device, dtype=dtype)
     M = x.numel() // group_size
     N = group_size
     if column_major_scales:
```

```diff
@@ -233,10 +233,6 @@ class CudaPlatformBase(Platform):
                 logger.info_once("Using Triton backend on V1 engine.")
                 return ("vllm.v1.attention.backends."
                         "triton_attn.TritonAttentionBackend")
-            if dtype not in (torch.float16, torch.bfloat16):
-                logger.info_once(
-                    f"Using FlexAttenion backend for {dtype} on V1 engine.")
-                return "vllm.v1.attention.backends.flex_attention.FlexAttentionBackend"  # noqa: E501
             if cls.is_device_capability(100):
                 # Prefer FlashInfer for V1 on Blackwell GPUs if installed
                 try:
```

```diff
@@ -40,13 +40,12 @@ class CutlassMLAImpl(MLACommonImpl[MLACommonMetadata]):
             blocksparse_params: Optional[dict[str, Any]],
             logits_soft_cap: Optional[float],
             attn_type: str,
-            kv_sharing_target_layer_name: Optional[str],
             # MLA Specific Arguments
             **mla_args) -> None:
         super().__init__(num_heads, head_size, scale, num_kv_heads,
                          alibi_slopes, sliding_window, kv_cache_dtype,
                          blocksparse_params, logits_soft_cap, attn_type,
-                         kv_sharing_target_layer_name, **mla_args)
+                         **mla_args)

         unsupported_features = [
             alibi_slopes, sliding_window, blocksparse_params, logits_soft_cap
```

```diff
@@ -5,7 +5,6 @@
 import gc
 import time
 import weakref
-from contextlib import contextmanager
 from typing import TYPE_CHECKING, Any, Optional, Union

 import numpy as np
@@ -13,7 +12,6 @@ import torch
 import torch.distributed
 import torch.nn as nn

-import vllm.envs as envs
 from vllm.attention import AttentionType, get_attn_backend
 from vllm.attention.backends.abstract import (AttentionBackend,
                                               AttentionMetadataBuilder)
@@ -657,10 +655,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):

         # Fill unused with -1. Needed for reshape_and_cache
         self.seq_lens[num_reqs:].fill_(0)
-        # Note: pad query_start_loc to be non-decreasing, as kernels
-        # like FlashAttention requires that
-        self.query_start_loc[num_reqs + 1:].fill_(
-            self.query_start_loc_cpu[num_reqs].item())
+        self.query_start_loc[num_reqs + 1:].fill_(-1)

         query_start_loc = self.query_start_loc[:num_reqs + 1]
         seq_lens = self.seq_lens[:num_reqs]
@@ -1729,35 +1724,6 @@ class GPUModelRunner(LoRAModelRunnerMixin):

         return prompt_logprobs_dict

-    @contextmanager
-    def maybe_randomize_inputs(self, input_ids: torch.Tensor):
-        """
-        Randomize input_ids if VLLM_RANDOMIZE_DP_DUMMY_INPUTS is set.
-        This is to help balance expert-selection
-         - during profile_run
-         - during DP rank dummy run
-        """
-        dp_size = self.vllm_config.parallel_config.data_parallel_size
-        randomize_inputs = envs.VLLM_RANDOMIZE_DP_DUMMY_INPUTS and dp_size > 1
-        if not randomize_inputs:
-            yield
-        else:
-            import functools
-
-            @functools.cache
-            def rand_input_ids() -> torch.Tensor:
-                return torch.randint_like(
-                    self.input_ids,
-                    low=0,
-                    high=self.model_config.get_vocab_size(),
-                    dtype=input_ids.dtype)
-
-            logger.debug("Randomizing dummy data for DP Rank")
-            input_ids.copy_(rand_input_ids()[:input_ids.size(0)],
-                            non_blocking=True)
-            yield
-            input_ids.fill_(0)
-
     @torch.inference_mode()
     def _dummy_run(
         self,
@@ -1838,7 +1804,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
             intermediate_tensors = self.sync_and_slice_intermediate_tensors(
                 num_tokens, None, False)

-        with self.maybe_randomize_inputs(input_ids), set_forward_context(
+        with set_forward_context(
                 attn_metadata,
                 self.vllm_config,
                 num_tokens=num_tokens,
```