[BugFix] Fix DeepGEMM warmup, no m.weight_scale_inv (#25206 )

Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
[KV offload][1b/N] rename offloading to kv_offload (#25191 )
2025-09-18 14:26:28 -07:00 · 2025-09-18 20:53:52 +00:00 · 2025-09-18 20:35:21 +00:00 · 2025-09-18 12:38:37 -07:00 · 2025-09-18 19:32:24 +00:00 · 2025-09-18 19:18:56 +00:00
27 changed files with 832 additions and 2197 deletions
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@ -280,6 +280,7 @@ steps:
    # split the test to avoid interference
    - pytest -v -s v1/core
    - pytest -v -s v1/executor
+    - pytest -v -s v1/kv_offload
    - pytest -v -s v1/sample
    - pytest -v -s v1/logits_processors
    - pytest -v -s v1/worker
--- a/tests/entrypoints/openai/test_chat.py
+++ b/tests/entrypoints/openai/test_chat.py
@ -28,11 +28,9 @@ def monkeypatch_module():
    mpatch.undo()


-@pytest.fixture(scope="module", params=[False, True])
-def server(request, monkeypatch_module, zephyr_lora_files):  #noqa: F811
-
-    use_v1 = request.param
-    monkeypatch_module.setenv('VLLM_USE_V1', '1' if use_v1 else '0')
+@pytest.fixture(scope="module")
+def server(monkeypatch_module, zephyr_lora_files):  #noqa: F811
+    monkeypatch_module.setenv('VLLM_USE_V1', '1')

    args = [
        # use half precision for speed and memory savings in CI environment
@ -57,13 +55,6 @@ def server(request, monkeypatch_module, zephyr_lora_files):  #noqa: F811
        yield remote_server


-@pytest.fixture
-def is_v1_server(server):
-    import os
-    assert os.environ['VLLM_USE_V1'] in ['0', '1']
-    return os.environ['VLLM_USE_V1'] == '1'
-
-
@pytest_asyncio.fixture
 async def client(server):
    async with server.get_async_client() as async_client:
@ -481,10 +472,9 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,

@pytest.mark.asyncio
 async def test_structured_outputs_choice_chat(
-        client: openai.AsyncOpenAI, sample_structured_outputs_choices,
-        is_v1_server: bool):
-    if not is_v1_server:
-        pytest.skip("Structured outputs is only supported in v1 engine")
+    client: openai.AsyncOpenAI,
+    sample_structured_outputs_choices,
+):
    messages = [{
        "role": "system",
        "content": "you are a helpful assistant"
@ -522,12 +512,10 @@ async def test_structured_outputs_choice_chat(


@pytest.mark.asyncio
-async def test_structured_outputs_json_chat(client: openai.AsyncOpenAI,
-                                            sample_json_schema,
-                                            is_v1_server: bool):
-    if not is_v1_server:
-        pytest.skip("Structured outputs is only supported in v1 engine")
-
+async def test_structured_outputs_json_chat(
+    client: openai.AsyncOpenAI,
+    sample_json_schema,
+):
    messages = [{
        "role": "system",
        "content": "you are a helpful assistant"
@ -569,10 +557,10 @@ async def test_structured_outputs_json_chat(client: openai.AsyncOpenAI,


@pytest.mark.asyncio
-async def test_structured_outputs_regex_chat(client: openai.AsyncOpenAI,
-                                             sample_regex, is_v1_server: bool):
-    if not is_v1_server:
-        pytest.skip("Structured outputs is only supported in v1 engine")
+async def test_structured_outputs_regex_chat(
+    client: openai.AsyncOpenAI,
+    sample_regex,
+):

    messages = [{
        "role": "system",
@ -660,10 +648,10 @@ async def test_structured_outputs_choice_chat_logprobs(


@pytest.mark.asyncio
-async def test_named_tool_use(client: openai.AsyncOpenAI, sample_json_schema,
-                              is_v1_server: bool):
-    if not is_v1_server:
-        pytest.skip("Tool use is only supported in v1 engine")
+async def test_named_tool_use(
+    client: openai.AsyncOpenAI,
+    sample_json_schema,
+):
    messages = [{
        "role": "system",
        "content": "you are a helpful assistant"
@ -821,11 +809,7 @@ async def test_response_format_json_object(client: openai.AsyncOpenAI):


@pytest.mark.asyncio
-async def test_response_format_json_schema(client: openai.AsyncOpenAI,
-                                           is_v1_server: bool):
-    if not is_v1_server:
-        pytest.skip(
-            "JSON schema response format is only supported in v1 engine")
+async def test_response_format_json_schema(client: openai.AsyncOpenAI):
    prompt = 'what is 1+1? The format is "result": 2'
    # Check that this prompt cannot lead to a valid JSON without json_schema
    for _ in range(2):
--- a/tests/entrypoints/openai/test_completion.py
+++ b/tests/entrypoints/openai/test_completion.py
@ -1,830 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-# imports for structured outputs tests
-import json
-import os
-from typing import Optional
-
-import jsonschema
-import openai  # use the official client for correctness check
-import pytest
-import pytest_asyncio
-import regex as re
-import requests
-# downloading lora to test lora requests
-from openai import BadRequestError
-
-from vllm.transformers_utils.tokenizer import get_tokenizer
-
-from ...utils import RemoteOpenAIServer
-
-# any model with a chat template should work here
-MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
-# technically these adapters use a different base model,
-# but we're not testing generation quality here
-
-
-@pytest.fixture(scope="module")
-def default_server_args(zephyr_lora_files):
-    return [
-        # use half precision for speed and memory savings in CI environment
-        "--dtype",
-        "bfloat16",
-        "--max-model-len",
-        "8192",
-        "--max-num-seqs",
-        "128",
-        "--enforce-eager",
-        # lora config
-        "--enable-lora",
-        "--lora-modules",
-        f"zephyr-lora={zephyr_lora_files}",
-        "--max-lora-rank",
-        "64",
-        "--max-cpu-loras",
-        "2",
-    ]
-
-
-@pytest.fixture(scope="module",
-                params=["", "--disable-frontend-multiprocessing"])
-def server(default_server_args, request):
-    if request.param:
-        default_server_args.append(request.param)
-
-    original_value = os.environ.get('VLLM_USE_V1')
-    os.environ['VLLM_USE_V1'] = '0'
-    try:
-        with RemoteOpenAIServer(MODEL_NAME,
-                                default_server_args) as remote_server:
-            yield remote_server
-    finally:
-        # Restore original env value
-        if original_value is None:
-            os.environ.pop('VLLM_USE_V1', None)
-        else:
-            os.environ['VLLM_USE_V1'] = original_value
-
-
-@pytest.fixture
-def is_v1_server(server):
-    import os
-
-    # For completion tests, we assume v0 since there's no explicit v1 setup
-    return os.environ.get('VLLM_USE_V1', '0') == '1'
-
-
-@pytest_asyncio.fixture
-async def client(server):
-    async with server.get_async_client() as async_client:
-        yield async_client
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize(
-    # first test base model, then test loras
-    "model_name",
-    [MODEL_NAME, "zephyr-lora"],
-)
-async def test_single_completion(client: openai.AsyncOpenAI, model_name: str):
-    completion = await client.completions.create(model=model_name,
-                                                 prompt="Hello, my name is",
-                                                 max_tokens=5,
-                                                 temperature=0.0)
-
-    assert completion.id is not None
-    assert completion.choices is not None and len(completion.choices) == 1
-
-    choice = completion.choices[0]
-    assert len(choice.text) >= 5
-    assert choice.finish_reason == "length"
-    assert completion.usage == openai.types.CompletionUsage(
-        completion_tokens=5, prompt_tokens=6, total_tokens=11)
-
-    # test using token IDs
-    completion = await client.completions.create(
-        model=model_name,
-        prompt=[0, 0, 0, 0, 0],
-        max_tokens=5,
-        temperature=0.0,
-    )
-    assert len(completion.choices[0].text) >= 1
-    assert completion.choices[0].prompt_logprobs is None
-
-
-@pytest.mark.asyncio
-async def test_added_lora_tokens_base_model(client: openai.AsyncOpenAI):
-    # test using token IDs
-    with pytest.raises(openai.BadRequestError, match="out of vocabulary"):
-        # Added tokens should be rejected by the base model
-        await client.completions.create(
-            model=MODEL_NAME,
-            prompt=[0, 0, 32000, 32001, 32002],
-            echo=True,
-            max_tokens=5,
-            temperature=0.0,
-        )
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize(
-    # first test base model, then test loras
-    "model_name",
-    [MODEL_NAME, "zephyr-lora"],
-)
-async def test_no_logprobs(client: openai.AsyncOpenAI, model_name: str):
-    # test using token IDs
-    completion = await client.completions.create(
-        model=model_name,
-        prompt=[0, 0, 0, 0, 0],
-        max_tokens=5,
-        temperature=0.0,
-        logprobs=None,
-    )
-    choice = completion.choices[0]
-    assert choice.logprobs is None
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize(
-    # just test 1 lora
-    "model_name",
-    [MODEL_NAME, "zephyr-lora"],
-)
-async def test_zero_logprobs(client: openai.AsyncOpenAI, model_name: str):
-    # test using token IDs
-    completion = await client.completions.create(
-        model=model_name,
-        prompt=[0, 0, 0, 0, 0],
-        max_tokens=5,
-        temperature=0.0,
-        logprobs=0,
-    )
-    choice = completion.choices[0]
-    assert choice.logprobs is not None
-    assert choice.logprobs.token_logprobs is not None
-    assert choice.logprobs.top_logprobs is not None
-    assert len(choice.logprobs.top_logprobs[0]) == 1
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize(
-    "model_name",
-    [MODEL_NAME, "zephyr-lora"],
-)
-async def test_some_logprobs(client: openai.AsyncOpenAI, model_name: str):
-    # test using token IDs
-    completion = await client.completions.create(
-        model=model_name,
-        prompt=[0, 0, 0, 0, 0],
-        max_tokens=5,
-        temperature=0.0,
-        logprobs=5,
-    )
-    choice = completion.choices[0]
-    assert choice.logprobs is not None
-    assert choice.logprobs.token_logprobs is not None
-    assert choice.logprobs.top_logprobs is not None
-    assert 5 <= len(choice.logprobs.top_logprobs[0]) <= 6
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize(
-    "model_name",
-    [MODEL_NAME, "zephyr-lora"],
-)
-async def test_too_many_completion_logprobs(client: openai.AsyncOpenAI,
-                                            model_name: str):
-
-    with pytest.raises(
-        (openai.BadRequestError, openai.APIError)):  # test using token IDs
-        await client.completions.create(
-            model=model_name,
-            prompt=[0, 0, 0, 0, 0],
-            max_tokens=5,
-            temperature=0.0,
-            # vLLM has higher default max_logprobs (20 instead of 5) to support
-            # both Completion API and Chat Completion API
-            logprobs=21,
-        )
-        ...
-    with pytest.raises(
-        (openai.BadRequestError, openai.APIError)):  # test using token IDs
-        stream = await client.completions.create(
-            model=model_name,
-            prompt=[0, 0, 0, 0, 0],
-            max_tokens=5,
-            temperature=0.0,
-            # vLLM has higher default max_logprobs (20 instead of 5) to support
-            # both Completion API and Chat Completion API
-            logprobs=30,
-            stream=True,
-        )
-        async for chunk in stream:
-            ...
-
-    # the server should still work afterwards
-    completion = await client.completions.create(
-        model=model_name,
-        prompt=[0, 0, 0, 0, 0],
-        max_tokens=5,
-        temperature=0.0,
-    )
-    assert len(completion.choices[0].text) >= 0
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize("model_name, prompt_logprobs", [(MODEL_NAME, -1),
-                                                         (MODEL_NAME, 0),
-                                                         (MODEL_NAME, 1),
-                                                         (MODEL_NAME, None)])
-async def test_prompt_logprobs_completion(client: openai.AsyncOpenAI,
-                                          model_name: str,
-                                          prompt_logprobs: Optional[int]):
-    params: dict = {
-        "prompt": ["A robot may not injure another robot", "My name is"],
-        "model": model_name,
-    }
-    if prompt_logprobs is not None:
-        params["extra_body"] = {"prompt_logprobs": prompt_logprobs}
-
-    if prompt_logprobs is not None and prompt_logprobs < 0:
-        with pytest.raises(BadRequestError):
-            await client.completions.create(**params)
-    else:
-        completion = await client.completions.create(**params)
-        if prompt_logprobs is not None:
-            assert completion.choices[0].prompt_logprobs is not None
-            assert len(completion.choices[0].prompt_logprobs) > 0
-
-            assert completion.choices[1].prompt_logprobs is not None
-            assert len(completion.choices[1].prompt_logprobs) > 0
-
-        else:
-            assert completion.choices[0].prompt_logprobs is None
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize(
-    "model_name",
-    [MODEL_NAME, "zephyr-lora"],
-)
-async def test_completion_streaming(client: openai.AsyncOpenAI,
-                                    model_name: str):
-    prompt = "What is an LLM?"
-
-    single_completion = await client.completions.create(
-        model=model_name,
-        prompt=prompt,
-        max_tokens=5,
-        temperature=0.0,
-    )
-    single_output = single_completion.choices[0].text
-    stream = await client.completions.create(model=model_name,
-                                             prompt=prompt,
-                                             max_tokens=5,
-                                             temperature=0.0,
-                                             stream=True)
-    chunks: list[str] = []
-    finish_reason_count = 0
-    async for chunk in stream:
-        chunks.append(chunk.choices[0].text)
-        if chunk.choices[0].finish_reason is not None:
-            finish_reason_count += 1
-    # finish reason should only return in last block
-    assert finish_reason_count == 1
-    assert chunk.choices[0].finish_reason == "length"
-    assert chunk.choices[0].text
-    assert "".join(chunks) == single_output
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize(
-    "model_name",
-    [MODEL_NAME, "zephyr-lora"],
-)
-async def test_parallel_streaming(client: openai.AsyncOpenAI, model_name: str):
-    """Streaming for parallel sampling.
-    The tokens from multiple samples, are flattened into a single stream,
-    with an index to indicate which sample the token belongs to.
-    """
-
-    prompt = "What is an LLM?"
-    n = 3
-    max_tokens = 5
-
-    stream = await client.completions.create(model=model_name,
-                                             prompt=prompt,
-                                             max_tokens=max_tokens,
-                                             n=n,
-                                             stream=True)
-    chunks: list[list[str]] = [[] for i in range(n)]
-    finish_reason_count = 0
-    async for chunk in stream:
-        index = chunk.choices[0].index
-        text = chunk.choices[0].text
-        chunks[index].append(text)
-        if chunk.choices[0].finish_reason is not None:
-            finish_reason_count += 1
-    assert finish_reason_count == n
-    for chunk in chunks:
-        assert len(chunk) == max_tokens
-        print("".join(chunk))
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize(
-    "model_name",
-    [MODEL_NAME, "zephyr-lora"],
-)
-async def test_completion_stream_options(client: openai.AsyncOpenAI,
-                                         model_name: str):
-    prompt = "What is the capital of France?"
-
-    # Test stream=True, stream_options=
-    #     {"include_usage": False, "continuous_usage_stats": False}
-    stream = await client.completions.create(model=model_name,
-                                             prompt=prompt,
-                                             max_tokens=5,
-                                             temperature=0.0,
-                                             stream=True,
-                                             stream_options={
-                                                 "include_usage": False,
-                                                 "continuous_usage_stats":
-                                                 False,
-                                             })
-
-    async for chunk in stream:
-        assert chunk.usage is None
-
-    # Test stream=True, stream_options=
-    #     {"include_usage": False, "continuous_usage_stats": True}
-    stream = await client.completions.create(model=model_name,
-                                             prompt=prompt,
-                                             max_tokens=5,
-                                             temperature=0.0,
-                                             stream=True,
-                                             stream_options={
-                                                 "include_usage": False,
-                                                 "continuous_usage_stats":
-                                                 True,
-                                             })
-    async for chunk in stream:
-        assert chunk.usage is None
-
-    # Test stream=True, stream_options=
-    #     {"include_usage": True, "continuous_usage_stats": False}
-    stream = await client.completions.create(model=model_name,
-                                             prompt=prompt,
-                                             max_tokens=5,
-                                             temperature=0.0,
-                                             stream=True,
-                                             stream_options={
-                                                 "include_usage": True,
-                                                 "continuous_usage_stats":
-                                                 False,
-                                             })
-    async for chunk in stream:
-        if chunk.choices[0].finish_reason is None:
-            assert chunk.usage is None
-        else:
-            assert chunk.usage is None
-            final_chunk = await stream.__anext__()
-            assert final_chunk.usage is not None
-            assert final_chunk.usage.prompt_tokens > 0
-            assert final_chunk.usage.completion_tokens > 0
-            assert final_chunk.usage.total_tokens == (
-                final_chunk.usage.prompt_tokens +
-                final_chunk.usage.completion_tokens)
-            assert final_chunk.choices == []
-
-    # Test stream=True, stream_options=
-    #     {"include_usage": True, "continuous_usage_stats": True}
-    stream = await client.completions.create(model=model_name,
-                                             prompt=prompt,
-                                             max_tokens=5,
-                                             temperature=0.0,
-                                             stream=True,
-                                             stream_options={
-                                                 "include_usage": True,
-                                                 "continuous_usage_stats":
-                                                 True,
-                                             })
-    async for chunk in stream:
-        assert chunk.usage is not None
-        assert chunk.usage.prompt_tokens > 0
-        assert chunk.usage.completion_tokens > 0
-        assert chunk.usage.total_tokens == (chunk.usage.prompt_tokens +
-                                            chunk.usage.completion_tokens)
-        if chunk.choices[0].finish_reason is not None:
-            final_chunk = await stream.__anext__()
-            assert final_chunk.usage is not None
-            assert final_chunk.usage.prompt_tokens > 0
-            assert final_chunk.usage.completion_tokens > 0
-            assert final_chunk.usage.total_tokens == (
-                final_chunk.usage.prompt_tokens +
-                final_chunk.usage.completion_tokens)
-            assert final_chunk.choices == []
-
-    # Test stream=False, stream_options=
-    #     {"include_usage": None}
-    with pytest.raises(BadRequestError):
-        await client.completions.create(model=model_name,
-                                        prompt=prompt,
-                                        max_tokens=5,
-                                        temperature=0.0,
-                                        stream=False,
-                                        stream_options={"include_usage": None})
-
-    # Test stream=False, stream_options=
-    #    {"include_usage": True}
-    with pytest.raises(BadRequestError):
-        await client.completions.create(model=model_name,
-                                        prompt=prompt,
-                                        max_tokens=5,
-                                        temperature=0.0,
-                                        stream=False,
-                                        stream_options={"include_usage": True})
-
-    # Test stream=False, stream_options=
-    #     {"continuous_usage_stats": None}
-    with pytest.raises(BadRequestError):
-        await client.completions.create(
-            model=model_name,
-            prompt=prompt,
-            max_tokens=5,
-            temperature=0.0,
-            stream=False,
-            stream_options={"continuous_usage_stats": None})
-
-    # Test stream=False, stream_options=
-    #    {"continuous_usage_stats": True}
-    with pytest.raises(BadRequestError):
-        await client.completions.create(
-            model=model_name,
-            prompt=prompt,
-            max_tokens=5,
-            temperature=0.0,
-            stream=False,
-            stream_options={"continuous_usage_stats": True})
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize(
-    "model_name",
-    [MODEL_NAME, "zephyr-lora"],
-)
-async def test_batch_completions(client: openai.AsyncOpenAI, model_name: str):
-    # test both text and token IDs
-    for prompts in (["Hello, my name is"] * 2, [[0, 0, 0, 0, 0]] * 2):
-        # test simple list
-        batch = await client.completions.create(
-            model=model_name,
-            prompt=prompts,
-            max_tokens=5,
-            temperature=0.0,
-        )
-        assert len(batch.choices) == 2
-        assert batch.choices[0].text == batch.choices[1].text
-
-        # test n = 2
-        batch = await client.completions.create(
-            model=model_name,
-            prompt=prompts,
-            n=2,
-            max_tokens=5,
-            temperature=0.0,
-            extra_body=dict(
-                # NOTE: this has to be true for n > 1 in vLLM, but
-                # not necessary for official client.
-                use_beam_search=True),
-        )
-        assert len(batch.choices) == 4
-        assert batch.choices[0].text != batch.choices[
-            1].text, "beam search should be different"
-        assert batch.choices[0].text == batch.choices[
-            2].text, "two copies of the same prompt should be the same"
-        assert batch.choices[1].text == batch.choices[
-            3].text, "two copies of the same prompt should be the same"
-
-        # test streaming
-        batch = await client.completions.create(
-            model=model_name,
-            prompt=prompts,
-            max_tokens=5,
-            temperature=0.0,
-            stream=True,
-        )
-        texts = [""] * 2
-        async for chunk in batch:
-            assert len(chunk.choices) == 1
-            choice = chunk.choices[0]
-            texts[choice.index] += choice.text
-        assert texts[0] == texts[1]
-
-
-@pytest.mark.asyncio
-async def test_logits_bias(client: openai.AsyncOpenAI):
-    prompt = "Hello, my name is"
-    max_tokens = 5
-    tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
-
-    # Test exclusive selection
-    token_id = 1000
-    completion = await client.completions.create(
-        model=MODEL_NAME,
-        prompt=prompt,
-        max_tokens=max_tokens,
-        temperature=0.0,
-        logit_bias={str(token_id): 100},
-        seed=42,
-    )
-    assert len(completion.choices[0].text) >= 5
-    response_tokens = tokenizer(completion.choices[0].text,
-                                add_special_tokens=False)["input_ids"]
-    expected_tokens = tokenizer(tokenizer.decode([token_id] * 5),
-                                add_special_tokens=False)["input_ids"]
-    assert all([
-        response == expected
-        for response, expected in zip(response_tokens, expected_tokens)
-    ])
-
-    # Test ban
-    completion = await client.completions.create(
-        model=MODEL_NAME,
-        prompt=prompt,
-        max_tokens=max_tokens,
-        temperature=0.0,
-    )
-    response_tokens = tokenizer(completion.choices[0].text,
-                                add_special_tokens=False)["input_ids"]
-    first_response = completion.choices[0].text
-    completion = await client.completions.create(
-        model=MODEL_NAME,
-        prompt=prompt,
-        max_tokens=max_tokens,
-        temperature=0.0,
-        logit_bias={str(token): -100
-                    for token in response_tokens},
-    )
-    assert first_response != completion.choices[0].text
-
-
-@pytest.mark.asyncio
-async def test_allowed_token_ids(client: openai.AsyncOpenAI):
-    prompt = "Hello, my name is"
-    max_tokens = 1
-    tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
-
-    # Test exclusive selection
-    allowed_ids = [21555, 21557, 21558]
-    completion = await client.completions.create(
-        model=MODEL_NAME,
-        prompt=prompt,
-        max_tokens=max_tokens,
-        temperature=0.0,
-        seed=42,
-        extra_body=dict(allowed_token_ids=allowed_ids),
-        logprobs=1,
-    )
-    response_tokens = completion.choices[0].logprobs.tokens
-    assert len(response_tokens) == 1
-    assert tokenizer.convert_tokens_to_ids(response_tokens)[0] in allowed_ids
-
-
-@pytest.mark.asyncio
-async def test_structured_outputs_json_completion(
-    client: openai.AsyncOpenAI,
-    sample_json_schema,
-    is_v1_server: bool,
-):
-    if not is_v1_server:
-        pytest.skip("structured outputs is only supported in v1 engine")
-
-    completion = await client.completions.create(
-        model=MODEL_NAME,
-        prompt=f"Give an example JSON for an employee profile "
-        f"that fits this schema: {sample_json_schema}",
-        n=3,
-        temperature=1.0,
-        max_tokens=500,
-        extra_body=dict(structured_outputs=dict(json=sample_json_schema)))
-
-    assert completion.id is not None
-    assert len(completion.choices) == 3
-    for i in range(3):
-        output_json = json.loads(completion.choices[i].text)
-        jsonschema.validate(instance=output_json, schema=sample_json_schema)
-
-
-@pytest.mark.asyncio
-async def test_structured_outputs_regex_completion(
-    client: openai.AsyncOpenAI,
-    sample_regex,
-    is_v1_server: bool,
-):
-    if not is_v1_server:
-        pytest.skip("structured outputs is only supported in v1 engine")
-
-    completion = await client.completions.create(
-        model=MODEL_NAME,
-        prompt=f"Give an example IPv4 address with this regex: {sample_regex}",
-        n=3,
-        temperature=1.0,
-        max_tokens=20,
-        extra_body=dict(structured_outputs=dict(regex=sample_regex)))
-
-    assert completion.id is not None
-    assert len(completion.choices) == 3
-    for i in range(3):
-        assert re.fullmatch(sample_regex,
-                            completion.choices[i].text) is not None
-
-
-@pytest.mark.asyncio
-async def test_structured_outputs_choice_completion(
-    client: openai.AsyncOpenAI,
-    sample_structured_outputs_choices,
-    is_v1_server: bool,
-):
-    if not is_v1_server:
-        pytest.skip("structured outputs is only supported in v1 engine")
-
-    completion = await client.completions.create(
-        model=MODEL_NAME,
-        prompt="The best language for type-safe systems programming is ",
-        n=2,
-        temperature=1.0,
-        max_tokens=10,
-        extra_body=dict(structured_outputs=dict(
-            choice=sample_structured_outputs_choices)))
-
-    assert completion.id is not None
-    assert len(completion.choices) == 2
-    for i in range(2):
-        assert completion.choices[i].text in sample_structured_outputs_choices
-
-
-@pytest.mark.asyncio
-async def test_structured_outputs_grammar(client: openai.AsyncOpenAI,
-                                          sample_sql_statements,
-                                          is_v1_server: bool):
-    if not is_v1_server:
-        pytest.skip("grammar is only supported in v1 engine")
-
-    completion = await client.completions.create(
-        model=MODEL_NAME,
-        prompt=("Generate a sql state that select col_1 from "
-                "table_1 where it is equals to 1"),
-        temperature=1.0,
-        max_tokens=500,
-        extra_body=dict(
-            structured_outputs=dict(grammar=sample_sql_statements), ))
-
-    content = completion.choices[0].text
-
-    # use Lark to parse the output, and make sure it's a valid parse tree
-    from lark import Lark
-    parser = Lark(sample_sql_statements)
-    parser.parse(content)
-
-    # remove spaces for comparison b/c we removed them in the grammar
-    ground_truth = "SELECT col_1 from table_1 where col_1 = 1".replace(" ", "")
-
-    assert content.strip() == ground_truth
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize(
-    # first test base model, then test loras
-    "model_name",
-    [MODEL_NAME, "zephyr-lora"],
-)
-@pytest.mark.parametrize("logprobs_arg", [1, 0])
-async def test_echo_logprob_completion(client: openai.AsyncOpenAI,
-                                       model_name: str, logprobs_arg: int):
-    tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
-    # test using text and token IDs
-    for prompt in ("Hello, my name is", [0, 0, 0, 0, 0]):
-        completion = await client.completions.create(model=model_name,
-                                                     prompt=prompt,
-                                                     max_tokens=5,
-                                                     temperature=0.0,
-                                                     echo=True,
-                                                     logprobs=logprobs_arg)
-
-        prompt_text = tokenizer.decode(prompt) if isinstance(prompt,
-                                                             list) else prompt
-        assert re.search(r"^" + prompt_text, completion.choices[0].text)
-        logprobs = completion.choices[0].logprobs
-        assert logprobs is not None
-        assert len(logprobs.text_offset) > 5
-        assert (len(logprobs.token_logprobs) > 5
-                and logprobs.token_logprobs[0] is None)
-        assert (len(logprobs.top_logprobs) > 5
-                and logprobs.top_logprobs[0] is None)
-        for top_logprobs in logprobs.top_logprobs[1:]:
-            assert max(logprobs_arg,
-                       1) <= len(top_logprobs) <= logprobs_arg + 1
-        assert len(logprobs.tokens) > 5
-
-
-@pytest.mark.asyncio
-async def test_structured_outputs_type_error(client: openai.AsyncOpenAI,
-                                             sample_json_schema, sample_regex,
-                                             is_v1_server: bool):
-    if not is_v1_server:
-        pytest.skip("structured outputs is only supported in v1 engine")
-
-    with pytest.raises(openai.BadRequestError):
-        _ = await client.completions.create(
-            model=MODEL_NAME,
-            prompt="Give an example JSON that fits this schema: 42",
-            extra_body=dict(structured_outputs=dict(json=42)))
-
-    with pytest.raises(openai.BadRequestError):
-        _ = await client.completions.create(
-            model=MODEL_NAME,
-            prompt="Give an example string that fits this regex",
-            extra_body=dict(structured_outputs=dict(
-                regex=sample_regex,
-                json=sample_json_schema,
-            )))
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize(
-    "model_name,stream,echo",
-    [
-        (MODEL_NAME, False, False),
-        (MODEL_NAME, False, True),
-        (MODEL_NAME, True, False),
-        (MODEL_NAME, True, True)  # should not raise BadRequestError error
-    ],
-)
-async def test_echo_stream_completion(client: openai.AsyncOpenAI,
-                                      model_name: str, stream: bool,
-                                      echo: bool):
-    saying: str = "Hello, my name is"
-    result = await client.completions.create(model=model_name,
-                                             prompt=saying,
-                                             max_tokens=10,
-                                             temperature=0.0,
-                                             echo=echo,
-                                             stream=stream)
-
-    stop_reason = "length"
-
-    if not stream:
-        completion = result
-        assert completion.id is not None
-        assert completion.choices is not None and len(completion.choices) == 1
-
-        choice = completion.choices[0]
-        assert len(choice.text) >= 5
-        assert choice.finish_reason == stop_reason
-
-        if echo:
-            assert choice.text is not None and saying in choice.text
-        else:
-            assert choice.text is not None and saying not in choice.text
-
-    else:
-        chunks: list[str] = []
-        final_finish_reason = None
-        async for chunk in result:
-            if chunk.choices and chunk.choices[0].text:
-                chunks.append(chunk.choices[0].text)
-            if chunk.choices and chunk.choices[0].finish_reason:
-                final_finish_reason = chunk.choices[0].finish_reason
-
-        assert final_finish_reason == stop_reason
-        content = "".join(chunks)
-        if echo:
-            assert content is not None and saying in content
-        else:
-            assert content is not None and saying not in content
-
-
-@pytest.mark.asyncio
-async def test_invocations(server: RemoteOpenAIServer,
-                           client: openai.AsyncOpenAI):
-    request_args = {
-        "model": MODEL_NAME,
-        "prompt": "Hello, my name is",
-        "max_tokens": 5,
-        "temperature": 0.0,
-        "logprobs": None,
-    }
-
-    completion = await client.completions.create(**request_args)
-
-    invocation_response = requests.post(server.url_for("invocations"),
-                                        json=request_args)
-    invocation_response.raise_for_status()
-
-    completion_output = completion.model_dump()
-    invocation_output = invocation_response.json()
-
-    assert completion_output.keys() == invocation_output.keys()
-    assert completion_output["choices"] == invocation_output["choices"]
--- a/tests/entrypoints/openai/test_completion_with_prompt_embeds.py
+++ b/tests/entrypoints/openai/test_completion_with_prompt_embeds.py
@ -14,6 +14,9 @@ from transformers import AutoConfig

 from ...utils import RemoteOpenAIServer

+pytest.skip("Skipping prompt_embeds test until V1 supports it.",
+            allow_module_level=True)
+
 # any model with a chat template should work here
 MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"

--- a/tests/entrypoints/openai/test_lora_adapters.py
+++ b/tests/entrypoints/openai/test_lora_adapters.py
@ -53,12 +53,13 @@ def monkeypatch_module():
    mpatch.undo()


-@pytest.fixture(scope="module", params=[False, True])
+@pytest.fixture(scope="module", params=[True])
 def server_with_lora_modules_json(request, monkeypatch_module,
                                  zephyr_lora_files):

    use_v1 = request.param
-    monkeypatch_module.setenv('VLLM_USE_V1', '1' if use_v1 else '0')
+    assert use_v1
+    monkeypatch_module.setenv('VLLM_USE_V1', '1')

    # Define the json format LoRA module configurations
    lora_module_1 = {
--- a/tests/entrypoints/openai/test_metrics.py
+++ b/tests/entrypoints/openai/test_metrics.py
@ -22,7 +22,7 @@ MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
 PREV_MINOR_VERSION = version._prev_minor_version()


-@pytest.fixture(scope="module", params=[True, False])
+@pytest.fixture(scope="module", params=[True])
 def use_v1(request):
    # Module-scoped variant of run_with_both_engines
    #
--- a/tests/entrypoints/openai/test_return_tokens_as_ids.py
+++ b/tests/entrypoints/openai/test_return_tokens_as_ids.py
@ -10,8 +10,30 @@ import pytest
 from vllm.transformers_utils.tokenizer import get_tokenizer

 from ...utils import RemoteOpenAIServer
-from .test_completion import default_server_args  # noqa: F401
-from .test_completion import MODEL_NAME
+
+MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
+
+
+@pytest.fixture(scope="module")
+def default_server_args(zephyr_lora_files):
+    return [
+        # use half precision for speed and memory savings in CI environment
+        "--dtype",
+        "bfloat16",
+        "--max-model-len",
+        "8192",
+        "--max-num-seqs",
+        "128",
+        "--enforce-eager",
+        # lora config
+        "--enable-lora",
+        "--lora-modules",
+        f"zephyr-lora={zephyr_lora_files}",
+        "--max-lora-rank",
+        "64",
+        "--max-cpu-loras",
+        "2",
+    ]


@pytest.fixture(scope="module")
--- a/tests/entrypoints/openai/test_skip_tokenizer.py
+++ b/tests/entrypoints/openai/test_skip_tokenizer.py
@ -15,14 +15,6 @@ MODEL_NAME = "ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11"
 DTYPE = "float16"


-@pytest.fixture(autouse=True)
-def v1(run_with_both_engines):
-    # Simple autouse wrapper to run both engines for each test
-    # This can be promoted up to conftest.py to run for every
-    # test in a package
-    pass
-
-
@pytest.fixture(scope="module")
 def server():
    args = [
--- a/tests/v1/kv_offload/test_worker.py
+++ b/tests/v1/kv_offload/test_worker.py
@ -0,0 +1,152 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from vllm.v1.kv_offload.abstract import LoadStoreSpec
+from vllm.v1.kv_offload.worker.worker import (OffloadingHandler,
+                                              OffloadingWorker, TransferResult,
+                                              TransferSpec)
+
+
+class LoadStoreSpec1(LoadStoreSpec):
+
+    def __init__(self,
+                 submit_success: bool = True,
+                 async_success: bool = True,
+                 exception: bool = False):
+        self.finished = False
+        self.submit_success = submit_success
+        self.async_success = async_success
+        self.exception = exception
+
+    @staticmethod
+    def medium() -> str:
+        return "1"
+
+    def __repr__(self):
+        return f"{self.medium()}: {id(self)}"
+
+
+class LoadStoreSpec2(LoadStoreSpec):
+
+    @staticmethod
+    def medium() -> str:
+        return "2"
+
+    def __repr__(self):
+        return f"{self.medium()}: {id(self)}"
+
+
+class OffloadingHandler1To2(OffloadingHandler):
+
+    def __init__(self):
+        self.transfers: dict[int, LoadStoreSpec1] = {}
+
+    def transfer_async(self, job_id: int, spec: TransferSpec) -> bool:
+        src, dst = spec
+        assert isinstance(src, LoadStoreSpec1)
+        assert isinstance(dst, LoadStoreSpec2)
+
+        if src.exception:
+            raise Exception("An expected exception. Don't worry!")
+        if not src.submit_success:
+            return False
+
+        self.transfers[job_id] = src
+        return True
+
+    def get_finished(self) -> list[TransferResult]:
+        finished = []
+        for job_id, spec in list(self.transfers.items()):
+            if spec.finished:
+                finished.append((job_id, spec.async_success))
+                del self.transfers[job_id]
+        return finished
+
+
+class OffloadingHandler2To1(OffloadingHandler):
+
+    def __init__(self):
+        self.transfers: dict[int, LoadStoreSpec1] = {}
+
+    def transfer_async(self, job_id: int, spec: TransferSpec) -> bool:
+        src, dst = spec
+        assert isinstance(src, LoadStoreSpec2)
+        assert isinstance(dst, LoadStoreSpec1)
+
+        self.transfers[job_id] = dst
+        return True
+
+    def get_finished(self) -> list[TransferResult]:
+        finished = []
+        for job_id, spec in list(self.transfers.items()):
+            if spec.finished:
+                finished.append((job_id, spec.async_success))
+                del self.transfers[job_id]
+        return finished
+
+
+def test_offloading_worker():
+    """
+    Tests OffloadingWorker with 2 handlers.
+    One handler performs 1->2 transfers, and the other handles 2->1.
+    """
+    worker = OffloadingWorker()
+    handler1to2 = OffloadingHandler1To2()
+    handler2to1 = OffloadingHandler2To1()
+    worker.register_handler(LoadStoreSpec1, LoadStoreSpec2, handler1to2)
+    worker.register_handler(LoadStoreSpec2, LoadStoreSpec1, handler2to1)
+
+    # 1st transfer 1->2 (exception)
+    src1 = LoadStoreSpec1(exception=True)
+    dst1 = LoadStoreSpec2()
+    assert not worker.transfer_async(1, (src1, dst1))
+
+    # 2ed transfer 1->2 (failure to submit)
+    src2 = LoadStoreSpec1(submit_success=False)
+    dst2 = LoadStoreSpec2()
+    assert not worker.transfer_async(2, (src2, dst2))
+
+    # 3rd transfer 1->2 (failure)
+    src3 = LoadStoreSpec1(async_success=False)
+    dst3 = LoadStoreSpec2()
+    assert worker.transfer_async(3, (src3, dst3))
+
+    # 4th transfer 1->2 (success)
+    src4 = LoadStoreSpec1()
+    dst4 = LoadStoreSpec2()
+    worker.transfer_async(4, (src4, dst4))
+    assert set(handler1to2.transfers.keys()) == {3, 4}
+
+    # 5th transfer 2->1
+    src5 = LoadStoreSpec2()
+    dst5 = LoadStoreSpec1()
+    worker.transfer_async(5, (src5, dst5))
+    assert set(handler2to1.transfers.keys()) == {5}
+
+    # no transfer completed yet
+    assert worker.get_finished() == []
+
+    # complete 3rd, 4th
+    src3.finished = True
+    src4.finished = True
+
+    # 6th transfer 1->2
+    src6 = LoadStoreSpec1()
+    dst6 = LoadStoreSpec2()
+    worker.transfer_async(6, (src6, dst6))
+
+    # 7th transfer 2->1
+    src7 = LoadStoreSpec2()
+    dst7 = LoadStoreSpec1()
+    worker.transfer_async(7, (src7, dst7))
+
+    # 6th and 7th transfers started
+    assert 6 in handler1to2.transfers
+    assert 7 in handler2to1.transfers
+
+    # verify result of 3rd and 4th transfers
+    assert (sorted(worker.get_finished()) == [(3, False), (4, True)])
+
+    # complete 6th and 7th transfers
+    src6.finished = True
+    dst7.finished = True
+    assert (sorted(worker.get_finished()) == [(6, True), (7, True)])
--- a/tests/v1/test_oracle.py
+++ b/tests/v1/test_oracle.py
@ -7,7 +7,6 @@ import pytest
 import vllm.envs as envs
 from vllm import LLM
 from vllm.engine.arg_utils import AsyncEngineArgs
-from vllm.engine.async_llm_engine import AsyncLLMEngine

 MODEL = "meta-llama/Llama-3.2-1B-Instruct"

@ -96,20 +95,3 @@ def test_v1_attn_backend(monkeypatch):
        _ = AsyncEngineArgs(model=MODEL).create_engine_config()
        assert envs.VLLM_USE_V1
        m.delenv("VLLM_USE_V1")
-
-
-def test_reject_using_constructor_directly(monkeypatch):
-    with monkeypatch.context() as m:
-        if os.getenv("VLLM_USE_V1", None):
-            m.delenv("VLLM_USE_V1")
-
-        # Sets VLLM_USE_V1=1.
-        vllm_config = AsyncEngineArgs(model=MODEL).create_engine_config()
-
-        # This uses the V0 constructor directly.
-        with pytest.raises(ValueError):
-            AsyncLLMEngine(vllm_config,
-                           AsyncLLMEngine._get_executor_cls(vllm_config),
-                           log_stats=True)
-
-        m.delenv("VLLM_USE_V1")
--- a/vllm/config/compilation.py
+++ b/vllm/config/compilation.py
@ -563,18 +563,6 @@ class CompilationConfig:
                self.cudagraph_mode = CUDAGraphMode.FULL
            self.splitting_ops = []

-        if envs.VLLM_ALL2ALL_BACKEND == "deepep_high_throughput":
-            # exclude MoE dispatch/combine from capture by ensuring
-            # piecewise splitting includes them, so communication remains
-            # outside CUDA graphs while compute can still be graphed.
-            moe_ops = [
-                "vllm.moe_forward",
-                "vllm.moe_forward_shared",
-            ]
-            for op in moe_ops:
-                if op not in self.splitting_ops:
-                    self.splitting_ops.append(op)
-
    def splitting_ops_contain_attention(self) -> bool:
        return self.splitting_ops is not None and all(
            op in self.splitting_ops for op in self._attention_ops)
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
--- a/vllm/engine/async_timeout.py
+++ b/vllm/engine/async_timeout.py
@ -1,173 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-# Workaround for https://github.com/python/cpython/issues/86296
-#
-# From https://github.com/aio-libs/async-timeout/blob/master/async_timeout/__init__.py
-# Licensed under the Apache License (Apache-2.0)
-
-import asyncio
-import enum
-import sys
-from types import TracebackType
-from typing import Any, Optional, Type
-
-if sys.version_info[:2] >= (3, 11):
-    from asyncio import timeout as asyncio_timeout
-else:
-
-    class _State(enum.Enum):
-        INIT = "INIT"
-        ENTER = "ENTER"
-        TIMEOUT = "TIMEOUT"
-        EXIT = "EXIT"
-
-    class Timeout:
-        # Internal class, please don't instantiate it directly
-        # Use timeout() and timeout_at() public factories instead.
-        #
-        # Implementation note: `async with timeout()` is preferred
-        # over `with timeout()`.
-        # While technically the Timeout class implementation
-        # doesn't need to be async at all,
-        # the `async with` statement explicitly points that
-        # the context manager should be used from async function context.
-        #
-        # This design allows to avoid many silly misusages.
-        #
-        # TimeoutError is raised immediately when scheduled
-        # if the deadline is passed.
-        # The purpose is to time out as soon as possible
-        # without waiting for the next await expression.
-
-        __slots__ = ("_deadline", "_loop", "_state", "_timeout_handler")
-
-        def __init__(self, deadline: Optional[float],
-                     loop: asyncio.AbstractEventLoop) -> None:
-            self._loop = loop
-            self._state = _State.INIT
-
-            self._timeout_handler = None  # type: Optional[asyncio.Handle]
-            if deadline is None:
-                self._deadline = None  # type: Optional[float]
-            else:
-                self.update(deadline)
-
-        async def __aenter__(self) -> "Timeout":
-            self._do_enter()
-            return self
-
-        async def __aexit__(
-            self,
-            exc_type: Optional[Type[BaseException]],
-            exc_val: Optional[BaseException],
-            exc_tb: Optional[TracebackType],
-        ) -> Optional[bool]:
-            self._do_exit(exc_type)
-            return None
-
-        @property
-        def expired(self) -> bool:
-            """Is timeout expired during execution?"""
-            return self._state == _State.TIMEOUT
-
-        @property
-        def deadline(self) -> Optional[float]:
-            return self._deadline
-
-        def reject(self) -> None:
-            """Reject scheduled timeout if any."""
-            # cancel is maybe better name but
-            # task.cancel() raises CancelledError in asyncio world.
-            if self._state not in (_State.INIT, _State.ENTER):
-                raise RuntimeError(f"invalid state {self._state.value}")
-            self._reject()
-
-        def _reject(self) -> None:
-            if self._timeout_handler is not None:
-                self._timeout_handler.cancel()
-                self._timeout_handler = None
-
-        def shift(self, delay: float) -> None:
-            """Advance timeout on delay seconds.
-            The delay can be negative.
-            Raise RuntimeError if shift is called when deadline is not scheduled
-            """
-            deadline = self._deadline
-            if deadline is None:
-                raise RuntimeError(
-                    "cannot shift timeout if deadline is not scheduled")
-            self.update(deadline + delay)
-
-        def update(self, deadline: float) -> None:
-            """Set deadline to absolute value.
-            deadline argument points on the time in the same clock system
-            as loop.time().
-            If new deadline is in the past the timeout is raised immediately.
-            Please note: it is not POSIX time but a time with
-            undefined starting base, e.g. the time of the system power on.
-            """
-            if self._state == _State.EXIT:
-                raise RuntimeError(
-                    "cannot reschedule after exit from context manager")
-            if self._state == _State.TIMEOUT:
-                raise RuntimeError("cannot reschedule expired timeout")
-            if self._timeout_handler is not None:
-                self._timeout_handler.cancel()
-            self._deadline = deadline
-            if self._state != _State.INIT:
-                self._reschedule()
-
-        def _reschedule(self) -> None:
-            assert self._state == _State.ENTER
-            deadline = self._deadline
-            if deadline is None:
-                return
-
-            now = self._loop.time()
-            if self._timeout_handler is not None:
-                self._timeout_handler.cancel()
-
-            task = asyncio.current_task()
-            if deadline <= now:
-                self._timeout_handler = self._loop.call_soon(
-                    self._on_timeout, task)
-            else:
-                self._timeout_handler = self._loop.call_at(
-                    deadline, self._on_timeout, task)
-
-        def _do_enter(self) -> None:
-            if self._state != _State.INIT:
-                raise RuntimeError(f"invalid state {self._state.value}")
-            self._state = _State.ENTER
-            self._reschedule()
-
-        def _do_exit(self, exc_type: Optional[Type[BaseException]]) -> None:
-            if exc_type is asyncio.CancelledError and \
-                    self._state == _State.TIMEOUT:
-                self._timeout_handler = None
-                raise asyncio.TimeoutError
-            # timeout has not expired
-            self._state = _State.EXIT
-            self._reject()
-            return None
-
-        def _on_timeout(self, task: "Optional[asyncio.Task[Any]]") -> None:
-            if task:
-                task.cancel()
-            self._state = _State.TIMEOUT
-            # drop the reference early
-            self._timeout_handler = None
-
-    def asyncio_timeout(delay: Optional[float]) -> Timeout:
-        """timeout context manager.
-        Useful in cases when you want to apply timeout logic around block
-        of code or in cases when asyncio.wait_for is not suitable. For example:
-        >>> async with timeout(0.001):
-        ...     async with aiohttp.get('https://github.com') as r:
-        ...         await r.text()
-        delay - value in seconds or None to disable timeout logic
-        """
-        loop = asyncio.get_running_loop()
-        deadline = loop.time() + delay if delay is not None else None
-        return Timeout(deadline, loop)
--- a/vllm/entrypoints/launcher.py
+++ b/vllm/entrypoints/launcher.py
@ -11,7 +11,6 @@ import uvicorn
 from fastapi import FastAPI, Request, Response

 from vllm import envs
-from vllm.engine.async_llm_engine import AsyncEngineDeadError
 from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.constants import (H11_MAX_HEADER_COUNT_DEFAULT,
                                        H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT)
@ -154,7 +153,6 @@ def _add_shutdown_handlers(app: FastAPI, server: uvicorn.Server) -> None:
    """

    @app.exception_handler(RuntimeError)
-    @app.exception_handler(AsyncEngineDeadError)
    @app.exception_handler(EngineDeadError)
    @app.exception_handler(EngineGenerateError)
    async def runtime_exception_handler(request: Request, __):
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@ -38,7 +38,6 @@ from typing_extensions import assert_never
 import vllm.envs as envs
 from vllm.config import VllmConfig
 from vllm.engine.arg_utils import AsyncEngineArgs
-from vllm.engine.async_llm_engine import AsyncLLMEngine  # type: ignore
 from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.chat_utils import (load_chat_template,
                                         resolve_hf_chat_template,
@ -201,50 +200,34 @@ async def build_async_engine_client_from_engine_args(
    vllm_config = engine_args.create_engine_config(usage_context=usage_context)

    # V1 AsyncLLM.
-    if envs.VLLM_USE_V1:
-        if disable_frontend_multiprocessing:
-            logger.warning(
-                "V1 is enabled, but got --disable-frontend-multiprocessing. "
-                "To disable frontend multiprocessing, set VLLM_USE_V1=0.")
+    assert envs.VLLM_USE_V1

-        from vllm.v1.engine.async_llm import AsyncLLM
-        async_llm: Optional[AsyncLLM] = None
-        client_count = client_config.pop(
-            "client_count") if client_config else 1
-        client_index = client_config.pop(
-            "client_index") if client_config else 0
-        try:
-            async_llm = AsyncLLM.from_vllm_config(
-                vllm_config=vllm_config,
-                usage_context=usage_context,
-                enable_log_requests=engine_args.enable_log_requests,
-                disable_log_stats=engine_args.disable_log_stats,
-                client_addresses=client_config,
-                client_count=client_count,
-                client_index=client_index)
+    if disable_frontend_multiprocessing:
+        logger.warning(
+            "V1 is enabled, but got --disable-frontend-multiprocessing. "
+            "To disable frontend multiprocessing, set VLLM_USE_V1=0.")

-            # Don't keep the dummy data in memory
-            await async_llm.reset_mm_cache()
+    from vllm.v1.engine.async_llm import AsyncLLM
+    async_llm: Optional[AsyncLLM] = None
+    client_count = client_config.pop("client_count") if client_config else 1
+    client_index = client_config.pop("client_index") if client_config else 0
+    try:
+        async_llm = AsyncLLM.from_vllm_config(
+            vllm_config=vllm_config,
+            usage_context=usage_context,
+            enable_log_requests=engine_args.enable_log_requests,
+            disable_log_stats=engine_args.disable_log_stats,
+            client_addresses=client_config,
+            client_count=client_count,
+            client_index=client_index)

-            yield async_llm
-        finally:
-            if async_llm:
-                async_llm.shutdown()
+        # Don't keep the dummy data in memory
+        await async_llm.reset_mm_cache()

-    # V0 AsyncLLM.
-    else:
-
-        engine_client: Optional[EngineClient] = None
-        try:
-            engine_client = AsyncLLMEngine.from_vllm_config(
-                vllm_config=vllm_config,
-                usage_context=usage_context,
-                enable_log_requests=engine_args.enable_log_requests,
-                disable_log_stats=engine_args.disable_log_stats)
-            yield engine_client
-        finally:
-            if engine_client and hasattr(engine_client, "shutdown"):
-                engine_client.shutdown()
+        yield async_llm
+    finally:
+        if async_llm:
+            async_llm.shutdown()


 async def validate_json_request(raw_request: Request):
--- a/vllm/model_executor/models/init.py
+++ b/vllm/model_executor/models/init.py
@ -1,10 +1,11 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

-from .interfaces import (HasInnerState, SupportsLoRA, SupportsMultiModal,
-                         SupportsPP, SupportsTranscription, SupportsV0Only,
-                         has_inner_state, supports_lora, supports_multimodal,
-                         supports_pp, supports_transcription, supports_v0_only)
+from .interfaces import (HasInnerState, SupportsLoRA, SupportsMRoPE,
+                         SupportsMultiModal, SupportsPP, SupportsTranscription,
+                         SupportsV0Only, has_inner_state, supports_lora,
+                         supports_mrope, supports_multimodal, supports_pp,
+                         supports_transcription, supports_v0_only)
 from .interfaces_base import (VllmModelForPooling, VllmModelForTextGeneration,
                              is_pooling_model, is_text_generation_model)
 from .registry import ModelRegistry
@ -21,6 +22,8 @@ __all__ = [
    "supports_lora",
    "SupportsMultiModal",
    "supports_multimodal",
+    "SupportsMRoPE",
+    "supports_mrope",
    "SupportsPP",
    "supports_pp",
    "SupportsTranscription",
--- a/vllm/model_executor/models/gpt_oss.py
+++ b/vllm/model_executor/models/gpt_oss.py
@ -76,7 +76,6 @@ class OAIAttention(nn.Module):

        self.sinks = torch.nn.Parameter(
            torch.empty(config.num_attention_heads // tp_size,
-                        dtype=torch.bfloat16,
                        requires_grad=False))

        self.q_size = self.num_attention_heads * self.head_dim // tp_size
@ -145,8 +144,7 @@ class MLPBlock(torch.nn.Module):
        self.experts_per_token = config.num_experts_per_tok
        self.world_size = dist.get_world_size() if dist.is_initialized() else 1
        self.router = torch.nn.Linear(config.hidden_size,
-                                      config.num_local_experts,
-                                      dtype=torch.bfloat16)
+                                      config.num_local_experts)
        assert config.intermediate_size % self.world_size == 0
        self.experts = FusedMoE(num_experts=config.num_local_experts,
                                top_k=config.num_experts_per_tok,
--- a/vllm/model_executor/models/interfaces.py
+++ b/vllm/model_executor/models/interfaces.py
@ -8,6 +8,7 @@ from typing import (TYPE_CHECKING, ClassVar, Literal, Optional, Protocol,
 import numpy as np
 import torch
 from torch import Tensor
+from transformers import PretrainedConfig
 from transformers.models.whisper.tokenization_whisper import LANGUAGES
 from typing_extensions import Self, TypeIs

@ -852,3 +853,70 @@ def supports_eagle3(
    model: Union[type[object], object],
 ) -> Union[TypeIs[type[SupportsEagle3]], TypeIs[SupportsEagle3]]:
    return isinstance(model, SupportsEagle3)
+
+
+@runtime_checkable
+class SupportsMRoPE(Protocol):
+    """The interface required for all models that support M-RoPE."""
+
+    supports_mrope: ClassVar[Literal[True]] = True
+    """
+    A flag that indicates this model supports M-RoPE.
+    
+    Note:
+        There is no need to redefine this flag if this class is in the
+        MRO of your model class.
+    """
+
+    def get_mrope_input_positions(
+        self,
+        input_tokens: list[int],
+        hf_config: PretrainedConfig,
+        image_grid_thw: Optional[Union[list[list[int]], torch.Tensor]],
+        video_grid_thw: Optional[Union[list[list[int]], torch.Tensor]],
+        second_per_grid_ts: Optional[list[float]] = None,
+        context_len: int = 0,
+        seq_len: Optional[int] = None,
+        audio_feature_lengths: Optional[torch.Tensor] = None,
+        use_audio_in_video: bool = False,
+    ) -> tuple[torch.Tensor, int]:
+        """
+        Get M-RoPE input positions and delta value for this specific model.
+        
+        This method should be implemented by each model that supports M-RoPE
+        to provide model-specific logic for computing input positions.
+        
+        Args:
+            input_tokens: List of input token IDs
+            hf_config: HuggingFace model configuration
+            image_grid_thw: Image grid dimensions (t, h, w)
+            video_grid_thw: Video grid dimensions (t, h, w)
+            second_per_grid_ts: Seconds per grid timestep for videos
+            context_len: Context length
+            seq_len: Sequence length
+            audio_feature_lengths: Audio feature lengths for multimodal models
+            use_audio_in_video: Whether to use audio in video for interleaving
+            
+        Returns:
+            Tuple of (llm_positions, mrope_position_delta)
+            - llm_positions: Tensor of shape [3, num_tokens]
+                with T/H/W positions
+            - mrope_position_delta: Delta for position calculations
+        """
+        ...
+
+
+@overload
+def supports_mrope(model: type[object]) -> TypeIs[type[SupportsMRoPE]]:
+    ...
+
+
+@overload
+def supports_mrope(model: object) -> TypeIs[SupportsMRoPE]:
+    ...
+
+
+def supports_mrope(
+    model: Union[type[object], object],
+) -> Union[TypeIs[type[SupportsMRoPE]], TypeIs[SupportsMRoPE]]:
+    return isinstance(model, SupportsMRoPE)
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@ -32,7 +32,7 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from einops import rearrange, repeat
-from transformers import AutoConfig, BatchFeature
+from transformers import AutoConfig, BatchFeature, PretrainedConfig
 from transformers.models.qwen2_vl import (Qwen2VLImageProcessor,
                                          Qwen2VLProcessor)
 from transformers.models.qwen2_vl.configuration_qwen2_vl import (
@ -73,7 +73,7 @@ from vllm.transformers_utils.config import uses_mrope
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.utils.tensor_schema import TensorSchema, TensorShape

-from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
+from .interfaces import (MultiModalEmbeddings, SupportsLoRA, SupportsMRoPE,
                         SupportsMultiModal, SupportsPP)
 from .utils import (AutoWeightsLoader, WeightsMapper,
                    init_vllm_registered_model, maybe_prefix,
@ -1096,7 +1096,7 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor[Qwen2VLProcessingInfo]
                                        info=Qwen2VLProcessingInfo,
                                        dummy_inputs=Qwen2VLDummyInputsBuilder)
 class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
-                                      SupportsLoRA, SupportsPP):
+                                      SupportsLoRA, SupportsPP, SupportsMRoPE):

    # To ensure correct weight loading and mapping.
    hf_to_vllm_mapper = WeightsMapper(
@ -1109,6 +1109,118 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
            "model.": "language_model.model.",
        })

+    def get_mrope_input_positions(
+        self,
+        input_tokens: list[int],
+        hf_config: PretrainedConfig,
+        image_grid_thw: Optional[Union[list[list[int]], torch.Tensor]],
+        video_grid_thw: Optional[Union[list[list[int]], torch.Tensor]],
+        second_per_grid_ts: Optional[list[float]] = None,
+        context_len: int = 0,
+        seq_len: Optional[int] = None,
+        audio_feature_lengths: Optional[torch.Tensor] = None,
+        use_audio_in_video: bool = False,
+    ) -> tuple[torch.Tensor, int]:
+        """Get M-RoPE input positions for Qwen2-VL model."""
+        if image_grid_thw is None:
+            image_grid_thw = []
+        if video_grid_thw is None:
+            video_grid_thw = []
+        if second_per_grid_ts is None:
+            second_per_grid_ts = []
+
+        image_token_id = hf_config.image_token_id
+        video_token_id = hf_config.video_token_id
+        vision_start_token_id = hf_config.vision_start_token_id
+        spatial_merge_size = hf_config.vision_config.spatial_merge_size
+        tokens_per_second = getattr(hf_config.vision_config,
+                                    "tokens_per_second", 1.0)
+
+        input_tokens_tensor = torch.tensor(input_tokens)
+        vision_start_indices = torch.argwhere(
+            input_tokens_tensor == vision_start_token_id).squeeze(1)
+        vision_tokens = input_tokens_tensor[vision_start_indices + 1]
+        image_nums = (vision_tokens == image_token_id).sum()
+        video_nums = (vision_tokens == video_token_id).sum()
+        llm_pos_ids_list: list = []
+
+        st = 0
+        remain_images, remain_videos = image_nums, video_nums
+
+        image_index, video_index = 0, 0
+        for _ in range(image_nums + video_nums):
+            video_second_per_grid_t = 0.0
+            if remain_images > 0:
+                try:
+                    ed_image = input_tokens.index(image_token_id, st)
+                except ValueError:
+                    ed_image = len(input_tokens) + 1
+            else:
+                ed_image = len(input_tokens) + 1
+            if remain_videos > 0:
+                try:
+                    ed_video = input_tokens.index(video_token_id, st)
+                except ValueError:
+                    ed_video = len(input_tokens) + 1
+            else:
+                ed_video = len(input_tokens) + 1
+            if ed_image < ed_video:
+                t, h, w = (
+                    image_grid_thw[image_index][0],
+                    image_grid_thw[image_index][1],
+                    image_grid_thw[image_index][2],
+                )
+                image_index += 1
+                remain_images -= 1
+                ed = ed_image
+            else:
+                t, h, w = (
+                    video_grid_thw[video_index][0],
+                    video_grid_thw[video_index][1],
+                    video_grid_thw[video_index][2],
+                )
+                video_second_per_grid_t = 1.0
+                if second_per_grid_ts:
+                    video_second_per_grid_t = second_per_grid_ts[video_index]
+                video_index += 1
+                remain_videos -= 1
+                ed = ed_video
+
+            llm_grid_t, llm_grid_h, llm_grid_w = \
+                t, h // spatial_merge_size, w // spatial_merge_size
+            text_len = ed - st
+
+            st_idx = llm_pos_ids_list[-1].max() + 1 if len(
+                llm_pos_ids_list) > 0 else 0
+            llm_pos_ids_list.append(
+                torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)
+
+            t_index = (torch.arange(llm_grid_t).view(-1, 1).expand(
+                -1, llm_grid_h * llm_grid_w) * video_second_per_grid_t *
+                       tokens_per_second).long().flatten()
+
+            h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(
+                llm_grid_t, -1, llm_grid_w).flatten()
+            w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(
+                llm_grid_t, llm_grid_h, -1).flatten()
+            llm_pos_ids_list.append(
+                torch.stack([t_index, h_index, w_index]) + text_len + st_idx)
+            st = ed + llm_grid_t * llm_grid_h * llm_grid_w
+
+        if st < len(input_tokens):
+            st_idx = llm_pos_ids_list[-1].max() + 1 if len(
+                llm_pos_ids_list) > 0 else 0
+            text_len = len(input_tokens) - st
+            llm_pos_ids_list.append(
+                torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)
+
+        llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
+        mrope_position_delta = (llm_positions.max() + 1 -
+                                len(input_tokens)).item()
+        llm_positions = llm_positions[:, context_len:seq_len]
+
+        return llm_positions, mrope_position_delta
+
    @classmethod
    def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
        if modality.startswith("image"):
--- a/vllm/model_executor/warmup/deep_gemm_warmup.py
+++ b/vllm/model_executor/warmup/deep_gemm_warmup.py
@ -36,7 +36,7 @@ def _extract_data_from_linear_base_module(
    assert m.quant_method.quant_config is not None

    w = m.weight
-    ws = m.weight_scale_inv
+    ws = m.weight_scale
    quant_block_size = m.quant_method.quant_config.weight_block_size

    assert isinstance(w, torch.Tensor)
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@ -191,14 +191,17 @@ class CudaPlatformBase(Platform):
        compilation_config = vllm_config.compilation_config
        if (envs.VLLM_ALL2ALL_BACKEND == "deepep_high_throughput"
                and parallel_config.data_parallel_size > 1
-                and compilation_config.cudagraph_mode
-                not in [CUDAGraphMode.NONE, CUDAGraphMode.PIECEWISE]):
+                and compilation_config.cudagraph_mode != CUDAGraphMode.NONE):
+            # TODO: Piecewise Cuda graph might be enabled
+            # if torch compile cache key issue fixed
+            # See https://github.com/vllm-project/vllm/pull/25093
            logger.info(
-                "Data Parallel with DeepEP high-throughput: using PIECEWISE "
-                "CUDA graphs and excluding MoE ops from capture. Set "
-                "VLLM_ALL2ALL_BACKEND=deepep_low_latency if you need MoE "
-                "graphs captured as well.")
-            compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
+                "Data Parallel: disabling cudagraphs since DP "
+                "with DeepEP high-throughput kernels are not CUDA Graph "
+                "compatible. The DeepEP low-latency kernels are CUDA Graph "
+                "compatible. Set the all_to_all backend to deepep_low_latency "
+                "to use those kernels instead.")
+            compilation_config.cudagraph_mode = CUDAGraphMode.NONE

    @classmethod
    def get_current_memory_usage(cls,
--- a/vllm/v1/attention/backends/mla/cutlass_mla.py
+++ b/vllm/v1/attention/backends/mla/cutlass_mla.py
@ -206,12 +206,11 @@ class CutlassMLAImpl(MLACommonImpl[MLACommonMetadata]):
        )

        if H < MAX_HEADS:
-            # Extract the subsets of the outputs
-            returned_lse = lse[:, :H].contiguous(
-            ) if self.need_to_return_lse_for_decode else lse
            out = out[:, :H]
+            if self.need_to_return_lse_for_decode:
+                lse = lse[:, :H].contiguous()

-        return out, returned_lse
+        return out, lse

    def _forward_decode(
        self,
--- a/vllm/v1/kv_offload/abstract.py
+++ b/vllm/v1/kv_offload/abstract.py
@ -0,0 +1,165 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+OffloadingManager class for managing KV data offloading in vLLM v1
+
+This class runs in the scheduler, tracks which blocks are offloaded
+and their address.
+
+The class provides the following primitives:
+    lookup() - find the length of the maximal series of blocks,
+        starting from the first one, that are all offloaded.
+    prepare_load() - prepare given blocks to be read.
+        The given blocks will be protected from eviction.
+        This function returns a LoadSpec which encapsulates
+        information required for performing the load.
+    touch() - marks the give blocks as recently used. Can be used
+        to track block's LRU. This function is separated from the
+        prepare_load function to allow setting block recency even
+        for blocks which do not need reading from the cache, such as
+        blocks that are cached by the GPU prefix cache.
+    complete_load() - mark blocks which were previously prepared to be
+        loaded as done loading. This is to re-allow their eviction.
+    prepare_store() - prepare the given blocks to be written.
+        Returns a StoreSpec encapsulating offloading information,
+        as well as a list of blocks that were evicted as a result.
+    complete_store() - marks a previous store as completed.
+        Following this call, the given blocks will become loadable.
+"""
+
+from abc import ABC, abstractmethod
+from collections.abc import Iterable
+from dataclasses import dataclass
+from typing import Optional
+
+from vllm.v1.core.kv_cache_utils import BlockHash
+
+
+class LoadStoreSpec(ABC):
+    """
+    Abstract metadata that encapsulates information allowing a worker
+    to load, and optionally also to store, blocks of KV data.
+    """
+
+    @staticmethod
+    @abstractmethod
+    def medium() -> str:
+        """
+        Returns a string representation of the medium type
+        this store/load targets.
+        """
+        pass
+
+
+@dataclass
+class PrepareStoreOutput:
+    block_hashes_to_store: list[BlockHash]
+    store_spec: LoadStoreSpec
+    block_hashes_evicted: list[BlockHash]
+
+
+@dataclass
+class OffloadingEvent:
+    block_hashes: list[BlockHash]
+    block_size: int
+    medium: str
+    # True if blocks are removed, False if stored
+    removed: bool
+
+
+class OffloadingManager(ABC):
+
+    @abstractmethod
+    def lookup(self, block_hashes: Iterable[BlockHash]) -> int:
+        """
+        Finds the length of the maximal series of blocks, starting from the
+        first one, that are all offloaded.
+
+        Args:
+            block_hashes: the hashes identifying the blocks to lookup.
+
+        Returns:
+            An integer representing the maximal number of blocks that
+            are currently offloaded.
+        """
+        pass
+
+    @abstractmethod
+    def prepare_load(self, block_hashes: Iterable[BlockHash]) -> LoadStoreSpec:
+        """
+        Prepare the given blocks to be read.
+        The given blocks will be protected from eviction until
+        complete_load is called.
+        It assumes all given blocks are offloaded.
+
+        Args:
+            block_hashes: the hashes identifying the blocks.
+
+        Returns:
+            A LoadStoreSpec that can be used by a worker to locate and load
+            the actual offloaded KV data.
+        """
+        pass
+
+    def touch(self, block_hashes: Iterable[BlockHash]):
+        """
+        Mark the given blocks as recently used.
+        This could in practice mean moving them to the end of an LRU list.
+
+        Args:
+            block_hashes: the hashes identifying the blocks.
+        """
+        return
+
+    def complete_load(self, block_hashes: Iterable[BlockHash]):
+        """
+        Marks previous blocks that were prepared to load as done loading.
+
+        Args:
+            block_hashes: the hashes identifying the blocks.
+        """
+        return
+
+    @abstractmethod
+    def prepare_store(
+            self,
+            block_hashes: Iterable[BlockHash]) -> Optional[PrepareStoreOutput]:
+        """
+        Prepare the given blocks to be offloaded.
+        The given blocks will be protected from eviction until
+        complete_store is called.
+
+        Args:
+            block_hashes: the hashes identifying the blocks.
+
+        Returns:
+            A PrepareStoreOutput indicating which blocks need storing,
+            where to store them (LoadStoreSpec), and list of blocks that
+            were evicted as a result.
+            None is returned if the blocks cannot be stored.
+        """
+        pass
+
+    def complete_store(self,
+                       block_hashes: Iterable[BlockHash],
+                       success: bool = True):
+        """
+        Marks blocks which were previously prepared to be stored, as stored.
+        Following this call, the blocks become loadable.
+        If if_success is False, blocks that were not marked as stored will be
+        removed.
+
+        Args:
+            block_hashes: the hashes identifying the blocks.
+            success: whether the blocks were stored successfully.
+        """
+        return
+
+    def take_events(self) -> Iterable[OffloadingEvent]:
+        """
+        Take the offloading events from the manager.
+
+        Yields:
+            New OffloadingEvents collected since the last call.
+        """
+        return ()
--- a/vllm/v1/kv_offload/mediums.py
+++ b/vllm/v1/kv_offload/mediums.py
@ -0,0 +1,39 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from abc import ABC
+
+import numpy as np
+
+from vllm.v1.kv_offload.abstract import LoadStoreSpec
+
+
+class BlockIDsLoadStoreSpec(LoadStoreSpec, ABC):
+    """
+    Spec for loading/storing KV blocks from given block numbers.
+    """
+
+    def __init__(self, block_ids: list[int]):
+        self.block_ids = np.array(block_ids, dtype=np.int64)
+
+    def __repr__(self) -> str:
+        return repr(self.block_ids)
+
+
+class GPULoadStoreSpec(BlockIDsLoadStoreSpec):
+    """
+    Spec for loading/storing a KV block to GPU memory.
+    """
+
+    @staticmethod
+    def medium() -> str:
+        return "GPU"
+
+
+class CPULoadStoreSpec(BlockIDsLoadStoreSpec):
+    """
+    Spec for loading/storing a KV block to CPU memory.
+    """
+
+    @staticmethod
+    def medium() -> str:
+        return "CPU"
--- a/vllm/v1/kv_offload/worker/worker.py
+++ b/vllm/v1/kv_offload/worker/worker.py
@ -0,0 +1,142 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from abc import ABC, abstractmethod
+
+from vllm.logger import init_logger
+from vllm.v1.kv_offload.abstract import LoadStoreSpec
+
+# a single transfer spec (src_blocks_spec, dst_blocks_spec)
+TransferSpec = tuple[LoadStoreSpec, LoadStoreSpec]
+# transfers are forwarded to workers by (src_medium, dst_medium)
+TransferType = tuple[str, str]
+# transfer result (job_id, success)
+TransferResult = tuple[int, bool]
+
+logger = init_logger(__name__)
+
+
+class OffloadingHandler(ABC):
+    """
+    OffloadingHandler class for managing asynchronous KV data transfers
+
+    This class runs in the worker.
+    It kicks off async KV data transfer requests, and allows
+    collecting back completion statuses.
+
+    The class provides the following primitives:
+        transfer_async() - kicks off a new transfer job
+        get_finished() - returns a list of newly finished job IDs.
+    """
+
+    @abstractmethod
+    def transfer_async(self, job_id: int, spec: TransferSpec) -> bool:
+        """
+        Initiates an asynchronous transfer of KV data.
+
+        Args:
+            job_id: a unique ID that will be used when notifying back on
+                transfer completion.
+            spec: the (src, dst) spec of the KV data transfer.
+
+        Returns:
+            True if transfer was submitted successfully.
+        """
+        pass
+
+    @abstractmethod
+    def get_finished(self) -> list[TransferResult]:
+        """
+        Get transfers finished since last call.
+
+        Returns:
+            A list of (job_id, success) of transfers.
+        """
+        pass
+
+
+class OffloadingWorker:
+    """
+    OffloadingWorker class for managing asynchronous KV data transfers
+    using multiple OffloadingHandlers
+
+    This class runs in the worker.
+    It kicks off async KV data transfer requests, by delegating
+    to one of its registered OffloadingHandlers, based on the transfer type.
+
+    The class provides the following primitives:
+        register_handler() - registers a new handler to handle
+            a specific transfer type
+        transfer_async() - kicks off a new transfer job
+            using one of the registered handlers.
+        get_finished() - returns a list of newly finished job IDs
+            from all handlers.
+    """
+
+    def __init__(self):
+        self.handlers: set[OffloadingHandler] = set()
+        self.transfer_type_to_handler: dict[TransferType,
+                                            OffloadingHandler] = {}
+
+    def register_handler(self, src_cls: type[LoadStoreSpec],
+                         dst_cls: type[LoadStoreSpec],
+                         handler: OffloadingHandler) -> None:
+        """
+        Registers a new handler.
+
+        Args:
+            src_cls: the source type of transfers handled by this handler.
+            dst_cls: the destination type of transfers handled by this handler.
+            handler: the handler that will handle transfers.
+        """
+        transfer_type = (src_cls.medium(), dst_cls.medium())
+        assert transfer_type not in self.transfer_type_to_handler
+        self.handlers.add(handler)
+        self.transfer_type_to_handler[transfer_type] = handler
+
+    def transfer_async(self, job_id: int, spec: TransferSpec) -> bool:
+        """
+        Initiates an asynchronous transfer of KV data.
+
+        Args:
+            job_id: a unique ID that will be used when notifying back on
+                transfer completion.
+            spec: the (src, dst) spec of the KV data transfer.
+
+        Returns:
+            True if transfer was submitted successfully.
+        """
+        src, dst = spec
+        transfer_type = (src.medium(), dst.medium())
+        handler = self.transfer_type_to_handler.get(transfer_type)
+        assert handler is not None
+
+        try:
+            success = handler.transfer_async(job_id, spec)
+        except Exception as e:
+            logger.warning("Exception in %r transfer %d: %r",
+                           transfer_type,
+                           job_id,
+                           e,
+                           exc_info=True)
+            return False
+
+        if not success:
+            logger.warning("Failed to submit %r transfer %d", transfer_type,
+                           job_id)
+        else:
+            logger.debug("Submitted %r transfer %d: %r", transfer_type, job_id,
+                         spec)
+
+        return success
+
+    def get_finished(self) -> list[TransferResult]:
+        """
+        Get transfers finished since last call.
+
+        Returns:
+            A list of (job_id, success) of transfers.
+        """
+        finished = []
+        for handler in self.handlers:
+            finished.extend(handler.get_finished())
+        return finished
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@ -42,6 +42,7 @@ from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding
 from vllm.model_executor.model_loader import TensorizerLoader, get_model_loader
 from vllm.model_executor.models.interfaces import (is_mixture_of_experts,
                                                   supports_eagle3,
+                                                   supports_mrope,
                                                   supports_transcription)
 from vllm.model_executor.models.interfaces_base import (
    VllmModelForPooling, is_pooling_model, is_text_generation_model)
@ -730,16 +731,28 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
            if mm_input.get("use_audio_in_video") is True:
                use_audio_in_video = True

-        req_state.mrope_positions, req_state.mrope_position_delta = \
-            MRotaryEmbedding.get_input_positions_tensor(
-                req_state.prompt_token_ids,
-                hf_config=self.model_config.hf_config,
-                image_grid_thw=image_grid_thw,
-                video_grid_thw=video_grid_thw,
-                second_per_grid_ts=second_per_grid_ts,
-                audio_feature_lengths=audio_feature_lengths,
-                use_audio_in_video=use_audio_in_video,
-            )
+        if supports_mrope(self.model):
+            req_state.mrope_positions, req_state.mrope_position_delta = \
+                self.model.get_mrope_input_positions(
+                    req_state.prompt_token_ids,
+                    hf_config=self.model_config.hf_config,
+                    image_grid_thw=image_grid_thw,
+                    video_grid_thw=video_grid_thw,
+                    second_per_grid_ts=second_per_grid_ts,
+                    audio_feature_lengths=audio_feature_lengths,
+                    use_audio_in_video=use_audio_in_video,
+                )
+        else:
+            req_state.mrope_positions, req_state.mrope_position_delta = \
+                MRotaryEmbedding.get_input_positions_tensor(
+                    req_state.prompt_token_ids,
+                    hf_config=self.model_config.hf_config,
+                    image_grid_thw=image_grid_thw,
+                    video_grid_thw=video_grid_thw,
+                    second_per_grid_ts=second_per_grid_ts,
+                    audio_feature_lengths=audio_feature_lengths,
+                    use_audio_in_video=use_audio_in_video,
+                )

    def _extract_mm_kwargs(
        self,
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@ -41,7 +41,8 @@ from vllm.model_executor.layers.sampler import (Sampler, SamplerOutput,
                                                get_sampler)
 from vllm.model_executor.model_loader import get_model
 from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
-from vllm.model_executor.models import supports_lora, supports_multimodal
+from vllm.model_executor.models import (supports_lora, supports_mrope,
+                                        supports_multimodal)
 from vllm.model_executor.models.utils import set_cpu_offload_max_bytes
 from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs,
                             MultiModalKwargs, MultiModalPlaceholderMap,
@ -670,18 +671,33 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
                    inter_data.seq_ids[seq_idx]]
                token_ids = seq_data.get_token_ids()

-                mrope_input_positions, mrope_position_delta = \
-                    MRotaryEmbedding.get_input_positions(
-                        token_ids,
-                        hf_config=hf_config,
-                        image_grid_thw=image_grid_thw,
-                        video_grid_thw=video_grid_thw,
-                        second_per_grid_ts=second_per_grid_ts,
-                        context_len=inter_data.context_lens[seq_idx],
-                        seq_len=inter_data.seq_lens[seq_idx],
-                        audio_feature_lengths=audio_feature_lengths,
-                        use_audio_in_video=use_audio_in_video,
-                    )
+                if supports_mrope(self.runner.model):
+                    mrope_input_positions, mrope_position_delta = \
+                        self.runner.model.get_mrope_input_positions(
+                            token_ids,
+                            hf_config=hf_config,
+                            image_grid_thw=image_grid_thw,
+                            video_grid_thw=video_grid_thw,
+                            second_per_grid_ts=second_per_grid_ts,
+                            context_len=inter_data.context_lens[seq_idx],
+                            seq_len=inter_data.seq_lens[seq_idx],
+                            audio_feature_lengths=audio_feature_lengths,
+                            use_audio_in_video=use_audio_in_video,
+                        )
+                    mrope_input_positions = mrope_input_positions.tolist()
+                else:
+                    mrope_input_positions, mrope_position_delta = \
+                        MRotaryEmbedding.get_input_positions(
+                            token_ids,
+                            hf_config=hf_config,
+                            image_grid_thw=image_grid_thw,
+                            video_grid_thw=video_grid_thw,
+                            second_per_grid_ts=second_per_grid_ts,
+                            context_len=inter_data.context_lens[seq_idx],
+                            seq_len=inter_data.seq_lens[seq_idx],
+                            audio_feature_lengths=audio_feature_lengths,
+                            use_audio_in_video=use_audio_in_video,
+                        )

                seq_data.mrope_position_delta = mrope_position_delta
                inter_data.mrope_input_positions[
Author	SHA1	Message	Date
Lucas Wilkinson	9fac6aa30b	[BugFix] Fix DeepGEMM warmup, no m.weight_scale_inv (#25206 ) Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>	2025-09-18 14:26:28 -07:00
Or Ozeri	a53ad626d6	[KV offload][1b/N] rename offloading to kv_offload (#25191 ) Signed-off-by: Or Ozeri <oro@il.ibm.com>	2025-09-18 20:53:52 +00:00
Woosuk Kwon	1c3dad22ff	[V0 Deprecation] Remove unused async_timeout.py (#25190 ) Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>	2025-09-18 20:35:21 +00:00
Wentao Ye	d2a30a2d93	[Bug] Fix torch Compilation Cache Hit Error (#25093 ) Signed-off-by: yewentao256 <zhyanwentao@126.com>	2025-09-18 12:38:37 -07:00
Wentao Ye	75fb112d80	[Bug] Fix `returned_lse` not Defined issue (#25106 ) Signed-off-by: yewentao256 <zhyanwentao@126.com> Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>	2025-09-18 19:32:24 +00:00
Aziz	38db529f66	[feat]: Create interface for model-specific M-RoPE (#24194 ) Signed-off-by: AzizCode92 <azizbenothman76@gmail.com> Signed-off-by: Aziz <azizbenothman76@gmail.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>	2025-09-18 19:18:56 +00:00
Nikhil Gupta	064cac7bb7	[fix]: remove data type hardcoding from gptoss model implementation (#23807 ) Signed-off-by: Nikhil Gupta <nikhil.gupta2@arm.com>	2025-09-18 18:15:23 +00:00
Woosuk Kwon	e19bce40a1	[V0 Deprecation] Remove AsyncLLMEngine (#25025 ) Signed-off-by: Woosuk Kwon <woosuk@thinkingmachines.ai> Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>	2025-09-18 11:07:42 -07:00
Or Ozeri	505805b645	[KV offload][1/N] Introduce an offloading component (#19848 ) Signed-off-by: Or Ozeri <oro@il.ibm.com>	2025-09-18 10:57:07 -07:00