commit 7721ef1786
parent 8369b7c2a9
Author: Li, Jiang (committed via GitHub)
Date: 2025-07-08 13:13:44 +08:00

[CI/Build][CPU] Fix CPU CI and remove all CPU V0 files (#20560)

Signed-off-by: jiang1.li <jiang1.li@intel.com>

9 changed files with 785 additions and 839 deletions
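
Several hunks below reference a run_with_both_engines fixture and skip_v0 /
skip_v1 marks; together they run each test once per engine and let a single
param opt out of one of them. A minimal sketch of how such a fixture can
work (the body here is an assumption for illustration, not vLLM's actual
implementation):

    import pytest

    @pytest.fixture(params=["0", "1"], ids=["v0", "v1"])
    def run_with_both_engines(request, monkeypatch):
        # Hypothetical reconstruction: honor skip_v0/skip_v1 marks on the
        # requesting test, then pin the engine via the env var vLLM reads.
        if request.param == "0" and request.node.get_closest_marker("skip_v0"):
            pytest.skip("test is marked skip_v0")
        if request.param == "1" and request.node.get_closest_marker("skip_v1"):
            pytest.skip("test is marked skip_v1")
        monkeypatch.setenv("VLLM_USE_V1", request.param)
        yield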

View File

@@ -39,7 +39,7 @@ AITER_MODEL_LIST = [
     [
         pytest.param(
             "bigscience/bloom-560m",  # bloom - testing alibi slopes
-            marks=[pytest.mark.core_model, pytest.mark.cpu_model],
+            marks=[pytest.mark.core_model],
         ),
         pytest.param(
             "openai-community/gpt2",  # gpt2
@@ -87,7 +87,11 @@ AITER_MODEL_LIST = [
         pytest.param("bigcode/starcoder2-3b"),  # starcoder2
         pytest.param(
             "TitanML/tiny-mixtral",  # mixtral
-            marks=[pytest.mark.core_model, pytest.mark.cpu_model],
+            marks=[pytest.mark.core_model],
         ),
+        pytest.param(
+            "Qwen/Qwen1.5-MoE-A2.7B-Chat",
+            marks=[pytest.mark.cpu_model],
+        )
     ])
 @pytest.mark.parametrize("max_tokens", [32])
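
Net effect of the two hunks above: bloom-560m and tiny-mixtral drop out of
the CPU suite, while Qwen1.5-MoE-A2.7B-Chat joins it. For reference, a
minimal runnable sketch of the marker mechanism being edited (the
"pytest -m cpu_model" selection expression is an assumption about how the
CPU pipeline filters tests, and the test body is a placeholder):

    import pytest

    @pytest.mark.parametrize("model", [
        pytest.param("openai-community/gpt2"),  # unmarked: runs in every suite
        pytest.param(
            "Qwen/Qwen1.5-MoE-A2.7B-Chat",
            # collected only when the run selects tests with -m cpu_model
            marks=[pytest.mark.cpu_model],
        ),
    ])
    def test_model_loads(model: str) -> None:
        assert model  # placeholder; the real tests generate and compare text

Custom marks such as cpu_model and core_model should be registered (e.g. in
pyproject.toml or pytest.ini) so pytest does not warn about unknown marks.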

View File

@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import os
 from typing import Optional
 
 import pytest
@@ -29,8 +28,10 @@ def v1(run_with_both_engines):
     # [Decoder-only]
     pytest.param("BAAI/bge-multilingual-gemma2",
                  marks=[pytest.mark.core_model]),
-    pytest.param("intfloat/e5-mistral-7b-instruct",
-                 marks=[pytest.mark.core_model, pytest.mark.cpu_model]),
+    pytest.param(
+        "intfloat/e5-mistral-7b-instruct",
+        # CPU v1 doesn't support sliding window
+        marks=[pytest.mark.core_model]),
     # the qwen models interfere with each other (see PR
     # https://github.com/vllm-project/vllm/pull/18720).
     # To avoid this problem, for now we skip v0 since it will be
@@ -38,11 +39,13 @@ def v1(run_with_both_engines):
     pytest.param("ssmits/Qwen2-7B-Instruct-embed-base",
                  marks=[pytest.mark.skip_v0, pytest.mark.cpu_model]),
     # [Encoder-only]
-    pytest.param("BAAI/bge-base-en-v1.5",
-                 marks=[
-                     pytest.mark.core_model, pytest.mark.cpu_model,
-                     pytest.mark.skip_v1
-                 ]),
+    pytest.param(
+        "BAAI/bge-base-en-v1.5",
+        marks=[
+            # CPU only supports V1
+            pytest.mark.core_model,
+            pytest.mark.skip_v1
+        ]),
     pytest.param("sentence-transformers/all-MiniLM-L12-v2",
                  marks=[pytest.mark.skip_v1]),
     pytest.param("intfloat/multilingual-e5-small",
@@ -61,10 +64,6 @@ def test_models(
     model,
     monkeypatch,
 ) -> None:
-    if model == "intfloat/e5-mistral-7b-instruct" and current_platform.is_cpu(
-    ) and os.environ.get("VLLM_USE_V1", "0") == "1":
-        pytest.skip("CPU V1 doesn't support sliding window")
-
     if model == "BAAI/bge-multilingual-gemma2" and current_platform.is_rocm():
         # ROCm Triton FA does not currently support sliding window attention
         # switch to use ROCm CK FA backend
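
The deleted guard above duplicated what the marker changes already achieve:
once pytest.mark.cpu_model is gone from the e5-mistral param, a CPU pipeline
that selects tests with -m cpu_model never collects that case, so no runtime
check is needed. A small sketch contrasting the two styles (the test body is
a placeholder):

    import os

    import pytest

    def test_runtime_skip_style() -> None:
        # Before this commit: the skip is decided inside the test body, so
        # the test is still collected and its fixtures set up before bailing.
        if os.environ.get("VLLM_USE_V1", "0") == "1":
            pytest.skip("CPU V1 doesn't support sliding window")

    # After this commit: the param simply omits pytest.mark.cpu_model, so a
    # marker-filtered CPU run deselects the model at collection time instead.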

View File

@@ -1,5 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import os
+
 import pytest
 import torch
 import torch.nn.functional as F
@@ -84,6 +86,9 @@ def test_prm_models(
     dtype: str,
     monkeypatch,
 ) -> None:
+    if current_platform.is_cpu() and os.environ.get("VLLM_USE_V1", "0") == "0":
+        pytest.skip("CPU only supports V1")
+
     if current_platform.is_rocm():
         # ROCm Triton FA does not currently support sliding window attention
         # switch to use ROCm CK FA backend
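
The guard added here is the pattern that replaces the per-model checks
removed from the embedding tests: skip the whole test early when the CPU
platform is asked to run the legacy V0 engine. It could be factored into a
small helper; a sketch (the helper name is hypothetical, while
current_platform is vLLM's real platform probe):

    import os

    import pytest
    from vllm.platforms import current_platform

    def skip_if_cpu_v0() -> None:
        # The CPU backend only supports the V1 engine, so bail out whenever
        # the suite is launched with VLLM_USE_V1=0.
        if (current_platform.is_cpu()
                and os.environ.get("VLLM_USE_V1", "0") == "0"):
            pytest.skip("CPU only supports V1")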