[CI] Add E2E Blackwell Quantized MoE Test (#25723)

Signed-off-by: mgoin <mgoin64@gmail.com>
2025-09-26 15:23:00 -04:00
parent 0002b7f0d1
commit f708bd4904
4 changed files with 155 additions and 2 deletions
--- a/tests/quantization/test_blackwell_moe.py
+++ b/tests/quantization/test_blackwell_moe.py
@ -0,0 +1,132 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import json
+import os
+
+import pytest
+
+from tests.utils import RemoteOpenAIServer
+from vllm.platforms import current_platform
+
+if not current_platform.is_device_capability(100):
+    pytest.skip("This test only runs on Blackwell GPUs (SM100).",
+                allow_module_level=True)
+
+os.environ["FLASHINFER_NVCC_THREADS"] = "16"
+
+# dummy_hf_overrides = {"num_layers": 4, "num_hidden_layers": 4,
+# "text_config": {"num_layers": 4, "num_hidden_layers": 4}}
+dummy_hf_overrides = {"num_layers": 4, "num_hidden_layers": 4}
+
+
+def can_initialize(model: str, extra_args: list[str]):
+
+    # Server arguments
+    server_args = [
+        "--max-model-len",
+        "2048",
+        "--max-num-batched-tokens",
+        "256",
+        "--load-format",
+        "dummy",
+        "--trust-remote-code",
+        "--limit-mm-per-prompt",
+        json.dumps({"image": 0}),
+        *extra_args,
+    ]
+
+    # Launch server and make a simple request
+    with RemoteOpenAIServer(
+            model,
+            server_args,
+            max_wait_seconds=1000,  # Due to FlashInfer compile
+            override_hf_configs=dummy_hf_overrides) as server:
+        client = server.get_client()
+        # Make a simple request to verify the server works
+        completion = client.completions.create(
+            model=model,
+            prompt=["Hello, World!"],
+            temperature=0,
+            max_tokens=2,
+        )
+        print(completion)
+        assert completion.choices[0].text is not None
+
+
+## Llama4 ##
+
+
+@pytest.mark.skip(reason=(
+    "RuntimeError: run_moe() Expected a value of type "
+    "'Optional[List[Tensor]]' for argument '_9' but instead found type "
+    "'list'."))
+def test_llama4_fp8_tensor_moe_flashinfer_cutlass(
+        monkeypatch: pytest.MonkeyPatch):
+    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP8", "1")
+    monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "throughput")
+    can_initialize("nvidia/Llama-4-Scout-17B-16E-Instruct-FP8", [])
+
+
+@pytest.mark.skip(reason="Works, but takes too long to run")
+def test_llama4_fp8_tensor_moe_flashinfer_trtllm(
+        monkeypatch: pytest.MonkeyPatch):
+    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP8", "1")
+    monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "latency")
+    can_initialize("nvidia/Llama-4-Scout-17B-16E-Instruct-FP8", [])
+
+
+@pytest.mark.skip(reason="Works, but takes too long to run")
+def test_llama4_nvfp4_moe_flashinfer_cutlass(monkeypatch: pytest.MonkeyPatch):
+    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP4", "1")
+    monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "throughput")
+    can_initialize("nvidia/Llama-4-Scout-17B-16E-Instruct-FP4", [])
+
+
+@pytest.mark.skip(reason="RuntimeError: No kernel found for the given options")
+def test_llama4_nvfp4_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch):
+    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP4", "1")
+    monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "latency")
+    can_initialize("nvidia/Llama-4-Scout-17B-16E-Instruct-FP4", [])
+
+
+## DeepSeekV3 ##
+
+
+def test_deepseek_fp8_block_moe_deep_gemm(monkeypatch: pytest.MonkeyPatch):
+    monkeypatch.setenv("VLLM_USE_DEEP_GEMM", "1")
+    can_initialize("deepseek-ai/DeepSeek-V3.1", [])
+
+
+def test_deepseek_nvfp4_moe_flashinfer_cutlass(
+        monkeypatch: pytest.MonkeyPatch):
+    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP4", "1")
+    monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "throughput")
+    can_initialize("nvidia/DeepSeek-R1-0528-FP4-v2", [])
+
+
+@pytest.mark.skip(reason="RuntimeError: No kernel found for the given options")
+def test_deepseek_nvfp4_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch):
+    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP4", "1")
+    monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "latency")
+    can_initialize("nvidia/DeepSeek-R1-0528-FP4-v2", [])
+
+
+## GPT-OSS ##
+
+
+def test_gptoss_mxfp4bf16_moe_flashinfer(monkeypatch: pytest.MonkeyPatch):
+    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_MXFP4_BF16", "1")
+    can_initialize("openai/gpt-oss-20b", [])
+
+
+def test_gptoss_mxfp4mxfp8_moe_flashinfer_cutlass(
+        monkeypatch: pytest.MonkeyPatch):
+    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS", "1")
+    can_initialize("openai/gpt-oss-20b", [])
+
+
+def test_gptoss_mxfp4mxfp8_moe_flashinfer_trtllm(
+        monkeypatch: pytest.MonkeyPatch):
+    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8", "1")
+    can_initialize("openai/gpt-oss-20b", [])
--- a/tests/utils.py
+++ b/tests/utils.py
@ -91,8 +91,10 @@ class RemoteOpenAIServer:
        env['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'
        if env_dict is not None:
            env.update(env_dict)
+        serve_cmd = ["vllm", "serve", model, *vllm_serve_args]
+        print(f"Launching RemoteOpenAIServer with: {' '.join(serve_cmd)}")
        self.proc: subprocess.Popen = subprocess.Popen(
-            ["vllm", "serve", model, *vllm_serve_args],
+            serve_cmd,
            env=env,
            stdout=sys.stdout,
            stderr=sys.stderr,