Compare commits

...

6 Commits

Author SHA1 Message Date
7557a67655 precommit
Signed-off-by: Tyler Michael Smith <tlrmchlsmth@gmail.com>
2025-10-29 20:26:12 +00:00
1af476b0e9 Merge branch 'main' into copilot/disable-batched-triton-kernel 2025-10-29 20:18:03 +00:00
8c3b1c7c62 ditch the unit test honestly
Signed-off-by: Tyler Michael Smith <tlrmchlsmth@gmail.com>
2025-10-29 20:17:46 +00:00
c72d44ba4a Add test for batched triton fallback behavior
Co-authored-by: tlrmchlsmth <1236979+tlrmchlsmth@users.noreply.github.com>
2025-10-16 03:46:02 +00:00
c292032b44 Add env var to control batched triton kernel fallback
- Add VLLM_ALLOW_BATCHED_TRITON_FALLBACK environment variable
- Modify BatchedTritonOrDeepGemmExperts to crash when deepgemm is unavailable unless debug env is set

Co-authored-by: tlrmchlsmth <1236979+tlrmchlsmth@users.noreply.github.com>
2025-10-16 03:42:58 +00:00
b286fba2bb Initial plan 2025-10-16 03:37:04 +00:00
2 changed files with 35 additions and 5 deletions

View File

@@ -155,6 +155,7 @@ if TYPE_CHECKING:
    VLLM_USE_FLASHINFER_MOE_FP8: bool = False
    VLLM_USE_FLASHINFER_MOE_FP4: bool = False
    VLLM_FLASHINFER_MOE_BACKEND: Literal["throughput", "latency"] = "throughput"
    VLLM_ALLOW_BATCHED_TRITON_FALLBACK: bool = False
    VLLM_XGRAMMAR_CACHE_MB: int = 0
    VLLM_MSGPACK_ZERO_COPY_THRESHOLD: int = 256
    VLLM_ALLOW_INSECURE_SERIALIZATION: bool = False
@@ -1145,6 +1146,12 @@ environment_variables: dict[str, Callable[[], Any]] = {
    "VLLM_USE_FLASHINFER_MOE_MXFP4_BF16": lambda: bool(
        int(os.getenv("VLLM_USE_FLASHINFER_MOE_MXFP4_BF16", "0"))
    ),
    # If set to 1, allow fallback to the batched triton kernel when deepgemm
    # is unavailable. By default (0), the system will crash if deepgemm
    # is expected but not available.
    "VLLM_ALLOW_BATCHED_TRITON_FALLBACK": lambda: bool(
        int(os.getenv("VLLM_ALLOW_BATCHED_TRITON_FALLBACK", "0"))
    ),
    # Control the cache size used by the xgrammar compiler. The default
    # of 512 MB should be enough for roughly 1000 JSON schemas.
    # It can be changed with this variable if needed for some reason.
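For reference, a minimal sketch (not vLLM code) of how the new flag is parsed, assuming the same int()-then-bool() pattern used for the other boolean variables above: "0" or unset leaves the fallback disabled, and any nonzero integer string enables it.

import os

def allow_batched_triton_fallback() -> bool:
    # Mirrors the lambda above: default "0" -> False, "1" -> True.
    return bool(int(os.getenv("VLLM_ALLOW_BATCHED_TRITON_FALLBACK", "0")))

os.environ["VLLM_ALLOW_BATCHED_TRITON_FALLBACK"] = "1"
print(allow_batched_triton_fallback())  # True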

View File

@@ -3,6 +3,7 @@
import torch
import vllm.envs as envs
import vllm.model_executor.layers.fused_moe.modular_kernel as mk
from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
    BatchedDeepGemmExperts,
@@ -22,11 +23,8 @@ class BatchedTritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
    ):
        super().__init__(quant_config)
        self.batched_triton_experts = BatchedTritonExperts(
            max_num_tokens=max_num_tokens,
            num_dispatchers=num_dispatchers,
            quant_config=self.quant_config,
        )
        # Store the original request for deep gemm
        deep_gemm_requested = allow_deep_gemm
        self.allow_deep_gemm = (
            allow_deep_gemm
@@ -44,6 +42,31 @@ class BatchedTritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
            else None
        )
        # If deep gemm was requested but is not available (either due to
        # unsupported configuration or missing dependencies), check if
        # we should allow fallback to batched triton kernel
        if (
            deep_gemm_requested
            and self.batched_deep_gemm_experts is None
            and not envs.VLLM_ALLOW_BATCHED_TRITON_FALLBACK
        ):
            raise RuntimeError(
                "DeepGemm was requested but is not available. "
                "The batched triton kernel fallback is disabled by default. "
                "Set VLLM_ALLOW_BATCHED_TRITON_FALLBACK=1 to enable the fallback "
                "for debugging purposes."
            )
        self.batched_triton_experts = (
            BatchedTritonExperts(
                max_num_tokens=max_num_tokens,
                num_dispatchers=num_dispatchers,
                quant_config=self.quant_config,
            )
            if self.batched_deep_gemm_experts is None
            else None
        )
        assert (
            self.batched_deep_gemm_experts is not None
            or self.batched_triton_experts is not None
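To summarize the new control flow, here is a standalone sketch (illustrative only; resolve_experts is not a vLLM function, and plain booleans stand in for the real expert classes) of how the constructor now chooses between the deep gemm path, the triton fallback, and a hard failure:

def resolve_experts(deep_gemm_requested: bool,
                    deep_gemm_available: bool,
                    allow_fallback: bool) -> str:
    # Mirrors the guard added above: a requested-but-unavailable deep gemm
    # is fatal unless VLLM_ALLOW_BATCHED_TRITON_FALLBACK=1 is set.
    if deep_gemm_requested and not deep_gemm_available and not allow_fallback:
        raise RuntimeError("DeepGemm was requested but is not available.")
    # The triton experts are only instantiated when deep gemm is not used.
    return "batched_deep_gemm" if deep_gemm_available else "batched_triton"

print(resolve_experts(True, True, False))   # batched_deep_gemm
print(resolve_experts(True, False, True))   # batched_triton (fallback enabled)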