Compare commits

...

6 Commits

Author SHA1 Message Date
7557a67655 precommit
Signed-off-by: Tyler Michael Smith <tlrmchlsmth@gmail.com>
2025-10-29 20:26:12 +00:00
1af476b0e9 Merge branch 'main' into copilot/disable-batched-triton-kernel 2025-10-29 20:18:03 +00:00
8c3b1c7c62 ditch the unit test honestly
Signed-off-by: Tyler Michael Smith <tlrmchlsmth@gmail.com>
2025-10-29 20:17:46 +00:00
c72d44ba4a Add test for batched triton fallback behavior
Co-authored-by: tlrmchlsmth <1236979+tlrmchlsmth@users.noreply.github.com>
2025-10-16 03:46:02 +00:00
c292032b44 Add env var to control batched triton kernel fallback
- Add VLLM_ALLOW_BATCHED_TRITON_FALLBACK environment variable
- Modify BatchedTritonOrDeepGemmExperts to crash when deepgemm is unavailable unless debug env is set

Co-authored-by: tlrmchlsmth <1236979+tlrmchlsmth@users.noreply.github.com>
2025-10-16 03:42:58 +00:00
b286fba2bb Initial plan 2025-10-16 03:37:04 +00:00
2 changed files with 35 additions and 5 deletions

View File

@@ -155,6 +155,7 @@ if TYPE_CHECKING:
    VLLM_USE_FLASHINFER_MOE_FP8: bool = False
    VLLM_USE_FLASHINFER_MOE_FP4: bool = False
    VLLM_FLASHINFER_MOE_BACKEND: Literal["throughput", "latency"] = "throughput"
    VLLM_ALLOW_BATCHED_TRITON_FALLBACK: bool = False
    VLLM_XGRAMMAR_CACHE_MB: int = 0
    VLLM_MSGPACK_ZERO_COPY_THRESHOLD: int = 256
    VLLM_ALLOW_INSECURE_SERIALIZATION: bool = False
@@ -1145,6 +1146,12 @@ environment_variables: dict[str, Callable[[], Any]] = {
    "VLLM_USE_FLASHINFER_MOE_MXFP4_BF16": lambda: bool(
        int(os.getenv("VLLM_USE_FLASHINFER_MOE_MXFP4_BF16", "0"))
    ),
    # If set to 1, allow fallback to the batched triton kernel when deepgemm
    # is unavailable. By default (0), the system will crash if deepgemm
    # is expected but not available.
    "VLLM_ALLOW_BATCHED_TRITON_FALLBACK": lambda: bool(
        int(os.getenv("VLLM_ALLOW_BATCHED_TRITON_FALLBACK", "0"))
    ),
    # Control the cache size used by the xgrammar compiler. The default
    # of 512 MB should be enough for roughly 1000 JSON schemas.
    # It can be changed with this variable if needed for some reason.
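For reference, a minimal sketch (not vLLM code) of how the new flag is parsed, assuming the same int()-then-bool() pattern used for the other boolean variables above: "0" or unset leaves the fallback disabled, and any nonzero integer string enables it.

import os

def allow_batched_triton_fallback() -> bool:
    # Mirrors the lambda above: default "0" -> False, "1" -> True.
    return bool(int(os.getenv("VLLM_ALLOW_BATCHED_TRITON_FALLBACK", "0")))

os.environ["VLLM_ALLOW_BATCHED_TRITON_FALLBACK"] = "1"
print(allow_batched_triton_fallback())  # True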

View File

@@ -3,6 +3,7 @@
import torch
import vllm.envs as envs
import vllm.model_executor.layers.fused_moe.modular_kernel as mk
from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
    BatchedDeepGemmExperts,
@@ -22,11 +23,8 @@ class BatchedTritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
    ):
        super().__init__(quant_config)
        self.batched_triton_experts = BatchedTritonExperts(
            max_num_tokens=max_num_tokens,
            num_dispatchers=num_dispatchers,
            quant_config=self.quant_config,
        )
        # Store the original request for deep gemm
        deep_gemm_requested = allow_deep_gemm
        self.allow_deep_gemm = (
            allow_deep_gemm
@@ -44,6 +42,31 @@ class BatchedTritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
            else None
        )
        # If deep gemm was requested but is not available (either due to
        # unsupported configuration or missing dependencies), check if
        # we should allow fallback to batched triton kernel
        if (
            deep_gemm_requested
            and self.batched_deep_gemm_experts is None
            and not envs.VLLM_ALLOW_BATCHED_TRITON_FALLBACK
        ):
            raise RuntimeError(
                "DeepGemm was requested but is not available. "
                "The batched triton kernel fallback is disabled by default. "
                "Set VLLM_ALLOW_BATCHED_TRITON_FALLBACK=1 to enable the fallback "
                "for debugging purposes."
            )
        self.batched_triton_experts = (
            BatchedTritonExperts(
                max_num_tokens=max_num_tokens,
                num_dispatchers=num_dispatchers,
                quant_config=self.quant_config,
            )
            if self.batched_deep_gemm_experts is None
            else None
        )
        assert (
            self.batched_deep_gemm_experts is not None
            or self.batched_triton_experts is not None
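To summarize the new control flow, here is a standalone sketch (illustrative only; resolve_experts is not a vLLM function, and plain booleans stand in for the real expert classes) of how the constructor now chooses between the deep gemm path, the triton fallback, and a hard failure:

def resolve_experts(deep_gemm_requested: bool,
                    deep_gemm_available: bool,
                    allow_fallback: bool) -> str:
    # Mirrors the guard added above: a requested-but-unavailable deep gemm
    # is fatal unless VLLM_ALLOW_BATCHED_TRITON_FALLBACK=1 is set.
    if deep_gemm_requested and not deep_gemm_available and not allow_fallback:
        raise RuntimeError("DeepGemm was requested but is not available.")
    # The triton experts are only instantiated when deep gemm is not used.
    return "batched_deep_gemm" if deep_gemm_available else "batched_triton"

print(resolve_experts(True, True, False))   # batched_deep_gemm
print(resolve_experts(True, False, True))   # batched_triton (fallback enabled)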