Enable prefix caching with full cuda graphs (#19617)

Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Author: Woosuk Kwon
Date: 2025-06-15 01:05:05 -07:00
Committed by: GitHub
Parent: 3d330c4c09
Commit: 055915e6ce

@@ -4495,7 +4495,6 @@ class VllmConfig:
                 "full_cuda_graph is not supported with "
                 "cascade attention. Disabling cascade attention.")
             self.model_config.disable_cascade_attn = True
-            self.cache_config.enable_prefix_caching = False
 
         if (self.kv_events_config is not None
                 and self.kv_events_config.enable_kv_cache_events
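
With the forced override removed, `enable_prefix_caching` is no longer switched off when `full_cuda_graph` is set, so both features can be used together. Below is a minimal sketch of exercising the two options through the offline `LLM` API; the model name is an arbitrary example, and it assumes the constructor accepts `enable_prefix_caching` and a `compilation_config` dict, as in recent vLLM releases.

```python
from vllm import LLM, SamplingParams

# Sketch only: prefix caching stays enabled alongside full CUDA graph capture.
llm = LLM(
    model="meta-llama/Llama-3.1-8B-Instruct",   # example model, not from the commit
    enable_prefix_caching=True,                 # no longer forced to False by full_cuda_graph
    compilation_config={"full_cuda_graph": True},
)

# Prompts sharing a common prefix benefit from the prefix cache.
prompts = [
    "You are a helpful assistant. Question: What is prefix caching?",
    "You are a helpful assistant. Question: What are CUDA graphs?",
]
for out in llm.generate(prompts, SamplingParams(max_tokens=64)):
    print(out.outputs[0].text)
```

The same combination should work for the server, e.g. passing `--enable-prefix-caching` together with a compilation config that sets `full_cuda_graph` to true.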