Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
This commit is contained in:
Woosuk Kwon
2025-09-18 13:20:37 -07:00
parent 3f50030cc8
commit a4962833f9

View File

@@ -51,9 +51,8 @@ class GPUModelRunner:
self.device = device
self.pin_memory = is_pin_memory_available()
self.dtype = self.model_config.dtype
if self.cache_config.cache_dtype == "auto":
self.kv_cache_dtype = self.dtype
else:
self.kv_cache_dtype = self.dtype
if self.cache_config.cache_dtype != "auto":
# Quantized KV cache.
self.kv_cache_dtype = STR_DTYPE_TO_TORCH_DTYPE[
self.cache_config.cache_dtype]
@@ -99,9 +98,6 @@ class GPUModelRunner:
def profile_run(self):
    """No-op profiling hook.

    NOTE(review): this runner performs no warmup/profiling pass here —
    presumably the interface requires the method but this implementation
    has nothing to profile; confirm against the base-class contract.
    """
    pass
def maybe_remove_all_loras(self, lora_config):
    """No-op LoRA cleanup hook.

    NOTE(review): `lora_config` is accepted but unused — this runner
    appears to keep no LoRA state to remove; verify against callers
    that expect adapters to actually be unloaded here.
    """
    pass
def get_kv_cache_spec(self):
    """Return the KV-cache spec for this runner.

    Delegates to the module-level ``get_kv_cache_spec`` helper (defined
    elsewhere in this project), passing this runner's ``vllm_config``
    and its resolved ``kv_cache_dtype``.
    """
    return get_kv_cache_spec(self.vllm_config, self.kv_cache_dtype)
@@ -269,6 +265,7 @@ class GPUModelRunner:
slot_mappings = self.block_tables.compute_slot_mappings(
query_start_loc_gpu, positions.gpu[:num_tokens])
logits_indices = query_start_loc_gpu[1:] - 1
num_logits_indices = logits_indices.size(0)
# Layer name -> attention metadata.
attn_metadata: dict[str, Any] = {}
@@ -290,7 +287,7 @@ class GPUModelRunner:
block_table_tensor=block_table,
slot_mapping=slot_mapping,
logits_indices_padded=None,
num_logits_indices=logits_indices.size(0),
num_logits_indices=num_logits_indices,
causal=True,
encoder_seq_lens=None,
)