[V1][Spec Decode] Share input embedding of target model with EAGLE draft model to free ~1GB for llama 3 model (#17326)

Co-authored-by: root <root@ekagra-8xh100.us-east5-a.c.serving-efficiency-poc.internal>
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
This commit is contained in:
Ekagra Ranjan
2025-05-14 15:31:46 -04:00
committed by GitHub
parent 964472b966
commit 418d2f8bfb
4 changed files with 59 additions and 19 deletions

View File

@@ -105,6 +105,13 @@ def main():
outputs = llm.generate(prompt_token_ids=prompt_ids,
sampling_params=sampling_params)
# print the generated text
for output in outputs:
print("-" * 50)
print(f"prompt: {output.prompt}")
print(f"generated text: {output.outputs[0].text}")
print("-" * 50)
if not hasattr(outputs, "metrics") or outputs.metrics is None:
return