[V1][Spec Decode] Share input embedding of target model with EAGLE draft model to free ~1GB for llama 3 model (#17326)

Co-authored-by: root <root@ekagra-8xh100.us-east5-a.c.serving-efficiency-poc.internal> Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-05-14 15:31:46 -04:00
parent 964472b966
commit 418d2f8bfb
4 changed files with 59 additions and 19 deletions
--- a/examples/offline_inference/eagle.py
+++ b/examples/offline_inference/eagle.py
@@ -105,6 +105,13 @@ def main():
    outputs = llm.generate(prompt_token_ids=prompt_ids,
                           sampling_params=sampling_params)

+    # print the generated text
+    for output in outputs:
+        print("-" * 50)
+        print(f"prompt: {output.prompt}")
+        print(f"generated text: {output.outputs[0].text}")
+        print("-" * 50)
+
    if not hasattr(outputs, "metrics") or outputs.metrics is None:
        return