[Model] Add support for the multi-modal Llama 3.2 model (#8811)
Co-authored-by: simon-mo <xmo@berkeley.edu>
Co-authored-by: Chang Su <chang.s.su@oracle.com>
Co-authored-by: Simon Mo <simon.mo@hey.com>
Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com>
Co-authored-by: Roger Wang <ywang@roblox.com>
@@ -242,6 +242,29 @@ def run_qwen2_vl(question, modality):
     return llm, prompt, stop_token_ids
 
 
+# LLama
+def run_mllama(question, modality):
+    assert modality == "image"
+
+    model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"
+
+    # Note: The default setting of max_num_seqs (256) and
+    # max_model_len (131072) for this model may cause OOM.
+    # You may lower either to run this example on lower-end GPUs.
+
+    # The configuration below has been confirmed to launch on a
+    # single H100 GPU.
+    llm = LLM(
+        model=model_name,
+        max_num_seqs=16,
+        enforce_eager=True,
+    )
+
+    prompt = f"<|image|><|begin_of_text|>{question}"
+    stop_token_ids = None
+    return llm, prompt, stop_token_ids
+
+
 model_example_map = {
     "llava": run_llava,
     "llava-next": run_llava_next,
@@ -256,6 +279,7 @@ model_example_map = {
     "internvl_chat": run_internvl,
     "qwen_vl": run_qwen_vl,
     "qwen2_vl": run_qwen2_vl,
+    "mllama": run_mllama,
 }
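For context, the example script dispatches through model_example_map and feeds the returned (llm, prompt, stop_token_ids) triple into generation. A minimal usage sketch of that flow; the driver code and image path below are illustrative, not part of this commit:

from vllm import SamplingParams
from PIL import Image

# Illustrative driver; mirrors how the example script consumes the triple.
llm, prompt, stop_token_ids = run_mllama("What is in this image?", "image")
image = Image.open("example.jpg").convert("RGB")  # placeholder image path

sampling_params = SamplingParams(temperature=0.0,
                                 max_tokens=64,
                                 stop_token_ids=stop_token_ids)
outputs = llm.generate(
    {
        "prompt": prompt,
        "multi_modal_data": {"image": image},
    },
    sampling_params=sampling_params,
)
print(outputs[0].outputs[0].text)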
@@ -38,7 +38,7 @@ chat_completion_from_url = client.chat.completions.create(
         "content": [
             {
                 "type": "text",
-                "text": "What’s in this image?"
+                "text": "What's in this image?"
             },
             {
                 "type": "image_url",
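These fragments sit inside a chat.completions.create call against vLLM's OpenAI-compatible server. A self-contained sketch of the surrounding request, assuming a server at localhost:8000 and a placeholder image URL:

from openai import OpenAI

# Assumed local vLLM OpenAI-compatible server; adjust base_url/model as needed.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
image_url = "https://example.com/image.jpg"  # placeholder URL

chat_completion_from_url = client.chat.completions.create(
    model="meta-llama/Llama-3.2-11B-Vision-Instruct",
    messages=[{
        "role": "user",
        "content": [
            {"type": "text", "text": "What's in this image?"},
            {"type": "image_url", "image_url": {"url": image_url}},
        ],
    }],
    max_tokens=64,
)
print(chat_completion_from_url.choices[0].message.content)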
@@ -75,7 +75,7 @@ chat_completion_from_base64 = client.chat.completions.create(
         "content": [
             {
                 "type": "text",
-                "text": "What’s in this image?"
+                "text": "What's in this image?"
             },
             {
                 "type": "image_url",
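The chat_completion_from_base64 variant is identical except that the image travels inline as a base64 data URL rather than a remote link. A sketch of the encoding step, assuming a local JPEG (the path is a placeholder):

import base64

# Read a local image and wrap it as a data URL for the "image_url" part.
with open("example.jpg", "rb") as f:  # placeholder path
    image_base64 = base64.b64encode(f.read()).decode("utf-8")
data_url = f"data:image/jpeg;base64,{image_base64}"

# Used in place of the remote URL:
#   {"type": "image_url", "image_url": {"url": data_url}}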