[Doc] Move examples into categories (#11840)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>

examples/offline_inference/aqlm_example.py (new file)
@@ -0,0 +1,45 @@
from vllm import LLM, SamplingParams
from vllm.utils import FlexibleArgumentParser


def main():

    parser = FlexibleArgumentParser(description='AQLM examples')

    parser.add_argument('--model',
                        '-m',
                        type=str,
                        default=None,
                        help='model path, as for HF')
    parser.add_argument('--choice',
                        '-c',
                        type=int,
                        default=0,
                        help='known good models by index, [0-4]')
    parser.add_argument('--tensor-parallel-size',
                        '-t',
                        type=int,
                        default=1,
                        help='tensor parallel size')

    args = parser.parse_args()

    models = [
        "ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf",
        "ISTA-DASLab/Llama-2-7b-AQLM-2Bit-2x8-hf",
        "ISTA-DASLab/Llama-2-13b-AQLM-2Bit-1x16-hf",
        "ISTA-DASLab/Mixtral-8x7b-AQLM-2Bit-1x16-hf",
        "BlackSamorez/TinyLlama-1_1B-Chat-v1_0-AQLM-2Bit-1x16-hf",
    ]

    model = LLM(args.model if args.model is not None else models[args.choice],
                tensor_parallel_size=args.tensor_parallel_size)

    sampling_params = SamplingParams(max_tokens=100, temperature=0)
    outputs = model.generate("Hello my name is",
                             sampling_params=sampling_params)
    print(outputs[0].outputs[0].text)


if __name__ == '__main__':
    main()

examples/offline_inference/cpu_offload.py (new file)
@@ -0,0 +1,22 @@
from vllm import LLM, SamplingParams

# Sample prompts.
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

# Create an LLM.
llm = LLM(model="meta-llama/Llama-2-13b-chat-hf", cpu_offload_gb=10)
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

examples/offline_inference/florence2_inference.py (new file)
@@ -0,0 +1,45 @@
'''
Demonstrate prompting of text-to-text
encoder/decoder models, specifically Florence-2
'''
# TODO(Isotr0py):
# Move to offline_inference/offline_inference_vision_language.py
# after porting vision backbone
from vllm import LLM, SamplingParams

dtype = "float"

# Create a Florence-2 encoder/decoder model instance
llm = LLM(
    model="microsoft/Florence-2-base",
    tokenizer="facebook/bart-base",
    dtype=dtype,
    trust_remote_code=True,
)

prompts = [
    "<CAPTION>", "<DETAILED_CAPTION>", "<MORE_DETAILED_CAPTION>",
    "<CAPTION_TO_PHRASE_GROUNDING>", "<OD>", "<DENSE_REGION_CAPTION>",
    "<REGION_PROPOSAL>", "<OCR>", "<OCR_WITH_REGION>"
]
# Create a sampling params object.
sampling_params = SamplingParams(
    temperature=0,
    top_p=1.0,
    min_tokens=0,
    max_tokens=20,
)

# Generate output tokens from the prompts. The output is a list of
# RequestOutput objects that contain the prompt, generated
# text, and other information.
outputs = llm.generate(prompts, sampling_params)

# Print the outputs.
for output in outputs:
    prompt = output.prompt
    encoder_prompt = output.encoder_prompt
    generated_text = output.outputs[0].text
    print(f"Encoder prompt: {encoder_prompt!r}, "
          f"Decoder prompt: {prompt!r}, "
          f"Generated text: {generated_text!r}")

examples/offline_inference/gguf_inference.py (new file)
@@ -0,0 +1,38 @@
from huggingface_hub import hf_hub_download

from vllm import LLM, SamplingParams


def run_gguf_inference(model_path):
    PROMPT_TEMPLATE = "<|system|>\n{system_message}</s>\n<|user|>\n{prompt}</s>\n<|assistant|>\n"  # noqa: E501
    system_message = "You are a friendly chatbot who always responds in the style of a pirate."  # noqa: E501
    # Sample prompts.
    prompts = [
        "How many helicopters can a human eat in one sitting?",
        "What's the future of AI?",
    ]
    prompts = [
        PROMPT_TEMPLATE.format(system_message=system_message, prompt=prompt)
        for prompt in prompts
    ]
    # Create a sampling params object.
    sampling_params = SamplingParams(temperature=0, max_tokens=128)

    # Create an LLM.
    llm = LLM(model=model_path,
              tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
              gpu_memory_utilization=0.95)

    outputs = llm.generate(prompts, sampling_params)
    # Print the outputs.
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")


if __name__ == "__main__":
    repo_id = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"
    filename = "tinyllama-1.1b-chat-v1.0.Q4_0.gguf"
    model = hf_hub_download(repo_id, filename=filename)
    run_gguf_inference(model)

examples/offline_inference/llm_engine_example.py (new file)
@@ -0,0 +1,60 @@
import argparse
from typing import List, Tuple

from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams
from vllm.utils import FlexibleArgumentParser


def create_test_prompts() -> List[Tuple[str, SamplingParams]]:
    """Create a list of test prompts with their sampling parameters."""
    return [
        ("A robot may not injure a human being",
         SamplingParams(temperature=0.0, logprobs=1, prompt_logprobs=1)),
        ("To be or not to be,",
         SamplingParams(temperature=0.8, top_k=5, presence_penalty=0.2)),
        ("What is the meaning of life?",
         SamplingParams(n=2,
                        best_of=5,
                        temperature=0.8,
                        top_p=0.95,
                        frequency_penalty=0.1)),
    ]


def process_requests(engine: LLMEngine,
                     test_prompts: List[Tuple[str, SamplingParams]]):
    """Continuously process a list of prompts and handle the outputs."""
    request_id = 0

    while test_prompts or engine.has_unfinished_requests():
        if test_prompts:
            prompt, sampling_params = test_prompts.pop(0)
            engine.add_request(str(request_id), prompt, sampling_params)
            request_id += 1

        request_outputs: List[RequestOutput] = engine.step()

        for request_output in request_outputs:
            if request_output.finished:
                print(request_output)


def initialize_engine(args: argparse.Namespace) -> LLMEngine:
    """Initialize the LLMEngine from the command line arguments."""
    engine_args = EngineArgs.from_cli_args(args)
    return LLMEngine.from_engine_args(engine_args)


def main(args: argparse.Namespace):
    """Main function that sets up and runs the prompt processing."""
    engine = initialize_engine(args)
    test_prompts = create_test_prompts()
    process_requests(engine, test_prompts)


if __name__ == '__main__':
    parser = FlexibleArgumentParser(
        description='Demo on using the LLMEngine class directly')
    parser = EngineArgs.add_cli_args(parser)
    args = parser.parse_args()
    main(args)

examples/offline_inference/lora_with_quantization_inference.py (new file)
@@ -0,0 +1,134 @@
"""
This example shows how to use LoRA with different quantization techniques
for offline inference.

Requires HuggingFace credentials for access.
"""

import gc
from typing import List, Optional, Tuple

import torch
from huggingface_hub import snapshot_download

from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams
from vllm.lora.request import LoRARequest


def create_test_prompts(
    lora_path: str
) -> List[Tuple[str, SamplingParams, Optional[LoRARequest]]]:
    return [
        # this is an example of using quantization without LoRA
        ("My name is",
         SamplingParams(temperature=0.0,
                        logprobs=1,
                        prompt_logprobs=1,
                        max_tokens=128), None),
        # the next three examples use quantization with LoRA
        ("my name is",
         SamplingParams(temperature=0.0,
                        logprobs=1,
                        prompt_logprobs=1,
                        max_tokens=128),
         LoRARequest("lora-test-1", 1, lora_path)),
        ("The capital of USA is",
         SamplingParams(temperature=0.0,
                        logprobs=1,
                        prompt_logprobs=1,
                        max_tokens=128),
         LoRARequest("lora-test-2", 1, lora_path)),
        ("The capital of France is",
         SamplingParams(temperature=0.0,
                        logprobs=1,
                        prompt_logprobs=1,
                        max_tokens=128),
         LoRARequest("lora-test-3", 1, lora_path)),
    ]


def process_requests(engine: LLMEngine,
                     test_prompts: List[Tuple[str, SamplingParams,
                                              Optional[LoRARequest]]]):
    """Continuously process a list of prompts and handle the outputs."""
    request_id = 0

    while test_prompts or engine.has_unfinished_requests():
        if test_prompts:
            prompt, sampling_params, lora_request = test_prompts.pop(0)
            engine.add_request(str(request_id),
                               prompt,
                               sampling_params,
                               lora_request=lora_request)
            request_id += 1

        request_outputs: List[RequestOutput] = engine.step()
        for request_output in request_outputs:
            if request_output.finished:
                print("----------------------------------------------------")
                print(f"Prompt: {request_output.prompt}")
                print(f"Output: {request_output.outputs[0].text}")


def initialize_engine(model: str, quantization: str,
                      lora_repo: Optional[str]) -> LLMEngine:
    """Initialize the LLMEngine."""

    if quantization == "bitsandbytes":
        # QLoRA (https://arxiv.org/abs/2305.14314) is a quantization technique.
        # It quantizes the model when loading, with some config info from the
        # LoRA adapter repo. So need to set the parameter of load_format and
        # qlora_adapter_name_or_path as below.
        engine_args = EngineArgs(model=model,
                                 quantization=quantization,
                                 qlora_adapter_name_or_path=lora_repo,
                                 load_format="bitsandbytes",
                                 enable_lora=True,
                                 max_lora_rank=64)
    else:
        engine_args = EngineArgs(model=model,
                                 quantization=quantization,
                                 enable_lora=True,
                                 max_loras=4)
    return LLMEngine.from_engine_args(engine_args)


def main():
    """Main function that sets up and runs the prompt processing."""

    test_configs = [{
        "name": "qlora_inference_example",
        'model': "huggyllama/llama-7b",
        'quantization': "bitsandbytes",
        'lora_repo': 'timdettmers/qlora-flan-7b'
    }, {
        "name": "AWQ_inference_with_lora_example",
        'model': 'TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ',
        'quantization': "awq",
        'lora_repo': 'jashing/tinyllama-colorist-lora'
    }, {
        "name": "GPTQ_inference_with_lora_example",
        'model': 'TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ',
        'quantization': "gptq",
        'lora_repo': 'jashing/tinyllama-colorist-lora'
    }]

    for test_config in test_configs:
        print(
            f"~~~~~~~~~~~~~~~~ Running: {test_config['name']} ~~~~~~~~~~~~~~~~"
        )
        engine = initialize_engine(test_config['model'],
                                   test_config['quantization'],
                                   test_config['lora_repo'])
        lora_path = snapshot_download(repo_id=test_config['lora_repo'])
        test_prompts = create_test_prompts(lora_path)
        process_requests(engine, test_prompts)

        # Clean up the GPU memory for the next test
        del engine
        gc.collect()
        torch.cuda.empty_cache()


if __name__ == '__main__':
    main()

examples/offline_inference/multilora_inference.py (new file)
@@ -0,0 +1,106 @@
"""
This example shows how to use the multi-LoRA functionality
for offline inference.

Requires HuggingFace credentials for access to Llama2.
"""

from typing import List, Optional, Tuple

from huggingface_hub import snapshot_download

from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams
from vllm.lora.request import LoRARequest


def create_test_prompts(
    lora_path: str
) -> List[Tuple[str, SamplingParams, Optional[LoRARequest]]]:
    """Create a list of test prompts with their sampling parameters.

    2 requests for base model, 4 requests for the LoRA. We define 2
    different LoRA adapters (using the same model for demo purposes).
    Since we also set `max_loras=1`, the expectation is that the requests
    with the second LoRA adapter will be ran after all requests with the
    first adapter have finished.
    """
    return [
        ("A robot may not injure a human being",
         SamplingParams(temperature=0.0,
                        logprobs=1,
                        prompt_logprobs=1,
                        max_tokens=128), None),
        ("To be or not to be,",
         SamplingParams(temperature=0.8,
                        top_k=5,
                        presence_penalty=0.2,
                        max_tokens=128), None),
        (
            "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]",  # noqa: E501
            SamplingParams(temperature=0.0,
                           logprobs=1,
                           prompt_logprobs=1,
                           max_tokens=128,
                           stop_token_ids=[32003]),
            LoRARequest("sql-lora", 1, lora_path)),
        (
            "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]",  # noqa: E501
            SamplingParams(temperature=0.0,
                           logprobs=1,
                           prompt_logprobs=1,
                           max_tokens=128,
                           stop_token_ids=[32003]),
            LoRARequest("sql-lora2", 2, lora_path)),
    ]


def process_requests(engine: LLMEngine,
                     test_prompts: List[Tuple[str, SamplingParams,
                                              Optional[LoRARequest]]]):
    """Continuously process a list of prompts and handle the outputs."""
    request_id = 0

    while test_prompts or engine.has_unfinished_requests():
        if test_prompts:
            prompt, sampling_params, lora_request = test_prompts.pop(0)
            engine.add_request(str(request_id),
                               prompt,
                               sampling_params,
                               lora_request=lora_request)
            request_id += 1

        request_outputs: List[RequestOutput] = engine.step()

        for request_output in request_outputs:
            if request_output.finished:
                print(request_output)


def initialize_engine() -> LLMEngine:
    """Initialize the LLMEngine."""
    # max_loras: controls the number of LoRAs that can be used in the same
    #   batch. Larger numbers will cause higher memory usage, as each LoRA
    #   slot requires its own preallocated tensor.
    # max_lora_rank: controls the maximum supported rank of all LoRAs. Larger
    #   numbers will cause higher memory usage. If you know that all LoRAs will
    #   use the same rank, it is recommended to set this as low as possible.
    # max_cpu_loras: controls the size of the CPU LoRA cache.
    engine_args = EngineArgs(model="meta-llama/Llama-2-7b-hf",
                             enable_lora=True,
                             max_loras=1,
                             max_lora_rank=8,
                             max_cpu_loras=2,
                             max_num_seqs=256)
    return LLMEngine.from_engine_args(engine_args)


def main():
    """Main function that sets up and runs the prompt processing."""
    engine = initialize_engine()
    lora_path = snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test")
    test_prompts = create_test_prompts(lora_path)
    process_requests(engine, test_prompts)


if __name__ == '__main__':
    main()

examples/offline_inference/offline_chat_with_tools.py (new file)
@@ -0,0 +1,138 @@
# ruff: noqa
import json
import random
import string

from vllm import LLM
from vllm.sampling_params import SamplingParams

# This script is an offline demo for function calling
#
# If you want to run a server/client setup, please follow this code:
#
# - Server:
#
# ```bash
# vllm serve mistralai/Mistral-7B-Instruct-v0.3 --tokenizer-mode mistral --load-format mistral --config-format mistral
# ```
#
# - Client:
#
# ```bash
# curl --location 'http://<your-node-url>:8000/v1/chat/completions' \
# --header 'Content-Type: application/json' \
# --header 'Authorization: Bearer token' \
# --data '{
#     "model": "mistralai/Mistral-7B-Instruct-v0.3"
#     "messages": [
#       {
#         "role": "user",
#         "content": [
#             {"type" : "text", "text": "Describe this image in detail please."},
#             {"type": "image_url", "image_url": {"url": "https://s3.amazonaws.com/cms.ipressroom.com/338/files/201808/5b894ee1a138352221103195_A680%7Ejogging-edit/A680%7Ejogging-edit_hero.jpg"}},
#             {"type" : "text", "text": "and this one as well. Answer in French."},
#             {"type": "image_url", "image_url": {"url": "https://www.wolframcloud.com/obj/resourcesystem/images/a0e/a0ee3983-46c6-4c92-b85d-059044639928/6af8cfb971db031b.png"}}
#         ]
#       }
#     ]
#   }'
# ```
#
# Usage:
#     python demo.py simple
#     python demo.py advanced

model_name = "mistralai/Mistral-7B-Instruct-v0.3"
# or switch to "mistralai/Mistral-Nemo-Instruct-2407"
# or "mistralai/Mistral-Large-Instruct-2407"
# or any other mistral model with function calling ability

sampling_params = SamplingParams(max_tokens=8192, temperature=0.0)
llm = LLM(model=model_name,
          tokenizer_mode="mistral",
          config_format="mistral",
          load_format="mistral")


def generate_random_id(length=9):
    characters = string.ascii_letters + string.digits
    random_id = ''.join(random.choice(characters) for _ in range(length))
    return random_id


# simulate an API that can be called
def get_current_weather(city: str, state: str, unit: 'str'):
    return (f"The weather in {city}, {state} is 85 degrees {unit}. It is "
            "partly cloudly, with highs in the 90's.")


tool_funtions = {"get_current_weather": get_current_weather}

tools = [{
    "type": "function",
    "function": {
        "name": "get_current_weather",
        "description": "Get the current weather in a given location",
        "parameters": {
            "type": "object",
            "properties": {
                "city": {
                    "type":
                    "string",
                    "description":
                    "The city to find the weather for, e.g. 'San Francisco'"
                },
                "state": {
                    "type":
                    "string",
                    "description":
                    "the two-letter abbreviation for the state that the city is"
                    " in, e.g. 'CA' which would mean 'California'"
                },
                "unit": {
                    "type": "string",
                    "description": "The unit to fetch the temperature in",
                    "enum": ["celsius", "fahrenheit"]
                }
            },
            "required": ["city", "state", "unit"]
        }
    }
}]

messages = [{
    "role":
    "user",
    "content":
    "Can you tell me what the temperate will be in Dallas, in fahrenheit?"
}]

outputs = llm.chat(messages, sampling_params=sampling_params, tools=tools)
output = outputs[0].outputs[0].text.strip()

# append the assistant message
messages.append({
    "role": "assistant",
    "content": output,
})

# let's now actually parse and execute the model's output simulating an API call by using the
# above defined function
tool_calls = json.loads(output)
tool_answers = [
    tool_funtions[call['name']](**call['arguments']) for call in tool_calls
]

# append the answer as a tool message and let the LLM give you an answer
messages.append({
    "role": "tool",
    "content": "\n\n".join(tool_answers),
    "tool_call_id": generate_random_id(),
})

outputs = llm.chat(messages, sampling_params, tools=tools)

print(outputs[0].outputs[0].text.strip())
# yields
#   'The weather in Dallas, TX is 85 degrees fahrenheit. '
#   'It is partly cloudly, with highs in the 90's.'

examples/offline_inference/offline_inference.py (new file)
@@ -0,0 +1,22 @@
from vllm import LLM, SamplingParams

# Sample prompts.
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

# Create an LLM.
llm = LLM(model="facebook/opt-125m")
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

examples/offline_inference/offline_inference_arctic.py (new file)
@@ -0,0 +1,26 @@
from vllm import LLM, SamplingParams

# Sample prompts.
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

# Create an LLM.
llm = LLM(model="snowflake/snowflake-arctic-instruct",
          quantization="deepspeedfp",
          tensor_parallel_size=8,
          trust_remote_code=True)
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.

outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

examples/offline_inference/offline_inference_audio_language.py (new file)
@@ -0,0 +1,131 @@
"""
This example shows how to use vLLM for running offline inference
with the correct prompt format on audio language models.

For most models, the prompt format should follow corresponding examples
on HuggingFace model repository.
"""
from transformers import AutoTokenizer

from vllm import LLM, SamplingParams
from vllm.assets.audio import AudioAsset
from vllm.utils import FlexibleArgumentParser

audio_assets = [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")]
question_per_audio_count = {
    0: "What is 1+1?",
    1: "What is recited in the audio?",
    2: "What sport and what nursery rhyme are referenced?"
}

# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
# lower-end GPUs.
# Unless specified, these settings have been tested to work on a single L4.


# Ultravox 0.3
def run_ultravox(question: str, audio_count: int):
    model_name = "fixie-ai/ultravox-v0_3"

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    messages = [{
        'role': 'user',
        'content': "<|audio|>\n" * audio_count + question
    }]
    prompt = tokenizer.apply_chat_template(messages,
                                           tokenize=False,
                                           add_generation_prompt=True)

    llm = LLM(model=model_name,
              max_model_len=4096,
              max_num_seqs=5,
              trust_remote_code=True,
              limit_mm_per_prompt={"audio": audio_count})
    stop_token_ids = None
    return llm, prompt, stop_token_ids


# Qwen2-Audio
def run_qwen2_audio(question: str, audio_count: int):
    model_name = "Qwen/Qwen2-Audio-7B-Instruct"

    llm = LLM(model=model_name,
              max_model_len=4096,
              max_num_seqs=5,
              limit_mm_per_prompt={"audio": audio_count})

    audio_in_prompt = "".join([
        f"Audio {idx+1}: "
        f"<|audio_bos|><|AUDIO|><|audio_eos|>\n" for idx in range(audio_count)
    ])

    prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
              "<|im_start|>user\n"
              f"{audio_in_prompt}{question}<|im_end|>\n"
              "<|im_start|>assistant\n")
    stop_token_ids = None
    return llm, prompt, stop_token_ids


model_example_map = {"ultravox": run_ultravox, "qwen2_audio": run_qwen2_audio}


def main(args):
    model = args.model_type
    if model not in model_example_map:
        raise ValueError(f"Model type {model} is not supported.")

    audio_count = args.num_audios
    llm, prompt, stop_token_ids = model_example_map[model](
        question_per_audio_count[audio_count], audio_count)

    # We set temperature to 0.2 so that outputs can be different
    # even when all prompts are identical when running batch inference.
    sampling_params = SamplingParams(temperature=0.2,
                                     max_tokens=64,
                                     stop_token_ids=stop_token_ids)

    mm_data = {}
    if audio_count > 0:
        mm_data = {
            "audio": [
                asset.audio_and_sample_rate
                for asset in audio_assets[:audio_count]
            ]
        }

    assert args.num_prompts > 0
    inputs = {"prompt": prompt, "multi_modal_data": mm_data}
    if args.num_prompts > 1:
        # Batch inference
        inputs = [inputs] * args.num_prompts

    outputs = llm.generate(inputs, sampling_params=sampling_params)

    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)


if __name__ == "__main__":
    parser = FlexibleArgumentParser(
        description='Demo on using vLLM for offline inference with '
        'audio language models')
    parser.add_argument('--model-type',
                        '-m',
                        type=str,
                        default="ultravox",
                        choices=model_example_map.keys(),
                        help='Huggingface "model_type".')
    parser.add_argument('--num-prompts',
                        type=int,
                        default=1,
                        help='Number of prompts to run.')
    parser.add_argument("--num-audios",
                        type=int,
                        default=1,
                        choices=[0, 1, 2],
                        help="Number of audio items per prompt.")

    args = parser.parse_args()
    main(args)

examples/offline_inference/offline_inference_chat.py (new file)
@@ -0,0 +1,80 @@
from vllm import LLM, SamplingParams

llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct")
sampling_params = SamplingParams(temperature=0.5)


def print_outputs(outputs):
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
    print("-" * 80)


print("=" * 80)

# In this script, we demonstrate how to pass input to the chat method:

conversation = [
    {
        "role": "system",
        "content": "You are a helpful assistant"
    },
    {
        "role": "user",
        "content": "Hello"
    },
    {
        "role": "assistant",
        "content": "Hello! How can I assist you today?"
    },
    {
        "role": "user",
        "content": "Write an essay about the importance of higher education.",
    },
]
outputs = llm.chat(conversation,
                   sampling_params=sampling_params,
                   use_tqdm=False)
print_outputs(outputs)

# You can run batch inference with llm.chat API
conversation = [
    {
        "role": "system",
        "content": "You are a helpful assistant"
    },
    {
        "role": "user",
        "content": "Hello"
    },
    {
        "role": "assistant",
        "content": "Hello! How can I assist you today?"
    },
    {
        "role": "user",
        "content": "Write an essay about the importance of higher education.",
    },
]
conversations = [conversation for _ in range(10)]

# We turn on tqdm progress bar to verify it's indeed running batch inference
outputs = llm.chat(messages=conversations,
                   sampling_params=sampling_params,
                   use_tqdm=True)
print_outputs(outputs)

# A chat template can be optionally supplied.
# If not, the model will use its default chat template.

# with open('template_falcon_180b.jinja', "r") as f:
#     chat_template = f.read()

# outputs = llm.chat(
#     conversations,
#     sampling_params=sampling_params,
#     use_tqdm=False,
#     chat_template=chat_template,
# )

@@ -0,0 +1,28 @@
from vllm import LLM

# Sample prompts.
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]

# Create an LLM.
# You should pass task="classify" for classification models
model = LLM(
    model="jason9693/Qwen2.5-1.5B-apeach",
    task="classify",
    enforce_eager=True,
)

# Generate logits. The output is a list of ClassificationRequestOutputs.
outputs = model.classify(prompts)

# Print the outputs.
for prompt, output in zip(prompts, outputs):
    probs = output.outputs.probs
    probs_trimmed = ((str(probs[:16])[:-1] +
                      ", ...]") if len(probs) > 16 else probs)
    print(f"Prompt: {prompt!r} | "
          f"Class Probabilities: {probs_trimmed} (size={len(probs)})")

examples/offline_inference/offline_inference_cli.py (new file)
@@ -0,0 +1,80 @@
from dataclasses import asdict

from vllm import LLM, SamplingParams
from vllm.engine.arg_utils import EngineArgs
from vllm.utils import FlexibleArgumentParser


def get_prompts(num_prompts: int):
    # The default sample prompts.
    prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]

    if num_prompts != len(prompts):
        prompts = (prompts * ((num_prompts // len(prompts)) + 1))[:num_prompts]

    return prompts


def main(args):
    # Create prompts
    prompts = get_prompts(args.num_prompts)

    # Create a sampling params object.
    sampling_params = SamplingParams(n=args.n,
                                     temperature=args.temperature,
                                     top_p=args.top_p,
                                     top_k=args.top_k,
                                     max_tokens=args.max_tokens)

    # Create an LLM.
    # The default model is 'facebook/opt-125m'
    engine_args = EngineArgs.from_cli_args(args)
    llm = LLM(**asdict(engine_args))

    # Generate texts from the prompts.
    # The output is a list of RequestOutput objects
    # that contain the prompt, generated text, and other information.
    outputs = llm.generate(prompts, sampling_params)
    # Print the outputs.
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")


if __name__ == '__main__':
    parser = FlexibleArgumentParser()
    parser = EngineArgs.add_cli_args(parser)
    group = parser.add_argument_group("SamplingParams options")
    group.add_argument("--num-prompts",
                       type=int,
                       default=4,
                       help="Number of prompts used for inference")
    group.add_argument("--max-tokens",
                       type=int,
                       default=16,
                       help="Generated output length for sampling")
    group.add_argument('--n',
                       type=int,
                       default=1,
                       help='Number of generated sequences per prompt')
    group.add_argument('--temperature',
                       type=float,
                       default=0.8,
                       help='Temperature for text generation')
    group.add_argument('--top-p',
                       type=float,
                       default=0.95,
                       help='top_p for text generation')
    group.add_argument('--top-k',
                       type=int,
                       default=-1,
                       help='top_k for text generation')

    args = parser.parse_args()
    main(args)

examples/offline_inference/offline_inference_distributed.py (new file)
@@ -0,0 +1,108 @@
"""
This example shows how to use Ray Data for running offline batch inference
distributively on a multi-nodes cluster.

Learn more about Ray Data in https://docs.ray.io/en/latest/data/data.html
"""

from typing import Any, Dict, List

import numpy as np
import ray
from packaging.version import Version
from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy

from vllm import LLM, SamplingParams

assert Version(ray.__version__) >= Version(
    "2.22.0"), "Ray version must be at least 2.22.0"

# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

# Set tensor parallelism per instance.
tensor_parallel_size = 1

# Set number of instances. Each instance will use tensor_parallel_size GPUs.
num_instances = 1


# Create a class to do batch inference.
class LLMPredictor:

    def __init__(self):
        # Create an LLM.
        self.llm = LLM(model="meta-llama/Llama-2-7b-chat-hf",
                       tensor_parallel_size=tensor_parallel_size)

    def __call__(self, batch: Dict[str, np.ndarray]) -> Dict[str, list]:
        # Generate texts from the prompts.
        # The output is a list of RequestOutput objects that contain the prompt,
        # generated text, and other information.
        outputs = self.llm.generate(batch["text"], sampling_params)
        prompt: List[str] = []
        generated_text: List[str] = []
        for output in outputs:
            prompt.append(output.prompt)
            generated_text.append(' '.join([o.text for o in output.outputs]))
        return {
            "prompt": prompt,
            "generated_text": generated_text,
        }


# Read one text file from S3. Ray Data supports reading multiple files
# from cloud storage (such as JSONL, Parquet, CSV, binary format).
ds = ray.data.read_text("s3://anonymous@air-example-data/prompts.txt")


# For tensor_parallel_size > 1, we need to create placement groups for vLLM
# to use. Every actor has to have its own placement group.
def scheduling_strategy_fn():
    # One bundle per tensor parallel worker
    pg = ray.util.placement_group(
        [{
            "GPU": 1,
            "CPU": 1
        }] * tensor_parallel_size,
        strategy="STRICT_PACK",
    )
    return dict(scheduling_strategy=PlacementGroupSchedulingStrategy(
        pg, placement_group_capture_child_tasks=True))


resources_kwarg: Dict[str, Any] = {}
if tensor_parallel_size == 1:
    # For tensor_parallel_size == 1, we simply set num_gpus=1.
    resources_kwarg["num_gpus"] = 1
else:
    # Otherwise, we have to set num_gpus=0 and provide
    # a function that will create a placement group for
    # each instance.
    resources_kwarg["num_gpus"] = 0
    resources_kwarg["ray_remote_args_fn"] = scheduling_strategy_fn

# Apply batch inference for all input data.
ds = ds.map_batches(
    LLMPredictor,
    # Set the concurrency to the number of LLM instances.
    concurrency=num_instances,
    # Specify the batch size for inference.
    batch_size=32,
    **resources_kwarg,
)

# Peek first 10 results.
# NOTE: This is for local testing and debugging. For production use case,
# one should write full result out as shown below.
outputs = ds.take(limit=10)
for output in outputs:
    prompt = output["prompt"]
    generated_text = output["generated_text"]
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

# Write inference output data out as Parquet files to S3.
# Multiple files would be written to the output destination,
# and each task would write one or more files separately.
#
# ds.write_parquet("s3://<your-output-bucket>")

examples/offline_inference/offline_inference_embedding.py (new file)
@@ -0,0 +1,28 @@
from vllm import LLM

# Sample prompts.
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]

# Create an LLM.
# You should pass task="embed" for embedding models
model = LLM(
    model="intfloat/e5-mistral-7b-instruct",
    task="embed",
    enforce_eager=True,
)

# Generate embedding. The output is a list of EmbeddingRequestOutputs.
outputs = model.embed(prompts)

# Print the outputs.
for prompt, output in zip(prompts, outputs):
    embeds = output.outputs.embedding
    embeds_trimmed = ((str(embeds[:16])[:-1] +
                       ", ...]") if len(embeds) > 16 else embeds)
    print(f"Prompt: {prompt!r} | "
          f"Embeddings: {embeds_trimmed} (size={len(embeds)})")

@@ -0,0 +1,99 @@
'''
Demonstrate prompting of text-to-text
encoder/decoder models, specifically BART
'''

from vllm import LLM, SamplingParams
from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt,
                         TokensPrompt, zip_enc_dec_prompts)

dtype = "float"

# Create a BART encoder/decoder model instance
llm = LLM(
    model="facebook/bart-large-cnn",
    dtype=dtype,
)

# Get BART tokenizer
tokenizer = llm.llm_engine.get_tokenizer_group()

# Test prompts
#
# This section shows all of the valid ways to prompt an
# encoder/decoder model.
#
# - Helpers for building prompts
text_prompt_raw = "Hello, my name is"
text_prompt = TextPrompt(prompt="The president of the United States is")
tokens_prompt = TokensPrompt(prompt_token_ids=tokenizer.encode(
    prompt="The capital of France is"))
# - Pass a single prompt to encoder/decoder model
#   (implicitly encoder input prompt);
#   decoder input prompt is assumed to be None

single_text_prompt_raw = text_prompt_raw  # Pass a string directly
single_text_prompt = text_prompt  # Pass a TextPrompt
single_tokens_prompt = tokens_prompt  # Pass a TokensPrompt

# - Pass explicit encoder and decoder input prompts within one data structure.
#   Encoder and decoder prompts can both independently be text or tokens, with
#   no requirement that they be the same prompt type. Some example prompt-type
#   combinations are shown below, note that these are not exhaustive.

enc_dec_prompt1 = ExplicitEncoderDecoderPrompt(
    # Pass encoder prompt string directly, &
    # pass decoder prompt tokens
    encoder_prompt=single_text_prompt_raw,
    decoder_prompt=single_tokens_prompt,
)
enc_dec_prompt2 = ExplicitEncoderDecoderPrompt(
    # Pass TextPrompt to encoder, and
    # pass decoder prompt string directly
    encoder_prompt=single_text_prompt,
    decoder_prompt=single_text_prompt_raw,
)
enc_dec_prompt3 = ExplicitEncoderDecoderPrompt(
    # Pass encoder prompt tokens directly, and
    # pass TextPrompt to decoder
    encoder_prompt=single_tokens_prompt,
    decoder_prompt=single_text_prompt,
)

# - Finally, here's a useful helper function for zipping encoder and
#   decoder prompts together into a list of ExplicitEncoderDecoderPrompt
#   instances
zipped_prompt_list = zip_enc_dec_prompts(
    ['An encoder prompt', 'Another encoder prompt'],
    ['A decoder prompt', 'Another decoder prompt'])

# - Let's put all of the above example prompts together into one list
#   which we will pass to the encoder/decoder LLM.
prompts = [
    single_text_prompt_raw, single_text_prompt, single_tokens_prompt,
    enc_dec_prompt1, enc_dec_prompt2, enc_dec_prompt3
] + zipped_prompt_list

print(prompts)

# Create a sampling params object.
sampling_params = SamplingParams(
    temperature=0,
    top_p=1.0,
    min_tokens=0,
    max_tokens=20,
)

# Generate output tokens from the prompts. The output is a list of
# RequestOutput objects that contain the prompt, generated
# text, and other information.
outputs = llm.generate(prompts, sampling_params)

# Print the outputs.
for output in outputs:
    prompt = output.prompt
    encoder_prompt = output.encoder_prompt
    generated_text = output.outputs[0].text
    print(f"Encoder prompt: {encoder_prompt!r}, "
          f"Decoder prompt: {prompt!r}, "
          f"Generated text: {generated_text!r}")

@@ -0,0 +1,56 @@
import gc
import time
from typing import List

from vllm import LLM, SamplingParams


def time_generation(llm: LLM, prompts: List[str],
                    sampling_params: SamplingParams):
    # Generate texts from the prompts. The output is a list of RequestOutput
    # objects that contain the prompt, generated text, and other information.
    # Warmup first
    llm.generate(prompts, sampling_params)
    llm.generate(prompts, sampling_params)
    start = time.time()
    outputs = llm.generate(prompts, sampling_params)
    end = time.time()
    print((end - start) / sum([len(o.outputs[0].token_ids) for o in outputs]))
    # Print the outputs.
    for output in outputs:
        generated_text = output.outputs[0].text
        print(f"text: {generated_text!r}")


if __name__ == "__main__":

    template = (
        "Below is an instruction that describes a task. Write a response "
        "that appropriately completes the request.\n\n### Instruction:\n{}"
        "\n\n### Response:\n")

    # Sample prompts.
    prompts = [
        "Write about the president of the United States.",
    ]
    prompts = [template.format(prompt) for prompt in prompts]
    # Create a sampling params object.
    sampling_params = SamplingParams(temperature=0.0, max_tokens=200)

    # Create an LLM without spec decoding
    llm = LLM(model="meta-llama/Llama-2-13b-chat-hf")

    print("Without speculation")
    time_generation(llm, prompts, sampling_params)

    del llm
    gc.collect()

    # Create an LLM with spec decoding
    llm = LLM(
        model="meta-llama/Llama-2-13b-chat-hf",
        speculative_model="ibm-fms/llama-13b-accelerator",
    )

    print("With speculation")
    time_generation(llm, prompts, sampling_params)

examples/offline_inference/offline_inference_neuron.py (new file)
@@ -0,0 +1,36 @@
from vllm import LLM, SamplingParams

# Sample prompts.
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

# Create an LLM.
llm = LLM(
    model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    max_num_seqs=8,
    # The max_model_len and block_size arguments are required to be same as
    # max sequence length when targeting neuron device.
    # Currently, this is a known limitation in continuous batching support
    # in transformers-neuronx.
    # TODO(liangfu): Support paged-attention in transformers-neuronx.
    max_model_len=1024,
    block_size=1024,
    # The device can be automatically detected when AWS Neuron SDK is installed.
    # The device argument can be either unspecified for automated detection,
    # or explicitly assigned.
    device="neuron",
    tensor_parallel_size=2)
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

@@ -0,0 +1,50 @@
import os

from vllm import LLM, SamplingParams

# creates XLA hlo graphs for all the context length buckets.
os.environ['NEURON_CONTEXT_LENGTH_BUCKETS'] = "128,512,1024,2048"
# creates XLA hlo graphs for all the token gen buckets.
os.environ['NEURON_TOKEN_GEN_BUCKETS'] = "128,512,1024,2048"
# Quantizes neuron model weight to int8.
# The default config for quantization is int8 dtype.
os.environ['NEURON_QUANT_DTYPE'] = "s8"

# Sample prompts.
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

# Create an LLM.
llm = LLM(
    model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    max_num_seqs=8,
    # The max_model_len and block_size arguments are required to be same as
    # max sequence length when targeting neuron device.
    # Currently, this is a known limitation in continuous batching support
    # in transformers-neuronx.
    # TODO(liangfu): Support paged-attention in transformers-neuronx.
    max_model_len=2048,
    block_size=2048,
    # The device can be automatically detected when AWS Neuron SDK is installed.
    # The device argument can be either unspecified for automated detection,
    # or explicitly assigned.
    device="neuron",
    quantization="neuron_quant",
    override_neuron_config={
        "cast_logits_dtype": "bfloat16",
    },
    tensor_parallel_size=2)
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
@ -0,0 +1,205 @@
|
||||
# Offline Inference with the OpenAI Batch file format
|
||||
|
||||
```{important}
|
||||
This is a guide to performing batch inference using the OpenAI batch file format, **not** the complete Batch (REST) API.
|
||||
```
|
||||
|
||||
## File Format
|
||||
|
||||
The OpenAI batch file format consists of a series of json objects on new lines.
|
||||
|
||||
[See here for an example file.](https://github.com/vllm-project/vllm/blob/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl)
|
||||
|
||||
Each line represents a separate request. See the [OpenAI package reference](https://platform.openai.com/docs/api-reference/batch/requestInput) for more details.
|
||||
|
||||
```{note}
|
||||
We currently only support `/v1/chat/completions` and `/v1/embeddings` endpoints (completions coming soon).
|
||||
```
|
||||
|
||||
## Pre-requisites
|
||||
|
||||
* The examples in this document use `meta-llama/Meta-Llama-3-8B-Instruct`.
|
||||
- Create a [user access token](https://huggingface.co/docs/hub/en/security-tokens)
|
||||
- Install the token on your machine (Run `huggingface-cli login`).
|
||||
- Get access to the gated model by [visiting the model card](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) and agreeing to the terms and conditions.
|
||||
|
||||
|
||||
## Example 1: Running with a local file
|
||||
|
||||
### Step 1: Create your batch file
|
||||
|
||||
To follow along with this example, you can download the example batch, or create your own batch file in your working directory.
|
||||
|
||||
```
|
||||
wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl
|
||||
```
|
||||
|
||||
Once you've created your batch file it should look like this
|
||||
|
||||
```
|
||||
$ cat offline_inference/offline_inference_openai/openai_example_batch.jsonl
|
||||
{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
|
||||
{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
|
||||
```
|
||||
|
||||
### Step 2: Run the batch
|
||||
|
||||
The batch running tool is designed to be used from the command line.
|
||||
|
||||
You can run the batch with the following command, which will write its results to a file called `results.jsonl`
|
||||
|
||||
```
|
||||
python -m vllm.entrypoints.openai.run_batch -i offline_inference/offline_inference_openai/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct
|
||||
```
|
||||
|
||||
### Step 3: Check your results
|
||||
|
||||
You should now have your results at `results.jsonl`. You can check your results by running `cat results.jsonl`
|
||||
|
||||
```
|
||||
$ cat results.jsonl
|
||||
{"id":"vllm-383d1c59835645aeb2e07d004d62a826","custom_id":"request-1","response":{"id":"cmpl-61c020e54b964d5a98fa7527bfcdd378","object":"chat.completion","created":1715633336,"model":"meta-llama/Meta-Llama-3-8B-Instruct","choices":[{"index":0,"message":{"role":"assistant","content":"Hello! It's great to meet you! I'm here to help with any questions or tasks you may have. What's on your mind today?"},"logprobs":null,"finish_reason":"stop","stop_reason":null}],"usage":{"prompt_tokens":25,"total_tokens":56,"completion_tokens":31}},"error":null}
|
||||
{"id":"vllm-42e3d09b14b04568afa3f1797751a267","custom_id":"request-2","response":{"id":"cmpl-f44d049f6b3a42d4b2d7850bb1e31bcc","object":"chat.completion","created":1715633336,"model":"meta-llama/Meta-Llama-3-8B-Instruct","choices":[{"index":0,"message":{"role":"assistant","content":"*silence*"},"logprobs":null,"finish_reason":"stop","stop_reason":null}],"usage":{"prompt_tokens":27,"total_tokens":32,"completion_tokens":5}},"error":null}
|
||||
```
|
||||
|
||||
## Example 2: Using remote files
|
||||
|
||||
The batch runner supports remote input and output urls that are accessible via http/https.
|
||||
|
||||
For example, to run against our example input file located at `https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl`, you can run
|
||||
|
||||
```
|
||||
python -m vllm.entrypoints.openai.run_batch -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct
|
||||
```
|
||||
|
||||
## Example 3: Integrating with AWS S3
|
||||
|
||||
To integrate with cloud blob storage, we recommend using presigned urls.
|
||||
|
||||
[Learn more about S3 presigned urls here]
|
||||
|
||||
### Additional prerequisites
|
||||
|
||||
* [Create an S3 bucket](https://docs.aws.amazon.com/AmazonS3/latest/userguide/creating-bucket.html).
|
||||
* The `awscli` package (Run `pip install awscli`) to configure your credentials and interactively use s3.
|
||||
- [Configure your credentials](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-quickstart.html).
|
||||
* The `boto3` python package (Run `pip install boto3`) to generate presigned urls.
|
||||
|
||||
### Step 1: Upload your input script
|
||||
|
||||
To follow along with this example, you can download the example batch, or create your own batch file in your working directory.
|
||||
|
||||
```
|
||||
wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/offline_inference_openai/openai_example_batch.jsonl
|
||||
```
|
||||
|
||||
Once you've created your batch file it should look like this
|
||||
|
||||
```
|
||||
$ cat offline_inference/offline_inference_openai/openai_example_batch.jsonl
|
||||
{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
|
||||
{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
|
||||
```
|
||||
|
||||
Now upload your batch file to your S3 bucket.
|
||||
|
||||
```
|
||||
aws s3 cp offline_inference/offline_inference_openai/openai_example_batch.jsonl s3://MY_BUCKET/MY_INPUT_FILE.jsonl
|
||||
```
|
||||
|
||||
### Step 2: Generate your presigned urls
|
||||
|
||||
Presigned urls can only be generated via the SDK. You can run the following python script to generate your presigned urls. Be sure to replace the `MY_BUCKET`, `MY_INPUT_FILE.jsonl`, and `MY_OUTPUT_FILE.jsonl` placeholders with your bucket and file names.
|
||||
|
||||
(The script is adapted from https://github.com/awsdocs/aws-doc-sdk-examples/blob/main/python/example_code/s3/s3_basics/presigned_url.py)
|
||||
|
||||
```
|
||||
import boto3
|
||||
from botocore.exceptions import ClientError
|
||||
|
||||
def generate_presigned_url(s3_client, client_method, method_parameters, expires_in):
|
||||
"""
|
||||
Generate a presigned Amazon S3 URL that can be used to perform an action.
|
||||
|
||||
:param s3_client: A Boto3 Amazon S3 client.
|
||||
:param client_method: The name of the client method that the URL performs.
|
||||
:param method_parameters: The parameters of the specified client method.
|
||||
:param expires_in: The number of seconds the presigned URL is valid for.
|
||||
:return: The presigned URL.
|
||||
"""
|
||||
try:
|
||||
url = s3_client.generate_presigned_url(
|
||||
ClientMethod=client_method, Params=method_parameters, ExpiresIn=expires_in
|
||||
)
|
||||
except ClientError:
|
||||
raise
|
||||
return url
|
||||
|
||||
|
||||
s3_client = boto3.client("s3")
|
||||
input_url = generate_presigned_url(
|
||||
s3_client, "get_object", {"Bucket": "MY_BUCKET", "Key": "MY_INPUT_FILE.jsonl"}, 3600
|
||||
)
|
||||
output_url = generate_presigned_url(
|
||||
s3_client, "put_object", {"Bucket": "MY_BUCKET", "Key": "MY_OUTPUT_FILE.jsonl"}, 3600
|
||||
)
|
||||
print(f"{input_url=}")
|
||||
print(f"{output_url=}")
|
||||
```
|
||||
|
||||
This script should output
|
||||
|
||||
```
|
||||
input_url='https://s3.us-west-2.amazonaws.com/MY_BUCKET/MY_INPUT_FILE.jsonl?AWSAccessKeyId=ABCDEFGHIJKLMNOPQRST&Signature=abcdefghijklmnopqrstuvwxyz12345&Expires=1715800091'
|
||||
output_url='https://s3.us-west-2.amazonaws.com/MY_BUCKET/MY_OUTPUT_FILE.jsonl?AWSAccessKeyId=ABCDEFGHIJKLMNOPQRST&Signature=abcdefghijklmnopqrstuvwxyz12345&Expires=1715800091'
|
||||
```
|
||||
|
||||
### Step 3: Run the batch runner using your presigned urls
|
||||
|
||||
You can now run the batch runner, using the urls generated in the previous section.
|
||||
|
||||
```
|
||||
python -m vllm.entrypoints.openai.run_batch \
|
||||
-i "https://s3.us-west-2.amazonaws.com/MY_BUCKET/MY_INPUT_FILE.jsonl?AWSAccessKeyId=ABCDEFGHIJKLMNOPQRST&Signature=abcdefghijklmnopqrstuvwxyz12345&Expires=1715800091" \
|
||||
-o "https://s3.us-west-2.amazonaws.com/MY_BUCKET/MY_OUTPUT_FILE.jsonl?AWSAccessKeyId=ABCDEFGHIJKLMNOPQRST&Signature=abcdefghijklmnopqrstuvwxyz12345&Expires=1715800091" \
|
||||
--model --model meta-llama/Meta-Llama-3-8B-Instruct
|
||||
```
|
||||
|
||||
### Step 4: View your results
|
||||
|
||||
Your results are now on S3. You can view them in your terminal by running
|
||||
|
||||
```
|
||||
aws s3 cp s3://MY_BUCKET/MY_OUTPUT_FILE.jsonl -
|
||||
```
|
||||
|
||||
## Example 4: Using embeddings endpoint
|
||||
|
||||
### Additional prerequisites
|
||||
|
||||
* Ensure you are using `vllm >= 0.5.5`.
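
If needed, you can upgrade with pip:

```
pip install "vllm>=0.5.5"
```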

### Step 1: Create your batch file

Add embedding requests to your batch file. The following is an example:

```
{"custom_id": "request-1", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/e5-mistral-7b-instruct", "input": "You are a helpful assistant."}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/e5-mistral-7b-instruct", "input": "You are an unhelpful assistant."}}
```

You can even mix chat completion and embedding requests in the batch file, as long as the model you are using supports both chat completion and embeddings (note that all requests must use the same model).
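
For illustration, a mixed batch might look like the following, where `MY_MODEL` is a placeholder for a model that supports both tasks:

```
{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "MY_MODEL", "messages": [{"role": "user", "content": "Hello world!"}], "max_completion_tokens": 1000}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/embeddings", "body": {"model": "MY_MODEL", "input": "Hello world!"}}
```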

### Step 2: Run the batch

You can run the batch using the same command as in the earlier examples; for instance:
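
A minimal sketch, assuming the embedding requests above are saved locally as `openai_example_batch.jsonl` (the input and output file names here are placeholders):

```
python -m vllm.entrypoints.openai.run_batch -i openai_example_batch.jsonl -o results.jsonl --model intfloat/e5-mistral-7b-instruct
```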

### Step 3: Check your results

You can check your results by running `cat results.jsonl`.

```
$ cat results.jsonl
{"id":"vllm-db0f71f7dec244e6bce530e0b4ef908b","custom_id":"request-1","response":{"status_code":200,"request_id":"vllm-batch-3580bf4d4ae54d52b67eee266a6eab20","body":{"id":"embd-33ac2efa7996430184461f2e38529746","object":"list","created":444647,"model":"intfloat/e5-mistral-7b-instruct","data":[{"index":0,"object":"embedding","embedding":[0.016204833984375,0.0092010498046875,0.0018358230590820312,-0.0028228759765625,0.001422882080078125,-0.0031147003173828125,...]}],"usage":{"prompt_tokens":8,"total_tokens":8,"completion_tokens":0}}},"error":null}
...
```

@ -0,0 +1,2 @@
{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
165
examples/offline_inference/offline_inference_pixtral.py
Normal file
@ -0,0 +1,165 @@
# ruff: noqa
|
||||
import argparse
|
||||
|
||||
from vllm import LLM
|
||||
from vllm.sampling_params import SamplingParams
|
||||
|
||||
# This script is an offline demo for running Pixtral.
|
||||
#
|
||||
# If you want to run a server/client setup, please follow this code:
|
||||
#
|
||||
# - Server:
|
||||
#
|
||||
# ```bash
|
||||
# vllm serve mistralai/Pixtral-12B-2409 --tokenizer-mode mistral --limit-mm-per-prompt 'image=4' --max-model-len 16384
|
||||
# ```
|
||||
#
|
||||
# - Client:
|
||||
#
|
||||
# ```bash
|
||||
# curl --location 'http://<your-node-url>:8000/v1/chat/completions' \
|
||||
# --header 'Content-Type: application/json' \
|
||||
# --header 'Authorization: Bearer token' \
|
||||
# --data '{
|
||||
# "model": "mistralai/Pixtral-12B-2409",
|
||||
# "messages": [
|
||||
# {
|
||||
# "role": "user",
|
||||
# "content": [
|
||||
# {"type" : "text", "text": "Describe this image in detail please."},
|
||||
# {"type": "image_url", "image_url": {"url": "https://s3.amazonaws.com/cms.ipressroom.com/338/files/201808/5b894ee1a138352221103195_A680%7Ejogging-edit/A680%7Ejogging-edit_hero.jpg"}},
|
||||
# {"type" : "text", "text": "and this one as well. Answer in French."},
|
||||
# {"type": "image_url", "image_url": {"url": "https://www.wolframcloud.com/obj/resourcesystem/images/a0e/a0ee3983-46c6-4c92-b85d-059044639928/6af8cfb971db031b.png"}}
|
||||
# ]
|
||||
# }
|
||||
# ]
|
||||
# }'
|
||||
# ```
|
||||
#
|
||||
# Usage:
|
||||
# python demo.py simple
|
||||
# python demo.py advanced
|
||||
|
||||
|
||||
def run_simple_demo():
|
||||
model_name = "mistralai/Pixtral-12B-2409"
|
||||
sampling_params = SamplingParams(max_tokens=8192)
|
||||
|
||||
# Lower max_num_seqs or max_model_len on low-VRAM GPUs.
|
||||
llm = LLM(model=model_name, tokenizer_mode="mistral")
|
||||
|
||||
prompt = "Describe this image in one sentence."
|
||||
image_url = "https://picsum.photos/id/237/200/300"
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role":
|
||||
"user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": prompt
|
||||
},
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": image_url
|
||||
}
|
||||
},
|
||||
],
|
||||
},
|
||||
]
|
||||
outputs = llm.chat(messages, sampling_params=sampling_params)
|
||||
|
||||
print(outputs[0].outputs[0].text)
|
||||
|
||||
|
||||
def run_advanced_demo():
|
||||
model_name = "mistralai/Pixtral-12B-2409"
|
||||
max_img_per_msg = 5
|
||||
max_tokens_per_img = 4096
|
||||
|
||||
sampling_params = SamplingParams(max_tokens=8192, temperature=0.7)
|
||||
llm = LLM(
|
||||
model=model_name,
|
||||
tokenizer_mode="mistral",
|
||||
limit_mm_per_prompt={"image": max_img_per_msg},
|
||||
max_model_len=max_img_per_msg * max_tokens_per_img,
|
||||
)
|
||||
|
||||
prompt = "Describe the following image."
|
||||
|
||||
url_1 = "https://huggingface.co/datasets/patrickvonplaten/random_img/resolve/main/yosemite.png"
|
||||
url_2 = "https://picsum.photos/seed/picsum/200/300"
|
||||
url_3 = "https://picsum.photos/id/32/512/512"
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role":
|
||||
"user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": prompt
|
||||
},
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": url_1
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": url_2
|
||||
}
|
||||
},
|
||||
],
|
||||
},
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": "The images show nature.",
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
"content": "More details please and answer only in French!.",
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": url_3
|
||||
}
|
||||
},
|
||||
],
|
||||
},
|
||||
]
|
||||
|
||||
outputs = llm.chat(messages=messages, sampling_params=sampling_params)
|
||||
print(outputs[0].outputs[0].text)
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Run a demo in simple or advanced mode.")
|
||||
|
||||
parser.add_argument(
|
||||
"mode",
|
||||
choices=["simple", "advanced"],
|
||||
help="Specify the demo mode: 'simple' or 'advanced'",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.mode == "simple":
|
||||
print("Running simple demo...")
|
||||
run_simple_demo()
|
||||
elif args.mode == "advanced":
|
||||
print("Running advanced demo...")
|
||||
run_advanced_demo()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
23
examples/offline_inference/offline_inference_scoring.py
Normal file
@ -0,0 +1,23 @@
from vllm import LLM

# Sample prompts.
text_1 = "What is the capital of France?"
texts_2 = [
    "The capital of Brazil is Brasilia.", "The capital of France is Paris."
]

# Create an LLM.
# You should pass task="score" for cross-encoder models
model = LLM(
    model="BAAI/bge-reranker-v2-m3",
    task="score",
    enforce_eager=True,
)

# Generate scores. The output is a list of ScoringRequestOutputs.
outputs = model.score(text_1, texts_2)

# Print the outputs.
for text_2, output in zip(texts_2, outputs):
    score = output.outputs.score
    print(f"Pair: {[text_1, text_2]!r} | Score: {score}")
@ -0,0 +1,78 @@
|
||||
from enum import Enum
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.sampling_params import GuidedDecodingParams
|
||||
|
||||
llm = LLM(model="Qwen/Qwen2.5-3B-Instruct", max_model_len=100)
|
||||
|
||||
# Guided decoding by Choice (list of possible options)
|
||||
guided_decoding_params = GuidedDecodingParams(choice=["Positive", "Negative"])
|
||||
sampling_params = SamplingParams(guided_decoding=guided_decoding_params)
|
||||
outputs = llm.generate(
|
||||
prompts="Classify this sentiment: vLLM is wonderful!",
|
||||
sampling_params=sampling_params,
|
||||
)
|
||||
print(outputs[0].outputs[0].text)
|
||||
|
||||
# Guided decoding by Regex
|
||||
guided_decoding_params = GuidedDecodingParams(regex=r"\w+@\w+\.com\n")
|
||||
sampling_params = SamplingParams(guided_decoding=guided_decoding_params,
|
||||
stop=["\n"])
|
||||
prompt = ("Generate an email address for Alan Turing, who works in Enigma."
|
||||
"End in .com and new line. Example result:"
|
||||
"alan.turing@enigma.com\n")
|
||||
outputs = llm.generate(prompts=prompt, sampling_params=sampling_params)
|
||||
print(outputs[0].outputs[0].text)
|
||||
|
||||
|
||||
# Guided decoding by JSON using Pydantic schema
|
||||
class CarType(str, Enum):
|
||||
sedan = "sedan"
|
||||
suv = "SUV"
|
||||
truck = "Truck"
|
||||
coupe = "Coupe"
|
||||
|
||||
|
||||
class CarDescription(BaseModel):
|
||||
brand: str
|
||||
model: str
|
||||
car_type: CarType
|
||||
|
||||
|
||||
json_schema = CarDescription.model_json_schema()
|
||||
|
||||
guided_decoding_params = GuidedDecodingParams(json=json_schema)
|
||||
sampling_params = SamplingParams(guided_decoding=guided_decoding_params)
|
||||
prompt = ("Generate a JSON with the brand, model and car_type of"
|
||||
"the most iconic car from the 90's")
|
||||
outputs = llm.generate(
|
||||
prompts=prompt,
|
||||
sampling_params=sampling_params,
|
||||
)
|
||||
print(outputs[0].outputs[0].text)
|
||||
|
||||
# Guided decoding by Grammar
|
||||
simplified_sql_grammar = """
|
||||
?start: select_statement
|
||||
|
||||
?select_statement: "SELECT " column_list " FROM " table_name
|
||||
|
||||
?column_list: column_name ("," column_name)*
|
||||
|
||||
?table_name: identifier
|
||||
|
||||
?column_name: identifier
|
||||
|
||||
?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/
|
||||
"""
|
||||
guided_decoding_params = GuidedDecodingParams(grammar=simplified_sql_grammar)
|
||||
sampling_params = SamplingParams(guided_decoding=guided_decoding_params)
|
||||
prompt = ("Generate an SQL query to show the 'username' and 'email'"
|
||||
"from the 'users' table.")
|
||||
outputs = llm.generate(
|
||||
prompts=prompt,
|
||||
sampling_params=sampling_params,
|
||||
)
|
||||
print(outputs[0].outputs[0].text)
|
||||
28
examples/offline_inference/offline_inference_tpu.py
Normal file
@ -0,0 +1,28 @@
from vllm import LLM, SamplingParams

prompts = [
    "A robot may not injure a human being",
    "It is only with the heart that one can see rightly;",
    "The greatest glory in living lies not in never falling,",
]
answers = [
    " or, through inaction, allow a human being to come to harm.",
    " what is essential is invisible to the eye.",
    " but in rising every time we fall.",
]
N = 1
# Currently, top-p sampling is disabled. `top_p` should be 1.0.
sampling_params = SamplingParams(temperature=0.7,
                                 top_p=1.0,
                                 n=N,
                                 max_tokens=16)

# Set `enforce_eager=True` to avoid ahead-of-time compilation.
# In real workloads, `enforce_eager` should be `False`.
llm = LLM(model="google/gemma-2b", enforce_eager=True)
outputs = llm.generate(prompts, sampling_params)
for output, answer in zip(outputs, answers):
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
    assert generated_text.startswith(answer)
687
examples/offline_inference/offline_inference_vision_language.py
Normal file
@ -0,0 +1,687 @@
"""
|
||||
This example shows how to use vLLM for running offline inference with
|
||||
the correct prompt format on vision language models for text generation.
|
||||
|
||||
For most models, the prompt format should follow corresponding examples
|
||||
on HuggingFace model repository.
|
||||
"""
|
||||
import random
|
||||
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.assets.image import ImageAsset
|
||||
from vllm.assets.video import VideoAsset
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
|
||||
# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
|
||||
# lower-end GPUs.
|
||||
# Unless specified, these settings have been tested to work on a single L4.
|
||||
|
||||
|
||||
# Aria
|
||||
def run_aria(question: str, modality: str):
|
||||
assert modality == "image"
|
||||
model_name = "rhymes-ai/Aria"
|
||||
|
||||
# NOTE: Need L40 (or equivalent) to avoid OOM
|
||||
llm = LLM(model=model_name,
|
||||
tokenizer_mode="slow",
|
||||
dtype="bfloat16",
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
trust_remote_code=True,
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
|
||||
|
||||
prompt = (f"<|im_start|>user\n<fim_prefix><|img|><fim_suffix>\n{question}"
|
||||
"<|im_end|>\n<|im_start|>assistant\n")
|
||||
|
||||
stop_token_ids = [93532, 93653, 944, 93421, 1019, 93653, 93519]
|
||||
return llm, prompt, stop_token_ids
|
||||
|
||||
|
||||
# BLIP-2
|
||||
def run_blip2(question: str, modality: str):
|
||||
assert modality == "image"
|
||||
|
||||
# BLIP-2 prompt format is inaccurate on HuggingFace model repository.
|
||||
# See https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262 #noqa
|
||||
prompt = f"Question: {question} Answer:"
|
||||
llm = LLM(model="Salesforce/blip2-opt-2.7b",
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
|
||||
stop_token_ids = None
|
||||
return llm, prompt, stop_token_ids
|
||||
|
||||
|
||||
# Chameleon
|
||||
def run_chameleon(question: str, modality: str):
|
||||
assert modality == "image"
|
||||
|
||||
prompt = f"{question}<image>"
|
||||
llm = LLM(model="facebook/chameleon-7b",
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
|
||||
stop_token_ids = None
|
||||
return llm, prompt, stop_token_ids
|
||||
|
||||
|
||||
# Fuyu
|
||||
def run_fuyu(question: str, modality: str):
|
||||
assert modality == "image"
|
||||
|
||||
prompt = f"{question}\n"
|
||||
llm = LLM(model="adept/fuyu-8b",
|
||||
max_model_len=2048,
|
||||
max_num_seqs=2,
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
|
||||
stop_token_ids = None
|
||||
return llm, prompt, stop_token_ids
|
||||
|
||||
|
||||
# GLM-4v
|
||||
def run_glm4v(question: str, modality: str):
|
||||
assert modality == "image"
|
||||
model_name = "THUDM/glm-4v-9b"
|
||||
|
||||
llm = LLM(model=model_name,
|
||||
max_model_len=2048,
|
||||
max_num_seqs=2,
|
||||
trust_remote_code=True,
|
||||
enforce_eager=True,
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
|
||||
prompt = question
|
||||
stop_token_ids = [151329, 151336, 151338]
|
||||
return llm, prompt, stop_token_ids
|
||||
|
||||
|
||||
# H2OVL-Mississippi
|
||||
def run_h2ovl(question: str, modality: str):
|
||||
assert modality == "image"
|
||||
|
||||
model_name = "h2oai/h2ovl-mississippi-2b"
|
||||
|
||||
llm = LLM(
|
||||
model=model_name,
|
||||
trust_remote_code=True,
|
||||
max_model_len=8192,
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
|
||||
)
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_name,
|
||||
trust_remote_code=True)
|
||||
messages = [{'role': 'user', 'content': f"<image>\n{question}"}]
|
||||
prompt = tokenizer.apply_chat_template(messages,
|
||||
tokenize=False,
|
||||
add_generation_prompt=True)
|
||||
|
||||
# Stop tokens for H2OVL-Mississippi
|
||||
# https://huggingface.co/h2oai/h2ovl-mississippi-2b
|
||||
stop_token_ids = [tokenizer.eos_token_id]
|
||||
return llm, prompt, stop_token_ids
|
||||
|
||||
|
||||
# Idefics3-8B-Llama3
|
||||
def run_idefics3(question: str, modality: str):
|
||||
assert modality == "image"
|
||||
model_name = "HuggingFaceM4/Idefics3-8B-Llama3"
|
||||
|
||||
llm = LLM(
|
||||
model=model_name,
|
||||
max_model_len=8192,
|
||||
max_num_seqs=2,
|
||||
enforce_eager=True,
|
||||
# if you are running out of memory, you can reduce the "longest_edge".
|
||||
# see: https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3#model-optimizations
|
||||
mm_processor_kwargs={
|
||||
"size": {
|
||||
"longest_edge": 3 * 364
|
||||
},
|
||||
},
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
|
||||
)
|
||||
prompt = (
|
||||
f"<|begin_of_text|>User:<image>{question}<end_of_utterance>\nAssistant:"
|
||||
)
|
||||
stop_token_ids = None
|
||||
return llm, prompt, stop_token_ids
|
||||
|
||||
|
||||
# InternVL
|
||||
def run_internvl(question: str, modality: str):
|
||||
assert modality == "image"
|
||||
|
||||
model_name = "OpenGVLab/InternVL2-2B"
|
||||
|
||||
llm = LLM(
|
||||
model=model_name,
|
||||
trust_remote_code=True,
|
||||
max_model_len=4096,
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
|
||||
)
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_name,
|
||||
trust_remote_code=True)
|
||||
messages = [{'role': 'user', 'content': f"<image>\n{question}"}]
|
||||
prompt = tokenizer.apply_chat_template(messages,
|
||||
tokenize=False,
|
||||
add_generation_prompt=True)
|
||||
|
||||
# Stop tokens for InternVL
|
||||
# model variants may have different stop tokens
|
||||
# please refer to the model card for the correct "stop words":
|
||||
# https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py
|
||||
stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"]
|
||||
stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
|
||||
return llm, prompt, stop_token_ids
|
||||
|
||||
|
||||
# LLaVA-1.5
|
||||
def run_llava(question: str, modality: str):
|
||||
assert modality == "image"
|
||||
|
||||
prompt = f"USER: <image>\n{question}\nASSISTANT:"
|
||||
|
||||
llm = LLM(model="llava-hf/llava-1.5-7b-hf",
|
||||
max_model_len=4096,
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
|
||||
stop_token_ids = None
|
||||
return llm, prompt, stop_token_ids
|
||||
|
||||
|
||||
# LLaVA-1.6/LLaVA-NeXT
|
||||
def run_llava_next(question: str, modality: str):
|
||||
assert modality == "image"
|
||||
|
||||
prompt = f"[INST] <image>\n{question} [/INST]"
|
||||
llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf",
|
||||
max_model_len=8192,
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
|
||||
stop_token_ids = None
|
||||
return llm, prompt, stop_token_ids
|
||||
|
||||
|
||||
# LlaVA-NeXT-Video
|
||||
# Currently only support for video input
|
||||
def run_llava_next_video(question: str, modality: str):
|
||||
assert modality == "video"
|
||||
|
||||
prompt = f"USER: <video>\n{question} ASSISTANT:"
|
||||
llm = LLM(model="llava-hf/LLaVA-NeXT-Video-7B-hf",
|
||||
max_model_len=8192,
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
|
||||
stop_token_ids = None
|
||||
return llm, prompt, stop_token_ids
|
||||
|
||||
|
||||
# LLaVA-OneVision
|
||||
def run_llava_onevision(question: str, modality: str):
|
||||
|
||||
if modality == "video":
|
||||
prompt = f"<|im_start|>user <video>\n{question}<|im_end|> \
|
||||
<|im_start|>assistant\n"
|
||||
|
||||
elif modality == "image":
|
||||
prompt = f"<|im_start|>user <image>\n{question}<|im_end|> \
|
||||
<|im_start|>assistant\n"
|
||||
|
||||
llm = LLM(model="llava-hf/llava-onevision-qwen2-7b-ov-hf",
|
||||
max_model_len=16384,
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
|
||||
stop_token_ids = None
|
||||
return llm, prompt, stop_token_ids
|
||||
|
||||
|
||||
# Mantis
|
||||
def run_mantis(question: str, modality: str):
|
||||
assert modality == "image"
|
||||
|
||||
llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n' # noqa: E501
|
||||
prompt = llama3_template.format(f"{question}\n<image>")
|
||||
|
||||
llm = LLM(
|
||||
model="TIGER-Lab/Mantis-8B-siglip-llama3",
|
||||
max_model_len=4096,
|
||||
hf_overrides={"architectures": ["MantisForConditionalGeneration"]},
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
|
||||
)
|
||||
stop_token_ids = [128009]
|
||||
return llm, prompt, stop_token_ids
|
||||
|
||||
|
||||
# MiniCPM-V
|
||||
def run_minicpmv(question: str, modality: str):
|
||||
assert modality == "image"
|
||||
|
||||
# 2.0
|
||||
# The official repo doesn't work yet, so we need to use a fork for now
|
||||
# For more details, please see: https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630 # noqa
|
||||
# model_name = "HwwwH/MiniCPM-V-2"
|
||||
|
||||
# 2.5
|
||||
# model_name = "openbmb/MiniCPM-Llama3-V-2_5"
|
||||
|
||||
# 2.6
|
||||
model_name = "openbmb/MiniCPM-V-2_6"
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_name,
|
||||
trust_remote_code=True)
|
||||
llm = LLM(
|
||||
model=model_name,
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
trust_remote_code=True,
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
|
||||
)
|
||||
# NOTE The stop_token_ids are different for various versions of MiniCPM-V
|
||||
# 2.0
|
||||
# stop_token_ids = [tokenizer.eos_id]
|
||||
|
||||
# 2.5
|
||||
# stop_token_ids = [tokenizer.eos_id, tokenizer.eot_id]
|
||||
|
||||
# 2.6
|
||||
stop_tokens = ['<|im_end|>', '<|endoftext|>']
|
||||
stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
|
||||
|
||||
messages = [{
|
||||
'role': 'user',
|
||||
'content': f'(<image>./</image>)\n{question}'
|
||||
}]
|
||||
prompt = tokenizer.apply_chat_template(messages,
|
||||
tokenize=False,
|
||||
add_generation_prompt=True)
|
||||
return llm, prompt, stop_token_ids
|
||||
|
||||
|
||||
# Llama 3.2
|
||||
def run_mllama(question: str, modality: str):
|
||||
assert modality == "image"
|
||||
|
||||
model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"
|
||||
|
||||
# Note: The default setting of max_num_seqs (256) and
|
||||
# max_model_len (131072) for this model may cause OOM.
|
||||
# You may lower either to run this example on lower-end GPUs.
|
||||
|
||||
# The configuration below has been confirmed to launch on a single L40 GPU.
|
||||
llm = LLM(
|
||||
model=model_name,
|
||||
max_model_len=4096,
|
||||
max_num_seqs=16,
|
||||
enforce_eager=True,
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
|
||||
)
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
||||
messages = [{
|
||||
"role":
|
||||
"user",
|
||||
"content": [{
|
||||
"type": "image"
|
||||
}, {
|
||||
"type": "text",
|
||||
"text": f"{question}"
|
||||
}]
|
||||
}]
|
||||
prompt = tokenizer.apply_chat_template(messages,
|
||||
add_generation_prompt=True,
|
||||
tokenize=False)
|
||||
stop_token_ids = None
|
||||
return llm, prompt, stop_token_ids
|
||||
|
||||
|
||||
# Molmo
|
||||
def run_molmo(question, modality):
|
||||
assert modality == "image"
|
||||
|
||||
model_name = "allenai/Molmo-7B-D-0924"
|
||||
|
||||
llm = LLM(
|
||||
model=model_name,
|
||||
trust_remote_code=True,
|
||||
dtype="bfloat16",
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
|
||||
)
|
||||
|
||||
prompt = question
|
||||
stop_token_ids = None
|
||||
return llm, prompt, stop_token_ids
|
||||
|
||||
|
||||
# NVLM-D
|
||||
def run_nvlm_d(question: str, modality: str):
|
||||
assert modality == "image"
|
||||
|
||||
model_name = "nvidia/NVLM-D-72B"
|
||||
|
||||
# Adjust this as necessary to fit in GPU
|
||||
llm = LLM(
|
||||
model=model_name,
|
||||
trust_remote_code=True,
|
||||
max_model_len=4096,
|
||||
tensor_parallel_size=4,
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
|
||||
)
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_name,
|
||||
trust_remote_code=True)
|
||||
messages = [{'role': 'user', 'content': f"<image>\n{question}"}]
|
||||
prompt = tokenizer.apply_chat_template(messages,
|
||||
tokenize=False,
|
||||
add_generation_prompt=True)
|
||||
stop_token_ids = None
|
||||
return llm, prompt, stop_token_ids
|
||||
|
||||
|
||||
# PaliGemma
|
||||
def run_paligemma(question: str, modality: str):
|
||||
assert modality == "image"
|
||||
|
||||
# PaliGemma has special prompt format for VQA
|
||||
prompt = "caption en"
|
||||
llm = LLM(model="google/paligemma-3b-mix-224",
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
|
||||
stop_token_ids = None
|
||||
return llm, prompt, stop_token_ids
|
||||
|
||||
|
||||
# PaliGemma 2
|
||||
def run_paligemma2(question: str, modality: str):
|
||||
assert modality == "image"
|
||||
|
||||
# PaliGemma 2 has special prompt format for VQA
|
||||
prompt = "caption en"
|
||||
llm = LLM(model="google/paligemma2-3b-ft-docci-448",
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
|
||||
stop_token_ids = None
|
||||
return llm, prompt, stop_token_ids
|
||||
|
||||
|
||||
# Phi-3-Vision
|
||||
def run_phi3v(question: str, modality: str):
|
||||
assert modality == "image"
|
||||
|
||||
prompt = f"<|user|>\n<|image_1|>\n{question}<|end|>\n<|assistant|>\n"
|
||||
|
||||
# num_crops is an override kwarg to the multimodal image processor;
|
||||
# For some models, e.g., Phi-3.5-vision-instruct, it is recommended
|
||||
# to use 16 for single frame scenarios, and 4 for multi-frame.
|
||||
#
|
||||
# Generally speaking, a larger value for num_crops results in more
|
||||
# tokens per image instance, because it may scale the image more in
|
||||
# the image preprocessing. Some references in the model docs and the
|
||||
# formula for image tokens after the preprocessing
|
||||
# transform can be found below.
|
||||
#
|
||||
# https://huggingface.co/microsoft/Phi-3.5-vision-instruct#loading-the-model-locally
|
||||
# https://huggingface.co/microsoft/Phi-3.5-vision-instruct/blob/main/processing_phi3_v.py#L194
|
||||
llm = LLM(
|
||||
model="microsoft/Phi-3.5-vision-instruct",
|
||||
trust_remote_code=True,
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
# Note - mm_processor_kwargs can also be passed to generate/chat calls
|
||||
mm_processor_kwargs={"num_crops": 16},
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
|
||||
)
|
||||
stop_token_ids = None
|
||||
return llm, prompt, stop_token_ids
|
||||
|
||||
|
||||
# Pixtral HF-format
|
||||
def run_pixtral_hf(question: str, modality: str):
|
||||
assert modality == "image"
|
||||
|
||||
model_name = "mistral-community/pixtral-12b"
|
||||
|
||||
# NOTE: Need L40 (or equivalent) to avoid OOM
|
||||
llm = LLM(
|
||||
model=model_name,
|
||||
max_model_len=8192,
|
||||
max_num_seqs=2,
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
|
||||
)
|
||||
|
||||
prompt = f"<s>[INST]{question}\n[IMG][/INST]"
|
||||
stop_token_ids = None
|
||||
return llm, prompt, stop_token_ids
|
||||
|
||||
|
||||
# Qwen
|
||||
def run_qwen_vl(question: str, modality: str):
|
||||
assert modality == "image"
|
||||
|
||||
llm = LLM(
|
||||
model="Qwen/Qwen-VL",
|
||||
trust_remote_code=True,
|
||||
max_model_len=1024,
|
||||
max_num_seqs=2,
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
|
||||
)
|
||||
|
||||
prompt = f"{question}Picture 1: <img></img>\n"
|
||||
stop_token_ids = None
|
||||
return llm, prompt, stop_token_ids
|
||||
|
||||
|
||||
# Qwen2-VL
|
||||
def run_qwen2_vl(question: str, modality: str):
|
||||
|
||||
model_name = "Qwen/Qwen2-VL-7B-Instruct"
|
||||
|
||||
llm = LLM(
|
||||
model=model_name,
|
||||
max_model_len=4096,
|
||||
max_num_seqs=5,
|
||||
# Note - mm_processor_kwargs can also be passed to generate/chat calls
|
||||
mm_processor_kwargs={
|
||||
"min_pixels": 28 * 28,
|
||||
"max_pixels": 1280 * 28 * 28,
|
||||
},
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
|
||||
)
|
||||
|
||||
if modality == "image":
|
||||
placeholder = "<|image_pad|>"
|
||||
elif modality == "video":
|
||||
placeholder = "<|video_pad|>"
|
||||
|
||||
prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
|
||||
f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
|
||||
f"{question}<|im_end|>\n"
|
||||
"<|im_start|>assistant\n")
|
||||
stop_token_ids = None
|
||||
return llm, prompt, stop_token_ids
|
||||
|
||||
|
||||
model_example_map = {
|
||||
"aria": run_aria,
|
||||
"blip-2": run_blip2,
|
||||
"chameleon": run_chameleon,
|
||||
"fuyu": run_fuyu,
|
||||
"glm4v": run_glm4v,
|
||||
"h2ovl_chat": run_h2ovl,
|
||||
"idefics3": run_idefics3,
|
||||
"internvl_chat": run_internvl,
|
||||
"llava": run_llava,
|
||||
"llava-next": run_llava_next,
|
||||
"llava-next-video": run_llava_next_video,
|
||||
"llava-onevision": run_llava_onevision,
|
||||
"mantis": run_mantis,
|
||||
"minicpmv": run_minicpmv,
|
||||
"mllama": run_mllama,
|
||||
"molmo": run_molmo,
|
||||
"NVLM_D": run_nvlm_d,
|
||||
"paligemma": run_paligemma,
|
||||
"paligemma2": run_paligemma2,
|
||||
"phi3_v": run_phi3v,
|
||||
"pixtral_hf": run_pixtral_hf,
|
||||
"qwen_vl": run_qwen_vl,
|
||||
"qwen2_vl": run_qwen2_vl,
|
||||
}
|
||||
|
||||
|
||||
def get_multi_modal_input(args):
|
||||
"""
|
||||
return {
|
||||
"data": image or video,
|
||||
"question": question,
|
||||
}
|
||||
"""
|
||||
if args.modality == "image":
|
||||
# Input image and question
|
||||
image = ImageAsset("cherry_blossom") \
|
||||
.pil_image.convert("RGB")
|
||||
img_question = "What is the content of this image?"
|
||||
|
||||
return {
|
||||
"data": image,
|
||||
"question": img_question,
|
||||
}
|
||||
|
||||
if args.modality == "video":
|
||||
# Input video and question
|
||||
video = VideoAsset(name="sample_demo_1.mp4",
|
||||
num_frames=args.num_frames).np_ndarrays
|
||||
vid_question = "Why is this video funny?"
|
||||
|
||||
return {
|
||||
"data": video,
|
||||
"question": vid_question,
|
||||
}
|
||||
|
||||
msg = f"Modality {args.modality} is not supported."
|
||||
raise ValueError(msg)
|
||||
|
||||
|
||||
def apply_image_repeat(image_repeat_prob, num_prompts, data, prompt, modality):
|
||||
"""Repeats images with provided probability of "image_repeat_prob".
|
||||
Used to simulate hit/miss for the MM preprocessor cache.
|
||||
"""
|
||||
assert (image_repeat_prob <= 1.0 and image_repeat_prob >= 0)
|
||||
no_yes = [0, 1]
|
||||
probs = [1.0 - image_repeat_prob, image_repeat_prob]
|
||||
|
||||
inputs = []
|
||||
cur_image = data
|
||||
for i in range(num_prompts):
|
||||
if image_repeat_prob is not None:
|
||||
res = random.choices(no_yes, probs)[0]
|
||||
if res == 0:
|
||||
# No repeat => Modify one pixel
|
||||
cur_image = cur_image.copy()
|
||||
new_val = (i // 256 // 256, i // 256, i % 256)
|
||||
cur_image.putpixel((0, 0), new_val)
|
||||
|
||||
inputs.append({
|
||||
"prompt": prompt,
|
||||
"multi_modal_data": {
|
||||
modality: cur_image
|
||||
}
|
||||
})
|
||||
|
||||
return inputs
|
||||
|
||||
|
||||
def main(args):
|
||||
model = args.model_type
|
||||
if model not in model_example_map:
|
||||
raise ValueError(f"Model type {model} is not supported.")
|
||||
|
||||
modality = args.modality
|
||||
mm_input = get_multi_modal_input(args)
|
||||
data = mm_input["data"]
|
||||
question = mm_input["question"]
|
||||
|
||||
llm, prompt, stop_token_ids = model_example_map[model](question, modality)
|
||||
|
||||
# We set temperature to 0.2 so that outputs can be different
|
||||
# even when all prompts are identical when running batch inference.
|
||||
sampling_params = SamplingParams(temperature=0.2,
|
||||
max_tokens=64,
|
||||
stop_token_ids=stop_token_ids)
|
||||
|
||||
assert args.num_prompts > 0
|
||||
if args.num_prompts == 1:
|
||||
# Single inference
|
||||
inputs = {
|
||||
"prompt": prompt,
|
||||
"multi_modal_data": {
|
||||
modality: data
|
||||
},
|
||||
}
|
||||
|
||||
else:
|
||||
# Batch inference
|
||||
if args.image_repeat_prob is not None:
|
||||
# Repeat images with specified probability of "image_repeat_prob"
|
||||
inputs = apply_image_repeat(args.image_repeat_prob,
|
||||
args.num_prompts, data, prompt,
|
||||
modality)
|
||||
else:
|
||||
# Use the same image for all prompts
|
||||
inputs = [{
|
||||
"prompt": prompt,
|
||||
"multi_modal_data": {
|
||||
modality: data
|
||||
},
|
||||
} for _ in range(args.num_prompts)]
|
||||
|
||||
if args.time_generate:
|
||||
import time
|
||||
start_time = time.time()
|
||||
outputs = llm.generate(inputs, sampling_params=sampling_params)
|
||||
elapsed_time = time.time() - start_time
|
||||
print("-- generate time = {}".format(elapsed_time))
|
||||
|
||||
else:
|
||||
outputs = llm.generate(inputs, sampling_params=sampling_params)
|
||||
|
||||
for o in outputs:
|
||||
generated_text = o.outputs[0].text
|
||||
print(generated_text)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = FlexibleArgumentParser(
|
||||
description='Demo on using vLLM for offline inference with '
|
||||
'vision language models for text generation')
|
||||
parser.add_argument('--model-type',
|
||||
'-m',
|
||||
type=str,
|
||||
default="llava",
|
||||
choices=model_example_map.keys(),
|
||||
help='Huggingface "model_type".')
|
||||
parser.add_argument('--num-prompts',
|
||||
type=int,
|
||||
default=4,
|
||||
help='Number of prompts to run.')
|
||||
parser.add_argument('--modality',
|
||||
type=str,
|
||||
default="image",
|
||||
choices=['image', 'video'],
|
||||
help='Modality of the input.')
|
||||
parser.add_argument('--num-frames',
|
||||
type=int,
|
||||
default=16,
|
||||
help='Number of frames to extract from the video.')
|
||||
|
||||
parser.add_argument(
|
||||
'--image-repeat-prob',
|
||||
type=float,
|
||||
default=None,
|
||||
help='Simulates the hit-ratio for multi-modal preprocessor cache'
|
||||
' (if enabled)')
|
||||
|
||||
parser.add_argument(
|
||||
'--disable-mm-preprocessor-cache',
|
||||
action='store_true',
|
||||
help='If True, disables caching of multi-modal preprocessor/mapper.')
|
||||
|
||||
parser.add_argument(
|
||||
'--time-generate',
|
||||
action='store_true',
|
||||
help='If True, then print the total generate() call time')
|
||||
|
||||
args = parser.parse_args()
|
||||
main(args)
|
||||
@ -0,0 +1,170 @@
|
||||
"""
|
||||
This example shows how to use vLLM for running offline inference with
|
||||
the correct prompt format on vision language models for multimodal embedding.
|
||||
|
||||
For most models, the prompt format should follow corresponding examples
|
||||
on HuggingFace model repository.
|
||||
"""
|
||||
from argparse import Namespace
|
||||
from typing import Literal, NamedTuple, Optional, TypedDict, Union, get_args
|
||||
|
||||
from PIL.Image import Image
|
||||
|
||||
from vllm import LLM
|
||||
from vllm.multimodal.utils import fetch_image
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
|
||||
|
||||
class TextQuery(TypedDict):
|
||||
modality: Literal["text"]
|
||||
text: str
|
||||
|
||||
|
||||
class ImageQuery(TypedDict):
|
||||
modality: Literal["image"]
|
||||
image: Image
|
||||
|
||||
|
||||
class TextImageQuery(TypedDict):
|
||||
modality: Literal["text+image"]
|
||||
text: str
|
||||
image: Image
|
||||
|
||||
|
||||
QueryModality = Literal["text", "image", "text+image"]
|
||||
Query = Union[TextQuery, ImageQuery, TextImageQuery]
|
||||
|
||||
|
||||
class ModelRequestData(NamedTuple):
|
||||
llm: LLM
|
||||
prompt: str
|
||||
image: Optional[Image]
|
||||
|
||||
|
||||
def run_e5_v(query: Query):
|
||||
llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n' # noqa: E501
|
||||
|
||||
if query["modality"] == "text":
|
||||
text = query["text"]
|
||||
prompt = llama3_template.format(
|
||||
f"{text}\nSummary above sentence in one word: ")
|
||||
image = None
|
||||
elif query["modality"] == "image":
|
||||
prompt = llama3_template.format(
|
||||
"<image>\nSummary above image in one word: ")
|
||||
image = query["image"]
|
||||
else:
|
||||
modality = query['modality']
|
||||
raise ValueError(f"Unsupported query modality: '{modality}'")
|
||||
|
||||
llm = LLM(
|
||||
model="royokong/e5-v",
|
||||
task="embed",
|
||||
max_model_len=4096,
|
||||
)
|
||||
|
||||
return ModelRequestData(
|
||||
llm=llm,
|
||||
prompt=prompt,
|
||||
image=image,
|
||||
)
|
||||
|
||||
|
||||
def run_vlm2vec(query: Query):
|
||||
if query["modality"] == "text":
|
||||
text = query["text"]
|
||||
prompt = f"Find me an everyday image that matches the given caption: {text}" # noqa: E501
|
||||
image = None
|
||||
elif query["modality"] == "image":
|
||||
prompt = "<|image_1|> Find a day-to-day image that looks similar to the provided image." # noqa: E501
|
||||
image = query["image"]
|
||||
elif query["modality"] == "text+image":
|
||||
text = query["text"]
|
||||
prompt = f"<|image_1|> Represent the given image with the following question: {text}" # noqa: E501
|
||||
image = query["image"]
|
||||
else:
|
||||
modality = query['modality']
|
||||
raise ValueError(f"Unsupported query modality: '{modality}'")
|
||||
|
||||
llm = LLM(
|
||||
model="TIGER-Lab/VLM2Vec-Full",
|
||||
task="embed",
|
||||
trust_remote_code=True,
|
||||
mm_processor_kwargs={"num_crops": 4},
|
||||
)
|
||||
|
||||
return ModelRequestData(
|
||||
llm=llm,
|
||||
prompt=prompt,
|
||||
image=image,
|
||||
)
|
||||
|
||||
|
||||
def get_query(modality: QueryModality):
|
||||
if modality == "text":
|
||||
return TextQuery(modality="text", text="A dog sitting in the grass")
|
||||
|
||||
if modality == "image":
|
||||
return ImageQuery(
|
||||
modality="image",
|
||||
image=fetch_image(
|
||||
"https://upload.wikimedia.org/wikipedia/commons/thumb/4/47/American_Eskimo_Dog.jpg/360px-American_Eskimo_Dog.jpg" # noqa: E501
|
||||
),
|
||||
)
|
||||
|
||||
if modality == "text+image":
|
||||
return TextImageQuery(
|
||||
modality="text+image",
|
||||
text="A cat standing in the snow.",
|
||||
image=fetch_image(
|
||||
"https://upload.wikimedia.org/wikipedia/commons/thumb/b/b6/Felis_catus-cat_on_snow.jpg/179px-Felis_catus-cat_on_snow.jpg" # noqa: E501
|
||||
),
|
||||
)
|
||||
|
||||
msg = f"Modality {modality} is not supported."
|
||||
raise ValueError(msg)
|
||||
|
||||
|
||||
def run_encode(model: str, modality: QueryModality):
|
||||
query = get_query(modality)
|
||||
req_data = model_example_map[model](query)
|
||||
|
||||
mm_data = {}
|
||||
if req_data.image is not None:
|
||||
mm_data["image"] = req_data.image
|
||||
|
||||
outputs = req_data.llm.embed({
|
||||
"prompt": req_data.prompt,
|
||||
"multi_modal_data": mm_data,
|
||||
})
|
||||
|
||||
for output in outputs:
|
||||
print(output.outputs.embedding)
|
||||
|
||||
|
||||
def main(args: Namespace):
|
||||
run_encode(args.model_name, args.modality)
|
||||
|
||||
|
||||
model_example_map = {
|
||||
"e5_v": run_e5_v,
|
||||
"vlm2vec": run_vlm2vec,
|
||||
}
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = FlexibleArgumentParser(
|
||||
description='Demo on using vLLM for offline inference with '
|
||||
'vision language models for multimodal embedding')
|
||||
parser.add_argument('--model-name',
|
||||
'-m',
|
||||
type=str,
|
||||
default="vlm2vec",
|
||||
choices=model_example_map.keys(),
|
||||
help='The name of the embedding model.')
|
||||
parser.add_argument('--modality',
|
||||
type=str,
|
||||
default="image",
|
||||
choices=get_args(QueryModality),
|
||||
help='Modality of the input.')
|
||||
args = parser.parse_args()
|
||||
main(args)
|
||||
@ -0,0 +1,470 @@
|
||||
"""
|
||||
This example shows how to use vLLM for running offline inference with
|
||||
multi-image input on vision language models for text generation,
|
||||
using the chat template defined by the model.
|
||||
"""
|
||||
from argparse import Namespace
|
||||
from typing import List, NamedTuple, Optional
|
||||
|
||||
from PIL.Image import Image
|
||||
from transformers import AutoProcessor, AutoTokenizer
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.multimodal.utils import fetch_image
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
|
||||
QUESTION = "What is the content of each image?"
|
||||
IMAGE_URLS = [
|
||||
"https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg",
|
||||
"https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg",
|
||||
]
|
||||
|
||||
|
||||
class ModelRequestData(NamedTuple):
|
||||
llm: LLM
|
||||
prompt: str
|
||||
stop_token_ids: Optional[List[int]]
|
||||
image_data: List[Image]
|
||||
chat_template: Optional[str]
|
||||
|
||||
|
||||
# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
|
||||
# lower-end GPUs.
|
||||
# Unless specified, these settings have been tested to work on a single L4.
|
||||
|
||||
|
||||
def load_aria(question, image_urls: List[str]) -> ModelRequestData:
|
||||
model_name = "rhymes-ai/Aria"
|
||||
llm = LLM(model=model_name,
|
||||
tokenizer_mode="slow",
|
||||
trust_remote_code=True,
|
||||
dtype="bfloat16",
|
||||
limit_mm_per_prompt={"image": len(image_urls)})
|
||||
placeholders = "<fim_prefix><|img|><fim_suffix>\n" * len(image_urls)
|
||||
prompt = (f"<|im_start|>user\n{placeholders}{question}<|im_end|>\n"
|
||||
"<|im_start|>assistant\n")
|
||||
stop_token_ids = [93532, 93653, 944, 93421, 1019, 93653, 93519]
|
||||
|
||||
return ModelRequestData(
|
||||
llm=llm,
|
||||
prompt=prompt,
|
||||
stop_token_ids=stop_token_ids,
|
||||
image_data=[fetch_image(url) for url in image_urls],
|
||||
chat_template=None,
|
||||
)
|
||||
|
||||
|
||||
def load_h2onvl(question: str, image_urls: List[str]) -> ModelRequestData:
|
||||
model_name = "h2oai/h2ovl-mississippi-2b"
|
||||
|
||||
llm = LLM(
|
||||
model=model_name,
|
||||
trust_remote_code=True,
|
||||
max_model_len=8192,
|
||||
limit_mm_per_prompt={"image": len(image_urls)},
|
||||
mm_processor_kwargs={"max_dynamic_patch": 4},
|
||||
)
|
||||
|
||||
placeholders = "\n".join(f"Image-{i}: <image>\n"
|
||||
for i, _ in enumerate(image_urls, start=1))
|
||||
messages = [{'role': 'user', 'content': f"{placeholders}\n{question}"}]
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_name,
|
||||
trust_remote_code=True)
|
||||
prompt = tokenizer.apply_chat_template(messages,
|
||||
tokenize=False,
|
||||
add_generation_prompt=True)
|
||||
|
||||
# Stop tokens for H2OVL-Mississippi
|
||||
# https://huggingface.co/h2oai/h2ovl-mississippi-2b
|
||||
stop_token_ids = [tokenizer.eos_token_id]
|
||||
|
||||
return ModelRequestData(
|
||||
llm=llm,
|
||||
prompt=prompt,
|
||||
stop_token_ids=stop_token_ids,
|
||||
image_data=[fetch_image(url) for url in image_urls],
|
||||
chat_template=None,
|
||||
)
|
||||
|
||||
|
||||
def load_idefics3(question, image_urls: List[str]) -> ModelRequestData:
|
||||
model_name = "HuggingFaceM4/Idefics3-8B-Llama3"
|
||||
|
||||
# The configuration below has been confirmed to launch on a single L40 GPU.
|
||||
llm = LLM(
|
||||
model=model_name,
|
||||
max_model_len=8192,
|
||||
max_num_seqs=16,
|
||||
enforce_eager=True,
|
||||
limit_mm_per_prompt={"image": len(image_urls)},
|
||||
# if you are running out of memory, you can reduce the "longest_edge".
|
||||
# see: https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3#model-optimizations
|
||||
mm_processor_kwargs={
|
||||
"size": {
|
||||
"longest_edge": 2 * 364
|
||||
},
|
||||
},
|
||||
)
|
||||
|
||||
placeholders = "\n".join(f"Image-{i}: <image>\n"
|
||||
for i, _ in enumerate(image_urls, start=1))
|
||||
prompt = f"<|begin_of_text|>User:{placeholders}\n{question}<end_of_utterance>\nAssistant:" # noqa: E501
|
||||
return ModelRequestData(
|
||||
llm=llm,
|
||||
prompt=prompt,
|
||||
stop_token_ids=None,
|
||||
image_data=[fetch_image(url) for url in image_urls],
|
||||
chat_template=None,
|
||||
)
|
||||
|
||||
|
||||
def load_internvl(question: str, image_urls: List[str]) -> ModelRequestData:
|
||||
model_name = "OpenGVLab/InternVL2-2B"
|
||||
|
||||
llm = LLM(
|
||||
model=model_name,
|
||||
trust_remote_code=True,
|
||||
max_model_len=4096,
|
||||
limit_mm_per_prompt={"image": len(image_urls)},
|
||||
mm_processor_kwargs={"max_dynamic_patch": 4},
|
||||
)
|
||||
|
||||
placeholders = "\n".join(f"Image-{i}: <image>\n"
|
||||
for i, _ in enumerate(image_urls, start=1))
|
||||
messages = [{'role': 'user', 'content': f"{placeholders}\n{question}"}]
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_name,
|
||||
trust_remote_code=True)
|
||||
prompt = tokenizer.apply_chat_template(messages,
|
||||
tokenize=False,
|
||||
add_generation_prompt=True)
|
||||
|
||||
# Stop tokens for InternVL
|
||||
# model variants may have different stop tokens
|
||||
# please refer to the model card for the correct "stop words":
|
||||
# https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py
|
||||
stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"]
|
||||
stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
|
||||
|
||||
return ModelRequestData(
|
||||
llm=llm,
|
||||
prompt=prompt,
|
||||
stop_token_ids=stop_token_ids,
|
||||
image_data=[fetch_image(url) for url in image_urls],
|
||||
chat_template=None,
|
||||
)
|
||||
|
||||
|
||||
def load_mllama(question, image_urls: List[str]) -> ModelRequestData:
|
||||
model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"
|
||||
|
||||
# The configuration below has been confirmed to launch on a single L40 GPU.
|
||||
llm = LLM(
|
||||
model=model_name,
|
||||
max_model_len=4096,
|
||||
max_num_seqs=16,
|
||||
enforce_eager=True,
|
||||
limit_mm_per_prompt={"image": len(image_urls)},
|
||||
)
|
||||
|
||||
placeholders = "<|image|>" * len(image_urls)
|
||||
prompt = f"{placeholders}<|begin_of_text|>{question}"
|
||||
return ModelRequestData(
|
||||
llm=llm,
|
||||
prompt=prompt,
|
||||
stop_token_ids=None,
|
||||
image_data=[fetch_image(url) for url in image_urls],
|
||||
chat_template=None,
|
||||
)
|
||||
|
||||
|
||||
def load_nvlm_d(question: str, image_urls: List[str]):
|
||||
model_name = "nvidia/NVLM-D-72B"
|
||||
|
||||
# Adjust this as necessary to fit in GPU
|
||||
llm = LLM(
|
||||
model=model_name,
|
||||
trust_remote_code=True,
|
||||
max_model_len=8192,
|
||||
tensor_parallel_size=4,
|
||||
limit_mm_per_prompt={"image": len(image_urls)},
|
||||
mm_processor_kwargs={"max_dynamic_patch": 4},
|
||||
)
|
||||
|
||||
placeholders = "\n".join(f"Image-{i}: <image>\n"
|
||||
for i, _ in enumerate(image_urls, start=1))
|
||||
messages = [{'role': 'user', 'content': f"{placeholders}\n{question}"}]
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_name,
|
||||
trust_remote_code=True)
|
||||
prompt = tokenizer.apply_chat_template(messages,
|
||||
tokenize=False,
|
||||
add_generation_prompt=True)
|
||||
stop_token_ids = None
|
||||
|
||||
return ModelRequestData(
|
||||
llm=llm,
|
||||
prompt=prompt,
|
||||
stop_token_ids=stop_token_ids,
|
||||
image_data=[fetch_image(url) for url in image_urls],
|
||||
chat_template=None,
|
||||
)
|
||||
|
||||
|
||||
def load_pixtral_hf(question: str, image_urls: List[str]) -> ModelRequestData:
|
||||
model_name = "mistral-community/pixtral-12b"
|
||||
|
||||
# Adjust this as necessary to fit in GPU
|
||||
llm = LLM(
|
||||
model=model_name,
|
||||
max_model_len=8192,
|
||||
max_num_seqs=2,
|
||||
tensor_parallel_size=2,
|
||||
limit_mm_per_prompt={"image": len(image_urls)},
|
||||
)
|
||||
|
||||
placeholders = "[IMG]" * len(image_urls)
|
||||
prompt = f"<s>[INST]{question}\n{placeholders}[/INST]"
|
||||
stop_token_ids = None
|
||||
|
||||
return ModelRequestData(
|
||||
llm=llm,
|
||||
prompt=prompt,
|
||||
stop_token_ids=stop_token_ids,
|
||||
image_data=[fetch_image(url) for url in image_urls],
|
||||
chat_template=None,
|
||||
)
|
||||
|
||||
|
||||
def load_phi3v(question: str, image_urls: List[str]) -> ModelRequestData:
|
||||
# num_crops is an override kwarg to the multimodal image processor;
|
||||
# For some models, e.g., Phi-3.5-vision-instruct, it is recommended
|
||||
# to use 16 for single frame scenarios, and 4 for multi-frame.
|
||||
#
|
||||
# Generally speaking, a larger value for num_crops results in more
|
||||
# tokens per image instance, because it may scale the image more in
|
||||
# the image preprocessing. Some references in the model docs and the
|
||||
# formula for image tokens after the preprocessing
|
||||
# transform can be found below.
|
||||
#
|
||||
# https://huggingface.co/microsoft/Phi-3.5-vision-instruct#loading-the-model-locally
|
||||
# https://huggingface.co/microsoft/Phi-3.5-vision-instruct/blob/main/processing_phi3_v.py#L194
|
||||
llm = LLM(
|
||||
model="microsoft/Phi-3.5-vision-instruct",
|
||||
trust_remote_code=True,
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
limit_mm_per_prompt={"image": len(image_urls)},
|
||||
mm_processor_kwargs={"num_crops": 4},
|
||||
)
|
||||
placeholders = "\n".join(f"<|image_{i}|>"
|
||||
for i, _ in enumerate(image_urls, start=1))
|
||||
prompt = f"<|user|>\n{placeholders}\n{question}<|end|>\n<|assistant|>\n"
|
||||
stop_token_ids = None
|
||||
|
||||
return ModelRequestData(
|
||||
llm=llm,
|
||||
prompt=prompt,
|
||||
stop_token_ids=stop_token_ids,
|
||||
image_data=[fetch_image(url) for url in image_urls],
|
||||
chat_template=None,
|
||||
)
|
||||
|
||||
|
||||
def load_qwen_vl_chat(question: str,
|
||||
image_urls: List[str]) -> ModelRequestData:
|
||||
model_name = "Qwen/Qwen-VL-Chat"
|
||||
llm = LLM(
|
||||
model=model_name,
|
||||
trust_remote_code=True,
|
||||
max_model_len=1024,
|
||||
max_num_seqs=2,
|
||||
limit_mm_per_prompt={"image": len(image_urls)},
|
||||
)
|
||||
placeholders = "".join(f"Picture {i}: <img></img>\n"
|
||||
for i, _ in enumerate(image_urls, start=1))
|
||||
|
||||
# This model does not have a chat_template attribute on its tokenizer,
|
||||
# so we need to explicitly pass it. We use ChatML since it's used in the
|
||||
# generation utils of the model:
|
||||
    # https://huggingface.co/Qwen/Qwen-VL-Chat/blob/main/qwen_generation_utils.py#L265
    tokenizer = AutoTokenizer.from_pretrained(model_name,
                                              trust_remote_code=True)

    # Copied from: https://huggingface.co/docs/transformers/main/en/chat_templating
    chat_template = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"  # noqa: E501

    messages = [{'role': 'user', 'content': f"{placeholders}\n{question}"}]
    prompt = tokenizer.apply_chat_template(messages,
                                           tokenize=False,
                                           add_generation_prompt=True,
                                           chat_template=chat_template)

    stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>"]
    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]

    return ModelRequestData(
        llm=llm,
        prompt=prompt,
        stop_token_ids=stop_token_ids,
        image_data=[fetch_image(url) for url in image_urls],
        chat_template=chat_template,
    )


def load_qwen2_vl(question, image_urls: List[str]) -> ModelRequestData:
    try:
        from qwen_vl_utils import process_vision_info
    except ModuleNotFoundError:
        print('WARNING: `qwen-vl-utils` not installed, input images will not '
              'be automatically resized. You can enable this functionality by '
              '`pip install qwen-vl-utils`.')
        process_vision_info = None

    model_name = "Qwen/Qwen2-VL-7B-Instruct"

    # Tested on L40
    llm = LLM(
        model=model_name,
        max_model_len=32768 if process_vision_info is None else 4096,
        max_num_seqs=5,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    placeholders = [{"type": "image", "image": url} for url in image_urls]
    messages = [{
        "role": "system",
        "content": "You are a helpful assistant."
    }, {
        "role":
        "user",
        "content": [
            *placeholders,
            {
                "type": "text",
                "text": question
            },
        ],
    }]

    processor = AutoProcessor.from_pretrained(model_name)

    prompt = processor.apply_chat_template(messages,
                                           tokenize=False,
                                           add_generation_prompt=True)

    stop_token_ids = None

    if process_vision_info is None:
        image_data = [fetch_image(url) for url in image_urls]
    else:
        image_data, _ = process_vision_info(messages)

    return ModelRequestData(
        llm=llm,
        prompt=prompt,
        stop_token_ids=stop_token_ids,
        image_data=image_data,
        chat_template=None,
    )


model_example_map = {
    "aria": load_aria,
    "h2ovl_chat": load_h2onvl,
    "idefics3": load_idefics3,
    "internvl_chat": load_internvl,
    "mllama": load_mllama,
    "NVLM_D": load_nvlm_d,
    "phi3_v": load_phi3v,
    "pixtral_hf": load_pixtral_hf,
    "qwen_vl_chat": load_qwen_vl_chat,
    "qwen2_vl": load_qwen2_vl,
}


def run_generate(model, question: str, image_urls: List[str]):
    req_data = model_example_map[model](question, image_urls)

    sampling_params = SamplingParams(temperature=0.0,
                                     max_tokens=128,
                                     stop_token_ids=req_data.stop_token_ids)

    outputs = req_data.llm.generate(
        {
            "prompt": req_data.prompt,
            "multi_modal_data": {
                "image": req_data.image_data
            },
        },
        sampling_params=sampling_params)

    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)


def run_chat(model: str, question: str, image_urls: List[str]):
    req_data = model_example_map[model](question, image_urls)

    sampling_params = SamplingParams(temperature=0.0,
                                     max_tokens=128,
                                     stop_token_ids=req_data.stop_token_ids)
    outputs = req_data.llm.chat(
        [{
            "role":
            "user",
            "content": [
                {
                    "type": "text",
                    "text": question,
                },
                *({
                    "type": "image_url",
                    "image_url": {
                        "url": image_url
                    },
                } for image_url in image_urls),
            ],
        }],
        sampling_params=sampling_params,
        chat_template=req_data.chat_template,
    )

    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)


def main(args: Namespace):
    model = args.model_type
    method = args.method

    if method == "generate":
        run_generate(model, QUESTION, IMAGE_URLS)
    elif method == "chat":
        run_chat(model, QUESTION, IMAGE_URLS)
    else:
        raise ValueError(f"Invalid method: {method}")


if __name__ == "__main__":
    parser = FlexibleArgumentParser(
        description='Demo on using vLLM for offline inference with '
        'vision language models that support multi-image input for text '
        'generation')
    parser.add_argument('--model-type',
                        '-m',
                        type=str,
                        default="phi3_v",
                        choices=model_example_map.keys(),
                        help='Huggingface "model_type".')
    parser.add_argument("--method",
                        type=str,
                        default="generate",
                        choices=["generate", "chat"],
                        help="The method to run in `vllm.LLM`.")

    args = parser.parse_args()
    main(args)
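The demo above is normally driven through the CLI defined in its __main__ block, but the loader helpers can also be called directly from Python. A minimal sketch (the model key comes from model_example_map; the image URLs are illustrative placeholders, not part of this commit):

example_urls = [
    "https://example.com/first.jpg",   # illustrative URL
    "https://example.com/second.jpg",  # illustrative URL
]
# Uses the same code path as `--method generate` on the CLI.
run_generate("qwen2_vl", "What is shown in these images?", example_urls)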
59
examples/offline_inference/offline_inference_whisper.py
Normal file
@ -0,0 +1,59 @@
import time

from vllm import LLM, SamplingParams
from vllm.assets.audio import AudioAsset

# Create a Whisper encoder/decoder model instance
llm = LLM(
    model="openai/whisper-large-v3",
    max_model_len=448,
    max_num_seqs=400,
    limit_mm_per_prompt={"audio": 1},
    kv_cache_dtype="fp8",
)

prompts = [
    {
        "prompt": "<|startoftranscript|>",
        "multi_modal_data": {
            "audio": AudioAsset("mary_had_lamb").audio_and_sample_rate,
        },
    },
    {  # Test explicit encoder/decoder prompt
        "encoder_prompt": {
            "prompt": "",
            "multi_modal_data": {
                "audio": AudioAsset("winning_call").audio_and_sample_rate,
            },
        },
        "decoder_prompt": "<|startoftranscript|>",
    }
] * 1024

# Create a sampling params object.
sampling_params = SamplingParams(
    temperature=0,
    top_p=1.0,
    max_tokens=200,
)

start = time.time()

# Generate output tokens from the prompts. The output is a list of
# RequestOutput objects that contain the prompt, generated
# text, and other information.
outputs = llm.generate(prompts, sampling_params)

# Print the outputs.
for output in outputs:
    prompt = output.prompt
    encoder_prompt = output.encoder_prompt
    generated_text = output.outputs[0].text
    print(f"Encoder prompt: {encoder_prompt!r}, "
          f"Decoder prompt: {prompt!r}, "
          f"Generated text: {generated_text!r}")

duration = time.time() - start

print("Duration:", duration)
print("RPS:", len(prompts) / duration)
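To transcribe your own recording instead of the bundled assets, the audio entry in multi_modal_data accepts an (audio, sample_rate) tuple. A minimal sketch, assuming librosa is installed and that a local sample.wav exists (both are assumptions, not part of this commit):

import librosa

audio, sample_rate = librosa.load("sample.wav", sr=None)  # hypothetical local file

outputs = llm.generate(
    {
        "prompt": "<|startoftranscript|>",
        "multi_modal_data": {"audio": (audio, sample_rate)},
    },
    sampling_params,
)
print(outputs[0].outputs[0].text)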
@ -0,0 +1,30 @@
from vllm import LLM

# Sample prompts.
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]

# Create an LLM with built-in default generation config.
# The generation config is set to None by default to keep
# the behavior consistent with the previous version.
# If you want to use the default generation config from the model,
# you should set the generation_config to "auto".
llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct", generation_config="auto")

# Load the default sampling parameters from the model.
sampling_params = llm.get_default_sampling_params()
# Modify the sampling parameters if needed.
sampling_params.temperature = 0.5

# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
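The defaults loaded via generation_config="auto" only apply to the SamplingParams object returned by get_default_sampling_params; an explicitly constructed SamplingParams passed to generate still takes precedence. A minimal sketch reusing the llm and prompts defined above:

from vllm import SamplingParams

# Explicit per-request parameters override the model's generation config.
override_params = SamplingParams(temperature=0.0, max_tokens=32)
outputs = llm.generate(prompts, override_params)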
83
examples/offline_inference/offline_inference_with_prefix.py
Normal file
@ -0,0 +1,83 @@
from vllm import LLM, SamplingParams
from vllm.distributed import cleanup_dist_env_and_memory

# NOTE: This is just a running example. For benchmarking purpose,
# please see benchmarks/benchmark_prefix_caching.py

# Common prefix.
prefix = (
    "You are an expert school principal, skilled in effectively managing "
    "faculty and staff. Draft 10-15 questions for a potential first grade "
    "Head Teacher for my K-12, all-girls', independent school that emphasizes "
    "community, joyful discovery, and life-long learning. The candidate is "
    "coming in for a first-round panel interview for a 8th grade Math "
    "teaching role. They have 5 years of previous teaching experience "
    "as an assistant teacher at a co-ed, public school with experience "
    "in middle school math teaching. Based on these information, fulfill "
    "the following paragraph: ")

# Sample prompts.
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]

generating_prompts = [prefix + prompt for prompt in prompts]

# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.0)

# Create an LLM without prefix caching as a baseline.
regular_llm = LLM(model="facebook/opt-125m", gpu_memory_utilization=0.4)

print("Results without `enable_prefix_caching`")

# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = regular_llm.generate(generating_prompts, sampling_params)

regular_generated_texts = []
# Print the outputs.
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    regular_generated_texts.append(generated_text)
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

print("-" * 80)

# Destroy the LLM object and free up the GPU memory.
del regular_llm
cleanup_dist_env_and_memory()

# Create an LLM with prefix caching enabled.
prefix_cached_llm = LLM(model="facebook/opt-125m",
                        enable_prefix_caching=True,
                        gpu_memory_utilization=0.4)

# Warmup so that the shared prompt's KV cache is computed.
prefix_cached_llm.generate(generating_prompts[0], sampling_params)

# Generate with prefix caching.
outputs = prefix_cached_llm.generate(generating_prompts, sampling_params)

print("Results with `enable_prefix_caching`")

cached_generated_texts = []
# Print the outputs. You should see the same outputs as before.
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    cached_generated_texts.append(generated_text)
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

print("-" * 80)

# Compare the results and display the speedup
generated_same = all([
    regular_generated_texts[i] == cached_generated_texts[i]
    for i in range(len(prompts))
])
print(f"Generated answers are the same: {generated_same}")
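The example above only verifies that the cached and uncached outputs match; to actually observe the speedup from prefix caching, the two generate calls can be timed. A minimal helper sketch (not part of this commit) that could wrap each call:

import time

def timed_generate(llm, prompts, params):
    # Returns the outputs together with the wall-clock time of the call.
    start = time.perf_counter()
    outputs = llm.generate(prompts, params)
    return outputs, time.perf_counter() - start

# e.g. outputs, elapsed = timed_generate(prefix_cached_llm, generating_prompts, sampling_params)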
@ -0,0 +1,40 @@
import os
import time

from vllm import LLM, SamplingParams

# enable torch profiler, can also be set on cmd line
os.environ["VLLM_TORCH_PROFILER_DIR"] = "./vllm_profile"

# Sample prompts.
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

if __name__ == "__main__":

    # Create an LLM.
    llm = LLM(model="facebook/opt-125m", tensor_parallel_size=1)

    llm.start_profile()

    # Generate texts from the prompts. The output is a list of RequestOutput
    # objects that contain the prompt, generated text, and other information.
    outputs = llm.generate(prompts, sampling_params)

    llm.stop_profile()

    # Print the outputs.
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

    # Add a buffer to wait for profiler in the background process
    # (in case MP is on) to finish writing profiling output.
    time.sleep(10)
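One practical variation: because the first generate call can include one-off warmup costs, profiles are often cleaner if that call is excluded. A hedged sketch of how the start_profile/stop_profile calls above could be rearranged:

    # Warmup run, not profiled.
    llm.generate(prompts, sampling_params)

    # Profile only the steady-state generation.
    llm.start_profile()
    outputs = llm.generate(prompts, sampling_params)
    llm.stop_profile()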
458
examples/offline_inference/offline_profile.py
Normal file
@ -0,0 +1,458 @@
import inspect
import json
import os
import sys
from argparse import RawTextHelpFormatter
from dataclasses import asdict, dataclass
from typing import Any, Dict, Generator, List, Optional, TypeAlias

import torch
import tqdm

from vllm import LLM, SamplingParams
from vllm.engine.arg_utils import EngineArgs
from vllm.profiler import layerwise_profile
from vllm.utils import FlexibleArgumentParser

BATCH_SIZE_DEFAULT = 1
PROMPT_LEN_DEFAULT = 256


@dataclass
class ProfileContext:
    engine_args: EngineArgs
    prompt_len: int
    batch_size: int

    # The profiler can run in 2 modes,
    # 1. Run profiler for user specified num_steps
    num_steps: Optional[int] = None
    # 2. Run profiler until all requests complete
    complete_num_requests_per_step: Optional[int] = None

    save_chrome_traces_folder: Optional[str] = None


def get_dtype(dtype: str):
    if dtype == "torch.float":
        return torch.float
    else:
        return dtype


OutputLen_NumReqs_Map: TypeAlias = Dict[int, int]
def compute_request_output_lengths(batch_size: int, step_requests: List[int]) \
        -> OutputLen_NumReqs_Map:
    """
    Given the number of requests, batch_size, and the number of requests
    that each engine-step should process, step_requests, determine the
    output lengths of the requests such that step_request is honoured.

    Example:
    if batch size = 128 and step_request = [128, 128, 96, 64, 32, 1]
    then return,
    {2 : 32, 3 : 32, 4 : 32, 5 : 31, 6 : 1}, meaning,
    32 requests should have output length 2,
    32 requests should have output length 3,
    32 requests should have output length 4,
    31 requests should have output length 5,
    1 request should have output length 6.

    Args:
        batch_size (int): Number of requests submitted for profile. This is
            args.batch_size.
        step_requests (List[int]): step_requests[i] is the number of requests
            that the ith engine step should process.

    Returns:
        OutputLen_NumReqs_Map : A dictionary with output-length as keys and the
            number of requests required to have that output-length as values.
    """
    ol_nr: OutputLen_NumReqs_Map = {}

    # Number of request that are assigned an output-length
    num_reqs_assigned: int = 0
    num_steps: int = len(step_requests)

    # sanity check. The first step (prefill-step), must process all requests.
    assert step_requests[0] == batch_size

    # Begin assignments from the last step.
    output_length: int = num_steps
    for num_requests_at_step in reversed(step_requests):
        if num_reqs_assigned == batch_size:
            break

        assert num_reqs_assigned < batch_size

        # Remove the number of requests that have been determined
        # to participate in this step and beyond.
        num_reqs_unassigned_at_step = num_requests_at_step - num_reqs_assigned
        assert num_reqs_unassigned_at_step >= 0

        if num_reqs_unassigned_at_step > 0:
            ol_nr[output_length] = num_reqs_unassigned_at_step
            num_reqs_assigned += num_reqs_unassigned_at_step

        output_length -= 1

    # sanity checks.
    assert sum(ol_nr.values()) == batch_size, \
        ("Number of requests in output-length assignment does not match "
         f"batch-size.\n batch size {batch_size} - "
         f"step requests {step_requests} - assignments {ol_nr}")

    # Check that the output-length is in [1, num-steps]. Output length must be
    # at least 1 as all requests must participate in the prefill-step.
    assert all(ol >= 1 and ol <= num_steps for ol in ol_nr), \
        ("Output lengths of requests should be in range "
         f"[1, num-engine-steps].\n batch size {batch_size} - "
         f"step requests {step_requests} - assignments {ol_nr}")

    return ol_nr


def determine_requests_per_step(context: ProfileContext) -> List[int]:
    """
    Determine number of requests each engine step should process.
    If context.num_steps is set, then all engine steps process the
    same number of requests and the output list is of length
    context.num_steps.

    If context.complete_num_requests_per_step is set, then each decode step
    processes fewer and fewer requests until there are no requests to process.
    In this case, the output list is as big as the number of steps
    required to process all requests.

    Args:
        context: ProfileContext object.

    Returns:
        List[int]: Number of requests to process for all engine-steps.
            output[i], contains the number of requests that the ith step
            should process.
    """
    if context.num_steps:
        # All requests must run until num_engine_steps. This implies
        # that their output lengths must be equal to num_engine_steps.
        return [context.batch_size] * context.num_steps

    assert context.complete_num_requests_per_step and \
        context.complete_num_requests_per_step > 0, \
        (f"Expected a positive complete_num_requests_per_step argument."
         f"Instead got {context.complete_num_requests_per_step}")

    # We start dropping after the first decode step.
    step_requests = [
        context.batch_size,  # prefill
        context.batch_size,  # decode
    ]

    num_running_requests = context.batch_size
    num_running_requests -= context.complete_num_requests_per_step
    while num_running_requests > 0:
        step_requests.append(num_running_requests)
        num_running_requests -= context.complete_num_requests_per_step

    if step_requests[-1] != 1:
        # have 1 request running at the last step. This is often
        # useful
        step_requests.append(1)

    return step_requests


def run_profile(context: ProfileContext, csv_output: Optional[str],
                json_output: Optional[str]):
    print("Run profile with:")
    for key, value in asdict(context).items():
        print(f"  {key} = {value}")

    requests_per_step: List[int] = determine_requests_per_step(context)

    ol_nr: OutputLen_NumReqs_Map = compute_request_output_lengths(
        context.batch_size, requests_per_step)

    num_steps_to_profile: int = len(requests_per_step)
    max_output_len: int = max(ol_nr.keys())
    assert max_output_len >= 1

    # Create sampling params
    sampling_params = SamplingParams(
        temperature=0.8,
        top_p=0.95,
        # max_tokens is set on a per-request basis.
        max_tokens=None,
        ignore_eos=True)

    # Create LLM
    llm = LLM(**asdict(context.engine_args))
    batch_size = context.batch_size
    prompt_len = context.prompt_len

    scheduler_config = llm.llm_engine.scheduler_config
    max_model_len = llm.llm_engine.model_config.max_model_len
    max_num_batched_tokens = scheduler_config.max_num_batched_tokens
    max_num_seqs = scheduler_config.max_num_seqs

    if batch_size * prompt_len > max_num_batched_tokens:
        print(f"ERROR: chosen batch_size * prompt_len "
              f"({batch_size} * {prompt_len} = {batch_size * prompt_len}) is "
              f"larger than max_num_batched_tokens ({max_num_batched_tokens}) "
              f"and therefore cannot be run in a single profile step, please "
              f"choose a smaller batch size or prompt length, or increase "
              f"--max-num-batched-tokens")
        sys.exit(-1)
    if batch_size > max_num_seqs:
        print(
            f"ERROR: chosen batch_size ({batch_size}) is larger than "
            f"max_num_seqs ({max_num_seqs}) and therefore cannot be run in a "
            f"single profile step, please choose a smaller batch size")
        sys.exit(-1)
    print("llm.llm_engine.model_config.max_model_len: ",
          llm.llm_engine.model_config.max_model_len)
    if prompt_len + max_output_len > llm.llm_engine.model_config.max_model_len:
        print(f"ERROR: chosen prompt_len + max_output_len ({prompt_len} + "
              f"{max_output_len} = {prompt_len + max_output_len}) is larger "
              f"than the model's max_model_len ({max_model_len}), please "
              f"choose a smaller prompt_len or max_output_len, or increase "
              f"--max-model-len")
        sys.exit(-1)

    def add_requests():

        def get_output_len_generator() -> Generator[int, Any, Any]:
            for output_len, num_reqs in ol_nr.items():
                for _ in range(num_reqs):
                    yield output_len

        output_len_generator = get_output_len_generator()
        for i in range(batch_size):
            sampling_params.max_tokens = next(output_len_generator)
            assert isinstance(sampling_params.max_tokens, int)

            prompt_token_ids = torch.randint(
                llm.llm_engine.model_config.get_vocab_size(),
                size=(prompt_len, )).tolist()

            llm.llm_engine.add_request(
                request_id=f"seq{i}",
                prompt={'prompt_token_ids': prompt_token_ids},
                params=sampling_params)

    def abort_requests():
        for i in range(batch_size):
            llm.llm_engine.abort_request(f"seq{i}")

    # Warm up run
    print("Warm up run ...")
    add_requests()
    llm.llm_engine.step()  # Prefill
    llm.llm_engine.step()  # Decode
    abort_requests()

    print("Profile run ...")
    add_requests()

    with layerwise_profile() as prefill_prof:
        llm.llm_engine.step()  # First step is prefill

    decode_profs = []
    for _ in tqdm.tqdm(range(num_steps_to_profile - 1)):
        num_running_seqs = llm.llm_engine.scheduler[
            0].get_num_unfinished_seq_groups()
        with layerwise_profile(
                num_running_seqs=num_running_seqs) as decode_prof:
            llm.llm_engine.step()
        decode_profs.append(decode_prof)

    decode_results_list = [prof.results for prof in decode_profs]
    prefill_results = prefill_prof.results
    has_decode = len(decode_results_list) > 0

    LINE_WIDTH = 80
    print("=" * LINE_WIDTH)
    print(f"= Prefill Model Table "
          f"(prompt_len={prompt_len}, batch_size={batch_size})")
    print("=" * LINE_WIDTH)
    print()
    prefill_results.print_model_table()

    if has_decode:
        print()
        print("=" * LINE_WIDTH)
        print(f"= First Decode Step Model Table "
              f"(prompt_len={prompt_len}, batch_size={batch_size})")
        print("=" * LINE_WIDTH)
        print()
        decode_results_list[0].print_model_table()

    print()
    print("=" * LINE_WIDTH)
    print(f"= Prefill Summary Table "
          f"(prompt_len={prompt_len}, batch_size={batch_size})")
    print("=" * LINE_WIDTH)
    print()
    prefill_results.print_summary_table()

    if has_decode:
        print()
        print("=" * LINE_WIDTH)
        print(f"= First Decode Step Summary Table "
              f"(prompt_len={prompt_len}, batch_size={batch_size})")
        print("=" * LINE_WIDTH)
        print()
        decode_results_list[0].print_summary_table()

    if csv_output:
        csv_filename_base = csv_output[:-4] \
            if csv_output.endswith('.csv') else csv_output
        prefill_results.export_model_stats_table_csv(
            csv_filename_base + "_prefill_model_table.csv")
        prefill_results.export_summary_stats_table_csv(
            csv_filename_base + "_prefill_summary_table.csv")

        if has_decode:
            decode_results_list[0].export_model_stats_table_csv(\
                csv_filename_base + "_decode_model_table.csv")
            decode_results_list[0].export_summary_stats_table_csv(
                csv_filename_base + "_decode_summary_table.csv")

    if json_output:
        cuda_devices = [
            torch.cuda.get_device_properties(dev_idx)
            for dev_idx in range(torch.cuda.device_count())
        ]

        json_dict = {
            "context": {
                "python_version": f"{sys.version}",
                "torch_version": f"{torch.__version__}",
                "torch_cuda_version": f"{torch.version.cuda}",
                "cuda_devices": f"{cuda_devices}",
                **asdict(context)
            },
            "prefill": prefill_results.convert_stats_to_dict(),
        }

        if has_decode:
            for idx, dr in enumerate(decode_results_list):
                json_dict[f"decode_{idx + 1}"] = dr.convert_stats_to_dict()

        # Add .json to json_output filename if it doesn't exist already.
        json_output_file = json_output if json_output.endswith(
            '.json') else json_output + '.json'
        with open(json_output_file, "w+") as f:
            json.dump(json_dict, f, indent=2)
        pass

    if context.save_chrome_traces_folder is not None:
        os.makedirs(context.save_chrome_traces_folder, exist_ok=True)
        prefill_prof.profiler.export_chrome_trace(
            context.save_chrome_traces_folder + "/prefill.json")
        for idx, decode_prof in enumerate(decode_profs):
            decode_prof.profiler.export_chrome_trace(
                context.save_chrome_traces_folder + f"/decode_{idx + 1}.json")
        print("Traces saved as prefill.json and decode_1.json, etc."
              f" in folder {context.save_chrome_traces_folder}")


if __name__ == "__main__":
    parser = FlexibleArgumentParser(description="""
Profile a model

    example:
    ```
    python examples/offline_inference/offline_profile.py \\
        --model neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8 --batch-size 4 \\
        --prompt-len 512 --max-num-batched-tokens 8196 --json Llama31-8b-FP8 \\
        --enforce-eager run_num_steps -n 2
    ```

    then you can use various tools to analyze the json output
    terminal ascii tables:
        ```
        python tools/profiler/print_layerwise_table.py \\
            --json-trace Llama31-8b-FP8.json --phase prefill --table summary
        ```
    or create matplotlib stacked bar charts:
        ```
        python tools/profiler/visualize_layerwise_profile.py \\
            --json-trace Llama31-8b-FP8.json \\
            --output-directory profile_breakdown --plot-metric pct_cuda_time
        ```
    """,
                                    formatter_class=RawTextHelpFormatter)
    parser.add_argument(
        "--csv",
        type=str,
        default=None,
        help="Export the results as multiple csv file. This should be the root "
        "filename, will create <filename>_prefill_model_table.csv, "
        "<filename>_prefill_summary_table.csv, "
        "<filename>_decode_model_table.csv, and "
        "<filename>_decode_summary_table.csv")
    parser.add_argument(
        "--json",
        type=str,
        default=None,
        help="Export the results as a json file. This should be the filename")
    parser.add_argument("--save-chrome-traces-folder",
                        type=str,
                        help="Save chrome traces for the prefill and decode "
                        "will save traces as prefill.json and decode_1.json, "
                        "etc. inside this folder")
    parser.add_argument(
        "--prompt-len",
        type=int,
        default=PROMPT_LEN_DEFAULT,
        help=f"Length of the random prompt to use when profiling, all batched "
        f"requests use the same prompt_len, default={PROMPT_LEN_DEFAULT}")
    parser.add_argument("--batch-size",
                        type=int,
                        default=BATCH_SIZE_DEFAULT,
                        help=f"Number of requests to run as a single batch, "
                        f"default={BATCH_SIZE_DEFAULT}")

    subparsers = parser.add_subparsers(dest="cmd")

    run_num_steps_parser = subparsers.add_parser(
        "run_num_steps",
        help="This variation profiles n engine.step() invocations.")
    run_num_steps_parser.add_argument(
        '-n',
        '--num-steps',
        type=int,
        help="Number of engine steps to profile.\n"
        "Setting it to 1, profiles only the prefill step.\n"
        "Setting it to 2, profiles the prefill and first decode step\n"
        "Setting it to 3, profiles the prefill, 1st and 2nd decode steps\n"
        "and so on ...")

    run_to_completion_parser = subparsers.add_parser(
        "run_to_completion",
        help="This variation profiles all the engine.step() invocations"
        "until the engine exhausts all submitted requests.")
    run_to_completion_parser.add_argument(
        '-n',
        '--complete-num-requests-per-step',
        type=int,
        help=
        "Complete complete_num_requests_per_step requests every decode step."
        "For e.g., with batch_size 128 and complete_num_requests_per_step 32,"
        "the profiler is run for 6 engine steps, with the steps processing, "
        "128, 128, 96, 64, 32, 1 requests respectively.\n"
        "Note that we tack-on a one-request step at the end as it is often "
        "useful.")

    EngineArgs.add_cli_args(parser)

    args = parser.parse_args()
    context = ProfileContext(
        engine_args=EngineArgs.from_cli_args(args),
        **{
            k: v
            for k, v in vars(args).items()
            if k in inspect.signature(ProfileContext).parameters
        })
    run_profile(context, csv_output=args.csv, json_output=args.json)
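As a sanity check of the docstring example in compute_request_output_lengths, the assignment for batch_size=128 and step_requests=[128, 128, 96, 64, 32, 1] can be re-derived standalone (this snippet re-implements the loop for illustration and is not part of the file above):

batch_size = 128
step_requests = [128, 128, 96, 64, 32, 1]

ol_nr = {}
assigned = 0
output_length = len(step_requests)
for num_requests_at_step in reversed(step_requests):
    unassigned = num_requests_at_step - assigned
    if unassigned > 0:
        ol_nr[output_length] = unassigned
        assigned += unassigned
    output_length -= 1

print(ol_nr)  # {6: 1, 5: 31, 4: 32, 3: 32, 2: 32}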
75
examples/offline_inference/save_sharded_state.py
Normal file
@ -0,0 +1,75 @@
"""
Saves each worker's model state dict directly to a checkpoint, which enables a
fast load path for large tensor-parallel models where each worker only needs to
read its own shard rather than the entire checkpoint.

Example usage:

python save_sharded_state.py \
    --model /path/to/load \
    --quantization deepspeedfp \
    --tensor-parallel-size 8 \
    --output /path/to/save

Then, the model can be loaded with

llm = LLM(
    model="/path/to/save",
    load_format="sharded_state",
    quantization="deepspeedfp",
    tensor_parallel_size=8,
)
"""
import dataclasses
import os
import shutil
from pathlib import Path

from vllm import LLM, EngineArgs
from vllm.utils import FlexibleArgumentParser

parser = FlexibleArgumentParser()
EngineArgs.add_cli_args(parser)
parser.add_argument("--output",
                    "-o",
                    required=True,
                    type=str,
                    help="path to output checkpoint")
parser.add_argument("--file-pattern",
                    type=str,
                    help="string pattern of saved filenames")
parser.add_argument("--max-file-size",
                    type=str,
                    default=5 * 1024**3,
                    help="max size (in bytes) of each safetensors file")


def main(args):
    engine_args = EngineArgs.from_cli_args(args)
    if engine_args.enable_lora:
        raise ValueError("Saving with enable_lora=True is not supported!")
    model_path = engine_args.model
    if not Path(model_path).is_dir():
        raise ValueError("model path must be a local directory")
    # Create LLM instance from arguments
    llm = LLM(**dataclasses.asdict(engine_args))
    # Prepare output directory
    Path(args.output).mkdir(exist_ok=True)
    # Dump worker states to output directory
    model_executor = llm.llm_engine.model_executor
    model_executor.save_sharded_state(path=args.output,
                                      pattern=args.file_pattern,
                                      max_size=args.max_file_size)
    # Copy metadata files to output directory
    for file in os.listdir(model_path):
        if os.path.splitext(file)[1] not in (".bin", ".pt", ".safetensors"):
            if os.path.isdir(os.path.join(model_path, file)):
                shutil.copytree(os.path.join(model_path, file),
                                os.path.join(args.output, file))
            else:
                shutil.copy(os.path.join(model_path, file), args.output)


if __name__ == "__main__":
    args = parser.parse_args()
    main(args)
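Since the shards are written per worker, the tensor_parallel_size used at load time should match the one used when saving (stated here as an assumption based on how the state is dumped per rank). A minimal load sketch mirroring the docstring at the top of this file, with illustrative paths:

from vllm import LLM

llm = LLM(
    model="/path/to/save",            # directory produced by this script
    load_format="sharded_state",
    quantization="deepspeedfp",
    tensor_parallel_size=8,           # assumed to match the save-time setting
)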