Compare commits: fix_use_ep...v1_fix_pro
111 commits
| SHA1 | Author | Date | |
|---|---|---|---|
| ccd21e1993 | |||
| 4d022cbc75 | |||
| 70de35a881 | |||
| 34b2cf3b33 | |||
| 9e90c9f73f | |||
| e9528f6dc6 | |||
| 51baa9c333 | |||
| 35e076b3a8 | |||
| a26f59ccbc | |||
| aa3b3d76e0 | |||
| f7030df3be | |||
| 905e91e9ac | |||
| f8f9c0ba62 | |||
| dda811021a | |||
| 93195146ea | |||
| ed37599544 | |||
| 99ef59cf7f | |||
| d544d141ec | |||
| 3e397a9484 | |||
| 268c325078 | |||
| 3cc9af88ff | |||
| 7cd0bd7212 | |||
| 56d4aefa33 | |||
| dd143ef541 | |||
| daefed052c | |||
| 5fbab20e02 | |||
| e8224f3dca | |||
| 9665313c39 | |||
| 0c54fc7273 | |||
| c1b57855ec | |||
| 83b824c8b4 | |||
| 7678fcd5b6 | |||
| 8661c0241d | |||
| ce8d6b75fc | |||
| 61de3ef74b | |||
| ec1f9c8c91 | |||
| 65e09094c4 | |||
| c70cf0fe06 | |||
| a5d11a54dc | |||
| 3d4c87758e | |||
| a9bd832fc5 | |||
| 417bcefbae | |||
| baada0e737 | |||
| 82eb61dd4c | |||
| 0d4d06fe2f | |||
| 4aed0ca6a2 | |||
| 1621b25288 | |||
| a564797151 | |||
| 1da6a09274 | |||
| 1e44ffc3ff | |||
| a454748544 | |||
| 1bff42c4b7 | |||
| cb391d85dc | |||
| fee5b8d37f | |||
| b2ce859bd2 | |||
| 566f10a929 | |||
| c3b5189137 | |||
| a25866ac8d | |||
| 098900d7c2 | |||
| 98d01d3ce2 | |||
| d55244df31 | |||
| 04149cce27 | |||
| 24834f4894 | |||
| ec7da6fcf3 | |||
| 819d548e8a | |||
| 477d2a8aa2 | |||
| e484e02857 | |||
| 24f6b9a713 | |||
| 9cdde47289 | |||
| b1eb4ca152 | |||
| 87b4ac56c2 | |||
| cb84e45ac7 | |||
| 4716377fbc | |||
| 4e9cf8c1dd | |||
| 2976dc27e9 | |||
| 102bf967f0 | |||
| 1f4b09b525 | |||
| 86c3369eb8 | |||
| 2755c34a8f | |||
| db10422184 | |||
| e1a2c699dd | |||
| 0115ccd5c0 | |||
| 40b4284fe3 | |||
| 4ebc0b9640 | |||
| dc96fd54c6 | |||
| 1f5d13ab9f | |||
| 90cb44eb02 | |||
| e11880deea | |||
| 9351f91be9 | |||
| 5a1e1c8353 | |||
| 69ecaa7c79 | |||
| 7f00899ff7 | |||
| 995e3d1f41 | |||
| b4ac449a83 | |||
| 8e5314a468 | |||
| 87918e40c4 | |||
| f6b32efb7f | |||
| b99733d092 | |||
| 05a015d6a5 | |||
| ad971af8c7 | |||
| f2ebb6f541 | |||
| 1d01211264 | |||
| f94ab12f79 | |||
| a865bc1ca6 | |||
| 21802c4b6d | |||
| 652907b354 | |||
| 24f1c01e0f | |||
| fad6e2538e | |||
| 7f6d47c1a2 | |||
| 3147586ebd | |||
| ed636d99ca |
@@ -0,0 +1,11 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16 -b auto -l 1319 -f 5 -t 1
model_name: "nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.31
  - name: "exact_match,flexible-extract"
    value: 0.47
limit: 1319
num_fewshot: 5
@@ -4,7 +4,7 @@ Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
Minitron-4B-Base-FP8.yaml
Qwen1.5-MoE-W4A16-compressed-tensors.yaml
Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml
Qwen2-1.5B-Instruct-FP8W8.yaml
Meta-Llama-3-8B-QQQ.yaml
@ -163,11 +163,6 @@ steps:
|
||||
- tests/tracing
|
||||
commands:
|
||||
- pytest -v -s metrics
|
||||
- "pip install \
|
||||
'opentelemetry-sdk>=1.26.0,<1.27.0' \
|
||||
'opentelemetry-api>=1.26.0,<1.27.0' \
|
||||
'opentelemetry-exporter-otlp>=1.26.0,<1.27.0' \
|
||||
'opentelemetry-semantic-conventions-ai>=0.4.1,<0.5.0'"
|
||||
- pytest -v -s tracing
|
||||
|
||||
##### fast check tests #####
|
||||
@ -292,6 +287,14 @@ steps:
|
||||
command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py
|
||||
parallelism: 4
|
||||
|
||||
- label: PyTorch Compilation Unit Tests
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/compile
|
||||
commands:
|
||||
- pytest -v -s compile/test_pass_manager.py
|
||||
- pytest -v -s compile/test_fusion.py
|
||||
|
||||
- label: PyTorch Fullgraph Smoke Test # 9min
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
@ -301,7 +304,6 @@ steps:
|
||||
# these tests need to be separated, cannot combine
|
||||
- pytest -v -s compile/piecewise/test_simple.py
|
||||
- pytest -v -s compile/piecewise/test_toy_llama.py
|
||||
- pytest -v -s compile/test_pass_manager.py
|
||||
|
||||
- label: PyTorch Fullgraph Test # 18min
|
||||
source_file_dependencies:
|
||||
@ -427,7 +429,7 @@ steps:
|
||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||
- pytest -v -s models/multimodal
|
||||
- pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model'
|
||||
- pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'core_model or quant_model'
|
||||
- pytest -v -s models/decoder_only/vision_language -m 'core_model or quant_model'
|
||||
- pytest -v -s models/embedding/vision_language -m core_model
|
||||
- pytest -v -s models/encoder_decoder/audio_language -m core_model
|
||||
- pytest -v -s models/encoder_decoder/language -m core_model
|
||||
@ -446,10 +448,7 @@ steps:
|
||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||
- pytest -v -s models/decoder_only/audio_language -m 'not core_model and not quant_model'
|
||||
- pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=0) and not core_model and not quant_model'
|
||||
# HACK - run phi3v tests separately to sidestep this transformers bug
|
||||
# https://github.com/huggingface/transformers/issues/34307
|
||||
- pytest -v -s models/decoder_only/vision_language/test_phi3v.py
|
||||
- pytest -v -s --ignore models/decoder_only/vision_language/test_models.py --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model and not quant_model'
|
||||
- pytest -v -s --ignore models/decoder_only/vision_language/test_models.py models/decoder_only/vision_language -m 'not core_model and not quant_model'
|
||||
- pytest -v -s models/embedding/vision_language -m 'not core_model'
|
||||
- pytest -v -s models/encoder_decoder/language -m 'not core_model'
|
||||
- pytest -v -s models/encoder_decoder/vision_language -m 'not core_model'
|
||||
|
||||
.github/ISSUE_TEMPLATE/600-new-model.yml (vendored, 2 changes)
@ -9,7 +9,7 @@ body:
|
||||
value: >
|
||||
#### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
|
||||
|
||||
#### We also highly recommend you read https://docs.vllm.ai/en/latest/contributing/model/adding_model.html first to understand how to add a new model.
|
||||
#### We also highly recommend you read https://docs.vllm.ai/en/latest/contributing/model/index.html first to understand how to add a new model.
|
||||
- type: textarea
|
||||
attributes:
|
||||
label: The model to consider.
|
||||
|
||||
.github/PULL_REQUEST_TEMPLATE.md (vendored, 2 changes)
@ -3,4 +3,4 @@ FILL IN THE PR DESCRIPTION HERE
|
||||
FIX #xxxx (*link existing issues this PR will resolve*)
|
||||
|
||||
<!--- pyml disable-next-line no-emphasis-as-heading -->
|
||||
**BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing/overview.html>**
|
||||
**BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing/overview.html>** (anything written below this line will be removed by GitHub Actions)
|
||||
|
||||
@ -122,6 +122,12 @@ repos:
|
||||
language: system
|
||||
always_run: true
|
||||
pass_filenames: false
|
||||
- id: update-dockerfile-graph
|
||||
name: Update Dockerfile dependency graph
|
||||
entry: tools/update-dockerfile-graph.sh
|
||||
language: script
|
||||
files: ^docker/Dockerfile$
|
||||
pass_filenames: false
|
||||
# Keep `suggestion` last
|
||||
- id: suggestion
|
||||
name: Suggestion
|
||||
|
||||
@ -230,6 +230,7 @@ set(VLLM_EXT_SRC
|
||||
"csrc/cache_kernels.cu"
|
||||
"csrc/attention/paged_attention_v1.cu"
|
||||
"csrc/attention/paged_attention_v2.cu"
|
||||
"csrc/attention/merge_attn_states.cu"
|
||||
"csrc/pos_encoding_kernels.cu"
|
||||
"csrc/activation_kernels.cu"
|
||||
"csrc/layernorm_kernels.cu"
|
||||
|
||||
@ -15,11 +15,8 @@ Easy, fast, and cheap LLM serving for everyone
|
||||
|
||||
---
|
||||
|
||||
[2025/04] We're hosting our first-ever *vLLM Asia Developer Day* in Singapore on *April 3rd*! This is a full-day event (9 AM - 9 PM SGT) in partnership with SGInnovate, AMD, and Embedded LLM. Meet the vLLM team and learn about LLM inference for RL, MI300X, and more! [Register Now](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)
|
||||
|
||||
---
|
||||
|
||||
*Latest News* 🔥
|
||||
- [2025/04] We hosted [Asia Developer Day](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/19cp6Qu8u48ihB91A064XfaXruNYiBOUKrBxAmDOllOo/edit?usp=sharing).
|
||||
- [2025/03] We hosted [vLLM x Ollama Inference Night](https://lu.ma/vllm-ollama)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/16T2PDD1YwRnZ4Tu8Q5r6n53c5Lr5c73UV9Vd2_eBo4U/edit?usp=sharing).
|
||||
- [2025/03] We hosted [the first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit?usp=sharing).
|
||||
- [2025/03] We hosted [the East Coast vLLM Meetup](https://lu.ma/7mu4k4xx)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0).
|
||||
|
||||
@ -288,7 +288,7 @@ def process_image(image: Any) -> Mapping[str, Any]:
|
||||
class RandomDataset(BenchmarkDataset):
|
||||
# Default values copied from benchmark_serving.py for the random dataset.
|
||||
DEFAULT_PREFIX_LEN = 0
|
||||
DEFAULT_RANGE_RATIO = 1.0
|
||||
DEFAULT_RANGE_RATIO = 0.0
|
||||
DEFAULT_INPUT_LEN = 1024
|
||||
DEFAULT_OUTPUT_LEN = 128
|
||||
|
||||
@ -308,19 +308,32 @@ class RandomDataset(BenchmarkDataset):
|
||||
output_len: int = DEFAULT_OUTPUT_LEN,
|
||||
**kwargs,
|
||||
) -> list[SampleRequest]:
|
||||
# Enforce range_ratio < 1
|
||||
assert range_ratio < 1.0, (
|
||||
"random_range_ratio must be < 1.0 to ensure a valid sampling range"
|
||||
)
|
||||
|
||||
vocab_size = tokenizer.vocab_size
|
||||
|
||||
prefix_token_ids = (np.random.randint(
|
||||
0, vocab_size, size=prefix_len).tolist() if prefix_len > 0 else [])
|
||||
|
||||
input_low = int(input_len * range_ratio)
|
||||
output_low = int(output_len * range_ratio)
|
||||
# New sampling logic: [X * (1 - b), X * (1 + b)]
|
||||
input_low = int(input_len * (1 - range_ratio))
|
||||
input_high = int(input_len * (1 + range_ratio))
|
||||
output_low = int(output_len * (1 - range_ratio))
|
||||
output_high = int(output_len * (1 + range_ratio))
|
||||
|
||||
# Add logging for debugging
|
||||
logger.info("Sampling input_len from [%s, %s]", input_low, input_high)
|
||||
logger.info("Sampling output_len from [%s, %s]", output_low,
|
||||
output_high)
|
||||
|
||||
input_lens = np.random.randint(input_low,
|
||||
input_len + 1,
|
||||
input_high + 1,
|
||||
size=num_requests)
|
||||
output_lens = np.random.randint(output_low,
|
||||
output_len + 1,
|
||||
output_high + 1,
|
||||
size=num_requests)
|
||||
offsets = np.random.randint(0, vocab_size, size=num_requests)
|
||||
|
||||
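As a quick illustration (not part of the diff), the new symmetric range sampling described by the `[X * (1 - b), X * (1 + b)]` comment above can be sketched with NumPy as follows; the concrete values of `input_len`, `output_len`, and `range_ratio` here are placeholders:

```python
import numpy as np

# Placeholder values; the real defaults live on RandomDataset above.
input_len, output_len, range_ratio, num_requests = 1024, 128, 0.25, 4

# Sample lengths uniformly from [X * (1 - b), X * (1 + b)] for X in {input_len, output_len}.
input_low = int(input_len * (1 - range_ratio))
input_high = int(input_len * (1 + range_ratio))
output_low = int(output_len * (1 - range_ratio))
output_high = int(output_len * (1 + range_ratio))

input_lens = np.random.randint(input_low, input_high + 1, size=num_requests)
output_lens = np.random.randint(output_low, output_high + 1, size=num_requests)
print(input_lens, output_lens)  # e.g. input lengths in [768, 1280], output lengths in [96, 160]
```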
@ -472,11 +485,11 @@ class SonnetDataset(BenchmarkDataset):
|
||||
|
||||
# Determine how many poem lines to use.
|
||||
num_input_lines = round((input_len - base_offset) / avg_len)
|
||||
num_prefix_lines = round((prefix_len - base_offset) / avg_len)
|
||||
num_prefix_lines = max(round((prefix_len - base_offset) / avg_len), 0)
|
||||
prefix_lines = self.data[:num_prefix_lines]
|
||||
|
||||
samples = []
|
||||
for _ in range(num_requests):
|
||||
while len(samples) < num_requests:
|
||||
extra_lines = random.choices(self.data,
|
||||
k=num_input_lines - num_prefix_lines)
|
||||
prompt = f"{base_prompt}{''.join(prefix_lines + extra_lines)}"
|
||||
@ -484,13 +497,14 @@ class SonnetDataset(BenchmarkDataset):
|
||||
prompt_formatted = tokenizer.apply_chat_template(
|
||||
msg, add_generation_prompt=True, tokenize=False)
|
||||
prompt_len = len(tokenizer(prompt_formatted).input_ids)
|
||||
samples.append(
|
||||
SampleRequest(
|
||||
prompt=prompt_formatted
|
||||
if return_prompt_formatted else prompt,
|
||||
prompt_len=prompt_len,
|
||||
expected_output_len=output_len,
|
||||
))
|
||||
if prompt_len <= input_len:
|
||||
samples.append(
|
||||
SampleRequest(
|
||||
prompt=prompt_formatted
|
||||
if return_prompt_formatted else prompt,
|
||||
prompt_len=prompt_len,
|
||||
expected_output_len=output_len,
|
||||
))
|
||||
return samples
|
||||
|
||||
|
||||
|
||||
@ -156,7 +156,7 @@ def calculate_metrics(
|
||||
if outputs[i].success:
|
||||
output_len = outputs[i].output_tokens
|
||||
|
||||
if output_len is None:
|
||||
if not output_len:
|
||||
# We use the tokenizer to count the number of output tokens
|
||||
# for some serving backends instead of looking at
|
||||
# len(outputs[i].itl) since multiple output tokens may be
|
||||
@ -921,7 +921,7 @@ if __name__ == "__main__":
|
||||
"--percentile-metrics",
|
||||
type=str,
|
||||
default="ttft,tpot,itl",
|
||||
help="Comma-seperated list of selected metrics to report percentils. "
|
||||
help="Comma-separated list of selected metrics to report percentils. "
|
||||
"This argument specifies the metrics to report percentiles. "
|
||||
"Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". "
|
||||
"Default value is \"ttft,tpot,itl\".")
|
||||
@ -929,7 +929,7 @@ if __name__ == "__main__":
|
||||
"--metric-percentiles",
|
||||
type=str,
|
||||
default="99",
|
||||
help="Comma-seperated list of percentiles for selected metrics. "
|
||||
help="Comma-separated list of percentiles for selected metrics. "
|
||||
"To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". "
|
||||
"Default value is \"99\". "
|
||||
"Use \"--percentile-metrics\" to select metrics.",
|
||||
@ -996,18 +996,23 @@ if __name__ == "__main__":
|
||||
random_group.add_argument(
|
||||
"--random-range-ratio",
|
||||
type=float,
|
||||
default=1.0,
|
||||
help="Range of sampled ratio of input/output length, "
|
||||
"used only for random sampling.",
|
||||
default=0.0,
|
||||
help="Range ratio for sampling input/output length, "
|
||||
"used only for random sampling. Must be in the range [0, 1) to define "
|
||||
"a symmetric sampling range"
|
||||
"[length * (1 - range_ratio), length * (1 + range_ratio)].",
|
||||
)
|
||||
random_group.add_argument(
|
||||
"--random-prefix-len",
|
||||
type=int,
|
||||
default=0,
|
||||
help="Number of fixed prefix tokens before random "
|
||||
" context. The length range of context in a random "
|
||||
" request is [random-prefix-len, "
|
||||
" random-prefix-len + random-prefix-len * random-range-ratio).")
|
||||
help=("Number of fixed prefix tokens before the random context "
|
||||
"in a request. "
|
||||
"The total input length is the sum of `random-prefix-len` and "
|
||||
"a random "
|
||||
"context length sampled from [input_len * (1 - range_ratio), "
|
||||
"input_len * (1 + range_ratio)]."),
|
||||
)
|
||||
|
||||
hf_group = parser.add_argument_group("hf dataset options")
|
||||
hf_group.add_argument("--hf-subset",
|
||||
|
||||
@ -11,7 +11,7 @@ On the client side, run:
|
||||
--model <your_model> \
|
||||
--dataset json \
|
||||
--structured-output-ratio 1.0 \
|
||||
--structured-output-backend xgrammar \
|
||||
--structured-output-backend auto \
|
||||
--request-rate 10 \
|
||||
--num-prompts 1000
|
||||
|
||||
@ -130,10 +130,11 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
|
||||
"description":
|
||||
"An unique optional field to avoid cached schemas"
|
||||
}
|
||||
else:
|
||||
json_schemas = [schema] * args.num_prompts
|
||||
|
||||
def gen_prompt(index: int):
|
||||
schema = json_schemas[index % len(json_schemas)]
|
||||
return f"Generate an example of a user profile given the following schema: {json.dumps(schema)}" # noqa: E501
|
||||
return f"Generate an example of a user profile given the following schema: {json.dumps(get_schema(index))}" # noqa: E501
|
||||
|
||||
def get_schema(index: int):
|
||||
return json_schemas[index % len(json_schemas)]
|
||||
@ -963,7 +964,7 @@ if __name__ == "__main__":
|
||||
"--percentile-metrics",
|
||||
type=str,
|
||||
default="ttft,tpot,itl",
|
||||
help="Comma-seperated list of selected metrics to report percentils. "
|
||||
help="Comma-separated list of selected metrics to report percentils. "
|
||||
"This argument specifies the metrics to report percentiles. "
|
||||
"Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". "
|
||||
"Default value is \"ttft,tpot,itl\".")
|
||||
@ -971,7 +972,7 @@ if __name__ == "__main__":
|
||||
"--metric-percentiles",
|
||||
type=str,
|
||||
default="99",
|
||||
help="Comma-seperated list of percentiles for selected metrics. "
|
||||
help="Comma-separated list of percentiles for selected metrics. "
|
||||
"To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". "
|
||||
"Default value is \"99\". "
|
||||
"Use \"--percentile-metrics\" to select metrics.",
|
||||
@ -996,12 +997,14 @@ if __name__ == "__main__":
|
||||
type=float,
|
||||
default=1.0,
|
||||
help="Ratio of Structured Outputs requests")
|
||||
parser.add_argument(
|
||||
"--structured-output-backend",
|
||||
type=str,
|
||||
choices=["outlines", "lm-format-enforcer", "xgrammar", "guidance"],
|
||||
default="xgrammar",
|
||||
help="Backend to use for structured outputs")
|
||||
parser.add_argument("--structured-output-backend",
|
||||
type=str,
|
||||
choices=[
|
||||
"outlines", "lm-format-enforcer", "xgrammar",
|
||||
"guidance", "auto"
|
||||
],
|
||||
default="auto",
|
||||
help="Backend to use for structured outputs")
|
||||
|
||||
args = parser.parse_args()
|
||||
main(args)
|
||||
|
||||
@ -213,14 +213,17 @@ def run_hf(
|
||||
max_prompt_len = 0
|
||||
max_output_len = 0
|
||||
for i in range(len(requests)):
|
||||
prompt, prompt_len, output_len = requests[i]
|
||||
prompt = requests[i].prompt
|
||||
prompt_len = requests[i].prompt_len
|
||||
output_len = requests[i].expected_output_len
|
||||
# Add the prompt to the batch.
|
||||
batch.append(prompt)
|
||||
max_prompt_len = max(max_prompt_len, prompt_len)
|
||||
max_output_len = max(max_output_len, output_len)
|
||||
if len(batch) < max_batch_size and i != len(requests) - 1:
|
||||
# Check if we can add more requests to the batch.
|
||||
_, next_prompt_len, next_output_len = requests[i + 1]
|
||||
next_prompt_len = requests[i + 1].prompt_len
|
||||
next_output_len = requests[i + 1].expected_output_len
|
||||
if (max(max_prompt_len, next_prompt_len) +
|
||||
max(max_output_len, next_output_len)) <= 2048:
|
||||
# We can add more requests to the batch.
|
||||
@ -591,18 +594,22 @@ if __name__ == "__main__":
|
||||
default=None,
|
||||
help="Path to the lora adapters to use. This can be an absolute path, "
|
||||
"a relative path, or a Hugging Face model identifier.")
|
||||
parser.add_argument("--prefix-len",
|
||||
type=int,
|
||||
default=None,
|
||||
help="Number of prefix tokens per request."
|
||||
"This is for the RandomDataset and SonnetDataset")
|
||||
parser.add_argument(
|
||||
"--prefix-len",
|
||||
type=int,
|
||||
default=0,
|
||||
help="Number of fixed prefix tokens before the random "
|
||||
"context in a request (default: 0).",
|
||||
)
|
||||
# random dataset
|
||||
parser.add_argument(
|
||||
"--random-range-ratio",
|
||||
type=float,
|
||||
default=None,
|
||||
help="Range of sampled ratio of input/output length, "
|
||||
"used only for RandomDataSet.",
|
||||
default=0.0,
|
||||
help="Range ratio for sampling input/output length, "
|
||||
"used only for RandomDataset. Must be in the range [0, 1) to define "
|
||||
"a symmetric sampling range "
|
||||
"[length * (1 - range_ratio), length * (1 + range_ratio)].",
|
||||
)
|
||||
|
||||
# hf dtaset
|
||||
|
||||
@ -105,8 +105,14 @@ def run(command):
|
||||
else:
|
||||
enc = locale.getpreferredencoding()
|
||||
output = raw_output.decode(enc)
|
||||
if command == 'nvidia-smi topo -m':
|
||||
# don't remove the leading whitespace of `nvidia-smi topo -m`
|
||||
# because they are meaningful
|
||||
output = output.rstrip()
|
||||
else:
|
||||
output = output.strip()
|
||||
err = raw_err.decode(enc)
|
||||
return rc, output.strip(), err.strip()
|
||||
return rc, output, err.strip()
|
||||
|
||||
|
||||
def run_and_read_all(run_lambda, command):
|
||||
|
||||
csrc/attention/merge_attn_states.cu (new file, 173 lines)
@ -0,0 +1,173 @@
|
||||
#include <optional>
|
||||
#include <torch/all.h>
|
||||
#include <ATen/cuda/CUDAContext.h>
|
||||
#include <c10/cuda/CUDAGuard.h>
|
||||
#include <algorithm>
|
||||
|
||||
#include "attention_dtypes.h"
|
||||
#include "attention_utils.cuh"
|
||||
|
||||
namespace vllm {
|
||||
|
||||
// Implements section 2.2 of https://www.arxiv.org/pdf/2501.01005
|
||||
// can be used to combine partial attention results (in the split-KV case)
|
||||
template <typename scalar_t, const uint NUM_THREADS>
|
||||
__global__ void merge_attn_states_kernel(
|
||||
scalar_t* output, float* output_lse, const scalar_t* prefix_output,
|
||||
const float* prefix_lse, const scalar_t* suffix_output,
|
||||
const float* suffix_lse, const uint num_tokens, const uint num_heads,
|
||||
const uint head_size) {
|
||||
using pack_128b_t = uint4;
|
||||
const uint pack_size = 16 / sizeof(scalar_t);
|
||||
const uint threads_per_head = head_size / pack_size;
|
||||
|
||||
const uint global_idx = blockIdx.x * NUM_THREADS + threadIdx.x;
|
||||
const uint token_head_threads = num_tokens * num_heads * threads_per_head;
|
||||
|
||||
if (global_idx >= token_head_threads) return;
|
||||
|
||||
// global_idx -> token_idx + head_idx + pack_idx
|
||||
const uint token_head_idx = global_idx / threads_per_head;
|
||||
const uint pack_idx = global_idx % threads_per_head;
|
||||
|
||||
const uint token_idx = token_head_idx / num_heads;
|
||||
const uint head_idx = token_head_idx % num_heads;
|
||||
|
||||
const uint pack_offset = pack_idx * pack_size; // (0~15)*8, etc.
|
||||
const uint head_offset =
|
||||
token_idx * num_heads * head_size + head_idx * head_size;
|
||||
const scalar_t* prefix_head_ptr = prefix_output + head_offset;
|
||||
const scalar_t* suffix_head_ptr = suffix_output + head_offset;
|
||||
scalar_t* output_head_ptr = output + head_offset;
|
||||
|
||||
float p_lse = prefix_lse[head_idx * num_tokens + token_idx];
|
||||
float s_lse = suffix_lse[head_idx * num_tokens + token_idx];
|
||||
p_lse = std::isinf(p_lse) ? -std::numeric_limits<float>::infinity() : p_lse;
|
||||
s_lse = std::isinf(s_lse) ? -std::numeric_limits<float>::infinity() : s_lse;
|
||||
|
||||
const float max_lse = fmaxf(p_lse, s_lse);
|
||||
p_lse = p_lse - max_lse;
|
||||
s_lse = s_lse - max_lse;
|
||||
const float p_se = expf(p_lse);
|
||||
const float s_se = expf(s_lse);
|
||||
const float out_se = p_se + s_se;
|
||||
const float p_scale = p_se / out_se;
|
||||
const float s_scale = s_se / out_se;
|
||||
|
||||
if (pack_offset < head_size) {
|
||||
// Pack 128b load
|
||||
pack_128b_t p_out_pack = reinterpret_cast<const pack_128b_t*>(
|
||||
prefix_head_ptr)[pack_offset / pack_size];
|
||||
pack_128b_t s_out_pack = reinterpret_cast<const pack_128b_t*>(
|
||||
suffix_head_ptr)[pack_offset / pack_size];
|
||||
pack_128b_t o_out_pack;
|
||||
|
||||
#pragma unroll
|
||||
for (uint i = 0; i < pack_size; ++i) {
|
||||
// Always use float for FMA to keep high precision.
|
||||
// half(uint16_t), bfloat16, float -> float.
|
||||
const float p_out_f =
|
||||
vllm::to_float(reinterpret_cast<const scalar_t*>(&p_out_pack)[i]);
|
||||
const float s_out_f =
|
||||
vllm::to_float(reinterpret_cast<const scalar_t*>(&s_out_pack)[i]);
|
||||
// fma: a * b + c = p_out_f * p_scale + (s_out_f * s_scale)
|
||||
const float o_out_f = p_out_f * p_scale + (s_out_f * s_scale);
|
||||
// float -> half(uint16_t), bfloat16, float.
|
||||
vllm::from_float(reinterpret_cast<scalar_t*>(&o_out_pack)[i], o_out_f);
|
||||
}
|
||||
|
||||
// Pack 128b storage
|
||||
reinterpret_cast<pack_128b_t*>(output_head_ptr)[pack_offset / pack_size] =
|
||||
o_out_pack;
|
||||
}
|
||||
// We only need to write to output_lse once per head.
|
||||
if (output_lse != nullptr && pack_idx == 0) {
|
||||
float out_lse = logf(out_se) + max_lse;
|
||||
output_lse[head_idx * num_tokens + token_idx] = out_lse;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace vllm
|
||||
|
||||
// The following macro is used to dispatch the conversion function based on
|
||||
// the output data type. The FN is a macro that calls a function with
|
||||
// template<typename scalar_t>.
|
||||
#define DISPATCH_BY_SCALAR_DTYPE(scalar_dtype, fn) \
|
||||
{ \
|
||||
if (scalar_dtype == at::ScalarType::Float) { \
|
||||
fn(float); \
|
||||
} else if (scalar_dtype == at::ScalarType::Half) { \
|
||||
fn(uint16_t); \
|
||||
} else if (scalar_dtype == at::ScalarType::BFloat16) { \
|
||||
fn(__nv_bfloat16); \
|
||||
} else { \
|
||||
TORCH_CHECK(false, "Unsupported data type of O: ", scalar_dtype); \
|
||||
} \
|
||||
}
|
||||
|
||||
#define LAUNCH_MERGE_ATTN_STATES(scalar_t, NUM_THREADS) \
|
||||
{ \
|
||||
vllm::merge_attn_states_kernel<scalar_t, NUM_THREADS><<<grid, block>>>( \
|
||||
reinterpret_cast<scalar_t*>(output.data_ptr()), output_lse_ptr, \
|
||||
reinterpret_cast<scalar_t*>(prefix_output.data_ptr()), \
|
||||
reinterpret_cast<float*>(prefix_lse.data_ptr()), \
|
||||
reinterpret_cast<scalar_t*>(suffix_output.data_ptr()), \
|
||||
reinterpret_cast<float*>(suffix_lse.data_ptr()), num_tokens, \
|
||||
num_heads, head_size); \
|
||||
}
|
||||
|
||||
/*@brief Merges the attention states from prefix and suffix
|
||||
* into the output tensor. NUM_TOKENS: n, NUM_HEADS: h, HEAD_SIZE: d
|
||||
*
|
||||
* @param output [n,h,d] The output tensor to store the merged attention states.
|
||||
* @param output_lse [h,d] Optional tensor to store the log-sum-exp values.
|
||||
* @param prefix_output [n,h,d] The prefix attention states.
|
||||
* @param prefix_lse [h,d] The log-sum-exp values for the prefix attention
|
||||
* states.
|
||||
* @param suffix_output [n,h,d] The suffix attention states.
|
||||
* @param suffix_lse [h,d] The log-sum-exp values for the suffix attention
|
||||
* states.
|
||||
*/
|
||||
template <typename scalar_t>
|
||||
void merge_attn_states_launcher(torch::Tensor& output,
|
||||
std::optional<torch::Tensor> output_lse,
|
||||
const torch::Tensor& prefix_output,
|
||||
const torch::Tensor& prefix_lse,
|
||||
const torch::Tensor& suffix_output,
|
||||
const torch::Tensor& suffix_lse) {
|
||||
constexpr uint NUM_THREADS = 128;
|
||||
const uint num_tokens = output.size(0);
|
||||
const uint num_heads = output.size(1);
|
||||
const uint head_size = output.size(2);
|
||||
const uint pack_size = 16 / sizeof(scalar_t);
|
||||
TORCH_CHECK(head_size % pack_size == 0,
|
||||
"headsize must be multiple of pack_size:", pack_size);
|
||||
float* output_lse_ptr = nullptr;
|
||||
if (output_lse.has_value()) {
|
||||
output_lse_ptr = output_lse.value().data_ptr<float>();
|
||||
}
|
||||
// process one pack elements per thread. float -> 4, half/bf16 -> 8
|
||||
const uint threads_per_head = head_size / pack_size;
|
||||
const uint total_threads = num_tokens * num_heads * threads_per_head;
|
||||
|
||||
dim3 block(NUM_THREADS);
|
||||
dim3 grid((total_threads + NUM_THREADS - 1) / NUM_THREADS);
|
||||
|
||||
LAUNCH_MERGE_ATTN_STATES(scalar_t, NUM_THREADS);
|
||||
}
|
||||
|
||||
#define CALL_MERGE_ATTN_STATES_LAUNCHER(scalar_t) \
|
||||
{ \
|
||||
merge_attn_states_launcher<scalar_t>(output, output_lse, prefix_output, \
|
||||
prefix_lse, suffix_output, \
|
||||
suffix_lse); \
|
||||
}
|
||||
|
||||
void merge_attn_states(torch::Tensor& output,
|
||||
std::optional<torch::Tensor> output_lse,
|
||||
const torch::Tensor& prefix_output,
|
||||
const torch::Tensor& prefix_lse,
|
||||
const torch::Tensor& suffix_output,
|
||||
const torch::Tensor& suffix_lse) {
|
||||
DISPATCH_BY_SCALAR_DTYPE(output.dtype(), CALL_MERGE_ATTN_STATES_LAUNCHER);
|
||||
}
|
||||
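For readers of the kernel above, a rough PyTorch reference of what `merge_attn_states` computes (an illustrative sketch, not code from this PR) is the log-sum-exp weighted combination of the two partial attention results:

```python
import torch

def merge_attn_states_ref(prefix_output, prefix_lse, suffix_output, suffix_lse):
    """Illustrative reference for merging split-KV partial attention results.

    prefix_output / suffix_output: [num_tokens, num_heads, head_size]
    prefix_lse / suffix_lse: [num_heads, num_tokens], matching the kernel's indexing.
    """
    # Map inf LSE values to -inf, as the kernel does before taking the max.
    neg_inf = torch.full_like(prefix_lse, float("-inf"))
    p_lse = torch.where(torch.isinf(prefix_lse), neg_inf, prefix_lse)
    s_lse = torch.where(torch.isinf(suffix_lse), neg_inf, suffix_lse)

    max_lse = torch.maximum(p_lse, s_lse)
    p_se = torch.exp(p_lse - max_lse)
    s_se = torch.exp(s_lse - max_lse)
    out_se = p_se + s_se

    # Per-(head, token) scales, reshaped to broadcast over head_size.
    p_scale = (p_se / out_se).transpose(0, 1).unsqueeze(-1)
    s_scale = (s_se / out_se).transpose(0, 1).unsqueeze(-1)

    output = prefix_output * p_scale + suffix_output * s_scale
    output_lse = torch.log(out_se) + max_lse  # [num_heads, num_tokens]
    return output, output_lse
```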
@ -4,6 +4,11 @@
|
||||
#include <string>
|
||||
#include <sched.h>
|
||||
#endif
|
||||
#if __GLIBC__ == 2 && __GLIBC_MINOR__ < 30
|
||||
#include <unistd.h>
|
||||
#include <sys/syscall.h>
|
||||
#define gettid() syscall(SYS_gettid)
|
||||
#endif
|
||||
|
||||
#include "cpu_types.hpp"
|
||||
|
||||
|
||||
@ -422,7 +422,7 @@ void causal_conv1d_fwd_kernel(ConvParamsBase params) {
|
||||
int final_state_position = ((seqlen - (kWidth - 1)) - (n_chunks - 1) * kChunkSize);
|
||||
// in case the final state is separated between the last "smem_exchange" and
|
||||
// and the one before it (chunk = n_chunks - 1 and chunk = n_chunks - 2),
|
||||
// (which occurs when `final_state_position` is a non-positivie index)
|
||||
// (which occurs when `final_state_position` is a non-positive index)
|
||||
// we load the correct data from smem_exchange from both chunks, the last chunk iteration and the one before it
|
||||
if (conv_states != nullptr && final_state_position < 0 && seqlen > kWidth){
|
||||
input_t vals_load[kNElts] = {0};
|
||||
|
||||
@ -52,6 +52,15 @@ void paged_attention_v2(
|
||||
const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
|
||||
const int64_t blocksparse_head_sliding_step);
|
||||
|
||||
#ifndef USE_ROCM
|
||||
void merge_attn_states(torch::Tensor& output,
|
||||
std::optional<torch::Tensor> output_lse,
|
||||
const torch::Tensor& prefix_output,
|
||||
const torch::Tensor& prefix_lse,
|
||||
const torch::Tensor& suffix_output,
|
||||
const torch::Tensor& suffix_lse);
|
||||
#endif
|
||||
|
||||
void rms_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight,
|
||||
double epsilon);
|
||||
|
||||
|
||||
@ -129,7 +129,7 @@ static __device__ __forceinline__ void moe_q(
|
||||
}
|
||||
|
||||
#if defined(USE_ROCM)
|
||||
#define MOE_X_Q4_0 64
|
||||
#define MOE_X_Q4_0 8
|
||||
#define MOE_Y_Q4_0 128
|
||||
#define NWARPS_Q4_0 8
|
||||
#else
|
||||
@ -190,7 +190,7 @@ static void ggml_moe_q4_0_q8_1_cuda(
|
||||
}
|
||||
|
||||
#if defined(USE_ROCM)
|
||||
#define MOE_X_Q4_1 64
|
||||
#define MOE_X_Q4_1 8
|
||||
#define MOE_Y_Q4_1 128
|
||||
#define NWARPS_Q4_1 8
|
||||
#else
|
||||
@ -251,7 +251,7 @@ static void ggml_moe_q4_1_q8_1_cuda(
|
||||
}
|
||||
|
||||
#if defined(USE_ROCM)
|
||||
#define MOE_X_Q5_0 64
|
||||
#define MOE_X_Q5_0 8
|
||||
#define MOE_Y_Q5_0 128
|
||||
#define NWARPS_Q5_0 8
|
||||
#else
|
||||
@ -312,7 +312,7 @@ static void ggml_moe_q5_0_q8_1_cuda(
|
||||
}
|
||||
|
||||
#if defined(USE_ROCM)
|
||||
#define MOE_X_Q5_1 64
|
||||
#define MOE_X_Q5_1 8
|
||||
#define MOE_Y_Q5_1 128
|
||||
#define NWARPS_Q5_1 8
|
||||
#else
|
||||
@ -373,7 +373,7 @@ static void ggml_moe_q5_1_q8_1_cuda(
|
||||
}
|
||||
|
||||
#if defined(USE_ROCM)
|
||||
#define MOE_X_Q8_0 64
|
||||
#define MOE_X_Q8_0 8
|
||||
#define MOE_Y_Q8_0 128
|
||||
#define NWARPS_Q8_0 8
|
||||
#else
|
||||
@ -434,7 +434,7 @@ static void ggml_moe_q8_0_q8_1_cuda(
|
||||
}
|
||||
|
||||
#if defined(USE_ROCM)
|
||||
#define MOE_X_Q2_K 64
|
||||
#define MOE_X_Q2_K 8
|
||||
#define MOE_Y_Q2_K 128
|
||||
#define NWARPS_Q2_K 8
|
||||
#else
|
||||
@ -495,7 +495,7 @@ static void ggml_moe_q2_K_q8_1_cuda(
|
||||
}
|
||||
|
||||
#if defined(USE_ROCM)
|
||||
#define MOE_X_Q3_K 64
|
||||
#define MOE_X_Q3_K 8
|
||||
#define MOE_Y_Q3_K 128
|
||||
#define NWARPS_Q3_K 8
|
||||
#else
|
||||
@ -556,7 +556,7 @@ static void ggml_moe_q3_K_q8_1_cuda(
|
||||
}
|
||||
|
||||
#if defined(USE_ROCM)
|
||||
#define MOE_X_Q4_K 64
|
||||
#define MOE_X_Q4_K 8
|
||||
#define MOE_Y_Q4_K 128
|
||||
#define NWARPS_Q4_K 8
|
||||
#else
|
||||
@ -617,7 +617,7 @@ static void ggml_moe_q4_K_q8_1_cuda(
|
||||
}
|
||||
|
||||
#if defined(USE_ROCM)
|
||||
#define MOE_X_Q5_K 64
|
||||
#define MOE_X_Q5_K 8
|
||||
#define MOE_Y_Q5_K 128
|
||||
#define NWARPS_Q5_K 8
|
||||
#else
|
||||
@ -678,7 +678,7 @@ static void ggml_moe_q5_K_q8_1_cuda(
|
||||
}
|
||||
|
||||
#if defined(USE_ROCM)
|
||||
#define MOE_X_Q6_K 64
|
||||
#define MOE_X_Q6_K 8
|
||||
#define MOE_Y_Q6_K 128
|
||||
#define NWARPS_Q6_K 8
|
||||
#else
|
||||
|
||||
@ -64,6 +64,21 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
||||
" int blocksparse_head_sliding_step) -> ()");
|
||||
ops.impl("paged_attention_v2", torch::kCUDA, &paged_attention_v2);
|
||||
|
||||
#ifndef USE_ROCM
|
||||
// Merge attn states
|
||||
// Implements section 2.2 of https://www.arxiv.org/pdf/2501.01005
|
||||
// can be used to combine partial attention results (in the split-KV case)
|
||||
ops.def(
|
||||
"merge_attn_states("
|
||||
" Tensor! output,"
|
||||
" Tensor!? output_lse,"
|
||||
" Tensor prefix_output,"
|
||||
" Tensor prefix_lse,"
|
||||
" Tensor suffix_output,"
|
||||
" Tensor suffix_lse) -> ()");
|
||||
ops.impl("merge_attn_states", torch::kCUDA, &merge_attn_states);
|
||||
#endif
|
||||
|
||||
// Activation ops
|
||||
// Activation function used in SwiGLU.
|
||||
ops.def("silu_and_mul(Tensor! out, Tensor input) -> ()");
|
||||
|
||||
@ -18,6 +18,8 @@ WORKDIR /workspace/
|
||||
ARG PYTHON_VERSION=3.12
|
||||
ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
|
||||
|
||||
ENV LD_PRELOAD=""
|
||||
|
||||
# Install minimal dependencies and uv
|
||||
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
|
||||
--mount=type=cache,target=/var/lib/apt,sharing=locked \
|
||||
@ -32,6 +34,7 @@ ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache
|
||||
|
||||
ENV PATH="/root/.local/bin:$PATH"
|
||||
ENV VIRTUAL_ENV="/opt/venv"
|
||||
ENV UV_PYTHON_INSTALL_DIR=/opt/uv/python
|
||||
RUN uv venv --python ${PYTHON_VERSION} --seed ${VIRTUAL_ENV}
|
||||
ENV PATH="$VIRTUAL_ENV/bin:$PATH"
|
||||
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
FROM vault.habana.ai/gaudi-docker/1.19.1/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest
|
||||
FROM vault.habana.ai/gaudi-docker/1.20.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
|
||||
|
||||
COPY ./ /workspace/vllm
|
||||
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
# default base image
|
||||
# https://gallery.ecr.aws/neuron/pytorch-inference-neuronx
|
||||
ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.5.1-neuronx-py310-sdk2.21.0-ubuntu22.04"
|
||||
ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.5.1-neuronx-py310-sdk2.22.0-ubuntu22.04"
|
||||
|
||||
FROM $BASE_IMAGE
|
||||
|
||||
@ -21,9 +21,9 @@ VOLUME [ ${APP_MOUNT} ]
|
||||
WORKDIR ${APP_MOUNT}/vllm
|
||||
|
||||
RUN python3 -m pip install --upgrade pip
|
||||
RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas
|
||||
RUN python3 -m pip install sentencepiece transformers==4.45.2 -U
|
||||
RUN python3 -m pip install neuronx-cc==2.16.345.0 --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
|
||||
RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas tenacity
|
||||
RUN python3 -m pip install sentencepiece transformers==4.48.0 -U
|
||||
RUN python3 -m pip install neuronx-cc==2.17.194.0 --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
|
||||
RUN python3 -m pip install pytest
|
||||
|
||||
# uninstall transformers-neuronx package explicitly to avoid version conflict
|
||||
|
||||
@ -4,6 +4,7 @@
|
||||
|
||||
We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below:
|
||||
|
||||
- [Asia Developer Day](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day), April 3rd 2025. [[Slides]](https://docs.google.com/presentation/d/19cp6Qu8u48ihB91A064XfaXruNYiBOUKrBxAmDOllOo/edit?usp=sharing).
|
||||
- [vLLM x Ollama Inference Night](https://lu.ma/vllm-ollama), March 27th 2025. [[Slides]](https://docs.google.com/presentation/d/16T2PDD1YwRnZ4Tu8Q5r6n53c5Lr5c73UV9Vd2_eBo4U/edit?usp=sharing).
|
||||
- [The first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg), March 16th 2025. [[Slides]](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit?usp=sharing).
|
||||
- [The East Coast vLLM Meetup](https://lu.ma/7mu4k4xx), March 11th 2025. [[Slides]](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0)
|
||||
|
||||
@ -79,6 +79,17 @@ Further update the model as follows:
|
||||
return inputs_embeds
|
||||
```
|
||||
|
||||
- Implement {meth}`~vllm.model_executor.models.interfaces.SupportsMultiModal.get_language_model` getter to provide stable access to the underlying language model.
|
||||
|
||||
```python
|
||||
class YourModelForImage2Seq(nn.Module):
|
||||
...
|
||||
|
||||
def get_language_model(self) -> torch.nn.Module:
|
||||
# Change `language_model` according to your implementation.
|
||||
return self.language_model
|
||||
```
|
||||
|
||||
- Once the above steps are done, update the model class with the {class}`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface.
|
||||
|
||||
```diff
|
||||
@ -110,17 +121,21 @@ def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
|
||||
return {"image": None, "video": 1}
|
||||
```
|
||||
|
||||
### Maximum number of placeholder feature tokens
|
||||
## 3. Specify dummy inputs
|
||||
|
||||
Also, override the abstract method {meth}`~vllm.multimodal.processing.BaseProcessingInfo.get_mm_max_tokens_per_item`
|
||||
to return the maximum number of placeholder feature tokens per input item for each modality.
|
||||
Then, inherit {class}`~vllm.multimodal.profiling.BaseDummyInputsBuilder` to construct dummy inputs for
|
||||
HF processing as well as memory profiling.
|
||||
|
||||
When calling the model, the output embeddings from the visual encoder are assigned to the input positions
|
||||
containing placeholder feature tokens. Therefore, the number of placeholder feature tokens should be equal
|
||||
to the size of the output embeddings.
|
||||
### For memory profiling
|
||||
|
||||
:::::{tab-set}
|
||||
::::{tab-item} Basic example: LLaVA
|
||||
Override the abstract method {meth}`~vllm.multimodal.profiling.BaseDummyInputsBuilder.get_dummy_processor_inputs`
|
||||
to construct dummy inputs for memory profiling. This dummy input should result in the worst-case memory usage of
|
||||
the model so that vLLM can reserve the correct amount of memory for it.
|
||||
|
||||
Assuming that the memory usage increases with the number of tokens, the dummy input can be constructed to maximize the number of output embeddings, which is the same number as placeholder feature tokens.
|
||||
|
||||
::::{tab-set}
|
||||
:::{tab-item} Basic example: LLaVA
|
||||
:sync: llava
|
||||
|
||||
Looking at the code of HF's `LlavaForConditionalGeneration`:
|
||||
@ -229,7 +244,7 @@ def get_num_image_tokens(
|
||||
```
|
||||
|
||||
Notice that the number of image tokens doesn't depend on the image width and height.
|
||||
So, we can calculate the maximum number of image tokens using any image size:
|
||||
We can simply use a dummy `image_size`:
|
||||
|
||||
```python
|
||||
def get_image_size_with_most_features(self) -> ImageSize:
|
||||
@ -237,33 +252,35 @@ def get_image_size_with_most_features(self) -> ImageSize:
|
||||
width = height = hf_config.image_size
|
||||
return ImageSize(width=width, height=height)
|
||||
|
||||
def get_max_image_tokens(self) -> int:
|
||||
target_width, target_height = self.get_image_size_with_most_features()
|
||||
|
||||
return self.get_num_image_tokens(
|
||||
image_width=target_width,
|
||||
image_height=target_height,
|
||||
)
|
||||
```
|
||||
|
||||
And thus, we can override the method as:
|
||||
|
||||
```python
|
||||
def get_mm_max_tokens_per_item(
|
||||
def get_dummy_processor_inputs(
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
) -> Mapping[str, int]:
|
||||
return {"image": self.get_max_image_tokens()}
|
||||
) -> ProcessorInputs:
|
||||
num_images = mm_counts.get("image", 0)
|
||||
|
||||
processor = self.info.get_hf_processor()
|
||||
image_token = processor.image_token
|
||||
|
||||
hf_config = self.get_hf_config()
|
||||
target_width, target_height = self.info.get_image_size_with_most_features()
|
||||
|
||||
mm_data = {
|
||||
"image":
|
||||
self._get_dummy_images(width=target_width,
|
||||
height=target_height,
|
||||
num_images=num_images)
|
||||
}
|
||||
|
||||
return ProcessorInputs(
|
||||
prompt_text=image_token * num_images,
|
||||
mm_data=mm_data,
|
||||
)
|
||||
```
|
||||
|
||||
:::{note}
|
||||
Our [actual code](gh-file:vllm/model_executor/models/llava.py) is more abstracted to support vision encoders other than CLIP.
|
||||
:::
|
||||
|
||||
::::
|
||||
|
||||
::::{tab-item} Non-consecutive feature tokens: Fuyu
|
||||
:::{tab-item} No input placeholders: Fuyu
|
||||
:sync: fuyu
|
||||
|
||||
Looking at the code of HF's `FuyuForCausalLM`:
|
||||
@ -383,188 +400,16 @@ num_patches_per_dim_w = image_width // patch_width
|
||||
num_patches = num_patches_per_dim_h * num_patches_per_dim_w
|
||||
```
|
||||
|
||||
We can calculate this in vLLM using this code:
|
||||
|
||||
```python
|
||||
def get_num_image_patches(
|
||||
self,
|
||||
*,
|
||||
image_width: int,
|
||||
image_height: int,
|
||||
) -> int:
|
||||
image_processor = self.get_image_processor()
|
||||
target_width = image_processor.size["width"]
|
||||
target_height = image_processor.size["height"]
|
||||
patch_width = image_processor.patch_size["width"]
|
||||
patch_height = image_processor.patch_size["height"]
|
||||
|
||||
if not (image_width <= target_width and image_height <= target_height):
|
||||
height_scale_factor = target_height / image_height
|
||||
width_scale_factor = target_width / image_width
|
||||
optimal_scale_factor = min(height_scale_factor, width_scale_factor)
|
||||
|
||||
image_height = int(image_height * optimal_scale_factor)
|
||||
image_width = int(image_width * optimal_scale_factor)
|
||||
|
||||
ncols = math.ceil(image_width / patch_width)
|
||||
nrows = math.ceil(image_height / patch_height)
|
||||
return ncols * nrows
|
||||
```
|
||||
|
||||
These image patches correspond to placeholder tokens (`|SPEAKER|`). However, the processor also
|
||||
inserts newline tokens (`|NEWLINE|`) as shown here:
|
||||
|
||||
```python
|
||||
# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L654-L670
|
||||
tensor_of_image_ids = torch.full(
|
||||
[num_patches], image_placeholder_id, dtype=torch.int32, device=image_input.device
|
||||
)
|
||||
patches = self.patchify_image(image=image.unsqueeze(0)).squeeze(0)
|
||||
assert num_patches == patches.shape[0]
|
||||
|
||||
if variable_sized:
|
||||
# Now terminate each line with |NEWLINE|.
|
||||
tensor_of_image_ids = tensor_of_image_ids.reshape(-1, image_width // patch_width)
|
||||
newline_ids = torch.full(
|
||||
[tensor_of_image_ids.shape[0], 1],
|
||||
image_newline_id,
|
||||
dtype=torch.int32,
|
||||
device=image_input.device,
|
||||
)
|
||||
tensor_of_image_ids = torch.cat([tensor_of_image_ids, newline_ids], dim=1)
|
||||
tensor_of_image_ids = tensor_of_image_ids.reshape(-1)
|
||||
```
|
||||
|
||||
So, the layout of tokens for an image is:
|
||||
|
||||
```
|
||||
|SPEAKER||SPEAKER|...|SPEAKER||NEWLINE|
|
||||
|SPEAKER||SPEAKER|...|SPEAKER||NEWLINE|
|
||||
...
|
||||
|SPEAKER||SPEAKER|...|SPEAKER||NEWLINE|
|
||||
```
|
||||
|
||||
This makes the placeholder tokens non-consecutive in the prompt.
|
||||
Since vLLM requires the feature tokens to be consecutive, **we also treat the newline tokens as feature tokens**.
|
||||
|
||||
So overall, the total number of feature tokens is
|
||||
|
||||
```python
|
||||
def get_num_image_tokens(
|
||||
self,
|
||||
*,
|
||||
image_width: int,
|
||||
image_height: int,
|
||||
) -> int:
|
||||
image_processor = self.get_image_processor()
|
||||
target_width = image_processor.size["width"]
|
||||
target_height = image_processor.size["height"]
|
||||
patch_width = image_processor.patch_size["width"]
|
||||
patch_height = image_processor.patch_size["height"]
|
||||
|
||||
if not (image_width <= target_width and image_height <= target_height):
|
||||
height_scale_factor = target_height / image_height
|
||||
width_scale_factor = target_width / image_width
|
||||
optimal_scale_factor = min(height_scale_factor, width_scale_factor)
|
||||
|
||||
image_height = int(image_height * optimal_scale_factor)
|
||||
image_width = int(image_width * optimal_scale_factor)
|
||||
|
||||
ncols = math.ceil(image_width / patch_width)
|
||||
nrows = math.ceil(image_height / patch_height)
|
||||
return (ncols + 1) * nrows
|
||||
```
|
||||
|
||||
To calculate the maximum number of image tokens, recall that input images are first resized
|
||||
to fit within `image_processor.size`. The maximum possible dimensions of the image before
|
||||
being converted into patches is therefore equal to `image_processor.size`.
|
||||
These image patches correspond to placeholder tokens (`|SPEAKER|`). So, we just need to maximize the number of image patches. Since input images are first resized
|
||||
to fit within `image_processor.size`, we can maximize the number of image patches by inputting an image with size equal to `image_processor.size`.
|
||||
|
||||
```python
|
||||
def get_image_size_with_most_features(self) -> ImageSize:
|
||||
image_processor = self.get_image_processor()
|
||||
return ImageSize(width=image_processor.size["width"],
|
||||
height=image_processor.size["height"])
|
||||
|
||||
def get_max_image_tokens(self) -> int:
|
||||
target_width, target_height = self.get_image_size_with_most_features()
|
||||
|
||||
return self.get_num_image_tokens(
|
||||
image_width=target_width,
|
||||
image_height=target_height,
|
||||
)
|
||||
```
|
||||
|
||||
And thus, we can override the method as:
|
||||
|
||||
```python
|
||||
def get_mm_max_tokens_per_item(
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
) -> Mapping[str, int]:
|
||||
return {"image": self.get_max_image_tokens()}
|
||||
```
|
||||
|
||||
:::{note}
|
||||
Our [actual code](gh-file:vllm/model_executor/models/fuyu.py) returns `ncols` and `nrows` directly instead of the total token count.
|
||||
This is because `ncols` and `nrows` are used to specify the layout of the feature tokens (as shown in Step 4 of this guide).
|
||||
:::
|
||||
|
||||
::::
|
||||
:::::
|
||||
|
||||
## 3. Specify dummy inputs
|
||||
|
||||
Then, inherit {class}`~vllm.multimodal.profiling.BaseDummyInputsBuilder` to construct dummy inputs for
|
||||
HF processing as well as memory profiling.
|
||||
|
||||
### For memory profiling
|
||||
|
||||
Override the abstract method {meth}`~vllm.multimodal.profiling.BaseDummyInputsBuilder.get_dummy_processor_inputs`
|
||||
to construct dummy inputs for memory profiling. This dummy input should result in the worst-case memory usage of
|
||||
the model so that vLLM can reserve the correct amount of memory for it.
|
||||
|
||||
Assuming that the memory usage increases with the number of tokens, the dummy input can be constructed based
|
||||
on the code for {meth}`~vllm.multimodal.processing.BaseProcessingInfo.get_mm_max_tokens_per_item`.
|
||||
|
||||
::::{tab-set}
|
||||
:::{tab-item} Basic example: LLaVA
|
||||
:sync: llava
|
||||
|
||||
Making use of the `get_image_size_with_most_features` method implemented in Step 2:
|
||||
|
||||
```python
|
||||
def get_dummy_processor_inputs(
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
) -> ProcessorInputs:
|
||||
num_images = mm_counts.get("image", 0)
|
||||
|
||||
processor = self.info.get_hf_processor()
|
||||
image_token = processor.image_token
|
||||
|
||||
hf_config = self.get_hf_config()
|
||||
target_width, target_height = self.info.get_image_size_with_most_features()
|
||||
|
||||
mm_data = {
|
||||
"image":
|
||||
self._get_dummy_images(width=target_width,
|
||||
height=target_height,
|
||||
num_images=num_images)
|
||||
}
|
||||
|
||||
return ProcessorInputs(
|
||||
prompt_text=image_token * num_images,
|
||||
mm_data=mm_data,
|
||||
)
|
||||
```
|
||||
|
||||
:::
|
||||
|
||||
:::{tab-item} No input placeholders: Fuyu
|
||||
:sync: fuyu
|
||||
|
||||
Fuyu does not expect image placeholders in the inputs to HF processor, so
|
||||
the dummy prompt text is empty regardless of the number of images.
|
||||
Otherwise, the logic of this method is very similar to LLaVA:
|
||||
@ -860,8 +705,8 @@ prompt_tokens, prompts_length = _tokenize_prompts_with_image_and_batch(
|
||||
)
|
||||
```
|
||||
|
||||
To accommodate this, instead of a string you can return an instance of {class}`~vllm.multimodal.processing.PromptUpdateDetails`
|
||||
with different `full` and `feature` attributes:
|
||||
To assign the vision embeddings to only the image tokens, instead of a string
|
||||
you can return an instance of {class}`~vllm.multimodal.processing.PromptUpdateDetails`:
|
||||
|
||||
```python
|
||||
hf_config = self.info.get_hf_config()
|
||||
@ -879,9 +724,9 @@ def get_replacement_fuyu(item_idx: int):
|
||||
image_tokens = ([_IMAGE_TOKEN_ID] * ncols +
|
||||
[_NEWLINE_TOKEN_ID]) * nrows
|
||||
|
||||
return PromptUpdateDetails(
|
||||
full=image_tokens + [bos_token_id],
|
||||
features=image_tokens,
|
||||
return PromptUpdateDetails.select_token_id(
|
||||
image_tokens + [bos_token_id],
|
||||
embed_token_id=_IMAGE_TOKEN_ID,
|
||||
)
|
||||
```
|
||||
|
||||
@ -914,9 +759,9 @@ def _get_prompt_updates(
|
||||
image_tokens = ([_IMAGE_TOKEN_ID] * ncols +
|
||||
[_NEWLINE_TOKEN_ID]) * nrows
|
||||
|
||||
return PromptUpdateDetails(
|
||||
full=image_tokens + [bos_token_id],
|
||||
features=image_tokens,
|
||||
return PromptUpdateDetails.select_token_id(
|
||||
image_tokens + [bos_token_id],
|
||||
embed_token_id=_IMAGE_TOKEN_ID,
|
||||
)
|
||||
|
||||
return [
|
||||
|
||||
@ -18,4 +18,5 @@ int8
|
||||
fp8
|
||||
quark
|
||||
quantized_kvcache
|
||||
torchao
|
||||
:::
|
||||
|
||||
@ -62,7 +62,7 @@ The table below shows the compatibility of various quantization implementations
|
||||
* ❌
|
||||
* ✅︎
|
||||
* ❌
|
||||
* ❌
|
||||
* ✅︎
|
||||
- * FP8 (W8A8)
|
||||
* ❌
|
||||
* ❌
|
||||
|
||||
docs/source/features/quantization/torchao.md (new file, 34 lines)
@@ -0,0 +1,34 @@
# TorchAO

TorchAO is an architecture optimization library for PyTorch, it provides high performance dtypes, optimization techniques and kernels for inference and training, featuring composability with native PyTorch features like torch.compile, FSDP etc.. Some benchmark numbers can be found [here](https://github.com/pytorch/ao/tree/main/torchao/quantization#benchmarks).

We recommend installing the latest torchao nightly with

```console
# Install the latest TorchAO nightly build
# Choose the CUDA version that matches your system (cu126, cu128, etc.)
pip install --pre torchao>=10.0.0 --index-url https://download.pytorch.org/whl/nightly/cu126
```

## Quantizing HuggingFace Models
You can quantize your own huggingface model with torchao, e.g. [transformers](https://huggingface.co/docs/transformers/main/en/quantization/torchao) and [diffusers](https://huggingface.co/docs/diffusers/en/quantization/torchao), and save the checkpoint to huggingface hub like [this](https://huggingface.co/jerryzh168/llama3-8b-int8wo) with the following example code:

```Python
import torch
from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
from torchao.quantization import Int8WeightOnlyConfig

model_name = "meta-llama/Meta-Llama-3-8B"
quantization_config = TorchAoConfig(Int8WeightOnlyConfig())
quantized_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto", quantization_config=quantization_config)
tokenizer = AutoTokenizer.from_pretrained(model_name)
input_text = "What are we having for dinner?"
input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")

hub_repo = # YOUR HUB REPO ID
tokenizer.push_to_hub(hub_repo)
quantized_model.push_to_hub(hub_repo, safe_serialization=False)
```

Alternatively, you can use the TorchAO Quantization space for quantizing models with a simple UI.
See: https://huggingface.co/spaces/medmekk/TorchAO_Quantization
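Beyond the new document above, a hedged sketch of how such a torchao-quantized checkpoint might then be loaded for inference with vLLM's offline API is shown below; the checkpoint name is the example one linked in the document, and whether it loads depends on the torchao support available in your vLLM build:

```python
from vllm import LLM, SamplingParams

# Illustrative only: the example int8 weight-only checkpoint referenced in the doc above.
llm = LLM(model="jerryzh168/llama3-8b-int8wo", dtype="bfloat16")

outputs = llm.generate(
    ["What are we having for dinner?"],
    SamplingParams(temperature=0.0, max_tokens=32),
)
print(outputs[0].outputs[0].text)
```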
@ -9,7 +9,7 @@ shorter Pod startup times and CPU memory usage. Tensor encryption is also suppor
|
||||
|
||||
For more information on CoreWeave's Tensorizer, please refer to
|
||||
[CoreWeave's Tensorizer documentation](https://github.com/coreweave/tensorizer). For more information on serializing a vLLM model, as well a general usage guide to using Tensorizer with vLLM, see
|
||||
the [vLLM example script](https://docs.vllm.ai/en/stable/getting_started/examples/offline_inference/tensorize_vllm_model.html).
|
||||
the [vLLM example script](https://docs.vllm.ai/en/latest/getting_started/examples/tensorize_vllm_model.html).
|
||||
|
||||
:::{note}
|
||||
Note that to use this feature you will need to install `tensorizer` by running `pip install vllm[tensorizer]`.
|
||||
|
||||
@ -160,6 +160,35 @@ If vLLM successfully returns text (for generative models) or hidden states (for
|
||||
Otherwise, please refer to [Adding a New Model](#new-model) for instructions on how to implement your model in vLLM.
|
||||
Alternatively, you can [open an issue on GitHub](https://github.com/vllm-project/vllm/issues/new/choose) to request vLLM support.
|
||||
|
||||
#### Using a proxy
|
||||
|
||||
Here are some tips for loading/downloading models from Hugging Face using a proxy:
|
||||
|
||||
- Set the proxy globally for your session (or set it in the profile file):
|
||||
|
||||
```shell
|
||||
export http_proxy=http://your.proxy.server:port
|
||||
export https_proxy=http://your.proxy.server:port
|
||||
```
|
||||
|
||||
- Set the proxy for just the current command:
|
||||
|
||||
```shell
|
||||
https_proxy=http://your.proxy.server:port huggingface-cli download <model_name>
|
||||
|
||||
# or use vllm cmd directly
|
||||
https_proxy=http://your.proxy.server:port vllm serve <model_name> --disable-log-requests
|
||||
```
|
||||
|
||||
- Set the proxy in Python interpreter:
|
||||
|
||||
```python
|
||||
import os
|
||||
|
||||
os.environ['http_proxy'] = 'http://your.proxy.server:port'
|
||||
os.environ['https_proxy'] = 'http://your.proxy.server:port'
|
||||
```
|
||||
|
||||
### ModelScope
|
||||
|
||||
To use models from [ModelScope](https://www.modelscope.cn) instead of Hugging Face Hub, set an environment variable:
|
||||
@ -303,6 +332,11 @@ See [this page](#generative-models) for more information on how to use generativ
|
||||
* `THUDM/glm-4-9b-chat-hf`, etc.
|
||||
* ✅︎
|
||||
* ✅︎
|
||||
- * `Glm4ForCausalLM`
|
||||
* GLM-4-0414
|
||||
* `THUDM/GLM-4-32B-Chat-0414`, etc.
|
||||
* ✅︎
|
||||
* ✅︎
|
||||
- * `GPT2LMHeadModel`
|
||||
* GPT-2
|
||||
* `gpt2`, `gpt2-xl`, etc.
|
||||
@ -990,6 +1024,13 @@ See [this page](#generative-models) for more information on how to use generativ
|
||||
*
|
||||
* ✅︎
|
||||
* ✅︎
|
||||
- * `SmolVLMForConditionalGeneration`
|
||||
* SmolVLM2
|
||||
* T + I
|
||||
* `SmolVLM2-2.2B-Instruct`
|
||||
*
|
||||
* ✅︎
|
||||
* ✅︎
|
||||
- * `UltravoxModel`
|
||||
* Ultravox
|
||||
* T + A<sup>E+</sup>
|
||||
@ -1006,9 +1047,6 @@ See [this page](#generative-models) for more information on how to use generativ
|
||||
<sup>+</sup> Multiple items can be inputted per text prompt for this modality.
|
||||
|
||||
:::{important}
|
||||
To use Gemma3 series models, you have to install Hugging Face Transformers library from source via
|
||||
`pip install git+https://github.com/huggingface/transformers`.
|
||||
|
||||
Pan-and-scan image pre-processing is currently supported on V0 (but not V1).
|
||||
You can enable it by passing `--mm-processor-kwargs '{"do_pan_and_scan": True}'`.
|
||||
:::
|
||||
|
||||
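For illustration only (not part of the diff), the pan-and-scan option mentioned in the note above could also be enabled from the offline Python API roughly as follows; `mm_processor_kwargs` mirrors the `--mm-processor-kwargs` CLI flag, and the checkpoint name is just a placeholder Gemma3 model:

```python
from vllm import LLM

# Sketch: enable pan-and-scan image pre-processing (supported on V0, per the note above).
llm = LLM(
    model="google/gemma-3-4b-it",  # placeholder Gemma3 checkpoint
    mm_processor_kwargs={"do_pan_and_scan": True},
)
```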
@ -199,13 +199,6 @@ def main(args):
|
||||
engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
|
||||
llm = LLM(**engine_args)
|
||||
|
||||
# To maintain code compatibility in this script, we add LoRA here.
|
||||
# You can also add LoRA using:
|
||||
# llm.generate(prompts, lora_request=lora_request,...)
|
||||
if req_data.lora_requests:
|
||||
for lora_request in req_data.lora_requests:
|
||||
llm.llm_engine.add_lora(lora_request=lora_request)
|
||||
|
||||
# We set temperature to 0.2 so that outputs can be different
|
||||
# even when all prompts are identical when running batch inference.
|
||||
sampling_params = SamplingParams(temperature=0.2,
|
||||
@ -226,8 +219,15 @@ def main(args):
|
||||
if args.num_prompts > 1:
|
||||
# Batch inference
|
||||
inputs = [inputs] * args.num_prompts
|
||||
# Add LoRA request if applicable
|
||||
lora_request = (req_data.lora_requests *
|
||||
args.num_prompts if req_data.lora_requests else None)
|
||||
|
||||
outputs = llm.generate(inputs, sampling_params=sampling_params)
|
||||
outputs = llm.generate(
|
||||
inputs,
|
||||
sampling_params=sampling_params,
|
||||
lora_request=lora_request,
|
||||
)
|
||||
|
||||
for o in outputs:
|
||||
generated_text = o.outputs[0].text
|
||||
|
||||
@ -76,6 +76,7 @@ def main():
|
||||
max_num_seqs=args.max_num_seqs,
|
||||
gpu_memory_utilization=0.8,
|
||||
speculative_config={
|
||||
"method": "eagle",
|
||||
"model": eagle_dir,
|
||||
"num_speculative_tokens": args.num_spec_tokens,
|
||||
"draft_tensor_parallel_size": args.draft_tp,
|
||||
|
||||
50
examples/offline_inference/embed_jina_embeddings_v3.py
Normal file
50
examples/offline_inference/embed_jina_embeddings_v3.py
Normal file
@ -0,0 +1,50 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
from argparse import Namespace
|
||||
|
||||
from vllm import LLM, EngineArgs
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
|
||||
|
||||
def main(args: Namespace):
|
||||
# Sample prompts.
|
||||
prompts = [
|
||||
"Follow the white rabbit.", # English
|
||||
"Sigue al conejo blanco.", # Spanish
|
||||
"Suis le lapin blanc.", # French
|
||||
"跟着白兔走。", # Chinese
|
||||
"اتبع الأرنب الأبيض.", # Arabic
|
||||
"Folge dem weißen Kaninchen.", # German
|
||||
]
|
||||
|
||||
# Create an LLM.
|
||||
# You should pass task="embed" for embedding models
|
||||
model = LLM(**vars(args))
|
||||
|
||||
# Generate embedding. The output is a list of EmbeddingRequestOutputs.
|
||||
# Only text matching task is supported for now. See #16120
|
||||
outputs = model.embed(prompts)
|
||||
|
||||
# Print the outputs.
|
||||
print("\nGenerated Outputs:")
|
||||
print("Only text matching task is supported for now. See #16120")
|
||||
print("-" * 60)
|
||||
for prompt, output in zip(prompts, outputs):
|
||||
embeds = output.outputs.embedding
|
||||
embeds_trimmed = ((str(embeds[:16])[:-1] +
|
||||
", ...]") if len(embeds) > 16 else embeds)
|
||||
print(f"Prompt: {prompt!r} \n"
|
||||
f"Embeddings for text matching: {embeds_trimmed} "
|
||||
f"(size={len(embeds)})")
|
||||
print("-" * 60)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = FlexibleArgumentParser()
|
||||
parser = EngineArgs.add_cli_args(parser)
|
||||
# Set example specific arguments
|
||||
parser.set_defaults(model="jinaai/jina-embeddings-v3",
|
||||
task="embed",
|
||||
trust_remote_code=True)
|
||||
args = parser.parse_args()
|
||||
main(args)
|
||||
@ -56,7 +56,7 @@ def run_florence2():
|
||||
def run_mllama():
|
||||
engine_args = EngineArgs(
|
||||
model="meta-llama/Llama-3.2-11B-Vision-Instruct",
|
||||
max_model_len=4096,
|
||||
max_model_len=8192,
|
||||
max_num_seqs=2,
|
||||
limit_mm_per_prompt={"image": 1},
|
||||
dtype="half",
|
||||
|
||||
@ -90,8 +90,9 @@ def run_simple_demo(args: argparse.Namespace):
|
||||
},
|
||||
]
|
||||
outputs = llm.chat(messages, sampling_params=sampling_params)
|
||||
|
||||
print("-" * 50)
|
||||
print(outputs[0].outputs[0].text)
|
||||
print("-" * 50)
|
||||
|
||||
|
||||
def run_advanced_demo(args: argparse.Namespace):
|
||||
@ -162,7 +163,9 @@ def run_advanced_demo(args: argparse.Namespace):
|
||||
]
|
||||
|
||||
outputs = llm.chat(messages=messages, sampling_params=sampling_params)
|
||||
print("-" * 50)
|
||||
print(outputs[0].outputs[0].text)
|
||||
print("-" * 50)
|
||||
|
||||
|
||||
def main():
|
||||
|
||||
@ -61,6 +61,7 @@ def process_requests(engine: LLMEngine,
|
||||
"""Continuously process a list of prompts and handle the outputs."""
|
||||
request_id = 0
|
||||
|
||||
print("-" * 50)
|
||||
while test_prompts or engine.has_unfinished_requests():
|
||||
if test_prompts:
|
||||
prompt, sampling_params, lora_request = test_prompts.pop(0)
|
||||
@ -75,6 +76,7 @@ def process_requests(engine: LLMEngine,
|
||||
for request_output in request_outputs:
|
||||
if request_output.finished:
|
||||
print(request_output)
|
||||
print("-" * 50)
|
||||
|
||||
|
||||
def initialize_engine() -> LLMEngine:
|
||||
|
||||
@ -12,27 +12,36 @@ prompts = [
|
||||
# Create a sampling params object.
|
||||
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
|
||||
|
||||
# Create an LLM.
|
||||
llm = LLM(
|
||||
model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
|
||||
max_num_seqs=8,
|
||||
# The max_model_len and block_size arguments are required to be same as
|
||||
# max sequence length when targeting neuron device.
|
||||
# Currently, this is a known limitation in continuous batching support
|
||||
# in transformers-neuronx.
|
||||
# TODO(liangfu): Support paged-attention in transformers-neuronx.
|
||||
max_model_len=1024,
|
||||
block_size=1024,
|
||||
# The device can be automatically detected when AWS Neuron SDK is installed.
|
||||
# The device argument can be either unspecified for automated detection,
|
||||
# or explicitly assigned.
|
||||
device="neuron",
|
||||
tensor_parallel_size=2)
|
||||
# Generate texts from the prompts. The output is a list of RequestOutput objects
|
||||
# that contain the prompt, generated text, and other information.
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
# Print the outputs.
|
||||
for output in outputs:
|
||||
prompt = output.prompt
|
||||
generated_text = output.outputs[0].text
|
||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||
|
||||
def main():
|
||||
# Create an LLM.
|
||||
llm = LLM(
|
||||
model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
|
||||
max_num_seqs=8,
|
||||
# The max_model_len and block_size arguments are required to be same as
|
||||
# max sequence length when targeting neuron device.
|
||||
# Currently, this is a known limitation in continuous batching support
|
||||
# in transformers-neuronx.
|
||||
# TODO(liangfu): Support paged-attention in transformers-neuronx.
|
||||
max_model_len=1024,
|
||||
block_size=1024,
|
||||
# ruff: noqa: E501
|
||||
# The device can be automatically detected when AWS Neuron SDK is installed.
|
||||
# The device argument can be either unspecified for automated detection,
|
||||
# or explicitly assigned.
|
||||
device="neuron",
|
||||
tensor_parallel_size=2)
|
||||
# Generate texts from the prompts. The output is a list of RequestOutput objects
|
||||
# that contain the prompt, generated text, and other information.
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
# Print the outputs.
|
||||
print("-" * 50)
|
||||
for output in outputs:
|
||||
prompt = output.prompt
|
||||
generated_text = output.outputs[0].text
|
||||
print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
|
||||
print("-" * 50)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@ -22,31 +22,40 @@ prompts = [
|
||||
# Create a sampling params object.
|
||||
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
|
||||
|
||||
# Create an LLM.
|
||||
llm = LLM(
|
||||
model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
|
||||
max_num_seqs=8,
|
||||
# The max_model_len and block_size arguments are required to be same as
|
||||
# max sequence length when targeting neuron device.
|
||||
# Currently, this is a known limitation in continuous batching support
|
||||
# in transformers-neuronx.
|
||||
# TODO(liangfu): Support paged-attention in transformers-neuronx.
|
||||
max_model_len=2048,
|
||||
block_size=2048,
|
||||
# The device can be automatically detected when AWS Neuron SDK is installed.
|
||||
# The device argument can be either unspecified for automated detection,
|
||||
# or explicitly assigned.
|
||||
device="neuron",
|
||||
quantization="neuron_quant",
|
||||
override_neuron_config={
|
||||
"cast_logits_dtype": "bfloat16",
|
||||
},
|
||||
tensor_parallel_size=2)
|
||||
# Generate texts from the prompts. The output is a list of RequestOutput objects
|
||||
# that contain the prompt, generated text, and other information.
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
# Print the outputs.
|
||||
for output in outputs:
|
||||
prompt = output.prompt
|
||||
generated_text = output.outputs[0].text
|
||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||
|
||||
def main():
|
||||
# Create an LLM.
|
||||
llm = LLM(
|
||||
model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
|
||||
max_num_seqs=8,
|
||||
# The max_model_len and block_size arguments are required to be same as
|
||||
# max sequence length when targeting neuron device.
|
||||
# Currently, this is a known limitation in continuous batching support
|
||||
# in transformers-neuronx.
|
||||
# TODO(liangfu): Support paged-attention in transformers-neuronx.
|
||||
max_model_len=2048,
|
||||
block_size=2048,
|
||||
# ruff: noqa: E501
|
||||
# The device can be automatically detected when AWS Neuron SDK is installed.
|
||||
# The device argument can be either unspecified for automated detection,
|
||||
# or explicitly assigned.
|
||||
device="neuron",
|
||||
quantization="neuron_quant",
|
||||
override_neuron_config={
|
||||
"cast_logits_dtype": "bfloat16",
|
||||
},
|
||||
tensor_parallel_size=2)
|
||||
# Generate texts from the prompts. The output is a list of RequestOutput objects
|
||||
# that contain the prompt, generated text, and other information.
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
# Print the outputs.
|
||||
print("-" * 50)
|
||||
for output in outputs:
|
||||
prompt = output.prompt
|
||||
generated_text = output.outputs[0].text
|
||||
print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
|
||||
print("-" * 50)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@ -31,55 +31,62 @@ generating_prompts = [prefix + prompt for prompt in prompts]
|
||||
# Create a sampling params object.
|
||||
sampling_params = SamplingParams(temperature=0.0)
|
||||
|
||||
# Create an LLM without prefix caching as a baseline.
|
||||
regular_llm = LLM(model="facebook/opt-125m", gpu_memory_utilization=0.4)
|
||||
|
||||
print("Results without `enable_prefix_caching`")
|
||||
def main():
|
||||
# Create an LLM without prefix caching as a baseline.
|
||||
regular_llm = LLM(model="facebook/opt-125m", gpu_memory_utilization=0.4)
|
||||
|
||||
# Generate texts from the prompts. The output is a list of RequestOutput objects
|
||||
# that contain the prompt, generated text, and other information.
|
||||
outputs = regular_llm.generate(generating_prompts, sampling_params)
|
||||
print("Results without `enable_prefix_caching`")
|
||||
|
||||
regular_generated_texts = []
|
||||
# Print the outputs.
|
||||
for output in outputs:
|
||||
prompt = output.prompt
|
||||
generated_text = output.outputs[0].text
|
||||
regular_generated_texts.append(generated_text)
|
||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||
# ruff: noqa: E501
|
||||
# Generate texts from the prompts. The output is a list of RequestOutput objects
|
||||
# that contain the prompt, generated text, and other information.
|
||||
outputs = regular_llm.generate(generating_prompts, sampling_params)
|
||||
|
||||
print("-" * 80)
|
||||
regular_generated_texts = []
|
||||
# Print the outputs.
|
||||
print("-" * 50)
|
||||
for output in outputs:
|
||||
prompt = output.prompt
|
||||
generated_text = output.outputs[0].text
|
||||
regular_generated_texts.append(generated_text)
|
||||
print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
|
||||
print("-" * 50)
|
||||
|
||||
# Destroy the LLM object and free up the GPU memory.
|
||||
del regular_llm
|
||||
cleanup_dist_env_and_memory()
|
||||
# Destroy the LLM object and free up the GPU memory.
|
||||
del regular_llm
|
||||
cleanup_dist_env_and_memory()
|
||||
|
||||
# Create an LLM with prefix caching enabled.
|
||||
prefix_cached_llm = LLM(model="facebook/opt-125m",
|
||||
enable_prefix_caching=True,
|
||||
gpu_memory_utilization=0.4)
|
||||
# Create an LLM with prefix caching enabled.
|
||||
prefix_cached_llm = LLM(model="facebook/opt-125m",
|
||||
enable_prefix_caching=True,
|
||||
gpu_memory_utilization=0.4)
|
||||
|
||||
# Warmup so that the shared prompt's KV cache is computed.
|
||||
prefix_cached_llm.generate(generating_prompts[0], sampling_params)
|
||||
# Warmup so that the shared prompt's KV cache is computed.
|
||||
prefix_cached_llm.generate(generating_prompts[0], sampling_params)
|
||||
|
||||
# Generate with prefix caching.
|
||||
outputs = prefix_cached_llm.generate(generating_prompts, sampling_params)
|
||||
# Generate with prefix caching.
|
||||
outputs = prefix_cached_llm.generate(generating_prompts, sampling_params)
|
||||
|
||||
print("Results with `enable_prefix_caching`")
|
||||
print("Results with `enable_prefix_caching`")
|
||||
|
||||
cached_generated_texts = []
|
||||
# Print the outputs. You should see the same outputs as before.
|
||||
for output in outputs:
|
||||
prompt = output.prompt
|
||||
generated_text = output.outputs[0].text
|
||||
cached_generated_texts.append(generated_text)
|
||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||
cached_generated_texts = []
|
||||
# Print the outputs. You should see the same outputs as before.
|
||||
print("-" * 50)
|
||||
for output in outputs:
|
||||
prompt = output.prompt
|
||||
generated_text = output.outputs[0].text
|
||||
cached_generated_texts.append(generated_text)
|
||||
print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
|
||||
print("-" * 50)
|
||||
|
||||
print("-" * 80)
|
||||
# Compare the results and display the speedup
|
||||
generated_same = all([
|
||||
regular_generated_texts[i] == cached_generated_texts[i]
|
||||
for i in range(len(prompts))
|
||||
])
|
||||
print(f"Generated answers are the same: {generated_same}")
|
||||
|
||||
# Compare the results and display the speedup
|
||||
generated_same = all([
|
||||
regular_generated_texts[i] == cached_generated_texts[i]
|
||||
for i in range(len(prompts))
|
||||
])
|
||||
print(f"Generated answers are the same: {generated_same}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@ -12,7 +12,7 @@ from typing import Any, Optional, TypeAlias
|
||||
import torch
|
||||
import tqdm
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm import LLM, SamplingParams, envs
|
||||
from vllm.engine.arg_utils import EngineArgs
|
||||
from vllm.profiler import layerwise_profile
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
@ -234,9 +234,8 @@ def run_profile(context: ProfileContext, csv_output: Optional[str],
|
||||
sampling_params.max_tokens = next(output_len_generator)
|
||||
assert isinstance(sampling_params.max_tokens, int)
|
||||
|
||||
prompt_token_ids = torch.randint(
|
||||
llm.llm_engine.model_config.get_vocab_size(),
|
||||
size=(prompt_len, )).tolist()
|
||||
prompt_token_ids = torch.randint(llm.get_tokenizer().vocab_size,
|
||||
size=(prompt_len, )).tolist()
|
||||
|
||||
llm.llm_engine.add_request(
|
||||
request_id=f"seq{i}",
|
||||
@ -262,8 +261,13 @@ def run_profile(context: ProfileContext, csv_output: Optional[str],
|
||||
|
||||
decode_profs = []
|
||||
for _ in tqdm.tqdm(range(num_steps_to_profile - 1)):
|
||||
num_running_seqs = llm.llm_engine.scheduler[
|
||||
0].get_num_unfinished_seq_groups()
|
||||
if envs.VLLM_USE_V1:
|
||||
num_running_seqs = llm.llm_engine.scheduler[
|
||||
0].get_num_unfinished_requests()
|
||||
else:
|
||||
num_running_seqs = llm.llm_engine.scheduler[
|
||||
0].get_num_unfinished_seq_groups()
|
||||
|
||||
with layerwise_profile(
|
||||
num_running_seqs=num_running_seqs) as decode_prof:
|
||||
llm.llm_engine.step()
|
||||
|
||||
@ -19,8 +19,6 @@ SEED = 42
|
||||
# because it is almost impossible to make the scheduling deterministic in the
|
||||
# online serving setting.
|
||||
|
||||
llm = LLM(model="facebook/opt-125m", seed=SEED)
|
||||
|
||||
prompts = [
|
||||
"Hello, my name is",
|
||||
"The president of the United States is",
|
||||
@ -29,8 +27,17 @@ prompts = [
|
||||
]
|
||||
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
|
||||
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
for output in outputs:
|
||||
prompt = output.prompt
|
||||
generated_text = output.outputs[0].text
|
||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||
|
||||
def main():
|
||||
llm = LLM(model="facebook/opt-125m", seed=SEED)
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
print("-" * 50)
|
||||
for output in outputs:
|
||||
prompt = output.prompt
|
||||
generated_text = output.outputs[0].text
|
||||
print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
|
||||
print("-" * 50)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@ -85,11 +85,13 @@ sampling_params = SamplingParams(temperature=0)
|
||||
|
||||
outputs = ray.get(llm.generate.remote(prompts, sampling_params))
|
||||
|
||||
print("-" * 50)
|
||||
for output in outputs:
|
||||
prompt = output.prompt
|
||||
generated_text = output.outputs[0].text
|
||||
print(f"Prompt: {prompt!r}, "
|
||||
print(f"Prompt: {prompt!r}\n"
|
||||
f"Generated text: {generated_text!r}")
|
||||
print("-" * 50)
|
||||
|
||||
# set up the communication between the training process
|
||||
# and the inference engine.
|
||||
@ -120,8 +122,10 @@ assert all(ray.get(llm.collective_rpc.remote("check_weights_changed")))
|
||||
# use the updated model to generate texts, they will be nonsense
|
||||
# because the weights are all zeros.
|
||||
outputs_updated = ray.get(llm.generate.remote(prompts, sampling_params))
|
||||
print("-" * 50)
|
||||
for output in outputs_updated:
|
||||
prompt = output.prompt
|
||||
generated_text = output.outputs[0].text
|
||||
print(f"Prompt: {prompt!r}, "
|
||||
print(f"Prompt: {prompt!r}\n"
|
||||
f"Generated text: {generated_text!r}")
|
||||
print("-" * 50)
|
||||
|
||||
@ -32,10 +32,12 @@ if __name__ == "__main__":
|
||||
llm.stop_profile()
|
||||
|
||||
# Print the outputs.
|
||||
print("-" * 50)
|
||||
for output in outputs:
|
||||
prompt = output.prompt
|
||||
generated_text = output.outputs[0].text
|
||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||
print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
|
||||
print("-" * 50)
|
||||
|
||||
# Add a buffer to wait for profiler in the background process
|
||||
# (in case MP is on) to finish writing profiling output.
|
||||
|
||||
@ -1,4 +1,11 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
"""
|
||||
This file demonstrates the example usage of guided decoding
|
||||
to generate structured outputs using vLLM. It shows how to apply
|
||||
different guided decoding techniques such as Choice, Regex, JSON schema,
|
||||
and Grammar to produce structured and formatted results
|
||||
based on specific prompts.
|
||||
"""
|
||||
|
||||
from enum import Enum
|
||||
|
||||
@ -7,26 +14,21 @@ from pydantic import BaseModel
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.sampling_params import GuidedDecodingParams
|
||||
|
||||
llm = LLM(model="Qwen/Qwen2.5-3B-Instruct", max_model_len=100)
|
||||
|
||||
# Guided decoding by Choice (list of possible options)
|
||||
guided_decoding_params = GuidedDecodingParams(choice=["Positive", "Negative"])
|
||||
sampling_params = SamplingParams(guided_decoding=guided_decoding_params)
|
||||
outputs = llm.generate(
|
||||
prompts="Classify this sentiment: vLLM is wonderful!",
|
||||
sampling_params=sampling_params,
|
||||
)
|
||||
print(outputs[0].outputs[0].text)
|
||||
guided_decoding_params_choice = GuidedDecodingParams(
|
||||
choice=["Positive", "Negative"])
|
||||
sampling_params_choice = SamplingParams(
|
||||
guided_decoding=guided_decoding_params_choice)
|
||||
prompt_choice = "Classify this sentiment: vLLM is wonderful!"
|
||||
|
||||
# Guided decoding by Regex
|
||||
guided_decoding_params = GuidedDecodingParams(regex="\w+@\w+\.com\n")
|
||||
sampling_params = SamplingParams(guided_decoding=guided_decoding_params,
|
||||
stop=["\n"])
|
||||
prompt = ("Generate an email address for Alan Turing, who works in Enigma."
|
||||
"End in .com and new line. Example result:"
|
||||
"alan.turing@enigma.com\n")
|
||||
outputs = llm.generate(prompts=prompt, sampling_params=sampling_params)
|
||||
print(outputs[0].outputs[0].text)
|
||||
guided_decoding_params_regex = GuidedDecodingParams(regex=r"\w+@\w+\.com\n")
|
||||
sampling_params_regex = SamplingParams(
|
||||
guided_decoding=guided_decoding_params_regex, stop=["\n"])
|
||||
prompt_regex = (
|
||||
"Generate an email address for Alan Turing, who works in Enigma."
|
||||
"End in .com and new line. Example result:"
|
||||
"alan.turing@enigma.com\n")
|
||||
|
||||
|
||||
# Guided decoding by JSON using Pydantic schema
|
||||
@ -44,37 +46,54 @@ class CarDescription(BaseModel):
|
||||
|
||||
|
||||
json_schema = CarDescription.model_json_schema()
|
||||
|
||||
guided_decoding_params = GuidedDecodingParams(json=json_schema)
|
||||
sampling_params = SamplingParams(guided_decoding=guided_decoding_params)
|
||||
prompt = ("Generate a JSON with the brand, model and car_type of"
|
||||
"the most iconic car from the 90's")
|
||||
outputs = llm.generate(
|
||||
prompts=prompt,
|
||||
sampling_params=sampling_params,
|
||||
)
|
||||
print(outputs[0].outputs[0].text)
|
||||
guided_decoding_params_json = GuidedDecodingParams(json=json_schema)
|
||||
sampling_params_json = SamplingParams(
|
||||
guided_decoding=guided_decoding_params_json)
|
||||
prompt_json = ("Generate a JSON with the brand, model and car_type of"
|
||||
"the most iconic car from the 90's")
|
||||
|
||||
# Guided decoding by Grammar
|
||||
simplified_sql_grammar = """
|
||||
?start: select_statement
|
||||
|
||||
?select_statement: "SELECT " column_list " FROM " table_name
|
||||
|
||||
?column_list: column_name ("," column_name)*
|
||||
|
||||
?table_name: identifier
|
||||
|
||||
?column_name: identifier
|
||||
|
||||
?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/
|
||||
root ::= select_statement
|
||||
select_statement ::= "SELECT " column " from " table " where " condition
|
||||
column ::= "col_1 " | "col_2 "
|
||||
table ::= "table_1 " | "table_2 "
|
||||
condition ::= column "= " number
|
||||
number ::= "1 " | "2 "
|
||||
"""
|
||||
guided_decoding_params = GuidedDecodingParams(grammar=simplified_sql_grammar)
|
||||
sampling_params = SamplingParams(guided_decoding=guided_decoding_params)
|
||||
prompt = ("Generate an SQL query to show the 'username' and 'email'"
|
||||
"from the 'users' table.")
|
||||
outputs = llm.generate(
|
||||
prompts=prompt,
|
||||
sampling_params=sampling_params,
|
||||
)
|
||||
print(outputs[0].outputs[0].text)
|
||||
guided_decoding_params_grammar = GuidedDecodingParams(
|
||||
grammar=simplified_sql_grammar)
|
||||
sampling_params_grammar = SamplingParams(
|
||||
guided_decoding=guided_decoding_params_grammar)
|
||||
prompt_grammar = ("Generate an SQL query to show the 'username' and 'email'"
|
||||
"from the 'users' table.")
|
||||
|
||||
|
||||
def format_output(title: str, output: str):
|
||||
print(f"{'-' * 50}\n{title}: {output}\n{'-' * 50}")
|
||||
|
||||
|
||||
def generate_output(prompt: str, sampling_params: SamplingParams, llm: LLM):
|
||||
outputs = llm.generate(prompts=prompt, sampling_params=sampling_params)
|
||||
return outputs[0].outputs[0].text
|
||||
|
||||
|
||||
def main():
|
||||
llm = LLM(model="Qwen/Qwen2.5-3B-Instruct", max_model_len=100)
|
||||
|
||||
choice_output = generate_output(prompt_choice, sampling_params_choice, llm)
|
||||
format_output("Guided decoding by Choice", choice_output)
|
||||
|
||||
regex_output = generate_output(prompt_regex, sampling_params_regex, llm)
|
||||
format_output("Guided decoding by Regex", regex_output)
|
||||
|
||||
json_output = generate_output(prompt_json, sampling_params_json, llm)
|
||||
format_output("Guided decoding by JSON", json_output)
|
||||
|
||||
grammar_output = generate_output(prompt_grammar, sampling_params_grammar,
|
||||
llm)
|
||||
format_output("Guided decoding by Grammar", grammar_output)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@ -36,11 +36,13 @@ llm = LLM(
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
|
||||
# all ranks will have the same outputs
|
||||
print("-" * 50)
|
||||
for output in outputs:
|
||||
prompt = output.prompt
|
||||
generated_text = output.outputs[0].text
|
||||
print(f"Prompt: {prompt!r}, "
|
||||
print(f"Prompt: {prompt!r}\n"
|
||||
f"Generated text: {generated_text!r}")
|
||||
print("-" * 50)
|
||||
"""
|
||||
Further tips:
|
||||
|
||||
|
||||
@ -16,14 +16,22 @@ N = 1
|
||||
# Currently, top-p sampling is disabled. `top_p` should be 1.0.
|
||||
sampling_params = SamplingParams(temperature=0, top_p=1.0, n=N, max_tokens=16)
|
||||
|
||||
# Set `enforce_eager=True` to avoid ahead-of-time compilation.
|
||||
# In real workloads, `enforce_eager` should be `False`.
|
||||
llm = LLM(model="Qwen/Qwen2-1.5B-Instruct",
|
||||
max_num_batched_tokens=64,
|
||||
max_num_seqs=4)
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
for output, answer in zip(outputs, answers):
|
||||
prompt = output.prompt
|
||||
generated_text = output.outputs[0].text
|
||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||
assert generated_text.startswith(answer)
|
||||
|
||||
def main():
|
||||
# Set `enforce_eager=True` to avoid ahead-of-time compilation.
|
||||
# In real workloads, `enforce_eager` should be `False`.
|
||||
llm = LLM(model="Qwen/Qwen2-1.5B-Instruct",
|
||||
max_num_batched_tokens=64,
|
||||
max_num_seqs=4)
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
print("-" * 50)
|
||||
for output, answer in zip(outputs, answers):
|
||||
prompt = output.prompt
|
||||
generated_text = output.outputs[0].text
|
||||
print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
|
||||
assert generated_text.startswith(answer)
|
||||
print("-" * 50)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@ -8,6 +8,7 @@ on HuggingFace model repository.
|
||||
"""
|
||||
import os
|
||||
import random
|
||||
from contextlib import contextmanager
|
||||
from dataclasses import asdict
|
||||
from typing import NamedTuple, Optional
|
||||
|
||||
@ -298,6 +299,34 @@ def run_idefics3(questions: list[str], modality: str) -> ModelRequestData:
|
||||
)
|
||||
|
||||
|
||||
# SmolVLM2-2.2B-Instruct
|
||||
def run_smolvlm(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
model_name = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
|
||||
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
max_model_len=8192,
|
||||
max_num_seqs=2,
|
||||
enforce_eager=True,
|
||||
mm_processor_kwargs={
|
||||
"max_image_size": {
|
||||
"longest_edge": 384
|
||||
},
|
||||
},
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
|
||||
)
|
||||
prompts = [
|
||||
(f"<|im_start|>User:<image>{question}<end_of_utterance>\nAssistant:")
|
||||
for question in questions
|
||||
]
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
)
|
||||
|
||||
|
||||
# InternVL
|
||||
def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
@ -556,7 +585,7 @@ def run_mllama(questions: list[str], modality: str) -> ModelRequestData:
|
||||
# The configuration below has been confirmed to launch on a single L40 GPU.
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
max_model_len=4096,
|
||||
max_model_len=8192,
|
||||
max_num_seqs=2,
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
|
||||
)
|
||||
@ -955,6 +984,7 @@ model_example_map = {
|
||||
"qwen2_vl": run_qwen2_vl,
|
||||
"qwen2_5_vl": run_qwen2_5_vl,
|
||||
"skywork_chat": run_skyworkr1v,
|
||||
"smolvlm": run_smolvlm,
|
||||
}
|
||||
|
||||
|
||||
@ -1026,6 +1056,20 @@ def apply_image_repeat(image_repeat_prob, num_prompts, data,
|
||||
return inputs
|
||||
|
||||
|
||||
@contextmanager
|
||||
def time_counter(enable: bool):
|
||||
if enable:
|
||||
import time
|
||||
start_time = time.time()
|
||||
yield
|
||||
elapsed_time = time.time() - start_time
|
||||
print("-" * 50)
|
||||
print("-- generate time = {}".format(elapsed_time))
|
||||
print("-" * 50)
|
||||
else:
|
||||
yield
|
||||
|
||||
|
||||
def main(args):
|
||||
model = args.model_type
|
||||
if model not in model_example_map:
|
||||
@ -1084,19 +1128,22 @@ def main(args):
|
||||
},
|
||||
} for i in range(args.num_prompts)]
|
||||
|
||||
if args.time_generate:
|
||||
import time
|
||||
start_time = time.time()
|
||||
outputs = llm.generate(inputs, sampling_params=sampling_params)
|
||||
elapsed_time = time.time() - start_time
|
||||
print("-- generate time = {}".format(elapsed_time))
|
||||
# Add LoRA request if applicable
|
||||
lora_request = (req_data.lora_requests *
|
||||
args.num_prompts if req_data.lora_requests else None)
|
||||
|
||||
else:
|
||||
outputs = llm.generate(inputs, sampling_params=sampling_params)
|
||||
with time_counter(args.time_generate):
|
||||
outputs = llm.generate(
|
||||
inputs,
|
||||
sampling_params=sampling_params,
|
||||
lora_request=lora_request,
|
||||
)
|
||||
|
||||
print("-" * 50)
|
||||
for o in outputs:
|
||||
generated_text = o.outputs[0].text
|
||||
print(generated_text)
|
||||
print("-" * 50)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@ -143,8 +143,10 @@ def run_encode(model: str, modality: QueryModality, seed: Optional[int]):
|
||||
"multi_modal_data": mm_data,
|
||||
})
|
||||
|
||||
print("-" * 50)
|
||||
for output in outputs:
|
||||
print(output.outputs.embedding)
|
||||
print("-" * 50)
|
||||
|
||||
|
||||
def main(args: Namespace):
|
||||
|
||||
@ -22,6 +22,16 @@ QUESTION = "What is the content of each image?"
|
||||
IMAGE_URLS = [
|
||||
"https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg",
|
||||
"https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg",
|
||||
"https://upload.wikimedia.org/wikipedia/commons/2/26/Ultramarine_Flycatcher_%28Ficedula_superciliaris%29_Naggar%2C_Himachal_Pradesh%2C_2013_%28cropped%29.JPG",
|
||||
"https://upload.wikimedia.org/wikipedia/commons/thumb/e/e5/Anim1754_-_Flickr_-_NOAA_Photo_Library_%281%29.jpg/2560px-Anim1754_-_Flickr_-_NOAA_Photo_Library_%281%29.jpg",
|
||||
"https://upload.wikimedia.org/wikipedia/commons/d/d4/Starfish%2C_Caswell_Bay_-_geograph.org.uk_-_409413.jpg",
|
||||
"https://upload.wikimedia.org/wikipedia/commons/6/69/Grapevinesnail_01.jpg",
|
||||
"https://upload.wikimedia.org/wikipedia/commons/thumb/0/0b/Texas_invasive_Musk_Thistle_1.jpg/1920px-Texas_invasive_Musk_Thistle_1.jpg",
|
||||
"https://upload.wikimedia.org/wikipedia/commons/thumb/7/7a/Huskiesatrest.jpg/2880px-Huskiesatrest.jpg",
|
||||
"https://upload.wikimedia.org/wikipedia/commons/thumb/6/68/Orange_tabby_cat_sitting_on_fallen_leaves-Hisashi-01A.jpg/1920px-Orange_tabby_cat_sitting_on_fallen_leaves-Hisashi-01A.jpg",
|
||||
"https://upload.wikimedia.org/wikipedia/commons/3/30/George_the_amazing_guinea_pig.jpg",
|
||||
"https://upload.wikimedia.org/wikipedia/commons/thumb/1/1f/Oryctolagus_cuniculus_Rcdo.jpg/1920px-Oryctolagus_cuniculus_Rcdo.jpg",
|
||||
"https://upload.wikimedia.org/wikipedia/commons/9/98/Horse-and-pony.jpg",
|
||||
]
|
||||
|
||||
|
||||
@ -217,6 +227,33 @@ def load_idefics3(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||
)
|
||||
|
||||
|
||||
def load_smolvlm(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||
model_name = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
|
||||
|
||||
# The configuration below has been confirmed to launch on a single L40 GPU.
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
max_model_len=8192,
|
||||
max_num_seqs=16,
|
||||
enforce_eager=True,
|
||||
limit_mm_per_prompt={"image": len(image_urls)},
|
||||
mm_processor_kwargs={
|
||||
"max_image_size": {
|
||||
"longest_edge": 384
|
||||
},
|
||||
},
|
||||
)
|
||||
|
||||
placeholders = "\n".join(f"Image-{i}: <image>\n"
|
||||
for i, _ in enumerate(image_urls, start=1))
|
||||
prompt = f"<|im_start|>User:{placeholders}\n{question}<end_of_utterance>\nAssistant:" # noqa: E501
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompt=prompt,
|
||||
image_data=[fetch_image(url) for url in image_urls],
|
||||
)
|
||||
|
||||
|
||||
def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||
model_name = "OpenGVLab/InternVL2-2B"
|
||||
|
||||
@ -258,8 +295,7 @@ def load_llama4(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
max_model_len=8192,
|
||||
max_num_seqs=4,
|
||||
max_model_len=131072,
|
||||
tensor_parallel_size=8,
|
||||
limit_mm_per_prompt={"image": len(image_urls)},
|
||||
)
|
||||
@ -318,8 +354,8 @@ def load_mllama(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||
# The configuration below has been confirmed to launch on a single L40 GPU.
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
max_model_len=4096,
|
||||
max_num_seqs=16,
|
||||
max_model_len=8192,
|
||||
max_num_seqs=2,
|
||||
limit_mm_per_prompt={"image": len(image_urls)},
|
||||
)
|
||||
|
||||
@ -614,6 +650,7 @@ model_example_map = {
|
||||
"qwen_vl_chat": load_qwen_vl_chat,
|
||||
"qwen2_vl": load_qwen2_vl,
|
||||
"qwen2_5_vl": load_qwen2_5_vl,
|
||||
"smolvlm": load_smolvlm,
|
||||
}
|
||||
|
||||
|
||||
@ -624,15 +661,8 @@ def run_generate(model, question: str, image_urls: list[str],
|
||||
engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
|
||||
llm = LLM(**engine_args)
|
||||
|
||||
# To maintain code compatibility in this script, we add LoRA here.
|
||||
# You can also add LoRA using:
|
||||
# llm.generate(prompts, lora_request=lora_request,...)
|
||||
if req_data.lora_requests:
|
||||
for lora_request in req_data.lora_requests:
|
||||
llm.llm_engine.add_lora(lora_request=lora_request)
|
||||
|
||||
sampling_params = SamplingParams(temperature=0.0,
|
||||
max_tokens=128,
|
||||
max_tokens=256,
|
||||
stop_token_ids=req_data.stop_token_ids)
|
||||
|
||||
outputs = llm.generate(
|
||||
@ -642,11 +672,15 @@ def run_generate(model, question: str, image_urls: list[str],
|
||||
"image": req_data.image_data
|
||||
},
|
||||
},
|
||||
sampling_params=sampling_params)
|
||||
sampling_params=sampling_params,
|
||||
lora_request=req_data.lora_requests,
|
||||
)
|
||||
|
||||
print("-" * 50)
|
||||
for o in outputs:
|
||||
generated_text = o.outputs[0].text
|
||||
print(generated_text)
|
||||
print("-" * 50)
|
||||
|
||||
|
||||
def run_chat(model: str, question: str, image_urls: list[str],
|
||||
@ -664,7 +698,7 @@ def run_chat(model: str, question: str, image_urls: list[str],
|
||||
llm.llm_engine.add_lora(lora_request=lora_request)
|
||||
|
||||
sampling_params = SamplingParams(temperature=0.0,
|
||||
max_tokens=128,
|
||||
max_tokens=256,
|
||||
stop_token_ids=req_data.stop_token_ids)
|
||||
outputs = llm.chat(
|
||||
[{
|
||||
@ -685,11 +719,14 @@ def run_chat(model: str, question: str, image_urls: list[str],
|
||||
}],
|
||||
sampling_params=sampling_params,
|
||||
chat_template=req_data.chat_template,
|
||||
lora_request=req_data.lora_requests,
|
||||
)
|
||||
|
||||
print("-" * 50)
|
||||
for o in outputs:
|
||||
generated_text = o.outputs[0].text
|
||||
print(generated_text)
|
||||
print("-" * 50)
|
||||
|
||||
|
||||
def main(args: Namespace):
|
||||
@ -697,10 +734,12 @@ def main(args: Namespace):
|
||||
method = args.method
|
||||
seed = args.seed
|
||||
|
||||
image_urls = IMAGE_URLS[:args.num_images]
|
||||
|
||||
if method == "generate":
|
||||
run_generate(model, QUESTION, IMAGE_URLS, seed)
|
||||
run_generate(model, QUESTION, image_urls, seed)
|
||||
elif method == "chat":
|
||||
run_chat(model, QUESTION, IMAGE_URLS, seed)
|
||||
run_chat(model, QUESTION, image_urls, seed)
|
||||
else:
|
||||
raise ValueError(f"Invalid method: {method}")
|
||||
|
||||
@ -725,6 +764,12 @@ if __name__ == "__main__":
|
||||
type=int,
|
||||
default=None,
|
||||
help="Set the seed when initializing `vllm.LLM`.")
|
||||
parser.add_argument(
|
||||
"--num-images",
|
||||
"-n",
|
||||
choices=list(range(1, 13)), # 12 is the max number of images
|
||||
default=2,
|
||||
help="Number of images to use for the demo.")
|
||||
|
||||
args = parser.parse_args()
|
||||
main(args)
|
||||
|
||||
@ -1,5 +1,8 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
"""Example Python client for `vllm.entrypoints.api_server`
|
||||
Start the demo server:
|
||||
python -m vllm.entrypoints.api_server --model <model_name>
|
||||
|
||||
NOTE: The API server is used only for demonstration and simple performance
|
||||
benchmarks. It is not intended for production use.
|
||||
For production use, we recommend `vllm serve` and the OpenAI client API.
|
||||
@ -7,6 +10,7 @@ For production use, we recommend `vllm serve` and the OpenAI client API.
|
||||
|
||||
import argparse
|
||||
import json
|
||||
from argparse import Namespace
|
||||
from collections.abc import Iterable
|
||||
|
||||
import requests
|
||||
@ -27,7 +31,6 @@ def post_http_request(prompt: str,
|
||||
pload = {
|
||||
"prompt": prompt,
|
||||
"n": n,
|
||||
"use_beam_search": True,
|
||||
"temperature": 0.0,
|
||||
"max_tokens": 16,
|
||||
"stream": stream,
|
||||
@ -55,14 +58,7 @@ def get_response(response: requests.Response) -> list[str]:
|
||||
return output
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--host", type=str, default="localhost")
|
||||
parser.add_argument("--port", type=int, default=8000)
|
||||
parser.add_argument("--n", type=int, default=4)
|
||||
parser.add_argument("--prompt", type=str, default="San Francisco is a")
|
||||
parser.add_argument("--stream", action="store_true")
|
||||
args = parser.parse_args()
|
||||
def main(args: Namespace):
|
||||
prompt = args.prompt
|
||||
api_url = f"http://{args.host}:{args.port}/generate"
|
||||
n = args.n
|
||||
@ -83,3 +79,14 @@ if __name__ == "__main__":
|
||||
output = get_response(response)
|
||||
for i, line in enumerate(output):
|
||||
print(f"Beam candidate {i}: {line!r}", flush=True)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--host", type=str, default="localhost")
|
||||
parser.add_argument("--port", type=int, default=8000)
|
||||
parser.add_argument("--n", type=int, default=1)
|
||||
parser.add_argument("--prompt", type=str, default="San Francisco is a")
|
||||
parser.add_argument("--stream", action="store_true")
|
||||
args = parser.parse_args()
|
||||
main(args)
|
||||
|
||||
@ -6,7 +6,7 @@ requests >= 2.26.0
|
||||
tqdm
|
||||
blake3
|
||||
py-cpuinfo
|
||||
transformers >= 4.51.0
|
||||
transformers >= 4.51.1
|
||||
huggingface-hub[hf_xet] >= 0.30.0 # Required for Xet downloads.
|
||||
tokenizers >= 0.19.1 # Required for Llama 3.
|
||||
protobuf # Required by LlamaTokenizer.
|
||||
@ -22,13 +22,13 @@ lm-format-enforcer >= 0.10.11, < 0.11
|
||||
llguidance >= 0.7.9, < 0.8.0; platform_machine == "x86_64" or platform_machine == "arm64" or platform_machine == "aarch64"
|
||||
outlines == 0.1.11
|
||||
lark == 1.2.2
|
||||
xgrammar == 0.1.17; platform_machine == "x86_64" or platform_machine == "aarch64"
|
||||
xgrammar == 0.1.18; platform_machine == "x86_64" or platform_machine == "aarch64"
|
||||
typing_extensions >= 4.10
|
||||
filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317
|
||||
partial-json-parser # used for parsing partial JSON outputs
|
||||
pyzmq
|
||||
msgspec
|
||||
gguf == 0.10.0
|
||||
gguf >= 0.13.0
|
||||
importlib_metadata
|
||||
mistral_common[opencv] >= 1.5.4
|
||||
opencv-python-headless >= 4.11.0 # required for video IO
|
||||
@ -36,10 +36,14 @@ pyyaml
|
||||
six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
|
||||
setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
|
||||
einops # Required for Qwen2-VL.
|
||||
compressed-tensors == 0.9.2 # required for compressed-tensors
|
||||
compressed-tensors == 0.9.3 # required for compressed-tensors
|
||||
depyf==0.18.0 # required for profiling and debugging with compilation config
|
||||
cloudpickle # allows pickling lambda functions in model_executor/models/registry.py
|
||||
watchfiles # required for http server to monitor the updates of TLS files
|
||||
python-json-logger # Used by logging as per examples/other/logging_configuration.md
|
||||
scipy # Required for phi-4-multimodal-instruct
|
||||
ninja # Required for xgrammar, rocm, tpu, xpu
|
||||
opentelemetry-sdk>=1.26.0,<1.27.0 # vllm.tracing
|
||||
opentelemetry-api>=1.26.0,<1.27.0 # vllm.tracing
|
||||
opentelemetry-exporter-otlp>=1.26.0,<1.27.0 # vllm.tracing
|
||||
opentelemetry-semantic-conventions-ai>=0.4.1,<0.5.0 # vllm.tracing
|
||||
|
||||
@ -15,3 +15,6 @@ torchaudio==2.6.0; platform_machine == "ppc64le"
|
||||
torchvision; platform_machine != "ppc64le" and platform_machine != "s390x"
|
||||
torchvision==0.21.0; platform_machine == "ppc64le"
|
||||
datasets # for benchmark scripts
|
||||
|
||||
# cpu cannot use triton 3.3.0
|
||||
triton==3.2.0; platform_machine != "ppc64le"
|
||||
|
||||
@ -2,7 +2,7 @@
|
||||
-r common.txt
|
||||
|
||||
numba == 0.60.0; python_version == '3.9' # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding
|
||||
numba == 0.61; python_version > '3.9'
|
||||
numba == 0.61.2; python_version > '3.9'
|
||||
|
||||
# Dependencies for NVIDIA GPUs
|
||||
ray[cgraph]>=2.43.0, !=2.44.* # Ray Compiled Graph, required for pipeline parallelism in V1.
|
||||
|
||||
@ -5,6 +5,7 @@
|
||||
ray
|
||||
triton==3.1.0
|
||||
pandas
|
||||
numpy==1.26.4
|
||||
tabulate
|
||||
setuptools>=61
|
||||
setuptools-scm>=8
|
||||
|
||||
@ -2,7 +2,7 @@
|
||||
-r common.txt
|
||||
|
||||
numba == 0.60.0; python_version == '3.9' # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding
|
||||
numba == 0.61; python_version > '3.9'
|
||||
numba == 0.61.2; python_version > '3.9'
|
||||
|
||||
# Dependencies for AMD GPUs
|
||||
awscli
|
||||
|
||||
@ -5,6 +5,7 @@ pytest-forked
|
||||
pytest-asyncio
|
||||
pytest-rerunfailures
|
||||
pytest-shard
|
||||
pytest-timeout
|
||||
|
||||
# testing utils
|
||||
awscli
|
||||
@ -27,10 +28,11 @@ torchvision==0.21.0
|
||||
transformers_stream_generator # required for qwen-vl test
|
||||
matplotlib # required for qwen-vl test
|
||||
mistral_common[opencv] >= 1.5.4 # required for pixtral test
|
||||
num2words # required for smolvlm test
|
||||
opencv-python-headless >= 4.11.0 # required for video test
|
||||
datamodel_code_generator # required for minicpm3 test
|
||||
lm-eval[api]==0.4.8 # required for model evaluation test
|
||||
transformers==4.51.0
|
||||
transformers==4.51.1
|
||||
huggingface-hub[hf_xet]>=0.30.0 # Required for Xet downloads.
|
||||
# quantization
|
||||
bitsandbytes>=0.45.3
|
||||
@ -40,7 +42,7 @@ genai_perf==0.0.8
|
||||
tritonclient==2.51.0
|
||||
|
||||
numba == 0.60.0; python_version == '3.9' # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding
|
||||
numba == 0.61; python_version > '3.9'
|
||||
numba == 0.61.2; python_version > '3.9'
|
||||
numpy
|
||||
runai-model-streamer==0.11.0
|
||||
runai-model-streamer-s3==0.11.0
|
||||
|
||||
@ -101,6 +101,8 @@ dill==0.3.8
|
||||
# multiprocess
|
||||
dnspython==2.7.0
|
||||
# via email-validator
|
||||
docopt==0.6.2
|
||||
# via num2words
|
||||
docutils==0.16
|
||||
# via awscli
|
||||
einops==0.8.0
|
||||
@ -263,7 +265,9 @@ networkx==3.2.1
|
||||
# via torch
|
||||
nltk==3.9.1
|
||||
# via rouge-score
|
||||
numba==0.61.0
|
||||
num2words==0.5.14
|
||||
# via -r requirements/test.in
|
||||
numba==0.61.2
|
||||
# via
|
||||
# -r requirements/test.in
|
||||
# librosa
|
||||
@ -444,6 +448,7 @@ pytest==8.3.3
|
||||
# pytest-mock
|
||||
# pytest-rerunfailures
|
||||
# pytest-shard
|
||||
# pytest-timeout
|
||||
pytest-asyncio==0.24.0
|
||||
# via -r requirements/test.in
|
||||
pytest-forked==1.6.0
|
||||
@ -454,6 +459,8 @@ pytest-rerunfailures==14.0
|
||||
# via -r requirements/test.in
|
||||
pytest-shard==0.1.2
|
||||
# via -r requirements/test.in
|
||||
pytest-timeout==2.3.1
|
||||
# via -r requirements/test.in
|
||||
python-dateutil==2.9.0.post0
|
||||
# via
|
||||
# botocore
|
||||
@ -645,7 +652,7 @@ tqdm==4.66.6
|
||||
# transformers
|
||||
tqdm-multiprocess==0.0.11
|
||||
# via lm-eval
|
||||
transformers==4.51.0
|
||||
transformers==4.51.1
|
||||
# via
|
||||
# -r requirements/test.in
|
||||
# genai-perf
|
||||
|
||||
@ -17,10 +17,10 @@ ray[data]
|
||||
--find-links https://storage.googleapis.com/libtpu-releases/index.html
|
||||
--find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html
|
||||
--find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
|
||||
torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250406-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
|
||||
torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250406-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
|
||||
torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250406-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
|
||||
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250406-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
|
||||
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250406-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
|
||||
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250406-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
|
||||
torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250408-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
|
||||
torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250408-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
|
||||
torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250408-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
|
||||
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250408-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
|
||||
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250408-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
|
||||
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250408-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
|
||||
|
||||
|
||||
@ -2,7 +2,7 @@
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any, Union
|
||||
from typing import Any, Optional, Union
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
@ -15,7 +15,7 @@ from vllm.platforms import current_platform
|
||||
from ..utils import create_new_process_for_each_test
|
||||
|
||||
|
||||
def models_list(all: bool):
|
||||
def models_list(*, all: bool = True, keywords: Optional[list[str]] = None):
|
||||
TEST_MODELS: list[tuple[str, dict[str, Any]]] = [
|
||||
("facebook/opt-125m", {}),
|
||||
("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", {
|
||||
@ -32,47 +32,50 @@ def models_list(all: bool):
|
||||
("meta-llama/Llama-3.2-1B-Instruct", {}),
|
||||
]
|
||||
|
||||
if not all:
|
||||
return TEST_MODELS
|
||||
|
||||
if is_quant_method_supported("aqlm"):
|
||||
TEST_MODELS.append(("ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf", {
|
||||
"quantization": "aqlm"
|
||||
}))
|
||||
|
||||
# TODO: figure out why this fails.
|
||||
if False and is_quant_method_supported("gguf"): # noqa: SIM223
|
||||
TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF", {
|
||||
"quantization": "gguf"
|
||||
}))
|
||||
|
||||
if is_quant_method_supported("gptq"):
|
||||
TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", {
|
||||
"quantization": "gptq"
|
||||
}))
|
||||
|
||||
if is_quant_method_supported("gptq_marlin"):
|
||||
TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", {
|
||||
"quantization": "gptq_marlin"
|
||||
}))
|
||||
|
||||
if is_quant_method_supported("gptq_marlin_24"):
|
||||
TEST_MODELS.append(("alexm-nm/tinyllama-24-marlin24-4bit-g128", {
|
||||
"quantization": "gptq_marlin_24"
|
||||
}))
|
||||
|
||||
if is_quant_method_supported("marlin"):
|
||||
TEST_MODELS.append(
|
||||
("robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin", {
|
||||
"quantization": "marlin"
|
||||
if all:
|
||||
if is_quant_method_supported("aqlm"):
|
||||
TEST_MODELS.append(("ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf", {
|
||||
"quantization": "aqlm"
|
||||
}))
|
||||
|
||||
if not current_platform.is_rocm() and is_quant_method_supported("awq"):
|
||||
TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", {
|
||||
"quantization": "AWQ"
|
||||
}))
|
||||
# TODO: figure out why this fails.
|
||||
if False and is_quant_method_supported("gguf"): # noqa: SIM223
|
||||
TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF", {
|
||||
"quantization": "gguf"
|
||||
}))
|
||||
|
||||
return TEST_MODELS
|
||||
if is_quant_method_supported("gptq"):
|
||||
TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", {
|
||||
"quantization": "gptq"
|
||||
}))
|
||||
|
||||
if is_quant_method_supported("gptq_marlin"):
|
||||
TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", {
|
||||
"quantization": "gptq_marlin"
|
||||
}))
|
||||
|
||||
if is_quant_method_supported("gptq_marlin_24"):
|
||||
TEST_MODELS.append(("alexm-nm/tinyllama-24-marlin24-4bit-g128", {
|
||||
"quantization": "gptq_marlin_24"
|
||||
}))
|
||||
|
||||
if is_quant_method_supported("marlin"):
|
||||
TEST_MODELS.append(
|
||||
("robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin", {
|
||||
"quantization": "marlin"
|
||||
}))
|
||||
|
||||
if not current_platform.is_rocm() and is_quant_method_supported("awq"):
|
||||
TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", {
|
||||
"quantization": "AWQ"
|
||||
}))
|
||||
|
||||
if keywords is None:
|
||||
return TEST_MODELS
|
||||
|
||||
# filter by keywords
|
||||
pred = lambda model: any(keyword in model[0] for keyword in keywords)
|
||||
return list(filter(pred, TEST_MODELS))
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
@ -96,20 +99,30 @@ def test_full_graph(
|
||||
run_model(optimization_level, model, model_kwargs)
|
||||
|
||||
|
||||
PassConfig = CompilationConfig.PassConfig
|
||||
|
||||
|
||||
# TODO(luka) add other supported compilation config scenarios here
|
||||
@pytest.mark.parametrize(
|
||||
"compilation_config",
|
||||
# additional compile sizes
|
||||
"compilation_config, model_info",
|
||||
[
|
||||
CompilationConfig(level=CompilationLevel.PIECEWISE,
|
||||
compile_sizes=[1, 2])
|
||||
# additional compile sizes, only some of the models
|
||||
(CompilationConfig(level=CompilationLevel.PIECEWISE,
|
||||
compile_sizes=[1, 2]), model)
|
||||
for model in models_list(all=False)
|
||||
] + [
|
||||
# RMSNorm + quant fusion, only 8-bit quant models
|
||||
(CompilationConfig(level=CompilationLevel.PIECEWISE,
|
||||
custom_ops=["+rms_norm"],
|
||||
pass_config=PassConfig(enable_fusion=True,
|
||||
enable_noop=True)), model)
|
||||
for model in models_list(keywords=["FP8-dynamic", "quantized.w8a8"])
|
||||
])
|
||||
# only test some of the models
|
||||
@pytest.mark.parametrize("model_info", models_list(all=False))
|
||||
@create_new_process_for_each_test()
|
||||
def test_custom_compile_config(
|
||||
model_info: tuple[str, dict[str, Any]],
|
||||
compilation_config: CompilationConfig,
|
||||
model_info: tuple[str, dict[str, Any]],
|
||||
):
|
||||
model, model_kwargs = model_info
|
||||
print(f"MODEL={model}")
|
||||
|
||||
@ -44,12 +44,17 @@ class TestModel(torch.nn.Module):
|
||||
resid = torch.sqrt(x)
|
||||
y = self.norm[0](x)
|
||||
|
||||
x2 = self.fp8_linear.apply(y, self.w[0], self.wscale[0], self.scale[0])
|
||||
x2 = self.fp8_linear.apply(y,
|
||||
self.w[0],
|
||||
self.wscale[0],
|
||||
input_scale=self.scale[0])
|
||||
# make sure resid is used for replacement to work
|
||||
y2, resid = self.norm[1](x2, resid)
|
||||
|
||||
x3 = self.fp8_linear.apply(y2, self.w[1], self.wscale[1],
|
||||
self.scale[1])
|
||||
x3 = self.fp8_linear.apply(y2,
|
||||
self.w[1],
|
||||
self.wscale[1],
|
||||
input_scale=self.scale[1])
|
||||
y3, resid = self.norm[2](x3, resid) # use resid here
|
||||
return y3
|
||||
|
||||
|
||||
@ -671,8 +671,9 @@ class HfRunner:
|
||||
return [(output_ids, output_str, output_logprobs)
|
||||
for output_ids, output_str, output_logprobs in outputs]
|
||||
|
||||
def encode(self, prompts: list[str]) -> list[list[torch.Tensor]]:
|
||||
return self.model.encode(prompts)
|
||||
def encode(self, prompts: list[str], *args,
|
||||
**kwargs) -> list[list[torch.Tensor]]:
|
||||
return self.model.encode(prompts, *args, **kwargs)
|
||||
|
||||
def predict(self, prompts: list[list[str]]) -> torch.Tensor:
|
||||
return self.model.predict(prompts, convert_to_tensor=True)
|
||||
|
||||
@ -18,7 +18,8 @@ models = ["llava-hf/llava-1.5-7b-hf"]
|
||||
def test_context_length_too_short(vllm_runner, image_assets, model):
|
||||
images = [asset.pil_image for asset in image_assets]
|
||||
|
||||
with pytest.raises(ValueError, match="too long to fit into the model"):
|
||||
with pytest.raises(ValueError,
|
||||
match="longer than the maximum model length"):
|
||||
vllm_model = vllm_runner(
|
||||
model,
|
||||
max_model_len=128, # LLaVA has a feature size of 576
|
||||
|
||||
@ -3,9 +3,11 @@
|
||||
import json
|
||||
import re
|
||||
import weakref
|
||||
from enum import Enum
|
||||
|
||||
import jsonschema
|
||||
import pytest
|
||||
from pydantic import BaseModel
|
||||
|
||||
from vllm.distributed import cleanup_dist_env_and_memory
|
||||
from vllm.entrypoints.llm import LLM
|
||||
@ -330,3 +332,44 @@ def test_guided_json_object(llm, guided_decoding_backend: str):
|
||||
# Parse to verify it is valid JSON
|
||||
parsed_json = json.loads(generated_text)
|
||||
assert isinstance(parsed_json, dict)
|
||||
|
||||
|
||||
class CarType(str, Enum):
|
||||
sedan = "sedan"
|
||||
suv = "SUV"
|
||||
truck = "Truck"
|
||||
coupe = "Coupe"
|
||||
|
||||
|
||||
class CarDescription(BaseModel):
|
||||
brand: str
|
||||
model: str
|
||||
car_type: CarType
|
||||
|
||||
|
||||
@pytest.mark.skip_global_cleanup
|
||||
@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
|
||||
def test_guided_json_completion_with_enum(llm, guided_decoding_backend: str):
|
||||
json_schema = CarDescription.model_json_schema()
|
||||
sampling_params = SamplingParams(temperature=1.0,
|
||||
max_tokens=1000,
|
||||
guided_decoding=GuidedDecodingParams(
|
||||
json=json_schema,
|
||||
backend=guided_decoding_backend))
|
||||
outputs = llm.generate(
|
||||
prompts="Generate a JSON with the brand, model and car_type of"
|
||||
"the most iconic car from the 90's",
|
||||
sampling_params=sampling_params,
|
||||
use_tqdm=True)
|
||||
|
||||
assert outputs is not None
|
||||
for output in outputs:
|
||||
assert output is not None
|
||||
assert isinstance(output, RequestOutput)
|
||||
prompt = output.prompt
|
||||
|
||||
generated_text = output.outputs[0].text
|
||||
assert generated_text is not None
|
||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||
output_json = json.loads(generated_text)
|
||||
jsonschema.validate(instance=output_json, schema=json_schema)
|
||||
@ -15,7 +15,7 @@ def v1(run_with_both_engines):
|
||||
|
||||
def test_empty_prompt():
|
||||
llm = LLM(model="openai-community/gpt2", enforce_eager=True)
|
||||
with pytest.raises(ValueError, match='Prompt cannot be empty'):
|
||||
with pytest.raises(ValueError, match='decoder prompt cannot be empty'):
|
||||
llm.generate([""])
|
||||
|
||||
|
||||
|
||||
@ -20,8 +20,6 @@ from .test_completion import zephyr_lora_files # noqa: F401
|
||||
# any model with a chat template should work here
|
||||
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
|
||||
|
||||
GUIDED_DECODING_BACKENDS = ["outlines", "lm-format-enforcer", "xgrammar"]
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def monkeypatch_module():
|
||||
@ -487,20 +485,9 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
|
||||
assert last_completion_tokens == 10
|
||||
|
||||
|
||||
# NOTE: Not sure why, but when I place this after `test_guided_regex_chat`
|
||||
# (i.e. using the same ordering as in the Completions API tests), the test
|
||||
# will fail on the second `guided_decoding_backend` even when I swap their order
|
||||
# (ref: https://github.com/vllm-project/vllm/pull/5526#issuecomment-2173772256)
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
|
||||
async def test_guided_choice_chat(client: openai.AsyncOpenAI,
|
||||
is_v1_server: bool,
|
||||
guided_decoding_backend: str,
|
||||
sample_guided_choice):
|
||||
|
||||
if is_v1_server and guided_decoding_backend != 'xgrammar':
|
||||
pytest.skip("Only xgrammar backend is supported with V1")
|
||||
|
||||
messages = [{
|
||||
"role": "system",
|
||||
"content": "you are a helpful assistant"
|
||||
@ -515,8 +502,7 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI,
|
||||
messages=messages,
|
||||
max_completion_tokens=10,
|
||||
temperature=0.7,
|
||||
extra_body=dict(guided_choice=sample_guided_choice,
|
||||
guided_decoding_backend=guided_decoding_backend))
|
||||
extra_body=dict(guided_choice=sample_guided_choice))
|
||||
choice1 = chat_completion.choices[0].message.content
|
||||
assert choice1 in sample_guided_choice
|
||||
|
||||
@ -530,22 +516,16 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI,
|
||||
messages=messages,
|
||||
max_completion_tokens=10,
|
||||
temperature=0.7,
|
||||
extra_body=dict(guided_choice=sample_guided_choice,
|
||||
guided_decoding_backend=guided_decoding_backend))
|
||||
extra_body=dict(guided_choice=sample_guided_choice))
|
||||
choice2 = chat_completion.choices[0].message.content
|
||||
assert choice2 in sample_guided_choice
|
||||
assert choice1 != choice2
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
|
||||
async def test_guided_json_chat(client: openai.AsyncOpenAI, is_v1_server: bool,
|
||||
guided_decoding_backend: str,
|
||||
async def test_guided_json_chat(client: openai.AsyncOpenAI,
|
||||
sample_json_schema):
|
||||
|
||||
if is_v1_server:
|
||||
pytest.skip("sample_json_schema has features unsupported in V1")
|
||||
|
||||
messages = [{
|
||||
"role": "system",
|
||||
"content": "you are a helpful assistant"
|
||||
@ -560,8 +540,7 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI, is_v1_server: bool,
|
||||
model=MODEL_NAME,
|
||||
messages=messages,
|
||||
max_completion_tokens=1000,
|
||||
extra_body=dict(guided_json=sample_json_schema,
|
||||
guided_decoding_backend=guided_decoding_backend))
|
||||
extra_body=dict(guided_json=sample_json_schema))
|
||||
message = chat_completion.choices[0].message
|
||||
assert message.content is not None
|
||||
json1 = json.loads(message.content)
|
||||
@ -578,8 +557,7 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI, is_v1_server: bool,
|
||||
model=MODEL_NAME,
|
||||
messages=messages,
|
||||
max_completion_tokens=1000,
|
||||
extra_body=dict(guided_json=sample_json_schema,
|
||||
guided_decoding_backend=guided_decoding_backend))
|
||||
extra_body=dict(guided_json=sample_json_schema))
|
||||
message = chat_completion.choices[0].message
|
||||
assert message.content is not None
|
||||
json2 = json.loads(message.content)
|
||||
@ -589,13 +567,7 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI, is_v1_server: bool,
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
|
||||
async def test_guided_regex_chat(client: openai.AsyncOpenAI,
|
||||
is_v1_server: bool,
|
||||
guided_decoding_backend: str, sample_regex):
|
||||
|
||||
if is_v1_server and guided_decoding_backend != 'xgrammar':
|
||||
pytest.skip("Only xgrammar backend is supported with V1")
|
||||
async def test_guided_regex_chat(client: openai.AsyncOpenAI, sample_regex):
|
||||
|
||||
messages = [{
|
||||
"role": "system",
|
||||
@ -610,8 +582,7 @@ async def test_guided_regex_chat(client: openai.AsyncOpenAI,
|
||||
model=MODEL_NAME,
|
||||
messages=messages,
|
||||
max_completion_tokens=20,
|
||||
extra_body=dict(guided_regex=sample_regex,
|
||||
guided_decoding_backend=guided_decoding_backend))
|
||||
extra_body=dict(guided_regex=sample_regex))
|
||||
ip1 = chat_completion.choices[0].message.content
|
||||
assert ip1 is not None
|
||||
assert re.fullmatch(sample_regex, ip1) is not None
|
||||
@ -622,8 +593,7 @@ async def test_guided_regex_chat(client: openai.AsyncOpenAI,
|
||||
model=MODEL_NAME,
|
||||
messages=messages,
|
||||
max_completion_tokens=20,
|
||||
extra_body=dict(guided_regex=sample_regex,
|
||||
guided_decoding_backend=guided_decoding_backend))
|
||||
extra_body=dict(guided_regex=sample_regex))
|
||||
ip2 = chat_completion.choices[0].message.content
|
||||
assert ip2 is not None
|
||||
assert re.fullmatch(sample_regex, ip2) is not None
|
||||
@ -652,15 +622,9 @@ async def test_guided_decoding_type_error(client: openai.AsyncOpenAI):
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
|
||||
async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI,
|
||||
is_v1_server: bool,
|
||||
guided_decoding_backend: str,
|
||||
sample_guided_choice):
|
||||
|
||||
if is_v1_server and guided_decoding_backend != 'xgrammar':
|
||||
pytest.skip("Only xgrammar backend is supported with V1")
|
||||
|
||||
messages = [{
|
||||
"role": "system",
|
||||
"content": "you are a helpful assistant"
|
||||
@ -676,8 +640,7 @@ async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI,
|
||||
max_completion_tokens=10,
|
||||
logprobs=True,
|
||||
top_logprobs=5,
|
||||
extra_body=dict(guided_choice=sample_guided_choice,
|
||||
guided_decoding_backend=guided_decoding_backend))
|
||||
extra_body=dict(guided_choice=sample_guided_choice))
|
||||
|
||||
assert chat_completion.choices[0].logprobs is not None
|
||||
assert chat_completion.choices[0].logprobs.content is not None
|
||||
@ -689,14 +652,7 @@ async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI,
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
|
||||
async def test_named_tool_use(client: openai.AsyncOpenAI, is_v1_server: bool,
|
||||
guided_decoding_backend: str,
|
||||
sample_json_schema):
|
||||
|
||||
if is_v1_server:
|
||||
pytest.skip("sample_json_schema has features unsupported on V1")
|
||||
|
||||
async def test_named_tool_use(client: openai.AsyncOpenAI, sample_json_schema):
|
||||
messages = [{
|
||||
"role": "system",
|
||||
"content": "you are a helpful assistant"
|
||||
@ -728,7 +684,7 @@ async def test_named_tool_use(client: openai.AsyncOpenAI, is_v1_server: bool,
|
||||
"name": "dummy_function_name"
|
||||
}
|
||||
},
|
||||
extra_body=dict(guided_decoding_backend=guided_decoding_backend))
|
||||
)
|
||||
message = chat_completion.choices[0].message
|
||||
assert len(message.content) == 0
|
||||
json_string = message.tool_calls[0].function.arguments
|
||||
@ -763,7 +719,6 @@ async def test_named_tool_use(client: openai.AsyncOpenAI, is_v1_server: bool,
|
||||
"name": "dummy_function_name"
|
||||
}
|
||||
},
|
||||
extra_body=dict(guided_decoding_backend=guided_decoding_backend),
|
||||
stream=True)
|
||||
|
||||
output = []
|
||||
@ -888,7 +843,6 @@ async def test_required_tool_use(client: openai.AsyncOpenAI,
|
||||
model=model_name,
|
||||
tools=tools,
|
||||
tool_choice="required",
|
||||
extra_body=dict(guided_decoding_backend="outlines"),
|
||||
)
|
||||
|
||||
assert chat_completion.choices[0].message.tool_calls is not None
|
||||
@ -900,7 +854,6 @@ async def test_required_tool_use(client: openai.AsyncOpenAI,
|
||||
model=model_name,
|
||||
tools=tools,
|
||||
tool_choice="required",
|
||||
extra_body=dict(guided_decoding_backend="outlines"),
|
||||
stream=True,
|
||||
)
|
||||
|
||||
@ -914,12 +867,7 @@ async def test_required_tool_use(client: openai.AsyncOpenAI,
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_inconsistent_tool_choice_and_tools(client: openai.AsyncOpenAI,
|
||||
is_v1_server: bool,
|
||||
sample_json_schema):
|
||||
|
||||
if is_v1_server:
|
||||
pytest.skip("sample_json_schema has features unsupported on V1")
|
||||
|
||||
messages = [{
|
||||
"role": "system",
|
||||
"content": "you are a helpful assistant"
|
||||
|
||||
@ -11,6 +11,7 @@ import requests
|
||||
from vllm.entrypoints.openai.protocol import EmbeddingResponse
|
||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||
|
||||
from ...models.embedding.utils import check_embeddings_close
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
MODEL_NAME = "intfloat/multilingual-e5-small"
|
||||
@ -190,30 +191,35 @@ async def test_batch_base64_embedding(client: openai.AsyncOpenAI,
|
||||
responses_float = await client.embeddings.create(input=input_texts,
|
||||
model=model_name,
|
||||
encoding_format="float")
|
||||
float_data = [d.embedding for d in responses_float.data]
|
||||
|
||||
responses_base64 = await client.embeddings.create(input=input_texts,
|
||||
model=model_name,
|
||||
encoding_format="base64")
|
||||
|
||||
decoded_responses_base64_data = []
|
||||
base64_data = []
|
||||
for data in responses_base64.data:
|
||||
decoded_responses_base64_data.append(
|
||||
base64_data.append(
|
||||
np.frombuffer(base64.b64decode(data.embedding),
|
||||
dtype="float32").tolist())
|
||||
|
||||
assert responses_float.data[0].embedding == decoded_responses_base64_data[
|
||||
0]
|
||||
assert responses_float.data[1].embedding == decoded_responses_base64_data[
|
||||
1]
|
||||
check_embeddings_close(
|
||||
embeddings_0_lst=float_data,
|
||||
embeddings_1_lst=base64_data,
|
||||
name_0="float",
|
||||
name_1="base64",
|
||||
)
|
||||
|
||||
# Default response is float32 decoded from base64 by OpenAI Client
|
||||
responses_default = await client.embeddings.create(input=input_texts,
|
||||
model=model_name)
|
||||
default_data = [d.embedding for d in responses_default.data]
|
||||
|
||||
assert responses_float.data[0].embedding == responses_default.data[
|
||||
0].embedding
|
||||
assert responses_float.data[1].embedding == responses_default.data[
|
||||
1].embedding
|
||||
check_embeddings_close(
|
||||
embeddings_0_lst=float_data,
|
||||
embeddings_1_lst=default_data,
|
||||
name_0="float",
|
||||
name_1="default",
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
|
||||
@ -17,7 +17,7 @@ async def test_empty_prompt():
|
||||
client = remote_server.get_async_client()
|
||||
|
||||
with pytest.raises(openai.BadRequestError,
|
||||
match=re.compile('.+Prompt cannot be empty.+')):
|
||||
match="decoder prompt cannot be empty"):
|
||||
await client.completions.create(model=model_name,
|
||||
prompt="",
|
||||
max_tokens=5,
|
||||
|
||||
@ -30,6 +30,7 @@ QWEN25VL_MODEL_ID = "Qwen/Qwen2.5-VL-3B-Instruct"
|
||||
MLLAMA_MODEL_ID = "meta-llama/Llama-3.2-11B-Vision-Instruct"
|
||||
LLAMA_GUARD_MODEL_ID = "meta-llama/Llama-Guard-3-1B"
|
||||
HERMES_MODEL_ID = "NousResearch/Hermes-3-Llama-3.1-8B"
|
||||
MISTRAL_MODEL_ID = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
|
||||
|
||||
|
||||
@pytest.fixture(scope="function")
|
||||
@ -80,6 +81,30 @@ def mllama_tokenizer():
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture(scope="function")
|
||||
def mistral_model_config():
|
||||
return ModelConfig(MISTRAL_MODEL_ID,
|
||||
task="generate",
|
||||
tokenizer=MISTRAL_MODEL_ID,
|
||||
tokenizer_mode="auto",
|
||||
trust_remote_code=True,
|
||||
dtype="auto",
|
||||
seed=0,
|
||||
limit_mm_per_prompt={
|
||||
"image": 2,
|
||||
})
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def mistral_tokenizer():
|
||||
return TokenizerGroup(
|
||||
tokenizer_id=MISTRAL_MODEL_ID,
|
||||
enable_lora=False,
|
||||
max_num_seqs=5,
|
||||
max_input_length=None,
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def image_url():
|
||||
image = ImageAsset('cherry_blossom')
|
||||
@ -131,6 +156,66 @@ def test_parse_chat_messages_single_image(
|
||||
_assert_mm_data_is_image_input(mm_data, 1)
|
||||
|
||||
|
||||
def test_parse_chat_messages_empty_system(
|
||||
mistral_model_config,
|
||||
mistral_tokenizer,
|
||||
):
|
||||
# Test string format
|
||||
conversation, _ = parse_chat_messages(
|
||||
[{
|
||||
"role": "system",
|
||||
"content": ""
|
||||
}, {
|
||||
"role": "user",
|
||||
"content": [{
|
||||
"type": "text",
|
||||
"text": "Who are you?"
|
||||
}]
|
||||
}],
|
||||
mistral_model_config,
|
||||
mistral_tokenizer,
|
||||
content_format="string",
|
||||
)
|
||||
assert conversation == [{
|
||||
"role": "system",
|
||||
"content": ""
|
||||
}, {
|
||||
"role": "user",
|
||||
"content": "Who are you?"
|
||||
}]
|
||||
|
||||
# Test openai format
|
||||
conversation, _ = parse_chat_messages(
|
||||
[{
|
||||
"role": "system",
|
||||
"content": ""
|
||||
}, {
|
||||
"role": "user",
|
||||
"content": [{
|
||||
"type": "text",
|
||||
"text": "Who are you?"
|
||||
}]
|
||||
}],
|
||||
mistral_model_config,
|
||||
mistral_tokenizer,
|
||||
content_format="openai",
|
||||
)
|
||||
assert conversation == [{
|
||||
"role": "system",
|
||||
"content": [{
|
||||
"type": "text",
|
||||
"text": ""
|
||||
}]
|
||||
}, {
|
||||
"role":
|
||||
"user",
|
||||
"content": [{
|
||||
"type": "text",
|
||||
"text": "Who are you?"
|
||||
}]
|
||||
}]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_parse_chat_messages_single_image_async(
|
||||
phi3v_model_config,
|
||||
@ -671,7 +756,7 @@ def test_multimodal_image_parsing_matches_hf(model, image_url):
|
||||
# Build a config for the model
|
||||
model_config = ModelConfig(model,
|
||||
task="generate",
|
||||
tokenizer=MLLAMA_MODEL_ID,
|
||||
tokenizer=model,
|
||||
tokenizer_mode="auto",
|
||||
trust_remote_code=True,
|
||||
dtype="auto",
|
||||
@ -682,7 +767,7 @@ def test_multimodal_image_parsing_matches_hf(model, image_url):
|
||||
|
||||
# Build the tokenizer group and grab the underlying tokenizer
|
||||
tokenizer_group = TokenizerGroup(
|
||||
MLLAMA_MODEL_ID,
|
||||
model,
|
||||
enable_lora=False,
|
||||
max_num_seqs=5,
|
||||
max_input_length=None,
|
||||
|
||||
@ -124,7 +124,7 @@ def test_flash_mla(b, s_q, mean_sk, h_q, h_kv, d, dv, block_size, causal,
|
||||
cal_diff(out_flash, out_torch, "out")
|
||||
cal_diff(lse_flash, lse_torch, "lse")
|
||||
|
||||
t = triton.testing.do_bench(flash_mla, fast_flush=False)
|
||||
t = triton.testing.do_bench(flash_mla)
|
||||
FLOPS = s_q * total_seqlens * h_q * (d + dv) * 2
|
||||
bytes = (total_seqlens * h_kv * d + b * s_q * h_q * d +
|
||||
b * s_q * h_q * dv) * (torch.finfo(dtype).bits // 8)
|
||||
|
||||
265
tests/kernels/test_merge_attn_states.py
Normal file
265
tests/kernels/test_merge_attn_states.py
Normal file
@ -0,0 +1,265 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
from typing import Optional
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm._custom_ops import merge_attn_states as merge_attn_states_cuda
|
||||
from vllm.attention.ops.triton_merge_attn_states import (
|
||||
merge_attn_states as merge_attn_states_triton)
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
|
||||
# Naive PyTorch Implements section 2.2 of https://www.arxiv.org/pdf/2501.01005
|
||||
# can be used to combine partial attention results (in the split-KV case)
|
||||
def merge_attn_states_torch(
|
||||
output: torch.Tensor, # [NUM_TOKENS, NUM_HEADS, HEAD_SIZE]
|
||||
prefix_output: torch.Tensor, # [NUM_TOKENS, NUM_HEADS, HEAD_SIZE]
|
||||
prefix_lse: torch.Tensor, # [NUM_HEADS, NUM_TOKENS]
|
||||
suffix_output: torch.Tensor, # [NUM_TOKENS, NUM_HEADS, HEAD_SIZE]
|
||||
suffix_lse: torch.Tensor, # [NUM_HEADS, NUM_TOKENS]
|
||||
output_lse: Optional[torch.Tensor] = None, # [NUM_HEADS, NUM_TOKENS]
|
||||
):
|
||||
p_lse = prefix_lse
|
||||
s_lse = suffix_lse
|
||||
# inf -> -inf
|
||||
p_lse[p_lse == torch.inf] = -torch.inf
|
||||
s_lse[s_lse == torch.inf] = -torch.inf
|
||||
# max_lse [NUM_HEADS, NUM_TOKENS]
|
||||
max_lse = torch.maximum(p_lse, s_lse)
|
||||
p_lse = p_lse - max_lse
|
||||
s_lse = s_lse - max_lse
|
||||
p_lse_exp = torch.exp(p_lse)
|
||||
s_lse_exp = torch.exp(s_lse)
|
||||
out_se = (p_lse_exp + s_lse_exp)
|
||||
if output_lse is not None:
|
||||
output_lse = torch.log(out_se) + max_lse
|
||||
p_scale = p_lse_exp / out_se # [NUM_HEADS, NUM_TOKENS]
|
||||
s_scale = s_lse_exp / out_se # [NUM_HEADS, NUM_TOKENS]
|
||||
p_scale = torch.transpose(p_scale, 0,
|
||||
1).unsqueeze(2) # [NUM_TOKENS, NUM_HEADS, 1]
|
||||
s_scale = torch.transpose(s_scale, 0,
|
||||
1).unsqueeze(2) # [NUM_TOKENS, NUM_HEADS, 1]
|
||||
output = prefix_output * p_scale + suffix_output * s_scale
|
||||
return output, output_lse
|
||||
|
||||
|
||||
NUM_BATCH_TOKENS = [256, 512, 613, 1024, 1536, 4096]
|
||||
NUM_QUERY_HEADS = [4, 8, 16, 32, 48, 64]
|
||||
HEAD_SIZES = [32, 48, 64, 96, 128, 256]
|
||||
DTYPES = [torch.float32, torch.half, torch.bfloat16]
|
||||
|
||||
all_case_info: list[tuple] = []
|
||||
|
||||
|
||||
def generate_markdown_table():
|
||||
global all_case_info
|
||||
table_header = ("| tokens | heads | headsize | dtype "
|
||||
"| device | torch | triton | cuda | speedup |")
|
||||
table_separator = "| --- | --- | --- | --- | --- | --- | --- | --- | --- |"
|
||||
|
||||
def shortly_dtype(dtype: torch.dtype) -> str:
|
||||
return str(dtype).removeprefix("torch.")
|
||||
|
||||
def shortly_device(device: str) -> str:
|
||||
return device.removeprefix("NVIDIA").strip()
|
||||
|
||||
print(table_header)
|
||||
print(table_separator)
|
||||
for info in all_case_info:
|
||||
(num_tokens, num_heads, head_size, dtype, device,
|
||||
avg_time_torch_kernel, avg_time_triton_kernel, avg_time_cuda_kernel,
|
||||
performance_improved) = info
|
||||
dtype = shortly_dtype(dtype)
|
||||
device = shortly_device(device)
|
||||
print(f"| {num_tokens} | {num_heads} | {head_size} "
|
||||
f"| {dtype} | {device} | {avg_time_torch_kernel:.5f}ms "
|
||||
f"| {avg_time_triton_kernel:.5f}ms "
|
||||
f"| {avg_time_cuda_kernel:.5f}ms "
|
||||
f"| {performance_improved:.4f}x |")
|
||||
|
||||
|
||||
@pytest.mark.parametrize("num_tokens", NUM_BATCH_TOKENS)
|
||||
@pytest.mark.parametrize("num_query_heads", NUM_QUERY_HEADS)
|
||||
@pytest.mark.parametrize("head_size", HEAD_SIZES)
|
||||
@pytest.mark.parametrize("output_dtype", DTYPES)
|
||||
@torch.inference_mode()
|
||||
def test_merge_attn_states(num_tokens: int, num_query_heads: int,
|
||||
head_size: int, output_dtype: torch.dtype):
|
||||
if not current_platform.is_cuda():
|
||||
pytest.skip('Currently only support compare triton merge_attn_states '
|
||||
'with custom cuda merge_attn_states kernel')
|
||||
|
||||
NUM_TOKENS = num_tokens
|
||||
NUM_HEADS = num_query_heads
|
||||
HEAD_SIZE = head_size
|
||||
|
||||
print(f"\nNUM_TOKENS:{NUM_TOKENS}, NUM_HEADS:{NUM_HEADS}, "
|
||||
f"HEAD_SIZE:{HEAD_SIZE}, DTYPE: {output_dtype}, "
|
||||
f"Device: {current_platform.get_device_name()}")
|
||||
|
||||
# prefix_lse and suffix_lse contain inf and normal values
|
||||
prefix_lse = torch.randn(NUM_HEADS,
|
||||
NUM_TOKENS,
|
||||
dtype=torch.float32,
|
||||
device="cuda")
|
||||
suffix_lse = torch.randn(NUM_HEADS,
|
||||
NUM_TOKENS,
|
||||
dtype=torch.float32,
|
||||
device="cuda")
|
||||
|
||||
# Generate boolean masks
|
||||
mask_prefix = torch.rand(NUM_HEADS, NUM_TOKENS) < 0.1
|
||||
mask_suffix = torch.rand(NUM_HEADS, NUM_TOKENS) < 0.1
|
||||
# Ensure that the same position is not True at the same time
|
||||
combined_mask = torch.logical_and(mask_prefix, mask_suffix)
|
||||
mask_prefix = torch.logical_and(mask_prefix, ~combined_mask)
|
||||
mask_suffix = torch.logical_and(mask_suffix, ~combined_mask)
|
||||
|
||||
prefix_lse[mask_prefix] = float('inf')
|
||||
suffix_lse[mask_suffix] = float('inf')
|
||||
|
||||
# Other input tensors (need to be initialized but
|
||||
# no actual calculation needed)
|
||||
output = torch.zeros((NUM_TOKENS, NUM_HEADS, HEAD_SIZE),
|
||||
dtype=output_dtype,
|
||||
device="cuda")
|
||||
output_lse = torch.zeros((NUM_HEADS, NUM_TOKENS),
|
||||
dtype=torch.float32,
|
||||
device="cuda")
|
||||
prefix_output = torch.randn((NUM_TOKENS, NUM_HEADS, HEAD_SIZE),
|
||||
dtype=output_dtype,
|
||||
device="cuda")
|
||||
suffix_output = torch.randn((NUM_TOKENS, NUM_HEADS, HEAD_SIZE),
|
||||
dtype=output_dtype,
|
||||
device="cuda")
|
||||
|
||||
warmup_times = 2
|
||||
repeat_times = 20
|
||||
|
||||
output_torch = output.clone()
|
||||
output_lse_torch = output_lse.clone()
|
||||
total_time_torch_kernel = 0
|
||||
start = torch.cuda.Event(enable_timing=True)
|
||||
end = torch.cuda.Event(enable_timing=True)
|
||||
|
||||
# 0. Run the Torch kernel
|
||||
prefix_lse_torch = prefix_lse.clone()
|
||||
suffix_lse_torch = suffix_lse.clone()
|
||||
for _ in range(warmup_times):
|
||||
output_torch, output_lse_torch = merge_attn_states_torch(
|
||||
output_torch, prefix_output, prefix_lse_torch, suffix_output,
|
||||
suffix_lse_torch, output_lse_torch)
|
||||
torch.cuda.synchronize()
|
||||
|
||||
for _ in range(repeat_times):
|
||||
start.record()
|
||||
output_torch, output_lse_torch = merge_attn_states_torch(
|
||||
output_torch, prefix_output, prefix_lse_torch, suffix_output,
|
||||
suffix_lse_torch, output_lse_torch)
|
||||
end.record()
|
||||
torch.cuda.synchronize()
|
||||
total_time_torch_kernel += start.elapsed_time(end)
|
||||
|
||||
avg_time_torch_kernel = total_time_torch_kernel / repeat_times
|
||||
|
||||
# 1. Run the Triton kernel
|
||||
output_ref_triton = output.clone()
|
||||
output_lse_ref_triton = output_lse.clone()
|
||||
|
||||
total_time_triton_kernel = 0
|
||||
start = torch.cuda.Event(enable_timing=True)
|
||||
end = torch.cuda.Event(enable_timing=True)
|
||||
|
||||
for _ in range(warmup_times):
|
||||
merge_attn_states_triton(output_ref_triton, prefix_output, prefix_lse,
|
||||
suffix_output, suffix_lse,
|
||||
output_lse_ref_triton)
|
||||
torch.cuda.synchronize()
|
||||
|
||||
for _ in range(repeat_times):
|
||||
start.record()
|
||||
merge_attn_states_triton(output_ref_triton, prefix_output, prefix_lse,
|
||||
suffix_output, suffix_lse,
|
||||
output_lse_ref_triton)
|
||||
end.record()
|
||||
torch.cuda.synchronize()
|
||||
total_time_triton_kernel += start.elapsed_time(end)
|
||||
|
||||
avg_time_triton_kernel = total_time_triton_kernel / repeat_times
|
||||
|
||||
# 2. Run the CUDA kernel
|
||||
total_time_cuda_kernel = 0
|
||||
output_cuda = output.clone()
|
||||
output_lse_cuda = output_lse.clone()
|
||||
|
||||
for _ in range(warmup_times):
|
||||
merge_attn_states_cuda(output_cuda, prefix_output, prefix_lse,
|
||||
suffix_output, suffix_lse, output_lse_cuda)
|
||||
torch.cuda.synchronize()
|
||||
|
||||
for _ in range(repeat_times):
|
||||
start.record()
|
||||
merge_attn_states_cuda(output_cuda, prefix_output, prefix_lse,
|
||||
suffix_output, suffix_lse, output_lse_cuda)
|
||||
end.record()
|
||||
torch.cuda.synchronize()
|
||||
total_time_cuda_kernel += start.elapsed_time(end)
|
||||
|
||||
avg_time_cuda_kernel = total_time_cuda_kernel / repeat_times
|
||||
|
||||
# 3. Performance compare
|
||||
performance_improved = avg_time_triton_kernel / avg_time_cuda_kernel
|
||||
print(f" Torch time: {avg_time_torch_kernel:.6f}ms")
|
||||
print(f"Triton time: {avg_time_triton_kernel:.6f}ms")
|
||||
print(f" CUDA time: {avg_time_cuda_kernel:.6f}ms, "
|
||||
f"Performance: {performance_improved:.5f}x")
|
||||
print("-" * 100)
|
||||
|
||||
# 4. Correctness compare
|
||||
# Liger Kernel: Efficient Triton Kernels for LLM Training
|
||||
# https://arxiv.org/pdf/2410.10989, 3.3 Correctness
|
||||
# use rtol = 1e-2 for bfloat16.
|
||||
rtol = 1e-2 if output_dtype == torch.bfloat16 else 1e-3
|
||||
|
||||
def diff(a: torch.Tensor, b: torch.Tensor):
|
||||
max_diff = torch.max(torch.abs(a.float() - b.float()))
|
||||
return max_diff
|
||||
|
||||
# Use Triton output as reference because we want to replace
|
||||
# the Triton kernel with custom CUDA kernel for merge attn
|
||||
# states operation.
|
||||
output_ref = output_ref_triton
|
||||
output_lse_ref = output_lse_ref_triton
|
||||
torch.testing.assert_close(output_cuda.float(),
|
||||
output_ref.float(),
|
||||
atol=1e-3,
|
||||
rtol=rtol)
|
||||
print("Output all match, max abs diff:")
|
||||
print(f"(Triton vs Torch) : {diff(output_torch, output_ref)}")
|
||||
print(f" (CUDA vs Torch) : {diff(output_torch, output_cuda)}")
|
||||
print(f" (CUDA vs Triton): {diff(output_ref, output_cuda)}")
|
||||
print("-" * 100)
|
||||
|
||||
torch.testing.assert_close(output_lse_cuda.float(),
|
||||
output_lse_ref.float(),
|
||||
atol=1e-3,
|
||||
rtol=rtol)
|
||||
print("Output LSE all match, max abs diff:")
|
||||
print(f"(Triton vs Torch) : {diff(output_lse_torch, output_lse_ref)}")
|
||||
print(f" (CUDA vs Torch) : {diff(output_lse_torch, output_lse_cuda)}")
|
||||
print(f" (CUDA vs Triton): {diff(output_lse_ref, output_lse_cuda)}")
|
||||
print("-" * 100)
|
||||
|
||||
print("All output values test passed! All inf values "
|
||||
"are correctly replaced with -inf.")
|
||||
print("-" * 100)
|
||||
|
||||
device = current_platform.get_device_name()
|
||||
all_case_info.append(
|
||||
(NUM_TOKENS, NUM_HEADS, HEAD_SIZE, output_dtype, device,
|
||||
avg_time_torch_kernel, avg_time_triton_kernel, avg_time_cuda_kernel,
|
||||
performance_improved))
|
||||
if len(all_case_info) == (len(NUM_BATCH_TOKENS) * len(HEAD_SIZES) *
|
||||
len(NUM_QUERY_HEADS) * len(DTYPES)):
|
||||
generate_markdown_table()
|
||||
@ -256,3 +256,15 @@ def run_with_both_engines_lora(request, monkeypatch):
|
||||
monkeypatch.setenv('VLLM_USE_V1', '0')
|
||||
|
||||
yield
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def reset_default_device():
|
||||
"""
|
||||
Some tests, such as `test_punica_ops.py`, explicitly set the
|
||||
default device, which can affect subsequent tests. Adding this fixture
|
||||
helps avoid this problem.
|
||||
"""
|
||||
original_device = torch.get_default_device()
|
||||
yield
|
||||
torch.set_default_device(original_device)
|
||||
|
||||
@ -73,7 +73,6 @@ def test_baichuan_tensor_parallel_equality(baichuan_lora_files,
|
||||
max_num_seqs=16,
|
||||
max_loras=4,
|
||||
max_lora_rank=64,
|
||||
tensor_parallel_size=1,
|
||||
trust_remote_code=True,
|
||||
fully_sharded_loras=fully_sharded)
|
||||
output_tp1 = do_sample(llm_tp1, baichuan_lora_files, lora_id=1)
|
||||
|
||||
@ -61,7 +61,6 @@ def test_chatglm3_lora(chatglm3_lora_files):
|
||||
enable_lora=True,
|
||||
max_loras=4,
|
||||
max_lora_rank=64,
|
||||
tensor_parallel_size=1,
|
||||
trust_remote_code=True,
|
||||
enable_chunked_prefill=True)
|
||||
|
||||
|
||||
@ -65,7 +65,7 @@ VOCAB_PARALLEL_EMBEDDING_TEST_NUM_RANDOM_SEEDS = 128
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def clean_cache():
|
||||
def clean_cache_reset_device(reset_default_device):
|
||||
# Release any memory we might be holding on to. CI runs OOMs otherwise.
|
||||
from vllm.lora.ops.triton_ops.utils import (_LORA_A_PTR_DICT,
|
||||
_LORA_B_PTR_DICT)
|
||||
|
||||
@ -88,7 +88,6 @@ def test_llama_lora(sql_lora_files):
|
||||
# also test odd max_num_seqs
|
||||
max_num_seqs=13,
|
||||
max_loras=4,
|
||||
tensor_parallel_size=1,
|
||||
enable_chunked_prefill=True)
|
||||
generate_and_test(llm, sql_lora_files)
|
||||
|
||||
|
||||
@ -13,6 +13,11 @@ from vllm.platforms import current_platform
|
||||
from .utils import PunicaTensors, assert_close, generate_data_for_nslices
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def reset_device(reset_default_device):
|
||||
pass
|
||||
|
||||
|
||||
# Utility shrink and expand operations used as reference implementations.
|
||||
def sgmv_shrink_for_nslices(
|
||||
nslices: int, inputs_tensor: torch.Tensor,
|
||||
|
||||
@ -78,12 +78,7 @@ def do_sample(llm: vllm.LLM,
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("tp_size", [1])
|
||||
def test_quant_model_lora(tinyllama_lora_files, num_gpus_available, model,
|
||||
tp_size):
|
||||
if num_gpus_available < tp_size and \
|
||||
tp_size > 1 and current_platform.is_cuda_alike():
|
||||
pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
|
||||
def test_quant_model_lora(tinyllama_lora_files, model):
|
||||
|
||||
llm = vllm.LLM(
|
||||
model=model.model_path,
|
||||
@ -91,7 +86,6 @@ def test_quant_model_lora(tinyllama_lora_files, num_gpus_available, model,
|
||||
max_num_seqs=16,
|
||||
max_loras=4,
|
||||
max_model_len=400,
|
||||
tensor_parallel_size=tp_size,
|
||||
gpu_memory_utilization=0.2, #avoid OOM
|
||||
quantization=model.quantization,
|
||||
trust_remote_code=True,
|
||||
@ -185,7 +179,6 @@ def test_quant_model_tp_equality(tinyllama_lora_files, num_gpus_available,
|
||||
enable_lora=True,
|
||||
max_num_seqs=16,
|
||||
max_loras=4,
|
||||
tensor_parallel_size=1,
|
||||
gpu_memory_utilization=0.2, #avoid OOM
|
||||
quantization=model.quantization,
|
||||
trust_remote_code=True,
|
||||
|
||||
@ -53,7 +53,6 @@ def test_ilama_lora(ilama_lora_files):
|
||||
enable_lora=True,
|
||||
max_loras=4,
|
||||
max_lora_rank=16,
|
||||
tensor_parallel_size=1,
|
||||
trust_remote_code=True,
|
||||
enable_chunked_prefill=True)
|
||||
|
||||
|
||||
@ -9,11 +9,13 @@ from typing import NamedTuple
|
||||
|
||||
import pytest
|
||||
from huggingface_hub import hf_hub_download
|
||||
from pytest import MarkDecorator
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
from tests.quantization.utils import is_quant_method_supported
|
||||
|
||||
from ....conftest import VllmRunner
|
||||
from ....utils import multi_gpu_test
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
os.environ["TOKENIZERS_PARALLELISM"] = "true"
|
||||
@ -25,6 +27,7 @@ class GGUFTestConfig(NamedTuple):
|
||||
original_model: str
|
||||
gguf_repo: str
|
||||
gguf_filename: str
|
||||
marks: list[MarkDecorator] = []
|
||||
|
||||
@property
|
||||
def gguf_model(self):
|
||||
@ -35,6 +38,7 @@ LLAMA_CONFIG = GGUFTestConfig(
|
||||
original_model="meta-llama/Llama-3.2-1B-Instruct",
|
||||
gguf_repo="bartowski/Llama-3.2-1B-Instruct-GGUF",
|
||||
gguf_filename="Llama-3.2-1B-Instruct-IQ4_XS.gguf",
|
||||
marks=[pytest.mark.quant_model],
|
||||
)
|
||||
|
||||
QWEN2_CONFIG = GGUFTestConfig(
|
||||
@ -81,34 +85,24 @@ MODELS = [
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.skipif(not is_quant_method_supported("gguf"),
|
||||
reason="gguf is not supported on this GPU type.")
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
@pytest.mark.parametrize("max_tokens", [32])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
@pytest.mark.parametrize("tp_size", [1, 2])
|
||||
def test_models(
|
||||
num_gpus_available: int,
|
||||
def check_model_outputs(
|
||||
vllm_runner: type[VllmRunner],
|
||||
example_prompts: list[str],
|
||||
prompts: list[str],
|
||||
model: GGUFTestConfig,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
tp_size: int,
|
||||
) -> None:
|
||||
if num_gpus_available < tp_size:
|
||||
pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
|
||||
|
||||
):
|
||||
tokenizer = AutoTokenizer.from_pretrained(model.original_model)
|
||||
if tokenizer.chat_template is not None:
|
||||
messages = [[{
|
||||
'role': 'user',
|
||||
'content': prompt
|
||||
}] for prompt in example_prompts]
|
||||
example_prompts = tokenizer.apply_chat_template(
|
||||
messages, tokenize=False, add_generation_prompt=True)
|
||||
}] for prompt in prompts]
|
||||
prompts = tokenizer.apply_chat_template(messages,
|
||||
tokenize=False,
|
||||
add_generation_prompt=True)
|
||||
|
||||
# Run gguf model.
|
||||
with vllm_runner(model_name=model.gguf_model,
|
||||
@ -118,17 +112,19 @@ def test_models(
|
||||
max_model_len=MAX_MODEL_LEN,
|
||||
tensor_parallel_size=tp_size) as gguf_model:
|
||||
gguf_outputs = gguf_model.generate_greedy_logprobs(
|
||||
example_prompts[:-1], max_tokens, num_logprobs)
|
||||
prompts[:-1], max_tokens, num_logprobs)
|
||||
|
||||
# Run unquantized model.
|
||||
# Should run with tp=1, otherwise the test will stuck at
|
||||
# nccl initialization.
|
||||
with vllm_runner(
|
||||
model_name=model.original_model,
|
||||
enforce_eager=True, # faster tests
|
||||
dtype=dtype,
|
||||
max_model_len=MAX_MODEL_LEN,
|
||||
tensor_parallel_size=tp_size) as original_model:
|
||||
tensor_parallel_size=1) as original_model:
|
||||
original_outputs = original_model.generate_greedy_logprobs(
|
||||
example_prompts[:-1], max_tokens, num_logprobs)
|
||||
prompts[:-1], max_tokens, num_logprobs)
|
||||
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=original_outputs,
|
||||
@ -136,3 +132,47 @@ def test_models(
|
||||
name_0="original",
|
||||
name_1="gguf",
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.skipif(not is_quant_method_supported("gguf"),
|
||||
reason="gguf is not supported on this GPU type.")
|
||||
@pytest.mark.parametrize("model", [
|
||||
pytest.param(test_config, marks=test_config.marks)
|
||||
for test_config in MODELS
|
||||
])
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
@pytest.mark.parametrize("max_tokens", [32])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
@pytest.mark.parametrize("tp_size", [1])
|
||||
def test_models(
|
||||
vllm_runner: type[VllmRunner],
|
||||
example_prompts: list[str],
|
||||
model: GGUFTestConfig,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
tp_size: int,
|
||||
) -> None:
|
||||
check_model_outputs(vllm_runner, example_prompts, model, dtype, max_tokens,
|
||||
num_logprobs, tp_size)
|
||||
|
||||
|
||||
@pytest.mark.skipif(not is_quant_method_supported("gguf"),
|
||||
reason="gguf is not supported on this GPU type.")
|
||||
@pytest.mark.parametrize("model", [LLAMA_CONFIG])
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
@pytest.mark.parametrize("max_tokens", [8])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
@pytest.mark.parametrize("tp_size", [2])
|
||||
@multi_gpu_test(num_gpus=2)
|
||||
def test_distributed(
|
||||
vllm_runner: type[VllmRunner],
|
||||
example_prompts: list[str],
|
||||
model: GGUFTestConfig,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
tp_size: int,
|
||||
) -> None:
|
||||
check_model_outputs(vllm_runner, example_prompts, model, dtype, max_tokens,
|
||||
num_logprobs, tp_size)
|
||||
|
||||
@ -330,9 +330,8 @@ VLM_TEST_SETTINGS = {
|
||||
max_num_seqs=4,
|
||||
dtype="bfloat16",
|
||||
auto_cls=AutoModelForImageTextToText,
|
||||
tensor_parallel_size=8,
|
||||
vllm_runner_kwargs={"gpu_memory_utilization": 0.8},
|
||||
marks=multi_gpu_marks(num_gpus=8),
|
||||
tensor_parallel_size=4,
|
||||
marks=multi_gpu_marks(num_gpus=4),
|
||||
),
|
||||
"llava_next": VLMTestInfo(
|
||||
models=["llava-hf/llava-v1.6-mistral-7b-hf"],
|
||||
@ -426,23 +425,20 @@ VLM_TEST_SETTINGS = {
|
||||
max_num_seqs=2,
|
||||
patch_hf_runner=model_utils.molmo_patch_hf_runner,
|
||||
),
|
||||
# Tests for phi3v currently live in another file because of a bug in
|
||||
# transformers. Once this issue is fixed, we can enable them here instead.
|
||||
# https://github.com/huggingface/transformers/issues/34307
|
||||
# "phi3v": VLMTestInfo(
|
||||
# models=["microsoft/Phi-3.5-vision-instruct"],
|
||||
# test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||
# prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|end|>\n<|assistant|>\n", # noqa: E501
|
||||
# img_idx_to_prompt=lambda idx: f"<|image_{idx}|>\n",
|
||||
# max_model_len=4096,
|
||||
# max_num_seqs=2,
|
||||
# task="generate",
|
||||
# # use eager mode for hf runner since phi3v didn't work with flash_attn
|
||||
# hf_model_kwargs={"_attn_implementation": "eager"},
|
||||
# use_tokenizer_eos=True,
|
||||
# vllm_output_post_proc=model_utils.phi3v_vllm_to_hf_output,
|
||||
# num_logprobs=10,
|
||||
# ),
|
||||
"phi3v": VLMTestInfo(
|
||||
models=["microsoft/Phi-3.5-vision-instruct"],
|
||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||
prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|end|>\n<|assistant|>\n", # noqa: E501
|
||||
img_idx_to_prompt=lambda idx: f"<|image_{idx}|>\n",
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
task="generate",
|
||||
# use sdpa mode for hf runner since phi3v didn't work with flash_attn
|
||||
hf_model_kwargs={"_attn_implementation": "sdpa"},
|
||||
use_tokenizer_eos=True,
|
||||
vllm_output_post_proc=model_utils.phi3v_vllm_to_hf_output,
|
||||
num_logprobs=10,
|
||||
),
|
||||
"pixtral_hf": VLMTestInfo(
|
||||
models=["nm-testing/pixtral-12b-FP8-dynamic"],
|
||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||
@ -494,6 +490,16 @@ VLM_TEST_SETTINGS = {
|
||||
patch_hf_runner=model_utils.skyworkr1v_patch_hf_runner,
|
||||
marks=[large_gpu_mark(min_gb=80)],
|
||||
),
|
||||
"smolvlm": VLMTestInfo(
|
||||
models=["HuggingFaceTB/SmolVLM2-2.2B-Instruct"],
|
||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||
prompt_formatter=lambda img_prompt:f"<|im_start|>User:{img_prompt}<end_of_utterance>\nAssistant:", # noqa: E501
|
||||
img_idx_to_prompt=lambda idx: "<image>",
|
||||
max_model_len=8192,
|
||||
max_num_seqs=2,
|
||||
auto_cls=AutoModelForImageTextToText,
|
||||
hf_output_post_proc=model_utils.smolvlm_trunc_hf_output,
|
||||
),
|
||||
### Tensor parallel / multi-gpu broadcast tests
|
||||
"chameleon-broadcast": VLMTestInfo(
|
||||
models=["facebook/chameleon-7b"],
|
||||
|
||||
@ -1,245 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
import os
|
||||
import re
|
||||
from typing import Optional
|
||||
|
||||
import pytest
|
||||
from packaging.version import Version
|
||||
from transformers import AutoTokenizer
|
||||
from transformers import __version__ as TRANSFORMERS_VERSION
|
||||
|
||||
from vllm.multimodal.image import rescale_image_size
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.sequence import SampleLogprobs
|
||||
|
||||
from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
|
||||
"stop_sign":
|
||||
"<|user|>\n<|image_1|>\nWhat's the content of the image?<|end|>\n<|assistant|>\n", # noqa: E501
|
||||
"cherry_blossom":
|
||||
"<|user|>\n<|image_1|>\nWhat is the season?<|end|>\n<|assistant|>\n",
|
||||
})
|
||||
HF_MULTIIMAGE_IMAGE_PROMPT = "<|user|>\n<|image_1|>\n<|image_2|>\nDescribe these images.<|end|>\n<|assistant|>\n" # noqa: E501
|
||||
|
||||
models = ["microsoft/Phi-3.5-vision-instruct"]
|
||||
|
||||
|
||||
def vllm_to_hf_output(vllm_output: tuple[list[int], str,
|
||||
Optional[SampleLogprobs]],
|
||||
model: str):
|
||||
"""Sanitize vllm output to be comparable with hf output."""
|
||||
_, output_str, out_logprobs = vllm_output
|
||||
|
||||
output_str_without_image = re.sub(r"(<\|image_\d+\|>)+", "", output_str)
|
||||
assert output_str_without_image[0] == " "
|
||||
output_str_without_image = output_str_without_image[1:]
|
||||
|
||||
hf_output_str = output_str_without_image + "<|end|><|endoftext|>"
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model)
|
||||
hf_output_ids = tokenizer.encode(output_str_without_image)
|
||||
assert hf_output_ids[0] == 1
|
||||
hf_output_ids = hf_output_ids[1:]
|
||||
|
||||
return hf_output_ids, hf_output_str, out_logprobs
|
||||
|
||||
|
||||
target_dtype = "half"
|
||||
|
||||
# ROCm Triton FA can run into shared memory issues with these models,
|
||||
# use other backends in the meantime
|
||||
# FIXME (mattwong, gshtrasb, hongxiayan)
|
||||
if current_platform.is_rocm():
|
||||
os.environ["VLLM_USE_TRITON_FLASH_ATTN"] = "0"
|
||||
|
||||
|
||||
def run_test(
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
inputs: list[tuple[list[str], PromptImageInput]],
|
||||
model: str,
|
||||
*,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
mm_limit: int,
|
||||
tensor_parallel_size: int,
|
||||
distributed_executor_backend: Optional[str] = None,
|
||||
):
|
||||
"""Inference result should be the same between hf and vllm.
|
||||
|
||||
All the image fixtures for the test are from IMAGE_ASSETS.
|
||||
For huggingface runner, we provide the PIL images as input.
|
||||
For vllm runner, we provide MultiModalDataDict objects
|
||||
and corresponding MultiModalConfig as input.
|
||||
Note, the text input is also adjusted to abide by vllm contract.
|
||||
The text output is sanitized to be able to compare with hf.
|
||||
"""
|
||||
# HACK - this is an attempted workaround for the following bug
|
||||
# https://github.com/huggingface/transformers/issues/34307
|
||||
from transformers import AutoImageProcessor # noqa: F401
|
||||
from transformers import AutoProcessor # noqa: F401
|
||||
|
||||
# Once the model repo is updated to 4.49, we should be able to run the
|
||||
# test in `test_models.py` without the above workaround
|
||||
if Version(TRANSFORMERS_VERSION) >= Version("4.49"):
|
||||
pytest.skip(f"`transformers=={TRANSFORMERS_VERSION}` installed, "
|
||||
"but `transformers<=4.49` is required to run this model. "
|
||||
"Reason: Cannot run HF implementation")
|
||||
|
||||
# NOTE: take care of the order. run vLLM first, and then run HF.
|
||||
# vLLM needs a fresh new process without cuda initialization.
|
||||
# if we run HF first, the cuda initialization will be done and it
|
||||
# will hurt multiprocessing backend with fork method (the default method).
|
||||
# max_model_len should be greater than image_feature_size
|
||||
with vllm_runner(model,
|
||||
task="generate",
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
dtype=dtype,
|
||||
limit_mm_per_prompt={"image": mm_limit},
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
enforce_eager=True) as vllm_model:
|
||||
vllm_outputs_per_case = [
|
||||
vllm_model.generate_greedy_logprobs(prompts,
|
||||
max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
images=images)
|
||||
for prompts, images in inputs
|
||||
]
|
||||
|
||||
# use eager mode for hf runner, since phi3_v didn't work with flash_attn
|
||||
hf_model_kwargs = {"_attn_implementation": "eager"}
|
||||
with hf_runner(model, dtype=dtype,
|
||||
model_kwargs=hf_model_kwargs) as hf_model:
|
||||
eos_token_id = hf_model.processor.tokenizer.eos_token_id
|
||||
hf_outputs_per_case = [
|
||||
hf_model.generate_greedy_logprobs_limit(prompts,
|
||||
max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
images=images,
|
||||
eos_token_id=eos_token_id)
|
||||
for prompts, images in inputs
|
||||
]
|
||||
|
||||
for hf_outputs, vllm_outputs in zip(hf_outputs_per_case,
|
||||
vllm_outputs_per_case):
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=[
|
||||
vllm_to_hf_output(vllm_output, model)
|
||||
for vllm_output in vllm_outputs
|
||||
],
|
||||
name_0="hf",
|
||||
name_1="vllm",
|
||||
)
|
||||
|
||||
|
||||
# Since we use _attn_implementation="eager" for hf_runner, there is more
|
||||
# significant numerical difference. The basic `logprobs=5` fails to pass.
|
||||
@pytest.mark.parametrize("model", models)
|
||||
@pytest.mark.parametrize(
|
||||
"size_factors",
|
||||
[
|
||||
# No image
|
||||
[],
|
||||
# Single-scale
|
||||
[1.0],
|
||||
# Single-scale, batched
|
||||
[1.0, 1.0, 1.0],
|
||||
# Multi-scale
|
||||
[0.25, 0.5, 1.0],
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("dtype", [target_dtype])
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
@pytest.mark.parametrize("num_logprobs", [10])
|
||||
def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
|
||||
dtype: str, max_tokens: int, num_logprobs: int) -> None:
|
||||
images = [asset.pil_image for asset in image_assets]
|
||||
|
||||
inputs_per_image = [(
|
||||
[prompt for _ in size_factors],
|
||||
[rescale_image_size(image, factor) for factor in size_factors],
|
||||
) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
|
||||
|
||||
run_test(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
inputs_per_image,
|
||||
model,
|
||||
dtype=dtype,
|
||||
max_tokens=max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
mm_limit=1,
|
||||
tensor_parallel_size=1,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", models)
|
||||
@pytest.mark.parametrize("dtype", [target_dtype])
|
||||
def test_regression_7840(hf_runner, vllm_runner, image_assets, model,
|
||||
dtype) -> None:
|
||||
images = [asset.pil_image for asset in image_assets]
|
||||
|
||||
inputs_regresion_7840 = [
|
||||
([prompt], [image]) for image, prompt in zip(images, HF_IMAGE_PROMPTS)
|
||||
]
|
||||
|
||||
# Regression test for #7840.
|
||||
run_test(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
inputs_regresion_7840,
|
||||
model,
|
||||
dtype=dtype,
|
||||
max_tokens=128,
|
||||
num_logprobs=10,
|
||||
mm_limit=1,
|
||||
tensor_parallel_size=1,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", models)
|
||||
@pytest.mark.parametrize(
|
||||
"size_factors",
|
||||
[
|
||||
# No image
|
||||
[],
|
||||
# Single-scale
|
||||
[1.0],
|
||||
# Single-scale, batched
|
||||
[1.0, 1.0, 1.0],
|
||||
# Multi-scale
|
||||
[0.25, 0.5, 1.0],
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("dtype", [target_dtype])
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
@pytest.mark.parametrize("num_logprobs", [10])
|
||||
def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
|
||||
size_factors, dtype: str, max_tokens: int,
|
||||
num_logprobs: int) -> None:
|
||||
images = [asset.pil_image for asset in image_assets]
|
||||
|
||||
inputs_per_case = [
|
||||
([HF_MULTIIMAGE_IMAGE_PROMPT for _ in size_factors],
|
||||
[[rescale_image_size(image, factor) for image in images]
|
||||
for factor in size_factors])
|
||||
]
|
||||
|
||||
run_test(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
inputs_per_case,
|
||||
model,
|
||||
dtype=dtype,
|
||||
max_tokens=max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
mm_limit=2,
|
||||
tensor_parallel_size=1,
|
||||
)
|
||||
@ -2,18 +2,22 @@
|
||||
|
||||
import os
|
||||
import re
|
||||
from collections.abc import Sequence
|
||||
from typing import Optional
|
||||
|
||||
import librosa
|
||||
import pytest
|
||||
from huggingface_hub import snapshot_download
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
from vllm.assets.image import ImageAsset
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.multimodal.image import rescale_image_size
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.sequence import SampleLogprobs
|
||||
|
||||
from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
|
||||
from ....conftest import (IMAGE_ASSETS, HfRunner, PromptAudioInput,
|
||||
PromptImageInput, VllmRunner)
|
||||
from ....utils import large_gpu_test
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
@ -29,6 +33,8 @@ model_path = snapshot_download("microsoft/Phi-4-multimodal-instruct")
|
||||
# Since the vision-lora and speech-lora co-exist with the base model,
|
||||
# we have to manually specify the path of the lora weights.
|
||||
vision_lora_path = os.path.join(model_path, "vision-lora")
|
||||
speech_question = os.path.join(model_path, "examples",
|
||||
"what_is_shown_in_this_image.wav")
|
||||
models = [model_path]
|
||||
|
||||
|
||||
@ -64,7 +70,8 @@ if current_platform.is_rocm():
|
||||
def run_test(
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
inputs: list[tuple[list[str], PromptImageInput]],
|
||||
inputs: Sequence[tuple[list[str], PromptImageInput,
|
||||
Optional[PromptAudioInput]]],
|
||||
model: str,
|
||||
*,
|
||||
max_model_len: int,
|
||||
@ -104,28 +111,49 @@ def run_test(
|
||||
enforce_eager=True,
|
||||
) as vllm_model:
|
||||
lora_request = LoRARequest("vision", 1, vision_lora_path)
|
||||
vllm_model.model.llm_engine.add_lora(lora_request=lora_request)
|
||||
vllm_outputs_per_case = [
|
||||
vllm_model.generate_greedy_logprobs(prompts,
|
||||
max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
images=images)
|
||||
for prompts, images in inputs
|
||||
images=images,
|
||||
audios=audios,
|
||||
lora_request=lora_request)
|
||||
for prompts, images, audios in inputs
|
||||
]
|
||||
|
||||
# use eager mode for hf runner, since phi3_v didn't work with flash_attn
|
||||
hf_model_kwargs = {"_attn_implementation": "eager"}
|
||||
hf_model_kwargs = {"_attn_implementation": "sdpa"}
|
||||
with hf_runner(model, dtype=dtype,
|
||||
model_kwargs=hf_model_kwargs) as hf_model:
|
||||
eos_token_id = hf_model.processor.tokenizer.eos_token_id
|
||||
|
||||
hf_processor = hf_model.processor
|
||||
eos_token_id = hf_processor.tokenizer.eos_token_id
|
||||
|
||||
def patch_hf_processor(*args,
|
||||
text="",
|
||||
images=None,
|
||||
audio=None,
|
||||
sampling_rate=None,
|
||||
**kwargs):
|
||||
audios = None
|
||||
if audio is not None and sampling_rate is not None:
|
||||
audios = [(audio, sampling_rate)]
|
||||
return hf_processor(*args,
|
||||
text=text,
|
||||
images=images,
|
||||
audios=audios,
|
||||
**kwargs)
|
||||
|
||||
hf_model.processor = patch_hf_processor
|
||||
|
||||
hf_outputs_per_case = [
|
||||
hf_model.generate_greedy_logprobs_limit(prompts,
|
||||
max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
images=images,
|
||||
audios=audios,
|
||||
eos_token_id=eos_token_id,
|
||||
num_logits_to_keep=0)
|
||||
for prompts, images in inputs
|
||||
for prompts, images, audios in inputs
|
||||
]
|
||||
|
||||
for hf_outputs, vllm_outputs in zip(hf_outputs_per_case,
|
||||
@ -138,8 +166,6 @@ def run_test(
|
||||
)
|
||||
|
||||
|
||||
# Since we use _attn_implementation="eager" for hf_runner, there is more
|
||||
# significant numerical difference. The basic `logprobs=5` fails to pass.
|
||||
@pytest.mark.parametrize("model", models)
|
||||
@pytest.mark.parametrize(
|
||||
"size_factors",
|
||||
@ -151,7 +177,7 @@ def run_test(
|
||||
# Single-scale, batched
|
||||
[1.0, 1.0, 1.0],
|
||||
# Multi-scale
|
||||
[0.7, 0.75, 1.0],
|
||||
[0.25, 0.5, 1.0],
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("dtype", [target_dtype])
|
||||
@ -166,6 +192,7 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
|
||||
inputs_per_image = [(
|
||||
[prompt for _ in size_factors],
|
||||
[rescale_image_size(image, factor) for factor in size_factors],
|
||||
None,
|
||||
) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
|
||||
|
||||
run_test(
|
||||
@ -201,17 +228,18 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
|
||||
@pytest.mark.parametrize("max_model_len", [10000])
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
@pytest.mark.parametrize("num_logprobs", [10])
|
||||
@pytest.mark.xfail(
|
||||
reason="Phi-4-MM multi-image inference is divergent with hf model.")
|
||||
def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
|
||||
size_factors, dtype: str, max_model_len: int,
|
||||
max_tokens: int, num_logprobs: int) -> None:
|
||||
images = [asset.pil_image for asset in image_assets]
|
||||
|
||||
inputs_per_case = [
|
||||
([HF_MULTIIMAGE_IMAGE_PROMPT for _ in size_factors],
|
||||
[[rescale_image_size(image, factor) for image in images]
|
||||
for factor in size_factors])
|
||||
(
|
||||
[HF_MULTIIMAGE_IMAGE_PROMPT for _ in size_factors],
|
||||
[[rescale_image_size(image, factor) for image in images]
|
||||
for factor in size_factors],
|
||||
None,
|
||||
),
|
||||
]
|
||||
|
||||
run_test(
|
||||
@ -226,3 +254,38 @@ def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
|
||||
mm_limit=2,
|
||||
tensor_parallel_size=1,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", models)
|
||||
@pytest.mark.parametrize("dtype", [target_dtype])
|
||||
@pytest.mark.parametrize("max_model_len", [10000])
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
@pytest.mark.parametrize("num_logprobs", [10])
|
||||
def test_vision_speech_models(hf_runner, vllm_runner, model, dtype: str,
|
||||
max_model_len: int, max_tokens: int,
|
||||
num_logprobs: int) -> None:
|
||||
|
||||
# use the example speech question so that the model outputs are reasonable
|
||||
audio = librosa.load(speech_question, sr=None)
|
||||
image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
|
||||
|
||||
inputs_vision_speech = [
|
||||
(
|
||||
["<|user|><|image_1|><|audio_1|><|end|><|assistant|>"],
|
||||
[image],
|
||||
[audio],
|
||||
),
|
||||
]
|
||||
|
||||
run_test(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
inputs_vision_speech,
|
||||
model,
|
||||
dtype=dtype,
|
||||
max_model_len=max_model_len,
|
||||
max_tokens=max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
mm_limit=1,
|
||||
tensor_parallel_size=1,
|
||||
)
|
||||
|
||||
@ -200,22 +200,14 @@ def test_chat(
|
||||
|
||||
|
||||
@large_gpu_test(min_gb=48)
|
||||
@pytest.mark.parametrize(
|
||||
"prompt,expected_ranges",
|
||||
[(_create_engine_inputs_hf(IMG_URLS[:1]), [{
|
||||
"offset": 11,
|
||||
"length": 494
|
||||
}]),
|
||||
(_create_engine_inputs_hf(IMG_URLS[1:4]), [{
|
||||
"offset": 11,
|
||||
"length": 266
|
||||
}, {
|
||||
"offset": 277,
|
||||
"length": 1056
|
||||
}, {
|
||||
"offset": 1333,
|
||||
"length": 418
|
||||
}])])
|
||||
@pytest.mark.parametrize("prompt,expected_ranges",
|
||||
[(_create_engine_inputs_hf(IMG_URLS[:1]),
|
||||
[PlaceholderRange(offset=11, length=494)]),
|
||||
(_create_engine_inputs_hf(IMG_URLS[1:4]), [
|
||||
PlaceholderRange(offset=11, length=266),
|
||||
PlaceholderRange(offset=277, length=1056),
|
||||
PlaceholderRange(offset=1333, length=418)
|
||||
])])
|
||||
def test_multi_modal_placeholders(vllm_runner, prompt,
|
||||
expected_ranges: list[PlaceholderRange],
|
||||
monkeypatch) -> None:
|
||||
|
||||
@ -204,6 +204,12 @@ def idefics3_trunc_hf_output(hf_output: RunnerOutput,
return output_ids, output_str, out_logprobs

def smolvlm_trunc_hf_output(hf_output: RunnerOutput,
model: str) -> RunnerOutput:
# Based on Idefics3
return idefics3_trunc_hf_output(hf_output, model)

def minicpmv_trunc_hf_output(hf_output: RunnerOutput,
model: str) -> RunnerOutput:
output_ids, output_str, out_logprobs = hf_output

@ -2,13 +2,15 @@
# ruff: noqa: E501
"""Compare the scoring outputs of HF and vLLM models.

Run `pytest tests/models/embedding/language/test_jina_reranker_v2.py`.
Run `pytest tests/models/embedding/language/test_jina.py`.
"""
import math

import pytest

MODELS = [
from tests.models.embedding.utils import check_embeddings_close

SCORING_MODELS = [
"jinaai/jina-reranker-v2-base-multilingual", # Roberta
]

@ -27,8 +29,21 @@ TEXTS_2 = [
"新しいメイクのトレンドは鮮やかな色と革新的な技術に焦点を当てています",
]

EMBEDDING_MODELS = [
"jinaai/jina-embeddings-v3",
]

@pytest.fixture(scope="module", params=MODELS)
EMBEDDING_PROMPTS = [
"Follow the white rabbit.", # English
"Sigue al conejo blanco.", # Spanish
"Suis le lapin blanc.", # French
"跟着白兔走。", # Chinese
"اتبع الأرنب الأبيض.", # Arabic
"Folge dem weißen Kaninchen.", # German
]

@pytest.fixture(scope="module", params=SCORING_MODELS)
def model_name(request):
yield request.param

@ -68,3 +83,46 @@ def test_llm_1_to_N(vllm_runner, hf_runner, model_name, dtype: str):

assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01)
assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01)

@pytest.fixture(scope="module", params=EMBEDDING_MODELS)
def emb_model_name(request):
yield request.param

def test_is_matryoshka(vllm_runner, emb_model_name):
with vllm_runner(emb_model_name, task="embed",
max_model_len=None) as vllm_model:
assert vllm_model.model.llm_engine.model_config.is_matryoshka

@pytest.mark.parametrize("model", EMBEDDING_MODELS)
@pytest.mark.parametrize("dtype", ["half"])
def test_embeddings(
hf_runner,
vllm_runner,
model,
dtype: str,
monkeypatch,
) -> None:

example_prompts = EMBEDDING_PROMPTS

with hf_runner(
model,
dtype=dtype,
is_sentence_transformer=True,
) as hf_model:
hf_outputs = hf_model.encode(example_prompts, task="text-matching")

with vllm_runner(model, task="embed", dtype=dtype,
max_model_len=None) as vllm_model:
vllm_outputs = vllm_model.encode(example_prompts)

check_embeddings_close(
embeddings_0_lst=hf_outputs,
embeddings_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
tol=1e-2,
)
@ -211,7 +211,7 @@ def _run_test(
# max_model_len should be greater than image_feature_size
with vllm_runner(model,
dtype=dtype,
max_model_len=4096,
max_model_len=8192,
max_num_seqs=3,
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
@ -422,7 +422,7 @@ def test_bnb_regression(
llm = LLM(
model=model,
dtype=dtype,
max_model_len=4096,
max_model_len=8192,
max_num_seqs=2,
quantization="bitsandbytes",
)
@ -475,7 +475,7 @@ def test_explicit_implicit_prompt(
llm = LLM(
model=model,
dtype=dtype,
max_model_len=4096,
max_model_len=8192,
max_num_seqs=2,
tensor_parallel_size=1,
)
@ -506,7 +506,7 @@ def test_regression(vllm_runner, image_assets, model, dtype, max_tokens,
with global_force_attn_backend_context_manager(attn_backend), vllm_runner(
model,
dtype=dtype,
max_model_len=4096,
max_model_len=8192,
max_num_seqs=2,
tensor_parallel_size=1,
limit_mm_per_prompt={"image":

@ -257,6 +257,7 @@ def _test_processing_correctness_mistral(
"h2oai/h2ovl-mississippi-800m",
"OpenGVLab/InternVL2-1B",
"HuggingFaceM4/Idefics3-8B-Llama3",
"HuggingFaceTB/SmolVLM2-2.2B-Instruct",
"meta-llama/Llama-4-Scout-17B-16E-Instruct",
"llava-hf/llava-1.5-7b-hf",
"llava-hf/llava-v1.6-mistral-7b-hf",

@ -71,29 +71,14 @@ def test_processor_override(
# image token offsets
img_locs = processed_inputs["mm_placeholders"].get("image", [])
assert len(img_locs) == num_imgs
assert [img_loc["offset"] for img_loc in img_locs] == \
assert [img_loc.offset for img_loc in img_locs] == \
[i for i, v in enumerate(prompt_token_ids) \
if v == config.boi_token_index]

# patch sizes and masks
assert prompt_token_ids.count(config.image_token_index) \
== sum(img_patch.sum() for img_patch in mm_kwargs["embed_is_patch"])
patch_token_id = vocab[hf_processor.img_patch_token]
num_patches = processed_inputs["prompt_token_ids"].count(patch_token_id)
mm_counts = {"image": num_imgs}
assert num_patches / num_imgs <= \
processor.info.get_mm_max_tokens_per_item(32768, mm_counts)["image"]
num_patches_per_chunk = processor.info.get_patch_per_chunk(
config.vision_config)
assert prompt_token_ids.count(config.image_token_index) \
== mm_kwargs["patches_per_image"].sum() * num_patches_per_chunk
assert mm_kwargs["pixel_values"].shape[0] \
== mm_kwargs["patches_per_image"].sum()

for embed_is_patch, aspect_ratio in zip(mm_kwargs["embed_is_patch"],
mm_kwargs["aspect_ratios"]):
assert embed_is_patch.shape[0] == \
len(tokenizer.encode(
hf_processor._prompt_split_image(
aspect_ratio, num_patches_per_chunk),
add_special_tokens=False))

@ -92,8 +92,8 @@ def _validate_image_prompt_replacements_one(
first_placeholder = image_placeholders[0]

# NOTE: There is a BOS token
assert first_placeholder["offset"] == 1
assert first_placeholder["length"] == (
assert first_placeholder.offset == 1
assert first_placeholder.length == (
len(processed_inputs["prompt_token_ids"]) - 1) // num_imgs

except Exception as exc:

@ -92,8 +92,8 @@ def _validate_image_prompt_replacements_one(

first_placeholder = image_placeholders[0]

assert first_placeholder["offset"] == 0
assert first_placeholder["length"] == len(
assert first_placeholder.offset == 0
assert first_placeholder.length == len(
processed_inputs["prompt_token_ids"]) // num_imgs
except Exception as exc:
failed_size_excs.append((image_size, exc))

65  tests/models/multimodal/processing/test_smolvlm.py  Normal file
@ -0,0 +1,65 @@
# SPDX-License-Identifier: Apache-2.0
"""Tests for smolvlm's multimodal preprocessing kwargs."""
import pytest
from transformers import SmolVLMConfig

from vllm.multimodal import MULTIMODAL_REGISTRY

from ....conftest import _ImageAssets
from ...utils import build_model_context

@pytest.mark.parametrize("model_id", ["HuggingFaceTB/SmolVLM2-2.2B-Instruct"])
# yapf: disable
@pytest.mark.parametrize(
("mm_processor_kwargs", "expected_toks_per_img"),
[
({"max_image_size": {"longest_edge": 384}}, 1377),
({"max_image_size": {"longest_edge": 768}}, 405),
])
# yapf: enable
@pytest.mark.parametrize("num_imgs", [1, 2])
@pytest.mark.parametrize("kwargs_on_init", [True, False])
def test_processor_override(
image_assets: _ImageAssets,
model_id: str,
mm_processor_kwargs: dict[str, object],
expected_toks_per_img: int,
num_imgs: int,
kwargs_on_init: bool,
):
"""Ensure Idefics3MultiModalProcessor handles num_crops properly."""
# Same as the previous test - don't initialize mm_processor_kwargs
# in this test and assume that the kwargs will be correctly expanded by
# the partial when calling the custom input processor.
ctx = build_model_context(
model_id,
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
limit_mm_per_prompt={"image": num_imgs},
)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs

# Build the image str / prompt based on the number of images we pass
placeholders = "<image>" if num_imgs == 1 else "\n".join(
f"Image-{i}: <image>\n" for i in range(1, num_imgs + 1))
prompt = f"<|im_start|>User:{placeholders}\n<end_of_utterance>\nAssistant:" # noqa: E501

# Build mm_data
image_size = ctx.get_hf_config(SmolVLMConfig).vision_config.image_size
dummy_image_size = (image_size * 4, image_size * 4)
dummy_image = image_assets[0].pil_image.resize(dummy_image_size)
mm_data = {"image": [dummy_image] * num_imgs}

processed_inputs = processor.apply(prompt, mm_data, hf_processor_mm_kwargs)

# Ensure the placeholders format are correct
hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs)
hf_processed_inputs = hf_processor(text=prompt, images=mm_data["image"])
assert processed_inputs["prompt_token_ids"] == hf_processed_inputs[
"input_ids"][0]

# Ensure we have the right number of placeholders per num_crops size
image_token_id = ctx.get_hf_config().image_token_id
img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
assert img_tok_count == expected_toks_per_img * num_imgs
@ -146,6 +146,11 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"Gemma3ForCausalLM": _HfExamplesInfo("google/gemma-3-1b-it",
min_transformers_version="4.50"),
"GlmForCausalLM": _HfExamplesInfo("THUDM/glm-4-9b-chat-hf"),
"Glm4ForCausalLM": _HfExamplesInfo(
"THUDM/GLM-4-32B-Chat-0414",
is_available_online=False,
min_transformers_version="4.52.dev0"
),
"GPT2LMHeadModel": _HfExamplesInfo("gpt2"),
"GPTBigCodeForCausalLM": _HfExamplesInfo("bigcode/starcoder"),
"GPTJForCausalLM": _HfExamplesInfo("EleutherAI/gpt-j-6b"),
@ -321,7 +326,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
extras={"fp8": "nm-testing/Mistral-Small-3.1-24B-Instruct-2503-FP8-dynamic"}), # noqa: E501
"MolmoForCausalLM": _HfExamplesInfo("allenai/Molmo-7B-D-0924",
max_transformers_version="4.48",
transformers_version_reason="Use of private method which no longer exists.", # noqa: E501
transformers_version_reason="Incorrectly-detected `tensorflow` import.", # noqa: E501
extras={"olmo": "allenai/Molmo-7B-O-0924"}, # noqa: E501
trust_remote_code=True),
"NVLM_D": _HfExamplesInfo("nvidia/NVLM-D-72B",
@ -330,6 +335,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
extras={"v2": "google/paligemma2-3b-ft-docci-448"}), # noqa: E501
"Phi3VForCausalLM": _HfExamplesInfo("microsoft/Phi-3-vision-128k-instruct",
trust_remote_code=True,
max_transformers_version="4.48",
transformers_version_reason="Use of deprecated imports which have been removed.", # noqa: E501
extras={"phi3.5": "microsoft/Phi-3.5-vision-instruct"}), # noqa: E501
"Phi4MMForCausalLM": _HfExamplesInfo("microsoft/Phi-4-multimodal-instruct",
trust_remote_code=True),
@ -344,9 +351,9 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"Qwen2_5_VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2.5-VL-3B-Instruct", # noqa: E501
min_transformers_version="4.49"), # noqa: E501
"SkyworkR1VChatModel": _HfExamplesInfo("Skywork/Skywork-R1V-38B"),
"SmolVLMForConditionalGeneration": _HfExamplesInfo("HuggingFaceTB/SmolVLM2-2.2B-Instruct"), # noqa: E501
"UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_5-llama-3_2-1b", # noqa: E501
trust_remote_code=True,
max_transformers_version="4.50"),
trust_remote_code=True),
# [Encoder-decoder]
# Florence-2 uses BartFastTokenizer which can't be loaded from AutoTokenizer
# Therefore, we borrow the BartTokenizer from the original Bart model
@ -367,6 +374,10 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
"DeepSeekMTPModel": _HfExamplesInfo("luccafong/deepseek_mtp_main_random",
speculative_model="luccafong/deepseek_mtp_draft_random", # noqa: E501
trust_remote_code=True),
"EagleLlamaForCausalLM": _HfExamplesInfo("yuhuili/EAGLE-LLaMA3-Instruct-8B",
trust_remote_code=True,
speculative_model="yuhuili/EAGLE-LLaMA3-Instruct-8B",
tokenizer="meta-llama/Meta-Llama-3-8B-Instruct"), # noqa: E501
}

_TRANSFORMERS_MODELS = {

@ -785,6 +785,7 @@ def test_find_update_tokens(
item_idx=0,
start_idx=6,
tokens=[32000, 32000],
is_embed=None,
),
],
"pattern_4": [
@ -793,6 +794,7 @@ def test_find_update_tokens(
item_idx=0,
start_idx=3,
tokens=[32000],
is_embed=None,
),
],
}
@ -807,12 +809,14 @@ def test_find_update_tokens(
item_idx=0,
start_idx=1,
tokens=[32000, 32000],
is_embed=None,
),
PlaceholderFeaturesInfo(
modality="pattern_1",
item_idx=1,
start_idx=5,
tokens=[32000, 32000],
is_embed=None,
),
],
"pattern_3": [
@ -821,6 +825,7 @@ def test_find_update_tokens(
item_idx=0,
start_idx=7,
tokens=[1550, 918, 1550],
is_embed=None,
),
],
# No match for pattern_4 as it has lower priority than pattern_1
@ -835,12 +840,14 @@ def test_find_update_tokens(
item_idx=0,
start_idx=1,
tokens=[32000, 32000],
is_embed=None,
),
PlaceholderFeaturesInfo(
modality="pattern_1",
item_idx=1,
start_idx=3,
tokens=[32000, 32000],
is_embed=None,
),
],
"pattern_4": [
@ -849,6 +856,7 @@ def test_find_update_tokens(
item_idx=0,
start_idx=5,
tokens=[32000],
is_embed=None,
),
],
"pattern_3": [
@ -857,6 +865,7 @@ def test_find_update_tokens(
item_idx=0,
start_idx=6,
tokens=[1550, 918, 1550],
is_embed=None,
),
],
}

@ -4,17 +4,28 @@
Run `pytest tests/quantization/test_quark.py`.
"""

import torch
import pytest

from vllm.model_executor.layers.quantization.quark.quark import ( # noqa: E501
QuarkLinearMethod, QuarkW8A8Fp8)
QuarkLinearMethod, QuarkW8A8Fp8, QuarkW8A8Int8)
from vllm.platforms import current_platform

def test_quark_fp8(vllm_runner, monkeypatch):
# vllm_runner.apply_model() relies on V0 internals.
monkeypatch.setenv("VLLM_USE_V1", "0")
@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch):
"""
This module relies on V0 internals, so set VLLM_USE_V1=0.
"""
monkeypatch.setenv('VLLM_USE_V1', '0')

@pytest.mark.parametrize('kv_cache_dtype', ['auto', 'fp8'])
@pytest.mark.parametrize('tp', [1])
def test_quark_fp8_w_per_tensor_a_per_tensor(vllm_runner, kv_cache_dtype, tp):
model_path = "amd/Llama-3.1-8B-Instruct-FP8-KV-Quark-test"
with vllm_runner(model_path) as llm:
with vllm_runner(model_path,
kv_cache_dtype=kv_cache_dtype,
tensor_parallel_size=tp) as llm:

def check_model(model):
layer = model.model.layers[0]
@ -26,11 +37,29 @@ def test_quark_fp8(vllm_runner, monkeypatch):

if isinstance(qkv_proj.scheme, QuarkW8A8Fp8):
assert len(qkv_proj.input_scale.shape) == 0
assert qkv_proj.weight.dtype is torch.float8_e4m3fn
#assert qkv_proj.weight.dtype is torch.float8_e4m3fnuz
assert qkv_proj.weight.dtype is current_platform.fp8_dtype()
assert len(qkv_proj.weight_scale.shape) == 0

llm.apply_model(check_model)

output = llm.generate_greedy("Hello my name is", max_tokens=20)
assert output

@pytest.mark.parametrize('tp', [1])
def test_quark_int8_w_per_tensor_a_per_tensor(vllm_runner, tp):
model_path = "amd/Llama-3.1-8B-Instruct-w-int8-a-int8-sym-test"
with vllm_runner(model_path, tensor_parallel_size=tp) as llm:

def check_model(model):
layer = model.model.layers[0]

qkv_proj = layer.self_attn.qkv_proj

assert isinstance(qkv_proj.quant_method, QuarkLinearMethod)
assert isinstance(qkv_proj.scheme, QuarkW8A8Int8)

llm.apply_model(check_model)

output = llm.generate_greedy("Hello my name is", max_tokens=20)
assert output

25  tests/quantization/test_torchao.py  Normal file
@ -0,0 +1,25 @@
# SPDX-License-Identifier: Apache-2.0
import importlib.metadata
import importlib.util

import pytest

DTYPE = ["bfloat16"]

TORCHAO_AVAILABLE = importlib.util.find_spec("torchao") is not None

@pytest.mark.skipif(not TORCHAO_AVAILABLE, reason="torchao is not available")
def test_pre_quantized_model(vllm_runner):
with vllm_runner("drisspg/float8_dynamic_act_float8_weight-opt-125m",
quantization="torchao",
dtype="bfloat16",
enforce_eager=True) as llm:
output = llm.generate_greedy(["The capital of France is"],
max_tokens=32)
assert output
print(output)

if __name__ == "__main__":
pytest.main([__file__])
@ -1,7 +1,14 @@
# SPDX-License-Identifier: Apache-2.0
"""Tests for the SamplingParams class.
"""

import pytest

from vllm import SamplingParams
from vllm.config import ModelConfig
from vllm.entrypoints.openai.protocol import ChatCompletionRequest

MODEL_NAME = "Qwen/Qwen1.5-7B"

def test_max_tokens_none():
@ -9,6 +16,74 @@ def test_max_tokens_none():
SamplingParams(temperature=0.01, top_p=0.1, max_tokens=None)

if __name__ == "__main__":
import pytest
pytest.main([__file__])
@pytest.fixture(scope="module")
def model_config():
return ModelConfig(
MODEL_NAME,
task="auto",
tokenizer=MODEL_NAME,
tokenizer_mode="auto",
trust_remote_code=False,
seed=0,
dtype="float16",
revision=None,
)

@pytest.fixture(scope="module")
def default_max_tokens():
return 4096

def test_sampling_params_from_request_with_no_guided_decoding_backend(
model_config, default_max_tokens):
# guided_decoding_backend is not present at request level
request = ChatCompletionRequest.model_validate({
'messages': [{
'role': 'user',
'content': 'Hello'
}],
'model':
MODEL_NAME,
'response_format': {
'type': 'json_object',
},
})

sampling_params = request.to_sampling_params(
default_max_tokens,
model_config.logits_processor_pattern,
)
# we do not expect any backend to be present and the default
# guided_decoding_backend at engine level will be used.
assert sampling_params.guided_decoding.backend is None

@pytest.mark.parametrize("request_level_guided_decoding_backend,expected",
[("xgrammar", "xgrammar"),
("lm-format-enforcer", "lm-format-enforcer"),
("outlines", "outlines")])
def test_sampling_params_from_request_with_guided_decoding_backend(
request_level_guided_decoding_backend: str, expected: str,
model_config, default_max_tokens):

request = ChatCompletionRequest.model_validate({
'messages': [{
'role': 'user',
'content': 'Hello'
}],
'model':
MODEL_NAME,
'response_format': {
'type': 'json_object',
},
'guided_decoding_backend':
request_level_guided_decoding_backend,
})

sampling_params = request.to_sampling_params(
default_max_tokens,
model_config.logits_processor_pattern,
)
# backend correctly identified in resulting sampling_params
assert sampling_params.guided_decoding.backend == expected

@ -47,12 +47,10 @@ def test_filter_subtensors():

@pytest.fixture(scope="module")
def llama_3p2_1b_files():
with TemporaryDirectory() as cache_dir:
input_dir = snapshot_download("meta-llama/Llama-3.2-1B-Instruct",
cache_dir=cache_dir,
ignore_patterns=["*.bin*", "original/*"])
input_dir = snapshot_download("meta-llama/Llama-3.2-1B-Instruct",
ignore_patterns=["*.bin*", "original/*"])

yield input_dir
yield input_dir

def _run_writer(input_dir, output_dir, weights_patterns, **kwargs):
@ -64,9 +62,9 @@ def _run_writer(input_dir, output_dir, weights_patterns, **kwargs):

# Copy metadata files to output directory
for file in os.listdir(input_dir):
if not any(
file.endswith(ext) and not os.path.isdir(file)
for ext in weights_patterns):
if os.path.isdir(os.path.join(input_dir, file)):
continue
if not any(file.endswith(ext) for ext in weights_patterns):
shutil.copy(f"{input_dir}/{file}", output_dir)

@ -81,7 +79,8 @@ def _run_generate(input_dir, queue: mp.Queue, **kwargs):
@pytest.mark.parametrize("enable_lora", [False, True])
@pytest.mark.parametrize("tp_size", [1, 2])
def test_sharded_state_loader(enable_lora, tp_size, num_gpus_available,
llama_3p2_1b_files):
llama_3p2_1b_files,
monkeypatch: pytest.MonkeyPatch):
if num_gpus_available < tp_size:
pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")

@ -89,6 +88,8 @@ def test_sharded_state_loader(enable_lora, tp_size, num_gpus_available,
gpu_memory_utilization = 0.8
input_dir = llama_3p2_1b_files
ctx = mp.get_context("spawn")
# The interface in v1 engine has changed, run in v1 engine will hang.
monkeypatch.setenv("VLLM_USE_V1", "0")

# Run in separate processes for memory & CUDA isolation
with TemporaryDirectory() as output_dir:

@ -44,7 +44,7 @@ def test_tpu_compilation():
assert generated_text.startswith(answer)

compiled_codes = sorted(
glob.glob(os.path.join(temp_dir, "__transformed_code*.py")))
glob.glob(os.path.join(temp_dir, "__transformed_code*for_forward.py")))

for i, compiled_code in enumerate(compiled_codes):
print("{} file: {}".format(i + 1, compiled_code))
@ -52,15 +52,21 @@ def test_tpu_compilation():
# We should only trigger Dynamo compilation 2 times:
# 1. Forward pass without kv_caches
# 2. Forward pass with kv_caches
# Check we have 4 compiled codes
# Check we have 2 compiled codes
assert len(compiled_codes) == 2

kv_cache_prefix = "kv_cache"
attn_prefix = "ragged_paged_attention"

def extract_compiled_index(s):
parts = s.replace(".", "_").split("_")
numbers = [int(part) for part in parts if part.isdigit()]
return numbers[0]

# Check all the compilations are as expected
compiled_fns = sorted(
glob.glob(os.path.join(temp_dir, "__compiled_fn*Captured*.py")))
compiled_fns = sorted(glob.glob(
os.path.join(temp_dir, "__compiled_fn*Captured*.py")),
key=lambda s: extract_compiled_index(s))

for i, compiled_fn in enumerate(compiled_fns):
print("{} file: {}".format(i + 1, compiled_fn))

@ -3,14 +3,16 @@
import pytest
import torch

from vllm.multimodal.inputs import MultiModalKwargs
from vllm.config import ModelConfig, SchedulerConfig, VllmConfig
from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange
from vllm.sampling_params import SamplingParams
from vllm.utils import sha256
from vllm.utils import GiB_bytes, sha256
# disable yapf here as it formats differently than isort such that both fail
# yapf: disable
from vllm.v1.core.kv_cache_utils import (NONE_HASH, BlockHashType,
FreeKVCacheBlockQueue, KVCacheBlock,
PrefixCachingMetrics,
estimate_max_model_len,
generate_block_hash_extra_keys,
hash_block_tokens,
hash_request_tokens,
@ -158,13 +160,10 @@ def test_generate_block_hash_extra_keys():
request = make_request(
request_id=0,
prompt_token_ids=[_ for _ in range(20)],
mm_positions=[{
"offset": 0,
"length": 5
}, {
"offset": 10,
"length": 5
}],
mm_positions=[
PlaceholderRange(offset=0, length=5),
PlaceholderRange(offset=10, length=5),
],
mm_hashes=["hash1", "hash2"],
)

@ -222,13 +221,10 @@ def test_hash_request_tokens(hash_fn):
request = make_request(
request_id=0,
prompt_token_ids=[_ for _ in range(6)],
mm_positions=[{
"offset": 0,
"length": 3
}, {
"offset": 3,
"length": 3
}],
mm_positions=[
PlaceholderRange(offset=0, length=3),
PlaceholderRange(offset=3, length=3),
],
mm_hashes=["hash1", "hash2"],
)

@ -253,25 +249,19 @@ def test_hash_tokens_different_mm_input(hash_fn):
request1 = make_request(
request_id=0,
prompt_token_ids=[_ for _ in range(6)],
mm_positions=[{
"offset": 0,
"length": 3
}, {
"offset": 3,
"length": 3
}],
mm_positions=[
PlaceholderRange(offset=0, length=3),
PlaceholderRange(offset=3, length=3),
],
mm_hashes=["hash1", "hash2"],
)
request2 = make_request(
request_id=1,
prompt_token_ids=[_ for _ in range(6)],
mm_positions=[{
"offset": 0,
"length": 3
}, {
"offset": 3,
"length": 3
}],
mm_positions=[
PlaceholderRange(offset=0, length=3),
PlaceholderRange(offset=3, length=3),
],
mm_hashes=["hash3", "hash2"],
)
block_size = 3
@ -438,3 +428,45 @@ def test_unify_kv_cache_configs():
]
with pytest.raises(AssertionError):
unify_kv_cache_configs(diff_kv_cache_config)

@pytest.mark.parametrize(
("model_id", "max_model_len", "want_estimated_max_len"), [
("Qwen/Qwen1.5-7B", 16385, 16384),
("Qwen/Qwen1.5-7B", 16383, 16383),
])
def test_estimate_max_model_len(model_id, max_model_len,
want_estimated_max_len):
# Create a VllmConfig
model_config = ModelConfig(
model_id,
task="generate",
tokenizer=model_id,
tokenizer_mode="auto",
trust_remote_code=False,
seed=0,
dtype="float16",
max_model_len=max_model_len,
)
scheduler_config = SchedulerConfig(max_num_batched_tokens=32768)

vllm_config = VllmConfig(
model_config=model_config,
scheduler_config=scheduler_config,
)

# Create KV cache specs
kv_cache_spec = {}
for i in range(32):
layer_name = f"layer_{i}"
kv_cache_spec[layer_name] = FullAttentionSpec(
block_size=16,
num_kv_heads=32,
head_size=128,
dtype=torch.float16,
use_mla=False,
)
# Estimate the maximum model length, 16384 model_len need 8GB
estimated_max_len = estimate_max_model_len(vllm_config, kv_cache_spec,
8 * GiB_bytes)
assert estimated_max_len == want_estimated_max_len
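The expected values in test_estimate_max_model_len follow from the per-token KV-cache footprint of the FullAttentionSpec used above. A back-of-the-envelope check, assuming the estimator accounts only for the K and V tensors at 2 bytes per float16 element (the variable names below are illustrative, not vLLM APIs):

num_layers = 32
num_kv_heads = 32
head_size = 128
dtype_bytes = 2  # float16

# K and V for every layer: 2 * heads * head_size * dtype_bytes per token.
bytes_per_token = num_layers * 2 * num_kv_heads * head_size * dtype_bytes  # 524288
max_tokens_in_8_gib = (8 * (1 << 30)) // bytes_per_token  # 16384

That is why a requested max_model_len of 16385 is clamped to 16384, while 16383 already fits inside the 8 GiB budget and is left unchanged.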
@ -24,6 +24,7 @@ def create_scheduler(
max_num_batched_tokens: int = 8192,
enable_prefix_caching: Optional[bool] = None,
long_prefill_token_threshold: int = 0,
disable_chunked_mm_input: bool = False,
) -> Scheduler:
'''Create scheduler under test.

@ -43,6 +44,7 @@ def create_scheduler(
max_num_batched_tokens=max_num_batched_tokens,
max_model_len=max_num_batched_tokens,
long_prefill_token_threshold=long_prefill_token_threshold,
disable_chunked_mm_input=disable_chunked_mm_input,
)
model_config = ModelConfig(
model=model,
@ -278,6 +280,58 @@ def test_schedule_partial_requests():
assert requests[2].request_id not in output.num_scheduled_tokens

def test_no_mm_input_chunking():
# Disable multimodal input chunking.
scheduler = create_scheduler(
model="llava-hf/llava-1.5-7b-hf",
max_num_batched_tokens=1024,
disable_chunked_mm_input=True,
)
mm_positions = [[PlaceholderRange(offset=400, length=800)]]
requests = create_requests(num_requests=1,
num_tokens=1200,
mm_positions=mm_positions)
for request in requests:
scheduler.add_request(request)

output = scheduler.schedule()
assert len(output.scheduled_new_reqs) == 1
assert len(output.scheduled_cached_reqs) == 0
assert len(output.finished_req_ids) == 0
# We want to only see the 400 text tokens at the start scheduled
assert output.num_scheduled_tokens[requests[0].request_id] == 400

req_to_index = {
request.request_id: i
for i, request in enumerate(requests)
}
model_runner_output = ModelRunnerOutput(
req_ids=[request.request_id for request in requests],
req_id_to_index=req_to_index,
sampled_token_ids=[[] for _ in range(len(requests))],
spec_token_ids=None,
logprobs=None,
prompt_logprobs_dict={},
)
scheduler.update_from_output(output, model_runner_output)

output = scheduler.schedule()
assert len(scheduler.running) == 1
assert len(output.scheduled_new_reqs) == 0
assert len(output.scheduled_cached_reqs) == 1
assert len(output.finished_req_ids) == 0
assert output.num_scheduled_tokens[requests[0].request_id] == 800

# Test that we fail if we disable chunked mm input and use too small
# of a max_num_batched_tokens for the mm input.
with pytest.raises(ValueError):
_ = create_scheduler(
model="llava-hf/llava-1.5-7b-hf",
max_num_batched_tokens=100,
disable_chunked_mm_input=True,
)

@pytest.mark.parametrize("enable_prefix_caching", [True, False])
def test_schedule_concurrent_partial_requests(enable_prefix_caching: bool):
"""Test scheduling behavior with concurrent partial requests.

@ -53,6 +53,11 @@ def model_name():
return "meta-llama/Meta-Llama-3-8B-Instruct"

@pytest.fixture
def eagle_model_name():
return "yuhuili/EAGLE-LLaMA3-Instruct-8B"

def test_ngram_correctness(
monkeypatch: pytest.MonkeyPatch,
test_prompts: list[list[dict[str, Any]]],
@ -95,3 +100,47 @@ def test_ngram_correctness(
# Upon failure, inspect the outputs to check for inaccuracy.
assert matches > int(0.7 * len(ref_outputs))
del spec_llm

def test_eagle_correctness(
monkeypatch: pytest.MonkeyPatch,
test_prompts: list[list[dict[str, Any]]],
sampling_config: SamplingParams,
model_name: str,
eagle_model_name: str,
):
'''
Compare the outputs of an original LLM and a speculative LLM:
they should be the same when using EAGLE speculative decoding.
'''
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")

ref_llm = LLM(model=model_name, max_model_len=1024)
ref_outputs = ref_llm.chat(test_prompts, sampling_config)
del ref_llm

spec_llm = LLM(
model=model_name,
speculative_config={
"method": "eagle",
"model": eagle_model_name,
"num_speculative_tokens": 3,
},
max_model_len=1024,
)
spec_outputs = spec_llm.chat(test_prompts, sampling_config)
matches = 0
misses = 0
for ref_output, spec_output in zip(ref_outputs, spec_outputs):
if ref_output.outputs[0].text == spec_output.outputs[0].text:
matches += 1
else:
misses += 1
print(f"ref_output: {ref_output.outputs[0].text}")
print(f"spec_output: {spec_output.outputs[0].text}")

# Heuristic: expect at least 70% of the prompts to match exactly
# Upon failure, inspect the outputs to check for inaccuracy.
assert matches > int(0.7 * len(ref_outputs))
del spec_llm