Compare commits
6 Commits
seemethere
...
remove-met
| Author | SHA1 | Date | |
|---|---|---|---|
| 69dbcc56bf | |||
| 1d20c34717 | |||
| b6af24fba7 | |||
| 0ca2393b47 | |||
| 31a500c86f | |||
| 4e8614e88b |
@ -1,22 +1,4 @@
|
||||
steps:
|
||||
# aarch64 + CUDA builds
|
||||
# NOTE: PyTorch only has aarch64 builds for CUDA 12.9
|
||||
# See CUDA_AARCH64_ARCHES in https://github.com/pytorch/pytorch/blob/main/.github/scripts/generate_binary_build_matrix.py#L41
|
||||
- label: "Build arm64 wheel - CUDA 12.9"
|
||||
id: build-wheel-arm64-cuda-12-9
|
||||
agents:
|
||||
queue: arm64_cpu_queue_postmerge
|
||||
commands:
|
||||
# #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
|
||||
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-wheels.sh"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
# x86 + CUDA builds
|
||||
- label: "Build wheel - CUDA 12.8"
|
||||
id: build-wheel-cuda-12-8
|
||||
agents:
|
||||
|
||||
@ -208,22 +208,6 @@ steps:
|
||||
commands:
|
||||
- pytest -v -s distributed/test_eplb_execute.py
|
||||
|
||||
- label: Metrics, Tracing Test # 10min
|
||||
mirror_hardwares: [amdexperimental]
|
||||
num_gpus: 2
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/metrics
|
||||
- tests/tracing
|
||||
commands:
|
||||
- pytest -v -s metrics
|
||||
- "pip install \
|
||||
'opentelemetry-sdk>=1.26.0' \
|
||||
'opentelemetry-api>=1.26.0' \
|
||||
'opentelemetry-exporter-otlp>=1.26.0' \
|
||||
'opentelemetry-semantic-conventions-ai>=0.4.1'"
|
||||
- pytest -v -s tracing
|
||||
|
||||
##### fast check tests #####
|
||||
##### 1 GPU test #####
|
||||
|
||||
|
||||
20
.github/PULL_REQUEST_TEMPLATE.md
vendored
20
.github/PULL_REQUEST_TEMPLATE.md
vendored
@ -1,11 +1,5 @@
|
||||
# Essential Elements of an Effective PR Description Checklist
|
||||
|
||||
- [ ] The purpose of the PR, such as "Fix some issue (link existing issues this PR will resolve)".
|
||||
- [ ] The test plan, such as providing test command.
|
||||
- [ ] The test results, such as pasting the results comparison before and after, or e2e results
|
||||
- [ ] (Optional) The necessary documentation update, such as updating `supported_models.md` and `examples` for a new model.
|
||||
|
||||
PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS ABOVE HAVE BEEN CONSIDERED.
|
||||
<!-- markdownlint-disable -->
|
||||
PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS (AT THE BOTTOM) HAVE BEEN CONSIDERED.
|
||||
|
||||
## Purpose
|
||||
|
||||
@ -15,4 +9,14 @@ PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS ABOVE HAVE B
|
||||
|
||||
## (Optional) Documentation Update
|
||||
|
||||
---
|
||||
<details>
|
||||
<summary> Essential Elements of an Effective PR Description Checklist </summary>
|
||||
|
||||
- [ ] The purpose of the PR, such as "Fix some issue (link existing issues this PR will resolve)".
|
||||
- [ ] The test plan, such as providing test command.
|
||||
- [ ] The test results, such as pasting the results comparison before and after, or e2e results
|
||||
- [ ] (Optional) The necessary documentation update, such as updating `supported_models.md` and `examples` for a new model.
|
||||
</details>
|
||||
|
||||
**BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing>** (anything written below this line will be removed by GitHub Actions)
|
||||
|
||||
8
.github/scripts/cleanup_pr_body.sh
vendored
8
.github/scripts/cleanup_pr_body.sh
vendored
@ -15,11 +15,11 @@ NEW=/tmp/new_pr_body.txt
|
||||
gh pr view --json body --template "{{.body}}" "${PR_NUMBER}" > "${OLD}"
|
||||
cp "${OLD}" "${NEW}"
|
||||
|
||||
# Remove "FIX #xxxx (*link existing issues this PR will resolve*)"
|
||||
sed -i '/FIX #xxxx.*$/d' "${NEW}"
|
||||
# Remove markdown comments (like the <!-- markdownlint-disable --> at the start)
|
||||
sed -i '/<!--.*-->$/d' "${NEW}"
|
||||
|
||||
# Remove "FILL IN THE PR DESCRIPTION HERE"
|
||||
sed -i '/FILL IN THE PR DESCRIPTION HERE/d' "${NEW}"
|
||||
# Remove "PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS (AT THE BOTTOM) HAVE BEEN CONSIDERED."
|
||||
sed -i '/PLEASE FILL IN THE PR DESCRIPTION HERE.*$/d' "${NEW}"
|
||||
|
||||
# Remove all lines after and including "**BEFORE SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE**"
|
||||
sed -i '/\*\*BEFORE SUBMITTING, PLEASE READ.*\*\*/,$d' "${NEW}"
|
||||
|
||||
74
benchmarks/benchmark_block_pool.py
Normal file
74
benchmarks/benchmark_block_pool.py
Normal file
@ -0,0 +1,74 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import gc
|
||||
|
||||
from tabulate import tabulate
|
||||
|
||||
from benchmark_utils import TimeCollector
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
from vllm.v1.core.block_pool import BlockPool
|
||||
|
||||
|
||||
def main(args):
|
||||
rows = []
|
||||
for allocate_block in args.allocate_blocks:
|
||||
# Enforce a GC collect ahead to minimize the impact among runs
|
||||
gc.collect()
|
||||
block_pool = BlockPool(num_gpu_blocks=args.num_gpu_blocks, enable_caching=True)
|
||||
|
||||
get_blocks_times = TimeCollector(TimeCollector.US)
|
||||
free_blocks_times = TimeCollector(TimeCollector.US)
|
||||
for _ in range(args.num_iteration):
|
||||
with get_blocks_times:
|
||||
blocks = block_pool.get_new_blocks(allocate_block)
|
||||
with free_blocks_times:
|
||||
block_pool.free_blocks(blocks)
|
||||
|
||||
rows.append(
|
||||
[get_blocks_times.cnt, args.num_gpu_blocks, allocate_block]
|
||||
+ get_blocks_times.dump_avg_max()
|
||||
+ free_blocks_times.dump_avg_max()
|
||||
)
|
||||
|
||||
print(
|
||||
tabulate(
|
||||
rows,
|
||||
headers=[
|
||||
"Iterations",
|
||||
"Total\nBlocks",
|
||||
"Allocated\nBlocks",
|
||||
"Get Blocks\nAvg (us)",
|
||||
"Get Blocks\nMax (us)",
|
||||
"Free Blocks\nAvg (us)",
|
||||
"Free Blocks\nMax (us)",
|
||||
],
|
||||
tablefmt="grid",
|
||||
floatfmt=".3f",
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
def invoke_main() -> None:
|
||||
parser = FlexibleArgumentParser(
|
||||
description="Benchmark the performance of BlockPool for KV Cache."
|
||||
)
|
||||
parser.add_argument("--num-gpu-blocks", type=int, default=100000)
|
||||
parser.add_argument(
|
||||
"--num-iteration",
|
||||
type=int,
|
||||
default=1000,
|
||||
help="Number of iterations to run to stablize final data readings",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--allocate-blocks",
|
||||
type=int,
|
||||
nargs="*",
|
||||
default=[10, 50, 100, 500, 1000],
|
||||
help="Number of blocks to allocate",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
main(args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
invoke_main() # pragma: no cover
|
||||
112
benchmarks/benchmark_ngram_proposer.py
Normal file
112
benchmarks/benchmark_ngram_proposer.py
Normal file
@ -0,0 +1,112 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import gc
|
||||
|
||||
import numpy as np
|
||||
from tabulate import tabulate
|
||||
|
||||
from benchmark_utils import TimeCollector
|
||||
from vllm.config import ModelConfig, SpeculativeConfig, VllmConfig
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
from vllm.v1.spec_decode.ngram_proposer import NgramProposer
|
||||
|
||||
|
||||
def main(args):
|
||||
rows = []
|
||||
for max_ngram in args.max_ngram:
|
||||
collector = TimeCollector(TimeCollector.US)
|
||||
|
||||
model_config = ModelConfig(
|
||||
model="facebook/opt-125m",
|
||||
task="generate",
|
||||
max_model_len=args.num_token + args.num_spec_token,
|
||||
tokenizer="facebook/opt-125m",
|
||||
tokenizer_mode="auto",
|
||||
dtype="auto",
|
||||
seed=None,
|
||||
trust_remote_code=False,
|
||||
)
|
||||
proposer = NgramProposer(
|
||||
vllm_config=VllmConfig(
|
||||
model_config=model_config,
|
||||
speculative_config=SpeculativeConfig(
|
||||
prompt_lookup_min=args.min_ngram,
|
||||
prompt_lookup_max=max_ngram,
|
||||
num_speculative_tokens=args.num_spec_token,
|
||||
method="ngram",
|
||||
),
|
||||
)
|
||||
)
|
||||
|
||||
# Warm up
|
||||
proposer.propose(np.random.randint(0, 20, (args.num_token,)))
|
||||
|
||||
gc.collect()
|
||||
for _ in range(args.num_iteration):
|
||||
tokens = np.random.randint(0, 20, (args.num_req, args.num_token))
|
||||
with collector:
|
||||
for i in range(args.num_req):
|
||||
proposer.propose(tokens[i, :])
|
||||
rows.append(
|
||||
[args.num_req, args.num_token, args.min_ngram, max_ngram]
|
||||
+ collector.dump_avg_max()
|
||||
)
|
||||
|
||||
print(
|
||||
tabulate(
|
||||
rows,
|
||||
headers=[
|
||||
"# Request",
|
||||
"# Token",
|
||||
"Min Ngram",
|
||||
"Max Ngram",
|
||||
"Avg (us)",
|
||||
"Max (us)",
|
||||
],
|
||||
tablefmt="grid",
|
||||
floatfmt=".3f",
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
def invoke_main() -> None:
|
||||
parser = FlexibleArgumentParser(
|
||||
description="Benchmark the performance of N-gram speculative decode drafting"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--num-iteration",
|
||||
type=int,
|
||||
default=100,
|
||||
help="Number of iterations to run to stablize final data readings",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--num-req", type=int, default=128, help="Number of requests in the batch"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--num-token", type=int, default=1500, help="Number of tokens for each request"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--min-ngram",
|
||||
type=int,
|
||||
default=3,
|
||||
help="Minimum n-gram to match",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--max-ngram",
|
||||
type=int,
|
||||
nargs="*",
|
||||
default=[5, 7, 10, 15, 20],
|
||||
help="Maximum n-gram to match",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--num-spec-token",
|
||||
type=int,
|
||||
default=3,
|
||||
help="Number of speculative tokens to generate",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
main(args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
invoke_main() # pragma: no cover
|
||||
@ -1,11 +1,12 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import math
|
||||
import os
|
||||
from typing import Any
|
||||
import time
|
||||
from types import TracebackType
|
||||
from typing import Any, Optional, Union
|
||||
|
||||
|
||||
def convert_to_pytorch_benchmark_format(
|
||||
@ -72,3 +73,53 @@ def write_to_json(filename: str, records: list) -> None:
|
||||
cls=InfEncoder,
|
||||
default=lambda o: f"<{type(o).__name__} object is not JSON serializable>",
|
||||
)
|
||||
|
||||
|
||||
# Collect time and generate time metrics
|
||||
#
|
||||
# Example Usage:
|
||||
# collector = TimeCollector(TimeCollector.US)
|
||||
# for _ in range(total_iteration):
|
||||
# with collector:
|
||||
# ...
|
||||
# collector.dump_avg_max()
|
||||
class TimeCollector:
|
||||
NS: int = 1
|
||||
US: int = NS * 1000
|
||||
MS: int = US * 1000
|
||||
S: int = MS * 1000
|
||||
|
||||
def __init__(self, scale: int) -> None:
|
||||
self.cnt: int = 0
|
||||
self._sum: int = 0
|
||||
self._max: Optional[int] = None
|
||||
self.scale = scale
|
||||
self.start_time: int = time.monotonic_ns()
|
||||
|
||||
def collect(self, v: int) -> None:
|
||||
self.cnt += 1
|
||||
self._sum += v
|
||||
if self._max is None:
|
||||
self._max = v
|
||||
else:
|
||||
self._max = max(self._max, v)
|
||||
|
||||
def avg(self) -> Union[float, str]:
|
||||
return self._sum * 1.0 / self.cnt / self.scale if self.cnt > 0 else "N/A"
|
||||
|
||||
def max(self) -> Union[float, str]:
|
||||
return self._max / self.scale if self._max else "N/A"
|
||||
|
||||
def dump_avg_max(self) -> list[Union[float, str]]:
|
||||
return [self.avg(), self.max()]
|
||||
|
||||
def __enter__(self) -> None:
|
||||
self.start_time = time.monotonic_ns()
|
||||
|
||||
def __exit__(
|
||||
self,
|
||||
exc_type: Optional[type[BaseException]],
|
||||
exc_value: Optional[BaseException],
|
||||
exc_traceback: Optional[TracebackType],
|
||||
) -> None:
|
||||
self.collect(time.monotonic_ns() - self.start_time)
|
||||
|
||||
@ -1,108 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import gc
|
||||
import time
|
||||
from typing import Optional
|
||||
|
||||
from tabulate import tabulate
|
||||
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
from vllm.v1.core.block_pool import BlockPool
|
||||
|
||||
|
||||
class Metric:
|
||||
def __init__(self) -> None:
|
||||
self.cnt: int = 0
|
||||
self.sum_v: int = 0
|
||||
self.max_v: Optional[int] = None
|
||||
|
||||
def update(self, v: int) -> None:
|
||||
self.cnt += 1
|
||||
self.sum_v += v
|
||||
if self.max_v is None:
|
||||
self.max_v = v
|
||||
else:
|
||||
self.max_v = max(self.max_v, v)
|
||||
|
||||
def avg_v(self) -> float:
|
||||
return self.sum_v * 1.0 / self.cnt
|
||||
|
||||
|
||||
def main(args):
|
||||
rows = []
|
||||
for allocate_block in args.allocate_blocks:
|
||||
# Enforce a GC collect ahead to minimize the impact among runs
|
||||
gc.collect()
|
||||
block_pool = BlockPool(num_gpu_blocks=args.num_gpu_blocks, enable_caching=True)
|
||||
|
||||
get_blocks_metric: Metric = Metric()
|
||||
free_blocks_metric: Metric = Metric()
|
||||
for _ in range(args.num_iteration):
|
||||
t1 = time.monotonic_ns()
|
||||
blocks = block_pool.get_new_blocks(allocate_block)
|
||||
t2 = time.monotonic_ns()
|
||||
block_pool.free_blocks(blocks)
|
||||
t3 = time.monotonic_ns()
|
||||
get_blocks_metric.update(t2 - t1)
|
||||
free_blocks_metric.update(t3 - t2)
|
||||
|
||||
if get_blocks_metric.max_v is not None and free_blocks_metric.max_v is not None:
|
||||
rows.append(
|
||||
[
|
||||
get_blocks_metric.cnt,
|
||||
args.num_gpu_blocks,
|
||||
allocate_block,
|
||||
get_blocks_metric.avg_v() / 1000000,
|
||||
get_blocks_metric.max_v / 1000000.0,
|
||||
free_blocks_metric.avg_v() / 1000000,
|
||||
free_blocks_metric.max_v / 1000000.0,
|
||||
]
|
||||
)
|
||||
else:
|
||||
print(
|
||||
"No valid metrics found."
|
||||
f" {get_blocks_metric.max_v=} {free_blocks_metric.max_v=}"
|
||||
)
|
||||
|
||||
print(
|
||||
tabulate(
|
||||
rows,
|
||||
headers=[
|
||||
"Iterations",
|
||||
"Total\nBlocks",
|
||||
"Allocated\nBlocks",
|
||||
"Get Blocks\nAvg (ms)",
|
||||
"Get Blocks\nMax (ms)",
|
||||
"Free Blocks\nAvg (ms)",
|
||||
"Free Blocks\nMax (ms)",
|
||||
],
|
||||
tablefmt="grid",
|
||||
floatfmt=".6f",
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
def invoke_main() -> None:
|
||||
parser = FlexibleArgumentParser(
|
||||
description="Benchmark the performance of BlockPool for KV Cache."
|
||||
)
|
||||
parser.add_argument("--num-gpu-blocks", type=int, default=100000)
|
||||
parser.add_argument(
|
||||
"--num-iteration",
|
||||
type=int,
|
||||
default=1000,
|
||||
help="Number of iterations to run to stablize final data readings",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--allocate-blocks",
|
||||
type=int,
|
||||
nargs="*",
|
||||
default=[10, 50, 100, 500, 1000],
|
||||
help="Number of blocks to allocate",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
main(args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
invoke_main() # pragma: no cover
|
||||
@ -54,38 +54,54 @@ def before_generate_case(context: schemathesis.hooks.HookContext, strategy):
|
||||
op = context.operation
|
||||
assert op is not None
|
||||
|
||||
def no_file_type(case: schemathesis.models.Case):
|
||||
def no_invalid_types(case: schemathesis.models.Case):
|
||||
"""
|
||||
This filter skips test cases for the `POST /tokenize` endpoint where the
|
||||
HTTP request body uses `"type": "file"` in any message's content.
|
||||
We expect these cases to fail because that type isn't implemented here
|
||||
https://github.com/vllm-project/vllm/blob/0b34593017953051b3225b1483ce0f4670e3eb0e/vllm/entrypoints/chat_utils.py#L1038-L1095
|
||||
This filter skips test cases with invalid data that schemathesis
|
||||
incorrectly generates due to permissive schema configurations.
|
||||
|
||||
1. Skips `POST /tokenize` endpoint cases with `"type": "file"` in
|
||||
message content, which isn't implemented.
|
||||
|
||||
2. Skips tool_calls with `"type": "custom"` which schemathesis
|
||||
incorrectly generates instead of the valid `"type": "function"`.
|
||||
|
||||
Example test cases that are skipped:
|
||||
curl -X POST -H 'Content-Type: application/json' \
|
||||
-d '{"messages": [{"role": "assistant"}, {"content": [{"file": {}, "type": "file"}], "role": "user"}]}' \
|
||||
-d '{"messages": [{"content": [{"file": {}, "type": "file"}], "role": "user"}]}' \
|
||||
http://localhost:8000/tokenize
|
||||
|
||||
curl -X POST -H 'Content-Type: application/json' \
|
||||
-d '{"messages": [{"content": [{"file": {}, "type": "file"}], "role": "user"}]}' \
|
||||
http://localhost:8000/tokenize
|
||||
-d '{"messages": [{"role": "assistant", "tool_calls": [{"custom": {"input": "", "name": ""}, "id": "", "type": "custom"}]}]}' \
|
||||
http://localhost:8000/v1/chat/completions
|
||||
""" # noqa: E501
|
||||
if (op.method.lower() == "post" and op.path == "/tokenize"
|
||||
and hasattr(case, "body") and isinstance(case.body, dict)
|
||||
if (hasattr(case, "body") and isinstance(case.body, dict)
|
||||
and "messages" in case.body
|
||||
and isinstance(case.body["messages"], list)
|
||||
and len(case.body["messages"]) > 0):
|
||||
|
||||
for message in case.body["messages"]:
|
||||
if not isinstance(message, dict):
|
||||
continue
|
||||
content = message.get("content", [])
|
||||
if not isinstance(content, list) or len(content) == 0:
|
||||
continue
|
||||
if any(item.get("type") == "file" for item in content):
|
||||
return False
|
||||
|
||||
# Check for invalid file type in tokenize endpoint
|
||||
if op.method.lower() == "post" and op.path == "/tokenize":
|
||||
content = message.get("content", [])
|
||||
if (isinstance(content, list) and len(content) > 0 and any(
|
||||
item.get("type") == "file" for item in content)):
|
||||
return False
|
||||
|
||||
# Check for invalid tool_calls with non-function types
|
||||
tool_calls = message.get("tool_calls", [])
|
||||
if isinstance(tool_calls, list):
|
||||
for tool_call in tool_calls:
|
||||
if isinstance(tool_call, dict):
|
||||
if tool_call.get("type") != "function":
|
||||
return False
|
||||
if "custom" in tool_call:
|
||||
return False
|
||||
return True
|
||||
|
||||
return strategy.filter(no_file_type)
|
||||
return strategy.filter(no_invalid_types)
|
||||
|
||||
|
||||
@schema.parametrize()
|
||||
|
||||
@ -1,268 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
import ray
|
||||
from prometheus_client import REGISTRY
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm import EngineArgs, LLMEngine
|
||||
from vllm.engine.arg_utils import AsyncEngineArgs
|
||||
from vllm.engine.async_llm_engine import AsyncLLMEngine
|
||||
from vllm.engine.metrics import RayPrometheusStatLogger
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.test_utils import MODEL_WEIGHTS_S3_BUCKET
|
||||
|
||||
|
||||
@pytest.fixture(scope="function", autouse=True)
|
||||
def use_v0_only(monkeypatch):
|
||||
"""
|
||||
This module tests V0 internals, so set VLLM_USE_V1=0.
|
||||
"""
|
||||
monkeypatch.setenv('VLLM_USE_V1', '0')
|
||||
|
||||
|
||||
MODELS = [
|
||||
"distilbert/distilgpt2",
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("dtype", ["float"])
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
def test_metric_counter_prompt_tokens(
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model: str,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
) -> None:
|
||||
with vllm_runner(model,
|
||||
dtype=dtype,
|
||||
disable_log_stats=False,
|
||||
gpu_memory_utilization=0.4) as vllm_model:
|
||||
tokenizer = vllm_model.llm.get_tokenizer()
|
||||
prompt_token_counts = [
|
||||
len(tokenizer.encode(p)) for p in example_prompts
|
||||
]
|
||||
# This test needs at least 2 prompts in a batch of different lengths to
|
||||
# verify their token count is correct despite padding.
|
||||
assert len(example_prompts) > 1, "at least 2 prompts are required"
|
||||
assert prompt_token_counts[0] != prompt_token_counts[1], (
|
||||
"prompts of different lengths are required")
|
||||
vllm_prompt_token_count = sum(prompt_token_counts)
|
||||
|
||||
_ = vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||
stat_logger = vllm_model.llm.llm_engine.stat_loggers['prometheus']
|
||||
metric_count = stat_logger.metrics.counter_prompt_tokens.labels(
|
||||
**stat_logger.labels)._value.get()
|
||||
|
||||
assert vllm_prompt_token_count == metric_count, (
|
||||
f"prompt token count: {vllm_prompt_token_count!r}\n"
|
||||
f"metric: {metric_count!r}")
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("dtype", ["float"])
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
def test_metric_counter_generation_tokens(
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model: str,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
) -> None:
|
||||
with vllm_runner(model,
|
||||
dtype=dtype,
|
||||
disable_log_stats=False,
|
||||
gpu_memory_utilization=0.4) as vllm_model:
|
||||
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||
tokenizer = vllm_model.llm.get_tokenizer()
|
||||
stat_logger = vllm_model.llm.llm_engine.stat_loggers['prometheus']
|
||||
metric_count = stat_logger.metrics.counter_generation_tokens.labels(
|
||||
**stat_logger.labels)._value.get()
|
||||
vllm_generation_count = 0
|
||||
for i in range(len(example_prompts)):
|
||||
vllm_output_ids, vllm_output_str = vllm_outputs[i]
|
||||
prompt_ids = tokenizer.encode(example_prompts[i])
|
||||
# vllm_output_ids contains both prompt tokens and generation tokens.
|
||||
# We're interested only in the count of the generation tokens.
|
||||
vllm_generation_count += len(vllm_output_ids) - len(prompt_ids)
|
||||
|
||||
assert vllm_generation_count == metric_count, (
|
||||
f"generation token count: {vllm_generation_count!r}\n"
|
||||
f"metric: {metric_count!r}")
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("dtype", ["float"])
|
||||
@pytest.mark.parametrize(
|
||||
"served_model_name",
|
||||
[None, [], ["ModelName0"], ["ModelName0", "ModelName1", "ModelName2"]])
|
||||
def test_metric_set_tag_model_name(vllm_runner, model: str, dtype: str,
|
||||
served_model_name: list[str]) -> None:
|
||||
with vllm_runner(model,
|
||||
dtype=dtype,
|
||||
disable_log_stats=False,
|
||||
gpu_memory_utilization=0.3,
|
||||
served_model_name=served_model_name) as vllm_model:
|
||||
stat_logger = vllm_model.llm.llm_engine.stat_loggers['prometheus']
|
||||
metrics_tag_content = stat_logger.labels["model_name"]
|
||||
|
||||
if envs.VLLM_CI_USE_S3:
|
||||
model = f"{MODEL_WEIGHTS_S3_BUCKET}/{model}"
|
||||
if served_model_name is None or served_model_name == []:
|
||||
assert metrics_tag_content == model, (
|
||||
f"Metrics tag model_name is wrong! expect: {model!r}\n"
|
||||
f"actual: {metrics_tag_content!r}")
|
||||
else:
|
||||
assert metrics_tag_content == served_model_name[0], (
|
||||
f"Metrics tag model_name is wrong! expect: "
|
||||
f"{served_model_name[0]!r}\n"
|
||||
f"actual: {metrics_tag_content!r}")
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
@pytest.mark.parametrize("max_tokens", [4])
|
||||
@pytest.mark.parametrize("disable_log_stats", [True, False])
|
||||
@pytest.mark.asyncio
|
||||
async def test_async_engine_log_metrics_regression(
|
||||
example_prompts,
|
||||
model: str,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
disable_log_stats: bool,
|
||||
) -> None:
|
||||
"""
|
||||
Regression test ensuring async engine generates metrics
|
||||
when disable_log_stats=False
|
||||
(see: https://github.com/vllm-project/vllm/pull/4150#pullrequestreview-2008176678)
|
||||
"""
|
||||
engine_args = AsyncEngineArgs(
|
||||
model=model,
|
||||
dtype=dtype,
|
||||
disable_log_stats=disable_log_stats,
|
||||
)
|
||||
async_engine = AsyncLLMEngine.from_engine_args(engine_args)
|
||||
for i, prompt in enumerate(example_prompts):
|
||||
results = async_engine.generate(
|
||||
prompt,
|
||||
SamplingParams(max_tokens=max_tokens),
|
||||
f"request-id-{i}",
|
||||
)
|
||||
# Exhaust the async iterator to make the async engine work
|
||||
async for _ in results:
|
||||
pass
|
||||
|
||||
assert_metrics(model, async_engine.engine, disable_log_stats,
|
||||
len(example_prompts))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
@pytest.mark.parametrize("max_tokens", [4])
|
||||
@pytest.mark.parametrize("disable_log_stats", [True, False])
|
||||
def test_engine_log_metrics_regression(
|
||||
example_prompts,
|
||||
model: str,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
disable_log_stats: bool,
|
||||
) -> None:
|
||||
engine_args = EngineArgs(
|
||||
model=model,
|
||||
dtype=dtype,
|
||||
disable_log_stats=disable_log_stats,
|
||||
)
|
||||
engine = LLMEngine.from_engine_args(engine_args)
|
||||
for i, prompt in enumerate(example_prompts):
|
||||
engine.add_request(
|
||||
f"request-id-{i}",
|
||||
prompt,
|
||||
SamplingParams(max_tokens=max_tokens),
|
||||
)
|
||||
while engine.has_unfinished_requests():
|
||||
engine.step()
|
||||
|
||||
if envs.VLLM_CI_USE_S3:
|
||||
model = f"{MODEL_WEIGHTS_S3_BUCKET}/{model}"
|
||||
assert_metrics(model, engine, disable_log_stats, len(example_prompts))
|
||||
|
||||
|
||||
def assert_metrics(model: str, engine: LLMEngine, disable_log_stats: bool,
|
||||
num_requests: int) -> None:
|
||||
if disable_log_stats:
|
||||
with pytest.raises(AttributeError):
|
||||
_ = engine.stat_loggers
|
||||
else:
|
||||
assert (engine.stat_loggers
|
||||
is not None), "engine.stat_loggers should be set"
|
||||
# Ensure the count bucket of request-level histogram metrics matches
|
||||
# the number of requests as a simple sanity check to ensure metrics are
|
||||
# generated
|
||||
labels = {'model_name': model}
|
||||
request_histogram_metrics = [
|
||||
"vllm:e2e_request_latency_seconds",
|
||||
"vllm:request_prompt_tokens",
|
||||
"vllm:request_generation_tokens",
|
||||
"vllm:request_params_n",
|
||||
"vllm:request_params_max_tokens",
|
||||
]
|
||||
for metric_name in request_histogram_metrics:
|
||||
metric_value = REGISTRY.get_sample_value(f"{metric_name}_count",
|
||||
labels)
|
||||
assert (
|
||||
metric_value == num_requests), "Metrics should be collected"
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
@pytest.mark.parametrize("max_tokens", [16])
|
||||
def test_engine_log_metrics_ray(
|
||||
example_prompts,
|
||||
model: str,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
) -> None:
|
||||
# This test is quite weak - it only checks that we can use
|
||||
# RayPrometheusStatLogger without exceptions.
|
||||
# Checking whether the metrics are actually emitted is unfortunately
|
||||
# non-trivial.
|
||||
|
||||
# We have to run in a Ray task for Ray metrics to be emitted correctly
|
||||
@ray.remote(num_gpus=1)
|
||||
def _inner():
|
||||
|
||||
class _RayPrometheusStatLogger(RayPrometheusStatLogger):
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
self._i = 0
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
def log(self, *args, **kwargs):
|
||||
self._i += 1
|
||||
return super().log(*args, **kwargs)
|
||||
|
||||
engine_args = EngineArgs(
|
||||
model=model,
|
||||
dtype=dtype,
|
||||
disable_log_stats=False,
|
||||
)
|
||||
engine = LLMEngine.from_engine_args(engine_args)
|
||||
logger = _RayPrometheusStatLogger(
|
||||
local_interval=0.5,
|
||||
labels=dict(model_name=engine.model_config.served_model_name),
|
||||
vllm_config=engine.vllm_config)
|
||||
engine.add_logger("ray", logger)
|
||||
for i, prompt in enumerate(example_prompts):
|
||||
engine.add_request(
|
||||
f"request-id-{i}",
|
||||
prompt,
|
||||
SamplingParams(max_tokens=max_tokens),
|
||||
)
|
||||
while engine.has_unfinished_requests():
|
||||
engine.step()
|
||||
assert logger._i > 0, ".log must be called at least once"
|
||||
|
||||
ray.get(_inner.remote())
|
||||
@ -36,7 +36,7 @@ MODELS = [
|
||||
@pytest.mark.parametrize("model_info", MODELS)
|
||||
def test_embed_models_mteb(hf_runner, vllm_runner,
|
||||
model_info: EmbedModelInfo) -> None:
|
||||
mteb_test_embed_models(hf_runner, vllm_runner, model_info)
|
||||
mteb_test_embed_models(hf_runner, vllm_runner, model_info, atol=0.02)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_info", MODELS)
|
||||
|
||||
@ -46,7 +46,7 @@ MODELS = [
|
||||
@pytest.mark.parametrize("model_info", MODELS)
|
||||
def test_embed_models_mteb(hf_runner, vllm_runner,
|
||||
model_info: EmbedModelInfo) -> None:
|
||||
mteb_test_embed_models(hf_runner, vllm_runner, model_info)
|
||||
mteb_test_embed_models(hf_runner, vllm_runner, model_info, atol=0.02)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_info", MODELS)
|
||||
|
||||
@ -1,237 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
# ruff: noqa
|
||||
# type: ignore
|
||||
from __future__ import annotations
|
||||
|
||||
import threading
|
||||
from collections.abc import Iterable
|
||||
from concurrent import futures
|
||||
from typing import Callable, Generator, Literal
|
||||
|
||||
import grpc
|
||||
import pytest
|
||||
from opentelemetry.proto.collector.trace.v1.trace_service_pb2 import (
|
||||
ExportTraceServiceResponse)
|
||||
from opentelemetry.proto.collector.trace.v1.trace_service_pb2_grpc import (
|
||||
TraceServiceServicer, add_TraceServiceServicer_to_server)
|
||||
from opentelemetry.proto.common.v1.common_pb2 import AnyValue, KeyValue
|
||||
from opentelemetry.sdk.environment_variables import (
|
||||
OTEL_EXPORTER_OTLP_TRACES_INSECURE)
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.tracing import SpanAttributes
|
||||
|
||||
|
||||
@pytest.fixture(scope="function", autouse=True)
|
||||
def use_v0_only(monkeypatch: pytest.MonkeyPatch):
|
||||
"""
|
||||
Since this module is V0 only, set VLLM_USE_V1=0 for
|
||||
all tests in the module.
|
||||
"""
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv('VLLM_USE_V1', '0')
|
||||
yield
|
||||
|
||||
|
||||
FAKE_TRACE_SERVER_ADDRESS = "localhost:4317"
|
||||
|
||||
FieldName = Literal['bool_value', 'string_value', 'int_value', 'double_value',
|
||||
'array_value']
|
||||
|
||||
|
||||
def decode_value(value: AnyValue):
|
||||
field_decoders: dict[FieldName, Callable] = {
|
||||
"bool_value": (lambda v: v.bool_value),
|
||||
"string_value": (lambda v: v.string_value),
|
||||
"int_value": (lambda v: v.int_value),
|
||||
"double_value": (lambda v: v.double_value),
|
||||
"array_value":
|
||||
(lambda v: [decode_value(item) for item in v.array_value.values]),
|
||||
}
|
||||
for field, decoder in field_decoders.items():
|
||||
if value.HasField(field):
|
||||
return decoder(value)
|
||||
raise ValueError(f"Couldn't decode value: {value}")
|
||||
|
||||
|
||||
def decode_attributes(attributes: Iterable[KeyValue]):
    """Convert repeated OTLP ``KeyValue`` attributes into a plain dict."""
    decoded = {}
    for pair in attributes:
        decoded[pair.key] = decode_value(pair.value)
    return decoded
|
||||
|
||||
|
||||
class FakeTraceService(TraceServiceServicer):
    """In-process OTLP trace collector used as the export target.

    Stores the first Export request it receives and signals ``evt`` so a
    test can block until the trace has actually arrived.
    """

    def __init__(self):
        # Signalled once an Export call has been handled.
        self.evt = threading.Event()
        # Most recent ExportTraceServiceRequest; None before any export.
        self.request = None

    def Export(self, request, context):
        # Store the request before signalling so the waiting thread
        # observes a fully populated ``self.request``.
        self.request = request
        self.evt.set()
        return ExportTraceServiceResponse()
|
||||
|
||||
|
||||
@pytest.fixture
def trace_service() -> Generator[FakeTraceService, None, None]:
    """Fixture to set up a fake gRPC trace service.

    Starts an insecure gRPC server hosting a :class:`FakeTraceService` on
    ``FAKE_TRACE_SERVER_ADDRESS`` and yields the servicer so tests can
    wait on ``service.evt`` / inspect ``service.request``.
    """
    server = grpc.server(futures.ThreadPoolExecutor(max_workers=1))
    service = FakeTraceService()
    add_TraceServiceServicer_to_server(service, server)
    server.add_insecure_port(FAKE_TRACE_SERVER_ADDRESS)
    server.start()

    try:
        yield service
    finally:
        # Guarantee the port is released even if the generator is closed
        # abnormally (e.g. GeneratorExit on KeyboardInterrupt); without the
        # try/finally the server would keep the port bound and leak threads.
        server.stop(None)
|
||||
|
||||
|
||||
def test_traces(
    monkeypatch: pytest.MonkeyPatch,
    trace_service: FakeTraceService,
):
    """End-to-end check that a generation emits one OTLP span whose
    attributes match the request parameters and the engine metrics.

    Detailed traces are NOT enabled here, so the per-step model timings
    are expected to be absent (None).
    """
    with monkeypatch.context() as m:
        # Allow the exporter to talk plaintext gRPC to the fake collector.
        m.setenv(OTEL_EXPORTER_OTLP_TRACES_INSECURE, "true")

        sampling_params = SamplingParams(
            temperature=0.01,
            top_p=0.1,
            max_tokens=256,
        )
        model = "facebook/opt-125m"
        # Point the engine's OTLP exporter at the in-process fake service.
        llm = LLM(
            model=model,
            otlp_traces_endpoint=FAKE_TRACE_SERVER_ADDRESS,
        )
        prompts = ["This is a short prompt"]
        outputs = llm.generate(prompts, sampling_params=sampling_params)

        # Spans are exported asynchronously; block until the collector
        # has received one (or fail loudly after the timeout).
        timeout = 5
        if not trace_service.evt.wait(timeout):
            raise TimeoutError(
                f"The fake trace service didn't receive a trace within "
                f"the {timeout} seconds timeout")

        # Exactly one request -> one resource span -> one scope span -> one span.
        request = trace_service.request
        assert len(request.resource_spans) == 1, (
            f"Expected 1 resource span, "
            f"but got {len(request.resource_spans)}")
        assert len(request.resource_spans[0].scope_spans) == 1, (
            f"Expected 1 scope span, "
            f"but got {len(request.resource_spans[0].scope_spans)}")
        assert len(request.resource_spans[0].scope_spans[0].spans) == 1, (
            f"Expected 1 span, "
            f"but got {len(request.resource_spans[0].scope_spans[0].spans)}")

        # Request-level attributes must echo the sampling parameters.
        attributes = decode_attributes(
            request.resource_spans[0].scope_spans[0].spans[0].attributes)
        assert attributes.get(SpanAttributes.GEN_AI_RESPONSE_MODEL) == model
        assert attributes.get(
            SpanAttributes.GEN_AI_REQUEST_ID) == outputs[0].request_id
        assert attributes.get(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE
                              ) == sampling_params.temperature
        assert attributes.get(
            SpanAttributes.GEN_AI_REQUEST_TOP_P) == sampling_params.top_p
        assert attributes.get(SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS
                              ) == sampling_params.max_tokens
        assert attributes.get(
            SpanAttributes.GEN_AI_REQUEST_N) == sampling_params.n
        # Token-usage attributes must match the returned output exactly.
        assert attributes.get(
            SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS) == len(
                outputs[0].prompt_token_ids)
        completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs)
        assert attributes.get(
            SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS) == completion_tokens
        # Latency attributes are derived from the request's engine metrics.
        metrics = outputs[0].metrics
        assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE
                              ) == metrics.time_in_queue
        ttft = metrics.first_token_time - metrics.arrival_time
        assert attributes.get(
            SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) == ttft
        e2e_time = metrics.finished_time - metrics.arrival_time
        assert attributes.get(SpanAttributes.GEN_AI_LATENCY_E2E) == e2e_time
        assert metrics.scheduler_time > 0
        assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_SCHEDULER
                              ) == metrics.scheduler_time
        # Model forward and model execute should be none, since detailed traces is
        # not enabled.
        assert metrics.model_forward_time is None
        assert metrics.model_execute_time is None
|
||||
|
||||
|
||||
def test_traces_with_detailed_steps(
    monkeypatch: pytest.MonkeyPatch,
    trace_service: FakeTraceService,
):
    """Same end-to-end span check as ``test_traces``, but with
    ``collect_detailed_traces=["all"]`` so the per-step model forward /
    execute timings must be present and positive.
    """
    with monkeypatch.context() as m:
        # Allow the exporter to talk plaintext gRPC to the fake collector.
        m.setenv(OTEL_EXPORTER_OTLP_TRACES_INSECURE, "true")

        sampling_params = SamplingParams(
            temperature=0.01,
            top_p=0.1,
            max_tokens=256,
        )
        model = "facebook/opt-125m"
        llm = LLM(
            model=model,
            otlp_traces_endpoint=FAKE_TRACE_SERVER_ADDRESS,
            # The only difference from test_traces: enable detailed traces.
            collect_detailed_traces=["all"],
        )
        prompts = ["This is a short prompt"]
        outputs = llm.generate(prompts, sampling_params=sampling_params)

        # Spans are exported asynchronously; block until the collector
        # has received one (or fail loudly after the timeout).
        timeout = 5
        if not trace_service.evt.wait(timeout):
            raise TimeoutError(
                f"The fake trace service didn't receive a trace within "
                f"the {timeout} seconds timeout")

        # Exactly one request -> one resource span -> one scope span -> one span.
        request = trace_service.request
        assert len(request.resource_spans) == 1, (
            f"Expected 1 resource span, "
            f"but got {len(request.resource_spans)}")
        assert len(request.resource_spans[0].scope_spans) == 1, (
            f"Expected 1 scope span, "
            f"but got {len(request.resource_spans[0].scope_spans)}")
        assert len(request.resource_spans[0].scope_spans[0].spans) == 1, (
            f"Expected 1 span, "
            f"but got {len(request.resource_spans[0].scope_spans[0].spans)}")

        # Request-level attributes must echo the sampling parameters.
        attributes = decode_attributes(
            request.resource_spans[0].scope_spans[0].spans[0].attributes)
        assert attributes.get(SpanAttributes.GEN_AI_RESPONSE_MODEL) == model
        assert attributes.get(
            SpanAttributes.GEN_AI_REQUEST_ID) == outputs[0].request_id
        assert attributes.get(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE
                              ) == sampling_params.temperature
        assert attributes.get(
            SpanAttributes.GEN_AI_REQUEST_TOP_P) == sampling_params.top_p
        assert attributes.get(SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS
                              ) == sampling_params.max_tokens
        assert attributes.get(
            SpanAttributes.GEN_AI_REQUEST_N) == sampling_params.n
        # Token-usage attributes must match the returned output exactly.
        assert attributes.get(
            SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS) == len(
                outputs[0].prompt_token_ids)
        completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs)
        assert attributes.get(
            SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS) == completion_tokens
        # Latency attributes are derived from the request's engine metrics.
        metrics = outputs[0].metrics
        assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE
                              ) == metrics.time_in_queue
        ttft = metrics.first_token_time - metrics.arrival_time
        assert attributes.get(
            SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) == ttft
        e2e_time = metrics.finished_time - metrics.arrival_time
        assert attributes.get(SpanAttributes.GEN_AI_LATENCY_E2E) == e2e_time
        assert metrics.scheduler_time > 0
        assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_SCHEDULER
                              ) == metrics.scheduler_time
        # With detailed traces enabled the per-step timings must exist.
        assert metrics.model_forward_time > 0
        # NOTE(review): the /1000 suggests model_forward_time is tracked in
        # milliseconds while the span attribute is in seconds — confirm
        # against the engine's metric definitions.
        assert attributes.get(
            SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_FORWARD
        ) == pytest.approx(metrics.model_forward_time / 1000)
        assert metrics.model_execute_time > 0
        assert attributes.get(
            SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_EXECUTE
        ) == metrics.model_execute_time
        # Sanity check consistent with the unit mismatch above: forward (ms)
        # should be below execute (s) scaled to ms.
        assert metrics.model_forward_time < 1000 * metrics.model_execute_time
|
||||
@ -1,43 +1,63 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import numpy as np
|
||||
|
||||
from vllm.config import ModelConfig, SpeculativeConfig, VllmConfig
|
||||
from vllm.v1.spec_decode.ngram_proposer import (NgramProposer,
|
||||
_find_subarray_kmp,
|
||||
_kmp_lps_array)
|
||||
from vllm.v1.spec_decode.ngram_proposer import (
|
||||
NgramProposer, _find_longest_matched_ngram_and_propose_tokens)
|
||||
|
||||
|
||||
def test_kmp_lps_array():
|
||||
np.testing.assert_array_equal(_kmp_lps_array(np.array([])), np.array([]))
|
||||
np.testing.assert_array_equal(_kmp_lps_array(np.array([1])), np.array([0]))
|
||||
np.testing.assert_array_equal(_kmp_lps_array(np.array([1, 1, 1])),
|
||||
np.array([0, 1, 2]))
|
||||
np.testing.assert_array_equal(_kmp_lps_array(np.array([1, 2, 3, 4])),
|
||||
np.array([0, 0, 0, 0]))
|
||||
np.testing.assert_array_equal(_kmp_lps_array(np.array([1, 2, 1, 2, 3])),
|
||||
np.array([0, 0, 1, 2, 0]))
|
||||
def test_find_longest_matched_ngram_and_propose_tokens():
    """Exercise the n-gram matcher on small token sequences.

    Covers the no-match case plus several (min_ngram, max_ngram, k)
    combinations where the suffix repeats earlier in the sequence.
    """
    # Suffix [5, 6] never occurs earlier, so no draft tokens are proposed.
    no_repeat = np.array([1, 2, 3, 4, 1, 2, 3, 5, 6])
    assert _find_longest_matched_ngram_and_propose_tokens(
        origin_tokens=no_repeat,
        min_ngram=2,
        max_ngram=2,
        max_model_len=1024,
        k=2) is None

    # Suffix [2, 3] (and [3]) matches at index 1 (2), so tokens following
    # the match are proposed, truncated to k.
    repeating = np.array([1, 2, 3, 4, 1, 2, 3])
    cases = [
        (2, 2, 3, [4, 1, 2]),
        (2, 2, 2, [4, 1]),
        (1, 1, 3, [4, 1, 2]),
        (1, 1, 2, [4, 1]),
    ]
    for min_n, max_n, k, expected in cases:
        proposed = _find_longest_matched_ngram_and_propose_tokens(
            origin_tokens=repeating,
            min_ngram=min_n,
            max_ngram=max_n,
            max_model_len=1024,
            k=k)
        np.testing.assert_array_equal(proposed, np.array(expected))
|
||||
|
||||
def test_find_subarray_kmp():
|
||||
X = np.array([1, 2, 3, 4, 1, 2, 3, 5, 6])
|
||||
assert _find_subarray_kmp(X, 2, 2) is None
|
||||
X = np.array([1, 2, 3, 4, 1, 2, 3])
|
||||
np.testing.assert_array_equal(_find_subarray_kmp(X, 2, 3),
|
||||
np.array([4, 1, 2]))
|
||||
np.testing.assert_array_equal(_find_subarray_kmp(X, 2, 2), np.array([4,
|
||||
1]))
|
||||
np.testing.assert_array_equal(_find_subarray_kmp(X, 1, 3),
|
||||
np.array([4, 1, 2]))
|
||||
np.testing.assert_array_equal(_find_subarray_kmp(X, 1, 2), np.array([4,
|
||||
1]))
|
||||
X = np.array([1, 3, 6, 2, 3, 4, 1, 2, 3])
|
||||
np.testing.assert_array_equal(_find_subarray_kmp(X, 2, 3),
|
||||
np.array([4, 1, 2]))
|
||||
tokens = np.array([1, 3, 6, 2, 3, 4, 1, 2, 3])
|
||||
np.testing.assert_array_equal(
|
||||
_find_longest_matched_ngram_and_propose_tokens(origin_tokens=tokens,
|
||||
min_ngram=2,
|
||||
max_ngram=2,
|
||||
max_model_len=1024,
|
||||
k=3),
|
||||
np.array([4, 1, 2]))
|
||||
# Return on the first match
|
||||
np.testing.assert_array_equal(_find_subarray_kmp(X, 1, 3),
|
||||
np.array([6, 2, 3]))
|
||||
np.testing.assert_array_equal(
|
||||
_find_longest_matched_ngram_and_propose_tokens(origin_tokens=tokens,
|
||||
min_ngram=1,
|
||||
max_ngram=1,
|
||||
max_model_len=1024,
|
||||
k=2), np.array([6, 2]))
|
||||
|
||||
|
||||
def test_ngram_proposer():
|
||||
@ -56,27 +76,35 @@ def test_ngram_proposer():
|
||||
|
||||
# No match.
|
||||
result = ngram_proposer(
|
||||
2, 2, 2).propose(context_token_ids=np.array([1, 2, 3, 4, 5]))
|
||||
min_n=2, max_n=2,
|
||||
k=2).propose(context_token_ids=np.array([1, 2, 3, 4, 5]))
|
||||
assert result is None
|
||||
|
||||
# No match for 4-gram.
|
||||
result = ngram_proposer(
|
||||
4, 4, 2).propose(context_token_ids=np.array([1, 2, 3, 4, 1, 2, 3]))
|
||||
min_n=4, max_n=4,
|
||||
k=2).propose(context_token_ids=np.array([1, 2, 3, 4, 1, 2, 3]))
|
||||
assert result is None
|
||||
|
||||
# No match for 4-gram but match for 3-gram.
|
||||
result = ngram_proposer(
|
||||
3, 4, 2).propose(context_token_ids=np.array([1, 2, 3, 4, 1, 2, 3]))
|
||||
min_n=3, max_n=4,
|
||||
k=2).propose(context_token_ids=np.array([1, 2, 3, 4, 1, 2, 3]))
|
||||
assert np.array_equal(result, np.array([4, 1]))
|
||||
|
||||
# Match for both 4-gram and 3-gram.
|
||||
# In this case, the proposer should return the 4-gram match.
|
||||
result = ngram_proposer(3, 4, 2).propose(
|
||||
result = ngram_proposer(min_n=3, max_n=4, k=2).propose(
|
||||
context_token_ids=np.array([2, 3, 4, 5, 1, 2, 3, 4, 1, 2, 3, 4]))
|
||||
assert np.array_equal(result, np.array([1, 2])) # Not [5, 1]
|
||||
|
||||
# Match for 2-gram and 3-gram, but not 4-gram.
|
||||
result = ngram_proposer(
|
||||
2, 4,
|
||||
2).propose(context_token_ids=np.array([3, 4, 5, 2, 3, 4, 1, 2, 3, 4]))
|
||||
result = ngram_proposer(min_n=2, max_n=4, k=2).propose(
|
||||
context_token_ids=np.array([3, 4, 5, 2, 3, 4, 1, 2, 3, 4]))
|
||||
assert np.array_equal(result, np.array([1, 2])) # Not [5, 2]
|
||||
|
||||
# Multiple 3-gram matched, but always pick the first one.
|
||||
result = ngram_proposer(
|
||||
min_n=3, max_n=3, k=2).propose(context_token_ids=np.array(
|
||||
[1, 2, 3, 100, 1, 2, 3, 200, 1, 2, 3, 300, 1, 2, 3]))
|
||||
assert np.array_equal(result, np.array([100, 1]))
|
||||
|
||||
@ -297,7 +297,7 @@ class CustomAllreduce:
|
||||
@staticmethod
|
||||
def free_shared_buffer(pointers: list[int],
|
||||
group: Optional[ProcessGroup] = None,
|
||||
rank: Optional[int] = 0) -> None:
|
||||
rank: Optional[int] = None) -> None:
|
||||
if rank is None:
|
||||
rank = dist.get_rank(group=group)
|
||||
if ops is not None:
|
||||
|
||||
@ -11,6 +11,10 @@ from vllm.config import VllmConfig
|
||||
class NgramProposer:
|
||||
|
||||
def __init__(self, vllm_config: VllmConfig):
|
||||
assert vllm_config.speculative_config is not None
|
||||
assert vllm_config.speculative_config.prompt_lookup_min is not None
|
||||
assert vllm_config.speculative_config.prompt_lookup_max is not None
|
||||
|
||||
# Minimum length of the n-gram to match.
|
||||
self.min_n = vllm_config.speculative_config.prompt_lookup_min
|
||||
# Maximum length of the n-gram to match.
|
||||
@ -54,17 +58,13 @@ class NgramProposer:
|
||||
followed that pattern. Here we will return [4,2,3] because
|
||||
we only have three tokens after the match.
|
||||
"""
|
||||
# Do not generate draft tokens beyond the max model length.
|
||||
k = min(self.k, self.max_model_len - context_token_ids.shape[0])
|
||||
if k <= 0:
|
||||
return None
|
||||
|
||||
# TODO(woosuk): Optimize this.
|
||||
for n in range(self.max_n, self.min_n - 1, -1):
|
||||
result = _find_subarray_kmp(context_token_ids, n, k)
|
||||
if result is not None:
|
||||
return result
|
||||
return None
|
||||
return _find_longest_matched_ngram_and_propose_tokens(
|
||||
origin_tokens=context_token_ids,
|
||||
min_ngram=self.min_n,
|
||||
max_ngram=self.max_n,
|
||||
max_model_len=self.max_model_len,
|
||||
k=self.k)
|
||||
|
||||
def load_model(self, *args, **kwargs):
|
||||
# No model to load.
|
||||
@ -72,61 +72,86 @@ class NgramProposer:
|
||||
|
||||
|
||||
@jit(nopython=True)
|
||||
def _kmp_lps_array(pattern: np.ndarray) -> np.ndarray:
|
||||
def _find_longest_matched_ngram_and_propose_tokens(
|
||||
origin_tokens: np.ndarray, min_ngram: int, max_ngram: int,
|
||||
max_model_len: int, k: int) -> Optional[np.ndarray]:
|
||||
"""
|
||||
Build the lps (longest proper prefix which is also suffix)
|
||||
array for the pattern.
|
||||
Find the longest n-gram which matches the suffix of the given tokens
|
||||
whose length is within [min_ngram, max_ngram] (inclusive).
|
||||
|
||||
If found, we will extract k right after the matched ngram.
|
||||
"""
|
||||
lps = np.zeros(len(pattern), dtype=np.int32)
|
||||
prev_lps = 0 # length of the previous longest prefix suffix
|
||||
# Do not generate draft tokens is context is shorter than minimum n-gram
|
||||
total_token = origin_tokens.shape[0]
|
||||
if total_token < min_ngram:
|
||||
return None
|
||||
|
||||
# Do not generate draft tokens beyond the max model length.
|
||||
k = min(k, max_model_len - total_token)
|
||||
if k <= 0:
|
||||
return None
|
||||
|
||||
# Flip tokens, and the goal become to find longest ngram
|
||||
# on the rightmost position which matches the prefix with
|
||||
# length [min_n, max_n] (inclusive).
|
||||
tokens = origin_tokens[::-1]
|
||||
|
||||
# Longest prefix (not including itself) which is a suffix of
|
||||
# the current position.
|
||||
# lps[i] = max{v, where tokens[0:v] == tokens[i+1-v:i+1]}
|
||||
#
|
||||
# As ngram is capped by max_ngram to save memory, we only need to
|
||||
# store lps for the first max_ngram prefix.
|
||||
lps = np.zeros(max_ngram, dtype=np.int32)
|
||||
|
||||
longest_ngram = 0
|
||||
position = 0
|
||||
|
||||
# lps[0] always equal to 0, we starts with index 1
|
||||
prev_lps = 0
|
||||
i = 1
|
||||
|
||||
while i < len(pattern):
|
||||
if pattern[i] == pattern[prev_lps]:
|
||||
while i < total_token:
|
||||
# tokens[:prev_lps] is the longest prefix as a suffix of tokens[:i]
|
||||
if tokens[prev_lps] == tokens[i]:
|
||||
# Token match: tokens[:prev_lps+1] is the longest prefix as
|
||||
# a suffix of tokens[:i+1]
|
||||
prev_lps += 1
|
||||
lps[i] = prev_lps
|
||||
# Check if we found a longer valid ngram.
|
||||
#
|
||||
# Update position when longest_ngram matched prev_lps,
|
||||
# as we want to get the target n-gram of the earliest position
|
||||
# in the original tokens (i.e.
|
||||
# latest position in the reversed tokens)
|
||||
if prev_lps >= longest_ngram:
|
||||
longest_ngram = prev_lps
|
||||
position = i
|
||||
if i < max_ngram:
|
||||
# Store LPS for the first max_ngram prefix
|
||||
lps[i] = prev_lps
|
||||
if prev_lps == max_ngram:
|
||||
# When prev_lps reached max_ngram, update prev_lps
|
||||
# to lps[max_ngram-1] to avoid matching ngram
|
||||
# longer than max_ngram
|
||||
prev_lps = lps[max_ngram - 1]
|
||||
i += 1
|
||||
elif prev_lps != 0:
|
||||
# Token mismatch: try the second longest prefix
|
||||
# among all suffix of tokens[:i],
|
||||
# which is the longest prefix of tokens[:prev_lps]
|
||||
prev_lps = lps[prev_lps - 1]
|
||||
else:
|
||||
if prev_lps != 0:
|
||||
prev_lps = lps[prev_lps - 1]
|
||||
else:
|
||||
lps[i] = 0
|
||||
i += 1
|
||||
return lps
|
||||
|
||||
|
||||
@jit(nopython=True)
|
||||
def _find_subarray_kmp(
|
||||
context_token_ids: np.ndarray,
|
||||
n: int,
|
||||
k: int,
|
||||
) -> Optional[np.ndarray]:
|
||||
context_len = context_token_ids.shape[0]
|
||||
assert n > 0
|
||||
|
||||
pattern = context_token_ids[-n:]
|
||||
# Precompute lps array for Y
|
||||
lps = _kmp_lps_array(pattern)
|
||||
|
||||
i = 0
|
||||
j = 0
|
||||
# -n because the last n tokens are used as pattern
|
||||
while i < context_len - n:
|
||||
if context_token_ids[i] == pattern[j]:
|
||||
# Token mismatch, and no more prefix (except empty string)
|
||||
# as a suffix of tokens[:i]
|
||||
i += 1
|
||||
j += 1
|
||||
|
||||
# If we have matched the entire Y
|
||||
if j == n:
|
||||
# Found pattern in context, gather the next K elements
|
||||
return context_token_ids[i:i + k]
|
||||
else:
|
||||
# Mismatch
|
||||
if j != 0:
|
||||
# Use the lps array to avoid re-checking elements
|
||||
j = lps[j - 1]
|
||||
else:
|
||||
i += 1
|
||||
if longest_ngram < min_ngram:
|
||||
# No valid ngram is found
|
||||
return None
|
||||
|
||||
# Y not found
|
||||
return None
|
||||
# Flip the position back, so in origin_tokens,
|
||||
# origin_tokens[total_token-1-position:total_token-1-position+longest_ngram]
|
||||
# is the matched ngram, so we should start drafting tokens from
|
||||
# total_token-1-position+longest_ngram
|
||||
start_position = total_token - 1 - position + longest_ngram
|
||||
k = min(k, total_token - start_position)
|
||||
return origin_tokens[start_position:start_position + k]
|
||||
|
||||
Reference in New Issue
Block a user