Compare commits
13 Commits
releases/v... → main

| SHA1 |
|---|
| 36960501d3 |
| b2e65cb4a7 |
| 2bf0bcc1fc |
| 697f507a8e |
| d5d2a0fe74 |
| c9791f1813 |
| e7acb20076 |
| 4b68c4a55b |
| a8141fa649 |
| 4917002523 |
| a2981c4272 |
| 4574d48bab |
| ab98f6556f |
@@ -7,7 +7,7 @@ vLLM also maintains a continuous performance benchmark under [perf.vllm.ai](http

## Performance benchmark quick overview

-**Benchmarking Coverage**: latency, throughput and fix-qps serving on B200, A100, H100 and Intel® Xeon® Processors, with different models.
+**Benchmarking Coverage**: latency, throughput and fix-qps serving on B200, A100, H100, Intel® Xeon® Processors and Intel® Gaudi® 3 Accelerators with different models.

**Benchmarking Duration**: about 1hr.

@@ -34,6 +34,7 @@ Runtime environment variables:

See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.

> NOTE: For Intel® Xeon® Processors, use `tests/latency-tests-cpu.json`, `tests/throughput-tests-cpu.json`, `tests/serving-tests-cpu.json` instead.
+> For Intel® Gaudi® 3 Accelerators, use `tests/latency-tests-hpu.json`, `tests/throughput-tests-hpu.json`, `tests/serving-tests-hpu.json` instead.
>

### Latency test
@@ -5,7 +5,7 @@

- Input length: 32 tokens.
- Output length: 128 tokens.
- Batch size: fixed (8).
-- GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
+- GPU/HPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
- CPU Models: llama-3.1 8B.
- Evaluation metrics: end-to-end latency (mean, median, p99).
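As an aside on the evaluation metrics in this list: the reported mean, median and p99 are ordinary summary statistics over the per-run end-to-end latencies. A minimal illustration with made-up numbers (not the benchmark harness's own code):

```python
# Sketch only: summarizing end-to-end latencies the way the result tables report them.
import numpy as np

latencies_s = [1.92, 2.01, 1.98, 2.10, 1.95, 2.40, 1.97, 2.02]  # hypothetical samples

print(f"mean   {np.mean(latencies_s):.3f} s")
print(f"median {np.percentile(latencies_s, 50):.3f} s")
print(f"p99    {np.percentile(latencies_s, 99):.3f} s")
```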
@@ -16,7 +16,7 @@

- Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
- Output length: the corresponding output length of these 200 prompts.
- Batch size: dynamically determined by vllm to achieve maximum throughput.
-- GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
+- GPU/HPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
- CPU Models: llama-3.1 8B.
- Evaluation metrics: throughput.
@@ -28,7 +28,7 @@

- Output length: the corresponding output length of these 200 prompts.
- Batch size: dynamically determined by vllm and the arrival pattern of the requests.
- **Average QPS (query per second)**: 1, 4, 16 and inf. QPS = inf means all requests come at once. For other QPS values, the arrival time of each query is determined using a random Poisson process (with fixed random seed).
-- GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
+- GPU/HPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
- We also added a speculative decoding test for llama-3 70B on GPU, under QPS 2
- CPU Models: llama-3.1 8B.
- Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99).
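On the QPS model described in this list: a Poisson arrival process at rate `qps` is equivalent to drawing exponentially distributed inter-arrival gaps with mean `1/qps`, so a fixed seed makes the entire arrival schedule reproducible. A minimal sketch of that idea (illustrative only, not the benchmark client's actual code):

```python
# Sketch: Poisson request arrivals at a given average QPS, reproducible via a fixed seed.
import random


def poisson_arrival_times(num_requests: int, qps: float, seed: int = 0) -> list[float]:
    rng = random.Random(seed)
    t, times = 0.0, []
    for _ in range(num_requests):
        t += rng.expovariate(qps)  # exponential gaps <=> Poisson arrival process
        times.append(t)
    return times


print(poisson_arrival_times(5, qps=4))  # roughly 4 requests per second on average
```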
@@ -15,6 +15,8 @@ check_gpus() {
    declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
  elif command -v amd-smi; then
    declare -g gpu_count=$(amd-smi list | grep 'GPU' | wc -l)
+  elif command -v hl-smi; then
+    declare -g gpu_count=$(hl-smi --list | grep -i "Module ID" | wc -l)
  fi

  if [[ $gpu_count -gt 0 ]]; then
@@ -23,10 +25,16 @@ check_gpus() {
    echo "Need at least 1 GPU to run benchmarking."
    exit 1
  fi

+  declare -g arch_suffix=''

  if command -v nvidia-smi; then
    declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
  elif command -v amd-smi; then
    declare -g gpu_type=$(amd-smi static -g 0 -a | grep 'MARKET_NAME' | awk '{print $2}')
+  elif command -v hl-smi; then
+    declare -g gpu_type=$(hl-smi -q | grep "Product Name" | head -n 1 | awk -F ':' '{print $2}' | sed 's/^ *//')
+    arch_suffix='-hpu'
  fi
  echo "GPU type is $gpu_type"
}
@@ -138,6 +146,10 @@ kill_gpu_processes() {
    while [ "$(amd-smi metric -g 0 | grep 'USED_VRAM' | awk '{print $2}')" -ge 1000 ]; do
      sleep 1
    done
+  elif command -v hl-smi; then
+    while [ "$(hl-smi -q | grep "Used" | head -n 1 | awk '{print $3}')" -ge 1000 ]; do
+      sleep 1
+    done
  fi

  # remove vllm config file
@@ -451,6 +463,7 @@ main() {
    ARCH='-cpu'
  else
    check_gpus
+    ARCH="$arch_suffix"
  fi
  check_hf_token

@@ -0,0 +1,55 @@
[
  {
    "test_name": "latency_llama8B_tp1",
    "environment_variables": {
      "PT_HPU_LAZY_MODE": 1,
      "VLLM_CONTIGUOUS_PA": 1,
      "VLLM_DEFRAG": 1
    },
    "parameters": {
      "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
      "tensor_parallel_size": 1,
      "load_format": "dummy",
      "num-iters-warmup": 5,
      "num-iters": 15,
      "max-model-len": 256,
      "async-scheduling": ""
    }
  },
  {
    "test_name": "latency_llama70B_tp4",
    "environment_variables": {
      "PT_HPU_LAZY_MODE": 1,
      "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
      "VLLM_CONTIGUOUS_PA": 1,
      "VLLM_DEFRAG": 1
    },
    "parameters": {
      "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
      "tensor_parallel_size": 4,
      "load_format": "dummy",
      "num-iters-warmup": 5,
      "num-iters": 15,
      "max-model-len": 256,
      "async-scheduling": ""
    }
  },
  {
    "test_name": "latency_mixtral8x7B_tp2",
    "environment_variables": {
      "PT_HPU_LAZY_MODE": 1,
      "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
      "VLLM_CONTIGUOUS_PA": 1,
      "VLLM_DEFRAG": 1
    },
    "parameters": {
      "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
      "tensor_parallel_size": 2,
      "load_format": "dummy",
      "num-iters-warmup": 5,
      "num-iters": 15,
      "max-model-len": 256,
      "async-scheduling": ""
    }
  }
]
@@ -0,0 +1,82 @@
[
  {
    "test_name": "serving_llama8B_tp1_sharegpt",
    "qps_list": [1, 4, 16, "inf"],
    "server_environment_variables": {
      "PT_HPU_LAZY_MODE": 1,
      "VLLM_CONTIGUOUS_PA": 1,
      "VLLM_DEFRAG": 1
    },
    "server_parameters": {
      "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
      "tensor_parallel_size": 1,
      "swap_space": 16,
      "disable_log_stats": "",
      "load_format": "dummy",
      "max-model-len": 2048,
      "max-num-seqs": 256,
      "async-scheduling": ""
    },
    "client_parameters": {
      "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
      "backend": "vllm",
      "dataset_name": "sharegpt",
      "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
      "num_prompts": 200
    }
  },
  {
    "test_name": "serving_llama70B_tp4_sharegpt",
    "qps_list": [1, 4, 16, "inf"],
    "server_environment_variables": {
      "PT_HPU_LAZY_MODE": 1,
      "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
      "VLLM_CONTIGUOUS_PA": 1,
      "VLLM_DEFRAG": 1
    },
    "server_parameters": {
      "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
      "tensor_parallel_size": 4,
      "swap_space": 16,
      "disable_log_stats": "",
      "load_format": "dummy",
      "max-model-len": 2048,
      "max-num-seqs": 256,
      "async-scheduling": ""
    },
    "client_parameters": {
      "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
      "backend": "vllm",
      "dataset_name": "sharegpt",
      "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
      "num_prompts": 200
    }
  },
  {
    "test_name": "serving_mixtral8x7B_tp2_sharegpt",
    "qps_list": [1, 4, 16, "inf"],
    "server_environment_variables": {
      "PT_HPU_LAZY_MODE": 1,
      "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
      "VLLM_CONTIGUOUS_PA": 1,
      "VLLM_DEFRAG": 1
    },
    "server_parameters": {
      "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
      "tensor_parallel_size": 2,
      "swap_space": 16,
      "disable_log_stats": "",
      "load_format": "dummy",
      "max-model-len": 2048,
      "max-num-seqs": 256,
      "async-scheduling": ""
    },
    "client_parameters": {
      "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
      "backend": "vllm",
      "dataset_name": "sharegpt",
      "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
      "num_prompts": 200
    }
  }
]
@@ -0,0 +1,61 @@
[
  {
    "test_name": "throughput_llama8B_tp1",
    "environment_variables": {
      "PT_HPU_LAZY_MODE": 1,
      "VLLM_CONTIGUOUS_PA": 1,
      "VLLM_DEFRAG": 1
    },
    "parameters": {
      "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
      "tensor_parallel_size": 1,
      "load_format": "dummy",
      "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
      "num_prompts": 1000,
      "backend": "vllm",
      "max-model-len": 2048,
      "max-num-seqs": 512,
      "async-scheduling": ""
    }
  },
  {
    "test_name": "throughput_llama70B_tp4",
    "environment_variables": {
      "PT_HPU_LAZY_MODE": 1,
      "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
      "VLLM_CONTIGUOUS_PA": 1,
      "VLLM_DEFRAG": 1
    },
    "parameters": {
      "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
      "tensor_parallel_size": 4,
      "load_format": "dummy",
      "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
      "num_prompts": 1000,
      "backend": "vllm",
      "max-model-len": 2048,
      "max-num-seqs": 512,
      "async-scheduling": ""
    }
  },
  {
    "test_name": "throughput_mixtral8x7B_tp2",
    "environment_variables": {
      "PT_HPU_LAZY_MODE": 1,
      "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
      "VLLM_CONTIGUOUS_PA": 1,
      "VLLM_DEFRAG": 1
    },
    "parameters": {
      "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
      "tensor_parallel_size": 2,
      "load_format": "dummy",
      "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
      "num_prompts": 1000,
      "backend": "vllm",
      "max-model-len": 2048,
      "max-num-seqs": 512,
      "async-scheduling": ""
    }
  }
]
@@ -0,0 +1,62 @@
#!/usr/bin/env bash
set -euxo pipefail

# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
THRESHOLD=${1:-0.25}
NUM_Q=${2:-1319}
PORT=${3:-8010}
OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
mkdir -p "${OUT_DIR}"

wait_for_server() {
  local port=$1
  timeout 600 bash -c '
    until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do
      sleep 1
    done'
}

MODEL="deepseek-ai/DeepSeek-V2-lite"
BACKENDS=("deepep_high_throughput" "deepep_low_latency")

cleanup() {
  if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
    kill "${SERVER_PID}" 2>/dev/null || true
    for _ in {1..20}; do
      kill -0 "${SERVER_PID}" 2>/dev/null || break
      sleep 0.5
    done
    kill -9 "${SERVER_PID}" 2>/dev/null || true
  fi
}
trap cleanup EXIT

for BACK in "${BACKENDS[@]}"; do
  VLLM_DEEP_GEMM_WARMUP=skip \
  VLLM_ALL2ALL_BACKEND=$BACK \
  vllm serve "$MODEL" \
    --enforce-eager \
    --tensor-parallel-size 2 \
    --data-parallel-size 2 \
    --enable-expert-parallel \
    --enable-eplb \
    --trust-remote-code \
    --max-model-len 2048 \
    --port $PORT &
  SERVER_PID=$!
  wait_for_server $PORT

  TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
  OUT="${OUT_DIR}/${TAG}_${BACK}.json"
  python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
  python3 - <<PY
import json; acc=json.load(open('${OUT}'))['accuracy']
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
assert acc >= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}"
PY

  cleanup
  SERVER_PID=
  sleep 1
  PORT=$((PORT+1))
done
@@ -0,0 +1,61 @@
#!/usr/bin/env bash
set -euxo pipefail

# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
THRESHOLD=${1:-0.8}
NUM_Q=${2:-1319}
PORT=${3:-8020}
OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
mkdir -p "${OUT_DIR}"

wait_for_server() {
  local port=$1
  timeout 600 bash -c '
    until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do
      sleep 1
    done'
}

MODEL="QWen/Qwen3-30B-A3B-FP8"
BACKENDS=("deepep_high_throughput" "deepep_low_latency")

cleanup() {
  if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
    kill "${SERVER_PID}" 2>/dev/null || true
    for _ in {1..20}; do
      kill -0 "${SERVER_PID}" 2>/dev/null || break
      sleep 0.5
    done
    kill -9 "${SERVER_PID}" 2>/dev/null || true
  fi
}
trap cleanup EXIT

for BACK in "${BACKENDS[@]}"; do
  VLLM_DEEP_GEMM_WARMUP=skip \
  VLLM_ALL2ALL_BACKEND=$BACK \
  vllm serve "$MODEL" \
    --enforce-eager \
    --tensor-parallel-size 2 \
    --data-parallel-size 2 \
    --enable-expert-parallel \
    --trust-remote-code \
    --max-model-len 2048 \
    --port $PORT &
  SERVER_PID=$!
  wait_for_server $PORT

  TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
  OUT="${OUT_DIR}/${TAG}_${BACK}.json"
  python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
  python3 - <<PY
import json; acc=json.load(open('${OUT}'))['accuracy']
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
assert acc >= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}"
PY

  cleanup
  SERVER_PID=
  sleep 1
  PORT=$((PORT+1))
done
@@ -1234,3 +1234,21 @@ steps:
  - .buildkite/scripts/run-prime-rl-test.sh
  commands:
  - bash .buildkite/scripts/run-prime-rl-test.sh
+
+- label: DeepSeek V2-Lite Accuracy
+  timeout_in_minutes: 60
+  gpu: h100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace"
+  commands:
+  - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010
+
+- label: Qwen3-30B-A3B-FP8-block Accuracy
+  timeout_in_minutes: 60
+  gpu: h100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace"
+  commands:
+  - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh 0.8 200 8020
@@ -1,10 +1,12 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

+import asyncio
+
import pytest
import torch

-from vllm import LLM, SamplingParams
+from vllm import LLM, AsyncEngineArgs, AsyncLLMEngine, SamplingParams
from vllm.device_allocator.cumem import CuMemAllocator
from vllm.utils.mem_constants import GiB_bytes

@@ -201,3 +203,42 @@ def test_deep_sleep():

    # cmp output
    assert output[0].outputs[0].text == output2[0].outputs[0].text
+
+
+@create_new_process_for_each_test()
+def test_deep_sleep_async():
+    async def test():
+        model = "hmellor/tiny-random-LlamaForCausalLM"
+        free, total = torch.cuda.mem_get_info()
+        used_bytes_baseline = total - free  # in case other process is running
+        engine_args = AsyncEngineArgs(
+            model=model,
+            enable_sleep_mode=True,
+        )
+
+        llm = AsyncLLMEngine.from_engine_args(engine_args)
+        prompt = "How are you?"
+        sampling_params = SamplingParams(temperature=0, max_tokens=10)
+        outputs = llm.generate(prompt, sampling_params, request_id="test_request_id1")
+        async for output in outputs:
+            pass
+
+        # Put the engine to deep sleep
+        await llm.sleep(level=2)
+
+        await llm.wake_up(tags=["weights"])
+        await llm.collective_rpc("reload_weights")
+        free_gpu_bytes_wake_up_w, total = torch.cuda.mem_get_info()
+        used_bytes = total - free_gpu_bytes_wake_up_w - used_bytes_baseline
+        assert used_bytes < 4 * GiB_bytes
+
+        # now allocate kv cache and cuda graph memory
+        await llm.wake_up(tags=["kv_cache"])
+        outputs2 = llm.generate(prompt, sampling_params, request_id="test_request_id2")
+        async for output2 in outputs2:
+            pass
+
+        # cmp output
+        assert output.outputs[0].text == output2.outputs[0].text
+
+    asyncio.run(test())
@@ -651,3 +651,79 @@ async def test_serving_chat_did_set_correct_cache_salt(model_type):
    await serving_chat.create_chat_completion(req)
    engine_prompt = serving_chat._process_inputs.await_args_list[1].args[1]
    assert engine_prompt.get("cache_salt") == "test_salt"
+
+
+@pytest.mark.asyncio
+async def test_serving_chat_data_parallel_rank_extraction():
+    """Test that data_parallel_rank is properly extracted from header and
+    passed to engine."""
+    mock_engine = MagicMock(spec=AsyncLLM)
+    mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
+    mock_engine.errored = False
+    mock_engine.model_config = MockModelConfig()
+    mock_engine.processor = MagicMock()
+    mock_engine.io_processor = MagicMock()
+
+    # Mock the generate method to return an async generator
+    async def mock_generate(*args, **kwargs):
+        # Yield a fake RequestOutput
+        from vllm.outputs import CompletionOutput, RequestOutput
+
+        yield RequestOutput(
+            request_id="test-request",
+            prompt="test prompt",
+            prompt_token_ids=[1, 2, 3],
+            prompt_logprobs=None,
+            outputs=[
+                CompletionOutput(
+                    index=0,
+                    text="test response",
+                    token_ids=[4, 5, 6],
+                    cumulative_logprob=0.0,
+                    logprobs=None,
+                    finish_reason="stop",
+                    stop_reason=None,
+                )
+            ],
+            finished=True,
+        )
+
+    mock_engine.generate = AsyncMock(side_effect=mock_generate)
+
+    serving_chat = _build_serving_chat(mock_engine)
+
+    # Test when data_parallel_rank is present in header
+    req = ChatCompletionRequest(
+        model=MODEL_NAME,
+        messages=[{"role": "user", "content": "what is 1+1?"}],
+    )
+
+    # Mock request with X-data-parallel-rank header
+    mock_raw_request = MagicMock()
+    mock_raw_request.headers = {"X-data-parallel-rank": "2"}
+    mock_raw_request.state = MagicMock()
+
+    with suppress(Exception):
+        await serving_chat.create_chat_completion(req, mock_raw_request)
+
+    # Verify that data_parallel_rank was passed to engine.generate
+    assert "data_parallel_rank" in mock_engine.generate.call_args.kwargs
+    assert mock_engine.generate.call_args.kwargs["data_parallel_rank"] == 2
+
+    # Test when data_parallel_rank is not present (defaults to None)
+    req_no_dp = ChatCompletionRequest(
+        model=MODEL_NAME,
+        messages=[{"role": "user", "content": "what is 2+2?"}],
+    )
+
+    # Mock request with no header
+    mock_raw_request_no_dp = MagicMock()
+    mock_raw_request_no_dp.headers = {}
+    mock_raw_request_no_dp.state = MagicMock()
+
+    with suppress(Exception):
+        await serving_chat.create_chat_completion(req_no_dp, mock_raw_request_no_dp)
+
+    # Verify that data_parallel_rank defaults to None
+    assert "data_parallel_rank" in mock_engine.generate.call_args.kwargs
+    assert mock_engine.generate.call_args.kwargs["data_parallel_rank"] is None
tools/check_repo.sh (0 lines changed): Normal file → Executable file
tools/ep_kernels/configure_system_drivers.sh (0 lines changed): Normal file → Executable file
tools/ep_kernels/elastic_ep/install_eep_libraries.sh (0 lines changed): Normal file → Executable file
tools/ep_kernels/install_python_libraries.sh (1 line changed): Normal file → Executable file
@@ -1,3 +1,4 @@
+#!/usr/bin/env bash
set -ex

# prepare workspace directory
tools/flashinfer-build.sh (0 lines changed): Normal file → Executable file
tools/vllm-tpu/build.sh (0 lines changed): Normal file → Executable file
@@ -26,6 +26,7 @@ import os
import random
import shutil
import time
+import uuid
import warnings
from collections.abc import AsyncGenerator, Iterable
from dataclasses import dataclass
@@ -1160,7 +1161,7 @@ def add_cli_args(parser: argparse.ArgumentParser):
        "--request-id-prefix",
        type=str,
        required=False,
-        default="benchmark-serving",
+        default=f"bench-{uuid.uuid4().hex[:8]}-",
        help="Specify the prefix of request id.",
    )

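The changed default above trades the fixed "benchmark-serving" prefix for one that is unique per invocation, which keeps request ids from colliding across runs. A tiny illustration (the hex suffix is random, so the value shown is hypothetical):

```python
# Illustration: the new default request-id prefix is regenerated on every run.
import uuid

prefix = f"bench-{uuid.uuid4().hex[:8]}-"
print(prefix)  # e.g. "bench-3f9a1c2b-"
```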
@@ -20,9 +20,6 @@ from vllm.config.pooler import PoolerConfig
from vllm.config.scheduler import RunnerType
from vllm.config.utils import assert_hashable, config, getattr_iter
from vllm.logger import init_logger
-from vllm.model_executor.layers.batch_invariant import (
-    vllm_is_batch_invariant,
-)
from vllm.platforms import current_platform
from vllm.transformers_utils.config import (
    ConfigFormat,
@@ -436,10 +433,6 @@ class ModelConfig:
        skip_mm_profiling: bool | None,
        video_pruning_rate: float | None,
    ) -> None:
-        # Enable batch invariance settings if requested
-        if vllm_is_batch_invariant():
-            self.enforce_eager = True
-
        # Set the default seed to 0 in V1.
        # NOTE(woosuk): In V1, we use separate processes for workers (unless
        # VLLM_ENABLE_V1_MULTIPROCESSING=0), so setting a seed here
@@ -363,7 +363,7 @@ class DeepEPLLAll2AllManager(DeepEPAll2AllManagerBase):
            num_rdma_bytes=num_rdma_bytes,
            low_latency_mode=True,
            num_qps_per_rank=num_qps_per_rank,
-            allow_nvlink_for_low_latency_mode=envs.VLLM_DEEPEP_LOW_LATENCY_ALLOW_NVLINK,
+            allow_nvlink_for_low_latency_mode=True,
            allow_mnnvl=envs.VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL,
        )

@@ -264,6 +264,9 @@ class OpenAIServingChat(OpenAIServing):
        if raw_request:
            raw_request.state.request_metadata = request_metadata

+        # Extract data_parallel_rank from header (router can inject it)
+        data_parallel_rank = self._get_data_parallel_rank(raw_request)
+
        # Schedule the request and get the result generator.
        generators: list[AsyncGenerator[RequestOutput, None]] = []
        try:
@@ -331,6 +334,7 @@
                    priority=request.priority,
                    prompt_text=prompt_text,
                    tokenization_kwargs=tokenization_kwargs,
+                    data_parallel_rank=data_parallel_rank,
                )

                generators.append(generator)
@@ -141,6 +141,9 @@ class OpenAIServingCompletion(OpenAIServing):
            logger.exception("Error in preprocessing prompt inputs")
            return self.create_error_response(str(e))

+        # Extract data_parallel_rank from header (router can inject it)
+        data_parallel_rank = self._get_data_parallel_rank(raw_request)
+
        # Schedule the request and get the result generator.
        generators: list[AsyncGenerator[RequestOutput, None]] = []
        try:
@@ -224,6 +227,7 @@
                    priority=request.priority,
                    prompt_text=prompt_text,
                    tokenization_kwargs=tokenization_kwargs,
+                    data_parallel_rank=data_parallel_rank,
                )

                generators.append(generator)
@@ -1298,6 +1298,21 @@ class OpenAIServing:

        return raw_request.headers.get("X-Request-Id", default)

+    @staticmethod
+    def _get_data_parallel_rank(raw_request: Request | None) -> int | None:
+        """Pulls the data parallel rank from a header, if provided"""
+        if raw_request is None:
+            return None
+
+        rank_str = raw_request.headers.get("X-data-parallel-rank")
+        if rank_str is None:
+            return None
+
+        try:
+            return int(rank_str)
+        except ValueError:
+            return None
+
    @staticmethod
    def _get_decoded_token(
        logprob: Logprob,
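To see the new `_get_data_parallel_rank` helper from the client side: a router (or any caller) pins a request to a particular data-parallel rank simply by setting the `X-data-parallel-rank` header on an ordinary OpenAI-compatible call. A hedged sketch, assuming a vLLM server on localhost:8000 and the `requests` library; the model name is reused from the benchmark configs above, not prescribed by this diff:

```python
# Hypothetical client-side sketch: route a chat completion to data-parallel rank 2
# via the X-data-parallel-rank header that OpenAIServing now reads.
import requests

resp = requests.post(
    "http://localhost:8000/v1/chat/completions",
    headers={"X-data-parallel-rank": "2"},  # parsed by _get_data_parallel_rank()
    json={
        "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
        "messages": [{"role": "user", "content": "what is 1+1?"}],
    },
    timeout=60,
)
print(resp.json()["choices"][0]["message"]["content"])
```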
vllm/envs.py (15 lines changed)
@@ -207,7 +207,6 @@ if TYPE_CHECKING:
    VLLM_OBJECT_STORAGE_SHM_BUFFER_NAME: str = "VLLM_OBJECT_STORAGE_SHM_BUFFER"
    VLLM_DEEPEP_BUFFER_SIZE_MB: int = 1024
    VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE: bool = False
-    VLLM_DEEPEP_LOW_LATENCY_ALLOW_NVLINK: bool = True
    VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL: bool = False
    VLLM_DBO_COMM_SMS: int = 20
    VLLM_PATTERN_MATCH_DEBUG: str | None = None
@@ -252,6 +251,9 @@ def disable_compile_cache() -> bool:


def use_aot_compile() -> bool:
+    from vllm.model_executor.layers.batch_invariant import (
+        vllm_is_batch_invariant,
+    )
    from vllm.utils.torch_utils import is_torch_equal_or_newer

    default_value = (
@@ -260,7 +262,10 @@ def use_aot_compile() -> bool:
        else "0"
    )

-    return os.environ.get("VLLM_USE_AOT_COMPILE", default_value) == "1"
+    return (
+        not vllm_is_batch_invariant()
+        and os.environ.get("VLLM_USE_AOT_COMPILE", default_value) == "1"
+    )


def env_with_choices(
@@ -1400,11 +1405,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
    "VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE": lambda: bool(
        int(os.getenv("VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE", "0"))
    ),
-    # Allow DeepEP to use nvlink for internode_ll kernel, turn this on for
-    # better latency on GB200 like system
-    "VLLM_DEEPEP_LOW_LATENCY_ALLOW_NVLINK": lambda: bool(
-        int(os.getenv("VLLM_DEEPEP_LOW_LATENCY_ALLOW_NVLINK", "1"))
-    ),
    # Allow DeepEP to use MNNVL (multi-node nvlink) for internode_ll kernel,
    # turn this for better latency on GB200 like system
    "VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL": lambda: bool(
@@ -1566,7 +1566,6 @@ def compute_hash() -> str:
        "VLLM_NVFP4_GEMM_BACKEND",
        "VLLM_USE_FBGEMM",
        "VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE",
-        "VLLM_DEEPEP_LOW_LATENCY_ALLOW_NVLINK",
        "VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL",
    ]
    for key in environment_variables_to_hash:
@@ -11,6 +11,7 @@ import torch
import vllm.envs as envs
from vllm.logger import init_logger
from vllm.triton_utils import tl, triton
+from vllm.utils.torch_utils import is_torch_equal_or_newer

logger = init_logger(__name__)

@@ -716,6 +717,10 @@ def linear_batch_invariant(input, weight, bias=None):
_batch_invariant_MODE = False
_batch_invariant_LIB = None
_original_torch_bmm = None
+_original_fp16_reduction_precision = None
+_original_bf16_reduction_precision = None
+_original_cublas_workspace_cfg = None
+_original_cublaslt_workspace_size = None


def is_batch_invariant_mode_enabled():
@@ -724,6 +729,8 @@ def is_batch_invariant_mode_enabled():

def enable_batch_invariant_mode():
    global _batch_invariant_MODE, _batch_invariant_LIB, _original_torch_bmm
+    global _original_fp16_reduction_precision, _original_bf16_reduction_precision
+    global _original_cublas_workspace_cfg, _original_cublaslt_workspace_size
    if _batch_invariant_MODE:
        return

@@ -745,14 +752,75 @@ def enable_batch_invariant_mode():
    _original_torch_bmm = torch.bmm
    torch.bmm = bmm_batch_invariant

    _original_bf16_reduction_precision = (
        torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction
    )
    _original_fp16_reduction_precision = (
        torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction
    )

    reduced_precision_val = (
        (False, False) if is_torch_equal_or_newer("2.10.0.dev") else False
    )
    torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = (
        reduced_precision_val
    )
    torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = (
        reduced_precision_val
    )
    torch.backends.cuda.preferred_blas_library(backend="cublaslt")

    if not is_torch_equal_or_newer("2.10.0.dev"):
        _original_cublas_workspace_cfg = os.environ.get("CUBLAS_WORKSPACE_CONFIG", None)
        _original_cublaslt_workspace_size = os.environ.get(
            "CUBLASLT_WORKSPACE_SIZE", None
        )
        os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":16:8"
        os.environ["CUBLASLT_WORKSPACE_SIZE"] = "1"


def disable_batch_invariant_mode():
    global _batch_invariant_MODE, _batch_invariant_LIB, _original_torch_bmm
    global _original_fp16_reduction_precision, _original_bf16_reduction_precision
    global _original_cublas_workspace_cfg, _original_cublaslt_workspace_size
    if not _batch_invariant_MODE:
        return

    if _batch_invariant_LIB is not None:
        _batch_invariant_LIB._destroy()
    if _original_torch_bmm is not None:
        torch.bmm = _original_torch_bmm
        _original_torch_bmm = None

    if _original_bf16_reduction_precision is not None:
        torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = (
            _original_bf16_reduction_precision
        )
        _original_bf16_reduction_precision = None
    if _original_fp16_reduction_precision is not None:
        torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = (
            _original_fp16_reduction_precision
        )
        _original_fp16_reduction_precision = None

    torch.backends.cuda.preferred_blas_library(backend="default")

    if not is_torch_equal_or_newer("2.10.0.dev"):
        # Set cublas env vars to previous results. If previous results are None,
        # that means the env vars were not set, so we should remove them.
        if _original_cublas_workspace_cfg:
            os.environ["CUBLAS_WORKSPACE_CONFIG"] = _original_cublas_workspace_cfg
        elif "CUBLAS_WORKSPACE_CONFIG" in os.environ:
            del os.environ["CUBLAS_WORKSPACE_CONFIG"]

        if _original_cublaslt_workspace_size:
            os.environ["CUBLASLT_WORKSPACE_SIZE"] = _original_cublaslt_workspace_size
        elif "CUBLASLT_WORKSPACE_SIZE" in os.environ:
            del os.environ["CUBLASLT_WORKSPACE_SIZE"]

        _original_cublas_workspace_cfg = None
        _original_cublaslt_workspace_size = None

    _batch_invariant_MODE = False
    _batch_invariant_LIB = None
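For orientation, the hunks above extend a module-level toggle. A minimal usage sketch, assuming the enable/disable/query functions are importable from `vllm.model_executor.layers.batch_invariant` as the diff shows (this is an illustration, not a documented public API):

```python
# Hypothetical sketch: toggling batch-invariant mode around a block of matmuls.
# enable_batch_invariant_mode() swaps in deterministic kernels and pins the
# cuBLAS/cuBLASLt settings; disable_batch_invariant_mode() restores the originals.
import torch

from vllm.model_executor.layers.batch_invariant import (
    disable_batch_invariant_mode,
    enable_batch_invariant_mode,
    is_batch_invariant_mode_enabled,
)

enable_batch_invariant_mode()
try:
    assert is_batch_invariant_mode_enabled()
    a = torch.randn(4, 8, 16, device="cuda")   # assumes a CUDA device is available
    b = torch.randn(4, 16, 32, device="cuda")
    out = torch.bmm(a, b)  # routed through bmm_batch_invariant while enabled
finally:
    disable_batch_invariant_mode()
```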
@@ -831,6 +899,9 @@ def override_envs_for_invariance():
    os.environ["NCCL_NTHREADS"] = "1"
    os.environ["NCCL_SOCKET_NTHREADS"] = "1"

+    # torch.compile settings
+    os.environ["VLLM_USE_AOT_COMPILE"] = "0"
+

def init_batch_invariance():
    # this will hit all the csrc overrides as well
@@ -363,6 +363,7 @@ class Fp8LinearMethod(LinearMethodBase):
        self.use_marlin = False

        self.use_aiter_and_is_supported = check_aiter_fp8_linear_support()
+        self.use_deep_gemm = is_deep_gemm_supported()

        self.weight_block_size = self.quant_config.weight_block_size
        self.block_quant = self.weight_block_size is not None
@@ -545,8 +546,10 @@ class Fp8LinearMethod(LinearMethodBase):
        # if batch invariant mode is enabled, prefer DeepGEMM FP8 path
        # we will use BF16 dequant when DeepGEMM is not supported.
        if vllm_is_batch_invariant():
+            # Call is_deep_gemm_supported() ahead of time for torch.compile
+            # dynamo has trouble tracing through
            if self.block_quant and should_use_deepgemm_for_fp8_linear(
-                torch.bfloat16, layer.weight, None
+                torch.bfloat16, layer.weight, self.use_deep_gemm
            ):
                # use group quant consistent with block size across K
                assert self.act_q_group_shape is not None
@@ -406,7 +406,7 @@ class HybridAttentionMambaModelConfig(VerifyAndUpdateConfig):
        # easily by changing the way we layout chunks in the
        # mamba2 kernels.

-        base_chunk_size = model_config.get_mamba_chunk_size()
+        base_chunk_size = mamba_block_size or model_config.get_mamba_chunk_size()
        attn_tokens_per_mamba_state = cdiv(mamba_page_size, attn_page_size_1_token)
        chunk_size = lcm(base_chunk_size, kernel_block_alignment_size)
        attn_block_size = chunk_size * cdiv(attn_tokens_per_mamba_state, chunk_size)
@@ -316,7 +316,8 @@ class CpuPlatform(Platform):

        if (
            platform.system() == "Linux"
-            and Platform.get_cpu_architecture() == CpuArchEnum.ARM
+            and Platform.get_cpu_architecture()
+            in (CpuArchEnum.ARM, CpuArchEnum.POWERPC)
            and not ("libomp" in ld_preload_str or "libgomp" in ld_preload_str)
        ):
            # We need to LD_PRELOAD PyTorch's libgomp, otherwise only
@@ -306,11 +306,12 @@ class KVCacheManager:
            "Computed blocks should be empty when prefix caching is disabled"
        )

-        # Append the new computed blocks to the request blocks until now to
-        # avoid the case where the new blocks cannot be allocated.
-        self.coordinator.save_new_computed_blocks(
-            request.request_id, new_computed_block_list
-        )
+        if new_computed_block_list is not self.empty_kv_cache_blocks.blocks:
+            # Append the new computed blocks to the request blocks until now to
+            # avoid the case where the new blocks cannot be allocated.
+            self.coordinator.save_new_computed_blocks(
+                request.request_id, new_computed_block_list
+            )

        new_blocks = self.coordinator.allocate_new_blocks(
            request.request_id, num_tokens_need_slot, num_encoder_tokens
@@ -13,7 +13,7 @@ from vllm.distributed.kv_transfer.kv_connector.factory import KVConnectorFactory
from vllm.distributed.kv_transfer.kv_connector.v1 import (
    KVConnectorBase_V1,
    KVConnectorRole,
-    supports_hma,
+    SupportsHMA,
)
from vllm.distributed.kv_transfer.kv_connector.v1.metrics import KVConnectorStats
from vllm.logger import init_logger
@@ -93,7 +93,11 @@ class Scheduler(SchedulerInterface):
        )

        connector_vllm_config = copy.copy(self.vllm_config)
-        connector_vllm_config.kv_cache_config = copy.copy(kv_cache_config)
+
+        # We're dynamically inserting a kv_cache_config variable into the
+        # connector_vllm_config. This is distinct from the cache_config
+        # that is already in there.
+        connector_vllm_config.kv_cache_config = copy.copy(kv_cache_config)  # type: ignore[attr-defined]
        self.connector = KVConnectorFactory.create_connector(
            config=connector_vllm_config, role=KVConnectorRole.SCHEDULER
        )
@@ -1327,15 +1331,15 @@ class Scheduler(SchedulerInterface):

        block_ids = self.kv_cache_manager.get_block_ids(request.request_id)

-        if not supports_hma(self.connector):
+        if not isinstance(self.connector, SupportsHMA):
            # NOTE(Kuntai): We should deprecate this code path after we enforce
            # all connectors to support HMA.
            # Hybrid memory allocator should be already turned off for this
            # code path, but let's double-check here.
            assert len(self.kv_cache_config.kv_cache_groups) == 1
            return self.connector.request_finished(request, block_ids[0])
-        else:
-            return self.connector.request_finished(request, block_ids)
+
+        return self.connector.request_finished_all_groups(request, block_ids)

    def _update_waiting_for_remote_kv(self, request: Request) -> bool:
        """
@@ -151,7 +151,7 @@ class SingleTypeKVCacheManager(ABC):
            num_tokens: The total number of tokens that need to be cached
                (including tokens that are already cached).
        """
-        num_cached_blocks = self.num_cached_block[request.request_id]
+        num_cached_blocks = self.num_cached_block.get(request.request_id, 0)
        num_full_blocks = num_tokens // self.block_size

        if num_cached_blocks >= num_full_blocks:
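The fix above matters because square-bracket indexing raises `KeyError` for a request id with no recorded entry yet, while `.get(..., 0)` treats it as zero cached blocks. A tiny standalone illustration with a hypothetical request id (not vLLM code):

```python
# Illustration: why .get(..., 0) is safer than [] for an unseen request id.
num_cached_block = {"req-a": 3}

# num_cached_block["req-b"]              # would raise KeyError
print(num_cached_block.get("req-b", 0))  # -> 0, i.e. "nothing cached yet"
```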
@@ -322,7 +322,7 @@ def initialize_ray_cluster(

    # Prevalidate GPU requirements before Ray processing
    if current_platform.is_cuda() and parallel_config.world_size > 1:
-        from vllm.utils import cuda_device_count_stateless
+        from vllm.utils.torch_utils import cuda_device_count_stateless

        available_gpus = cuda_device_count_stateless()
        if parallel_config.world_size > available_gpus:
@@ -1052,6 +1052,9 @@ class PrometheusStatLogger(AggregateStatLoggerBase):
            self.gauge_lora_info.labels(**lora_info_labels).set_to_current_time()

    def record_sleep_state(self, sleep: int = 0, level: int = 0):
+        if not envs.VLLM_SERVER_DEV_MODE:
+            return
+
        awake = 1
        discard_all = 0
        weights_offloaded = 0
@@ -2323,11 +2323,19 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                sampled_ids = [-1] if req_idx not in invalid_req_indices_set else None
            else:
                sampled_ids = valid_sampled_token_ids[req_idx]

+            num_sampled_ids: int = len(sampled_ids) if sampled_ids else 0
+
+            if cu_num_accepted_tokens is not None:
+                cu_num_accepted_tokens.append(
+                    cu_num_accepted_tokens[-1] + num_sampled_ids
+                )
+
            if not sampled_ids:
                continue

            start_idx = self.input_batch.num_tokens_no_spec[req_idx]
-            end_idx = start_idx + len(sampled_ids)
+            end_idx = start_idx + num_sampled_ids
            assert end_idx <= self.max_model_len, (
                "Sampled token IDs exceed the max model length. "
                f"Total number of tokens: {end_idx} > max_model_len: "
@@ -2343,11 +2351,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
            req_state = self.requests[req_id]
            req_state.output_token_ids.extend(sampled_ids)

-            if cu_num_accepted_tokens is not None:
-                cu_num_accepted_tokens.append(
-                    cu_num_accepted_tokens[-1] + len(sampled_ids)
-                )
-
        logprobs_lists = (
            logprobs_tensors.tolists(cu_num_accepted_tokens)
            if not self.use_async_scheduling and logprobs_tensors is not None