[Misc] Update disaggregation benchmark scripts and test logs (#11456)

Signed-off-by: Jiaxin Shan <seedjeffwan@gmail.com>
This commit is contained in:
Jiaxin Shan
2024-12-24 22:58:48 -08:00
committed by GitHub
parent 9832e5572a
commit fc601665eb
6 changed files with 43 additions and 29 deletions

View File

@ -10,7 +10,8 @@ set -ex
kill_gpu_processes() {
# kill all processes on GPU.
pkill -f pt_main_thread
pgrep pt_main_thread | xargs -r kill -9
pgrep python3 | xargs -r kill -9
sleep 10
# remove vllm config file
@ -54,7 +55,7 @@ benchmark() {
CUDA_VISIBLE_DEVICES=0 python3 \
-m vllm.entrypoints.openai.api_server \
--model meta-llama/Meta-Llama-3.1-8B-Instruct \
--model $model \
--port 8100 \
--max-model-len 10000 \
--gpu-memory-utilization 0.6 \
@ -64,7 +65,7 @@ benchmark() {
CUDA_VISIBLE_DEVICES=1 python3 \
-m vllm.entrypoints.openai.api_server \
--model meta-llama/Meta-Llama-3.1-8B-Instruct \
--model $model \
--port 8200 \
--max-model-len 10000 \
--gpu-memory-utilization 0.6 \
@ -87,7 +88,7 @@ benchmark() {
--port 8100 \
--save-result \
--result-dir $results_folder \
--result-filename disagg_prefill_2xtp4.json \
--result-filename disagg_prefill_tp1.json \
--request-rate "inf"
@ -105,7 +106,7 @@ benchmark() {
--port 8200 \
--save-result \
--result-dir $results_folder \
--result-filename disagg_prefill_2xtp4.json \
--result-filename disagg_prefill_tp1_overhead.json \
--request-rate "$qps"
kill_gpu_processes
@ -118,7 +119,7 @@ main() {
(which jq) || (apt-get -y install jq)
(which socat) || (apt-get -y install socat)
pip install quart httpx
pip install quart httpx datasets
cd "$(dirname "$0")"

View File

@ -1,13 +1,12 @@
#!/bin/bash
# Requirement: 8x H100 GPUs.
# Requirement: 2x GPUs.
# Model: neuralmagic/Meta-Llama-3-70B-Instruct-FP8-KV
# Query: 2048 input tokens, 11 output tokens, QPS 4, 500 requests
# Resource: 8x H100
# Model: meta-llama/Meta-Llama-3.1-8B-Instruct
# Query: 1024 input tokens, 6 output tokens, QPS 2/4/6/8, 100 requests
# Resource: 2x GPU
# Approaches:
# 1. Chunked prefill: 1 vllm instance with tp=8
# 2. Chunked prefill: 2 vllm instance with tp=4, equivalent to 1 tp=4 instance with QPS 4
# 3. Disaggregated prefill: 1 prefilling instance and 1 decoding instance
# Prefilling instance: max_output_token=1
@ -114,7 +113,6 @@ benchmark() {
--request-rate "$qps"
sleep 2
}
@ -123,8 +121,9 @@ main() {
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
(which jq) || (apt-get -y install jq)
(which socat) || (apt-get -y install socat)
(which lsof) || (apt-get -y install lsof)
pip install quart httpx matplotlib aiohttp
pip install quart httpx matplotlib aiohttp datasets
cd "$(dirname "$0")"