[Misc][Tools] make max-model-len a parameter in auto_tune script (#21321)
Signed-off-by: Chengji Yao <chengjiyao@google.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
This commit is contained in:
@ -39,6 +39,7 @@ You must set the following variables at the top of the script before execution.
|
||||
| `DOWNLOAD_DIR` | **Required.** Directory to download and load model weights from. | `""` (default download path) |
|
||||
| `INPUT_LEN` | **Required.** Request input length. | `4000` |
|
||||
| `OUTPUT_LEN` | **Required.** Request output length. | `16` |
|
||||
| `MAX_MODEL_LEN` | **Required.** Max model length. | `4096` |
|
||||
| `MIN_CACHE_HIT_PCT` | Prefix cache hit rate in percentage (0-100). Set to `0` to disable. | `60` |
|
||||
| `MAX_LATENCY_ALLOWED_MS` | The maximum allowed P99 end-to-end latency in milliseconds. Set to a very large number (e.g., `100000000000`) to effectively ignore the latency constraint. | `500` |
|
||||
| `NUM_SEQS_LIST` | A space-separated string of `max-num-seqs` values to test. | `"128 256"` |
|
||||
@ -69,6 +70,7 @@ Here are a few examples of how to configure the script for different goals:
|
||||
```bash
|
||||
INPUT_LEN=1800
|
||||
OUTPUT_LEN=20
|
||||
MAX_MODEL_LEN=2048
|
||||
MIN_CACHE_HIT_PCT=0
|
||||
MAX_LATENCY_ALLOWED_MS=100000000000 # A very large number
|
||||
```
|
||||
@ -80,6 +82,7 @@ MAX_LATENCY_ALLOWED_MS=100000000000 # A very large number
|
||||
```bash
|
||||
INPUT_LEN=1800
|
||||
OUTPUT_LEN=20
|
||||
MAX_MODEL_LEN=2048
|
||||
MIN_CACHE_HIT_PCT=0
|
||||
MAX_LATENCY_ALLOWED_MS=500
|
||||
```
|
||||
@ -91,6 +94,7 @@ MAX_LATENCY_ALLOWED_MS=500
|
||||
```bash
|
||||
INPUT_LEN=1800
|
||||
OUTPUT_LEN=20
|
||||
MAX_MODEL_LEN=2048
|
||||
MIN_CACHE_HIT_PCT=60
|
||||
MAX_LATENCY_ALLOWED_MS=500
|
||||
```
|
||||
|
||||
@ -4,13 +4,15 @@
|
||||
# See details in README (benchmarks/auto_tune/README.md).
|
||||
|
||||
TAG=$(date +"%Y_%m_%d_%H_%M")
|
||||
BASE=""
|
||||
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
|
||||
BASE="$SCRIPT_DIR/../../.."
|
||||
MODEL="meta-llama/Llama-3.1-8B-Instruct"
|
||||
SYSTEM="TPU"
|
||||
TP=1
|
||||
DOWNLOAD_DIR=""
|
||||
INPUT_LEN=4000
|
||||
OUTPUT_LEN=16
|
||||
MAX_MODEL_LEN=4096
|
||||
MIN_CACHE_HIT_PCT=0
|
||||
MAX_LATENCY_ALLOWED_MS=100000000000
|
||||
NUM_SEQS_LIST="128 256"
|
||||
@ -36,6 +38,13 @@ current_hash=$(git rev-parse HEAD)
|
||||
echo "hash:$current_hash" >> "$RESULT"
|
||||
echo "current_hash: $current_hash"
|
||||
|
||||
TOTAL_LEN=$((INPUT_LEN + OUTPUT_LEN))
|
||||
RED='\033[0;31m'
|
||||
if (( TOTAL_LEN > MAX_MODEL_LEN )); then
|
||||
echo -e "${RED}FAILED: INPUT_LEN($INPUT_LEN) + OUTPUT_LEN($OUTPUT_LEN) = $TOTAL_LEN, which is > MAX_MODEL_LEN = $MAX_MODEL_LEN.\033[0m" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
best_throughput=0
|
||||
best_max_num_seqs=0
|
||||
best_num_batched_tokens=0
|
||||
@ -60,7 +69,7 @@ start_server() {
|
||||
--enable-prefix-caching \
|
||||
--load-format dummy \
|
||||
--download-dir "$DOWNLOAD_DIR" \
|
||||
--max-model-len $(( INPUT_LEN+OUTPUT_LEN )) > "$vllm_log" 2>&1 &
|
||||
--max-model-len $MAX_MODEL_LEN > "$vllm_log" 2>&1 &
|
||||
|
||||
# wait for 10 minutes...
|
||||
server_started=0
|
||||
@ -245,4 +254,3 @@ done
|
||||
echo "finish permutations"
|
||||
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH"
|
||||
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH" >> "$RESULT"
|
||||
|
||||
|
||||
Reference in New Issue
Block a user