[Misc][Tools] make max-model-len a parameter in auto_tune script (#21321)

Signed-off-by: Chengji Yao <chengjiyao@google.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Author: Chengji Yao
Date: 2025-07-24 22:46:43 -07:00
Committed by: GitHub
Parent: fde60ee775
Commit: 947edd099e
2 changed files with 15 additions and 3 deletions

benchmarks/auto_tune/README.md

@@ -39,6 +39,7 @@ You must set the following variables at the top of the script before execution.
| `DOWNLOAD_DIR` | **Required.** Directory to download and load model weights from. | `""` (default download path) |
| `INPUT_LEN` | **Required.** Request input length. | `4000` |
| `OUTPUT_LEN` | **Required.** Request output length. | `16` |
| `MAX_MODEL_LEN` | **Required.** Maximum model (context) length. Must be at least `INPUT_LEN + OUTPUT_LEN`. | `4096` |
| `MIN_CACHE_HIT_PCT` | Prefix cache hit rate in percentage (0-100). Set to `0` to disable. | `60` |
| `MAX_LATENCY_ALLOWED_MS` | The maximum allowed P99 end-to-end latency in milliseconds. Set to a very large number (e.g., `100000000000`) to effectively ignore the latency constraint. | `500` |
| `NUM_SEQS_LIST` | A space-separated string of `max-num-seqs` values to test. | `"128 256"` |
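The new `MAX_MODEL_LEN` has to cover the longest request the benchmark sends, i.e. at least `INPUT_LEN + OUTPUT_LEN`; the script enforces this at startup (see the check added in the script diff below). A minimal sketch of that constraint, using the defaults from the table above:

```bash
# Sketch only: mirrors the sanity check the script performs before tuning.
INPUT_LEN=4000
OUTPUT_LEN=16
MAX_MODEL_LEN=4096

TOTAL_LEN=$((INPUT_LEN + OUTPUT_LEN))   # 4016, which fits in 4096
if (( TOTAL_LEN > MAX_MODEL_LEN )); then
    echo "MAX_MODEL_LEN ($MAX_MODEL_LEN) is smaller than INPUT_LEN + OUTPUT_LEN ($TOTAL_LEN)" >&2
    exit 1
fi
```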
@@ -69,6 +70,7 @@ Here are a few examples of how to configure the script for different goals:
```bash
INPUT_LEN=1800
OUTPUT_LEN=20
MAX_MODEL_LEN=2048
MIN_CACHE_HIT_PCT=0
MAX_LATENCY_ALLOWED_MS=100000000000 # A very large number
```
@@ -80,6 +82,7 @@ MAX_LATENCY_ALLOWED_MS=100000000000 # A very large number
```bash
INPUT_LEN=1800
OUTPUT_LEN=20
MAX_MODEL_LEN=2048
MIN_CACHE_HIT_PCT=0
MAX_LATENCY_ALLOWED_MS=500
```
@@ -91,6 +94,7 @@ MAX_LATENCY_ALLOWED_MS=500
```bash
INPUT_LEN=1800
OUTPUT_LEN=20
MAX_MODEL_LEN=2048
MIN_CACHE_HIT_PCT=60
MAX_LATENCY_ALLOWED_MS=500
```
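With the variables set at the top of the script, a run is just an invocation of the script itself; the path below is assumed from the README location (`benchmarks/auto_tune/`):

```bash
# Assumed path; run from the repository root.
bash benchmarks/auto_tune/auto_tune.sh
```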

benchmarks/auto_tune/auto_tune.sh

@@ -4,13 +4,15 @@
# See details in README (benchmarks/auto_tune/README.md).
TAG=$(date +"%Y_%m_%d_%H_%M")
BASE=""
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
BASE="$SCRIPT_DIR/../../.."
MODEL="meta-llama/Llama-3.1-8B-Instruct"
SYSTEM="TPU"
TP=1
DOWNLOAD_DIR=""
INPUT_LEN=4000
OUTPUT_LEN=16
MAX_MODEL_LEN=4096
MIN_CACHE_HIT_PCT=0
MAX_LATENCY_ALLOWED_MS=100000000000
NUM_SEQS_LIST="128 256"
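The `BASE` change above swaps a hard-coded empty path for one derived from the script's own location, so relative paths keep working no matter where the script is invoked from. A small standalone illustration of the same idiom (the file and variable names here are just for the demo):

```bash
#!/bin/bash
# Resolve the directory that contains this script, independent of the caller's cwd.
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )

# Paths built from SCRIPT_DIR stay valid even when invoked as `bash /some/where/demo.sh`.
REPO_ROOT="$SCRIPT_DIR/../../.."
echo "script dir: $SCRIPT_DIR"
echo "repo root:  $REPO_ROOT"
```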
@@ -36,6 +38,13 @@ current_hash=$(git rev-parse HEAD)
echo "hash:$current_hash" >> "$RESULT"
echo "current_hash: $current_hash"
TOTAL_LEN=$((INPUT_LEN + OUTPUT_LEN))
RED='\033[0;31m'
if (( TOTAL_LEN > MAX_MODEL_LEN )); then
echo -e "${RED}FAILED: INPUT_LEN($INPUT_LEN) + OUTPUT_LEN($OUTPUT_LEN) = $TOTAL_LEN, which is > MAX_MODEL_LEN = $MAX_MODEL_LEN.\033[0m" >&2
exit 1
fi
best_throughput=0
best_max_num_seqs=0
best_num_batched_tokens=0
@@ -60,7 +69,7 @@ start_server() {
--enable-prefix-caching \
--load-format dummy \
--download-dir "$DOWNLOAD_DIR" \
- --max-model-len $(( INPUT_LEN+OUTPUT_LEN )) > "$vllm_log" 2>&1 &
+ --max-model-len $MAX_MODEL_LEN > "$vllm_log" 2>&1 &
# wait for 10 minutes...
server_started=0
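For reference, the flags visible in this hunk are standard vLLM server options; a rough sketch of the kind of launch `start_server` performs (the entrypoint and the full flag list are assumptions here, since the hunk only shows the tail of the command):

```bash
# Sketch only: assumed shape of the server launch inside start_server().
vllm serve "$MODEL" \
    --tensor-parallel-size "$TP" \
    --download-dir "$DOWNLOAD_DIR" \
    --load-format dummy \
    --enable-prefix-caching \
    --max-model-len "$MAX_MODEL_LEN" > "$vllm_log" 2>&1 &
```

Passing `MAX_MODEL_LEN` directly (instead of computing `INPUT_LEN + OUTPUT_LEN` inline, as before this commit) is what allows the served context length to be tuned independently of the benchmark's request lengths.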
@@ -245,4 +254,3 @@ done
echo "finish permutations"
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH"
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH" >> "$RESULT"