Compare commits
83 commits: memory-lea...debug

| SHA1 | Author | Date | |
|---|---|---|---|
| f4331d1b8b | |||
| 7742eb6c59 | |||
| 22a0070530 | |||
| 170129eb28 | |||
| 955c624915 | |||
| 4f87abdcc6 | |||
| 6910b56da2 | |||
| e10fef0883 | |||
| e680723eba | |||
| 620db1fc58 | |||
| 41183c1fe0 | |||
| 43d9ad03ba | |||
| 7be141b2c5 | |||
| 8d7f39b48c | |||
| cd08636926 | |||
| 3feeeb9fea | |||
| 6f4a82f8b5 | |||
| c44797a4d6 | |||
| 55be93baf5 | |||
| 717fc00e98 | |||
| 01dfb5e982 | |||
| 03dd652c16 | |||
| 9cd76b71ab | |||
| e041314184 | |||
| 5e537f45b4 | |||
| c2a8b08fcd | |||
| f4962a6d55 | |||
| 2f0b833a05 | |||
| 425b04b8f4 | |||
| 60f0843ef8 | |||
| 8a46602606 | |||
| 61aa4b2901 | |||
| 8c892b1831 | |||
| 3bca396f79 | |||
| 3a3e91bdfe | |||
| b3d7e3c845 | |||
| 67841317d1 | |||
| 86173ad593 | |||
| 795b6951cd | |||
| 2e5d21378d | |||
| 0661cb9df3 | |||
| 105d3d62ef | |||
| 62f66be1f7 | |||
| 81c53ef55c | |||
| 75334956c2 | |||
| 77aec83b8c | |||
| e67597545b | |||
| 37a6fa95fd | |||
| 558f0907dc | |||
| 4172235ab7 | |||
| 848562bd49 | |||
| e68dc2f014 | |||
| a3645ed94d | |||
| fb691ee4e7 | |||
| 6024d115cd | |||
| 7555d6b34a | |||
| 00a4e56d8d | |||
| 0eadaeff7e | |||
| 0077c8634e | |||
| b121ca22ad | |||
| eddaafc1c7 | |||
| 305a1cc0d2 | |||
| 6d6c6b05d3 | |||
| 53b19ccdd5 | |||
| 6432739ef1 | |||
| ac201a0eaf | |||
| 3c529fc994 | |||
| 35bf193864 | |||
| 35efa70297 | |||
| cee182b297 | |||
| c954c6629c | |||
| 9dfbeb41e5 | |||
| eedb2a2a10 | |||
| 23a6c5280e | |||
| 7812bcf278 | |||
| 006e7a34ae | |||
| e599e2c65e | |||
| c29fb540ff | |||
| 65e038931d | |||
| 886ccbe5ba | |||
| adc3ddb430 | |||
| 60b755cbcb | |||
| 482e52f56c ||||
@ -1,6 +1,6 @@
|
||||
[
|
||||
{
|
||||
"test_name": "serving_llama8B_tp1_sharegpt",
|
||||
"test_name": "serving_llama8B_bf16_tp1_sharegpt",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||
"server_environment_variables": {
|
||||
@ -32,7 +32,7 @@
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp2_sharegpt",
|
||||
"test_name": "serving_llama8B_bf16_tp2_sharegpt",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||
"server_environment_variables": {
|
||||
@ -64,7 +64,7 @@
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp4_sharegpt",
|
||||
"test_name": "serving_llama8B_bf16_tp4_sharegpt",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||
"server_environment_variables": {
|
||||
@ -96,7 +96,7 @@
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp1_random_128_128",
|
||||
"test_name": "serving_llama8B_bf16_tp1_random_128_128",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
||||
"server_environment_variables": {
|
||||
@ -131,7 +131,7 @@
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp2_random_128_128",
|
||||
"test_name": "serving_llama8B_bf16_tp2_random_128_128",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
||||
"server_environment_variables": {
|
||||
@ -166,7 +166,7 @@
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp4_random_128_128",
|
||||
"test_name": "serving_llama8B_bf16_tp4_random_128_128",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
||||
"server_environment_variables": {
|
||||
@ -198,5 +198,413 @@
|
||||
"random-output-len": 128,
|
||||
"num_prompts": 1000
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_int8_tp1_sharegpt",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
||||
"tensor_parallel_size": 1,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"num_prompts": 200
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_int8_tp2_sharegpt",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
||||
"tensor_parallel_size": 2,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"num_prompts": 200
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_int8_tp4_sharegpt",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
||||
"tensor_parallel_size": 4,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"num_prompts": 200
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_int8_tp1_random_128_128",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
||||
"tensor_parallel_size": 1,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"enable_chunked_prefill": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128,
|
||||
"ignore-eos": "",
|
||||
"num_prompts": 1000
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_int8_tp2_random_128_128",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
||||
"tensor_parallel_size": 2,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"enable_chunked_prefill": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128,
|
||||
"ignore-eos": "",
|
||||
"num_prompts": 1000
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_int8_tp4_random_128_128",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
||||
"tensor_parallel_size": 4,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"enable_chunked_prefill": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128,
|
||||
"ignore-eos": "",
|
||||
"num_prompts": 1000
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_int4_tp1_sharegpt",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||
"quantization": "awq",
|
||||
"tensor_parallel_size": 1,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"num_prompts": 200
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_int4_tp2_sharegpt",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||
"quantization": "awq",
|
||||
"tensor_parallel_size": 2,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"num_prompts": 200
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_int4_tp4_sharegpt",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||
"quantization": "awq",
|
||||
"tensor_parallel_size": 4,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"num_prompts": 200
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_int4_tp1_random_128_128",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||
"quantization": "awq",
|
||||
"tensor_parallel_size": 1,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"enable_chunked_prefill": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128,
|
||||
"ignore-eos": "",
|
||||
"num_prompts": 1000
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_int4_tp2_random_128_128",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||
"quantization": "awq",
|
||||
"tensor_parallel_size": 2,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"enable_chunked_prefill": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128,
|
||||
"ignore-eos": "",
|
||||
"num_prompts": 1000
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_int4_tp4_random_128_128",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||
"quantization": "awq",
|
||||
"tensor_parallel_size": 4,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"enable_chunked_prefill": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128,
|
||||
"ignore-eos": "",
|
||||
"num_prompts": 1000
|
||||
}
|
||||
}
|
||||
]
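The hunks above extend the CPU serving-benchmark matrix: the existing bf16 tests get an explicit `bf16` tag in their names, and new `int8` (RedHatAI w8a8) and `int4` (AWQ) variants are added for tp1/tp2/tp4 on both the ShareGPT and random-128/128 workloads. As a rough illustration of what one of these entries drives, here is a minimal sketch assuming the standard vLLM server CLI and the upstream `benchmark_serving.py` client; the actual harness that consumes this JSON is not shown in this diff.

```bash
# Hypothetical expansion of the "serving_llama8B_int8_tp1_sharegpt" entry above.
# Assumes vLLM's "vllm serve" CLI and benchmarks/benchmark_serving.py; the real
# nightly-benchmark runner may wire these up differently.
MODEL="RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8"

# Server side: server_environment_variables plus server_parameters become CLI flags.
VLLM_CPU_SGL_KERNEL=1 VLLM_CPU_KVCACHE_SPACE=40 VLLM_RPC_TIMEOUT=100000 \
vllm serve "$MODEL" \
  --tensor-parallel-size 1 \
  --dtype bfloat16 \
  --distributed-executor-backend mp \
  --block-size 128 \
  --max-num-batched-tokens 2048 \
  --max-num-seqs 256 \
  --load-format dummy \
  --trust-remote-code --enforce-eager --disable-log-stats &

# (wait for the server to come up before starting the client)

# Client side: client_parameters map onto the benchmark script's arguments;
# 64 is one value picked from max_concurrency_list.
python3 benchmarks/benchmark_serving.py \
  --backend vllm \
  --model "$MODEL" \
  --dataset-name sharegpt \
  --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json \
  --num-prompts 200 \
  --max-concurrency 64
```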
@ -1,6 +1,6 @@
|
||||
[
|
||||
{
|
||||
"test_name": "serving_llama8B_pp1_sharegpt",
|
||||
"test_name": "serving_llama8B_bf16_pp1_sharegpt",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||
"server_environment_variables": {
|
||||
@ -32,7 +32,39 @@
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_pp3_sharegpt",
|
||||
"test_name": "serving_llama8B_bf16_tp2_sharegpt",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 2,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"num_prompts": 200
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_bf16_pp3_sharegpt",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||
"server_environment_variables": {
|
||||
@ -64,7 +96,7 @@
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp2pp3_sharegpt",
|
||||
"test_name": "serving_llama8B_bf16_tp2pp3_sharegpt",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||
"server_environment_variables": {
|
||||
@ -97,7 +129,7 @@
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_pp1_random_128_128",
|
||||
"test_name": "serving_llama8B_bf16_pp1_random_128_128",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
||||
"server_environment_variables": {
|
||||
@ -132,7 +164,42 @@
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_pp3_random_128_128",
|
||||
"test_name": "serving_llama8B_bf16_tp2_random_128_128",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"tensor_parallel_size": 2,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"enable_chunked_prefill": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128,
|
||||
"ignore-eos": "",
|
||||
"num_prompts": 1000
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_bf16_pp3_random_128_128",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
||||
"server_environment_variables": {
|
||||
@ -167,7 +234,7 @@
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_tp2pp3_random_128_128",
|
||||
"test_name": "serving_llama8B_bf16_tp2pp3_random_128_128",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
||||
"server_environment_variables": {
|
||||
@ -201,5 +268,553 @@
|
||||
"ignore-eos": "",
|
||||
"num_prompts": 1000
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_int8_pp1_sharegpt",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
||||
"pipeline_parallel_size": 1,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"num_prompts": 200
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_int8_tp2_sharegpt",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
||||
"tensor_parallel_size": 2,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"num_prompts": 200
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_int8_pp3_sharegpt",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
||||
"pipeline_parallel_size": 3,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"num_prompts": 200
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_int8_tp2pp3_sharegpt",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
||||
"tensor_parallel_size": 2,
|
||||
"pipeline_parallel_size": 3,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"num_prompts": 200
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_int8_pp1_random_128_128",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
||||
"pipeline_parallel_size": 1,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"enable_chunked_prefill": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128,
|
||||
"ignore-eos": "",
|
||||
"num_prompts": 1000
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_int8_tp2_random_128_128",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
||||
"tensor_parallel_size": 2,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"enable_chunked_prefill": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128,
|
||||
"ignore-eos": "",
|
||||
"num_prompts": 1000
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_int8_pp3_random_128_128",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
||||
"pipeline_parallel_size": 3,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"enable_chunked_prefill": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128,
|
||||
"ignore-eos": "",
|
||||
"num_prompts": 1000
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_int8_tp2pp3_random_128_128",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
||||
"tensor_parallel_size": 2,
|
||||
"pipeline_parallel_size": 3,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"enable_chunked_prefill": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128,
|
||||
"ignore-eos": "",
|
||||
"num_prompts": 1000
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_int4_pp1_sharegpt",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||
"quantization": "awq",
|
||||
"pipeline_parallel_size": 1,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"num_prompts": 200
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_int4_tp2_sharegpt",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||
"quantization": "awq",
|
||||
"tensor_parallel_size": 2,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"num_prompts": 200
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_int4_pp3_sharegpt",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||
"quantization": "awq",
|
||||
"pipeline_parallel_size": 3,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"num_prompts": 200
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_int4_tp2pp3_sharegpt",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||
"quantization": "awq",
|
||||
"tensor_parallel_size": 2,
|
||||
"pipeline_parallel_size": 3,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "sharegpt",
|
||||
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||
"num_prompts": 200
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_int4_pp1_random_128_128",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||
"quantization": "awq",
|
||||
"pipeline_parallel_size": 1,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"enable_chunked_prefill": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128,
|
||||
"ignore-eos": "",
|
||||
"num_prompts": 1000
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_int4_tp2_random_128_128",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||
"quantization": "awq",
|
||||
"tensor_parallel_size": 2,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"enable_chunked_prefill": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128,
|
||||
"ignore-eos": "",
|
||||
"num_prompts": 1000
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_int4_pp3_random_128_128",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||
"quantization": "awq",
|
||||
"pipeline_parallel_size": 3,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"enable_chunked_prefill": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128,
|
||||
"ignore-eos": "",
|
||||
"num_prompts": 1000
|
||||
}
|
||||
},
|
||||
{
|
||||
"test_name": "serving_llama8B_int4_tp2pp3_random_128_128",
|
||||
"qps_list": ["inf"],
|
||||
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
||||
"server_environment_variables": {
|
||||
"VLLM_RPC_TIMEOUT": 100000,
|
||||
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||
"VLLM_CPU_SGL_KERNEL": 1,
|
||||
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||
},
|
||||
"server_parameters": {
|
||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||
"quantization": "awq",
|
||||
"tensor_parallel_size": 2,
|
||||
"pipeline_parallel_size": 3,
|
||||
"dtype": "bfloat16",
|
||||
"distributed_executor_backend": "mp",
|
||||
"block_size": 128,
|
||||
"trust_remote_code": "",
|
||||
"enable_chunked_prefill": "",
|
||||
"disable_log_stats": "",
|
||||
"enforce_eager": "",
|
||||
"max_num_batched_tokens": 2048,
|
||||
"max_num_seqs": 256,
|
||||
"load_format": "dummy"
|
||||
},
|
||||
"client_parameters": {
|
||||
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||
"backend": "vllm",
|
||||
"dataset_name": "random",
|
||||
"random-input-len": 128,
|
||||
"random-output-len": 128,
|
||||
"ignore-eos": "",
|
||||
"num_prompts": 1000
|
||||
}
|
||||
}
|
||||
]
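The second file applies the same bf16/int8/int4 expansion to the pipeline-parallel matrix (pp1, pp3, tp2, and tp2pp3 combinations). A quick way to sanity-check a matrix like this is to query it with `jq`; the sketch below assumes the structure shown above and a hypothetical filename, since the actual path under `.buildkite/nightly-benchmarks/` is not visible in this compare view.

```bash
# Hypothetical filename for the CPU pipeline-parallel test matrix.
CONFIG=serving-tests-cpu-pp.json

# List every test with its tensor/pipeline parallel degree and the implied world size.
jq -r '.[] |
  [.test_name,
   (.server_parameters.tensor_parallel_size // 1),
   (.server_parameters.pipeline_parallel_size // 1),
   ((.server_parameters.tensor_parallel_size // 1) *
    (.server_parameters.pipeline_parallel_size // 1))] | @tsv' "$CONFIG"
# e.g. serving_llama8B_int8_tp2pp3_sharegpt  2  3  6   -> needs 6 workers
```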
@ -150,18 +150,24 @@ steps:
  env:
    DOCKER_BUILDKIT: "1"

- block: "Build Neuron release image"
  key: block-neuron-release-image-build
  depends_on: ~

- label: "Build and publish Neuron release image"
  depends_on: block-neuron-release-image-build
- label: "Build and publish nightly multi-arch image to DockerHub"
  depends_on:
    - create-multi-arch-manifest
  if: build.env("NIGHTLY") == "1"
  agents:
    queue: neuron-postmerge
    queue: cpu_queue_postmerge
  commands:
    - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
    - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest --progress plain -f docker/Dockerfile.neuron ."
    - "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest"
    - "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version)"
    - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
    - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT vllm/vllm-openai:nightly"
    - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
    - "docker push vllm/vllm-openai:nightly"
    - "docker push vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
    # Clean up old nightly builds (keep only last 14)
    - "bash .buildkite/scripts/cleanup-nightly-builds.sh"
  plugins:
    - docker-login#v3.0.0:
        username: vllmbot
        password-env: DOCKERHUB_TOKEN
  env:
    DOCKER_BUILDKIT: "1"
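This hunk swaps the Neuron release-image step for a nightly multi-arch publish to DockerHub, gated on `build.env("NIGHTLY") == "1"` and authenticated through the `docker-login` plugin. A quick local check of the published result might look like the following; the tag names come from the commands above, everything else is ordinary docker/jq tooling and not part of this PR.

```bash
# Confirm the nightly tag is a multi-arch manifest rather than a single-arch image.
docker manifest inspect vllm/vllm-openai:nightly \
  | jq -r '.manifests[].platform | "\(.os)/\(.architecture)"'
# Expected to list entries such as linux/amd64 and linux/arm64, depending on what
# the create-multi-arch-manifest step actually assembles.

# List the per-commit nightly tags that the cleanup script below prunes to 14.
curl -s "https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags?page_size=100" \
  | jq -r '.results[].name | select(startswith("nightly-"))'
```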
.buildkite/scripts/cleanup-nightly-builds.sh (Executable file, 97 lines added)
@ -0,0 +1,97 @@
#!/bin/bash

set -ex

# Clean up old nightly builds from DockerHub, keeping only the last 14 builds
# This script uses DockerHub API to list and delete old tags with "nightly-" prefix

# DockerHub API endpoint for vllm/vllm-openai repository
REPO_API_URL="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags"

# Get DockerHub token from environment
if [ -z "$DOCKERHUB_TOKEN" ]; then
    echo "Error: DOCKERHUB_TOKEN environment variable is not set"
    exit 1
fi

# Function to get all tags from DockerHub
get_all_tags() {
    local page=1
    local all_tags=""

    while true; do
        local response=$(curl -s -H "Authorization: Bearer $DOCKERHUB_TOKEN" \
            "$REPO_API_URL?page=$page&page_size=100")

        # Get both last_updated timestamp and tag name, separated by |
        local tags=$(echo "$response" | jq -r '.results[] | select(.name | startswith("nightly-")) | "\(.last_updated)|\(.name)"')

        if [ -z "$tags" ]; then
            break
        fi

        all_tags="$all_tags$tags"$'\n'
        page=$((page + 1))
    done

    # Sort by timestamp (newest first) and extract just the tag names
    echo "$all_tags" | sort -r | cut -d'|' -f2
}

delete_tag() {
    local tag_name="$1"
    echo "Deleting tag: $tag_name"

    local delete_url="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags/$tag_name"
    local response=$(curl -s -X DELETE -H "Authorization: Bearer $DOCKERHUB_TOKEN" "$delete_url")

    if echo "$response" | jq -e '.detail' > /dev/null 2>&1; then
        echo "Warning: Failed to delete tag $tag_name: $(echo "$response" | jq -r '.detail')"
    else
        echo "Successfully deleted tag: $tag_name"
    fi
}

# Get all nightly- prefixed tags, sorted by last_updated timestamp (newest first)
echo "Fetching all tags from DockerHub..."
all_tags=$(get_all_tags)

if [ -z "$all_tags" ]; then
    echo "No tags found to clean up"
    exit 0
fi

# Count total tags
total_tags=$(echo "$all_tags" | wc -l)
echo "Found $total_tags tags"

# Keep only the last 14 builds (including the current one)
tags_to_keep=14
tags_to_delete=$((total_tags - tags_to_keep))

if [ $tags_to_delete -le 0 ]; then
    echo "No tags need to be deleted (only $total_tags tags found, keeping $tags_to_keep)"
    exit 0
fi

echo "Will delete $tags_to_delete old tags, keeping the newest $tags_to_keep"

# Get tags to delete (skip the first $tags_to_keep tags)
tags_to_delete_list=$(echo "$all_tags" | tail -n +$((tags_to_keep + 1)))

if [ -z "$tags_to_delete_list" ]; then
    echo "No tags to delete"
    exit 0
fi

# Delete old tags
echo "Deleting old tags..."
while IFS= read -r tag; do
    if [ -n "$tag" ]; then
        delete_tag "$tag"
        # Add a small delay to avoid rate limiting
        sleep 1
    fi
done <<< "$tags_to_delete_list"

echo "Cleanup completed successfully"
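For local testing, the script can be exercised outside Buildkite roughly as sketched below; this assumes `DOCKERHUB_TOKEN` holds a Docker Hub token with permission to delete tags in `vllm/vllm-openai`, which is what the `docker-login` plugin supplies in CI.

```bash
# Quick inspection first: count the nightly-* tags on the first page of results.
curl -s "https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags?page_size=100" \
  | jq '[.results[] | select(.name | startswith("nightly-"))] | length'

# Then run the cleanup with the token exported (deletes all but the newest 14).
export DOCKERHUB_TOKEN="<token with delete permission>"   # assumption: delete-capable token
bash .buildkite/scripts/cleanup-nightly-builds.sh
```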
@ -1,64 +0,0 @@
#!/bin/bash

# This script build the Neuron docker image and run the API server inside the container.
# It serves a sanity check for compilation and basic model usage.
set -e
set -v

image_name="neuron/vllm-ci"
container_name="neuron_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"

HF_CACHE="$(realpath ~)/huggingface"
mkdir -p "${HF_CACHE}"
HF_MOUNT="/root/.cache/huggingface"
HF_TOKEN=$(aws secretsmanager get-secret-value --secret-id "ci/vllm-neuron/hf-token" --region us-west-2 --query 'SecretString' --output text | jq -r .VLLM_NEURON_CI_HF_TOKEN)

NEURON_COMPILE_CACHE_URL="$(realpath ~)/neuron_compile_cache"
mkdir -p "${NEURON_COMPILE_CACHE_URL}"
NEURON_COMPILE_CACHE_MOUNT="/root/.cache/neuron_compile_cache"

# Try building the docker image
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws

# prune old image and containers to save disk space, and only once a day
# by using a timestamp file in tmp.
if [ -f /tmp/neuron-docker-build-timestamp ]; then
    last_build=$(cat /tmp/neuron-docker-build-timestamp)
    current_time=$(date +%s)
    if [ $((current_time - last_build)) -gt 86400 ]; then
        # Remove dangling images (those that are not tagged and not used by any container)
        docker image prune -f
        # Remove unused volumes / force the system prune for old images as well.
        docker volume prune -f && docker system prune -f
        echo "$current_time" > /tmp/neuron-docker-build-timestamp
    fi
else
    date "+%s" > /tmp/neuron-docker-build-timestamp
fi

docker build -t "${image_name}" -f docker/Dockerfile.neuron .

# Setup cleanup
remove_docker_container() {
    docker image rm -f "${image_name}" || true;
}
trap remove_docker_container EXIT

# Run the image
docker run --rm -it --device=/dev/neuron0 --network bridge \
    -v "${HF_CACHE}:${HF_MOUNT}" \
    -e "HF_HOME=${HF_MOUNT}" \
    -e "HF_TOKEN=${HF_TOKEN}" \
    -v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \
    -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
    --name "${container_name}" \
    ${image_name} \
    /bin/bash -c "
        set -e; # Exit on first error
        python3 /workspace/vllm/examples/offline_inference/neuron.py;
        python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys;
        for f in /workspace/vllm/tests/neuron/2_core/*.py; do
            echo \"Running test file: \$f\";
            python3 -m pytest \$f -v --capture=tee-sys;
        done
    "
@ -41,7 +41,8 @@ steps:
|
||||
commands:
|
||||
- bash standalone_tests/pytorch_nightly_dependency.sh
|
||||
|
||||
- label: Async Engine, Inputs, Utils, Worker Test # 24min
|
||||
- label: Async Engine, Inputs, Utils, Worker Test # 36min
|
||||
timeout_in_minutes: 50
|
||||
mirror_hardwares: [amdexperimental]
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
@ -63,7 +64,8 @@ steps:
|
||||
- pytest -v -s utils_ # Utils
|
||||
- pytest -v -s worker # Worker
|
||||
|
||||
- label: Python-only Installation Test
|
||||
- label: Python-only Installation Test # 10min
|
||||
timeout_in_minutes: 20
|
||||
mirror_hardwares: [amdexperimental]
|
||||
source_file_dependencies:
|
||||
- tests/standalone_tests/python_only_compile.sh
|
||||
@ -71,7 +73,8 @@ steps:
|
||||
commands:
|
||||
- bash standalone_tests/python_only_compile.sh
|
||||
|
||||
- label: Basic Correctness Test # 30min
|
||||
- label: Basic Correctness Test # 20min
|
||||
timeout_in_minutes: 30
|
||||
mirror_hardwares: [amdexperimental]
|
||||
fast_check: true
|
||||
torch_nightly: true
|
||||
@ -88,7 +91,8 @@ steps:
|
||||
- pytest -v -s basic_correctness/test_cpu_offload.py
|
||||
- VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
|
||||
|
||||
- label: Core Test # 10min
|
||||
- label: Core Test # 22min
|
||||
timeout_in_minutes: 35
|
||||
mirror_hardwares: [amdexperimental]
|
||||
fast_check: true
|
||||
source_file_dependencies:
|
||||
@ -98,7 +102,8 @@ steps:
|
||||
commands:
|
||||
- pytest -v -s core
|
||||
|
||||
- label: Entrypoints Test (LLM) # 40min
|
||||
- label: Entrypoints Test (LLM) # 30min
|
||||
timeout_in_minutes: 40
|
||||
mirror_hardwares: [amdexperimental]
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
fast_check: true
|
||||
@ -114,7 +119,8 @@ steps:
|
||||
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
|
||||
- VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
|
||||
|
||||
- label: Entrypoints Test (API Server) # 40min
|
||||
- label: Entrypoints Test (API Server) # 100min
|
||||
timeout_in_minutes: 130
|
||||
mirror_hardwares: [amdexperimental]
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
fast_check: true
|
||||
@ -129,7 +135,8 @@ steps:
|
||||
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py
|
||||
- pytest -v -s entrypoints/test_chat_utils.py
|
||||
|
||||
- label: Distributed Tests (4 GPUs) # 10min
|
||||
- label: Distributed Tests (4 GPUs) # 35min
|
||||
timeout_in_minutes: 50
|
||||
mirror_hardwares: [amdexperimental]
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 4
|
||||
@ -172,7 +179,8 @@ steps:
|
||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
|
||||
- popd
|
||||
|
||||
- label: EPLB Algorithm Test
|
||||
- label: EPLB Algorithm Test # 5min
|
||||
timeout_in_minutes: 15
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
source_file_dependencies:
|
||||
- vllm/distributed/eplb
|
||||
@ -181,6 +189,7 @@ steps:
|
||||
- pytest -v -s distributed/test_eplb_algo.py
|
||||
|
||||
- label: EPLB Execution Test # 5min
|
||||
timeout_in_minutes: 15
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 4
|
||||
source_file_dependencies:
|
||||
@ -189,7 +198,8 @@ steps:
|
||||
commands:
|
||||
- pytest -v -s distributed/test_eplb_execute.py
|
||||
|
||||
- label: Metrics, Tracing Test # 10min
|
||||
- label: Metrics, Tracing Test # 12min
|
||||
timeout_in_minutes: 20
|
||||
mirror_hardwares: [amdexperimental]
|
||||
num_gpus: 2
|
||||
source_file_dependencies:
|
||||
@ -208,7 +218,8 @@ steps:
|
||||
##### fast check tests #####
|
||||
##### 1 GPU test #####
|
||||
|
||||
- label: Regression Test # 5min
|
||||
- label: Regression Test # 7min
|
||||
timeout_in_minutes: 20
|
||||
mirror_hardwares: [amdexperimental]
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
@ -218,7 +229,8 @@ steps:
|
||||
- pytest -v -s test_regression.py
|
||||
working_dir: "/vllm-workspace/tests" # optional
|
||||
|
||||
- label: Engine Test # 10min
|
||||
- label: Engine Test # 25min
|
||||
timeout_in_minutes: 40
|
||||
mirror_hardwares: [amdexperimental]
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
@ -233,7 +245,8 @@ steps:
|
||||
# OOM in the CI unless we run this separately
|
||||
- pytest -v -s tokenization
|
||||
|
||||
- label: V1 Test e2e + engine
|
||||
- label: V1 Test e2e + engine # 30min
|
||||
timeout_in_minutes: 45
|
||||
mirror_hardwares: [amdexperimental]
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
@ -244,7 +257,8 @@ steps:
|
||||
- pytest -v -s v1/e2e
|
||||
- pytest -v -s v1/engine
|
||||
|
||||
- label: V1 Test entrypoints
|
||||
- label: V1 Test entrypoints # 35min
|
||||
timeout_in_minutes: 50
|
||||
mirror_hardwares: [amdexperimental]
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
@ -252,7 +266,8 @@ steps:
|
||||
commands:
|
||||
- pytest -v -s v1/entrypoints
|
||||
|
||||
- label: V1 Test others
|
||||
- label: V1 Test others # 42min
|
||||
timeout_in_minutes: 60
|
||||
mirror_hardwares: [amdexperimental]
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
@ -276,7 +291,8 @@ steps:
|
||||
- pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
|
||||
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
|
||||
|
||||
- label: Examples Test # 25min
|
||||
- label: Examples Test # 30min
|
||||
timeout_in_minutes: 45
|
||||
mirror_hardwares: [amdexperimental]
|
||||
working_dir: "/vllm-workspace/examples"
|
||||
source_file_dependencies:
|
||||
@ -301,7 +317,8 @@ steps:
|
||||
- python3 offline_inference/basic/score.py
|
||||
- VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
|
||||
|
||||
- label: Platform Tests (CUDA)
|
||||
- label: Platform Tests (CUDA) # 4min
|
||||
timeout_in_minutes: 15
|
||||
mirror_hardwares: [amdexperimental]
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
@ -309,7 +326,8 @@ steps:
|
||||
commands:
|
||||
- pytest -v -s cuda/test_cuda_context.py
|
||||
|
||||
- label: Samplers Test # 36min
|
||||
- label: Samplers Test # 56min
|
||||
timeout_in_minutes: 75
|
||||
mirror_hardwares: [amdexperimental]
|
||||
source_file_dependencies:
|
||||
- vllm/model_executor/layers
|
||||
@ -320,15 +338,23 @@ steps:
|
||||
- pytest -v -s samplers
|
||||
- VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
|
||||
|
||||
- label: LoRA Test %N # 15min each
|
||||
- label: LoRA Test %N # 20min each
|
||||
timeout_in_minutes: 30
|
||||
mirror_hardwares: [amdexperimental]
|
||||
source_file_dependencies:
|
||||
- vllm/lora
|
||||
- tests/lora
|
||||
command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_llm_with_multi_loras.py
|
||||
commands:
|
||||
- pytest -v -s lora \
|
||||
--shard-id=$$BUILDKITE_PARALLEL_JOB \
|
||||
--num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
|
||||
--ignore=lora/test_chatglm3_tp.py \
|
||||
--ignore=lora/test_llama_tp.py \
|
||||
--ignore=lora/test_llm_with_multi_loras.py
|
||||
parallelism: 4
|
||||
|
||||
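The LoRA step above also reflows its single long `command:` into a multi-line `commands:` list while keeping the same pytest invocation. To reproduce one of the four parallel shards locally, something like the following should work, assuming the `pytest-shard` plugin that provides `--shard-id`/`--num-shards` is installed in the test environment and the working directory mirrors the CI `tests/` tree.

```bash
# Run shard 0 of 4 of the LoRA suite, mirroring the CI step above.
cd tests
pytest -v -s lora \
  --shard-id=0 \
  --num-shards=4 \
  --ignore=lora/test_chatglm3_tp.py \
  --ignore=lora/test_llama_tp.py \
  --ignore=lora/test_llm_with_multi_loras.py
```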
- label: PyTorch Compilation Unit Tests
|
||||
- label: PyTorch Compilation Unit Tests # 15min
|
||||
timeout_in_minutes: 30
|
||||
mirror_hardwares: [amdexperimental]
|
||||
torch_nightly: true
|
||||
source_file_dependencies:
|
||||
@ -344,7 +370,8 @@ steps:
|
||||
- pytest -v -s compile/test_fusion_all_reduce.py
|
||||
- pytest -v -s compile/test_decorator.py
|
||||
|
||||
- label: PyTorch Fullgraph Smoke Test # 9min
|
||||
- label: PyTorch Fullgraph Smoke Test # 15min
|
||||
timeout_in_minutes: 30
|
||||
mirror_hardwares: [amdexperimental]
|
||||
torch_nightly: true
|
||||
source_file_dependencies:
|
||||
@ -358,7 +385,8 @@ steps:
|
||||
- pytest -v -s compile/piecewise/test_full_cudagraph.py
|
||||
- pytest -v -s compile/piecewise/test_multiple_graphs.py
|
||||
|
||||
- label: PyTorch Fullgraph Test # 18min
|
||||
- label: PyTorch Fullgraph Test # 20min
|
||||
timeout_in_minutes: 30
|
||||
mirror_hardwares: [amdexperimental]
|
||||
torch_nightly: true
|
||||
source_file_dependencies:
|
||||
@ -367,7 +395,8 @@ steps:
|
||||
commands:
|
||||
- pytest -v -s compile/test_full_graph.py
|
||||
|
||||
- label: Kernels Core Operation Test
|
||||
- label: Kernels Core Operation Test # 48min
|
||||
timeout_in_minutes: 75
|
||||
mirror_hardwares: [amdexperimental]
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
@ -375,7 +404,8 @@ steps:
|
||||
commands:
|
||||
- pytest -v -s kernels/core
|
||||
|
||||
- label: Kernels Attention Test %N
|
||||
- label: Kernels Attention Test %N # 23min
|
||||
timeout_in_minutes: 35
|
||||
mirror_hardwares: [amdexperimental]
|
||||
source_file_dependencies:
|
||||
- csrc/attention/
|
||||
@ -386,7 +416,8 @@ steps:
|
||||
- pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
|
||||
parallelism: 2
|
||||
|
||||
- label: Kernels Quantization Test %N
|
||||
- label: Kernels Quantization Test %N # 64min
|
||||
timeout_in_minutes: 90
|
||||
mirror_hardwares: [amdexperimental]
|
||||
source_file_dependencies:
|
||||
- csrc/quantization/
|
||||
@ -396,7 +427,8 @@ steps:
|
||||
- pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
|
||||
parallelism: 2
|
||||
|
||||
- label: Kernels MoE Test %N
|
||||
- label: Kernels MoE Test %N # 40min
|
||||
timeout_in_minutes: 60
|
||||
mirror_hardwares: [amdexperimental]
|
||||
source_file_dependencies:
|
||||
- csrc/quantization/cutlass_w8a8/moe/
|
||||
@ -408,7 +440,8 @@ steps:
|
||||
- pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
|
||||
parallelism: 2
|
||||
|
||||
- label: Kernels Mamba Test
|
||||
- label: Kernels Mamba Test # 31min
|
||||
timeout_in_minutes: 45
|
||||
mirror_hardwares: [amdexperimental]
|
||||
source_file_dependencies:
|
||||
- csrc/mamba/
|
||||
@ -416,7 +449,8 @@ steps:
|
||||
commands:
|
||||
- pytest -v -s kernels/mamba
|
||||
|
||||
- label: Tensorizer Test # 11min
|
||||
- label: Tensorizer Test # 14min
|
||||
timeout_in_minutes: 25
|
||||
mirror_hardwares: [amdexperimental]
|
||||
source_file_dependencies:
|
||||
- vllm/model_executor/model_loader
|
||||
@ -428,7 +462,8 @@ steps:
|
||||
- pytest -v -s tensorizer_loader
|
||||
- pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
|
||||
|
||||
- label: Model Executor Test
|
||||
- label: Model Executor Test # 7min
|
||||
timeout_in_minutes: 20
|
||||
mirror_hardwares: [amdexperimental]
|
||||
source_file_dependencies:
|
||||
- vllm/model_executor
|
||||
@ -438,7 +473,8 @@ steps:
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -v -s model_executor
|
||||
|
||||
- label: Benchmarks # 9min
|
||||
- label: Benchmarks # 11min
|
||||
timeout_in_minutes: 20
|
||||
mirror_hardwares: [amdexperimental]
|
||||
working_dir: "/vllm-workspace/.buildkite"
|
||||
source_file_dependencies:
|
||||
@ -446,7 +482,8 @@ steps:
|
||||
commands:
|
||||
- bash scripts/run-benchmarks.sh
|
||||
|
||||
- label: Benchmarks CLI Test # 10min
|
||||
- label: Benchmarks CLI Test # 7min
|
||||
timeout_in_minutes: 20
|
||||
mirror_hardwares: [amdexperimental]
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
@ -454,7 +491,8 @@ steps:
|
||||
commands:
|
||||
- pytest -v -s benchmarks/
|
||||
|
||||
- label: Quantization Test
|
||||
- label: Quantization Test # 70min
|
||||
timeout_in_minutes: 90
|
||||
mirror_hardwares: [amdexperimental]
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
@ -467,6 +505,7 @@ steps:
|
||||
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
|
||||
|
||||
- label: LM Eval Small Models # 53min
|
||||
timeout_in_minutes: 75
|
||||
mirror_hardwares: [amdexperimental]
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
@ -474,7 +513,8 @@ steps:
|
||||
commands:
|
||||
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
|
||||
|
||||
- label: OpenAI API correctness
|
||||
- label: OpenAI API correctness # 22min
|
||||
timeout_in_minutes: 30
|
||||
mirror_hardwares: [amdexperimental]
|
||||
source_file_dependencies:
|
||||
- csrc/
|
||||
@ -483,7 +523,8 @@ steps:
|
||||
commands: # LMEval+Transcription WER check
|
||||
- pytest -s entrypoints/openai/correctness/
|
||||
|
||||
- label: Encoder Decoder tests # 5min
|
||||
- label: Encoder Decoder tests # 12min
|
||||
timeout_in_minutes: 20
|
||||
mirror_hardwares: [amdexperimental]
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
@ -491,7 +532,8 @@ steps:
|
||||
commands:
|
||||
- pytest -v -s encoder_decoder
|
||||
|
||||
- label: OpenAI-Compatible Tool Use # 20 min
|
||||
- label: OpenAI-Compatible Tool Use # 23 min
|
||||
timeout_in_minutes: 35
|
||||
mirror_hardwares: [amdexperimental]
|
||||
fast_check: false
|
||||
source_file_dependencies:
|
||||
@ -504,7 +546,8 @@ steps:
|
||||
|
||||
##### models test #####
|
||||
|
||||
- label: Basic Models Test # 24min
|
||||
- label: Basic Models Test # 57min
|
||||
timeout_in_minutes: 75
|
||||
mirror_hardwares: [amdexperimental]
|
||||
torch_nightly: true
|
||||
source_file_dependencies:
|
||||
@ -517,7 +560,8 @@ steps:
|
||||
- pytest -v -s models/test_vision.py
|
||||
- pytest -v -s models/test_initialization.py
|
||||
|
||||
- label: Language Models Test (Standard)
|
||||
- label: Language Models Test (Standard) # 35min
|
||||
timeout_in_minutes: 45
|
||||
mirror_hardwares: [amdexperimental]
|
||||
torch_nightly: true
|
||||
source_file_dependencies:
|
||||
@ -528,6 +572,7 @@ steps:
|
||||
- pytest -v -s models/language -m core_model
|
||||
|
||||
- label: Language Models Test (Hybrid) # 35 min
|
||||
timeout_in_minutes: 45
|
||||
mirror_hardwares: [amdexperimental]
|
||||
torch_nightly: true
|
||||
source_file_dependencies:
|
||||
@ -540,7 +585,8 @@ steps:
|
||||
- uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
|
||||
- pytest -v -s models/language/generation -m hybrid_model
|
||||
|
||||
- label: Language Models Test (Extended Generation) # 1hr20min
|
||||
- label: Language Models Test (Extended Generation) # 80min
|
||||
timeout_in_minutes: 110
|
||||
mirror_hardwares: [amdexperimental]
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
@ -552,6 +598,7 @@ steps:
|
||||
- pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
|
||||
|
||||
- label: Language Models Test (Extended Pooling) # 36min
|
||||
timeout_in_minutes: 50
|
||||
mirror_hardwares: [amdexperimental]
|
||||
optional: true
|
||||
source_file_dependencies:
|
||||
@ -560,7 +607,8 @@ steps:
|
||||
commands:
|
||||
- pytest -v -s models/language/pooling -m 'not core_model'
|
||||
|
||||
- label: Multi-Modal Processor Test
|
||||
- label: Multi-Modal Processor Test # 44min
|
||||
timeout_in_minutes: 60
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/models/multimodal
|
||||
@ -568,7 +616,8 @@ steps:
|
||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||
- pytest -v -s models/multimodal/processing
|
||||
|
||||
- label: Multi-Modal Models Test (Standard)
|
||||
- label: Multi-Modal Models Test (Standard) # 60min
|
||||
timeout_in_minutes: 80
|
||||
mirror_hardwares: [amdexperimental]
|
||||
torch_nightly: true
|
||||
source_file_dependencies:
|
||||
@ -610,7 +659,8 @@ steps:
|
||||
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
|
||||
- pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'
|
||||
|
||||
- label: Quantized Models Test
|
||||
- label: Quantized Models Test # 45 min
|
||||
timeout_in_minutes: 60
|
||||
mirror_hardwares: [amdexperimental]
|
||||
source_file_dependencies:
|
||||
- vllm/model_executor/layers/quantization
|
||||
@ -640,7 +690,8 @@ steps:
|
||||
- python3 examples/offline_inference/audio_language.py --model-type whisper
|
||||
- python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
|
||||
|
||||
- label: Blackwell Test
|
||||
- label: Blackwell Test # 38 min
|
||||
timeout_in_minutes: 60
|
||||
working_dir: "/vllm-workspace/"
|
||||
gpu: b200
|
||||
# optional: true
|
||||
@ -666,7 +717,7 @@ steps:
|
||||
# Quantization
|
||||
- pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
|
||||
- pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
|
||||
# - pytest -v -s tests/kernels/quantization/test_silu_nvfp4_quant_fusion.py
|
||||
- pytest -v -s tests/kernels/quantization/test_silu_nvfp4_quant_fusion.py
|
||||
- pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
|
||||
- pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
|
||||
- pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
|
||||
@ -676,12 +727,13 @@ steps:
|
||||
- pytest -v -s tests/compile/test_fusion_all_reduce.py
|
||||
- pytest -v -s tests/compile/test_fusion_attn.py::test_attention_quant_pattern
|
||||
- pytest -v -s tests/kernels/moe/test_flashinfer.py
|
||||
# - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
|
||||
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
|
||||
|
||||
##### 1 GPU test #####
|
||||
##### multi gpus test #####
|
||||
|
||||
- label: Distributed Comm Ops Test # 7min
|
||||
timeout_in_minutes: 20
|
||||
mirror_hardwares: [amdexperimental]
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 2
|
||||
@ -693,6 +745,7 @@ steps:
|
||||
- pytest -v -s distributed/test_shm_broadcast.py
|
||||
|
||||
- label: 2 Node Tests (4 GPUs in total) # 16min
|
||||
timeout_in_minutes: 30
|
||||
mirror_hardwares: [amdexperimental]
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 2
|
||||
@ -716,7 +769,8 @@ steps:
|
||||
- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
|
||||
- python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
|
||||
|
||||
- label: Distributed Tests (2 GPUs) # 40min
|
||||
- label: Distributed Tests (2 GPUs) # 110min
|
||||
timeout_in_minutes: 150
|
||||
mirror_hardwares: [amdexperimental]
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 2
|
||||
@ -757,6 +811,7 @@ steps:
|
||||
- pytest -v -s models/multimodal/generation/test_maverick.py
|
||||
|
||||
- label: Plugin Tests (2 GPUs) # 40min
|
||||
timeout_in_minutes: 60
|
||||
mirror_hardwares: [amdexperimental]
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 2
|
||||
@ -782,7 +837,8 @@ steps:
|
||||
- pytest -v -s models/test_oot_registration.py # it needs a clean process
|
||||
- pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins
|
||||
|
||||
- label: Pipeline Parallelism Test # 45min
|
||||
- label: Pipeline + Context Parallelism Test # 45min
|
||||
timeout_in_minutes: 60
|
||||
mirror_hardwares: [amdexperimental]
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 4
|
||||
@ -795,8 +851,10 @@ steps:
|
||||
commands:
|
||||
- pytest -v -s distributed/test_pp_cudagraph.py
|
||||
- pytest -v -s distributed/test_pipeline_parallel.py
|
||||
# - pytest -v -s distributed/test_context_parallel.py # TODO: enable it on Hopper runners or add triton MLA support
|
||||
|
||||
- label: LoRA TP Test (Distributed)
|
||||
- label: LoRA TP Test (Distributed) # 17 min
|
||||
timeout_in_minutes: 30
|
||||
mirror_hardwares: [amdexperimental]
|
||||
num_gpus: 4
|
||||
source_file_dependencies:
|
||||
@ -814,6 +872,7 @@ steps:
|
||||
|
||||
|
||||
- label: Weight Loading Multiple GPU Test # 33min
|
||||
timeout_in_minutes: 45
|
||||
mirror_hardwares: [amdexperimental]
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 2
|
||||
|
||||
13  .github/CODEOWNERS  (vendored)
@ -5,13 +5,15 @@
/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/core @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256
/vllm/model_executor/layers/mamba @tdoublep
/vllm/model_executor/model_loader @22quinn
/vllm/multimodal @DarkLight1337 @ywang96
/vllm/v1/sample @22quinn @houseroad
/vllm/vllm_flash_attn @LucasWilkinson
/vllm/lora @jeejeelee
/vllm/reasoning @aarnphm
@ -25,7 +27,8 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson

# vLLM V1
/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
/vllm/v1/structured_output @mgoin @russellb @aarnphm
/vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
/vllm/v1/spec_decode @benchislett @luccafong
/vllm/v1/attention/backends/triton_attn.py @tdoublep

# Test ownership
@ -67,6 +70,9 @@ mkdocs.yaml @hmellor
/vllm/attention/backends/dual_chunk_flash_attn.py @sighingnow
/vllm/model_executor/models/qwen* @sighingnow

# MTP-specific files
/vllm/model_executor/models/deepseek_mtp.py @luccafong

# Mistral-specific files
/vllm/model_executor/models/mistral*.py @patrickvonplaten
/vllm/model_executor/models/mixtral*.py @patrickvonplaten
@ -85,4 +91,3 @@ mkdocs.yaml @hmellor
/vllm/v1/attention/backends/mla/rocm*.py @gshtras
/vllm/attention/ops/rocm*.py @gshtras
/vllm/model_executor/layers/fused_moe/rocm*.py @gshtras

14  .github/mergify.yml  (vendored)
@ -273,6 +273,20 @@ pull_request_rules:
        users:
          - "sangstar"

  - name: assign reviewer for modelopt changes
    conditions:
      - or:
        - files~=^vllm/model_executor/layers/quantization/modelopt\.py$
        - files~=^vllm/model_executor/layers/quantization/__init__\.py$
        - files~=^tests/models/quantization/test_modelopt\.py$
        - files~=^tests/quantization/test_modelopt\.py$
        - files~=^tests/models/quantization/test_nvfp4\.py$
        - files~=^docs/features/quantization/modelopt\.md$
    actions:
      assign:
        users:
          - "Edwardf0t1"

  - name: remove 'needs-rebase' label when conflict is resolved
    conditions:
      - -conflict

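As an aside, the new `files~=` conditions in the modelopt rule above are anchored regexes matched against each changed path; a minimal sketch of that matching behavior (illustrative only, not part of the PR):

    import re

    MODELOPT_PATTERNS = [
        r"^vllm/model_executor/layers/quantization/modelopt\.py$",
        r"^vllm/model_executor/layers/quantization/__init__\.py$",
        r"^tests/models/quantization/test_modelopt\.py$",
        r"^tests/quantization/test_modelopt\.py$",
        r"^tests/models/quantization/test_nvfp4\.py$",
        r"^docs/features/quantization/modelopt\.md$",
    ]

    def triggers_modelopt_rule(changed_files: list[str]) -> bool:
        # the `or:` block matches if any condition matches any changed file
        return any(re.match(p, f) for p in MODELOPT_PATTERNS for f in changed_files)

    assert triggers_modelopt_rule(["vllm/model_executor/layers/quantization/modelopt.py"])
    assert not triggers_modelopt_rule(["vllm/model_executor/layers/activation.py"])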
2  .github/workflows/cleanup_pr_body.yml  (vendored)
@ -16,7 +16,7 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

      - name: Set up Python
        uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
        uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
        with:
          python-version: '3.12'

2  .github/workflows/pre-commit.yml  (vendored)
@ -17,7 +17,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
    - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
    - uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
    - uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
      with:
        python-version: "3.12"
    - run: echo "::add-matcher::.github/workflows/matchers/actionlint.json"

@ -2,7 +2,6 @@ include LICENSE
include requirements/common.txt
include requirements/cuda.txt
include requirements/rocm.txt
include requirements/neuron.txt
include requirements/cpu.txt
include CMakeLists.txt

@ -18,16 +18,17 @@ Easy, fast, and cheap LLM serving for everyone

*Latest News* 🔥

- [2025/08] We hosted [vLLM Shenzhen Meetup](https://mp.weixin.qq.com/s/k8ZBO1u2_2odgiKWH_GVTQ) focusing on the ecosystem around vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Ua2SVKVSu-wp5vou_6ElraDt2bnKhiEA).
- [2025/08] We hosted [vLLM Singapore Meetup](https://www.sginnovate.com/event/vllm-sg-meet). We shared V1 updates, disaggregated serving and MLLM speedups with speakers from Embedded LLM, AMD, WekaIO, and A*STAR. Please find the meetup slides [here](https://drive.google.com/drive/folders/1ncf3GyqLdqFaB6IeB834E5TZJPLAOiXZ?usp=sharing).
- [2025/08] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/pDmAXHcN7Iqc8sUKgJgGtg) focusing on building, developing, and integrating with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1OvLx39wnCGy_WKq8SiVKf7YcxxYI3WCH).
- [2025/08] We hosted [vLLM Korea Meetup](https://luma.com/cgcgprmh) with Red Hat and Rebellions! We shared the latest advancements in vLLM along with project spotlights from the vLLM Korea community. Please find the meetup slides [here](https://drive.google.com/file/d/1bcrrAE1rxUgx0mjIeOWT6hNe2RefC5Hm/view).
- [2025/08] We hosted [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/dgkWg1WFpWGO2jCdTqQHxA) focusing on large-scale LLM deployment! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF) and the recording [here](https://www.chaspark.com/#/live/1166916873711665152).
- [2025/05] vLLM is now a hosted project under PyTorch Foundation! Please find the announcement [here](https://pytorch.org/blog/pytorch-foundation-welcomes-vllm/).
- [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html).

<details>
<summary>Previous News</summary>

- [2025/08] We hosted [vLLM Korea Meetup](https://luma.com/cgcgprmh) with Red Hat and Rebellions! We shared the latest advancements in vLLM along with project spotlights from the vLLM Korea community. Please find the meetup slides [here](https://drive.google.com/file/d/1bcrrAE1rxUgx0mjIeOWT6hNe2RefC5Hm/view).
- [2025/08] We hosted [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/dgkWg1WFpWGO2jCdTqQHxA) focusing on large-scale LLM deployment! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF) and the recording [here](https://www.chaspark.com/#/live/1166916873711665152).
- [2025/05] We hosted [NYC vLLM Meetup](https://lu.ma/c1rqyf1f)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1_q_aW_ioMJWUImf1s1YM-ZhjXz8cUeL0IJvaquOYBeA/edit?usp=sharing).
- [2025/04] We hosted [Asia Developer Day](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/19cp6Qu8u48ihB91A064XfaXruNYiBOUKrBxAmDOllOo/edit?usp=sharing).
- [2025/03] We hosted [vLLM x Ollama Inference Night](https://lu.ma/vllm-ollama)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/16T2PDD1YwRnZ4Tu8Q5r6n53c5Lr5c73UV9Vd2_eBo4U/edit?usp=sharing).

104  benchmarks/kernels/benchmark_activation.py  (Normal file)
@ -0,0 +1,104 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

# benchmark custom activation op performance
import itertools

import torch

import vllm.model_executor.layers.activation # noqa F401
from vllm.model_executor.custom_op import CustomOp
from vllm.platforms import current_platform
from vllm.triton_utils import triton
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser

batch_size_range = [1, 16, 32, 64, 128]
seq_len_range = [1, 16, 64, 128, 256, 512, 1024, 2048, 4096]
intermediate_size = [3072, 9728, 12288]
configs = list(itertools.product(batch_size_range, seq_len_range, intermediate_size))


def benchmark_activation(
    batch_size: int,
    seq_len: int,
    intermediate_size: int,
    provider: str,
    func_name: str,
    dtype: torch.dtype,
):
    device = "cuda"
    num_tokens = batch_size * seq_len
    dim = intermediate_size
    current_platform.seed_everything(42)
    torch.set_default_device(device)

    if func_name == "gelu_and_mul":
        layer = CustomOp.op_registry[func_name](approximate="none")
    elif func_name == "gelu_and_mul_tanh":
        layer = CustomOp.op_registry["gelu_and_mul"](approximate="tanh")
    elif func_name == "fatrelu_and_mul":
        threshold = 0.5
        layer = CustomOp.op_registry[func_name](threshold)
    else:
        layer = CustomOp.op_registry[func_name]()

    x = torch.randn(num_tokens, dim, dtype=dtype, device=device)
    compiled_layer = torch.compile(layer.forward_native)

    if provider == "custom":
        fn = lambda: layer(x)
    elif provider == "compiled":
        fn = lambda: compiled_layer(x)

    ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
        fn, quantiles=[0.5, 0.2, 0.8]
    )
    return ms, max_ms, min_ms


if __name__ == "__main__":
    parser = FlexibleArgumentParser(description="Benchmark the custom activation op.")
    parser.add_argument(
        "--func-name",
        type=str,
        choices=[
            "mul_and_silu",
            "silu_and_mul",
            "gelu_and_mul",
            "gelu_and_mul_tanh",
            "fatrelu_and_mul",
            "swigluoai_and_mul",
            "gelu_new",
            "gelu_fast",
            "quick_gelu",
        ],
        default="silu_and_mul",
    )
    parser.add_argument(
        "--dtype", type=str, choices=["half", "bfloat16", "float"], default="bfloat16"
    )
    args = parser.parse_args()
    assert args

    func_name = args.func_name
    dtype = STR_DTYPE_TO_TORCH_DTYPE[args.dtype]

    perf_report = triton.testing.perf_report(
        triton.testing.Benchmark(
            x_names=["batch_size", "seq_len", "intermediate_size"],
            x_vals=configs,
            line_arg="provider",
            line_vals=["custom", "compiled"],
            line_names=["Custom OP", "Compiled"],
            styles=[("blue", "-"), ("green", "-")],
            ylabel="ms",
            plot_name=f"{func_name}-op-performance",
            args={},
        )
    )

    perf_report(
        lambda batch_size, seq_len, intermediate_size, provider: benchmark_activation(
            batch_size, seq_len, intermediate_size, provider, func_name, dtype
        )
    ).run(print_data=True)
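A typical invocation of this new benchmark, using only the flags defined in its argparse block, would be `python benchmarks/kernels/benchmark_activation.py --func-name silu_and_mul --dtype bfloat16`; it sweeps the batch-size, sequence-length, and intermediate-size grid defined at module level and reports the median/20th/80th-percentile latency of the custom CUDA op versus the `torch.compile`d native implementation.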
@ -678,7 +678,11 @@ def main(args: argparse.Namespace):
|
||||
is_fp16 = not (use_fp8_w8a8 or use_int8_w8a16)
|
||||
search_space = get_configs_compute_bound(is_fp16, block_quant_shape)
|
||||
print(f"Start tuning over {len(search_space)} configurations...")
|
||||
|
||||
if use_deep_gemm:
|
||||
raise ValueError(
|
||||
"Tuning with --use-deep-gemm is not supported as it only tunes Triton "
|
||||
"kernels. Please remove the flag."
|
||||
)
|
||||
start = time.time()
|
||||
configs = _distribute(
|
||||
"tune",
|
||||
|
||||
@ -36,6 +36,7 @@ limitations under the License.
|
||||
#if !defined(CUDA_VERSION) || CUDA_VERSION < 12040
|
||||
void sm100_cutlass_mla_decode(
|
||||
torch::Tensor const& out,
|
||||
torch::Tensor const& lse,
|
||||
torch::Tensor const& q_nope,
|
||||
torch::Tensor const& q_pe,
|
||||
torch::Tensor const& kv_c_and_k_pe_cache,
|
||||
@ -99,6 +100,7 @@ struct MlaSm100 {
|
||||
template <typename T>
|
||||
typename T::Fmha::Arguments args_from_options(
|
||||
at::Tensor const& out,
|
||||
at::Tensor const& lse,
|
||||
at::Tensor const& q_nope,
|
||||
at::Tensor const& q_pe,
|
||||
at::Tensor const& kv_c_and_k_pe_cache,
|
||||
@ -162,7 +164,10 @@ typename T::Fmha::Arguments args_from_options(
|
||||
stride_PT,
|
||||
page_count_total,
|
||||
page_size},
|
||||
{static_cast<ElementOut*>(out.data_ptr()), stride_O, static_cast<ElementAcc*>(nullptr), stride_LSE},
|
||||
{static_cast<ElementOut*>(out.data_ptr()),
|
||||
stride_O,
|
||||
static_cast<ElementAcc*>(lse.defined() ? lse.data_ptr() : nullptr),
|
||||
stride_LSE},
|
||||
hw_info,
|
||||
// TODO(trevor-m): Change split_kv back to -1 when
|
||||
// https://github.com/NVIDIA/cutlass/issues/2274 is fixed. Split_kv=1 will
|
||||
@ -181,6 +186,7 @@ typename T::Fmha::Arguments args_from_options(
|
||||
template <typename Element, typename ElementOut, bool IsPaged128, typename PersistenceOption>
|
||||
void runMla(
|
||||
at::Tensor const& out,
|
||||
at::Tensor const& lse,
|
||||
at::Tensor const& q_nope,
|
||||
at::Tensor const& q_pe,
|
||||
at::Tensor const& kv_c_and_k_pe_cache,
|
||||
@ -192,7 +198,7 @@ void runMla(
|
||||
cudaStream_t stream) {
|
||||
using MlaSm100Type = MlaSm100<Element, ElementOut, IsPaged128, PersistenceOption>;
|
||||
typename MlaSm100Type::Fmha fmha;
|
||||
auto arguments = args_from_options<MlaSm100Type>(out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, sm_scale, num_kv_splits);
|
||||
auto arguments = args_from_options<MlaSm100Type>(out, lse, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, sm_scale, num_kv_splits);
|
||||
|
||||
CUTLASS_CHECK(fmha.can_implement(arguments));
|
||||
|
||||
@ -214,6 +220,7 @@ void runMla(
|
||||
|
||||
void sm100_cutlass_mla_decode(
|
||||
torch::Tensor const& out,
|
||||
torch::Tensor const& lse,
|
||||
torch::Tensor const& q_nope,
|
||||
torch::Tensor const& q_pe,
|
||||
torch::Tensor const& kv_c_and_k_pe_cache,
|
||||
@ -234,13 +241,13 @@ void sm100_cutlass_mla_decode(
|
||||
DISPATCH_BOOL(num_kv_splits <= 1, NotManualSplitKV, [&] {
|
||||
if (in_dtype == at::ScalarType::Half) {
|
||||
runMla<cutlass::half_t, cutlass::half_t, IsPaged128, IsPersistent<NotManualSplitKV>>(
|
||||
out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream);
|
||||
out, lse, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream);
|
||||
} else if (in_dtype == at::ScalarType::BFloat16) {
|
||||
runMla<cutlass::bfloat16_t, cutlass::bfloat16_t, IsPaged128, IsPersistent<NotManualSplitKV>>(
|
||||
out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream);
|
||||
out, lse, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream);
|
||||
} else if (in_dtype == at::ScalarType::Float8_e4m3fn) {
|
||||
runMla<cutlass::float_e4m3_t, cutlass::bfloat16_t, IsPaged128, IsPersistent<NotManualSplitKV>>(
|
||||
out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream);
|
||||
out, lse, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream);
|
||||
} else {
|
||||
TORCH_CHECK(false, "Unsupported input data type of MLA");
|
||||
}
|
||||
|
||||
@ -36,13 +36,6 @@ void concat_and_cache_mla(torch::Tensor& kv_c, torch::Tensor& k_pe,
|
||||
const std::string& kv_cache_dtype,
|
||||
torch::Tensor& scale);
|
||||
|
||||
void cp_fused_concat_and_cache_mla(torch::Tensor& kv_c, torch::Tensor& k_pe,
|
||||
torch::Tensor& cp_local_token_select_indices,
|
||||
torch::Tensor& kv_cache,
|
||||
torch::Tensor& slot_mapping,
|
||||
const std::string& kv_cache_dtype,
|
||||
torch::Tensor& scale);
|
||||
|
||||
// Just for unittest
|
||||
void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache,
|
||||
const double scale, const std::string& kv_cache_dtype);
|
||||
|
||||
@ -396,51 +396,6 @@ __global__ void concat_and_cache_mla_kernel(
|
||||
copy(k_pe, kv_cache, k_pe_stride, block_stride, pe_dim, kv_lora_rank);
|
||||
}
|
||||
|
||||
template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
|
||||
__global__ void cp_fused_concat_and_cache_mla_kernel(
|
||||
const scalar_t* __restrict__ kv_c, // [num_full_tokens, kv_lora_rank]
|
||||
const scalar_t* __restrict__ k_pe, // [num_full_tokens, pe_dim]
|
||||
const int64_t* __restrict__ cp_local_token_select_indices, // [num_tokens]
|
||||
cache_t* __restrict__ kv_cache, // [num_blocks, block_size, (kv_lora_rank
|
||||
// + pe_dim)]
|
||||
const int64_t* __restrict__ slot_mapping, // [num_tokens]
|
||||
const int block_stride, //
|
||||
const int entry_stride, //
|
||||
const int kv_c_stride, //
|
||||
const int k_pe_stride, //
|
||||
const int kv_lora_rank, //
|
||||
const int pe_dim, //
|
||||
const int block_size, //
|
||||
const float* scale //
|
||||
) {
|
||||
const int64_t token_idx = cp_local_token_select_indices[blockIdx.x];
|
||||
const int64_t slot_idx = slot_mapping[blockIdx.x];
|
||||
// NOTE: slot_idx can be -1 if the token is padded
|
||||
if (slot_idx < 0) {
|
||||
return;
|
||||
}
|
||||
const int64_t block_idx = slot_idx / block_size;
|
||||
const int64_t block_offset = slot_idx % block_size;
|
||||
|
||||
auto copy = [&](const scalar_t* __restrict__ src, cache_t* __restrict__ dst,
|
||||
int src_stride, int dst_stride, int size, int offset) {
|
||||
for (int i = threadIdx.x; i < size; i += blockDim.x) {
|
||||
const int64_t src_idx = token_idx * src_stride + i;
|
||||
const int64_t dst_idx =
|
||||
block_idx * block_stride + block_offset * entry_stride + i + offset;
|
||||
if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
|
||||
dst[dst_idx] = src[src_idx];
|
||||
} else {
|
||||
dst[dst_idx] =
|
||||
fp8::scaled_convert<cache_t, scalar_t, kv_dt>(src[src_idx], *scale);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
copy(kv_c, kv_cache, kv_c_stride, block_stride, kv_lora_rank, 0);
|
||||
copy(k_pe, kv_cache, k_pe_stride, block_stride, pe_dim, kv_lora_rank);
|
||||
}
|
||||
|
||||
} // namespace vllm
|
||||
|
||||
// KV_T is the data type of key and value tensors.
|
||||
@ -554,20 +509,6 @@ void reshape_and_cache_flash(
|
||||
kv_c_stride, k_pe_stride, kv_lora_rank, pe_dim, block_size, \
|
||||
reinterpret_cast<const float*>(scale.data_ptr()));
|
||||
|
||||
// KV_T is the data type of key and value tensors.
|
||||
// CACHE_T is the stored data type of kv-cache.
|
||||
// KV_DTYPE is the real data type of kv-cache.
|
||||
#define CALL_CP_FUSED_CONCAT_AND_CACHE_MLA(KV_T, CACHE_T, KV_DTYPE) \
|
||||
vllm::cp_fused_concat_and_cache_mla_kernel<KV_T, CACHE_T, KV_DTYPE> \
|
||||
<<<grid, block, 0, stream>>>( \
|
||||
reinterpret_cast<KV_T*>(kv_c.data_ptr()), \
|
||||
reinterpret_cast<KV_T*>(k_pe.data_ptr()), \
|
||||
cp_local_token_select_indices.data_ptr<int64_t>(), \
|
||||
reinterpret_cast<CACHE_T*>(kv_cache.data_ptr()), \
|
||||
slot_mapping.data_ptr<int64_t>(), block_stride, entry_stride, \
|
||||
kv_c_stride, k_pe_stride, kv_lora_rank, pe_dim, block_size, \
|
||||
reinterpret_cast<const float*>(scale.data_ptr()));
|
||||
|
||||
void concat_and_cache_mla(
|
||||
torch::Tensor& kv_c, // [num_tokens, kv_lora_rank]
|
||||
torch::Tensor& k_pe, // [num_tokens, pe_dim]
|
||||
@ -606,50 +547,6 @@ void concat_and_cache_mla(
|
||||
CALL_CONCAT_AND_CACHE_MLA);
|
||||
}
|
||||
|
||||
// Note(hc): cp_fused_concat_and_cache_mla fuses the following three kernel
|
||||
// calls into one:
|
||||
// k_c_normed.index_select(0, cp_local_token_select_indices) + \
|
||||
// k_pe.squeeze(1).index_select(0, cp_local_token_select_indices) + \
|
||||
// concat_and_cache_mla.
|
||||
void cp_fused_concat_and_cache_mla(
|
||||
torch::Tensor& kv_c, // [num_total_tokens, kv_lora_rank]
|
||||
torch::Tensor& k_pe, // [num_total_tokens, pe_dim]
|
||||
torch::Tensor& cp_local_token_select_indices, // [num_tokens]
|
||||
torch::Tensor& kv_cache, // [num_blocks, block_size, (kv_lora_rank +
|
||||
// pe_dim)]
|
||||
torch::Tensor& slot_mapping, // [num_tokens] or [num_actual_tokens]
|
||||
const std::string& kv_cache_dtype, torch::Tensor& scale) {
|
||||
// NOTE(woosuk): In vLLM V1, key.size(0) can be different from
|
||||
// slot_mapping.size(0) because of padding for CUDA graphs.
|
||||
// In vLLM V0, key.size(0) is always equal to slot_mapping.size(0) because
|
||||
// both include padding.
|
||||
// In vLLM V1, however, key.size(0) can be larger than slot_mapping.size(0)
|
||||
// since key includes padding for CUDA graphs, while slot_mapping does not.
|
||||
// In this case, slot_mapping.size(0) represents the actual number of tokens
|
||||
// before padding.
|
||||
// For compatibility with both cases, we use slot_mapping.size(0) as the
|
||||
// number of tokens.
|
||||
int num_tokens = slot_mapping.size(0);
|
||||
int kv_lora_rank = kv_c.size(1);
|
||||
int pe_dim = k_pe.size(1);
|
||||
int block_size = kv_cache.size(1);
|
||||
|
||||
TORCH_CHECK(kv_cache.size(2) == kv_lora_rank + pe_dim);
|
||||
|
||||
int kv_c_stride = kv_c.stride(0);
|
||||
int k_pe_stride = k_pe.stride(0);
|
||||
int block_stride = kv_cache.stride(0);
|
||||
int entry_stride = kv_cache.stride(1);
|
||||
|
||||
dim3 grid(num_tokens);
|
||||
dim3 block(std::min(kv_lora_rank, 512));
|
||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(kv_c));
|
||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
|
||||
DISPATCH_BY_KV_CACHE_DTYPE(kv_c.dtype(), kv_cache_dtype,
|
||||
CALL_CP_FUSED_CONCAT_AND_CACHE_MLA);
|
||||
}
|
||||
|
||||
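To make the fusion described in the note above concrete, here is a rough PyTorch-level equivalent of the three calls the kernel replaces (a sketch with assumed shapes, ignoring the fp8 cache-dtype path; it is not the actual vLLM binding):

    import torch

    def cp_fused_concat_and_cache_mla_ref(kv_c, k_pe, token_select_indices, kv_cache, slot_mapping):
        # kv_c: [num_total_tokens, kv_lora_rank], k_pe: [num_total_tokens, pe_dim]
        # kv_cache: [num_blocks, block_size, kv_lora_rank + pe_dim], slot_mapping: [num_tokens]
        block_size = kv_cache.size(1)
        kv_c_local = kv_c.index_select(0, token_select_indices)   # first fused gather
        k_pe_local = k_pe.index_select(0, token_select_indices)   # second fused gather
        entry = torch.cat([kv_c_local, k_pe_local], dim=-1)       # concat_and_cache_mla layout
        valid = slot_mapping >= 0                                  # slot -1 marks padded tokens
        slots = slot_mapping[valid]
        kv_cache[slots // block_size, slots % block_size] = entry[valid].to(kv_cache.dtype)
        return kv_cache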
namespace vllm {
|
||||
|
||||
template <typename Tout, typename Tin, Fp8KVCacheDataType kv_dt>
|
||||
|
||||
@ -145,7 +145,8 @@ void dynamic_scaled_int8_quant_impl(const scalar_t* input, int8_t* output,
|
||||
}
|
||||
}
|
||||
|
||||
float scale_val, azp_val;
|
||||
float scale_val;
|
||||
float azp_val = 0.0f;
|
||||
if constexpr (AZP) {
|
||||
float max_scalar = max_value.reduce_max();
|
||||
float min_scalar = min_value.reduce_min();
|
||||
|
||||
@ -52,15 +52,6 @@
|
||||
#define VLLM_DISPATCH_FP8_TYPES(TYPE, NAME, ...) \
|
||||
AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FP8_TYPES(__VA_ARGS__))
|
||||
|
||||
#define AT_DISPATCH_BYTE_CASE(enum_type, ...) \
|
||||
AT_PRIVATE_CASE_TYPE_USING_HINT(enum_type, byte_t, __VA_ARGS__)
|
||||
|
||||
#define VLLM_DISPATCH_CASE_BYTE_TYPES(...) \
|
||||
AT_DISPATCH_BYTE_CASE(at::ScalarType::Byte, __VA_ARGS__)
|
||||
|
||||
#define VLLM_DISPATCH_BYTE_TYPES(TYPE, NAME, ...) \
|
||||
AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_BYTE_TYPES(__VA_ARGS__))
|
||||
|
||||
#define VLLM_DISPATCH_QUANT_TYPES(TYPE, NAME, ...) \
|
||||
AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_QUANT_TYPES(__VA_ARGS__))
|
||||
|
||||
|
||||
@ -130,8 +130,7 @@ void silu_and_mul(torch::Tensor& out, torch::Tensor& input);
|
||||
void silu_and_mul_quant(torch::Tensor& out, torch::Tensor& input,
|
||||
torch::Tensor& scale);
|
||||
|
||||
#if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \
|
||||
(defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120)
|
||||
#ifndef USE_ROCM
|
||||
void silu_and_mul_nvfp4_quant(torch::Tensor& out,
|
||||
torch::Tensor& output_block_scale,
|
||||
torch::Tensor& input,
|
||||
|
||||
@ -26,164 +26,17 @@
|
||||
#include "dispatch_utils.h"
|
||||
|
||||
#include "cuda_utils.h"
|
||||
#include "nvfp4_utils.cuh"
|
||||
|
||||
namespace vllm {
|
||||
|
||||
// Get type2 from type or vice versa (applied to half and bfloat16)
|
||||
template <typename T>
|
||||
struct TypeConverter {
|
||||
using Type = half2;
|
||||
}; // keep for generality
|
||||
|
||||
template <>
|
||||
struct TypeConverter<half2> {
|
||||
using Type = c10::Half;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct TypeConverter<c10::Half> {
|
||||
using Type = half2;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct TypeConverter<__nv_bfloat162> {
|
||||
using Type = c10::BFloat16;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct TypeConverter<c10::BFloat16> {
|
||||
using Type = __nv_bfloat162;
|
||||
};
|
||||
|
||||
#define ELTS_PER_THREAD 8
|
||||
|
||||
constexpr int CVT_FP4_ELTS_PER_THREAD = 8;
|
||||
constexpr int CVT_FP4_SF_VEC_SIZE = 16;
|
||||
|
||||
// Convert 8 float32 values into 8 e2m1 values (represented as one uint32_t).
|
||||
inline __device__ uint32_t fp32_vec_to_e2m1(float (&array)[8]) {
|
||||
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
|
||||
uint32_t val;
|
||||
asm volatile(
|
||||
"{\n"
|
||||
".reg .b8 byte0;\n"
|
||||
".reg .b8 byte1;\n"
|
||||
".reg .b8 byte2;\n"
|
||||
".reg .b8 byte3;\n"
|
||||
"cvt.rn.satfinite.e2m1x2.f32 byte0, %2, %1;\n"
|
||||
"cvt.rn.satfinite.e2m1x2.f32 byte1, %4, %3;\n"
|
||||
"cvt.rn.satfinite.e2m1x2.f32 byte2, %6, %5;\n"
|
||||
"cvt.rn.satfinite.e2m1x2.f32 byte3, %8, %7;\n"
|
||||
"mov.b32 %0, {byte0, byte1, byte2, byte3};\n"
|
||||
"}"
|
||||
: "=r"(val)
|
||||
: "f"(array[0]), "f"(array[1]), "f"(array[2]), "f"(array[3]),
|
||||
"f"(array[4]), "f"(array[5]), "f"(array[6]), "f"(array[7]));
|
||||
return val;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
// Convert 4 float2 values into 8 e2m1 values (represented as one uint32_t).
|
||||
inline __device__ uint32_t fp32_vec_to_e2m1(float2 (&array)[4]) {
|
||||
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
|
||||
uint32_t val;
|
||||
asm volatile(
|
||||
"{\n"
|
||||
".reg .b8 byte0;\n"
|
||||
".reg .b8 byte1;\n"
|
||||
".reg .b8 byte2;\n"
|
||||
".reg .b8 byte3;\n"
|
||||
"cvt.rn.satfinite.e2m1x2.f32 byte0, %2, %1;\n"
|
||||
"cvt.rn.satfinite.e2m1x2.f32 byte1, %4, %3;\n"
|
||||
"cvt.rn.satfinite.e2m1x2.f32 byte2, %6, %5;\n"
|
||||
"cvt.rn.satfinite.e2m1x2.f32 byte3, %8, %7;\n"
|
||||
"mov.b32 %0, {byte0, byte1, byte2, byte3};\n"
|
||||
"}"
|
||||
: "=r"(val)
|
||||
: "f"(array[0].x), "f"(array[0].y), "f"(array[1].x), "f"(array[1].y),
|
||||
"f"(array[2].x), "f"(array[2].y), "f"(array[3].x), "f"(array[3].y));
|
||||
return val;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
// Fast reciprocal.
|
||||
inline __device__ float reciprocal_approximate_ftz(float a) {
|
||||
float b;
|
||||
asm volatile("rcp.approx.ftz.f32 %0, %1;\n" : "=f"(b) : "f"(a));
|
||||
return b;
|
||||
}
|
||||
|
||||
template <class SFType, int CVT_FP4_NUM_THREADS_PER_SF>
|
||||
__device__ uint8_t* cvt_quant_to_fp4_get_sf_out_offset(int rowIdx, int colIdx,
|
||||
int numCols,
|
||||
SFType* SFout) {
|
||||
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
|
||||
static_assert(CVT_FP4_NUM_THREADS_PER_SF == 1 ||
|
||||
CVT_FP4_NUM_THREADS_PER_SF == 2);
|
||||
|
||||
// One pair of threads write one SF to global memory.
|
||||
// TODO: stage through smem for packed STG.32
|
||||
// is it better than STG.8 from 4 threads ?
|
||||
if (threadIdx.x % CVT_FP4_NUM_THREADS_PER_SF == 0) {
|
||||
// SF vector index (16 elements share one SF in the K dimension).
|
||||
int32_t kIdx = colIdx / CVT_FP4_NUM_THREADS_PER_SF;
|
||||
int32_t mIdx = rowIdx;
|
||||
|
||||
// SF layout [numMTiles, numKTiles, 32 (mTile), 4 (mTile), 4(kTile)]
|
||||
// --> index [mTileIdx, kTileIdx, outerMIdx, innerMIdx, innerKIdx]
|
||||
|
||||
int32_t mTileIdx = mIdx / (32 * 4);
|
||||
// SF vector size 16.
|
||||
int factor = CVT_FP4_SF_VEC_SIZE * 4;
|
||||
int32_t numKTiles = (numCols + factor - 1) / factor;
|
||||
int64_t mTileStride = numKTiles * 32 * 4 * 4;
|
||||
|
||||
int32_t kTileIdx = (kIdx / 4);
|
||||
int64_t kTileStride = 32 * 4 * 4;
|
||||
|
||||
// M tile layout [32, 4] is column-major.
|
||||
int32_t outerMIdx = (mIdx % 32);
|
||||
int64_t outerMStride = 4 * 4;
|
||||
|
||||
int32_t innerMIdx = (mIdx % (32 * 4)) / 32;
|
||||
int64_t innerMStride = 4;
|
||||
|
||||
int32_t innerKIdx = (kIdx % 4);
|
||||
int64_t innerKStride = 1;
|
||||
|
||||
// Compute the global offset.
|
||||
int64_t SFOffset = mTileIdx * mTileStride + kTileIdx * kTileStride +
|
||||
outerMIdx * outerMStride + innerMIdx * innerMStride +
|
||||
innerKIdx * innerKStride;
|
||||
|
||||
return reinterpret_cast<uint8_t*>(SFout) + SFOffset;
|
||||
}
|
||||
#endif
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// Define a 16 bytes packed data type.
|
||||
template <class Type>
|
||||
struct PackedVec {
|
||||
typename TypeConverter<Type>::Type elts[4];
|
||||
};
|
||||
|
||||
template <>
|
||||
struct PackedVec<__nv_fp8_e4m3> {
|
||||
__nv_fp8x2_e4m3 elts[8];
|
||||
};
|
||||
|
||||
template <class Type>
|
||||
__inline__ __device__ PackedVec<Type> compute_silu(PackedVec<Type>& vec,
|
||||
PackedVec<Type>& vec2) {
|
||||
PackedVec<Type> result;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < CVT_FP4_ELTS_PER_THREAD / 2; ++i) {
|
||||
if constexpr (std::is_same_v<Type, c10::Half>) {
|
||||
if constexpr (std::is_same_v<Type, half>) {
|
||||
half2 val(0.5f, 0.5f);
|
||||
half2 t0 = __hmul2(vec.elts[i], val);
|
||||
half2 t1 = __hfma2(h2tanh(t0), val, val);
|
||||
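The packed arithmetic in `compute_silu` above leans on the identity silu(x) = x · sigmoid(x) with sigmoid(x) = 0.5 · (1 + tanh(x / 2)): `t0` is x/2 and `t1` is the tanh-based sigmoid. A quick standalone check of that identity (not part of the diff):

    import torch

    x = torch.randn(1024)
    sigmoid_via_tanh = 0.5 * torch.tanh(0.5 * x) + 0.5   # what __hfma2(h2tanh(t0), val, val) computes
    assert torch.allclose(torch.nn.functional.silu(x), x * sigmoid_via_tanh, atol=1e-6)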
@ -206,13 +59,12 @@ __device__ uint32_t silu_and_cvt_warp_fp16_to_fp4(PackedVec<Type>& vec,
|
||||
PackedVec<Type>& vec2,
|
||||
float SFScaleVal,
|
||||
uint8_t* SFout) {
|
||||
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
|
||||
PackedVec<Type> out_silu = compute_silu(vec, vec2);
|
||||
// Get absolute maximum values among the local 8 values.
|
||||
auto localMax = __habs2(out_silu.elts[0]);
|
||||
|
||||
// Local maximum value.
|
||||
#pragma unroll
|
||||
// Local maximum value.
|
||||
#pragma unroll
|
||||
for (int i = 1; i < CVT_FP4_ELTS_PER_THREAD / 2; i++) {
|
||||
localMax = __hmax2(localMax, __habs2(out_silu.elts[i]));
|
||||
}
|
||||
@ -259,9 +111,9 @@ __device__ uint32_t silu_and_cvt_warp_fp16_to_fp4(PackedVec<Type>& vec,
|
||||
// Convert the input to float.
|
||||
float2 fp2Vals[CVT_FP4_ELTS_PER_THREAD / 2];
|
||||
|
||||
#pragma unroll
|
||||
#pragma unroll
|
||||
for (int i = 0; i < CVT_FP4_ELTS_PER_THREAD / 2; i++) {
|
||||
if constexpr (std::is_same_v<Type, c10::Half>) {
|
||||
if constexpr (std::is_same_v<Type, half>) {
|
||||
fp2Vals[i] = __half22float2(out_silu.elts[i]);
|
||||
} else {
|
||||
fp2Vals[i] = __bfloat1622float2(out_silu.elts[i]);
|
||||
@ -275,22 +127,14 @@ __device__ uint32_t silu_and_cvt_warp_fp16_to_fp4(PackedVec<Type>& vec,
|
||||
|
||||
// Write the e2m1 values to global memory.
|
||||
return e2m1Vec;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
// Use UE4M3 by default.
|
||||
template <class Type, bool UE8M0_SF = false>
|
||||
__global__ void
|
||||
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
|
||||
__launch_bounds__(1024, 4) silu_and_cvt_fp16_to_fp4(
|
||||
#else
|
||||
silu_and_cvt_fp16_to_fp4(
|
||||
#endif
|
||||
int32_t numRows, int32_t numCols, Type const* in, float const* SFScale,
|
||||
uint32_t* out, uint32_t* SFout) {
|
||||
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
|
||||
__global__ void __launch_bounds__(1024, 4)
|
||||
silu_and_cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in,
|
||||
float const* SFScale, uint32_t* out,
|
||||
uint32_t* SFout) {
|
||||
using PackedVec = PackedVec<Type>;
|
||||
static constexpr int CVT_FP4_NUM_THREADS_PER_SF =
|
||||
(CVT_FP4_SF_VEC_SIZE / CVT_FP4_ELTS_PER_THREAD);
|
||||
@ -328,22 +172,25 @@ silu_and_cvt_fp16_to_fp4(
|
||||
in_vec, in_vec2, SFScaleVal, sf_out);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
} // namespace vllm
|
||||
|
||||
void silu_and_mul_nvfp4_quant(torch::Tensor& output, // [..., d]
|
||||
torch::Tensor& output_sf,
|
||||
torch::Tensor& input, // [..., 2 * d]
|
||||
torch::Tensor& input_sf) {
|
||||
TORCH_CHECK(input.dtype() == torch::kFloat16 ||
|
||||
input.dtype() == torch::kBFloat16);
|
||||
void silu_and_mul_nvfp4_quant_sm1xxa(torch::Tensor& output, // [..., d]
|
||||
torch::Tensor& output_sf,
|
||||
torch::Tensor& input, // [..., 2 * d]
|
||||
torch::Tensor& input_sf) {
|
||||
int32_t m = input.size(0);
|
||||
int32_t n = input.size(1) / 2;
|
||||
|
||||
TORCH_CHECK(n % 16 == 0, "The N dimension must be multiple of 16.");
|
||||
TORCH_CHECK(input.scalar_type() == at::ScalarType::Half ||
|
||||
input.scalar_type() == at::ScalarType::BFloat16,
|
||||
"Unsupported input data type for quantize_to_fp4.");
|
||||
|
||||
int multiProcessorCount =
|
||||
get_device_attribute(cudaDevAttrMultiProcessorCount, -1);
|
||||
|
||||
auto input_sf_ptr = static_cast<float const*>(input_sf.data_ptr());
|
||||
auto sf_out = static_cast<int32_t*>(output_sf.data_ptr());
|
||||
auto output_ptr = static_cast<int64_t*>(output.data_ptr());
|
||||
@ -352,17 +199,14 @@ void silu_and_mul_nvfp4_quant(torch::Tensor& output, // [..., d]
|
||||
dim3 block(std::min(int(n / ELTS_PER_THREAD), 1024));
|
||||
int const numBlocksPerSM = 2048 / block.x;
|
||||
dim3 grid(std::min(int(m), multiProcessorCount * numBlocksPerSM));
|
||||
|
||||
VLLM_DISPATCH_HALF_TYPES(
|
||||
input.scalar_type(), "act_and_mul_quant_kernel", [&] {
|
||||
auto input_ptr = reinterpret_cast<scalar_t const*>(input.data_ptr());
|
||||
VLLM_DISPATCH_BYTE_TYPES(
|
||||
output.scalar_type(), "fused_act_and_mul_quant_kernel_nvfp4_type",
|
||||
[&] {
|
||||
vllm::silu_and_cvt_fp16_to_fp4<scalar_t>
|
||||
<<<grid, block, 0, stream>>>(
|
||||
m, n, input_ptr, input_sf_ptr,
|
||||
reinterpret_cast<uint32_t*>(output_ptr),
|
||||
reinterpret_cast<uint32_t*>(sf_out));
|
||||
});
|
||||
input.scalar_type(), "silu_and_mul_nvfp4_quant_kernel", [&] {
|
||||
using cuda_type = vllm::CUDATypeConverter<scalar_t>::Type;
|
||||
auto input_ptr = static_cast<cuda_type const*>(input.data_ptr());
|
||||
vllm::silu_and_cvt_fp16_to_fp4<cuda_type><<<grid, block, 0, stream>>>(
|
||||
m, n, input_ptr, input_sf_ptr,
|
||||
reinterpret_cast<uint32_t*>(output_ptr),
|
||||
reinterpret_cast<uint32_t*>(sf_out));
|
||||
});
|
||||
}
|
||||
|
||||
@ -1,3 +1,19 @@
|
||||
/*
|
||||
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <torch/all.h>
|
||||
#include <cutlass/arch/arch.h>
|
||||
|
||||
|
||||
@ -1,247 +1,42 @@
|
||||
/*
|
||||
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <torch/all.h>
|
||||
|
||||
#include <cuda_runtime_api.h>
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
#include <ATen/cuda/CUDAContext.h>
|
||||
#include <c10/cuda/CUDAGuard.h>
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
#include <cuda_fp8.h>
|
||||
#include "dispatch_utils.h"
|
||||
|
||||
template <typename T>
|
||||
struct TypeConverter {
|
||||
using Type = half2;
|
||||
}; // keep for generality
|
||||
#include "nvfp4_utils.cuh"
|
||||
|
||||
template <>
|
||||
struct TypeConverter<half2> {
|
||||
using Type = half;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct TypeConverter<half> {
|
||||
using Type = half2;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct TypeConverter<__nv_bfloat162> {
|
||||
using Type = __nv_bfloat16;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct TypeConverter<__nv_bfloat16> {
|
||||
using Type = __nv_bfloat162;
|
||||
};
|
||||
|
||||
#define ELTS_PER_THREAD 8
|
||||
|
||||
constexpr int CVT_FP4_ELTS_PER_THREAD = 8;
|
||||
constexpr int CVT_FP4_SF_VEC_SIZE = 16;
|
||||
|
||||
// Convert 8 float32 values into 8 e2m1 values (represented as one uint32_t).
|
||||
inline __device__ uint32_t fp32_vec_to_e2m1(float (&array)[8]) {
|
||||
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
|
||||
uint32_t val;
|
||||
asm volatile(
|
||||
"{\n"
|
||||
".reg .b8 byte0;\n"
|
||||
".reg .b8 byte1;\n"
|
||||
".reg .b8 byte2;\n"
|
||||
".reg .b8 byte3;\n"
|
||||
"cvt.rn.satfinite.e2m1x2.f32 byte0, %2, %1;\n"
|
||||
"cvt.rn.satfinite.e2m1x2.f32 byte1, %4, %3;\n"
|
||||
"cvt.rn.satfinite.e2m1x2.f32 byte2, %6, %5;\n"
|
||||
"cvt.rn.satfinite.e2m1x2.f32 byte3, %8, %7;\n"
|
||||
"mov.b32 %0, {byte0, byte1, byte2, byte3};\n"
|
||||
"}"
|
||||
: "=r"(val)
|
||||
: "f"(array[0]), "f"(array[1]), "f"(array[2]), "f"(array[3]),
|
||||
"f"(array[4]), "f"(array[5]), "f"(array[6]), "f"(array[7]));
|
||||
return val;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
// Convert 4 float2 values into 8 e2m1 values (represented as one uint32_t).
|
||||
inline __device__ uint32_t fp32_vec_to_e2m1(float2 (&array)[4]) {
|
||||
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
|
||||
uint32_t val;
|
||||
asm volatile(
|
||||
"{\n"
|
||||
".reg .b8 byte0;\n"
|
||||
".reg .b8 byte1;\n"
|
||||
".reg .b8 byte2;\n"
|
||||
".reg .b8 byte3;\n"
|
||||
"cvt.rn.satfinite.e2m1x2.f32 byte0, %2, %1;\n"
|
||||
"cvt.rn.satfinite.e2m1x2.f32 byte1, %4, %3;\n"
|
||||
"cvt.rn.satfinite.e2m1x2.f32 byte2, %6, %5;\n"
|
||||
"cvt.rn.satfinite.e2m1x2.f32 byte3, %8, %7;\n"
|
||||
"mov.b32 %0, {byte0, byte1, byte2, byte3};\n"
|
||||
"}"
|
||||
: "=r"(val)
|
||||
: "f"(array[0].x), "f"(array[0].y), "f"(array[1].x), "f"(array[1].y),
|
||||
"f"(array[2].x), "f"(array[2].y), "f"(array[3].x), "f"(array[3].y));
|
||||
return val;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
// Fast reciprocal.
|
||||
inline __device__ float reciprocal_approximate_ftz(float a) {
|
||||
float b;
|
||||
asm volatile("rcp.approx.ftz.f32 %0, %1;\n" : "=f"(b) : "f"(a));
|
||||
return b;
|
||||
}
|
||||
|
||||
template <class SFType, int CVT_FP4_NUM_THREADS_PER_SF>
|
||||
__device__ uint8_t* cvt_quant_to_fp4_get_sf_out_offset(int rowIdx, int colIdx,
|
||||
int numCols,
|
||||
SFType* SFout) {
|
||||
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
|
||||
static_assert(CVT_FP4_NUM_THREADS_PER_SF == 1 ||
|
||||
CVT_FP4_NUM_THREADS_PER_SF == 2);
|
||||
|
||||
// One pair of threads write one SF to global memory.
|
||||
// TODO: stage through smem for packed STG.32
|
||||
// is it better than STG.8 from 4 threads ?
|
||||
if (threadIdx.x % CVT_FP4_NUM_THREADS_PER_SF == 0) {
|
||||
// SF vector index (16 elements share one SF in the K dimension).
|
||||
int32_t kIdx = colIdx / CVT_FP4_NUM_THREADS_PER_SF;
|
||||
int32_t mIdx = rowIdx;
|
||||
|
||||
// SF layout [numMTiles, numKTiles, 32 (mTile), 4 (mTile), 4(kTile)]
|
||||
// --> index [mTileIdx, kTileIdx, outerMIdx, innerMIdx, innerKIdx]
|
||||
|
||||
int32_t mTileIdx = mIdx / (32 * 4);
|
||||
// SF vector size 16.
|
||||
int factor = CVT_FP4_SF_VEC_SIZE * 4;
|
||||
int32_t numKTiles = (numCols + factor - 1) / factor;
|
||||
int64_t mTileStride = numKTiles * 32 * 4 * 4;
|
||||
|
||||
int32_t kTileIdx = (kIdx / 4);
|
||||
int64_t kTileStride = 32 * 4 * 4;
|
||||
|
||||
// M tile layout [32, 4] is column-major.
|
||||
int32_t outerMIdx = (mIdx % 32);
|
||||
int64_t outerMStride = 4 * 4;
|
||||
|
||||
int32_t innerMIdx = (mIdx % (32 * 4)) / 32;
|
||||
int64_t innerMStride = 4;
|
||||
|
||||
int32_t innerKIdx = (kIdx % 4);
|
||||
int64_t innerKStride = 1;
|
||||
|
||||
// Compute the global offset.
|
||||
int64_t SFOffset = mTileIdx * mTileStride + kTileIdx * kTileStride +
|
||||
outerMIdx * outerMStride + innerMIdx * innerMStride +
|
||||
innerKIdx * innerKStride;
|
||||
|
||||
return reinterpret_cast<uint8_t*>(SFout) + SFOffset;
|
||||
}
|
||||
#endif
|
||||
return nullptr;
|
||||
}
|
||||
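In plain Python, the swizzled scale-factor placement computed above looks like this (a sketch mirroring the index arithmetic in `cvt_quant_to_fp4_get_sf_out_offset`, where `m_idx` is the output row and `k_idx` the index of the 16-element SF vector along K):

    def sf_out_offset(m_idx: int, k_idx: int, num_cols: int) -> int:
        # SF layout [numMTiles, numKTiles, 32 (outer M), 4 (inner M), 4 (inner K)], 16-wide SF vectors
        sf_vec_size = 16
        num_k_tiles = (num_cols + sf_vec_size * 4 - 1) // (sf_vec_size * 4)
        m_tile, k_tile = m_idx // (32 * 4), k_idx // 4
        outer_m, inner_m, inner_k = m_idx % 32, (m_idx % (32 * 4)) // 32, k_idx % 4
        return (m_tile * num_k_tiles * 32 * 4 * 4 + k_tile * 32 * 4 * 4
                + outer_m * 4 * 4 + inner_m * 4 + inner_k)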
|
||||
// Define a 16 bytes packed data type.
|
||||
template <class Type>
|
||||
struct PackedVec {
|
||||
typename TypeConverter<Type>::Type elts[4];
|
||||
};
|
||||
|
||||
template <>
|
||||
struct PackedVec<__nv_fp8_e4m3> {
|
||||
__nv_fp8x2_e4m3 elts[8];
|
||||
};
|
||||
|
||||
// Quantizes the provided PackedVec into the uint32_t output
|
||||
template <class Type, bool UE8M0_SF = false>
|
||||
__device__ uint32_t cvt_warp_fp16_to_fp4(PackedVec<Type>& vec, float SFScaleVal,
|
||||
uint8_t* SFout) {
|
||||
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
|
||||
// Get absolute maximum values among the local 8 values.
|
||||
auto localMax = __habs2(vec.elts[0]);
|
||||
|
||||
// Local maximum value.
|
||||
#pragma unroll
|
||||
for (int i = 1; i < CVT_FP4_ELTS_PER_THREAD / 2; i++) {
|
||||
localMax = __hmax2(localMax, __habs2(vec.elts[i]));
|
||||
}
|
||||
|
||||
// Get the absolute maximum among all 16 values (two threads).
|
||||
localMax = __hmax2(__shfl_xor_sync(uint32_t(-1), localMax, 1), localMax);
|
||||
// Get the final absolute maximum values.
|
||||
float vecMax = float(__hmax(localMax.x, localMax.y));
|
||||
|
||||
// Get the SF (max value of the vector / max value of e2m1).
|
||||
// maximum value of e2m1 = 6.0.
|
||||
// TODO: use half as compute data type.
|
||||
float SFValue = SFScaleVal * (vecMax * reciprocal_approximate_ftz(6.0f));
|
||||
// 8 bits representation of the SF.
|
||||
uint8_t fp8SFVal;
|
||||
// Write the SF to global memory (STG.8).
|
||||
if constexpr (UE8M0_SF) {
|
||||
// Extract the 8 exponent bits from float32.
|
||||
// float 32bits = 1 sign bit + 8 exponent bits + 23 mantissa bits.
|
||||
uint32_t tmp = reinterpret_cast<uint32_t&>(SFValue) >> 23;
|
||||
fp8SFVal = tmp & 0xff;
|
||||
// Convert back to fp32.
|
||||
reinterpret_cast<uint32_t&>(SFValue) = tmp << 23;
|
||||
} else {
|
||||
// Here SFValue is always positive, so E4M3 is the same as UE4M3.
|
||||
__nv_fp8_e4m3 tmp = __nv_fp8_e4m3(SFValue);
|
||||
reinterpret_cast<__nv_fp8_e4m3&>(fp8SFVal) = tmp;
|
||||
// Convert back to fp32.
|
||||
SFValue = float(tmp);
|
||||
}
|
||||
// Get the output scale.
|
||||
// Recipe: final_scale = reciprocal(fp32(fp8(SFValue * SFScaleVal))) *
|
||||
// reciprocal(SFScaleVal))
|
||||
float outputScale =
|
||||
SFValue != 0 ? reciprocal_approximate_ftz(
|
||||
SFValue * reciprocal_approximate_ftz(SFScaleVal))
|
||||
: 0.0f;
|
||||
|
||||
if (SFout) {
|
||||
// Write the SF to global memory (STG.8).
|
||||
*SFout = fp8SFVal;
|
||||
}
|
||||
|
||||
// Convert the input to float.
|
||||
float2 fp2Vals[CVT_FP4_ELTS_PER_THREAD / 2];
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 0; i < CVT_FP4_ELTS_PER_THREAD / 2; i++) {
|
||||
if constexpr (std::is_same_v<Type, half>) {
|
||||
fp2Vals[i] = __half22float2(vec.elts[i]);
|
||||
} else {
|
||||
fp2Vals[i] = __bfloat1622float2(vec.elts[i]);
|
||||
}
|
||||
fp2Vals[i].x *= outputScale;
|
||||
fp2Vals[i].y *= outputScale;
|
||||
}
|
||||
|
||||
// Convert to e2m1 values.
|
||||
uint32_t e2m1Vec = fp32_vec_to_e2m1(fp2Vals);
|
||||
|
||||
// Write the e2m1 values to global memory.
|
||||
return e2m1Vec;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
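Outside CUDA, the scale-factor recipe in `cvt_warp_fp16_to_fp4` boils down to the following sketch (assumes the default UE4M3 path and PyTorch's `float8_e4m3fn`; the final round to e2m1, whose representable magnitudes are 0, 0.5, 1, 1.5, 2, 3, 4 and 6, is omitted):

    import torch

    def nvfp4_block_scale(block16: torch.Tensor, sf_scale: float):
        # per 16-element block: SF = global_scale * max|x| / 6.0 (6.0 = max e2m1 magnitude)
        vec_max = block16.abs().max().float()
        sf = sf_scale * vec_max / 6.0
        sf_fp8 = sf.to(torch.float8_e4m3fn).float()        # the 8-bit scale factor written to SFout
        # output scale = reciprocal(SF_fp8 * reciprocal(global_scale)) = global_scale / SF_fp8
        out_scale = sf_scale / sf_fp8 if sf_fp8 != 0 else torch.zeros(())
        return sf_fp8, block16.float() * out_scale         # scaled values are then rounded to e2m1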
namespace vllm {
|
||||
|
||||
// Use UE4M3 by default.
|
||||
template <class Type, bool UE8M0_SF = false, bool SMALL_NUM_EXPERTS = false>
|
||||
__global__ void
|
||||
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
|
||||
__launch_bounds__(512, 4) cvt_fp16_to_fp4(
|
||||
#else
|
||||
cvt_fp16_to_fp4(
|
||||
#endif
|
||||
int32_t numRows, int32_t numCols, Type const* in, float const* SFScale,
|
||||
uint32_t* out, uint32_t* SFout, uint32_t* input_offset_by_experts,
|
||||
uint32_t* output_scale_offset_by_experts, int n_experts, bool low_latency) {
|
||||
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
|
||||
__global__ void __launch_bounds__(512, 4)
|
||||
cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in,
|
||||
float const* SFScale, uint32_t* out, uint32_t* SFout,
|
||||
uint32_t* input_offset_by_experts,
|
||||
uint32_t* output_scale_offset_by_experts, int n_experts,
|
||||
bool low_latency) {
|
||||
using PackedVec = PackedVec<Type>;
|
||||
static constexpr int CVT_FP4_NUM_THREADS_PER_SF =
|
||||
(CVT_FP4_SF_VEC_SIZE / CVT_FP4_ELTS_PER_THREAD);
|
||||
@ -299,8 +94,8 @@ cvt_fp16_to_fp4(
|
||||
&input_offset_by_experts[chunk_start + 12]));
|
||||
local_offsets[16] = __ldca(&input_offset_by_experts[chunk_start + 16]);
|
||||
|
||||
// Check against the 16 loaded offsets
|
||||
#pragma unroll
|
||||
// Check against the 16 loaded offsets
|
||||
#pragma unroll
|
||||
for (int i = 0; i < 16; i++) {
|
||||
if (rowIdx >= local_offsets[i] && rowIdx < local_offsets[i + 1]) {
|
||||
rowIdx_in_expert = rowIdx - local_offsets[i];
|
||||
@ -330,21 +125,15 @@ cvt_fp16_to_fp4(
|
||||
|
||||
out_pos = cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(in_vec, SFScaleVal, sf_out);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
// Kernel for LARGE_M_TOPK = true (large m_topk optimized version)
|
||||
template <class Type, bool UE8M0_SF = false, bool SMALL_NUM_EXPERTS = false>
|
||||
__global__ void
|
||||
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
|
||||
__launch_bounds__(1024, 4) cvt_fp16_to_fp4(
|
||||
#else
|
||||
cvt_fp16_to_fp4(
|
||||
#endif
|
||||
int32_t numRows, int32_t numCols, Type const* in, float const* SFScale,
|
||||
uint32_t* out, uint32_t* SFout, uint32_t* input_offset_by_experts,
|
||||
uint32_t* output_scale_offset_by_experts, int n_experts) {
|
||||
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
|
||||
__global__ void __launch_bounds__(1024, 4)
|
||||
cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in,
|
||||
float const* SFScale, uint32_t* out, uint32_t* SFout,
|
||||
uint32_t* input_offset_by_experts,
|
||||
uint32_t* output_scale_offset_by_experts, int n_experts) {
|
||||
using PackedVec = PackedVec<Type>;
|
||||
static constexpr int CVT_FP4_NUM_THREADS_PER_SF =
|
||||
(CVT_FP4_SF_VEC_SIZE / CVT_FP4_ELTS_PER_THREAD);
|
||||
@ -425,7 +214,6 @@ cvt_fp16_to_fp4(
|
||||
|
||||
out_pos = cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(in_vec, SFScaleVal, sf_out);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
@ -501,6 +289,8 @@ void quant_impl(void* output, void* output_scale, void* input,
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace vllm
|
||||
|
||||
/*Quantization entry for fp4 experts quantization*/
|
||||
#define CHECK_TH_CUDA(x, m) TORCH_CHECK(x.is_cuda(), m, "must be a CUDA tensor")
|
||||
#define CHECK_CONTIGUOUS(x, m) \
|
||||
@ -560,23 +350,17 @@ void scaled_fp4_experts_quant_sm100a(
|
||||
// 4 means 4 fp8 values are packed into one int32
|
||||
TORCH_CHECK(output_scale.size(1) * 4 == padded_k);
|
||||
|
||||
auto in_dtype = input.dtype();
|
||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
|
||||
const cudaStream_t stream =
|
||||
at::cuda::getCurrentCUDAStream(input.get_device());
|
||||
if (in_dtype == at::ScalarType::Half) {
|
||||
quant_impl<half>(output.data_ptr(), output_scale.data_ptr(),
|
||||
input.data_ptr(), input_global_scale.data_ptr(),
|
||||
input_offset_by_experts.data_ptr(),
|
||||
output_scale_offset_by_experts.data_ptr(), m_topk, k,
|
||||
n_experts, stream);
|
||||
} else if (in_dtype == at::ScalarType::BFloat16) {
|
||||
quant_impl<__nv_bfloat16>(output.data_ptr(), output_scale.data_ptr(),
|
||||
input.data_ptr(), input_global_scale.data_ptr(),
|
||||
input_offset_by_experts.data_ptr(),
|
||||
output_scale_offset_by_experts.data_ptr(), m_topk,
|
||||
k, n_experts, stream);
|
||||
} else {
|
||||
TORCH_CHECK(false, "Expected input data type to be half or bfloat16");
|
||||
}
|
||||
|
||||
VLLM_DISPATCH_HALF_TYPES(
|
||||
input.scalar_type(), "nvfp4_experts_quant_kernel", [&] {
|
||||
using cuda_type = vllm::CUDATypeConverter<scalar_t>::Type;
|
||||
vllm::quant_impl<cuda_type>(
|
||||
output.data_ptr(), output_scale.data_ptr(), input.data_ptr(),
|
||||
input_global_scale.data_ptr(), input_offset_by_experts.data_ptr(),
|
||||
output_scale_offset_by_experts.data_ptr(), m_topk, k, n_experts,
|
||||
stream);
|
||||
});
|
||||
}
|
||||
|
||||
@ -32,6 +32,14 @@ void scaled_fp4_experts_quant_sm100a(
|
||||
torch::Tensor const& output_scale_offset_by_experts);
|
||||
#endif
|
||||
|
||||
#if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \
|
||||
(defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120)
|
||||
void silu_and_mul_nvfp4_quant_sm1xxa(torch::Tensor& output,
|
||||
torch::Tensor& output_sf,
|
||||
torch::Tensor& input,
|
||||
torch::Tensor& input_sf);
|
||||
#endif
|
||||
|
||||
void scaled_fp4_quant(torch::Tensor& output, torch::Tensor const& input,
|
||||
torch::Tensor& output_sf, torch::Tensor const& input_sf) {
|
||||
#if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \
|
||||
@ -54,3 +62,13 @@ void scaled_fp4_experts_quant(
|
||||
TORCH_CHECK_NOT_IMPLEMENTED(false,
|
||||
"No compiled nvfp4 experts quantization kernel");
|
||||
}
|
||||
|
||||
void silu_and_mul_nvfp4_quant(torch::Tensor& output, torch::Tensor& output_sf,
|
||||
torch::Tensor& input, torch::Tensor& input_sf) {
|
||||
#if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \
|
||||
(defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120)
|
||||
return silu_and_mul_nvfp4_quant_sm1xxa(output, output_sf, input, input_sf);
|
||||
#endif
|
||||
TORCH_CHECK_NOT_IMPLEMENTED(
|
||||
false, "No compiled silu_and_mul nvfp4 quantization kernel");
|
||||
}
|
||||
|
||||
@ -23,245 +23,18 @@
|
||||
#include <c10/cuda/CUDAGuard.h>
|
||||
|
||||
#include <cuda_fp8.h>
|
||||
#include "dispatch_utils.h"
|
||||
|
||||
#include "cuda_utils.h"
|
||||
#include "nvfp4_utils.cuh"
|
||||
|
||||
// Get type2 from type or vice versa (applied to half and bfloat16)
|
||||
template <typename T>
|
||||
struct TypeConverter {
|
||||
using Type = half2;
|
||||
}; // keep for generality
|
||||
|
||||
template <>
|
||||
struct TypeConverter<half2> {
|
||||
using Type = half;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct TypeConverter<half> {
|
||||
using Type = half2;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct TypeConverter<__nv_bfloat162> {
|
||||
using Type = __nv_bfloat16;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct TypeConverter<__nv_bfloat16> {
|
||||
using Type = __nv_bfloat162;
|
||||
};
|
||||
|
||||
#define ELTS_PER_THREAD 8
|
||||
|
||||
constexpr int CVT_FP4_ELTS_PER_THREAD = 8;
|
||||
constexpr int CVT_FP4_SF_VEC_SIZE = 16;
|
||||
|
||||
// Convert 8 float32 values into 8 e2m1 values (represented as one uint32_t).
|
||||
inline __device__ uint32_t fp32_vec_to_e2m1(float (&array)[8]) {
|
||||
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
|
||||
uint32_t val;
|
||||
asm volatile(
|
||||
"{\n"
|
||||
".reg .b8 byte0;\n"
|
||||
".reg .b8 byte1;\n"
|
||||
".reg .b8 byte2;\n"
|
||||
".reg .b8 byte3;\n"
|
||||
"cvt.rn.satfinite.e2m1x2.f32 byte0, %2, %1;\n"
|
||||
"cvt.rn.satfinite.e2m1x2.f32 byte1, %4, %3;\n"
|
||||
"cvt.rn.satfinite.e2m1x2.f32 byte2, %6, %5;\n"
|
||||
"cvt.rn.satfinite.e2m1x2.f32 byte3, %8, %7;\n"
|
||||
"mov.b32 %0, {byte0, byte1, byte2, byte3};\n"
|
||||
"}"
|
||||
: "=r"(val)
|
||||
: "f"(array[0]), "f"(array[1]), "f"(array[2]), "f"(array[3]),
|
||||
"f"(array[4]), "f"(array[5]), "f"(array[6]), "f"(array[7]));
|
||||
return val;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
// Convert 4 float2 values into 8 e2m1 values (represented as one uint32_t).
|
||||
inline __device__ uint32_t fp32_vec_to_e2m1(float2 (&array)[4]) {
|
||||
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
|
||||
uint32_t val;
|
||||
asm volatile(
|
||||
"{\n"
|
||||
".reg .b8 byte0;\n"
|
||||
".reg .b8 byte1;\n"
|
||||
".reg .b8 byte2;\n"
|
||||
".reg .b8 byte3;\n"
|
||||
"cvt.rn.satfinite.e2m1x2.f32 byte0, %2, %1;\n"
|
||||
"cvt.rn.satfinite.e2m1x2.f32 byte1, %4, %3;\n"
|
||||
"cvt.rn.satfinite.e2m1x2.f32 byte2, %6, %5;\n"
|
||||
"cvt.rn.satfinite.e2m1x2.f32 byte3, %8, %7;\n"
|
||||
"mov.b32 %0, {byte0, byte1, byte2, byte3};\n"
|
||||
"}"
|
||||
: "=r"(val)
|
||||
: "f"(array[0].x), "f"(array[0].y), "f"(array[1].x), "f"(array[1].y),
|
||||
"f"(array[2].x), "f"(array[2].y), "f"(array[3].x), "f"(array[3].y));
|
||||
return val;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
// Fast reciprocal.
|
||||
inline __device__ float reciprocal_approximate_ftz(float a) {
|
||||
float b;
|
||||
asm volatile("rcp.approx.ftz.f32 %0, %1;\n" : "=f"(b) : "f"(a));
|
||||
return b;
|
||||
}
|
||||
|
||||
template <class SFType, int CVT_FP4_NUM_THREADS_PER_SF>
|
||||
__device__ uint8_t* cvt_quant_to_fp4_get_sf_out_offset(int rowIdx, int colIdx,
|
||||
int numCols,
|
||||
SFType* SFout) {
|
||||
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
|
||||
static_assert(CVT_FP4_NUM_THREADS_PER_SF == 1 ||
|
||||
CVT_FP4_NUM_THREADS_PER_SF == 2);
|
||||
|
||||
// One pair of threads write one SF to global memory.
|
||||
// TODO: stage through smem for packed STG.32
|
||||
// is it better than STG.8 from 4 threads ?
|
||||
if (threadIdx.x % CVT_FP4_NUM_THREADS_PER_SF == 0) {
|
||||
// SF vector index (16 elements share one SF in the K dimension).
|
||||
int32_t kIdx = colIdx / CVT_FP4_NUM_THREADS_PER_SF;
|
||||
int32_t mIdx = rowIdx;
|
||||
|
||||
// SF layout [numMTiles, numKTiles, 32 (mTile), 4 (mTile), 4(kTile)]
|
||||
// --> index [mTileIdx, kTileIdx, outerMIdx, innerMIdx, innerKIdx]
|
||||
|
||||
int32_t mTileIdx = mIdx / (32 * 4);
|
||||
// SF vector size 16.
|
||||
int factor = CVT_FP4_SF_VEC_SIZE * 4;
|
||||
int32_t numKTiles = (numCols + factor - 1) / factor;
|
||||
int64_t mTileStride = numKTiles * 32 * 4 * 4;
|
||||
|
||||
int32_t kTileIdx = (kIdx / 4);
|
||||
int64_t kTileStride = 32 * 4 * 4;
|
||||
|
||||
// M tile layout [32, 4] is column-major.
|
||||
int32_t outerMIdx = (mIdx % 32);
|
||||
int64_t outerMStride = 4 * 4;
|
||||
|
||||
int32_t innerMIdx = (mIdx % (32 * 4)) / 32;
|
||||
int64_t innerMStride = 4;
|
||||
|
||||
int32_t innerKIdx = (kIdx % 4);
|
||||
int64_t innerKStride = 1;
|
||||
|
||||
// Compute the global offset.
|
||||
int64_t SFOffset = mTileIdx * mTileStride + kTileIdx * kTileStride +
|
||||
outerMIdx * outerMStride + innerMIdx * innerMStride +
|
||||
innerKIdx * innerKStride;
|
||||
|
||||
return reinterpret_cast<uint8_t*>(SFout) + SFOffset;
|
||||
}
|
||||
#endif
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// Define a 16 bytes packed data type.
|
||||
template <class Type>
|
||||
struct PackedVec {
|
||||
typename TypeConverter<Type>::Type elts[4];
|
||||
};
|
||||
|
||||
template <>
|
||||
struct PackedVec<__nv_fp8_e4m3> {
|
||||
__nv_fp8x2_e4m3 elts[8];
|
||||
};
|
||||
|
||||
// Quantizes the provided PackedVec into the uint32_t output
|
||||
template <class Type, bool UE8M0_SF = false>
|
||||
__device__ uint32_t cvt_warp_fp16_to_fp4(PackedVec<Type>& vec, float SFScaleVal,
|
||||
uint8_t* SFout) {
|
||||
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
|
||||
// Get absolute maximum values among the local 8 values.
|
||||
auto localMax = __habs2(vec.elts[0]);
|
||||
|
||||
// Local maximum value.
|
||||
#pragma unroll
|
||||
for (int i = 1; i < CVT_FP4_ELTS_PER_THREAD / 2; i++) {
|
||||
localMax = __hmax2(localMax, __habs2(vec.elts[i]));
|
||||
}
|
||||
|
||||
// Get the absolute maximum among all 16 values (two threads).
|
||||
localMax = __hmax2(__shfl_xor_sync(uint32_t(-1), localMax, 1), localMax);
|
||||
// Get the final absolute maximum values.
|
||||
float vecMax = float(__hmax(localMax.x, localMax.y));
|
||||
|
||||
// Get the SF (max value of the vector / max value of e2m1).
|
||||
// maximum value of e2m1 = 6.0.
|
||||
// TODO: use half as compute data type.
|
||||
float SFValue = SFScaleVal * (vecMax * reciprocal_approximate_ftz(6.0f));
|
||||
// 8 bits representation of the SF.
|
||||
uint8_t fp8SFVal;
|
||||
// Write the SF to global memory (STG.8).
|
||||
if constexpr (UE8M0_SF) {
|
||||
// Extract the 8 exponent bits from float32.
|
||||
// float 32bits = 1 sign bit + 8 exponent bits + 23 mantissa bits.
|
||||
uint32_t tmp = reinterpret_cast<uint32_t&>(SFValue) >> 23;
|
||||
fp8SFVal = tmp & 0xff;
|
||||
// Convert back to fp32.
|
||||
reinterpret_cast<uint32_t&>(SFValue) = tmp << 23;
|
||||
} else {
|
||||
// Here SFValue is always positive, so E4M3 is the same as UE4M3.
|
||||
__nv_fp8_e4m3 tmp = __nv_fp8_e4m3(SFValue);
|
||||
reinterpret_cast<__nv_fp8_e4m3&>(fp8SFVal) = tmp;
|
||||
// Convert back to fp32.
|
||||
SFValue = float(tmp);
|
||||
}
|
||||
// Get the output scale.
|
||||
// Recipe: final_scale = reciprocal(fp32(fp8(SFValue * SFScaleVal))) *
|
||||
// reciprocal(SFScaleVal))
|
||||
float outputScale =
|
||||
SFValue != 0 ? reciprocal_approximate_ftz(
|
||||
SFValue * reciprocal_approximate_ftz(SFScaleVal))
|
||||
: 0.0f;
|
||||
|
||||
if (SFout) {
|
||||
// Write the SF to global memory (STG.8).
|
||||
*SFout = fp8SFVal;
|
||||
}
|
||||
|
||||
// Convert the input to float.
|
||||
float2 fp2Vals[CVT_FP4_ELTS_PER_THREAD / 2];
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 0; i < CVT_FP4_ELTS_PER_THREAD / 2; i++) {
|
||||
if constexpr (std::is_same_v<Type, half>) {
|
||||
fp2Vals[i] = __half22float2(vec.elts[i]);
|
||||
} else {
|
||||
fp2Vals[i] = __bfloat1622float2(vec.elts[i]);
|
||||
}
|
||||
fp2Vals[i].x *= outputScale;
|
||||
fp2Vals[i].y *= outputScale;
|
||||
}
|
||||
|
||||
// Convert to e2m1 values.
|
||||
uint32_t e2m1Vec = fp32_vec_to_e2m1(fp2Vals);
|
||||
|
||||
// Write the e2m1 values to global memory.
|
||||
return e2m1Vec;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
namespace vllm {
|
||||
|
||||
// Use UE4M3 by default.
|
||||
template <class Type, bool UE8M0_SF = false>
|
||||
__global__ void
|
||||
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
|
||||
__launch_bounds__(512, 4) cvt_fp16_to_fp4(
|
||||
#else
|
||||
cvt_fp16_to_fp4(
|
||||
#endif
|
||||
int32_t numRows, int32_t numCols, Type const* in, float const* SFScale,
|
||||
uint32_t* out, uint32_t* SFout) {
|
||||
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
|
||||
__global__ void __launch_bounds__(512, 4)
|
||||
cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in,
|
||||
float const* SFScale, uint32_t* out, uint32_t* SFout) {
|
||||
using PackedVec = PackedVec<Type>;
|
||||
static constexpr int CVT_FP4_NUM_THREADS_PER_SF =
|
||||
(CVT_FP4_SF_VEC_SIZE / CVT_FP4_ELTS_PER_THREAD);
|
||||
@ -293,7 +66,6 @@ cvt_fp16_to_fp4(
|
||||
cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(in_vec, SFScaleVal, sf_out);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
@ -332,6 +104,8 @@ template void invokeFP4Quantization(int m, int n, __nv_bfloat16 const* input,
|
||||
int multiProcessorCount,
|
||||
cudaStream_t stream);
|
||||
|
||||
} // namespace vllm
|
||||
|
||||
void scaled_fp4_quant_sm1xxa(torch::Tensor const& output,
|
||||
torch::Tensor const& input,
|
||||
torch::Tensor const& output_sf,
|
||||
@ -340,6 +114,9 @@ void scaled_fp4_quant_sm1xxa(torch::Tensor const& output,
|
||||
int32_t n = input.size(1);
|
||||
|
||||
TORCH_CHECK(n % 16 == 0, "The N dimension must be multiple of 16.");
|
||||
TORCH_CHECK(input.scalar_type() == at::ScalarType::Half ||
|
||||
input.scalar_type() == at::ScalarType::BFloat16,
|
||||
"Unsupported input data type for quantize_to_fp4.");
|
||||
|
||||
int multiProcessorCount =
|
||||
get_device_attribute(cudaDevAttrMultiProcessorCount, -1);
|
||||
@ -353,24 +130,10 @@ void scaled_fp4_quant_sm1xxa(torch::Tensor const& output,
|
||||
// We don't support e8m0 scales at this moment.
|
||||
bool useUE8M0 = false;
|
||||
|
||||
switch (input.scalar_type()) {
|
||||
case torch::kHalf: {
|
||||
auto input_ptr = reinterpret_cast<half const*>(input.data_ptr());
|
||||
invokeFP4Quantization(m, n, input_ptr, input_sf_ptr, output_ptr, sf_out,
|
||||
useUE8M0, multiProcessorCount, stream);
|
||||
break;
|
||||
}
|
||||
case torch::kBFloat16: {
|
||||
auto input_ptr = reinterpret_cast<__nv_bfloat16 const*>(input.data_ptr());
|
||||
invokeFP4Quantization(m, n, input_ptr, input_sf_ptr, output_ptr, sf_out,
|
||||
useUE8M0, multiProcessorCount, stream);
|
||||
break;
|
||||
}
|
||||
default: {
|
||||
std::cerr << "Observing: " << input.scalar_type()
|
||||
<< " for the input datatype which is invalid";
|
||||
throw std::runtime_error(
|
||||
"Unsupported input data type for quantize_to_fp4.");
|
||||
}
|
||||
}
|
||||
VLLM_DISPATCH_HALF_TYPES(input.scalar_type(), "nvfp4_quant_kernel", [&] {
|
||||
using cuda_type = vllm::CUDATypeConverter<scalar_t>::Type;
|
||||
auto input_ptr = static_cast<cuda_type const*>(input.data_ptr());
|
||||
vllm::invokeFP4Quantization(m, n, input_ptr, input_sf_ptr, output_ptr,
|
||||
sf_out, useUE8M0, multiProcessorCount, stream);
|
||||
});
|
||||
}
|
||||
|
||||
251
csrc/quantization/fp4/nvfp4_utils.cuh
Normal file
@ -0,0 +1,251 @@
|
||||
/*
|
||||
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
#include <cuda_fp8.h>
|
||||
|
||||
#define ELTS_PER_THREAD 8
|
||||
|
||||
constexpr int CVT_FP4_ELTS_PER_THREAD = 8;
|
||||
constexpr int CVT_FP4_SF_VEC_SIZE = 16;
|
||||
|
||||
namespace vllm {
|
||||
|
||||
// Convert PyTorch cpp type to CUDA type
|
||||
template <typename T>
|
||||
struct CUDATypeConverter {
|
||||
using Type = T;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct CUDATypeConverter<at::Half> {
|
||||
using Type = half;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct CUDATypeConverter<at::BFloat16> {
|
||||
using Type = __nv_bfloat16;
|
||||
};
|
||||
|
||||
// Get type2 from type or vice versa (applied to half and bfloat16)
|
||||
template <typename T>
|
||||
struct TypeConverter {
|
||||
using Type = half2;
|
||||
}; // keep for generality
|
||||
|
||||
template <>
|
||||
struct TypeConverter<half2> {
|
||||
using Type = half;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct TypeConverter<half> {
|
||||
using Type = half2;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct TypeConverter<__nv_bfloat162> {
|
||||
using Type = __nv_bfloat16;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct TypeConverter<__nv_bfloat16> {
|
||||
using Type = __nv_bfloat162;
|
||||
};
|
||||
|
||||
// Define a 16 bytes packed data type.
|
||||
template <class Type>
|
||||
struct PackedVec {
|
||||
typename TypeConverter<Type>::Type elts[4];
|
||||
};
|
||||
|
||||
template <>
|
||||
struct PackedVec<__nv_fp8_e4m3> {
|
||||
__nv_fp8x2_e4m3 elts[8];
|
||||
};
|
||||
|
||||
// Convert 8 float32 values into 8 e2m1 values (represented as one uint32_t).
|
||||
inline __device__ uint32_t fp32_vec_to_e2m1(float (&array)[8]) {
|
||||
uint32_t val;
|
||||
asm volatile(
|
||||
"{\n"
|
||||
".reg .b8 byte0;\n"
|
||||
".reg .b8 byte1;\n"
|
||||
".reg .b8 byte2;\n"
|
||||
".reg .b8 byte3;\n"
|
||||
"cvt.rn.satfinite.e2m1x2.f32 byte0, %2, %1;\n"
|
||||
"cvt.rn.satfinite.e2m1x2.f32 byte1, %4, %3;\n"
|
||||
"cvt.rn.satfinite.e2m1x2.f32 byte2, %6, %5;\n"
|
||||
"cvt.rn.satfinite.e2m1x2.f32 byte3, %8, %7;\n"
|
||||
"mov.b32 %0, {byte0, byte1, byte2, byte3};\n"
|
||||
"}"
|
||||
: "=r"(val)
|
||||
: "f"(array[0]), "f"(array[1]), "f"(array[2]), "f"(array[3]),
|
||||
"f"(array[4]), "f"(array[5]), "f"(array[6]), "f"(array[7]));
|
||||
return val;
|
||||
}
|
||||
|
||||
// Convert 4 float2 values into 8 e2m1 values (represented as one uint32_t).
|
||||
inline __device__ uint32_t fp32_vec_to_e2m1(float2 (&array)[4]) {
|
||||
uint32_t val;
|
||||
asm volatile(
|
||||
"{\n"
|
||||
".reg .b8 byte0;\n"
|
||||
".reg .b8 byte1;\n"
|
||||
".reg .b8 byte2;\n"
|
||||
".reg .b8 byte3;\n"
|
||||
"cvt.rn.satfinite.e2m1x2.f32 byte0, %2, %1;\n"
|
||||
"cvt.rn.satfinite.e2m1x2.f32 byte1, %4, %3;\n"
|
||||
"cvt.rn.satfinite.e2m1x2.f32 byte2, %6, %5;\n"
|
||||
"cvt.rn.satfinite.e2m1x2.f32 byte3, %8, %7;\n"
|
||||
"mov.b32 %0, {byte0, byte1, byte2, byte3};\n"
|
||||
"}"
|
||||
: "=r"(val)
|
||||
: "f"(array[0].x), "f"(array[0].y), "f"(array[1].x), "f"(array[1].y),
|
||||
"f"(array[2].x), "f"(array[2].y), "f"(array[3].x), "f"(array[3].y));
|
||||
return val;
|
||||
}
|
||||
|
||||
// Fast reciprocal.
|
||||
inline __device__ float reciprocal_approximate_ftz(float a) {
|
||||
float b;
|
||||
asm volatile("rcp.approx.ftz.f32 %0, %1;\n" : "=f"(b) : "f"(a));
|
||||
return b;
|
||||
}
|
||||
|
||||
template <class SFType, int CVT_FP4_NUM_THREADS_PER_SF>
|
||||
__device__ uint8_t* cvt_quant_to_fp4_get_sf_out_offset(int rowIdx, int colIdx,
|
||||
int numCols,
|
||||
SFType* SFout) {
|
||||
static_assert(CVT_FP4_NUM_THREADS_PER_SF == 1 ||
|
||||
CVT_FP4_NUM_THREADS_PER_SF == 2);
|
||||
|
||||
// One pair of threads write one SF to global memory.
|
||||
// TODO: stage through smem for packed STG.32
|
||||
// is it better than STG.8 from 4 threads ?
|
||||
if (threadIdx.x % CVT_FP4_NUM_THREADS_PER_SF == 0) {
|
||||
// SF vector index (16 elements share one SF in the K dimension).
|
||||
int32_t kIdx = colIdx / CVT_FP4_NUM_THREADS_PER_SF;
|
||||
int32_t mIdx = rowIdx;
|
||||
|
||||
// SF layout [numMTiles, numKTiles, 32 (mTile), 4 (mTile), 4(kTile)]
|
||||
// --> index [mTileIdx, kTileIdx, outerMIdx, innerMIdx, innerKIdx]
|
||||
|
||||
int32_t mTileIdx = mIdx / (32 * 4);
|
||||
// SF vector size 16.
|
||||
int factor = CVT_FP4_SF_VEC_SIZE * 4;
|
||||
int32_t numKTiles = (numCols + factor - 1) / factor;
|
||||
int64_t mTileStride = numKTiles * 32 * 4 * 4;
|
||||
|
||||
int32_t kTileIdx = (kIdx / 4);
|
||||
int64_t kTileStride = 32 * 4 * 4;
|
||||
|
||||
// M tile layout [32, 4] is column-major.
|
||||
int32_t outerMIdx = (mIdx % 32);
|
||||
int64_t outerMStride = 4 * 4;
|
||||
|
||||
int32_t innerMIdx = (mIdx % (32 * 4)) / 32;
|
||||
int64_t innerMStride = 4;
|
||||
|
||||
int32_t innerKIdx = (kIdx % 4);
|
||||
int64_t innerKStride = 1;
|
||||
|
||||
// Compute the global offset.
|
||||
int64_t SFOffset = mTileIdx * mTileStride + kTileIdx * kTileStride +
|
||||
outerMIdx * outerMStride + innerMIdx * innerMStride +
|
||||
innerKIdx * innerKStride;
|
||||
|
||||
return reinterpret_cast<uint8_t*>(SFout) + SFOffset;
|
||||
}
|
||||
return nullptr;
|
||||
}
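
The swizzled scale-factor layout computed above ([numMTiles, numKTiles, 32 (mTile), 4 (mTile), 4 (kTile)]) can be handy to reproduce on the host when checking buffers. The following is only an illustrative Python transcription of the same index arithmetic; the function name is mine, and `k_vec_idx` denotes the index of the 16-element SF vector along K.

```python
# Illustrative host-side transcription of the swizzled SF offset computation above.
CVT_FP4_SF_VEC_SIZE = 16

def sf_out_offset(row_idx: int, k_vec_idx: int, num_cols: int) -> int:
    """Byte offset of the scale factor for (row_idx, k_vec_idx) in the swizzled layout."""
    factor = CVT_FP4_SF_VEC_SIZE * 4
    num_k_tiles = (num_cols + factor - 1) // factor

    m_tile_idx = row_idx // (32 * 4)
    k_tile_idx = k_vec_idx // 4
    m_tile_stride = num_k_tiles * 32 * 4 * 4
    k_tile_stride = 32 * 4 * 4

    outer_m_idx, outer_m_stride = row_idx % 32, 4 * 4
    inner_m_idx, inner_m_stride = (row_idx % (32 * 4)) // 32, 4
    inner_k_idx, inner_k_stride = k_vec_idx % 4, 1

    return (m_tile_idx * m_tile_stride + k_tile_idx * k_tile_stride +
            outer_m_idx * outer_m_stride + inner_m_idx * inner_m_stride +
            inner_k_idx * inner_k_stride)

# Example: offset of the first scale factor of row 1 in a 64-column (4 SF vectors) tensor.
print(sf_out_offset(1, 0, 64))
```
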
|
||||
|
||||
// Quantizes the provided PackedVec into the uint32_t output
|
||||
template <class Type, bool UE8M0_SF = false>
|
||||
__device__ uint32_t cvt_warp_fp16_to_fp4(PackedVec<Type>& vec, float SFScaleVal,
|
||||
uint8_t* SFout) {
|
||||
// Get absolute maximum values among the local 8 values.
|
||||
auto localMax = __habs2(vec.elts[0]);
|
||||
|
||||
// Local maximum value.
|
||||
#pragma unroll
|
||||
for (int i = 1; i < CVT_FP4_ELTS_PER_THREAD / 2; i++) {
|
||||
localMax = __hmax2(localMax, __habs2(vec.elts[i]));
|
||||
}
|
||||
|
||||
// Get the absolute maximum among all 16 values (two threads).
|
||||
localMax = __hmax2(__shfl_xor_sync(uint32_t(-1), localMax, 1), localMax);
|
||||
// Get the final absolute maximum values.
|
||||
float vecMax = float(__hmax(localMax.x, localMax.y));
|
||||
|
||||
// Get the SF (max value of the vector / max value of e2m1).
|
||||
// maximum value of e2m1 = 6.0.
|
||||
// TODO: use half as compute data type.
|
||||
float SFValue = SFScaleVal * (vecMax * reciprocal_approximate_ftz(6.0f));
|
||||
// 8 bits representation of the SF.
|
||||
uint8_t fp8SFVal;
|
||||
// Write the SF to global memory (STG.8).
|
||||
if constexpr (UE8M0_SF) {
|
||||
// Extract the 8 exponent bits from float32.
|
||||
// float 32bits = 1 sign bit + 8 exponent bits + 23 mantissa bits.
|
||||
uint32_t tmp = reinterpret_cast<uint32_t&>(SFValue) >> 23;
|
||||
fp8SFVal = tmp & 0xff;
|
||||
// Convert back to fp32.
|
||||
reinterpret_cast<uint32_t&>(SFValue) = tmp << 23;
|
||||
} else {
|
||||
// Here SFValue is always positive, so E4M3 is the same as UE4M3.
|
||||
__nv_fp8_e4m3 tmp = __nv_fp8_e4m3(SFValue);
|
||||
reinterpret_cast<__nv_fp8_e4m3&>(fp8SFVal) = tmp;
|
||||
// Convert back to fp32.
|
||||
SFValue = float(tmp);
|
||||
}
|
||||
// Get the output scale.
|
||||
// Recipe: final_scale = reciprocal(fp32(fp8(SFValue * SFScaleVal))) *
|
||||
// reciprocal(SFScaleVal))
|
||||
float outputScale =
|
||||
SFValue != 0 ? reciprocal_approximate_ftz(
|
||||
SFValue * reciprocal_approximate_ftz(SFScaleVal))
|
||||
: 0.0f;
|
||||
|
||||
if (SFout) {
|
||||
// Write the SF to global memory (STG.8).
|
||||
*SFout = fp8SFVal;
|
||||
}
|
||||
|
||||
// Convert the input to float.
|
||||
float2 fp2Vals[CVT_FP4_ELTS_PER_THREAD / 2];
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 0; i < CVT_FP4_ELTS_PER_THREAD / 2; i++) {
|
||||
if constexpr (std::is_same_v<Type, half>) {
|
||||
fp2Vals[i] = __half22float2(vec.elts[i]);
|
||||
} else {
|
||||
fp2Vals[i] = __bfloat1622float2(vec.elts[i]);
|
||||
}
|
||||
fp2Vals[i].x *= outputScale;
|
||||
fp2Vals[i].y *= outputScale;
|
||||
}
|
||||
|
||||
// Convert to e2m1 values.
|
||||
uint32_t e2m1Vec = fp32_vec_to_e2m1(fp2Vals);
|
||||
|
||||
// Write the e2m1 values to global memory.
|
||||
return e2m1Vec;
|
||||
}
|
||||
|
||||
} // namespace vllm
|
||||
@ -417,7 +417,7 @@ def create_sources(impl_configs: list[ImplConfig], num_impl_files=8):
|
||||
))
|
||||
|
||||
def prepacked_type_key(prepack_type: PrepackTypeConfig):
|
||||
# For now we we can just use the first accumulator type seen since
|
||||
# For now, we can just use the first accumulator type seen since
|
||||
# the tensor core shapes/layouts don't vary based on accumulator
|
||||
# type so we can generate less code this way
|
||||
return (prepack_type.a, prepack_type.b_num_bits, prepack_type.convert)
|
||||
|
||||
@ -115,8 +115,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
||||
"silu_and_mul_quant(Tensor! result, Tensor input, Tensor scale) -> ()");
|
||||
ops.impl("silu_and_mul_quant", torch::kCUDA, &silu_and_mul_quant);
|
||||
|
||||
#if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \
|
||||
(defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120)
|
||||
#ifndef USE_ROCM
|
||||
ops.def(
|
||||
"silu_and_mul_nvfp4_quant(Tensor! result, Tensor! result_block_scale, "
|
||||
"Tensor input, Tensor input_global_scale) -> ()");
|
||||
@ -517,10 +516,10 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
||||
|
||||
// SM100 CUTLASS MLA decode
|
||||
ops.def(
|
||||
"sm100_cutlass_mla_decode(Tensor! out, Tensor q_nope, Tensor q_pe,"
|
||||
" Tensor kv_c_and_k_pe_cache, Tensor seq_lens,"
|
||||
" Tensor page_table, Tensor workspace, float "
|
||||
"scale,"
|
||||
"sm100_cutlass_mla_decode(Tensor! out, Tensor! lse, Tensor q_nope,"
|
||||
" Tensor q_pe, Tensor kv_c_and_k_pe_cache,"
|
||||
" Tensor seq_lens, Tensor page_table,"
|
||||
" Tensor workspace, float scale,"
|
||||
" int num_kv_splits) -> ()");
|
||||
// conditionally compiled so impl in source file
|
||||
|
||||
@ -694,16 +693,6 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
|
||||
" Tensor scale) -> ()");
|
||||
cache_ops.impl("concat_and_cache_mla", torch::kCUDA, &concat_and_cache_mla);
|
||||
|
||||
cache_ops.def(
|
||||
"cp_fused_concat_and_cache_mla(Tensor kv_c, Tensor k_pe,"
|
||||
" Tensor cp_local_token_select_indices,"
|
||||
" Tensor! kv_cache,"
|
||||
" Tensor slot_mapping,"
|
||||
" str kv_cache_dtype,"
|
||||
" Tensor scale) -> ()");
|
||||
cache_ops.impl("cp_fused_concat_and_cache_mla", torch::kCUDA,
|
||||
&cp_fused_concat_and_cache_mla);
|
||||
|
||||
// Convert the key and value cache to fp8 data type.
|
||||
cache_ops.def(
|
||||
"convert_fp8(Tensor! dst_cache, Tensor src_cache, float scale, "
|
||||
|
||||
@ -1,56 +0,0 @@
|
||||
# default base image
|
||||
# https://gallery.ecr.aws/neuron/pytorch-inference-neuronx
|
||||
ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.6.0-neuronx-py310-sdk2.23.0-ubuntu22.04"
|
||||
|
||||
FROM $BASE_IMAGE
|
||||
|
||||
RUN echo "Base image is $BASE_IMAGE"
|
||||
|
||||
# Install some basic utilities
|
||||
RUN apt-get update && \
|
||||
apt-get install -y \
|
||||
git \
|
||||
python3 \
|
||||
python3-pip \
|
||||
ffmpeg libsm6 libxext6 libgl1
|
||||
|
||||
### Mount Point ###
|
||||
# When launching the container, mount the code directory to /workspace
|
||||
ARG APP_MOUNT=/workspace
|
||||
VOLUME [ ${APP_MOUNT} ]
|
||||
WORKDIR ${APP_MOUNT}/vllm
|
||||
|
||||
RUN python3 -m pip install --upgrade pip
|
||||
RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas tenacity
|
||||
RUN python3 -m pip install neuronx-cc==2.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
|
||||
RUN python3 -m pip install pytest
|
||||
|
||||
# uninstall transformers-neuronx package explicitly to avoid version conflict
|
||||
RUN python3 -m pip uninstall -y transformers-neuronx
|
||||
|
||||
COPY . .
|
||||
ARG GIT_REPO_CHECK=0
|
||||
RUN --mount=type=bind,source=.git,target=.git \
|
||||
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
|
||||
|
||||
RUN python3 -m pip install -U \
|
||||
'cmake>=3.26.1' ninja packaging 'setuptools-scm>=8' wheel jinja2 \
|
||||
-r requirements/neuron.txt
|
||||
|
||||
ENV VLLM_TARGET_DEVICE neuron
|
||||
RUN --mount=type=bind,source=.git,target=.git \
|
||||
pip install --no-build-isolation -v -e .
|
||||
|
||||
# install development dependencies (for testing)
|
||||
RUN python3 -m pip install -e tests/vllm_test_utils
|
||||
|
||||
# install transformers-neuronx package as an optional dependencies (for V0)
|
||||
# FIXME: `--no-deps` argument is temporarily added to resolve transformers package version conflict
|
||||
RUN python3 -m pip install transformers-neuronx==0.13.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U --no-deps
|
||||
|
||||
RUN python3 -m pip install sentencepiece transformers==4.48.0 -U
|
||||
|
||||
# overwrite entrypoint to run bash script
|
||||
RUN echo "import subprocess; import sys; subprocess.check_call(sys.argv[1:])" > /usr/local/bin/dockerd-entrypoint.py
|
||||
|
||||
CMD ["/bin/bash"]
|
||||
@ -16,7 +16,8 @@ ENV LANG=C.UTF-8 \
|
||||
RUN microdnf install -y \
|
||||
which procps findutils tar vim git gcc gcc-gfortran g++ make patch zlib-devel \
|
||||
libjpeg-turbo-devel libtiff-devel libpng-devel libwebp-devel freetype-devel harfbuzz-devel \
|
||||
openssl-devel openblas openblas-devel autoconf automake libtool cmake numpy libsndfile && \
|
||||
openssl-devel openblas openblas-devel autoconf automake libtool cmake numpy libsndfile \
|
||||
clang llvm-devel llvm-static clang-devel && \
|
||||
microdnf clean all
|
||||
|
||||
# Python Installation
|
||||
@ -191,7 +192,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
-DCOMPILER_RT_BUILD_ORC=OFF \
|
||||
-DCOMPILER_RT_INCLUDE_TESTS=OFF \
|
||||
${CMAKE_ARGS} -GNinja ../llvm \
|
||||
|
||||
&& ninja install . && \
|
||||
# build llvmlite
|
||||
cd ../../llvmlite && python setup.py bdist_wheel && \
|
||||
@ -200,6 +200,45 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
sed -i '/#include "internal\/pycore_atomic.h"/i\#include "dynamic_annotations.h"' numba/_dispatcher.cpp; \
|
||||
fi && python setup.py bdist_wheel
|
||||
|
||||
# Edit aws-lc-sys to support s390x
|
||||
FROM python-install AS aws-lc-sys-editor
|
||||
WORKDIR /tmp
|
||||
ENV CARGO_HOME=/root/.cargo
|
||||
ENV RUSTUP_HOME=/root/.rustup
|
||||
ENV PATH="$CARGO_HOME/bin:$RUSTUP_HOME/bin:$PATH"
|
||||
ARG AWS_LC_VERSION=v0.30.0
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
--mount=type=bind,from=rust,source=/root/.cargo,target=/root/.cargo,rw \
|
||||
--mount=type=bind,from=rust,source=/root/.rustup,target=/root/.rustup,rw \
|
||||
git clone --recursive https://github.com/aws/aws-lc-rs.git && \
|
||||
cd aws-lc-rs && \
|
||||
git checkout tags/aws-lc-sys/${AWS_LC_VERSION} && \
|
||||
git submodule sync && \
|
||||
git submodule update --init --recursive && \
|
||||
cd aws-lc-sys && \
|
||||
sed -i '682 s/strncmp(buf, "-----END ", 9)/memcmp(buf, "-----END ", 9)/' aws-lc/crypto/pem/pem_lib.c && \
|
||||
sed -i '712 s/strncmp(buf, "-----END ", 9)/memcmp(buf, "-----END ", 9)/' aws-lc/crypto/pem/pem_lib.c && \
|
||||
sed -i '747 s/strncmp(buf, "-----END ", 9)/memcmp(buf, "-----END ", 9)/' aws-lc/crypto/pem/pem_lib.c
|
||||
|
||||
# Build Outlines Core
|
||||
FROM python-install AS outlines-core-builder
|
||||
WORKDIR /tmp
|
||||
ENV CARGO_HOME=/root/.cargo
|
||||
ENV RUSTUP_HOME=/root/.rustup
|
||||
ENV PATH="$CARGO_HOME/bin:$RUSTUP_HOME/bin:$PATH"
|
||||
ARG OUTLINES_CORE_VERSION=0.2.10
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
--mount=type=bind,from=rust,source=/root/.cargo,target=/root/.cargo,rw \
|
||||
--mount=type=bind,from=rust,source=/root/.rustup,target=/root/.rustup,rw \
|
||||
--mount=type=bind,from=aws-lc-sys-editor,source=/tmp/aws-lc-rs/aws-lc-sys,target=/tmp/aws-lc-sys,rw \
|
||||
git clone https://github.com/dottxt-ai/outlines-core.git && \
|
||||
cd outlines-core && \
|
||||
git checkout tags/${OUTLINES_CORE_VERSION} && \
|
||||
sed -i "s/version = \"0.0.0\"/version = \"${OUTLINES_CORE_VERSION}\"/" Cargo.toml && \
|
||||
echo '[patch.crates-io]' >> Cargo.toml && \
|
||||
echo 'aws-lc-sys = { path = "/tmp/aws-lc-sys" }' >> Cargo.toml && \
|
||||
uv pip install maturin && \
|
||||
python -m maturin build --release --out dist
|
||||
|
||||
# Final build stage
|
||||
FROM python-install AS vllm-cpu
|
||||
@ -230,6 +269,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
--mount=type=bind,from=torch,source=/tmp/pytorch/dist,target=/tmp/torch-wheels/ \
|
||||
--mount=type=bind,from=numba-builder,source=/tmp/llvmlite/dist,target=/tmp/llvmlite-wheels/ \
|
||||
--mount=type=bind,from=numba-builder,source=/tmp/numba/dist,target=/tmp/numba-wheels/ \
|
||||
--mount=type=bind,from=outlines-core-builder,source=/tmp/outlines-core/dist,target=/tmp/outlines-core/dist/ \
|
||||
sed -i '/^torch/d' requirements/build.txt && \
|
||||
ARROW_WHL_FILE=$(ls /tmp/arrow-wheels/pyarrow-*.whl) && \
|
||||
VISION_WHL_FILE=$(ls /tmp/vision-wheels/*.whl) && \
|
||||
@ -237,6 +277,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
TORCH_WHL_FILE=$(ls /tmp/torch-wheels/*.whl) && \
|
||||
LLVM_WHL_FILE=$(ls /tmp/llvmlite-wheels/*.whl) && \
|
||||
NUMBA_WHL_FILE=$(ls /tmp/numba-wheels/*.whl) && \
|
||||
OUTLINES_CORE_WHL_FILE=$(ls /tmp/outlines-core/dist/*.whl) && \
|
||||
uv pip install -v \
|
||||
$ARROW_WHL_FILE \
|
||||
$VISION_WHL_FILE \
|
||||
@ -244,6 +285,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
$TORCH_WHL_FILE \
|
||||
$LLVM_WHL_FILE \
|
||||
$NUMBA_WHL_FILE \
|
||||
$OUTLINES_CORE_WHL_FILE \
|
||||
--index-strategy unsafe-best-match \
|
||||
-r requirements/build.txt \
|
||||
-r requirements/cpu.txt
|
||||
|
||||
@ -1,12 +1,10 @@
|
||||
FROM intel/deep-learning-essentials:2025.1.3-0-devel-ubuntu24.04 AS vllm-base
|
||||
|
||||
RUN rm /etc/apt/sources.list.d/intel-graphics.list
|
||||
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && \
|
||||
echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list && \
|
||||
add-apt-repository -y ppa:kobuk-team/intel-graphics
|
||||
|
||||
RUN apt clean && apt-get update -y && \
|
||||
apt-get install -y software-properties-common && \
|
||||
add-apt-repository ppa:deadsnakes/ppa && \
|
||||
apt-get install -y python3.10 python3.10-distutils && \
|
||||
curl -sS https://bootstrap.pypa.io/get-pip.py | python3.10 && \
|
||||
apt-get install -y --no-install-recommends --fix-missing \
|
||||
curl \
|
||||
ffmpeg \
|
||||
@ -17,17 +15,29 @@ RUN apt clean && apt-get update -y && \
|
||||
libgl1 \
|
||||
lsb-release \
|
||||
numactl \
|
||||
python3.10-dev \
|
||||
wget
|
||||
wget \
|
||||
vim \
|
||||
python3.12 \
|
||||
python3.12-dev \
|
||||
python3-pip
|
||||
|
||||
RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.12 1
|
||||
RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.12 1
|
||||
|
||||
RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1
|
||||
RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.10 1
|
||||
RUN apt install -y libze1 libze-dev libze-intel-gpu1 intel-opencl-icd libze-intel-gpu-raytracing
|
||||
|
||||
RUN wget https://github.com/uxlfoundation/oneCCL/releases/download/2021.15.4/intel-oneccl-2021.15.4.11_offline.sh
|
||||
RUN bash intel-oneccl-2021.15.4.11_offline.sh -a --silent --eula accept && echo "source /opt/intel/oneapi/setvars.sh --force" >> /root/.bashrc
|
||||
SHELL ["bash", "-c"]
|
||||
CMD ["bash", "-c", "source /root/.bashrc && exec bash"]
|
||||
|
||||
WORKDIR /workspace/vllm
|
||||
COPY requirements/xpu.txt /workspace/vllm/requirements/xpu.txt
|
||||
COPY requirements/common.txt /workspace/vllm/requirements/common.txt
|
||||
|
||||
# suppress the python externally managed environment error
|
||||
RUN python3 -m pip config set global.break-system-packages true
|
||||
|
||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||
pip install --no-cache-dir \
|
||||
-r requirements/xpu.txt
|
||||
@ -54,8 +64,9 @@ FROM vllm-base AS vllm-openai
|
||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||
pip install accelerate hf_transfer pytest pytest_asyncio lm_eval[api] modelscope
|
||||
|
||||
ENV VLLM_USAGE_SOURCE production-docker-image \
|
||||
TRITON_XPU_PROFILE 1
|
||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
||||
pip uninstall oneccl oneccl-devel -y
|
||||
|
||||
# install development dependencies (for testing)
|
||||
RUN python3 -m pip install -e tests/vllm_test_utils
|
||||
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
|
||||
|
||||
@ -32,10 +32,7 @@ nav:
- models/pooling_models.md
- models/extensions
- Hardware Supported Models: models/hardware_supported_models
- Features:
- features/compatibility_matrix.md
- features/*
- features/quantization
- Features: features
- Developer Guide:
- contributing/README.md
- General:

@ -2,6 +2,7 @@

We host regular meetups in the San Francisco Bay Area every two months. We share project updates from the vLLM team and have guest speakers from the industry share their experience and insights. Please find the materials of our previous meetups below:

- [vLLM Shenzhen Meetup](https://mp.weixin.qq.com/s/k8ZBO1u2_2odgiKWH_GVTQ), August 30th 2025. [[Slides]](https://drive.google.com/drive/folders/1Ua2SVKVSu-wp5vou_6ElraDt2bnKhiEA)
- [vLLM Singapore Meetup](https://www.sginnovate.com/event/vllm-sg-meet), August 27th 2025. [[Slides]](https://drive.google.com/drive/folders/1ncf3GyqLdqFaB6IeB834E5TZJPLAOiXZ?usp=sharing)
- [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/pDmAXHcN7Iqc8sUKgJgGtg), August 23rd 2025. [[Slides]](https://drive.google.com/drive/folders/1OvLx39wnCGy_WKq8SiVKf7YcxxYI3WCH)
- [vLLM Korea Meetup](https://luma.com/cgcgprmh), August 19th 2025. [[Slides]](https://drive.google.com/file/d/1bcrrAE1rxUgx0mjIeOWT6hNe2RefC5Hm/view).

@ -11,9 +11,39 @@ vLLM contains two sets of benchmarks:

The performance benchmarks are used for development to confirm whether new changes improve performance under various workloads. They are triggered on every commit with both the `perf-benchmarks` and `ready` labels, and when a PR is merged into vLLM.

### Manually Trigger the benchmark

Use [vllm-ci-test-repo images](https://gallery.ecr.aws/q9t5s3a7/vllm-ci-test-repo) with the vLLM benchmark suite.
For a CPU environment, please use the image with the "-cpu" postfix.

Here is an example docker run command for CPU:

```bash
docker run -it --entrypoint /bin/bash -v /data/huggingface:/root/.cache/huggingface -e HF_TOKEN='' --shm-size=16g --name vllm-cpu-ci public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:1da94e673c257373280026f75ceb4effac80e892-cpu
```

Then, run the command below inside the docker container:

```bash
bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
```

When run, the benchmark script generates results under the **benchmark/results** folder, along with benchmark_results.md and benchmark_results.json.

#### Runtime environment variables

- `ON_CPU`: set the value to '1' on Intel® Xeon® Processors. Default value is 0.
- `SERVING_JSON`: JSON file to use for the serving tests. Default value is empty string (use default file).
- `LATENCY_JSON`: JSON file to use for the latency tests. Default value is empty string (use default file).
- `THROUGHPUT_JSON`: JSON file to use for the throughput tests. Default value is empty string (use default file).
- `REMOTE_HOST`: IP for the remote vLLM service to benchmark. Default value is empty string.
- `REMOTE_PORT`: Port for the remote vLLM service to benchmark. Default value is empty string.

For more result visualizations, check [visualizing the results](https://github.com/intel-ai-tce/vllm/blob/more_cpu_models/.buildkite/nightly-benchmarks/README.md#visualizing-the-results).

The latest performance results are hosted on the public [vLLM Performance Dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm).

More information on the performance benchmarks and their parameters can be found [here](gh-file:.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md).
More information on the performance benchmarks and their parameters can be found in the [Benchmark README](https://github.com/intel-ai-tce/vllm/blob/more_cpu_models/.buildkite/nightly-benchmarks/README.md) and the [performance benchmark description](gh-file:.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md).

[](){ #nightly-benchmarks }

@ -19,7 +19,7 @@ When using `vllm bench serve`, you can enable profiling by passing the `--profil
Traces can be visualized using <https://ui.perfetto.dev/>.

!!! tip
You can directly call bench module without installing vllm using `python -m vllm.entrypoints.cli.main bench`.
You can directly call the bench module without installing vLLM using `python -m vllm.entrypoints.cli.main bench`.

!!! tip
Only send a few requests through vLLM when profiling, as the traces can get quite large. Also, there is no need to untar the traces; they can be viewed directly.

@ -1,6 +1,6 @@
# Llama Stack

vLLM is also available via [Llama Stack](https://github.com/meta-llama/llama-stack) .
vLLM is also available via [Llama Stack](https://github.com/llamastack/llama-stack).

To install Llama Stack, run

@ -8,9 +8,9 @@ To install Llama Stack, run
pip install llama-stack -q
```

## Inference using OpenAI Compatible API
## Inference using OpenAI-Compatible API

Then start Llama Stack server pointing to your vLLM server with the following configuration:
Then start the Llama Stack server and configure it to point to your vLLM server with the following settings:

```yaml
inference:
@ -20,15 +20,15 @@ inference:
url: http://127.0.0.1:8000
```

Please refer to [this guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/remote-vllm.html) for more details on this remote vLLM provider.
Please refer to [this guide](https://llama-stack.readthedocs.io/en/latest/providers/inference/remote_vllm.html) for more details on this remote vLLM provider.

## Inference via Embedded vLLM
## Inference using Embedded vLLM

An [inline vLLM provider](https://github.com/meta-llama/llama-stack/tree/main/llama_stack/providers/inline/inference/vllm)
An [inline provider](https://github.com/llamastack/llama-stack/tree/main/llama_stack/providers/inline/inference)
is also available. This is a sample configuration using that method:

```yaml
inference
inference:
- provider_type: vllm
config:
model: Llama3.1-8B-Instruct

@ -1,4 +1,6 @@
# Compatibility Matrix
# Features

## Compatibility Matrix

The tables below show mutually exclusive features and the support on some hardware.

@ -12,7 +14,7 @@ The symbols used have the following meanings:
!!! note
Check the ❌ or 🟠 with links to see the tracking issue for an unsupported feature/hardware combination.

## Feature x Feature
### Feature x Feature

<style>
td:not(:first-child) {
@ -56,7 +58,7 @@ th:not(:first-child) {

[](){ #feature-x-hardware }

## Feature x Hardware
### Feature x Hardware

| Feature | Volta | Turing | Ampere | Ada | Hopper | CPU | AMD | TPU |
|-----------------------------------------------------------|---------------------|-----------|-----------|--------|------------|--------------------|--------|-----|
@ -215,19 +215,19 @@ When loading RGBA images (images with transparency), vLLM converts them to RGB f

```python
from vllm import LLM


# Default white background (no configuration needed)
llm = LLM(model="llava-hf/llava-1.5-7b-hf")


# Custom black background for dark theme
llm = LLM(
model="llava-hf/llava-1.5-7b-hf",
media_io_kwargs={"image": {"rgba_background_color": [0, 0, 0]}}
)


# Custom brand color background (e.g., blue)
llm = LLM(
model="llava-hf/llava-1.5-7b-hf",
model="llava-hf/llava-1.5-7b-hf",
media_io_kwargs={"image": {"rgba_background_color": [0, 0, 255]}}
)
```
@ -388,7 +388,7 @@ For Qwen2-VL and MiniCPM-V, we accept additional parameters alongside the embedd

## Online Serving

Our OpenAI-compatible server accepts multi-modal data via the [Chat Completions API](https://platform.openai.com/docs/api-reference/chat).
Our OpenAI-compatible server accepts multi-modal data via the [Chat Completions API](https://platform.openai.com/docs/api-reference/chat). Media inputs also support an optional user-provided UUID that uniquely identifies each media item and is used to cache the media processing results across requests.

!!! important
A chat template is **required** to use Chat Completions API.
@ -438,7 +438,13 @@ Then, you can use the OpenAI client as follows:
# NOTE: The prompt formatting with the image token `<image>` is not needed
# since the prompt will be processed automatically by the API server.
{"type": "text", "text": "What’s in this image?"},
{"type": "image_url", "image_url": {"url": image_url}},
{
"type": "image_url",
"image_url": {
"url": image_url
},
"uuid": image_url # Optional
},
],
}],
)
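
Reusing the same UUID for the same media item across requests is what allows the server to reuse its cached media result. Below is a hedged sketch of that pattern, assuming the `client`, `model`, and `image_url` objects from the example above; the UUID value itself is just an example identifier.

```python
# Reusing the same UUID for the same media across requests lets the server reuse the
# cached processing result instead of re-fetching and re-preprocessing the image.
stable_image_id = "duck-photo-001"  # example value; any identifier that is stable across requests

for question in ["What's in this image?", "What color is the animal?"]:
    chat_response = client.chat.completions.create(
        model=model,
        messages=[{
            "role": "user",
            "content": [
                {"type": "text", "text": question},
                {
                    "type": "image_url",
                    "image_url": {"url": image_url},
                    "uuid": stable_image_id,  # same UUID on every request -> cache hit
                },
            ],
        }],
    )
    print(chat_response.choices[0].message.content)
```
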
|
||||
@ -454,8 +460,20 @@ Then, you can use the OpenAI client as follows:
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "text", "text": "What are the animals in these images?"},
|
||||
{"type": "image_url", "image_url": {"url": image_url_duck}},
|
||||
{"type": "image_url", "image_url": {"url": image_url_lion}},
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": image_url_duck
|
||||
},
|
||||
"uuid": image_url_duck # Optional
|
||||
},
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": image_url_lion
|
||||
},
|
||||
"uuid": image_url_lion # Optional
|
||||
},
|
||||
],
|
||||
}],
|
||||
)
|
||||
@ -522,6 +540,7 @@ Then, you can use the OpenAI client as follows:
|
||||
"video_url": {
|
||||
"url": video_url
|
||||
},
|
||||
"uuid": video_url # Optional
|
||||
},
|
||||
],
|
||||
}],
|
||||
@ -613,6 +632,7 @@ Then, you can use the OpenAI client as follows:
|
||||
"data": audio_base64,
|
||||
"format": "wav"
|
||||
},
|
||||
"uuid": audio_url # Optional
|
||||
},
|
||||
],
|
||||
}],
|
||||
@ -642,6 +662,7 @@ Alternatively, you can pass `audio_url`, which is the audio counterpart of `imag
|
||||
"audio_url": {
|
||||
"url": audio_url
|
||||
},
|
||||
"uuid": audio_url # Optional
|
||||
},
|
||||
],
|
||||
}],
|
||||
@ -695,7 +716,8 @@ The following example demonstrates how to pass image embeddings to the OpenAI se
|
||||
model = "llava-hf/llava-1.5-7b-hf"
|
||||
embeds = {
|
||||
"type": "image_embeds",
|
||||
"image_embeds": f"{base64_image_embedding}"
|
||||
"image_embeds": f"{base64_image_embedding}",
|
||||
"uuid": image_url # Optional
|
||||
}
|
||||
|
||||
# Pass additional parameters (available to Qwen2-VL and MiniCPM-V)
|
||||
@ -706,6 +728,7 @@ The following example demonstrates how to pass image embeddings to the OpenAI se
|
||||
"image_embeds": f"{base64_image_embedding}" , # Required
|
||||
"image_grid_thw": f"{base64_image_grid_thw}" # Required by Qwen/Qwen2-VL-2B-Instruct
|
||||
},
|
||||
"uuid": image_url # Optional
|
||||
}
|
||||
model = "openbmb/MiniCPM-V-2_6"
|
||||
embeds = {
|
||||
@ -714,6 +737,7 @@ The following example demonstrates how to pass image embeddings to the OpenAI se
|
||||
"image_embeds": f"{base64_image_embedding}" , # Required
|
||||
"image_sizes": f"{base64_image_sizes}" # Required by openbmb/MiniCPM-V-2_6
|
||||
},
|
||||
"uuid": image_url # Optional
|
||||
}
|
||||
chat_completion = client.chat.completions.create(
|
||||
messages=[
|
||||
|
||||
@ -180,7 +180,7 @@ Inference batch size is an important parameter for the performance. Larger batch
- Offline Inference: `256 * world_size`
- Online Serving: `128 * world_size`

vLLM CPU supports data parallel (DP), tensor parallel (TP) and pipeline parallel (PP) to leverage multiple CPU sockets and memory nodes. For more details of tuning DP, TP and PP, please refer to [Optimization and Tuning](../../configuration/optimization.md). For vLLM CPU, it is recommend to use DP, TP and PP together if there are enough CPU sockets and memory nodes.
vLLM CPU supports data parallel (DP), tensor parallel (TP) and pipeline parallel (PP) to leverage multiple CPU sockets and memory nodes. For more details of tuning DP, TP and PP, please refer to [Optimization and Tuning](../../configuration/optimization.md). For vLLM CPU, it is recommended to use DP, TP and PP together if there are enough CPU sockets and memory nodes.

### Which quantization configs does vLLM CPU support?

@ -3,13 +3,16 @@
vLLM initially supports basic model inference and serving on Intel GPU platform.

!!! warning
There are no pre-built wheels or images for this device, so you must build vLLM from source.
There are no pre-built wheels for this device, so you need to build vLLM from source. Alternatively, you can use the pre-built images, which are based on released vLLM versions.

# --8<-- [end:installation]
# --8<-- [start:requirements]

- Supported Hardware: Intel Data Center GPU, Intel ARC GPU
- OneAPI requirements: oneAPI 2025.0
- OneAPI requirements: oneAPI 2025.1
- Python: 3.12
!!! warning
The provided IPEX wheel is specific to Python 3.12, so this Python version is required.

# --8<-- [end:requirements]
# --8<-- [start:set-up-using-python]
@ -24,7 +27,7 @@ Currently, there are no pre-built XPU wheels.
# --8<-- [end:pre-built-wheels]
# --8<-- [start:build-wheel-from-source]

- First, install required [driver](https://dgpu-docs.intel.com/driver/installation.html#installing-gpu-drivers) and [Intel OneAPI](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) 2025.0 or later.
- First, install required [driver](https://dgpu-docs.intel.com/driver/installation.html#installing-gpu-drivers) and [Intel OneAPI](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) 2025.1 or later.
- Second, install Python packages for vLLM XPU backend building:

```bash
@ -40,14 +43,10 @@ pip install -v -r requirements/xpu.txt
VLLM_TARGET_DEVICE=xpu python setup.py install
```

!!! note
- FP16 is the default data type in the current XPU backend. The BF16 data
type is supported on Intel Data Center GPU, not supported on Intel Arc GPU yet.

# --8<-- [end:build-wheel-from-source]
# --8<-- [start:pre-built-images]

Currently, there are no pre-built XPU images.
Currently, we release pre-built XPU images on [Docker Hub](https://hub.docker.com/r/intel/vllm/tags) based on released vLLM versions. For more information, please refer to the release [notes](https://github.com/intel/ai-containers/blob/main/vllm).

# --8<-- [end:pre-built-images]
# --8<-- [start:build-image-from-source]
@ -65,14 +64,14 @@ docker run -it \
# --8<-- [end:build-image-from-source]
# --8<-- [start:supported-features]

XPU platform supports **tensor parallel** inference/serving and also supports **pipeline parallel** as a beta feature for online serving. We require Ray as the distributed runtime backend. For example, a reference execution like following:
XPU platform supports **tensor parallel** inference/serving and also supports **pipeline parallel** as a beta feature for online serving. For **pipeline parallel**, we support it on a single node with `mp` as the backend. A reference execution looks like the following:

```bash
python -m vllm.entrypoints.openai.api_server \
--model=facebook/opt-13b \
--dtype=bfloat16 \
--max_model_len=1024 \
--distributed-executor-backend=ray \
--distributed-executor-backend=mp \
--pipeline-parallel-size=2 \
-tp=8
```

@ -165,6 +165,7 @@ def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
|
||||
# Generate documentation for each parser
|
||||
for stem, parser in parsers.items():
|
||||
doc_path = ARGPARSE_DOC_DIR / f"{stem}.md"
|
||||
with open(doc_path, "w") as f:
|
||||
# Specify encoding for building on Windows
|
||||
with open(doc_path, "w", encoding="utf-8") as f:
|
||||
f.write(parser.format_help())
|
||||
logger.info("Argparse generated: %s", doc_path.relative_to(ROOT_DIR))
|
||||
|
||||
@ -106,7 +106,8 @@ class Example:
|
||||
|
||||
def determine_title(self) -> str:
|
||||
if not self.is_code:
|
||||
with open(self.main_file) as f:
|
||||
# Specify encoding for building on Windows
|
||||
with open(self.main_file, encoding="utf-8") as f:
|
||||
first_line = f.readline().strip()
|
||||
match = re.match(r'^#\s+(?P<title>.+)$', first_line)
|
||||
if match:
|
||||
@ -174,6 +175,7 @@ def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
|
||||
doc_path = EXAMPLE_DOC_DIR / example.category / example_name
|
||||
if not doc_path.parent.exists():
|
||||
doc_path.parent.mkdir(parents=True)
|
||||
with open(doc_path, "w+") as f:
|
||||
# Specify encoding for building on Windows
|
||||
with open(doc_path, "w+", encoding="utf-8") as f:
|
||||
f.write(example.generate())
|
||||
logger.debug("Example generated: %s", doc_path.relative_to(ROOT_DIR))
|
||||
|
||||
@ -440,6 +440,7 @@ These models primarily support the [`LLM.embed`](./pooling_models.md#llmembed) A
|--------------|--------|-------------------|----------------------|---------------------------|---------------------|
| `BertModel`<sup>C</sup> | BERT-based | `BAAI/bge-base-en-v1.5`, `Snowflake/snowflake-arctic-embed-xs`, etc. | | | ✅︎ |
| `Gemma2Model`<sup>C</sup> | Gemma 2-based | `BAAI/bge-multilingual-gemma2`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `Gemma3TextModel`<sup>C</sup> | Gemma 3-based | `google/embeddinggemma-300m`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `GritLM` | GritLM | `parasail-ai/GritLM-7B-vllm`. | ✅︎ | ✅︎ | ✅︎ |
| `GteModel`<sup>C</sup> | Arctic-Embed-2.0-M | `Snowflake/snowflake-arctic-embed-m-v2.0`. | | | ✅︎ |
| `GteNewModel`<sup>C</sup> | mGTE-TRM (see note) | `Alibaba-NLP/gte-multilingual-base`, etc. | | | ✅︎ |

@ -123,12 +123,33 @@ When enabled, vLLM collects load statistics with every forward pass and periodic

### EPLB Parameters

Configure EPLB with the `--eplb-config` argument, which accepts a JSON string. The available keys and their descriptions are:

| Parameter | Description | Default |
|-----------|-------------|---------|
| `--eplb-window-size` | Number of engine steps to track for rebalancing decisions | - |
| `--eplb-step-interval` | Frequency of rebalancing (every N engine steps) | - |
| `--eplb-log-balancedness` | Log balancedness metrics (avg tokens per expert ÷ max tokens per expert) | `false` |
| `--num-redundant-experts` | Additional global experts per EP rank beyond equal distribution | `0` |
| `window_size` | Number of engine steps to track for rebalancing decisions | 1000 |
| `step_interval` | Frequency of rebalancing (every N engine steps) | 3000 |
| `log_balancedness` | Log balancedness metrics (avg tokens per expert ÷ max tokens per expert) | `false` |
| `num_redundant_experts` | Additional global experts per EP rank beyond equal distribution | `0` |

For example:

```bash
vllm serve Qwen/Qwen3-30B-A3B \
  --enable-eplb \
  --eplb-config '{"window_size":1000,"step_interval":3000,"num_redundant_experts":2,"log_balancedness":true}'
```

??? tip "Prefer individual arguments instead of JSON?"

    ```bash
    vllm serve Qwen/Qwen3-30B-A3B \
      --enable-eplb \
      --eplb-config.window_size 1000 \
      --eplb-config.step_interval 3000 \
      --eplb-config.num_redundant_experts 2 \
      --eplb-config.log_balancedness true
    ```

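As a quick aside, the value passed to `--eplb-config` is plain JSON, and the balancedness metric from the table above is just the average token count per expert divided by the maximum. The sketch below illustrates both points; it is not vLLM code, and the per-expert token counts are made up.

```python
import json

# Build the JSON string passed to --eplb-config, using the keys documented above.
eplb_config = {
    "window_size": 1000,
    "step_interval": 3000,
    "num_redundant_experts": 2,
    "log_balancedness": True,
}
print(json.dumps(eplb_config))

# Balancedness as described above: avg tokens per expert / max tokens per expert.
# 1.0 means perfectly even load; smaller values mean some experts are much hotter.
tokens_per_expert = [120, 95, 300, 80]  # made-up per-expert token counts
balancedness = (sum(tokens_per_expert) / len(tokens_per_expert)) / max(tokens_per_expert)
print(f"balancedness = {balancedness:.2f}")
```
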
### Expert Distribution Formula

@ -146,12 +167,10 @@ VLLM_ALL2ALL_BACKEND=pplx VLLM_USE_DEEP_GEMM=1 vllm serve deepseek-ai/DeepSeek-V
  --data-parallel-size 8 \ # Data parallelism
  --enable-expert-parallel \ # Enable EP
  --enable-eplb \ # Enable load balancer
  --eplb-log-balancedness \ # Log balancing metrics
  --eplb-window-size 1000 \ # Track last 1000 engine steps
  --eplb-step-interval 3000 # Rebalance every 3000 steps
  --eplb-config '{"window_size":1000,"step_interval":3000,"num_redundant_experts":2,"log_balancedness":true}'
```

For multi-node deployment, add these EPLB flags to each node's command. We recommend setting `--num-redundant-experts` to 32 in large scale use cases so the most popular experts are always available.
For multi-node deployment, add these EPLB flags to each node's command. In large-scale use cases, we recommend setting `num_redundant_experts` to 32 via `--eplb-config '{"num_redundant_experts":32}'` so the most popular experts are always available.

## Disaggregated Serving (Prefill/Decode Split)

@ -66,7 +66,7 @@ Ray is a distributed computing framework for scaling Python programs. Multi-node

vLLM uses Ray to manage the distributed execution of tasks across multiple nodes and control where execution happens.

Ray also offers high-level APIs for large-scale [offline batch inference](https://docs.ray.io/en/latest/data/working-with-llms.html) and [online serving](https://docs.ray.io/en/latest/serve/llm/serving-llms.html) that can leverage vLLM as the engine. These APIs add production-grade fault tolerance, scaling, and distributed observability to vLLM workloads.
Ray also offers high-level APIs for large-scale [offline batch inference](https://docs.ray.io/en/latest/data/working-with-llms.html) and [online serving](https://docs.ray.io/en/latest/serve/llm) that can leverage vLLM as the engine. These APIs add production-grade fault tolerance, scaling, and distributed observability to vLLM workloads.

For details, see the [Ray documentation](https://docs.ray.io/en/latest/index.html).

@ -104,7 +104,7 @@ Note that `VLLM_HOST_IP` is unique for each worker. Keep the shells running thes
From any node, enter a container and run `ray status` and `ray list nodes` to verify that Ray finds the expected number of nodes and GPUs.

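The same sanity check can be done from Python with Ray's public API; the sketch below is only an illustration, and the expected node and GPU counts are placeholders for your own cluster.

```python
import ray

# Connect to the Ray cluster started above.
ray.init(address="auto")

# Placeholder expectations; replace with your own cluster size.
expected_nodes, expected_gpus = 2, 8

alive_nodes = [node for node in ray.nodes() if node["Alive"]]
total_gpus = ray.cluster_resources().get("GPU", 0)

print(f"nodes: {len(alive_nodes)} (expected {expected_nodes})")
print(f"GPUs:  {total_gpus} (expected {expected_gpus})")
```
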
!!! tip
    Alternatively, set up the Ray cluster using KubeRay. For more information, see [KubeRay vLLM documentation](https://docs.ray.io/en/latest/cluster/kubernetes/examples/vllm-rayservice.html).
    Alternatively, set up the Ray cluster using KubeRay. For more information, see [KubeRay vLLM documentation](https://docs.ray.io/en/latest/cluster/kubernetes/examples/rayserve-llm-example.html).

### Running vLLM on a Ray cluster

@ -143,5 +143,5 @@ outputs = llm.chat(messages, sampling_params, tools=tools)

print(outputs[0].outputs[0].text.strip())
# yields
# 'The weather in Dallas, TX is 85 degrees fahrenheit. '
# 'The weather in Dallas, TX is 85 degrees Fahrenheit. '
# 'It is partly cloudly, with highs in the 90's.'

@ -1,49 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
|
||||
# Sample prompts.
|
||||
prompts = [
|
||||
"Hello, my name is",
|
||||
"The president of the United States is",
|
||||
"The capital of France is",
|
||||
"The future of AI is",
|
||||
]
|
||||
# Create a sampling params object.
|
||||
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
|
||||
|
||||
|
||||
def main():
|
||||
# Create an LLM.
|
||||
llm = LLM(
|
||||
model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
|
||||
max_num_seqs=8,
|
||||
# The max_model_len and block_size arguments are required to be same as
|
||||
# max sequence length when targeting neuron device.
|
||||
# Currently, this is a known limitation in continuous batching support
|
||||
# in transformers-neuronx.
|
||||
# TODO(liangfu): Support paged-attention in transformers-neuronx.
|
||||
max_model_len=1024,
|
||||
block_size=1024,
|
||||
# ruff: noqa: E501
|
||||
# The device can be automatically detected when AWS Neuron SDK is installed.
|
||||
# The device argument can be either unspecified for automated detection,
|
||||
# or explicitly assigned.
|
||||
device="neuron",
|
||||
tensor_parallel_size=2,
|
||||
)
|
||||
# Generate texts from the prompts. The output is a list of RequestOutput objects
|
||||
# that contain the prompt, generated text, and other information.
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
# Print the outputs.
|
||||
print("-" * 50)
|
||||
for output in outputs:
|
||||
prompt = output.prompt
|
||||
generated_text = output.outputs[0].text
|
||||
print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
|
||||
print("-" * 50)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@ -1,61 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
This example shows how to run offline inference with an EAGLE speculative
|
||||
decoding model on neuron. To use EAGLE speculative decoding, you must use
|
||||
a draft model that is specifically fine-tuned for EAGLE speculation.
|
||||
Additionally, to use EAGLE with NxD Inference, the draft model must include
|
||||
the LM head weights from the target model. These weights are shared between
|
||||
the draft and target model.
|
||||
"""
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
|
||||
# Sample prompts.
|
||||
prompts = [
|
||||
"What is annapurna labs?",
|
||||
]
|
||||
|
||||
|
||||
def main():
|
||||
# Create a sampling params object.
|
||||
sampling_params = SamplingParams(top_k=1, max_tokens=500, ignore_eos=True)
|
||||
|
||||
# Create an LLM.
|
||||
llm = LLM(
|
||||
model="/home/ubuntu/model_hf/Meta-Llama-3.1-70B-Instruct",
|
||||
speculative_config={
|
||||
"model": "/home/ubuntu/model_hf/Llama-3.1-70B-Instruct-EAGLE-Draft",
|
||||
"num_speculative_tokens": 5,
|
||||
"max_model_len": 2048,
|
||||
},
|
||||
max_num_seqs=4,
|
||||
# The max_model_len and block_size arguments are required to be same as
|
||||
# max sequence length when targeting neuron device.
|
||||
# Currently, this is a known limitation in continuous batching support
|
||||
# in neuronx-distributed-inference.
|
||||
max_model_len=2048,
|
||||
block_size=2048,
|
||||
# The device can be automatically detected when AWS Neuron SDK is installed.
|
||||
# The device argument can be either unspecified for automated detection,
|
||||
# or explicitly assigned.
|
||||
device="neuron",
|
||||
tensor_parallel_size=32,
|
||||
override_neuron_config={
|
||||
"enable_eagle_speculation": True,
|
||||
"enable_fused_speculation": True,
|
||||
},
|
||||
)
|
||||
|
||||
# Generate texts from the prompts. The output is a list of RequestOutput objects
|
||||
# that contain the prompt, generated text, and other information.
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
# Print the outputs.
|
||||
for output in outputs:
|
||||
prompt = output.prompt
|
||||
generated_text = output.outputs[0].text
|
||||
print(f"Prompt: {prompt!r}, \n\n\n Generated text: {generated_text!r}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@ -1,63 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import os
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
|
||||
# creates XLA hlo graphs for all the context length buckets.
|
||||
os.environ["NEURON_CONTEXT_LENGTH_BUCKETS"] = "128,512,1024,2048"
|
||||
# creates XLA hlo graphs for all the token gen buckets.
|
||||
os.environ["NEURON_TOKEN_GEN_BUCKETS"] = "128,512,1024,2048"
|
||||
# Quantizes neuron model weight to int8 ,
|
||||
# The default config for quantization is int8 dtype.
|
||||
os.environ["NEURON_QUANT_DTYPE"] = "s8"
|
||||
|
||||
# Sample prompts.
|
||||
prompts = [
|
||||
"Hello, my name is",
|
||||
"The president of the United States is",
|
||||
"The capital of France is",
|
||||
"The future of AI is",
|
||||
]
|
||||
# Create a sampling params object.
|
||||
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
|
||||
|
||||
|
||||
def main():
|
||||
# Create an LLM.
|
||||
llm = LLM(
|
||||
model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
|
||||
max_num_seqs=8,
|
||||
# The max_model_len and block_size arguments are required to be same as
|
||||
# max sequence length when targeting neuron device.
|
||||
# Currently, this is a known limitation in continuous batching support
|
||||
# in transformers-neuronx.
|
||||
# TODO(liangfu): Support paged-attention in transformers-neuronx.
|
||||
max_model_len=2048,
|
||||
block_size=2048,
|
||||
# ruff: noqa: E501
|
||||
# The device can be automatically detected when AWS Neuron SDK is installed.
|
||||
# The device argument can be either unspecified for automated detection,
|
||||
# or explicitly assigned.
|
||||
device="neuron",
|
||||
quantization="neuron_quant",
|
||||
override_neuron_config={
|
||||
"cast_logits_dtype": "bfloat16",
|
||||
},
|
||||
tensor_parallel_size=2,
|
||||
)
|
||||
# Generate texts from the prompts. The output is a list of RequestOutput objects
|
||||
# that contain the prompt, generated text, and other information.
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
# Print the outputs.
|
||||
print("-" * 50)
|
||||
for output in outputs:
|
||||
prompt = output.prompt
|
||||
generated_text = output.outputs[0].text
|
||||
print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
|
||||
print("-" * 50)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@ -1,110 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import requests
|
||||
import torch
|
||||
from neuronx_distributed_inference.models.mllama.utils import add_instruct
|
||||
from PIL import Image
|
||||
|
||||
from vllm import LLM, SamplingParams, TextPrompt
|
||||
|
||||
|
||||
def get_image(image_url):
|
||||
image = Image.open(requests.get(image_url, stream=True).raw)
|
||||
return image
|
||||
|
||||
|
||||
# Model Inputs
|
||||
PROMPTS = [
|
||||
"What is in this image? Tell me a story",
|
||||
"What is the recipe of mayonnaise in two sentences?",
|
||||
"Describe this image",
|
||||
"What is the capital of Italy famous for?",
|
||||
]
|
||||
IMAGES = [
|
||||
get_image(
|
||||
"https://images.pexels.com/photos/1108099/pexels-photo-1108099.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=500"
|
||||
),
|
||||
None,
|
||||
get_image(
|
||||
"https://images.pexels.com/photos/1108099/pexels-photo-1108099.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=500"
|
||||
),
|
||||
None,
|
||||
]
|
||||
SAMPLING_PARAMS = [
|
||||
dict(top_k=1, temperature=1.0, top_p=1.0, max_tokens=16)
|
||||
for _ in range(len(PROMPTS))
|
||||
]
|
||||
|
||||
|
||||
def get_VLLM_mllama_model_inputs(prompt, single_image, sampling_params):
|
||||
# Prepare all inputs for mllama generation, including:
|
||||
# 1. put text prompt into instruct chat template
|
||||
# 2. compose single text and single image prompt into Vllm's prompt class
|
||||
# 3. prepare sampling parameters
|
||||
input_image = single_image
|
||||
has_image = torch.tensor([1])
|
||||
if isinstance(single_image, torch.Tensor) and single_image.numel() == 0:
|
||||
has_image = torch.tensor([0])
|
||||
|
||||
instruct_prompt = add_instruct(prompt, has_image)
|
||||
inputs = TextPrompt(prompt=instruct_prompt)
|
||||
|
||||
if input_image is not None:
|
||||
inputs["multi_modal_data"] = {"image": input_image}
|
||||
|
||||
sampling_params = SamplingParams(**sampling_params)
|
||||
return inputs, sampling_params
|
||||
|
||||
|
||||
def print_outputs(outputs):
|
||||
# Print the outputs.
|
||||
for output in outputs:
|
||||
prompt = output.prompt
|
||||
generated_text = output.outputs[0].text
|
||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||
|
||||
|
||||
def main():
|
||||
assert (
|
||||
len(PROMPTS) == len(IMAGES) == len(SAMPLING_PARAMS)
|
||||
), f"""Text, image prompts and sampling parameters should have the
|
||||
same batch size; but got {len(PROMPTS)}, {len(IMAGES)},
|
||||
and {len(SAMPLING_PARAMS)}"""
|
||||
|
||||
# Create an LLM.
|
||||
llm = LLM(
|
||||
model="meta-llama/Llama-3.2-11B-Vision-Instruct",
|
||||
max_num_seqs=1,
|
||||
max_model_len=4096,
|
||||
block_size=4096,
|
||||
device="neuron",
|
||||
tensor_parallel_size=32,
|
||||
override_neuron_config={
|
||||
"sequence_parallel_enabled": False,
|
||||
"skip_warmup": True,
|
||||
"save_sharded_checkpoint": True,
|
||||
"on_device_sampling_config": {
|
||||
"global_topk": 1,
|
||||
"dynamic": False,
|
||||
"deterministic": False,
|
||||
},
|
||||
},
|
||||
)
|
||||
|
||||
batched_inputs = []
|
||||
batched_sample_params = []
|
||||
for pmpt, img, params in zip(PROMPTS, IMAGES, SAMPLING_PARAMS):
|
||||
inputs, sampling_params = get_VLLM_mllama_model_inputs(pmpt, img, params)
|
||||
# test batch-size = 1
|
||||
outputs = llm.generate(inputs, sampling_params)
|
||||
print_outputs(outputs)
|
||||
batched_inputs.append(inputs)
|
||||
batched_sample_params.append(sampling_params)
|
||||
|
||||
# test batch-size = 4
|
||||
outputs = llm.generate(batched_inputs, batched_sample_params)
|
||||
print_outputs(outputs)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@ -1,64 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
This example shows how to run offline inference with a speculative
|
||||
decoding model on neuron.
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
|
||||
# Sample prompts.
|
||||
prompts = [
|
||||
"Hello, I am a language model and I can help",
|
||||
"The president of the United States is",
|
||||
"The capital of France is",
|
||||
]
|
||||
|
||||
|
||||
def config_buckets():
|
||||
"""Configure context length and token gen buckets."""
|
||||
# creates XLA hlo graphs for all the context length buckets.
|
||||
os.environ["NEURON_CONTEXT_LENGTH_BUCKETS"] = "128,512,1024,2048"
|
||||
# creates XLA hlo graphs for all the token gen buckets.
|
||||
os.environ["NEURON_TOKEN_GEN_BUCKETS"] = "128,512,1024,2048"
|
||||
|
||||
|
||||
def initialize_llm():
|
||||
"""Create an LLM with speculative decoding."""
|
||||
return LLM(
|
||||
model="openlm-research/open_llama_7b",
|
||||
speculative_config={
|
||||
"model": "openlm-research/open_llama_3b",
|
||||
"num_speculative_tokens": 4,
|
||||
"max_model_len": 2048,
|
||||
},
|
||||
max_num_seqs=4,
|
||||
max_model_len=2048,
|
||||
block_size=2048,
|
||||
device="neuron",
|
||||
tensor_parallel_size=32,
|
||||
)
|
||||
|
||||
|
||||
def process_requests(llm: LLM, sampling_params: SamplingParams):
|
||||
"""Generate texts from prompts and print them."""
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
for output in outputs:
|
||||
prompt = output.prompt
|
||||
generated_text = output.outputs[0].text
|
||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||
|
||||
|
||||
def main():
|
||||
"""Main function that sets up the llm and processes prompts."""
|
||||
config_buckets()
|
||||
llm = initialize_llm()
|
||||
# Create a sampling params object.
|
||||
sampling_params = SamplingParams(max_tokens=100, top_k=1)
|
||||
process_requests(llm, sampling_params)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@ -18,7 +18,7 @@ from vllm.pooling_params import PoolingParams
|
||||
|
||||
def main():
|
||||
torch.set_default_dtype(torch.float16)
|
||||
image_url = "https://huggingface.co/christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM/resolve/main/India_900498_S2Hand.tif" # noqa: E501
|
||||
image_url = "https://huggingface.co/christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM/resolve/main/valencia_example_2024-10-26.tiff" # noqa: E501
|
||||
|
||||
img_prompt = dict(
|
||||
data=image_url,
|
||||
@ -36,7 +36,7 @@ def main():
|
||||
# to avoid the model going OOM.
|
||||
# The maximum number depends on the available GPU memory
|
||||
max_num_seqs=32,
|
||||
io_processor_plugin="prithvi_to_tiff_india",
|
||||
io_processor_plugin="prithvi_to_tiff",
|
||||
model_impl="terratorch",
|
||||
)
|
||||
|
||||
|
||||
@ -18,11 +18,11 @@ import requests
|
||||
# --model-impl terratorch
|
||||
# --task embed --trust-remote-code
|
||||
# --skip-tokenizer-init --enforce-eager
|
||||
# --io-processor-plugin prithvi_to_tiff_india
|
||||
# --io-processor-plugin prithvi_to_tiff
|
||||
|
||||
|
||||
def main():
|
||||
image_url = "https://huggingface.co/christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM/resolve/main/India_900498_S2Hand.tif" # noqa: E501
|
||||
image_url = "https://huggingface.co/christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM/resolve/main/valencia_example_2024-10-26.tiff" # noqa: E501
|
||||
server_endpoint = "http://localhost:8000/pooling"
|
||||
|
||||
request_payload_url = {
|
||||
|
||||
@ -20,8 +20,7 @@ prometheus-fastapi-instrumentator >= 7.0.0
tiktoken >= 0.6.0 # Required for DBRX tokenizer
lm-format-enforcer == 0.11.3
llguidance >= 0.7.11, < 0.8.0; platform_machine == "x86_64" or platform_machine == "arm64" or platform_machine == "aarch64"
outlines_core == 0.2.10 ; platform_machine != "s390x"
outlines == 0.1.11 ; platform_machine == "s390x"
outlines_core == 0.2.10
# required for outlines backend disk cache
diskcache == 5.6.3
lark == 1.2.2

@ -1,9 +0,0 @@
# Common dependencies
-r common.txt

# Dependencies for Neuron devices
packaging>=24.2
setuptools>=77.0.3,<80.0.0
torch-neuronx >= 2.5.0
neuronx-cc>=2.0.0a0
torchvision # Required for Llama3.2 multimodal image preprocessing

@ -54,4 +54,4 @@ runai-model-streamer-s3==0.11.0
fastsafetensors>=0.1.10
pydantic>=2.10 # 2.9 leads to error on python 3.10
decord==0.6.0
terratorch==1.1rc3 # required for PrithviMAE test
terratorch @ git+https://github.com/IBM/terratorch.git@1.1.rc3 # required for PrithviMAE test

@ -1042,7 +1042,7 @@ tensorboardx==2.6.4
    # via lightning
tensorizer==2.10.1
    # via -r requirements/test.in
terratorch==1.1rc3
terratorch @ git+https://github.com/IBM/terratorch.git@07184fcf91a1324f831ff521dd238d97fe350e3e
    # via -r requirements/test.in
threadpoolctl==3.5.0
    # via scikit-learn

@ -10,10 +10,10 @@ wheel
jinja2>=3.1.6
datasets # for benchmark scripts
numba == 0.60.0 # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding
--extra-index-url=https://download.pytorch.org/whl/xpu
nixl==0.3.0 # for PD disaggregation
torch==2.8.0+xpu
torchaudio
torchvision
pytorch-triton-xpu
--extra-index-url=https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
intel-extension-for-pytorch==2.8.10+xpu
--extra-index-url=https://download.pytorch.org/whl/xpu

intel-extension-for-pytorch @ https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/intel_extension_for_pytorch-2.8.10.post0%2Bxpu-cp312-cp312-linux_x86_64.whl

setup.py (36 changed lines)
@ -413,8 +413,7 @@ def _no_device() -> bool:
|
||||
|
||||
def _is_cuda() -> bool:
|
||||
has_cuda = torch.version.cuda is not None
|
||||
return (VLLM_TARGET_DEVICE == "cuda" and has_cuda
|
||||
and not (_is_neuron() or _is_tpu()))
|
||||
return (VLLM_TARGET_DEVICE == "cuda" and has_cuda and not _is_tpu())
|
||||
|
||||
|
||||
def _is_hip() -> bool:
|
||||
@ -422,10 +421,6 @@ def _is_hip() -> bool:
|
||||
or VLLM_TARGET_DEVICE == "rocm") and torch.version.hip is not None
|
||||
|
||||
|
||||
def _is_neuron() -> bool:
|
||||
return VLLM_TARGET_DEVICE == "neuron"
|
||||
|
||||
|
||||
def _is_tpu() -> bool:
|
||||
return VLLM_TARGET_DEVICE == "tpu"
|
||||
|
||||
@ -470,25 +465,6 @@ def get_rocm_version():
|
||||
return None
|
||||
|
||||
|
||||
def get_neuronxcc_version():
|
||||
import sysconfig
|
||||
site_dir = sysconfig.get_paths()["purelib"]
|
||||
version_file = os.path.join(site_dir, "neuronxcc", "version",
|
||||
"__init__.py")
|
||||
|
||||
# Check if the command was executed successfully
|
||||
with open(version_file) as fp:
|
||||
content = fp.read()
|
||||
|
||||
# Extract the version using a regular expression
|
||||
match = re.search(r"__version__ = '(\S+)'", content)
|
||||
if match:
|
||||
# Return the version string
|
||||
return match.group(1)
|
||||
else:
|
||||
raise RuntimeError("Could not find Neuron version in the output")
|
||||
|
||||
|
||||
def get_nvcc_cuda_version() -> Version:
|
||||
"""Get the CUDA version from nvcc.
|
||||
|
||||
@ -541,12 +517,6 @@ def get_vllm_version() -> str:
|
||||
rocm_version = get_rocm_version() or torch.version.hip
|
||||
if rocm_version and rocm_version != MAIN_CUDA_VERSION:
|
||||
version += f"{sep}rocm{rocm_version.replace('.', '')[:3]}"
|
||||
elif _is_neuron():
|
||||
# Get the Neuron version
|
||||
neuron_version = str(get_neuronxcc_version())
|
||||
if neuron_version != MAIN_CUDA_VERSION:
|
||||
neuron_version_str = neuron_version.replace(".", "")[:3]
|
||||
version += f"{sep}neuron{neuron_version_str}"
|
||||
elif _is_tpu():
|
||||
version += f"{sep}tpu"
|
||||
elif _is_cpu():
|
||||
@ -591,8 +561,6 @@ def get_requirements() -> list[str]:
|
||||
requirements = modified_requirements
|
||||
elif _is_hip():
|
||||
requirements = _read_requirements("rocm.txt")
|
||||
elif _is_neuron():
|
||||
requirements = _read_requirements("neuron.txt")
|
||||
elif _is_tpu():
|
||||
requirements = _read_requirements("tpu.txt")
|
||||
elif _is_cpu():
|
||||
@ -601,7 +569,7 @@ def get_requirements() -> list[str]:
|
||||
requirements = _read_requirements("xpu.txt")
|
||||
else:
|
||||
raise ValueError(
|
||||
"Unsupported platform, please use CUDA, ROCm, Neuron, or CPU.")
|
||||
"Unsupported platform, please use CUDA, ROCm, or CPU.")
|
||||
return requirements
|
||||
|
||||
|
||||
|
||||
@ -61,6 +61,16 @@ backend_configs = {
|
||||
"cudagraph_mode": "FULL_AND_PIECEWISE",
|
||||
},
|
||||
specific_gpu_arch=(9, 0)),
|
||||
# FlashAttention MLA on Hopper
|
||||
"FlashAttentionMLA":
|
||||
BackendConfig(name="FlashAttentionMLA",
|
||||
env_vars={
|
||||
"VLLM_ATTENTION_BACKEND": "FLASH_ATTN_MLA",
|
||||
},
|
||||
comp_config={
|
||||
"cudagraph_mode": "FULL_DECODE_ONLY",
|
||||
},
|
||||
specific_gpu_arch=(9, 0)),
|
||||
# Cutlass MLA on Blackwell
|
||||
"CutlassMLA":
|
||||
BackendConfig(
|
||||
@ -102,7 +112,7 @@ backend_configs = {
|
||||
test_params_full_cudagraph = []
|
||||
|
||||
# deepseek-ai/DeepSeek-V2-Lite with MLA
|
||||
MLA_backends = ["FlashMLA", "CutlassMLA"]
|
||||
MLA_backends = ["FlashMLA", "FlashAttentionMLA", "CutlassMLA"]
|
||||
for mla_backend in MLA_backends:
|
||||
test_params_full_cudagraph.append(
|
||||
pytest.param(
|
||||
|
||||
@ -62,8 +62,12 @@ class TestSetting:
|
||||
TestSetting(
|
||||
model="BAAI/bge-multilingual-gemma2",
|
||||
model_args=[
|
||||
"--runner", "pooling", "--dtype", "bfloat16",
|
||||
"--max-model-len", "2048"
|
||||
"--runner",
|
||||
"pooling",
|
||||
"--dtype",
|
||||
"bfloat16",
|
||||
"--max-model-len",
|
||||
"2048",
|
||||
],
|
||||
pp_size=1,
|
||||
tp_size=1,
|
||||
@ -71,17 +75,15 @@ class TestSetting:
|
||||
method="encode",
|
||||
fullgraph=True,
|
||||
),
|
||||
# TODO: bert models are not supported in V1 yet
|
||||
# # encoder-based embedding model (BERT)
|
||||
# TestSetting(
|
||||
# model="BAAI/bge-base-en-v1.5",
|
||||
# model_args=["--runner", "pooling"],
|
||||
# pp_size=1,
|
||||
# tp_size=1,
|
||||
# attn_backend="XFORMERS",
|
||||
# method="encode",
|
||||
# fullgraph=True,
|
||||
# ),
|
||||
TestSetting(
|
||||
model="BAAI/bge-base-en-v1.5",
|
||||
model_args=["--runner", "pooling"],
|
||||
pp_size=1,
|
||||
tp_size=1,
|
||||
attn_backend="FLASH_ATTN",
|
||||
method="encode",
|
||||
fullgraph=True,
|
||||
),
|
||||
# vision language model
|
||||
TestSetting(
|
||||
model="microsoft/Phi-3.5-vision-instruct",
|
||||
@ -92,7 +94,8 @@ class TestSetting:
|
||||
method="generate_with_image",
|
||||
fullgraph=False,
|
||||
),
|
||||
])
|
||||
],
|
||||
)
|
||||
def test_compile_correctness(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
test_setting: TestSetting,
|
||||
|
||||
@ -1,9 +1,12 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from typing import cast
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
import vllm.envs as envs
|
||||
from tests.kernels.quantization.nvfp4_utils import quant_nvfp4_tensor
|
||||
from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant
|
||||
# yapf conflicts with isort for this block
|
||||
# yapf: disable
|
||||
@ -64,24 +67,27 @@ class TestSiluMulFp8QuantModel(torch.nn.Module):
|
||||
|
||||
class TestSiluMulNvfp4QuantModel(torch.nn.Module):
|
||||
|
||||
def __init__(self, hidden_size: int, **kwargs):
|
||||
def __init__(self, hidden_size: int, x: torch.Tensor, **kwargs):
|
||||
super().__init__()
|
||||
self.silu_and_mul = SiluAndMul()
|
||||
self.w = torch.randint(256, (hidden_size, hidden_size // 2),
|
||||
dtype=FP4_DTYPE)
|
||||
self.wscale = torch.randn(hidden_size,
|
||||
hidden_size // 16).to(dtype=FP8_DTYPE)
|
||||
self.wscale2 = torch.rand(1, dtype=torch.float32)
|
||||
self.scale = torch.rand(1, dtype=torch.float32)
|
||||
|
||||
# create nvfp4 weight
|
||||
w = torch.rand((hidden_size, hidden_size))
|
||||
self.w, self.w_block_scale, self.w_global_scale = quant_nvfp4_tensor(w)
|
||||
|
||||
# get global scale offline
|
||||
_, _, self.y_global_scale = quant_nvfp4_tensor(self.silu_and_mul(x))
|
||||
|
||||
self.alpha = 1.0 / (self.w_global_scale * self.y_global_scale)
|
||||
|
||||
def forward(self, x):
|
||||
y = self.silu_and_mul(x)
|
||||
y_quant, y_block_scale = scaled_fp4_quant(y, 1 / self.scale)
|
||||
y_quant, y_block_scale = scaled_fp4_quant(y, self.y_global_scale)
|
||||
out = cutlass_scaled_fp4_mm(a=y_quant,
|
||||
b=self.w,
|
||||
block_scale_a=y_block_scale,
|
||||
block_scale_b=self.wscale,
|
||||
alpha=self.scale * self.wscale2,
|
||||
block_scale_b=self.w_block_scale,
|
||||
alpha=self.alpha,
|
||||
out_dtype=y.dtype)
|
||||
return out
|
||||
|
||||
@ -95,8 +101,9 @@ class TestSiluMulNvfp4QuantModel(torch.nn.Module):
|
||||
@pytest.mark.parametrize("num_tokens", [64])
|
||||
@pytest.mark.parametrize("hidden_size", [128])
|
||||
@pytest.mark.parametrize(
|
||||
"model_class", [TestSiluMulFp8QuantModel, TestSiluMulNvfp4QuantModel]
|
||||
if is_nvfp4_supported() else [TestSiluMulFp8QuantModel])
|
||||
"model_class",
|
||||
cast(list[type], [TestSiluMulFp8QuantModel, TestSiluMulNvfp4QuantModel]
|
||||
if is_nvfp4_supported() else [TestSiluMulFp8QuantModel]))
|
||||
# cuda_force_torch used to test torch code path on platforms that
|
||||
# cutlass_fp8_supported() == True.
|
||||
@pytest.mark.parametrize("cuda_force_torch",
|
||||
@ -111,6 +118,8 @@ def test_fusion_silu_and_mul_quant(num_tokens, hidden_size, model_class,
|
||||
torch.set_default_device("cuda")
|
||||
torch.set_default_dtype(torch.float16)
|
||||
|
||||
x = torch.rand(num_tokens, hidden_size * 2)
|
||||
|
||||
# Reshape pass is needed for the fusion pass to work
|
||||
config = VllmConfig()
|
||||
config.compilation_config = CompilationConfig(
|
||||
@ -118,10 +127,11 @@ def test_fusion_silu_and_mul_quant(num_tokens, hidden_size, model_class,
|
||||
fusion_pass = ActivationQuantFusionPass(config)
|
||||
|
||||
backend = TestBackend(NoOpEliminationPass(config), fusion_pass)
|
||||
model = model_class(hidden_size, cuda_force_torch)
|
||||
model = model_class(hidden_size=hidden_size,
|
||||
cuda_force_torch=cuda_force_torch,
|
||||
x=x)
|
||||
|
||||
# First dimension dynamic
|
||||
x = torch.rand(num_tokens, hidden_size * 2)
|
||||
torch._dynamo.mark_dynamic(x, 0)
|
||||
|
||||
result = model(x)
|
||||
@ -130,10 +140,15 @@ def test_fusion_silu_and_mul_quant(num_tokens, hidden_size, model_class,
|
||||
result2 = model2(x)
|
||||
|
||||
# Check that it gives the same answer
|
||||
if model_class == TestSiluMulFp8QuantModel:
|
||||
atol, rtol = 1e-3, 1e-3
|
||||
elif model_class == TestSiluMulNvfp4QuantModel:
|
||||
atol, rtol = 1e-1, 1e-1
|
||||
|
||||
torch.testing.assert_close(result[0].to(dtype=torch.float16),
|
||||
result2[0].to(dtype=torch.float16),
|
||||
atol=1e-3,
|
||||
rtol=1e-3)
|
||||
atol=atol,
|
||||
rtol=rtol)
|
||||
|
||||
# In pre-nodes, quant op should be present and fused kernels should not
|
||||
backend.check_before_ops(model.ops_in_model_before())
|
||||
|
||||
@ -8,7 +8,7 @@ import msgspec.msgpack
|
||||
import pytest
|
||||
import zmq
|
||||
|
||||
from vllm.config import KVEventsConfig
|
||||
from vllm.config.kv_events import KVEventsConfig
|
||||
from vllm.distributed.kv_events import EventPublisherFactory
|
||||
|
||||
from .test_events import SampleBatch
|
||||
|
||||
tests/distributed/test_context_parallel.py (new file, 263 lines)
@ -0,0 +1,263 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
WARNING: This test runs in both single-node (4 GPUs) and multi-node
|
||||
(2 node with 2 GPUs each) modes. If the test only uses 2 GPUs, it is
|
||||
important to set the distributed backend to "mp" to avoid Ray scheduling
|
||||
all workers in a node other than the head node, which can cause the test
|
||||
to fail.
|
||||
"""
|
||||
import json
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
from typing import Literal, NamedTuple, Optional
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.config import RunnerOption
|
||||
from vllm.logger import init_logger
|
||||
|
||||
from ..models.registry import HF_EXAMPLE_MODELS
|
||||
from ..utils import compare_two_settings, create_new_process_for_each_test
|
||||
|
||||
logger = init_logger("test_context_parallel")
|
||||
|
||||
VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1"
|
||||
|
||||
|
||||
class ParallelSetup(NamedTuple):
|
||||
tp_size: int
|
||||
pp_size: int
|
||||
dcp_size: int
|
||||
eager_mode: bool
|
||||
chunked_prefill: bool
|
||||
|
||||
|
||||
class CPTestOptions(NamedTuple):
|
||||
multi_node_only: bool
|
||||
load_format: Optional[str] = None
|
||||
|
||||
|
||||
@dataclass
|
||||
class CPTestSettings:
|
||||
parallel_setups: list[ParallelSetup]
|
||||
# NOTE: the length of distributed_backends and
|
||||
# vllm_major_versions should be the same, and they
|
||||
# are first zipped together to iterate over all
|
||||
# test settings.
|
||||
distributed_backends: list[str]
|
||||
# vllm major version: "0" for V0, "1" for V1
|
||||
vllm_major_versions: list[str]
|
||||
runner: RunnerOption
|
||||
test_options: CPTestOptions
|
||||
|
||||
def __post_init__(self):
|
||||
if len(self.distributed_backends) != len(self.vllm_major_versions):
|
||||
raise ValueError(
|
||||
f"Length mismatch: distributed_backends "
|
||||
f"({len(self.distributed_backends)}) != "
|
||||
f"vllm_major_versions ({len(self.vllm_major_versions)})")
|
||||
|
||||
@staticmethod
|
||||
def detailed(
|
||||
*,
|
||||
tp_base: int = 4,
|
||||
pp_base: int = 1,
|
||||
dcp_base: int = 1,
|
||||
multi_node_only: bool = False,
|
||||
runner: RunnerOption = "auto",
|
||||
load_format: Optional[str] = None,
|
||||
):
|
||||
parallel_setups = []
|
||||
for eager_mode_val in [False]:
|
||||
for pp_multiplier in [1]:
|
||||
for dcp_multiplier in [2, 4]:
|
||||
for chunked_prefill_val in [True]:
|
||||
parallel_setups.append(
|
||||
ParallelSetup(tp_size=tp_base,
|
||||
pp_size=pp_multiplier * pp_base,
|
||||
dcp_size=dcp_multiplier * dcp_base,
|
||||
eager_mode=eager_mode_val,
|
||||
chunked_prefill=chunked_prefill_val))
|
||||
return CPTestSettings(
|
||||
parallel_setups=parallel_setups,
|
||||
distributed_backends=["mp"],
|
||||
vllm_major_versions=["1"],
|
||||
runner=runner,
|
||||
test_options=CPTestOptions(multi_node_only=multi_node_only,
|
||||
load_format=load_format),
|
||||
)
|
||||
|
||||
def iter_params(self, model_id: str):
|
||||
opts = self.test_options
|
||||
|
||||
for parallel_setup in self.parallel_setups:
|
||||
for backend, vllm_major_version in zip(self.distributed_backends,
|
||||
self.vllm_major_versions):
|
||||
yield (model_id, parallel_setup, backend, vllm_major_version,
|
||||
self.runner, opts)
|
||||
|
||||
|
||||
def _compare_cp_with_tp(
|
||||
model_id: str,
|
||||
parallel_setup: ParallelSetup,
|
||||
distributed_backend: str,
|
||||
vllm_major_version: str,
|
||||
runner: RunnerOption,
|
||||
test_options: CPTestOptions,
|
||||
num_gpus_available: int,
|
||||
*,
|
||||
method: Literal["generate"],
|
||||
is_multimodal: bool,
|
||||
):
|
||||
(
|
||||
tp_size,
|
||||
pp_size,
|
||||
dcp_size,
|
||||
eager_mode,
|
||||
chunked_prefill,
|
||||
) = parallel_setup
|
||||
|
||||
multi_node_only, load_format = test_options
|
||||
|
||||
model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id)
|
||||
model_info.check_transformers_version(on_fail="skip")
|
||||
|
||||
trust_remote_code = model_info.trust_remote_code
|
||||
tokenizer_mode = model_info.tokenizer_mode
|
||||
hf_overrides = model_info.hf_overrides
|
||||
|
||||
if load_format == "dummy":
|
||||
# Avoid OOM
|
||||
text_overrides = {
|
||||
"num_hidden_layers": 4,
|
||||
"hidden_size": 512,
|
||||
"intermediate_size": 800,
|
||||
"num_attention_heads": 4,
|
||||
"num_key_value_heads": 1,
|
||||
}
|
||||
|
||||
if is_multimodal:
|
||||
hf_overrides.update({"text_config": text_overrides})
|
||||
else:
|
||||
hf_overrides.update(text_overrides)
|
||||
else:
|
||||
model_info.check_available_online(on_fail="skip")
|
||||
|
||||
if num_gpus_available < tp_size * pp_size:
|
||||
pytest.skip(f"Need at least {tp_size} x {pp_size} GPUs")
|
||||
if VLLM_MULTI_NODE and distributed_backend == "mp":
|
||||
pytest.skip("Skipping multi-node pipeline parallel test for "
|
||||
"multiprocessing distributed backend")
|
||||
if multi_node_only and not VLLM_MULTI_NODE:
|
||||
pytest.skip("Not in multi-node setting")
|
||||
|
||||
common_args = [
|
||||
# use half precision for speed and memory savings in CI environment
|
||||
"--dtype",
|
||||
"bfloat16",
|
||||
"--max-model-len",
|
||||
"2048",
|
||||
"--max-num-seqs",
|
||||
"8",
|
||||
]
|
||||
if chunked_prefill:
|
||||
common_args.append("--enable-chunked-prefill")
|
||||
if eager_mode:
|
||||
common_args.append("--enforce-eager")
|
||||
if runner != "auto":
|
||||
common_args.extend(["--runner", runner])
|
||||
if trust_remote_code:
|
||||
common_args.append("--trust-remote-code")
|
||||
if tokenizer_mode:
|
||||
common_args.extend(["--tokenizer-mode", tokenizer_mode])
|
||||
if load_format:
|
||||
common_args.extend(["--load-format", load_format])
|
||||
if hf_overrides:
|
||||
common_args.extend(["--hf-overrides", json.dumps(hf_overrides)])
|
||||
|
||||
cp_env = tp_env = {
|
||||
"VLLM_USE_V1":
|
||||
vllm_major_version, # Note(hc): DCP only support V1 engine only
|
||||
}
|
||||
|
||||
cp_args = [
|
||||
*common_args,
|
||||
"--tensor-parallel-size",
|
||||
str(tp_size),
|
||||
"--pipeline-parallel-size",
|
||||
str(pp_size),
|
||||
"--decode-context-parallel-size",
|
||||
str(dcp_size),
|
||||
"--distributed-executor-backend",
|
||||
distributed_backend,
|
||||
]
|
||||
|
||||
tp_args = [
|
||||
*common_args,
|
||||
"--tensor-parallel-size",
|
||||
str(tp_size),
|
||||
"--pipeline-parallel-size",
|
||||
str(pp_size),
|
||||
"--distributed-executor-backend",
|
||||
distributed_backend,
|
||||
]
|
||||
|
||||
try:
|
||||
compare_two_settings(model_id,
|
||||
cp_args,
|
||||
tp_args,
|
||||
cp_env,
|
||||
tp_env,
|
||||
method=method,
|
||||
max_wait_seconds=720)
|
||||
except Exception:
|
||||
testing_ray_compiled_graph = cp_env is not None
|
||||
if testing_ray_compiled_graph and vllm_major_version == "0":
|
||||
# Ray Compiled Graph tests are flaky for V0,
|
||||
# so we don't want to fail the test
|
||||
logger.exception("Ray Compiled Graph tests failed")
|
||||
else:
|
||||
raise
|
||||
|
||||
|
||||
CP_TEXT_GENERATION_MODELS = {
|
||||
# [MLA attention only]
|
||||
"deepseek-ai/DeepSeek-V2-Lite-Chat": CPTestSettings.detailed(),
|
||||
}
|
||||
|
||||
CP_TEST_MODELS = [
|
||||
# TODO support other models
|
||||
# [LANGUAGE GENERATION]
|
||||
"deepseek-ai/DeepSeek-V2-Lite-Chat",
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("model_id", "parallel_setup", "distributed_backend", "vllm_major_version",
|
||||
"runner", "test_options"),
|
||||
[
|
||||
params for model_id, settings in CP_TEXT_GENERATION_MODELS.items()
|
||||
for params in settings.iter_params(model_id)
|
||||
if model_id in CP_TEST_MODELS
|
||||
],
|
||||
)
|
||||
@create_new_process_for_each_test()
|
||||
def test_cp_generation(
|
||||
model_id: str,
|
||||
parallel_setup: ParallelSetup,
|
||||
distributed_backend: str,
|
||||
vllm_major_version: str,
|
||||
runner: RunnerOption,
|
||||
test_options: CPTestOptions,
|
||||
num_gpus_available,
|
||||
):
|
||||
_compare_cp_with_tp(model_id,
|
||||
parallel_setup,
|
||||
distributed_backend,
|
||||
vllm_major_version,
|
||||
runner,
|
||||
test_options,
|
||||
num_gpus_available,
|
||||
method="generate",
|
||||
is_multimodal=False)
|
||||
@ -287,15 +287,6 @@ def test_prefix_cache_default():
|
||||
},
|
||||
"mm-processor-kwargs"
|
||||
),
|
||||
(
|
||||
'{"cast_logits_dtype":"bfloat16","sequence_parallel_norm":true,"sequence_parallel_norm_threshold":2048}',
|
||||
{
|
||||
"cast_logits_dtype": "bfloat16",
|
||||
"sequence_parallel_norm": True,
|
||||
"sequence_parallel_norm_threshold": 2048,
|
||||
},
|
||||
"override-neuron-config"
|
||||
),
|
||||
])
|
||||
# yapf: enable
|
||||
def test_composite_arg_parser(arg, expected, option):
|
||||
|
||||
@ -1,13 +1,16 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
from contextlib import suppress
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any, Optional
|
||||
from typing import TYPE_CHECKING, Any, Optional
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
|
||||
from vllm.config import MultiModalConfig
|
||||
from vllm.engine.multiprocessing.client import MQLLMEngineClient
|
||||
@ -17,6 +20,198 @@ from vllm.entrypoints.openai.serving_models import (BaseModelPath,
|
||||
OpenAIServingModels)
|
||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from openai import OpenAI
|
||||
|
||||
GPT_OSS_MODEL_NAME = "openai/gpt-oss-20b"
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def monkeypatch_module():
|
||||
from _pytest.monkeypatch import MonkeyPatch
|
||||
mpatch = MonkeyPatch()
|
||||
yield mpatch
|
||||
mpatch.undo()
|
||||
|
||||
|
||||
@pytest.fixture(scope="module",
|
||||
params=[True, False],
|
||||
ids=["with_tool_parser", "without_tool_parser"])
|
||||
def with_tool_parser(request) -> bool:
|
||||
return request.param
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def default_server_args(with_tool_parser: bool):
|
||||
args = [
|
||||
# use half precision for speed and memory savings in CI environment
|
||||
"--enforce-eager",
|
||||
"--max-model-len",
|
||||
"4096",
|
||||
"--reasoning-parser",
|
||||
"openai_gptoss",
|
||||
"--gpu-memory-utilization",
|
||||
"0.8",
|
||||
]
|
||||
if with_tool_parser:
|
||||
args.extend([
|
||||
"--tool-call-parser",
|
||||
"openai",
|
||||
"--enable-auto-tool-choice",
|
||||
])
|
||||
return args
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def gptoss_server(monkeypatch_module: pytest.MonkeyPatch,
|
||||
default_server_args: list[str]):
|
||||
with monkeypatch_module.context() as m:
|
||||
m.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN_VLLM_V1")
|
||||
with RemoteOpenAIServer(GPT_OSS_MODEL_NAME,
|
||||
default_server_args) as remote_server:
|
||||
yield remote_server
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
|
||||
async def gptoss_client(gptoss_server):
|
||||
async with gptoss_server.get_async_client() as async_client:
|
||||
yield async_client
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_gpt_oss_chat_tool_call_streaming(gptoss_client: OpenAI,
|
||||
with_tool_parser: bool):
|
||||
tools = [{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "get_current_weather",
|
||||
"description": "Get the current weather in a given location",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"city": {
|
||||
"type": "string"
|
||||
},
|
||||
"state": {
|
||||
"type": "string"
|
||||
},
|
||||
"unit": {
|
||||
"type": "string",
|
||||
"enum": ["celsius", "fahrenheit"],
|
||||
},
|
||||
},
|
||||
"required": ["city", "state", "unit"],
|
||||
},
|
||||
},
|
||||
}]
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "What is the weather in Dallas, TX?"
|
||||
},
|
||||
]
|
||||
|
||||
stream = await gptoss_client.chat.completions.create(
|
||||
model=GPT_OSS_MODEL_NAME,
|
||||
messages=messages,
|
||||
tools=tools if with_tool_parser else None,
|
||||
stream=True)
|
||||
|
||||
name = None
|
||||
args_buf = ""
|
||||
content_buf = ""
|
||||
async for chunk in stream:
|
||||
delta = chunk.choices[0].delta
|
||||
if delta.tool_calls:
|
||||
tc = delta.tool_calls[0]
|
||||
if tc.function and tc.function.name:
|
||||
name = tc.function.name
|
||||
if tc.function and tc.function.arguments:
|
||||
args_buf += tc.function.arguments
|
||||
if getattr(delta, "content", None):
|
||||
content_buf += delta.content
|
||||
if with_tool_parser:
|
||||
assert name is not None
|
||||
assert len(args_buf) > 0
|
||||
else:
|
||||
assert name is None
|
||||
assert len(args_buf) == 0
|
||||
assert len(content_buf) > 0
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_gpt_oss_multi_turn_chat(gptoss_client: OpenAI,
|
||||
with_tool_parser: bool):
|
||||
if not with_tool_parser:
|
||||
pytest.skip("skip non-tool for multi-turn tests")
|
||||
tools = [{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "get_current_weather",
|
||||
"description": "Get the current weather in a given location",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"city": {
|
||||
"type": "string"
|
||||
},
|
||||
"state": {
|
||||
"type": "string"
|
||||
},
|
||||
"unit": {
|
||||
"type": "string",
|
||||
"enum": ["celsius", "fahrenheit"],
|
||||
},
|
||||
},
|
||||
"required": ["city", "state", "unit"],
|
||||
},
|
||||
},
|
||||
}]
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "you are a helpful assistant"
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
"content": "What is the weather in Dallas, TX?"
|
||||
},
|
||||
]
|
||||
|
||||
first = await gptoss_client.chat.completions.create(
|
||||
model=GPT_OSS_MODEL_NAME,
|
||||
messages=messages,
|
||||
tools=tools,
|
||||
temperature=0.0,
|
||||
)
|
||||
first_msg = first.choices[0].message
|
||||
assert first_msg.tool_calls is not None and len(first_msg.tool_calls) > 0
|
||||
tc = first_msg.tool_calls[0]
|
||||
assert tc.function is not None and tc.function.name == "get_current_weather"
|
||||
args1 = tc.function.arguments
|
||||
assert args1 is not None and len(args1) > 0
|
||||
|
||||
messages.append({"role": "assistant", "content": args1})
|
||||
messages.append({
|
||||
"role": "user",
|
||||
"content": "Now convert to celsius and return JSON only"
|
||||
})
|
||||
|
||||
second = await gptoss_client.chat.completions.create(
|
||||
model=GPT_OSS_MODEL_NAME,
|
||||
messages=messages,
|
||||
tools=tools,
|
||||
temperature=0.0,
|
||||
)
|
||||
second_msg = second.choices[0].message
|
||||
assert (second_msg.content is not None and len(second_msg.content) > 0) or \
|
||||
(second_msg.tool_calls is not None and len(second_msg.tool_calls) > 0)
|
||||
|
||||
|
||||
MODEL_NAME = "openai-community/gpt2"
|
||||
CHAT_TEMPLATE = "Dummy chat template for testing {}"
|
||||
BASE_MODEL_PATHS = [BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME)]
|
||||
|
||||
@ -11,7 +11,7 @@ import torch
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
MODEL_NAME = "mgazz/Prithvi-EO-2.0-300M-TL-Sen1Floods11"
|
||||
MODEL_NAME = "ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11"
|
||||
DTYPE = "float16"
|
||||
|
||||
|
||||
|
||||
@ -73,17 +73,11 @@ async def test_zero_truncation_size(client: openai.AsyncOpenAI):
|
||||
"truncate_prompt_tokens": truncation_size
|
||||
}
|
||||
|
||||
with pytest.raises(openai.BadRequestError) as err:
|
||||
await client.post(path="embeddings", cast_to=object, body={**kwargs})
|
||||
response = await client.post(path="embeddings",
|
||||
cast_to=object,
|
||||
body={**kwargs})
|
||||
|
||||
assert err.value.status_code == 400
|
||||
error_details = err.value.response.json()["error"]
|
||||
|
||||
assert error_details["type"] == "BadRequestError"
|
||||
assert "This model's maximum context length is" in error_details["message"]
|
||||
assert "tokens in the input for embedding generation" in error_details[
|
||||
"message"]
|
||||
assert "Please reduce the length of the input" in error_details["message"]
|
||||
assert response["usage"]["prompt_tokens"] == truncation_size
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
|
||||
@ -436,3 +436,132 @@ async def test_multi_image_input(client: openai.AsyncOpenAI, model_name: str,
|
||||
)
|
||||
message = chat_completion.choices[0].message
|
||||
assert message.content is not None and len(message.content) >= 0
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
@pytest.mark.parametrize(
|
||||
"image_urls",
|
||||
[TEST_IMAGE_ASSETS[:i] for i in range(2, len(TEST_IMAGE_ASSETS))],
|
||||
indirect=True)
|
||||
async def test_completions_with_image(
|
||||
client: openai.AsyncOpenAI,
|
||||
model_name: str,
|
||||
image_urls: list[str],
|
||||
):
|
||||
for image_url in image_urls:
|
||||
chat_completion = await client.chat.completions.create(
|
||||
messages=[
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful assistant."
|
||||
},
|
||||
{
|
||||
"role":
|
||||
"user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": "Describe this image.",
|
||||
},
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": image_url,
|
||||
}
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
model=model_name,
|
||||
)
|
||||
assert chat_completion.choices[0].message.content is not None
|
||||
assert isinstance(chat_completion.choices[0].message.content, str)
|
||||
assert len(chat_completion.choices[0].message.content) > 0
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
@pytest.mark.parametrize(
|
||||
"image_urls",
|
||||
[TEST_IMAGE_ASSETS[:i] for i in range(2, len(TEST_IMAGE_ASSETS))],
|
||||
indirect=True)
|
||||
async def test_completions_with_image_with_uuid(
|
||||
client: openai.AsyncOpenAI,
|
||||
model_name: str,
|
||||
image_urls: list[str],
|
||||
):
|
||||
for image_url in image_urls:
|
||||
chat_completion = await client.chat.completions.create(
|
||||
messages=[
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful assistant."
|
||||
},
|
||||
{
|
||||
"role":
|
||||
"user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": "Describe this image.",
|
||||
},
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": image_url,
|
||||
},
|
||||
"uuid": image_url
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
model=model_name,
|
||||
)
|
||||
assert chat_completion.choices[0].message.content is not None
|
||||
assert isinstance(chat_completion.choices[0].message.content, str)
|
||||
assert len(chat_completion.choices[0].message.content) > 0
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
@pytest.mark.parametrize(
|
||||
"image_urls",
|
||||
[TEST_IMAGE_ASSETS[:i] for i in range(2, len(TEST_IMAGE_ASSETS))],
|
||||
indirect=True)
|
||||
async def test_completions_with_image_with_incorrect_uuid_format(
|
||||
client: openai.AsyncOpenAI,
|
||||
model_name: str,
|
||||
image_urls: list[str],
|
||||
):
|
||||
for image_url in image_urls:
|
||||
chat_completion = await client.chat.completions.create(
|
||||
messages=[
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful assistant."
|
||||
},
|
||||
{
|
||||
"role":
|
||||
"user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": "Describe this image.",
|
||||
},
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": image_url,
|
||||
"incorrect_uuid_key": image_url,
|
||||
},
|
||||
"also_incorrect_uuid_key": image_url,
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
model=model_name,
|
||||
)
|
||||
assert chat_completion.choices[0].message.content is not None
|
||||
assert isinstance(chat_completion.choices[0].message.content, str)
|
||||
assert len(chat_completion.choices[0].message.content) > 0
|
||||
|
||||
@ -21,7 +21,7 @@ from vllm.entrypoints.chat_utils import (_try_extract_ast, load_chat_template,
|
||||
resolve_chat_template_content_format,
|
||||
resolve_hf_chat_template)
|
||||
from vllm.entrypoints.llm import apply_hf_chat_template
|
||||
from vllm.multimodal import MultiModalDataDict
|
||||
from vllm.multimodal import MultiModalDataDict, MultiModalUUIDDict
|
||||
from vllm.multimodal.utils import (encode_audio_base64, encode_image_base64,
|
||||
encode_video_base64)
|
||||
from vllm.transformers_utils.tokenizer_group import TokenizerGroup
|
||||
@ -179,6 +179,27 @@ def _assert_mm_data_is_image_input(
|
||||
assert isinstance(image_data, list) and len(image_data) == image_count
|
||||
|
||||
|
||||
def _assert_mm_uuids(
|
||||
mm_uuids: Optional[MultiModalUUIDDict],
|
||||
media_count: int,
|
||||
expected_uuids: list[Optional[str]],
|
||||
modality: str = "image",
|
||||
) -> None:
|
||||
if len(expected_uuids) > 0:
|
||||
assert mm_uuids is not None
|
||||
assert modality in mm_uuids
|
||||
|
||||
image_uuids = mm_uuids.get(modality)
|
||||
assert image_uuids is not None
|
||||
|
||||
assert isinstance(image_uuids,
|
||||
list) and len(image_uuids) == media_count
|
||||
|
||||
assert image_uuids == expected_uuids
|
||||
else:
|
||||
assert mm_uuids is None
|
||||
|
||||
|
||||
ModalityType = Literal["image", "video", "audio"]
|
||||
MultiModalDataCounts = Mapping[ModalityType, int]
|
||||
|
||||
@ -201,7 +222,7 @@ def test_parse_chat_messages_single_image(
|
||||
phi3v_tokenizer,
|
||||
image_url,
|
||||
):
|
||||
conversation, mm_data = parse_chat_messages(
|
||||
conversation, mm_data, mm_uuids = parse_chat_messages(
|
||||
[{
|
||||
"role":
|
||||
"user",
|
||||
@ -228,6 +249,260 @@ def test_parse_chat_messages_single_image(
|
||||
"content": "<|image_1|>\nWhat's in the image?"
|
||||
}]
|
||||
_assert_mm_data_is_image_input(mm_data, 1)
|
||||
_assert_mm_uuids(mm_uuids, 1, expected_uuids=[None])
|
||||
|
||||
|
||||
def test_parse_chat_messages_single_image_with_uuid(
|
||||
phi3v_model_config,
|
||||
phi3v_tokenizer,
|
||||
image_url,
|
||||
):
|
||||
image_uuid = str(hash(image_url))
|
||||
conversation, mm_data, mm_uuids = parse_chat_messages(
|
||||
[{
|
||||
"role":
|
||||
"user",
|
||||
"content": [
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": image_url,
|
||||
},
|
||||
"uuid": image_uuid,
|
||||
},
|
||||
{
|
||||
"type": "text",
|
||||
"text": "What's in the image?"
|
||||
},
|
||||
],
|
||||
}],
|
||||
phi3v_model_config,
|
||||
phi3v_tokenizer,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
assert conversation == [{
|
||||
"role": "user",
|
||||
"content": "<|image_1|>\nWhat's in the image?"
|
||||
}]
|
||||
_assert_mm_data_is_image_input(mm_data, 1)
|
||||
_assert_mm_uuids(mm_uuids, 1, expected_uuids=[image_uuid])
|
||||
|
||||
|
||||
def test_parse_chat_messages_single_image_with_bad_uuid_format(
|
||||
phi3v_model_config,
|
||||
phi3v_tokenizer,
|
||||
image_url,
|
||||
):
|
||||
image_uuid = str(hash(image_url))
|
||||
conversation, mm_data, mm_uuids = parse_chat_messages(
|
||||
[{
|
||||
"role":
|
||||
"user",
|
||||
"content": [
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": image_url,
|
||||
"uuid": image_uuid,
|
||||
},
|
||||
"bad_uuid_key": image_uuid,
|
||||
},
|
||||
{
|
||||
"type": "text",
|
||||
"text": "What's in the image?"
|
||||
},
|
||||
],
|
||||
}],
|
||||
phi3v_model_config,
|
||||
phi3v_tokenizer,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
assert conversation == [{
|
||||
"role": "user",
|
||||
"content": "<|image_1|>\nWhat's in the image?"
|
||||
}]
|
||||
_assert_mm_data_is_image_input(mm_data, 1)
|
||||
_assert_mm_uuids(mm_uuids, 1, expected_uuids=[None])
|
||||
|
||||
|
||||
def test_parse_chat_messages_multiple_images_with_uuids(
|
||||
phi3v_model_config,
|
||||
phi3v_tokenizer,
|
||||
image_url,
|
||||
):
|
||||
image_uuid1 = "my_uuid_1"
|
||||
image_uuid2 = "my_uuid_2"
|
||||
|
||||
conversation, mm_data, mm_uuids = parse_chat_messages(
|
||||
[{
|
||||
"role":
|
||||
"user",
|
||||
"content": [
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": image_url,
|
||||
},
|
||||
"uuid": image_uuid1,
|
||||
},
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": image_url,
|
||||
},
|
||||
"uuid": image_uuid2,
|
||||
},
|
||||
{
|
||||
"type": "text",
|
||||
"text": "What's in the image?"
|
||||
},
|
||||
],
|
||||
}],
|
||||
phi3v_model_config,
|
||||
phi3v_tokenizer,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
assert conversation == [{
|
||||
"role":
|
||||
"user",
|
||||
"content":
|
||||
"<|image_1|>\n<|image_2|>\nWhat's in the image?",
|
||||
}]
|
||||
_assert_mm_data_is_image_input(mm_data, 2)
|
||||
_assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid1, image_uuid2])
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_parse_chat_messages_single_image_with_uuid_async(
|
||||
phi3v_model_config,
|
||||
phi3v_tokenizer,
|
||||
image_url,
|
||||
):
|
||||
image_uuid = str(hash(image_url))
|
||||
conversation, mm_future, mm_uuids = parse_chat_messages_futures(
|
||||
[{
|
||||
"role":
|
||||
"user",
|
||||
"content": [
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": image_url
|
||||
},
|
||||
"uuid": image_uuid,
|
||||
},
|
||||
{
|
||||
"type": "text",
|
||||
"text": "What's in the image?"
|
||||
},
|
||||
],
|
||||
}],
|
||||
phi3v_model_config,
|
||||
phi3v_tokenizer,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
assert conversation == [{
|
||||
"role": "user",
|
||||
"content": "<|image_1|>\nWhat's in the image?"
|
||||
}]
|
||||
_assert_mm_data_is_image_input(await mm_future, 1)
|
||||
_assert_mm_uuids(mm_uuids, 1, expected_uuids=[image_uuid])
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_parse_chat_messages_multiple_images_with_uuids_async(
|
||||
phi3v_model_config,
|
||||
phi3v_tokenizer,
|
||||
image_url,
|
||||
):
|
||||
image_uuid1 = "my_uuid_1"
|
||||
image_uuid2 = "my_uuid_2"
|
||||
|
||||
conversation, mm_future, mm_uuids = parse_chat_messages_futures(
|
||||
[{
|
||||
"role":
|
||||
"user",
|
||||
"content": [
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": image_url
|
||||
},
|
||||
"uuid": image_uuid1,
|
||||
},
|
||||
{
|
||||
"type": "image_pil",
|
||||
"image_pil": ImageAsset("cherry_blossom").pil_image,
|
||||
"uuid": image_uuid2,
|
||||
},
|
||||
{
|
||||
"type": "text",
|
||||
"text": "What's in these images?"
|
||||
},
|
||||
],
|
||||
}],
|
||||
phi3v_model_config,
|
||||
phi3v_tokenizer,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
assert conversation == [{
|
||||
"role":
|
||||
"user",
|
||||
"content":
|
||||
"<|image_1|>\n<|image_2|>\nWhat's in these images?",
|
||||
}]
|
||||
_assert_mm_data_is_image_input(await mm_future, 2)
|
||||
_assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid1, image_uuid2])
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_parse_chat_messages_multiple_images_with_partial_uuids_async(
|
||||
phi3v_model_config,
|
||||
phi3v_tokenizer,
|
||||
image_url,
|
||||
):
|
||||
image_uuid2 = "my_uuid_2"
|
||||
|
||||
conversation, mm_future, mm_uuids = parse_chat_messages_futures(
|
||||
[{
|
||||
"role":
|
||||
"user",
|
||||
"content": [
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": image_url
|
||||
},
|
||||
},
|
||||
{
|
||||
"type": "image_pil",
|
||||
"image_pil": ImageAsset("cherry_blossom").pil_image,
|
||||
"uuid": image_uuid2,
|
||||
},
|
||||
{
|
||||
"type": "text",
|
||||
"text": "What's in these images?"
|
||||
},
|
||||
],
|
||||
}],
|
||||
phi3v_model_config,
|
||||
phi3v_tokenizer,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
assert conversation == [{
|
||||
"role":
|
||||
"user",
|
||||
"content":
|
||||
"<|image_1|>\n<|image_2|>\nWhat's in these images?",
|
||||
}]
|
||||
_assert_mm_data_is_image_input(await mm_future, 2)
|
||||
_assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, image_uuid2])
|
||||
|
||||
|
||||
def test_parse_chat_messages_empty_system(
|
||||
@ -235,7 +510,7 @@ def test_parse_chat_messages_empty_system(
|
||||
mistral_tokenizer,
|
||||
):
|
||||
# Test string format
|
||||
conversation, _ = parse_chat_messages(
|
||||
conversation, _, _ = parse_chat_messages(
|
||||
[
|
||||
{
|
||||
"role": "system",
|
||||
@ -265,7 +540,7 @@ def test_parse_chat_messages_empty_system(
|
||||
]
|
||||
|
||||
# Test openai format
|
||||
conversation, _ = parse_chat_messages(
|
||||
conversation, _, _ = parse_chat_messages(
|
||||
[
|
||||
{
|
||||
"role": "system",
|
||||
@ -307,7 +582,7 @@ async def test_parse_chat_messages_single_image_async(
|
||||
phi3v_tokenizer,
|
||||
image_url,
|
||||
):
|
||||
conversation, mm_future = parse_chat_messages_futures(
|
||||
conversation, mm_future, mm_uuids = parse_chat_messages_futures(
|
||||
[{
|
||||
"role":
|
||||
"user",
|
||||
@ -334,6 +609,7 @@ async def test_parse_chat_messages_single_image_async(
|
||||
"content": "<|image_1|>\nWhat's in the image?"
|
||||
}]
|
||||
_assert_mm_data_is_image_input(await mm_future, 1)
|
||||
_assert_mm_uuids(mm_uuids, 1, expected_uuids=[None])
|
||||
|
||||
|
||||
def test_parse_chat_messages_multiple_images(
|
||||
@ -341,7 +617,7 @@ def test_parse_chat_messages_multiple_images(
|
||||
phi3v_tokenizer,
|
||||
image_url,
|
||||
):
|
||||
conversation, mm_data = parse_chat_messages(
|
||||
conversation, mm_data, mm_uuids = parse_chat_messages(
|
||||
[{
|
||||
"role":
|
||||
"user",
|
||||
@ -374,6 +650,7 @@ def test_parse_chat_messages_multiple_images(
|
||||
"<|image_1|>\n<|image_2|>\nWhat's in these images?",
|
||||
}]
|
||||
_assert_mm_data_is_image_input(mm_data, 2)
|
||||
_assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@ -382,7 +659,7 @@ async def test_parse_chat_messages_multiple_images_async(
|
||||
phi3v_tokenizer,
|
||||
image_url,
|
||||
):
|
||||
conversation, mm_future = parse_chat_messages_futures(
|
||||
conversation, mm_future, mm_uuids = parse_chat_messages_futures(
|
||||
[{
|
||||
"role":
|
||||
"user",
|
||||
@ -415,6 +692,7 @@ async def test_parse_chat_messages_multiple_images_async(
|
||||
"<|image_1|>\n<|image_2|>\nWhat's in these images?",
|
||||
}]
|
||||
_assert_mm_data_is_image_input(await mm_future, 2)
|
||||
_assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])
|
||||
|
||||
|
||||
def test_parse_chat_messages_placeholder_already_in_prompt(
|
||||
@ -422,7 +700,7 @@ def test_parse_chat_messages_placeholder_already_in_prompt(
|
||||
phi3v_tokenizer,
|
||||
image_url,
|
||||
):
|
||||
conversation, mm_data = parse_chat_messages(
|
||||
conversation, mm_data, mm_uuids = parse_chat_messages(
|
||||
[{
|
||||
"role":
|
||||
"user",
|
||||
@ -458,6 +736,7 @@ def test_parse_chat_messages_placeholder_already_in_prompt(
|
||||
"What's in <|image_1|> and how does it compare to <|image_2|>?",
|
||||
}]
|
||||
_assert_mm_data_is_image_input(mm_data, 2)
|
||||
_assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])
|
||||
|
||||
|
||||
def test_parse_chat_messages_placeholder_one_already_in_prompt(
|
||||
@ -465,7 +744,7 @@ def test_parse_chat_messages_placeholder_one_already_in_prompt(
|
||||
phi3v_tokenizer,
|
||||
image_url,
|
||||
):
|
||||
conversation, mm_data = parse_chat_messages(
|
||||
conversation, mm_data, mm_uuids = parse_chat_messages(
|
||||
[{
|
||||
"role":
|
||||
"user",
|
||||
@ -503,6 +782,7 @@ def test_parse_chat_messages_placeholder_one_already_in_prompt(
|
||||
"other one?",
|
||||
}]
|
||||
_assert_mm_data_is_image_input(mm_data, 2)
|
||||
_assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])
|
||||
|
||||
|
||||
def test_parse_chat_messages_multiple_images_across_messages(
|
||||
@ -510,7 +790,7 @@ def test_parse_chat_messages_multiple_images_across_messages(
|
||||
phi3v_tokenizer,
|
||||
image_url,
|
||||
):
|
||||
conversation, mm_data = parse_chat_messages(
|
||||
conversation, mm_data, mm_uuids = parse_chat_messages(
|
||||
[
|
||||
{
|
||||
"role":
|
||||
@ -569,13 +849,84 @@ def test_parse_chat_messages_multiple_images_across_messages(
|
||||
},
|
||||
]
|
||||
_assert_mm_data_is_image_input(mm_data, 2)
|
||||
_assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])
|
||||
|
||||
|
||||
def test_parse_chat_messages_multiple_images_with_uuids_across_messages(
|
||||
phi3v_model_config,
|
||||
phi3v_tokenizer,
|
||||
image_url,
|
||||
):
|
||||
image_uuid = str(hash(image_url))
|
||||
conversation, mm_data, mm_uuids = parse_chat_messages(
|
||||
[
|
||||
{
|
||||
"role":
|
||||
"user",
|
||||
"content": [
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": image_url
|
||||
},
|
||||
"uuid": image_uuid,
|
||||
},
|
||||
{
|
||||
"type": "text",
|
||||
"text": "What's in this image?"
|
||||
},
|
||||
],
|
||||
},
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": "Some stuff."
|
||||
},
|
||||
{
|
||||
"role":
|
||||
"user",
|
||||
"content": [
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": image_url
|
||||
},
|
||||
"uuid": image_uuid,
|
||||
},
|
||||
{
|
||||
"type": "text",
|
||||
"text": "What about this one?"
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
phi3v_model_config,
|
||||
phi3v_tokenizer,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
assert conversation == [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "<|image_1|>\nWhat's in this image?"
|
||||
},
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": "Some stuff."
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
"content": "<|image_2|>\nWhat about this one?"
|
||||
},
|
||||
]
|
||||
_assert_mm_data_is_image_input(mm_data, 2)
|
||||
_assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid, image_uuid])
|
||||
|
||||
|
||||
def test_parse_chat_messages_context_text_format(
|
||||
phi3v_model_config,
|
||||
phi3v_tokenizer,
|
||||
):
|
||||
conversation, mm_data = parse_chat_messages(
|
||||
conversation, mm_data, mm_uuids = parse_chat_messages(
|
||||
[
|
||||
{
|
||||
"role": "user",
|
||||
@ -621,6 +972,8 @@ def test_parse_chat_messages_context_text_format(
|
||||
}],
|
||||
},
|
||||
]
|
||||
assert mm_data is None
|
||||
assert mm_uuids is None
|
||||
|
||||
|
||||
def test_parse_chat_messages_rejects_too_many_images_in_one_message(
|
||||
@ -736,7 +1089,7 @@ def test_parse_chat_messages_multiple_images_uncommon_input(
|
||||
phi3v_tokenizer,
|
||||
image_url,
|
||||
):
|
||||
conversation, mm_data = parse_chat_messages(
|
||||
conversation, mm_data, mm_uuids = parse_chat_messages(
|
||||
[{
|
||||
"role":
|
||||
"user",
|
||||
@ -762,6 +1115,7 @@ def test_parse_chat_messages_multiple_images_uncommon_input(
|
||||
"<|image_1|>\n<|image_2|>\nWhat's in these images?",
|
||||
}]
|
||||
_assert_mm_data_is_image_input(mm_data, 2)
|
||||
_assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])
|
||||
|
||||
|
||||
def test_parse_chat_messages_multiple_images_interleave(
|
||||
@ -769,7 +1123,7 @@ def test_parse_chat_messages_multiple_images_interleave(
|
||||
phi3v_tokenizer,
|
||||
image_url,
|
||||
):
|
||||
conversation, mm_data = parse_chat_messages(
|
||||
conversation, mm_data, mm_uuids = parse_chat_messages(
|
||||
[{
|
||||
"role":
|
||||
"user",
|
||||
@ -813,6 +1167,7 @@ def test_parse_chat_messages_multiple_images_interleave(
|
||||
"Do they have differences?",
|
||||
}]
|
||||
_assert_mm_data_is_image_input(mm_data, 2)
|
||||
_assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@ -821,7 +1176,7 @@ async def test_parse_chat_messages_multiple_images_interleave_async(
|
||||
phi3v_tokenizer,
|
||||
image_url,
|
||||
):
|
||||
conversation, mm_data = parse_chat_messages_futures(
|
||||
conversation, mm_data, mm_uuids = parse_chat_messages_futures(
|
||||
[{
|
||||
"role":
|
||||
"user",
|
||||
@ -865,6 +1220,63 @@ async def test_parse_chat_messages_multiple_images_interleave_async(
|
||||
"Do they have differences?",
|
||||
}]
|
||||
_assert_mm_data_is_image_input(await mm_data, 2)
|
||||
_assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_parse_chat_messages_multiple_images_with_uuids_interleave_async(
|
||||
phi3v_model_config_mm_interleaved,
|
||||
phi3v_tokenizer,
|
||||
image_url,
|
||||
):
|
||||
image_uuid = str(hash(image_url))
|
||||
conversation, mm_data, mm_uuids = parse_chat_messages_futures(
|
||||
[{
|
||||
"role":
|
||||
"user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": "I need you to compare this image",
|
||||
},
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": image_url
|
||||
},
|
||||
"uuid": image_uuid,
|
||||
},
|
||||
{
|
||||
"type": "text",
|
||||
"text": "and this one"
|
||||
},
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": image_url
|
||||
},
|
||||
"uuid": image_uuid,
|
||||
},
|
||||
{
|
||||
"type": "text",
|
||||
"text": "Do they have differences?"
|
||||
},
|
||||
],
|
||||
}],
|
||||
phi3v_model_config_mm_interleaved,
|
||||
phi3v_tokenizer,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
assert conversation == [{
|
||||
"role":
|
||||
"user",
|
||||
"content":
|
||||
"I need you to compare this image\n<|image_1|>\nand this one\n<|image_2|>\n" # noqa: E501
|
||||
"Do they have differences?",
|
||||
}]
|
||||
_assert_mm_data_is_image_input(await mm_data, 2)
|
||||
_assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid, image_uuid])
|
||||
|
||||
|
||||
def test_parse_chat_messages_multiple_images_multiple_messages_interleave(
|
||||
@ -872,7 +1284,7 @@ def test_parse_chat_messages_multiple_images_multiple_messages_interleave(
|
||||
phi3v_tokenizer,
|
||||
image_url,
|
||||
):
|
||||
conversation, mm_data = parse_chat_messages(
|
||||
conversation, mm_data, mm_uuids = parse_chat_messages(
|
||||
[
|
||||
{
|
||||
"role":
|
||||
@ -935,6 +1347,81 @@ def test_parse_chat_messages_multiple_images_multiple_messages_interleave(
|
||||
},
|
||||
]
|
||||
_assert_mm_data_is_image_input(mm_data, 2)
|
||||
_assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])
|
||||
|
||||
|
||||
def test_parse_chat_messages_multiple_images_with_uuids_multiple_messages_interleave( # noqa: E501
|
||||
phi3v_model_config_mm_interleaved,
|
||||
phi3v_tokenizer,
|
||||
image_url,
|
||||
):
|
||||
image_uuid = str(hash(image_url))
|
||||
conversation, mm_data, mm_uuids = parse_chat_messages(
|
||||
[
|
||||
{
|
||||
"role":
|
||||
"user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": "What's on this image?"
|
||||
},
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": image_url
|
||||
},
|
||||
"uuid": image_uuid,
|
||||
},
|
||||
{
|
||||
"type": "text",
|
||||
"text": "Be accurate."
|
||||
},
|
||||
],
|
||||
},
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": "Some stuff."
|
||||
},
|
||||
{
|
||||
"role":
|
||||
"user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": "What's on this image?"
|
||||
},
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": image_url
|
||||
},
|
||||
"uuid": image_uuid,
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
phi3v_model_config_mm_interleaved,
|
||||
phi3v_tokenizer,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
assert conversation == [
|
||||
{
|
||||
"role": "user",
|
||||
"content": "What's on this image?\n<|image_1|>\nBe accurate.",
|
||||
},
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": "Some stuff."
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
"content": "What's on this image?\n<|image_2|>"
|
||||
},
|
||||
]
|
||||
_assert_mm_data_is_image_input(mm_data, 2)
|
||||
_assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid, image_uuid])
|
||||
|
||||
|
||||
def test_parse_chat_messages_multiple_modals_multiple_messages_interleave(
|
||||
@ -944,7 +1431,7 @@ def test_parse_chat_messages_multiple_modals_multiple_messages_interleave(
|
||||
video_url,
|
||||
audio_url,
|
||||
):
|
||||
conversation, mm_data = parse_chat_messages(
|
||||
conversation, mm_data, mm_uuids = parse_chat_messages(
|
||||
[
|
||||
{
|
||||
"role":
|
||||
@ -1030,6 +1517,229 @@ def test_parse_chat_messages_multiple_modals_multiple_messages_interleave(
|
||||
]
|
||||
|
||||
_assert_mm_data_inputs(mm_data, {"image": 2, "video": 1, "audio": 1})
|
||||
_assert_mm_uuids(mm_uuids,
|
||||
2,
|
||||
modality="image",
|
||||
expected_uuids=[None, None])
|
||||
_assert_mm_uuids(mm_uuids, 1, modality="video", expected_uuids=[None])
|
||||
_assert_mm_uuids(mm_uuids, 1, modality="audio", expected_uuids=[None])
|
||||
|
||||
|
||||
def test_parse_chat_messages_multiple_modals_with_uuids_multiple_messages_interleave( # noqa: E501
|
||||
qwen25omni_model_config_mm_interleaved,
|
||||
qwen25omni_tokenizer,
|
||||
image_url,
|
||||
video_url,
|
||||
audio_url,
|
||||
):
|
||||
conversation, mm_data, mm_uuids = parse_chat_messages(
|
||||
[
|
||||
{
|
||||
"role":
|
||||
"user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": "What's on this image?"
|
||||
},
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": image_url
|
||||
},
|
||||
"uuid": "image_123",
|
||||
},
|
||||
{
|
||||
"type": "text",
|
||||
"text": "Now listen to this audio"
|
||||
},
|
||||
{
|
||||
"type": "audio_url",
|
||||
"audio_url": {
|
||||
"url": audio_url
|
||||
},
|
||||
"uuid": "audio_123",
|
||||
},
|
||||
],
|
||||
},
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": "Some stuff."
|
||||
},
|
||||
{
|
||||
"role":
|
||||
"user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": "What's on this image?"
|
||||
},
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": image_url
|
||||
},
|
||||
"uuid": "image_123",
|
||||
},
|
||||
{
|
||||
"type": "text",
|
||||
"text": "And what's in the video?"
|
||||
},
|
||||
{
|
||||
"type": "video_url",
|
||||
"video_url": {
|
||||
"url": video_url
|
||||
},
|
||||
"uuid": "video_123",
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
qwen25omni_model_config_mm_interleaved,
|
||||
qwen25omni_tokenizer,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
assert conversation == [
|
||||
{
|
||||
"role":
|
||||
"user",
|
||||
"content":
|
||||
"What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n"
|
||||
"Now listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>", # noqa: E501
|
||||
},
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": "Some stuff."
|
||||
},
|
||||
{
|
||||
"role":
|
||||
"user",
|
||||
"content":
|
||||
"What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n"
|
||||
"And what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>",
|
||||
},
|
||||
]
|
||||
|
||||
_assert_mm_data_inputs(mm_data, {"image": 2, "video": 1, "audio": 1})
|
||||
_assert_mm_uuids(mm_uuids,
|
||||
2,
|
||||
modality="image",
|
||||
expected_uuids=["image_123", "image_123"])
|
||||
_assert_mm_uuids(mm_uuids,
|
||||
1,
|
||||
modality="video",
|
||||
expected_uuids=["video_123"])
|
||||
_assert_mm_uuids(mm_uuids,
|
||||
1,
|
||||
modality="audio",
|
||||
expected_uuids=["audio_123"])
|
||||
|
||||
|
||||
def test_parse_chat_messages_multiple_modals_with_partial_uuids_multiple_messages_interleave( # noqa: E501
|
||||
qwen25omni_model_config_mm_interleaved,
|
||||
qwen25omni_tokenizer,
|
||||
image_url,
|
||||
video_url,
|
||||
audio_url,
|
||||
):
|
||||
conversation, mm_data, mm_uuids = parse_chat_messages(
|
||||
[
|
||||
{
|
||||
"role":
|
||||
"user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": "What's on this image?"
|
||||
},
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": image_url
|
||||
},
|
||||
"uuid": "image_123",
|
||||
},
|
||||
{
|
||||
"type": "text",
|
||||
"text": "Now listen to this audio"
|
||||
},
|
||||
{
|
||||
"type": "audio_url",
|
||||
"audio_url": {
|
||||
"url": audio_url
|
||||
}
|
||||
},
|
||||
],
|
||||
},
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": "Some stuff."
|
||||
},
|
||||
{
|
||||
"role":
|
||||
"user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": "What's on this image?"
|
||||
},
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": image_url
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "text",
|
||||
"text": "And what's in the video?"
|
||||
},
|
||||
{
|
||||
"type": "video_url",
|
||||
"video_url": {
|
||||
"url": video_url
|
||||
},
|
||||
"uuid": "video_123",
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
qwen25omni_model_config_mm_interleaved,
|
||||
qwen25omni_tokenizer,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
assert conversation == [
|
||||
{
|
||||
"role":
|
||||
"user",
|
||||
"content":
|
||||
"What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n"
|
||||
"Now listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>", # noqa: E501
|
||||
},
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": "Some stuff."
|
||||
},
|
||||
{
|
||||
"role":
|
||||
"user",
|
||||
"content":
|
||||
"What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n"
|
||||
"And what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>",
|
||||
},
|
||||
]
|
||||
|
||||
_assert_mm_data_inputs(mm_data, {"image": 2, "video": 1, "audio": 1})
|
||||
_assert_mm_uuids(mm_uuids,
|
||||
2,
|
||||
modality="image",
|
||||
expected_uuids=["image_123", None])
|
||||
_assert_mm_uuids(mm_uuids,
|
||||
1,
|
||||
modality="video",
|
||||
expected_uuids=["video_123"])
|
||||
_assert_mm_uuids(mm_uuids, 1, modality="audio", expected_uuids=[None])
|
||||
|
||||
|
||||
def test_parse_chat_messages_multiple_images_interleave_with_placeholders(
|
||||
@ -1081,7 +1791,7 @@ def test_mllama_single_image(
|
||||
image_url,
|
||||
):
|
||||
"""Ensures that a single image is parsed correctly mllama."""
|
||||
conversation, mm_data = parse_chat_messages(
|
||||
conversation, mm_data, mm_uuids = parse_chat_messages(
|
||||
[{
|
||||
"role":
|
||||
"user",
|
||||
@ -1100,6 +1810,7 @@ def test_mllama_single_image(
|
||||
content_format="openai",
|
||||
)
|
||||
_assert_mm_data_is_image_input(mm_data, 1)
|
||||
_assert_mm_uuids(mm_uuids, 1, expected_uuids=[None])
|
||||
assert conversation == [{
|
||||
"role":
|
||||
"user",
|
||||
@ -1121,7 +1832,7 @@ def test_mllama_interleaved_images(
|
||||
image_url,
|
||||
):
|
||||
"""Ensures that multiple image are parsed as interleaved dicts."""
|
||||
conversation, mm_data = parse_chat_messages(
|
||||
conversation, mm_data, mm_uuids = parse_chat_messages(
|
||||
[{
|
||||
"role":
|
||||
"user",
|
||||
@ -1147,6 +1858,7 @@ def test_mllama_interleaved_images(
|
||||
content_format="openai",
|
||||
)
|
||||
_assert_mm_data_is_image_input(mm_data, 2)
|
||||
_assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])
|
||||
assert conversation == [{
|
||||
"role":
|
||||
"user",
|
||||
@ -1227,7 +1939,7 @@ def test_multimodal_image_parsing_matches_hf(model, image_url):
|
||||
|
||||
# Now parse with vLLM's chat utils & apply the template
|
||||
vllm_conversation = get_conversation(is_hf=False)
|
||||
conversation, _ = parse_chat_messages(
|
||||
conversation, _, _ = parse_chat_messages(
|
||||
vllm_conversation,
|
||||
model_config,
|
||||
tokenizer_group,
|
||||
@ -1518,7 +2230,7 @@ def test_parse_chat_messages_include_thinking_chunk(mistral_model_config,
|
||||
}],
|
||||
}]
|
||||
|
||||
conversation_with_thinking, _ = parse_chat_messages(
|
||||
conversation_with_thinking, _, _ = parse_chat_messages(
|
||||
messages,
|
||||
mistral_model_config,
|
||||
mistral_tokenizer,
|
||||
|
||||
425
tests/entrypoints/test_context.py
Normal file
@ -0,0 +1,425 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import pytest
|
||||
from openai_harmony import StreamState
|
||||
|
||||
from vllm.entrypoints.context import HarmonyContext, StreamingHarmonyContext
|
||||
from vllm.outputs import CompletionOutput, RequestOutput
|
||||
|
||||
|
||||
# Helper function for Python < 3.10 compatibility
|
||||
async def async_next(async_iterator):
|
||||
"""Compatibility function equivalent to Python 3.10's anext()."""
|
||||
return await async_iterator.__anext__()
|
||||
|
||||
|
||||
def create_mock_request_output(
|
||||
prompt_token_ids=None,
|
||||
output_token_ids=None,
|
||||
num_cached_tokens=0,
|
||||
finished=True,
|
||||
):
|
||||
"""Helper function to create a mock RequestOutput object for testing."""
|
||||
outputs = []
|
||||
token_ids = output_token_ids if output_token_ids is not None else []
|
||||
outputs = [
|
||||
CompletionOutput(
|
||||
index=0,
|
||||
text="Test output",
|
||||
token_ids=token_ids,
|
||||
cumulative_logprob=0.0,
|
||||
logprobs=None,
|
||||
finish_reason=None,
|
||||
stop_reason=None,
|
||||
)
|
||||
]
|
||||
|
||||
return RequestOutput(
|
||||
request_id="test-id",
|
||||
prompt="Test prompt",
|
||||
prompt_token_ids=prompt_token_ids,
|
||||
prompt_logprobs=None,
|
||||
outputs=outputs,
|
||||
finished=finished,
|
||||
num_cached_tokens=num_cached_tokens,
|
||||
)
|
||||
|
||||
|
||||
async def generate_mock_outputs(num_turns,
|
||||
prompt_token_counts,
|
||||
output_token_counts,
|
||||
cached_token_counts=None):
|
||||
"""Generate a sequence of mock RequestOutput objects to simulate multiple
|
||||
turns."""
|
||||
if cached_token_counts is None:
|
||||
cached_token_counts = [0] * num_turns
|
||||
|
||||
for i in range(num_turns):
|
||||
# Create mock prompt token IDs and output token IDs
|
||||
prompt_token_ids = list(range(1, prompt_token_counts[i] + 1))
|
||||
output_token_ids = list(range(1, output_token_counts[i] + 1))
|
||||
|
||||
# Create and yield the RequestOutput
|
||||
yield create_mock_request_output(
|
||||
prompt_token_ids=prompt_token_ids,
|
||||
output_token_ids=output_token_ids,
|
||||
num_cached_tokens=cached_token_counts[i],
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def mock_parser():
|
||||
"""Set up a mock parser for tests."""
|
||||
with patch("vllm.entrypoints.context.get_streamable_parser_for_assistant"
|
||||
) as mock_parser_factory:
|
||||
# Create a mock parser object
|
||||
parser = MagicMock()
|
||||
parser.messages = []
|
||||
parser.current_channel = None
|
||||
parser.state = StreamState.EXPECT_START
|
||||
mock_parser_factory.return_value = parser
|
||||
yield parser
|
||||
|
||||
|
||||
def test_single_turn_token_counting():
|
||||
"""Test token counting behavior for a single turn."""
|
||||
# Create a context
|
||||
context = HarmonyContext(messages=[], available_tools=[])
|
||||
|
||||
# Create a mock RequestOutput with specific token counts
|
||||
mock_output = create_mock_request_output(
|
||||
prompt_token_ids=[1, 2, 3, 4, 5], # 5 prompt tokens
|
||||
output_token_ids=[6, 7, 8], # 3 output tokens
|
||||
num_cached_tokens=2, # 2 cached tokens
|
||||
)
|
||||
|
||||
# Append the output to the context
|
||||
context.append_output(mock_output)
|
||||
|
||||
# Verify the token counts
|
||||
assert context.num_prompt_tokens == 5
|
||||
assert context.num_output_tokens == 3
|
||||
assert context.num_cached_tokens == 2
|
||||
assert context.num_tool_output_tokens == 0 # No tool tokens in first turn
|
||||
|
||||
# Verify internal state tracking
|
||||
assert not context.is_first_turn
|
||||
assert context.previous_turn.input_tokens == 5
|
||||
assert context.previous_turn.output_tokens == 3
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_multi_turn_token_counting():
|
||||
"""Test token counting behavior across multiple turns with tool output."""
|
||||
# Create a context
|
||||
context = HarmonyContext(messages=[], available_tools=["browser"])
|
||||
|
||||
# Simulate a conversation with 3 turns
|
||||
# Turn 1: prefill 5, decode 3, tool 7
|
||||
# Turn 2: prefill 15, cached 5, decode 4, tool 1
|
||||
# Turn 3: prefill 20, cached 15, decode 5
|
||||
prompt_token_counts = [5, 15, 20]
|
||||
output_token_counts = [3, 4, 5]
|
||||
cached_token_counts = [0, 5, 15]
|
||||
mock_generator = generate_mock_outputs(3, prompt_token_counts,
|
||||
output_token_counts,
|
||||
cached_token_counts)
|
||||
|
||||
# First turn - initial prompt and response
|
||||
mock_output1 = await async_next(mock_generator)
|
||||
context.append_output(mock_output1)
|
||||
|
||||
# At this point, we should have 5 prompt tokens and 3 output tokens
|
||||
assert context.num_prompt_tokens == 5
|
||||
assert context.num_output_tokens == 3
|
||||
assert context.num_tool_output_tokens == 0
|
||||
|
||||
# Second turn - after tool output
|
||||
mock_output2 = await async_next(mock_generator)
|
||||
context.append_output(mock_output2)
|
||||
# Current prompt tokens (15) - last_turn_input_tokens (5) -
|
||||
# last_turn_output_tokens (3) = 7
|
||||
expected_tool_output = 7
|
||||
|
||||
assert context.num_prompt_tokens == 5 + 15
|
||||
assert context.num_output_tokens == 3 + 4
|
||||
assert context.num_tool_output_tokens == expected_tool_output
|
||||
assert context.num_cached_tokens == 5
|
||||
|
||||
# Third turn - final response
|
||||
mock_output3 = await async_next(mock_generator)
|
||||
context.append_output(mock_output3)
|
||||
# Additional tool output tokens from third turn:
|
||||
# Current prompt (20) - last_turn_input_tokens (15) -
|
||||
# last_turn_output_tokens (4) = 1
|
||||
expected_tool_output = 7 + 1
|
||||
|
||||
assert context.num_prompt_tokens == 5 + 15 + 20
|
||||
assert context.num_output_tokens == 3 + 4 + 5
|
||||
assert context.num_tool_output_tokens == expected_tool_output
|
||||
assert context.num_cached_tokens == 5 + 15
|
||||
|
||||
|
||||
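The tool-token bookkeeping asserted above can be summarized in a few lines of plain Python. This is only a sketch of the arithmetic spelled out in the comments, not the actual HarmonyContext implementation:

def tool_output_tokens(curr_prompt_tokens, prev_prompt_tokens, prev_output_tokens):
    # Prompt tokens that were neither in the previous prompt nor produced by the
    # model in the previous turn are attributed to tool output; negative values
    # are clamped to 0 (and logged), as exercised by
    # test_negative_tool_tokens_edge_case below.
    return max(curr_prompt_tokens - prev_prompt_tokens - prev_output_tokens, 0)

# Turn 2 above: 15 - 5 - 3 == 7 tool tokens; turn 3 adds 20 - 15 - 4 == 1 more.
assert tool_output_tokens(15, 5, 3) == 7
assert tool_output_tokens(20, 15, 4) == 1
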
def test_empty_output_tokens():
|
||||
"""Test behavior when RequestOutput has empty output tokens."""
|
||||
context = HarmonyContext(messages=[], available_tools=[])
|
||||
|
||||
# Create a RequestOutput with empty output tokens
|
||||
mock_output = create_mock_request_output(
|
||||
prompt_token_ids=[1, 2, 3], # 3 prompt tokens
|
||||
output_token_ids=[], # Empty output tokens list
|
||||
num_cached_tokens=1,
|
||||
)
|
||||
|
||||
context.append_output(mock_output)
|
||||
|
||||
# Should handle empty outputs gracefully
|
||||
assert context.num_prompt_tokens == 3
|
||||
assert context.num_output_tokens == 0 # No output tokens
|
||||
assert context.num_cached_tokens == 1
|
||||
assert context.num_tool_output_tokens == 0
|
||||
|
||||
|
||||
def test_missing_prompt_token_ids():
|
||||
"""Test behavior when RequestOutput has None prompt_token_ids."""
|
||||
context = HarmonyContext(messages=[], available_tools=[])
|
||||
|
||||
mock_output = create_mock_request_output(
|
||||
prompt_token_ids=None, # No prompt token IDs
|
||||
output_token_ids=[1, 2], # 2 output tokens
|
||||
num_cached_tokens=0,
|
||||
)
|
||||
|
||||
# Logger.error will be called, but we don't need to check for warnings
|
||||
# here. Just ensure it doesn't raise an exception.
|
||||
context.append_output(mock_output)
|
||||
|
||||
# Should handle missing prompt tokens gracefully
|
||||
assert context.num_prompt_tokens == 0
|
||||
assert context.num_output_tokens == 2
|
||||
assert context.num_cached_tokens == 0
|
||||
assert context.num_tool_output_tokens == 0
|
||||
|
||||
|
||||
def test_reasoning_tokens_counting(mock_parser):
|
||||
"""Test that reasoning tokens are counted correctly."""
|
||||
context = HarmonyContext(messages=[], available_tools=[])
|
||||
|
||||
# Mock parser to simulate reasoning channel
|
||||
mock_parser.current_channel = "analysis" # Reasoning channel
|
||||
|
||||
mock_output = create_mock_request_output(
|
||||
prompt_token_ids=[1, 2, 3],
|
||||
output_token_ids=[4, 5, 6, 7], # 4 tokens, all in reasoning
|
||||
num_cached_tokens=0,
|
||||
)
|
||||
|
||||
context.append_output(mock_output)
|
||||
|
||||
# All output tokens should be counted as reasoning
|
||||
assert context.num_reasoning_tokens == 4
|
||||
assert context.num_output_tokens == 4
|
||||
|
||||
|
||||
def test_zero_tokens_edge_case():
|
||||
"""Test behavior with all zero token counts."""
|
||||
context = HarmonyContext(messages=[], available_tools=[])
|
||||
|
||||
# Create a request with empty lists (not None) for both prompt and
|
||||
# output tokens
|
||||
mock_output = create_mock_request_output(
|
||||
prompt_token_ids=[], # Empty prompt tokens
|
||||
output_token_ids=[], # Empty output tokens
|
||||
num_cached_tokens=0,
|
||||
)
|
||||
|
||||
context.append_output(mock_output)
|
||||
|
||||
# All counts should be zero
|
||||
assert context.num_prompt_tokens == 0
|
||||
assert context.num_output_tokens == 0
|
||||
assert context.num_cached_tokens == 0
|
||||
assert context.num_tool_output_tokens == 0
|
||||
assert context.num_reasoning_tokens == 0
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_single_turn_no_tool_output():
|
||||
"""Test that first turn never generates tool output tokens."""
|
||||
context = HarmonyContext(
|
||||
messages=[],
|
||||
available_tools=["browser"] # Tools available
|
||||
)
|
||||
|
||||
# Even with large prompt in first turn, no tool tokens should be counted
|
||||
mock_output = create_mock_request_output(
|
||||
prompt_token_ids=list(range(100)), # 100 tokens
|
||||
output_token_ids=[1, 2, 3],
|
||||
num_cached_tokens=0,
|
||||
)
|
||||
|
||||
context.append_output(mock_output)
|
||||
|
||||
# First turn should never have tool output tokens
|
||||
assert context.num_tool_output_tokens == 0
|
||||
assert context.is_first_turn is False # Should be updated after first turn
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_negative_tool_tokens_edge_case():
|
||||
"""Test edge case where calculation could result in negative tool
|
||||
tokens. We should log an error and clamp the value to 0."""
|
||||
# Use patch to check if logger.error was called
|
||||
with patch("vllm.entrypoints.context.logger.error") as mock_log:
|
||||
context = HarmonyContext(messages=[], available_tools=["browser"])
|
||||
|
||||
# First turn
|
||||
mock_output1 = create_mock_request_output(
|
||||
prompt_token_ids=list(range(10)), # 10 tokens
|
||||
output_token_ids=[1, 2, 3, 4, 5], # 5 tokens
|
||||
)
|
||||
context.append_output(mock_output1)
|
||||
|
||||
# Second turn with fewer new tokens than previous output
|
||||
# This could happen in edge cases with aggressive caching
|
||||
mock_output2 = create_mock_request_output(
|
||||
prompt_token_ids=list(range(12)), # 12 tokens (only 2 new)
|
||||
output_token_ids=[6, 7], # 2 tokens
|
||||
)
|
||||
context.append_output(mock_output2)
|
||||
|
||||
# Calculated negative tool tokens (12 - 10 - 5 = -3) should be clamped
|
||||
# to 0 and an error should be logged
|
||||
assert context.num_tool_output_tokens == 0
|
||||
assert context.num_prompt_tokens == 10 + 12
|
||||
assert context.num_output_tokens == 5 + 2
|
||||
|
||||
# Verify the error was logged properly
|
||||
mock_log.assert_called_once()
|
||||
|
||||
# Extract the actual log message and arguments from the call
|
||||
args, _ = mock_log.call_args
|
||||
log_message = args[0]
|
||||
|
||||
# Check for key parts of the message
|
||||
assert "Negative tool output tokens calculated" in log_message
|
||||
assert "-3" in str(args) # Check that -3 is in the arguments
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_streaming_multi_turn_token_counting(mock_parser):
|
||||
"""Test token counting for streaming multi-turn conversations.
|
||||
|
||||
This test focuses on how StreamingHarmonyContext counts tokens in a
|
||||
multi-turn conversation with streaming (token-by-token) outputs and
|
||||
message boundaries.
|
||||
"""
|
||||
# Create a streaming context
|
||||
context = StreamingHarmonyContext(messages=[], available_tools=["browser"])
|
||||
|
||||
# Simulate three turns of conversation:
|
||||
# Turn 1: stream tokens one by one, then finish the message
|
||||
# Turn 2: new prompt, stream more tokens with a reasoning segment
|
||||
# Turn 3: new prompt with tool output and cached tokens
|
||||
|
||||
# First turn: 3 tokens streamed one by one
|
||||
# First token of first turn
|
||||
context.append_output(
|
||||
create_mock_request_output(
|
||||
prompt_token_ids=[1, 2, 3], # 3 prompt tokens
|
||||
output_token_ids=[101], # Single token
|
||||
num_cached_tokens=0,
|
||||
finished=False, # Not end of message yet
|
||||
))
|
||||
|
||||
# Second token of first turn
|
||||
context.append_output(
|
||||
create_mock_request_output(
|
||||
output_token_ids=[102],
|
||||
finished=False,
|
||||
))
|
||||
|
||||
# Last token of first turn (finished=True signals end of message)
|
||||
context.append_output(
|
||||
create_mock_request_output(
|
||||
output_token_ids=[103],
|
||||
finished=True, # End of message
|
||||
))
|
||||
|
||||
# Check token counts after first turn
|
||||
assert context.num_prompt_tokens == 3 # Initial prompt tokens
|
||||
assert context.num_output_tokens == 3 # Three output tokens
|
||||
assert context.num_cached_tokens == 0
|
||||
assert context.num_tool_output_tokens == 0 # No tool output in first turn
|
||||
assert context.first_tok_of_message is True # Ready for next message
|
||||
|
||||
# Second turn: reasoning tokens in analysis channel
|
||||
mock_parser.current_channel = "analysis" # Set to reasoning channel
|
||||
|
||||
# First token of second turn
|
||||
context.append_output(
|
||||
create_mock_request_output(
|
||||
prompt_token_ids=[1, 2, 3, 101, 102, 103, 4,
|
||||
5], # 8 tokens (includes previous)
|
||||
output_token_ids=[201],
|
||||
num_cached_tokens=3, # Some tokens cached
|
||||
finished=False,
|
||||
))
|
||||
|
||||
# More tokens in reasoning channel
|
||||
context.append_output(
|
||||
create_mock_request_output(
|
||||
output_token_ids=[202],
|
||||
finished=False,
|
||||
))
|
||||
|
||||
context.append_output(
|
||||
create_mock_request_output(
|
||||
output_token_ids=[203],
|
||||
finished=True, # End of reasoning message
|
||||
))
|
||||
|
||||
# Check counts after second turn (reasoning message)
|
||||
assert context.num_prompt_tokens == 3 + 8 # Initial + second prompt
|
||||
assert context.num_output_tokens == 3 + 3 # First turn + second turn
|
||||
assert context.num_reasoning_tokens == 3 # All tokens in analysis channel
|
||||
assert context.num_cached_tokens == 3 # Cached tokens from second turn
|
||||
|
||||
# Formula: this turn prompt tokens - last turn prompt - last turn output
|
||||
expected_tool_tokens = 8 - 3 - 3 # = 2
|
||||
assert context.num_tool_output_tokens == expected_tool_tokens
|
||||
|
||||
# Third turn: regular output channel
|
||||
mock_parser.current_channel = "final" # Switch back to regular channel
|
||||
|
||||
# Third turn (with more cached tokens)
|
||||
context.append_output(
|
||||
create_mock_request_output(
|
||||
prompt_token_ids=[
|
||||
1, 2, 3, 101, 102, 103, 4, 5, 201, 202, 203, 6, 7
|
||||
], # 13 tokens
|
||||
output_token_ids=[301],
|
||||
num_cached_tokens=8, # More cached tokens
|
||||
finished=False,
|
||||
))
|
||||
|
||||
context.append_output(
|
||||
create_mock_request_output(
|
||||
output_token_ids=[302],
|
||||
finished=True,
|
||||
))
|
||||
|
||||
# Final token counts check
|
||||
assert context.num_prompt_tokens == 3 + 8 + 13 # All prompts
|
||||
assert context.num_output_tokens == 3 + 3 + 2 # All outputs
|
||||
assert context.num_reasoning_tokens == 3 # Unchanged from second turn
|
||||
assert context.num_cached_tokens == 3 + 8 # Accumulated cached tokens
|
||||
|
||||
# Additional tool tokens from third turn
|
||||
# Formula: this turn prompt - last turn prompt - last turn output
|
||||
additional_tool_tokens = 13 - 8 - 3 # = 2
|
||||
assert context.num_tool_output_tokens == expected_tool_tokens \
|
||||
+ additional_tool_tokens
|
||||
@ -130,6 +130,23 @@ class TestRenderPrompt:
|
||||
assert call_args.kwargs["truncation"] is True
|
||||
assert call_args.kwargs["max_length"] == 50
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_truncation_negative(self, renderer, mock_async_tokenizer):
|
||||
# Test that negative truncation uses model's max_model_len
|
||||
mock_async_tokenizer.return_value = MockTokenizerResult(
|
||||
[101, 7592, 2088]) # Truncated to max_model_len
|
||||
renderer.async_tokenizer_pool[
|
||||
renderer.tokenizer] = mock_async_tokenizer
|
||||
|
||||
results = await renderer.render_prompt(prompt_or_prompts="Hello world",
|
||||
max_length=200,
|
||||
truncate_prompt_tokens=-1)
|
||||
|
||||
assert len(results) == 1
|
||||
call_args = mock_async_tokenizer.call_args
|
||||
assert call_args.kwargs["truncation"] is True
|
||||
assert call_args.kwargs["max_length"] == 100 # model's max_model_len
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_token_truncation_last_elements(self, renderer):
|
||||
# Test that token truncation keeps the last N elements
|
||||
|
||||
@ -115,21 +115,27 @@ def generate_continuous_batched_examples(example_lens_by_batch,
|
||||
n_heads,
|
||||
d_head,
|
||||
itype,
|
||||
device='cuda'):
|
||||
device='cuda',
|
||||
return_naive_ref=True):
|
||||
|
||||
# this function generates random examples of a certain length
|
||||
# and then cut according to "example_lens_by_batch" and feed
|
||||
# them in continuous batches to the kernels
|
||||
# them in continuous batches to the kernels.
|
||||
# If return_naive_ref=True, the naive torch implementation
|
||||
# ssd_minimal_discrete will be used to compute and return
|
||||
# reference output.
|
||||
|
||||
# generate the full-length example
|
||||
A, dt, X, B, C = generate_random_inputs(num_examples, full_length, n_heads,
|
||||
d_head, itype)
|
||||
|
||||
Y_min, final_state_min = ssd_minimal_discrete(X * dt.unsqueeze(-1),
|
||||
A * dt,
|
||||
B,
|
||||
C,
|
||||
block_len=full_length // 4)
|
||||
if return_naive_ref:
|
||||
Y_min, final_state_min = ssd_minimal_discrete(X * dt.unsqueeze(-1),
|
||||
A * dt,
|
||||
B,
|
||||
C,
|
||||
block_len=full_length //
|
||||
4)
|
||||
|
||||
# internal function that outputs a cont batch of examples
|
||||
# given a tuple of lengths for each example in the batch
|
||||
@ -179,7 +185,8 @@ def generate_continuous_batched_examples(example_lens_by_batch,
|
||||
IND_S = [x % full_length for x in IND_E]
|
||||
IND_E = [end_boundary(x + y) for x, y in zip(IND_S, spec)]
|
||||
|
||||
yield ([Y_min[s, IND_S[s]:IND_E[s]] for s in range(num_examples)],
|
||||
yield ([Y_min[s, IND_S[s]:IND_E[s]]
|
||||
for s in range(num_examples)] if return_naive_ref else None,
|
||||
cu_seqlens, seq_idx.unsqueeze(0), (A, dt2, X2, B2, C2))
|
||||
|
||||
|
||||
@ -324,3 +331,213 @@ def test_mamba_chunk_scan_cont_batch(d_head, n_heads, seq_len_chunk_size_cases,
|
||||
if clear:
|
||||
states[i].fill_(0.)
|
||||
exhausted[i] = False
|
||||
|
||||
|
||||
@pytest.mark.parametrize("chunk_size", [8, 256])
|
||||
@pytest.mark.parametrize("seqlens", [
|
||||
(16, 2, 8, 13),
|
||||
(270, 88, 212, 203),
|
||||
(16, 20),
|
||||
])
|
||||
def test_mamba_chunk_scan_cont_batch_prefill_chunking(chunk_size, seqlens):
|
||||
|
||||
# This test verifies the correctness of the chunked prefill implementation
|
||||
# in the mamba2 ssd kernels, by comparing concatenation (in the sequence
|
||||
# dimension) of chunked results with the full sequence result.
|
||||
# It is different from test_mamba_chunk_scan_cont_batch by:
|
||||
# 1. Not using the naive torch implementation (ssd_minimal_discrete) to get
|
||||
# reference outputs. Instead, it compares chunked kernel outputs to full
|
||||
# sequence kernel outputs. This is the most straightforward way to
|
||||
# assert chunked prefill correctness.
|
||||
# 2. It focuses on cases where sequences change in the middle of mamba
|
||||
# chunks, and not necessarily on chunk boundaries.
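# In outline, the comparison below proceeds as follows:
#   1. run mamba_chunk_scan_combined on the full sequences -> Y_ref, state_ref
#   2. run it again on only the first half of every sequence
#      -> Y_partial, partial_state
#   3. run it on the remaining halves with initial_states=partial_state
#      -> Y_chunked, state_chunked
#   4. concatenate Y_partial / Y_chunked per sequence and compare against
#      Y_ref and state_ref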
|
||||
|
||||
max_seqlen = max(seqlens)
|
||||
# This test can have larger error for longer sequences
|
||||
if max_seqlen > 256:
|
||||
atol, rtol = 1e-2, 5e-3
|
||||
else:
|
||||
atol, rtol = 5e-3, 5e-3
|
||||
|
||||
num_sequences = len(seqlens)
|
||||
n_heads = 16
|
||||
d_head = 64
|
||||
itype = torch.float32
|
||||
|
||||
# hold state during the cutting process so we know if an
|
||||
# example has been exhausted and needs to cycle
|
||||
last_taken: dict = {} # map: eg -> pointer to last taken sample
|
||||
exhausted: dict = {} # map: eg -> boolean indicating example is exhausted
|
||||
_, cu_seqlens, seq_idx, (A, dt, X, B, C) = next(
|
||||
generate_continuous_batched_examples([seqlens],
|
||||
num_sequences,
|
||||
max_seqlen,
|
||||
last_taken,
|
||||
exhausted,
|
||||
n_heads,
|
||||
d_head,
|
||||
itype,
|
||||
return_naive_ref=False))
|
||||
seqlens = torch.tensor(seqlens, dtype=torch.int32, device=X.device)
|
||||
device = X.device
|
||||
|
||||
## full seqlen computation
|
||||
chunk_indices, chunk_offsets = \
|
||||
_query_start_loc_to_chunk_indices_offsets(
|
||||
cu_seqlens, chunk_size, cu_seqlens[-1])
|
||||
Y_ref = torch.empty_like(X)
|
||||
state_ref = mamba_chunk_scan_combined(
|
||||
X,
|
||||
dt,
|
||||
A,
|
||||
B,
|
||||
C,
|
||||
chunk_size,
|
||||
D=None,
|
||||
cu_seqlens=cu_seqlens,
|
||||
seq_idx=seq_idx,
|
||||
chunk_indices=chunk_indices,
|
||||
chunk_offsets=chunk_offsets,
|
||||
return_varlen_states=True,
|
||||
initial_states=None,
|
||||
out=Y_ref,
|
||||
)
|
||||
|
||||
## chunked seqlen computation
|
||||
# first chunk
|
||||
chunked_seqlens = seqlens // 2
|
||||
chunked_cu_seqlens = torch.cat([
|
||||
torch.tensor([0], device=device),
|
||||
torch.cumsum(chunked_seqlens, dim=0)
|
||||
],
|
||||
dim=0)
|
||||
chunked_seq_idx = torch.repeat_interleave(
|
||||
torch.arange(len(chunked_seqlens), device=device),
|
||||
chunked_seqlens,
|
||||
output_size=chunked_cu_seqlens[-1]).unsqueeze(0).to(torch.int32)
|
||||
chunked_input_seq_len = chunked_cu_seqlens[-1]
|
||||
X_chunked = torch.zeros_like(X)[:, :chunked_input_seq_len, ...]
|
||||
dt_chunked = torch.zeros_like(dt)[:, :chunked_input_seq_len, ...]
|
||||
B_chunked = torch.zeros_like(B)[:, :chunked_input_seq_len, ...]
|
||||
C_chunked = torch.zeros_like(C)[:, :chunked_input_seq_len, ...]
|
||||
for i in range(num_sequences):
|
||||
# fmt: off
|
||||
chunk_f = lambda x, i: x[:, cu_seqlens[i]:cu_seqlens[i] + chunked_seqlens[i], ...] # noqa: E501
|
||||
|
||||
X_chunked[:, chunked_cu_seqlens[i]:chunked_cu_seqlens[i+1], ...] = chunk_f(X, i) # noqa: E501
|
||||
dt_chunked[:, chunked_cu_seqlens[i]:chunked_cu_seqlens[i+1], ...] = chunk_f(dt, i) # noqa: E501
|
||||
B_chunked[:, chunked_cu_seqlens[i]:chunked_cu_seqlens[i+1], ...] = chunk_f(B, i) # noqa: E501
|
||||
C_chunked[:, chunked_cu_seqlens[i]:chunked_cu_seqlens[i+1], ...] = chunk_f(C, i) # noqa: E501
|
||||
# fmt: on
|
||||
|
||||
chunk_indices, chunk_offsets = \
|
||||
_query_start_loc_to_chunk_indices_offsets(
|
||||
chunked_cu_seqlens, chunk_size, chunked_cu_seqlens[-1])
|
||||
Y_partial = torch.empty_like(X_chunked)
|
||||
partial_state = mamba_chunk_scan_combined(
|
||||
X_chunked,
|
||||
dt_chunked,
|
||||
A,
|
||||
B_chunked,
|
||||
C_chunked,
|
||||
chunk_size,
|
||||
D=None,
|
||||
cu_seqlens=chunked_cu_seqlens,
|
||||
seq_idx=chunked_seq_idx,
|
||||
chunk_indices=chunk_indices,
|
||||
chunk_offsets=chunk_offsets,
|
||||
return_varlen_states=True,
|
||||
initial_states=None,
|
||||
out=Y_partial,
|
||||
)
|
||||
|
||||
# remaining chunk
|
||||
remaining_chunked_seqlens = seqlens - chunked_seqlens
|
||||
remaining_chunked_cu_seqlens = torch.cat([
|
||||
torch.tensor([0], device=device),
|
||||
torch.cumsum(remaining_chunked_seqlens, dim=0)
|
||||
],
|
||||
dim=0)
|
||||
remaining_chunked_seq_idx = torch.repeat_interleave(
|
||||
torch.arange(len(remaining_chunked_seqlens), device=device),
|
||||
remaining_chunked_seqlens,
|
||||
output_size=remaining_chunked_cu_seqlens[-1]).unsqueeze(0).to(
|
||||
torch.int32)
|
||||
remaining_chunked_input_seq_len = remaining_chunked_cu_seqlens[-1]
|
||||
# fmt: off
|
||||
remaining_X_chunked = torch.zeros_like(X)[:, :remaining_chunked_input_seq_len, ...] # noqa: E501
|
||||
remaining_dt_chunked = torch.zeros_like(dt)[:, :remaining_chunked_input_seq_len, ...] # noqa: E501
|
||||
remaining_B_chunked = torch.zeros_like(B)[:, :remaining_chunked_input_seq_len, ...] # noqa: E501
|
||||
remaining_C_chunked = torch.zeros_like(C)[:, :remaining_chunked_input_seq_len, ...] # noqa: E501
|
||||
for i in range(num_sequences):
|
||||
remaining_chunk_f = lambda x, i: x[:, cu_seqlens[i] + chunked_seqlens[i]:cu_seqlens[i+1], ...] # noqa: E501
|
||||
|
||||
remaining_X_chunked[:, remaining_chunked_cu_seqlens[i]:remaining_chunked_cu_seqlens[i+1], ...] = remaining_chunk_f(X, i) # noqa: E501
|
||||
remaining_dt_chunked[:, remaining_chunked_cu_seqlens[i]:remaining_chunked_cu_seqlens[i+1], ...] = remaining_chunk_f(dt, i) # noqa: E501
|
||||
remaining_B_chunked[:, remaining_chunked_cu_seqlens[i]:remaining_chunked_cu_seqlens[i+1], ...] = remaining_chunk_f(B, i) # noqa: E501
|
||||
remaining_C_chunked[:, remaining_chunked_cu_seqlens[i]:remaining_chunked_cu_seqlens[i+1], ...] = remaining_chunk_f(C, i) # noqa: E501
|
||||
|
||||
# assert input chunking is correct
|
||||
concat_chunk_f = lambda pt1, pt2, i: torch.cat([
|
||||
pt1[:,chunked_cu_seqlens[i]:chunked_cu_seqlens[i+1],...],
|
||||
pt2[:,remaining_chunked_cu_seqlens[i]:remaining_chunked_cu_seqlens[i+1],...],
|
||||
],
|
||||
dim=1)
|
||||
concat_batch_f = lambda pt1, pt2: torch.cat([concat_chunk_f(pt1, pt2, i) for i in range(num_sequences)], dim=1) # noqa: E501
|
||||
# fmt: on
|
||||
|
||||
assert concat_batch_f(X_chunked, remaining_X_chunked).equal(X)
|
||||
assert concat_batch_f(dt_chunked, remaining_dt_chunked).equal(dt)
|
||||
assert concat_batch_f(B_chunked, remaining_B_chunked).equal(B)
|
||||
assert concat_batch_f(C_chunked, remaining_C_chunked).equal(C)
|
||||
|
||||
chunk_indices, chunk_offsets = \
|
||||
_query_start_loc_to_chunk_indices_offsets(
|
||||
remaining_chunked_cu_seqlens,
|
||||
chunk_size,
|
||||
remaining_chunked_cu_seqlens[-1])
|
||||
|
||||
Y_chunked = torch.empty_like(remaining_X_chunked)
|
||||
state_chunked = mamba_chunk_scan_combined(
|
||||
remaining_X_chunked,
|
||||
remaining_dt_chunked,
|
||||
A,
|
||||
remaining_B_chunked,
|
||||
remaining_C_chunked,
|
||||
chunk_size,
|
||||
D=None,
|
||||
cu_seqlens=remaining_chunked_cu_seqlens,
|
||||
seq_idx=remaining_chunked_seq_idx,
|
||||
chunk_indices=chunk_indices,
|
||||
chunk_offsets=chunk_offsets,
|
||||
return_varlen_states=True,
|
||||
initial_states=partial_state,
|
||||
out=Y_chunked,
|
||||
)
|
||||
Y = concat_batch_f(Y_partial, Y_chunked)
|
||||
|
||||
# kernel chunked is same as kernel overall
|
||||
for i in range(num_sequences):
|
||||
Y_seq = Y[:, cu_seqlens[i]:cu_seqlens[i + 1], ...]
|
||||
Y_ref_seq = Y_ref[:, cu_seqlens[i]:cu_seqlens[i + 1], ...]
|
||||
torch.testing.assert_close(
|
||||
Y_seq[:, :chunked_seqlens[i], ...],
|
||||
Y_ref_seq[:, :chunked_seqlens[i], ...],
|
||||
atol=atol,
|
||||
rtol=rtol,
|
||||
msg=lambda x: f"seq{i} output part1 " + x) # noqa: B023
|
||||
torch.testing.assert_close(
|
||||
Y_seq[:, chunked_seqlens[i]:, ...],
|
||||
Y_ref_seq[:, chunked_seqlens[i]:, ...],
|
||||
atol=atol,
|
||||
rtol=rtol,
|
||||
msg=lambda x: f"seq{i} output part2 " + x) # noqa: B023
|
||||
|
||||
state_seq = state_chunked[i]
|
||||
state_seq_ref = state_ref[i]
|
||||
torch.testing.assert_close(
|
||||
state_seq,
|
||||
state_seq_ref,
|
||||
atol=atol,
|
||||
rtol=rtol,
|
||||
msg=lambda x: f"seq{i} state " + x) # noqa: B023
|
||||
|
||||
@ -371,8 +371,8 @@ def test_fused_moe_wn16(m: int, n: int, k: int, e: int, topk: int,
|
||||
@pytest.mark.parametrize(
|
||||
"use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False])
|
||||
@torch.inference_mode()
|
||||
def test_mixtral_moe(dtype: torch.dtype, padding: bool, use_rocm_aiter: bool,
|
||||
monkeypatch):
|
||||
def test_mixtral_moe(dist_init, dtype: torch.dtype, padding: bool,
|
||||
use_rocm_aiter: bool, monkeypatch):
|
||||
"""Make sure our Mixtral MoE implementation agrees with the one from
|
||||
huggingface."""
|
||||
|
||||
|
||||
@ -2,6 +2,7 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import torch
|
||||
|
||||
from vllm._custom_ops import scaled_fp4_quant
|
||||
from vllm.scalar_type import scalar_types
|
||||
|
||||
FLOAT4_E2M1_MAX = scalar_types.float4_e2m1f.max()
|
||||
@ -65,3 +66,10 @@ def break_fp4_bytes(a, dtype):
|
||||
|
||||
# Reshape to final form
|
||||
return values.reshape(m, n * 2).to(dtype=dtype)
|
||||
|
||||
|
||||
def quant_nvfp4_tensor(a: torch.Tensor):
|
||||
a_global_scale = ((FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX) /
|
||||
torch.abs(a).max().to(torch.float32))
|
||||
a_quant, a_block_scale = scaled_fp4_quant(a, a_global_scale)
|
||||
return a_quant, a_block_scale, a_global_scale
|
||||
|
||||
@ -8,8 +8,7 @@ from vllm.model_executor.layers.activation import SiluAndMul
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.scalar_type import scalar_types
|
||||
|
||||
if not (current_platform.has_device_capability(100)
|
||||
and hasattr(torch.ops._C, "silu_and_mul_nvfp4_quant")):
|
||||
if not current_platform.has_device_capability(100):
|
||||
pytest.skip(reason="Nvfp4 Requires compute capability of 10 or above.",
|
||||
allow_module_level=True)
|
||||
|
||||
|
||||
@ -2,6 +2,7 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import math
|
||||
import random
|
||||
from typing import Optional
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
@ -14,14 +15,20 @@ from vllm.triton_utils import triton
|
||||
def cal_diff(x: torch.Tensor,
|
||||
y: torch.Tensor,
|
||||
name: str,
|
||||
use_fp8: bool = False) -> None:
|
||||
use_fp8: bool = False,
|
||||
diff_threshold: Optional[float] = None) -> None:
|
||||
x, y = x.double(), y.double()
|
||||
cos_diff = 1 - 2 * (x * y).sum().item() / max(
|
||||
(x * x + y * y).sum().item(), 1e-12)
|
||||
if (use_fp8):
|
||||
assert cos_diff < 1e-4
|
||||
if diff_threshold is not None:
|
||||
# directly compare the cos_diff with the threshold
|
||||
assert cos_diff < diff_threshold
|
||||
else:
|
||||
assert cos_diff < 1e-5
|
||||
# use the default threshold
|
||||
if (use_fp8):
|
||||
assert cos_diff < 1e-4
|
||||
else:
|
||||
assert cos_diff < 1e-5
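As a quick illustration of the new parameter (hypothetical tensors; the real call site with diff_threshold=1e-3 appears in the MLA test below), an explicit threshold bypasses the fp8-dependent defaults:

x = torch.randn(4, 8)
y = x + 1e-4 * torch.randn(4, 8)           # small perturbation
cal_diff(x, y, "out")                       # default bound: 1e-4 if use_fp8 else 1e-5
cal_diff(x, y, "lse", diff_threshold=1e-3)  # explicit, looser bound for noisier outputs
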
|
||||
|
||||
|
||||
CUTLASS_MLA_UNSUPPORTED_REASON = \
|
||||
@ -118,11 +125,13 @@ def test_cutlass_mla_decode(b, s_q, mean_sk, h_q, h_kv, d, dv, block_size,
|
||||
dtype=torch.uint8)
|
||||
|
||||
out_ans = torch.empty(b, MAX_HEADS, dv, dtype=init_dtype)
|
||||
|
||||
ops.sm100_cutlass_mla_decode(out_ans, q_nope, q_pe, kv_cache_flat,
|
||||
cache_seqlens, block_table, workspace,
|
||||
scale, 1)
|
||||
return out_ans[:, :h_q].contiguous()
|
||||
output_lse = torch.empty((b, MAX_HEADS),
|
||||
dtype=torch.float32,
|
||||
device=q_nope.device)
|
||||
ops.sm100_cutlass_mla_decode(out_ans, output_lse, q_nope, q_pe,
|
||||
kv_cache_flat, cache_seqlens, block_table,
|
||||
workspace, scale, 1)
|
||||
return out_ans[:, :h_q].contiguous(), output_lse[:, :h_q].contiguous()
|
||||
|
||||
def scaled_dot_product_attention(query, key, value, is_causal=False):
|
||||
query = query.float()
|
||||
@ -165,11 +174,14 @@ def test_cutlass_mla_decode(b, s_q, mean_sk, h_q, h_kv, d, dv, block_size,
|
||||
lse[i] = lse_i
|
||||
return out, lse
|
||||
|
||||
out_cutlass = cutlass_mla()
|
||||
out_cutlass, lse_cutlass = cutlass_mla()
|
||||
out_torch, lse_torch = ref_mla()
|
||||
# Extract the single token (s_q=1) slice to match cutlass output shape
|
||||
out_torch_slice = out_torch[:, 0, :, :] # [b, h_q, dv]
|
||||
lse_torch_slice = lse_torch[:, 0, :] # [b, h_q]
|
||||
cal_diff(out_cutlass, out_torch_slice, "out", use_fp8)
|
||||
# lse has larger numerical error, so use a larger threshold
|
||||
cal_diff(lse_cutlass, lse_torch_slice, "lse", use_fp8, diff_threshold=1e-3)
|
||||
|
||||
t = triton.testing.do_bench(cutlass_mla)
|
||||
FLOPS = s_q * total_seqlens * h_q * (d + dv) * 2
|
||||
|
||||
@ -60,9 +60,9 @@ DEVICES = ([
|
||||
# prefill stage(True) or decode stage(False)
|
||||
STAGES = [True, False]
|
||||
|
||||
NUM_RANDOM_SEEDS = 6
|
||||
NUM_RANDOM_SEEDS = 2
|
||||
|
||||
VOCAB_PARALLEL_EMBEDDING_TEST_NUM_RANDOM_SEEDS = 128
|
||||
VOCAB_PARALLEL_EMBEDDING_TEST_NUM_RANDOM_SEEDS = 2
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
|
||||
@ -10,7 +10,8 @@ import numpy as np
|
||||
import pytest
|
||||
import requests
|
||||
|
||||
from tests.models.utils import EmbedModelInfo, RerankModelInfo
|
||||
from tests.models.utils import (EmbedModelInfo, RerankModelInfo,
|
||||
check_embeddings_close)
|
||||
|
||||
# Most embedding models on the STS12 task (See #17175):
|
||||
# - Model implementation and minor changes in tensor dtype
|
||||
@ -163,12 +164,14 @@ def mteb_test_embed_models(hf_runner,
|
||||
model_info: EmbedModelInfo,
|
||||
vllm_extra_kwargs=None,
|
||||
hf_model_callback=None,
|
||||
atol=MTEB_RERANK_TOL):
|
||||
atol=MTEB_EMBED_TOL):
|
||||
if not model_info.enable_test:
|
||||
# A model family has many models with the same architecture,
|
||||
# and we don't need to test each one.
|
||||
pytest.skip("Skipping test.")
|
||||
|
||||
example_prompts = ["The chef prepared a delicious meal."]
|
||||
|
||||
vllm_extra_kwargs = vllm_extra_kwargs or {}
|
||||
vllm_extra_kwargs["dtype"] = model_info.dtype
|
||||
|
||||
@ -191,6 +194,7 @@ def mteb_test_embed_models(hf_runner,
|
||||
vllm_main_score = run_mteb_embed_task(VllmMtebEncoder(vllm_model),
|
||||
MTEB_EMBED_TASKS)
|
||||
vllm_dtype = vllm_model.llm.llm_engine.model_config.dtype
|
||||
vllm_outputs = vllm_model.embed(example_prompts)
|
||||
|
||||
if model_info.mteb_score is None:
|
||||
with hf_runner(model_info.name,
|
||||
@ -202,6 +206,16 @@ def mteb_test_embed_models(hf_runner,
|
||||
|
||||
st_main_score = run_mteb_embed_task(hf_model, MTEB_EMBED_TASKS)
|
||||
st_dtype = next(hf_model.model.parameters()).dtype
|
||||
|
||||
# Test embed_dims and whether to use normalize
|
||||
hf_outputs = hf_model.encode(example_prompts)
|
||||
check_embeddings_close(
|
||||
embeddings_0_lst=hf_outputs,
|
||||
embeddings_1_lst=vllm_outputs,
|
||||
name_0="hf",
|
||||
name_1="vllm",
|
||||
tol=1e-2,
|
||||
)
|
||||
else:
|
||||
st_main_score = model_info.mteb_score
|
||||
st_dtype = "Constant"
|
||||
|
||||
@ -2,7 +2,8 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import pytest
|
||||
|
||||
from ...utils import CLSPoolingEmbedModelInfo, EmbedModelInfo
|
||||
from ...utils import (CLSPoolingEmbedModelInfo, EmbedModelInfo,
|
||||
LASTPoolingEmbedModelInfo)
|
||||
from .mteb_utils import mteb_test_embed_models
|
||||
|
||||
# ST models with projector (Dense) layers
|
||||
@ -13,6 +14,10 @@ ST_PROJECTOR_MODELS = [
|
||||
mteb_score=0.688611955,
|
||||
enable_test=True,
|
||||
),
|
||||
LASTPoolingEmbedModelInfo("google/embeddinggemma-300m",
|
||||
architecture="Gemma3TextModel",
|
||||
mteb_score=0.7473819294684156,
|
||||
enable_test=True)
|
||||
]
|
||||
|
||||
|
||||
|
||||
@ -187,27 +187,19 @@ def test_chat(vllm_runner, max_model_len: int, model: str, dtype: str,
|
||||
name_1="output")
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def prompt(request, local_asset_server) -> TextPrompt:
|
||||
names = request.param
|
||||
urls = [local_asset_server.url_for(n) for n in names]
|
||||
return _create_engine_inputs_hf(urls)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"prompt,expected_ranges",
|
||||
[
|
||||
pytest.param(IMG_URLS[:1], [PlaceholderRange(offset=11, length=494)]),
|
||||
pytest.param(IMG_URLS[1:4], [
|
||||
PlaceholderRange(offset=11, length=266),
|
||||
PlaceholderRange(offset=277, length=1056),
|
||||
PlaceholderRange(offset=1333, length=418)
|
||||
])
|
||||
],
|
||||
)
|
||||
def test_multi_modal_placeholders(vllm_runner, prompt: TextPrompt,
|
||||
"image_urls,expected_ranges",
|
||||
[(IMG_URLS[:1], [PlaceholderRange(offset=11, length=494)]),
|
||||
(IMG_URLS[1:4], [
|
||||
PlaceholderRange(offset=11, length=266),
|
||||
PlaceholderRange(offset=277, length=1056),
|
||||
PlaceholderRange(offset=1333, length=418)
|
||||
])])
|
||||
def test_multi_modal_placeholders(vllm_runner, image_urls: list[str],
|
||||
expected_ranges: list[PlaceholderRange],
|
||||
monkeypatch) -> None:
|
||||
local_asset_server, monkeypatch) -> None:
|
||||
local_image_urls = [local_asset_server.url_for(u) for u in image_urls]
|
||||
prompt = _create_engine_inputs_hf(local_image_urls)
|
||||
|
||||
# This placeholder checking test only works with V0 engine
|
||||
# where `multi_modal_placeholders` is returned with `RequestOutput`
|
||||
|
||||
@ -42,7 +42,7 @@ def run_test(
|
||||
tensor_parallel_size: int = 1,
|
||||
vllm_embeddings: Optional[torch.Tensor] = None,
|
||||
):
|
||||
"""Modality agnostic test test executor for comparing HF/vLLM outputs."""
|
||||
"""Modality agnostic test executor for comparing HF/vLLM outputs."""
|
||||
# In the case of embeddings, vLLM takes separate input tensors
|
||||
vllm_inputs = vllm_embeddings if vllm_embeddings is not None else inputs
|
||||
|
||||
|
||||
@ -285,7 +285,6 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
|
||||
"MistralForCausalLM": _HfExamplesInfo("mistralai/Mistral-7B-Instruct-v0.1"),
|
||||
"MixtralForCausalLM": _HfExamplesInfo("mistralai/Mixtral-8x7B-Instruct-v0.1", # noqa: E501
|
||||
{"tiny": "TitanML/tiny-mixtral"}), # noqa: E501
|
||||
"QuantMixtralForCausalLM": _HfExamplesInfo("mistral-community/Mixtral-8x22B-v0.1-AWQ"), # noqa: E501
|
||||
"MptForCausalLM": _HfExamplesInfo("mpt", is_available_online=False),
|
||||
"MPTForCausalLM": _HfExamplesInfo("mosaicml/mpt-7b"),
|
||||
"NemotronForCausalLM": _HfExamplesInfo("nvidia/Minitron-8B-Base"),
|
||||
@ -352,6 +351,7 @@ _EMBEDDING_EXAMPLE_MODELS = {
|
||||
# [Text-only]
|
||||
"BertModel": _HfExamplesInfo("BAAI/bge-base-en-v1.5"),
|
||||
"Gemma2Model": _HfExamplesInfo("BAAI/bge-multilingual-gemma2"), # noqa: E501
|
||||
"Gemma3TextModel": _HfExamplesInfo("google/embeddinggemma-300m"),
|
||||
"GritLM": _HfExamplesInfo("parasail-ai/GritLM-7B-vllm"),
|
||||
"GteModel": _HfExamplesInfo("Snowflake/snowflake-arctic-embed-m-v2.0",
|
||||
trust_remote_code=True),
|
||||
@ -382,7 +382,7 @@ _EMBEDDING_EXAMPLE_MODELS = {
|
||||
"Phi3VForCausalLM": _HfExamplesInfo("TIGER-Lab/VLM2Vec-Full",
|
||||
trust_remote_code=True),
|
||||
"Qwen2VLForConditionalGeneration": _HfExamplesInfo("MrLight/dse-qwen2-2b-mrl-v1"), # noqa: E501
|
||||
"PrithviGeoSpatialMAE": _HfExamplesInfo("mgazz/Prithvi-EO-2.0-300M-TL-Sen1Floods11", # noqa: E501
|
||||
"PrithviGeoSpatialMAE": _HfExamplesInfo("ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11", # noqa: E501
|
||||
dtype=torch.float16,
|
||||
enforce_eager=True,
|
||||
skip_tokenizer_init=True,
|
||||
@ -390,7 +390,7 @@ _EMBEDDING_EXAMPLE_MODELS = {
|
||||
# going OOM in CI
|
||||
max_num_seqs=32,
|
||||
),
|
||||
"Terratorch": _HfExamplesInfo("mgazz/Prithvi-EO-2.0-300M-TL-Sen1Floods11",
|
||||
"Terratorch": _HfExamplesInfo("ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11", # noqa: E501
|
||||
dtype=torch.float16,
|
||||
enforce_eager=True,
|
||||
skip_tokenizer_init=True,
|
||||
|
||||
@ -11,7 +11,7 @@ from vllm.utils import set_default_torch_num_threads
|
||||
@pytest.mark.parametrize(
|
||||
"model",
|
||||
[
|
||||
"mgazz/Prithvi-EO-2.0-300M-TL-Sen1Floods11",
|
||||
"ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11",
|
||||
"mgazz/Prithvi_v2_eo_300_tl_unet_agb"
|
||||
],
|
||||
)
|
||||
|
||||
@ -1,43 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
|
||||
from vllm.model_executor.layers.activation import FastGELU, SiluAndMul
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
|
||||
@pytest.mark.parametrize("activation", ["silu_and_mul", "gelu_fast"])
|
||||
@pytest.mark.parametrize("num_tokens,d,dtype", [
|
||||
(7, 512, torch.half),
|
||||
(7, 512, torch.float),
|
||||
(83, 512, torch.half),
|
||||
])
|
||||
@torch.inference_mode()
|
||||
def test_act_and_mul(
|
||||
activation: str,
|
||||
num_tokens: int,
|
||||
d: int,
|
||||
dtype: torch.dtype,
|
||||
) -> None:
|
||||
import torch_xla.core.xla_model as xm
|
||||
|
||||
device = xm.xla_device()
|
||||
current_platform.seed_everything(0)
|
||||
torch.set_default_device("cpu")
|
||||
x = torch.randn(num_tokens, 2 * d, dtype=dtype).to(device=device)
|
||||
if activation == "silu_and_mul":
|
||||
layer = SiluAndMul()
|
||||
fn = layer.forward_native
|
||||
elif activation == "gelu_fast":
|
||||
layer = FastGELU()
|
||||
fn = F.gelu
|
||||
else:
|
||||
raise NotImplementedError(
|
||||
f"activation {activation} is not implemented.")
|
||||
assert x.is_xla, "input tensor under testing is expected to be XLA tensor."
|
||||
out = layer.to(device=device).forward_neuron(x)
|
||||
ref_out = fn(x.cpu())
|
||||
torch.testing.assert_close(out.cpu(), ref_out, atol=0.01, rtol=0.0)
|
||||
@ -1,154 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import neuronxcc.nki.language as nl
|
||||
import pytest
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
from neuronxcc import nki
|
||||
|
||||
from vllm.attention.ops.nki_flash_attn import (
|
||||
load_block_tables, transform_block_tables_for_indirect_load)
|
||||
|
||||
|
||||
def is_power_of_2(n):
|
||||
return n > 0 and (n & (n - 1) == 0)
|
||||
|
||||
|
||||
def nki_load_and_transform_block_tables(
|
||||
block_tables,
|
||||
num_tiles,
|
||||
num_blocks_per_tile,
|
||||
num_head,
|
||||
head_id,
|
||||
block_size_tiling_factor,
|
||||
):
|
||||
assert is_power_of_2(
|
||||
num_blocks_per_tile), f"{num_blocks_per_tile=} must be power of 2"
|
||||
block_tables_sbuf = load_block_tables(block_tables, num_tiles,
|
||||
num_blocks_per_tile)
|
||||
|
||||
# we need to pass an Index as head_id
|
||||
head_id = nl.arange(1)[None, :] + head_id
|
||||
|
||||
block_tables_transposed = transform_block_tables_for_indirect_load(
|
||||
block_tables_sbuf, block_size_tiling_factor, num_head, head_id)
|
||||
B_P_SIZE = 128
|
||||
assert block_tables_transposed.shape[1] == B_P_SIZE
|
||||
|
||||
out = nl.ndarray(
|
||||
block_tables_transposed.shape,
|
||||
dtype=nl.int32,
|
||||
buffer=nl.shared_hbm,
|
||||
)
|
||||
for i in nl.affine_range(block_tables_transposed.shape[0]):
|
||||
nl.store(dst=out[i], value=block_tables_transposed[i])
|
||||
return out
|
||||
|
||||
|
||||
def ref_block_tables_transform(
|
||||
block_tables,
|
||||
num_tiles,
|
||||
num_blocks_per_tile,
|
||||
num_head,
|
||||
head_id,
|
||||
block_size_tiling_factor,
|
||||
):
|
||||
assert block_tables.numel() == num_tiles * num_blocks_per_tile
|
||||
block_tables = block_tables.view(num_tiles, num_blocks_per_tile)
|
||||
B_F_SIZE = 128
|
||||
num_tiles_padded = (num_tiles + B_F_SIZE - 1) // B_F_SIZE * B_F_SIZE
|
||||
block_tables = F.pad(
|
||||
block_tables,
|
||||
(0, 0, 0, num_tiles_padded - num_tiles),
|
||||
"constant",
|
||||
0,
|
||||
)
|
||||
|
||||
block_tables = block_tables * num_head + head_id
|
||||
block_tables = block_tables.view(num_tiles_padded, num_blocks_per_tile, 1)
|
||||
offset = torch.arange(0, block_size_tiling_factor).view(1, 1, -1)
|
||||
block_tables = block_tables * block_size_tiling_factor + offset
|
||||
block_tables_transposed = block_tables.view(num_tiles_padded, -1).t()
|
||||
|
||||
num_blocks_per_tile = block_tables_transposed.shape[0]
|
||||
assert num_blocks_per_tile % B_F_SIZE == 0
|
||||
return block_tables_transposed.view(num_blocks_per_tile // B_F_SIZE,
|
||||
B_F_SIZE, num_tiles_padded)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"q_head_per_kv_head,head_id",
|
||||
[
|
||||
(1, 0),
|
||||
(3, 1),
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize(
|
||||
"num_tiles,num_blocks_per_tile",
|
||||
[
|
||||
(1, 1),
|
||||
(13, 16),
|
||||
(17, 128),
|
||||
(35, 512),
|
||||
(128, 128),
|
||||
(130, 64),
|
||||
(280, 256),
|
||||
(315, 1),
|
||||
],
|
||||
)
|
||||
@torch.inference_mode()
|
||||
def test_load_and_transform_block_tables(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
num_tiles,
|
||||
num_blocks_per_tile,
|
||||
q_head_per_kv_head,
|
||||
head_id,
|
||||
) -> None:
|
||||
import torch_xla.core.xla_model as xm
|
||||
|
||||
device = xm.xla_device()
|
||||
|
||||
compiler_flags_str = " ".join([
|
||||
"-O1",
|
||||
"--retry_failed_compilation",
|
||||
])
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("NEURON_CC_FLAGS", compiler_flags_str)
|
||||
|
||||
torch.manual_seed(10000)
|
||||
torch.set_printoptions(sci_mode=False)
|
||||
|
||||
# On Neuron, we need B_P_SIZE = 128 blocks to make DMA efficient
|
||||
B_P_SIZE = 128
|
||||
if num_blocks_per_tile < B_P_SIZE:
|
||||
assert B_P_SIZE % num_blocks_per_tile == 0
|
||||
block_size_tiling_factor = B_P_SIZE // num_blocks_per_tile
|
||||
else:
|
||||
block_size_tiling_factor = 1
|
||||
max_num_blocks = 100000
|
||||
block_tables = torch.randint(
|
||||
0,
|
||||
max_num_blocks,
|
||||
(num_tiles * num_blocks_per_tile, ),
|
||||
dtype=torch.int32,
|
||||
)
|
||||
nki_out = nki.jit(nki_load_and_transform_block_tables)[1, 1](
|
||||
block_tables.to(device=device),
|
||||
num_tiles,
|
||||
num_blocks_per_tile,
|
||||
q_head_per_kv_head,
|
||||
head_id,
|
||||
block_size_tiling_factor,
|
||||
).cpu()
|
||||
ref_out = ref_block_tables_transform(
|
||||
block_tables,
|
||||
num_tiles,
|
||||
num_blocks_per_tile,
|
||||
q_head_per_kv_head,
|
||||
head_id,
|
||||
block_size_tiling_factor,
|
||||
)
|
||||
assert (nki_out.shape == ref_out.shape
|
||||
), f"{nki_out.shape=} != {ref_out.shape=}"
|
||||
assert torch.all(nki_out == ref_out)
|
||||
@ -1,86 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.attention.ops.nki_flash_attn import reshape_and_cache
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"num_tokens, n_kv_head, d_head, num_blocks, block_size",
|
||||
[
|
||||
# Small model configuration (e.g., GPT-2 small)
|
||||
(32, 12, 64, 4, 128), # Typical sequence processing
|
||||
(1, 12, 64, 4, 128), # Single token update
|
||||
(128, 12, 64, 4, 128), # Longer sequence
|
||||
|
||||
# Medium model configuration (e.g., GPT-2 medium)
|
||||
(64, 16, 96, 8, 256), # Standard batch
|
||||
(256, 16, 96, 8, 256), # Large batch
|
||||
|
||||
# Large model configuration (e.g., GPT-3 style)
|
||||
(48, 32, 128, 16, 512), # Typical processing window
|
||||
(512, 32, 128, 16, 512), # Full context window
|
||||
|
||||
# Edge cases and stress tests
|
||||
(1024, 8, 32, 32, 32), # Many tokens, small heads
|
||||
(16, 64, 256, 4, 64), # Few tokens, many heads
|
||||
(2048, 24, 128, 64, 128), # Large scale test
|
||||
|
||||
# Minimal configurations for debugging
|
||||
(4, 2, 16, 2, 16), # Tiny test case
|
||||
(1, 1, 8, 1, 8), # Minimal possible
|
||||
])
|
||||
def test_reshape_and_cache(num_tokens, n_kv_head, d_head, num_blocks,
|
||||
block_size):
|
||||
# Set random seed for reproducibility
|
||||
torch.manual_seed(42)
|
||||
|
||||
# Create CPU tensors for reference implementation
|
||||
key_cpu = torch.randn(num_tokens, n_kv_head, d_head) / torch.sqrt(
|
||||
torch.tensor(d_head))
|
||||
value_cpu = torch.randn(num_tokens, n_kv_head, d_head) / torch.sqrt(
|
||||
torch.tensor(d_head))
|
||||
key_cache_cpu = torch.zeros(num_blocks, n_kv_head, block_size, d_head)
|
||||
value_cache_cpu = torch.zeros(num_blocks, n_kv_head, block_size, d_head)
|
||||
slot_mapping_cpu = torch.randperm(num_blocks * block_size)[:num_tokens]
|
||||
|
||||
# Run reference implementation on CPU
|
||||
block_indices = torch.div(slot_mapping_cpu,
|
||||
block_size,
|
||||
rounding_mode="floor")
|
||||
block_offsets = slot_mapping_cpu % block_size
|
||||
|
||||
for i in range(num_tokens):
|
||||
block_idx = block_indices[i]
|
||||
block_offset = block_offsets[i]
|
||||
key_cache_cpu[block_idx, :, block_offset, :] = key_cpu[i]
|
||||
value_cache_cpu[block_idx, :, block_offset, :] = value_cpu[i]
|
||||
|
||||
# Create XLA device tensors
|
||||
device = torch.device('xla')
|
||||
key = key_cpu.to(device)
|
||||
value = value_cpu.to(device)
|
||||
key_cache = torch.zeros_like(key_cache_cpu, device=device)
|
||||
value_cache = torch.zeros_like(value_cache_cpu, device=device)
|
||||
slot_mapping = slot_mapping_cpu.to(device)
|
||||
kv_cache = torch.stack([key_cache, value_cache])
|
||||
|
||||
# Run vectorized implementation on XLA device
|
||||
reshape_and_cache(key, value, kv_cache, slot_mapping)
|
||||
key_cache, value_cache = torch.unbind(kv_cache, dim=0)
|
||||
|
||||
# Move results back to CPU for comparison
|
||||
key_cache_result = key_cache.cpu()
|
||||
value_cache_result = value_cache.cpu()
|
||||
|
||||
# Assert results match
|
||||
torch.testing.assert_close(key_cache_result,
|
||||
key_cache_cpu,
|
||||
rtol=1e-5,
|
||||
atol=1e-5)
|
||||
torch.testing.assert_close(value_cache_result,
|
||||
value_cache_cpu,
|
||||
rtol=1e-5,
|
||||
atol=1e-5)
|
||||
@ -1,57 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.model_executor.layers.layernorm import RMSNorm
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
|
||||
@pytest.mark.parametrize("num_tokens,hidden_size,add_residual,dtype", [
|
||||
(7, 8, False, torch.half),
|
||||
(83, 768, False, torch.half),
|
||||
(83, 768, True, torch.half),
|
||||
(83, 768, True, torch.bfloat16),
|
||||
(83, 768, True, torch.float32),
|
||||
])
|
||||
@torch.inference_mode()
|
||||
def test_rms_norm(
|
||||
num_tokens: int,
|
||||
hidden_size: int,
|
||||
add_residual: bool,
|
||||
dtype: torch.dtype,
|
||||
) -> None:
|
||||
import torch_xla.core.xla_model as xm
|
||||
|
||||
device = xm.xla_device()
|
||||
current_platform.seed_everything(0)
|
||||
torch.set_default_device("cpu")
|
||||
layer = RMSNorm(hidden_size).to(dtype=dtype)
|
||||
layer.weight.data.normal_(mean=1.0, std=0.1)
|
||||
scale = 1 / (2 * hidden_size)
|
||||
x = torch.randn(num_tokens, hidden_size, dtype=dtype).to(device=device)
|
||||
x *= scale
|
||||
residual = torch.randn_like(x) * scale if add_residual else None
|
||||
|
||||
residual_cpu = residual.cpu() if add_residual else None
|
||||
ref_out = layer.to(device="cpu").forward_native(x.cpu(), residual_cpu)
|
||||
assert x.is_xla, "input tensor under testing is expected to be XLA tensor."
|
||||
out = layer.to(device=device)(x, residual)
|
||||
|
||||
# NOTE(woosuk): LayerNorm operators (including RMS) typically have larger
|
||||
# numerical errors than other operators because they involve reductions.
|
||||
# Therefore, we use a larger tolerance.
|
||||
if add_residual:
|
||||
assert out[0].is_xla, "output tensor is expected to be XLA tensor"
|
||||
torch.testing.assert_close(out[0].cpu(),
|
||||
ref_out[0],
|
||||
atol=1e-2,
|
||||
rtol=1e-2)
|
||||
torch.testing.assert_close(out[1].cpu(),
|
||||
ref_out[1],
|
||||
atol=1e-2,
|
||||
rtol=1e-2)
|
||||
else:
|
||||
assert out.is_xla, "output tensor is expected to be XLA tensor"
|
||||
torch.testing.assert_close(out.cpu(), ref_out, atol=1e-2, rtol=1e-2)
|
||||
@ -1,95 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import random
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.model_executor.layers.logits_processor import LogitsProcessor
|
||||
from vllm.model_executor.sampling_metadata import SamplingMetadata
|
||||
from vllm.model_executor.utils import set_random_seed
|
||||
from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata
|
||||
from vllm.utils import is_pin_memory_available
|
||||
|
||||
|
||||
class MockLogitsProcessor(LogitsProcessor):
|
||||
|
||||
def __init__(self, vocab_size: int, scale: float,
|
||||
fake_logits: torch.Tensor):
|
||||
super().__init__(vocab_size=vocab_size, scale=scale)
|
||||
self.fake_logits = fake_logits.clone()
|
||||
|
||||
def forward(self, *args, **kwargs):
|
||||
with patch(
|
||||
"vllm.model_executor.layers.logits_processor._prune_hidden_states",
|
||||
lambda x, y: x
|
||||
), patch(
|
||||
"vllm.model_executor.layers.logits_processor.LogitsProcessor._get_logits",
|
||||
lambda *args, **kwargs: self.fake_logits):
|
||||
return super().forward(*args, **kwargs)
|
||||
|
||||
|
||||
def _prepare_test(
|
||||
batch_size: int
|
||||
) -> tuple[torch.Tensor, torch.Tensor, MockLogitsProcessor]:
|
||||
vocab_size = 32000
|
||||
input_tensor = torch.rand((batch_size, 1024), dtype=torch.float16)
|
||||
fake_logits = torch.full((batch_size, vocab_size),
|
||||
1e-2,
|
||||
dtype=input_tensor.dtype)
|
||||
logits_processor = MockLogitsProcessor(32000, 0.5, fake_logits)
|
||||
return input_tensor, fake_logits, logits_processor
|
||||
|
||||
|
||||
RANDOM_SEEDS = list(range(8))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("seed", RANDOM_SEEDS)
|
||||
def test_logits_processors(seed: int):
|
||||
import torch_xla.core.xla_model as xm
|
||||
|
||||
device = xm.xla_device()
|
||||
set_random_seed(seed)
|
||||
torch.set_default_device("cpu")
|
||||
batch_size = random.randint(1, 256)
|
||||
input_tensor, fake_logits, logits_processor = _prepare_test(batch_size)
|
||||
|
||||
# This sample logits processor gives infinite score to the i-th token,
|
||||
# where i is the length of the input sequence.
|
||||
# We therefore expect the output token sequence to be [0, 1, 2, ...]
|
||||
def pick_ith(token_ids, logits):
|
||||
logits[len(token_ids)] = float("inf")
|
||||
return logits
|
||||
|
||||
seq_group_metadata_list = []
|
||||
seq_lens = []
|
||||
for i in range(batch_size):
|
||||
seq_group_metadata_list.append(
|
||||
SequenceGroupMetadata(
|
||||
request_id=f"test_{i}",
|
||||
is_prompt=True,
|
||||
seq_data={0: SequenceData.from_seqs([1, 2, 3])},
|
||||
sampling_params=SamplingParams(temperature=0,
|
||||
logits_processors=[pick_ith]),
|
||||
block_tables={0: [1]},
|
||||
))
|
||||
seq_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len())
|
||||
|
||||
sampling_metadata = SamplingMetadata.prepare(
|
||||
seq_group_metadata_list,
|
||||
seq_lens,
|
||||
query_lens=seq_lens,
|
||||
device=device,
|
||||
pin_memory=is_pin_memory_available())
|
||||
logits_processor_output = logits_processor(
|
||||
lm_head=None,
|
||||
hidden_states=input_tensor,
|
||||
sampling_metadata=sampling_metadata)
|
||||
|
||||
fake_logits *= logits_processor.scale
|
||||
torch.testing.assert_close(logits_processor_output[:, 1],
|
||||
fake_logits[:, 1],
|
||||
rtol=1e-4,
|
||||
atol=0.0)
|
||||
@ -1,127 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import os
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.engine.arg_utils import EngineArgs
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.platforms.neuron import NeuronFramework
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.sequence import SequenceData, SequenceGroupMetadata
|
||||
from vllm.worker.neuron_model_runner import NeuronModelRunner
|
||||
|
||||
os.environ[
|
||||
'VLLM_NEURON_FRAMEWORK'] = NeuronFramework.TRANSFORMERS_NEURONX.value
|
||||
|
||||
|
||||
def _create_neuron_model_runner(model: str, *args,
|
||||
**kwargs) -> NeuronModelRunner:
|
||||
engine_args = EngineArgs(model, *args, **kwargs)
|
||||
engine_config = engine_args.create_engine_config()
|
||||
vllm_config = VllmConfig(
|
||||
model_config=engine_config.model_config,
|
||||
parallel_config=engine_config.parallel_config,
|
||||
scheduler_config=engine_config.scheduler_config,
|
||||
device_config=engine_config.device_config,
|
||||
)
|
||||
neuron_model_runner = NeuronModelRunner(vllm_config=vllm_config)
|
||||
return neuron_model_runner
|
||||
|
||||
|
||||
def test_update_neuron_sampling_params_not_full_batch():
|
||||
os.environ["NEURON_ON_DEVICE_SAMPLING_DISABLED"] = "0"
|
||||
model_runner = _create_neuron_model_runner(
|
||||
"facebook/opt-125m",
|
||||
seed=0,
|
||||
dtype="float16",
|
||||
max_num_seqs=2,
|
||||
)
|
||||
assert not model_runner._on_device_sampling_disabled
|
||||
# Test sampling param updating only when TNx is framework
|
||||
# NxDI handles sampling parameter updating inside model
|
||||
if current_platform.use_transformers_neuronx():
|
||||
model_mock = MagicMock()
|
||||
model_runner.model = model_mock
|
||||
|
||||
seq_group_metadata_list = [
|
||||
SequenceGroupMetadata(
|
||||
request_id="test_0",
|
||||
is_prompt=True,
|
||||
seq_data={0: SequenceData.from_seqs([1, 2, 3])},
|
||||
sampling_params=SamplingParams(temperature=0.5,
|
||||
top_k=1,
|
||||
top_p=0.5),
|
||||
block_tables={0: [1]},
|
||||
)
|
||||
]
|
||||
|
||||
model_runner.prepare_model_input(seq_group_metadata_list)
|
||||
|
||||
# Index neuron sampling parameters based on block_tables indices.
|
||||
# The first block_id of the sequence 0 is 1, so its parameters are
|
||||
# placed at index 1. So the sampling parameters will be:
|
||||
# Index 0: default sampling parameters
|
||||
# Index 1: sequecne 0's sampling parameters.
|
||||
neuron_sampling_params = (
|
||||
model_runner.model_config.neuron_sampling_params)
|
||||
assert neuron_sampling_params.temperature == [1.0, 0.5]
|
||||
assert neuron_sampling_params.top_k == [
|
||||
model_runner._MAX_NEURON_SAMPLING_TOP_K, 1
|
||||
]
|
||||
assert neuron_sampling_params.top_p == [1.0, 0.5]
|
||||
model_mock.model.update_generation_config.assert_called_once_with(
|
||||
neuron_sampling_params)
|
||||
|
||||
|
||||
def test_update_neuron_sampling_params_full_batch():
|
||||
os.environ["NEURON_ON_DEVICE_SAMPLING_DISABLED"] = "0"
|
||||
model_runner = _create_neuron_model_runner(
|
||||
"facebook/opt-125m",
|
||||
seed=0,
|
||||
dtype="float16",
|
||||
max_num_seqs=2,
|
||||
)
|
||||
assert not model_runner._on_device_sampling_disabled
|
||||
|
||||
# Test sampling param updating only when TNx is framework
|
||||
# NxDI handles sampling parameter updating inside model
|
||||
if current_platform.use_transformers_neuronx():
|
||||
model_mock = MagicMock()
|
||||
model_runner.model = model_mock
|
||||
|
||||
seq_group_metadata_list = [
|
||||
SequenceGroupMetadata(
|
||||
request_id="test_0",
|
||||
is_prompt=True,
|
||||
seq_data={0: SequenceData.from_seqs([1, 2, 3])},
|
||||
sampling_params=SamplingParams(temperature=0.5,
|
||||
top_k=1,
|
||||
top_p=0.5),
|
||||
block_tables={0: [1]},
|
||||
),
|
||||
SequenceGroupMetadata(
|
||||
request_id="test_0",
|
||||
is_prompt=True,
|
||||
seq_data={1: SequenceData.from_seqs([4, 5, 6])},
|
||||
sampling_params=SamplingParams(temperature=0.2,
|
||||
top_k=2,
|
||||
top_p=0.2),
|
||||
block_tables={1: [0]},
|
||||
)
|
||||
]
|
||||
|
||||
model_runner.prepare_model_input(seq_group_metadata_list)
|
||||
|
||||
# Index neuron sampling parameters based on block_tables indices.
|
||||
# The first block_id of the sequence 0 is 1, so its parameters are
|
||||
# placed at index 1. So the sampling parameters will be:
|
||||
# Index 0: sequence 1's sampling parameters
|
||||
# Index 1: sequecne 0's sampling parameters.
|
||||
neuron_sampling_params = (
|
||||
model_runner.model_config.neuron_sampling_params)
|
||||
assert neuron_sampling_params.temperature == [0.2, 0.5]
|
||||
assert neuron_sampling_params.top_k == [2, 1]
|
||||
assert neuron_sampling_params.top_p == [0.2, 0.5]
|
||||
model_mock.model.update_generation_config.assert_called_once_with(
|
||||
neuron_sampling_params)
|
||||
@ -1,12 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from vllm.model_executor.layers.quantization.neuron_quant import (
|
||||
NeuronQuantConfig)
|
||||
|
||||
|
||||
def test_get_supported_act_dtypes():
|
||||
neuron_quant_config = NeuronQuantConfig()
|
||||
supported_act_dtypes = neuron_quant_config.get_supported_act_dtypes()
|
||||
target_list = ["any_dtype1", "any_dtype2"]
|
||||
for dtype in target_list:
|
||||
assert dtype in supported_act_dtypes
|
||||
@ -1,514 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from typing import Optional
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
|
||||
from vllm.utils import cdiv
|
||||
|
||||
|
||||
class BlockDiagonalCausalFromBottomRightMask:
|
||||
|
||||
@staticmethod
|
||||
def _from_seqlens(query_lens, seq_lens, block_size=None):
|
||||
from torch import logical_and, logical_or
|
||||
|
||||
contexted = block_size is None
|
||||
context_lens = torch.tensor(seq_lens) - torch.tensor(query_lens)
|
||||
n_queries = sum(query_lens)
|
||||
num_seqs = len(query_lens)
|
||||
if contexted:
|
||||
key_lens_blockaligned = seq_lens
|
||||
else:
|
||||
n_blocks_per_seq = (context_lens + block_size - 1) // block_size
|
||||
offset_per_seq = n_blocks_per_seq * block_size
|
||||
key_lens_blockaligned = offset_per_seq[:num_seqs].tolist()
|
||||
n_keys = sum(key_lens_blockaligned)
|
||||
|
||||
a = (torch.arange(n_queries).reshape(n_queries,
|
||||
1).expand(n_queries, n_keys))
|
||||
b = torch.arange(n_keys).reshape(1, n_keys).expand(n_queries, n_keys)
|
||||
q_cumsum = torch.tensor([0] + query_lens).cumsum(dim=0)
|
||||
k_cumsum = torch.tensor([0] + key_lens_blockaligned).cumsum(dim=0)
|
||||
|
||||
prior_mask = torch.zeros(n_queries, n_keys)
|
||||
new_masks: list[torch.Tensor] = []
|
||||
for seq_id in range(num_seqs):
|
||||
ri = q_cumsum[seq_id]
|
||||
ci = k_cumsum[seq_id]
|
||||
nr = query_lens[seq_id]
|
||||
|
||||
if contexted:
|
||||
nc = seq_lens[seq_id]
|
||||
a_offset = ci + nc - ri - nr
|
||||
new_mask = (a + a_offset) >= b
|
||||
else:
|
||||
nc = context_lens[seq_id]
|
||||
a_offset = ci + nc - 1
|
||||
new_mask = a_offset >= b
|
||||
|
||||
left_mask = b >= ci
|
||||
top_mask = a >= ri
|
||||
bottom_mask = a < (ri + nr)
|
||||
|
||||
new_mask = logical_and(
|
||||
logical_and(logical_and(new_mask, left_mask), top_mask),
|
||||
bottom_mask,
|
||||
)
|
||||
prior_mask = logical_or(prior_mask, new_mask)
|
||||
new_masks = new_masks + [new_mask]
|
||||
return prior_mask
|
||||
|
||||
@staticmethod
|
||||
def from_seqlens(query_lens, seq_lens, block_size=None):
|
||||
contexted = block_size is None
|
||||
if contexted:
|
||||
prior_mask = BlockDiagonalCausalFromBottomRightMask._from_seqlens(
|
||||
query_lens, seq_lens)
|
||||
active_mask = None
|
||||
else:
|
||||
prior_mask = BlockDiagonalCausalFromBottomRightMask._from_seqlens(
|
||||
query_lens, seq_lens, block_size)
|
||||
active_mask = BlockDiagonalCausalFromBottomRightMask._from_seqlens(
|
||||
query_lens, query_lens)
|
||||
return prior_mask, active_mask
|
||||
|
||||
|
||||
def ref_softmax(x: torch.Tensor,
|
||||
dim: int,
|
||||
mixed_precision=False,
|
||||
return_max_reduce=False):
|
||||
max_value = torch.amax(x, dim=dim, keepdims=True)
|
||||
exp = torch.exp(x - max_value)
|
||||
if mixed_precision:
|
||||
sum_value = torch.sum(exp.astype(torch.float32),
|
||||
dim=dim,
|
||||
keepdims=True).astype(x.dtype)
|
||||
else:
|
||||
sum_value = torch.sum(exp, dim=dim, keepdims=True)
|
||||
if return_max_reduce:
|
||||
return exp / sum_value, max_value, torch.reciprocal(sum_value)
|
||||
return exp / sum_value
|
||||
|
||||
|
||||
def ref_masked_attention(
|
||||
query: torch.Tensor,
|
||||
key: torch.Tensor,
|
||||
value: torch.Tensor,
|
||||
scale: float,
|
||||
attn_mask: Optional[torch.Tensor] = None,
|
||||
return_max_reduce: Optional[bool] = False,
|
||||
) -> torch.Tensor:
|
||||
scaled_qk = scale * torch.einsum("qhd,khd->hqk", query, key).float()
|
||||
if attn_mask is not None:
|
||||
masked_score = scaled_qk + attn_mask.float()
|
||||
if return_max_reduce:
|
||||
norm_score, cached_max, cached_sum_reciprocal = ref_softmax(
|
||||
masked_score, dim=-1, return_max_reduce=True)
|
||||
else:
|
||||
norm_score = ref_softmax(masked_score, dim=-1)
|
||||
out = torch.einsum("hqk,khd->qhd", norm_score.to(value.dtype), value)
|
||||
if return_max_reduce:
|
||||
return (
|
||||
out,
|
||||
cached_max,
|
||||
cached_sum_reciprocal,
|
||||
norm_score,
|
||||
masked_score,
|
||||
scaled_qk,
|
||||
)
|
||||
else:
|
||||
return (out, )
|
||||
|
||||
|
||||
def ref_context_attention(
|
||||
query,
|
||||
key,
|
||||
value,
|
||||
query_lens,
|
||||
seq_lens,
|
||||
head_size,
|
||||
num_queries_per_kv,
|
||||
return_max_reduce=False,
|
||||
):
|
||||
scale = float(1.0 / (head_size**0.5))
|
||||
if num_queries_per_kv > 1:
|
||||
# Handle MQA and GQA
|
||||
key = torch.repeat_interleave(key, num_queries_per_kv, dim=1)
|
||||
value = torch.repeat_interleave(value, num_queries_per_kv, dim=1)
|
||||
|
||||
attn_mask, _ = BlockDiagonalCausalFromBottomRightMask.from_seqlens(
|
||||
query_lens, seq_lens)
|
||||
|
||||
# convert binary mask to -inf values
|
||||
attn_mask = torch.logical_not(attn_mask)
|
||||
attn_mask = attn_mask.float() * -30000
|
||||
|
||||
output, *debug_tensors = ref_masked_attention(
|
||||
query,
|
||||
key,
|
||||
value,
|
||||
scale,
|
||||
attn_mask,
|
||||
return_max_reduce=return_max_reduce,
|
||||
)
|
||||
|
||||
output = output.unsqueeze(1)
|
||||
if return_max_reduce:
|
||||
cached_max, cached_sum_reciprocal, lse, masked_score, scaled_qk = (
|
||||
debug_tensors)
|
||||
return (
|
||||
output,
|
||||
cached_max,
|
||||
cached_sum_reciprocal,
|
||||
lse,
|
||||
masked_score,
|
||||
scaled_qk,
|
||||
)
|
||||
else:
|
||||
return output
|
||||
|
||||
|
||||
def sample_inputs(
|
||||
prefill_batch_size,
|
||||
decode_batch_size,
|
||||
min_query_len,
|
||||
max_query_len,
|
||||
min_ctx_len,
|
||||
max_ctx_len,
|
||||
block_size,
|
||||
num_heads,
|
||||
num_kv_heads,
|
||||
head_size,
|
||||
dtype,
|
||||
):
|
||||
batch_size = prefill_batch_size + decode_batch_size
|
||||
max_model_len = (max_query_len + max_ctx_len) * 4
|
||||
max_block_per_request = max_model_len // block_size
|
||||
cache_size = (batch_size * max_block_per_request) + 2
|
||||
prefill_ctx_lens = torch.randint(min_ctx_len,
|
||||
max_ctx_len + 1, (prefill_batch_size, ),
|
||||
dtype=torch.long).tolist()
|
||||
decode_ctx_lens = torch.randint(min_ctx_len,
|
||||
max_ctx_len + 1, (decode_batch_size, ),
|
||||
dtype=torch.long).tolist()
|
||||
ctx_lens = prefill_ctx_lens + decode_ctx_lens
|
||||
query_lens = torch.randint(
|
||||
min_query_len,
|
||||
max_query_len + 1,
|
||||
(prefill_batch_size, ),
|
||||
dtype=torch.long,
|
||||
).tolist() + [1 for _ in range(decode_batch_size)]
|
||||
seq_lens = [a + b for a, b in zip(query_lens, ctx_lens)]
|
||||
|
||||
num_tokens = sum(query_lens)
|
||||
query = torch.empty(num_tokens, num_heads, head_size, dtype=dtype)
|
||||
query.uniform_(-1, 1)
|
||||
torch.empty(num_tokens, num_heads, head_size, dtype=dtype)
|
||||
|
||||
kv = torch.empty(sum(seq_lens), 2, num_kv_heads, head_size, dtype=dtype)
|
||||
kv.uniform_(-1, 1)
|
||||
key, value = kv.unbind(dim=1)
|
||||
|
||||
k_cache = torch.zeros(cache_size,
|
||||
block_size,
|
||||
num_kv_heads,
|
||||
head_size,
|
||||
dtype=dtype)
|
||||
v_cache = torch.zeros(cache_size,
|
||||
block_size,
|
||||
num_kv_heads,
|
||||
head_size,
|
||||
dtype=dtype)
|
||||
k = torch.zeros(sum(query_lens), num_kv_heads, head_size, dtype=dtype)
|
||||
v = torch.zeros(sum(query_lens), num_kv_heads, head_size, dtype=dtype)
|
||||
values = torch.arange(0, cache_size, dtype=torch.long)
|
||||
values = values[torch.randperm(cache_size)]
|
||||
block_table = values[:batch_size * max_block_per_request].view(
|
||||
batch_size, max_block_per_request)
|
||||
b_ctx_len = torch.tensor(ctx_lens, dtype=torch.long)
|
||||
b_start_loc = torch.cumsum(torch.tensor([0] + query_lens[:-1],
|
||||
dtype=torch.long),
|
||||
dim=0)
|
||||
# copy kv to cache
|
||||
b_seq_start_loc = torch.cumsum(torch.tensor([0] + seq_lens[:-1],
|
||||
dtype=torch.long),
|
||||
dim=0)
|
||||
for i in range(batch_size):
|
||||
for j in range(query_lens[i]):
|
||||
k[b_start_loc[i] + j].copy_(key[b_seq_start_loc[i] + b_ctx_len[i] +
|
||||
j])
|
||||
v[b_start_loc[i] + j].copy_(value[b_seq_start_loc[i] +
|
||||
b_ctx_len[i] + j])
|
||||
cur_ctx = 0
|
||||
block_id = 0
|
||||
while cur_ctx < b_ctx_len[i]:
|
||||
start_loc = b_seq_start_loc[i] + cur_ctx
|
||||
if cur_ctx + block_size > b_ctx_len[i]:
|
||||
end_loc = b_seq_start_loc[i] + b_ctx_len[i]
|
||||
else:
|
||||
end_loc = start_loc + block_size
|
||||
start_slot = block_table[i, block_id] * block_size
|
||||
end_slot = start_slot + end_loc - start_loc
|
||||
k_cache.view(-1, num_kv_heads,
|
||||
head_size)[start_slot:end_slot].copy_(
|
||||
key[start_loc:end_loc])
|
||||
v_cache.view(-1, num_kv_heads,
|
||||
head_size)[start_slot:end_slot].copy_(
|
||||
value[start_loc:end_loc])
|
||||
cur_ctx += block_size
|
||||
block_id += 1
|
||||
kv_cache = torch.stack([k_cache, v_cache])
|
||||
|
||||
return (
|
||||
query,
|
||||
k,
|
||||
v,
|
||||
kv_cache,
|
||||
block_table,
|
||||
key,
|
||||
value,
|
||||
query_lens,
|
||||
seq_lens,
|
||||
)
|
||||
|
||||
|
||||
def get_active_block_tables(block_tables, query_lens, seq_lens, block_size,
|
||||
num_blocks):
|
||||
context_lens = seq_lens - query_lens
|
||||
blocks_per_seq = (context_lens + block_size - 1) // block_size
|
||||
num_seqs = len(seq_lens)
|
||||
active_blocks: list[int] = []
|
||||
for seq_id in range(num_seqs):
|
||||
active_blocks = (
|
||||
active_blocks +
|
||||
block_tables[seq_id, :blocks_per_seq[seq_id]].tolist())
|
||||
return F.pad(
|
||||
torch.tensor(active_blocks, dtype=torch.int32),
|
||||
(0, num_blocks - len(active_blocks)),
|
||||
"constant",
|
||||
0,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"prefill_batch_size,decode_batch_size,block_size,large_tile_size,num_heads,num_queries_per_kv,head_size,mixed_precision",
|
||||
[
|
||||
# Test minimal configurations (small block size)
|
||||
(1, 199, 1, 512, 4, 2, 8, False
|
||||
), # minimal block size, small dimensions
|
||||
(1, 199, 1, 512, 4, 2, 8, True), # same with mixed precision
|
||||
|
||||
# Test common/medium configurations
|
||||
(4, 12, 32, 2048, 32, 8, 64, False), # common case, larger heads
|
||||
(4, 12, 32, 2048, 16, 4, 32,
|
||||
True), # medium size, mixed precision, grouped-query attention (GQA)
|
||||
|
||||
# Test large configurations
|
||||
(4, 12, 256, 8192, 8, 1, 128, False), # large blocks, large head size
|
||||
(4, 12, 256, 8192, 64, 8, 64, True), # large blocks, many heads
|
||||
|
||||
# Test asymmetric configurations
|
||||
(2, 24, 64, 4096, 12, 4, 96, False), # varied batch sizes
|
||||
(8, 8, 128, 2048, 24, 2, 48, True), # balanced batches
|
||||
|
||||
# Test edge cases
|
||||
(1, 128, 16, 1024, 4, 2, 16, False), # large decode batch
|
||||
(16, 4, 8, 1024, 4, 2, 128, True), # large prefill batch
|
||||
(4, 12, 32, 2048, 16, 1, 32, True), # multi-head attention (MHA)
|
||||
(4, 12, 32, 2048, 16, 16, 32, True), # multi-query attention (MQA)
|
||||
])
|
||||
@torch.inference_mode()
|
||||
def test_contexted_kv_attention(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
prefill_batch_size: int,
|
||||
decode_batch_size: int,
|
||||
num_heads: int,
|
||||
num_queries_per_kv: int,
|
||||
head_size: int,
|
||||
block_size: int,
|
||||
large_tile_size,
|
||||
mixed_precision: bool,
|
||||
) -> None:
|
||||
|
||||
import torch_xla.core.xla_model as xm
|
||||
|
||||
from vllm.attention.ops.nki_flash_attn import (flash_attn_varlen_nkifunc,
|
||||
reorder_context_mask)
|
||||
|
||||
assert large_tile_size % block_size == 0
|
||||
|
||||
device = xm.xla_device()
|
||||
|
||||
compiler_flags_str = " ".join([
|
||||
"-O1",
|
||||
"--retry_failed_compilation",
|
||||
])
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("NEURON_CC_FLAGS", compiler_flags_str)
|
||||
|
||||
torch.manual_seed(0)
|
||||
torch.set_printoptions(sci_mode=False)
|
||||
torch.set_default_device("cpu")
|
||||
dtype = torch.float32
|
||||
|
||||
min_ctx_len = 32
|
||||
max_ctx_len = 1024
|
||||
min_query_len = 16
|
||||
max_query_len = 512
|
||||
num_kv_heads = num_heads // num_queries_per_kv
|
||||
(
|
||||
query,
|
||||
k_active,
|
||||
v_active,
|
||||
kv_cache,
|
||||
block_table,
|
||||
key,
|
||||
value,
|
||||
query_lens,
|
||||
seq_lens,
|
||||
) = sample_inputs(
|
||||
prefill_batch_size=prefill_batch_size,
|
||||
decode_batch_size=decode_batch_size,
|
||||
min_query_len=min_query_len,
|
||||
max_query_len=max_query_len,
|
||||
min_ctx_len=min_ctx_len,
|
||||
max_ctx_len=max_ctx_len,
|
||||
block_size=block_size,
|
||||
num_heads=num_heads,
|
||||
num_kv_heads=num_kv_heads,
|
||||
head_size=head_size,
|
||||
dtype=dtype,
|
||||
)
|
||||
|
||||
output_ref = ref_context_attention(
|
||||
query,
|
||||
key,
|
||||
value,
|
||||
query_lens,
|
||||
seq_lens,
|
||||
head_size,
|
||||
num_queries_per_kv,
|
||||
return_max_reduce=False,
|
||||
)
|
||||
|
||||
# build neuron program
|
||||
B_P_SIZE = 128
|
||||
assert (large_tile_size >= B_P_SIZE
|
||||
), f"Expect {large_tile_size=} to be larger than {B_P_SIZE=}"
|
||||
|
||||
def pad_to_multiple(a, b):
|
||||
return cdiv(a, b) * b
|
||||
|
||||
def pad_to_next_power_of_2(a):
|
||||
assert a > 0
|
||||
return 2**int(a - 1).bit_length()
|
||||
|
||||
# calculate input shapes
|
||||
max_num_queries = pad_to_next_power_of_2(sum(query_lens))
|
||||
context_lens = torch.tensor(seq_lens) - torch.tensor(query_lens)
|
||||
num_active_blocks = cdiv(context_lens, block_size).sum().item()
|
||||
num_active_blocks = pad_to_multiple(num_active_blocks,
|
||||
large_tile_size // block_size)
|
||||
context_kv_len = num_active_blocks * block_size
|
||||
assert (
|
||||
context_kv_len %
|
||||
large_tile_size == 0), f"invalid context_kv_len={context_kv_len}"
|
||||
|
||||
# pad QKV tensors
|
||||
pad_dims = (
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
max_num_queries - query.shape[0],
|
||||
)
|
||||
query = F.pad(query, pad_dims, "constant", 0)
|
||||
k = F.pad(k_active, pad_dims, "constant", 0)
|
||||
v = F.pad(v_active, pad_dims, "constant", 0)
|
||||
|
||||
# permute QKV tensors
|
||||
# query: (1, n_heads, d, seq_q)
|
||||
# key: (1, n_kv_heads, d, seq_k)
|
||||
# value: (1, n_kv_heads, seq_v, d)
|
||||
query = query.unsqueeze(0).permute(0, 2, 3, 1).contiguous()
|
||||
k = k.unsqueeze(0).permute(0, 2, 3, 1).contiguous()
|
||||
v = v.unsqueeze(0).permute(0, 2, 1, 3).contiguous()
|
||||
kv_cache = kv_cache.permute(0, 1, 3, 2, 4).contiguous()
|
||||
|
||||
# transform block table
|
||||
active_block_table = get_active_block_tables(
|
||||
block_table.cpu(),
|
||||
torch.tensor(query_lens).cpu(),
|
||||
torch.tensor(seq_lens).cpu(),
|
||||
block_size,
|
||||
num_active_blocks,
|
||||
)
|
||||
|
||||
# Build attention masks
|
||||
prior_mask, active_mask = (
|
||||
BlockDiagonalCausalFromBottomRightMask.from_seqlens(
|
||||
query_lens, seq_lens, block_size=block_size))
|
||||
prior_mask_padded = F.pad(
|
||||
prior_mask,
|
||||
(
|
||||
0,
|
||||
context_kv_len - prior_mask.shape[1],
|
||||
0,
|
||||
max_num_queries - prior_mask.shape[0],
|
||||
),
|
||||
"constant",
|
||||
0,
|
||||
).bool()
|
||||
active_mask_padded = F.pad(
|
||||
active_mask,
|
||||
(
|
||||
0,
|
||||
max_num_queries - active_mask.shape[1],
|
||||
0,
|
||||
max_num_queries - active_mask.shape[0],
|
||||
),
|
||||
"constant",
|
||||
0,
|
||||
).bool()
|
||||
attn_mask = torch.concat([prior_mask_padded, active_mask_padded],
|
||||
dim=1)
|
||||
|
||||
attn_mask = reorder_context_mask(attn_mask, large_tile_size,
|
||||
block_size)
|
||||
|
||||
input_args = (
|
||||
query.to(device=device),
|
||||
k.to(device=device),
|
||||
v.to(device=device),
|
||||
kv_cache.to(device=device),
|
||||
active_block_table.to(device=device),
|
||||
attn_mask.to(device=device),
|
||||
)
|
||||
input_kwargs = dict(
|
||||
n_kv_head=num_kv_heads,
|
||||
head_size=head_size,
|
||||
mixed_precision=mixed_precision,
|
||||
LARGE_TILE_SZ=large_tile_size,
|
||||
)
|
||||
|
||||
output_nki = flash_attn_varlen_nkifunc(*input_args, **input_kwargs)
|
||||
|
||||
num_actual_tokens = sum(query_lens)
|
||||
# - o: shape (bs, n_heads, seq_q, d) -> (bs, seq_q, n_heads, d)
|
||||
output_nki = output_nki.cpu().permute(0, 2, 1, 3)
|
||||
output_nki = output_nki[0, :num_actual_tokens, :, :]
|
||||
output_ref_padded = F.pad(
|
||||
output_ref,
|
||||
(0, 0, 0, 0, 0, 0, 0, max_num_queries - output_ref.shape[0]),
|
||||
"constant",
|
||||
0,
|
||||
)
|
||||
output_ref = output_ref_padded.transpose(
|
||||
0, 1)[0, :num_actual_tokens, :, :]
|
||||
|
||||
torch.testing.assert_close(output_nki, output_ref, atol=1e-2, rtol=0)
|
||||
@ -1,68 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
Tests for miscellaneous utilities
|
||||
"""
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"max_position,is_neox_style,rotary_dim,head_size,seq_len,use_key", [
|
||||
(16, False, 32, 32, 1024, True),
|
||||
(16, False, 32, 128, 1024, True),
|
||||
(16, True, 32, 32, 1024, True),
|
||||
(16, True, 32, 128, 1024, True),
|
||||
(16, False, 32, 128, 1024, False),
|
||||
(16, True, 32, 128, 1024, False),
|
||||
])
|
||||
def test_rotary_embedding_opcheck(max_position, is_neox_style, rotary_dim,
|
||||
head_size, seq_len, use_key):
|
||||
import torch_xla.core.xla_model as xm
|
||||
|
||||
device = xm.xla_device()
|
||||
current_platform.seed_everything(0)
|
||||
torch.set_default_device("cpu")
|
||||
|
||||
batch_size = 1
|
||||
base = 10000
|
||||
num_heads = 8
|
||||
|
||||
rot = RotaryEmbedding(head_size, rotary_dim, max_position, base,
|
||||
is_neox_style, torch.float32)
|
||||
|
||||
positions = torch.randint(0,
|
||||
max_position, (batch_size, seq_len),
|
||||
device="cpu")
|
||||
query = torch.randn(batch_size,
|
||||
seq_len,
|
||||
num_heads * head_size,
|
||||
dtype=torch.float32,
|
||||
device="cpu")
|
||||
key = torch.randn_like(query) if use_key else None
|
||||
assert positions.is_cpu, \
|
||||
"reference input tensor is expected to be CPU tensor."
|
||||
ref_query, ref_key = rot.to(device="cpu").forward_native(
|
||||
positions, query, key)
|
||||
out_query, out_key = rot.to(device=device).forward_neuron(
|
||||
positions.to(device=device), query.to(device=device),
|
||||
key.to(device=device) if key is not None else None)
|
||||
if use_key:
|
||||
assert out_query.is_xla and out_key.is_xla, \
|
||||
"output tensor is expected to be XLA tensor"
|
||||
torch.testing.assert_close(out_key.cpu(),
|
||||
ref_key,
|
||||
atol=1e-2,
|
||||
rtol=1e-2)
|
||||
else:
|
||||
assert out_key is None, "expected returned key to be None"
|
||||
assert out_query.is_xla, \
|
||||
"output tensor is expected to be XLA tensor"
|
||||
torch.testing.assert_close(out_query.cpu(),
|
||||
ref_query,
|
||||
atol=1e-2,
|
||||
rtol=1e-2)
|
||||
@ -1,101 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import functools
|
||||
from typing import Callable
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
import torch_xla.distributed.xla_multiprocessing as xmp
|
||||
from typing_extensions import ParamSpec
|
||||
|
||||
from vllm.distributed.communication_op import (
|
||||
tensor_model_parallel_all_gather, tensor_model_parallel_all_reduce)
|
||||
from vllm.distributed.parallel_state import (ensure_model_parallel_initialized,
|
||||
init_distributed_environment)
|
||||
from vllm.utils import get_distributed_init_method, get_open_port
|
||||
|
||||
_P = ParamSpec("_P")
|
||||
|
||||
|
||||
def reinitialize_neuron_runtime(f: Callable[_P, None]) -> Callable[_P, None]:
|
||||
"""Decorator to reinitialize the Neuron Runtime before executing a test.
|
||||
This is necessary for distributed tests which need to reallocate Neuron
|
||||
Cores to separate subprocesses.
|
||||
"""
|
||||
|
||||
@functools.wraps(f)
|
||||
def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> None:
|
||||
runtime = torch.classes.neuron.Runtime()
|
||||
runtime.initialize()
|
||||
runtime.unsafe_close()
|
||||
|
||||
f(*args, **kwargs)
|
||||
runtime.initialize()
|
||||
|
||||
return wrapper
|
||||
|
||||
|
||||
def all_gather_test_worker(index, tp_degree, distributed_init_method):
|
||||
init_distributed_environment(tp_degree,
|
||||
index,
|
||||
distributed_init_method,
|
||||
index,
|
||||
backend="xla")
|
||||
ensure_model_parallel_initialized(tp_degree, 1)
|
||||
|
||||
num_dimensions = 3
|
||||
tensor_size = list(range(2, num_dimensions + 2))
|
||||
total_size = 1
|
||||
for s in tensor_size:
|
||||
total_size *= s
|
||||
|
||||
all_gather_dimension = -1
|
||||
all_tensors = [
|
||||
torch.arange(total_size, dtype=torch.float32,
|
||||
device="xla").reshape(tensor_size) * (r + 1)
|
||||
for r in range(tp_degree)
|
||||
]
|
||||
expected = torch.cat(all_tensors, dim=all_gather_dimension)
|
||||
t = all_tensors[index % tp_degree]
|
||||
t = tensor_model_parallel_all_gather(t, all_gather_dimension)
|
||||
torch.testing.assert_close(t, expected)
|
||||
|
||||
|
||||
def all_reduce_test_worker(index, tp_degree, distributed_init_method):
|
||||
init_distributed_environment(tp_degree,
|
||||
index,
|
||||
distributed_init_method,
|
||||
index,
|
||||
backend="xla")
|
||||
ensure_model_parallel_initialized(tp_degree, 1)
|
||||
|
||||
num_elements = 8
|
||||
all_tensors = [
|
||||
torch.arange(num_elements, dtype=torch.float32, device="xla") * (r + 1)
|
||||
for r in range(tp_degree)
|
||||
]
|
||||
expected = torch.sum(torch.stack(all_tensors, dim=0), dim=0)
|
||||
t = all_tensors[index % tp_degree]
|
||||
t = tensor_model_parallel_all_reduce(t)
|
||||
torch.testing.assert_close(t, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("tp_size", [2])
|
||||
@pytest.mark.parametrize("test_target",
|
||||
[all_reduce_test_worker, all_gather_test_worker])
|
||||
@reinitialize_neuron_runtime
|
||||
def test_neuron_multi_process_tensor_parallel(monkeypatch, tp_size,
|
||||
test_target):
|
||||
|
||||
with patch('torch_xla._XLAC._xla_runtime_is_initialized',
|
||||
return_value=False):
|
||||
distributed_init_method = get_distributed_init_method(
|
||||
"127.0.0.1", get_open_port())
|
||||
|
||||
monkeypatch.setenv("VLLM_USE_V1", "1")
|
||||
monkeypatch.setenv("NEURONCORE_NUM_DEVICES", str(tp_size))
|
||||
monkeypatch.setenv("NEURON_PJRT_PROCESSES_NUM_DEVICES",
|
||||
','.join(['1' for _ in range(tp_size)]))
|
||||
|
||||
xmp.spawn(test_target, args=(tp_size, distributed_init_method))
|
||||
@ -1,83 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
import tempfile
|
||||
|
||||
import torch
|
||||
from huggingface_hub import snapshot_download
|
||||
from safetensors import safe_open
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
|
||||
|
||||
def patch_eagle_draft_with_lm_head(target_model_id: str,
|
||||
draft_model_id: str) -> str:
|
||||
# In NxDI, draft model checkpoint must include lm_head weights from target
|
||||
# model. For more details see https://awsdocs-neuron.readthedocs-hosted.com
|
||||
# /en/latest/libraries/nxd-inference/developer_guides/feature-guide.html
|
||||
# #eagle-checkpoint-compatibility
|
||||
final_draft_dir = "/tmp/patched_eagle_draft"
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmp_dir:
|
||||
target_dir = snapshot_download(repo_id=target_model_id,
|
||||
local_dir=os.path.join(
|
||||
tmp_dir, "target"))
|
||||
draft_dir = snapshot_download(repo_id=draft_model_id,
|
||||
local_dir=os.path.join(tmp_dir, "draft"))
|
||||
|
||||
lm_head_key = "lm_head.weight"
|
||||
index_path = os.path.join(target_dir, "model.safetensors.index.json")
|
||||
with open(index_path) as f:
|
||||
index = json.load(f)
|
||||
shard_name = index["weight_map"][lm_head_key]
|
||||
target_safetensor_path = os.path.join(target_dir, shard_name)
|
||||
|
||||
with safe_open(target_safetensor_path, framework="pt") as f:
|
||||
target_lm_head = f.get_tensor(lm_head_key)
|
||||
|
||||
draft_path = os.path.join(draft_dir, "pytorch_model.bin")
|
||||
draft_state_dict = torch.load(draft_path, map_location="cpu")
|
||||
draft_state_dict[lm_head_key] = target_lm_head.to(torch.float16)
|
||||
torch.save(draft_state_dict, draft_path)
|
||||
|
||||
shutil.copytree(draft_dir, final_draft_dir, dirs_exist_ok=True)
|
||||
|
||||
return final_draft_dir
|
||||
|
||||
|
||||
def test_eagle():
|
||||
patched_draft_path = patch_eagle_draft_with_lm_head(
|
||||
target_model_id="meta-llama/Llama-2-7b-hf",
|
||||
draft_model_id="yuhuili/EAGLE-llama2-chat-7B")
|
||||
llm = LLM(
|
||||
model="meta-llama/Llama-2-7b-hf",
|
||||
speculative_config={
|
||||
"model": patched_draft_path,
|
||||
"num_speculative_tokens": 5,
|
||||
"max_model_len": 128
|
||||
},
|
||||
max_num_seqs=1,
|
||||
max_model_len=128,
|
||||
tensor_parallel_size=2,
|
||||
override_neuron_config={
|
||||
"enable_eagle_speculation": True,
|
||||
"enable_fused_speculation": True,
|
||||
"fused_qkv": True
|
||||
},
|
||||
)
|
||||
prompts = [
|
||||
"The president of the United States is",
|
||||
]
|
||||
outputs = llm.generate(prompts, SamplingParams(top_k=1))
|
||||
expected_output = " the head of state and head of government of " \
|
||||
"the United States. The president direct"
|
||||
|
||||
for output in outputs:
|
||||
generated_text = output.outputs[0].text
|
||||
print(f"Prompt: {output.prompt!r}, Generated text: {generated_text!r}")
|
||||
assert (expected_output == generated_text)
|
||||
|
||||
print("Neuron Eagle speculation test passed.")
|
||||
@ -1,64 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
|
||||
|
||||
def test_mistral():
|
||||
llm = LLM(model="mistralai/Mistral-7B-v0.1",
|
||||
tensor_parallel_size=2,
|
||||
max_num_seqs=4,
|
||||
max_model_len=128,
|
||||
override_neuron_config={
|
||||
"sequence_parallel_enabled": False,
|
||||
"skip_warmup": True
|
||||
})
|
||||
|
||||
# Send more prompts than the compiled batch size (4) and request
|
||||
# varying generation lengths to test accuracy related to Neuron
|
||||
# specific sequence id sorting.
|
||||
prompts = [
|
||||
"The president of the United States is",
|
||||
"The capital of France is",
|
||||
"What is Annapurna labs?",
|
||||
"I believe the meaning of life is",
|
||||
"Tell me a story about a brave knight",
|
||||
"Hello, my name is Llama",
|
||||
]
|
||||
|
||||
sampling_params = [
|
||||
SamplingParams(top_k=1, max_tokens=10),
|
||||
SamplingParams(top_k=1, max_tokens=20),
|
||||
SamplingParams(top_k=1, max_tokens=30),
|
||||
SamplingParams(top_k=1, max_tokens=40),
|
||||
SamplingParams(top_k=1, max_tokens=50),
|
||||
SamplingParams(top_k=1, max_tokens=60)
|
||||
]
|
||||
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
|
||||
expected_outputs = [
|
||||
" the most powerful person in the world. He is",
|
||||
" a city of many faces. It is a city of history, culture, art, "
|
||||
"fashion, and",
|
||||
"\n\nAnnapurna Labs is a semiconductor company that was founded "
|
||||
"in 2013 by Amazon. The company is",
|
||||
" to be happy.\n\nI believe that happiness is a choice.\n\nI "
|
||||
"believe that happiness is a state of mind.\n\nI believe that "
|
||||
"happiness is a journey.\n\nI believe",
|
||||
" who rescued a princess from a dragon.\n\nTell me a story about"
|
||||
" a princess who rescued herself from a dragon.\n\nTell me a "
|
||||
"story about a princess who rescued herself from a dragon and "
|
||||
"then rescued a knight from",
|
||||
" and I am a 10 year old male. I am a very friendly and "
|
||||
"affectionate boy who loves to be around people. I am a very "
|
||||
"active boy who loves to play and run around. I am a very smart "
|
||||
"boy who loves to learn new things. I am a very loyal boy"
|
||||
]
|
||||
|
||||
for expected_output, output in zip(expected_outputs, outputs):
|
||||
generated_text = output.outputs[0].text
|
||||
print(f"Prompt: {output.prompt!r}, Generated text: {generated_text!r}")
|
||||
assert (expected_output == generated_text)
|
||||
|
||||
print("Neuron Mistral test passed.")
|
||||
@ -1,97 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from huggingface_hub import snapshot_download
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.lora.request import LoRARequest
|
||||
|
||||
|
||||
def test_llama_single_lora():
|
||||
sql_lora_files = snapshot_download(
|
||||
repo_id="yard1/llama-2-7b-sql-lora-test")
|
||||
llm = LLM(model="meta-llama/Llama-2-7b-hf",
|
||||
tensor_parallel_size=2,
|
||||
max_num_seqs=4,
|
||||
max_model_len=512,
|
||||
override_neuron_config={
|
||||
"sequence_parallel_enabled": False,
|
||||
"skip_warmup": True,
|
||||
"lora_modules": [{
|
||||
"name": "lora_id_1",
|
||||
"path": sql_lora_files
|
||||
}]
|
||||
},
|
||||
enable_lora=True,
|
||||
max_loras=1,
|
||||
max_lora_rank=256,
|
||||
device="neuron")
|
||||
"""For multi-lora requests using NxDI as the backend, only the lora_name
|
||||
needs to be specified. The lora_id and lora_path are supplied at the LLM
|
||||
class/server initialization, after which the paths are handled by NxDI"""
|
||||
lora_req_1 = LoRARequest("lora_id_1", 0, " ")
|
||||
prompts = [
|
||||
"The president of the United States is",
|
||||
"The capital of France is",
|
||||
]
|
||||
outputs = llm.generate(prompts,
|
||||
SamplingParams(top_k=1),
|
||||
lora_request=[lora_req_1, lora_req_1])
|
||||
|
||||
expected_outputs = [
|
||||
" the head of state and head of government of the United States. "
|
||||
"The president direct",
|
||||
" a city of contrasts. The city is home to the Eiffel Tower"
|
||||
]
|
||||
|
||||
for expected_output, output in zip(expected_outputs, outputs):
|
||||
generated_text = output.outputs[0].text
|
||||
assert (expected_output == generated_text)
|
||||
|
||||
|
||||
def test_llama_multiple_lora():
|
||||
sql_lora_files = snapshot_download(
|
||||
repo_id="yard1/llama-2-7b-sql-lora-test")
|
||||
llm = LLM(model="meta-llama/Llama-2-7b-hf",
|
||||
tensor_parallel_size=2,
|
||||
max_num_seqs=4,
|
||||
max_model_len=512,
|
||||
override_neuron_config={
|
||||
"sequence_parallel_enabled":
|
||||
False,
|
||||
"skip_warmup":
|
||||
True,
|
||||
"lora_modules": [{
|
||||
"name": "lora_id_1",
|
||||
"path": sql_lora_files
|
||||
}, {
|
||||
"name": "lora_id_2",
|
||||
"path": sql_lora_files
|
||||
}]
|
||||
},
|
||||
enable_lora=True,
|
||||
max_loras=2,
|
||||
max_lora_rank=256,
|
||||
device="neuron")
|
||||
"""For multi-lora requests using NxDI as the backend, only the lora_name
|
||||
needs to be specified. The lora_id and lora_path are supplied at the LLM
|
||||
class/server initialization, after which the paths are handled by NxDI"""
|
||||
lora_req_1 = LoRARequest("lora_id_1", 0, " ")
|
||||
lora_req_2 = LoRARequest("lora_id_2", 1, " ")
|
||||
prompts = [
|
||||
"The president of the United States is",
|
||||
"The capital of France is",
|
||||
]
|
||||
outputs = llm.generate(prompts,
|
||||
SamplingParams(top_k=1),
|
||||
lora_request=[lora_req_1, lora_req_2])
|
||||
|
||||
expected_outputs = [
|
||||
" the head of state and head of government of the United States. "
|
||||
"The president direct",
|
||||
" a city of contrasts. The city is home to the Eiffel Tower"
|
||||
]
|
||||
|
||||
for expected_output, output in zip(expected_outputs, outputs):
|
||||
generated_text = output.outputs[0].text
|
||||
assert (expected_output == generated_text)
|
||||
@ -1,8 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
def register_prithvi_india():
|
||||
return "prithvi_io_processor.prithvi_processor.PrithviMultimodalDataProcessorIndia" # noqa: E501
|
||||
|
||||
|
||||
def register_prithvi_valencia():
|
||||
return "prithvi_io_processor.prithvi_processor.PrithviMultimodalDataProcessorValencia" # noqa: E501
|
||||
def register_prithvi():
|
||||
return "prithvi_io_processor.prithvi_processor.PrithviMultimodalDataProcessor" # noqa: E501
|
||||
|
||||
@ -234,6 +234,8 @@ def load_image(
|
||||
|
||||
class PrithviMultimodalDataProcessor(IOProcessor):
|
||||
|
||||
indices = [0, 1, 2, 3, 4, 5]
|
||||
|
||||
def __init__(self, vllm_config: VllmConfig):
|
||||
|
||||
super().__init__(vllm_config)
|
||||
@ -412,21 +414,3 @@ class PrithviMultimodalDataProcessor(IOProcessor):
|
||||
format="tiff",
|
||||
data=out_data,
|
||||
request_id=request_id)
|
||||
|
||||
|
||||
class PrithviMultimodalDataProcessorIndia(PrithviMultimodalDataProcessor):
|
||||
|
||||
def __init__(self, vllm_config: VllmConfig):
|
||||
|
||||
super().__init__(vllm_config)
|
||||
|
||||
self.indices = [1, 2, 3, 8, 11, 12]
|
||||
|
||||
|
||||
class PrithviMultimodalDataProcessorValencia(PrithviMultimodalDataProcessor):
|
||||
|
||||
def __init__(self, vllm_config: VllmConfig):
|
||||
|
||||
super().__init__(vllm_config)
|
||||
|
||||
self.indices = [0, 1, 2, 3, 4, 5]
|
||||
|
||||
@ -9,8 +9,7 @@ setup(
|
||||
packages=["prithvi_io_processor"],
|
||||
entry_points={
|
||||
"vllm.io_processor_plugins": [
|
||||
"prithvi_to_tiff_india = prithvi_io_processor:register_prithvi_india", # noqa: E501
|
||||
"prithvi_to_tiff_valencia = prithvi_io_processor:register_prithvi_valencia", # noqa: E501
|
||||
"prithvi_to_tiff = prithvi_io_processor:register_prithvi", # noqa: E501
|
||||
]
|
||||
},
|
||||
)
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user