[Core] Optimize SPMD architecture with delta + serialization optimization (#7109)
This commit is contained in:
@@ -22,7 +22,8 @@ TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4")
|
||||
@pytest.mark.skipif(cuda_device_count_stateless() < 2,
|
||||
reason="Need at least 2 GPUs to run the test.")
|
||||
@pytest.mark.parametrize(
|
||||
"model, distributed_executor_backend, attention_backend, test_suite", [
|
||||
"model, distributed_executor_backend, attention_backend, "
|
||||
"test_suite", [
|
||||
("facebook/opt-125m", "ray", "", "L4"),
|
||||
("facebook/opt-125m", "mp", "", "L4"),
|
||||
("meta-llama/Llama-2-7b-hf", "ray", "", "L4"),
|
||||
|
||||
@@ -6,6 +6,8 @@ pytest test_chunked_prefill_distributed.py
|
||||
```
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.utils import cuda_device_count_stateless
|
||||
@@ -30,6 +32,11 @@ def test_models(
|
||||
model: str,
|
||||
distributed_executor_backend: str,
|
||||
) -> None:
|
||||
if model == "meta-llama/Llama-2-7b-hf" and distributed_executor_backend == "ray": # noqa
|
||||
assert distributed_executor_backend == "ray"
|
||||
# test ray adag
|
||||
os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
|
||||
os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"
|
||||
|
||||
dtype = "half"
|
||||
max_tokens = 5
|
||||
|
||||
Reference in New Issue
Block a user