[Core] Optimize SPMD architecture with delta + serialization optimization (#7109)

2024-08-18 17:57:20 -07:00
parent 200a2ffa6b
commit ff7ec82c4d
36 changed files with 722 additions and 346 deletions
--- a/tests/distributed/test_basic_distributed_correctness.py
+++ b/tests/distributed/test_basic_distributed_correctness.py
@ -22,7 +22,8 @@ TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4")
@pytest.mark.skipif(cuda_device_count_stateless() < 2,
                    reason="Need at least 2 GPUs to run the test.")
@pytest.mark.parametrize(
-    "model, distributed_executor_backend, attention_backend, test_suite", [
+    "model, distributed_executor_backend, attention_backend, "
+    "test_suite", [
        ("facebook/opt-125m", "ray", "", "L4"),
        ("facebook/opt-125m", "mp", "", "L4"),
        ("meta-llama/Llama-2-7b-hf", "ray", "", "L4"),
--- a/tests/distributed/test_chunked_prefill_distributed.py
+++ b/tests/distributed/test_chunked_prefill_distributed.py
@ -6,6 +6,8 @@ pytest test_chunked_prefill_distributed.py
 ```
 """

+import os
+
 import pytest

 from vllm.utils import cuda_device_count_stateless
@ -30,6 +32,11 @@ def test_models(
    model: str,
    distributed_executor_backend: str,
 ) -> None:
+    if model == "meta-llama/Llama-2-7b-hf" and distributed_executor_backend == "ray":  # noqa
+        assert distributed_executor_backend == "ray"
+        # test ray adag
+        os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
+        os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"

    dtype = "half"
    max_tokens = 5