[CI] Add Buildkite (#2355)

Simon Mo
2024-01-14 12:37:58 -08:00
committed by GitHub
parent 9f659bf07f
commit 6e01e8c1c8
13 changed files with 192 additions and 37 deletions

View File

@@ -29,8 +29,13 @@ def api_server():
     script_path = Path(__file__).parent.joinpath(
         "api_server_async_engine.py").absolute()
     uvicorn_process = subprocess.Popen([
-        sys.executable, "-u",
-        str(script_path), "--model", "facebook/opt-125m"
+        sys.executable,
+        "-u",
+        str(script_path),
+        "--model",
+        "facebook/opt-125m",
+        "--host",
+        "127.0.0.1",
     ])
     yield
     uvicorn_process.terminate()
@@ -81,6 +86,9 @@ def test_api_server(api_server):
     pool.join()
     # check cancellation stats
+    # give it some times to update the stats
+    time.sleep(1)
     num_aborted_requests = requests.get(
         "http://localhost:8000/stats").json()["num_aborted_requests"]
     assert num_aborted_requests > 0
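
For context, the fixture being modified here follows a standard pytest pattern: launch the API server as a subprocess bound to 127.0.0.1, yield to the tests, and terminate the process afterwards; the abort statistics are then read from the /stats endpoint after a short sleep because they are updated asynchronously. A minimal sketch of that pattern, condensed from the two hunks above:

import subprocess
import sys
import time
from pathlib import Path

import pytest
import requests


@pytest.fixture
def api_server():
    # Start the server as a child process, pinned to localhost so the test
    # does not depend on the machine's external interfaces.
    script = Path(__file__).parent / "api_server_async_engine.py"
    proc = subprocess.Popen([
        sys.executable, "-u", str(script),
        "--model", "facebook/opt-125m",
        "--host", "127.0.0.1",
    ])
    yield
    proc.terminate()


def read_num_aborted() -> int:
    # Stats are updated asynchronously, hence the sleep added in the test above.
    time.sleep(1)
    resp = requests.get("http://localhost:8000/stats")
    return resp.json()["num_aborted_requests"]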

View File

@@ -1,19 +1,24 @@
 from argparse import Namespace
 from dataclasses import dataclass
+import os
+import pathlib
 import pytest
 from fastapi.testclient import TestClient
 from vllm.entrypoints.openai.api_server import *
+chatml_jinja_path = pathlib.Path(os.path.dirname(os.path.abspath(
+    __file__))).parent.parent / "examples/template_chatml.jinja"
+assert chatml_jinja_path.exists()
 # Define models, templates, and their corresponding expected outputs
 MODEL_TEMPLATE_GENERATON_OUTPUT = [
     ("facebook/opt-125m", None, True,
      "Hello</s>Hi there!</s>What is the capital of</s>"),
     ("facebook/opt-125m", None, False,
      "Hello</s>Hi there!</s>What is the capital of</s>"),
-    ("facebook/opt-125m", "../../examples/template_chatml.jinja", True,
-     """<|im_start|>user
+    ("facebook/opt-125m", chatml_jinja_path, True, """<|im_start|>user
 Hello<|im_end|>
 <|im_start|>assistant
 Hi there!<|im_end|>
@@ -21,8 +26,7 @@ Hi there!<|im_end|>
 What is the capital of<|im_end|>
 <|im_start|>assistant
 """),
-    ("facebook/opt-125m", "../../examples/template_chatml.jinja", False,
-     """<|im_start|>user
+    ("facebook/opt-125m", chatml_jinja_path, False, """<|im_start|>user
 Hello<|im_end|>
 <|im_start|>assistant
 Hi there!<|im_end|>
@@ -54,8 +58,7 @@ class MockTokenizer:
 def test_load_chat_template():
     # Testing chatml template
-    template = "../../examples/template_chatml.jinja"
-    mock_args = Namespace(chat_template=template)
+    mock_args = Namespace(chat_template=chatml_jinja_path)
     tokenizer = MockTokenizer()
     # Call the function with the mocked args
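
The template changes above replace the working-directory-relative string "../../examples/template_chatml.jinja" with chatml_jinja_path, a path resolved from the test file itself, so the test no longer depends on where pytest is invoked from (which matters once the suite runs on CI). A small pathlib-only sketch of the same idea, assuming (as the diff implies) that the test file sits two directories below the repository root:

import pathlib

# Resolve the template relative to this file rather than the current
# working directory, so the assertion holds no matter where pytest runs.
_repo_root = pathlib.Path(__file__).resolve().parent.parent.parent
chatml_jinja_path = _repo_root / "examples/template_chatml.jinja"
assert chatml_jinja_path.exists()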

View File

@@ -2,10 +2,9 @@
 Run `pytest tests/distributed/test_comm_ops.py --forked`.
 """
-from multiprocessing import Process, set_start_method
 import pytest
 import torch
+import ray
 from vllm.config import ParallelConfig
 from vllm.utils import get_open_port
@@ -23,11 +22,11 @@ def init_test_distributed_environment(pipeline_parallel_size: int,
                                      tensor_parallel_size,
                                      worker_use_ray=True)
     distributed_init_method = f"tcp://localhost:{distributed_init_port}"
-    torch.cuda.set_device(rank)
     _init_distributed_environment(parallel_config, rank,
                                   distributed_init_method)
+@ray.remote(num_gpus=1, max_calls=1)
 def all_reduce_test_worker(tensor_parallel_size: int, rank: int,
                            distributed_init_port: str):
     init_test_distributed_environment(1, tensor_parallel_size, rank,
@@ -43,6 +42,7 @@ def all_reduce_test_worker(tensor_parallel_size: int, rank: int,
     assert torch.allclose(t, expected)
+@ray.remote(num_gpus=1, max_calls=1)
 def all_gather_test_worker(tensor_parallel_size: int, rank: int,
                            distributed_init_port: str):
     init_test_distributed_environment(1, tensor_parallel_size, rank,
@@ -70,14 +70,16 @@ def all_gather_test_worker(tensor_parallel_size: int, rank: int,
 @pytest.mark.parametrize("test_target",
                          [all_reduce_test_worker, all_gather_test_worker])
 def test_multi_process_tensor_parallel(tensor_parallel_size, test_target):
-    set_start_method("spawn", force=True)
+    # Using ray helps debugging the error when it failed
+    # as compared to multiprocessing.
+    ray.init()
     distributed_init_port = get_open_port()
-    processes = []
+    refs = []
     for rank in range(tensor_parallel_size):
-        p = Process(target=test_target,
-                    args=(tensor_parallel_size, rank, distributed_init_port))
-        p.start()
-        processes.append(p)
-    for p in processes:
-        p.join()
-    assert all(p.exitcode == 0 for p in processes)
+        refs.append(
+            test_target.remote(tensor_parallel_size, rank,
+                               distributed_init_port))
+    ray.get(refs)
+    ray.shutdown()
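
The driver above drops multiprocessing in favor of Ray: each worker function is decorated with @ray.remote(num_gpus=1, max_calls=1), so Ray schedules it in its own process with one GPU and retires the process after a single call, and the test simply collects object refs and blocks on ray.get, which re-raises worker exceptions instead of leaving the test to inspect exit codes. A stripped-down sketch of that pattern; the do_work body is a placeholder, not code from the diff:

import ray


@ray.remote(num_gpus=1, max_calls=1)  # one GPU per task, fresh worker per call
def do_work(rank: int) -> int:
    # Placeholder for the real worker body (e.g. an all-reduce check).
    return rank


def run_all(world_size: int) -> None:
    ray.init()
    refs = [do_work.remote(rank) for rank in range(world_size)]
    # ray.get propagates any exception raised inside a worker, which is
    # easier to debug than a nonzero multiprocessing exit code.
    ray.get(refs)
    ray.shutdown()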

View File

@@ -13,7 +13,7 @@ FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
 # This will change depending on the compute capability.
 # - 512 as a buffer
 MAX_SEQ_LEN = get_max_shared_memory_bytes() // FLOAT32_BYTES - 512
-NUM_BLOCKS = 40000  # Arbitrary values for testing
+NUM_BLOCKS = 12000  # Arbitrary values for testing
 PARTITION_SIZE = 512
 DTYPES = [torch.half, torch.bfloat16, torch.float]

View File

@@ -6,12 +6,12 @@ import torch
 from vllm._C import cache_ops
 DTYPES = [torch.half, torch.bfloat16, torch.float]
-NUM_TOKENS = [83]  # Arbitrary values for testing
+NUM_TOKENS = [42]  # Arbitrary values for testing
 NUM_LAYERS = [1]  # Arbitrary values for testing
 NUM_HEADS = [8]  # Arbitrary values for testing
 HEAD_SIZES = [64, 80, 96, 112, 128, 256]
 BLOCK_SIZES = [8, 16, 32]
-NUM_BLOCKS = [1024, 36000]  # Arbitrary values for testing
+NUM_BLOCKS = [1024, 3600]  # Arbitrary values for testing
 NUM_MAPPINGS = [256]  # Arbitrary values for testing
 SEEDS = [0]
 DEVICES = [i for i in range(1 if torch.cuda.device_count() == 1 else 2)]
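
The reduced NUM_TOKENS and NUM_BLOCKS values keep these kernel tests within the memory of the smaller GPUs used by CI. As a rough illustration of the difference (assuming the cache holds num_blocks * num_heads * head_size * block_size elements per key and per value cache, which is an assumption about the layout, not something stated in the diff):

import torch

# Element count of one key cache plus one value cache for the largest
# parameter combination above, before and after the change.
num_heads, head_size, block_size = 8, 256, 32
bytes_per_elem = torch.finfo(torch.float).bits // 8  # fp32 is in DTYPES
for num_blocks in (36000, 3600):
    total = 2 * num_blocks * num_heads * head_size * block_size * bytes_per_elem
    print(f"num_blocks={num_blocks}: ~{total / 2**30:.1f} GiB")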

View File

@@ -30,6 +30,7 @@ def test_get_prompt_logprobs(
                                          temperature=0.0)
     vllm_results = vllm_model.model.generate(
         example_prompts, sampling_params=vllm_sampling_params)
+    del vllm_model
     # Test whether logprobs are included in the results.
     for result in vllm_results:

View File

@@ -75,6 +75,8 @@ def test_sampler_all_greedy(seed: int):
         for nth_output in sequence_output.samples:
             assert nth_output.output_token == expected[i].item()
+    del model_runner
 @pytest.mark.parametrize("seed", RANDOM_SEEDS)
 def test_sampler_all_random(seed: int):
@@ -111,6 +113,8 @@ def test_sampler_all_random(seed: int):
         for nth_output in sequence_output.samples:
             assert nth_output.output_token == i
+    del model_runner
 @pytest.mark.parametrize("seed", RANDOM_SEEDS)
 def test_sampler_all_beam(seed: int):
@@ -144,6 +148,7 @@ def test_sampler_all_beam(seed: int):
     # the outputs are expected - in other words, this just tests
     # whether there are no exceptions in the sampler
     # when handling an all-beam search case.
+    del model_runner
 @pytest.mark.parametrize("seed", RANDOM_SEEDS)
@@ -198,6 +203,8 @@ def test_sampler_mixed(seed: int):
         for nth_output in sequence_output.samples:
             assert nth_output.output_token in expected_tokens
+    del model_runner
 @pytest.mark.parametrize("seed", RANDOM_SEEDS)
 def test_sampler_logits_processors(seed: int):
@@ -235,6 +242,8 @@ def test_sampler_logits_processors(seed: int):
         for idx, nth_output in enumerate(sequence_output.samples):
             assert nth_output.output_token == idx
+    del model_runner
 @pytest.mark.parametrize("seed", RANDOM_SEEDS)
 def test_sampler_top_k_top_p(seed: int):
@@ -296,3 +305,5 @@ def test_sampler_top_k_top_p(seed: int):
     hf_probs = torch.softmax(hf_probs, dim=-1, dtype=torch.float)
     assert torch.allclose(hf_probs, sample_probs, atol=1e-5)
     assert torch.equal(hf_probs.eq(0), sample_probs.eq(0))
+    del model_runner
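
The added del model_runner statements (and the earlier del vllm_model) drop the last reference each test holds to its GPU-backed objects, so their allocations can be reclaimed before the next parametrized case starts; on memory-constrained CI GPUs this keeps the seed-parametrized suites from accumulating allocations across runs. A hedged sketch of the same idea as a reusable helper; gc.collect and torch.cuda.empty_cache are an extra precaution here, not something this commit adds:

import gc

import torch


def free_cuda_memory() -> None:
    # Call after `del model_runner`: collecting unreachable objects frees
    # their CUDA tensors, and empty_cache returns the cached allocator
    # blocks to the driver so the next test case starts from a clean slate.
    gc.collect()
    torch.cuda.empty_cache()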