[torch.compile] integration with compilation control (#9058)
tests/compile/test_basic_correctness.py (new file, 48 lines)
@@ -0,0 +1,48 @@
+from typing import Dict, List, Optional
+
+import pytest
+
+from vllm.compilation.levels import CompilationLevel
+from vllm.utils import cuda_device_count_stateless
+
+from ..utils import compare_all_settings
+
+
+# we cannot afford testing the full Cartesian product
+# of all models and all levels
+@pytest.mark.parametrize(
+    "model, model_args, pp_size, tp_size, attn_backend, method, fullgraph",
+    [
+        ("meta-llama/Meta-Llama-3-8B", [], 2, 2, "FLASH_ATTN", "generate",
+         True),
+        ("nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dyn-Per-Token-2048-Samples",
+         ["--quantization", "compressed-tensors"
+          ], 1, 1, "FLASH_ATTN", "generate", True),
+        ("google/gemma-2-2b-it", [], 1, 2, "FLASHINFER", "generate", True),
+        # TODO: add multi-modality test for llava
+        ("llava-hf/llava-1.5-7b-hf", [], 2, 1, "FLASHINFER", "generate", False)
+    ])
+def test_compile_correctness(model, model_args, pp_size, tp_size, attn_backend,
+                             method, fullgraph):
+    # this test is run under multiple suites, with different GPUs.
+    # make sure we only run the test with the correct CUDA devices.
+    # don't use "<", as it will duplicate the tests.
+    if cuda_device_count_stateless() != pp_size * tp_size:
+        pytest.skip("Not correct CUDA devices for the test.")
+    import os
+    os.environ["VLLM_ATTENTION_BACKEND"] = attn_backend
+    if not fullgraph:
+        os.environ["VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "0"
+    all_args = [["--enforce-eager"] + model_args + ["--max_model_len", "1024"]
+                + ["-pp", str(pp_size)] + ["-tp", str(tp_size)]] * 3
+    # don't test the VLLM_TORCH_COMPILE_LEVEL == 3 case:
+    # Inductor will change the output, so we cannot compare it.
+    all_envs: List[Optional[Dict[str, str]]] = [{
+        "VLLM_TORCH_COMPILE_LEVEL":
+        str(level)
+    } for level in [
+        CompilationLevel.NO_COMPILATION,
+        CompilationLevel.DYNAMO_AS_IS,
+        CompilationLevel.DYNAMO_ONCE,
+    ]]
+    compare_all_settings(model, all_args, all_envs, method=method)
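The test above drives the new VLLM_TORCH_COMPILE_LEVEL knob through the CompilationLevel constants. The definition of vllm/compilation/levels.py is not part of this diff; as a rough sketch consistent with how the tests use it (integer comparison against INDUCTOR later in utils.py, and the comment above noting that level 3 is the Inductor case), the levels could look like:

# Hypothetical sketch only; the real vllm.compilation.levels may differ.
class CompilationLevel:
    NO_COMPILATION = 0  # plain eager execution, no torch.compile at all
    DYNAMO_AS_IS = 1    # hand the model to Dynamo with the stock backend
    DYNAMO_ONCE = 2     # capture the Dynamo graph once and reuse it
    INDUCTOR = 3        # full torch.compile with the Inductor backend

The basic-correctness test deliberately stops at DYNAMO_ONCE because Inductor may change numerics, so its outputs cannot be compared exactly against the eager baseline.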
@@ -1,13 +1,20 @@
 import pytest
 
-from vllm.compilation.backends import vllm_backend
+from vllm.compilation.levels import CompilationLevel
 
+from ..utils import fork_new_process_for_each_test
 from .utils import TEST_MODELS, check_full_graph_support
 
 
 @pytest.mark.parametrize("model_info", TEST_MODELS)
-@pytest.mark.parametrize("backend", ["eager", vllm_backend])
-def test_full_graph(model_info, backend):
+@pytest.mark.parametrize(
+    "optimization_level",
+    [CompilationLevel.DYNAMO_ONCE, CompilationLevel.INDUCTOR])
+@fork_new_process_for_each_test
+def test_full_graph(model_info, optimization_level):
     model = model_info[0]
     model_kwargs = model_info[1]
-    check_full_graph_support(model, model_kwargs, backend, tp_size=1)
+    check_full_graph_support(model,
+                             model_kwargs,
+                             optimization_level,
+                             tp_size=1)
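The reworked test is now decorated with fork_new_process_for_each_test, presumably because the compilation level is read from the environment and torch.compile state is process-global, so each parametrization needs a fresh process. As an illustration of the idea only (not the helper's actual implementation in tests/utils.py, and POSIX-only since it uses os.fork):

import functools
import os

def fork_new_process_for_each_test(test):
    # Illustrative sketch: run the wrapped test in a forked child process so
    # that environment variables and compilation state set by one
    # parametrization cannot leak into the next one.
    @functools.wraps(test)
    def wrapper(*args, **kwargs):
        pid = os.fork()
        if pid == 0:
            # Child: run the test body and exit immediately afterwards.
            try:
                test(*args, **kwargs)
                os._exit(0)
            except BaseException:
                os._exit(1)
        # Parent: wait for the child and propagate any failure.
        _, status = os.waitpid(pid, 0)
        assert status == 0, f"{test.__name__} failed in the child process"
    return wrapper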
@@ -1,22 +0,0 @@
-import pytest
-
-from vllm.compilation.backends import vllm_backend
-from vllm.utils import cuda_device_count_stateless
-
-from ..utils import fork_new_process_for_each_test
-from .utils import TEST_MODELS_SMOKE, check_full_graph_support
-
-
-@pytest.mark.parametrize("model_info", TEST_MODELS_SMOKE)
-@pytest.mark.parametrize("tp_size", [2])
-@pytest.mark.parametrize("backend", ["eager", vllm_backend])
-@fork_new_process_for_each_test
-def test_full_graph_multi_gpu(model_info, tp_size, backend):
-    model = model_info[0]
-    model_kwargs = model_info[1]
-
-    # Skip the test if there are not enough CUDA devices.
-    if cuda_device_count_stateless() < tp_size:
-        pytest.skip("Not enough CUDA devices for the test.")
-
-    check_full_graph_support(model, model_kwargs, backend, tp_size=tp_size)
@@ -1,13 +0,0 @@
-import pytest
-
-from vllm.compilation.backends import vllm_backend
-
-from .utils import TEST_MODELS_SMOKE, check_full_graph_support
-
-
-@pytest.mark.parametrize("model_info", TEST_MODELS_SMOKE)
-@pytest.mark.parametrize("backend", ["eager", vllm_backend])
-def test_full_graph(model_info, backend):
-    model = model_info[0]
-    model_kwargs = model_info[1]
-    check_full_graph_support(model, model_kwargs, backend, tp_size=1)
@@ -4,16 +4,9 @@ import torch
 
 from tests.quantization.utils import is_quant_method_supported
 from vllm import LLM, SamplingParams
-from vllm.plugins import set_torch_compile_backend
+from vllm.compilation.levels import CompilationLevel
 from vllm.utils import is_hip
 
-TEST_MODELS_SMOKE = [
-    ("nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dyn-Per-Token-2048-Samples", {
-        "quantization": "compressed-tensors"
-    }),
-    ("meta-llama/Meta-Llama-3-8B", {}),
-]
-
 TEST_MODELS = [
     ("facebook/opt-125m", {}),
     ("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", {
@@ -68,20 +61,21 @@ if not is_hip() and is_quant_method_supported("awq"):
     }))
 
 
-def check_full_graph_support(model, model_kwargs, backend, tp_size=1):
+def check_full_graph_support(model,
+                             model_kwargs,
+                             optimization_level,
+                             tp_size=1):
     # make sure these models can be captured in full graph mode
-    if "VLLM_TEST_DYNAMO_GRAPH_CAPTURE" not in os.environ:
-        os.environ["VLLM_TEST_DYNAMO_GRAPH_CAPTURE"] = "1"
-        os.environ["VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "1"
+    os.environ["VLLM_TORCH_COMPILE_LEVEL"] = str(optimization_level)
+    os.environ["VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "1"
 
     # Inductor doesn't support fp8/gptq_marlin_24 yet.
     quantization = model_kwargs.get("quantization")
     if (quantization == "fp8" or quantization == "gptq_marlin"
-            or quantization == "gptq_marlin_24") and backend != "eager":
+            or quantization == "gptq_marlin_24"
+        ) and optimization_level >= CompilationLevel.INDUCTOR:
         return
 
-    set_torch_compile_backend(backend)
-
     prompts = [
         "Hello, my name is",
         "The president of the United States is",
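The reworked check_full_graph_support drives everything through environment variables rather than an explicit backend object. A minimal end-to-end usage sketch built only from names that appear in this diff; the level value "2" for DYNAMO_ONCE is an assumption matching the earlier sketch, and the variables are presumably read at engine startup, so they are set before constructing the LLM:

import os

# Assumed numeric mapping: "2" == CompilationLevel.DYNAMO_ONCE.
os.environ["VLLM_TORCH_COMPILE_LEVEL"] = "2"
os.environ["VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "1"

from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m", enforce_eager=True)
outputs = llm.generate(["Hello, my name is"],
                       SamplingParams(temperature=0.0, max_tokens=16))
print(outputs[0].outputs[0].text)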