[FrontEnd] UNREVERT CompilationConfig overhaul (#20283): deprecate use_inductor in favor of backend, simplify custom_ops (#26502)
Signed-off-by: morrison-turnansky <mturnans@redhat.com> Signed-off-by: Morrison Turnansky <mturnans@redhat.com> Signed-off-by: Luka Govedič <ProExpertProg@users.noreply.github.com> Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com> Co-authored-by: Jiangyun Zhu <riverclouds.zhu@qq.com>
This commit is contained in:
committed by
GitHub
parent
7200a21cd1
commit
e3fdb627d9
@ -258,13 +258,13 @@ def tractable_computation(
|
||||
|
||||
@torch.inference_mode
|
||||
def run_model(
|
||||
llama_config, use_compile: bool, use_inductor: bool, split_attn: bool = False
|
||||
llama_config, use_compile: bool, backend: str, split_attn: bool = False
|
||||
) -> torch.Tensor:
|
||||
if use_compile:
|
||||
compilation_config = CompilationConfig(
|
||||
level=CompilationLevel.PIECEWISE,
|
||||
use_cudagraph=True,
|
||||
use_inductor=use_inductor,
|
||||
backend=backend,
|
||||
cudagraph_capture_sizes=[1, 2],
|
||||
)
|
||||
if split_attn:
|
||||
@ -338,8 +338,8 @@ def run_model(
|
||||
return output.cpu()
|
||||
|
||||
|
||||
@pytest.mark.parametrize("use_inductor", [True, False])
|
||||
def test_toy_llama(use_inductor: bool):
|
||||
@pytest.mark.parametrize("backend", ["inductor", "eager"])
|
||||
def test_toy_llama(backend: str):
|
||||
# compare output with and without piecewise compilation
|
||||
|
||||
llama_config = LlamaConfig(
|
||||
@ -358,10 +358,10 @@ def test_toy_llama(use_inductor: bool):
|
||||
num_backend_compilations=0,
|
||||
num_cudagraph_captured=0,
|
||||
):
|
||||
outputs.append(run_model(llama_config, use_inductor=False, use_compile=False))
|
||||
run_model(tractable_config, use_inductor=False, use_compile=False)
|
||||
outputs.append(run_model(llama_config, backend="eager", use_compile=False))
|
||||
run_model(tractable_config, backend="eager", use_compile=False)
|
||||
|
||||
if use_inductor:
|
||||
if backend == "inductor":
|
||||
kwargs = {"num_inductor_compiles": 1, "num_eager_compiles": 0}
|
||||
else:
|
||||
kwargs = {"num_eager_compiles": 1, "num_inductor_compiles": 0}
|
||||
@ -377,10 +377,8 @@ def test_toy_llama(use_inductor: bool):
|
||||
num_cudagraph_captured=2,
|
||||
**kwargs,
|
||||
):
|
||||
outputs.append(
|
||||
run_model(llama_config, use_inductor=use_inductor, use_compile=True)
|
||||
)
|
||||
run_model(tractable_config, use_inductor=use_inductor, use_compile=True)
|
||||
outputs.append(run_model(llama_config, backend=backend, use_compile=True))
|
||||
run_model(tractable_config, backend=backend, use_compile=True)
|
||||
|
||||
with compilation_counter.expect(
|
||||
num_graphs_seen=1, # one graph for the model
|
||||
@ -395,16 +393,9 @@ def test_toy_llama(use_inductor: bool):
|
||||
), # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
|
||||
):
|
||||
outputs.append(
|
||||
run_model(
|
||||
llama_config,
|
||||
use_inductor=use_inductor,
|
||||
use_compile=True,
|
||||
split_attn=True,
|
||||
)
|
||||
run_model(llama_config, backend=backend, use_compile=True, split_attn=True)
|
||||
)
|
||||
run_model(
|
||||
tractable_config, use_inductor=use_inductor, use_compile=True, split_attn=True
|
||||
)
|
||||
run_model(tractable_config, backend=backend, use_compile=True, split_attn=True)
|
||||
|
||||
for i in range(1, len(outputs)):
|
||||
assert torch.allclose(outputs[0], outputs[i])
|
||||
|
||||
@ -77,14 +77,15 @@ class TestSetting:
|
||||
method="encode",
|
||||
),
|
||||
# vision language model
|
||||
TestSetting(
|
||||
model="microsoft/Phi-3.5-vision-instruct",
|
||||
model_args=["--trust-remote-code", "--max-model-len", "2048"],
|
||||
pp_size=2,
|
||||
tp_size=1,
|
||||
attn_backend="FLASH_ATTN",
|
||||
method="generate_with_image",
|
||||
),
|
||||
# See https://github.com/vllm-project/vllm/issues/26716.
|
||||
# TestSetting(
|
||||
# model="microsoft/Phi-3.5-vision-instruct",
|
||||
# model_args=["--trust-remote-code", "--max-model-len", "2048"],
|
||||
# pp_size=2,
|
||||
# tp_size=1,
|
||||
# attn_backend="FLASH_ATTN",
|
||||
# method="generate_with_image",
|
||||
# ),
|
||||
],
|
||||
)
|
||||
def test_compile_correctness(
|
||||
@ -109,41 +110,46 @@ def test_compile_correctness(
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
|
||||
final_args = [
|
||||
"--enforce-eager",
|
||||
*model_args,
|
||||
"-pp",
|
||||
str(pp_size),
|
||||
"-tp",
|
||||
str(tp_size),
|
||||
"-O.cudagraph_mode=none",
|
||||
]
|
||||
|
||||
all_args: list[list[str]] = []
|
||||
all_envs: list[dict[str, str] | None] = []
|
||||
|
||||
for level in [
|
||||
CompilationLevel.NO_COMPILATION,
|
||||
for comp_level in [
|
||||
CompilationLevel.DYNAMO_AS_IS,
|
||||
CompilationLevel.DYNAMO_ONCE,
|
||||
CompilationLevel.PIECEWISE,
|
||||
]:
|
||||
all_args.append(final_args + [f"-O{level}"])
|
||||
all_envs.append({})
|
||||
for level in [CompilationLevel.NO_COMPILATION, comp_level]:
|
||||
all_args.append(
|
||||
final_args + [f"-O.level={level}", "-O.backend=inductor"]
|
||||
)
|
||||
|
||||
# inductor will change the output, so we only compare if the output
|
||||
# is close, not exactly the same.
|
||||
compare_all_settings(
|
||||
model,
|
||||
all_args,
|
||||
all_envs,
|
||||
method=method if method != "generate" else "generate_close",
|
||||
)
|
||||
all_envs.clear()
|
||||
all_args.clear()
|
||||
# inductor will change the output, so we only compare if the output
|
||||
# is close, not exactly the same.
|
||||
compare_all_settings(
|
||||
model,
|
||||
all_args,
|
||||
all_envs,
|
||||
method=method if method != "generate" else "generate_close",
|
||||
)
|
||||
all_envs.clear()
|
||||
all_args.clear()
|
||||
|
||||
for level in [
|
||||
CompilationLevel.NO_COMPILATION,
|
||||
CompilationLevel.DYNAMO_AS_IS,
|
||||
CompilationLevel.DYNAMO_ONCE,
|
||||
CompilationLevel.PIECEWISE,
|
||||
]:
|
||||
all_args.append(final_args + [f"-O{level}"])
|
||||
all_args.append(final_args + [f"-O.level={level}", "-O.backend=eager"])
|
||||
all_envs.append({})
|
||||
all_envs.append({})
|
||||
|
||||
compare_all_settings(model, all_args * 3, all_envs, method=method)
|
||||
|
||||
@ -36,55 +36,56 @@ class Relu3(ReLUSquaredActivation):
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"env, torch_level, use_inductor, ops_enabled, default_on",
|
||||
"env, torch_level, backend, ops_enabled, default_on",
|
||||
[
|
||||
# Default values based on compile level
|
||||
# - All by default (no Inductor compilation)
|
||||
(None, 0, False, [True] * 4, True),
|
||||
(None, 1, True, [True] * 4, True),
|
||||
(None, 2, False, [True] * 4, True),
|
||||
(None, 0, "eager", [True] * 4, True),
|
||||
(None, 1, "eager", [True] * 4, True),
|
||||
(None, 2, "eager", [True] * 4, True),
|
||||
(None, 3, "eager", [True] * 4, True),
|
||||
# - None by default (with Inductor)
|
||||
(None, 3, True, [False] * 4, False),
|
||||
(None, 4, True, [False] * 4, False),
|
||||
# - All by default (without Inductor)
|
||||
(None, 3, False, [True] * 4, True),
|
||||
(None, 4, False, [True] * 4, True),
|
||||
(None, 0, "inductor", [True] * 4, True),
|
||||
# - None by default (with Inductor)
|
||||
(None, 1, "inductor", [False] * 4, False),
|
||||
(None, 2, "inductor", [False] * 4, False),
|
||||
(None, 3, "inductor", [False] * 4, False),
|
||||
# Explicitly enabling/disabling
|
||||
#
|
||||
# Default: all
|
||||
#
|
||||
# All but SiluAndMul
|
||||
("+rms_norm,-silu_and_mul", 0, True, [1, 0, 1, 1], True),
|
||||
("+rms_norm,-silu_and_mul", 0, "inductor", [1, 0, 1, 1], True),
|
||||
# Only ReLU3
|
||||
("none,-rms_norm,+relu3", 1, False, [0, 0, 0, 1], False),
|
||||
("none,-rms_norm,+relu3", 1, "eager", [0, 0, 0, 1], False),
|
||||
# All but SiluAndMul
|
||||
("all,-silu_and_mul", 2, True, [1, 0, 1, 1], True),
|
||||
("all,-silu_and_mul", 2, "inductor", [1, 0, 1, 1], True),
|
||||
# All but ReLU3 (even if ReLU2 is on)
|
||||
("-relu3,+relu2", 3, False, [1, 1, 1, 0], True),
|
||||
("-relu3,+relu2", 3, "eager", [1, 1, 1, 0], True),
|
||||
# RMSNorm and SiluAndMul
|
||||
("none,-relu3,+rms_norm,+silu_and_mul", 4, False, [1, 1, 0, 0], False),
|
||||
("none,-relu3,+rms_norm,+silu_and_mul", 3, "eager", [1, 1, 0, 0], False),
|
||||
# All but RMSNorm
|
||||
("-rms_norm", 3, False, [0, 1, 1, 1], True),
|
||||
("-rms_norm", 3, "eager", [0, 1, 1, 1], True),
|
||||
#
|
||||
# Default: none
|
||||
#
|
||||
# Only ReLU3
|
||||
("-silu_and_mul,+relu3", 3, True, [0, 0, 0, 1], False),
|
||||
("none,+relu3", 3, "inductor", [0, 0, 0, 1], False),
|
||||
# All but RMSNorm
|
||||
("all,-rms_norm", 4, True, [0, 1, 1, 1], True),
|
||||
("all,-rms_norm", 3, "inductor", [0, 1, 1, 1], True),
|
||||
],
|
||||
)
|
||||
def test_enabled_ops(
|
||||
env: str | None,
|
||||
torch_level: int,
|
||||
use_inductor: bool,
|
||||
backend: str,
|
||||
ops_enabled: list[int],
|
||||
default_on: bool,
|
||||
):
|
||||
custom_ops = env.split(",") if env else []
|
||||
vllm_config = VllmConfig(
|
||||
compilation_config=CompilationConfig(
|
||||
use_inductor=bool(use_inductor), level=torch_level, custom_ops=custom_ops
|
||||
backend=backend, level=torch_level, custom_ops=custom_ops
|
||||
)
|
||||
)
|
||||
with set_current_vllm_config(vllm_config):
|
||||
|
||||
Reference in New Issue
Block a user