Compare commits
39 Commits
bench-late
...
whisper-tr
| Author | SHA1 | Date | |
|---|---|---|---|
| d3eddd6ef1 | |||
| e75a6301bd | |||
| a79cc68b3a | |||
| 7e3f7a4ee7 | |||
| 9ec8257914 | |||
| 38327cf454 | |||
| dfa82e2a3d | |||
| e59ca942f5 | |||
| a57a3044aa | |||
| 4e5a0f6ae2 | |||
| b63bd14999 | |||
| 2041c0e360 | |||
| 085cbc4f9f | |||
| 2b93162fb0 | |||
| 2e45bd29fe | |||
| 51d7c6a2b2 | |||
| f3aca1ee30 | |||
| 8dd41d6bcc | |||
| 0a298ea418 | |||
| d330558bab | |||
| 656fd72976 | |||
| 79455cf421 | |||
| 30d6a015e0 | |||
| 8af5a5c4e5 | |||
| 3a5f0afcd2 | |||
| c7e63aa4d8 | |||
| 4a9ce1784c | |||
| 7e4e709b43 | |||
| 63d8eabed0 | |||
| e830b01383 | |||
| ff6473980d | |||
| a164aea35d | |||
| a76f547e11 | |||
| b7b7676d67 | |||
| e6e3c55ef2 | |||
| f98a4920f9 | |||
| d4bfc23ef0 | |||
| 9a2160fa55 | |||
| 2de4118243 |
@ -3,7 +3,7 @@ steps:
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag vllm-ci:build-image --target build --progress plain ."
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/upload-wheels.sh"
|
||||
@ -14,7 +14,7 @@ steps:
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain ."
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/upload-wheels.sh"
|
||||
@ -31,7 +31,7 @@ steps:
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain ."
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/upload-wheels.sh"
|
||||
@ -48,7 +48,7 @@ steps:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain ."
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
|
||||
|
||||
- label: "Build and publish TPU release image"
|
||||
@ -57,7 +57,7 @@ steps:
|
||||
agents:
|
||||
queue: tpu_queue_postmerge
|
||||
commands:
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f Dockerfile.tpu ."
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f docker/Dockerfile.tpu ."
|
||||
- "docker push vllm/vllm-tpu:nightly"
|
||||
- "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT"
|
||||
plugins:
|
||||
@ -82,7 +82,7 @@ steps:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f Dockerfile.cpu ."
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
|
||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
@ -10,5 +10,5 @@ trap remove_docker_container EXIT
|
||||
remove_docker_container
|
||||
|
||||
# Try building the docker image
|
||||
docker build -t cpu-test -f Dockerfile.ppc64le .
|
||||
docker build -t cpu-test -f docker/Dockerfile.ppc64le .
|
||||
|
||||
|
||||
@ -18,8 +18,8 @@ trap remove_docker_container EXIT
|
||||
remove_docker_container
|
||||
|
||||
# Try building the docker image
|
||||
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$BUILDKITE_BUILD_NUMBER" --target vllm-test -f Dockerfile.cpu .
|
||||
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 --target vllm-test -f Dockerfile.cpu .
|
||||
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$BUILDKITE_BUILD_NUMBER" --target vllm-test -f docker/Dockerfile.cpu .
|
||||
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
|
||||
|
||||
# Run the image, setting --shm-size=4g for tensor parallel.
|
||||
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
|
||||
|
||||
@ -9,6 +9,7 @@ python3 use_existing_torch.py
|
||||
|
||||
# Try building the docker image
|
||||
DOCKER_BUILDKIT=1 docker build . \
|
||||
--file docker/Dockerfile \
|
||||
--target vllm-openai \
|
||||
--platform "linux/arm64" \
|
||||
-t gh200-test \
|
||||
|
||||
@ -5,7 +5,7 @@
|
||||
set -ex
|
||||
|
||||
# Try building the docker image
|
||||
docker build -t hpu-test-env -f Dockerfile.hpu .
|
||||
docker build -t hpu-test-env -f docker/Dockerfile.hpu .
|
||||
|
||||
# Setup cleanup
|
||||
# certain versions of HPU software stack have a bug that can
|
||||
|
||||
@ -35,7 +35,7 @@ else
|
||||
date "+%s" > /tmp/neuron-docker-build-timestamp
|
||||
fi
|
||||
|
||||
docker build -t "${image_name}" -f Dockerfile.neuron .
|
||||
docker build -t "${image_name}" -f docker/Dockerfile.neuron .
|
||||
|
||||
# Setup cleanup
|
||||
remove_docker_container() {
|
||||
|
||||
@ -3,7 +3,7 @@
|
||||
set -e
|
||||
|
||||
# Build the docker image.
|
||||
docker build -f Dockerfile.tpu -t vllm-tpu .
|
||||
docker build -f docker/Dockerfile.tpu -t vllm-tpu .
|
||||
|
||||
# Set up cleanup.
|
||||
remove_docker_container() { docker rm -f tpu-test || true; }
|
||||
@ -21,6 +21,8 @@ docker run --privileged --net host --shm-size=16G -it \
|
||||
&& python3 -m pip install lm_eval[api]==0.4.4 \
|
||||
&& export VLLM_USE_V1=1 \
|
||||
&& export VLLM_XLA_CHECK_RECOMPILATION=1 \
|
||||
&& echo TEST_0 \
|
||||
&& pytest -v -s /workspace/vllm/tests/v1/tpu/test_perf.py \
|
||||
&& echo TEST_1 \
|
||||
&& pytest -v -s /workspace/vllm/tests/tpu/test_compilation.py \
|
||||
&& echo TEST_2 \
|
||||
|
||||
@ -8,7 +8,7 @@ image_name="xpu/vllm-ci:${BUILDKITE_COMMIT}"
|
||||
container_name="xpu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
|
||||
|
||||
# Try building the docker image
|
||||
docker build -t ${image_name} -f Dockerfile.xpu .
|
||||
docker build -t ${image_name} -f docker/Dockerfile.xpu .
|
||||
|
||||
# Setup cleanup
|
||||
remove_docker_container() {
|
||||
|
||||
2
.github/mergify.yml
vendored
2
.github/mergify.yml
vendored
@ -19,7 +19,7 @@ pull_request_rules:
|
||||
- files~=\.buildkite/
|
||||
- files~=^cmake/
|
||||
- files=CMakeLists.txt
|
||||
- files~=^Dockerfile
|
||||
- files~=^docker/Dockerfile
|
||||
- files~=^requirements.*\.txt
|
||||
- files=setup.py
|
||||
actions:
|
||||
|
||||
2
.github/workflows/lint-and-deploy.yaml
vendored
2
.github/workflows/lint-and-deploy.yaml
vendored
@ -50,7 +50,7 @@ jobs:
|
||||
uses: helm/kind-action@a1b0e391336a6ee6713a0583f8c6240d70863de3 # v1.12.0
|
||||
|
||||
- name: Build the Docker image vllm cpu
|
||||
run: docker buildx build -f Dockerfile.cpu -t vllm-cpu-env .
|
||||
run: docker buildx build -f docker/Dockerfile.cpu -t vllm-cpu-env .
|
||||
|
||||
- name: Configuration of docker images, network and namespace for the kind cluster
|
||||
run: |
|
||||
|
||||
@ -1,3 +1,6 @@
|
||||
default_install_hook_types:
|
||||
- pre-commit
|
||||
- commit-msg
|
||||
default_stages:
|
||||
- pre-commit # Run locally
|
||||
- manual # Run in CI
|
||||
|
||||
@ -44,7 +44,7 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1
|
||||
#
|
||||
# Note: the CUDA torch version is derived from pyproject.toml and various
|
||||
# requirements.txt files and should be kept consistent. The ROCm torch
|
||||
# versions are derived from Dockerfile.rocm
|
||||
# versions are derived from docker/Dockerfile.rocm
|
||||
#
|
||||
set(TORCH_SUPPORTED_VERSION_CUDA "2.6.0")
|
||||
set(TORCH_SUPPORTED_VERSION_ROCM "2.6.0")
|
||||
@ -242,6 +242,7 @@ set(VLLM_EXT_SRC
|
||||
"csrc/quantization/gguf/gguf_kernel.cu"
|
||||
"csrc/cuda_utils_kernels.cu"
|
||||
"csrc/prepare_inputs/advance_step.cu"
|
||||
"csrc/custom_all_reduce.cu"
|
||||
"csrc/torch_bindings.cpp")
|
||||
|
||||
if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
@ -283,7 +284,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
"csrc/mamba/causal_conv1d/causal_conv1d.cu"
|
||||
"csrc/quantization/aqlm/gemm_kernels.cu"
|
||||
"csrc/quantization/awq/gemm_kernels.cu"
|
||||
"csrc/custom_all_reduce.cu"
|
||||
"csrc/permute_cols.cu"
|
||||
"csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
|
||||
"csrc/quantization/fp4/nvfp4_quant_entry.cu"
|
||||
|
||||
@ -5,21 +5,18 @@ import argparse
|
||||
import dataclasses
|
||||
import json
|
||||
import os
|
||||
import random
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Any, Optional, Union
|
||||
from typing import Any, Optional
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from benchmark_utils import (convert_to_pytorch_benchmark_format, get_requests,
|
||||
validate_dataset, write_to_json)
|
||||
from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
|
||||
from tqdm import tqdm
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.engine.arg_utils import EngineArgs
|
||||
from vllm.inputs import TextPrompt, TokensPrompt
|
||||
from vllm.inputs import PromptType
|
||||
from vllm.sampling_params import BeamSearchParams
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
|
||||
@ -51,34 +48,28 @@ def main(args: argparse.Namespace):
|
||||
|
||||
sampling_params = SamplingParams(
|
||||
n=args.n,
|
||||
temperature=0,
|
||||
temperature=1.0,
|
||||
top_p=1.0,
|
||||
ignore_eos=True,
|
||||
max_tokens=args.output_len,
|
||||
detokenize=not args.disable_detokenize,
|
||||
)
|
||||
print(sampling_params)
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(
|
||||
args.tokenizer, trust_remote_code=args.trust_remote_code)
|
||||
requests = get_requests(args.batch_size, args, tokenizer)
|
||||
prompts: list[Union[TextPrompt, TokensPrompt]] = []
|
||||
for request in requests:
|
||||
prompts.append(
|
||||
TokensPrompt(prompt_token_ids=request.prompt["prompt_token_ids"],
|
||||
multi_modal_data=request.multi_modal_data)
|
||||
if "prompt_token_ids" in request.prompt else \
|
||||
TextPrompt(prompt=request.prompt,
|
||||
multi_modal_data=request.multi_modal_data))
|
||||
dummy_prompt_token_ids = np.random.randint(10000,
|
||||
size=(args.batch_size,
|
||||
args.input_len))
|
||||
dummy_prompts: list[PromptType] = [{
|
||||
"prompt_token_ids": batch
|
||||
} for batch in dummy_prompt_token_ids.tolist()]
|
||||
|
||||
def llm_generate():
|
||||
if not args.use_beam_search:
|
||||
llm.generate(prompts,
|
||||
llm.generate(dummy_prompts,
|
||||
sampling_params=sampling_params,
|
||||
use_tqdm=False)
|
||||
else:
|
||||
llm.beam_search(
|
||||
prompts,
|
||||
dummy_prompts,
|
||||
BeamSearchParams(
|
||||
beam_width=args.n,
|
||||
max_tokens=args.output_len,
|
||||
@ -189,44 +180,7 @@ if __name__ == "__main__":
|
||||
help=("Do not detokenize responses (i.e. do not include "
|
||||
"detokenization time in the latency measurement)"),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--dataset-name",
|
||||
type=str,
|
||||
choices=["sharegpt", "random", "sonnet", "burstgpt", "hf"],
|
||||
help="Name of the dataset to benchmark on.",
|
||||
default="sharegpt")
|
||||
# random dataset
|
||||
parser.add_argument(
|
||||
"--random-range-ratio",
|
||||
type=float,
|
||||
default=None,
|
||||
help="Range of sampled ratio of input/output length, "
|
||||
"used only for RandomDataSet.",
|
||||
)
|
||||
parser.add_argument("--dataset-path",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Path to the dataset")
|
||||
|
||||
# LoRA
|
||||
parser.add_argument(
|
||||
"--lora-path",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Path to the lora adapters to use. This can be an absolute path, "
|
||||
"a relative path, or a Hugging Face model identifier.")
|
||||
|
||||
parser.add_argument("--prefix-len",
|
||||
type=int,
|
||||
default=None,
|
||||
help="Number of prefix tokens per request."
|
||||
"This is for the RandomDataset and SonnetDataset")
|
||||
|
||||
parser = EngineArgs.add_cli_args(parser)
|
||||
args = parser.parse_args()
|
||||
if args.tokenizer is None:
|
||||
args.tokenizer = args.model
|
||||
args.backend = "vllm"
|
||||
validate_dataset(args)
|
||||
random.seed(0)
|
||||
main(args)
|
||||
|
||||
@ -11,9 +11,11 @@ from typing import Any, Optional, Union
|
||||
|
||||
import torch
|
||||
import uvloop
|
||||
from benchmark_dataset import SampleRequest
|
||||
from benchmark_utils import (convert_to_pytorch_benchmark_format, get_requests,
|
||||
validate_dataset, write_to_json)
|
||||
from benchmark_dataset import (BurstGPTDataset, ConversationDataset,
|
||||
InstructCoderDataset, RandomDataset,
|
||||
SampleRequest, ShareGPTDataset, SonnetDataset,
|
||||
VisionArenaDataset)
|
||||
from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
|
||||
from tqdm import tqdm
|
||||
from transformers import (AutoModelForCausalLM, AutoTokenizer,
|
||||
PreTrainedTokenizerBase)
|
||||
@ -285,6 +287,59 @@ def save_to_pytorch_benchmark_format(args: argparse.Namespace,
|
||||
write_to_json(pt_file, pt_records)
|
||||
|
||||
|
||||
def get_requests(args, tokenizer):
|
||||
# Common parameters for all dataset types.
|
||||
common_kwargs = {
|
||||
"dataset_path": args.dataset_path,
|
||||
"random_seed": args.seed,
|
||||
}
|
||||
sample_kwargs = {
|
||||
"tokenizer": tokenizer,
|
||||
"lora_path": args.lora_path,
|
||||
"max_loras": args.max_loras,
|
||||
"num_requests": args.num_prompts,
|
||||
"input_len": args.input_len,
|
||||
"output_len": args.output_len,
|
||||
}
|
||||
|
||||
if args.dataset_path is None or args.dataset_name == "random":
|
||||
sample_kwargs["range_ratio"] = args.random_range_ratio
|
||||
sample_kwargs["prefix_len"] = args.prefix_len
|
||||
dataset_cls = RandomDataset
|
||||
elif args.dataset_name == "sharegpt":
|
||||
dataset_cls = ShareGPTDataset
|
||||
if args.backend == "vllm-chat":
|
||||
sample_kwargs["enable_multimodal_chat"] = True
|
||||
elif args.dataset_name == "sonnet":
|
||||
assert tokenizer.chat_template or tokenizer.default_chat_template, (
|
||||
"Tokenizer/model must have chat template for sonnet dataset.")
|
||||
dataset_cls = SonnetDataset
|
||||
sample_kwargs["prefix_len"] = args.prefix_len
|
||||
sample_kwargs["return_prompt_formatted"] = True
|
||||
elif args.dataset_name == "burstgpt":
|
||||
dataset_cls = BurstGPTDataset
|
||||
elif args.dataset_name == "hf":
|
||||
if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS:
|
||||
dataset_cls = VisionArenaDataset
|
||||
common_kwargs['dataset_subset'] = None
|
||||
common_kwargs['dataset_split'] = "train"
|
||||
sample_kwargs["enable_multimodal_chat"] = True
|
||||
elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS:
|
||||
dataset_cls = InstructCoderDataset
|
||||
common_kwargs['dataset_split'] = "train"
|
||||
elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS:
|
||||
dataset_cls = ConversationDataset
|
||||
common_kwargs['dataset_subset'] = args.hf_subset
|
||||
common_kwargs['dataset_split'] = args.hf_split
|
||||
sample_kwargs["enable_multimodal_chat"] = True
|
||||
|
||||
else:
|
||||
raise ValueError(f"Unknown dataset name: {args.dataset_name}")
|
||||
# Remove None values
|
||||
sample_kwargs = {k: v for k, v in sample_kwargs.items() if v is not None}
|
||||
return dataset_cls(**common_kwargs).sample(**sample_kwargs)
|
||||
|
||||
|
||||
def main(args: argparse.Namespace):
|
||||
if args.seed is None:
|
||||
args.seed = 0
|
||||
@ -293,7 +348,7 @@ def main(args: argparse.Namespace):
|
||||
# Sample the requests.
|
||||
tokenizer = AutoTokenizer.from_pretrained(
|
||||
args.tokenizer, trust_remote_code=args.trust_remote_code)
|
||||
requests = get_requests(args.num_prompts, args, tokenizer)
|
||||
requests = get_requests(args, tokenizer)
|
||||
is_multi_modal = any(request.multi_modal_data is not None
|
||||
for request in requests)
|
||||
request_outputs: Optional[list[RequestOutput]] = None
|
||||
@ -394,8 +449,47 @@ def validate_args(args):
|
||||
if args.backend not in valid_backends:
|
||||
raise ValueError(f"Unsupported backend: {args.backend}")
|
||||
|
||||
# === Dataset Validation ===
|
||||
validate_dataset(args)
|
||||
# === Dataset Configuration ===
|
||||
if not args.dataset and not args.dataset_path:
|
||||
print(
|
||||
"When dataset path is not set, it will default to random dataset")
|
||||
args.dataset_name = 'random'
|
||||
if args.input_len is None:
|
||||
raise ValueError("input_len must be provided for a random dataset")
|
||||
|
||||
# === Dataset Name Specific Checks ===
|
||||
# --hf-subset and --hf-split: only used
|
||||
# when dataset_name is 'hf'
|
||||
if args.dataset_name != "hf" and (
|
||||
getattr(args, "hf_subset", None) is not None
|
||||
or getattr(args, "hf_split", None) is not None):
|
||||
warnings.warn("--hf-subset and --hf-split will be ignored \
|
||||
since --dataset-name is not 'hf'.",
|
||||
stacklevel=2)
|
||||
elif args.dataset_name == "hf":
|
||||
if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS:
|
||||
assert args.backend == "vllm-chat", "VisionArenaDataset needs to use vllm-chat as the backend." #noqa: E501
|
||||
elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS:
|
||||
assert args.backend == "vllm", "InstructCoder dataset needs to use vllm as the backend." #noqa: E501
|
||||
elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS:
|
||||
assert args.backend == "vllm-chat", "ConversationDataset needs to use vllm-chat as the backend." #noqa: E501
|
||||
else:
|
||||
raise ValueError(
|
||||
f"{args.dataset_path} is not supported by hf dataset.")
|
||||
|
||||
# --random-range-ratio: only used when dataset_name is 'random'
|
||||
if args.dataset_name != 'random' and args.random_range_ratio is not None:
|
||||
warnings.warn("--random-range-ratio will be ignored since \
|
||||
--dataset-name is not 'random'.",
|
||||
stacklevel=2)
|
||||
|
||||
# --prefix-len: only used when dataset_name is 'random', 'sonnet', or not
|
||||
# set.
|
||||
if args.dataset_name not in {"random", "sonnet", None
|
||||
} and args.prefix_len is not None:
|
||||
warnings.warn("--prefix-len will be ignored since --dataset-name\
|
||||
is not 'random', 'sonnet', or not set.",
|
||||
stacklevel=2)
|
||||
|
||||
# === LoRA Settings ===
|
||||
if getattr(args, "enable_lora", False) and args.backend != "vllm":
|
||||
@ -435,6 +529,14 @@ if __name__ == "__main__":
|
||||
choices=["sharegpt", "random", "sonnet", "burstgpt", "hf"],
|
||||
help="Name of the dataset to benchmark on.",
|
||||
default="sharegpt")
|
||||
parser.add_argument(
|
||||
"--dataset",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Path to the ShareGPT dataset, will be deprecated in\
|
||||
the next release. The dataset is expected to "
|
||||
"be a json in form of list[dict[..., conversations: "
|
||||
"list[dict[..., value: <prompt_or_response>]]]]")
|
||||
parser.add_argument("--dataset-path",
|
||||
type=str,
|
||||
default=None,
|
||||
|
||||
@ -4,14 +4,8 @@ import argparse
|
||||
import json
|
||||
import math
|
||||
import os
|
||||
import warnings
|
||||
from typing import Any
|
||||
|
||||
from benchmark_dataset import (BurstGPTDataset, ConversationDataset,
|
||||
InstructCoderDataset, RandomDataset,
|
||||
SampleRequest, ShareGPTDataset, SonnetDataset,
|
||||
VisionArenaDataset)
|
||||
|
||||
|
||||
def convert_to_pytorch_benchmark_format(args: argparse.Namespace,
|
||||
metrics: dict[str, list],
|
||||
@ -73,113 +67,3 @@ class InfEncoder(json.JSONEncoder):
|
||||
def write_to_json(filename: str, records: list) -> None:
|
||||
with open(filename, "w") as f:
|
||||
json.dump(records, f, cls=InfEncoder)
|
||||
|
||||
|
||||
def get_requests(num_requests: int, args: argparse.Namespace,
|
||||
tokenizer: Any) -> list[SampleRequest]:
|
||||
"""
|
||||
Sample the requests for the benchmark.
|
||||
"""
|
||||
# Common parameters for all dataset types.
|
||||
common_kwargs = {
|
||||
"dataset_path": args.dataset_path,
|
||||
"random_seed": args.seed,
|
||||
}
|
||||
sample_kwargs = {
|
||||
"tokenizer": tokenizer,
|
||||
"lora_path": args.lora_path,
|
||||
"max_loras": args.max_loras,
|
||||
"num_requests": num_requests,
|
||||
"input_len": args.input_len,
|
||||
"output_len": args.output_len,
|
||||
}
|
||||
|
||||
if args.dataset_path is None or args.dataset_name == "random":
|
||||
sample_kwargs["range_ratio"] = args.random_range_ratio
|
||||
sample_kwargs["prefix_len"] = args.prefix_len
|
||||
dataset_cls = RandomDataset
|
||||
elif args.dataset_name == "sharegpt":
|
||||
dataset_cls = ShareGPTDataset
|
||||
if getattr(args, "backend", False) and args.backend == "vllm-chat":
|
||||
sample_kwargs["enable_multimodal_chat"] = True
|
||||
elif args.dataset_name == "sonnet":
|
||||
assert tokenizer.chat_template or tokenizer.default_chat_template, (
|
||||
"Tokenizer/model must have chat template for sonnet dataset.")
|
||||
dataset_cls = SonnetDataset
|
||||
sample_kwargs["prefix_len"] = args.prefix_len
|
||||
sample_kwargs["return_prompt_formatted"] = True
|
||||
elif args.dataset_name == "burstgpt":
|
||||
dataset_cls = BurstGPTDataset
|
||||
elif args.dataset_name == "hf":
|
||||
if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS:
|
||||
dataset_cls = VisionArenaDataset
|
||||
common_kwargs['dataset_subset'] = None
|
||||
common_kwargs['dataset_split'] = "train"
|
||||
sample_kwargs["enable_multimodal_chat"] = True
|
||||
elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS:
|
||||
dataset_cls = InstructCoderDataset
|
||||
common_kwargs['dataset_split'] = "train"
|
||||
elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS:
|
||||
dataset_cls = ConversationDataset
|
||||
common_kwargs['dataset_subset'] = args.hf_subset
|
||||
common_kwargs['dataset_split'] = args.hf_split
|
||||
sample_kwargs["enable_multimodal_chat"] = True
|
||||
|
||||
else:
|
||||
raise ValueError(f"Unknown dataset name: {args.dataset_name}")
|
||||
# Remove None values
|
||||
sample_kwargs = {k: v for k, v in sample_kwargs.items() if v is not None}
|
||||
return dataset_cls(**common_kwargs).sample(**sample_kwargs)
|
||||
|
||||
|
||||
def validate_dataset(args: argparse.Namespace, ):
|
||||
"""
|
||||
Validate the dataset arguments.
|
||||
"""
|
||||
# === Dataset Configuration ===
|
||||
if not args.dataset_path:
|
||||
print(
|
||||
"When dataset path is not set, it will default to random dataset")
|
||||
args.dataset_name = 'random'
|
||||
if args.input_len is None:
|
||||
raise ValueError("input_len must be provided for a random dataset")
|
||||
|
||||
# === Dataset Name Specific Checks ===
|
||||
# --hf-subset and --hf-split: only used
|
||||
# when dataset_name is 'hf'
|
||||
if args.dataset_name != "hf" and (
|
||||
getattr(args, "hf_subset", None) is not None
|
||||
or getattr(args, "hf_split", None) is not None):
|
||||
warnings.warn("--hf-subset and --hf-split will be ignored \
|
||||
since --dataset-name is not 'hf'.",
|
||||
stacklevel=2)
|
||||
elif args.dataset_name == "hf":
|
||||
if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS:
|
||||
assert getattr(
|
||||
args, 'backend', None
|
||||
) and args.backend == "vllm-chat", "VisionArenaDataset needs to use vllm-chat as the backend." #noqa: E501
|
||||
elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS:
|
||||
assert getattr(
|
||||
args, 'backend', None
|
||||
) and args.backend == "vllm", "InstructCoder dataset needs to use vllm as the backend." #noqa: E501
|
||||
elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS:
|
||||
assert getattr(
|
||||
args, 'backend', None
|
||||
) and args.backend == "vllm-chat", "ConversationDataset needs to use vllm-chat as the backend." #noqa: E501
|
||||
else:
|
||||
raise ValueError(
|
||||
f"{args.dataset_path} is not supported by hf dataset.")
|
||||
|
||||
# --random-range-ratio: only used when dataset_name is 'random'
|
||||
if args.dataset_name != 'random' and args.random_range_ratio is not None:
|
||||
warnings.warn("--random-range-ratio will be ignored since \
|
||||
--dataset-name is not 'random'.",
|
||||
stacklevel=2)
|
||||
|
||||
# --prefix-len: only used when dataset_name is 'random', 'sonnet', or not
|
||||
# set.
|
||||
if args.dataset_name not in {"random", "sonnet", None
|
||||
} and args.prefix_len is not None:
|
||||
warnings.warn("--prefix-len will be ignored since --dataset-name\
|
||||
is not 'random', 'sonnet', or not set.",
|
||||
stacklevel=2)
|
||||
|
||||
@ -30,19 +30,18 @@ class BenchmarkConfig(TypedDict):
|
||||
num_stages: int
|
||||
|
||||
|
||||
def benchmark_config(
|
||||
config: BenchmarkConfig,
|
||||
num_tokens: int,
|
||||
num_experts: int,
|
||||
shard_intermediate_size: int,
|
||||
hidden_size: int,
|
||||
topk: int,
|
||||
dtype: torch.dtype,
|
||||
use_fp8_w8a8: bool,
|
||||
use_int8_w8a16: bool,
|
||||
num_iters: int = 100,
|
||||
block_quant_shape: List[int] = None,
|
||||
) -> float:
|
||||
def benchmark_config(config: BenchmarkConfig,
|
||||
num_tokens: int,
|
||||
num_experts: int,
|
||||
shard_intermediate_size: int,
|
||||
hidden_size: int,
|
||||
topk: int,
|
||||
dtype: torch.dtype,
|
||||
use_fp8_w8a8: bool,
|
||||
use_int8_w8a16: bool,
|
||||
num_iters: int = 100,
|
||||
block_quant_shape: List[int] = None,
|
||||
use_deep_gemm: bool = False) -> float:
|
||||
init_dtype = torch.float16 if use_fp8_w8a8 else dtype
|
||||
x = torch.randn(num_tokens, hidden_size, dtype=dtype)
|
||||
if use_int8_w8a16:
|
||||
@ -115,22 +114,41 @@ def benchmark_config(
|
||||
def run():
|
||||
from vllm.model_executor.layers.fused_moe import override_config
|
||||
with override_config(config):
|
||||
fused_moe(
|
||||
x,
|
||||
w1,
|
||||
w2,
|
||||
input_gating,
|
||||
topk,
|
||||
renormalize=True,
|
||||
inplace=True,
|
||||
use_fp8_w8a8=use_fp8_w8a8,
|
||||
use_int8_w8a16=use_int8_w8a16,
|
||||
w1_scale=w1_scale,
|
||||
w2_scale=w2_scale,
|
||||
a1_scale=a1_scale,
|
||||
a2_scale=a2_scale,
|
||||
block_shape=block_quant_shape,
|
||||
)
|
||||
if use_deep_gemm:
|
||||
topk_weights, topk_ids = fused_topk(x, input_gating, topk,
|
||||
False)
|
||||
return fused_experts(
|
||||
x,
|
||||
w1,
|
||||
w2,
|
||||
topk_weights,
|
||||
topk_ids,
|
||||
inplace=True,
|
||||
use_fp8_w8a8=use_fp8_w8a8,
|
||||
w1_scale=w1_scale,
|
||||
w2_scale=w2_scale,
|
||||
a1_scale=a1_scale,
|
||||
a2_scale=a2_scale,
|
||||
block_shape=block_quant_shape,
|
||||
allow_deep_gemm=True,
|
||||
)
|
||||
else:
|
||||
fused_moe(
|
||||
x,
|
||||
w1,
|
||||
w2,
|
||||
input_gating,
|
||||
topk,
|
||||
renormalize=True,
|
||||
inplace=True,
|
||||
use_fp8_w8a8=use_fp8_w8a8,
|
||||
use_int8_w8a16=use_int8_w8a16,
|
||||
w1_scale=w1_scale,
|
||||
w2_scale=w2_scale,
|
||||
a1_scale=a1_scale,
|
||||
a2_scale=a2_scale,
|
||||
block_shape=block_quant_shape,
|
||||
)
|
||||
|
||||
# JIT compilation & warmup
|
||||
run()
|
||||
@ -366,6 +384,7 @@ class BenchmarkWorker:
|
||||
use_fp8_w8a8: bool,
|
||||
use_int8_w8a16: bool,
|
||||
block_quant_shape: List[int] = None,
|
||||
use_deep_gemm: bool = False,
|
||||
) -> tuple[dict[str, int], float]:
|
||||
current_platform.seed_everything(self.seed)
|
||||
dtype_str = get_config_dtype_str(dtype,
|
||||
@ -396,7 +415,8 @@ class BenchmarkWorker:
|
||||
use_fp8_w8a8,
|
||||
use_int8_w8a16,
|
||||
num_iters=100,
|
||||
block_quant_shape=block_quant_shape)
|
||||
block_quant_shape=block_quant_shape,
|
||||
use_deep_gemm=use_deep_gemm)
|
||||
return config, kernel_time
|
||||
|
||||
def tune(
|
||||
@ -411,6 +431,7 @@ class BenchmarkWorker:
|
||||
use_int8_w8a16: bool,
|
||||
search_space: list[dict[str, int]],
|
||||
block_quant_shape: list[int],
|
||||
use_deep_gemm: bool,
|
||||
) -> dict[str, int]:
|
||||
best_config = None
|
||||
best_time = float("inf")
|
||||
@ -436,7 +457,8 @@ class BenchmarkWorker:
|
||||
use_fp8_w8a8,
|
||||
use_int8_w8a16,
|
||||
num_iters=20,
|
||||
block_quant_shape=block_quant_shape)
|
||||
block_quant_shape=block_quant_shape,
|
||||
use_deep_gemm=use_deep_gemm)
|
||||
except triton.runtime.autotuner.OutOfResources:
|
||||
# Some configurations may be invalid and fail to compile.
|
||||
continue
|
||||
@ -550,6 +572,8 @@ def main(args: argparse.Namespace):
|
||||
else:
|
||||
batch_sizes = [args.batch_size]
|
||||
|
||||
use_deep_gemm = bool(args.use_deep_gemm)
|
||||
|
||||
ray.init()
|
||||
num_gpus = int(ray.available_resources()["GPU"])
|
||||
workers = [BenchmarkWorker.remote(args.seed) for _ in range(num_gpus)]
|
||||
@ -572,10 +596,10 @@ def main(args: argparse.Namespace):
|
||||
|
||||
start = time.time()
|
||||
configs = _distribute(
|
||||
"tune",
|
||||
[(batch_size, E, shard_intermediate_size, hidden_size, topk, dtype,
|
||||
use_fp8_w8a8, use_int8_w8a16, search_space, block_quant_shape)
|
||||
for batch_size in batch_sizes])
|
||||
"tune", [(batch_size, E, shard_intermediate_size, hidden_size,
|
||||
topk, dtype, use_fp8_w8a8, use_int8_w8a16, search_space,
|
||||
block_quant_shape, use_deep_gemm)
|
||||
for batch_size in batch_sizes])
|
||||
best_configs = {
|
||||
M: sort_config(config)
|
||||
for M, config in zip(batch_sizes, configs)
|
||||
@ -589,7 +613,7 @@ def main(args: argparse.Namespace):
|
||||
outputs = _distribute(
|
||||
"benchmark",
|
||||
[(batch_size, E, shard_intermediate_size, hidden_size, topk, dtype,
|
||||
use_fp8_w8a8, use_int8_w8a16, block_quant_shape)
|
||||
use_fp8_w8a8, use_int8_w8a16, block_quant_shape, use_deep_gemm)
|
||||
for batch_size in batch_sizes])
|
||||
|
||||
for batch_size, (config, kernel_time) in zip(batch_sizes, outputs):
|
||||
@ -611,6 +635,7 @@ if __name__ == "__main__":
|
||||
type=str,
|
||||
choices=["auto", "fp8_w8a8", "int8_w8a16"],
|
||||
default="auto")
|
||||
parser.add_argument("--use-deep-gemm", action="store_true")
|
||||
parser.add_argument("--seed", type=int, default=0)
|
||||
parser.add_argument("--batch-size", type=int, required=False)
|
||||
parser.add_argument("--tune", action="store_true")
|
||||
|
||||
@ -482,16 +482,28 @@ def get_pip_packages(run_lambda, patterns=None):
|
||||
if patterns is None:
|
||||
patterns = DEFAULT_PIP_PATTERNS
|
||||
|
||||
# People generally have `pip` as `pip` or `pip3`
|
||||
# But here it is invoked as `python -mpip`
|
||||
def run_with_pip(pip):
|
||||
out = run_and_read_all(run_lambda, pip + ["list", "--format=freeze"])
|
||||
def run_with_pip():
|
||||
try:
|
||||
import importlib.util
|
||||
pip_spec = importlib.util.find_spec('pip')
|
||||
pip_available = pip_spec is not None
|
||||
except ImportError:
|
||||
pip_available = False
|
||||
|
||||
if pip_available:
|
||||
cmd = [sys.executable, '-mpip', 'list', '--format=freeze']
|
||||
elif os.environ.get("UV") is not None:
|
||||
print("uv is set")
|
||||
cmd = ["uv", "pip", "list", "--format=freeze"]
|
||||
else:
|
||||
raise RuntimeError("Could not collect pip list output (pip or uv module not available)")
|
||||
|
||||
out = run_and_read_all(run_lambda, cmd)
|
||||
return "\n".join(line for line in out.splitlines()
|
||||
if any(name in line for name in patterns))
|
||||
|
||||
pip_version = 'pip3' if sys.version[0] == '3' else 'pip'
|
||||
out = run_with_pip([sys.executable, '-mpip'])
|
||||
|
||||
out = run_with_pip()
|
||||
return pip_version, out
|
||||
|
||||
|
||||
|
||||
@ -12,7 +12,7 @@ static_assert(sizeof(void*) == sizeof(fptr_t));
|
||||
|
||||
fptr_t init_custom_ar(const std::vector<fptr_t>& fake_ipc_ptrs,
|
||||
torch::Tensor& rank_data, int64_t rank,
|
||||
bool full_nvlink) {
|
||||
bool fully_connected) {
|
||||
int world_size = fake_ipc_ptrs.size();
|
||||
if (world_size > 8)
|
||||
throw std::invalid_argument("world size > 8 is not supported");
|
||||
@ -27,7 +27,7 @@ fptr_t init_custom_ar(const std::vector<fptr_t>& fake_ipc_ptrs,
|
||||
}
|
||||
return (fptr_t) new vllm::CustomAllreduce(ipc_ptrs, rank_data.data_ptr(),
|
||||
rank_data.numel(), rank, world_size,
|
||||
full_nvlink);
|
||||
fully_connected);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -142,3 +142,48 @@ void register_graph_buffers(fptr_t _fa,
|
||||
bytes.reserve(handles.size());
|
||||
fa->register_graph_buffers(bytes, offsets);
|
||||
}
|
||||
|
||||
std::tuple<fptr_t, torch::Tensor> allocate_shared_buffer_and_handle(
|
||||
int64_t size) {
|
||||
auto device_index = c10::cuda::current_device();
|
||||
at::DeviceGuard device_guard(at::Device(at::DeviceType::CUDA, device_index));
|
||||
void* buffer;
|
||||
cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
|
||||
auto stream = c10::cuda::getCurrentCUDAStream().stream();
|
||||
AT_CUDA_CHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
||||
|
||||
// Allocate buffer
|
||||
#if defined(USE_ROCM)
|
||||
// data buffers need to be "uncached" for signal on MI200
|
||||
AT_CUDA_CHECK(
|
||||
hipExtMallocWithFlags((void**)&buffer, size, hipDeviceMallocUncached));
|
||||
#else
|
||||
AT_CUDA_CHECK(cudaMalloc((void**)&buffer, size));
|
||||
#endif
|
||||
AT_CUDA_CHECK(cudaMemsetAsync(buffer, 0, size, stream));
|
||||
AT_CUDA_CHECK(cudaStreamSynchronize(stream));
|
||||
AT_CUDA_CHECK(cudaThreadExchangeStreamCaptureMode(&mode));
|
||||
|
||||
// Create IPC memhandle for the allocated buffer.
|
||||
// Will use it in open_mem_handle.
|
||||
auto options =
|
||||
torch::TensorOptions().dtype(torch::kUInt8).device(torch::kCPU);
|
||||
auto handle =
|
||||
torch::empty({static_cast<int64_t>(sizeof(cudaIpcMemHandle_t))}, options);
|
||||
AT_CUDA_CHECK(
|
||||
cudaIpcGetMemHandle((cudaIpcMemHandle_t*)handle.data_ptr(), buffer));
|
||||
|
||||
return std::make_tuple(reinterpret_cast<fptr_t>(buffer), handle);
|
||||
}
|
||||
|
||||
fptr_t open_mem_handle(torch::Tensor& mem_handle) {
|
||||
void* ipc_ptr;
|
||||
AT_CUDA_CHECK(cudaIpcOpenMemHandle(
|
||||
(void**)&ipc_ptr, *((const cudaIpcMemHandle_t*)mem_handle.data_ptr()),
|
||||
cudaIpcMemLazyEnablePeerAccess));
|
||||
return reinterpret_cast<fptr_t>(ipc_ptr);
|
||||
}
|
||||
|
||||
void free_shared_buffer(fptr_t buffer) {
|
||||
AT_CUDA_CHECK(cudaFree(reinterpret_cast<void*>(buffer)));
|
||||
}
|
||||
|
||||
@ -5,6 +5,10 @@
|
||||
#include <cuda_fp16.h>
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
#if defined(USE_ROCM)
|
||||
typedef __hip_bfloat16 nv_bfloat16;
|
||||
#endif
|
||||
|
||||
#include <iostream>
|
||||
#include <array>
|
||||
#include <limits>
|
||||
@ -12,6 +16,7 @@
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
namespace vllm {
|
||||
#define CUDACHECK(cmd) \
|
||||
do { \
|
||||
cudaError_t e = cmd; \
|
||||
@ -22,24 +27,37 @@
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
namespace vllm {
|
||||
|
||||
// Maximal number of blocks in allreduce kernel.
|
||||
constexpr int kMaxBlocks = 36;
|
||||
|
||||
// Default number of blocks in allreduce kernel.
|
||||
#ifndef USE_ROCM
|
||||
const int defaultBlockLimit = 36;
|
||||
CUpointer_attribute rangeStartAddrAttr = CU_POINTER_ATTRIBUTE_RANGE_START_ADDR;
|
||||
#else
|
||||
const int defaultBlockLimit = 16;
|
||||
hipPointer_attribute rangeStartAddrAttr =
|
||||
HIP_POINTER_ATTRIBUTE_RANGE_START_ADDR;
|
||||
#endif
|
||||
|
||||
// Counter may overflow, but it's fine since unsigned int overflow is
|
||||
// well-defined behavior.
|
||||
using FlagType = uint32_t;
|
||||
|
||||
// Two sets of peer counters are needed for two syncs: starting and ending an
|
||||
// operation. The reason is that it's possible for peer GPU block to arrive at
|
||||
// the second sync point while the current GPU block haven't passed the first
|
||||
// sync point. Thus, peer GPU may write counter+1 while current GPU is busy
|
||||
// waiting for counter. We use alternating counter array to avoid this
|
||||
// possibility.
|
||||
struct Signal {
|
||||
alignas(128) FlagType self_counter[kMaxBlocks][8];
|
||||
// Two sets of peer counters are needed for two syncs. The reason is that
|
||||
// it's possible for peer GPU block to arrive at the second sync point while
|
||||
// the current GPU block haven't passed the first sync point. Thus, peer GPU
|
||||
// may write counter+1 while current GPU is busy waiting for counter. We use
|
||||
// alternating counter array to avoid this possibility.
|
||||
alignas(128) FlagType peer_counter[2][kMaxBlocks][8];
|
||||
alignas(128) FlagType start[kMaxBlocks][8];
|
||||
alignas(128) FlagType end[kMaxBlocks][8];
|
||||
alignas(128) FlagType _flag[kMaxBlocks]; // incremental flags for each rank
|
||||
};
|
||||
|
||||
struct __align__(16) RankData {
|
||||
const void* __restrict__ ptrs[8];
|
||||
const void* ptrs[8];
|
||||
};
|
||||
|
||||
struct __align__(16) RankSignals {
|
||||
@ -134,27 +152,29 @@ DINLINE O downcast(array_t<float, O::size> val) {
|
||||
}
|
||||
}
|
||||
|
||||
#if !defined(USE_ROCM)
|
||||
|
||||
static DINLINE void st_flag_release(FlagType* flag_addr, FlagType flag) {
|
||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
|
||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
|
||||
asm volatile("st.release.sys.global.u32 [%1], %0;" ::"r"(flag),
|
||||
"l"(flag_addr));
|
||||
#else
|
||||
#else
|
||||
asm volatile("membar.sys; st.volatile.global.u32 [%1], %0;" ::"r"(flag),
|
||||
"l"(flag_addr));
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
static DINLINE FlagType ld_flag_acquire(FlagType* flag_addr) {
|
||||
FlagType flag;
|
||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
|
||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
|
||||
asm volatile("ld.acquire.sys.global.u32 %0, [%1];"
|
||||
: "=r"(flag)
|
||||
: "l"(flag_addr));
|
||||
#else
|
||||
#else
|
||||
asm volatile("ld.volatile.global.u32 %0, [%1]; membar.gl;"
|
||||
: "=r"(flag)
|
||||
: "l"(flag_addr));
|
||||
#endif
|
||||
#endif
|
||||
return flag;
|
||||
}
|
||||
|
||||
@ -170,37 +190,99 @@ static DINLINE FlagType ld_flag_volatile(FlagType* flag_addr) {
|
||||
return flag;
|
||||
}
|
||||
|
||||
// is_start: whether this is the very first synchronization barrier.
|
||||
// need_fence: whether a memory fence is needed. If true, a release-acquire
|
||||
// semantic is used to enforce memory access order before and after this
|
||||
// barrier.
|
||||
template <int ngpus, bool is_start, bool need_fence = false>
|
||||
DINLINE void multi_gpu_barrier(const RankSignals& sg, Signal* self_sg,
|
||||
int rank) {
|
||||
if constexpr (!is_start) __syncthreads();
|
||||
static_assert(
|
||||
!(is_start && need_fence)); // Start barrier shouldn't need fence.
|
||||
// This function is meant to be used as the first synchronization in the all
|
||||
// reduce kernel. Thus, it doesn't need to make any visibility guarantees for
|
||||
// prior memory accesses. Note: volatile writes will not be reordered against
|
||||
// other volatile writes.
|
||||
template <int ngpus>
|
||||
DINLINE void barrier_at_start(const RankSignals& sg, Signal* self_sg,
|
||||
int rank) {
|
||||
uint32_t flag = self_sg->_flag[blockIdx.x] + 1;
|
||||
if (threadIdx.x < ngpus) {
|
||||
// Increment the counter. Technically we only need one counter, but we use
|
||||
// multiple per block to eliminate the need to share the counter via smem.
|
||||
auto val = self_sg->self_counter[blockIdx.x][threadIdx.x] += 1;
|
||||
auto peer_counter_ptr = &sg.signals[threadIdx.x]->start[blockIdx.x][rank];
|
||||
auto self_counter_ptr = &self_sg->start[blockIdx.x][threadIdx.x];
|
||||
// Write the expected counter value to peer and wait for correct value
|
||||
// from peer.
|
||||
st_flag_volatile(peer_counter_ptr, flag);
|
||||
while (ld_flag_volatile(self_counter_ptr) != flag);
|
||||
}
|
||||
__syncthreads();
|
||||
// use one thread to update flag
|
||||
if (threadIdx.x == 0) self_sg->_flag[blockIdx.x] = flag;
|
||||
}
|
||||
|
||||
// This function is meant to be used as the second or the final
|
||||
// synchronization barrier in the all reduce kernel. If it's the final
|
||||
// synchronization barrier, we don't need to make any visibility guarantees
|
||||
// for prior memory accesses.
|
||||
template <int ngpus, bool final_sync = false>
|
||||
DINLINE void barrier_at_end(const RankSignals& sg, Signal* self_sg, int rank) {
|
||||
__syncthreads();
|
||||
uint32_t flag = self_sg->_flag[blockIdx.x] + 1;
|
||||
if (threadIdx.x < ngpus) {
|
||||
auto peer_counter_ptr = &sg.signals[threadIdx.x]->end[blockIdx.x][rank];
|
||||
auto self_counter_ptr = &self_sg->end[blockIdx.x][threadIdx.x];
|
||||
// Write the expected counter value to peer and wait for correct value from
|
||||
// peer.
|
||||
auto peer_counter_ptr =
|
||||
&sg.signals[threadIdx.x]->peer_counter[val % 2][blockIdx.x][rank];
|
||||
auto self_counter_ptr =
|
||||
&self_sg->peer_counter[val % 2][blockIdx.x][threadIdx.x];
|
||||
if constexpr (need_fence) {
|
||||
st_flag_release(peer_counter_ptr, val);
|
||||
while (ld_flag_acquire(self_counter_ptr) != val);
|
||||
if constexpr (!final_sync) {
|
||||
st_flag_release(peer_counter_ptr, flag);
|
||||
while (ld_flag_acquire(self_counter_ptr) != flag);
|
||||
} else {
|
||||
st_flag_volatile(peer_counter_ptr, val);
|
||||
while (ld_flag_volatile(self_counter_ptr) != val);
|
||||
st_flag_volatile(peer_counter_ptr, flag);
|
||||
while (ld_flag_volatile(self_counter_ptr) != flag);
|
||||
}
|
||||
}
|
||||
if constexpr (is_start || need_fence) __syncthreads();
|
||||
if constexpr (!final_sync) __syncthreads();
|
||||
|
||||
// use one thread to update flag
|
||||
if (threadIdx.x == 0) self_sg->_flag[blockIdx.x] = flag;
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
template <int ngpus>
|
||||
DINLINE void barrier_at_start(const RankSignals& sg, Signal* self_sg,
|
||||
int rank) {
|
||||
uint32_t flag = self_sg->_flag[blockIdx.x] + 1;
|
||||
if (threadIdx.x < ngpus) {
|
||||
// simultaneously write to the corresponding flag of all ranks.
|
||||
// Latency = 1 p2p write
|
||||
__scoped_atomic_store_n(&sg.signals[threadIdx.x]->start[blockIdx.x][rank],
|
||||
flag, __ATOMIC_RELAXED, __MEMORY_SCOPE_SYSTEM);
|
||||
// wait until we got true from all ranks
|
||||
while (__scoped_atomic_load_n(&self_sg->start[blockIdx.x][threadIdx.x],
|
||||
__ATOMIC_RELAXED,
|
||||
__MEMORY_SCOPE_DEVICE) < flag);
|
||||
}
|
||||
__syncthreads();
|
||||
// use one thread to update flag
|
||||
if (threadIdx.x == 0) self_sg->_flag[blockIdx.x] = flag;
|
||||
}
|
||||
|
||||
template <int ngpus, bool final_sync = false>
|
||||
DINLINE void barrier_at_end(const RankSignals& sg, Signal* self_sg, int rank) {
|
||||
__syncthreads();
|
||||
uint32_t flag = self_sg->_flag[blockIdx.x] + 1;
|
||||
if (threadIdx.x < ngpus) {
|
||||
// simultaneously write to the corresponding flag of all ranks.
|
||||
// Latency = 1 p2p write
|
||||
__scoped_atomic_store_n(&sg.signals[threadIdx.x]->end[blockIdx.x][rank],
|
||||
flag,
|
||||
final_sync ? __ATOMIC_RELAXED : __ATOMIC_RELEASE,
|
||||
__MEMORY_SCOPE_SYSTEM);
|
||||
// wait until we got true from all ranks
|
||||
while (
|
||||
__scoped_atomic_load_n(&self_sg->end[blockIdx.x][threadIdx.x],
|
||||
final_sync ? __ATOMIC_RELAXED : __ATOMIC_ACQUIRE,
|
||||
__MEMORY_SCOPE_DEVICE) < flag);
|
||||
}
|
||||
if constexpr (!final_sync) __syncthreads();
|
||||
// use one thread to update flag
|
||||
if (threadIdx.x == 0) self_sg->_flag[blockIdx.x] = flag;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
template <typename P, int ngpus, typename A>
|
||||
DINLINE P packed_reduce(const P* ptrs[], int idx) {
|
||||
A tmp = upcast(ptrs[0][idx]);
|
||||
@ -220,13 +302,13 @@ __global__ void __launch_bounds__(512, 1)
|
||||
// note: we don't reorder the address so the accumulation order is the same
|
||||
// for all ranks, ensuring bitwise identical results
|
||||
auto dp = *_dp;
|
||||
multi_gpu_barrier<ngpus, true>(sg, self_sg, rank);
|
||||
barrier_at_start<ngpus>(sg, self_sg, rank);
|
||||
// do the actual reduction
|
||||
for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size;
|
||||
idx += gridDim.x * blockDim.x) {
|
||||
((P*)result)[idx] = packed_reduce<P, ngpus, A>((const P**)&dp.ptrs[0], idx);
|
||||
}
|
||||
multi_gpu_barrier<ngpus, false>(sg, self_sg, rank);
|
||||
barrier_at_end<ngpus, true>(sg, self_sg, rank);
|
||||
}
|
||||
|
||||
template <typename P>
|
||||
@ -255,18 +337,20 @@ __global__ void __launch_bounds__(512, 1)
|
||||
tmps[i] = get_tmp_buf<P>(sg.signals[target]);
|
||||
}
|
||||
auto tmp_out = tmps[0];
|
||||
multi_gpu_barrier<ngpus, true>(sg, self_sg, rank);
|
||||
barrier_at_start<ngpus>(sg, self_sg, rank);
|
||||
|
||||
// stage 1: reduce scatter
|
||||
for (int idx = start + tid; idx < end; idx += stride) {
|
||||
tmp_out[idx - start] = packed_reduce<P, ngpus, A>(ptrs, idx);
|
||||
}
|
||||
multi_gpu_barrier<ngpus, false, true>(sg, self_sg, rank);
|
||||
barrier_at_end<ngpus>(sg, self_sg, rank);
|
||||
|
||||
// stage 2: allgather. Note: it's important to match the tid between
|
||||
// the two stages, because visibility across devices is only guaranteed
|
||||
// between threads that have the same tid. If thread i computes the sum of
|
||||
// start + i in the first stage, then thread i also gathers start + i from all
|
||||
// ranks.
|
||||
// start + i in the first stage, then thread i also gathers start + i from
|
||||
// all ranks.
|
||||
|
||||
for (int idx = tid; idx < largest_part; idx += stride) {
|
||||
#pragma unroll
|
||||
for (int i = 0; i < ngpus; i++) {
|
||||
@ -287,21 +371,22 @@ class CustomAllreduce {
|
||||
public:
|
||||
int rank_;
|
||||
int world_size_;
|
||||
bool full_nvlink_;
|
||||
// Full NVLink or xGMI connection between GPUs.
|
||||
bool fully_connected_;
|
||||
|
||||
RankSignals sg_;
|
||||
// Stores an map from a pointer to its peer pointters from all ranks.
|
||||
// Stores an map from a pointer to its peer pointers from all ranks.
|
||||
std::unordered_map<void*, RankData*> buffers_;
|
||||
Signal* self_sg_;
|
||||
|
||||
// Stores rank data from all ranks. This is mainly for cuda graph purposes.
|
||||
// For cuda graph to work, all kernel arguments must be fixed during graph
|
||||
// capture time. However, the peer pointers are not known during graph capture
|
||||
// time. Therefore, during capture, we increment the rank data pointer and use
|
||||
// that as the argument to the kernel. The kernel arguments are stored in
|
||||
// graph_unreg_buffers_. The actual peer pointers will be filled in at the
|
||||
// memory pointed to by the pointers in graph_unreg_buffers_ when
|
||||
// the IPC handles are exchanged between ranks.
|
||||
// capture time. However, the peer pointers are not known during graph
|
||||
// capture time. Therefore, during capture, we increment the rank data
|
||||
// pointer and use that as the argument to the kernel. The kernel arguments
|
||||
// are stored in graph_unreg_buffers_. The actual peer pointers will be
|
||||
// filled in at the memory pointed to by the pointers in
|
||||
// graph_unreg_buffers_ when the IPC handles are exchanged between ranks.
|
||||
//
|
||||
// The overall process looks like this:
|
||||
// 1. Graph capture.
|
||||
@ -319,17 +404,18 @@ class CustomAllreduce {
|
||||
* Signals are an array of ipc-enabled buffers from all ranks.
|
||||
* For each of the buffer, the layout is as follows:
|
||||
* | -- sizeof(Signal) -- | ------ a few MB ----- |
|
||||
* The first section is for allreduce synchronization, and the second section
|
||||
* is for storing the intermediate results required by some allreduce algos.
|
||||
* The first section is for allreduce synchronization, and the second
|
||||
* section is for storing the intermediate results required by some
|
||||
* allreduce algos.
|
||||
*
|
||||
* Note: this class does not own any device memory. Any required buffers
|
||||
* are passed in from the constructor.
|
||||
*/
|
||||
CustomAllreduce(Signal** signals, void* rank_data, size_t rank_data_sz,
|
||||
int rank, int world_size, bool full_nvlink = true)
|
||||
int rank, int world_size, bool fully_connected = true)
|
||||
: rank_(rank),
|
||||
world_size_(world_size),
|
||||
full_nvlink_(full_nvlink),
|
||||
fully_connected_(fully_connected),
|
||||
self_sg_(signals[rank]),
|
||||
d_rank_data_base_(reinterpret_cast<RankData*>(rank_data)),
|
||||
d_rank_data_end_(d_rank_data_base_ + rank_data_sz / sizeof(RankData)) {
|
||||
@ -361,8 +447,7 @@ class CustomAllreduce {
|
||||
void* base_ptr;
|
||||
// note: must share the base address of each allocation, or we get wrong
|
||||
// address
|
||||
if (cuPointerGetAttribute(&base_ptr,
|
||||
CU_POINTER_ATTRIBUTE_RANGE_START_ADDR,
|
||||
if (cuPointerGetAttribute(&base_ptr, rangeStartAddrAttr,
|
||||
(CUdeviceptr)ptr) != CUDA_SUCCESS)
|
||||
throw std::runtime_error("failed to get pointer attr");
|
||||
CUDACHECK(cudaIpcGetMemHandle(
|
||||
@ -396,11 +481,11 @@ class CustomAllreduce {
|
||||
|
||||
// Note: when registering graph buffers, we intentionally choose to not
|
||||
// deduplicate the addresses. That means if the allocator reuses some
|
||||
// addresses, they will be registered again. This is to account for the remote
|
||||
// possibility of different allocation patterns between ranks. For example,
|
||||
// rank 1 may get the same input address for the second allreduce, but rank 2
|
||||
// got a different address. IPC handles have internal reference counting
|
||||
// mechanism so overhead should be small.
|
||||
// addresses, they will be registered again. This is to account for the
|
||||
// remote possibility of different allocation patterns between ranks. For
|
||||
// example, rank 1 may get the same input address for the second allreduce,
|
||||
// but rank 2 got a different address. IPC handles have internal reference
|
||||
// counting mechanism so overhead should be small.
|
||||
void register_graph_buffers(
|
||||
const std::vector<std::string>& handles,
|
||||
const std::vector<std::vector<int64_t>>& offsets) {
|
||||
@ -431,15 +516,15 @@ class CustomAllreduce {
|
||||
/**
|
||||
* Performs allreduce, assuming input has already been registered.
|
||||
*
|
||||
* Block and grid default configs are results after careful grid search. Using
|
||||
* 36 blocks give the best or close to the best runtime on the devices I
|
||||
* tried: A100, A10, A30, T4, V100. You'll notice that NCCL kernels also only
|
||||
* take a small amount of SMs. Not quite sure the underlying reason, but my
|
||||
* guess is that too many SMs will cause contention on NVLink bus.
|
||||
* Block and grid default configs are results after careful grid search.
|
||||
* Using 36 blocks give the best or close to the best runtime on the devices
|
||||
* I tried: A100, A10, A30, T4, V100. You'll notice that NCCL kernels also
|
||||
* only take a small amount of SMs. Not quite sure the underlying reason,
|
||||
* but my guess is that too many SMs will cause contention on NVLink bus.
|
||||
*/
|
||||
template <typename T>
|
||||
void allreduce(cudaStream_t stream, T* input, T* output, int size,
|
||||
int threads = 512, int block_limit = 36) {
|
||||
int threads = 512, int block_limit = defaultBlockLimit) {
|
||||
auto d = packed_t<T>::P::size;
|
||||
if (size % d != 0)
|
||||
throw std::runtime_error(
|
||||
@ -473,13 +558,11 @@ class CustomAllreduce {
|
||||
#define KL(ngpus, name) \
|
||||
name<T, ngpus><<<blocks, threads, 0, stream>>>(ptrs, sg_, self_sg_, output, \
|
||||
rank_, size);
|
||||
// TODO(hanzhi713): Threshold is different for A100 and H100.
|
||||
// Add per device threshold.
|
||||
#define REDUCE_CASE(ngpus) \
|
||||
case ngpus: { \
|
||||
if (world_size_ == 2) { \
|
||||
KL(ngpus, cross_device_reduce_1stage); \
|
||||
} else if (full_nvlink_) { \
|
||||
} else if (fully_connected_) { \
|
||||
if ((world_size_ <= 4 && bytes < 512 * 1024) || \
|
||||
(world_size_ <= 8 && bytes < 256 * 1024)) { \
|
||||
KL(ngpus, cross_device_reduce_1stage); \
|
||||
@ -497,7 +580,8 @@ class CustomAllreduce {
|
||||
REDUCE_CASE(8)
|
||||
default:
|
||||
throw std::runtime_error(
|
||||
"custom allreduce only supports num gpus in (2,4,6,8). Actual num "
|
||||
"custom allreduce only supports num gpus in (2,4,6,8). Actual "
|
||||
"num "
|
||||
"gpus = " +
|
||||
std::to_string(world_size_));
|
||||
}
|
||||
@ -511,10 +595,11 @@ class CustomAllreduce {
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* To inspect PTX/SASS, copy paste this header file to compiler explorer and add
|
||||
a template instantiation:
|
||||
* To inspect PTX/SASS, copy paste this header file to compiler explorer and
|
||||
add a template instantiation:
|
||||
* template void vllm::CustomAllreduce::allreduce<half>(cudaStream_t, half *,
|
||||
half *, int, int, int);
|
||||
*/
|
||||
} // namespace vllm
|
||||
} // namespace vllm
|
||||
@ -1,9 +1,9 @@
|
||||
/**
|
||||
* This is a standalone test for custom allreduce.
|
||||
* To compile, make sure you have MPI and NCCL installed in your system.
|
||||
* export MPI_HOME=xxx
|
||||
* export MPI_HOME=XXX
|
||||
* nvcc -O2 -arch=native -std=c++17 custom_all_reduce_test.cu -o
|
||||
* custom_all_reduce_test -lnccl -I${MPI_HOME} -lmpi
|
||||
* custom_all_reduce_test -lnccl -I${MPI_HOME}/include -lmpi
|
||||
*
|
||||
* Warning: this C++ test is not designed to be very readable and was used
|
||||
* during the rapid prototyping process.
|
||||
@ -22,7 +22,15 @@
|
||||
#include "cuda_profiler_api.h"
|
||||
#include "custom_all_reduce.cuh"
|
||||
#include "mpi.h"
|
||||
#include "nccl.h"
|
||||
#ifdef USE_ROCM
|
||||
#include <hip/hip_bf16.h>
|
||||
typedef __hip_bfloat16 nv_bfloat16;
|
||||
#include "rccl/rccl.h"
|
||||
#include "custom_all_reduce_hip.cuh"
|
||||
#else
|
||||
#include "nccl.h"
|
||||
#include "custom_all_reduce.cuh"
|
||||
#endif
|
||||
|
||||
#define MPICHECK(cmd) \
|
||||
do { \
|
||||
@ -43,16 +51,29 @@
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#ifdef USE_ROCM
|
||||
__global__ void dummy_kernel() {
|
||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
|
||||
for (int i = 0; i < 100; i++) {
|
||||
uint64_t start = wall_clock64();
|
||||
uint64_t cycles_elapsed;
|
||||
do {
|
||||
cycles_elapsed = wall_clock64() - start;
|
||||
} while (cycles_elapsed < 100);
|
||||
}
|
||||
for (int i = 0; i < 100; i++) __nanosleep(1000000); // 100ms
|
||||
}
|
||||
#else
|
||||
__global__ void dummy_kernel() {
|
||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
|
||||
for (int i = 0; i < 100; i++) __nanosleep(1000000); // 100ms
|
||||
#else
|
||||
for (int i = 0; i < 100; i++) {
|
||||
long long int start = clock64();
|
||||
while (clock64() - start < 150000000); // approximately 98.4ms on P40
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
template <typename T>
|
||||
__global__ void set_data(T* data, int size, int myRank) {
|
||||
@ -121,8 +142,14 @@ void run(int myRank, int nRanks, ncclComm_t& comm, int threads, int block_limit,
|
||||
* registration, they are allocated and registered together in the test for
|
||||
* convenience.
|
||||
*/
|
||||
#ifdef USE_ROCM
|
||||
CUDACHECK(hipExtMallocWithFlags(
|
||||
(void**)&buffer, 2 * data_size * sizeof(T) + sizeof(vllm::Signal),
|
||||
hipDeviceMallocUncached));
|
||||
#else
|
||||
CUDACHECK(
|
||||
cudaMalloc(&buffer, 2 * data_size * sizeof(T) + sizeof(vllm::Signal)));
|
||||
#endif
|
||||
CUDACHECK(
|
||||
cudaMemset(buffer, 0, 2 * data_size * sizeof(T) + sizeof(vllm::Signal)));
|
||||
CUDACHECK(cudaMalloc(&self_data_copy, data_size * sizeof(T)));
|
||||
@ -311,13 +338,18 @@ int main(int argc, char** argv) {
|
||||
|
||||
bool performance_test = true;
|
||||
cudaProfilerStart();
|
||||
// Uncomment to scan through different block size configs.
|
||||
// for (int threads : {256, 512, 1024}) {
|
||||
// for (int block_limit = 16; block_limit < 112; block_limit += 4) {
|
||||
// run<half>(myRank, nRanks, comm, threads, block_limit, 1024 * 1024,
|
||||
// performance_test);
|
||||
// }
|
||||
// }
|
||||
// Uncomment to scan through different block size configs.
|
||||
// for (int threads : {256, 512, 1024}) {
|
||||
// for (int block_limit = 16; block_limit < 112; block_limit += 4) {
|
||||
// run<half>(myRank, nRanks, comm, threads, block_limit, 1024 * 1024,
|
||||
// performance_test);
|
||||
// }
|
||||
// }
|
||||
#ifdef USE_ROCM
|
||||
const int block_limit = 16;
|
||||
#else
|
||||
const int block_limit = 36;
|
||||
#endif
|
||||
// Scan through different sizes to test performance.
|
||||
for (int sz = 512; sz <= (8 << 20); sz *= 2) {
|
||||
run<half>(myRank, nRanks, comm, 512, 36, sz + 8 * 47, performance_test);
|
||||
@ -326,4 +358,4 @@ int main(int argc, char** argv) {
|
||||
cudaProfilerStop();
|
||||
MPICHECK(MPI_Finalize());
|
||||
return EXIT_SUCCESS;
|
||||
}
|
||||
}
|
||||
@ -267,10 +267,10 @@ void causal_conv1d_fwd(const at::Tensor& x, const at::Tensor& weight,
|
||||
const std::optional<at::Tensor>& has_initial_state,
|
||||
bool silu_activation, int64_t pad_slot_id);
|
||||
|
||||
#ifndef USE_ROCM
|
||||
using fptr_t = int64_t;
|
||||
fptr_t init_custom_ar(const std::vector<int64_t>& fake_ipc_ptrs,
|
||||
torch::Tensor& rank_data, int64_t rank, bool full_nvlink);
|
||||
torch::Tensor& rank_data, int64_t rank,
|
||||
bool fully_connected);
|
||||
void all_reduce(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out,
|
||||
fptr_t reg_buffer, int64_t reg_buffer_sz_bytes);
|
||||
void dispose(fptr_t _fa);
|
||||
@ -281,4 +281,7 @@ get_graph_buffer_ipc_meta(fptr_t _fa);
|
||||
void register_graph_buffers(fptr_t _fa,
|
||||
const std::vector<std::vector<int64_t>>& handles,
|
||||
const std::vector<std::vector<int64_t>>& offsets);
|
||||
#endif
|
||||
std::tuple<int64_t, torch::Tensor> allocate_shared_buffer_and_handle(
|
||||
int64_t size);
|
||||
int64_t open_mem_handle(torch::Tensor& mem_handle);
|
||||
void free_shared_buffer(int64_t buffer);
|
||||
|
||||
@ -614,12 +614,11 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cuda_utils), cuda_utils) {
|
||||
&get_max_shared_memory_per_block_device_attribute);
|
||||
}
|
||||
|
||||
#ifndef USE_ROCM
|
||||
TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _custom_ar), custom_ar) {
|
||||
// Custom all-reduce kernels
|
||||
custom_ar.def(
|
||||
"init_custom_ar(int[] ipc_tensors, Tensor rank_data, "
|
||||
"int rank, bool full_nvlink) -> int");
|
||||
"int rank, bool fully_connected) -> int");
|
||||
custom_ar.impl("init_custom_ar", torch::kCUDA, &init_custom_ar);
|
||||
custom_ar.def(
|
||||
"all_reduce(int fa, Tensor inp, Tensor! out, int reg_buffer, "
|
||||
@ -632,7 +631,13 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _custom_ar), custom_ar) {
|
||||
custom_ar.def("register_buffer", ®ister_buffer);
|
||||
custom_ar.def("get_graph_buffer_ipc_meta", &get_graph_buffer_ipc_meta);
|
||||
custom_ar.def("register_graph_buffers", ®ister_graph_buffers);
|
||||
|
||||
custom_ar.def("allocate_shared_buffer_and_handle",
|
||||
&allocate_shared_buffer_and_handle);
|
||||
custom_ar.def("open_mem_handle(Tensor mem_handle) -> int", &open_mem_handle);
|
||||
custom_ar.impl("open_mem_handle", torch::kCPU, &open_mem_handle);
|
||||
|
||||
custom_ar.def("free_shared_buffer", &free_shared_buffer);
|
||||
}
|
||||
#endif
|
||||
|
||||
REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
|
||||
|
||||
@ -1,18 +1,18 @@
|
||||
ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:6.3.1-complete
|
||||
ARG HIPBLASLT_BRANCH="4d40e36"
|
||||
ARG HIPBLASLT_BRANCH="db8e93b4"
|
||||
ARG HIPBLAS_COMMON_BRANCH="7c1566b"
|
||||
ARG LEGACY_HIPBLASLT_OPTION=
|
||||
ARG RCCL_BRANCH="648a58d"
|
||||
ARG RCCL_REPO="https://github.com/ROCm/rccl"
|
||||
ARG TRITON_BRANCH="e5be006"
|
||||
ARG TRITON_REPO="https://github.com/triton-lang/triton.git"
|
||||
ARG PYTORCH_BRANCH="3a585126"
|
||||
ARG PYTORCH_VISION_BRANCH="v0.19.1"
|
||||
ARG PYTORCH_BRANCH="295f2ed4"
|
||||
ARG PYTORCH_VISION_BRANCH="v0.21.0"
|
||||
ARG PYTORCH_REPO="https://github.com/pytorch/pytorch.git"
|
||||
ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
|
||||
ARG FA_BRANCH="b7d29fb"
|
||||
ARG FA_REPO="https://github.com/ROCm/flash-attention.git"
|
||||
ARG AITER_BRANCH="21d47a9"
|
||||
ARG FA_BRANCH="1a7f4dfa"
|
||||
ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git"
|
||||
ARG AITER_BRANCH="8970b25b"
|
||||
ARG AITER_REPO="https://github.com/ROCm/aiter.git"
|
||||
|
||||
FROM ${BASE_IMAGE} AS base
|
||||
@ -20,7 +20,7 @@ FROM ${BASE_IMAGE} AS base
|
||||
ENV PATH=/opt/rocm/llvm/bin:$PATH
|
||||
ENV ROCM_PATH=/opt/rocm
|
||||
ENV LD_LIBRARY_PATH=/opt/rocm/lib:/usr/local/lib:
|
||||
ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942
|
||||
ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942;gfx1100;gfx1101;gfx1200;gfx1201
|
||||
ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}
|
||||
|
||||
ARG PYTHON_VERSION=3.12
|
||||
@ -31,7 +31,7 @@ ENV DEBIAN_FRONTEND=noninteractive
|
||||
|
||||
# Install Python and other dependencies
|
||||
RUN apt-get update -y \
|
||||
&& apt-get install -y software-properties-common git curl sudo vim less \
|
||||
&& apt-get install -y software-properties-common git curl sudo vim less libgfortran5 \
|
||||
&& add-apt-repository ppa:deadsnakes/ppa \
|
||||
&& apt-get update -y \
|
||||
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
|
||||
@ -42,7 +42,7 @@ RUN apt-get update -y \
|
||||
&& curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
|
||||
&& python3 --version && python3 -m pip --version
|
||||
|
||||
RUN pip install -U packaging cmake ninja wheel setuptools pybind11 Cython
|
||||
RUN pip install -U packaging 'cmake<4' ninja wheel setuptools pybind11 Cython
|
||||
|
||||
FROM base AS build_hipblaslt
|
||||
ARG HIPBLASLT_BRANCH
|
||||
@ -60,7 +60,8 @@ RUN cd hipBLAS-common \
|
||||
RUN git clone https://github.com/ROCm/hipBLASLt
|
||||
RUN cd hipBLASLt \
|
||||
&& git checkout ${HIPBLASLT_BRANCH} \
|
||||
&& ./install.sh -d --architecture ${PYTORCH_ROCM_ARCH} ${LEGACY_HIPBLASLT_OPTION} \
|
||||
&& apt-get install -y llvm-dev \
|
||||
&& ./install.sh -dc --architecture ${PYTORCH_ROCM_ARCH} ${LEGACY_HIPBLASLT_OPTION} \
|
||||
&& cd build/release \
|
||||
&& make package
|
||||
RUN mkdir -p /app/install && cp /app/hipBLASLt/build/release/*.deb /app/hipBLAS-common/build/*.deb /app/install
|
||||
@ -110,11 +111,24 @@ RUN git clone ${FA_REPO}
|
||||
RUN cd flash-attention \
|
||||
&& git checkout ${FA_BRANCH} \
|
||||
&& git submodule update --init \
|
||||
&& MAX_JOBS=64 GPU_ARCHS=${PYTORCH_ROCM_ARCH} python3 setup.py bdist_wheel --dist-dir=dist
|
||||
&& GPU_ARCHS=$(echo ${PYTORCH_ROCM_ARCH} | sed -e 's/;gfx1[0-9]\{3\}//g') python3 setup.py bdist_wheel --dist-dir=dist
|
||||
RUN mkdir -p /app/install && cp /app/pytorch/dist/*.whl /app/install \
|
||||
&& cp /app/vision/dist/*.whl /app/install \
|
||||
&& cp /app/flash-attention/dist/*.whl /app/install
|
||||
|
||||
FROM base AS build_aiter
|
||||
ARG AITER_BRANCH
|
||||
ARG AITER_REPO
|
||||
RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
|
||||
pip install /install/*.whl
|
||||
RUN git clone --recursive ${AITER_REPO}
|
||||
RUN cd aiter \
|
||||
&& git checkout ${AITER_BRANCH} \
|
||||
&& git submodule update --init --recursive \
|
||||
&& pip install -r requirements.txt
|
||||
RUN pip install pyyaml && cd aiter && PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py bdist_wheel --dist-dir=dist && ls /app/aiter/dist/*.whl
|
||||
RUN mkdir -p /app/install && cp /app/aiter/dist/*.whl /app/install
|
||||
|
||||
FROM base AS final
|
||||
RUN --mount=type=bind,from=build_hipblaslt,src=/app/install/,target=/install \
|
||||
dpkg -i /install/*deb \
|
||||
@ -130,19 +144,12 @@ RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \
|
||||
pip install /install/*.whl
|
||||
RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
|
||||
pip install /install/*.whl
|
||||
|
||||
ARG AITER_REPO
|
||||
ARG AITER_BRANCH
|
||||
RUN git clone --recursive ${AITER_REPO}
|
||||
RUN cd aiter \
|
||||
&& git checkout ${AITER_BRANCH} \
|
||||
&& git submodule update --init --recursive \
|
||||
&& pip install -r requirements.txt \
|
||||
&& PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py develop && pip show aiter
|
||||
RUN --mount=type=bind,from=build_aiter,src=/app/install/,target=/install \
|
||||
pip install /install/*.whl
|
||||
|
||||
ARG BASE_IMAGE
|
||||
ARG HIPBLASLT_BRANCH
|
||||
ARG HIPBLAS_COMMON_BRANCH
|
||||
ARG HIPBLASLT_BRANCH
|
||||
ARG LEGACY_HIPBLASLT_OPTION
|
||||
ARG RCCL_BRANCH
|
||||
ARG RCCL_REPO
|
||||
@ -154,6 +161,8 @@ ARG PYTORCH_REPO
|
||||
ARG PYTORCH_VISION_REPO
|
||||
ARG FA_BRANCH
|
||||
ARG FA_REPO
|
||||
ARG AITER_BRANCH
|
||||
ARG AITER_REPO
|
||||
RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
|
||||
&& echo "HIPBLAS_COMMON_BRANCH: ${HIPBLAS_COMMON_BRANCH}" >> /app/versions.txt \
|
||||
&& echo "HIPBLASLT_BRANCH: ${HIPBLASLT_BRANCH}" >> /app/versions.txt \
|
||||
@ -167,6 +176,5 @@ RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
|
||||
&& echo "PYTORCH_REPO: ${PYTORCH_REPO}" >> /app/versions.txt \
|
||||
&& echo "PYTORCH_VISION_REPO: ${PYTORCH_VISION_REPO}" >> /app/versions.txt \
|
||||
&& echo "FA_BRANCH: ${FA_BRANCH}" >> /app/versions.txt \
|
||||
&& echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt \
|
||||
&& echo "AITER_BRANCH: ${AITER_BRANCH}" >> /app/versions.txt \
|
||||
&& echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt
|
||||
@ -1,6 +1,6 @@
|
||||
# Dockerfile
|
||||
|
||||
We provide a <gh-file:Dockerfile> to construct the image for running an OpenAI compatible server with vLLM.
|
||||
We provide a <gh-file:docker/Dockerfile> to construct the image for running an OpenAI compatible server with vLLM.
|
||||
More information about deploying with Docker can be found [here](#deployment-docker).
|
||||
|
||||
Below is a visual representation of the multi-stage Dockerfile. The build graph contains the following nodes:
|
||||
@ -28,7 +28,7 @@ The edges of the build graph represent:
|
||||
> Commands to regenerate the build graph (make sure to run it **from the \`root\` directory of the vLLM repository** where the dockerfile is present):
|
||||
>
|
||||
> ```bash
|
||||
> dockerfilegraph -o png --legend --dpi 200 --max-label-length 50 --filename Dockerfile
|
||||
> dockerfilegraph -o png --legend --dpi 200 --max-label-length 50 --filename docker/Dockerfile
|
||||
> ```
|
||||
>
|
||||
> or in case you want to run it directly with the docker image:
|
||||
@ -43,7 +43,7 @@ The edges of the build graph represent:
|
||||
> --output png \
|
||||
> --dpi 200 \
|
||||
> --max-label-length 50 \
|
||||
> --filename Dockerfile \
|
||||
> --filename docker/Dockerfile \
|
||||
> --legend
|
||||
> ```
|
||||
>
|
||||
|
||||
@ -45,7 +45,7 @@ pytest tests/
|
||||
```
|
||||
|
||||
:::{tip}
|
||||
Since the <gh-file:Dockerfile> ships with Python 3.12, all tests in CI (except `mypy`) are run with Python 3.12.
|
||||
Since the <gh-file:docker/Dockerfile> ships with Python 3.12, all tests in CI (except `mypy`) are run with Python 3.12.
|
||||
|
||||
Therefore, we recommend developing with Python 3.12 to minimise the chance of your local environment clashing with our CI environment.
|
||||
:::
|
||||
|
||||
@ -61,11 +61,11 @@ RUN uv pip install --system git+https://github.com/huggingface/transformers.git
|
||||
|
||||
## Building vLLM's Docker Image from Source
|
||||
|
||||
You can build and run vLLM from source via the provided <gh-file:Dockerfile>. To build vLLM:
|
||||
You can build and run vLLM from source via the provided <gh-file:docker/Dockerfile>. To build vLLM:
|
||||
|
||||
```console
|
||||
# optionally specifies: --build-arg max_jobs=8 --build-arg nvcc_threads=2
|
||||
DOCKER_BUILDKIT=1 docker build . --target vllm-openai --tag vllm/vllm-openai
|
||||
DOCKER_BUILDKIT=1 docker build . --target vllm-openai --tag vllm/vllm-openai --file docker/Dockerfile
|
||||
```
|
||||
|
||||
:::{note}
|
||||
@ -92,6 +92,7 @@ Keep an eye on memory usage with parallel jobs as it can be substantial (see exa
|
||||
# Example of building on Nvidia GH200 server. (Memory usage: ~15GB, Build time: ~1475s / ~25 min, Image size: 6.93GB)
|
||||
$ python3 use_existing_torch.py
|
||||
$ DOCKER_BUILDKIT=1 docker build . \
|
||||
--file docker/Dockerfile \
|
||||
--target vllm-openai \
|
||||
--platform "linux/arm64" \
|
||||
-t vllm/vllm-gh200-openai:latest \
|
||||
|
||||
@ -69,14 +69,14 @@ server {
|
||||
|
||||
```console
|
||||
cd $vllm_root
|
||||
docker build -f Dockerfile . --tag vllm
|
||||
docker build -f docker/Dockerfile . --tag vllm
|
||||
```
|
||||
|
||||
If you are behind proxy, you can pass the proxy settings to the docker build command as shown below:
|
||||
|
||||
```console
|
||||
cd $vllm_root
|
||||
docker build -f Dockerfile . --tag vllm --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy
|
||||
docker build -f docker/Dockerfile . --tag vllm --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy
|
||||
```
|
||||
|
||||
(nginxloadbalancer-nginx-docker-network)=
|
||||
|
||||
@ -16,5 +16,6 @@ gptqmodel
|
||||
int4
|
||||
int8
|
||||
fp8
|
||||
quark
|
||||
quantized_kvcache
|
||||
:::
|
||||
|
||||
217
docs/source/features/quantization/quark.md
Normal file
217
docs/source/features/quantization/quark.md
Normal file
@ -0,0 +1,217 @@
|
||||
(quark)=
|
||||
|
||||
# AMD QUARK
|
||||
|
||||
Quantization can effectively reduce memory and bandwidth usage, accelerate computation and improve
|
||||
throughput while with minimal accuracy loss. vLLM can leverage [Quark](https://quark.docs.amd.com/latest/),
|
||||
the flexible and powerful quantization toolkit, to produce performant quantized models to run on AMD GPUs. Quark has specialized support for quantizing large language models with weight,
|
||||
activation and kv-cache quantization and cutting-edge quantization algorithms like
|
||||
AWQ, GPTQ, Rotation and SmoothQuant.
|
||||
|
||||
## Quark Installation
|
||||
|
||||
Before quantizing models, you need to install Quark. The latest release of Quark can be installed with pip:
|
||||
|
||||
```console
|
||||
pip install amd-quark
|
||||
```
|
||||
|
||||
You can refer to [Quark installation guide](https://quark.docs.amd.com/latest/install.html)
|
||||
for more installation details.
|
||||
|
||||
## Quantization Process
|
||||
|
||||
After installing Quark, we will use an example to illustrate how to use Quark.
|
||||
The Quark quantization process can be listed for 5 steps as below:
|
||||
|
||||
1. Load the model
|
||||
2. Prepare the calibration dataloader
|
||||
3. Set the quantization configuration
|
||||
4. Quantize the model and export
|
||||
5. Evaluation in vLLM
|
||||
|
||||
### 1. Load the Model
|
||||
|
||||
Quark uses [Transformers](https://huggingface.co/docs/transformers/en/index)
|
||||
to fetch model and tokenizer.
|
||||
|
||||
```python
|
||||
from transformers import AutoTokenizer, AutoModelForCausalLM
|
||||
|
||||
MODEL_ID = "meta-llama/Llama-2-70b-chat-hf"
|
||||
MAX_SEQ_LEN = 512
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
MODEL_ID, device_map="auto", torch_dtype="auto",
|
||||
)
|
||||
model.eval()
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, model_max_length=MAX_SEQ_LEN)
|
||||
tokenizer.pad_token = tokenizer.eos_token
|
||||
```
|
||||
|
||||
### 2. Prepare the Calibration Dataloader
|
||||
|
||||
Quark uses the [PyTorch Dataloader](https://pytorch.org/tutorials/beginner/basics/data_tutorial.html)
|
||||
to load calibration data. For more details about how to use calibration datasets efficiently, please refer
|
||||
to [Adding Calibration Datasets](https://quark.docs.amd.com/latest/pytorch/calibration_datasets.html).
|
||||
|
||||
```python
|
||||
from datasets import load_dataset
|
||||
from torch.utils.data import DataLoader
|
||||
|
||||
BATCH_SIZE = 1
|
||||
NUM_CALIBRATION_DATA = 512
|
||||
|
||||
# Load the dataset and get calibration data.
|
||||
dataset = load_dataset("mit-han-lab/pile-val-backup", split="validation")
|
||||
text_data = dataset["text"][:NUM_CALIBRATION_DATA]
|
||||
|
||||
tokenized_outputs = tokenizer(text_data, return_tensors="pt",
|
||||
padding=True, truncation=True, max_length=MAX_SEQ_LEN)
|
||||
calib_dataloader = DataLoader(tokenized_outputs['input_ids'],
|
||||
batch_size=BATCH_SIZE, drop_last=True)
|
||||
```
|
||||
|
||||
### 3. Set the Quantization Configuration
|
||||
|
||||
We need to set the quantization configuration, you can check
|
||||
[quark config guide](https://quark.docs.amd.com/latest/pytorch/user_guide_config_description.html)
|
||||
for further details. Here we use FP8 per-tensor quantization on weight, activation,
|
||||
kv-cache and the quantization algorithm is AutoSmoothQuant.
|
||||
|
||||
:::{note}
|
||||
Note the quantization algorithm needs a JSON config file and the config file is located in
|
||||
[Quark Pytorch examples](https://quark.docs.amd.com/latest/pytorch/pytorch_examples.html),
|
||||
under the directory `examples/torch/language_modeling/llm_ptq/models`. For example,
|
||||
AutoSmoothQuant config file for Llama is
|
||||
`examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json`.
|
||||
:::
|
||||
|
||||
```python
|
||||
from quark.torch.quantization import (Config, QuantizationConfig,
|
||||
FP8E4M3PerTensorSpec,
|
||||
load_quant_algo_config_from_file)
|
||||
|
||||
# Define fp8/per-tensor/static spec.
|
||||
FP8_PER_TENSOR_SPEC = FP8E4M3PerTensorSpec(observer_method="min_max",
|
||||
is_dynamic=False).to_quantization_spec()
|
||||
|
||||
# Define global quantization config, input tensors and weight apply FP8_PER_TENSOR_SPEC.
|
||||
global_quant_config = QuantizationConfig(input_tensors=FP8_PER_TENSOR_SPEC,
|
||||
weight=FP8_PER_TENSOR_SPEC)
|
||||
|
||||
# Define quantization config for kv-cache layers, output tensors apply FP8_PER_TENSOR_SPEC.
|
||||
KV_CACHE_SPEC = FP8_PER_TENSOR_SPEC
|
||||
kv_cache_layer_names_for_llama = ["*k_proj", "*v_proj"]
|
||||
kv_cache_quant_config = {name :
|
||||
QuantizationConfig(input_tensors=global_quant_config.input_tensors,
|
||||
weight=global_quant_config.weight,
|
||||
output_tensors=KV_CACHE_SPEC)
|
||||
for name in kv_cache_layer_names_for_llama}
|
||||
layer_quant_config = kv_cache_quant_config.copy()
|
||||
|
||||
# Define algorithm config by config file.
|
||||
LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE =
|
||||
'examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json'
|
||||
algo_config = load_quant_algo_config_from_file(LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE)
|
||||
|
||||
EXCLUDE_LAYERS = ["lm_head"]
|
||||
quant_config = Config(
|
||||
global_quant_config=global_quant_config,
|
||||
layer_quant_config=layer_quant_config,
|
||||
kv_cache_quant_config=kv_cache_quant_config,
|
||||
exclude=EXCLUDE_LAYERS,
|
||||
algo_config=algo_config)
|
||||
```
|
||||
|
||||
### 4. Quantize the Model and Export
|
||||
|
||||
Then we can apply the quantization. After quantizing, we need to freeze the
|
||||
quantized model first before exporting. Note that we need to export model with format of
|
||||
HuggingFace `safetensors`, you can refer to
|
||||
[HuggingFace format exporting](https://quark.docs.amd.com/latest/pytorch/export/quark_export_hf.html)
|
||||
for more exporting format details.
|
||||
|
||||
```python
|
||||
import torch
|
||||
from quark.torch import ModelQuantizer, ModelExporter
|
||||
from quark.torch.export import ExporterConfig, JsonExporterConfig
|
||||
|
||||
# Apply quantization.
|
||||
quantizer = ModelQuantizer(quant_config)
|
||||
quant_model = quantizer.quantize_model(model, calib_dataloader)
|
||||
|
||||
# Freeze quantized model to export.
|
||||
freezed_model = quantizer.freeze(model)
|
||||
|
||||
# Define export config.
|
||||
LLAMA_KV_CACHE_GROUP = ["*k_proj", "*v_proj"]
|
||||
export_config = ExporterConfig(json_export_config=JsonExporterConfig())
|
||||
export_config.json_export_config.kv_cache_group = LLAMA_KV_CACHE_GROUP
|
||||
|
||||
EXPORT_DIR = MODEL_ID.split("/")[1] + "-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant"
|
||||
exporter = ModelExporter(config=export_config, export_dir=EXPORT_DIR)
|
||||
with torch.no_grad():
|
||||
exporter.export_safetensors_model(freezed_model,
|
||||
quant_config=quant_config, tokenizer=tokenizer)
|
||||
```
|
||||
|
||||
### 5. Evaluation in vLLM
|
||||
|
||||
Now, you can load and run the Quark quantized model directly through the LLM entrypoint:
|
||||
|
||||
```python
|
||||
from vllm import LLM, SamplingParams
|
||||
|
||||
# Sample prompts.
|
||||
prompts = [
|
||||
"Hello, my name is",
|
||||
"The president of the United States is",
|
||||
"The capital of France is",
|
||||
"The future of AI is",
|
||||
]
|
||||
# Create a sampling params object.
|
||||
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
|
||||
|
||||
# Create an LLM.
|
||||
llm = LLM(model="Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant",
|
||||
kv_cache_dtype='fp8',quantization='quark')
|
||||
# Generate texts from the prompts. The output is a list of RequestOutput objects
|
||||
# that contain the prompt, generated text, and other information.
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
# Print the outputs.
|
||||
print("\nGenerated Outputs:\n" + "-" * 60)
|
||||
for output in outputs:
|
||||
prompt = output.prompt
|
||||
generated_text = output.outputs[0].text
|
||||
print(f"Prompt: {prompt!r}")
|
||||
print(f"Output: {generated_text!r}")
|
||||
print("-" * 60)
|
||||
```
|
||||
|
||||
Or, you can use `lm_eval` to evaluate accuracy:
|
||||
|
||||
```console
|
||||
$ lm_eval --model vllm \
|
||||
--model_args pretrained=Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant,kv_cache_dtype='fp8',quantization='quark' \
|
||||
--tasks gsm8k
|
||||
```
|
||||
|
||||
## Quark Quantization Script
|
||||
In addition to the example of Python API above, Quark also offers a
|
||||
[quantization script](https://quark.docs.amd.com/latest/pytorch/example_quark_torch_llm_ptq.html)
|
||||
to quantize large language models more conveniently. It supports quantizing models with variety
|
||||
of different quantization schemes and optimization algorithms. It can export the quantized model
|
||||
and run evaluation tasks on the fly. With the script, the example above can be:
|
||||
|
||||
```console
|
||||
python3 quantize_quark.py --model_dir meta-llama/Llama-2-70b-chat-hf \
|
||||
--output_dir /path/to/output \
|
||||
--quant_scheme w_fp8_a_fp8 \
|
||||
--kv_cache_dtype fp8 \
|
||||
--quant_algo autosmoothquant \
|
||||
--num_calib_data 512 \
|
||||
--model_export hf_format \
|
||||
--tasks gsm8k
|
||||
```
|
||||
@ -86,7 +86,7 @@ Currently, there are no pre-built Intel Gaudi images.
|
||||
### Build image from source
|
||||
|
||||
```console
|
||||
docker build -f Dockerfile.hpu -t vllm-hpu-env .
|
||||
docker build -f docker/Dockerfile.hpu -t vllm-hpu-env .
|
||||
docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --rm vllm-hpu-env
|
||||
```
|
||||
|
||||
|
||||
@ -132,7 +132,7 @@ Currently, there are no pre-built Neuron images.
|
||||
|
||||
See <project:#deployment-docker-build-image-from-source> for instructions on building the Docker image.
|
||||
|
||||
Make sure to use <gh-file:Dockerfile.neuron> in place of the default Dockerfile.
|
||||
Make sure to use <gh-file:docker/Dockerfile.neuron> in place of the default Dockerfile.
|
||||
|
||||
## Extra information
|
||||
|
||||
|
||||
@ -169,10 +169,10 @@ See <project:#deployment-docker-pre-built-image> for instructions on using the o
|
||||
|
||||
### Build image from source
|
||||
|
||||
You can use <gh-file:Dockerfile.tpu> to build a Docker image with TPU support.
|
||||
You can use <gh-file:docker/Dockerfile.tpu> to build a Docker image with TPU support.
|
||||
|
||||
```console
|
||||
docker build -f Dockerfile.tpu -t vllm-tpu .
|
||||
docker build -f docker/Dockerfile.tpu -t vllm-tpu .
|
||||
```
|
||||
|
||||
Run the Docker image with the following command:
|
||||
|
||||
@ -177,7 +177,7 @@ Currently, there are no pre-built CPU wheels.
|
||||
### Build image from source
|
||||
|
||||
```console
|
||||
$ docker build -f Dockerfile.cpu --tag vllm-cpu-env --target vllm-openai .
|
||||
$ docker build -f docker/Dockerfile.cpu --tag vllm-cpu-env --target vllm-openai .
|
||||
|
||||
# Launching OpenAI server
|
||||
$ docker run --rm \
|
||||
@ -193,11 +193,11 @@ $ docker run --rm \
|
||||
```
|
||||
|
||||
::::{tip}
|
||||
For ARM or Apple silicon, use `Dockerfile.arm`
|
||||
For ARM or Apple silicon, use `docker/Dockerfile.arm`
|
||||
::::
|
||||
|
||||
::::{tip}
|
||||
For IBM Z (s390x), use `Dockerfile.s390x` and in `docker run` use flag `--dtype float`
|
||||
For IBM Z (s390x), use `docker/Dockerfile.s390x` and in `docker run` use flag `--dtype float`
|
||||
::::
|
||||
|
||||
## Supported features
|
||||
|
||||
@ -123,7 +123,7 @@ Building the Docker image from source is the recommended way to use vLLM with RO
|
||||
|
||||
#### (Optional) Build an image with ROCm software stack
|
||||
|
||||
Build a docker image from <gh-file:Dockerfile.rocm_base> which setup ROCm software stack needed by the vLLM.
|
||||
Build a docker image from <gh-file:docker/Dockerfile.rocm_base> which setup ROCm software stack needed by the vLLM.
|
||||
**This step is optional as this rocm_base image is usually prebuilt and store at [Docker Hub](https://hub.docker.com/r/rocm/vllm-dev) under tag `rocm/vllm-dev:base` to speed up user experience.**
|
||||
If you choose to build this rocm_base image yourself, the steps are as follows.
|
||||
|
||||
@ -140,12 +140,12 @@ It is important that the user kicks off the docker build using buildkit. Either
|
||||
To build vllm on ROCm 6.3 for MI200 and MI300 series, you can use the default:
|
||||
|
||||
```console
|
||||
DOCKER_BUILDKIT=1 docker build -f Dockerfile.rocm_base -t rocm/vllm-dev:base .
|
||||
DOCKER_BUILDKIT=1 docker build -f docker/Dockerfile.rocm_base -t rocm/vllm-dev:base .
|
||||
```
|
||||
|
||||
#### Build an image with vLLM
|
||||
|
||||
First, build a docker image from <gh-file:Dockerfile.rocm> and launch a docker container from the image.
|
||||
First, build a docker image from <gh-file:docker/Dockerfile.rocm> and launch a docker container from the image.
|
||||
It is important that the user kicks off the docker build using buildkit. Either the user put `DOCKER_BUILDKIT=1` as environment variable when calling docker build command, or the user needs to setup buildkit in the docker daemon configuration /etc/docker/daemon.json as follows and restart the daemon:
|
||||
|
||||
```console
|
||||
@ -156,10 +156,10 @@ It is important that the user kicks off the docker build using buildkit. Either
|
||||
}
|
||||
```
|
||||
|
||||
<gh-file:Dockerfile.rocm> uses ROCm 6.3 by default, but also supports ROCm 5.7, 6.0, 6.1, and 6.2, in older vLLM branches.
|
||||
<gh-file:docker/Dockerfile.rocm> uses ROCm 6.3 by default, but also supports ROCm 5.7, 6.0, 6.1, and 6.2, in older vLLM branches.
|
||||
It provides flexibility to customize the build of docker image using the following arguments:
|
||||
|
||||
- `BASE_IMAGE`: specifies the base image used when running `docker build`. The default value `rocm/vllm-dev:base` is an image published and maintained by AMD. It is being built using <gh-file:Dockerfile.rocm_base>
|
||||
- `BASE_IMAGE`: specifies the base image used when running `docker build`. The default value `rocm/vllm-dev:base` is an image published and maintained by AMD. It is being built using <gh-file:docker/Dockerfile.rocm_base>
|
||||
- `USE_CYTHON`: An option to run cython compilation on a subset of python files upon docker build
|
||||
- `BUILD_RPD`: Include RocmProfileData profiling tool in the image
|
||||
- `ARG_PYTORCH_ROCM_ARCH`: Allows to override the gfx architecture values from the base docker image
|
||||
@ -169,13 +169,13 @@ Their values can be passed in when running `docker build` with `--build-arg` opt
|
||||
To build vllm on ROCm 6.3 for MI200 and MI300 series, you can use the default:
|
||||
|
||||
```console
|
||||
DOCKER_BUILDKIT=1 docker build -f Dockerfile.rocm -t vllm-rocm .
|
||||
DOCKER_BUILDKIT=1 docker build -f docker/Dockerfile.rocm -t vllm-rocm .
|
||||
```
|
||||
|
||||
To build vllm on ROCm 6.3 for Radeon RX7900 series (gfx1100), you should pick the alternative base image:
|
||||
|
||||
```console
|
||||
DOCKER_BUILDKIT=1 docker build --build-arg BASE_IMAGE="rocm/vllm-dev:navi_base" -f Dockerfile.rocm -t vllm-rocm .
|
||||
DOCKER_BUILDKIT=1 docker build --build-arg BASE_IMAGE="rocm/vllm-dev:navi_base" -f docker/Dockerfile.rocm -t vllm-rocm .
|
||||
```
|
||||
|
||||
To run the above docker image `vllm-rocm`, use the below command:
|
||||
|
||||
@ -54,7 +54,7 @@ Currently, there are no pre-built XPU images.
|
||||
### Build image from source
|
||||
|
||||
```console
|
||||
$ docker build -f Dockerfile.xpu -t vllm-xpu-env --shm-size=4g .
|
||||
$ docker build -f docker/Dockerfile.xpu -t vllm-xpu-env --shm-size=4g .
|
||||
$ docker run -it \
|
||||
--rm \
|
||||
--network=host \
|
||||
|
||||
@ -208,5 +208,5 @@ Currently, vLLM supports multiple backends for efficient Attention computation a
|
||||
If desired, you can also manually set the backend of your choice by configuring the environment variable `VLLM_ATTENTION_BACKEND` to one of the following options: `FLASH_ATTN`, `FLASHINFER` or `XFORMERS`.
|
||||
|
||||
```{attention}
|
||||
There are no pre-built vllm wheels containing Flash Infer, so you must install it in your environment first. Refer to the [Flash Infer official docs](https://docs.flashinfer.ai/) or see [Dockerfile](https://github.com/vllm-project/vllm/blob/main/Dockerfile) for instructions on how to install it.
|
||||
There are no pre-built vllm wheels containing Flash Infer, so you must install it in your environment first. Refer to the [Flash Infer official docs](https://docs.flashinfer.ai/) or see <gh-file:docker/Dockerfile> for instructions on how to install it.
|
||||
```
|
||||
|
||||
@ -77,9 +77,9 @@ getting_started/v1_user_guide
|
||||
:caption: Models
|
||||
:maxdepth: 1
|
||||
|
||||
models/supported_models
|
||||
models/generative_models
|
||||
models/pooling_models
|
||||
models/supported_models
|
||||
models/extensions/index
|
||||
:::
|
||||
|
||||
|
||||
@ -1,55 +1,28 @@
|
||||
(supported-models)=
|
||||
|
||||
# List of Supported Models
|
||||
# Supported Models
|
||||
|
||||
vLLM supports generative and pooling models across various tasks.
|
||||
vLLM supports [generative](generative-models) and [pooling](pooling-models) models across various tasks.
|
||||
If a model supports more than one task, you can set the task via the `--task` argument.
|
||||
|
||||
For each task, we list the model architectures that have been implemented in vLLM.
|
||||
Alongside each architecture, we include some popular models that use it.
|
||||
|
||||
## Loading a Model
|
||||
## Model Implementation
|
||||
|
||||
### HuggingFace Hub
|
||||
### vLLM
|
||||
|
||||
By default, vLLM loads models from [HuggingFace (HF) Hub](https://huggingface.co/models).
|
||||
If vLLM natively supports a model, its implementation can be found in <gh-file:vllm/model_executor/models>.
|
||||
|
||||
To determine whether a given model is natively supported, you can check the `config.json` file inside the HF repository.
|
||||
If the `"architectures"` field contains a model architecture listed below, then it should be natively supported.
|
||||
These models are what we list in <project:#supported-text-models> and <project:#supported-mm-models>.
|
||||
|
||||
Models do not _need_ to be natively supported to be used in vLLM.
|
||||
The <project:#transformers-fallback> enables you to run models directly using their Transformers implementation (or even remote code on the Hugging Face Model Hub!).
|
||||
(transformers-backend)=
|
||||
|
||||
:::{tip}
|
||||
The easiest way to check if your model is really supported at runtime is to run the program below:
|
||||
### Transformers
|
||||
|
||||
```python
|
||||
from vllm import LLM
|
||||
vLLM also supports model implementations that are available in Transformers. This does not currently work for all models, but most decoder language models are supported, and vision language model support is planned!
|
||||
|
||||
# For generative models (task=generate) only
|
||||
llm = LLM(model=..., task="generate") # Name or path of your model
|
||||
output = llm.generate("Hello, my name is")
|
||||
print(output)
|
||||
|
||||
# For pooling models (task={embed,classify,reward,score}) only
|
||||
llm = LLM(model=..., task="embed") # Name or path of your model
|
||||
output = llm.encode("Hello, my name is")
|
||||
print(output)
|
||||
```
|
||||
|
||||
If vLLM successfully returns text (for generative models) or hidden states (for pooling models), it indicates that your model is supported.
|
||||
:::
|
||||
|
||||
Otherwise, please refer to [Adding a New Model](#new-model) for instructions on how to implement your model in vLLM.
|
||||
Alternatively, you can [open an issue on GitHub](https://github.com/vllm-project/vllm/issues/new/choose) to request vLLM support.
|
||||
|
||||
(transformers-fallback)=
|
||||
|
||||
### Transformers fallback
|
||||
|
||||
vLLM can fallback to model implementations that are available in Transformers. This does not work for all models for now, but most decoder language models are supported, and vision language model support is planned!
|
||||
|
||||
To check if the backend is Transformers, you can simply do this:
|
||||
To check if the modeling backend is Transformers, you can simply do this:
|
||||
|
||||
```python
|
||||
from vllm import LLM
|
||||
@ -69,16 +42,15 @@ vLLM may not fully optimise the Transformers implementation so you may see degra
|
||||
|
||||
#### Supported features
|
||||
|
||||
The Transformers fallback explicitly supports the following features:
|
||||
The Transformers modeling backend explicitly supports the following features:
|
||||
|
||||
- <project:#quantization-index> (except GGUF)
|
||||
- <project:#lora-adapter>
|
||||
- <project:#distributed-serving>
|
||||
|
||||
#### Remote code
|
||||
#### Remote Code
|
||||
|
||||
Earlier we mentioned that the Transformers fallback enables you to run remote code models directly in vLLM.
|
||||
If you are interested in this feature, this section is for you!
|
||||
If your model is neither supported natively by vLLM or Transformers, you can still run it in vLLM!
|
||||
|
||||
Simply set `trust_remote_code=True` and vLLM will run any model on the Model Hub that is compatible with Transformers.
|
||||
Provided that the model writer implements their model in a compatible way, this means that you can run new models before they are officially supported in Transformers or vLLM!
|
||||
@ -89,7 +61,7 @@ llm = LLM(model=..., task="generate", trust_remote_code=True) # Name or path of
|
||||
llm.apply_model(lambda model: print(model.__class__))
|
||||
```
|
||||
|
||||
To make your model compatible with the Transformers fallback, it needs:
|
||||
To make your model compatible with the Transformers backend, it needs:
|
||||
|
||||
```{code-block} python
|
||||
:caption: modeling_my_model.py
|
||||
@ -121,7 +93,9 @@ Here is what happens in the background:
|
||||
2. `MyModel` Python class is loaded from the `auto_map`, and we check that the model `_supports_attention_backend`.
|
||||
3. The `TransformersForCausalLM` backend is used. See <gh-file:vllm/model_executor/models/transformers.py>, which leverage `self.config._attn_implementation = "vllm"`, thus the need to use `ALL_ATTENTION_FUNCTION`.
|
||||
|
||||
To make your model compatible with tensor parallel, it needs:
|
||||
That's it!
|
||||
|
||||
For your model to be compatible with vLLM's tensor parallel and/or pipeline parallel features, you must add `base_model_tp_plan` and/or `base_model_pp_plan` to your model's config class:
|
||||
|
||||
```{code-block} python
|
||||
:caption: configuration_my_model.py
|
||||
@ -130,20 +104,65 @@ from transformers import PretrainedConfig
|
||||
|
||||
class MyConfig(PretrainedConfig):
|
||||
base_model_tp_plan = {
|
||||
"layers.*.self_attn.q_proj": "colwise",
|
||||
...
|
||||
"layers.*.self_attn.k_proj": "colwise",
|
||||
"layers.*.self_attn.v_proj": "colwise",
|
||||
"layers.*.self_attn.o_proj": "rowwise",
|
||||
"layers.*.mlp.gate_proj": "colwise",
|
||||
"layers.*.mlp.up_proj": "colwise",
|
||||
"layers.*.mlp.down_proj": "rowwise",
|
||||
}
|
||||
base_model_pp_plan = {
|
||||
"embed_tokens": (["input_ids"], ["inputs_embeds"]),
|
||||
"layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
|
||||
"norm": (["hidden_states"], ["hidden_states"]),
|
||||
}
|
||||
```
|
||||
|
||||
- `base_model_tp_plan` is a `dict` that maps fully qualified layer name patterns to tensor parallel styles (currently only `"colwise"` and `"rowwise"` are supported).
|
||||
- `base_model_pp_plan` is a `dict` that maps direct child layer names to `tuple`s of `list`s of `str`s:
|
||||
* You only need to do this for layers which are not present on all pipeline stages
|
||||
* vLLM assumes that there will be only one `nn.ModuleList`, which is distributed across the pipeline stages
|
||||
* The `list` in the first element of the `tuple` contains the names of the input arguments
|
||||
* The `list` in the last element of the `tuple` contains the names of the variables the layer outputs to in your modeling code
|
||||
|
||||
## Loading a Model
|
||||
|
||||
### Hugging Face Hub
|
||||
|
||||
By default, vLLM loads models from [Hugging Face (HF) Hub](https://huggingface.co/models).
|
||||
|
||||
To determine whether a given model is natively supported, you can check the `config.json` file inside the HF repository.
|
||||
If the `"architectures"` field contains a model architecture listed below, then it should be natively supported.
|
||||
|
||||
Models do not _need_ to be natively supported to be used in vLLM.
|
||||
The [Transformers backend](#transformers-backend) enables you to run models directly using their Transformers implementation (or even remote code on the Hugging Face Model Hub!).
|
||||
|
||||
:::{tip}
|
||||
`base_model_tp_plan` is a `dict` that maps fully qualified layer name patterns to tensor parallel styles (currently only `"colwise"` and `"rowwise"` are supported).
|
||||
The easiest way to check if your model is really supported at runtime is to run the program below:
|
||||
|
||||
```python
|
||||
from vllm import LLM
|
||||
|
||||
# For generative models (task=generate) only
|
||||
llm = LLM(model=..., task="generate") # Name or path of your model
|
||||
output = llm.generate("Hello, my name is")
|
||||
print(output)
|
||||
|
||||
# For pooling models (task={embed,classify,reward,score}) only
|
||||
llm = LLM(model=..., task="embed") # Name or path of your model
|
||||
output = llm.encode("Hello, my name is")
|
||||
print(output)
|
||||
```
|
||||
|
||||
If vLLM successfully returns text (for generative models) or hidden states (for pooling models), it indicates that your model is supported.
|
||||
:::
|
||||
|
||||
That's it!
|
||||
Otherwise, please refer to [Adding a New Model](#new-model) for instructions on how to implement your model in vLLM.
|
||||
Alternatively, you can [open an issue on GitHub](https://github.com/vllm-project/vllm/issues/new/choose) to request vLLM support.
|
||||
|
||||
### ModelScope
|
||||
|
||||
To use models from [ModelScope](https://www.modelscope.cn) instead of HuggingFace Hub, set an environment variable:
|
||||
To use models from [ModelScope](https://www.modelscope.cn) instead of Hugging Face Hub, set an environment variable:
|
||||
|
||||
```shell
|
||||
export VLLM_USE_MODELSCOPE=True
|
||||
@ -165,6 +184,8 @@ output = llm.encode("Hello, my name is")
|
||||
print(output)
|
||||
```
|
||||
|
||||
(supported-text-models)=
|
||||
|
||||
## List of Text-only Language Models
|
||||
|
||||
### Generative Models
|
||||
@ -545,7 +566,7 @@ you should explicitly specify the task type to ensure that the model is used in
|
||||
*
|
||||
- * `XLMRobertaModel`
|
||||
* XLM-RoBERTa-based
|
||||
* `intfloat/multilingual-e5-large`, etc.
|
||||
* `intfloat/multilingual-e5-large`, `jinaai/jina-reranker-v2-base-multilingual`, etc.
|
||||
*
|
||||
*
|
||||
:::
|
||||
@ -732,6 +753,13 @@ See [this page](#generative-models) for more information on how to use generativ
|
||||
*
|
||||
* ✅︎
|
||||
* ✅︎
|
||||
- * `AyaVisionForConditionalGeneration`
|
||||
* Aya Vision
|
||||
* T + I<sup>+</sup>
|
||||
* `CohereForAI/aya-vision-8b`, `CohereForAI/aya-vision-32b`, etc.
|
||||
*
|
||||
* ✅︎
|
||||
* ✅︎
|
||||
- * `Blip2ForConditionalGeneration`
|
||||
* BLIP-2
|
||||
* T + I<sup>E</sup>
|
||||
@ -844,6 +872,13 @@ See [this page](#generative-models) for more information on how to use generativ
|
||||
* ✅︎
|
||||
* ✅︎
|
||||
* ✅︎
|
||||
- * `Mistral3ForConditionalGeneration`
|
||||
* Mistral3
|
||||
* T + I<sup>+</sup>
|
||||
* `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, etc.
|
||||
*
|
||||
* ✅︎
|
||||
*
|
||||
- * `MllamaForConditionalGeneration`
|
||||
* Llama 3.2
|
||||
* T + I<sup>+</sup>
|
||||
@ -1066,7 +1101,7 @@ At vLLM, we are committed to facilitating the integration and support of third-p
|
||||
2. **Best-Effort Consistency**: While we aim to maintain a level of consistency between the models implemented in vLLM and other frameworks like transformers, complete alignment is not always feasible. Factors like acceleration techniques and the use of low-precision computations can introduce discrepancies. Our commitment is to ensure that the implemented models are functional and produce sensible results.
|
||||
|
||||
:::{tip}
|
||||
When comparing the output of `model.generate` from HuggingFace Transformers with the output of `llm.generate` from vLLM, note that the former reads the model's generation config file (i.e., [generation_config.json](https://github.com/huggingface/transformers/blob/19dabe96362803fb0a9ae7073d03533966598b17/src/transformers/generation/utils.py#L1945)) and applies the default parameters for generation, while the latter only uses the parameters passed to the function. Ensure all sampling parameters are identical when comparing outputs.
|
||||
When comparing the output of `model.generate` from Hugging Face Transformers with the output of `llm.generate` from vLLM, note that the former reads the model's generation config file (i.e., [generation_config.json](https://github.com/huggingface/transformers/blob/19dabe96362803fb0a9ae7073d03533966598b17/src/transformers/generation/utils.py#L1945)) and applies the default parameters for generation, while the latter only uses the parameters passed to the function. Ensure all sampling parameters are identical when comparing outputs.
|
||||
:::
|
||||
|
||||
3. **Issue Resolution and Model Updates**: Users are encouraged to report any bugs or issues they encounter with third-party models. Proposed fixes should be submitted via PRs, with a clear explanation of the problem and the rationale behind the proposed solution. If a fix for one model impacts another, we rely on the community to highlight and address these cross-model dependencies. Note: for bugfix PRs, it is good etiquette to inform the original author to seek their feedback.
|
||||
|
||||
@ -188,6 +188,7 @@ For example:
|
||||
```yaml
|
||||
# config.yaml
|
||||
|
||||
model: meta-llama/Llama-3.1-8B-Instruct
|
||||
host: "127.0.0.1"
|
||||
port: 6379
|
||||
uvicorn-log-level: "info"
|
||||
@ -196,12 +197,13 @@ uvicorn-log-level: "info"
|
||||
To use the above config file:
|
||||
|
||||
```bash
|
||||
vllm serve SOME_MODEL --config config.yaml
|
||||
vllm serve --config config.yaml
|
||||
```
|
||||
|
||||
:::{note}
|
||||
In case an argument is supplied simultaneously using command line and the config file, the value from the command line will take precedence.
|
||||
The order of priorities is `command line > config file values > defaults`.
|
||||
e.g. `vllm serve SOME_MODEL --config config.yaml`, SOME_MODEL takes precedence over `model` in config file.
|
||||
:::
|
||||
|
||||
## API Reference
|
||||
|
||||
@ -60,6 +60,28 @@ def run_aria(questions: list[str], modality: str) -> ModelRequestData:
|
||||
)
|
||||
|
||||
|
||||
# Aya Vision
|
||||
def run_aya_vision(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
model_name = "CohereForAI/aya-vision-8b"
|
||||
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
max_model_len=2048,
|
||||
max_num_seqs=2,
|
||||
mm_processor_kwargs={"crop_to_patches": True},
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
|
||||
)
|
||||
prompts = [
|
||||
f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|><image>{question}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"
|
||||
for question in questions
|
||||
]
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
)
|
||||
|
||||
|
||||
# BLIP-2
|
||||
def run_blip2(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
@ -498,6 +520,29 @@ def run_minicpmv(questions: list[str], modality: str) -> ModelRequestData:
|
||||
return run_minicpmv_base(questions, modality, "openbmb/MiniCPM-V-2_6")
|
||||
|
||||
|
||||
# Mistral-3 HF-format
|
||||
def run_mistral3(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
|
||||
model_name = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
|
||||
|
||||
# NOTE: Need L40 (or equivalent) to avoid OOM
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
max_model_len=8192,
|
||||
max_num_seqs=2,
|
||||
tensor_parallel_size=2,
|
||||
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
|
||||
)
|
||||
|
||||
prompts = [f"<s>[INST]{question}\n[IMG][/INST]" for question in questions]
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompts=prompts,
|
||||
)
|
||||
|
||||
|
||||
# LLama 3.2
|
||||
def run_mllama(questions: list[str], modality: str) -> ModelRequestData:
|
||||
assert modality == "image"
|
||||
@ -842,6 +887,7 @@ def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData:
|
||||
|
||||
model_example_map = {
|
||||
"aria": run_aria,
|
||||
"aya_vision": run_aya_vision,
|
||||
"blip-2": run_blip2,
|
||||
"chameleon": run_chameleon,
|
||||
"deepseek_vl_v2": run_deepseek_vl2,
|
||||
@ -859,6 +905,7 @@ model_example_map = {
|
||||
"mantis": run_mantis,
|
||||
"minicpmo": run_minicpmo,
|
||||
"minicpmv": run_minicpmv,
|
||||
"mistral3": run_mistral3,
|
||||
"mllama": run_mllama,
|
||||
"molmo": run_molmo,
|
||||
"NVLM_D": run_nvlm_d,
|
||||
|
||||
@ -61,6 +61,41 @@ def load_aria(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||
)
|
||||
|
||||
|
||||
def load_aya_vision(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||
model_name = "CohereForAI/aya-vision-8b"
|
||||
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
max_num_seqs=2,
|
||||
limit_mm_per_prompt={"image": len(image_urls)},
|
||||
)
|
||||
|
||||
placeholders = [{"type": "image", "image": url} for url in image_urls]
|
||||
messages = [{
|
||||
"role":
|
||||
"user",
|
||||
"content": [
|
||||
*placeholders,
|
||||
{
|
||||
"type": "text",
|
||||
"text": question
|
||||
},
|
||||
],
|
||||
}]
|
||||
|
||||
processor = AutoProcessor.from_pretrained(model_name)
|
||||
|
||||
prompt = processor.apply_chat_template(messages,
|
||||
tokenize=False,
|
||||
add_generation_prompt=True)
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompt=prompt,
|
||||
image_data=[fetch_image(url) for url in image_urls],
|
||||
)
|
||||
|
||||
|
||||
def load_deepseek_vl2(question: str,
|
||||
image_urls: list[str]) -> ModelRequestData:
|
||||
model_name = "deepseek-ai/deepseek-vl2-tiny"
|
||||
@ -218,6 +253,28 @@ def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||
)
|
||||
|
||||
|
||||
def load_mistral3(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||
model_name = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
|
||||
|
||||
# Adjust this as necessary to fit in GPU
|
||||
engine_args = EngineArgs(
|
||||
model=model_name,
|
||||
max_model_len=8192,
|
||||
max_num_seqs=2,
|
||||
tensor_parallel_size=2,
|
||||
limit_mm_per_prompt={"image": len(image_urls)},
|
||||
)
|
||||
|
||||
placeholders = "[IMG]" * len(image_urls)
|
||||
prompt = f"<s>[INST]{question}\n{placeholders}[/INST]"
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompt=prompt,
|
||||
image_data=[fetch_image(url) for url in image_urls],
|
||||
)
|
||||
|
||||
|
||||
def load_mllama(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||
model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"
|
||||
|
||||
@ -504,11 +561,13 @@ def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData:
|
||||
|
||||
model_example_map = {
|
||||
"aria": load_aria,
|
||||
"aya_vision": load_aya_vision,
|
||||
"deepseek_vl_v2": load_deepseek_vl2,
|
||||
"gemma3": load_gemma3,
|
||||
"h2ovl_chat": load_h2ovl,
|
||||
"idefics3": load_idefics3,
|
||||
"internvl_chat": load_internvl,
|
||||
"mistral3": load_mistral3,
|
||||
"mllama": load_mllama,
|
||||
"NVLM_D": load_nvlm_d,
|
||||
"phi3_v": load_phi3v,
|
||||
|
||||
60
examples/tool_chat_template_phi4_mini.jinja
Normal file
60
examples/tool_chat_template_phi4_mini.jinja
Normal file
@ -0,0 +1,60 @@
|
||||
{%- if messages %}
|
||||
{%- if system_message or tools %}
|
||||
<|system|>
|
||||
|
||||
{%- if system_message %}
|
||||
{{ system_message }}
|
||||
{%- endif %}
|
||||
In addition to plain text responses, you can chose to call one or more of the provided functions.
|
||||
|
||||
Use the following rule to decide when to call a function:
|
||||
* if the response can be generated from your internal knowledge (e.g., as in the case of queries like "What is the capital of Poland?"), do so
|
||||
* if you need external information that can be obtained by calling one or more of the provided functions, generate a function calls
|
||||
|
||||
If you decide to call functions:
|
||||
* prefix function calls with functools marker (no closing marker required)
|
||||
* all function calls should be generated in a single JSON list formatted as functools[{"name": [function name], "arguments": [function arguments as JSON]}, ...]
|
||||
* follow the provided JSON schema. Do not hallucinate arguments or values. Do to blindly copy values from the provided samples
|
||||
* respect the argument type formatting. E.g., if the type if number and format is float, write value 7 as 7.0
|
||||
* make sure you pick the right functions that match the user intent
|
||||
|
||||
|
||||
{%- if tools %}
|
||||
{%- for t in tools %}
|
||||
{{- t | tojson(indent=4) }}
|
||||
{{- "\n\n" }}
|
||||
{%- endfor %}
|
||||
{%- endif %}<|end|>
|
||||
{%- endif %}
|
||||
|
||||
{%- for message in messages %}
|
||||
{%- if message.role != "system" %}
|
||||
<|{{ message.role }}|>
|
||||
{%- if message.content and message.role == "tools" %}
|
||||
{"result": {{ message.content }}}
|
||||
{%- elif message.content %}
|
||||
{{ message.content }}
|
||||
{%- elif message.tool_calls %}
|
||||
{%- for call in message.tool_calls %}
|
||||
{"name": "{{ call.function.name }}", "arguments": {{ call.function.arguments }}}
|
||||
{%- if not loop.last %},{% endif %}
|
||||
{%- endfor %}
|
||||
{%- endif %}<|end|>
|
||||
{%- endif %}
|
||||
{%- endfor %}<|assistant|>
|
||||
|
||||
{%- else %}
|
||||
{%- if system_message %}
|
||||
<|system|>
|
||||
|
||||
{{ system_message }}<|end|>
|
||||
{%- endif %}
|
||||
{%- if prompt %}
|
||||
<|user|>
|
||||
|
||||
{{ prompt }}<|end|>
|
||||
{%- endif %}<|assistant|>
|
||||
|
||||
{%- endif %}
|
||||
{{ response }}
|
||||
{%- if response %}<|user|>{% endif %}
|
||||
6
format.sh
Executable file → Normal file
6
format.sh
Executable file → Normal file
@ -1,6 +1,6 @@
|
||||
#!/bin/bash
|
||||
|
||||
echo "vLLM linting system has been moved from format.sh to pre-commit hook."
|
||||
echo "vLLM linting system has been moved from format.sh to pre-commit hooks."
|
||||
echo "Please run 'pip install -r requirements/lint.txt', followed by"
|
||||
echo "'pre-commit install --hook-type pre-commit --hook-type commit-msg' to install the pre-commit hook."
|
||||
echo "Then linters will run automatically before each commit."
|
||||
echo "'pre-commit install' to install the pre-commit hooks."
|
||||
echo "Then linters will run automatically before each commit."
|
||||
@ -1,16 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
msg = """Old style python only build (without compilation) is deprecated, please check https://docs.vllm.ai/en/latest/getting_started/installation.html#python-only-build-without-compilation for the new way to do python only build (without compilation).
|
||||
|
||||
TL;DR:
|
||||
|
||||
VLLM_USE_PRECOMPILED=1 pip install -e .
|
||||
|
||||
or
|
||||
|
||||
export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch
|
||||
export VLLM_PRECOMPILED_WHEEL_LOCATION=https://wheels.vllm.ai/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
|
||||
pip install -e .
|
||||
""" # noqa
|
||||
|
||||
print(msg)
|
||||
@ -30,6 +30,7 @@ msgspec
|
||||
gguf == 0.10.0
|
||||
importlib_metadata
|
||||
mistral_common[opencv] >= 1.5.4
|
||||
opencv-python-headless >= 4.11.0 # required for video IO
|
||||
pyyaml
|
||||
six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
|
||||
setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
|
||||
|
||||
@ -6,7 +6,7 @@ torch==2.6.0
|
||||
torchvision==0.21.0
|
||||
torchaudio==2.6.0
|
||||
|
||||
cmake>=3.26
|
||||
cmake>=3.26,<4
|
||||
packaging
|
||||
setuptools>=61
|
||||
setuptools-scm>=8
|
||||
|
||||
@ -9,7 +9,6 @@ pytest-shard
|
||||
# testing utils
|
||||
awscli
|
||||
backoff # required for phi4mm test
|
||||
decord # required for video tests
|
||||
einops # required for MPT, qwen-vl and Mamba
|
||||
httpx
|
||||
librosa # required for audio tests
|
||||
@ -28,6 +27,7 @@ torchvision==0.21.0
|
||||
transformers_stream_generator # required for qwen-vl test
|
||||
matplotlib # required for qwen-vl test
|
||||
mistral_common[opencv] >= 1.5.4 # required for pixtral test
|
||||
opencv-python-headless >= 4.11.0 # required for video test
|
||||
datamodel_code_generator # required for minicpm3 test
|
||||
lm-eval[api]==0.4.4 # required for model evaluation test
|
||||
transformers==4.50.3
|
||||
|
||||
@ -93,8 +93,6 @@ datasets==3.0.2
|
||||
# lm-eval
|
||||
decorator==5.1.1
|
||||
# via librosa
|
||||
decord==0.6.0
|
||||
# via -r requirements/test.in
|
||||
dill==0.3.8
|
||||
# via
|
||||
# datasets
|
||||
@ -276,7 +274,6 @@ numpy==1.26.4
|
||||
# contourpy
|
||||
# cupy-cuda12x
|
||||
# datasets
|
||||
# decord
|
||||
# einx
|
||||
# encodec
|
||||
# evaluate
|
||||
@ -337,8 +334,10 @@ nvidia-nvjitlink-cu12==12.4.127
|
||||
# torch
|
||||
nvidia-nvtx-cu12==12.4.127
|
||||
# via torch
|
||||
opencv-python-headless==4.10.0.84
|
||||
# via mistral-common
|
||||
opencv-python-headless==4.11.0.86
|
||||
# via
|
||||
# -r requirements/test.in
|
||||
# mistral-common
|
||||
packaging==24.1
|
||||
# via
|
||||
# accelerate
|
||||
|
||||
10
setup.py
10
setup.py
@ -201,6 +201,9 @@ class cmake_build_ext(build_ext):
|
||||
else:
|
||||
# Default build tool to whatever cmake picks.
|
||||
build_tool = []
|
||||
# Make sure we use the nvcc from CUDA_HOME
|
||||
if _is_cuda():
|
||||
cmake_args += [f'-DCMAKE_CUDA_COMPILER={CUDA_HOME}/bin/nvcc']
|
||||
subprocess.check_call(
|
||||
['cmake', ext.cmake_lists_dir, *build_tool, *cmake_args],
|
||||
cwd=self.build_temp)
|
||||
@ -639,11 +642,10 @@ if _is_hip():
|
||||
|
||||
if _is_cuda():
|
||||
ext_modules.append(CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa2_C"))
|
||||
if envs.VLLM_USE_PRECOMPILED or get_nvcc_cuda_version() >= Version("12.0"):
|
||||
# FA3 requires CUDA 12.0 or later
|
||||
if envs.VLLM_USE_PRECOMPILED or get_nvcc_cuda_version() >= Version("12.3"):
|
||||
# FA3 requires CUDA 12.3 or later
|
||||
ext_modules.append(
|
||||
CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa3_C"))
|
||||
if envs.VLLM_USE_PRECOMPILED or get_nvcc_cuda_version() >= Version("12.3"):
|
||||
# Optional since this doesn't get built (produce an .so file) when
|
||||
# not targeting a hopper system
|
||||
ext_modules.append(
|
||||
@ -682,7 +684,7 @@ setup(
|
||||
"fastsafetensors": ["fastsafetensors >= 0.1.10"],
|
||||
"runai": ["runai-model-streamer", "runai-model-streamer-s3", "boto3"],
|
||||
"audio": ["librosa", "soundfile"], # Required for audio processing
|
||||
"video": ["decord"] # Required for video processing
|
||||
"video": [] # Kept for backwards compatibility
|
||||
},
|
||||
cmdclass=cmdclass,
|
||||
package_data=package_data,
|
||||
|
||||
7
tests/config/test_config_with_model.yaml
Normal file
7
tests/config/test_config_with_model.yaml
Normal file
@ -0,0 +1,7 @@
|
||||
# Same as test_config.yaml but with model specified
|
||||
model: config-model
|
||||
port: 12312
|
||||
served_model_name: mymodel
|
||||
tensor_parallel_size: 2
|
||||
trust_remote_code: true
|
||||
multi_step_stream_outputs: false
|
||||
@ -1117,3 +1117,15 @@ def pytest_collection_modifyitems(config, items):
|
||||
for item in items:
|
||||
if "optional" in item.keywords:
|
||||
item.add_marker(skip_optional)
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def cli_config_file():
|
||||
"""Return the path to the CLI config file."""
|
||||
return os.path.join(_TEST_DIR, "config", "test_config.yaml")
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def cli_config_file_with_model():
|
||||
"""Return the path to the CLI config file with model."""
|
||||
return os.path.join(_TEST_DIR, "config", "test_config_with_model.yaml")
|
||||
|
||||
@ -129,12 +129,16 @@ def test_sliding_window_chunked_prefill(test_llm_generator, batch_size, seed,
|
||||
check_answers(indices, answer, test_texts)
|
||||
|
||||
|
||||
def prep_prompts(batch_size: int):
|
||||
def prep_prompts(batch_size: int, ln_range: tuple[int, int] = (800, 1100)):
|
||||
"""
|
||||
Generate prompts which a bunch of assignments,
|
||||
then asking for the value of one of them.
|
||||
The prompt is just under 10k tokens; sliding window is 4k
|
||||
so the answer is outside sliding window, but should still be correct.
|
||||
|
||||
Args:
|
||||
batch_size: number of prompts to generate
|
||||
ln_range: an argument to control the length of the prompt
|
||||
"""
|
||||
prompts: list[str] = []
|
||||
answer: list[int] = []
|
||||
@ -145,7 +149,7 @@ def prep_prompts(batch_size: int):
|
||||
indices.append(idx)
|
||||
prompt = "```python\n# We set a number of variables, " + \
|
||||
f"x{idx} will be important later\n"
|
||||
ln = random.randint(800, 1100)
|
||||
ln = random.randint(*ln_range)
|
||||
for k in range(30, ln):
|
||||
v = random.randint(10, 99)
|
||||
if k == idx:
|
||||
@ -157,7 +161,10 @@ def prep_prompts(batch_size: int):
|
||||
return prompts, answer, indices
|
||||
|
||||
|
||||
def check_answers(indices: list[int], answer: list[int], outputs: list[str]):
|
||||
def check_answers(indices: list[int],
|
||||
answer: list[int],
|
||||
outputs: list[str],
|
||||
accept_rate: float = 0.7):
|
||||
answer2 = [int(text[0:2].strip()) for text in outputs]
|
||||
print(list(zip(indices, zip(answer, answer2))))
|
||||
numok = 0
|
||||
@ -166,7 +173,7 @@ def check_answers(indices: list[int], answer: list[int], outputs: list[str]):
|
||||
numok += 1
|
||||
frac_ok = numok / len(answer)
|
||||
print(f"Num OK: {numok}/{len(answer)} {frac_ok}")
|
||||
assert frac_ok > 0.7
|
||||
assert frac_ok >= accept_rate
|
||||
|
||||
|
||||
def check_window(prompts: list[str]):
|
||||
|
||||
@ -106,7 +106,7 @@ def eager_allreduce(
|
||||
# communicate independently
|
||||
num_communication = rank // tp_size + 1
|
||||
sz = 1024
|
||||
fa = get_tp_group().ca_comm
|
||||
fa = get_tp_group().device_communicator.ca_comm
|
||||
inp = torch.ones(sz, dtype=torch.float32, device=device)
|
||||
out = inp
|
||||
for _ in range(num_communication):
|
||||
|
||||
@ -58,7 +58,7 @@ def test_lm_eval_accuracy_v1_engine(monkeypatch: pytest.MonkeyPatch):
|
||||
more_args = None
|
||||
if current_platform.is_tpu():
|
||||
# Limit compilation time for TPU V1
|
||||
more_args = "max_num_seqs=64"
|
||||
more_args = "max_model_len=2048,max_num_seqs=64"
|
||||
|
||||
# Add TP test (if provided)
|
||||
if TPU_TP_TEST_STR:
|
||||
|
||||
@ -24,7 +24,23 @@ GUIDED_DECODING_BACKENDS = ["outlines", "lm-format-enforcer", "xgrammar"]
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def server(zephyr_lora_files, zephyr_lora_added_tokens_files): # noqa: F811
|
||||
def monkeypatch_module():
|
||||
from _pytest.monkeypatch import MonkeyPatch
|
||||
mpatch = MonkeyPatch()
|
||||
yield mpatch
|
||||
mpatch.undo()
|
||||
|
||||
|
||||
@pytest.fixture(scope="module", params=[False, True])
|
||||
def server(
|
||||
request,
|
||||
monkeypatch_module,
|
||||
zephyr_lora_files, #noqa: F811
|
||||
zephyr_lora_added_tokens_files): # noqa: F811
|
||||
|
||||
use_v1 = request.param
|
||||
monkeypatch_module.setenv('VLLM_USE_V1', '1' if use_v1 else '0')
|
||||
|
||||
args = [
|
||||
# use half precision for speed and memory savings in CI environment
|
||||
"--dtype",
|
||||
@ -49,6 +65,13 @@ def server(zephyr_lora_files, zephyr_lora_added_tokens_files): # noqa: F811
|
||||
yield remote_server
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def is_v1_server(server):
|
||||
import os
|
||||
assert os.environ['VLLM_USE_V1'] in ['0', '1']
|
||||
return os.environ['VLLM_USE_V1'] == '1'
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
|
||||
async def client(server):
|
||||
async with server.get_async_client() as async_client:
|
||||
@ -471,8 +494,13 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
|
||||
async def test_guided_choice_chat(client: openai.AsyncOpenAI,
|
||||
is_v1_server: bool,
|
||||
guided_decoding_backend: str,
|
||||
sample_guided_choice):
|
||||
|
||||
if is_v1_server and guided_decoding_backend != 'xgrammar':
|
||||
pytest.skip("Only xgrammar backend is supported with V1")
|
||||
|
||||
messages = [{
|
||||
"role": "system",
|
||||
"content": "you are a helpful assistant"
|
||||
@ -511,9 +539,13 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI,
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
|
||||
async def test_guided_json_chat(client: openai.AsyncOpenAI,
|
||||
async def test_guided_json_chat(client: openai.AsyncOpenAI, is_v1_server: bool,
|
||||
guided_decoding_backend: str,
|
||||
sample_json_schema):
|
||||
|
||||
if is_v1_server:
|
||||
pytest.skip("sample_json_schema has features unsupported in V1")
|
||||
|
||||
messages = [{
|
||||
"role": "system",
|
||||
"content": "you are a helpful assistant"
|
||||
@ -559,7 +591,12 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI,
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
|
||||
async def test_guided_regex_chat(client: openai.AsyncOpenAI,
|
||||
is_v1_server: bool,
|
||||
guided_decoding_backend: str, sample_regex):
|
||||
|
||||
if is_v1_server and guided_decoding_backend != 'xgrammar':
|
||||
pytest.skip("Only xgrammar backend is supported with V1")
|
||||
|
||||
messages = [{
|
||||
"role": "system",
|
||||
"content": "you are a helpful assistant"
|
||||
@ -617,8 +654,13 @@ async def test_guided_decoding_type_error(client: openai.AsyncOpenAI):
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
|
||||
async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI,
|
||||
is_v1_server: bool,
|
||||
guided_decoding_backend: str,
|
||||
sample_guided_choice):
|
||||
|
||||
if is_v1_server and guided_decoding_backend != 'xgrammar':
|
||||
pytest.skip("Only xgrammar backend is supported with V1")
|
||||
|
||||
messages = [{
|
||||
"role": "system",
|
||||
"content": "you are a helpful assistant"
|
||||
@ -648,9 +690,13 @@ async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI,
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
|
||||
async def test_named_tool_use(client: openai.AsyncOpenAI,
|
||||
async def test_named_tool_use(client: openai.AsyncOpenAI, is_v1_server: bool,
|
||||
guided_decoding_backend: str,
|
||||
sample_json_schema):
|
||||
|
||||
if is_v1_server:
|
||||
pytest.skip("sample_json_schema has features unsupported on V1")
|
||||
|
||||
messages = [{
|
||||
"role": "system",
|
||||
"content": "you are a helpful assistant"
|
||||
@ -742,6 +788,10 @@ async def test_named_tool_use(client: openai.AsyncOpenAI,
|
||||
@pytest.mark.asyncio
|
||||
async def test_required_tool_use_not_yet_supported(client: openai.AsyncOpenAI,
|
||||
sample_json_schema):
|
||||
|
||||
if is_v1_server:
|
||||
pytest.skip("sample_json_schema has features unsupported on V1")
|
||||
|
||||
messages = [{
|
||||
"role": "system",
|
||||
"content": "you are a helpful assistant"
|
||||
@ -787,6 +837,10 @@ async def test_required_tool_use_not_yet_supported(client: openai.AsyncOpenAI,
|
||||
@pytest.mark.asyncio
|
||||
async def test_inconsistent_tool_choice_and_tools(client: openai.AsyncOpenAI,
|
||||
sample_json_schema):
|
||||
|
||||
if is_v1_server:
|
||||
pytest.skip("sample_json_schema has features unsupported on V1")
|
||||
|
||||
messages = [{
|
||||
"role": "system",
|
||||
"content": "you are a helpful assistant"
|
||||
|
||||
@ -6,12 +6,22 @@ import itertools
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.config import VllmConfig, set_current_vllm_config
|
||||
from vllm.model_executor.layers.activation import SiluAndMul
|
||||
from vllm.model_executor.layers.fused_moe import fused_moe
|
||||
from vllm.model_executor.layers.fused_moe.fused_moe import (
|
||||
deep_gemm_moe_fp8, fused_topk, moe_align_block_size)
|
||||
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
|
||||
per_token_group_quant_fp8, w8a8_block_fp8_matmul)
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
dg_available = False
|
||||
try:
|
||||
import deep_gemm
|
||||
dg_available = True
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
if current_platform.get_device_capability() < (9, 0):
|
||||
pytest.skip("FP8 Triton requires CUDA 9.0 or higher",
|
||||
allow_module_level=True)
|
||||
@ -21,17 +31,18 @@ DTYPES = [torch.bfloat16] # [torch.half, torch.bfloat16, torch.float32]
|
||||
NUM_TOKENS = [7, 83, 2048]
|
||||
D = [512, 4096, 5120, 13824]
|
||||
GROUP_SIZE = [64, 128, 256, 512]
|
||||
M = [1, 7, 83, 512, 2048]
|
||||
N = [128, 512, 1024, 4096, 7748, 13824]
|
||||
K = [256, 4096, 5120, 3884, 13824]
|
||||
M = [1, 7, 8, 83, 84, 512, 2048, 4096]
|
||||
N = [128, 512, 1024, 4096, 7168, 7748, 13824]
|
||||
K = [256, 4096, 5120, 3884, 13824, 16384]
|
||||
# Deepseek-V3's intermediate size 18432, so N is 18432*2/8=4608 at TP8
|
||||
# and its hidden size is 7168.
|
||||
M_moe = [1, 7, 83, 512, 2048]
|
||||
N_moe = [4608] # [128, 4608, 13824]
|
||||
K_moe = [7168] # [256, 7168, 13824]
|
||||
M_moe = [1, 2, 7, 83, 128, 512, 2048]
|
||||
M_moe_dg = [128, 192, 512, 1335, 2048]
|
||||
N_moe = [128, 256, 1024, 4608] # [13824]
|
||||
K_moe = [256, 512, 7168] # [13824]
|
||||
BLOCK_SIZE = [[128, 128]]
|
||||
E = [8, 24] # [8, 24, 128, 256]
|
||||
TOP_KS = [2] # [1, 2, 6]
|
||||
E = [2, 8, 16, 24] # [128, 256]
|
||||
TOP_KS = [1, 2, 6]
|
||||
OUT_DTYPES = [torch.bfloat16] # [torch.float32, torch.half, torch.bfloat16]
|
||||
SEEDS = [0]
|
||||
|
||||
@ -217,11 +228,16 @@ def test_w8a8_block_fp8_matmul(M, N, K, block_size, out_dtype, seed):
|
||||
SEEDS))
|
||||
@torch.inference_mode()
|
||||
def test_w8a8_block_fp8_fused_moe(M, N, K, E, topk, block_size, dtype, seed):
|
||||
if topk > E:
|
||||
pytest.skip(f"Skipping test; topk={topk} > E={E}")
|
||||
|
||||
torch.manual_seed(seed)
|
||||
factor_for_scale = 1e-2
|
||||
fp8_info = torch.finfo(torch.float8_e4m3fn)
|
||||
fp8_max, fp8_min = fp8_info.max, fp8_info.min
|
||||
|
||||
vllm_config = VllmConfig()
|
||||
|
||||
a = torch.randn((M, K), dtype=dtype) / 10
|
||||
|
||||
w1_bf16 = (torch.rand(
|
||||
@ -246,25 +262,240 @@ def test_w8a8_block_fp8_fused_moe(M, N, K, E, topk, block_size, dtype, seed):
|
||||
|
||||
score = torch.randn((M, E), dtype=dtype)
|
||||
|
||||
out = fused_moe(
|
||||
a,
|
||||
w1,
|
||||
w2,
|
||||
score,
|
||||
topk,
|
||||
renormalize=False,
|
||||
use_fp8_w8a8=True,
|
||||
w1_scale=w1_s,
|
||||
w2_scale=w2_s,
|
||||
block_shape=block_size,
|
||||
)
|
||||
ref_out = torch_w8a8_block_fp8_moe(a, w1, w2, w1_s, w2_s, score, topk,
|
||||
block_size)
|
||||
# Set the context to avoid lots of warning spam.
|
||||
with set_current_vllm_config(vllm_config):
|
||||
out = fused_moe(
|
||||
a,
|
||||
w1,
|
||||
w2,
|
||||
score,
|
||||
topk,
|
||||
renormalize=False,
|
||||
use_fp8_w8a8=True,
|
||||
w1_scale=w1_s,
|
||||
w2_scale=w2_s,
|
||||
block_shape=block_size,
|
||||
)
|
||||
ref_out = torch_w8a8_block_fp8_moe(a, w1, w2, w1_s, w2_s, score, topk,
|
||||
block_size)
|
||||
|
||||
print(f"{out.sum()=}")
|
||||
print(f"{ref_out.sum()=}")
|
||||
#print(f"{out.sum()=}")
|
||||
#print(f"{ref_out.sum()=}")
|
||||
|
||||
rel_diff = (torch.mean(
|
||||
torch.abs(out.to(torch.float32) - ref_out.to(torch.float32))) /
|
||||
torch.mean(torch.abs(ref_out.to(torch.float32))))
|
||||
assert rel_diff < 0.03
|
||||
|
||||
|
||||
def per_block_cast_to_fp8(
|
||||
x: torch.Tensor,
|
||||
block_size_n: int = 128) -> tuple[torch.Tensor, torch.Tensor]:
|
||||
assert x.dim() == 2
|
||||
m, n = x.shape
|
||||
x_padded = torch.zeros(
|
||||
(deep_gemm.ceil_div(m, 128) * 128,
|
||||
deep_gemm.ceil_div(n, block_size_n) * block_size_n),
|
||||
dtype=x.dtype,
|
||||
device=x.device)
|
||||
x_padded[:m, :n] = x
|
||||
x_view = x_padded.view(-1, 128, x_padded.size(1) // 128, block_size_n)
|
||||
x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4)
|
||||
x_scaled = (x_view * (448.0 / x_amax)).to(torch.float8_e4m3fn)
|
||||
x_scaled_sub = x_scaled.view_as(x_padded)[:m, :n].contiguous()
|
||||
scales = (x_amax / 448.0).view(x_view.size(0), x_view.size(2))
|
||||
return x_scaled_sub, scales
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"M,N,K,block_size,out_dtype,seed",
|
||||
itertools.product(M, N, K, BLOCK_SIZE, OUT_DTYPES, SEEDS))
|
||||
@torch.inference_mode()
|
||||
def test_w8a8_block_fp8_deep_gemm_matmul(M, N, K, block_size, out_dtype, seed):
|
||||
# only aligned sizes
|
||||
if M % 4 != 0 or K % 128 != 0 or N % 64 != 0:
|
||||
pytest.skip(f"Skipping test; invalid size {M}, {N}, {K}")
|
||||
|
||||
torch.manual_seed(seed)
|
||||
fp8_info = torch.finfo(torch.float8_e4m3fn)
|
||||
fp8_max = fp8_info.max
|
||||
|
||||
A_fp32 = (torch.rand(M, K, dtype=torch.float32) - 0.5) * 2 * fp8_max
|
||||
B_fp32 = (torch.rand(N, K, dtype=torch.float32) - 0.5) * 2 * fp8_max
|
||||
|
||||
_, block_k = block_size[0], block_size[1]
|
||||
|
||||
A_fp8, As_fp8 = per_token_group_quant_fp8(A_fp32, block_k)
|
||||
B_fp8, Bs_fp8 = per_block_cast_to_fp8(B_fp32)
|
||||
|
||||
As = As_fp8.to(torch.float32)
|
||||
Bs = Bs_fp8.to(torch.float32)
|
||||
|
||||
ref_out = native_w8a8_block_fp8_matmul(A_fp8, B_fp8, As, Bs, block_size,
|
||||
out_dtype)
|
||||
|
||||
# Transpose earlier so that the testing will not trigger transposing kernels
|
||||
As_fp8 = deep_gemm.get_col_major_tma_aligned_tensor(As_fp8)
|
||||
|
||||
out = torch.zeros((M, N), device='cuda', dtype=out_dtype)
|
||||
|
||||
assert As_fp8.shape == (M, (K + 127) //
|
||||
128), f"{As_fp8.shape} != {(M, (K + 127) // 128)}"
|
||||
|
||||
deep_gemm.gemm_fp8_fp8_bf16_nt((A_fp8, As_fp8), (B_fp8, Bs_fp8), out)
|
||||
|
||||
rel_diff = (torch.mean(
|
||||
torch.abs(out.to(torch.float32) - ref_out.to(torch.float32))) /
|
||||
torch.mean(torch.abs(ref_out.to(torch.float32))))
|
||||
assert rel_diff < 0.001
|
||||
|
||||
|
||||
def fp8_perm(m, idx):
|
||||
if torch.is_floating_point(m) and torch.finfo(m.dtype).bits == 8:
|
||||
return m.view(dtype=torch.uint8)[idx, ...].view(dtype=m.dtype)
|
||||
else:
|
||||
return m[idx, ...]
|
||||
|
||||
|
||||
def test_moe_permute(a, a_s, topk_ids, num_groups, topk, block_m):
|
||||
M, K = a.shape
|
||||
|
||||
sorted_token_ids, m_indices, num_pad = moe_align_block_size(
|
||||
topk_ids, block_m, num_groups, None, pad_sorted_ids=True)
|
||||
|
||||
num_tokens = topk * M
|
||||
|
||||
sorted_token_ids = sorted_token_ids.clamp(max=num_tokens - 1)
|
||||
m_indices = torch.repeat_interleave(m_indices, block_m, dim=0)
|
||||
inv_perm = torch.argsort(sorted_token_ids)[:M * topk]
|
||||
|
||||
a = fp8_perm(a, sorted_token_ids // topk)
|
||||
if a_s is not None:
|
||||
a_s = a_s[sorted_token_ids // topk]
|
||||
|
||||
return a, a_s, m_indices, inv_perm
|
||||
|
||||
|
||||
def test_moe_unpermute(out, inv_perm, topk, K, topk_weight):
|
||||
M = topk_weight.shape[0]
|
||||
out = out[inv_perm, ...]
|
||||
tmp_out = out.view(-1, topk, K)
|
||||
return (tmp_out * topk_weight.view(M, -1, 1).to(out.dtype)).sum(dim=1)
|
||||
|
||||
|
||||
def deep_gemm_w8a8_block_fp8_moe(M, K, a, w1, w2, w1_s, w2_s, score, topk,
|
||||
block_shape):
|
||||
"""Fused moe with block-wise quantization using DeepGemm grouped gemm."""
|
||||
num_groups = w1.shape[0]
|
||||
M, K = a.shape
|
||||
N = w2.shape[-1]
|
||||
|
||||
topk_weight, topk_ids = fused_topk(a, score.float(), topk, False)
|
||||
|
||||
block_m = deep_gemm.get_m_alignment_for_contiguous_layout()
|
||||
|
||||
_, block_k = block_shape[0], block_shape[1]
|
||||
|
||||
a_q, a_s = per_token_group_quant_fp8(a, block_m)
|
||||
|
||||
a_q, a_s, m_indices, inv_perm = test_moe_permute(a_q, a_s, topk_ids,
|
||||
num_groups, topk, block_m)
|
||||
|
||||
inter_out = torch.zeros((a_q.shape[0], N * 2),
|
||||
dtype=torch.bfloat16,
|
||||
device=a.device)
|
||||
|
||||
deep_gemm.m_grouped_gemm_fp8_fp8_bf16_nt_contiguous((a_q, a_s), (w1, w1_s),
|
||||
inter_out, m_indices)
|
||||
|
||||
act_out = SiluAndMul().forward_native(inter_out)
|
||||
act_out_q, act_out_s = per_token_group_quant_fp8(act_out, block_k)
|
||||
|
||||
out = torch.zeros(a_q.shape[0], K, dtype=torch.bfloat16, device=a.device)
|
||||
|
||||
deep_gemm.m_grouped_gemm_fp8_fp8_bf16_nt_contiguous(
|
||||
(act_out_q, act_out_s), (w2, w2_s), out, m_indices)
|
||||
|
||||
final_out = test_moe_unpermute(out, inv_perm, topk, K, topk_weight)
|
||||
|
||||
return final_out
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"M,N,K,E,topk,seed",
|
||||
itertools.product(M_moe_dg, N_moe, K_moe, E, TOP_KS, SEEDS))
|
||||
@pytest.mark.skipif(not dg_available, reason="DeepGemm kernels not available.")
|
||||
@torch.inference_mode()
|
||||
def test_w8a8_block_fp8_deep_gemm_fused_moe(M, N, K, E, topk, seed):
|
||||
|
||||
block_m = deep_gemm.get_m_alignment_for_contiguous_layout()
|
||||
block_size = [block_m, block_m]
|
||||
dtype = torch.bfloat16
|
||||
|
||||
# only aligned sizes
|
||||
if (N % block_m != 0 or K % block_m != 0 or topk > E):
|
||||
pytest.skip(
|
||||
f"Skipping test; bad size m={M}, n={N}, k={K}, topk={topk}, E={E}")
|
||||
|
||||
if (N <= 512):
|
||||
pytest.skip("Skipping N <= 512 until performance issues solved.")
|
||||
|
||||
vllm_config = VllmConfig()
|
||||
|
||||
torch.manual_seed(seed)
|
||||
fp8_info = torch.finfo(torch.float8_e4m3fn)
|
||||
fp8_max, fp8_min = fp8_info.max, fp8_info.min
|
||||
|
||||
a = torch.randn((M, K), dtype=dtype) / 10
|
||||
|
||||
w1_bf16 = ((torch.rand((E, 2 * N, K), dtype=torch.bfloat16) - 0.5) * 2 *
|
||||
fp8_max).clamp(min=fp8_min, max=fp8_max)
|
||||
|
||||
w2_bf16 = ((torch.rand((E, K, N), dtype=torch.bfloat16) - 0.5) * 2 *
|
||||
fp8_max).clamp(min=fp8_min, max=fp8_max)
|
||||
|
||||
score = torch.randn((M, E), dtype=dtype)
|
||||
|
||||
block_n, block_k = block_size[0], block_size[1]
|
||||
n_tiles_w1 = ((2 * N) + block_n - 1) // block_n
|
||||
k_tiles_w1 = (K + block_k - 1) // block_k
|
||||
n_tiles_w2 = (K + block_n - 1) // block_n
|
||||
k_tiles_w2 = (N + block_k - 1) // block_k
|
||||
|
||||
w1 = torch.empty_like(w1_bf16, dtype=torch.float8_e4m3fn)
|
||||
w2 = torch.empty_like(w2_bf16, dtype=torch.float8_e4m3fn)
|
||||
|
||||
w1_s = torch.empty((E, n_tiles_w1, k_tiles_w1), dtype=torch.float32)
|
||||
w2_s = torch.empty((E, n_tiles_w2, k_tiles_w2), dtype=torch.float32)
|
||||
|
||||
w1_s = deep_gemm.get_col_major_tma_aligned_tensor(w1_s).contiguous()
|
||||
w2_s = deep_gemm.get_col_major_tma_aligned_tensor(w2_s).contiguous()
|
||||
|
||||
assert w1_s.shape == (E, (2 * N + 127) // 128, (K + 127) // 128)
|
||||
assert (w2.shape[-2] + block_n - 1) // block_n == w2_s.shape[-2]
|
||||
|
||||
for i in range(E):
|
||||
w1[i], w1_s[i] = per_block_cast_to_fp8(w1_bf16[i])
|
||||
w2[i], w2_s[i] = per_block_cast_to_fp8(w2_bf16[i])
|
||||
|
||||
# Set the context to avoid lots of warning spam.
|
||||
with set_current_vllm_config(vllm_config):
|
||||
if M >= 128:
|
||||
ref_out = deep_gemm_w8a8_block_fp8_moe(M, K, a, w1, w2, w1_s, w2_s,
|
||||
score, topk, block_size)
|
||||
else:
|
||||
ref_out = torch_w8a8_block_fp8_moe(a, w1, w2, w1_s, w2_s, score,
|
||||
topk, block_size)
|
||||
|
||||
topk_weights, topk_ids = fused_topk(a, score.float(), topk, False)
|
||||
|
||||
out = deep_gemm_moe_fp8(a, w1, w2, w1_s, w2_s, topk_weights, topk_ids)
|
||||
|
||||
#print(f"{out.sum()=}")
|
||||
#print(f"{ref_out.sum()=}")
|
||||
|
||||
rel_diff = (torch.mean(
|
||||
torch.abs(out.to(torch.float32) - ref_out.to(torch.float32))) /
|
||||
torch.mean(torch.abs(ref_out.to(torch.float32))))
|
||||
|
||||
assert rel_diff < 0.03
|
||||
|
||||
@ -40,14 +40,6 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
|
||||
return generated_texts
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def v1(run_with_both_engines_lora):
|
||||
# Simple autouse wrapper to run both engines for each test
|
||||
# This can be promoted up to conftest.py to run for every
|
||||
# test in a package
|
||||
pass
|
||||
|
||||
|
||||
def test_baichuan_lora(baichuan_lora_files):
|
||||
llm = vllm.LLM(MODEL_PATH,
|
||||
max_model_len=1024,
|
||||
|
||||
@ -18,6 +18,14 @@ EXPECTED_LORA_OUTPUT = [
|
||||
]
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def v1(run_with_both_engines_lora):
|
||||
# Simple autouse wrapper to run both engines for each test
|
||||
# This can be promoted up to conftest.py to run for every
|
||||
# test in a package
|
||||
pass
|
||||
|
||||
|
||||
def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
|
||||
prompts = [
|
||||
PROMPT_TEMPLATE.format(query="How many singers do we have?"),
|
||||
@ -46,14 +54,6 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
|
||||
return generated_texts
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def v1(run_with_both_engines_lora):
|
||||
# Simple autouse wrapper to run both engines for each test
|
||||
# This can be promoted up to conftest.py to run for every
|
||||
# test in a package
|
||||
pass
|
||||
|
||||
|
||||
@create_new_process_for_each_test()
|
||||
def test_chatglm3_lora(chatglm3_lora_files):
|
||||
llm = vllm.LLM(MODEL_PATH,
|
||||
|
||||
@ -1,65 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
import pytest
|
||||
|
||||
import vllm
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
MODEL_PATH = "google/gemma-7b"
|
||||
|
||||
|
||||
def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
|
||||
prompts = [
|
||||
"Quote: Imagination is",
|
||||
"Quote: Be yourself;",
|
||||
"Quote: Painting is poetry that is seen rather than felt,",
|
||||
]
|
||||
sampling_params = vllm.SamplingParams(temperature=0, max_tokens=32)
|
||||
outputs = llm.generate(
|
||||
prompts,
|
||||
sampling_params,
|
||||
lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
|
||||
if lora_id else None)
|
||||
# Print the outputs.
|
||||
generated_texts: list[str] = []
|
||||
for output in outputs:
|
||||
prompt = output.prompt
|
||||
generated_text = output.outputs[0].text.strip()
|
||||
generated_texts.append(generated_text)
|
||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||
return generated_texts
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def v1(run_with_both_engines_lora):
|
||||
# Simple autouse wrapper to run both engines for each test
|
||||
# This can be promoted up to conftest.py to run for every
|
||||
# test in a package
|
||||
pass
|
||||
|
||||
|
||||
# The V1 lora test for this model requires more than 24GB.
|
||||
@pytest.mark.skip_v1
|
||||
@pytest.mark.xfail(current_platform.is_rocm(),
|
||||
reason="There can be output mismatch on ROCm")
|
||||
def test_gemma_lora(gemma_lora_files):
|
||||
llm = vllm.LLM(MODEL_PATH,
|
||||
max_model_len=1024,
|
||||
enable_lora=True,
|
||||
max_loras=4,
|
||||
enable_chunked_prefill=True)
|
||||
|
||||
expected_lora_output = [
|
||||
"more important than knowledge.\nAuthor: Albert Einstein\n",
|
||||
"everyone else is already taken.\nAuthor: Oscar Wilde\n",
|
||||
"and poetry is painting that is felt rather than seen.\n"
|
||||
"Author: Leonardo da Vinci\n",
|
||||
]
|
||||
|
||||
output1 = do_sample(llm, gemma_lora_files, lora_id=1)
|
||||
for i in range(len(expected_lora_output)):
|
||||
assert output1[i].startswith(expected_lora_output[i])
|
||||
output2 = do_sample(llm, gemma_lora_files, lora_id=2)
|
||||
for i in range(len(expected_lora_output)):
|
||||
assert output2[i].startswith(expected_lora_output[i])
|
||||
@ -1,6 +1,5 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
import importlib
|
||||
import random
|
||||
from copy import deepcopy
|
||||
from dataclasses import dataclass
|
||||
@ -20,7 +19,6 @@ from vllm.lora.fully_sharded_layers import (
|
||||
# yapf conflicts with isort for this block
|
||||
# yapf: disable
|
||||
from vllm.lora.layers import (BaseLayerWithLoRA, ColumnParallelLinearWithLoRA,
|
||||
LinearScalingRotaryEmbeddingWithLoRA,
|
||||
LogitsProcessorWithLoRA, LoRAMapping,
|
||||
MergedColumnParallelLinearWithLoRA,
|
||||
MergedQKVParallelLinearWithLoRA,
|
||||
@ -29,8 +27,7 @@ from vllm.lora.layers import (BaseLayerWithLoRA, ColumnParallelLinearWithLoRA,
|
||||
RowParallelLinearWithLoRA,
|
||||
VocabParallelEmbeddingWithLoRA)
|
||||
# yapf: enable
|
||||
from vllm.lora.models import (LongContextLoRAContext, LoRALayerWeights,
|
||||
PackedLoRALayerWeights)
|
||||
from vllm.lora.models import LoRALayerWeights, PackedLoRALayerWeights
|
||||
from vllm.lora.punica_wrapper import get_punica_wrapper
|
||||
from vllm.model_executor.layers.linear import (ColumnParallelLinear,
|
||||
MergedColumnParallelLinear,
|
||||
@ -38,7 +35,6 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear,
|
||||
ReplicatedLinear,
|
||||
RowParallelLinear)
|
||||
from vllm.model_executor.layers.logits_processor import LogitsProcessor
|
||||
from vllm.model_executor.layers.rotary_embedding import get_rope
|
||||
from vllm.model_executor.layers.vocab_parallel_embedding import (
|
||||
ParallelLMHead, VocabParallelEmbedding, get_masked_input_and_mask)
|
||||
from vllm.model_executor.utils import set_random_seed
|
||||
@ -60,32 +56,16 @@ DEVICES = ([
|
||||
f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
|
||||
] if current_platform.is_cuda_alike() else ["cpu"])
|
||||
|
||||
#For GPU, we will launch different triton kernels between the prefill and decode
|
||||
# stages, so we need to verify this. prefill stage(True) or decode stage(False)
|
||||
# prefill stage(True) or decode stage(False)
|
||||
STAGES = [True, False]
|
||||
|
||||
# With the inclusion of V1 tests (look at the run_with_both_engines_lora),
|
||||
# the tests in this file run twice, once with the V0 engine and then with
|
||||
# the V1 engine.
|
||||
# The NUM_RANDOM_SEEDS value was set to 10 before. It is cut to half
|
||||
# with the inclusion of V1 tests to maintain the CI test times.
|
||||
NUM_RANDOM_SEEDS = 5
|
||||
# The VOCAB_PARALLEL_EMBEDDING_TEST_NUM_RANDOM_SEEDS value was set to
|
||||
# 256 before. It is cut to half with the inclusion of V1 tests to maintain
|
||||
# the CI test times.
|
||||
NUM_RANDOM_SEEDS = 10
|
||||
|
||||
VOCAB_PARALLEL_EMBEDDING_TEST_NUM_RANDOM_SEEDS = 128
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def v1(run_with_both_engines_lora):
|
||||
# Simple autouse wrapper to run both engines for each test
|
||||
# This can be promoted up to conftest.py to run for every
|
||||
# test in a package
|
||||
|
||||
# Reload punica_gpu as the kernels used are tied to engine type.
|
||||
from vllm.lora.punica_wrapper import punica_gpu
|
||||
importlib.reload(punica_gpu)
|
||||
|
||||
def clean_cache():
|
||||
# Release any memory we might be holding on to. CI runs OOMs otherwise.
|
||||
from vllm.lora.ops.triton_ops.utils import (_LORA_A_PTR_DICT,
|
||||
_LORA_B_PTR_DICT)
|
||||
@ -95,6 +75,24 @@ def v1(run_with_both_engines_lora):
|
||||
yield
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def skip_cuda_with_stage_false(request):
|
||||
"""
|
||||
On cuda-like platforms, we use the same kernels for prefill and decode
|
||||
stage, and 'stage' is generally ignored, so we only need to test once.
|
||||
"""
|
||||
if current_platform.is_cuda_alike():
|
||||
try:
|
||||
if hasattr(request.node, "callspec") and hasattr(
|
||||
request.node.callspec, "params"):
|
||||
params = request.node.callspec.params
|
||||
if "stage" in params and params["stage"] is False:
|
||||
pytest.skip("Skip test when stage=False")
|
||||
except Exception:
|
||||
pass
|
||||
yield
|
||||
|
||||
|
||||
def get_random_id_to_index(num_loras: int,
|
||||
num_slots: int,
|
||||
log: bool = True) -> list[Optional[int]]:
|
||||
@ -1016,103 +1014,6 @@ def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard,
|
||||
atol=atol)
|
||||
|
||||
|
||||
@torch.inference_mode()
|
||||
@pytest.mark.parametrize("num_loras", [1, 8])
|
||||
@pytest.mark.parametrize("device", ["cuda"])
|
||||
@pytest.mark.parametrize("scaling_factors", [(1.0, ), (4.0, ), (4.0, 8.0),
|
||||
(6.0, 1.0)])
|
||||
@pytest.mark.parametrize("max_position", [11, 4096, 32768])
|
||||
@pytest.mark.parametrize("is_neox_style", [True, False])
|
||||
@pytest.mark.parametrize("rotary_dim", [None, 32])
|
||||
@pytest.mark.parametrize("head_size", [32, 108])
|
||||
@pytest.mark.parametrize("seq_len", [11, 1024])
|
||||
@pytest.mark.skipif(not current_platform.is_cuda_alike(),
|
||||
reason="Only CUDA backends are supported")
|
||||
def test_rotary_embedding_long_context(dist_init, num_loras, device,
|
||||
scaling_factors, max_position,
|
||||
is_neox_style, rotary_dim, head_size,
|
||||
seq_len) -> None:
|
||||
dtype = torch.float16
|
||||
max_loras = 8
|
||||
seed = 0
|
||||
current_platform.seed_everything(seed)
|
||||
torch.set_default_device(device)
|
||||
punica_wrapper = get_punica_wrapper(8192, 256, device, max_loras=max_loras)
|
||||
assert check_punica_wrapper(punica_wrapper)
|
||||
lora_config = LoRAConfig(max_loras=max_loras,
|
||||
max_lora_rank=8,
|
||||
long_lora_scaling_factors=scaling_factors,
|
||||
lora_dtype=dtype)
|
||||
|
||||
if rotary_dim is None:
|
||||
rotary_dim = head_size
|
||||
base = 10000
|
||||
batch_size = 5 * num_loras
|
||||
num_heads = 7
|
||||
|
||||
# Verify lora is equivalent to linear scaling rotary embedding.
|
||||
rope = get_rope(
|
||||
head_size,
|
||||
rotary_dim,
|
||||
max_position,
|
||||
base,
|
||||
is_neox_style,
|
||||
)
|
||||
lora_rope = LinearScalingRotaryEmbeddingWithLoRA(rope)
|
||||
lora_rope.set_mapping(punica_wrapper)
|
||||
lora_rope.create_lora_weights(max_loras, lora_config)
|
||||
linear_rope = get_rope(head_size, rotary_dim, max_position, base,
|
||||
is_neox_style, {
|
||||
"rope_type": "linear",
|
||||
"factor": scaling_factors
|
||||
})
|
||||
linear_rope = linear_rope.to(dtype=dtype)
|
||||
id_to_index = get_random_id_to_index(num_loras, max_loras)
|
||||
_, index_mapping, prompt_mapping = create_random_inputs(
|
||||
active_lora_ids=[0],
|
||||
num_inputs=batch_size,
|
||||
input_size=(1, max_position),
|
||||
input_range=(0, lora_config.lora_extra_vocab_size),
|
||||
input_type=torch.float16,
|
||||
device=device)
|
||||
|
||||
lora_mapping = LoRAMapping(index_mapping, prompt_mapping)
|
||||
long_lora_context = LongContextLoRAContext(list(scaling_factors),
|
||||
rotary_dim)
|
||||
|
||||
next_expected_offset = 0
|
||||
# Make sure the offset is correct.
|
||||
scaling_factor_to_offset = lora_rope.scaling_factor_to_offset
|
||||
for scaling_factor, offset in scaling_factor_to_offset.items():
|
||||
assert offset == next_expected_offset
|
||||
next_expected_offset += scaling_factor * max_position
|
||||
|
||||
for i in range(len(scaling_factors)):
|
||||
long_lora_context.offsets_by_lora_id[i] = scaling_factor_to_offset.get(
|
||||
scaling_factors[i], 0)
|
||||
punica_wrapper.update_metadata(
|
||||
lora_mapping,
|
||||
id_to_index,
|
||||
max_loras,
|
||||
512,
|
||||
lora_config.lora_extra_vocab_size,
|
||||
long_lora_context=long_lora_context,
|
||||
)
|
||||
# lora_rope.set_mapping(*mapping_info)
|
||||
|
||||
positions = torch.randint(0, max_position, (batch_size, seq_len))
|
||||
query = torch.randn(batch_size,
|
||||
seq_len,
|
||||
num_heads * head_size,
|
||||
dtype=dtype)
|
||||
key = torch.randn_like(query)
|
||||
ref_q, ref_k = linear_rope(positions, query, key)
|
||||
actual_q, actual_k = lora_rope(positions, query, key)
|
||||
|
||||
torch.allclose(ref_q, actual_q)
|
||||
torch.allclose(ref_k, actual_k)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("tp_size", [1, 2, 4, 8])
|
||||
@pytest.mark.parametrize(
|
||||
"seed", list(range(VOCAB_PARALLEL_EMBEDDING_TEST_NUM_RANDOM_SEEDS)))
|
||||
|
||||
@ -28,6 +28,14 @@ EXPECTED_LORA_OUTPUT = [
|
||||
]
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def v1(run_with_both_engines_lora):
|
||||
# Simple autouse wrapper to run both engines for each test
|
||||
# This can be promoted up to conftest.py to run for every
|
||||
# test in a package
|
||||
pass
|
||||
|
||||
|
||||
def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
|
||||
prompts = [
|
||||
"[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501
|
||||
@ -71,16 +79,6 @@ def generate_and_test(llm, sql_lora_files):
|
||||
print("removing lora")
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def v1(run_with_both_engines_lora):
|
||||
# Simple autouse wrapper to run both engines for each test
|
||||
# This can be promoted up to conftest.py to run for every
|
||||
# test in a package
|
||||
pass
|
||||
|
||||
|
||||
# V1 Test: Failing due to numerics on V1.
|
||||
@pytest.mark.skip_v1
|
||||
@create_new_process_for_each_test()
|
||||
def test_llama_lora(sql_lora_files):
|
||||
|
||||
@ -126,8 +124,6 @@ def test_llama_lora_warmup(sql_lora_files):
|
||||
"less when using lora than when not using lora")
|
||||
|
||||
|
||||
# V1 Test: Failing due to numerics on V1.
|
||||
@pytest.mark.skip_v1
|
||||
@multi_gpu_test(num_gpus=4)
|
||||
@create_new_process_for_each_test()
|
||||
def test_llama_lora_tp4(sql_lora_files):
|
||||
|
||||
@ -7,7 +7,6 @@ import torch
|
||||
from safetensors.torch import load_file
|
||||
from torch import nn
|
||||
|
||||
from vllm import envs
|
||||
from vllm.config import LoRAConfig
|
||||
from vllm.lora.layers import (ColumnParallelLinearWithLoRA,
|
||||
MergedColumnParallelLinearWithLoRA,
|
||||
@ -33,6 +32,17 @@ DEVICES = ([
|
||||
] if current_platform.is_cuda_alike() else ["cpu"])
|
||||
|
||||
|
||||
@pytest.fixture(scope="function", autouse=True)
|
||||
def use_v0_only(monkeypatch: pytest.MonkeyPatch):
|
||||
"""
|
||||
Some tests depend on V0 internals. Since both V0 and V1 use the same
|
||||
LoRAModelManager it is okay to just test V0.
|
||||
"""
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv('VLLM_USE_V1', '0')
|
||||
yield
|
||||
|
||||
|
||||
@pytest.mark.parametrize("device", DEVICES)
|
||||
def test_from_lora_tensors(sql_lora_files, device):
|
||||
tensors = load_file(
|
||||
@ -411,7 +421,6 @@ def test_lru_lora_model_manager(dist_init, dummy_model, device):
|
||||
assert manager.device == device
|
||||
|
||||
|
||||
@pytest.mark.skipif(envs.VLLM_USE_V1, reason="Test leverages V0 internals.")
|
||||
@pytest.mark.parametrize("device", DEVICES)
|
||||
def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings,
|
||||
sql_lora_files, device):
|
||||
@ -491,7 +500,6 @@ def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings,
|
||||
device)
|
||||
|
||||
|
||||
@pytest.mark.skipif(envs.VLLM_USE_V1, reason="Test leverages V0 internals.")
|
||||
@pytest.mark.parametrize("device", DEVICES)
|
||||
def test_worker_adapter_manager(llama_2_7b_model_extra_embeddings,
|
||||
sql_lora_files, device):
|
||||
|
||||
@ -78,6 +78,8 @@ def test_minicpmv_lora(minicpmv_lora_files):
|
||||
assert EXPECTED_OUTPUT[i].startswith(output2[i])
|
||||
|
||||
|
||||
@pytest.mark.skipif(current_platform.is_cuda_alike(),
|
||||
reason="Skipping to avoid redundant model tests")
|
||||
@pytest.mark.xfail(
|
||||
current_platform.is_rocm(),
|
||||
reason="MiniCPM-V dependency xformers incompatible with ROCm")
|
||||
@ -99,6 +101,8 @@ def test_minicpmv_tp4_wo_fully_sharded_loras(minicpmv_lora_files):
|
||||
assert EXPECTED_OUTPUT[i].startswith(output_tp[i])
|
||||
|
||||
|
||||
@pytest.mark.skipif(current_platform.is_cuda_alike(),
|
||||
reason="Skipping to avoid redundant model tests")
|
||||
@pytest.mark.xfail(
|
||||
current_platform.is_rocm(),
|
||||
reason="MiniCPM-V dependency xformers incompatible with ROCm")
|
||||
|
||||
@ -10,6 +10,14 @@ MODEL_PATH = "microsoft/phi-2"
|
||||
PROMPT_TEMPLATE = "### Instruct: {sql_prompt}\n\n### Context: {context}\n\n### Output:" # noqa: E501
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def v1(run_with_both_engines_lora):
|
||||
# Simple autouse wrapper to run both engines for each test
|
||||
# This can be promoted up to conftest.py to run for every
|
||||
# test in a package
|
||||
pass
|
||||
|
||||
|
||||
def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
|
||||
prompts = [
|
||||
PROMPT_TEMPLATE.format(
|
||||
@ -48,14 +56,6 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
|
||||
return generated_texts
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def v1(run_with_both_engines_lora):
|
||||
# Simple autouse wrapper to run both engines for each test
|
||||
# This can be promoted up to conftest.py to run for every
|
||||
# test in a package
|
||||
pass
|
||||
|
||||
|
||||
# Skipping for V1 for now as we are hitting,
|
||||
# "Head size 80 is not supported by FlashAttention." error.
|
||||
@pytest.mark.skip_v1
|
||||
|
||||
@ -37,6 +37,14 @@ else:
|
||||
]
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def v1(run_with_both_engines_lora):
|
||||
# Simple autouse wrapper to run both engines for each test
|
||||
# This can be promoted up to conftest.py to run for every
|
||||
# test in a package
|
||||
pass
|
||||
|
||||
|
||||
def do_sample(llm: vllm.LLM,
|
||||
lora_path: str,
|
||||
lora_id: int,
|
||||
@ -69,14 +77,6 @@ def do_sample(llm: vllm.LLM,
|
||||
return generated_texts
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def v1(run_with_both_engines_lora):
|
||||
# Simple autouse wrapper to run both engines for each test
|
||||
# This can be promoted up to conftest.py to run for every
|
||||
# test in a package
|
||||
pass
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("tp_size", [1])
|
||||
def test_quant_model_lora(tinyllama_lora_files, num_gpus_available, model,
|
||||
|
||||
@ -1,7 +1,5 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
import pytest
|
||||
|
||||
import vllm
|
||||
from vllm.lora.request import LoRARequest
|
||||
|
||||
@ -46,15 +44,6 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
|
||||
return generated_texts
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def v1(run_with_both_engines_lora):
|
||||
# Simple autouse wrapper to run both engines for each test
|
||||
# This can be promoted up to conftest.py to run for every
|
||||
# test in a package
|
||||
pass
|
||||
|
||||
|
||||
@pytest.mark.skip_v1
|
||||
@create_new_process_for_each_test()
|
||||
def test_ilama_lora(ilama_lora_files):
|
||||
llm = vllm.LLM(MODEL_PATH,
|
||||
@ -74,7 +63,6 @@ def test_ilama_lora(ilama_lora_files):
|
||||
assert output2[i] == EXPECTED_LORA_OUTPUT[i]
|
||||
|
||||
|
||||
@pytest.mark.skip_v1
|
||||
@multi_gpu_test(num_gpus=4)
|
||||
@create_new_process_for_each_test()
|
||||
def test_ilama_lora_tp4(ilama_lora_files):
|
||||
@ -96,7 +84,6 @@ def test_ilama_lora_tp4(ilama_lora_files):
|
||||
assert output2[i] == EXPECTED_LORA_OUTPUT[i]
|
||||
|
||||
|
||||
@pytest.mark.skip_v1
|
||||
@multi_gpu_test(num_gpus=4)
|
||||
@create_new_process_for_each_test()
|
||||
def test_ilama_lora_tp4_fully_sharded_loras(ilama_lora_files):
|
||||
|
||||
@ -158,6 +158,20 @@ VLM_TEST_SETTINGS = {
|
||||
max_tokens=64,
|
||||
marks=[large_gpu_mark(min_gb=64)],
|
||||
),
|
||||
"aya_vision": VLMTestInfo(
|
||||
models=["CohereForAI/aya-vision-8b"],
|
||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||
prompt_formatter=lambda img_prompt: f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{img_prompt}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>", # noqa: E501
|
||||
single_image_prompts=IMAGE_ASSETS.prompts({
|
||||
"stop_sign": "<image>What's the content in the center of the image?", # noqa: E501
|
||||
"cherry_blossom": "<image>What is the season?", # noqa: E501
|
||||
}),
|
||||
multi_image_prompt="<image><image>Describe the two images in detail.", # noqa: E501
|
||||
max_model_len=8192,
|
||||
max_num_seqs=2,
|
||||
auto_cls=AutoModelForImageTextToText,
|
||||
vllm_runner_kwargs={"mm_processor_kwargs": {"crop_to_patches": True}}
|
||||
),
|
||||
"blip2": VLMTestInfo(
|
||||
# TODO: Change back to 2.7b once head_dim = 80 is supported
|
||||
models=["Salesforce/blip2-opt-6.7b"],
|
||||
|
||||
70
tests/models/embedding/language/test_jina_reranker_v2.py
Normal file
70
tests/models/embedding/language/test_jina_reranker_v2.py
Normal file
@ -0,0 +1,70 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# ruff: noqa: E501
|
||||
"""Compare the scoring outputs of HF and vLLM models.
|
||||
|
||||
Run `pytest tests/models/embedding/language/test_jina_reranker_v2.py`.
|
||||
"""
|
||||
import math
|
||||
|
||||
import pytest
|
||||
|
||||
MODELS = [
|
||||
"jinaai/jina-reranker-v2-base-multilingual", # Roberta
|
||||
]
|
||||
|
||||
TEXTS_1 = ["Organic skincare products for sensitive skin"]
|
||||
|
||||
TEXTS_2 = [
|
||||
"Organic skincare for sensitive skin with aloe vera and chamomile.",
|
||||
"New makeup trends focus on bold colors and innovative techniques",
|
||||
"Bio-Hautpflege für empfindliche Haut mit Aloe Vera und Kamille",
|
||||
"Neue Make-up-Trends setzen auf kräftige Farben und innovative Techniken",
|
||||
"Cuidado de la piel orgánico para piel sensible con aloe vera y manzanilla",
|
||||
"Las nuevas tendencias de maquillaje se centran en colores vivos y técnicas innovadoras",
|
||||
"针对敏感肌专门设计的天然有机护肤产品",
|
||||
"新的化妆趋势注重鲜艳的颜色和创新的技巧",
|
||||
"敏感肌のために特別に設計された天然有機スキンケア製品",
|
||||
"新しいメイクのトレンドは鮮やかな色と革新的な技術に焦点を当てています",
|
||||
]
|
||||
|
||||
|
||||
@pytest.fixture(scope="module", params=MODELS)
|
||||
def model_name(request):
|
||||
yield request.param
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
def test_llm_1_to_1(vllm_runner, hf_runner, model_name, dtype: str):
|
||||
|
||||
text_pair = [TEXTS_1[0], TEXTS_2[0]]
|
||||
|
||||
with hf_runner(model_name, dtype=dtype, is_cross_encoder=True) as hf_model:
|
||||
hf_outputs = hf_model.predict([text_pair]).tolist()
|
||||
|
||||
with vllm_runner(model_name, task="score", dtype=dtype,
|
||||
max_model_len=None) as vllm_model:
|
||||
vllm_outputs = vllm_model.score(text_pair[0], text_pair[1])
|
||||
|
||||
assert len(vllm_outputs) == 1
|
||||
assert len(hf_outputs) == 1
|
||||
|
||||
assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
def test_llm_1_to_N(vllm_runner, hf_runner, model_name, dtype: str):
|
||||
|
||||
text_pairs = [[TEXTS_1[0], text] for text in TEXTS_2]
|
||||
|
||||
with hf_runner(model_name, dtype=dtype, is_cross_encoder=True) as hf_model:
|
||||
hf_outputs = hf_model.predict(text_pairs).tolist()
|
||||
|
||||
with vllm_runner(model_name, task="score", dtype=dtype,
|
||||
max_model_len=None) as vllm_model:
|
||||
vllm_outputs = vllm_model.score(TEXTS_1[0], TEXTS_2)
|
||||
|
||||
assert len(vllm_outputs) == 10
|
||||
assert len(hf_outputs) == 10
|
||||
|
||||
assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01)
|
||||
assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01)
|
||||
@ -212,7 +212,7 @@ def _run_test(
|
||||
with vllm_runner(model,
|
||||
dtype=dtype,
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
max_num_seqs=3,
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
limit_mm_per_prompt={"image": _LIMIT_IMAGE_PER_PROMPT
|
||||
|
||||
@ -246,6 +246,7 @@ def _test_processing_correctness_mistral(
|
||||
# yapf: disable
|
||||
@pytest.mark.parametrize("model_id", [
|
||||
"rhymes-ai/Aria",
|
||||
"CohereForAI/aya-vision-8b",
|
||||
"Salesforce/blip2-opt-2.7b",
|
||||
"facebook/chameleon-7b",
|
||||
"deepseek-ai/deepseek-vl2-tiny",
|
||||
|
||||
@ -259,6 +259,7 @@ _CROSS_ENCODER_EXAMPLE_MODELS = {
|
||||
_MULTIMODAL_EXAMPLE_MODELS = {
|
||||
# [Decoder-only]
|
||||
"AriaForConditionalGeneration": _HfExamplesInfo("rhymes-ai/Aria"),
|
||||
"AyaVisionForConditionalGeneration": _HfExamplesInfo("CohereForAI/aya-vision-8b"), # noqa: E501
|
||||
"Blip2ForConditionalGeneration": _HfExamplesInfo("Salesforce/blip2-opt-2.7b", # noqa: E501
|
||||
extras={"6b": "Salesforce/blip2-opt-6.7b"}), # noqa: E501
|
||||
"ChameleonForConditionalGeneration": _HfExamplesInfo("facebook/chameleon-7b"), # noqa: E501
|
||||
@ -297,6 +298,9 @@ _MULTIMODAL_EXAMPLE_MODELS = {
|
||||
"MiniCPMV": _HfExamplesInfo("openbmb/MiniCPM-Llama3-V-2_5",
|
||||
extras={"2.6": "openbmb/MiniCPM-V-2_6"}, # noqa: E501
|
||||
trust_remote_code=True),
|
||||
"Mistral3ForConditionalGeneration": _HfExamplesInfo("mistralai/Mistral-Small-3.1-24B-Instruct-2503", # noqa: E501
|
||||
min_transformers_version="4.50", # noqa: E501
|
||||
extras={"fp8": "nm-testing/Mistral-Small-3.1-24B-Instruct-2503-FP8-dynamic"}), # noqa: E501
|
||||
"MolmoForCausalLM": _HfExamplesInfo("allenai/Molmo-7B-D-0924",
|
||||
max_transformers_version="4.48",
|
||||
transformers_version_reason="Use of private method which no longer exists.", # noqa: E501
|
||||
@ -346,7 +350,7 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
|
||||
trust_remote_code=True),
|
||||
}
|
||||
|
||||
_FALLBACK_MODEL = {
|
||||
_TRANSFORMERS_MODELS = {
|
||||
"TransformersForCausalLM": _HfExamplesInfo("ArthurZ/Ilama-3.2-1B", trust_remote_code=True), # noqa: E501
|
||||
}
|
||||
|
||||
@ -356,7 +360,7 @@ _EXAMPLE_MODELS = {
|
||||
**_CROSS_ENCODER_EXAMPLE_MODELS,
|
||||
**_MULTIMODAL_EXAMPLE_MODELS,
|
||||
**_SPECULATIVE_DECODING_EXAMPLE_MODELS,
|
||||
**_FALLBACK_MODEL,
|
||||
**_TRANSFORMERS_MODELS,
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -10,7 +10,7 @@ from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
from vllm_test_utils import monitor
|
||||
from vllm_test_utils.monitor import monitor
|
||||
|
||||
from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
|
||||
from vllm.utils import (FlexibleArgumentParser, MemorySnapshot,
|
||||
@ -143,7 +143,8 @@ def parser():
|
||||
def parser_with_config():
|
||||
parser = FlexibleArgumentParser()
|
||||
parser.add_argument('serve')
|
||||
parser.add_argument('model_tag')
|
||||
parser.add_argument('model_tag', nargs='?')
|
||||
parser.add_argument('--model', type=str)
|
||||
parser.add_argument('--served-model-name', type=str)
|
||||
parser.add_argument('--config', type=str)
|
||||
parser.add_argument('--port', type=int)
|
||||
@ -199,29 +200,29 @@ def test_missing_required_argument(parser):
|
||||
parser.parse_args([])
|
||||
|
||||
|
||||
def test_cli_override_to_config(parser_with_config):
|
||||
def test_cli_override_to_config(parser_with_config, cli_config_file):
|
||||
args = parser_with_config.parse_args([
|
||||
'serve', 'mymodel', '--config', './data/test_config.yaml',
|
||||
'serve', 'mymodel', '--config', cli_config_file,
|
||||
'--tensor-parallel-size', '3'
|
||||
])
|
||||
assert args.tensor_parallel_size == 3
|
||||
args = parser_with_config.parse_args([
|
||||
'serve', 'mymodel', '--tensor-parallel-size', '3', '--config',
|
||||
'./data/test_config.yaml'
|
||||
cli_config_file
|
||||
])
|
||||
assert args.tensor_parallel_size == 3
|
||||
assert args.port == 12312
|
||||
args = parser_with_config.parse_args([
|
||||
'serve', 'mymodel', '--tensor-parallel-size', '3', '--config',
|
||||
'./data/test_config.yaml', '--port', '666'
|
||||
cli_config_file, '--port', '666'
|
||||
])
|
||||
assert args.tensor_parallel_size == 3
|
||||
assert args.port == 666
|
||||
|
||||
|
||||
def test_config_args(parser_with_config):
|
||||
def test_config_args(parser_with_config, cli_config_file):
|
||||
args = parser_with_config.parse_args(
|
||||
['serve', 'mymodel', '--config', './data/test_config.yaml'])
|
||||
['serve', 'mymodel', '--config', cli_config_file])
|
||||
assert args.tensor_parallel_size == 2
|
||||
assert args.trust_remote_code
|
||||
assert not args.multi_step_stream_outputs
|
||||
@ -243,10 +244,9 @@ def test_config_file(parser_with_config):
|
||||
])
|
||||
|
||||
|
||||
def test_no_model_tag(parser_with_config):
|
||||
def test_no_model_tag(parser_with_config, cli_config_file):
|
||||
with pytest.raises(ValueError):
|
||||
parser_with_config.parse_args(
|
||||
['serve', '--config', './data/test_config.yaml'])
|
||||
parser_with_config.parse_args(['serve', '--config', cli_config_file])
|
||||
|
||||
|
||||
# yapf: enable
|
||||
@ -480,6 +480,48 @@ def test_swap_dict_values(obj, key1, key2):
|
||||
else:
|
||||
assert key1 not in obj
|
||||
|
||||
|
||||
def test_model_specification(parser_with_config,
|
||||
cli_config_file,
|
||||
cli_config_file_with_model):
|
||||
# Test model in CLI takes precedence over config
|
||||
args = parser_with_config.parse_args([
|
||||
'serve', 'cli-model', '--config', cli_config_file_with_model
|
||||
])
|
||||
assert args.model_tag == 'cli-model'
|
||||
assert args.served_model_name == 'mymodel'
|
||||
|
||||
# Test model from config file works
|
||||
args = parser_with_config.parse_args([
|
||||
'serve', '--config', cli_config_file_with_model,
|
||||
])
|
||||
assert args.model == 'config-model'
|
||||
assert args.served_model_name == 'mymodel'
|
||||
|
||||
# Test no model specified anywhere raises error
|
||||
with pytest.raises(ValueError, match="No model specified!"):
|
||||
parser_with_config.parse_args(['serve', '--config', cli_config_file])
|
||||
|
||||
# Test using --model option raises error
|
||||
with pytest.raises(
|
||||
ValueError,
|
||||
match=(
|
||||
"With `vllm serve`, you should provide the model as a positional "
|
||||
"argument or in a config file instead of via the `--model` option."
|
||||
),
|
||||
):
|
||||
parser_with_config.parse_args(['serve', '--model', 'my-model'])
|
||||
|
||||
# Test other config values are preserved
|
||||
args = parser_with_config.parse_args([
|
||||
'serve', 'cli-model', '--config', cli_config_file_with_model,
|
||||
])
|
||||
assert args.tensor_parallel_size == 2
|
||||
assert args.trust_remote_code is True
|
||||
assert args.multi_step_stream_outputs is False
|
||||
assert args.port == 12312
|
||||
|
||||
|
||||
@pytest.mark.parametrize("input", [(), ("abc", ), (None, ),
|
||||
(None, bool, [1, 2, 3])])
|
||||
@pytest.mark.parametrize("output", [0, 1, 2])
|
||||
|
||||
@ -612,7 +612,16 @@ def multi_process_parallel(
|
||||
# as compared to multiprocessing.
|
||||
# NOTE: We need to set working_dir for distributed tests,
|
||||
# otherwise we may get import errors on ray workers
|
||||
ray.init(runtime_env={"working_dir": VLLM_PATH})
|
||||
# NOTE: Force ray not to use gitignore file as excluding, otherwise
|
||||
# it will not move .so files to working dir.
|
||||
# So we have to manually add some of large directories
|
||||
os.environ["RAY_RUNTIME_ENV_IGNORE_GITIGNORE"] = "1"
|
||||
ray.init(
|
||||
runtime_env={
|
||||
"working_dir": VLLM_PATH,
|
||||
"excludes":
|
||||
["build", ".git", "cmake-build-*", "shellcheck", "dist"]
|
||||
})
|
||||
|
||||
distributed_init_port = get_open_port()
|
||||
refs = []
|
||||
|
||||
@ -4,6 +4,7 @@
|
||||
from typing import Optional
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange
|
||||
from vllm.sampling_params import SamplingParams
|
||||
@ -12,6 +13,8 @@ from vllm.v1.core.block_pool import BlockPool
|
||||
from vllm.v1.core.kv_cache_manager import KVCacheManager, Request
|
||||
from vllm.v1.core.kv_cache_utils import (BlockHashType, KVCacheBlock,
|
||||
hash_block_tokens)
|
||||
from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
|
||||
KVCacheGroupSpec)
|
||||
|
||||
|
||||
def make_request(request_id,
|
||||
@ -39,13 +42,23 @@ def make_request(request_id,
|
||||
)
|
||||
|
||||
|
||||
def make_kv_cache_config(block_size: int, num_blocks: int) -> KVCacheConfig:
|
||||
return KVCacheConfig(
|
||||
num_blocks=num_blocks,
|
||||
tensors={},
|
||||
kv_cache_groups=[
|
||||
KVCacheGroupSpec(['layer'],
|
||||
FullAttentionSpec(block_size, 1, 1, torch.float32,
|
||||
False))
|
||||
],
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("hash_algo", ["sha256", "hash"])
|
||||
def test_prefill(hash_algo):
|
||||
manager = KVCacheManager(
|
||||
block_size=16,
|
||||
num_gpu_blocks=10,
|
||||
make_kv_cache_config(16, 11),
|
||||
max_model_len=8192,
|
||||
sliding_window=None,
|
||||
enable_caching=True,
|
||||
caching_hash_algo=hash_algo,
|
||||
num_preallocate_tokens=16,
|
||||
@ -67,12 +80,12 @@ def test_prefill(hash_algo):
|
||||
assert not computed_blocks
|
||||
assert num_computed_tokens == 0
|
||||
blocks = manager.allocate_slots(req0, 55, computed_blocks)
|
||||
assert [b.block_id for b in blocks] == [0, 1, 2, 3, 4]
|
||||
assert [b.block_id for b in blocks] == [1, 2, 3, 4, 5]
|
||||
|
||||
# Check full block metadata
|
||||
parent_block_hash = None
|
||||
for block_id in (0, 1, 2):
|
||||
block_tokens = tuple(all_token_ids[block_id * 16:(block_id + 1) * 16])
|
||||
for block_id in (1, 2, 3):
|
||||
block_tokens = tuple(all_token_ids[(block_id - 1) * 16:block_id * 16])
|
||||
block_hash = hash_block_tokens(hash_fn, parent_block_hash,
|
||||
block_tokens)
|
||||
assert manager.block_pool.blocks[block_id].block_hash == block_hash
|
||||
@ -80,7 +93,7 @@ def test_prefill(hash_algo):
|
||||
parent_block_hash = block_hash.hash_value
|
||||
|
||||
# Check partial/preallocated block metadata
|
||||
for block_id in (3, 4):
|
||||
for block_id in (4, 5):
|
||||
assert manager.block_pool.blocks[block_id].block_hash is None
|
||||
assert manager.block_pool.blocks[block_id].ref_cnt == 1
|
||||
|
||||
@ -90,11 +103,11 @@ def test_prefill(hash_algo):
|
||||
req1 = make_request("1", common_token_ids + unique_token_ids)
|
||||
computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1)
|
||||
assert len(manager.req_to_block_hashes[req1.request_id]) == 3
|
||||
assert [b.block_id for b in computed_blocks] == [0, 1, 2]
|
||||
assert [b.block_id for b in computed_blocks] == [1, 2, 3]
|
||||
assert num_computed_tokens == 3 * 16
|
||||
num_new_tokens = 53 - 3 * 16
|
||||
blocks = manager.allocate_slots(req1, num_new_tokens, computed_blocks)
|
||||
assert [b.block_id for b in blocks] == [5, 6]
|
||||
assert [b.block_id for b in blocks] == [6, 7]
|
||||
for block in computed_blocks:
|
||||
assert block.ref_cnt == 2
|
||||
|
||||
@ -107,14 +120,14 @@ def test_prefill(hash_algo):
|
||||
# All blocks should be available.
|
||||
assert manager.block_pool.free_block_queue.num_free_blocks == 10
|
||||
# The order should be
|
||||
# [unallocated (7, 8, 9)]
|
||||
# [unique_req0 (4, 3)]
|
||||
# [unique_req1 (6, 5)]
|
||||
# [common (2, 1, 0)]
|
||||
# [unallocated (8, 9, 10)]
|
||||
# [unique_req0 (5, 4)]
|
||||
# [unique_req1 (7, 6)]
|
||||
# [common (3, 2, 1)]
|
||||
assert [
|
||||
b.block_id
|
||||
for b in manager.block_pool.free_block_queue.get_all_free_blocks()
|
||||
] == [7, 8, 9, 4, 3, 6, 5, 2, 1, 0]
|
||||
] == [8, 9, 10, 5, 4, 7, 6, 3, 2, 1]
|
||||
|
||||
# Cache hit in the common prefix when the original block is already free.
|
||||
# Incomplete 1 block (6 tokens)
|
||||
@ -122,11 +135,11 @@ def test_prefill(hash_algo):
|
||||
req2 = make_request("2", common_token_ids + unique_token_ids)
|
||||
computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2)
|
||||
assert len(manager.req_to_block_hashes[req2.request_id]) == 3
|
||||
assert [b.block_id for b in computed_blocks] == [0, 1, 2]
|
||||
assert [b.block_id for b in computed_blocks] == [1, 2, 3]
|
||||
assert num_computed_tokens == 3 * 16
|
||||
num_new_tokens = 53 - 3 * 16
|
||||
blocks = manager.allocate_slots(req2, num_new_tokens, computed_blocks)
|
||||
assert [b.block_id for b in blocks] == [7, 8]
|
||||
assert [b.block_id for b in blocks] == [8, 9]
|
||||
|
||||
# Although we only have 5 free blocks, we have 8 blocks in
|
||||
# the free block queue due to lazy removal.
|
||||
@ -148,7 +161,7 @@ def test_prefill(hash_algo):
|
||||
assert num_computed_tokens == 0
|
||||
blocks = manager.allocate_slots(req3, 16 * 9, computed_blocks)
|
||||
# This block ID order also checks the eviction order.
|
||||
assert [b.block_id for b in blocks] == [9, 4, 3, 6, 5, 8, 7, 2, 1, 0]
|
||||
assert [b.block_id for b in blocks] == [10, 5, 4, 7, 6, 9, 8, 3, 2, 1]
|
||||
assert manager.block_pool.free_block_queue.num_free_blocks == 0
|
||||
assert manager.block_pool.free_block_queue.free_list_head is None
|
||||
assert manager.block_pool.free_block_queue.free_list_tail is None
|
||||
@ -162,10 +175,8 @@ def test_prefill_plp():
|
||||
3. Schedule plp request; no hit should occur; validate blocks
|
||||
'''
|
||||
manager = KVCacheManager(
|
||||
block_size=16,
|
||||
num_gpu_blocks=10,
|
||||
make_kv_cache_config(16, 11),
|
||||
max_model_len=8192,
|
||||
sliding_window=None,
|
||||
enable_caching=True,
|
||||
num_preallocate_tokens=16,
|
||||
)
|
||||
@ -186,13 +197,13 @@ def test_prefill_plp():
|
||||
assert not computed_blocks
|
||||
assert num_computed_tokens == 0
|
||||
blocks = manager.allocate_slots(req0, 55, computed_blocks)
|
||||
assert [b.block_id for b in blocks] == [0, 1, 2, 3, 4]
|
||||
assert [b.block_id for b in blocks] == [1, 2, 3, 4, 5]
|
||||
req0_block_hashes = [b.block_hash for b in blocks]
|
||||
|
||||
# Check full block metadata
|
||||
parent_block_hash = None
|
||||
for block_id in (0, 1, 2):
|
||||
block_tokens = tuple(all_token_ids[block_id * 16:(block_id + 1) * 16])
|
||||
for block_id in (1, 2, 3):
|
||||
block_tokens = tuple(all_token_ids[(block_id - 1) * 16:block_id * 16])
|
||||
block_hash = hash_block_tokens(hash_fn, parent_block_hash,
|
||||
block_tokens)
|
||||
assert manager.block_pool.blocks[block_id].block_hash == block_hash
|
||||
@ -200,7 +211,7 @@ def test_prefill_plp():
|
||||
parent_block_hash = block_hash.hash_value
|
||||
|
||||
# Check partial/preallocated block metadata
|
||||
for block_id in (3, 4):
|
||||
for block_id in (4, 5):
|
||||
assert manager.block_pool.blocks[block_id].block_hash is None
|
||||
assert manager.block_pool.blocks[block_id].ref_cnt == 1
|
||||
|
||||
@ -211,11 +222,11 @@ def test_prefill_plp():
|
||||
req1 = make_request("1", common_token_ids + unique_token_ids)
|
||||
computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1)
|
||||
assert len(manager.req_to_block_hashes[req1.request_id]) == 3
|
||||
assert [b.block_id for b in computed_blocks] == [0, 1, 2]
|
||||
assert [b.block_id for b in computed_blocks] == [1, 2, 3]
|
||||
assert num_computed_tokens == 3 * 16
|
||||
num_new_tokens = 53 - 3 * 16
|
||||
blocks = manager.allocate_slots(req1, num_new_tokens, computed_blocks)
|
||||
assert [b.block_id for b in blocks] == [5, 6]
|
||||
assert [b.block_id for b in blocks] == [6, 7]
|
||||
for block in computed_blocks:
|
||||
assert block.ref_cnt == 2
|
||||
|
||||
@ -228,14 +239,14 @@ def test_prefill_plp():
|
||||
# All blocks should be available.
|
||||
assert manager.block_pool.free_block_queue.num_free_blocks == 10
|
||||
# The order should be
|
||||
# [unallocated (7, 8, 9)]
|
||||
# [unique_req0 (4, 3)]
|
||||
# [unique_req1 (6, 5)]
|
||||
# [common (2, 1, 0)]
|
||||
# [unallocated (8, 9, 10)]
|
||||
# [unique_req0 (5, 4)]
|
||||
# [unique_req1 (7, 6)]
|
||||
# [common (3, 2, 1)]
|
||||
assert [
|
||||
b.block_id
|
||||
for b in manager.block_pool.free_block_queue.get_all_free_blocks()
|
||||
] == [7, 8, 9, 4, 3, 6, 5, 2, 1, 0]
|
||||
] == [8, 9, 10, 5, 4, 7, 6, 3, 2, 1]
|
||||
|
||||
# Request #2 is a prompt-logprobs request:
|
||||
# NO cache hit in the common prefix; duplicates request #0 cached blocks
|
||||
@ -251,7 +262,7 @@ def test_prefill_plp():
|
||||
block_ids = [b.block_id for b in blocks]
|
||||
# Duplicate cached blocks have different ids but same hashes vs request #0
|
||||
assert [b.block_hash for b in blocks] == req0_block_hashes
|
||||
assert block_ids != [0, 1, 2, 3, 4]
|
||||
assert block_ids != [1, 2, 3, 4, 5]
|
||||
|
||||
# Request #2 block hashes are valid since request #0 hashes are.
|
||||
# Check block reference counts.
|
||||
@ -263,10 +274,8 @@ def test_prefill_plp():
|
||||
|
||||
def test_decode():
|
||||
manager = KVCacheManager(
|
||||
block_size=16,
|
||||
num_gpu_blocks=10,
|
||||
make_kv_cache_config(16, 11),
|
||||
max_model_len=8192,
|
||||
sliding_window=None,
|
||||
enable_caching=True,
|
||||
num_preallocate_tokens=16,
|
||||
)
|
||||
@ -282,7 +291,7 @@ def test_decode():
|
||||
assert not computed_blocks
|
||||
assert num_computed_tokens == 0
|
||||
blocks = manager.allocate_slots(req0, 55, computed_blocks)
|
||||
assert [b.block_id for b in blocks] == [0, 1, 2, 3, 4]
|
||||
assert [b.block_id for b in blocks] == [1, 2, 3, 4, 5]
|
||||
|
||||
# Append slots without allocating a new block.
|
||||
req0.num_computed_tokens = 55
|
||||
@ -316,10 +325,8 @@ def test_decode():
|
||||
|
||||
def test_evict():
|
||||
manager = KVCacheManager(
|
||||
block_size=16,
|
||||
num_gpu_blocks=10,
|
||||
make_kv_cache_config(16, 11),
|
||||
max_model_len=8192,
|
||||
sliding_window=None,
|
||||
enable_caching=True,
|
||||
num_preallocate_tokens=16,
|
||||
)
|
||||
@ -350,15 +357,15 @@ def test_evict():
|
||||
assert [
|
||||
b.block_id
|
||||
for b in manager.block_pool.free_block_queue.get_all_free_blocks()
|
||||
] == [6, 5, 4, 3, 2, 1, 0, 9, 8, 7]
|
||||
] == [7, 6, 5, 4, 3, 2, 1, 10, 9, 8]
|
||||
|
||||
# Touch the first 2 blocks.
|
||||
req2 = make_request("2", list(range(2 * 16 + 3)))
|
||||
computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2)
|
||||
assert [b.block_id for b in computed_blocks] == [0, 1]
|
||||
assert [b.block_id for b in computed_blocks] == [1, 2]
|
||||
assert num_computed_tokens == 2 * 16
|
||||
blocks = manager.allocate_slots(req2, 3, computed_blocks)
|
||||
assert [b.block_id for b in blocks] == [6, 5]
|
||||
assert [b.block_id for b in blocks] == [7, 6]
|
||||
assert manager.block_pool.free_block_queue.num_free_blocks == 6
|
||||
|
||||
|
||||
@ -369,10 +376,8 @@ def test_hash_block_correct_reuse():
|
||||
"""
|
||||
block_size = 16
|
||||
manager = KVCacheManager(
|
||||
block_size=block_size,
|
||||
num_gpu_blocks=1,
|
||||
make_kv_cache_config(16, 2),
|
||||
max_model_len=8192,
|
||||
sliding_window=None,
|
||||
enable_caching=True,
|
||||
num_preallocate_tokens=0,
|
||||
)
|
||||
@ -408,10 +413,8 @@ def test_computed_blocks_not_evicted():
|
||||
"""
|
||||
block_size = 16
|
||||
manager = KVCacheManager(
|
||||
block_size=block_size,
|
||||
num_gpu_blocks=2,
|
||||
make_kv_cache_config(block_size, 3),
|
||||
max_model_len=8192,
|
||||
sliding_window=None,
|
||||
enable_caching=True,
|
||||
num_preallocate_tokens=0,
|
||||
)
|
||||
@ -424,7 +427,7 @@ def test_computed_blocks_not_evicted():
|
||||
assert num_computed_tokens == 0
|
||||
blocks = manager.allocate_slots(req0, num_tokens, computed_blocks)
|
||||
assert len(blocks) == 1
|
||||
assert blocks[0].block_id == 0
|
||||
assert blocks[0].block_id == 1
|
||||
|
||||
# Allocate another block.
|
||||
req1 = make_request("1", list(range(num_tokens, num_tokens * 2)))
|
||||
@ -433,7 +436,7 @@ def test_computed_blocks_not_evicted():
|
||||
assert num_computed_tokens == 0
|
||||
blocks = manager.allocate_slots(req1, num_tokens, computed_blocks)
|
||||
assert len(blocks) == 1
|
||||
assert blocks[0].block_id == 1
|
||||
assert blocks[0].block_id == 2
|
||||
|
||||
# Free the blocks.
|
||||
manager.free(req0)
|
||||
@ -444,13 +447,13 @@ def test_computed_blocks_not_evicted():
|
||||
req2 = make_request("2", list(range(num_tokens * 2)))
|
||||
computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2)
|
||||
assert len(computed_blocks) == 1
|
||||
assert computed_blocks[0].block_id == 0
|
||||
assert computed_blocks[0].block_id == 1
|
||||
assert num_computed_tokens == block_size
|
||||
|
||||
blocks = manager.allocate_slots(req2, num_tokens * 2 - num_tokens,
|
||||
computed_blocks)
|
||||
assert len(blocks) == 1
|
||||
assert blocks[0].block_id == 1
|
||||
assert blocks[0].block_id == 2
|
||||
|
||||
|
||||
def test_basic_prefix_caching_disabled():
|
||||
@ -459,10 +462,8 @@ def test_basic_prefix_caching_disabled():
|
||||
"""
|
||||
block_size = 4
|
||||
manager = KVCacheManager(
|
||||
block_size=block_size,
|
||||
num_gpu_blocks=4,
|
||||
make_kv_cache_config(block_size, 5),
|
||||
max_model_len=8192,
|
||||
sliding_window=None,
|
||||
enable_caching=False,
|
||||
num_preallocate_tokens=0,
|
||||
)
|
||||
@ -502,10 +503,8 @@ def test_preallocate_blocks(num_preallocate_tokens: int, block_size: int):
|
||||
This tests that the preallocated blocks are correctly added.
|
||||
"""
|
||||
manager = KVCacheManager(
|
||||
block_size=block_size,
|
||||
num_gpu_blocks=10,
|
||||
make_kv_cache_config(block_size, 11),
|
||||
max_model_len=8192,
|
||||
sliding_window=None,
|
||||
enable_caching=True,
|
||||
num_preallocate_tokens=num_preallocate_tokens,
|
||||
)
|
||||
@ -586,10 +585,8 @@ def test_mm_prefix_caching():
|
||||
This tests that the multi-modal prefix caching is correct.
|
||||
"""
|
||||
manager = KVCacheManager(
|
||||
block_size=16,
|
||||
num_gpu_blocks=10,
|
||||
make_kv_cache_config(16, 11),
|
||||
max_model_len=8192,
|
||||
sliding_window=None,
|
||||
enable_caching=True,
|
||||
num_preallocate_tokens=16,
|
||||
)
|
||||
@ -629,7 +626,7 @@ def test_mm_prefix_caching():
|
||||
assert block_hashes[2].extra_keys == ("bbb", )
|
||||
|
||||
blocks = manager.allocate_slots(req0, 59, computed_blocks)
|
||||
assert [b.block_id for b in blocks] == [0, 1, 2, 3, 4]
|
||||
assert [b.block_id for b in blocks] == [1, 2, 3, 4, 5]
|
||||
req0.num_computed_tokens = 59
|
||||
|
||||
# Append slots without allocating a new block.
|
||||
@ -667,10 +664,8 @@ def test_prefill_not_enough_free_blocks_with_computed_blocks():
|
||||
"""
|
||||
block_size = 16
|
||||
manager = KVCacheManager(
|
||||
block_size=block_size,
|
||||
num_gpu_blocks=10,
|
||||
make_kv_cache_config(block_size, 11),
|
||||
max_model_len=8192,
|
||||
sliding_window=None,
|
||||
enable_caching=True,
|
||||
num_preallocate_tokens=0,
|
||||
)
|
||||
@ -723,10 +718,8 @@ def test_prefill_not_enough_free_blocks_with_computed_blocks():
|
||||
|
||||
def test_reset_prefix_cache():
|
||||
manager = KVCacheManager(
|
||||
block_size=16,
|
||||
num_gpu_blocks=10,
|
||||
make_kv_cache_config(16, 11),
|
||||
max_model_len=8192,
|
||||
sliding_window=None,
|
||||
enable_caching=True,
|
||||
num_preallocate_tokens=0,
|
||||
)
|
||||
@ -736,7 +729,7 @@ def test_reset_prefix_cache():
|
||||
all_token_ids = full_block_token_ids + unique_token_ids
|
||||
req0 = make_request("0", all_token_ids)
|
||||
blocks = manager.allocate_slots(req0, 55)
|
||||
assert [b.block_id for b in blocks] == [0, 1, 2, 3]
|
||||
assert [b.block_id for b in blocks] == [1, 2, 3, 4]
|
||||
|
||||
unique_token_ids = [4] * 7
|
||||
all_token_ids = full_block_token_ids + unique_token_ids
|
||||
@ -745,7 +738,7 @@ def test_reset_prefix_cache():
|
||||
assert len(manager.req_to_block_hashes[req1.request_id]) == 3
|
||||
assert len(computed_blocks) == 3
|
||||
blocks = manager.allocate_slots(req1, 7, computed_blocks)
|
||||
assert [b.block_id for b in blocks] == [4]
|
||||
assert [b.block_id for b in blocks] == [5]
|
||||
|
||||
# Failed to reset prefix cache because some blocks are not freed yet.
|
||||
assert not manager.reset_prefix_cache()
|
||||
|
||||
@ -2,12 +2,15 @@
|
||||
from typing import Optional
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.config import CacheConfig, ModelConfig, SchedulerConfig, VllmConfig
|
||||
from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.v1.core.sched.output import SchedulerOutput
|
||||
from vllm.v1.core.sched.scheduler import Scheduler
|
||||
from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
|
||||
KVCacheGroupSpec)
|
||||
from vllm.v1.outputs import ModelRunnerOutput
|
||||
from vllm.v1.request import Request, RequestStatus
|
||||
from vllm.v1.structured_output import StructuredOutputManager
|
||||
@ -66,13 +69,21 @@ def create_scheduler(
|
||||
model_config=model_config,
|
||||
cache_config=cache_config,
|
||||
)
|
||||
kv_cache_config = KVCacheConfig(
|
||||
num_blocks=10000, # A large number of blocks to hold all requests
|
||||
tensors={},
|
||||
kv_cache_groups=[
|
||||
KVCacheGroupSpec(['layer'],
|
||||
FullAttentionSpec(16, 1, 1, torch.float32, False))
|
||||
],
|
||||
)
|
||||
cache_config.num_gpu_blocks = 10000
|
||||
return Scheduler(
|
||||
scheduler_config,
|
||||
model_config,
|
||||
cache_config,
|
||||
speculative_config=None,
|
||||
lora_config=None,
|
||||
kv_cache_config=kv_cache_config,
|
||||
log_stats=True,
|
||||
structured_output_manager=StructuredOutputManager(vllm_config),
|
||||
)
|
||||
@ -600,3 +611,98 @@ def test_schedule_concurrent_batches(enable_prefix_caching: Optional[bool],
|
||||
prompt_logprobs_dict={},
|
||||
)
|
||||
scheduler.update_from_output(scheduler_output1, model_runner_output)
|
||||
|
||||
|
||||
# Note - these test cases mirror some of those in test_rejection_sampler.py
|
||||
@pytest.mark.parametrize(
|
||||
"spec_tokens,output_tokens,expected",
|
||||
[
|
||||
([[1, 2, 3]], [[1, 2, 3, 4]], (3, 3)), # perfect match
|
||||
([[1, 2, 3]], [[1, 5]], (3, 1)), # early mismatch
|
||||
([[1, 2], [3]], [[1, 2, 5], [3, 4]], (3, 3)), # multiple sequences
|
||||
([[1]], [[1, 2]], (1, 1)), # single token sequence
|
||||
([[]], [[5]], (0, 0)), # empty sequence
|
||||
([[1, 2, 3], [4, 5, 6]], [[1, 2, 7], [4, 8]],
|
||||
(6, 3)), # multiple mismatches
|
||||
])
|
||||
def test_schedule_spec_decoding_stats(spec_tokens, output_tokens, expected):
|
||||
"""Test scheduling behavior with speculative decoding.
|
||||
|
||||
This test verifies that:
|
||||
1. Speculated tokens get scheduled correctly
|
||||
2. Spec decoding stats properly count number of draft and accepted tokens
|
||||
"""
|
||||
scheduler = create_scheduler()
|
||||
requests = create_requests(num_requests=len(spec_tokens), num_tokens=1)
|
||||
req_ids = []
|
||||
req_to_index = {}
|
||||
for i, request in enumerate(requests):
|
||||
scheduler.add_request(request)
|
||||
req_ids.append(request.request_id)
|
||||
req_to_index[request.request_id] = i
|
||||
|
||||
# Schedule a decode, which will also draft speculative tokens
|
||||
output = scheduler.schedule()
|
||||
assert len(output.scheduled_new_reqs) == len(requests)
|
||||
assert output.total_num_scheduled_tokens == len(requests)
|
||||
for i in range(len(requests)):
|
||||
req_id = requests[i].request_id
|
||||
assert output.num_scheduled_tokens[req_id] == 1
|
||||
assert req_id not in output.scheduled_spec_decode_tokens
|
||||
|
||||
model_runner_output = ModelRunnerOutput(
|
||||
req_ids=req_ids,
|
||||
req_id_to_index=req_to_index,
|
||||
sampled_token_ids=[[0] for _ in range(len(requests))],
|
||||
spec_token_ids=spec_tokens,
|
||||
logprobs=None,
|
||||
prompt_logprobs_dict={},
|
||||
)
|
||||
engine_core_outputs = scheduler.update_from_output(output,
|
||||
model_runner_output)
|
||||
|
||||
for i in range(len(requests)):
|
||||
running_req = scheduler.running[i]
|
||||
# The prompt token
|
||||
assert running_req.num_computed_tokens == 1
|
||||
# The prompt token and the sampled token
|
||||
assert running_req.num_tokens == 2
|
||||
# The prompt token, the sampled token, and the speculated tokens
|
||||
assert running_req.num_tokens_with_spec == 2 + len(spec_tokens[i])
|
||||
|
||||
# No draft or accepted tokens counted yet
|
||||
assert engine_core_outputs.scheduler_stats.spec_decoding_stats is not None
|
||||
stats = engine_core_outputs.scheduler_stats.spec_decoding_stats
|
||||
assert stats.num_draft_tokens == 0
|
||||
assert stats.num_accepted_tokens == 0
|
||||
|
||||
# Schedule the speculated tokens for validation
|
||||
output = scheduler.schedule()
|
||||
assert len(output.scheduled_new_reqs) == 0
|
||||
# The sampled token and speculated tokens
|
||||
assert output.total_num_scheduled_tokens == \
|
||||
len(requests) + sum(len(ids) for ids in spec_tokens)
|
||||
for i in range(len(requests)):
|
||||
req_id = requests[i].request_id
|
||||
assert output.num_scheduled_tokens[req_id] == 1 + len(spec_tokens[i])
|
||||
if spec_tokens[i]:
|
||||
assert len(output.scheduled_spec_decode_tokens[req_id]) == \
|
||||
len(spec_tokens[i])
|
||||
else:
|
||||
assert req_id not in output.scheduled_spec_decode_tokens
|
||||
|
||||
model_runner_output = ModelRunnerOutput(
|
||||
req_ids=req_ids,
|
||||
req_id_to_index=req_to_index,
|
||||
sampled_token_ids=output_tokens,
|
||||
spec_token_ids=None,
|
||||
logprobs=None,
|
||||
prompt_logprobs_dict={},
|
||||
)
|
||||
engine_core_outputs = scheduler.update_from_output(output,
|
||||
model_runner_output)
|
||||
|
||||
assert engine_core_outputs.scheduler_stats.spec_decoding_stats is not None
|
||||
stats = engine_core_outputs.scheduler_stats.spec_decoding_stats
|
||||
assert stats.num_draft_tokens == expected[0]
|
||||
assert stats.num_accepted_tokens == expected[1]
|
||||
|
||||
138
tests/v1/core/test_specialized_manager.py
Normal file
138
tests/v1/core/test_specialized_manager.py
Normal file
@ -0,0 +1,138 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
import torch
|
||||
|
||||
from vllm.v1.core.block_pool import BlockPool
|
||||
from vllm.v1.core.kv_cache_utils import BlockHashType, KVCacheBlock
|
||||
from vllm.v1.core.specialized_manager import SlidingWindowManager
|
||||
from vllm.v1.kv_cache_interface import SlidingWindowSpec
|
||||
|
||||
|
||||
def test_sliding_window_possible_cached_prefix():
|
||||
sliding_window_spec = SlidingWindowSpec(
|
||||
block_size=2,
|
||||
num_kv_heads=1,
|
||||
head_size=1,
|
||||
dtype=torch.float32,
|
||||
sliding_window=4,
|
||||
use_mla=False,
|
||||
)
|
||||
|
||||
block_pool = BlockPool(num_gpu_blocks=100, enable_caching=True)
|
||||
manager = SlidingWindowManager(sliding_window_spec, block_pool)
|
||||
|
||||
def run_one_case(block_is_cached, expect_length):
|
||||
block_hash_list = [
|
||||
BlockHashType(i, ()) for i in range(len(block_is_cached))
|
||||
]
|
||||
|
||||
block_pool.cached_block_hash_to_block.clear()
|
||||
|
||||
# Mock the block pool with the cached blocks
|
||||
for i, (block_hash,
|
||||
is_cached) in enumerate(zip(block_hash_list, block_is_cached)):
|
||||
if is_cached:
|
||||
block_pool.cached_block_hash_to_block[block_hash] = {
|
||||
i: block_pool.blocks[i + 10]
|
||||
}
|
||||
|
||||
computed_blocks = manager.find_longest_cache_hit(block_hash_list)
|
||||
assert len(computed_blocks) == expect_length
|
||||
|
||||
assert all(block == block_pool.null_block
|
||||
for block in computed_blocks[:expect_length - 2])
|
||||
for i in range(2):
|
||||
if i < expect_length:
|
||||
block_index = expect_length - i - 1
|
||||
assert computed_blocks[
|
||||
block_index].block_id == block_index + 10
|
||||
|
||||
run_one_case([False] * 10, 0)
|
||||
run_one_case([True], 1)
|
||||
run_one_case([True, False], 1)
|
||||
run_one_case([True, True], 2)
|
||||
run_one_case([True, True, False], 2)
|
||||
run_one_case([True, True, True], 3)
|
||||
run_one_case([True, True, True, False], 3)
|
||||
run_one_case([
|
||||
True, True, False, True, False, False, True, True, False, True, True,
|
||||
True
|
||||
], 12)
|
||||
run_one_case([
|
||||
True, True, False, True, False, False, True, True, False, False, False
|
||||
], 8)
|
||||
run_one_case([
|
||||
True, True, False, True, False, False, True, True, False, False, False,
|
||||
True
|
||||
], 8)
|
||||
|
||||
|
||||
def test_sliding_window_remove_skipped_blocks():
|
||||
sliding_window_spec = SlidingWindowSpec(
|
||||
block_size=2,
|
||||
num_kv_heads=1,
|
||||
head_size=1,
|
||||
dtype=torch.float32,
|
||||
sliding_window=4,
|
||||
use_mla=False,
|
||||
)
|
||||
|
||||
block_pool = BlockPool(num_gpu_blocks=2000, enable_caching=True)
|
||||
|
||||
manager = SlidingWindowManager(sliding_window_spec, block_pool)
|
||||
|
||||
null_block_id = block_pool.null_block.block_id
|
||||
|
||||
def id_to_block_table(ids):
|
||||
return [
|
||||
KVCacheBlock(id_)
|
||||
if id_ != null_block_id else block_pool.null_block for id_ in ids
|
||||
]
|
||||
|
||||
def assert_block_id(block_table, ids):
|
||||
for block, id_ in zip(block_table, ids):
|
||||
if id_ == null_block_id:
|
||||
assert block == block_pool.null_block
|
||||
else:
|
||||
assert block.block_id == id_
|
||||
|
||||
original_block_ids = [
|
||||
1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010
|
||||
]
|
||||
block_table = id_to_block_table(original_block_ids)
|
||||
removed = manager.remove_skipped_blocks(block_table, 0)
|
||||
assert_block_id(removed, [])
|
||||
assert_block_id(block_table, original_block_ids)
|
||||
|
||||
# 4 tokens are computed. Only token 0 is out of the sliding window. As
|
||||
# block 1000 also contains token 1 that is in the sliding window, block 1000
|
||||
# cannot be removed.
|
||||
removed = manager.remove_skipped_blocks(block_table, 4)
|
||||
assert_block_id(removed, [])
|
||||
assert_block_id(block_table, original_block_ids)
|
||||
|
||||
# 5 tokens are computed. Token 0 & 1 are out of the sliding window.
|
||||
# Block 1000 can be removed.
|
||||
removed = manager.remove_skipped_blocks(block_table, 5)
|
||||
assert_block_id(removed, [original_block_ids[0]])
|
||||
assert_block_id(block_table, [null_block_id] + original_block_ids[1:])
|
||||
|
||||
# 6 tokens are computed. Token 0-2 are out of the sliding window.
|
||||
# Cannot remove new block as the block 1001 is still used by token 3.
|
||||
removed = manager.remove_skipped_blocks(block_table, 6)
|
||||
assert_block_id(removed, [])
|
||||
assert_block_id(block_table, [null_block_id] + original_block_ids[1:])
|
||||
|
||||
# 7 tokens are computed. Token 0-3 are out of the sliding window.
|
||||
# Block 1001 can be removed and block 1000 is already removed.
|
||||
removed = manager.remove_skipped_blocks(block_table, 7)
|
||||
assert_block_id(removed, [original_block_ids[1]])
|
||||
assert_block_id(block_table, [null_block_id] * 2 + original_block_ids[2:])
|
||||
|
||||
# 11 tokens are computed. Token 0-7 are out of the sliding window.
|
||||
# Block 1002 & 1003 can be removed now. Block 1003 represents a longer
|
||||
# sequence, and is expected to be evicted earlier than 1002, so the order
|
||||
# of removed blocks should be [1003, 1002].
|
||||
removed = manager.remove_skipped_blocks(block_table, 11)
|
||||
assert_block_id(removed, [original_block_ids[3], original_block_ids[2]])
|
||||
assert_block_id(block_table, [null_block_id] * 4 + original_block_ids[4:])
|
||||
84
tests/v1/e2e/test_correctness_sliding_window.py
Normal file
84
tests/v1/e2e/test_correctness_sliding_window.py
Normal file
@ -0,0 +1,84 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
from dataclasses import dataclass
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
|
||||
from ...core.block.e2e.test_correctness_sliding_window import (check_answers,
|
||||
prep_prompts)
|
||||
|
||||
|
||||
@dataclass
|
||||
class TestConfig:
|
||||
sliding_window: int
|
||||
ln_range: tuple[int, int]
|
||||
|
||||
|
||||
model_config = {
|
||||
"bigcode/starcoder2-3b": TestConfig(4096, (800, 1100)),
|
||||
"google/gemma-2-2b-it": TestConfig(4096, (400, 800)),
|
||||
}
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"model",
|
||||
[
|
||||
"bigcode/starcoder2-3b", # sliding window only
|
||||
"google/gemma-2-2b-it", # sliding window + full attention
|
||||
])
|
||||
@pytest.mark.parametrize("batch_size", [5])
|
||||
@pytest.mark.parametrize("seed", [1])
|
||||
def test_sliding_window_retrival(monkeypatch, model, batch_size, seed):
|
||||
"""
|
||||
The test does a bunch of assignments "x1 = 10\nx2 = 33\n..." and then
|
||||
asks for value of one of them (which is outside the sliding window).
|
||||
If we tell it upfront which we are going to be looking for, then
|
||||
it answers correctly (mostly).
|
||||
"""
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "1")
|
||||
|
||||
test_config = model_config[model]
|
||||
|
||||
llm = LLM(model=model)
|
||||
sampling_params = SamplingParams(temperature=0.0, max_tokens=100)
|
||||
|
||||
prompts, answer, indices = prep_prompts(batch_size,
|
||||
ln_range=test_config.ln_range)
|
||||
|
||||
check_length(prompts, llm, test_config.sliding_window)
|
||||
|
||||
# Fresh generation
|
||||
responses = llm.generate(prompts, sampling_params)
|
||||
check_answers(indices,
|
||||
answer,
|
||||
[response.outputs[0].text for response in responses],
|
||||
accept_rate=1.0)
|
||||
|
||||
# Re-generate with the same prompts to test prefix caching
|
||||
responses = llm.generate(prompts, sampling_params)
|
||||
check_answers(indices,
|
||||
answer,
|
||||
[response.outputs[0].text for response in responses],
|
||||
accept_rate=1.0)
|
||||
|
||||
|
||||
def check_length(prompts: list[str], llm: LLM, sliding_window: int):
|
||||
"""
|
||||
Check if the prompt length is valid, i.e., longer than the sliding window
|
||||
size and shorter than the model's max length.
|
||||
|
||||
Args:
|
||||
prompts: list of prompts
|
||||
llm: LLM object
|
||||
sliding_window: Sliding window size
|
||||
"""
|
||||
tokenizer = llm.get_tokenizer()
|
||||
max_model_len = llm.llm_engine.model_config.max_model_len
|
||||
assert any(
|
||||
len(tokenizer.encode(prompt)) > sliding_window
|
||||
for prompt in prompts), "Prompt is too short for test"
|
||||
assert all(
|
||||
len(tokenizer.encode(prompt)) <= max_model_len
|
||||
for prompt in prompts), "Prompt is too long for test"
|
||||
@ -23,7 +23,8 @@ PARAMS_MODELS_BACKENDS_TOKENIZER_MODE = [
|
||||
("mistralai/Ministral-8B-Instruct-2410", "xgrammar:disable-any-whitespace",
|
||||
"mistral"),
|
||||
("Qwen/Qwen2.5-1.5B-Instruct", "xgrammar:disable-any-whitespace", "auto"),
|
||||
("Qwen/Qwen2.5-1.5B-Instruct", "guidance:disable-any-whitespace", "auto"),
|
||||
#FIXME: This test is flaky on CI thus disabled
|
||||
#("Qwen/Qwen2.5-1.5B-Instruct", "guidance:disable-any-whitespace", "auto"),
|
||||
]
|
||||
|
||||
PARAMS_MODELS_TOKENIZER_MODE = [
|
||||
|
||||
@ -104,14 +104,6 @@ def test_enable_by_default_fallback(monkeypatch):
|
||||
assert envs.VLLM_USE_V1
|
||||
m.delenv("VLLM_USE_V1")
|
||||
|
||||
# Should fall back to V0 for experimental config.
|
||||
_ = AsyncEngineArgs(
|
||||
model=MODEL,
|
||||
enable_lora=True,
|
||||
).create_engine_config()
|
||||
assert not envs.VLLM_USE_V1
|
||||
m.delenv("VLLM_USE_V1")
|
||||
|
||||
# Should fall back to V0 for supported model.
|
||||
_ = AsyncEngineArgs(
|
||||
model=UNSUPPORTED_MODELS_V1[0]).create_engine_config()
|
||||
@ -125,7 +117,7 @@ def test_v1_llm_by_default(monkeypatch):
|
||||
m.delenv("VLLM_USE_V1")
|
||||
|
||||
# Should default to V1 for supported config.
|
||||
model = LLM(MODEL, enforce_eager=True)
|
||||
model = LLM(MODEL, enforce_eager=True, enable_lora=True)
|
||||
print(model.generate("Hello my name is"))
|
||||
assert hasattr(model.llm_engine, "engine_core")
|
||||
m.delenv("VLLM_USE_V1")
|
||||
|
||||
@ -32,7 +32,7 @@ TENSOR_PARALLEL_SIZES = [1]
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("max_tokens", [5])
|
||||
@pytest.mark.parametrize("tensor_parallel_size", TENSOR_PARALLEL_SIZES)
|
||||
def test_models(
|
||||
def test_basic(
|
||||
vllm_runner: type[VllmRunner],
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
model: str,
|
||||
@ -58,4 +58,5 @@ def test_models(
|
||||
vllm_outputs = vllm_model.generate_greedy(example_prompts,
|
||||
max_tokens)
|
||||
output = vllm_outputs[0][1]
|
||||
assert "1024" in output
|
||||
|
||||
assert "1024" in output or "0, 1" in output
|
||||
|
||||
146
tests/v1/tpu/test_perf.py
Normal file
146
tests/v1/tpu/test_perf.py
Normal file
@ -0,0 +1,146 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
"""A basic performance regression test for TPUs
|
||||
|
||||
Run `pytest tests/v1/tpu/test_perf.py`.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from tests.conftest import VllmRunner
|
||||
|
||||
|
||||
@dataclass
|
||||
class TestParams:
|
||||
model: str
|
||||
num_prompts: int
|
||||
prefix_len: int
|
||||
decode_len: int
|
||||
expected_avg_time: float
|
||||
err_tol: float
|
||||
|
||||
|
||||
TEST_PARAMS = [
|
||||
# TODO: Cannot run a series of tests because:
|
||||
# RuntimeError: Bad StatusOr access: UNKNOWN: TPU initialization failed:
|
||||
# open(/dev/vfio/0): Device or resource busy: Device or resource busy;
|
||||
# Couldn't open iommu group /dev/vfio/0
|
||||
# => Investigate
|
||||
|
||||
# TestParams(
|
||||
# model="Qwen/Qwen2.5-1.5B-Instruct",
|
||||
# num_prompts=1,
|
||||
# prefix_len=10,
|
||||
# decode_len=5,
|
||||
# expected_avg_time=0.03,
|
||||
# err_tol=0.01,
|
||||
# ),
|
||||
# TestParams(
|
||||
# model="Qwen/Qwen2.5-1.5B-Instruct",
|
||||
# num_prompts=10,
|
||||
# prefix_len=100,
|
||||
# decode_len=50,
|
||||
# expected_avg_time=0.234,
|
||||
# err_tol=0.020,
|
||||
# ),
|
||||
TestParams(
|
||||
model="Qwen/Qwen2.5-1.5B-Instruct",
|
||||
num_prompts=64,
|
||||
prefix_len=500,
|
||||
decode_len=50,
|
||||
|
||||
# (This is the active CI/CD instance)
|
||||
# commit id: ccb246776d93ef105904a8ec015b3587240a1183
|
||||
# tpu: v5lite (vllm CI/CD)
|
||||
expected_avg_time=1.4,
|
||||
err_tol=0.30,
|
||||
|
||||
# (TODO: There is no v6e in CI/CD currently)
|
||||
# commit id: ccb246776d93ef105904a8ec015b3587240a1183
|
||||
# tpu: v6e
|
||||
# expected_avg_time=1.5,
|
||||
# err_tol=0.20,
|
||||
),
|
||||
]
|
||||
|
||||
NUM_WARMUPS = 5
|
||||
NUM_RUNS = 10
|
||||
|
||||
MAX_MODEL_LEN = 1024
|
||||
MAX_NUM_SEQS = 32
|
||||
GPU_UTIL = 0.9
|
||||
|
||||
|
||||
@pytest.mark.skipif(not current_platform.is_tpu(),
|
||||
reason="This is a basic performance test for TPU only")
|
||||
@pytest.mark.parametrize("params", TEST_PARAMS)
|
||||
def test_perf(
|
||||
vllm_runner: type[VllmRunner],
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
params: TestParams,
|
||||
) -> None:
|
||||
tokenizer = get_tokenizer(params.model,
|
||||
tokenizer_mode="auto",
|
||||
trust_remote_code=True)
|
||||
|
||||
prompts = []
|
||||
for i in range(params.num_prompts):
|
||||
prefix_token_ids = np.random.randint(0,
|
||||
tokenizer.vocab_size,
|
||||
size=params.prefix_len).tolist()
|
||||
prompt = tokenizer.decode(prefix_token_ids)
|
||||
prompts.append(prompt)
|
||||
|
||||
print(
|
||||
"-- Running: num_prompts = {} prefix_len = {} decode_len = {}".format(
|
||||
len(prompts), params.prefix_len, params.decode_len))
|
||||
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("VLLM_USE_V1", "1")
|
||||
|
||||
sampling_params = SamplingParams(max_tokens=params.decode_len,
|
||||
temperature=1.0,
|
||||
min_p=0.0)
|
||||
|
||||
with vllm_runner(params.model,
|
||||
max_num_batched_tokens=MAX_MODEL_LEN,
|
||||
max_model_len=MAX_MODEL_LEN,
|
||||
max_num_seqs=MAX_NUM_SEQS,
|
||||
gpu_memory_utilization=GPU_UTIL,
|
||||
enforce_eager=False,
|
||||
tensor_parallel_size=1) as vllm_model:
|
||||
print(" -- Warmup / Compile")
|
||||
for i in range(NUM_WARMUPS):
|
||||
_ = vllm_model.generate(prompts, sampling_params)
|
||||
|
||||
print(" -- Benchmarking... ")
|
||||
times = []
|
||||
for i in range(NUM_RUNS):
|
||||
start_time = time.time()
|
||||
_ = vllm_model.generate(prompts, sampling_params)
|
||||
times.append(time.time() - start_time)
|
||||
|
||||
avg_time = sum(times) / len(times)
|
||||
|
||||
print(" -- avg_time = {}".format(avg_time))
|
||||
print(" -- expected_avg_time = {} with err_tol = {}".format(
|
||||
params.expected_avg_time, params.err_tol))
|
||||
diff = avg_time - params.expected_avg_time
|
||||
ok = diff < params.err_tol
|
||||
if diff < -params.err_tol:
|
||||
print(" !! WARNING !! Performance has improved by {}, "
|
||||
"it may be necessary to fine-tune the "
|
||||
"expected_avg_time = {}".format(
|
||||
-diff, params.expected_avg_time))
|
||||
|
||||
assert ok, " !! ERROR !! Regression detected"
|
||||
@ -1224,7 +1224,7 @@ def moe_wna16_gemm(input: torch.Tensor, output: torch.Tensor,
|
||||
|
||||
def topk_softmax(topk_weights: torch.Tensor, topk_ids: torch.Tensor,
|
||||
token_expert_indicies: torch.Tensor,
|
||||
gating_output: float) -> None:
|
||||
gating_output: torch.Tensor) -> None:
|
||||
torch.ops._moe_C.topk_softmax(topk_weights, topk_ids,
|
||||
token_expert_indicies, gating_output)
|
||||
|
||||
@ -1337,9 +1337,9 @@ def get_max_shared_memory_per_block_device_attribute(device: int) -> int:
|
||||
|
||||
# custom ar
|
||||
def init_custom_ar(ipc_tensors: list[torch.Tensor], rank_data: torch.Tensor,
|
||||
rank: int, full_nvlink: bool) -> int:
|
||||
rank: int, fully_connected: bool) -> int:
|
||||
return torch.ops._C_custom_ar.init_custom_ar(ipc_tensors, rank_data, rank,
|
||||
full_nvlink)
|
||||
fully_connected)
|
||||
|
||||
|
||||
def all_reduce(fa: int, inp: torch.Tensor, out: torch.Tensor, reg_buffer: int,
|
||||
@ -1369,6 +1369,18 @@ def register_graph_buffers(fa: int, handles: list[list[int]],
|
||||
torch.ops._C_custom_ar.register_graph_buffers(fa, handles, offsets)
|
||||
|
||||
|
||||
def allocate_shared_buffer_and_handle(size: int) -> tuple[int, torch.Tensor]:
|
||||
return torch.ops._C_custom_ar.allocate_shared_buffer_and_handle(size)
|
||||
|
||||
|
||||
def open_mem_handle(mem_handle: torch.Tensor):
|
||||
return torch.ops._C_custom_ar.open_mem_handle(mem_handle)
|
||||
|
||||
|
||||
def free_shared_buffer(ptr: int) -> None:
|
||||
torch.ops._C_custom_ar.free_shared_buffer(ptr)
|
||||
|
||||
|
||||
def get_flash_mla_metadata(
|
||||
cache_seqlens: torch.Tensor,
|
||||
num_heads_per_head_k: int,
|
||||
|
||||
@ -10,8 +10,6 @@ import numpy.typing as npt
|
||||
from huggingface_hub import hf_hub_download
|
||||
from PIL import Image
|
||||
|
||||
from vllm.multimodal.video import sample_frames_from_video
|
||||
|
||||
from .base import get_cache_dir
|
||||
|
||||
|
||||
@ -43,14 +41,19 @@ def video_to_ndarrays(path: str, num_frames: int = -1) -> npt.NDArray:
|
||||
|
||||
total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
|
||||
frames = []
|
||||
for i in range(total_frames):
|
||||
ret, frame = cap.read()
|
||||
if ret:
|
||||
frames.append(frame)
|
||||
cap.release()
|
||||
|
||||
num_frames = num_frames if num_frames > 0 else total_frames
|
||||
frame_indices = np.linspace(0, total_frames - 1, num_frames, dtype=int)
|
||||
for idx in range(total_frames):
|
||||
ok = cap.grab() # next img
|
||||
if not ok:
|
||||
break
|
||||
if idx in frame_indices: # only decompress needed
|
||||
ret, frame = cap.retrieve()
|
||||
if ret:
|
||||
frames.append(frame)
|
||||
|
||||
frames = np.stack(frames)
|
||||
frames = sample_frames_from_video(frames, num_frames)
|
||||
if len(frames) < num_frames:
|
||||
raise ValueError(f"Could not read enough frames from video file {path}"
|
||||
f" (expected {num_frames} frames, got {len(frames)})")
|
||||
|
||||
@ -317,8 +317,8 @@ class ModelConfig:
|
||||
) and backend == "FLASHINFER" and find_spec("flashinfer") is None:
|
||||
raise ValueError(
|
||||
"VLLM_ATTENTION_BACKEND is set to FLASHINFER, but flashinfer "
|
||||
"module was not found."
|
||||
"See https://github.com/vllm-project/vllm/blob/main/Dockerfile "
|
||||
"module was not found. See "
|
||||
"https://github.com/vllm-project/vllm/blob/main/docker/Dockerfile " # noqa: E501
|
||||
"for instructions on how to install it.")
|
||||
|
||||
# The tokenizer version is consistent with the model version by default.
|
||||
@ -1116,8 +1116,7 @@ class CacheConfig:
|
||||
is_attention_free: Whether the model is attention-free.
|
||||
num_gpu_blocks_override: Number of GPU blocks to use. This overrides the
|
||||
profiled num_gpu_blocks if specified. Does nothing if None.
|
||||
sliding_window: Sliding window size for the KV cache. Can not work with
|
||||
prefix caching enabled.
|
||||
sliding_window: Sliding window size for the KV cache.
|
||||
enable_prefix_caching: Whether to enable prefix caching.
|
||||
cpu_offload_gb: Size of the CPU offload buffer in GiB.
|
||||
"""
|
||||
@ -1606,11 +1605,13 @@ class ParallelConfig:
|
||||
if self.use_ray:
|
||||
from vllm.executor import ray_utils
|
||||
ray_utils.assert_ray_available()
|
||||
if current_platform.is_rocm():
|
||||
device_capability = current_platform.get_device_capability()
|
||||
if (current_platform.is_rocm() and device_capability is not None
|
||||
and device_capability < (9, 4)):
|
||||
self.disable_custom_all_reduce = True
|
||||
logger.info(
|
||||
"Disabled the custom all-reduce kernel because it is not "
|
||||
"supported on AMD GPUs.")
|
||||
"supported on AMD GPUs older than MI300X.")
|
||||
if self.ray_workers_use_nsight and not self.use_ray:
|
||||
raise ValueError("Unable to use nsight profiling unless workers "
|
||||
"run with Ray.")
|
||||
@ -2151,9 +2152,10 @@ class SpeculativeConfig:
|
||||
|
||||
# Replace hf_config for EAGLE draft_model
|
||||
if self.method == "eagle":
|
||||
if self.enable_chunked_prefill:
|
||||
if self.enable_chunked_prefill and not envs.VLLM_USE_V1:
|
||||
raise ValueError(
|
||||
"Chunked prefill and EAGLE are not compatible.")
|
||||
"Chunked prefill and EAGLE are not compatible "
|
||||
"when using V0.")
|
||||
|
||||
from vllm.transformers_utils.configs.eagle import (
|
||||
EAGLEConfig)
|
||||
@ -2358,12 +2360,10 @@ class SpeculativeConfig:
|
||||
return self.num_speculative_tokens
|
||||
|
||||
def __repr__(self) -> str:
|
||||
if self.prompt_lookup_max is not None and self.prompt_lookup_max > 0:
|
||||
draft_model = "ngram"
|
||||
else:
|
||||
draft_model = self.draft_model_config.model
|
||||
method = self.method
|
||||
model = None if method == "ngram" else self.draft_model_config.model
|
||||
num_spec_tokens = self.num_speculative_tokens
|
||||
return f"SpeculativeConfig({draft_model=}, {num_spec_tokens=})"
|
||||
return f"SpeculativeConfig({method=}, {model=}, {num_spec_tokens=})"
|
||||
|
||||
|
||||
@dataclass
|
||||
@ -2717,6 +2717,10 @@ def _get_and_verify_max_len(
|
||||
max_len_key = key if max_len < derived_max_model_len \
|
||||
else max_len_key
|
||||
derived_max_model_len = min(derived_max_model_len, max_len)
|
||||
# For Command-R / Cohere, Cohere2 / Aya Vision models
|
||||
if tmp_max_len := getattr(hf_config, "model_max_length", None):
|
||||
max_len_key = "model_max_length"
|
||||
derived_max_model_len = tmp_max_len
|
||||
|
||||
# If sliding window is manually disabled, max_length should be less
|
||||
# than the sliding window length in the model config.
|
||||
|
||||
@ -8,6 +8,7 @@
|
||||
# not sure why, they are created from a different context.
|
||||
# the only successful approach is to call cuda driver API in C.
|
||||
import dataclasses
|
||||
import gc
|
||||
import os
|
||||
from contextlib import contextmanager
|
||||
from typing import Any, Callable, Dict, Optional, Tuple, Union
|
||||
@ -175,7 +176,7 @@ class CuMemAllocator:
|
||||
str]] = None) -> None:
|
||||
"""
|
||||
Put the allocator in sleep mode.
|
||||
All data in the memory allocation with the specified tag will be
|
||||
All data in the memory allocation with the specified tag will be
|
||||
offloaded to CPU memory, and others will be discarded.
|
||||
|
||||
:param offload_tags: The tags of the memory allocation that will be
|
||||
@ -204,10 +205,13 @@ class CuMemAllocator:
|
||||
data.cpu_backup_tensor = cpu_backup_tensor
|
||||
unmap_and_release(handle)
|
||||
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
def wake_up(self):
|
||||
"""
|
||||
Wake up the allocator from sleep mode.
|
||||
All data that is previously offloaded will be loaded back to GPU
|
||||
All data that is previously offloaded will be loaded back to GPU
|
||||
memory, and the rest of the data will have empty memory."""
|
||||
for ptr, data in self.pointer_to_data.items():
|
||||
handle = data.handle
|
||||
@ -225,7 +229,7 @@ class CuMemAllocator:
|
||||
def use_memory_pool(self, tag: Optional[str] = None):
|
||||
"""
|
||||
A context manager to use the memory pool.
|
||||
All memory allocation created inside the context will be allocated
|
||||
All memory allocation created inside the context will be allocated
|
||||
in the memory pool, and has the specified tag.
|
||||
|
||||
:param tag: The tag of the memory allocation. If None, the default tag
|
||||
|
||||
@ -1,6 +1,5 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
import ctypes
|
||||
from contextlib import contextmanager
|
||||
from typing import List, Optional, Union
|
||||
|
||||
@ -10,7 +9,6 @@ from torch.distributed import ProcessGroup
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.distributed.device_communicators.cuda_wrapper import CudaRTLibrary
|
||||
from vllm.distributed.device_communicators.custom_all_reduce_utils import (
|
||||
gpu_p2p_access_check)
|
||||
from vllm.distributed.parallel_state import in_the_same_node_as
|
||||
@ -22,7 +20,7 @@ try:
|
||||
ops.meta_size()
|
||||
custom_ar = True
|
||||
except Exception:
|
||||
# For AMD GPUs and CPUs
|
||||
# For CPUs
|
||||
custom_ar = False
|
||||
|
||||
logger = init_logger(__name__)
|
||||
@ -71,7 +69,9 @@ class CustomAllreduce:
|
||||
|
||||
if not custom_ar:
|
||||
# disable because of missing custom allreduce library
|
||||
# e.g. in a non-cuda environment
|
||||
# e.g. in a non-GPU environment
|
||||
logger.info("Custom allreduce is disabled because "
|
||||
"of missing custom allreduce library")
|
||||
return
|
||||
|
||||
self.group = group
|
||||
@ -129,11 +129,10 @@ class CustomAllreduce:
|
||||
# test nvlink first, this will filter out most of the cases
|
||||
# where custom allreduce is not supported
|
||||
# this checks hardware and driver support for NVLink
|
||||
assert current_platform.is_cuda()
|
||||
from vllm.platforms.cuda import CudaPlatform
|
||||
cuda_platform: CudaPlatform = current_platform
|
||||
full_nvlink = cuda_platform.is_full_nvlink(physical_device_ids)
|
||||
if world_size > 2 and not full_nvlink:
|
||||
assert current_platform.is_cuda_alike()
|
||||
fully_connected = current_platform.is_fully_connected(
|
||||
physical_device_ids)
|
||||
if world_size > 2 and not fully_connected:
|
||||
logger.warning(
|
||||
"Custom allreduce is disabled because it's not supported on"
|
||||
" more than two PCIe-only GPUs. To silence this warning, "
|
||||
@ -142,7 +141,8 @@ class CustomAllreduce:
|
||||
# test P2P capability, this checks software/cudaruntime support
|
||||
# this is expensive to compute at the first time
|
||||
# then we cache the result
|
||||
if not _can_p2p(rank, world_size):
|
||||
# On AMD GPU, p2p is always enabled between XGMI connected GPUs
|
||||
if not current_platform.is_rocm() and not _can_p2p(rank, world_size):
|
||||
logger.warning(
|
||||
"Custom allreduce is disabled because your platform lacks "
|
||||
"GPU P2P capability or P2P test failed. To silence this "
|
||||
@ -154,7 +154,8 @@ class CustomAllreduce:
|
||||
# Meta data composes of two parts: meta data for synchronization and a
|
||||
# temporary buffer for storing intermediate allreduce results.
|
||||
self.meta_ptrs = self.create_shared_buffer(ops.meta_size() + max_size,
|
||||
group=group)
|
||||
group=group,
|
||||
uncached=True)
|
||||
# This is a pre-registered IPC buffer. In eager mode, input tensors
|
||||
# are first copied into this buffer before allreduce is performed
|
||||
self.buffer_ptrs = self.create_shared_buffer(max_size, group=group)
|
||||
@ -169,46 +170,11 @@ class CustomAllreduce:
|
||||
self.max_size = max_size
|
||||
self.rank = rank
|
||||
self.world_size = world_size
|
||||
self.full_nvlink = full_nvlink
|
||||
self.fully_connected = fully_connected
|
||||
self._ptr = ops.init_custom_ar(self.meta_ptrs, self.rank_data, rank,
|
||||
self.full_nvlink)
|
||||
self.fully_connected)
|
||||
ops.register_buffer(self._ptr, self.buffer_ptrs)
|
||||
|
||||
@staticmethod
|
||||
def create_shared_buffer(
|
||||
size_in_bytes: int,
|
||||
group: Optional[ProcessGroup] = None) -> List[int]:
|
||||
"""
|
||||
Creates a shared buffer and returns a list of pointers
|
||||
representing the buffer on all processes in the group.
|
||||
"""
|
||||
lib = CudaRTLibrary()
|
||||
pointer = lib.cudaMalloc(size_in_bytes)
|
||||
handle = lib.cudaIpcGetMemHandle(pointer)
|
||||
world_size = dist.get_world_size(group=group)
|
||||
rank = dist.get_rank(group=group)
|
||||
handles = [None] * world_size
|
||||
dist.all_gather_object(handles, handle, group=group)
|
||||
|
||||
pointers: List[int] = []
|
||||
for i, h in enumerate(handles):
|
||||
if i == rank:
|
||||
pointers.append(pointer.value) # type: ignore
|
||||
else:
|
||||
pointers.append(
|
||||
lib.cudaIpcOpenMemHandle(h).value) # type: ignore
|
||||
|
||||
return pointers
|
||||
|
||||
@staticmethod
|
||||
def free_shared_buffer(pointers: List[int],
|
||||
group: Optional[ProcessGroup] = None,
|
||||
rank: Optional[int] = None) -> None:
|
||||
if rank is None:
|
||||
rank = dist.get_rank(group=group)
|
||||
lib = CudaRTLibrary()
|
||||
lib.cudaFree(ctypes.c_void_p(pointers[rank]))
|
||||
|
||||
@contextmanager
|
||||
def capture(self):
|
||||
"""
|
||||
@ -255,7 +221,7 @@ class CustomAllreduce:
|
||||
return False
|
||||
# for 4 or more non NVLink-capable GPUs, custom allreduce provides
|
||||
# little performance improvement over NCCL.
|
||||
if self.world_size == 2 or self.full_nvlink:
|
||||
if self.world_size == 2 or self.fully_connected:
|
||||
return inp_size < self.max_size
|
||||
return False
|
||||
|
||||
@ -306,3 +272,30 @@ class CustomAllreduce:
|
||||
|
||||
def __del__(self):
|
||||
self.close()
|
||||
|
||||
@staticmethod
|
||||
def create_shared_buffer(size_in_bytes: int,
|
||||
group: Optional[ProcessGroup] = None,
|
||||
uncached: Optional[bool] = False) -> List[int]:
|
||||
pointer, handle = ops.allocate_shared_buffer_and_handle(size_in_bytes)
|
||||
|
||||
world_size = dist.get_world_size(group=group)
|
||||
rank = dist.get_rank(group=group)
|
||||
handles = [None] * world_size
|
||||
dist.all_gather_object(handles, handle, group=group)
|
||||
|
||||
pointers: List[int] = []
|
||||
for i, h in enumerate(handles):
|
||||
if i == rank:
|
||||
pointers.append(pointer) # type: ignore
|
||||
else:
|
||||
pointers.append(ops.open_mem_handle(h))
|
||||
return pointers
|
||||
|
||||
@staticmethod
|
||||
def free_shared_buffer(pointers: List[int],
|
||||
group: Optional[ProcessGroup] = None,
|
||||
rank: Optional[int] = 0) -> None:
|
||||
if rank is None:
|
||||
rank = dist.get_rank(group=group)
|
||||
ops.free_shared_buffer(pointers[rank])
|
||||
|
||||
@ -1468,15 +1468,21 @@ class EngineArgs:
|
||||
|
||||
# Only Ngram speculative decoding so far.
|
||||
is_ngram_enabled = False
|
||||
is_eagle_enabled = False
|
||||
if self.speculative_config is not None:
|
||||
# This is supported but experimental (handled below).
|
||||
if (("method" in self.speculative_config
|
||||
and self.speculative_config["method"] in ("ngram", "[ngram]"))
|
||||
or
|
||||
("model" in self.speculative_config and
|
||||
self.speculative_config["model"] in ("ngram", "[ngram]"))):
|
||||
is_ngram_enabled = True
|
||||
speculative_method = self.speculative_config.get("method")
|
||||
if speculative_method:
|
||||
if speculative_method in ("ngram", "[ngram]"):
|
||||
is_ngram_enabled = True
|
||||
elif speculative_method == "eagle":
|
||||
is_eagle_enabled = True
|
||||
else:
|
||||
speculative_model = self.speculative_config.get("model")
|
||||
if speculative_model in ("ngram", "[ngram]"):
|
||||
is_ngram_enabled = True
|
||||
if not (is_ngram_enabled or is_eagle_enabled):
|
||||
# Other speculative decoding methods are not supported yet.
|
||||
_raise_or_fallback(feature_name="Speculative Decoding",
|
||||
recommend_to_remove=False)
|
||||
return False
|
||||
@ -1512,10 +1518,6 @@ class EngineArgs:
|
||||
and _warn_or_fallback("Engine in background thread")):
|
||||
return False
|
||||
|
||||
# LoRA is supported on V1, but off by default for now.
|
||||
if self.enable_lora and _warn_or_fallback("LORA"):
|
||||
return False
|
||||
|
||||
# PP is supported on V1 with Ray distributed executor,
|
||||
# but off for MP distributed executor for now.
|
||||
if (self.pipeline_parallel_size > 1
|
||||
@ -1527,10 +1529,14 @@ class EngineArgs:
|
||||
if is_ngram_enabled and _warn_or_fallback("ngram"):
|
||||
return False
|
||||
|
||||
# Eagle is under development, so we don't support it yet.
|
||||
if is_eagle_enabled and _warn_or_fallback("Eagle"):
|
||||
return False
|
||||
|
||||
# Non-CUDA is supported on V1, but off by default for now.
|
||||
not_cuda = not current_platform.is_cuda()
|
||||
if not_cuda and _warn_or_fallback( # noqa: SIM103
|
||||
current_platform.device_type):
|
||||
current_platform.device_name):
|
||||
return False
|
||||
#############################################################
|
||||
|
||||
|
||||
@ -487,7 +487,8 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
|
||||
return "<|endoftext10|>" # 200010 (see vocab.json in hf model)
|
||||
if model_type in ("minicpmo", "minicpmv"):
|
||||
return "(<image>./</image>)"
|
||||
if model_type in ("blip-2", "fuyu", "paligemma", "pixtral"):
|
||||
if model_type in ("blip-2", "fuyu", "paligemma", "pixtral",
|
||||
"mistral3"):
|
||||
# These models do not use image tokens in the prompt
|
||||
return None
|
||||
if model_type == "qwen":
|
||||
@ -495,8 +496,9 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
|
||||
if model_type.startswith("llava"):
|
||||
return self._cached_token_str(self._tokenizer,
|
||||
hf_config.image_token_index)
|
||||
if model_type in ("chameleon", "deepseek_vl_v2", "internvl_chat",
|
||||
"skywork_chat", "NVLM_D", "h2ovl_chat"):
|
||||
if model_type in ("aya_vision", "chameleon", "deepseek_vl_v2",
|
||||
"internvl_chat", "skywork_chat", "NVLM_D",
|
||||
"h2ovl_chat"):
|
||||
return "<image>"
|
||||
if model_type == "mllama":
|
||||
return "<|image|>"
|
||||
|
||||
@ -4,7 +4,6 @@ import argparse
|
||||
|
||||
import uvloop
|
||||
|
||||
from vllm.engine.arg_utils import EngineArgs
|
||||
from vllm.entrypoints.cli.types import CLISubcommand
|
||||
from vllm.entrypoints.openai.api_server import run_server
|
||||
from vllm.entrypoints.openai.cli_args import (make_arg_parser,
|
||||
@ -21,14 +20,9 @@ class ServeSubcommand(CLISubcommand):
|
||||
|
||||
@staticmethod
|
||||
def cmd(args: argparse.Namespace) -> None:
|
||||
# The default value of `--model`
|
||||
if args.model != EngineArgs.model:
|
||||
raise ValueError(
|
||||
"With `vllm serve`, you should provide the model as a "
|
||||
"positional argument instead of via the `--model` option.")
|
||||
|
||||
# EngineArgs expects the model name to be passed as --model.
|
||||
args.model = args.model_tag
|
||||
# If model is specified in CLI (as positional arg), it takes precedence
|
||||
if hasattr(args, 'model_tag') and args.model_tag is not None:
|
||||
args.model = args.model_tag
|
||||
|
||||
uvloop.run(run_server(args))
|
||||
|
||||
@ -41,10 +35,12 @@ class ServeSubcommand(CLISubcommand):
|
||||
serve_parser = subparsers.add_parser(
|
||||
"serve",
|
||||
help="Start the vLLM OpenAI Compatible API server",
|
||||
usage="vllm serve <model_tag> [options]")
|
||||
usage="vllm serve [model_tag] [options]")
|
||||
serve_parser.add_argument("model_tag",
|
||||
type=str,
|
||||
help="The model tag to serve")
|
||||
nargs='?',
|
||||
help="The model tag to serve "
|
||||
"(optional if specified in config)")
|
||||
serve_parser.add_argument(
|
||||
"--config",
|
||||
type=str,
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user