Compare commits

...

314 Commits

Author SHA1 Message Date
e17250f0d2 fix precommit 2025-06-18 21:17:43 -07:00
4959915089 [Quantization] Modify the logic of BNB double quantization (#19742)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-06-19 03:52:09 +00:00
8d1e89d946 [Misc][ROCm] Enforce no unused variable in ROCm C++ files (#19796)
Signed-off-by: Lu Fang <lufang@fb.com>
2025-06-18 20:25:15 -07:00
36239f79dd Fix FA2 fallback for Blackwell V1 (#19781)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-06-19 09:53:55 +08:00
dfada85eee [Frontend] Expose custom args in OpenAI APIs (#16862)
Signed-off-by: Andrew Feldman <afeldman@neuralmagic.com>
Signed-off-by: Andrew Feldman <afeldman@redhat.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
2025-06-18 17:41:11 -07:00
ed33349738 [BugFix] Fix use_cudagraph=False (#19612)
Signed-off-by: Richard Zou <zou3519@gmail.com>
2025-06-19 08:23:12 +08:00
d49adea1f9 [Multimodal] Use fast processor for Qwen2/2.5-VL (#19789) 2025-06-18 15:49:40 -07:00
14fdd21d39 [Core] More fixes to MultiModalEmbeddings type handling (#19715)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-06-18 22:48:29 +00:00
04fefe7c9a [TPU] Update torch-xla version to include paged attention tuned block change (#19813)
Signed-off-by: Qiliang Cui <derrhein@gmail.com>
2025-06-18 22:41:13 +00:00
3b523e38d9 [Core] Do not copy array during hashing (#19484)
Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>
2025-06-18 15:36:55 -07:00
16c16301c8 Disable "Forbid direct 'import triton'" check for vllm/triton_utils/importing.py in an extensible way (#19783)
Signed-off-by: Andrew Feldman <afeldman@redhat.com>
2025-06-18 15:08:00 -07:00
9206d0ff01 docs: fix Slack bulletpoint in README (#19811)
Signed-off-by: Nathan Weinberg <nweinber@redhat.com>
2025-06-18 20:47:08 +00:00
a89209b78d [v1] Support mamba2 (#19327)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-06-18 20:34:15 +00:00
ffacb222cb [Docs] Add Huzaifa Sidhpurwala to vuln mgmt team doc (#19808)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-06-18 20:22:28 +00:00
12575cfa7a [Bugfix] fix RAY_CGRAPH_get_timeout is not set successfully (#19725)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-06-18 10:26:16 -07:00
8b6e1d639c [Hardware][AMD] integrate aiter chunked prefill into vllm (#18596)
Signed-off-by: fsx950223 <fsx950223@outlook.com>
Signed-off-by: charlifu <charlifu@amd.com>
Co-authored-by: fsx950223 <fsx950223@outlook.com>
Co-authored-by: charlifu <charlifu@amd.com>
2025-06-18 08:46:51 -07:00
735a9de71f [Qwen] Add tagging rule for Qwen related PRs (#19799)
Signed-off-by: Lu Fang <lufang@fb.com>
2025-06-18 14:26:43 +00:00
257ab95439 [Platform] Allow platform use V1 Engine by default (#19792)
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-06-18 13:03:36 +00:00
cca91a7a10 [doc] fix the incorrect label (#19787)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-18 10:30:58 +00:00
f04d604567 [Minor] Zero-initialize attn output buffer (#19784)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-06-18 06:59:27 +00:00
19a53b2783 [V1] Decouple GPU and TPU InputBatch (#19778)
Signed-off-by: Andrew Feldman <afeldman@redhat.com>
2025-06-18 06:38:13 +00:00
eccdc8318c [V1][P/D] An native implementation of xPyD based on P2P NCCL (#18242)
Signed-off-by: Abatom <abzhonghua@gmail.com>
2025-06-18 06:32:36 +00:00
5f52a84685 [V1] Add API docs for EncoderCacheManager (#19294)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-06-18 13:37:01 +08:00
d4629dc43f [Misc] Add __str__ for RequestStatus (#19780)
Signed-off-by: Linkun Chen <github@lkchen.net>
2025-06-18 03:03:01 +00:00
6e9cc73f67 [MISC] correct DeviceConfig device field static type analysis (#19699)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
2025-06-17 17:21:50 -07:00
c53711bd63 [MISC] correct copy_blocks src_to_dists param type (#19696)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
2025-06-17 17:21:06 -07:00
dac8cc49f4 [TPU] Update torch version to include paged attention kernel change (#19706)
Signed-off-by: Chenyaaang <chenyangli@google.com>
2025-06-17 22:24:49 +00:00
a44b1c951d [Feature][ROCm] Add full graph capture support for TritonAttentionBackend (#19158)
Signed-off-by: charlifu <charlifu@amd.com>
2025-06-17 17:03:06 -04:00
b447624ee3 [Bugfix] Fix faulty triton importing logic when using Ray for DP (#19734)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-06-17 20:59:29 +00:00
cda92307c1 [Misc] Update lmcache connector with the latest connector apis (#19441)
Signed-off-by: YaoJiayi <120040070@link.cuhk.edu.cn>
2025-06-17 19:57:54 +00:00
bf57ccc5c2 Remove sm120 arch from sm100 cutlass kernel arch list (#19716)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-06-17 11:49:39 -07:00
ffb2cd6b54 [Perf] Optimize moe_align_block_size CUDA kernel (#19572)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
2025-06-17 11:49:26 -07:00
ca94d7fa00 [Bugfix] Update multimodel models mapping to fit new checkpoint after Transformers v4.52 (#19151)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-06-17 15:58:38 +00:00
5a1c2e15d8 [Mis] remove duplicate engine status checks (#19647)
Signed-off-by: googs1025 <googs1025@gmail.com>
2025-06-17 08:17:38 -07:00
4c8f64faa7 [V1][Kernel] Flashinfer HND KV cache layout (#19280)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-06-17 09:09:22 -04:00
93aee29fdb [doc] split "Other AI Accelerators" tabs (#19708) 2025-06-17 22:05:29 +09:00
154d063b9f [doc][mkdocs] Add edit button to documentation (#19637)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-17 11:10:31 +00:00
ccd7c05089 [Kernel] Add Split-KV Support to Unified Triton Attention Kernel (#19152)
Signed-off-by: Jan van Lunteren <jvl@zurich.ibm.com>
2025-06-17 10:45:07 +00:00
c48c6c4008 Add a doc on how to update PyTorch version (#19705) 2025-06-17 18:10:37 +08:00
aed8468642 [Doc] Add missing llava family multi-image examples (#19698)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-06-17 07:05:21 +00:00
5c76b9cdaf [Core] add remove_seq_from_computed_blocks_tracker to BlockSpaceManager (#19686)
Signed-off-by: 刘全 <quan.liu2@dbappsecurity.com.cn>
Co-authored-by: 刘全 <quan.liu2@dbappsecurity.com.cn>
2025-06-17 04:40:58 +00:00
ddfed314f9 Fixes IMA for TP w/ flex-attention (#19712)
Signed-off-by: drisspg <drisspguessous@gmail.com>
2025-06-17 04:01:50 +00:00
5b3ad5ecf2 [DOC] fix doc typos (#19600)
Signed-off-by: Di Liu <liu-di@sjtu.edu.cn>
2025-06-17 11:34:53 +08:00
ede5c4ebdf [Frontend] add chunking audio for > 30s audio (#19597)
Signed-off-by: nguyenhoangthuan99 <thuanhppro12@gmail.com>
2025-06-17 11:34:00 +08:00
07334959d8 [Wheel Size] Only build FA2 8.0+PTX (#19336) 2025-06-17 12:32:49 +09:00
119f683949 [doc] add project flag to gcloud TPU command (#19664)
Signed-off-by: David Xia <david@davidxia.com>
2025-06-17 01:00:09 +00:00
0860087aff [Fix] Fall back to Gloo when NCCL backend is unavailable (#19641)
Signed-off-by: conroy-cheers <conroy@corncheese.org>
2025-06-17 08:42:14 +08:00
6bc7b57315 [Quantization] Remove FP4 emulation; Fall-back to marlin for device < 100 (#19563) 2025-06-16 17:33:51 -04:00
90f9c2eb5c [V1] Change return type on get_multimodal_embeddings() (#19446)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-06-16 13:32:15 -04:00
387bdf0ab9 [Model] Add support for MiniMaxM1ForCausalLM (shares architecture with MiniMaxText01ForCausalLM) (#19677)
Signed-off-by: QscQ <qscqesze@gmail.com>
2025-06-16 09:47:14 -07:00
5e5baa91aa [Kernels] Use empty for modular MoE workspaces (#19667)
Signed-off-by: Bill Nell <bnell@redhat.com>
2025-06-16 14:58:01 +00:00
836d4ce140 [Bugfix] fix missing 'finish_reason': null in streaming chat (#19662)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-06-16 14:10:39 +00:00
c3fec47bb7 [MISC] bump huggingface_hub pkg to 0.33.0 (#19547)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
2025-06-16 05:22:28 -07:00
1173804dca [Bugfix] Fix TP inference for Flex attention backend (#19657)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-06-16 11:21:37 +00:00
4d5424029b [Feature]:Allow for Granite MoE Hybrid models with _only_ shared experts. (#19652)
Signed-off-by: Shawn Tan <shawntan@ibm.com>
2025-06-16 11:14:18 +00:00
3e7506975c [DOC] Add reasoning capability to vLLM streamlit code (#19557) 2025-06-16 07:09:12 -04:00
ee35e96ac3 [BugFix] Don't catch BaseException when dumping execute_model errors (#19626)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-06-16 11:01:08 +00:00
dec66d253b [Kernel] GGUF MMVQ kernel for multiple input vectors (#18754)
Signed-off-by: SzymonOzog <szymon.ozog@gmail.com>
2025-06-16 17:33:26 +08:00
8d120701fd [Docs] Move multiproc doc to v1 dir (#19651)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-06-16 09:10:12 +00:00
f40f763f12 [CI] Add mteb testing for rerank models (#19344) 2025-06-16 01:36:43 -07:00
26bc46ef89 [MISC] typo fix (#19672)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
2025-06-16 07:18:49 +00:00
a77aea59fd [TPU] support attention head dim smaller than 128 (#19620)
Signed-off-by: Chengji Yao <chengjiyao@google.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
2025-06-16 06:40:53 +00:00
b692e9cd07 [Misc] Fix skipped max-model-len validation when deriving max model length from tokenizer config (#19660)
Signed-off-by: Ye (Charlotte) Qi <yeq@meta.com>
2025-06-16 06:30:29 +00:00
367871a469 [Misc][Frontend] passthrough bad_words (#19564)
Signed-off-by: Francesco Bertolotti <francesco.bertolotti@igenius.ai>
Co-authored-by: Francesco Bertolotti <francesco.bertolotti@igenius.ai>
Co-authored-by: Aaron Pham <Aaronpham0103@gmail.com>
2025-06-16 05:05:13 +00:00
92183b41f3 [Bugfix][Core] Prefix caching causes incorrect outputs due to outdated ComputedBlocksTracker (#18957)
Signed-off-by: 刘全 <quan.liu2@dbappsecurity.com.cn>
Co-authored-by: 刘全 <quan.liu2@dbappsecurity.com.cn>
2025-06-15 21:56:37 -07:00
c6703d1e0d [MISC] Remove unused variableds in C++ (#19609)
Signed-off-by: Lu Fang <lufang@fb.com>
2025-06-15 20:05:28 -07:00
a5e7242d5f [Misc] Remove duplicate multiproc method setting for CPU platform (#19649)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-06-16 02:26:58 +00:00
91b2c17a55 [CI/Build] Fix torch nightly CI dependencies part 2 (#19589) 2025-06-15 20:01:10 +08:00
055915e6ce Enable prefix caching with full cuda graphs (#19617)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-06-15 01:05:05 -07:00
3d330c4c09 [Benchmark] Refactor benchmark script for fp8 & int8 (#19627)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-06-15 15:15:37 +08:00
0b73736a0d [Kernel] Raise verbose error and consolidate num_heads/num_kv_heads divisibility check (#19339)
Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
2025-06-15 13:43:48 +08:00
ee1531bc38 [Bugfix][2/n] Fix speculative decoding CI - Fix test_ngram_e2e_greedy_correctness (#19644) 2025-06-14 21:15:41 -07:00
e13945f9dd [Perf] Further tunings for SM100 FP8 CUTLASS kernel (#19566) 2025-06-14 17:25:10 -07:00
08500011d3 [Fix] Convert kv_transfer_config from dict to KVTransferConfig (#19262) 2025-06-14 12:32:07 -07:00
861a0a0a39 [Bugfix] Don't attempt to use triton if no driver is active (#19561) 2025-06-14 12:30:54 -07:00
bc956b38d0 Only build CUTLASS MoE kernels on Hopper (#19648) 2025-06-14 11:44:15 -07:00
294fc1e2c9 [Hardware][NVIDIA][kernel] Fp4 MOE quant kernel optimization (#19500) 2025-06-14 09:34:28 -07:00
2db9044ab6 [Bugfix] Fix auto dtype casting for BatchFeature (#19316)
Signed-off-by: Isotr0py <2037008807@qq.com>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-06-14 15:13:08 +00:00
6fa718a460 [Misc] Modularize CLI Argument Parsing in Benchmark Scripts (#19593)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-14 16:54:52 +08:00
06be858828 [Bugfix] Fix the speculative decoding test by setting the target dtype (#19633) 2025-06-13 20:57:32 -07:00
d1e34cc9ac [V1][Metrics] Deprecate metrics with gpu_ prefix for non GPU specific metrics. (#18354)
Signed-off-by: Saheli Bhattacharjee <saheli@krai.ai>
2025-06-14 11:07:36 +08:00
bd517eb9fe [BugFix] Fix DP Coordinator incorrect debug log message (#19624)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-06-14 00:18:03 +00:00
d65668b4e8 Adding "AMD: Multi-step Tests" to amdproduction. (#19508)
Signed-off-by: Yida Wu <yidawu@alumni.cmu.edu>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-06-13 17:08:51 -07:00
aafbbd981f [torch.compile] Use custom ops when use_inductor=False (#19618) 2025-06-13 15:05:54 -07:00
0f0874515a [Doc] Add troubleshooting section to k8s deployment (#19377)
Signed-off-by: Anna Pendleton <pendleton@google.com>
2025-06-13 21:47:51 +00:00
3597b06a4f [CUDA] Enable full cudagraph for FlashMLA (#18581)
Signed-off-by: luka <luka@neuralmagic.com>
2025-06-13 18:12:26 +00:00
1015296b79 [doc][mkdocs] fix the duplicate Supported features sections in GPU docs (#19606)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-13 16:25:08 +00:00
ce9dc02c93 [Refactor] Remove unused variables in moe_permute_unpermute_kernel.inl (#19573)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-06-13 06:12:15 -07:00
a24cb91600 [Model] Fix minimax model cache & lm_head precision (#19592)
Signed-off-by: qingjun <qingjun@minimaxi.com>
2025-06-13 12:08:20 +00:00
7e8d97dd3f [BugFix] Honor enable_caching in connector-delayed kvcache load case (#19435)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-06-13 09:46:32 +00:00
d70bc7c029 [torch.compile] reorganize the cache directory to support compiling multiple models (#19064)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-06-13 15:23:25 +08:00
ce688ad46e use base version for version comparison (#19587)
Signed-off-by: Boyuan Feng <boyuan@meta.com>
2025-06-13 15:09:34 +08:00
cefdb9962d [Fix] The zip function in Python 3.9 does not have the strict argument (#19549)
Signed-off-by: 汪志鹏 <wangzhipeng628@gmail.com>
2025-06-13 14:57:48 +08:00
ace5cdaff0 [Fix] bump mistral common to support magistral (#19533)
Signed-off-by: 汪志鹏 <wangzhipeng628@gmail.com>
2025-06-12 22:28:12 -07:00
6458721108 [CPU] Refine default config for the CPU backend (#19539)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
2025-06-13 13:27:39 +08:00
bb4a0decef [Misc] Correct broken docs link (#19553)
Signed-off-by: Zerohertz <ohg3417@gmail.com>
2025-06-12 22:27:13 -07:00
c707cfc12e [doc] fix incorrect link (#19586)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-13 04:26:09 +00:00
7b3c9ff91d [Doc] uses absolute links for structured outputs (#19582)
Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
2025-06-13 03:35:17 +00:00
c68698b326 [Bugfix] Fix EAGLE vocab embedding for multimodal target model (#19570)
Signed-off-by: qizixi <qizixi@meta.com>
2025-06-12 23:09:19 -04:00
e3b12667d4 [BugFix] : Fix Batched DeepGemm Experts (#19515)
Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
2025-06-12 20:43:02 -06:00
e6aab5de29 Revert "[Build/CI] Add tracing deps to vllm container image (#15224)" (#19378) 2025-06-12 17:26:40 -07:00
c57bb199b3 [V1] Resolve failed concurrent structured output requests (#19565)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-06-12 23:30:09 +00:00
dba68f9159 [Doc] Unify structured outputs examples (#18196)
Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
2025-06-12 22:50:31 +00:00
a3319f4f04 [Bugfix] Enforce contiguous input for dynamic_per_token FP8/INT8 quant (#19452)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-06-12 15:39:15 -04:00
9d880f594d [Misc] Turn MOE_DP_CHUNK_SIZE into an env var (#19506) 2025-06-12 18:01:16 +00:00
017ef648e9 [Spec Decode][Benchmark] Generalize spec decode offline benchmark to more methods and datasets (#18847) 2025-06-12 10:30:56 -07:00
4b25ab14e2 [doc] Make top navigation sticky (#19540)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-12 15:48:11 +00:00
f98548b9da [torch.compile][ROCm] Fuse quantization onto attention using a torch.compile pass (#16756)
Signed-off-by: Luka Govedič <lgovedic@redhat.com>
Co-authored-by: Sage Moore <sage@neuralmagic.com>
2025-06-12 08:31:04 -07:00
96846bb360 Fix TorchAOConfig skip layers (#19265)
Signed-off-by: mobicham <hicham@mobiuslabs.com>
2025-06-12 22:22:53 +08:00
b6efafd9e4 [Perf] Vectorize static / dynamic INT8 quant kernels (#19233)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-06-12 06:51:41 -07:00
1129e2b1ab [V1][NixlConnector] Drop num_blocks check (#19532)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-06-12 12:36:14 +00:00
c742438f8b [Doc] Add V1 column to supported models list (#19523)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-06-12 19:16:44 +08:00
73e2e0118f [Quantization] Improve AWQ logic (#19431)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-06-12 11:02:11 +00:00
c9280e6346 [Bugfix] Respect num-gpu-blocks-override in v1 (#19503)
Signed-off-by: Jon Swenson <jmswen@gmail.com>
2025-06-12 11:00:23 +00:00
af09b3f0a0 [Bugfix][V1] Allow manual FlashAttention for Blackwell (#19492)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-06-12 10:40:24 +00:00
4f6c42fa0a [Security] Prevent new imports of (cloud)pickle (#18018)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
Co-authored-by: Aaron Pham <Aaronpham0103@gmail.com>
2025-06-12 10:30:17 +00:00
dff680001d Fix typo (#19525)
Signed-off-by: 2niuhe <carlton2tang@gmail.com>
2025-06-12 09:24:45 +00:00
2e090bd5df [AMD][Kernel][BugFix] fix test_rocm_compressed_tensors_w8a8 for rocm (#19509)
Signed-off-by: Randall Smith <Randall.Smith@amd.com>
2025-06-12 07:14:24 +00:00
1b0b065eb5 [BugFix] Handle missing sep_token for Qwen3-Reranker in Score API (#19522)
Signed-off-by: strutive07 <strutive07@gmail.com>
2025-06-12 07:00:47 +00:00
d5bdf899e4 [BugFix] Work-around incremental detokenization edge case error (#19449)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-06-12 06:43:20 +00:00
7e3e74c97c [Frontend] Improve error message in tool_choice validation (#19239)
Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
2025-06-12 01:13:00 -04:00
3f6341bf7f Add Triton Fused MoE kernel config for E=16 on B200 (#19518)
Signed-off-by: Brayden Zhong <b8zhong@uwaterloo.ca>
2025-06-12 04:31:51 +00:00
e5d35d62f5 [BugFix] Force registration of w8a8_block_fp8_matmul_deepgemm via lazy import (#19514)
Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
2025-06-12 04:28:12 +00:00
2f1c19b245 [CI] change spell checker from codespell to typos (#18711)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
2025-06-11 19:57:10 -07:00
42f52cc95b [CI/Build] Fix torch nightly CI dependencies (#19505)
Signed-off-by: Richard Zou <zou3519@gmail.com>
2025-06-11 14:40:42 -07:00
97a9465bbc [UX] Add Feedback During CUDAGraph Capture (#19501)
Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
2025-06-11 21:09:05 +00:00
c7ea0b56cd [AMD] [Quantization] Add override flag for attention dtype instead of using kv_cache_dtype trigger (#17331)
Signed-off-by: Randall Smith <Randall.Smith@amd.com>
2025-06-11 15:53:28 -04:00
29fa5cac1c [Kernels] Add activation chunking logic to FusedMoEModularKernel (#19168)
Signed-off-by: Bill Nell <bnell@redhat.com>
2025-06-11 12:53:10 -04:00
b2d9be6f7d [Docs] Remove WIP features in V1 guide (#19498)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-06-11 09:15:03 -07:00
04a55612dd [Misc] Fix misleading ROCm warning (#19486)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-06-12 00:12:10 +08:00
89b0f84e17 [doc] fix "Other AI accelerators" getting started page (#19457)
Signed-off-by: David Xia <david@davidxia.com>
2025-06-11 16:11:17 +00:00
497a91e9f7 [CI] Update FlashInfer to 0.2.6.post1 (#19297)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-06-11 22:57:28 +08:00
943ffa5703 [Bugfix] Update the example code, make it work with the latest lmcache (#19453)
Signed-off-by: Runzhen Wang <wangrunzhen@gmail.com>
2025-06-11 12:42:20 +00:00
5c8d34a42c Support no privileged mode on CPU for docker and kubernetes deployments (#19241)
Signed-off-by: Tsai, Louie <louie.tsai@intel.com>
2025-06-11 04:11:47 -07:00
3c8694eabe Fix some typo (#19475)
Signed-off-by: ximing.wxm <ximing.wxm@antgroup.com>
Co-authored-by: ximing.wxm <ximing.wxm@antgroup.com>
2025-06-11 10:36:04 +00:00
7484e1fce2 Add cache to cuda get_device_capability (#19436)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-06-11 17:37:05 +08:00
a2142f0196 Support non-string values in JSON keys from CLI (#19471)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-06-11 09:34:04 +00:00
871d6b7c74 [Misc] Reduce warning message introduced in env_override (#19476)
Signed-off-by: Lu Fang <lufang@fb.com>
2025-06-11 17:29:54 +08:00
29a38f0352 [Doc] Support "important" and "announcement" admonitions (#19479)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-06-11 01:39:58 -07:00
a5115f4ff5 [Doc] Fix quantization link titles (#19478)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-06-11 01:27:22 -07:00
68b4a26149 [Doc] Update V1 User Guide for Hardware and Models (#19474)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-06-11 00:49:06 -07:00
b8e809a057 [Kernel] Support deep_gemm for linear methods (#19085)
Signed-off-by: artetaout <lulala341@gmail.com>
2025-06-11 15:14:45 +08:00
5039ec2336 [ROCm] Add rules to automatically label ROCm related PRs (#19405)
Signed-off-by: Lu Fang <lufang@fb.com>
2025-06-11 15:09:18 +08:00
7c644ab6d5 Fix Typo in Documentation and Function Name (#19442) 2025-06-10 22:44:11 -07:00
2d40665fe8 Add fused MOE config for Qwen3 30B A3B on B200 (#19455)
Signed-off-by: Junhao Li <junhao@ubicloud.com>
2025-06-11 13:43:46 +08:00
96ada386b7 [Misc] Remove unused MultiModalHasher.hash_prompt_mm_data (#19422)
Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>
2025-06-11 05:18:57 +00:00
1e473b3010 [CI] Disable failing GGUF model test (#19454)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-06-11 05:12:38 +00:00
2b1e2111b0 Fix test_max_model_len in tests/entrypoints/llm/test_generate.py (#19451)
Signed-off-by: Lu Fang <lufang@fb.com>
2025-06-11 12:54:59 +08:00
a45b979d9f [BugFix] Fix docker build cpu-dev image error (#19394)
Signed-off-by: niu_he <carlton2tang@gmail.com>
2025-06-10 20:56:40 -07:00
3952731e8f [New Model]: Support Qwen3 Embedding & Reranker (#19260) 2025-06-10 20:07:30 -07:00
77f0d465d0 [BugFix] Allow use_cudagraph to work with dynamic VLLM_USE_V1 (#19390)
Signed-off-by: rzou <zou3519@gmail.com>
2025-06-11 07:54:41 +08:00
22c3c0aa4a Add H20-3e fused MoE kernel tuning configs for Qwen3-235B-A22B-FP8 (#19401)
Signed-off-by: 许文卿 <xwq391974@alibaba-inc.com>
2025-06-11 07:23:57 +08:00
33f8dba7c6 [Model] use AutoWeightsLoader for commandr (#19399)
Signed-off-by: py-andy-c <pychen1017@gmail.com>
2025-06-10 22:42:21 +00:00
5241ca50d6 [ROCm][V1] Adding ROCm to the list of plaforms using V1 by default (#19440)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2025-06-10 22:06:15 +00:00
da9b523ce1 [Docs] Note that alternative structured output backends are supported (#19426)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-06-10 16:20:00 +00:00
b6553be1bc [Misc] Slight improvement of the BNB (#19418)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Isotr0py <2037008807@qq.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-06-10 13:51:49 +00:00
64a9af5afa Simplify ep kernels installation (#19412)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-06-10 20:06:08 +08:00
e4248849ec [BugFix][CPU] Fix CPU CI by ignore collecting test_pixtral (#19411)
Signed-off-by: jiang.li <jiang1.li@intel.com>
2025-06-10 12:02:40 +00:00
467bef18a3 [BugFix][FlashInfer] Fix attention backend interface mismatch with unexpected keyword use_irope (#19134)
Signed-off-by: Yunqiu Guo <guorachel@meta.com>
2025-06-10 16:48:51 +08:00
5f1ac1e1d1 Revert "[v1] Add fp32 support to v1 engine through flex attn" (#19404) 2025-06-10 01:30:20 -07:00
9368cc90b2 Automatically bind CPU OMP Threads of a rank to CPU ids of a NUMA node. (#17930)
Signed-off-by: Tsai, Louie <louie.tsai@intel.com>
Co-authored-by: Li, Jiang <bigpyj64@gmail.com>
2025-06-10 06:22:05 +00:00
32b3946bb4 Add clear documentation around the impact of debugging flag (#19369)
Signed-off-by: Anna Pendleton <pendleton@google.com>
2025-06-10 06:16:09 +00:00
6b1391ca7e [Misc] refactor neuron_multimodal and profiling (#19397)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-10 06:12:42 +00:00
a3f66e75d1 Add security warning to bug report template (#19365)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
2025-06-10 06:06:36 +00:00
319cb1e351 [Core] Batch multi modal input using pinned memory (#19169)
Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>
2025-06-10 13:44:59 +08:00
1efef71645 [Bugfix] Fix modelscope token passed in (#19389)
Signed-off-by: wangli <wangli858794774@gmail.com>
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
2025-06-10 13:39:37 +08:00
646d62f636 [Core] Use tuple for kv cache group block ids (#19175)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-06-10 07:01:17 +02:00
6cd4ae8acd [Frontend] Add tqdm_leave_pbar to control progress bar visibility (#19357)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-10 04:55:09 +00:00
c016047ed7 Fix docs/mkdocs/hooks/remove_announcement.py (#19382) 2025-06-09 21:36:54 -07:00
9af6d22e4c Use xla flag to improve the quantized model performance (#19303)
Signed-off-by: Xiongfei Wei <isaacwxf23@gmail.com>
2025-06-10 01:28:45 +00:00
4589b94032 [Bugfix] Fix benchmark_moe.py (#19016)
Signed-off-by: Tianyu Guo <guoty9@mail2.sysu.edu.cn>
2025-06-09 18:04:36 -07:00
cc867be19c [V1] Reuse V0's memory_profiling util for gpu worker memory profiling (#19312)
Signed-off-by: Ye (Charlotte) Qi <yeq@meta.com>
2025-06-10 08:40:01 +08:00
3a7cd627a8 [Misc] Fix a config typo in disable_hybrid_kv_cache_manager configuration (#19383)
Signed-off-by: Siyuan Liu <lsiyuan@google.com>
2025-06-09 16:41:51 -07:00
8058c91108 [HOT-FIX] Add kv_sharing_target_layer_name argument to cutlass_mla backend (#19374)
Signed-off-by: Pavani Majety <pmajety@nvidia.com>
2025-06-09 19:00:07 -04:00
7d44c469fe [TPU]Fix KV cache sharing tests (#19371) 2025-06-09 18:38:15 -04:00
31f58be96a [Frontend] Make TIMEOUT_KEEP_ALIVE configurable through env var (#18472)
Signed-off-by: liusiqian <liusiqian@tal.com>
2025-06-09 21:41:21 +00:00
ebb2f383b8 [Quantization] Bump compressed-tensors version (#19295)
Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
2025-06-09 14:33:15 -07:00
c1c7dbbeeb [Bugfix][Core] Prevent token lengths exceeding max_model_len in V0 (#19348)
Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
2025-06-09 23:01:29 +08:00
5cf2daea9a [Misc] Fixes and Optimizations for DeepEP + DeepGEMM combination. (#19298)
Signed-off-by: Varun <vsundarr@redhat.com>
Co-authored-by: Varun <vsundarr@redhat.com>
2025-06-09 10:50:39 -04:00
b8089195b4 [v1] Add fp32 support to v1 engine through flex attn (#19319)
Signed-off-by: Isotr0py <2037008807@qq.com>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-06-09 22:10:44 +08:00
770e5dcdb8 [full_graph] Fix query_start_loc padding (#19321)
Signed-off-by: Yinghai Lu <yinghai@thinkingmachines.ai>
2025-06-09 21:32:56 +08:00
c57c9415b1 [Docs] Fix a bullet list in usage/security.md (#19358)
Signed-off-by: windsonsea <haifeng.yao@daocloud.io>
2025-06-09 13:28:51 +00:00
01810f9236 [CI] Introduce rules for llama auto-label (#19323)
Signed-off-by: Lu Fang <lufang@fb.com>
2025-06-09 20:05:42 +08:00
59abbd84f9 [Fix] Allow kernel compilation for CUDA capability 8.7 (#19328)
Signed-off-by: Conroy Cheers <conroy@corncheese.org>
2025-06-09 02:57:23 -07:00
95a6568b5c [CI/Build] Fix LoRA test (#19350)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-06-09 09:52:10 +00:00
0eca5eacd0 [Doc] Fix description in the Automatic Prefix Caching design doc (#19333)
Signed-off-by: cr7258 <chengzw258@163.com>
2025-06-09 17:30:02 +08:00
12e5829221 [doc] improve ci doc (#19307)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-09 07:26:12 +00:00
3a4d417707 [Misc] Cleanup compilation tests (#19343)
Signed-off-by: rzou <zou3519@gmail.com>
2025-06-09 15:05:44 +08:00
8335667c22 [Frontend] Remove unreachable code from llm.py (#19288)
Signed-off-by: KsuParkhamchuk <k.parkhamchuk@gmail.com>
2025-06-09 10:22:10 +08:00
e1c4380d4c [Misc] Add documentation update reminder to PR template (#19289)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-06-09 10:20:53 +08:00
e31ae3de36 [Deprecation] Remove inputs arg fallback in Engine classes (#18799)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-06-09 10:19:56 +08:00
2ffb9b6e07 [Bugfix] model_max_length should consider max_model_len in tokenizer_config (#19201) 2025-06-08 07:17:53 -07:00
cda10fa3e2 [Multi Modal] Add an env var for message queue max chunk bytes (#19242)
Signed-off-by: yZhen <yZhen@fb.com>
Co-authored-by: yZhen <yZhen@fb.com>
2025-06-08 21:39:12 +08:00
c123bc33f9 [Quantization] Add compressed-tensors NVFP4 support (#18312) 2025-06-08 09:05:55 -04:00
b9a1791e2c [Hardware][POWER] Add IBM POWER11 Support to CPU Extension Detection (#19082)
Signed-off-by: Akash Kaothalkar <akash.kaothalkar@ibm.com>
Co-authored-by: Akash Kaothalkar <akash.kaothalkar@ibm.com>
2025-06-08 09:17:14 +00:00
989dcee981 Add H20-3e fused MoE kernel tuning configs for Qwen3-235B-A22B (#19315)
Signed-off-by: Xu Wenqing <xuwq1993@qq.com>
2025-06-08 16:07:02 +08:00
3d64d366e0 [Misc] Change tests/compile to use VLLM_V1 by default (#19302)
Signed-off-by: rzou <zou3519@gmail.com>
2025-06-08 16:06:48 +08:00
eaa2e51088 [Bugfix] Re-enable use_cudagraph in vLLM v1 (#19299)
Signed-off-by: Richard Zou <zou3519@gmail.com>
2025-06-08 08:56:12 +08:00
d77f7fb871 [Bugfix]: Fix TypeError: 'float' object cannot be interpreted as an integer (#19283)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-06-08 08:16:31 +08:00
2d8476e465 [BugFix][V1] Fix memory profiling bug (#18974)
Signed-off-by: luka <luka@neuralmagic.com>
2025-06-07 10:34:51 -07:00
88be823d57 [AMD] Update compatible packaging version (#19309)
Signed-off-by: pramkuma <Pramendra.Kumar@amd.com>
2025-06-07 20:55:09 +08:00
4e4f63ad45 [Nit][Benchmark]Fix example in benchmark_serving_structured_output.py (#19311)
Signed-off-by: Lifan Shen <lifans@meta.com>
2025-06-07 18:25:38 +08:00
d2f0e7e615 [CI/Build] Improve Llama GGUF test robustness (#19287)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-06-07 17:23:28 +08:00
122cdca5f6 [Misc] refactor context extension (#19246)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-07 05:13:21 +00:00
cf02f9b283 Add FlexAttention to V1 (#16078)
Signed-off-by: drisspg <drisspguessous@gmail.com>
2025-06-06 21:58:55 -07:00
c4296b1a27 [CI][PowerPC] Use a more appropriate way to select testcase in tests/models/language/pooling/test_embedding.py (#19253)
Signed-off-by: Aaruni Aggarwal <aaruniagg@gmail.com>
2025-06-07 11:52:52 +08:00
66c508b137 [TPU][Test] Add script to run benchmark on TPU for buildkite (#19039)
Signed-off-by: Qiliang Cui <derrhein@gmail.com>
2025-06-06 20:10:24 -07:00
84166fee97 [Kernel] Integrate CUTLASS MoE kernel with PPLX (#18762)
Signed-off-by: ElizaWszola <ewszola@redhat.com>
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
2025-06-06 18:26:11 -07:00
6e0cd10f72 [Easy][Test] Simplify test_function_tool_use with multiple parametrizes (#19269)
Signed-off-by: Lu Fang <lufang@fb.com>
2025-06-07 09:19:09 +08:00
e010688f50 [Build][ROCm] Update Dockerfile.rocm (#19296)
Signed-off-by: Alexei V. Ivanov <alexei.ivanov@amd.com>
2025-06-06 19:35:16 -04:00
441b65d8c7 [Misc][Tools][Benchmark] Fix and improve auto tune script (#19163)
Signed-off-by: Chenyaaang <chenyangli@google.com>
2025-06-06 23:31:19 +00:00
46ecc57973 [BugFix] Fix tpu_model_runner block_id concatenation (#19228)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-06-06 16:28:17 -07:00
b6a3a9f76d [Core] Fix abrupt request abort (#18485)
Signed-off-by: nicklucche <nlucches@redhat.com>
Signed-off-by: Nick Hill <nhill@redhat.com>

Co-authored-by: Nick Hill <nhill@redhat.com>
2025-06-06 16:27:59 -07:00
ca27f0f9c1 [Bugfix][Core] Update cancellation logic in generate() to handle Generator exits (#19225)
Co-authored-by: Adolfo Victoria <adovi@meta.com>
2025-06-06 20:17:54 +00:00
aad30bd306 [BugFix] Fix MultiConnector test after HMA changes (#19291)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-06-06 20:16:24 +00:00
94ecee6282 Fixed ppc build when it runs on non-RHEL based linux distros (#18422)
Signed-off-by: Nishidha Panpaliya <nishidha.panpaliya@partner.ibm.com>
Signed-off-by: Md. Shafi Hussain <Md.Shafi.Hussain@ibm.com>
Signed-off-by: npanpaliya <nishidha.panpaliya@partner.ibm.com>
Co-authored-by: Md. Shafi Hussain <Md.Shafi.Hussain@ibm.com>
2025-06-06 11:54:26 -07:00
8267f9916f improve logits bias (#19041) 2025-06-06 19:59:25 +08:00
7353492a47 [Core] Raise when non-multi-instance DP clients target a DP rank (#19227)
Signed-off-by: Jon Swenson <jmswen@gmail.com>
2025-06-06 19:03:01 +08:00
7661e92ef8 [Model] Optimize nemotron_h implementation (#19249)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-06-06 10:05:14 +00:00
f168b85725 Unit Test for run_dp_sharded_vision_model (#19103)
Signed-off-by: Siqi Yan <siqi@meta.com>
Co-authored-by: Siqi Yan <siqi@meta.com>
2025-06-06 16:24:02 +08:00
da511d54d8 Fix CompilationConfig repr (#19091)
Signed-off-by: rzou <zou3519@gmail.com>
2025-06-06 16:23:35 +08:00
65c69444b1 [Docs] Improve V1 KVConnector interface documentation (#19172)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-06-06 16:22:45 +08:00
94870359cd [Quantization] Bump compressed-tensors version; update NVFP4A16 test model (#19224)
Signed-off-by: Dipika Sikka <dipikasikka1@gmail.com>
2025-06-06 01:21:54 -07:00
0d49483ea9 [TPU] fix kv cache dtype in model runner (#19244)
Signed-off-by: Chengji Yao <chengjiyao@google.com>
2025-06-06 16:20:16 +08:00
90b78ec5f9 [v1][P/D] Fix a edge case in kv cache schedule (#19182)
Co-authored-by: jinghui <jinghui@fb.com>
2025-06-05 23:32:55 -07:00
91a2ef98ea [Chore] update CODEOWNERS (#19247)
Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
2025-06-06 06:09:43 +00:00
3da2313d78 Support allowed_token_ids in ChatCompletionRequest (#19143)
Signed-off-by: Xu Song <xusong.vip@gmail.com>
2025-06-06 05:06:48 +00:00
b61dc5f972 [TPU] update torch_xla pin (#19231)
Signed-off-by: Chengji Yao <chengjiyao@google.com>
2025-06-06 04:27:38 +00:00
f8a1a2d108 [v1] Hybrid Memory Allocator (#17996)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-06-05 20:47:09 -07:00
3465b87ef8 [Bugfix] Fix EAGLE vocab embedding construction for Llama 70B (#19033)
Signed-off-by: Benjamin Chislett <benjamin.chislett@centml.ai>
2025-06-05 19:10:08 -07:00
c8134bea15 Fix AOPerModuleConfig name changes (#18869)
Signed-off-by: Jerry Zhang <jerryzh168@gmail.com>
2025-06-05 18:51:32 -07:00
cb6d572e85 [Model] NemotronH support (#18863)
Signed-off-by: Luis Vega <2478335+vegaluisjose@users.noreply.github.com>
Co-authored-by: Luis Vega <2478335+vegaluisjose@users.noreply.github.com>
2025-06-05 21:29:28 +00:00
87360308b7 [V1] Use FlashInfer by default on Blackwell GPUs (#19118) 2025-06-05 15:40:39 -04:00
aa49f14832 [Quantization] Skip Fp4 Test for compressed-tensors (#19217) 2025-06-05 18:21:53 +00:00
9ef9173cfa [P/D][NixlConnector] Enable FlashInfer backend (#19090) 2025-06-05 17:10:15 +00:00
85e2b7bb13 [MISC][Bugfix] Use less CPU when message queue has been empty for some time (#16226)
Signed-off-by: Povilas Kanapickas <povilas@radix.lt>
2025-06-05 16:53:08 +00:00
61059bee40 [Hardware][NVIDIA] FP4 MoE kernel optimization (#19110)
Signed-off-by: Chiyue Wei <chiyuew@nvidia.com>
Co-authored-by: Chiyue Wei <chiyuew@nvidia.com>
2025-06-05 09:48:26 -07:00
ec89524f50 Add H20-3e fused MoE kernel tuning configs for DeepSeek-R1/V3 (#19205) 2025-06-05 16:38:54 +00:00
f20f9f063b [mistral_common] Add v11 tokenizer (#19193)
Signed-off-by: Patrick von Platen <patrick.v.platen@gmail.com>
2025-06-05 08:27:41 -07:00
9bc8bb07cf [Bugfix] properly catch PIL-related errors for vision models when incorrect data urls are provided (#19202)
Signed-off-by: Guillaume Calmettes <gcalmettes@scaleway.com>
2025-06-05 12:59:28 +00:00
1aeb925f34 [Frontend] improve vllm run-batch --help display (#19187)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-05 11:16:25 +00:00
188a4590d8 [Misc] Do not override NCCL_CUMEM_ENABLE if set explicitly (#19105)
Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
2025-06-05 11:14:32 +00:00
18093084be [Misc] Remove unnecessary fallback to prefill-decode attention (#19138)
Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
2025-06-05 16:08:26 +08:00
da40380214 [Build] Annotate wheel and container path for release workflow (#19162)
Signed-off-by: simon-mo <simon.mo@hey.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-06-04 23:24:56 -07:00
8fc57501d3 [Bugfix]: Fix the incompatibility issue with stream when Thinking is disabled (#19135)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-06-05 06:24:24 +00:00
af7fc84fd2 [BugFix][Minor] Fix full cuda graph bug when max_num_seqs < 512 (#19171)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-06-05 13:41:25 +08:00
0678b52251 Handle non-serializable objects when dumping benchmark results (#19114) 2025-06-04 22:40:04 -07:00
25b918eee6 [Torch Nightly]add missing dependency (#18770)
Signed-off-by: Yang Wang <elainewy@meta.com>
2025-06-04 21:56:12 -07:00
a408820f2f [Bugfix] Fix port handling in make_zmq_path (#19117) 2025-06-04 21:00:59 -06:00
c56ed8bb0e [Bugfix][Nixl] Fix full prefix cache hit bug (#18632)
Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
Signed-off-by: Nick Hill <nhill@redhat.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
2025-06-05 02:07:32 +00:00
78dcf56cb3 [doc] small fix (#19167)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-05 09:13:50 +08:00
b2fac67130 [P/D] Heterogeneous TP (#18833)
Signed-off-by: nicklucche <nlucches@redhat.com>
2025-06-04 23:25:34 +00:00
23027e2daf [Misc] refactor: simplify EngineCoreClient.make_async_mp_client in AsyncLLM (#18817)
Signed-off-by: googs1025 <googs1025@gmail.com>
2025-06-04 15:37:25 -07:00
c3fd4d669a [Kernel] Integrate batched/masked deepgemm kernel (#19111)
Signed-off-by: Varun <vsundarr@redhat.com>
Co-authored-by: Varun <vsundarr@redhat.com>
2025-06-04 21:59:18 +00:00
ef3f98b59f [Bugfix] fix v1 cpu worker fails on macOS (#19121) 2025-06-04 20:17:38 +00:00
7ee2590478 [TPU] Update dynamo dump file name in compilation test (#19108)
Signed-off-by: Siyuan Liu <lsiyuan@google.com>
2025-06-04 16:13:43 -04:00
53a5a0ce30 [Perf] Tunings for SM100 FP8 CUTLASS kernel (#18778)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-06-04 10:46:28 -07:00
d459fae0a2 [Bugfix][EP+DP] Fix internode check (#19112)
Signed-off-by: Tyler Michael Smith <tysmith@redhat.com>
2025-06-04 23:39:23 +08:00
c8dcc15921 Allow AsyncLLMEngine.generate to target a specific DP rank (#19102)
Signed-off-by: Jon Swenson <jmswen@gmail.com>
2025-06-04 08:26:47 -07:00
8f4ffbd373 [Doc] Update V1 Guide for embedding models (#19141)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-06-04 22:57:55 +08:00
5f2cd251d2 Sm100 blockwise fp8 swap ab (#18564) 2025-06-04 07:48:45 -07:00
02658c2dfe Add DeepSeek-R1-0528 function call chat template (#18874)
Signed-off-by: 许文卿 <xwq391974@alibaba-inc.com>
2025-06-04 13:24:18 +00:00
01dc9a76db [CI/Build][Bugfix] Ensure compatibility with transformers 4.52 (#18678)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-06-04 04:49:20 -07:00
35cf32df30 Improve the output precision of embedding models (#19092) 2025-06-04 11:48:57 +00:00
8711bc5e68 [Misc] Add packages for benchmark as extra dependency (#19089)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-06-04 04:18:48 -07:00
2669a0d7b5 Fix ValueError: Missing value for tag key(s): model_name,engine. (#19113)
Signed-off-by: Seiji Eicher <seiji@anyscale.com>
2025-06-04 17:10:45 +08:00
8e972d9c44 [TPU] Skip hanging tests (#19115)
Signed-off-by: Siyuan Liu <lsiyuan@google.com>
2025-06-04 01:43:00 -07:00
3336c8cfbe Fix #19130 (#19132)
Signed-off-by: 汪志鹏 <wangzhipeng628@gmail.com>
2025-06-04 01:42:06 -07:00
b124e1085b [Bugfix] Fix FA3 full cuda graph correctness (#19106)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-06-03 23:10:15 -07:00
41aa578428 [NVIDIA] Add Cutlass MLA backend (#17625) 2025-06-03 21:40:26 -07:00
8d646c2e53 [Cleanup][v1]:remote guided-decoding-backend for example (#19059)
Signed-off-by: calvin chen <120380290@qq.com>
2025-06-04 04:23:26 +00:00
5d6d1adf15 [KERNEL] Sampler. CUDA kernel for applying repetition penalty (#18437) 2025-06-03 21:13:01 -07:00
1409ef9134 [Core] Cast multimodal input in hf processor (#18862)
Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>
2025-06-03 20:24:56 -07:00
4555143ea7 [CPU] V1 support for the CPU backend (#16441) 2025-06-03 18:43:01 -07:00
52dceb172d [Docs] Add developer doc about CI failures (#18782)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
Co-authored-by: Mark McLoughlin <markmc@redhat.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-06-04 01:09:13 +00:00
abd7df2fca [Misc] Fix path and python alias errors in disagg_prefill exmaples (#18919) 2025-06-03 17:15:18 -07:00
b712be98c7 feat: add data parallel rank to KVEventBatch (#18925) 2025-06-03 17:14:20 -07:00
a8da78eac9 [Bugfix] Max concurrency estimation and check_enough_kv_cache_memory for models with sliding window layers (#19029)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-06-04 00:14:06 +00:00
5d96533e22 [Bugfix][P/D] Fix Prefix Cache Bug (#18411)
Signed-off-by: nicklucche <nlucches@redhat.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
2025-06-03 23:53:16 +00:00
4de790fcad [Bugfix]: Fix the incompatibility issue with tool_choice 'required' when Thinking is enabled (#19075)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-06-03 23:27:24 +00:00
b5fd9506c1 [Bugfix] get_num_blocks_to_allocate with null_block (#19031)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-06-03 15:30:55 -07:00
135cf55cd1 [V1][Spec Decode][Ngram] 1.35x gain -> 1.95x gain on InstructCoder with prompt fix (#18971) 2025-06-03 15:26:33 -07:00
6cac54f4d1 [v1] Re-init input batch for multiple kv cache groups (#18654)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-06-03 21:41:36 +00:00
6865fe0074 Fix interaction between Optional and Annotated in CLI typing (#19093)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Yikun Jiang <yikun@apache.org>
2025-06-03 21:07:19 +00:00
e31446b6c8 [Perf] Tune scaled_fp8_quant by increasing vectorization (#18844)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-06-03 13:48:25 -07:00
bdf13965ab [V1] Support cross-layer KV sharing (#18212)
Signed-off-by: Yong Hoon Shin <yhshin@meta.com>
2025-06-03 20:33:07 +00:00
fa98d77773 [Kernel] DeepEP dispatch-combine kernel integration (#18434)
Signed-off-by: Varun <vsundarr@redhat.com>
Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
2025-06-03 12:30:02 -07:00
01eee40536 [doc] update docker version (#19074)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-03 19:08:21 +00:00
19bdaf32b1 [Doc] Readme standardization (#18695)
Co-authored-by: Soren Dreano <soren@numind.ai>
2025-06-03 11:50:55 -07:00
02f0c7b220 [Misc] Add SPDX-FileCopyrightText (#19100)
Signed-off-by: simon-mo <simon.mo@hey.com>
2025-06-03 11:20:17 -07:00
d054da1992 [Misc] fix: add miss best_of param validation (#18555)
Signed-off-by: googs1025 <googs1025@gmail.com>
2025-06-03 11:02:07 -07:00
4b7817c119 [Misc] Add missing _Backend enums (#19081)
Signed-off-by: nicklucche <nlucches@redhat.com>
2025-06-03 16:15:16 +00:00
d00dd65cd4 [Doc] Improve the Pull Request template with key components (#19086)
Signed-off-by: Lu Fang <lufang@fb.com>
2025-06-03 23:44:34 +08:00
d81edded69 [Bugfix] disable processor cache (#19068)
Signed-off-by: raushan <raushan@huggingface.co>
2025-06-03 15:06:04 +00:00
476844d44c Fix underscores in dict keys passed via CLI (#19030)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-06-03 14:39:24 +00:00
4e68ae5e59 [CI/Build] Remove V0 LoRA test (#19066)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-06-03 14:30:18 +00:00
4e88723f32 [doc] clarify windows support (#19088)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-06-03 21:42:17 +08:00
118ff92111 [Doc] Update V1 user guide for embedding and enc-dec models (#19060)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-06-03 02:29:41 -07:00
ec2dcd80bc [Misc] Update WeightsMapper for qwen2-vl/qwen2.5-vl (#19054)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-06-03 09:08:20 +00:00
42243fbda0 [Doc] Add InternVL LoRA support (#19055)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-06-03 09:08:03 +00:00
6d18ed2a2e Update docker docs with ARM CUDA cross-compile (#19037)
Signed-off-by: mgoin <michael@neuralmagic.com>
2025-06-03 08:21:53 +00:00
f32fcd9444 [v1][KVCacheManager] Rename BlockHashType to BlockHash (#19015)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-06-03 08:01:48 +00:00
d32aa2e670 [Bugfix] Use cmake 3.26.1 instead of 3.26 to avoid build failure (#19019)
Signed-off-by: Lu Fang <lufang@fb.com>
2025-06-03 00:16:17 -07:00
cc977286e7 Reduce logs in CLI scripts and plugin loader (#18970)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-06-03 06:00:45 +00:00
17430e3653 [bugfix] small fix logic issue (#18999)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-06-03 05:35:12 +00:00
1282bd812e Add tarsier model support (#18985)
Signed-off-by: 汪志鹏 <wangzhipeng628@gmail.com>
2025-06-03 13:13:13 +08:00
bdce64f236 [V1] Support DP with Ray (#18779) 2025-06-02 21:15:13 -07:00
9e6f61e8c3 [ROCm][Build] Clean up the ROCm build (#19040)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2025-06-02 20:47:47 -07:00
8655f47f37 [CPU][CI] Re-enable the CPU CI tests (#19046)
Signed-off-by: jiang.li <jiang1.li@intel.com>
2025-06-02 20:46:47 -07:00
4ce42f9204 Adding "LoRA Test %N" to AMD production tests (#18929)
Signed-off-by: Yida Wu <yidawu@alumni.cmu.edu>
2025-06-02 20:46:44 -07:00
8a57872b2a [Bugfix][EP+DP] Use pplx-kernel internode instead of intranode (#19034)
Signed-off-by: Tyler Michael Smith <tysmith@redhat.com>
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
2025-06-03 11:36:51 +08:00
5bc1ad6cee [Doc] Remove duplicate TOCs during MkDocs migration (#19021)
Signed-off-by: Zerohertz <ohg3417@gmail.com>
2025-06-02 19:49:48 -07:00
9112b443a0 [Hardware][TPU] Initial support of model parallelism with single worker using SPMD (#18011)
Signed-off-by: Siyuan Liu <lsiyuan@google.com>
Co-authored-by: Hossein Sarshar <hossein.sarshar@gmail.com>
Co-authored-by: Chengji Yao <chengjiyao@google.com>
2025-06-03 00:06:20 +00:00
c57d577e8d add an absolute path for run.sh (#18258)
Signed-off-by: calvin chen <120380290@qq.com>
2025-06-02 19:38:23 +00:00
1628 changed files with 29256 additions and 7170 deletions

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import sys

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import os

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from pathlib import Path
import pytest

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
LM eval harness on model to compare vs HF baseline computed offline.
Configs are found in configs/$MODEL.yaml

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
import os

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import json

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from lmdeploy.serve.openai.api_client import APIClient

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import datetime
import json

View File

@ -1,5 +1,6 @@
steps:
- label: "Build wheel - CUDA 12.8"
id: build-wheel-cuda-12-8
agents:
queue: cpu_queue_postmerge
commands:
@ -11,6 +12,7 @@ steps:
DOCKER_BUILDKIT: "1"
- label: "Build wheel - CUDA 12.6"
id: build-wheel-cuda-12-6
agents:
queue: cpu_queue_postmerge
commands:
@ -28,6 +30,7 @@ steps:
- label: "Build wheel - CUDA 11.8"
# depends_on: block-build-cu118-wheel
id: build-wheel-cuda-11-8
agents:
queue: cpu_queue_postmerge
commands:
@ -44,6 +47,7 @@ steps:
- label: "Build release image"
depends_on: block-release-image-build
id: build-release-image
agents:
queue: cpu_queue_postmerge
commands:
@ -51,6 +55,18 @@ steps:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
- label: "Annotate release workflow"
depends_on:
- build-release-image
- build-wheel-cuda-12-8
- build-wheel-cuda-12-6
- build-wheel-cuda-11-8
id: annotate-release-workflow
agents:
queue: cpu_queue_postmerge
commands:
- "bash .buildkite/scripts/annotate-release.sh"
- label: "Build and publish TPU release image"
depends_on: ~
if: build.env("NIGHTLY") == "1"
@ -70,9 +86,10 @@ steps:
DOCKER_BUILDKIT: "1"
- input: "Provide Release version here"
id: input-release-version
fields:
- text: "What is the release version?"
key: "release-version"
key: release-version
- block: "Build CPU release image"
key: block-cpu-release-image-build

View File

@ -0,0 +1,31 @@
#!/bin/bash
set -ex
# Get release version and strip leading 'v' if present
RELEASE_VERSION=$(buildkite-agent meta-data get release-version | sed 's/^v//')
if [ -z "$RELEASE_VERSION" ]; then
echo "Error: RELEASE_VERSION is empty. 'release-version' metadata might not be set or is invalid."
exit 1
fi
buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF
To download the wheel:
\`\`\`
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu126/vllm-${RELEASE_VERSION}+cu126-cp38-abi3-manylinux1_x86_64.whl .
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu118/vllm-${RELEASE_VERSION}+cu118-cp38-abi3-manylinux1_x86_64.whl .
\`\`\`
To download and upload the image:
\`\`\`
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT} vllm/vllm-openai
docker tag vllm/vllm-openai vllm/vllm-openai:latest
docker tag vllm/vllm-openai vllm/vllm-openai:v${RELEASE_VERSION}
docker push vllm/vllm-openai:latest
docker push vllm/vllm-openai:v${RELEASE_VERSION}
\`\`\`
EOF

View File

@ -0,0 +1,17 @@
#!/bin/bash
# Usage: ./ci_clean_log.sh ci.log
# This script strips timestamps and color codes from CI log files.
# Check if argument is given
if [ $# -lt 1 ]; then
echo "Usage: $0 ci.log"
exit 1
fi
INPUT_FILE="$1"
# Strip timestamps
sed -i 's/^\[[0-9]\{4\}-[0-9]\{2\}-[0-9]\{2\}T[0-9]\{2\}:[0-9]\{2\}:[0-9]\{2\}Z\] //' "$INPUT_FILE"
# Strip colorization
sed -i -r 's/\x1B\[[0-9;]*[mK]//g' "$INPUT_FILE"

View File

@ -94,6 +94,10 @@ if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then
commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"}
fi
if [[ $commands == *"pytest -v -s lora"* ]]; then
commands=${commands//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"}
fi
#ignore certain kernels tests
if [[ $commands == *" kernels/core"* ]]; then
commands="${commands} \

View File

@ -7,6 +7,7 @@ set -ex
# Setup cleanup
remove_docker_container() {
if [[ -n "$container_id" ]]; then
podman stop --all -t0
podman rm -f "$container_id" || true
fi
podman system prune -f
@ -37,7 +38,7 @@ function cpu_tests() {
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m]
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it]
pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
pytest -v -s tests/models/language/pooling/test_embedding.py::test_models[half-BAAI/bge-base-en-v1.5]"
pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model"
}
# All of CPU tests are expected to be finished less than 40 mins.

View File

@ -6,75 +6,82 @@ set -ex
# allow to bind to different cores
CORE_RANGE=${CORE_RANGE:-48-95}
OMP_CORE_RANGE=${OMP_CORE_RANGE:-48-95}
NUMA_NODE=${NUMA_NODE:-1}
export CMAKE_BUILD_PARALLEL_LEVEL=32
# Setup cleanup
remove_docker_container() {
set -e;
docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true;
docker image rm cpu-test-"$BUILDKITE_BUILD_NUMBER" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 || true;
docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true;
}
trap remove_docker_container EXIT
remove_docker_container
# Try building the docker image
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$BUILDKITE_BUILD_NUMBER" --target vllm-test -f docker/Dockerfile.cpu .
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu .
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
# Run the image, setting --shm-size=4g for tensor parallel.
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
--cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
--cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --env VLLM_CPU_CI_ENV=1 --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --env VLLM_CPU_CI_ENV=1 --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
function cpu_tests() {
set -e
export NUMA_NODE=$2
export BUILDKITE_BUILD_NUMBER=$3
# list packages
docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
set -e
pip list"
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pip list"
# offline inference
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c "
docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
set -e
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
# Run basic model test
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pytest -v -s tests/kernels/test_cache.py -m cpu_model
pytest -v -s tests/kernels/test_mla_decode_cpu.py -m cpu_model
pytest -v -s tests/models/decoder_only/language -m cpu_model
pytest -v -s tests/models/embedding/language -m cpu_model
pytest -v -s tests/models/encoder_decoder/language -m cpu_model
pytest -v -s tests/models/decoder_only/audio_language -m cpu_model
pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"
pytest -v -s tests/kernels/attention/test_cache.py -m cpu_model
pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
pytest -v -s tests/models/language/generation -m cpu_model
pytest -v -s tests/models/language/pooling -m cpu_model
pytest -v -s tests/models/multimodal/generation \
--ignore=tests/models/multimodal/generation/test_mllama.py \
--ignore=tests/models/multimodal/generation/test_pixtral.py \
-m cpu_model"
# Run compressed-tensor test
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pytest -s -v \
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"
# Run AWQ test
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pytest -s -v \
VLLM_USE_V1=0 pytest -s -v \
tests/quantization/test_ipex_quant.py"
# Run chunked-prefill and prefix-cache test
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pytest -s -v -k cpu_model \
tests/basic_correctness/test_chunked_prefill.py"
# online serving
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
export VLLM_CPU_KVCACHE_SPACE=10
export VLLM_CPU_OMP_THREADS_BIND=$1
python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half &
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
python3 benchmarks/benchmark_serving.py \
VLLM_CPU_CI_ENV=0 python3 benchmarks/benchmark_serving.py \
--backend vllm \
--dataset-name random \
--model facebook/opt-125m \
@ -83,7 +90,7 @@ function cpu_tests() {
--tokenizer facebook/opt-125m"
# Run multi-lora tests
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pytest -s -v \
tests/lora/test_qwen2vl.py"
@ -91,4 +98,4 @@ function cpu_tests() {
# All of CPU tests are expected to be finished less than 40 mins.
export -f cpu_tests
timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE $BUILDKITE_BUILD_NUMBER"
timeout 1h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"

View File

@ -150,11 +150,15 @@ run_and_track_test 9 "test_multimodal.py" \
run_and_track_test 10 "test_pallas.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py"
run_and_track_test 11 "test_struct_output_generate.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py"
"python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
run_and_track_test 12 "test_moe_pallas.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
run_and_track_test 13 "test_lora.py" \
"VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py"
run_and_track_test 14 "test_tpu_qkv_linear.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_qkv_linear.py"
run_and_track_test 15 "test_spmd_model_weight_loading.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py"
# After all tests have been attempted, exit with the overall status.
if [ "$overall_script_exit_code" -ne 0 ]; then

View File

@ -0,0 +1,18 @@
#!/bin/bash
# Usage: ./rerun_test.sh path/to/test.py::test_name
# Check if argument is given
if [ $# -lt 1 ]; then
echo "Usage: $0 path/to/test.py::test_name"
echo "Example: $0 tests/v1/engine/test_engine_core_client.py::test_kv_cache_events[True-tcp]"
exit 1
fi
TEST=$1
COUNT=1
while pytest -sv "$TEST"; do
COUNT=$((COUNT + 1))
echo "RUN NUMBER ${COUNT}"
done

View File

@ -0,0 +1,24 @@
#!/bin/bash
set -euo pipefail
docker_root=$(docker info -f '{{.DockerRootDir}}')
if [ -z "$docker_root" ]; then
echo "Failed to determine Docker root directory."
exit 1
fi
echo "Docker root directory: $docker_root"
# Check disk usage of the filesystem where Docker's root directory is located
disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
# Define the threshold
threshold=70
if [ "$disk_usage" -gt "$threshold" ]; then
echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
# Remove dangling images (those that are not tagged and not used by any container)
docker image prune -f
# Remove unused volumes / force the system prune for old images as well.
docker volume prune -f && docker system prune --force --filter "until=72h" --all
echo "Docker images and volumes cleanup completed."
else
echo "Disk usage is below $threshold%. No cleanup needed."
fi

View File

@ -0,0 +1,14 @@
# Environment config
TEST_NAME=llama8b
CONTAINER_NAME=vllm-tpu
# vllm config
MODEL=meta-llama/Llama-3.1-8B-Instruct
MAX_NUM_SEQS=512
MAX_NUM_BATCHED_TOKENS=512
TENSOR_PARALLEL_SIZE=1
MAX_MODEL_LEN=2048
DOWNLOAD_DIR=/mnt/disks/persist
EXPECTED_THROUGHPUT=8.0
INPUT_LEN=1800
OUTPUT_LEN=128

View File

@ -0,0 +1,102 @@
#!/bin/bash
if [ ! -f "$1" ]; then
echo "Error: The env file '$1' does not exist."
exit 1 # Exit the script with a non-zero status to indicate an error
fi
ENV_FILE=$1
# For testing on local vm, use `set -a` to export all variables
source /etc/environment
source $ENV_FILE
remove_docker_container() {
docker rm -f tpu-test || true;
docker rm -f vllm-tpu || true;
docker rm -f $CONTAINER_NAME || true;
}
trap remove_docker_container EXIT
# Remove the container that might not be cleaned up in the previous run.
remove_docker_container
# Build docker image.
# TODO: build the image outside the script and share the image with other
# tpu test if building time is too long.
DOCKER_BUILDKIT=1 docker build \
--build-arg max_jobs=16 \
--build-arg USE_SCCACHE=1 \
--build-arg GIT_REPO_CHECK=0 \
--tag vllm/vllm-tpu-bm \
--progress plain -f docker/Dockerfile.tpu .
LOG_ROOT=$(mktemp -d)
# If mktemp fails, set -e will cause the script to exit.
echo "Results will be stored in: $LOG_ROOT"
if [ -z "$HF_TOKEN" ]; then
echo "Error: HF_TOKEN is not set or is empty."
exit 1
fi
# Make sure mounted disk or dir exists
if [ ! -d "$DOWNLOAD_DIR" ]; then
echo "Error: Folder $DOWNLOAD_DIR does not exist. This is useually a mounted drive. If no mounted drive, just create a folder."
exit 1
fi
echo "Run model $MODEL"
echo
echo "starting docker...$CONTAINER_NAME"
echo
docker run \
-v $DOWNLOAD_DIR:$DOWNLOAD_DIR \
--env-file $ENV_FILE \
-e HF_TOKEN="$HF_TOKEN" \
-e TARGET_COMMIT=$BUILDKITE_COMMIT \
-e MODEL=$MODEL \
-e WORKSPACE=/workspace \
--name $CONTAINER_NAME \
-d \
--privileged \
--network host \
-v /dev/shm:/dev/shm \
vllm/vllm-tpu-bm tail -f /dev/null
echo "run script..."
echo
docker exec "$CONTAINER_NAME" /bin/bash -c ".buildkite/scripts/hardware_ci/run_bm.sh"
echo "copy result back..."
VLLM_LOG="$LOG_ROOT/$TEST_NAME"_vllm_log.txt
BM_LOG="$LOG_ROOT/$TEST_NAME"_bm_log.txt
docker cp "$CONTAINER_NAME:/workspace/vllm_log.txt" "$VLLM_LOG"
docker cp "$CONTAINER_NAME:/workspace/bm_log.txt" "$BM_LOG"
throughput=$(grep "Request throughput (req/s):" "$BM_LOG" | sed 's/[^0-9.]//g')
echo "throughput for $TEST_NAME at $BUILDKITE_COMMIT: $throughput"
if [ "$BUILDKITE" = "true" ]; then
echo "Running inside Buildkite"
buildkite-agent artifact upload "$VLLM_LOG"
buildkite-agent artifact upload "$BM_LOG"
else
echo "Not running inside Buildkite"
fi
#
# compare the throughput with EXPECTED_THROUGHPUT
# and assert meeting the expectation
#
if [[ -z "$throughput" || ! "$throughput" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
echo "Failed to get the throughput"
exit 1
fi
if (( $(echo "$throughput < $EXPECTED_THROUGHPUT" | bc -l) )); then
echo "Error: throughput($throughput) is less than expected($EXPECTED_THROUGHPUT)"
exit 1
fi

View File

@ -0,0 +1,94 @@
#!/bin/bash
set -euo pipefail
VLLM_LOG="$WORKSPACE/vllm_log.txt"
BM_LOG="$WORKSPACE/bm_log.txt"
if [ -n "$TARGET_COMMIT" ]; then
head_hash=$(git rev-parse HEAD)
if [ "$TARGET_COMMIT" != "$head_hash" ]; then
echo "Error: target commit $TARGET_COMMIT does not match HEAD: $head_hash"
exit 1
fi
fi
echo "model: $MODEL"
echo
#
# create a log folder
#
mkdir "$WORKSPACE/log"
# TODO: Move to image building.
pip install pandas
pip install datasets
#
# create sonnet_4x
#
echo "Create sonnet_4x.txt"
echo "" > benchmarks/sonnet_4x.txt
for _ in {1..4}
do
cat benchmarks/sonnet.txt >> benchmarks/sonnet_4x.txt
done
#
# start vllm service in backend
#
echo "lanching vllm..."
echo "logging to $VLLM_LOG"
echo
VLLM_USE_V1=1 vllm serve $MODEL \
--seed 42 \
--disable-log-requests \
--max-num-seqs $MAX_NUM_SEQS \
--max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \
--tensor-parallel-size $TENSOR_PARALLEL_SIZE \
--no-enable-prefix-caching \
--download_dir $DOWNLOAD_DIR \
--max-model-len $MAX_MODEL_LEN > "$VLLM_LOG" 2>&1 &
echo "wait for 20 minutes.."
echo
# sleep 1200
# wait for 10 minutes...
for i in {1..120}; do
# TODO: detect other type of errors.
if grep -Fq "raise RuntimeError" "$VLLM_LOG"; then
echo "Detected RuntimeError, exiting."
exit 1
elif grep -Fq "Application startup complete" "$VLLM_LOG"; then
echo "Application started"
break
else
echo "wait for 10 seconds..."
sleep 10
fi
done
#
# run test
#
echo "run benchmark test..."
echo "logging to $BM_LOG"
echo
python benchmarks/benchmark_serving.py \
--backend vllm \
--model $MODEL \
--dataset-name sonnet \
--dataset-path benchmarks/sonnet_4x.txt \
--sonnet-input-len $INPUT_LEN \
--sonnet-output-len $OUTPUT_LEN \
--ignore-eos > "$BM_LOG"
echo "completed..."
echo
throughput=$(grep "Request throughput (req/s):" "$BM_LOG" | sed 's/[^0-9.]//g')
echo "throughput: $throughput"
echo

View File

@ -145,6 +145,7 @@ steps:
- examples/offline_inference/rlhf_colocate.py
- tests/examples/offline_inference/data_parallel.py
- tests/v1/test_async_llm_dp.py
- tests/v1/engine/test_engine_core_client.py
commands:
# test with tp=2 and external_dp=2
- VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
@ -154,6 +155,7 @@ steps:
# test with internal dp
- python3 ../examples/offline_inference/data_parallel.py
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
- pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
- pytest -v -s distributed/test_utils.py
- pytest -v -s compile/test_basic_correctness.py
- pytest -v -s distributed/test_pynccl.py
@ -175,6 +177,11 @@ steps:
- tests/tracing
commands:
- pytest -v -s metrics
- "pip install \
'opentelemetry-sdk>=1.26.0' \
'opentelemetry-api>=1.26.0' \
'opentelemetry-exporter-otlp>=1.26.0' \
'opentelemetry-semantic-conventions-ai>=0.4.1'"
- pytest -v -s tracing
##### fast check tests #####
@ -287,7 +294,7 @@ steps:
- pytest -v -s spec_decode/e2e/test_eagle_correctness.py
- label: LoRA Test %N # 15min each
mirror_hardwares: [amdexperimental]
mirror_hardwares: [amdexperimental, amdproduction]
source_file_dependencies:
- vllm/lora
- tests/lora
@ -303,6 +310,7 @@ steps:
commands:
- pytest -v -s compile/test_pass_manager.py
- pytest -v -s compile/test_fusion.py
- pytest -v -s compile/test_fusion_attn.py
- pytest -v -s compile/test_silu_mul_quant_fusion.py
- pytest -v -s compile/test_sequence_parallelism.py
- pytest -v -s compile/test_async_tp.py
@ -318,6 +326,7 @@ steps:
# these tests need to be separated, cannot combine
- pytest -v -s compile/piecewise/test_simple.py
- pytest -v -s compile/piecewise/test_toy_llama.py
- pytest -v -s compile/piecewise/test_full_cudagraph.py
- label: PyTorch Fullgraph Test # 18min
mirror_hardwares: [amdexperimental, amdproduction]
@ -421,6 +430,9 @@ steps:
- vllm/model_executor/layers/quantization
- tests/quantization
commands:
# temporary install here since we need nightly, will move to requirements/test.in
# after torchao 0.12 release
- pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
- label: LM Eval Small Models # 53min
@ -663,7 +675,7 @@ steps:
- pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins
- label: Multi-step Tests (4 GPUs) # 36min
mirror_hardwares: [amdexperimental]
mirror_hardwares: [amdexperimental, amdproduction]
working_dir: "/vllm-workspace/tests"
num_gpus: 4
source_file_dependencies:

16
.github/CODEOWNERS vendored
View File

@ -10,15 +10,17 @@
/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth
/vllm/model_executor/guided_decoding @mgoin @russellb
/vllm/model_executor/guided_decoding @mgoin @russellb @aarnphm
/vllm/multimodal @DarkLight1337 @ywang96
/vllm/vllm_flash_attn @LucasWilkinson
/vllm/lora @jeejeelee
/vllm/reasoning @aarnphm
/vllm/entrypoints @aarnphm
CMakeLists.txt @tlrmchlsmth
# vLLM V1
/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
/vllm/v1/structured_output @mgoin @russellb
/vllm/v1/structured_output @mgoin @russellb @aarnphm
# Test ownership
/.buildkite/lm-eval-harness @mgoin @simon-mo
@ -27,8 +29,8 @@ CMakeLists.txt @tlrmchlsmth
/tests/distributed/test_multi_node_assignment.py @youkaichao
/tests/distributed/test_pipeline_parallel.py @youkaichao
/tests/distributed/test_same_node.py @youkaichao
/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo
/tests/entrypoints/llm/test_guided_generate.py @mgoin @russellb
/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm
/tests/entrypoints/llm/test_guided_generate.py @mgoin @russellb @aarnphm
/tests/kernels @tlrmchlsmth @WoosukKwon
/tests/model_executor/test_guided_processors.py @mgoin @russellb
/tests/models @DarkLight1337 @ywang96
@ -38,11 +40,11 @@ CMakeLists.txt @tlrmchlsmth
/tests/quantization @mgoin @robertgshaw2-redhat
/tests/spec_decode @njhill @LiuXiaoxuanPKU
/tests/test_inputs.py @DarkLight1337 @ywang96
/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb
/tests/v1/structured_output @mgoin @russellb
/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
/tests/v1/structured_output @mgoin @russellb @aarnphm
/tests/weight_loading @mgoin @youkaichao
/tests/lora @jeejeelee
# Docs
/docs @hmellor
mkdocs.yaml @hmellor
mkdocs.yaml @hmellor

View File

@ -8,6 +8,16 @@ body:
attributes:
value: >
#### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
- type: markdown
attributes:
value: |
⚠️ **SECURITY WARNING:** Please review any text you paste to ensure it does not contain sensitive information such as:
- API tokens or keys (e.g., Hugging Face tokens, OpenAI API keys)
- Passwords or authentication credentials
- Private URLs or endpoints
- Personal or confidential data
Consider redacting or replacing sensitive values with placeholders like `<YOUR_TOKEN_HERE>` when sharing configuration or code examples.
- type: textarea
attributes:
label: Your current environment

View File

@ -1,6 +1,18 @@
FILL IN THE PR DESCRIPTION HERE
## Essential Elements of an Effective PR Description Checklist
- [ ] The purpose of the PR, such as "Fix some issue (link existing issues this PR will resolve)".
- [ ] The test plan, such as providing test command.
- [ ] The test results, such as pasting the results comparison before and after, or e2e results
- [ ] (Optional) The necessary documentation update, such as updating `supported_models.md` and `examples` for a new model.
FIX #xxxx (*link existing issues this PR will resolve*)
PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS ABOVE HAVE BEEN CONSIDERED.
## Purpose
## Test Plan
## Test Result
## (Optional) Documentation Update
<!--- pyml disable-next-line no-emphasis-as-heading -->
**BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing>** (anything written below this line will be removed by GitHub Actions)

49
.github/mergify.yml vendored
View File

@ -36,6 +36,20 @@ pull_request_rules:
add:
- frontend
- name: label-llama
description: Automatically apply llama label
conditions:
- or:
- files~=^examples/.*llama.*\.py
- files~=^tests/.*llama.*\.py
- files~=^vllm/entrypoints/openai/tool_parsers/llama.*\.py
- files~=^vllm/model_executor/models/.*llama.*\.py
- files~=^vllm/transformers_utils/configs/.*llama.*\.py
actions:
label:
add:
- llama
- name: label-multi-modality
description: Automatically apply multi-modality label
conditions:
@ -51,6 +65,41 @@ pull_request_rules:
add:
- multi-modality
- name: label-qwen
description: Automatically apply qwen label
conditions:
- or:
- files~=^examples/.*qwen.*\.py
- files~=^tests/.*qwen.*\.py
- files~=^vllm/model_executor/models/.*qwen.*\.py
- files~=^vllm/reasoning/.*qwen.*\.py
- title~=(?i)Qwen
- body~=(?i)Qwen
actions:
label:
add:
- qwen
- name: label-rocm
description: Automatically apply rocm label
conditions:
- or:
- files~=^csrc/rocm/
- files~=^docker/Dockerfile.rocm
- files~=^requirements/rocm.*\.txt
- files~=^vllm/attention/backends/rocm.*\.py
- files~=^vllm/attention/ops/rocm.*\.py
- files~=^vllm/model_executor/layers/fused_moe/rocm.*\.py
- files~=^vllm/v1/attention/backends/mla/rocm.*\.py
- files~=^tests/kernels/.*_rocm.*\.py
- files=vllm/platforms/rocm.py
- title~=(?i)AMD
- title~=(?i)ROCm
actions:
label:
add:
- rocm
- name: label-structured-output
description: Automatically apply structured-output label
conditions:

2
.gitignore vendored
View File

@ -200,5 +200,5 @@ benchmarks/**/*.json
actionlint
shellcheck*/
# Ingore moe/marlin_moe gen code
# Ignore moe/marlin_moe gen code
csrc/moe/marlin_moe_wna16/kernel_*

View File

@ -11,6 +11,8 @@ repos:
hooks:
- id: yapf
args: [--in-place, --verbose]
# Keep the same list from yapfignore here to avoid yapf failing without any inputs
exclude: '(.buildkite|benchmarks|build|examples)/.*'
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.11.7
hooks:
@ -18,12 +20,10 @@ repos:
args: [--output-format, github, --fix]
- id: ruff-format
files: ^(.buildkite|benchmarks|examples)/.*
- repo: https://github.com/codespell-project/codespell
rev: v2.4.1
- repo: https://github.com/crate-ci/typos
rev: v1.32.0
hooks:
- id: codespell
additional_dependencies: ['tomli']
args: ['--toml', 'pyproject.toml']
- id: typos
- repo: https://github.com/PyCQA/isort
rev: 6.0.1
hooks:
@ -143,6 +143,13 @@ repos:
types: [python]
pass_filenames: false
additional_dependencies: [regex]
- id: check-pickle-imports
name: Prevent new pickle/cloudpickle imports
entry: python tools/check_pickle_imports.py
language: python
types: [python]
pass_filenames: false
additional_dependencies: [pathspec, regex]
# Keep `suggestion` last
- id: suggestion
name: Suggestion

View File

@ -182,9 +182,6 @@ include(FetchContent)
file(MAKE_DIRECTORY ${FETCHCONTENT_BASE_DIR}) # Ensure the directory exists
message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}")
#
# Set rocm version dev int.
#
if(VLLM_GPU_LANG STREQUAL "HIP")
#
# Overriding the default -O set up by cmake, adding ggdb3 for the most verbose devug info
@ -192,7 +189,6 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
set(CMAKE_${VLLM_GPU_LANG}_FLAGS_DEBUG "${CMAKE_${VLLM_GPU_LANG}_FLAGS_DEBUG} -O0 -ggdb3")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -ggdb3")
#
# Certain HIP functions are marked as [[nodiscard]], yet vllm ignores the result which generates
# a lot of warnings that always mask real issues. Suppressing until this is properly addressed.
@ -246,6 +242,7 @@ set(VLLM_EXT_SRC
"csrc/activation_kernels.cu"
"csrc/layernorm_kernels.cu"
"csrc/layernorm_quant_kernels.cu"
"csrc/sampler.cu"
"csrc/cuda_view.cu"
"csrc/quantization/gptq/q_gemm.cu"
"csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
@ -311,7 +308,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# Keep building Marlin for 9.0 as there are some group sizes and shapes that
# are not supported by Machete yet.
# 9.0 for latest bf16 atomicAdd PTX
cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;9.0+PTX" "${CUDA_ARCHS}")
cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.7;9.0+PTX" "${CUDA_ARCHS}")
if (MARLIN_ARCHS)
#
@ -423,9 +420,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif()
endif()
# The cutlass_scaled_mm kernels for Blackwell (c3x, i.e. CUTLASS 3.x) require
# CUDA 12.8 or later
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;12.0a" "${CUDA_ARCHS}")
# The cutlass_scaled_mm kernels for Blackwell SM100 (c3x, i.e. CUTLASS 3.x)
# require CUDA 12.8 or later
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS)
set(SRCS
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu"
@ -457,7 +454,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# kernels for the remaining archs that are not already built for 3x.
# (Build 8.9 for FP8)
cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
"7.5;8.0;8.9+PTX" "${CUDA_ARCHS}")
"7.5;8.0;8.7;8.9+PTX" "${CUDA_ARCHS}")
# subtract out the archs that are already built for 3x
list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
if (SCALED_MM_2X_ARCHS)
@ -545,10 +542,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# CUTLASS MoE kernels
# The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and only works
# on Hopper). get_cutlass_moe_mm_data should only be compiled if it's possible
# to compile MoE kernels that use its output.
cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;10.0a" "${CUDA_ARCHS}")
# The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and ONLY works
# on Hopper). get_cutlass_(pplx_)moe_mm_data should only be compiled
# if it's possible to compile MoE kernels that use its output.
cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu"
"csrc/quantization/cutlass_w8a8/moe/moe_data.cu")
@ -687,7 +684,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}")
# 9.0 for latest bf16 atomicAdd PTX
cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;9.0+PTX" "${CUDA_ARCHS}")
cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.7;9.0+PTX" "${CUDA_ARCHS}")
if (MARLIN_MOE_ARCHS)
#

View File

@ -58,8 +58,8 @@ vLLM is fast with:
- Efficient management of attention key and value memory with [**PagedAttention**](https://blog.vllm.ai/2023/06/20/vllm.html)
- Continuous batching of incoming requests
- Fast model execution with CUDA/HIP graph
- Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), [AutoRound](https://arxiv.org/abs/2309.05516),INT4, INT8, and FP8.
- Optimized CUDA kernels, including integration with FlashAttention and FlashInfer.
- Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), [AutoRound](https://arxiv.org/abs/2309.05516), INT4, INT8, and FP8
- Optimized CUDA kernels, including integration with FlashAttention and FlashInfer
- Speculative decoding
- Chunked prefill
@ -72,14 +72,14 @@ vLLM is flexible and easy to use with:
- Tensor parallelism and pipeline parallelism support for distributed inference
- Streaming outputs
- OpenAI-compatible API server
- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, TPU, and AWS Neuron.
- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, TPU, and AWS Neuron
- Prefix caching support
- Multi-LoRA support
vLLM seamlessly supports most popular open-source models on HuggingFace, including:
- Transformer-like LLMs (e.g., Llama)
- Mixture-of-Expert LLMs (e.g., Mixtral, Deepseek-V2 and V3)
- Embedding Models (e.g. E5-Mistral)
- Embedding Models (e.g., E5-Mistral)
- Multi-modal LLMs (e.g., LLaVA)
Find the full list of supported models [here](https://docs.vllm.ai/en/latest/models/supported_models.html).
@ -156,10 +156,10 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs
- For technical questions and feature requests, please use GitHub [Issues](https://github.com/vllm-project/vllm/issues) or [Discussions](https://github.com/vllm-project/vllm/discussions)
- For discussing with fellow users, please use the [vLLM Forum](https://discuss.vllm.ai)
- coordinating contributions and development, please use [Slack](https://slack.vllm.ai)
- For coordinating contributions and development, please use [Slack](https://slack.vllm.ai)
- For security disclosures, please use GitHub's [Security Advisories](https://github.com/vllm-project/vllm/security/advisories) feature
- For collaborations and partnerships, please contact us at [vllm-questions@lists.berkeley.edu](mailto:vllm-questions@lists.berkeley.edu)
## Media Kit
- If you wish to use vLLM's logo, please refer to [our media kit repo](https://github.com/vllm-project/media-kit).
- If you wish to use vLLM's logo, please refer to [our media kit repo](https://github.com/vllm-project/media-kit)

View File

@ -10,11 +10,15 @@
# 3. Set variables (ALL REQUIRED)
# BASE: your directory for vllm repo
# MODEL: the model served by vllm
# TP: ways of tensor parallelism
# DOWNLOAD_DIR: directory to download and load model weights.
# INPUT_LEN: request input len
# OUTPUT_LEN: request output len
# MIN_CACHE_HIT_PCT: prefix cache rate
# MAX_LATENCY_ALLOWED_MS: (e2e) latency requirement. If there's no latency requirement, set it to a large number like 1000000000
# NUM_SEQS_LIST: a list of `max-num-seqs` you want to loop with.
# NUM_BATCHED_TOKENS_LIST: a list of `max-num-batched-tokens` you want to loop with.
# Note that the default NUM_SEQS_LIST and NUM_BATCHED_TOKENS_LIST are set for medium size input/output len, for extra short context (such as 20:20), you might need to include larger numbers in NUM_SEQS_LIST.
# 4. Run the script, it might take a long time, you can use tmux to avoid the script stop if disconnection happens.
# 5. The final result will be saved in RESULT file.
@ -30,31 +34,27 @@
TAG=$(date +"%Y_%m_%d_%H_%M")
BASE=""
MODEL="meta-llama/Llama-3.1-8B-Instruct"
TP=1
DOWNLOAD_DIR=""
INPUT_LEN=4000
OUTPUT_LEN=16
MIN_CACHE_HIT_PCT_PCT=0
MIN_CACHE_HIT_PCT=0
MAX_LATENCY_ALLOWED_MS=100000000000
NUM_SEQS_LIST="128 256"
NUM_BATCHED_TOKENS_LIST="512 1024 2048 4096"
LOG_FOLDER="$BASE/auto-benchmark/$TAG"
RESULT="$LOG_FOLDER/result.txt"
echo "result file$ $RESULT"
echo "result file: $RESULT"
echo "model: $MODEL"
echo
rm -rf $LOG_FOLDER
mkdir -p $LOG_FOLDER
cd "$BASE/vllm"
# create sonnet-4x.txt so that we can sample 2048 tokens for input
echo "" > benchmarks/sonnet_4x.txt
for _ in {1..4}
do
cat benchmarks/sonnet.txt >> benchmarks/sonnet_4x.txt
done
pip install datasets
pip install -q datasets
current_hash=$(git rev-parse HEAD)
echo "hash:$current_hash" >> "$RESULT"
@ -64,53 +64,69 @@ best_throughput=0
best_max_num_seqs=0
best_num_batched_tokens=0
best_goodput=0
start_server() {
local gpu_memory_utilization=$1
local max_num_seqs=$2
local max_num_batched_tokens=$3
local vllm_log=$4
pkill -f vllm
VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 vllm serve $MODEL \
--disable-log-requests \
--port 8004 \
--gpu-memory-utilization $gpu_memory_utilization \
--max-num-seqs $max_num_seqs \
--max-num-batched-tokens $max_num_batched_tokens \
--tensor-parallel-size $TP \
--enable-prefix-caching \
--load-format dummy \
--download-dir "$DOWNLOAD_DIR" \
--max-model-len $(( INPUT_LEN+OUTPUT_LEN )) > "$vllm_log" 2>&1 &
# wait for 10 minutes...
server_started=0
for i in {1..60}; do
RESPONSE=$(curl -s -X GET "http://0.0.0.0:8004/health" -w "%{http_code}" -o /dev/stdout)
STATUS_CODE=$(echo "$RESPONSE" | tail -n 1)
if [[ "$STATUS_CODE" -eq 200 ]]; then
server_started=1
break
else
sleep 10
fi
done
if (( ! server_started )); then
echo "server did not start within 10 minutes. Please check server log at $vllm_log".
return 1
else
return 0
fi
}
run_benchmark() {
local max_num_seqs=$1
local max_num_batched_tokens=$2
local gpu_memory_utilization=$3
echo "max_num_seq: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
local vllm_log="$LOG_FOLDER/vllm_log_${max_num_seqs}_${max_num_batched_tokens}.txt"
echo "vllm_log: $vllm_log"
echo
rm -f $vllm_log
pkill -f vllm
# start the server
VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 vllm serve $MODEL \
--disable-log-requests \
--port 8004 \
--gpu-memory-utilization 0.98 \
--max-num-seqs $max_num_seqs \
--max-num-batched-tokens $max_num_batched_tokens \
--tensor-parallel-size 1 \
--enable-prefix-caching \
--load-format dummy \
--download-dir $DOWNLOAD_DIR \
--max-model-len $(( INPUT_LEN+OUTPUT_LEN )) > "$vllm_log" 2>&1 &
echo "wait for 10 minutes.."
echo
# wait for 10 minutes...
server_started=0
for i in {1..60}; do
if grep -Fq "Application startup complete" "$vllm_log"; then
echo "Application started"
server_started=1
break
else
# echo "wait for 10 seconds..."
sleep 10
fi
done
if (( ! server_started )); then
echo "server did not start within 10 minutes, terminate the benchmarking. Please check server log at $vllm_log"
echo "pkill -f vllm"
echo
pkill vllm
sleep 10
return 1
echo "starting server..."
start_server $gpu_memory_utilization $max_num_seqs $max_num_batched_tokens $vllm_log
result=$?
if [[ "$result" -eq 1 ]]; then
echo "server failed to start. gpu_memory_utilization:$gpu_memory_utilization, max_num_seqs:$max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
else
echo "server started."
fi
echo
echo "run benchmark test..."
echo
meet_latency_requirement=0
# get a basic qps by using request-rate inf
bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_inf.txt"
@ -118,29 +134,29 @@ run_benchmark() {
python benchmarks/benchmark_serving.py \
--backend vllm \
--model $MODEL \
--dataset-name sonnet \
--dataset-path benchmarks/sonnet_4x.txt \
--sonnet-input-len $INPUT_LEN \
--sonnet-output-len $OUTPUT_LEN \
--dataset-name random \
--random-input-len $INPUT_LEN \
--random-output-len $OUTPUT_LEN \
--ignore-eos \
--disable-tqdm \
--request-rate inf \
--percentile-metrics ttft,tpot,itl,e2el \
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \
--num-prompts 100 \
--sonnet-prefix-len $prefix_len \
--port 8004 > "$bm_log"
through_put=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
--num-prompts 1000 \
--random-prefix-len $prefix_len \
--port 8004 &> "$bm_log"
throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then
meet_latency_requirement=1
request_rate=inf
fi
if (( ! meet_latency_requirement )); then
# start from request-rate as int(through_put) + 1
request_rate=$((${through_put%.*} + 1))
# start from request-rate as int(throughput) + 1
request_rate=$((${throughput%.*} + 1))
while ((request_rate > 0)); do
# clear prefix cache
curl -X POST http://0.0.0.0:8004/reset_prefix_cache
@ -149,19 +165,18 @@ run_benchmark() {
python benchmarks/benchmark_serving.py \
--backend vllm \
--model $MODEL \
--dataset-name sonnet \
--dataset-path benchmarks/sonnet_4x.txt \
--sonnet-input-len $INPUT_LEN \
--sonnet-output-len $OUTPUT_LEN \
--ignore_eos \
--dataset-name random \
--random-input-len $INPUT_LEN \
--random-output-len $OUTPUT_LEN \
--ignore-eos \
--disable-tqdm \
--request-rate $request_rate \
--percentile-metrics ttft,tpot,itl,e2el \
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \
--num-prompts 100 \
--sonnet-prefix-len $prefix_len \
--port 8004 > "$bm_log"
through_put=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
--random-prefix-len $prefix_len \
--port 8004 &> "$bm_log"
throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then
@ -173,10 +188,10 @@ run_benchmark() {
fi
# write the results and update the best result.
if ((meet_latency_requirement)); then
echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, through put: $through_put, goodput: $goodput"
echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, through put: $through_put, goodput: $goodput" >> "$RESULT"
if (( $(echo "$through_put > $best_throughput" | bc -l) )); then
best_throughput=$through_put
echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, throughput: $throughput, goodput: $goodput"
echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, throughput: $throughput, goodput: $goodput" >> "$RESULT"
if (( $(echo "$throughput > $best_throughput" | bc -l) )); then
best_throughput=$throughput
best_max_num_seqs=$max_num_seqs
best_num_batched_tokens=$max_num_batched_tokens
best_goodput=$goodput
@ -188,22 +203,39 @@ run_benchmark() {
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput"
echo "pkill -f vllm"
echo
pkill vllm
sleep 10
rm -f $vllm_log
printf '=%.0s' $(seq 1 20)
return 0
}
read -r -a num_seqs_list <<< "$NUM_SEQS_LIST"
read -r -a num_batched_tokens_list <<< "$NUM_BATCHED_TOKENS_LIST"
num_seqs_list="128 256"
num_batched_tokens_list="512 1024 2048 4096"
for num_seqs in $num_seqs_list; do
for num_batched_tokens in $num_batched_tokens_list; do
run_benchmark $num_seqs $num_batched_tokens
exit 0
# first find out the max gpu-memory-utilization without HBM OOM.
gpu_memory_utilization=0.98
find_gpu_memory_utilization=0
while (( $(echo "$gpu_memory_utilization >= 0.9" | bc -l) )); do
start_server $gpu_memory_utilization "${num_seqs_list[-1]}" "${num_batched_tokens_list[-1]}" "$LOG_FOLDER/vllm_log_gpu_memory_utilization_$gpu_memory_utilization.log"
result=$?
if [[ "$result" -eq 0 ]]; then
find_gpu_memory_utilization=1
break
else
gpu_memory_utilization=$(echo "$gpu_memory_utilization - 0.01" | bc)
fi
done
if [[ "$find_gpu_memory_utilization" -eq 1 ]]; then
echo "Using gpu_memory_utilization=$gpu_memory_utilization to serve model."
else
echo "Cannot find a proper gpu_memory_utilization over 0.9 to serve the model, please check logs in $LOG_FOLDER."
exit 1
fi
for num_seqs in "${num_seqs_list[@]}"; do
for num_batched_tokens in "${num_batched_tokens_list[@]}"; do
run_benchmark $num_seqs $num_batched_tokens $gpu_memory_utilization
done
done
echo "finish permutations"

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import io
import json

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This module defines a framework for sampling benchmark requests from various
datasets. Each dataset subclass of BenchmarkDataset must implement sample
@ -864,7 +865,15 @@ class InstructCoderDataset(HuggingFaceDataset):
for item in self.data:
if len(sampled_requests) >= num_requests:
break
prompt = f"{item['instruction']}:\n{item['input']}"
prompt = f"{item['input']}\n\n{item['instruction']} Just output \
the code, do not include any explanation."
# apply template
prompt = tokenizer.apply_chat_template(
[{"role": "user", "content": prompt}],
add_generation_prompt=True,
tokenize=False,
)
prompt_len = len(tokenizer(prompt).input_ids)
sampled_requests.append(
SampleRequest(

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Benchmark the latency of processing a single batch of requests."""
import argparse
@ -122,7 +123,7 @@ def main(args: argparse.Namespace):
save_to_pytorch_benchmark_format(args, results)
if __name__ == "__main__":
def create_argument_parser():
parser = FlexibleArgumentParser(
description="Benchmark the latency of processing a single batch of "
"requests till completion."
@ -170,6 +171,12 @@ if __name__ == "__main__":
# V1 enables prefix caching by default which skews the latency
# numbers. We need to disable prefix caching by default.
parser.set_defaults(enable_prefix_caching=False)
return parser
if __name__ == "__main__":
parser = create_argument_parser()
args = parser.parse_args()
if args.profile and not envs.VLLM_TORCH_PROFILER_DIR:
raise OSError(

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Offline benchmark to test the long document QA throughput.
@ -141,7 +142,7 @@ def main(args):
)
if __name__ == "__main__":
def create_argument_parser():
parser = FlexibleArgumentParser(
description="Benchmark the performance with or "
"without automatic prefix caching."
@ -191,5 +192,11 @@ if __name__ == "__main__":
)
parser = EngineArgs.add_cli_args(parser)
return parser
if __name__ == "__main__":
parser = create_argument_parser()
args = parser.parse_args()
main(args)

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Benchmark the efficiency of prefix caching.
@ -217,7 +218,7 @@ def main(args):
)
if __name__ == "__main__":
def create_argument_parser():
parser = FlexibleArgumentParser(
description="Benchmark the performance with or without "
"automatic prefix caching."
@ -267,5 +268,11 @@ if __name__ == "__main__":
)
parser = EngineArgs.add_cli_args(parser)
return parser
if __name__ == "__main__":
parser = create_argument_parser()
args = parser.parse_args()
main(args)

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Benchmark offline prioritization."""
import argparse
@ -160,7 +161,7 @@ def main(args: argparse.Namespace):
json.dump(results, f, indent=4)
if __name__ == "__main__":
def create_argument_parser():
parser = FlexibleArgumentParser(description="Benchmark the throughput.")
parser.add_argument(
"--backend", type=str, choices=["vllm", "hf", "mii"], default="vllm"
@ -203,6 +204,12 @@ if __name__ == "__main__":
)
parser = EngineArgs.add_cli_args(parser)
return parser
if __name__ == "__main__":
parser = create_argument_parser()
args = parser.parse_args()
if args.tokenizer is None:
args.tokenizer = args.model

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
r"""Benchmark online serving throughput.
On the server side, run one of the following commands:
@ -874,7 +875,7 @@ def main(args: argparse.Namespace):
save_to_pytorch_benchmark_format(args, result_json, file_name)
if __name__ == "__main__":
def create_argument_parser():
parser = FlexibleArgumentParser(
description="Benchmark the online serving throughput."
)
@ -1224,6 +1225,10 @@ if __name__ == "__main__":
"script chooses a LoRA module at random.",
)
args = parser.parse_args()
return parser
if __name__ == "__main__":
parser = create_argument_parser()
args = parser.parse_args()
main(args)

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
r"""Benchmark online serving throughput with structured outputs.
On the server side, run one of the following commands:
@ -11,7 +12,6 @@ On the client side, run:
--model <your_model> \
--dataset json \
--structured-output-ratio 1.0 \
--structured-output-backend auto \
--request-rate 10 \
--num-prompts 1000
@ -850,7 +850,7 @@ def main(args: argparse.Namespace):
json.dump(results, outfile, indent=4)
if __name__ == "__main__":
def create_argument_parser():
parser = FlexibleArgumentParser(
description="Benchmark the online serving throughput."
)
@ -1034,5 +1034,10 @@ if __name__ == "__main__":
help="Ratio of Structured Outputs requests",
)
return parser
if __name__ == "__main__":
parser = create_argument_parser()
args = parser.parse_args()
main(args)

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Benchmark offline inference throughput."""
import argparse
@ -594,7 +595,7 @@ def validate_args(args):
)
if __name__ == "__main__":
def create_argument_parser():
parser = FlexibleArgumentParser(description="Benchmark the throughput.")
parser.add_argument(
"--backend",
@ -716,6 +717,12 @@ if __name__ == "__main__":
)
parser = AsyncEngineArgs.add_cli_args(parser)
return parser
if __name__ == "__main__":
parser = create_argument_parser()
args = parser.parse_args()
if args.tokenizer is None:
args.tokenizer = args.model

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import json
@ -65,4 +66,9 @@ class InfEncoder(json.JSONEncoder):
def write_to_json(filename: str, records: list) -> None:
with open(filename, "w") as f:
json.dump(records, f, cls=InfEncoder)
json.dump(
records,
f,
cls=InfEncoder,
default=lambda o: f"<{type(o).__name__} object is not JSON serializable>",
)

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import copy

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Cutlass bench utils
from collections.abc import Iterable

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import copy

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Weight Shapes are in the format
# ([K, N], TP_SPLIT_DIM)

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
import itertools

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pickle as pkl
import time

View File

@ -4,11 +4,85 @@ import copy
import itertools
import torch
import triton
from weight_shapes import WEIGHT_SHAPES
from vllm._custom_ops import cutlass_scaled_mm as vllm_scaled_mm
from vllm._custom_ops import scaled_fp8_quant as vllm_scaled_fp8_quant
from vllm.triton_utils import triton
PROVIDER_CFGS = {
"torch-bf16": dict(enabled=True),
"fp8-tensor-w-token-a": dict(
w="tensor", a="token", no_a_quant=False, enabled=False
),
"fp8-tensor-w-tensor-a": dict(
w="tensor", a="tensor", no_a_quant=False, enabled=True
),
"fp8-channel-w-token-a": dict(
w="channel", a="token", no_a_quant=False, enabled=True
),
"fp8-channel-w-tensor-a": dict(
w="channel", a="tensor", no_a_quant=False, enabled=False
),
"fp8-tensor-w-token-a-noquant": dict(
w="tensor", a="token", no_a_quant=True, enabled=False
),
"fp8-tensor-w-tensor-a-noquant": dict(
w="tensor", a="tensor", no_a_quant=True, enabled=True
),
"fp8-channel-w-token-a-noquant": dict(
w="channel", a="token", no_a_quant=True, enabled=True
),
"fp8-channel-w-tensor-a-noquant": dict(
w="channel", a="tensor", no_a_quant=True, enabled=False
),
}
_enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]]
def _quant_weight_fp8(b: torch.Tensor, w_type: str, device: str):
if w_type == "tensor":
scale_b = torch.ones(1, device=device, dtype=torch.float32)
b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
else:
b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, use_per_token_if_dynamic=True)
return b_fp8.t(), scale_b_fp8
def build_fp8_runner(cfg, a, b, dtype, device):
b_fp8, scale_b_fp8 = _quant_weight_fp8(b, cfg["w"], device)
scale_a_const = (
torch.ones(1, device=device, dtype=torch.float32)
if cfg["a"] == "tensor"
else None
)
if cfg["no_a_quant"]:
if cfg["a"] == "tensor":
a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a_const)
else:
a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, use_per_token_if_dynamic=True)
def run():
return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
return run
if cfg["a"] == "tensor":
def run():
a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a_const)
return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
else:
def run():
a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, use_per_token_if_dynamic=True)
return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
return run
@triton.testing.perf_report(
@ -17,28 +91,8 @@ from vllm._custom_ops import scaled_fp8_quant as vllm_scaled_fp8_quant
x_vals=[1, 16, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384],
x_log=False,
line_arg="provider",
line_vals=[
"torch-bf16",
# "fp8-tensor-w-token-a",
"fp8-tensor-w-tensor-a",
"fp8-channel-w-token-a",
# "fp8-channel-w-tensor-a",
# "fp8-tensor-w-token-a-noquant",
"fp8-tensor-w-tensor-a-noquant",
"fp8-channel-w-token-a-noquant",
# "fp8-channel-w-tensor-a-noquant",
],
line_names=[
"torch-bf16",
# "fp8-tensor-w-token-a",
"fp8-tensor-w-tensor-a",
"fp8-channel-w-token-a",
# "fp8-channel-w-tensor-a",
# "fp8-tensor-w-token-a-noquant",
"fp8-tensor-w-tensor-a-noquant",
"fp8-channel-w-token-a-noquant",
# "fp8-channel-w-tensor-a-noquant",
],
line_vals=_enabled,
line_names=_enabled,
ylabel="TFLOP/s (larger is better)",
plot_name="BF16 vs FP8 GEMMs",
args={},
@ -49,144 +103,34 @@ def benchmark(batch_size, provider, N, K):
device = "cuda"
dtype = torch.bfloat16
# Create input tensors
a = torch.randn((M, K), device=device, dtype=dtype)
b = torch.randn((N, K), device=device, dtype=dtype)
quantiles = [0.5, 0.2, 0.8]
if "torch-bf16" in provider:
if provider == "torch-bf16":
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
lambda: torch.nn.functional.linear(a, b), quantiles=quantiles
)
elif "fp8" in provider:
# Weights are always quantized ahead of time
if "noquant" in provider:
# For no quantization, we just measure the GEMM
if "tensor-w-token-a" in provider:
# Dynamic per-token quant for A, per-tensor quant for B
b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b)
assert scale_b_fp8.numel() == 1
a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(
a, use_per_token_if_dynamic=True
)
def run_quant():
return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
elif "tensor-w-tensor-a" in provider:
# Static per-tensor quantization with fixed scales
# for both A and B
scale_a = torch.tensor([1.0], device=device, dtype=torch.float32)
scale_b = torch.tensor([1.0], device=device, dtype=torch.float32)
b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
assert scale_b_fp8.numel() == 1
a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a)
def run_quant():
return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
elif "channel-w-token-a" in provider:
# Static per-channel quantization for weights, per-token
# quant for A
scale_b = torch.tensor((N,), device=device, dtype=torch.float32)
b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
scale_b_fp8 = scale_b_fp8.expand(N).contiguous()
assert scale_b_fp8.numel() == N
a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(
a, use_per_token_if_dynamic=True
)
def run_quant():
return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
elif "channel-w-tensor-a" in provider:
# Static per-channel quantization for weights, per-tensor
# quant for A
scale_a = torch.tensor([1.0], device=device, dtype=torch.float32)
scale_b = torch.tensor((N,), device=device, dtype=torch.float32)
b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
scale_b_fp8 = scale_b_fp8.expand(N).contiguous()
assert scale_b_fp8.numel() == N
a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a)
def run_quant():
return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
else:
# In these cases, we quantize the activations during the GEMM call
if "tensor-w-token-a" in provider:
# Dynamic per-token quant for A, per-tensor quant for B
b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b)
assert scale_b_fp8.numel() == 1
def run_quant():
a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(
a, use_per_token_if_dynamic=True
)
return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
elif "tensor-w-tensor-a" in provider:
# Static per-tensor quantization with fixed scales
# for both A and B
scale_a = torch.tensor([1.0], device=device, dtype=torch.float32)
scale_b = torch.tensor([1.0], device=device, dtype=torch.float32)
b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
assert scale_b_fp8.numel() == 1
def run_quant():
a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a)
return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
elif "channel-w-token-a" in provider:
# Static per-channel quantization for weights, per-token
# quant for A
scale_b = torch.tensor((N,), device=device, dtype=torch.float32)
b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
scale_b_fp8 = scale_b_fp8.expand(N).contiguous()
assert scale_b_fp8.numel() == N
def run_quant():
a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(
a, use_per_token_if_dynamic=True
)
return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
elif "channel-w-tensor-a" in provider:
# Static per-channel quantization for weights, per-tensor
# quant for A
scale_a = torch.tensor([1.0], device=device, dtype=torch.float32)
scale_b = torch.tensor((N,), device=device, dtype=torch.float32)
b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
scale_b_fp8 = scale_b_fp8.expand(N).contiguous()
assert scale_b_fp8.numel() == N
def run_quant():
a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a)
return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
b_fp8 = b_fp8.t()
else:
cfg = PROVIDER_CFGS[provider]
run_quant = build_fp8_runner(cfg, a, b, dtype, device)
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
lambda: run_quant(), quantiles=quantiles
)
# Calculate TFLOP/s, two flops per multiply-add
tflops = lambda ms: (2 * M * N * K) * 1e-12 / (ms * 1e-3)
return tflops(ms), tflops(max_ms), tflops(min_ms)
to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3)
return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms)
def prepare_shapes(args):
KN_model_names = []
models_tps = list(itertools.product(args.models, args.tp_sizes))
for model, tp_size in models_tps:
assert model in WEIGHT_SHAPES
for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model]):
KN[tp_split_dim] = KN[tp_split_dim] // tp_size
out = []
for model, tp_size in itertools.product(args.models, args.tp_sizes):
for KN, tp_dim in copy.deepcopy(WEIGHT_SHAPES[model]):
KN[tp_dim] //= tp_size
KN.append(model)
KN_model_names.append(KN)
return KN_model_names
out.append(KN)
return out
if __name__ == "__main__":
@ -196,21 +140,13 @@ if __name__ == "__main__":
nargs="+",
type=str,
default=["meta-llama/Llama-3.1-8B-Instruct"],
choices=[*WEIGHT_SHAPES.keys()],
help="List of models to benchmark",
)
parser.add_argument(
"--tp-sizes",
nargs="+",
type=int,
default=[1],
help="List of tensor parallel sizes",
choices=list(WEIGHT_SHAPES.keys()),
)
parser.add_argument("--tp-sizes", nargs="+", type=int, default=[1])
args = parser.parse_args()
KN_model_names = prepare_shapes(args)
for K, N, model_name in KN_model_names:
print(f"{model_name}, N={N} K={K}, BF16 vs FP8 GEMMs TFLOP/s:")
for K, N, model in prepare_shapes(args):
print(f"{model}, N={N} K={K}, BF16 vs FP8 GEMMs TFLOP/s:")
benchmark.run(
print_data=True,
show_plots=True,

View File

@ -0,0 +1,169 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import copy
import itertools
import torch
from weight_shapes import WEIGHT_SHAPES
from vllm._custom_ops import cutlass_scaled_mm as vllm_scaled_mm
from vllm._custom_ops import scaled_int8_quant as vllm_scaled_int8_quant
from vllm.triton_utils import triton
PROVIDER_CFGS = {
"torch-bf16": dict(enabled=True),
"int8-tensor-w-token-a": dict(
w="tensor", a="token", no_a_quant=False, enabled=False
),
"int8-tensor-w-tensor-a": dict(
w="tensor", a="tensor", no_a_quant=False, enabled=True
),
"int8-channel-w-token-a": dict(
w="channel", a="token", no_a_quant=False, enabled=True
),
"int8-channel-w-tensor-a": dict(
w="channel", a="tensor", no_a_quant=False, enabled=False
),
"int8-tensor-w-token-a-noquant": dict(
w="tensor", a="token", no_a_quant=True, enabled=False
),
"int8-tensor-w-tensor-a-noquant": dict(
w="tensor", a="tensor", no_a_quant=True, enabled=True
),
"int8-channel-w-token-a-noquant": dict(
w="channel", a="token", no_a_quant=True, enabled=True
),
"int8-channel-w-tensor-a-noquant": dict(
w="channel", a="tensor", no_a_quant=True, enabled=False
),
}
def _quant_weight(b, w_type, device):
if w_type == "tensor":
scale_b = torch.ones(1, device=device, dtype=torch.float32)
b_int8, scale_b_int8, _ = vllm_scaled_int8_quant(b, scale_b)
assert scale_b_int8.numel() == 1
else: # channel
b_int8, scale_b_int8, _ = vllm_scaled_int8_quant(b)
assert scale_b_int8.numel() == b.shape[0]
return b_int8.t(), scale_b_int8
def build_int8_runner(cfg, a, b, dtype, device):
# quant before running the kernel
b_int8, scale_b_int8 = _quant_weight(b, cfg["w"], device)
scale_a_const = None
if cfg["a"] == "tensor":
scale_a_const = torch.ones(1, device=device, dtype=torch.float32)
# no quant, create activation ahead
if cfg["no_a_quant"]:
if cfg["a"] == "tensor":
a_int8, scale_a_int8, _ = vllm_scaled_int8_quant(a, scale_a_const)
else: # token
a_int8, scale_a_int8, _ = vllm_scaled_int8_quant(a)
def run_quant():
return vllm_scaled_mm(a_int8, b_int8, scale_a_int8, scale_b_int8, dtype)
return run_quant
# dynamic quant, create activation inside
if cfg["a"] == "tensor":
def run_quant():
a_int8, scale_a_int8, _ = vllm_scaled_int8_quant(a, scale_a_const)
return vllm_scaled_mm(a_int8, b_int8, scale_a_int8, scale_b_int8, dtype)
else: # token
def run_quant():
a_int8, scale_a_int8, _ = vllm_scaled_int8_quant(a)
return vllm_scaled_mm(a_int8, b_int8, scale_a_int8, scale_b_int8, dtype)
return run_quant
_enabled = [k for k, v in PROVIDER_CFGS.items() if v.get("enabled")]
@triton.testing.perf_report(
triton.testing.Benchmark(
x_names=["batch_size"],
x_vals=[1, 16, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384],
x_log=False,
line_arg="provider",
line_vals=_enabled,
line_names=[k for k in _enabled],
ylabel="TFLOP/s (larger is better)",
plot_name="BF16 vs INT8 GEMMs",
args={},
)
)
def benchmark(batch_size, provider, N, K):
M = batch_size
device = "cuda"
dtype = torch.bfloat16
a = torch.randn((M, K), device=device, dtype=dtype)
b = torch.randn((N, K), device=device, dtype=dtype)
quantiles = [0.5, 0.2, 0.8]
if provider == "torch-bf16":
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
lambda: torch.nn.functional.linear(a, b), quantiles=quantiles
)
else:
cfg = PROVIDER_CFGS[provider]
run_quant = build_int8_runner(cfg, a, b, dtype, device)
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
lambda: run_quant(), quantiles=quantiles
)
to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3)
return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms)
def prepare_shapes(args):
KN_model_names = []
for model, tp_size in itertools.product(args.models, args.tp_sizes):
for KN, tp_dim in copy.deepcopy(WEIGHT_SHAPES[model]):
KN[tp_dim] //= tp_size
KN.append(model)
KN_model_names.append(KN)
return KN_model_names
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--models",
nargs="+",
type=str,
default=["meta-llama/Llama-3.1-8B-Instruct"],
choices=list(WEIGHT_SHAPES.keys()),
help="List of models to benchmark",
)
parser.add_argument(
"--tp-sizes",
nargs="+",
type=int,
default=[1],
help="List of tensor parallel sizes",
)
args = parser.parse_args()
for K, N, model in prepare_shapes(args):
print(f"{model}, N={N} K={K}, BF16 vs INT8 GEMMs TFLOP/s:")
benchmark.run(
print_data=True,
show_plots=True,
save_path=f"bench_int8_res_n{N}_k{K}",
N=N,
K=K,
)
print("Benchmark finished!")

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import sys

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Benchmark the performance of the cutlass_moe_fp4 kernel vs the triton_moe
kernel. The cutlass_moe_fp4 kernel takes in fp4 quantized weights and 16-bit
@ -90,7 +91,7 @@ def bench_run(
score = torch.randn((m, num_experts), device=device, dtype=dtype)
topk_weights, topk_ids = fused_topk(a, score, topk, renormalize=False)
topk_weights, topk_ids, _ = fused_topk(a, score, topk, renormalize=False)
quant_blocksize = 16
w1_blockscale = torch.empty(

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import torch
import torch.utils.benchmark as benchmark
@ -6,8 +7,8 @@ from benchmark_shapes import WEIGHT_SHAPES_MOE
from vllm import _custom_ops as ops
from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8
from vllm.model_executor.layers.fused_moe.fused_moe import (
cutlass_moe_fp8,
fused_experts,
fused_topk,
)
@ -69,18 +70,9 @@ def bench_run(
w1_scale = torch.empty((num_experts, 1, 1), device="cuda", dtype=torch.float32)
w2_scale = torch.empty((num_experts, 1, 1), device="cuda", dtype=torch.float32)
ab_strides1 = torch.full((num_experts,), k, device="cuda", dtype=torch.int64)
c_strides1 = torch.full((num_experts,), 2 * n, device="cuda", dtype=torch.int64)
ab_strides2 = torch.full((num_experts,), n, device="cuda", dtype=torch.int64)
c_strides2 = torch.full((num_experts,), k, device="cuda", dtype=torch.int64)
for expert in range(num_experts):
w1_q[expert], w1_scale[expert] = ops.scaled_fp8_quant(w1[expert])
w2_q[expert], w2_scale[expert] = ops.scaled_fp8_quant(w2[expert])
w1_q_notransp = w1_q.clone()
w2_q_notransp = w2_q.clone()
w1_q = w1_q.transpose(1, 2)
w2_q = w2_q.transpose(1, 2)
score = torch.randn((m, num_experts), device="cuda", dtype=dtype)
@ -121,10 +113,6 @@ def bench_run(
w2_scale: torch.Tensor,
topk_weights: torch.Tensor,
topk_ids: torch.Tensor,
ab_strides1: torch.Tensor,
c_strides1: torch.Tensor,
ab_strides2: torch.Tensor,
c_strides2: torch.Tensor,
num_repeats: int,
):
for _ in range(num_repeats):
@ -132,14 +120,10 @@ def bench_run(
a,
w1,
w2,
w1_scale,
w2_scale,
topk_weights,
topk_ids,
ab_strides1,
c_strides1,
ab_strides2,
c_strides2,
w1_scale,
w2_scale,
a1_scale=a_scale,
)
@ -152,10 +136,6 @@ def bench_run(
w2_scale: torch.Tensor,
topk_weights: torch.Tensor,
topk_ids: torch.Tensor,
ab_strides1: torch.Tensor,
c_strides1: torch.Tensor,
ab_strides2: torch.Tensor,
c_strides2: torch.Tensor,
):
with set_current_vllm_config(
VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
@ -164,14 +144,10 @@ def bench_run(
a,
w1_q,
w2_q,
w1_scale,
w2_scale,
topk_weights,
topk_ids,
ab_strides1,
c_strides1,
ab_strides2,
c_strides2,
w1_scale,
w2_scale,
a1_scale=a_scale,
)
@ -217,10 +193,6 @@ def bench_run(
w2_scale,
topk_weights,
topk_ids,
ab_strides1,
c_strides1,
ab_strides2,
c_strides2,
)
torch.cuda.synchronize()
@ -229,8 +201,8 @@ def bench_run(
with torch.cuda.graph(triton_graph, stream=triton_stream):
run_triton_from_graph(
a,
w1_q_notransp,
w2_q_notransp,
w1_q,
w2_q,
topk_weights,
topk_ids,
w1_scale,
@ -249,18 +221,12 @@ def bench_run(
"w2": w2,
"score": score,
"topk": topk,
"w1_q_notransp": w1_q_notransp,
"w2_q_notransp": w2_q_notransp,
# Cutlass params
"a_scale": a_scale,
"w1_q": w1_q,
"w2_q": w2_q,
"w1_scale": w1_scale,
"w2_scale": w2_scale,
"ab_strides1": ab_strides1,
"c_strides1": c_strides1,
"ab_strides2": ab_strides2,
"c_strides2": c_strides2,
# cuda graph params
"cutlass_graph": cutlass_graph,
"triton_graph": triton_graph,
@ -278,8 +244,8 @@ def bench_run(
# Warmup
run_triton_moe(
a,
w1_q_notransp,
w2_q_notransp,
w1_q,
w2_q,
topk_weights,
topk_ids,
w1_scale,
@ -290,7 +256,7 @@ def bench_run(
results.append(
benchmark.Timer(
stmt="run_triton_moe(a, w1_q_notransp, w2_q_notransp, topk_weights, topk_ids, w1_scale, w2_scale, a_scale, num_runs)", # noqa: E501
stmt="run_triton_moe(a, w1_q, w2_q, topk_weights, topk_ids, w1_scale, w2_scale, a_scale, num_runs)", # noqa: E501
globals=globals,
label=label,
sub_label=sub_label,
@ -321,16 +287,12 @@ def bench_run(
w2_scale,
topk_weights,
topk_ids,
ab_strides1,
c_strides1,
ab_strides2,
c_strides2,
num_warmup,
)
results.append(
benchmark.Timer(
stmt="run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, topk_weights, topk_ids, ab_strides1, c_strides1, ab_strides2, c_strides2, num_runs)", # noqa: E501
stmt="run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, topk_weights, topk_ids, num_runs)", # noqa: E501
globals=globals,
label=label,
sub_label=sub_label,

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import time

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import copy

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import copy

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import torch
import torch.utils.benchmark as benchmark

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import json
@ -6,7 +7,6 @@ import time
from contextlib import nullcontext
from datetime import datetime
from itertools import product
from types import SimpleNamespace
from typing import Any, TypedDict
import ray
@ -42,7 +42,7 @@ def benchmark_config(
use_fp8_w8a8: bool,
use_int8_w8a16: bool,
num_iters: int = 100,
block_quant_shape: List[int] = None,
block_quant_shape: list[int] = None,
use_deep_gemm: bool = False,
) -> float:
init_dtype = torch.float16 if use_fp8_w8a8 else dtype
@ -399,7 +399,7 @@ class BenchmarkWorker:
dtype: torch.dtype,
use_fp8_w8a8: bool,
use_int8_w8a16: bool,
block_quant_shape: List[int] = None,
block_quant_shape: list[int] = None,
use_deep_gemm: bool = False,
) -> tuple[dict[str, int], float]:
current_platform.seed_everything(self.seed)
@ -531,7 +531,7 @@ def save_configs(
dtype: torch.dtype,
use_fp8_w8a8: bool,
use_int8_w8a16: bool,
block_quant_shape: List[int],
block_quant_shape: list[int],
) -> None:
dtype_str = get_config_dtype_str(
dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8
@ -562,7 +562,6 @@ def main(args: argparse.Namespace):
config = get_config(model=args.model, trust_remote_code=args.trust_remote_code)
if args.model_prefix:
config = getattr(config, args.model_prefix)
config = SimpleNamespace(**config)
if config.architectures[0] == "DbrxForCausalLM":
E = config.ffn_config.moe_num_experts
@ -594,11 +593,7 @@ def main(args: argparse.Namespace):
shard_intermediate_size = 2 * intermediate_size // args.tp_size
hidden_size = config.hidden_size
dtype = (
torch.float16
if current_platform.is_rocm()
else getattr(torch, config.torch_dtype)
)
dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
use_fp8_w8a8 = args.dtype == "fp8_w8a8"
use_int8_w8a16 = args.dtype == "int8_w8a16"
block_quant_shape = get_weight_block_size_safety(config)

View File

@ -0,0 +1,159 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import itertools
import torch
from vllm import _custom_ops as ops
from vllm.model_executor.layers.fused_moe.moe_align_block_size import (
moe_align_block_size_triton,
)
from vllm.triton_utils import triton
def get_topk_ids(num_tokens: int, num_experts: int, topk: int) -> torch.Tensor:
return torch.stack(
[
torch.randperm(num_experts, dtype=torch.int32, device="cuda")[:topk]
for _ in range(num_tokens)
]
)
def check_correctness(num_tokens, num_experts=256, block_size=256, topk=8):
"""
Verifies vllm vs. Triton
"""
topk_ids = get_topk_ids(num_tokens, num_experts, topk)
# 1. malloc space for triton and vllm
# malloc enough space (max_num_tokens_padded) for the sorted ids
max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1)
sorted_ids_triton = torch.empty(
(max_num_tokens_padded,), dtype=torch.int32, device="cuda"
)
sorted_ids_triton.fill_(topk_ids.numel()) # fill with sentinel value
expert_ids_triton = torch.zeros(
(max_num_tokens_padded // block_size,), dtype=torch.int32, device="cuda"
)
num_tokens_post_pad_triton = torch.empty((1,), dtype=torch.int32, device="cuda")
sorted_ids_vllm = torch.empty_like(sorted_ids_triton)
sorted_ids_vllm.fill_(topk_ids.numel())
expert_ids_vllm = torch.zeros_like(expert_ids_triton)
num_tokens_post_pad_vllm = torch.empty_like(num_tokens_post_pad_triton)
# 2. run implementations
moe_align_block_size_triton(
topk_ids,
num_experts,
block_size,
sorted_ids_triton,
expert_ids_triton,
num_tokens_post_pad_triton,
)
ops.moe_align_block_size(
topk_ids,
num_experts,
block_size,
sorted_ids_vllm,
expert_ids_vllm,
num_tokens_post_pad_vllm,
)
print(f"✅ VLLM implementation works with {num_experts} experts!")
# 3. compare results
if torch.allclose(expert_ids_triton, expert_ids_vllm) and torch.allclose(
num_tokens_post_pad_triton, num_tokens_post_pad_vllm
):
print("✅ Triton and VLLM implementations match.")
else:
print("❌ Triton and VLLM implementations DO NOT match.")
print("Triton expert_ids:", expert_ids_triton)
print("VLLM expert_ids:", expert_ids_vllm)
print("Triton num_tokens_post_pad:", num_tokens_post_pad_triton)
print("VLLM num_tokens_post_pad:", num_tokens_post_pad_vllm)
# test configurations
num_tokens_range = [1, 16, 256, 4096]
num_experts_range = [16, 64, 224, 256, 280, 512]
topk_range = [1, 2, 8]
configs = list(itertools.product(num_tokens_range, num_experts_range, topk_range))
@triton.testing.perf_report(
triton.testing.Benchmark(
x_names=["num_tokens", "num_experts", "topk"],
x_vals=configs,
line_arg="provider",
line_vals=["vllm", "triton"], # "triton"
line_names=["VLLM", "Triton"], # "Triton"
plot_name="moe-align-block-size-performance",
args={},
)
)
def benchmark(num_tokens, num_experts, topk, provider):
"""Benchmark function for Triton."""
block_size = 256
topk_ids = get_topk_ids(num_tokens, num_experts, topk)
max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1)
sorted_ids = torch.empty((max_num_tokens_padded,), dtype=torch.int32, device="cuda")
sorted_ids.fill_(topk_ids.numel())
max_num_m_blocks = max_num_tokens_padded // block_size
expert_ids = torch.empty((max_num_m_blocks,), dtype=torch.int32, device="cuda")
num_tokens_post_pad = torch.empty((1,), dtype=torch.int32, device="cuda")
quantiles = [0.5, 0.2, 0.8]
if provider == "vllm":
ms, min_ms, max_ms = triton.testing.do_bench(
lambda: ops.moe_align_block_size(
topk_ids,
num_experts,
block_size,
sorted_ids.clone(),
expert_ids.clone(),
num_tokens_post_pad.clone(),
),
quantiles=quantiles,
)
elif provider == "triton":
ms, min_ms, max_ms = triton.testing.do_bench(
lambda: moe_align_block_size_triton(
topk_ids,
num_experts,
block_size,
sorted_ids.clone(),
expert_ids.clone(),
num_tokens_post_pad.clone(),
),
quantiles=quantiles,
)
return 1000 * ms, 1000 * max_ms, 1000 * min_ms
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--num_experts",
type=int,
default=64,
choices=[8, 16, 32, 64, 128, 256],
)
parser.add_argument(
"--topk",
type=int,
default=8,
choices=[2, 4, 8],
help="Top-k value for correctness check.",
)
args = parser.parse_args()
print("Running correctness check...")
check_correctness(num_tokens=1024, num_experts=args.num_experts, topk=args.topk)
benchmark.run(print_data=True, show_plots=True)

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
from typing import Any, TypedDict

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import random
import time

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import time

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import itertools
from typing import Optional, Union

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from itertools import accumulate
from typing import Optional

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
WEIGHT_SHAPES = {
"ideal": [[4 * 256 * 32, 256 * 32]],

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Adapted from sglang quantization/tuning_block_wise_kernel.py
import argparse

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# fmt: off
# ruff: noqa: E501
import time

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import math
import pickle

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import dataclasses
from collections.abc import Iterable

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Weight Shapes are in the format
# ([K, N], TP_SPLIT_DIM)

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import cProfile
import pstats

View File

@ -75,6 +75,7 @@ if (MACOSX_FOUND AND CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64")
else()
find_isa(${CPUINFO} "avx2" AVX2_FOUND)
find_isa(${CPUINFO} "avx512f" AVX512_FOUND)
find_isa(${CPUINFO} "Power11" POWER11_FOUND)
find_isa(${CPUINFO} "POWER10" POWER10_FOUND)
find_isa(${CPUINFO} "POWER9" POWER9_FOUND)
find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support
@ -106,13 +107,19 @@ elseif (AVX2_FOUND)
list(APPEND CXX_COMPILE_FLAGS "-mavx2")
message(WARNING "vLLM CPU backend using AVX2 ISA")
elseif (POWER9_FOUND OR POWER10_FOUND)
elseif (POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND)
message(STATUS "PowerPC detected")
# Check for PowerPC VSX support
list(APPEND CXX_COMPILE_FLAGS
"-mvsx"
"-mcpu=native"
"-mtune=native")
if (POWER9_FOUND)
list(APPEND CXX_COMPILE_FLAGS
"-mvsx"
"-mcpu=power9"
"-mtune=power9")
elseif (POWER10_FOUND OR POWER11_FOUND)
list(APPEND CXX_COMPILE_FLAGS
"-mvsx"
"-mcpu=power10"
"-mtune=power10")
endif()
elseif (ASIMD_FOUND)
message(STATUS "ARMv8 or later architecture detected")

View File

@ -38,7 +38,7 @@ else()
FetchContent_Declare(
vllm-flash-attn
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
GIT_TAG 8798f27777fb57f447070301bf33a9f9c607f491
GIT_TAG 763ad155a1c826f71ff318f41edb1e4e5e376ddb
GIT_PROGRESS TRUE
# Don't share the vllm-flash-attn build between build types
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn

View File

@ -1,5 +1,6 @@
#!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
#
# A command line tool for running pytorch's hipify preprocessor on CUDA

View File

@ -122,6 +122,7 @@ function (get_torch_gpu_compiler_flags OUT_GPU_FLAGS GPU_LANG)
"-DENABLE_FP8"
"-U__HIP_NO_HALF_CONVERSIONS__"
"-U__HIP_NO_HALF_OPERATORS__"
"-Werror=unused-variable"
"-fno-gpu-rdc")
endif()

View File

@ -119,7 +119,7 @@ typename T::Fmha::Arguments args_from_options(
{static_cast<ElementOut*>(out.data_ptr()), stride_O,
static_cast<ElementAcc*>(nullptr), stride_LSE},
hw_info,
-1, // split_kv
1, // split_kv
nullptr, // is_var_split_kv
};
// TODO(kaixih@nvidia): When split_kv=-1 and is_var_split_kv=false, we compute

View File

@ -65,9 +65,6 @@ void paged_attention_v1_launcher(
int kv_block_stride = key_cache.stride(0);
int kv_head_stride = key_cache.stride(1);
[[maybe_unused]] int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1);
assert(head_size % thread_group_size == 0);
// NOTE: alibi_slopes is optional.
const float* alibi_slopes_ptr =
alibi_slopes
@ -193,4 +190,4 @@ void paged_attention_v1(
#undef WARP_SIZE
#undef MAX
#undef MIN
#undef DIVIDE_ROUND_UP
#undef DIVIDE_ROUND_UP

View File

@ -66,9 +66,6 @@ void paged_attention_v2_launcher(
int kv_block_stride = key_cache.stride(0);
int kv_head_stride = key_cache.stride(1);
[[maybe_unused]] int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1);
assert(head_size % thread_group_size == 0);
// NOTE: alibi_slopes is optional.
const float* alibi_slopes_ptr =
alibi_slopes
@ -203,4 +200,4 @@ void paged_attention_v2(
#undef WARP_SIZE
#undef MAX
#undef MIN
#undef DIVIDE_ROUND_UP
#undef DIVIDE_ROUND_UP

View File

@ -137,8 +137,8 @@ FORCE_INLINE std::pair<T, T> reduceSoftmaxAlibi(T* data, const int size,
}
template <typename T>
FORCE_INLINE void reducePartitonSoftmax(const T* max_data, T* sum_data,
const int size) {
FORCE_INLINE void reducePartitionSoftmax(const T* max_data, T* sum_data,
const int size) {
T max = max_data[0];
for (int i = 1; i < size; ++i) {
max = max >= max_data[i] ? max : max_data[i];
@ -634,7 +634,7 @@ struct paged_attention_v2_impl {
if (partition_num == 1) continue;
reducePartitonSoftmax(
reducePartitionSoftmax(
max_logits + seq_idx * num_heads * max_num_partitions +
head_idx * max_num_partitions,
exp_sums + seq_idx * num_heads * max_num_partitions +

View File

@ -83,7 +83,7 @@ struct FP16Vec16 : public Vec<FP16Vec16> {
explicit FP16Vec16(const void* ptr)
: reg((__m256i)_mm256_loadu_si256((__m256i*)ptr)) {}
// non-temproal load
// non-temporal load
explicit FP16Vec16(bool, void* ptr)
: reg(_mm256_stream_load_si256((__m256i*)ptr)) {}
@ -120,7 +120,7 @@ struct BF16Vec16 : public Vec<BF16Vec16> {
explicit BF16Vec16(const void* ptr)
: reg((__m256i)_mm256_loadu_si256((__m256i*)ptr)) {}
// non-temproal load
// non-temporal load
explicit BF16Vec16(bool, void* ptr)
: reg(_mm256_stream_load_si256((__m256i*)ptr)) {}
@ -327,7 +327,7 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
// normal load
explicit FP32Vec16(const float* ptr) : reg(_mm512_loadu_ps(ptr)) {}
// non-temproal load
// non-temporal load
explicit FP32Vec16(bool, void* ptr)
: reg((__m512)_mm512_stream_load_si512(ptr)) {}
@ -576,7 +576,7 @@ struct INT8Vec64 : public Vec<INT8Vec64> {
// normal load
explicit INT8Vec64(void* ptr) : reg(_mm512_loadu_epi8(ptr)) {}
// non-temproal load
// non-temporal load
explicit INT8Vec64(bool, void* ptr) : reg(_mm512_stream_load_si512(ptr)) {}
void save(void* ptr) const { _mm512_storeu_epi8(ptr, reg); }
@ -587,7 +587,7 @@ struct INT8Vec64 : public Vec<INT8Vec64> {
_mm512_mask_storeu_epi8(ptr, mask, reg);
}
// non-temproal save
// non-temporal save
void nt_save(int8_t* ptr) { _mm512_stream_si512((__m512i*)ptr, reg); }
};
#endif

View File

@ -54,8 +54,7 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) {
*(src_mask->maskp) = *(src_mask->maskp) ^ *(mask->maskp);
int page_num = numa_migrate_pages(pid, src_mask, mask);
if (page_num == -1) {
TORCH_CHECK(false,
"numa_migrate_pages failed. errno: " + std::to_string(errno));
TORCH_WARN("numa_migrate_pages failed. errno: " + std::to_string(errno));
}
// restrict memory allocation node.
@ -105,4 +104,4 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) {
return ss.str();
}
#endif
#endif

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import enum
from typing import Union

View File

@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import glob
import itertools
import os

View File

@ -13,232 +13,45 @@
namespace vllm {
namespace moe {
namespace {
__device__ __forceinline__ int32_t index(int32_t total_col, int32_t row,
int32_t col) {
// don't worry about overflow because num_experts is relatively small
return row * total_col + col;
}
} // namespace
template <typename scalar_t, typename token_cnts_t>
__global__ void moe_align_block_size_kernel(scalar_t* __restrict__ topk_ids,
int32_t* sorted_token_ids,
int32_t* expert_ids,
int32_t* total_tokens_post_pad,
int32_t num_experts,
int32_t block_size, size_t numel) {
const size_t tokens_per_thread = CEILDIV(numel, blockDim.x);
const size_t start_idx = threadIdx.x * tokens_per_thread;
extern __shared__ int32_t shared_mem[];
int32_t* cumsum = shared_mem; // 1d tensor with shape (num_experts + 1)
token_cnts_t* tokens_cnts =
(token_cnts_t*)(shared_mem + num_experts +
1); // 2d tensor with shape (blockDim.x + 1, num_experts)
for (int i = 0; i < num_experts; ++i) {
tokens_cnts[index(num_experts, threadIdx.x + 1, i)] = 0;
}
/**
* In the first step we compute token_cnts[thread_index + 1][expert_index],
* which counts how many tokens in the token shard of thread_index are
* assigned to expert expert_index.
*/
for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) {
++tokens_cnts[index(num_experts, threadIdx.x + 1, topk_ids[i])];
}
__syncthreads();
// For each expert we accumulate the token counts from the different threads.
if (threadIdx.x < num_experts) {
tokens_cnts[index(num_experts, 0, threadIdx.x)] = 0;
for (int i = 1; i <= blockDim.x; ++i) {
tokens_cnts[index(num_experts, i, threadIdx.x)] +=
tokens_cnts[index(num_experts, i - 1, threadIdx.x)];
}
}
__syncthreads();
// We accumulate the token counts of all experts in thread 0.
if (threadIdx.x == 0) {
cumsum[0] = 0;
for (int i = 1; i <= num_experts; ++i) {
cumsum[i] = cumsum[i - 1] +
CEILDIV(tokens_cnts[index(num_experts, blockDim.x, i - 1)],
block_size) *
block_size;
}
*total_tokens_post_pad = static_cast<int32_t>(cumsum[num_experts]);
}
__syncthreads();
/**
* For each expert, each thread processes the tokens of the corresponding
* blocks and stores the corresponding expert_id for each block.
*/
if (threadIdx.x < num_experts) {
for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1];
i += block_size) {
expert_ids[i / block_size] = threadIdx.x;
}
}
/**
* Each thread processes a token shard, calculating the index of each token
* after sorting by expert number. Given the example topk_ids =
* [0,1,2,1,2,3,0,3,4] and block_size = 4, then the output would be [0, 6, *,
* *, 1, 3, *, *, 2, 4, *, *, 5, 7, *, *, 8, *, *, *], where * represents a
* padding value(preset in python).
*/
for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) {
int32_t expert_id = topk_ids[i];
/** The cumsum[expert_id] stores the starting index of the tokens that the
* expert with expert_id needs to process, and
* tokens_cnts[threadIdx.x][expert_id] stores the indices of the tokens
* processed by the expert with expert_id within the current thread's token
* shard.
*/
int32_t rank_post_pad =
tokens_cnts[index(num_experts, threadIdx.x, expert_id)] +
cumsum[expert_id];
sorted_token_ids[rank_post_pad] = i;
++tokens_cnts[index(num_experts, threadIdx.x, expert_id)];
}
}
// TODO(simon): this is temporarily adapted from
// https://github.com/sgl-project/sglang/commit/31548116a8dc8c6df7e146e0587335a59fc5b9d7
// we did this to unblock Deepseek V3 but there should be a better
// implementation to manage shared memory.
template <typename scalar_t>
__global__ void moe_align_block_size_global_mem_kernel(
scalar_t* __restrict__ topk_ids, int32_t* sorted_token_ids,
int32_t* expert_ids, int32_t* total_tokens_post_pad, int32_t num_experts,
int32_t block_size, size_t numel, int32_t* tokens_cnts, int32_t* cumsum) {
const size_t tokens_per_thread = CEILDIV(numel, blockDim.x);
const size_t start_idx = threadIdx.x * tokens_per_thread;
__global__ void moe_align_block_size_kernel(
const scalar_t* __restrict__ topk_ids,
int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids,
int32_t* __restrict__ total_tokens_post_pad, int32_t num_experts,
int32_t padded_num_experts, int32_t experts_per_warp, int32_t block_size,
size_t numel, int32_t* __restrict__ cumsum) {
extern __shared__ int32_t shared_counts[];
for (int i = 0; i < num_experts; ++i) {
tokens_cnts[index(num_experts, threadIdx.x + 1, i)] = 0;
}
/**
* In the first step we compute token_cnts[thread_index + 1][expert_index],
* which counts how many tokens in the token shard of thread_index are
* assigned to expert expert_index.
*/
for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) {
++tokens_cnts[index(num_experts, threadIdx.x + 1, topk_ids[i])];
}
__syncthreads();
// For each expert we accumulate the token counts from the different threads.
if (threadIdx.x < num_experts) {
tokens_cnts[index(num_experts, 0, threadIdx.x)] = 0;
for (int i = 1; i <= blockDim.x; ++i) {
tokens_cnts[index(num_experts, i, threadIdx.x)] +=
tokens_cnts[index(num_experts, i - 1, threadIdx.x)];
}
}
__syncthreads();
// We accumulate the token counts of all experts in thread 0.
if (threadIdx.x == 0) {
cumsum[0] = 0;
for (int i = 1; i <= num_experts; ++i) {
cumsum[i] = cumsum[i - 1] +
CEILDIV(tokens_cnts[index(num_experts, blockDim.x, i - 1)],
block_size) *
block_size;
}
*total_tokens_post_pad = cumsum[num_experts];
}
__syncthreads();
/**
* For each expert, each thread processes the tokens of the corresponding
* blocks and stores the corresponding expert_id for each block.
*/
if (threadIdx.x < num_experts) {
for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1];
i += block_size) {
expert_ids[i / block_size] = threadIdx.x;
}
}
/**
* Each thread processes a token shard, calculating the index of each token
* after sorting by expert number. Given the example topk_ids =
* [0,1,2,1,2,3,0,3,4] and block_size = 4, then the output would be [0, 6, *,
* *, 1, 3, *, *, 2, 4, *, *, 5, 7, *, *, 8, *, *, *], where * represents a
* padding value(preset in python).
*/
for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) {
int32_t expert_id = topk_ids[i];
/** The cumsum[expert_id] stores the starting index of the tokens that the
* expert with expert_id needs to process, and
* tokens_cnts[threadIdx.x][expert_id] stores the indices of the tokens
* processed by the expert with expert_id within the current thread's token
* shard.
*/
int32_t rank_post_pad =
tokens_cnts[index(num_experts, threadIdx.x, expert_id)] +
cumsum[expert_id];
sorted_token_ids[rank_post_pad] = i;
++tokens_cnts[index(num_experts, threadIdx.x, expert_id)];
}
}
// taken from
// https://github.com/sgl-project/sglang/commit/cdae77b03dfc6fec3863630550b45bbfc789f957
template <typename scalar_t>
__global__ void sgl_moe_align_block_size_kernel(
scalar_t* __restrict__ topk_ids, int32_t* sorted_token_ids,
int32_t* expert_ids, int32_t* total_tokens_post_pad, int32_t num_experts,
int32_t block_size, size_t numel, int32_t* cumsum) {
__shared__ int32_t shared_counts[32][8];
const int warp_id = threadIdx.x / 32;
const int experts_per_warp = 8;
const int warp_id = threadIdx.x / WARP_SIZE;
const int my_expert_start = warp_id * experts_per_warp;
// Initialize shared_counts for this warp's experts
for (int i = 0; i < experts_per_warp; ++i) {
if (my_expert_start + i < num_experts) {
shared_counts[warp_id][i] = 0;
if (my_expert_start + i < padded_num_experts) {
shared_counts[warp_id * experts_per_warp + i] = 0;
}
}
__syncthreads();
const size_t tokens_per_thread = CEILDIV(numel, blockDim.x);
const size_t start_idx = threadIdx.x * tokens_per_thread;
const size_t tid = threadIdx.x;
const size_t stride = blockDim.x;
for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) {
for (size_t i = tid; i < numel; i += stride) {
int expert_id = topk_ids[i];
int warp_idx = expert_id / experts_per_warp;
int expert_offset = expert_id % experts_per_warp;
atomicAdd(&shared_counts[warp_idx][expert_offset], 1);
atomicAdd(&shared_counts[warp_idx * experts_per_warp + expert_offset], 1);
}
__syncthreads();
// Single thread computes cumulative sum and total tokens
if (threadIdx.x == 0) {
cumsum[0] = 0;
for (int i = 1; i <= num_experts; ++i) {
int expert_count = 0;
int warp_idx = (i - 1) / experts_per_warp;
int expert_offset = (i - 1) % experts_per_warp;
expert_count = shared_counts[warp_idx][expert_offset];
expert_count = shared_counts[warp_idx * experts_per_warp + expert_offset];
cumsum[i] =
cumsum[i - 1] + CEILDIV(expert_count, block_size) * block_size;
@ -248,7 +61,6 @@ __global__ void sgl_moe_align_block_size_kernel(
__syncthreads();
// Assign expert IDs to blocks
if (threadIdx.x < num_experts) {
for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1];
i += block_size) {
@ -257,13 +69,11 @@ __global__ void sgl_moe_align_block_size_kernel(
}
}
// taken from
// https://github.com/sgl-project/sglang/commit/cdae77b03dfc6fec3863630550b45bbfc789f957
template <typename scalar_t>
__global__ void sgl_moe_token_sort_kernel(scalar_t* __restrict__ topk_ids,
int32_t* sorted_token_ids,
int32_t* cumsum_buffer,
size_t numel) {
__global__ void count_and_sort_expert_tokens_kernel(
const scalar_t* __restrict__ topk_ids,
int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ cumsum_buffer,
size_t numel) {
const size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
const size_t stride = blockDim.x * gridDim.x;
@ -290,132 +100,138 @@ __global__ void moe_sum_kernel(
}
}
template <typename scalar_t>
__global__ void moe_align_block_size_small_batch_expert_kernel(
const scalar_t* __restrict__ topk_ids,
int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids,
int32_t* __restrict__ total_tokens_post_pad, int32_t num_experts,
int32_t block_size, size_t numel) {
const size_t tid = threadIdx.x;
const size_t stride = blockDim.x;
extern __shared__ int32_t shared_mem[];
int32_t* cumsum = shared_mem;
int32_t* tokens_cnts = (int32_t*)(shared_mem + num_experts + 1);
for (int i = 0; i < num_experts; ++i) {
tokens_cnts[(threadIdx.x + 1) * num_experts + i] = 0;
}
for (size_t i = tid; i < numel; i += stride) {
++tokens_cnts[(threadIdx.x + 1) * num_experts + topk_ids[i]];
}
__syncthreads();
if (threadIdx.x < num_experts) {
tokens_cnts[threadIdx.x] = 0;
for (int i = 1; i <= blockDim.x; ++i) {
tokens_cnts[i * num_experts + threadIdx.x] +=
tokens_cnts[(i - 1) * num_experts + threadIdx.x];
}
}
__syncthreads();
if (threadIdx.x == 0) {
cumsum[0] = 0;
for (int i = 1; i <= num_experts; ++i) {
cumsum[i] =
cumsum[i - 1] +
CEILDIV(tokens_cnts[blockDim.x * num_experts + i - 1], block_size) *
block_size;
}
*total_tokens_post_pad = static_cast<int32_t>(cumsum[num_experts]);
}
__syncthreads();
if (threadIdx.x < num_experts) {
for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1];
i += block_size) {
expert_ids[i / block_size] = threadIdx.x;
}
}
for (size_t i = tid; i < numel; i += stride) {
int32_t expert_id = topk_ids[i];
int32_t rank_post_pad =
tokens_cnts[threadIdx.x * num_experts + expert_id] + cumsum[expert_id];
sorted_token_ids[rank_post_pad] = i;
++tokens_cnts[threadIdx.x * num_experts + expert_id];
}
}
} // namespace moe
} // namespace vllm
// taken from
// https://github.com/sgl-project/sglang/blob/8b5f83ed3b7d2a49ad5c5cd5aa61c5d502f47dbc
void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
int64_t block_size, torch::Tensor sorted_token_ids,
torch::Tensor experts_ids,
torch::Tensor num_tokens_post_pad) {
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
int device_max_shared_mem;
auto dev = topk_ids.get_device();
cudaDeviceGetAttribute(&device_max_shared_mem,
cudaDevAttrMaxSharedMemoryPerBlockOptin, dev);
const int32_t num_thread = max((int32_t)num_experts, WARP_SIZE);
const int32_t shared_mem_i32 =
((num_thread + 1) * num_experts + (num_experts + 1)) * sizeof(int32_t);
const int32_t shared_mem_i16 =
((num_thread + 1) * num_experts) * sizeof(uint16_t) +
(num_experts + 1) * sizeof(int32_t);
bool use_global_memory = false;
bool use_i16 = false; // Use uint16_t for shared memory token counts
if (shared_mem_i32 < device_max_shared_mem) {
// Do nothing in this case. We're all set to use int32_t token counts
} else if (shared_mem_i16 < device_max_shared_mem &&
topk_ids.numel() <= 65535) {
// when nelements of topk_ids is smaller than 65535 (max value of uint16),
// element value of token_cnts would also smaller than 65535,
// so we can use uint16 as dtype of token_cnts
use_i16 = true;
} else {
use_global_memory = true;
}
if (use_global_memory) {
VLLM_DISPATCH_INTEGRAL_AND_UNSIGNED_TYPES(
topk_ids.scalar_type(), "moe_align_block_size_global_mem_kernel", [&] {
// calc needed amount of shared mem for `tokens_cnts` and `cumsum`
// tensors
const int32_t num_thread = max((int32_t)num_experts, WARP_SIZE);
auto options_int = torch::TensorOptions()
.dtype(torch::kInt)
.device(topk_ids.device());
torch::Tensor token_cnts_buffer =
torch::empty({(num_experts + 1) * num_experts}, options_int);
torch::Tensor cumsum_buffer =
torch::empty({num_experts + 1}, options_int);
auto kernel =
vllm::moe::moe_align_block_size_global_mem_kernel<scalar_t>;
kernel<<<1, num_thread, 0, stream>>>(
topk_ids.data_ptr<scalar_t>(),
sorted_token_ids.data_ptr<int32_t>(),
experts_ids.data_ptr<int32_t>(),
num_tokens_post_pad.data_ptr<int32_t>(), num_experts, block_size,
topk_ids.numel(), token_cnts_buffer.data_ptr<int32_t>(),
cumsum_buffer.data_ptr<int32_t>());
});
} else if (use_i16) {
VLLM_DISPATCH_INTEGRAL_AND_UNSIGNED_TYPES(
topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] {
// set dynamic shared mem
auto kernel =
vllm::moe::moe_align_block_size_kernel<scalar_t, uint16_t>;
AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(
(void*)kernel, shared_mem_i16));
kernel<<<1, num_thread, shared_mem_i16, stream>>>(
topk_ids.data_ptr<scalar_t>(),
sorted_token_ids.data_ptr<int32_t>(),
experts_ids.data_ptr<int32_t>(),
num_tokens_post_pad.data_ptr<int32_t>(), num_experts, block_size,
topk_ids.numel());
});
} else {
VLLM_DISPATCH_INTEGRAL_AND_UNSIGNED_TYPES(
topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] {
auto kernel =
vllm::moe::moe_align_block_size_kernel<scalar_t, int32_t>;
AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(
(void*)kernel, shared_mem_i32));
kernel<<<1, num_thread, shared_mem_i32, stream>>>(
topk_ids.data_ptr<scalar_t>(),
sorted_token_ids.data_ptr<int32_t>(),
experts_ids.data_ptr<int32_t>(),
num_tokens_post_pad.data_ptr<int32_t>(), num_experts, block_size,
topk_ids.numel());
});
}
}
void sgl_moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
int64_t block_size,
torch::Tensor sorted_token_ids,
torch::Tensor experts_ids,
torch::Tensor num_tokens_post_pad) {
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
TORCH_CHECK(num_experts == 256,
"sgl_moe_align_block_size kernel only supports deepseek v3.");
int64_t padded_num_experts =
((num_experts + WARP_SIZE - 1) / WARP_SIZE) * WARP_SIZE;
int experts_per_warp = WARP_SIZE;
int threads = 1024;
threads = ((threads + WARP_SIZE - 1) / WARP_SIZE) * WARP_SIZE;
VLLM_DISPATCH_INTEGRAL_AND_UNSIGNED_TYPES(
topk_ids.scalar_type(), "sgl_moe_align_block_size_kernel", [&] {
topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] {
// calc needed amount of shared mem for `cumsum` tensors
auto options_int =
torch::TensorOptions().dtype(torch::kInt).device(topk_ids.device());
torch::Tensor cumsum_buffer =
torch::zeros({num_experts + 1}, options_int);
bool small_batch_expert_mode =
(topk_ids.numel() < 1024) && (num_experts <= 64);
auto align_kernel =
vllm::moe::sgl_moe_align_block_size_kernel<scalar_t>;
align_kernel<<<1, 1024, 0, stream>>>(
topk_ids.data_ptr<scalar_t>(), sorted_token_ids.data_ptr<int32_t>(),
experts_ids.data_ptr<int32_t>(),
num_tokens_post_pad.data_ptr<int32_t>(), num_experts, block_size,
topk_ids.numel(), cumsum_buffer.data_ptr<int32_t>());
if (small_batch_expert_mode) {
const int32_t threads = max((int32_t)num_experts, WARP_SIZE);
const int32_t shared_mem_size =
((threads + 1) * num_experts + (num_experts + 1)) *
sizeof(int32_t);
const int block_threads = 256;
const int num_blocks =
(topk_ids.numel() + block_threads - 1) / block_threads;
const int max_blocks = 65535;
const int actual_blocks = std::min(num_blocks, max_blocks);
auto sort_kernel = vllm::moe::sgl_moe_token_sort_kernel<scalar_t>;
sort_kernel<<<actual_blocks, block_threads, 0, stream>>>(
topk_ids.data_ptr<scalar_t>(), sorted_token_ids.data_ptr<int32_t>(),
cumsum_buffer.data_ptr<int32_t>(), topk_ids.numel());
auto small_batch_expert_kernel =
vllm::moe::moe_align_block_size_small_batch_expert_kernel<
scalar_t>;
small_batch_expert_kernel<<<1, threads, shared_mem_size, stream>>>(
topk_ids.data_ptr<scalar_t>(),
sorted_token_ids.data_ptr<int32_t>(),
experts_ids.data_ptr<int32_t>(),
num_tokens_post_pad.data_ptr<int32_t>(), num_experts, block_size,
topk_ids.numel());
} else {
auto align_kernel = vllm::moe::moe_align_block_size_kernel<scalar_t>;
size_t num_warps = CEILDIV(padded_num_experts, experts_per_warp);
size_t shared_mem_size =
num_warps * experts_per_warp * sizeof(int32_t);
align_kernel<<<1, threads, shared_mem_size, stream>>>(
topk_ids.data_ptr<scalar_t>(),
sorted_token_ids.data_ptr<int32_t>(),
experts_ids.data_ptr<int32_t>(),
num_tokens_post_pad.data_ptr<int32_t>(), num_experts,
padded_num_experts, experts_per_warp, block_size,
topk_ids.numel(), cumsum_buffer.data_ptr<int32_t>());
const int block_threads = std::min(256, (int)threads);
const int num_blocks =
(topk_ids.numel() + block_threads - 1) / block_threads;
const int max_blocks = 65535;
const int actual_blocks = std::min(num_blocks, max_blocks);
auto sort_kernel =
vllm::moe::count_and_sort_expert_tokens_kernel<scalar_t>;
sort_kernel<<<actual_blocks, block_threads, 0, stream>>>(
topk_ids.data_ptr<scalar_t>(),
sorted_token_ids.data_ptr<int32_t>(),
cumsum_buffer.data_ptr<int32_t>(), topk_ids.numel());
}
});
}

View File

@ -12,12 +12,6 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
int64_t block_size, torch::Tensor sorted_token_ids,
torch::Tensor experts_ids,
torch::Tensor num_tokens_post_pad);
void sgl_moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
int64_t block_size,
torch::Tensor sorted_token_ids,
torch::Tensor experts_ids,
torch::Tensor num_tokens_post_pad);
#ifndef USE_ROCM
torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output,
torch::Tensor b_qweight, torch::Tensor b_scales,
@ -30,4 +24,8 @@ torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output,
int64_t BLOCK_SIZE_K, int64_t bit);
#endif
bool moe_permute_unpermute_supported();
bool moe_permute_unpermute_supported();
void shuffle_rows(const torch::Tensor& input_tensor,
const torch::Tensor& dst2src_map,
torch::Tensor& output_tensor);

View File

@ -12,7 +12,7 @@ void moe_permute(
const torch::Tensor& input, // [n_token, hidden]
const torch::Tensor& topk_weights, //[n_token, topk]
torch::Tensor& topk_ids, // [n_token, topk]
const torch::Tensor& token_expert_indicies, // [n_token, topk]
const torch::Tensor& token_expert_indices, // [n_token, topk]
const std::optional<torch::Tensor>& expert_map, // [n_expert]
int64_t n_expert, int64_t n_local_expert, int64_t topk,
const std::optional<int64_t>& align_block_size,
@ -27,15 +27,15 @@ void moe_permute(
"expert_first_token_offset must be int64");
TORCH_CHECK(topk_ids.scalar_type() == at::ScalarType::Int,
"topk_ids must be int32");
TORCH_CHECK(token_expert_indicies.scalar_type() == at::ScalarType::Int,
"token_expert_indicies must be int32");
TORCH_CHECK(token_expert_indices.scalar_type() == at::ScalarType::Int,
"token_expert_indices must be int32");
TORCH_CHECK(src_row_id2dst_row_id_map.scalar_type() == at::ScalarType::Int,
"src_row_id2dst_row_id_map must be int32");
TORCH_CHECK(expert_first_token_offset.size(0) == n_local_expert + 1,
"expert_first_token_offset shape != n_local_expert+1")
TORCH_CHECK(
src_row_id2dst_row_id_map.sizes() == token_expert_indicies.sizes(),
"token_expert_indicies shape must be same as src_row_id2dst_row_id_map");
src_row_id2dst_row_id_map.sizes() == token_expert_indices.sizes(),
"token_expert_indices shape must be same as src_row_id2dst_row_id_map");
auto n_token = input.sizes()[0];
auto n_hidden = input.sizes()[1];
auto align_block_size_value =
@ -71,7 +71,7 @@ void moe_permute(
expert_map_ptr, n_expert, stream);
}
// expert sort topk expert id and scan expert id get expert_first_token_offset
sortAndScanExpert(get_ptr<int>(topk_ids), get_ptr<int>(token_expert_indicies),
sortAndScanExpert(get_ptr<int>(topk_ids), get_ptr<int>(token_expert_indices),
get_ptr<int>(permuted_experts_id),
get_ptr<int>(dst_row_id2src_row_id_map),
get_ptr<int64_t>(expert_first_token_offset), n_token,
@ -130,11 +130,67 @@ void moe_unpermute(
});
}
template <typename T>
__global__ void shuffleInputRowsKernel(const T* input,
const int32_t* dst2src_map, T* output,
int64_t num_src_rows,
int64_t num_dst_rows, int64_t num_cols) {
int64_t dest_row_idx = blockIdx.x;
int64_t const source_row_idx = dst2src_map[dest_row_idx];
if (blockIdx.x < num_dst_rows) {
// Load 128-bits per thread
constexpr int64_t ELEM_PER_THREAD = 128 / sizeof(T) / 8;
using DataElem = cutlass::Array<T, ELEM_PER_THREAD>;
// Duplicate and permute rows
auto const* source_row_ptr =
reinterpret_cast<DataElem const*>(input + source_row_idx * num_cols);
auto* dest_row_ptr =
reinterpret_cast<DataElem*>(output + dest_row_idx * num_cols);
int64_t const start_offset = threadIdx.x;
int64_t const stride = blockDim.x;
int64_t const num_elems_in_col = num_cols / ELEM_PER_THREAD;
for (int elem_index = start_offset; elem_index < num_elems_in_col;
elem_index += stride) {
dest_row_ptr[elem_index] = source_row_ptr[elem_index];
}
}
}
void shuffle_rows(const torch::Tensor& input_tensor,
const torch::Tensor& dst2src_map,
torch::Tensor& output_tensor) {
TORCH_CHECK(input_tensor.scalar_type() == output_tensor.scalar_type(),
"Input and output tensors must have the same data type");
auto stream = at::cuda::getCurrentCUDAStream().stream();
int64_t const blocks = output_tensor.size(0);
int64_t const threads = 256;
int64_t const num_dest_rows = output_tensor.size(0);
int64_t const num_src_rows = input_tensor.size(0);
int64_t const num_cols = input_tensor.size(1);
TORCH_CHECK(!(num_cols % (128 / sizeof(input_tensor.scalar_type()) / 8)),
"num_cols must be divisible by 128 / "
"sizeof(input_tensor.scalar_type()) / 8");
MOE_DISPATCH(input_tensor.scalar_type(), [&] {
shuffleInputRowsKernel<scalar_t><<<blocks, threads, 0, stream>>>(
reinterpret_cast<scalar_t*>(input_tensor.data_ptr()),
dst2src_map.data_ptr<int32_t>(),
reinterpret_cast<scalar_t*>(output_tensor.data_ptr()), num_src_rows,
num_dest_rows, num_cols);
});
}
#else
void moe_permute(const torch::Tensor& input, const torch::Tensor& topk_weights,
torch::Tensor& topk_ids,
const torch::Tensor& token_expert_indicies,
const torch::Tensor& token_expert_indices,
const std::optional<torch::Tensor>& expert_map,
int64_t n_expert, int64_t n_local_expert, int64_t topk,
const std::optional<int64_t>& align_block_size,
@ -147,7 +203,7 @@ void moe_permute(const torch::Tensor& input, const torch::Tensor& topk_weights,
void moe_unpermute(const torch::Tensor& input,
const torch::Tensor& topk_weights, torch::Tensor& topk_ids,
const torch::Tensor& token_expert_indicies,
const torch::Tensor& token_expert_indices,
const std::optional<torch::Tensor>& expert_map,
int64_t n_expert, int64_t n_local_expert, int64_t topk,
const std::optional<int64_t>& align_block_size,

View File

@ -14,12 +14,13 @@
__VA_ARGS__(); \
break; \
}
#define MOE_DISPATCH_FLOAT_CASE(...) \
MOE_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
MOE_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \
MOE_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \
MOE_DISPATCH_CASE(at::ScalarType::Float8_e5m2, __VA_ARGS__) \
MOE_DISPATCH_CASE(at::ScalarType::Float8_e4m3fn, __VA_ARGS__)
#define MOE_DISPATCH_FLOAT_CASE(...) \
MOE_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
MOE_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \
MOE_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \
MOE_DISPATCH_CASE(at::ScalarType::Float8_e5m2, __VA_ARGS__) \
MOE_DISPATCH_CASE(at::ScalarType::Float8_e4m3fn, __VA_ARGS__) \
MOE_DISPATCH_CASE(at::ScalarType::Byte, __VA_ARGS__)
#define MOE_DISPATCH(TYPE, ...) \
MOE_SWITCH(TYPE, MOE_DISPATCH_FLOAT_CASE(__VA_ARGS__))
@ -39,6 +40,11 @@ template <>
struct ScalarType2CudaType<at::ScalarType::BFloat16> {
using type = __nv_bfloat16;
};
// uint8 for packed fp4
template <>
struct ScalarType2CudaType<at::ScalarType::Byte> {
using type = uint8_t;
};
// #if __CUDA_ARCH__ >= 890
// fp8

View File

@ -20,7 +20,6 @@ __global__ void expandInputRowsKernel(
int expert_id = sorted_experts[expanded_dest_row];
extern __shared__ int64_t smem_expert_first_token_offset[];
int64_t align_expanded_row_accumulate = 0;
if constexpr (ALIGN_BLOCK_SIZE) {
// load g2s
for (int idx = threadIdx.x; idx < num_local_experts + 1;
@ -63,7 +62,6 @@ __global__ void expandInputRowsKernel(
using DataElem = cutlass::Array<T, ELEM_PER_THREAD>;
// Duplicate and permute rows
int64_t const source_k_rank = expanded_source_row / num_rows;
int64_t const source_row = expanded_source_row % num_rows;
auto const* source_row_ptr =
@ -160,7 +158,6 @@ __global__ void finalizeMoeRoutingKernel(
elem_index += stride) {
ComputeElem thread_output;
thread_output.fill(0);
float row_rescale{0.f};
for (int k_idx = 0; k_idx < k; ++k_idx) {
int64_t const expanded_original_row = original_row + k_idx * num_rows;
int64_t const expanded_permuted_row =
@ -177,8 +174,6 @@ __global__ void finalizeMoeRoutingKernel(
auto const* expanded_permuted_rows_row_ptr =
expanded_permuted_rows_v + expanded_permuted_row * num_elems_in_col;
int64_t const expert_idx = expert_for_source_row[k_offset];
ComputeElem expert_result = arrayConvert<InputElem, ComputeElem>(
expanded_permuted_rows_row_ptr[elem_index]);
thread_output = thread_output + row_scale * (expert_result);

View File

@ -425,7 +425,7 @@ void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, f
#define LAUNCH_SOFTMAX(NUM_EXPERTS, WARPS_PER_TB) \
topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB>( \
gating_output, nullptr, topk_weights, topk_indicies, \
gating_output, nullptr, topk_weights, topk_indices, \
token_expert_indices, num_tokens, topk, 0, num_experts, \
stream);
@ -433,7 +433,7 @@ template <typename IndType>
void topkGatingSoftmaxKernelLauncher(
const float* gating_output,
float* topk_weights,
IndType* topk_indicies,
IndType* topk_indices,
int* token_expert_indices,
float* softmax_workspace,
const int num_tokens,
@ -476,7 +476,7 @@ void topkGatingSoftmaxKernelLauncher(
moeSoftmax<TPB><<<num_tokens, TPB, 0, stream>>>(
gating_output, nullptr, softmax_workspace, num_experts);
moeTopK<TPB><<<num_tokens, TPB, 0, stream>>>(
softmax_workspace, nullptr, topk_weights, topk_indicies, token_expert_indices,
softmax_workspace, nullptr, topk_weights, topk_indices, token_expert_indices,
num_experts, topk, 0, num_experts);
}
}
@ -516,9 +516,8 @@ void topk_softmax(
topk,
stream);
}
else
else if (topk_indices.scalar_type() == at::ScalarType::UInt32)
{
assert(topk_indices.scalar_type() == at::ScalarType::UInt32);
vllm::moe::topkGatingSoftmaxKernelLauncher(
gating_output.data_ptr<float>(),
topk_weights.data_ptr<float>(),
@ -530,4 +529,17 @@ void topk_softmax(
topk,
stream);
}
else {
assert(topk_indices.scalar_type() == at::ScalarType::Int64);
vllm::moe::topkGatingSoftmaxKernelLauncher(
gating_output.data_ptr<float>(),
topk_weights.data_ptr<float>(),
topk_indices.data_ptr<int64_t>(),
token_expert_indices.data_ptr<int>(),
softmax_workspace.data_ptr<float>(),
num_tokens,
num_experts,
topk,
stream);
}
}

View File

@ -22,15 +22,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
" Tensor! num_tokens_post_pad) -> ()");
m.impl("moe_align_block_size", torch::kCUDA, &moe_align_block_size);
// temporarily adapted from
// https://github.com/sgl-project/sglang/commit/ded9fcd09a43d5e7d5bb31a2bc3e9fc21bf65d2a
m.def(
"sgl_moe_align_block_size(Tensor topk_ids, int num_experts,"
" int block_size, Tensor! sorted_token_ids,"
" Tensor! experts_ids,"
" Tensor! num_tokens_post_pad) -> ()");
m.impl("sgl_moe_align_block_size", torch::kCUDA, &sgl_moe_align_block_size);
#ifndef USE_ROCM
m.def(
"moe_wna16_gemm(Tensor input, Tensor! output, Tensor b_qweight, "
@ -66,7 +57,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
m.def(
"moe_permute(Tensor input, Tensor topk_weight, Tensor! topk_ids,"
"Tensor token_expert_indicies, Tensor? expert_map, int n_expert,"
"Tensor token_expert_indices, Tensor? expert_map, int n_expert,"
"int n_local_expert,"
"int topk, int? align_block_size,Tensor! permuted_input, Tensor! "
"expert_first_token_offset, Tensor! src_row_id2dst_row_id_map, Tensor! "
@ -81,6 +72,12 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
m.def("moe_permute_unpermute_supported() -> bool");
m.impl("moe_permute_unpermute_supported", &moe_permute_unpermute_supported);
// Row shuffle for MoE
m.def(
"shuffle_rows(Tensor input_tensor, Tensor dst2src_map, Tensor! "
"output_tensor) -> ()");
m.impl("shuffle_rows", torch::kCUDA, &shuffle_rows);
#endif
}

View File

@ -92,6 +92,11 @@ void rms_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight,
void fused_add_rms_norm(torch::Tensor& input, torch::Tensor& residual,
torch::Tensor& weight, double epsilon);
void apply_repetition_penalties_(torch::Tensor& logits,
const torch::Tensor& prompt_mask,
const torch::Tensor& output_mask,
const torch::Tensor& repetition_penalties);
void rms_norm_static_fp8_quant(torch::Tensor& out, torch::Tensor& input,
torch::Tensor& weight, torch::Tensor& scale,
double epsilon);
@ -231,7 +236,8 @@ void cutlass_moe_mm(
torch::Tensor const& b_tensors, torch::Tensor const& a_scales,
torch::Tensor const& b_scales, torch::Tensor const& expert_offsets,
torch::Tensor const& problem_sizes, torch::Tensor const& a_strides,
torch::Tensor const& b_strides, torch::Tensor const& c_strides);
torch::Tensor const& b_strides, torch::Tensor const& c_strides,
bool per_act_token, bool per_out_ch);
void cutlass_fp4_group_mm(
torch::Tensor& output, const torch::Tensor& a, const torch::Tensor& b,
@ -243,7 +249,16 @@ void get_cutlass_moe_mm_data(
const torch::Tensor& topk_ids, torch::Tensor& expert_offsets,
torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2,
torch::Tensor& input_permutation, torch::Tensor& output_permutation,
const int64_t num_experts, const int64_t n, const int64_t k);
const int64_t num_experts, const int64_t n, const int64_t k,
const std::optional<torch::Tensor>& blockscale_offsets);
void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets,
torch::Tensor& problem_sizes1,
torch::Tensor& problem_sizes2,
const torch::Tensor& expert_num_tokens,
const int64_t num_local_experts,
const int64_t padded_m, const int64_t n,
const int64_t k);
void cutlass_scaled_mm_azp(torch::Tensor& out, torch::Tensor const& a,
torch::Tensor const& b,

View File

@ -274,7 +274,6 @@ void advance_step_flashinfer(
cudaDeviceGetAttribute(&blocks, cudaDevAttrMultiProcessorCount, dev);
cudaDeviceGetAttribute(&threads, cudaDevAttrMaxThreadsPerBlock, dev);
[[maybe_unused]] int block_tables_stride = block_tables.stride(0);
TORCH_CHECK((blocks * threads > num_queries),
"multi-step: not enough threads to map to num_queries = ",
num_queries, " block_tables.stride(0) = ", block_tables.stride(0),

View File

@ -1,15 +1,17 @@
#include <ATen/cuda/CUDAContext.h>
#include <torch/all.h>
#include <cmath>
#include "../../dispatch_utils.h"
#include "../vectorization_utils.cuh"
#ifndef USE_ROCM
#include <cub/util_type.cuh>
#include <cub/cub.cuh>
#include <cub/util_type.cuh>
#else
#include <hipcub/util_type.hpp>
#include <hipcub/hipcub.hpp>
#include <hipcub/util_type.hpp>
#endif
static inline __device__ int8_t float_to_int8_rn(float x) {
@ -103,134 +105,170 @@ static inline __device__ int8_t int32_to_int8(int32_t x) {
namespace vllm {
template <typename scalar_t, typename scale_type>
template <typename scalar_t, typename scale_t>
__global__ void static_scaled_int8_quant_kernel(
scalar_t const* __restrict__ input, int8_t* __restrict__ out,
scale_type const* scale_ptr, const int hidden_size) {
int const tid = threadIdx.x;
int64_t const token_idx = blockIdx.x;
scale_type const scale = *scale_ptr;
const scalar_t* __restrict__ input, int8_t* __restrict__ output,
const scale_t* scale_ptr, const int hidden_size) {
const int tid = threadIdx.x;
const int stride = blockDim.x;
const int64_t token_idx = blockIdx.x;
const float scale = *scale_ptr;
// Must be performed using 64-bit math to avoid integer overflow.
out += token_idx * hidden_size;
input += token_idx * hidden_size;
const scalar_t* row_in = input + token_idx * hidden_size;
int8_t* row_out = output + token_idx * hidden_size;
for (int i = tid; i < hidden_size; i += blockDim.x) {
out[i] = float_to_int8_rn(static_cast<float>(input[i]) / scale);
}
vectorize_with_alignment<16>(
row_in, row_out, hidden_size, tid, stride,
[=] __device__(int8_t& dst, const scalar_t& src) {
dst = float_to_int8_rn(static_cast<float>(src) / scale);
});
}
template <typename scalar_t, typename scale_type, typename azp_type>
template <typename scalar_t, typename scale_t, typename azp_t>
__global__ void static_scaled_int8_azp_quant_kernel(
scalar_t const* __restrict__ input, int8_t* __restrict__ out,
scale_type const* scale_ptr, azp_type const* azp_ptr,
const int hidden_size) {
int const tid = threadIdx.x;
int64_t const token_idx = blockIdx.x;
scale_type const scale = *scale_ptr;
azp_type const azp = *azp_ptr;
const scalar_t* __restrict__ input, int8_t* __restrict__ output,
const scale_t* scale_ptr, const azp_t* azp_ptr, const int hidden_size) {
const int tid = threadIdx.x;
const int stride = blockDim.x;
const int64_t token_idx = blockIdx.x;
const float scale = *scale_ptr;
const azp_t azp = *azp_ptr;
const float inv_s = 1.0f / scale;
// Must be performed using 64-bit math to avoid integer overflow.
out += token_idx * hidden_size;
input += token_idx * hidden_size;
const scalar_t* row_in = input + token_idx * hidden_size;
int8_t* row_out = output + token_idx * hidden_size;
for (int i = tid; i < hidden_size; i += blockDim.x) {
auto const val = static_cast<float>(input[i]);
auto const quant_val = int32_to_int8(float_to_int32_rn(val / scale) + azp);
out[i] = quant_val;
}
vectorize_with_alignment<16>(
row_in, row_out, hidden_size, tid, stride,
[=] __device__(int8_t& dst, const scalar_t& src) {
const auto v = static_cast<float>(src) * inv_s;
dst = int32_to_int8(float_to_int32_rn(v) + azp);
});
}
template <typename scalar_t, typename scale_type>
template <typename scalar_t, typename scale_t>
__global__ void dynamic_scaled_int8_quant_kernel(
scalar_t const* __restrict__ input, int8_t* __restrict__ out,
scale_type* scale, const int hidden_size) {
int const tid = threadIdx.x;
int64_t const token_idx = blockIdx.x;
float absmax_val = 0.0f;
float const zero = 0.0f;
const scalar_t* __restrict__ input, int8_t* __restrict__ output,
scale_t* scale_out, const int hidden_size) {
const int tid = threadIdx.x;
const int stride = blockDim.x;
const int64_t token_idx = blockIdx.x;
// Must be performed using 64-bit math to avoid integer overflow.
out += token_idx * hidden_size;
input += token_idx * hidden_size;
const scalar_t* row_in = input + token_idx * hidden_size;
int8_t* row_out = output + token_idx * hidden_size;
for (int i = tid; i < hidden_size; i += blockDim.x) {
float val = static_cast<float>(input[i]);
val = val > zero ? val : -val;
absmax_val = val > absmax_val ? val : absmax_val;
// calculate for absmax
float thread_max = 0.f;
for (int i = tid; i < hidden_size; i += stride) {
const auto v = fabsf(static_cast<float>(row_in[i]));
thread_max = fmaxf(thread_max, v);
}
using BlockReduce = cub::BlockReduce<float, 1024>;
__shared__ typename BlockReduce::TempStorage reduceStorage;
float const block_absmax_val_maybe =
BlockReduce(reduceStorage).Reduce(absmax_val, cub::Max{}, blockDim.x);
__shared__ float block_absmax_val;
using BlockReduce = cub::BlockReduce<float, 256>;
__shared__ typename BlockReduce::TempStorage tmp;
float block_max = BlockReduce(tmp).Reduce(thread_max, cub::Max{}, blockDim.x);
__shared__ float absmax;
if (tid == 0) {
block_absmax_val = block_absmax_val_maybe;
scale[token_idx] = block_absmax_val / 127.0f;
absmax = block_max;
scale_out[blockIdx.x] = absmax / 127.f;
}
__syncthreads();
float const tmp_scale = 127.0f / block_absmax_val;
for (int i = tid; i < hidden_size; i += blockDim.x) {
out[i] = float_to_int8_rn(static_cast<float>(input[i]) * tmp_scale);
}
float inv_s = (absmax == 0.f) ? 0.f : 127.f / absmax;
// 2. quantize
vectorize_with_alignment<16>(
row_in, row_out, hidden_size, tid, stride,
[=] __device__(int8_t& dst, const scalar_t& src) {
dst = float_to_int8_rn(static_cast<float>(src) * inv_s);
});
}
template <typename scalar_t, typename scale_type, typename azp_type>
// MinMax structure to hold min and max values in one go
struct MinMax {
float min, max;
__host__ __device__ MinMax()
: min(std::numeric_limits<float>::max()),
max(std::numeric_limits<float>::lowest()) {}
__host__ __device__ explicit MinMax(float v) : min(v), max(v) {}
// add a value to the MinMax
__host__ __device__ MinMax& operator+=(float v) {
min = fminf(min, v);
max = fmaxf(max, v);
return *this;
}
// merge two MinMax objects
__host__ __device__ MinMax& operator&=(const MinMax& other) {
min = fminf(min, other.min);
max = fmaxf(max, other.max);
return *this;
}
};
__host__ __device__ inline MinMax operator+(MinMax a, float v) {
return a += v;
}
__host__ __device__ inline MinMax operator&(MinMax a, const MinMax& b) {
return a &= b;
}
template <typename scalar_t, typename scale_t, typename azp_t>
__global__ void dynamic_scaled_int8_azp_quant_kernel(
scalar_t const* __restrict__ input, int8_t* __restrict__ out,
scale_type* scale, azp_type* azp, const int hidden_size) {
int64_t const token_idx = blockIdx.x;
const scalar_t* __restrict__ input, int8_t* __restrict__ output,
scale_t* scale_out, azp_t* azp_out, const int hidden_size) {
const int tid = threadIdx.x;
const int stride = blockDim.x;
const int64_t token_idx = blockIdx.x;
// Must be performed using 64-bit math to avoid integer overflow.
out += token_idx * hidden_size;
input += token_idx * hidden_size;
const scalar_t* row_in = input + token_idx * hidden_size;
int8_t* row_out = output + token_idx * hidden_size;
// Scan for the min and max value for this token
float max_val = std::numeric_limits<float>::min();
float min_val = std::numeric_limits<float>::max();
for (int i = threadIdx.x; i < hidden_size; i += blockDim.x) {
auto val = static_cast<float>(input[i]);
max_val = std::max(max_val, val);
min_val = std::min(min_val, val);
// 1. calculate min & max
MinMax thread_mm;
for (int i = tid; i < hidden_size; i += stride) {
thread_mm += static_cast<float>(row_in[i]);
}
// Reduce the max and min values across the block
using BlockReduce = cub::BlockReduce<float, 1024>;
__shared__ typename BlockReduce::TempStorage reduceStorage;
max_val = BlockReduce(reduceStorage).Reduce(max_val, cub::Max{}, blockDim.x);
__syncthreads(); // Make sure min doesn't mess with max shared memory
min_val = BlockReduce(reduceStorage).Reduce(min_val, cub::Min{}, blockDim.x);
using BlockReduce = cub::BlockReduce<MinMax, 256>;
__shared__ typename BlockReduce::TempStorage tmp;
__shared__ scale_type scale_sh;
__shared__ azp_type azp_sh;
MinMax mm = BlockReduce(tmp).Reduce(
thread_mm,
[] __device__(MinMax a, const MinMax& b) {
a &= b;
return a;
},
blockDim.x);
// Compute the scale and zero point and store them, only on the first thread
if (threadIdx.x == 0) {
float const scale_val = (max_val - min_val) / 255.0f;
// Use rounding to even (same as torch.round)
auto const azp_float = std::nearbyint(-128.0f - min_val / scale_val);
auto const azp_val = static_cast<azp_type>(azp_float);
// Store the scale and azp into shared and global
scale[token_idx] = scale_sh = scale_val;
azp[token_idx] = azp_sh = azp_val;
__shared__ float scale_sh;
__shared__ azp_t azp_sh;
if (tid == 0) {
float s = (mm.max - mm.min) / 255.f;
float zp = nearbyintf(-128.f - mm.min / s); // round-to-even
scale_sh = s;
azp_sh = azp_t(zp);
scale_out[blockIdx.x] = s;
azp_out[blockIdx.x] = azp_sh;
}
// Wait for the scale and azp to be computed
__syncthreads();
float const scale_val = scale_sh;
azp_type const azp_val = azp_sh;
const float inv_s = 1.f / scale_sh;
const azp_t azp = azp_sh;
// Quantize the values
for (int i = threadIdx.x; i < hidden_size; i += blockDim.x) {
auto const val = static_cast<float>(input[i]);
auto const quant_val =
int32_to_int8(float_to_int32_rn(val / scale_val) + azp_val);
out[i] = quant_val;
}
// 2. quantize
vectorize_with_alignment<16>(
row_in, row_out, hidden_size, tid, stride,
[=] __device__(int8_t& dst, const scalar_t& src) {
const auto v = static_cast<float>(src) * inv_s;
dst = int32_to_int8(float_to_int32_rn(v) + azp);
});
}
} // namespace vllm
@ -247,7 +285,7 @@ void static_scaled_int8_quant(torch::Tensor& out, // [..., hidden_size]
int const hidden_size = input.size(-1);
int const num_tokens = input.numel() / hidden_size;
dim3 const grid(num_tokens);
dim3 const block(std::min(hidden_size, 1024));
dim3 const block(std::min(hidden_size, 256));
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
VLLM_DISPATCH_FLOATING_TYPES(
input.scalar_type(), "static_scaled_int8_quant_kernel", [&] {
@ -278,7 +316,7 @@ void dynamic_scaled_int8_quant(
int const hidden_size = input.size(-1);
int const num_tokens = input.numel() / hidden_size;
dim3 const grid(num_tokens);
dim3 const block(std::min(hidden_size, 1024));
dim3 const block(std::min(hidden_size, 256));
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
VLLM_DISPATCH_FLOATING_TYPES(
input.scalar_type(), "dynamic_scaled_int8_quant_kernel", [&] {

View File

@ -9,10 +9,6 @@ void cutlass_scaled_mm_blockwise_sm100_fp8(torch::Tensor& out,
torch::Tensor const& b,
torch::Tensor const& a_scales,
torch::Tensor const& b_scales) {
TORCH_CHECK(
a.size(0) % 4 == 0,
"Input tensor must have a number of rows that is a multiple of 4. ",
"but got: ", a.size(0), " rows.");
if (out.dtype() == torch::kBFloat16) {
cutlass_gemm_blockwise_sm100_fp8_dispatch<cutlass::bfloat16_t>(
out, a, b, a_scales, b_scales);

View File

@ -1,5 +1,6 @@
#pragma once
#include "cuda_utils.h"
#include "cutlass/cutlass.h"
#include "cutlass/numeric_types.h"
@ -22,49 +23,49 @@ namespace vllm {
using namespace cute;
template <typename OutType, typename MmaTileShape, typename ScalesPerTile,
class ClusterShape, typename EpilogueScheduler,
typename MainloopScheduler>
// clang-format off
template <class OutType, int ScaleGranularityM,
int ScaleGranularityN, int ScaleGranularityK,
class MmaTileShape, class ClusterShape,
class EpilogueScheduler, class MainloopScheduler,
bool swap_ab_ = false>
struct cutlass_3x_gemm_fp8_blockwise {
static constexpr bool swap_ab = swap_ab_;
using ElementAB = cutlass::float_e4m3_t;
using ElementA = ElementAB;
using LayoutA = cutlass::layout::RowMajor;
using LayoutA_Transpose = typename cutlass::layout::LayoutTranspose<LayoutA>::type;
static constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value;
using ElementB = ElementAB;
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutB_Transpose = typename cutlass::layout::LayoutTranspose<LayoutB>::type;
static constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value;
using ElementC = void;
using ElementD = OutType;
using LayoutD = cutlass::layout::RowMajor;
using LayoutD_Transpose = typename cutlass::layout::LayoutTranspose<LayoutD>::type;
static constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;
using ElementC = void; // TODO: support bias
using LayoutC = LayoutD;
using LayoutC_Transpose = LayoutD_Transpose;
static constexpr int AlignmentC = AlignmentD;
using ElementAccumulator = float;
using ElementCompute = float;
using ElementBlockScale = float;
// MMA and Cluster Tile Shapes
// Shape of the tile computed by tcgen05 MMA, could be across 2 SMs if Cluster
// Shape %2 == 0 using MmaTileShape_MNK = Shape<_128,_128,_128>;
static constexpr int ScaleMsPerTile = size<0>(ScalesPerTile{});
static constexpr int ScaleGranularityM =
size<0>(MmaTileShape{}) / ScaleMsPerTile;
static constexpr int ScaleGranularityN =
size<1>(MmaTileShape{}) / size<1>(ScalesPerTile{});
static constexpr int ScaleGranularityK =
size<2>(MmaTileShape{}) / size<2>(ScalesPerTile{});
using ScaleConfig = conditional_t<swap_ab,
cutlass::detail::Sm100BlockwiseScaleConfig<
ScaleGranularityM, ScaleGranularityN, ScaleGranularityK,
cute::UMMA::Major::K, cute::UMMA::Major::MN>,
cutlass::detail::Sm100BlockwiseScaleConfig<
ScaleGranularityM, ScaleGranularityN, ScaleGranularityK,
cute::UMMA::Major::MN, cute::UMMA::Major::K>>;
// Shape of the threadblocks in a cluster
using ClusterShape_MNK = ClusterShape;
using ScaleConfig = cutlass::detail::Sm100BlockwiseScaleConfig<
ScaleGranularityM, ScaleGranularityN, ScaleGranularityK,
cute::UMMA::Major::MN, cute::UMMA::Major::K>;
// layout_SFA and layout_SFB cannot be swapped since they are deduced.
using LayoutSFA = decltype(ScaleConfig::deduce_layoutSFA());
using LayoutSFB = decltype(ScaleConfig::deduce_layoutSFB());
@ -73,7 +74,6 @@ struct cutlass_3x_gemm_fp8_blockwise {
static constexpr auto RoundStyle = cutlass::FloatRoundStyle::round_to_nearest;
using ElementScalar = float;
// clang-format off
using DefaultOperation = cutlass::epilogue::fusion::LinearCombination<ElementD, ElementCompute, ElementC, ElementScalar, RoundStyle>;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
ArchTag,
@ -84,33 +84,47 @@ struct cutlass_3x_gemm_fp8_blockwise {
ElementAccumulator,
ElementCompute,
ElementC,
LayoutC,
conditional_t<swap_ab, LayoutC_Transpose, LayoutC>,
AlignmentC,
ElementD,
LayoutD,
conditional_t<swap_ab, LayoutD_Transpose, LayoutD>,
AlignmentD,
EpilogueScheduler,
DefaultOperation
>::CollectiveOp;
using StageCountType = cutlass::gemm::collective::StageCountAuto;
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag,
OperatorClass,
ElementA,
cute::tuple<LayoutA, LayoutSFA>,
AlignmentA,
ElementB,
cute::tuple<LayoutB, LayoutSFB>,
AlignmentB,
ElementAccumulator,
MmaTileShape,
ClusterShape,
using CollectiveMainloop = conditional_t<swap_ab,
typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag,
OperatorClass,
ElementB,
cute::tuple<LayoutB_Transpose, LayoutSFA>,
AlignmentB,
ElementA,
cute::tuple<LayoutA_Transpose, LayoutSFB>,
AlignmentA,
ElementAccumulator,
MmaTileShape,
ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopScheduler
>::CollectiveOp;
// clang-format on
MainloopScheduler
>::CollectiveOp,
typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag,
OperatorClass,
ElementA,
cute::tuple<LayoutA, LayoutSFA>,
AlignmentA,
ElementB,
cute::tuple<LayoutB, LayoutSFB>,
AlignmentB,
ElementAccumulator,
MmaTileShape,
ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopScheduler
>::CollectiveOp>;
using KernelType = enable_sm100_only<cutlass::gemm::kernel::GemmUniversal<
Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue>>;
@ -123,6 +137,7 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a,
torch::Tensor const& b,
torch::Tensor const& a_scales,
torch::Tensor const& b_scales) {
static constexpr bool swap_ab = Gemm::swap_ab;
using GemmKernel = typename Gemm::GemmKernel;
using StrideA = typename Gemm::GemmKernel::StrideA;
using StrideB = typename Gemm::GemmKernel::StrideB;
@ -136,7 +151,6 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a,
using ElementD = typename Gemm::ElementD;
int32_t m = a.size(0), n = b.size(1), k = a.size(1);
auto prob_shape = cute::make_shape(m, n, k, 1);
StrideA a_stride;
StrideB b_stride;
@ -146,11 +160,13 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a,
b_stride =
cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(n, k, 1));
c_stride =
cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(m, n, 1));
cutlass::make_cute_packed_stride(StrideC{}, swap_ab ? cute::make_shape(n, m, 1) : cute::make_shape(m, n, 1));
LayoutSFA layout_SFA =
LayoutSFA layout_SFA = swap_ab ?
ScaleConfig::tile_atom_to_shape_SFA(make_shape(n, m, k, 1)) :
ScaleConfig::tile_atom_to_shape_SFA(make_shape(m, n, k, 1));
LayoutSFB layout_SFB =
LayoutSFB layout_SFB = swap_ab ?
ScaleConfig::tile_atom_to_shape_SFB(make_shape(n, m, k, 1)) :
ScaleConfig::tile_atom_to_shape_SFB(make_shape(m, n, k, 1));
auto a_ptr = static_cast<ElementAB*>(a.data_ptr());
@ -158,9 +174,22 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a,
auto a_scales_ptr = static_cast<float*>(a_scales.data_ptr());
auto b_scales_ptr = static_cast<float*>(b_scales.data_ptr());
typename GemmKernel::MainloopArguments mainloop_args{
a_ptr, a_stride, b_ptr, b_stride,
a_scales_ptr, layout_SFA, b_scales_ptr, layout_SFB};
auto mainloop_args = [&](){
// layout_SFA and layout_SFB cannot be swapped since they are deduced.
if (swap_ab) {
return typename GemmKernel::MainloopArguments{
b_ptr, b_stride, a_ptr, a_stride,
b_scales_ptr, layout_SFA, a_scales_ptr, layout_SFB
};
}
else {
return typename GemmKernel::MainloopArguments{
a_ptr, a_stride, b_ptr, b_stride,
a_scales_ptr, layout_SFA, b_scales_ptr, layout_SFB
};
}
}();
auto prob_shape = swap_ab ? cute::make_shape(n, m, k, 1) : cute::make_shape(m, n, k, 1);
auto c_ptr = static_cast<ElementD*>(out.data_ptr());
typename GemmKernel::EpilogueArguments epilogue_args{
@ -175,29 +204,74 @@ void cutlass_gemm_blockwise_sm100_fp8_dispatch(torch::Tensor& out,
torch::Tensor const& b,
torch::Tensor const& a_scales,
torch::Tensor const& b_scales) {
auto m = a.size(0);
auto k = a.size(1);
auto n = b.size(1);
int sms;
int32_t m = a.size(0), n = b.size(1), k = a.size(1), sms;
cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, a.get_device());
auto should_use_2sm = [&sms](int m, int n, int tile1SM = 128) {
return std::ceil(static_cast<float>(m) / tile1SM) *
std::ceil(static_cast<float>(n) / tile1SM) >=
sms;
};
bool use_2sm = should_use_2sm(m, n);
if (use_2sm) {
cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
OutType, Shape<_256, _128, _128>, Shape<_256, _1, _1>,
Shape<_2, _2, _1>, cutlass::epilogue::TmaWarpSpecialized2Sm,
cutlass::gemm::KernelTmaWarpSpecializedBlockwise2SmSm100>>(
out, a, b, a_scales, b_scales);
constexpr int TILE_K = 128;
// TODO: better heuristics
bool swap_ab = (m < 16) || (m % 4 != 0);
bool use_tma_epilogue = (m * n) % 4 == 0;
if (!swap_ab) {
constexpr int TILE_N = 128;
int tile_m = 256;
if (cuda_utils::ceil_div(n, TILE_N) * cuda_utils::ceil_div(m, 64) <= sms) {
tile_m = 64;
}
else if (cuda_utils::ceil_div(n, TILE_N) * cuda_utils::ceil_div(m, 128) <= sms) {
tile_m = 128;
}
if (tile_m == 64) {
if (use_tma_epilogue) {
cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
OutType, 1, TILE_N, TILE_K, Shape<_64, Int<TILE_N>, Int<TILE_K>>,
Shape<_1, _1, _1>, cutlass::epilogue::TmaWarpSpecialized1Sm,
cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>(
out, a, b, a_scales, b_scales);
} else {
cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
OutType, 1, TILE_N, TILE_K, Shape<_64, Int<TILE_N>, Int<TILE_K>>,
Shape<_1, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized1Sm,
cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>(
out, a, b, a_scales, b_scales);
}
} else if (tile_m == 128) {
if (use_tma_epilogue) {
cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
OutType, 1, TILE_N, TILE_K, Shape<_128, Int<TILE_N>, Int<TILE_K>>,
Shape<_1, _1, _1>, cutlass::epilogue::TmaWarpSpecialized1Sm,
cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>(
out, a, b, a_scales, b_scales);
} else {
cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
OutType, 1, TILE_N, TILE_K, Shape<_128, Int<TILE_N>, Int<TILE_K>>,
Shape<_1, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized1Sm,
cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>(
out, a, b, a_scales, b_scales);
}
} else { // tile_m == 256
if (use_tma_epilogue) {
cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
OutType, 1, TILE_N, TILE_K, Shape<_256, Int<TILE_N>, Int<TILE_K>>,
Shape<_2, _1, _1>, cutlass::epilogue::TmaWarpSpecialized2Sm,
cutlass::gemm::KernelTmaWarpSpecializedBlockwise2SmSm100>>(
out, a, b, a_scales, b_scales);
} else {
cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
OutType, 1, TILE_N, TILE_K, Shape<_256, Int<TILE_N>, Int<TILE_K>>,
Shape<_2, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized2Sm,
cutlass::gemm::KernelTmaWarpSpecializedBlockwise2SmSm100>>(
out, a, b, a_scales, b_scales);
}
}
} else {
// TODO: Test more tile N configs
constexpr int TILE_M = 128;
constexpr int TILE_N = 16;
// TMA epilogue isn't compatible with Swap A/B
cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
OutType, Shape<_128, _128, _128>, Shape<_128, _1, _1>,
Shape<_1, _1, _1>, cutlass::epilogue::TmaWarpSpecialized1Sm,
cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>(
OutType, TILE_M, 1, TILE_K, Shape<Int<TILE_M>, Int<TILE_N>, Int<TILE_K>>,
Shape<_1, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized1Sm,
cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100, true>>(
out, a, b, a_scales, b_scales);
}
}

View File

@ -15,16 +15,59 @@ using c3x::cutlass_gemm_caller;
template <typename InType, typename OutType,
template <typename, typename, typename> typename Epilogue>
struct sm100_fp8_config_default {
// M in (256, inf)
static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
using TileShape = Shape<_256, _128, _64>;
using TileShape = Shape<_256, _128, _128>;
using ClusterShape = Shape<_2, _2, _1>;
using Cutlass3xGemm =
cutlass_3x_gemm_sm100<InType, OutType, Epilogue, TileShape, ClusterShape,
KernelSchedule, EpilogueSchedule>;
};
template <typename InType, typename OutType,
template <typename, typename, typename> typename Epilogue>
struct sm100_fp8_config_M256 {
// M in (128, 256]
static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
using TileShape = Shape<_128, _128, _128>;
using ClusterShape = Shape<_2, _2, _1>;
using Cutlass3xGemm =
cutlass_3x_gemm_sm100<InType, OutType, Epilogue, TileShape, ClusterShape,
KernelSchedule, EpilogueSchedule>;
};
template <typename InType, typename OutType,
template <typename, typename, typename> typename Epilogue>
struct sm100_fp8_config_M128 {
// M in (64, 128]
static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
using TileShape = Shape<_128, _128, _256>;
using ClusterShape = Shape<_2, _4, _1>;
using Cutlass3xGemm =
cutlass_3x_gemm_sm100<InType, OutType, Epilogue, TileShape, ClusterShape,
KernelSchedule, EpilogueSchedule>;
};
template <typename InType, typename OutType,
template <typename, typename, typename> typename Epilogue>
struct sm100_fp8_config_M64 {
// M in [1, 64]
static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;
using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto;
using TileShape = Shape<_64, _64, _256>;
using ClusterShape = Shape<_1, _8, _1>;
using Cutlass3xGemm =
cutlass_3x_gemm_sm100<InType, OutType, Epilogue, TileShape, ClusterShape,
KernelSchedule, EpilogueSchedule>;
};
template <typename InType, typename OutType,
template <typename, typename, typename> typename Epilogue,
typename... EpilogueArgs>
@ -39,8 +82,34 @@ inline void cutlass_gemm_sm100_fp8_dispatch(torch::Tensor& out,
using Cutlass3xGemmDefault =
typename sm100_fp8_config_default<InType, OutType,
Epilogue>::Cutlass3xGemm;
return cutlass_gemm_caller<Cutlass3xGemmDefault>(
out, a, b, std::forward<EpilogueArgs>(args)...);
using Cutlass3xGemmM64 =
typename sm100_fp8_config_M64<InType, OutType, Epilogue>::Cutlass3xGemm;
using Cutlass3xGemmM128 =
typename sm100_fp8_config_M128<InType, OutType, Epilogue>::Cutlass3xGemm;
using Cutlass3xGemmM256 =
typename sm100_fp8_config_M256<InType, OutType, Epilogue>::Cutlass3xGemm;
uint32_t const m = a.size(0);
uint32_t const mp2 =
std::max(static_cast<uint32_t>(64), next_pow_2(m)); // next power of 2
if (mp2 <= 64) {
// m in [1, 64]
return cutlass_gemm_caller<Cutlass3xGemmM64>(
out, a, b, std::forward<EpilogueArgs>(args)...);
} else if (mp2 <= 128) {
// m in (64, 128]
return cutlass_gemm_caller<Cutlass3xGemmM128>(
out, a, b, std::forward<EpilogueArgs>(args)...);
} else if (mp2 <= 256) {
// m in (128, 256]
return cutlass_gemm_caller<Cutlass3xGemmM256>(
out, a, b, std::forward<EpilogueArgs>(args)...);
} else {
// m in (256, inf)
return cutlass_gemm_caller<Cutlass3xGemmDefault>(
out, a, b, std::forward<EpilogueArgs>(args)...);
}
}
template <template <typename, typename, typename> typename Epilogue,

View File

@ -84,7 +84,8 @@ void run_cutlass_moe_mm_sm90(
torch::Tensor const& b_tensors, torch::Tensor const& a_scales,
torch::Tensor const& b_scales, torch::Tensor const& expert_offsets,
torch::Tensor const& problem_sizes, torch::Tensor const& a_strides,
torch::Tensor const& b_strides, torch::Tensor const& c_strides) {
torch::Tensor const& b_strides, torch::Tensor const& c_strides,
bool per_act_token, bool per_out_ch) {
TORCH_CHECK(a_tensors.size(0) > 0, "No input A tensors provided.");
TORCH_CHECK(b_tensors.size(0) > 0, "No input B tensors provided.");
TORCH_CHECK(out_tensors.size(0) > 0, "No output tensors provided.");
@ -113,19 +114,23 @@ void run_cutlass_moe_mm_sm90(
if (n >= 8192) {
cutlass_group_gemm_caller<Cutlass3xGemmN8192>(
out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
problem_sizes, a_strides, b_strides, c_strides);
problem_sizes, a_strides, b_strides, c_strides, per_act_token,
per_out_ch);
} else if (k >= 8192) {
cutlass_group_gemm_caller<Cutlass3xGemmK8192>(
out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
problem_sizes, a_strides, b_strides, c_strides);
problem_sizes, a_strides, b_strides, c_strides, per_act_token,
per_out_ch);
} else if (m <= 16) {
cutlass_group_gemm_caller<Cutlass3xGemmM16>(
out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
problem_sizes, a_strides, b_strides, c_strides);
problem_sizes, a_strides, b_strides, c_strides, per_act_token,
per_out_ch);
} else {
cutlass_group_gemm_caller<Cutlass3xGemmDefault>(
out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
problem_sizes, a_strides, b_strides, c_strides);
problem_sizes, a_strides, b_strides, c_strides, per_act_token,
per_out_ch);
}
}
@ -134,15 +139,18 @@ void dispatch_moe_mm_sm90(
torch::Tensor const& b_tensors, torch::Tensor const& a_scales,
torch::Tensor const& b_scales, torch::Tensor const& expert_offsets,
torch::Tensor const& problem_sizes, torch::Tensor const& a_strides,
torch::Tensor const& b_strides, torch::Tensor const& c_strides) {
torch::Tensor const& b_strides, torch::Tensor const& c_strides,
bool per_act_token, bool per_out_ch) {
if (out_tensors.dtype() == torch::kBFloat16) {
run_cutlass_moe_mm_sm90<cutlass::float_e4m3_t, cutlass::bfloat16_t>(
out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
problem_sizes, a_strides, b_strides, c_strides);
problem_sizes, a_strides, b_strides, c_strides, per_act_token,
per_out_ch);
} else {
run_cutlass_moe_mm_sm90<cutlass::float_e4m3_t, cutlass::half_t>(
out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
problem_sizes, a_strides, b_strides, c_strides);
problem_sizes, a_strides, b_strides, c_strides, per_act_token,
per_out_ch);
}
}
@ -153,8 +161,9 @@ void cutlass_moe_mm_sm90(
torch::Tensor const& b_tensors, torch::Tensor const& a_scales,
torch::Tensor const& b_scales, torch::Tensor const& expert_offsets,
torch::Tensor const& problem_sizes, torch::Tensor const& a_strides,
torch::Tensor const& b_strides, torch::Tensor const& c_strides) {
torch::Tensor const& b_strides, torch::Tensor const& c_strides,
bool per_act_token, bool per_out_ch) {
dispatch_moe_mm_sm90(out_tensors, a_tensors, b_tensors, a_scales, b_scales,
expert_offsets, problem_sizes, a_strides, b_strides,
c_strides);
c_strides, per_act_token, per_out_ch);
}

View File

@ -76,7 +76,8 @@ void cutlass_group_gemm_caller(
torch::Tensor const& b_tensors, torch::Tensor const& a_scales,
torch::Tensor const& b_scales, torch::Tensor const& expert_offsets,
torch::Tensor const& problem_sizes, torch::Tensor const& a_strides,
torch::Tensor const& b_strides, torch::Tensor const& c_strides) {
torch::Tensor const& b_strides, torch::Tensor const& c_strides,
bool per_act_token, bool per_out_ch) {
using ElementAB = typename Gemm::ElementAB;
using ElementD = typename Gemm::ElementD;
@ -84,9 +85,6 @@ void cutlass_group_gemm_caller(
int k_size = a_tensors.size(1);
int n_size = out_tensors.size(1);
bool per_act_token = a_scales.numel() != 1;
bool per_out_ch = b_scales.numel() != num_experts;
auto stream = at::cuda::getCurrentCUDAStream(a_tensors.device().index());
auto options_int =

Some files were not shown because too many files have changed in this diff Show More