Compare commits

...

519 Commits

Author SHA1 Message Date
3e2e549c49 add a small update
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-10-30 16:29:52 -07:00
e1e2d61788 update time
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-10-30 15:09:19 -07:00
467d3269a5 add batch invariant test to ci
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-10-30 14:47:11 -07:00
e7acb20076 [Feature] Batch invariant torch.compile (#27660)
Signed-off-by: PaulZhang12 <paulzhan@fb.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
2025-10-30 13:11:29 -07:00
4b68c4a55b [Core][Perf] Only invoke save_new_computed_blocks when computed blocks are not empty (#27799)
Signed-off-by: Jialin Ouyang <Jialin.Ouyang@gmail.com>
2025-10-30 19:47:30 +00:00
a8141fa649 [Refactor] Remove VLLM_DEEPEP_LOW_LATENCY_ALLOW_NVLINK (#27750)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-10-30 15:32:39 -04:00
4917002523 [Fix] Skip record_sleep_state logic in PrometheusStatsLogger if not in dev mode (#27789)
Signed-off-by: SumanthRH <sumanthrh99@gmail.com>
2025-10-30 19:26:27 +00:00
a2981c4272 [EP/DP][API Server] Enable DP-aware routing in OpenAI API requests (#24945)
Co-authored-by: Cong Chen <prowindy@gmail.com>
2025-10-30 12:10:16 -07:00
4574d48bab [Core][Bookkeeping] Update cu_num_accepted_tokens for all req_index (#27629)
Signed-off-by: Jialin Ouyang <Jialin.Ouyang@gmail.com>
2025-10-30 11:52:36 -07:00
ab98f6556f [Bugfix] Fix 2 precommit issues - (mamba_block_size, kv_cache_config) (#27811)
Signed-off-by: Tyler Michael Smith <tlrmchlsmth@gmail.com>
Signed-off-by: Tyler Michael Smith <tysmith@redhat.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
2025-10-30 11:52:18 -07:00
2918c1b49c [Model] Use the same fused_moe configs for all H200 devices (#23642)
Signed-off-by: Roger Meier <r.meier@siemens.com>
2025-10-30 17:36:56 +00:00
1004205795 [MTP] Refactor mtp predictor to avoid d2h operation (#27643)
Signed-off-by: MengqingCao <cmq0113@163.com>
2025-10-30 17:27:39 +00:00
ba33e8830d Reapply "Install pre-built xformers-0.0.32.post2 built with pt-2.9.0" (#27768)
Signed-off-by: Huy Do <huydhn@gmail.com>
2025-10-30 10:22:30 -07:00
33a0ea5f32 [Docs] add Shanghai Meetup - 2025/10 (#27545)
Signed-off-by: Kebe <mail@kebe7jun.com>
Signed-off-by: esmeetu <jasonailu87@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: esmeetu <jasonailu87@gmail.com>
2025-10-31 00:33:13 +08:00
60f76baa66 [Misc] Replace CUDA_VISIBLE_DEVICES in DP with torch.cuda.set_device for device selection on cuda-like devices (#27564)
Signed-off-by: ilmarkov <markovilya197@gmail.com>
Co-authored-by: Tyler Michael Smith <tlrmchlsmth@gmail.com>
2025-10-30 11:41:44 -04:00
e5e076cad7 [BugFix] Stopgap - Flashinfer Autotuner + GPT-OSS + DP/TP (#27762)
Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
2025-10-30 08:24:31 -07:00
eebf00cb0c [Bugfix][CPU] Fix MRoPE dispatch on the CPU backend (#27800)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
2025-10-30 15:12:05 +00:00
9956aae4ea [Model][Ouro] Support Ouro Model (#27794)
Signed-off-by: yinfan.1024 <yinfan.1024@bytedance.com>
Signed-off-by: youkaichao <youkaichao@gmail.com>
Co-authored-by: yinfan.1024 <yinfan.1024@bytedance.com>
Co-authored-by: youkaichao <youkaichao@gmail.com>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
2025-10-30 22:34:41 +08:00
0fe0140408 [KV offload] Enable CPU KV offload on CUDA alike Platforms (#27770)
Signed-off-by: zhewenli <zhewenli@meta.com>
2025-10-30 22:10:29 +08:00
4e68cc9b6a [Model] Introduce Kimi Linear to vLLM (#27809)
Signed-off-by: lizhiyuan <lizhiyuan@moonshot.cn>
Signed-off-by: Zhiyuan Li <uniartisan2017@gmail.com>
2025-10-30 21:02:27 +08:00
1994de99ea [CI Failure] Fix test_kv_cache_model_load_and_run (#27717)
Signed-off-by: Huamin Li <3ericli@gmail.com>
2025-10-30 12:27:53 +00:00
4464723f22 [Frontend][Doc][5/N] Improve all pooling task | Polish encode (pooling) api & Document. (#25524)
Signed-off-by: wang.yuqi <noooop@126.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-10-30 12:13:05 +00:00
74374386e2 [Bugfix] Improve GPU validation logging in Ray fallback scenarios (#25775)
Signed-off-by: Sairam Pillai <sairam.pillai61@gmail.com>
2025-10-30 11:57:59 +00:00
c01f6e525f [CI] Fix mypy for vllm/v1/core and vllm/v1/engine (#27108)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-10-30 11:32:17 +00:00
c7d2a554ba [CI Failure] fix test_default_mm_loras (#27795)
Signed-off-by: Huamin Li <3ericli@gmail.com>
2025-10-30 18:13:03 +08:00
af826e0820 [V0 deprecation] Remove VLLM_USE_V1 usage in config module (#27784)
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-10-30 09:42:49 +00:00
e806178d2a [BugFix][VL] Fix FA selection on Qwen2.5-VL (#27790)
Signed-off-by: zhewenli <zhewenli@meta.com>
Co-authored-by: Roger Wang <hey@rogerw.io>
2025-10-30 07:54:44 +00:00
5be1bed790 [CI/Build]Add eval config for Qwen3-235B-A22B-Instruct-2507-FP8 (#27113)
Signed-off-by: Huamin Li <3ericli@gmail.com>
2025-10-30 07:50:56 +00:00
31b55ffc62 use stringData in secret yaml to store huggingface token (#25685)
Signed-off-by: yiting.jiang <yiting.jiang@daocloud.io>
2025-10-30 00:47:36 -07:00
ded8ada86a Add more dims for batch invariant shims (#27489)
Signed-off-by: Bram Wasti <bwasti@meta.com>
Signed-off-by: Bram Wasti <bwasti@fb.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-10-30 05:28:45 +00:00
8bff831f0a [Benchmark] Cleanup deprecated nightly benchmark and adjust the docstring for performance benchmark (#25786)
Signed-off-by: KuntaiDu <kuntai@uchicago.edu>
2025-10-30 04:43:37 +00:00
b5d70751d8 [BugFix] Reordering extend logic fix (#27739)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
2025-10-29 21:39:34 -07:00
b8c48c5d72 kernels/moe test pruning (#27053)
Signed-off-by: Fardin Hoque <kfhfar@amazon.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
2025-10-30 12:10:34 +08:00
17d055f527 [Feat] Adds runai distributed streamer (#27230)
Signed-off-by: bbartels <benjamin@bartels.dev>
Signed-off-by: Benjamin Bartels <benjamin@bartels.dev>
Co-authored-by: omer-dayan <omdayan@nvidia.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2025-10-29 21:09:10 -07:00
2ce5c5d3d6 [BugFix] Handle unscheduled requests properly when async scheduling (#27756)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-10-29 21:04:25 -07:00
b5bae42f91 [XPU] Update latest IPEX 2.8 release (#27735)
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
2025-10-30 11:17:13 +08:00
d7fb10c574 [Bugfix] mamba-block-size is set for vision language model (#27773)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-10-29 19:39:57 -07:00
b798e39f93 [XPU][bugfix] fix rope for llama4 and deepseek (#25145)
Signed-off-by: Yan Ma <yan.ma@intel.com>
2025-10-30 09:43:13 +08:00
48eb8eba58 [Temp fix] Disable torch.compile for Qwen2.5 VL's VisionBlock temporarily. (#27760)
Signed-off-by: Chenheli Hua <huachenheli@outlook.com>
Signed-off-by: Roger Wang <hey@rogerw.io>
Co-authored-by: Roger Wang <hey@rogerw.io>
2025-10-29 23:17:48 +00:00
b5d90f7400 [Bug] Fix DBO IMA issue for DeepEPHT (#27666)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-10-29 16:28:27 -04:00
d4aa144343 [BugFix] Fix handling of resumed reqs in SharedStorageConnector (#27719)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-10-29 20:16:52 +00:00
fcb1d570bb [Bug] Fix DeepEP low latency assert self.batched_router_logits.size(-1) == full_router_logits.size(-1) Bug (#27682)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-10-29 14:50:39 -04:00
accb8fab07 [KVConnector] Add metrics to Prometheus-Grafana dashboard (#26811)
Signed-off-by: NickLucche <nlucches@redhat.com>
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
Co-authored-by: Mark McLoughlin <markmc@redhat.com>
2025-10-29 18:44:49 +00:00
5b0448104f [Bug] Raise error explicitly if using incompatible backend (#27424)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-10-29 13:29:20 -04:00
f7a6682872 [CI/Build] Test torchrun with 8 cards (#27548)
Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
2025-10-29 10:26:06 -07:00
a9fe0793f2 use_aot_compile should respect VLLM_DISABLE_COMPILE_CACHE (#27698)
Signed-off-by: Boyuan Feng <boyuan@meta.com>
2025-10-29 17:08:54 +00:00
7568a282b9 [FIXBUG] Qwen3VL hallucinations without Contiguous on Torch.SDPA (#27744)
Signed-off-by: JartX <sagformas@epdcenter.es>
Co-authored-by: Lukas Geiger <lukas.geiger94@gmail.com>
2025-10-29 16:55:35 +00:00
1da3309ace [Core] Exposing engine sleep & wake_up state as prometheus metrics (#24176)
Signed-off-by: Braulio Dumba <Braulio.Dumba@ibm.com>
2025-10-29 09:32:01 -07:00
5522fb274b [Chore] Optimize P2PNCCLEngine http_address (#27488)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-10-30 00:05:09 +08:00
0f95a1c3f2 [CI] Fix flaky test_two_responses_with_same_prev_id test (#27745)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-10-29 15:10:35 +00:00
ded24e3e54 [ROCm][Platform] Add MI308X device id in _ROCM_DEVICE_ID_NAME_MAP (#27623)
Signed-off-by: Xiake Sun <xiake.sun@amd.com>
2025-10-29 14:44:03 +00:00
d6704dd099 Fix MiniMax-M2 rmsnorm precision and remove useless code (#27627)
Signed-off-by: xuebi <xuebi@minimaxi.com>
Co-authored-by: xuebi <xuebi@minimaxi.com>
2025-10-29 21:01:05 +08:00
ecca3fee76 [Frontend] Add vllm bench sweep to CLI (#27639)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-29 05:59:48 -07:00
9a0d2f0d92 [CI/Build] Skip cpu offloading test on AMD (#27690)
Signed-off-by: zhewenli <zhewenli@meta.com>
2025-10-29 12:55:51 +00:00
ad3ec89532 [VLM] Add Qwen3-VL generation test (#25185)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Signed-off-by: Roger Wang <hey@rogerw.io>
Co-authored-by: Roger Wang <hey@rogerw.io>
2025-10-29 12:19:37 +00:00
3481e40743 [chore] Remove models weight on S3 logic (#27725)
Signed-off-by: kevin <kevin@anyscale.com>
2025-10-29 10:29:49 +00:00
5e72216d17 Feature/video support in random mm dataset (#25963)
Signed-off-by: Eugene Khvedchenia <ekhvedchenia@nvidia.com>
Signed-off-by: Eugene Khvedchenya <ekhvedchenia@nvidia.com>
Co-authored-by: Roger Wang <hey@rogerw.io>
2025-10-29 18:24:52 +08:00
1a33aacf82 [Misc] Raise error for missing video metadata in MultiModalDataParser (#27664)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Signed-off-by: Isotr0py <2037008807@qq.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-10-29 10:06:42 +00:00
7ba6aa8f56 [Fix] import get_kv_cache_torch_dtype error in LMCacheConnector integration (#27670)
Signed-off-by: KevinCheung2259 <2651309292@qq.com>
2025-10-29 10:03:54 +00:00
ab2eb27b74 [Frontend] [gpt-oss] Mcp type bug (#27689)
Signed-off-by: Jialin Ouyang <Jialin.Ouyang@gmail.com>
Signed-off-by: Alec Solder <alecs@fb.com>
Signed-off-by: Ye (Charlotte) Qi <yeq@meta.com>
Co-authored-by: Jialin Ouyang <Jialin.Ouyang@gmail.com>
Co-authored-by: Alec Solder <alecs@fb.com>
Co-authored-by: Ye (Charlotte) Qi <yeq@meta.com>
2025-10-29 10:01:32 +00:00
3c7fefdeba [Frontend] [gpt-oss] Tool json call parsing error retry (#27675)
Signed-off-by: Jialin Ouyang <Jialin.Ouyang@gmail.com>
Signed-off-by: Alec Solder <alecs@fb.com>
Signed-off-by: Ye (Charlotte) Qi <yeq@meta.com>
Co-authored-by: Jialin Ouyang <Jialin.Ouyang@gmail.com>
Co-authored-by: Alec Solder <alecs@fb.com>
Co-authored-by: Ye (Charlotte) Qi <yeq@meta.com>
2025-10-29 09:42:44 +00:00
1891cf605a [Bugfix] Fix modular kernel tests (#27707)
Signed-off-by: Bill Nell <bnell@redhat.com>
2025-10-29 16:14:33 +08:00
8df98c2161 [perf] Enable concurrent execution of "shared_experts" and "selected_experts" in qwen3-next (#27578)
Signed-off-by: zjy0516 <riverclouds.zhu@qq.com>
2025-10-29 08:12:54 +00:00
4fb8771cc0 [CI/Build] Move pre-commit only scripts to tools/pre_commit (#27657)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-29 08:04:33 +00:00
413ef7a3b4 [Speculators] Move tests + fix integration (#27308)
Signed-off-by: Dipika Sikka <dipikasikka1@gmail.com>
Signed-off-by: Rahul Tuli <rtuli@redhat.com>
Signed-off-by: rahul-tuli <rtuli@redhat.com>
Co-authored-by: Rahul Tuli <rtuli@redhat.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
2025-10-29 00:54:21 -07:00
8b62495076 [Bugfix] Fix non-contiguous tensor error in rocm_unquantized_gemm_impl (#27605)
Signed-off-by: zhewenli <zhewenli@meta.com>
2025-10-29 00:00:15 -07:00
83fd49b1fc [CI/Build][Bugfix]Fix Quantized Models Test on AMD (#27712)
Signed-off-by: zhewenli <zhewenli@meta.com>
2025-10-29 06:27:30 +00:00
a4a4f0f617 [KV Connector] Update lmcache connector with latest compatibility (#27681)
Signed-off-by: Samuel Shen <slshen@uchicago.edu>
Co-authored-by: Samuel Shen <slshen@uchicago.edu>
2025-10-29 05:38:37 +00:00
0d8161b075 [Model] Fix Qwen3VL and Qwen3Omni after torch.compile changes (#27705)
Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>
Signed-off-by: Roger Wang <hey@rogerw.io>
Co-authored-by: Roger Wang <hey@rogerw.io>
2025-10-29 05:28:20 +00:00
d2c33c397a [NIXL][XPU] update name of nixl wheel (#27631)
Signed-off-by: zhenwei-intel <zhenwei.liu@intel.com>
2025-10-29 12:43:29 +08:00
f6d5f5888c [Build] Revert triton_kernels requirements (#27659) 2025-10-28 21:07:09 -07:00
9007bf57e6 Revert "Install pre-built xformers-0.0.32.post2 built with pt-2.9.0" (#27714) 2025-10-28 20:58:01 -07:00
f257544709 Install pre-built xformers-0.0.32.post2 built with pt-2.9.0 (#27598)
Signed-off-by: Huy Do <huydhn@gmail.com>
Co-authored-by: Roger Wang <hey@rogerw.io>
2025-10-28 19:39:15 -07:00
0b51c9bd8b [Core] Early return in SlidingWindowManager.remove_skipped_blocks (#27673)
Signed-off-by: Jialin Ouyang <Jialin.Ouyang@gmail.com>
2025-10-29 01:32:33 +00:00
d3ab240f39 [Bug] Fix deepep low latency use nvlink by default (#27677)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-10-28 23:53:12 +00:00
94666612a9 [Misc][qwen2_5_vl][torch.compile] Enable supports_torch_compile on generic nn.Module and demonstrate speedup on Qwen Vision model (#23207)
Signed-off-by: Lucas Kabela <lucaskabela@meta.com>
Signed-off-by: Lucas Kabela <lucasakabela@gmail.com>
2025-10-28 22:36:43 +00:00
4fe5895361 [AsyncScheduling] Make async overlap work with logprobs (#27615)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-10-28 22:35:54 +00:00
111faf1118 [Core] Scheduler: Publish connector events after output (#25875)
Signed-off-by: Or Ozeri <oro@il.ibm.com>
2025-10-28 21:01:33 +00:00
6afc28a9ba [Test] Batch Invariant: Unit test using parameterized backend (#27478)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-10-28 13:51:35 -07:00
141e6a0505 [Misc] Make reorder batch also separate extends (#27367)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
2025-10-28 10:55:10 -07:00
130aa8cbcf Add load pattern configuration guide to benchmarks (#26886)
Signed-off-by: Matvei Pashkovskii <mpashkov@amd.com>
Signed-off-by: Matvei Pashkovskii <matvei.pashkovskii@amd.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-28 10:49:15 -07:00
e3d8186666 [compile] Add fallback path to AOT compile when serialization fails. (#27350)
Signed-off-by: zhxchen17 <zhxchen17@fb.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2025-10-28 12:54:26 -04:00
f5710ef02a [Misc] Make LayerBlockType a Literal instead of Enum (#27658)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-28 16:23:35 +00:00
a8c02fb5bf [Bugfix][CI] Fix v1 attention backend tests and add CI coverage (#26597)
Signed-off-by: Mohammad Miadh Angkad <MAngkad.BSDSBA2027@aim.edu>
Signed-off-by: Mohammad Miadh Angkad <mangkad.bsdsba2027@aim.edu>
Co-authored-by: Ye (Charlotte) Qi <yeq@meta.com>
2025-10-28 11:42:05 -04:00
02af36df36 [Bugfix] Fix allocation & free logic of SingleWriterShmRingBuffer (#27117)
Signed-off-by: Kero Liang <kerorek@outlook.com>
Signed-off-by: Roger Wang <hey@rogerw.io>
Co-authored-by: donglu <donglu@cohere.com>
Co-authored-by: Roger Wang <hey@rogerw.io>
2025-10-28 15:01:24 +00:00
e88bdd60d9 [FLA] Introduce Kimi Delta Attention(KDA) to VLLM (#27654)
Signed-off-by: lizhiyuan <lizhiyuan@moonshot.cn>
2025-10-28 22:56:28 +08:00
05e034f085 [nit]: Fix import for the lmcache integration (#27600)
Signed-off-by: Samuel Shen <slshen@uchicago.edu>
Co-authored-by: Samuel Shen <slshen@uchicago.edu>
2025-10-28 14:40:55 +00:00
936643a868 [BugFix] Also consider RAY_EXPERIMENTAL_NOSET_* when storing compilation cache (#27294)
Signed-off-by: Hollow Man <hollowman@opensuse.org>
2025-10-28 10:22:28 -04:00
b186149e8e [Bugfix][Frontend] validate arg priority in frontend LLM class before add request (#27596)
Signed-off-by: Junpu Fan <junpufan@gmail.com>
2025-10-28 14:02:43 +00:00
2abbd351ef [Core] Enable async scheduling for external_launcher mode (#27394)
Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
Co-authored-by: Zhuohan Li <zhuohan123@gmail.com>
2025-10-28 13:52:47 +00:00
446912d1cb fix: allow HuggingFace standard chat template params via **kwargs (#27622)
Signed-off-by: wangln19 <wanglinian@dev.wanglinian.msh-dev.svc.cluster.local>
Signed-off-by: wangln19 <96399074+wangln19@users.noreply.github.com>
Co-authored-by: wangln19 <wanglinian@dev.wanglinian.msh-dev.svc.cluster.local>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-10-28 21:12:34 +08:00
a00d6254e9 [compile] Disable dynamo guards check for AOT compilation. (#27288)
Signed-off-by: zhxchen17 <zhxchen17@fb.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2025-10-28 12:58:12 +00:00
05181cc57f [Hybrid] Add mamba_block_size to Engine Args (#27289)
Signed-off-by: asafg <39553475+Josephasafg@users.noreply.github.com>
2025-10-28 12:54:24 +00:00
259504e147 [compile] Add enable_prompt_embeds to compile hash. (#27285)
Signed-off-by: zhxchen17 <zhxchen17@fb.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2025-10-28 20:46:03 +08:00
0484b64248 [Bug] Fix shape issue for eplb expert weights (#27589)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2025-10-28 20:44:05 +08:00
f58d9b6404 [Misc] Separate out utils.counter and move utils.Device to engine (#27588)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-28 12:20:46 +00:00
44b5ce956d [Bugfix] In LongRoPE, decide short vs long based on max_model_len (#27431)
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
2025-10-28 12:00:56 +00:00
7a865f2325 [V0 Deprecation] Remove vestigial V0 logits_processors.py file (#27601)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-10-28 19:17:45 +08:00
2fa90bda27 Fix a robust parsing issue in KimiK2ToolParser that causes IndexError (#27565)
Signed-off-by: wangln19 <wanglinian@dev.wanglinian.msh-dev.svc.cluster.local>
Co-authored-by: wangln19 <wanglinian@dev.wanglinian.msh-dev.svc.cluster.local>
2025-10-28 11:11:50 +00:00
0291fbf65c [CI/Build] Fix amd model executor test (#27612)
Signed-off-by: zhewenli <zhewenli@meta.com>
2025-10-28 08:58:11 +00:00
b46e4a06f1 [Core][Bookkeeping Optimization] Update against numpy view of is_token_ids tensor (#27618)
Signed-off-by: Jialin Ouyang <Jialin.Ouyang@gmail.com>
2025-10-28 08:13:10 +00:00
d34f5fe939 [Bugfix][CPU] Fallback oneDNN linear to torch linear to fix half gemm support on legecy platforms (#27526)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-10-27 23:25:44 -07:00
bdb01a38fe [Hardware][AMD][Model] Triton MoE tuning configs for GLM-4.6 for MI300X (#27323)
Signed-off-by: minatoaquaMK2 <jiacheng.yue@foxmail.com>
2025-10-27 22:58:06 -07:00
5b3c35a68e [ROCm] [Doc] Update ROCm installation docs (#27327)
Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
2025-10-28 13:00:50 +08:00
61fbfe5274 [Bugfix] fixed inconsistent finish_reason handling between V0 and V1 engines (#27555)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-10-28 02:18:08 +00:00
255e34ca50 [Stability fix] turn off HMA allocator when connector is set (#27592)
Signed-off-by: KuntaiDu <kuntai@uchicago.edu>
Signed-off-by: Kuntai Du <kuntai@uchicago.edu>
2025-10-27 18:32:23 -07:00
a8d2e326ec [Bugfix][CI] Fix config resolving logic with remote models (#27610) 2025-10-28 00:48:32 +00:00
53a56e658b [gpt-oss][2/N] Support input_messages in responsesRequest (#26962)
Signed-off-by: Andrew Xia <axia@fb.com>
Co-authored-by: Andrew Xia <axia@fb.com>
2025-10-27 23:15:49 +00:00
69f064062b Code quality improvements: version update, type annotation enhancement, and enum usage simplification (#27581)
Signed-off-by: Bradley <bradley.b.pitt@gmail.com>
2025-10-27 17:50:22 +00:00
921e78f4bb [ROCm] Update AITER branch for ROCm base docker (#27586)
Signed-off-by: Micah Williamson <micah.williamson@amd.com>
2025-10-27 17:22:33 +00:00
6ebffafbb6 [Misc] Clean up more utils (#27567)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-27 15:30:38 +00:00
3b96f85c36 [Chore]: Stream tokens vs characters in tool call parser tests (#26513)
Signed-off-by: Ben Browning <bbrownin@redhat.com>
2025-10-27 23:06:25 +08:00
23ad820553 fixing mm placeholder replacement issue with gemma3 (#27538)
Signed-off-by: tingtingtang1992 <streamttt@gmail.com>
2025-10-27 14:34:01 +00:00
5d3be3ba4c [Bugfix][LoRA][FusedMoE] Select MxFP4 Backend based on LoRA Enablement (#27487)
Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
2025-10-27 07:32:50 -07:00
4f882be4a0 [Model] Siglip2 Model Support (#27566)
Signed-off-by: piood <2477084691@qq.com>
2025-10-27 06:57:37 -07:00
9273754222 [Hybrid] Added supports_mamba_prefix_caching Protocol (#27339)
Signed-off-by: asafg <39553475+Josephasafg@users.noreply.github.com>
2025-10-27 13:05:20 +00:00
f4e8154076 [Kernel] Enable moe LoRA kernel support FP16 (#27468)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-10-27 19:48:37 +08:00
a663f6ae64 [cpu][perf] Fix low CPU utilization with VLLM_CPU_OMP_THREADS_BIND on AArch64 (#27415)
Signed-off-by: Fadi Arafeh <fadi.arafeh@arm.com>
2025-10-27 11:14:55 +00:00
a4fc21895e [Bugfix] Fixed when return_token_ids=False, the first event still contains prompt_token_ids. (#27561)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-10-27 11:06:43 +00:00
a3e8611da5 [Bugfix] Limit the default value of max_model_len when it is not specified by users (#27556)
Signed-off-by: shen-shanshan <467638484@qq.com>
2025-10-27 10:16:20 +00:00
7c2bdb83dc [Misc] Clean up utils (#27552)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-27 09:05:40 +00:00
9932ed6a83 [Kernel] Adding split_K implementation for fused_moe_lora (#27291)
Signed-off-by: Danielle Robinson <dmmaddix@amazon.com>
Signed-off-by: Danielle Robinson <dcmaddix@gmail.com>
Co-authored-by: Danielle Robinson <dmmaddix@amazon.com>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
2025-10-27 02:05:24 -07:00
2d631d28c6 [Doc] Slight improvement to M2 and beyond (#27554)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Roger Wang <hey@rogerw.io>
2025-10-27 09:02:10 +00:00
b368382964 [Model] Deprecate merge_by_field_config=False (#27551)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-27 16:43:00 +08:00
a806c14cc7 [Performance][LoRA] add context varying params to 'do_not_specialize' in fused moe lora (#27445)
Signed-off-by: gnovack <gnovack@amazon.com>
2025-10-27 06:31:55 +00:00
181bf5bbde [Docs] reemove the incorrect enable_reasoning parameter (#27550)
Signed-off-by: zxw <1020938856@qq.com>
2025-10-26 23:17:19 -07:00
cbd5e07a51 [Model] Use merge_by_field_config for MM models (Qwen series) (#27546)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-27 05:38:05 +00:00
63b22e0dbb [Model][Bugfix] fix ernie45 moe 300B SharedFusedMoE output tuple (#27316)
Signed-off-by: wangyafeng <wangyafeng@baidu.com>
2025-10-26 20:53:31 -07:00
5980604c44 Fix MiniMax-M2 copyright (#27537)
Signed-off-by: xuebi <xuebi@minimaxi.com>
Co-authored-by: xuebi <xuebi@minimaxi.com>
2025-10-27 03:29:51 +00:00
361a7463d3 fix m2 test (#27536)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-10-27 01:04:36 +08:00
720af6ab79 [Model][MiniMax-M2] Support MiniMax-M2 Model (#27535)
Signed-off-by: xuebi <xuebi@minimaxi.com>
Co-authored-by: xuebi <xuebi@minimaxi.com>
2025-10-27 00:59:11 +08:00
55cba4a05c [CI/Build] Update causal-conv1d installation (#27529)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-26 22:14:22 +08:00
c7abff2990 Revert "[CI/Build] Use CPU for mm processing test on CI (#27522)" (#27531)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-26 04:44:27 -07:00
71b1c8b667 [Chore]:Extract math and argparse utilities to separate modules (#27188)
Signed-off-by: Yeshwanth Surya <yeshsurya@gmail.com>
Signed-off-by: Yeshwanth N <yeshsurya@gmail.com>
Signed-off-by: yeshsurya <yeshsurya@gmail.com>
2025-10-26 04:03:32 -07:00
8fb7b2fab9 [Doc] Fix links to GH projects (#27530)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-26 17:55:51 +08:00
be7b55a83d [Doc] Remove Molmo warning (#27527)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-26 16:22:52 +08:00
315b860abe [bugfix]fix empty prompts for async-engine mode in benchmark throughput (#27494)
Signed-off-by: Lucia Fang <fanglu@fb.com>
2025-10-26 08:16:35 +00:00
87c41c26ad [Bugfix] Fix processor initialization for model from modelscope instead of HF (#27461)
Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-10-26 07:44:31 +00:00
65d2cf9511 [BUGFIX][ROCM] ViT FlashAttention on ROCm (no GFX9) and contiguous on qwen3vl ROCm TORCH_SDPA (#27190)
Signed-off-by: JartX <sagformas@epdcenter.es>
Co-authored-by: tjtanaa <tunjian.tan@embeddedllm.com>
2025-10-26 15:08:52 +08:00
d63cd9ff10 [CI/Build] Use CPU for mm processing test on CI (#27522)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-10-26 13:09:18 +08:00
66a168a197 [CI/Build] Refactor processing tests (#27470)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-10-25 16:14:30 +00:00
a99564ac5b [Attention] Add missing kv cache scale setup (#27490)
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
2025-10-25 00:12:49 -07:00
4c5f632165 [Misc] Simplify max tokens in multimodal registry (#27500)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-24 23:56:01 -07:00
b853540388 [Core][Hybrid allocator + kv connector 1/n] Enable hybrid allocator + KV cache connector (#25712)
Signed-off-by: KuntaiDu <kuntai@uchicago.edu>
Signed-off-by: Kuntai Du <kuntai@uchicago.edu>
2025-10-24 23:34:18 -07:00
56ed7609a9 Revert "[Misc] Remove use of CUDA_VISIBLE_DEVICES for device selectio… (#27502) 2025-10-25 05:31:43 +00:00
29c9cb8007 [CI] Add tests for cudagraph (#27391)
Signed-off-by: zjy0516 <riverclouds.zhu@qq.com>
2025-10-25 02:37:33 +00:00
83f478bb19 [KVConnector] Migrate the LMCache integration code to be vLLM native (#25542)
Signed-off-by: ApostaC <yihua98@uchicago.edu>
2025-10-25 00:23:53 +00:00
269c4db0a4 [Misc][DP] Guard mxfp4 implementation selection (#27484)
Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
2025-10-24 23:29:24 +00:00
52efc34ebf [Log] Optimize Startup Log (#26740)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-10-24 19:27:04 -04:00
d95d0f4b98 [Distributed] Basic set of configuration for large EP deployment on GB200 (#27328)
Signed-off-by: Pengchao Wang <wpc@fb.com>
Co-authored-by: Lu Fang <30275821+houseroad@users.noreply.github.com>
2025-10-24 14:16:44 -07:00
0402428200 [Perf][Async Scheduling] Remove CPU->GPU sync in dummy_run (#27455)
Signed-off-by: Lehua Ding <lehuading@tencent.com>
2025-10-24 20:45:36 +00:00
17af6aa0da [Document] Add ms-swift library to rlhf.md (#27469) 2025-10-24 20:31:50 +00:00
fc168c33f3 [CI/Build] Fix test_torch_utils in AMD CI (#27317)
Signed-off-by: zhewenli <zhewenli@meta.com>
2025-10-24 12:26:00 -07:00
acc78aeb88 [Bugfix] Fix interns1-vit qk norm code path (#27480)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-10-24 17:43:45 +00:00
0f67d4d962 [Attention] Add MLA prefill backend: trtllm_ragged_attention_deepseek (#26397)
Signed-off-by: Ming Yang <minos.future@gmail.com>
2025-10-24 10:24:08 -07:00
7e1d697b56 [Bugfix] Fix MultiConnector stats reconstruction across process boundaries (#27366)
Signed-off-by: Kourosh Hakhamaneshi <Kourosh@anyscale.com>
Co-authored-by: Nicolò Lucchesi <nlucches@redhat.com>
2025-10-24 17:08:05 +00:00
699d62e6cf [NIXL][BUGFIX] delay done_recving queue cleanup to bottom of get_finished (#27297)
Signed-off-by: Chendi Xue <chendi.xue@intel.com>
2025-10-24 17:01:41 +00:00
cd390b609d [compile] Turn standalone_compile back on (#27460)
Signed-off-by: Richard Zou <zou3519@gmail.com>
2025-10-24 16:30:27 +00:00
2080b05099 [cpu][fix] Fix onednn_mm crash on consecutive matmuls with same M,K,N and different dtype (#27472)
Signed-off-by: Fadi Arafeh <fadi.arafeh@arm.com>
2025-10-24 15:57:48 +00:00
6454afec90 [Doc] Fix minor issues in docs/design/metrics.md (#27436)
Signed-off-by: Lifan Shen <lifans@meta.com>
2025-10-24 05:40:54 -07:00
41a62564a7 Fix test named tool use (#27458)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-10-24 20:27:45 +08:00
284cc92275 [MISC] cudagraph_capture_sizes related improvements (#26016)
Signed-off-by: fhl <2410591650@qq.com>
Signed-off-by: fhl2000 <63384265+fhl2000@users.noreply.github.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-24 05:11:05 -07:00
435be10db9 Fix AArch64 CPU Docker pipeline (#27331)
Signed-off-by: Ioana Ghiban <ioana.ghiban@arm.com>
2025-10-24 05:11:01 -07:00
b7030d962b [Benchmark] Enable benchmark to run with encoding_format="bytes" (#27467)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-24 11:16:50 +00:00
3567816932 [Refactor] move tool parsing logic from protocol.py to the tool parser (#27383)
Co-authored-by: Aaron Pham <contact@aarnphm.xyz>
2025-10-24 09:53:23 +00:00
e0ef8a2920 [BugFix] Fix torchrun DP with LLM class (#27395)
Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
2025-10-24 08:11:37 +00:00
42efe609ba [MM][Bugfix] Replace PatchEmbed's conv3d to linear layer (#27418)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: Roger Wang <hey@rogerw.io>
2025-10-24 07:32:47 +00:00
88d3141ec6 [Docs] remove v1 column for embedding models (#27446)
Signed-off-by: piood <2477084691@qq.com>
Signed-off-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-10-23 23:55:03 -07:00
09a6a49eaf [Misc] Avoid "PyTorch non-writable tensors" warning in RayPPCommunicator (#27443)
Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
2025-10-24 14:53:09 +08:00
074475541a [Bugfix] Fix Pydantic union resolution for ResponseFunctionToolCall in Responses API (#26706)
Signed-off-by: Shai Trinczer <strinczer@icloud.com>
Co-authored-by: Chauncey <chaunceyjiang@gmail.com>
Co-authored-by: Ye (Charlotte) Qi <yeq@meta.com>
2025-10-23 22:53:42 -07:00
d4c574c39f [Chore] remove structural tags logging lines (#27451) 2025-10-24 05:35:45 +00:00
c528b9006a Fix EventPublisherFactory logic for disabled KV cache events (#27419)
Signed-off-by: Bradley <bradley.b.pitt@gmail.com>
2025-10-24 05:00:01 +00:00
85fee74b33 [Bugfix][CI] Move resolving cudagraph_mode before initializing attn_metadata_builder (#27427)
Signed-off-by: fhl2000 <63384265+fhl2000@users.noreply.github.com>
2025-10-23 20:31:14 -07:00
8dbe0c527f [Misc] Add TPU usage report when using tpu_inference. (#27423)
Signed-off-by: Hongmin Fan <fanhongmin@google.com>
2025-10-23 20:29:37 -07:00
5cc6bddb6e [Kernel] Add GPTQv2 format support for low-bit or asymmetric quantization, by adapting gptq_gemm (#26092) 2025-10-23 23:26:13 -04:00
1f9460c4c1 Fix pooling adapters for Transformers backend (#27338)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-23 20:23:55 -07:00
70022ffc00 Granite 4.0 quark quantization support (#26944)
Signed-off-by: Xiao YU <Xiao.YU@xilinx.com>
Signed-off-by: Xiao Yu <xiao.yu.dc@outlook.com>
Co-authored-by: Xiao YU <Xiao.YU@xilinx.com>
2025-10-24 02:14:03 +00:00
f417746ad7 [Hardware][POWERPC] Disable oneDNN path in vllm/model_executor/layers/utils.py for Powerpc (#27422)
Signed-off-by: Akash Kaothalkar <akash.kaothalkar@ibm.com>
Co-authored-by: Akash Kaothalkar <akash.kaothalkar@ibm.com>
2025-10-23 21:21:36 +00:00
0552cfb195 [Model] Siglip Embedding Support (#27324)
Signed-off-by: piood <2477084691@qq.com>
2025-10-23 20:19:48 +00:00
51dd14ac2b [Bugfix][DP] Fix creating too many DP Placement Groups (#26880)
Signed-off-by: Kebe <mail@kebe7jun.com>
Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
Co-authored-by: Rui Qiao <ruisearch42@gmail.com>
2025-10-23 20:16:51 +00:00
dbfbf9f324 [Attention] Fix FlashMLA metadata builder arguments for q_len > 1 (#27368)
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
2025-10-23 15:58:15 -04:00
ca76486a16 [Chore] Separate out vllm.utils.platform_utils.py (#27374)
Signed-off-by: Jonathan <chenleejonathan@gmail.com>
2025-10-23 19:08:06 +00:00
a9f55dc588 [Misc] Add triton_kernels dependency (#27370)
Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
2025-10-23 12:04:14 -07:00
81d5bb765a [Bugfix] Fix AWQ marlin layer skipping (#27416)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-10-23 18:30:28 +00:00
0825197bee [Bugfix][ROCm][DeepSeek] Fix for forward_hip in rope for DeepSeek (#27373)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2025-10-23 17:43:53 +00:00
9ef3d5b875 [Bugfix] Fix dp_chunking enablement logic in FusedMoE layer (#27220)
Signed-off-by: Alexander Matveev <amatveev@redhat.com>
2025-10-24 00:03:14 +08:00
295c7f0267 Mirroring the test definitions (2025-10-22) (#27362)
Signed-off-by: Alexei V. Ivanov <alexei.ivanov@amd.com>
2025-10-24 00:02:26 +08:00
3fa2c12185 [Frontend][4/N] Improve all pooling task | Add plugin pooling task (#26973)
Signed-off-by: wang.yuqi <noooop@126.com>
Signed-off-by: Christian Pinto <christian.pinto@ibm.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Christian Pinto <christian.pinto@ibm.com>
2025-10-23 14:46:18 +00:00
fe2016de2d [CI/Build] Remove unnecessary flags from test registry (#27353)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-23 14:42:40 +00:00
237cf6d32a [Misc] Remove use of CUDA_VISIBLE_DEVICES for device selection (fix DP slow startup time &c) (#26709)
Signed-off-by: ilmarkov <markovilya197@gmail.com>
Co-authored-by: Tyler Michael Smith <tlrmchlsmth@gmail.com>
2025-10-23 20:58:39 +08:00
faee3ccdc2 [Feature] Pydantic validation for speculative.py (#27156)
Signed-off-by: Navya Srivastava <navya.srivastava1707@gmail.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-23 12:19:33 +00:00
570c3e1cd4 [Bugfix] Honor --mm_encoder_attn_backend when used (#27124)
Co-authored-by: Bradley D <4551889+bradleyhd@users.noreply.github.com>
Co-authored-by: Roger Wang <hey@rogerw.io>
2025-10-23 20:09:52 +08:00
3a4255c7c4 Run mypy on the lowest supported Python version instead of system Python (#27048)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-23 05:07:44 -07:00
61089465a6 [Model] Add MoE support for NemotronH (#25863)
Signed-off-by: Tomer Asida <57313761+tomeras91@users.noreply.github.com>
2025-10-23 10:27:23 +00:00
88afa11010 [Metrics] [KVConnector] Add connector prefix cache hit rate stats (#26245)
Signed-off-by: tovam <tovam@pliops.com>
2025-10-23 12:21:08 +02:00
d00ce29d89 [CI] Reorganize entrypoints tests (#27403)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-10-23 10:10:06 +00:00
3b7bdf983b add SLA information into comparison graph for vLLM Benchmark Suite (#25525)
Signed-off-by: Tsai, Louie <louie.tsai@intel.com>
Signed-off-by: louie-tsai <louie.tsai@intel.com>
Signed-off-by: Louie Tsai <louie.tsai@intel.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-10-23 08:04:59 +00:00
50b788a17a [CI/Build] Fix AMD CI: test_cpu_gpu.py (#27388)
Signed-off-by: zhewenli <zhewenli@meta.com>
2025-10-23 07:55:00 +00:00
fc059c7061 [Bugfix] Fix args settings for guided decoding args (#27375)
Signed-off-by: Lucia Fang <fanglu@fb.com>
2025-10-23 07:34:06 +00:00
bfb240cc49 [CI/Build] Fix Prithvi plugin test (#27393)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-23 07:30:44 +00:00
e255d92990 [Chore] Remove duplicate has_ functions in vllm.utils (#27372)
Signed-off-by: Jonathan <chenleejonathan@gmail.com>
2025-10-23 06:11:59 +00:00
3729ed00ba [Model] Add num_cached_tokens for PoolingRequestOutput (#27378)
Signed-off-by: wang.yuqi <noooop@126.com>
2025-10-23 14:03:42 +08:00
6644796bf4 [V1][spec decode] return logprobs for spec decoding (#26060)
Signed-off-by: Giancarlo Delfin <gdelfin@meta.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
2025-10-22 22:59:59 -07:00
ff93cc8c84 [CORE] Support Prefix Caching with Prompt Embeds (#27219)
Signed-off-by: Andrew Sansom <andrew@protopia.ai>
2025-10-22 22:18:07 -07:00
243ed7d32e [Bugfix][Core] running queue index leakage exception (#26754)
Signed-off-by: CLFutureX <chenyongqyl@163.com>
2025-10-22 21:40:12 -07:00
7e0941055f [Bugfix] Fix incorrect kv cache metrics in grafana.json (#27133)
Signed-off-by: Fangping Shi <fangping_shi@apple.com>
Co-authored-by: Fangping Shi <fangping_shi@apple.com>
2025-10-22 20:58:36 -07:00
6738e4a093 [Bugfix] Fix SLA tuner initialization (#27355)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-22 20:43:04 -07:00
2566dca2a9 [Bugfix] Fix deepseek-ocr multi-image inference and add merge_by_field_config=True with tensor schema support (#27361)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-10-22 17:15:38 -07:00
b4fda58a2d [MLA] Bump FlashMLA (#27354)
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
2025-10-22 15:48:37 -07:00
a0003b56b0 [Chore] Separate out system utilities from vllm.utils (#27201)
Signed-off-by: dongbo910220 <1275604947@qq.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2025-10-22 20:25:25 +00:00
5beacce2ea [BugFix] bugfix for Flash Attention MLA with full cuda graph IMA following pr-25490 (#27128)
Signed-off-by: qqma <qqma@amazon.com>
Co-authored-by: qqma <qqma@amazon.com>
2025-10-22 19:36:39 +00:00
8669c69afa [Feature] publisher default set zmq in kv_event config (#26915)
Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2025-10-22 19:19:33 +00:00
1651003c35 [Prefix Cache] Use LoRA name for consistent KV-cache block hashing (#27211)
Signed-off-by: Sage Ahrac <sagiahrak@gmail.com>
2025-10-22 18:13:03 +00:00
1cb8c6c5fe [Doc] Fix numbering sequence in prefix caching (#27357)
Signed-off-by: William Song <jinwook@umich.edu>
2025-10-22 17:35:47 +00:00
e05a6754a8 [Model] Revert PR #26715: Restore custom PaliGemma and Gemma3-MM impl… (#27309)
Signed-off-by: Luciano Martins <lucianommartins@users.noreply.github.com>
Co-authored-by: Luciano Martins <lucianommartins@users.noreply.github.com>
2025-10-22 10:05:34 -07:00
084a9dae80 [Bugfix] Disable FlexAttention direct block mask building for encoder-only models (#27344)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-10-22 16:39:08 +00:00
RED
c9461e05a4 Support Anthropic API /v1/messages Endpoint (#22627)
Signed-off-by: liuli <ll407707@alibaba-inc.com>
Co-authored-by: liuli <ll407707@alibaba-inc.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2025-10-22 09:13:18 -07:00
4dfdb821c8 [P/D] Dynamic kv_output_aggregator collect size (#26734)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-10-22 18:07:58 +02:00
58fab50d82 [Frontend] Require flag for loading text and image embeds (#27204)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-22 15:52:02 +00:00
db6f28d898 [Bugfix] Fix HF format InternVL large variants video processing (#27330)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-10-22 08:39:23 -07:00
14e2f1231e [Bugfix] Make get_mrope_input_positions instance methods (#27342)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-22 08:38:34 -07:00
7c4767f1eb [NIXL] use Host buffer to support TP_ratio > 1 for XPU (#27140)
Signed-off-by: Chendi Xue <chendi.xue@intel.com>
Signed-off-by: Chendi.Xue <chendi.xue@intel.com>
Co-authored-by: Nicolò Lucchesi <nicolo.lucchesi@gmail.com>
2025-10-22 15:28:13 +00:00
9771e0b432 [Bugfix] Add missing 'is_internal_router' attribute to FusedMoEWithLoRA (#27351)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-10-22 08:19:12 -07:00
980de31ca0 [bugfix] remove unused parameters to reduce unnecessary vram usage (#26789)
Signed-off-by: Reinforce-II <fate@eastal.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
2025-10-22 08:16:09 -07:00
1c160841ea [Bug] Fix DeepSeek-V2.5-1210-FP8 issue (#27267)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-10-22 11:00:10 -04:00
4ca13a8667 [NIXL] Terminate handshake listener thread in shutdown (#26404)
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
2025-10-22 16:59:53 +02:00
675aa2ec64 [Model] Upstream Deepseek-OCR model (#27247)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Signed-off-by: Roger Wang <hey@rogerw.io>
Co-authored-by: Roger Wang <hey@rogerw.io>
2025-10-22 07:59:15 -07:00
3ae082c373 [Chore] Separate out optional dependency checks from vllm.utils (#27207)
Signed-off-by: dongbo910220 <1275604947@qq.com>
Signed-off-by: dongbo910220 <32610838+dongbo910220@users.noreply.github.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2025-10-22 10:44:21 -04:00
49c00fe304 Mirroring changes in test-pipeline.yaml into test-amd.yaml (#27242)
Signed-off-by: Alexei V. Ivanov <alexei.ivanov@amd.com>
2025-10-22 09:59:45 -04:00
141d3b9fc5 [docs] Update v1 metrics design doc (#27332)
Signed-off-by: Simon Mo <simon.mo@hey.com>
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
Signed-off-by: atalhens <sneh.lata@nutanix.com>
Co-authored-by: Simon Mo <simon.mo@hey.com>
Co-authored-by: atalhens <sneh.lata@nutanix.com>
2025-10-22 06:29:15 -07:00
abf3db40ef [Core] Handle MoE LoRA edge cases (#27335)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-10-22 13:14:33 +00:00
8e4ca4d14e Bugfix - pass 'max_num_tokens_padded' into 'moe_lora_align_block_size' (#27311)
Signed-off-by: gnovack <gnovack@amazon.com>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
2025-10-22 12:23:57 +00:00
1a0f4defb7 [Log] Add Warning for LLM(data_parallel_size=k) single-process DP Usage (#27282)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-10-22 12:12:21 +00:00
843af7f7fc [Bugfix][CPU] Disable dual stream execution for experts on CPU (#27320)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
2025-10-22 11:02:27 +00:00
1f633b8632 [Frontend][3/N] Improve all pooling task | Support binary embedding response (#27066)
Signed-off-by: wang.yuqi <noooop@126.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-10-22 18:38:57 +08:00
a4c29e6e82 fixed reasoning streaming with tool_choice="required" (#24108)
Signed-off-by: CNE Pierre FICHEPOIL <pierre-1.fichepoil@gendarmerie.interieur.gouv.fr>
Signed-off-by: ExtReMLapin <3909752+ExtReMLapin@users.noreply.github.com>
Co-authored-by: CNE Pierre FICHEPOIL <pierre-1.fichepoil@gendarmerie.interieur.gouv.fr>
Co-authored-by: Chauncey <chaunceyjiang@gmail.com>
2025-10-22 09:42:55 +00:00
8f18feb191 Remove last level references not removed in #26355 (#27260)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-22 09:18:17 +00:00
ed540d6d4c Update release pipeline for PyTorch 2.9.0 (#27303)
Signed-off-by: Huy Do <huydhn@gmail.com>
2025-10-22 09:18:01 +00:00
f6027b2855 [1/N][Platform] Cleanup useless function (#26982)
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-10-22 09:04:57 +00:00
ab3e80042e [torch.compile] Enable silu_mul_fp8_quant fusion without custom ops enabled (#27146)
Signed-off-by: zjy0516 <riverclouds.zhu@qq.com>
2025-10-22 00:22:39 -04:00
ceacedc1f9 [Benchmark] Add plot utility for parameter sweep (#27168)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-21 20:30:03 -07:00
bfa59be8f1 [CI] Nixl integration tests DP-EP (#27199)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-10-22 11:17:48 +08:00
265ecb05fb [DOC] [ROCm] Add ROCm quickstart guide (#26505)
Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
2025-10-22 03:10:48 +00:00
09a7e6f617 [Deepseek v3.2] Remove extra logics in indexer (#26465)
Signed-off-by: Siyuan Fu <siyuanf@nvidia.com>
Signed-off-by: Daniel Campora <961215+dcampora@users.noreply.github.com>
Signed-off-by: Lain <siyuanf@nvidia.com>
Co-authored-by: Daniel Campora <961215+dcampora@users.noreply.github.com>
2025-10-21 23:34:03 +00:00
6c2eef5a5d [P/D] KVConnector for decode benchmarking (#25986)
Signed-off-by: Tyler Michael Smith <tlrmchlsmth@gmail.com>
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
2025-10-21 16:30:47 -07:00
19748806f0 [Bugfix] skip cuda graph for drafter when running with eager (#26821)
Signed-off-by: Benjamin Chislett <bchislett@nvidia.com>
2025-10-21 15:39:09 -07:00
4a8a567e16 Updated xgrammar backend to not deny supported string formats (#27253)
Signed-off-by: CNE Pierre FICHEPOIL <pierre-1.fichepoil@gendarmerie.interieur.gouv.fr>
Signed-off-by: ExtReMLapin <3909752+ExtReMLapin@users.noreply.github.com>
Co-authored-by: CNE Pierre FICHEPOIL <pierre-1.fichepoil@gendarmerie.interieur.gouv.fr>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-10-21 22:25:23 +00:00
344a0017c0 [Performance] Dual stream execution of "shared_experts" and "selected_experts" inside FusedMoE (#26440)
Signed-off-by: Alexander Matveev <amatveev@redhat.com>
2025-10-21 21:38:29 +00:00
becb7de40b Update PyTorch to 2.9.0+cu129 (#24994)
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
2025-10-21 17:20:18 -04:00
250fb1b8ea [Bugfix] fixes the decoding metadata of dense mla's fp8 kvcache. (#27144)
Signed-off-by: Tao He <linzhu.ht@alibaba-inc.com>
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
Co-authored-by: Lucas Wilkinson <lwilkins@redhat.com>
2025-10-21 18:27:03 +00:00
647214f3d5 [V0 Deprecation] Remove V0 executors (#27142)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-10-21 11:09:37 -07:00
ddeec11ba9 [Bugfix][P/D] Reduce num_threads used by nixl ucx backend (#27196)
Signed-off-by: David Whyte-Gray <40244437+dagrayvid@users.noreply.github.com>
2025-10-21 13:41:52 -04:00
86ed77022d [Feature] Batch Invariant for R1 TP 8 on Blackwell (#27229)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-10-21 10:25:55 -07:00
aa1356ec53 [ROCm] Update Triton, Torch, and AITER branches for ROCm base Dockerfile (#27206)
Signed-off-by: Micah Williamson <micah.williamson@amd.com>
2025-10-21 12:01:23 -04:00
ecc3c0940a Add @pavanimajety to .github/codeowners for Flashinfer, ModelOpt related code (#27213)
Signed-off-by: Pavani Majety <pmajety@nvidia.com>
2025-10-21 22:59:53 +08:00
ba09652de2 [ROCM] Enable CompressedTensorsWNA16 (#27187)
Signed-off-by: JartX <sagformas@epdcenter.es>
2025-10-21 10:43:23 -04:00
bd66b8529b [CI] Install pre-release version of apache-tvm-ffi for flashinfer (#27262)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-21 14:23:56 +00:00
6c728f7771 [Chore] Separate out NCCL utilities from vllm.utils (#27197)
Signed-off-by: dongbo910220 <1275604947@qq.com>
2025-10-21 06:18:23 -07:00
80e9452984 [Deepseek v3.2] Optimize top_k_per_row (#26763)
Signed-off-by: Daniel Campora <961215+dcampora@users.noreply.github.com>
2025-10-21 08:30:07 +00:00
c3a2c6ac5f [MM][Core] Decouple ViT backend from LM backend (#27061)
Signed-off-by: Roger Wang <hey@rogerw.io>
2025-10-21 00:30:10 -07:00
72f431e709 [Nixl] Minor refactor to handshake related metadata (#26410)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-10-21 09:07:47 +02:00
be4445072c [Fix][Spec Decode] Fix llama4 draft loading with different quantization (#27136)
Signed-off-by: linzebing <linzebing1995@gmail.com>
2025-10-20 23:19:00 -07:00
f381cf2302 [Bugfix] Fix broken MTP weight loading for FP8 KV Scales (#27227)
Signed-off-by: Benjamin Chislett <bchislett@nvidia.com>
2025-10-20 22:51:44 -07:00
5ff5d94e77 [Bugfix] Fix gpt-oss w4a8 DP/EP on B200 (#26729)
Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2025-10-21 01:51:14 -04:00
f95da13c3d [ModelOpt] Load w13/w2_input_scale for all experts, nvfp4 (#26135)
Signed-off-by: Shu Wang <shuw@nvidia.com>
Signed-off-by: Shu Wang. <shuw@nvidia.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2025-10-21 01:50:31 -04:00
aef368aa08 [BugFix] GPT-OSS Attention DP + MoE TP weight loading issue (#24032)
Signed-off-by: Po-Han Huang <pohanh@nvidia.com>
2025-10-21 04:03:47 +00:00
5f6cbf60d6 [Feature][Kernel]FusedMoE LoRA (#21229)
Signed-off-by: wuchen <cntryroa@gmail.com>
Signed-off-by: banjuede <lmklhc@163.com>
Signed-off-by: Chen Wu <cntryroa@gmail.com>
Signed-off-by: Danielle Robinson <dmmaddix@amazon.com>
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Signed-off-by: bk-201 <joy25810@foxmail.com>
Co-authored-by: wuchen <wuchen@zetyun.com>
Co-authored-by: Nathan Van Gheem <vangheem@gmail.com>
Co-authored-by: banjuede <lmklhc@163.com>
Co-authored-by: Danielle Robinson <dmmaddix@amazon.com>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: bk-201 <joy25810@foxmail.com>
2025-10-21 03:01:37 +00:00
3ada34f9cb [Frontend] Enforce tokenize=False when applying chat template (#27205)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-10-21 02:57:34 +00:00
0eb8f2b880 create is_in_the_same_node on cpu (#26832)
Co-authored-by: Lunwen He <lunwenh@meta.com>
2025-10-21 02:04:14 +00:00
163965d183 [cpu] Dispatch un-quantized linear to oneDNN/ACL by default for AArch64 (#27183)
Signed-off-by: Fadi Arafeh <fadi.arafeh@arm.com>
Co-authored-by: Michael Yang <Michael.Yang@arm.com>
2025-10-21 02:02:58 +00:00
a03cf9bc70 [V0 Deprecation] Remove V0 metrics code (#27215)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-10-21 02:02:10 +00:00
352c0c8a28 [Quantization] Automatically infer AWQ modules_to_not_convert field (#26909)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-10-21 01:49:28 +00:00
bfe0b4bd2a [ez] add uv lock to gitignore (#27212)
Signed-off-by: Andrew Xia <axia@fb.com>
Co-authored-by: Andrew Xia <axia@fb.com>
2025-10-21 00:37:44 +00:00
58fbbcb2f5 [ROCm] enable some tests in entrypoints test groups on AMD (#26725)
Signed-off-by: Yida <yida.wu@amd.com>
2025-10-21 00:37:16 +00:00
87778d5f00 [Feature][Quantization] auto_round support for mixed bits quantization (#23812)
Signed-off-by: n1ck-guo <heng.guo@intel.com>
Signed-off-by: Heng Guo <heng.guo@intel.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-10-20 22:23:30 +00:00
f9e7ad5400 [Bugfix][CI] Fix Distributed Tests (4 GPUs) async_sched+ray test (#27195)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-10-20 16:34:54 +00:00
4d0f266113 [Kernel][Model] Tune fused_moe Triton configs for Qwen3-30B A3/A3B on H100 (FP8/BF16) (#26268)
Signed-off-by: Shivam <shivampr.dev@gmail.com>
2025-10-20 07:48:01 -07:00
e93ff6c8b9 Nemotron Nano V2 VL + EVS Video Support (#27107)
Signed-off-by: Eugene Khvedchenia <ekhvedchenia@nvidia.com>
Signed-off-by: Natan Bagrov <nbagrov@nvidia.com>
Signed-off-by: Roger Wang <hey@rogerw.io>
Co-authored-by: Natan Bagrov <nbagrov@nvidia.com>
Co-authored-by: Roger Wang <hey@rogerw.io>
2025-10-20 22:19:11 +08:00
1c691f4a71 AArch64 CPU Docker pipeline (#26931) 2025-10-20 07:09:40 -04:00
9fce7bee74 [Kernel] Accelerate solve_tril with TMA (#26746)
Signed-off-by: zjy0516 <riverclouds.zhu@qq.com>
2025-10-20 05:39:02 +00:00
b63f2143f8 [LoRA] LoRA cuda graph specialization (#25914)
Signed-off-by: Andy Lo <andy@mistral.ai>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
2025-10-20 04:21:09 +00:00
f32bf7582e [Model][VLM] Support Bee-8B Model (#27012)
Signed-off-by: uyzhang <yi.zhang.4096@gmail.com>
Signed-off-by: Yi Zhang <zhangyi970819@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Roger Wang <hey@rogerw.io>
2025-10-20 02:31:26 +00:00
8a81d776ce Fix typo in ValueError message: use kv_role instead of kv_disagg_role (#27166)
Signed-off-by: Yongtao Huang <yongtaoh2022@gmail.com>
2025-10-19 19:47:19 +00:00
f6fdacd82c [Bugfix] Fix error with penalties when speculative decoding and structural output are enabled (#26586)
Signed-off-by: southfreebird <yvorott@gmail.com>
2025-10-19 19:24:46 +00:00
d31f7844f8 [Misc] Move utils to avoid conflicts with stdlib, and move tests (#27169)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-19 05:20:55 -07:00
7a6c8c3fa1 [Chore] Separate out vllm.utils.network_utils (#27164)
Signed-off-by: iAmir97 <Amir.balwel@embeddedllm.com>
Co-authored-by: iAmir97 <Amir.balwel@embeddedllm.com>
2025-10-19 03:06:32 -07:00
221bf72577 output type conversion fix (#27159) 2025-10-19 08:10:07 +00:00
b3aba04e5a [Benchmark] Convenience script for multiple parameter combinations (#27085)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-18 23:57:01 -07:00
8a297115e2 [Chore] Separate out hashing utilities from vllm.utils (#27151)
Signed-off-by: dongbo910220 <1275604947@qq.com>
2025-10-19 11:09:38 +08:00
191eed0bb9 [BugFix] Fix lazy imports involving outlines_core (#27158)
Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
2025-10-19 02:35:32 +00:00
fb860670da [Minor] Remove unused env variable (#27161) 2025-10-18 18:48:35 -07:00
83e760c57d [V1][Metrics][Plugin] Add plugin support for custom StatLoggerBase implementations (#22456)
Signed-off-by: tovam <tovam@pliops.com>
2025-10-18 15:12:46 -07:00
c2bba69065 [BugFix] Disable fp8 kv-cache by default for DeepSeek V3.2 (#27121)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
Signed-off-by: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-10-18 22:05:23 +00:00
e133d6d218 [BugFix] fix graph partition signature (#27139)
Signed-off-by: Boyuan Feng <boyuan@meta.com>
2025-10-18 17:34:36 -04:00
a1946c9f61 [Chore] Separate out profiling utilities from vllm.utils (#27150)
Signed-off-by: dongbo910220 <1275604947@qq.com>
2025-10-18 19:12:01 +00:00
9f020f4f31 [BugFix] Fix failing gemma-3-1b-it test: test_lm_eval_accuracy_v1_engine[google/gemma-3-1b-it] (#27111)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
2025-10-18 12:44:39 -06:00
3b45075206 [Minor] Add some clarifying comments to recent changes (#27130)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-10-18 09:52:45 -07:00
168e578efc Fix incorrect string formatting in barrier timeout exceptions (#27149)
Signed-off-by: Yongtao Huang <yongtaoh2022@gmail.com>
2025-10-18 09:51:57 -07:00
6ac5e06f7c [Chore] Clean up pytorch helper functions in vllm.utils (#26908)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Signed-off-by: isotr0py <2037008807@qq.com>
2025-10-18 09:48:22 -07:00
5c2acb270a [Models][QwenVL] Remove unnecessary .contiguous() calls (#27106)
Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>
2025-10-18 07:05:05 -07:00
b26b70bec4 [Misc] Refactor get_kv_cache_spec into AttentionLayerBase (#26587)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-10-18 13:51:21 +00:00
ab4be40fc5 [fix][cpu] fix prefill attention in CPU attention backend (#27035)
Signed-off-by: Fadi Arafeh <fadi.arafeh@arm.com>
2025-10-18 13:30:21 +00:00
245e4f2c01 [Feature] Batch Invariant: Support DeepGEMM and Blackwell (#27127)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-10-18 09:28:05 -04:00
1d165d6d85 [Chore] Separate out vllm.utils.mem_utils (#27143)
Signed-off-by: iAmir97 <Amir.balwel@embeddedllm.com>
Signed-off-by: iAmir97 <71513472+iAmir97@users.noreply.github.com>
Co-authored-by: iAmir97 <Amir.balwel@embeddedllm.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-10-18 10:06:59 +00:00
83004020fd [Test] Add test for /health endpoint on engine failure (#26074)
Signed-off-by: dongbo910220 <1275604947@qq.com>
2025-10-18 09:59:05 +00:00
12e21701e7 [DOC][FEATURES][CPU]update cpu feature for v1 (#27135)
Signed-off-by: Chendi Xue <chendi.xue@intel.com>
2025-10-18 01:10:45 -07:00
30a33b92ee [Misc] Rev DeepEP (#27122)
Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
2025-10-18 14:54:29 +08:00
7c572544e4 [GPT-OSS] Structure_Tag support for gpt-oss tool-call in cot (#25515)
Signed-off-by: Hanchenli <lihanc2002@gmail.com>
Signed-off-by: Hanchenli <61769611+Hanchenli@users.noreply.github.com>
Signed-off-by: Wei Wei <wwei6@meta.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Wei Wei <wwei6@meta.com>
Co-authored-by: Wei Wei <weiweinpu@gmail.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2025-10-17 21:55:54 -07:00
c312320764 [CI/Build] tests(v1): feed Triton attention the (num_blocks, 2, …) KV cache layout in backend-correctness tests (#26663)
Signed-off-by: Huamin Li <3ericli@gmail.com>
Co-authored-by: Ye (Charlotte) Qi <yeq@meta.com>
2025-10-17 21:11:26 -07:00
c981f0ea78 [Perf] Add H100 fused MoE config (#25398)
Signed-off-by: zitian.zhao <zitian.zhao@tencentmusic.com>
2025-10-18 02:21:27 +00:00
6367bde739 [BugFix][Core] Fix error when enable async-scheduling in multi-node env (#25887)
Signed-off-by: Lehua Ding <lehuading@tencent.com>
Signed-off-by: Lehua Ding <lehuading@qq.com>
Co-authored-by: Benjamin Chislett <chislett.ben@gmail.com>
2025-10-17 22:16:18 +00:00
f50cc221ea [Test] Make test_failure more stable for batch invariance (#27054) 2025-10-17 16:59:08 -04:00
acedc74b1a [V1][Spec Decode] Fix greedy temperature detection after sampler refactor (#27077)
Signed-off-by: Pradyun Ramadorai <pradyunr@amazon.com>
Co-authored-by: Pradyun Ramadorai <pradyunr@amazon.com>
2025-10-17 13:27:47 -07:00
d29483b58a [Minor] Remove unnecessary error message (#27115)
Signed-off-by: Zhuohan Li <zhuohan123@gmail.com>
2025-10-17 20:02:12 +00:00
950cf9e58e [Bugfix] Use PIECEWISE cudagraphs on Blackwell if max_model_len > 131072 (#27114)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-10-17 19:47:18 +00:00
3125d79950 [Chore] Remove unused PolyNorm layer (#27110)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-10-17 19:03:43 +00:00
e33ee23ee3 [Bugfix] [AITER] [ROCm] Fix Quark MoE Quant Config and AITER Fused MoE quant type logic (#27029)
Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
2025-10-17 12:51:10 -06:00
b10c64c834 [ROCm][Bugfix][Model] Fix illegal memory access when running qwen3_moe models with rms_norm (Qwen3-235B-A22B, Qwen3-30B-A3B, etc.) (#26192)
Signed-off-by: Randall Smith <ransmith@amd.com>
Signed-off-by: Randall Smith <Randall.Smith@amd.com>
Signed-off-by: rasmith <Randall.Smith@amd.com>
Co-authored-by: Randall Smith <ransmith@amd.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
2025-10-17 14:17:18 -04:00
0925b28a8e [ROCM] MoE fp4 CK kernel (#26545)
Signed-off-by: Aleksandr Malyshev <maleksan@amd.com>
Co-authored-by: Aleksandr Malyshev <maleksan@amd.com>
2025-10-17 14:06:33 -04:00
99722d5f0e [CI] Remove forbidden slash (#27112)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-10-17 09:38:00 -07:00
4c91a28e30 [bugfix] Qwen3-VL fix video incorrect timestamp calculations while do_sample_frames=True (#27104)
Co-authored-by: 松灵 <wpf272043@alibaba-inc.com>
2025-10-17 16:26:33 +00:00
b038d9c40c [Data-parallel] Allow DP>1 for world_size > num_gpus on node (8) (#26367)
Signed-off-by: Patrick von Platen <patrick.v.platen@gmail.com>
Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Rui Qiao <ruisearch42@gmail.com>
2025-10-17 08:24:42 -07:00
2ba60ec7fe [CI] Nixl integration tests (#27010)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-10-17 07:13:31 -07:00
bd7157a071 [torch.compile] Enable attention and allreduce fusion without custom ops enabled (#24604)
Signed-off-by: Luka Govedič <lgovedic@redhat.com>
Signed-off-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
2025-10-17 08:10:23 -06:00
be429d0cfd Fix incorrect docstring for stop_profile() method (#27101)
Signed-off-by: Yongtao Huang <yongtaoh2022@gmail.com>
2025-10-17 06:30:23 -07:00
c253745eb8 [Harware][AMD][Model] Triton MoE tuning configs for GLM-4.5 for MI350 and MI355 (#25586)
Signed-off-by: Reima Karhila <reima.karhila@amd.com>
Signed-off-by: xaguilar <Xavier.AguilarFruto@amd.com>
Co-authored-by: xaguilar <Xavier.AguilarFruto@amd.com>
2025-10-17 04:56:12 -07:00
daec4d2624 [Model]Improve Qwen3VLMoeForConditionalGeneration packed_modules_mapping (#27096)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-10-17 04:47:00 -07:00
6c9fdbf725 [Docs] Replace rst style double-backtick with md single-backtick (#27091)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-17 02:47:34 -07:00
483ea64611 [Docs] Replace all explicit anchors with real links (#27087)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-17 02:22:06 -07:00
e20eba753b [VLM][Refactor] Remove useless func get_input_positions in MRotaryEmbedding (#27088)
Signed-off-by: MengqingCao <cmq0113@163.com>
2025-10-17 02:00:30 -07:00
bbc1b29665 Update troubleshooting.md and remind VLLM_TRACE_FUNCTION usage (#27069)
Signed-off-by: cong-meta <prowindy@hotmail.com>
2025-10-17 01:53:06 -07:00
acb1bfa601 [CI] fix docs build failed (#27082)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-10-17 07:53:40 +00:00
75c7ad9918 [Kernel][Performance] Fuse float cast and renormalize to topk softmax kernel (#26717)
Signed-off-by: zhuhaoran <zhuhaoran.zhr@alibaba-inc.com>
Signed-off-by: izhuhaoran <izhuhaoran@qq.com>
2025-10-17 07:30:35 +00:00
5550ff9c25 [CI/Build] Update compressed tensor test path to fix CPU CI (#27068)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
2025-10-16 22:34:56 -07:00
3aeb19a39e [Model] Add support for LightOnOCR (#26916)
Signed-off-by: Said Taghadouini <taghadouinisaid@gmail.com>
Signed-off-by: Said Taghadouini <84044788+staghado@users.noreply.github.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-10-17 05:05:24 +00:00
8c017b3490 [Model] Always use Transformers backend for PaliGemma and Gemma3-MM (#26715)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-17 05:03:35 +00:00
9c2c2287a0 [CI/Build] Update Llama4 eval yaml (#27070)
Signed-off-by: zhewenli <zhewenli@meta.com>
2025-10-17 04:59:47 +00:00
fec2b341ad [Kernel] Lazy import FlashInfer (#26977) 2025-10-17 04:48:18 +00:00
87bc0c492f [Bugfix] Fix ReplicatedLinearWithLoRA (#27065)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-10-17 04:43:16 +00:00
fe3b9372ad [Core] Change execute_model_with_error_logging() to be a ctx manager (#27060)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-10-17 11:45:32 +08:00
bde9e2272a [Bugfix][Qwen] fixes the weights dtype in qwen3_next: it is actually a bfloat16 (#27030)
Signed-off-by: Tao He <linzhu.ht@alibaba-inc.com>
2025-10-17 03:37:52 +00:00
08405609cc disable graph partition in custom op (#26952)
Signed-off-by: Boyuan Feng <boyuan@meta.com>
Signed-off-by: Boyuan Feng <fby.1994@gmail.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
2025-10-17 11:08:47 +08:00
ab81379ea6 [Perf] Exploit out-of-band buffers in shm_broadcast (#26961)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-10-16 20:08:03 -07:00
4ffd6e8942 [Docs] Reduce custom syntax used in docs (#27009)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-16 20:05:34 -07:00
965c5f4914 vllm bench serve shows num of failed requests (#26478)
Signed-off-by: Tomas Ruiz <tomas.ruiz.te@gmail.com>
2025-10-16 19:55:09 -07:00
4d055ef465 Remove unused imports (#26972)
Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>
2025-10-16 19:51:17 -07:00
17c540a993 [torch.compile] fix simple inductor graph partition test (#27050)
Signed-off-by: Boyuan Feng <boyuan@meta.com>
2025-10-16 21:09:36 -04:00
4d4d6bad19 [Chore] Separate out vllm.utils.importlib (#27022)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-17 00:48:59 +00:00
11ae016bd7 [torch.compile] Passing only necessary compilation config to inductor pass config (#27041)
Signed-off-by: Lu Fang <fanglu@fb.com>
Co-authored-by: Lucia (Lu) Fang <fanglu@meta.com>
2025-10-17 00:01:52 +00:00
41d3071918 [NVIDIA] [Perf] Update to leverage flashinfer trtllm FP4 MOE throughput kernel (#26714)
Signed-off-by: jiahanc <173873397+jiahanc@users.noreply.github.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2025-10-16 16:20:25 -07:00
fb5e10d3fb Refactor Transformers backend to use mixins (#26906)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-16 21:50:39 +00:00
b2f78cbad4 [small][batch invariance] Rename the env and internal flags to simplify usage (#26855)
Signed-off-by: Bram Wasti <bwasti@meta.com>
2025-10-16 21:40:25 +00:00
23583ee28c [Bug] Add Assertion for random-input-len / random-output-len (#26834)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-10-16 21:36:39 +00:00
01c977e96d [CI] Prune Quantization Tests and skip compilation (#27038)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-10-16 17:26:35 -04:00
b3dda72c23 [Feature] Migrate DeepGEMM API from get_m_alignment_for_contiguous_layout to get_mk_alignment_for_contiguous_layout (#26935)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
Signed-off-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-10-16 16:46:48 -04:00
fb0571b077 [GPTOSS][DP/EP][Marlin] Enable GPTOSS Batched DP/EP using Marlin kernels (#25997)
Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
2025-10-16 12:53:11 -07:00
2ed8b6b3d0 [Bug] Fix batch invariant test has to is (#27032)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-10-16 19:45:14 +00:00
013abde6ef Adding Warmup to Benchmark Serving (#26943)
Signed-off-by: Kimbo Chen <chentenghung@gmail.com>
2025-10-16 12:44:32 -07:00
a5464dcf92 [Compressed Tensors] Always clone output for compile robustness (#26849)
Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2025-10-16 19:29:59 +00:00
ac3ed5a815 Support block size of 256 used by Intel HPU (#26883)
Signed-off-by: mandy-li <mandy.j.li@intel.com>
2025-10-16 15:10:57 -04:00
e6ba2000ae [gpt-oss][1/N] EZ: refactor serving_responses for modularity (#26948)
Signed-off-by: Andrew Xia <axia@meta.com>
2025-10-16 18:44:06 +00:00
aa255ff55a Support set in the CLI generation (#27031)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-16 18:07:18 +00:00
7bb736d00e Fix Qwen2.5 VL image grid docstring (#27033)
Signed-off-by: zitian zhao <zitian.zhao@tencentmusic.com>
2025-10-16 09:57:36 -07:00
9f4e30904b [Model] Fix Qwen3VL mm mapping (#27027)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-10-16 09:45:59 -07:00
5afd3276df [Feature] Add process_weights_after_loading to AttentionImpl (#26870)
Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io>
2025-10-16 08:02:30 -07:00
43721bc67f [CI] Replace large models with tiny alternatives in tests (#24057)
Signed-off-by: Tahsin Tunan <tahsintunan@gmail.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-16 15:51:27 +01:00
02d709a6f1 [docs] standardize Hugging Face env var to HF_TOKEN (deprecates HUGGING_FACE_HUB_TOKEN) (#27020)
Signed-off-by: Kay Yan <kay.yan@daocloud.io>
2025-10-16 15:31:02 +01:00
4a510ab487 [NIXL] Improve request_finished() debug logs (#25665)
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
2025-10-16 15:55:17 +02:00
314fa8abbf [Attention] Tune CUTLASS MLA num_splits (#26846)
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
2025-10-16 06:36:09 -07:00
334535b6fb [Benchmark] Show E2EL by default for pooling models (#27014)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-16 12:47:09 +00:00
dcbb3f1871 [Bugfix] Correct LayerNorm epsilon parameter in modernbert.py (#27008)
Signed-off-by: bogdanm <152898065+bogdan01m@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-10-16 12:27:44 +00:00
00417f4e44 [MISC] fix import violations for re and triton modules (#26654)
Signed-off-by: Sungjae Lee <33976427+llsj14@users.noreply.github.com>
Co-authored-by: Mengqing Cao <cmq0113@163.com>
2025-10-16 03:38:27 -07:00
ed344f4116 Cleanup code after Python 3.10 upgrade (#26520)
Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>
2025-10-16 03:38:23 -07:00
e51928793e [Model][Bugfix] fix ernie45 vl run failed from shared experts optimization (#26885)
Signed-off-by: wangyafeng <wangyafeng@baidu.com>
2025-10-16 03:37:35 -07:00
d2740fafbf [Chore] Separate out vllm.utils.collections (#26990)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-16 08:35:35 +00:00
17838e50ef [Benchmark] Use truncation by default for pooling benchmarks (#26992)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-16 16:02:39 +08:00
44c8555621 [CI/Build] Fix AMD import failures in CI (#26841)
Signed-off-by: zhewenli <zhewenli@meta.com>
2025-10-16 07:28:20 +00:00
f7d318de2b [Hardware][CPU][PowerPC]Disable torch.compile() in toptopk sampling (#26987)
Signed-off-by: Akash Kaothalkar <akash.kaothalkar@ibm.com>
Co-authored-by: Akash Kaothalkar <akash.kaothalkar@ibm.com>
2025-10-15 22:36:59 -07:00
76f0d05bc6 [CI/Build] Update expected beam search output for Phi3V (#26978)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-16 05:12:44 +00:00
7d8975de84 Deepseek-v3 Batch Invariant on 8xH100 (#26609)
Signed-off-by: Bram Wasti <bwasti@meta.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
2025-10-15 22:06:02 -07:00
785d8b6410 [PERF] Qwen3-next MTP speedup (change bool mask indexing to index_select / index_copy to reduce d2h) (#26437)
Signed-off-by: Vadim Gimpelson <vadim.gimpelson@gmail.com>
2025-10-16 12:18:31 +08:00
f6cdc9a02f [Chore] Rename utils submodules (#26920)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-16 03:58:13 +00:00
509cdc0370 [DOC][XPU]update feature parity with Intel GPU (#26954)
Signed-off-by: Chendi Xue <Chendi.Xue@intel.com>
Signed-off-by: Chendi Xue <chendi.xue@intel.com>
2025-10-15 20:07:10 -07:00
9b6504c307 [BugFix] Work around graph partition x torch.compile cache issue (#26956)
Signed-off-by: Richard Zou <zou3519@gmail.com>
2025-10-15 20:06:11 -07:00
e19b16dde6 [bugfix] Fix SP + PP without specifying compile size (#26955)
Signed-off-by: angelayi <yiangela7@gmail.com>
2025-10-15 20:05:33 -07:00
582f2c6be7 [BUG] Allow runai_streamer_sharded in config check (#26958)
Signed-off-by: ahao-anyscale <ahao@anyscale.com>
2025-10-15 20:05:14 -07:00
f8a0acbdbe [CI] Enable Blackwell Llama4 MoE tests (#26731)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-10-15 21:02:57 -06:00
1317034379 [ROCm][FEAT] Fuse DeepSeek shared experts into AITER fused_moe ops (#24097)
Signed-off-by: chenjun <junchen2@amd.com>
Signed-off-by: kliuae <kuanfu.liu@embeddedllm.com>
Co-authored-by: valarLip <103567126+valarLip@users.noreply.github.com>
Co-authored-by: TJian <tunjian.tan@embeddedllm.com>
2025-10-16 10:41:34 +08:00
0ecc553ee6 [Bugfix] reasoning_parser parameter handling in run_batch.py (#26225)
Signed-off-by: inc-jeong <inc.jeong@navercorp.com>
Signed-off-by: InChang Jeong <inc.jeong@navercorp.com>
Co-authored-by: USER <user@AL02367916.local>
2025-10-16 10:24:05 +08:00
f96bc3649c [Qwen3-Next] Add tuned MoE config for Qwen3-Next FP8 on H100 tp2 (#26887)
Signed-off-by: Felix Zhu <felixzhu555@gmail.com>
2025-10-15 18:55:05 -07:00
938c43ea7f [ci] Adjusting AMD test composition 2025-10-14 (#26852)
Signed-off-by: Alexei V. Ivanov <alexei.ivanov@amd.com>
2025-10-15 23:52:13 +00:00
0a9ef0cfce Move query quantization to attention layer for Flashinfer & Triton. (#26534)
Signed-off-by: adabeyta <aabeyta@redhat.com>
Signed-off-by: Adrian Abeyta <aabeyta@redhat.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
2025-10-15 19:01:38 -04:00
e5b438a247 [Bug] Temporally Disable VLLM_ALLREDUCE_USE_SYMM_MEM by Default (#26925)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-10-15 16:18:50 -04:00
0b99f5d302 support flashinfer_fp4 moe for 5090 gpu (#26669)
Signed-off-by: XiaobingSuper <xiaobingzhangupc@gmail.com>
Signed-off-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2025-10-15 15:06:47 -04:00
1f491aa0c8 Vectorize RMS norm variance using vectorize_read_with_alignment (#26234)
Signed-off-by: Benji Beck <benjibeck@meta.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
2025-10-15 11:54:41 -07:00
de92d916fe [NVIDIA] Add support for cudnn fp4 gemm via flashinfer (#26107)
Signed-off-by: kaixih <kaixih@nvidia.com>
Signed-off-by: mgoin <mgoin64@gmail.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
2025-10-15 13:53:00 -04:00
a1063628a4 [Chore] Clean up CODEOWNERS (#26923)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-10-15 10:52:54 -07:00
d796375258 [ModelOpt] Remove NVFP4 MoE K%16==0 constraint (#26891)
Signed-off-by: XiaobingSuper <xiaobingzhangupc@gmail.com>
2025-10-15 13:06:17 -04:00
14f8456344 [Feature]: Use pydantic validation in observability.py config (#26637)
Signed-off-by: Samuel Wu <cernunnos1710@gmail.com>
Signed-off-by: Sam/Samuel <57896620+cern1710@users.noreply.github.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-15 16:44:03 +00:00
4794c2bd92 Olmo 3 tool parser and tests (#26143)
Signed-off-by: Pradeep Dasigi <pradeepd@allenai.org>
2025-10-15 16:36:12 +00:00
d3cbaa08dc Lower sevarity of log when model info cache misses due to exception (#26917)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-15 09:01:09 -07:00
828523ad8e [Chore] Separate out vllm.utils.async_utils (#26913)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-15 15:33:00 +00:00
136a17fe6e [Chore] Separate out vllm.utils.func (#26904)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-15 13:03:58 +00:00
f57438338d [BugFix] Patch inductor memory plan logic (#26878)
Signed-off-by: Boyuan Feng <boyuan@meta.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-15 12:51:45 +00:00
5d598680e3 chore: remove unused marker (#26890)
Signed-off-by: Max Wittig <max.wittig@siemens.com>
2025-10-15 05:40:33 -07:00
8f4b313c37 [Misc] rename torch_dtype to dtype (#26695)
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-10-15 12:11:48 +00:00
f93e348010 [Misc] Remove isort and yapf ignores (#26888)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-15 12:09:03 +00:00
f54f85129e [Model][2/N] Improve all pooling task | Support multi-vector retrieval (#25370)
Signed-off-by: wang.yuqi <noooop@126.com>
2025-10-15 11:14:41 +00:00
d4d1a6024f [Lora]Load tuned multi-lora kernel configs from json files (#26319)
Signed-off-by: li2haipeng <44383182+li2haipeng@users.noreply.github.com>
Signed-off-by: Haipeng Li <li2haipeng@gmail.com>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
2025-10-15 09:45:14 +00:00
db1764e4e0 [Platform] allow platform to init dp group (#22243)
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-10-15 02:32:17 -07:00
7f83b4ee8e [Easy] Get rid of unnecessary paraenthesis in kv_cache_manager (#26842)
Signed-off-by: Jialin Ouyang <Jialin.Ouyang@gmail.com>
2025-10-15 09:17:43 +00:00
5c3bae1a6a [Fix] Remove divisibility requirement between num_kv_heads and tp_size in bailing_moe (#26876)
Signed-off-by: vito.yy <vito.yy@antgroup.com>
2025-10-15 16:44:04 +08:00
5210dc3940 [Misc] Update TritonLanguagePlaceholder to have attributes that are used by Flash Linear Attention ops. (#26853)
Co-authored-by: Xudong Ma <mxd@meta.com>
2025-10-15 08:37:49 +00:00
650b51f9f9 [doc] add Context Parallel Deployment doc (#26877)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-10-15 16:33:52 +08:00
6256697997 [Doc] ruff format remaining Python examples (#26795)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-15 01:25:49 -07:00
71557a5f7c [CI] Fix mypy for vllm/executor (#26845)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-10-15 01:23:33 -07:00
f3c378ffa7 [CI/Build] Add Qwen2.5-VL-7B-Instruct ChartQA Accuracy Tests in CI (#21810)
Signed-off-by: Ye (Charlotte) Qi <yeq@meta.com>
Signed-off-by: zhewenli <zhewenli@meta.com>
Co-authored-by: Ye (Charlotte) Qi <yeq@meta.com>
Co-authored-by: Ye (Charlotte) Qi <ye.charlotte.qi@gmail.com>
2025-10-15 08:09:56 +00:00
f5ed68ef63 [Deepseek-V3.2][Kernel] Integrate cuda indexer k cache gather (#26456)
Signed-off-by: Yongye Zhu <zyy1102000@gmail.com>
2025-10-15 16:05:01 +08:00
efdef57b1f [bugfix] Lazy import cv2 (#26869)
Signed-off-by: angelayi <yiangela7@gmail.com>
2025-10-15 07:47:50 +00:00
b8a4572157 [Misc] Use helper function to generate dummy messages in OpenAI MM tests (#26875)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-15 07:17:37 +00:00
302ef403a2 [DSA][MLA] Tiny refactor on DeepSeek to make it reusable for different backends (#26656)
Signed-off-by: MengqingCao <cmq0113@163.com>
2025-10-15 00:16:44 -07:00
8865da157b [Bugfix][Multi Modal] Fix incorrect Molmo token processing (#26873)
Signed-off-by: sanghol <sanghol@allenai.org>
2025-10-15 07:13:59 +00:00
f0862eae43 [Graph Partition] pass tests for decorator (#26831)
Signed-off-by: Boyuan Feng <boyuan@meta.com>
2025-10-15 06:39:48 +00:00
8c851f6d04 [Bugfix] Fix qwen3-omni audio truncation issue (#26815)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-10-15 05:38:36 +00:00
7cfa420f49 [BugFix] Patch inductor partitioning logic (#26735)
Signed-off-by: angelayi <yiangela7@gmail.com>
2025-10-15 05:04:32 +00:00
a27b288e4a [Feature] default --extra-body param to disable thinking in vllm bench serve (#26784)
Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io>
2025-10-15 04:23:44 +00:00
e471d7ca7e [CI/Build][Bugfix] fix qutlass cmake error when set QUTLASS_SRC_DIR (#26773)
Signed-off-by: izhuhaoran <izhuhaoran@qq.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2025-10-15 04:09:44 +00:00
c43ca8259e [Docs] Move build.inc into arm.inc (#26862)
Signed-off-by: windsonsea <haifeng.yao@daocloud.io>
2025-10-14 20:35:08 -07:00
85a65e7f51 [Model] Add DeepSeek-V3.1 reasoning parser (split from PR #24972) (#25589)
Signed-off-by: taohui <taohui3@gmail.com>
Signed-off-by: Tao Hui <taohui3@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Chauncey <chaunceyjiang@gmail.com>
2025-10-15 11:09:52 +08:00
a2986b3e33 [Bugfix] Fixes prefix-repetition benchmark script (#26828)
Signed-off-by: Kourosh Hakhamaneshi <Kourosh@anyscale.com>
2025-10-15 02:54:43 +00:00
96b9aa5aa0 [Frontend][torch.compile] CompilationConfig Overhaul (#20283): name change compilation level to compilation mode, deprecation compilation level (#26355)
Signed-off-by: morrison-turnansky <mturnans@redhat.com>
Signed-off-by: Morrison Turnansky <mturnans@redhat.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
2025-10-15 02:51:16 +00:00
e66d787bce Disable FlashInfer sampler by default (#26859)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-10-15 02:35:18 +00:00
bfad142e25 [BUGFIX][NIXL] quick fix for 'assert self.connector_worker is not None' in get_kv_connector_stats (#26851)
Signed-off-by: Chendi Xue <chendi.xue@intel.com>
2025-10-15 02:33:25 +00:00
9354660036 [Bugfix]fix Qwen3 xml tool parser (#26345)
Signed-off-by: Zhikaiiii <1658973216@qq.com>
2025-10-15 09:50:30 +08:00
07ca70af8d [Core][Easy] Use envs.__getattr__ for all Unify to environment variable access (#26810)
Signed-off-by: Jialin Ouyang <Jialin.Ouyang@gmail.com>
2025-10-15 01:41:18 +00:00
2dcd12d357 [torch.compile] Fix tests for torch==2.9 inductor partition (#26116)
Signed-off-by: ProExpertProg <lgovedic@redhat.com>
Signed-off-by: Luka Govedič <lgovedic@redhat.com>
2025-10-14 19:55:02 -04:00
579d2e5458 [WideEP][P/D] Add usage stats for DP+EP and KV Connector (#26836)
Signed-off-by: Tyler Michael Smith <tlrmchlsmth@gmail.com>
2025-10-14 23:51:54 +00:00
0512c04aee [frontend][gptoss] Add per turn stats into Harmony Context (#25061)
Signed-off-by: lacora <hyelacora@gmail.com>
Co-authored-by: Ye Hu <yehu@fb.com>
2025-10-14 16:48:13 -07:00
7e0ef4084a [CI Failure] Fix torchao dep failure for Quantization Test (#26824)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-10-14 16:41:43 -07:00
4aed506b65 [Core] Streamline some structured output related code (#26737)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-10-14 23:27:44 +00:00
a86b4c58e8 remove attn output view kernel (#26680)
Signed-off-by: Boyuan Feng <boyuan@meta.com>
Signed-off-by: Boyuan Feng <fby.1994@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-10-14 22:53:10 +00:00
ff4810ba73 [Minor] Group async_scheduling related fields in model runner init (#26736)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-10-14 14:46:37 -07:00
9d6964926e fix: response_format for completion (#23212)
Signed-off-by: Nan2018 <qinnanjoshua@gmail.com>
2025-10-14 21:23:22 +00:00
0e65818910 Added MoE configs for llama 4, H200 device with tp=4/8 tuning (#26837)
Signed-off-by: Dhruvil Bhatt <bhattdbh@amazon.com>
2025-10-14 14:21:03 -07:00
380f17527c [Perf] Cache vllm.env.__getattr__ result to avoid recomputation (#26146)
Signed-off-by: Jialin Ouyang <Jialin.Ouyang@gmail.com>
2025-10-14 17:03:21 -04:00
b92ab3deda Notice for deprecation of AutoAWQ (#26820)
Signed-off-by: HDCharles <39544797+HDCharles@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-10-14 13:39:59 -07:00
acaa2c0a4a [Core] Reuse empty block lists whenever possible in KVCacheBlocks to mitigate GC costs (#24964)
Signed-off-by: Jialin Ouyang <Jialin.Ouyang@gmail.com>
2025-10-14 12:58:43 -07:00
82af928c41 [Attention][Spec Decode] FlashMLA spec decode support (#26541)
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
2025-10-14 19:38:20 +00:00
87efc681db llama4_vision_rope: add HIP override to accept (q, k) and avoid (positions, q, k) mismatch (#26790)
Signed-off-by: Huamin Li <3ericli@gmail.com>
2025-10-14 11:54:12 -07:00
c3a722fcb2 [CI Failure] Fix tests with missing TinyLlama-1.1B-Chat-v1.0-FP8-e2e (#26816)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-10-14 18:38:59 +00:00
aba48f7db1 [Kernel][MoE] Add MoE tunings for GLM 4.6-FP8 and GLM 4.5 Air on NVidia B200 (#26818) 2025-10-14 11:20:39 -07:00
04b5f9802d [CI] Raise VLLM_MAX_SIZE_MB to 500 due to failing Build wheel - CUDA 12.9 (#26722)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-10-14 10:52:05 -07:00
efc8f7d814 Update coveragerc and add codecov.yml for path fixes (#26435)
Signed-off-by: Reza Barazesh <rezabarazesh@meta.com>
2025-10-14 09:45:06 -07:00
6d87a2838c [Config] Remove Unused Environment Variable VLLM_DISABLE_PAD_FOR_CUDAGRAPH (#26743)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-10-14 11:47:49 -04:00
e6cdbd6792 Revert "[issues template] Encourage the author implement their own ideas" (#26814) 2025-10-14 08:37:34 -07:00
df850c4912 [Feature][Responses API] Stream Function Call - harmony (#24317)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-10-14 08:31:43 -07:00
720394de43 [KVConnector][Metrics] Aggregate scheduler-side KVConnectorStats (#26046)
Signed-off-by: Qier Li <kevin44036@gmail.com>
2025-10-14 14:38:07 +00:00
88a49745af [issues template] Encourage the author implement their own ideas (#26671)
Signed-off-by: wang.yuqi <noooop@126.com>
2025-10-14 22:32:36 +08:00
ca683a2a72 use combo kernel to fuse qk-norm and qk-rope (#26682)
Signed-off-by: Boyuan Feng <boyuan@meta.com>
2025-10-14 09:40:59 -04:00
e9f1b8c9e9 Adjusted the model order of the model registration file (#26798)
Signed-off-by: 汪志鹏 <wangzhipeng628@gmail.com>
2025-10-14 13:26:11 +00:00
ea97940d6c [DCP] Support Decode Context Parallel (DCP) for GQA with FlashAttention (#24864)
Signed-off-by: yuanyongjie.yyj <yuanyongjie.yyj@antgroup.com>
Signed-off-by: FENP <32334296+FENP@users.noreply.github.com>
Signed-off-by: Jaya Yuan <yuanyongjie.yyj@antgroup.com>
2025-10-14 13:07:50 +00:00
fdd32750f0 [CI/Build] Cleanup LoRA test (#26752)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-10-14 12:06:35 +00:00
c715ba3735 [Feature] Change vllm.py with pydantic validation (#26726)
Signed-off-by: Vladislav <vladislav.bronzov@gmail.com>
Signed-off-by: Vladislav Bronzov <58587565+VladOS95-cyber@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-14 12:00:54 +00:00
9c4cb68339 [Chore] Remove SupportsV0Only interface and update supported models docs (#26783)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-14 04:55:10 -07:00
780eb03d9b [CI] Fix test_tool_id_kimi_k2 (#26787)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-10-14 10:27:07 +00:00
ef9676a1f1 [Doc] ruff format some Python examples (#26767)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-14 03:21:53 -07:00
70b1b330e1 Don't allow typos to fix by default (#26785)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-14 03:05:15 -07:00
d1d063a588 [Chore] Use max_transformers_version for Qwen-VL test (#26792)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-14 03:03:46 -07:00
7e6edb1469 [NIXL][HeteroTP] Enable KV transfer from HND prefill to NHD decode (#26556)
Signed-off-by: Chendi Xue <chendi.xue@intel.com>
2025-10-14 09:46:05 +00:00
74704d4553 [Model] Use merge_by_field_config for MM models (O-P) (#26776)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-14 09:42:45 +00:00
d2f816d6ff [Bugfix] Standardize merging multimodal embeddings (#26771)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-14 09:36:21 +00:00
577d498212 [Plugin] Make plugin group clear (#26757)
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-10-14 07:49:59 +00:00
fd85c9f426 [Bugfix][FE]: Always include usage with --enable-force-include-usage (#20983)
Signed-off-by: Max Wittig <max.wittig@siemens.com>
Signed-off-by: Antoine Auger <antoineauger@users.noreply.github.com>
Co-authored-by: Antoine Auger <antoineauger@users.noreply.github.com>
2025-10-14 09:17:39 +02:00
d32c611f45 [CI/Build] Use 127.0.0.1 instead of localhost in utils (#26750)
Signed-off-by: Ye (Charlotte) Qi <yeq@meta.com>
2025-10-14 07:04:00 +00:00
01ad27faff [Model][Bugfix]fix ernie45 load failed due to ernie45 eplb code (#26684)
Signed-off-by: wangyafeng <wangyafeng@baidu.com>
2025-10-14 06:55:23 +00:00
481545b397 scheduler.py: Update the name of the default scheduler. (#26758)
Signed-off-by: Ryan Li <ryanli@ryanli.org>
2025-10-14 06:52:21 +00:00
d3cc8427c0 [ci] Adding the test-amd.yaml for test definitions for the AMD backend. (alternative PR) (#26718)
Signed-off-by: Alexei V. Ivanov <alexei.ivanov@amd.com>
2025-10-13 23:10:23 -07:00
4821ac1b4d [CI] [ROCm] Automate CC list for ROCm related issue (#26753)
Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
2025-10-14 13:57:26 +08:00
4497c8f821 Fix lora tests failure in TPU CI due to the removal of LoRA bias (#26723)
Signed-off-by: Xiongfei Wei <isaacwxf23@gmail.com>
2025-10-14 13:04:23 +08:00
2e36cdbe2b [Docs] Add a start tag to build.inc.md (#26747)
Signed-off-by: windsonsea <haifeng.yao@daocloud.io>
2025-10-13 21:51:55 -07:00
fe3edb4cf0 Add support for the /rerank endpoint in vllm bench serve (#26602)
Signed-off-by: Max de Bayser <mbayser@br.ibm.com>
2025-10-14 04:25:43 +00:00
29350922c6 [Feature][Quantization] auto_round format add support for regex (#24024)
Signed-off-by: n1ck-guo <heng.guo@intel.com>
Signed-off-by: Heng Guo <heng.guo@intel.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-10-14 03:03:16 +00:00
8ae169286f [torch.compile] Unwrap fused_marlin_moe custom op (#26739)
Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
2025-10-14 02:22:16 +00:00
8a0af6a561 [build][torch.compile] upgrade depyf version (#26702)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-10-14 10:12:09 +08:00
cfded80793 [Easy] Fix env type check errors from VLLM_DEBUG_LOG_API_SERVER_RESPONSE (#26742)
Signed-off-by: Jialin Ouyang <Jialin.Ouyang@gmail.com>
2025-10-14 01:46:44 +00:00
b59dd19b55 [compile] Enable sequence parallelism for full cuda graph without specifying compile sizes (#26681)
Signed-off-by: angelayi <yiangela7@gmail.com>
2025-10-13 18:15:34 -07:00
3e051bda82 [UX] Replace VLLM_ALL2ALL_BACKEND with --all2all-backend (#26732)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-10-13 18:12:52 -07:00
8317f72354 [Misc][DP] support customized aggregated logger for dp (#24354)
Signed-off-by: Lu Fang <fanglu@fb.com>
2025-10-13 17:45:59 -07:00
d8bebb008a Add tests for chunked prefill and prefix cache with causal pooling models (#26526)
Signed-off-by: Max de Bayser <mbayser@br.ibm.com>
Co-authored-by: Ayush Singh <ayush1009208@gmail.com>
2025-10-14 07:45:04 +08:00
35bc22f23c [ResponseAPI] Further polish message serialization and unit tests (#26728)
Signed-off-by: Jialin Ouyang <Jialin.Ouyang@gmail.com>
2025-10-13 23:31:35 +00:00
fa96fb9c70 Pruning kernel Core Tests (#26727)
Signed-off-by: Fardin Hoque <kfhfar@amazon.com>
2025-10-13 23:08:18 +00:00
e3fdb627d9 [FrontEnd] UNREVERT CompilationConfig overhaul (#20283): deprecate use_inductor in favor of backend, simplify custom_ops (#26502)
Signed-off-by: morrison-turnansky <mturnans@redhat.com>
Signed-off-by: Morrison Turnansky <mturnans@redhat.com>
Signed-off-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Co-authored-by: Jiangyun Zhu <riverclouds.zhu@qq.com>
2025-10-13 22:47:16 +00:00
7200a21cd1 [Bug] Fix Assertion error DeepEP/csrc/kernels/intranode.cu:928: 'false and Unsupported type' (#26532)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-10-13 18:26:37 -04:00
577c72a227 [CI Perf]Prune Tests in kernel/mamba (#26538)
Signed-off-by: Fardin Hoque <kfhfar@amazon.com>
Signed-off-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
2025-10-13 18:22:31 -04:00
314285d4f2 [CI] Fix mypy for vllm/distributed (#26593)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-13 16:02:24 -04:00
d2a7938582 [Frontend][1/N] Improve all pooling task | Support FP16 Embedding Base64 (Still uses fp32 by default). (#26414)
Signed-off-by: wang.yuqi <noooop@126.com>
Co-authored-by: Maximilien de Bayser <maxdebayser@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-10-13 19:06:43 +00:00
89342ce4c0 [Quantization] [Performance] Enable Marlin GEMM kernels for the calibration-free RTN-based quantization (#26051)
Signed-off-by: Alex Kogan <alex.kogan@oracle.com>
Signed-off-by: Alex Kogan <82225080+sakogan@users.noreply.github.com>
2025-10-13 18:52:54 +00:00
f89f599395 [CI][Release][Arm64]: Build arm64 release for gpu arch 8.9 (#26698) 2025-10-13 18:42:12 +00:00
e251e457c5 [Log] Optimize Startup Log (#26601)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-10-14 02:06:57 +08:00
afc47e4de7 [Model] Use merge_by_field_config for MM models (M-N) (#26710)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-10-14 01:27:01 +08:00
e3b90c1ba2 [Bugfix][Speculative Decoding] Extend Eagle quantization config fix to llama_eagle.py (#26590)
Signed-off-by: Rahul Tuli <rtuli@redhat.com>
2025-10-13 17:17:13 +00:00
134f70b3ed [Bugfix][Rocm] fix qr error when different inp shape (#25892)
Signed-off-by: Haoyang Li <lihaoyang0109@gmail.com>
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
Co-authored-by: ilmarkov <markovilya197@gmail.com>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
2025-10-13 10:04:21 -07:00
a1b2d658ee [CI/Build] upgrade compressed-tensors to 0.12.2 to address LGPLv3 (#26501)
Signed-off-by: Sangyeon Cho <josang1204@gmail.com>
2025-10-13 12:58:33 -04:00
5c7fe25491 [Misc] Separate prompt logging to debug (#26713)
Signed-off-by: Aleksei Tsvetkov <aitsvet@ya.ru>
2025-10-13 09:04:18 -07:00
53c9a7cee2 [P/D] [NixlConnector] kv load recovery integration (#26171)
Signed-off-by: Will Eaton <weaton@redhat.com>
2025-10-13 08:48:04 -07:00
0d21b9b51e [UX] Speedup DeepGEMM warmup with heuristics (#25619)
Signed-off-by: mgoin <mgoin64@gmail.com>
Signed-off-by: Michael Goin <mgoin64@gmail.com>
Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
2025-10-13 07:59:27 -07:00
10214b6935 [FEATURE]: Use pydantic validation in multimodal.py config (#26629)
Signed-off-by: Anand Roy <86306690+andycandy@users.noreply.github.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-13 07:56:59 -07:00
4a61950f4d [Hardware][CPU] Disable torch.compile for RISC-V to prevent APIError (#26693)
Signed-off-by: lyd1992 <liuyudong@iscas.ac.cn>
Signed-off-by: ihb2032 <1355790728@qq.com>
Signed-off-by: lyd1992 <liuyudong@iscas.ac.cn
2025-10-13 07:56:01 -07:00
3263799056 [unrevert] Add batch invariant kernel override for FlashInfer backend [2/n] (#26373)
Signed-off-by: Bram Wasti <bwasti@meta.com>
Signed-off-by: Bram Wasti <bwasti@fb.com>
2025-10-13 10:24:53 -04:00
8e67b2557a [Bugfix] Fix out of bound index issue for Jina-embedding-v3 RoPE with cuda graph (#26687)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-10-13 03:21:48 -07:00
4073c82c4e [ResponseAPI] Simplify input/output message serialization (#26620)
Signed-off-by: Jialin Ouyang <Jialin.Ouyang@gmail.com>
2025-10-13 09:59:15 +00:00
767c3ab869 [Model][0/N] Improve all pooling task | clean up (#25817)
Signed-off-by: wang.yuqi <noooop@126.com>
2025-10-13 16:44:50 +08:00
4f207c7174 Ignore large reformatting PRs in git blame (#26690)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-13 01:20:47 -07:00
782505ed8e [Model] Add reasoning_parser and tool_parser for Ernie45 thinking (#25027)
Signed-off-by: wangyafeng <wangyafeng@baidu.com>
2025-10-13 15:55:20 +08:00
98f30b8cba [Model] Fix Skywork R1V mlp (#26673)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-10-12 22:42:17 -07:00
3cd36660f7 docs: wrong command in structured_outputs README (#26677)
Signed-off-by: yihong0618 <zouzou0208@gmail.com>
2025-10-12 20:59:01 -07:00
46ad73955a [FIX] Throwing an exception when the model does not support pool tasks (#25840) (#25855)
Signed-off-by: zxw <1020938856@qq.com>
Co-authored-by: wang.yuqi <noooop@126.com>
2025-10-12 20:56:21 -07:00
41f3884438 [Bugfix][Core]Fix block table out-of-range issue in priority scheduling (#26661)
Signed-off-by: quanliu <18646313696@163.com>
2025-10-13 01:25:42 +00:00
60e419c1ee [Misc] cache result of disable_inplace (#26666)
Signed-off-by: Bill Nell <bnell@redhat.com>
2025-10-13 00:17:50 +00:00
1172 changed files with 57683 additions and 21648 deletions

View File

@ -5,11 +5,11 @@ import os
import sys
import zipfile
# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 450 MiB
# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 500 MiB
# Note that we have 800 MiB quota, please use it wisely.
# See https://github.com/pypi/support/issues/6326 .
# Please also sync the value with the one in Dockerfile.
VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 450))
VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 500))
def print_top_10_largest_files(zip_file):

View File

@ -0,0 +1,12 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1
model_name: "HandH1998/QQQ-Llama-3-8b-g128"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.419
- name: "exact_match,flexible-extract"
value: 0.416
limit: 1000
num_fewshot: 5

View File

@ -0,0 +1,12 @@
# For hf script, without -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -l 100 -t 8
model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
backend: "vllm-vlm"
tasks:
- name: "chartqa"
metrics:
- name: "relaxed_accuracy,none"
# TODO(zhewenl): model card is 0.90, but the actual score is 0.80.
value: 0.80
limit: 100
num_fewshot: 0

View File

@ -0,0 +1,10 @@
# For hf script, without -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -l 250 -t 8 -f 5
model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
tasks:
- name: "mmlu_pro"
metrics:
- name: "exact_match,custom-extract"
value: 0.80
limit: 250 # will run on 250 * 14 subjects = 3500 samples
num_fewshot: 5

View File

@ -1,4 +1,5 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -b auto -l 1319 -f 5 -t 1
# For vllm script, with -t option (tensor parallel size)
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -l 1319 -t 1
model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
tasks:
- name: "gsm8k"

View File

@ -0,0 +1,12 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m Qwen/Qwen2.5-VL-7B-Instruct -l 2500 -t 1
model_name: "Qwen/Qwen2.5-VL-7B-Instruct"
backend: "vllm-vlm"
tasks:
- name: "chartqa"
metrics:
- name: "relaxed_accuracy,none"
value: 0.855
limit: 2500
num_fewshot: 0

View File

@ -0,0 +1,14 @@
model_name: "Qwen/Qwen3-235B-A22B-Instruct-2507-FP8"
tasks:
- name: "mmlu_pro"
metrics:
- name: "exact_match,custom-extract"
value: 0.82
limit: 250 # will run on 250 * 14 subjects = 3500 samples
num_fewshot: 5
enforce_eager: false # we use false to speed up the eval process
kv_cache_dtype: fp8 # we use fp8 to speed up the eval process
max_model_len: 40960
apply_chat_template: true
fewshot_as_multiturn: true
gen_kwargs: "temperature=0,top_p=1,top_k=0,max_gen_toks=5632,until=<|ENDANSWER|>"

View File

@ -0,0 +1 @@
Qwen3-235B-A22B-Instruct-2507-FP8.yaml

View File

@ -0,0 +1 @@
Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml

View File

@ -0,0 +1 @@
Qwen2.5-VL-7B-Instruct.yaml

View File

@ -0,0 +1,44 @@
#!/bin/bash
# We can use this script to compute baseline accuracy on chartqa for vllm.
#
# Make sure you have lm-eval-harness installed:
# pip install lm-eval==0.4.9
usage() {
echo``
echo "Runs lm eval harness on ChartQA using multimodal vllm."
echo "This pathway is intended to be used to create baselines for "
echo "our correctness tests in vllm's CI."
echo
echo "usage: ${0} <options>"
echo
echo " -m - huggingface stub or local directory of the model"
echo " -l - limit number of samples to run"
echo " -t - tensor parallel size to run at"
echo
}
while getopts "m:l:t:" OPT; do
case ${OPT} in
m )
MODEL="$OPTARG"
;;
l )
LIMIT="$OPTARG"
;;
t )
TP_SIZE="$OPTARG"
;;
\? )
usage
exit 1
;;
esac
done
lm_eval --model vllm-vlm \
--model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE" \
--tasks chartqa \
--batch_size auto \
--apply_chat_template \
--limit $LIMIT

View File

View File

@ -0,0 +1,50 @@
#!/bin/bash
# We can use this script to compute baseline accuracy on MMLUPRO for vllm.
# We use this for fp8, which HF does not support.
#
# Make sure you have lm-eval-harness installed:
# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
usage() {
echo``
echo "Runs lm eval harness on MMLU Pro using huggingface transformers."
echo "This pathway is intended to be used to create baselines for "
echo "our automated nm-test-accuracy workflow"
echo
echo "usage: ${0} <options>"
echo
echo " -m - huggingface stub or local directory of the model"
echo " -l - limit number of samples to run"
echo " -f - number of fewshot samples to use"
echo " -t - tensor parallel size to run at"
echo
}
while getopts "m:b:l:f:t:" OPT; do
case ${OPT} in
m )
MODEL="$OPTARG"
;;
b )
BATCH_SIZE="$OPTARG"
;;
l )
LIMIT="$OPTARG"
;;
f )
FEWSHOT="$OPTARG"
;;
t )
TP_SIZE="$OPTARG"
;;
\? )
usage
exit 1
;;
esac
done
lm_eval --model vllm \
--model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,add_bos_token=true,trust_remote_code=true,max_model_len=4096" \
--tasks mmlu_pro --num_fewshot "$FEWSHOT" --limit "$LIMIT" \
--batch_size auto

View File

@ -19,21 +19,35 @@ RTOL = 0.08
def launch_lm_eval(eval_config, tp_size):
trust_remote_code = eval_config.get("trust_remote_code", False)
max_model_len = eval_config.get("max_model_len", 4096)
batch_size = eval_config.get("batch_size", "auto")
backend = eval_config.get("backend", "vllm")
enforce_eager = eval_config.get("enforce_eager", "true")
kv_cache_dtype = eval_config.get("kv_cache_dtype", "auto")
model_args = (
f"pretrained={eval_config['model_name']},"
f"tensor_parallel_size={tp_size},"
f"enforce_eager=true,"
f"enforce_eager={enforce_eager},"
f"kv_cache_dtype={kv_cache_dtype},"
f"add_bos_token=true,"
f"trust_remote_code={trust_remote_code},"
f"max_model_len={max_model_len}"
f"max_model_len={max_model_len},"
)
results = lm_eval.simple_evaluate(
model="vllm",
model=backend,
model_args=model_args,
tasks=[task["name"] for task in eval_config["tasks"]],
num_fewshot=eval_config["num_fewshot"],
limit=eval_config["limit"],
batch_size="auto",
# TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed help
# text models. however, this is regressing measured strict-match for
# existing text models in CI, so only apply it for mm, or explicitly set
apply_chat_template=eval_config.get(
"apply_chat_template", backend == "vllm-vlm"
),
fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", False),
# Forward decoding and early-stop controls (e.g., max_gen_toks, until=...)
gen_kwargs=eval_config.get("gen_kwargs"),
batch_size=batch_size,
)
return results

View File

@ -1,184 +0,0 @@
steps:
- label: "Wait for container to be ready"
key: wait-for-container-image
agents:
queue: A100
plugins:
- kubernetes:
podSpec:
containers:
- image: badouralix/curl-jq
command:
- sh .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
- label: "Cleanup H100"
agents:
queue: H100
depends_on: ~
command: docker system prune -a --volumes --force
- label: "A100"
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
agents:
queue: A100
depends_on: wait-for-container-image
if: build.branch == "main"
plugins:
- kubernetes:
podSpec:
priorityClassName: perf-benchmark
containers:
- image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
command:
- bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
resources:
limits:
nvidia.com/gpu: 8
volumeMounts:
- name: devshm
mountPath: /dev/shm
env:
- name: VLLM_USAGE_SOURCE
value: ci-test
- name: HF_TOKEN
valueFrom:
secretKeyRef:
name: hf-token-secret
key: token
nodeSelector:
nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
volumes:
- name: devshm
emptyDir:
medium: Memory
- label: "H200"
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
agents:
queue: H200
depends_on: wait-for-container-image
if: build.branch == "main"
plugins:
- docker#v5.12.0:
image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
command:
- bash
- .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
mount-buildkite-agent: true
propagate-environment: true
ipc: host
gpus: 4,5,6,7
volumes:
- /data/benchmark-hf-cache:/root/.cache/huggingface
environment:
- VLLM_USAGE_SOURCE
- HF_TOKEN
#- block: "Run H100 Benchmark"
#key: block-h100
#depends_on: ~
- label: "H100"
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
agents:
queue: H100
depends_on: wait-for-container-image
if: build.branch == "main"
plugins:
- docker#v5.12.0:
image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
command:
- bash
- .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
mount-buildkite-agent: true
propagate-environment: true
ipc: host
gpus: all # see CUDA_VISIBLE_DEVICES for actual GPUs used
volumes:
- /data/benchmark-hf-cache:/root/.cache/huggingface
environment:
- VLLM_USAGE_SOURCE
- HF_TOKEN
# Premerge benchmark
- label: "A100"
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
agents:
queue: A100
depends_on: wait-for-container-image
if: build.branch != "main"
plugins:
- kubernetes:
podSpec:
priorityClassName: perf-benchmark
containers:
- image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
command:
- bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
resources:
limits:
nvidia.com/gpu: 8
volumeMounts:
- name: devshm
mountPath: /dev/shm
env:
- name: VLLM_USAGE_SOURCE
value: ci-test
- name: HF_TOKEN
valueFrom:
secretKeyRef:
name: hf-token-secret
key: token
nodeSelector:
nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
volumes:
- name: devshm
emptyDir:
medium: Memory
- label: "H200"
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
agents:
queue: H200
depends_on: wait-for-container-image
if: build.branch != "main"
plugins:
- docker#v5.12.0:
image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
command:
- bash
- .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
mount-buildkite-agent: true
propagate-environment: true
ipc: host
gpus: 4,5,6,7
volumes:
- /data/benchmark-hf-cache:/root/.cache/huggingface
environment:
- VLLM_USAGE_SOURCE
- HF_TOKEN
#- block: "Run H100 Benchmark"
#key: block-h100
#depends_on: ~
- label: "H100"
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
agents:
queue: H100
depends_on: wait-for-container-image
if: build.branch != "main"
plugins:
- docker#v5.12.0:
image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
command:
- bash
- .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
mount-buildkite-agent: true
propagate-environment: true
ipc: host
gpus: all # see CUDA_VISIBLE_DEVICES for actual GPUs used
volumes:
- /data/benchmark-hf-cache:/root/.cache/huggingface
environment:
- VLLM_USAGE_SOURCE
- HF_TOKEN

View File

@ -1,28 +0,0 @@
# Nightly benchmark annotation
## Description
This file contains the downloading link for benchmarking results.
- [benchmarking pipeline](artifact://nightly-pipeline.yaml)
- [benchmarking results](artifact://results.zip)
- [benchmarking code](artifact://nightly-benchmarks.zip)
Please download the visualization scripts in the post
## Results reproduction
- Find the docker we use in `benchmarking pipeline`
- Deploy the docker, and inside the docker:
- Download `nightly-benchmarks.zip`.
- In the same folder, run the following code:
```bash
export HF_TOKEN=<your HF token>
apt update
apt install -y git
unzip nightly-benchmarks.zip
VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
```
And the results will be inside `./benchmarks/results`.

View File

@ -1,39 +0,0 @@
# Nightly benchmark
This benchmark aims to:
- Provide performance clarity: Provide clarity on which one (vllm, tensorrt-llm, lmdeploy and SGLang) leads in performance in what workload.
- Be reproducible: one can run the exact same set of benchmarking commands inside the exact same docker by following reproducing instructions.
Latest results: [results link](https://blog.vllm.ai/2024/09/05/perf-update.html), scroll to the end.
Latest reproduction guide: [github issue link](https://github.com/vllm-project/vllm/issues/8176)
## Setup
- Docker images:
- vLLM: `vllm/vllm-openai:v0.6.2`
- SGLang: `lmsysorg/sglang:v0.3.2-cu121`
- LMDeploy: `openmmlab/lmdeploy:v0.6.1-cu12`
- TensorRT-LLM: `nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3`
- *NOTE: we use r24.07 as the current implementation only works for this version. We are going to bump this up.*
- Check [nightly-pipeline.yaml](nightly-pipeline.yaml) for the concrete docker images, specs and commands we use for the benchmark.
- Hardware
- 8x Nvidia A100 GPUs
- Workload:
- Dataset
- ShareGPT dataset
- Prefill-heavy dataset (in average 462 input tokens, 16 tokens as output)
- Decode-heavy dataset (in average 462 input tokens, 256 output tokens)
- Check [nightly-tests.json](tests/nightly-tests.json) for the concrete configuration of datasets we use.
- Models: llama-3 8B, llama-3 70B.
- We do not use llama 3.1 as it is incompatible with trt-llm r24.07. ([issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105)).
- Average QPS (query per second): 2, 4, 8, 16, 32 and inf.
- Queries are randomly sampled, and arrival patterns are determined via Poisson process, but all with fixed random seed.
- Evaluation metrics: Throughput (higher the better), TTFT (time to the first token, lower the better), ITL (inter-token latency, lower the better).
## Known issues
- TRT-LLM crashes with Llama 3.1 8B [issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105).
- TGI does not support `ignore-eos` flag.

View File

@ -1,196 +0,0 @@
common_pod_spec: &common_pod_spec
priorityClassName: perf-benchmark
nodeSelector:
nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
volumes:
- name: devshm
emptyDir:
medium: Memory
- name: hf-cache
hostPath:
path: /root/.cache/huggingface
type: Directory
common_container_settings: &common_container_settings
command:
- bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
resources:
limits:
nvidia.com/gpu: 8
volumeMounts:
- name: devshm
mountPath: /dev/shm
- name: hf-cache
mountPath: /root/.cache/huggingface
env:
- name: VLLM_USAGE_SOURCE
value: ci-test
- name: HF_HOME
value: /root/.cache/huggingface
- name: VLLM_SOURCE_CODE_LOC
value: /workspace/build/buildkite/vllm/performance-benchmark
- name: HF_TOKEN
valueFrom:
secretKeyRef:
name: hf-token-secret
key: token
steps:
- block: ":rocket: Ready for comparing vllm against alternatives? This will take 4 hours."
- label: "A100 vllm step 10"
priority: 100
agents:
queue: A100
plugins:
- kubernetes:
podSpec:
<<: *common_pod_spec
containers:
- image: vllm/vllm-openai:v0.6.2
<<: *common_container_settings
- label: "A100 sglang benchmark"
priority: 100
agents:
queue: A100
plugins:
- kubernetes:
podSpec:
<<: *common_pod_spec
containers:
- image: lmsysorg/sglang:v0.3.2-cu121
<<: *common_container_settings
- label: "A100 lmdeploy benchmark"
priority: 100
agents:
queue: A100
plugins:
- kubernetes:
podSpec:
<<: *common_pod_spec
containers:
- image: openmmlab/lmdeploy:v0.6.1-cu12
<<: *common_container_settings
- label: "A100 trt llama-8B"
priority: 100
agents:
queue: A100
plugins:
- kubernetes:
podSpec:
<<: *common_pod_spec
containers:
- image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
<<: *common_container_settings
env:
- name: VLLM_USAGE_SOURCE
value: ci-test
- name: HF_HOME
value: /root/.cache/huggingface
- name: VLLM_SOURCE_CODE_LOC
value: /workspace/build/buildkite/vllm/performance-benchmark
- name: HF_TOKEN
valueFrom:
secretKeyRef:
name: hf-token-secret
key: token
- name: TEST_SELECTOR
value: "llama8B"
- label: "A100 trt llama-70B"
priority: 100
agents:
queue: A100
plugins:
- kubernetes:
podSpec:
<<: *common_pod_spec
containers:
- image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
<<: *common_container_settings
env:
- name: VLLM_USAGE_SOURCE
value: ci-test
- name: HF_HOME
value: /root/.cache/huggingface
- name: VLLM_SOURCE_CODE_LOC
value: /workspace/build/buildkite/vllm/performance-benchmark
- name: HF_TOKEN
valueFrom:
secretKeyRef:
name: hf-token-secret
key: token
- name: TEST_SELECTOR
value: "llama70B"
# FIXME(Kuntai): uncomment this after NVIDIA gives us their test docker image
# - label: "A100 trt benchmark"
# priority: 100
# agents:
# queue: A100
# plugins:
# - kubernetes:
# podSpec:
# <<: *common_pod_spec
# containers:
# - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
# <<: *common_container_settings
# FIXME(Kuntai): uncomment this after TGI supports `--ignore-eos`.
# - label: "A100 tgi benchmark"
# priority: 100
# agents:
# queue: A100
# plugins:
# - kubernetes:
# podSpec:
# <<: *common_pod_spec
# containers:
# - image: ghcr.io/huggingface/text-generation-inference:2.2.0
# <<: *common_container_settings
- wait
- label: "Collect the results"
priority: 100
agents:
queue: A100
plugins:
- kubernetes:
podSpec:
<<: *common_pod_spec
containers:
- image: vllm/vllm-openai:v0.5.0.post1
command:
- bash .buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
resources:
limits:
nvidia.com/gpu: 8
volumeMounts:
- name: devshm
mountPath: /dev/shm
env:
- name: VLLM_USAGE_SOURCE
value: ci-test
- name: VLLM_SOURCE_CODE_LOC
value: /workspace/build/buildkite/vllm/performance-benchmark
- name: HF_TOKEN
valueFrom:
secretKeyRef:
name: hf-token-secret
key: token
- block: ":rocket: check the results!"

View File

@ -1,26 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
from transformers import AutoTokenizer
def main(model, cachedir):
# Load the tokenizer and save it to the specified directory
tokenizer = AutoTokenizer.from_pretrained(model)
tokenizer.save_pretrained(cachedir)
print(f"Tokenizer saved to {cachedir}")
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Download and save Hugging Face tokenizer"
)
parser.add_argument("--model", type=str, required=True, help="Name of the model")
parser.add_argument(
"--cachedir", type=str, required=True, help="Directory to save the tokenizer"
)
args = parser.parse_args()
main(args.model, args.cachedir)

View File

@ -1,97 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import json
from pathlib import Path
import numpy as np
import pandas as pd
from tabulate import tabulate
def parse_arguments():
parser = argparse.ArgumentParser(
description="Parse command line arguments for summary-nightly-results script."
)
parser.add_argument(
"--results-folder",
type=str,
required=True,
help="The folder where the results are stored.",
)
parser.add_argument(
"--description", type=str, required=True, help="Description of the results."
)
args = parser.parse_args()
return args
def get_perf(df, method, model, metric):
means = []
for qps in [2, 4, 8, 16, "inf"]:
target = df["Test name"].str.contains(model)
target = target & df["Engine"].str.contains(method)
target = target & df["Test name"].str.contains("qps_" + str(qps))
filtered_df = df[target]
if filtered_df.empty:
means.append(0.0)
else:
means.append(filtered_df[metric].values[0])
return np.array(means)
def get_perf_w_std(df, method, model, metric):
if metric in ["TTFT", "ITL"]:
mean = get_perf(df, method, model, "Mean " + metric + " (ms)")
mean = mean.tolist()
std = get_perf(df, method, model, "Std " + metric + " (ms)")
if std.mean() == 0:
std = None
success = get_perf(df, method, model, "Successful req.")
if std is not None:
std = std / np.sqrt(success)
std = std.tolist()
else:
assert metric == "Tput"
mean = get_perf(df, method, model, "Input Tput (tok/s)") + get_perf(
df, method, model, "Output Tput (tok/s)"
)
mean = mean.tolist()
std = None
return mean, std
def main(args):
results_folder = Path(args.results_folder)
results = []
# collect results
for test_file in results_folder.glob("*_nightly_results.json"):
with open(test_file) as f:
results = results + json.loads(f.read())
# generate markdown table
df = pd.DataFrame.from_dict(results)
md_table = tabulate(df, headers="keys", tablefmt="pipe", showindex=False)
with open(args.description) as f:
description = f.read()
description = description.format(nightly_results_benchmarking_table=md_table)
with open("nightly_results.md", "w") as f:
f.write(description)
if __name__ == "__main__":
args = parse_arguments()
main(args)

View File

@ -1,9 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from lmdeploy.serve.openai.api_client import APIClient
api_client = APIClient("http://localhost:8000")
model_name = api_client.available_models[0]
print(model_name)

View File

@ -1,78 +0,0 @@
#!/bin/bash
set -ex
set -o pipefail
main() {
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
(which jq) || (apt-get update && apt-get -y install jq)
(which zip) || (apt-get install -y zip)
if [ ! -f /workspace/buildkite-agent ]; then
echo "buildkite-agent binary not found. Skip plotting the results."
exit 0
fi
# initial annotation
#description="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-descriptions.md"
# download results
cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
mkdir -p results/
/workspace/buildkite-agent artifact download 'results/*nightly_results.json' results/
ls
ls results/
# upload benchmark results
zip -r results.zip results/
/workspace/buildkite-agent artifact upload "results.zip"
# upload benchmarking scripts
cd "$VLLM_SOURCE_CODE_LOC/"
zip -r nightly-benchmarks.zip .buildkite/ benchmarks/
/workspace/buildkite-agent artifact upload "nightly-benchmarks.zip"
cd "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/"
# upload benchmarking pipeline
/workspace/buildkite-agent artifact upload "nightly-pipeline.yaml"
cd "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/"
/workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly-annotation.md
# The figures should be generated by a separate process outside the CI/CD pipeline
# # generate figures
# python3 -m pip install tabulate pandas matplotlib
# python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py \
# --description $description \
# --results-folder results/
# python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
# --description $description \
# --results-folder results/ \
# --dataset sharegpt
# python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
# --description $description \
# --results-folder results/ \
# --dataset sonnet_2048_128
# python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
# --description $description \
# --results-folder results/ \
# --dataset sonnet_128_2048
# # upload results and figures
# /workspace/buildkite-agent artifact upload "nightly_results*.png"
# /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
# /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/tests/nightly-tests.json
# /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly_results.md
}
main "$@"

View File

@ -1,464 +0,0 @@
#!/bin/bash
set -o pipefail
set -x
check_gpus() {
# check the number of GPUs and GPU type.
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
if [[ $gpu_count -gt 0 ]]; then
echo "GPU found."
else
echo "Need at least 1 GPU to run benchmarking."
exit 1
fi
declare -g gpu_type="$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')"
echo "GPU type is $gpu_type"
}
check_hf_token() {
# check if HF_TOKEN is available and valid
if [[ -z "$HF_TOKEN" ]]; then
echo "Error: HF_TOKEN is not set."
exit 1
elif [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then
echo "Error: HF_TOKEN does not start with 'hf_'."
exit 1
else
echo "HF_TOKEN is set and valid."
fi
}
upload_to_buildkite() {
# upload the benchmarking results to buildkite
# if the agent binary is not found, skip uploading the results, exit 0
if [ ! -f /workspace/buildkite-agent ]; then
echo "buildkite-agent binary not found. Skip uploading the results."
return 0
fi
# /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
/workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
}
get_current_llm_serving_engine() {
if which lmdeploy >/dev/null; then
echo "Container: lmdeploy"
export CURRENT_LLM_SERVING_ENGINE=lmdeploy
return
fi
if [ -e /tgi-entrypoint.sh ]; then
echo "Container: tgi"
export CURRENT_LLM_SERVING_ENGINE=tgi
return
fi
if which trtllm-build >/dev/null; then
echo "Container: tensorrt-llm"
export CURRENT_LLM_SERVING_ENGINE=trt
return
fi
if [ -e /sgl-workspace ]; then
echo "Container: sglang"
export CURRENT_LLM_SERVING_ENGINE=sglang
return
fi
if [ -e /vllm-workspace ]; then
echo "Container: vllm"
# move to a completely irrelevant directory, to avoid import vllm from current folder
export CURRENT_LLM_SERVING_ENGINE=vllm
return
fi
}
json2args() {
# transforms the JSON string to command line args, and '_' is replaced to '-'
# example:
# input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
# output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
local json_string=$1
local args=$(
echo "$json_string" | jq -r '
to_entries |
map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
join(" ")
'
)
echo "$args"
}
kill_gpu_processes() {
pkill -f '[p]ython'
pkill -f '[p]ython3'
pkill -f '[t]ritonserver'
pkill -f '[p]t_main_thread'
pkill -f '[t]ext-generation'
pkill -f '[l]mdeploy'
# vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445
pkill -f '[V]LLM'
while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
sleep 1
done
}
wait_for_server() {
# wait for vllm server to start
# return 1 if vllm server crashes
timeout 1200 bash -c '
until curl -s localhost:8000/v1/completions > /dev/null; do
sleep 1
done' && return 0 || return 1
}
ensure_installed() {
# Ensure that the given command is installed by apt-get
local cmd=$1
if ! which "$cmd" >/dev/null; then
apt-get update && apt-get install -y "$cmd"
fi
}
run_serving_tests() {
# run serving tests using `vllm bench serve` command
# $1: a json file specifying serving test cases
local serving_test_file
serving_test_file=$1
# Iterate over serving tests
jq -c '.[]' "$serving_test_file" | while read -r params; do
# get the test name, and append the GPU type back to it.
test_name=$(echo "$params" | jq -r '.test_name')
# if TEST_SELECTOR is set, only run the test cases that match the selector
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
echo "Skip test case $test_name."
continue
fi
# prepend the current serving engine to the test name
test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}
# get common parameters
common_params=$(echo "$params" | jq -r '.common_parameters')
model=$(echo "$common_params" | jq -r '.model')
tp=$(echo "$common_params" | jq -r '.tp')
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
port=$(echo "$common_params" | jq -r '.port')
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
reuse_server=$(echo "$common_params" | jq -r '.reuse_server')
# get client and server arguments
server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters")
client_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_client_parameters")
client_args=$(json2args "$client_params")
qps_list=$(echo "$params" | jq -r '.qps_list')
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
echo "Running over qps list $qps_list"
# check if there is enough GPU to run the test
if [[ $gpu_count -lt $tp ]]; then
echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
continue
fi
if [[ $reuse_server == "true" ]]; then
echo "Reuse previous server for test case $test_name"
else
kill_gpu_processes
bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \
"$server_params" "$common_params"
fi
if wait_for_server; then
echo ""
echo "$CURRENT_LLM_SERVING_ENGINE server is up and running."
else
echo ""
echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period."
break
fi
# prepare tokenizer
# this is required for lmdeploy.
cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
rm -rf /tokenizer_cache
mkdir /tokenizer_cache
python3 ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \
--model "$model" \
--cachedir /tokenizer_cache
cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
# change model name for lmdeploy (it will not follow standard hf name)
if [[ "$CURRENT_LLM_SERVING_ENGINE" == "lmdeploy" ]]; then
model=$(python ../.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py)
fi
# iterate over different QPS
for qps in $qps_list; do
# remove the surrounding single quote from qps
if [[ "$qps" == *"inf"* ]]; then
echo "qps was $qps"
qps="inf"
echo "now qps is $qps"
fi
new_test_name=$test_name"_qps_"$qps
backend=$CURRENT_LLM_SERVING_ENGINE
if [[ $backend = "trt" ]]; then
backend="tensorrt-llm"
fi
if [[ "$backend" == *"vllm"* ]]; then
backend="vllm"
fi
if [[ "$dataset_name" = "sharegpt" ]]; then
client_command="vllm bench serve \
--backend $backend \
--tokenizer /tokenizer_cache \
--model $model \
--dataset-name $dataset_name \
--dataset-path $dataset_path \
--num-prompts $num_prompts \
--port $port \
--save-result \
--result-dir $RESULTS_FOLDER \
--result-filename ${new_test_name}.json \
--request-rate $qps \
--ignore-eos \
$client_args"
elif [[ "$dataset_name" = "sonnet" ]]; then
sonnet_input_len=$(echo "$common_params" | jq -r '.sonnet_input_len')
sonnet_output_len=$(echo "$common_params" | jq -r '.sonnet_output_len')
sonnet_prefix_len=$(echo "$common_params" | jq -r '.sonnet_prefix_len')
client_command="vllm bench serve \
--backend $backend \
--tokenizer /tokenizer_cache \
--model $model \
--dataset-name $dataset_name \
--dataset-path $dataset_path \
--num-prompts $num_prompts \
--sonnet-input-len $sonnet_input_len \
--sonnet-output-len $sonnet_output_len \
--sonnet-prefix-len $sonnet_prefix_len \
--port $port \
--save-result \
--result-dir $RESULTS_FOLDER \
--result-filename ${new_test_name}.json \
--request-rate $qps \
--ignore-eos \
$client_args"
else
echo "The dataset name must be either 'sharegpt' or 'sonnet'. Got $dataset_name."
exit 1
fi
echo "Running test case $test_name with qps $qps"
echo "Client command: $client_command"
eval "$client_command"
server_command="None"
# record the benchmarking commands
jq_output=$(jq -n \
--arg server "$server_command" \
--arg client "$client_command" \
--arg gpu "$gpu_type" \
--arg engine "$CURRENT_LLM_SERVING_ENGINE" \
'{
server_command: $server,
client_command: $client,
gpu_type: $gpu,
engine: $engine
}')
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
done
done
kill_gpu_processes
}
run_genai_perf_tests() {
# run genai-perf tests
# $1: a json file specifying genai-perf test cases
local genai_perf_test_file
genai_perf_test_file=$1
# Iterate over genai-perf tests
jq -c '.[]' "$genai_perf_test_file" | while read -r params; do
# get the test name, and append the GPU type back to it.
test_name=$(echo "$params" | jq -r '.test_name')
# if TEST_SELECTOR is set, only run the test cases that match the selector
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
echo "Skip test case $test_name."
continue
fi
# prepend the current serving engine to the test name
test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}
# get common parameters
common_params=$(echo "$params" | jq -r '.common_parameters')
model=$(echo "$common_params" | jq -r '.model')
tp=$(echo "$common_params" | jq -r '.tp')
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
port=$(echo "$common_params" | jq -r '.port')
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
reuse_server=$(echo "$common_params" | jq -r '.reuse_server')
# get client and server arguments
server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters")
qps_list=$(echo "$params" | jq -r '.qps_list')
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
echo "Running over qps list $qps_list"
# check if there is enough GPU to run the test
if [[ $gpu_count -lt $tp ]]; then
echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
continue
fi
if [[ $reuse_server == "true" ]]; then
echo "Reuse previous server for test case $test_name"
else
kill_gpu_processes
bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \
"$server_params" "$common_params"
fi
if wait_for_server; then
echo ""
echo "$CURRENT_LLM_SERVING_ENGINE server is up and running."
else
echo ""
echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period."
break
fi
# iterate over different QPS
for qps in $qps_list; do
# remove the surrounding single quote from qps
if [[ "$qps" == *"inf"* ]]; then
echo "qps was $qps"
qps=$num_prompts
echo "now qps is $qps"
fi
new_test_name=$test_name"_qps_"$qps
backend=$CURRENT_LLM_SERVING_ENGINE
if [[ "$backend" == *"vllm"* ]]; then
backend="vllm"
fi
#TODO: add output dir.
client_command="genai-perf profile \
-m $model \
--service-kind openai \
--backend "$backend" \
--endpoint-type chat \
--streaming \
--url localhost:$port \
--request-rate $qps \
--num-prompts $num_prompts \
"
echo "Client command: $client_command"
eval "$client_command"
#TODO: process/record outputs
done
done
kill_gpu_processes
}
prepare_dataset() {
# download sharegpt dataset
cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
# duplicate sonnet by 4x, to allow benchmarking with input length 2048
cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
echo "" > sonnet_4x.txt
for _ in {1..4}
do
cat sonnet.txt >> sonnet_4x.txt
done
}
main() {
# check if the environment variable is successfully injected from yaml
check_gpus
check_hf_token
get_current_llm_serving_engine
pip install -U transformers
pip install -r requirements/dev.txt
which genai-perf
# check storage
df -h
ensure_installed wget
ensure_installed curl
ensure_installed jq
# genai-perf dependency
ensure_installed libb64-0d
prepare_dataset
cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
declare -g RESULTS_FOLDER=results/
mkdir -p $RESULTS_FOLDER
BENCHMARK_ROOT="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/"
# run the test
run_serving_tests "$BENCHMARK_ROOT/tests/nightly-tests.json"
# run genai-perf tests
run_genai_perf_tests "$BENCHMARK_ROOT/tests/genai-perf-tests.json"
mv artifacts/ $RESULTS_FOLDER/
# upload benchmark results to buildkite
python3 -m pip install tabulate pandas
python3 "$BENCHMARK_ROOT/scripts/summary-nightly-results.py"
upload_to_buildkite
}
main "$@"

View File

@ -1,82 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import datetime
import json
import os
from pathlib import Path
import pandas as pd
from tabulate import tabulate
results_folder = Path("results/")
# serving results and the keys that will be printed into markdown
serving_results = []
serving_column_mapping = {
"test_name": "Test name",
"gpu_type": "GPU",
"completed": "Successful req.",
"request_throughput": "Tput (req/s)",
"mean_ttft_ms": "Mean TTFT (ms)",
"std_ttft_ms": "Std TTFT (ms)",
"median_ttft_ms": "Median TTFT (ms)",
"mean_itl_ms": "Mean ITL (ms)",
"std_itl_ms": "Std ITL (ms)",
"median_itl_ms": "Median ITL (ms)",
"mean_tpot_ms": "Mean TPOT (ms)",
"std_tpot_ms": "Std TPOT (ms)",
"median_tpot_ms": "Median TPOT (ms)",
"total_token_throughput": "Total Token Tput (tok/s)",
"output_throughput": "Output Tput (tok/s)",
"total_input_tokens": "Total input tokens",
"total_output_tokens": "Total output tokens",
"engine": "Engine",
}
if __name__ == "__main__":
# collect results
for test_file in results_folder.glob("*.json"):
with open(test_file) as f:
raw_result = json.loads(f.read())
# attach the benchmarking command to raw_result
with open(test_file.with_suffix(".commands")) as f:
command = json.loads(f.read())
raw_result.update(command)
# update the test name of this result
raw_result.update({"test_name": test_file.stem})
# add the result to raw_result
serving_results.append(raw_result)
continue
serving_results = pd.DataFrame.from_dict(serving_results)
if not serving_results.empty:
serving_results = serving_results[list(serving_column_mapping.keys())].rename(
columns=serving_column_mapping
)
serving_md_table_with_headers = tabulate(
serving_results, headers="keys", tablefmt="pipe", showindex=False
)
# remove the first line of header
serving_md_table_lines = serving_md_table_with_headers.split("\n")
serving_md_table_without_header = "\n".join(serving_md_table_lines[2:])
prefix = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
prefix = prefix + "_" + os.environ.get("CURRENT_LLM_SERVING_ENGINE")
# document benchmarking results in markdown
with open(results_folder / f"{prefix}_nightly_results.md", "w") as f:
# document results with header.
# for those who wants to reproduce our benchmark.
f.write(serving_md_table_with_headers)
f.write("\n")
# document benchmarking results in json
with open(results_folder / f"{prefix}_nightly_results.json", "w") as f:
results = serving_results.to_dict(orient="records")
f.write(json.dumps(results))

View File

@ -1,23 +0,0 @@
#!/bin/sh
TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-postmerge-repo:pull" | jq -r .token)
if [[ "$BUILDKITE_BRANCH" == "main" ]]; then
URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-postmerge-repo/manifests/$BUILDKITE_COMMIT"
else
URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT"
fi
TIMEOUT_SECONDS=10
retries=0
while [ $retries -lt 1000 ]; do
if [ "$(curl -s --max-time "$TIMEOUT_SECONDS" -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" "$URL")" -eq 200 ]; then
exit 0
fi
echo "Waiting for image to be available..."
retries=$((retries + 1))
sleep 5
done
exit 1

View File

@ -1,30 +0,0 @@
[
{
"test_name": "latency_llama8B_tp1",
"environment_variables": {
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"load_format": "dummy",
"num_iters_warmup": 5,
"num_iters": 15
}
},
{
"test_name": "latency_llama8B_tp4",
"environment_variables": {
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 4,
"load_format": "dummy",
"num_iters_warmup": 5,
"num_iters": 15
}
}
]

View File

@ -1,32 +0,0 @@
[
{
"test_name": "throughput_llama8B_tp1",
"environment_variables": {
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"load_format": "dummy",
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200,
"backend": "vllm"
}
},
{
"test_name": "throughput_llama8B_tp4",
"environment_variables": {
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 4,
"load_format": "dummy",
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200,
"backend": "vllm"
}
}
]

View File

@ -2,40 +2,23 @@
## Introduction
This directory contains two sets of benchmark for vllm.
- Performance benchmark: benchmark vllm's performance under various workload, for **developers** to gain clarity on whether their PR improves/degrades vllm's performance
- Nightly benchmark: compare vllm's performance against alternatives (tgi, trt-llm and lmdeploy), for **the public** to know when to choose vllm.
See [vLLM performance dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm) for the latest performance benchmark results and [vLLM GitHub README](https://github.com/vllm-project/vllm/blob/main/README.md) for latest nightly benchmark results.
This directory contains a benchmarking suite for **developers** to run locally and gain clarity on whether their PR improves/degrades vllm's performance.
vLLM also maintains a continuous performance benchmark under [perf.vllm.ai](https://perf.vllm.ai/), hosted under PyTorch CI HUD.
## Performance benchmark quick overview
**Benchmarking Coverage**: latency, throughput and fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!) and Intel® Xeon® Processors, with different models.
**Benchmarking Coverage**: latency, throughput and fix-qps serving on B200, A100, H100 and Intel® Xeon® Processors, with different models.
**Benchmarking Duration**: about 1hr.
**For benchmarking developers**: please try your best to constraint the duration of benchmarking to about 1 hr so that it won't take forever to run.
## Nightly benchmark quick overview
**Benchmarking Coverage**: Fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!) on Llama-3 8B, 70B and Mixtral 8x7B.
**Benchmarking engines**: vllm, TGI, trt-llm and lmdeploy.
**Benchmarking Duration**: about 3.5hrs.
## Trigger the benchmark
Performance benchmark will be triggered when:
- A PR being merged into vllm.
- Every commit for those PRs with `perf-benchmarks` label AND `ready` label.
Manually Trigger the benchmark
The benchmark needs to be triggered manually:
```bash
bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
bash .buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
```
Runtime environment variables:
@ -47,10 +30,6 @@ Runtime environment variables:
- `REMOTE_HOST`: IP for the remote vLLM service to benchmark. Default value is empty string.
- `REMOTE_PORT`: Port for the remote vLLM service to benchmark. Default value is empty string.
Nightly benchmark will be triggered when:
- Every commit for those PRs with `perf-benchmarks` label and `nightly-benchmarks` label.
## Performance benchmark details
See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.
@ -152,26 +131,3 @@ Here is an example using the script to compare result_a and result_b with Model,
A comparison diagram will be generated below the table.
Here is an example to compare between 96c/results_gnr_96c_091_tp2pp3 and 128c/results_gnr_128c_091_tp2pp3
<img width="1886" height="828" alt="image" src="https://github.com/user-attachments/assets/c02a43ef-25d0-4fd6-90e5-2169a28682dd" />
## Nightly test details
See [nightly-descriptions.md](nightly-descriptions.md) for the detailed description on test workload, models and docker containers of benchmarking other llm engines.
### Workflow
- The [nightly-pipeline.yaml](nightly-pipeline.yaml) specifies the docker containers for different LLM serving engines.
- Inside each container, we run [scripts/run-nightly-benchmarks.sh](scripts/run-nightly-benchmarks.sh), which will probe the serving engine of the current container.
- The `scripts/run-nightly-benchmarks.sh` will parse the workload described in [nightly-tests.json](tests/nightly-tests.json) and launch the right benchmark for the specified serving engine via `scripts/launch-server.sh`.
- At last, we run [scripts/summary-nightly-results.py](scripts/summary-nightly-results.py) to collect and plot the final benchmarking results, and update the results to buildkite.
### Nightly tests
In [nightly-tests.json](tests/nightly-tests.json), we include the command line arguments for benchmarking commands, together with the benchmarking test cases. The format is highly similar to performance benchmark.
### Docker containers
The docker containers for benchmarking are specified in `nightly-pipeline.yaml`.
WARNING: the docker versions are HARD-CODED and SHOULD BE ALIGNED WITH `nightly-descriptions.md`. The docker versions need to be hard-coded as there are several version-specific bug fixes inside `scripts/run-nightly-benchmarks.sh` and `scripts/launch-server.sh`.
WARNING: populating `trt-llm` to latest version is not easy, as it requires updating several protobuf files in [tensorrt-demo](https://github.com/neuralmagic/tensorrt-demo.git).

View File

@ -7,6 +7,7 @@ from importlib import util
import pandas as pd
pd.options.display.float_format = "{:.2f}".format
plotly_found = util.find_spec("plotly.express") is not None
@ -109,7 +110,10 @@ def compare_data_columns(
if len(compare_frames) >= 2:
base = compare_frames[0]
current = compare_frames[-1]
ratio = current / base
if "P99" in data_column or "Median" in data_column:
ratio = base / current # for latency
else:
ratio = current / base
ratio = ratio.mask(base == 0) # avoid inf when baseline is 0
ratio.name = f"Ratio 1 vs {len(compare_frames)}"
frames.append(ratio)
@ -199,6 +203,71 @@ def split_json_by_tp_pp(
return saved_paths
def _add_limit_line(fig, y_value, label):
# Visible dashed line + annotation
fig.add_hline(
y=y_value,
line_dash="dash",
line_color="red" if "ttft" in label.lower() else "blue",
annotation_text=f"{label}: {y_value} ms",
annotation_position="top left",
)
# Optional: add a legend item (as a transparent helper trace)
if plot and plotly_found:
import plotly.graph_objects as go
fig.add_trace(
go.Scatter(
x=[None],
y=[None],
mode="lines",
line=dict(
dash="dash", color="red" if "ttft" in label.lower() else "blue"
),
name=f"{label}",
)
)
def _find_concurrency_col(df: pd.DataFrame) -> str:
for c in [
"# of max concurrency.",
"# of max concurrency",
"Max Concurrency",
"max_concurrency",
"Concurrency",
]:
if c in df.columns:
return c
# Fallback: guess an integer-like column (harmless if unused)
for c in df.columns:
if df[c].dtype.kind in "iu" and df[c].nunique() > 1 and df[c].min() >= 1:
return c
return "# of max concurrency."
def _highlight_threshold(
df: pd.DataFrame, threshold: float
) -> "pd.io.formats.style.Styler":
"""Highlight numeric per-configuration columns with value <= threshold."""
conc_col = _find_concurrency_col(df)
key_cols = [
c
for c in ["Model", "Dataset Name", "Input Len", "Output Len", conc_col]
if c in df.columns
]
conf_cols = [
c for c in df.columns if c not in key_cols and not str(c).startswith("Ratio")
]
conf_cols = [c for c in conf_cols if pd.api.types.is_numeric_dtype(df[c])]
return df.style.map(
lambda v: "background-color:#e6ffe6;font-weight:bold;"
if pd.notna(v) and v <= threshold
else "",
subset=conf_cols,
)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
@ -220,6 +289,26 @@ if __name__ == "__main__":
default="# of max concurrency.",
help="column name to use as X Axis in comparison graph",
)
parser.add_argument(
"-l",
"--latency",
type=str,
default="p99",
help="take median|p99 for latency like TTFT/TPOT",
)
parser.add_argument(
"--ttft-max-ms",
type=float,
default=3000.0,
help="Reference limit for TTFT plots (ms)",
)
parser.add_argument(
"--tpot-max-ms",
type=float,
default=100.0,
help="Reference limit for TPOT plots (ms)",
)
args = parser.parse_args()
drop_column = "P99"
@ -234,12 +323,22 @@ if __name__ == "__main__":
"# of max concurrency.",
"qps",
]
data_cols_to_compare = ["Output Tput (tok/s)", "Median TTFT (ms)", "Median"]
html_msgs_for_data_cols = [
"Compare Output Tokens /n",
"Median TTFT /n",
"Median TPOT /n",
]
if "median" in args.latency:
data_cols_to_compare = ["Output Tput (tok/s)", "Median TTFT (ms)", "Median"]
html_msgs_for_data_cols = [
"Compare Output Tokens /n",
"Median TTFT /n",
"Median TPOT /n",
]
drop_column = "P99"
elif "p99" in args.latency:
data_cols_to_compare = ["Output Tput (tok/s)", "P99 TTFT (ms)", "P99"]
html_msgs_for_data_cols = [
"Compare Output Tokens /n",
"P99 TTFT /n",
"P99 TPOT /n",
]
if len(args.file) == 1:
files = split_json_by_tp_pp(args.file[0], output_root="splits")
@ -275,33 +374,83 @@ if __name__ == "__main__":
f"Expected subset: {filtered_info_cols}, "
f"but DataFrame has: {list(output_df.columns)}"
)
output_df_sorted = output_df.sort_values(by=existing_group_cols)
# output_df_sorted = output_df.sort_values(by=existing_group_cols)
output_df_sorted = output_df.sort_values(by=args.xaxis)
output_groups = output_df_sorted.groupby(existing_group_cols, dropna=False)
for name, group in output_groups:
html = group.to_html()
group_name = (
",".join(map(str, name)).replace(",", "_").replace("/", "-")
)
group_html_name = "perf_comparison_" + group_name + ".html"
metric_name = str(data_cols_to_compare[i]).lower()
if "tok/s" in metric_name:
html = group.to_html()
elif "ttft" in metric_name:
styler = _highlight_threshold(group, args.ttft_max_ms).format(
{c: "{:.2f}" for c in group.select_dtypes("number").columns},
na_rep="",
)
html = styler.to_html(
table_attributes='border="1" class="dataframe"'
)
elif (
"tpot" in metric_name
or "median" in metric_name
or "p99" in metric_name
):
styler = _highlight_threshold(group, args.tpot_max_ms).format(
{c: "{:.2f}" for c in group.select_dtypes("number").columns},
na_rep="",
)
html = styler.to_html(
table_attributes='border="1" class="dataframe"'
)
text_file.write(html_msgs_for_data_cols[i])
text_file.write(html)
with open(group_html_name, "a+") as sub_text_file:
sub_text_file.write(html_msgs_for_data_cols[i])
sub_text_file.write(html)
if plot and plotly_found:
import plotly.express as px
if plot and plotly_found:
import plotly.express as px
df = group[raw_data_cols]
df_sorted = df.sort_values(by=info_cols[y_axis_index])
# Melt DataFrame for plotting
df_melted = df_sorted.melt(
id_vars=info_cols[y_axis_index],
var_name="Configuration",
value_name=data_cols_to_compare[i],
)
title = data_cols_to_compare[i] + " vs " + info_cols[y_axis_index]
# Create Plotly line chart
fig = px.line(
df_melted,
x=info_cols[y_axis_index],
y=data_cols_to_compare[i],
color="Configuration",
title=title,
markers=True,
)
# Export to HTML
text_file.write(fig.to_html(full_html=True, include_plotlyjs="cdn"))
df = group[raw_data_cols]
df_sorted = df.sort_values(by=info_cols[y_axis_index])
# Melt DataFrame for plotting
df_melted = df_sorted.melt(
id_vars=info_cols[y_axis_index],
var_name="Configuration",
value_name=data_cols_to_compare[i],
)
title = (
data_cols_to_compare[i] + " vs " + info_cols[y_axis_index]
)
# Create Plotly line chart
fig = px.line(
df_melted,
x=info_cols[y_axis_index],
y=data_cols_to_compare[i],
color="Configuration",
title=title,
markers=True,
)
# ---- Add threshold lines based on metric name ----
if "ttft" in metric_name:
_add_limit_line(fig, args.ttft_max_ms, "TTFT limit")
elif (
"tpot" in metric_name
or "median" in metric_name
or "p99" in metric_name
):
_add_limit_line(fig, args.tpot_max_ms, "TPOT limit")
# Export to HTML
text_file.write(
fig.to_html(full_html=True, include_plotlyjs="cdn")
)
sub_text_file.write(
fig.to_html(full_html=True, include_plotlyjs="cdn")
)

View File

@ -63,9 +63,11 @@ serving_column_mapping = {
"mean_ttft_ms": "Mean TTFT (ms)",
"median_ttft_ms": "Median TTFT (ms)",
"p99_ttft_ms": "P99 TTFT (ms)",
"std_ttft_ms": "STD TTFT (ms)",
"mean_tpot_ms": "Mean TPOT (ms)",
"median_tpot_ms": "Median",
"p99_tpot_ms": "P99",
"std_tpot_ms": "STD TPOT (ms)",
"mean_itl_ms": "Mean ITL (ms)",
"median_itl_ms": "Median ITL (ms)",
"p99_itl_ms": "P99 ITL (ms)",
@ -368,7 +370,7 @@ if __name__ == "__main__":
# The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...",
# we want to turn it into "8xGPUTYPE"
df["GPU"] = df["GPU"].apply(
lambda x: f"{len(x.splitlines())}x{x.splitlines()[0]}"
lambda x: "{}x{}".format(len(x.split("\n")), x.split("\n")[0])
)
# get markdown tables
@ -390,7 +392,7 @@ if __name__ == "__main__":
json_file = "benchmark_results.json"
with open(results_folder / md_file, "w") as f:
results = read_markdown(
"../.buildkite/nightly-benchmarks/"
"../.buildkite/performance-benchmarks/"
+ "performance-benchmarks-descriptions.md"
)
results = results.format(

View File

@ -469,7 +469,12 @@ main() {
ensure_sharegpt_downloaded
declare -g RESULTS_FOLDER=results/
mkdir -p $RESULTS_FOLDER
QUICK_BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
QUICK_BENCHMARK_ROOT=../.buildkite/performance-benchmarks/
# dump vllm info via vllm collect-env
env_output=$(vllm collect-env)
echo "$env_output" >"$RESULTS_FOLDER/vllm_env.txt"
# benchmarking
run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}"

View File

@ -0,0 +1,26 @@
[
{
"test_name": "latency_llama8B_tp2",
"environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"num_iters_warmup": 5,
"num_iters": 15
}
}
]

View File

@ -95,6 +95,38 @@
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_bf16_tp4_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 4,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_bf16_tp2pp3_sharegpt",
"qps_list": ["inf"],
@ -233,6 +265,41 @@
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_bf16_tp4_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 4,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_bf16_tp2pp3_random_128_128",
"qps_list": ["inf"],
@ -365,6 +432,38 @@
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_int8_tp4_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 4,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_int8_tp2pp3_sharegpt",
"qps_list": ["inf"],
@ -503,6 +602,41 @@
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_int8_tp4_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"tensor_parallel_size": 4,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_int8_tp2pp3_random_128_128",
"qps_list": ["inf"],
@ -638,6 +772,39 @@
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_int4_tp4_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"quantization": "awq",
"tensor_parallel_size": 4,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_int4_tp2pp3_sharegpt",
"qps_list": ["inf"],
@ -780,6 +947,42 @@
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_int4_tp4_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"quantization": "awq",
"tensor_parallel_size": 4,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_int4_tp2pp3_random_128_128",
"qps_list": ["inf"],

View File

@ -2,7 +2,7 @@
{
"test_name": "serving_llama8B_tp1_sharegpt",
"qps_list": [1, 4, 16, "inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"max_concurrency_list": [32],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@ -28,13 +28,13 @@
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
"num_prompts": 32
}
},
{
"test_name": "serving_llama8B_tp2_sharegpt",
"qps_list": [1, 4, 16, "inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"max_concurrency_list": [32],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@ -60,13 +60,13 @@
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
"num_prompts": 32
}
},
{
"test_name": "serving_llama8B_tp4_sharegpt",
"test_name": "serving_llama8B_tp1_random_128_128",
"qps_list": [1, 4, 16, "inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"max_concurrency_list": [32],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@ -76,39 +76,7 @@
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 4,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_tp4_random_1024_128",
"qps_list": [1, 4, 16, "inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 4,
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
@ -124,16 +92,16 @@
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 1024,
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 100
"num_prompts": 32
}
},
{
"test_name": "serving_llama8B_pp6_random_1024_128",
"test_name": "serving_llama8B_tp2_random_128_128",
"qps_list": [1, 4, 16, "inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"max_concurrency_list": [32],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@ -143,7 +111,7 @@
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"pipeline_parallel_size": 6,
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
@ -159,10 +127,150 @@
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 1024,
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 100
"num_prompts": 32
}
},
{
"test_name": "serving_llama8B_tp1_random_128_2048",
"qps_list": [1, 4, 16, "inf"],
"max_concurrency_list": [32],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 2048,
"ignore-eos": "",
"num_prompts": 32
}
},
{
"test_name": "serving_llama8B_tp2_random_128_2048",
"qps_list": [1, 4, 16, "inf"],
"max_concurrency_list": [32],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 2048,
"ignore-eos": "",
"num_prompts": 32
}
},
{
"test_name": "serving_llama8B_tp1_random_2048_128",
"qps_list": [1, 4, 16, "inf"],
"max_concurrency_list": [32],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 32
}
},
{
"test_name": "serving_llama8B_tp2_random_2048_128",
"qps_list": [1, 4, 16, "inf"],
"max_concurrency_list": [32],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 2048,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 32
}
}
]

View File

@ -0,0 +1,27 @@
[
{
"test_name": "throughput_llama8B_tp2",
"environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200,
"backend": "vllm"
}
}
]

View File

@ -1,5 +1,5 @@
steps:
# aarch64 + CUDA builds. PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9
# aarch64 + CUDA builds
- label: "Build arm64 wheel - CUDA 12.9"
depends_on: ~
id: build-wheel-arm64-cuda-12-9
@ -8,13 +8,28 @@ steps:
commands:
# #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh"
env:
DOCKER_BUILDKIT: "1"
# aarch64 build
- label: "Build arm64 CPU wheel"
depends_on: ~
id: build-wheel-arm64-cpu
agents:
queue: arm64_cpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh"
env:
DOCKER_BUILDKIT: "1"
# x86 + CUDA builds
- label: "Build wheel - CUDA 12.8"
depends_on: ~
id: build-wheel-cuda-12-8
@ -28,20 +43,6 @@ steps:
env:
DOCKER_BUILDKIT: "1"
- label: "Build wheel - CUDA 12.6"
depends_on: ~
id: build-wheel-cuda-12-6
agents:
queue: cpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.6.3 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh"
env:
DOCKER_BUILDKIT: "1"
# x86 + CUDA builds
- label: "Build wheel - CUDA 12.9"
depends_on: ~
id: build-wheel-cuda-12-9
@ -55,6 +56,20 @@ steps:
env:
DOCKER_BUILDKIT: "1"
- label: "Build wheel - CUDA 13.0"
depends_on: ~
id: build-wheel-cuda-13-0
agents:
queue: cpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh"
env:
DOCKER_BUILDKIT: "1"
# Build release images (12.9)
- label: "Build release image (x86)"
depends_on: ~
id: build-release-image-x86
@ -62,13 +77,12 @@ steps:
queue: cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
# re-tag to default image tag and push, just in case arm64 build fails
- "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
# PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9
- label: "Build release image (arm64)"
depends_on: ~
id: build-release-image-arm64
@ -76,7 +90,7 @@ steps:
queue: arm64_cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
# Add job to create multi-arch manifest
@ -142,6 +156,22 @@ steps:
env:
DOCKER_BUILDKIT: "1"
- block: "Build arm64 CPU release image"
key: block-arm64-cpu-release-image-build
depends_on: ~
- label: "Build and publish arm64 CPU release image"
depends_on: block-arm64-cpu-release-image-build
agents:
queue: arm64_cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest"
- "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
env:
DOCKER_BUILDKIT: "1"
- label: "Build and publish nightly multi-arch image to DockerHub"
depends_on:
- create-multi-arch-manifest

View File

@ -70,7 +70,7 @@ function cpu_tests() {
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pytest -x -s -v \
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]"
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs"
# Note: disable it until supports V1
# Run AWQ test

View File

@ -20,7 +20,10 @@ trap remove_docker_container EXIT
# Run the image and test offline inference/tensor parallel
docker run \
--device /dev/dri \
--device /dev/dri:/dev/dri \
--net=host \
--ipc=host \
--privileged \
-v /dev/dri/by-path:/dev/dri/by-path \
--entrypoint="" \
-e "HF_TOKEN=${HF_TOKEN}" \
@ -42,7 +45,7 @@ docker run \
pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
pytest -v -s v1/structured_output
pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py
pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py
pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py
pytest -v -s v1/test_serial_utils.py
'

View File

@ -58,33 +58,25 @@ python3 .buildkite/generate_index.py --wheel "$normal_wheel"
aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
if [[ $normal_wheel == *"cu126"* ]]; then
# if $normal_wheel matches cu126, do not upload the index.html
echo "Skipping index files for cu126 wheels"
elif [[ $normal_wheel == *"cu128"* ]]; then
# if $normal_wheel matches cu128, do not upload the index.html
echo "Skipping index files for cu128 wheels"
else
if [[ $normal_wheel == *"cu129"* ]]; then
# only upload index.html for cu129 wheels (default wheels) as it
# is available on both x86 and arm64
aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
else
echo "Skipping index files for non-cu129 wheels"
fi
# generate index for nightly
aws s3 cp "$wheel" "s3://vllm-wheels/nightly/"
aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
if [[ $normal_wheel == *"cu126"* ]]; then
# if $normal_wheel matches cu126, do not upload the index.html
echo "Skipping index files for cu126 wheels"
elif [[ $normal_wheel == *"cu128"* ]]; then
# if $normal_wheel matches cu128, do not upload the index.html
echo "Skipping index files for cu128 wheels"
else
if [[ $normal_wheel == *"cu129"* ]]; then
# only upload index.html for cu129 wheels (default wheels) as it
# is available on both x86 and arm64
aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
else
echo "Skipping index files for non-cu129 wheels"
fi
aws s3 cp "$wheel" "s3://vllm-wheels/$version/"

1328
.buildkite/test-amd.yaml Normal file

File diff suppressed because it is too large Load Diff

View File

@ -38,7 +38,7 @@ steps:
- label: Pytorch Nightly Dependency Override Check # 2min
# if this test fails, it means the nightly torch version is not compatible with some
# of the dependencies. Please check the error message and add the package to whitelist
# in /vllm/tools/generate_nightly_torch_test.py
# in /vllm/tools/pre_commit/generate_nightly_torch_test.py
soft_fail: true
source_file_dependencies:
- requirements/nightly_torch_test.txt
@ -172,6 +172,8 @@ steps:
- tests/v1/engine/test_engine_core_client.py
- tests/distributed/test_symm_mem_allreduce.py
commands:
# https://github.com/NVIDIA/nccl/issues/1838
- export NCCL_CUMEM_HOST_ENABLE=0
# test with torchrun tp=2 and external_dp=2
- torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
# test with torchrun tp=2 and pp=2
@ -203,6 +205,24 @@ steps:
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
- popd
- label: Distributed Tests (8 GPUs) # 4min
timeout_in_minutes: 10
gpu: h100
num_gpus: 8
working_dir: "/vllm-workspace/tests"
source_file_dependencies:
- examples/offline_inference/torchrun_dp_example.py
- vllm/config/parallel.py
- vllm/distributed/
- vllm/v1/engine/llm_engine.py
- vllm/v1/executor/uniproc_executor.py
- vllm/v1/worker/gpu_worker.py
commands:
# https://github.com/NVIDIA/nccl/issues/1838
- export NCCL_CUMEM_HOST_ENABLE=0
# test with torchrun tp=2 and dp=4 with ep
- torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
- label: EPLB Algorithm Test # 5min
timeout_in_minutes: 15
working_dir: "/vllm-workspace/tests"
@ -311,6 +331,26 @@ steps:
- pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
- label: V1 Test attention (H100) # 10min
timeout_in_minutes: 30
gpu: h100
source_file_dependencies:
- vllm/v1/attention
- tests/v1/attention
commands:
- pytest -v -s v1/attention
- label: Batch Invariance Tests (H100) # 10min
timeout_in_minutes: 25
gpu: h100
source_file_dependencies:
- vllm/
- tests/v1/generation/
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pip install pytest-timeout pytest-forked
- pytest -v -s v1/generation
- label: V1 Test others (CPU) # 5 mins
source_file_dependencies:
- vllm/
@ -349,7 +389,8 @@ steps:
- python3 offline_inference/basic/embed.py
- python3 offline_inference/basic/score.py
- python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
- python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
# https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
- python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
- label: Platform Tests (CUDA) # 4min
timeout_in_minutes: 15
@ -384,7 +425,12 @@ steps:
--num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
--ignore=lora/test_chatglm3_tp.py \
--ignore=lora/test_llama_tp.py \
--ignore=lora/test_llm_with_multi_loras.py
--ignore=lora/test_llm_with_multi_loras.py \
--ignore=lora/test_olmoe_tp.py \
--ignore=lora/test_deepseekv2_tp.py \
--ignore=lora/test_gptoss.py \
--ignore=lora/test_qwen3moe_tp.py
parallelism: 4
- label: PyTorch Compilation Unit Tests # 15min
@ -416,8 +462,8 @@ steps:
- pytest -v -s compile/test_basic_correctness.py
- pytest -v -s compile/piecewise/
- label: PyTorch Fullgraph Test # 20min
timeout_in_minutes: 30
- label: PyTorch Fullgraph Test # 22min
timeout_in_minutes: 35
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
@ -425,6 +471,19 @@ steps:
- tests/compile
commands:
- pytest -v -s compile/test_full_graph.py
- pytest -v -s compile/test_fusions_e2e.py
- label: Cudagraph test
timeout_in_minutes: 20
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- tests/v1/cudagraph
- vllm/v1/cudagraph_dispatcher.py
- vllm/config/compilation.py
- vllm/compilation
commands:
- pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py
- pytest -v -s v1/cudagraph/test_cudagraph_mode.py
- label: Kernels Core Operation Test # 48min
timeout_in_minutes: 75
@ -468,6 +527,8 @@ steps:
- tests/kernels/moe
- vllm/model_executor/layers/fused_moe/
- vllm/distributed/device_communicators/
- vllm/envs.py
- vllm/config
commands:
- pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
parallelism: 2
@ -527,8 +588,9 @@ steps:
# since torchao nightly is only compatible with torch nightly currently
# https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
# we can only upgrade after this is resolved
- pip install --pre torchao==0.13.0.dev20250814 --index-url https://download.pytorch.org/whl/nightly/cu128
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/
# TODO(jerryzh168): resolve the above comment
- uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
- label: LM Eval Small Models # 53min
timeout_in_minutes: 75
@ -677,8 +739,10 @@ steps:
- vllm/
- tests/models/language/generation
commands:
# Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
- pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
# Install fast path packages for testing against transformers
# Note: also needed to run plamo2 model in vLLM
- uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
- uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
- pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
- label: Language Models Test (PPL)
@ -733,6 +797,16 @@ steps:
- pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
- label: Multi-Modal Accuracy Eval (Small Models) # 50min
timeout_in_minutes: 70
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies:
- vllm/multimodal/
- vllm/inputs/
- vllm/v1/core/
commands:
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
- label: Multi-Modal Models Test (Extended) 1
mirror_hardwares: [amdexperimental]
optional: true
@ -796,8 +870,8 @@ steps:
# Whisper needs spawn method to avoid deadlock
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
- label: Blackwell Test # 38 min
timeout_in_minutes: 60
- label: Blackwell Test # 21 min
timeout_in_minutes: 30
working_dir: "/vllm-workspace/"
gpu: b200
# optional: true
@ -810,8 +884,6 @@ steps:
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
- vllm/v1/attention/backends/flashinfer.py
- vllm/compilation/fusion.py
- vllm/compilation/fusion_attn.py
commands:
- nvidia-smi
- python3 examples/offline_inference/basic/chat.py
@ -828,15 +900,32 @@ steps:
- pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
- pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
- pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
# Fusion
- pytest -v -s tests/compile/test_fusion_all_reduce.py
- pytest -v -s tests/compile/test_fusion_attn.py::test_attention_quant_pattern
- pytest -v -s tests/kernels/moe/test_flashinfer.py
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
- pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
- pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
- pytest -v -s tests/kernels/moe/test_flashinfer.py
- label: Blackwell Fusion Tests # 30 min
timeout_in_minutes: 40
working_dir: "/vllm-workspace/"
gpu: b200
source_file_dependencies:
- csrc/quantization/fp4/
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
- vllm/v1/attention/backends/flashinfer.py
- vllm/compilation/
# can affect pattern matching
- vllm/model_executor/layers/layernorm.py
- vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py
commands:
- nvidia-smi
- pytest -v -s tests/compile/test_fusion_attn.py
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
# this runner has 2 GPUs available even though num_gpus=2 is not set
- pytest -v -s tests/compile/test_fusion_all_reduce.py
- pytest -v -s tests/compile/test_fusions_e2e.py
- label: Blackwell GPT-OSS Eval
timeout_in_minutes: 60
@ -943,6 +1032,8 @@ steps:
- tests/v1/shutdown
- tests/v1/worker/test_worker_memory_snapshot.py
commands:
# https://github.com/NVIDIA/nccl/issues/1838
- export NCCL_CUMEM_HOST_ENABLE=0
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
- DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
@ -950,6 +1041,7 @@ steps:
- pytest -v -s ./compile/test_basic_correctness.py
- pytest -v -s ./compile/test_wrapper.py
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
- VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
- pytest -v -s distributed/test_sequence_parallel.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
- pytest -v -s v1/worker/test_worker_memory_snapshot.py
@ -993,6 +1085,11 @@ steps:
- pytest -v -s plugins_tests/test_io_processor_plugins.py
- pip uninstall prithvi_io_processor_plugin -y
# end io_processor plugins test
# begin stat_logger plugins test
- pip install -e ./plugins/vllm_add_dummy_stat_logger
- pytest -v -s plugins_tests/test_stats_logger_plugins.py
- pip uninstall dummy_stat_logger -y
# end stat_logger plugins test
# other tests continue here:
- pytest -v -s plugins_tests/test_scheduler_plugins.py
- pip install -e ./plugins/vllm_add_dummy_model
@ -1032,6 +1129,7 @@ steps:
- pytest -v -s -x lora/test_chatglm3_tp.py
- pytest -v -s -x lora/test_llama_tp.py
- pytest -v -s -x lora/test_llm_with_multi_loras.py
- pytest -v -s -x lora/test_olmoe_tp.py
- label: Weight Loading Multiple GPU Test # 33min
@ -1058,6 +1156,17 @@ steps:
commands:
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
- label: NixlConnector PD accuracy tests (Distributed) # 30min
timeout_in_minutes: 30
working_dir: "/vllm-workspace/tests"
num_gpus: 4
source_file_dependencies:
- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
- tests/v1/kv_connector/nixl_integration/
commands:
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
- bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh
##### multi gpus test #####
##### A100 test #####
@ -1088,8 +1197,21 @@ steps:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
##### H100 test #####
- label: LM Eval Large Models (H100) # optional
gpu: h100
optional: true
num_gpus: 4
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
commands:
- export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
##### H200 test #####
- label: Distrubted Tests (H200) # optional
- label: Distributed Tests (H200) # optional
gpu: h200
optional: true
working_dir: "/vllm-workspace/"
@ -1097,6 +1219,8 @@ steps:
commands:
- pytest -v -s tests/compile/test_async_tp.py
- pytest -v -s tests/compile/test_sequence_parallelism.py
- pytest -v -s tests/compile/test_fusion_all_reduce.py
- pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
- pytest -v -s tests/distributed/test_context_parallel.py
- CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048

View File

@ -1,5 +1,10 @@
[run]
source = vllm
# Track the installed vllm package (this is what actually gets imported during tests)
# Use wildcard pattern to match the installed location
source =
vllm
*/dist-packages/vllm
*/site-packages/vllm
omit =
*/tests/*
*/test_*
@ -12,6 +17,16 @@ omit =
*/benchmarks/*
*/docs/*
[paths]
# Map all possible vllm locations to a canonical "vllm" path
# This ensures coverage.combine properly merges data from different test runs
source =
vllm
/vllm-workspace/src/vllm
/vllm-workspace/vllm
*/site-packages/vllm
*/dist-packages/vllm
[report]
exclude_lines =
pragma: no cover

4
.git-blame-ignore-revs Normal file
View File

@ -0,0 +1,4 @@
# Migrate from `yapf` & `isort` to `ruff`
d6953beb91da4e9c99be4c0a1304a2d24189535c
# Convert `Optional[x]` to `x | None` and `Union[x, y]` to `x | y`
8fcaaf6a165e661f63fc51be906bc05b0767332f

14
.github/CODEOWNERS vendored
View File

@ -5,10 +5,8 @@
/vllm/attention @LucasWilkinson
/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
/vllm/model_executor/layers/fused_moe @mgoin
/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @NickLucche
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256
/vllm/model_executor/layers/fused_moe @mgoin @pavanimajety
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety
/vllm/model_executor/layers/mamba @tdoublep
/vllm/model_executor/model_loader @22quinn
/vllm/multimodal @DarkLight1337 @ywang96 @NickLucche
@ -26,9 +24,9 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
/vllm/config/cache.py @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg @heheda12345
# vLLM V1
/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
/vllm/v1/attention @LucasWilkinson
/vllm/v1/attention/backends/flashinfer.py @mgoin
/vllm/v1/attention/backends/mla @pavanimajety
/vllm/v1/attention/backends/flashinfer.py @mgoin @pavanimajety
/vllm/v1/attention/backends/triton_attn.py @tdoublep
/vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC
/vllm/v1/sample @22quinn @houseroad @njhill
@ -47,7 +45,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
/tests/kernels @mgoin @tlrmchlsmth @WoosukKwon @yewentao256
/tests/models @DarkLight1337 @ywang96
/tests/multimodal @DarkLight1337 @ywang96 @NickLucche
/tests/quantization @mgoin @robertgshaw2-redhat @yewentao256
/tests/quantization @mgoin @robertgshaw2-redhat @yewentao256 @pavanimajety
/tests/test_inputs.py @DarkLight1337 @ywang96
/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
/tests/v1/structured_output @mgoin @russellb @aarnphm
@ -60,7 +58,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
/tests/v1/offloading @ApostaC
# Transformers backend
/vllm/model_executor/models/transformers.py @hmellor
/vllm/model_executor/models/transformers @hmellor
/tests/models/test_transformers.py @hmellor
# Docs

2
.github/mergify.yml vendored
View File

@ -108,7 +108,7 @@ pull_request_rules:
- files~=^benchmarks/
- files~=^vllm/benchmarks/
- files~=^tests/benchmarks/
- files~=^\.buildkite/nightly-benchmarks/
- files~=^\.buildkite/performance-benchmarks/
actions:
label:
add:

View File

@ -13,6 +13,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Label issues based on keywords
id: label-step
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with:
script: |
@ -42,7 +43,6 @@ jobs:
searchIn: "body"
},
],
// Substring search - matches anywhere in text (partial matches)
substrings: [
{
@ -89,14 +89,12 @@ jobs:
term: "hip_",
searchIn: "both"
},
// ROCm tools and libraries
{
term: "hipify",
searchIn: "both"
},
],
// Regex patterns - for complex pattern matching
regexPatterns: [
{
@ -107,13 +105,17 @@ jobs:
}
],
},
// Add more label configurations here as needed
// example: {
// keywords: [...],
// substrings: [...],
// regexPatterns: [...]
// },
};
// Helper function to create regex based on search type
function createSearchRegex(term, type) {
// Escape special regex characters in the term
const escapedTerm = term.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
switch (type) {
case 'keyword':
// Word boundary search - matches whole words only
@ -125,16 +127,13 @@ jobs:
throw new Error(`Unknown search type: ${type}`);
}
}
// Helper function to find matching terms in text with line information
function findMatchingTermsWithLines(text, searchTerms = [], searchType = 'keyword', searchLocation = '') {
const matches = [];
const lines = text.split('\n');
for (const termConfig of searchTerms) {
let regex;
let term, searchIn, pattern, description, flags;
// Handle different input formats (string or object)
if (typeof termConfig === 'string') {
term = termConfig;
@ -146,21 +145,17 @@ jobs:
description = termConfig.description;
flags = termConfig.flags;
}
// Skip if this term shouldn't be searched in the current location
if (searchIn !== 'both' && searchIn !== searchLocation) {
continue;
}
// Create appropriate regex
if (searchType === 'regex') {
regex = new RegExp(pattern, flags || "gi");
} else {
regex = createSearchRegex(term, searchType);
}
const termMatches = [];
// Check each line for matches
lines.forEach((line, lineIndex) => {
const lineMatches = line.match(regex);
@ -175,15 +170,14 @@ jobs:
originalTerm: term || pattern,
description: description,
// Show context around the match in the line
context: line.length > 100 ?
line.substring(Math.max(0, line.toLowerCase().indexOf(match.toLowerCase()) - 30),
line.toLowerCase().indexOf(match.toLowerCase()) + match.length + 30) + '...'
context: line.length > 100 ?
line.substring(Math.max(0, line.toLowerCase().indexOf(match.toLowerCase()) - 30),
line.toLowerCase().indexOf(match.toLowerCase()) + match.length + 30) + '...'
: line.trim()
});
});
}
});
if (termMatches.length > 0) {
matches.push({
term: term || (description || pattern),
@ -196,64 +190,48 @@ jobs:
});
}
}
return matches;
}
// Helper function to check if label should be added
async function processLabel(labelName, config) {
const body = context.payload.issue.body || "";
const title = context.payload.issue.title || "";
core.notice(`Processing label: ${labelName}`);
core.notice(`Issue Title: "${title}"`);
core.notice(`Issue Body length: ${body.length} characters`);
let shouldAddLabel = false;
let allMatches = [];
let reason = '';
const keywords = config.keywords || [];
const substrings = config.substrings || [];
const regexPatterns = config.regexPatterns || [];
core.notice(`Searching with ${keywords.length} keywords, ${substrings.length} substrings, and ${regexPatterns.length} regex patterns`);
// Search in title
if (title.trim()) {
core.notice(`Searching in title: "${title}"`);
const titleKeywordMatches = findMatchingTermsWithLines(title, keywords, 'keyword', 'title');
const titleSubstringMatches = findMatchingTermsWithLines(title, substrings, 'substring', 'title');
const titleRegexMatches = findMatchingTermsWithLines(title, regexPatterns, 'regex', 'title');
allMatches.push(...titleKeywordMatches, ...titleSubstringMatches, ...titleRegexMatches);
}
// Search in body
if (body.trim()) {
core.notice(`Searching in body (${body.length} characters)`);
const bodyKeywordMatches = findMatchingTermsWithLines(body, keywords, 'keyword', 'body');
const bodySubstringMatches = findMatchingTermsWithLines(body, substrings, 'substring', 'body');
const bodyRegexMatches = findMatchingTermsWithLines(body, regexPatterns, 'regex', 'body');
allMatches.push(...bodyKeywordMatches, ...bodySubstringMatches, ...bodyRegexMatches);
}
if (allMatches.length > 0) {
core.notice(`Found ${allMatches.length} matching term(s):`);
for (const termMatch of allMatches) {
const locationText = termMatch.searchLocation === 'title' ? 'title' : 'body';
const searchInText = termMatch.searchIn === 'both' ? 'both' : termMatch.searchIn;
if (termMatch.searchType === 'regex') {
core.notice(` 📍 Regex: "${termMatch.term}" (pattern: ${termMatch.pattern}) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`);
} else {
core.notice(` 📍 Term: "${termMatch.term}" (${termMatch.searchType} search) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`);
}
// Show details for each match
termMatch.matches.forEach((match, index) => {
core.notice(` ${index + 1}. Line ${match.lineNumber} in ${match.searchLocation}: "${match.match}" [${match.searchType}]`);
@ -266,7 +244,6 @@ jobs:
}
});
}
shouldAddLabel = true;
const totalMatches = allMatches.reduce((sum, t) => sum + t.count, 0);
const titleMatches = allMatches.filter(t => t.searchLocation === 'title').reduce((sum, t) => sum + t.count, 0);
@ -274,13 +251,10 @@ jobs:
const keywordMatches = allMatches.filter(t => t.searchType === 'keyword').reduce((sum, t) => sum + t.count, 0);
const substringMatches = allMatches.filter(t => t.searchType === 'substring').reduce((sum, t) => sum + t.count, 0);
const regexMatches = allMatches.filter(t => t.searchType === 'regex').reduce((sum, t) => sum + t.count, 0);
reason = `Found ${totalMatches} total matches (${titleMatches} in title, ${bodyMatches} in body) - ${keywordMatches} keyword matches, ${substringMatches} substring matches, ${regexMatches} regex matches`;
}
core.notice(`Final decision: ${shouldAddLabel ? 'ADD LABEL' : 'DO NOT ADD LABEL'}`);
core.notice(`Reason: ${reason || 'No matching terms found'}`);
if (shouldAddLabel) {
const existingLabels = context.payload.issue.labels.map(l => l.name);
if (!existingLabels.includes(labelName)) {
@ -296,14 +270,92 @@ jobs:
core.notice(`Label "${labelName}" already present.`);
return false;
}
core.notice(`No matching terms found for label "${labelName}".`);
return false;
}
// Process all configured labels
const processLabels = Object.entries(labelConfig)
.map(([labelName, config]) => processLabel(labelName, config));
const labelsAdded = await Promise.all(processLabels);
const numLabelsAdded = labelsAdded.reduce((x, y) => x + y, 0);
core.notice(`Processing complete. ${numLabelsAdded} label(s) added.`);
const labelsAddedResults = await Promise.all(
Object.entries(labelConfig).map(([labelName, config]) =>
processLabel(labelName, config).then(added => ({ labelName, added }))
)
);
const numLabelsAdded = labelsAddedResults.filter(r => r.added).length;
core.notice(`Processing complete. ${numLabelsAdded} label(s) added.`);
// Return which labels were added for the next step
const addedLabels = labelsAddedResults.filter(r => r.added).map(r => r.labelName);
core.setOutput('labels_added', JSON.stringify(addedLabels));
return addedLabels;
- name: CC users for labeled issues
if: steps.label-step.outputs.labels_added != '[]'
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with:
script: |
// Configuration: Map labels to GitHub users to CC
// You can add multiple users per label, and multiple label configurations
const ccConfig = {
rocm: {
users: ['hongxiayang', 'tjtanaa', 'vllmellm'], // Add more users as needed: ['user1', 'user2', 'user3']
message: 'CC {users} for ROCm-related issue' // {users} will be replaced with @mentions
},
// Add more label -> user mappings here
// Example:
// cuda: {
// users: ['user1', 'user2'],
// message: 'CC {users} for CUDA-related issue'
// },
// performance: {
// users: ['perfexpert'],
// message: 'CC {users} for performance issue'
// },
};
const labelsAdded = JSON.parse('${{ steps.label-step.outputs.labels_added }}');
core.notice(`Labels added: ${labelsAdded.join(', ')}`);
// Get existing comments to check for already mentioned users
const comments = await github.rest.issues.listComments({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: context.issue.number,
});
const issueBody = context.payload.issue.body || '';
const allExistingText = issueBody + '\n' + comments.data.map(c => c.body).join('\n');
// Process each label that was added
for (const label of labelsAdded) {
if (ccConfig[label]) {
const config = ccConfig[label];
const usersToMention = [];
// Check which users haven't been mentioned yet
for (const user of config.users) {
const mentionPattern = new RegExp(`@${user}\\b`, 'i');
if (!mentionPattern.test(allExistingText)) {
usersToMention.push(user);
} else {
core.notice(`@${user} already mentioned for label "${label}", skipping`);
}
}
// Post comment if there are users to mention
if (usersToMention.length > 0) {
const mentions = usersToMention.map(u => `@${u}`).join(' ');
const message = config.message.replace('{users}', mentions);
await github.rest.issues.createComment({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: context.issue.number,
body: message
});
core.notice(`CC comment added for label "${label}": ${mentions}`);
} else {
core.notice(`All users for label "${label}" already mentioned, skipping comment`);
}
}
}

3
.gitignore vendored
View File

@ -94,6 +94,9 @@ ipython_config.py
# generated files
**/generated/**
# uv
uv.lock
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:

View File

@ -4,7 +4,6 @@ MD013: false
MD024:
siblings_only: true
MD033: false
MD042: false
MD045: false
MD046: false
MD051: false

View File

@ -16,6 +16,7 @@ repos:
rev: v1.38.1
hooks:
- id: typos
args: [--force-exclude]
- repo: https://github.com/pre-commit/mirrors-clang-format
rev: v21.1.2
hooks:
@ -37,18 +38,18 @@ repos:
rev: 0.9.1
hooks:
- id: pip-compile
args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128, --python-platform, x86_64-manylinux_2_28]
args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu129, --python-platform, x86_64-manylinux_2_28]
files: ^requirements/test\.(in|txt)$
- repo: local
hooks:
- id: format-torch-nightly-test
name: reformat nightly_torch_test.txt to be in sync with test.in
language: python
entry: python tools/generate_nightly_torch_test.py
entry: python tools/pre_commit/generate_nightly_torch_test.py
files: ^requirements/test\.(in|txt)$
- id: mypy-local
name: Run mypy for local Python installation
entry: python tools/pre_commit/mypy.py 0 "local"
name: Run mypy locally for lowest supported Python version
entry: python tools/pre_commit/mypy.py 0 "3.10"
stages: [pre-commit] # Don't run in CI
<<: &mypy_common
language: python
@ -77,12 +78,12 @@ repos:
stages: [manual] # Only run in CI
- id: shellcheck
name: Lint shell scripts
entry: tools/shellcheck.sh
entry: tools/pre_commit/shellcheck.sh
language: script
types: [shell]
- id: png-lint
name: Lint PNG exports from excalidraw
entry: tools/png-lint.sh
entry: tools/pre_commit/png-lint.sh
language: script
types: [png]
- id: signoff-commit
@ -99,12 +100,12 @@ repos:
stages: [commit-msg]
- id: check-spdx-header
name: Check SPDX headers
entry: python tools/check_spdx_header.py
entry: python tools/pre_commit/check_spdx_header.py
language: python
types: [python]
- id: check-root-lazy-imports
name: Check root lazy imports
entry: python tools/check_init_lazy_imports.py
entry: python tools/pre_commit/check_init_lazy_imports.py
language: python
types: [python]
- id: check-filenames
@ -118,11 +119,11 @@ repos:
pass_filenames: false
- id: update-dockerfile-graph
name: Update Dockerfile dependency graph
entry: tools/update-dockerfile-graph.sh
entry: tools/pre_commit/update-dockerfile-graph.sh
language: script
- id: enforce-import-regex-instead-of-re
name: Enforce import regex as re
entry: python tools/enforce_regex_import.py
entry: python tools/pre_commit/enforce_regex_import.py
language: python
types: [python]
pass_filenames: false
@ -130,7 +131,7 @@ repos:
# forbid directly import triton
- id: forbid-direct-triton-import
name: "Forbid direct 'import triton'"
entry: python tools/check_triton_import.py
entry: python tools/pre_commit/check_triton_import.py
language: python
types: [python]
pass_filenames: false
@ -143,7 +144,7 @@ repos:
additional_dependencies: [regex]
- id: validate-config
name: Validate configuration has default values and that each field has a docstring
entry: python tools/validate_config.py
entry: python tools/pre_commit/validate_config.py
language: python
additional_dependencies: [regex]
# Keep `suggestion` last

View File

@ -49,8 +49,8 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1
# requirements.txt files and should be kept consistent. The ROCm torch
# versions are derived from docker/Dockerfile.rocm
#
set(TORCH_SUPPORTED_VERSION_CUDA "2.8.0")
set(TORCH_SUPPORTED_VERSION_ROCM "2.8.0")
set(TORCH_SUPPORTED_VERSION_CUDA "2.9.0")
set(TORCH_SUPPORTED_VERSION_ROCM "2.9.0")
#
# Try to find python package with an executable that exactly matches
@ -883,6 +883,7 @@ target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1)
set(VLLM_MOE_EXT_SRC
"csrc/moe/torch_bindings.cpp"
"csrc/moe/moe_align_sum_kernels.cu"
"csrc/moe/moe_lora_align_sum_kernels.cu"
"csrc/moe/topk_softmax_kernels.cu")
if(VLLM_GPU_LANG STREQUAL "CUDA")

View File

@ -21,6 +21,7 @@ Join us at the [PyTorch Conference, October 22-23](https://events.linuxfoundatio
*Latest News* 🔥
- [2025/10] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/__xb4OyOsImz-9eAVrdlcg) focused on hands-on vLLM inference optimization! Please find the meetup slides [here](https://drive.google.com/drive/folders/1KqwjsFJLfEsC8wlDugnrR61zsWHt94Q6).
- [2025/09] We hosted [vLLM Toronto Meetup](https://luma.com/e80e0ymm) focused on tackling inference at scale and speculative decoding with speakers from NVIDIA and Red Hat! Please find the meetup slides [here](https://docs.google.com/presentation/d/1IYJYmJcu9fLpID5N5RbW_vO0XLo0CGOR14IXOjB61V8/edit?usp=sharing).
- [2025/08] We hosted [vLLM Shenzhen Meetup](https://mp.weixin.qq.com/s/k8ZBO1u2_2odgiKWH_GVTQ) focusing on the ecosystem around vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Ua2SVKVSu-wp5vou_6ElraDt2bnKhiEA).
- [2025/08] We hosted [vLLM Singapore Meetup](https://www.sginnovate.com/event/vllm-sg-meet). We shared V1 updates, disaggregated serving and MLLM speedups with speakers from Embedded LLM, AMD, WekaIO, and A*STAR. Please find the meetup slides [here](https://drive.google.com/drive/folders/1ncf3GyqLdqFaB6IeB834E5TZJPLAOiXZ?usp=sharing).

View File

@ -5,7 +5,7 @@ import gc
from benchmark_utils import TimeCollector
from tabulate import tabulate
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.v1.core.block_pool import BlockPool

View File

@ -46,7 +46,7 @@ import time
from vllm import LLM, SamplingParams
from vllm.engine.arg_utils import EngineArgs
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser
def test_long_document_qa(llm=None, sampling_params=None, prompts=None):

View File

@ -19,7 +19,7 @@ from vllm.config import (
VllmConfig,
)
from vllm.platforms import current_platform
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.v1.spec_decode.ngram_proposer import NgramProposer
from vllm.v1.worker.gpu_input_batch import InputBatch
from vllm.v1.worker.gpu_model_runner import GPUModelRunner

View File

@ -37,7 +37,7 @@ from transformers import PreTrainedTokenizerBase
from vllm import LLM, SamplingParams
from vllm.engine.arg_utils import EngineArgs
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser
try:
from vllm.transformers_utils.tokenizer import get_tokenizer

View File

@ -11,7 +11,7 @@ import time
from transformers import AutoTokenizer, PreTrainedTokenizerBase
from vllm.engine.arg_utils import EngineArgs
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser
# Select a equi-probable random priority

View File

@ -31,6 +31,7 @@ import time
import uuid
import warnings
from collections.abc import AsyncGenerator
from contextlib import nullcontext
from dataclasses import dataclass
import datasets
@ -50,7 +51,7 @@ except ImportError:
from backend_request_func import get_tokenizer
try:
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser
except ImportError:
from argparse import ArgumentParser as FlexibleArgumentParser
@ -501,15 +502,9 @@ async def benchmark(
pbar = None if disable_tqdm else tqdm(total=len(input_requests))
# This can be used once the minimum Python version is 3.10 or higher,
# and it will simplify the code in limited_request_func.
# semaphore = (asyncio.Semaphore(max_concurrency)
# if max_concurrency else contextlib.nullcontext())
semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None
semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else nullcontext()
async def limited_request_func(request_func_input, pbar):
if semaphore is None:
return await request_func(request_func_input=request_func_input, pbar=pbar)
async with semaphore:
return await request_func(request_func_input=request_func_input, pbar=pbar)

View File

@ -15,7 +15,7 @@ from utils import make_rand_sparse_tensors
from weight_shapes import WEIGHT_SHAPES
from vllm import _custom_ops as ops
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser
DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]

View File

@ -18,7 +18,8 @@ from vllm import _custom_ops as ops
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
w8a8_triton_block_scaled_mm,
)
from vllm.utils import FlexibleArgumentParser, cdiv
from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.utils.math_utils import cdiv
DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]

View File

@ -10,7 +10,8 @@ import torch
from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
from vllm.triton_utils import triton
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
def with_triton_mode(fn):

View File

@ -10,7 +10,8 @@ import vllm.model_executor.layers.activation # noqa F401
from vllm.model_executor.custom_op import CustomOp
from vllm.platforms import current_platform
from vllm.triton_utils import triton
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
batch_size_range = [1, 16, 32, 64, 128]
seq_len_range = [1, 16, 64, 128, 256, 512, 1024, 2048, 4096]

View File

@ -28,7 +28,7 @@ except ImportError as e:
from bitblas import Matmul, MatmulConfig, auto_detect_nvidia_target
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser
parser = FlexibleArgumentParser(
description="Benchmark BitBLAS int4 on a specific target."

View File

@ -20,7 +20,7 @@ from vllm.model_executor.layers.fused_moe.config import (
from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp4
from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
from vllm.scalar_type import scalar_types
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser
WEIGHT_SHAPES_MOE = {
"nvidia/DeepSeek-R1-FP4": [

View File

@ -14,7 +14,7 @@ from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_confi
from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8
from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
from vllm.platforms import current_platform
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser
# Weight shapes for different models: [num_experts, topk, hidden_size,
# intermediate_size]

View File

@ -39,7 +39,7 @@ from vllm.distributed.device_communicators.pynccl_allocator import (
)
from vllm.distributed.device_communicators.symm_mem import SymmMemCommunicator
from vllm.logger import init_logger
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser
logger = init_logger(__name__)

View File

@ -13,7 +13,7 @@ from vllm.model_executor.layers.fused_moe.fused_moe import (
fused_experts,
fused_topk,
)
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser
DEFAULT_MODELS = [
"nm-testing/Mixtral-8x7B-Instruct-v0.1",

View File

@ -7,7 +7,8 @@ import torch
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.platforms import current_platform
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
@torch.inference_mode()

View File

@ -25,7 +25,7 @@ if HAS_TRITON:
from vllm.lora.ops.triton_ops import LoRAKernelMeta, lora_expand, lora_shrink
from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser
DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
DEFAULT_TP_SIZES = [1]

View File

@ -33,7 +33,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
quantize_weights,
)
from vllm.scalar_type import ScalarType, scalar_types
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser
DEFAULT_MODELS = ["meta-llama/Llama-3-8b", "meta-llama/Llama-2-70b-hf"]
DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512, 1024]

View File

@ -44,7 +44,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
sort_weights,
)
from vllm.scalar_type import ScalarType, scalar_types
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser
DEFAULT_MODELS = ["meta-llama/Llama-2-7b-hf/TP1"]
DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192]

View File

@ -22,7 +22,7 @@ from vllm.model_executor.layers.fused_moe.fused_moe import *
from vllm.platforms import current_platform
from vllm.transformers_utils.config import get_config
from vllm.triton_utils import triton
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser
FP8_DTYPE = current_platform.fp8_dtype()
@ -631,7 +631,7 @@ def main(args: argparse.Namespace):
else:
ensure_divisibility(intermediate_size, args.tp_size, "intermediate_size")
shard_intermediate_size = 2 * intermediate_size // args.tp_size
dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
dtype = torch.float16 if current_platform.is_rocm() else config.dtype
use_fp8_w8a8 = args.dtype == "fp8_w8a8"
use_int8_w8a16 = args.dtype == "int8_w8a16"
block_quant_shape = get_weight_block_size_safety(config)

View File

@ -17,7 +17,7 @@ from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import (
)
from vllm.model_executor.layers.fused_moe.utils import _fp8_quantize
from vllm.platforms import current_platform
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser
FP8_DTYPE = current_platform.fp8_dtype()
@ -344,7 +344,7 @@ def main(args: argparse.Namespace):
topk = config.num_experts_per_tok
hidden_size = config.hidden_size
dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
dtype = torch.float16 if current_platform.is_rocm() else config.dtype
use_fp8_w8a8 = args.dtype == "fp8_w8a8"
use_int8_w8a16 = args.dtype == "int8_w8a16"
use_customized_permute = args.use_customized_permute

View File

@ -39,7 +39,7 @@ import torch
from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.platforms import current_platform
from vllm.transformers_utils.config import get_config
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

View File

@ -9,9 +9,9 @@ import torch
from vllm import _custom_ops as ops
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.utils import (
from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.utils.torch_utils import (
STR_DTYPE_TO_TORCH_DTYPE,
FlexibleArgumentParser,
create_kv_caches_with_random,
)

View File

@ -1,155 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import itertools
import torch
from vllm import _custom_ops as vllm_ops
from vllm.triton_utils import triton
def polynorm_naive(
x: torch.Tensor,
weight: torch.Tensor,
bias: torch.Tensor,
eps: float = 1e-6,
):
orig_shape = x.shape
x = x.view(-1, x.shape[-1])
def norm(x, eps: float):
return x / torch.sqrt(x.pow(2).mean(-1, keepdim=True) + eps)
x = x.float()
return (
(
weight[0] * norm(x**3, eps)
+ weight[1] * norm(x**2, eps)
+ weight[2] * norm(x, eps)
+ bias
)
.to(weight.dtype)
.view(orig_shape)
)
def polynorm_vllm(
x: torch.Tensor,
weight: torch.Tensor,
bias: torch.Tensor,
eps: float = 1e-6,
):
orig_shape = x.shape
x = x.view(-1, x.shape[-1])
out = torch.empty_like(x)
vllm_ops.poly_norm(out, x, weight, bias, eps)
output = out
output = output.view(orig_shape)
return output
def calculate_diff(batch_size, seq_len, hidden_dim):
dtype = torch.bfloat16
x = torch.randn(batch_size, seq_len, hidden_dim, dtype=dtype, device="cuda")
weight = torch.ones(3, dtype=dtype, device="cuda")
bias = torch.ones(1, dtype=dtype, device="cuda")
output_naive = polynorm_naive(x, weight, bias)
output_vllm = polynorm_vllm(x, weight, bias)
if torch.allclose(output_naive, output_vllm, atol=1e-2, rtol=1e-2):
print("✅ All implementations match")
else:
print("❌ Implementations differ")
batch_size_range = [2**i for i in range(0, 7, 2)]
seq_length_range = [2**i for i in range(6, 11, 1)]
dim_range = [2048, 4096]
configs = list(itertools.product(dim_range, batch_size_range, seq_length_range))
def get_benchmark():
@triton.testing.perf_report(
triton.testing.Benchmark(
x_names=["dim", "batch_size", "seq_len"],
x_vals=[list(_) for _ in configs],
line_arg="provider",
line_vals=["naive", "vllm"],
line_names=["Naive", "vLLM"],
styles=[("blue", "-"), ("red", "-")],
ylabel="us",
plot_name="polynorm-perf",
args={},
)
)
def benchmark(dim, batch_size, seq_len, provider):
dtype = torch.bfloat16
hidden_dim = dim * 4
x = torch.randn(batch_size, seq_len, hidden_dim, dtype=dtype, device="cuda")
weight = torch.ones(3, dtype=dtype, device="cuda")
bias = torch.ones(1, dtype=dtype, device="cuda")
quantiles = [0.5, 0.2, 0.8]
if provider == "naive":
ms, min_ms, max_ms = triton.testing.do_bench(
lambda: polynorm_naive(x, weight, bias),
quantiles=quantiles,
)
else:
ms, min_ms, max_ms = triton.testing.do_bench(
lambda: polynorm_vllm(x, weight, bias),
quantiles=quantiles,
)
return 1000 * ms, 1000 * max_ms, 1000 * min_ms
return benchmark
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser()
parser.add_argument(
"--batch-size",
type=int,
default=4,
help="Batch size",
)
parser.add_argument(
"--seq-len",
type=int,
default=128,
help="Sequence length",
)
parser.add_argument(
"--hidden-dim",
type=int,
default=8192,
help="Intermediate size of MLP",
)
parser.add_argument(
"--save-path",
type=str,
default="./configs/polnorm/",
help="Path to save polnorm benchmark results",
)
args = parser.parse_args()
# Run correctness test
calculate_diff(
batch_size=args.batch_size,
seq_len=args.seq_len,
hidden_dim=args.hidden_dim,
)
benchmark = get_benchmark()
# Run performance benchmark
benchmark.run(print_data=True, save_path=args.save_path)

View File

@ -7,7 +7,8 @@ import torch
from vllm import _custom_ops as ops
from vllm.platforms import current_platform
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
@torch.inference_mode()

View File

@ -9,9 +9,9 @@ from tabulate import tabulate
from vllm import _custom_ops as ops
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.utils import (
from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.utils.torch_utils import (
STR_DTYPE_TO_TORCH_DTYPE,
FlexibleArgumentParser,
create_kv_caches_with_random,
)

View File

@ -12,9 +12,9 @@ from vllm.attention.ops.triton_reshape_and_cache_flash import (
)
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.utils import (
from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.utils.torch_utils import (
STR_DTYPE_TO_TORCH_DTYPE,
FlexibleArgumentParser,
create_kv_caches_with_random_flash,
)

View File

@ -8,7 +8,7 @@ import torch
from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding, get_rope
from vllm.platforms import current_platform
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser
def benchmark_rope_kernels_multi_lora(

View File

@ -8,7 +8,7 @@ from datetime import datetime
import flashinfer
import torch
from vllm.utils import round_up
from vllm.utils.math_utils import round_up
FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
FP8_DTYPE = torch.float8_e4m3fn

View File

@ -8,7 +8,7 @@ from datetime import datetime
import flashinfer
import torch
from vllm.utils import round_up
from vllm.utils.math_utils import round_up
FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
FP8_DTYPE = torch.float8_e4m3fn

View File

@ -18,7 +18,7 @@ from vllm.model_executor.layers.quantization.utils.fp8_utils import (
)
from vllm.platforms import current_platform
from vllm.triton_utils import triton
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser
mp.set_start_method("spawn", force=True)

View File

@ -11,7 +11,7 @@ import regex as re
import seaborn as sns
from torch.utils.benchmark import Measurement as TMeasurement
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser
if __name__ == "__main__":
parser = FlexibleArgumentParser(

View File

@ -1251,7 +1251,7 @@ async def main() -> None:
default=None,
help="The model name used in the API. "
"If not specified, the model name will be the "
"same as the ``--model`` argument. ",
"same as the `--model` argument. ",
)
parser.add_argument(

View File

@ -5,7 +5,7 @@ import cProfile
import pstats
from vllm import LLM, SamplingParams
from vllm.utils import FlexibleArgumentParser
from vllm.utils.argparse_utils import FlexibleArgumentParser
# A very long prompt, total number of tokens is about 15k.
LONG_PROMPT = ["You are an expert in large language models, aren't you?"] * 1000

View File

@ -188,16 +188,60 @@ else()
message(FATAL_ERROR "vLLM CPU backend requires AVX512, AVX2, Power9+ ISA, S390X ISA, ARMv8 or RISC-V support.")
endif()
#
# Build oneDNN for W8A8 GEMM kernels (only for x86-AVX512 /ARM platforms)
# Flag to enable ACL kernels for AARCH64 platforms
if (VLLM_BUILD_ACL STREQUAL "ON")
set(USE_ACL ON)
else()
set(USE_ACL OFF)
endif()
# Build oneDNN for GEMM kernels (only for x86-AVX512 /ARM platforms)
if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND) OR POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND)
# Fetch and build Arm Compute Library (ACL) as oneDNN's backend for AArch64
# TODO [fadara01]: remove this once ACL can be fetched and built automatically as a dependency of oneDNN
if(ASIMD_FOUND)
if(DEFINED ENV{ACL_ROOT_DIR} AND IS_DIRECTORY "$ENV{ACL_ROOT_DIR}")
message(STATUS "Using ACL from specified source directory: $ENV{ACL_ROOT_DIR}")
else()
message(STATUS "Downloading Arm Compute Library (ACL) from GitHub")
FetchContent_Populate(arm_compute
SUBBUILD_DIR "${FETCHCONTENT_BASE_DIR}/arm_compute-subbuild"
SOURCE_DIR "${FETCHCONTENT_BASE_DIR}/arm_compute-src"
GIT_REPOSITORY https://github.com/ARM-software/ComputeLibrary.git
GIT_TAG v52.2.0
GIT_SHALLOW TRUE
GIT_PROGRESS TRUE
)
set(ENV{ACL_ROOT_DIR} "${arm_compute_SOURCE_DIR}")
endif()
# Build ACL with scons
include(ProcessorCount)
ProcessorCount(_NPROC)
set(_scons_cmd
scons -j${_NPROC}
Werror=0 debug=0 neon=1 examples=0 embed_kernels=0 os=linux
arch=armv8.2-a build=native benchmark_examples=0 fixed_format_kernels=1
multi_isa=1 openmp=1 cppthreads=0
)
# locate PyTorch's libgomp (e.g. site-packages/torch.libs/libgomp-947d5fa1.so.1.0.0)
# and create a local shim dir with it
include("${CMAKE_CURRENT_LIST_DIR}/utils.cmake")
vllm_prepare_torch_gomp_shim(VLLM_TORCH_GOMP_SHIM_DIR)
if(NOT VLLM_TORCH_GOMP_SHIM_DIR STREQUAL "")
list(APPEND _scons_cmd extra_link_flags=-L${VLLM_TORCH_GOMP_SHIM_DIR})
endif()
execute_process(
COMMAND ${_scons_cmd}
WORKING_DIRECTORY "$ENV{ACL_ROOT_DIR}"
RESULT_VARIABLE _acl_rc
)
if(NOT _acl_rc EQUAL 0)
message(FATAL_ERROR "ACL SCons build failed (exit ${_acl_rc}).")
endif()
set(ONEDNN_AARCH64_USE_ACL "ON")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,-rpath,$ENV{ACL_ROOT_DIR}/build/")
add_compile_definitions(VLLM_USE_ACL)
endif()
set(FETCHCONTENT_SOURCE_DIR_ONEDNN "$ENV{FETCHCONTENT_SOURCE_DIR_ONEDNN}" CACHE PATH "Path to a local oneDNN source directory.")
if(FETCHCONTENT_SOURCE_DIR_ONEDNN)
@ -217,16 +261,6 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
)
endif()
if(USE_ACL)
find_library(ARM_COMPUTE_LIBRARY NAMES arm_compute PATHS $ENV{ACL_ROOT_DIR}/build/)
if(NOT ARM_COMPUTE_LIBRARY)
message(FATAL_ERROR "Could not find ARM Compute Library: please set ACL_ROOT_DIR")
endif()
set(ONEDNN_AARCH64_USE_ACL "ON")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,-rpath,$ENV{ACL_ROOT_DIR}/build/")
add_compile_definitions(VLLM_USE_ACL)
endif()
set(ONEDNN_LIBRARY_TYPE "STATIC")
set(ONEDNN_BUILD_DOC "OFF")
set(ONEDNN_BUILD_EXAMPLES "OFF")

View File

@ -19,7 +19,7 @@ else()
FetchContent_Declare(
flashmla
GIT_REPOSITORY https://github.com/vllm-project/FlashMLA
GIT_TAG 5f65b85703c7ed75fda01e06495077caad207c3f
GIT_TAG 46d64a8ebef03fa50b4ae74937276a5c940e3f95
GIT_PROGRESS TRUE
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
@ -66,6 +66,7 @@ if(FLASH_MLA_ARCHS)
${flashmla_SOURCE_DIR}/csrc/extension/torch_api.cpp
${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/pybind.cpp
${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/flash_fwd_mla_fp8_sm90.cu
${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/flash_fwd_mla_metadata.cu
)
set(FlashMLA_INCLUDES

View File

@ -22,10 +22,10 @@ else()
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
)
FetchContent_Populate(qutlass)
set(qutlass_SOURCE_DIR "${qutlass_SOURCE_DIR}")
endif()
FetchContent_Populate(qutlass)
if(NOT qutlass_SOURCE_DIR)
message(FATAL_ERROR "[QUTLASS] source directory could not be resolved.")
endif()

View File

@ -38,7 +38,7 @@ else()
FetchContent_Declare(
vllm-flash-attn
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
GIT_TAG 8f468e7da54a8e2f98abfa7c38636aac91c0cba1
GIT_TAG a893712401d70362fbb299cd9c4b3476e8e9ed54
GIT_PROGRESS TRUE
# Don't share the vllm-flash-attn build between build types
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn

View File

@ -129,6 +129,44 @@ function (get_torch_gpu_compiler_flags OUT_GPU_FLAGS GPU_LANG)
set(${OUT_GPU_FLAGS} ${GPU_FLAGS} PARENT_SCOPE)
endfunction()
# Find libgomp that gets shipped with PyTorch wheel and create a shim dir with:
# libgomp.so -> libgomp-<hash>.so...
# libgomp.so.1 -> libgomp-<hash>.so...
# OUTPUT: TORCH_GOMP_SHIM_DIR ("" if not found)
function(vllm_prepare_torch_gomp_shim TORCH_GOMP_SHIM_DIR)
set(${TORCH_GOMP_SHIM_DIR} "" PARENT_SCOPE)
# Use run_python to locate vendored libgomp; never throw on failure.
run_python(_VLLM_TORCH_GOMP_PATH
"
import os, glob
try:
import torch
torch_pkg = os.path.dirname(torch.__file__)
site_root = os.path.dirname(torch_pkg)
torch_libs = os.path.join(site_root, 'torch.libs')
print(glob.glob(os.path.join(torch_libs, 'libgomp-*.so*'))[0])
except:
print('')
"
"failed to probe torch.libs for libgomp")
if(_VLLM_TORCH_GOMP_PATH STREQUAL "" OR NOT EXISTS "${_VLLM_TORCH_GOMP_PATH}")
return()
endif()
# Create shim under the build tree
set(_shim "${CMAKE_BINARY_DIR}/gomp_shim")
file(MAKE_DIRECTORY "${_shim}")
execute_process(COMMAND ${CMAKE_COMMAND} -E rm -f "${_shim}/libgomp.so")
execute_process(COMMAND ${CMAKE_COMMAND} -E rm -f "${_shim}/libgomp.so.1")
execute_process(COMMAND ${CMAKE_COMMAND} -E create_symlink "${_VLLM_TORCH_GOMP_PATH}" "${_shim}/libgomp.so")
execute_process(COMMAND ${CMAKE_COMMAND} -E create_symlink "${_VLLM_TORCH_GOMP_PATH}" "${_shim}/libgomp.so.1")
set(${TORCH_GOMP_SHIM_DIR} "${_shim}" PARENT_SCOPE)
endfunction()
# Macro for converting a `gencode` version number to a cmake version number.
macro(string_to_ver OUT_VER IN_STR)
string(REGEX REPLACE "\([0-9]+\)\([0-9]\)" "\\1.\\2" ${OUT_VER} ${IN_STR})

12
codecov.yml Normal file
View File

@ -0,0 +1,12 @@
codecov:
require_ci_to_pass: false
fixes:
# Map source code paths to repository root paths
# Wildcards match any Python version (python3.*)
- "/vllm-workspace/src/vllm/::vllm/"
- "/vllm-workspace/vllm/::vllm/"
- "/usr/local/lib/python3.*/dist-packages/vllm/::vllm/"
- "/usr/local/lib/python3.*/site-packages/vllm/::vllm/"
- "/usr/lib/python3.*/dist-packages/vllm/::vllm/"
- "/usr/lib/python3.*/site-packages/vllm/::vllm/"

View File

@ -125,32 +125,37 @@ public:
}
static void set_split_kv (KernelArguments& args) {
// printf("set_split_kv start");
if (args.split_kv >= 1) return;
auto [H, K, D, B] = args.problem_shape;
// std::cout << H << " " << K << " " << D << " " << B << "\n";
int sm_count = args.hw_info.sm_count;
// printf(" sm_count = %d\n", sm_count);
int max_splits = ceil_div(K, 128);
max_splits = min(16, max_splits);
float seq_length_k = static_cast<float>(K) / 1024.0f;
int max_splits = 1;
// TODO: This avoids a hang when the batch size larger than 1 and
// there is more than 1 kv_splits.
// Discuss with NVIDIA how this can be fixed.
if (B > 1) {
max_splits = min(1, max_splits);
if (B <= 4 && seq_length_k >= 16) {
max_splits = 16;
}
// printf(" max_splits = %d\n", max_splits);
else if (B <= 8 && seq_length_k >= 4) {
max_splits = 8;
}
else if ((B <= 16 && seq_length_k >= 8) ||
(B == 48 && seq_length_k >= 32)) {
max_splits = 4;
}
else if ((B <= 32 && seq_length_k >= 16) ||
(B == 96 && seq_length_k >= 16)) {
max_splits = 2;
}
else {
max_splits = 1;
}
// Wave-aware scheduling: ensure integer number of waves in K dimension
int sms_per_batch = max(1, sm_count / B);
// printf(" sms_per_batch = %d\n", sms_per_batch);
int split_heur = min(max_splits, sms_per_batch);
int waves = ceil_div(B * split_heur, sm_count);
int k_waves = ceil_div(max_splits, split_heur);
int split_wave_aware = ceil_div(max_splits, k_waves);
args.split_kv = split_wave_aware;
// printf(" args.split_kv = %d\n", args.split_kv);
}
/// Determines whether the GEMM can execute the given problem.

Some files were not shown because too many files have changed in this diff Show More