diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index ac96f2f0d7..17fe60356e 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -395,7 +395,9 @@ steps: - python3 offline_inference/basic/embed.py - python3 offline_inference/basic/score.py - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 - - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 + # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory with PyTorch 2.9+, causing this test to OOM on a 1xL4 GPU, so --max-model-len is lowered from 2048 to 1536 + - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 + #- python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 - label: Platform Tests (CUDA) # 4min timeout_in_minutes: 15 @@ -436,7 +438,11 @@ steps: --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ --ignore=lora/test_chatglm3_tp.py \ --ignore=lora/test_llama_tp.py \ - --ignore=lora/test_llm_with_multi_loras.py + --ignore=lora/test_llm_with_multi_loras.py \ + --ignore=lora/test_olmoe_tp.py \ + --ignore=lora/test_deepseekv2_tp.py \ + --ignore=lora/test_gptoss.py \ + --ignore=lora/test_qwen3moe_tp.py parallelism: 4 - label: PyTorch Compilation Unit Tests # 15min @@ -1208,6 +1214,7 @@ steps: - pytest -v -s -x lora/test_chatglm3_tp.py - pytest -v -s -x lora/test_llama_tp.py - 
pytest -v -s -x lora/test_llm_with_multi_loras.py + - pytest -v -s -x lora/test_olmoe_tp.py - label: Weight Loading Multiple GPU Test # 33min timeout_in_minutes: 45