From b5bae42f913efebef6d5239291418df8fb73b555 Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Thu, 30 Oct 2025 11:17:13 +0800 Subject: [PATCH] [XPU] Update latest IPEX 2.8 release (#27735) Signed-off-by: Kunshang Ji --- .../scripts/hardware_ci/run-xpu-test.sh | 7 +++++-- .../installation/gpu.xpu.inc.md | 4 +++- requirements/xpu.txt | 2 +- vllm/_ipex_ops.py | 21 +++++-------------- 4 files changed, 14 insertions(+), 20 deletions(-) diff --git a/.buildkite/scripts/hardware_ci/run-xpu-test.sh b/.buildkite/scripts/hardware_ci/run-xpu-test.sh index 250a64fdd0..27ed67c451 100644 --- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh @@ -20,7 +20,10 @@ trap remove_docker_container EXIT # Run the image and test offline inference/tensor parallel docker run \ - --device /dev/dri \ + --device /dev/dri:/dev/dri \ + --net=host \ + --ipc=host \ + --privileged \ -v /dev/dri/by-path:/dev/dri/by-path \ --entrypoint="" \ -e "HF_TOKEN=${HF_TOKEN}" \ @@ -42,7 +45,7 @@ docker run \ pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py pytest -v -s v1/structured_output - pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py + pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py pytest -v -s v1/test_serial_utils.py ' diff --git a/docs/getting_started/installation/gpu.xpu.inc.md b/docs/getting_started/installation/gpu.xpu.inc.md index 9156df9db6..620a660a24 100644 --- a/docs/getting_started/installation/gpu.xpu.inc.md +++ b/docs/getting_started/installation/gpu.xpu.inc.md @@ -56,8 +56,10 @@ docker build -f docker/Dockerfile.xpu -t vllm-xpu-env --shm-size=4g . docker run -it \ --rm \ --network=host \ - --device /dev/dri \ + --device /dev/dri:/dev/dri \ -v /dev/dri/by-path:/dev/dri/by-path \ + --ipc=host \ + --privileged \ vllm-xpu-env ``` diff --git a/requirements/xpu.txt b/requirements/xpu.txt index d14b631aa9..e69a98b860 100644 --- a/requirements/xpu.txt +++ b/requirements/xpu.txt @@ -15,4 +15,4 @@ torchaudio torchvision --extra-index-url=https://download.pytorch.org/whl/xpu -intel-extension-for-pytorch @ https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/intel_extension_for_pytorch-2.8.10.post0%2Bxpu-cp312-cp312-linux_x86_64.whl +intel-extension-for-pytorch @ https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/intel_extension_for_pytorch-2.8.10.post1%2Bxpu-cp312-cp312-linux_x86_64.whl diff --git a/vllm/_ipex_ops.py b/vllm/_ipex_ops.py index e773e1d13f..60ee0124c3 100644 --- a/vllm/_ipex_ops.py +++ b/vllm/_ipex_ops.py @@ -151,7 +151,9 @@ class ipex_ops: def rms_norm( input: torch.Tensor, weight: torch.Tensor, epsilon: float ) -> torch.Tensor: - return ipex.llm.functional.rms_norm(input, weight, epsilon) + out = torch.empty_like(input) + torch.ops.torch_ipex.rms_norm_vllm(out, input.contiguous(), weight, epsilon) + return out @staticmethod def fused_add_rms_norm( @@ -160,10 +162,7 @@ class ipex_ops: weight: torch.Tensor, epsilon: float, ) -> None: - tmp = ipex.llm.functional.add_rms_norm( - residual, input, weight, None, epsilon, True - ) - input.copy_(tmp) + torch.ops.torch_ipex.fused_add_rms_norm_vllm(input, residual, weight, epsilon) @staticmethod def varlen_attention( @@ -296,16 +295,6 @@ class ipex_ops: num_splits=0, s_aux: torch.Tensor | None = None, ): - if cu_seqlens_k is None: - # cu_seqlens_k is not used in ipex kernel. - cu_seqlens_k = torch.cumsum(seqused_k, dim=0) - cu_seqlens_k = torch.cat( - [ - torch.tensor([0], device=seqused_k.device, dtype=torch.int32), - cu_seqlens_k, - ] - ).to(torch.int32) - real_window_size: tuple[int, int] if window_size is None: real_window_size = (-1, -1) @@ -318,7 +307,7 @@ class ipex_ops: k, v, cu_seqlens_q, - cu_seqlens_k, + seqused_k, max_seqlen_q, max_seqlen_k, softmax_scale,