diff --git a/docker/Dockerfile.xpu b/docker/Dockerfile.xpu
index 65d2e5036b..ef42235250 100644
--- a/docker/Dockerfile.xpu
+++ b/docker/Dockerfile.xpu
@@ -1,12 +1,10 @@
 FROM intel/deep-learning-essentials:2025.1.3-0-devel-ubuntu24.04 AS vllm-base
 
-RUN rm /etc/apt/sources.list.d/intel-graphics.list
+RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && \
+    echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list && \
+    add-apt-repository -y ppa:kobuk-team/intel-graphics
 
 RUN apt clean && apt-get update -y && \
-    apt-get install -y software-properties-common && \
-    add-apt-repository ppa:deadsnakes/ppa && \
-    apt-get install -y python3.10 python3.10-distutils && \
-    curl -sS https://bootstrap.pypa.io/get-pip.py | python3.10 && \
     apt-get install -y --no-install-recommends --fix-missing \
     curl \
     ffmpeg \
@@ -17,17 +15,29 @@ RUN apt clean && apt-get update -y && \
     libgl1 \
     lsb-release \
     numactl \
-    python3.10-dev \
-    wget
+    wget \
+    vim \
+    python3.12 \
+    python3.12-dev \
+    python3-pip
+RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.12 1
+RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.12 1
 
-RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1
-RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.10 1
+RUN apt install -y libze1 libze-dev libze-intel-gpu1 intel-opencl-icd libze-intel-gpu-raytracing
+
+RUN wget https://github.com/uxlfoundation/oneCCL/releases/download/2021.15.4/intel-oneccl-2021.15.4.11_offline.sh
+RUN bash intel-oneccl-2021.15.4.11_offline.sh -a --silent --eula accept && echo "source /opt/intel/oneapi/setvars.sh --force" >> /root/.bashrc
+SHELL ["bash", "-c"]
+CMD ["bash", "-c", "source /root/.bashrc && exec bash"]
 
 WORKDIR /workspace/vllm
 COPY requirements/xpu.txt /workspace/vllm/requirements/xpu.txt
 COPY requirements/common.txt /workspace/vllm/requirements/common.txt
 
+# suppress the python externally managed environment error
+RUN python3 -m pip config set global.break-system-packages true
+
 RUN --mount=type=cache,target=/root/.cache/pip \
     pip install --no-cache-dir \
     -r requirements/xpu.txt
 
@@ -54,8 +64,9 @@ FROM vllm-base AS vllm-openai
 RUN --mount=type=cache,target=/root/.cache/pip \
     pip install accelerate hf_transfer pytest pytest_asyncio lm_eval[api] modelscope
-ENV VLLM_USAGE_SOURCE production-docker-image \
-    TRITON_XPU_PROFILE 1
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip uninstall oneccl oneccl-devel -y
+
 # install development dependencies (for testing)
 RUN python3 -m pip install -e tests/vllm_test_utils
 
 ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
diff --git a/docs/getting_started/installation/gpu/xpu.inc.md b/docs/getting_started/installation/gpu/xpu.inc.md
index b77c4e00cf..ed1dc0418c 100644
--- a/docs/getting_started/installation/gpu/xpu.inc.md
+++ b/docs/getting_started/installation/gpu/xpu.inc.md
@@ -3,13 +3,16 @@
 vLLM initially supports basic model inference and serving on Intel GPU platform.
 
 !!! warning
-    There are no pre-built wheels or images for this device, so you must build vLLM from source.
+    There are no pre-built wheels for this device, so you need to build vLLM from source. Alternatively, you can use the pre-built images, which are based on vLLM release versions.
 
 # --8<-- [end:installation]
 # --8<-- [start:requirements]
 
 - Supported Hardware: Intel Data Center GPU, Intel ARC GPU
-- OneAPI requirements: oneAPI 2025.0
+- OneAPI requirements: oneAPI 2025.1
+- Python: 3.12
+!!! warning
+    The provided IPEX wheel is built specifically for Python 3.12, so this Python version is a must.
 
 # --8<-- [end:requirements]
 # --8<-- [start:set-up-using-python]
@@ -24,7 +27,7 @@ Currently, there are no pre-built XPU wheels.
 # --8<-- [end:pre-built-wheels]
 # --8<-- [start:build-wheel-from-source]
 
-- First, install required [driver](https://dgpu-docs.intel.com/driver/installation.html#installing-gpu-drivers) and [Intel OneAPI](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) 2025.0 or later.
+- First, install required [driver](https://dgpu-docs.intel.com/driver/installation.html#installing-gpu-drivers) and [Intel OneAPI](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) 2025.1 or later.
 - Second, install Python packages for vLLM XPU backend building:
 
 ```bash
@@ -40,14 +43,10 @@ pip install -v -r requirements/xpu.txt
 VLLM_TARGET_DEVICE=xpu python setup.py install
 ```
 
-!!! note
-
-    FP16 is the default data type in the current XPU backend. The BF16 data
-    type is supported on Intel Data Center GPU, not supported on Intel Arc GPU yet.
-
 # --8<-- [end:build-wheel-from-source]
 # --8<-- [start:pre-built-images]
 
-Currently, there are no pre-built XPU images.
+Currently, we release pre-built XPU images on Docker [Hub](https://hub.docker.com/r/intel/vllm/tags), based on vLLM release versions. For more information, please refer to the release [notes](https://github.com/intel/ai-containers/blob/main/vllm).
 
 # --8<-- [end:pre-built-images]
 # --8<-- [start:build-image-from-source]
@@ -65,14 +64,14 @@ docker run -it \
 # --8<-- [end:build-image-from-source]
 # --8<-- [start:supported-features]
 
-XPU platform supports **tensor parallel** inference/serving and also supports **pipeline parallel** as a beta feature for online serving. We require Ray as the distributed runtime backend. For example, a reference execution like following:
+XPU platform supports **tensor parallel** inference/serving and also supports **pipeline parallel** as a beta feature for online serving. **Pipeline parallel** is supported on a single node, with `mp` as the distributed backend. A reference execution looks like the following:
 
 ```bash
 python -m vllm.entrypoints.openai.api_server \
     --model=facebook/opt-13b \
     --dtype=bfloat16 \
     --max_model_len=1024 \
-    --distributed-executor-backend=ray \
+    --distributed-executor-backend=mp \
     --pipeline-parallel-size=2 \
     -tp=8
 ```
diff --git a/requirements/xpu.txt b/requirements/xpu.txt
index c44a2a9c74..74f5b05b23 100644
--- a/requirements/xpu.txt
+++ b/requirements/xpu.txt
@@ -11,10 +11,9 @@ jinja2>=3.1.6
 datasets # for benchmark scripts
 numba == 0.60.0 # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding
 nixl==0.3.0 # for PD disaggregation
---extra-index-url=https://download.pytorch.org/whl/xpu
 torch==2.8.0+xpu
 torchaudio
 torchvision
-pytorch-triton-xpu
---extra-index-url=https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
-intel-extension-for-pytorch==2.8.10+xpu
+--extra-index-url=https://download.pytorch.org/whl/xpu
+
+intel-extension-for-pytorch @ https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/intel_extension_for_pytorch-2.8.10.post0%2Bxpu-cp312-cp312-linux_x86_64.whl
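
For reference, the pre-built images and the `mp`-based pipeline-parallel serving path introduced above can be exercised together. Below is a minimal sketch; the `latest` tag and the `/dev/dri` device mapping are assumptions not pinned down by this patch, so check the Docker Hub tags page before copying.

```bash
# Pull a pre-built XPU image (the tag is an assumption; see
# https://hub.docker.com/r/intel/vllm/tags for the published tags).
docker pull intel/vllm:latest

# The vllm-openai stage sets the api_server as ENTRYPOINT, so any
# arguments after the image name are passed straight to vLLM. The
# device flags follow the usual Intel GPU container convention and
# may need adjusting for your host.
docker run -it --rm \
    --device /dev/dri \
    -v /dev/dri/by-path:/dev/dri/by-path \
    -p 8000:8000 \
    intel/vllm:latest \
    --model=facebook/opt-13b \
    --dtype=bfloat16 \
    --max_model_len=1024 \
    --distributed-executor-backend=mp \
    --pipeline-parallel-size=2 \
    -tp=8
```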