Compare commits
7 Commits
debug
...
build-flas
| Author | SHA1 | Date | |
|---|---|---|---|
| c3d9640b09 | |||
| 02f7780716 | |||
| 61d568a19d | |||
| 629468aa71 | |||
| b15005dc12 | |||
| 67ba6a9487 | |||
| f626cc9300 |
@ -41,6 +41,20 @@ steps:
|
|||||||
env:
|
env:
|
||||||
DOCKER_BUILDKIT: "1"
|
DOCKER_BUILDKIT: "1"
|
||||||
|
|
||||||
|
- block: "Build FlashInfer wheel"
|
||||||
|
key: block-build-flashinfer-wheel
|
||||||
|
depends_on: ~
|
||||||
|
|
||||||
|
- label: "Build and upload FlashInfer wheel - CUDA 12.8"
|
||||||
|
depends_on: block-build-flashinfer-wheel
|
||||||
|
id: build-upload-flashinfer-wheel
|
||||||
|
agents:
|
||||||
|
queue: gpu_1_queue
|
||||||
|
commands:
|
||||||
|
- "bash .buildkite/scripts/build-upload-flashinfer-wheel.sh 12.8.1"
|
||||||
|
env:
|
||||||
|
DOCKER_BUILDKIT: "1"
|
||||||
|
|
||||||
- block: "Build release image"
|
- block: "Build release image"
|
||||||
depends_on: ~
|
depends_on: ~
|
||||||
key: block-release-image-build
|
key: block-release-image-build
|
||||||
|
|||||||
57
.buildkite/scripts/build-upload-flashinfer-wheel.sh
Executable file
57
.buildkite/scripts/build-upload-flashinfer-wheel.sh
Executable file
@ -0,0 +1,57 @@
|
|||||||
|
#!/usr/bin/env bash
#
# Build the FlashInfer wheel inside the `flashinfer-wheel-builder` stage of
# docker/Dockerfile, copy it out of the image and upload it to S3.
#
# Usage: build-upload-flashinfer-wheel.sh [CUDA_VERSION]
#   CUDA_VERSION  forwarded to the Docker build as a build arg and used as the
#                 image tag suffix (default: 12.8.1).

set -ex

CUDA_VERSION="${1:-12.8.1}"
# FlashInfer version controlled in tools/flashinfer-build.sh

echo "Building FlashInfer wheel for CUDA ${CUDA_VERSION} using vLLM Dockerfile"

# Build the FlashInfer wheel using the existing Dockerfile stage.
# The tag is quoted so a CUDA_VERSION containing whitespace or glob characters
# cannot be split into extra docker arguments.
DOCKER_BUILDKIT=1 docker build \
  --build-arg max_jobs=16 \
  --build-arg USE_SCCACHE=1 \
  --build-arg CUDA_VERSION="${CUDA_VERSION}" \
  --tag "flashinfer-wheel-builder:${CUDA_VERSION}" \
  --target flashinfer-wheel-builder \
  --progress plain \
  -f docker/Dockerfile .
|
||||||
|
|
||||||
|
# Extract the wheel: run the builder image with ./artifacts bind-mounted at
# /output_host and copy every wheel from the image's /output into
# artifacts/dist on the host.
mkdir -p artifacts/dist
# The volume spec and image tag are quoted so a workspace path containing
# spaces does not break the bind mount.
# NOTE(review): the chmod presumably lets the host-side user modify files
# created by the container user - confirm.
docker run --rm \
  -v "$(pwd)/artifacts:/output_host" \
  "flashinfer-wheel-builder:${CUDA_VERSION}" \
  bash -c 'cp /output/*.whl /output_host/dist/ && chmod -R a+rw /output_host'
|
||||||
|
|
||||||
|
# Upload the wheel to S3
echo "Uploading FlashInfer wheel to S3..."

# Collect the wheels with nullglob enabled for this one expansion. Without it
# an unmatched glob is kept as the literal pattern, the array then holds one
# bogus entry, and the "exactly one" check below would pass with no wheel.
shopt -s nullglob
wheel_files=(artifacts/dist/*.whl)
shopt -u nullglob

# Check that exactly one wheel is found
if [[ ${#wheel_files[@]} -ne 1 ]]; then
  echo "Error: Expected exactly one wheel file in artifacts/dist/, but found ${#wheel_files[@]}" >&2
  exit 1
fi
|
||||||
|
|
||||||
|
# The array was validated to hold exactly one entry; use it as the wheel path.
wheel="${wheel_files[0]}"
printf '%s\n' "Processing FlashInfer wheel: $wheel"

# Replace the first 'linux' in the path with 'manylinux1' for compatibility,
# and move the file only when that substitution actually changed the name.
renamed_wheel="${wheel/linux/manylinux1}"
if [[ "$renamed_wheel" != "$wheel" ]]; then
  mv -- "$wheel" "$renamed_wheel"
  wheel="$renamed_wheel"
  printf '%s\n' "Renamed wheel to: $wheel"
fi
|
||||||
|
|
||||||
|
# Read the package version from the METADATA file stored inside the wheel:
# print the first space-delimited token that follows "Version: ".
version=$(unzip -p "$wheel" '**/METADATA' | sed -n 's/^Version: \([^ ]*\).*/\1/p')
# File name without its directory part, used only in the summary below.
wheel_name="${wheel##*/}"
printf '%s\n' "FlashInfer version: $version"

# Copy the wheel into the flashinfer-python prefix of the vllm-wheels bucket.
aws s3 cp "$wheel" "s3://vllm-wheels/flashinfer-python/"

printf '%s\n' "✅ FlashInfer wheel built and uploaded successfully for CUDA ${CUDA_VERSION}"
printf '%s\n' "📦 Wheel: $wheel_name (version $version)"
ls -la artifacts/dist/
|
||||||
@ -268,6 +268,15 @@ RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \
|
|||||||
else \
|
else \
|
||||||
echo "Skipping wheel size check."; \
|
echo "Skipping wheel size check."; \
|
||||||
fi
|
fi
|
||||||
|
#################### FLASHINFER WHEEL BUILD IMAGE ####################
|
||||||
|
FROM base AS flashinfer-wheel-builder
|
||||||
|
ARG CUDA_VERSION
|
||||||
|
|
||||||
|
COPY tools/flashinfer-build.sh /tmp/flashinfer-build.sh
|
||||||
|
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||||
|
. /etc/environment && \
|
||||||
|
BUILD_WHEEL=true /tmp/flashinfer-build.sh
|
||||||
|
|
||||||
#################### EXTENSION Build IMAGE ####################
|
#################### EXTENSION Build IMAGE ####################
|
||||||
|
|
||||||
#################### DEV IMAGE ####################
|
#################### DEV IMAGE ####################
|
||||||
@ -391,35 +400,11 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
|
|||||||
# $ # upload the wheel to a public location, e.g. https://wheels.vllm.ai/flashinfer/v0.2.6.post1/flashinfer_python-0.2.6.post1-cp39-abi3-linux_x86_64.whl
|
# $ # upload the wheel to a public location, e.g. https://wheels.vllm.ai/flashinfer/v0.2.6.post1/flashinfer_python-0.2.6.post1-cp39-abi3-linux_x86_64.whl
|
||||||
|
|
||||||
# Install FlashInfer from source
|
# Install FlashInfer from source
|
||||||
ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
|
# Version controlled in tools/flashinfer-build.sh - keep in sync with requirements/cuda.txt
|
||||||
# Keep this in sync with https://github.com/vllm-project/vllm/blob/main/requirements/cuda.txt
|
COPY tools/flashinfer-build.sh /tmp/flashinfer-build.sh
|
||||||
# We use `--force-reinstall --no-deps` to avoid issues with the existing FlashInfer wheel.
|
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||||
ARG FLASHINFER_GIT_REF="v0.2.9rc2"
|
. /etc/environment && \
|
||||||
RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH'
|
/tmp/flashinfer-build.sh
|
||||||
. /etc/environment
|
|
||||||
git clone --depth 1 --recursive --shallow-submodules \
|
|
||||||
--branch ${FLASHINFER_GIT_REF} \
|
|
||||||
${FLASHINFER_GIT_REPO} flashinfer
|
|
||||||
# Exclude CUDA arches for older versions (11.x and 12.0-12.7)
|
|
||||||
# TODO: Update this to allow setting TORCH_CUDA_ARCH_LIST as a build arg.
|
|
||||||
if [[ "${CUDA_VERSION}" == 11.* ]]; then
|
|
||||||
FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9"
|
|
||||||
elif [[ "${CUDA_VERSION}" == 12.[0-7]* ]]; then
|
|
||||||
FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a"
|
|
||||||
else
|
|
||||||
# CUDA 12.8+ supports 10.0a and 12.0
|
|
||||||
FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a 12.0"
|
|
||||||
fi
|
|
||||||
echo "🏗️ Building FlashInfer for arches: ${FI_TORCH_CUDA_ARCH_LIST}"
|
|
||||||
# Needed to build AOT kernels
|
|
||||||
pushd flashinfer
|
|
||||||
TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
|
|
||||||
python3 -m flashinfer.aot
|
|
||||||
TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
|
|
||||||
uv pip install --system --no-build-isolation --force-reinstall --no-deps .
|
|
||||||
popd
|
|
||||||
rm -rf flashinfer
|
|
||||||
BASH
|
|
||||||
COPY examples examples
|
COPY examples examples
|
||||||
COPY benchmarks benchmarks
|
COPY benchmarks benchmarks
|
||||||
COPY ./vllm/collect_env.py .
|
COPY ./vllm/collect_env.py .
|
||||||
|
|||||||
Binary file not shown.
|
Before Width: | Height: | Size: 119 KiB After Width: | Height: | Size: 143 KiB |
54
tools/flashinfer-build.sh
Executable file
54
tools/flashinfer-build.sh
Executable file
@ -0,0 +1,54 @@
|
|||||||
|
#!/usr/bin/env bash
#
# Build FlashInfer with AOT kernels.
# Used by both the Dockerfile and the standalone wheel-building job.
#
# Environment overrides (each falls back to the default assigned below):
#   FLASHINFER_GIT_REF  git ref to clone; keep in sync with requirements/cuda.txt
#   CUDA_VERSION        CUDA version that selects the arch list further down
#   BUILD_WHEEL         "true" builds a wheel; any other value installs directly

set -e -x

# FlashInfer configuration
FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
: "${FLASHINFER_GIT_REF:=v0.2.9rc2}"  # Must match requirements/cuda.txt
: "${CUDA_VERSION:=12.8.1}"
: "${BUILD_WHEEL:=false}"

printf '%s\n' "🏗️ Building FlashInfer ${FLASHINFER_GIT_REF} for CUDA ${CUDA_VERSION}"
|
||||||
|
|
||||||
|
# Clone FlashInfer at the configured ref into ./flashinfer: depth-1 history,
# submodules included and fetched shallow as well.
# Ref and repo URL are quoted so an overridden value containing whitespace or
# glob characters is passed to git as a single argument.
git clone --depth 1 --recursive --shallow-submodules \
  --branch "${FLASHINFER_GIT_REF}" \
  "${FLASHINFER_GIT_REPO}" flashinfer
|
||||||
|
|
||||||
|
# Pick the arch list from the CUDA version.
# Older toolkits (11.x and 12.0-12.7) get a reduced list; every other version
# falls through to the full list.
case "${CUDA_VERSION}" in
  11.*)
    FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9"
    ;;
  12.[0-7]*)
    FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a"
    ;;
  *)
    # CUDA 12.8+ supports 10.0a and 12.0
    FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a 12.0"
    ;;
esac

printf '%s\n' "🏗️ Building FlashInfer for arches: ${FI_TORCH_CUDA_ARCH_LIST}"
|
||||||
|
|
||||||
|
# Build AOT kernels and install/build wheel.
# Every build command receives the selected arch list via TORCH_CUDA_ARCH_LIST.
pushd flashinfer
TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
  python3 -m flashinfer.aot

if [[ "${BUILD_WHEEL}" == "true" ]]; then
  # Build wheel for distribution.
  # uv has no `pip wheel` subcommand; `uv build --wheel` is its wheel builder.
  # --no-build-isolation mirrors the install branch below.
  # NOTE(review): assumes the build needs packages already present in this
  # environment - confirm.
  TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
    uv build --wheel --no-build-isolation --out-dir /wheels .
  # /output is where the wheel-upload script copies the wheel from.
  mkdir -p /output && cp /wheels/*.whl /output/
  echo "✅ FlashInfer wheel built successfully"
else
  # Install directly (for Dockerfile)
  TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
    uv pip install --system --no-build-isolation --force-reinstall --no-deps .
  echo "✅ FlashInfer installed successfully"
fi
popd

# Cleanup: remove the cloned source tree.
rm -rf -- flashinfer
|
||||||
Reference in New Issue
Block a user