[NIXL][non-cuda] Add install script for nixl with non-cuda ucx (#25959)

Signed-off-by: Chendi Xue <Chendi.Xue@intel.com>
This commit is contained in:
Chendi.Xue
2025-10-08 09:19:53 -05:00
committed by GitHub
parent 2f99f2f506
commit 9fc983c707
3 changed files with 217 additions and 1 deletions

View File

@ -11,6 +11,12 @@ Install the NIXL library: `uv pip install nixl`, as a quick start.
- Refer to [NIXL official repository](https://github.com/ai-dynamo/nixl) for more installation instructions
- The specified required NIXL version can be found in [requirements/kv_connectors.txt](gh-file:requirements/kv_connectors.txt) and other relevant config files
For non-CUDA platforms, install NIXL together with a UCX build from source, as shown below.
```bash
python tools/install_nixl_from_source_ubuntu.py
```
### Transport Configuration
NixlConnector uses NIXL library for underlying communication, which supports multiple transport backends. UCX (Unified Communication X) is the primary default transport library used by NIXL. Configure transport environment variables:

View File

@ -191,7 +191,7 @@ For production deployments requiring strict SLA guarantees for time-to-first-tok
### Setup Steps
1. **Install gdrcopy/ucx/nixl**: For maximum performance, run the [install_gdrcopy.sh](gh-file:tools/install_gdrcopy.sh) script to install `gdrcopy` (e.g., `install_gdrcopy.sh "${GDRCOPY_OS_VERSION}" "12.8" "x64"`). You can find available OS versions [here](https://developer.download.nvidia.com/compute/redist/gdrcopy/CUDA%2012.8/). If `gdrcopy` is not installed, things will still work with a plain `pip install nixl`, just with lower performance. `nixl` and `ucx` are installed as dependencies via pip.
1. **Install gdrcopy/ucx/nixl**: For maximum performance, run the [install_gdrcopy.sh](gh-file:tools/install_gdrcopy.sh) script to install `gdrcopy` (e.g., `install_gdrcopy.sh "${GDRCOPY_OS_VERSION}" "12.8" "x64"`). You can find available OS versions [here](https://developer.download.nvidia.com/compute/redist/gdrcopy/CUDA%2012.8/). If `gdrcopy` is not installed, things will still work with a plain `pip install nixl`, just with lower performance. `nixl` and `ucx` are installed as dependencies via pip. On non-CUDA platforms, run the [install_nixl_from_source_ubuntu.py](gh-file:tools/install_nixl_from_source_ubuntu.py) script to install NIXL with a non-CUDA UCX build.
2. **Configure Both Instances**: Add this flag to both prefill and decode instances: `--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'`. Note that you may also specify one or more NIXL backends, for example: `--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both", "kv_connector_extra_config":{"backends":["UCX", "GDS"]}}'`

View File

@ -0,0 +1,210 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# install_nixl_from_source_ubuntu.py
import argparse
import glob
import os
import shutil
import subprocess
import sys
# --- Configuration ---
# Scratch location shared by the source checkouts and the UCX install prefix.
_TMP_BASE = '/tmp'

# Directory holding repaired (self-contained) NIXL wheels; can be overridden
# through the WHEELS_CACHE_HOME environment variable.
WHEELS_CACHE_HOME = os.getenv("WHEELS_CACHE_HOME", "/tmp/wheels_cache")

# Directory containing this script.
ROOT_DIR = os.path.dirname(os.path.abspath(__file__))

# Upstream repositories and the local directories they are cloned into.
UCX_REPO_URL = 'https://github.com/openucx/ucx.git'
NIXL_REPO_URL = 'https://github.com/ai-dynamo/nixl.git'
UCX_DIR = os.path.join(_TMP_BASE, 'ucx_source')
NIXL_DIR = os.path.join(_TMP_BASE, 'nixl_source')

# Prefix that UCX is configured to install into.
UCX_INSTALL_DIR = os.path.join(_TMP_BASE, 'ucx_install')
# --- Helper Functions ---
def run_command(command, cwd='.', env=None):
    """Echo a command line, then execute it and fail on a non-zero exit.

    Args:
        command: Argument list of the program to run (no shell is involved).
        cwd: Directory in which the command is executed.
        env: Optional environment mapping passed to the child process;
            ``None`` is forwarded unchanged to ``subprocess.check_call``.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero
            status.
    """
    rendered = ' '.join(command)
    print(f"--> Running command: {rendered} in '{cwd}'", flush=True)
    subprocess.check_call(command, cwd=cwd, env=env)
def is_pip_package_installed(package_name):
    """Tell whether ``pip show`` succeeds for *package_name*.

    The query runs ``pip`` through the current interpreter with its output
    discarded, so a missing package yields ``False`` instead of an exception.

    Args:
        package_name: Distribution name to look up.

    Returns:
        ``True`` when ``pip show`` exits with status 0, ``False`` otherwise.
    """
    pip_show = [sys.executable, '-m', 'pip', 'show', package_name]
    exit_status = subprocess.call(pip_show,
                                  stdout=subprocess.DEVNULL,
                                  stderr=subprocess.DEVNULL)
    return exit_status == 0
def _nixl_wheel_sort_key(wheel_path):
    """Return a sort key that orders wheel files by their version field.

    The version is taken as the second ``-``-separated field of the file
    name. Purely numeric dotted components compare as integers, so
    ``0.10.0`` ranks above ``0.9.0``. Components that are not purely numeric
    (for example ``0rc1``) rank below numeric ones in the same position.
    This is a best-effort ordering, not a full PEP 440 implementation.
    """
    file_name = os.path.basename(wheel_path)
    fields = file_name.split('-')
    version = fields[1] if len(fields) > 1 else ''
    components = []
    for piece in version.split('.'):
        if piece.isascii() and piece.isdigit():
            components.append((1, int(piece), ''))
        else:
            components.append((0, 0, piece))
    # The file name is a deterministic tie-breaker for equal versions.
    return components, file_name


def find_nixl_wheel_in_cache(cache_dir):
    """Find the highest-version nixl wheel in *cache_dir*.

    Args:
        cache_dir: Directory to search for files matching ``nixl-*.whl``.
            A repaired wheel carries a 'manylinux' tag in its name and is
            still matched by this pattern.

    Returns:
        Path of the matching wheel with the highest version, or ``None``
        when the directory contains no matching wheel (or does not exist).
    """
    candidates = glob.glob(os.path.join(cache_dir, "nixl-*.whl"))
    if not candidates:
        return None
    # A plain string sort would rank "0.9.0" above "0.10.0"; compare the
    # version components numerically instead.
    return max(candidates, key=_nixl_wheel_sort_key)
def install_system_dependencies():
    """Install the build toolchain with apt-get when running as root.

    When the effective user is not root, nothing is installed; a warning
    listing the packages the user must provide is printed instead.

    Raises:
        subprocess.CalledProcessError: If an ``apt-get`` invocation fails.
    """
    # Single source of truth for both the apt-get call and the manual hint.
    required_packages = [
        "patchelf",
        "build-essential",
        "git",
        "cmake",
        "ninja-build",
        "autotools-dev",
        "automake",
        "meson",
        "libtool",
        "libtool-bin",
    ]

    if os.geteuid() == 0:
        print("--- Running as root. Installing system dependencies... ---",
              flush=True)
        run_command(['apt-get', 'update'])
        run_command(['apt-get', 'install', '-y', *required_packages])
        print("--- System dependencies installed successfully. ---\n",
              flush=True)
        return

    # Unprivileged: tell the user what to install by hand.
    print("\n---", flush=True)
    print("WARNING: Not running as root. "
          "Skipping system dependency installation.",
          flush=True)
    print("Please ensure the listed packages are installed on your system:",
          flush=True)
    print(" " + " ".join(required_packages), flush=True)
    print("---\n", flush=True)
def build_and_install_prerequisites(args):
    """Build UCX and NIXL from source and install a self-contained wheel.

    The function returns early when NIXL is already installed or when a
    wheel is available in ``WHEELS_CACHE_HOME`` (unless
    ``args.force_reinstall`` is set). Otherwise it:

    1. builds UCX (branch ``v1.19.x``) and installs it into
       ``UCX_INSTALL_DIR``;
    2. builds a NIXL wheel against that UCX into a temporary wheelhouse;
    3. repairs the wheel with auditwheel, writing the result into
       ``WHEELS_CACHE_HOME``, and installs the repaired wheel with pip.

    Args:
        args: Parsed command-line arguments; only the boolean attribute
            ``force_reinstall`` is read.

    Raises:
        RuntimeError: If the built or the repaired wheel cannot be found.
        subprocess.CalledProcessError: If any external command fails.
    """
    if not args.force_reinstall and is_pip_package_installed('nixl'):
        print("--> NIXL is already installed. Nothing to do.", flush=True)
        return

    cached_wheel = find_nixl_wheel_in_cache(WHEELS_CACHE_HOME)
    if not args.force_reinstall and cached_wheel:
        print("\n--> Found self-contained wheel: "
              f"{os.path.basename(cached_wheel)}.",
              flush=True)
        print("--> Installing from cache, skipping all source builds.",
              flush=True)
        run_command([sys.executable, '-m', 'pip', 'install', cached_wheel])
        print("\n--- Installation from cache complete. ---", flush=True)
        return

    print("\n--> No installed package or cached wheel found. "
          "Starting full build process...",
          flush=True)

    print("\n--> Installing auditwheel...", flush=True)
    run_command([sys.executable, '-m', 'pip', 'install', 'auditwheel'])

    install_system_dependencies()

    ucx_install_path = os.path.abspath(UCX_INSTALL_DIR)
    print(f"--> Using wheel cache directory: {WHEELS_CACHE_HOME}", flush=True)
    os.makedirs(WHEELS_CACHE_HOME, exist_ok=True)

    # -- Step 1: Build UCX from source --
    print("\n[1/3] Configuring and building UCX from source...", flush=True)
    if not os.path.exists(UCX_DIR):
        run_command(['git', 'clone', UCX_REPO_URL, UCX_DIR])
    ucx_source_path = os.path.abspath(UCX_DIR)
    run_command(['git', 'checkout', 'v1.19.x'], cwd=ucx_source_path)
    run_command(['./autogen.sh'], cwd=ucx_source_path)
    configure_command = [
        './configure',
        f'--prefix={ucx_install_path}',
        '--enable-shared',
        '--disable-static',
        '--disable-doxygen-doc',
        '--enable-optimizations',
        '--enable-cma',
        '--enable-devel-headers',
        '--with-verbs',
        '--enable-mt',
    ]
    run_command(configure_command, cwd=ucx_source_path)
    run_command(['make', '-j', str(os.cpu_count() or 1)], cwd=ucx_source_path)
    run_command(['make', 'install'], cwd=ucx_source_path)
    print("--- UCX build and install complete ---", flush=True)

    # -- Step 2: Build NIXL wheel from source --
    print("\n[2/3] Building NIXL wheel from source...", flush=True)
    if not os.path.exists(NIXL_DIR):
        run_command(['git', 'clone', NIXL_REPO_URL, NIXL_DIR])

    build_env = os.environ.copy()
    # Put the freshly built UCX first, but keep any search paths the caller
    # already exported instead of discarding them.
    ucx_pkgconfig_path = os.path.join(ucx_install_path, 'lib', 'pkgconfig')
    existing_pkg_config_path = os.environ.get('PKG_CONFIG_PATH', '')
    build_env['PKG_CONFIG_PATH'] = \
        f"{ucx_pkgconfig_path}:{existing_pkg_config_path}".strip(':')
    ucx_lib_path = os.path.join(ucx_install_path, 'lib')
    ucx_plugin_path = os.path.join(ucx_lib_path, 'ucx')
    existing_ld_path = os.environ.get('LD_LIBRARY_PATH', '')
    build_env['LD_LIBRARY_PATH'] = \
        f"{ucx_lib_path}:{ucx_plugin_path}:{existing_ld_path}".strip(':')
    print(f"--> Using LD_LIBRARY_PATH: {build_env['LD_LIBRARY_PATH']}",
          flush=True)

    temp_wheel_dir = os.path.join(ROOT_DIR, 'temp_wheelhouse')
    # Start from an empty wheelhouse so a wheel left behind by an earlier,
    # interrupted run cannot be mistaken for the one built below.
    shutil.rmtree(temp_wheel_dir, ignore_errors=True)
    try:
        run_command([
            sys.executable, '-m', 'pip', 'wheel', '.', '--no-deps',
            f'--wheel-dir={temp_wheel_dir}'
        ],
                    cwd=os.path.abspath(NIXL_DIR),
                    env=build_env)

        # -- Step 3: Repair the wheel by copying UCX libraries --
        print("\n[3/3] Repairing NIXL wheel to include UCX libraries...",
              flush=True)
        unrepaired_wheel = find_nixl_wheel_in_cache(temp_wheel_dir)
        if not unrepaired_wheel:
            raise RuntimeError(
                "Failed to find the NIXL wheel after building it.")

        # Run auditwheel through the interpreter it was just installed for,
        # so it is found even when that environment's scripts directory is
        # not on PATH. libplugin_UCX.so is excluded because mesonpy already
        # includes it in the wheel.
        auditwheel_command = [
            sys.executable,
            '-m',
            'auditwheel',
            'repair',
            '--exclude',
            'libplugin_UCX.so',
            unrepaired_wheel,
            f'--wheel-dir={WHEELS_CACHE_HOME}'
        ]
        run_command(auditwheel_command, env=build_env)
    finally:
        # The temporary wheelhouse is the only scratch artifact; remove it
        # whether or not the build succeeded.
        shutil.rmtree(temp_wheel_dir, ignore_errors=True)

    # Pick the most recently written wheel: with --force-reinstall the cache
    # may still hold older wheels, and the one just repaired is the newest.
    repaired_wheels = glob.glob(os.path.join(WHEELS_CACHE_HOME, "nixl-*.whl"))
    if not repaired_wheels:
        raise RuntimeError("Failed to find the repaired NIXL wheel.")
    newly_built_wheel = max(repaired_wheels, key=os.path.getmtime)

    print("--> Successfully built self-contained wheel: "
          f"{os.path.basename(newly_built_wheel)}. Now installing...",
          flush=True)
    install_command = [sys.executable, '-m', 'pip', 'install']
    if args.force_reinstall:
        install_command.append('--force-reinstall')
    install_command.append(newly_built_wheel)
    run_command(install_command)
    print("--- NIXL installation complete ---", flush=True)
if __name__ == "__main__":
    # Command-line entry point: one optional switch, then run the installer.
    cli = argparse.ArgumentParser(
        description="Build and install UCX and NIXL dependencies.")
    cli.add_argument(
        '--force-reinstall',
        action='store_true',
        help=('Force rebuild and reinstall of UCX and NIXL '
              'even if they are already installed.'),
    )
    build_and_install_prerequisites(cli.parse_args())