diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake
index 4b8f0daacb..192d349b30 100644
--- a/cmake/cpu_extension.cmake
+++ b/cmake/cpu_extension.cmake
@@ -212,11 +212,24 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
     # Build ACL with scons
     include(ProcessorCount)
     ProcessorCount(_NPROC)
+    set(_scons_cmd
+        scons -j${_NPROC}
+        Werror=0 debug=0 neon=1 examples=0 embed_kernels=0 os=linux
+        arch=armv8.2-a build=native benchmark_examples=0 fixed_format_kernels=1
+        multi_isa=1 openmp=1 cppthreads=0
+    )
+
+    # locate PyTorch's libgomp (e.g. site-packages/torch.libs/libgomp-947d5fa1.so.1.0.0)
+    # and create a local shim dir with it
+    include("${CMAKE_CURRENT_LIST_DIR}/utils.cmake")
+    vllm_prepare_torch_gomp_shim(VLLM_TORCH_GOMP_SHIM_DIR)
+
+    if(NOT VLLM_TORCH_GOMP_SHIM_DIR STREQUAL "")
+        list(APPEND _scons_cmd extra_link_flags=-L${VLLM_TORCH_GOMP_SHIM_DIR})
+    endif()
+
     execute_process(
-        COMMAND scons -j${_NPROC}
-            Werror=0 debug=0 neon=1 examples=0 embed_kernels=0 os=linux
-            arch=armv8.2-a build=native benchmark_examples=0 fixed_format_kernels=1
-            multi_isa=1 openmp=1 cppthreads=0
+        COMMAND ${_scons_cmd}
         WORKING_DIRECTORY "$ENV{ACL_ROOT_DIR}"
         RESULT_VARIABLE _acl_rc
     )
diff --git a/cmake/utils.cmake b/cmake/utils.cmake
index f6a0d2b75b..c2181d4549 100644
--- a/cmake/utils.cmake
+++ b/cmake/utils.cmake
@@ -129,6 +129,44 @@ function (get_torch_gpu_compiler_flags OUT_GPU_FLAGS GPU_LANG)
   set(${OUT_GPU_FLAGS} ${GPU_FLAGS} PARENT_SCOPE)
 endfunction()
 
+# Find libgomp that gets shipped with PyTorch wheel and create a shim dir with:
+#   libgomp.so   -> libgomp-<hash>.so.<version>
+#   libgomp.so.1 -> libgomp-<hash>.so.<version>
+# OUTPUT: TORCH_GOMP_SHIM_DIR ("" if not found)
+function(vllm_prepare_torch_gomp_shim TORCH_GOMP_SHIM_DIR)
+  set(${TORCH_GOMP_SHIM_DIR} "" PARENT_SCOPE)
+
+  # Use run_python to locate vendored libgomp; never throw on failure.
+  run_python(_VLLM_TORCH_GOMP_PATH
+    "
+import os, glob
+try:
+    import torch
+    torch_pkg = os.path.dirname(torch.__file__)
+    site_root = os.path.dirname(torch_pkg)
+    torch_libs = os.path.join(site_root, 'torch.libs')
+    print(glob.glob(os.path.join(torch_libs, 'libgomp-*.so*'))[0])
+except Exception:
+    print('')
+"
+    "failed to probe torch.libs for libgomp")
+
+  if(_VLLM_TORCH_GOMP_PATH STREQUAL "" OR NOT EXISTS "${_VLLM_TORCH_GOMP_PATH}")
+    return()
+  endif()
+
+  # Create shim under the build tree
+  set(_shim "${CMAKE_BINARY_DIR}/gomp_shim")
+  file(MAKE_DIRECTORY "${_shim}")
+
+  execute_process(COMMAND ${CMAKE_COMMAND} -E rm -f "${_shim}/libgomp.so")
+  execute_process(COMMAND ${CMAKE_COMMAND} -E rm -f "${_shim}/libgomp.so.1")
+  execute_process(COMMAND ${CMAKE_COMMAND} -E create_symlink "${_VLLM_TORCH_GOMP_PATH}" "${_shim}/libgomp.so")
+  execute_process(COMMAND ${CMAKE_COMMAND} -E create_symlink "${_VLLM_TORCH_GOMP_PATH}" "${_shim}/libgomp.so.1")
+
+  set(${TORCH_GOMP_SHIM_DIR} "${_shim}" PARENT_SCOPE)
+endfunction()
+
 # Macro for converting a `gencode` version number to a cmake version number.
 macro(string_to_ver OUT_VER IN_STR)
   string(REGEX REPLACE "\([0-9]+\)\([0-9]\)" "\\1.\\2" ${OUT_VER} ${IN_STR})
diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py
index 699a56be5c..8c1d46564f 100644
--- a/vllm/platforms/cpu.py
+++ b/vllm/platforms/cpu.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+import glob
 import json
 import os
 import platform
@@ -301,8 +302,8 @@ class CpuPlatform(Platform):
             os.environ["VLLM_DISABLE_SHARED_EXPERTS_STREAM"] = "0"
 
         # Intel OpenMP setting
-        ld_prealod_str = os.getenv("LD_PRELOAD", "")
-        if "libiomp5.so" in ld_prealod_str:
+        ld_preload_str = os.getenv("LD_PRELOAD", "")
+        if "libiomp5.so" in ld_preload_str:
             # The time(milliseconds) that a thread should wait after
             # completing the execution of a parallel region, before sleeping.
             os.environ["KMP_BLOCKTIME"] = "1"
@@ -313,6 +314,31 @@ class CpuPlatform(Platform):
             os.environ["KMP_PLAIN_BARRIER_PATTERN"] = "dist,dist"
             os.environ["KMP_REDUCTION_BARRIER_PATTERN"] = "dist,dist"
 
+        if (
+            platform.system() == "Linux"
+            and Platform.get_cpu_architecture() == CpuArchEnum.ARM
+            and not ("libomp" in ld_preload_str or "libgomp" in ld_preload_str)
+        ):
+            # We need to LD_PRELOAD PyTorch's libgomp, otherwise only
+            # one core will be properly utilized when we thread-bind
+            # See: https://github.com/vllm-project/vllm/issues/27369
+            # TODO: Remove once:
+            # https://github.com/pytorch/pytorch/issues/166087 is fixed
+
+            # We need to find the location of PyTorch's libgomp
+            torch_pkg = os.path.dirname(torch.__file__)
+            site_root = os.path.dirname(torch_pkg)
+            torch_libs = os.path.join(site_root, "torch.libs")
+            pytorch_libgomp_so_candidates = glob.glob(
+                os.path.join(torch_libs, "libgomp-*.so*")
+            )
+            if pytorch_libgomp_so_candidates:
+                pytorch_libgomp_so = pytorch_libgomp_so_candidates[0]
+                if ld_preload_str:
+                    ld_preload_str += ":"
+                ld_preload_str += pytorch_libgomp_so
+                os.environ["LD_PRELOAD"] = ld_preload_str
+
         # To hint IPEX uses shared memory based AllReduce
         os.environ["LOCAL_WORLD_SIZE"] = str(
             vllm_config.parallel_config.tensor_parallel_size