CUTLASS 2.1 (#83)

CUTLASS 2.1 contributes: - BLAS-style host-side API added to CUTLASS Library - Planar Complex GEMM kernels targeting Volta and Turing Tensor Cores - Minor enhancements and bug fixes
2020-04-07 13:51:25 -07:00
parent 7c0cd26d13
commit 96dab34ad9
196 changed files with 20653 additions and 1995 deletions
--- a/CUDA.cmake
+++ b/CUDA.cmake
@ -39,23 +39,27 @@ if(CUTLASS_NATIVE_CUDA)

  enable_language(CUDA)

+  if(NOT CUDA_VERSION)
+    set(CUDA_VERSION ${CMAKE_CUDA_COMPILER_VERSION})
+  endif()
+  if(NOT CUDA_TOOLKIT_ROOT_DIR)
+    get_filename_component(CUDA_TOOLKIT_ROOT_DIR "${CMAKE_CUDA_COMPILER}/../.." ABSOLUTE)
+  endif()
+
 else()

  find_package(CUDA REQUIRED)
+  # We workaround missing variables with the native flow by also finding the CUDA toolkit the old way.

-endif()
+  if(NOT CMAKE_CUDA_COMPILER_VERSION)
+    set(CMAKE_CUDA_COMPILER_VERSION ${CUDA_VERSION})
+  endif()

-if(NOT CUDA_VERSION)
-  set(CUDA_VERSION ${CMAKE_CUDA_COMPILER_VERSION})
-endif()
-if(NOT CUDA_TOOLKIT_ROOT_DIR)
-  get_filename_component(CUDA_TOOLKIT_ROOT_DIR "${CMAKE_CUDA_COMPILER}/../.." ABSOLUTE)
 endif()

 if (CUDA_VERSION VERSION_LESS 9.2)
  message(FATAL_ERROR "CUDA 9.2+ Required, Found ${CUDA_VERSION}.")
 endif()
-
 if(NOT CUTLASS_NATIVE_CUDA OR CUDA_COMPILER MATCHES "[Cc]lang")
  set(CMAKE_CUDA_COMPILER ${CUDA_TOOLKIT_ROOT_DIR}/bin/nvcc)
  message(STATUS "CUDA Compiler: ${CMAKE_CUDA_COMPILER}")
@ -74,7 +78,7 @@ find_library(
  # in the CUDA toolkit we're building against.
  )

-if(CUDART_LIBRARY)
+if(NOT TARGET cudart AND CUDART_LIBRARY)

  message(STATUS "CUDART: ${CUDART_LIBRARY}")

@ -95,6 +99,10 @@ if(CUDART_LIBRARY)
    ${CUDART_LIBRARY}
    )

+elseif(TARGET cudart)
+
+  message(STATUS "CUDART: Already Found")
+
 else()

  message(STATUS "CUDART: Not Found")
@ -116,7 +124,7 @@ find_library(
  # in the CUDA toolkit we're building against.
  )

-if(CUDA_DRIVER_LIBRARY)
+if(NOT TARGET cuda_driver AND CUDA_DRIVER_LIBRARY)

  message(STATUS "CUDA Driver: ${CUDA_DRIVER_LIBRARY}")

@ -137,6 +145,10 @@ if(CUDA_DRIVER_LIBRARY)
    ${CUDA_DRIVER_LIBRARY}
    )

+elseif(TARGET cuda_driver)
+
+  message(STATUS "CUDA Driver: Already Found")
+
 else()

  message(STATUS "CUDA Driver: Not Found")
@ -156,7 +168,7 @@ find_library(
  # in the CUDA toolkit we're building against.
  )

-if(NVRTC_LIBRARY)
+if(NOT TARGET nvrtc AND NVRTC_LIBRARY)

  message(STATUS "NVRTC: ${NVRTC_LIBRARY}")

@ -177,6 +189,10 @@ if(NVRTC_LIBRARY)
    ${NVRTC_LIBRARY}
    )

+elseif(TARGET nvrtc)
+
+  message(STATUS "NVRTC: Already Found")
+
 else()

  message(STATUS "NVRTC: Not Found")
@ -197,48 +213,137 @@ function(cutlass_correct_source_file_language_property)
  endif()
 endfunction()

-function(cutlass_add_library)
+set(CUTLASS_UNITY_BUILD_ENABLED ON CACHE BOOL "Enable combined source compilation")
+set(CUTLASS_UNITY_BUILD_BATCH_SIZE 16 CACHE STRING "Batch size for unified source files")

-  set(options INTERFACE STATIC SHARED OBJECT)
-  set(oneValueArgs)
+function(cutlass_unify_source_files TARGET_ARGS_VAR)
+
+  set(options)
+  set(oneValueArgs BATCH_SOURCES BATCH_SIZE)
  set(multiValueArgs)
  cmake_parse_arguments(_ "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})

-  if(CUTLASS_NATIVE_CUDA OR CUDA_COMPILER MATCHES "clang" OR __INTERFACE)
-    cutlass_correct_source_file_language_property(${ARGN})
-    add_library(${ARGN})
-  else()
-    set(CUDA_LINK_LIBRARIES_KEYWORD PRIVATE)
-    cuda_add_library(${ARGN})
+  if (NOT DEFINED TARGET_ARGS_VAR)
+    message(FATAL_ERROR "TARGET_ARGS_VAR parameter is required")
  endif()

+  if (__BATCH_SOURCES AND NOT DEFINED __BATCH_SIZE)
+    set(__BATCH_SIZE ${CUTLASS_UNITY_BUILD_BATCH_SIZE})
+  endif()
+
+  if (CUTLASS_UNITY_BUILD_ENABLED AND DEFINED __BATCH_SIZE AND __BATCH_SIZE GREATER 1)
+
+    set(CUDA_FILE_ARGS)
+    set(TARGET_SOURCE_ARGS)
+    
+    foreach(ARG ${__UNPARSED_ARGUMENTS})
+      if(${ARG} MATCHES ".*\.cu$")
+        list(APPEND CUDA_FILE_ARGS ${ARG})
+      else()
+        list(APPEND TARGET_SOURCE_ARGS ${ARG})
+      endif()
+    endforeach()
+    
+    list(LENGTH CUDA_FILE_ARGS NUM_CUDA_FILE_ARGS)
+    while(NUM_CUDA_FILE_ARGS GREATER 0)
+      list(SUBLIST CUDA_FILE_ARGS 0 ${__BATCH_SIZE} CUDA_FILE_BATCH)
+      string(SHA256 CUDA_FILE_BATCH_HASH "${CUDA_FILE_BATCH}")
+      string(SUBSTRING ${CUDA_FILE_BATCH_HASH} 0 12 CUDA_FILE_BATCH_HASH)
+      set(BATCH_FILE ${CMAKE_CURRENT_BINARY_DIR}/${NAME}.unity.${CUDA_FILE_BATCH_HASH}.cu)
+      message(STATUS "Generating ${BATCH_FILE}")
+      file(WRITE ${BATCH_FILE} "// Unity File - Auto Generated!\n")
+      foreach(CUDA_FILE ${CUDA_FILE_BATCH})
+        get_filename_component(CUDA_FILE_ABS_PATH ${CUDA_FILE} ABSOLUTE)
+        file(APPEND ${BATCH_FILE} "#include \"${CUDA_FILE_ABS_PATH}\"\n")
+      endforeach()
+      list(APPEND TARGET_SOURCE_ARGS ${BATCH_FILE})
+      if (NUM_CUDA_FILE_ARGS LESS_EQUAL __BATCH_SIZE)
+        break()
+      endif()
+      list(SUBLIST CUDA_FILE_ARGS ${__BATCH_SIZE} -1 CUDA_FILE_ARGS)
+      list(LENGTH CUDA_FILE_ARGS NUM_CUDA_FILE_ARGS)
+    endwhile()
+
+  else()
+
+    set(TARGET_SOURCE_ARGS ${__UNPARSED_ARGUMENTS})
+
+  endif()
+
+  set(${TARGET_ARGS_VAR} ${TARGET_SOURCE_ARGS} PARENT_SCOPE)
+
 endfunction()

-function(cutlass_add_executable)
+function(cutlass_add_library NAME)

  set(options)
-  set(oneValueArgs)
+  set(oneValueArgs EXPORT_NAME)
  set(multiValueArgs)
  cmake_parse_arguments(_ "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})

+  cutlass_unify_source_files(TARGET_SOURCE_ARGS ${__UNPARSED_ARGUMENTS})
+
  if(CUTLASS_NATIVE_CUDA OR CUDA_COMPILER MATCHES "clang")
-    cutlass_correct_source_file_language_property(${ARGN})
-    add_executable(${ARGN})
+    cutlass_correct_source_file_language_property(${TARGET_SOURCE_ARGS})
+    add_library(${NAME} ${TARGET_SOURCE_ARGS})
  else()
    set(CUDA_LINK_LIBRARIES_KEYWORD PRIVATE)
-    cuda_add_executable(${ARGN})
+    cuda_add_library(${NAME} ${TARGET_SOURCE_ARGS})
+  endif()
+
+  cutlass_apply_standard_compile_options(${NAME})
+  cutlass_apply_cuda_gencode_flags(${NAME})
+
+  target_compile_features(
+   ${NAME}
+   INTERFACE
+   cxx_std_11
+   )
+
+  if(__EXPORT_NAME)
+    add_library(nvidia::cutlass::${__EXPORT_NAME} ALIAS ${NAME})
+    set_target_properties(${NAME} PROPERTIES EXPORT_NAME ${__EXPORT_NAME})
  endif()

 endfunction()

-function(cutlass_target_sources)
+function(cutlass_add_executable NAME)

  set(options)
  set(oneValueArgs)
  set(multiValueArgs)
  cmake_parse_arguments(_ "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})

-  cutlass_correct_source_file_language_property(${ARGN})
-  target_sources(${ARGN})
+  cutlass_unify_source_files(TARGET_SOURCE_ARGS ${__UNPARSED_ARGUMENTS})
+
+  if(CUTLASS_NATIVE_CUDA OR CUDA_COMPILER MATCHES "clang")
+    cutlass_correct_source_file_language_property(${TARGET_SOURCE_ARGS})
+    add_executable(${NAME} ${TARGET_SOURCE_ARGS})
+  else()
+    set(CUDA_LINK_LIBRARIES_KEYWORD PRIVATE)
+    cuda_add_executable(${NAME} ${TARGET_SOURCE_ARGS})
+  endif()
+
+  cutlass_apply_standard_compile_options(${NAME})
+  cutlass_apply_cuda_gencode_flags(${NAME})
+
+  target_compile_features(
+   ${NAME}
+   INTERFACE
+   cxx_std_11
+   )
+
+endfunction()
+
+function(cutlass_target_sources NAME)
+
+  set(options)
+  set(oneValueArgs)
+  set(multiValueArgs)
+  cmake_parse_arguments(_ "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+
+  cutlass_unify_source_files(TARGET_SOURCE_ARGS ${__UNPARSED_ARGUMENTS})
+  cutlass_correct_source_file_language_property(${TARGET_SOURCE_ARGS})
+  target_sources(${NAME} ${TARGET_SOURCE_ARGS})

 endfunction()