CUTLASS 2.0 (#62)

CUTLASS 2.0 Substantially refactored for - Better performance, particularly for native Turing Tensor Cores - Robust and durable templates spanning the design space - Encapsulated functionality embodying modern C++11 programming techniques - Optimized containers and data types for efficient, generic, portable device code Updates to: - Quick start guide - Documentation - Utilities - CUTLASS Profiler Native Turing Tensor Cores - Efficient GEMM kernels targeting Turing Tensor Cores - Mixed-precision floating point, 8-bit integer, 4-bit integer, and binarized operands Coverage of existing CUTLASS functionality: - GEMM kernels targeting CUDA and Tensor Cores in NVIDIA GPUs - Volta Tensor Cores through native mma.sync and through WMMA API - Optimizations such as parallel reductions, threadblock rasterization, and intra-threadblock reductions - Batched GEMM operations - Complex-valued GEMMs Note: this commit and all that follow require a host compiler supporting C++11 or greater.
2019-11-19 16:55:34 -08:00
parent b5cab177a9
commit fb335f6a5f
5434 changed files with 599799 additions and 250176 deletions
--- a/test/unit/nvrtc/thread/CMakeLists.txt
+++ b/test/unit/nvrtc/thread/CMakeLists.txt
@ -0,0 +1,29 @@
+# Copyright (c) 2017-2019, NVIDIA CORPORATION.  All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without modification, are permitted
+# provided that the following conditions are met:
+#     * Redistributions of source code must retain the above copyright notice, this list of
+#       conditions and the following disclaimer.
+#     * Redistributions in binary form must reproduce the above copyright notice, this list of
+#       conditions and the following disclaimer in the documentation and/or other materials
+#       provided with the distribution.
+#     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
+#       to endorse or promote products derived from this software without specific prior written
+#       permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
+# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+# STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+cutlass_test_unit_add_executable(
+  cutlass_test_unit_nvrtc_thread
+  gemm_nvrtc.cu
+  testbed.h
+  )
+
+target_link_libraries(cutlass_test_unit_nvrtc_thread PRIVATE cutlass_nvrtc)
--- a/test/unit/nvrtc/thread/gemm_nvrtc.cu
+++ b/test/unit/nvrtc/thread/gemm_nvrtc.cu
@ -0,0 +1,197 @@
+/***************************************************************************************************
+ * Copyright (c) 2017-2019, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification, are permitted
+ * provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright notice, this list of
+ *       conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright notice, this list of
+ *       conditions and the following disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
+ *       to endorse or promote products derived from this software without specific prior written
+ *       permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Unit tests for thread-level GEMM
+*/
+
+#include "../../common/cutlass_unit_test.h"
+
+#include "cutlass/gemm/thread/mma.h"
+
+#include "testbed.h"
+
+#if 0
+int main() {
+  nvrtc::thread::Testbed<
+    cutlass::gemm::GemmShape<3, 4, 2>,
+    float,
+    cutlass::layout::ColumnMajor,
+    float,
+    cutlass::layout::RowMajor,
+    float,
+    cutlass::layout::ColumnMajor
+  >().run("cutlass::gemm::thread::Mma<cutlass::gemm::GemmShape<3, 4, 2>, float, cutlass::layout::ColumnMajor, float, cutlass::layout::RowMajor, float, cutlass::layout::ColumnMajor >");
+  return 0;
+}
+#endif
+
+TEST(SM50_Sgemm_thread_nvrtc, DISABLED_col_row_3x4x2) {
+
+  test::nvrtc::thread::Testbed<
+    cutlass::gemm::GemmShape<3, 4, 2>,
+    float,
+    cutlass::layout::ColumnMajor,
+    float,
+    cutlass::layout::RowMajor,
+    float,
+    cutlass::layout::ColumnMajor
+  >().run("cutlass::gemm::thread::Mma<cutlass::gemm::GemmShape<3, 4, 2>, float, cutlass::layout::ColumnMajor, float, cutlass::layout::RowMajor, float, cutlass::layout::ColumnMajor >");
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+#if 0
+TEST(SM50_Sgemm_thread, col_row_3x4x2) {
+
+  test::gemm::thread::Testbed<
+    cutlass::gemm::GemmShape<3, 4, 2>,
+    float,
+    cutlass::layout::ColumnMajor,
+    float,
+    cutlass::layout::RowMajor,
+    float,
+    cutlass::layout::ColumnMajor
+  >().run();
+}
+
+TEST(SM50_Sgemm_thread, col_row_4x4x2) {
+
+  test::gemm::thread::Testbed<
+    cutlass::gemm::GemmShape<4, 4, 2>,
+    float,
+    cutlass::layout::ColumnMajor,
+    float,
+    cutlass::layout::RowMajor,
+    float,
+    cutlass::layout::ColumnMajor
+  >().run();
+}
+
+TEST(SM50_Sgemm_thread, row_col_4x4x2) {
+
+  test::gemm::thread::Testbed<
+    cutlass::gemm::GemmShape<4, 4, 2>,
+    float,
+    cutlass::layout::RowMajor,
+    float,
+    cutlass::layout::ColumnMajor,
+    float,
+    cutlass::layout::ColumnMajor
+  >().run();
+}
+
+TEST(SM50_Sgemm_thread, col_row_4x5x3) {
+
+  test::gemm::thread::Testbed<
+    cutlass::gemm::GemmShape<4, 5, 3>,
+    float,
+    cutlass::layout::ColumnMajor,
+    float,
+    cutlass::layout::RowMajor,
+    float,
+    cutlass::layout::ColumnMajor
+  >().run();
+}
+
+TEST(SM50_Sgemm_thread, col_row) {
+
+  test::gemm::thread::Testbed<
+    cutlass::gemm::GemmShape<8, 8, 1>,
+    float,
+    cutlass::layout::ColumnMajor,
+    float,
+    cutlass::layout::RowMajor,
+    float,
+    cutlass::layout::ColumnMajor
+  >().run();
+}
+
+TEST(SM50_Sgemm_thread, row_col) {
+
+  test::gemm::thread::Testbed<
+    cutlass::gemm::GemmShape<8, 8, 1>,
+    float,
+    cutlass::layout::RowMajor,
+    float,
+    cutlass::layout::ColumnMajor,
+    float,
+    cutlass::layout::ColumnMajor
+  >().run();
+}
+
+TEST(SM50_Sgemm_thread, col_col) {
+
+  test::gemm::thread::Testbed<
+    cutlass::gemm::GemmShape<8, 8, 1>,
+    float,
+    cutlass::layout::ColumnMajor,
+    float,
+    cutlass::layout::ColumnMajor,
+    float,
+    cutlass::layout::ColumnMajor
+  >().run();
+}
+
+TEST(SM50_Sgemm_thread, row_row) {
+
+  test::gemm::thread::Testbed<
+    cutlass::gemm::GemmShape<8, 8, 1>,
+    float,
+    cutlass::layout::RowMajor,
+    float,
+    cutlass::layout::RowMajor,
+    float,
+    cutlass::layout::ColumnMajor
+  >().run();
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM50_Dgemm_thread, col_row) {
+
+  test::gemm::thread::Testbed<
+    cutlass::gemm::GemmShape<8, 8, 1>,
+    double,
+    cutlass::layout::ColumnMajor,
+    double,
+    cutlass::layout::RowMajor,
+    double,
+    cutlass::layout::ColumnMajor
+  >().run();
+}
+
+TEST(SM50_Dgemm_thread, row_col) {
+
+  test::gemm::thread::Testbed<
+    cutlass::gemm::GemmShape<8, 8, 1>,
+    double,
+    cutlass::layout::RowMajor,
+    double,
+    cutlass::layout::ColumnMajor,
+    double,
+    cutlass::layout::ColumnMajor
+  >().run();
+}
+#endif
+/////////////////////////////////////////////////////////////////////////////////////////////////
--- a/test/unit/nvrtc/thread/testbed.h
+++ b/test/unit/nvrtc/thread/testbed.h
@ -0,0 +1,317 @@
+/***************************************************************************************************
+ * Copyright (c) 2017-2019, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification, are permitted
+ * provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright notice, this list of
+ *       conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright notice, this list of
+ *       conditions and the following disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
+ *       to endorse or promote products derived from this software without specific prior written
+ *       permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Unit tests for thread-level GEMM
+*/
+
+#pragma once
+
+#include <iostream>
+
+#include "cutlass/gemm/thread/mma.h"
+#include "../kernel/thread/testbed_kernel.h"
+
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/gemm.h"
+
+#include <cuda.h>
+#include <nvrtc.h>
+#include "../cutlass/nvrtc/environment.h"
+#include <assert.h>
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace test {
+namespace nvrtc {
+namespace thread {
+
+/// Structure to compute the matrix product
+template <
+  /// Size of the Gemm problem - concept: gemm::GemmShape<>
+  typename Shape,
+  /// Data type of A elements
+  typename ElementA,
+  /// Layout of A matrix (concept: MatrixLayout)
+  typename LayoutA,
+  /// Data type of B elements
+  typename ElementB,
+  /// Layout of B matrix (concept: MatrixLayout)
+  typename LayoutB,
+  /// Element type of C matrix
+  typename ElementC,
+  /// Layout of C matrix (concept: MatrixLayout)
+  typename LayoutC
+>
+struct Testbed {
+
+  /// Thread-level matrix multiply-accumulate operator
+  using Mma = cutlass::gemm::thread::Mma<
+    Shape,
+    ElementA,
+    LayoutA,
+    ElementB,
+    LayoutB,
+    ElementC,
+    LayoutC
+  >;
+
+  //
+  // Data members
+  //
+
+  cutlass::HostTensor<ElementA, LayoutA> tensor_A;
+  cutlass::HostTensor<ElementB, LayoutB> tensor_B;
+  cutlass::HostTensor<ElementC, LayoutC> tensor_C;
+  cutlass::HostTensor<ElementC, LayoutC> tensor_D_computed;
+  cutlass::HostTensor<ElementC, LayoutC> tensor_D_reference;
+
+  //
+  // Methods
+  //
+
+  /// Allocates workspace in device memory
+  Testbed() {
+
+    tensor_A.reset(cutlass::make_Coord(Shape::kM, Shape::kK));
+    tensor_B.reset(cutlass::make_Coord(Shape::kK, Shape::kN));
+    tensor_C.reset(cutlass::make_Coord(Shape::kM, Shape::kN));
+    tensor_D_computed.reset(cutlass::make_Coord(Shape::kM, Shape::kN));
+    tensor_D_reference.reset(cutlass::make_Coord(Shape::kM, Shape::kN), false);
+  }
+
+  static inline bool check_nvrtc_error(nvrtcResult error) {
+    if (error != NVRTC_SUCCESS) {
+      std::cerr << "failed to compile ";
+      return false;
+    }
+    return true;
+  }
+
+  /// Runs the test
+  bool run(std::string const &gemm_traits) {
+
+    //
+    // initialize device memory
+    //
+
+    cutlass::reference::host::BlockFillSequential(
+      tensor_A.host_data(),
+      tensor_A.capacity()
+    );
+
+    cutlass::reference::host::BlockFillSequential(
+      tensor_B.host_data(),
+      tensor_B.capacity(),
+      ElementB(1),
+      ElementB(2)
+    );
+
+    cutlass::reference::host::TensorFill(
+      tensor_C.host_view(),
+      ElementC(0)
+    );
+
+    cutlass::reference::host::TensorFill(
+      tensor_D_computed.host_view(),
+      ElementC(0)
+    );
+
+    cutlass::reference::host::TensorFill(
+      tensor_D_reference.host_view(),
+      ElementC(0)
+    );
+
+    tensor_A.sync_device();
+    tensor_B.sync_device();
+    tensor_C.sync_device();
+    tensor_D_computed.sync_device();
+
+#if 0
+    // launch kernel
+    cutlass::gemm::kernel::testbed_kernel<Mma><<< dim3(1, 1), dim3(1, 1, 1) >>>(
+      tensor_D_computed.device_data(),
+      tensor_A.device_data(),
+      tensor_B.device_data(),
+      tensor_C.device_data());
+
+#else
+    // Instantiate gemm_kernel
+    nvrtcResult result_nvrtc;
+    nvrtcProgram program;
+    static char const *src =
+        "#include \"cutlass/gemm/thread/mma.h\"\n"
+        "#include \"cutlass/gemm/gemm.h\"\n"
+        "#include \"cutlass/layout/matrix.h\"\n"
+        "#include \"unit/nvrtc/kernel/thread/testbed_kernel.h\"\n"
+    ;
+
+    std::string type_name;
+#if 0
+    // TODO Ideally we'd use nvrtcGetTypeName to determine the type, but it cannot resolve enum symbol names
+    // As altername solution we might want to implement to_string<GemmTraits>() to get the traits string.
+    nvrtcGetTypeName<typename GemmTraits_>(&type_name);
+#else
+    type_name = gemm_traits;
+#endif
+
+    result_nvrtc = nvrtcCreateProgram(&program,
+                                    src,
+                                    NULL,
+                                    (int)cutlass::nvrtc::kCutlassHeaderCount,
+                                    cutlass::nvrtc::kCutlassHeaders,
+                                    cutlass::nvrtc::kCutlassHeaderNames);
+    check_nvrtc_error(result_nvrtc);
+
+    std::string gemm_kernel_instantiation =
+      "test::nvrtc::kernel::thread::testbed_kernel< " + type_name + " >";
+    nvrtcAddNameExpression(program, gemm_kernel_instantiation.c_str());
+
+    const char *opts[] = {"--gpu-architecture=compute_75",
+                          "--std=c++11",
+                          "--include-path=/usr/local/cuda-10.1/include"};
+
+    result_nvrtc = nvrtcCompileProgram(program, 3, opts);
+    if (result_nvrtc != NVRTC_SUCCESS) {
+      size_t logSize;
+      nvrtcGetProgramLogSize(program, &logSize);
+      std::vector<char> log(logSize);
+      nvrtcGetProgramLog(program, log.data());
+      std::cout << "Compile log:" << std::endl << log.data() << std::endl;
+    }
+    if (!check_nvrtc_error(result_nvrtc)) {
+      assert(0);
+    }
+
+    // The lowered name is the name of the template instantiation in the generated PTX code.
+    char const *gemm_kernel_lowered_name;
+    nvrtcGetLoweredName(program, gemm_kernel_instantiation.c_str(), &gemm_kernel_lowered_name);
+    if (!check_nvrtc_error(result_nvrtc)) {
+      assert(0);
+    }
+
+    // Query the size of the genereated PTX so that we can allocate storage and retrieve it afterwards
+    size_t ptx_size;
+    result_nvrtc = nvrtcGetPTXSize(program, &ptx_size);
+    if (!check_nvrtc_error(result_nvrtc)) {
+      assert(0);
+    }
+
+    std::vector<char> ptx(ptx_size);
+    result_nvrtc = nvrtcGetPTX(program, ptx.data());
+    if (!check_nvrtc_error(result_nvrtc)) {
+      assert(0);
+    }
+
+    // we do not need the nvrtc program anymore
+    //nvrtcDestroyProgram(&program);
+
+    CUmodule module;
+    CUresult result_cuda;
+    result_cuda = cuModuleLoadDataEx(&module, ptx.data(), 0, 0, 0);
+    if (result_cuda != CUDA_SUCCESS) {
+      assert(0);
+    }
+
+    CUfunction kernel;
+    result_cuda = cuModuleGetFunction(&kernel, module, gemm_kernel_lowered_name);
+    if (result_cuda != CUDA_SUCCESS) {
+      assert(0);
+    }
+
+    void* d_a = (void*)tensor_A.device_data();
+    void* d_b = (void*)tensor_B.device_data();
+    void* d_c = (void*)tensor_C.device_data();
+    void* d_d = (void*)tensor_D_computed.device_data();
+    void* args[] = { &d_d, &d_a, &d_b, &d_c };
+
+    // CUfunction f, unsigned int  gridDimX, unsigned int  gridDimY, unsigned int  gridDimZ, unsigned int  blockDimX, unsigned int  blockDimY, unsigned int  blockDimZ, unsigned int  sharedMemBytes, CUstream hStream, void** kernelParams, void** extra
+    result_cuda = cuLaunchKernel(kernel, 1, 1, 1, 1, 1, 1, 0, 0 /*cudaStreamDefault*/, args, 0);
+    if (result_cuda != CUDA_SUCCESS) {
+      assert(0);
+    } else {
+}
+#endif
+
+    // verify no errors
+    cudaError_t result = cudaDeviceSynchronize();
+
+    if (result != cudaSuccess) {
+      std::cout << "CUDA ERROR: " << cudaGetErrorString(result);
+      return false;
+    }
+
+    tensor_D_computed.sync_host();
+
+    //
+    // Reference implementation
+    //
+
+    //tensor_D_reference.fill(tensor_C.host_view());
+
+    cutlass::reference::host::Gemm<ElementA, LayoutA, ElementB, LayoutB,
+                                   ElementC, LayoutC, ElementC, ElementC> reference_gemm;
+
+    reference_gemm(
+      {Shape::kM, Shape::kN, Shape::kK},
+      ElementC(1),
+      tensor_A.host_ref(),
+      tensor_B.host_ref(),
+      ElementC(0),
+      tensor_D_reference.host_ref()
+    );
+
+    //
+    // Verify equivalence
+    //
+
+    // compare
+    bool passed = cutlass::reference::host::TensorEquals(
+      tensor_D_computed.host_view(),
+      tensor_D_reference.host_view()
+    );
+
+    if(!passed) std::cout
+      << "A:\n" << tensor_A.host_view() << "\n\n"
+      << "B:\n" << tensor_B.host_view() << "\n\n"
+      << "C:\n" << tensor_C.host_view() << "\n\n"
+      << "Reference:\n" << tensor_D_reference.host_view() << "\n\n"
+      << "Computed:\n" << tensor_D_computed.host_view() << std::endl;
+    
+    std::cout << "passed " << passed << std::endl;
+    
+    return passed;
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace thread
+} // namespace nvrtc
+} // namespace test