diff --git a/media/docs/cpp/cute/03_tensor.md b/media/docs/cpp/cute/03_tensor.md
index aead2907..0a2883ba 100644
--- a/media/docs/cpp/cute/03_tensor.md
+++ b/media/docs/cpp/cute/03_tensor.md
@@ -86,7 +86,7 @@ and destroying the `Tensor` doesn't deallocate the array of elements.
 
 This has implications for developers of generic `Tensor` algorithms.
 For example, input `Tensor` parameters of a function
-should be passed by referece or const reference,
+should be passed by reference or const reference,
 because passing a `Tensor` by value
 may or may not make a deep copy of the `Tensor`'s elements.
 
diff --git a/test/unit/layout/tensor.cu b/test/unit/layout/tensor.cu
index 35a04183..43123945 100644
--- a/test/unit/layout/tensor.cu
+++ b/test/unit/layout/tensor.cu
@@ -72,8 +72,8 @@ namespace layout {
 
     // test capacity
     auto capacity = tensor_nhwc.capacity(cutlass::Tensor4DCoord(n_size, h_size, w_size, c_size));
-    decltype(capacity) referece_capacity = ldh * n_size;
-    EXPECT_EQ(capacity, referece_capacity);
+    decltype(capacity) reference_capacity = ldh * n_size;
+    EXPECT_EQ(capacity, reference_capacity);
 
     // test packed
     auto packed_tensor_layout = tensor_nhwc.packed(cutlass::Tensor4DCoord(n_size, h_size, w_size, c_size));
@@ -115,8 +115,8 @@ namespace layout {
 
     // test capacity
     auto capacity = tensor_nchw.capacity(cutlass::Tensor4DCoord(n_size, h_size, w_size, c_size));
-    decltype(capacity) referece_capacity = ldc * n_size;
-    EXPECT_EQ(capacity, referece_capacity);
+    decltype(capacity) reference_capacity = ldc * n_size;
+    EXPECT_EQ(capacity, reference_capacity);
 
     // test packed
     auto packed_tensor_layout = tensor_nchw.packed(cutlass::Tensor4DCoord(n_size, h_size, w_size, c_size));
diff --git a/test/unit/layout/tensor_nhwc.cu b/test/unit/layout/tensor_nhwc.cu
index ef5146fb..dd8b9359 100644
--- a/test/unit/layout/tensor_nhwc.cu
+++ b/test/unit/layout/tensor_nhwc.cu
@@ -34,19 +34,19 @@
 
 #include "../common/cutlass_unit_test.h"
 #include "cutlass/layout/tensor.h"
-#include "cutlass/util/device_memory.h"    
+#include "cutlass/util/device_memory.h"
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
 namespace test {
 namespace layout {
-  
+
   void test_nhwc_layout(int n_size, int h_size, int w_size, int c_size) {
     int ldc = c_size + 1;
     int ldw = ldc * (w_size + 2);
     int ldh = ldw * (h_size + 3);
 
     typedef cutlass::layout::TensorNHWC Tensor;
-        
+
     Tensor::Stride tensor_stride({ ldc, ldw, ldh });
     Tensor tensor_nhw_packed_c(tensor_stride);
 
@@ -74,8 +74,8 @@ namespace layout {
     // test capacity
     auto capacity = tensor_nhw_packed_c.capacity(
             cutlass::Tensor4DCoord(n_size, h_size, w_size, c_size));
-    decltype(capacity) referece_capacity = ldh * n_size;
-    EXPECT_EQ(capacity, referece_capacity);
+    decltype(capacity) reference_capacity = ldh * n_size;
+    EXPECT_EQ(capacity, reference_capacity);
 
   }
 
@@ -86,7 +86,7 @@ namespace layout {
     int ldh = ldw * h_size;
 
     typedef cutlass::layout::TensorNHWC Tensor;
-        
+
     Tensor::Stride tensor_stride({ ldc, ldw, ldh });
     Tensor tensor_nhw_packed_c(tensor_stride);
 
@@ -130,7 +130,7 @@ namespace layout {
     dim3 grid(1,1);
     dim3 block(c_size, 1, 1);
 
-    test::layout::test_nhwc_inverse<<< grid, block >>>(output.get(), 
+    test::layout::test_nhwc_inverse<<< grid, block >>>(output.get(),
             n_size, h_size, w_size, c_size);
 
     cudaError_t result = cudaDeviceSynchronize();