
Commit bd5b91f

Author: Xianzhe Dong
Commit message: [op] optimize layernorm kernel for half2 type
Parent: 0408bc1

File tree: 2 files changed, +136 -0 lines changed


src/kernels/layernorm_kernels.cu

Lines changed: 112 additions & 0 deletions
@@ -3,6 +3,7 @@
 #include "dispatch.h"
 #include "reduce_kernel_utils.cuh"
+#include "layernorm_kernels.h"

 namespace llm::kernel {

@@ -173,6 +174,67 @@ __global__ void layer_norm_kernel(T* __restrict__ out,
   }
 }

+// equation: x -> (x - E[x]) / sqrt(Var[x] + eps) * w + b
+// The mean and standard deviation are calculated over the last dimension.
+// Each half2 element packs two half values, so a row of n half2 elements
+// covers 2 * n scalar values.
+template <>
+__global__ void layer_norm_kernel<half2>(half2* __restrict__ out,
+                                         const half2* __restrict__ input,
+                                         const half2* __restrict__ weight,
+                                         const half2* __restrict__ bias,
+                                         const float epsilon,
+                                         int n) {
+  const int tidx = threadIdx.x;
+  const int bidx = blockIdx.x;
+
+  __shared__ half s_mean;
+  __shared__ half s_variance;
+  half2 mean = make_half2(__float2half(0.0f), __float2half(0.0f));
+  half2 variance = make_half2(__float2half(0.0f), __float2half(0.0f));
+
+  // calculate the mean of the input: accumulate a half2 partial sum per
+  // thread, reduce across the block, then fold the two lanes together.
+  for (int i = tidx; i < n; i += blockDim.x) {
+    const int idx = bidx * n + i;
+    mean = __hadd2(mean, __ldg(&input[idx]));
+  }
+  mean = block_reduce_sum<half2>(mean);
+  if (tidx == 0) {
+    s_mean = __hdiv(__hadd(mean.x, mean.y), __float2half((float)n * 2));
+  }
+  __syncthreads();
+
+  // calculate the variance of the input in the same two-lane fashion.
+  for (int i = tidx; i < n; i += blockDim.x) {
+    const half2 x = __hsub2(input[bidx * n + i], make_half2(s_mean, s_mean));
+    variance = __hadd2(variance, __hmul2(x, x));
+  }
+  variance = block_reduce_sum<half2>(variance);
+  if (tidx == 0) {
+    s_variance = __hadd(variance.x, variance.y);
+    s_variance = __hdiv(s_variance, __float2half((float)n * 2));
+    s_variance = __hadd(s_variance, __float2half(epsilon));
+    s_variance = hrsqrt(s_variance);
+  }
+  __syncthreads();
+
+  // normalize: out = (x - mean) * rsqrt(var + eps) * weight (+ bias)
+  for (int i = tidx; i < n; i += blockDim.x) {
+    const int idx = bidx * n + i;
+    half2 local_out = __ldg(&input[idx]);
+    local_out = __hsub2(local_out, make_half2(s_mean, s_mean));
+    local_out = __hmul2(local_out, make_half2(s_variance, s_variance));
+    local_out = __hmul2(local_out, __ldg(&weight[i]));
+    if (bias != nullptr) {
+      local_out = __hadd2(local_out, __ldg(&bias[i]));
+    }
+    out[idx] = local_out;
+  }
+}
+
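
The block-wide sums above rely on block_reduce_sum<half2> from reduce_kernel_utils.cuh, which is not part of this diff. As a rough, hypothetical sketch of what such a reduction typically looks like (warp shuffles followed by one shared-memory step; the repository's actual helper may differ), assuming the result is only consumed by thread 0 as in the kernel above:

// Illustrative sketch only; not the implementation in reduce_kernel_utils.cuh.
#include <cuda_fp16.h>

__inline__ __device__ half2 warp_reduce_sum_half2(half2 val) {
  // butterfly reduction within a warp using register shuffles
  #pragma unroll
  for (int mask = 16; mask > 0; mask >>= 1) {
    val = __hadd2(val, __shfl_xor_sync(0xffffffff, val, mask, 32));
  }
  return val;
}

__inline__ __device__ half2 block_reduce_sum_half2(half2 val) {
  __shared__ half2 partial[32];         // one partial sum per warp
  const int lane = threadIdx.x & 31;
  const int warp = threadIdx.x >> 5;

  val = warp_reduce_sum_half2(val);     // reduce within each warp
  if (lane == 0) partial[warp] = val;   // stash per-warp results
  __syncthreads();

  // the first warp reduces the per-warp partial sums; only warp 0 holds
  // the final value, which suffices because the caller reads it on tidx == 0.
  const int num_warps = (blockDim.x + 31) >> 5;
  val = (lane < num_warps) ? partial[lane] : __float2half2_rn(0.0f);
  if (warp == 0) val = warp_reduce_sum_half2(val);
  return val;
}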
 void layer_norm(torch::Tensor& out,
                 torch::Tensor input,
                 torch::Tensor weight,
@@ -197,4 +259,54 @@ void layer_norm(torch::Tensor& out,
   });
 }

+template <typename T>
+void invoke_layernorm_kernel(T* out,
+                             const T* input,
+                             const T* weight,
+                             const T* bias,
+                             const float epsilon,
+                             int m,
+                             int n) {
+  // one block per row (m rows), n threads per block (n elements per row)
+  layer_norm_kernel<T><<<m, n>>>(out, input, weight, bias, epsilon, n);
+}
+
+template <>
+void invoke_layernorm_kernel<half2>(half2* out,
+                                    const half2* input,
+                                    const half2* weight,
+                                    const half2* bias,
+                                    const float epsilon,
+                                    int m,
+                                    int n) {
+  layer_norm_kernel<half2><<<m, n>>>(out, input, weight, bias, epsilon, n);
+}
+
+template <>
+void invoke_layernorm_kernel<float>(float* out,
+                                    const float* input,
+                                    const float* weight,
+                                    const float* bias,
+                                    const float epsilon,
+                                    int m,
+                                    int n) {
+  layer_norm_kernel<float><<<m, n>>>(out, input, weight, bias, epsilon, n);
+}
+
 } // namespace llm::kernel
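
For context on how the new launcher might be called, here is a hypothetical usage sketch; the wrapper name run_layernorm_half2 and the assumption of contiguous rows of half values are not from this commit. Each row of hidden_size half values is viewed as hidden_size / 2 half2 elements, so the n passed to invoke_layernorm_kernel is half the scalar width. Note that the launcher uses n threads per block (<<<m, n>>>), so this assumes n fits within the 1024-thread block limit.

// Hypothetical caller; not part of the commit.
#include <cuda_fp16.h>
#include "layernorm_kernels.h"

void run_layernorm_half2(half* out, const half* input, const half* weight,
                         const half* bias, float epsilon,
                         int rows, int hidden_size) {
  // hidden_size must be even so each row packs cleanly into half2 pairs
  const int n = hidden_size / 2;
  llm::kernel::invoke_layernorm_kernel<half2>(
      reinterpret_cast<half2*>(out),
      reinterpret_cast<const half2*>(input),
      reinterpret_cast<const half2*>(weight),
      reinterpret_cast<const half2*>(bias),
      epsilon,
      /*m=*/rows,
      /*n=*/n);
}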

src/kernels/layernorm_kernels.h

Lines changed: 24 additions & 0 deletions
@@ -20,4 +20,28 @@ void layer_norm(torch::Tensor& out,
                 torch::Tensor bias,
                 float epsilon);

+template <typename T>
+void invoke_layernorm_kernel(T* out,
+                             const T* input,
+                             const T* weight,
+                             const T* bias,
+                             const float epsilon,
+                             int m,
+                             int n);
+
 } // namespace llm::kernel
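
For reference, the normalization that both specializations implement, per the equation in the kernel comment, written as a plain C++ function over one row of float values. This is an illustrative reference (for example, for sanity-checking kernel output), not code from the commit.

// Illustrative CPU reference for one row; not part of the commit.
#include <cmath>
#include <vector>

std::vector<float> layer_norm_reference(const std::vector<float>& x,
                                        const std::vector<float>& w,
                                        const std::vector<float>& b,
                                        float eps) {
  const size_t n = x.size();

  float mean = 0.0f;
  for (float v : x) mean += v;
  mean /= static_cast<float>(n);

  float var = 0.0f;
  for (float v : x) var += (v - mean) * (v - mean);
  var /= static_cast<float>(n);

  // x -> (x - E[x]) / sqrt(Var[x] + eps) * w + b
  const float inv_std = 1.0f / std::sqrt(var + eps);
  std::vector<float> out(n);
  for (size_t i = 0; i < n; ++i) {
    out[i] = (x[i] - mean) * inv_std * w[i] + b[i];
  }
  return out;
}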
