Commit 8b2c265

Aya-ZIbra authored and meta-codesync[bot] committed
Write back LSE (#5209)
Summary:
Pull Request resolved: #5209
X-link: https://github.com/facebookresearch/FBGEMM/pull/2204

* **Python interface**: Modifies `fmha_gen_fwd` to return the LSE tensor instead of creating a dummy one
* **CUDA implementation**: Adds LSE tensor allocation and computation logic
* **Epilogue**: Adds LSE computation and storage in the epilogue
* **Mainloop**: Updates `correction_epilogue` to compute and write LSE values

Reviewed By: jsisometa

Differential Revision: D86949420

fbshipit-source-id: bf6fd9fa616d91c3b758b8a47a933690a88a9b80
1 parent 2aa8cd0 commit 8b2c265
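
For context on what is now written back: the LSE is the log-sum-exp of the scaled attention logits, i.e. the log of the softmax denominator, one value per query row and head. It is what a backward pass or a split-KV combine step needs in order to renormalize without rematerializing the score matrix. A minimal PyTorch reference for a single decode step (a sketch only; shapes and the softmax scale are illustrative assumptions, not the FBGEMM op's contract):

```python
import torch

def reference_decode_attention(q, k, v, scale=None):
    """One decode step: q is [B, H, D], k and v are [B, S, H, D].

    Returns the attention output together with the LSE that a fused kernel
    such as fmha_gen_fwd can now report (reference sketch, not FBGEMM code).
    """
    B, H, D = q.shape
    scale = D ** -0.5 if scale is None else scale
    # Scores of the single new query against the whole KV cache: [B, H, S]
    s = torch.einsum("bhd,bshd->bhs", q.float(), k.float()) * scale
    # LSE = log of the softmax denominator, one scalar per (batch, head).
    lse = torch.logsumexp(s, dim=-1)                   # [B, H]
    p = torch.exp(s - lse.unsqueeze(-1))               # softmax probabilities
    out = torch.einsum("bhs,bshd->bhd", p, v.float())  # [B, H, D]
    return out.to(q.dtype), lse
```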

6 files changed, +82 −47 lines changed

fbgemm_gpu/experimental/gen_ai/gen_ai/attention/cutlass_blackwell_fmha/cutlass_blackwell_fmha_interface.py

Lines changed: 1 addition & 24 deletions

```diff
@@ -180,26 +180,6 @@ def _prepare_decode_inputs(
     return q, k, v, batch_size, needs_reshape_output, original_shape
 
 
-def _create_decode_lse(
-    out: torch.Tensor,
-    batch_size: int,
-    needs_reshape_output: bool,
-    q_shape: tuple[int, ...],
-) -> torch.Tensor:
-    """
-    Create dummy LSE tensor for decode output compatibility.
-    Gen kernel doesn't return LSE, so we create a zero tensor.
-    """
-    if needs_reshape_output:
-        # For varlen output format
-        lse_shape = [batch_size, q_shape[-1]]  # [B, H]
-    else:
-        # For batch output format
-        lse_shape = [batch_size, q_shape[-2], q_shape[1]]  # [B, H, 1]
-
-    return torch.zeros(*lse_shape, dtype=torch.float32, device=out.device)
-
-
 def cutlass_blackwell_fmha_decode_forward(
     q: torch.Tensor,
     k: torch.Tensor,
@@ -233,7 +213,7 @@ def cutlass_blackwell_fmha_decode_forward(
         q, k, v
     )
     # Call the gen kernel (optimized for decode)
-    out = torch.ops.fbgemm.fmha_gen_fwd(
+    out, lse = torch.ops.fbgemm.fmha_gen_fwd(
         q,
         k,
         v,
@@ -248,9 +228,6 @@
     if needs_reshape_output:
         out = out.view(*original_shape)
 
-    # Create dummy LSE for compatibility
-    lse = _create_decode_lse(out, batch_size, needs_reshape_output, original_shape)
-
     return out, lse
 
 
```
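
With the gen kernel now returning a real LSE instead of the zero-filled placeholder that `_create_decode_lse` used to produce, callers can merge partial attention results exactly, for example when the KV cache is processed in separate chunks. A sketch of that standard combine rule (independent of the FBGEMM op; names and shapes are illustrative):

```python
import torch

def merge_partial_attention(out1, lse1, out2, lse2):
    """Combine attention computed over two disjoint KV partitions.

    out1, out2: [B, H, D] partial outputs; lse1, lse2: [B, H] their LSEs.
    With a dummy all-zero LSE this rescaling would be wrong; a real LSE,
    as returned above, makes the combine exact.
    """
    lse = torch.logaddexp(lse1, lse2)          # LSE over the union of keys
    w1 = torch.exp(lse1 - lse).unsqueeze(-1)   # renormalization weights
    w2 = torch.exp(lse2 - lse).unsqueeze(-1)
    return w1 * out1 + w2 * out2, lse
```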

fbgemm_gpu/experimental/gen_ai/src/attention/cuda/cutlass_blackwell_fmha/blackwell_gen_impl.cu

Lines changed: 37 additions & 15 deletions

```diff
@@ -99,6 +99,7 @@ struct GenRunner {
   using StrideNewV = StrideNewK;
   using StrideCacheV = StrideCacheK;
   using StrideO = StrideQ;
+  using StrideLSE = Stride<int, int, _1>;
 
   using Mainloop =
       cutlass::fmha::collective::Sm100FmhaGenMainloopWarpspecialized<
@@ -117,7 +118,9 @@ struct GenRunner {
   using Epilogue =
       cutlass::fmha::collective::Sm100FmhaGenEpilogueWarpspecialized<
           ElementOut,
-          StrideO>;
+          StrideO,
+          ElementAcc,
+          StrideLSE>;
 
   using TileScheduler = std::conditional_t<
       kKernelType == KernelType::UMMA_P,
@@ -138,12 +141,14 @@ struct GenRunner {
   StrideCacheK stride_cache_k;
   StrideCacheV stride_cache_v;
   StrideO stride_o;
+  StrideLSE stride_lse;
 
   at::Tensor block_o;
+  at::Tensor block_lse;
   at::Tensor q, k, v, seqlen_kv;
   std::optional<at::Tensor> batch_idx;
 
-  at::Tensor fmha_fwd(
+  std::tuple<at::Tensor, at::Tensor> fmha_fwd(
       const at::Tensor& q_input,
       const at::Tensor& k_input,
       const at::Tensor& v_input,
@@ -177,7 +182,7 @@ struct GenRunner {
 
     run(options, hw_info);
 
-    return block_o;
+    return std::make_tuple(block_o, block_lse);
   }
 
   ProblemShape _initialize(const InputShape& options) {
@@ -210,13 +215,20 @@ struct GenRunner {
     stride_new_v = stride_new_k;
     stride_cache_v = stride_cache_k;
     stride_o = stride_q;
+    stride_lse = make_stride(options.h_k * h_r, h_r, cute::_1{});
 
     block_o = at::empty(
         q.sizes(),
         at::TensorOptions()
            .dtype(to_torch_type<ElementOut>())
            .device(at::Device(at::kCUDA, at::cuda::current_device())));
 
+    block_lse = at::empty(
+        {options.b, options.h, _1{}},
+        at::TensorOptions()
+           .dtype(at::kFloat)
+           .device(at::Device(at::kCUDA, at::cuda::current_device())));
+
     return result;
   }
 
@@ -241,6 +253,8 @@ struct GenRunner {
         stride_cache_v,
         static_cast<ElementOut*>(block_o.data_ptr()),
         stride_o,
+        static_cast<ElementAcc*>(block_lse.data_ptr()),
+        stride_lse,
         hw_info};
 
     Operation op;
@@ -306,7 +320,7 @@
   }()
 
 template <typename Element, KernelType KType, int HeadDim>
-at::Tensor run_gen_runner_fwd(
+std::tuple<at::Tensor, at::Tensor> run_gen_runner_fwd(
     const at::Tensor& q,
     const at::Tensor& k,
     const at::Tensor& v,
@@ -321,7 +335,7 @@ at::Tensor run_gen_runner_fwd(
   }
 }
 
-at::Tensor dispatch_fmha_gen_fwd(
+std::tuple<at::Tensor, at::Tensor> dispatch_fmha_gen_fwd(
    const at::Tensor& q,
    const at::Tensor& k,
    const at::Tensor& v,
@@ -343,30 +357,38 @@ at::Tensor dispatch_fmha_gen_fwd(
   });
 }
 
-at::Tensor dispatch_fmha_gen_fwd_meta(
+std::tuple<at::Tensor, at::Tensor> dispatch_fmha_gen_fwd_meta(
     const at::Tensor& q,
     const at::Tensor& k,
     const at::Tensor& v,
     const at::Tensor& seqlen_kv,
     const std::optional<at::Tensor>& batch_idx,
     int64_t kernel_type
 ) {
-  return at::empty_like(q);
+  // Return tuple matching the operator signature: (output, lse)
+  at::Tensor output = at::empty_like(q);
+  // LSE should have shape [B, num_splits, H]
+  int b = q.size(0);
+  int h = q.size(2);
+  // For meta, just create a dummy LSE with single split
+  at::Tensor lse = at::empty(
+      {b, 1, h},
+      at::TensorOptions().dtype(at::kFloat).device(at::kMeta));
+  return std::make_tuple(output, lse);
 }
 
 // -------------------------------------------------------------------------------------------------
 // Op registration
 // -------------------------------------------------------------------------------------------------
 TORCH_LIBRARY_FRAGMENT(fbgemm, m) {
   m.def("fmha_gen_fwd("
-      " Tensor query, "
-      " Tensor key, "
-      " Tensor value, "
-      " Tensor seqlen_kv, "
-      " Tensor? batch_idx = None,"
-      " int kernel_type = 0"
-      ") -> Tensor"
-  );
+      " Tensor query, "
+      " Tensor key, "
+      " Tensor value, "
+      " Tensor seqlen_kv, "
+      " Tensor? batch_idx = None,"
+      " int kernel_type = 0"
+      ") -> (Tensor, Tensor)");
 }
 
 TORCH_LIBRARY_IMPL(fbgemm, CUDA, m) {
```
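
A note on the layout chosen above: `stride_lse = make_stride(options.h_k * h_r, h_r, _1{})` addresses the LSE buffer as (B, H_K, H_R), i.e. batch, then KV head, then query head within the GQA group, which lines up with the contiguous [B, H] storage of `block_lse` when H = H_K * H_R. A small index-arithmetic check of that assumption (the B, H_K, H_R values here are made up for illustration):

```python
import torch

B, H_K, H_R = 2, 4, 8          # batch, KV heads, query heads per KV head (assumed)
H = H_K * H_R

stride = (H_K * H_R, H_R, 1)   # mirrors make_stride(options.h_k * h_r, h_r, _1{})
flat = torch.arange(B * H)     # stand-in for the contiguous LSE buffer

def linear_index(b, h_k, h_r):
    # Same arithmetic as the write in correction_epilogue.
    return b * stride[0] + h_k * stride[1] + h_r * stride[2]

# Viewing the same buffer as a row-major [B, H_K, H_R] tensor agrees with it.
viewed = flat.view(B, H_K, H_R)
assert viewed[1, 2, 3] == flat[linear_index(1, 2, 3)]
```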

fbgemm_gpu/experimental/gen_ai/src/attention/cuda/cutlass_blackwell_fmha/blackwell_gen_interface.hpp

Lines changed: 1 addition & 1 deletion

```diff
@@ -41,7 +41,7 @@ at::ScalarType to_torch_type() {
 }
 
 // Main dispatch function for the generation FMHA
-at::Tensor dispatch_fmha_gen_fwd(
+std::tuple<at::Tensor, at::Tensor> dispatch_fmha_gen_fwd(
     const at::Tensor& q,
     const at::Tensor& k,
     const at::Tensor& v,
```

fbgemm_gpu/experimental/gen_ai/src/attention/cuda/cutlass_blackwell_fmha/collective/sm100_fmha_gen_epilogue_warpspecialized.hpp

Lines changed: 8 additions & 2 deletions

```diff
@@ -38,7 +38,9 @@ namespace cutlass::fmha::collective {
 
 template<
   class Element_,
-  class StrideO_
+  class StrideO_,
+  class ElementAcc_,
+  class StrideLSE_
 >
 struct Sm100FmhaGenEpilogueWarpspecialized {
 
@@ -47,9 +49,11 @@ struct Sm100FmhaGenEpilogueWarpspecialized {
   using SmemLayoutO = Layout<Shape<_1, _1, _1>>;
   using SmemLayoutO_ = SmemLayoutO;
   using Element = Element_;
+  using ElementAcc = ElementAcc_;
   using StrideOOrig = StrideO_;
   using StrideO = decltype(replace<0>(StrideOOrig{}, 0));
-
+  using StrideLSE = StrideLSE_;
+
   struct TensorStorage {
 
     using SmemLayoutO = SmemLayoutO_;
@@ -60,6 +64,8 @@ struct Sm100FmhaGenEpilogueWarpspecialized {
   struct Arguments {
     Element* ptr_o;
     StrideO dO;
+    ElementAcc* ptr_LSE;
+    StrideLSE dLSE;
   };
 
   using Params = Arguments;
```

fbgemm_gpu/experimental/gen_ai/src/attention/cuda/cutlass_blackwell_fmha/collective/sm100_fmha_gen_mainloop_warpspecialized.hpp

Lines changed: 30 additions & 5 deletions

```diff
@@ -870,12 +870,12 @@ struct Sm100FmhaGenMainloopWarpspecialized {
     ++pipeline_s_consumer_state;
   }
 
-  template<class Vector, class GTensor, class CTensor, class Shape, class Epilogue>
+  template<class Vector, class GTensor, class CTensor, class Shape, class Epilogue, class BlkCoord, class ProblemShape>
   CUTLASS_DEVICE auto
   correction_epilogue(
       float scale_softmax_log2, float scale_out, Vector const& v0, Vector const& v1,
       GTensor& gO, CTensor const& cO, Shape const& g_shape,
-      Epilogue const& epilogue) {
+      Epilogue const& epilogue, BlkCoord const& blk_coord, ProblemShape const& problem_shape, int const row_idx) {
 
     using ElementOut = typename GTensor::value_type;
 
@@ -887,7 +887,6 @@
     const int kCorrectionTileSize = 32 / sizeof(ElementOut);
     // TODO: load all values
 
-
     // Choose TMEM OP based on
     // - TileM shape
     // - kCorrectionTileSize
@@ -933,6 +932,31 @@
     float scale0 = scale_out * adj0 / row_sum;
     float scale1 = scale_out * adj1 / row_sum;
 
+    // Compute and store LSE if requested
+    if (epilogue.params.ptr_LSE != nullptr) {
+      // LSE = log(row_sum) + scale_softmax * row_max
+      // scale_softmax_log2 is already in log2 space, convert to natural log
+      float lse = cutlass::fast_log(row_sum) + (scale_softmax_log2 / std::log2(std::exp(1.0f))) * row_max;
+      int h_r = row_idx;
+      int h_k = get<2, 0>(blk_coord);
+      int b = get<2, 1>(blk_coord);
+
+      // After problem_shape transformation in kernel:
+      // problem_shape = (H_R, Sk, D, ((1, H_K), B))
+      // So: get<0> = H_R, get<3,0,1> = H_K
+      int H_R = get<0>(problem_shape);
+
+      // Check bounds
+      if (thread_idx < H_R) {
+        // LSE tensor shape: [B, H_K, H_R]
+        // Use stride from epilogue.params.dLSE instead of hardcoding
+        int linear_idx = b * get<0>(epilogue.params.dLSE) +
+            h_k * get<1>(epilogue.params.dLSE) +
+            h_r * get<2>(epilogue.params.dLSE);
+        epilogue.params.ptr_LSE[linear_idx] = lse;
+      }
+    }
+
     float2 scale0_f32x2 = make_float2(scale0, scale0);
     float2 scale1_f32x2 = make_float2(scale1, scale1);
 
@@ -1223,8 +1247,9 @@
     auto g_shape = select<0,2>(problem_shape);
     auto mO = make_tensor(make_gmem_ptr(epilogue.params.ptr_o), append<3>(select<0,1>(TileShapePV{}), get<3>(problem_shape)), epilogue.params.dO);
     auto gO = mO(_, _, get<2>(blk_coord));
-
-    correction_epilogue(params.scale_softmax_log2, params.scale_output, tTMEM_LOADVrS0, tTMEM_LOADVrS1, gO, cO, g_shape, epilogue);
+    int row_idx = get<0>(tTMEM_LOADVcS(_0{}));
+    correction_epilogue(params.scale_softmax_log2, params.scale_output, tTMEM_LOADVrS0, tTMEM_LOADVrS1,
+        gO, cO, g_shape, epilogue, blk_coord, problem_shape, row_idx);
 
     cutlass::arch::fence_view_async_tmem_load();
 
```
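The LSE expression added to `correction_epilogue` uses the usual streaming-softmax identity: if `row_max` is the running maximum of the raw scores and `row_sum` accumulates `exp2(scale_softmax_log2 * (s - row_max))`, then `log(sum exp(scale * s)) = log(row_sum) + scale * row_max` with `scale = scale_softmax_log2 / log2(e)`. A standalone PyTorch check of that identity (the tensors here are random stand-ins, not kernel state):

```python
import math
import torch

scores = torch.randn(128)                       # raw attention logits for one row
scale = 1.0 / math.sqrt(128)                    # softmax scale (assumed)
scale_softmax_log2 = scale * math.log2(math.e)  # scale folded into log2 space

row_max = scores.max()
# What the mainloop accumulates: base-2 exponentials of the shifted scores.
row_sum = torch.exp2(scale_softmax_log2 * (scores - row_max)).sum()

# LSE as written back: log(row_sum) + (scale_softmax_log2 / log2(e)) * row_max
lse_kernel = torch.log(row_sum) + (scale_softmax_log2 / math.log2(math.e)) * row_max
lse_reference = torch.logsumexp(scale * scores, dim=0)
assert torch.allclose(lse_kernel, lse_reference, atol=1e-5)
```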

fbgemm_gpu/experimental/gen_ai/src/attention/cuda/cutlass_blackwell_fmha/kernel/sm100_fmha_gen_kernel_warpspecialized.hpp

Lines changed: 5 additions & 0 deletions

```diff
@@ -168,6 +168,9 @@ struct Sm100FmhaGenKernelWarpspecialized {
     ElementOut* ptr_o; // 1 x D x (H x B)
     StrideOOrig dO;
 
+    ElementAcc* ptr_LSE; // (B, H_K, H_R)
+    cute::Stride<int, int, cute::_1> dLSE; // stride: (H_K*H_R, H_R, 1)
+
     cutlass::KernelHardwareInfo hw_info;
 
     ElementAcc scale_softmax = 0.0f;
@@ -227,6 +230,8 @@ struct Sm100FmhaGenKernelWarpspecialized {
 
     typename CollectiveEpilogue::Arguments epilogue_args {
       args.ptr_o, dO,
+      args.ptr_LSE,
+      args.dLSE,
     };
 
     return Params{
```
