 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import os
 import numpy
 import paddle
 import paddle.nn.functional as F
@@ -26,8 +27,13 @@ def swiglu(x, y=None):
     return F.silu(x) * y


+USE_DS_GEMM = os.getenv("USE_DS_GEMM", "False").lower() == "true"
+
 try:
-    from paddle.incubate.fp8 import deep_gemm
+    if USE_DS_GEMM:
+        import deep_gemm
+    else:
+        from paddle.incubate.fp8 import deep_gemm
 except:
     pass

@@ -43,6 +49,13 @@ def swiglu(x, y=None):
 def kitchen_fp8_gemm(
     x_fp8, x_scale, w_fp8, w_scale, is_a_1d_scaled, is_b_1d_scaled, out=None, rtn_dtype=paddle.bfloat16
 ):
+    if USE_DS_GEMM:
+        if out is None:
+            out = paddle.zeros([x_fp8.shape[0], w_fp8.shape[0]], rtn_dtype)
+        if numpy.prod(x_fp8.shape) != 0 and numpy.prod(w_fp8.shape) != 0:
+            deep_gemm.wgrad_gemm_fp8_fp8_fp32_nt((x_fp8, x_scale), (w_fp8, w_scale), out, num_sms=112)
+        return out
+
     if out is not None:
         accumulate = True
         out_dtype = out.dtype
@@ -1118,13 +1131,14 @@ def backward_dx(self, out_grad):
         self.out_grad = out_grad

         # clear status to save memory
-        self.m_indices = None
         self.unzipped_probs = None
         self.input = None

         # dx
         dx = self.bwd_gate_up_input(do1, expert_w1, dx=out_grad[0] if isinstance(out_grad, tuple) else out_grad)

+        self.m_indices = None
+
         return dx, probs_grad

     @paddle.no_grad()
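A note on the USE_DS_GEMM toggle introduced above: it is evaluated once, at module import time, so the environment variable must be set before the module is first imported. A minimal sketch of the intended usage, where my_fp8_module is a hypothetical stand-in for the module this diff touches:

    import os

    # Any casing of "true" selects the standalone deep_gemm package; anything
    # else (or leaving the variable unset) keeps paddle.incubate.fp8.deep_gemm.
    os.environ["USE_DS_GEMM"] = "true"

    import my_fp8_module  # must happen after the environment variable is set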
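The deep_gemm branch in kitchen_fp8_gemm zero-initializes out and guards the kernel launch so that empty operands still return a well-formed zero buffer; together with the pre-existing path setting accumulate = True whenever a caller passes out, this suggests wgrad_gemm_fp8_fp8_fp32_nt accumulates into out rather than overwriting it. A plain-numpy sketch of that contract, using dequantized stand-ins for the FP8 operands and their scales (an assumption for illustration, not the kernel's real interface):

    import numpy

    # Dequantized stand-ins for (x_fp8, x_scale) and (w_fp8, w_scale).
    x = numpy.random.rand(4, 8).astype("float32")
    w = numpy.random.rand(6, 8).astype("float32")

    # Accumulator shaped [x.shape[0], w.shape[0]], zeroed like the branch above.
    out = numpy.zeros([x.shape[0], w.shape[0]], dtype="float32")

    if numpy.prod(x.shape) != 0 and numpy.prod(w.shape) != 0:
        # "nt" layout: the second operand is consumed transposed.
        out += x @ w.T  # each call accumulates into the same buffer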