models/llama/llama/generation.py (12 additions, 8 deletions)
@@ -193,17 +193,21 @@ def generate(
         for cur_pos in range(min_prompt_len, total_len):
             logits = self.model.forward(tokens[:, prev_pos:cur_pos], prev_pos)
             if temperature > 0:
-                if self.use_triton:
-                    probs = triton_softmax(logits[:, -1])
-                else:
-                    probs = self.Math.softmax(logits[:, -1] / temperature, dim=-1)
+                # if self.use_triton:
+                #     probs = triton_softmax(logits[:, -1])
+                # else:
+                #     probs = self.Math.softmax(logits[:, -1] / temperature, dim=-1)
+                probs = MathOps.softmax(logits[:, -1] / temperature, dim=-1)
+
                 next_token = sample_top_p(probs, top_p)
             else:
-                if self.use_triton:
-                    next_token = self.triton.language.argmax(logits[:, -1], axis=-1)
-                else:
-                    next_token = self.Math.argmax(logits[:, -1], dim=-1)
+                # if self.use_triton:
+                #     next_token = self.triton.language.argmax(logits[:, -1], axis=-1)
+                # else:
+                #     next_token = self.Math.argmax(logits[:, -1], dim=-1)
+                next_token = MathOps.argmax(logits[:, -1], dim=-1)

             next_token = next_token.reshape(-1)
             # only replace token if prompt has already been generated
models/llama/llama/math_ops.py (4 additions, 2 deletions)
@@ -8,6 +8,7 @@
 from kernels.cross_entropy import cross_entropy
 from kernels.matmul import matmul
 from kernels.flash_attention import attention
+from kernels.fused_softmax import triton_softmax
 from benchmarking import Profiler
 import time

@@ -70,14 +71,15 @@ def attention(self, xq, keys, values, head_dim, mask):

     @Profiler.profiling_decorator("softmax")
     def softmax(self, x, dim):
-        if self.use_triton:
-            return F.softmax(x, dim=-1)
+        if self.use_triton and len(x) == 2:

Collaborator: It looks like you're trying to check the number of dimensions here, right? len(x) returns the size of the first dimension (x.shape[0]), not the number of dimensions. I think you want x.dim() or x.ndim.

Author: done
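
For reference, a minimal sketch of the distinction being discussed; the tensor shape below is made up purely for illustration:

```python
import torch

# Illustrative shape only: (batch, vocab) logits.
x = torch.randn(4, 32000)

assert len(x) == 4             # size of the first dimension, i.e. x.shape[0]
assert x.numel() == 4 * 32000  # total number of elements
assert x.dim() == 2            # number of dimensions (same as x.ndim)

# So the guard on the fused-kernel path should check dimensionality:
if x.dim() == 2:
    pass  # 2-D input: safe to hand to the 2-D fused softmax kernel
```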

+            return triton_softmax(x, dim=-1)

Collaborator: Why are we passing dim=-1 to these calls when we receive dim as an argument? Let's pass it through properly instead of overriding it. (Also, does the fused Triton kernel actually handle dim != -1 correctly?)

Author: Currently it does not handle dim != -1. Looking into how llama.cpp does this; let me know if you have any pointers.
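
One possible shape for that pass-through, sketched here as a free function and assuming the fused kernel only handles a 2-D input reduced over its last dimension (that fallback condition is an assumption for the sketch, not something the PR implements):

```python
import torch
import torch.nn.functional as F

from kernels.fused_softmax import triton_softmax


def softmax(x: torch.Tensor, dim: int, use_triton: bool) -> torch.Tensor:
    # Route to the fused Triton kernel only when it can honor `dim`:
    # a 2-D input reduced over the last dimension.
    reduces_last_dim = dim in (-1, x.dim() - 1)
    if use_triton and x.dim() == 2 and reduces_last_dim:
        return triton_softmax(x, dim=-1)
    # Every other shape/dim combination falls back to PyTorch,
    # with the caller's dim passed through unchanged.
    return F.softmax(x, dim=dim)
```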

         else:
             return F.softmax(x, dim=-1)

     @Profiler.profiling_decorator("argmax")
     def argmax(self, x, dim):
         if self.use_triton:
+            # TODO: change

Collaborator: Instead of adding a TODO to the code here, would you mind creating an issue to track it?

Author: removed

             return torch.argmax(x, dim=-1)
         else:
             return torch.argmax(x, dim=-1)