
Commit a49683c

During prompt processing, logits are only needed for the last batch, so the matrix multiplication that computes the logits can be skipped, along with the attention and FFN of the last layer.
1 parent 3c13567 commit a49683c
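
To make the change easier to follow, here is a minimal, self-contained sketch of the batching decision alone: the prompt is consumed in fixed-size batches, and only the batch containing the final prompt token asks forward() for logits. The PrefillSketch class and its stub forward() are hypothetical stand-ins for illustration; only the computeLogits expression mirrors the committed code.

import java.util.List;

public class PrefillSketch {
    // Stand-in for Llama3.java's forward(model, state, tokens, position, computeLogits);
    // here it only reports whether the expensive logits path would run.
    static void forward(int[] tokens, int position, boolean computeLogits) {
        System.out.printf("position=%d, batch=%d tokens, computeLogits=%b%n",
                position, tokens.length, computeLogits);
    }

    public static void main(String[] args) {
        List<Integer> promptTokens = List.of(10, 11, 12, 13, 14, 15, 16); // a 7-token prompt
        int batchsize = 3; // plays the role of state.batchsize in the real code
        int promptIndex = 0;
        int position = 0;
        while (promptIndex < promptTokens.size()) {
            int nTokens = Math.min(promptTokens.size() - promptIndex, batchsize);
            int[] tokens = new int[nTokens];
            for (int i = 0; i < nTokens; i++) {
                tokens[i] = promptTokens.get(promptIndex + i);
            }
            // Same test as the commit: logits are needed only when this batch
            // contains the last prompt token.
            boolean computeLogits = promptIndex + nTokens >= promptTokens.size();
            forward(tokens, position, computeLogits);
            position += nTokens;
            promptIndex += nTokens;
        }
        // Prints computeLogits=false for the batches at positions 0 and 3,
        // and computeLogits=true for the final batch at position 6.
    }
}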

File tree

1 file changed: +12 -4 lines changed


Llama3.java

Lines changed: 12 additions & 4 deletions
@@ -956,7 +956,7 @@ static void rmsnorm(FloatTensor out, FloatTensor x, FloatBuffer weight, int size
         out.mapWithIndexInPlace(0, size, (value, index) -> weight.get(index) * (finalss * x.getFloat(index)));
     }
 
-    static FloatTensor forward(Llama model, State state, int[] tokens, int position) {
+    static FloatTensor forward(Llama model, State state, int[] tokens, int position, boolean computeLogits) {
         // a few convenience variables
         Configuration config = model.configuration();
         Weights weights = model.weights();
@@ -1010,6 +1010,12 @@ static FloatTensor forward(Llama model, State state, int[] tokens, int position)
                 state.v[t].copyTo(0, state.valueCache[curLayer], (position + t) * kvDim, kvDim);
             });
 
+            // If the logits are not required, the attention and FFN of the last layer can be skipped entirely.
+            if (!computeLogits && curLayer == config.numberOfLayers - 1) {
+                state.idxPrevBlock = nTokens - 1;
+                return null;
+            }
+
             // multihead attention. iterate over all heads
             Parallel.parallelForLong(0, (long) nTokens * (long) config.numberOfHeads, ht -> {
                 int token = (int) (ht / config.numberOfHeads);
@@ -1136,7 +1142,7 @@ public static List<Integer> generateTokens(Llama model, State state, int startPo
         int promptIndex = 0;
         for (int position = startPosition; position < maxTokens; ++position) {
             if (promptIndex < promptTokens.size()) {
-                final int nTokens = Math.min(promptTokens.size() - promptIndex, state.batchsize);
+                final int nTokens = Math.min(maxTokens - position, Math.min(promptTokens.size() - promptIndex, state.batchsize));
                 final int[] tokens = new int[nTokens];
                 for (int i = 0; i < nTokens; i++) {
                     tokens[i] = promptTokens.get(promptIndex + i);
@@ -1148,15 +1154,17 @@ public static List<Integer> generateTokens(Llama model, State state, int startPo
                 if (echo) {
                     System.out.format("position=%d, promptIdx=%d, promptSize=%d, tokens=%s%n", position, promptIndex, promptTokens.size(), Arrays.toString(tokens));
                 }
-                forward(model, state, tokens, position);
+                // Only compute logits on the very last batch.
+                boolean computeLogits = promptIndex + nTokens >= promptTokens.size();
+                forward(model, state, tokens, position, computeLogits);
                 position += nTokens - 1; // -1 -> incremented later in the for loop
                 promptIndex += nTokens;
                 if (promptIndex < promptTokens.size()) {
                     continue;
                 }
                 startGen = System.nanoTime();
             } else {
-                forward(model, state, new int[]{token}, position);
+                forward(model, state, new int[]{token}, position, true);
             }
             nextToken = sampler.sampleToken(state.logits);
             if (echo) {
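
The hunks above cover the last-layer skip and the caller; the matrix multiplication mentioned in the commit message, the classifier projection that turns the final hidden state into logits over the vocabulary, is guarded by the same flag at the tail of forward(). That hunk is not displayed here, but a hedged sketch of such a guard follows; the names state.x, state.idxPrevBlock, weights.rms_final_weight, weights.wcls, config.vocabularySize, and config.rmsNormEps are assumptions in the style of the visible code, not the commit's exact lines.

        // Hypothetical tail of forward(); this hunk is not shown in the diff above.
        // The classifier matmul is (vocabularySize x dim), typically the single
        // largest matrix multiplication of a forward pass, so returning early for
        // every prefill batch except the last one saves substantial work.
        if (!computeLogits) {
            return null; // no token will be sampled from this batch
        }
        rmsnorm(state.x[state.idxPrevBlock], state.x[state.idxPrevBlock], weights.rms_final_weight, dim, config.rmsNormEps);
        weights.wcls.matmul(state.x[state.idxPrevBlock], state.logits, config.vocabularySize, dim);
        return state.logits;

During generation the caller always passes true, so the single-token decode path is unchanged.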
