1 change: 0 additions & 1 deletion LLama.KernelMemory/BuilderExtensions.cs
@@ -77,7 +77,6 @@ public static IKernelMemoryBuilder WithLLamaSharpDefaults(this IKernelMemoryBuil
SplitMode = config.SplitMode,
BatchSize = 512,
UBatchSize = 512,
FlashAttention = true,
Contributor:

Using FlashAttention should likely remain LLamaSharp's default.

UseMemorymap = true
};

2 changes: 0 additions & 2 deletions LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs
@@ -40,7 +40,6 @@ public LLamaSharpTextEmbeddingGenerator(LLamaSharpConfig config)
SplitMode = config?.SplitMode ?? LLama.Native.GPUSplitMode.Layer,
BatchSize = 512,
UBatchSize = 512,
FlashAttention = true,
Contributor:

Using FlashAttention should likely remain LLamaSharp's default.

UseMemorymap = true,
PoolingType = LLamaPoolingType.Mean,
};
@@ -68,7 +67,6 @@ public LLamaSharpTextEmbeddingGenerator(LLamaSharpConfig config, LLamaWeights we
SplitMode = config?.SplitMode ?? LLama.Native.GPUSplitMode.Layer,
BatchSize = 512,
UBatchSize = 512,
FlashAttention = true,
UseMemorymap = true,
PoolingType = LLamaPoolingType.Mean,
};
2 changes: 0 additions & 2 deletions LLama.KernelMemory/LlamaSharpTextGenerator.cs
@@ -38,7 +38,6 @@ public LlamaSharpTextGenerator(LLamaSharpConfig config)
SplitMode = config?.SplitMode ?? LLama.Native.GPUSplitMode.Layer,
BatchSize = 512,
UBatchSize = 512,
FlashAttention = true,
UseMemorymap = true
};
_weights = LLamaWeights.LoadFromFile(@params);
@@ -66,7 +65,6 @@ public LlamaSharpTextGenerator(LLamaWeights weights, LLamaSharpConfig config, St
SplitMode = config?.SplitMode ?? LLama.Native.GPUSplitMode.Layer,
BatchSize = 512,
UBatchSize = 512,
FlashAttention = true,
UseMemorymap = true
};
_executor = executor ?? new StatelessExecutor(_weights, @params);
4 changes: 2 additions & 2 deletions LLama.Unittest/LLamaContextTests.cs
@@ -13,7 +13,7 @@ public LLamaContextTests()
{
var @params = new ModelParams(Constants.GenerativeModelPath2)
{
ContextSize = 128,
ContextSize = 512,
BatchSize = 8,
UBatchSize = 8,
SeqMax = 1,
@@ -33,7 +33,7 @@ public void Dispose()
[Fact]
public void CheckProperties()
{
Assert.Equal(128u, _context.ContextSize);
Assert.Equal(512u, _context.ContextSize);
Assert.Equal(960, _context.EmbeddingSize);
Assert.Equal(49152, _context.Vocab.Count);
}
4 changes: 2 additions & 2 deletions LLama.Unittest/LLamaContextWithCustomLoggerTests.cs
@@ -30,7 +30,7 @@ public LLamaContextWithCustomLoggerTests()
{
var @params = new ModelParams(Constants.GenerativeModelPath2)
{
ContextSize = 128,
ContextSize = 512,
GpuLayerCount = Constants.CIGpuLayerCount,
};

@@ -55,7 +55,7 @@ public void Dispose()
[Fact]
public void CheckProperties()
{
Assert.Equal(128u, _context.ContextSize);
Assert.Equal(512u, _context.ContextSize);
Assert.Equal(960, _context.EmbeddingSize);
Assert.Equal(49152, _context.Vocab.Count);
}
2 changes: 1 addition & 1 deletion LLama.Unittest/LLamaRerankerTests.cs
@@ -18,9 +18,9 @@ public LLamaRerankerTests(ITestOutputHelper testOutputHelper)
var @params = new ModelParams(Constants.RerankingModelPath)
{
ContextSize = 0,
SeqMax = 1,
PoolingType = LLamaPoolingType.Rank,
GpuLayerCount = Constants.CIGpuLayerCount,

};
using var weights = LLamaWeights.LoadFromFile(@params);
_reranker = new LLamaReranker(weights, @params);
3 changes: 2 additions & 1 deletion LLama.Unittest/SamplingTests.cs
@@ -25,6 +25,7 @@ public SamplingTests(ITestOutputHelper testOutputHelper)
_params = new ModelParams(Constants.GenerativeModelPath2) {
ContextSize = 200,
BatchSize = 200,
SeqMax = 4,
Contributor:

I don't think specifying SeqMax is necessary now that kv_unified is enabled.

Author:

It is due to a regression in llama.cpp; without this, the batched sampling and reranker tests would fail. Look at this.

Contributor:

Can you try without? I think it isn't needed

Author (@krisbiradar, Oct 30, 2025):

I did try, and the reranker and batched sampling tests failed. I think one reason is that it previously passed some number of max sequences by default, but after the unified KV change it stopped doing that and now passes 0 as the max sequences.

Looks like this is going to be a breaking change; shall we go ahead anyway?

Author:

If this answers your query, can we go ahead?

Contributor:

Yeah, go ahead and remove SeqMax from here and other tests.

The tests are there to ensure LLamaSharp continues to function properly with at least some backward compatibility.
Making users specify SeqMax during model instantiation is not a good idea.

If kv_unified is enabled correctly, we shouldn't need to specify a seq num beforehand.
ggml-org/llama.cpp#16432

Contributor:

Since Martin mentions there's a min context size now, maybe that's why the test was failing?

Either way, batching should be allowed without having to specify SeqMax, so that has to go.

Author:

If that's the case, I guess I've missed something else in the kv_unified implementation. Looking into this.

GpuLayerCount = Constants.CIGpuLayerCount,
};
_model = LLamaWeights.LoadFromFile(_params);
@@ -104,7 +105,7 @@ public void BatchedSampling()
}
}

// Add " repeat" and test whether next tokens will be "this phrase forever.".
// Add " repeat" and test whether next tokens will be "this phrase forever."
for (int i = 0; i < 4; i++)
{
for (int b = 0; b < batch_count; b++)
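Following the SeqMax thread above, a minimal sketch (placeholder model path, illustrative values; not part of this PR) of what batched-sampling model parameters could look like if SeqMax can be dropped once unified KV is in place:

using LLama.Common;

// Hedged sketch: with kv_unified enabled, batched sampling setup should not need
// SeqMax at model instantiation (per the review discussion above).
var @params = new ModelParams("path/to/model.gguf") // placeholder path
{
    ContextSize = 200,
    BatchSize = 200,
    // SeqMax deliberately omitted
    GpuLayerCount = 0 // illustrative value
};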
2 changes: 1 addition & 1 deletion LLama.Web/Common/ModelOptions.cs
@@ -102,7 +102,7 @@ public class ModelOptions
public bool NoKqvOffload { get; set; }

/// <inheritdoc />
public bool FlashAttention { get; set; }
public bool? FlashAttention { get; set; }

/// <inheritdoc />
public Encoding Encoding { get; set; } = Encoding.UTF8;
4 changes: 2 additions & 2 deletions LLama/Abstractions/IContextParams.cs
@@ -106,8 +106,8 @@ public interface IContextParams
/// <summary>
/// Whether to use flash attention
/// </summary>
bool FlashAttention { get; }

bool? FlashAttention { get; }
/// <summary>
/// defragment the KV cache if holes/size &gt; defrag_threshold, Set to &lt;= 0 to disable (default)
/// </summary>
7 changes: 4 additions & 3 deletions LLama/Common/ModelParams.cs
@@ -1,3 +1,4 @@
using System;
using LLama.Abstractions;
using System.Text;
using System.Text.Json.Serialization;
@@ -95,12 +96,12 @@ public record ModelParams

/// <inheritdoc />
public bool NoKqvOffload { get; set; }

/// <inheritdoc />

public bool FlashAttention { get; set; }
public bool? FlashAttention { get; set; }

/// <inheritdoc />
[Obsolete]
public float? DefragThreshold { get; set; }

/// <inheritdoc />
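A minimal usage sketch (placeholder model path; not part of this PR) of the now-nullable flag; each state maps to a different native flash attention mode in the extensions change below:

using LLama.Common;

// Hedged sketch: the three states of ModelParams.FlashAttention after this change.
var autoMode  = new ModelParams("model.gguf");                              // null  -> let llama.cpp decide
var forcedOn  = new ModelParams("model.gguf") { FlashAttention = true };    // true  -> flash attention enabled
var forcedOff = new ModelParams("model.gguf") { FlashAttention = false };   // false -> flash attention disabled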
8 changes: 7 additions & 1 deletion LLama/Extensions/IContextParamsExtensions.cs
@@ -49,9 +49,15 @@ public static void ToLlamaContextParams(this IContextParams @params, out LLamaCo
result.type_k = @params.TypeK ?? GGMLType.GGML_TYPE_F16;
result.type_v = @params.TypeV ?? GGMLType.GGML_TYPE_F16;
result.offload_kqv = !@params.NoKqvOffload;
result.flash_attention = @params.FlashAttention;
Contributor (@Lyrcaxis, Oct 10, 2025):

Instead of completely removing the option to use flash attention, can you pass it to llama_flash_attn_type?
I would suggest keeping the previous FlashAttention bool as it was, but turning it nullable, so null == Auto.

result.llama_flash_attn_type = @params.FlashAttention switch
{
    true => LLamaFlashAttentionType.LLAMA_FLASH_ATTENTION_TYPE_ENABLED,
    false => LLamaFlashAttentionType.LLAMA_FLASH_ATTENTION_TYPE_DISABLED,
    null => LLamaFlashAttentionType.LLAMA_FLASH_ATTENTION_TYPE_AUTO
};
result.kv_unified = true; // if we wanna hardcode it here instead of in `Default()`.

result.llama_pooling_type = @params.PoolingType;
result.attention_type = @params.AttentionType;
result.llama_flash_attn_type = @params.FlashAttention switch
{
true => LLamaFlashAttentionType.LLAMA_FLASH_ATTENTION_TYPE_ENABLED,
false => LLamaFlashAttentionType.LLAMA_FLASH_ATTENTION_TYPE_DISABLED,
null => LLamaFlashAttentionType.LLAMA_FLASH_ATTENTION_TYPE_AUTO
};
result.kv_unified = true;

result.n_threads = Threads(@params.Threads);
result.n_threads_batch = Threads(@params.BatchThreads);
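A small sketch (placeholder model path; assumes the extension method is reachable from user code) of how the managed flag surfaces on the native struct after conversion:

using LLama.Abstractions;   // IContextParams
using LLama.Common;         // ModelParams
using LLama.Extensions;     // ToLlamaContextParams
using LLama.Native;         // LLamaContextParams, LLamaFlashAttentionType

// Hedged sketch: converting managed params and checking the resulting native value.
IContextParams managed = new ModelParams("model.gguf") { FlashAttention = null }; // null -> AUTO
managed.ToLlamaContextParams(out LLamaContextParams native);
System.Diagnostics.Debug.Assert(
    native.llama_flash_attn_type == LLamaFlashAttentionType.LLAMA_FLASH_ATTENTION_TYPE_AUTO);
// kv_unified is also set unconditionally to true by this extension (see the diff above).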
2 changes: 1 addition & 1 deletion LLama/LLamaSharp.csproj
@@ -57,7 +57,7 @@
</ItemGroup>

<PropertyGroup>
<BinaryReleaseId>11dd5a44eb180e</BinaryReleaseId>
<BinaryReleaseId>86587da</BinaryReleaseId>
</PropertyGroup>

<PropertyGroup>
5 changes: 5 additions & 0 deletions LLama/Native/LLamaContextParams.cs
@@ -64,6 +64,11 @@ public struct LLamaContextParams
/// Attention type to use for embeddings
/// </summary>
public LLamaAttentionType attention_type;

/// <summary>
/// When to enable flash attention
/// </summary>
public LLamaFlashAttentionType llama_flash_attn_type;

/// <summary>
/// RoPE base frequency, 0 = from model
19 changes: 19 additions & 0 deletions LLama/Native/LLamaFlashAttentionType.cs
@@ -0,0 +1,19 @@
namespace LLama.Native;
/// <summary>
/// Flash attention mode (mirrors llama.cpp's flash_attn_type)
/// </summary>
public enum LLamaFlashAttentionType
{
/// <summary>
/// Let the backend decide whether to use flash attention
/// </summary>
LLAMA_FLASH_ATTENTION_TYPE_AUTO = -1,
/// <summary>
/// Flash attention disabled
/// </summary>
LLAMA_FLASH_ATTENTION_TYPE_DISABLED = 0,
/// <summary>
/// Flash attention enabled
/// </summary>
LLAMA_FLASH_ATTENTION_TYPE_ENABLED = 1,
}
7 changes: 6 additions & 1 deletion LLama/Native/LLamaFtype.cs
@@ -201,7 +201,12 @@ public enum LLamaFtype
/// except 1d tensors
/// </summary>
LLAMA_FTYPE_MOSTLY_TQ2_0 = 37,


/// <summary>
/// except 1d tensors
/// </summary>
LLAMA_FTYPE_MOSTLY_MXFP4_MOE = 38,

/// <summary>
/// File type was not specified
/// </summary>
11 changes: 10 additions & 1 deletion LLama/Native/LLamaModelParams.cs
@@ -100,7 +100,16 @@ public bool check_tensors
set => _check_tensors = Convert.ToSByte(value);
}
private sbyte _check_tensors;


/// <summary>
/// use extra buffer types (used for weight repacking)
/// </summary>
public bool use_extra_bufts
{
readonly get => Convert.ToBoolean(_use_extra_bufts);
set => _use_extra_bufts = Convert.ToSByte(value);
}
private sbyte _use_extra_bufts;
/// <summary>
/// Create a LLamaModelParams with default values
/// </summary>
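For illustration (not part of this PR), a hedged sketch showing that the new field follows the struct's existing bool-over-sbyte marshalling pattern:

using LLama.Native;

// Hedged sketch: toggling the new flag like the existing sbyte-backed bools.
LLamaModelParams mp = default;   // zeroed struct, purely for illustration
mp.use_extra_bufts = true;       // opt in to extra buffer types (weight repacking)
mp.check_tensors = false;        // existing flag using the same pattern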