1 change: 0 additions & 1 deletion LLama.KernelMemory/BuilderExtensions.cs
@@ -77,7 +77,6 @@ public static IKernelMemoryBuilder WithLLamaSharpDefaults(this IKernelMemoryBuil
SplitMode = config.SplitMode,
BatchSize = 512,
UBatchSize = 512,
FlashAttention = true,
Contributor:

Using FlashAttention should likely remain LLamaSharp's default.

UseMemorymap = true
};

2 changes: 0 additions & 2 deletions LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs
@@ -40,7 +40,6 @@ public LLamaSharpTextEmbeddingGenerator(LLamaSharpConfig config)
SplitMode = config?.SplitMode ?? LLama.Native.GPUSplitMode.Layer,
BatchSize = 512,
UBatchSize = 512,
FlashAttention = true,
Contributor:

Using FlashAttention should likely remain LLamaSharp's default.

UseMemorymap = true,
PoolingType = LLamaPoolingType.Mean,
};
@@ -68,7 +67,6 @@ public LLamaSharpTextEmbeddingGenerator(LLamaSharpConfig config, LLamaWeights we
SplitMode = config?.SplitMode ?? LLama.Native.GPUSplitMode.Layer,
BatchSize = 512,
UBatchSize = 512,
FlashAttention = true,
UseMemorymap = true,
PoolingType = LLamaPoolingType.Mean,
};
2 changes: 0 additions & 2 deletions LLama.KernelMemory/LlamaSharpTextGenerator.cs
@@ -38,7 +38,6 @@ public LlamaSharpTextGenerator(LLamaSharpConfig config)
SplitMode = config?.SplitMode ?? LLama.Native.GPUSplitMode.Layer,
BatchSize = 512,
UBatchSize = 512,
FlashAttention = true,
UseMemorymap = true
};
_weights = LLamaWeights.LoadFromFile(@params);
@@ -66,7 +65,6 @@ public LlamaSharpTextGenerator(LLamaWeights weights, LLamaSharpConfig config, St
SplitMode = config?.SplitMode ?? LLama.Native.GPUSplitMode.Layer,
BatchSize = 512,
UBatchSize = 512,
FlashAttention = true,
UseMemorymap = true
};
_executor = executor ?? new StatelessExecutor(_weights, @params);
4 changes: 2 additions & 2 deletions LLama.Unittest/LLamaContextTests.cs
@@ -13,7 +13,7 @@ public LLamaContextTests()
{
var @params = new ModelParams(Constants.GenerativeModelPath2)
{
ContextSize = 128,
ContextSize = 512,
BatchSize = 8,
UBatchSize = 8,
SeqMax = 1,
@@ -33,7 +33,7 @@ public void Dispose()
[Fact]
public void CheckProperties()
{
Assert.Equal(128u, _context.ContextSize);
Assert.Equal(512u, _context.ContextSize);
Assert.Equal(960, _context.EmbeddingSize);
Assert.Equal(49152, _context.Vocab.Count);
}
4 changes: 2 additions & 2 deletions LLama.Unittest/LLamaContextWithCustomLoggerTests.cs
@@ -30,7 +30,7 @@ public LLamaContextWithCustomLoggerTests()
{
var @params = new ModelParams(Constants.GenerativeModelPath2)
{
ContextSize = 128,
ContextSize = 512,
GpuLayerCount = Constants.CIGpuLayerCount,
};

@@ -55,7 +55,7 @@ public void Dispose()
[Fact]
public void CheckProperties()
{
Assert.Equal(128u, _context.ContextSize);
Assert.Equal(512u, _context.ContextSize);
Assert.Equal(960, _context.EmbeddingSize);
Assert.Equal(49152, _context.Vocab.Count);
}
2 changes: 1 addition & 1 deletion LLama.Unittest/LLamaRerankerTests.cs
@@ -18,9 +18,9 @@ public LLamaRerankerTests(ITestOutputHelper testOutputHelper)
var @params = new ModelParams(Constants.RerankingModelPath)
{
ContextSize = 0,
SeqMax = 1,
PoolingType = LLamaPoolingType.Rank,
GpuLayerCount = Constants.CIGpuLayerCount,

};
using var weights = LLamaWeights.LoadFromFile(@params);
_reranker = new LLamaReranker(weights, @params);
3 changes: 2 additions & 1 deletion LLama.Unittest/SamplingTests.cs
@@ -25,6 +25,7 @@ public SamplingTests(ITestOutputHelper testOutputHelper)
_params = new ModelParams(Constants.GenerativeModelPath2) {
ContextSize = 200,
BatchSize = 200,
SeqMax = 4,
Contributor:

I don't think specifying SeqMax is necessary now that kv_unified is enabled.

Author:

It is due to a regression in llama.cpp; without this, the batched sampling and reranker tests would fail. Look at this.

Contributor:

Can you try without? I think it isn't needed

Author (@krisbiradar, Oct 30, 2025):

I did try, and the reranker and batched sampling tests failed. I think one reason is that it previously passed some number of max sequences by default, but after the unified KV change it stopped doing that and now passes 0 as the max sequences.

Looks like this is going to be a breaking change; shall we go ahead anyway?

Author:

If this answers your query, can we go ahead?

Contributor:

Yeah, go ahead and remove SeqMax from here and other tests.

The tests are there to ensure LLamaSharp continues to function properly with at least some backward compatibility.
Making users specify SeqMax during model instantiation is not a good idea.

If kv_unified is enabled correctly, we shouldn't need to specify a seq num beforehand.
ggml-org/llama.cpp#16432

Contributor:

Since Martin mentions there's a min context size now, maybe that's why the test was failing?

Either way, batching should be allowed without having to specify SeqMax, so that has to go.

Author:

If that's the case, I guess I've missed something else in the kv_unified implementation. Looking into this.

GpuLayerCount = Constants.CIGpuLayerCount,
};
_model = LLamaWeights.LoadFromFile(_params);
@@ -104,7 +105,7 @@ public void BatchedSampling()
}
}

// Add " repeat" and test whether next tokens will be "this phrase forever.".
// Add " repeat" and test whether next tokens will be "this phrase forever."
for (int i = 0; i < 4; i++)
{
for (int b = 0; b < batch_count; b++)
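Following the SeqMax thread above, a minimal sketch (placeholder model path, illustrative values; not part of this PR) of what batched-sampling model parameters could look like if SeqMax can be dropped once unified KV is in place:

using LLama.Common;

// Hedged sketch: with kv_unified enabled, batched sampling setup should not need
// SeqMax at model instantiation (per the review discussion above).
var @params = new ModelParams("path/to/model.gguf") // placeholder path
{
    ContextSize = 200,
    BatchSize = 200,
    // SeqMax deliberately omitted
    GpuLayerCount = 0 // illustrative value
};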
2 changes: 1 addition & 1 deletion LLama.Web/Common/ModelOptions.cs
@@ -102,7 +102,7 @@ public class ModelOptions
public bool NoKqvOffload { get; set; }

/// <inheritdoc />
public bool FlashAttention { get; set; }
public bool? FlashAttention { get; set; }

/// <inheritdoc />
public Encoding Encoding { get; set; } = Encoding.UTF8;
4 changes: 2 additions & 2 deletions LLama/Abstractions/IContextParams.cs
@@ -106,8 +106,8 @@ public interface IContextParams
/// <summary>
/// Whether to use flash attention
/// </summary>
bool FlashAttention { get; }

bool? FlashAttention { get; }
/// <summary>
/// defragment the KV cache if holes/size &gt; defrag_threshold, Set to &lt;= 0 to disable (default)
/// </summary>
7 changes: 4 additions & 3 deletions LLama/Common/ModelParams.cs
@@ -1,3 +1,4 @@
using System;
using LLama.Abstractions;
using System.Text;
using System.Text.Json.Serialization;
@@ -95,12 +96,12 @@ public record ModelParams

/// <inheritdoc />
public bool NoKqvOffload { get; set; }

/// <inheritdoc />

public bool FlashAttention { get; set; }
public bool? FlashAttention { get; set; }

/// <inheritdoc />
[Obsolete]
public float? DefragThreshold { get; set; }

/// <inheritdoc />
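A minimal usage sketch (placeholder model path; not part of this PR) of the now-nullable flag; each state maps to a different native flash attention mode in the extensions change below:

using LLama.Common;

// Hedged sketch: the three states of ModelParams.FlashAttention after this change.
var autoMode  = new ModelParams("model.gguf");                              // null  -> let llama.cpp decide
var forcedOn  = new ModelParams("model.gguf") { FlashAttention = true };    // true  -> flash attention enabled
var forcedOff = new ModelParams("model.gguf") { FlashAttention = false };   // false -> flash attention disabled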
8 changes: 7 additions & 1 deletion LLama/Extensions/IContextParamsExtensions.cs
@@ -49,9 +49,15 @@ public static void ToLlamaContextParams(this IContextParams @params, out LLamaCo
result.type_k = @params.TypeK ?? GGMLType.GGML_TYPE_F16;
result.type_v = @params.TypeV ?? GGMLType.GGML_TYPE_F16;
result.offload_kqv = !@params.NoKqvOffload;
result.flash_attention = @params.FlashAttention;
Contributor (@Lyrcaxis, Oct 10, 2025):

Instead of completely removing the option to use flash attention, can you pass it to llama_flash_attn_type?
I would suggest keeping the previous FlashAttention bool as it was, but turning it nullable, so null == Auto.

result.llama_flash_attn_type = @params.FlashAttention switch
{
    true => LLamaFlashAttentionType.LLAMA_FLASH_ATTENTION_TYPE_ENABLED,
    false => LLamaFlashAttentionType.LLAMA_FLASH_ATTENTION_TYPE_DISABLED,
    null => LLamaFlashAttentionType.LLAMA_FLASH_ATTENTION_TYPE_AUTO
};
result.kv_unified = true; // if we wanna hardcode it here instead of in `Default()`.

result.llama_pooling_type = @params.PoolingType;
result.attention_type = @params.AttentionType;
result.llama_flash_attn_type = @params.FlashAttention switch
{
true => LLamaFlashAttentionType.LLAMA_FLASH_ATTENTION_TYPE_ENABLED,
false => LLamaFlashAttentionType.LLAMA_FLASH_ATTENTION_TYPE_DISABLED,
null => LLamaFlashAttentionType.LLAMA_FLASH_ATTENTION_TYPE_AUTO
};
result.kv_unified = true;

result.n_threads = Threads(@params.Threads);
result.n_threads_batch = Threads(@params.BatchThreads);
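A small sketch (placeholder model path; assumes the extension method is reachable from user code) of how the managed flag surfaces on the native struct after conversion:

using LLama.Abstractions;   // IContextParams
using LLama.Common;         // ModelParams
using LLama.Extensions;     // ToLlamaContextParams
using LLama.Native;         // LLamaContextParams, LLamaFlashAttentionType

// Hedged sketch: converting managed params and checking the resulting native value.
IContextParams managed = new ModelParams("model.gguf") { FlashAttention = null }; // null -> AUTO
managed.ToLlamaContextParams(out LLamaContextParams native);
System.Diagnostics.Debug.Assert(
    native.llama_flash_attn_type == LLamaFlashAttentionType.LLAMA_FLASH_ATTENTION_TYPE_AUTO);
// kv_unified is also set unconditionally to true by this extension (see the diff above).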
2 changes: 1 addition & 1 deletion LLama/LLamaSharp.csproj
@@ -57,7 +57,7 @@
</ItemGroup>

<PropertyGroup>
<BinaryReleaseId>11dd5a44eb180e</BinaryReleaseId>
<BinaryReleaseId>86587da</BinaryReleaseId>
</PropertyGroup>

<PropertyGroup>
5 changes: 5 additions & 0 deletions LLama/Native/LLamaContextParams.cs
@@ -64,6 +64,11 @@ public struct LLamaContextParams
/// Attention type to use for embeddings
/// </summary>
public LLamaAttentionType attention_type;

/// <summary>
/// When to enable flash attention
/// </summary>
public LLamaFlashAttentionType llama_flash_attn_type;

/// <summary>
/// RoPE base frequency, 0 = from model
19 changes: 19 additions & 0 deletions LLama/Native/LLamaFlashAttentionType.cs
@@ -0,0 +1,19 @@
namespace LLama.Native;
/// <summary>
/// Flash attention mode (mirrors llama.cpp's flash_attn_type)
/// </summary>
public enum LLamaFlashAttentionType
{
/// <summary>
/// Let the backend decide whether to use flash attention
/// </summary>
LLAMA_FLASH_ATTENTION_TYPE_AUTO = -1,
/// <summary>
/// Flash attention disabled
/// </summary>
LLAMA_FLASH_ATTENTION_TYPE_DISABLED = 0,
/// <summary>
/// Flash attention enabled
/// </summary>
LLAMA_FLASH_ATTENTION_TYPE_ENABLED = 1,
}
7 changes: 6 additions & 1 deletion LLama/Native/LLamaFtype.cs
@@ -201,7 +201,12 @@ public enum LLamaFtype
/// except 1d tensors
/// </summary>
LLAMA_FTYPE_MOSTLY_TQ2_0 = 37,


/// <summary>
/// except 1d tensors
/// </summary>
LLAMA_FTYPE_MOSTLY_MXFP4_MOE = 38,

/// <summary>
/// File type was not specified
/// </summary>
11 changes: 10 additions & 1 deletion LLama/Native/LLamaModelParams.cs
@@ -100,7 +100,16 @@ public bool check_tensors
set => _check_tensors = Convert.ToSByte(value);
}
private sbyte _check_tensors;


/// <summary>
/// use extra buffer types (used for weight repacking)
/// </summary>
public bool use_extra_bufts
{
readonly get => Convert.ToBoolean(_use_extra_bufts);
set => _use_extra_bufts = Convert.ToSByte(value);
}
private sbyte _use_extra_bufts;
/// <summary>
/// Create a LLamaModelParams with default values
/// </summary>
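For illustration (not part of this PR), a hedged sketch showing that the new field follows the struct's existing bool-over-sbyte marshalling pattern:

using LLama.Native;

// Hedged sketch: toggling the new flag like the existing sbyte-backed bools.
LLamaModelParams mp = default;   // zeroed struct, purely for illustration
mp.use_extra_bufts = true;       // opt in to extra buffer types (weight repacking)
mp.check_tensors = false;        // existing flag using the same pattern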