2 files changed: +7 -5 lines changed

@@ -200,13 +200,13 @@ if ($modelDataIsAvailable) {
             Select-String -Pattern '\b(\d+) MiB\b'
         ).Matches.Groups[1].Value * 1024 * 1024)

-    # CUDA Flash Attention requires the GPU to have Tensor Cores,
-    # which are available with a Compute Capability >= 7.0.
+    # The CUDA Flash Attention implementation of llama.cpp requires
+    # the NVIDIA GPU to have a Compute Capability of >= 6.0.
     # https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features-and-technical-specifications
     # https://github.com/ggerganov/llama.cpp/issues/7055
     $enableFlashAttention = ([Double](
         Invoke-Expression "nvidia-smi --query-gpu=compute_cap --format=csv,noheader"
-    ) -ge 7.0)
+    ) -ge 6.0)

     # The automatic calculation of the optimal number of GPU layers
     # can always be "overruled" by using the -numberOfGPULayers option.
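Side note on the capability check: nvidia-smi --query-gpu=compute_cap prints one value per installed GPU, so the single [Double] cast above assumes a one-GPU machine. A minimal, hypothetical sketch of a multi-GPU-safe variant (the variable names below are illustrative and not part of this change) could look like this:

    # Hypothetical multi-GPU-safe variant (assumes nvidia-smi is on PATH
    # and prints one Compute Capability per line, e.g. "8.6").
    $computeCapabilities = nvidia-smi --query-gpu=compute_cap --format=csv,noheader |
        ForEach-Object { [Double]$_.Trim() }

    # Decide based on the weakest installed GPU, since layers may be
    # spread across all devices.
    $minimumComputeCapability = ($computeCapabilities | Measure-Object -Minimum).Minimum
    $enableFlashAttention = ($minimumComputeCapability -ge 6.0)

Calling nvidia-smi directly also avoids the Invoke-Expression indirection.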
@@ -258,7 +258,9 @@ if ($contextSize -gt $modelContextLength) {
         $groupAttentionWidth = $modelContextLength / 2
     }

-    # We are defaulting the KV cache data type to a quantized format.
+    # We are defaulting the KV cache data type to a quantized format
+    # if Flash Attention is enabled to maximize the context size.
+    # https://github.com/ggerganov/llama.cpp/pull/7527
     if (!$kvCacheDataType) {
         if ($enableFlashAttention) {
             $kvCacheDataType = 'q4_0'
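For context, llama.cpp exposes these settings through the --flash-attn and --cache-type-k / --cache-type-v server options (exact names may vary by llama.cpp version), and a quantized V cache generally requires Flash Attention to be enabled. A rough, hypothetical sketch of how the values computed above could be forwarded to llama-server follows; the argument assembly in the actual script may look different:

    # Hypothetical argument assembly; $serverArguments and the final
    # invocation below are illustrative, not taken from this script.
    $serverArguments = @()

    if ($enableFlashAttention) {
        $serverArguments += '--flash-attn'
    }

    if ($kvCacheDataType) {
        # A quantized V cache (e.g. 'q4_0') generally requires Flash Attention.
        $serverArguments += @('--cache-type-k', $kvCacheDataType)
        $serverArguments += @('--cache-type-v', $kvCacheDataType)
    }

    Invoke-Expression "llama-server $($serverArguments -join ' ')"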