
Commit fdc2bd4

Enable Flash Attention on Pascal GPUs
1 parent 2ec4aab commit fdc2bd4

2 files changed: +7, -5 lines

examples/server.ps1

Lines changed: 6 additions & 4 deletions
@@ -200,13 +200,13 @@ if ($modelDataIsAvailable) {
             Select-String -Pattern '\b(\d+) MiB\b'
         ).Matches.Groups[1].Value * 1024 * 1024)
 
-    # CUDA Flash Attention requires the GPU to have Tensor Cores,
-    # which are available with a Compute Capability >= 7.0.
+    # The CUDA Flash Attention implementation of llama.cpp requires
+    # the NVIDIA GPU to have a Compute Capability of >= 6.0.
     # https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features-and-technical-specifications
     # https://github.com/ggerganov/llama.cpp/issues/7055
     $enableFlashAttention = ([Double](
         Invoke-Expression "nvidia-smi --query-gpu=compute_cap --format=csv,noheader"
-    ) -ge 7.0)
+    ) -ge 6.0)
 
     # The automatic calculating the optimal number of GPU layers can
     # always be "overruled" by using the -numberOfGPULayers option.
@@ -258,7 +258,9 @@ if ($contextSize -gt $modelContextLength) {
     $groupAttentionWidth = $modelContextLength / 2
 }
 
-# We are defaulting the KV cache data type to a quantized format.
+# We are defaulting the KV cache data type to a quantized format
+# if Flash Attention is enabled to maximize the context size.
+# https://github.com/ggerganov/llama.cpp/pull/7527
 if (!$kvCacheDataType) {
     if ($enableFlashAttention) {
         $kvCacheDataType = 'q4_0'
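
For readers without a Pascal card at hand, here is a minimal standalone PowerShell sketch of the Compute Capability gate changed above. It mirrors the check in examples/server.ps1 but is illustrative only; the Select-Object -First 1 step (considering just the first GPU that nvidia-smi reports) is an assumption, not something the script necessarily does.

# Illustrative sketch (not part of the commit): gate Flash Attention on the
# CUDA Compute Capability reported by nvidia-smi. Pascal GPUs (GTX 10xx)
# report 6.x; Volta and newer report 7.0 or higher.
$computeCapability = [Double](
    (nvidia-smi --query-gpu=compute_cap --format=csv,noheader) |
        Select-Object -First 1    # only the first GPU is considered (assumption)
)
$enableFlashAttention = ($computeCapability -ge 6.0)
Write-Host "Compute Capability $computeCapability -> Flash Attention: $enableFlashAttention"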
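
And a sketch of how the two derived settings would typically be handed to llama.cpp's server binary. The flag names (--flash-attn, --cache-type-k, --cache-type-v) follow llama.cpp's command-line help and may differ between versions; $model is a placeholder, and whether server.ps1 builds its argument list this way is an assumption.

# Illustrative sketch (assumptions: llama-server is on PATH, $model points to a
# GGUF file, and $contextSize is already computed). A quantized KV cache such
# as q4_0 is only usable when Flash Attention is enabled (llama.cpp PR #7527).
$serverArguments = @('--model', $model, '--ctx-size', $contextSize)

if ($enableFlashAttention) {
    $serverArguments += '--flash-attn'

    if (!$kvCacheDataType) {
        $kvCacheDataType = 'q4_0'    # the default the commit above documents
    }

    $serverArguments += @('--cache-type-k', $kvCacheDataType,
                          '--cache-type-v', $kvCacheDataType)
}

& llama-server @serverArguments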

vendor/llama.cpp

Lines changed: 1 addition & 1 deletion (submodule commit pointer updated)

0 commit comments
