[AArch64] Add sve bf16 fpext and fpround costs. #150485
Conversation
@llvm/pr-subscribers-llvm-analysis @llvm/pr-subscribers-backend-aarch64

Author: David Green (davemgreen)

Changes

This prevents them from generating Invalid costs, as generating the instructions seems to work fine with and without +bf16. The costs are mostly taken from the number of instructions (minus ptrue and constants) from https://llvm.godbolt.org/z/16zMd47he

Full diff: https://github.com/llvm/llvm-project/pull/150485.diff

3 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 40f49dade6131..ba00d9e4dbb99 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -3093,6 +3093,12 @@ InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
{ISD::FP_ROUND, MVT::v2bf16, MVT::v2f64, 2}, // bfcvtn+fcvtn
{ISD::FP_ROUND, MVT::v4bf16, MVT::v4f64, 3}, // fcvtn+fcvtl2+bfcvtn
{ISD::FP_ROUND, MVT::v8bf16, MVT::v8f64, 6}, // 2 * fcvtn+fcvtn2+bfcvtn
+ {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f32, 1}, // bfcvt
+ {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f32, 1}, // bfcvt
+ {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f32, 3}, // bfcvt+bfcvt+uzp1
+ {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f64, 2}, // fcvtx+bfcvt
+ {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f64, 5}, // fcvtx+bfcvt+bfcvt+uzp1
+ {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f64, 11}, // 4*fcvt+4*bfcvt+3*uzp
};
if (ST->hasBF16())
@@ -3501,11 +3507,21 @@ InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
{ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f32, 1},
{ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f32, 3},
+ // Truncate from nxvmf32 to nxvmbf16.
+ {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f32, 8},
+ {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f32, 8},
+ {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f32, 17},
+
// Truncate from nxvmf64 to nxvmf16.
{ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f64, 1},
{ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f64, 3},
{ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f64, 7},
+ // Truncate from nxvmf64 to nxvmbf16.
+ {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f64, 9},
+ {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f64, 19},
+ {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f64, 39},
+
// Truncate from nxvmf64 to nxvmf32.
{ISD::FP_ROUND, MVT::nxv2f32, MVT::nxv2f64, 1},
{ISD::FP_ROUND, MVT::nxv4f32, MVT::nxv4f64, 3},
@@ -3516,11 +3532,22 @@ InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
{ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4f16, 1},
{ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8f16, 2},
+ // Extend from nxvmbf16 to nxvmf32.
+ {ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2bf16, 1}, // lsl
+ {ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4bf16, 1}, // lsl
+ {ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8bf16, 2}, // unpck+unpck+lsl+lsl
+
// Extend from nxvmf16 to nxvmf64.
{ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f16, 1},
{ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f16, 2},
{ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f16, 4},
+ // Extend from nxvmbf16 to nxvmf64.
+ {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2bf16, 2}, // lsl+fcvt
+ {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4bf16, 6}, // unpck+unpck+lsl+lsl
+ // + fcvt+fcvt
+ {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8bf16, 14}, // 6*unpck+4*lsl+4*fcvt
+
// Extend from nxvmf32 to nxvmf64.
{ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f32, 1},
{ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f32, 2},
diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-fpext.ll b/llvm/test/Analysis/CostModel/AArch64/sve-fpext.ll
index 805b3713830ab..ddc1a72e16318 100644
--- a/llvm/test/Analysis/CostModel/AArch64/sve-fpext.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/sve-fpext.ll
@@ -34,12 +34,12 @@ define void @sve_fpext() {
define void @sve_fpext_bf16() {
; CHECK-LABEL: 'sve_fpext_bf16'
-; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:1 Lat:1 SizeLat:1 for: %nxv2_f16_to_f32 = fpext <vscale x 2 x bfloat> undef to <vscale x 2 x float>
-; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:1 Lat:1 SizeLat:1 for: %nxv4_f16_to_f32 = fpext <vscale x 4 x bfloat> undef to <vscale x 4 x float>
-; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:1 Lat:1 SizeLat:1 for: %nxv8_f16_to_f32 = fpext <vscale x 8 x bfloat> undef to <vscale x 8 x float>
-; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:1 Lat:1 SizeLat:1 for: %nxv2_f16_to_f64 = fpext <vscale x 2 x bfloat> undef to <vscale x 2 x double>
-; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:1 Lat:1 SizeLat:1 for: %nxv4_f16_to_f64 = fpext <vscale x 4 x bfloat> undef to <vscale x 4 x double>
-; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:1 Lat:1 SizeLat:1 for: %nxv8_f16_to_f64 = fpext <vscale x 8 x bfloat> undef to <vscale x 8 x double>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %nxv2_f16_to_f32 = fpext <vscale x 2 x bfloat> undef to <vscale x 2 x float>
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %nxv4_f16_to_f32 = fpext <vscale x 4 x bfloat> undef to <vscale x 4 x float>
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %nxv8_f16_to_f32 = fpext <vscale x 8 x bfloat> undef to <vscale x 8 x float>
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %nxv2_f16_to_f64 = fpext <vscale x 2 x bfloat> undef to <vscale x 2 x double>
+; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %nxv4_f16_to_f64 = fpext <vscale x 4 x bfloat> undef to <vscale x 4 x double>
+; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:1 Lat:1 SizeLat:1 for: %nxv8_f16_to_f64 = fpext <vscale x 8 x bfloat> undef to <vscale x 8 x double>
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%nxv2_f16_to_f32 = fpext <vscale x 2 x bfloat> undef to <vscale x 2 x float>
diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-fptrunc.ll b/llvm/test/Analysis/CostModel/AArch64/sve-fptrunc.ll
index bb31ebfbed4ba..88c53ec4510b1 100644
--- a/llvm/test/Analysis/CostModel/AArch64/sve-fptrunc.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/sve-fptrunc.ll
@@ -34,14 +34,23 @@ define void @sve_fptruncs() {
}
define void @sve_fptruncs_bf16() {
-; CHECK-LABEL: 'sve_fptruncs_bf16'
-; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:1 Lat:1 SizeLat:1 for: %nxv2_f16_from_f32 = fptrunc <vscale x 2 x float> undef to <vscale x 2 x bfloat>
-; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:1 Lat:1 SizeLat:1 for: %nxv4_f16_from_f32 = fptrunc <vscale x 4 x float> undef to <vscale x 4 x bfloat>
-; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:1 Lat:1 SizeLat:1 for: %nxv8_f16_from_f32 = fptrunc <vscale x 8 x float> undef to <vscale x 8 x bfloat>
-; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:1 Lat:1 SizeLat:1 for: %nxv2_f16_from_f64 = fptrunc <vscale x 2 x double> undef to <vscale x 2 x bfloat>
-; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:1 Lat:1 SizeLat:1 for: %nxv4_f16_from_f64 = fptrunc <vscale x 4 x double> undef to <vscale x 4 x bfloat>
-; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:1 Lat:1 SizeLat:1 for: %nxv8_f16_from_f64 = fptrunc <vscale x 8 x double> undef to <vscale x 8 x bfloat>
-; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+; CHECK-NOBF16-LABEL: 'sve_fptruncs_bf16'
+; CHECK-NOBF16-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %nxv2_f16_from_f32 = fptrunc <vscale x 2 x float> undef to <vscale x 2 x bfloat>
+; CHECK-NOBF16-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %nxv4_f16_from_f32 = fptrunc <vscale x 4 x float> undef to <vscale x 4 x bfloat>
+; CHECK-NOBF16-NEXT: Cost Model: Found costs of RThru:17 CodeSize:1 Lat:1 SizeLat:1 for: %nxv8_f16_from_f32 = fptrunc <vscale x 8 x float> undef to <vscale x 8 x bfloat>
+; CHECK-NOBF16-NEXT: Cost Model: Found costs of RThru:9 CodeSize:1 Lat:1 SizeLat:1 for: %nxv2_f16_from_f64 = fptrunc <vscale x 2 x double> undef to <vscale x 2 x bfloat>
+; CHECK-NOBF16-NEXT: Cost Model: Found costs of RThru:19 CodeSize:1 Lat:1 SizeLat:1 for: %nxv4_f16_from_f64 = fptrunc <vscale x 4 x double> undef to <vscale x 4 x bfloat>
+; CHECK-NOBF16-NEXT: Cost Model: Found costs of RThru:39 CodeSize:1 Lat:1 SizeLat:1 for: %nxv8_f16_from_f64 = fptrunc <vscale x 8 x double> undef to <vscale x 8 x bfloat>
+; CHECK-NOBF16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+; CHECK-BF16-LABEL: 'sve_fptruncs_bf16'
+; CHECK-BF16-NEXT: Cost Model: Found costs of 1 for: %nxv2_f16_from_f32 = fptrunc <vscale x 2 x float> undef to <vscale x 2 x bfloat>
+; CHECK-BF16-NEXT: Cost Model: Found costs of 1 for: %nxv4_f16_from_f32 = fptrunc <vscale x 4 x float> undef to <vscale x 4 x bfloat>
+; CHECK-BF16-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %nxv8_f16_from_f32 = fptrunc <vscale x 8 x float> undef to <vscale x 8 x bfloat>
+; CHECK-BF16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %nxv2_f16_from_f64 = fptrunc <vscale x 2 x double> undef to <vscale x 2 x bfloat>
+; CHECK-BF16-NEXT: Cost Model: Found costs of RThru:5 CodeSize:1 Lat:1 SizeLat:1 for: %nxv4_f16_from_f64 = fptrunc <vscale x 4 x double> undef to <vscale x 4 x bfloat>
+; CHECK-BF16-NEXT: Cost Model: Found costs of RThru:11 CodeSize:1 Lat:1 SizeLat:1 for: %nxv8_f16_from_f64 = fptrunc <vscale x 8 x double> undef to <vscale x 8 x bfloat>
+; CHECK-BF16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%nxv2_f16_from_f32 = fptrunc <vscale x 2 x float> undef to <vscale x 2 x bfloat>
%nxv4_f16_from_f32 = fptrunc <vscale x 4 x float> undef to <vscale x 4 x bfloat>
@@ -53,6 +62,3 @@ define void @sve_fptruncs_bf16() {
ret void
}
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; CHECK-BF16: {{.*}}
-; CHECK-NOBF16: {{.*}}
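For reference, the table entries above feed a first-match lookup over (opcode, destination type, source type), in the spirit of LLVM's ConvertCostTableLookup helper. Below is a simplified, self-contained sketch of that lookup; the enum and struct names are illustrative stand-ins rather than the real LLVM types, and only the +bf16 FP_ROUND rows from this patch are reproduced.

#include <cstdio>
#include <optional>

// Illustrative stand-ins for LLVM's ISD opcodes and MVT simple value types.
enum Opcode { FP_ROUND, FP_EXTEND };
enum SimpleVT { nxv2bf16, nxv4bf16, nxv8bf16, nxv2f32, nxv4f32, nxv8f32,
                nxv2f64, nxv4f64, nxv8f64 };

struct ConversionCostEntry {
  Opcode ISD;
  SimpleVT Dst, Src;
  int Cost;
};

// Mirrors the +bf16 FP_ROUND entries added by this patch.
static const ConversionCostEntry BF16Tbl[] = {
    {FP_ROUND, nxv2bf16, nxv2f32, 1},  // bfcvt
    {FP_ROUND, nxv4bf16, nxv4f32, 1},  // bfcvt
    {FP_ROUND, nxv8bf16, nxv8f32, 3},  // bfcvt+bfcvt+uzp1
    {FP_ROUND, nxv2bf16, nxv2f64, 2},  // fcvtx+bfcvt
    {FP_ROUND, nxv4bf16, nxv4f64, 5},  // 2*fcvtx+2*bfcvt+uzp1
    {FP_ROUND, nxv8bf16, nxv8f64, 11}, // 4*fcvt+4*bfcvt+3*uzp
};

// First match wins, as in the real cost-table lookup.
std::optional<int> lookupCost(Opcode ISD, SimpleVT Dst, SimpleVT Src) {
  for (const auto &E : BF16Tbl)
    if (E.ISD == ISD && E.Dst == Dst && E.Src == Src)
      return E.Cost;
  return std::nullopt; // no entry: fall back to the generic cost
}

int main() {
  if (auto C = lookupCost(FP_ROUND, nxv8bf16, nxv8f64))
    std::printf("fptrunc nxv8f64 -> nxv8bf16 costs %d\n", *C); // prints 11
}

In the patch itself these rows only apply when +bf16 is available; without it, the later table entries model the more expensive expansion (costs 8, 8 and 17 from f32; 9, 19 and 39 from f64).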
  // Extend from nxvmbf16 to nxvmf32.
  {ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2bf16, 1}, // lsl
  {ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4bf16, 1}, // lsl
  {ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8bf16, 2}, // unpck+unpck+lsl+lsl
This should be 4?
The jump from 1 to 4 feels a bit odd. I was thinking that we might be able to use zip1/zip2 with a zero vector to pull the bits into the right lanes. For the moment it uses 4 though, thanks.
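For context on why the extend direction stays cheap: bf16 is the high half of an f32 bit pattern, so widening a lane is just a 16-bit left shift (the lsl in the table comments), with unpacks only needed to grow the lanes first. A minimal scalar illustration of that identity (not the SVE lowering itself):

#include <cstdint>
#include <cstring>
#include <cstdio>

// bf16 keeps the top 16 bits of an IEEE-754 binary32 value, so extending
// bf16 -> f32 is just appending 16 zero bits.
float bf16_to_f32(uint16_t bf16_bits) {
  uint32_t f32_bits = static_cast<uint32_t>(bf16_bits) << 16;
  float f;
  std::memcpy(&f, &f32_bits, sizeof f);
  return f;
}

int main() {
  // 0x3FC0 is the bf16 encoding of 1.5 (f32 1.5 is 0x3FC00000).
  std::printf("%f\n", bf16_to_f32(0x3FC0)); // prints 1.500000
}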
Force-pushed from 4155e87 to ef71c66.
Some nits but otherwise looks good to me.
@@ -3092,6 +3092,12 @@ InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
   return AdjustCost(
       BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));

  // For the moment we do not have lowering for SVE1-only fptrunc f64->bf16 as
  // we use fcvtx undef SVE2. Give them invalid costs.
s/undef/under/?
@@ -3092,6 +3092,12 @@ InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
   return AdjustCost(
       BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));

  // For the moment we do not have lowering for SVE1-only fptrunc f64->bf16 as
  // we use fcvtx undef SVE2. Give them invalid costs.
  if (!ST->hasSVE2() && ISD == ISD::FP_ROUND && SrcTy.isScalableVector() &&
Might be worth adding && !ST->isStreamingSVEAvailable(), because fcvtx is also available to streaming functions.
  {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f32, 1}, // bfcvt
  {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f32, 3}, // bfcvt+bfcvt+uzp1
  {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f64, 2}, // fcvtx+bfcvt
  {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f64, 5}, // fcvtx+bfcvt+bfcvt+uzp1
Suggested change:
-  {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f64, 5}, // fcvtx+bfcvt+bfcvt+uzp1
+  {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f64, 5}, // 2xfcvtx+2xbfcvt+uzp1