-
Notifications
You must be signed in to change notification settings - Fork 14.7k
[CostModel][X86] Update SK_Reverse based on cost kinds #150650
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
When these were converted to CostKindTblEntry, the throughput was mainly copied to all cost kinds. Regenerated with my check_cost_tables.py helper script.
@llvm/pr-subscribers-backend-x86 @llvm/pr-subscribers-llvm-analysis Author: Simon Pilgrim (RKSimon) Changes: When these were converted to CostKindTblEntry, the throughput was mainly copied to all cost kinds. Regenerated with my check_cost_tables.py helper script. Patch is 72.57 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/150650.diff 3 Files Affected:
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 37a7b37e4133a..0b24088b52792 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -1842,10 +1842,11 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
{ TTI::SK_Broadcast, MVT::v32f16, { 1, 1, 1, 1 } }, // vpbroadcastw
{ TTI::SK_Broadcast, MVT::v64i8, { 1, 1, 1, 1 } }, // vpbroadcastb
- { TTI::SK_Reverse, MVT::v32i16, { 2, 2, 2, 2 } }, // vpermw
- { TTI::SK_Reverse, MVT::v32f16, { 2, 2, 2, 2 } }, // vpermw
+ { TTI::SK_Reverse, MVT::v32i16, { 2, 6, 2, 4 } }, // vpermw
+ { TTI::SK_Reverse, MVT::v32f16, { 2, 6, 2, 4 } }, // vpermw
{ TTI::SK_Reverse, MVT::v16i16, { 2, 2, 2, 2 } }, // vpermw
- { TTI::SK_Reverse, MVT::v64i8, { 2, 2, 2, 2 } }, // pshufb + vshufi64x2
+ { TTI::SK_Reverse, MVT::v16f16, { 2, 2, 2, 2 } }, // vpermw
+ { TTI::SK_Reverse, MVT::v64i8, { 2, 9, 2, 3 } }, // pshufb + vshufi64x2
{ TTI::SK_PermuteSingleSrc, MVT::v32i16, { 2, 2, 2, 2 } }, // vpermw
{ TTI::SK_PermuteSingleSrc, MVT::v32f16, { 2, 2, 2, 2 } }, // vpermw
@@ -1882,10 +1883,10 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
{TTI::SK_Broadcast, MVT::v32f16, { 1, 1, 1, 1 } }, // vpbroadcastw
{TTI::SK_Broadcast, MVT::v64i8, { 1, 1, 1, 1 } }, // vpbroadcastb
- {TTI::SK_Reverse, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermpd
- {TTI::SK_Reverse, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermps
- {TTI::SK_Reverse, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermq
- {TTI::SK_Reverse, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermd
+ {TTI::SK_Reverse, MVT::v8f64, { 1, 5, 2, 3 } }, // vpermpd
+ {TTI::SK_Reverse, MVT::v16f32, { 1, 3, 2, 3 } }, // vpermps
+ {TTI::SK_Reverse, MVT::v8i64, { 1, 5, 2, 3 } }, // vpermq
+ {TTI::SK_Reverse, MVT::v16i32, { 1, 3, 2, 3 } }, // vpermd
{TTI::SK_Reverse, MVT::v32i16, { 7, 7, 7, 7 } }, // per mca
{TTI::SK_Reverse, MVT::v32f16, { 7, 7, 7, 7 } }, // per mca
{TTI::SK_Reverse, MVT::v64i8, { 7, 7, 7, 7 } }, // per mca
@@ -1981,13 +1982,13 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
{ TTI::SK_Broadcast, MVT::v16f16, { 1, 1, 1, 1 } }, // vpbroadcastw
{ TTI::SK_Broadcast, MVT::v32i8, { 1, 1, 1, 1 } }, // vpbroadcastb
- { TTI::SK_Reverse, MVT::v4f64, { 1, 1, 1, 1 } }, // vpermpd
- { TTI::SK_Reverse, MVT::v8f32, { 1, 1, 1, 1 } }, // vpermps
- { TTI::SK_Reverse, MVT::v4i64, { 1, 1, 1, 1 } }, // vpermq
- { TTI::SK_Reverse, MVT::v8i32, { 1, 1, 1, 1 } }, // vpermd
- { TTI::SK_Reverse, MVT::v16i16, { 2, 2, 2, 2 } }, // vperm2i128 + pshufb
- { TTI::SK_Reverse, MVT::v16f16, { 2, 2, 2, 2 } }, // vperm2i128 + pshufb
- { TTI::SK_Reverse, MVT::v32i8, { 2, 2, 2, 2 } }, // vperm2i128 + pshufb
+ { TTI::SK_Reverse, MVT::v4f64, { 1, 6, 1, 2 } }, // vpermpd
+ { TTI::SK_Reverse, MVT::v8f32, { 2, 7, 2, 4 } }, // vpermps
+ { TTI::SK_Reverse, MVT::v4i64, { 1, 6, 1, 2 } }, // vpermq
+ { TTI::SK_Reverse, MVT::v8i32, { 2, 7, 2, 4 } }, // vpermd
+ { TTI::SK_Reverse, MVT::v16i16, { 2, 9, 2, 4 } }, // vperm2i128 + pshufb
+ { TTI::SK_Reverse, MVT::v16f16, { 2, 9, 2, 4 } }, // vperm2i128 + pshufb
+ { TTI::SK_Reverse, MVT::v32i8, { 2, 9, 2, 4 } }, // vperm2i128 + pshufb
{ TTI::SK_Select, MVT::v16i16, { 1, 1, 1, 1 } }, // vpblendvb
{ TTI::SK_Select, MVT::v16f16, { 1, 1, 1, 1 } }, // vpblendvb
@@ -2085,15 +2086,15 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
{TTI::SK_Broadcast, MVT::v16f16, {3,3,3,3}}, // vpshuflw + vpshufd + vinsertf128
{TTI::SK_Broadcast, MVT::v32i8, {2,2,2,2}}, // vpshufb + vinsertf128
- {TTI::SK_Reverse, MVT::v4f64, {2,2,2,2}}, // vperm2f128 + vpermilpd
- {TTI::SK_Reverse, MVT::v8f32, {2,2,2,2}}, // vperm2f128 + vpermilps
- {TTI::SK_Reverse, MVT::v4i64, {2,2,2,2}}, // vperm2f128 + vpermilpd
- {TTI::SK_Reverse, MVT::v8i32, {2,2,2,2}}, // vperm2f128 + vpermilps
- {TTI::SK_Reverse, MVT::v16i16, {4,4,4,4}}, // vextractf128 + 2*pshufb
+ {TTI::SK_Reverse, MVT::v4f64, {2,6,2,2}}, // vperm2f128 + vpermilpd
+ {TTI::SK_Reverse, MVT::v8f32, {2,7,2,4}}, // vperm2f128 + vpermilps
+ {TTI::SK_Reverse, MVT::v4i64, {2,6,2,2}}, // vperm2f128 + vpermilpd
+ {TTI::SK_Reverse, MVT::v8i32, {2,7,2,4}}, // vperm2f128 + vpermilps
+ {TTI::SK_Reverse, MVT::v16i16, {2,9,5,5}}, // vextractf128 + 2*pshufb
// + vinsertf128
- {TTI::SK_Reverse, MVT::v16f16, {4,4,4,4}}, // vextractf128 + 2*pshufb
+ {TTI::SK_Reverse, MVT::v16f16, {2,9,5,5}}, // vextractf128 + 2*pshufb
// + vinsertf128
- {TTI::SK_Reverse, MVT::v32i8, {4,4,4,4}}, // vextractf128 + 2*pshufb
+ {TTI::SK_Reverse, MVT::v32i8, {2,9,5,5}}, // vextractf128 + 2*pshufb
// + vinsertf128
{TTI::SK_Select, MVT::v4i64, {1,1,1,1}}, // vblendpd
@@ -2160,9 +2161,9 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
{TTI::SK_Broadcast, MVT::v8f16, {1, 1, 1, 1}}, // pshufb
{TTI::SK_Broadcast, MVT::v16i8, {1, 1, 1, 1}}, // pshufb
- {TTI::SK_Reverse, MVT::v8i16, {1, 1, 1, 1}}, // pshufb
- {TTI::SK_Reverse, MVT::v8f16, {1, 1, 1, 1}}, // pshufb
- {TTI::SK_Reverse, MVT::v16i8, {1, 1, 1, 1}}, // pshufb
+ {TTI::SK_Reverse, MVT::v8i16, {1, 2, 1, 2}}, // pshufb
+ {TTI::SK_Reverse, MVT::v8f16, {1, 2, 1, 2}}, // pshufb
+ {TTI::SK_Reverse, MVT::v16i8, {1, 2, 1, 2}}, // pshufb
{TTI::SK_Select, MVT::v8i16, {3, 3, 3, 3}}, // 2*pshufb + por
{TTI::SK_Select, MVT::v8f16, {3, 3, 3, 3}}, // 2*pshufb + por
@@ -2199,9 +2200,9 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
{TTI::SK_Reverse, MVT::v2f64, {1, 1, 1, 1}}, // shufpd
{TTI::SK_Reverse, MVT::v2i64, {1, 1, 1, 1}}, // pshufd
{TTI::SK_Reverse, MVT::v4i32, {1, 1, 1, 1}}, // pshufd
- {TTI::SK_Reverse, MVT::v8i16, {3, 3, 3, 3}}, // pshuflw + pshufhw + pshufd
- {TTI::SK_Reverse, MVT::v8f16, {3, 3, 3, 3}}, // pshuflw + pshufhw + pshufd
- {TTI::SK_Reverse, MVT::v16i8, {9, 9, 9, 9}}, // 2*pshuflw + 2*pshufhw
+ {TTI::SK_Reverse, MVT::v8i16, {2, 3, 3, 3}}, // pshuflw + pshufhw + pshufd
+ {TTI::SK_Reverse, MVT::v8f16, {2, 3, 3, 3}}, // pshuflw + pshufhw + pshufd
+ {TTI::SK_Reverse, MVT::v16i8, {5, 6,11,11}}, // 2*pshuflw + 2*pshufhw
// + 2*pshufd + 2*unpck + packus
{TTI::SK_Select, MVT::v2i64, {1, 1, 1, 1}}, // movsd
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-reverse-fp16.ll b/llvm/test/Analysis/CostModel/X86/shuffle-reverse-fp16.ll
index 4affc7e97f6e5..f47e1a35aa087 100644
--- a/llvm/test/Analysis/CostModel/X86/shuffle-reverse-fp16.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-reverse-fp16.ll
@@ -3,11 +3,11 @@
define void @test_vXf16(<2 x half> %src32, <4 x half> %src64, <8 x half> %src128, <16 x half> %src256, <32 x half> %src512) {
; CHECK-LABEL: 'test_vXf16'
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:2 for: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:2 for: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:2 SizeLat:2 for: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT: Cost Model: Found costs of 2 for: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT: Cost Model: Found costs of 2 for: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:6 SizeLat:4 for: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> <i32 1, i32 0>
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-reverse.ll b/llvm/test/Analysis/CostModel/X86/shuffle-reverse.ll
index 1bcf2d51f098b..aca0ba3ca74e4 100644
--- a/llvm/test/Analysis/CostModel/X86/shuffle-reverse.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-reverse.ll
@@ -25,20 +25,20 @@ define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double>
;
; AVX1-LABEL: 'test_vXf64'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> <i32 1, i32 0>
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; AVX1-NEXT: Cost Model: Found costs of 4 for: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:6 SizeLat:2 for: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:4 Lat:12 SizeLat:4 for: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX2-LABEL: 'test_vXf64'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> <i32 1, i32 0>
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; AVX2-NEXT: Cost Model: Found costs of 2 for: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:6 SizeLat:2 for: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:12 SizeLat:4 for: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512-LABEL: 'test_vXf64'
; AVX512-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> <i32 1, i32 0>
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:6 SizeLat:2 for: %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:5 SizeLat:3 for: %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> <i32 1, i32 0>
@@ -56,20 +56,20 @@ define void @test_vXi64(<2 x i64> %src128, <4 x i64> %src256, <8 x i64> %src512)
;
; AVX1-LABEL: 'test_vXi64'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> <i32 1, i32 0>
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; AVX1-NEXT: Cost Model: Found costs of 4 for: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:6 SizeLat:2 for: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:4 Lat:12 SizeLat:4 for: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX2-LABEL: 'test_vXi64'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> <i32 1, i32 0>
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; AVX2-NEXT: Cost Model: Found costs of 2 for: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX2-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:6 SizeLat:2 for: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:12 SizeLat:4 for: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512-LABEL: 'test_vXi64'
; AVX512-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> <i32 1, i32 0>
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:6 SizeLat:2 for: %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:5 SizeLat:3 for: %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> <i32 1, i32 0>
@@ -89,22 +89,22 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr
; AVX1-LABEL: 'test_vXf32'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> <i32 1, i32 0>
; AVX1-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; AVX1-NEXT: Cost Model: Found costs of 2 for: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX1-NEXT: Cost Model: Found costs of 4 for: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX1-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:7 SizeLat:4 for: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:4 Lat:14 SizeLat:8 for: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX2-LABEL: 'test_vXf32'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> <i32 1, i32 0>
; AVX2-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; AVX2-NEXT: Cost Model: Found costs of 1 for: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX2-NEXT: Cost Model: Found costs of 2 for: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX2-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:7 SizeLat:4 for: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:4 Lat:14 SizeLat:8 for: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512-LABEL: 'test_vXf32'
; AVX512-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> <i32 1, i32 0>
; AVX512-NEXT: Cost Model: Found costs of 1 for: %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; AVX512-NEXT: Cost Model: Found costs of 1 for: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:7 SizeLat:4 for: %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:3 SizeLat:3 for: %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; AVX512-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> <i32 1, i32 0>
@@ -125,22 +125,22 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256,
; AVX1-LABEL: 'test_vXi32'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %V64 = shufflevector ...
[truncated]
|
You can test this locally with the following command: git-clang-format --diff HEAD~1 HEAD --extensions cpp -- llvm/lib/Target/X86/X86TargetTransformInfo.cpp
View the diff from clang-format here.
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 90791fc57..cf9557d19 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -1838,34 +1838,34 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
return LT.first * *KindCost;
static const CostKindTblEntry AVX512BWShuffleTbl[] = {
- { TTI::SK_Broadcast, MVT::v32i16, { 1, 3, 1, 1 } }, // vpbroadcastw
- { TTI::SK_Broadcast, MVT::v32f16, { 1, 3, 1, 1 } }, // vpbroadcastw
- { TTI::SK_Broadcast, MVT::v64i8, { 1, 3, 1, 1 } }, // vpbroadcastb
-
- { TTI::SK_Reverse, MVT::v32i16, { 2, 6, 2, 4 } }, // vpermw
- { TTI::SK_Reverse, MVT::v32f16, { 2, 6, 2, 4 } }, // vpermw
- { TTI::SK_Reverse, MVT::v16i16, { 2, 2, 2, 2 } }, // vpermw
- { TTI::SK_Reverse, MVT::v16f16, { 2, 2, 2, 2 } }, // vpermw
- { TTI::SK_Reverse, MVT::v64i8, { 2, 9, 2, 3 } }, // pshufb + vshufi64x2
-
- { TTI::SK_PermuteSingleSrc, MVT::v32i16, { 2, 2, 2, 2 } }, // vpermw
- { TTI::SK_PermuteSingleSrc, MVT::v32f16, { 2, 2, 2, 2 } }, // vpermw
- { TTI::SK_PermuteSingleSrc, MVT::v16i16, { 2, 2, 2, 2 } }, // vpermw
- { TTI::SK_PermuteSingleSrc, MVT::v16f16, { 2, 2, 2, 2 } }, // vpermw
- { TTI::SK_PermuteSingleSrc, MVT::v64i8, { 8, 8, 8, 8 } }, // extend to v32i16
-
- { TTI::SK_PermuteTwoSrc, MVT::v32i16,{ 2, 2, 2, 2 } }, // vpermt2w
- { TTI::SK_PermuteTwoSrc, MVT::v32f16,{ 2, 2, 2, 2 } }, // vpermt2w
- { TTI::SK_PermuteTwoSrc, MVT::v16i16,{ 2, 2, 2, 2 } }, // vpermt2w
- { TTI::SK_PermuteTwoSrc, MVT::v8i16, { 2, 2, 2, 2 } }, // vpermt2w
- { TTI::SK_PermuteTwoSrc, MVT::v64i8, { 19, 19, 19, 19 } }, // 6 * v32i8 + 1
-
- { TTI::SK_Select, MVT::v32i16, { 1, 1, 1, 1 } }, // vblendmw
- { TTI::SK_Select, MVT::v64i8, { 1, 1, 1, 1 } }, // vblendmb
-
- { TTI::SK_Splice, MVT::v32i16, { 2, 2, 2, 2 } }, // vshufi64x2 + palignr
- { TTI::SK_Splice, MVT::v32f16, { 2, 2, 2, 2 } }, // vshufi64x2 + palignr
- { TTI::SK_Splice, MVT::v64i8, { 2, 2, 2, 2 } }, // vshufi64x2 + palignr
+ {TTI::SK_Broadcast, MVT::v32i16, {1, 3, 1, 1}}, // vpbroadcastw
+ {TTI::SK_Broadcast, MVT::v32f16, {1, 3, 1, 1}}, // vpbroadcastw
+ {TTI::SK_Broadcast, MVT::v64i8, {1, 3, 1, 1}}, // vpbroadcastb
+
+ {TTI::SK_Reverse, MVT::v32i16, {2, 6, 2, 4}}, // vpermw
+ {TTI::SK_Reverse, MVT::v32f16, {2, 6, 2, 4}}, // vpermw
+ {TTI::SK_Reverse, MVT::v16i16, {2, 2, 2, 2}}, // vpermw
+ {TTI::SK_Reverse, MVT::v16f16, {2, 2, 2, 2}}, // vpermw
+ {TTI::SK_Reverse, MVT::v64i8, {2, 9, 2, 3}}, // pshufb + vshufi64x2
+
+ {TTI::SK_PermuteSingleSrc, MVT::v32i16, {2, 2, 2, 2}}, // vpermw
+ {TTI::SK_PermuteSingleSrc, MVT::v32f16, {2, 2, 2, 2}}, // vpermw
+ {TTI::SK_PermuteSingleSrc, MVT::v16i16, {2, 2, 2, 2}}, // vpermw
+ {TTI::SK_PermuteSingleSrc, MVT::v16f16, {2, 2, 2, 2}}, // vpermw
+ {TTI::SK_PermuteSingleSrc, MVT::v64i8, {8, 8, 8, 8}}, // extend to v32i16
+
+ {TTI::SK_PermuteTwoSrc, MVT::v32i16, {2, 2, 2, 2}}, // vpermt2w
+ {TTI::SK_PermuteTwoSrc, MVT::v32f16, {2, 2, 2, 2}}, // vpermt2w
+ {TTI::SK_PermuteTwoSrc, MVT::v16i16, {2, 2, 2, 2}}, // vpermt2w
+ {TTI::SK_PermuteTwoSrc, MVT::v8i16, {2, 2, 2, 2}}, // vpermt2w
+ {TTI::SK_PermuteTwoSrc, MVT::v64i8, {19, 19, 19, 19}}, // 6 * v32i8 + 1
+
+ {TTI::SK_Select, MVT::v32i16, {1, 1, 1, 1}}, // vblendmw
+ {TTI::SK_Select, MVT::v64i8, {1, 1, 1, 1}}, // vblendmb
+
+ {TTI::SK_Splice, MVT::v32i16, {2, 2, 2, 2}}, // vshufi64x2 + palignr
+ {TTI::SK_Splice, MVT::v32f16, {2, 2, 2, 2}}, // vshufi64x2 + palignr
+ {TTI::SK_Splice, MVT::v64i8, {2, 2, 2, 2}}, // vshufi64x2 + palignr
};
if (ST->hasBWI())
@@ -1875,84 +1875,84 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
return LT.first * *KindCost;
static const CostKindTblEntry AVX512ShuffleTbl[] = {
- {TTI::SK_Broadcast, MVT::v8f64, { 1, 3, 1, 1 } }, // vbroadcastsd
- {TTI::SK_Broadcast, MVT::v4f64, { 1, 3, 1, 1 } }, // vbroadcastsd
- {TTI::SK_Broadcast, MVT::v16f32, { 1, 3, 1, 1 } }, // vbroadcastss
- {TTI::SK_Broadcast, MVT::v8f32, { 1, 3, 1, 1 } }, // vbroadcastss
- {TTI::SK_Broadcast, MVT::v8i64, { 1, 3, 1, 1 } }, // vpbroadcastq
- {TTI::SK_Broadcast, MVT::v4i64, { 1, 3, 1, 1 } }, // vpbroadcastq
- {TTI::SK_Broadcast, MVT::v16i32, { 1, 3, 1, 1 } }, // vpbroadcastd
- {TTI::SK_Broadcast, MVT::v8i32, { 1, 3, 1, 1 } }, // vpbroadcastd
- {TTI::SK_Broadcast, MVT::v32i16, { 1, 3, 1, 1 } }, // vpbroadcastw
- {TTI::SK_Broadcast, MVT::v16i16, { 1, 3, 1, 1 } }, // vpbroadcastw
- {TTI::SK_Broadcast, MVT::v32f16, { 1, 3, 1, 1 } }, // vpbroadcastw
- {TTI::SK_Broadcast, MVT::v16f16, { 1, 3, 1, 1 } }, // vpbroadcastw
- {TTI::SK_Broadcast, MVT::v64i8, { 1, 3, 1, 1 } }, // vpbroadcastb
- {TTI::SK_Broadcast, MVT::v32i8, { 1, 3, 1, 1 }}, // vpbroadcastb
-
- {TTI::SK_Reverse, MVT::v8f64, { 1, 5, 2, 3 } }, // vpermpd
- {TTI::SK_Reverse, MVT::v16f32, { 1, 3, 2, 3 } }, // vpermps
- {TTI::SK_Reverse, MVT::v8i64, { 1, 5, 2, 3 } }, // vpermq
- {TTI::SK_Reverse, MVT::v16i32, { 1, 3, 2, 3 } }, // vpermd
- {TTI::SK_Reverse, MVT::v32i16, { 7, 7, 7, 7 } }, // per mca
- {TTI::SK_Reverse, MVT::v32f16, { 7, 7, 7, 7 } }, // per mca
- {TTI::SK_Reverse, MVT::v64i8, { 7, 7, 7, 7 } }, // per mca
-
- {TTI::SK_Splice, MVT::v8f64, { 1, 1, 1, 1 } }, // vpalignd
- {TTI::SK_Splice, MVT::v4f64, { 1, 1, 1, 1 } }, // vpalignd
- {TTI::SK_Splice, MVT::v16f32, { 1, 1, 1, 1 } }, // vpalignd
- {TTI::SK_Splice, MVT::v8f32, { 1, 1, 1, 1 } }, // vpalignd
- {TTI::SK_Splice, MVT::v8i64, { 1, 1, 1, 1 } }, // vpalignd
- {TTI::SK_Splice, MVT::v4i64, { 1, 1, 1, 1 } }, // vpalignd
- {TTI::SK_Splice, MVT::v16i32, { 1, 1, 1, 1 } }, // vpalignd
- {TTI::SK_Splice, MVT::v8i32, { 1, 1, 1, 1 } }, // vpalignd
- {TTI::SK_Splice, MVT::v32i16, { 4, 4, 4, 4 } }, // split + palignr
- {TTI::SK_Splice, MVT::v32f16, { 4, 4, 4, 4 } }, // split + palignr
- {TTI::SK_Splice, MVT::v64i8, { 4, 4, 4, 4 } }, // split + palignr
-
- {TTI::SK_PermuteSingleSrc, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermpd
- {TTI::SK_PermuteSingleSrc, MVT::v4f64, { 1, 3, 1, 1 } }, // vpermpd
- {TTI::SK_PermuteSingleSrc, MVT::v2f64, { 1, 3, 1, 1 } }, // vpermpd
- {TTI::SK_PermuteSingleSrc, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermps
- {TTI::SK_PermuteSingleSrc, MVT::v8f32, { 1, 3, 1, 1 } }, // vpermps
- {TTI::SK_PermuteSingleSrc, MVT::v4f32, { 1, 3, 1, 1 } }, // vpermps
- {TTI::SK_PermuteSingleSrc, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermq
- {TTI::SK_PermuteSingleSrc, MVT::v4i64, { 1, 3, 1, 1 } }, // vpermq
- {TTI::SK_PermuteSingleSrc, MVT::v2i64, { 1, 3, 1, 1 } }, // vpermq
- {TTI::SK_PermuteSingleSrc, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermd
- {TTI::SK_PermuteSingleSrc, MVT::v8i32, { 1, 3, 1, 1 } }, // vpermd
- {TTI::SK_PermuteSingleSrc, MVT::v4i32, { 1, 3, 1, 1 } }, // vpermd
- {TTI::SK_PermuteSingleSrc, MVT::v16i8, { 1, 3, 1, 1 } }, // pshufb
-
- {TTI::SK_PermuteTwoSrc, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermt2pd
- {TTI::SK_PermuteTwoSrc, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermt2ps
- {TTI::SK_PermuteTwoSrc, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermt2q
- {TTI::SK_PermuteTwoSrc, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermt2d
- {TTI::SK_PermuteTwoSrc, MVT::v4f64, { 1, 3, 1, 1 } }, // vpermt2pd
- {TTI::SK_PermuteTwoSrc, MVT::v8f32, { 1, 3, 1, 1 } }, // vpermt2ps
- {TTI::SK_PermuteTwoSrc, MVT::v4i64, { 1, 3, 1, 1 } }, // vpermt2q
- {TTI::SK_PermuteTwoSrc, MVT::v8i32, { 1, 3, 1, 1 } }, // vpermt2d
- {TTI::SK_PermuteTwoSrc, MVT::v2f64, { 1, 3, 1, 1 } }, // vpermt2pd
- {TTI::SK_PermuteTwoSrc, MVT::v4f32, { 1, 3, 1, 1 } }, // vpermt2ps
- {TTI::SK_PermuteTwoSrc, MVT::v2i64, { 1, 3, 1, 1 } }, // vpermt2q
- {TTI::SK_PermuteTwoSrc, MVT::v4i32, { 1, 3, 1, 1 } }, // vpermt2d
+ {TTI::SK_Broadcast, MVT::v8f64, {1, 3, 1, 1}}, // vbroadcastsd
+ {TTI::SK_Broadcast, MVT::v4f64, {1, 3, 1, 1}}, // vbroadcastsd
+ {TTI::SK_Broadcast, MVT::v16f32, {1, 3, 1, 1}}, // vbroadcastss
+ {TTI::SK_Broadcast, MVT::v8f32, {1, 3, 1, 1}}, // vbroadcastss
+ {TTI::SK_Broadcast, MVT::v8i64, {1, 3, 1, 1}}, // vpbroadcastq
+ {TTI::SK_Broadcast, MVT::v4i64, {1, 3, 1, 1}}, // vpbroadcastq
+ {TTI::SK_Broadcast, MVT::v16i32, {1, 3, 1, 1}}, // vpbroadcastd
+ {TTI::SK_Broadcast, MVT::v8i32, {1, 3, 1, 1}}, // vpbroadcastd
+ {TTI::SK_Broadcast, MVT::v32i16, {1, 3, 1, 1}}, // vpbroadcastw
+ {TTI::SK_Broadcast, MVT::v16i16, {1, 3, 1, 1}}, // vpbroadcastw
+ {TTI::SK_Broadcast, MVT::v32f16, {1, 3, 1, 1}}, // vpbroadcastw
+ {TTI::SK_Broadcast, MVT::v16f16, {1, 3, 1, 1}}, // vpbroadcastw
+ {TTI::SK_Broadcast, MVT::v64i8, {1, 3, 1, 1}}, // vpbroadcastb
+ {TTI::SK_Broadcast, MVT::v32i8, {1, 3, 1, 1}}, // vpbroadcastb
+
+ {TTI::SK_Reverse, MVT::v8f64, {1, 5, 2, 3}}, // vpermpd
+ {TTI::SK_Reverse, MVT::v16f32, {1, 3, 2, 3}}, // vpermps
+ {TTI::SK_Reverse, MVT::v8i64, {1, 5, 2, 3}}, // vpermq
+ {TTI::SK_Reverse, MVT::v16i32, {1, 3, 2, 3}}, // vpermd
+ {TTI::SK_Reverse, MVT::v32i16, {7, 7, 7, 7}}, // per mca
+ {TTI::SK_Reverse, MVT::v32f16, {7, 7, 7, 7}}, // per mca
+ {TTI::SK_Reverse, MVT::v64i8, {7, 7, 7, 7}}, // per mca
+
+ {TTI::SK_Splice, MVT::v8f64, {1, 1, 1, 1}}, // vpalignd
+ {TTI::SK_Splice, MVT::v4f64, {1, 1, 1, 1}}, // vpalignd
+ {TTI::SK_Splice, MVT::v16f32, {1, 1, 1, 1}}, // vpalignd
+ {TTI::SK_Splice, MVT::v8f32, {1, 1, 1, 1}}, // vpalignd
+ {TTI::SK_Splice, MVT::v8i64, {1, 1, 1, 1}}, // vpalignd
+ {TTI::SK_Splice, MVT::v4i64, {1, 1, 1, 1}}, // vpalignd
+ {TTI::SK_Splice, MVT::v16i32, {1, 1, 1, 1}}, // vpalignd
+ {TTI::SK_Splice, MVT::v8i32, {1, 1, 1, 1}}, // vpalignd
+ {TTI::SK_Splice, MVT::v32i16, {4, 4, 4, 4}}, // split + palignr
+ {TTI::SK_Splice, MVT::v32f16, {4, 4, 4, 4}}, // split + palignr
+ {TTI::SK_Splice, MVT::v64i8, {4, 4, 4, 4}}, // split + palignr
+
+ {TTI::SK_PermuteSingleSrc, MVT::v8f64, {1, 3, 1, 1}}, // vpermpd
+ {TTI::SK_PermuteSingleSrc, MVT::v4f64, {1, 3, 1, 1}}, // vpermpd
+ {TTI::SK_PermuteSingleSrc, MVT::v2f64, {1, 3, 1, 1}}, // vpermpd
+ {TTI::SK_PermuteSingleSrc, MVT::v16f32, {1, 3, 1, 1}}, // vpermps
+ {TTI::SK_PermuteSingleSrc, MVT::v8f32, {1, 3, 1, 1}}, // vpermps
+ {TTI::SK_PermuteSingleSrc, MVT::v4f32, {1, 3, 1, 1}}, // vpermps
+ {TTI::SK_PermuteSingleSrc, MVT::v8i64, {1, 3, 1, 1}}, // vpermq
+ {TTI::SK_PermuteSingleSrc, MVT::v4i64, {1, 3, 1, 1}}, // vpermq
+ {TTI::SK_PermuteSingleSrc, MVT::v2i64, {1, 3, 1, 1}}, // vpermq
+ {TTI::SK_PermuteSingleSrc, MVT::v16i32, {1, 3, 1, 1}}, // vpermd
+ {TTI::SK_PermuteSingleSrc, MVT::v8i32, {1, 3, 1, 1}}, // vpermd
+ {TTI::SK_PermuteSingleSrc, MVT::v4i32, {1, 3, 1, 1}}, // vpermd
+ {TTI::SK_PermuteSingleSrc, MVT::v16i8, {1, 3, 1, 1}}, // pshufb
+
+ {TTI::SK_PermuteTwoSrc, MVT::v8f64, {1, 3, 1, 1}}, // vpermt2pd
+ {TTI::SK_PermuteTwoSrc, MVT::v16f32, {1, 3, 1, 1}}, // vpermt2ps
+ {TTI::SK_PermuteTwoSrc, MVT::v8i64, {1, 3, 1, 1}}, // vpermt2q
+ {TTI::SK_PermuteTwoSrc, MVT::v16i32, {1, 3, 1, 1}}, // vpermt2d
+ {TTI::SK_PermuteTwoSrc, MVT::v4f64, {1, 3, 1, 1}}, // vpermt2pd
+ {TTI::SK_PermuteTwoSrc, MVT::v8f32, {1, 3, 1, 1}}, // vpermt2ps
+ {TTI::SK_PermuteTwoSrc, MVT::v4i64, {1, 3, 1, 1}}, // vpermt2q
+ {TTI::SK_PermuteTwoSrc, MVT::v8i32, {1, 3, 1, 1}}, // vpermt2d
+ {TTI::SK_PermuteTwoSrc, MVT::v2f64, {1, 3, 1, 1}}, // vpermt2pd
+ {TTI::SK_PermuteTwoSrc, MVT::v4f32, {1, 3, 1, 1}}, // vpermt2ps
+ {TTI::SK_PermuteTwoSrc, MVT::v2i64, {1, 3, 1, 1}}, // vpermt2q
+ {TTI::SK_PermuteTwoSrc, MVT::v4i32, {1, 3, 1, 1}}, // vpermt2d
// FIXME: This just applies the type legalization cost rules above
// assuming these completely split.
- {TTI::SK_PermuteSingleSrc, MVT::v32i16, { 14, 14, 14, 14 } },
- {TTI::SK_PermuteSingleSrc, MVT::v32f16, { 14, 14, 14, 14 } },
- {TTI::SK_PermuteSingleSrc, MVT::v64i8, { 14, 14, 14, 14 } },
- {TTI::SK_PermuteTwoSrc, MVT::v32i16, { 42, 42, 42, 42 } },
- {TTI::SK_PermuteTwoSrc, MVT::v32f16, { 42, 42, 42, 42 } },
- {TTI::SK_PermuteTwoSrc, MVT::v64i8, { 42, 42, 42, 42 } },
-
- {TTI::SK_Select, MVT::v32i16, { 1, 1, 1, 1 } }, // vpternlogq
- {TTI::SK_Select, MVT::v32f16, { 1, 1, 1, 1 } }, // vpternlogq
- {TTI::SK_Select, MVT::v64i8, { 1, 1, 1, 1 } }, // vpternlogq
- {TTI::SK_Select, MVT::v8f64, { 1, 1, 1, 1 } }, // vblendmpd
- {TTI::SK_Select, MVT::v16f32, { 1, 1, 1, 1 } }, // vblendmps
- {TTI::SK_Select, MVT::v8i64, { 1, 1, 1, 1 } }, // vblendmq
- {TTI::SK_Select, MVT::v16i32, { 1, 1, 1, 1 } }, // vblendmd
+ {TTI::SK_PermuteSingleSrc, MVT::v32i16, {14, 14, 14, 14}},
+ {TTI::SK_PermuteSingleSrc, MVT::v32f16, {14, 14, 14, 14}},
+ {TTI::SK_PermuteSingleSrc, MVT::v64i8, {14, 14, 14, 14}},
+ {TTI::SK_PermuteTwoSrc, MVT::v32i16, {42, 42, 42, 42}},
+ {TTI::SK_PermuteTwoSrc, MVT::v32f16, {42, 42, 42, 42}},
+ {TTI::SK_PermuteTwoSrc, MVT::v64i8, {42, 42, 42, 42}},
+
+ {TTI::SK_Select, MVT::v32i16, {1, 1, 1, 1}}, // vpternlogq
+ {TTI::SK_Select, MVT::v32f16, {1, 1, 1, 1}}, // vpternlogq
+ {TTI::SK_Select, MVT::v64i8, {1, 1, 1, 1}}, // vpternlogq
+ {TTI::SK_Select, MVT::v8f64, {1, 1, 1, 1}}, // vblendmpd
+ {TTI::SK_Select, MVT::v16f32, {1, 1, 1, 1}}, // vblendmps
+ {TTI::SK_Select, MVT::v8i64, {1, 1, 1, 1}}, // vblendmq
+ {TTI::SK_Select, MVT::v16i32, {1, 1, 1, 1}}, // vblendmd
};
if (ST->hasAVX512())
@@ -1981,50 +1981,50 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
return LT.first * *KindCost;
static const CostKindTblEntry AVX2ShuffleTbl[] = {
- { TTI::SK_Broadcast, MVT::v4f64, { 1, 3, 1, 2 } }, // vbroadcastpd
- { TTI::SK_Broadcast, MVT::v8f32, { 1, 3, 1, 2 } }, // vbroadcastps
- { TTI::SK_Broadcast, MVT::v4i64, { 1, 3, 1, 2 } }, // vpbroadcastq
- { TTI::SK_Broadcast, MVT::v8i32, { 1, 3, 1, 2 } }, // vpbroadcastd
- { TTI::SK_Broadcast, MVT::v16i16, { 1, 3, 1, 2 } }, // vpbroadcastw
- { TTI::SK_Broadcast, MVT::v8i16, { 1, 3, 1, 1 } }, // vpbroadcastw
- { TTI::SK_Broadcast, MVT::v16f16, { 1, 3, 1, 2 } }, // vpbroadcastw
- { TTI::SK_Broadcast, MVT::v8f16, { 1, 3, 1, 1 } }, // vpbroadcastw
- { TTI::SK_Broadcast, MVT::v32i8, { 1, 3, 1, 2 } }, // vpbroadcastb
- { TTI::SK_Broadcast, MVT::v16i8, { 1, 3, 1, 1 } }, // vpbroadcastb
-
- { TTI::SK_Reverse, MVT::v4f64, { 1, 6, 1, 2 } }, // vpermpd
- { TTI::SK_Reverse, MVT::v8f32, { 2, 7, 2, 4 } }, // vpermps
- { TTI::SK_Reverse, MVT::v4i64, { 1, 6, 1, 2 } }, // vpermq
- { TTI::SK_Reverse, MVT::v8i32, { 2, 7, 2, 4 } }, // vpermd
- { TTI::SK_Reverse, MVT::v16i16, { 2, 9, 2, 4 } }, // vperm2i128 + pshufb
- { TTI::SK_Reverse, MVT::v16f16, { 2, 9, 2, 4 } }, // vperm2i128 + pshufb
- { TTI::SK_Reverse, MVT::v32i8, { 2, 9, 2, 4 } }, // vperm2i128 + pshufb
-
- { TTI::SK_Select, MVT::v16i16, { 1, 1, 1, 1 } }, // vpblendvb
- { TTI::SK_Select, MVT::v16f16, { 1, 1, 1, 1 } }, // vpblendvb
- { TTI::SK_Select, MVT::v32i8, { 1, 1, 1, 1 } }, // vpblendvb
-
- { TTI::SK_Splice, MVT::v8i32, { 2, 2, 2, 2 } }, // vperm2i128 + vpalignr
- { TTI::SK_Splice, MVT::v8f32, { 2, 2, 2, 2 } }, // vperm2i128 + vpalignr
- { TTI::SK_Splice, MVT::v16i16, { 2, 2, 2, 2 } }, // vperm2i128 + vpalignr
- { TTI::SK_Splice, MVT::v16f16, { 2, 2, 2, 2 } }, // vperm2i128 + vpalignr
- { TTI::SK_Splice, MVT::v32i8, { 2, 2, 2, 2 } }, // vperm2i128 + vpalignr
-
- { TTI::SK_PermuteSingleSrc, MVT::v4f64, { 1, 1, 1, 1 } }, // vpermpd
- { TTI::SK_PermuteSingleSrc, MVT::v8f32, { 1, 1, 1, 1 } }, // vpermps
- { TTI::SK_PermuteSingleSrc, MVT::v4i64, { 1, 1, 1, 1 } }, // vpermq
- { TTI::SK_PermuteSingleSrc, MVT::v8i32, { 1, 1, 1, 1 } }, // vpermd
- { TTI::SK_PermuteSingleSrc, MVT::v16i16, { 4, 4, 4, 4 } },
- { TTI::SK_PermuteSingleSrc, MVT::v16f16, { 4, 4, 4, 4 } },
- { TTI::SK_PermuteSingleSrc, MVT::v32i8, { 4, 4, 4, 4 } },
-
- { TTI::SK_PermuteTwoSrc, MVT::v4f64, { 3, 3, 3, 3 } }, // 2*vpermpd + vblendpd
- { TTI::SK_PermuteTwoSrc, MVT::v8f32, { 3, 3, 3, 3 } }, // 2*vpermps + vblendps
- { TTI::SK_PermuteTwoSrc, MVT::v4i64, { 3, 3, 3, 3 } }, // 2*vpermq + vpblendd
- { TTI::SK_PermuteTwoSrc, MVT::v8i32, { 3, 3, 3, 3 } }, // 2*vpermd + vpblendd
- { TTI::SK_PermuteTwoSrc, MVT::v16i16, { 7, 7, 7, 7 } },
- { TTI::SK_PermuteTwoSrc, MVT::v16f16, { 7, 7, 7, 7 } },
- { TTI::SK_PermuteTwoSrc, MVT::v32i8, { 7, 7, 7, 7 } },
+ {TTI::SK_Broadcast, MVT::v4f64, {1, 3, 1, 2}}, // vbroadcastpd
+ {TTI::SK_Broadcast, MVT::v8f32, {1, 3, 1, 2}}, // vbroadcastps
+ {TTI::SK_Broadcast, MVT::v4i64, {1, 3, 1, 2}}, // vpbroadcastq
+ {TTI::SK_Broadcast, MVT::v8i32, {1, 3, 1, 2}}, // vpbroadcastd
+ {TTI::SK_Broadcast, MVT::v16i16, {1, 3, 1, 2}}, // vpbroadcastw
+ {TTI::SK_Broadcast, MVT::v8i16, {1, 3, 1, 1}}, // vpbroadcastw
+ {TTI::SK_Broadcast, MVT::v16f16, {1, 3, 1, 2}}, // vpbroadcastw
+ {TTI::SK_Broadcast, MVT::v8f16, {1, 3, 1, 1}}, // vpbroadcastw
+ {TTI::SK_Broadcast, MVT::v32i8, {1, 3, 1, 2}}, // vpbroadcastb
+ {TTI::SK_Broadcast, MVT::v16i8, {1, 3, 1, 1}}, // vpbroadcastb
+
+ {TTI::SK_Reverse, MVT::v4f64, {1, 6, 1, 2}}, // vpermpd
+ {TTI::SK_Reverse, MVT::v8f32, {2, 7, 2, 4}}, // vpermps
+ {TTI::SK_Reverse, MVT::v4i64, {1, 6, 1, 2}}, // vpermq
+ {TTI::SK_Reverse, MVT::v8i32, {2, 7, 2, 4}}, // vpermd
+ {TTI::SK_Reverse, MVT::v16i16, {2, 9, 2, 4}}, // vperm2i128 + pshufb
+ {TTI::SK_Reverse, MVT::v16f16, {2, 9, 2, 4}}, // vperm2i128 + pshufb
+ {TTI::SK_Reverse, MVT::v32i8, {2, 9, 2, 4}}, // vperm2i128 + pshufb
+
+ {TTI::SK_Select, MVT::v16i16, {1, 1, 1, 1}}, // vpblendvb
+ {TTI::SK_Select, MVT::v16f16, {1, 1, 1, 1}}, // vpblendvb
+ {TTI::SK_Select, MVT::v32i8, {1, 1, 1, 1}}, // vpblendvb
+
+ {TTI::SK_Splice, MVT::v8i32, {2, 2, 2, 2}}, // vperm2i128 + vpalignr
+ {TTI::SK_Splice, MVT::v8f32, {2, 2, 2, 2}}, // vperm2i128 + vpalignr
+ {TTI::SK_Splice, MVT::v16i16, {2, 2, 2, 2}}, // vperm2i128 + vpalignr
+ {TTI::SK_Splice, MVT::v16f16, {2, 2, 2, 2}}, // vperm2i128 + vpalignr
+ {TTI::SK_Splice, MVT::v32i8, {2, 2, 2, 2}}, // vperm2i128 + vpalignr
+
+ {TTI::SK_PermuteSingleSrc, MVT::v4f64, {1, 1, 1, 1}}, // vpermpd
+ {TTI::SK_PermuteSingleSrc, MVT::v8f32, {1, 1, 1, 1}}, // vpermps
+ {TTI::SK_PermuteSingleSrc, MVT::v4i64, {1, 1, 1, 1}}, // vpermq
+ {TTI::SK_PermuteSingleSrc, MVT::v8i32, {1, 1, 1, 1}}, // vpermd
+ {TTI::SK_PermuteSingleSrc, MVT::v16i16, {4, 4, 4, 4}},
+ {TTI::SK_PermuteSingleSrc, MVT::v16f16, {4, 4, 4, 4}},
+ {TTI::SK_PermuteSingleSrc, MVT::v32i8, {4, 4, 4, 4}},
+
+ {TTI::SK_PermuteTwoSrc, MVT::v4f64, {3, 3, 3, 3}}, // 2*vpermpd + vblendpd
+ {TTI::SK_PermuteTwoSrc, MVT::v8f32, {3, 3, 3, 3}}, // 2*vpermps + vblendps
+ {TTI::SK_PermuteTwoSrc, MVT::v4i64, {3, 3, 3, 3}}, // 2*vpermq + vpblendd
+ {TTI::SK_PermuteTwoSrc, MVT::v8i32, {3, 3, 3, 3}}, // 2*vpermd + vpblendd
+ {TTI::SK_PermuteTwoSrc, MVT::v16i16, {7, 7, 7, 7}},
+ {TTI::SK_PermuteTwoSrc, MVT::v16f16, {7, 7, 7, 7}},
+ {TTI::SK_PermuteTwoSrc, MVT::v32i8, {7, 7, 7, 7}},
};
if (ST->hasAVX2())
@@ -2088,62 +2088,100 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
return LT.first * *KindCost;
static const CostKindTblEntry AVX1ShuffleTbl[] = {
- {TTI::SK_Broadcast, MVT::v4f64, {2,3,2,3}}, // vperm2f128 + vpermilpd
- {TTI::SK_Broadcast, MVT::v8f32, {2,3,2,3}}, // vperm2f128 + vpermilps
- {TTI::SK_Broadcast, MVT::v4i64, {2,3,2,3}}, // vperm2f128 + vpermilpd
- {TTI::SK_Broadcast, MVT::v8i32, {2,3,2,3}}, // vperm2f128 + vpermilps
- {TTI::SK_Broadcast, MVT::v16i16, {2,3,3,4}}, // vpshuflw + vpshufd + vinsertf128
- {TTI::SK_Broadcast, MVT::v16f16, {2,3,3,4}}, // vpshuflw + vpshufd + vinsertf128
- {TTI::SK_Broadcast, MVT::v32i8, {3,4,3,6}}, // vpshufb + vinsertf128
-
- {TTI::SK_Reverse, MVT::v4f64, {2,6,2,2}}, // vperm2f128 + vpermilpd
- {TTI::SK_Reverse, MVT::v8f32, {2,7,2,4}}, // vperm2f128 + vpermilps
- {TTI::SK_Reverse, MVT::v4i64, {2,6,2,2}}, // vperm2f128 + vpermilpd
- {TTI::SK_Reverse, MVT::v8i32, {2,7,2,4}}, // vperm2f128 + vpermilps
- {TTI::SK_Reverse, MVT::v16i16, {2,9,5,5}}, // vextractf128 + 2*pshufb
- // + vinsertf128
- {TTI::SK_Reverse, MVT::v16f16, {2,9,5,5}}, // vextractf128 + 2*pshufb
- // + vinsertf128
- {TTI::SK_Reverse, MVT::v32i8, {2,9,5,5}}, // vextractf128 + 2*pshufb
- // + vinsertf128
-
- {TTI::SK_Select, MVT::v4i64, {1,1,1,1}}, // vblendpd
- {TTI::SK_Select, MVT::v4f64, {1,1,1,1}}, // vblendpd
- {TTI::SK_Select, MVT::v8i32, {1,1,1,1}}, // vblendps
- {TTI::SK_Select, MVT::v8f32, {1,1,1,1}}, // vblendps
- {TTI::SK_Select, MVT::v16i16, {3,3,3,3}}, // vpand + vpandn + vpor
- {TTI::SK_Select, MVT::v16f16, {3,3,3,3}}, // vpand + vpandn + vpor
- {TTI::SK_Select, MVT::v32i8, {3,3,3,3}}, // vpand + vpandn + vpor
-
- {TTI::SK_Splice, MVT::v4i64, {2,2,2,2}}, // vperm2f128 + shufpd
- {TTI::SK_Splice, MVT::v4f64, {2,2,2,2}}, // vperm2f128 + shufpd
- {TTI::SK_Splice, MVT::v8i32, {4,4,4,4}}, // 2*vperm2f128 + 2*vshufps
- {TTI::SK_Splice, MVT::v8f32, {4,4,4,4}}, // 2*vperm2f128 + 2*vshufps
- {TTI::SK_Splice, MVT::v16i16, {5,5,5,5}}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
- {TTI::SK_Splice, MVT::v16f16, {5,5,5,5}}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
- {TTI::SK_Splice, MVT::v32i8, {5,5,5,5}}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
-
- {TTI::SK_PermuteSingleSrc, MVT::v4f64, {2,2,2,2}}, // vperm2f128 + vshufpd
- {TTI::SK_PermuteSingleSrc, MVT::v4i64, {2,2,2,2}}, // vperm2f128 + vshufpd
- {TTI::SK_PermuteSingleSrc, MVT::v8f32, {4,4,4,4}}, // 2*vperm2f128 + 2*vshufps
- {TTI::SK_PermuteSingleSrc, MVT::v8i32, {4,4,4,4}}, // 2*vperm2f128 + 2*vshufps
- {TTI::SK_PermuteSingleSrc, MVT::v16i16,{8,8,8,8}}, // vextractf128 + 4*pshufb
- // + 2*por + vinsertf128
- {TTI::SK_PermuteSingleSrc, MVT::v16f16,{8,8,8,8}}, // vextractf128 + 4*pshufb
- // + 2*por + vinsertf128
- {TTI::SK_PermuteSingleSrc, MVT::v32i8, {8,8,8,8}}, // vextractf128 + 4*pshufb
- // + 2*por + vinsertf128
-
- {TTI::SK_PermuteTwoSrc, MVT::v4f64, {3,3,3,3}}, // 2*vperm2f128 + vshufpd
- {TTI::SK_PermuteTwoSrc, MVT::v4i64, {3,3,3,3}}, // 2*vperm2f128 + vshufpd
- {TTI::SK_PermuteTwoSrc, MVT::v8f32, {4,4,4,4}}, // 2*vperm2f128 + 2*vshufps
- {TTI::SK_PermuteTwoSrc, MVT::v8i32, {4,4,4,4}}, // 2*vperm2f128 + 2*vshufps
- {TTI::SK_PermuteTwoSrc, MVT::v16i16,{15,15,15,15}}, // 2*vextractf128 + 8*pshufb
- // + 4*por + vinsertf128
- {TTI::SK_PermuteTwoSrc, MVT::v16f16,{15,15,15,15}}, // 2*vextractf128 + 8*pshufb
- // + 4*por + vinsertf128
- {TTI::SK_PermuteTwoSrc, MVT::v32i8, {15,15,15,15}}, // 2*vextractf128 + 8*pshufb
- // + 4*por + vinsertf128
+ {TTI::SK_Broadcast, MVT::v4f64, {2, 3, 2, 3}}, // vperm2f128 + vpermilpd
+ {TTI::SK_Broadcast, MVT::v8f32, {2, 3, 2, 3}}, // vperm2f128 + vpermilps
+ {TTI::SK_Broadcast, MVT::v4i64, {2, 3, 2, 3}}, // vperm2f128 + vpermilpd
+ {TTI::SK_Broadcast, MVT::v8i32, {2, 3, 2, 3}}, // vperm2f128 + vpermilps
+ {TTI::SK_Broadcast,
+ MVT::v16i16,
+ {2, 3, 3, 4}}, // vpshuflw + vpshufd + vinsertf128
+ {TTI::SK_Broadcast,
+ MVT::v16f16,
+ {2, 3, 3, 4}}, // vpshuflw + vpshufd + vinsertf128
+ {TTI::SK_Broadcast, MVT::v32i8, {3, 4, 3, 6}}, // vpshufb + vinsertf128
+
+ {TTI::SK_Reverse, MVT::v4f64, {2, 6, 2, 2}}, // vperm2f128 + vpermilpd
+ {TTI::SK_Reverse, MVT::v8f32, {2, 7, 2, 4}}, // vperm2f128 + vpermilps
+ {TTI::SK_Reverse, MVT::v4i64, {2, 6, 2, 2}}, // vperm2f128 + vpermilpd
+ {TTI::SK_Reverse, MVT::v8i32, {2, 7, 2, 4}}, // vperm2f128 + vpermilps
+ {TTI::SK_Reverse, MVT::v16i16, {2, 9, 5, 5}}, // vextractf128 + 2*pshufb
+ // + vinsertf128
+ {TTI::SK_Reverse, MVT::v16f16, {2, 9, 5, 5}}, // vextractf128 + 2*pshufb
+ // + vinsertf128
+ {TTI::SK_Reverse, MVT::v32i8, {2, 9, 5, 5}}, // vextractf128 + 2*pshufb
+ // + vinsertf128
+
+ {TTI::SK_Select, MVT::v4i64, {1, 1, 1, 1}}, // vblendpd
+ {TTI::SK_Select, MVT::v4f64, {1, 1, 1, 1}}, // vblendpd
+ {TTI::SK_Select, MVT::v8i32, {1, 1, 1, 1}}, // vblendps
+ {TTI::SK_Select, MVT::v8f32, {1, 1, 1, 1}}, // vblendps
+ {TTI::SK_Select, MVT::v16i16, {3, 3, 3, 3}}, // vpand + vpandn + vpor
+ {TTI::SK_Select, MVT::v16f16, {3, 3, 3, 3}}, // vpand + vpandn + vpor
+ {TTI::SK_Select, MVT::v32i8, {3, 3, 3, 3}}, // vpand + vpandn + vpor
+
+ {TTI::SK_Splice, MVT::v4i64, {2, 2, 2, 2}}, // vperm2f128 + shufpd
+ {TTI::SK_Splice, MVT::v4f64, {2, 2, 2, 2}}, // vperm2f128 + shufpd
+ {TTI::SK_Splice, MVT::v8i32, {4, 4, 4, 4}}, // 2*vperm2f128 + 2*vshufps
+ {TTI::SK_Splice, MVT::v8f32, {4, 4, 4, 4}}, // 2*vperm2f128 + 2*vshufps
+ {TTI::SK_Splice,
+ MVT::v16i16,
+ {5, 5, 5, 5}}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
+ {TTI::SK_Splice,
+ MVT::v16f16,
+ {5, 5, 5, 5}}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
+ {TTI::SK_Splice,
+ MVT::v32i8,
+ {5, 5, 5, 5}}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
+
+ {TTI::SK_PermuteSingleSrc,
+ MVT::v4f64,
+ {2, 2, 2, 2}}, // vperm2f128 + vshufpd
+ {TTI::SK_PermuteSingleSrc,
+ MVT::v4i64,
+ {2, 2, 2, 2}}, // vperm2f128 + vshufpd
+ {TTI::SK_PermuteSingleSrc,
+ MVT::v8f32,
+ {4, 4, 4, 4}}, // 2*vperm2f128 + 2*vshufps
+ {TTI::SK_PermuteSingleSrc,
+ MVT::v8i32,
+ {4, 4, 4, 4}}, // 2*vperm2f128 + 2*vshufps
+ {TTI::SK_PermuteSingleSrc,
+ MVT::v16i16,
+ {8, 8, 8, 8}}, // vextractf128 + 4*pshufb
+ // + 2*por + vinsertf128
+ {TTI::SK_PermuteSingleSrc,
+ MVT::v16f16,
+ {8, 8, 8, 8}}, // vextractf128 + 4*pshufb
+ // + 2*por + vinsertf128
+ {TTI::SK_PermuteSingleSrc,
+ MVT::v32i8,
+ {8, 8, 8, 8}}, // vextractf128 + 4*pshufb
+ // + 2*por + vinsertf128
+
+ {TTI::SK_PermuteTwoSrc,
+ MVT::v4f64,
+ {3, 3, 3, 3}}, // 2*vperm2f128 + vshufpd
+ {TTI::SK_PermuteTwoSrc,
+ MVT::v4i64,
+ {3, 3, 3, 3}}, // 2*vperm2f128 + vshufpd
+ {TTI::SK_PermuteTwoSrc,
+ MVT::v8f32,
+ {4, 4, 4, 4}}, // 2*vperm2f128 + 2*vshufps
+ {TTI::SK_PermuteTwoSrc,
+ MVT::v8i32,
+ {4, 4, 4, 4}}, // 2*vperm2f128 + 2*vshufps
+ {TTI::SK_PermuteTwoSrc,
+ MVT::v16i16,
+ {15, 15, 15, 15}}, // 2*vextractf128 + 8*pshufb
+ // + 4*por + vinsertf128
+ {TTI::SK_PermuteTwoSrc,
+ MVT::v16f16,
+ {15, 15, 15, 15}}, // 2*vextractf128 + 8*pshufb
+ // + 4*por + vinsertf128
+ {TTI::SK_PermuteTwoSrc,
+ MVT::v32i8,
+ {15, 15, 15, 15}}, // 2*vextractf128 + 8*pshufb
+ // + 4*por + vinsertf128
};
if (ST->hasAVX())
@@ -2212,8 +2250,10 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
{TTI::SK_Reverse, MVT::v4i32, {1, 1, 1, 1}}, // pshufd
{TTI::SK_Reverse, MVT::v8i16, {2, 3, 3, 3}}, // pshuflw + pshufhw + pshufd
{TTI::SK_Reverse, MVT::v8f16, {2, 3, 3, 3}}, // pshuflw + pshufhw + pshufd
- {TTI::SK_Reverse, MVT::v16i8, {5, 6,11,11}}, // 2*pshuflw + 2*pshufhw
- // + 2*pshufd + 2*unpck + packus
+ {TTI::SK_Reverse,
+ MVT::v16i8,
+ {5, 6, 11, 11}}, // 2*pshuflw + 2*pshufhw
+ // + 2*pshufd + 2*unpck + packus
{TTI::SK_Select, MVT::v2i64, {1, 1, 1, 1}}, // movsd
{TTI::SK_Select, MVT::v2f64, {1, 1, 1, 1}}, // movsd
@@ -2232,16 +2272,24 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
{TTI::SK_PermuteSingleSrc, MVT::v2f64, {1, 1, 1, 1}}, // shufpd
{TTI::SK_PermuteSingleSrc, MVT::v2i64, {1, 1, 1, 1}}, // pshufd
{TTI::SK_PermuteSingleSrc, MVT::v4i32, {1, 1, 1, 1}}, // pshufd
- {TTI::SK_PermuteSingleSrc, MVT::v8i16, {3, 5, 5, 5}}, // 2*pshuflw + 2*pshufhw
- // + pshufd/unpck
- {TTI::SK_PermuteSingleSrc, MVT::v8f16, {3, 5, 5, 5}}, // 2*pshuflw + 2*pshufhw
- // + pshufd/unpck
- {TTI::SK_PermuteSingleSrc, MVT::v16i8, {8, 10, 10, 10}}, // 2*pshuflw + 2*pshufhw
- // + 2*pshufd + 2*unpck + 2*packus
-
- {TTI::SK_PermuteTwoSrc, MVT::v2f64, {1, 1, 1, 1}}, // shufpd
- {TTI::SK_PermuteTwoSrc, MVT::v2i64, {1, 1, 1, 1}}, // shufpd
- {TTI::SK_PermuteTwoSrc, MVT::v4i32, {2, 2, 2, 2}}, // 2*{unpck,movsd,pshufd}
+ {TTI::SK_PermuteSingleSrc,
+ MVT::v8i16,
+ {3, 5, 5, 5}}, // 2*pshuflw + 2*pshufhw
+ // + pshufd/unpck
+ {TTI::SK_PermuteSingleSrc,
+ MVT::v8f16,
+ {3, 5, 5, 5}}, // 2*pshuflw + 2*pshufhw
+ // + pshufd/unpck
+ {TTI::SK_PermuteSingleSrc,
+ MVT::v16i8,
+ {8, 10, 10, 10}}, // 2*pshuflw + 2*pshufhw
+ // + 2*pshufd + 2*unpck + 2*packus
+
+ {TTI::SK_PermuteTwoSrc, MVT::v2f64, {1, 1, 1, 1}}, // shufpd
+ {TTI::SK_PermuteTwoSrc, MVT::v2i64, {1, 1, 1, 1}}, // shufpd
+ {TTI::SK_PermuteTwoSrc,
+ MVT::v4i32,
+ {2, 2, 2, 2}}, // 2*{unpck,movsd,pshufd}
{TTI::SK_PermuteTwoSrc, MVT::v8i16, {6, 8, 8, 8}}, // blend+permute
{TTI::SK_PermuteTwoSrc, MVT::v8f16, {6, 8, 8, 8}}, // blend+permute
{TTI::SK_PermuteTwoSrc, MVT::v16i8, {11, 13, 13, 13}}, // blend+permute
|
When these were converted to CostKindTblEntry, the throughput cost was mainly copied to all cost kinds. Regenerated with my check_cost_tables.py helper script.
When these were converted to CostKindTblEntry, the throughput cost was mainly copied to all cost kinds.
Regenerated with my check_cost_tables.py helper script.