Skip to content

Commit c3e7a1a

Browse files
Himadhithhimadhith
andauthored
[NFC][PowerPC] Optimize vector compares for not equal to non zero vectors (#171635)
Lock down the instructions currently generated for vector compares of the form `not equal to non-zero` (e.g. `vec[i] != 7`). The current implementation can be improved by removing the negation and using the identities (in 16-bit wrap-around arithmetic) ``` 0xFFFF + 1 = 0 and 0 + 1 = 1 ``` Co-authored-by: himadhith <himadhith.v@ibm.com>
1 parent cdfdb06 commit c3e7a1a

File tree

1 file changed

+74
-0
lines changed

1 file changed

+74
-0
lines changed
Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
2+
; RUN: llc -verify-machineinstrs -mcpu=pwr9 -mtriple=powerpc64le-unknown-linux-gnu \
3+
; RUN: -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=POWERPC_64LE
4+
5+
; RUN: llc -verify-machineinstrs -mcpu=pwr9 -mtriple=powerpc64-ibm-aix \
6+
; RUN: -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=POWERPC_64
7+
8+
; RUN: llc -verify-machineinstrs -mcpu=pwr9 -mtriple=powerpc-ibm-aix \
9+
; RUN: -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=POWERPC_32
10+
11+
; The current implementation compares the input vector in register v2 with the zero vector in v3. The comparison result in v2 is then negated and masked with 1, which converts each element as follows:
12+
; 0XFFFF -> 0
13+
; 0 -> 1
14+
; An optimized version will follow in a later patch; this NFC patch only records the current code generation.
15+
16+
; Counts the non-zero elements of a <4 x i16> vector and returns the count as
; an i32: each lane is compared against zero, the <4 x i1> result is
; zero-extended to <4 x i32> (1 for non-zero, 0 for zero), and the lanes are
; summed with llvm.vector.reduce.add.
; The assertion lines inside this function are autogenerated (see the NOTE at
; the top of the file); regenerate them with the script instead of editing
; them by hand.
define i32 @cols_needed(<4 x i16> %wide.load) {
17+
; POWERPC_64LE-LABEL: cols_needed:
18+
; POWERPC_64LE: # %bb.0: # %entry
19+
; POWERPC_64LE-NEXT: xxlxor v3, v3, v3
20+
; POWERPC_64LE-NEXT: li r3, 0
21+
; POWERPC_64LE-NEXT: vcmpequh v2, v2, v3
22+
; POWERPC_64LE-NEXT: vspltisw v3, 1
23+
; POWERPC_64LE-NEXT: xxlnor v2, v2, v2
24+
; POWERPC_64LE-NEXT: vmrglh v2, v2, v2
25+
; POWERPC_64LE-NEXT: xxland v2, v2, v3
26+
; POWERPC_64LE-NEXT: xxswapd v3, v2
27+
; POWERPC_64LE-NEXT: vadduwm v2, v2, v3
28+
; POWERPC_64LE-NEXT: xxspltw v3, v2, 2
29+
; POWERPC_64LE-NEXT: vadduwm v2, v2, v3
30+
; POWERPC_64LE-NEXT: vextuwrx r3, r3, v2
31+
; POWERPC_64LE-NEXT: blr
32+
;
33+
; POWERPC_64-LABEL: cols_needed:
34+
; POWERPC_64: # %bb.0: # %entry
35+
; POWERPC_64-NEXT: xxlxor v3, v3, v3
36+
; POWERPC_64-NEXT: li r3, 0
37+
; POWERPC_64-NEXT: vcmpequh v2, v2, v3
38+
; POWERPC_64-NEXT: vspltisw v3, 1
39+
; POWERPC_64-NEXT: xxlnor v2, v2, v2
40+
; POWERPC_64-NEXT: vmrghh v2, v2, v2
41+
; POWERPC_64-NEXT: xxland v2, v2, v3
42+
; POWERPC_64-NEXT: xxswapd v3, v2
43+
; POWERPC_64-NEXT: vadduwm v2, v2, v3
44+
; POWERPC_64-NEXT: xxspltw v3, v2, 1
45+
; POWERPC_64-NEXT: vadduwm v2, v2, v3
46+
; POWERPC_64-NEXT: vextuwlx r3, r3, v2
47+
; POWERPC_64-NEXT: blr
48+
;
49+
; POWERPC_32-LABEL: cols_needed:
50+
; POWERPC_32: # %bb.0: # %entry
51+
; POWERPC_32-NEXT: xxlxor v3, v3, v3
52+
; POWERPC_32-NEXT: vcmpequh v2, v2, v3
53+
; POWERPC_32-NEXT: vspltisw v3, 1
54+
; POWERPC_32-NEXT: xxlnor v2, v2, v2
55+
; POWERPC_32-NEXT: vmrghh v2, v2, v2
56+
; POWERPC_32-NEXT: xxland v2, v2, v3
57+
; POWERPC_32-NEXT: xxswapd v3, v2
58+
; POWERPC_32-NEXT: vadduwm v2, v2, v3
59+
; POWERPC_32-NEXT: xxspltw v3, v2, 1
60+
; POWERPC_32-NEXT: vadduwm v2, v2, v3
61+
; POWERPC_32-NEXT: stxv v2, -16(r1)
62+
; POWERPC_32-NEXT: lwz r3, -16(r1)
63+
; POWERPC_32-NEXT: blr
64+
entry:
65+
; Lane-wise test: true where the i16 element differs from zero.
%0 = icmp ne <4 x i16> %wide.load, zeroinitializer
66+
; Widen each i1 to i32 so a true lane contributes exactly 1 to the sum.
%1 = zext <4 x i1> %0 to <4 x i32>
67+
; Horizontal add of the four lanes gives the number of non-zero elements.
%2 = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %1)
68+
ret i32 %2
69+
}
70+
71+
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
72+
declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) #0
73+
74+
attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }

0 commit comments

Comments
 (0)