From c1565af764adceca118daad0f592e5f14c2bdd4a Mon Sep 17 00:00:00 2001
From: Naveen Saini <naveen.kumar.saini@intel.com>
Date: Fri, 27 Aug 2021 12:15:09 +0800
Subject: [PATCH 2/2] [X86] Convert vXi1 vectors to xmm/ymm/zmm types via
getRegisterTypeForCallingConv rather than using CCPromoteToType in the td
file

Previously we tried to promote these to xmm/ymm/zmm by using
CCPromoteToType in the X86CallingConv.td file. But this breaks when we
run out of xmm/ymm/zmm registers and need to fall back to memory: we
end up trying to create a nonsensical scalar-to-vector conversion,
which leads to an assertion. The new tests in avx512-calling-conv.ll
all trigger this assertion.

Since we really want to treat these types like we do on avx2, it seems
better to promote them before the calling convention code gets
involved, except when the calling convention is one that passes the
vXi1 type in a k register.

The changes in avx512-regcall-Mask.ll are because we indicated that
xmm/ymm/zmm types should be passed indirectly for the Win64 ABI before
reaching the common lines that promoted the vXi1 types. This caused
the promoted types to be picked up by the default calling convention
code. Now we promote them earlier, so they are passed indirectly as
though they were xmm/ymm/zmm.

Differential Revision: https://reviews.llvm.org/D75154

Upstream-Status: Backport [https://github.com/llvm/llvm-project/commit/eadea7868f5b7542ee6bdcd9a975697a0c919ffc]

Signed-off-by: Craig Topper <craig.topper@intel.com>
Signed-off-by: Naveen Saini <naveen.kumar.saini@intel.com>
---
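Notes (these sit below the "---" marker, so git am drops them and they
do not become part of the commit message):

As an illustrative sketch, not one of the actual tests added upstream,
the assertion described above can be reached by passing more vXi1 mask
arguments than there are xmm argument registers, forcing the tail of
the argument list into memory. On an AVX-512 target using the SysV
x86-64 ABI there are eight xmm argument registers (xmm0-xmm7), so a
ninth v16i1 argument must be passed in memory (the function name
many_masks is made up for this example):

  define <16 x i1> @many_masks(<16 x i1> %a, <16 x i1> %b, <16 x i1> %c,
                               <16 x i1> %d, <16 x i1> %e, <16 x i1> %f,
                               <16 x i1> %g, <16 x i1> %h, <16 x i1> %i) {
    ret <16 x i1> %i
  }

With this change each v16i1 is promoted to v16i8 in
getRegisterTypeForCallingConv before the calling convention code runs,
so the memory fallback sees an ordinary xmm-sized vector. For
reference, the mapping installed by handleMaskRegisterForCallingConv
in the diff below is: v2i1 -> v2i64, v4i1 -> v4i32, v8i1 -> v8i16,
v16i1 -> v16i8 (xmm), v32i1 -> v32i8 (ymm), and v64i1 -> v64i8 (zmm,
or two v32i8 halves without useAVX512Regs); mask types that a calling
convention passes in k registers (X86_RegCall, Intel_OCL_BI) are left
alone, per the conditions spelled out in the code.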
llvm/lib/Target/X86/X86ISelLowering.cpp | 90 +++++++++++++++++--------
 1 file changed, 61 insertions(+), 29 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 96b5e2cfbd82..d5de94aeb8a2 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2085,51 +2085,83 @@ X86TargetLowering::getPreferredVectorAction(MVT VT) const {
return TargetLoweringBase::getPreferredVectorAction(VT);
 }
 
+static std::pair<MVT, unsigned>
+handleMaskRegisterForCallingConv(unsigned NumElts, CallingConv::ID CC,
+ const X86Subtarget &Subtarget) {
+ // v2i1/v4i1/v8i1/v16i1 all pass in xmm registers unless the calling
+ // convention is one that uses k registers.
+ if (NumElts == 2)
+ return {MVT::v2i64, 1};
+ if (NumElts == 4)
+ return {MVT::v4i32, 1};
+ if (NumElts == 8 && CC != CallingConv::X86_RegCall &&
+ CC != CallingConv::Intel_OCL_BI)
+ return {MVT::v8i16, 1};
+ if (NumElts == 16 && CC != CallingConv::X86_RegCall &&
+ CC != CallingConv::Intel_OCL_BI)
+ return {MVT::v16i8, 1};
+ // v32i1 passes in ymm unless we have BWI and the calling convention is
+ // regcall.
+ if (NumElts == 32 && (!Subtarget.hasBWI() || CC != CallingConv::X86_RegCall))
+ return {MVT::v32i8, 1};
+ // Split v64i1 vectors if we don't have v64i8 available.
+ if (NumElts == 64 && Subtarget.hasBWI() && CC != CallingConv::X86_RegCall) {
+ if (Subtarget.useAVX512Regs())
+ return {MVT::v64i8, 1};
+ return {MVT::v32i8, 2};
+ }
+
+ // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
+ if (!isPowerOf2_32(NumElts) || (NumElts == 64 && !Subtarget.hasBWI()) ||
+ NumElts > 64)
+ return {MVT::i8, NumElts};
+
+ return {MVT::INVALID_SIMPLE_VALUE_TYPE, 0};
+}
+
MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
CallingConv::ID CC,
EVT VT) const {
- // v32i1 vectors should be promoted to v32i8 to match avx2.
- if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
- return MVT::v32i8;
- // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
- Subtarget.hasAVX512() &&
- (!isPowerOf2_32(VT.getVectorNumElements()) ||
- (VT.getVectorNumElements() > 16 && !Subtarget.hasBWI()) ||
- (VT.getVectorNumElements() > 64 && Subtarget.hasBWI())))
- return MVT::i8;
- // Split v64i1 vectors if we don't have v64i8 available.
- if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
- CC != CallingConv::X86_RegCall)
- return MVT::v32i1;
+ Subtarget.hasAVX512()) {
+ unsigned NumElts = VT.getVectorNumElements();
+
+ MVT RegisterVT;
+ unsigned NumRegisters;
+ std::tie(RegisterVT, NumRegisters) =
+ handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
+ if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
+ return RegisterVT;
+ }
+
// FIXME: Should we just make these types legal and custom split operations?
if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !EnableOldKNLABI &&
Subtarget.useAVX512Regs() && !Subtarget.hasBWI())
return MVT::v16i32;
+
return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
 }
 
unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
CallingConv::ID CC,
EVT VT) const {
- // v32i1 vectors should be promoted to v32i8 to match avx2.
- if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
- return 1;
- // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
- Subtarget.hasAVX512() &&
- (!isPowerOf2_32(VT.getVectorNumElements()) ||
- (VT.getVectorNumElements() > 16 && !Subtarget.hasBWI()) ||
- (VT.getVectorNumElements() > 64 && Subtarget.hasBWI())))
- return VT.getVectorNumElements();
- // Split v64i1 vectors if we don't have v64i8 available.
- if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
- CC != CallingConv::X86_RegCall)
- return 2;
+ Subtarget.hasAVX512()) {
+ unsigned NumElts = VT.getVectorNumElements();
+
+ MVT RegisterVT;
+ unsigned NumRegisters;
+ std::tie(RegisterVT, NumRegisters) =
+ handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
+ if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
+ return NumRegisters;
+ }
+
// FIXME: Should we just make these types legal and custom split operations?
if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !EnableOldKNLABI &&
Subtarget.useAVX512Regs() && !Subtarget.hasBWI())
return 1;
+
return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
}
@@ -2140,8 +2172,8 @@ unsigned X86TargetLowering::getVectorTypeBreakdownForCallingConv(
if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
Subtarget.hasAVX512() &&
(!isPowerOf2_32(VT.getVectorNumElements()) ||
- (VT.getVectorNumElements() > 16 && !Subtarget.hasBWI()) ||
- (VT.getVectorNumElements() > 64 && Subtarget.hasBWI()))) {
+ (VT.getVectorNumElements() == 64 && !Subtarget.hasBWI()) ||
+ VT.getVectorNumElements() > 64)) {
RegisterVT = MVT::i8;
IntermediateVT = MVT::i1;
NumIntermediates = VT.getVectorNumElements();
@@ -2151,7 +2183,7 @@ unsigned X86TargetLowering::getVectorTypeBreakdownForCallingConv(
// Split v64i1 vectors if we don't have v64i8 available.
if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
CC != CallingConv::X86_RegCall) {
- RegisterVT = MVT::v32i1;
+ RegisterVT = MVT::v32i8;
IntermediateVT = MVT::v32i1;
NumIntermediates = 2;
return 2;
--
2.17.1