summaryrefslogtreecommitdiffstats
path: root/dynamic-layers/clang-layer/recipes-devtools/clang/files/0001-Update-LowerOpenCL-pass-to-handle-new-blocks-represn.patch
diff options
context:
space:
mode:
Diffstat (limited to 'dynamic-layers/clang-layer/recipes-devtools/clang/files/0001-Update-LowerOpenCL-pass-to-handle-new-blocks-represn.patch')
-rw-r--r--dynamic-layers/clang-layer/recipes-devtools/clang/files/0001-Update-LowerOpenCL-pass-to-handle-new-blocks-represn.patch986
1 files changed, 0 insertions, 986 deletions
diff --git a/dynamic-layers/clang-layer/recipes-devtools/clang/files/0001-Update-LowerOpenCL-pass-to-handle-new-blocks-represn.patch b/dynamic-layers/clang-layer/recipes-devtools/clang/files/0001-Update-LowerOpenCL-pass-to-handle-new-blocks-represn.patch
deleted file mode 100644
index 2037421b..00000000
--- a/dynamic-layers/clang-layer/recipes-devtools/clang/files/0001-Update-LowerOpenCL-pass-to-handle-new-blocks-represn.patch
+++ /dev/null
@@ -1,986 +0,0 @@
1From 177cce531fd3665bb964a03db51890e0241e3e72 Mon Sep 17 00:00:00 2001
2From: Alexey Sotkin <alexey.sotkin@intel.com>
3Date: Thu, 21 Feb 2019 17:14:36 +0300
4Subject: [PATCH] Update LowerOpenCL pass to handle new blocks represntation in
5 LLVM IR
6
7Upstream-Status: Backport [https://github.com/KhronosGroup/SPIRV-LLVM-Translator/commit/bd6ddfaf7232cd81c7f2fe9877e66f286731bd8e]
8Signed-off-by: Anuj Mittal <anuj.mittal@intel.com>
9
10---
11 lib/SPIRV/SPIRVLowerOCLBlocks.cpp | 249 ++++--------------------------
12 test/global_block.ll | 71 ++++-----
13 test/literal-struct.ll | 31 ++--
14 test/transcoding/block_w_struct_return.ll | 47 +++---
15 test/transcoding/enqueue_kernel.ll | 237 ++++++++++++++++------------
16 5 files changed, 235 insertions(+), 400 deletions(-)
17
18diff --git a/lib/SPIRV/SPIRVLowerOCLBlocks.cpp b/lib/SPIRV/SPIRVLowerOCLBlocks.cpp
19index c80bf04..b42a4ec 100644
20--- a/lib/SPIRV/SPIRVLowerOCLBlocks.cpp
21+++ b/lib/SPIRV/SPIRVLowerOCLBlocks.cpp
22@@ -40,207 +40,34 @@
23 // In both cases values with function type used as intermediate representation
24 // for block literal structure.
25 //
26-// This pass is designed to find such cases and simplify them to avoid any
27-// function pointer types occurrences in LLVM IR in 4 steps.
28-//
29-// 1. Find all function pointer allocas, like
30-// %block = alloca void () *
31-//
32-// Then find a single store to that alloca:
33-// %blockLit = alloca <{ i32, i32, ...}>, align 4
34-// %0 = bitcast <{ i32, i32, ... }>* %blockLit to void ()*
35-// > store void ()* %0, void ()** %block, align 4
36-//
37-// And replace the alloca users by new instructions which used stored value
38-// %blockLit itself instead of function pointer alloca %block.
39-//
40-// 2. Find consecutive casts from block literal type to i8 addrspace(4)*
41-// used function pointers as an intermediate type:
42-// %0 = bitcast <{ i32, i32 }> %block to void() *
43-// %1 = addrspacecast void() * %0 to i8 addrspace(4)*
44-// And simplify them:
45-// %2 = addrspacecast <{ i32, i32 }> %block to i8 addrspace(4)*
46-//
47-// 3. Find all unused instructions with function pointer type occured after
48-// pp.1-2 and remove them.
49-//
50-// 4. Find unused globals with function pointer type, like
51-// @block = constant void ()*
52-// bitcast ({ i32, i32 }* @__block_literal_global to void ()*
53-//
54-// And remove them.
55+// In LLVM IR produced by clang, blocks are represented with the following
56+// structure:
57+// %struct.__opencl_block_literal_generic = type { i32, i32, i8 addrspace(4)* }
58+// Pointers to block invoke functions are stored in the third field. Clang
59+// replaces indirect function calls in all cases except if block is passed as a
60+// function argument. Note that it is somewhat unclear if the OpenCL C spec
61+// should allow passing blocks as function arguments. This pass is not supposed
62+// to work correctly with such functions.
63+// Clang though has to store function pointers to this structure. Purpose of
64+// this pass is to replace stores of function pointers (not allowed in SPIR-V)
65+// with null pointers.
66 //
67 //===----------------------------------------------------------------------===//
68 #define DEBUG_TYPE "spv-lower-ocl-blocks"
69
70-#include "OCLUtil.h"
71 #include "SPIRVInternal.h"
72
73-#include "llvm/ADT/SetVector.h"
74-#include "llvm/Analysis/ValueTracking.h"
75-#include "llvm/IR/GlobalVariable.h"
76-#include "llvm/IR/InstIterator.h"
77 #include "llvm/IR/Module.h"
78 #include "llvm/Pass.h"
79-#include "llvm/PassSupport.h"
80-#include "llvm/Support/Casting.h"
81+#include "llvm/Support/Regex.h"
82
83 using namespace llvm;
84
85 namespace {
86
87-static void
88-removeUnusedFunctionPtrInst(Instruction *I,
89- SmallSetVector<Instruction *, 16> &FuncPtrInsts) {
90- for (unsigned OpIdx = 0, Ops = I->getNumOperands(); OpIdx != Ops; ++OpIdx) {
91- Instruction *OpI = dyn_cast<Instruction>(I->getOperand(OpIdx));
92- I->setOperand(OpIdx, nullptr);
93- if (OpI && OpI != I && OpI->user_empty())
94- FuncPtrInsts.insert(OpI);
95- }
96- I->eraseFromParent();
97-}
98-
99-static bool isFuncPtrAlloca(const AllocaInst *AI) {
100- auto *ET = dyn_cast<PointerType>(AI->getAllocatedType());
101- return ET && ET->getElementType()->isFunctionTy();
102-}
103-
104-static bool hasFuncPtrType(const Value *V) {
105- auto *PT = dyn_cast<PointerType>(V->getType());
106- return PT && PT->getElementType()->isFunctionTy();
107-}
108-
109-static bool isFuncPtrInst(const Instruction *I) {
110- if (auto *AI = dyn_cast<AllocaInst>(I))
111- return isFuncPtrAlloca(AI);
112-
113- for (auto &Op : I->operands()) {
114- if (auto *AI = dyn_cast<AllocaInst>(Op))
115- return isFuncPtrAlloca(AI);
116-
117- auto *OpI = dyn_cast<Instruction>(&Op);
118- if (OpI && OpI != I && hasFuncPtrType(OpI))
119- return true;
120- }
121- return false;
122-}
123-
124-static StoreInst *findSingleStore(AllocaInst *AI) {
125- StoreInst *Store = nullptr;
126- for (auto *U : AI->users()) {
127- if (!isa<StoreInst>(U))
128- continue; // not a store
129- if (Store)
130- return nullptr; // there are more than one stores
131- Store = dyn_cast<StoreInst>(U);
132- }
133- return Store;
134-}
135-
136-static void fixFunctionPtrAllocaUsers(AllocaInst *AI) {
137- // Find and remove a single store to alloca
138- auto *SingleStore = findSingleStore(AI);
139- assert(SingleStore && "More than one store to the function pointer alloca");
140- auto *StoredVal = SingleStore->getValueOperand();
141- SingleStore->eraseFromParent();
142-
143- // Find loads from the alloca and replace thier users
144- for (auto *U : AI->users()) {
145- auto *LI = dyn_cast<LoadInst>(U);
146- if (!LI)
147- continue;
148-
149- for (auto *U : LI->users()) {
150- auto *UInst = cast<Instruction>(U);
151- auto *Cast = CastInst::CreatePointerBitCastOrAddrSpaceCast(
152- StoredVal, UInst->getType(), "", UInst);
153- UInst->replaceAllUsesWith(Cast);
154- }
155- }
156-}
157-
158-static int getBlockLiteralIdx(const Function &F) {
159- StringRef FName = F.getName();
160- if (isEnqueueKernelBI(FName))
161- return FName.contains("events") ? 7 : 4;
162- if (isKernelQueryBI(FName))
163- return FName.contains("for_ndrange") ? 2 : 1;
164- if (FName.startswith("__") && FName.contains("_block_invoke"))
165- return F.hasStructRetAttr() ? 1 : 0;
166-
167- return -1; // No block literal argument
168-}
169-
170-static bool hasBlockLiteralArg(const Function &F) {
171- return getBlockLiteralIdx(F) != -1;
172-}
173-
174-static bool simplifyFunctionPtrCasts(Function &F) {
175- bool Changed = false;
176- int BlockLiteralIdx = getBlockLiteralIdx(F);
177- for (auto *U : F.users()) {
178- auto *Call = dyn_cast<CallInst>(U);
179- if (!Call)
180- continue;
181- if (Call->getFunction()->getName() == F.getName().str() + "_kernel")
182- continue; // Skip block invoke function calls inside block invoke kernels
183-
184- const DataLayout &DL = F.getParent()->getDataLayout();
185- auto *BlockLiteral = Call->getOperand(BlockLiteralIdx);
186- auto *BlockLiteralVal = GetUnderlyingObject(BlockLiteral, DL);
187- if (isa<GlobalVariable>(BlockLiteralVal))
188- continue; // nothing to do with globals
189-
190- auto *BlockLiteralAlloca = cast<AllocaInst>(BlockLiteralVal);
191- assert(!BlockLiteralAlloca->getAllocatedType()->isFunctionTy() &&
192- "Function type shouldn't be there");
193-
194- auto *NewBlockLiteral = CastInst::CreatePointerBitCastOrAddrSpaceCast(
195- BlockLiteralAlloca, BlockLiteral->getType(), "", Call);
196- BlockLiteral->replaceAllUsesWith(NewBlockLiteral);
197- Changed |= true;
198- }
199- return Changed;
200-}
201-
202-static void
203-findFunctionPtrAllocas(Module &M,
204- SmallVectorImpl<AllocaInst *> &FuncPtrAllocas) {
205- for (auto &F : M) {
206- if (F.isDeclaration())
207- continue;
208- for (auto &I : instructions(F)) {
209- auto *AI = dyn_cast<AllocaInst>(&I);
210- if (!AI || !isFuncPtrAlloca(AI))
211- continue;
212- FuncPtrAllocas.push_back(AI);
213- }
214- }
215-}
216-
217-static void
218-findUnusedFunctionPtrInsts(Module &M,
219- SmallSetVector<Instruction *, 16> &FuncPtrInsts) {
220- for (auto &F : M) {
221- if (F.isDeclaration())
222- continue;
223- for (auto &I : instructions(F))
224- if (I.user_empty() && isFuncPtrInst(&I))
225- FuncPtrInsts.insert(&I);
226- }
227-}
228-
229-static void
230-findUnusedFunctionPtrGlbs(Module &M,
231- SmallVectorImpl<GlobalVariable *> &FuncPtrGlbs) {
232- for (auto &GV : M.globals()) {
233- if (!GV.user_empty())
234- continue;
235- auto *GVType = dyn_cast<PointerType>(GV.getType()->getElementType());
236- if (GVType && GVType->getElementType()->isFunctionTy())
237- FuncPtrGlbs.push_back(&GV);
238- }
239+static bool isBlockInvoke(Function &F) {
240+ static Regex BlockInvokeRegex("_block_invoke_?[0-9]*$");
241+ return BlockInvokeRegex.match(F.getName());
242 }
243
244 class SPIRVLowerOCLBlocks : public ModulePass {
245@@ -250,44 +77,24 @@ public:
246
247 bool runOnModule(Module &M) {
248 bool Changed = false;
249-
250- // 1. Find function pointer allocas and fix their users
251- SmallVector<AllocaInst *, 16> FuncPtrAllocas;
252- findFunctionPtrAllocas(M, FuncPtrAllocas);
253-
254- Changed |= !FuncPtrAllocas.empty();
255- for (auto *AI : FuncPtrAllocas)
256- fixFunctionPtrAllocaUsers(AI);
257-
258- // 2. Simplify consecutive casts which use function pointer types
259- for (auto &F : M)
260- if (hasBlockLiteralArg(F))
261- Changed |= simplifyFunctionPtrCasts(F);
262-
263- // 3. Cleanup unused instructions with function pointer type
264- // which are occured after pp. 1-2
265- SmallSetVector<Instruction *, 16> FuncPtrInsts;
266- findUnusedFunctionPtrInsts(M, FuncPtrInsts);
267-
268- Changed |= !FuncPtrInsts.empty();
269- while (!FuncPtrInsts.empty()) {
270- Instruction *I = FuncPtrInsts.pop_back_val();
271- removeUnusedFunctionPtrInst(I, FuncPtrInsts);
272+ for (Function &F : M) {
273+ if (!isBlockInvoke(F))
274+ continue;
275+ for (User *U : F.users()) {
276+ if (!isa<Constant>(U))
277+ continue;
278+ Constant *Null = Constant::getNullValue(U->getType());
279+ if (U != Null) {
280+ U->replaceAllUsesWith(Null);
281+ Changed = true;
282+ }
283+ }
284 }
285-
286- // 4. Find and remove unused global variables with function pointer type
287- SmallVector<GlobalVariable *, 16> FuncPtrGlbs;
288- findUnusedFunctionPtrGlbs(M, FuncPtrGlbs);
289-
290- Changed |= !FuncPtrGlbs.empty();
291- for (auto *GV : FuncPtrGlbs)
292- GV->eraseFromParent();
293-
294 return Changed;
295 }
296
297 static char ID;
298-}; // class SPIRVLowerOCLBlocks
299+};
300
301 char SPIRVLowerOCLBlocks::ID = 0;
302
303diff --git a/test/global_block.ll b/test/global_block.ll
304index 4fc453b..b558213 100644
305--- a/test/global_block.ll
306+++ b/test/global_block.ll
307@@ -17,7 +17,7 @@
308 ; RUN: spirv-val %t.spv
309 ; RUN: llvm-spirv -r %t.spv -o - | llvm-dis | FileCheck %s --check-prefix=CHECK-LLVM
310
311-target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64"
312+target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
313 target triple = "spir-unknown-unknown"
314
315 ; CHECK-SPIRV: Name [[block_invoke:[0-9]+]] "_block_invoke"
316@@ -27,71 +27,56 @@ target triple = "spir-unknown-unknown"
317 ; CHECK-SPIRV: TypePointer [[int8Ptr:[0-9]+]] 8 [[int8]]
318 ; CHECK-SPIRV: TypeFunction [[block_invoke_type:[0-9]+]] [[int]] [[int8Ptr]] [[int]]
319
320-;; This variable is not needed in SPIRV
321-; CHECK-SPIRV-NOT: Name {{[0-9]+}} block_kernel.b1
322-; CHECK-LLVM-NOT: @block_kernel.b1
323-@block_kernel.b1 = internal addrspace(2) constant i32 (i32) addrspace(4)* addrspacecast (i32 (i32) addrspace(1)* bitcast ({ i32, i32 } addrspace(1)* @__block_literal_global to i32 (i32) addrspace(1)*) to i32 (i32) addrspace(4)*), align 8
324+%struct.__opencl_block_literal_generic = type { i32, i32, i8 addrspace(4)* }
325
326-@__block_literal_global = internal addrspace(1) constant { i32, i32 } { i32 8, i32 4 }, align 4
327+@block_kernel.b1 = internal addrspace(2) constant %struct.__opencl_block_literal_generic addrspace(4)* addrspacecast (%struct.__opencl_block_literal_generic addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global to %struct.__opencl_block_literal_generic addrspace(1)*) to %struct.__opencl_block_literal_generic addrspace(4)*), align 4
328+@__block_literal_global = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 12, i32 4, i8 addrspace(4)* addrspacecast (i8* bitcast (i32 (i8 addrspace(4)*, i32)* @_block_invoke to i8*) to i8 addrspace(4)*) }, align 4
329
330-; Function Attrs: convergent nounwind
331-define spir_kernel void @block_kernel(i32 addrspace(1)* %res) #0 !kernel_arg_addr_space !4 !kernel_arg_access_qual !5 !kernel_arg_type !6 !kernel_arg_base_type !6 !kernel_arg_type_qual !7 {
332+; Function Attrs: convergent noinline nounwind optnone
333+define spir_kernel void @block_kernel(i32 addrspace(1)* %res) #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !5 !kernel_arg_base_type !5 !kernel_arg_type_qual !6 {
334 entry:
335- %res.addr = alloca i32 addrspace(1)*, align 8
336- store i32 addrspace(1)* %res, i32 addrspace(1)** %res.addr, align 8, !tbaa !10
337-
338+ %res.addr = alloca i32 addrspace(1)*, align 4
339+ store i32 addrspace(1)* %res, i32 addrspace(1)** %res.addr, align 4
340 ; CHECK-SPIRV: FunctionCall [[int]] {{[0-9]+}} [[block_invoke]] {{[0-9]+}} [[five]]
341 ; CHECK-LLVM: %call = call spir_func i32 @_block_invoke(i8 addrspace(4)* {{.*}}, i32 5)
342- %call = call spir_func i32 @_block_invoke(i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32 } addrspace(1)* @__block_literal_global to i8 addrspace(1)*) to i8 addrspace(4)*), i32 5) #2
343-
344- %0 = load i32 addrspace(1)*, i32 addrspace(1)** %res.addr, align 8, !tbaa !10
345- store i32 %call, i32 addrspace(1)* %0, align 4, !tbaa !14
346+ %call = call spir_func i32 @_block_invoke(i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global to i8 addrspace(1)*) to i8 addrspace(4)*), i32 5) #2
347+ %0 = load i32 addrspace(1)*, i32 addrspace(1)** %res.addr, align 4
348+ store i32 %call, i32 addrspace(1)* %0, align 4
349 ret void
350 }
351
352-; CHECK-SPIRV: 5 Function [[int]] [[block_invoke]] 0 [[block_invoke_type]]
353+; CHECK-SPIRV: 5 Function [[int]] [[block_invoke]] 2 [[block_invoke_type]]
354 ; CHECK-SPIRV-NEXT: 3 FunctionParameter [[int8Ptr]] {{[0-9]+}}
355 ; CHECK-SPIRV-NEXT: 3 FunctionParameter [[int]] {{[0-9]+}}
356 ; CHECK-LLVM: define internal spir_func i32 @_block_invoke(i8 addrspace(4)* {{.*}}, i32 %{{.*}})
357-; Function Attrs: convergent nounwind
358+; Function Attrs: convergent noinline nounwind optnone
359 define internal spir_func i32 @_block_invoke(i8 addrspace(4)* %.block_descriptor, i32 %i) #1 {
360 entry:
361- %.block_descriptor.addr = alloca i8 addrspace(4)*, align 8
362+ %.block_descriptor.addr = alloca i8 addrspace(4)*, align 4
363 %i.addr = alloca i32, align 4
364- store i8 addrspace(4)* %.block_descriptor, i8 addrspace(4)** %.block_descriptor.addr, align 8
365- %block = bitcast i8 addrspace(4)* %.block_descriptor to <{ i32, i32 }> addrspace(4)*
366- store i32 %i, i32* %i.addr, align 4, !tbaa !14
367- %0 = load i32, i32* %i.addr, align 4, !tbaa !14
368+ %block.addr = alloca <{ i32, i32, i8 addrspace(4)* }> addrspace(4)*, align 4
369+ store i8 addrspace(4)* %.block_descriptor, i8 addrspace(4)** %.block_descriptor.addr, align 4
370+ %block = bitcast i8 addrspace(4)* %.block_descriptor to <{ i32, i32, i8 addrspace(4)* }> addrspace(4)*
371+ store i32 %i, i32* %i.addr, align 4
372+ store <{ i32, i32, i8 addrspace(4)* }> addrspace(4)* %block, <{ i32, i32, i8 addrspace(4)* }> addrspace(4)** %block.addr, align 4
373+ %0 = load i32, i32* %i.addr, align 4
374 %add = add nsw i32 %0, 1
375 ret i32 %add
376 }
377
378-attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "uniform-work-group-size"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
379-attributes #1 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
380+attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "uniform-work-group-size"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
381+attributes #1 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
382 attributes #2 = { convergent }
383
384 !llvm.module.flags = !{!0}
385-!opencl.enable.FP_CONTRACT = !{}
386 !opencl.ocl.version = !{!1}
387 !opencl.spir.version = !{!1}
388-!opencl.used.extensions = !{!2}
389-!opencl.used.optional.core.features = !{!2}
390-!opencl.compiler.options = !{!2}
391-!llvm.ident = !{!3}
392+!llvm.ident = !{!2}
393
394 !0 = !{i32 1, !"wchar_size", i32 4}
395 !1 = !{i32 2, i32 0}
396-!2 = !{}
397-!3 = !{!"clang version 7.0.0"}
398-!4 = !{i32 1}
399-!5 = !{!"none"}
400-!6 = !{!"int*"}
401-!7 = !{!""}
402-!8 = !{i1 false}
403-!9 = !{i32 0}
404-!10 = !{!11, !11, i64 0}
405-!11 = !{!"any pointer", !12, i64 0}
406-!12 = !{!"omnipotent char", !13, i64 0}
407-!13 = !{!"Simple C/C++ TBAA"}
408-!14 = !{!15, !15, i64 0}
409-!15 = !{!"int", !12, i64 0}
410+!2 = !{!"clang version 9.0.0 (https://llvm.org/git/clang 04fb8964a801a5c5d7baa5a22272243a7d183896) (https://llvm.org/git/llvm 384f64397f6ad95a361b72d62c07d7bac9f24163)"}
411+!3 = !{i32 1}
412+!4 = !{!"none"}
413+!5 = !{!"int*"}
414+!6 = !{!""}
415diff --git a/test/literal-struct.ll b/test/literal-struct.ll
416index b88187f..dec957a 100644
417--- a/test/literal-struct.ll
418+++ b/test/literal-struct.ll
419@@ -2,7 +2,7 @@
420 ; structs, i.e. structs whose type has no name. Typicaly clang generate such
421 ; structs if the kernel contains OpenCL 2.0 blocks. The IR was produced with
422 ; the following command:
423-; clang -cc1 -triple spir -cl-std=cl2.0 -O0 -finclude-default-header literal-struct.cl -emit-llvm -o test/literal-struct.ll
424+; clang -cc1 -triple spir -cl-std=cl2.0 -O0 literal-struct.cl -emit-llvm -o test/literal-struct.ll
425
426 ; literal-struct.cl:
427 ; void foo()
428@@ -17,25 +17,28 @@
429 ; RUN: llvm-spirv %t.bc -o %t.spv
430 ; RUN: spirv-val %t.spv
431
432-; CHECK-DAG: TypeInt [[Int:[0-9]+]] 32 0
433-; CHECK-DAG: TypeStruct [[StructType:[0-9]+]] [[Int]] [[Int]] {{$}}
434+; CHECK: TypeInt [[Int:[0-9]+]] 32 0
435+; CHECK: TypeInt [[Int8:[0-9]+]] 8 0
436+; CHECK: TypePointer [[Int8Ptr:[0-9]+]] 8 [[Int8]]
437+; CHECK: TypeStruct [[StructType:[0-9]+]] [[Int]] [[Int]] [[Int8Ptr]]
438
439 target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
440 target triple = "spir"
441
442-@__block_literal_global = internal addrspace(1) constant { i32, i32 } { i32 8, i32 4 }, align 4
443+%struct.__opencl_block_literal_generic = type { i32, i32, i8 addrspace(4)* }
444+
445+@__block_literal_global = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 12, i32 4, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*)* @__foo_block_invoke to i8*) to i8 addrspace(4)*) }, align 4
446 ; CHECK: ConstantComposite [[StructType]]
447
448-; This is artificial case is added to cover ConstantNull instrucitions with TypeStruct.
449-@__block_literal_global.1 = internal addrspace(1) constant { i32, i32 } zeroinitializer, align 4
450+@__block_literal_global.1 = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } zeroinitializer, align 4
451 ; CHECK: ConstantNull [[StructType]]
452
453 ; Function Attrs: convergent noinline nounwind optnone
454 define spir_func void @foo() #0 {
455 entry:
456- %myBlock = alloca void () addrspace(4)*, align 4
457- store void () addrspace(4)* addrspacecast (void () addrspace(1)* bitcast ({ i32, i32 } addrspace(1)* @__block_literal_global to void () addrspace(1)*) to void () addrspace(4)*), void () addrspace(4)** %myBlock, align 4
458- call spir_func void @__foo_block_invoke(i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32 } addrspace(1)* @__block_literal_global to i8 addrspace(1)*) to i8 addrspace(4)*)) #1
459+ %myBlock = alloca %struct.__opencl_block_literal_generic addrspace(4)*, align 4
460+ store %struct.__opencl_block_literal_generic addrspace(4)* addrspacecast (%struct.__opencl_block_literal_generic addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global to %struct.__opencl_block_literal_generic addrspace(1)*) to %struct.__opencl_block_literal_generic addrspace(4)*), %struct.__opencl_block_literal_generic addrspace(4)** %myBlock, align 4
461+ call spir_func void @__foo_block_invoke(i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global to i8 addrspace(1)*) to i8 addrspace(4)*)) #1
462 ret void
463 }
464
465@@ -43,14 +46,14 @@ entry:
466 define internal spir_func void @__foo_block_invoke(i8 addrspace(4)* %.block_descriptor) #0 {
467 entry:
468 %.block_descriptor.addr = alloca i8 addrspace(4)*, align 4
469- %block.addr = alloca <{ i32, i32 }> addrspace(4)*, align 4
470+ %block.addr = alloca <{ i32, i32, i8 addrspace(4)* }> addrspace(4)*, align 4
471 store i8 addrspace(4)* %.block_descriptor, i8 addrspace(4)** %.block_descriptor.addr, align 4
472- %block = bitcast i8 addrspace(4)* %.block_descriptor to <{ i32, i32 }> addrspace(4)*
473- store <{ i32, i32 }> addrspace(4)* %block, <{ i32, i32 }> addrspace(4)** %block.addr, align 4
474+ %block = bitcast i8 addrspace(4)* %.block_descriptor to <{ i32, i32, i8 addrspace(4)* }> addrspace(4)*
475+ store <{ i32, i32, i8 addrspace(4)* }> addrspace(4)* %block, <{ i32, i32, i8 addrspace(4)* }> addrspace(4)** %block.addr, align 4
476 ret void
477 }
478
479-attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
480+attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
481 attributes #1 = { convergent }
482
483 !llvm.module.flags = !{!0}
484@@ -60,4 +63,4 @@ attributes #1 = { convergent }
485
486 !0 = !{i32 1, !"wchar_size", i32 4}
487 !1 = !{i32 2, i32 0}
488-!2 = !{!"clang version 8.0.0 "}
489+!2 = !{!"clang version 9.0.0 (https://llvm.org/git/clang 04fb8964a801a5c5d7baa5a22272243a7d183896) (https://llvm.org/git/llvm 384f64397f6ad95a361b72d62c07d7bac9f24163)"}
490diff --git a/test/transcoding/block_w_struct_return.ll b/test/transcoding/block_w_struct_return.ll
491index a68820f..ebd2c5f 100644
492--- a/test/transcoding/block_w_struct_return.ll
493+++ b/test/transcoding/block_w_struct_return.ll
494@@ -16,6 +16,8 @@
495 ; res[tid] = kernelBlock(aa).a - 6;
496 ; }
497
498+; clang -cc1 -triple spir -cl-std=cl2.0 -disable-llvm-passes -finclude-default-header block_w_struct_return.cl -emit-llvm -o test/transcoding/block_w_struct_return.ll
499+
500 ; RUN: llvm-as %s -o %t.bc
501 ; RUN: llvm-spirv %t.bc -spirv-text -o %t.spv.txt
502 ; RUN: FileCheck < %t.spv.txt %s --check-prefix=CHECK-SPIRV
503@@ -28,12 +30,14 @@
504 ; CHECK-SPIRV: Name [[BlockInv:[0-9]+]] "__block_ret_struct_block_invoke"
505
506 ; CHECK-SPIRV: 4 TypeInt [[IntTy:[0-9]+]] 32
507+; CHECK-SPIRV: 4 TypeInt [[Int8Ty:[0-9]+]] 8
508+; CHECK-SPIRV: 4 TypePointer [[Int8Ptr:[0-9]+]] 8 [[Int8Ty]]
509 ; CHECK-SPIRV: 3 TypeStruct [[StructTy:[0-9]+]] [[IntTy]]
510 ; CHECK-SPIRV: 4 TypePointer [[StructPtrTy:[0-9]+]] 7 [[StructTy]]
511
512 ; CHECK-SPIRV: 4 Variable [[StructPtrTy]] [[StructArg:[0-9]+]] 7
513 ; CHECK-SPIRV: 4 Variable [[StructPtrTy]] [[StructRet:[0-9]+]] 7
514-; CHECK-SPIRV: 4 PtrCastToGeneric {{[0-9]+}} [[BlockLit:[0-9]+]] {{[0-9]+}}
515+; CHECK-SPIRV: 4 PtrCastToGeneric [[Int8Ptr]] [[BlockLit:[0-9]+]] {{[0-9]+}}
516 ; CHECK-SPIRV: 7 FunctionCall {{[0-9]+}} {{[0-9]+}} [[BlockInv]] [[StructRet]] [[BlockLit]] [[StructArg]]
517
518 ; CHECK-LLVM: %[[StructA:.*]] = type { i32 }
519@@ -42,20 +46,21 @@
520 target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
521 target triple = "spir64-unknown-unknown"
522
523+%struct.__opencl_block_literal_generic = type { i32, i32, i8 addrspace(4)* }
524 %struct.A = type { i32 }
525
526-@__block_literal_global = internal addrspace(1) constant { i32, i32 } { i32 8, i32 4 }, align 4
527+@__block_literal_global = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 16, i32 8, i8 addrspace(4)* addrspacecast (i8* bitcast (void (%struct.A*, i8 addrspace(4)*, %struct.A*)* @__block_ret_struct_block_invoke to i8*) to i8 addrspace(4)*) }, align 8
528
529 ; Function Attrs: convergent noinline nounwind optnone
530-define spir_kernel void @block_ret_struct(i32 addrspace(1)* %res) #0 !kernel_arg_addr_space !4 !kernel_arg_access_qual !5 !kernel_arg_type !6 !kernel_arg_base_type !6 !kernel_arg_type_qual !7 !kernel_arg_host_accessible !8 !kernel_arg_pipe_depth !9 !kernel_arg_pipe_io !7 !kernel_arg_buffer_location !7 {
531+define spir_kernel void @block_ret_struct(i32 addrspace(1)* %res) #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !5 !kernel_arg_base_type !5 !kernel_arg_type_qual !6 {
532 entry:
533 %res.addr = alloca i32 addrspace(1)*, align 8
534- %kernelBlock = alloca void (%struct.A*, %struct.A*) addrspace(4)*, align 8
535+ %kernelBlock = alloca %struct.__opencl_block_literal_generic addrspace(4)*, align 8
536 %tid = alloca i64, align 8
537 %aa = alloca %struct.A, align 4
538 %tmp = alloca %struct.A, align 4
539 store i32 addrspace(1)* %res, i32 addrspace(1)** %res.addr, align 8
540- store void (%struct.A*, %struct.A*) addrspace(4)* addrspacecast (void (%struct.A*, %struct.A*) addrspace(1)* bitcast ({ i32, i32 } addrspace(1)* @__block_literal_global to void (%struct.A*, %struct.A*) addrspace(1)*) to void (%struct.A*, %struct.A*) addrspace(4)*), void (%struct.A*, %struct.A*) addrspace(4)** %kernelBlock, align 8
541+ store %struct.__opencl_block_literal_generic addrspace(4)* addrspacecast (%struct.__opencl_block_literal_generic addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global to %struct.__opencl_block_literal_generic addrspace(1)*) to %struct.__opencl_block_literal_generic addrspace(4)*), %struct.__opencl_block_literal_generic addrspace(4)** %kernelBlock, align 8
542 %call = call spir_func i64 @_Z13get_global_idj(i32 0) #4
543 store i64 %call, i64* %tid, align 8
544 %0 = load i32 addrspace(1)*, i32 addrspace(1)** %res.addr, align 8
545@@ -64,7 +69,7 @@ entry:
546 store i32 -1, i32 addrspace(1)* %arrayidx, align 4
547 %a = getelementptr inbounds %struct.A, %struct.A* %aa, i32 0, i32 0
548 store i32 5, i32* %a, align 4
549- call spir_func void @__block_ret_struct_block_invoke(%struct.A* sret %tmp, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32 } addrspace(1)* @__block_literal_global to i8 addrspace(1)*) to i8 addrspace(4)*), %struct.A* byval align 4 %aa) #5
550+ call spir_func void @__block_ret_struct_block_invoke(%struct.A* sret %tmp, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global to i8 addrspace(1)*) to i8 addrspace(4)*), %struct.A* byval align 4 %aa) #5
551 %a1 = getelementptr inbounds %struct.A, %struct.A* %tmp, i32 0, i32 0
552 %2 = load i32, i32* %a1, align 4
553 %sub = sub nsw i32 %2, 6
554@@ -79,10 +84,10 @@ entry:
555 define internal spir_func void @__block_ret_struct_block_invoke(%struct.A* noalias sret %agg.result, i8 addrspace(4)* %.block_descriptor, %struct.A* byval align 4 %a) #1 {
556 entry:
557 %.block_descriptor.addr = alloca i8 addrspace(4)*, align 8
558- %block.addr = alloca <{ i32, i32 }> addrspace(4)*, align 8
559+ %block.addr = alloca <{ i32, i32, i8 addrspace(4)* }> addrspace(4)*, align 8
560 store i8 addrspace(4)* %.block_descriptor, i8 addrspace(4)** %.block_descriptor.addr, align 8
561- %block = bitcast i8 addrspace(4)* %.block_descriptor to <{ i32, i32 }> addrspace(4)*
562- store <{ i32, i32 }> addrspace(4)* %block, <{ i32, i32 }> addrspace(4)** %block.addr, align 8
563+ %block = bitcast i8 addrspace(4)* %.block_descriptor to <{ i32, i32, i8 addrspace(4)* }> addrspace(4)*
564+ store <{ i32, i32, i8 addrspace(4)* }> addrspace(4)* %block, <{ i32, i32, i8 addrspace(4)* }> addrspace(4)** %block.addr, align 8
565 %a1 = getelementptr inbounds %struct.A, %struct.A* %a, i32 0, i32 0
566 store i32 6, i32* %a1, align 4
567 %0 = bitcast %struct.A* %agg.result to i8*
568@@ -97,30 +102,22 @@ declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture r
569 ; Function Attrs: convergent nounwind readnone
570 declare spir_func i64 @_Z13get_global_idj(i32) #3
571
572-attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "uniform-work-group-size"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
573-attributes #1 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
574+attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "uniform-work-group-size"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
575+attributes #1 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
576 attributes #2 = { argmemonly nounwind }
577 attributes #3 = { convergent nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
578 attributes #4 = { convergent nounwind readnone }
579 attributes #5 = { convergent }
580
581 !llvm.module.flags = !{!0}
582-!opencl.enable.FP_CONTRACT = !{}
583 !opencl.ocl.version = !{!1}
584 !opencl.spir.version = !{!1}
585-!opencl.used.extensions = !{!2}
586-!opencl.used.optional.core.features = !{!2}
587-!opencl.compiler.options = !{!2}
588-!llvm.ident = !{!3}
589+!llvm.ident = !{!2}
590
591 !0 = !{i32 1, !"wchar_size", i32 4}
592 !1 = !{i32 2, i32 0}
593-!2 = !{}
594-!3 = !{!"clang version 7.0.0"}
595-!4 = !{i32 1}
596-!5 = !{!"none"}
597-!6 = !{!"int*"}
598-!7 = !{!""}
599-!8 = !{i1 false}
600-!9 = !{i32 0}
601-
602+!2 = !{!"clang version 9.0.0 (https://llvm.org/git/clang 04fb8964a801a5c5d7baa5a22272243a7d183896) (https://llvm.org/git/llvm 384f64397f6ad95a361b72d62c07d7bac9f24163)"}
603+!3 = !{i32 1}
604+!4 = !{!"none"}
605+!5 = !{!"int*"}
606+!6 = !{!""}
607diff --git a/test/transcoding/enqueue_kernel.ll b/test/transcoding/enqueue_kernel.ll
608index 1f0b360..761043e 100644
609--- a/test/transcoding/enqueue_kernel.ll
610+++ b/test/transcoding/enqueue_kernel.ll
611@@ -51,11 +51,12 @@
612 ; ModuleID = 'enqueue_kernel.cl'
613 source_filename = "enqueue_kernel.cl"
614 target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
615-target triple = "spir-unknown-unknown"
616+target triple = "spir"
617
618 %opencl.queue_t = type opaque
619 %struct.ndrange_t = type { i32 }
620 %opencl.clk_event_t = type opaque
621+%struct.__opencl_block_literal_generic = type { i32, i32, i8 addrspace(4)* }
622
623 ; CHECK-SPIRV: EntryPoint {{[0-9]+}} [[BlockKer1:[0-9]+]] "__device_side_enqueue_block_invoke_kernel"
624 ; CHECK-SPIRV: EntryPoint {{[0-9]+}} [[BlockKer2:[0-9]+]] "__device_side_enqueue_block_invoke_2_kernel"
625@@ -66,89 +67,123 @@ target triple = "spir-unknown-unknown"
626
627 ; CHECK-SPIRV: TypeInt [[Int32Ty:[0-9]+]] 32
628 ; CHECK-SPIRV: TypeInt [[Int8Ty:[0-9]+]] 8
629-; CHECK-SPIRV: Constant [[Int32Ty]] [[ConstInt8:[0-9]+]] 8
630 ; CHECK-SPIRV: Constant [[Int32Ty]] [[ConstInt0:[0-9]+]] 0
631-; CHECK-SPIRV: Constant [[Int32Ty]] [[ConstInt17:[0-9]+]] 17
632+; CHECK-SPIRV: Constant [[Int32Ty]] [[ConstInt17:[0-9]+]] 21
633 ; CHECK-SPIRV: Constant [[Int32Ty]] [[ConstInt2:[0-9]+]] 2
634-; CHECK-SPIRV: Constant [[Int32Ty]] [[ConstInt20:[0-9]+]] 20
635-; CHECK-SPIRV: TypeVoid [[VoidTy:[0-9]+]]
636+; CHECK-SPIRV: Constant [[Int32Ty]] [[ConstInt8:[0-9]+]] 8
637+; CHECK-SPIRV: Constant [[Int32Ty]] [[ConstInt20:[0-9]+]] 24
638
639 ; CHECK-SPIRV: TypePointer {{[0-9]+}} 7 {{[0-9]+}}
640+; CHECK-SPIRV: TypePointer [[Int8PtrGenTy:[0-9]+]] 8 [[Int8Ty]]
641+; CHECK-SPIRV: TypeVoid [[VoidTy:[0-9]+]]
642 ; CHECK-SPIRV: TypePointer [[Int32LocPtrTy:[0-9]+]] 7 [[Int32Ty]]
643 ; CHECK-SPIRV: TypeDeviceEvent [[EventTy:[0-9]+]]
644-; CHECK-SPIRV: TypePointer [[Int8PtrGenTy:[0-9]+]] 8 [[Int8Ty]]
645 ; CHECK-SPIRV: TypePointer [[EventPtrTy:[0-9]+]] 8 [[EventTy]]
646 ; CHECK-SPIRV: TypeFunction [[BlockTy1:[0-9]+]] [[VoidTy]] [[Int8PtrGenTy]]
647 ; CHECK-SPIRV: TypeFunction [[BlockTy2:[0-9]+]] [[VoidTy]] [[Int8PtrGenTy]]
648 ; CHECK-SPIRV: TypeFunction [[BlockTy3:[0-9]+]] [[VoidTy]] [[Int8PtrGenTy]]
649 ; CHECK-SPIRV: ConstantNull [[EventPtrTy]] [[EventNull:[0-9]+]]
650
651-; CHECK-LLVM: [[BlockTy1:%[0-9a-z\.]+]] = type { i32, i32 }
652-; CHECK-LLVM: [[BlockTy2:%[0-9a-z\.]+]] = type <{ i32, i32, i32 addrspace(1)*, i32, i8 }>
653-; CHECK-LLVM: [[BlockTy3:%[0-9a-z\.]+]] = type <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>
654-; CHECK-LLVM: [[BlockTy4:%[0-9a-z\.]+]] = type <{ i32, i32 }>
655+; CHECK-LLVM: [[BlockTy1:%[0-9a-z\.]+]] = type { i32, i32, i8 addrspace(4)* }
656+; CHECK-LLVM: [[BlockTy2:%[0-9a-z\.]+]] = type <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }>
657+; CHECK-LLVM: [[BlockTy3:%[0-9a-z\.]+]] = type <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>
658+; CHECK-LLVM: [[BlockTy4:%[0-9a-z\.]+]] = type <{ i32, i32, i8 addrspace(4)* }>
659
660-; CHECK-LLVM: @__block_literal_global = internal addrspace(1) constant [[BlockTy1]] { i32 8, i32 4 }, align 4
661-; CHECK-LLVM: @__block_literal_global.1 = internal addrspace(1) constant [[BlockTy1]] { i32 8, i32 4 }, align 4
662+; CHECK-LLVM: @__block_literal_global = internal addrspace(1) constant [[BlockTy1]] { i32 12, i32 4, i8 addrspace(4)* addrspacecast (i8* null to i8 addrspace(4)*) }, align 4
663+; CHECK-LLVM: @__block_literal_global.1 = internal addrspace(1) constant [[BlockTy1]] { i32 12, i32 4, i8 addrspace(4)* addrspacecast (i8* null to i8 addrspace(4)*) }, align 4
664
665-@__block_literal_global = internal addrspace(1) constant { i32, i32 } { i32 8, i32 4 }, align 4
666-@__block_literal_global.1 = internal addrspace(1) constant { i32, i32 } { i32 8, i32 4 }, align 4
667+@__block_literal_global = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 12, i32 4, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* @__device_side_enqueue_block_invoke_3 to i8*) to i8 addrspace(4)*) }, align 4
668+@__block_literal_global.1 = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 12, i32 4, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*, i8 addrspace(3)*, i8 addrspace(3)*)* @__device_side_enqueue_block_invoke_4 to i8*) to i8 addrspace(4)*) }, align 4
669
670 ; Function Attrs: convergent noinline nounwind optnone
671-define spir_kernel void @device_side_enqueue(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 %i, i8 signext %c0) #0 !kernel_arg_addr_space !4 !kernel_arg_access_qual !5 !kernel_arg_type !6 !kernel_arg_base_type !6 !kernel_arg_type_qual !7 {
672+define spir_kernel void @device_side_enqueue(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 %i, i8 signext %c0) #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !5 !kernel_arg_base_type !5 !kernel_arg_type_qual !6 {
673 entry:
674+ %a.addr = alloca i32 addrspace(1)*, align 4
675+ %b.addr = alloca i32 addrspace(1)*, align 4
676+ %i.addr = alloca i32, align 4
677+ %c0.addr = alloca i8, align 1
678 %default_queue = alloca %opencl.queue_t*, align 4
679 %flags = alloca i32, align 4
680 %ndrange = alloca %struct.ndrange_t, align 4
681 %clk_event = alloca %opencl.clk_event_t*, align 4
682 %event_wait_list = alloca %opencl.clk_event_t*, align 4
683 %event_wait_list2 = alloca [1 x %opencl.clk_event_t*], align 4
684- %block = alloca <{ i32, i32, i32 addrspace(1)*, i32, i8 }>, align 4
685- %block3 = alloca <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, align 4
686+ %block = alloca <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }>, align 4
687+ %tmp = alloca %struct.ndrange_t, align 4
688+ %block3 = alloca <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, align 4
689+ %tmp4 = alloca %struct.ndrange_t, align 4
690 %c = alloca i8, align 1
691+ %tmp11 = alloca %struct.ndrange_t, align 4
692+ %block_sizes = alloca [1 x i32], align 4
693+ %tmp12 = alloca %struct.ndrange_t, align 4
694+ %block_sizes13 = alloca [3 x i32], align 4
695+ store i32 addrspace(1)* %a, i32 addrspace(1)** %a.addr, align 4
696+ store i32 addrspace(1)* %b, i32 addrspace(1)** %b.addr, align 4
697+ store i32 %i, i32* %i.addr, align 4
698+ store i8 %c0, i8* %c0.addr, align 1
699 store i32 0, i32* %flags, align 4
700 %arrayinit.begin = getelementptr inbounds [1 x %opencl.clk_event_t*], [1 x %opencl.clk_event_t*]* %event_wait_list2, i32 0, i32 0
701 %0 = load %opencl.clk_event_t*, %opencl.clk_event_t** %clk_event, align 4
702 store %opencl.clk_event_t* %0, %opencl.clk_event_t** %arrayinit.begin, align 4
703 %1 = load %opencl.queue_t*, %opencl.queue_t** %default_queue, align 4
704 %2 = load i32, i32* %flags, align 4
705- %block.size = getelementptr inbounds <{ i32, i32, i32 addrspace(1)*, i32, i8 }>, <{ i32, i32, i32 addrspace(1)*, i32, i8 }>* %block, i32 0, i32 0
706- store i32 17, i32* %block.size, align 4
707- %block.align = getelementptr inbounds <{ i32, i32, i32 addrspace(1)*, i32, i8 }>, <{ i32, i32, i32 addrspace(1)*, i32, i8 }>* %block, i32 0, i32 1
708+ %3 = bitcast %struct.ndrange_t* %tmp to i8*
709+ %4 = bitcast %struct.ndrange_t* %ndrange to i8*
710+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %3, i8* align 4 %4, i32 4, i1 false)
711+ %block.size = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }>* %block, i32 0, i32 0
712+ store i32 21, i32* %block.size, align 4
713+ %block.align = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }>* %block, i32 0, i32 1
714 store i32 4, i32* %block.align, align 4
715- %block.captured = getelementptr inbounds <{ i32, i32, i32 addrspace(1)*, i32, i8 }>, <{ i32, i32, i32 addrspace(1)*, i32, i8 }>* %block, i32 0, i32 2
716- store i32 addrspace(1)* %a, i32 addrspace(1)** %block.captured, align 4
717- %block.captured1 = getelementptr inbounds <{ i32, i32, i32 addrspace(1)*, i32, i8 }>, <{ i32, i32, i32 addrspace(1)*, i32, i8 }>* %block, i32 0, i32 3
718- store i32 %i, i32* %block.captured1, align 4
719- %block.captured2 = getelementptr inbounds <{ i32, i32, i32 addrspace(1)*, i32, i8 }>, <{ i32, i32, i32 addrspace(1)*, i32, i8 }>* %block, i32 0, i32 4
720- store i8 %c0, i8* %block.captured2, align 4
721- %3 = bitcast <{ i32, i32, i32 addrspace(1)*, i32, i8 }>* %block to void ()*
722- %4 = addrspacecast void ()* %3 to i8 addrspace(4)*
723+ %block.invoke = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }>* %block, i32 0, i32 2
724+ store i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*)* @__device_side_enqueue_block_invoke to i8*) to i8 addrspace(4)*), i8 addrspace(4)** %block.invoke, align 4
725+ %block.captured = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }>* %block, i32 0, i32 3
726+ %5 = load i32 addrspace(1)*, i32 addrspace(1)** %a.addr, align 4
727+ store i32 addrspace(1)* %5, i32 addrspace(1)** %block.captured, align 4
728+ %block.captured1 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }>* %block, i32 0, i32 4
729+ %6 = load i32, i32* %i.addr, align 4
730+ store i32 %6, i32* %block.captured1, align 4
731+ %block.captured2 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }>* %block, i32 0, i32 5
732+ %7 = load i8, i8* %c0.addr, align 1
733+ store i8 %7, i8* %block.captured2, align 4
734+ %8 = bitcast <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }>* %block to %struct.__opencl_block_literal_generic*
735+ %9 = addrspacecast %struct.__opencl_block_literal_generic* %8 to i8 addrspace(4)*
736
737 ; CHECK-SPIRV: PtrCastToGeneric [[Int8PtrGenTy]] [[BlockLit1:[0-9]+]]
738 ; CHECK-SPIRV: EnqueueKernel [[Int32Ty]] {{[0-9]+}} {{[0-9]+}} {{[0-9]+}} {{[0-9]+}}
739 ; [[ConstInt0]] [[EventNull]] [[EventNull]]
740 ; [[BlockKer1]] [[BlockLit1]] [[ConstInt17]] [[ConstInt8]]
741
742-; CHECK-LLVM: [[Block2:%[0-9]+]] = addrspacecast [[BlockTy2]]* %block to i8 addrspace(4)*
743+; CHECK-LLVM: [[Block2:%[0-9]+]] = bitcast [[BlockTy2]]* %block to %struct.__opencl_block_literal_generic*
744+; CHECK-LLVM: [[Block2Ptr:%[0-9]+]] = addrspacecast %struct.__opencl_block_literal_generic* [[Block2]] to i8 addrspace(4)*
745 ; CHECK-LLVM: [[BlockInv2:%[0-9]+]] = addrspacecast void (i8 addrspace(4)*)* @__device_side_enqueue_block_invoke_kernel to i8 addrspace(4)*
746-; CHECK-LLVM: call i32 @__enqueue_kernel_basic_events(%opencl.queue_t* {{.*}}, i32 {{.*}}, %struct.ndrange_t* {{.*}}, i32 0, %opencl.clk_event_t* addrspace(4)* null, %opencl.clk_event_t* addrspace(4)* null, i8 addrspace(4)* [[BlockInv2]], i8 addrspace(4)* [[Block2]])
747-
748- %5 = call i32 @__enqueue_kernel_basic(%opencl.queue_t* %1, i32 %2, %struct.ndrange_t* byval %ndrange, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*)* @__device_side_enqueue_block_invoke_kernel to i8*) to i8 addrspace(4)*), i8 addrspace(4)* %4)
749- %6 = addrspacecast %opencl.clk_event_t** %event_wait_list to %opencl.clk_event_t* addrspace(4)*
750- %7 = addrspacecast %opencl.clk_event_t** %clk_event to %opencl.clk_event_t* addrspace(4)*
751- %block.size5 = getelementptr inbounds <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block3, i32 0, i32 0
752- store i32 20, i32* %block.size5, align 4
753- %block.align6 = getelementptr inbounds <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block3, i32 0, i32 1
754+; CHECK-LLVM: call i32 @__enqueue_kernel_basic_events(%opencl.queue_t* {{.*}}, i32 {{.*}}, %struct.ndrange_t* {{.*}}, i32 0, %opencl.clk_event_t* addrspace(4)* null, %opencl.clk_event_t* addrspace(4)* null, i8 addrspace(4)* [[BlockInv2]], i8 addrspace(4)* [[Block2Ptr]])
755+
756+ %10 = call i32 @__enqueue_kernel_basic(%opencl.queue_t* %1, i32 %2, %struct.ndrange_t* byval %tmp, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*)* @__device_side_enqueue_block_invoke_kernel to i8*) to i8 addrspace(4)*), i8 addrspace(4)* %9)
757+ %11 = load %opencl.queue_t*, %opencl.queue_t** %default_queue, align 4
758+ %12 = load i32, i32* %flags, align 4
759+ %13 = bitcast %struct.ndrange_t* %tmp4 to i8*
760+ %14 = bitcast %struct.ndrange_t* %ndrange to i8*
761+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %13, i8* align 4 %14, i32 4, i1 false)
762+ %15 = addrspacecast %opencl.clk_event_t** %event_wait_list to %opencl.clk_event_t* addrspace(4)*
763+ %16 = addrspacecast %opencl.clk_event_t** %clk_event to %opencl.clk_event_t* addrspace(4)*
764+ %block.size5 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block3, i32 0, i32 0
765+ store i32 24, i32* %block.size5, align 4
766+ %block.align6 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block3, i32 0, i32 1
767 store i32 4, i32* %block.align6, align 4
768- %block.captured7 = getelementptr inbounds <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block3, i32 0, i32 2
769- store i32 addrspace(1)* %a, i32 addrspace(1)** %block.captured7, align 4
770- %block.captured8 = getelementptr inbounds <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block3, i32 0, i32 3
771- store i32 %i, i32* %block.captured8, align 4
772- %block.captured9 = getelementptr inbounds <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block3, i32 0, i32 4
773- store i32 addrspace(1)* %b, i32 addrspace(1)** %block.captured9, align 4
774- %8 = bitcast <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block3 to void ()*
775- %9 = addrspacecast void ()* %8 to i8 addrspace(4)*
776+ %block.invoke7 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block3, i32 0, i32 2
777+ store i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*)* @__device_side_enqueue_block_invoke_2 to i8*) to i8 addrspace(4)*), i8 addrspace(4)** %block.invoke7, align 4
778+ %block.captured8 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block3, i32 0, i32 3
779+ %17 = load i32 addrspace(1)*, i32 addrspace(1)** %a.addr, align 4
780+ store i32 addrspace(1)* %17, i32 addrspace(1)** %block.captured8, align 4
781+ %block.captured9 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block3, i32 0, i32 4
782+ %18 = load i32, i32* %i.addr, align 4
783+ store i32 %18, i32* %block.captured9, align 4
784+ %block.captured10 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block3, i32 0, i32 5
785+ %19 = load i32 addrspace(1)*, i32 addrspace(1)** %b.addr, align 4
786+ store i32 addrspace(1)* %19, i32 addrspace(1)** %block.captured10, align 4
787+ %20 = bitcast <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block3 to %struct.__opencl_block_literal_generic*
788+ %21 = addrspacecast %struct.__opencl_block_literal_generic* %20 to i8 addrspace(4)*
789+
790
791 ; CHECK-SPIRV: PtrCastToGeneric [[EventPtrTy]] [[Event1:[0-9]+]]
792 ; CHECK-SPIRV: PtrCastToGeneric [[EventPtrTy]] [[Event2:[0-9]+]]
793@@ -158,16 +193,24 @@ entry:
794 ; [[ConstInt2]] [[Event1]] [[Event2]]
795 ; [[BlockKer2]] [[BlockLit2]] [[ConstInt20]] [[ConstInt8]]
796
797-; CHECK-LLVM: [[Block3:%[0-9]+]] = addrspacecast [[BlockTy3]]* %block3 to i8 addrspace(4)*
798+; CHECK-LLVM: [[Block3:%[0-9]+]] = bitcast [[BlockTy3]]* %block3 to %struct.__opencl_block_literal_generic*
799+; CHECK-LLVM: [[Block3Ptr:%[0-9]+]] = addrspacecast %struct.__opencl_block_literal_generic* [[Block3]] to i8 addrspace(4)*
800 ; CHECK-LLVM: [[BlockInv3:%[0-9]+]] = addrspacecast void (i8 addrspace(4)*)* @__device_side_enqueue_block_invoke_2_kernel to i8 addrspace(4)*
801-; CHECK-LLVM: call i32 @__enqueue_kernel_basic_events(%opencl.queue_t* {{.*}}, i32 {{.*}}, %struct.ndrange_t* {{.*}}, i32 2, %opencl.clk_event_t* addrspace(4)* {{.*}}, %opencl.clk_event_t* addrspace(4)* {{.*}}, i8 addrspace(4)* [[BlockInv3]], i8 addrspace(4)* [[Block3]])
802-
803- %10 = call i32 @__enqueue_kernel_basic_events(%opencl.queue_t* %1, i32 %2, %struct.ndrange_t* %ndrange, i32 2, %opencl.clk_event_t* addrspace(4)* %6, %opencl.clk_event_t* addrspace(4)* %7, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*)* @__device_side_enqueue_block_invoke_2_kernel to i8*) to i8 addrspace(4)*), i8 addrspace(4)* %9)
804- %11 = alloca [1 x i32]
805- %12 = getelementptr [1 x i32], [1 x i32]* %11, i32 0, i32 0
806- %13 = load i8, i8* %c, align 1
807- %14 = zext i8 %13 to i32
808- store i32 %14, i32* %12, align 4
809+; CHECK-LLVM: call i32 @__enqueue_kernel_basic_events(%opencl.queue_t* {{.*}}, i32 {{.*}}, %struct.ndrange_t* {{.*}}, i32 2, %opencl.clk_event_t* addrspace(4)* {{.*}}, %opencl.clk_event_t* addrspace(4)* {{.*}}, i8 addrspace(4)* [[BlockInv3]], i8 addrspace(4)* [[Block3Ptr]])
810+
811+ %22 = call i32 @__enqueue_kernel_basic_events(%opencl.queue_t* %11, i32 %12, %struct.ndrange_t* %tmp4, i32 2, %opencl.clk_event_t* addrspace(4)* %15, %opencl.clk_event_t* addrspace(4)* %16, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*)* @__device_side_enqueue_block_invoke_2_kernel to i8*) to i8 addrspace(4)*), i8 addrspace(4)* %21)
812+ %23 = load %opencl.queue_t*, %opencl.queue_t** %default_queue, align 4
813+ %24 = load i32, i32* %flags, align 4
814+ %25 = bitcast %struct.ndrange_t* %tmp11 to i8*
815+ %26 = bitcast %struct.ndrange_t* %ndrange to i8*
816+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %25, i8* align 4 %26, i32 4, i1 false)
817+ %arraydecay = getelementptr inbounds [1 x %opencl.clk_event_t*], [1 x %opencl.clk_event_t*]* %event_wait_list2, i32 0, i32 0
818+ %27 = addrspacecast %opencl.clk_event_t** %arraydecay to %opencl.clk_event_t* addrspace(4)*
819+ %28 = addrspacecast %opencl.clk_event_t** %clk_event to %opencl.clk_event_t* addrspace(4)*
820+ %29 = getelementptr [1 x i32], [1 x i32]* %block_sizes, i32 0, i32 0
821+ %30 = load i8, i8* %c, align 1
822+ %31 = zext i8 %30 to i32
823+ store i32 %31, i32* %29, align 4
824
825 ; CHECK-SPIRV: PtrAccessChain [[Int32LocPtrTy]] [[LocalBuf31:[0-9]+]]
826 ; CHECK-SPIRV: Bitcast {{[0-9]+}} [[BlockLit3Tmp:[0-9]+]] [[BlockGlb1:[0-9]+]]
827@@ -182,14 +225,18 @@ entry:
828 ; CHECK-LLVM: [[BlockInv0:%[0-9]+]] = addrspacecast void (i8 addrspace(4)*, i8 addrspace(3)*)* @__device_side_enqueue_block_invoke_3_kernel to i8 addrspace(4)*
829 ; CHECK-LLVM: call i32 @__enqueue_kernel_events_varargs(%opencl.queue_t* {{.*}}, i32 {{.*}}, %struct.ndrange_t* {{.*}}, i32 2, %opencl.clk_event_t* addrspace(4)* {{.*}}, %opencl.clk_event_t* addrspace(4)* {{.*}}, i8 addrspace(4)* [[BlockInv0]], i8 addrspace(4)* [[Block0]], i32 1, i32* {{.*}})
830
831- %15 = call i32 @__enqueue_kernel_events_varargs(%opencl.queue_t* %1, i32 %2, %struct.ndrange_t* %ndrange, i32 2, %opencl.clk_event_t* addrspace(4)* %6, %opencl.clk_event_t* addrspace(4)* %7, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* @__device_side_enqueue_block_invoke_3_kernel to i8*) to i8 addrspace(4)*), i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32 } addrspace(1)* @__block_literal_global to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i32* %12)
832- %16 = alloca [3 x i32]
833- %17 = getelementptr [3 x i32], [3 x i32]* %16, i32 0, i32 0
834- store i32 1, i32* %17, align 4
835- %18 = getelementptr [3 x i32], [3 x i32]* %16, i32 0, i32 1
836- store i32 2, i32* %18, align 4
837- %19 = getelementptr [3 x i32], [3 x i32]* %16, i32 0, i32 2
838- store i32 4, i32* %19, align 4
839+ %32 = call i32 @__enqueue_kernel_events_varargs(%opencl.queue_t* %23, i32 %24, %struct.ndrange_t* %tmp11, i32 2, %opencl.clk_event_t* addrspace(4)* %27, %opencl.clk_event_t* addrspace(4)* %28, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* @__device_side_enqueue_block_invoke_3_kernel to i8*) to i8 addrspace(4)*), i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i32* %29)
840+ %33 = load %opencl.queue_t*, %opencl.queue_t** %default_queue, align 4
841+ %34 = load i32, i32* %flags, align 4
842+ %35 = bitcast %struct.ndrange_t* %tmp12 to i8*
843+ %36 = bitcast %struct.ndrange_t* %ndrange to i8*
844+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %35, i8* align 4 %36, i32 4, i1 false)
845+ %37 = getelementptr [3 x i32], [3 x i32]* %block_sizes13, i32 0, i32 0
846+ store i32 1, i32* %37, align 4
847+ %38 = getelementptr [3 x i32], [3 x i32]* %block_sizes13, i32 0, i32 1
848+ store i32 2, i32* %38, align 4
849+ %39 = getelementptr [3 x i32], [3 x i32]* %block_sizes13, i32 0, i32 2
850+ store i32 4, i32* %39, align 4
851
852 ; CHECK-SPIRV: PtrAccessChain [[Int32LocPtrTy]] [[LocalBuf41:[0-9]+]]
853 ; CHECK-SPIRV: PtrAccessChain [[Int32LocPtrTy]] [[LocalBuf42:[0-9]+]]
854@@ -206,24 +253,27 @@ entry:
855 ; CHECK-LLVM: [[BlockInv1:%[0-9]+]] = addrspacecast void (i8 addrspace(4)*, i8 addrspace(3)*, i8 addrspace(3)*, i8 addrspace(3)*)* @__device_side_enqueue_block_invoke_4_kernel to i8 addrspace(4)*
856 ; CHECK-LLVM: call i32 @__enqueue_kernel_events_varargs(%opencl.queue_t* {{.*}}, i32 {{.*}}, %struct.ndrange_t* {{.*}}, i32 0, %opencl.clk_event_t* addrspace(4)* null, %opencl.clk_event_t* addrspace(4)* null, i8 addrspace(4)* [[BlockInv1]], i8 addrspace(4)* [[Block1]], i32 3, i32* {{.*}})
857
858- %20 = call i32 @__enqueue_kernel_varargs(%opencl.queue_t* %1, i32 %2, %struct.ndrange_t* %ndrange, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*, i8 addrspace(3)*, i8 addrspace(3)*)* @__device_side_enqueue_block_invoke_4_kernel to i8*) to i8 addrspace(4)*), i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32 } addrspace(1)* @__block_literal_global.1 to i8 addrspace(1)*) to i8 addrspace(4)*), i32 3, i32* %17)
859+ %40 = call i32 @__enqueue_kernel_varargs(%opencl.queue_t* %33, i32 %34, %struct.ndrange_t* %tmp12, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*, i8 addrspace(3)*, i8 addrspace(3)*)* @__device_side_enqueue_block_invoke_4_kernel to i8*) to i8 addrspace(4)*), i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global.1 to i8 addrspace(1)*) to i8 addrspace(4)*), i32 3, i32* %37)
860 ret void
861 }
862
863+; Function Attrs: argmemonly nounwind
864+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture writeonly, i8* nocapture readonly, i32, i1) #1
865+
866 ; Function Attrs: convergent noinline nounwind optnone
867 define internal spir_func void @__device_side_enqueue_block_invoke(i8 addrspace(4)* %.block_descriptor) #2 {
868 entry:
869 %.block_descriptor.addr = alloca i8 addrspace(4)*, align 4
870- %block.addr = alloca <{ i32, i32, i32 addrspace(1)*, i32, i8 }> addrspace(4)*, align 4
871+ %block.addr = alloca <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }> addrspace(4)*, align 4
872 store i8 addrspace(4)* %.block_descriptor, i8 addrspace(4)** %.block_descriptor.addr, align 4
873- %block = bitcast i8 addrspace(4)* %.block_descriptor to <{ i32, i32, i32 addrspace(1)*, i32, i8 }> addrspace(4)*
874- store <{ i32, i32, i32 addrspace(1)*, i32, i8 }> addrspace(4)* %block, <{ i32, i32, i32 addrspace(1)*, i32, i8 }> addrspace(4)** %block.addr, align 4
875- %block.capture.addr = getelementptr inbounds <{ i32, i32, i32 addrspace(1)*, i32, i8 }>, <{ i32, i32, i32 addrspace(1)*, i32, i8 }> addrspace(4)* %block, i32 0, i32 4
876+ %block = bitcast i8 addrspace(4)* %.block_descriptor to <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }> addrspace(4)*
877+ store <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }> addrspace(4)* %block, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }> addrspace(4)** %block.addr, align 4
878+ %block.capture.addr = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }> addrspace(4)* %block, i32 0, i32 5
879 %0 = load i8, i8 addrspace(4)* %block.capture.addr, align 4
880 %conv = sext i8 %0 to i32
881- %block.capture.addr1 = getelementptr inbounds <{ i32, i32, i32 addrspace(1)*, i32, i8 }>, <{ i32, i32, i32 addrspace(1)*, i32, i8 }> addrspace(4)* %block, i32 0, i32 2
882+ %block.capture.addr1 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }> addrspace(4)* %block, i32 0, i32 3
883 %1 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* %block.capture.addr1, align 4
884- %block.capture.addr2 = getelementptr inbounds <{ i32, i32, i32 addrspace(1)*, i32, i8 }>, <{ i32, i32, i32 addrspace(1)*, i32, i8 }> addrspace(4)* %block, i32 0, i32 3
885+ %block.capture.addr2 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }> addrspace(4)* %block, i32 0, i32 4
886 %2 = load i32, i32 addrspace(4)* %block.capture.addr2, align 4
887 %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %1, i32 %2
888 store i32 %conv, i32 addrspace(1)* %arrayidx, align 4
889@@ -243,19 +293,19 @@ declare i32 @__enqueue_kernel_basic(%opencl.queue_t*, i32, %struct.ndrange_t*, i
890 define internal spir_func void @__device_side_enqueue_block_invoke_2(i8 addrspace(4)* %.block_descriptor) #2 {
891 entry:
892 %.block_descriptor.addr = alloca i8 addrspace(4)*, align 4
893- %block.addr = alloca <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)*, align 4
894+ %block.addr = alloca <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)*, align 4
895 store i8 addrspace(4)* %.block_descriptor, i8 addrspace(4)** %.block_descriptor.addr, align 4
896- %block = bitcast i8 addrspace(4)* %.block_descriptor to <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)*
897- store <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)* %block, <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)** %block.addr, align 4
898- %block.capture.addr = getelementptr inbounds <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)* %block, i32 0, i32 4
899+ %block = bitcast i8 addrspace(4)* %.block_descriptor to <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)*
900+ store <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)* %block, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)** %block.addr, align 4
901+ %block.capture.addr = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)* %block, i32 0, i32 5
902 %0 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* %block.capture.addr, align 4
903- %block.capture.addr1 = getelementptr inbounds <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)* %block, i32 0, i32 3
904+ %block.capture.addr1 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)* %block, i32 0, i32 4
905 %1 = load i32, i32 addrspace(4)* %block.capture.addr1, align 4
906 %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %0, i32 %1
907 %2 = load i32, i32 addrspace(1)* %arrayidx, align 4
908- %block.capture.addr2 = getelementptr inbounds <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)* %block, i32 0, i32 2
909+ %block.capture.addr2 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)* %block, i32 0, i32 3
910 %3 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* %block.capture.addr2, align 4
911- %block.capture.addr3 = getelementptr inbounds <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)* %block, i32 0, i32 3
912+ %block.capture.addr3 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)* %block, i32 0, i32 4
913 %4 = load i32, i32 addrspace(4)* %block.capture.addr3, align 4
914 %arrayidx4 = getelementptr inbounds i32, i32 addrspace(1)* %3, i32 %4
915 store i32 %2, i32 addrspace(1)* %arrayidx4, align 4
916@@ -276,11 +326,11 @@ define internal spir_func void @__device_side_enqueue_block_invoke_3(i8 addrspac
917 entry:
918 %.block_descriptor.addr = alloca i8 addrspace(4)*, align 4
919 %p.addr = alloca i8 addrspace(3)*, align 4
920- %block.addr = alloca <{ i32, i32 }> addrspace(4)*, align 4
921+ %block.addr = alloca <{ i32, i32, i8 addrspace(4)* }> addrspace(4)*, align 4
922 store i8 addrspace(4)* %.block_descriptor, i8 addrspace(4)** %.block_descriptor.addr, align 4
923- %block = bitcast i8 addrspace(4)* %.block_descriptor to <{ i32, i32 }> addrspace(4)*
924+ %block = bitcast i8 addrspace(4)* %.block_descriptor to <{ i32, i32, i8 addrspace(4)* }> addrspace(4)*
925 store i8 addrspace(3)* %p, i8 addrspace(3)** %p.addr, align 4
926- store <{ i32, i32 }> addrspace(4)* %block, <{ i32, i32 }> addrspace(4)** %block.addr, align 4
927+ store <{ i32, i32, i8 addrspace(4)* }> addrspace(4)* %block, <{ i32, i32, i8 addrspace(4)* }> addrspace(4)** %block.addr, align 4
928 ret void
929 }
930
931@@ -300,13 +350,13 @@ entry:
932 %p1.addr = alloca i8 addrspace(3)*, align 4
933 %p2.addr = alloca i8 addrspace(3)*, align 4
934 %p3.addr = alloca i8 addrspace(3)*, align 4
935- %block.addr = alloca <{ i32, i32 }> addrspace(4)*, align 4
936+ %block.addr = alloca <{ i32, i32, i8 addrspace(4)* }> addrspace(4)*, align 4
937 store i8 addrspace(4)* %.block_descriptor, i8 addrspace(4)** %.block_descriptor.addr, align 4
938- %block = bitcast i8 addrspace(4)* %.block_descriptor to <{ i32, i32 }> addrspace(4)*
939+ %block = bitcast i8 addrspace(4)* %.block_descriptor to <{ i32, i32, i8 addrspace(4)* }> addrspace(4)*
940 store i8 addrspace(3)* %p1, i8 addrspace(3)** %p1.addr, align 4
941 store i8 addrspace(3)* %p2, i8 addrspace(3)** %p2.addr, align 4
942 store i8 addrspace(3)* %p3, i8 addrspace(3)** %p3.addr, align 4
943- store <{ i32, i32 }> addrspace(4)* %block, <{ i32, i32 }> addrspace(4)** %block.addr, align 4
944+ store <{ i32, i32, i8 addrspace(4)* }> addrspace(4)* %block, <{ i32, i32, i8 addrspace(4)* }> addrspace(4)** %block.addr, align 4
945 ret void
946 }
947
948@@ -329,27 +379,20 @@ declare i32 @__enqueue_kernel_varargs(%opencl.queue_t*, i32, %struct.ndrange_t*,
949 ; CHECK-LLVM-DAG: define spir_kernel void @__device_side_enqueue_block_invoke_3_kernel(i8 addrspace(4)*, i8 addrspace(3)*)
950 ; CHECK-LLVM-DAG: define spir_kernel void @__device_side_enqueue_block_invoke_4_kernel(i8 addrspace(4)*, i8 addrspace(3)*, i8 addrspace(3)*, i8 addrspace(3)*)
951
952-attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "uniform-work-group-size"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
953+attributes #0 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "uniform-work-group-size"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
954 attributes #1 = { argmemonly nounwind }
955-attributes #2 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
956+attributes #2 = { convergent noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
957 attributes #3 = { nounwind }
958
959 !llvm.module.flags = !{!0}
960-!opencl.enable.FP_CONTRACT = !{}
961 !opencl.ocl.version = !{!1}
962 !opencl.spir.version = !{!1}
963-!opencl.used.extensions = !{!2}
964-!opencl.used.optional.core.features = !{!2}
965-!opencl.compiler.options = !{!2}
966-!llvm.ident = !{!3}
967+!llvm.ident = !{!2}
968
969 !0 = !{i32 1, !"wchar_size", i32 4}
970 !1 = !{i32 2, i32 0}
971-!2 = !{}
972-!3 = !{!"clang version 7.0.0"}
973-!4 = !{i32 1, i32 1, i32 0, i32 0}
974-!5 = !{!"none", !"none", !"none", !"none"}
975-!6 = !{!"int*", !"int*", !"int", !"char"}
976-!7 = !{!"", !"", !"", !""}
977-!8 = !{i1 false, i1 false, i1 false, i1 false}
978-!9 = !{i32 0, i32 0, i32 0, i32 0}
979+!2 = !{!"clang version 9.0.0 (https://llvm.org/git/clang 04fb8964a801a5c5d7baa5a22272243a7d183896) (https://llvm.org/git/llvm 384f64397f6ad95a361b72d62c07d7bac9f24163)"}
980+!3 = !{i32 1, i32 1, i32 0, i32 0}
981+!4 = !{!"none", !"none", !"none", !"none"}
982+!5 = !{!"int*", !"int*", !"int", !"char"}
983+!6 = !{!"", !"", !"", !""}
984--
9852.7.4
986