diff --git a/clang/lib/Basic/Targets/SPIR.h b/clang/lib/Basic/Targets/SPIR.h index 1f25d75932372..70874227cf939 100644 --- a/clang/lib/Basic/Targets/SPIR.h +++ b/clang/lib/Basic/Targets/SPIR.h @@ -128,8 +128,13 @@ class LLVM_LIBRARY_VISIBILITY BaseSPIRTargetInfo : public TargetInfo { } CallingConvCheckResult checkCallingConvention(CallingConv CC) const override { - return (CC == CC_SpirFunction || CC == CC_OpenCLKernel) ? CCCR_OK - : CCCR_Warning; + return (CC == CC_SpirFunction || CC == CC_OpenCLKernel || + // Permit CC_X86RegCall which is used to mark external functions + // with explicit simd or structure type arguments to pass them via + // registers. + CC == CC_X86RegCall) + ? CCCR_OK + : CCCR_Warning; } CallingConv getDefaultCallingConv() const override { @@ -286,8 +291,10 @@ class LLVM_LIBRARY_VISIBILITY WindowsX86_64_SPIR64TargetInfo } CallingConvCheckResult checkCallingConvention(CallingConv CC) const override { - if (CC == CC_X86VectorCall) + if (CC == CC_X86VectorCall || CC == CC_X86RegCall) // Permit CC_X86VectorCall which is used in Microsoft headers + // Permit CC_X86RegCall which is used to mark external functions with + // explicit simd or structure type arguments to pass them via registers. return CCCR_OK; return (CC == CC_SpirFunction || CC == CC_OpenCLKernel) ? CCCR_OK : CCCR_Warning; diff --git a/clang/lib/CodeGen/TargetInfo.cpp b/clang/lib/CodeGen/TargetInfo.cpp index a0d38dcb798b8..d5a5d843384ee 100644 --- a/clang/lib/CodeGen/TargetInfo.cpp +++ b/clang/lib/CodeGen/TargetInfo.cpp @@ -10286,6 +10286,11 @@ class CommonSPIRABIInfo : public DefaultABIInfo { ABIArgInfo classifyKernelArgumentType(QualType Ty) const; + // Add new functions rather than overload existing so that these public APIs + // can't be blindly misused with wrong calling convention. + ABIArgInfo classifyRegcallReturnType(QualType RetTy) const; + ABIArgInfo classifyRegcallArgumentType(QualType RetTy) const; + void computeInfo(CGFunctionInfo &FI) const override; private: @@ -10305,17 +10310,114 @@ ABIArgInfo CommonSPIRABIInfo::classifyKernelArgumentType(QualType Ty) const { void CommonSPIRABIInfo::computeInfo(CGFunctionInfo &FI) const { llvm::CallingConv::ID CC = FI.getCallingConvention(); + bool IsRegCall = CC == llvm::CallingConv::X86_RegCall; - if (!getCXXABI().classifyReturnType(FI)) - FI.getReturnInfo() = classifyReturnType(FI.getReturnType()); + if (!getCXXABI().classifyReturnType(FI)) { + CanQualType RetT = FI.getReturnType(); + FI.getReturnInfo() = + IsRegCall ? classifyRegcallReturnType(RetT) : classifyReturnType(RetT); + } for (auto &Arg : FI.arguments()) { if (CC == llvm::CallingConv::SPIR_KERNEL) { Arg.info = classifyKernelArgumentType(Arg.type); } else { - Arg.info = classifyArgumentType(Arg.type); + Arg.info = IsRegCall ? classifyRegcallArgumentType(Arg.type) + : classifyArgumentType(Arg.type); + } + } +} + +// The two functions below are based on AMDGPUABIInfo, but without any +// restriction on the maximum number of arguments passed via registers. +// SPIRV BEs are expected to further adjust the calling convention as +// needed (use stack or byval-like passing) for some of the arguments. + +ABIArgInfo CommonSPIRABIInfo::classifyRegcallReturnType(QualType RetTy) const { + if (isAggregateTypeForABI(RetTy)) { + // Records with non-trivial destructors/copy-constructors should not be + // returned by value. + if (!getRecordArgABI(RetTy, getCXXABI())) { + // Ignore empty structs/unions. + if (isEmptyRecord(getContext(), RetTy, true)) + return ABIArgInfo::getIgnore(); + + // Lower single-element structs to just return a regular value. + if (const Type *SeltTy = isSingleElementStruct(RetTy, getContext())) + return ABIArgInfo::getDirect(CGT.ConvertType(QualType(SeltTy, 0))); + + if (const RecordType *RT = RetTy->getAs()) { + const RecordDecl *RD = RT->getDecl(); + if (RD->hasFlexibleArrayMember()) + return classifyReturnType(RetTy); + } + + // Pack aggregates <= 8 bytes into a single vector register or pair. + // TODO make this parameterizeable/adjustable depending on spir target + // triple abi component. + uint64_t Size = getContext().getTypeSize(RetTy); + if (Size <= 16) + return ABIArgInfo::getDirect(llvm::Type::getInt16Ty(getVMContext())); + + if (Size <= 32) + return ABIArgInfo::getDirect(llvm::Type::getInt32Ty(getVMContext())); + + if (Size <= 64) { + llvm::Type *I32Ty = llvm::Type::getInt32Ty(getVMContext()); + return ABIArgInfo::getDirect(llvm::ArrayType::get(I32Ty, 2)); + } + return ABIArgInfo::getDirect(); } } + // Otherwise just do the default thing. + return classifyReturnType(RetTy); +} + +ABIArgInfo CommonSPIRABIInfo::classifyRegcallArgumentType(QualType Ty) const { + Ty = useFirstFieldIfTransparentUnion(Ty); + + if (isAggregateTypeForABI(Ty)) { + // Records with non-trivial destructors/copy-constructors should not be + // passed by value. + if (auto RAA = getRecordArgABI(Ty, getCXXABI())) + return getNaturalAlignIndirect(Ty, RAA == CGCXXABI::RAA_DirectInMemory); + + // Ignore empty structs/unions. + if (isEmptyRecord(getContext(), Ty, true)) + return ABIArgInfo::getIgnore(); + + // Lower single-element structs to just pass a regular value. TODO: We + // could do reasonable-size multiple-element structs too, using getExpand(), + // though watch out for things like bitfields. + if (const Type *SeltTy = isSingleElementStruct(Ty, getContext())) + return ABIArgInfo::getDirect(CGT.ConvertType(QualType(SeltTy, 0))); + + if (const RecordType *RT = Ty->getAs()) { + const RecordDecl *RD = RT->getDecl(); + if (RD->hasFlexibleArrayMember()) + return classifyArgumentType(Ty); + } + + // Pack aggregates <= 8 bytes into single vector register or pair. + // TODO make this parameterizeable/adjustable depending on spir target + // triple abi component. + uint64_t Size = getContext().getTypeSize(Ty); + if (Size <= 64) { + if (Size <= 16) + return ABIArgInfo::getDirect(llvm::Type::getInt16Ty(getVMContext())); + + if (Size <= 32) + return ABIArgInfo::getDirect(llvm::Type::getInt32Ty(getVMContext())); + + // XXX: Should this be i64 instead, and should the limit increase? + llvm::Type *I32Ty = llvm::Type::getInt32Ty(getVMContext()); + return ABIArgInfo::getDirect(llvm::ArrayType::get(I32Ty, 2)); + } + return ABIArgInfo::getDirect(); + } + + // Otherwise just do the default thing. + return classifyArgumentType(Ty); } class SPIRVABIInfo : public CommonSPIRABIInfo { diff --git a/clang/test/CodeGenSYCL/regcall-cc-test.cpp b/clang/test/CodeGenSYCL/regcall-cc-test.cpp new file mode 100644 index 0000000000000..4d0118fcd8714 --- /dev/null +++ b/clang/test/CodeGenSYCL/regcall-cc-test.cpp @@ -0,0 +1,359 @@ +// clang-format off +// RUN: %clang_cc1 -fsycl-is-device -triple spir64-unknown-unknown -disable-llvm-passes -O0 -emit-llvm %s -o - | FileCheck %s + +// This test checks SYCL device compiler code generation for the __regcall +// functions. This calling convention makes return values and function arguments +// passed as values (through virtual registers) in most cases. + +// CHECK-DAG: target triple = "spir64-unknown-unknown" + +// ------------------- Positive test cases (pass by value) + +template using raw_vector = + T __attribute__((ext_vector_type(N))); + +template +struct simd { + raw_vector val; +}; + +#ifdef __SYCL_DEVICE_ONLY__ +#define SYCL_DEVICE __attribute__((sycl_device)) +#else +#define SYCL_DEVICE +#endif + +template T __regcall func(T x) { return x.foo(); } + +// === TEST CASE: invoke_simd scenario, when sycl::ext::intel::esimd::simd +// objects used as return value and parameters + +SYCL_DEVICE simd __regcall SCALE(simd v); +// CHECK-DAG: declare x86_regcallcc <8 x float> @_Z17__regcall3__SCALE4simdIfLi8EE(<8 x float>) + +SYCL_DEVICE simd __regcall foo(simd x) { + return SCALE(x); +// CHECK-DAG: %{{[0-9a-zA-Z_.]+}} = call x86_regcallcc <8 x float> @_Z17__regcall3__SCALE4simdIfLi8EE(<8 x float> %{{[0-9a-zA-Z_.]+}}) +} + +// === TEST CASE: nested struct with different types of fields + +struct C { + float x, y; +}; +// CHECK-DAG: %struct.C = type { float, float } + +struct PassAsByval { + C a; + int *b; + raw_vector c; +}; +// CHECK-DAG: %struct.PassAsByval = type { %struct.C, i32 addrspace(4)*, <3 x float> } + +SYCL_DEVICE PassAsByval __regcall bar(PassAsByval x) { +// CHECK-DAG: define dso_local x86_regcallcc %struct.PassAsByval @_Z15__regcall3__bar11PassAsByval(%struct.C %{{[0-9a-zA-Z_.]+}}, i32 addrspace(4)* %{{[0-9a-zA-Z_.]+}}, <3 x float> %{{[0-9a-zA-Z_.]+}}) + x.a.x += 1; + return x; +} + +// === TEST CASE: multi-level nested structs with single primitive type element at the bottom + +struct A1 { char x; }; +struct B1 { A1 a; }; +struct C1 { + B1 b; + C1 foo() { return *this; } +}; +// CHECK-DAG: %struct.C1 = type { %struct.B1 } +// CHECK-DAG: %struct.B1 = type { %struct.A1 } +// CHECK-DAG: %struct.A1 = type { i8 } + +template SYCL_DEVICE C1 __regcall func(C1 x); +// CHECK-DAG: define weak_odr x86_regcallcc i8 @_Z16__regcall3__funcI2C1ET_S1_(i8 %{{[0-9a-zA-Z_.]+}}) + +// === TEST CASE: multi-level nested structs with multiple elements at all levels + +struct A2 { char x; }; +struct B2 { A2 a; int* ptr; }; +struct C2 { + B2 b; + double c; + + C2 foo() { return *this; } +}; + +// CHECK-DAG: %struct.C2 = type { %struct.B2, double } +// CHECK-DAG: %struct.B2 = type { %struct.A2, i32 addrspace(4)* } +// CHECK-DAG: %struct.A2 = type { i8 } + +template SYCL_DEVICE C2 __regcall func(C2 x); +// CHECK-DAG: define weak_odr x86_regcallcc %struct.C2 @_Z16__regcall3__funcI2C2ET_S1_(%struct.B2 %{{[0-9a-zA-Z_.]+}}, double %{{[0-9a-zA-Z_.]+}}) + +// === TEST CASE: multi-level nested structs with one primitive type element at +// the bottom, and one - at the top. The nested struct at the top is expected to +// get "unwrapped" by the compiler evaporating to the single element at the +// bottom. + +struct A3 { char x; }; +struct B3 { A3 a; }; // unwrapped +struct C3 { // unwrapped + B3 b; + char c; + + C3 foo() { return *this; } +}; + +// CHECK-DAG: %struct.C3 = type { %struct.B3, i8 } +// CHECK-DAG: %struct.B3 = type { %struct.A3 } +// CHECK-DAG: %struct.A3 = type { i8 } + +template SYCL_DEVICE C3 __regcall func(C3 x); +// CHECK-DAG: define weak_odr x86_regcallcc i16 @_Z16__regcall3__funcI2C3ET_S1_(i16 %{{[0-9a-zA-Z_.]+}}) + +// === TEST CASE: multi-level nested structs with a pointer field at the top +// level. 1 step-deep unwrapping for a function argument type and no unwrapping +// for the return type is expected to happen. + +struct A4 { char x; }; +struct B4 { A4 a; }; +struct C4 { + B4 b; + int *ptr; + + C4 foo() { return *this; } +}; + +// CHECK-DAG: %struct.C4 = type { %struct.B4, i32 addrspace(4)* } +// CHECK-DAG: %struct.B4 = type { %struct.A4 } +// CHECK-DAG: %struct.A4 = type { i8 } + +template SYCL_DEVICE C4 __regcall func(C4 x); +// CHECK-DAG: define weak_odr x86_regcallcc %struct.C4 @_Z16__regcall3__funcI2C4ET_S1_(%struct.B4 %{{[0-9a-zA-Z_.]+}}, i32 addrspace(4)* %{{[0-9a-zA-Z_.]+}}) + +// === TEST CASE: multi-level nested structs with only leaf fields of primitive +// types. Unwrapping and merging should yield 2 32-bit integers + +struct A5a { char x; char y; }; +struct A5b { char x; char y; }; +struct B5 { A5a a; A5b b; }; +struct C5 { + B5 b1; + B5 b2; + + C5 foo() { return *this; } +}; + +// CHECK-DAG: %struct.C5 = type { %struct.B5, %struct.B5 } +// CHECK-DAG: %struct.B5 = type { %struct.A5a, %struct.A5b } +// CHECK-DAG: %struct.A5a = type { i8, i8 } +// CHECK-DAG: %struct.A5b = type { i8, i8 } + +template SYCL_DEVICE C5 __regcall func(C5 x); +// CHECK-DAG: define weak_odr x86_regcallcc [2 x i32] @_Z16__regcall3__funcI2C5ET_S1_([2 x i32] %{{[0-9a-zA-Z_.]+}}) + +// === TEST CASE: multi-level nested structs with only leaf fields of primitive +// types. Unwrapping and merging should yield 2 32-bit integers + +struct B6 { int *a; int b; }; +struct C6 { + B6 b; + char x; + char y; + + C6 foo() { return *this; } +}; + +// CHECK-DAG: %struct.C6 = type { %struct.B6, i8, i8 } +// CHECK-DAG: %struct.B6 = type { i32 addrspace(4)*, i32 } + +template SYCL_DEVICE C6 __regcall func(C6 x); +// CHECK-DAG: define weak_odr x86_regcallcc %struct.C6 @_Z16__regcall3__funcI2C6ET_S1_(%struct.B6 %{{[0-9a-zA-Z_.]+}}, i8 %{{[0-9a-zA-Z_.]+}}, i8 %{{[0-9a-zA-Z_.]+}}) + +// === TEST CASE: a struct with sizeof() <= 2 is passed as a single 16-bit integer + +struct CharChar { + char a; + char b; + + CharChar foo() { return *this; } +}; +// CHECK-DAG: %struct.CharChar = type { i8, i8 } + +template SYCL_DEVICE CharChar __regcall func(CharChar x); +// CHECK-DAG: define weak_odr x86_regcallcc i16 @_Z16__regcall3__funcI8CharCharET_S1_(i16 %{{[0-9a-zA-Z_.]+}}) + +// === TEST CASE: a struct with sizeof() == 3-4 is passed as single 32-bit integer + +struct ShortShort { + short a; + short b; + + ShortShort foo() { return *this; } +}; +// CHECK-DAG: %struct.ShortShort = type { i16, i16 } + +template SYCL_DEVICE ShortShort __regcall func(ShortShort x); +// CHECK-DAG: define weak_odr x86_regcallcc i32 @_Z16__regcall3__funcI10ShortShortET_S1_(i32 %{{[0-9a-zA-Z_.]+}}) + +struct CharShort { + char a; + short b; + + CharShort foo() { return *this; } +}; +// CHECK-DAG: %struct.CharShort = type { i8, i16 } + +template SYCL_DEVICE CharShort __regcall func(CharShort x); +// CHECK-DAG: define weak_odr x86_regcallcc i32 @_Z16__regcall3__funcI9CharShortET_S1_(i32 %{{[0-9a-zA-Z_.]+}}) + +// === TEST CASE: a struct with primitive single field element is just unwrapped + +struct Char { + char a; + + Char foo() { return *this; } +}; +// CHECK-DAG: %struct.Char = type { i8 } + +template SYCL_DEVICE Char __regcall func(Char x); +// CHECK-DAG: define weak_odr x86_regcallcc i8 @_Z16__regcall3__funcI4CharET_S1_(i8 %{{[0-9a-zA-Z_.]+}}) + +struct Float { + float a; + + Float foo() { return *this; } +}; +// CHECK-DAG: %struct.Float = type { float } + +template SYCL_DEVICE Float __regcall func(Float x); +// CHECK-DAG: define weak_odr x86_regcallcc float @_Z16__regcall3__funcI5FloatET_S1_(float %{{[0-9a-zA-Z_.]+}}) + +// === TEST CASE: a struct with sizeof() == 5-8 is passed as two 32-bit integers +// 32-bit integers + +struct CharCharShortFloat { + char a, b; + short c; + float d; + + CharCharShortFloat foo() { return *this; } +}; +// CHECK-DAG: %struct.CharCharShortFloat = type { i8, i8, i16, float } + +template SYCL_DEVICE CharCharShortFloat __regcall func(CharCharShortFloat x); +// CHECK-DAG: define weak_odr x86_regcallcc [2 x i32] @_Z16__regcall3__funcI18CharCharShortFloatET_S1_([2 x i32] %{{[0-9a-zA-Z_.]+}}) + +// === TEST CASE: a struct with some of the fields padded and sizeof() > 8 +// * when passed as argument, it is broken into constituents +// * is returned by value + +struct CharFloatCharShort { + char a; + float b; + char c; + short d; + + CharFloatCharShort foo() { return *this; } +}; + +// CHECK-DAG: %struct.CharFloatCharShort = type { i8, float, i8, i16 } + +template SYCL_DEVICE CharFloatCharShort __regcall func(CharFloatCharShort x); +// CHECK-DAG: define weak_odr x86_regcallcc %struct.CharFloatCharShort @_Z16__regcall3__funcI18CharFloatCharShortET_S1_(i8 %{{[0-9a-zA-Z_.]+}}, float %{{[0-9a-zA-Z_.]+}}, i8 %{{[0-9a-zA-Z_.]+}}, i16 %{{[0-9a-zA-Z_.]+}}) + +struct CharDoubleCharLonglong { + char a; + double b; + char c; + long long d; + + CharDoubleCharLonglong foo() { return *this; } +}; + +// CHECK-DAG: %struct.CharDoubleCharLonglong = type { i8, double, i8, i64 } + +template SYCL_DEVICE CharDoubleCharLonglong __regcall func(CharDoubleCharLonglong x); +// CHECK-DAG: define weak_odr x86_regcallcc %struct.CharDoubleCharLonglong @_Z16__regcall3__funcI22CharDoubleCharLonglongET_S1_(i8 %{{[0-9a-zA-Z_.]+}}, double %{{[0-9a-zA-Z_.]+}}, i8 %{{[0-9a-zA-Z_.]+}}, i64 %{{[0-9a-zA-Z_.]+}}) + + +// === TEST CASE: a struct of 130x4-byte elements is still passed by value + +struct StillPassThroughRegisters { + // 130 total: + int a, a01, a02, a03, a04, a05, a06, a07, a08, a09, + a10, a11, a12, a13, a14, a15, a16, a17, a18, a19, + a20, a21, a22, a23, a24, a25, a26, a27, a28, a29, + a30, a31, a32, a33, a34, a35, a36, a37, a38, a39, + a40, a41, a42, a43, a44, a45, a46, a47, a48, a49, + a50, a51, a52, a53, a54, a55, a56, a57, a58, a59, + a60, a61, a62, a63, a64, a65, a66, a67, a68, a69, + a70, a71, a72, a73, a74, a75, a76, a77, a78, a79, + a80, a81, a82, a83, a84, a85, a86, a87, a88, a89, + a90, a91, a92, a93, a94, a95, a96, a97, a98, a99, + aa0, aa1, aa2, aa3, aa4, aa5, aa6, aa7, aa8, aa9, + ab0, ab1, ab2, ab3, ab4, ab5, ab6, ab7, ab8, ab9, + ac0, ac1, ac2, ac3, ac4, ac5, ac6, ac7, ac8, ac9; + + StillPassThroughRegisters foo() { return *this; } +}; +// CHECK-DAG: %struct.StillPassThroughRegisters = type { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } + + +template SYCL_DEVICE StillPassThroughRegisters __regcall func(StillPassThroughRegisters x); +// CHECK-DAG: define weak_odr x86_regcallcc %struct.StillPassThroughRegisters @_Z16__regcall3__funcI25StillPassThroughRegistersET_S1_(i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}) + +// === TEST CASE: class with multiple inheritance is passed by value + +class Class0 { int x0; }; +class Class1 { int x1; }; +class ClassX : public Class0, public Class1 { int x; }; +class ClassY { int y; }; +class ClassXY : public ClassX, public ClassY { + int xy; +public: + ClassXY foo() { return *this; } +}; +// CHECK-DAG: %class.ClassXY = type { %class.ClassX, %class.ClassY, i32 } +// CHECK-DAG: %class.ClassX = type { %class.Class0, %class.Class1, i32 } +// CHECK-DAG: %class.Class0 = type { i32 } +// CHECK-DAG: %class.Class1 = type { i32 } +// CHECK-DAG: %class.ClassY = type { i32 } + +template SYCL_DEVICE ClassXY __regcall func(ClassXY x); +// CHECK-DAG: define weak_odr x86_regcallcc %class.ClassXY @_Z16__regcall3__funcI7ClassXYET_S1_(%class.ClassX %{{[0-9a-zA-Z_.]+}}, %class.ClassY %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}) + +// ------------------- Negative test cases (pass via memory) + +// === TEST CASE: no copy constructor -> pass by pointer +struct NonCopyable { + NonCopyable(int a) : a(a) {} + NonCopyable(const NonCopyable&) = delete; + int a; +}; +// CHECK-DAG: %struct.NonCopyable = type { i32 } + +SYCL_DEVICE int __regcall bar(NonCopyable x) { +// CHECK-DAG: define dso_local x86_regcallcc noundef i32 @_Z15__regcall3__bar11NonCopyable(%struct.NonCopyable* noundef %x) + return x.a; +} + +// === TEST CASE: empty struct -> optimize out +struct Empty {}; +// CHECK-DAG: %struct.Empty = type + +SYCL_DEVICE int __regcall bar(Empty x) { +// CHECK-DAG: define dso_local x86_regcallcc noundef i32 @_Z15__regcall3__bar5Empty() + return 10; +} + +// === TEST CASE: struct ends with flexible array -> pass by pointer +struct EndsWithFlexArray { + int a; + int x[]; +}; +// CHECK-DAG: %struct.EndsWithFlexArray = type { i32, [0 x i32] } + +SYCL_DEVICE int __regcall bar(EndsWithFlexArray x) { +// CHECK-DAG: define dso_local x86_regcallcc noundef i32 @_Z15__regcall3__bar17EndsWithFlexArray(%struct.EndsWithFlexArray* noundef byval(%struct.EndsWithFlexArray) align 4 %x) + return x.a; +} diff --git a/sycl/doc/extensions/experimental/sycl_ext_intel_esimd/sycl_ext_intel_esimd.md b/sycl/doc/extensions/experimental/sycl_ext_intel_esimd/sycl_ext_intel_esimd.md index 3d42b34d2596e..e599f6138f205 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_intel_esimd/sycl_ext_intel_esimd.md +++ b/sycl/doc/extensions/experimental/sycl_ext_intel_esimd/sycl_ext_intel_esimd.md @@ -106,7 +106,7 @@ device-side API - `sycl::accessor::get_pointer()`. All memory accesses through an accessor are done via explicit APIs; e.g. `sycl::ext::intel::experimental::esimd::block_store(acc, offset)` - Accessors with offsets and/or access range specified -- `sycl::sampler` and `sycl::stream` classes +- `sycl::sampler` and `sycl::stream` classes ## Core Explicit SIMD programming APIs @@ -511,6 +511,112 @@ ESIMD_PRIVATE ESIMD_REGISTER(32) simd vc; ```
+### `__regcall` Calling convention. + +ESIMD supports `__regcall` calling convention (CC) in addition to the default +SPIR CC. This makes compiler try generate more efficient calls where arguments +of aggregate types (classes, structs, unions) are passed and values returned via +registers rather than memory. This matters most for external functions linked on +binary level, such as functions called via `invoke_simd`. Arguments and return +values ("ARV") are still passed or returned ("communicated") via a pointer if +their type is either of the following: +- a class or struct with deleted copy constructor +- an empty class or struct +- a class or struct ending with a flexible array member. For example: +`class A { int x[]; }` + +ARVs of all other aggregate types are communicated by value or "per-field". Some +fields can be replaced with 1 or 2 integer elements with total size being equal +or exceeding the total size of fields. The rules for communicating ARVs of these +types are part of the SPIR-V level function call ABI, and are described below. +This part of the ABI is defined in terms of LLVM IR types - it basically +tells how a specific source aggregate type is represented in resulting LLVM IR +when it (the type) is part of a signature of a function with linkage defined. + +Compiler uses aggregate type "unwrapping process" for communicating ARVs. +Unwrapping a structure with a single field results in the unwrapped type of +that field, so unwrapping is a recursive process. Unwrapped primitive type is +the primitive type itself. Structures with pointer fields are not unwrapped. +For example, unwrapping `Y` defined as +```cpp +struct X { int x; }; +struct Y { X x; }; +``` +results in `i32`. Unwrapping `C4` defined as +```cpp +struct A4 { char x; }; +struct B4 { A4 a; }; +struct C4 { + B4 b; + int *ptr; +}; +``` +results in { `%struct.B4`, `i32 addrspace(4)*` } pair of types. Thus, +unwrapping can result in a set of a structure, primitive or pointer types - +the "unwrapped type set". + +- If the unwrapped type set has only primitive types, then compiler will "merge" + the resulting types if their total size is less than or equal to 8 bytes. The total + size is calculated as `sizeof()`, and structure field + alignment rules can make it greater than the simple sum of `sizeof` of all + the types resulted from unwrapping. [Total size] to [merged type] + correspondence is as follows: + * 1-2 bytes - short + * 3-4 bytes - int + * 5-8 bytes - array of 2 ints + If the total size exceeds 8, then: + * a source parameter of this type is broken down into multiple parameters + with types resulted from unwrapping + * a source return value of this type keeps it (the type) +- If the unwrapped type set has non-primitive types, then merging does not + happen, in this case unwrapping for the return value does not happen as well. + +More examples of the unwrap/merge process: + +- For `C5` in + ```cpp + struct A5a { char x; char y; }; + struct A5b { char x; char y; }; + struct B5 { A5a a; A5b b; }; + struct C5 { + B5 b1; + B5 b2; + }; + ``` + The result is `[2 x i32]`. It is not `i32` because of padding rules, as + sizeof(C5) is 8 for the SPIRV target. +- For `C6` + ```cpp + struct B6 { int *a; int b; }; + struct C6 { + B6 b; + char x; + char y; + + C6 foo() { return *this; } + }; + ``` + the result depends whether this is a type of an argument or a return value. + * Argument: { `%struct.B6`, `i8`, `i8` } type set + * Return value: `%struct.C6` type. Where the struct LLVM types are defined + as: + ``` + %struct.C6 = type { %struct.B6, i8, i8 } + %struct.B6 = type { i32 addrspace(4)*, i32 } + ``` + +Note that `__regcall` does not guarantee passing through registers in the final +generated code. For example, compiler will use a threshold for argument or +return value size, which is implementation-defined. Values larger than the +threshold will still be passed by pointer (memory). + +Example declaration of a `__regcall` function: +```cpp +simd __regcall SCALE(simd v); +``` +The parameter and the return type in the ABI form will be `<8 x float>`. +
+ ## Examples ### Vector addition (USM) ```cpp