diff --git a/sycl/CMakeLists.txt b/sycl/CMakeLists.txt index 3801f7ce88321..c21cae8ecb65e 100644 --- a/sycl/CMakeLists.txt +++ b/sycl/CMakeLists.txt @@ -285,6 +285,15 @@ if(SYCL_BUILD_PI_ROCM) list(APPEND SYCL_TOOLCHAIN_DEPLOY_COMPONENTS libspirv-builtins pi_rocm) endif() +if (SYCL_BUILD_PI_ESIMD_CPU) + list(APPEND SYCL_TOOLCHAIN_DEPLOY_COMPONENTS libcmrt-headers) + if (MSVC) + list(APPEND SYCL_TOOLCHAIN_DEPLOY_COMPONENTS libcmrt-libs libcmrt-dlls) + else() + list(APPEND SYCL_TOOLCHAIN_DEPLOY_COMPONENTS libcmrt-sos) + endif() +endif() + # Use it as fake dependency in order to force another command(s) to execute. add_custom_command(OUTPUT __force_it COMMAND "${CMAKE_COMMAND}" -E echo diff --git a/sycl/include/CL/sycl/accessor.hpp b/sycl/include/CL/sycl/accessor.hpp index abae9c107b130..872c56813efb3 100755 --- a/sycl/include/CL/sycl/accessor.hpp +++ b/sycl/include/CL/sycl/accessor.hpp @@ -671,6 +671,11 @@ class image_accessor AccessorBaseHost::getPtr() /*Ptr to Image*/); #endif } + +#ifndef __SYCL_DEVICE_ONLY__ +public: + void *get_pointer() const { return detail::AccessorBaseHost::getPtr(); } +#endif // __SYCL_DEVICE_ONLY__ }; template class HostKernel : public HostKernelBase { using IDBuilder = sycl::detail::Builder; - KernelType MKernel; public: + KernelType MKernel; HostKernel(KernelType Kernel) : MKernel(Kernel) {} void call(const NDRDescT &NDRDesc, HostProfilingInfo *HPI) override { // adjust ND range for serial host: diff --git a/sycl/include/CL/sycl/handler.hpp b/sycl/include/CL/sycl/handler.hpp index ad9b003c51a3f..e71d7770fd6b0 100644 --- a/sycl/include/CL/sycl/handler.hpp +++ b/sycl/include/CL/sycl/handler.hpp @@ -514,6 +514,98 @@ class __SYCL_EXPORT handler { return {z, y, x}; } + /* The kernel passed to StoreLambda can take an id, an item or an nd_item as + * its argument. Since esimd plugin directly invokes the kernel (doesn’t use + * piKernelSetArg), the kernel argument type must be known to the plugin. 
+ * However, passing kernel argument type to the plugin requires changing ABI + * in HostKernel class. To overcome this problem, helpers below wrap the + * “original” kernel with a functor that always takes an nd_item as argument. + * A functor is used instead of a lambda because extractArgsAndReqsFromLambda + * needs access to the “original” kernel and keeps references to its internal + * data, i.e. the kernel passed as argument cannot be local in scope. The + * functor itself is again encapsulated in a std::function since functor’s + * type is unknown to the plugin. + */ + + template + KernelType *ResetHostKernelHelper(const KernelType &KernelFunc) { + NormalizedKernelType NormalizedKernel(KernelFunc); + auto NormalizedKernelFunc = + std::function &)>(NormalizedKernel); + auto HostKernelPtr = + new detail::HostKernel, Dims, KernelType>( + NormalizedKernelFunc); + MHostKernel.reset(HostKernelPtr); + return &HostKernelPtr->MKernel.template target() + ->MKernelFunc; + } + + template + typename std::enable_if>::value, + KernelType *>::type + ResetHostKernel(const KernelType &KernelFunc) { + struct NormalizedKernelType { + KernelType MKernelFunc; + NormalizedKernelType(const KernelType &KernelFunc) + : MKernelFunc(KernelFunc) {} + void operator()(const nd_item &Arg) { + MKernelFunc(Arg.get_global_id()); + } + }; + return ResetHostKernelHelper( + KernelFunc); + } + template + typename std::enable_if>::value, + KernelType *>::type + ResetHostKernel(const KernelType &KernelFunc) { + struct NormalizedKernelType { + KernelType MKernelFunc; + NormalizedKernelType(const KernelType &KernelFunc) + : MKernelFunc(KernelFunc) {} + void operator()(const nd_item &Arg) { MKernelFunc(Arg); } + }; + return ResetHostKernelHelper( + KernelFunc); + } + + template + typename std::enable_if>::value, + KernelType *>::type + ResetHostKernel(const KernelType &KernelFunc) { + struct NormalizedKernelType { + KernelType MKernelFunc; + NormalizedKernelType(const KernelType &KernelFunc) + : 
MKernelFunc(KernelFunc) {} + void operator()(const nd_item &Arg) { + sycl::item Item = detail::Builder::createItem( + Arg.get_global_range(), Arg.get_global_id()); + MKernelFunc(Item); + } + }; + return ResetHostKernelHelper( + KernelFunc); + } + + template + typename std::enable_if>::value, + KernelType *>::type + ResetHostKernel(const KernelType &KernelFunc) { + struct NormalizedKernelType { + KernelType MKernelFunc; + NormalizedKernelType(const KernelType &KernelFunc) + : MKernelFunc(KernelFunc) {} + void operator()(const nd_item &Arg) { + sycl::item Item = detail::Builder::createItem( + Arg.get_global_range(), Arg.get_global_id(), Arg.get_offset()); + MKernelFunc(Item); + } + }; + return ResetHostKernelHelper( + KernelFunc); + } + /// Stores lambda to the template-free object /// /// Also initializes kernel name, list of arguments and requirements using @@ -530,9 +622,8 @@ class __SYCL_EXPORT handler { "kernel_handler is not yet supported by host device.", PI_INVALID_OPERATION); } - MHostKernel.reset( - new detail::HostKernel( - KernelFunc)); + KernelType *KernelPtr = + ResetHostKernel(KernelFunc); using KI = sycl::detail::KernelInfo; // Empty name indicates that the compilation happens without integration @@ -540,8 +631,8 @@ class __SYCL_EXPORT handler { if (KI::getName() != nullptr && KI::getName()[0] != '\0') { // TODO support ESIMD in no-integration-header case too. 
MArgs.clear(); - extractArgsAndReqsFromLambda(MHostKernel->getPtr(), KI::getNumParams(), - &KI::getParamDesc(0), KI::isESIMD()); + extractArgsAndReqsFromLambda(reinterpret_cast(KernelPtr), + KI::getNumParams(), &KI::getParamDesc(0)); MKernelName = KI::getName(); MOSModuleHandle = detail::OSUtil::getOSModuleHandle(KI::getName()); } else { diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/detail/atomic_intrin.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/detail/atomic_intrin.hpp new file mode 100644 index 0000000000000..21cb6ad5c0826 --- /dev/null +++ b/sycl/include/sycl/ext/intel/experimental/esimd/detail/atomic_intrin.hpp @@ -0,0 +1,19 @@ +//==-------- atomic_intrin.hpp - Atomic intrinsic definition file ----------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#pragma once + +#include + +template Ty atomic_add_fetch(Ty *ptr, Ty val) { +#ifdef _WIN32 + // TODO: Windows will be supported soon + throw cl::sycl::feature_not_supported(); +#else + return __atomic_add_fetch(ptr, val, __ATOMIC_RELAXED); +#endif +} diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/detail/cmrt_if_defs.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/detail/cmrt_if_defs.hpp new file mode 100644 index 0000000000000..aed9e8cadbbda --- /dev/null +++ b/sycl/include/sycl/ext/intel/experimental/esimd/detail/cmrt_if_defs.hpp @@ -0,0 +1,163 @@ +//==---------- cmrt_if_defs.hpp - CM-Runtime interface header file ---------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +/// \file cmrt_if_defs.hpp +/// +/// Interface definitions for esimd_cpu pi module to communicate with +/// underlying CM emulation runtime library +/// +/// \ingroup sycl_pi_esimd_cpu + +#pragma once + +/// CMRT Interface Defines + +#include + +// Base class to store common data +struct _pi_object { + _pi_object() : RefCount{1} {} + + std::atomic RefCount; +}; +struct _pi_platform { + _pi_platform() {} + + // Keep Version information. + std::string CmEmuVersion; +}; + +struct _pi_device : _pi_object { + _pi_device(pi_platform plt) : Platform{plt} {} + + pi_platform Platform; +}; + +struct _pi_context : _pi_object { + _pi_context(pi_device DeviceArg, cm_support::CmDevice *CmDeviceArg) + : Device{DeviceArg}, CmDevicePtr{CmDeviceArg} {} + + /// One-to-one mapping between Context and Device + pi_device Device; + + cm_support::CmDevice *CmDevicePtr = nullptr; + + /// Map SVM memory starting address to corresponding + /// CmBufferSVM object. CmBufferSVM object is needed to release memory. + std::unordered_map Addr2CmBufferSVM; +}; + +struct _pi_queue : _pi_object { + _pi_queue(pi_context ContextArg, cm_support::CmQueue *CmQueueArg) + : Context{ContextArg}, CmQueuePtr{CmQueueArg} {} + + // Keeps the PI context to which this queue belongs. + pi_context Context = nullptr; + cm_support::CmQueue *CmQueuePtr = nullptr; +}; + +struct _pi_mem : _pi_object { + _pi_mem() {} + + pi_context Context; + + char *MapHostPtr = nullptr; + + // Mutex for load/store accessing + std::mutex mutexLock; + + // Surface index used by CM + int SurfaceIndex; + + // Supplementary data to keep track of the mappings of this memory + // created with piEnqueueMemBufferMap and piEnqueueMemImageMap. + struct Mapping { + // The offset in the buffer giving the start of the mapped region. + size_t Offset; + // The size of the mapped region. 
+ size_t Size; + }; + + /* + // Method to get type of the derived object (image or buffer) + virtual bool isImage() const = 0; + */ + + virtual ~_pi_mem() = default; + + _pi_mem_type getMemType() const { return MemType; }; + + /* + // Thread-safe methods to work with memory mappings + pi_result addMapping(void *MappedTo, size_t Size, size_t Offset); + pi_result removeMapping(void *MappedTo, Mapping &MapInfo); + */ + +protected: + _pi_mem(pi_context ctxt, char *HostPtr, _pi_mem_type MemTypeArg, + int SurfaceIdxArg) + : Context{ctxt}, MapHostPtr{HostPtr}, + SurfaceIndex{SurfaceIdxArg}, Mappings{}, MemType{MemTypeArg} {} + +private: + // The key is the host pointer representing an active mapping. + // The value is the information needed to maintain/undo the mapping. + std::unordered_map Mappings; + + // TODO: we'd like to create a thread safe map class instead of mutex + map, + // that must be carefully used together. + // The mutex that is used for thread-safe work with Mappings. + std::mutex MappingsMutex; + + _pi_mem_type MemType; +}; + +struct _pi_buffer final : _pi_mem { + // Buffer/Sub-buffer constructor + _pi_buffer(pi_context ctxt, char *HostPtr, cm_support::CmBuffer *CmBufArg, + int SurfaceIdxArg, size_t SizeArg) + : _pi_mem(ctxt, HostPtr, PI_MEM_TYPE_BUFFER, SurfaceIdxArg), + CmBufferPtr{CmBufArg}, Size{SizeArg} {} + + cm_support::CmBuffer *CmBufferPtr; + size_t Size; +}; + +struct _pi_image final : _pi_mem { + // Image constructor + _pi_image(pi_context ctxt, char *HostPtr, cm_support::CmSurface2D *CmSurfArg, + int SurfaceIdxArg, size_t WidthArg, size_t HeightArg, size_t BPPArg) + : _pi_mem(ctxt, HostPtr, PI_MEM_TYPE_IMAGE2D, SurfaceIdxArg), + CmSurfacePtr{CmSurfArg}, Width{WidthArg}, Height{HeightArg}, + BytesPerPixel{BPPArg} {} + + cm_support::CmSurface2D *CmSurfacePtr; + size_t Width; + size_t Height; + size_t BytesPerPixel; +}; + +struct _pi_event : _pi_object { + _pi_event() {} + + cm_support::CmEvent *CmEventPtr = nullptr; + cm_support::CmQueue 
*OwnerQueue = nullptr; + pi_context Context = nullptr; + bool IsDummyEvent = false; +}; + +struct _pi_program : _pi_object { + _pi_program() {} + + // Keep the context of the program. + pi_context Context; +}; + +struct _pi_kernel : _pi_object { + _pi_kernel() {} +}; diff --git a/sycl/include/CL/sycl/INTEL/esimd/detail/emu/esimd_emu_functions_v1.h b/sycl/include/sycl/ext/intel/experimental/esimd/detail/emu/esimd_emu_functions_v1.h similarity index 100% rename from sycl/include/CL/sycl/INTEL/esimd/detail/emu/esimd_emu_functions_v1.h rename to sycl/include/sycl/ext/intel/experimental/esimd/detail/emu/esimd_emu_functions_v1.h diff --git a/sycl/include/CL/sycl/INTEL/esimd/detail/emu/esimdcpu_device_interface.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/detail/emu/esimdcpu_device_interface.hpp similarity index 100% rename from sycl/include/CL/sycl/INTEL/esimd/detail/emu/esimdcpu_device_interface.hpp rename to sycl/include/sycl/ext/intel/experimental/esimd/detail/emu/esimdcpu_device_interface.hpp diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/detail/memory_intrin.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/detail/memory_intrin.hpp index 03ec0ab45b171..9225944df8581 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/detail/memory_intrin.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/detail/memory_intrin.hpp @@ -521,6 +521,155 @@ __esimd_raw_send_store(uint8_t modifier, uint8_t execSize, __SEIEED::vector_type_t msgSrc0); #ifndef __SYCL_DEVICE_ONLY__ +#define __SYCL_EXPLICIT_SIMD_PLUGIN__ + +// Header files required for accessing CM-managed memory - Surface, +// buffer, etc +namespace cm_support { +#include +} // namespace cm_support +#include "cmrt_if_defs.hpp" + +#include +#include +#include +#include + +namespace raw_send { + +enum class msgField : short { + OP, + VNNI, + ADDRSIZE, + DATASIZE, + VECTSIZE, + TRANSPOSE, + CACHE, + DSTLEN, + SRC0LEN, + ADDRTYPE +}; + +enum class msgOp : short { + DP_LOAD = 0x0, // 
scatter/vector load + LOAD_2D = 0x3, + DP_STORE = 0x4, // scatter/vector store + STORE_2D = 0x7, + OP_MAX = 0x3F +}; + +typedef struct _bitfields_ { + uint32_t offset; + uint32_t mask; +} bitfields; + +const bitfields BIT_FIELDS[10] = { + {0, 0x3F}, // OP / 6 bits + {7, 0x1}, // VNNI -> LOAD only + {7, 0x3}, // Address size + {9, 0x7}, // DATASIZE + {12, 0x7}, // VECTSIZE + {15, 0x1}, // TRANSPOSE -> LOAD only + {17, 0x7}, // CACHE + {20, 0x1F}, // DSTLEN + {25, 0xF}, // SRC0LEN, + {29, 0x3} // ADDRTYPE +}; +uint32_t inline getMsgField(uint32_t msg, msgField field) { + uint32_t idx = static_cast(field); + return ((msg >> BIT_FIELDS[idx].offset) & BIT_FIELDS[idx].mask); +} + +auto inline getMsgOp(uint32_t msg) { + msgOp ret; + ret = static_cast(getMsgField((uint32_t)msg, msgField::OP)); + return ret; +} + +template +uint64_t inline getSurfaceBaseAddr(__SEIEED::vector_type_t addrMsg) { + constexpr int sizeofT = sizeof(T); + uint64_t Ret = 0; + + if constexpr (sizeofT == 4) { + Ret = (uint64_t)addrMsg[1] << 32; + Ret |= (uint64_t)addrMsg[0]; + } else if constexpr (sizeofT == 8) { + Ret = addrMsg[0]; + } + + return Ret; +} + +template +uint64_t inline getLaneAddr(__SEIEED::vector_type_t addrMsg, + unsigned lane_id) { + // (matrix_ref addrMsg) + // vector_ref addr_ref = addrMsg.template select<1, 1, 2, 1>(0, 2 + // * lane_id).template format(); return addr_ref(0); + throw cl::sycl::feature_not_supported(); +} + +template +auto inline getSurfaceDim(__SEIEED::vector_type_t addrMsg) { + __SEIEED::vector_type_t Ret; + constexpr int sizeofT = sizeof(T); + + static_assert(sizeofT == 4, "Unsupported addrMsg format!!"); + + if constexpr (sizeofT == 4) { + for (int idx = 0; idx < 4; idx++) { + Ret[idx] = addrMsg[idx + 2]; + } + } + + return Ret; +} + +template +auto inline getBlockOffsets(__SEIEED::vector_type_t addrMsg) { + __SEIEED::vector_type_t Ret; + constexpr int sizeofT = sizeof(T); + + static_assert(sizeofT == 4, "Unsupported addrMsg format!!"); + + if constexpr (sizeofT 
== 4) { + for (int idx = 0; idx < 4; idx++) { + Ret[idx] = static_cast(addrMsg[idx + 5]); + } + } + + return Ret; +} + +template +auto inline getBlockDim(__SEIEED::vector_type_t addrMsg) { + __SEIEED::vector_type_t Ret; + constexpr int sizeofT = sizeof(T); + T RawValue = 0; + + static_assert(sizeofT == 4, "Unsupported addrMsg format!!"); + + if constexpr (sizeofT == 4) { + RawValue = addrMsg[7]; + Ret[0] = (unsigned char)(RawValue & 0xFF); // width + Ret[1] = (unsigned char)((RawValue >> 8) & 0xFF); // height + Ret[2] = (unsigned char)((RawValue >> 24) & 0xFF); // For ArrayLen + } + + assert(RawValue != 0); + + return Ret; +} + +template +auto inline getArrayLen(__SEIEED::vector_type_t addrMsg) { + auto blkDim = getBlockDim(addrMsg); + return (blkDim[2] >> 4); +} + +} // namespace raw_send + template inline __SEIEED::vector_type_t @@ -693,48 +842,105 @@ template inline __SEIEED::vector_type_t __esimd_media_block_load(unsigned modififer, TACC handle, unsigned plane, unsigned width, unsigned x, unsigned y) { - // On host the input surface is modeled as sycl image 2d object, - // and the read/write access is done through accessor, - // which is passed in as the handle argument. - auto range = __SEIEED::AccessorPrivateProxy::getImageRange(handle); - unsigned bpp = __SEIEED::AccessorPrivateProxy::getElemSize(handle); - unsigned vpp = bpp / sizeof(Ty); - unsigned int i = x / bpp; - unsigned int j = y; - - assert(x % bpp == 0); - unsigned int xbound = range[0] - 1; - unsigned int ybound = range[1] - 1; - __SEIEED::vector_type_t vals; - for (int row = 0; row < M; row++) { - for (int col = 0; col < N; col += vpp) { - unsigned int xoff = (i > xbound) ? xbound : i; - unsigned int yoff = (j > ybound) ? 
ybound : j; - auto coords = cl::sycl::cl_int2(xoff, yoff); - cl::sycl::cl_uint4 data = handle.read(coords); - - __SEIEED::vector_type_t res; - for (int idx = 0; idx < 4; idx++) { - res[idx] = data[idx]; + + sycl::detail::ESIMDDeviceInterface *I = + sycl::detail::getESIMDDeviceInterface(); + + _pi_image *img = + static_cast<_pi_image *>(static_cast(handle.get_pointer())); + std::unique_lock lock(img->mutexLock); + + char *readBase = I->sycl_get_surface_base_addr_ptr(img->SurfaceIndex); + + uint32_t bpp = static_cast(img->BytesPerPixel); + uint32_t imgWidth = static_cast(img->Width) * bpp; + uint32_t imgHeight = static_cast(img->Height); + int x_pos_a, y_pos_a, offset, index; + + // TODO : Remove intermediate 'in' matrix + std::vector> in(M, std::vector(N)); + int R = M; + int C = N; + for (int i = 0; i < R; i++) { + for (int j = 0; j < C; j++) { + x_pos_a = x + j * sizeof(Ty); + { y_pos_a = y + i; } + // We should check the boundary condition based on sizeof(Ty), x_pos_a is + // 0-based Note: Use a signed variable; otherwise sizeof(Ty) is unsigned + if ((x_pos_a + sizeof(Ty)) > imgWidth) { + // If we're trying to read outside the boundary, limit the value of + // x_pos_a Assumption -- We don't this situation: + // x_pos_a width's boundary + // | | + // <---type(Ty)---> + // At most x_pos_a+sizeof(Ty) is exactly at the boundary. 
+ x_pos_a = imgWidth; + } + if (y_pos_a > imgHeight - 1) { + y_pos_a = imgHeight - 1; + } + if (y_pos_a < 0) { + y_pos_a = 0; + } + { + if (x_pos_a < 0) { + // Need to align x position to bpp + int offset = x % bpp; + x_pos_a -= offset; + } + while (x_pos_a < 0) { + // If we're trying to read outside the left boundary, increase x_pos_a + x_pos_a += bpp; + } } - constexpr int refN = sizeof(cl::sycl::cl_uint4) / sizeof(Ty); - unsigned int stride = sizeof(cl::sycl::cl_uint4) / bpp; - using refTy = __SEIEED::vector_type_t; - auto ref = reinterpret_cast(res); - - unsigned int offset1 = col + row * N; - unsigned int offset2 = 0; - for (int idx = 0; idx < vpp; idx++) { - vals[offset1] = ref[offset2]; - offset1++; - offset2 += stride; + if (x_pos_a >= imgWidth) { + { + x_pos_a = x_pos_a - bpp; + for (uint byte_count = 0; byte_count < sizeof(Ty); byte_count++) { + if (x_pos_a >= imgWidth) { + x_pos_a = x_pos_a - bpp; + } + offset = y_pos_a * imgWidth + x_pos_a; + + /* + If destination size per element is less than or equal pixel size + of the surface move the pixel value across the destination + elements. If destination size per element is greater than pixel + size of the surface replicate pixel value in the destination + element. 
+ */ + if (sizeof(Ty) <= bpp) { + for (uint bpp_count = 0; j < C && bpp_count < bpp; + j++, bpp_count += sizeof(Ty)) { + in[i][j] = *((Ty *)(readBase + offset + bpp_count)); + } + j--; + break; + } else { + // ((unsigned char*)in.get_addr(i*C+j))[byte_count] = *((unsigned + // char*)((char*)buff_iter->p + offset)); + unsigned char *pTempBase = + ((unsigned char *)in[i].data()) + j * sizeof(Ty); + pTempBase[byte_count] = *((unsigned char *)(readBase + offset)); + } + + x_pos_a = x_pos_a + 1; + } + x_pos_a = imgWidth; + } + } else { + offset = y_pos_a * imgWidth + x_pos_a; + { in[i][j] = *((Ty *)(readBase + offset)); } } - i++; } - i = x / bpp; - j++; + } + + for (auto i = 0, k = 0; i < M; i++) { + for (auto j = 0; j < N; j++) { + vals[k++] = in[i][j]; + } } return vals; @@ -745,45 +951,57 @@ inline void __esimd_media_block_store(unsigned modififer, TACC handle, unsigned plane, unsigned width, unsigned x, unsigned y, __SEIEED::vector_type_t vals) { - unsigned bpp = __SEIEED::AccessorPrivateProxy::getElemSize(handle); - unsigned vpp = bpp / sizeof(Ty); - auto range = __SEIEED::AccessorPrivateProxy::getImageRange(handle); - unsigned int i = x / bpp; - unsigned int j = y; - - assert(x % bpp == 0); - - for (int row = 0; row < M; row++) { - for (int col = 0; col < N; col += vpp) { - constexpr int Sz = sizeof(cl::sycl::cl_uint4) / sizeof(Ty); - __SEIEED::vector_type_t res = 0; - - unsigned int offset1 = col + row * N; - unsigned int offset2 = 0; - unsigned int stride = sizeof(cl::sycl::cl_uint4) / bpp; - for (int idx = 0; idx < vpp; idx++) { - res[offset2] = vals[offset1]; - offset1++; - offset2 += stride; - } + sycl::detail::ESIMDDeviceInterface *I = + sycl::detail::getESIMDDeviceInterface(); + + _pi_image *img = + static_cast<_pi_image *>(static_cast(handle.get_pointer())); + + char *writeBase = I->sycl_get_surface_base_addr_ptr(img->SurfaceIndex); + + uint32_t bpp = static_cast(img->BytesPerPixel); + uint32_t imgWidth = static_cast(img->Width) * bpp; + uint32_t 
imgHeight = static_cast(img->Height); + int x_pos_a, y_pos_a, offset; + + assert((x % 4) == 0); + assert((N * sizeof(Ty)) % 4 == 0); - using refTy = __SEIEED::vector_type_t; - auto ref = reinterpret_cast(res); + // TODO : Remove intermediate 'out' matrix + std::vector> out(M, std::vector(N)); - cl::sycl::cl_uint4 data; - for (int idx = 0; idx < 4; idx++) { - data[idx] = ref[idx]; + std::unique_lock lock(img->mutexLock); + + for (int i = 0, k = 0; i < M; i++) { + for (int j = 0; j < N; j++) { + out[i][j] = vals[k++]; + } + } + + for (int i = 0; i < M; i++) { + for (int j = 0; j < N; j++) { + x_pos_a = x + j * sizeof(Ty); + { y_pos_a = y + i; } + if ((int)x_pos_a < 0) { + continue; + } + if ((int)y_pos_a < 0) { + continue; + } + if ((int)(x_pos_a + sizeof(Ty)) > imgWidth) { + continue; } - if (i < range[0] && j < range[1]) { - auto coords = cl::sycl::cl_int2(i, j); - handle.write(coords, data); + if ((int)y_pos_a > imgHeight - 1) { + continue; } - i++; + offset = y_pos_a * imgWidth + x_pos_a; + *((Ty *)(writeBase + offset)) = out[i][j]; } - i = x / bpp; - j++; } + + /// TODO : Optimize + I->cm_fence_ptr(); } template @@ -820,10 +1038,27 @@ __esimd_dp4(__SEIEED::vector_type_t v1, return retv; } -/// TODO -inline void __esimd_barrier() {} +inline void __esimd_slm_init(size_t size) { + sycl::detail::ESIMDDeviceInterface *I = + sycl::detail::getESIMDDeviceInterface(); + + I->cm_slm_init_ptr(size); +} -inline void __esimd_sbarrier(__SEIEE::split_barrier_action flag) {} +inline void __esimd_barrier() { + sycl::detail::ESIMDDeviceInterface *I = + sycl::detail::getESIMDDeviceInterface(); + + I->cm_barrier_ptr(); +} + +inline void __esimd_sbarrier( + sycl::ext::intel::experimental::esimd::EsimdSbarrierType flag) { + sycl::detail::ESIMDDeviceInterface *I = + sycl::detail::getESIMDDeviceInterface(); + + I->cm_sbarrier_ptr((uint32_t)flag); +} inline void __esimd_slm_fence(uint8_t cntl) {} @@ -832,6 +1067,17 @@ inline __SEIEED::vector_type_t 
__esimd_slm_read(__SEIEED::vector_type_t addrs, __SEIEED::vector_type_t pred) { __SEIEED::vector_type_t retv; + sycl::detail::ESIMDDeviceInterface *I = + sycl::detail::getESIMDDeviceInterface(); + + char *SlmBase = I->__cm_emu_get_slm_ptr(); + for (int i = 0; i < N; ++i) { + if (pred[i]) { + Ty *addr = reinterpret_cast(addrs[i] + SlmBase); + retv[i] = *addr; + } + } + return retv; } @@ -839,19 +1085,49 @@ __esimd_slm_read(__SEIEED::vector_type_t addrs, template inline void __esimd_slm_write(__SEIEED::vector_type_t addrs, __SEIEED::vector_type_t vals, - __SEIEED::vector_type_t pred) {} + __SEIEED::vector_type_t pred) { + sycl::detail::ESIMDDeviceInterface *I = + sycl::detail::getESIMDDeviceInterface(); + + char *SlmBase = I->__cm_emu_get_slm_ptr(); + for (int i = 0; i < N; ++i) { + if (pred[i]) { + Ty *addr = reinterpret_cast(addrs[i] + SlmBase); + *addr = vals[i]; + } + } +} // slm_block_read reads a block of data from SLM template inline __SEIEED::vector_type_t __esimd_slm_block_read(uint32_t addr) { __SEIEED::vector_type_t retv; + sycl::detail::ESIMDDeviceInterface *I = + sycl::detail::getESIMDDeviceInterface(); + char *SlmBase = I->__cm_emu_get_slm_ptr(); + addr <<= 4; + for (int i = 0; i < N; ++i) { + Ty *SlmAddr = reinterpret_cast(addr + SlmBase); + retv[i] = *SlmAddr; + addr += sizeof(Ty); + } return retv; } // slm_block_write writes a block of data to SLM template inline void __esimd_slm_block_write(uint32_t addr, - __SEIEED::vector_type_t vals) {} + __SEIEED::vector_type_t vals) { + sycl::detail::ESIMDDeviceInterface *I = + sycl::detail::getESIMDDeviceInterface(); + char *SlmBase = I->__cm_emu_get_slm_ptr(); + addr <<= 4; + for (int i = 0; i < N; ++i) { + Ty *SlmAddr = reinterpret_cast(addr + SlmBase); + *SlmAddr = vals[i]; + addr += sizeof(Ty); + } +} // slm_read4 does SLM gather4 template @@ -859,6 +1135,52 @@ inline __SEIEED::vector_type_t __esimd_slm_read4(__SEIEED::vector_type_t addrs, __SEIEED::vector_type_t pred) { __SEIEED::vector_type_t retv; + 
sycl::detail::ESIMDDeviceInterface *I = + sycl::detail::getESIMDDeviceInterface(); + char *ReadBase = I->__cm_emu_get_slm_ptr(); + + unsigned int Next = 0; + if (__SEIEE::is_channel_enabled(Mask, __SEIEE::rgba_channel::R)) { + for (int I = 0; I < N; I++, Next++) { + if (pred[I]) { + Ty *addr = reinterpret_cast(addrs[I] + ReadBase); + retv[Next] = *addr; + } + } + } + + ReadBase += sizeof(Ty); + + if (__SEIEE::is_channel_enabled(Mask, __SEIEE::rgba_channel::G)) { + for (int I = 0; I < N; I++, Next++) { + if (pred[I]) { + Ty *addr = reinterpret_cast(addrs[I] + ReadBase); + retv[Next] = *addr; + } + } + } + + ReadBase += sizeof(Ty); + + if (__SEIEE::is_channel_enabled(Mask, __SEIEE::rgba_channel::B)) { + for (int I = 0; I < N; I++, Next++) { + if (pred[I]) { + Ty *addr = reinterpret_cast(addrs[I] + ReadBase); + retv[Next] = *addr; + } + } + } + + ReadBase += sizeof(Ty); + + if (__SEIEE::is_channel_enabled(Mask, __SEIEE::rgba_channel::A)) { + for (int I = 0; I < N; I++, Next++) { + if (pred[I]) { + Ty *addr = reinterpret_cast(addrs[I] + ReadBase); + retv[Next] = *addr; + } + } + } return retv; } @@ -867,7 +1189,55 @@ template inline void __esimd_slm_write4( __SEIEED::vector_type_t addrs, __SEIEED::vector_type_t vals, - __SEIEED::vector_type_t pred) {} + __SEIEED::vector_type_t pred) { + + sycl::detail::ESIMDDeviceInterface *I = + sycl::detail::getESIMDDeviceInterface(); + char *WriteBase = I->__cm_emu_get_slm_ptr(); + + unsigned int Next = 0; + if (__SEIEE::is_channel_enabled(Mask, __SEIEE::rgba_channel::R)) { + for (int I = 0; I < N; I++, Next++) { + if (pred[I]) { + Ty *addr = reinterpret_cast(addrs[I] + WriteBase); + *addr = vals[Next]; + } + } + } + + WriteBase += sizeof(Ty); + + if (__SEIEE::is_channel_enabled(Mask, __SEIEE::rgba_channel::G)) { + for (int I = 0; I < N; I++, Next++) { + if (pred[I]) { + Ty *addr = reinterpret_cast(addrs[I] + WriteBase); + *addr = vals[Next]; + } + } + } + + WriteBase += sizeof(Ty); + + if (__SEIEE::is_channel_enabled(Mask, 
__SEIEE::rgba_channel::B)) { + for (int I = 0; I < N; I++, Next++) { + if (pred[I]) { + Ty *addr = reinterpret_cast(addrs[I] + WriteBase); + *addr = vals[Next]; + } + } + } + + WriteBase += sizeof(Ty); + + if (__SEIEE::is_channel_enabled(Mask, __SEIEE::rgba_channel::A)) { + for (int I = 0; I < N; I++, Next++) { + if (pred[I]) { + Ty *addr = reinterpret_cast(addrs[I] + WriteBase); + *addr = vals[Next]; + } + } + } +} // slm_atomic: SLM atomic template <__SEIEE::atomic_op Op, typename Ty, int N> @@ -875,6 +1245,23 @@ inline __SEIEED::vector_type_t __esimd_slm_atomic0(__SEIEED::vector_type_t addrs, __SEIEED::vector_type_t pred) { __SEIEED::vector_type_t retv; + sycl::detail::ESIMDDeviceInterface *I = + sycl::detail::getESIMDDeviceInterface(); + char *WriteBase = I->__cm_emu_get_slm_ptr(); + + for (int i = 0; i < N; i++) { + if (pred[i]) { + Ty *p = reinterpret_cast(addrs[i] + WriteBase); + + switch (Op) { + case __SEIEE::EsimdAtomicOpType::ATOMIC_INC: + retv[i] = atomic_add_fetch(p, 1); + break; + default: + throw cl::sycl::feature_not_supported(); + } + } + } return retv; } @@ -884,6 +1271,21 @@ __esimd_slm_atomic1(__SEIEED::vector_type_t addrs, __SEIEED::vector_type_t src0, __SEIEED::vector_type_t pred) { __SEIEED::vector_type_t retv; + + for (int i = 0; i < N; i++) { + if (pred[i]) { + Ty *p = reinterpret_cast(addrs[i]); + + switch (Op) { + case __SEIEE::EsimdAtomicOpType::ATOMIC_ADD: + retv[i] = atomic_add_fetch(p, src0[i]); + break; + default: + throw cl::sycl::feature_not_supported(); + } + } + } + return retv; } @@ -930,15 +1332,59 @@ __esimd_flat_atomic2(__SEIEED::vector_type_t addrs, template inline __SEIEED::vector_type_t __esimd_block_read(SurfIndAliasTy surf_ind, uint32_t offset) { - throw cl::sycl::feature_not_supported(); - return __SEIEED::vector_type_t(); + __SEIEED::vector_type_t retv; + sycl::detail::ESIMDDeviceInterface *I = + sycl::detail::getESIMDDeviceInterface(); + _pi_buffer *buf = + static_cast<_pi_buffer 
*>(static_cast(surf_ind.get_pointer())); + + char *readBase = I->sycl_get_surface_base_addr_ptr(buf->SurfaceIndex); + + uint32_t width = static_cast(buf->Size); + uint32_t pos = offset; + + std::unique_lock lock(buf->mutexLock); + + for (int idx = 0; idx < N; idx++) { + if (pos >= width) { + retv[idx] = 0; + } else { + retv[idx] = *((Ty *)(readBase + pos)); + } + pos += (uint32_t)sizeof(Ty); + } + + return retv; } template inline void __esimd_block_write(SurfIndAliasTy surf_ind, uint32_t offset, __SEIEED::vector_type_t vals) { + sycl::detail::ESIMDDeviceInterface *I = + sycl::detail::getESIMDDeviceInterface(); + _pi_buffer *buf = + static_cast<_pi_buffer *>(static_cast(surf_ind.get_pointer())); - throw cl::sycl::feature_not_supported(); + char *writeBase = I->sycl_get_surface_base_addr_ptr(buf->SurfaceIndex); + + uint32_t width = static_cast(buf->Size); + assert(buf->Size == width); + + uint32_t pos = offset << 4; + + std::unique_lock lock(buf->mutexLock); + + for (int idx = 0; idx < N; idx++) { + if (pos < width) { + *((Ty *)(writeBase + pos)) = vals[idx]; + } else { + break; + } + pos += (uint32_t)sizeof(Ty); + } + + /// TODO : Optimize + I->cm_fence_ptr(); } /// \brief esimd_get_value @@ -1032,8 +1478,173 @@ __esimd_raw_send_load(uint8_t modifier, uint8_t execSize, uint32_t exDesc, uint32_t msgDesc, __SEIEED::vector_type_t msgSrc0, __SEIEED::vector_type_t msgDst) { - throw cl::sycl::feature_not_supported(); - return 0; + assert(sfid == 0xF); // UGM type only + + __SEIEED::vector_type_t retv; + + auto op = raw_send::getMsgOp(msgDesc); + assert(op == raw_send::msgOp::LOAD_2D); + uint64_t surfaceBase = raw_send::getSurfaceBaseAddr(msgSrc0); + auto surfaceDim = raw_send::getSurfaceDim(msgSrc0); + auto blockOffset = raw_send::getBlockOffsets(msgSrc0); + auto blockDim = raw_send::getBlockDim(msgSrc0); + auto arrayLen = raw_send::getArrayLen(msgSrc0); + + unsigned SurfaceWidth = surfaceDim[0] + 1; + unsigned SurfaceHeight = surfaceDim[1] + 1; + unsigned SurfacePitch = 
surfaceDim[2] + 1; + + int X = blockOffset[0]; + int Y = blockOffset[1]; + int Width = blockDim[0] + 1; + int Height = blockDim[1] + 1; + int NBlks = arrayLen + 1; + + bool Transposed = + raw_send::getMsgField(msgDesc, raw_send::msgField::TRANSPOSE); + bool Transformed = raw_send::getMsgField(msgDesc, raw_send::msgField::VNNI); + + constexpr unsigned sizeofT = sizeof(Ty1); + + char *buffBase = (char *)surfaceBase; + + // TODO : Acquire mutex for the surface pointed to by 'surfaceBase' + int vecIdx = 0; + int blkCount = 0; + + for (int xBase = X * sizeofT; blkCount < NBlks; xBase += sizeofT * Width) { + if (Transformed == true) { + constexpr int elems_per_DW = (sizeofT == 1) ? 4 : 2; /// VNNI_pack + if (Transposed == false) { /// Transform only load + int yRead = Y * SurfacePitch; + for (int u = 0; u < Height; + u += elems_per_DW, yRead += SurfacePitch * elems_per_DW) { + if ((yRead < 0) || (yRead >= SurfacePitch * SurfaceHeight)) { + /// Vertically out-of-bound, padding zero on out of boundary + for (int v = 0; v < Width; v += 1) { + for (int k = 0; k < elems_per_DW; k++, vecIdx += 1) { + retv[vecIdx] = (Ty1)(0); + } // k loop + } + // vecIdx += Width * elems_per_DW;; + continue; + } + + int xRead = xBase; + for (int v = 0; v < Width; v += 1, xRead += sizeofT) { + if ((xRead < 0) || (xRead >= SurfaceWidth)) { + /// Horizontally out-of-bound + for (int k = 0; k < elems_per_DW; k++, vecIdx += 1) { + retv[vecIdx] = (Ty1)(0); + } // k loop + // vecIdx += elems_per_DW; + continue; + } + + char *base = buffBase + yRead + xRead; + int offset = 0; + for (int k = 0; k < elems_per_DW; k++, vecIdx += 1) { + retv[vecIdx] = *((Ty1 *)(base + offset)); + // Increasing in Y-direction + offset += SurfacePitch; + } // k loop + } // v loop + } /// u loop + } // Transposed = false + else // Transposed == true + { /// Transform & Transpose load + int xRead = xBase; + for (int v = 0; v < Width; + v += elems_per_DW, xRead += sizeofT * elems_per_DW) { + if ((xRead < 0) || (xRead >= 
SurfaceWidth)) { + // Horizontally out-of-bound + for (int u = 0; u < Height; u += 1) { + for (int k = 0; k < elems_per_DW; k++, vecIdx += 1) { + retv[vecIdx] = (Ty1)(0); + } // k loop + } + // vecIdx += Height * elems_per_DW; + continue; + } + + int yRead = Y * SurfacePitch; + for (int u = 0; u < Height; u += 1, yRead += SurfacePitch) { + if ((yRead < 0) || (yRead >= SurfacePitch * SurfaceHeight)) { + /// Vertically out-of-bound + for (int k = 0; k < elems_per_DW; k++, vecIdx += 1) { + retv[vecIdx] = (Ty1)(0); + } // k loop + // vecIdx += elems_per_DW; + continue; + } + + char *base = buffBase + yRead + xRead; + int offset = 0; + for (int k = 0; k < elems_per_DW; k++, vecIdx += 1) { + retv[vecIdx] = *((Ty1 *)(base + offset)); + // Increasing in X-direction + offset += sizeofT; + } // k loop + } // u loop + } // v loop + } // Transposed == true + } // Transformed == true + else // (Transformed == false) + { + if (Transposed == false) { /// Linear load - no transform, no transpose + int yRead = Y * SurfacePitch; + for (int u = 0; u < Height; u += 1, yRead += SurfacePitch) { + if ((yRead < 0) || (yRead >= SurfacePitch * SurfaceHeight)) { + // Vertically Out-of-bound + for (int v = 0; v < Width; v += 1, vecIdx += 1) { + retv[vecIdx] = (Ty1)(0); + } + // vecIdx += Width; + continue; + } + + int xRead = xBase; + for (int v = 0; v < Width; v += 1, xRead += sizeofT, vecIdx += 1) { + if ((xRead >= 0) && (xRead < SurfaceWidth)) { + retv[vecIdx] = *((Ty1 *)(buffBase + yRead + xRead)); + } else { + // Horizontally out of bound + retv[vecIdx] = (Ty1)(0); + } + } // v loop + } // u loop + } /// Transposed == false + else // Transposed = true + { /// Transpose load - no transform + int xRead = xBase; + for (int v = 0; v < Width; v += 1, xRead += sizeofT) { + if ((xRead < 0) || (xRead > SurfaceWidth)) { + // Horizontally out-of-bound + for (int u = 0; u < Height; u += 1, vecIdx += 1) { + retv[vecIdx] = (Ty1)(0); + } + // vecIdx += Height; + continue; + } + + int yRead = Y * 
SurfacePitch; + for (int u = 0; u < Height; + u += 1, yRead += SurfacePitch, vecIdx += 1) { + if ((yRead >= 0) && (yRead < SurfacePitch * SurfaceHeight)) { + retv[vecIdx] = *((Ty1 *)(buffBase + yRead + xRead)); + } else { + // Vertically out of bound + retv[vecIdx] = (Ty1)(0); + } + } // u loop + } // v loop + } // Transposed == true + } // Transformed == false + blkCount += 1; + vecIdx = blkCount * Width * Height; + } // xBase loop + + return retv; } /// \brief Raw sends store. @@ -1068,7 +1679,44 @@ inline void __esimd_raw_sends_store(uint8_t modifier, uint8_t execSize, uint32_t msgDesc, __SEIEED::vector_type_t msgSrc0, __SEIEED::vector_type_t msgSrc1) { - throw cl::sycl::feature_not_supported(); + assert(sfid == 0xF); // UGM type only + auto op = raw_send::getMsgOp(msgDesc); + assert(op == raw_send::msgOp::STORE_2D); + uint64_t surfaceBase = raw_send::getSurfaceBaseAddr(msgSrc0); + auto surfaceDim = raw_send::getSurfaceDim(msgSrc0); + auto blockOffset = raw_send::getBlockOffsets(msgSrc0); + auto blockDim = raw_send::getBlockDim(msgSrc0); + + unsigned SurfaceWidth = surfaceDim[0] + 1; + unsigned SurfaceHeight = surfaceDim[1] + 1; + unsigned SurfacePitch = surfaceDim[2] + 1; + + int X = blockOffset[0]; + int Y = blockOffset[1]; + int Width = blockDim[0] + 1; + int Height = blockDim[1] + 1; + + constexpr unsigned sizeofT = sizeof(Ty2); + + char *buffBase = (char *)surfaceBase; + + int vecIdx = 0; + int rowCount = 0; + for (int yWrite = Y * SurfacePitch; rowCount < Height; + yWrite += SurfacePitch) { + if (yWrite == SurfacePitch * SurfaceHeight) { + // Vertically Out-of-bound + break; + } + int writeCount = 0; + for (int xWrite = X * sizeofT; writeCount < Width; + xWrite += sizeofT, vecIdx += 1, writeCount += 1) { + if (xWrite >= 0 && xWrite < SurfaceWidth) { + *((Ty2 *)(buffBase + yWrite + xWrite)) = msgSrc1[vecIdx]; + } + } // xWrite loop + rowCount += 1; + } // yWrite loop } /// \brief Raw send store. 
@@ -1096,6 +1744,13 @@ inline void __esimd_raw_send_store(uint8_t modifier, uint8_t execSize, uint8_t numSrc0, uint8_t sfid, uint32_t exDesc, uint32_t msgDesc, __SEIEED::vector_type_t msgSrc0) { + auto op = raw_send::getMsgOp(msgDesc); + + if (op == raw_send::msgOp::LOAD_2D) { + // Prefetch? + return; + } + throw cl::sycl::feature_not_supported(); } diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp index dadcc437d0e5d..95dfdc9ab249b 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp @@ -803,7 +803,7 @@ media_block_store(AccessorTy acc, unsigned x, unsigned y, simd vals) { #ifndef __SYCL_DEVICE_ONLY__ -inline void slm_init(uint32_t size) {} +inline void slm_init(uint32_t size) { __esimd_slm_init(size); } #endif diff --git a/sycl/plugins/esimd_cpu/CMakeLists.txt b/sycl/plugins/esimd_cpu/CMakeLists.txt index b6c12e47b5e0f..4fbaf5ef65853 100755 --- a/sycl/plugins/esimd_cpu/CMakeLists.txt +++ b/sycl/plugins/esimd_cpu/CMakeLists.txt @@ -2,9 +2,112 @@ # PI Esimd CPU library # Create Shared library for libpi_esimd_cpu.so. 
+include(ExternalProject) + include_directories("${sycl_inc_dir}") include_directories(${OpenCL_INCLUDE_DIR}) -include_directories(${LIBCMRT_INCLUDE}) + +if (NOT MSVC) + file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/libva_build) + file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/libva_install) + ExternalProject_Add(libva + GIT_REPOSITORY https://github.com/intel/libva.git + GIT_TAG bef69c5f380a27b6908d6daea0fc18ce50e33bb9 + BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/libva_build + INSTALL_DIR ${CMAKE_CURRENT_BINARY_DIR}/libva_install + CONFIGURE_COMMAND cd ${CMAKE_CURRENT_BINARY_DIR}/libva-prefix/src/libva && ./autogen.sh --prefix=${CMAKE_CURRENT_BINARY_DIR}/libva_install + BUILD_COMMAND cd ${CMAKE_CURRENT_BINARY_DIR}/libva-prefix/src/libva && make -j + INSTALL_COMMAND cd ${CMAKE_CURRENT_BINARY_DIR}/libva-prefix/src/libva && make install + ) + ExternalProject_Add_Step(libva llvminstall + COMMAND ${CMAKE_COMMAND} -E make_directory ${LLVM_BINARY_DIR}/pi_esimd_cpu_deps && ${CMAKE_COMMAND} -E copy_directory / ${LLVM_BINARY_DIR}/pi_esimd_cpu_deps + COMMENT "Installing libva into the LLVM binary directory" + DEPENDEES install + ) +endif() + +file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/cm-emu_build) +file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/cm-emu_install) + +if (MSVC) + set(LIBCM ${LLVM_BINARY_DIR}/pi_esimd_cpu_deps/lib/libcm${CMAKE_STATIC_LIBRARY_SUFFIX}) + set(LIBIGFXCMRT_EMU ${LLVM_BINARY_DIR}/pi_esimd_cpu_deps/lib/igfxcmrt64_emu${CMAKE_STATIC_LIBRARY_SUFFIX}) +else() + set(LIBCM ${LLVM_BINARY_DIR}/pi_esimd_cpu_deps/lib/libcm${CMAKE_SHARED_LIBRARY_SUFFIX}) + set(LIBIGFXCMRT_EMU ${LLVM_BINARY_DIR}/pi_esimd_cpu_deps/lib/libigfxcmrt_emu${CMAKE_SHARED_LIBRARY_SUFFIX}) +endif() + +if (DEFINED CM_LOCAL_SOURCE_DIR) + # Using local CM directory for online building without downloading + if (MSVC) + ExternalProject_Add(cm-emu + DOWNLOAD_COMMAND "" + SOURCE_DIR ${CM_LOCAL_SOURCE_DIR} + BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/cm-emu_build + INSTALL_DIR 
${CMAKE_CURRENT_BINARY_DIR}/cm-emu_install + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX= + BUILD_BYPRODUCTS ${LIBCM} ${LIBIGFXCMRT_EMU} + ) + else() + ExternalProject_Add(cm-emu + DOWNLOAD_COMMAND "" + SOURCE_DIR ${CM_LOCAL_SOURCE_DIR} + BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/cm-emu_build + INSTALL_DIR ${CMAKE_CURRENT_BINARY_DIR}/cm-emu_install + CMAKE_ARGS -DLIBVA_INSTALL_PATH=${CMAKE_CURRENT_BINARY_DIR}/libva_install + -D__SYCL_EXPLICIT_SIMD_PLUGIN__=true + -DCMAKE_INSTALL_PREFIX= + DEPENDS libva + BUILD_BYPRODUCTS ${LIBCM} ${LIBIGFXCMRT_EMU} + ) + endif() +else () + if (DEFINED CM_PACKAGE_URL) + # Downloading pre-built CM Package + file (MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/cm-emu_install) + ExternalProject_Add(cm-emu + URL ${CM_PACKAGE_URL} + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + UPDATE_COMMAND "" + INSTALL_DIR ${CMAKE_CURRENT_BINARY_DIR}/cm-emu_install + INSTALL_COMMAND ${CMAKE_COMMAND} -E copy_directory ${CMAKE_CURRENT_BINARY_DIR}/cm-emu-prefix/src/cm-emu/ + BUILD_BYPRODUCTS ${LIBCM} ${LIBIGFXCMRT_EMU} + ) + else() + # Build from CM source tree fetched from github + ExternalProject_Add(cm-emu + GIT_REPOSITORY https://github.com/intel/cm-cpu-emulation.git + BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/cm-emu_build + INSTALL_DIR ${CMAKE_CURRENT_BINARY_DIR}/cm-emu_install + CMAKE_ARGS -DLIBVA_INSTALL_PATH=${CMAKE_CURRENT_BINARY_DIR}/libva_install + -DCMAKE_INSTALL_PREFIX= + DEPENDS libva + BUILD_BYPRODUCTS ${LIBCM} ${LIBIGFXCMRT_EMU} + ) + endif() +endif () +ExternalProject_Add_Step(cm-emu llvminstall + COMMAND ${CMAKE_COMMAND} -E make_directory ${LLVM_BINARY_DIR}/pi_esimd_cpu_deps && ${CMAKE_COMMAND} -E copy_directory / ${LLVM_BINARY_DIR}/pi_esimd_cpu_deps + COMMENT "Installing cm-emu into the LLVM binary directory" + DEPENDEES install +) + +include_directories(${LLVM_BINARY_DIR}/pi_esimd_cpu_deps/include/igfxcmrt_emu) +include_directories(${LLVM_BINARY_DIR}/pi_esimd_cpu_deps/include/libcm/cm) + +# Compilation flag to exclude lines in header files imported from 
CM +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__SYCL_EXPLICIT_SIMD_PLUGIN__") + +set(CMAKE_CXX_STANDARD 17) + +# Compilation option modification to prevent build termination caused by +# warnings from CM-imported files +if (MSVC) +string(REPLACE "/W4" " " CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") +else() +string(REPLACE "-pedantic" " " CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") +endif() add_library(pi_esimd_cpu SHARED "${sycl_inc_dir}/CL/sycl/detail/pi.h" @@ -31,16 +134,42 @@ else() ) endif() +add_dependencies(pi_esimd_cpu OpenCL-Headers) +add_dependencies(pi_esimd_cpu cm-emu) add_dependencies(sycl-toolchain pi_esimd_cpu) -add_dependencies(pi_esimd_cpu - OpenCL-Headers) - -target_link_libraries(pi_esimd_cpu PRIVATE sycl) +target_link_libraries(pi_esimd_cpu PRIVATE sycl ${LIBCM} ${LIBIGFXCMRT_EMU}) set_target_properties(pi_esimd_cpu PROPERTIES LINKER_LANGUAGE CXX) add_common_options(pi_esimd_cpu) install(TARGETS pi_esimd_cpu - LIBRARY DESTINATION "lib${LLVM_LIBDIR_SUFFIX}" COMPONENT pi_esimd_cpu - RUNTIME DESTINATION "bin" COMPONENT pi_esimd_cpu) + LIBRARY DESTINATION "lib${LLVM_LIBDIR_SUFFIX}" COMPONENT pi_esimd_cpu + RUNTIME DESTINATION "bin" COMPONENT pi_esimd_cpu) + +# Copy CM Header files to $(INSTALL)/include/sycl/CL/ +install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/cm-emu_install/include/libcm/cm/ + DESTINATION ${SYCL_INCLUDE_DIR}/CL + COMPONENT libcmrt-headers + FILES_MATCHING PATTERN "*.h" +) + +# Copy '.so' files to '$(INSTALL)/lib' +if (MSVC) + install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/cm-emu_install/lib/ + DESTINATION ${CMAKE_INSTALL_PREFIX}/lib + COMPONENT libcmrt-libs + FILES_MATCHING PATTERN "*.lib" + ) + install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/cm-emu_install/bin/ + DESTINATION ${CMAKE_INSTALL_PREFIX}/bin + COMPONENT libcmrt-dlls + FILES_MATCHING PATTERN "*.dll" + ) +else() + install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/cm-emu_install/lib/ + DESTINATION ${CMAKE_INSTALL_PREFIX}/lib + COMPONENT libcmrt-sos + FILES_MATCHING PATTERN "*.so" + ) +endif() 
diff --git a/sycl/plugins/esimd_cpu/pi_esimd_cpu.cpp b/sycl/plugins/esimd_cpu/pi_esimd_cpu.cpp index 63fc720f49eee..ce6126736275f 100644 --- a/sycl/plugins/esimd_cpu/pi_esimd_cpu.cpp +++ b/sycl/plugins/esimd_cpu/pi_esimd_cpu.cpp @@ -28,6 +28,8 @@ #include #include +#include + #include #include #include @@ -38,17 +40,77 @@ #include #include -#ifdef __GNUC__ -// Linux -#include -#else -// Windows -#include -#endif - #include "pi_esimd_cpu.hpp" -#define PLACEHOLDER_UNUSED(x) (void)x +namespace { + +template +pi_result getInfoImpl(size_t param_value_size, void *param_value, + size_t *param_value_size_ret, T value, size_t value_size, + Assign &&assign_func) { + + if (param_value != nullptr) { + + if (param_value_size < value_size) { + return PI_INVALID_VALUE; + } + + assign_func(param_value, value, value_size); + } + + if (param_value_size_ret != nullptr) { + *param_value_size_ret = value_size; + } + + return PI_SUCCESS; +} + +template +pi_result getInfo(size_t param_value_size, void *param_value, + size_t *param_value_size_ret, T value) { + + auto assignment = [](void *param_value, T value, size_t value_size) { + *static_cast(param_value) = value; + }; + + return getInfoImpl(param_value_size, param_value, param_value_size_ret, value, + sizeof(T), assignment); +} + +template +pi_result getInfoArray(size_t array_length, size_t param_value_size, + void *param_value, size_t *param_value_size_ret, + T *value) { + return getInfoImpl(param_value_size, param_value, param_value_size_ret, value, + array_length * sizeof(T), memcpy); +} + +template <> +pi_result getInfo(size_t param_value_size, void *param_value, + size_t *param_value_size_ret, + const char *value) { + return getInfoArray(strlen(value) + 1, param_value_size, param_value, + param_value_size_ret, value); +} + +class ReturnHelper { +public: + ReturnHelper(size_t param_value_size, void *param_value, + size_t *param_value_size_ret) + : param_value_size(param_value_size), param_value(param_value), + 
param_value_size_ret(param_value_size_ret) {} + + template pi_result operator()(const T &t) { + return getInfo(param_value_size, param_value, param_value_size_ret, t); + } + +private: + size_t param_value_size; + void *param_value; + size_t *param_value_size_ret; +}; + +} // anonymous namespace // Global variables used in PI_esimd_cpu // Note we only create a simple pointer variables such that C++ RT won't @@ -115,121 +177,208 @@ LAMBDA_WRAPPER_TMPL(sycl::nd_item<3>, NDITEM_3DIM, 3) #undef _COMMA_ #undef LAMBDA_WRAPPER_TMPL -extern "C" inline void invokeLambda_ID_1DIM(void *Wrapper) { - auto *LambdaWrapper = reinterpret_cast(Wrapper); +extern "C" { - PLACEHOLDER_UNUSED(LambdaWrapper); - // TODO : construct cl::sycl::id<1> instance using thread ID info - // retrieved from CM and call Lambda function - // LambdaWrapper->Func(id_1dim); +inline void InvokeLambda_ID_1DIM(void *Wrapper) { + auto *LambdaWrapper = reinterpret_cast(Wrapper); + cl::sycl::id<1> Id1Dim(cm_support::get_thread_idx(0)); + LambdaWrapper->Func(Id1Dim); } -extern "C" inline void invokeLambda_ID_2DIM(void *Wrapper) { +inline void InvokeLambda_ID_2DIM(void *Wrapper) { auto *LambdaWrapper = reinterpret_cast(Wrapper); - - PLACEHOLDER_UNUSED(LambdaWrapper); - // TODO : construct cl::sycl::id<2> instance using thread ID info - // retrieved from CM and call Lambda function - // LambdaWrapper->Func(id_2dim); + cl::sycl::id<2> Id2Dim(cm_support::get_thread_idx(0), + cm_support::get_thread_idx(1)); + LambdaWrapper->Func(Id2Dim); } -extern "C" inline void invokeLambda_ID_3DIM(void *Wrapper) { +inline void InvokeLambda_ID_3DIM(void *Wrapper) { auto *LambdaWrapper = reinterpret_cast(Wrapper); - - PLACEHOLDER_UNUSED(LambdaWrapper); - // TODO : construct cl::sycl::id<3> instance using thread ID info - // retrieved from CM and call Lambda function - // LambdaWrapper->Func(id_3dim); + cl::sycl::id<3> Id3Dim(cm_support::get_thread_idx(0), + cm_support::get_thread_idx(1), + cm_support::get_thread_idx(2)); + 
LambdaWrapper->Func(Id3Dim); } -extern "C" inline void invokeLambda_ITEM_1DIM(void *Wrapper) { +inline void InvokeLambda_ITEM_1DIM(void *Wrapper) { auto *LambdaWrapper = reinterpret_cast(Wrapper); - - PLACEHOLDER_UNUSED(LambdaWrapper); - // TODO : construct cl::sycl::item<1, false> instance using thread - // ID info retrieved from CM and call Lambda function - // LambdaWrapper->Func(item_1dim); + cl::sycl::item<1, false> Item1Dim = IDBuilder::createItem<1, false>( + {cm_support::get_thread_count(0)}, /// Extent + {cm_support::get_thread_idx(0)}); /// Index + LambdaWrapper->Func(Item1Dim); } -extern "C" inline void invokeLambda_ITEM_2DIM(void *Wrapper) { +inline void InvokeLambda_ITEM_2DIM(void *Wrapper) { auto *LambdaWrapper = reinterpret_cast(Wrapper); - - PLACEHOLDER_UNUSED(LambdaWrapper); - // TODO : construct cl::sycl::item<2, false> instance using thread - // ID info retrieved from CM and call Lambda function - // LambdaWrapper->Func(item_2dim); + cl::sycl::item<2, false> Item2Dim = IDBuilder::createItem<2, false>( + {cm_support::get_thread_count(0), /// Extent + cm_support::get_thread_count(1)}, + {cm_support::get_thread_idx(0), /// Index + cm_support::get_thread_idx(1)}); + LambdaWrapper->Func(Item2Dim); } -extern "C" inline void invokeLambda_ITEM_3DIM(void *Wrapper) { +inline void InvokeLambda_ITEM_3DIM(void *Wrapper) { auto *LambdaWrapper = reinterpret_cast(Wrapper); + cl::sycl::item<3, false> Item3Dim = IDBuilder::createItem<3, false>( + {cm_support::get_thread_count(0), /// Extent + cm_support::get_thread_count(1), cm_support::get_thread_count(2)}, + {cm_support::get_thread_idx(0), /// Index + cm_support::get_thread_idx(1), cm_support::get_thread_idx(2)}); - PLACEHOLDER_UNUSED(LambdaWrapper); - // TODO : construct cl::sycl::item<3, false> instance using thread - // ID info retrieved from CM and call Lambda function - // LambdaWrapper->Func(item_3dim); + LambdaWrapper->Func(Item3Dim); } -extern "C" inline void invokeLambda_ITEM_OFFSET_1DIM(void *Wrapper) 
{ +inline void InvokeLambda_ITEM_OFFSET_1DIM(void *Wrapper) { auto *LambdaWrapper = reinterpret_cast(Wrapper); - PLACEHOLDER_UNUSED(LambdaWrapper); - // TODO : construct cl::sycl::item<1, true> instance using thread - // ID info retrieved from CM with GlobalOffset info and call Lambda - // function - // LambdaWrapper->Func(item_offset_1dim); + cl::sycl::item<1, true> ItemOffset1Dim = IDBuilder::createItem<1, true>( + {cm_support::get_thread_count(0)}, /// Extent + {cm_support::get_thread_idx(0) + + LambdaWrapper->GlobalOffset[0]}, /// Index + {LambdaWrapper->GlobalOffset[0]} /// Offset + ); + LambdaWrapper->Func(ItemOffset1Dim); } -extern "C" inline void invokeLambda_ITEM_OFFSET_2DIM(void *Wrapper) { +inline void InvokeLambda_ITEM_OFFSET_2DIM(void *Wrapper) { auto *LambdaWrapper = reinterpret_cast(Wrapper); - PLACEHOLDER_UNUSED(LambdaWrapper); - // TODO : construct cl::sycl::item<2, true> instance using thread - // ID info retrieved from CM with GlobalOffset info and call Lambda - // function - // LambdaWrapper->Func(item_offset_2dim); + cl::sycl::item<2, true> ItemOffset2Dim = IDBuilder::createItem<2, true>( + {cm_support::get_thread_count(0), /// Extent + cm_support::get_thread_count(1)}, + {cm_support::get_thread_idx(0) + LambdaWrapper->GlobalOffset[0], /// Index + cm_support::get_thread_idx(1) + LambdaWrapper->GlobalOffset[1]}, + {LambdaWrapper->GlobalOffset[0], /// Offset + LambdaWrapper->GlobalOffset[1]}); + LambdaWrapper->Func(ItemOffset2Dim); } -extern "C" inline void invokeLambda_ITEM_OFFSET_3DIM(void *Wrapper) { +inline void InvokeLambda_ITEM_OFFSET_3DIM(void *Wrapper) { auto *LambdaWrapper = reinterpret_cast(Wrapper); - PLACEHOLDER_UNUSED(LambdaWrapper); - // TODO : construct cl::sycl::item<3, true> instance using thread - // ID info retrieved from CM with GlobalOffset info and call Lambda - // function - // LambdaWrapper->Func(item_offset_3dim); + cl::sycl::item<3, true> ItemOffset3Dim = IDBuilder::createItem<3, true>( + 
{cm_support::get_thread_count(0), /// Extent + cm_support::get_thread_count(1), cm_support::get_thread_count(2)}, + {cm_support::get_thread_idx(0) + LambdaWrapper->GlobalOffset[0], /// Index + cm_support::get_thread_idx(1) + LambdaWrapper->GlobalOffset[1], + cm_support::get_thread_idx(2) + LambdaWrapper->GlobalOffset[2]}, + {LambdaWrapper->GlobalOffset[0], /// Offset + LambdaWrapper->GlobalOffset[1], LambdaWrapper->GlobalOffset[2]}); + LambdaWrapper->Func(ItemOffset3Dim); } -extern "C" inline void invokeLambda_NDITEM_1DIM(void *Wrapper) { +inline void InvokeLambda_NDITEM_1DIM(void *Wrapper) { auto *LambdaWrapper = reinterpret_cast(Wrapper); + sycl::range<1> GroupSize( + sycl::detail::InitializedVal<1, sycl::range>::template get<0>()); + + if (LambdaWrapper->LocalSize[0] == 0 || + LambdaWrapper->GlobalSize[0] % LambdaWrapper->LocalSize[0] != 0) { + throw sycl::nd_range_error("Invalid local size for global size - 1DIM", + PI_INVALID_WORK_GROUP_SIZE); + } + GroupSize[0] = LambdaWrapper->GlobalSize[0] / LambdaWrapper->LocalSize[0]; + + const sycl::id<1> LocalID = {cm_support::get_thread_idx(0)}; + + const sycl::id<1> GroupID = {cm_support::get_group_idx(0)}; - PLACEHOLDER_UNUSED(LambdaWrapper); - // TODO : construct cl::sycl::nd_item<1> instance using thread ID - // info retrieved from CM with GlobalOffset/GlobalSize/LocalSize - // info and call Lambda function - // LambdaWrapper->Func(nd_item_1dim); + const sycl::group<1> Group = IDBuilder::createGroup<1>( + LambdaWrapper->GlobalSize, LambdaWrapper->LocalSize, GroupSize, GroupID); + + const sycl::id<1> GlobalID = GroupID * LambdaWrapper->LocalSize + LocalID + + LambdaWrapper->GlobalOffset; + const sycl::item<1, /*Offset=*/true> GlobalItem = + IDBuilder::createItem<1, true>(LambdaWrapper->GlobalSize, GlobalID, + LambdaWrapper->GlobalOffset); + const sycl::item<1, /*Offset=*/false> LocalItem = + IDBuilder::createItem<1, false>(LambdaWrapper->LocalSize, LocalID); + + const sycl::nd_item<1> NDItem1Dim = + 
IDBuilder::createNDItem<1>(GlobalItem, LocalItem, Group); + + LambdaWrapper->Func(NDItem1Dim); } -extern "C" inline void invokeLambda_NDITEM_2DIM(void *Wrapper) { +inline void InvokeLambda_NDITEM_2DIM(void *Wrapper) { auto *LambdaWrapper = reinterpret_cast(Wrapper); + sycl::range<2> GroupSize( + sycl::detail::InitializedVal<2, sycl::range>::template get<0>()); + + for (int I = 0; I < 2 /*Dims*/; ++I) { + if (LambdaWrapper->LocalSize[I] == 0 || + LambdaWrapper->GlobalSize[I] % LambdaWrapper->LocalSize[I] != 0) { + throw sycl::nd_range_error("Invalid local size for global size - 2DIM", + PI_INVALID_WORK_GROUP_SIZE); + } + GroupSize[I] = LambdaWrapper->GlobalSize[I] / LambdaWrapper->LocalSize[I]; + } + + const sycl::id<2> LocalID = {cm_support::get_thread_idx(0), + cm_support::get_thread_idx(1)}; + + const sycl::id<2> GroupID = {cm_support::get_group_idx(0), + cm_support::get_group_idx(1)}; + + const sycl::group<2> Group = IDBuilder::createGroup<2>( + LambdaWrapper->GlobalSize, LambdaWrapper->LocalSize, GroupSize, GroupID); - PLACEHOLDER_UNUSED(LambdaWrapper); - // TODO : construct cl::sycl::nd_item<2> instance using thread ID - // info retrieved from CM with GlobalOffset/GlobalSize/LocalSize - // info and call Lambda function - // LambdaWrapper->Func(nd_item_2dim); + const sycl::id<2> GlobalID = GroupID * LambdaWrapper->LocalSize + LocalID + + LambdaWrapper->GlobalOffset; + const sycl::item<2, /*Offset=*/true> GlobalItem = + IDBuilder::createItem<2, true>(LambdaWrapper->GlobalSize, GlobalID, + LambdaWrapper->GlobalOffset); + const sycl::item<2, /*Offset=*/false> LocalItem = + IDBuilder::createItem<2, false>(LambdaWrapper->LocalSize, LocalID); + + const sycl::nd_item<2> NDItem2Dim = + IDBuilder::createNDItem<2>(GlobalItem, LocalItem, Group); + + LambdaWrapper->Func(NDItem2Dim); } -extern "C" inline void invokeLambda_NDITEM_3DIM(void *Wrapper) { +inline void InvokeLambda_NDITEM_3DIM(void *Wrapper) { auto *LambdaWrapper = reinterpret_cast(Wrapper); + sycl::range<3> 
GroupSize( + sycl::detail::InitializedVal<3, sycl::range>::template get<0>()); + + for (int I = 0; I < 3 /*Dims*/; ++I) { + if (LambdaWrapper->LocalSize[I] == 0 || + LambdaWrapper->GlobalSize[I] % LambdaWrapper->LocalSize[I] != 0) { + throw sycl::nd_range_error("Invalid local size for global size - 3DIM", + PI_INVALID_WORK_GROUP_SIZE); + } + GroupSize[I] = LambdaWrapper->GlobalSize[I] / LambdaWrapper->LocalSize[I]; + } + + const sycl::id<3> LocalID = {cm_support::get_thread_idx(0), + cm_support::get_thread_idx(1), + cm_support::get_thread_idx(2)}; + + const sycl::id<3> GroupID = {cm_support::get_group_idx(0), + cm_support::get_group_idx(1), + cm_support::get_group_idx(2)}; + + const sycl::group<3> Group = IDBuilder::createGroup<3>( + LambdaWrapper->GlobalSize, LambdaWrapper->LocalSize, GroupSize, GroupID); - PLACEHOLDER_UNUSED(LambdaWrapper); - // TODO : construct cl::sycl::nd_item<3> instance using thread ID - // info retrieved from CM with GlobalOffset/GlobalSize/LocalSize - // info and call Lambda function - // LambdaWrapper->Func(nd_item_3dim); + const sycl::id<3> GlobalID = GroupID * LambdaWrapper->LocalSize + LocalID + + LambdaWrapper->GlobalOffset; + const sycl::item<3, /*Offset=*/true> GlobalItem = + IDBuilder::createItem<3, true>(LambdaWrapper->GlobalSize, GlobalID, + LambdaWrapper->GlobalOffset); + const sycl::item<3, /*Offset=*/false> LocalItem = + IDBuilder::createItem<3, false>(LambdaWrapper->LocalSize, LocalID); + + const sycl::nd_item<3> NDItem3Dim = + IDBuilder::createNDItem<3>(GlobalItem, LocalItem, Group); + + LambdaWrapper->Func(NDItem3Dim); +} } // libCMBatch class defines interface for launching kernels with @@ -268,9 +417,10 @@ template class libCMBatch { SpaceDim[0] = (uint32_t)Range[0]; - PLACEHOLDER_UNUSED(WrappedLambda_ID_1DIM); - // TODO : Invoke invokeLambda_ID_1DIM through CM's multi-threaded - // kernel launching with WrappedLambda_ID_1DIM and dimension info + ESimdCPUKernel ESimdCPU((fptrVoid)InvokeLambda_ID_1DIM, GroupDim, SpaceDim); 
+ + ESimdCPU.launchMT(sizeof(struct LambdaWrapper_ID_1DIM), + WrappedLambda_ID_1DIM.get()); } // ID_2DIM @@ -284,9 +434,10 @@ template class libCMBatch { SpaceDim[0] = (uint32_t)Range[0]; SpaceDim[1] = (uint32_t)Range[1]; - PLACEHOLDER_UNUSED(WrappedLambda_ID_2DIM); - // TODO : Invoke invokeLambda_ID_2DIM through CM's multi-threaded - // kernel launching with WrappedLambda_ID_2DIM and dimension info + ESimdCPUKernel ESimdCPU((fptrVoid)InvokeLambda_ID_2DIM, GroupDim, SpaceDim); + + ESimdCPU.launchMT(sizeof(struct LambdaWrapper_ID_2DIM), + WrappedLambda_ID_2DIM.get()); } // ID_3DIM @@ -301,9 +452,10 @@ template class libCMBatch { SpaceDim[1] = (uint32_t)Range[1]; SpaceDim[2] = (uint32_t)Range[2]; - PLACEHOLDER_UNUSED(WrappedLambda_ID_3DIM); - // TODO : Invoke invokeLambda_ID_3DIM through CM's multi-threaded - // kernel launching with WrappedLambda_ID_3DIM and dimension info + ESimdCPUKernel ESimdCPU((fptrVoid)InvokeLambda_ID_3DIM, GroupDim, SpaceDim); + + ESimdCPU.launchMT(sizeof(struct LambdaWrapper_ID_3DIM), + WrappedLambda_ID_3DIM.get()); } // Item w/o offset @@ -317,9 +469,11 @@ template class libCMBatch { SpaceDim[0] = (uint32_t)Range[0]; - PLACEHOLDER_UNUSED(WrappedLambda_ITEM_1DIM); - // TODO : Invoke invokeLambda_ITEM_1DIM through CM's multi-threaded - // kernel launching with WrappedLambda_ITEM_1DIM and dimension info + ESimdCPUKernel ESimdCPU((fptrVoid)InvokeLambda_ITEM_1DIM, GroupDim, + SpaceDim); + + ESimdCPU.launchMT(sizeof(struct LambdaWrapper_ITEM_1DIM), + WrappedLambda_ITEM_1DIM.get()); } template @@ -333,9 +487,11 @@ template class libCMBatch { SpaceDim[0] = (uint32_t)Range[0]; SpaceDim[1] = (uint32_t)Range[1]; - PLACEHOLDER_UNUSED(WrappedLambda_ITEM_2DIM); - // TODO : Invoke invokeLambda_ITEM_2DIM through CM's multi-threaded - // kernel launching with WrappedLambda_ITEM_2DIM and dimension info + ESimdCPUKernel ESimdCPU((fptrVoid)InvokeLambda_ITEM_2DIM, GroupDim, + SpaceDim); + + ESimdCPU.launchMT(sizeof(struct LambdaWrapper_ITEM_2DIM), + 
WrappedLambda_ITEM_2DIM.get()); } template @@ -350,9 +506,11 @@ template class libCMBatch { SpaceDim[1] = (uint32_t)Range[1]; SpaceDim[2] = (uint32_t)Range[2]; - PLACEHOLDER_UNUSED(WrappedLambda_ITEM_3DIM); - // TODO : Invoke invokeLambda_ITEM_3DIM through CM's multi-threaded - // kernel launching with WrappedLambda_ITEM_3DIM and dimension info + ESimdCPUKernel ESimdCPU((fptrVoid)InvokeLambda_ITEM_3DIM, GroupDim, + SpaceDim); + + ESimdCPU.launchMT(sizeof(struct LambdaWrapper_ITEM_3DIM), + WrappedLambda_ITEM_3DIM.get()); } // Item w/ offset @@ -366,9 +524,11 @@ template class libCMBatch { SpaceDim[0] = (uint32_t)Range[0]; - PLACEHOLDER_UNUSED(WrappedLambda_ITEM_OFFSET_1DIM); - // TODO : Invoke invokeLambda_ITEM_OFFSET_1DIM through CM's multi-threaded - // kernel launching with WrappedLambda_ITEM_OFFSET_1DIM and dimension info + ESimdCPUKernel ESimdCPU((fptrVoid)InvokeLambda_ITEM_OFFSET_1DIM, GroupDim, + SpaceDim); + + ESimdCPU.launchMT(sizeof(struct LambdaWrapper_ITEM_OFFSET_1DIM), + WrappedLambda_ITEM_OFFSET_1DIM.get()); } template @@ -382,9 +542,11 @@ template class libCMBatch { SpaceDim[0] = (uint32_t)Range[0]; SpaceDim[1] = (uint32_t)Range[1]; - PLACEHOLDER_UNUSED(WrappedLambda_ITEM_OFFSET_2DIM); - // TODO : Invoke invokeLambda_ITEM_OFFSET_2DIM through CM's multi-threaded - // kernel launching with WrappedLambda_ITEM_OFFSET_2DIM and dimension info + ESimdCPUKernel ESimdCPU((fptrVoid)InvokeLambda_ITEM_OFFSET_2DIM, GroupDim, + SpaceDim); + + ESimdCPU.launchMT(sizeof(struct LambdaWrapper_ITEM_OFFSET_2DIM), + WrappedLambda_ITEM_OFFSET_2DIM.get()); } template @@ -399,9 +561,11 @@ template class libCMBatch { SpaceDim[1] = (uint32_t)Range[1]; SpaceDim[2] = (uint32_t)Range[2]; - PLACEHOLDER_UNUSED(WrappedLambda_ITEM_OFFSET_3DIM); - // TODO : Invoke invokeLambda_ITEM_OFFSET_3DIM through CM's multi-threaded - // kernel launching with WrappedLambda_ITEM_OFFSET_3DIM and dimension info + ESimdCPUKernel ESimdCPU((fptrVoid)InvokeLambda_ITEM_OFFSET_3DIM, GroupDim, + SpaceDim); 
+ + ESimdCPU.launchMT(sizeof(struct LambdaWrapper_ITEM_OFFSET_3DIM), + WrappedLambda_ITEM_OFFSET_3DIM.get()); } // NDItem_1DIM @@ -418,9 +582,11 @@ template class libCMBatch { GroupDim[0] = (uint32_t)(GlobalSize[0] / LocalSize[0]); - PLACEHOLDER_UNUSED(WrappedLambda_NDITEM_1DIM); - // TODO : Invoke invokeLambda_NDITEM_1DIM through CM's multi-threaded - // kernel launching with WrappedLambda_NDITEM_1DIM and dimension info + ESimdCPUKernel ESimdCPU((fptrVoid)InvokeLambda_NDITEM_1DIM, GroupDim, + SpaceDim); + + ESimdCPU.launchMT(sizeof(struct LambdaWrapper_NDITEM_1DIM), + WrappedLambda_NDITEM_1DIM.get()); } // NDItem_2DIM @@ -439,9 +605,11 @@ template class libCMBatch { GroupDim[0] = (uint32_t)(GlobalSize[0] / LocalSize[0]); GroupDim[1] = (uint32_t)(GlobalSize[1] / LocalSize[1]); - PLACEHOLDER_UNUSED(WrappedLambda_NDITEM_2DIM); - // TODO : Invoke invokeLambda_NDITEM_2DIM through CM's multi-threaded - // kernel launching with WrappedLambda_NDITEM_2DIM and dimension info + ESimdCPUKernel ESimdCPU((fptrVoid)InvokeLambda_NDITEM_2DIM, GroupDim, + SpaceDim); + + ESimdCPU.launchMT(sizeof(struct LambdaWrapper_NDITEM_2DIM), + WrappedLambda_NDITEM_2DIM.get()); } // NDItem_3DIM @@ -462,30 +630,104 @@ template class libCMBatch { GroupDim[1] = (uint32_t)(GlobalSize[1] / LocalSize[1]); GroupDim[2] = (uint32_t)(GlobalSize[2] / LocalSize[2]); - PLACEHOLDER_UNUSED(WrappedLambda_NDITEM_3DIM); - // TODO : Invoke invokeLambda_NDITEM_3DIM through CM's multi-threaded - // kernel launching with WrappedLambda_NDITEM_3DIM and dimension info + ESimdCPUKernel ESimdCPU((fptrVoid)InvokeLambda_NDITEM_3DIM, GroupDim, + SpaceDim); + + ESimdCPU.launchMT(sizeof(struct LambdaWrapper_NDITEM_3DIM), + WrappedLambda_NDITEM_3DIM.get()); } }; + +/// Implementation for ESIMD_CPU device interface accessing ESIMD +/// intrinsics and LibCM functionalities required by intrinsics // Intrinsics sycl::detail::ESIMDDeviceInterface::ESIMDDeviceInterface() { - reserved = nullptr; version = ESIMDEmuPluginInterfaceVersion; + 
reserved = nullptr; - /// TODO : Fill *_ptr fields with function pointers from CM - /// functions prefixed with 'cm_support' + /* From 'esimd_emu_functions_v1.h' : Start */ + cm_barrier_ptr = cm_support::barrier; + cm_sbarrier_ptr = cm_support::split_barrier; + cm_fence_ptr = cm_support::fence; - cm_barrier_ptr = nullptr; /* cm_support::barrier; */ - cm_sbarrier_ptr = nullptr; /* cm_support::split_barrier; */ - cm_fence_ptr = nullptr; /* cm_support::fence; */ + sycl_get_surface_base_addr_ptr = cm_support::get_surface_base_addr; + __cm_emu_get_slm_ptr = cm_support::get_slm_base; + cm_slm_init_ptr = cm_support::init_slm; + /* From 'esimd_emu_functions_v1.h' : End */ +} + +/// Implementation for Host Kernel Launch used by +/// piEnqueueKernelLaunch +template using KernelFunc = std::function; - sycl_get_surface_base_addr_ptr = - nullptr; /* cm_support::get_surface_base_addr; */ - __cm_emu_get_slm_ptr = nullptr; /* cm_support::get_slm_base; */ - cm_slm_init_ptr = nullptr; /* cm_support::init_slm; */ +template struct InvokeBaseImpl { + static sycl::range get_range(const size_t *GlobalWorkSize); +}; + +static constexpr bool isNull(int NDims, const size_t *R) { + return ((0 == R[0]) && (1 > NDims || 0 == R[1]) && (2 > NDims || 0 == R[2])); } +template struct InvokeImpl { + + template + static typename std::enable_if<_NDims == 1, sycl::range<1>>::type + get_range(const size_t *a) { + return sycl::range<1>(a[0]); + } + + template + static typename std::enable_if<_NDims == 2, sycl::range<2>>::type + get_range(const size_t *a) { + return sycl::range<2>(a[0], a[1]); + } + + template + static typename std::enable_if<_NDims == 3, sycl::range<3>>::type + get_range(const size_t *a) { + return sycl::range<3>(a[0], a[1], a[2]); + } + + static void invoke(void *fptr, const sycl::range &range) { + auto f = reinterpret_cast *>(fptr); + libCMBatch, ArgTy, NDims> CmThreading(*f); + CmThreading.runIterationSpace(range); + } + + static void invoke(void *fptr, const size_t *GlobalWorkSize) { 
+ sycl::range range = get_range(GlobalWorkSize); + invoke(fptr, range); + } + + static void invoke(void *fptr, const size_t *GlobalWorkOffset, + const size_t *GlobalWorkSize) { + auto GlobalSize = get_range(GlobalWorkSize); + sycl::id GlobalOffset = get_range(GlobalWorkOffset); + + auto f = reinterpret_cast *>(fptr); + libCMBatch, ArgTy, NDims> CmThreading(*f); + CmThreading.runIterationSpace(GlobalSize, GlobalOffset); + } + + static void invoke(void *fptr, const size_t *GlobalWorkOffset, + const size_t *GlobalWorkSize, + const size_t *LocalWorkSize) { + const size_t LocalWorkSz[] = {1, 1, 1}; + if (isNull(NDims, LocalWorkSize)) { + LocalWorkSize = LocalWorkSz; + } + + auto GlobalSize = get_range(GlobalWorkSize); + auto LocalSize = get_range(LocalWorkSize); + sycl::id GlobalOffset = get_range(GlobalWorkOffset); + + auto f = reinterpret_cast *>(fptr); + libCMBatch, ArgTy, NDims> CmThreading(*f); + + CmThreading.runIterationSpace(LocalSize, GlobalSize, GlobalOffset); + } +}; + extern "C" { #define DIE_NO_IMPLEMENTATION \ @@ -506,22 +748,52 @@ extern "C" { pi_result piPlatformsGet(pi_uint32 NumEntries, pi_platform *Platforms, pi_uint32 *NumPlatforms) { - (void)NumEntries; - (void)Platforms; - (void)NumPlatforms; - DIE_NO_IMPLEMENTATION; + if (NumEntries == 0 && Platforms != nullptr) { + return PI_INVALID_VALUE; + } + if (Platforms == nullptr && NumPlatforms == nullptr) { + return PI_INVALID_VALUE; + } + + if (Platforms && NumEntries > 0) { + *Platforms = new _pi_platform(); + Platforms[0]->CmEmuVersion = std::string("0.0.1"); + } + + if (NumPlatforms) { + *NumPlatforms = 1; + } + return PI_SUCCESS; } pi_result piPlatformGetInfo(pi_platform Platform, pi_platform_info ParamName, size_t ParamValueSize, void *ParamValue, size_t *ParamValueSizeRet) { - (void)Platform; - (void)ParamName; - (void)ParamValueSize; - (void)ParamValue; - (void)ParamValueSizeRet; - DIE_NO_IMPLEMENTATION; + assert(Platform); + ReturnHelper ReturnValue(ParamValueSize, ParamValue, 
ParamValueSizeRet); + + switch (ParamName) { + case PI_PLATFORM_INFO_NAME: + return ReturnValue("Intel(R) ESIMD_CPU/GPU"); + + case PI_PLATFORM_INFO_VENDOR: + return ReturnValue("Intel(R) Corporation"); + + case PI_PLATFORM_INFO_VERSION: + return ReturnValue(Platform->CmEmuVersion); + + case PI_PLATFORM_INFO_PROFILE: + return ReturnValue("CM_FULL_PROFILE"); + + case PI_PLATFORM_INFO_EXTENSIONS: + return ReturnValue(""); + + default: + // TODO: implement other parameters + die("Unsupported ParamName in piPlatformGetInfo"); + } + return PI_SUCCESS; } @@ -538,35 +810,148 @@ pi_result piextPlatformCreateWithNativeHandle(pi_native_handle, pi_platform *) { pi_result piDevicesGet(pi_platform Platform, pi_device_type DeviceType, pi_uint32 NumEntries, pi_device *Devices, pi_uint32 *NumDevices) { - (void)Platform; - (void)DeviceType; - (void)NumEntries; - (void)Devices; - (void)NumDevices; - DIE_NO_IMPLEMENTATION; + if (NumEntries == 0) { + if (NumDevices) { + *NumDevices = 1; + } else { + return PI_INVALID_VALUE; + } + } + + if (NumDevices) { + *NumDevices = 1; + } else { + // assert(NumEntries == 1); + Devices[0] = new _pi_device(Platform); + } + return PI_SUCCESS; } pi_result piDeviceRetain(pi_device Device) { - (void)Device; - DIE_NO_IMPLEMENTATION; + assert(Device); + + ++(Device->RefCount); + return PI_SUCCESS; } pi_result piDeviceRelease(pi_device) { - DIE_NO_IMPLEMENTATION; + CONTINUE_NO_IMPLEMENTATION; return PI_SUCCESS; } pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName, size_t ParamValueSize, void *ParamValue, size_t *ParamValueSizeRet) { - (void)Device; - (void)ParamName; - (void)ParamValueSize; - (void)ParamValue; - (void)ParamValueSizeRet; - DIE_NO_IMPLEMENTATION; + ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet); + + switch (ParamName) { + case PI_DEVICE_INFO_TYPE: + return ReturnValue(PI_DEVICE_TYPE_GPU); + case PI_DEVICE_INFO_PARENT_DEVICE: + return ReturnValue(pi_device{0}); + case PI_DEVICE_INFO_PLATFORM: + 
return ReturnValue(Device->Platform);
+ case PI_DEVICE_INFO_NAME:
+ return ReturnValue("ESIMD_CPU");
+ case PI_DEVICE_INFO_IMAGE_SUPPORT:
+ return ReturnValue(pi_bool{true});
+ case PI_DEVICE_INFO_DRIVER_VERSION:
+ return ReturnValue("0.0.1");
+ case PI_DEVICE_INFO_VENDOR:
+ return ReturnValue("Intel(R) Corporation");
+ case PI_DEVICE_INFO_IMAGE2D_MAX_WIDTH:
+ return ReturnValue(size_t{8192});
+ case PI_DEVICE_INFO_IMAGE2D_MAX_HEIGHT:
+ return ReturnValue(size_t{8192});
+ case PI_DEVICE_INFO_HOST_UNIFIED_MEMORY:
+ return ReturnValue(pi_bool{1});
+
+#define UNSUPPORTED_INFO(info) \
+ case info: \
+ std::cerr << std::endl \
+ << "Unsupported device info = " << #info << std::endl; \
+ DIE_NO_IMPLEMENTATION; \
+ break;
+
+ UNSUPPORTED_INFO(PI_DEVICE_INFO_VENDOR_ID)
+ UNSUPPORTED_INFO(PI_DEVICE_INFO_EXTENSIONS)
+ UNSUPPORTED_INFO(PI_DEVICE_INFO_COMPILER_AVAILABLE)
+ UNSUPPORTED_INFO(PI_DEVICE_INFO_LINKER_AVAILABLE)
+ UNSUPPORTED_INFO(PI_DEVICE_INFO_MAX_COMPUTE_UNITS)
+ UNSUPPORTED_INFO(PI_DEVICE_INFO_MAX_WORK_ITEM_DIMENSIONS)
+ UNSUPPORTED_INFO(PI_DEVICE_INFO_MAX_WORK_GROUP_SIZE)
+ UNSUPPORTED_INFO(PI_DEVICE_INFO_MAX_WORK_ITEM_SIZES)
+ UNSUPPORTED_INFO(PI_DEVICE_INFO_MAX_CLOCK_FREQUENCY)
+ UNSUPPORTED_INFO(PI_DEVICE_INFO_ADDRESS_BITS)
+ UNSUPPORTED_INFO(PI_DEVICE_INFO_MAX_MEM_ALLOC_SIZE)
+ UNSUPPORTED_INFO(PI_DEVICE_INFO_GLOBAL_MEM_SIZE)
+ UNSUPPORTED_INFO(PI_DEVICE_INFO_LOCAL_MEM_SIZE)
+ UNSUPPORTED_INFO(PI_DEVICE_INFO_AVAILABLE)
+ UNSUPPORTED_INFO(PI_DEVICE_INFO_VERSION)
+ UNSUPPORTED_INFO(PI_DEVICE_INFO_PARTITION_MAX_SUB_DEVICES)
+ UNSUPPORTED_INFO(PI_DEVICE_INFO_REFERENCE_COUNT)
+ UNSUPPORTED_INFO(PI_DEVICE_INFO_PARTITION_PROPERTIES)
+ UNSUPPORTED_INFO(PI_DEVICE_INFO_PARTITION_AFFINITY_DOMAIN)
+ UNSUPPORTED_INFO(PI_DEVICE_INFO_PARTITION_TYPE)
+ UNSUPPORTED_INFO(PI_DEVICE_INFO_OPENCL_C_VERSION)
+ UNSUPPORTED_INFO(PI_DEVICE_INFO_PREFERRED_INTEROP_USER_SYNC)
+ UNSUPPORTED_INFO(PI_DEVICE_INFO_PRINTF_BUFFER_SIZE)
+ UNSUPPORTED_INFO(PI_DEVICE_INFO_PROFILE)
+ 
UNSUPPORTED_INFO(PI_DEVICE_INFO_BUILT_IN_KERNELS) + UNSUPPORTED_INFO(PI_DEVICE_INFO_QUEUE_PROPERTIES) + UNSUPPORTED_INFO(PI_DEVICE_INFO_EXECUTION_CAPABILITIES) + UNSUPPORTED_INFO(PI_DEVICE_INFO_ENDIAN_LITTLE) + UNSUPPORTED_INFO(PI_DEVICE_INFO_ERROR_CORRECTION_SUPPORT) + UNSUPPORTED_INFO(PI_DEVICE_INFO_PROFILING_TIMER_RESOLUTION) + UNSUPPORTED_INFO(PI_DEVICE_INFO_LOCAL_MEM_TYPE) + UNSUPPORTED_INFO(PI_DEVICE_INFO_MAX_CONSTANT_ARGS) + UNSUPPORTED_INFO(PI_DEVICE_INFO_MAX_CONSTANT_BUFFER_SIZE) + UNSUPPORTED_INFO(PI_DEVICE_INFO_GLOBAL_MEM_CACHE_TYPE) + UNSUPPORTED_INFO(PI_DEVICE_INFO_GLOBAL_MEM_CACHELINE_SIZE) + UNSUPPORTED_INFO(PI_DEVICE_INFO_GLOBAL_MEM_CACHE_SIZE) + UNSUPPORTED_INFO(PI_DEVICE_INFO_MAX_PARAMETER_SIZE) + UNSUPPORTED_INFO(PI_DEVICE_INFO_MEM_BASE_ADDR_ALIGN) + UNSUPPORTED_INFO(PI_DEVICE_INFO_MAX_SAMPLERS) + UNSUPPORTED_INFO(PI_DEVICE_INFO_MAX_READ_IMAGE_ARGS) + UNSUPPORTED_INFO(PI_DEVICE_INFO_MAX_WRITE_IMAGE_ARGS) + UNSUPPORTED_INFO(PI_DEVICE_INFO_SINGLE_FP_CONFIG) + UNSUPPORTED_INFO(PI_DEVICE_INFO_HALF_FP_CONFIG) + UNSUPPORTED_INFO(PI_DEVICE_INFO_DOUBLE_FP_CONFIG) + UNSUPPORTED_INFO(PI_DEVICE_INFO_IMAGE3D_MAX_WIDTH) + UNSUPPORTED_INFO(PI_DEVICE_INFO_IMAGE3D_MAX_HEIGHT) + UNSUPPORTED_INFO(PI_DEVICE_INFO_IMAGE3D_MAX_DEPTH) + UNSUPPORTED_INFO(PI_DEVICE_INFO_IMAGE_MAX_BUFFER_SIZE) + UNSUPPORTED_INFO(PI_DEVICE_INFO_IMAGE_MAX_ARRAY_SIZE) + UNSUPPORTED_INFO(PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_CHAR) + UNSUPPORTED_INFO(PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_CHAR) + UNSUPPORTED_INFO(PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_SHORT) + UNSUPPORTED_INFO(PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_SHORT) + UNSUPPORTED_INFO(PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_INT) + UNSUPPORTED_INFO(PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_INT) + UNSUPPORTED_INFO(PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_LONG) + UNSUPPORTED_INFO(PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_LONG) + UNSUPPORTED_INFO(PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_FLOAT) + UNSUPPORTED_INFO(PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_FLOAT) + 
UNSUPPORTED_INFO(PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_DOUBLE) + UNSUPPORTED_INFO(PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_DOUBLE) + UNSUPPORTED_INFO(PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_HALF) + UNSUPPORTED_INFO(PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_HALF) + UNSUPPORTED_INFO(PI_DEVICE_INFO_MAX_NUM_SUB_GROUPS) + UNSUPPORTED_INFO(PI_DEVICE_INFO_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS) + UNSUPPORTED_INFO(PI_DEVICE_INFO_SUB_GROUP_SIZES_INTEL) + UNSUPPORTED_INFO(PI_DEVICE_INFO_IL_VERSION) + UNSUPPORTED_INFO(PI_DEVICE_INFO_USM_HOST_SUPPORT) + UNSUPPORTED_INFO(PI_DEVICE_INFO_USM_DEVICE_SUPPORT) + UNSUPPORTED_INFO(PI_DEVICE_INFO_USM_SINGLE_SHARED_SUPPORT) + UNSUPPORTED_INFO(PI_DEVICE_INFO_USM_CROSS_SHARED_SUPPORT) + UNSUPPORTED_INFO(PI_DEVICE_INFO_USM_SYSTEM_SHARED_SUPPORT) + +#undef UNSUPPORTED_INFO + default: + DIE_NO_IMPLEMENTATION; + } return PI_SUCCESS; } @@ -593,13 +978,28 @@ pi_result piContextCreate(const pi_context_properties *Properties, const void *PrivateInfo, size_t CB, void *UserData), void *UserData, pi_context *RetContext) { - (void)Properties; - (void)NumDevices; - (void)Devices; - (void)PFnNotify; - (void)UserData; - (void)RetContext; - DIE_NO_IMPLEMENTATION; + if (NumDevices != 1) { + return PI_INVALID_VALUE; + } + assert(Devices); + assert(RetContext); + + cm_support::CmDevice *device = nullptr; + unsigned int version = 0; + + int result = cm_support::CreateCmDevice(device, version); + + if (result != cm_support::CM_SUCCESS) { + return PI_INVALID_VALUE; + } + + try { + *RetContext = new _pi_context(*Devices, device); + } catch (const std::bad_alloc &) { + return PI_OUT_OF_HOST_MEMORY; + } catch (...) 
{
+ return PI_ERROR_UNKNOWN;
+ }
 return PI_SUCCESS;
 }
@@ -633,18 +1033,36 @@ pi_result piContextRetain(pi_context) {
 }
 pi_result piContextRelease(pi_context Context) {
- (void)Context;
- DIE_NO_IMPLEMENTATION;
+ if ((Context == nullptr) || (Context->CmDevicePtr == nullptr)) {
+ return PI_INVALID_CONTEXT;
+ }
+
+ int result = cm_support::DestroyCmDevice(Context->CmDevicePtr);
+ if (result != cm_support::CM_SUCCESS) {
+ return PI_INVALID_CONTEXT;
+ }
+
+ delete Context;
 return PI_SUCCESS;
 }
 pi_result piQueueCreate(pi_context Context, pi_device Device,
 pi_queue_properties Properties, pi_queue *Queue) {
- (void)Context;
- (void)Device;
- (void)Properties;
- (void)Queue;
- DIE_NO_IMPLEMENTATION;
+ cm_support::CmQueue *cmQueue;
+
+ int result = Context->CmDevicePtr->CreateQueue(cmQueue);
+ if (result != cm_support::CM_SUCCESS) {
+ return PI_INVALID_CONTEXT;
+ }
+
+ try {
+ *Queue = new _pi_queue(Context, cmQueue);
+ } catch (const std::bad_alloc &) {
+ return PI_OUT_OF_HOST_MEMORY;
+ } catch (...) {
+ return PI_ERROR_UNKNOWN;
+ }
+
+ return PI_SUCCESS;
 }
@@ -659,8 +1077,13 @@ pi_result piQueueRetain(pi_queue) {
 }
 pi_result piQueueRelease(pi_queue Queue) {
- (void)Queue;
- DIE_NO_IMPLEMENTATION;
+ if ((Queue == nullptr) || (Queue->CmQueuePtr == nullptr)) {
+ return PI_INVALID_QUEUE;
+ }
+
+ // TODO : Destroy 'Queue->CmQueuePtr'? 
+ delete Queue; + return PI_SUCCESS; } @@ -683,13 +1106,42 @@ pi_result piextQueueCreateWithNativeHandle(pi_native_handle, pi_context, pi_result piMemBufferCreate(pi_context Context, pi_mem_flags Flags, size_t Size, void *HostPtr, pi_mem *RetMem, const pi_mem_properties *properties) { - (void)Context; - (void)Flags; - (void)Size; - (void)HostPtr; - (void)RetMem; - (void)properties; - DIE_NO_IMPLEMENTATION; + assert((Flags & PI_MEM_FLAGS_ACCESS_RW) != 0); + assert(Context); + assert(RetMem); + + cm_support::CmBuffer *CmBuf = nullptr; + cm_support::SurfaceIndex *CmIndex; + + int status = Context->CmDevicePtr->CreateBuffer( + static_cast(Size), CmBuf); + + if (status != cm_support::CM_SUCCESS) { + return PI_OUT_OF_HOST_MEMORY; + } + + status = CmBuf->GetIndex(CmIndex); + + if ((Flags & PI_MEM_FLAGS_HOST_PTR_USE) != 0 || + (Flags & PI_MEM_FLAGS_HOST_PTR_COPY) != 0) { + status = + CmBuf->WriteSurface(reinterpret_cast(HostPtr), + nullptr, static_cast(Size)); + } + + auto HostPtrOrNull = + (Flags & PI_MEM_FLAGS_HOST_PTR_USE) ? pi_cast(HostPtr) : nullptr; + + try { + *RetMem = + new _pi_buffer(Context, HostPtrOrNull, CmBuf, + /* integer buffer index */ CmIndex->get_data(), Size); + } catch (const std::bad_alloc &) { + return PI_OUT_OF_HOST_MEMORY; + } catch (...) 
{ + return PI_ERROR_UNKNOWN; + } + return PI_SUCCESS; } @@ -704,22 +1156,138 @@ pi_result piMemRetain(pi_mem) { } pi_result piMemRelease(pi_mem Mem) { - (void)Mem; - DIE_NO_IMPLEMENTATION; + if (Mem->getMemType() == PI_MEM_TYPE_BUFFER) { + _pi_buffer *pi_buf = static_cast<_pi_buffer *>(Mem); + int result = Mem->Context->CmDevicePtr->DestroySurface(pi_buf->CmBufferPtr); + + if (result != cm_support::CM_SUCCESS) { + return PI_INVALID_MEM_OBJECT; + } + } else if (Mem->getMemType() == PI_MEM_TYPE_IMAGE2D) { + _pi_image *pi_image = static_cast<_pi_image *>(Mem); + int result = + Mem->Context->CmDevicePtr->DestroySurface(pi_image->CmSurfacePtr); + + if (result != cm_support::CM_SUCCESS) { + return PI_INVALID_MEM_OBJECT; + } + } else if (Mem->getMemType() == PI_MEM_TYPE_IMAGE2D) { + _pi_image *pi_img = static_cast<_pi_image *>(Mem); + int result = + Mem->Context->CmDevicePtr->DestroySurface(pi_img->CmSurfacePtr); + if (result != cm_support::CM_SUCCESS) { + return PI_INVALID_MEM_OBJECT; + } + } else { + return PI_INVALID_MEM_OBJECT; + } + return PI_SUCCESS; } +cm_support::CM_SURFACE_FORMAT +piImageFormatToCmFormat(const pi_image_format *piFormat) { + using ULongPair = std::pair; + using FmtMap = std::map; + static const FmtMap pi2cm = { + {{PI_IMAGE_CHANNEL_TYPE_UNORM_INT8, PI_IMAGE_CHANNEL_ORDER_RGBA}, + cm_support::CM_SURFACE_FORMAT_A8R8G8B8}, + + {{PI_IMAGE_CHANNEL_TYPE_UNORM_INT8, PI_IMAGE_CHANNEL_ORDER_ARGB}, + cm_support::CM_SURFACE_FORMAT_A8R8G8B8}, + + {{PI_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8, PI_IMAGE_CHANNEL_ORDER_RGBA}, + cm_support::CM_SURFACE_FORMAT_A8R8G8B8}, + + {{PI_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32, PI_IMAGE_CHANNEL_ORDER_RGBA}, + cm_support::CM_SURFACE_FORMAT_R32G32B32A32F}, + }; + auto result = pi2cm.find( + {piFormat->image_channel_data_type, piFormat->image_channel_order}); + if (result != pi2cm.end()) { + return result->second; + } + DIE_NO_IMPLEMENTATION; + return cm_support::CM_SURFACE_FORMAT_A8R8G8B8; +} + pi_result piMemImageCreate(pi_context 
Context, pi_mem_flags Flags, const pi_image_format *ImageFormat, const pi_image_desc *ImageDesc, void *HostPtr, pi_mem *RetImage) { - (void)Context; - (void)Flags; - (void)ImageFormat; - (void)ImageDesc; - (void)HostPtr; - (void)RetImage; - DIE_NO_IMPLEMENTATION; + if (ImageFormat == nullptr || ImageDesc == nullptr) + return PI_INVALID_VALUE; + + switch (ImageDesc->image_type) { + case PI_MEM_TYPE_IMAGE2D: + break; + case PI_MEM_TYPE_IMAGE1D_BUFFER: + // NOTE : Temporarily added for enabling vadd_1d and + // vadd_raw_send. Remove for migration to github as + // 'wrapIntoImageBuffer' is deprecated in github repo + assert(ImageFormat->image_channel_data_type == + PI_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8); + assert(ImageFormat->image_channel_order == PI_IMAGE_CHANNEL_ORDER_R); + assert(ImageDesc->image_height == 0); + return piMemBufferCreate(Context, Flags, ImageDesc->image_width, HostPtr, + RetImage); + default: + return PI_INVALID_MEM_OBJECT; + } + + auto bytesPerPixel = 4; + switch (ImageFormat->image_channel_data_type) { + case PI_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32: + bytesPerPixel = 16; + break; + case PI_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8: + case PI_IMAGE_CHANNEL_TYPE_UNORM_INT8: + bytesPerPixel = 4; + break; + default: + return PI_INVALID_VALUE; + } + + cm_support::CmSurface2D *CmSurface = nullptr; + cm_support::SurfaceIndex *CmIndex; + + int status = Context->CmDevicePtr->CreateSurface2D( + static_cast(ImageDesc->image_width), + static_cast(ImageDesc->image_height), + piImageFormatToCmFormat(ImageFormat), CmSurface); + + if (status != cm_support::CM_SUCCESS) { + return PI_OUT_OF_HOST_MEMORY; + } + + status = CmSurface->GetIndex(CmIndex); + + if ((Flags & PI_MEM_FLAGS_HOST_PTR_USE) != 0 || + (Flags & PI_MEM_FLAGS_HOST_PTR_COPY) != 0) { + + if (HostPtr != nullptr) { + + status = CmSurface->WriteSurface( + reinterpret_cast(HostPtr), nullptr, + static_cast(ImageDesc->image_width * + ImageDesc->image_height * bytesPerPixel)); + } + } + + auto HostPtrOrNull = + (Flags & 
PI_MEM_FLAGS_HOST_PTR_USE) ? pi_cast(HostPtr) : nullptr; + + try { + *RetImage = new _pi_image(Context, HostPtrOrNull, CmSurface, + /* integer surface index */ CmIndex->get_data(), + ImageDesc->image_width, ImageDesc->image_height, + bytesPerPixel); + } catch (const std::bad_alloc &) { + return PI_OUT_OF_HOST_MEMORY; + } catch (...) { + return PI_ERROR_UNKNOWN; + } + return PI_SUCCESS; } @@ -875,19 +1443,21 @@ pi_result piEventGetInfo(pi_event, pi_event_info, size_t, void *, size_t *) { pi_result piEventGetProfilingInfo(pi_event Event, pi_profiling_info ParamName, size_t ParamValueSize, void *ParamValue, size_t *ParamValueSizeRet) { - (void)Event; - (void)ParamName; - (void)ParamValueSize; - (void)ParamValue; - (void)ParamValueSizeRet; - DIE_NO_IMPLEMENTATION; + std::cerr << "Warning : Profiling Not supported under PI_ESIMD_CPU" + << std::endl; return PI_SUCCESS; } pi_result piEventsWait(pi_uint32 NumEvents, const pi_event *EventList) { - (void)NumEvents; - (void)EventList; - DIE_NO_IMPLEMENTATION; + for (int i = 0; i < (int)NumEvents; i++) { + if (EventList[i]->IsDummyEvent) { + continue; + } + int result = EventList[i]->CmEventPtr->WaitForTaskFinished(); + if (result != cm_support::CM_SUCCESS) { + return PI_OUT_OF_RESOURCES; + } + } return PI_SUCCESS; } @@ -908,8 +1478,17 @@ pi_result piEventRetain(pi_event) { } pi_result piEventRelease(pi_event Event) { - (void)Event; - DIE_NO_IMPLEMENTATION; + if (!Event->IsDummyEvent) { + if ((Event->CmEventPtr == nullptr) || (Event->OwnerQueue == nullptr)) { + return PI_INVALID_EVENT; + } + int result = Event->OwnerQueue->DestroyEvent(Event->CmEventPtr); + if (result != cm_support::CM_SUCCESS) { + return PI_INVALID_EVENT; + } + } + delete Event; + return PI_SUCCESS; } @@ -962,16 +1541,32 @@ pi_result piEnqueueMemBufferRead(pi_queue Queue, pi_mem Src, pi_uint32 NumEventsInWaitList, const pi_event *EventWaitList, pi_event *Event) { - (void)Queue; - (void)Src; - (void)BlockingRead; - (void)Offset; - (void)Size; - (void)Dst; - 
(void)NumEventsInWaitList; - (void)EventWaitList; - (void)Event; - DIE_NO_IMPLEMENTATION; + /// TODO : Support Blocked read, 'Queue' handling + assert(BlockingRead == false); + assert(NumEventsInWaitList == 0); + + _pi_buffer *buf = static_cast<_pi_buffer *>(Src); + + int status = + buf->CmBufferPtr->ReadSurface(reinterpret_cast(Dst), + nullptr, // event + static_cast(Size)); + + if (status != cm_support::CM_SUCCESS) { + return PI_INVALID_MEM_OBJECT; + } + + if (Event) { + try { + *Event = new _pi_event(); + } catch (const std::bad_alloc &) { + return PI_OUT_OF_HOST_MEMORY; + } catch (...) { + return PI_ERROR_UNKNOWN; + } + (*Event)->IsDummyEvent = true; + } + return PI_SUCCESS; } @@ -1048,18 +1643,25 @@ pi_result piEnqueueMemImageRead(pi_queue command_queue, pi_mem image, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event) { - (void)command_queue; - (void)image; - (void)blocking_read; - (void)origin; - (void)region; - (void)row_pitch; - (void)slice_pitch; - (void)ptr; - (void)num_events_in_wait_list; - (void)event_wait_list; - (void)event; - DIE_NO_IMPLEMENTATION; + _pi_image *img = static_cast<_pi_image *>(image); + int status = + img->CmSurfacePtr->ReadSurface(reinterpret_cast(ptr), + nullptr, // event + row_pitch * (region->height)); + if (status != cm_support::CM_SUCCESS) { + return PI_INVALID_MEM_OBJECT; + } + + if (event) { + try { + *event = new _pi_event(); + } catch (const std::bad_alloc &) { + return PI_OUT_OF_HOST_MEMORY; + } catch (...) 
{ + return PI_ERROR_UNKNOWN; + } + (*event)->IsDummyEvent = true; + } return PI_SUCCESS; } @@ -1096,17 +1698,26 @@ piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim, const size_t *GlobalWorkSize, const size_t *LocalWorkSize, pi_uint32 NumEventsInWaitList, const pi_event *EventWaitList, pi_event *Event) { - (void)Queue; - (void)Kernel; - (void)WorkDim; - (void)GlobalWorkOffset; - (void)GlobalWorkSize; - (void)LocalWorkSize; - (void)NumEventsInWaitList; - (void)EventWaitList; - (void)Event; - DIE_NO_IMPLEMENTATION; - return PI_SUCCESS; + switch (WorkDim) { + case 1: + InvokeImpl<1, sycl::nd_item<1>>::invoke(Kernel, GlobalWorkOffset, + GlobalWorkSize, LocalWorkSize); + return PI_SUCCESS; + + case 2: + InvokeImpl<2, sycl::nd_item<2>>::invoke(Kernel, GlobalWorkOffset, + GlobalWorkSize, LocalWorkSize); + return PI_SUCCESS; + + case 3: + InvokeImpl<3, sycl::nd_item<3>>::invoke(Kernel, GlobalWorkOffset, + GlobalWorkSize, LocalWorkSize); + return PI_SUCCESS; + + default: + DIE_NO_IMPLEMENTATION; + return PI_ERROR_UNKNOWN; + } } pi_result piextKernelCreateWithNativeHandle(pi_native_handle, pi_context, bool, @@ -1149,20 +1760,36 @@ pi_result piextUSMSharedAlloc(void **ResultPtr, pi_context Context, pi_device Device, pi_usm_mem_properties *Properties, size_t Size, pi_uint32 Alignment) { - (void)ResultPtr; - (void)Context; - (void)Device; - (void)Properties; - (void)Size; - (void)Alignment; - DIE_NO_IMPLEMENTATION; + assert(Context); + assert(ResultPtr); + + cm_support::CmBufferSVM *buf = nullptr; + void *pSystemMem = nullptr; + int32_t ret = Context->CmDevicePtr->CreateBufferSVM( + Size, pSystemMem, CM_SVM_ACCESS_FLAG_DEFAULT, buf); + + if (ret != cm_support::CM_SUCCESS) { + return PI_OUT_OF_HOST_MEMORY; + } + *ResultPtr = pSystemMem; + auto it = Context->Addr2CmBufferSVM.find(pSystemMem); + assert(Context->Addr2CmBufferSVM.end() == it); + Context->Addr2CmBufferSVM[pSystemMem] = buf; return PI_SUCCESS; } pi_result piextUSMFree(pi_context Context, void 
*Ptr) { - (void)Context; - (void)Ptr; - DIE_NO_IMPLEMENTATION; + assert(Context); + assert(Ptr); + + cm_support::CmBufferSVM *buf = Context->Addr2CmBufferSVM[Ptr]; + assert(buf); + auto count = Context->Addr2CmBufferSVM.erase(Ptr); + assert(1 == count); + int32_t ret = Context->CmDevicePtr->DestroyBufferSVM(buf); + if (cm_support::CM_SUCCESS != ret) { + return PI_ERROR_UNKNOWN; + } return PI_SUCCESS; } diff --git a/sycl/plugins/esimd_cpu/pi_esimd_cpu.hpp b/sycl/plugins/esimd_cpu/pi_esimd_cpu.hpp index e6b17be78db34..55dfb9528ad8a 100755 --- a/sycl/plugins/esimd_cpu/pi_esimd_cpu.hpp +++ b/sycl/plugins/esimd_cpu/pi_esimd_cpu.hpp @@ -23,6 +23,10 @@ #include +namespace cm_support { +#include +} // namespace cm_support + template To pi_cast(From Value) { // TODO: see if more sanity checks are possible. assert(sizeof(From) == sizeof(To)); @@ -43,4 +47,5 @@ template <> uint32_t pi_cast(uint64_t Value) { std::terminate(); } -#include +#include +#include diff --git a/sycl/source/detail/config.hpp b/sycl/source/detail/config.hpp index dd91d24eba642..158a540771a1c 100644 --- a/sycl/source/detail/config.hpp +++ b/sycl/source/detail/config.hpp @@ -131,11 +131,12 @@ template <> class SYCLConfig { return BackendPtr; const char *ValStr = BaseT::getRawValue(); - const std::array, 5> SyclBeMap = { + const std::array, 6> SyclBeMap = { {{"PI_OPENCL", backend::opencl}, {"PI_LEVEL_ZERO", backend::level_zero}, {"PI_LEVEL0", backend::level_zero}, // for backward compatibility {"PI_CUDA", backend::cuda}, + {"PI_ESIMD_CPU", backend::esimd_cpu}, {"PI_ROCM", backend::rocm}}}; if (ValStr) { auto It = std::find_if( @@ -145,7 +146,8 @@ template <> class SYCLConfig { }); if (It == SyclBeMap.end()) pi::die("Invalid backend. 
" - "Valid values are PI_OPENCL/PI_LEVEL_ZERO/PI_CUDA/PI_ROCM"); + "Valid values are " + "PI_OPENCL/PI_LEVEL_ZERO/PI_CUDA/PI_ESIMD_CPU/PI_ROCM"); static backend Backend = It->second; BackendPtr = &Backend; } @@ -185,11 +187,12 @@ static const std::array, 5> {"*", info::device_type::all}}}; // Array is used by SYCL_DEVICE_FILTER and SYCL_DEVICE_ALLOWLIST -static const std::array, 5> SyclBeMap = { +static const std::array, 6> SyclBeMap = { {{"host", backend::host}, {"opencl", backend::opencl}, {"level_zero", backend::level_zero}, {"cuda", backend::cuda}, + {"esimd_cpu", backend::esimd_cpu}, {"*", backend::all}}}; template <> class SYCLConfig { diff --git a/sycl/source/detail/device_filter.cpp b/sycl/source/detail/device_filter.cpp index 8558d0d9e4f94..a438911fe4696 100644 --- a/sycl/source/detail/device_filter.cpp +++ b/sycl/source/detail/device_filter.cpp @@ -75,7 +75,8 @@ device_filter::device_filter(const std::string &FilterString) { } catch (...) { std::string Message = std::string("Invalid device filter: ") + FilterString + - "\nPossible backend values are {host,opencl,level_zero,cuda,*}.\n" + "\nPossible backend values are " + "{host,opencl,level_zero,cuda,esimd_cpu*}.\n" "Possible device types are {host,cpu,gpu,acc,*}.\n" "Device number should be an non-negative integer.\n"; throw cl::sycl::invalid_parameter_error(Message, PI_INVALID_VALUE); diff --git a/sycl/source/detail/pi.cpp b/sycl/source/detail/pi.cpp index 5745e4c2c58ae..1f4257e63d667 100644 --- a/sycl/source/detail/pi.cpp +++ b/sycl/source/detail/pi.cpp @@ -237,12 +237,14 @@ bool findPlugins(vector_class> &PluginNames) { PluginNames.emplace_back(__SYCL_LEVEL_ZERO_PLUGIN_NAME, backend::level_zero); PluginNames.emplace_back(__SYCL_CUDA_PLUGIN_NAME, backend::cuda); + PluginNames.emplace_back(__SYCL_ESIMD_CPU_PLUGIN_NAME, backend::esimd_cpu); PluginNames.emplace_back(__SYCL_ROCM_PLUGIN_NAME, backend::rocm); } else { std::vector Filters = FilterList->get(); bool OpenCLFound = false; bool LevelZeroFound = 
false; bool CudaFound = false; + bool EsimdCpuFound = false; bool RocmFound = false; for (const device_filter &Filter : Filters) { backend Backend = Filter.Backend; @@ -261,6 +263,12 @@ bool findPlugins(vector_class> &PluginNames) { PluginNames.emplace_back(__SYCL_CUDA_PLUGIN_NAME, backend::cuda); CudaFound = true; } + if (!EsimdCpuFound && + (Backend == backend::esimd_cpu || Backend == backend::all)) { + PluginNames.emplace_back(__SYCL_ESIMD_CPU_PLUGIN_NAME, + backend::esimd_cpu); + EsimdCpuFound = true; + } if (!RocmFound && (Backend == backend::rocm || Backend == backend::all)) { PluginNames.emplace_back(__SYCL_ROCM_PLUGIN_NAME, backend::rocm); RocmFound = true; @@ -379,6 +387,11 @@ static void initializePlugins(vector_class *Plugins) { // Use the LEVEL_ZERO plugin as the GlobalPlugin GlobalPlugin = std::make_shared(PluginInformation, backend::level_zero, Library); + } else if (InteropBE == backend::esimd_cpu && + PluginNames[I].first.find("esimd_cpu") != std::string::npos) { + // Use the ESIMD_CPU plugin as the GlobalPlugin + GlobalPlugin = std::make_shared(PluginInformation, + backend::esimd_cpu, Library); } Plugins->emplace_back( plugin(PluginInformation, PluginNames[I].second, Library)); diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 2734a64776761..202488aaf9c31 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -1937,7 +1937,8 @@ cl_int ExecCGCommand::enqueueImp() { NDRDescT &NDRDesc = ExecKernel->MNDRDesc; - if (MQueue->is_host()) { + if (MQueue->is_host() || + (MQueue->getPlugin().getBackend() == backend::esimd_cpu)) { for (ArgDesc &Arg : ExecKernel->MArgs) if (kernel_param_kind_t::kind_accessor == Arg.MType) { Requirement *Req = (Requirement *)(Arg.MPtr); @@ -1949,8 +1950,18 @@ cl_int ExecCGCommand::enqueueImp() { const detail::plugin &Plugin = EventImpls[0]->getPlugin(); Plugin.call(RawEvents.size(), &RawEvents[0]); } - 
ExecKernel->MHostKernel->call(NDRDesc, - getEvent()->getHostProfilingInfo()); + + if (MQueue->is_host()) { + ExecKernel->MHostKernel->call(NDRDesc, + getEvent()->getHostProfilingInfo()); + } else { + assert(MQueue->getPlugin().getBackend() == backend::esimd_cpu); + MQueue->getPlugin().call( + nullptr, + reinterpret_cast(ExecKernel->MHostKernel->getPtr()), + NDRDesc.Dims, &NDRDesc.GlobalOffset[0], &NDRDesc.GlobalSize[0], + &NDRDesc.LocalSize[0], 0, nullptr, nullptr); + } return CL_SUCCESS; }