diff --git a/sycl/CMakeLists.txt b/sycl/CMakeLists.txt index 3801f7ce88321..c21cae8ecb65e 100644 --- a/sycl/CMakeLists.txt +++ b/sycl/CMakeLists.txt @@ -285,6 +285,15 @@ if(SYCL_BUILD_PI_ROCM) list(APPEND SYCL_TOOLCHAIN_DEPLOY_COMPONENTS libspirv-builtins pi_rocm) endif() +if (SYCL_BUILD_PI_ESIMD_CPU) + list(APPEND SYCL_TOOLCHAIN_DEPLOY_COMPONENTS libcmrt-headers) + if (MSVC) + list(APPEND SYCL_TOOLCHAIN_DEPLOY_COMPONENTS libcmrt-libs libcmrt-dlls) + else() + list(APPEND SYCL_TOOLCHAIN_DEPLOY_COMPONENTS libcmrt-sos) + endif() +endif() + # Use it as fake dependency in order to force another command(s) to execute. add_custom_command(OUTPUT __force_it COMMAND "${CMAKE_COMMAND}" -E echo diff --git a/sycl/include/CL/sycl/accessor.hpp b/sycl/include/CL/sycl/accessor.hpp index abae9c107b130..872c56813efb3 100755 --- a/sycl/include/CL/sycl/accessor.hpp +++ b/sycl/include/CL/sycl/accessor.hpp @@ -671,6 +671,11 @@ class image_accessor AccessorBaseHost::getPtr() /*Ptr to Image*/); #endif } + +#ifndef __SYCL_DEVICE_ONLY__ +public: + void *get_pointer() const { return detail::AccessorBaseHost::getPtr(); } +#endif // __SYCL_DEVICE_ONLY__ }; template class HostKernel : public HostKernelBase { using IDBuilder = sycl::detail::Builder; - KernelType MKernel; public: + KernelType MKernel; HostKernel(KernelType Kernel) : MKernel(Kernel) {} void call(const NDRDescT &NDRDesc, HostProfilingInfo *HPI) override { // adjust ND range for serial host: diff --git a/sycl/include/CL/sycl/handler.hpp b/sycl/include/CL/sycl/handler.hpp index ad9b003c51a3f..e71d7770fd6b0 100644 --- a/sycl/include/CL/sycl/handler.hpp +++ b/sycl/include/CL/sycl/handler.hpp @@ -514,6 +514,98 @@ class __SYCL_EXPORT handler { return {z, y, x}; } + /* The kernel passed to StoreLambda can take an id, an item or an nd_item as + * its argument. Since esimd plugin directly invokes the kernel (doesn’t use + * piKernelSetArg), the kernel argument type must be known to the plugin. 
+ * However, passing kernel argument type to the plugin requires changing ABI + * in HostKernel class. To overcome this problem, helpers below wrap the + * “original” kernel with a functor that always takes an nd_item as argument. + * A functor is used instead of a lambda because extractArgsAndReqsFromLambda + * needs access to the “original” kernel and keeps references to its internal + * data, i.e. the kernel passed as argument cannot be local in scope. The + * functor itself is again encapsulated in a std::function since functor’s + * type is unknown to the plugin. + */ + + template + KernelType *ResetHostKernelHelper(const KernelType &KernelFunc) { + NormalizedKernelType NormalizedKernel(KernelFunc); + auto NormalizedKernelFunc = + std::function &)>(NormalizedKernel); + auto HostKernelPtr = + new detail::HostKernel, Dims, KernelType>( + NormalizedKernelFunc); + MHostKernel.reset(HostKernelPtr); + return &HostKernelPtr->MKernel.template target() + ->MKernelFunc; + } + + template + typename std::enable_if>::value, + KernelType *>::type + ResetHostKernel(const KernelType &KernelFunc) { + struct NormalizedKernelType { + KernelType MKernelFunc; + NormalizedKernelType(const KernelType &KernelFunc) + : MKernelFunc(KernelFunc) {} + void operator()(const nd_item &Arg) { + MKernelFunc(Arg.get_global_id()); + } + }; + return ResetHostKernelHelper( + KernelFunc); + } + template + typename std::enable_if>::value, + KernelType *>::type + ResetHostKernel(const KernelType &KernelFunc) { + struct NormalizedKernelType { + KernelType MKernelFunc; + NormalizedKernelType(const KernelType &KernelFunc) + : MKernelFunc(KernelFunc) {} + void operator()(const nd_item &Arg) { MKernelFunc(Arg); } + }; + return ResetHostKernelHelper( + KernelFunc); + } + + template + typename std::enable_if>::value, + KernelType *>::type + ResetHostKernel(const KernelType &KernelFunc) { + struct NormalizedKernelType { + KernelType MKernelFunc; + NormalizedKernelType(const KernelType &KernelFunc) + : 
MKernelFunc(KernelFunc) {} + void operator()(const nd_item &Arg) { + sycl::item Item = detail::Builder::createItem( + Arg.get_global_range(), Arg.get_global_id()); + MKernelFunc(Item); + } + }; + return ResetHostKernelHelper( + KernelFunc); + } + + template + typename std::enable_if>::value, + KernelType *>::type + ResetHostKernel(const KernelType &KernelFunc) { + struct NormalizedKernelType { + KernelType MKernelFunc; + NormalizedKernelType(const KernelType &KernelFunc) + : MKernelFunc(KernelFunc) {} + void operator()(const nd_item &Arg) { + sycl::item Item = detail::Builder::createItem( + Arg.get_global_range(), Arg.get_global_id(), Arg.get_offset()); + MKernelFunc(Item); + } + }; + return ResetHostKernelHelper( + KernelFunc); + } + /// Stores lambda to the template-free object /// /// Also initializes kernel name, list of arguments and requirements using @@ -530,9 +622,8 @@ class __SYCL_EXPORT handler { "kernel_handler is not yet supported by host device.", PI_INVALID_OPERATION); } - MHostKernel.reset( - new detail::HostKernel( - KernelFunc)); + KernelType *KernelPtr = + ResetHostKernel(KernelFunc); using KI = sycl::detail::KernelInfo; // Empty name indicates that the compilation happens without integration @@ -540,8 +631,8 @@ class __SYCL_EXPORT handler { if (KI::getName() != nullptr && KI::getName()[0] != '\0') { // TODO support ESIMD in no-integration-header case too. 
MArgs.clear(); - extractArgsAndReqsFromLambda(MHostKernel->getPtr(), KI::getNumParams(), - &KI::getParamDesc(0), KI::isESIMD()); + extractArgsAndReqsFromLambda(reinterpret_cast(KernelPtr), + KI::getNumParams(), &KI::getParamDesc(0)); MKernelName = KI::getName(); MOSModuleHandle = detail::OSUtil::getOSModuleHandle(KI::getName()); } else { diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/detail/atomic_intrin.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/detail/atomic_intrin.hpp new file mode 100644 index 0000000000000..21cb6ad5c0826 --- /dev/null +++ b/sycl/include/sycl/ext/intel/experimental/esimd/detail/atomic_intrin.hpp @@ -0,0 +1,19 @@ +//==-------- atomic_intrin.hpp - Atomic intrinsic definition file ----------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#pragma once + +#include + +template Ty atomic_add_fetch(Ty *ptr, Ty val) { +#ifdef _WIN32 + // TODO: Windows will be supported soon + throw cl::sycl::feature_not_supported(); +#else + return __atomic_add_fetch(ptr, val, __ATOMIC_RELAXED); +#endif +} diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/detail/cmrt_if_defs.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/detail/cmrt_if_defs.hpp new file mode 100644 index 0000000000000..aed9e8cadbbda --- /dev/null +++ b/sycl/include/sycl/ext/intel/experimental/esimd/detail/cmrt_if_defs.hpp @@ -0,0 +1,163 @@ +//==---------- cmrt_if_defs.hpp - CM-Runtime interface header file ---------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +/// \file cmrt_if_defs.hpp +/// +/// Interface definitions for esimd_cpu pi module to communicate with +/// underlying CM emulation runtime library +/// +/// \ingroup sycl_pi_esimd_cpu + +#pragma once + +/// CMRT Interface Defines + +#include + +// Base class to store common data +struct _pi_object { + _pi_object() : RefCount{1} {} + + std::atomic RefCount; +}; +struct _pi_platform { + _pi_platform() {} + + // Keep Version information. + std::string CmEmuVersion; +}; + +struct _pi_device : _pi_object { + _pi_device(pi_platform plt) : Platform{plt} {} + + pi_platform Platform; +}; + +struct _pi_context : _pi_object { + _pi_context(pi_device DeviceArg, cm_support::CmDevice *CmDeviceArg) + : Device{DeviceArg}, CmDevicePtr{CmDeviceArg} {} + + /// One-to-one mapping between Context and Device + pi_device Device; + + cm_support::CmDevice *CmDevicePtr = nullptr; + + /// Map SVM memory starting address to corresponding + /// CmBufferSVM object. CmBufferSVM object is needed to release memory. + std::unordered_map Addr2CmBufferSVM; +}; + +struct _pi_queue : _pi_object { + _pi_queue(pi_context ContextArg, cm_support::CmQueue *CmQueueArg) + : Context{ContextArg}, CmQueuePtr{CmQueueArg} {} + + // Keeps the PI context to which this queue belongs. + pi_context Context = nullptr; + cm_support::CmQueue *CmQueuePtr = nullptr; +}; + +struct _pi_mem : _pi_object { + _pi_mem() {} + + pi_context Context; + + char *MapHostPtr = nullptr; + + // Mutex for load/store accessing + std::mutex mutexLock; + + // Surface index used by CM + int SurfaceIndex; + + // Supplementary data to keep track of the mappings of this memory + // created with piEnqueueMemBufferMap and piEnqueueMemImageMap. + struct Mapping { + // The offset in the buffer giving the start of the mapped region. + size_t Offset; + // The size of the mapped region. 
+ size_t Size; + }; + + /* + // Method to get type of the derived object (image or buffer) + virtual bool isImage() const = 0; + */ + + virtual ~_pi_mem() = default; + + _pi_mem_type getMemType() const { return MemType; }; + + /* + // Thread-safe methods to work with memory mappings + pi_result addMapping(void *MappedTo, size_t Size, size_t Offset); + pi_result removeMapping(void *MappedTo, Mapping &MapInfo); + */ + +protected: + _pi_mem(pi_context ctxt, char *HostPtr, _pi_mem_type MemTypeArg, + int SurfaceIdxArg) + : Context{ctxt}, MapHostPtr{HostPtr}, + SurfaceIndex{SurfaceIdxArg}, Mappings{}, MemType{MemTypeArg} {} + +private: + // The key is the host pointer representing an active mapping. + // The value is the information needed to maintain/undo the mapping. + std::unordered_map Mappings; + + // TODO: we'd like to create a thread safe map class instead of mutex + map, + // that must be carefully used together. + // The mutex that is used for thread-safe work with Mappings. + std::mutex MappingsMutex; + + _pi_mem_type MemType; +}; + +struct _pi_buffer final : _pi_mem { + // Buffer/Sub-buffer constructor + _pi_buffer(pi_context ctxt, char *HostPtr, cm_support::CmBuffer *CmBufArg, + int SurfaceIdxArg, size_t SizeArg) + : _pi_mem(ctxt, HostPtr, PI_MEM_TYPE_BUFFER, SurfaceIdxArg), + CmBufferPtr{CmBufArg}, Size{SizeArg} {} + + cm_support::CmBuffer *CmBufferPtr; + size_t Size; +}; + +struct _pi_image final : _pi_mem { + // Image constructor + _pi_image(pi_context ctxt, char *HostPtr, cm_support::CmSurface2D *CmSurfArg, + int SurfaceIdxArg, size_t WidthArg, size_t HeightArg, size_t BPPArg) + : _pi_mem(ctxt, HostPtr, PI_MEM_TYPE_IMAGE2D, SurfaceIdxArg), + CmSurfacePtr{CmSurfArg}, Width{WidthArg}, Height{HeightArg}, + BytesPerPixel{BPPArg} {} + + cm_support::CmSurface2D *CmSurfacePtr; + size_t Width; + size_t Height; + size_t BytesPerPixel; +}; + +struct _pi_event : _pi_object { + _pi_event() {} + + cm_support::CmEvent *CmEventPtr = nullptr; + cm_support::CmQueue 
*OwnerQueue = nullptr; + pi_context Context = nullptr; + bool IsDummyEvent = false; +}; + +struct _pi_program : _pi_object { + _pi_program() {} + + // Keep the context of the program. + pi_context Context; +}; + +struct _pi_kernel : _pi_object { + _pi_kernel() {} +}; diff --git a/sycl/include/CL/sycl/INTEL/esimd/detail/emu/esimd_emu_functions_v1.h b/sycl/include/sycl/ext/intel/experimental/esimd/detail/emu/esimd_emu_functions_v1.h similarity index 100% rename from sycl/include/CL/sycl/INTEL/esimd/detail/emu/esimd_emu_functions_v1.h rename to sycl/include/sycl/ext/intel/experimental/esimd/detail/emu/esimd_emu_functions_v1.h diff --git a/sycl/include/CL/sycl/INTEL/esimd/detail/emu/esimdcpu_device_interface.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/detail/emu/esimdcpu_device_interface.hpp similarity index 100% rename from sycl/include/CL/sycl/INTEL/esimd/detail/emu/esimdcpu_device_interface.hpp rename to sycl/include/sycl/ext/intel/experimental/esimd/detail/emu/esimdcpu_device_interface.hpp diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/detail/memory_intrin.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/detail/memory_intrin.hpp index 03ec0ab45b171..9225944df8581 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/detail/memory_intrin.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/detail/memory_intrin.hpp @@ -521,6 +521,155 @@ __esimd_raw_send_store(uint8_t modifier, uint8_t execSize, __SEIEED::vector_type_t msgSrc0); #ifndef __SYCL_DEVICE_ONLY__ +#define __SYCL_EXPLICIT_SIMD_PLUGIN__ + +// Header files required for accessing CM-managed memory - Surface, +// buffer, etc +namespace cm_support { +#include +} // namespace cm_support +#include "cmrt_if_defs.hpp" + +#include +#include +#include +#include + +namespace raw_send { + +enum class msgField : short { + OP, + VNNI, + ADDRSIZE, + DATASIZE, + VECTSIZE, + TRANSPOSE, + CACHE, + DSTLEN, + SRC0LEN, + ADDRTYPE +}; + +enum class msgOp : short { + DP_LOAD = 0x0, // 
scatter/vector load + LOAD_2D = 0x3, + DP_STORE = 0x4, // scatter/vector store + STORE_2D = 0x7, + OP_MAX = 0x3F +}; + +typedef struct _bitfields_ { + uint32_t offset; + uint32_t mask; +} bitfields; + +const bitfields BIT_FIELDS[10] = { + {0, 0x3F}, // OP / 6 bits + {7, 0x1}, // VNNI -> LOAD only + {7, 0x3}, // Address size + {9, 0x7}, // DATASIZE + {12, 0x7}, // VECTSIZE + {15, 0x1}, // TRANSPOSE -> LOAD only + {17, 0x7}, // CACHE + {20, 0x1F}, // DSTLEN + {25, 0xF}, // SRC0LEN, + {29, 0x3} // ADDRTYPE +}; +uint32_t inline getMsgField(uint32_t msg, msgField field) { + uint32_t idx = static_cast(field); + return ((msg >> BIT_FIELDS[idx].offset) & BIT_FIELDS[idx].mask); +} + +auto inline getMsgOp(uint32_t msg) { + msgOp ret; + ret = static_cast(getMsgField((uint32_t)msg, msgField::OP)); + return ret; +} + +template +uint64_t inline getSurfaceBaseAddr(__SEIEED::vector_type_t addrMsg) { + constexpr int sizeofT = sizeof(T); + uint64_t Ret = 0; + + if constexpr (sizeofT == 4) { + Ret = (uint64_t)addrMsg[1] << 32; + Ret |= (uint64_t)addrMsg[0]; + } else if constexpr (sizeofT == 8) { + Ret = addrMsg[0]; + } + + return Ret; +} + +template +uint64_t inline getLaneAddr(__SEIEED::vector_type_t addrMsg, + unsigned lane_id) { + // (matrix_ref addrMsg) + // vector_ref addr_ref = addrMsg.template select<1, 1, 2, 1>(0, 2 + // * lane_id).template format(); return addr_ref(0); + throw cl::sycl::feature_not_supported(); +} + +template +auto inline getSurfaceDim(__SEIEED::vector_type_t addrMsg) { + __SEIEED::vector_type_t Ret; + constexpr int sizeofT = sizeof(T); + + static_assert(sizeofT == 4, "Unsupported addrMsg format!!"); + + if constexpr (sizeofT == 4) { + for (int idx = 0; idx < 4; idx++) { + Ret[idx] = addrMsg[idx + 2]; + } + } + + return Ret; +} + +template +auto inline getBlockOffsets(__SEIEED::vector_type_t addrMsg) { + __SEIEED::vector_type_t Ret; + constexpr int sizeofT = sizeof(T); + + static_assert(sizeofT == 4, "Unsupported addrMsg format!!"); + + if constexpr (sizeofT 
== 4) { + for (int idx = 0; idx < 4; idx++) { + Ret[idx] = static_cast(addrMsg[idx + 5]); + } + } + + return Ret; +} + +template +auto inline getBlockDim(__SEIEED::vector_type_t addrMsg) { + __SEIEED::vector_type_t Ret; + constexpr int sizeofT = sizeof(T); + T RawValue = 0; + + static_assert(sizeofT == 4, "Unsupported addrMsg format!!"); + + if constexpr (sizeofT == 4) { + RawValue = addrMsg[7]; + Ret[0] = (unsigned char)(RawValue & 0xFF); // width + Ret[1] = (unsigned char)((RawValue >> 8) & 0xFF); // height + Ret[2] = (unsigned char)((RawValue >> 24) & 0xFF); // For ArrayLen + } + + assert(RawValue != 0); + + return Ret; +} + +template +auto inline getArrayLen(__SEIEED::vector_type_t addrMsg) { + auto blkDim = getBlockDim(addrMsg); + return (blkDim[2] >> 4); +} + +} // namespace raw_send + template inline __SEIEED::vector_type_t @@ -693,48 +842,105 @@ template inline __SEIEED::vector_type_t __esimd_media_block_load(unsigned modififer, TACC handle, unsigned plane, unsigned width, unsigned x, unsigned y) { - // On host the input surface is modeled as sycl image 2d object, - // and the read/write access is done through accessor, - // which is passed in as the handle argument. - auto range = __SEIEED::AccessorPrivateProxy::getImageRange(handle); - unsigned bpp = __SEIEED::AccessorPrivateProxy::getElemSize(handle); - unsigned vpp = bpp / sizeof(Ty); - unsigned int i = x / bpp; - unsigned int j = y; - - assert(x % bpp == 0); - unsigned int xbound = range[0] - 1; - unsigned int ybound = range[1] - 1; - __SEIEED::vector_type_t vals; - for (int row = 0; row < M; row++) { - for (int col = 0; col < N; col += vpp) { - unsigned int xoff = (i > xbound) ? xbound : i; - unsigned int yoff = (j > ybound) ? 
ybound : j; - auto coords = cl::sycl::cl_int2(xoff, yoff); - cl::sycl::cl_uint4 data = handle.read(coords); - - __SEIEED::vector_type_t res; - for (int idx = 0; idx < 4; idx++) { - res[idx] = data[idx]; + + sycl::detail::ESIMDDeviceInterface *I = + sycl::detail::getESIMDDeviceInterface(); + + _pi_image *img = + static_cast<_pi_image *>(static_cast(handle.get_pointer())); + std::unique_lock lock(img->mutexLock); + + char *readBase = I->sycl_get_surface_base_addr_ptr(img->SurfaceIndex); + + uint32_t bpp = static_cast(img->BytesPerPixel); + uint32_t imgWidth = static_cast(img->Width) * bpp; + uint32_t imgHeight = static_cast(img->Height); + int x_pos_a, y_pos_a, offset, index; + + // TODO : Remove intermediate 'in' matrix + std::vector> in(M, std::vector(N)); + int R = M; + int C = N; + for (int i = 0; i < R; i++) { + for (int j = 0; j < C; j++) { + x_pos_a = x + j * sizeof(Ty); + { y_pos_a = y + i; } + // We should check the boundary condition based on sizeof(Ty), x_pos_a is + // 0-based Note: Use a signed variable; otherwise sizeof(Ty) is unsigned + if ((x_pos_a + sizeof(Ty)) > imgWidth) { + // If we're trying to read outside the boundary, limit the value of + // x_pos_a Assumption -- We don't this situation: + // x_pos_a width's boundary + // | | + // <---type(Ty)---> + // At most x_pos_a+sizeof(Ty) is exactly at the boundary. 
+ x_pos_a = imgWidth; + } + if (y_pos_a > imgHeight - 1) { + y_pos_a = imgHeight - 1; + } + if (y_pos_a < 0) { + y_pos_a = 0; + } + { + if (x_pos_a < 0) { + // Need to align x position to bpp + int offset = x % bpp; + x_pos_a -= offset; + } + while (x_pos_a < 0) { + // If we're trying to read outside the left boundary, increase x_pos_a + x_pos_a += bpp; + } } - constexpr int refN = sizeof(cl::sycl::cl_uint4) / sizeof(Ty); - unsigned int stride = sizeof(cl::sycl::cl_uint4) / bpp; - using refTy = __SEIEED::vector_type_t; - auto ref = reinterpret_cast(res); - - unsigned int offset1 = col + row * N; - unsigned int offset2 = 0; - for (int idx = 0; idx < vpp; idx++) { - vals[offset1] = ref[offset2]; - offset1++; - offset2 += stride; + if (x_pos_a >= imgWidth) { + { + x_pos_a = x_pos_a - bpp; + for (uint byte_count = 0; byte_count < sizeof(Ty); byte_count++) { + if (x_pos_a >= imgWidth) { + x_pos_a = x_pos_a - bpp; + } + offset = y_pos_a * imgWidth + x_pos_a; + + /* + If destination size per element is less than or equal pixel size + of the surface move the pixel value across the destination + elements. If destination size per element is greater than pixel + size of the surface replicate pixel value in the destination + element. 
+ */ + if (sizeof(Ty) <= bpp) { + for (uint bpp_count = 0; j < C && bpp_count < bpp; + j++, bpp_count += sizeof(Ty)) { + in[i][j] = *((Ty *)(readBase + offset + bpp_count)); + } + j--; + break; + } else { + // ((unsigned char*)in.get_addr(i*C+j))[byte_count] = *((unsigned + // char*)((char*)buff_iter->p + offset)); + unsigned char *pTempBase = + ((unsigned char *)in[i].data()) + j * sizeof(Ty); + pTempBase[byte_count] = *((unsigned char *)(readBase + offset)); + } + + x_pos_a = x_pos_a + 1; + } + x_pos_a = imgWidth; + } + } else { + offset = y_pos_a * imgWidth + x_pos_a; + { in[i][j] = *((Ty *)(readBase + offset)); } } - i++; } - i = x / bpp; - j++; + } + + for (auto i = 0, k = 0; i < M; i++) { + for (auto j = 0; j < N; j++) { + vals[k++] = in[i][j]; + } } return vals; @@ -745,45 +951,57 @@ inline void __esimd_media_block_store(unsigned modififer, TACC handle, unsigned plane, unsigned width, unsigned x, unsigned y, __SEIEED::vector_type_t vals) { - unsigned bpp = __SEIEED::AccessorPrivateProxy::getElemSize(handle); - unsigned vpp = bpp / sizeof(Ty); - auto range = __SEIEED::AccessorPrivateProxy::getImageRange(handle); - unsigned int i = x / bpp; - unsigned int j = y; - - assert(x % bpp == 0); - - for (int row = 0; row < M; row++) { - for (int col = 0; col < N; col += vpp) { - constexpr int Sz = sizeof(cl::sycl::cl_uint4) / sizeof(Ty); - __SEIEED::vector_type_t res = 0; - - unsigned int offset1 = col + row * N; - unsigned int offset2 = 0; - unsigned int stride = sizeof(cl::sycl::cl_uint4) / bpp; - for (int idx = 0; idx < vpp; idx++) { - res[offset2] = vals[offset1]; - offset1++; - offset2 += stride; - } + sycl::detail::ESIMDDeviceInterface *I = + sycl::detail::getESIMDDeviceInterface(); + + _pi_image *img = + static_cast<_pi_image *>(static_cast(handle.get_pointer())); + + char *writeBase = I->sycl_get_surface_base_addr_ptr(img->SurfaceIndex); + + uint32_t bpp = static_cast(img->BytesPerPixel); + uint32_t imgWidth = static_cast(img->Width) * bpp; + uint32_t 
imgHeight = static_cast(img->Height); + int x_pos_a, y_pos_a, offset; + + assert((x % 4) == 0); + assert((N * sizeof(Ty)) % 4 == 0); - using refTy = __SEIEED::vector_type_t; - auto ref = reinterpret_cast(res); + // TODO : Remove intermediate 'out' matrix + std::vector> out(M, std::vector(N)); - cl::sycl::cl_uint4 data; - for (int idx = 0; idx < 4; idx++) { - data[idx] = ref[idx]; + std::unique_lock lock(img->mutexLock); + + for (int i = 0, k = 0; i < M; i++) { + for (int j = 0; j < N; j++) { + out[i][j] = vals[k++]; + } + } + + for (int i = 0; i < M; i++) { + for (int j = 0; j < N; j++) { + x_pos_a = x + j * sizeof(Ty); + { y_pos_a = y + i; } + if ((int)x_pos_a < 0) { + continue; + } + if ((int)y_pos_a < 0) { + continue; + } + if ((int)(x_pos_a + sizeof(Ty)) > imgWidth) { + continue; } - if (i < range[0] && j < range[1]) { - auto coords = cl::sycl::cl_int2(i, j); - handle.write(coords, data); + if ((int)y_pos_a > imgHeight - 1) { + continue; } - i++; + offset = y_pos_a * imgWidth + x_pos_a; + *((Ty *)(writeBase + offset)) = out[i][j]; } - i = x / bpp; - j++; } + + /// TODO : Optimize + I->cm_fence_ptr(); } template @@ -820,10 +1038,27 @@ __esimd_dp4(__SEIEED::vector_type_t v1, return retv; } -/// TODO -inline void __esimd_barrier() {} +inline void __esimd_slm_init(size_t size) { + sycl::detail::ESIMDDeviceInterface *I = + sycl::detail::getESIMDDeviceInterface(); + + I->cm_slm_init_ptr(size); +} -inline void __esimd_sbarrier(__SEIEE::split_barrier_action flag) {} +inline void __esimd_barrier() { + sycl::detail::ESIMDDeviceInterface *I = + sycl::detail::getESIMDDeviceInterface(); + + I->cm_barrier_ptr(); +} + +inline void __esimd_sbarrier( + sycl::ext::intel::experimental::esimd::EsimdSbarrierType flag) { + sycl::detail::ESIMDDeviceInterface *I = + sycl::detail::getESIMDDeviceInterface(); + + I->cm_sbarrier_ptr((uint32_t)flag); +} inline void __esimd_slm_fence(uint8_t cntl) {} @@ -832,6 +1067,17 @@ inline __SEIEED::vector_type_t 
__esimd_slm_read(__SEIEED::vector_type_t addrs, __SEIEED::vector_type_t pred) { __SEIEED::vector_type_t retv; + sycl::detail::ESIMDDeviceInterface *I = + sycl::detail::getESIMDDeviceInterface(); + + char *SlmBase = I->__cm_emu_get_slm_ptr(); + for (int i = 0; i < N; ++i) { + if (pred[i]) { + Ty *addr = reinterpret_cast(addrs[i] + SlmBase); + retv[i] = *addr; + } + } + return retv; } @@ -839,19 +1085,49 @@ __esimd_slm_read(__SEIEED::vector_type_t addrs, template inline void __esimd_slm_write(__SEIEED::vector_type_t addrs, __SEIEED::vector_type_t vals, - __SEIEED::vector_type_t pred) {} + __SEIEED::vector_type_t pred) { + sycl::detail::ESIMDDeviceInterface *I = + sycl::detail::getESIMDDeviceInterface(); + + char *SlmBase = I->__cm_emu_get_slm_ptr(); + for (int i = 0; i < N; ++i) { + if (pred[i]) { + Ty *addr = reinterpret_cast(addrs[i] + SlmBase); + *addr = vals[i]; + } + } +} // slm_block_read reads a block of data from SLM template inline __SEIEED::vector_type_t __esimd_slm_block_read(uint32_t addr) { __SEIEED::vector_type_t retv; + sycl::detail::ESIMDDeviceInterface *I = + sycl::detail::getESIMDDeviceInterface(); + char *SlmBase = I->__cm_emu_get_slm_ptr(); + addr <<= 4; + for (int i = 0; i < N; ++i) { + Ty *SlmAddr = reinterpret_cast(addr + SlmBase); + retv[i] = *SlmAddr; + addr += sizeof(Ty); + } return retv; } // slm_block_write writes a block of data to SLM template inline void __esimd_slm_block_write(uint32_t addr, - __SEIEED::vector_type_t vals) {} + __SEIEED::vector_type_t vals) { + sycl::detail::ESIMDDeviceInterface *I = + sycl::detail::getESIMDDeviceInterface(); + char *SlmBase = I->__cm_emu_get_slm_ptr(); + addr <<= 4; + for (int i = 0; i < N; ++i) { + Ty *SlmAddr = reinterpret_cast(addr + SlmBase); + *SlmAddr = vals[i]; + addr += sizeof(Ty); + } +} // slm_read4 does SLM gather4 template @@ -859,6 +1135,52 @@ inline __SEIEED::vector_type_t __esimd_slm_read4(__SEIEED::vector_type_t addrs, __SEIEED::vector_type_t pred) { __SEIEED::vector_type_t retv; + 
sycl::detail::ESIMDDeviceInterface *I = + sycl::detail::getESIMDDeviceInterface(); + char *ReadBase = I->__cm_emu_get_slm_ptr(); + + unsigned int Next = 0; + if (__SEIEE::is_channel_enabled(Mask, __SEIEE::rgba_channel::R)) { + for (int I = 0; I < N; I++, Next++) { + if (pred[I]) { + Ty *addr = reinterpret_cast(addrs[I] + ReadBase); + retv[Next] = *addr; + } + } + } + + ReadBase += sizeof(Ty); + + if (__SEIEE::is_channel_enabled(Mask, __SEIEE::rgba_channel::G)) { + for (int I = 0; I < N; I++, Next++) { + if (pred[I]) { + Ty *addr = reinterpret_cast(addrs[I] + ReadBase); + retv[Next] = *addr; + } + } + } + + ReadBase += sizeof(Ty); + + if (__SEIEE::is_channel_enabled(Mask, __SEIEE::rgba_channel::B)) { + for (int I = 0; I < N; I++, Next++) { + if (pred[I]) { + Ty *addr = reinterpret_cast(addrs[I] + ReadBase); + retv[Next] = *addr; + } + } + } + + ReadBase += sizeof(Ty); + + if (__SEIEE::is_channel_enabled(Mask, __SEIEE::rgba_channel::A)) { + for (int I = 0; I < N; I++, Next++) { + if (pred[I]) { + Ty *addr = reinterpret_cast(addrs[I] + ReadBase); + retv[Next] = *addr; + } + } + } return retv; } @@ -867,7 +1189,55 @@ template inline void __esimd_slm_write4( __SEIEED::vector_type_t addrs, __SEIEED::vector_type_t vals, - __SEIEED::vector_type_t pred) {} + __SEIEED::vector_type_t pred) { + + sycl::detail::ESIMDDeviceInterface *I = + sycl::detail::getESIMDDeviceInterface(); + char *WriteBase = I->__cm_emu_get_slm_ptr(); + + unsigned int Next = 0; + if (__SEIEE::is_channel_enabled(Mask, __SEIEE::rgba_channel::R)) { + for (int I = 0; I < N; I++, Next++) { + if (pred[I]) { + Ty *addr = reinterpret_cast(addrs[I] + WriteBase); + *addr = vals[Next]; + } + } + } + + WriteBase += sizeof(Ty); + + if (__SEIEE::is_channel_enabled(Mask, __SEIEE::rgba_channel::G)) { + for (int I = 0; I < N; I++, Next++) { + if (pred[I]) { + Ty *addr = reinterpret_cast(addrs[I] + WriteBase); + *addr = vals[Next]; + } + } + } + + WriteBase += sizeof(Ty); + + if (__SEIEE::is_channel_enabled(Mask, 
__SEIEE::rgba_channel::B)) { + for (int I = 0; I < N; I++, Next++) { + if (pred[I]) { + Ty *addr = reinterpret_cast(addrs[I] + WriteBase); + *addr = vals[Next]; + } + } + } + + WriteBase += sizeof(Ty); + + if (__SEIEE::is_channel_enabled(Mask, __SEIEE::rgba_channel::A)) { + for (int I = 0; I < N; I++, Next++) { + if (pred[I]) { + Ty *addr = reinterpret_cast(addrs[I] + WriteBase); + *addr = vals[Next]; + } + } + } +} // slm_atomic: SLM atomic template <__SEIEE::atomic_op Op, typename Ty, int N> @@ -875,6 +1245,23 @@ inline __SEIEED::vector_type_t __esimd_slm_atomic0(__SEIEED::vector_type_t addrs, __SEIEED::vector_type_t pred) { __SEIEED::vector_type_t retv; + sycl::detail::ESIMDDeviceInterface *I = + sycl::detail::getESIMDDeviceInterface(); + char *WriteBase = I->__cm_emu_get_slm_ptr(); + + for (int i = 0; i < N; i++) { + if (pred[i]) { + Ty *p = reinterpret_cast(addrs[i] + WriteBase); + + switch (Op) { + case __SEIEE::EsimdAtomicOpType::ATOMIC_INC: + retv[i] = atomic_add_fetch(p, 1); + break; + default: + throw cl::sycl::feature_not_supported(); + } + } + } return retv; } @@ -884,6 +1271,21 @@ __esimd_slm_atomic1(__SEIEED::vector_type_t addrs, __SEIEED::vector_type_t src0, __SEIEED::vector_type_t pred) { __SEIEED::vector_type_t retv; + + for (int i = 0; i < N; i++) { + if (pred[i]) { + Ty *p = reinterpret_cast(addrs[i]); + + switch (Op) { + case __SEIEE::EsimdAtomicOpType::ATOMIC_ADD: + retv[i] = atomic_add_fetch(p, src0[i]); + break; + default: + throw cl::sycl::feature_not_supported(); + } + } + } + return retv; } @@ -930,15 +1332,59 @@ __esimd_flat_atomic2(__SEIEED::vector_type_t addrs, template inline __SEIEED::vector_type_t __esimd_block_read(SurfIndAliasTy surf_ind, uint32_t offset) { - throw cl::sycl::feature_not_supported(); - return __SEIEED::vector_type_t(); + __SEIEED::vector_type_t retv; + sycl::detail::ESIMDDeviceInterface *I = + sycl::detail::getESIMDDeviceInterface(); + _pi_buffer *buf = + static_cast<_pi_buffer 
*>(static_cast(surf_ind.get_pointer())); + + char *readBase = I->sycl_get_surface_base_addr_ptr(buf->SurfaceIndex); + + uint32_t width = static_cast(buf->Size); + uint32_t pos = offset; + + std::unique_lock lock(buf->mutexLock); + + for (int idx = 0; idx < N; idx++) { + if (pos >= width) { + retv[idx] = 0; + } else { + retv[idx] = *((Ty *)(readBase + pos)); + } + pos += (uint32_t)sizeof(Ty); + } + + return retv; } template inline void __esimd_block_write(SurfIndAliasTy surf_ind, uint32_t offset, __SEIEED::vector_type_t vals) { + sycl::detail::ESIMDDeviceInterface *I = + sycl::detail::getESIMDDeviceInterface(); + _pi_buffer *buf = + static_cast<_pi_buffer *>(static_cast(surf_ind.get_pointer())); - throw cl::sycl::feature_not_supported(); + char *writeBase = I->sycl_get_surface_base_addr_ptr(buf->SurfaceIndex); + + uint32_t width = static_cast(buf->Size); + assert(buf->Size == width); + + uint32_t pos = offset << 4; + + std::unique_lock lock(buf->mutexLock); + + for (int idx = 0; idx < N; idx++) { + if (pos < width) { + *((Ty *)(writeBase + pos)) = vals[idx]; + } else { + break; + } + pos += (uint32_t)sizeof(Ty); + } + + /// TODO : Optimize + I->cm_fence_ptr(); } /// \brief esimd_get_value @@ -1032,8 +1478,173 @@ __esimd_raw_send_load(uint8_t modifier, uint8_t execSize, uint32_t exDesc, uint32_t msgDesc, __SEIEED::vector_type_t msgSrc0, __SEIEED::vector_type_t msgDst) { - throw cl::sycl::feature_not_supported(); - return 0; + assert(sfid == 0xF); // UGM type only + + __SEIEED::vector_type_t retv; + + auto op = raw_send::getMsgOp(msgDesc); + assert(op == raw_send::msgOp::LOAD_2D); + uint64_t surfaceBase = raw_send::getSurfaceBaseAddr(msgSrc0); + auto surfaceDim = raw_send::getSurfaceDim(msgSrc0); + auto blockOffset = raw_send::getBlockOffsets(msgSrc0); + auto blockDim = raw_send::getBlockDim(msgSrc0); + auto arrayLen = raw_send::getArrayLen(msgSrc0); + + unsigned SurfaceWidth = surfaceDim[0] + 1; + unsigned SurfaceHeight = surfaceDim[1] + 1; + unsigned SurfacePitch = 
surfaceDim[2] + 1; + + int X = blockOffset[0]; + int Y = blockOffset[1]; + int Width = blockDim[0] + 1; + int Height = blockDim[1] + 1; + int NBlks = arrayLen + 1; + + bool Transposed = + raw_send::getMsgField(msgDesc, raw_send::msgField::TRANSPOSE); + bool Transformed = raw_send::getMsgField(msgDesc, raw_send::msgField::VNNI); + + constexpr unsigned sizeofT = sizeof(Ty1); + + char *buffBase = (char *)surfaceBase; + + // TODO : Acquire mutex for the surface pointed to by 'surfaceBase' + int vecIdx = 0; + int blkCount = 0; + + for (int xBase = X * sizeofT; blkCount < NBlks; xBase += sizeofT * Width) { + if (Transformed == true) { + constexpr int elems_per_DW = (sizeofT == 1) ? 4 : 2; /// VNNI_pack + if (Transposed == false) { /// Transform only load + int yRead = Y * SurfacePitch; + for (int u = 0; u < Height; + u += elems_per_DW, yRead += SurfacePitch * elems_per_DW) { + if ((yRead < 0) || (yRead >= SurfacePitch * SurfaceHeight)) { + /// Vertically out-of-bound, padding zero on out of boundary + for (int v = 0; v < Width; v += 1) { + for (int k = 0; k < elems_per_DW; k++, vecIdx += 1) { + retv[vecIdx] = (Ty1)(0); + } // k loop + } + // vecIdx += Width * elems_per_DW;; + continue; + } + + int xRead = xBase; + for (int v = 0; v < Width; v += 1, xRead += sizeofT) { + if ((xRead < 0) || (xRead >= SurfaceWidth)) { + /// Horizontally out-of-bound + for (int k = 0; k < elems_per_DW; k++, vecIdx += 1) { + retv[vecIdx] = (Ty1)(0); + } // k loop + // vecIdx += elems_per_DW; + continue; + } + + char *base = buffBase + yRead + xRead; + int offset = 0; + for (int k = 0; k < elems_per_DW; k++, vecIdx += 1) { + retv[vecIdx] = *((Ty1 *)(base + offset)); + // Increasing in Y-direction + offset += SurfacePitch; + } // k loop + } // v loop + } /// u loop + } // Transposed = false + else // Transposed == true + { /// Transform & Transpose load + int xRead = xBase; + for (int v = 0; v < Width; + v += elems_per_DW, xRead += sizeofT * elems_per_DW) { + if ((xRead < 0) || (xRead >= 
SurfaceWidth)) { + // Horizontally out-of-bound + for (int u = 0; u < Height; u += 1) { + for (int k = 0; k < elems_per_DW; k++, vecIdx += 1) { + retv[vecIdx] = (Ty1)(0); + } // k loop + } + // vecIdx += Height * elems_per_DW; + continue; + } + + int yRead = Y * SurfacePitch; + for (int u = 0; u < Height; u += 1, yRead += SurfacePitch) { + if ((yRead < 0) || (yRead >= SurfacePitch * SurfaceHeight)) { + /// Vertically out-of-bound + for (int k = 0; k < elems_per_DW; k++, vecIdx += 1) { + retv[vecIdx] = (Ty1)(0); + } // k loop + // vecIdx += elems_per_DW; + continue; + } + + char *base = buffBase + yRead + xRead; + int offset = 0; + for (int k = 0; k < elems_per_DW; k++, vecIdx += 1) { + retv[vecIdx] = *((Ty1 *)(base + offset)); + // Increasing in X-direction + offset += sizeofT; + } // k loop + } // u loop + } // v loop + } // Transposed == true + } // Transformed == true + else // (Transformed == false) + { + if (Transposed == false) { /// Linear load - no transform, no transpose + int yRead = Y * SurfacePitch; + for (int u = 0; u < Height; u += 1, yRead += SurfacePitch) { + if ((yRead < 0) || (yRead >= SurfacePitch * SurfaceHeight)) { + // Vertically Out-of-bound + for (int v = 0; v < Width; v += 1, vecIdx += 1) { + retv[vecIdx] = (Ty1)(0); + } + // vecIdx += Width; + continue; + } + + int xRead = xBase; + for (int v = 0; v < Width; v += 1, xRead += sizeofT, vecIdx += 1) { + if ((xRead >= 0) && (xRead < SurfaceWidth)) { + retv[vecIdx] = *((Ty1 *)(buffBase + yRead + xRead)); + } else { + // Horizontally out of bound + retv[vecIdx] = (Ty1)(0); + } + } // v loop + } // u loop + } /// Transposed == false + else // Transposed = true + { /// Transpose load - no transform + int xRead = xBase; + for (int v = 0; v < Width; v += 1, xRead += sizeofT) { + if ((xRead < 0) || (xRead > SurfaceWidth)) { + // Horizontally out-of-bound + for (int u = 0; u < Height; u += 1, vecIdx += 1) { + retv[vecIdx] = (Ty1)(0); + } + // vecIdx += Height; + continue; + } + + int yRead = Y * 
SurfacePitch; + for (int u = 0; u < Height; + u += 1, yRead += SurfacePitch, vecIdx += 1) { + if ((yRead >= 0) && (yRead < SurfacePitch * SurfaceHeight)) { + retv[vecIdx] = *((Ty1 *)(buffBase + yRead + xRead)); + } else { + // Vertically out of bound + retv[vecIdx] = (Ty1)(0); + } + } // u loop + } // v loop + } // Transposed == true + } // Transformed == false + blkCount += 1; + vecIdx = blkCount * Width * Height; + } // xBase loop + + return retv; } /// \brief Raw sends store. @@ -1068,7 +1679,44 @@ inline void __esimd_raw_sends_store(uint8_t modifier, uint8_t execSize, uint32_t msgDesc, __SEIEED::vector_type_t msgSrc0, __SEIEED::vector_type_t msgSrc1) { - throw cl::sycl::feature_not_supported(); + assert(sfid == 0xF); // UGM type only + auto op = raw_send::getMsgOp(msgDesc); + assert(op == raw_send::msgOp::STORE_2D); + uint64_t surfaceBase = raw_send::getSurfaceBaseAddr(msgSrc0); + auto surfaceDim = raw_send::getSurfaceDim(msgSrc0); + auto blockOffset = raw_send::getBlockOffsets(msgSrc0); + auto blockDim = raw_send::getBlockDim(msgSrc0); + + unsigned SurfaceWidth = surfaceDim[0] + 1; + unsigned SurfaceHeight = surfaceDim[1] + 1; + unsigned SurfacePitch = surfaceDim[2] + 1; + + int X = blockOffset[0]; + int Y = blockOffset[1]; + int Width = blockDim[0] + 1; + int Height = blockDim[1] + 1; + + constexpr unsigned sizeofT = sizeof(Ty2); + + char *buffBase = (char *)surfaceBase; + + int vecIdx = 0; + int rowCount = 0; + for (int yWrite = Y * SurfacePitch; rowCount < Height; + yWrite += SurfacePitch) { + if (yWrite == SurfacePitch * SurfaceHeight) { + // Vertically Out-of-bound + break; + } + int writeCount = 0; + for (int xWrite = X * sizeofT; writeCount < Width; + xWrite += sizeofT, vecIdx += 1, writeCount += 1) { + if (xWrite >= 0 && xWrite < SurfaceWidth) { + *((Ty2 *)(buffBase + yWrite + xWrite)) = msgSrc1[vecIdx]; + } + } // xWrite loop + rowCount += 1; + } // yWrite loop } /// \brief Raw send store. 
@@ -1096,6 +1744,13 @@ inline void __esimd_raw_send_store(uint8_t modifier, uint8_t execSize, uint8_t numSrc0, uint8_t sfid, uint32_t exDesc, uint32_t msgDesc, __SEIEED::vector_type_t msgSrc0) { + auto op = raw_send::getMsgOp(msgDesc); + + if (op == raw_send::msgOp::LOAD_2D) { + // Prefetch? + return; + } + throw cl::sycl::feature_not_supported(); } diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp index dadcc437d0e5d..95dfdc9ab249b 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp @@ -803,7 +803,7 @@ media_block_store(AccessorTy acc, unsigned x, unsigned y, simd vals) { #ifndef __SYCL_DEVICE_ONLY__ -inline void slm_init(uint32_t size) {} +inline void slm_init(uint32_t size) { __esimd_slm_init(size); } #endif diff --git a/sycl/plugins/esimd_cpu/CMakeLists.txt b/sycl/plugins/esimd_cpu/CMakeLists.txt index b6c12e47b5e0f..4fbaf5ef65853 100755 --- a/sycl/plugins/esimd_cpu/CMakeLists.txt +++ b/sycl/plugins/esimd_cpu/CMakeLists.txt @@ -2,9 +2,112 @@ # PI Esimd CPU library # Create Shared library for libpi_esimd_cpu.so. 
+include(ExternalProject) + include_directories("${sycl_inc_dir}") include_directories(${OpenCL_INCLUDE_DIR}) -include_directories(${LIBCMRT_INCLUDE}) + +if (NOT MSVC) + file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/libva_build) + file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/libva_install) + ExternalProject_Add(libva + GIT_REPOSITORY https://github.com/intel/libva.git + GIT_TAG bef69c5f380a27b6908d6daea0fc18ce50e33bb9 + BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/libva_build + INSTALL_DIR ${CMAKE_CURRENT_BINARY_DIR}/libva_install + CONFIGURE_COMMAND cd ${CMAKE_CURRENT_BINARY_DIR}/libva-prefix/src/libva && ./autogen.sh --prefix=${CMAKE_CURRENT_BINARY_DIR}/libva_install + BUILD_COMMAND cd ${CMAKE_CURRENT_BINARY_DIR}/libva-prefix/src/libva && make -j + INSTALL_COMMAND cd ${CMAKE_CURRENT_BINARY_DIR}/libva-prefix/src/libva && make install + ) + ExternalProject_Add_Step(libva llvminstall + COMMAND ${CMAKE_COMMAND} -E make_directory ${LLVM_BINARY_DIR}/pi_esimd_cpu_deps && ${CMAKE_COMMAND} -E copy_directory / ${LLVM_BINARY_DIR}/pi_esimd_cpu_deps + COMMENT "Installing libva into the LLVM binary directory" + DEPENDEES install + ) +endif() + +file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/cm-emu_build) +file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/cm-emu_install) + +if (MSVC) + set(LIBCM ${LLVM_BINARY_DIR}/pi_esimd_cpu_deps/lib/libcm${CMAKE_STATIC_LIBRARY_SUFFIX}) + set(LIBIGFXCMRT_EMU ${LLVM_BINARY_DIR}/pi_esimd_cpu_deps/lib/igfxcmrt64_emu${CMAKE_STATIC_LIBRARY_SUFFIX}) +else() + set(LIBCM ${LLVM_BINARY_DIR}/pi_esimd_cpu_deps/lib/libcm${CMAKE_SHARED_LIBRARY_SUFFIX}) + set(LIBIGFXCMRT_EMU ${LLVM_BINARY_DIR}/pi_esimd_cpu_deps/lib/libigfxcmrt_emu${CMAKE_SHARED_LIBRARY_SUFFIX}) +endif() + +if (DEFINED CM_LOCAL_SOURCE_DIR) + # Using local CM directory for online building without downloading + if (MSVC) + ExternalProject_Add(cm-emu + DOWNLOAD_COMMAND "" + SOURCE_DIR ${CM_LOCAL_SOURCE_DIR} + BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/cm-emu_build + INSTALL_DIR 
${CMAKE_CURRENT_BINARY_DIR}/cm-emu_install + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX= + BUILD_BYPRODUCTS ${LIBCM} ${LIBIGFXCMRT_EMU} + ) + else() + ExternalProject_Add(cm-emu + DOWNLOAD_COMMAND "" + SOURCE_DIR ${CM_LOCAL_SOURCE_DIR} + BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/cm-emu_build + INSTALL_DIR ${CMAKE_CURRENT_BINARY_DIR}/cm-emu_install + CMAKE_ARGS -DLIBVA_INSTALL_PATH=${CMAKE_CURRENT_BINARY_DIR}/libva_install + -D__SYCL_EXPLICIT_SIMD_PLUGIN__=true + -DCMAKE_INSTALL_PREFIX= + DEPENDS libva + BUILD_BYPRODUCTS ${LIBCM} ${LIBIGFXCMRT_EMU} + ) + endif() +else () + if (DEFINED CM_PACKAGE_URL) + # Downloading pre-built CM Package + file (MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/cm-emu_install) + ExternalProject_Add(cm-emu + URL ${CM_PACKAGE_URL} + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + UPDATE_COMMAND "" + INSTALL_DIR ${CMAKE_CURRENT_BINARY_DIR}/cm-emu_install + INSTALL_COMMAND ${CMAKE_COMMAND} -E copy_directory ${CMAKE_CURRENT_BINARY_DIR}/cm-emu-prefix/src/cm-emu/ + BUILD_BYPRODUCTS ${LIBCM} ${LIBIGFXCMRT_EMU} + ) + else() + # Build from CM source tree fetched from github + ExternalProject_Add(cm-emu + GIT_REPOSITORY https://github.com/intel/cm-cpu-emulation.git + BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/cm-emu_build + INSTALL_DIR ${CMAKE_CURRENT_BINARY_DIR}/cm-emu_install + CMAKE_ARGS -DLIBVA_INSTALL_PATH=${CMAKE_CURRENT_BINARY_DIR}/libva_install + -DCMAKE_INSTALL_PREFIX= + DEPENDS libva + BUILD_BYPRODUCTS ${LIBCM} ${LIBIGFXCMRT_EMU} + ) + endif() +endif () +ExternalProject_Add_Step(cm-emu llvminstall + COMMAND ${CMAKE_COMMAND} -E make_directory ${LLVM_BINARY_DIR}/pi_esimd_cpu_deps && ${CMAKE_COMMAND} -E copy_directory / ${LLVM_BINARY_DIR}/pi_esimd_cpu_deps + COMMENT "Installing cm-emu into the LLVM binary directory" + DEPENDEES install +) + +include_directories(${LLVM_BINARY_DIR}/pi_esimd_cpu_deps/include/igfxcmrt_emu) +include_directories(${LLVM_BINARY_DIR}/pi_esimd_cpu_deps/include/libcm/cm) + +# Compilation flag to exclude lines in header files imported from 
CM +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__SYCL_EXPLICIT_SIMD_PLUGIN__") + +set(CMAKE_CXX_STANDARD 17) + +# Compilation option modification to prevent build termination caused by +# warnings from CM-imported files +if (MSVC) +string(REPLACE "/W4" " " CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") +else() +string(REPLACE "-pedantic" " " CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") +endif() add_library(pi_esimd_cpu SHARED "${sycl_inc_dir}/CL/sycl/detail/pi.h" @@ -31,16 +134,42 @@ else() ) endif() +add_dependencies(pi_esimd_cpu OpenCL-Headers) +add_dependencies(pi_esimd_cpu cm-emu) add_dependencies(sycl-toolchain pi_esimd_cpu) -add_dependencies(pi_esimd_cpu - OpenCL-Headers) - -target_link_libraries(pi_esimd_cpu PRIVATE sycl) +target_link_libraries(pi_esimd_cpu PRIVATE sycl ${LIBCM} ${LIBIGFXCMRT_EMU}) set_target_properties(pi_esimd_cpu PROPERTIES LINKER_LANGUAGE CXX) add_common_options(pi_esimd_cpu) install(TARGETS pi_esimd_cpu - LIBRARY DESTINATION "lib${LLVM_LIBDIR_SUFFIX}" COMPONENT pi_esimd_cpu - RUNTIME DESTINATION "bin" COMPONENT pi_esimd_cpu) + LIBRARY DESTINATION "lib${LLVM_LIBDIR_SUFFIX}" COMPONENT pi_esimd_cpu + RUNTIME DESTINATION "bin" COMPONENT pi_esimd_cpu) + +# Copy CM Header files to $(INSTALL)/include/sycl/CL/ +install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/cm-emu_install/include/libcm/cm/ + DESTINATION ${SYCL_INCLUDE_DIR}/CL + COMPONENT libcmrt-headers + FILES_MATCHING PATTERN "*.h" +) + +# Copy '.so' files to '$(INSTALL)/lib' +if (MSVC) + install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/cm-emu_install/lib/ + DESTINATION ${CMAKE_INSTALL_PREFIX}/lib + COMPONENT libcmrt-libs + FILES_MATCHING PATTERN "*.lib" + ) + install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/cm-emu_install/bin/ + DESTINATION ${CMAKE_INSTALL_PREFIX}/bin + COMPONENT libcmrt-dlls + FILES_MATCHING PATTERN "*.dll" + ) +else() + install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/cm-emu_install/lib/ + DESTINATION ${CMAKE_INSTALL_PREFIX}/lib + COMPONENT libcmrt-sos + FILES_MATCHING PATTERN "*.so" + ) +endif() 
diff --git a/sycl/plugins/esimd_cpu/pi_esimd_cpu.cpp b/sycl/plugins/esimd_cpu/pi_esimd_cpu.cpp index 63fc720f49eee..ce6126736275f 100644 --- a/sycl/plugins/esimd_cpu/pi_esimd_cpu.cpp +++ b/sycl/plugins/esimd_cpu/pi_esimd_cpu.cpp @@ -28,6 +28,8 @@ #include #include +#include + #include #include #include @@ -38,17 +40,77 @@ #include #include -#ifdef __GNUC__ -// Linux -#include -#else -// Windows -#include -#endif - #include "pi_esimd_cpu.hpp" -#define PLACEHOLDER_UNUSED(x) (void)x +namespace { + +template +pi_result getInfoImpl(size_t param_value_size, void *param_value, + size_t *param_value_size_ret, T value, size_t value_size, + Assign &&assign_func) { + + if (param_value != nullptr) { + + if (param_value_size < value_size) { + return PI_INVALID_VALUE; + } + + assign_func(param_value, value, value_size); + } + + if (param_value_size_ret != nullptr) { + *param_value_size_ret = value_size; + } + + return PI_SUCCESS; +} + +template +pi_result getInfo(size_t param_value_size, void *param_value, + size_t *param_value_size_ret, T value) { + + auto assignment = [](void *param_value, T value, size_t value_size) { + *static_cast(param_value) = value; + }; + + return getInfoImpl(param_value_size, param_value, param_value_size_ret, value, + sizeof(T), assignment); +} + +template +pi_result getInfoArray(size_t array_length, size_t param_value_size, + void *param_value, size_t *param_value_size_ret, + T *value) { + return getInfoImpl(param_value_size, param_value, param_value_size_ret, value, + array_length * sizeof(T), memcpy); +} + +template <> +pi_result getInfo(size_t param_value_size, void *param_value, + size_t *param_value_size_ret, + const char *value) { + return getInfoArray(strlen(value) + 1, param_value_size, param_value, + param_value_size_ret, value); +} + +class ReturnHelper { +public: + ReturnHelper(size_t param_value_size, void *param_value, + size_t *param_value_size_ret) + : param_value_size(param_value_size), param_value(param_value), + 
param_value_size_ret(param_value_size_ret) {} + + template pi_result operator()(const T &t) { + return getInfo(param_value_size, param_value, param_value_size_ret, t); + } + +private: + size_t param_value_size; + void *param_value; + size_t *param_value_size_ret; +}; + +} // anonymous namespace // Global variables used in PI_esimd_cpu // Note we only create a simple pointer variables such that C++ RT won't @@ -115,121 +177,208 @@ LAMBDA_WRAPPER_TMPL(sycl::nd_item<3>, NDITEM_3DIM, 3) #undef _COMMA_ #undef LAMBDA_WRAPPER_TMPL -extern "C" inline void invokeLambda_ID_1DIM(void *Wrapper) { - auto *LambdaWrapper = reinterpret_cast(Wrapper); +extern "C" { - PLACEHOLDER_UNUSED(LambdaWrapper); - // TODO : construct cl::sycl::id<1> instance using thread ID info - // retrieved from CM and call Lambda function - // LambdaWrapper->Func(id_1dim); +inline void InvokeLambda_ID_1DIM(void *Wrapper) { + auto *LambdaWrapper = reinterpret_cast(Wrapper); + cl::sycl::id<1> Id1Dim(cm_support::get_thread_idx(0)); + LambdaWrapper->Func(Id1Dim); } -extern "C" inline void invokeLambda_ID_2DIM(void *Wrapper) { +inline void InvokeLambda_ID_2DIM(void *Wrapper) { auto *LambdaWrapper = reinterpret_cast(Wrapper); - - PLACEHOLDER_UNUSED(LambdaWrapper); - // TODO : construct cl::sycl::id<2> instance using thread ID info - // retrieved from CM and call Lambda function - // LambdaWrapper->Func(id_2dim); + cl::sycl::id<2> Id2Dim(cm_support::get_thread_idx(0), + cm_support::get_thread_idx(1)); + LambdaWrapper->Func(Id2Dim); } -extern "C" inline void invokeLambda_ID_3DIM(void *Wrapper) { +inline void InvokeLambda_ID_3DIM(void *Wrapper) { auto *LambdaWrapper = reinterpret_cast(Wrapper); - - PLACEHOLDER_UNUSED(LambdaWrapper); - // TODO : construct cl::sycl::id<3> instance using thread ID info - // retrieved from CM and call Lambda function - // LambdaWrapper->Func(id_3dim); + cl::sycl::id<3> Id3Dim(cm_support::get_thread_idx(0), + cm_support::get_thread_idx(1), + cm_support::get_thread_idx(2)); + 
LambdaWrapper->Func(Id3Dim); } -extern "C" inline void invokeLambda_ITEM_1DIM(void *Wrapper) { +inline void InvokeLambda_ITEM_1DIM(void *Wrapper) { auto *LambdaWrapper = reinterpret_cast(Wrapper); - - PLACEHOLDER_UNUSED(LambdaWrapper); - // TODO : construct cl::sycl::item<1, false> instance using thread - // ID info retrieved from CM and call Lambda function - // LambdaWrapper->Func(item_1dim); + cl::sycl::item<1, false> Item1Dim = IDBuilder::createItem<1, false>( + {cm_support::get_thread_count(0)}, /// Extent + {cm_support::get_thread_idx(0)}); /// Index + LambdaWrapper->Func(Item1Dim); } -extern "C" inline void invokeLambda_ITEM_2DIM(void *Wrapper) { +inline void InvokeLambda_ITEM_2DIM(void *Wrapper) { auto *LambdaWrapper = reinterpret_cast(Wrapper); - - PLACEHOLDER_UNUSED(LambdaWrapper); - // TODO : construct cl::sycl::item<2, false> instance using thread - // ID info retrieved from CM and call Lambda function - // LambdaWrapper->Func(item_2dim); + cl::sycl::item<2, false> Item2Dim = IDBuilder::createItem<2, false>( + {cm_support::get_thread_count(0), /// Extent + cm_support::get_thread_count(1)}, + {cm_support::get_thread_idx(0), /// Index + cm_support::get_thread_idx(1)}); + LambdaWrapper->Func(Item2Dim); } -extern "C" inline void invokeLambda_ITEM_3DIM(void *Wrapper) { +inline void InvokeLambda_ITEM_3DIM(void *Wrapper) { auto *LambdaWrapper = reinterpret_cast(Wrapper); + cl::sycl::item<3, false> Item3Dim = IDBuilder::createItem<3, false>( + {cm_support::get_thread_count(0), /// Extent + cm_support::get_thread_count(1), cm_support::get_thread_count(2)}, + {cm_support::get_thread_idx(0), /// Index + cm_support::get_thread_idx(1), cm_support::get_thread_idx(2)}); - PLACEHOLDER_UNUSED(LambdaWrapper); - // TODO : construct cl::sycl::item<3, false> instance using thread - // ID info retrieved from CM and call Lambda function - // LambdaWrapper->Func(item_3dim); + LambdaWrapper->Func(Item3Dim); } -extern "C" inline void invokeLambda_ITEM_OFFSET_1DIM(void *Wrapper) 
{ +inline void InvokeLambda_ITEM_OFFSET_1DIM(void *Wrapper) { auto *LambdaWrapper = reinterpret_cast(Wrapper); - PLACEHOLDER_UNUSED(LambdaWrapper); - // TODO : construct cl::sycl::item<1, true> instance using thread - // ID info retrieved from CM with GlobalOffset info and call Lambda - // function - // LambdaWrapper->Func(item_offset_1dim); + cl::sycl::item<1, true> ItemOffset1Dim = IDBuilder::createItem<1, true>( + {cm_support::get_thread_count(0)}, /// Extent + {cm_support::get_thread_idx(0) + + LambdaWrapper->GlobalOffset[0]}, /// Index + {LambdaWrapper->GlobalOffset[0]} /// Offset + ); + LambdaWrapper->Func(ItemOffset1Dim); } -extern "C" inline void invokeLambda_ITEM_OFFSET_2DIM(void *Wrapper) { +inline void InvokeLambda_ITEM_OFFSET_2DIM(void *Wrapper) { auto *LambdaWrapper = reinterpret_cast(Wrapper); - PLACEHOLDER_UNUSED(LambdaWrapper); - // TODO : construct cl::sycl::item<2, true> instance using thread - // ID info retrieved from CM with GlobalOffset info and call Lambda - // function - // LambdaWrapper->Func(item_offset_2dim); + cl::sycl::item<2, true> ItemOffset2Dim = IDBuilder::createItem<2, true>( + {cm_support::get_thread_count(0), /// Extent + cm_support::get_thread_count(1)}, + {cm_support::get_thread_idx(0) + LambdaWrapper->GlobalOffset[0], /// Index + cm_support::get_thread_idx(1) + LambdaWrapper->GlobalOffset[1]}, + {LambdaWrapper->GlobalOffset[0], /// Offset + LambdaWrapper->GlobalOffset[1]}); + LambdaWrapper->Func(ItemOffset2Dim); } -extern "C" inline void invokeLambda_ITEM_OFFSET_3DIM(void *Wrapper) { +inline void InvokeLambda_ITEM_OFFSET_3DIM(void *Wrapper) { auto *LambdaWrapper = reinterpret_cast(Wrapper); - PLACEHOLDER_UNUSED(LambdaWrapper); - // TODO : construct cl::sycl::item<3, true> instance using thread - // ID info retrieved from CM with GlobalOffset info and call Lambda - // function - // LambdaWrapper->Func(item_offset_3dim); + cl::sycl::item<3, true> ItemOffset3Dim = IDBuilder::createItem<3, true>( + 
{cm_support::get_thread_count(0), /// Extent + cm_support::get_thread_count(1), cm_support::get_thread_count(2)}, + {cm_support::get_thread_idx(0) + LambdaWrapper->GlobalOffset[0], /// Index + cm_support::get_thread_idx(1) + LambdaWrapper->GlobalOffset[1], + cm_support::get_thread_idx(2) + LambdaWrapper->GlobalOffset[2]}, + {LambdaWrapper->GlobalOffset[0], /// Offset + LambdaWrapper->GlobalOffset[1], LambdaWrapper->GlobalOffset[2]}); + LambdaWrapper->Func(ItemOffset3Dim); } -extern "C" inline void invokeLambda_NDITEM_1DIM(void *Wrapper) { +inline void InvokeLambda_NDITEM_1DIM(void *Wrapper) { auto *LambdaWrapper = reinterpret_cast(Wrapper); + sycl::range<1> GroupSize( + sycl::detail::InitializedVal<1, sycl::range>::template get<0>()); + + if (LambdaWrapper->LocalSize[0] == 0 || + LambdaWrapper->GlobalSize[0] % LambdaWrapper->LocalSize[0] != 0) { + throw sycl::nd_range_error("Invalid local size for global size - 1DIM", + PI_INVALID_WORK_GROUP_SIZE); + } + GroupSize[0] = LambdaWrapper->GlobalSize[0] / LambdaWrapper->LocalSize[0]; + + const sycl::id<1> LocalID = {cm_support::get_thread_idx(0)}; + + const sycl::id<1> GroupID = {cm_support::get_group_idx(0)}; - PLACEHOLDER_UNUSED(LambdaWrapper); - // TODO : construct cl::sycl::nd_item<1> instance using thread ID - // info retrieved from CM with GlobalOffset/GlobalSize/LocalSize - // info and call Lambda function - // LambdaWrapper->Func(nd_item_1dim); + const sycl::group<1> Group = IDBuilder::createGroup<1>( + LambdaWrapper->GlobalSize, LambdaWrapper->LocalSize, GroupSize, GroupID); + + const sycl::id<1> GlobalID = GroupID * LambdaWrapper->LocalSize + LocalID + + LambdaWrapper->GlobalOffset; + const sycl::item<1, /*Offset=*/true> GlobalItem = + IDBuilder::createItem<1, true>(LambdaWrapper->GlobalSize, GlobalID, + LambdaWrapper->GlobalOffset); + const sycl::item<1, /*Offset=*/false> LocalItem = + IDBuilder::createItem<1, false>(LambdaWrapper->LocalSize, LocalID); + + const sycl::nd_item<1> NDItem1Dim = + 
IDBuilder::createNDItem<1>(GlobalItem, LocalItem, Group); + + LambdaWrapper->Func(NDItem1Dim); } -extern "C" inline void invokeLambda_NDITEM_2DIM(void *Wrapper) { +inline void InvokeLambda_NDITEM_2DIM(void *Wrapper) { auto *LambdaWrapper = reinterpret_cast(Wrapper); + sycl::range<2> GroupSize( + sycl::detail::InitializedVal<2, sycl::range>::template get<0>()); + + for (int I = 0; I < 2 /*Dims*/; ++I) { + if (LambdaWrapper->LocalSize[I] == 0 || + LambdaWrapper->GlobalSize[I] % LambdaWrapper->LocalSize[I] != 0) { + throw sycl::nd_range_error("Invalid local size for global size - 2DIM", + PI_INVALID_WORK_GROUP_SIZE); + } + GroupSize[I] = LambdaWrapper->GlobalSize[I] / LambdaWrapper->LocalSize[I]; + } + + const sycl::id<2> LocalID = {cm_support::get_thread_idx(0), + cm_support::get_thread_idx(1)}; + + const sycl::id<2> GroupID = {cm_support::get_group_idx(0), + cm_support::get_group_idx(1)}; + + const sycl::group<2> Group = IDBuilder::createGroup<2>( + LambdaWrapper->GlobalSize, LambdaWrapper->LocalSize, GroupSize, GroupID); - PLACEHOLDER_UNUSED(LambdaWrapper); - // TODO : construct cl::sycl::nd_item<2> instance using thread ID - // info retrieved from CM with GlobalOffset/GlobalSize/LocalSize - // info and call Lambda function - // LambdaWrapper->Func(nd_item_2dim); + const sycl::id<2> GlobalID = GroupID * LambdaWrapper->LocalSize + LocalID + + LambdaWrapper->GlobalOffset; + const sycl::item<2, /*Offset=*/true> GlobalItem = + IDBuilder::createItem<2, true>(LambdaWrapper->GlobalSize, GlobalID, + LambdaWrapper->GlobalOffset); + const sycl::item<2, /*Offset=*/false> LocalItem = + IDBuilder::createItem<2, false>(LambdaWrapper->LocalSize, LocalID); + + const sycl::nd_item<2> NDItem2Dim = + IDBuilder::createNDItem<2>(GlobalItem, LocalItem, Group); + + LambdaWrapper->Func(NDItem2Dim); } -extern "C" inline void invokeLambda_NDITEM_3DIM(void *Wrapper) { +inline void InvokeLambda_NDITEM_3DIM(void *Wrapper) { auto *LambdaWrapper = reinterpret_cast(Wrapper); + sycl::range<3> 
GroupSize( + sycl::detail::InitializedVal<3, sycl::range>::template get<0>()); + + for (int I = 0; I < 3 /*Dims*/; ++I) { + if (LambdaWrapper->LocalSize[I] == 0 || + LambdaWrapper->GlobalSize[I] % LambdaWrapper->LocalSize[I] != 0) { + throw sycl::nd_range_error("Invalid local size for global size - 3DIM", + PI_INVALID_WORK_GROUP_SIZE); + } + GroupSize[I] = LambdaWrapper->GlobalSize[I] / LambdaWrapper->LocalSize[I]; + } + + const sycl::id<3> LocalID = {cm_support::get_thread_idx(0), + cm_support::get_thread_idx(1), + cm_support::get_thread_idx(2)}; + + const sycl::id<3> GroupID = {cm_support::get_group_idx(0), + cm_support::get_group_idx(1), + cm_support::get_group_idx(2)}; + + const sycl::group<3> Group = IDBuilder::createGroup<3>( + LambdaWrapper->GlobalSize, LambdaWrapper->LocalSize, GroupSize, GroupID); - PLACEHOLDER_UNUSED(LambdaWrapper); - // TODO : construct cl::sycl::nd_item<3> instance using thread ID - // info retrieved from CM with GlobalOffset/GlobalSize/LocalSize - // info and call Lambda function - // LambdaWrapper->Func(nd_item_3dim); + const sycl::id<3> GlobalID = GroupID * LambdaWrapper->LocalSize + LocalID + + LambdaWrapper->GlobalOffset; + const sycl::item<3, /*Offset=*/true> GlobalItem = + IDBuilder::createItem<3, true>(LambdaWrapper->GlobalSize, GlobalID, + LambdaWrapper->GlobalOffset); + const sycl::item<3, /*Offset=*/false> LocalItem = + IDBuilder::createItem<3, false>(LambdaWrapper->LocalSize, LocalID); + + const sycl::nd_item<3> NDItem3Dim = + IDBuilder::createNDItem<3>(GlobalItem, LocalItem, Group); + + LambdaWrapper->Func(NDItem3Dim); +} } // libCMBatch class defines interface for launching kernels with @@ -268,9 +417,10 @@ template class libCMBatch { SpaceDim[0] = (uint32_t)Range[0]; - PLACEHOLDER_UNUSED(WrappedLambda_ID_1DIM); - // TODO : Invoke invokeLambda_ID_1DIM through CM's multi-threaded - // kernel launching with WrappedLambda_ID_1DIM and dimension info + ESimdCPUKernel ESimdCPU((fptrVoid)InvokeLambda_ID_1DIM, GroupDim, SpaceDim); 
+ + ESimdCPU.launchMT(sizeof(struct LambdaWrapper_ID_1DIM), + WrappedLambda_ID_1DIM.get()); } // ID_2DIM @@ -284,9 +434,10 @@ template class libCMBatch { SpaceDim[0] = (uint32_t)Range[0]; SpaceDim[1] = (uint32_t)Range[1]; - PLACEHOLDER_UNUSED(WrappedLambda_ID_2DIM); - // TODO : Invoke invokeLambda_ID_2DIM through CM's multi-threaded - // kernel launching with WrappedLambda_ID_2DIM and dimension info + ESimdCPUKernel ESimdCPU((fptrVoid)InvokeLambda_ID_2DIM, GroupDim, SpaceDim); + + ESimdCPU.launchMT(sizeof(struct LambdaWrapper_ID_2DIM), + WrappedLambda_ID_2DIM.get()); } // ID_3DIM @@ -301,9 +452,10 @@ template class libCMBatch { SpaceDim[1] = (uint32_t)Range[1]; SpaceDim[2] = (uint32_t)Range[2]; - PLACEHOLDER_UNUSED(WrappedLambda_ID_3DIM); - // TODO : Invoke invokeLambda_ID_3DIM through CM's multi-threaded - // kernel launching with WrappedLambda_ID_3DIM and dimension info + ESimdCPUKernel ESimdCPU((fptrVoid)InvokeLambda_ID_3DIM, GroupDim, SpaceDim); + + ESimdCPU.launchMT(sizeof(struct LambdaWrapper_ID_3DIM), + WrappedLambda_ID_3DIM.get()); } // Item w/o offset @@ -317,9 +469,11 @@ template class libCMBatch { SpaceDim[0] = (uint32_t)Range[0]; - PLACEHOLDER_UNUSED(WrappedLambda_ITEM_1DIM); - // TODO : Invoke invokeLambda_ITEM_1DIM through CM's multi-threaded - // kernel launching with WrappedLambda_ITEM_1DIM and dimension info + ESimdCPUKernel ESimdCPU((fptrVoid)InvokeLambda_ITEM_1DIM, GroupDim, + SpaceDim); + + ESimdCPU.launchMT(sizeof(struct LambdaWrapper_ITEM_1DIM), + WrappedLambda_ITEM_1DIM.get()); } template @@ -333,9 +487,11 @@ template class libCMBatch { SpaceDim[0] = (uint32_t)Range[0]; SpaceDim[1] = (uint32_t)Range[1]; - PLACEHOLDER_UNUSED(WrappedLambda_ITEM_2DIM); - // TODO : Invoke invokeLambda_ITEM_2DIM through CM's multi-threaded - // kernel launching with WrappedLambda_ITEM_2DIM and dimension info + ESimdCPUKernel ESimdCPU((fptrVoid)InvokeLambda_ITEM_2DIM, GroupDim, + SpaceDim); + + ESimdCPU.launchMT(sizeof(struct LambdaWrapper_ITEM_2DIM), + 
WrappedLambda_ITEM_2DIM.get()); } template @@ -350,9 +506,11 @@ template class libCMBatch { SpaceDim[1] = (uint32_t)Range[1]; SpaceDim[2] = (uint32_t)Range[2]; - PLACEHOLDER_UNUSED(WrappedLambda_ITEM_3DIM); - // TODO : Invoke invokeLambda_ITEM_3DIM through CM's multi-threaded - // kernel launching with WrappedLambda_ITEM_3DIM and dimension info + ESimdCPUKernel ESimdCPU((fptrVoid)InvokeLambda_ITEM_3DIM, GroupDim, + SpaceDim); + + ESimdCPU.launchMT(sizeof(struct LambdaWrapper_ITEM_3DIM), + WrappedLambda_ITEM_3DIM.get()); } // Item w/ offset @@ -366,9 +524,11 @@ template class libCMBatch { SpaceDim[0] = (uint32_t)Range[0]; - PLACEHOLDER_UNUSED(WrappedLambda_ITEM_OFFSET_1DIM); - // TODO : Invoke invokeLambda_ITEM_OFFSET_1DIM through CM's multi-threaded - // kernel launching with WrappedLambda_ITEM_OFFSET_1DIM and dimension info + ESimdCPUKernel ESimdCPU((fptrVoid)InvokeLambda_ITEM_OFFSET_1DIM, GroupDim, + SpaceDim); + + ESimdCPU.launchMT(sizeof(struct LambdaWrapper_ITEM_OFFSET_1DIM), + WrappedLambda_ITEM_OFFSET_1DIM.get()); } template @@ -382,9 +542,11 @@ template class libCMBatch { SpaceDim[0] = (uint32_t)Range[0]; SpaceDim[1] = (uint32_t)Range[1]; - PLACEHOLDER_UNUSED(WrappedLambda_ITEM_OFFSET_2DIM); - // TODO : Invoke invokeLambda_ITEM_OFFSET_2DIM through CM's multi-threaded - // kernel launching with WrappedLambda_ITEM_OFFSET_2DIM and dimension info + ESimdCPUKernel ESimdCPU((fptrVoid)InvokeLambda_ITEM_OFFSET_2DIM, GroupDim, + SpaceDim); + + ESimdCPU.launchMT(sizeof(struct LambdaWrapper_ITEM_OFFSET_2DIM), + WrappedLambda_ITEM_OFFSET_2DIM.get()); } template @@ -399,9 +561,11 @@ template class libCMBatch { SpaceDim[1] = (uint32_t)Range[1]; SpaceDim[2] = (uint32_t)Range[2]; - PLACEHOLDER_UNUSED(WrappedLambda_ITEM_OFFSET_3DIM); - // TODO : Invoke invokeLambda_ITEM_OFFSET_3DIM through CM's multi-threaded - // kernel launching with WrappedLambda_ITEM_OFFSET_3DIM and dimension info + ESimdCPUKernel ESimdCPU((fptrVoid)InvokeLambda_ITEM_OFFSET_3DIM, GroupDim, + SpaceDim); 
+ + ESimdCPU.launchMT(sizeof(struct LambdaWrapper_ITEM_OFFSET_3DIM), + WrappedLambda_ITEM_OFFSET_3DIM.get()); } // NDItem_1DIM @@ -418,9 +582,11 @@ template class libCMBatch { GroupDim[0] = (uint32_t)(GlobalSize[0] / LocalSize[0]); - PLACEHOLDER_UNUSED(WrappedLambda_NDITEM_1DIM); - // TODO : Invoke invokeLambda_NDITEM_1DIM through CM's multi-threaded - // kernel launching with WrappedLambda_NDITEM_1DIM and dimension info + ESimdCPUKernel ESimdCPU((fptrVoid)InvokeLambda_NDITEM_1DIM, GroupDim, + SpaceDim); + + ESimdCPU.launchMT(sizeof(struct LambdaWrapper_NDITEM_1DIM), + WrappedLambda_NDITEM_1DIM.get()); } // NDItem_2DIM @@ -439,9 +605,11 @@ template class libCMBatch { GroupDim[0] = (uint32_t)(GlobalSize[0] / LocalSize[0]); GroupDim[1] = (uint32_t)(GlobalSize[1] / LocalSize[1]); - PLACEHOLDER_UNUSED(WrappedLambda_NDITEM_2DIM); - // TODO : Invoke invokeLambda_NDITEM_2DIM through CM's multi-threaded - // kernel launching with WrappedLambda_NDITEM_2DIM and dimension info + ESimdCPUKernel ESimdCPU((fptrVoid)InvokeLambda_NDITEM_2DIM, GroupDim, + SpaceDim); + + ESimdCPU.launchMT(sizeof(struct LambdaWrapper_NDITEM_2DIM), + WrappedLambda_NDITEM_2DIM.get()); } // NDItem_3DIM @@ -462,30 +630,104 @@ template class libCMBatch { GroupDim[1] = (uint32_t)(GlobalSize[1] / LocalSize[1]); GroupDim[2] = (uint32_t)(GlobalSize[2] / LocalSize[2]); - PLACEHOLDER_UNUSED(WrappedLambda_NDITEM_3DIM); - // TODO : Invoke invokeLambda_NDITEM_3DIM through CM's multi-threaded - // kernel launching with WrappedLambda_NDITEM_3DIM and dimension info + ESimdCPUKernel ESimdCPU((fptrVoid)InvokeLambda_NDITEM_3DIM, GroupDim, + SpaceDim); + + ESimdCPU.launchMT(sizeof(struct LambdaWrapper_NDITEM_3DIM), + WrappedLambda_NDITEM_3DIM.get()); } }; + +/// Implementation for ESIMD_CPU device interface accessing ESIMD +/// intrinsics and LibCM functionalities required by intrinsics // Intrinsics sycl::detail::ESIMDDeviceInterface::ESIMDDeviceInterface() { - reserved = nullptr; version = ESIMDEmuPluginInterfaceVersion; + 
reserved = nullptr; - /// TODO : Fill *_ptr fields with function pointers from CM - /// functions prefixed with 'cm_support' + /* From 'esimd_emu_functions_v1.h' : Start */ + cm_barrier_ptr = cm_support::barrier; + cm_sbarrier_ptr = cm_support::split_barrier; + cm_fence_ptr = cm_support::fence; - cm_barrier_ptr = nullptr; /* cm_support::barrier; */ - cm_sbarrier_ptr = nullptr; /* cm_support::split_barrier; */ - cm_fence_ptr = nullptr; /* cm_support::fence; */ + sycl_get_surface_base_addr_ptr = cm_support::get_surface_base_addr; + __cm_emu_get_slm_ptr = cm_support::get_slm_base; + cm_slm_init_ptr = cm_support::init_slm; + /* From 'esimd_emu_functions_v1.h' : End */ +} + +/// Implementation for Host Kernel Launch used by +/// piEnqueueKernelLaunch +template using KernelFunc = std::function; - sycl_get_surface_base_addr_ptr = - nullptr; /* cm_support::get_surface_base_addr; */ - __cm_emu_get_slm_ptr = nullptr; /* cm_support::get_slm_base; */ - cm_slm_init_ptr = nullptr; /* cm_support::init_slm; */ +template struct InvokeBaseImpl { + static sycl::range get_range(const size_t *GlobalWorkSize); +}; + +static constexpr bool isNull(int NDims, const size_t *R) { + return ((0 == R[0]) && (1 > NDims || 0 == R[1]) && (2 > NDims || 0 == R[2])); } +template struct InvokeImpl { + + template + static typename std::enable_if<_NDims == 1, sycl::range<1>>::type + get_range(const size_t *a) { + return sycl::range<1>(a[0]); + } + + template + static typename std::enable_if<_NDims == 2, sycl::range<2>>::type + get_range(const size_t *a) { + return sycl::range<2>(a[0], a[1]); + } + + template + static typename std::enable_if<_NDims == 3, sycl::range<3>>::type + get_range(const size_t *a) { + return sycl::range<3>(a[0], a[1], a[2]); + } + + static void invoke(void *fptr, const sycl::range &range) { + auto f = reinterpret_cast *>(fptr); + libCMBatch, ArgTy, NDims> CmThreading(*f); + CmThreading.runIterationSpace(range); + } + + static void invoke(void *fptr, const size_t *GlobalWorkSize) { 
+ sycl::range range = get_range(GlobalWorkSize); + invoke(fptr, range); + } + + static void invoke(void *fptr, const size_t *GlobalWorkOffset, + const size_t *GlobalWorkSize) { + auto GlobalSize = get_range(GlobalWorkSize); + sycl::id GlobalOffset = get_range(GlobalWorkOffset); + + auto f = reinterpret_cast *>(fptr); + libCMBatch, ArgTy, NDims> CmThreading(*f); + CmThreading.runIterationSpace(GlobalSize, GlobalOffset); + } + + static void invoke(void *fptr, const size_t *GlobalWorkOffset, + const size_t *GlobalWorkSize, + const size_t *LocalWorkSize) { + const size_t LocalWorkSz[] = {1, 1, 1}; + if (isNull(NDims, LocalWorkSize)) { + LocalWorkSize = LocalWorkSz; + } + + auto GlobalSize = get_range(GlobalWorkSize); + auto LocalSize = get_range(LocalWorkSize); + sycl::id GlobalOffset = get_range(GlobalWorkOffset); + + auto f = reinterpret_cast *>(fptr); + libCMBatch, ArgTy, NDims> CmThreading(*f); + + CmThreading.runIterationSpace(LocalSize, GlobalSize, GlobalOffset); + } +}; + extern "C" { #define DIE_NO_IMPLEMENTATION \ @@ -506,22 +748,52 @@ extern "C" { pi_result piPlatformsGet(pi_uint32 NumEntries, pi_platform *Platforms, pi_uint32 *NumPlatforms) { - (void)NumEntries; - (void)Platforms; - (void)NumPlatforms; - DIE_NO_IMPLEMENTATION; + if (NumEntries == 0 && Platforms != nullptr) { + return PI_INVALID_VALUE; + } + if (Platforms == nullptr && NumPlatforms == nullptr) { + return PI_INVALID_VALUE; + } + + if (Platforms && NumEntries > 0) { + *Platforms = new _pi_platform(); + Platforms[0]->CmEmuVersion = std::string("0.0.1"); + } + + if (NumPlatforms) { + *NumPlatforms = 1; + } + return PI_SUCCESS; } pi_result piPlatformGetInfo(pi_platform Platform, pi_platform_info ParamName, size_t ParamValueSize, void *ParamValue, size_t *ParamValueSizeRet) { - (void)Platform; - (void)ParamName; - (void)ParamValueSize; - (void)ParamValue; - (void)ParamValueSizeRet; - DIE_NO_IMPLEMENTATION; + assert(Platform); + ReturnHelper ReturnValue(ParamValueSize, ParamValue, 
ParamValueSizeRet); + + switch (ParamName) { + case PI_PLATFORM_INFO_NAME: + return ReturnValue("Intel(R) ESIMD_CPU/GPU"); + + case PI_PLATFORM_INFO_VENDOR: + return ReturnValue("Intel(R) Corporation"); + + case PI_PLATFORM_INFO_VERSION: + return ReturnValue(Platform->CmEmuVersion); + + case PI_PLATFORM_INFO_PROFILE: + return ReturnValue("CM_FULL_PROFILE"); + + case PI_PLATFORM_INFO_EXTENSIONS: + return ReturnValue(""); + + default: + // TODO: implement other parameters + die("Unsupported ParamName in piPlatformGetInfo"); + } + return PI_SUCCESS; } @@ -538,35 +810,148 @@ pi_result piextPlatformCreateWithNativeHandle(pi_native_handle, pi_platform *) { pi_result piDevicesGet(pi_platform Platform, pi_device_type DeviceType, pi_uint32 NumEntries, pi_device *Devices, pi_uint32 *NumDevices) { - (void)Platform; - (void)DeviceType; - (void)NumEntries; - (void)Devices; - (void)NumDevices; - DIE_NO_IMPLEMENTATION; + if (NumEntries == 0) { + if (NumDevices) { + *NumDevices = 1; + } else { + return PI_INVALID_VALUE; + } + } + + if (NumDevices) { + *NumDevices = 1; + } else { + // assert(NumEntries == 1); + Devices[0] = new _pi_device(Platform); + } + return PI_SUCCESS; } pi_result piDeviceRetain(pi_device Device) { - (void)Device; - DIE_NO_IMPLEMENTATION; + assert(Device); + + ++(Device->RefCount); + return PI_SUCCESS; } pi_result piDeviceRelease(pi_device) { - DIE_NO_IMPLEMENTATION; + CONTINUE_NO_IMPLEMENTATION; return PI_SUCCESS; } pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName, size_t ParamValueSize, void *ParamValue, size_t *ParamValueSizeRet) { - (void)Device; - (void)ParamName; - (void)ParamValueSize; - (void)ParamValue; - (void)ParamValueSizeRet; - DIE_NO_IMPLEMENTATION; + ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet); + + switch (ParamName) { + case PI_DEVICE_INFO_TYPE: + return ReturnValue(PI_DEVICE_TYPE_GPU); + case PI_DEVICE_INFO_PARENT_DEVICE: + return ReturnValue(pi_device{0}); + case PI_DEVICE_INFO_PLATFORM: + 
return ReturnValue(Device->Platform);
+ case PI_DEVICE_INFO_NAME:
+ return ReturnValue("ESIMD_CPU");
+ case PI_DEVICE_INFO_IMAGE_SUPPORT:
+ return ReturnValue(pi_bool{true});
+ case PI_DEVICE_INFO_DRIVER_VERSION:
+ return ReturnValue("0.0.1");
+ case PI_DEVICE_INFO_VENDOR:
+ return ReturnValue("Intel(R) Corporation");
+ case PI_DEVICE_INFO_IMAGE2D_MAX_WIDTH:
+ return ReturnValue(size_t{8192});
+ case PI_DEVICE_INFO_IMAGE2D_MAX_HEIGHT:
+ return ReturnValue(size_t{8192});
+ case PI_DEVICE_INFO_HOST_UNIFIED_MEMORY:
+ return ReturnValue(pi_bool{1});
+
+#define UNSUPPORTED_INFO(info) \
+ case info: \
+ std::cerr << std::endl \
+ << "Unsupported device info = " << #info << std::endl; \
+ DIE_NO_IMPLEMENTATION; \
+ break;
+
+ UNSUPPORTED_INFO(PI_DEVICE_INFO_VENDOR_ID)
+ UNSUPPORTED_INFO(PI_DEVICE_INFO_EXTENSIONS)
+ UNSUPPORTED_INFO(PI_DEVICE_INFO_COMPILER_AVAILABLE)
+ UNSUPPORTED_INFO(PI_DEVICE_INFO_LINKER_AVAILABLE)
+ UNSUPPORTED_INFO(PI_DEVICE_INFO_MAX_COMPUTE_UNITS)
+ UNSUPPORTED_INFO(PI_DEVICE_INFO_MAX_WORK_ITEM_DIMENSIONS)
+ UNSUPPORTED_INFO(PI_DEVICE_INFO_MAX_WORK_GROUP_SIZE)
+ UNSUPPORTED_INFO(PI_DEVICE_INFO_MAX_WORK_ITEM_SIZES)
+ UNSUPPORTED_INFO(PI_DEVICE_INFO_MAX_CLOCK_FREQUENCY)
+ UNSUPPORTED_INFO(PI_DEVICE_INFO_ADDRESS_BITS)
+ UNSUPPORTED_INFO(PI_DEVICE_INFO_MAX_MEM_ALLOC_SIZE)
+ UNSUPPORTED_INFO(PI_DEVICE_INFO_GLOBAL_MEM_SIZE)
+ UNSUPPORTED_INFO(PI_DEVICE_INFO_LOCAL_MEM_SIZE)
+ UNSUPPORTED_INFO(PI_DEVICE_INFO_AVAILABLE)
+ UNSUPPORTED_INFO(PI_DEVICE_INFO_VERSION)
+ UNSUPPORTED_INFO(PI_DEVICE_INFO_PARTITION_MAX_SUB_DEVICES)
+ UNSUPPORTED_INFO(PI_DEVICE_INFO_REFERENCE_COUNT)
+ UNSUPPORTED_INFO(PI_DEVICE_INFO_PARTITION_PROPERTIES)
+ UNSUPPORTED_INFO(PI_DEVICE_INFO_PARTITION_AFFINITY_DOMAIN)
+ UNSUPPORTED_INFO(PI_DEVICE_INFO_PARTITION_TYPE)
+ UNSUPPORTED_INFO(PI_DEVICE_INFO_OPENCL_C_VERSION)
+ UNSUPPORTED_INFO(PI_DEVICE_INFO_PREFERRED_INTEROP_USER_SYNC)
+ UNSUPPORTED_INFO(PI_DEVICE_INFO_PRINTF_BUFFER_SIZE)
+ UNSUPPORTED_INFO(PI_DEVICE_INFO_PROFILE)
+ 
UNSUPPORTED_INFO(PI_DEVICE_INFO_BUILT_IN_KERNELS) + UNSUPPORTED_INFO(PI_DEVICE_INFO_QUEUE_PROPERTIES) + UNSUPPORTED_INFO(PI_DEVICE_INFO_EXECUTION_CAPABILITIES) + UNSUPPORTED_INFO(PI_DEVICE_INFO_ENDIAN_LITTLE) + UNSUPPORTED_INFO(PI_DEVICE_INFO_ERROR_CORRECTION_SUPPORT) + UNSUPPORTED_INFO(PI_DEVICE_INFO_PROFILING_TIMER_RESOLUTION) + UNSUPPORTED_INFO(PI_DEVICE_INFO_LOCAL_MEM_TYPE) + UNSUPPORTED_INFO(PI_DEVICE_INFO_MAX_CONSTANT_ARGS) + UNSUPPORTED_INFO(PI_DEVICE_INFO_MAX_CONSTANT_BUFFER_SIZE) + UNSUPPORTED_INFO(PI_DEVICE_INFO_GLOBAL_MEM_CACHE_TYPE) + UNSUPPORTED_INFO(PI_DEVICE_INFO_GLOBAL_MEM_CACHELINE_SIZE) + UNSUPPORTED_INFO(PI_DEVICE_INFO_GLOBAL_MEM_CACHE_SIZE) + UNSUPPORTED_INFO(PI_DEVICE_INFO_MAX_PARAMETER_SIZE) + UNSUPPORTED_INFO(PI_DEVICE_INFO_MEM_BASE_ADDR_ALIGN) + UNSUPPORTED_INFO(PI_DEVICE_INFO_MAX_SAMPLERS) + UNSUPPORTED_INFO(PI_DEVICE_INFO_MAX_READ_IMAGE_ARGS) + UNSUPPORTED_INFO(PI_DEVICE_INFO_MAX_WRITE_IMAGE_ARGS) + UNSUPPORTED_INFO(PI_DEVICE_INFO_SINGLE_FP_CONFIG) + UNSUPPORTED_INFO(PI_DEVICE_INFO_HALF_FP_CONFIG) + UNSUPPORTED_INFO(PI_DEVICE_INFO_DOUBLE_FP_CONFIG) + UNSUPPORTED_INFO(PI_DEVICE_INFO_IMAGE3D_MAX_WIDTH) + UNSUPPORTED_INFO(PI_DEVICE_INFO_IMAGE3D_MAX_HEIGHT) + UNSUPPORTED_INFO(PI_DEVICE_INFO_IMAGE3D_MAX_DEPTH) + UNSUPPORTED_INFO(PI_DEVICE_INFO_IMAGE_MAX_BUFFER_SIZE) + UNSUPPORTED_INFO(PI_DEVICE_INFO_IMAGE_MAX_ARRAY_SIZE) + UNSUPPORTED_INFO(PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_CHAR) + UNSUPPORTED_INFO(PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_CHAR) + UNSUPPORTED_INFO(PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_SHORT) + UNSUPPORTED_INFO(PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_SHORT) + UNSUPPORTED_INFO(PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_INT) + UNSUPPORTED_INFO(PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_INT) + UNSUPPORTED_INFO(PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_LONG) + UNSUPPORTED_INFO(PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_LONG) + UNSUPPORTED_INFO(PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_FLOAT) + UNSUPPORTED_INFO(PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_FLOAT) + 
UNSUPPORTED_INFO(PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_DOUBLE) + UNSUPPORTED_INFO(PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_DOUBLE) + UNSUPPORTED_INFO(PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_HALF) + UNSUPPORTED_INFO(PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_HALF) + UNSUPPORTED_INFO(PI_DEVICE_INFO_MAX_NUM_SUB_GROUPS) + UNSUPPORTED_INFO(PI_DEVICE_INFO_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS) + UNSUPPORTED_INFO(PI_DEVICE_INFO_SUB_GROUP_SIZES_INTEL) + UNSUPPORTED_INFO(PI_DEVICE_INFO_IL_VERSION) + UNSUPPORTED_INFO(PI_DEVICE_INFO_USM_HOST_SUPPORT) + UNSUPPORTED_INFO(PI_DEVICE_INFO_USM_DEVICE_SUPPORT) + UNSUPPORTED_INFO(PI_DEVICE_INFO_USM_SINGLE_SHARED_SUPPORT) + UNSUPPORTED_INFO(PI_DEVICE_INFO_USM_CROSS_SHARED_SUPPORT) + UNSUPPORTED_INFO(PI_DEVICE_INFO_USM_SYSTEM_SHARED_SUPPORT) + +#undef UNSUPPORTED_INFO + default: + DIE_NO_IMPLEMENTATION; + } return PI_SUCCESS; } @@ -593,13 +978,28 @@ pi_result piContextCreate(const pi_context_properties *Properties, const void *PrivateInfo, size_t CB, void *UserData), void *UserData, pi_context *RetContext) { - (void)Properties; - (void)NumDevices; - (void)Devices; - (void)PFnNotify; - (void)UserData; - (void)RetContext; - DIE_NO_IMPLEMENTATION; + if (NumDevices != 1) { + return PI_INVALID_VALUE; + } + assert(Devices); + assert(RetContext); + + cm_support::CmDevice *device = nullptr; + unsigned int version = 0; + + int result = cm_support::CreateCmDevice(device, version); + + if (result != cm_support::CM_SUCCESS) { + return PI_INVALID_VALUE; + } + + try { + *RetContext = new _pi_context(*Devices, device); + } catch (const std::bad_alloc &) { + return PI_OUT_OF_HOST_MEMORY; + } catch (...) 
{
+ return PI_ERROR_UNKNOWN;
+ }
 return PI_SUCCESS;
 }
@@ -633,18 +1033,36 @@ pi_result piContextRetain(pi_context) {
 }
 pi_result piContextRelease(pi_context Context) {
- (void)Context;
- DIE_NO_IMPLEMENTATION;
+ if ((Context == nullptr) || (Context->CmDevicePtr == nullptr)) {
+ return PI_INVALID_CONTEXT;
+ }
+
+ int result = cm_support::DestroyCmDevice(Context->CmDevicePtr);
+ if (result != cm_support::CM_SUCCESS) {
+ return PI_INVALID_CONTEXT;
+ }
+
+ delete Context;
 return PI_SUCCESS;
 }
 pi_result piQueueCreate(pi_context Context, pi_device Device,
 pi_queue_properties Properties, pi_queue *Queue) {
- (void)Context;
- (void)Device;
- (void)Properties;
- (void)Queue;
- DIE_NO_IMPLEMENTATION;
+ cm_support::CmQueue *cmQueue;
+
+ int result = Context->CmDevicePtr->CreateQueue(cmQueue);
+ if (result != cm_support::CM_SUCCESS) {
+ return PI_INVALID_CONTEXT;
+ }
+
+ try {
+ *Queue = new _pi_queue(Context, cmQueue);
+ } catch (const std::bad_alloc &) {
+ return PI_OUT_OF_HOST_MEMORY;
+ } catch (...) {
+ return PI_ERROR_UNKNOWN;
+ }
+
+ return PI_SUCCESS;
 }
@@ -659,8 +1077,13 @@ pi_result piQueueRetain(pi_queue) {
 }
 pi_result piQueueRelease(pi_queue Queue) {
- (void)Queue;
- DIE_NO_IMPLEMENTATION;
+ if ((Queue == nullptr) || (Queue->CmQueuePtr == nullptr)) {
+ return PI_INVALID_QUEUE;
+ }
+
+ // TODO : Destroy 'Queue->CmQueuePtr'? 
+ delete Queue; + return PI_SUCCESS; } @@ -683,13 +1106,42 @@ pi_result piextQueueCreateWithNativeHandle(pi_native_handle, pi_context, pi_result piMemBufferCreate(pi_context Context, pi_mem_flags Flags, size_t Size, void *HostPtr, pi_mem *RetMem, const pi_mem_properties *properties) { - (void)Context; - (void)Flags; - (void)Size; - (void)HostPtr; - (void)RetMem; - (void)properties; - DIE_NO_IMPLEMENTATION; + assert((Flags & PI_MEM_FLAGS_ACCESS_RW) != 0); + assert(Context); + assert(RetMem); + + cm_support::CmBuffer *CmBuf = nullptr; + cm_support::SurfaceIndex *CmIndex; + + int status = Context->CmDevicePtr->CreateBuffer( + static_cast(Size), CmBuf); + + if (status != cm_support::CM_SUCCESS) { + return PI_OUT_OF_HOST_MEMORY; + } + + status = CmBuf->GetIndex(CmIndex); + + if ((Flags & PI_MEM_FLAGS_HOST_PTR_USE) != 0 || + (Flags & PI_MEM_FLAGS_HOST_PTR_COPY) != 0) { + status = + CmBuf->WriteSurface(reinterpret_cast(HostPtr), + nullptr, static_cast(Size)); + } + + auto HostPtrOrNull = + (Flags & PI_MEM_FLAGS_HOST_PTR_USE) ? pi_cast(HostPtr) : nullptr; + + try { + *RetMem = + new _pi_buffer(Context, HostPtrOrNull, CmBuf, + /* integer buffer index */ CmIndex->get_data(), Size); + } catch (const std::bad_alloc &) { + return PI_OUT_OF_HOST_MEMORY; + } catch (...) 
{ + return PI_ERROR_UNKNOWN; + } + return PI_SUCCESS; } @@ -704,22 +1156,138 @@ pi_result piMemRetain(pi_mem) { } pi_result piMemRelease(pi_mem Mem) { - (void)Mem; - DIE_NO_IMPLEMENTATION; + if (Mem->getMemType() == PI_MEM_TYPE_BUFFER) { + _pi_buffer *pi_buf = static_cast<_pi_buffer *>(Mem); + int result = Mem->Context->CmDevicePtr->DestroySurface(pi_buf->CmBufferPtr); + + if (result != cm_support::CM_SUCCESS) { + return PI_INVALID_MEM_OBJECT; + } + } else if (Mem->getMemType() == PI_MEM_TYPE_IMAGE2D) { + _pi_image *pi_image = static_cast<_pi_image *>(Mem); + int result = + Mem->Context->CmDevicePtr->DestroySurface(pi_image->CmSurfacePtr); + + if (result != cm_support::CM_SUCCESS) { + return PI_INVALID_MEM_OBJECT; + } + } else if (Mem->getMemType() == PI_MEM_TYPE_IMAGE2D) { + _pi_image *pi_img = static_cast<_pi_image *>(Mem); + int result = + Mem->Context->CmDevicePtr->DestroySurface(pi_img->CmSurfacePtr); + if (result != cm_support::CM_SUCCESS) { + return PI_INVALID_MEM_OBJECT; + } + } else { + return PI_INVALID_MEM_OBJECT; + } + return PI_SUCCESS; } +cm_support::CM_SURFACE_FORMAT +piImageFormatToCmFormat(const pi_image_format *piFormat) { + using ULongPair = std::pair; + using FmtMap = std::map; + static const FmtMap pi2cm = { + {{PI_IMAGE_CHANNEL_TYPE_UNORM_INT8, PI_IMAGE_CHANNEL_ORDER_RGBA}, + cm_support::CM_SURFACE_FORMAT_A8R8G8B8}, + + {{PI_IMAGE_CHANNEL_TYPE_UNORM_INT8, PI_IMAGE_CHANNEL_ORDER_ARGB}, + cm_support::CM_SURFACE_FORMAT_A8R8G8B8}, + + {{PI_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8, PI_IMAGE_CHANNEL_ORDER_RGBA}, + cm_support::CM_SURFACE_FORMAT_A8R8G8B8}, + + {{PI_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32, PI_IMAGE_CHANNEL_ORDER_RGBA}, + cm_support::CM_SURFACE_FORMAT_R32G32B32A32F}, + }; + auto result = pi2cm.find( + {piFormat->image_channel_data_type, piFormat->image_channel_order}); + if (result != pi2cm.end()) { + return result->second; + } + DIE_NO_IMPLEMENTATION; + return cm_support::CM_SURFACE_FORMAT_A8R8G8B8; +} + pi_result piMemImageCreate(pi_context 
Context, pi_mem_flags Flags, const pi_image_format *ImageFormat, const pi_image_desc *ImageDesc, void *HostPtr, pi_mem *RetImage) { - (void)Context; - (void)Flags; - (void)ImageFormat; - (void)ImageDesc; - (void)HostPtr; - (void)RetImage; - DIE_NO_IMPLEMENTATION; + if (ImageFormat == nullptr || ImageDesc == nullptr) + return PI_INVALID_VALUE; + + switch (ImageDesc->image_type) { + case PI_MEM_TYPE_IMAGE2D: + break; + case PI_MEM_TYPE_IMAGE1D_BUFFER: + // NOTE : Temporarily added for enabling vadd_1d and + // vadd_raw_send. Remove for migration to github as + // 'wrapIntoImageBuffer' is deprecated in github repo + assert(ImageFormat->image_channel_data_type == + PI_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8); + assert(ImageFormat->image_channel_order == PI_IMAGE_CHANNEL_ORDER_R); + assert(ImageDesc->image_height == 0); + return piMemBufferCreate(Context, Flags, ImageDesc->image_width, HostPtr, + RetImage); + default: + return PI_INVALID_MEM_OBJECT; + } + + auto bytesPerPixel = 4; + switch (ImageFormat->image_channel_data_type) { + case PI_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32: + bytesPerPixel = 16; + break; + case PI_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8: + case PI_IMAGE_CHANNEL_TYPE_UNORM_INT8: + bytesPerPixel = 4; + break; + default: + return PI_INVALID_VALUE; + } + + cm_support::CmSurface2D *CmSurface = nullptr; + cm_support::SurfaceIndex *CmIndex; + + int status = Context->CmDevicePtr->CreateSurface2D( + static_cast(ImageDesc->image_width), + static_cast(ImageDesc->image_height), + piImageFormatToCmFormat(ImageFormat), CmSurface); + + if (status != cm_support::CM_SUCCESS) { + return PI_OUT_OF_HOST_MEMORY; + } + + status = CmSurface->GetIndex(CmIndex); + + if ((Flags & PI_MEM_FLAGS_HOST_PTR_USE) != 0 || + (Flags & PI_MEM_FLAGS_HOST_PTR_COPY) != 0) { + + if (HostPtr != nullptr) { + + status = CmSurface->WriteSurface( + reinterpret_cast(HostPtr), nullptr, + static_cast(ImageDesc->image_width * + ImageDesc->image_height * bytesPerPixel)); + } + } + + auto HostPtrOrNull = + (Flags & 
PI_MEM_FLAGS_HOST_PTR_USE) ? pi_cast(HostPtr) : nullptr; + + try { + *RetImage = new _pi_image(Context, HostPtrOrNull, CmSurface, + /* integer surface index */ CmIndex->get_data(), + ImageDesc->image_width, ImageDesc->image_height, + bytesPerPixel); + } catch (const std::bad_alloc &) { + return PI_OUT_OF_HOST_MEMORY; + } catch (...) { + return PI_ERROR_UNKNOWN; + } + return PI_SUCCESS; } @@ -875,19 +1443,21 @@ pi_result piEventGetInfo(pi_event, pi_event_info, size_t, void *, size_t *) { pi_result piEventGetProfilingInfo(pi_event Event, pi_profiling_info ParamName, size_t ParamValueSize, void *ParamValue, size_t *ParamValueSizeRet) { - (void)Event; - (void)ParamName; - (void)ParamValueSize; - (void)ParamValue; - (void)ParamValueSizeRet; - DIE_NO_IMPLEMENTATION; + std::cerr << "Warning : Profiling Not supported under PI_ESIMD_CPU" + << std::endl; return PI_SUCCESS; } pi_result piEventsWait(pi_uint32 NumEvents, const pi_event *EventList) { - (void)NumEvents; - (void)EventList; - DIE_NO_IMPLEMENTATION; + for (int i = 0; i < (int)NumEvents; i++) { + if (EventList[i]->IsDummyEvent) { + continue; + } + int result = EventList[i]->CmEventPtr->WaitForTaskFinished(); + if (result != cm_support::CM_SUCCESS) { + return PI_OUT_OF_RESOURCES; + } + } return PI_SUCCESS; } @@ -908,8 +1478,17 @@ pi_result piEventRetain(pi_event) { } pi_result piEventRelease(pi_event Event) { - (void)Event; - DIE_NO_IMPLEMENTATION; + if (!Event->IsDummyEvent) { + if ((Event->CmEventPtr == nullptr) || (Event->OwnerQueue == nullptr)) { + return PI_INVALID_EVENT; + } + int result = Event->OwnerQueue->DestroyEvent(Event->CmEventPtr); + if (result != cm_support::CM_SUCCESS) { + return PI_INVALID_EVENT; + } + } + delete Event; + return PI_SUCCESS; } @@ -962,16 +1541,32 @@ pi_result piEnqueueMemBufferRead(pi_queue Queue, pi_mem Src, pi_uint32 NumEventsInWaitList, const pi_event *EventWaitList, pi_event *Event) { - (void)Queue; - (void)Src; - (void)BlockingRead; - (void)Offset; - (void)Size; - (void)Dst; - 
(void)NumEventsInWaitList; - (void)EventWaitList; - (void)Event; - DIE_NO_IMPLEMENTATION; + /// TODO : Support Blocked read, 'Queue' handling + assert(BlockingRead == false); + assert(NumEventsInWaitList == 0); + + _pi_buffer *buf = static_cast<_pi_buffer *>(Src); + + int status = + buf->CmBufferPtr->ReadSurface(reinterpret_cast(Dst), + nullptr, // event + static_cast(Size)); + + if (status != cm_support::CM_SUCCESS) { + return PI_INVALID_MEM_OBJECT; + } + + if (Event) { + try { + *Event = new _pi_event(); + } catch (const std::bad_alloc &) { + return PI_OUT_OF_HOST_MEMORY; + } catch (...) { + return PI_ERROR_UNKNOWN; + } + (*Event)->IsDummyEvent = true; + } + return PI_SUCCESS; } @@ -1048,18 +1643,25 @@ pi_result piEnqueueMemImageRead(pi_queue command_queue, pi_mem image, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event) { - (void)command_queue; - (void)image; - (void)blocking_read; - (void)origin; - (void)region; - (void)row_pitch; - (void)slice_pitch; - (void)ptr; - (void)num_events_in_wait_list; - (void)event_wait_list; - (void)event; - DIE_NO_IMPLEMENTATION; + _pi_image *img = static_cast<_pi_image *>(image); + int status = + img->CmSurfacePtr->ReadSurface(reinterpret_cast(ptr), + nullptr, // event + row_pitch * (region->height)); + if (status != cm_support::CM_SUCCESS) { + return PI_INVALID_MEM_OBJECT; + } + + if (event) { + try { + *event = new _pi_event(); + } catch (const std::bad_alloc &) { + return PI_OUT_OF_HOST_MEMORY; + } catch (...) 
{ + return PI_ERROR_UNKNOWN; + } + (*event)->IsDummyEvent = true; + } return PI_SUCCESS; } @@ -1096,17 +1698,26 @@ piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim, const size_t *GlobalWorkSize, const size_t *LocalWorkSize, pi_uint32 NumEventsInWaitList, const pi_event *EventWaitList, pi_event *Event) { - (void)Queue; - (void)Kernel; - (void)WorkDim; - (void)GlobalWorkOffset; - (void)GlobalWorkSize; - (void)LocalWorkSize; - (void)NumEventsInWaitList; - (void)EventWaitList; - (void)Event; - DIE_NO_IMPLEMENTATION; - return PI_SUCCESS; + switch (WorkDim) { + case 1: + InvokeImpl<1, sycl::nd_item<1>>::invoke(Kernel, GlobalWorkOffset, + GlobalWorkSize, LocalWorkSize); + return PI_SUCCESS; + + case 2: + InvokeImpl<2, sycl::nd_item<2>>::invoke(Kernel, GlobalWorkOffset, + GlobalWorkSize, LocalWorkSize); + return PI_SUCCESS; + + case 3: + InvokeImpl<3, sycl::nd_item<3>>::invoke(Kernel, GlobalWorkOffset, + GlobalWorkSize, LocalWorkSize); + return PI_SUCCESS; + + default: + DIE_NO_IMPLEMENTATION; + return PI_ERROR_UNKNOWN; + } } pi_result piextKernelCreateWithNativeHandle(pi_native_handle, pi_context, bool, @@ -1149,20 +1760,36 @@ pi_result piextUSMSharedAlloc(void **ResultPtr, pi_context Context, pi_device Device, pi_usm_mem_properties *Properties, size_t Size, pi_uint32 Alignment) { - (void)ResultPtr; - (void)Context; - (void)Device; - (void)Properties; - (void)Size; - (void)Alignment; - DIE_NO_IMPLEMENTATION; + assert(Context); + assert(ResultPtr); + + cm_support::CmBufferSVM *buf = nullptr; + void *pSystemMem = nullptr; + int32_t ret = Context->CmDevicePtr->CreateBufferSVM( + Size, pSystemMem, CM_SVM_ACCESS_FLAG_DEFAULT, buf); + + if (ret != cm_support::CM_SUCCESS) { + return PI_OUT_OF_HOST_MEMORY; + } + *ResultPtr = pSystemMem; + auto it = Context->Addr2CmBufferSVM.find(pSystemMem); + assert(Context->Addr2CmBufferSVM.end() == it); + Context->Addr2CmBufferSVM[pSystemMem] = buf; return PI_SUCCESS; } pi_result piextUSMFree(pi_context Context, void 
*Ptr) { - (void)Context; - (void)Ptr; - DIE_NO_IMPLEMENTATION; + assert(Context); + assert(Ptr); + + cm_support::CmBufferSVM *buf = Context->Addr2CmBufferSVM[Ptr]; + assert(buf); + auto count = Context->Addr2CmBufferSVM.erase(Ptr); + assert(1 == count); + int32_t ret = Context->CmDevicePtr->DestroyBufferSVM(buf); + if (cm_support::CM_SUCCESS != ret) { + return PI_ERROR_UNKNOWN; + } return PI_SUCCESS; } diff --git a/sycl/plugins/esimd_cpu/pi_esimd_cpu.hpp b/sycl/plugins/esimd_cpu/pi_esimd_cpu.hpp index e6b17be78db34..55dfb9528ad8a 100755 --- a/sycl/plugins/esimd_cpu/pi_esimd_cpu.hpp +++ b/sycl/plugins/esimd_cpu/pi_esimd_cpu.hpp @@ -23,6 +23,10 @@ #include +namespace cm_support { +#include +} // namespace cm_support + template To pi_cast(From Value) { // TODO: see if more sanity checks are possible. assert(sizeof(From) == sizeof(To)); @@ -43,4 +47,5 @@ template <> uint32_t pi_cast(uint64_t Value) { std::terminate(); } -#include +#include +#include diff --git a/sycl/source/detail/config.hpp b/sycl/source/detail/config.hpp index dd91d24eba642..158a540771a1c 100644 --- a/sycl/source/detail/config.hpp +++ b/sycl/source/detail/config.hpp @@ -131,11 +131,12 @@ template <> class SYCLConfig { return BackendPtr; const char *ValStr = BaseT::getRawValue(); - const std::array, 5> SyclBeMap = { + const std::array, 6> SyclBeMap = { {{"PI_OPENCL", backend::opencl}, {"PI_LEVEL_ZERO", backend::level_zero}, {"PI_LEVEL0", backend::level_zero}, // for backward compatibility {"PI_CUDA", backend::cuda}, + {"PI_ESIMD_CPU", backend::esimd_cpu}, {"PI_ROCM", backend::rocm}}}; if (ValStr) { auto It = std::find_if( @@ -145,7 +146,8 @@ template <> class SYCLConfig { }); if (It == SyclBeMap.end()) pi::die("Invalid backend. 
" - "Valid values are PI_OPENCL/PI_LEVEL_ZERO/PI_CUDA/PI_ROCM"); + "Valid values are " + "PI_OPENCL/PI_LEVEL_ZERO/PI_CUDA/PI_ESIMD_CPU/PI_ROCM"); static backend Backend = It->second; BackendPtr = &Backend; } @@ -185,11 +187,12 @@ static const std::array, 5> {"*", info::device_type::all}}}; // Array is used by SYCL_DEVICE_FILTER and SYCL_DEVICE_ALLOWLIST -static const std::array, 5> SyclBeMap = { +static const std::array, 6> SyclBeMap = { {{"host", backend::host}, {"opencl", backend::opencl}, {"level_zero", backend::level_zero}, {"cuda", backend::cuda}, + {"esimd_cpu", backend::esimd_cpu}, {"*", backend::all}}}; template <> class SYCLConfig { diff --git a/sycl/source/detail/device_filter.cpp b/sycl/source/detail/device_filter.cpp index 8558d0d9e4f94..a438911fe4696 100644 --- a/sycl/source/detail/device_filter.cpp +++ b/sycl/source/detail/device_filter.cpp @@ -75,7 +75,8 @@ device_filter::device_filter(const std::string &FilterString) { } catch (...) { std::string Message = std::string("Invalid device filter: ") + FilterString + - "\nPossible backend values are {host,opencl,level_zero,cuda,*}.\n" + "\nPossible backend values are " + "{host,opencl,level_zero,cuda,esimd_cpu*}.\n" "Possible device types are {host,cpu,gpu,acc,*}.\n" "Device number should be an non-negative integer.\n"; throw cl::sycl::invalid_parameter_error(Message, PI_INVALID_VALUE); diff --git a/sycl/source/detail/pi.cpp b/sycl/source/detail/pi.cpp index 5745e4c2c58ae..1f4257e63d667 100644 --- a/sycl/source/detail/pi.cpp +++ b/sycl/source/detail/pi.cpp @@ -237,12 +237,14 @@ bool findPlugins(vector_class> &PluginNames) { PluginNames.emplace_back(__SYCL_LEVEL_ZERO_PLUGIN_NAME, backend::level_zero); PluginNames.emplace_back(__SYCL_CUDA_PLUGIN_NAME, backend::cuda); + PluginNames.emplace_back(__SYCL_ESIMD_CPU_PLUGIN_NAME, backend::esimd_cpu); PluginNames.emplace_back(__SYCL_ROCM_PLUGIN_NAME, backend::rocm); } else { std::vector Filters = FilterList->get(); bool OpenCLFound = false; bool LevelZeroFound = 
false; bool CudaFound = false; + bool EsimdCpuFound = false; bool RocmFound = false; for (const device_filter &Filter : Filters) { backend Backend = Filter.Backend; @@ -261,6 +263,12 @@ bool findPlugins(vector_class> &PluginNames) { PluginNames.emplace_back(__SYCL_CUDA_PLUGIN_NAME, backend::cuda); CudaFound = true; } + if (!EsimdCpuFound && + (Backend == backend::esimd_cpu || Backend == backend::all)) { + PluginNames.emplace_back(__SYCL_ESIMD_CPU_PLUGIN_NAME, + backend::esimd_cpu); + EsimdCpuFound = true; + } if (!RocmFound && (Backend == backend::rocm || Backend == backend::all)) { PluginNames.emplace_back(__SYCL_ROCM_PLUGIN_NAME, backend::rocm); RocmFound = true; @@ -379,6 +387,11 @@ static void initializePlugins(vector_class *Plugins) { // Use the LEVEL_ZERO plugin as the GlobalPlugin GlobalPlugin = std::make_shared(PluginInformation, backend::level_zero, Library); + } else if (InteropBE == backend::esimd_cpu && + PluginNames[I].first.find("esimd_cpu") != std::string::npos) { + // Use the ESIMD_CPU plugin as the GlobalPlugin + GlobalPlugin = std::make_shared(PluginInformation, + backend::esimd_cpu, Library); } Plugins->emplace_back( plugin(PluginInformation, PluginNames[I].second, Library)); diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 2734a64776761..202488aaf9c31 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -1937,7 +1937,8 @@ cl_int ExecCGCommand::enqueueImp() { NDRDescT &NDRDesc = ExecKernel->MNDRDesc; - if (MQueue->is_host()) { + if (MQueue->is_host() || + (MQueue->getPlugin().getBackend() == backend::esimd_cpu)) { for (ArgDesc &Arg : ExecKernel->MArgs) if (kernel_param_kind_t::kind_accessor == Arg.MType) { Requirement *Req = (Requirement *)(Arg.MPtr); @@ -1949,8 +1950,18 @@ cl_int ExecCGCommand::enqueueImp() { const detail::plugin &Plugin = EventImpls[0]->getPlugin(); Plugin.call(RawEvents.size(), &RawEvents[0]); } - 
ExecKernel->MHostKernel->call(NDRDesc, - getEvent()->getHostProfilingInfo()); + + if (MQueue->is_host()) { + ExecKernel->MHostKernel->call(NDRDesc, + getEvent()->getHostProfilingInfo()); + } else { + assert(MQueue->getPlugin().getBackend() == backend::esimd_cpu); + MQueue->getPlugin().call( + nullptr, + reinterpret_cast(ExecKernel->MHostKernel->getPtr()), + NDRDesc.Dims, &NDRDesc.GlobalOffset[0], &NDRDesc.GlobalSize[0], + &NDRDesc.LocalSize[0], 0, nullptr, nullptr); + } return CL_SUCCESS; }