From c2f7fecde86367d1e0f06653410c99a655ea1eba Mon Sep 17 00:00:00 2001 From: David Wrighton Date: Mon, 19 May 2025 16:16:15 -0700 Subject: [PATCH 1/3] Reapply "Create a single copy of stub templates (#114462)" (#115665) This reverts commit f7fc17859c961a3b2b58a8502a69fd37cc5e2b6b. --- src/coreclr/clrdefinitions.cmake | 4 + src/coreclr/inc/executableallocator.h | 15 + src/coreclr/inc/loaderheap.h | 24 +- src/coreclr/minipal/Unix/doublemapping.cpp | 323 ++++++++++++++++++ src/coreclr/minipal/Windows/doublemapping.cpp | 20 ++ src/coreclr/minipal/minipal.h | 35 ++ .../nativeaot/Runtime/unix/PalRedhawkUnix.cpp | 2 +- src/coreclr/utilcode/executableallocator.cpp | 76 ++++- .../utilcode/interleavedloaderheap.cpp | 98 +++++- src/coreclr/vm/amd64/thunktemplates.S | 146 ++++++++ src/coreclr/vm/arm64/thunktemplates.S | 111 ++++++ src/coreclr/vm/callcounting.cpp | 27 +- src/coreclr/vm/callcounting.h | 2 +- src/coreclr/vm/loaderallocator.cpp | 9 +- src/coreclr/vm/precode.cpp | 53 ++- src/coreclr/vm/precode.h | 9 +- 16 files changed, 904 insertions(+), 50 deletions(-) diff --git a/src/coreclr/clrdefinitions.cmake b/src/coreclr/clrdefinitions.cmake index efb6ab0738a1a5..cb3b645ff0e58d 100644 --- a/src/coreclr/clrdefinitions.cmake +++ b/src/coreclr/clrdefinitions.cmake @@ -218,6 +218,10 @@ if (FEATURE_STUBPRECODE_DYNAMIC_HELPERS) add_definitions(-DFEATURE_STUBPRECODE_DYNAMIC_HELPERS) endif() +if (CLR_CMAKE_TARGET_APPLE) + add_definitions(-DFEATURE_MAP_THUNKS_FROM_IMAGE) +endif() + # Use this function to enable building with a specific target OS and architecture set of defines # This is known to work for the set of defines used by the JIT and gcinfo, it is not likely correct for # other components of the runtime diff --git a/src/coreclr/inc/executableallocator.h b/src/coreclr/inc/executableallocator.h index 11caf3a6857d2d..973b950ad369bc 100644 --- a/src/coreclr/inc/executableallocator.h +++ b/src/coreclr/inc/executableallocator.h @@ -182,6 +182,9 @@ class ExecutableAllocator // Return true if double mapping is enabled. static bool IsDoubleMappingEnabled(); + // Release memory allocated via DoubleMapping for either templates or normal double mapped data + void ReleaseWorker(void* pRX, bool releaseTemplate); + // Initialize the allocator instance bool Initialize(); @@ -262,6 +265,18 @@ class ExecutableAllocator // Unmap the RW mapping at the specified address void UnmapRW(void* pRW); + + // Allocate thunks from a template. pTemplate is the return value from CreateTemplate + void* AllocateThunksFromTemplate(void *pTemplate, size_t templateSize); + + // Free a set of thunks allocated from templates. pThunks must have been returned from AllocateThunksFromTemplate + void FreeThunksFromTemplate(void *pThunks, size_t templateSize); + + // Create a template + // If templateInImage is not null, it will attempt to use it as the template, otherwise it will create an temporary in memory file to serve as the template + // Some OS/Architectures may/may not be able to work with this, so this api is permitted to return NULL, and callers should have an alternate approach using + // the codePageGenerator directly. 
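+    // A NULL return is not an error condition: UnlockedInterleavedLoaderHeap treats a NULL Template in its
+    // InterleavedLoaderHeapConfig as "no template available" and falls back to committing pages and running
+    // the codePageGenerator itself.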
+ void* CreateTemplate(void* templateInImage, size_t templateSize, void (*codePageGenerator)(uint8_t* pageBase, uint8_t* pageBaseRX, size_t size)); }; #define ExecutableWriterHolder ExecutableWriterHolderNoLog diff --git a/src/coreclr/inc/loaderheap.h b/src/coreclr/inc/loaderheap.h index 782f93cedc6264..d3040e0b4aa448 100644 --- a/src/coreclr/inc/loaderheap.h +++ b/src/coreclr/inc/loaderheap.h @@ -455,10 +455,19 @@ class UnlockedLoaderHeap : public UnlockedLoaderHeapBase static void WeGotAFaultNowWhat(UnlockedLoaderHeap *pHeap); }; +struct InterleavedLoaderHeapConfig +{ + uint32_t StubSize; + void* Template; + void (*CodePageGenerator)(uint8_t* pageBase, uint8_t* pageBaseRX, size_t size); +}; + +void InitializeLoaderHeapConfig(InterleavedLoaderHeapConfig *pConfig, size_t stubSize, void* templateInImage, void (*codePageGenerator)(uint8_t* pageBase, uint8_t* pageBaseRX, size_t size)); + //=============================================================================== // This is the base class for InterleavedLoaderHeap It's used as a simple // allocator for stubs in a scheme where each stub is a small fixed size, and is paired -// with memory which is GetOSStubPageSize() bytes away. In addition there is an +// with memory which is GetStubCodePageSize() bytes away. In addition there is an // ability to free is via a "backout" mechanism that is not considered to have good performance. // //=============================================================================== @@ -492,16 +501,13 @@ class UnlockedInterleavedLoaderHeap : public UnlockedLoaderHeapBase InterleavedStubFreeListNode *m_pFreeListHead; -public: -public: - void (*m_codePageGenerator)(BYTE* pageBase, BYTE* pageBaseRX, SIZE_T size); + const InterleavedLoaderHeapConfig *m_pConfig; #ifndef DACCESS_COMPILE protected: UnlockedInterleavedLoaderHeap( RangeList *pRangeList, - void (*codePageGenerator)(BYTE* pageBase, BYTE* pageBaseRX, SIZE_T size), - DWORD dwGranularity); + const InterleavedLoaderHeapConfig *pConfig); virtual ~UnlockedInterleavedLoaderHeap(); #endif @@ -1039,13 +1045,11 @@ class InterleavedLoaderHeap : public UnlockedInterleavedLoaderHeap public: InterleavedLoaderHeap(RangeList *pRangeList, BOOL fUnlocked, - void (*codePageGenerator)(BYTE* pageBase, BYTE* pageBaseRX, SIZE_T size), - DWORD dwGranularity + const InterleavedLoaderHeapConfig *pConfig ) : UnlockedInterleavedLoaderHeap( pRangeList, - codePageGenerator, - dwGranularity), + pConfig), m_CriticalSection(fUnlocked ? 
NULL : CreateLoaderHeapLock()) { WRAPPER_NO_CONTRACT; diff --git a/src/coreclr/minipal/Unix/doublemapping.cpp b/src/coreclr/minipal/Unix/doublemapping.cpp index b866da9f93e6f1..4a2516bea58484 100644 --- a/src/coreclr/minipal/Unix/doublemapping.cpp +++ b/src/coreclr/minipal/Unix/doublemapping.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -25,6 +26,11 @@ #include "minipal.h" #include "minipal/cpufeatures.h" +#ifndef TARGET_APPLE +#include +#include +#endif // TARGET_APPLE + #ifdef TARGET_APPLE #include @@ -253,3 +259,320 @@ bool VMToOSInterface::ReleaseRWMapping(void* pStart, size_t size) { return munmap(pStart, size) != -1; } + +#ifndef TARGET_APPLE +#define MAX_TEMPLATE_THUNK_TYPES 3 // Maximum number of times the CreateTemplate api can be called +struct TemplateThunkMappingData +{ + int fdImage; + off_t offsetInFileOfStartOfSection; + void* addrOfStartOfSection; // Always NULL if the template mapping data could not be initialized + void* addrOfEndOfSection; + bool imageTemplates; + int templatesCreated; + off_t nonImageTemplateCurrent; +}; + +struct InitializeTemplateThunkLocals +{ + void* pTemplate; + Dl_info info; + TemplateThunkMappingData data; +}; + +static TemplateThunkMappingData *s_pThunkData = NULL; + +#ifdef FEATURE_MAP_THUNKS_FROM_IMAGE + +static Elf32_Word Elf32_WordMin(Elf32_Word left, Elf32_Word right) +{ + return left < right ? left : right; +} + +static int InitializeTemplateThunkMappingDataPhdrCallback(struct dl_phdr_info *info, size_t size, void *dataPtr) +{ + InitializeTemplateThunkLocals *locals = (InitializeTemplateThunkLocals*)dataPtr; + + if ((void*)info->dlpi_addr == locals->info.dli_fbase) + { + for (size_t j = 0; j < info->dlpi_phnum; j++) + { + uint8_t* baseSectionAddr = (uint8_t*)locals->info.dli_fbase + info->dlpi_phdr[j].p_vaddr; + if (locals->pTemplate < baseSectionAddr) + { + // Address is before the virtual address of this section begins + continue; + } + + // Since this is all in support of mapping code from the file, we need to ensure that the region we find + // is actually present in the file. + Elf32_Word sizeOfSectionWhichCanBeMapped = Elf32_WordMin(info->dlpi_phdr[j].p_filesz, info->dlpi_phdr[j].p_memsz); + + uint8_t* endAddressAllowedForTemplate = baseSectionAddr + sizeOfSectionWhichCanBeMapped; + if (locals->pTemplate >= endAddressAllowedForTemplate) + { + // Template is after the virtual address of this section ends (or the mappable region of the file) + continue; + } + + // At this point, we have found the template section. Attempt to open the file, and record the various offsets for future use + + if (strlen(info->dlpi_name) == 0) + { + // This image cannot be directly referenced without capturing the argv[0] parameter + return -1; + } + + int fdImage = open(info->dlpi_name, O_RDONLY); + if (fdImage == -1) + { + return -1; // Opening the image didn't work + } + + locals->data.fdImage = fdImage; + locals->data.offsetInFileOfStartOfSection = info->dlpi_phdr[j].p_offset; + locals->data.addrOfStartOfSection = baseSectionAddr; + locals->data.addrOfEndOfSection = baseSectionAddr + sizeOfSectionWhichCanBeMapped; + locals->data.imageTemplates = true; + return 1; // We have found the result. Abort further processing. 
+ } + } + + // This isn't the interesting .so + return 0; +} +#endif // FEATURE_MAP_THUNKS_FROM_IMAGE + +TemplateThunkMappingData *InitializeTemplateThunkMappingData(void* pTemplate) +{ + InitializeTemplateThunkLocals locals; + locals.pTemplate = pTemplate; + locals.data.fdImage = 0; + locals.data.offsetInFileOfStartOfSection = 0; + locals.data.addrOfStartOfSection = NULL; + locals.data.addrOfEndOfSection = NULL; + locals.data.imageTemplates = false; + locals.data.nonImageTemplateCurrent = 0; + locals.data.templatesCreated = 0; + +#ifdef FEATURE_MAP_THUNKS_FROM_IMAGE + if (dladdr(pTemplate, &locals.info) != 0) + { + dl_iterate_phdr(InitializeTemplateThunkMappingDataPhdrCallback, &locals); + } +#endif // FEATURE_MAP_THUNKS_FROM_IMAGE + + if (locals.data.addrOfStartOfSection == NULL) + { + // This is the detail of thunk data which indicates if we were able to compute the template mapping data from the image. + +#ifdef TARGET_FREEBSD + int fd = shm_open(SHM_ANON, O_RDWR | O_CREAT, S_IRWXU); +#elif defined(TARGET_LINUX) || defined(TARGET_ANDROID) + int fd = memfd_create("doublemapper-template", MFD_CLOEXEC); +#else + int fd = -1; + +#ifndef TARGET_ANDROID + // Bionic doesn't have shm_{open,unlink} + // POSIX fallback + if (fd == -1) + { + char name[24]; + sprintf(name, "/shm-dotnet-template-%d", getpid()); + name[sizeof(name) - 1] = '\0'; + shm_unlink(name); + fd = shm_open(name, O_RDWR | O_CREAT | O_EXCL | O_NOFOLLOW, 0600); + shm_unlink(name); + } +#endif // !TARGET_ANDROID +#endif + if (fd != -1) + { + off_t maxFileSize = MAX_TEMPLATE_THUNK_TYPES * 0x10000; // The largest page size we support currently is 64KB. + if (ftruncate(fd, maxFileSize) == -1) // Reserve a decent size chunk of logical memory for these things. + { + close(fd); + } + else + { + locals.data.fdImage = fd; + locals.data.offsetInFileOfStartOfSection = 0; + // We simulate the template thunk mapping data existing in mapped ram, by declaring that it exists at at + // an address which is not NULL, and which is naturally aligned on the largest page size supported by any + // architecture we support (0x10000). We do this, as the generalized logic here is designed around remapping + // already mapped memory, and by doing this we are able to share that logic. 
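+                // As an illustration (the numbers are hypothetical): the first non-image template is written at
+                // nonImageTemplateCurrent == 0, so CreateTemplate reports it at 0x10000, and a later
+                // AllocateThunksFromTemplate call recovers its file offset as
+                // offsetInFileOfStartOfSection + (pTemplate - addrOfStartOfSection) = 0 + (0x10000 - 0x10000) = 0,
+                // i.e. exactly where the generated code was stored in fdImage.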
+ locals.data.addrOfStartOfSection = (void*)0x10000; + locals.data.addrOfEndOfSection = ((uint8_t*)locals.data.addrOfStartOfSection) + maxFileSize; + locals.data.imageTemplates = false; + } + } + } + + + TemplateThunkMappingData *pAllocatedData = (TemplateThunkMappingData*)malloc(sizeof(TemplateThunkMappingData)); + *pAllocatedData = locals.data; + TemplateThunkMappingData *pExpectedNull = NULL; + if (__atomic_compare_exchange_n (&s_pThunkData, &pExpectedNull, pAllocatedData, false, __ATOMIC_RELEASE, __ATOMIC_RELAXED)) + { + return pAllocatedData; + } + else + { + free(pAllocatedData); + return __atomic_load_n(&s_pThunkData, __ATOMIC_ACQUIRE); + } +} +#endif + +bool VMToOSInterface::AllocateThunksFromTemplateRespectsStartAddress() +{ +#ifdef TARGET_APPLE + return false; +#else + return true; +#endif +} + +void* VMToOSInterface::CreateTemplate(void* pImageTemplate, size_t templateSize, void (*codePageGenerator)(uint8_t* pageBase, uint8_t* pageBaseRX, size_t size)) +{ +#ifdef TARGET_APPLE + return pImageTemplate; +#elif defined(TARGET_X86) + return NULL; // X86 doesn't support high performance relative addressing, which makes the template system not work +#else + if (pImageTemplate == NULL) + return NULL; + + TemplateThunkMappingData* pThunkData = __atomic_load_n(&s_pThunkData, __ATOMIC_ACQUIRE); + if (s_pThunkData == NULL) + { + pThunkData = InitializeTemplateThunkMappingData(pImageTemplate); + } + + // Unable to create template mapping region + if (pThunkData->addrOfStartOfSection == NULL) + { + return NULL; + } + + int templatesCreated = __atomic_add_fetch(&pThunkData->templatesCreated, 1, __ATOMIC_SEQ_CST); + assert(templatesCreated <= MAX_TEMPLATE_THUNK_TYPES); + + if (!pThunkData->imageTemplates) + { + // Need to allocate a memory mapped region to fill in the data + off_t locationInFileToStoreGeneratedCode = __atomic_fetch_add((off_t*)&pThunkData->nonImageTemplateCurrent, (off_t)templateSize, __ATOMIC_SEQ_CST); + void* mappedMemory = mmap(NULL, templateSize, PROT_READ | PROT_WRITE, MAP_SHARED, pThunkData->fdImage, locationInFileToStoreGeneratedCode); + if (mappedMemory != MAP_FAILED) + { + codePageGenerator((uint8_t*)mappedMemory, (uint8_t*)mappedMemory, templateSize); + munmap(mappedMemory, templateSize); + return ((uint8_t*)pThunkData->addrOfStartOfSection) + locationInFileToStoreGeneratedCode; + } + else + { + return NULL; + } + } + else + { + return pImageTemplate; + } +#endif +} + +void* VMToOSInterface::AllocateThunksFromTemplate(void* pTemplate, size_t templateSize, void* pStartSpecification) +{ +#ifdef TARGET_APPLE + vm_address_t addr, taddr; + vm_prot_t prot, max_prot; + kern_return_t ret; + + // Allocate two contiguous ranges of memory: the first range will contain the stubs + // and the second range will contain their data. 
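+    // (The first templateSize bytes become the executable stub page remapped from pTemplate below, while the
+    // second templateSize bytes remain ordinary read/write memory that the stubs address at a fixed
+    // templateSize offset.)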
+ do + { + ret = vm_allocate(mach_task_self(), &addr, templateSize * 2, VM_FLAGS_ANYWHERE); + } while (ret == KERN_ABORTED); + + if (ret != KERN_SUCCESS) + { + return NULL; + } + + do + { + ret = vm_remap( + mach_task_self(), &addr, templateSize, 0, VM_FLAGS_FIXED | VM_FLAGS_OVERWRITE, + mach_task_self(), (vm_address_t)pTemplate, FALSE, &prot, &max_prot, VM_INHERIT_SHARE); + } while (ret == KERN_ABORTED); + + if (ret != KERN_SUCCESS) + { + do + { + ret = vm_deallocate(mach_task_self(), addr, templateSize * 2); + } while (ret == KERN_ABORTED); + + return NULL; + } + return (void*)addr; +#else + TemplateThunkMappingData* pThunkData = __atomic_load_n(&s_pThunkData, __ATOMIC_ACQUIRE); + if (s_pThunkData == NULL) + { + pThunkData = InitializeTemplateThunkMappingData(pTemplate); + } + + if (pThunkData->addrOfStartOfSection == NULL) + { + // This is the detail of thunk data which indicates if we were able to compute the template mapping data + return NULL; + } + + if (pTemplate < pThunkData->addrOfStartOfSection) + { + return NULL; + } + + uint8_t* endOfTemplate = ((uint8_t*)pTemplate + templateSize); + if (endOfTemplate > pThunkData->addrOfEndOfSection) + return NULL; + + size_t sectionOffset = (uint8_t*)pTemplate - (uint8_t*)pThunkData->addrOfStartOfSection; + off_t fileOffset = pThunkData->offsetInFileOfStartOfSection + sectionOffset; + + void *pStart = mmap(pStartSpecification, templateSize * 2, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE | (pStartSpecification != NULL ? MAP_FIXED : 0), -1, 0); + if (pStart == MAP_FAILED) + { + return NULL; + } + + void *pStartCode = mmap(pStart, templateSize, PROT_READ | PROT_EXEC, MAP_PRIVATE | MAP_FIXED, pThunkData->fdImage, fileOffset); + if (pStart != pStartCode) + { + munmap(pStart, templateSize * 2); + return NULL; + } + + return pStart; +#endif +} + +bool VMToOSInterface::FreeThunksFromTemplate(void* thunks, size_t templateSize) +{ +#ifdef TARGET_APPLE + kern_return_t ret; + + do + { + ret = vm_deallocate(mach_task_self(), (vm_address_t)thunks, templateSize * 2); + } while (ret == KERN_ABORTED); + + return ret == KERN_SUCCESS ? 
true : false; +#else + munmap(thunks, templateSize * 2); + return true; +#endif +} diff --git a/src/coreclr/minipal/Windows/doublemapping.cpp b/src/coreclr/minipal/Windows/doublemapping.cpp index 9e8ddfed8e964d..f5f25f2bec92cc 100644 --- a/src/coreclr/minipal/Windows/doublemapping.cpp +++ b/src/coreclr/minipal/Windows/doublemapping.cpp @@ -210,3 +210,23 @@ bool VMToOSInterface::ReleaseRWMapping(void* pStart, size_t size) { return UnmapViewOfFile(pStart); } + +void* VMToOSInterface::CreateTemplate(void* pImageTemplate, size_t templateSize, void (*codePageGenerator)(uint8_t* pageBase, uint8_t* pageBaseRX, size_t size)) +{ + return NULL; +} + +bool VMToOSInterface::AllocateThunksFromTemplateRespectsStartAddress() +{ + return false; +} + +void* VMToOSInterface::AllocateThunksFromTemplate(void* pTemplate, size_t templateSize, void* pStart) +{ + return NULL; +} + +bool VMToOSInterface::FreeThunksFromTemplate(void* thunks, size_t templateSize) +{ + return false; +} diff --git a/src/coreclr/minipal/minipal.h b/src/coreclr/minipal/minipal.h index afecd9ce74dc72..01f497e60e6d7e 100644 --- a/src/coreclr/minipal/minipal.h +++ b/src/coreclr/minipal/minipal.h @@ -75,6 +75,41 @@ class VMToOSInterface // Return: // true if it succeeded, false if it failed static bool ReleaseRWMapping(void* pStart, size_t size); + + // Create a template for use by AllocateThunksFromTemplate + // Parameters: + // pImageTemplate - Address of start of template in the image for coreclr. (All addresses passed to the api in a process must be from the same module, if any call uses a pImageTemplate, all calls MUST) + // templateSize - Size of the template + // codePageGenerator - If the system is unable to use pImageTemplate, use this parameter to generate the code page instead + // + // Return: + // NULL if creating the template fails + // Non-NULL, a pointer to the template + static void* CreateTemplate(void* pImageTemplate, size_t templateSize, void (*codePageGenerator)(uint8_t* pageBase, uint8_t* pageBaseRX, size_t size)); + + // Indicate if the AllocateThunksFromTemplate function respects the pStart address passed to AllocateThunksFromTemplate on this platform + // Return: + // true if the parameter is respected, false if not + static bool AllocateThunksFromTemplateRespectsStartAddress(); + + // Allocate thunks from template + // Parameters: + // pTemplate - Value returned from CreateTemplate + // templateSize - Size of the templates block in the image + // pStart - Where to allocate (Specify NULL if no particular address is required). If non-null, this must be an address returned by ReserveDoubleMappedMemory + // + // Return: + // NULL if the allocation fails + // Non-NULL, a pointer to the allocated region. 
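+    //
+    // Sketch of the intended call sequence (illustrative only):
+    //   void* tpl = VMToOSInterface::CreateTemplate(pImageTemplate, size, generator);
+    //   void* thunks = VMToOSInterface::AllocateThunksFromTemplate(tpl, size, NULL); // code page + data page
+    //   ...
+    //   VMToOSInterface::FreeThunksFromTemplate(thunks, size);                       // releases both pages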
+ static void* AllocateThunksFromTemplate(void* pTemplate, size_t templateSize, void* pStart); + + // Free thunks allocated from template + // Parameters: + // pThunks - Address previously returned by AllocateThunksFromTemplate + // templateSize - Size of the templates block in the image + // Return: + // true if it succeeded, false if it failed + static bool FreeThunksFromTemplate(void* thunks, size_t templateSize); }; #if defined(HOST_64BIT) && defined(FEATURE_CACHED_INTERFACE_DISPATCH) diff --git a/src/coreclr/nativeaot/Runtime/unix/PalRedhawkUnix.cpp b/src/coreclr/nativeaot/Runtime/unix/PalRedhawkUnix.cpp index a928e7018da25f..94ad25ceab8bdb 100644 --- a/src/coreclr/nativeaot/Runtime/unix/PalRedhawkUnix.cpp +++ b/src/coreclr/nativeaot/Runtime/unix/PalRedhawkUnix.cpp @@ -526,7 +526,7 @@ REDHAWK_PALEXPORT UInt32_BOOL REDHAWK_PALAPI PalAllocateThunksFromTemplate(HANDL vm_prot_t prot, max_prot; kern_return_t ret; - // Allocate two contiguous ranges of memory: the first range will contain the trampolines + // Allocate two contiguous ranges of memory: the first range will contain the stubs // and the second range will contain their data. do { diff --git a/src/coreclr/utilcode/executableallocator.cpp b/src/coreclr/utilcode/executableallocator.cpp index d145ab03987a08..0242377072238c 100644 --- a/src/coreclr/utilcode/executableallocator.cpp +++ b/src/coreclr/utilcode/executableallocator.cpp @@ -503,6 +503,11 @@ void* ExecutableAllocator::Commit(void* pStart, size_t size, bool isExecutable) } void ExecutableAllocator::Release(void* pRX) +{ + ReleaseWorker(pRX, false /* this is the standard Release of normally allocated memory */); +} + +void ExecutableAllocator::ReleaseWorker(void* pRX, bool releaseTemplate) { LIMITED_METHOD_CONTRACT; @@ -548,9 +553,19 @@ void ExecutableAllocator::Release(void* pRX) cachedMappingThatOverlaps = FindOverlappingCachedMapping(pBlock); } - if (!VMToOSInterface::ReleaseDoubleMappedMemory(m_doubleMemoryMapperHandle, pRX, pBlock->offset, pBlock->size)) + if (releaseTemplate) { - g_fatalErrorHandler(COR_E_EXECUTIONENGINE, W("Releasing the double mapped memory failed")); + if (!VMToOSInterface::FreeThunksFromTemplate(pRX, pBlock->size / 2)) + { + g_fatalErrorHandler(COR_E_EXECUTIONENGINE, W("Releasing the template mapped memory failed")); + } + } + else + { + if (!VMToOSInterface::ReleaseDoubleMappedMemory(m_doubleMemoryMapperHandle, pRX, pBlock->offset, pBlock->size)) + { + g_fatalErrorHandler(COR_E_EXECUTIONENGINE, W("Releasing the double mapped memory failed")); + } } // Put the released block into the free block list pBlock->baseRX = NULL; @@ -962,3 +977,60 @@ void ExecutableAllocator::UnmapRW(void* pRW) g_fatalErrorHandler(COR_E_EXECUTIONENGINE, W("Releasing the RW mapping failed")); } } + +void* ExecutableAllocator::AllocateThunksFromTemplate(void *pTemplate, size_t templateSize) +{ + if (IsDoubleMappingEnabled() && VMToOSInterface::AllocateThunksFromTemplateRespectsStartAddress()) + { + CRITSEC_Holder csh(m_CriticalSection); + + bool isFreeBlock; + BlockRX* block = AllocateBlock(templateSize * 2, &isFreeBlock); + if (block == NULL) + { + return NULL; + } + + void* result = VMToOSInterface::ReserveDoubleMappedMemory(m_doubleMemoryMapperHandle, block->offset, templateSize * 2, 0, 0); + + if (result != NULL) + { + block->baseRX = result; + AddRXBlock(block); + } + else + { + BackoutBlock(block, isFreeBlock); + } + + void *pTemplateAddressAllocated = VMToOSInterface::AllocateThunksFromTemplate(pTemplate, templateSize, block->baseRX); + + if (pTemplateAddressAllocated 
== NULL) + { + ReleaseWorker(block->baseRX, false); + } + + return pTemplateAddressAllocated; + } + else + { + return VMToOSInterface::AllocateThunksFromTemplate(pTemplate, templateSize, NULL); + } +} + +void ExecutableAllocator::FreeThunksFromTemplate(void *pThunks, size_t templateSize) +{ + if (IsDoubleMappingEnabled() && VMToOSInterface::AllocateThunksFromTemplateRespectsStartAddress()) + { + ReleaseWorker(pThunks, true /* This is a release of template allocated memory */); + } + else + { + VMToOSInterface::FreeThunksFromTemplate(pThunks, templateSize); + } +} + +void* ExecutableAllocator::CreateTemplate(void* templateInImage, size_t templateSize, void (*codePageGenerator)(uint8_t* pageBase, uint8_t* pageBaseRX, size_t size)) +{ + return VMToOSInterface::CreateTemplate(templateInImage, templateSize, codePageGenerator); +} diff --git a/src/coreclr/utilcode/interleavedloaderheap.cpp b/src/coreclr/utilcode/interleavedloaderheap.cpp index d908ea20c194db..082e337caebda1 100644 --- a/src/coreclr/utilcode/interleavedloaderheap.cpp +++ b/src/coreclr/utilcode/interleavedloaderheap.cpp @@ -33,10 +33,13 @@ namespace UnlockedInterleavedLoaderHeap::UnlockedInterleavedLoaderHeap( RangeList *pRangeList, - void (*codePageGenerator)(BYTE* pageBase, BYTE* pageBaseRX, SIZE_T size), - DWORD dwGranularity) : + const InterleavedLoaderHeapConfig *pConfig) : UnlockedLoaderHeapBase(LoaderHeapImplementationKind::Interleaved), - m_pFreeListHead(NULL) + m_pEndReservedRegion(NULL), + m_dwGranularity(pConfig->StubSize), + m_pRangeList(pRangeList), + m_pFreeListHead(NULL), + m_pConfig(pConfig) { CONTRACTL { @@ -46,15 +49,7 @@ UnlockedInterleavedLoaderHeap::UnlockedInterleavedLoaderHeap( } CONTRACTL_END; - m_pEndReservedRegion = NULL; - - m_pRangeList = pRangeList; - _ASSERTE((GetStubCodePageSize() % GetOsPageSize()) == 0); // Stub code page size MUST be in increments of the page size. 
(Really it must be a power of 2 as well, but this is good enough) - m_dwGranularity = dwGranularity; - - _ASSERTE(codePageGenerator != NULL); - m_codePageGenerator = codePageGenerator; } // ~LoaderHeap is not synchronised (obviously) @@ -80,7 +75,14 @@ UnlockedInterleavedLoaderHeap::~UnlockedInterleavedLoaderHeap() pVirtualAddress = pSearch->pVirtualAddress; pNext = pSearch->pNext; - ExecutableAllocator::Instance()->Release(pVirtualAddress); + if (m_pConfig->Template != NULL) + { + ExecutableAllocator::Instance()->FreeThunksFromTemplate(pVirtualAddress, GetStubCodePageSize()); + } + else + { + ExecutableAllocator::Instance()->Release(pVirtualAddress); + } delete pSearch; } @@ -101,6 +103,7 @@ size_t UnlockedInterleavedLoaderHeap::GetBytesAvailReservedRegion() BOOL UnlockedInterleavedLoaderHeap::CommitPages(void* pData, size_t dwSizeToCommitPart) { + _ASSERTE(m_pConfig->Template == NULL); // This path should only be used for LoaderHeaps which use the standard ExecutableAllocator functions // Commit first set of pages, since it will contain the LoaderHeapBlock { void *pTemp = ExecutableAllocator::Instance()->Commit(pData, dwSizeToCommitPart, IsExecutable()); @@ -121,7 +124,7 @@ BOOL UnlockedInterleavedLoaderHeap::CommitPages(void* pData, size_t dwSizeToComm } ExecutableWriterHolder codePageWriterHolder((BYTE*)pData, dwSizeToCommitPart, ExecutableAllocator::DoNotAddToCache); - m_codePageGenerator(codePageWriterHolder.GetRW(), (BYTE*)pData, dwSizeToCommitPart); + m_pConfig->CodePageGenerator(codePageWriterHolder.GetRW(), (BYTE*)pData, dwSizeToCommitPart); FlushInstructionCache(GetCurrentProcess(), pData, dwSizeToCommitPart); return TRUE; @@ -137,6 +140,8 @@ BOOL UnlockedInterleavedLoaderHeap::UnlockedReservePages(size_t dwSizeToCommit) } CONTRACTL_END; + _ASSERTE(m_pConfig->Template == NULL); // This path should only be used for LoaderHeaps which use the standard ExecutableAllocator functions + size_t dwSizeToReserve; // Round to page size again @@ -222,6 +227,14 @@ BOOL UnlockedInterleavedLoaderHeap::UnlockedReservePages(size_t dwSizeToCommit) return TRUE; } +void ReleaseAllocatedThunks(BYTE* thunks) +{ + ExecutableAllocator::Instance()->FreeThunksFromTemplate(thunks, GetStubCodePageSize()); +} + +using ThunkMemoryHolder = SpecializedWrapper; + + // Get some more committed pages - either commit some more in the current reserved region, or, if it // has run out, reserve another set of pages. // Returns: FALSE if we can't get any more memory @@ -237,6 +250,57 @@ BOOL UnlockedInterleavedLoaderHeap::GetMoreCommittedPages(size_t dwMinSize) } CONTRACTL_END; + if (m_pConfig->Template != NULL) + { + ThunkMemoryHolder newAllocatedThunks = (BYTE*)ExecutableAllocator::Instance()->AllocateThunksFromTemplate(m_pConfig->Template, GetStubCodePageSize()); + if (newAllocatedThunks == NULL) + { + return FALSE; + } + + NewHolder pNewBlock = new (nothrow) LoaderHeapBlock; + if (pNewBlock == NULL) + { + return FALSE; + } + + size_t dwSizeToReserve = GetStubCodePageSize() * 2; + + // Record reserved range in range list, if one is specified + // Do this AFTER the commit - otherwise we'll have bogus ranges included. 
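+        // (In this template path there is no separate commit step: the AllocateThunksFromTemplate call above
+        // has already mapped both the code page and its data page, so the range can be recorded immediately.)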
+ if (m_pRangeList != NULL) + { + if (!m_pRangeList->AddRange((const BYTE *) newAllocatedThunks, + ((const BYTE *) newAllocatedThunks) + dwSizeToReserve, + (void *) this)) + { + return FALSE; + } + } + + m_dwTotalAlloc += dwSizeToReserve; + + pNewBlock.SuppressRelease(); + newAllocatedThunks.SuppressRelease(); + + pNewBlock->dwVirtualSize = dwSizeToReserve; + pNewBlock->pVirtualAddress = newAllocatedThunks; + pNewBlock->pNext = m_pFirstBlock; + pNewBlock->m_fReleaseMemory = TRUE; + + // Add to the linked list + m_pFirstBlock = pNewBlock; + + m_pAllocPtr = (BYTE*)newAllocatedThunks; + m_pPtrToEndOfCommittedRegion = m_pAllocPtr + GetStubCodePageSize(); + m_pEndReservedRegion = m_pAllocPtr + dwSizeToReserve; // For consistency with the non-template path m_pEndReservedRegion is after the end of the data area + m_dwTotalAlloc += GetStubCodePageSize(); + + return TRUE; + } + + // From here, all work is only for the dynamically allocated InterleavedLoaderHeap path + // If we have memory we can use, what are you doing here! _ASSERTE(dwMinSize > (SIZE_T)(m_pPtrToEndOfCommittedRegion - m_pAllocPtr)); @@ -474,5 +538,13 @@ void *UnlockedInterleavedLoaderHeap::UnlockedAllocStub( return pResult; } + +void InitializeLoaderHeapConfig(InterleavedLoaderHeapConfig *pConfig, size_t stubSize, void* templateInImage, void (*codePageGenerator)(uint8_t* pageBase, uint8_t* pageBaseRX, size_t size)) +{ + pConfig->StubSize = (uint32_t)stubSize; + pConfig->Template = ExecutableAllocator::Instance()->CreateTemplate(templateInImage, GetStubCodePageSize(), codePageGenerator); + pConfig->CodePageGenerator = codePageGenerator; +} + #endif // #ifndef DACCESS_COMPILE diff --git a/src/coreclr/vm/amd64/thunktemplates.S b/src/coreclr/vm/amd64/thunktemplates.S index ebb0f6f67f193d..611556da202bb9 100644 --- a/src/coreclr/vm/amd64/thunktemplates.S +++ b/src/coreclr/vm/amd64/thunktemplates.S @@ -5,9 +5,155 @@ #include "unixasmmacros.inc" #include "asmconstants.h" +#ifdef FEATURE_MAP_THUNKS_FROM_IMAGE + +#define POINTER_SIZE 0x08 + +#define THUNKS_MAP_SIZE 0x4000 + +#define PAGE_SIZE 0x4000 +#define PAGE_SIZE_LOG2 14 + + +#define DATA_SLOT(stub, field, thunkSize, thunkTemplateName) C_FUNC(thunkTemplateName) + THUNKS_MAP_SIZE + stub##Data__##field + IN_PAGE_INDEX * thunkSize + +// ---------- +// StubPrecode +// ---------- + +#define STUB_PRECODE_CODESIZE 0x18 // 3 instructions, 13 bytes encoded + 11 bytes of padding +#define STUB_PRECODE_DATASIZE 0x18 // 2 qwords + a BYTE +.set STUB_PRECODE_NUM_THUNKS_PER_MAPPING,(THUNKS_MAP_SIZE / STUB_PRECODE_CODESIZE) + +.macro THUNKS_BLOCK_STUB_PRECODE + IN_PAGE_INDEX = 0 + .rept STUB_PRECODE_NUM_THUNKS_PER_MAPPING + + mov r10, [rip + DATA_SLOT(StubPrecode, SecretParam, STUB_PRECODE_CODESIZE, StubPrecodeCodeTemplate)] + jmp [rip + DATA_SLOT(StubPrecode, Target, STUB_PRECODE_CODESIZE, StubPrecodeCodeTemplate)] + // The above is 13 bytes + int 3 + int 3 + int 3 + int 3 + int 3 + int 3 + int 3 + int 3 + int 3 + int 3 + int 3 + IN_PAGE_INDEX = IN_PAGE_INDEX + 1 + .endr +.endm + + .text + .p2align PAGE_SIZE_LOG2 +LEAF_ENTRY StubPrecodeCodeTemplate + THUNKS_BLOCK_STUB_PRECODE +LEAF_END_MARKED StubPrecodeCodeTemplate, _TEXT + +// ---------- +// FixupPrecode +// ---------- + +#define FIXUP_PRECODE_CODESIZE 0x18 +#define FIXUP_PRECODE_DATASIZE 0x18 // 3 qwords +.set FIXUP_PRECODE_NUM_THUNKS_PER_MAPPING,(THUNKS_MAP_SIZE / FIXUP_PRECODE_CODESIZE) + +.macro THUNKS_BLOCK_FIXUP_PRECODE + IN_PAGE_INDEX = 0 + .rept FIXUP_PRECODE_NUM_THUNKS_PER_MAPPING + + jmp [rip + DATA_SLOT(FixupPrecode, Target, 
FIXUP_PRECODE_CODESIZE, FixupPrecodeCodeTemplate)] + mov r10, [rip + DATA_SLOT(FixupPrecode, MethodDesc, FIXUP_PRECODE_CODESIZE, FixupPrecodeCodeTemplate)] + jmp [rip + DATA_SLOT(FixupPrecode, PrecodeFixupThunk, FIXUP_PRECODE_CODESIZE, FixupPrecodeCodeTemplate)] + // The above is 19 bytes + int 3 + int 3 + int 3 + int 3 + int 3 + IN_PAGE_INDEX = IN_PAGE_INDEX + 1 + .endr +.endm + + .text + .p2align PAGE_SIZE_LOG2 +LEAF_ENTRY FixupPrecodeCodeTemplate + THUNKS_BLOCK_FIXUP_PRECODE + // We need 16 bytes of padding to pad this out + int 3 + int 3 + int 3 + int 3 + int 3 + int 3 + int 3 + int 3 + int 3 + int 3 + int 3 + int 3 + int 3 + int 3 + int 3 + int 3 +LEAF_END_MARKED FixupPrecodeCodeTemplate, _TEXT + +// ---------- +// CallCountingStub +// ---------- + +#define CALLCOUNTING_CODESIZE 0x18 +#define CALLCOUNTING_DATASIZE 0x18 // 3 qwords +.set CALLCOUNTING_NUM_THUNKS_PER_MAPPING, (THUNKS_MAP_SIZE / CALLCOUNTING_CODESIZE) +.macro THUNKS_BLOCK_CALLCOUNTING + IN_PAGE_INDEX = 0 + .rept CALLCOUNTING_NUM_THUNKS_PER_MAPPING + + mov rax,QWORD PTR [rip + DATA_SLOT(CallCountingStub, RemainingCallCountCell, CALLCOUNTING_CODESIZE, CallCountingStubCodeTemplate)] + dec WORD PTR [rax] + je 0f + jmp QWORD PTR [rip + DATA_SLOT(CallCountingStub, TargetForMethod, CALLCOUNTING_CODESIZE, CallCountingStubCodeTemplate)] + 0: + jmp QWORD PTR [rip + DATA_SLOT(CallCountingStub, TargetForThresholdReached, CALLCOUNTING_CODESIZE, CallCountingStubCodeTemplate)] + IN_PAGE_INDEX = IN_PAGE_INDEX + 1 + .endr +.endm + + .text + .p2align PAGE_SIZE_LOG2 +LEAF_ENTRY CallCountingStubCodeTemplate + THUNKS_BLOCK_CALLCOUNTING + // We need 16 bytes of padding to pad this out + int 3 + int 3 + int 3 + int 3 + int 3 + int 3 + int 3 + int 3 + int 3 + int 3 + int 3 + int 3 + int 3 + int 3 + int 3 + int 3 +LEAF_END_MARKED CallCountingStubCodeTemplate, _TEXT + +#endif + // STUB_PAGE_SIZE must match the behavior of GetStubCodePageSize() on this architecture/os STUB_PAGE_SIZE = 16384 +#ifdef DATA_SLOT +#undef DATA_SLOT +#endif + #define DATA_SLOT(stub, field) C_FUNC(stub##Code) + STUB_PAGE_SIZE + stub##Data__##field LEAF_ENTRY StubPrecodeCode, _TEXT diff --git a/src/coreclr/vm/arm64/thunktemplates.S b/src/coreclr/vm/arm64/thunktemplates.S index df2abf7c29e0f7..bbbc490854721e 100644 --- a/src/coreclr/vm/arm64/thunktemplates.S +++ b/src/coreclr/vm/arm64/thunktemplates.S @@ -4,6 +4,117 @@ #include "unixasmmacros.inc" #include "asmconstants.h" +#ifdef FEATURE_MAP_THUNKS_FROM_IMAGE +#define POINTER_SIZE 0x08 +// Since Arm64 supports 4KB, 16KB and 64KB page sizes, as the templates is only defined for 16KB page size, this cannot be used +// in a general purpose Linux environment. However it CAN be used on Apple platforms, which specify that 16KB is the system standard +// page size. 
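+// (The runtime side of this change guards the same assumption: when a *CodeTemplate symbol is present,
+// StaticInitialize in precode.cpp and callcounting.cpp throws COR_E_EXECUTIONENGINE if the OS page size
+// is not 0x4000.)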
+ +#define THUNKS_MAP_SIZE 0x4000 + +#define PAGE_SIZE 0x4000 +#define PAGE_SIZE_LOG2 14 + + +#define DATA_SLOT(stub, field, thunkSize, thunkTemplateName) C_FUNC(thunkTemplateName) + THUNKS_MAP_SIZE + stub##Data__##field + IN_PAGE_INDEX * thunkSize + +// ---------- +// StubPrecode +// ---------- + +#define STUB_PRECODE_CODESIZE 0x18 // 3 instructions, 4 bytes each (and we also have 12 bytes of padding) +#define STUB_PRECODE_DATASIZE 0x18 // 2 qwords + 1 byte +.set STUB_PRECODE_NUM_THUNKS_PER_MAPPING, (THUNKS_MAP_SIZE / STUB_PRECODE_CODESIZE) + +.macro THUNKS_BLOCK_STUB_PRECODE + IN_PAGE_INDEX = 0 + .rept STUB_PRECODE_NUM_THUNKS_PER_MAPPING + + ldr x10, DATA_SLOT(StubPrecode, Target, STUB_PRECODE_CODESIZE, StubPrecodeCodeTemplate) + ldr x12, DATA_SLOT(StubPrecode, SecretParam, STUB_PRECODE_CODESIZE, StubPrecodeCodeTemplate) + br x10 + + brk 0xf000 // Stubs need to be 24-byte in size to allow for the data to be 2 pointers + 1 byte + brk 0xf000 // Stubs need to be 24-byte in size to allow for the data to be 2 pointers + 1 byte + brk 0xf000 // Stubs need to be 24-byte in size to allow for the data to be 2 pointers + 1 byte + + IN_PAGE_INDEX = IN_PAGE_INDEX + 1 + .endr +.endm + + .text + .p2align PAGE_SIZE_LOG2 +LEAF_ENTRY StubPrecodeCodeTemplate + THUNKS_BLOCK_STUB_PRECODE +LEAF_END_MARKED StubPrecodeCodeTemplate, _TEXT + +// ---------- +// FixupPrecode +// ---------- + +#define FIXUP_PRECODE_CODESIZE 0x18 // 5 instructions, 4 bytes each (and we also have 4 bytes of padding) +#define FIXUP_PRECODE_DATASIZE 0x18 // 3 qwords +.set FIXUP_PRECODE_NUM_THUNKS_PER_MAPPING,(THUNKS_MAP_SIZE / FIXUP_PRECODE_CODESIZE) + +.macro THUNKS_BLOCK_FIXUP_PRECODE + IN_PAGE_INDEX = 0 + .rept FIXUP_PRECODE_NUM_THUNKS_PER_MAPPING + + ldr x11, DATA_SLOT(FixupPrecode, Target, FIXUP_PRECODE_CODESIZE, FixupPrecodeCodeTemplate) + br x11 + ldr x12, DATA_SLOT(FixupPrecode, MethodDesc, FIXUP_PRECODE_CODESIZE, FixupPrecodeCodeTemplate) + ldr x11, DATA_SLOT(FixupPrecode, PrecodeFixupThunk, FIXUP_PRECODE_CODESIZE, FixupPrecodeCodeTemplate) + br x11 + brk 0xf000 // Stubs need to be 24-byte in size to allow for the data to be 3 pointers + + IN_PAGE_INDEX = IN_PAGE_INDEX + 1 + .endr +.endm + + .text + .p2align PAGE_SIZE_LOG2 +LEAF_ENTRY FixupPrecodeCodeTemplate + THUNKS_BLOCK_FIXUP_PRECODE +LEAF_END_MARKED FixupPrecodeCodeTemplate, _TEXT + +// ---------- +// CallCountingStub +// ---------- + +#define CALLCOUNTING_CODESIZE 0x28 // 5 instructions, 4 bytes each (and we also have 4 bytes of padding) +#define CALLCOUNTING_DATASIZE 0x18 // 3 qwords +.set CALLCOUNTING_NUM_THUNKS_PER_MAPPING, (THUNKS_MAP_SIZE / CALLCOUNTING_CODESIZE) + +.macro THUNKS_BLOCK_CALLCOUNTING + IN_PAGE_INDEX = 0 + .rept CALLCOUNTING_NUM_THUNKS_PER_MAPPING + + ldr x9, DATA_SLOT(CallCountingStub, RemainingCallCountCell, CALLCOUNTING_CODESIZE, CallCountingStubCodeTemplate) + ldrh w10, [x9] + subs w10, w10, #1 + strh w10, [x9] + beq 0f + ldr x9, DATA_SLOT(CallCountingStub, TargetForMethod, CALLCOUNTING_CODESIZE, CallCountingStubCodeTemplate) + br x9 +0: + ldr x10, DATA_SLOT(CallCountingStub, TargetForThresholdReached, CALLCOUNTING_CODESIZE, CallCountingStubCodeTemplate) + br x10 + brk 0xf000 // Stubs need to be 40-byte in size to allow for the data to be pointer aligned + + IN_PAGE_INDEX = IN_PAGE_INDEX + 1 + .endr +.endm + + .text + .p2align PAGE_SIZE_LOG2 +LEAF_ENTRY CallCountingStubCodeTemplate + THUNKS_BLOCK_CALLCOUNTING +LEAF_END_MARKED CallCountingStubCodeTemplate, _TEXT +#endif + +#ifdef DATA_SLOT +#undef DATA_SLOT +#endif #define DATA_SLOT(stub, field) . 
- (. - C_FUNC(stub##Code\STUB_PAGE_SIZE)) + \STUB_PAGE_SIZE + stub##Data__##field .irp STUB_PAGE_SIZE, 16384, 32768, 65536 diff --git a/src/coreclr/vm/callcounting.cpp b/src/coreclr/vm/callcounting.cpp index 0f26b7d4090096..f5168fc0f799b1 100644 --- a/src/coreclr/vm/callcounting.cpp +++ b/src/coreclr/vm/callcounting.cpp @@ -293,6 +293,14 @@ void (*CallCountingStub::CallCountingStubCode)(); #ifndef DACCESS_COMPILE +static InterleavedLoaderHeapConfig s_callCountingHeapConfig; + +#ifdef FEATURE_MAP_THUNKS_FROM_IMAGE +extern "C" void CallCountingStubCodeTemplate(); +#else +#define CallCountingStubCodeTemplate NULL +#endif + void CallCountingStub::StaticInitialize() { #if defined(TARGET_ARM64) && defined(TARGET_UNIX) @@ -310,14 +318,22 @@ void CallCountingStub::StaticInitialize() EEPOLICY_HANDLE_FATAL_ERROR_WITH_MESSAGE(COR_E_EXECUTIONENGINE, W("Unsupported OS page size")); } #undef ENUM_PAGE_SIZE + + if (CallCountingStubCodeTemplate != NULL && pageSize != 0x4000) + { + // This should fail if the template is used on a platform which doesn't support the supported page size for templates + ThrowHR(COR_E_EXECUTIONENGINE); + } #else _ASSERTE((SIZE_T)((BYTE*)CallCountingStubCode_End - (BYTE*)CallCountingStubCode) <= CallCountingStub::CodeSize); #endif + + InitializeLoaderHeapConfig(&s_callCountingHeapConfig, CallCountingStub::CodeSize, (void*)CallCountingStubCodeTemplate, CallCountingStub::GenerateCodePage); } #endif // DACCESS_COMPILE -void CallCountingStub::GenerateCodePage(BYTE* pageBase, BYTE* pageBaseRX, SIZE_T pageSize) +void CallCountingStub::GenerateCodePage(uint8_t* pageBase, uint8_t* pageBaseRX, size_t pageSize) { #ifdef TARGET_X86 int totalCodeSize = (pageSize / CallCountingStub::CodeSize) * CallCountingStub::CodeSize; @@ -328,13 +344,13 @@ void CallCountingStub::GenerateCodePage(BYTE* pageBase, BYTE* pageBaseRX, SIZE_T // Set absolute addresses of the slots in the stub BYTE* pCounterSlot = pageBaseRX + i + pageSize + offsetof(CallCountingStubData, RemainingCallCountCell); - *(BYTE**)(pageBase + i + SYMBOL_VALUE(CallCountingStubCode_RemainingCallCountCell_Offset)) = pCounterSlot; + *(uint8_t**)(pageBase + i + SYMBOL_VALUE(CallCountingStubCode_RemainingCallCountCell_Offset)) = pCounterSlot; BYTE* pTargetSlot = pageBaseRX + i + pageSize + offsetof(CallCountingStubData, TargetForMethod); - *(BYTE**)(pageBase + i + SYMBOL_VALUE(CallCountingStubCode_TargetForMethod_Offset)) = pTargetSlot; + *(uint8_t**)(pageBase + i + SYMBOL_VALUE(CallCountingStubCode_TargetForMethod_Offset)) = pTargetSlot; BYTE* pCountReachedZeroSlot = pageBaseRX + i + pageSize + offsetof(CallCountingStubData, TargetForThresholdReached); - *(BYTE**)(pageBase + i + SYMBOL_VALUE(CallCountingStubCode_TargetForThresholdReached_Offset)) = pCountReachedZeroSlot; + *(uint8_t**)(pageBase + i + SYMBOL_VALUE(CallCountingStubCode_TargetForThresholdReached_Offset)) = pCountReachedZeroSlot; } #else // TARGET_X86 FillStubCodePage(pageBase, (const void*)PCODEToPINSTR((PCODE)CallCountingStubCode), CallCountingStub::CodeSize, pageSize); @@ -354,7 +370,7 @@ NOINLINE InterleavedLoaderHeap *CallCountingManager::CallCountingStubAllocator:: _ASSERTE(m_heap == nullptr); - InterleavedLoaderHeap *heap = new InterleavedLoaderHeap(&m_heapRangeList, true /* fUnlocked */, CallCountingStub::GenerateCodePage, CallCountingStub::CodeSize); + InterleavedLoaderHeap *heap = new InterleavedLoaderHeap(&m_heapRangeList, true /* fUnlocked */, &s_callCountingHeapConfig); m_heap = heap; return heap; } @@ -475,6 +491,7 @@ CallCountingManager::~CallCountingManager() } 
#ifndef DACCESS_COMPILE + void CallCountingManager::StaticInitialize() { WRAPPER_NO_CONTRACT; diff --git a/src/coreclr/vm/callcounting.h b/src/coreclr/vm/callcounting.h index 75a907f4d6ea3c..59071aa51f140b 100644 --- a/src/coreclr/vm/callcounting.h +++ b/src/coreclr/vm/callcounting.h @@ -150,7 +150,7 @@ class CallCountingStub static void StaticInitialize(); #endif // !DACCESS_COMPILE - static void GenerateCodePage(BYTE* pageBase, BYTE* pageBaseRX, SIZE_T size); + static void GenerateCodePage(uint8_t* pageBase, uint8_t* pageBaseRX, size_t size); PTR_CallCount GetRemainingCallCountCell() const; PCODE GetTargetForMethod() const; diff --git a/src/coreclr/vm/loaderallocator.cpp b/src/coreclr/vm/loaderallocator.cpp index 5fe3bb2faf2831..f31d2d068bbfb8 100644 --- a/src/coreclr/vm/loaderallocator.cpp +++ b/src/coreclr/vm/loaderallocator.cpp @@ -1208,8 +1208,7 @@ void LoaderAllocator::Init(BYTE *pExecutableHeapMemory) m_pNewStubPrecodeHeap = new (&m_NewStubPrecodeHeapInstance) InterleavedLoaderHeap( &m_stubPrecodeRangeList, false /* fUnlocked */, - StubPrecode::GenerateCodePage, - StubPrecode::CodeSize); + &s_stubPrecodeHeapConfig); #if defined(FEATURE_STUBPRECODE_DYNAMIC_HELPERS) && defined(FEATURE_READYTORUN) if (IsCollectible()) @@ -1219,14 +1218,12 @@ void LoaderAllocator::Init(BYTE *pExecutableHeapMemory) m_pDynamicHelpersStubHeap = new (&m_DynamicHelpersHeapInstance) InterleavedLoaderHeap( &m_dynamicHelpersRangeList, false /* fUnlocked */, - StubPrecode::GenerateCodePage, - StubPrecode::CodeSize); + &s_stubPrecodeHeapConfig); #endif // defined(FEATURE_STUBPRECODE_DYNAMIC_HELPERS) && defined(FEATURE_READYTORUN) m_pFixupPrecodeHeap = new (&m_FixupPrecodeHeapInstance) InterleavedLoaderHeap(&m_fixupPrecodeRangeList, false /* fUnlocked */, - FixupPrecode::GenerateCodePage, - FixupPrecode::CodeSize); + &s_fixupStubPrecodeHeapConfig); // Initialize the EE marshaling data to NULL. 
m_pMarshalingData = NULL; diff --git a/src/coreclr/vm/precode.cpp b/src/coreclr/vm/precode.cpp index e3e3983e8716e1..798e9849de3a6a 100644 --- a/src/coreclr/vm/precode.cpp +++ b/src/coreclr/vm/precode.cpp @@ -15,6 +15,11 @@ #include "perfmap.h" #endif +InterleavedLoaderHeapConfig s_stubPrecodeHeapConfig; +#ifdef HAS_FIXUP_PRECODE +InterleavedLoaderHeapConfig s_fixupStubPrecodeHeapConfig; +#endif + //========================================================================================== // class Precode //========================================================================================== @@ -495,6 +500,12 @@ void (*StubPrecode::StubPrecodeCode)(); void (*StubPrecode::StubPrecodeCode_End)(); #endif +#ifdef FEATURE_MAP_THUNKS_FROM_IMAGE +extern "C" void StubPrecodeCodeTemplate(); +#else +#define StubPrecodeCodeTemplate NULL +#endif + void StubPrecode::StaticInitialize() { #if defined(TARGET_ARM64) && defined(TARGET_UNIX) @@ -512,6 +523,13 @@ void StubPrecode::StaticInitialize() default: EEPOLICY_HANDLE_FATAL_ERROR_WITH_MESSAGE(COR_E_EXECUTIONENGINE, W("Unsupported OS page size")); } + + if (StubPrecodeCodeTemplate != NULL && pageSize != 0x4000) + { + // This should fail if the template is used on a platform which doesn't support the supported page size for templates + ThrowHR(COR_E_EXECUTIONENGINE); + } + #undef ENUM_PAGE_SIZE #else _ASSERTE((SIZE_T)((BYTE*)StubPrecodeCode_End - (BYTE*)StubPrecodeCode) <= StubPrecode::CodeSize); @@ -524,21 +542,22 @@ void StubPrecode::StaticInitialize() _ASSERTE((*((BYTE*)PCODEToPINSTR((PCODE)StubPrecodeCode) + OFFSETOF_PRECODE_TYPE)) == StubPrecode::Type); #endif + InitializeLoaderHeapConfig(&s_stubPrecodeHeapConfig, StubPrecode::CodeSize, (void*)StubPrecodeCodeTemplate, StubPrecode::GenerateCodePage); } -void StubPrecode::GenerateCodePage(BYTE* pageBase, BYTE* pageBaseRX, SIZE_T pageSize) +void StubPrecode::GenerateCodePage(uint8_t* pageBase, uint8_t* pageBaseRX, size_t pageSize) { #ifdef TARGET_X86 int totalCodeSize = (pageSize / StubPrecode::CodeSize) * StubPrecode::CodeSize; for (int i = 0; i < totalCodeSize; i += StubPrecode::CodeSize) { - memcpy(pageBase + i, (const void*)StubPrecodeCode, (BYTE*)StubPrecodeCode_End - (BYTE*)StubPrecodeCode); + memcpy(pageBase + i, (const void*)StubPrecodeCode, (uint8_t*)StubPrecodeCode_End - (uint8_t*)StubPrecodeCode); - BYTE* pTargetSlot = pageBaseRX + i + pageSize + offsetof(StubPrecodeData, Target); - *(BYTE**)(pageBase + i + SYMBOL_VALUE(StubPrecodeCode_Target_Offset)) = pTargetSlot; + uint8_t* pTargetSlot = pageBaseRX + i + pageSize + offsetof(StubPrecodeData, Target); + *(uint8_t**)(pageBase + i + SYMBOL_VALUE(StubPrecodeCode_Target_Offset)) = pTargetSlot; BYTE* pMethodDescSlot = pageBaseRX + i + pageSize + offsetof(StubPrecodeData, SecretParam); - *(BYTE**)(pageBase + i + SYMBOL_VALUE(StubPrecodeCode_MethodDesc_Offset)) = pMethodDescSlot; + *(uint8_t**)(pageBase + i + SYMBOL_VALUE(StubPrecodeCode_MethodDesc_Offset)) = pMethodDescSlot; } #else // TARGET_X86 FillStubCodePage(pageBase, (const void*)PCODEToPINSTR((PCODE)StubPrecodeCode), StubPrecode::CodeSize, pageSize); @@ -626,6 +645,12 @@ void (*FixupPrecode::FixupPrecodeCode)(); void (*FixupPrecode::FixupPrecodeCode_End)(); #endif +#ifdef FEATURE_MAP_THUNKS_FROM_IMAGE +extern "C" void FixupPrecodeCodeTemplate(); +#else +#define FixupPrecodeCodeTemplate NULL +#endif + void FixupPrecode::StaticInitialize() { #if defined(TARGET_ARM64) && defined(TARGET_UNIX) @@ -645,6 +670,12 @@ void FixupPrecode::StaticInitialize() 
EEPOLICY_HANDLE_FATAL_ERROR_WITH_MESSAGE(COR_E_EXECUTIONENGINE, W("Unsupported OS page size")); } #undef ENUM_PAGE_SIZE + + if (FixupPrecodeCodeTemplate != NULL && pageSize != 0x4000) + { + // This should fail if the template is used on a platform which doesn't support the supported page size for templates + ThrowHR(COR_E_EXECUTIONENGINE); + } #else _ASSERTE((SIZE_T)((BYTE*)FixupPrecodeCode_End - (BYTE*)FixupPrecodeCode) <= FixupPrecode::CodeSize); #endif @@ -655,9 +686,11 @@ void FixupPrecode::StaticInitialize() #else _ASSERTE(*((BYTE*)PCODEToPINSTR((PCODE)FixupPrecodeCode) + OFFSETOF_PRECODE_TYPE) == FixupPrecode::Type); #endif + + InitializeLoaderHeapConfig(&s_fixupStubPrecodeHeapConfig, FixupPrecode::CodeSize, (void*)FixupPrecodeCodeTemplate, FixupPrecode::GenerateCodePage); } -void FixupPrecode::GenerateCodePage(BYTE* pageBase, BYTE* pageBaseRX, SIZE_T pageSize) +void FixupPrecode::GenerateCodePage(uint8_t* pageBase, uint8_t* pageBaseRX, size_t pageSize) { #ifdef TARGET_X86 int totalCodeSize = (pageSize / FixupPrecode::CodeSize) * FixupPrecode::CodeSize; @@ -665,14 +698,14 @@ void FixupPrecode::GenerateCodePage(BYTE* pageBase, BYTE* pageBaseRX, SIZE_T pag for (int i = 0; i < totalCodeSize; i += FixupPrecode::CodeSize) { memcpy(pageBase + i, (const void*)FixupPrecodeCode, FixupPrecode::CodeSize); - BYTE* pTargetSlot = pageBaseRX + i + pageSize + offsetof(FixupPrecodeData, Target); - *(BYTE**)(pageBase + i + SYMBOL_VALUE(FixupPrecodeCode_Target_Offset)) = pTargetSlot; + uint8_t* pTargetSlot = pageBaseRX + i + pageSize + offsetof(FixupPrecodeData, Target); + *(uint8_t**)(pageBase + i + SYMBOL_VALUE(FixupPrecodeCode_Target_Offset)) = pTargetSlot; BYTE* pMethodDescSlot = pageBaseRX + i + pageSize + offsetof(FixupPrecodeData, MethodDesc); - *(BYTE**)(pageBase + i + SYMBOL_VALUE(FixupPrecodeCode_MethodDesc_Offset)) = pMethodDescSlot; + *(uint8_t**)(pageBase + i + SYMBOL_VALUE(FixupPrecodeCode_MethodDesc_Offset)) = pMethodDescSlot; BYTE* pPrecodeFixupThunkSlot = pageBaseRX + i + pageSize + offsetof(FixupPrecodeData, PrecodeFixupThunk); - *(BYTE**)(pageBase + i + SYMBOL_VALUE(FixupPrecodeCode_PrecodeFixupThunk_Offset)) = pPrecodeFixupThunkSlot; + *(uint8_t**)(pageBase + i + SYMBOL_VALUE(FixupPrecodeCode_PrecodeFixupThunk_Offset)) = pPrecodeFixupThunkSlot; } #else // TARGET_X86 FillStubCodePage(pageBase, (const void*)PCODEToPINSTR((PCODE)FixupPrecodeCode), FixupPrecode::CodeSize, pageSize); diff --git a/src/coreclr/vm/precode.h b/src/coreclr/vm/precode.h index 87570f217292a0..64394d259e91a4 100644 --- a/src/coreclr/vm/precode.h +++ b/src/coreclr/vm/precode.h @@ -225,7 +225,7 @@ struct StubPrecode pData->Target = (PCODE)target; } - static void GenerateCodePage(BYTE* pageBase, BYTE* pageBaseRX, SIZE_T size); + static void GenerateCodePage(uint8_t* pageBase, uint8_t* pageBaseRX, size_t size); #endif // !DACCESS_COMPILE }; @@ -428,7 +428,7 @@ struct FixupPrecode static void StaticInitialize(); - static void GenerateCodePage(BYTE* pageBase, BYTE* pageBaseRX, SIZE_T size); + static void GenerateCodePage(uint8_t* pageBase, uint8_t* pageBaseRX, size_t size); PTR_FixupPrecodeData GetData() const { @@ -861,4 +861,9 @@ struct PrecodeMachineDescriptor }; #endif //DACCESS_COMPILE +extern InterleavedLoaderHeapConfig s_stubPrecodeHeapConfig; +#ifdef HAS_FIXUP_PRECODE +extern InterleavedLoaderHeapConfig s_fixupStubPrecodeHeapConfig; +#endif + #endif // __PRECODE_H__ From 5a256fbd94d322cb251202175c4de361d51bbd54 Mon Sep 17 00:00:00 2001 From: David Wrighton Date: Mon, 19 May 2025 16:22:04 -0700 Subject: [PATCH 
2/3] Disable feature on Apple platforms for now --- src/coreclr/clrdefinitions.cmake | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/coreclr/clrdefinitions.cmake b/src/coreclr/clrdefinitions.cmake index cb3b645ff0e58d..2ce02d51585b09 100644 --- a/src/coreclr/clrdefinitions.cmake +++ b/src/coreclr/clrdefinitions.cmake @@ -219,7 +219,8 @@ if (FEATURE_STUBPRECODE_DYNAMIC_HELPERS) endif() if (CLR_CMAKE_TARGET_APPLE) - add_definitions(-DFEATURE_MAP_THUNKS_FROM_IMAGE) +# Re-enable when the dbgshim is fixed and generally available +# add_definitions(-DFEATURE_MAP_THUNKS_FROM_IMAGE) endif() # Use this function to enable building with a specific target OS and architecture set of defines From b4fc7c0f152fa53ad11150ae3152bc2123b06433 Mon Sep 17 00:00:00 2001 From: David Wrighton Date: Tue, 20 May 2025 09:43:23 -0700 Subject: [PATCH 3/3] Update src/coreclr/clrdefinitions.cmake Co-authored-by: Tom McDonald --- src/coreclr/clrdefinitions.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coreclr/clrdefinitions.cmake b/src/coreclr/clrdefinitions.cmake index 2ce02d51585b09..c7f6ae41d4d8aa 100644 --- a/src/coreclr/clrdefinitions.cmake +++ b/src/coreclr/clrdefinitions.cmake @@ -219,7 +219,7 @@ if (FEATURE_STUBPRECODE_DYNAMIC_HELPERS) endif() if (CLR_CMAKE_TARGET_APPLE) -# Re-enable when the dbgshim is fixed and generally available +# Re-enable when dbgshim containing https://github.com/dotnet/diagnostics/pull/5487 is generally available # add_definitions(-DFEATURE_MAP_THUNKS_FROM_IMAGE) endif()
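
For reference, the intended wiring of the pieces added in patch 1/3, condensed from the precode.cpp and
loaderallocator.cpp hunks above. This is an illustrative sketch only: MyStub, MyStubCodeTemplate, myRangeList
and s_myStubHeapConfig are stand-in names, not identifiers from the change.

    // Built once at startup. The template may come back NULL (e.g. on Windows, or when
    // FEATURE_MAP_THUNKS_FROM_IMAGE is not defined); the heap then falls back to generating code pages.
    static InterleavedLoaderHeapConfig s_myStubHeapConfig;
    InitializeLoaderHeapConfig(&s_myStubHeapConfig,
                               MyStub::CodeSize,            // per-stub granularity
                               (void*)MyStubCodeTemplate,   // template symbol, or NULL
                               MyStub::GenerateCodePage);   // fallback code page generator

    // Each loader allocator then constructs its interleaved heap against the shared config.
    InterleavedLoaderHeap* pHeap = new InterleavedLoaderHeap(&myRangeList,
                                                             false /* fUnlocked */,
                                                             &s_myStubHeapConfig);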